1. 程式人生 > 使用Selenium爬取淘寶商品

使用Selenium爬取淘寶商品

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import TimeoutException
import time
# Search term that index_page() appends to the Taobao search URL.
KEYWORD = 'iPad'

# Single Firefox session shared by every function in this script.
browser = webdriver.Firefox()

# Explicit-wait helper bound to that session, constructed with a timeout of 10.
wait = WebDriverWait(browser, timeout=10)
def index_page(page, max_retries=3):
    """
    Fetch one page of Taobao search results for KEYWORD and scrape it.

    Opens the search URL in the shared ``browser``. For any page after the
    first, the page number is typed into the pager's jump box and submitted.
    Once the pager's active item shows the requested number and at least one
    listing is present, ``get_products()`` extracts and stores the listings,
    followed by a two-second pause.

    A ``TimeoutException`` while loading triggers another attempt. Attempts
    are bounded so that a page which never loads cannot retry without limit;
    when they run out, the page is reported and skipped.

    :param page: page number to fetch (1 is the first page)
    :param max_retries: extra attempts allowed after the first one times out
    """
    # Local import keeps this block self-contained without touching the
    # file-level import list.
    from urllib.parse import urlencode

    # urlencode() escapes spaces, '&', non-ASCII text, etc. in the keyword.
    url = 'https://s.taobao.com/search?' + urlencode({'q': KEYWORD})
    pager_input = (By.CSS_SELECTOR, '#mainsrp-pager div.form > input')
    pager_submit = (By.CSS_SELECTOR, '#mainsrp-pager div.form > span.btn.J_Submit')
    pager_active = (By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span')
    first_item = (By.CSS_SELECTOR, '.m-itemlist .items .item')

    for _ in range(max_retries + 1):
        print('正在爬取第', page, '頁')
        try:
            browser.get(url)
            if page > 1:
                # Jump straight to the requested page through the pager form.
                input1 = wait.until(EC.presence_of_element_located(pager_input))
                submit = wait.until(EC.element_to_be_clickable(pager_submit))
                input1.clear()
                input1.send_keys(str(page))
                submit.click()
            # NOTE(review): this is a text-presence check, so '1' would also
            # be satisfied by an active page label of '10' - confirm whether
            # an exact match is needed.
            wait.until(EC.text_to_be_present_in_element(pager_active, str(page)))
            wait.until(EC.presence_of_element_located(first_item))
        except TimeoutException:
            # Page did not reach the expected state in time; try again.
            continue
        # Success path: scrape outside the try so that only page-loading
        # timeouts trigger a retry.
        get_products()
        time.sleep(2)
        return

    print('Giving up on page', page, 'after', max_retries + 1, 'timed-out attempts')

from pyquery import PyQuery
def get_products():
    """
    Extract product data from the page currently loaded in ``browser``.

    Parses ``browser.page_source`` with PyQuery, builds one dict per element
    matching ``#mainsrp-itemlist .items .item`` (keys: image, price, deal,
    title, shop, location), prints it and passes it to ``save_to_mysql()``.
    """
    doc = PyQuery(browser.page_source)
    for item in doc('#mainsrp-itemlist .items .item').items():
        product = {
            # attr() yields None when the attribute is absent, so the value
            # is normalised by a helper instead of being concatenated blindly.
            'image': _image_url(item.find('.pic .img').attr('data-src')),
            'price': item.find('strong').text(),
            'deal': item.find('.deal-cnt').text(),
            'title': item.find('[class="row row-2 title"] a').text(),
            'shop': item.find('.shop a').text(),
            'location': item.find('.location').text(),
        }
        print(product)
        save_to_mysql(product)


def _image_url(src):
    """Turn a thumbnail's ``data-src`` value into an absolute URL, or None if it is missing."""
    if not src:
        return None
    if src.startswith('//'):
        # Protocol-relative value: only the scheme is missing.
        return 'http:' + src
    if src.startswith(('http://', 'https://')):
        return src
    return 'http://' + src

import pymysql
# NOTE(review): the credentials are hard-coded; consider reading them from the
# environment or a config file before sharing or deploying this script.
db = pymysql.connect(
    host='localhost',
    user='root',
    password='123456789',
    port=3306,
    database='spiders',  # 'db' is a deprecated alias for 'database' in PyMySQL
    charset='utf8mb4',  # explicit so the Chinese text in scraped fields round-trips
)
cursor = db.cursor()
# IF NOT EXISTS lets the script be run again without failing because the
# table was already created by an earlier run.
cursor.execute(
    'create table if not exists taobao('
    'image varchar(100),price varchar(20),deal varchar(20),'
    'title varchar(50),shop varchar(20),location varchar(20))'
)
def save_to_mysql(product):
    """
    Insert one scraped product into the ``taobao`` table.

    The row is committed immediately. If the database rejects it, the
    transaction is rolled back and the error is printed, so a single bad row
    does not stop the crawl.

    :param product: dict with the keys ``image``, ``price``, ``deal``,
        ``title``, ``shop`` and ``location``
    """
    columns = ('image', 'price', 'deal', 'title', 'shop', 'location')
    # Naming the columns keeps the insert independent of the table's column order.
    sql = (
        'insert into taobao(image,price,deal,title,shop,location) '
        'values(%s,%s,%s,%s,%s,%s)'
    )
    params = tuple(product[column] for column in columns)
    try:
        cursor.execute(sql, params)
        db.commit()
    except pymysql.MySQLError as exc:
        # Only database errors are handled here; KeyboardInterrupt and
        # programming errors are no longer swallowed silently.
        db.rollback()
        print('Failed to save product:', exc)
# Number of the last result page to crawl.
max_page = 100


def main():
    """
    Crawl result pages 1 through ``max_page`` in order.

    The shared browser session and database connection are closed when the
    crawl ends, whether it finishes normally or is interrupted by an error.
    """
    try:
        for page in range(1, max_page + 1):
            index_page(page)
    finally:
        # Nested so the database connection is closed even if quitting the
        # browser raises.
        try:
            browser.quit()
        finally:
            db.close()


# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    main()