1. 程式人生 > >selenium模擬登陸淘寶網並且將‘衣服’相關資訊下載儲存在mysql資料庫

selenium模擬登陸淘寶網並且將‘衣服’相關資訊下載儲存在mysql資料庫

import re
import pymysql
from lxml import etree
from selenium import webdriver
#一下三行用於等待判斷頁面是否載入完畢
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from selenium.common.exceptions import TimeoutException

# Seconds each explicit wait polls for an element before raising TimeoutException.
WAIT_SECONDS = 10
# Attempts made for a page before a timeout is allowed to propagate.
MAX_RETRIES = 5
# First run of digits in a text (used for the pager total and the buyers count).
_DIGITS = re.compile(r'(\d+)')

# Drive Chrome through selenium.
brower = webdriver.Chrome()
# Database connection; utf8mb4 is requested explicitly because the scraped
# titles, cities and shop names are Chinese text.
con = pymysql.connect(host='localhost', user='root', password='', db='taobao',
                      port=3306, charset='utf8mb4')
# Cursor shared by every insert below.
cur = con.cursor()
# Create the target table. IF NOT EXISTS keeps a second run from failing on
# the already-existing table; DECIMAL(10,2) replaces FLOAT(4,2), which could
# not hold prices of 100.00 or more.
cur.execute(
    "CREATE TABLE IF NOT EXISTS yifu (id int(4) NOT NULL auto_increment PRIMARY KEY ,"
    "title VARCHAR(60),prince DECIMAL(10,2),people int(10),city VARCHAR(10),"
    "shop VARCHAR(20),img VARCHAR(200))"
)


def _wait_all(selector):
    """Wait until elements matching the CSS *selector* are present and return them as a list."""
    return WebDriverWait(brower, WAIT_SECONDS).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector))
    )


def _first(node, path):
    """Return the first result of evaluating XPath *path* on *node*, or None if there is none."""
    found = node.xpath(path)
    return found[0] if found else None


def _parse_product(item):
    """Build the product dict for one item node, or return None if a field is missing or malformed."""
    raw = {
        'img': _first(item, './div/div/div/a/img/@data-src'),
        'prince': _first(item, './div[2]/div/div/strong/text()'),
        'people': _first(item, './div[2]/div[1]/div[2]/text()'),
        'title': _first(item, './div/div/div/a/img/@alt'),
        'city': _first(item, './div[2]/div[3]/div[2]/text()'),
        'shop': _first(item, './div[2]/div[3]/div/a/span[2]/text()'),
    }
    if any(value is None for value in raw.values()):
        return None
    buyers = _DIGITS.search(raw['people'])
    if buyers is None:
        return None
    try:
        price = float(raw['prince'])
    except ValueError:
        return None
    return {
        'img': raw['img'],
        'prince': price,
        'people': int(buyers.group(1)),
        'title': raw['title'],
        'city': raw['city'],
        'shop': raw['shop'],
    }


def search():
    """Open the Taobao home page, search for the keyword and store the first result page.

    Steps: load the page, wait for the search box and the search button,
    type the keyword, click the button, wait for the pager's total-pages
    label, then save the products of the first page via get_products().

    Returns:
        The text of the total-pages label.

    Raises:
        TimeoutException: if the page still times out after MAX_RETRIES attempts.
    """
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            brower.get('https://www.taobao.com')  # open the Taobao home page
            # Each wait returns a list of elements, so index [0] picks the element.
            in_put = _wait_all('#q')
            submit = _wait_all('#J_TSearchForm > div.search-button > button')
            in_put[0].send_keys('衣服')
            submit[0].click()
            total_page = _wait_all('#mainsrp-pager > div > div > div > div.total')
            total_text = total_page[0].text
            get_products()
            return total_text
        except TimeoutException:
            # Retry in a loop instead of recursing, and give up eventually
            # rather than retrying forever.
            if attempt == MAX_RETRIES:
                raise


def next_page(page_num):
    """Jump to result page *page_num* through the pager form and store its products.

    :param page_num: page number to type into the pager input
    :return: None
    :raises TimeoutException: if the page still times out after MAX_RETRIES attempts
    """
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            in_put = _wait_all('#mainsrp-pager > div > div > div > div.form > input')[0]
            submit = _wait_all('#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')[0]
            in_put.clear()
            in_put.send_keys(str(page_num))
            submit.click()
            # text_to_be_present_in_element checks that the given text shows up
            # in the located element; here it confirms the highlighted pager
            # item is the page number that was just entered.
            active = WebDriverWait(brower, WAIT_SECONDS).until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
                    str(page_num),
                )
            )
            print(active)
            get_products()
            return
        except TimeoutException:
            if attempt == MAX_RETRIES:
                raise


def get_products():
    """Parse the product list of the currently loaded page and insert each product into `yifu`.

    Items for which a field cannot be extracted are skipped instead of
    aborting the crawl. The inserts of one page are committed together.
    """
    # Wait until the product item containers are present.
    _wait_all('#mainsrp-itemlist > div > div > div:nth-child(1) > div')
    tree = etree.HTML(brower.page_source)
    for item in tree.xpath('//*[@id="mainsrp-itemlist"]/div/div/div[1]/div'):
        product = _parse_product(item)
        if product is None:
            continue
        print(product)
        cur.execute(
            "INSERT INTO yifu (title,prince,people,city,shop,img) VALUES (%s,%s,%s,%s,%s,%s)",
            (product['title'], product['prince'], product['people'],
             product['city'], product['shop'], product['img']),
        )
    con.commit()  # commit this page's rows


def main():
    """Run the crawl: search, read the total page count, then visit pages 2..total.

    The database connection and the browser are released even if the crawl fails.

    Raises:
        ValueError: if the total-pages label contains no number.
    """
    try:
        total_text = search()
        matched = _DIGITS.search(total_text)
        if matched is None:
            raise ValueError('no page count found in pager text: %r' % (total_text,))
        total = int(matched.group(1))
        print(total)
        for i in range(2, total + 1):
            next_page(i)
    finally:
        try:
            con.close()
        finally:
            # quit() ends the whole driver session; close() only closes the window.
            brower.quit()


if __name__ == '__main__':
    main()