1. 程式人生 > 利用Python批量抓取京東評論數據

利用Python批量抓取京東評論數據

() 開始 book for return SQ 數據返回 python js對象

京東圖書評論包含非常豐富的信息,例如購買日期、書名、作者、好評、中評、差評等等。下面以抓取購買日期為例,使用 Python + MySQL 的搭配來實現,程序不大,只有約100行。相關的解釋我都已在程序裏加了註釋:

 1 from selenium import webdriver
 2 from bs4 import BeautifulSoup
 3 import re
 4 import win32com.client
 5 import threading,time
 6 import MySQLdb
 7 
 8 def mydebug():
 9     driver.quit()
10     exit(0)
# (Promotional note kept from the original post) Python group 125240963
# posts study material daily.
def catchDate(s):
    """Extract the purchase-date entries from one review page.

    Args:
        s: HTML source of a review page.

    Returns:
        A list with one item per ``div`` of class ``date-buy`` that has a
        ``<br>`` child with non-empty contents; each item is that ``<br>``
        tag's ``contents`` list.

    Side effects:
        Increments the module-level ``nowtimes`` counter once for every item
        appended to the result.
    """
    global nowtimes

    # No parser is named here, exactly as in the original call, so the
    # parsing behaviour of existing installations is left untouched.
    soup = BeautifulSoup(s)
    z = []
    for obj in soup.findAll("div", class_="date-buy"):
        br = obj.find("br")
        if br is None:
            # No <br> inside this element: nothing to extract.
            continue
        tmp = br.contents
        if not tmp:
            # Skip empty contents so callers can safely index item[0].
            continue
        z.append(tmp)
        nowtimes += 1
    return z


def getTimes(n, t):
    """Return the progress message for ``n`` finished units out of ``t``.

    Args:
        n: Number of units completed so far.
        t: Total number of units.

    Returns:
        The progress text with the percentage truncated to an integer.
        A total of zero is reported as 0% instead of raising
        ZeroDivisionError.
    """
    percent = int(100 * n / t) if t else 0
    return "當前進度為:" + str(percent) + "%"


# ---------------------------| Program start |---------------------------
# Main script: walk the review pages listed in the ``booknew`` table, extract
# the purchase dates from every page and store them in the ``ljj`` table.
# Written so that it parses on both Python 2.7 and Python 3.

# Script-local import: sys.exit() is used below but sys is not in the file's
# import list.
import sys

# Top-level book categories, keyed by category id.
cate = {"3273":"歷史","3279":"心理學","3276":"政治軍事","3275":"國學古籍","3274":"哲學宗教","3277":"法律","3280":"文化","3281":"社會科學"}

# Resume point: the book id and the page number to continue from.
# NOTE(review): on Python 2, input() evaluates whatever is typed; only run
# this interactively with trusted input.
num1 = str(input("bookid:")).strip()
num2 = int(input("pagenumber:"))

# Expected amount of work, 17355*20 = 347100, used only for progress output.
totaltimes = 347100.0
nowtimes = 0

# Start the browser driver. Alternatives noted in the original post:
# webdriver.PhantomJS() or webdriver.Chrome(r"C:\Python27\Scripts\chromedriver").
driver = webdriver.Ie(r"C:\Python27\Scripts\IEDriverServer")

conn = None
cursor = None
try:
    # Connect to the database; on failure report the error and stop. The
    # enclosing ``finally`` still shuts the browser driver down.
    try:
        conn = MySQLdb.connect(host="localhost", user="root", passwd="", db="jd")
    except Exception as e:
        print(e)
        sys.exit(1)

    # Read the list of review pages to crawl.
    cursor = conn.cursor()
    sql = "SELECT * FROM booknew ORDER BY pagenumber DESC"
    cursor.execute(sql)
    alldata = cursor.fetchall()

    # Parameterized statement: the date text comes from a scraped web page
    # and must never be spliced into the SQL string.
    # NOTE(review): rec[0] (described as the book id) goes into ``cateid`` and
    # rec[1] (described as the category id) into ``bookid``, exactly as in the
    # original statement. This looks swapped; confirm against the table
    # schema before changing it, since existing rows may rely on it.
    insert_sql = "INSERT INTO ljj (id, cateid, bookid, date) VALUES (NULL, %s, %s, %s)"

    # Plain text type (unicode on Python 2, str on Python 3), used to turn
    # BeautifulSoup string objects into ordinary strings for the DB driver.
    text_type = type(u"")

    resumed = False      # becomes True once the resume book has been reached
    start_page = num2    # page offset for the first processed book only

    # Review pages look like http://club.jd.com/review/10178500-1-154.html
    for rec in alldata:
        # rec[0] -- bookid, rec[1] -- cateid, rec[2] -- pagenumber
        if not resumed and rec[0] != num1:
            continue
        resumed = True

        for p in range(start_page + 1, rec[2] + 1):
            link = "http://club.jd.com/review/" + rec[0] + "-1-" + str(p) + ".html"
            # Fetch the page.
            driver.get(link)
            html = driver.page_source
            # Extract the purchase dates.
            buydate = catchDate(html)
            # Store them; a failed row is reported and skipped.
            for z in buydate:
                try:
                    cursor.execute(insert_sql, (rec[0], rec[1], text_type(z[0])))
                except Exception as e:
                    print(e)
            conn.commit()
            print(getTimes(nowtimes, totaltimes))

        # The resume offset applies to the first processed book only, even
        # when that book's page range turned out to be empty.
        start_page = 0
finally:
    # Always release the browser and the database handles, even on errors.
    try:
        driver.quit()
    finally:
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()

利用Python批量抓取京東評論數據