一個完整的大作業:淘寶口紅銷量top10的銷量和評價
阿新 • • 發佈:2017-11-02
gen 匹配 我們 es2017 對象 啟用 網站 rgs cep
網站:淘寶口紅搜索頁
https://s.taobao.com/search?q=%E5%8F%A3%E7%BA%A2&sort=sale-desc
先爬取該頁面前十的口紅的商品名、銷售量、價格、評分以及評論數,發現該網頁使用了json的方式,使用正則表達式匹配字段,抓取我們
所需要的信息。啟用用戶代理爬取數據,預防該網站的反爬手段,並把結果存入到csv文件中,效果如下。
成功爬取到淘寶口紅top10的基本信息後,發現評論並不在同一頁面上,並且該頁面存在著進入評論頁的關鍵字,爬取下來後放入一個列表中,然後循環遍歷整個列表和頁數,使用
正則表達式,匹配評論的關鍵字,成功爬取淘寶top10口紅的評論近十萬條,如下圖所示。
完整的源代碼如下:
"""Scrape Taobao's top-10 best-selling lipsticks.

Writes basic product info (name, link, sales, price, location, rating,
review count) to 商品.csv and, optionally, the per-item reviews to 評價.csv.
"""
from urllib import request, error
import re
import csv
import time

# Shared state: populated by get_product(), consumed by get_product_comment().
itemId = []    # product ids of the top-10 items
sellerId = []  # seller ids (the review endpoint requires both ids)
links = []     # kept for interface compatibility with the original script
titles = []    # product titles, parallel to itemId

# Browser User-Agent so the site serves the normal (scrapable) page.
USER_AGENT = ('Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) '
              'AppleWebKit/535.19 (KHTML, like Gecko) '
              'Chrome/18.0.1025.166 Safari/535.19')


def get_product_info():
    """Create both output CSV files and write their header rows."""
    with open('商品.csv', 'w', newline='') as f:
        csv.writer(f).writerow(
            ['商品名', '連接', '銷售量', '價格', '地址', '商品評分', '評論總數'])
    with open('評價.csv', 'w', newline='') as f:
        csv.writer(f).writerow(['商品id', '商品名', '時間', '顏色分類', '評價'])


def _fetch(url, encoding='utf-8'):
    """GET *url* with the browser User-Agent and return the decoded body.

    Raises urllib.error.URLError / HTTPError (and OSError subclasses such
    as TimeoutError) on network failure; callers handle those.
    """
    req = request.Request(url, headers={'User-Agent': USER_AGENT})
    with request.urlopen(req, timeout=30) as resp:
        return resp.read().decode(encoding)


def get_product():
    """Scrape the search page for the top-10 items' basic info.

    Appends each item's id, seller id and title to the module-level lists
    and writes one row per item to 商品.csv.
    """
    url = 'https://s.taobao.com/search?q=%E5%8F%A3%E7%BA%A2&sort=sale-desc'
    try:
        html = _fetch(url)
    except error.HTTPError as e:      # must precede URLError (subclass)
        print(e.code)
        return
    except error.URLError as e:
        print(e.reason)
        return
    # The search results are embedded as JSON in the page; pull each field
    # out with a non-greedy capture.
    all_id = re.findall(r'"nid":"(.*?)"', html)
    all_title = re.findall(r'"raw_title":"(.*?)"', html)
    all_price = re.findall(r'"view_price":"(.*?)"', html)
    all_sales = re.findall(r'"view_sales":"(.*?)"', html)
    all_loc = re.findall(r'"item_loc":"(.*?)"', html)
    all_userid = re.findall(r'"user_id":"(.*?)"', html)
    print("開始收集信息")
    try:
        for i in range(10):
            pid = str(all_id[i])
            link = 'https://item.taobao.com/item.htm?id=' + pid
            shoplink = ('https://dsr-rate.tmall.com/list_dsr_info.htm?itemId='
                        + pid)
            # Ratings live on a separate JSON endpoint, e.g.
            # {"gradeAvg":4.9,"rateTotal":12345,...} — capture up to the comma.
            html2 = _fetch(shoplink)
            grades = re.findall(r'"gradeAvg":(.*?),', html2)
            rates = re.findall(r'"rateTotal":(.*?),', html2)
            grade_avg = grades[0] if grades else ''
            rate_total = rates[0] if rates else ''
            itemId.append(pid)
            sellerId.append(str(all_userid[i]))
            titles.append(str(all_title[i]))
            links.append(link)
            with open('商品.csv', 'a', newline='') as f:
                csv.writer(f).writerow(
                    [all_title[i], link, all_sales[i], all_price[i],
                     all_loc[i], grade_avg, rate_total])
    except error.HTTPError as e:      # subclass of URLError: catch first
        print(e.code)
    except error.URLError as e:
        print(e.reason)
    except (IndexError, UnicodeEncodeError, TimeoutError) as e:
        print(e.args)
    except IOError as e:
        print(e)
    print("商品基本信息收集完畢")


def get_product_comment():
    """Scrape up to 550 pages of reviews for each of the top-10 items.

    Requires get_product() to have populated itemId/sellerId/titles.
    Appends one row per review to 評價.csv.
    """
    for i in range(10):
        print("正在收集第{}件商品評論".format(i + 1))
        for page in range(1, 551):
            # NOTE: the original URL contained '¤tPage', an HTML-entity
            # mangling of '&currentPage' — fixed here.
            detaillink = ('https://rate.tmall.com/list_detail_rate.htm?itemId='
                          + itemId[i] + '&sellerId=' + sellerId[i]
                          + '&currentPage=' + str(page))
            try:
                html = _fetch(detaillink, encoding='gbk')
                dates = re.findall(r'"rateDate":"(.*?)"', html)
                contents = re.findall(r'"rateContent":"(.*?)"', html)
                skus = re.findall(r'"auctionSku":"(.*?)"', html)
                with open('評價.csv', 'a', newline='') as f:
                    writer = csv.writer(f)
                    # zip stops at the shortest list, avoiding the original's
                    # potential IndexError when counts differ.
                    for date, content, sku in zip(dates, contents, skus):
                        writer.writerow(
                            [itemId[i] + "\t", titles[i], date, sku, content])
            except error.HTTPError as e:
                print(e.code)
            except error.URLError as e:
                print(e.reason)
            except (IndexError, UnicodeEncodeError, TimeoutError) as e:
                print(e.args)
            except IOError as e:
                print(e)
        print("第{}件商品評論收集完成".format(i + 1))


if __name__ == "__main__":
    start = time.time()
    get_product_info()
    get_product()
    # get_product_comment()
    total = time.time() - start
    print('本次爬行用時:{:.2f}s!'.format(total))
一個完整的大作業:淘寶口紅銷量top10的銷量和評價