1. 程式人生 > >python爬蟲爬取京東店鋪商品價格資料(更新版)

python爬蟲爬取京東店鋪商品價格資料(更新版)

主要使用的庫:

requests:爬蟲請求並獲取原始碼
re:使用正則表示式提取資料
json:使用JSON提取資料
pandas:使用pandas儲存資料

##sqlalchemy :備用方案,上傳資料到mysql

以下是原始碼:

# -*- coding:utf-8 -*-
import requests
import re
import random
import time
import json
import pymysql
from sqlalchemy import create_engine
from requests.packages.urllib3.exceptions import InsecureRequestWarning
import pandas as pd
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)  ###禁止提醒SSL警告


class jd(object):
    """Scrape product IDs, names and prices for one JD.com shop.

    A single ``requests`` session with mobile-browser headers is reused for
    every request. ``getdata`` pages through the shop's search endpoint,
    collects the results of all pages and stores them in one CSV file.
    """

    # Search endpoint template; the placeholders are, in order: page number,
    # shop ID, and a millisecond timestamp.
    _SEARCH_URL = (
        'https://wqsou.jd.com/search/searchjson?datatype=1&page={}&pagesize=40'
        '&merge_sku=yes&qp_disable=yes&key=ids%2C%2C{}&_={}&sceneval=2'
        '&g_login_type=1&callback=jsonpCBKA&g_ty=ls'
    )

    def __init__(self):
        """Create the HTTP session and install the default request headers."""
        self.s = requests.session()
        headers = {
            'accept': 'application/json, text/javascript, */*; q=0.01',
            'accept-encoding': 'gzip, deflate, br',
            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1',
        }
        self.s.headers.update(headers)
        # Optional MySQL back-end (disabled). To enable it, create an engine
        # here with your own credentials, e.g.
        #   self.engine = create_engine('mysql+pymysql://USER:PASSWORD@HOST:3306/jd')

    @staticmethod
    def _shop_id_from_url(url):
        """Return the shop ID embedded in a ``.../index-<id>.html`` shop URL.

        Raises:
            ValueError: if ``url`` does not contain an ``index-<id>.html`` part.
        """
        match = re.search(r'index-(.*?)\.html', url)
        if match is None:
            raise ValueError('cannot find a shop ID in URL: {!r}'.format(url))
        return match.group(1)

    @staticmethod
    def _parse_page(text):
        """Extract ``(ware_ids, ware_names, prices)`` lists from one response body."""
        ware_ids = re.findall('"wareid": "(.*?)",', text)
        ware_names = re.findall('"warename": "(.*?)",', text)
        prices = re.findall('"dredisprice": "(.*?)",', text)
        return ware_ids, ware_names, prices

    def getdata(self, url, name, csv_path=r'e:\jdmall.csv', max_pages=9999):
        """Download every result page of a shop and save the rows to a CSV file.

        Args:
            url: Shop home page URL of the form ``.../index-<shopid>.html``.
            name: Label written into the ``name`` column of every row.
            csv_path: Destination CSV file (written with GB18030 encoding).
            max_pages: Upper bound on the number of result pages requested.

        Returns:
            The ``pandas.DataFrame`` holding all collected rows. When no
            product is found the frame is empty and no file is written.

        Raises:
            ValueError: if no shop ID can be extracted from ``url``.
        """
        getdate = time.strftime("%Y-%m-%d", time.localtime())
        self.shopid = self._shop_id_from_url(url)
        # Open the mobile shop page first; presumably this lets the session
        # pick up cookies the search endpoint expects (not verifiable here).
        self.s.get('https://shop.m.jd.com/search/search?shopId=' + str(self.shopid))

        # Accumulate across ALL pages so earlier pages are not lost.
        wareId_list = []
        wname_list = []
        jdPrice_list = []
        for i in range(1, max_pages + 1):
            time.sleep(random.random())  # random 0-1 s pause between requests
            t = int(time.time() * 1000)
            searchurl = self._SEARCH_URL.format(i, self.shopid, t)
            print(searchurl)
            # verify=False turns off TLS certificate verification.
            req = self.s.get(url=searchurl, verify=False).text
            print(req)
            print(name, i)
            wareId, wname, jdPrice = self._parse_page(req)
            if not wareId:
                # A page without product IDs marks the end of the listing.
                break
            wareId_list.extend(wareId)
            wname_list.extend(wname)
            jdPrice_list.extend(jdPrice)

        columns = ['name', 'wareId', 'wname', 'jdPrice', 'update']
        if not wareId_list:
            return pd.DataFrame(columns=columns)

        row_count = len(wareId_list)
        jddata = {
            'name': [name] * row_count,
            'wareId': wareId_list,
            'wname': wname_list,
            'jdPrice': jdPrice_list,
            'update': [getdate] * row_count,
        }
        df = pd.DataFrame(data=jddata)
        df.to_csv(csv_path, index=False, encoding="GB18030")
        # Optional upload when self.engine is configured:
        #   df.to_sql('店鋪前端', con=self.engine, if_exists='append', index=False)
        return df


if __name__ == '__main__':
    j = jd()
    url = 'https://mall.jd.com/index-1000000693.html'
    nm = 'intel'
    j.getdata(url, nm)