1. 程式人生 > >python3 爬取汽車之家所有車型操作步驟

python3 爬取汽車之家所有車型操作步驟

  題記:

  網際網路上關於使用python3去爬取汽車之家的汽車資料(主要是汽車基本引數,配置引數,顏色引數,內飾引數)的教程已經非常多了,但大體的方案分兩種:

  1.解析出汽車之家某個車型的網頁,然後正則表示式匹配出混淆後的資料物件與混淆後的js,並對混淆後的js使用pyv8進行解析返回正常字元,然後通過字元與資料物件進行匹配,具體方法見這位園友,傳送門:https://www.cnblogs.com/my8100/p/js_qichezhijia.html (感謝這位大神前半部分的思路)

  2.解析出汽車之家某個車型的網頁,然後正則表示式匹配出混淆後的資料物件與混淆後的js,針對混淆後的js進行手動匹配,因為混淆的js大概分為8大類(無引數 返回常量,無引數 返回函式,引數等於返回值函式,無引數 返回常量,無引數 返回常量中間無混淆程式碼,字串拼接時使無參常量,字串拼接時使用返回引數的函式),然後通過正則表示式進行解析出8類內容並進行逐個替換,最終也會返回一個帶有順序的字串,將這個字串與前邊的資料物件再次替換,最終資料物件中的所有span都會被替換成中文,具體操作見園友的地址,傳送門:https://www.cnblogs.com/dyfblog/p/6753251.html (感謝這位大神前半部分的思路)

不過鑑於作者技術有限,上述的兩種方案,我都沒有完整的執行完成,哪怕花了一週的時間也沒有,但是沒有辦法,誰讓我是一個很愛鑽牛角尖的人呢,下一步提出我自己琢磨出來的方案,流程上稍微有點複雜,但是穩打穩紮,還是可以爬出來的,好了話不多說了,貼出步驟;

1.獲取所有車型的網頁,儲存到本地: 

 """Step 1: download the configuration page of every Autohome car series."""
 import os

 import bs4
 import requests as req

 # Brand index pages are fetched once per initial letter: <BRAND_INDEX_URL>A.html ... Z.html
 BRAND_INDEX_URL = "https://www.autohome.com.cn/grade/carhtml/"
 # Per-series configuration page: <SERIES_CONFIG_URL><series id>.html
 SERIES_CONFIG_URL = "https://car.autohome.com.cn/config/series/"
 OUTPUT_DIR = "d:\\autoHome\\html\\"
 # requests never times out unless told to; one stalled connection would hang the crawl.
 REQUEST_TIMEOUT = 30


 def mainMethod():
     """Save the configuration page of every listed car series to OUTPUT_DIR.

     For each letter A-Z the brand index page is fetched and decoded as GBK.
     Every ``<li>`` that contains an ``<h4><a href=...>`` is treated as one
     series; the series id is the part of the href after ``.cn`` with the
     slashes removed.  The series configuration page is then fetched, decoded
     as UTF-8 and written to ``OUTPUT_DIR/<series id>``.

     Files are opened in write mode so that re-running the script replaces a
     page instead of appending a second copy to it.
     """
     os.makedirs(OUTPUT_DIR, exist_ok=True)
     for letter in (chr(code) for code in range(ord("A"), ord("Z") + 1)):
         index_url = BRAND_INDEX_URL + letter + ".html"
         print(index_url)
         # Fetch the series list of every brand starting with this letter.
         index_resp = req.get(index_url, timeout=REQUEST_TIMEOUT)
         soup = bs4.BeautifulSoup(
             index_resp.content.decode("gbk", errors="replace"), "html.parser"
         )
         saved = 0
         for item in soup.find_all("li"):
             heading = item.h4
             if heading is None or heading.a is None:
                 continue
             href = str(heading.a.attrs.get("href", "")).split("#")[0]
             marker = href.find(".cn")
             if marker == -1:
                 # Not a link of the expected shape; nothing to derive an id from.
                 continue
             series_id = href[marker + 3:].replace("/", "")
             if not series_id:
                 continue
             series_url = SERIES_CONFIG_URL + series_id + ".html"
             print("secSite=" + series_url)
             page_resp = req.get(series_url, timeout=REQUEST_TIMEOUT)
             print(letter)
             with open(OUTPUT_DIR + series_id, "w", encoding="utf-8") as out:
                 out.write(page_resp.content.decode("utf-8", errors="replace"))
             saved += 1
         # Number of series pages saved for this letter.
         print(saved)


 if __name__ == "__main__":
     mainMethod()

2.解析出每個車型的關鍵js並拼裝成一個html,儲存到本地。

 """Step 2: pull the obfuscated scripts out of each page and wrap them in an HTML file."""
 import os
 import re

 SOURCE_DIR = "D:\\autoHome\\html\\"
 OUTPUT_DIR = "D:\\autoHome\\newhtml\\"

 # Self-invoking obfuscated scripts of the form (function(xx_ ...)...(document);
 OBFUSCATED_JS_RE = re.compile(r"(\(function\([a-zA-Z]{2}.*?_\).*?\(document\);)")

 # Stub that replaces document.createElement so that every CSS rule the
 # obfuscated scripts insert is collected, '#'-separated, in the JS variable
 # ``rules`` instead of being applied as a style sheet.
 JS_SHIM = (
     "var rules = '2';"
     "var document = {};"
     "function getRules(){return rules}"
     "document.createElement = function() {"
     "      return {"
     "              sheet: {"
     "                      insertRule: function(rule, i) {"
     "                              if (rules.length == 0) {"
     "                                      rules = rule;"
     "                              } else {"
     # Fixed: the original appended ``rules`` to itself here, which only
     # doubled the accumulator and threw every inserted rule away.
     "                                      rules = rules + '#' + rule;"
     "                              }"
     "                      }"
     "              }"
     "      }"
     "};"
     "document.head = {};"
     "document.head.appendChild = function() {};"
     "var window = {};"
     "window.decodeURIComponent = decodeURIComponent;"
 )

 HTML_PREFIX = (
     "<html><meta http-equiv='Content-Type' content='text/html; charset=utf-8' />"
     "<head></head><body>    <script type='text/javascript'>"
 )
 HTML_SUFFIX = " document.write(rules)</script></body></html>"


 def build_html(page_text):
     """Return an HTML document that runs the obfuscated scripts found in *page_text*.

     The document consists of the fixed prefix, ``JS_SHIM``, every match of
     ``OBFUSCATED_JS_RE`` in order, and a final ``document.write(rules)`` that
     prints the collected CSS rules into the page body.
     """
     scripts = "".join(OBFUSCATED_JS_RE.findall(page_text))
     return HTML_PREFIX + JS_SHIM + scripts + HTML_SUFFIX


 def main():
     """Write ``OUTPUT_DIR/<name>.html`` for every file in ``SOURCE_DIR``."""
     print("Start...")
     os.makedirs(OUTPUT_DIR, exist_ok=True)
     for name in os.listdir(SOURCE_DIR):
         print("fileName==" + name)
         with open(SOURCE_DIR + name, "r", encoding="utf-8") as src:
             page_text = src.read()
         # Write mode: appending would put two <html> documents in one file on re-run.
         with open(OUTPUT_DIR + name + ".html", "w", encoding="utf-8") as out:
             out.write(build_html(page_text))


 if __name__ == "__main__":
     main()

 

3.解析出每個車型的資料json,比如 var config、var option、var bag、var innerbag 等;但我只解析了基本資訊跟配置資訊,其他的無所謂。

 """Step 3: extract the ``var config`` / ``var option`` / ``var bag`` statements of each page."""
 import os
 import re

 SOURCE_DIR = "D:\\autoHome\\html\\"
 OUTPUT_DIR = "D:\\autoHome\\json\\"

 # One pattern per data statement; the whole match (group 0) is what gets kept.
 DATA_PATTERNS = (
     re.compile(r"var config = (.*?){1,};"),
     re.compile(r"var option = (.*?)};"),
     re.compile(r"var bag = (.*?);"),
 )


 def extract_data_vars(page_text):
     """Return the data statements found in *page_text*, concatenated in order.

     For each pattern in ``DATA_PATTERNS`` the first match, if any, is taken
     verbatim.  Statements that are not present are skipped, so the result is
     an empty string when none of them occurs.
     """
     found = (pattern.search(page_text) for pattern in DATA_PATTERNS)
     return "".join(match.group(0) for match in found if match is not None)


 def main():
     """Write the extracted statements of every file in ``SOURCE_DIR`` to ``OUTPUT_DIR``."""
     print("Start...")
     os.makedirs(OUTPUT_DIR, exist_ok=True)
     for name in os.listdir(SOURCE_DIR):
         print("fileName==" + name)
         with open(SOURCE_DIR + name, "r", encoding="utf-8") as src:
             page_text = src.read()
         # Write mode: appending would leave duplicated statements after a re-run.
         with open(OUTPUT_DIR + name, "w", encoding="utf-8") as out:
             out.write(extract_data_vars(page_text))


 if __name__ == "__main__":
     main()

 

4.生成樣式檔案,儲存到本地。

 """Step 4: open each generated HTML file in Chrome and save the rendered body text."""
 import os

 from selenium import webdriver

 PAGES_DIR = "D:/autoHome/newHtml/"
 CONTENT_DIR = "D:\\autoHome\\content\\"
 # Raw string: the path contains backslashes that are not valid escape sequences.
 CHROMEDRIVER_PATH = r'E:\work\ChromePortable\App\Google Chrome\chromedriver.exe'


 class Crack():
     """Holds the Chrome WebDriver session used to render the generated pages."""

     def __init__(self, keyword, username, passod):
         # keyword / username / passod are kept so existing calls still work;
         # nothing in this class reads them.
         self.url = 'https://www.baidu.com'
         # NOTE(review): passing the driver path positionally and
         # find_element_by_tag_name (used in main) are older Selenium APIs -
         # confirm they exist in the installed Selenium version.
         self.browser = webdriver.Chrome(CHROMEDRIVER_PATH)


 def main():
     """Render every not-yet-processed page and store its body text in ``CONTENT_DIR``.

     A page counts as processed when a file with the same name already exists
     in ``CONTENT_DIR``.  One browser is started for the whole run, and only
     when there is something to render; it is shut down with ``quit()`` even
     if rendering fails part-way.
     """
     os.makedirs(CONTENT_DIR, exist_ok=True)
     pending = []
     for fil in os.listdir(PAGES_DIR):
         if os.path.exists(CONTENT_DIR + fil):
             print('檔案已經解析。。。' + fil)
             continue
         pending.append(fil)
     if not pending:
         return

     crack = Crack('測試公司', '17610177519', '17610177519')
     try:
         for fil in pending:
             print(fil)
             crack.browser.get("file:///D:/autoHome/newHtml/" + fil)
             body_text = crack.browser.find_element_by_tag_name('body').text
             print(body_text)
             with open(CONTENT_DIR + fil, "w", encoding="utf-8") as out:
                 out.write(body_text)
     finally:
         # quit() ends the whole driver session; close() only closes the window.
         crack.browser.quit()


 if __name__ == "__main__":
     main()

 

5.讀取樣式檔案,匹配資料檔案,生成正常資料檔案

 """Step 5: replace the obfuscated <span> placeholders in the data with their real text."""
 import os
 import re

 JSON_DIR = "D:\\autoHome\\json\\"
 STYLE_DIR = "D:\\autoHome\\content\\"
 OUTPUT_DIR = "D:\\autoHome\\newJson\\"

 SPAN_RE = re.compile(r"<span(.*?)></span>")   # captures the attribute text of an empty span
 CLASS_RE = re.compile(r"'(.*?)'")             # first single-quoted value, i.e. the class name
 QUOTED_RE = re.compile(r'"(.*?)"')            # the double-quoted CSS content value


 def deobfuscate(json_text, style_text):
     """Return *json_text* with every resolvable placeholder span replaced.

     A placeholder is ``<span class='NAME'></span>``.  Its replacement is the
     double-quoted value of the ``NAME::before { content:... }`` rule found in
     *style_text*.  Spans without a quoted class, without a matching rule, or
     whose rule has no quoted value are left untouched.
     """
     handled = set()
     for attrs in SPAN_RE.findall(json_text):
         class_match = CLASS_RE.search(attrs)
         if class_match is None:
             continue
         class_name = class_match.group(1)
         if class_name in handled:
             # str.replace below already substituted every occurrence.
             continue
         handled.add(class_name)
         # The class name is data, not a pattern: escape it before searching.
         rule = re.search(
             re.escape(class_name) + r"::before \{ content:(.*?)\}", style_text
         )
         if rule is None:
             continue
         value = QUOTED_RE.search(rule.group(1))
         if value is None:
             continue
         json_text = json_text.replace(
             "<span class='" + class_name + "'></span>", value.group(1)
         )
     return json_text


 def main():
     """De-obfuscate every file in ``JSON_DIR`` using its style file from ``STYLE_DIR``."""
     os.makedirs(OUTPUT_DIR, exist_ok=True)
     for name in os.listdir(JSON_DIR):
         print(name)
         with open(JSON_DIR + name, "r", encoding="utf-8") as src:
             json_text = src.read()
         # File names are used as-is; str.title() would change their letter case.
         style_path = STYLE_DIR + name + ".html"
         try:
             with open(style_path, "r", encoding="utf-8") as src:
                 style_text = src.read()
         except FileNotFoundError:
             print("missing style file: " + style_path)
             continue
         with open(OUTPUT_DIR + name, "w", encoding="utf-8") as out:
             out.write(deobfuscate(json_text, style_text))


 if __name__ == "__main__":
     main()

 

6.到前五步已經可以看到json資料檔案都已經是混淆前的了,說明已經爬取成功了。

7.讀取資料檔案,生成excel

 """Step 7: read the de-obfuscated data files and export them to one Excel sheet."""
 import json
 import os
 import re

 import xlwt

 DATA_DIR = "D:\\autoHome\\newJson\\"
 ERROR_LOG = "D:\\autoHome\\異常資料\\exception.txt"
 OUTPUT_XLS = 'd:\\autoHome\\Mybook.xls'

 CONFIG_RE = re.compile(r"var config = (.*?);")
 OPTION_RE = re.compile(r"var option = (.*?);var")

 # (statement pattern, key of the group list, key of the item list inside a group)
 SECTIONS = (
     (CONFIG_RE, "paramtypeitems", "paramitems"),
     (OPTION_RE, "configtypeitems", "configitems"),
 )


 def parse_car_items(text):
     """Return ``{parameter name: [value, ...]}`` parsed from one data file.

     Only the first group of the basic parameters (``config``) and the first
     group of the equipment options (``option``) are read.  When a statement
     occurs more than once the last occurrence is used.  The ``bag`` statement
     is not parsed because none of its data is exported.

     Raises:
         ValueError: a statement is missing, is not valid JSON, or does not
             have the expected structure.
     """
     car_items = {}
     for pattern, groups_key, items_key in SECTIONS:
         found = pattern.findall(text)
         if not found:
             raise ValueError("statement not found: " + pattern.pattern)
         try:
             items = json.loads(found[-1])["result"][groups_key][0][items_key]
             for param in items:
                 car_items[param["name"]] = [
                     entry["value"] for entry in param["valueitems"]
                 ]
         except (KeyError, IndexError, TypeError) as err:
             raise ValueError("unexpected data layout: " + repr(err)) from err
     return car_items


 def main():
     """Write one row per value index of every data file to ``OUTPUT_XLS``.

     Row 0 holds the parameter names.  A name keeps the column it was first
     seen in, so files with a different set or order of parameters still put
     their values under the right header.  Files that cannot be parsed are
     appended to ``ERROR_LOG`` and skipped.
     """
     workbook = xlwt.Workbook(encoding='ascii')  # create the workbook
     worksheet = workbook.add_sheet('汽車之家')    # create the sheet
     columns = {}   # parameter name -> column index
     next_row = 1   # row 0 is reserved for the header

     for name in os.listdir(DATA_DIR):
         print("fileName==" + name)
         with open(DATA_DIR + name, "r", encoding="utf-8") as src:
             text = src.read()
         try:
             car_items = parse_car_items(text)
         except ValueError:
             # Record the file and move on; there is no usable data to export.
             os.makedirs(os.path.dirname(ERROR_LOG), exist_ok=True)
             with open(ERROR_LOG, "a", encoding="utf-8") as log:
                 log.write(name + "\n")
             continue

         for header in car_items:
             if header not in columns:
                 # Columns start at 1, as before; column 0 stays empty.
                 # NOTE(review): the .xls format has a column limit - confirm
                 # the number of distinct parameter names stays below it.
                 columns[header] = len(columns) + 1
                 worksheet.write(0, columns[header], header)

         # Row count comes from the longest value list instead of one
         # hard-coded parameter name that a file may not contain.
         row_count = max((len(values) for values in car_items.values()), default=0)
         for offset in range(row_count):
             for header, values in car_items.items():
                 if offset < len(values):
                     worksheet.write(
                         next_row + offset, columns[header], str(values[offset])
                     )
         next_row += row_count

     workbook.save(OUTPUT_XLS)


 if __name__ == "__main__":
     main()

 

8.最後開啟excel檔案,給你們看看。

資料量大概有8300的樣子。以後買車就用這個參考了。