Python 爬蟲實戰 汽車某家(五) 口碑、評分
阿新 • • 發佈:2018-12-17
文章目錄
爬取內容
1、使用者口碑明細評分
2、口碑標題、發表日期、口碑推薦級別
3、購車目的
4、購車價格
5、購車經銷商
一、專案結構
point.txt 為斷點儲存檔案(記錄已爬取的車型 id);over.txt 為爬取結束標識檔案,本身不存放任何內容
二、核心類程式碼
import datetime
import json
import os
import random
import re
import sys
import time
from io import BytesIO

import pymysql
from PIL import Image
from scrapy import Selector
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def _stripOrNone(value):
    """Return ``value`` without surrounding whitespace, or None when it is None."""
    return value.strip() if value is not None else None


class AutoSelenium(object):
    """Selenium-driven crawler for per-model review ("koubei") pages.

    For every model id still waiting to be crawled it loads the review page,
    parses each review (user, dealer, purchase facts, scores, purposes),
    buffers the rows in the lists below and flushes them to MySQL in batches.
    Progress is checkpointed through ``Point`` so an interrupted run resumes.

    NOTE(review): all buffers below are class attributes and therefore shared
    by every instance; the script only ever creates one instance.
    """

    requestCount = 0
    # Review page URL; the placeholder is the model id.
    kouBeiUrl = "https://k.autohome.com.cn/spec/%s"
    # Paginated URL; placeholders are the model id and the page path.
    nextPageUrl = "https://k.autohome.com.cn/spec/%s%s"
    # All model ids.
    chexingIdSet = None
    # Average-score rows waiting to be inserted.
    scoreSaveList = list()
    # Average-score rows waiting to be updated.
    scoreUpdateList = list()
    # Model ids that already have average-score rows.
    savedChexingIdSet = None
    # Rows used to update average fuel / rater count / thumbnail on the model table.
    updateSpecAvgFuelList = list()
    # User rows waiting to be updated.
    updateUserList = list()
    # User rows waiting to be inserted.
    insertUserList = list()
    # User ids queued during this run.
    crawlingUserIdSet = set()
    # Dealer rows waiting to be inserted.
    insertDealerList = list()
    # Dealer ids queued during this run.
    crawlingDealerIdSet = set()
    # Per-review score rows waiting to be inserted.
    insertUserScoreList = list()
    # Review header rows waiting to be inserted.
    insertKoubeiHeadList = list()
    # Review header rows waiting to be updated.
    updateKoubeiHeadList = list()
    # Review ids already stored in the database.
    existKoubeiHeadIdSet = set()
    # Purchase-purpose rows waiting to be inserted.
    insertCarPurposeList = list()
    # Checkpointing: model ids still to crawl.
    waitingCrawlIdSet = None
    # Checkpointing: model ids crawled since the last checkpoint write.
    crawledIdSet = set()
    # Counters.
    specCount = 0
    userCount = 0
    dealerCount = 0
    koubeiHeadCount = 0
    updateKoubeiCount = 0
    userScoreCount = 0
    purposeCount = 0
    # Score category tuple, parallel to scoreItem:
    # (spaceScore, powerScore, operateScore, fuelScore, comfortScore,
    #  appearanceScore, interiorScore, costScore)
    scoreTupe = ()

    # Page label -> (slot in scoreItem, key looked up in self.scoreType).
    # The energy-consumption variants all share the fuel slot and the "油耗" key.
    _SCORE_LABELS = {
        "空間": (0, "空間"),
        "動力": (1, "動力"),
        "操控": (2, "操控"),
        "油耗": (3, "油耗"),
        "能耗": (3, "油耗"),
        "電耗": (3, "油耗"),
        "耗電量": (3, "油耗"),
        "舒適性": (4, "舒適性"),
        "外觀": (5, "外觀"),
        "內飾": (6, "內飾"),
        "價效比": (7, "價效比"),
    }

    def __init__(self):
        """Start Chrome and preload every lookup set/dict from MySQL."""
        self.browser = webdriver.Chrome()
        self.wait = WebDriverWait(self.browser, 20)
        # All model ids.
        specRes = MySqlUtils.querySpec()
        self.chexingIdSet = MySqlUtils.parseToChexingIdSet(specRes)
        # Score-type enum, plus aliases for the energy-consumption labels.
        scoreTypeRes = MySqlUtils.queryEnum('qczj_score')
        self.scoreType = MySqlUtils.parseEnum(scoreTypeRes)
        self.scoreType["能耗"] = 4
        self.scoreType["電耗"] = 4
        self.scoreType["耗電量"] = 4
        # Model ids that already have average-score rows.
        savedScoreRes = MySqlUtils.querySavedScoreSepc()
        self.savedChexingIdSet = MySqlUtils.parseToChexingIdSet(savedScoreRes)
        # Purchase-purpose enum.
        carPurposeRes = MySqlUtils.queryEnum('qczj_car_purpose')
        self.carPurposeDict = MySqlUtils.parseEnum(carPurposeRes)
        # Existing users, used for de-duplication.
        userRes = MySqlUtils.query(MySqlUtils.sql_query_user)
        self.existUserIdSet = MySqlUtils.parseToSet(userRes, 0)
        # Existing dealers.
        dearlerRes = MySqlUtils.query(MySqlUtils.sql_query_dealer)
        self.existDealerIdSet = MySqlUtils.parseToSet(dearlerRes, 0)
        # Existing review headers.
        koubeiHeadRes = MySqlUtils.query(MySqlUtils.sql_query_koubei_head)
        self.existKoubeiId = MySqlUtils.parseToSet(koubeiHeadRes, 0)
        # Review ids that already have score rows.
        userScoreRes = MySqlUtils.query(MySqlUtils.sql_query_user_score)
        self.existUserScoreKoubeiIdSet = MySqlUtils.parseToSet(userScoreRes, 0)
        # Locate the checkpoint files and compute the ids left to crawl.
        Point.init()
        self.waitingCrawlIdSet = Point.cutInto(self.chexingIdSet)

    def start_requests(self):
        """Crawl every waiting model id, page by page, checkpointing as it goes.

        A model whose page times out is taken back out of the crawled set, so
        the "over" marker is not written and the next run retries it.
        The browser session is always shut down, even if the crawl raises.
        """
        try:
            for chexingId in self.waitingCrawlIdSet:
                self.crawledIdSet.add(chexingId)
                self.specCount += 1
                url = self.kouBeiUrl % chexingId
                while True:
                    if not self.requestConfigLink(url):
                        print("請求超時")
                        # discard(): never raises, unlike remove().
                        self.crawledIdSet.discard(chexingId)
                        self.specCount -= 1
                        break
                    response = Selector(text=self.browser.page_source)
                    # Parse users, purchase info, dealers, scores and purposes.
                    self.parse(response, chexingId)
                    url = self.parseNextPageUrl(response, chexingId)
                    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~url:%s" % url)
                    if url is None:
                        break
                # Write the checkpoint once more than 10 ids have accumulated.
                if len(self.crawledIdSet) > 10:
                    crawledIdSetCopy = self.crawledIdSet.copy()
                    self.crawledIdSet.clear()
                    Point.savePointFromSet(crawledIdSetCopy)
            # Flush whatever is still buffered, then finish the checkpoint.
            self.saveAll()
            Point.savePointFromSet(self.crawledIdSet)
            if self.specCount >= len(self.waitingCrawlIdSet):
                Point.complete()
        finally:
            # quit() ends the whole driver session; close() only closes the window.
            self.browser.quit()

    def requestConfigLink(self, url):
        """Load ``url`` and wait for the score list to appear.

        If the first wait fails, try clicking the captcha trigger element and
        wait again. Returns True when the page is ready, False otherwise.
        """
        self.requestCount += 1
        # Created before get() so the fallback below always has a waiter.
        wait = WebDriverWait(self.browser, 10)
        pageReady = EC.presence_of_element_located((By.CSS_SELECTOR, '.date-ul.fn-left'))
        try:
            self.browser.get(url)
            wait.until(pageReady)
            # Short random pause; without it the page may not be fully rendered.
            time.sleep(random.randint(1, 3))
            return True
        except Exception as e:
            print(e)
        try:
            ele = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'geetest_radar_tip')))
            ele.click()
            wait.until(pageReady)
            return True
        except Exception:
            # The caller reports the timeout.
            return False

    def parseAvgScore(self, response, chexingId):
        """Parse the model-level average scores and queue them for saving."""
        numPeople = response.css(".font-arial.red.font-16 ::text").extract_first()
        avgPower = response.css(".font-arial.font-number ::text").extract_first()
        imgUrlS = response.css(".appraise-cont-dl.fn-left img::attr(src)").extract_first()
        if avgPower or numPeople or imgUrlS:
            updateDate = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            self.updateSpecAvgFuelList.append((avgPower, numPeople, updateDate, imgUrlS, chexingId))
        for ul in response.css(".date-ul.fn-left"):
            for i, li in enumerate(ul.css("li")):
                if i == 0:
                    # The first <li> is skipped (presumably a header row).
                    continue
                optionName = _stripOrNone(li.css(".width-01 ::text").extract_first())
                optionValue = _stripOrNone(li.css(".width-02 ::text").extract_first())
                if optionValue == "-":
                    # No score for this category: stop processing this list.
                    break
                # Higher/lower-than-average delta.
                cpText = li.css(".width-03").xpath("string(.)").extract_first() or ""
                cpMatch = re.search(r'\d+(\.\d+)?', cpText.strip())
                cpValue = None
                if cpMatch:
                    cpValue = cpMatch.group()
                    # The <i> icon class tells whether the delta is below average.
                    cssClassName = li.css(".width-03 i::attr(class)").extract_first()
                    if cssClassName and 'icon-dy' in cssClassName.split():
                        cpValue = -1 * float(cpValue)
                scoreType = self.scoreType[optionName]
                if chexingId in self.savedChexingIdSet:
                    updateTime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    self.scoreUpdateList.append((optionValue, cpValue, updateTime, chexingId, scoreType))
                else:
                    sid = "%s_%s" % (chexingId, random.randint(100000, 999999))
                    self.scoreSaveList.append((sid, chexingId, scoreType, optionValue, cpValue))
        # Near the end of the crawl save per model, otherwise save in batches.
        if self.specCount > len(self.chexingIdSet) - 100:
            self.updateSpecScore()
        elif len(self.scoreSaveList) >= 100 or len(self.scoreUpdateList) >= 100:
            self.updateSpecScore()

    def parse(self, response, chexingId):
        """Parse every review on one page and queue the resulting rows.

        Ids queued here are recorded immediately so that a review, user or
        dealer seen again later in the same run is not inserted twice.
        """
        updateTime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        for mouthDiv in response.css(".mouthcon"):
            userid, userName, userPage, headImgUrl = self.parseUserInfo(mouthDiv)
            city, county, dealerName, dealerId, dealerHomePage = self.parseDealer(mouthDiv)
            carItem, scoreItem, carPurposeList = self.parseScoreItem(mouthDiv)
            (koubeiSid, koubeiTitle, publicTime, koubeiLink, koubeiSrc,
             readNum, commentNum, favorNum, mjjh) = self.parseKoubei(mouthDiv)
            if not koubeiSid:
                # Without an id the review cannot be stored or de-duplicated.
                continue
            price, buyTime, fuel, currentKm = carItem

            # Review header: insert when new, otherwise update.
            if koubeiSid not in self.existKoubeiId:
                self.insertKoubeiHeadList.append((
                    koubeiSid, koubeiTitle, publicTime, userid, chexingId,
                    buyTime, price, dealerId, city, county, currentKm, fuel,
                    koubeiLink, favorNum, readNum, commentNum, koubeiSrc, mjjh))
                self.existKoubeiId.add(koubeiSid)
            else:
                self.updateKoubeiCount += 1
                self.updateKoubeiHeadList.append((
                    currentKm, fuel, favorNum, readNum, commentNum,
                    updateTime, mjjh, koubeiSid))

            # User.
            if userid and userid not in self.existUserIdSet and userid not in self.crawlingUserIdSet:
                self.insertUserList.append((userid, userName, userPage, headImgUrl, city, county))
                self.crawlingUserIdSet.add(userid)

            # Scores and purchase purposes (only once per review).
            if koubeiSid not in self.existUserScoreKoubeiIdSet:
                for item in scoreItem:
                    if item is None:
                        # This category was not rated in the review.
                        continue
                    userScoreSid = "%s_%s_%s_%s" % (
                        userid, chexingId, koubeiSid, random.randint(100000, 999999))
                    self.insertUserScoreList.append((userScoreSid, koubeiSid, item[0], item[1]))
                for purposeVal in carPurposeList:
                    purposeSid = "%s_%s_%s_%s" % (
                        userid, chexingId, koubeiSid, random.randint(100000, 999999))
                    self.insertCarPurposeList.append((purposeSid, koubeiSid, purposeVal))
                self.existUserScoreKoubeiIdSet.add(koubeiSid)

            # Dealer.
            if dealerId and dealerId not in self.existDealerIdSet \
                    and dealerId not in self.crawlingDealerIdSet:
                self.insertDealerList.append((dealerId, dealerName, dealerHomePage, city, county))
                self.crawlingDealerIdSet.add(dealerId)
        # One page done: flush according to the batching policy.
        self.save()

    def parseUserInfo(self, mouthDiv):
        """Return (userid, userName, userPage, headImgUrl) for one review.

        The user id is the last path segment of the profile link; fields that
        are missing from the page come back as None.
        """
        nameText = mouthDiv.css(".name-text")
        userName = _stripOrNone(nameText.xpath("string(.)").extract_first())
        userPage = _stripOrNone(nameText.css("a ::attr(href)").extract_first())
        userid = userPage[userPage.rfind("/") + 1:] if userPage else None
        headImgUrl = mouthDiv.css(".name-pic img::attr(data-src)").extract_first()
        return userid, userName, userPage, headImgUrl

    def parseDealer(self, mouthDiv):
        """Return (city, county, dealerName, dealerId, dealerHomePage)."""
        # Purchase place: "city county" or just "city".
        county = None
        place = _stripOrNone(mouthDiv.css(".c333 ::text").extract_first())
        city = place
        placeSplit = place.split() if place else []
        if len(placeSplit) > 1:
            city = placeSplit[0]
            county = placeSplit[1]
        # Dealer link: the id is the last path segment before the fragment.
        dealerId = None
        dealerName = None
        dealerHomePage = None
        dealerLink = mouthDiv.css(".js-dearname ::attr(href)").extract_first()
        if dealerLink:
            hashPos = dealerLink.rfind("#")
            # A link without '#' is used whole instead of losing its last char.
            dealerHomePage = dealerLink if hashPos == -1 else dealerLink[:hashPos]
            dealerId = dealerHomePage[dealerHomePage.rfind("/") + 1:]
            dealerName = _stripOrNone(mouthDiv.css(".js-dearname ::text").extract_first())
        return city, county, dealerName, dealerId, dealerHomePage

    def parseKoubei(self, mouthDiv):
        """Return the review summary tuple.

        (koubeiSid, koubeiTitle, publicTime, koubeiLink, koubeiSrc,
         readNum, commentNum, favorNum, mjjh). ``koubeiSid`` is None when the
        review link is missing.
        """
        titleItem = mouthDiv.css(".title-name.name-width-01")
        # Review source: the text after the first colon.
        koubeiSrc = None
        koubeiSrcText = titleItem.css("span ::text").extract_first()
        if koubeiSrcText:
            srcParts = koubeiSrcText.split(":")
            if len(srcParts) > 1:
                koubeiSrc = srcParts[1]
        publicTime = titleItem.css("b a::text").extract_first()
        koubeiLink = titleItem.css("b a::attr(href)").extract_first()
        # Review id: between the last '_' and the last '.' of the link.
        koubeiSid = None
        if koubeiLink:
            koubeiSid = koubeiLink[koubeiLink.rfind("_") + 1:koubeiLink.rfind(".")]
        # Review title is the second text node under the anchors, if present.
        koubeiTitle = None
        titleTexts = titleItem.css("a ::text")
        if len(titleTexts) > 1:
            koubeiTitle = titleTexts[1].extract()
        helpDiv = mouthDiv.css(".help")
        commentNum = _stripOrNone(helpDiv.css(".font-arial.CommentNumber ::text").extract_first())
        favorNum = _stripOrNone(helpDiv.css(".supportNumber ::text").extract_first())
        readNum = _stripOrNone(helpDiv.css(".orange ::text").extract_first())
        # "Full-level essence" and "homepage recommended" badges.
        mjjh = mouthDiv.css(".mjjh-icon ::attr(src)").extract_first()
        sytj = mouthDiv.css(".sytj-icon ::attr(src)").extract_first()
        if mjjh:
            # Keep only the level number embedded in the icon file name.
            mjjh = mjjh[mjjh.rfind("-") + 1:mjjh.rfind(".")]
        if mjjh and sytj:
            # Homepage recommendation is encoded as "0". It must be a string:
            # the previous int 0 was falsy, so this branch could never run.
            mjjh = mjjh + "," + "0"
        return koubeiSid, koubeiTitle, publicTime, koubeiLink, koubeiSrc, readNum, commentNum, favorNum, mjjh

    def parseScoreItem(self, mouthDiv):
        """Parse purchase facts, per-category scores and purchase purposes.

        Returns ``(carItem, scoreItem, carPurposeList)`` where
        ``carItem = (price, buyTime, fuel, currentKm)`` and ``scoreItem`` is an
        8-tuple (space, power, handling, fuel, comfort, appearance, interior,
        value) whose entries are ``(scoreType, scoreValue)`` or None when the
        category is absent from the review.
        """
        buyTime = None
        price = None
        fuel = None
        currentKm = None
        scores = [None] * 8
        carPurposeList = list()
        for dl in mouthDiv.css(".choose-dl"):
            labelName = _stripOrNone(dl.css("dt ::text").extract_first())
            if not labelName:
                # No plain label: this is the combined fuel + mileage entry.
                for label, value in zip(dl.css("dt p::text"), dl.css("dd p::text")):
                    labelText = label.extract().strip()
                    if labelText in ("油耗", "能耗", "電耗"):
                        fuel = value.extract().strip()
                    if "目前行駛" == labelText:
                        currentKm = value.extract().strip()
                continue
            if "購買時間" == labelName:
                dateStr = _stripOrNone(dl.css("dd ::text").extract_first())
                try:
                    buyTime = datetime.datetime.strptime(dateStr, '%Y年%m月')
                except (TypeError, ValueError):
                    # Missing or unexpected date text: leave buyTime unset.
                    buyTime = None
                continue
            if "裸車購買價" == labelName:
                price = _stripOrNone(dl.css("dd ::text").extract_first())
                continue
            scoreValue = dl.css(".font-arial.c333 ::text").extract_first()
            if scoreValue and labelName in self._SCORE_LABELS:
                slot, typeKey = self._SCORE_LABELS[labelName]
                scores[slot] = (self.scoreType[typeKey], scoreValue.strip())
            if "購車目的" == labelName:
                for purpose in dl.css("p ::text"):
                    purposeVal = self.carPurposeDict.get(purpose.extract().strip())
                    # Blank text nodes and unknown purposes are skipped.
                    if purposeVal is not None:
                        carPurposeList.append(purposeVal)
        carItem = (price, buyTime, fuel, currentKm)
        scoreItem = tuple(scores)
        return carItem, scoreItem, carPurposeList

    def save(self):
        """Flush buffers: everything near the end of the crawl, else in batches.

        A batch is written only once a buffer holds more than 100 rows.
        """
        if self.requestCount > len(self.waitingCrawlIdSet) - 100:
            self.saveAll()
            return
        # Users.
        if len(self.insertUserList) > 100 or len(self.updateUserList) > 100:
            self.userCount += len(self.insertUserList)
            self.saveList(insertList=self.insertUserList, updateList=self.updateUserList,
                          insertSql=MySqlUtils.sql_insert_user, updateSql=MySqlUtils.sql_update_user)
        # Dealers.
        if len(self.insertDealerList) > 100:
            self.dealerCount += len(self.insertDealerList)
            self.saveList(insertList=self.insertDealerList, updateList=None,
                          insertSql=MySqlUtils.sql_insert_dealer, updateSql=None)
        # Review headers.
        if len(self.updateKoubeiHeadList) > 100 or len(self.insertKoubeiHeadList) > 100:
            self.koubeiHeadCount += len(self.insertKoubeiHeadList)
            self.saveList(insertList=self.insertKoubeiHeadList, updateList=self.updateKoubeiHeadList,
                          insertSql=MySqlUtils.sql_insert_koubei_head,
                          updateSql=MySqlUtils.sql_update_koubei_head)
        # Review scores.
        if len(self.insertUserScoreList) > 100:
            self.userScoreCount += len(self.insertUserScoreList)
            self.saveList(insertList=self.insertUserScoreList, updateList=None,
                          insertSql=MySqlUtils.sql_insert_koubei_score, updateSql=None)
        # Purchase purposes.
        if len(self.insertCarPurposeList) > 100:
            self.purposeCount += len(self.insertCarPurposeList)
            self.saveList(insertList=self.insertCarPurposeList, updateList=None,
                          insertSql=MySqlUtils.sql_insert_koubei_purpose, updateSql=None)

    def saveList(self, insertList, updateList, insertSql, updateSql):
        """Write one insert buffer and/or one update buffer, then clear them.

        Returns True when at least one buffer was written.
        """
        flag = False
        if insertList:
            MySqlUtils.updateList(insertSql, insertList)
            insertList.clear()
            flag = True
        if updateList:
            MySqlUtils.updateList(updateSql, updateList)
            updateList.clear()
            flag = True
        print("userCount:%s,dealerCount:%s,koubeiHeadCount:%s,userScoreCount:%s,purposeCount:%s,updateKoubeiCount:%s" % (
            self.userCount, self.dealerCount, self.koubeiHeadCount, self.userScoreCount,
            self.purposeCount, self.updateKoubeiCount))
        return flag

    def saveAll(self):
        """Flush every non-empty buffer to the database, regardless of size."""
        if self.insertUserList:
            self.userCount += len(self.insertUserList)
            MySqlUtils.updateList(sql=MySqlUtils.sql_insert_user, paramsList=self.insertUserList)
            self.insertUserList.clear()
        if self.updateUserList:
            MySqlUtils.updateList(sql=MySqlUtils.sql_update_user, paramsList=self.updateUserList)
            self.updateUserList.clear()
        if self.insertDealerList:
            self.dealerCount += len(self.insertDealerList)
            MySqlUtils.updateList(sql=MySqlUtils.sql_insert_dealer, paramsList=self.insertDealerList)
            self.insertDealerList.clear()
        if self.insertKoubeiHeadList:
            self.koubeiHeadCount += len(self.insertKoubeiHeadList)
            MySqlUtils.updateList(sql=MySqlUtils.sql_insert_koubei_head, paramsList=self.insertKoubeiHeadList)
            self.insertKoubeiHeadList.clear()
        if self.updateKoubeiHeadList:
            # Must be the UPDATE statement; the buffer itself was passed as SQL before.
            MySqlUtils.updateList(sql=MySqlUtils.sql_update_koubei_head, paramsList=self.updateKoubeiHeadList)
            self.updateKoubeiHeadList.clear()
        if self.insertUserScoreList:
            self.userScoreCount += len(self.insertUserScoreList)
            MySqlUtils.updateList(sql=MySqlUtils.sql_insert_koubei_score, paramsList=self.insertUserScoreList)
            self.insertUserScoreList.clear()
        if self.insertCarPurposeList:
            self.purposeCount += len(self.insertCarPurposeList)
            MySqlUtils.updateList(sql=MySqlUtils.sql_insert_koubei_purpose, paramsList=self.insertCarPurposeList)
            self.insertCarPurposeList.clear()
        print("userCount:%s,dealerCount:%s,koubeiHeadCount:%s,userScoreCount:%s,purposeCount:%s,updateKoubeiCount:%s" % (
            self.userCount, self.dealerCount, self.koubeiHeadCount, self.userScoreCount,
            self.purposeCount, self.updateKoubeiCount))

    def parseNextPageUrl(self, response, chexingId):
        """Return the absolute URL of the next page, or None on the last page.

        The page path runs from the last '/' of the "next" link up to its
        fragment; a link with no '/' (e.g. a javascript: href) means no next page.
        """
        nexPageUrl = response.css(".page-item-next ::attr(href)").extract_first()
        if not nexPageUrl:
            return None
        slashPos = nexPageUrl.rfind("/")
        if slashPos == -1:
            return None
        hashPos = nexPageUrl.rfind("#")
        # Without a fragment keep the whole tail instead of cutting its last char.
        page = nexPageUrl[slashPos:] if hashPos == -1 else nexPageUrl[slashPos:hashPos]
        if page != '':
            return self.nextPageUrl % (chexingId, page)
        return None

    def updateSpecScore(self):
        """Flush the model-level score buffers and the model-table updates."""
        print("--------------------->save:%s,update:%s,updateAvg:%s" % (
            len(self.scoreSaveList), len(self.scoreUpdateList), len(self.updateSpecAvgFuelList)))
        # Each buffer is copied and cleared before the write.
        if self.scoreSaveList:
            scoreListCopy = self.scoreSaveList.copy()
            self.scoreSaveList.clear()
            MySqlUtils.saveSpecDetailScore(scoreListCopy)
        if self.scoreUpdateList:
            scoreUpdateListCopy = self.scoreUpdateList.copy()
            self.scoreUpdateList.clear()
            MySqlUtils.updateSpecDetailScore(scoreUpdateListCopy)
        if self.updateSpecAvgFuelList:
            avgFuelListCopy = self.updateSpecAvgFuelList.copy()
            self.updateSpecAvgFuelList.clear()
            MySqlUtils.updateSpecAvgFuel(avgFuelListCopy)

    def verify(self, idStr, idSet, paramsList, idIndex, sql):
        """Return the rows of ``paramsList`` whose id is not yet in the database.

        ``idStr`` is an id list with one trailing separator character, which is
        dropped before it is substituted into ``sql``. For each missing id the
        first matching row of ``paramsList`` is returned.

        NOTE(review): ``sql % idStr`` builds SQL by string interpolation; the
        ids must be trusted or escaped by the caller.
        """
        idStr = idStr[:len(idStr) - 1]
        idRes = MySqlUtils.query(sql % idStr)
        if idRes:
            # Parsed ids minus stored ids = ids missing from the database.
            unexistIdSet = idSet - MySqlUtils.parseToSet(idRes, 0)
        else:
            unexistIdSet = idSet
        # Index the first row per id once instead of rescanning for every id.
        firstRowById = dict()
        for params in paramsList:
            firstRowById.setdefault(params[idIndex], params)
        return [firstRowById[rowId] for rowId in unexistIdSet if rowId in firstRowById]


class MySqlUtils(object):
    """SQL statements and small MySQL helpers used by the crawler.

    Every helper opens its own connection, prints any error instead of
    raising, and always releases the cursor and connection it opened.
    """

    vcar_host = "10.1.11.129"

    # Update average fuel, thumbnail and rater count on the model table.
    sql_update_chexing_imgs_fuel_num = """
        UPDATE `vcar_vcyber_com`.`vcar_chexing`
        SET `avgFuel` = %s, `numPeople` = %s, `updateTime` = %s, `imgUrlS` = %s
        WHERE `chexingID` = %s;
    """
    # Review ids that already have score rows.
    sql_query_user_score = """
        SELECT koubeiSid FROM vcar_vcyber_com.vcar_qczj_user_koubei_score;
    """
    # Insert a model-level average score.
    sql_insert_avg_score = """
        INSERT INTO `vcar_vcyber_com`.`vcar_qczj_score_chexing`
        (`sid`, `chexingID`, `scoreType`, `score`, `compareScore`)
        VALUES (%s, %s, %s, %s, %s);
    """
    # Update a model-level average score.
    sql_update_avg_score = """
        UPDATE `vcar_vcyber_com`.`vcar_qczj_score_chexing`
        SET `score` = %s, `compareScore` = %s, `updateTime` = %s
        WHERE `chexingID` = %s and scoreType= %s;
    """
    # Users.
    sql_query_user = """
        SELECT sid FROM vcar_vcyber_com.vcar_qczj_user;
    """
    sql_update_user = """
        UPDATE `vcar_vcyber_com`.`vcar_qczj_user`
        SET `userName` = %s, `headImg` = %s, `updateTime` = %s
        WHERE `sid` = %s;
    """
    sql_insert_user = """
        INSERT INTO `vcar_vcyber_com`.`vcar_qczj_user`
        (`sid`, `userName`, `homepageUrl`, `headImg`, `city`, `county`)
        VALUES (%s, %s, %s, %s, %s, %s);
    """
    # Dealers.
    sql_query_dealer = """
        SELECT sid FROM vcar_vcyber_com.vcar_qczj_dealer;
    """
    sql_insert_dealer = """
        INSERT INTO `vcar_vcyber_com`.`vcar_qczj_dealer`
        (`sid`, `dealerName`, `homepageUrl`, `city`, `county`)
        VALUES (%s, %s, %s, %s, %s );
    """
    # Review headers.
    sql_query_koubei_head = """
        SELECT sid FROM vcar_vcyber_com.vcar_qczj_user_koubei_head;
    """
    sql_insert_koubei_head = """
        INSERT INTO `vcar_vcyber_com`.`vcar_qczj_user_koubei_head`
        (`sid`, `title`, `publicTime`, `userSid`, `chexingID`, `buyTime`, `price`,
         `dealerId`, `city`, `county`, `currentKm`, `fuel`, `koubeiLink`,
         `favorNum`, `readNum`, `commentNum`, `koubeiSrc`, `mjjh`)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
    """
    sql_update_koubei_head = """
        UPDATE `vcar_vcyber_com`.`vcar_qczj_user_koubei_head`
        SET `currentKm` = %s, `fuel` = %s, `favorNum` = %s, `readNum` = %s,
            `commentNum` = %s, `updateTime` = %s, `mjjh` = %s
        WHERE `sid` = %s;
    """
    # Purchase purposes.
    sql_insert_koubei_purpose = """
        INSERT INTO `vcar_vcyber_com`.`vcar_qczj_user_koubei_purpose`
        (`sid`, `koubeiSid`, `purpose`)
        VALUES (%s, %s, %s);
    """
    # Per-review scores.
    sql_insert_koubei_score = """
        INSERT INTO `vcar_vcyber_com`.`vcar_qczj_user_koubei_score`
        (`sid`, `koubeiSid`, `scoreType`, `score`)
        VALUES (%s, %s, %s, %s);
    """

    @classmethod
    def getConnection(cls):
        """Open and return a new pymysql connection to the crawler database."""
        conn = pymysql.connect(host='localhost', user='root', passwd='root',
                               db='vcar_vcyber_com', port=3306, charset='utf8')
        return conn

    @classmethod
    def _run(cls, sql, params=None, many=False, fetch=False, errorLabel=None):
        """Execute one statement on a fresh connection.

        With ``fetch`` the rows are returned (a sequence of tuples); otherwise
        the statement is committed and None is returned. Any error is printed
        (preceded by ``errorLabel`` when given) and None is returned. The
        cursor and connection are closed only if they were actually opened.
        """
        conn = None
        cursor = None
        try:
            conn = cls.getConnection()
            cursor = conn.cursor()
            if many:
                cursor.executemany(sql, params)
            elif params is None:
                cursor.execute(sql)
            else:
                cursor.execute(sql, params)
            if fetch:
                return cursor.fetchall()
            conn.commit()
            return None
        except Exception as e:
            if errorLabel:
                print(errorLabel)
            print(e)
            return None
        finally:
            if cursor is not None:
                cursor.close()
            if conn is not None:
                conn.close()

    @classmethod
    def query(cls, sql):
        """Run a SELECT and return all rows, or None when the query failed."""
        return cls._run(sql, fetch=True)

    @classmethod
    def updateList(cls, sql, paramsList):
        """Execute ``sql`` once per parameter tuple in ``paramsList`` and commit."""
        cls._run(sql, paramsList, many=True)

    @classmethod
    def updateOne(cls, sql, params):
        """Execute ``sql`` with a single parameter tuple and commit."""
        cls._run(sql, params)

    @classmethod
    def parseToSet(cls, res, index):
        """Collect column ``index`` of every row in ``res`` into a set."""
        return {item[index] for item in res}

    @classmethod
    def parseToDict(cls, res, keyIndex, valueIndex):
        """Build a dict from two columns of ``res`` (later rows win)."""
        return {item[keyIndex]: item[valueIndex] for item in res}

    @classmethod
    def queryBrandId(cls):
        """Return all brand ids as rows, or None when the query failed."""
        sql = """
            SELECT `vcar_pinpai`.`pinpaiID` FROM `vcar_vcyber_com`.`vcar_pinpai`;
        """
        return cls._run(sql, fetch=True)

    @classmethod
    def querySeriesLink(cls):
        """Return series rows (brandId, seriesId, seriesLink), or None on failure."""
        sql = """
            SELECT `vcar_chexi`.`pinpaiID`, `vcar_chexi`.`chexiID`, `vcar_chexi`.`url`
            FROM `vcar_vcyber_com`.`vcar_chexi`;
        """
        return cls._run(sql, fetch=True)

    @classmethod
    def insertSpecItemList(cls, itemList):
        """Batch-insert model rows (chexingID, pinpaiID, chexiID, name, url)."""
        sql = """
            INSERT INTO `vcar_vcyber_com`.`vcar_chexing`
            (`chexingID`, `pinpaiID`, `chexiID`, `name`, `url`)
            VALUES (%s, %s, %s, %s, %s);
        """
        cls._run(sql, itemList, many=True)

    @classmethod
    def querySpec(cls):
        """Return model rows (chexingID, chexiID, pinpaiID, name), or None on failure."""
        sql = """
            SELECT chexingID, chexiID, pinpaiID, name FROM vcar_vcyber_com.vcar_chexing
        """
        return cls._run(sql, fetch=True)

    @classmethod
    def queryEnum(cls, labelCd):
        """Return (optionName, optionValue) rows of the enum ``labelCd``."""
        # The label is bound as a parameter instead of being formatted into the SQL.
        sql = """
            SELECT optionName, optionValue FROM vcar_vcyber_com.vcar_dic
            WHERE labelCd = %s;
        """
        return cls._run(sql, (labelCd,), fetch=True)

    @classmethod
    def saveSpecDetailScore(cls, scoreList):
        """Batch-insert model-level average scores."""
        cls._run(cls.sql_insert_avg_score, scoreList, many=True)

    @classmethod
    def updateSpecDetailScore(cls, updateList):
        """Batch-update model-level average scores."""
        cls._run(cls.sql_update_avg_score, updateList, many=True)

    @classmethod
    def querySavedScoreSepc(cls):
        """Return the distinct model ids that already have average-score rows."""
        sql = """
            SELECT DISTINCT (chexingID) AS chexingID
            FROM vcar_vcyber_com.vcar_qczj_score_chexing;
        """
        return cls._run(sql, fetch=True)

    @classmethod
    def updateSpecAvgFuel(cls, paramsList):
        """Batch-update average fuel, rater count and thumbnail of models.

        Each tuple is (avgFuel, numPeople, updateTime, imgUrlS, chexingID).
        """
        cls._run(cls.sql_update_chexing_imgs_fuel_num, paramsList, many=True,
                 errorLabel="error:updateSpecAvgFuel")

    @classmethod
    def parseToChexingIdSet(cls, res):
        """Collect the model id (column 0) of every row into a set."""
        return cls.parseToSet(res, 0)

    @classmethod
    def parseToSeriesIdSet(cls, res):
        """Collect the series id (column 1) of every row into a set."""
        return cls.parseToSet(res, 1)

    @classmethod
    def findChexiInChexiSet(cls, seriesItems, seriesIdSet):
        """Return, for each id in ``seriesIdSet``, the first row whose column 1 matches."""
        firstItemById = dict()
        for item in seriesItems:
            firstItemById.setdefault(item[1], item)
        return [firstItemById[seriesId] for seriesId in seriesIdSet if seriesId in firstItemById]

    @classmethod
    def parseEnum(cls, res):
        """Build a {optionName: optionValue} dict from enum rows."""
        return cls.parseToDict(res, 0, 1)


class Point(object):
    """Checkpoint manager backed by two files in the ``temp`` sub-directory.

    ``point.txt`` holds the comma-separated ids crawled so far; an existing
    ``over.txt`` marks that the previous crawl ran to completion.
    """

    # Marker file written when a crawl finishes normally.
    overFilePath = None
    # Checkpoint file.
    pointFilePath = None

    @classmethod
    def init(cls):
        """Resolve the checkpoint file paths next to this source file."""
        currentDir = os.path.dirname(os.path.abspath(__file__))
        tempDir = os.path.join(currentDir, "temp")
        # Make sure the directory exists so the files can be created later.
        os.makedirs(tempDir, exist_ok=True)
        Point.overFilePath = os.path.join(tempDir, "over.txt")
        Point.pointFilePath = os.path.join(tempDir, "point.txt")

    @classmethod
    def cutInto(cls, total):
        """Return the ids of ``total`` that still have to be crawled.

        If the "over" marker exists the previous crawl was complete: the
        marker is removed, the checkpoint is emptied and ``total`` itself is
        returned. Otherwise the ids recorded in the checkpoint are removed
        from ``total``; a missing checkpoint file counts as empty.
        """
        print("--------------------cutInto--------------")
        if os.path.exists(Point.overFilePath):
            print(Point.overFilePath)
            os.remove(Point.overFilePath)
            # Truncate the checkpoint file.
            with open(Point.pointFilePath, "w", encoding="utf-8") as f:
                f.write("")
            waitingCrawlIdSet = total
        else:
            try:
                with open(Point.pointFilePath, "r", encoding="utf-8") as pointFile:
                    lines = pointFile.read()
            except FileNotFoundError:
                # First run: nothing has been crawled yet.
                lines = ""
            crawledIdSet = {token.strip() for token in lines.split(",") if token.strip()}
            # Compare as text so ids match whatever type the database returned.
            waitingCrawlIdSet = {item for item in total if str(item) not in crawledIdSet}
        print("總共需要爬取%s,上次已爬取%s,本次需爬取%s" % (
            len(total), len(total) - len(waitingCrawlIdSet), len(waitingCrawlIdSet)))
        return waitingCrawlIdSet

    @classmethod
    def savePoint(cls, data):
        """Append ``data`` to the checkpoint file."""
        with open(cls.pointFilePath, "a", encoding="utf-8") as f:
            f.write(data)

    @classmethod
    def savePointFromSet(cls, setData):
        """Append every id of ``setData`` to the checkpoint, each followed by a comma."""
        Point.savePoint("".join(str(item) + "," for item in setData))

    @classmethod
    def complete(cls):
        """Create the empty "over" marker file: the crawl finished normally."""
        with open(cls.overFilePath, "w", encoding="utf-8") as overFile:
            overFile.write("")


# Launch the crawler.
se = AutoSelenium()
se.start_requests()