1. 程式人生 > >Python 爬蟲實戰 汽車某家(五) 口碑、評分

Python 爬蟲實戰 汽車某家(五) 口碑、評分

文章目錄

爬取內容
1、使用者口碑明細評分
2、口碑標題、發表日期、口碑推薦級別
3、購車目的
4、購車價格
5、購車經銷商

一、專案結構

在這裡插入圖片描述
point.txt 為斷點儲存檔案,用於記錄已爬取的車型id;over.txt 為爬取結束標識檔案,僅作為標記,不存放任何內容

二、核心類程式碼



import time,json,re,random,datetime
from io import BytesIO
from PIL import Image
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy import Selector

class AutoSelenium(object):
    """Selenium-driven crawler for the review ("koubei") pages of k.autohome.com.cn.

    The attributes below are class-level defaults.  NOTE(review): the list and
    set defaults are mutable objects created once on the class, so they are
    shared by every instance unless an instance rebinds them.
    """
    requestCount=0
    # Review page URL; the placeholder is the car-model (spec) id
    kouBeiUrl = "https://k.autohome.com.cn/spec/%s"
    # Pagination URL: model id followed by the page path
    nextPageUrl="https://k.autohome.com.cn/spec/%s%s"
    # All car-model ids
    chexingIdSet=None
    # Average-score rows waiting to be batch-inserted into the database
    scoreSaveList=list()
    # Average-score rows waiting to be batch-updated in the database
    scoreUpdateList=list()
    # Model ids that already have saved scores
    savedChexingIdSet=None
    # Rows used to update the average energy consumption on the model table
    updateSpecAvgFuelList=list()

    # Users waiting to be updated
    updateUserList=list()
    # Users waiting to be inserted
    insertUserList=list()
    # User ids already queued during this run
    crawlingUserIdSet=set()

    # Dealers waiting to be inserted
    insertDealerList=list()
    # Dealer ids already queued during this run
    crawlingDealerIdSet=set()
    # Per-review user scores waiting to be inserted
    insertUserScoreList=list()


    # Review head rows waiting to be inserted
    insertKoubeiHeadList=list()
    # Review head rows waiting to be updated
    updateKoubeiHeadList=list()
    # Review ids already stored in the database
    existKoubeiHeadIdSet=set()

    # Purchase-purpose rows waiting to be inserted
    insertCarPurposeList=list()

    # Checkpoint support: model ids still waiting to be crawled
    waitingCrawlIdSet=None
    # Checkpoint support: model ids already crawled
    crawledIdSet=set()


    # Number of models crawled
    specCount=0
    # Number of users saved
    userCount=0
    # Number of dealers saved
    dealerCount=0
    # Number of review head rows saved
    koubeiHeadCount=0
    # Number of review head rows updated
    updateKoubeiCount=0
    # Number of user scores saved
    userScoreCount=0
    # Number of purchase purposes saved
    purposeCount=0


    # Score-category tuple, matching scoreItem: (spaceScore, powerScore, operateScore, fuelScore, comfortScore, appearanceScore, interiorScore, costScore)
    scoreTupe=()


    def __init__(self):
        """Start Chrome and preload the lookup tables used while crawling.

        Loads from the database, in order: all model ids, the score-type
        enum, the model ids that already have scores, the purchase-purpose
        enum, and the existing user / dealer / review / user-score ids used
        for de-duplication.  Finally initialises the checkpoint and derives
        the set of model ids that still have to be crawled.
        """
        # Selenium driver plus a 20-second explicit wait bound to it
        driver = webdriver.Chrome()
        self.browser = driver
        self.wait = WebDriverWait(driver, 20)

        # Every known car-model id
        self.chexingIdSet = MySqlUtils.parseToChexingIdSet(MySqlUtils.querySpec())

        # Score-type enum; the energy-consumption label variants are all
        # registered under the value 4
        scoreTypes = MySqlUtils.parseEnum(MySqlUtils.queryEnum('qczj_score'))
        self.scoreType = scoreTypes
        for energyLabel in ("能耗", "電耗", "耗電量"):
            scoreTypes[energyLabel] = 4

        # Model ids whose scores were saved before
        self.savedChexingIdSet = MySqlUtils.parseToChexingIdSet(
            MySqlUtils.querySavedScoreSepc())

        # Purchase-purpose enum
        self.carPurposeDict = MySqlUtils.parseEnum(
            MySqlUtils.queryEnum('qczj_car_purpose'))

        # Existing ids (first column of each result), used for de-duplication
        self.existUserIdSet = MySqlUtils.parseToSet(
            MySqlUtils.query(MySqlUtils.sql_query_user), 0)
        self.existDealerIdSet = MySqlUtils.parseToSet(
            MySqlUtils.query(MySqlUtils.sql_query_dealer), 0)
        self.existKoubeiId = MySqlUtils.parseToSet(
            MySqlUtils.query(MySqlUtils.sql_query_koubei_head), 0)
        self.existUserScoreKoubeiIdSet = MySqlUtils.parseToSet(
            MySqlUtils.query(MySqlUtils.sql_query_user_score), 0)

        # Initialise the checkpoint, then resume from it
        Point.init()
        self.waitingCrawlIdSet = Point.cutInto(self.chexingIdSet)


    def start_requests(self):
        for chexingId in self.waitingCrawlIdSet:
        # for chexingId in ['23009']:
            # 已爬取的id儲存到已爬取的id集合中
            self.crawledIdSet.add(chexingId)
            self.specCount += 1
            # 拼接請求連結
            url = self.kouBeiUrl % chexingId
            while True:
                # 請求連結
                if not self.requestConfigLink(url):
                    # 請求超時,證明沒有資料,返回請求下一個
                    print("請求超時")
                    # 超時原因是由於驗證沒有通過而導致,可以不儲存到已爬集合中,當爬完結束後並不會生成over斷點檔案,因此再次啟動時就會將超時的車型再爬一遍,從而提高資料的完整度
                    self.crawledIdSet.remove(chexingId)
                    self.specCount -= 1
                    break
                # 解析
                # 取出page_source
                page_source = self.browser.page_source
                # 轉selector取值
                response = Selector(text=page_source)
                # 解析平均分
                # self.parseAvgScore(response,chexingId)
                # 解析使用者、購買資訊、經銷商、評分、購車用途
                self.parse(response,chexingId)
                # self.parseUserCar(response,chexingId)
                # self.parseDealer(response)
                # self.parseScoreDetail(response,chexingId)
                # 解析下一頁
                url=self.parseNextPageUrl(response,chexingId)
                print("~~~~~~~~~~~~~~~~~~~~~~~~~~~url:%s" % url)
                if url == None:
                    break
            # 將爬取過的車型id記錄到斷點檔案中,每爬取10個車型儲存一次
            if len(self.crawledIdSet) > 10:
                crawledIdSetCopy = self.crawledIdSet.copy()
                self.crawledIdSet.clear()
                Point.savePointFromSet(crawledIdSetCopy)
        # 出迴圈爬取結束,完成斷點,關閉瀏覽器
        Point.savePointFromSet(self.crawledIdSet)
        if self.specCount >= len(self.waitingCrawlIdSet):
            Point.complete()
        self.browser.close()



    def requestConfigLink(self, url):
        success=True
        wait=None
        try:
            #configUrl="https://car.autohome.com.cn/config/spec/20211.html" #該請求沒有配置資料
            self.requestCount+=1
            self.browser.get(url)
            wait = WebDriverWait(self.browser, 10)
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.date-ul.fn-left')))
            time.sleep(random.randint(1,3))  # 若不加一個會發生頁面沒有完全渲染
        except Exception as e:
            print(e)
            success=False
        if not success:
            try:
                ele = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'geetest_radar_tip')))
                ele.click()
                wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.date-ul.fn-left')))
                success=True
            except Exception as e:
                success = False
        return success



    # 解析所有使用者對車型的平均評分
    def parseAvgScore(self, response,chexingId):
        # 取出參與評分人數:font-arial red font-16
        numPeople = response.css(".font-arial.red.font-16 ::text").extract_first()
        # 取出平均油耗:font-arial font-number
        avgPower=response.css(".font-arial.font-number ::text").extract_first()
        # 取出車型代表圖片小圖
        imgUrlS = response.css(".appraise-cont-dl.fn-left img::attr(src)").extract_first()

        # 新增到待儲存的集合中
        if avgPower or numPeople or imgUrlS:
            updateDate=datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            self.updateSpecAvgFuelList.append((avgPower,numPeople,updateDate,imgUrlS,chexingId))

        # 取出評分明細類目
        ulArr = response.css(".date-ul.fn-left")
        for ul in ulArr:
            for i, li in enumerate(ul.css("li")):
                if i == 0:
                    continue
                # 取出當前類別值
                optionName = li.css(".width-01 ::text").extract_first()
                optionName = optionName.rstrip()
                optionName = optionName.lstrip()

                # 取出當前類別的評分
                optionValue = li.css(".width-02 ::text").extract_first()
                optionValue = optionValue.rstrip()
                optionValue = optionValue.lstrip()
                if optionValue == "-":
                    optionValue = None
                    break


                # 取出當前類別的高於/低於
                # 取值
                cpValue = li.css(".width-03").xpath("string(.)").extract_first()
                cpValue = cpValue.rstrip()
                cpValue = cpValue.lstrip()
                cpValue = re.search(r'\d+(\.\d+)?', cpValue)
                if cpValue:
                    cpValue = cpValue.group()
                # 判斷是否存在子元素i
                if li.css(".width-03 i"):
                    # 根據css名稱 判斷高於低於
                    cssClassName = li.css(".width-03 i::attr(class)").extract_first()
                    pcssName, subCssName = cssClassName.split(" ")
                    if subCssName == 'icon-dy':
                        cpValue = -1 * float(cpValue)


                scoreType=self.scoreType[optionName]
                sid=chexingId + "_" + str(random.randint(100000, 999999))
                if chexingId in self.savedChexingIdSet:
                    updateTime=datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    self.scoreUpdateList.append((optionValue,cpValue,updateTime,chexingId,scoreType))
                else:
                    self.scoreSaveList.append((sid,chexingId,scoreType,optionValue,cpValue))

            # 儲存到資料庫,當爬取接近尾聲時,每個車型儲存一次,否則多個車型批量儲存
            if self.specCount > len(self.chexingIdSet) - 100:
                self.updateSpecScore()

            else:
                if len(self.scoreSaveList) >= 100 or len(self.scoreUpdateList) >= 100 :
                    self.updateSpecScore()


    # 解析入口
    def parse(self,response,chexingId):
        updateTime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        # 解析平均分
        # self.parseAvgScore(response,chexingId)

        # 一個頁面多條口碑,解析口碑div集合
        mouthconDivs=response.css(".mouthcon")
        for mouthDiv in mouthconDivs:

            # 解析使用者資訊 # print("userid:%s,name:%s,homepage:%s,img:%s" % (userid, userName, userPage, headImgUrl))
            userid, userName, userPage ,headImgUrl = self.parseUserInfo(mouthDiv)

            # 解析經銷商資訊 # print("city:%,county:%s,dealerName:%s,dealerId:%s,dealerHomePage:%s" % (city,county,dealerName,dealerId,dealerHomePage))
            city, county, dealerName, dealerId, dealerHomePage = self.parseDealer(mouthDiv)

            # 解析購買時間、購買價格、油耗、行駛里程;空間、動力、操控、油耗、舒適性、外觀、內飾、價效比;購車目的集合
            # carItem = (price, buyTime, fuel, currentKm)
            # scoreItem = (spaceScore, powerScore, operateScore, fuelScore, comfortScore, appearanceScore, interiorScore, costScore)
            carItem, scoreItem, carPurposeList = self.parseScoreItem(mouthDiv)

            # 解析口碑主鍵、口碑連結、口碑主題、口碑第一次發表時間、口碑來源 、閱讀人數、評論人數、支援人數、滿級精華標識
            koubeiSid, koubeiTitle, publicTime, koubeiLink, koubeiSrc, readNum, commentNum, favorNum,mjjh = self.parseKoubei(mouthDiv)


            # 儲存口碑主表,若已存在則更新
            if koubeiSid not in self.existKoubeiId:
                insertKoubeiParams = (
                koubeiSid, koubeiTitle, publicTime, userid, chexingId, carItem[1], carItem[0], dealerId, city, county,
                carItem[3], carItem[2], koubeiLink, favorNum, readNum, commentNum, koubeiSrc,mjjh)
                self.insertKoubeiHeadList.append(insertKoubeiParams)
            else:
                self.updateKoubeiCount += 1
                updateKoubeiParams = (
                carItem[3], carItem[2], favorNum, readNum, commentNum, updateTime, mjjh, koubeiSid)
                self.updateKoubeiHeadList.append(updateKoubeiParams)


            # 儲存使用者資訊
            if userid not in self.existUserIdSet and userid not in self.crawlingUserIdSet:
                insertUserParams = (userid, userName, userPage, headImgUrl, city, county)
                self.insertUserList.append(insertUserParams)
                self.crawlingUserIdSet.add(userid)

            # 儲存使用者評分和購車用途資訊
            if scoreItem or carPurposeList:
                if koubeiSid not in self.existUserScoreKoubeiIdSet:
                    # 儲存使用者評分
                    for item in scoreItem:
                        userScoreSid=userid+"_"+chexingId+"_"+koubeiSid+"_"+str(random.randint(100000,999999))
                        insertUserScoreParams=(userScoreSid,koubeiSid,item[0],item[1])
                        self.insertUserScoreList.append(insertUserScoreParams)
                    # 儲存購車用途
                    for purposeVal in carPurposeList:
                        purposeSid=userid+"_"+chexingId+"_"+koubeiSid+"_"+str(random.randint(100000,999999))
                        insertPurposeParams=(purposeSid,koubeiSid,purposeVal)
                        self.insertCarPurposeList.append(insertPurposeParams)

            # 儲存經銷商資訊集合中
            if dealerId:
                if dealerId not in self.existDealerIdSet and dealerId not in self.crawlingDealerIdSet:
                    insertDealerParams = (dealerId,dealerName,dealerHomePage,city,county)
                    self.insertDealerList.append(insertDealerParams)
                    self.crawlingDealerIdSet.add(dealerId)
            # print("buyTime:%s,price:%s,fuel:%s,currentKm:%s" % (
            # buyTime.strftime("%Y-%m-%d %H:%M:%S"), price, fuel, currentKm))
            # print("spaceScore:%s,powerScore:%s,operateScore:%s,fuelScore:%s,comfortScore:%s,appearanceScore:%s,interiorScore:%s,costScore:%s" % (
            #     spaceScore, powerScore, operateScore, fuelScore, comfortScore, appearanceScore, interiorScore,
            #     costScore))


            # print("koubeiSid:%s,koubeiTitle%s,publicTime:%s,koubeiSrc:%s,koubeiLink:%s" % (koubeiSid,koubeiTitle,publicTime,koubeiSrc,koubeiLink))
            # print("favorNum:%s,readNum:%s,commentNum:%s" % (favorNum,readNum,commentNum))

            # 存入使用者集合
            userParams=(userid,userName,userPage,headImgUrl,city,county)
            # self.saveToUserList(userParams)

        # 出迴圈一個頁面解析完成,當爬到末尾時逐條儲存,否則批量儲存
        self.save()








    # 解析使用者資訊
    def parseUserInfo(self,mouthDiv):
        nameText = mouthDiv.css(".name-text")
        userName = nameText.xpath("string(.)").extract_first().strip()
        userPage = nameText.css("a ::attr(href)").extract_first().strip()
        userid = userPage[userPage.rfind("/") + 1:]
        headImgUrl = mouthDiv.css(".name-pic img::attr(data-src)").extract_first()
        return userid,userName,userPage,headImgUrl


    # 解析經銷商資訊
    def parseDealer(self,mouthDiv):
        # 解析地帶呢
        county = None
        city = None
        place = mouthDiv.css(".c333 ::text").extract_first().strip()
        placeSplit = place.split(" ")
        if len(placeSplit) > 1:
            city = placeSplit[0]
            county = placeSplit[1]
        else:
            city = place

        # 解析經銷商
        dealerLink = mouthDiv.css(".js-dearname ::attr(href)").extract_first()
        # print("dealerLink:%s" % dealerLink)
        dealerId = None
        dealerName=None
        dealerHomePage = None
        if dealerLink:
            # 獲取經銷商id
            dealerId = dealerLink[dealerLink.rfind("/") + 1:dealerLink.rfind("#")]
            # # 經銷商主頁
            dealerHomePage = dealerLink[:dealerLink.rfind("#")]
            dealerName=mouthDiv.css(".js-dearname ::text").extract_first().strip()

        return city,county,dealerName,dealerId,dealerHomePage


    # 解析口碑概要資訊
    def parseKoubei(self,mouthDiv):
        koubeiSrc = None
        titleItem = mouthDiv.css(".title-name.name-width-01")
        # 解析口碑來源
        koubeiSrcText = titleItem.css("span ::text").extract_first()
        if koubeiSrcText:
            koubeiSrc = koubeiSrcText.split(":")[1]
        # 解析發表時間
        publicTime = titleItem.css("b a::text").extract_first()
        # 解析口碑連結
        koubeiLink = titleItem.css("b a::attr(href)").extract_first()
        # 解析口碑id
        koubeiSid = koubeiLink[koubeiLink.rfind("_") + 1:koubeiLink.rfind(".")]
        # 解析口碑主題
        koubeiTitle = None
        if len(titleItem.css("a ::text")) > 1:
            koubeiTitle=titleItem.css("a ::text")[1].extract()
        helpDiv = mouthDiv.css(".help")
        # 解析評論人數
        commentNum = helpDiv.css(".font-arial.CommentNumber ::text").extract_first().strip()
        # 解析支援人數
        favorNum = helpDiv.css(".supportNumber ::text").extract_first().strip()
        # 解析閱讀人數
        readNum = helpDiv.css(".orange ::text").extract_first().strip()
        # 解析滿級精華
        mjjh=mouthDiv.css(".mjjh-icon ::attr(src)").extract_first()
        # 解析首頁推薦
        sytj=mouthDiv.css(".sytj-icon ::attr(src)").extract_first()
        if mjjh:
            # 擷取精華數字
            mjjh=mjjh[mjjh.rfind("-")+1:mjjh.rfind(".")]
        if sytj:
            # 轉換首頁推薦為數字
            sytj=0
        if mjjh and sytj:
            mjjh=mjjh+","+sytj



        return koubeiSid,koubeiTitle,publicTime,koubeiLink,koubeiSrc,readNum,commentNum,favorNum,mjjh


    # 解析購買價格、評分明細、及購車用途資訊
    def parseScoreItem(self,mouthDiv):
        buyTime = None
        price = None
        fuel = None
        currentKm = None

        spaceScore = None
        powerScore = None
        operateScore = None
        fuelScore = None
        comfortScore = None
        appearanceScore = None
        interiorScore = None
        costScore = None

        carPurposeList = list()
        dlArr = mouthDiv.css(".choose-dl")
        for dl in dlArr:
            # 解析左側標籤名稱
            labelName = dl.css("dt ::text").extract_first().strip()
            # print(labelName)
            # print("type:",type(labelName))
            # 如果沒有取到標籤名則證明是油耗及行駛里程二合一標籤
            if labelName == None or labelName == "":
                doubleLabel = dl.css("dt p::text")
                doubleValue = dl.css("dd p::text")
                # print("doubleLabelLength:%s" % len(doubleLabel))
                # print(doubleLabel)
                # print("doubleValueLength:%s" % len(doubleValue))
                # print(doubleValue)
                for i, label in enumerate(doubleLabel):
                    labelText = label.extract().strip()
                    if "油耗" == labelText or "能耗" == labelText or "電耗" == labelText:
                        fuel = doubleValue[i].extract().strip()
                    if "目前行駛" == labelText:
                        currentKm = doubleValue[i].extract().strip()
                continue

            labelName.strip()
            if "購買時間" == labelName:
                dateStr = dl.css("dd ::text").extract_first().strip()
                buyTime = datetime.datetime.strptime(dateStr, '%Y年%m月')  # 若出現插入資料庫報錯,需要格式化成字串
                continue
            if "裸車購買價" == labelName:
                price = dl.css("dd ::text").extract_first().strip()
                continue

            scoreValue = dl.css(".font-arial.c333 ::text").extract_first()
            if scoreValue:
                labelName = labelName.strip()
                scoreValue.strip()
                # print("labelName:%s" % labelName)
                # print(labelName == "油耗")
                if "空間" == labelName:
                    spaceScore =(self.scoreType['空間'], scoreValue)
                if "動力" == labelName:
                    powerScore = (self.scoreType['動力'], scoreValue)
                if "操控" == labelName:
                    operateScore = (self.scoreType['操控'], scoreValue)
                if "油耗" == labelName or "能耗" == labelName or "電耗" == labelName or "耗電量" == labelName:
                    fuelScore = (self.scoreType['油耗'],scoreValue)
                if "舒適性" == labelName:
                    comfortScore = (self.scoreType['舒適性'],scoreValue)
                if "外觀" == labelName:
                    appearanceScore = (self.scoreType['外觀'],scoreValue)
                if "內飾" == labelName:
                    interiorScore = (self.scoreType['內飾'],scoreValue)
                if "價效比" == labelName:
                    costScore = (self.scoreType['價效比'],scoreValue)

            # 解析購車目的
            if "購車目的" == labelName:
                purposeArr = dl.css("p ::text")
                for purpose in purposeArr:
                    purpose=purpose.extract().strip()
                    purposeVal=self.carPurposeDict[purpose]
                    carPurposeList.append(purposeVal)

        carItem = (price, buyTime, fuel, currentKm)
        scoreItem = (spaceScore, powerScore, operateScore, fuelScore, comfortScore, appearanceScore, interiorScore, costScore)
        return carItem,scoreItem,carPurposeList


    # 儲存入口
    def save(self):
        # 當爬取接近尾聲時,每次儲存一次,否則為集合中的元素大於100時再做批量儲存
        if self.requestCount > len(self.waitingCrawlIdSet) -100:
            self.saveAll()
        else:
            # 儲存或更新使用者
            if len(self.insertUserList) > 100 or len(self.updateUserList) > 100:
                self.userCount += len(self.insertUserList)
                self.saveList(insertList=self.insertUserList,updateList=self.updateUserList,insertSql=MySqlUtils.sql_insert_user,updateSql=MySqlUtils.sql_update_user)
            # 儲存經銷商
            if len(self.insertDealerList) > 100 :
                self.dealerCount += len(self.insertDealerList)
                self.saveList(insertList=self.insertDealerList,updateList=None,insertSql=MySqlUtils.sql_insert_dealer,updateSql=None)
            # 儲存或更新口碑主表
            if len(self.updateKoubeiHeadList) > 100 or len(self.insertKoubeiHeadList) > 100:
                self.koubeiHeadCount += len(self.insertKoubeiHeadList)
                self.saveList(insertList=self.insertKoubeiHeadList,updateList=self.updateKoubeiHeadList,insertSql=MySqlUtils.sql_insert_koubei_head,updateSql=MySqlUtils.sql_update_koubei_head)
            # 儲存使用者口碑評分
            if len(self.insertUserScoreList) > 100:
                self.userScoreCount += len(self.insertUserScoreList)
                self.saveList(insertList=self.insertUserScoreList,updateList=None,insertSql=MySqlUtils.sql_insert_koubei_score,updateSql=None)
            #  儲存使用者購車目的
            if len(self.insertCarPurposeList) > 100:
                self.purposeCount += len(self.insertCarPurposeList)
                self.saveList(insertList=self.insertCarPurposeList,updateList=None,insertSql=MySqlUtils.sql_insert_koubei_purpose,updateSql=None)

    # 批量儲存
    def saveList(self,insertList,updateList,insertSql,updateSql):
        flag=False
        if insertList and len(insertList) > 0:
            MySqlUtils.updateList(insertSql,insertList)
            insertList.clear()
            flag=True
        if updateList and len(updateList) > 0:
            MySqlUtils.updateList(updateSql,updateList)
            updateList.clear()
            flag=True
        print("userCount:%s,dealerCount:%s,koubeiHeadCount:%s,userScoreCount:%s,purposeCount:%s,updateKoubeiCount:%s" % (
            self.userCount, self.dealerCount, self.koubeiHeadCount, self.userScoreCount, self.purposeCount,self.updateKoubeiCount))
        return flag


    # 按次儲存
    def saveAll(self):
        # 儲存使用者
        if len(self.insertUserList) > 0:
            self.userCount += len(self.insertUserList)
            MySqlUtils.updateList(sql=MySqlUtils.sql_insert_user,paramsList=self.insertUserList)
            self.insertUserList.clear()
        # 更新使用者
        if len(self.updateUserList) > 0:
            MySqlUtils.updateList(sql=MySqlUtils.sql_update_user,paramsList=self.updateUserList)
            self.updateUserList.clear()

        # 儲存經銷商
        if len(self.insertDealerList) > 0 :
            self.dealerCount += len(self.insertDealerList)
            MySqlUtils.updateList(sql=MySqlUtils.sql_insert_dealer,paramsList=self.insertDealerList)
            self.insertDealerList.clear()

        # 儲存使用者口碑主表
        if len(self.insertKoubeiHeadList) >0:
            self.koubeiHeadCount += len(self.insertKoubeiHeadList)
            MySqlUtils.updateList(sql=MySqlUtils.sql_insert_koubei_head,paramsList=self.insertKoubeiHeadList)
            self.insertKoubeiHeadList.clear()
        # 更新使用者口碑主表
        if len(self.updateKoubeiHeadList) >0:
            MySqlUtils.updateList(sql=self.updateKoubeiHeadList,paramsList=self.updateKoubeiHeadList)
            self.updateKoubeiHeadList.clear()

        # 儲存使用者口碑評分
        if len(self.insertUserScoreList) > 0:
            self.userScoreCount += len(self.insertUserScoreList)
            MySqlUtils.updateList(sql=MySqlUtils.sql_insert_koubei_score,paramsList=self.insertUserScoreList)
            self.insertUserScoreList.clear()

        # 儲存購車目的
        if len(self.insertCarPurposeList) > 0:
            self.purposeCount += len(self.insertCarPurposeList)
            MySqlUtils.updateList(sql=MySqlUtils.sql_insert_koubei_purpose,paramsList=self.insertCarPurposeList)
            self.insertCarPurposeList.clear()

        print("userCount:%s,dealerCount:%s,koubeiHeadCount:%s,userScoreCount:%s,purposeCount:%s,updateKoubeiCount:%s" % (self.userCount,self.dealerCount,self.koubeiHeadCount,self.userScoreCount,self.purposeCount,self.updateKoubeiCount))


    # 解析下一頁連結
    def parseNextPageUrl(self,response,chexingId):
        nexPageUrl=response.css(".page-item-next ::attr(href)").extract_first()
        if nexPageUrl:
            page=nexPageUrl[nexPageUrl.rfind("/"):nexPageUrl.rfind("#")]
            if page != '':
                return self.nextPageUrl % (chexingId, page)
        return None



    # 更新車型平均評分表;更新車型表車型縮圖、油耗、評論人數
    def updateSpecScore(self):
        print("--------------------->save:%s,update:%s,updateAvg:%s" % (len(self.scoreSaveList),len(self.scoreUpdateList),len(self.updateSpecAvgFuelList)))
        # 插入評分表
        if len(self.scoreSaveList) > 0:
            scoreListCopy = self.scoreSaveList.copy()
            self.scoreSaveList.clear()
            MySqlUtils.saveSpecDetailScore(scoreListCopy)

        # 更新評分表
        if len(self.scoreUpdateList) > 0:
            scoreUpdateListCopy = self.scoreUpdateList.copy()
            self.scoreUpdateList.clear()
            MySqlUtils.updateSpecDetailScore(scoreUpdateListCopy)

        # 更新車型表平均油耗、評論人數、車型縮圖
        if len(self.updateSpecAvgFuelList) > 0:
            avgFuelListCopy = self.updateSpecAvgFuelList.copy()
            self.updateSpecAvgFuelList.clear()
            MySqlUtils.updateSpecAvgFuel(avgFuelListCopy)


    # 判重方法
    def verify(self,idStr,idSet,paramsList,idIndex,sql):
        idStr = idStr[:len(idStr) - 1]
        idRes = MySqlUtils.query(sql % idStr)
        unexistIdSet = None
        if idRes:
            existIdSet = MySqlUtils.parseToSet(idRes, 0)
            # 用解析的使用者id集合減去已經存在的id集合就是不存在資料庫中的使用者ID集合
            unexistIdSet = idSet - existIdSet
        else:
            unexistIdSet = idSet
        # 儲存使用者
        unExistParamsList = list()
        for id in unexistIdSet:
            for params in paramsList:
                if id == params[idIndex]:
                    unExistParamsList.append(params)
                    break
        return unExistParamsList




import pymysql
class MySqlUtils(object):
    """SQL statements and thin pymysql helpers used by the crawler.

    Every database method opens its own connection, prints any exception
    instead of propagating it (query methods then return None), and always
    releases the cursor and the connection afterwards.
    """
    # Database host address.
    # NOTE(review): getConnection() connects to 'localhost' and never reads this value.
    vcar_host="10.1.11.129"
    # Update average fuel consumption, thumbnail and reviewer count in the model table
    sql_update_chexing_imgs_fuel_num="""
    UPDATE `vcar_vcyber_com`.`vcar_chexing`
                SET
                    `avgFuel` =  %s,
                    `numPeople` = %s,
                    `updateTime` = %s,
                    `imgUrlS` = %s
                WHERE `chexingID` = %s;
    """

    # Query user scores
    sql_query_user_score="""
                            SELECT 
                                koubeiSid
                            FROM
                                vcar_vcyber_com.vcar_qczj_user_koubei_score;
    """

    # Insert an average score row
    sql_insert_avg_score="""
                         INSERT INTO `vcar_vcyber_com`.`vcar_qczj_score_chexing`
                                        (`sid`,
                                        `chexingID`,
                                        `scoreType`,
                                        `score`,
                                        `compareScore`)
                                        VALUES
                                        (%s,
                                        %s,
                                        %s,
                                        %s,
                                        %s);
    """
    # Update an average score row
    sql_update_avg_score="""
                         UPDATE `vcar_vcyber_com`.`vcar_qczj_score_chexing`
                         SET
                               `score` = %s,
                               `compareScore` = %s,
                               `updateTime` = %s
                         WHERE `chexingID` = %s and scoreType= %s;
    """

    # Query users
    sql_query_user="""
                        SELECT 
                            sid
                        FROM
                            vcar_vcyber_com.vcar_qczj_user;
    """
    # Update a user
    sql_update_user="""
                        UPDATE `vcar_vcyber_com`.`vcar_qczj_user`
                        SET
                            `userName` = %s,
                            `headImg` = %s,
                            `updateTime` = %s
                        WHERE `sid` = %s;
    """
    # Insert a user
    sql_insert_user="""
                        INSERT INTO `vcar_vcyber_com`.`vcar_qczj_user`
                        (`sid`,
                        `userName`,
                        `homepageUrl`,
                        `headImg`,
                        `city`,
                        `county`)
                        VALUES
                        (%s,
                        %s,
                        %s,
                        %s,
                        %s,
                        %s);
    """

    # Query the dealer table
    sql_query_dealer="""
                        SELECT 
                            sid 
                        FROM vcar_vcyber_com.vcar_qczj_dealer;
    """

    # Insert into the dealer table
    sql_insert_dealer="""
                        INSERT INTO `vcar_vcyber_com`.`vcar_qczj_dealer`
                        (`sid`,
                        `dealerName`,
                        `homepageUrl`,
                        `city`,
                        `county`)
                        VALUES
                        (%s,
                        %s,
                        %s,
                        %s,
                        %s
                        );
    """

    # Query the review (koubei) head table
    sql_query_koubei_head="""
                        SELECT 
                            sid
                        FROM
                            vcar_vcyber_com.vcar_qczj_user_koubei_head;
    """

    # Insert into the review (koubei) head table
    sql_insert_koubei_head="""
                        INSERT INTO `vcar_vcyber_com`.`vcar_qczj_user_koubei_head`
                        (`sid`,
                        `title`,
                        `publicTime`,
                        `userSid`,
                        `chexingID`,
                        `buyTime`,
                        `price`,
                        `dealerId`,
                        `city`,
                        `county`,
                        `currentKm`,
                        `fuel`,
                        `koubeiLink`,
                        `favorNum`,
                        `readNum`,
                        `commentNum`,
                        `koubeiSrc`,
                        `mjjh`)
                        VALUES
                        (%s,
                        %s,
                        %s,
                        %s,
                        %s,
                        %s,
                        %s,
                        %s,
                        %s,
                        %s,
                        %s,
                        %s,
                        %s,
                        %s,
                        %s,
                        %s,
                        %s,
                        %s);
    """
    # Update the review (koubei) head table
    sql_update_koubei_head="""
                            UPDATE `vcar_vcyber_com`.`vcar_qczj_user_koubei_head`
                            SET
                                `currentKm` = %s,
                                `fuel` = %s,
                                `favorNum` = %s,
                                `readNum` = %s,
                                `commentNum` = %s,
                                `updateTime` = %s,
                                `mjjh` = %s
                            WHERE `sid` = %s;
    """

    # Insert a purchase purpose
    sql_insert_koubei_purpose="""
                                INSERT INTO `vcar_vcyber_com`.`vcar_qczj_user_koubei_purpose`
                                    (`sid`,
                                    `koubeiSid`,
                                    `purpose`)
                                    VALUES
                                    (%s,
                                    %s,
                                    %s);
    """

    # Insert into the score detail table
    sql_insert_koubei_score="""
                                INSERT INTO `vcar_vcyber_com`.`vcar_qczj_user_koubei_score`
                                    (`sid`,
                                    `koubeiSid`,
                                    `scoreType`,
                                    `score`)
                                    VALUES
                                    (%s,
                                    %s,
                                    %s,
                                    %s);
    """

    @classmethod
    def getConnection(cls):
        """Open and return a new pymysql connection to the local vcar_vcyber_com database."""
        conn = pymysql.connect(host='localhost', user='root', passwd='root', db='vcar_vcyber_com', port=3306, charset='utf8')
        return conn

    @classmethod
    def _execute(cls,sql,params=None,many=False,commit=False,fetch=False,errorLabel=None):
        """Run one statement on a fresh connection and always release it.

        :param sql: statement text
        :param params: parameters for execute(), or a sequence of parameter rows
                       when ``many`` is true
        :param many: use executemany() instead of execute()
        :param commit: commit after executing
        :param fetch: return cursor.fetchall() instead of None
        :param errorLabel: optional text printed before the exception on failure
        :return: fetched rows when ``fetch`` is true and no error occurred, else None
        """
        conn = None
        cursor = None
        try:
            conn = cls.getConnection()
            cursor = conn.cursor()
            if many:
                cursor.executemany(sql, params)
            else:
                cursor.execute(sql, params)
            if commit:
                conn.commit()
            if fetch:
                # A sequence of row tuples
                return cursor.fetchall()
        except Exception as e:
            if errorLabel is not None:
                print(errorLabel)
            print(e)
        finally:
            # Either handle may be missing when connecting itself failed;
            # closing unconditionally would raise and hide the real error.
            if cursor is not None:
                cursor.close()
            if conn is not None:
                conn.close()
        return None

    @classmethod
    def query(cls,sql):
        """Execute ``sql`` and return all rows (a sequence of tuples), or None on error."""
        return cls._execute(sql, fetch=True)

    # Batch update or insert
    @classmethod
    def updateList(cls,sql,paramsList):
        """Execute ``sql`` once per row of ``paramsList`` and commit."""
        cls._execute(sql, paramsList, many=True, commit=True)

    # Update a single row
    @classmethod
    def updateOne(cls,sql,params):
        """Execute ``sql`` with ``params`` and commit."""
        cls._execute(sql, params, commit=True)

    # Turn a query result into a set of ids
    @classmethod
    def parseToSet(cls,res,index):
        """Return the set of values found at position ``index`` of every row in ``res``."""
        return {item[index] for item in res}

    # Turn a query result into a dict
    @classmethod
    def parseToDict(cls,res,keyIndex,valueIndex):
        """Return ``{row[keyIndex]: row[valueIndex]}`` for the rows in ``res`` (later rows win)."""
        return {item[keyIndex]: item[valueIndex] for item in res}


    @classmethod
    def queryBrandId(cls):
        """Return all brand ids as a sequence of 1-tuples, or None on error.

        Errors are printed rather than silently discarded.
        """
        sql="""
            SELECT  `vcar_pinpai`.`pinpaiID`
            FROM `vcar_vcyber_com`.`vcar_pinpai`;
            """
        return cls.query(sql)


    # Query series info; rows are tuples (brandId, seriesId, seriesLink)
    @classmethod
    def querySeriesLink(cls):
        """Return ``(pinpaiID, chexiID, url)`` rows of all car series, or None on error.

        Errors are printed rather than silently discarded.
        """
        sql="""
                SELECT 
                    `vcar_chexi`.`pinpaiID`,
                    `vcar_chexi`.`chexiID`,
                    `vcar_chexi`.`url`
                FROM `vcar_vcyber_com`.`vcar_chexi`;
        """
        return cls.query(sql)

    @classmethod
    def insertSpecItemList(cls,itemList):
        """Batch-insert car models; each row is (chexingID, pinpaiID, chexiID, name, url)."""
        sql="""
                INSERT INTO `vcar_vcyber_com`.`vcar_chexing`
                    (`chexingID`,
                    `pinpaiID`,
                    `chexiID`,
                    `name`,
                    `url`)
                    VALUES
                    (%s,
                    %s,
                    %s,
                    %s,
                    %s);
            """
        cls.updateList(sql, itemList)

    @classmethod
    def querySpec(cls):
        """Return ``(chexingID, chexiID, pinpaiID, name)`` rows of all car models, or None on error."""
        sql="""
            SELECT 
                chexingID,
                chexiID,
                pinpaiID,
                name 
            FROM vcar_vcyber_com.vcar_chexing 
        """
        return cls.query(sql)


    # Query the enum (dictionary) table
    @classmethod
    def queryEnum(cls,labelCd):
        """Return ``(optionName, optionValue)`` rows whose labelCd equals ``labelCd``.

        The value is passed as a bound parameter so the driver escapes it,
        instead of being pasted into the SQL text.
        """
        sql="""
                SELECT 
                    optionName, optionValue
                FROM
                    vcar_vcyber_com.vcar_dic
                WHERE
                    labelCd = %s;
        """
        return cls._execute(sql, (labelCd,), fetch=True)

    # Batch-save model scores to the database
    @classmethod
    def saveSpecDetailScore(cls,scoreList):
        """Batch-insert score rows (sid, chexingID, scoreType, score, compareScore)."""
        sql="""
                INSERT INTO `vcar_vcyber_com`.`vcar_qczj_score_chexing`
                    (`sid`,
                    `chexingID`,
                    `scoreType`,
                    `score`,
                    `compareScore`)
                    VALUES
                    (%s,
                    %s,
                    %s,
                    %s,
                    %s);
        """
        cls.updateList(sql, scoreList)

    # Update model scores
    @classmethod
    def updateSpecDetailScore(cls,updateList):
        """Batch-update score rows; each row is (score, compareScore, updateTime, chexingID, scoreType)."""
        sql="""
                UPDATE `vcar_vcyber_com`.`vcar_qczj_score_chexing`
                SET
                `score` = %s,
                `compareScore` = %s,
                `updateTime` = %s
                WHERE `chexingID` = %s and scoreType= %s;
        """
        cls.updateList(sql, updateList)


    # Query the model score table for every model id that already has scores
    @classmethod
    def querySavedScoreSepc(cls):
        """Return the distinct chexingID values of the score table as 1-tuples, or None on error."""
        sql="""
                SELECT DISTINCT
                    (chexingID) AS chexingID
                FROM
                    vcar_vcyber_com.vcar_qczj_score_chexing;
        """
        return cls.query(sql)


    # Update the average fuel consumption of models
    @classmethod
    def updateSpecAvgFuel(cls,paramsList):
        """Batch-update models; each row is (avgFuel, numPeople, updateTime, imgUrlS, chexingID).

        On failure prints "error:updateSpecAvgFuel" followed by the exception.
        """
        sql=""" 
                UPDATE `vcar_vcyber_com`.`vcar_chexing`
                SET
                    `avgFuel` =  %s,
                    `numPeople` = %s,
                    `updateTime` = %s,
                    `imgUrlS` = %s
                WHERE `chexingID` = %s;
        """
        cls._execute(sql, paramsList, many=True, commit=True, errorLabel="error:updateSpecAvgFuel")


    # Convert model rows into a set of model ids
    @classmethod
    def parseToChexingIdSet(cls,res):
        """Return the set of first-column values of ``res``."""
        return cls.parseToSet(res, 0)

    # Convert series rows into a set of series ids
    @classmethod
    def parseToSeriesIdSet(cls,res):
        """Return the set of second-column values of ``res``."""
        return cls.parseToSet(res, 1)

    # Pick the series rows whose id is in the given id collection
    @classmethod
    def findChexiInChexiSet(cls,seriesItems,seriesIdSet):
        """Return, for every id in ``seriesIdSet``, the first row of ``seriesItems``
        whose second column equals that id; ids without a row are skipped."""
        # Index the rows once (first occurrence wins) instead of rescanning per id
        firstItemBySeriesId = dict()
        for item in seriesItems:
            firstItemBySeriesId.setdefault(item[1], item)
        return [firstItemBySeriesId[seriesId] for seriesId in seriesIdSet if seriesId in firstItemBySeriesId]

    @classmethod
    def parseEnum(cls,res):
        """Return ``{row[0]: row[1]}`` for the rows in ``res``."""
        return cls.parseToDict(res, 0, 1)








import os,sys

# 斷點管理類
class Point(object):
    """Checkpoint manager that lets an interrupted crawl resume.

    Crawled ids are appended, comma separated, to the point file. An (empty)
    over file marks that the previous crawl ran to completion.
    """
    # Marker file written when a crawl finishes normally
    overFilePath=None
    # File that records the ids crawled so far
    pointFilePath=None

    @classmethod
    def init(cls):
        """Point both file paths at ``temp/over.txt`` and ``temp/point.txt`` next to this source file."""
        # Directory that contains this file; os.path handles the platform's separator
        currentDir = os.path.dirname(os.path.abspath(__file__))
        Point.overFilePath = os.path.join(currentDir, "temp", "over.txt")
        Point.pointFilePath = os.path.join(currentDir, "temp", "point.txt")


    # Resume from the checkpoint and return the set still to be crawled
    @classmethod
    def cutInto(cls,total):
        """Return the ids that still have to be crawled.

        If the over file exists, the previous crawl was complete: the marker is
        removed, the point file is emptied and ``total`` itself is returned.
        Otherwise the ids recorded in the point file are subtracted from
        ``total``; a missing point file counts as "nothing crawled yet".

        :param total: set of all ids to crawl
        :return: set of ids to crawl in this run
        """
        print("--------------------cutInto--------------")

        if os.path.exists(Point.overFilePath):
            print(Point.overFilePath)
            os.remove(Point.overFilePath)
            # Opening in "w" mode truncates the checkpoint file
            with open(Point.pointFilePath, "w", encoding="utf-8"):
                pass
            # Everything that was queried has to be crawled again
            return total

        try:
            with open(Point.pointFilePath, "r", encoding="utf-8") as pointFile:
                recorded = pointFile.read()
        except FileNotFoundError:
            # First run: no checkpoint has been written yet
            recorded = ""
        # Ids are comma separated; drop the empty token left by a trailing comma
        crawledIdSet = {token.strip() for token in recorded.split(",")}
        crawledIdSet.discard("")
        # All ids minus the ids already crawled = ids still to crawl
        waitingCrawlIdSet = total - crawledIdSet
        print("總共需要爬取%s,上次已爬取%s,本次需爬取%s" % (len(total),len(total)-len(waitingCrawlIdSet),len(waitingCrawlIdSet)))
        return waitingCrawlIdSet

    # Record a checkpoint
    @classmethod
    def savePoint(cls,data):
        """Append the string ``data`` to the point file."""
        with open(cls.pointFilePath, "a", encoding="utf-8") as pointFile:
            pointFile.write(data)

    # Record a checkpoint from a collection of ids
    @classmethod
    def savePointFromSet(cls,setData):
        """Append every id of ``setData`` to the point file, each followed by a comma."""
        # str() lets non-string ids be recorded as well
        Point.savePoint("".join(str(item) + "," for item in setData))



    # Mark the crawl as finished
    @classmethod
    def complete(cls):
        """Create (or truncate) the empty over file that marks a completed crawl."""
        with open(cls.overFilePath, "w", encoding="utf-8"):
            pass









# Start the project: constructing AutoSelenium launches the Chrome webdriver
# (see AutoSelenium.__init__); start_requests() presumably kicks off the crawl.
# NOTE(review): these statements run at import time because there is no
# `if __name__ == "__main__":` guard.
se=AutoSelenium()
se.start_requests()