1. 程式人生 > >python爬蟲實戰 爬取汽車之家上車型價格

python爬蟲實戰 爬取汽車之家上車型價格

相關庫

import pymysql
import pymysql.cursors
from bs4 import BeautifulSoup
import requests
import random
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC
import codecs
from selenium.common.exceptions import TimeoutException

從資料庫中讀取車型(車型已經存放在資料庫,這裡讀取車型的id,拼接到url上)

# Load every distinct car model (id and name) from the database into `cars`.
# Each entry is a dict because the connection uses DictCursor.
cars = []
conn = pymysql.connect(
    host='*******',
    charset='utf8',
    user='*******',
    passwd='*****',
    db='mysql',
    cursorclass=pymysql.cursors.DictCursor,
)

try:
    cur = conn.cursor()
    # The connection opens on the `mysql` schema; switch to the schema that
    # holds the car table before querying.
    cur.execute("USE data_etl")
    cur.execute("select distinct(car_id),car_name from user_car_port")
    cars.extend(cur.fetchall())
    count = len(cars)
    print(count)
finally:
    # Always release the connection, even if a query fails.
    conn.close()

由於汽車之家反爬比較複雜,我們直接呼叫瀏覽器介面

# Shared Chrome browser session used by getCarPrice for every page load.
# NOTE(review): newer Selenium 4 releases expect the chromedriver path to be
# passed through a Service object instead of positionally - confirm against
# the installed Selenium version.
driver = webdriver.Chrome('chromedriver.exe')
def getCarPriceOffSale(innerHtml):
    """Parse the price range of a discontinued (off-sale) car model.

    Looks for ``<span class="price">`` in *innerHtml*, then the
    ``<strong class="red">`` inside it, and reads its text as either a single
    price or a ``low-high`` range. The unit characters 萬 / 元 are stripped
    before the numbers are converted.

    Args:
        innerHtml: HTML fragment of the price container.

    Returns:
        A ``(button, top)`` tuple of floats: the lower and upper price. Both
        are equal when the text holds a single price, and both are ``0.0``
        when no price could be extracted (a diagnostic line is printed).
    """
    button = 0.0
    top = 0.0
    print("此車型已經停售!")
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    bsObj = BeautifulSoup(innerHtml, "html.parser")

    spanPrice = bsObj.find("span", {"class": "price"})
    if spanPrice is None:
        print("價格span為空")
        return button, top

    strongPrice = spanPrice.find("strong", {"class": "red"})
    if strongPrice is None:
        print("價格strong為空")
        return button, top

    text = strongPrice.text.strip()
    if not text:
        print("價格欄位為空")
        return button, top

    prices = [part.replace("萬", "").replace("元", "") for part in text.split("-")]
    try:
        # Convert both bounds before assigning, so a bad upper bound cannot
        # leave a half-filled result behind.
        low = float(prices[0])
        high = float(prices[1]) if len(prices) == 2 else low
    except ValueError:
        print("程式出錯!停售車型")
        return button, top

    button, top = low, high
    return button, top
                        

處理在售車型的價格資訊

def getCarPriceOnSale(innerHtml):
    """Parse the price range of a car model that is currently on sale.

    Takes the first ``<dd>`` element in *innerHtml*, finds the
    ``<a class="emphasis">`` inside it, and reads its text as either a single
    price or a ``low-high`` range. The unit characters 萬 / 元 are stripped
    before the numbers are converted.

    Args:
        innerHtml: HTML fragment of the price container.

    Returns:
        A ``(button, top)`` tuple of floats: the lower and upper price. Both
        are equal when the text holds a single price, and both are ``0.0``
        when no price could be extracted (a diagnostic line is printed).
    """
    button = 0.0
    top = 0.0
    print("此車型在售")
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    bsObj = BeautifulSoup(innerHtml, "html.parser")

    ddprice = bsObj.find("dd")
    if ddprice is None:
        # A missing <dd> means the fragment does not have the expected layout.
        print("程式出錯!在售車型")
        return button, top

    a = ddprice.find("a", {"class": "emphasis"})
    if a is None:
        print("此車型暫時無法查詢價格")
        return button, top

    prices = [part.replace("萬", "").replace("元", "") for part in a.text.split("-")]
    try:
        # Convert both bounds before assigning, so a bad upper bound cannot
        # leave a half-filled result behind.
        low = float(prices[0])
        high = float(prices[1]) if len(prices) == 2 else low
    except ValueError:
        print("程式出錯!在售車型")
        return button, top

    button, top = low, high
    return button, top

根據車型id開啟頁面並取得價格(先按在售頁面解析,逾時則改按停售頁面解析)

def getCarPrice(carId):
    """Open the page of car model *carId* and return its price range.

    The page is first treated as an on-sale model: wait up to 5 seconds for
    the ``information-summary`` element, then parse ``information-price``
    with ``getCarPriceOnSale``. If that times out, the page is treated as an
    off-sale model: wait up to 5 seconds for ``car_price`` and parse it with
    ``getCarPriceOffSale``.

    Relies on the module-level ``driver`` and on a module-level ``url``
    prefix to which the id is appended.

    Args:
        carId: Identifier of the car model; converted with ``str`` and
            appended to ``url``.

    Returns:
        A ``(button, top)`` tuple of floats (lower and upper price).
        ``(0.0, 0.0)`` when neither page layout appeared in time or no price
        could be parsed.
    """
    button = 0.0
    top = 0.0
    try:
        driver.get(url+str(carId))
        WebDriverWait(driver,5).until(EC.presence_of_element_located((By.CLASS_NAME,"information-summary")))
        # find_element(By..., ...) replaces the find_element_by_* helpers,
        # which are no longer available in current Selenium releases.
        ele = driver.find_element(By.CLASS_NAME,"information-price").get_attribute('innerHTML')
        button,top=getCarPriceOnSale(ele)
    except TimeoutException:
        try:
            # The wait returns the located element, so it can be used
            # directly without looking it up a second time.
            priceElement = WebDriverWait(driver,5).until(EC.presence_of_element_located((By.CLASS_NAME,"car_price")))
            ele = priceElement.get_attribute('innerHTML')
            button,top=getCarPriceOffSale(ele)
        except TimeoutException:
            print("此車型有問題:"+str(carId))
    return button,top

遍歷資料庫所有車型的id

# Visit every car model loaded from the database and attach its price range
# to the row under the "button" (lower) and "top" (upper) keys.
for car in cars:
    id = car["car_id"]
    # Random pause of 1-5 seconds between page loads.
    time.sleep(random.randint(1, 5))
    button, top = getCarPrice(id)
    # (0.0, 0.0) means no price was found; store 9999 for both bounds instead.
    price_missing = (button == 0.0 and top == 0.0)
    car["button"], car["top"] = (9999, 9999) if price_missing else (button, top)