Python爬取最新反爬蟲汽車之家口碑

阿新 • • 發佈：2019-01-03

本人剛學Python沒幾天,程式碼可能比較醜陋, 大牛不要噴

使用的是 Python 2.7.2(因為 PyV8 最高只支援到 2.7.2);js 混淆的部分是用 PyV8 直接執行 js 來處理的。

原理我已經另外寫過一篇文章說明,這裡不再贅述,有需要可以參考該篇文章。

目錄結構如下:

fonts資料夾負責存放下載的字型檔案

decode_fontfile負責解析字型檔案

decode_script負責解析js混淆

document負責模擬js中的document物件,因為PyV8中沒有document物件,但是js混淆中用到了

spider是主要邏輯

下面貼一下程式碼:

spider.py

# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
import re
from decode_script import DecodeScript
from hero.proxy import proxy
from decode_fontfile import DecodeFontFile
import sys
reload(sys)
sys.setdefaultencoding('utf8')

class ParseHtml(object):
    """Download an Autohome review page and pull out the pieces needed for decoding."""

    def __init__(self):
        # Browser-like headers sent with every page request.
        # "br" is intentionally not advertised in Accept-Encoding: requests can
        # only decode Brotli bodies when an optional brotli package is
        # installed, and an undecoded body would break the parsing below.
        self.header = {"Host": "k.autohome.com.cn",
                       "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0",
                       "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                       "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
                       "Accept-Encoding": "gzip, deflate",
                       "Connection": "keep-alive",
                       "Upgrade-Insecure-Requests": "1"}

    def get_html_doc(self, url):
        """Fetch ``url`` and return the raw response body.

        Returns the integer ``1`` (a sentinel the caller checks for) when the
        server answers with any status other than 200.
        """
        # The session is used as a context manager so its connections are
        # released even when the request raises.
        with requests.Session() as session:
            # NOTE(review): verify=False disables TLS certificate checking;
            # kept as in the original, confirm it is still required.
            resp = session.get(url, headers=self.header, verify=False)
            if resp.status_code != 200:
                return 1
            return resp.content

    def get_text_con(self, html_doc):
        """Return the ``text-con`` element of the last ``mouth-item`` block.

        Raises IndexError when the page contains no ``mouth-item`` element.
        """
        soup = BeautifulSoup(html_doc, 'lxml')
        mouth_item = soup.find_all(class_='mouth-item')[-1]
        return mouth_item.find(class_="text-con")

    def get_font_url(self, html_doc):
        """Return the first ``.ttf`` font address found in the page source.

        The returned value starts at the host name (no scheme).
        Raises IndexError when the page references no font file.
        """
        regex = r'\w+\.\w+\..*?ttf'
        return re.findall(regex, html_doc)[0]



def run(max_retries=5):
    """Download one review page, decode its obfuscated text and print it.

    The page is requested up to ``max_retries`` times; every non-200 answer
    (signalled by ``get_html_doc`` returning ``1``) triggers another attempt.

    Raises RuntimeError when no attempt succeeds.
    """
    url = "https://k.autohome.com.cn/detail/view_01c16ytpa964w38c1s70v00000.html?st=2&piap=0|2123|0|0|1|0|0|0|0|0|1#pvareaid=2112108"
    parse = ParseHtml()

    # A bounded loop replaces the original unbounded self-recursion, which
    # could only end in a recursion-depth error if the page stayed unavailable.
    for _ in range(max_retries):
        # The page source must come from get_html_doc(); get_text_con() expects
        # HTML, not a URL.
        html_doc = parse.get_html_doc(url)
        if html_doc != 1:
            break
    else:
        raise RuntimeError("could not download %s after %d attempts" % (url, max_retries))

    # Locate and download the font file the page uses for obfuscation.
    font_url = parse.get_font_url(html_doc)
    font_decoder = DecodeFontFile()
    font_decoder.download_fontfile(font_url)

    # Decode every section of the review and print it as "title:text".
    text_con = parse.get_text_con(html_doc)
    script_decoder = DecodeScript()
    list_text = script_decoder.get_text_con(text_con, font_decoder)
    for text in list_text:
        for key, value in text.items():
            print(key+":"+value)


if __name__ == "__main__":
    run()

decode_script.py

# -*- coding:utf-8 -*-
"""對混淆的js程式碼破解,獲取想要的內容"""
from bs4 import BeautifulSoup
import re
import PyV8
from document import Global
from decode_fontfile import DecodeFontFile
import sys

reload(sys)
sys.setdefaultencoding('utf8')

class DecodeScript(object):
    """Turn the obfuscated body of a review into plain text, section by section."""

    # NOTE(review): in the copy of this file the two marker literals were
    # mangled by the hosting page's e-mail obfuscation (each appeared as
    # '<[email protected][email protected]>', the first one even split across
    # two lines, which is a syntax error). The values below are a
    # reconstruction and their order is an assumption - confirm both against
    # the live page source before relying on them.
    _TEXT_START_MARKER = '<!--@HS_ZY@-->'
    _TEXT_END_MARKER = '<!--@HS_BASE64@-->'

    def get_list_part(self, text_con):
        """Split the review body on '【' and return the pieces after the first one."""
        return str(text_con).split('【')[1:]

    def get_list_title_con_js(self, part_con):
        """Return ``[title, obfuscated_text, script_tag]`` for one section.

        ``title`` is everything before the first '】'. ``obfuscated_text`` is
        the text between the start and end markers, decoded from UTF-8.
        ``script_tag`` is the first ``<script>`` element of the section
        (``None`` when the section has none).

        Raises ValueError when either marker is missing from the section.
        """
        title = part_con.split("】")[0]

        start_match = re.search(self._TEXT_START_MARKER, part_con)
        end_match = re.search(self._TEXT_END_MARKER, part_con)
        if start_match is None or end_match is None:
            raise ValueError("section text markers not found")
        start = start_match.span()[1]
        end = end_match.span()[0]
        # Python 2: part_con is a byte string here, hence the explicit decode.
        part_base64 = part_con[start: end].decode("utf-8")

        soup_part = BeautifulSoup(part_con, "lxml")
        h_js = soup_part.find('script')
        return [title, part_base64, h_js]

    def put_js(self, js):
        """Rewrite the obfuscated script so that evaluating it yields its data.

        The rewritten script stores, in a global ``result``, an array holding
        the first two variables assigned inside the script's first
        zero-argument function.

        Raises ValueError when the script does not have the expected shape.
        """
        # Strip the surrounding "<script>" (8 chars) and "</script>" (9 chars).
        js = str(js)[8:-9]
        # Capture the value of the outer expression in a global variable.
        js = "var result = " + js
        # Declare the carrier array right after the first opening brace.
        first_point = js.index("{")
        js = js[:first_point+1] + "var arr = [];" + js[first_point+1:]
        # Find the first three assignments of the inner function and fill the
        # carrier array with the first two assigned variables right after them.
        regex = r"function\s*\w+\(\)\s*\{\s*(\w+)\s*=[\s\S]*?\);\s*(\w+)\s*=[\s\S]*?\);\s*(\w+)\s*=[\s\S]*?\);"
        match = re.search(regex, js)
        if match is None:
            raise ValueError("obfuscated script does not have the expected structure")
        tuple_groups = match.groups()
        second_point = match.span()[1]
        set_arr = "arr = ["+str(tuple_groups[0])+", "+str(tuple_groups[1])+"];"
        js = js[:second_point] + set_arr + js[second_point:]
        # Return the array just before the final 13 characters of the script.
        # NOTE(review): presumably that tail is "})(document);" - confirm.
        js = js.strip()
        js = js[:-13] + "return arr;" + js[-13:]
        return js

    def run_js(self, js):
        """Evaluate ``js`` in PyV8 and return ``[first_array, second_array]`` as Python lists."""
        glob = Global()
        with PyV8.JSContext(glob) as ctext:
            ctext.eval(js)
            js_array = ctext.locals.result
            # Copy the values while the JS context is still entered.
            list_num16 = [num16 for num16 in js_array[0]]
            list_index = [index for index in js_array[1]]
        return [list_num16, list_index]

    def replace_span(self, part_con, decode_fontfile):
        """Decode one section and return it as ``{title: text}``.

        Every ``<span class="...hs_kwN...">`` placeholder is replaced with the
        character that ``decode_fontfile.get_font`` returns for its glyph.
        """
        title, con, script_tag = self.get_list_title_con_js(part_con)
        js = self.put_js(script_tag)
        list_num16, list_index = self.run_js(js)
        # The first entry is a comma-separated string of hexadecimal codes.
        list_num16 = list_num16[0].split(",")

        regex = r"<span\s*class[\s\S]*?hs_kw(\d+)[\s\S]*?</span>"
        for span in re.finditer(regex, con):
            # 'string_escape' is a Python 2 only codec; kept as in the original.
            tag_span = span.group().encode('unicode_escape').decode('string_escape')
            index = list_index[int(span.group(1))]
            num16 = list_num16[int(index)]
            glyph = "uni"+num16.upper()
            font = decode_fontfile.get_font(glyph)
            con = con.replace(tag_span, font)
        return {title: str(con)}

    def get_text_con(self, text_con, decode_fontfile):
        """Decode the whole review body; returns one ``{title: text}`` dict per section."""
        return [self.replace_span(part_con, decode_fontfile)
                for part_con in self.get_list_part(text_con)]

decode_fontfile.py

# -*- coding:utf-8 -*-
"""解析字型檔案"""
from fontTools.ttLib import TTFont
import requests
import re
import os

# Lookup table of characters: DecodeFontFile.get_font() indexes this list with
# the glyph ID it reads from the downloaded font file.
# NOTE(review): presumably the order mirrors the glyph order of the site's
# obfuscation font - confirm against a current font file.
list_font = [ ' ', '一', '七', '三', '上', '下', '不', '中', '檔', '比', '油', '泥', '燈', '九', '了', '二', '五',
				'低', '保', '光', '八', '公', '六', '養', '內', '冷', '副', '加', '動', '十', '電', '的', '皮', '盤', '真', '著', '路', '身',
				'軟', '過', '近', '遠', '裡', '量', '長', '門', '問', '只', '右', '啟', '呢', '味', '和', '響', '四', '地', '壞', '坐', '外',
				'多', '大', '好', '孩', '實', '小', '少', '短', '矮', '硬', '空', '級', '耗', '雨', '音', '高', '左', '開', '當', '很', '得',
				'性', '自', '手', '排', '控', '無', '是', '更', '有', '機', '來' ]

class DecodeFontFile(object):
    """Download a page's obfuscation font and map its glyph names to characters."""

    def __init__(self):
        # Path of the most recently downloaded font file ("" until a download).
        self.file_path = ""
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"
        }
        # Parsed font and the path it was parsed from, so that the file is
        # read once instead of once per glyph lookup.
        self._ttf = None
        self._ttf_path = None

    @staticmethod
    def _file_name_from_url(font_url):
        """Return the font file name: a run of 20+ word characters up to 'ttf'.

        Raises IndexError when the URL contains no such name.
        """
        return re.findall(r'\w{20,}[\s\S]*?ttf', font_url)[0]

    def download_fontfile(self, font_url):
        """Download ``font_url`` (given without scheme) into ``./fonts/``.

        Sets ``self.file_path`` to the saved file. Raises the HTTP error from
        ``requests`` when the server does not answer successfully, so that an
        error page is never stored as a font file.
        """
        font_url = "http://"+font_url
        resp = requests.get(font_url, headers=self.headers)
        resp.raise_for_status()
        file_name = self._file_name_from_url(font_url)
        fonts_dir = "./fonts"
        # Create the target directory on first use instead of assuming it exists.
        if not os.path.isdir(fonts_dir):
            os.makedirs(fonts_dir)
        self.file_path = fonts_dir + "/" + file_name
        with open(self.file_path, "wb") as f:
            f.write(resp.content)

    def _load_font(self):
        """Return the parsed font for ``self.file_path``, parsing it only when the path changed."""
        cached = getattr(self, "_ttf", None)
        if cached is None or getattr(self, "_ttf_path", None) != self.file_path:
            cached = TTFont(self.file_path)
            self._ttf = cached
            self._ttf_path = self.file_path
        return cached

    def get_glyph_id(self, glyph):
        """Return the glyph ID of the glyph named ``glyph`` in the downloaded font."""
        return self._load_font().getGlyphID(glyph)

    def get_font(self, glyph):
        """Return the character from ``list_font`` at the glyph ID of ``glyph``."""
        glyph_id = self.get_glyph_id(glyph)
        return list_font[glyph_id]

document.py

# -*- coding:utf-8 -*-
"""模擬Document物件和window物件"""
import PyV8

class Element():
    """Stand-in for the DOM element handed out by the fake document."""

    def __init__(self):
        # The only attribute this stand-in carries; starts out as an empty string.
        self.sheet = ""
class Head(object):
    """Stand-in for the element returned by the fake ``getElementsByTagName``."""

    def appendChild(self, *args, **kwargs):
        """Accept any arguments, ignore them and return the string "sheet"."""
        return "sheet"

class v8Doc(PyV8.JSClass):
    """Minimal fake of the browser ``document``/``window`` object for PyV8.

    Every method accepts and ignores arbitrary arguments.
    """

    def createElement(self, *args, **kwargs):
        """Return a fresh ``Element`` stand-in."""
        return Element()

    def getElementsByTagName(self, *args, **kwargs):
        """Return a one-item list holding a fresh ``Head`` stand-in."""
        return [Head()]

    def getComputedStyle(self, *args, **kwargs):
        """Always return ``None``."""
        return None

    def decodeURIComponent(self, *args, **kwargs):
        """Return the positional arguments unchanged, as a tuple."""
        return args

    def querySelectorAll(self, *args, **kwargs):
        """Always return ``None``."""
        return None

class Global(PyV8.JSClass):
    """Global object for the PyV8 context, exposing ``document`` and ``window``."""

    def __init__(self):
        # Two separate v8Doc instances: one for each global name.
        self.document = v8Doc()
        self.window = v8Doc()

輸出結果

Python爬取最新反爬蟲汽車之家口碑

本人剛學Python沒幾天,程式碼可能比較醜陋, 大牛不要噴用的Python2.7.2, 因為PyV8最高支援2.7.2, js混淆部分用的PyV8直接執行的js 原理已經寫過一篇了,這裡不再贅述了.可以看我的這篇目錄結構如下: fonts資料夾負責存放下載的字型檔案

python爬蟲----汽車之家的汽車論壇的最新精華帖

模組 import requests---網頁請求 import re---正則表示式 from bs4 import BeautifulSoup as bs---節點處理抓取的內容這次主要抓取汽車之家的汽車論壇裡面的最新精華帖的內容，具體抓取帖子文字。url:

汽車之家口碑資料的爬蟲

一，爬蟲都是具有時效性的，我不知道反爬蟲什麼時候更新，所以失效以後，除非工作需要，否則我也一般會維護。二，移動客戶端網頁一般比電腦的網頁更好爬取，所以這次可以用汽車之家口碑的移動端網頁，用電腦網頁也可以，但是需要在汽車之家論壇反爬蟲的基礎上在進行修改。比較麻煩。三，以這篇口碑為例，在網頁上顯示的最滿意為下圖所

python網路爬蟲爬取汽車之家的最新資訊和照片

實現的功能是爬取汽車之家的最新資訊的連結題目和文章中的照片爬蟲需要用到我們使用了 requests 做網路請求，拿到網頁資料再用 BeautifulSoup 進行解析首先先檢查是否安裝了pip，如果已經安裝了pip,直接pip install requests,pip uninstal

Python爬蟲六：字型反爬處理（貓眼+汽車之家）-2018.10

環境：Windows7 +Python3.6+Pycharm2017 目標：貓眼電影票房、汽車之家字型反爬的處理 --------全部文章：京東爬蟲、鏈家爬蟲、美團爬蟲、微信公眾號爬蟲、字型反爬--------- 前言：字型反爬，

python爬蟲實戰爬取汽車之家上車型價格

相關庫 import pymysql import pymysql.cursors from bs4 import BeautifulSoup import requests import random

python爬蟲——爬取汽車之家新聞

按F12審查一下元素：找到了對應的資訊。而且發現要爬取的圖片都在id=auto-channel-lazyload-article的div標籤下的li標籤裡。 li標籤下的a標籤就是新聞的url；image標籤，src就是獲取圖片的url；請求圖片地

python入門-----爬取汽車之家新聞,---自動登錄抽屜並點贊,

ike color div標簽 pla spa art com col 3-9 爬取汽車之家新聞,代碼如下 import requests res=requests.get(url=‘https://www.autohome.com.cn/news/‘) #向汽車直接

汽車之家網站為例-爬蟲的編寫，爬取圖片

汽車之家圖片的爬取汽車之家有很多汽車的點評、價格、圖片等資訊，那麼怎麼才能編寫一個爬蟲來獲得我們所需要的資訊呢，很簡單，兩個工具便可以了，一個網頁解析工具requests，一個正則匹配工具re

Python爬蟲系列之四：利用Python爬取PyODPS頁面並整合成PDF文件

文章架構開發場景在日常開發過程中，經常需要參考一些文件。對於線上文件，往往由於網速等原因，用起來總不是那麼（ma）順（fan）心。開發工具 Anaconda Python 2 實現方案基於 bs4 模組標籤解析爬取

WebMagic爬蟲入門教程（三）爬取汽車之家的例項-品牌車系車型結構等

本文使用WebMagic爬取汽車之家的品牌車系車型結構價格能源產地國別等；java程式碼備註，只是根據url變化爬取的，沒有使用爬取script頁面具體的資料，也有反爬機制，知識簡單爬取html標籤爬取的網頁：需要配置pom.xml <!-

爬蟲之字型反爬（三）汽車之家

今天為大家帶來的是字型反爬的另一個案例，汽車之家。與之前不同的是，這裡是對漢字的處理。具體來看下面的分析與程式碼。首先參考的網站：https://club.autohome.com.cn/bbs/thread/1f05b4da4448439b/76044817-1.html#%23%23

Python3[爬蟲實戰] scrapy爬取汽車之家全站連結存json檔案

昨晚晚上一不小心學習了崔慶才，崔大神的部落格，試著嘗試一下爬取一個網站的全部內容，福利吧網站現在已經找不到了，然後一不小心逛到了汽車之家 (http://www.autohome.com.cn/beijing/) 很喜歡這個網站，女人都喜歡車，更何況男人呢。（

Python爬取拉勾網資料(破解反爬蟲機制)

人生苦短, 我學 Python! 這篇文章主要記錄一下我學習 Python 爬蟲的一個小例子, 是爬取的拉勾網的資料. 1.準備配置 Python 環境什麼的就不說了, 網上教程很多, 自行解決. 2.扒原始碼先開啟拉勾網的網頁. 我們要爬取這部分的資料

Python練習 scrapy 爬取汽車之家文章

autohome.py #spider檔案 # -*- coding: utf-8 -*- import scrapy from Autohome.items import AutohomeItem class AutohomeSpider(scrapy.Spider)

python爬取煎蛋妹子圖（老司機養成之路）

chrom all with file windows import apple 妹子 lib 源碼： 1 import urllib.request 2 from bs4 import BeautifulSoup 3 import os 4 import io

利用python爬取點小圖片，滿足私欲(爬蟲)

.text write ret append jpg use download div pat import requestsimport reimport os,syslinks=[]titles=[] headers = { "User-Agent": "Mozi

爬蟲系列（2）-----python爬取CSDN博客首頁所有文章

成功 -name 保存 eas attr eve lan url att 對於Python初學者來說，爬蟲技能是應該是最好入門，也是最能夠有讓自己有成就感的，今天在整理代碼時，整理了一下之前自己學習爬蟲的一些代碼，今天上第2個簡單的例子，python爬取CSDN博客首頁所有

python爬取數據被限制？一招教你偽造反爬技術！

python 爬蟲編程程序員1.Headers限制這應該是最常見的，最基本的反爬蟲手段，主要是初步判斷你是否是真實的瀏覽器在操作。這個一般很好解決，把瀏覽器中的Headers信息復制上去就OK了。值得註意的是，很多網站只需要userAgent信息就可以通過，但是有的網站還需要驗證一些其他的信息，比如

Python爬蟲案例：利用Python爬取笑話網

htm 分享 targe pen 技術分享搞笑 lan tle import 學校的服務器可以上外網了，所以打算寫一個自動爬取笑話並發到bbs的東西，從網上搜了一個笑話網站，感覺大部分還不太冷，html結構如下：可以看到，笑話的鏈接列表都在<div cla

Python爬取最新反爬蟲汽車之家口碑

相關推薦