1. 程式人生 > >Python爬取最新反爬蟲汽車之家口碑

Python爬取最新反爬蟲汽車之家口碑

本人剛學Python沒幾天,程式碼可能比較醜陋, 大牛不要噴

本文使用 Python 2.7.2,因為 PyV8 最高只支援到 2.7.2;js 混淆的部分是用 PyV8 直接執行 js 來處理的。

原理我已經另外寫過一篇文章,這裡不再贅述,可以參考我的那篇文章。

目錄結構如下:

    fonts/
    decode_fontfile.py
    decode_script.py
    document.py
    spider.py


fonts資料夾負責存放下載的字型檔案

decode_fontfile負責解析字型檔案

decode_script負責解析js混淆

document負責模擬js中的document物件,因為PyV8中沒有document物件,但是js混淆中用到了

spider是主要邏輯

下面貼一下程式碼:

spider.py

# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
import re
from decode_script import DecodeScript
from hero.proxy import proxy
from decode_fontfile import DecodeFontFile
import sys
# Python 2 idiom (the post targets 2.7.2): reload sys so that
# setdefaultencoding is reachable, then make UTF-8 the default codec used
# for implicit str/unicode conversions in this process.
reload(sys)
sys.setdefaultencoding('utf8')

class ParseHtml(object):
    """Download a review page and extract the pieces needed to decode it."""

    def __init__(self):
        # Browser-like request headers.
        # NOTE(review): get_html_doc() does not pass these to the request.
        # They are kept unchanged for callers that read the attribute; wiring
        # them in would also advertise "br" encoding, which the HTTP client
        # may not be able to decode -- confirm before sending them.
        self.header = {
            "Host": "k.autohome.com.cn",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
        }

    def get_html_doc(self, url):
        """Fetch ``url`` and return the raw response body.

        Returns:
            ``resp.content`` when the status code is 200, otherwise the
            integer sentinel ``1``.
        """
        # NOTE(security): verify=False turns off TLS certificate checks; it is
        # kept because the original request was made that way.
        # The session is used as a context manager so its connection pool is
        # released once the body has been read.
        with requests.Session() as session:
            resp = session.get(url, verify=False)
            if resp.status_code != 200:
                return 1
            return resp.content

    def get_text_con(self, html_doc):
        """Return the ``text-con`` element of the last ``mouth-item`` node.

        Raises:
            IndexError: if the document has no element with class
                ``mouth-item``.
        """
        soup = BeautifulSoup(html_doc, 'lxml')
        mouth_item = soup.find_all(class_='mouth-item')[-1]
        return mouth_item.find(class_="text-con")

    def get_font_url(self, html_doc):
        """Return the first ``host.domain.../*ttf`` link found in ``html_doc``.

        The returned string has no URL scheme.

        Raises:
            IndexError: if no font link is present.
        """
        regex = r'\w+\.\w+\..*?ttf'
        return re.findall(regex, html_doc)[0]



def run(max_retries=5):
    """Fetch one review page, decode its obfuscated text and print it.

    Args:
        max_retries: how many times to request the page before giving up.
            Each attempt that does not return HTTP 200 is retried.

    Raises:
        RuntimeError: if no attempt returned HTTP 200.
    """
    url = "https://k.autohome.com.cn/detail/view_01c16ytpa964w38c1s70v00000.html?st=2&piap=0|2123|0|0|1|0|0|0|0|0|1#pvareaid=2112108"
    parse = ParseHtml()

    # get_html_doc() returns the page source, or the sentinel 1 when the
    # status code is not 200. Retry in a bounded loop rather than recursing.
    for _ in range(max_retries):
        html_doc = parse.get_html_doc(url)
        if html_doc != 1:
            break
    else:
        raise RuntimeError(
            "could not fetch %s after %d attempts" % (url, max_retries))

    # Locate the font file link and download the font.
    font_url = parse.get_font_url(html_doc)
    decode_fontfile = DecodeFontFile()
    decode_fontfile.download_fontfile(font_url)

    # Extract the review body and decode it section by section.
    text_con = parse.get_text_con(html_doc)
    decode_script = DecodeScript()
    list_text = decode_script.get_text_con(text_con, decode_fontfile)
    for text in list_text:
        for key, value in text.items():
            print(key+":"+value)

# Script entry point. There is no ``if __name__ == "__main__"`` guard, so
# this call also executes whenever the module is imported.
run()

decode_script.py

# -*- coding:utf-8 -*-
"""對混淆的js程式碼破解,獲取想要的內容"""
from bs4 import BeautifulSoup
import re
import PyV8
from document import Global
from decode_fontfile import DecodeFontFile
import sys

# Python 2 idiom (the post targets 2.7.2): reload sys so that
# setdefaultencoding is reachable, then make UTF-8 the default codec used
# for implicit str/unicode conversions in this process.
reload(sys)
sys.setdefaultencoding('utf8')

class DecodeScript(object):
    """Turn the obfuscated review body into plain text, section by section."""

    # NOTE(review): in the published listing these two literals were mangled
    # by an e-mail obfuscation filter (they appeared as "[email protected]").
    # They are reconstructed here as the HTML comment markers that presumably
    # delimit the obfuscated text -- confirm against a live page.
    _START_MARKER = '<!--@HS_ZY@-->'
    _END_MARKER = '<!--@HS_BASE64@-->'

    def get_list_part(self, text_con):
        """Split the review body on '【' and drop whatever precedes the first one."""
        return str(text_con).split('【')[1:]

    def get_list_title_con_js(self, part_con):
        """Return ``[title, obfuscated_text, script_tag]`` for one section.

        ``title`` is the text before the first '】'. ``obfuscated_text`` is
        the slice between the start and end markers, decoded from UTF-8.
        ``script_tag`` is the first ``<script>`` element found in the section.

        Raises:
            AttributeError: if either marker is missing from ``part_con``.
        """
        title = part_con.split("】")[0]
        start = re.search(self._START_MARKER, part_con).span()[1]
        end = re.search(self._END_MARKER, part_con).span()[0]
        part_base64 = part_con[start: end].decode("utf-8")
        soup_part = BeautifulSoup(part_con, "lxml")
        h_js = soup_part.find('script')
        return [title, part_base64, h_js]

    def put_js(self, js):
        """Rewrite the obfuscated script so that evaluating it yields an array.

        The patched script assigns its result to ``result``, declares ``arr``
        right after the first ``{``, fills ``arr`` with the first two
        variables assigned in the matched inner function, and returns ``arr``
        just before the trailing 13 characters of the script.
        """
        # Strip the surrounding "<script>" (8 chars) and "</script>" (9 chars).
        code = str(js)[8:-9]
        code = "var result = " + code

        # Declare the array immediately after the first opening brace.
        first_point = code.index("{")
        code = code[:first_point + 1] + "var arr = [];" + code[first_point + 1:]

        # Find the function that performs three consecutive assignments and
        # capture the names of the assigned variables.
        regex = r"function\s*\w+\(\)\s*\{\s*(\w+)\s*=[\s\S]*?\);\s*(\w+)\s*=[\s\S]*?\);\s*(\w+)\s*=[\s\S]*?\);"
        match = re.search(regex, code)
        tuple_groups = match.groups()
        second_point = match.span()[1]
        set_arr = "arr = [" + str(tuple_groups[0]) + ", " + str(tuple_groups[1]) + "];"
        code = code[:second_point] + set_arr + code[second_point:]

        # Insert the return statement 13 characters before the end.
        code = code.strip()
        return code[:-13] + "return arr;" + code[-13:]

    def run_js(self, js):
        """Evaluate ``js`` in a PyV8 context and return ``[list_num16, list_index]``.

        Both lists are copied out of the script's ``result`` array while the
        context is still active.
        """
        glob = Global()
        with PyV8.JSContext(glob) as ctext:
            ctext.eval(js)
            js_array = ctext.locals.result
            list_num16 = [num16 for num16 in js_array[0]]
            list_index = [index for index in js_array[1]]
        return [list_num16, list_index]

    def replace_span(self, part_con, decode_fontfile):
        """Replace every ``hs_kw<N>`` span in one section with its character.

        Args:
            part_con: one section of the review body.
            decode_fontfile: object whose ``get_font(glyph)`` maps a glyph
                name such as ``uniXXXX`` to a character.

        Returns:
            A one-item dict ``{title: decoded_text}``.
        """
        title, con, script_tag = self.get_list_title_con_js(part_con)
        list_num16, list_index = self.run_js(self.put_js(script_tag))
        # The first entry is a comma-separated string of hex code points.
        list_num16 = list_num16[0].split(",")

        regex = r"<span\s*class[\s\S]*?hs_kw(\d+)[\s\S]*?</span>"
        for span in re.finditer(regex, con):
            # Python 2 codec round trip, kept exactly as in the original.
            tag_span = span.group().encode('unicode_escape').decode('string_escape')
            index = list_index[int(span.group(1))]
            num16 = list_num16[int(index)]
            glyph = "uni" + num16.upper()
            font = decode_fontfile.get_font(glyph)
            con = con.replace(tag_span, font)
        return {title: str(con)}

    def get_text_con(self, text_con, decode_fontfile):
        """Decode the whole review body into a list of ``{title: text}`` dicts."""
        return [self.replace_span(part_con, decode_fontfile)
                for part_con in self.get_list_part(text_con)]
decode_fontfile.py
# -*- coding:utf-8 -*-
"""解析字型檔案"""
from fontTools.ttLib import TTFont
import requests
import re
import os

# Character table indexed by glyph ID: DecodeFontFile.get_font() looks up
# list_font[TTFont.getGlyphID(glyph)] to recover the character for a glyph.
list_font = [ ' ', '一', '七', '三', '上', '下', '不', '中', '檔', '比', '油', '泥', '燈', '九', '了', '二', '五',
				'低', '保', '光', '八', '公', '六', '養', '內', '冷', '副', '加', '動', '十', '電', '的', '皮', '盤', '真', '著', '路', '身',
				'軟', '過', '近', '遠', '裡', '量', '長', '門', '問', '只', '右', '啟', '呢', '味', '和', '響', '四', '地', '壞', '坐', '外',
				'多', '大', '好', '孩', '實', '小', '少', '短', '矮', '硬', '空', '級', '耗', '雨', '音', '高', '左', '開', '當', '很', '得',
				'性', '自', '手', '排', '控', '無', '是', '更', '有', '機', '來' ]

class DecodeFontFile(object):
    """Download a font file and map its glyph names to characters."""

    def __init__(self):
        # Path of the most recently downloaded font; empty until
        # download_fontfile() has run.
        self.file_path = ""
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"
        }

    def download_fontfile(self, font_url):
        """Download ``font_url`` into ``./fonts/`` and remember the local path.

        Args:
            font_url: font link without a scheme; ``http://`` is prepended.

        Raises:
            IndexError: if no file name can be extracted from the URL.
        """
        font_url = "http://" + font_url
        cont = requests.get(font_url, headers=self.headers).content
        file_name = re.findall(r'\w{20,}[\s\S]*?ttf', font_url)[0]
        self.file_path = "./fonts/" + file_name

        # Create the target directory on first use so open() cannot fail on a
        # missing folder.
        font_dir = os.path.dirname(self.file_path)
        if not os.path.isdir(font_dir):
            os.makedirs(font_dir)
        with open(self.file_path, "wb") as f:
            f.write(cont)

    def get_glyph_id(self, glyph):
        """Return the glyph ID of ``glyph`` in the downloaded font."""
        ttf = TTFont(self.file_path)
        try:
            return ttf.getGlyphID(glyph)
        finally:
            # Release the underlying font file handle after every lookup.
            ttf.close()

    def get_font(self, glyph):
        """Return the character that module-level ``list_font`` holds for ``glyph``."""
        glyph_id = self.get_glyph_id(glyph)
        return list_font[glyph_id]

document.py
# -*- coding:utf-8 -*-
"""模擬Document物件和window物件"""
import PyV8

class Element():
    """Minimal element stub exposing only an empty ``sheet`` attribute."""

    def __init__(self):
        # Starts out as an empty string.
        self.sheet = ""
class Head(object):
    """Stub for a ``<head>`` node; accepts any appendChild call."""

    def appendChild(self, *args, **kwargs):
        """Ignore all arguments and return the string ``"sheet"``."""
        result = "sheet"
        return result

class v8Doc(PyV8.JSClass):
    """Stand-in for the browser ``document``/``window`` objects inside PyV8.

    Every method accepts arbitrary arguments and returns a fixed stub value.
    """

    def createElement(self, *args, **kwargs):
        """Return a new Element stub, whatever was requested."""
        element = Element()
        return element

    def getElementsByTagName(self, *args, **kwargs):
        """Return a one-item list holding a new Head stub."""
        return [Head()]

    def getComputedStyle(self, *args, **kwargs):
        """Always return None."""
        return None

    def decodeURIComponent(self, *args, **kwargs):
        """Return the positional arguments unchanged, as a tuple."""
        return args

    def querySelectorAll(self, *args, **kwargs):
        """Always return None."""
        return None

class Global(PyV8.JSClass):
    """Global object for the JS context, exposing ``document`` and ``window``."""

    def __init__(self):
        # Each name gets its own separate v8Doc stub instance.
        document_stub = v8Doc()
        window_stub = v8Doc()
        self.document = document_stub
        self.window = window_stub

輸出結果