爬取起點小說信息

爬取起點小說信息

標籤:main、請求、.text、web、ttl、遍歷、import、values、for

沒有vip所以並沒爬取小說內容,這裏主要是解決起點小說字數的反反爬

import random

import requests
import re
import csv
from fontTools.ttLib import TTFont
from io import BytesIO
from pyquery import PyQuery as pq

class Spider(object):
    """Scraper for novel listings on qidian.com.

    Holds a proxy pool, a User-Agent pool, and the base listing URL.
    One User-Agent is picked at random per instance.
    """

    def __init__(self):
        # FIX: was `init`, so the constructor never ran; also the original
        # used typographic quotes (‘…‘), which are a Python syntax error.
        # Pool of HTTP proxies.  NOTE(review): these public proxies are
        # almost certainly stale — verify before relying on them.
        self.proxy_list = [
            {'http': '219.138.58.114:3128'},
            {'http': '61.135.217.7:80'},
            {'http': '101.201.79.172:808'},
            {'http': '122.114.31.177:808'},
        ]
        # Pool of User-Agent strings used to vary the request fingerprint.
        self.user_list = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36',
            'User-Agent:Mozilla/5.0(compatible;MSIE9.0;WindowsNT6.1;Trident/5.0',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
        ]
        # Derive the upper bound from the list instead of hard-coding 3.
        self.index = random.randint(0, len(self.user_list) - 1)
        self.base_url = ('https://www.qidian.com/all?orderId=&style=1&pageSize=20'
                         '&siteid=1&pubflag=0&hiddenField=0&page=')
        self.headers = {"User-Agent": self.user_list[self.index]}

def send_request(self, page_url):
    """Fetch *page_url* with this spider's headers and return the body
    decoded as UTF-8 text."""
    response = requests.get(page_url, headers=self.headers)
    return response.content.decode('utf-8')

def get_font(self, url):
    """Download the anti-scraping web font at *url* and return its best cmap.

    The mapping goes from glyph code point to glyph name (used later by
    ``get_encode`` to turn obfuscated entities back into digits).
    """
    raw = requests.get(url).content
    ttf = TTFont(BytesIO(raw))
    try:
        return ttf.getBestCmap()
    finally:
        # Always release the font object, even if cmap extraction raises.
        ttf.close()

def get_encode(self, cmap, values):
    """Decode an obfuscated word-count string into plain characters.

    Parameters
    ----------
    cmap : dict
        Code-point -> glyph-name mapping from the downloaded font
        (see ``get_font``), e.g. ``{100160: 'one'}``.
    values : str
        Semicolon-separated HTML numeric entities such as
        ``'&#100160;&#100161'``.  A trailing ``';'`` is tolerated
        (FIX: the original crashed with ``int('')`` on the empty
        fragment it produces).

    Returns
    -------
    str
        The decoded digits / decimal point, e.g. ``'12.5'``.
    """
    WORD_MAP = {'zero': '0', 'one': '1', 'two': '2', 'three': '3',
                'four': '4', 'five': '5', 'six': '6', 'seven': '7',
                'eight': '8', 'nine': '9', 'period': '.'}
    decoded = []
    for entity in values.split(';'):
        if not entity:  # empty fragment from a trailing ';' — skip
            continue
        # Strip the '&#' entity prefix, leaving the decimal code point.
        code_point = int(entity[2:])
        decoded.append(WORD_MAP[cmap[code_point]])
    return ''.join(decoded)

def parse(self, data, page_url):
    """Extract book records from one listing page.

    ``data`` is the listing HTML already fetched by ``send_request``;
    ``page_url`` is fetched AGAIN here (a second request) to read the
    per-page anti-scraping font and the obfuscated word-count entities.

    Returns a list of rows, each:
    [title, author, link, type, state, word count, intro].
    """
    # Regexes for each field of a book entry in the listing HTML.
    book_name = r'<h4><a href="(.*?)" target="_blank" data-eid=".*?" data-bid="\d*?">(.*?)</a></h4>'  # link + title
    book_author = r'<a class="name" href=".*?" data-eid=".*?" target="_blank">(.*?)</a>'  # author
    book_type1 = r'<a href=".*?" target="_blank" data-eid=".*?">(.*?)</a>'  # main category
    # Sub-category link (second type field)
    book_type2 = r'<a class="go-sub-type" data-typeid="\d*?" data-subtypeid="\d*?" href="javascript:" data-eid=".*?">(.*?)</a>'  # sub-category
    book_state = r'<span >(.*?)</span>'  # serialization state
    book_intro = r'<p class="intro">(.*?)</p>'  # synopsis
    # book_link = r'<h4><a href="//book.qidian.com/info/1010734492" target="_blank" data-eid="qd_B58" data-bid="1010734492">.*?</a></h4>'  # link

    # Join the field regexes into one pattern spanning a whole entry.
    informations = book_name + r'.*?' + book_author + r'.*?' + book_type1 +                    r'.*?' + book_type2 + r'.*?' + book_state + r'.*?' + book_intro
    # re.S so '.' also matches newlines across the multi-line entry.
    reg = re.compile(informations, re.S)
    # Each match is a tuple: (link, title, author, type1, type2, state, intro).
    contents_list = re.findall(reg, data)
    # print(contents_list)

    # Re-fetch the page to resolve the rotating word-count font.
    response = requests.get(page_url).text
    doc = pq(response)
    # CSS class that identifies this page's word-count span / font.
    classattr = doc('p.update > span > span').attr('class')
    # NOTE(review): `cla` is computed but never used.
    cla = doc('p.update > span > span')
    # print(cla)
    pattern = '</style><span.*?%s.*?>(.*?)</span>' % classattr
    # All obfuscated word-count entity strings on this page, in order.
    numberlist = re.findall(pattern, response)
    # Inline <style> text that embeds the font file URLs.
    fonturl = doc('p.update > span > style').text()
    # Pull the TTF URL out of the woff/truetype @font-face declaration.
    url = re.search('woff.*?url.*?\'(.+?)\'.*?truetype', fonturl).group(1)
    cmap = self.get_font(url)

    contents = []
    # Walk the regex matches in step with numberlist (same page order).
    i = 0
    for content in contents_list:
        content = list(content)
        # print(content)
        new_content = content[1:3]  # title + author
        new_content.append('https:' + content[0])  # link (scheme-relative in HTML)
        new_content.append(content[3] + '-' + content[4])  # "type-subtype"
        new_content.append(content[5])  # state
        # numberlist[i][:-1] drops the trailing ';' before decoding.
        new_content.append(self.get_encode(cmap, numberlist[i][:-1]) + '萬字')  # word count
        new_content.append(content[6].strip())  # intro
        # Collect the finished row.
        contents.append(new_content)
        print(contents)
        i += 1

    return contents

def write(self, contents, csv_writer):
    """Append every parsed row in *contents* to the open CSV writer."""
    for row in contents:
        csv_writer.writerow(row)

def run(self, pages=1):
    """Scrape listing pages 1..*pages* and dump all rows to qidian.csv."""
    # CSV column headers.
    fieldnames = ['作品', '作者', '鏈接', '類型', '狀態', '字數', '簡介']
    # gb18030 encoding so the Chinese field names round-trip; newline=''
    # as required by the csv module.
    with open('qidian.csv', 'w', newline='', encoding='gb18030') as f:
        writer = csv.writer(f)
        writer.writerow(fieldnames)

        for page in range(1, pages + 1):
            page_url = self.base_url + str(page)
            print(page_url)
            # fetch -> parse -> persist, one page at a time
            page_html = self.send_request(page_url)
            rows = self.parse(page_html, page_url)
            self.write(rows, writer)

# FIX: was `if name == ‘main‘:` — undefined `name` plus typographic
# quotes, both syntax/name errors.  Standard script entry guard.
if __name__ == '__main__':
    # Scrape the first two listing pages.
    Spider().run(2)

爬取起點小說信息