程式人生 > python 爬蟲爬取小說 單程序與多程序 學習

python 爬蟲爬取小說 單程序與多程序 學習

轉載地址

1 單程序:

# -*- coding:UTF-8 -*-
from bs4 import BeautifulSoup
import requests, sys

"""
類說明:下載《筆趣看》網小說《一念永恆》
"""


class downloader(object):
    """Download the novel "一念永恆" from the biqukan.com site.

    Scrapes the chapter index page for chapter names and links, then
    fetches each chapter body and appends it to a local text file.
    """

    def __init__(self):
        self.server = 'http://www.biqukan.com/'        # site root, prefixed to relative chapter links
        self.target = 'http://www.biqukan.com/1_1094/' # chapter index page of this novel
        self.names = []  # chapter titles
        self.urls = []   # absolute chapter URLs
        self.nums = 0    # number of chapters

    def get_download_url(self):
        """Populate self.names / self.urls / self.nums from the index page."""
        html = requests.get(self.target).text
        soup = BeautifulSoup(html, 'html.parser')
        listmain = soup.find_all('div', class_='listmain')
        # Re-parse just the chapter-list <div> so only its <a> tags are found.
        anchors = BeautifulSoup(str(listmain[0]), 'html.parser').find_all('a')
        # The first 15 anchors are "latest chapter" teaser links, not the
        # real chapter list, so they are skipped.
        # (renamed from `list`, which shadowed the builtin)
        chapters = anchors[15:]
        self.nums = len(chapters)
        for each in chapters:
            self.names.append(each.string)
            self.urls.append(self.server + each.get('href'))

    def get_content(self, target):
        """Fetch one chapter page and return its body text.

        The site pads paragraphs with runs of 8 non-breaking spaces
        (U+00A0); each run is replaced with a blank line.
        """
        html = requests.get(url=target).text
        soup = BeautifulSoup(html, 'html.parser')
        div = soup.find_all('div', class_='showtxt')
        return div[0].text.replace('\xa0' * 8, '\n\n')

    def writer(self):
        """Append every chapter (title + body) to 一念永恆.txt."""
        # Context manager guarantees the file is closed even if a fetch
        # raises mid-loop (the original open()/close() pair leaked the
        # handle on any exception).
        with open('一念永恆.txt', 'a', encoding='utf-8') as f:
            for i in range(self.nums):
                f.write(self.names[i])
                f.write('\n')
                f.writelines(self.get_content(self.urls[i]))
                f.write('\n\n')
                print(i)
if __name__ == "__main__":
    # Single-process run: build the chapter list, then download each
    # chapter sequentially and append it to the output file.
    novel = downloader()
    novel.get_download_url()
    novel.writer()

2 多程序

注意:dl.names[i] 是 bs4 的 NavigableString(帶有對整棵解析樹的引用),直接放進 Pool 的 args 時無法被 pickle 序列化傳給子程序,所以先用 str() 轉成普通字串。

# -*- coding:UTF-8 -*-
from bs4 import BeautifulSoup
import requests, sys
from multiprocessing import Pool
import string
# Request headers shared by the fetches below: the Cookie / User-Agent pair
# mimics a real browser session so biqukan.com accepts the request.
headers={
'Cookie':r'UM_distinctid=164fd71debe478-0aed9f594fffa9-3c604504-1fa400-164fd71debf4c2; bcolor=; font=; size=; fontcolor=; width=; CNZZDATA1260938422=2084872231-1533260238-%7C1533551125',
'Host':'www.biqukan.com',
#'Referer':r'http://www.biqukan.com/1_1094/',
'User-Agent':r'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}

class downloader(object):
    """Collect chapter titles and links for the novel at biqukan.com/1_1094/."""

    def __init__(self):
        self.server = 'http://www.biqukan.com/'        # site root for building absolute URLs
        self.target = 'http://www.biqukan.com/1_1094/' # chapter index page
        self.names = []  # chapter titles
        self.urls = []   # chapter links
        self.nums = 0    # chapter count

    def get_download_url(self):
        """Fill names / urls / nums by scraping the chapter index page."""
        page = requests.get(url=self.target, headers=headers).text
        listing = BeautifulSoup(page, "html.parser").find_all('div', class_='listmain')
        # Re-parse only the chapter-list <div> so stray page links are excluded.
        anchors = BeautifulSoup(str(listing[0]), "html.parser").find_all('a')
        wanted = anchors[15:]  # drop the 15 "latest chapter" teaser links
        self.nums = len(wanted)
        for link in wanted:
            self.names.append(link.string)
            self.urls.append(self.server + link.get('href'))


def get_contents(target):
    """Fetch one chapter page and return its body text.

    Sends the same browser-mimicking ``headers`` as the index fetch —
    the original omitted them here, inconsistently with
    ``downloader.get_download_url`` (the headers were added because the
    site rejects bare requests).

    The site pads paragraphs with runs of 8 non-breaking spaces
    (U+00A0); each run is replaced with a blank line.
    """
    req = requests.get(url=target, headers=headers)
    bf = BeautifulSoup(req.text, "html.parser")
    texts = bf.find_all('div', class_='showtxt')
    return texts[0].text.replace('\xa0' * 8, '\n\n')

def writer(name, path, texts):
    """Download one chapter and append it to *path*.

    *name* is the chapter title, *path* the output file, and *texts* —
    despite its name — is the chapter URL passed on to get_contents()
    (parameter name kept for interface compatibility).
    """
    content = get_contents(texts)
    with open(path, 'a', encoding='utf-8') as out:
        out.write(name + '\n')
        out.writelines(content)
        out.write('\n\n')


if __name__ == "__main__":
    # Multiprocess run: a pool of 15 workers, each task downloading one
    # chapter and appending it to the shared file.
    # NOTE(review): concurrent appends mean chapters can land out of order.
    pool = Pool(15)
    dl = downloader()
    dl.get_download_url()

    # Fixed title typo: the novel is 《一念永恆》 (as in the output filename),
    # not 《一年永恆》.
    print('《一念永恆》開始下載:')

    for i in range(dl.nums):
        # dl.names[i] is a bs4 NavigableString, which cannot be pickled for
        # transfer to a worker process; str() converts it to a plain string.
        pool.apply_async(writer, args=(str(dl.names[i]), '一念永恆.txt', dl.urls[i]))

    pool.close()
    pool.join()
    print('《一念永恆》下載完成')