1. 程式人生 > Python 爬蟲-豆瓣讀書

Python 爬蟲-豆瓣讀書

import requests
from bs4 import BeautifulSoup
#有需要Python學習資料的小夥伴嗎?小編整理【一套Python資料、原始碼和PDF】,感興趣者可以加學習群:548377875,反正閒著也是閒著呢,不如學點東西啦~~

def parse_html(num):
    """Fetch one page of the Douban book Top 250 list and format it as text.

    Args:
        num: Value placed in the ``start`` query parameter of the list URL
            (the offset of the first book on the requested page).

    Returns:
        A string containing one record per book found on the page. Each
        record holds the title, the publication info line, the rating and
        the one-line quote, followed by a separator line. A field that is
        missing for a book is rendered as an empty string. Returns ``''``
        when the page contains no books.

    Raises:
        requests.RequestException: If the request times out, the connection
            fails, or the server answers with an HTTP error status.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    response = requests.get(
        f'https://book.douban.com/top250?start={num}',
        headers=headers,
        timeout=10,  # never hang forever on a stalled connection
    )
    # Fail loudly on error pages instead of parsing them as an empty listing.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'lxml')

    def text_of(tag):
        # Optional fields: a missing element becomes an empty string.
        return tag.get_text() if tag is not None else ''

    records = []
    for title_div in soup.find_all('div', class_='pl2'):
        # Look up every field inside the book's own row rather than zipping
        # four page-wide lists: a book without a quote would otherwise shift
        # all later quotes onto the wrong books.
        # NOTE(review): assumes each book's fields live in the nearest
        # enclosing <tr> (falling back to the direct parent) -- confirm
        # against the live page markup.
        row = title_div.find_parent('tr') or title_div.parent

        link = title_div.find('a')
        if link is None:
            name = ''
        else:
            # Prefer the full title attribute; fall back to the link text.
            name = link.get('title') or link.get_text(strip=True)

        info = text_of(row.find('p', class_='pl'))
        rate = text_of(row.find('span', class_='rating_nums'))
        inq = text_of(row.find('span', class_='inq'))

        records.append(
            f'書名:{name}\n作者:{info}\n評分:{rate}\n簡介:{inq}\n=======================\n'
        )

    return ''.join(records)


if __name__ == '__main__':
    # Scrape the ten list pages (start offsets 0, 25, ..., 225) in order and
    # glue their formatted text together.
    page_texts = [parse_html(offset) for offset in range(0, 250, 25)]
    combined = ''.join(page_texts)

    # Write the whole result to disk in one go as UTF-8 text.
    with open('豆瓣圖書Top250.txt', mode='w', encoding='utf-8') as out_file:
        out_file.write(combined)