
Crawling All Blog Posts

Crawl the content of all the blog posts and convert it to PDF format.

from bs4 import BeautifulSoup
import pdfkit
import re
import requests


def getPagehtml(url):  # fetch a page's HTML source
    response = requests.get(url)
    return response.text
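
# Note: CSDN may reject requests that lack a browser-like User-Agent header.
# A hedged variant (the header value is illustrative, not from the original post):
#     response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})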


def createurl(text):  # match every blog post URL in the page source
    '''
    Each post appears in the source as an anchor like:
    <a href="https://blog.csdn.net/qq_41911569/article/details/83034422" target="_blank"><span class="article-type type-1">原</span>爬取貓眼電影</a>
    :param text: page source
    :return: list of matched article URLs
    '''
    pattern = r'<a href="(https://blog.csdn.net/qq_41911569/article/.*?)" target="_blank">'
    return re.findall(pattern, text)
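
# An equivalent extraction with BeautifulSoup instead of the regex (a sketch,
# assuming the same markup; not part of the original post):
#     soup = BeautifulSoup(text, 'html5lib')
#     urls = [a['href'] for a in soup.find_all('a', href=True)
#             if a['href'].startswith('https://blog.csdn.net/qq_41911569/article/')]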

# Quick smoke test on the author's profile page (prints the matched URLs):
url = 'https://blog.csdn.net/qq_41911569'
text = getPagehtml(url)
print(createurl(text))


def get_blog_content(i, url):  # fetch one post by URL and save its content to a local file
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html5lib')
    # keep the <head> so the saved file retains the original CSS references
    head = soup.head
    # post title
    title = soup.find_all(class_="title-article")[0].get_text()
    # post body
    content = soup.find_all(class_="article_content")[0]
    # write the title and body to a local HTML file
    with open('/home/kiosk/Desktop/python筆記/python_stack/day26/bs/westos%d.html' % i, 'w') as f:
        f.write(str(head))
        f.write('<h1>%s</h1>\n\n' % title)
        f.write(str(content))
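
# Note: the 'html5lib' parser used above is a separate dependency
# (pip install html5lib); BeautifulSoup's built-in 'html.parser' also works here.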

def main():
    # list pages follow the pattern https://blog.csdn.net/qq_41911569/article/list/3
    article_url = []
    for i in range(3):
        url = 'https://blog.csdn.net/qq_41911569/article/list/%d' % (i + 1)
        text = getPagehtml(url)
        article_url.append(createurl(text))
    # flatten the per-page URL lists into one list
    article_url = [j for i in article_url for j in i]

    # deduplicate, then fetch and save every post
    for i, v in enumerate(set(article_url)):
        get_blog_content(i, v)


main()
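
The script imports pdfkit but never calls it, so the advertised HTML-to-PDF step is missing. Below is a minimal sketch of that step; it assumes the wkhtmltopdf binary is installed (pdfkit shells out to it) and reuses the output directory from get_blog_content. The helper name and glob pattern are illustrative, not from the original post.

import glob

def convert_all_to_pdf():
    # turn each saved HTML page into a PDF file next to it
    for html_path in glob.glob('/home/kiosk/Desktop/python筆記/python_stack/day26/bs/westos*.html'):
        # pdfkit.from_file() requires wkhtmltopdf to be on the PATH
        pdfkit.from_file(html_path, html_path.replace('.html', '.pdf'))

convert_all_to_pdf()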

Result: (screenshot omitted)