1. 程式人生 > >bs4抓取糗事百科

bs4抓取糗事百科

tps quest mpi block ntp lap closed resp pan

抓取糗事百科內容及評論,不包含圖片信息。在 headers 的 User-Agent 中填入自己瀏覽器的 user-agent 即可。以 360 極速瀏覽器為例,可以在地址欄輸入 about:version 後回車,「用戶代理」後面的一長串就是需要填入 '' 裏面的內容。其他瀏覽器的做法可以自行百度。

技術分享圖片
import urllib.request
import re
from urllib import request
from bs4 import BeautifulSoup

#1.獲取網頁源代碼
def get_html(url):
    """Download ``url`` and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as a ``str``.
    """
    headers = {
        # Placeholder: fill in your own browser's user-agent string.
        "User-Agent": "",
    }
    req = request.Request(headers=headers, url=url)
    # The context manager closes the response once the body has been read.
    with urllib.request.urlopen(req) as response:
        return response.read().decode("utf-8")


def _article_id(tag_id):
    """Return the third ``_``-separated field of ``tag_id``, or ``None``.

    ``None`` is returned when the id is missing or has fewer than three
    fields, so callers can skip the element instead of crashing.
    """
    parts = str(tag_id).strip().split("_")
    return parts[2] if len(parts) > 2 else None


def get_comment_link(content, comment_url_base):
    """Find every article on a listing page and print it with its comments.

    Args:
        content: HTML source of the listing page.
        comment_url_base: ``%``-format template with one ``%s`` slot that
            receives the article id taken from the element's ``id``.
    """
    soup = BeautifulSoup(content, "html.parser")
    article_class = re.compile(r"article block untagged mb15.*?")
    articleFloor = 1
    for article in soup.find_all(class_=article_class):
        article_id = _article_id(article.get("id"))
        if article_id is None:
            # No usable id on this element: there is no detail URL to build.
            continue
        comment_url = comment_url_base % article_id  # detail/comment page
        get_comment_content(comment_url, articleFloor)
        articleFloor += 1


def get_comment_content(comment_url, articleFloor):
    """Fetch one article page and print its text followed by its comments.

    Args:
        comment_url: Address of the article's detail page.
        articleFloor: Running number printed in front of the article text.
    """
    commentPage = get_html(comment_url)
    soupComment = BeautifulSoup(commentPage, "html.parser")

    # Article text.
    for item in soupComment.find_all("div", class_="content"):
        print(articleFloor, ".", item.get_text().strip())

    # Comments, numbered from 1.
    for commentFloor, comment in enumerate(
            soupComment.find_all(class_="body"), start=1):
        print(" ", commentFloor, "樓回復:", comment.get_text())


def command():
    """Prompt once on stdin and run :func:`main` if the user confirms.

    Pressing Enter (empty input) or typing ``enter`` starts the scrape.
    Any other input, including ``exit``, returns without doing anything.
    """
    raw = input("點擊enter查看或者輸入exit退出,請輸入你的選擇:")
    if raw.strip().lower() in ("", "enter"):
        main()


def main(page=2):
    """Scrape one listing page and print each article with its comments.

    Args:
        page: Number substituted into the listing URL. Defaults to 2, the
            value that was previously hard-coded.
    """
    article_url_base = "https://www.qiushibaike.com/8hr/page/%d/"  # listing
    comment_url_base = "https://www.qiushibaike.com/article/%s"  # detail
    article_url = article_url_base % page
    content = get_html(article_url)
    get_comment_link(content, comment_url_base)


if __name__ == "__main__":
    command()
View Code

bs4抓取糗事百科