1. 程式人生 > python 爬蟲(五)爬取多頁內容

python 爬蟲(五)爬取多頁內容

import urllib.request
import ssl
import re

def ajaxCrawler(url):
    """Fetch ``url`` and return the response body decoded as UTF-8 text.

    The request is sent with a desktop Chrome ``User-Agent`` header. For
    HTTPS URLs the server certificate is NOT verified (see the note below).

    Args:
        url: Address of the resource to download.

    Returns:
        The full response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request fails (``HTTPError`` for HTTP
            error statuses).
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"}
    req = urllib.request.Request(url, headers=headers)

    # NOTE(security): this builds an SSL context that skips certificate
    # verification, using a private ssl helper. It is kept so behaviour stays
    # the same, but it leaves the connection open to man-in-the-middle
    # attacks; switch to ssl.create_default_context() if the target's
    # certificate chain validates.
    context = ssl._create_unverified_context()

    # The context manager closes the underlying connection as soon as the
    # body has been read, instead of leaving it to the garbage collector.
    with urllib.request.urlopen(req, context=context) as response:
        return response.read().decode("utf-8")

# First listing page; further pages follow the same pattern (page/2/, page/3/, ...)
# and could be fetched in a loop.
url = "https://www.qiushibaike.com/text/page/1/"

# Captures the markup found between an "article block untagged mb15" marker and
# the next 'class="stats-comments'; re.S lets "." match across newlines.
par1 = r'''article block untagged mb15(.*?)class="stats-comments'''
re_ob = re.compile(par1, re.S)

# Compiled once up front instead of on every loop iteration. The patterns are
# deliberately loose so that small markup variations still match.
re_Content = re.compile(r'''class="content".*?<span>(.*?)</span>''', re.S)
re_name = re.compile(r'''<h2>(.*?)</h2>''', re.S)

listStr = re_ob.findall(ajaxCrawler(url))

# Maps the <h2> text (user name) to the content text of that fragment.
# NOTE: a later fragment with the same name replaces the earlier entry.
jsonStr = {}

for ss in listStr:
    content_match = re_Content.search(ss)
    name_match = re_name.search(ss)
    # Skip fragments that lack either field rather than crashing with an
    # IndexError on an empty match list.
    if content_match is None or name_match is None:
        continue

    # group(1) of the first match is the same value findall(...)[0] yields.
    userContent = content_match.group(1)
    userName = name_match.group(1)

    jsonStr[userName] = userContent

for k, v in jsonStr.items():
    print(k + ":說" + v)