1. 程式人生 > >xpath爬取簡書 攝影專題裡的最新收錄 文章的詳情及圖片 完整程式碼

xpath爬取簡書 攝影專題裡的最新收錄 文章的詳情及圖片 完整程式碼

import requests
from lxml import etree #etree
import urllib.parse
import re

header = {
    "User-Agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
}
# Seconds to wait for a server response before abandoning a request;
# without a timeout a stalled connection would block the crawl forever.
_TIMEOUT = 10

# Characters that cannot safely appear inside a file name.
_UNSAFE_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')

# Captures up to 15 characters after the last hyphen that has text following it.
_IMG_SUFFIX = re.compile(r".*-(.{1,15})", re.S)


def _file_stem(b):
    """Turn an article title into a file name stem.

    ``b`` may be a plain string or the list of text nodes that an XPath
    ``text()`` query returns.  List items are concatenated, characters that
    are unsafe in file names are replaced with ``_``, and an empty result
    falls back to ``'untitled'``.
    """
    if isinstance(b, (list, tuple)):
        b = ''.join(str(part) for part in b)
    stem = _UNSAFE_CHARS.sub('_', str(b).strip())
    return stem or 'untitled'


def suibian(url):
    """Fetch a listing page and scrape every article linked from it.

    Each ``<li>`` under ``ul.note-list`` is searched for an ``a.title`` link;
    the link is resolved against the final response URL, printed, and handed
    to :func:`jiexi`.  List items without such a link are skipped.
    """
    respones = requests.get(url, headers=header, timeout=_TIMEOUT)
    page = etree.HTML(respones.text)
    for item in page.xpath('//ul[@class="note-list"]/li'):
        hrefs = item.xpath('.//a[@class="title"]/@href')
        if not hrefs:
            # No title link in this entry: nothing to follow.
            continue
        article_url = urllib.parse.urljoin(respones.url, hrefs[0])
        print(article_url)
        jiexi(article_url)


def jiexi(url):
    """Fetch one article page and save its images and text.

    Every ``data-original-src`` image inside ``div.show-content`` is
    downloaded through :func:`img`, and the text nodes of that same
    container are appended to the article's text file through
    :func:`writes`.
    """
    response = requests.get(url, headers=header, timeout=_TIMEOUT)
    page = etree.HTML(response.text)
    title = page.xpath('//h1[@class="title"]/text()')
    image_sources = page.xpath(
        '//div[@class="show-content"]//img/@data-original-src')
    text_nodes = page.xpath('//div[@class="show-content"]//text()')

    for src in image_sources:
        # urljoin resolves protocol-relative ("//host/path") sources against
        # the page URL and leaves already-absolute URLs untouched.
        img(urllib.parse.urljoin(response.url, src), title)

    if text_nodes:
        # Write the whole body in one go instead of reopening the file for
        # every text node.
        writes(''.join(text_nodes), title)


def img(url, b):
    """Download the image at ``url`` into ``<title><suffix>.jpg``.

    ``b`` is the article title (string or list of text nodes).  The suffix is
    taken from the characters after the last hyphen of the final image URL;
    when the URL has no usable hyphen the last 15 characters of the URL are
    used instead.  Unsafe file name characters are replaced with ``_``.
    """
    response = requests.get(url, headers=header, timeout=_TIMEOUT)
    found = _IMG_SUFFIX.search(response.url)
    tail = found.group(1) if found else response.url[-15:]
    suffix = _UNSAFE_CHARS.sub('_', tail)
    with open('{}{}.jpg'.format(_file_stem(b), suffix), 'wb') as f:
        f.write(response.content)


def writes(t, b):
    """Append the text ``t`` to ``<title>.txt``.

    ``b`` is the article title (string or list of text nodes).  The file is
    always written as UTF-8 so non-ASCII text does not depend on the
    platform's default encoding.
    """
    with open('{}.txt'.format(_file_stem(b)), 'a', encoding='utf-8') as f:
        f.write(t)


if __name__ == '__main__':
    url = "https://www.jianshu.com/c/7b2be866f564?order_by=added_at&page=1"
    suibian(url)