python爬蟲 爬取詩詞名句網
阿新 • • 發佈:2018-12-30
- 使用 requests 庫與 lxml 庫(透過 etree 的 XPath 功能解析頁面)
import requests
import time
from lxml import etree


def request_Header(url):
    """Fetch *url* with a browser-like User-Agent and return the Response.

    A timeout is added so a stalled connection cannot hang the crawl
    forever; the public name and signature are unchanged for callers.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
    }
    # timeout=(connect, read) keeps a dead host from blocking the whole run.
    return requests.get(url=url, headers=headers, timeout=(10, 30))


def get_content_text(all_href):
    """Download one chapter page and return its body text.

    The chapter paragraphs live in <p> tags under the layui content div.
    Paragraphs are joined with newlines via ''.join — O(n) instead of the
    original quadratic `+=` loop, and without the original's stray leading
    space (the accumulator was seeded with ' ').
    """
    response = request_Header(all_href)
    tree = etree.HTML(response.content)
    # List of text nodes, one per paragraph of the chapter.
    paragraphs = tree.xpath(
        '//div[@class="layui-col-md8 layui-col-sm7"]/div/div/p/text()')
    strs_cont = '\n'.join(paragraphs) + '\n'
    print(strs_cont)
    return strs_cont


def main():
    """Crawl the chapter index and write every chapter into one text file."""
    url = 'http://www.shicimingju.com/book/sanguoyanyi.html'
    req = request_Header(url)
    etrees = etree.HTML(req.content)
    # One <a> element per chapter: its text is the title, @href the page.
    text = etrees.xpath('//div[@class="book-mulu"]/ul/li/a')
    # `with` guarantees the file is closed even if a request raises mid-run
    # (the original left the handle open on any exception).
    with open('三國演義.txt', 'w', encoding='utf8') as fp:
        for a in text:
            title = a.xpath('text()')[0]
            print('正在下載>>>%s' % title)
            href = a.xpath('@href')[0]
            # Chapter links are site-relative; prepend the host.
            all_href = 'http://www.shicimingju.com' + str(href)
            get_content = get_content_text(all_href)
            fp.write(title + '\n' + str(get_content) + '\n')
            # time.sleep(2)  # optional politeness delay between requests
            print('結束下載%s' % title)


if __name__ == '__main__':
    main()