1. 程式人生 > >爬取筆趣閣小說(一念永恒)

爬取筆趣閣小說(一念永恒)

with inf end name style code color lin lena

!:編碼格式。編碼格式。編碼格式

!!:http://xiaorui.cc/2016/02/19/%E4%BB%A3%E7%A0%81%E5%88%86%E6%9E%90python-requests%E5%BA%93%E4%B8%AD%E6%96%87%E7%BC%96%E7%A0%81%E9%97%AE%E9%A2%98/

!!!:https://www.zhihu.com/question/264878732

!!!!:將 xx.encoding 設為 xx.apparent_encoding(requests 依回應內容偵測出的編碼),再讀取 xx.text,即可避免中文亂碼。

import requests
from bs4 import BeautifulSoup
import re
import sys
# Site root; chapter hrefs scraped from the index page are site-relative
# paths (the original code prepends this same prefix to each of them).
BASE_URL = 'http://www.biqukan.com'

# Promotional footer removed from the end of each chapter's text.
# NOTE(review): this literal is reproduced from the original script and must
# match the page text character-for-character -- confirm against a live page.
FOOTER = '請記住本書首發域名:www.biqukan.com。筆趣閣手機版閱讀網址:m.biqukan.com'

# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block forever on a stalled connection.
REQUEST_TIMEOUT = 10

# Most recently scraped chapter: keys 'title' and 'content'.
article = {}
# Chapter hrefs collected by the most recent getlink()/gettext() call.
ll = []


def _fetch_soup(url):
    """Download *url* and return it parsed with the 'html.parser' backend.

    The response encoding is set from ``apparent_encoding`` (the encoding
    requests detects from the body) before ``.text`` is read, so pages whose
    headers do not declare a charset are still decoded correctly.
    """
    res = requests.get(url, timeout=REQUEST_TIMEOUT)
    res.encoding = res.apparent_encoding
    return BeautifulSoup(res.text, 'html.parser')


def _chapter_links(soup, start=12, stop=20):
    """Return the hrefs found inside ``<dd>`` elements ``start``..``stop - 1``."""
    links = []
    # find_all() is evaluated once and sliced; a page with fewer <dd>
    # elements than expected yields a shorter list instead of IndexError.
    for dd in soup.find_all('dd')[start:stop]:
        # Only real anchors carrying an href are collected, so stray text
        # nodes inside the <dd> cannot break the loop.
        links.extend(anchor.get('href') for anchor in dd.find_all('a', href=True))
    return links


def _strip_footer(text, link):
    """Remove the trailing site footer (and the chapter URL before it).

    ``str.rstrip(chars)`` strips any trailing characters belonging to the
    *set* ``chars``; it does not remove a suffix. It could therefore eat
    genuine chapter text or stop early. Here the footer is removed only when
    it is really present, and the URL removed is the chapter's own *link*
    rather than one hard-coded chapter address.
    """
    trimmed = text.rstrip()
    if not trimmed.endswith(FOOTER):
        return text
    trimmed = trimmed[:-len(FOOTER)]
    if trimmed.endswith(link):
        trimmed = trimmed[:-len(link)]
    return trimmed


def getlink(url, start=12, stop=20):
    """Collect chapter hrefs from the index page at *url*.

    Args:
        url: Address of the novel's chapter index page.
        start: Index of the first ``<dd>`` element to read (default 12, the
            value hard-coded in the original script).
        stop: Index one past the last ``<dd>`` element to read (default 20).

    Returns:
        The module-level list ``ll``, refilled with the hrefs found. It is
        reset on every call, so repeated calls no longer accumulate
        duplicates.
    """
    ll[:] = _chapter_links(_fetch_soup(url), start, stop)
    return ll


def gettext(url, count=3):
    """Download the first *count* collected chapters and append them to a file.

    The output file is named ``<title>.txt``, where the title is the text of
    the first ``.info h2`` element on the index page. For every chapter the
    title and cleaned body are also stored in the module-level ``article``
    dict under ``'title'`` and ``'content'``.

    Args:
        url: Address of the novel's chapter index page.
        count: Maximum number of chapters to download (default 3, the value
            hard-coded in the original script).

    Raises:
        IndexError: If an expected element (``.info h2``, ``.content h1``,
            ``.showtxt``) is missing from a page.
    """
    # The index page is fetched once and reused for both the book title and
    # the chapter links.
    soup = _fetch_soup(url)
    ll[:] = _chapter_links(soup)
    filename = soup.select('.info h2')[0].text

    # Explicit UTF-8 keeps the write independent of the platform's default
    # encoding; `with` closes the file even if a chapter fails midway.
    with open("%s.txt" % filename, 'a', encoding='utf-8') as f:
        for href in ll[:count]:
            link = BASE_URL + href
            st = _fetch_soup(link)
            article['title'] = st.select('.content h1')[0].text
            # Drop carriage returns, ideographic spaces and non-breaking
            # spaces used for page layout.
            # NOTE(review): the '\r' replacement is assumed to be the empty
            # string -- confirm against the original script.
            body = st.select('.showtxt')[0].text
            body = body.replace('\r', '').replace('\u3000', '').replace('\xa0', '')
            article['content'] = _strip_footer(body, link)
            f.write(article['title'] + '\n')
            f.write(article['content'] + '\n')


url = 'http://www.biqukan.com/1_1094/'
gettext(url)

爬取筆趣閣小說(一念永恒)