抓取biqukan
阿新 • • 發佈:2019-03-15
quest wing sts imp pat owin sibling list title
#python3.7
"""Download the novel at www.biqukan.com/1_1094/5403177.html as a txt file.

v1.0
"""
import sys
import time  # currently unused; the original script kept it for an optional time.sleep(1) between requests
from urllib.parse import urljoin

import requests
from lxml import etree

# Seconds to wait for the server before a request is abandoned. Without a
# timeout, requests.get() can block forever on a stalled connection.
REQUEST_TIMEOUT = 10


## 1. Fetch a page
def get_one_page(url):
    """Fetch *url* and return the response body as text.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        The decoded page text, or None when the request fails (network
        error, timeout) or the server answers with a status other than 200.
    """
    try:
        res = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        # Map transport failures onto the same "None means failure" result
        # that a non-200 status already produces.
        return None
    if res.status_code != 200:
        return None
    # NOTE(review): res.text relies on the charset that requests detects. If
    # chapters come out garbled, setting res.encoding = res.apparent_encoding
    # before reading res.text may be needed -- confirm against the live site.
    return res.text


## 0. Collect the URLs of all chapters
def get_url_list(catalog_url):
    """Return the chapter hrefs found on the catalog page.

    Args:
        catalog_url: URL of the novel's table-of-contents page.

    Returns:
        A list of href strings taken from every <a> inside the siblings that
        follow the second <dt> of the "listmain" block, or None when the
        catalog page could not be fetched.
    """
    text = get_one_page(catalog_url)
    if text is None:
        return None
    html = etree.HTML(text)
    return html.xpath(
        '//div[@class="listmain"]/dl/dt[2]/following-sibling::*/a/@href'
    )


## 2. Parse a chapter page
def parse_one_page(text):
    """Extract the chapter title and body text from a chapter page.

    Args:
        text: HTML source of one chapter page.

    Returns:
        A tuple ``(title, contents)`` where ``title`` is the list of text
        nodes found under the content heading and ``contents`` is the chapter
        body as a single string.
    """
    html = etree.HTML(text)
    title = html.xpath('//div[@class="content"]/h1//text()')
    content = html.xpath('//div[@class="showtxt"]//text()')
    # Join the text nodes into one string and turn each run of eight
    # non-breaking spaces into two newlines.
    contents = ''.join(content).replace('\xa0' * 8, '\n' * 2)
    return title, contents


## 3. Save a chapter
def write_to_file(title, contents):
    """Append one chapter (title line followed by body) to the output file.

    Args:
        title: The chapter title as a string. A list of title fragments, as
            returned by parse_one_page, is also accepted; its first element
            is used.
        contents: The chapter body text.
    """
    if not isinstance(title, str):
        # Tolerate the raw xpath result; an empty result yields an empty title.
        title = title[0] if title else ''
    with open('一念永恒.txt', 'a', encoding='utf-8') as f:
        # Write the whole title. The original wrote title[0] here even though
        # the caller had already indexed the list, so only the first
        # character of every title reached the file.
        f.write(title + '\n' + contents + '\n')


## Main routine
def main():
    """Download every chapter listed in the catalog and append it to the txt file."""
    # 0. Get the chapter links from the catalog page.
    catalog_url = 'https://www.biqukan.com/1_1094/'
    urls = get_url_list(catalog_url)
    if not urls:
        sys.stderr.write('No chapter links could be read from %s\n' % catalog_url)
        return

    total = len(urls)
    # 1. Fetch each chapter page, parse it and save it.
    for done, href in enumerate(urls, start=1):
        # urljoin yields the same URL as prefixing the site root for
        # root-relative hrefs, and also copes with relative or absolute ones.
        rel_url = urljoin(catalog_url, href)
        print(rel_url)
        text = get_one_page(rel_url)
        if text is None:
            sys.stderr.write('Skipped (download failed): %s\n' % rel_url)
            continue
        title, contents = parse_one_page(text)
        # Guard against a page without a heading instead of raising IndexError.
        write_to_file(title[0] if title else '', contents)
        # Show progress as a real percentage of chapters processed so far.
        sys.stdout.write(" 已下載:%.3f%%" % (100 * done / total) + '\r')
        sys.stdout.flush()


## Entry point
if __name__ == '__main__':
    main()
抓取biqukan