程式人生 > 一個爬取52破解的全部帖子地址的簡單爬蟲

一個爬取52破解的全部帖子地址的簡單爬蟲

軟件調試 ict print __main__ 逆向 慶典活動 exception requests 總頁數

 


# -*- coding:utf-8 -*-
"""Simple crawler for 52pojie.cn.

Prints a menu of forum sections, then walks every page of the chosen
section and appends each thread's title and absolute URL to a text file
named by the user.
"""
import requests
from bs4 import BeautifulSoup
import time

# Menu of forum sections: each entry maps a section name to the URL of
# that section's first page.  Kept as a list of one-item dicts so the
# menu order (and the numeric codes shown to the user) stays stable.
title_list = [{'原創發布區': 'http://www.52pojie.cn/forum-2-1.html'},
              {'逆向資源區': 'http://www.52pojie.cn/forum-4-1.html'},
              {'脫殼破解區': 'http://www.52pojie.cn/forum-5-1.html'},
              {'動畫發布區': 'http://www.52pojie.cn/forum-6-1.html'},
              {'懸賞問答區': 'http://www.52pojie.cn/forum-8-1.html'},
              {'水漫金山': 'http://www.52pojie.cn/forum-10-1.html'},
              {'站點公告': 'http://www.52pojie.cn/forum-13-1.html'},
              {'精品軟件區': 'http://www.52pojie.cn/forum-16-1.html'},
              {'音樂視頻': 'http://www.52pojie.cn/forum-19-1.html'},
              {'編程語言區': 'http://www.52pojie.cn/forum-24-1.html'},
              {'申請專區': 'http://www.52pojie.cn/forum-25-1.html'},
              {'LCG Area': 'http://www.52pojie.cn/forum-28-1.html'},
              {'病毒分析區': 'http://www.52pojie.cn/forum-32-1.html'},
              {'周年慶典活動專區': 'https://www.52pojie.cn/forum-36-1.html'},
              {'招聘求職': 'http://www.52pojie.cn/forum-39-1.html'},
              {'病毒樣本區': 'http://www.52pojie.cn/forum-40-1.html'},
              {'安全工具區': 'http://www.52pojie.cn/forum-41-1.html'},
              {'電子書策劃制作區': 'http://www.52pojie.cn/forum-42-1.html'},
              {'Key|Patch|共享賬號': 'http://www.52pojie.cn/forum-44-1.html'},
              {'病毒救援區': 'http://www.52pojie.cn/forum-50-1.html'},
              {'影視推薦': 'http://www.52pojie.cn/forum-56-1.html'},
              {'LSG Area': 'http://www.52pojie.cn/forum-58-1.html'},
              {'軟件調試區': 'http://www.52pojie.cn/forum-59-1.html'},
              {'T恤活動作品區': 'http://www.52pojie.cn/forum-62-1.html'},
              {'移動安全區': 'http://www.52pojie.cn/forum-65-1.html'},
              {'福利經驗': 'http://www.52pojie.cn/forum-66-1.html'},
              {'2014CrackMe大賽': 'http://www.52pojie.cn/forum-67-1.html'},
              {'吾愛破解2016安全挑戰賽': 'http://www.52pojie.cn/forum-71-1.html'},
              {'站務處理': 'http://www.52pojie.cn/forum-72-1.html'}]


def get_html(url):
    """Fetch *url* and return the response body as text.

    Retries forever on network errors, sleeping 10 s between attempts
    (the catch is narrowed to requests.RequestException so programming
    errors are no longer silently swallowed).
    """
    while True:
        try:
            response = requests.get(url)
            return response.text
        except requests.RequestException:
            # Transient network failure: back off, then retry same URL.
            time.sleep(10)


def get_page(url):
    """Return the total number of pages of the forum section at *url*."""
    html = get_html(url)
    soup = BeautifulSoup(html, 'lxml')
    label_list = soup.find_all('label')
    # The fourth <label> contains a <span> whose text embeds the page
    # count; the [3:-2] slice strips the surrounding characters.
    # NOTE(review): this is tied to 52pojie's current markup — verify
    # against a live page if the site layout changes.
    page = int(label_list[3].span.string[3:-2])
    return page


def page_down(url):
    """Download every page of the section whose first page is *url*.

    Asks the user for an output file name and appends each thread's
    title and absolute link, one per line.
    """
    page = get_page(url)
    print("總頁數:" + str(page))
    txt = input("請輸入保存到的文件名(註意添加後綴):")
    for j in range(1, page + 1):
        # The original fill character "" made str.center raise
        # TypeError (fill must be exactly one char); use "=" instead.
        print(("第" + str(j) + "頁下載中").center(40, "="))
        # Section URLs end with "-1.html" (7 chars); swap in page j.
        html = get_html(url[:-7] + '-' + str(j) + '.html')
        soup = BeautifulSoup(html, 'lxml')
        # Thread links carry class "s xst" on 52pojie.
        a_list = soup.find_all('a', attrs={'class': 's xst'})
        # Open the output file once per page (the original reopened it
        # for every single link) and append title + absolute URL.
        with open(txt, 'a+', encoding='utf-8') as f:
            for a in a_list:
                f.write(a.get_text())
                f.write('\n')
                f.write("https://www.52pojie.cn/" + a.attrs['href'])
                f.write('\n')
        print(("第" + str(j) + "頁下載完成").center(40, "="))


def main():
    """Show the section menu, read a valid choice, start the download."""
    # Print the menu two sections per row.  (The original used a local
    # named `time` for this flag, shadowing the imported time module.)
    second_column = False
    for i, title in enumerate(title_list):
        for key in title:
            if second_column:
                print((str(i) + ':' + key).ljust(20))
                second_column = False
            else:
                print((str(i) + ':' + key).ljust(20), end=" ")
                second_column = True

    # Keep asking until the input is an integer inside the menu range.
    # Bound derived from the list instead of the hard-coded 28.
    while True:
        try:
            print()
            num = int(input('請輸入你要瀏覽的代號:'))
            if 0 <= num < len(title_list):
                break
            print('輸入有誤請重新輸入')
        except ValueError:
            print('輸入有誤請重新輸入')

    # Resolve the chosen section's URL and start downloading it.
    dict_t = title_list[num]
    for key in dict_t:
        print(dict_t[key])
        page_down(dict_t[key])


if __name__ == '__main__':
    main()

一個爬取52破解的全部帖子地址的簡單爬蟲