A Python crawler that collects every URL on a site: a tidy little demo (with strong filtering that, among other things, keeps it from following links to external sites)
阿新 • Published: 2018-12-12
The URL crawler I finished last time could not meet the requirements, so here is a more capable version. If you need it you can run it directly; just adjust some of the regular expressions and parameters for the site you want to crawl (the site-specific values are summarised in a small sketch after the listing). A heads-up before you run it: be patient, because crawling a few thousand URLs takes a while, so have your snacks, cola and a movie ready.
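The title promises that the crawler will not wander off to external sites. As a warm-up, here is that idea in its most basic form, a minimal sketch using urllib.parse; the helper name same_site is mine and is not part of the script, which instead does the equivalent with a hard-coded regex whitelist in url_filtrate plus a page-title check in Spider.crawler.

from urllib.parse import urljoin, urlparse

def same_site(page_url, href):
    """Return the absolute form of href if it stays on page_url's host, otherwise None."""
    absolute = urljoin(page_url, href)                       # resolve relative links such as /ncl/cn/...
    if urlparse(absolute).netloc == urlparse(page_url).netloc:
        return absolute                                      # same host, keep it
    return None                                              # different host, i.e. an external link, drop it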
Without further ado, here is the full code. It is commented and easy to follow.
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 29 16:09:28 2018
@author: ESionJL資料貓
Known issues:
1. If the current url yields an empty pagelinks list, it should be removed from the visited list.
2. In spiderpage(), a page that comes back unreadable raises an error; it should be skipped and its url removed.
3. Some pages return an empty title.
4. Some sites cannot be loaded.
5. Expired sites, junk sites.
"""
import re
import requests
from bs4 import BeautifulSoup
from urllib import request
from urllib import error


# Check whether the start page is reachable
def url_get(num_retries=5):
    # url = input("Enter the start url to crawl: ")
    url = "http://www.newchinalife.com/ncl/cn/new/index/index.shtml"
    # url = "http://www.newchinalife.comindex.html/"
    try:
        # Use a User-Agent header to mimic a browser; other header fields can be added as well
        kv = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko'}
        requests.get(url, headers=kv)
        return url
    except (error.URLError, error.HTTPError, requests.exceptions.RequestException) as e:
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return url_get(num_retries - 1)    # retry on server-side errors
        print("url cannot be reached")


# Extract every link found on the given page
def spiderpage(url):
    try:
        kv = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER'}
        r = requests.get(url, headers=kv)
        r.encoding = r.apparent_encoding
        pagetext = r.text
        # The regex grabs whatever sits between <a href=" and " (single quotes work too),
        # i.e. every link url on the current page; returns a list
        pagelinks = re.findall(r'(?<=<a href=\").*?(?=\")|(?<=href=\').*?(?=\')', pagetext)
        # print(pagelinks)
        return pagelinks
    except Exception:
        # If the request or decoding fails, fall back to the start page
        pagelinks = ['http://www.newchinalife.com/ncl/cn/new/index/index.shtml']
        print("This site is being difficult")
        return pagelinks


# Fetch a page title; used to detect external or unusable links
def getTitle(url):
    # Checking the title verifies that the link still belongs to this site, which prevents
    # crawling off-site in an endless loop; links that leave the site are ignored
    headers = {'Accept': '*/*',
               'Accept-Language': 'en-US,en;q=0.8',
               'Cache-Control': 'max-age=0',
               'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
               'Connection': 'keep-alive',
               'Referer': 'http://www.baidu.com/'
               }
    print(url)
    req = request.Request(url, headers=headers)
    try:
        response = request.urlopen(req)
        html = response.read().decode('utf-8')
        soup = BeautifulSoup(html, "html.parser")
        if soup.body is not None:
            title = soup.head.title.string
            print(title)
            if title is not None:
                return title
            else:
                return "this page has no usable title"
        else:
            return "page could not be loaded"
    # except (error.URLError, error.HTTPError, UnicodeDecodeError):
    except Exception:
        print("this page has no usable title")
        return "page could not be loaded"


# Regex-based link filter
def url_filtrate(pagelinks):
    same_target_url = []
    for murl in pagelinks:
        murl = re.sub(r'\s+', '', murl)
        # Drop obviously unusable links (javascript:, pdf, css, login pages, mailto-style addresses, ...);
        # skipping with continue avoids the pitfalls of list.remove() while iterating
        if (re.findall("^java", murl) or re.findall("^jse", murl) or re.findall("^ALL", murl)
                or re.findall("pdf$", murl) or re.findall("^login", murl)
                or re.findall("css$", murl) or re.findall("@", murl)):
            continue
        elif re.findall("^http", murl) and re.findall("newchinalife", murl):
            # Absolute link that stays on this site
            same_target_url.append(str(murl))
        elif re.findall("gsp$", murl) or re.findall("shtml$", murl) or re.findall("[0-9]+$", murl):
            # Relative link ending in .gsp, .shtml or a number: prefix the site root
            same_target_url.append("https://www.newchinalife.com" + str(murl))
        elif re.findall("^/", murl):
            # Any other root-relative link: prefix the site root as well
            same_target_url.append("https://www.newchinalife.com" + str(murl))
        else:
            pass
    # Remove duplicate urls
    unrepect_url = []
    for l in same_target_url:
        if l not in unrepect_url:
            unrepect_url.append(l)
    print(unrepect_url)
    return unrepect_url


class linkQuence:
    def __init__(self):
        # urls already visited
        self.visited = []
        # urls not yet visited
        self.unvisited = []

    # Return the visited-url queue
    def getvisitedurl(self):
        return self.visited

    # Return the unvisited-url queue
    def getunvisitedurl(self):
        return self.unvisited

    # Add a url to the visited queue
    def addvisitedurl(self, url):
        return self.visited.append(url)

    # Remove a url from the visited queue
    def removevisitedurl(self, url):
        return self.visited.remove(url)

    # Pop one url from the unvisited queue
    def unvisitedurldequence(self):
        try:
            return self.unvisited.pop()
        except IndexError:
            return None

    # Add a url to the unvisited queue (skip empties and anything already seen)
    def addunvisitedurl(self, url):
        if url != "" and url not in self.visited and url not in self.unvisited:
            return self.unvisited.insert(0, url)

    # Number of visited urls
    def getvisitedurlcount(self):
        return len(self.visited)

    # Number of unvisited urls
    def getunvisitedurlcount(self):
        return len(self.unvisited)

    # Is the unvisited queue empty?
    def unvisitedurlsempty(self):
        return len(self.unvisited) == 0


class Spider:
    def __init__(self, url):
        self.linkQuence = linkQuence()          # the url queue used by this class
        self.linkQuence.addunvisitedurl(url)    # seed it with the start url, i.e. the crawl entry point

    # The actual crawl loop
    def crawler(self, urlcount):
        # A site can have a huge number of sub-pages, so urlcount caps how many are crawled (handy for testing)
        x = 1
        while self.linkQuence.unvisited and x <= urlcount:
            # If the site is small, the loop condition can simply be "unvisited queue not empty":
            # while not self.linkQuence.unvisitedurlsempty():
            if x > 1:
                print(f"Crawling url number {x-1}")
            visitedurl = self.linkQuence.unvisitedurldequence()    # pop one url from the unvisited queue
            if visitedurl is None or visitedurl == '':
                continue
            title = getTitle(visitedurl)
            if re.findall("新華保險", title):                       # the title must contain the site name; otherwise the page is off-site and skipped
                initial_links = spiderpage(visitedurl)              # collect every link on that page
                right_links = url_filtrate(initial_links)           # keep only the acceptable ones
                if not right_links:
                    pass
                else:
                    self.linkQuence.addvisitedurl(visitedurl)       # move the url to the visited queue
                    for link in right_links:                        # push the filtered links onto the unvisited queue
                        self.linkQuence.addunvisitedurl(link)
                    x += 1
            else:
                pass
        print("Crawling finished")
        return self.linkQuence.visited


# Write the results to a file
def writetofile(urllist):
    # Write each url and keep count
    x = 1
    # Furls.txt stores the collected links
    with open('Furls.txt', 'a', encoding='utf8') as file:
        for url in urllist:
            file.write(f'{url}\n')
            x += 1
    print(f'Writing finished, {x-1} sub-links written in total')


# Entry point
if __name__ == '__main__':
    url = url_get()
    spider = Spider(url)
    # Pass in the maximum number of sub-links to crawl
    urllist = spider.crawler(5000)
    writetofile(urllist)
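As mentioned at the top, retargeting the script to another site comes down to editing its site-specific literals: the start url in url_get, the "newchinalife" whitelist regex and the "https://www.newchinalife.com" prefix in url_filtrate, and the "新華保險" title check in Spider.crawler. Purely as an illustration, here is what those values could look like gathered in one place for a hypothetical example.com site; the SITE_CONFIG name and every value in it are placeholders, not something the script defines.

# Hypothetical configuration for a different target site; example.com is a placeholder.
SITE_CONFIG = {
    "start_url": "https://www.example.com/index.html",   # would replace the url literal in url_get()
    "domain_keyword": "example",                          # would replace "newchinalife" in url_filtrate()
    "base_url": "https://www.example.com",                # prefix added to relative links in url_filtrate()
    "title_keyword": "Example Site",                      # would replace "新華保險" in Spider.crawler()
}

In the script as written these values are hard-coded, so pointing it at a new site is a search-and-replace over exactly those spots.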
I still hope you will work through this yourselves; just copy-pasting teaches you nothing. The overall framework is not mine, but the actual worker functions I wrote myself, bit by bit. I ran into plenty of problems along the way and solved them all, and learned a lot in the process.