Advanced Web Crawling (1)
阿新 • Published: 2018-12-20
The full script below implements a general-purpose crawler: it honours robots.txt, throttles requests per domain, retries failed downloads, limits crawl depth, and deduplicates stored pages by MD5 digest against a MongoDB cache.

import random
import hashlib                                         # message digest (MD5)
import queue                                           # FIFO queue -> BFS crawl order
import re                                              # regular expressions
import time
from datetime import datetime
from threading import Thread
from urllib import robotparser                         # parse the site's robots.txt
from urllib.parse import urlparse, urljoin, urldefrag  # URL parsing helpers

import requests
from fake_useragent import UserAgent
from retrying import retry                             # decorator: retry failed downloads

import mongo_cache                                     # local module (see the sketch after the script)

MAX_DEP = 2  # maximum crawl depth


def get_robots(url):
    """Parse the site's robots.txt file."""
    rp = robotparser.RobotFileParser()
    rp.set_url(urljoin(url, 'robots.txt'))
    rp.read()
    return rp


def save_url(html_content, url_str):
    """Save the downloaded content to disk."""
    # file_path = "./download/" + hashlib.md5(html_content).hexdigest() + ".html"
    file_path = r"D:\crawler\download\crawler-" + gen_html_name(url_str)
    with open(file_path, 'wb') as f:
        f.write(html_content)


def gen_html_name(url_str):
    """Derive a file name from the last segment of the URL path."""
    path = urlparse(url_str).path
    return path.split('/')[-1]  # take the last segment


def extractor_url_lists(html_content):
    """Extract all outgoing links from a page."""
    url_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return url_regex.findall(html_content)


class CrawlerCommon(Thread):
    """
    A general-purpose crawler covering the basic crawling features
    plus a few counter-anti-crawler techniques.
    """

    def __init__(self, init_url):
        super(CrawlerCommon, self).__init__()
        __ua = UserAgent()                          # random User-Agent generator
        self.seed_url = init_url                    # seed URL the crawl starts from
        self.crawler_queue = queue.Queue()          # a FIFO queue gives BFS; a LIFO queue would give DFS
        self.crawler_queue.put(init_url)            # enqueue the seed URL
        self.visited = {init_url: 0}                # the seed starts at depth 0
        self.rp = get_robots(init_url)              # robots.txt parser
        self.headers = {'User-Agent': __ua.random}  # pick one random user-agent
        self.link_regex = '(index|view)'            # link filter (defined but unused; run() filters on '/mongodb')
        self.throttle = Throttle(5.0)               # rate limiter: 5 seconds per domain
        self.mcache = mongo_cache.MongoCache()      # MongoDB download cache
        self.time_sleep = 5

    @retry(stop_max_attempt_number=3)  # re-run on any exception, up to 3 attempts
    def retry_download(self, url_str, data, method, proxies):
        """Download with retries (via the retrying decorator)."""
        if method == 'POST':
            result = requests.post(url_str, data=data, headers=self.headers, proxies=proxies)
        else:
            result = requests.get(url_str, headers=self.headers, timeout=3, proxies=proxies)
        assert result.status_code == 200  # a non-200 status fails the assert and triggers a retry
        return result.content

    def download(self, url_str, data=None, method='GET', proxies=None):
        """The actual download entry point."""
        print("download url is :::::", url_str)
        try:
            result = self.retry_download(url_str, data, method, proxies)
        except Exception as e:  # Python 3 syntax: "except ... as e"
            print(e)            # e.message no longer exists in Python 3
            result = None
        return result

    def normalize(self, url_str):
        """Strip the fragment and resolve the link against the seed URL."""
        real_url, _ = urldefrag(url_str)
        return urljoin(self.seed_url, real_url)

    def save_result(self, html_content, url_str):
        """Store a result in the database, first checking whether identical content is already there."""
        if url_str not in self.mcache:
            self.mcache[url_str] = html_content
        else:
            data_from_mongo = self.mcache[url_str]
            mongo_md5_str = hashlib.md5(data_from_mongo).hexdigest()
            # use a fresh digest for the new content; reusing the same md5
            # object would hash the concatenation and never match
            download_md5_str = hashlib.md5(html_content).hexdigest()
            if download_md5_str != mongo_md5_str:
                self.mcache[url_str] = html_content

    def run(self):
        """Main crawl loop."""
        while not self.crawler_queue.empty():
            url_str = self.crawler_queue.get()
            # honour the robots.txt rules
            if self.rp.can_fetch(self.headers['User-Agent'], url_str):
                self.throttle.wait_url(url_str)
                # random_oper = random.randint(0, 1)
                # if random_oper == 1:
                #     time.sleep(self.time_sleep + random.random() * random.randint(1, 5))
                # else:
                #     time.sleep(self.time_sleep - random.random())
                depth = self.visited[url_str]
                if depth < MAX_DEP:
                    # download the page
                    html_content = self.download(url_str)
                    if html_content is not None:
                        # store it in MongoDB and on disk
                        self.save_result(html_content, url_str)
                        save_url(html_content, url_str)
                        # extract every link on the page
                        url_list = extractor_url_lists(html_content.decode('utf-8'))
                        # keep only the links we want to crawl
                        filter_urls = [link for link in url_list if re.search('/(mongodb)', link)]
                        for url in filter_urls:
                            # resolve the link to an absolute URL
                            real_url = self.normalize(url)
                            # skip links we have already visited
                            if real_url not in self.visited:
                                self.visited[real_url] = depth + 1
                                self.crawler_queue.put(real_url)
            else:
                print("robots.txt forbids downloading:", url_str)


class Throttle(object):
    """Download rate limiter."""

    def __init__(self, delay):
        self.domains = {}
        self.delay = delay

    def wait_url(self, url_str):
        domain_url = urlparse(url_str).netloc         # the domain (netloc) of the URL
        last_accessed = self.domains.get(domain_url)  # when this domain was last downloaded
        if self.delay > 0 and last_accessed is not None:
            # delay minus the time elapsed since the last download from this domain
            sleep_interval = self.delay - (datetime.now() - last_accessed).total_seconds()
            # sleep off whatever is left of the delay
            if sleep_interval > 0:
                time.sleep(sleep_interval)
        self.domains[domain_url] = datetime.now()     # record the current time under the domain key


if __name__ == '__main__':
    crawler = CrawlerCommon('http://www.runoob.com/mongodb/mongodb-tutorial.html')
    crawler.run()
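The retry behaviour in retry_download comes from the retrying package: any exception raised inside the decorated function, including the failed status-code assert, makes the decorator call it again. A minimal standalone sketch of that mechanism follows; the attempt count, wait time, and placeholder URL are illustrative choices, not values from the original post:

import requests
from retrying import retry

# Retry up to 3 times, waiting 2 seconds between attempts; any exception
# (connection error, timeout, failed assert) triggers the next attempt.
@retry(stop_max_attempt_number=3, wait_fixed=2000)
def fetch(url):
    response = requests.get(url, timeout=3)
    assert response.status_code == 200  # a non-200 response raises AssertionError
    return response.content

if __name__ == '__main__':
    print(len(fetch('http://example.com')))  # placeholder URL

If every attempt fails, the final exception propagates out of the decorated function, which is what the except Exception in download() catches.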
Two details worth noting. CrawlerCommon subclasses Thread, but the main block calls crawler.run() directly, which executes the crawl synchronously in the calling thread; calling crawler.start() instead would run the same loop on a separate worker thread. The script also imports a local mongo_cache module that the post does not show; judging from the usage above, it only needs to behave like a dict keyed by URL.
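Since mcache is only used for membership tests plus item get and set, a minimal mongo_cache.MongoCache could be a dict-style wrapper around a pymongo collection. The sketch below is a guess at such an implementation; the database name, collection name, and record layout are assumptions, not the author's code:

from datetime import datetime

from pymongo import MongoClient


class MongoCache(object):
    """Dict-like cache of downloaded pages, keyed by URL."""

    def __init__(self, client=None):
        self.client = client or MongoClient('localhost', 27017)
        self.collection = self.client.cache.webpage  # assumed db/collection names

    def __contains__(self, url):
        # supports "url in cache"
        return self.collection.find_one({'_id': url}) is not None

    def __getitem__(self, url):
        # supports "cache[url]"
        record = self.collection.find_one({'_id': url})
        if record is None:
            raise KeyError(url + ' does not exist')
        return record['content']  # raw page bytes

    def __setitem__(self, url, content):
        # supports "cache[url] = content"; upsert keeps one record per URL
        record = {'content': content, 'timestamp': datetime.utcnow()}
        self.collection.update_one({'_id': url}, {'$set': record}, upsert=True)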