Web Crawler -- Crawling a Website's Links
阿新 • Published: 2019-02-18
The purpose of this crawler is to collect the links contained in a website and save them to a file. It is based on the code in Chapter 1 of *Web Scraping with Python*; the original code was written for Python 2, and here it is rewritten for Python 3 with a few appropriate modifications: better error tolerance, plus support for a proxy, rate limiting, and a crawl-depth setting.
from urllib.request import Request, build_opener, ProxyHandler
from urllib.error import URLError, HTTPError
from urllib.parse import urlparse, urljoin
import re
from datetime import datetime
import time


# First argument: the URL to download; second: an optional proxy ("host:port");
# third: the number of retries on 5xx server errors.
def download(url, proxy=None, num_retries=2):
    print('Downloading', url)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'}
    request = Request(url, headers=headers)
    opener = build_opener()
    if proxy:
        opener.add_handler(ProxyHandler({urlparse(url).scheme: proxy}))
    try:
        html = opener.open(request).read().decode()
    except (URLError, HTTPError, UnicodeDecodeError, UnicodeEncodeError) as e:
        print('Download error:', getattr(e, 'reason', e))
        html = None
        if num_retries > 0:
            # Retry only on 5xx server errors, keeping the same proxy.
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, proxy, num_retries - 1)
    return html


# Extract all href values from <a> tags in the page.
def get_link(html):
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    try:
        return webpage_regex.findall(html)
    except TypeError:
        # download() returned None, so there is nothing to parse.
        return []


# Rate limiting: wait between consecutive requests to the same domain.
class Throttle:
    def __init__(self, delay):
        self.delay = delay
        self.domains = {}

    def wait(self, url):
        domain = urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).total_seconds()
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()


# Crawl links and store them in link_save.
# First argument: the seed URL; second: a regex that followed links must match;
# third: an optional proxy passed through to download(); fourth: the crawl depth.
def link_crawler(seed_url, link_regex, proxy=None, max_depth=2):
    crawl_queue = [seed_url]
    link_save = {seed_url: 0}
    throttle = Throttle(2)
    while crawl_queue:
        url = crawl_queue.pop()
        depth = link_save[url]
        throttle.wait(url)
        html = download(url, proxy)
        if depth <= max_depth:
            for link in get_link(html):
                # For relative links, uncomment this line to join them into absolute URLs.
                # link = urljoin(seed_url, link)
                if re.match(link_regex, link) and link not in link_save:
                    crawl_queue.append(link)
                    link_save[link] = depth + 1
    return link_save


if __name__ == "__main__":
    result = link_crawler("http://www.xxxx.com/", "http://.*?")
    with open('xxxx_com.txt', 'w') as f:
        for link in result:
            f.write(f'{link}\n')
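
To illustrate the proxy, link-filter and depth options together, here is a minimal usage sketch; the proxy address "127.0.0.1:8080" and the example.com seed URL are placeholders I made up for illustration, not values from the original post:

# Hypothetical usage: crawl example.com through a local HTTP proxy,
# following only links on the same domain, at most two levels deep.
# "127.0.0.1:8080" and "http://example.com/" are placeholder values.
result = link_crawler("http://example.com/",
                      r"http://example\.com/.*",
                      proxy="127.0.0.1:8080",
                      max_depth=2)
for link, depth in result.items():
    print(depth, link)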