簡單Python3爬蟲程式(1)簡單架構:佇列、集合、正則
阿新 • • 發佈:2019-02-05
"""Minimal breadth-first web crawler built from a queue, a set and a regex.

Starting from one URL, each HTML page is downloaded, every absolute
http(s) link found in a double-quoted ``href`` attribute is extracted, and
links that have not been seen before are appended to the work queue.
"""
import http.client
import re
import urllib
import urllib.request
from collections import deque

# Captures the value of every double-quoted href attribute. Compiled once at
# import time instead of once per downloaded page.
LINK_RE = re.compile(r'href="(.+?)"')

START_URL = 'http://www.baidu.com'

# Errors that mean "this one page could not be fetched or decoded":
# OSError covers URLError/HTTPError/socket timeouts, ValueError covers
# malformed URLs and UnicodeDecodeError, HTTPException covers broken replies.
FETCH_ERRORS = (OSError, ValueError, http.client.HTTPException)


def extract_links(html):
    """Return the absolute http(s) links in *html*, in document order.

    Only href values that start with ``http://`` or ``https://`` are kept;
    relative paths, protocol-relative URLs and ``javascript:`` links are
    dropped because ``urlopen`` cannot fetch them as-is.
    """
    return [link for link in LINK_RE.findall(html)
            if link.startswith(('http://', 'https://'))]


def fetch_html(url, timeout=10):
    """Download *url* and return its body decoded as UTF-8 text.

    Returns None when the response does not declare an HTML content type
    (including when the Content-Type header is missing altogether).

    Raises:
        OSError, ValueError, http.client.HTTPException: if the request
            fails or the body is not valid UTF-8.
    """
    # The context manager closes the connection even if read/decode fails.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        content_type = response.getheader('Content-Type') or ''
        if 'html' not in content_type:
            return None
        return response.read().decode('utf-8')


def crawl(start_url, max_pages=None, fetch=fetch_html):
    """Crawl breadth-first from *start_url* and return the number of URLs tried.

    Args:
        start_url: URL the crawl starts from.
        max_pages: stop after this many URLs have been dequeued; None
            (the default) keeps going until the queue is empty.
        fetch: callable taking a URL and returning the page text, or None
            for non-HTML content. It may raise any of FETCH_ERRORS, in
            which case the URL is reported and skipped.
    """
    queue = deque([start_url])
    # URLs are recorded when they are enqueued, not when they are dequeued,
    # so the same link can never sit in the queue (or be fetched) twice.
    seen = {start_url}
    cnt = 0
    while queue and (max_pages is None or cnt < max_pages):
        url = queue.popleft()  # take the URL at the head of the queue
        print('已經抓取: ', cnt, '個連結', ' 正在抓取 : ' + url)
        cnt += 1
        try:
            data = fetch(url)
        except FETCH_ERRORS as err:
            # One bad page must not abort the whole crawl.
            print('無法抓取 ' + url + ' : ' + str(err))
            continue
        if data is None:
            # Not HTML: nothing to parse, move on to the next URL.
            continue
        for link in extract_links(data):
            if link not in seen:
                seen.add(link)
                queue.append(link)
                print('把 ' + link + '加入佇列')
    return cnt


if __name__ == '__main__':
    crawl(START_URL)