
Python Crawler (6): Multithreading


import threading as td
import queue as qu
import re
import urllib.request as ur
import urllib.error as ue
import time

# Queue instance shared between the threads
urlqueue = qu.Queue()

# Masquerade as a browser
headers = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6788.400 QQBrowser/10.3.2767.400")

opener = ur.build_opener()
opener.addheaders = [headers]

# Install the opener globally so that ur.urlopen() sends these headers
ur.install_opener(opener)

listurl = []  # collected article links, one sub-list per listing page

# Fetch a URL through the given proxy server
def use_proxy(proxy_addr, url):
    try:
        proxy = ur.ProxyHandler({'http': proxy_addr})
        opener = ur.build_opener(proxy, ur.HTTPHandler)
        ur.install_opener(opener)
        data = ur.urlopen(url).read().decode('utf-8')
        return data
    except ue.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        time.sleep(10)
    except Exception as e:
        print("exception:" + str(e))
        time.sleep(1)
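A side note on the function above: install_opener() swaps the opener used by every subsequent ur.urlopen() call in the process, so each call to use_proxy() silently re-routes all later requests. If that global state is unwanted, the opener can be used directly. A minimal standalone sketch of the same proxy pattern (the proxy address and URL are placeholders, not live endpoints):

import urllib.request as ur

def fetch_via_proxy(proxy_addr, url):
    # Route HTTP traffic through proxy_addr without touching global state
    proxy = ur.ProxyHandler({"http": proxy_addr})
    opener = ur.build_opener(proxy, ur.HTTPHandler)
    return opener.open(url, timeout=10).read().decode("utf-8")

# Hypothetical usage:
# html = fetch_via_proxy("127.0.0.1:8080", "http://example.com/")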

# Thread 1: fetch the listing pages and turn the scraped links into real article URLs
class geturl(td.Thread):
    def __init__(self, key, pagestart, pageend, proxy, urlqueue):
        td.Thread.__init__(self)
        self.key = key
        self.pagestart = pagestart
        self.pageend = pageend
        self.proxy = proxy
        self.urlqueue = urlqueue

    def run(self):
        # URL-encode the keyword
        keycode = ur.quote(self.key)
        # URL-encode "&page"
        pagecode = ur.quote("&page")

        for page in range(self.pagestart, self.pageend + 1):
            url = "http://weixin.sogou.com/weixin?type=2&query=" + keycode + pagecode + str(page)
            # Crawl through the proxy server so the real IP does not get banned
            data1 = use_proxy(self.proxy, url)
            if data1 is None:
                # use_proxy() returns None on failure; skip this page
                continue
            print("data1:" + data1)
            # Regex for the article links on the listing page
            listurlpat = '<div class="txt-box">.*?(http://.*?)"'
            listurl.append(re.compile(listurlpat, re.S).findall(data1))

        # For debugging
        print("Fetched " + str(len(listurl)) + " pages")
        for i in range(0, len(listurl)):
            # Give thread 2 a head start so the work is spread out
            time.sleep(7)
            for j in range(0, len(listurl[i])):
                try:
                    url = listurl[i][j]
                    # Convert to the real URL: the scraped links contain an extra "amp;" that the real ones lack
                    url = url.replace("amp;", "")
                    print("Enqueued item i=" + str(i) + ", j=" + str(j))
                    self.urlqueue.put(url)
                    self.urlqueue.task_done()
                except ue.URLError as e:
                    if hasattr(e, "code"):
                        print(e.code)
                    if hasattr(e, "reason"):
                        print(e.reason)
                    time.sleep(10)
                except Exception as e:
                    print("exception:" + str(e))
                    time.sleep(1)
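One caveat in the loop above: task_done() is meant to be called by the consumer after it finishes processing an item obtained with get(), not by the producer right after put(); calling it on the producer side makes the queue's unfinished-task counter useless. A minimal sketch of the intended producer/consumer pattern (names and URLs are illustrative):

import queue
import threading

q = queue.Queue()

def producer():
    for n in range(5):
        q.put("http://example.com/page" + str(n))  # enqueue work

def consumer():
    while True:
        item = q.get()       # blocks until an item is available
        print("processing", item)
        q.task_done()        # mark this item as finished

threading.Thread(target=producer).start()
threading.Thread(target=consumer, daemon=True).start()
q.join()  # returns once every enqueued item has been marked done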

# Thread 2: runs in parallel with thread 1, pulls article URLs from the queue one at a time, and crawls and processes each article
class getcontent(td.Thread):
    def __init__(self, urlqueue, proxy):
        td.Thread.__init__(self)
        self.urlqueue = urlqueue
        self.proxy = proxy

    def run(self):
        html1 = '''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>WeChat articles</title>
</head>
<body>'''
        fh = open("G:/Pcode/2.html", "wb")
        fh.write(html1.encode("utf-8"))
        fh.close()
        fh = open("G:/Pcode/2.html", "ab")
        i = 1
        while True:
            try:
                url = self.urlqueue.get()
                data = use_proxy(self.proxy, url)
                if data is None:
                    # use_proxy() returns None on failure; skip this article
                    continue
                titlepat = "<title>(.*?)</title>"
                contentpat = 'id="js_content">(.*?)id="js_sg_bar"'
                title = re.compile(titlepat).findall(data)
                content = re.compile(contentpat, re.S).findall(data)
                thistitle = "Not captured this time"
                thiscontent = "Not captured this time"
                if title != []:
                    thistitle = title[0]
                if content != []:
                    thiscontent = content[0]
                dataall = "<p>Title: " + thistitle + "</p><p>Content: " + thiscontent + "</p><br/>"
                fh.write(dataall.encode("utf-8"))
                print("Processed article " + str(i))  # For debugging
                i += 1
            except ue.URLError as e:
                if hasattr(e, "code"):
                    print(e.code)
                if hasattr(e, "reason"):
                    print(e.reason)
                time.sleep(10)
            except Exception as e:
                print("exception:" + str(e))
                time.sleep(1)
        # Only reached if the loop above is ever broken out of
        fh.close()
        html2 = '''</body>
</html>
'''
        fh = open("G:/Pcode/2.html", "ab")
        fh.write(html2.encode("utf-8"))
        fh.close()
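The re.S flag on contentpat above matters: without it, "." does not match newlines, so an article body that spans several lines would never match. A quick illustration:

import re

html = '<div id="js_content">line one\nline two</div>'
pat = 'id="js_content">(.*?)</div>'

print(re.findall(pat, html))        # [] - "." stops at the newline
print(re.findall(pat, html, re.S))  # ['line one\nline two']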

# Thread 3: monitor the queue and report when the crawl is finished
class conrl(td.Thread):
    def __init__(self, urlqueue):
        td.Thread.__init__(self)
        self.urlqueue = urlqueue

    def run(self):
        while True:
            print("Program running")
            time.sleep(60)
            if self.urlqueue.empty():
                print("Program finished!")
                # Note: exit() raises SystemExit, which in a non-main thread
                # terminates only this thread, not the whole process
                exit()
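Polling urlqueue.empty() once a minute works, but it is racy: the queue can be momentarily empty while thread 1 is still producing, and exit() in a worker thread does not stop the other threads anyway. If task_done() is moved to the consumer side as sketched earlier, a join()-based shutdown is cleaner. A hedged sketch (assumes the producer thread is t1 and the consumer runs as a daemon):

def wait_until_done(producer_thread, q):
    producer_thread.join()  # every URL has been enqueued
    q.join()                # every enqueued URL has been marked task_done()
    print("Crawl finished")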


key = "人工智能";
proxy = "60.191.201.38:45461";
proxy2 = "";

pagestart = 1#起始頁
pageend = 2#抓取到哪頁

#創建線程1對象,隨後啟動線程1
t1=geturl(key,pagestart,pageend,proxy,urlqueue);
t1.start();

#創建線程2對象,隨後啟動線程2
t2=getcontent(urlqueue,proxy);
t2.start();

#創建線程3對象,隨後啟動線程3
t3=conrl(urlqueue);
t3.start();
