1. 程式人生 > >ip代理池-基於mongodb數據庫

ip代理池-基於mongodb數據庫

url upd tostring mls from path ida request protocol

代碼使用 Python 2.7:抓取西刺(xici)免費代理,逐一檢測可用性後存入 MongoDB 數據庫,不可用的代理會從庫中刪除,為以後的爬蟲做準備。下面直接上代碼。

 1 #-*-encoding=utf-8-*-
 2 
 3 import requests
 4 from lxml import etree
 5 import time
 6 import pymongo
 7 from multiprocessing import Pool
 8 
 9 
class Getproxy(object):
    """Scrape free HTTP proxies from xicidaili.com and keep them in MongoDB.

    Scraped proxies are upserted into the ``xiciipinfo`` collection of the
    ``xici`` database on a local MongoDB server; proxies that fail a
    connectivity check are deleted from that collection again.
    """

    def __init__(self):
        """Set up the request headers, the listing URL and the MongoDB handles."""
        # A browser-like User-Agent is sent with every listing request.
        self.headers = {
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/59.0.3071.115 Safari/537.36'
            ),
        }
        self.url = 'http://www.xicidaili.com/wt/'
        self.client = pymongo.MongoClient('localhost', 27017)
        self.xici = self.client['xici']
        self.xiciipinfo = self.xici['xiciipinfo']

    def getip(self, num):
        """Scrape listing page ``num`` and upsert every proxy found on it.

        Args:
            num: Page number appended to the base listing URL.
        """
        url = self.url + str(num)
        wb_data = requests.get(url, headers=self.headers)
        html = etree.HTML(wb_data.text)
        ips = html.xpath('//tr[@class="odd"]/td[2]/text()')
        ports = html.xpath('//tr[@class="odd"]/td[3]/text()')
        protocols = html.xpath('//tr[@class="odd"]/td[6]/text()')
        areas = html.xpath('//tr[@class="odd"]/td[4]/a/text()')
        # NOTE(review): the four columns are extracted independently and then
        # zipped, so a row lacking the area link would shift later areas up
        # and drop trailing rows -- confirm against the live page markup.
        for ip, port, protocol, area in zip(ips, ports, protocols, areas):
            data = {
                'ip': ip,
                'port': port,
                'protocol': protocol,
                'area': area,
            }
            print(data)
            # Upsert keyed on the IP so re-scraping refreshes an existing
            # record instead of inserting a duplicate.
            self.xiciipinfo.update_one({'ip': ip}, {'$set': data}, upsert=True)

    def count(self, num):
        """Scrape listing pages ``1`` through ``num - 1``.

        Args:
            num: Exclusive upper bound of the page range; ``count(2)``
                scrapes page 1 only.
        """
        for page in range(1, num):
            self.getip(page)
            # Pause between page requests.
            time.sleep(2)

    def dbclose(self):
        """Close the MongoDB client connection."""
        self.client.close()

    def getiplist(self):
        """Return every stored proxy as a ``requests``-style proxies dict.

        Returns:
            A list of ``{"http": "http://<ip>:<port>"}`` dicts, one per
            document in the collection.
        """
        return [
            {'http': 'http://{}:{}'.format(record['ip'], record['port'])}
            for record in self.xiciipinfo.find()
        ]

    def iptest(self, proxy):
        """Check one proxy and delete it from the database if it fails.

        Args:
            proxy: A proxies dict as produced by :meth:`getiplist`.
        """
        # Drop the leading "http://" (7 characters), then the ":port" suffix.
        ip = proxy['http'][7:].split(':')[0]
        try:
            requests.get('http://wenshu.court.gov.cn/', proxies=proxy, timeout=6)
        except requests.RequestException:
            # Only request failures count as a dead proxy; KeyboardInterrupt
            # and programming errors must not delete database records.
            print('field...............>>>>>>>>>>>>>>>>>>>>>>>>')
            self.xiciipinfo.delete_many({'ip': ip})
            print('remove it now.....{}'.format(ip))
        else:
            print('<<<<<<<<<<<<<<<<<.............success')
            print(proxy)


if __name__ == '__main__':
    proxy = Getproxy()
    try:
        proxy.count(2)
        # Explicit loop: map() is lazy on Python 3 and would test nothing.
        for candidate in proxy.getiplist():
            proxy.iptest(candidate)
    finally:
        # Release the MongoDB connection even if scraping or testing fails.
        proxy.dbclose()


ip代理池-基於mongodb數據庫