
Scraping Hangzhou internet job listings from Lagou with scrapy, an IP proxy pool, and a multithreaded crawler

#encoding=utf8
import urllib2
from bs4 import BeautifulSoup
import socket
import urllib
import requests
import random
from LagouProject.dbhelper import TestDBHelper
import threading
import time
import re
from lxml import etree
from scrapy.conf import settings
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

# Set the request header used for all HTTP requests
User_Agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.113 Safari/537.36'
header = {}
header['User-Agent'] = User_Agent

'''
Fetch all proxy IP addresses listed on the Xici (xicidaili.com) front page
'''
def getProxyIp():
    proxy = []
    for i in range(1, 2):
        try:
            url = 'http://www.xicidaili.com/nn/' + str(i)
            req = urllib2.Request(url, headers=header)
            res = urllib2.urlopen(req).read()
            soup = BeautifulSoup(res, 'html.parser', from_encoding='utf8')
            ips = soup.findAll('tr')
            for x in range(1, len(ips)):
                ip = ips[x]
                tds = ip.findAll("td")
                ip_temp = tds[1].contents[0] + "\t" + tds[2].contents[0]
                proxy.append(ip_temp)
        except:
            continue
    return proxy

'''
Check whether each harvested proxy IP address is actually usable
'''

def validateIp(proxy):
    url = "http://ip.chinaz.com/getip.aspx"
    available_ip = []
    socket.setdefaulttimeout(3)
    for i in range(0, len(proxy)):
        try:
            ip = proxy[i].strip().split("\t")
            proxy_host = "http://" + ip[0] + ":" + ip[1]
            proxy_temp = {"http": proxy_host}
            res = urllib.urlopen(url, proxies=proxy_temp).read()
            available_ip.append(proxy_host)
            #print proxy[i]
        except Exception, e:
            continue
    return available_ip

def spider(validateProxy, max_threads=2):
    dbhelper = TestDBHelper()
    # read the position names and URLs from the database via TestDBHelper.testSelect()
    results = dbhelper.testSelect()
    rLock = threading.RLock()  # re-entrant lock guarding the shared lists
    s = requests.session()
    # URLs left to crawl
    url_queue = []
    # position names, kept in step with url_queue
    name_list = []
    # URLs (and their names) that failed and will be retried later
    fail_url = []
    fail_name = []
    for row in results:
        name = row[1]
        url = row[2]
        name_list.append(name)
        url_queue.append(url)

    # worker: pick a proxy, then keep pulling URLs off the queue until it is empty
    def process_queue():
        # pick a random proxy from the validated list
        IP = random.choice(validateProxy)
        while True:
            try:
                rLock.acquire()  # acquire the lock
                url = url_queue.pop()
                name = name_list.pop()
                rLock.release()  # release the lock
                #sleep_time = (random.choice(num_list)%3)*10  # random sleep interval
                time.sleep(5)
                print 'sub thread start!the thread name is:%s\r' % threading.currentThread().getName()
            except:
                # url_queue is empty: release the lock and stop this worker
                rLock.release()
                break
            try:
                # route the request through the chosen proxy
                proxies = {
                    'http': IP,
                }
                print IP
                cookie = settings['COOKIE']  # send the request with the login cookie
                #print cookie
                # convert the cookie dict into a CookieJar
                cookies = requests.utils.cookiejar_from_dict(cookie, cookiejar=None, overwrite=True)
                s.cookies = cookies
                # .content is the raw response body; Lagou pages are utf-8 encoded
                html = s.get(url, headers=header, timeout=10, proxies=proxies).content
                #print html
                page = etree.HTML(html.decode('utf-8'))
                #print page
                elements = page.xpath("//div[@class='position-content-l']/dd//span")
                #print len(elements)

                # the fields of one database record are collected in this list
                content_list = []
                for element in elements:
                    content = element.text
                    #print content
                    if content:
                        content = content.replace('/', '')
                        # ranges such as '10k-20k' or '3-5年' are reduced to their average
                        if 'k' in content or '年' in content or 'K' in content:
                            nums = re.findall(r'\d+', content)
                            value = [float(i) for i in nums]
                            content = sum(value) / len(value)
                    else:
                        content = 'null'
                    content_list.append(content)

                content_list.append(name)
                #print content_list
                # insert the record into the database
                dbhelper.testInsert(content_list)

            except Exception, e:
                print '---------------------------------------exception'
                print url
                print e
                IP = random.choice(validateProxy)  # the request failed or timed out, so pick a new proxy
                rLock.acquire()  # acquire the lock
                fail_name.append(name)
                fail_url.append(url)  # keep the failed URL so it can be retried later
                rLock.release()  # release the lock
                continue

    # thread management
    threads = []
    while threads or url_queue:
        # iterate over a copy so finished threads can be removed safely
        for thread in threads[:]:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and url_queue:
            time.sleep(5)
            # there is room to start another worker thread
            thread = threading.Thread(target=process_queue)
            # set daemon so the main thread can exit when it receives ctrl-c
            thread.setDaemon(True)
            print '---------------------------------------------------------------------------------thread ' + thread.name
            thread.start()
            threads.append(thread)
        time.sleep(1)

    if fail_url:
        # retry the URLs that failed by handing them to failspider
        failspider(fail_url, fail_name, validateProxy, max_threads=1)



def failspider(url_list, name_list, proxy, max_threads=2):
    print url_list
    print name_list
    dbhelper = TestDBHelper()
    s = requests.session()
    rLock = threading.RLock()  # re-entrant lock guarding the shared lists
    # URLs (and their names) that failed again
    fail_url = []
    fail_name = []

    # worker: pick a proxy, then keep pulling URLs off the list until it is empty
    def process_queue():
        # pick a random proxy from the list
        IP = random.choice(proxy)
        while True:
            try:
                rLock.acquire()  # acquire the lock
                url = url_list.pop()
                name = name_list.pop()
                rLock.release()  # release the lock
                #sleep_time = (random.choice(num_list)%3)*10
                time.sleep(5)
                print 'sub thread start!the thread name is:%s\r' % threading.currentThread().getName()
            except:
                # url_list is empty: release the lock and stop this worker
                rLock.release()
                break
            try:
                # route the request through the chosen proxy
                proxies = {
                    'http': IP,
                }
                print IP
                cookie = settings['COOKIE']  # send the request with the login cookie
                #print cookie
                # convert the cookie dict into a CookieJar
                cookies = requests.utils.cookiejar_from_dict(cookie, cookiejar=None, overwrite=True)
                s.cookies = cookies
                # .content is the raw response body; Lagou pages are utf-8 encoded
                html = s.get(url, headers=header, timeout=10, proxies=proxies).content
                #print html
                page = etree.HTML(html.decode('utf-8'))
                #print page
                elements = page.xpath("//div[@class='position-content-l']/dd//span")
                # the fields of one database record are collected in this list
                content_list = []
                for element in elements:
                    content = element.text
                    if content:
                        content = content.replace('/', '')
                        # ranges such as '10k-20k' or '3-5年' are reduced to their average
                        if 'k' in content or '年' in content or 'K' in content:
                            nums = re.findall(r'\d+', content)
                            value = [float(i) for i in nums]
                            content = sum(value) / len(value)
                    else:
                        content = 'null'
                    content_list.append(content)

                content_list.append(name)
                #print content_list
                # insert the record into the database
                dbhelper.testInsert(content_list)

            except Exception, e:
                print '---------------------------------------exception'
                print e
                IP = random.choice(proxy)  # the request failed or timed out, so pick a new proxy
                rLock.acquire()  # acquire the lock
                fail_name.append(name)
                fail_url.append(url)  # keep the failed URL so it can be retried recursively
                rLock.release()  # release the lock
                continue

    # thread management
    threads = []
    while threads or url_list:
        # iterate over a copy so finished threads can be removed safely
        for thread in threads[:]:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and url_list:
            time.sleep(5)
            # there is room to start another worker thread
            thread = threading.Thread(target=process_queue)
            # set daemon so the main thread can exit when it receives ctrl-c
            thread.setDaemon(True)
            print '---------------------------------------------------------------------------------thread ' + thread.name
            thread.start()
            threads.append(thread)
        time.sleep(1)

    if fail_url:
        # URLs that failed again are retried by calling failspider recursively
        failspider(fail_url, fail_name, proxy, max_threads=1)




if __name__ == '__main__':
    # proxy = getProxyIp()
    # validateProxy = validateIp(proxy)
    # print validateProxy
    validateProxy=[u'http://60.209.166.172:8118', u'http://121.43.227.212:808', u'http://113.87.90.218:53281', u'http://112.123.42.94:9745', u'http://175.42.102.252:8118', u'http://116.248.172.233:80', u'http://175.16.221.31:8118', u'http://171.36.182.180:8118', u'http://115.215.50.218:8118', u'http://171.126.12.9:80', u'http://113.205.0.23:8118', u'http://106.58.152.171:80', u'http://59.63.178.203:53281', u'http://111.155.116.239:8123', u'http://117.90.34.87:8118', u'http://111.155.116.200:8123', u'http://61.183.176.122:53281', u'http://112.114.96.94:8118', u'http://58.49.122.30:53281', u'http://112.114.94.8:8118', u'http://27.22.63.12:808', u'http://112.114.78.28:8118']

    spider(validateProxy, max_threads=2)
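
Both spider() and failspider() read settings['COOKIE'] and turn it into a CookieJar with requests.utils.cookiejar_from_dict, so the Scrapy settings.py must define COOKIE as a plain dict of cookie names and values copied from a logged-in Lagou browser session. A minimal sketch, with hypothetical cookie names (substitute whatever your own session actually contains):

# settings.py (sketch) -- cookie names and values below are placeholders, not real credentials
COOKIE = {
    'user_trace_token': '<copied from the browser>',
    'JSESSIONID': '<copied from the browser>',
    'login': 'true',
}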

b. The crawler also relies on the database code in dbhelper. dbhelper is a utility class that connects to the database, creates the database and tables, and performs the insert, delete, update, and select operations. It is used throughout the spider code above, so watch for where it appears.
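
The dbhelper source is not listed in this section, but the calls above constrain its shape: testSelect() must return rows whose second and third columns are the position name and the detail-page URL, and testInsert() receives the parsed fields followed by the name. Below is a minimal sketch of what TestDBHelper might look like, assuming a MySQL backend via MySQLdb; the connection parameters, table names, and column layout are all assumptions, not the author's actual schema.

# sketch of TestDBHelper, assuming MySQL via MySQLdb -- table/column names are placeholders
import MySQLdb

class TestDBHelper(object):
    def __init__(self):
        # connection parameters are assumptions; adjust them to your own database
        self.conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='root',
                                    db='lagou', charset='utf8')
        self.cursor = self.conn.cursor()

    def testSelect(self):
        # each row is (id, name, url): row[1] and row[2] are what spider() expects
        self.cursor.execute("SELECT id, name, url FROM position_url")
        return self.cursor.fetchall()

    def testInsert(self, content_list):
        # content_list holds the parsed span fields followed by the position name
        placeholders = ','.join(['%s'] * len(content_list))
        sql = "INSERT INTO position_info VALUES (%s)" % placeholders
        self.cursor.execute(sql, tuple(content_list))
        self.conn.commit()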