
Single-IP and multi-IP proxy setup with requests, plus a small Zhilian Zhaopin crawler wrapper

When using an IP proxy with the requests library, the configuration differs between a single proxy and a proxy pool.
(Tested and working at the time of writing; please point out any errors in the comments.)

  • Single-IP proxy mode
    (headers and other options omitted for brevity)

    import requests

    # Keys in the proxies dict must be lowercase scheme names;
    # an uppercase key like 'HTTPS' is silently ignored by requests.
    proxy = {
        'https': 'http://162.105.30.101:8080'
    }
    url = 'target URL to crawl'
    response = requests.get(url, proxies=proxy)
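
To confirm the proxy is actually being used, it helps to request a service that echoes the caller's IP. A quick sanity check (a sketch, assuming the proxy above is still alive; httpbin.org/ip simply returns the IP it sees):

    import requests

    proxy = {'https': 'http://162.105.30.101:8080'}
    # With a working proxy, this prints the proxy's address, not your own.
    r = requests.get('https://httpbin.org/ip', proxies=proxy, timeout=5)
    print(r.json())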
    
  • Multi-IP proxy mode (proxy pool)

import requests
# random is used to pick one proxy from the pool per request
import random

proxy = [
    {
        'http': 'http://61.135.217.7:80',
        'https': 'http://61.135.217.7:80',
    },
    {
        'http': 'http://118.114.77.47:8080',
        'https': 'http://118.114.77.47:8080',
    },
    {
        'http': 'http://112.114.31.177:808',
        'https': 'http://112.114.31.177:808',
    },
    {
        'http': 'http://183.159.92.117:18118',
        'https': 'http://183.159.92.117:18118',
    },
    {
        'http': 'http://110.73.10.186:8123',
        'https': 'http://110.73.10.186:8123',
    },
]

url = 'target URL to crawl'
response = requests.get(url, proxies=random.choice(proxy))
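
Free proxies go stale quickly, so random.choice alone will occasionally hand the request a dead proxy. Below is a minimal retry sketch (fetch_with_retry is a hypothetical helper, not part of requests) that falls back to a different random proxy on connection errors:

import random
import requests

def fetch_with_retry(url, pool, attempts=3, timeout=5):
    # Hypothetical helper: try up to `attempts` random proxies from `pool`
    # before giving up; any requests failure triggers another attempt.
    last_err = None
    for _ in range(attempts):
        try:
            return requests.get(url, proxies=random.choice(pool), timeout=timeout)
        except requests.RequestException as err:
            last_err = err  # dead or slow proxy: retry with another one
    raise last_err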

A simple Zhilian Zhaopin (智聯招聘) crawler wrapper

import requests
from bs4 import BeautifulSoup
import ssl
import time
import random

# Disable certificate verification globally to work around SSL errors
ssl._create_default_https_context = ssl._create_unverified_context

# Pool of User-Agent strings; one is picked at random per request
user_agent = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)"
, "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)", "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5", "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20", "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52", ] ''' 代理若出錯,替換代理池,但代理池需要更新 ''' # proxy = [ # { # 'http': 'http://61.135.217.7:80', # 'https': 'http://61.135.217.7:80', # }, # { # 'http': 'http://118.114.77.47:8080', # 'https': 'http://118.114.77.47:8080', # }, # { # 'http': 'http://112.114.31.177:808', # 'https': 'http://112.114.31.177:808', # }, # { # 'http': 'http://183.159.92.117:18118', # 'https': 'http://183.159.92.117:18118', # }, # { # 'http': 'http://110.73.10.186:8123', # 'https': 'http://110.73.10.186:8123', # }, # ] def get_job_txt(city,kw,txt_name): for i in range(100): time.sleep(2) url = 'https://sou.zhaopin.com/jobs/searchresult.ashx?jl={2}&kw={0}&sm=0&p={1}'.format(kw,i,city) response = requests.get(url,headers = {'User-Agent': random.choice(user_agent)}).content.decode() soup =BeautifulSoup(response,'lxml') tables = soup.select('.newlist')[1:] if tables: for table in tables: job = table.select('.zwmc')[0].text company = table.select('.gsmc')[0].text money = table.select('.zwyx')[0].text place = table.select('.gzdd')[0].text href = table.select('.zwmc')[0].find('a')['href'] print(job+'\t'+company+'\t'+money+'\t'+place+'\t'+href+'\n') with open('{0}.txt'.format(txt_name),'a+',encoding='utf-8',errors='ignore') as f: f.write(job+'\t'+company+'\t'+money+'\t'+place+'\t'+href+'\n') else: print('總頁'+ str(i)) break if __name__ == '__main__': city = input('輸入城市') kw = input('輸入崗位') txt_name = input('輸入儲存檔名') get_job_txt(city=city,kw=kw,txt_name=txt_name)