
requests + redis distributed crawler

# __author__ = ''
# __createTime__ = '2019/1/7 13:49'
# __description__ = ''
# -*- coding:utf-8 -*-
import random
from itertools import chain
from urllib.parse import quote
from concurrent.futures import ThreadPoolExecutor
from redis import Redis
import pymysql
import requests
from lxml import etree
'''redis + requests distributed crawling (producer: collects company URLs)'''
redis_connect = Redis.from_url("redis://:6379", decode_responses=True)
db = pymysql.connect(host='193.112.41.49', user='', password="", database='spiders',
                     port=3306, charset='utf8mb4')
cursor = db.cursor()


class Conton_Fair():
    def __init__(self, url):
        self.url = url
        self.headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,und;q=0.7',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Cookie': 'ASP.NET_SessionId=u1rolptswy22kite05yuu2dr; Hm_lvt_26d823f5326e82607b28c9dd5bb3276f=1546075438; Hm_lpvt_26d823f5326e82607b28c9dd5bb3276f=1546075438; _gcl_au=1.1.1828690268.1546075439; _ga=GA1.3.682141728.1546075439; _ym_uid=15460754431066088148; _ym_d=1546075443; ASPSESSIONIDSQARTRST=JBKMEFAABPPOIONCBCGLIDOM; cookie-notification=1; ASPSESSIONIDQASDDBCA=ODAOCGMCBGEJAHGFIDCKFJHL; _ctauu_469_1=%7B%22uuid%22%3A%22cp21gbzc66s18asqrg96%22%2C%22vsts%22%3A2%2C%22imps%22%3A%7B%7D%2C%22cvs%22%3A%7B%7D%7D; safedog-flow-item=; WT_FPC=id=2eedfbfb975c7db4e0b1546075438399:lv=1546830767948:ss=1546830613964',
            'Host': 'www.cantonfair.org.cn',
            'Pragma': 'no-cache',
            'Referer': self.url,
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
        }

    def Get_url(self):
        htmls = requests.get(url=self.url, headers=self.headers)
        html = etree.HTML(htmls.text)
        return self.Save_url(html)

    def Save_url(self, html):
        h4 = html.xpath('//li//h4')
        for Company in h4:
            if Company.xpath('.//text()'):
                link = (Company.xpath('./a/@href')[0].replace('Product', 'Company')
                        .split('&productid')[0] + '&corptype=1').replace('en', 'cn')
                # push the company URL into the cache (a Redis set)
                redis_connect.sadd("urls", link)
        # next page
        Next = html.xpath('//a[text()="Next"]/@href')
        if Next:
            self.url = 'http://www.cantonfair.org.cn/en/search/%s' % Next[0]
            self.Get_url()


def main(kw):
    url_datas = quote(kw)
    # base URL inferred from the next-page link in Save_url (assumption:
    # the original line was truncated to "list.aspx?k=...")
    url = 'http://www.cantonfair.org.cn/en/search/list.aspx?k=%s&lang=2&len=100' % url_datas
    Class_Conton = Conton_Fair(url)
    Class_Conton.Get_url()


if __name__ == '__main__':
    # while True:
    ssql = """SELECT kw FROM words WHERE status=0 OR status=5 LIMIT 100"""
    cursor.execute(ssql)
    dataAll = cursor.fetchall()
    list_url = list(chain.from_iterable(dataAll))
    # urls = list(chain.from_iterable(dataAll))
    with ThreadPoolExecutor(3) as executor:
        for data_url in list_url:
            executor.submit(main, data_url)
            # mark the keyword as claimed so it is not picked up twice
            upda = '''UPDATE words SET status=5 WHERE kw=%r''' % data_url
            cursor.execute(upda)
            db.commit()

My approach to the distributed crawl is this: one machine crawls the specified listing pages and stores the extracted URLs in a cache. Collecting URLs is always faster than parsing them (a single listing page yields dozens of links), so even if every machine runs at the same speed, one crawl pass produces enough URLs to keep several machines parsing in parallel.
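A Redis set is what makes this split safe. Here is a minimal sketch of the two properties it relies on, assuming a Redis instance on localhost (the hostname and example URLs are illustrative):

from redis import Redis

# assumption: a Redis instance on localhost:6379
r = Redis.from_url("redis://localhost:6379", decode_responses=True)

# SADD ignores duplicates, so re-crawling a listing page cannot
# put the same company URL into the queue twice
r.sadd("urls", "http://example.com/a", "http://example.com/a", "http://example.com/b")
print(r.scard("urls"))  # 2

# SPOP atomically removes the member it returns, so two parser
# machines can never be handed the same URL
first = r.spop("urls")
second = r.spop("urls")
print(first, second)  # two distinct URLs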

Next comes the parsing side:

Because the data we need from this site is loaded dynamically, and my JavaScript is weak and I did not want to hunt down the loading functions, I render the pages with Splash instead. It works much like Selenium, but Splash is a bit faster, so that is what I chose.

It is worth looking into.

# __author__ = ''
# __createTime__ = '2019/1/7 15:20'
# __description__ = 'brief code description'

import time
import requests
from redis import Redis

redis_connect = Redis.from_url("redis://:6379", decode_responses=True)

def splash_render(url):
    # assumption: a Splash instance on localhost at the default port 8050
    # (the host was garbled in the original)
    splash_url = "http://localhost:8050/render.html"

    args = {
        "url": url,     # the page to render
        "timeout": 5,   # overall render timeout in seconds
        "images": 0     # skip image downloads to speed rendering up
    }
    response = requests.get(splash_url, params=args)
    return response.text


if __name__ == '__main__':
    # check whether the cache holds any URLs yet
    if "urls" in redis_connect.keys():
        # pop a random URL, removing it from the set; for de-duplication
        # of already-visited pages, consider a Bloom filter
        url = redis_connect.spop("urls")
        html = splash_render(url)
        print(html)
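One caveat with dynamically loaded pages: render.html may return before the page's JavaScript has injected the data. Splash accepts a `wait` argument (seconds to pause after the page loads) for exactly this case, so a variant of the call above might look like the following, still assuming a local Splash instance:

args = {
    "url": url,
    "timeout": 10,  # overall render timeout; must exceed `wait`
    "wait": 2,      # give the page's JavaScript two seconds to run
    "images": 0
}
response = requests.get("http://localhost:8050/render.html", params=args)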

This parsing script can be copied to many machines and run at the same time. Of course, the above is only a simplified version; don't think that this is all it takes to make the crawl distributed.
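For reference, turning the one-shot pop above into a long-running worker takes only a few more lines. A minimal sketch, reusing splash_render and redis_connect from the script above, with a hypothetical parse() placeholder for the actual extraction:

import time

def parse(html):
    # hypothetical placeholder: the real lxml extraction would go here
    print(len(html))

def worker():
    while True:
        # spop returns None when the set is empty
        url = redis_connect.spop("urls")
        if url is None:
            # the producer may still be crawling, so back off and retry
            time.sleep(5)
            continue
        parse(splash_render(url))

if __name__ == '__main__':
    worker()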