1. 程式人生 > >python:用scrapy爬取天貓評論

python:用scrapy爬取天貓評論

1,建立專案:scrapy startproject tb

2,cd tb,建立一個spider:scrapy genspider 爬蟲名字 網站域名

3, 在items中寫自己想爬的東西  ,這裡我爬的是評論 ,型號,使用者名稱

4,在pipelines.py寫儲存的方式,我這裡寫的是資料夾

5,在settings.py裡面開啟以下設定

# Browser-like User-Agent header sent with the crawler's requests.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/70.0.3538.67 Safari/537.36"
)

# Do not consult robots.txt before fetching pages.
ROBOTSTXT_OBEY = False

# Delay (in seconds) between consecutive requests.
DOWNLOAD_DELAY = 3

# Enable the Selenium-backed downloader middleware (priority 543).
DOWNLOADER_MIDDLEWARES = {
    "tb.middlewares.SeleniumMiddlewares": 543,
}

# Enable the item pipeline that stores scraped reviews (priority 300).
ITEM_PIPELINES = {
    "tb.pipelines.TbPipeline": 300,
}

6,spider.py 中

import scrapy
from scrapy import Request
import lxml.html
from tb.items import TbItem

class TaobaosSpider(scrapy.Spider):
    """Scrape review text, product variant and reviewer name from a Tmall item page.

    Every request is tagged with ``meta["page"]``: ``"1"`` for the initial
    item page and ``"2"`` for each follow-up request that asks for the next
    page of reviews.
    """

    name = 'tianmao1'

    def start_requests(self):
        """Yield the initial request for the item detail page (tagged as page "1")."""
        base_url = "https://detail.tmall.com/item.htm?spm=a220m.1000858.1000725.7.3c3f2a68kywm0p&id=549984903510&skuId=3721826599822&areaId=410100&user_id=370627083&cat_id=2&is_b=1&rn=da3c446a634049bd41933f1cde5d6d1f"
        yield Request(url=base_url, callback=self.parse, dont_filter=True, meta={"page": "1"})

    @staticmethod
    def _first_text(node, query):
        """Return the first result of XPath *query* on *node*, or "" when nothing matches."""
        matches = node.xpath(query)
        return matches[0] if matches else ""

    def parse(self, response):
        """Yield one ``TbItem`` per review row, then a request for the next review page.

        Args:
            response: The response whose body contains the rendered review table.

        Yields:
            A ``TbItem`` for each ``<tr>`` of the ``rate-grid`` table, followed by
            a single ``Request`` tagged ``meta["page"] == "2"``.
        """
        tr_list = response.xpath('//div[@class="rate-grid"]/table/tbody/tr').extract()
        for tr in tr_list:
            html = lxml.html.fromstring(tr)
            # Build a fresh item per row: reusing one instance would make every
            # yielded item alias the same object, which is then overwritten by
            # the next row.
            item = TbItem()
            # A missing cell yields "" instead of raising IndexError, so one
            # malformed row cannot abort the page (and the next-page request).
            item["pinglun"] = self._first_text(html, '//td[@class="tm-col-master"]/div/div[1]/text()')
            # NOTE(review): the key is spelled "xianghao" while the value is the
            # model ("xinghao"); kept as-is because it must match the field
            # declared on TbItem -- confirm against items.py.
            item["xianghao"] = self._first_text(html, '//td[@class="col-meta"]/div/p/text()')
            item["xingming"] = self._first_text(html, '//td[@class="col-author"]/div/text()')
            yield item
        # The URL here is only a placeholder: the page-2 branch of
        # SeleniumMiddlewares.process_request never loads request.url, it clicks
        # the "next page" link in the already-open browser instead.
        # dont_filter=True keeps the duplicate filter from dropping the repeated URL.
        yield Request(url="http://www.baidu.com", callback=self.parse, meta={"page": "2"}, dont_filter=True)
7,在middlewares.py填寫
class SeleniumMiddlewares(object):
    """Downloader middleware that renders review pages in a Selenium-driven Chrome.

    Requests are dispatched on ``request.meta["page"]``:

    * ``1`` -- load ``request.url``, scroll down and open the reviews tab.
    * ``2`` -- scroll the already-open page and click the "next page" link.

    In both cases the browser's current page source is returned as an
    ``HtmlResponse``. Requests without a ``page`` entry, or with any other
    value, are left to Scrapy's normal downloader.

    This class relies on ``time``, ``webdriver``, ``Options`` and
    ``HtmlResponse`` being imported at module level.
    """

    def __init__(self):
        """Start the Chrome browser that is shared by all requests."""
        self.options = Options()
        #self.options.add_argument('-headless')
        # Raw string: the Windows path contains backslashes that would otherwise
        # be parsed as (invalid) escape sequences.
        self.browser = webdriver.Chrome(executable_path=r"F:\第七重新爬蟲\day06\day06全天\ziliao\chromedriver.exe", chrome_options=self.options)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and arrange for the browser to be closed with the spider."""
        from scrapy import signals

        middleware = cls()
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def spider_closed(self, spider):
        """Quit the browser so the driver process does not outlive the crawl."""
        self.browser.quit()

    def _scroll(self, steps, pixels):
        """Scroll the page down *steps* times by *pixels*, pausing 3 seconds after each step."""
        for _ in range(steps):
            self.browser.execute_script("window.scrollBy(0,%d)" % pixels)
            time.sleep(3)

    def _snapshot(self, request):
        """Wrap the browser's current URL and page source in an ``HtmlResponse``."""
        return HtmlResponse(url=self.browser.current_url, body=self.browser.page_source, request=request, encoding="utf-8")

    def process_request(self, request, spider):
        """Serve page-tagged requests from the browser instead of the downloader.

        Args:
            request: The outgoing request; its ``meta["page"]`` selects the action.
            spider: The spider that issued the request (unused).

        Returns:
            An ``HtmlResponse`` built from the browser for page 1 or 2, otherwise
            ``None`` so that Scrapy continues processing the request normally.
        """
        page = request.meta.get("page")
        if page is None:
            # Not one of our tagged requests; do not fail with KeyError.
            return None
        page = int(page)

        if page == 1:
            self.browser.get(request.url)
            time.sleep(5)
            self._scroll(10, 220)
            pages = self.browser.find_element_by_xpath('//li/a[@href="#J_Reviews"]')
            pages.click()
            time.sleep(5)
            return self._snapshot(request)

        if page == 2:
            self._scroll(20, 200)
            # NOTE(review): the link text must match the site's own wording
            # exactly -- confirm this literal against the live page.
            pages = self.browser.find_element_by_link_text("下一頁>>")
            # Click through JavaScript rather than pages.click().
            self.browser.execute_script("arguments[0].click();", pages)
            # Give the next page of reviews time to load before capturing the
            # source, mirroring the wait used after the click on page 1;
            # presumably the snapshot would otherwise still show the old page.
            time.sleep(5)
            return self._snapshot(request)

        return None

這裡用Selenium模擬點選評論連結,獲取頁面後傳給spider解析

8 ,啟動爬蟲 scrapy crawl 爬蟲名