1. 程式人生 >> scrapy實戰爬取cl社群評論數超過設定值的連結

scrapy實戰爬取cl社群評論數超過設定值的連結

1、建立scrapy專案

scrapy startproject cl

2、前戲

  a、註釋爬蟲檔案中的allowed_domains

  b、settings.py第22行,ROBOTSTXT_OBEY = True改為ROBOTSTXT_OBEY = False

  c、settings.py第19行,改為USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'

  d、開啟管道:67-69行,

  ITEM_PIPELINES = {
       'mytestscrapy.pipelines.MytestscrapyPipeline': 300,
    }

3、cl.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy import Selector
from mytestscrapy.items import MytestscrapyItem
import time
import random

class TestCLSpider(scrapy.Spider):
    """Crawl the forum list pages (pages 1-30) and yield an item for every
    thread whose comment count is at least 4."""
    name = 'cl'
    # Deliberately commented out: start_urls is NOT under this domain, and an
    # active allowed_domains would make OffsiteMiddleware drop every request
    # (see step 2a of the write-up above).
    # allowed_domains = ['www.baidu.com']
    start_urls = ['https://cc.yyss.icu/thread0806.php?fid=2&search=&page=1']
    print("第1頁開始")
    # Template used to build the URL of pages 2..30.
    url = 'https://cc.yyss.icu/thread0806.php?fid=2&search=&page=%d'
    pageNum = 1

    def parse(self, response):
        rows = Selector(response=response).xpath(
            '//table[@id="ajaxtable"]/tbody[@style="table-layout:fixed;"]'
            '/tr[@class="tr3 t_one tac"]')
        # The first page carries two extra header rows in the matched set;
        # skip them. Subsequent pages do not.
        if self.pageNum == 1:
            rows = rows[2:]
        for tr in rows:
            count = tr.xpath('./td[4]/text()').extract_first()
            # Skip rows missing a comment-count cell, and filter out threads
            # with fewer than 4 comments.
            if count is None or int(count) < 4:
                continue
            text = tr.xpath('./td[2]//a/text()').extract_first()
            url = 'https://cc.yyss.icu/' + tr.xpath('./td[2]//a/@href').extract_first()
            item = MytestscrapyItem()
            item['urlname'] = text
            item['urladdr'] = url
            item['commentsNum'] = count
            yield item
        # Crawl pages 1-30; sleep a random 2-5 seconds between pages to
        # throttle the request rate.
        if self.pageNum < 30:
            time.sleep(random.randint(2, 5))
            self.pageNum += 1
            new_url = self.url % self.pageNum
            print("第%s頁開始" % self.pageNum)
            yield scrapy.Request(url=new_url, callback=self.parse)

 

4.items.py

import scrapy


class MytestscrapyItem(scrapy.Item):
    """Container for one qualifying forum thread scraped by the `cl` spider."""

    urlname = scrapy.Field()      # thread title text
    urladdr = scrapy.Field()      # absolute URL of the thread
    commentsNum = scrapy.Field()  # comment count as scraped from the list page

5、pipelines.py(資料存入mysql資料庫,mysql資料庫cl_table表的欄位urlname, urladdr, commentsNum)

import pymysql


class MytestscrapyPipeline(object):
    """Persist crawled items into the MySQL table ``cl_table``.

    Columns written: urlname, urladdr, commentsNum.
    """

    # Set in open_spider; None until the spider starts.
    connect = None
    cursor = None

    def open_spider(self, spider):
        # One connection and one reusable cursor per spider run, instead of
        # opening a fresh cursor for every item and never closing it.
        self.connect = pymysql.Connect(
            host='localhost',
            port=3306,
            user='root',
            passwd='123456',
            db='cl',
            charset='utf8'
        )
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        # Parameterized query: pymysql escapes the values itself, which
        # prevents SQL injection and quoting errors when a thread title
        # contains quotes — unlike the previous %-string interpolation.
        sql = "INSERT INTO cl_table (urlname, urladdr, commentsNum) VALUES (%s, %s, %s)"
        data = (item['urlname'], item['urladdr'], item['commentsNum'])
        try:
            self.cursor.execute(sql, data)
        except Exception as e:
            self.connect.rollback()  # roll back the failed transaction
            print('事務處理失敗', e)
        else:
            self.connect.commit()  # commit on success
            print('事務處理成功', self.cursor.rowcount)
        return item

    def close_spider(self, spider):
        # Guard against a run where open_spider never completed.
        if self.cursor is not None:
            self.cursor.close()
        if self.connect is not None:
            self.connect.close()