1. 程式人生 > >scrapy實戰8關於數據異步寫入mysql:

scrapy實戰8關於數據異步寫入mysql:

extract [0 self mod back sqlt pos osi tencent

環境:python3

爬取網址:騰訊社招(http://hr.tencent.com/position.php?keywords=&tid=0&start=0#a)總共2202條數據

pipelines.py

 1 from twisted.enterprise import adbapi
 2 import pymysql
 3 import pymysql.cursors
 4 
 5 
 6 class MysqlTwistedPipeline(object):
 7     def __init__(self,dbpool):
 8         self.dbpool=dbpool
9 10 @classmethod 11 def from_settings(cls,settings): 12 dbpool=adbapi.ConnectionPool("pymysql",host=settings["MYSQL_HOST"],db=settings["MYSQL_DBNAME"],user=settings["MYSQL_USER"],password=settings["MYSQL_PASSWORD"],charset="utf8", cursorclass=pymysql.cursors.DictCursor, 13 use_unicode=True)
14 return cls(dbpool) 15 16 def process_item(self,item,spider): 17 # 使用twisted將mysql插入變成異步執行 18 self.dbpool.runInteraction(self.do_insert,item) 19 20 21 def do_insert(self,cursor,item): 22 # 執行具體的插入 23 # 根據不同的item 構建不同的sql語句並插入到mysql中 24 insert_sql, params = item.get_insert_sql()
25 cursor.execute(insert_sql, params)

items.py

 1 import scrapy
 2 
 3 
 4 class TencentItem(scrapy.Item):
 5    
 6     positionname=scrapy.Field()
 7     positionlink=scrapy.Field()
 8     positionType=scrapy.Field()
 9     positionNum=scrapy.Field()
10     positionLocation=scrapy.Field()
11     publishTime=scrapy.Field()
12 
13 
14     def get_insert_sql(self):
15         insert_sql="""
16         insert into tencent(positionname,positionlink,positionType,positionNum,positionLocation,publishTime)
17         VALUES (%s,%s,%s,%s,%s,%s)
18         
19         """
20         params=(
21             self[positionname], self[positionlink], self[positionType], self[positionNum],
22             self[positionLocation], self[publishTime]
23         )
24         return insert_sql,params
25     

settings.py

# Scrapy project settings for the Tencent job-posting crawler.
# (Reconstructed: the pasted source had all string quotes stripped and
# inline Chinese notes that were syntax errors; notes moved into comments.)

BOT_NAME = "tencent"

SPIDER_MODULES = ["tencent.spiders"]
NEWSPIDER_MODULE = "tencent.spiders"

ROBOTSTXT_OBEY = False

# The next three settings enable scrapy-redis distributed crawling;
# omit them for a single-machine crawl.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER_PERSIST = True

# Throttle requests to be polite to hr.tencent.com.
DOWNLOAD_DELAY = 2

DEFAULT_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    # "Accept-Language": "en",
}

ITEM_PIPELINES = {
    # RedisPipeline (400) only needed for distributed crawling; MySQL
    # pipeline (300) runs first so every node also persists to MySQL.
    "scrapy_redis.pipelines.RedisPipeline": 400,
    "tencent.pipelines.MysqlTwistedPipeline": 300,
}

# Redis master for distributed crawling (ignore for single-machine runs).
REDIS_HOST = "172.21.118.56"
REDIS_PORT = 6379

# MySQL connection — replace user/password with your own credentials.
MYSQL_HOST = "127.0.0.1"
MYSQL_DBNAME = "tencent"
MYSQL_USER = "usrername"
MYSQL_PASSWORD = "userpassword"

spiders/Tencent.py

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy_redis.spiders import RedisCrawlSpider
from tencent.items import TencentItem

class TencentSpider(RedisCrawlSpider):
    """Distributed (scrapy-redis) crawler for Tencent job postings."""

    name = "Tencent"
    allowed_domains = ["tencent.com"]
    # Seed URLs are pushed into this Redis list to start the crawl.
    # BUG FIX: must be a string literal; the bare TencentSpider:start_urls
    # in the source was a syntax error.
    redis_key = "TencentSpider:start_urls"

    # Follow every pagination link (start=0, start=10, ...).
    page_link = LinkExtractor(allow=(r"start=\d+"))

    rules = [
        Rule(page_link, callback="parseContent", follow=True),
    ]

    def parseContent(self, response):
        """Yield one TencentItem per job row on a listing page."""
        # Job rows alternate between the "even" and "odd" CSS classes.
        # (Renamed from `list`, which shadowed the builtin.)
        rows = response.xpath('//tr[@class="even"] | //tr[@class="odd"]')
        for infos in rows:
            item = TencentItem()
            item["positionname"] = infos.xpath("./td[1]/a/text()").extract()[0]
            # NOTE(review): source was email-garbled to "[email protected]";
            # "./td[1]/a/@href" is the detail-page link the item stores.
            item["positionlink"] = infos.xpath("./td[1]/a/@href").extract()[0]
            # BUG FIX: take the first match like every other field — a raw
            # list would break the parameterized %s insert in the pipeline.
            item["positionType"] = infos.xpath("./td[2]/text()").extract()[0]
            item["positionNum"] = infos.xpath("./td[3]/text()").extract()[0]
            item["positionLocation"] = infos.xpath("./td[4]/text()").extract()[0]
            item["publishTime"] = infos.xpath("./td[5]/text()").extract()[0]

            yield item

scrapy實戰8關於數據異步寫入mysql: