
Common crawler settings


1. settings.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
@author: yugengde
@contact: [email protected]
@file : settings.py
@time: 2017/11/22 15:41
"""

BOT_NAME = 'pro'

SPIDER_MODULES = ['pro.spiders']
NEWSPIDER_MODULE = 'pro.spiders'

ROBOTSTXT_OBEY = False

DOWNLOAD_DELAY = 3
COOKIES_ENABLED = False

DOWNLOADER_MIDDLEWARES = {
    'pro.middlewares.PhantomJSMiddleware': 301,
    'pro.middlewares.UserAgentMiddleware': 300,
}

ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 301,
    'pro.pipelines.DuplicatesPipeline': 300,
}

LOG_ENABLED = True
LOG_ENCODING = 'utf-8'
LOG_FILE = 'pro.log'
LOG_LEVEL = 'DEBUG'
# LOG_STDOUT =

# scrapy_redis: shared scheduler, dupefilter and Redis connection
# so several spider processes can work off one crawl queue
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
REDIS_URL = 'redis://root:password@localhost:6379'
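With SCHEDULER and REDIS_URL pointing at scrapy_redis, spiders pull their start URLs from Redis instead of a hard-coded start_urls list. A minimal sketch of such a spider follows; the spider name, redis_key, and parsed fields are placeholders for illustration, not taken from the original project:

from scrapy_redis.spiders import RedisSpider


class ProSpider(RedisSpider):
    name = 'pro'
    # Start URLs are popped from this Redis list, so multiple
    # spider processes can share one crawl queue.
    redis_key = 'pro:start_urls'

    def parse(self, response):
        # Placeholder fields matching what the pipelines below expect.
        yield {
            'title': response.css('title::text').extract_first(),
            'item_id': response.url,
        }

URLs are then fed to the crawl from Redis, e.g. redis-cli lpush pro:start_urls http://example.com/.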

2. middlewares.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
import random

from scrapy.http import HtmlResponse
from selenium import webdriver


class PhantomJSMiddleware(object):
    @classmethod
    def process_request(cls, request, spider):
        # Render the page with PhantomJS so JavaScript-generated
        # content ends up in the response body.
        driver = webdriver.PhantomJS(r'C:\InstallFile\Phantomjs\bin\phantomjs.exe')
        driver.get(request.url)
        content = driver.page_source.encode('utf-8')
        driver.quit()

        # Returning a Response here short-circuits Scrapy's own downloader.
        return HtmlResponse(request.url, encoding='utf-8', body=content, request=request)


class UserAgentMiddleware(object):
    @classmethod
    def process_request(cls, request, spider):
        user_agents = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130331 Firefox/21.0",
            "Mozilla/5.0 (Windows x86; rv:19.0) Gecko/20100101 Firefox/19.0",
            # these could also be generated with the UserAgent() helper
        ]
        request.headers.setdefault('User-Agent', random.choice(user_agents))

3. pipelines.py

#!/usr/bin/python
# -*- coding: utf-8 -*-

from scrapy.exceptions import DropItem


# Deduplicate scraped items
class DuplicatesPipeline(object):
    def __init__(self):
        self.ids_seen = set()

    def process_item(self, item, spider):
        if not item['title']:
            raise DropItem("Missing title in %s" % item)

        if item['item_id'] in self.ids_seen:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.ids_seen.add(item['item_id'])
            # process_item must return the item (not yield it)
            # to pass it on to the next pipeline stage
            return item
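The pipeline assumes each item carries title and item_id fields. A matching Item definition, assumed for illustration since the project's items.py is not shown, might look like:

import scrapy


class ProItem(scrapy.Item):
    # Fields that DuplicatesPipeline reads; names are assumptions.
    title = scrapy.Field()
    item_id = scrapy.Field()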
