
Scraping Dangdang with Scrapy


Spring Festival is drawing to a close, so it's time to get back into gear. The blog hasn't been updated in a while, and the crawler I wrote before the new year is overdue to be "brought out".

This time the target is Dangdang: collect the information for every book on the site, using Scrapy to crawl and MongoDB to store the data. Let's get started!

Starting URL:

start_urls = ['http://category.dangdang.com/cp01.00.00.00.00.00-shlist.html']

Dangdang's first-level and second-level book categories are clearly laid out on the page.


OK, the entry point is found. Dangdang has no anti-crawling measures, so it can be crawled directly without worry; if you crawl at scale, though, set a download delay so you don't put too much load on someone else's server.

DOWNLOAD_DELAY = 5
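A fixed delay does the job; if you want the crawler to adapt to how the server is coping, Scrapy's AutoThrottle extension can manage the delay for you. A possible addition to settings.py (these are standard Scrapy settings; the values are just a starting point, not what the original project used):

# Be polite: one request at a time per domain, and let AutoThrottle
# adjust the actual delay based on the server's response latency.
CONCURRENT_REQUESTS_PER_DOMAIN = 1
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 5      # initial delay in seconds
AUTOTHROTTLE_MAX_DELAY = 30       # upper bound when the server slows down
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0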

OK, straight to the code!

items.py
import scrapy


class BookDangdangItem(scrapy.Item):
    # these fields will be inserted into MongoDB
    price = scrapy.Field()       # price
    type_tag = scrapy.Field()    # category path
    name = scrapy.Field()        # book title
    image_url = scrapy.Field()   # cover image url from the list page
    link = scrapy.Field()        # url of the detail page
    star_level = scrapy.Field()  # rating
    pub_time = scrapy.Field()    # publication date
    publish = scrapy.Field()     # publisher
    brief = scrapy.Field()       # short description
    detail = scrapy.Field()      # book details, stored as a dict {}

spiders.py
# -*- coding: utf-8 -*-
import time
import logging

import scrapy
from scrapy.http.cookies import CookieJar

from ..items import BookDangdangItem
from ..settings import DEFAULT_REQUEST_HEADERS


class DangdangSpider(scrapy.Spider):
    name = 'dangdang'
    allowed_domains = ['dangdang.com']
    start_urls = ['http://category.dangdang.com/cp01.00.00.00.00.00-shlist.html']
    dom = 'http://category.dangdang.com'  # used to build absolute urls
    cookie_dict = {}

    def start_requests(self):
        return [scrapy.Request(url=self.start_urls[0], callback=self.parse,
                               headers=DEFAULT_REQUEST_HEADERS)]

    def parse(self, response):
        try:
            typestr = response.meta['type']
        except KeyError:
            typestr = ""
        # book category links in the navigation block
        types = response.xpath('//*[@id="navigation"]/ul/li[1]/div[2]/div[1]/div/span/a')
        tyname = response.xpath('//*[@id="navigation"]/ul/li[1]/@dd_name').extract_first()
        if types and tyname == '分类':  # keep recursing while the nav still shows category links
            for type in types:
                url = self.dom + type.xpath('@href').extract_first()  # url of each sub-category
                typestr_new = typestr + "{0}>>".format(type.xpath('text()').extract_first())  # multi-level category path
                scrapy.Spider.log(self, "Find url:{0},type{1}".format(url, typestr_new), logging.INFO)  # log the hit
                yield scrapy.Request(url=url, callback=self.parse, meta={'type': typestr_new},
                                     headers=DEFAULT_REQUEST_HEADERS)
        else:
            page = int(response.xpath('//*[@id="go_sort"]/div/div[2]/span[1]/text()').extract_first())  # current page
            all_page = int(response.xpath('//*[@id="go_sort"]/div/div[2]/span[2]/text()').extract_first().lstrip('/'))  # total pages
            for x in range(page, all_page):  # handle pagination
                yield scrapy.Request(url=self.dom + '/pg{0}-'.format(x) + response.url.split('/')[-1],
                                     callback=self.parse_page, headers=DEFAULT_REQUEST_HEADERS,
                                     meta={'type': typestr})

    def parse_page(self, response):
        """Parse the book info on a list page."""
        # the cookie is not required here; added only for testing
        cookie_jar = CookieJar()
        cookie_jar.extract_cookies(response, response.request)
        for k, v in cookie_jar._cookies.items():
            for i, j in v.items():
                for m, n in j.items():
                    self.cookie_dict[m] = n.value
        # print(self.cookie_dict)

        for item in response.xpath('//*[@id="search_nature_rg"]/ul[@class="bigimg"]/li'):
            # every book on the page
            book = BookDangdangItem()
            book['price'] = float(item.xpath('./p[@class="price"]/span[1]/text()').extract_first().lstrip('¥'))
            book['type_tag'] = response.meta['type']
            book['name'] = item.xpath('./p[@class="name"]/a/text()').extract_first().strip()
            book['image_url'] = item.xpath('./a/img/@src').extract_first()
            book['link'] = item.xpath('./p[1]/a/@href').extract_first()
            # the rating is rendered as a width percentage in the style attribute
            book['star_level'] = int(item.xpath('./p[@class="search_star_line"]/span/span/@style').extract_first()
                                     .split(' ')[-1].rstrip('%;'))
            try:
                book['pub_time'] = item.xpath('.//p[@class="search_book_author"]/span[2]/text()').extract_first().split('/')[-1]
            except Exception as e:
                book['pub_time'] = time.strftime("%Y-%m-%d")
            try:
                book['publish'] = item.xpath(
                    './p[@class="search_book_author"]/span[3]/a/text()').extract_first().strip()
            except Exception as e:
                book['publish'] = "暫無出版社信息"
            try:
                book['brief'] = item.xpath('./p[2]/text()').extract_first().strip()
            except Exception as e:
                book['brief'] = "暫無書籍簡述"
            yield scrapy.Request(callback=self.parse_book, cookies=self.cookie_dict,
                                 headers=DEFAULT_REQUEST_HEADERS, meta={'item': book}, url=book['link'])

    def parse_book(self, response):
        """Follow the link and parse the book's detail page."""
        book = response.meta['item']
        book['detail'] = {}
        info = response.xpath('//ul[@class="key clearfix"]/li/text()').extract()
        print(info)
        for i in info:
            t = i.partition('：')  # each line is "key：value", separated by a full-width colon
            k = t[0].replace(' ', '')
            v = t[-1]
            if v == '':
                v = "暫無詳情"
            book['detail'][k] = v

        # the author introduction is laid out differently across Dangdang's sections;
        # there are many variants, so handle it roughly for now
        try:
            book['detail']['author_detail'] = response.xpath(
                '//span[@id="authorIntroduction-show"]/text()').extract_first().replace('\n', '')
        except Exception as e:
            book['detail']['author_detail'] = "暫無作者信息"

        yield book
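A quick illustration of what the str.partition call in parse_book does with one metadata line from the detail page (the full-width colon as separator and the sample line itself are assumptions on my part):

line = "出版社：人民郵電出版社"         # hypothetical detail line
key, _, value = line.partition("：")    # split on the assumed full-width colon
print(key)    # 出版社
print(value)  # 人民郵電出版社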

A note: the cookie is not needed for this crawler, I only added it for testing; the request headers don't need much customization either.

pipelines.py
from scrapy.conf import settings
from scrapy import signals
from pymongo import MongoClient


class DangDangSpiderPipeline(object):
    def __init__(self):
        # read the host, port, database and collection names from settings
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        dbname = settings['MONGODB_DBNAME']
        col = settings['MONGODB_COL']

        # create a mongo client
        client = MongoClient(host=host, port=port)

        # get the database
        db = client[dbname]

        # get the collection
        self.col = db[col]

    def process_item(self, item, spider):
        data = dict(item)
        self.col.insert(data)
        return item
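A side note on the pipeline: Collection.insert() works but has been deprecated since pymongo 3.0, and insert_one() is the current equivalent. Once the crawl has run, a quick read-back confirms the documents landed; a small sketch, assuming the host, database and collection names from settings.py below:

from pymongo import MongoClient

client = MongoClient('127.0.0.1', 27017)
col = client['dangdangs']['books']

print(col.count_documents({}))                                # number of books stored
print(col.find_one({}, {'name': 1, 'price': 1, '_id': 0}))    # peek at one document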
settings.py
USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
              'Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0')

ROBOTSTXT_OBEY = False

DOWNLOAD_DELAY = 5

COOKIES_ENABLED = False

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'authority': 'www.dangdang.com',
    'method': 'GET',
    'path': '/',
    'scheme': 'http',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.9',
    'referer': None,
    'upgrade-insecure-requests': '1',
    'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0'),
}  # optional, the defaults work fine

ITEM_PIPELINES = {
    'dangdangspider.pipelines.DangDangSpiderPipeline': 300,
}

# MongoDB
# loopback address of the host
MONGODB_HOST = '127.0.0.1'
# port, 27017 by default
MONGODB_PORT = 27017
# database name
MONGODB_DBNAME = 'dangdangs'
# collection name
MONGODB_COL = 'books'

The request headers in settings.py can be left out entirely, the defaults work fine; I added them for testing, which is also why there are two user-agent strings above. Just for fun :-)

The real purpose of this crawler was to figure out how cookies are attached in Scrapy, how they are used, and how many ways there are to use them. Cookies in Scrapy get little coverage in the official docs or in the better-known blogs, yet they matter in practice: when simulating a login, or for sites like Taobao that require a cookie just to load a page. For how to use cookies, see my other blog post.
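As a pointer in that direction, these are the two mechanisms Scrapy itself provides. A minimal sketch with a hypothetical spider and placeholder cookie values; it assumes COOKIES_ENABLED is left at its default of True so the CookiesMiddleware keeps the session going (unlike the settings.py above, which disables it):

import scrapy


class CookieDemoSpider(scrapy.Spider):
    name = 'cookie_demo'   # hypothetical spider, not part of the Dangdang project

    def start_requests(self):
        # 1) Attach cookies to a single request; the CookiesMiddleware then
        #    carries the session cookies on to follow-up requests.
        yield scrapy.Request('http://example.com/',
                             cookies={'sessionid': 'xxxx'},   # placeholder value
                             callback=self.parse)

    def parse(self, response):
        # 2) The 'cookiejar' meta key keeps several independent cookie sessions
        #    within one spider (useful for crawling with multiple accounts).
        yield scrapy.Request('http://example.com/account',
                             meta={'cookiejar': 1},
                             callback=self.parse_account)

    def parse_account(self, response):
        self.logger.info('fetched %s', response.url)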
