Scraping cnblogs blog info for certain page styles with Scrapy
阿新 • Posted 2018-09-24
I have tested many cnblogs page styles. Blogs styled like 長書 (which is also how my own blog is styled) cannot be scraped, because their markup uses different tags. For any other style, just change the value of user in bky.py (i.e. 'username') to the username of the blog you want to scrape, for example:
user = "whz0215"
The code is as follows:
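For reference, the bokeyuan.* imports assume a standard Scrapy project generated with scrapy startproject bokeyuan, whose layout looks like:

bokeyuan/
    scrapy.cfg
    bokeyuan/
        __init__.py
        items.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            bky.py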
spiders/bky.py
import re

import scrapy
from scrapy import Request, Selector

from bokeyuan.items import BokeyuanItem


class BkySpider(scrapy.Spider):
    name = 'bky'
    allowed_domains = ['cnblogs.com']
    user = "username"  # cnblogs username of the blog to scrape
    cur_page = 1
    url = "https://www.cnblogs.com/%s/default.html?page=%s"

    def start_requests(self):
        yield Request(self.url % (self.user, self.cur_page), callback=self.parse)

    def parse(self, response):
        selector = Selector(text=response.text)
        one_page = selector.xpath('//div[@class="forFlow"]/div[@class="day"]')
        for each in one_page:
            title = each.xpath('div[@class="postTitle"]/a[@class="postTitle2"]/text()').extract_first()
            sec_title = each.xpath('div[@class="postCon"]/div[@class="c_b_p_desc"]/text()').extract_first()
            detail_url = each.xpath('div[@class="postTitle"]/a/@href').extract_first()
            desc = each.xpath('div[@class="postDesc"]/text()').extract_first()
            if desc:
                # postDesc looks like: posted @ <date> <time> <user> 阅读(N) 评论(M)
                split_desc = desc.strip().split()
                post_time = split_desc[2] + " " + split_desc[3]
                postor = split_desc[4]
                read = re.search(r"(\d+)", split_desc[5]).group(1)
                comment = re.search(r"(\d+)", split_desc[6]).group(1)
                item = BokeyuanItem()
                item["title"] = title
                item["sec_title"] = sec_title
                item["post_time"] = post_time
                item["postor"] = postor
                item["read"] = read
                item["comment"] = comment
                item["detail_url"] = detail_url
                yield item
        # On page 1 the pager is injected by script and absent from the static
        # HTML, so always try page 2. From page 2 on the pager is
        # server-rendered: keep going only while its last link is the
        # "下一页" (Next) link.
        if self.cur_page > 1:
            last_link = selector.xpath('//div[@class="pager"]/a[last()]/text()').extract_first()
            if not last_link or "下一" not in last_link:
                return
        self.cur_page += 1
        yield Request(url=self.url % (self.user, self.cur_page), callback=self.parse)
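The postDesc parsing above is purely positional, so it helps to see it on a concrete footer string. A minimal sketch; the sample string below is an assumption based on cnblogs' default "posted @ ..." footer format, not taken from a real crawl:

import re

# Assumed footer format: posted @ <date> <time> <user> 阅读(N) 评论(M)
desc = "posted @ 2018-09-24 10:30 whz0215 阅读(12) 评论(3)"
parts = desc.strip().split()

post_time = parts[2] + " " + parts[3]             # '2018-09-24 10:30'
postor = parts[4]                                 # 'whz0215'
read = re.search(r"(\d+)", parts[5]).group(1)     # '12'
comment = re.search(r"(\d+)", parts[6]).group(1)  # '3'
print(post_time, postor, read, comment)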
pipelines.py
import pymongo


class BokeyuanPipeline(object):
    def process_item(self, item, spider):
        return item


class MongoPipeline(object):
    collection_name = 'whz'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'bky'),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Truncate over-long summaries before storing.
        if item["sec_title"] and len(item["sec_title"]) > 50:
            item["sec_title"] = item["sec_title"][:50] + "..."
        self.db[self.collection_name].insert_one(dict(item))
        return item
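After a crawl you can check what the pipeline stored with a quick standalone pymongo script. A minimal sketch, assuming a local mongod and the 'bky' database / 'whz' collection defaults used above:

import pymongo

client = pymongo.MongoClient('localhost')
for doc in client['bky']['whz'].find().limit(3):
    print(doc['title'], doc['detail_url'])
client.close()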
items.py
from scrapy import Item, Field


class BokeyuanItem(Item):
    # Fields scraped for each post:
    # title, sec_title, post_time, postor, read, comment, detail_url
    title = Field()
    sec_title = Field()
    post_time = Field()
    postor = Field()
    read = Field()
    comment = Field()
    detail_url = Field()
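One useful property of declaring fields explicitly: assigning to an undeclared key raises KeyError, which is exactly how a typo like the original conment spelling surfaces. A small sketch:

from bokeyuan.items import BokeyuanItem

item = BokeyuanItem()
item["comment"] = "3"      # declared field: fine
try:
    item["conment"] = "3"  # undeclared field
except KeyError as err:
    print(err)             # e.g. 'BokeyuanItem does not support field: conment'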
settings.py
Set ROBOTSTXT_OBEY = False, uncomment DEFAULT_REQUEST_HEADERS and ITEM_PIPELINES and fill them in as below, then add the two MongoDB settings at the end. Note the setting names must match what MongoPipeline reads (MONGO_URI, MONGO_DATABASE), and the user agent goes under the standard User-Agent header name:

ROBOTSTXT_OBEY = False

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'Referer': 'https://www.cnblogs.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
}

ITEM_PIPELINES = {
    'bokeyuan.pipelines.BokeyuanPipeline': 300,
    'bokeyuan.pipelines.MongoPipeline': 301,
}

MONGO_URI = 'localhost'
MONGO_DATABASE = 'dbname'
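With everything in place, run the spider from the project root; the spider name comes from the name attribute in bky.py:

scrapy crawl bky

Scraped items pass through both pipelines and end up in the whz collection of the configured MongoDB database.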