
Scrapy project: crawl all Zhihu users' information and save it to MongoDB


spider

import scrapy
import json, time, re
from zhihuinfo.items import ZhihuinfoItem


class ZhihuSpider(scrapy.Spider):
    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['https://www.zhihu.com/api/v4/members/eve-lee-55/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=20&limit=20', ]

    def parse(self, response):
        temp_data = json.loads(response.body.decode("utf-8"))["data"]
        count = len(temp_data)

        # If fewer than a full page of users came back, we have reached the last
        # page of this followee list, so there is no next page to request.
        if count <= 18:
            pass
        # Otherwise bump the offset so the spider turns to the next page.
        else:
            offset = re.findall(re.compile(r'&offset=(.*?)&'), response.url)[0]
            new_offset = int(offset) + 20
            print(new_offset)
            time.sleep(1)
            yield scrapy.Request(
                "https://www.zhihu.com/api/v4/members/eve-lee-55/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=" + str(new_offset) + "&limit=20",
                callback=self.parse,
                dont_filter=True)

        for i in temp_data:
            item = ZhihuinfoItem()
            item["name"] = i["name"]
            item["url_token"] = i["url_token"]
            item["headline"] = i["headline"]
            item["follower_count"] = i["follower_count"]
            item["answer_count"] = i["answer_count"]
            item["articles_count"] = i["articles_count"]
            item["id"] = i["id"]
            item["type"] = i["type"]

            # userinfo.txt records the url_token of every user already crawled,
            # so the same user is never scraped twice.
            with open("userinfo.txt") as f:
                user_list = f.read()

            if i["url_token"] not in user_list:
                with open("userinfo.txt", "a") as f:  # "a" opens the file in append mode
                    f.write(i["url_token"] + "-----")
                yield item

                # Switch into this new user's own followee list. The crawl keeps
                # spreading this way, so in theory it can eventually reach every
                # active, well-connected user.
                new_url = "https://www.zhihu.com/api/v4/members/" + i["url_token"] + "/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=20&limit=20"
                time.sleep(1)
                yield scrapy.Request(url=new_url, callback=self.parse)
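
One practical detail: parse() opens userinfo.txt for reading before it ever appends to it, so the file must already exist when the spider starts, otherwise the very first response raises FileNotFoundError. A minimal sketch of how to pre-create it (this initialization step is my addition and is not shown in the original project):

# create an empty dedup file before the first run if it is not there yet
# (assumed helper snippet, not part of the original spider)
import os

if not os.path.exists("userinfo.txt"):
    open("userinfo.txt", "w").close()

Re-reading the whole file into a string for every item works for a small crawl, but loading the tokens into a set once at start-up would scale better as the user list grows.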




pipelines

import pymongo
from scrapy.conf import settings  # settings access used by older Scrapy releases


class ZhihuinfoPipeline(object):
    def __init__(self):
        # Read the MongoDB connection details from the project settings.
        host = settings["MONGODB_HOST"]
        port = settings["MONGODB_PORT"]
        dbname = settings["MONGODB_DBNAME"]
        client = pymongo.MongoClient(host=host, port=port)
        tdb = client[dbname]
        self.post = tdb[settings["MONGODB_DOCNAME"]]

    def process_item(self, item, spider):
        # Convert the item to a plain dict and store it as one MongoDB document.
        # insert() is the older pymongo API; newer versions use insert_one().
        zhihuzhihu = dict(item)
        self.post.insert(zhihuzhihu)
        return item
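
The pipeline pulls its MongoDB connection details from the project settings, but the post does not show the corresponding settings.py entries. A plausible fragment, where the concrete host, port, database and collection names are my assumptions, might look like this:

# settings.py -- example values only; the original post does not list them
MONGODB_HOST = "127.0.0.1"    # assumed local MongoDB instance
MONGODB_PORT = 27017          # MongoDB default port
MONGODB_DBNAME = "zhihu"      # assumed database name
MONGODB_DOCNAME = "userinfo"  # assumed collection name

# the pipeline also has to be registered so Scrapy calls it for every item
ITEM_PIPELINES = {
    "zhihuinfo.pipelines.ZhihuinfoPipeline": 300,
}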


items

import scrapy


class ZhihuinfoItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    url_token = scrapy.Field()
    headline = scrapy.Field()
    follower_count = scrapy.Field()
    answer_count = scrapy.Field()
    articles_count = scrapy.Field()
    id = scrapy.Field()
    type = scrapy.Field()
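
Once the crawl has been running for a while, the stored documents can be inspected directly with pymongo. A quick check, assuming the example database and collection names from the settings sketch above:

import pymongo

client = pymongo.MongoClient("127.0.0.1", 27017)
collection = client["zhihu"]["userinfo"]  # assumed names from the settings sketch

print(collection.count_documents({}))     # number of users saved so far
print(collection.find_one({}, {"name": 1, "follower_count": 1, "_id": 0}))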


