scrapy框架爬取微博之spider檔案
阿新 • • 發佈:2018-12-10
# -*- coding: utf-8 -*-
import datetime
import json
import re

import scrapy
from scrapy.settings import default_settings
from w3lib.html import remove_tags

from ..items import WeiboItem

# URL templates; the user id is substituted with %s so that both integer and
# string ids format correctly.
_FOLLOWERS_URL = (
    'https://m.weibo.cn/api/container/getIndex'
    '?containerid=231051_-_followers_-_%s'
)
_PROFILE_WEIBO_URL = (
    'https://m.weibo.cn/api/container/getIndex'
    '?containerid=230413%s_-_WEIBO_SECOND_PROFILE_WEIBO'
)

# Matches relative ages such as "5分钟前" / "3小时前".
# NOTE(review): the exact wording of relative timestamps returned by the API
# is assumed here (simplified and traditional spellings are both accepted);
# confirm against live responses.
_RELATIVE_AGE = re.compile(r'(\d+)\s*(秒|分钟|分鐘|小时|小時)前')
_AGE_UNITS = {
    '秒': 'seconds',
    '分钟': 'minutes',
    '分鐘': 'minutes',
    '小时': 'hours',
    '小時': 'hours',
}


def _normalize_created_at(created_at, now=None):
    """Turn a relative ``created_at`` value into a ``month-day`` string.

    The previous implementation replaced relative values with the literal
    dates ``'8-30'`` / ``'8-29'``, which was only right on the day it was
    written.  The date is now derived from ``now``.

    Args:
        created_at: The raw ``created_at`` string of a post.
        now: ``datetime.datetime`` used as the reference point; defaults to
            the local clock of the machine running the spider.

    Returns:
        ``'M-D'`` (no zero padding, same shape as the old hard-coded values)
        when ``created_at`` contains ``'前'``, ``'刚刚'`` or ``'昨天'``;
        otherwise ``created_at`` unchanged.
    """
    if now is None:
        now = datetime.datetime.now()

    # '前' is tested first, as in the original code.
    if '前' in created_at:
        moment = now
        match = _RELATIVE_AGE.search(created_at)
        if match:
            # Subtract the stated age so that e.g. "3 hours ago" shortly
            # after midnight resolves to the previous day.
            unit = _AGE_UNITS[match.group(2)]
            moment = now - datetime.timedelta(**{unit: int(match.group(1))})
    elif '刚刚' in created_at:
        # presumably "just now"; treated as the current day -- TODO confirm
        moment = now
    elif '昨天' in created_at:
        moment = now - datetime.timedelta(days=1)
    else:
        return created_at

    return '%d-%d' % (moment.month, moment.day)


class WeiboSpider(scrapy.Spider):
    """Spider that alternates between follower lists and user post lists.

    ``parse`` handles a follower-list response and requests the post list of
    every user found in it; ``parse_list`` handles a post-list response,
    yields one ``WeiboItem`` per post and requests the follower list of each
    post author, which is fed back into ``parse``.
    """

    name = 'weibo'
    allowed_domains = ['weibo.cn']
    start_urls = ['https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_2027356850']

    def _load_cards(self, response):
        """Return ``data.cards`` from a JSON response, or ``[]`` if absent."""
        try:
            payload = json.loads(response.text)
        except ValueError:
            self.logger.warning('Response is not valid JSON: %s', response.url)
            return []

        data = payload.get('data') if isinstance(payload, dict) else None
        if not isinstance(data, dict):
            self.logger.warning('Response has no "data" object: %s', response.url)
            return []
        return data.get('cards') or []

    def parse(self, response):
        """Request the post list of every user in a follower-list response.

        Yields:
            ``scrapy.Request`` objects whose callback is ``parse_list``.
        """
        for card in self._load_cards(response):
            # Each card carries a group of entries; only those with a 'user'
            # key describe a user.
            for entry in card.get('card_group') or []:
                if 'user' not in entry:
                    continue
                user_id = entry['user']['id']
                # dont_filter=False keeps Scrapy's duplicate filter active,
                # which is what stops parse <-> parse_list from re-requesting
                # the same URLs.
                yield scrapy.Request(
                    _PROFILE_WEIBO_URL % user_id,
                    callback=self.parse_list,
                    dont_filter=False,
                )

    def parse_list(self, response):
        """Extract posts from a post-list response and follow their authors.

        Yields:
            First a ``WeiboItem`` for every card that has an ``mblog`` entry,
            then one ``scrapy.Request`` (callback ``parse``) per distinct
            author id seen in those cards.
        """
        author_ids = []
        for card in self._load_cards(response):
            if 'mblog' not in card:
                continue
            mblog = card['mblog']
            author = mblog['user']

            item = WeiboItem()
            item['name'] = author['screen_name']
            # Post text with markup tags stripped.
            item['info'] = remove_tags(mblog['text'])
            item['time'] = _normalize_created_at(mblog['created_at'])
            # Repost / comment / like counters are stored as strings.
            item['zhuanfa'] = str(mblog['reposts_count'])
            item['pinglun'] = str(mblog['comments_count'])
            item['zan'] = str(mblog['attitudes_count'])
            yield item

            if author['id'] not in author_ids:
                author_ids.append(author['id'])

        # Requests are emitted after all items, as before.
        for author_id in author_ids:
            yield scrapy.Request(
                _FOLLOWERS_URL % author_id,
                callback=self.parse,
                dont_filter=False,
            )