程式人生 > scrapy框架爬取微博之spider檔案

scrapy框架爬取微博之spider檔案

# -*- coding: utf-8 -*-
import scrapy
from scrapy.settings import default_settings
import json
from ..items import WeiboItem
import re
from w3lib.html import remove_tags

class WeiboSpider(scrapy.Spider):
    """Crawl the Weibo mobile container API.

    Starting from one user's follower list, request each follower's
    timeline, emit a ``WeiboItem`` per post, then recurse into that
    poster's own follower list.
    """
    name = 'weibo'
    allowed_domains = ['weibo.cn']
    # Seed: the follower list of user 2027356850.
    start_urls = ['https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_2027356850']

    # URL templates for the two container endpoints used below
    # (each takes a numeric user id via %d).
    PROFILE_URL = 'https://m.weibo.cn/api/container/getIndex?containerid=230413%d_-_WEIBO_SECOND_PROFILE_WEIBO'
    FOLLOWERS_URL = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_%d'

    def parse(self, response):
        """Parse a follower-list JSON page; request each user's timeline.

        The original passed a fresh, empty WeiboItem in ``meta`` that
        ``parse_list`` never read — that dead code is removed.
        """
        data = json.loads(response.text)
        # Each card wraps a 'card_group' of entries; only entries with a
        # 'user' key are actual follower records (others are ads/headers).
        for card in data['data']['cards']:
            for entry in card['card_group']:
                if 'user' in entry:
                    url = self.PROFILE_URL % entry['user']['id']
                    yield scrapy.Request(url, callback=self.parse_list,
                                         dont_filter=False)

    def parse_list(self, response):
        """Parse a user-timeline JSON page.

        Yields one populated WeiboItem per post ('mblog' card) and one
        follow-up request into the poster's follower list.
        """
        data = json.loads(response.text)
        for card in data['data']['cards']:
            # Skip non-post cards (no 'mblog' payload).
            if 'mblog' not in card:
                continue
            mblog = card['mblog']

            # Strip HTML tags from the post body.
            info = remove_tags(mblog['text'])

            time = mblog['created_at']
            # NOTE(review): relative timestamps are rewritten to
            # hard-coded dates ('…前' -> '8-30', '昨天' -> '8-29').
            # This was only correct on the day the spider was written;
            # a real fix would use datetime arithmetic against today.
            if '前' in time:
                time = '8-30'
            if '昨天' in time:
                time = '8-29'

            item = WeiboItem()
            item['name'] = mblog['user']['screen_name']
            item['info'] = info
            item['time'] = time
            item['zhuanfa'] = str(mblog['reposts_count'])   # reposts
            item['pinglun'] = str(mblog['comments_count'])  # comments
            item['zan'] = str(mblog['attitudes_count'])     # likes
            yield item

            # Bug fix: the original re-iterated *all* cards here inside
            # the outer loop (shadowing its loop variable), building
            # O(n^2) duplicate follower-list requests per page — the
            # dupefilter dropped them, but they were still constructed.
            # One request for this post's author is sufficient.
            yield scrapy.Request(self.FOLLOWERS_URL % mblog['user']['id'],
                                 callback=self.parse, dont_filter=False)