[Python爬蟲]Scrapy框架爬取bilibili個人資訊

[Python爬蟲]Scrapy框架爬取bilibili個人資訊

啟動檔案main.py

# Entry point: launch the 'bili_gr_xx' spider exactly as the command line
# `scrapy crawl bili_gr_xx` would.
from scrapy.cmdline import execute

execute(['scrapy', 'crawl', 'bili_gr_xx'])

執行spider下的爬取檔案

# -*- coding: utf-8 -*-
import scrapy,json
from .. import items

class BiliGrXxSpider(scrapy.Spider):
    """Spider that POSTs bilibili's member-info AJAX endpoint for mids 1-200
    and yields one `bili_mysql` item per profile."""

    name = 'bili_gr_xx'
    allowed_domains = ['bilibili.com']
    # start_urls = ['http://bilibili.com/']

    def start_requests(self):
        # Used instead of start_urls: every request hits the same AJAX URL,
        # varying only the `mid` form field and the referer header.
        url = 'https://space.bilibili.com/ajax/member/GetInfo'
        for mid in range(1, 201):
            form = {
                'mid': str(mid),
                'csrf': '',
            }
            referer = 'https://space.bilibili.com/{}/'.format(mid)
            # For a GET this would be scrapy.Request(url=, callback=);
            # the endpoint wants a POST form instead.
            request = scrapy.FormRequest(url=url, formdata=form,
                                         callback=self.parse, method='POST')
            # The endpoint checks the referer matches the member's space page.
            request.headers['referer'] = referer
            yield request

    def parse(self, response):
        print('--' * 20)
        item = items.bili_mysql()
        payload = json.loads(response.text)
        # print(payload)
        profile = payload['data']
        item['name'] = profile['name']
        item['ID'] = profile['mid']
        item['sex'] = profile['sex']
        item['tx_img'] = profile['face']
        item['gr_biaoq'] = profile['sign']
        item['chao'] = profile['official_verify']['desc']
        # Normalize empty strings to None so MySQL stores NULL instead of ''.
        for field in item:
            if item[field] == '':
                item[field] = None
        yield item

items檔案

class bili_mysql(scrapy.Item):
    # One bilibili user profile scraped by BiliGrXxSpider, persisted to MySQL
    # by the `bilibili_mysql` pipeline.
    # define the fields for your item here like:
    # name = scrapy.Field()
    name=scrapy.Field()      # display name (from data.name)
    ID=scrapy.Field()        # numeric member id / mid (from data.mid)
    sex=scrapy.Field()       # gender string (from data.sex)
    tx_img=scrapy.Field()    # avatar image URL (from data.face)
    gr_biaoq=scrapy.Field()  # personal signature (from data.sign)
    chao=scrapy.Field()      # official-verify description (from data.official_verify.desc)

settings.py配置檔案

將改的地方寫了下來

# Import shared helpers (wildcard import from the sibling `piaot` module).
from ..piaot import *

# Whether to obey robots.txt; disabled so the AJAX endpoint can be fetched.
# ROBOTSTXT_OBEY changed to False
ROBOTSTXT_OBEY = False

# Default request headers: a browser User-Agent so bilibili serves normal
# responses instead of blocking the default Scrapy UA.
DEFAULT_REQUEST_HEADERS = {

  "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
}

# Enabled item pipelines and their priority (lower runs earlier).
ITEM_PIPELINES = {

  'bilibili_wj.pipelines.bilibili_mysql': 300,

}

pipelines.py儲存檔案

# -*- coding: utf-8 -*-
import pymysql
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class BilibiliWjPipeline(object):
    """Default project pipeline: passes items through unchanged."""

    def process_item(self, item, spider):
        # No-op; returning the item lets later pipelines keep processing it.
        return item


class bilibili_mysql(object):
    """Pipeline that inserts each scraped bilibili profile into the
    MySQL table `xq_2` (database `xq` at 192.168.43.128)."""

    def process_item(self, item, spider):
        # Parameterized query instead of string .format(): the driver escapes
        # the values, which prevents SQL injection and avoids broken SQL when
        # a name/signature contains quotes. Column order matches the original:
        # auto-id (NULL), name, ID, sex, tx_img, gr_biaoq, chao.
        sql = "insert into xq_2 values(NULL,%s,%s,%s,%s,%s,%s)"
        params = (item['name'], item['ID'], item['sex'],
                  item['tx_img'], item['gr_biaoq'], item['chao'])
        print(sql)
        # NOTE(review): opening a connection per item is slow — consider the
        # open_spider/close_spider hooks; kept here for behavior parity.
        db = pymysql.connect("192.168.43.128", "root", "123456", "xq",
                             charset='utf8')
        try:
            cursor = db.cursor()
            cursor.execute(sql, params)
            db.commit()
        finally:
            # Always release the connection, even if the insert fails —
            # the original leaked it on any exception.
            db.close()
        # Scrapy pipeline contract: return the item so later pipelines see it
        # (the original returned None, silently dropping the item downstream).
        return item