[Python爬蟲]Scrapy框架爬取bilibili個人資訊
阿新 • 發佈:2019-02-04
啟動檔案main.py
from scrapy.cmdline import execute

if __name__ == '__main__':
    # Launch the spider in-process, equivalent to running
    # `scrapy crawl bili_gr_xx` on the command line.  The guard keeps the
    # crawl from starting as a side effect if this module is ever imported.
    execute('scrapy crawl bili_gr_xx'.split())
執行spider下的爬取檔案
# -*- coding: utf-8 -*-
import scrapy,json
from .. import items
class BiliGrXxSpider(scrapy.Spider):
    """Spider that fetches bilibili member profiles via the GetInfo AJAX API."""

    name = 'bili_gr_xx'
    allowed_domains = ['bilibili.com']
    # start_urls = ['http://bilibili.com/']

    # Member-id range to crawl (was hard-coded inline); range end is exclusive.
    MID_START = 1
    MID_END = 201

    def start_requests(self):
        """Issue one POST per member id.

        The GetInfo endpoint expects form data and a Referer header that
        matches the member's space page, so each request sets both.
        """
        url = 'https://space.bilibili.com/ajax/member/GetInfo'
        for mid in range(self.MID_START, self.MID_END):
            form = {
                'mid': str(mid),
                'csrf': '',
            }
            # For a GET you would use scrapy.Request(url=, callback=) instead.
            req = scrapy.FormRequest(url=url, formdata=form,
                                     callback=self.parse, method='POST')
            req.headers['referer'] = 'https://space.bilibili.com/{}/'.format(mid)
            yield req

    def parse(self, response):
        """Parse the JSON response into a ``bili_mysql`` item.

        Missing keys are tolerated: unverified accounts lack
        ``official_verify`` and failed lookups may return ``data`` as a
        non-dict error payload, which previously raised ``KeyError``.
        """
        payload = json.loads(response.text)
        data = payload.get('data')
        if not isinstance(data, dict):
            # NOTE(review): presumably the API signals failure here — skip it.
            return
        item = items.bili_mysql()
        item['name'] = data.get('name')
        item['ID'] = data.get('mid')
        item['sex'] = data.get('sex')
        item['tx_img'] = data.get('face')
        item['gr_biaoq'] = data.get('sign')
        # official_verify is absent for unverified accounts.
        item['chao'] = (data.get('official_verify') or {}).get('desc')
        # Normalize empty strings to None so the pipeline stores NULL.
        for key in item:
            if item[key] == '':
                item[key] = None
        yield item
items檔案
class bili_mysql(scrapy.Item):
    """Item carrying one bilibili member's profile fields."""

    name = scrapy.Field()      # display name
    ID = scrapy.Field()        # member id (mid)
    sex = scrapy.Field()       # declared gender
    tx_img = scrapy.Field()    # avatar image URL
    gr_biaoq = scrapy.Field()  # personal signature
    chao = scrapy.Field()      # official-verify description
settings.py配置檔案
只將修改過的地方記錄如下
# Wildcard import of shared project helpers from the piaot package.
from ..piaot import *
# Whether to obey robots.txt; disabled so the crawler is not blocked.
# (Default ROBOTSTXT_OBEY changed to False.)
ROBOTSTXT_OBEY = False
# Default request headers sent with every request (browser User-Agent).
DEFAULT_REQUEST_HEADERS = {
"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
}
# Item pipeline registration and its priority (lower runs first).
ITEM_PIPELINES = {
'bilibili_wj.pipelines.bilibili_mysql': 300,
}
pipelines.py儲存檔案
# -*- coding: utf-8 -*-
import pymysql
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
class BilibiliWjPipeline(object):
    """Default no-op pipeline: passes every item through unchanged."""

    def process_item(self, item, spider):
        # Nothing to do here; persistence is handled by bilibili_mysql.
        return item
class bilibili_mysql(object):
    """Pipeline that inserts each item into the ``xq_2`` table of the ``xq`` DB."""

    def process_item(self, item, spider):
        # Parameterized query: the previous str.format version broke on any
        # value containing a quote and was vulnerable to SQL injection.
        sql = "insert into xq_2 values(NULL, %s, %s, %s, %s, %s, %s)"
        args = (item['name'], item['ID'], item['sex'],
                item['tx_img'], item['gr_biaoq'], item['chao'])
        print(sql)
        # NOTE(review): opening a connection per item is slow — consider
        # moving connect/close into open_spider/close_spider.
        db = pymysql.connect("192.168.43.128", "root", "123456", "xq", charset='utf8')
        try:
            with db.cursor() as cursor:
                cursor.execute(sql, args)
            db.commit()
        finally:
            # Always release the connection, even if execute/commit raises.
            db.close()
        # Pipeline contract: return the item so later pipelines receive it.
        return item