
Scrapy Study Notes (3): Crawling Zhihu Homepage Questions and Answers

Goal: crawl the details of the first x questions on the Zhihu homepage, plus excerpts of a specified range of answers for each question.

Powered by:

  1. Python 3.6
  2. Scrapy 1.4
  3. json
  4. pymysql

Step 1: Overview
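
All of the snippets below are methods of one spider class, which these notes never show in full. Here is a minimal skeleton, reconstructed from the attributes those snippets reference (the class name is hypothetical; the capacha_index, next_page, and more_answer_url values are placeholders to fill in from your own browser's network panel, not verified endpoints):

import json
import re

import scrapy


class ZhihuSpider(scrapy.Spider):
    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']

    # Sent with every request; 'X-Xsrftoken' and 'authorization' are filled in
    # by login_zhihu() and parse() respectively
    headers = {
        'User-Agent': 'Mozilla/5.0 ...',
        'Referer': 'https://www.zhihu.com/',
    }

    # Login form data; '_xsrf' and 'captcha' are filled in during login
    post_data = {'email': 'your_email', 'password': 'your_password'}

    # Click coordinates of the 7 captcha characters on the 200x44 image,
    # e.g. [[x1, y1], [x2, y2], ...]; measure these against a real captcha
    capacha_index = []

    question_count = 50   # how many homepage questions to crawl
    answer_count = 100    # upper bound on answers per question
    answer_offset = 0     # offset of the first answer to fetch

    # Paging URL templates (placeholders; copy the real URLs from devtools).
    # next_page is formatted with (session_token, n) in parse();
    # more_answer_url with (question_id, n, n + 20) in parse_question()
    next_page = '...'
    more_answer_url = '...'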

Step 2: Simulating Login

Zhihu serves almost nothing to visitors who are not logged in, so the first task is to simulate a login.
The main steps:

  1. Fetch the xsrf token and the captcha image
  2. Enter the captcha and submit the login form
  3. Check whether the login succeeded

Fetch the xsrf token and the captcha image:

def start_requests(self):

    yield scrapy.Request('https://www.zhihu.com/', callback=self.login_zhihu)

def login_zhihu(self, response):
    """ 獲取xsrf及驗證碼圖片 """
    xsrf = re.findall(r'name="_xsrf" value="(.*?)"/>', response.text)[0]
    self.headers['X-Xsrftoken'] = xsrf
    self.post_data['_xsrf'] = xsrf

    times = re.findall(r'<script type="text/json" class="json-inline" data-n'
                       r'ame="ga_vars">{"user_created":0,"now":(\d+),', response.text)[0]
    captcha_url = 'https://www.zhihu.com/' + 'captcha.gif?r=' + times + '&type=login&lang=cn'

    yield scrapy.Request(captcha_url, headers=self.headers, meta={'post_data': self.post_data},
                         callback=self.veri_captcha)

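Note that both regular expressions are tied to the markup Zhihu served at the time these notes were written; if the login page changes, the _xsrf field and the ga_vars script block may move or disappear, so check them against the current page source if the crawl fails at this step.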

Enter the captcha and submit the login form:

def veri_captcha(self, response):
    """ 輸入驗證碼資訊進行登入 """
    with open('captcha.jpg', 'wb') as f:
        f.write(response.body)

    print('只有一個倒立文字則第二個位置為0')
    loca1 = input('input the loca 1:')
    loca2 = input('input the loca 2:')
    captcha = self.location(int(loca1), int(loca2))

    self.post_data = response.meta.get('post_data', {})
    self.post_data['captcha'] = captcha
    post_url = 'https://www.zhihu.com/login/email'

    yield scrapy.FormRequest(post_url, formdata=self.post_data, headers=self.headers,
                             callback=self.login_success)

def location(self, a, b):
    """ Convert the typed positions into the captcha payload """
    if b != 0:
        captcha = '{"img_size":[200,44],"input_points":[%s,%s]}' % (
            str(self.capacha_index[a - 1]),
            str(self.capacha_index[b - 1]))
    else:
        captcha = '{"img_size":[200,44],"input_points":[%s]}' % str(self.capacha_index[a - 1])
    return captcha
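
For context: at the time, Zhihu's captcha displayed seven Chinese characters, one or two of them upside down, and the login form expected the click coordinates of the inverted ones. location() simply maps the typed 1-based positions to the pre-measured coordinates in capacha_index and packs them into the JSON payload the endpoint expects, e.g. location(3, 0) when only the third character is inverted.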

Check whether the login succeeded:

def login_success(self, response):

    if 'err' in response.text:
        print(response.text)
        print("error!!!!!!")
    else:
        print("successful!!!!!!")
        yield scrapy.Request('https://www.zhihu.com', headers=self.headers, dont_filter=True)


Step 3: Fetching Homepage Questions

Extracting the first page of questions is just a matter of pulling the question URLs out of the response, but the homepage initially shows only about 10 of them; to collect more, the feed's paging requests have to be simulated.

def parse(self, response):
    """ 獲取首頁問題 """
    question_urls = re.findall(r'https://www.zhihu.com/question/(\d+)', response.text)

    # session_token and authorization, both needed for paging, can be found in the homepage source
    self.session_token = re.findall(r'session_token=([0-9,a-z]{32})', response.text)[0]
    auto = re.findall(r'carCompose&quot;:&quot;(.*?)&quot', response.text)[0]
    self.headers['authorization'] = 'Bearer ' + auto

    # Questions on the first page of the homepage
    for url in question_urls:
        question_detail = 'https://www.zhihu.com/question/' + url
        yield scrapy.Request(question_detail, headers=self.headers, callback=self.parse_question)

    # Page through the feed until the requested number of questions is reached
    n = 10
    while n < self.question_count:
        yield scrapy.Request(self.next_page.format(self.session_token, n), headers=self.headers,
                             callback=self.get_more_question)
        n += 10


def get_more_question(self, response):
    """ 獲取更多首頁問題 """
    question_url = 'https://www.zhihu.com/question/{0}'
    questions = json.loads(response.text)

    for que in questions['data']:
        question_id = re.findall(r'(\d+)', que['target']['question']['url'])[0]
        yield scrapy.Request(question_url.format(question_id), headers=self.headers,
                             callback=self.parse_question)
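
For reference, this parser assumes the paging endpoint returns JSON of roughly the following shape. The structure is reconstructed from the fields get_more_question() accesses, and the values are purely illustrative:

{
    "data": [
        {
            "target": {
                "question": {
                    "url": "https://api.zhihu.com/questions/12345678"
                }
            }
        }
    ]
}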

Step 4: Parsing Question Details

Parse the question page for the question's details, then request the specified number of answers within the specified range.

Item structure:

class ZhihuQuestionItem(scrapy.Item):

    name = scrapy.Field()
    url = scrapy.Field()
    keywords = scrapy.Field()
    answer_count = scrapy.Field()
    comment_count = scrapy.Field()
    flower_count = scrapy.Field()
    date_created = scrapy.Field()

Extract the question details and request answers in the specified range:

def parse_question(self, response):
    """ 解析問題詳情及獲取指定範圍答案 """
    text = response.text
    item = ZhihuQuestionItem()

    item['name'] = re.findall(r'<meta itemprop="name" content="(.*?)"', text)[0]
    item['url'] = re.findall(r'<meta itemprop="url" content="(.*?)"', text)[0]
    item['keywords'] = re.findall(r'<meta itemprop="keywords" content="(.*?)"', text)[0]
    item['answer_count'] = re.findall(r'<meta itemprop="answerCount" content="(.*?)"', text)[0]
    item['comment_count'] = re.findall(r'<meta itemprop="commentCount" content="(.*?)"', text)[0]
    item['flower_count'] = re.findall(r'<meta itemprop="zhihu:followerCount" '
                                      r'content="(.*?)"', text)[0]
    item['date_created'] = re.findall(r'<meta itemprop="dateCreated" content="(.*?)"', text)[0]

    count_answer = int(item['answer_count'])
    yield item

    question_id = int(re.match(r'https://www.zhihu.com/question/(\d+)', response.url).group(1))

    # Fetch the specified number of answers, starting from the given offset
    if count_answer > self.answer_count:
        count_answer = self.answer_count
    n = self.answer_offset
    while n + 20 <= count_answer:
        yield scrapy.Request(self.more_answer_url.format(question_id, n, n + 20), 
                             headers=self.headers, callback=self.parse_answer)
        n += 20
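
One subtlety: the loop only requests full pages of 20 answers, so with answer_offset = 0 and answer_count = 50 it fetches answers 0 through 40 and silently skips the trailing partial page; relax the condition to n < count_answer if the remainder matters to you.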

Step 5: Fetching Answers

The answer URLs requested in parse_question() return JSON.

Item structure:

class ZhihuAnswerItem(scrapy.Item):

    question_id = scrapy.Field()
    author = scrapy.Field()
    ans_url = scrapy.Field()
    comment_count = scrapy.Field()
    upvote_count = scrapy.Field()
    excerpt = scrapy.Field()

Parse the answers:

def parse_answer(self, response):
    """ 解析獲取到的指定範圍答案 """
    answers = json.loads(response.text)

    for ans in answers['data']:
        item = ZhihuAnswerItem()
        item['question_id'] = re.match(r'http://www.zhihu.com/api/v4/questions/(\d+)', 
                                       ans['question']['url']).group(1)
        item['author'] = ans['author']['name']
        item['ans_url'] = ans['url']
        item['comment_count'] = ans['comment_count']
        item['upvote_count'] = ans['voteup_count']
        item['excerpt'] = ans['excerpt']

        yield item
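
As with question paging, the answer endpoint is assumed to return JSON of roughly this shape, reconstructed from the fields parse_answer() reads; the values are illustrative:

{
    "data": [
        {
            "question": {"url": "http://www.zhihu.com/api/v4/questions/12345678"},
            "author": {"name": "SomeUser"},
            "url": "http://www.zhihu.com/api/v4/answers/87654321",
            "comment_count": 12,
            "voteup_count": 345,
            "excerpt": "The first few sentences of the answer..."
        }
    ]
}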

Step 6: Storing Questions and Answers in MySQL

import pymysql
from scrapy.utils.project import get_project_settings


class ZhihuPipeline(object):

    def __init__(self):

        self.settings = get_project_settings()
        self.connect = pymysql.connect(
            host=self.settings['MYSQL_HOST'],
            db=self.settings['MYSQL_DBNAME'],
            user=self.settings['MYSQL_USER'],
            passwd=self.settings['MYSQL_PASSWD'],
            charset=self.settings['MYSQL_CHARSET'],
            use_unicode=True
        )
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):

        if item.__class__.__name__ == 'ZhihuQuestionItem':
            sql = 'insert into Scrapy_test.zhihuQuestion(name,url,keywords,answer_count,' \
                  'flower_count,comment_count,date_created) values (%s,%s,%s,%s,%s,%s,%s)'

            self.cursor.execute(sql, (item['name'], item['url'], item['keywords'],
                                      item['answer_count'], item['flower_count'],
                                      item['comment_count'], item['date_created']))
        else:
            sql = 'insert into Scrapy_test.zhihuAnswer(question_id,author,ans_url,' \
                  'upvote_count,comment_count,excerpt) values (%s,%s,%s,%s,%s,%s)'

            self.cursor.execute(sql, (item['question_id'], item['author'],
                                      item['ans_url'], item['upvote_count'],
                                      item['comment_count'], item['excerpt']))
        self.connect.commit()
        return item
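
For the pipeline to run, settings.py needs the MySQL keys it looks up, plus the pipeline registration. The key names come straight from the code above; the pipeline path assumes a project named zhihu and is otherwise hypothetical, and the values are examples:

ITEM_PIPELINES = {
    'zhihu.pipelines.ZhihuPipeline': 300,
}

MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'Scrapy_test'
MYSQL_USER = 'root'
MYSQL_PASSWD = 'your_password'
MYSQL_CHARSET = 'utf8'

The two tables must also exist beforehand. A minimal sketch of the DDL, with column names taken from the insert statements and types chosen as plausible defaults:

CREATE TABLE Scrapy_test.zhihuQuestion (
    name          VARCHAR(255),
    url           VARCHAR(255),
    keywords      VARCHAR(255),
    answer_count  INT,
    flower_count  INT,
    comment_count INT,
    date_created  VARCHAR(32)
);

CREATE TABLE Scrapy_test.zhihuAnswer (
    question_id   VARCHAR(32),
    author        VARCHAR(64),
    ans_url       VARCHAR(255),
    upvote_count  INT,
    comment_count INT,
    excerpt       TEXT
);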

Results:

(Screenshots of the crawled questions and answers stored in MySQL.)