
Scraping the questions and answers under a Zhihu topic with Python 3

Approach: use Selenium's webdriver to click and scroll the page. Once the page has fully loaded, use BeautifulSoup to locate the relevant tags, collect the data into lists, and finally create an Excel workbook with xlwt and write the rows into it.
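The code below waits with fixed time.sleep() calls, which is fragile on a slow connection. For reference, Selenium's explicit waits block until an element actually exists — a minimal sketch, assuming the 'QuestionMainAction' class still marks the "view all answers" button:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://www.zhihu.com/question/xxxx')  # placeholder question URL
# block for up to 10 seconds until the button is clickable, then click it
button = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.CLASS_NAME, 'QuestionMainAction')))
button.click()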

Drawbacks: performance problems are easy to hit. A topic may have a great many answers, and writing rows into Excel starts to lag once you pass ten thousand or so. Also, the script does not close a page once it has finished with it; it simply opens the next one and keeps going, and only after every page has been processed are the windows closed in reverse order, like popping a stack. This consumes a lot of memory — one possible mitigation is sketched below.
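Assuming the parsed HTML is all that is needed afterwards, each driver could be quit as soon as BeautifulSoup has taken its copy of the page source — a sketch of how getHtml() could release its window early:

bs = BeautifulSoup(driver.page_source, 'lxml')
driver.quit()  # close this Chrome window right away; bs keeps its own copy of the HTML
# ...continue working with bs only...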

Open problem: I never managed to get the data into MySQL — it kept throwing all kinds of errors. The database code is at the end; I would appreciate pointers from anyone who understands this!

The error was roughly as follows:

pymysql.err.ProgrammingError: (1064, "You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near "inset into answerInfo(answerer,answer,upvoteCount,commentCount) values ('%s','%s','%s','%s')"""%(pymysql.escape_string(answerer),answer,voteCount,commentCount) at(xxx line 1")
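Two problems stand out in that message: the SQL keyword is misspelled ("inset" instead of "insert"), and the values are spliced into the statement with Python's % operator, which breaks as soon as an answer contains a quote character. Letting pymysql bind the parameters avoids the manual escaping entirely. A minimal sketch, assuming a local database with an answerInfo table matching the columns in the error message (connection details are placeholders):

import pymysql

def saveAnswer(conn, answerer, answer, upvoteCount, commentCount):
    with conn.cursor() as cursor:
        # %s here are pymysql placeholders, not string formatting — the driver
        # escapes each value itself, so pymysql.escape_string is unnecessary
        sql = ('insert into answerInfo(answerer, answer, upvoteCount, commentCount) '
               'values (%s, %s, %s, %s)')
        cursor.execute(sql, (answerer, answer, upvoteCount, commentCount))
    conn.commit()

conn = pymysql.connect(host='localhost', user='root', password='***',
                       database='zhihu', charset='utf8mb4')  # placeholders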
       

Code

from bs4 import BeautifulSoup
from selenium import webdriver
import time
import json
import xlwt


class ZhiHu():

    def __init__(self, topicURL):  # set up the scraper
        # Chrome-style request header (note: Selenium drives a real browser and
        # never reads this dict — it is kept only for reference)
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
        self.topicURL = topicURL  # the topic page to visit
        self.title = ''    # question title, filled in by getHtml()
        self.pubDate = ''  # question publish date, filled in by getHtml()
    def getHtml(self, url):
        driver = webdriver.Chrome()
        driver.get(url)
        # click the "view all answers" button
        driver.find_element_by_class_name('QuestionMainAction').click()
        time.sleep(3)
        # scroll down in a loop until the button that marks the end of the
        # answer list shows up; the page source must be re-parsed on every
        # pass, otherwise the check keeps looking at a stale snapshot
        while True:
            bs = BeautifulSoup(driver.page_source, 'lxml')
            b = bs.find('button', {'class': 'Button QuestionAnswers-answerButton Button--blue Button--spread'})
            if b is not None:
                break
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            time.sleep(1)  # give the lazily loaded answers time to arrive
        bs = BeautifulSoup(driver.page_source, 'lxml')
        self.title = bs.find('h1', class_='QuestionHeader-title').string
        self.pubDate = json.loads(bs.find('script', {'type': 'application/ld+json'}).get_text())["pubDate"][0:10]
        html = bs.find_all('div', {'class': 'List-item'})
        print(self.title + "\t:\tthis question has %d answers in total" % len(html))
        return html
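    # NOTE: getHtml() never quits its driver, so every question page keeps a
    # Chrome window open until the whole run ends — the memory drawback
    # described at the top of the post.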

    def downLoadToTxt(self, html, path):
        for tag in html:
            content = []
            content.append(self.title)
            content.append(self.pubDate)
            # answer body
            answer = tag.find('div', class_='RichContent-inner').find('span').get_text()
            # answerer's name
            answerer = json.loads(tag.find('div', class_='ContentItem AnswerItem')['data-zop'])["authorName"]
            content.append(answerer)
            # answer date (named answerTime so it does not shadow the time module)
            answerTime = tag.find('div', class_='ContentItem-time').find('span').get_text()[-10:]
            content.append(answerTime)
            # upvote count
            upvoteCount = json.loads(tag.find('div', class_='ContentItem AnswerItem')['data-za-extra-module'])["card"]["content"]["upvote_num"]
            content.append(str(upvoteCount))
            # comment count
            commentCount = json.loads(tag.find('div', class_='ContentItem AnswerItem')['data-za-extra-module'])["card"]["content"]["comment_num"]
            content.append(str(commentCount))
            content.append(answer)
            # 'a' appends, and the file is created automatically if missing
            with open(path, 'a', encoding='utf-8') as f:
                for field in content:
                    f.write(field + '\t')
                f.write('\n')
            print(answerer + '\n' + str(upvoteCount) + '\n' + str(commentCount) + '\n\n\n')
    def downLoadToExcel(self, html):
        result = []
        head = ['question', 'publish date', 'answerer', 'answer date', 'upvotes', 'comments', 'answer content']
        result.append(head)
        for tag in html:
            content = []
            content.append(self.title)
            content.append(self.pubDate)
            # answer body
            answer = tag.find('div', class_='RichContent-inner').find('span').get_text()
            # answerer's name
            answerer = json.loads(tag.find('div', class_='ContentItem AnswerItem')['data-zop'])["authorName"]
            content.append(answerer)
            # answer date (named answerTime so it does not shadow the time module)
            answerTime = tag.find('div', class_='ContentItem-time').find('span').get_text()[-10:]
            content.append(answerTime)
            # upvote count
            upvoteCount = json.loads(tag.find('div', class_='ContentItem AnswerItem')['data-za-extra-module'])["card"]["content"]["upvote_num"]
            content.append(str(upvoteCount))
            # comment count
            commentCount = json.loads(tag.find('div', class_='ContentItem AnswerItem')['data-za-extra-module'])["card"]["content"]["comment_num"]
            content.append(str(commentCount))
            content.append(answer)
            result.append(content)
        workbook = xlwt.Workbook(encoding='utf-8')
        booksheet = workbook.add_sheet('Sheet 1', cell_overwrite_ok=True)
        for i, row in enumerate(result):
            for j, col in enumerate(row):
                booksheet.write(i, j, col)
        workbook.save(self.title + '.xls')
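    # Note: the legacy .xls format holds at most 65536 rows per sheet, which is
    # part of the performance ceiling mentioned at the top of the post.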

    def getAnswerItemURLs(self):
        driver = webdriver.Chrome()
        driver.get(self.topicURL)
        time.sleep(2)
        # number of scroll-downs
        i = 5
        # scroll in a loop
        while i>0:
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            time.sleep(3)
            i=i-1
        bs = BeautifulSoup(driver.page_source, 'lxml')

        # every answer entry on the topic page
        AnswerItems=bs.find_all('div',class_='ContentItem AnswerItem')

        AnswerItemURLs=[]
        preURL="https://www.zhihu.com"
        for item in AnswerItems:
            tailURL=item.find('a')['href']
            URL=preURL+tailURL
            AnswerItemURLs.append(URL)
            print(URL)
        print("總共有%d條問題!"%len(AnswerItemURLs))
        return AnswerItemURLs

    def getArticleItemURLs(self):
        driver = webdriver.Chrome()
        driver.get(self.topicURL)
        time.sleep(2)
        # number of scroll-downs
        i = 5
        # scroll in a loop
        while i>0:
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            time.sleep(3)
            i=i-1
        bs = BeautifulSoup(driver.page_source, 'lxml')

        # every column-article entry on the topic page
        ArticleItems = bs.find_all('div', class_='ContentItem ArticleItem')

        ArticleItemURLs=[]
        preURL="https:"
        for item in ArticleItems:
            tailURL=item.find('a')['href']
            URL=preURL+tailURL
            ArticleItemURLs.append(URL)
            print(URL)
        print("總共有%d條問題!"%len(ArticleItemURLs))
        return ArticleItemURLs

zhihu = ZhiHu("topic URL")  # placeholder: the address of the Zhihu topic page

AnswerItemURLs = zhihu.getAnswerItemURLs()
for url in AnswerItemURLs:
    html = zhihu.getHtml(url)
    zhihu.downLoadToExcel(html)
print("ok")