python3爬取知乎某話題下的若干個問題及其回答
阿新 • • 發佈:2018-12-21
思路:通過selenium的webdriver實現頁面的點選、下拉等操作。頁面完全載入後通過beautifulsoup來查詢相應的標籤。將資料放到列表中,然後通過xlwt建立excel,並把資料存進去。
缺點:容易遇到效能問題。可能一個話題有很多的回覆,而對於往excel中插入資料來說,上萬條就會有卡頓了。其次,此程式碼處理完一個頁面後並不會關閉,而是接著開啟下一個網頁繼續處理。等所有網頁都處理完後,再從後往前出棧的方式關閉。對記憶體消耗大。
疑難雜症:沒能夠實現將資料存到mysql中,一直報各種錯,資料庫的相關程式碼在最後,希望有明白人指點一下啊!(其實從下方的報錯資訊即可看出原因:SQL 關鍵字 insert 被拼成了 inset,修正拼寫後語法錯誤即可消除。)
報錯內容大概是下面這樣:
pymysql.err.ProgrammingError: (1064, "You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near "inset into answerInfo(answerer,answer,upvoteCount,commentCount) values ('%s','%s','%s','%s')"""%(pymysql.escape_string(answerer),answer,voteCount,commentCount) at(xxx line 1")
程式碼
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import json
import xlwt


class ZhiHu():
    """Scrape a Zhihu topic page: collect question/article URLs, then dump
    every answer of each question to a text file or an Excel workbook.

    Uses Selenium to drive a real Chrome browser (click / scroll until the
    page is fully loaded) and BeautifulSoup to parse the resulting HTML.
    """

    def __init__(self, topicURL):
        # NOTE(review): Selenium drives a real browser, so this header dict is
        # never actually sent; kept only for parity with the original code.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
        self.topicURL = topicURL  # topic page to visit
        # Filled in by getHtml(). The original stored these in module-level
        # globals, silently coupling methods together; instance state is safer.
        self.title = ''
        self.pubDate = 0

    def getHtml(self, url):
        """Open one question page, expand/scroll until all answers are loaded,
        and return the list of answer <div class="List-item"> tags.

        Side effects: sets self.title and self.pubDate for later dumps.
        """
        driver = webdriver.Chrome()
        try:
            driver.get(url)
            # Click the "view all answers" button.
            driver.find_element_by_class_name('QuestionMainAction').click()
            time.sleep(3)
            bs = BeautifulSoup(driver.page_source, 'lxml')
            # Keep scrolling until the footer "write answer" button appears,
            # which signals that every answer has been rendered.
            while bs.find('button', {'class': 'Button QuestionAnswers-answerButton Button--blue Button--spread'}) is None:
                driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
                bs = BeautifulSoup(driver.page_source, 'lxml')
        finally:
            # Close the browser as soon as the page source is captured; the
            # original kept every window open until interpreter exit, which
            # is the memory problem described in the write-up above.
            driver.quit()
        self.title = bs.find('h1', class_='QuestionHeader-title').string
        # First 10 chars of the ld+json pubDate, i.e. the YYYY-MM-DD part.
        self.pubDate = json.loads(
            bs.find('script', {'type': 'application/ld+json'}).get_text())["pubDate"][0:10]
        html = bs.find_all('div', {'class': 'List-item'})
        print(self.title + "\t:\t此問題總共有%d條回答" % len(html))
        return html

    def _buildRow(self, tag):
        """Extract one answer row:
        [title, pubDate, answerer, answer date, upvotes, comments, text].

        Shared by downLoadToTxt and downLoadToExcel (previously duplicated).
        """
        answer = tag.find('div', class_='RichContent-inner').find('span').get_text()
        item = tag.find('div', class_='ContentItem AnswerItem')
        answerer = json.loads(item['data-zop'])["authorName"]
        # Last 10 chars of the timestamp text, i.e. the YYYY-MM-DD part.
        # (Local no longer named `time` -- that shadowed the time module.)
        answerDate = tag.find('div', class_='ContentItem-time').find('span').get_text()[-10:]
        # Parse the stats JSON once instead of twice per answer.
        stats = json.loads(item['data-za-extra-module'])["card"]["content"]
        return [self.title, self.pubDate, answerer, answerDate,
                str(stats["upvote_num"]), str(stats["comment_num"]), answer]

    def downLoadToTxt(self, html, path):
        """Append one tab-separated line per answer to *path*.

        The file is opened in append mode and created automatically if it
        does not exist.
        """
        for tag in html:
            content = self._buildRow(tag)
            # `with` closes the file; the original's explicit f.close() inside
            # the with-block was redundant. Encoding pinned to UTF-8.
            with open(path, 'a', encoding='utf-8') as f:
                for field in content:
                    f.write(field + '\t')
                f.write('\n')
            # content[2]=answerer, content[4]=upvotes, content[5]=comments
            print(content[2] + '\n' + content[4] + '\n' + content[5] + '\n\n\n')

    def downLoadToExcel(self, html):
        """Write a header row plus one row per answer into '<title>.xls'."""
        head = ['問題', '釋出時間', '回答人', '回答時間', '贊同數', '評論數', '回答內容']
        result = [head] + [self._buildRow(tag) for tag in html]
        workbook = xlwt.Workbook(encoding='utf-8')
        booksheet = workbook.add_sheet('Sheet 1', cell_overwrite_ok=True)
        for i, row in enumerate(result):
            for j, col in enumerate(row):
                booksheet.write(i, j, col)
        workbook.save(self.title + '.xls')

    def _collectItemURLs(self, itemClass, preURL, scrolls=5):
        """Scroll the topic page *scrolls* times, then return the links of all
        items whose div class matches *itemClass*, prefixed with *preURL*.

        Shared by getAnswerItemURLs / getArticleItemURLs (previously
        duplicated almost verbatim).
        """
        driver = webdriver.Chrome()
        try:
            driver.get(self.topicURL)
            time.sleep(2)
            for _ in range(scrolls):
                driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
                time.sleep(3)
            bs = BeautifulSoup(driver.page_source, 'lxml')
        finally:
            driver.quit()
        urls = []
        for item in bs.find_all('div', class_=itemClass):
            URL = preURL + item.find('a')['href']
            urls.append(URL)
            print(URL)
        print("總共有%d條問題!" % len(urls))
        return urls

    def getAnswerItemURLs(self):
        """Return the absolute URLs of every answer item on the topic page."""
        return self._collectItemURLs('ContentItem AnswerItem', "https://www.zhihu.com")

    def getArticleItemURLs(self):
        """Return the absolute URLs of every column-article item on the topic page."""
        # Article hrefs are protocol-relative ("//zhuanlan..."), hence the
        # bare "https:" prefix instead of a full host.
        return self._collectItemURLs('ContentItem ArticleItem', "https:")


if __name__ == "__main__":
    # Guarded so importing this module no longer launches a browser.
    zhihu = ZhiHu("話題地址")
    AnswerItemURLs = zhihu.getAnswerItemURLs()
    for url in AnswerItemURLs:
        html = zhihu.getHtml(url)
        zhihu.downLoadToExcel(html)
    print("ok")