python3爬取知乎某話題下的若干個問題及其回答
阿新 • • 發佈:2018-12-21
思路:通過selenium的webdriver實現頁面的點選、下拉等操作。頁面完全載入後通過beautifulsoup來查詢相應的標籤。將資料放到列表中,然後通過xlwt建立excel,並把資料存進去。
缺點:容易遇到效能問題。可能一個話題有很多的回覆,而對於往excel中插入資料來說,上萬條就會有卡頓了。其次,此程式碼處理完一個頁面後並不會關閉,而是接著開啟下一個網頁繼續處理。等所有網頁都處理完後,再從後往前出棧的方式關閉。對記憶體消耗大。
疑難雜症:沒能夠實現將資料存到mysql中,一直報各種錯,資料庫的相關程式碼在最後,希望有明白人指點一下啊!(其實從下方的報錯資訊即可看出原因:SQL 關鍵字 insert 被拼成了 inset,修正拼寫後語法錯誤即可消除。)
報錯內容大概是下面這樣:
pymysql.err.ProgrammingError: (1064, "You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near "inset into answerInfo(answerer,answer,upvoteCount,commentCount) values ('%s','%s','%s','%s')"""%(pymysql.escape_string(answerer),answer,voteCount,commentCount) at(xxx line 1")
程式碼
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import json
import xlwt


class ZhiHu():
    """Scrape a Zhihu topic page: collect question/article URLs, then dump
    every answer of each question to a text file or an Excel workbook.

    Uses Selenium to drive a real Chrome browser (click / scroll until the
    page is fully loaded) and BeautifulSoup to parse the resulting HTML.
    """

    def __init__(self, topicURL):
        # NOTE(review): Selenium drives a real browser, so this header dict is
        # never actually sent; kept only for parity with the original code.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
        self.topicURL = topicURL  # topic page to visit
        # Filled in by getHtml(). The original stored these in module-level
        # globals, silently coupling methods together; instance state is safer.
        self.title = ''
        self.pubDate = 0

    def getHtml(self, url):
        """Open one question page, expand/scroll until all answers are loaded,
        and return the list of answer <div class="List-item"> tags.

        Side effects: sets self.title and self.pubDate for later dumps.
        """
        driver = webdriver.Chrome()
        try:
            driver.get(url)
            # Click the "view all answers" button.
            driver.find_element_by_class_name('QuestionMainAction').click()
            time.sleep(3)
            bs = BeautifulSoup(driver.page_source, 'lxml')
            # Keep scrolling until the footer "write answer" button appears,
            # which signals that every answer has been rendered.
            while bs.find('button', {'class': 'Button QuestionAnswers-answerButton Button--blue Button--spread'}) is None:
                driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
                bs = BeautifulSoup(driver.page_source, 'lxml')
        finally:
            # Close the browser as soon as the page source is captured; the
            # original kept every window open until interpreter exit, which
            # is the memory problem described in the write-up above.
            driver.quit()
        self.title = bs.find('h1', class_='QuestionHeader-title').string
        # First 10 chars of the ld+json pubDate, i.e. the YYYY-MM-DD part.
        self.pubDate = json.loads(
            bs.find('script', {'type': 'application/ld+json'}).get_text())["pubDate"][0:10]
        html = bs.find_all('div', {'class': 'List-item'})
        print(self.title + "\t:\t此問題總共有%d條回答" % len(html))
        return html

    def _buildRow(self, tag):
        """Extract one answer row:
        [title, pubDate, answerer, answer date, upvotes, comments, text].

        Shared by downLoadToTxt and downLoadToExcel (previously duplicated).
        """
        answer = tag.find('div', class_='RichContent-inner').find('span').get_text()
        item = tag.find('div', class_='ContentItem AnswerItem')
        answerer = json.loads(item['data-zop'])["authorName"]
        # Last 10 chars of the timestamp text, i.e. the YYYY-MM-DD part.
        # (Local no longer named `time` -- that shadowed the time module.)
        answerDate = tag.find('div', class_='ContentItem-time').find('span').get_text()[-10:]
        # Parse the stats JSON once instead of twice per answer.
        stats = json.loads(item['data-za-extra-module'])["card"]["content"]
        return [self.title, self.pubDate, answerer, answerDate,
                str(stats["upvote_num"]), str(stats["comment_num"]), answer]

    def downLoadToTxt(self, html, path):
        """Append one tab-separated line per answer to *path*.

        The file is opened in append mode and created automatically if it
        does not exist.
        """
        for tag in html:
            content = self._buildRow(tag)
            # `with` closes the file; the original's explicit f.close() inside
            # the with-block was redundant. Encoding pinned to UTF-8.
            with open(path, 'a', encoding='utf-8') as f:
                for field in content:
                    f.write(field + '\t')
                f.write('\n')
            # content[2]=answerer, content[4]=upvotes, content[5]=comments
            print(content[2] + '\n' + content[4] + '\n' + content[5] + '\n\n\n')

    def downLoadToExcel(self, html):
        """Write a header row plus one row per answer into '<title>.xls'."""
        head = ['問題', '釋出時間', '回答人', '回答時間', '贊同數', '評論數', '回答內容']
        result = [head] + [self._buildRow(tag) for tag in html]
        workbook = xlwt.Workbook(encoding='utf-8')
        booksheet = workbook.add_sheet('Sheet 1', cell_overwrite_ok=True)
        for i, row in enumerate(result):
            for j, col in enumerate(row):
                booksheet.write(i, j, col)
        workbook.save(self.title + '.xls')

    def _collectItemURLs(self, itemClass, preURL, scrolls=5):
        """Scroll the topic page *scrolls* times, then return the links of all
        items whose div class matches *itemClass*, prefixed with *preURL*.

        Shared by getAnswerItemURLs / getArticleItemURLs (previously
        duplicated almost verbatim).
        """
        driver = webdriver.Chrome()
        try:
            driver.get(self.topicURL)
            time.sleep(2)
            for _ in range(scrolls):
                driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
                time.sleep(3)
            bs = BeautifulSoup(driver.page_source, 'lxml')
        finally:
            driver.quit()
        urls = []
        for item in bs.find_all('div', class_=itemClass):
            URL = preURL + item.find('a')['href']
            urls.append(URL)
            print(URL)
        print("總共有%d條問題!" % len(urls))
        return urls

    def getAnswerItemURLs(self):
        """Return the absolute URLs of every answer item on the topic page."""
        return self._collectItemURLs('ContentItem AnswerItem', "https://www.zhihu.com")

    def getArticleItemURLs(self):
        """Return the absolute URLs of every column-article item on the topic page."""
        # Article hrefs are protocol-relative ("//zhuanlan..."), hence the
        # bare "https:" prefix instead of a full host.
        return self._collectItemURLs('ContentItem ArticleItem', "https:")


if __name__ == "__main__":
    # Guarded so importing this module no longer launches a browser.
    zhihu = ZhiHu("話題地址")
    AnswerItemURLs = zhihu.getAnswerItemURLs()
    for url in AnswerItemURLs:
        html = zhihu.getHtml(url)
        zhihu.downLoadToExcel(html)
    print("ok")