python 爬取 豆瓣網 搜尋結果 同城活動 資料
阿新 • • 發佈:2018-12-13
主要使用的庫:
requests:爬蟲請求並獲取原始碼
re:使用正則表示式提取資料
json:使用JSON提取資料
pandas:使用pandas儲存資料
bs4:網頁程式碼解析
以下是原始碼:
#!coding=utf-8
import requests
import re
import json
from requests.packages.urllib3.exceptions import InsecureRequestWarning
import pandas as pd
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
from bs4 import BeautifulSoup
import lxml

# Endpoint that returns search results as JSON; each entry of its "items"
# list is an HTML fragment describing one result.
_SEARCH_URL = 'https://www.douban.com/j/search'

# The "start" offset advances by this much per request and paging stops
# before this upper bound (or earlier, on the first empty page).
_PAGE_STEP = 20
_MAX_START = 100000

# Column order of the exported CSV.
_COLUMNS = (
    'activity',   # channel name
    'title',      # event title
    'html',       # event URL
    'peoplenum',  # number of participants
    'date',       # event time
    'address1',   # province
    'address2',   # city
    'address3',   # detailed address
    'abstract',   # summary
)


def _info_field(lines, index):
    """Return ``lines[index]`` stripped, or '' when the line is missing."""
    if index < len(lines):
        return lines[index].strip()
    return ''


def _parse_item(fragment):
    """Turn one HTML result fragment into a dict keyed by ``_COLUMNS``.

    Elements or attributes that are absent from the fragment produce empty
    strings instead of raising, so one malformed result cannot abort the
    whole crawl.
    """
    soup = BeautifulSoup(fragment, 'lxml')
    channel = soup.find('span')
    link = soup.find('a')
    info = soup.find(class_='info')
    summary = soup.find('p')

    # The info text is split on newlines and read positionally: line 0 is
    # the participant count, line 2 the date, lines 4-6 the address parts.
    info_lines = info.get_text().strip().split('\n') if info is not None else []

    return {
        'activity': channel.get_text().strip() if channel is not None else '',
        'title': link.get('title', '').strip() if link is not None else '',
        'html': link.get('href', '').strip() if link is not None else '',
        # The first line was never stripped on its own in the original.
        'peoplenum': info_lines[0] if info_lines else '',
        'date': _info_field(info_lines, 2),
        'address1': _info_field(info_lines, 4),
        'address2': _info_field(info_lines, 5),
        'address3': _info_field(info_lines, 6),
        'abstract': summary.get_text().strip() if summary is not None else '',
    }


def doubanhudong(q, cat, output_path=r'E:\doubai.csv', timeout=30):
    """Crawl Douban search results for a channel and save them as CSV.

    Pages through the JSON search endpoint 20 results at a time until a page
    comes back without items, parses every HTML fragment, and writes all
    rows to ``output_path`` encoded as GB18030.

    Args:
        q: Search term.
        cat: Douban channel (category) id.
        output_path: Destination CSV file.
        timeout: Per-request timeout in seconds.

    Returns:
        The pandas DataFrame that was written to ``output_path``.

    Raises:
        requests.RequestException: On network failure or timeout.
        json.JSONDecodeError: If a response body is not valid JSON.
    """
    s = requests.session()
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'www.douban.com',
        'Upgrade-Insecure-Requests': '1',
        # NOTE(review): this key looks like a truncated 'X-Requested-With';
        # kept unchanged so the request stays exactly as before - confirm.
        'X-Requested-': 'XMLHttpRequest',
        'Referer': 'https://www.douban.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.15 Safari/537.36',
    }
    s.headers.update(headers)

    rows = []
    for start in range(0, _MAX_START, _PAGE_STEP):
        # Passing the query through ``params`` lets requests percent-encode
        # it, so search terms containing '&', '#', '+' etc. are sent intact.
        response = s.get(
            url=_SEARCH_URL,
            params={'q': q, 'start': start, 'cat': cat},
            # NOTE(review): certificate verification is disabled, as in the
            # original script; the matching warning is silenced at import.
            verify=False,
            timeout=timeout,
        )
        payload = json.loads(response.text)
        items = payload.get('items')
        if not items:
            # An empty (or absent) item list marks the end of the results.
            break
        rows.extend(_parse_item(fragment) for fragment in items)

    df = pd.DataFrame(rows, columns=list(_COLUMNS))
    df.to_csv(output_path, index=False, encoding="GB18030")
    return df


if __name__ == '__main__':
    q = '電影'  # search term
    cat = '1011'  # channel id
    doubanhudong(q, cat)
注:引數q為搜尋詞,cat為豆瓣頻道編號,同城活動為1011