網路爬蟲--python抓取豆瓣同城北京地區活動資訊
阿新 • • 發佈:2019-02-13
import re import requests import os import sys #url = 'https://beijing.douban.com/events/future-music?start=0' #header = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0'} #定義一個全域性目錄路徑 path = os.path.abspath(os.path.dirname(sys.argv[0])) class douban_local_activity(object): def __init__(self): print('開始爬取內容') #獲取單個連線html文字 def getSource(self,url): html = requests.get(url) html.encoding = 'utf-8' return html.text #根據頁數獲取所有的連結 def changePage(self,url,totalPage): startPageNum = int(re.search('start=(\d+)',url,re.S).group(1)) pageGroup = [] for i in range(startPageNum,totalPage+1): perLink = re.sub('start=\d+','start=%s' % (i*10),url,re.S) pageGroup.append(perLink) return pageGroup #抓取單頁所有的活動資訊 def getAllEvents(self,source): biggerHtml = re.search('<ul class="events-list(.*?)<div class="paginator">', source, re.S).group(1) events = re.findall('(<li class="list-entry".*?</p>\s+</div>\s+</li>)', biggerHtml, re.S) return events #獲取每個活動的詳細資訊 def getEntity(self,event): entity = {} entity['title'] = re.search('<span itemprop="summary">(.*?)</span>',event,re.S).group(1) entity['time'] = re.search('時間:</span>\s+(.*?)\s+<time',event,re.S).group(1) entity['position'] = re.search('<li title="(.*?)">\s+<span',event,re.S).group(1) entity['fee'] = re.search('<strong>(.*?)</strong>',event,re.S).group(1) return entity #將活動資訊儲存到文字檔案中 def saveEntity(self,eventInfo): f = open(path+'/douban_events.txt','a',encoding='utf-8') for event in eventInfo: f.writelines('title:' + event['title'] + '\n') f.writelines('time:' + event['time'] + '\n') f.writelines('position:' + event['position'] + '\n') f.writelines('fee:' + event['fee'] + '\n') f.writelines('\n') f.close() if __name__ == '__main__': eventInfo = [] url = 'https://beijing.douban.com/events/future-music?start=0' activity = douban_local_activity() links = activity.changePage(url,10) for link in links: print('正在處理頁面:' + link) html = activity.getSource(link) allEvents = activity.getAllEvents(html) for item in allEvents: entity = activity.getEntity(item) eventInfo.append(entity) activity.saveEntity(eventInfo)