1. 程式人生 > 網路爬蟲--python抓取豆瓣同城北京地區活動資訊

網路爬蟲--python抓取豆瓣同城北京地區活動資訊

import re
import requests
import os
import sys
#url =  'https://beijing.douban.com/events/future-music?start=0'
#header = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0'}

# Module-wide output directory: the absolute path of the folder part of sys.argv[0].
path = os.path.abspath(os.path.split(sys.argv[0])[0])

class douban_local_activity(object):
	"""Scraper helpers for Douban event-listing pages.

	The methods build the list of page URLs, download a page, cut the page
	into per-event HTML fragments with regular expressions, extract a few
	fields from each fragment and append the results to a text file.
	"""

	# Step of the ``start`` query parameter between two consecutive pages
	# (the multiplier the URL builder has always used).
	_PAGE_SIZE = 10

	def __init__(self):
		print('開始爬取內容')

	@staticmethod
	def _firstGroup(pattern, text):
		"""Return group 1 of the first match of ``pattern`` in ``text``.

		The search uses ``re.S`` so ``.`` also matches newlines. An empty
		string is returned when the pattern does not match, so one missing
		field does not abort the whole crawl.
		"""
		match = re.search(pattern, text, re.S)
		return match.group(1) if match else ''

	def getSource(self, url, timeout=10):
		"""Download ``url`` and return the response body decoded as UTF-8.

		:param url: address of the page to fetch.
		:param timeout: seconds handed to ``requests.get``; without it a
			stalled connection would block the crawl indefinitely.
		:return: the response text.
		"""
		response = requests.get(url, timeout=timeout)
		response.encoding = 'utf-8'
		return response.text

	def changePage(self, url, totalPage):
		"""Build the URLs of every listing page to visit.

		:param url: a listing URL containing a ``start=<offset>`` parameter.
		:param totalPage: index of the last page to include (inclusive), so
			``start=0`` with ``totalPage=10`` yields 11 URLs (indexes 0..10).
		:return: list of URLs whose ``start`` value is ``index * 10``.
		:raises ValueError: if ``url`` has no ``start=<number>`` parameter.
		"""
		match = re.search(r'start=(\d+)', url)
		if match is None:
			raise ValueError('url has no "start=<number>" parameter: %r' % (url,))
		# The URL carries an item offset, while the loop below counts pages:
		# convert the offset to a page index before iterating.
		firstPage = int(match.group(1)) // self._PAGE_SIZE
		# NOTE: re.sub's 4th positional argument is ``count``, not ``flags``,
		# so no flag is passed positionally here (none is needed anyway).
		return [
			re.sub(r'start=\d+', 'start=%d' % (page * self._PAGE_SIZE), url)
			for page in range(firstPage, totalPage + 1)
		]

	def getAllEvents(self, source):
		"""Split one listing page into per-event HTML fragments.

		:param source: HTML text of a listing page.
		:return: list of ``<li class="list-entry" ...>...</li>`` fragments;
			empty when the page contains no events list / paginator pair.
		"""
		container = re.search(
			r'<ul class="events-list(.*?)<div class="paginator">', source, re.S)
		if container is None:
			return []
		return re.findall(
			r'(<li class="list-entry".*?</p>\s+</div>\s+</li>)',
			container.group(1), re.S)

	def getEntity(self, event):
		"""Extract the fields of a single event fragment.

		:param event: one fragment as returned by ``getAllEvents``.
		:return: dict with the keys ``title``, ``time``, ``position`` and
			``fee``; a field that cannot be found is an empty string.
		"""
		return {
			'title': self._firstGroup(r'<span itemprop="summary">(.*?)</span>', event),
			'time': self._firstGroup(r'時間:</span>\s+(.*?)\s+<time', event),
			'position': self._firstGroup(r'<li title="(.*?)">\s+<span', event),
			'fee': self._firstGroup(r'<strong>(.*?)</strong>', event),
		}

	def saveEntity(self, eventInfo):
		"""Append every event to ``douban_events.txt`` in the global ``path``.

		:param eventInfo: iterable of dicts as produced by ``getEntity``.
		"""
		target = os.path.join(path, 'douban_events.txt')
		# ``with`` guarantees the file is closed even if a write fails.
		with open(target, 'a', encoding='utf-8') as f:
			for event in eventInfo:
				f.write('title:' + event['title'] + '\n')
				f.write('time:' + event['time'] + '\n')
				f.write('position:' + event['position'] + '\n')
				f.write('fee:' + event['fee'] + '\n')
				f.write('\n')

if __name__ == '__main__':
	# Script entry point: walk the listing pages starting at the first one,
	# gather every event found on them, then write everything out in one go.
	start_url = 'https://beijing.douban.com/events/future-music?start=0'
	crawler = douban_local_activity()
	collected = []
	for page_url in crawler.changePage(start_url, 10):
		print('正在處理頁面:' + page_url)
		page_html = crawler.getSource(page_url)
		# One parsed dict per event fragment found on this page.
		collected.extend(
			crawler.getEntity(fragment)
			for fragment in crawler.getAllEvents(page_html)
		)
	crawler.saveEntity(collected)