Python 日常——爬取豆瓣 Top 250 電影記錄
摘要:#感興趣的同仁可以相互交流哦
import requests
import lxml.html,csv
doubanUrl = 'https://movie.douban.com/top250?start={}&filter='
def getSource(u...
#感興趣的同仁可以相互交流哦 import requests import lxml.html,csv doubanUrl = 'https://movie.douban.com/top250?start={}&filter=' def getSource(url): response = requests.get(url)# 獲取網頁 response.encoding = 'utf-8'# 修改編碼 return response.content#獲取原始碼 def getEveryItem(source): # 獲取HTML物件 selector = lxml.html.document_fromstring(source) # 提取標籤所有的資訊 movieItemList = selector.xpath('//div[@class="info"]') # 定義一個空列表——用於展示資訊 movieList = [] for eachMovie in movieItemList: movieDict = {} # 分層提取 title = eachMovie.xpath('div[@class="hd"/a/span/[@class="title"]/text()') otherTitle = eachMovie.xpath('div[@class="hd"/a/span/[@class="other"]/text()') link = eachMovie.xpath('div[@class="hd"/a/@href')[0] star = eachMovie.xpath('div[@class="hd"/div[@class="star"]/span[@class="rating_num"]/text()') quote = eachMovie.xpath('div[@class="hd"/p[@class="quote"]/span/text()') # 儲存字典資訊 movieDict['title'] = ''.join(title+otherTitle) movieDict['url'] = link movieDict['star'] = star movieDict['quote'] = quote movieList.append(movieDict) return movieList def writeData(movieList): with open('./Douban.csv','w',encoding='UTF-8',newline='') as f: writer = csv.DictWriter(f,fieldnames=['titlr','star','quote','url']) # 寫入表頭 writer.writeheader() for each in movieList: writer.writerow(each) if __name__ == 'main': # 共展示250條電影資訊 每頁25條 ,共10頁 movieList = [] for i in range(10): # 獲取url pageLink = doubanUrl.format(i*25) print(pageLink) # 根據地址獲取資源 source = getSource(pageLink) movieList = getEveryItem(source) print(movieList[:10]) writeData(movieList)