1. 程式人生 > 豆瓣上映電影爬蟲

豆瓣上映電影爬蟲

https://study.163.com/course/courseLearn.htm?courseId=1005913008#/learn/video?lessonId=1053258282&courseId=1005913008

課堂上的程式碼，做個記錄。

import json

import requests
from bs4 import BeautifulSoup
 4 
 5 
 6 def get_page():
 7     url = 'https://movie.douban.com/cinema/nowplaying/changsha/'
 8     headers = {
9 "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36" 10 } 11 response = requests.get(url, headers=headers, verify=False) 12 text = response.text 13 return text 14 15 16 def parse_page(text): 17 soup = BeautifulSoup(text, '
lxml') 18 movies = [] 19 liList = soup.find_all('li', attrs={"data-category":"nowplaying"}) 20 for li in liList: 21 movie = {} 22 title = li['data-title'] 23 score = li['data-score'] 24 release = li['data-release'] 25 region = li['data-region'] 26 director = li['
data-director'] 27 actors = li['data-actors'] 28 img = li.find('img')['src'] 29 30 movie['title'] = title 31 movie['score'] = score 32 movie['release'] = release 33 movie['region'] = region 34 movie['director'] = director 35 movie['actors'] = actors 36 movie['img'] = img 37 movies.append(movie) 38 return movies 39 40 41 def save_data(data): 42 # 返回一個檔案指標 43 with open('douban.json', 'w', encoding='utf-8') as fp: 44 # json.dump作用 45 # 將字典、列表dump成滿足json格式的字串 46 # ensure_ascii=False可以儲存非ascii的值 47 json.dump(data, fp, ensure_ascii=False) 48 49 50 if __name__ == '__main__': 51 text = get_page() 52 movies = parse_page(text) 53 save_data(movies)