1. 程式人生 > >Python 爬取 豆瓣

Python 爬取 豆瓣

...

import urllib.request
import time
from bs4 import BeautifulSoup

def url_open(url):
    """Open *url* and hand back the response object.

    Thin wrapper around ``urllib.request.urlopen``. The response is returned
    unread; reading it is left to the caller.
    """
    return urllib.request.urlopen(url)
def parse_html(response):
    """Print the movies found on one listing page and return the next page URL.

    The page body is read from *response* and parsed with BeautifulSoup.
    Every ``<li>`` that contains at least one ``span.title`` is treated as a
    movie entry: its rank (``<em>``), rating (``span.rating_num``) and first
    title are printed on one line.

    Args:
        response: A readable object whose ``read()`` returns the page HTML.

    Returns:
        The URL of the next page once the last entry of a 25-item page has
        been printed, or ``None`` when rank 250 has been reached or the page
        contains no page-ending entry (e.g. an empty page).
    """
    html_content = response.read()
    html_soup = BeautifulSoup(html_content, 'html.parser', from_encoding='utf-8')

    for li in html_soup.find_all('li'):
        title = li.find_all('span', class_='title')
        if not title:
            # <li> elements without a title span are not movie entries.
            continue

        rank = li.find('em').get_text()
        rating = li.find('span', class_='rating_num')
        print("排名:" + rank + "------評分:" + str(rating.get_text()) + "-------" + title[0].get_text())

        # The rank is scraped as text; compare it numerically. Comparing the
        # string against the integer 250 is never true and would cause one
        # extra request for a page past the end of the list.
        position = int(rank)
        if position == 250:
            return None
        if position % 25 == 0:
            # Last entry of this page: the rank doubles as the next offset.
            return "https://movie.douban.com/top250?start=" + rank + "&filter="

    return None


# First page of the listing; later pages are derived by parse_html().
url = "https://movie.douban.com/top250?start=0&filter="

if __name__ == '__main__':
    response = url_open(url)
    start_time = time.time()
    print("開始:" + str(start_time))

    while True:
        # Close each response as soon as its page has been parsed.
        with response:
            next_url = parse_html(response)
        if next_url is None:
            break
        response = url_open(next_url)

    end_time = time.time()
    print("結束:" + str(end_time))
    print("一共用了:" + str(end_time - start_time))