python爬蟲爬取貓眼電影top100
阿新 • • 發佈:2019-01-04
這個爬蟲我是跟著教程做的,也是第一次用python的re和multiprocessing(多程序,注意不是多執行緒),還知道了yield生成器的用法。re正則表示式真的很厲害,不過學起來比較難,我還在學習中。
import re
from multiprocessing import Pool

# The third-party dependencies (``requests`` and ``pymysql``) are imported
# inside the functions that use them, so the pure parsing code below can be
# imported and unit-tested without a network stack or database driver.

# Matches one <dd>...</dd> entry of the board page. The seven capture groups
# are consumed positionally by parse_one_page (index, url, name, actors,
# time, and the two halves of the score). re.S lets '.' span the newlines
# inside an entry.
_MOVIE_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
    r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
    r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
    re.S,
)

# Seconds to wait for the server before giving up on a page; without a
# timeout a stalled connection would block its worker process forever.
_REQUEST_TIMEOUT = 10


def get_one_page(url):
    """Fetch *url* and return the response body as text.

    Returns ``None`` when the request fails (connection error, timeout, ...)
    or the server answers with any status other than HTTP 200.
    """
    import requests

    try:
        res = requests.get(url, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException:
        # Same "no page" result as a non-200 status.
        return None
    if res.status_code == 200:
        return res.text
    return None


def parse_one_page(html):
    """Yield one dict per movie entry found in *html*.

    Each dict has the string-valued keys ``index``, ``url``, ``name``,
    ``actors``, ``time`` and ``score``. Nothing is yielded when *html* is
    ``None`` or empty (for example after a failed download) or when no entry
    matches.
    """
    if not html:
        return
    for item in _MOVIE_PATTERN.findall(html):
        yield {
            'index': item[0],
            'url': item[1],
            'name': item[2],
            # Drops the first 3 characters after trimming whitespace --
            # presumably the cast label in front of the names; confirm
            # against the live page markup.
            'actors': item[3].strip()[3:],
            # Drops the first 5 characters after trimming whitespace --
            # presumably the release-date label; confirm against the page.
            'time': item[4].strip()[5:],
            # The page splits the score into integer and fraction elements.
            'score': item[5] + item[6],
        }


def save_detail(num, name, actors, time, score):
    """Insert one movie row into the ``top100`` table of ``maoyan_movies``.

    Opens a new MySQL connection for the call, commits the insert and always
    closes the connection, even when the insert fails. Database errors
    propagate to the caller.
    """
    import pymysql

    # Keyword arguments: PyMySQL 1.0+ no longer accepts positional
    # connection parameters.
    # NOTE(review): credentials are hard-coded; consider loading them from
    # configuration or the environment.
    db = pymysql.connect(
        host='localhost',
        user='root',
        password='123456',
        database='maoyan_movies',
        charset='utf8',
    )
    try:
        # Parameterized query: the driver escapes the scraped values.
        sql = 'insert into top100 values (%s,%s,%s,%s,%s)'
        with db.cursor() as cursor:
            cursor.execute(sql, (num, name, actors, time, score))
        db.commit()
    finally:
        db.close()


def main(offset):
    """Download the board page at *offset*, parse it and store every entry."""
    url = 'http://maoyan.com/board/4?offset={0}'.format(offset)
    html = get_one_page(url)
    # parse_one_page yields nothing when the download failed (html is None).
    for movie in parse_one_page(html):
        save_detail(
            movie['index'],
            movie['name'],
            movie['actors'],
            movie['time'],
            movie['score'],
        )


if __name__ == '__main__':
    # Ten pages at offsets 0, 10, ..., 90, fanned out over a pool of worker
    # processes (multiprocessing uses processes, not threads).
    offsets = [i * 10 for i in range(10)]
    with Pool() as pool:
        pool.map(main, offsets)