
Scraping the Maoyan Top 100 movies (downloading the posters and recording each film's name and ranking)

Overview

Implementation-wise this one is not hard, but there are a few details worth paying attention to.

  • On the coroutine side, gevent.joinall() expects an iterable, but its items must be the Greenlet objects returned by gevent.spawn(), not plain functions (see the sketch after this list).
  • A smaller gotcha: the response body shown in the browser's Network panel (i.e., what the crawler actually receives) is not exactly what the Elements/Inspect panel shows, because the browser modifies parts of the source while rendering; in this page's case the poster URL sits in a data-src attribute in the raw HTML, which is why the code below extracts data-src. If you write the extraction rules while looking at the inspected DOM, you will run into bugs at first.
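
To make the first point concrete, here is a minimal, self-contained sketch of the spawn/joinall pattern (my own illustration, not part of the crawler below):

import gevent
from gevent import monkey

monkey.patch_all(select=False)  # patch blocking stdlib calls so they yield to the gevent event loop

def work(n):
    gevent.sleep(0.1)  # stand-in for a blocking call such as requests.get()
    print('task', n, 'done')

# joinall() wants an iterable of the Greenlet objects that gevent.spawn() returns;
# passing bare functions or their return values will not work.
waitlist = [gevent.spawn(work, n) for n in range(5)]
gevent.joinall(waitlist)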

Code

Version 1: parse the list page with a regular expression (this version assumes the ./picture/ directory already exists; version 2 below creates it).

import requests
import re
import gevent
from gevent import monkey

monkey.patch_all(select=False)

def getPicture(url, name):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
    }
    res = requests.get(url, headers=headers)
    if res.status_code == 200:
        with open('./picture/' + name + '.png', 'wb') as f:
            f.write(res.content)
    else:
        print(url)

def get_one_page(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
    }
    res = requests.get(url, headers=headers)
    if res.status_code == 200:
        pattern = re.compile('<dd>.*?board-index-.*?>(.*?)</i>.*?data-src="(.*?)".*?alt="(.*?)"', re.S)
        data = re.findall(pattern, res.text)
        waitlist = [gevent.spawn(getPicture, d[1], d[0] + '-' + d[2]) for d in data]
        gevent.joinall(waitlist)
    else:
        print(url)

if __name__ == '__main__':
    url = 'http://maoyan.com/board/4?offset=%d'
    waitlist = [gevent.spawn(get_one_page, url % (d * 10)) for d in range(10)]
    gevent.joinall(waitlist)
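
As a quick sanity check of the regular expression in version 1, it can be run against a hand-written fragment that mimics one <dd> entry of the board page (the fragment and its attribute order are my own assumption, not the real Maoyan markup):

import re

sample = '''
<dd>
    <i class="board-index board-index-1">1</i>
    <img data-src="http://example.com/poster1.jpg" alt="SampleMovie" class="board-img" />
</dd>
'''

pattern = re.compile('<dd>.*?board-index-.*?>(.*?)</i>.*?data-src="(.*?)".*?alt="(.*?)"', re.S)
print(re.findall(pattern, sample))
# expected: [('1', 'http://example.com/poster1.jpg', 'SampleMovie')]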
Version 2: parse the list page with lxml XPath instead of a regular expression, and create the picture directory before downloading.

import os
import requests
from lxml import etree
import gevent
from gevent import monkey

monkey.patch_all(select=False)

def getPicture(url, name):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
    }
    res = requests.get(url, headers=headers)
    if res.status_code == 200:
        with open('./picture/' + name + '.png', 'wb') as f:
            f.write(res.content)
    else:
        print(url)

def get_one_page(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
    }
    res = requests.get(url, headers=headers)
    if res.status_code == 200:
        html = etree.HTML(res.text)
        # The poster URL lives in data-src in the raw HTML (lazy loading); the
        # browser's inspector shows src instead, which is the gotcha noted above.
        i = html.xpath('//i[contains(@class, "board")]/text()')       # ranking numbers
        title = html.xpath('//a[contains(@class, "image")]/@title')   # movie titles
        img = html.xpath('//img/@data-src')                           # poster URLs
        # one greenlet per poster; each board page lists 10 films
        waitlist = [gevent.spawn(getPicture, img[t], i[t] + '-' + title[t]) for t in range(10)]
        gevent.joinall(waitlist)
    else:
        print(url)

def createfile():
    if not os.path.exists('./picture/'):
        os.mkdir('./picture')

if __name__ == '__main__':
    createfile()
    url = 'http://maoyan.com/board/4?offset=%d'
    waitlist = [gevent.spawn(get_one_page, url % (d * 10)) for d in range(10)]
    gevent.joinall(waitlist)
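
Since each downloaded file is named '<rank>-<title>.png', the name-and-ranking list can also be rebuilt afterwards from the picture directory alone. A minimal sketch (my own addition, assuming the naming scheme above):

import os

def list_rankings(folder='./picture/'):
    entries = []
    for filename in os.listdir(folder):
        stem, ext = os.path.splitext(filename)
        if ext != '.png' or '-' not in stem:
            continue  # skip anything that is not a downloaded poster
        rank, title = stem.split('-', 1)  # the title itself may contain '-'
        entries.append((int(rank), title))
    entries.sort()  # sort numerically; string order would put '10' before '2'
    return entries

if __name__ == '__main__':
    for rank, title in list_rankings():
        print(rank, title)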