爬取貓眼電影前100名(獲取圖片並記錄名字跟排名)
阿新 • • 發佈:2019-01-10
簡述
程式碼實現本身並不難,但有幾個細節需要注意:
- 協程方面:gevent.joinall() 需要傳入一個可迭代的物件(例如 list),而且其中的每個元素都必須是 gevent.spawn() 返回的 Greenlet 物件。
- 另一個細節:直接爬取得到的資料(也就是在 Network 面板裡看到的回應內容)與在「檢查」裡看到的原始碼並不完全相同,後者有一部分已經被修改過(例如下面的程式碼中,圖片位址要從 data-src 屬性取得)。如果一開始是照著「檢查」裡的內容來寫匹配規則,就會出 bug。
程式碼
import requests
import re
import gevent
from gevent import monkey
monkey.patch_all(select=False )
def getPicture(url, name):
    """Download the image at ``url`` and save it as ``./picture/<name>.png``.

    Args:
        url: Address of the image to fetch.
        name: File name (without extension) used for the saved image.

    Prints ``url`` when the request fails or the response status is not 200.
    """
    import os  # local import: this script's import header does not include os

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
    }
    try:
        # Without a timeout a stalled connection would hang this call forever.
        res = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # Report a failed request the same way as a non-200 response.
        print(url)
        return
    if res.status_code == 200:
        # Nothing else in this script creates the output directory.
        os.makedirs('./picture', exist_ok=True)
        with open('./picture/' + name + '.png', 'wb') as f:
            f.write(res.content)
    else:
        print(url)
def get_one_page(url):
    """Fetch one board page and download every picture matched on it.

    Args:
        url: Address of the board page to fetch.

    Each regex match yields three captured strings: the text of the
    ``board-index-*`` element, the ``data-src`` value and the ``alt`` value.
    The picture at ``data-src`` is saved under the name ``<index>-<alt>``.
    Prints ``url`` when the request fails or the response status is not 200.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
    }
    try:
        # Without a timeout a stalled connection would hang this call forever.
        res = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # Report a failed request the same way as a non-200 response.
        print(url)
        return
    if res.status_code == 200:
        pattern = re.compile('<dd>.*?board-index-.*?>(.*?)</i>.*?data-src="(.*?)".*?alt="(.*?)"', re.S)
        data = pattern.findall(res.text)
        # One greenlet per match so the downloads of this page run concurrently.
        waitlist = [
            gevent.spawn(getPicture, src, index + '-' + alt)
            for index, src, alt in data
        ]
        gevent.joinall(waitlist)
    else:
        print(url)
if __name__ == '__main__':
    # Spawn one greenlet per board page (offsets 0, 10, ..., 90) and wait
    # until all of them have finished.
    url = 'http://maoyan.com/board/4?offset=%d'
    page_jobs = [
        gevent.spawn(get_one_page, url % offset)
        for offset in range(0, 100, 10)
    ]
    gevent.joinall(page_jobs)
import os
import requests
from lxml import etree
import gevent
from gevent import monkey
monkey.patch_all(select=False)
def getPicture(url, name):
    """Download the image at ``url`` and save it as ``./picture/<name>.png``.

    Args:
        url: Address of the image to fetch.
        name: File name (without extension) used for the saved image.

    Prints ``url`` when the request fails or the response status is not 200.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
    }
    try:
        # Without a timeout a stalled connection would hang this call forever.
        res = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # Report a failed request the same way as a non-200 response.
        print(url)
        return
    if res.status_code == 200:
        with open('./picture/' + name + '.png', 'wb') as f:
            f.write(res.content)
    else:
        print(url)
def get_one_page(url):
    """Fetch one board page and download every picture listed on it.

    Args:
        url: Address of the board page to fetch.

    Three XPath queries collect the text of ``<i>`` elements whose class
    contains "board", the ``title`` of ``<a>`` elements whose class contains
    "image", and the ``data-src`` of every ``<img>``. The results are paired
    positionally and each picture is saved as ``<i text>-<title>``.
    Prints ``url`` when the request fails or the response status is not 200.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
    }
    try:
        # Without a timeout a stalled connection would hang this call forever.
        res = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # Report a failed request the same way as a non-200 response.
        print(url)
        return
    if res.status_code == 200:
        html = etree.HTML(res.text)
        i = html.xpath('//i[contains(@class, "board")]/text()')
        title = html.xpath('//a[contains(@class, "image")]/@title')
        img = html.xpath('//img/@data-src')
        # zip() stops at the shortest list, so a page with fewer than ten
        # entries no longer raises IndexError as the fixed range(10) did.
        waitlist = [
            gevent.spawn(getPicture, src, index + '-' + name)
            for src, index, name in zip(img, i, title)
        ]
        gevent.joinall(waitlist)
    else:
        print(url)
def createfile():
    """Create the ``./picture`` output directory if it does not exist yet."""
    # exist_ok=True replaces the exists()-then-mkdir() pair, which could fail
    # if the directory appeared between the check and the creation.
    os.makedirs('./picture', exist_ok=True)
if __name__ == '__main__':
    # Prepare the output directory first, then spawn one greenlet per board
    # page (offsets 0, 10, ..., 90) and wait until all of them have finished.
    createfile()
    url = 'http://maoyan.com/board/4?offset=%d'
    page_jobs = [
        gevent.spawn(get_one_page, url % offset)
        for offset in range(0, 100, 10)
    ]
    gevent.joinall(page_jobs)