Python爬蟲抓取女演員圖片
阿新 • • 發佈:2019-01-23
介紹利用Python爬蟲抓取日本女演員照片。
遇到的最大問題是該網站使用了cloudflare以及其他反爬蟲策略,導致urllib自帶的urlretrieve函式無法使用;因此下面的程式碼(Python 2)改用urllib2.Request並附上瀏覽器的User-Agent標頭來抓取頁面與下載圖片。其他部分都較為常規,故直接貼出程式碼。
import re
import urllib2
def getHtml(url1):
    """Fetch *url1* and return the raw response body.

    The request carries a desktop-browser User-Agent header instead of the
    default urllib2 one.

    Args:
        url1: Absolute URL of the page to download.

    Returns:
        The undecoded body exactly as returned by the response's ``read()``.

    Raises:
        Whatever ``urllib2.urlopen`` raises when the request fails; nothing
        is caught here.
    """
    headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    request = urllib2.Request(url=url1, headers=headers)
    response = urllib2.urlopen(request)
    try:
        return response.read()
    finally:
        # Release the underlying connection even if read() fails; the
        # original left the response open until garbage collection.
        response.close()
def getImg(html):
    """Download every matching thumbnail referenced in *html*.

    Image URLs are taken from ``<img src="..." title="">`` tags. For each
    URL that lies under the expected CDN path, the part between
    ``/mono/actjpgs/`` and ``.jpg`` becomes the local file name and the
    image bytes are written to ``D:\\ImageDownload\\<name>.jpg``. URLs that
    do not match the CDN pattern are printed and then skipped.

    Args:
        html: Page source to scan for image tags.

    Returns:
        None. The function works through its side effects (console output
        and files written to disk).

    Raises:
        Whatever ``urllib2.urlopen`` or ``open`` raise on failure; nothing
        is caught here.
    """
    headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    # Compile both patterns once instead of once per image.
    img_tag_re = re.compile(r'<img src="(.*?)" title="">')
    # Dots are escaped so they match only a literal '.', not any character.
    name_re = re.compile(r'https://jp\.netcdn\.space/mono/actjpgs/(.*?)\.jpg')
    timeout = 50

    for imgurl in img_tag_re.findall(html):
        print(imgurl)
        name = name_re.findall(imgurl)
        print(name)
        if not name:
            # Not a CDN image: there is no name to save it under, and
            # indexing name[0] would raise IndexError.
            continue

        filename = name[0] + '.jpg'
        picpath = 'D:\\ImageDownload\\%s' % (filename)
        print(picpath)

        request = urllib2.Request(imgurl, None, headers)
        response = urllib2.urlopen(request, None, timeout)
        try:
            data = response.read()
        finally:
            response.close()

        # The with-block closes the file even if the write fails.
        with open(picpath, "wb") as out:
            out.write(data)
# Driver: walk the listing pages in [start, end) and download the
# thumbnails found on each one, then print the script banner.
start, end = 1, 2
for page in map(str, range(start, end)):
    url = "".join(("https://avmo.pw/cn/actresses/page/", page))
    html = getHtml(url)
    getImg(html)
print(u"""
---------------------------------------
name : avmo_img
edition : 0.1
author : ultrakin
time : 2016-09-27
---------------------------------------
""")
程式抓取結果: