1. 程式人生 > >第一個爬蟲代碼

第一個爬蟲代碼

/usr wow64 print exc reg mozilla getc idt size

#!/usr/bin/python
#coding=GBK
import urllib.request
import re


#file=open("F:/python_workspace/爬蟲/圖片/0.jpg","wb")
#url="http://desk.zol.com.cn/2560x1600/"
def gethtml(url):
    """Fetch *url* and return the raw response body as bytes.

    The request is sent with a desktop Firefox User-Agent header. The body
    is returned undecoded; callers decode it themselves.

    Any exception raised by ``urllib.request.urlopen`` or by reading the
    response propagates to the caller.
    """
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0"}
    req = urllib.request.Request(url, headers=header)
    # The context manager closes the response (and its socket) even when
    # read() fails; the crawl loop calls this many times in a row.
    with urllib.request.urlopen(req) as res:
        return res.read()

def getcata(html, reg):
    """Return all non-overlapping matches of the pattern *reg* in *html*.

    The result follows ``findall`` semantics: when the pattern has a single
    capture group the list holds the captured text of each match, and an
    empty list is returned when nothing matches.
    """
    return re.compile(reg).findall(html)

def geturl(url):
    """Return the image URLs embedded in the page at *url*.

    The page is downloaded with ``gethtml``, decoded as GBK, and scanned for
    ``imgsrc":"..."`` entries whose value starts with ``http:`` and ends in
    ``.png`` or ``.jpg``. The URLs are returned exactly as they appear in
    the page source, without any unescaping.

    Download and decode errors propagate to the caller.
    """
    html = gethtml(url)
    return getcata(html.decode("GBK"), r'imgsrc":"(http:.*?\.png|http:.*?\.jpg)')

def getpicurl(picurl, num, save_dir='F:/python_workspace/爬蟲/圖片/'):
    """Download the picture at *picurl* and save it as ``<num>.png``.

    Before downloading, every backslash is removed from *picurl* and the
    ``##SIZE##`` placeholder is replaced with ``2560x1600``.

    Args:
        picurl: Image URL as scraped from the page.
        num: Value used (via ``str``) as the output file's base name.
        save_dir: Directory the file is written to; defaults to the
            location the script originally hard-coded.

    Returns:
        None.

    Download and file-system errors propagate to the caller. The file is
    only created after the download succeeded, so a failed download no
    longer leaves an empty file behind.
    """
    import os  # local import: the file's import header does not provide os

    clean_url = picurl.replace("\\", "").replace("##SIZE##", "2560x1600")

    # Fetch first and close the response deterministically.
    with urllib.request.urlopen(clean_url, timeout=10) as res:
        payload = res.read()

    # NOTE(review): the extension is always .png, even for .jpg sources;
    # kept as-is so existing output names do not change.
    target = os.path.join(save_dir, str(num) + '.png')
    with open(target, "wb") as file:
        file.write(payload)





#html=gethtml("http://desk.zol.com.cn/2560x1600/").decode(‘GBK‘)
#cata_list=getcata(html,r‘href="(/[a-z]+/.*?2560x1600/)‘)
#for i in cata_list:
# geturl(i)
#ss="http:\/\/desk.fd.zol-img.com.cn\/t_s##SIZE##\/g5\/M00\/0D\/03\/ChMkJlmVBaOIK26rAAJ3foZd400AAfwAADpPesAAneW914.jpg"
#getpicurl(ss)
# Crawl listing pages 1..46 of the 2560x1600 wallpaper section, follow every
# gallery link found on each page, and download every picture of each gallery.
# Failures are reported and skipped so one bad page or picture does not stop
# the whole crawl.
domain = "http://desk.zol.com.cn"
count = 0  # base name of the next saved picture; advances only on success
for urlcount in range(1, 47):
    url = 'http://desk.zol.com.cn/2560x1600/' + str(urlcount) + '.html'
    try:
        html = gethtml(url).decode('GBK')
        cata_list = getcata(html, r'href="(/bizhi/.*?\.html)" target="_blank" hidefocus="true"><img width="208px"')
    except Exception:
        # Catch Exception rather than everything, so Ctrl-C
        # (KeyboardInterrupt) can still stop the crawl.
        print("gethtml method error!")
        continue

    for i in cata_list:
        # Gallery links are site-relative; prefix the host.
        i = domain + i
        try:
            picurllist = geturl(i)
        except Exception:
            print("picurllist method error!")
            continue

        for j in picurllist:
            try:
                getpicurl(j, count)
            except Exception:
                print("getpicurl method error!")
                continue
            count = count + 1
            print(j)

第一個爬蟲代碼