1. 程式人生 > >Python3.4網頁爬蟲,提取圖片

Python3.4網頁爬蟲,提取圖片

網頁圖片爬蟲:

第一個爬蟲抓取bing主頁圖片,共24張

第二個抓取貼吧圖片

第三個抓取圖蟲圖片

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# -*- author:miko-*-
# python3抓取bing主頁所有背景圖片
import urllib.request
import urllib,re,sys,os
def get_bing_backphoto():
    """Download the Bing homepage background images into the ``img`` folder.

    Queries the HPImageArchive endpoint once for each index 0..23, extracts
    every ``"url"`` value from the response text and saves the image under
    ``img/<last path segment of the URL>``.

    Exits the process with status -1 if an archive response body is the
    literal text ``null``.
    """
    os.makedirs('img', exist_ok=True)
    # The pattern is identical for every request, so compile it only once.
    reg = re.compile('"url":"(.*?)","urlbase"', re.S)
    for i in range(0, 24):
        url = 'http://cn.bing.com/HPImageArchive.aspx?format=js&idx='+str(i)+'&n=1&nc=1361089515117&FORM=HYLH1'
        # Close the HTTP response deterministically instead of leaking it.
        with urllib.request.urlopen(url) as response:
            html = response.read().decode('utf-8')
        # Compare the *decoded* text: read() returns bytes, and bytes never
        # compare equal to a str, so checking before decoding was a no-op.
        if html.strip() == 'null':
            print('open & read bing error!')
            sys.exit(-1)
        # Example match:
        # http://s.cn.bing.net/az/hprichbg/rb/LongJi_ZH-CN8658435963_1366x768.jpg
        for imgurl in reg.findall(html):
            # Keep only the text after the last '/' as the local file name.
            name = imgurl.rsplit('/', 1)[-1]
            savepath = 'img/' + name
            print(imgurl)
            urllib.request.urlretrieve(imgurl, savepath)
# Start the Bing download as soon as this script is executed.
get_bing_backphoto()

#coding=utf-8
import urllib.request
import re
import urllib,re,sys,os
def getHtml(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Exits the process with status -1 when the body is exactly ``null``.
    """
    # Use the response as a context manager so the connection is closed.
    with urllib.request.urlopen(url) as response:
        raw = response.read()
    # read() yields bytes; comparing bytes with the str 'null' is always
    # False, so the check must use a bytes literal to ever trigger.
    if raw == b'null':
        print('open & read url error!')
        sys.exit(-1)
    return raw.decode('utf-8')

def getImg(html):
    """Save every ``.jpg`` referenced by *html* into the ``baidu`` folder.

    An image URL is whatever sits inside ``src="..."`` when that attribute
    is immediately followed by `` pic_ext``. Files are named by their
    position in the page: ``baidu/0.jpg``, ``baidu/1.jpg``, and so on.
    Each URL is printed after it has been downloaded.
    """
    if not os.path.exists('baidu'):
        os.mkdir('baidu')
    pattern = re.compile(r'src="(.+?\.jpg)" pic_ext')
    for index, picture_url in enumerate(pattern.findall(html)):
        urllib.request.urlretrieve(picture_url, 'baidu/%s.jpg' % index)
        print(picture_url)
        # Example URL:
        # http://imgsrc.baidu.com/forum/pic/item/16391f30e924b89915f86eb06f061d950b7bf677.jpg
# Fetch one Tieba thread page and save the .jpg images found in it.
html = getHtml("http://tieba.baidu.com/p/2460150866")
getImg(html)
#print (getImg(html))


#-*- encoding: utf-8 -*-
'''
Created on 2015-7-30
@author: Miko
'''

import urllib.request
import urllib,re,sys,os,time
import uuid
# Collect the second-level (detail page) URLs
def findUrl2(html):
    """Return the detail-page URLs found in *html*, de-duplicated in order.

    Two URL shapes are recognised:
    ``http://tuchong.com/<digits>/<digits>/`` and
    ``http://<subdomain>.tuchong.com/<digits>/`` where the subdomain does
    not end in ``photos``.

    Each URL appears once, at the position of its first occurrence.
    """
    # Dots are escaped so they match a literal '.' only; unescaped, they
    # would accept any character in the host name.
    re1 = r'http://tuchong\.com/\d+/\d+/|http://\w+(?<!photos)\.tuchong\.com/\d+/'
    seen = set()
    url2lstfltr = []
    # Single pass with a seen-set keeps first-occurrence order without the
    # quadratic list.index() lookups.
    for found in re.findall(re1, html):
        if found not in seen:
            seen.add(found)
            url2lstfltr.append(found)
    return url2lstfltr
# Fetch the HTML text of a page
def getHtml(url):
    """Open *url* and return the response body decoded as UTF-8."""
    # The context manager closes the response once the body has been read.
    with urllib.request.urlopen(url) as response:
        html = response.read().decode('utf-8')
    return html
# Download the photos to local disk
def download(html_page, pageNo, base_dir='D:\\TuChong'):
    """Download every photo URL found in *html_page*.

    Files are written to ``<base_dir>/<year>-<month>-<day>/<pageNo>/`` using
    the local date (not zero-padded) and a ``uuid1``-based file name with a
    ``.jpg`` suffix. The directory is created only when at least one photo
    URL is found. The function sleeps one second after each download.

    Args:
        html_page: HTML text of a detail page.
        pageNo: Listing page number; used as the innermost folder name.
        base_dir: Root folder for downloads; defaults to ``D:\\TuChong``.

    Returns:
        The result of the last ``urllib.request.urlretrieve`` call, or
        ``None`` when no photo URL was found.
    """
    today = time.localtime(time.time())
    foldername = "%d-%d-%d" % (today.tm_year, today.tm_mon, today.tm_mday)
    # Non-greedy and limited to URL characters: a greedy '.+' would swallow
    # everything between the first and the last '.jpg' on one line and merge
    # several photo URLs into a single bogus one.
    re2 = r'http://photos\.tuchong\.com/[^\s"\'<>]+?/f/[^\s"\'<>]+?\.jpg'
    imglist = re.findall(re2, html_page)
    print(imglist)
    download_img = None
    if not imglist:
        return download_img
    # The target folder is the same for every image; build and create it once.
    picpath = os.path.join(base_dir, foldername, str(pageNo))
    os.makedirs(picpath, exist_ok=True)
    for imgurl in imglist:
        target = os.path.join(picpath, "%s.jpg" % uuid.uuid1())
        print("The photos location is:"+target)
        # Save the image to the chosen path.
        download_img = urllib.request.urlretrieve(imgurl, target)
        time.sleep(1)
        print(imgurl)
    return download_img

# def callback(blocknum, blocksize, totalsize):
#     '''回撥函式
#     @blocknum: 已經下載的資料塊
#     @blocksize: 資料塊的大小
#     @totalsize: 遠端檔案的大小
#     '''
#     print str(blocknum),str(blocksize),str(totalsize)
#     if blocknum * blocksize >= totalsize:
#         print '下載完成'
def quitit():
    """Print a farewell message and terminate the process with status 0."""
    print("Bye!")
    # sys.exit is always available; the bare exit() builtin is injected by
    # the site module for interactive use and may be missing.
    sys.exit(0)
   
if __name__ == '__main__':
    # Script entry point: show the banner, validate the page number, then
    # crawl one listing page and download the photos of every detail page.
    print ('''            *****************************************
            **    Welcome to Spider for TUCHONG    **
            **      Created on 2015-7-30           **
            **      @author: miko                  **
            *****************************************''')
    pageNo ='10' # input("Input the page number you want to scratch (1-100),please input 'quit' if you want to quit>")
    # Re-prompt until the value is all digits and not greater than 100.
    while not pageNo.isdigit() or int(pageNo) > 100 :
        if pageNo == 'quit':quitit()
        print ("Param is invalid , please try again.")
        # raw_input() was renamed to input() in Python 3; calling the old
        # name raises NameError.
        pageNo = input("Input the page number you want to scratch >")

    # Crawl the Tuchong "portrait" tag listing (the tag name is
    # percent-encoded in the URL).
    html = getHtml("http://tuchong.com/tags/%E4%BA%BA%E5%83%8F/?page="+str(pageNo))

    detllst = findUrl2(html)
    for detail in detllst:
        html2 = getHtml(detail)
        download(html2,pageNo)
    print ("Finished.")