程式人生 > 爬蟲：爬國旗

爬蟲 爬國旗

# -*- coding: utf-8 -*-
"""根據搜尋詞下載百度圖片"""
import os
import re
import sys
import urllib
import urllib.request

import requests

def get_onepage_urls(onepageurl):
    """Scrape one gallery listing page.

    Args:
        onepageurl: URL of the listing page, or '' / None when there is
            no further page to fetch.

    Returns:
        A 2-tuple ``(pic_urls, fanye_url)``:
        - pic_urls: up to 35 thumbnail image URLs found on the page
          (empty on fetch failure or when ``onepageurl`` is empty);
        - fanye_url: absolute URL of the next listing page, or '' when
          there is none.

    Note: always returns a 2-tuple, including the "last page" case —
    the original returned a bare list there, which crashed the caller's
    tuple unpacking.
    """
    if not onepageurl:
        print('已到最後一頁, 結束')
        return [], ''
    try:
        req = urllib.request.Request(onepageurl)
        # Spoof a browser User-Agent: the site rejects urllib's default one.
        req.add_header('User-Agent',
                       'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
        html = urllib.request.urlopen(req).read().decode('utf-8')
    except Exception as e:
        # Best-effort crawl: report the failure and behave like an empty page.
        print(e)
        return [], ''

    # Listing-page thumbnails are rendered at width=190; cap at 35 per page.
    pic_urls = re.findall('img src="(.*?)" width="190"', html, re.S)[:35]

    # The "下一頁" ("next page") anchor holds a site-relative link.
    fanye_urls = re.findall(r"'page-next' href='(.*)'>下一頁", html)
    fanye_url = 'http://www.ivsky.com' + fanye_urls[0] if fanye_urls else ''
    return pic_urls, fanye_url


def down_pic(pic_urls, all_title, save_dir='../2_picture/online_picture_3/'):
    """Download every image in ``pic_urls`` into ``save_dir``.

    Args:
        pic_urls: iterable of image URLs; files are saved as 1.jpg,
            2.jpg, ... in list order.
        all_title: accepted for interface compatibility; currently unused.
        save_dir: target directory (created if missing). Defaults to the
            original hard-coded path.

    Failures are reported per image and do not abort the remaining
    downloads.
    """
    if not pic_urls:
        return
    # Create the target directory up front so every write doesn't fail
    # (and get silently swallowed) when it is missing.
    os.makedirs(save_dir, exist_ok=True)
    for i, pic_url in enumerate(pic_urls, start=1):
        try:
            pic = requests.get(pic_url, timeout=15)
            with open(os.path.join(save_dir, '%d.jpg' % i), 'wb') as f:
                f.write(pic.content)
            print('成功下載第%s張圖片: %s' % (str(i), str(pic_url)))
        except Exception as e:
            # Best-effort: log the failure and continue with the next URL.
            print('下載第%s張圖片時失敗: %s' % (str(i), str(pic_url)))
            print(e)


if __name__ == '__main__':
    # Starting page of the national-flags gallery on ivsky.com.
    url_init = r'http://www.ivsky.com/tupian/geguoguoqi_t2928/'
    all_pic_urls = []
    all_title = []  # kept for the down_pic interface; never populated

    # First page.
    onepage_urls, fanye_url = get_onepage_urls(url_init)
    all_pic_urls.extend(onepage_urls)
    print(all_pic_urls)

    # Follow "next page" links, hard-capped at 10 extra pages to bound
    # the crawl. Stop as soon as there is no next-page URL — never call
    # get_onepage_urls('') (the original did, and crashed unpacking its
    # bare-list return).
    pages_left = 10
    while pages_left > 0 and fanye_url:
        onepage_urls, fanye_url = get_onepage_urls(fanye_url)
        all_pic_urls.extend(onepage_urls)
        pages_left -= 1

    down_pic(all_pic_urls, all_title)