爬蟲 爬國旗
阿新 • • 發佈:2018-12-20
# -*- coding: utf-8 -*-
"""根據搜尋詞下載百度圖片

Crawl national-flag thumbnail images from www.ivsky.com listing pages and
save them to a local directory, following the "下一頁" (next page) links.
"""
import os
import re
import sys
import urllib.request  # bugfix: `import urllib` alone does not bind urllib.request in Python 3

import requests


def get_onepage_urls(onepageurl):
    """Scrape one listing page.

    Parameters
    ----------
    onepageurl : str
        URL of a listing page; a falsy value means "past the last page".

    Returns
    -------
    (pic_urls, fanye_url) : (list[str], str)
        Up to 35 thumbnail image URLs found on the page, and the absolute
        URL of the next page ('' when there is no next page or on error).
    """
    if not onepageurl:
        print('已到最後一頁, 結束')
        # bugfix: the original returned a bare [] here, which crashed the
        # caller's 2-tuple unpack once the last page was reached.
        return [], ''
    try:
        req = urllib.request.Request(onepageurl)
        # The site rejects the default urllib User-Agent, so spoof a browser.
        req.add_header('User-Agent',
                       'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
        html = urllib.request.urlopen(req).read().decode('utf-8')
    except Exception as e:
        # Best-effort crawl: report the failure and treat the page as empty.
        print(e)
        return [], ''
    # Thumbnails on a listing page are exactly the 190px-wide <img> tags;
    # cap at 35 entries per page as the original crawler did.
    pic_urls = re.findall('img src="(.*?)" width="190"', html, re.S)[0:35]
    # The link labelled 下一頁 ("next page") carries a site-relative href.
    fanye_urls = re.findall(r"'page-next' href='(.*)'>下一頁", html)
    fanye_url = 'http://www.ivsky.com' + fanye_urls[0] if fanye_urls else ''
    return pic_urls, fanye_url


def down_pic(pic_urls, all_title):
    """Download every image URL in pic_urls, naming files 1.jpg, 2.jpg, ...

    Parameters
    ----------
    pic_urls : list[str]
        Image URLs to fetch.
    all_title : list
        Accepted for interface compatibility; currently unused.
    """
    out_dir = '../2_picture/online_picture_3/'
    # Robustness: make sure the target directory exists before writing.
    os.makedirs(out_dir, exist_ok=True)
    for i, pic_url in enumerate(pic_urls):
        filename = str(i + 1) + '.jpg'
        try:
            pic = requests.get(pic_url, timeout=15)
            with open(out_dir + filename, 'wb') as f:
                f.write(pic.content)
            print('成功下載第%s張圖片: %s' % (str(i + 1), str(pic_url)))
        except Exception as e:
            # Skip a broken image and keep downloading the rest.
            print('下載第%s張圖片時失敗: %s' % (str(i + 1), str(pic_url)))
            print(e)
            continue


if __name__ == '__main__':
    url_init = r'http://www.ivsky.com/tupian/geguoguoqi_t2928/'
    all_pic_urls = []
    all_title = []

    # First listing page.
    onepage_urls, fanye_url = get_onepage_urls(url_init)
    all_pic_urls.extend(onepage_urls)
    print(all_pic_urls)

    # Follow next-page links, fetching at most 10 further pages.
    fanye_count = 0  # 累計翻頁數 (number of pages followed so far)
    while True:
        onepage_urls, fanye_url = get_onepage_urls(fanye_url)
        fanye_count += 1
        all_pic_urls.extend(onepage_urls)
        if fanye_count >= 10:
            break
        if fanye_url == '' and onepage_urls == []:
            break

    down_pic(all_pic_urls, all_title)