1. 程式人生 > 爬取圖片

爬取圖片

fun lis fin tip tro btn apply res %s

"""Crawl the listing pages of www.xiaohuar.com and save every picture found.

Listing pages are fetched in a small process pool.  Each fetched page is
handed to ``call_back`` in the parent process, which extracts the picture
entries with the ``com`` regular expression, downloads each picture and
writes it to disk.
"""
import re
from multiprocessing import Pool
from urllib.parse import urljoin

# Two import styles are used in this file: ``import x`` and ``from x import y``.
import requests

BASE_URL = 'http://www.xiaohuar.com'
REQUEST_TIMEOUT = 10  # seconds; without a timeout requests.get() can block forever

# Pages to crawl follow this pattern:
#   http://www.xiaohuar.com/list-1-0.html
#   ...
#   http://www.xiaohuar.com/list-1-43.html
#
# Compiled at module level (not under the ``__main__`` guard) so that
# ``call_back`` also works when this module is imported.
com = re.compile(
    '<div class="item_t">(?:.*?)src="(?P<png>.*?)"(?:.*?)<span class="price">(?P<name>.*?)</span>(?:.*?)'
    '<a href="http://www.xiaohuar.com/" class="img_album_btn">(?P<place>.*?)</a>',
    re.S)

# Characters that are not allowed in a Windows file name (plus line breaks/tabs).
_ILLEGAL_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')


def _fetch(url):
    """Return the raw response body of *url*, or None on any failure.

    A network error or a status code other than 200 yields None so that
    callers never have to deal with an exception.
    """
    try:
        ret = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print('request failed: %s (%s)' % (url, exc))
        return None
    if ret.status_code == 200:
        return ret.content
    return None


def _report_error(exc):
    """Print an exception raised by a pool worker instead of losing it silently."""
    print('worker failed: %r' % (exc,))


def get(url):
    """Fetch a listing page and return its HTML decoded as GBK.

    Returns None when the page could not be fetched.
    """
    content = _fetch(url)
    if content is None:
        return None
    return content.decode('gbk')


def call_back(arg):
    """Pool callback: extract picture entries from a page and save each picture.

    *arg* is the HTML returned by ``get`` (or None if the fetch failed).
    Returns the list of ``{'png', 'name', 'place'}`` dicts found on the page;
    an empty list when there is no page.

    NOTE: Pool callbacks run in the pool's result-handler thread of the parent
    process, so this function must not raise; every failure is reported and
    skipped instead.  For the same reason the picture downloads here run one
    after another.
    """
    if not arg:
        return []

    dict_lst = [
        {
            'png': match.group('png'),
            'name': match.group('name'),
            'place': match.group('place'),
        }
        for match in com.finditer(arg)
    ]

    for item in dict_lst:
        res = subget(item['png'])
        if res is None:
            # Download failed; there is nothing to write for this entry.
            continue
        try:
            write_func(item['name'], item['place'], res)
        except OSError as exc:
            print('could not save %s: %s' % (item['png'], exc))
    return dict_lst


def subget(url):
    """Download one picture and return its bytes, or None on failure.

    *url* may be absolute or relative to the site root; ``urljoin`` leaves
    absolute URLs (http or https) untouched and resolves relative ones
    against ``BASE_URL``.
    """
    return _fetch(urljoin(BASE_URL, url))


def write_func(path, place, picture):
    """Write *picture* (bytes) to ``<path>-<place>.png`` in the output folder.

    *path* and *place* come from scraped HTML, so characters that are illegal
    in a Windows file name are replaced with ``_`` first.
    """
    safe_name = _ILLEGAL_FILENAME_CHARS.sub('_', path)
    safe_place = _ILLEGAL_FILENAME_CHARS.sub('_', place)
    with open(r'E:\text1\爬蟲\text_png\%s-%s.png' % (safe_name, safe_place), 'wb') as f:
        f.write(picture)


if __name__ == '__main__':
    pool = Pool(3)
    for i in range(40):
        pool.apply_async(
            get,
            args=('http://www.xiaohuar.com/list-1-%s.html' % i,),
            callback=call_back,
            error_callback=_report_error,
        )
    pool.close()
    pool.join()

缺點:爬取速度慢,而且最多只能成功爬取 17 個網頁(好無奈)。

爬取圖片