
Extracting images from a batch of websites with Python

Goal: given a list of websites, extract the images that appear on those pages.

Approach: download each page's source and scan it for tags that may carry image URLs, such as <img>, <div>, and <li>. Since I know little about HTML, this list of candidate tags was worked out by inspecting the source of a number of websites.
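To make the tag-scanning idea concrete, here is a minimal self-contained sketch (separate from the full script below; sample_html and urls_in_tags are made up for illustration, and the regexes are a slightly tightened variant of the ones used later):

import re

def urls_in_tags(html, tags=('img', 'div', 'li')):
    """Collect http(s) URLs that appear inside the opening tags listed."""
    found = []
    for tag in tags:
        # grab each opening tag of this type, e.g. <img src="..." ...>
        for tag_str in re.findall(r'<%s [^>]*>' % tag, html):
            # pull every URL out of the tag's attributes
            found.extend(re.findall(r'https?://[^\'")\s>]+', tag_str))
    return found

sample_html = ('<img src="http://example.com/a.jpg">'
               '<div style="background:url(https://example.com/b.png)"></div>')
print(urls_in_tags(sample_html))
# ['http://example.com/a.jpg', 'https://example.com/b.png']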

Libraries used: Selenium (driving Chrome) -- to obtain the page source after JavaScript has executed (see the first sketch below this list).

  threading -- for multithreading, so several groups of URLs can be crawled concurrently (second sketch below).
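In isolation, the Selenium step looks roughly like this -- a sketch using the same Selenium 3-era API as the full script (newer Selenium versions changed this interface; the chromedriver path and URL are placeholders):

from selenium import webdriver

chrome_options = webdriver.chrome.options.Options()
chrome_options.add_argument('--headless')      # run Chrome without opening a window
chrome_options.add_argument('--disable-gpu')
# the chromedriver path is machine-specific -- adjust as needed
driver = webdriver.Chrome(r'C:\path\to\chromedriver.exe', chrome_options=chrome_options)
driver.set_page_load_timeout(30)
driver.get('http://example.com')
html = driver.page_source                      # source after JavaScript has run
driver.quit()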
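The thread pattern the script builds on is a Thread subclass that stores its target's return value so it can be read after join(); stripped down, it amounts to this (crawl_group and the URLs are made-up stand-ins):

import threading

class ResultThread(threading.Thread):
    """Thread that keeps its target's return value for retrieval after join()."""
    def __init__(self, func, args):
        threading.Thread.__init__(self)
        self.func, self.args = func, args

    def run(self):
        self.res = self.func(*self.args)

def crawl_group(urls):          # stand-in for the real per-group worker
    return ['processed:' + u for u in urls]

threads = [ResultThread(crawl_group, ([u],)) for u in ('http://a.com', 'http://b.com')]
for t in threads:
    t.start()
for t in threads:
    t.join()
    print(t.res)                # collect each group's result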

Code:

from urllib.parse import urljoin, urlparse
import os
import threading
from time import ctime
from selenium import webdriver
import re


class myThread(threading.Thread):
    def __init__(self, func, args, name=''):
        threading.Thread.__init__(self)
        self.name = name
        self.func = func
        self.args = args
        self.is_end = False

    def getResult(self):
        return self.res

    def run(self):
        self.res = self.func(*self.args)


def filter_in_tag(page_file, tag):
    url_in_tag = []
    url_in_tag.append('------------------%s--------------------' % (tag))
    with open(page_file, 'r', encoding='utf-8') as jj:
        for line in jj:
            ## first find every opening tag of this type
            reg = '<%s [^>]*>' % (tag)
            all_tag_str = re.findall(reg, line)
            for tag_str in all_tag_str:
                if re.search('https?://[^\'\"\)]+', tag_str):
                    url_in_tag.extend(re.findall('https?://[^\'\"]+', tag_str))
    return url_in_tag


def process(m_url):
    imgs, big_files, hrefs = [], [], []
    ## find the images first
    ## add options so Chrome runs headless (no window)
    chrome_options = webdriver.chrome.options.Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe',
                              chrome_options=chrome_options)
    ##driver = webdriver.PhantomJS(executable_path='/bin/phantomjs/bin/phantomjs')  # if setting the PATH is inconvenient, an absolute path to phantomjs works too
    driver.set_page_load_timeout(30)
    try:
        driver.get(m_url)
    except Exception as e:  ## (HTTPError, URLError, UnicodeDecodeError, WindowsError) as e:
        err_info = 'url open error: %s\n, reason: %s\n' % (m_url, e)
        print(err_info)
        err_log.write(err_info)
        return []
    imgs.append('------------------<img src=>-----------------')
    for x in driver.find_elements_by_tag_name("img"):
        imgs.append(x.get_attribute('src'))
    # find all links inside div and li tags
    with open('tmp_page_source.html', 'w', encoding='utf-8') as tmp_f:
        tmp_f.write(driver.page_source)
    for tag in ('li', 'div'):
        imgs.extend(filter_in_tag('tmp_page_source.html', tag))
    ## de-duplicate the list
    imgs_uniq = []
    for url in imgs:
        if (url not in imgs_uniq) and (url):  ## url not yet in the new list and not empty
            imgs_uniq.append(url)
    ## look through the page's <a> links for large files and further pages
    links = [a_link.get_attribute('href') for a_link in driver.find_elements_by_tag_name('a')
             if a_link.get_attribute('href')]
    driver.quit()
    for link in links:
        host = urlparse(m_url).netloc.split('@')[-1].split(':')[0]
        dom = '.'.join(host.split('.')[-2:])
        if link.startswith('mailto:'):
            continue
        if not link.startswith('http'):
            link = urljoin(m_url, link)
        f_name = urlparse(link).path.split('/')[-1]
        f_type = os.path.splitext(f_name)[1]
        if f_type not in ('.htm', '.html', '.shtml', ''):
            big_files.append(link)
            continue
        if link in seen_links:
            pass  # print(link, '--already processed, skip.')
        else:
            if dom not in link:
                pass  # print(link, '--not in this domain, skip.')
            else:
                hrefs.append(link)
                seen_links.append(link)
    return imgs_uniq, big_files, hrefs


## analyse the output of process() and produce statistics such as:
## images: 100, HTTP share: 80%, counts per extension under HTTP: jpg-50, gif-30
## large files: 10, HTTP share: 100%, counts per extension under HTTP: pdf-10
def ret_analyse(url_list):
    to_len = len(url_list)  ## includes 3 marker lines that are not urls
    http_list = [url for url in url_list if url.startswith("http://")]
    http_perc = '%.1f%%' % (len(http_list) / to_len * 100) if to_len > 0 else '0'
    exts_dict = {}
    for url in url_list:
        if url.startswith('-----------'):  ## skip markers such as '-------img:src-----'
            continue
        f_name = urlparse(url).path.split('/')[-1]
        f_type = os.path.splitext(f_name)[1]
        if f_type not in exts_dict:
            exts_dict[f_type] = 1
        else:
            exts_dict[f_type] += 1
    return to_len, http_perc, exts_dict


## run process() over one group of urls and write the results to text files
def group_proc(url_f, urls, is_analyse):
    links = []  ## collects the page's <a> links other than large files

    ## helper that writes lines to the layer log
    def wLog(*lines):
        for line in lines:
            try:
                url_f.write(line + '\n')
            except Exception as e:
                print('write error, line: %s, err: %s' % (line, e))

    for url in urls:
        proc_ret = process(url)
        if proc_ret:
            img_list, bigfile_list, link_list = proc_ret
            wLog('*' * 40, 'from: ', url)  # separator line + header line
            if is_analyse:
                img_output = 'images: %d, HTTP share: %s, counts per extension under HTTP: %s' % (
                    ret_analyse(img_list)[0] - 3, ret_analyse(img_list)[1],
                    ret_analyse(img_list)[2])  ## img_list carries 3 marker lines
                big_output = 'large files: %d, HTTP share: %s, counts per extension under HTTP: %s' % (
                    ret_analyse(bigfile_list))
                wLog(img_output, big_output)
            img_list = '\n'.join(img_list)
            bigfile_list = '\n'.join(bigfile_list)
            wLog('imgs:', img_list, 'bigfiles:', bigfile_list, '*' * 40)
            imgs_f.write(img_list + '\n')
            if bigfile_list:
                bigfiles_f.write(bigfile_list + '\n')
            if link_list:
                links.extend(link_list)
    return links


def main(depth):
    u_file = open('urls.txt', 'r')
    links = [line.strip('\n') for line in u_file]
    links = [link if link.startswith('http') else 'http://' + link for link in links]
    u_file.close()
    for i in range(depth):
        is_analyse = True if i == 0 else False  ## only the first layer is analysed statistically
        url_f = open('layer' + str(i) + '.txt', 'w')
        next_links = []
        if not links:
            break
        else:
            print('crawling layer %d...' % (i))
            ## split the links into 5 groups
            avg = len(links) // 5
            links_grp = []
            if avg == 0:
                grp_len = len(links)
                for j in range(grp_len):
                    links_grp.append([links[j]])
            else:
                grp_len = 5
                links_grp = (links[:avg], links[avg:avg * 2], links[avg * 2:avg * 3],
                             links[avg * 3:avg * 4], links[avg * 4:])
            # for j in range(grp_len):
            #     url_f.write('link_group %d: %s' % (j, links_grp[j]))
            ## create 5 threads, one per group of urls
            threads = []
            for j in range(grp_len):
                t = myThread(group_proc, (url_f, links_grp[j], is_analyse), group_proc.__name__)
                threads.append(t)
            ## start the threads together
            for j in range(grp_len):
                print('thread %d started at %s' % (j, ctime()))
                threads[j].setDaemon(True)
                threads[j].start()
            ## wait for the threads to finish, then feed each group's harvested links into the next layer
            for j in range(grp_len):
                threads[j].join()
                print('thread %d finished at %s' % (j, ctime()))
                ret_links = threads[j].getResult()
                next_links.extend(ret_links)
            links = next_links
        url_f.close()


if __name__ == '__main__':
    seen_links = []
    imgs_f = open('圖片.txt', 'w', encoding='utf-8')
    bigfiles_f = open('大檔案.txt', 'w', encoding='utf-8')
    err_log = open('err_log.txt', 'w', encoding='utf-8')
    depth = int(input('enter crawl depth: '))
    main(depth)
    err_log.close()
    imgs_f.close()
    bigfiles_f.close()
    input('press any key to exit...')
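To run the script, put one site per line in urls.txt (a missing http:// prefix is added automatically) and enter a crawl depth when prompted. Each layer's log is written to layer0.txt, layer1.txt, and so on; collected image URLs go to 圖片.txt, non-HTML links to 大檔案.txt, and pages that failed to load to err_log.txt.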