1. 程式人生 > >Python-爬取校花網視訊(單執行緒和多執行緒版本)

Python-爬取校花網視訊(單執行緒和多執行緒版本)

一、參考文章

    python爬蟲爬取校花網視訊,單執行緒爬取

    爬蟲----爬取校花網視訊,包含多執行緒版本

    上述兩篇文章都是對校花網視訊的爬取,由於時間相隔很久了,校花網上的一些視訊已經不存在了,因此上述文章中的程式碼在執行時會出現一些異常。本篇文章主要是對上述文章中的程式碼進行了優化和異常處理,在此做筆記記錄,方便以後查閱,修改如下:

1、新增的異常處理位於下方程式碼中 save() 函式的 try/except 區塊(原文以紅色標示,轉載後顏色已遺失)

二、單執行緒版本

 1 #-*- coding=utf-8 -*-
 2 import re
 3 import
requests 4 import hashlib 5 import time 6 import os 7 8 header = { 9 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36', 10 'Referer':'http://www.xiaohuar.com' 11 } 12 13 def get_index(url):
14 respose = requests.get(url, headers = header) 15 if respose.status_code == 200: 16 return respose.text 17 18 def parse_index(res): 19 urls = re.findall(r'class="items".*?href="(.*?)"', res, re.S) # re.S 把文字資訊轉換成1行匹配 20 return urls 21 22 23 def get_detail(urls):
24 for url in urls: 25 if not url.startswith('http'): 26 url='http://www.xiaohuar.com%s' %url 27 result = requests.get(url, headers = header) 28 if result.status_code == 200 : 29 mp4_url_list = re.findall(r'id="media".*?src="(.*?)"', result.text, re.S) 30 if mp4_url_list: 31 mp4_url = mp4_url_list[0] 32 save(mp4_url) 33 34 path = os.getcwd() + '/video/' 35 36 def save(url): 37 try:#下載視訊加異常處理 38 video = requests.get(url, headers = header) 39 except requests.exceptions.RequestException as e : 40 print(repr(e)) 41 return 42 43 if video.status_code == 200: 44 m = hashlib.md5() 45 m.update(url.encode('utf-8')) 46 m.update(str(time.time()).encode('utf-8')) 47 filename = r'%s.mp4' % m.hexdigest() 48 filepath = path + filename 49 print(filepath) 50 with open(filepath, 'wb') as f: 51 f.write(video.content) 52 else: 53 print(f'視訊不存在了:{url}') 54 55 def main(): 56 for i in range(5): 57 res1 = get_index('http://www.xiaohuar.com/list-3-%s.html' % i )#拿第一頁資料 58 res2 = parse_index(res1)#提取第一頁上的所有url 59 get_detail(res2)#下載url集合上的視訊 60 61 if __name__ == '__main__': 62 main()

三、多執行緒版本

 1 #-*- coding=utf-8 -*-
 2 # 非同步,多執行緒優化下載速度
 3 
 4 import requests
 5 import re
 6 import os
 7 import hashlib,time
 8 from concurrent.futures import ThreadPoolExecutor
 9 
10 p = ThreadPoolExecutor(30)
11  
12 def get_index(url):
13     response = requests.get(url)
14     if response.status_code == 200:
15         return response.text
16 
17 def parse_index(res):
18     res = res.result()
19     urls = re.findall(r'class="items".*?href="(.*?)"', res, re.S)
20     
21     p.submit(get_detail, urls)
22 
23 def get_detail(urls):
24     for url in urls:
25         if not url.startswith('http'):
26             url='http://www.xiaohuar.com%s' %url
27         r1=requests.get(url)
28         if r1.status_code == 200:
29             url_list=re.findall(r'id="media".*?src="(.*?)"', r1.text, re.S)
30             if url_list:
31                 mp4_url = url_list[0]
32                 save(mp4_url)
33 
34 path = os.getcwd() + '/video_mutil/'
35 if not os.path.exists(path):
36     os.makedirs(path)
37 
38 def save(url):
39     try:#下載視訊做異常處理,視訊可能不存在了
40         r2 = requests.get(url)
41     except requests.exceptions.RequestException as e :
42         print(repr(e))
43         return
44 
45     if r2.status_code == 200:
46         m=hashlib.md5()
47         m.update(url.encode('utf-8'))
48         m.update(str(time.time()).encode('utf-8'))
49         filename = '%s.mp4' %m.hexdigest()
50         file_path = path + filename
51         with open(file_path,'wb') as f:
52             f.write(r2.content)
53         print('視訊下載完成:%s' % file_path)
54     else:
55         print(f'視訊不存在了:{url}')
56 
57 def main():
58     for i in range(5):
59         p.submit(get_index, 'http://www.xiaohuar.com/list-3-%s.html' % i).add_done_callback(parse_index)
60 
61 if __name__ == '__main__':
62     main()

四、資源下載

    資源下載地址:Python爬取校花網視訊-單執行緒和多執行緒版本

 

轉載宣告:本站文章無特別說明,皆為原創,版權所有,轉載請註明:朝十晚八