程式人生 > 百思不得姐視訊爬取

百思不得姐視訊爬取

# -*- coding:utf-8 -*-
from Tkinter import *
from ScrolledText import ScrolledText
import os
import re
import sys
import threading
import urllib

import requests
# Shared work queue of [title, mp4_url] pairs: get() appends, write() drains.
url_name = []
# Number of the next listing page to request; advanced by get().
a = 1

# Directory (relative to the working directory) that downloads are saved in.
_VIDEO_DIR = 'video'

# Patterns are compiled once here instead of on every call / loop iteration.
_BLOCK_RE = re.compile(r'<div class="j-r-list-c">.*?</div>.*?</div>', re.S)
_MP4_RE = re.compile(r'data-mp4="(.*?)">')
_TITLE_RE = re.compile(r'<a href="/detail-.{8}?.html">(.*?)</\w', re.S)
# Path separators, characters Windows rejects in file names, and line breaks
# (the title pattern uses re.S, so a captured title may span several lines).
_UNSAFE_FILENAME_RE = re.compile(r'[\\/:*?"<>|\r\n\t]+')


def get():
    """Fetch the next listing page and queue every video found on it.

    Requests page number ``a`` of the listing, shows the page number in the
    status label, increments ``a``, and appends one ``[title, mp4_url]`` pair
    to the module-level ``url_name`` list for each title/URL pair matched in
    the page's content blocks. Each pair is also printed to stdout.

    Returns:
        The module-level ``url_name`` list (the same object on every call).
    """
    global a  # page counter shared across calls
    hd = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/45.0.2454.101 Safari/537.36'
    }
    url = 'http://www.budejie.com/video/' + str(a)
    varl.set('已經獲取到第%s頁視訊' % (a))
    html = requests.get(url, headers=hd).text
    a += 1

    for block in _BLOCK_RE.findall(html):
        video_urls = _MP4_RE.findall(block)
        if not video_urls:
            # This content block carries no mp4 address; nothing to queue.
            continue
        titles = _TITLE_RE.findall(block)
        # Pair each title with its video address.
        for title, video_url in zip(titles, video_urls):
            url_name.append([title, video_url])
            print('%s %s' % (title, video_url))
    return url_name


# Ordinal of the next video to download (1-based).
# NOTE: this shadows the builtin id(); the name is kept so that anything
# referring to the existing module-level global keeps working.
id = 1


def _safe_filename(title, fallback):
    """Return ``title`` made safe for use as a file name, or ``fallback``.

    Runs of path separators, characters that are invalid in Windows file
    names, and line breaks/tabs are replaced by a single underscore.
    ``fallback`` is returned when nothing usable remains.
    """
    cleaned = _UNSAFE_FILENAME_RE.sub('_', title).strip()
    return cleaned or fallback


def write():
    """Download queued videos page by page and log each one in the text box.

    Keeps calling get() while fewer than nine videos have been downloaded
    (``id < 10``). The limit is checked between pages only, so every video
    queued from a fetched page is downloaded before the loop stops. Each
    video is saved as ``<title>.mp4`` inside the ``video`` directory, which
    is created if it does not exist yet. When the loop ends the status label
    is set to the "finished" message.

    NOTE(review): this runs on the thread created by start() and touches Tk
    widgets (``text``, ``varl``) from there; Tkinter is not generally
    thread-safe, so consider routing these updates through ``root.after``.
    """
    global id
    if not os.path.isdir(_VIDEO_DIR):
        os.makedirs(_VIDEO_DIR)

    while id < 10:
        pending = get()
        # Drain the queue from the front. Popping inside a ``for`` loop over
        # the same list (as before) skipped every other entry and left stale
        # entries behind for the next page.
        while pending:
            title, video_url = pending.pop(0)
            filename = _safe_filename(title, 'video_%d' % id) + '.mp4'
            urllib.urlretrieve(video_url, os.path.join(_VIDEO_DIR, filename))
            text.insert(END, str(id) + '.' + video_url + '\n' + title + '\n')
            id += 1
    varl.set('抓取完畢')


def start():
    """Button callback: run write() on a new thread.

    Presumably done so the downloads do not block the Tk main loop.
    """
    th = threading.Thread(target=write)
    th.start()


root = Tk()
root.title('視訊爬取')
root.geometry('+400+100')  # window position on screen
text = ScrolledText(root, font=('微軟雅黑', 10))
text.grid()
button = Button(root, text='開始爬取', font=('微軟雅黑', 10), fg='blue', command=start)
button.grid()
varl = StringVar()
label = Label(root, font=('微軟雅黑', 10), fg='black', textvariable=varl)
label.grid()
varl.set('已準備...')
root.mainloop()