更新版-基於python3實現的抓取騰訊視頻所有電影的爬蟲-親測可用
阿新 • • 發佈:2018-07-18
地址 找到 換行 download 8.0 txt nload details nexus
本人Python小白一枚(妹),大家都說爬蟲是python入門必學,找了幾個實例,無奈無法運行,猜測可能是技術更新太快,有些已經不適用了。本著學習的決心,試試能不能調通。
原貼地址 https://blog.csdn.net/zhongqi2513/article/details/76896352
下面是我改動的,已經面目全非了,但是親測可以通過。
也可以看我GitHub
# -*- coding: utf-8 -*-
import re
import requests
from bs4 import BeautifulSoup
import string, time
import pymongo

NUM = 0          # global: running count of movies written so far
#m_type = u''    # global: movie category (now passed as a parameter instead)
m_site = u'qq'   # global: source site tag stored with every record


def getHTML(url):
    """Fetch *url* and return the decoded page text, or "" on any network/HTTP failure."""
    try:
        head = {'User-Agent': 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19'}
        r = requests.get(url, headers=head, timeout=30)
        r.raise_for_status()
        # apparent_encoding sniffs the real charset; the declared one is often wrong
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Narrowed from a bare except: only swallow network/HTTP errors,
        # so Ctrl-C and programming errors still surface.
        return ""


# Example of one parsed category tuple: (subtype id, category name)
tag = ('1', '劇情')


def getMovieTypeList(url, html):
    """Parse the category filter bar out of *html*.

    Returns a dict mapping category name -> subtype id string,
    e.g. {'劇情': '1', ...}. Returns {} when nothing matches.
    """
    # Fix: initialize up front — the original only assigned tags_type inside
    # the "if tags:" branch and raised NameError on the empty-match path.
    tags_type = {}
    soup = BeautifulSoup(html, 'html.parser')
    # Categories live under <div class="filter_content">.
    tags_all = soup.find_all('div', {'class': 'filter_content'})
    if not tags_all:
        # Fix: original indexed tags_all[0] unconditionally (IndexError on miss).
        print("Not Find")
        return tags_type
    # Matches e.g.:
    # <a _stat2="filter:params|subtype=1" class="item" href="?offset=0&subtype=1">劇情</a>
    # group 1 = subtype id, group 2 = category name.
    re_tags = r'<a _stat2=".*?subtype=.*?" class="item" href=".*?;subtype=(.*?)">(.*?)</a>'
    p = re.compile(re_tags)
    tags = p.findall(str(tags_all[0]))
    if tags:
        for subtype_id, type_name in tags:
            print('m_type = ', type_name)
            tags_type[type_name] = subtype_id
            print('tags_type[m_type] = ', tags_type[type_name])
    else:
        print("Not Find")
    return tags_type


def get_pages(url, tag_type):
    """Return the page count for category *tag_type* (str subtype id).

    May return an int (1) or a digit string scraped from the pager;
    callers wrap the result in int().
    """
    tag_url = url + '?offset=0' + '&subtype=' + tag_type
    tag_html = getHTML(tag_url)
    soup = BeautifulSoup(tag_html, 'html.parser')
    # The pager lives under <div class="mod_pages">; a single-page category
    # has no pager div at all.
    div_page = soup.find_all('div', {'class': 'mod_pages'})
    if not div_page:
        return 1
    # Matches e.g.:
    # <a _stat2="paging_page|63" class="page_num" href="?subtype=2&offset=1860">63</a>
    re_pages = r'<a _stat2=".*?" class=".*?" href=".*?">(.*?)</a>'
    pages = re.compile(re_pages).findall(str(div_page[0]))
    if len(pages) > 1:
        # The last link is the "next" arrow; the one before it is the max page number.
        return pages[-2]
    return 1


def getmovielist(m_type, html):
    """Extract every movie (title, url) from one listing page *html* and
    append them to qqMovies.txt. Increments the global counter NUM.
    """
    global NUM
    global m_site
    soup = BeautifulSoup(html, 'html.parser')
    # Movie cards live under <ul class="figures_list">.
    divs = soup.find_all('ul', {'class': 'figures_list'})
    if not divs:
        # Fix: original crashed on divs[0] when the page had no list at all.
        print("Not Found")
        return
    # Matches e.g.:
    # <a _stat2="videos:title" href="https://v.qq.com/x/cover/xxx.html" target="_blank" title="二十二">二十二</a>
    re_movie = r'<a _stat2="videos:title" href="(.*?)" target=".*?" title=".*?">(.*?)</a>'
    # re.DOTALL lets '.' also match newlines inside the markup.
    movies = re.compile(re_movie, re.DOTALL).findall(str(divs[0]))
    if not movies:
        # Fix: original's `else: "Not Found"` was a no-op string, then it
        # hit f.write/f.close with f possibly never opened (NameError).
        print("Not Found")
        return
    # Fix: open the output file ONCE per page with a context manager — the
    # original re-opened it inside the loop and leaked every handle but the last.
    with open('qqMovies.txt', 'a') as f:
        for movie_url, movie_title in movies:
            NUM += 1
            print('downloading movies: %d' % NUM)
            values = 'movie_title: %s , movie_url: %s ,movie_site: %s ,movie_type:%s' % (movie_title, movie_url, m_site, m_type)
            print(values)
            f.write(values)
            print("_" * 70)
            f.write('\n' + "_" * 70 + '\n')   # record separator
        f.write('total movies: %s \n' % str(NUM))   # running total after each page


if __name__ == "__main__":
    url = 'http://v.qq.com/x/list/movie'
    html = getHTML(url)
    movie_type = getMovieTypeList(url, html)
    print('movie_type = ', movie_type)
    for type_name, subtype in movie_type.items():
        tag_url = url + '?subtype=' + subtype + '&offset=0'
        print('tag_url = %s' % tag_url, end='')
        maxpage = int(get_pages(url, str(subtype)))
        print(', total pages are ', maxpage)
        for page_idx in range(0, maxpage):
            # Fix: build the offset URL explicitly. The original did
            # tag_url.replace('0', '') + str(x*30), which strips EVERY '0'
            # in the URL and corrupted any subtype containing a zero
            # (e.g. subtype=10 became subtype=1).
            page_url = url + '?subtype=' + subtype + '&offset=' + str(page_idx * 30)
            page_html = getHTML(page_url)
            getmovielist(type_name, page_html)
            time.sleep(0.1)   # throttle to avoid an IP ban
更新版-基於python3實現的抓取騰訊視頻所有電影的爬蟲-親測可用