【python筆記】騰訊動漫爬取(更新)
阿新 • • 發佈:2018-12-20
目前騰訊動漫閱讀介面的滑動必須作用在漫畫圖片所在的 div(id 為 mainView)上,而不是整個視窗,也就是要執行
'var q=document.getElementById("mainView").scrollTop ='+str(i*3000)
才可以滑動
"""Download chapters 11-19 of comic 623537 from ac.qq.com as JPEG files.

Each chapter page is opened in a Selenium-driven Chrome window, the reader
container is scrolled step by step, and the image URLs found in the resulting
page source are downloaded into ``IMG_ROOT/<title>/<heading>/<n>.jpg``.
"""

import os
import random
import re
import time
import urllib.request
from io import BytesIO

import requests
import selenium.webdriver
from lxml import etree  # noqa: F401  (not used below; kept from the original import list)
from PIL import Image

# Browser-like User-Agent as a (name, value) pair, the form urllib's
# ``opener.addheaders`` expects.
headers = ("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)"
           " Chrome/65.0.3325.146 Safari/537.36")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)

COMIC_ID = 623537
CHAPTER_IDS = range(11, 20)
CHAPTER_URL = 'http://ac.qq.com/ComicView/index/id/' + str(COMIC_ID) + '/cid/'
IMG_ROOT = 'E:/python雲沉/img/'
ERROR_LOG = IMG_ROOT + 'error.txt'
# A chapter yielding fewer image URLs than this is treated as a failed scrape.
MIN_PAGES = 10
SCROLL_STEPS = 20
SCROLL_STEP_PX = 3000
REQUEST_TIMEOUT = 30  # seconds; without it a stalled download hangs forever

# lpat is not used by the download loop; it is kept so the name stays defined.
lpat = '<a href="/Comic/comicInfo/id/(.*?)"'
tpat = 'id="chapter" title="(.*?)"'
mpat = '<span class="title-comicHeading">(.*?)</span>'
# Dots are escaped so they only match a literal "." in the host and extension.
pat1 = r'img src="(https://manhua\.qpic\.cn/manhua_detail.*?\.jpg/0)" alt=""'

# Compiled once here instead of on every chapter.
_IMAGE_RE = re.compile(pat1)
_OUTER_RE = re.compile(tpat)
_INNER_RE = re.compile(mpat)
# Characters that cannot appear in a Windows file or directory name.
_UNSAFE_PATH_CHARS = re.compile(r'[\\/:*?"<>|]')

driver = selenium.webdriver.Chrome()
# The implicit wait is a session-wide setting, so it is configured once.
driver.implicitly_wait(30)


def get_ac(url):
    """Open *url* in the shared Chrome driver, scroll the reader, return the HTML.

    The element with id ``mainView`` is scrolled down in ``SCROLL_STEPS``
    increments of ``SCROLL_STEP_PX`` pixels with a random 0-3 second pause
    before each step (presumably so the page images get loaded into the DOM).

    Args:
        url: Address of the chapter page to load.

    Returns:
        The page source as reported by the driver after the last scroll.

    Raises:
        Whatever the Selenium driver raises when loading or scripting fails.
    """
    driver.get(url)
    time.sleep(1)
    for step in range(1, SCROLL_STEPS + 1):
        time.sleep(random.randint(0, 3))
        # scrollTop is set on the reader container itself rather than on the
        # window.
        driver.execute_script(
            'var q=document.getElementById("mainView").scrollTop ='
            + str(step * SCROLL_STEP_PX)
        )
    return driver.page_source


def _safe_name(text, fallback):
    """Return *text* made usable as one directory name, or *fallback* if empty."""
    cleaned = _UNSAFE_PATH_CHARS.sub('_', text).strip(' .')
    return cleaned or fallback


def _first_match(pattern, html, fallback):
    """Return the first capture of *pattern* in *html* as a safe directory name."""
    found = pattern.search(html)
    if found is None:
        return fallback
    return _safe_name(found.group(1), fallback)


def _log_short_chapter(cid, first_url):
    """Append a note about a chapter with too few pages to ``ERROR_LOG``."""
    os.makedirs(IMG_ROOT, exist_ok=True)
    # Text mode with an explicit encoding: the messages are ``str``, which a
    # file opened in binary mode refuses to write.
    with open(ERROR_LOG, 'a', encoding='utf-8') as log:
        log.write('第' + str(cid) + '話爬取出錯\n')
        log.write(first_url + '\n')


def _save_chapter(html, image_urls, cid):
    """Download every URL in *image_urls* into the folder derived from *html*.

    The two folder levels come from ``tpat`` and ``mpat`` (presumably the comic
    title and the chapter heading); when a pattern does not match, the comic id
    or chapter id is used instead so the chapter is still saved.

    Raises:
        OSError: on network, image-decoding or file-system failures.
    """
    outer = _first_match(_OUTER_RE, html, str(COMIC_ID))
    inner = _first_match(_INNER_RE, html, str(cid))
    target_dir = IMG_ROOT + outer + '/' + inner + '/'
    # The directory is the same for every page, so it is created once.
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
        print(target_dir + '建立成功')
    for page, image_url in enumerate(image_urls):
        # Send the same User-Agent that is installed for urllib above.
        response = requests.get(
            image_url, headers=dict([headers]), timeout=REQUEST_TIMEOUT
        )
        response.raise_for_status()
        img = Image.open(BytesIO(response.content))
        img.save(target_dir + str(page) + '.jpg')


try:
    for cid in CHAPTER_IDS:
        print("第" + str(cid) + "次章節爬取開始")
        try:
            data = get_ac(CHAPTER_URL + str(cid))
        except Exception as e1:
            # Best effort: report the failure and move on to the next chapter.
            print('第' + str(cid) + '話爬取出錯: ' + str(e1))
            continue
        ft = _IMAGE_RE.findall(data)
        print("該章節總計" + str(len(ft)) + "頁")
        try:
            if ft:
                if len(ft) < MIN_PAGES:
                    _log_short_chapter(cid, ft[0])
                    print('第' + str(cid) + '話爬取出錯')
                    continue
                _save_chapter(data, ft, cid)
        except OSError as e:
            # requests, Pillow and file-system errors all derive from OSError;
            # one bad chapter should not stop the remaining ones.
            print(e)
            continue
        print("第" + str(cid) + "次章節爬取結束")
finally:
    # Always close the browser, even on Ctrl-C or an unexpected error.
    driver.quit()