1. 程式人生 > >【python筆記】騰訊動漫爬取(更新)

【python筆記】騰訊動漫爬取(更新)

目前騰訊動漫閱讀介面的滑動必須作用在指定的元素上,即漫畫圖片所在的 div(id 為 mainView)。也就是說,需要透過 Selenium 執行如下的 JavaScript 來設定該 div 的 scrollTop(i 為第幾次滑動):

'var q=document.getElementById("mainView").scrollTop ='+str(i*3000)

這樣頁面才可以滑動。

import requests
import urllib.request
import re
import selenium.webdriver
import random
import time
from PIL import Image
from io import BytesIO
import os
from lxml import etree
# Browser-like User-Agent string sent with urllib.request calls.
user_agent = (
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)"
    " Chrome/65.0.3325.146 Safari/537.36"
)
headers = ("User-Agent", user_agent)

# Install a global urllib opener that carries the header above.
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)

# Single Chrome session shared by every page load in this script.
driver = selenium.webdriver.Chrome()
def get_ac(url):
    """Load *url* in the shared Chrome driver, scroll the reader, return the HTML.

    The reader pane is the element with id ``mainView``; its ``scrollTop`` is
    set to 3000, 6000, ..., 60000 (20 steps) with a random 0-3 second pause
    before each step, and the resulting page source is returned.
    """
    driver.implicitly_wait(30)
    driver.get(url)
    time.sleep(1)

    step_px = 3000
    steps = 20
    for offset in range(step_px, step_px * steps + 1, step_px):
        # Random pause before each scroll step.
        time.sleep(random.randint(0, 3))
        driver.execute_script(
            'var q=document.getElementById("mainView").scrollTop =' + str(offset)
        )

    return driver.page_source
# Regular expressions applied to the rendered page source of each chapter.
lpat = '<a href="/Comic/comicInfo/id/(.*?)"'  # kept for reference; not used below
tpat = 'id="chapter" title="(.*?)"'  # first directory level of the save path
mpat = '<span class="title-comicHeading">(.*?)</span>'  # second directory level
pat1 = 'img src="(https://manhua.qpic.cn/manhua_detail.*?.jpg/0)" alt=""'  # image URLs

# Compile once instead of on every chapter iteration.
image_re = re.compile(pat1)
title_re = re.compile(tpat)
heading_re = re.compile(mpat)

base_dir = 'E:/python雲沉/img/'
# A chapter that yields fewer image URLs than this is logged and skipped.
min_pages = 10

# Crawl chapter ids 11..19 of comic 623537: render each chapter with get_ac(),
# extract the image URLs, and save every image as <base>/<title>/<heading>/<n>.jpg.
try:
    for i in range(11, 20):
        print("第" + str(i) + "次章節爬取開始")
        url = 'http://ac.qq.com/ComicView/index/id/623537/cid/' + str(i)
        try:
            data = get_ac(url)
        except Exception as e1:
            # Best effort: report the failure and move on to the next chapter.
            # (Changing the loop variable of a range() loop has no effect, so
            # the old `i += 1` is dropped.)
            print("第" + str(i) + "次章節載入失敗: " + str(e1))
            continue

        ft = image_re.findall(data)
        tqurl = title_re.findall(data)
        mqurl = heading_re.findall(data)
        print("該章節總計" + str(len(ft)) + "頁")

        if 0 < len(ft) < min_pages:
            # Too few pages were found: record the failure and skip the chapter.
            # The log is opened in text mode because str is written to it, and
            # the message reports the chapter id that actually failed.
            os.makedirs(base_dir, exist_ok=True)
            with open(base_dir + 'error.txt', 'a', encoding='utf-8') as f:
                f.write('第' + str(i) + '話爬取出錯\n')
                f.write(ft[0] + '\n')
            print('第' + str(i) + '話爬取出錯')
            continue

        if ft:
            if not tqurl or not mqurl:
                # Without both titles the target directory cannot be built.
                print('第' + str(i) + '話找不到標題,略過')
                continue

            # The directory is the same for every page, so build it once.
            filename = base_dir + str(tqurl[0]) + '/' + str(mqurl[0]) + '/'
            if not os.path.exists(filename):
                os.makedirs(filename)
                print(filename + '建立成功')

            for j, img_url in enumerate(ft):
                filename2 = filename + str(j) + '.jpg'
                response = requests.get(img_url)
                img = Image.open(BytesIO(response.content))
                img.save(filename2)

        print("第" + str(i) + "次章節爬取結束")
except OSError as e:
    print(e)
finally:
    # Always shut the browser down so no Chrome/chromedriver process is left behind.
    driver.quit()