
Python in Practice: Scraping a PDF Book

 

This post scrapes a single book from https://max.book118.com. Strictly speaking it isn't really scraping, just downloading. It's only meant to get the ball rolling; more ambitious readers could write a crawler for every document on the site.

Take this book as an example: https://max.book118.com/html/2017/0802/125615287.shtm. Combined with a batch img2pdf pass (a sketch of that step follows the download code below), this is enough to pull down an entire book. I won't repeat the page-analysis details here; the code follows (it was written purely for my own learning, so it's a bit rough).

 

import requests
import os
import json


def savePng(url, fileName):
    # Download the image at url and save it as fileName under root,
    # creating the directory on first use and skipping files that already exist.
    root = "//home//Desktop//"
    path = root + "//" + fileName
    if not os.path.exists(root):
        os.mkdir(root)
    if not os.path.exists(path):
        r = requests.get(url)
        r.raise_for_status()
        with open(path, "wb+") as f:
            f.write(r.content)


def getPNGName(url):
    # The GetNextPage endpoint returns JSON; its "NextPage" field is the name of the next page image.
    req = requests.get(url)
    json_req = req.content.decode()
    json_dict = json.loads(json_req)
    print(json_dict)
    return json_dict["NextPage"]


def getNextPageURL(pngName):
    # NOTE: the "[email protected]" fragments below appear to be obfuscated/redacted parts of the
    # original query string; copy the full parameters from your own browser session.
    url = "https://view42.book118.com/pdf/GetNextPage/?f=dXAyMjI2LTIuYm9vazExOC5jb20uODBcMzQ4NDU0MS01OTgxMGI5MDMwM2JjLnBkZg==&img=%s&isMobile=false&isNet=True&readLimit=kVJSwRWfuu2BpuMVDJqlnw==&[email protected][email protected][email protected]==" % pngName
    return url


def getCurPageUrl(pngName):
    url = "https://view42.book118.com/img/?img=%s" % pngName
    return url


#url = getNextPageURL("[email protected]cA4m4rqRBGs=")
url = "https://view42.book118.com/pdf/GetNextPage/?f=dXAyMjI2LTIuYm9vazExOC5jb20uODBcMzQ4NDU0MS01OTgxMGI5MDMwM2JjLnBkZg==&[email protected]WAYh&isMobile=false&isNet=True&readLimit=kVJSwRWfuu2BpuMVDJqlnw==&[email protected][email protected][email protected]=="
for curPageIndex in range(0, 486):
    # Ask the server for the name of the next page image, using the current GetNextPage URL
    pngName = getPNGName(url)
    # Build the direct image URL from that name
    url = getCurPageUrl(pngName)
    # Download the PNG, naming it by page index
    savePng(url, str(curPageIndex) + ".PNG")
    # Build the GetNextPage URL for the next iteration
    url = getNextPageURL(pngName)