【爬蟲】抓取msdn.itellyou.cn所有作業系統映象下載連結
阿新 • • 發佈:2018-11-08
msdn.itellyou.cn這個網站首頁是SPA單頁應用,所有資料通過請求RESTful API來獲取,然後動態生成頁面。
通過chrome的除錯工具可以抓取到獲取資料的API介面地址,以及引數情況。
get_download_list函式中傳入的id是在首頁作業系統頁面抓到的。
下面指令碼是抓取作業系統的下載連結
#!/usr/bin/python
# -*- coding: utf-8 -*-
import re
import json
import requests
# Site root of msdn.itellyou.cn.
API_MSDN_INDEX = 'https://msdn.itellyou.cn/'

# Category endpoints used by the scraper functions in this script.
# NOTE(review): API_INDEX is the only URL using plain http while every other
# endpoint uses https -- confirm whether that is intentional.
API_INDEX = 'http://msdn.itellyou.cn/Category/Index'
API_GET_LANG = 'https://msdn.itellyou.cn/Category/GetLang'
API_GET_LIST = 'https://msdn.itellyou.cn/Category/GetList'
API_GET_PRODUCT = 'https://msdn.itellyou.cn/Category/GetProduct'

# Headers sent with every request issued by this script.
headers = {
    'Referer': 'https://msdn.itellyou.cn/',
}

# Module-level accumulator; get_download_list() appends one entry per system
# to RESULT['data'].
RESULT = {
    'data': [],
}
def get_product(id):
    """Fetch one product's download details from the GetProduct endpoint.

    Posts ``id`` to ``API_GET_PRODUCT`` and, on an OK response, prints the
    file name, post date, SHA1, size and download link found under the
    ``result`` key of the JSON reply.

    Args:
        id: Product identifier (get_list passes the ``id`` field of each
            GetList entry).

    Returns:
        The ``result`` dict from the response, or ``None`` when the request
        does not return an OK status or the reply carries no ``result``.
    """
    r = requests.post(API_GET_PRODUCT, headers=headers, data={'id': id})
    if r.status_code != requests.codes.ok:
        return None

    item = r.json().get('result')
    if item is None:
        # A reply without 'result' previously crashed on item.get(...).
        return None

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('FileName:%s' % item.get('FileName'))
    print('PostData:%s' % item.get('PostDateString'))
    print('SHA1:%s' % item.get('SHA1'))
    print('size:%s' % item.get('size'))
    print('Download:%s' % item.get('DownLoad'))
    return item
def get_list(id, lang_id):
    """Collect the product details for one system in one language.

    Posts ``id`` and ``lang_id`` (with ``filter='true'``) to ``API_GET_LIST``
    and calls ``get_product`` for every entry under the reply's ``result``
    key.

    Args:
        id: System identifier (get_lang passes its own ``id`` through).
        lang_id: Language identifier (the ``id`` field of a GetLang entry).

    Returns:
        A list with one ``get_product`` result per entry, in response order.
        Individual elements may be ``None`` when a product lookup fails. The
        list is empty when the request does not return an OK status or the
        reply has no ``result``.
    """
    r = requests.post(
        API_GET_LIST,
        headers=headers,
        data={'id': id, 'lang': lang_id, 'filter': 'true'},
    )
    if r.status_code != requests.codes.ok:
        # Always hand back a list so the saved JSON never contains null
        # where the consumer script iterates 'product_list'.
        return []

    entries = r.json().get('result') or []
    return [get_product(entry.get('id')) for entry in entries]
def get_lang(id):
    """List the languages available for one system, with their products.

    Posts ``id`` to ``API_GET_LANG``; for every entry under the reply's
    ``result`` key it prints the language name and gathers that language's
    products through ``get_list``.

    Args:
        id: System identifier (get_download_list passes the ``id`` field of
            each Category/Index entry).

    Returns:
        A list of ``{'lang': <name>, 'product_list': <list>}`` dicts, in
        response order. The list is empty when the request does not return
        an OK status or the reply has no ``result``.
    """
    r = requests.post(API_GET_LANG, headers=headers, data={'id': id})
    if r.status_code != requests.codes.ok:
        # Always hand back a list so the saved JSON never contains null
        # where the consumer script iterates 'lang_list'.
        return []

    lang_list = []
    for lang in r.json().get('result') or []:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(lang.get('lang'))
        lang_list.append({
            'lang': lang.get('lang'),
            'product_list': get_list(id, lang.get('id')),
        })
    return lang_list
def get_download_list(category_id):
    """Scrape every system of a category into the module-level ``RESULT``.

    Posts ``category_id`` to ``API_INDEX``. For each entry of the JSON reply
    it prints the system name, resolves its languages and products through
    ``get_lang``, and appends ``{'name': ..., 'lang_list': ...}`` to
    ``RESULT['data']``.

    Args:
        category_id: Identifier of the category to scrape.

    Returns:
        The module-level ``RESULT`` dict. It is returned on every path, so
        the caller always receives a dict with a ``'data'`` list even when
        the request does not return an OK status.
    """
    r = requests.post(API_INDEX, headers=headers, data={'id': category_id})
    if r.status_code == requests.codes.ok:
        for item in r.json():
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print('System Name: %s' % item.get('name'))
            lang_list = get_lang(item.get('id'))
            RESULT['data'].append({
                'name': item.get('name'),
                'lang_list': lang_list,
            })
        print('finishied!!!')
    return RESULT
if __name__ == '__main__':
    # Scrape the hard-coded category and store the whole result tree as JSON
    # in ./msdn.json in the current working directory.
    json_buffer = get_download_list('7ab5f0cb-7607-4bbe-9e88-50716dc43de6')
    with open('./msdn.json', 'w') as f:
        json.dump(json_buffer, f)
抓取效果
抓取所有作業系統的下載連結後,儲存在本地msdn.json,現在把這些作業系統的檔案,自動離線下載到百度雲網盤裡面。
通過chrome瀏覽器的開發人員除錯工具,可以抓到百度雲建立目錄以及離線下載這兩個功能的xhr請求,我們可以構造一下請求,然後讀取儲存的json檔案,實現自動離線下載
import requests,json
# Headers sent with every Baidu Pan request below. The 'Cookie' value is a
# placeholder that must be replaced before running the script.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Cookie': '這裡改成你的cookie',
    'Referer': 'https://pan.baidu.com/disk/home?',
}
def download(source_url, path):
    """Ask Baidu Pan to fetch ``source_url`` offline into ``path``.

    Posts an ``add_task`` request to the ``cloud_dl`` endpoint using the
    module-level ``headers``.

    Args:
        source_url: Link of the file to download.
        path: Destination folder, sent as ``save_path``.

    Returns:
        ``True`` when the response is OK and its JSON body has
        ``status == 0``; ``False`` otherwise, including when the body is
        not JSON.
    """
    # NOTE(review): bdstoken and logid are hard-coded in the URL and look
    # session-specific -- confirm they match the account behind the cookie.
    host = 'https://pan.baidu.com/rest/2.0/services/cloud_dl?channel=chunlei&web=1&app_id=250528&bdstoken=a1ca97d6f5763c08df76e5497c66c936&logid=MTUzNjg4NzA3MjI2MjAuNzIxMTE4MDY2NDI0NDg1Ng==&clienttype=0'
    payload = {
        'method': 'add_task',
        'app_id': '250528',
        'source_url': source_url,
        'save_path': path,
        'type': '3',
    }
    r = requests.post(host, headers=headers, data=payload)
    if r.status_code != requests.codes.ok:
        return False
    try:
        reply = r.json()
    except ValueError:
        # A non-JSON body (e.g. a verification page) is a failed task, not
        # a reason to abort the whole run.
        return False
    # NOTE(review): success is detected via a 'status' field equal to 0;
    # confirm against a live API response.
    return isinstance(reply, dict) and reply.get('status') == 0
def create(path):
    """Create the directory ``path`` on Baidu Pan.

    Posts a create request with ``isdir='1'`` and an empty ``block_list``
    to the ``api/create`` endpoint using the module-level ``headers``.

    Args:
        path: Remote directory path to create.

    Returns:
        ``path`` when the response is OK and its JSON body has
        ``status == 0``; ``False`` otherwise, including when the body is
        not JSON.
    """
    # NOTE(review): bdstoken and logid are hard-coded in the URL and look
    # session-specific -- confirm they match the account behind the cookie.
    host = 'https://pan.baidu.com/api/create?a=commit&channel=chunlei&web=1&app_id=250528&bdstoken=a1ca97d6f5763c08df76e5497c66c936&logid=MTUzNjg4NzMyODE2MzAuMTU4NTI4OTI3NDUxNDY1ODU=&clienttype=0'
    payload = {
        'path': path,
        'isdir': '1',
        'block_list': '[]',
    }
    r = requests.post(host, headers=headers, data=payload)
    if r.status_code != requests.codes.ok:
        return False
    try:
        reply = r.json()
    except ValueError:
        # A non-JSON body (e.g. a verification page) counts as failure.
        return False
    # NOTE(review): success is detected via a 'status' field equal to 0;
    # confirm against a live API response.
    if isinstance(reply, dict) and reply.get('status') == 0:
        return path
    return False
def main():
    """Replay ./msdn.json into Baidu Pan offline-download tasks.

    Reads the ``'data'`` list from ``./msdn.json``. For every system and
    language it creates the folder ``/msdn_itellyou/<system>/<language>``
    and, once that succeeds, submits one offline download per product,
    printing a line for each success.
    """
    with open('./msdn.json', 'r') as f:
        systems = json.load(f)['data']

    for system in systems:
        # 'or []' tolerates null lists left behind by failed scraper calls.
        for lang in system['lang_list'] or []:
            tree_folder = '/msdn_itellyou/%s/%s' % (system['name'], lang['lang'])
            if not create(tree_folder):
                # No destination folder, so nothing to download into.
                continue
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print('Create folder %s success!!' % tree_folder)
            for product in lang['product_list'] or []:
                # Entries may be null when a product lookup failed.
                if product and download(product['DownLoad'], tree_folder):
                    print('Download ISO %s success!!' % product['FileName'])
if __name__ == '__main__':
    # Run the offline-download driver only when executed as a script.
    main()
由於百度雲的離線下載功能有驗證,頻繁呼叫離線下載的API會讓你輸入驗證碼。到此不再深入。