1. 程式人生 > >Python獲取個人網站的所有課程下載鏈接和密碼,並保存到Mongodb中

Python獲取個人網站的所有課程下載鏈接和密碼,並保存到Mongodb中

one find() net agent play col pat 進行 jpg

1、獲取網站課程的分類地址;

技術分享
'''
Crawl the diaosiweb.net homepage and collect every category's name and link.
'''

import requests
from lxml import etree

# Browser-like User-Agent so the site does not reject the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36',
}

def get_class_data():
    """Scrape the index page and return a list of category dicts.

    Each dict has keys 類別名稱 (category name) and 類別鏈接 (category link).
    Only directory-style links (ending in '/') are kept; links pointing at a
    concrete file are skipped.
    """
    list_data = []
    url = 'http://www.diaosiweb.net/index.html'
    responese = requests.get(url, headers=headers)
    responese.encoding = responese.apparent_encoding
    # Parse the page once and reuse the tree for both xpath queries.
    tree = etree.HTML(responese.text)
    class_names = tree.xpath('//div[@id="menu"]/div/ul/li/a/text()')
    # NOTE(review): the attribute selector was garbled in the source;
    # '/@href' reconstructed from the surrounding '/li/a/text()' query.
    class_links = tree.xpath('//div[@id="menu"]/div/ul/li/a/@href')
    for class_name, class_link in zip(class_names, class_links):
        # A link whose last path segment is empty ends in '/', i.e. a category
        # directory rather than a single page.
        if len(class_link.split('/')[-1]) == 0:
            class_data = {
                '類別名稱': class_name,
                '類別鏈接': class_link,
            }
            list_data.append(class_data)
    return list_data
View Code

2、通過上面獲取的地址來獲取所有的每個分類下的所有課程名稱、鏈接和發布時間,並保存到Mongodb中去;

技術分享
'''
For every category url, collect each course's name and link, then follow the
course link to fetch its download url and password.
'''

from spiders_diaosi import get_class_data
import requests
from lxml import etree
import pymongo
from multiprocessing import Pool

# Browser-like User-Agent so the site does not reject the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36',
}
# MongoDB: database 'kecheng_message', collection 'message' holds course rows.
client = pymongo.MongoClient('localhost', 27017)
diaosi = client['kecheng_message']
kecheng_message = diaosi['message']

def get_kecheng_data(url):
    """Scrape one listing page: course name, link and publish date.

    Each course is stored as one document in the ``kecheng_message``
    MongoDB collection, from which the later login/scrape stage reads.
    Network or parse errors are printed and swallowed so one bad page
    does not stop the whole crawl.
    """
    try:
        response = requests.get(url, headers=headers)
        response.encoding = response.apparent_encoding
        # Parse the page once instead of re-parsing for every xpath.
        tree = etree.HTML(response.text)
        kecheng_names = tree.xpath('//ul[@class="g-list1"]/li/a/text()')
        # NOTE(review): attribute selector was garbled in the source;
        # '/@href' reconstructed from the sibling text() query.
        kecheng_links = tree.xpath('//ul[@class="g-list1"]/li/a/@href')
        times = tree.xpath('//ul[@class="g-list1"]/li/span/text()')
        for kecheng_name, kecheng_link, time in zip(kecheng_names, kecheng_links, times):
            data = {
                '課程名稱': kecheng_name,
                '課程鏈接': kecheng_link,
                '發布時間': time,
            }
            # insert_one() replaces the long-deprecated Collection.insert().
            kecheng_message.insert_one(data)
    except Exception as e:
        print(e)

def get_max_page(url):
    """Return the number of listing pages for a category.

    Reads the first <strong> inside the span.pageinfo pagination widget,
    which the site uses for the total page count.
    """
    page_response = requests.get(url, headers=headers)
    page_num = int(etree.HTML(page_response.text).xpath('//span[@class="pageinfo"]/strong[1]/text()')[0])
    return page_num

def get_class_id(url):
    """Crawl every listing page of one category.

    Multi-page categories use urls of the form ``{base}list_{id}_{page}.html``;
    the numeric category id is recovered from the last pagination link.
    Single-page categories are scraped directly.
    """
    class_response = requests.get(url, headers=headers)
    class_response.encoding = class_response.apparent_encoding
    # Hoisted: the original called get_max_page() twice, costing an extra
    # HTTP request per category.
    max_page = get_max_page(url)
    if max_page != 1:
        # NOTE(review): attribute selector was garbled in the source;
        # '/@href' reconstructed — pagination links look like 'list_<id>_<n>.html'.
        class_id = int(etree.HTML(class_response.text).xpath('//ul[@class="pagelist"]/li/a/@href')[-1].split('_')[1])
        for num in range(1, max_page + 1):
            new_url = '{}list_{}_{}.html'.format(url, class_id, num)
            get_kecheng_data(new_url)
    else:
        get_kecheng_data(url)

# Read the category links collected earlier and crawl every course in each.
for link in get_class_data():
    url = link['類別鏈接']
    print('開始爬取:' + link['類別名稱'])
    get_class_id(url)
    print('已經爬完了:' + link['類別名稱'])
View Code

3、從數據庫中讀取每個課程的鏈接。因為下載地址只有登入之後才可以看到,所以模擬登入之後進行獲取,並保存到Mongodb中去。

技術分享
from get_captcha import get_capthca
import pymongo
import re
import requests
from lxml import etree
import random

# Collections: 'message' holds the course links scraped earlier,
# 'dow_message' receives the download url + password documents.
client = pymongo.MongoClient('localhost', 27017)
diaosi = client['kecheng_message']
kecheng_message = diaosi['message']
dow_message = diaosi['dow_message']

login_url = 'http://www.diaosiweb.net/member/index.php'
# A small pool of desktop User-Agents; one is picked at random per run.
headers_data = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0',
]
headers = {'User-Agent': random.choice(headers_data)}
# Login form fields expected by the DedeCMS member login endpoint.
data = {
    'fmdo': 'login',
    'dopost': 'login',
    'gourl': '',
    'userid': '***',      # put your username here (or read it with input())
    'pwd': '****',        # put your password here (or read it with input())
    'vdcode': '',         # captcha, filled in below
    'keeptime': '604800', # stay logged in for 7 days
}

# Save the captcha image locally, then ask the user to transcribe it.
get_capthca(login_url)
captcha = input('輸入你看到的驗證碼:')
data['vdcode'] = captcha

session = requests.Session()
session.headers.update(headers)

# BUG FIX: the login form must be POSTed; the original used session.get(),
# which does not submit the form payload, so the session was never logged in.
login_response = session.post(login_url, data=data)

for link in kecheng_message.find():
    html = session.get(link['課程鏈接'])
    html.encoding = html.apparent_encoding
    # The download link is hidden in an invisible div that only renders for
    # logged-in members.
    dow_url = re.compile("<div id='pan' style=\"display:none;\">(.*?)</div>").findall(html.text)[0]
    mima = etree.HTML(html.text).xpath('//span[@style]/text()')
    # Renamed from 'data' so the login form dict above is not clobbered.
    record = {
        'name': link['課程名稱'],
        'link': link['課程鏈接'],
        'dow_url': dow_url,
    }
    try:
        # Heuristic: too few/too many styled spans, or the last one not
        # mentioning 網盤提取密碼, means the course page carries no password.
        if len(mima) == 0 or len(mima) > 5 and '網盤提取密碼' not in mima[-1].split(':'):
            record['mima'] = '沒有密碼'
        else:
            record['mima'] = mima
        # insert_one() replaces the long-deprecated Collection.insert().
        dow_message.insert_one(record)
        print(record)
    except Exception as e:
        print(e)
        print(link['課程名稱'])
View Code

下面是獲取網頁驗證碼的代碼。

技術分享
'''
Fetch the login page's captcha image and save it locally.
(For now it is only saved to disk; automatic recognition may come later.)
'''


import requests
from lxml import etree
import os

login_url = 'http://www.diaosiweb.net/member/index.php'
# Browser-like User-Agent so the site does not reject the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36',
}

def get_capthca(url):
    """Download the captcha shown on the login page to ./captcha.jpg.

    The <img id="vdimgck"> src is relative ('../...'); the '..' prefix is
    stripped before joining it onto the site root.
    """
    login_response = requests.get(url, headers=headers)
    # NOTE(review): attribute selector was garbled in the source;
    # '/@src' reconstructed — an <img> lookup can only want its src.
    image_url = 'http://www.diaosiweb.net' + etree.HTML(login_response.text).xpath('//img[@id="vdimgck"]/@src')[0].replace('..', '')
    image_response = requests.get(image_url).content
    # 'with' closes the file; the original's explicit f.close() was redundant.
    with open('captcha.jpg', 'wb') as f:
        f.write(image_response)
    print('驗證碼已經保存到:{}'.format(os.getcwd()))
View Code

恩,這樣差不多就完成了一個爬蟲項目了,因為是第一次完整的爬取,所以寫的比較亂,也沒有思維圖,也知道有很多地方不完善,但是發懶筋了,不想寫了,先這樣吧!

Python獲取個人網站的所有課程下載鏈接和密碼,並保存到Mongodb中