
Python crawler for downloading magnet links from a website

The design has three steps:

1. Get the page URL of each star from the star list

2. Get the serial numbers of each star's works

3. Look up magnet links by serial number
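Chained together, the three steps look like this (a minimal sketch with hypothetical function names; the real scripts below use one class per step and pass data between steps through Excel files instead):

#coding=utf8
# Minimal sketch of the pipeline (hypothetical function names; the real
# scripts below pass data between the steps through Excel files instead)
def crawl_star_urls(pages):
    """Step 1: walk the star list pages, return each star's page URL."""
    return []

def crawl_serials(star_urls):
    """Step 2: visit each star page, collect the serial number of every work."""
    return []

def crawl_magnets(serials):
    """Step 3: search each serial number, return its magnet links."""
    return []

magnets = crawl_magnets(crawl_serials(crawl_star_urls(pages=2)))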

Step 1: Get each star's works-page URL from the site's star list

#coding=utf8
import requests
import re
import xlwt
import time
from bs4 import BeautifulSoup

# Create an Excel workbook to store the scraped data

myfile=xlwt.Workbook()
table=myfile.add_sheet(u"Info",cell_overwrite_ok=True)
table.write(0,0,u"Name")
table.write(0,1,u"Link")

user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 '
headers = { 'User-Agent' : user_agent }

class geturl():
    def __init__(self,page):
        self.page = page
    
    def get_url(self):

        for p in range(1,self.page+1):
            url = 'https://avso.pw/cn/actresses/page/'+str(p)
            r = requests.get(url,headers=headers)
            html = r.text
            #print html

            
            soup = BeautifulSoup(html, 'html.parser')   # specify the parser explicitly
            
            # 50 stars per list page; row 0 holds the header
            i = (p-1)*50 + 1
            for tag in soup.find_all(href=re.compile("https://avso.pw/cn/star")):
                #print tag.attrs['href']
                table.write(i,1,tag.attrs['href'])
                i += 1

            # Names live in <span> tags inside the photo-info blocks
            j = (p-1)*50 + 1
            for tag in soup.find_all(class_='photo-info'):
                for gg in tag.find_all('span'):
                    #print gg.string
                    table.write(j,0,gg.string)
                    j += 1
            print u"Finished reading page %s"%p

test = geturl(2)
test.get_url()
# xlwt writes the legacy binary .xls format, so the extension must be
# .xls, not .xlsx; rename the output to url.xls before running step 2
filename=str(time.strftime('%Y%m%d%H%M%S',time.localtime()))+"url.xls"
myfile.save(filename)
print u"Finished the URL backup at %s"%time.strftime('%Y%m%d%H%M%S',time.localtime())
        
Step 2: Get the serial numbers of each star's works
#coding=utf8
import requests
import xlrd
import xlwt
import time
import ConfigParser
from bs4 import BeautifulSoup

user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 '
headers = { 'User-Agent' : user_agent }

myfile=xlwt.Workbook()
wtable=myfile.add_sheet(u"Info",cell_overwrite_ok=True)
wtable.write(0,0,u"Name")
wtable.write(0,1,u"Link")
wtable.write(0,2,u"Serial No.")

class getserial():
    
    def get_serial(self):
        # url.xls is step 1's output, renamed from the timestamped file
        data = xlrd.open_workbook('url.xls')
        table = data.sheets()[0]
        nrows = table.nrows
        for j in range(nrows):
            try:
                # liao.ini stores the next free output row so an interrupted
                # run can resume; it must exist before the first run
                cf = ConfigParser.ConfigParser()
                cf.read("liao.ini")
                p = cf.getint('num','p')
                if j == 0:
                    continue    # row 0 is the header
                else:
                    url = table.cell(j,1).value
                    
                    r = requests.get(url,headers=headers)
                    html = r.text
                    soup = BeautifulSoup(html, 'html.parser')
                    i = 0
                    
                    # the star page wraps each work in <date> tags;
                    # every even-indexed one holds the serial number
                    for tag in soup.find_all('date'):
                        if i%2 == 0:
                            #print tag.string
                            wtable.write(p,2,tag.string)
                            wtable.write(p,0,table.cell(j,0).value)
                            wtable.write(p,1,table.cell(j,1).value)
                            p += 1
                        i+=1
                    print j    # progress indicator
                    cf.set("num", "p", p)
                    cf.write(open("liao.ini", "w"))
            except Exception:
                # on any error, save what we have so far and keep going
                filename=str(time.strftime('%Y%m%d%H%M%S',time.localtime()))+"serial.xls"
                myfile.save(filename)
                print u"Exception: auto-saved the serial-number backup at %s"%time.strftime('%Y%m%d%H%M%S',time.localtime())

test = getserial()
test.get_serial()
filename=str(time.strftime('%Y%m%d%H%M%S',time.localtime()))+"serial.xls"
myfile.save(filename)
print u"Finished the serial-number backup at %s"%time.strftime('%Y%m%d%H%M%S',time.localtime())

Step 3: Look up the magnet link for each serial number
#coding=utf8
import requests
import re
import xlrd
import xlwt
import time
import ConfigParser
import threading
from bs4 import BeautifulSoup

user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.87 Safari/537.36'
# ask for HTML, and only for encodings requests can decode natively
headers = { 
'Accept':'text/html,*/*;q=0.8',
'Accept-Encoding':'gzip, deflate',
'Accept-Language':'zh-CN,zh;q=0.8',
'Cache-Control':'max-age=0',
'Connection':'keep-alive',
'User-Agent' : user_agent ,
}


class getlink():
    def get_link(self,conf,excel):
        myfile=xlwt.Workbook()
        wtable=myfile.add_sheet(u"Info",cell_overwrite_ok=True)
        wtable.write(0,0,u"Name")
        wtable.write(0,1,u"Serial No.")
        wtable.write(0,2,u"File size")
        wtable.write(0,3,u"Last updated")
        wtable.write(0,4,u"Link")
        wtable.write(0,5,u"Magnet link")
        data = xlrd.open_workbook(excel)
        table = data.sheets()[0]
        nrows = table.nrows
        for j in range(nrows):
            try:
                # per-thread checkpoint file: stores the next free output row
                cf = ConfigParser.ConfigParser()
                cf.read(conf)
                p = cf.getint('num','p')
                if j == 0:
                    continue    # row 0 is the header
                else:
                    serial = table.cell(j,2).value
                    url = 'https://btso.pw/search/' + serial
                    #print url
                    r = requests.get(url,headers=headers,timeout=30)
                    html = r.text
                    #print html
                    soup = BeautifulSoup(html, 'html.parser')
                    
                    for tag in soup.find_all('div',class_='row'):

                        # file-size column of the search results
                        for gg in tag.find_all(class_='col-sm-2 col-lg-1 hidden-xs text-right size'):
                            print gg.string
                            wtable.write(p,0,table.cell(j,0).value)
                            wtable.write(p,1,table.cell(j,2).value)
                            wtable.write(p,2,gg.string)
                        
                        # upload-date column
                        for aa in tag.find_all(class_='col-sm-2 col-lg-2 hidden-xs text-right date'):
                            print aa.string
                            wtable.write(p,3,aa.string)
                        
                        # follow each detail page to read the magnet link itself
                        for xx in tag.find_all(href=re.compile("https://btso.pw/magnet/detail/hash")):
                            print xx.attrs['href']
                            wtable.write(p,4,xx.attrs['href'])
                            r1 = requests.get(xx.attrs['href'],headers=headers,timeout=30)
                            html1 = r1.text
                            #print html1
                            soup1 = BeautifulSoup(html1, 'html.parser')
                            for tag1 in soup1.find_all('textarea',id='magnetLink'):
                                print tag1.string
                                wtable.write(p,5,tag1.string)
                            p += 1
                    cf.set("num", "p", p)
                    cf.write(open(conf, "w"))
            
            except Exception:
                # on any error, save what we have so far and keep going
                filename=str(time.strftime('%Y%m%d%H%M%S',time.localtime()))+"link.xls"
                myfile.save(filename)
                print u"Exception: auto-saved the magnet-link backup at %s"%time.strftime('%Y%m%d%H%M%S',time.localtime())
        filename=str(time.strftime('%Y%m%d%H%M%S',time.localtime()))+"link.xls"
        myfile.save(filename)
        print u"Auto-saved the magnet-link backup at %s"%time.strftime('%Y%m%d%H%M%S',time.localtime())
if __name__ == '__main__':
    test = getlink()
    threads = []
    # one worker thread per (checkpoint, shard) pair
    for i in range(1, 7):
        t = threading.Thread(target=test.get_link,
                             args=('link%d.ini' % i, 'serial%d.xls' % i))
        t.setDaemon(True)
        threads.append(t)
    for t in threads:
        t.start()
    # join every worker, not just the last one started; otherwise the
    # daemon threads are killed as soon as the main thread exits
    for t in threads:
        t.join()
    print u"All threads finished"


Drop the magnet links into Thunder (Xunlei) and they will download.
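If your download manager takes a batch list instead, the magnet column of the final workbook can be dumped to a plain text file (a sketch; the input filename is hypothetical since step 3 prefixes a timestamp, and the column layout is the one written above):

#coding=utf8
# Sketch: dump the magnet-link column (column 5) of a link workbook
# into a plain text file for batch import into a download manager
import xlrd

def dump_magnets(src, dst='magnets.txt'):
    table = xlrd.open_workbook(src).sheets()[0]
    with open(dst, 'w') as f:
        for j in range(1, table.nrows):    # row 0 is the header
            link = table.cell(j, 5).value
            if link and link.startswith('magnet:'):
                f.write(link.strip().encode('utf8') + '\n')

dump_magnets('20170101000000link.xls')  # hypothetical timestamped filename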

Here is what the final Excel file looks like: