1. 程式人生 > >無比強大!Python抓取cssmoban站點的模版並下載

無比強大!Python抓取cssmoban站點的模版並下載

jea blank file timeout 全局 -- 文件的 pre target

Python 實現抓取 http://www.cssmoban.com/cssthemes 站點的模版並下載


實現代碼

# -*- coding: utf-8 -*-
import urlparse
import urllib2
import re
import os  
import os.path

# Listing page the crawl starts from; later pages are resolved relative to it.
URL = 'http://www.cssmoban.com/cssthemes'

# Global socket timeout in seconds, applied to every connection opened later
# that does not set its own timeout.
urllib2.socket.setdefaulttimeout(500)

# Fetch the content behind a URL.
def getUrlContent(url):
    """Return the raw response body of *url*.

    The response object is always closed, even when reading fails, so the
    underlying connection is not leaked across the many requests the crawl
    makes. Errors raised by ``urllib2.urlopen`` / ``read`` propagate to the
    caller unchanged.
    """
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()

def getAllUrl(html):
    """Return every ``<a href="/cssthemes/<digits>.shtml">...</a>`` tag in *html*.

    Each list item is the full text of one anchor tag linking to a template
    detail page; an empty list means no such link was found.
    """
    return re.findall(r'<a[\s]+href="/cssthemes/\d+\.shtml">.*?\/a>', html)


def getDownTitle(html):
    """Return the inner text of every ``<h1>`` element in *html*."""
    return re.findall(r'\<h1>(.*?)\</h1>', html)


def getDownUrl(html):
    """Return every anchor tag in *html* carrying ``class="button btn-down"``."""
    return re.findall(r'<a.*?class="button btn-down".*?\/a>', html)


def getNextUrl(html):
    """Return the anchor tag(s) in *html* whose text ends with the next-page label.

    NOTE(review): the label literal must match the page text exactly, and the
    non-greedy match starts at the first ``<a`` on the same line -- confirm
    both against the live markup.
    """
    return re.findall(r'<a.*?下一頁</a>', html)


def download(title, url):
    """Download *url* and save it as ``template/<title>.<extension of url>``.

    The ``template/`` directory is created on first use. Both the HTTP
    response and the output file are closed before returning.
    """
    response = urllib2.urlopen(url)
    try:
        result = response.read()
    finally:
        response.close()
    if not os.path.exists("template/"):
        os.makedirs("template/")
    newname = "template/" + title.decode('utf-8')
    # Reuse whatever follows the last dot of the URL as the file extension.
    newname = newname + '.' + url[url.rfind('.') + 1:]
    with open(newname, "wb") as fileobj:
        fileobj.write(result)


def i(msg):
    """Append *msg* to ``info.log`` and echo it to stdout."""
    with open('info.log', 'a') as fileobj:
        fileobj.write(msg + '\n')
    print(msg)


def e(msg):
    """Append *msg* to ``error.log`` and echo it to stdout."""
    with open('error.log', 'a') as fileobj:
        fileobj.write(msg + '\n')
    print(msg)


def _cutBetween(text, startMarker, endMarker):
    """Return the part of *text* after *startMarker* and before *endMarker*.

    Both markers are located with ``str.index`` from the start of *text*, so
    a missing marker raises ``ValueError``.
    """
    begin = text.index(startMarker) + len(startMarker)
    return text[begin:text.index(endMarker)]


def _downloadTemplate(anchor):
    """Open the detail page linked by *anchor* and download its template.

    Any failure is written to the error log and swallowed so that one broken
    template does not stop the whole crawl.
    """
    downGotoUrl = ''
    try:
        # Detail links are site-absolute paths, so prefix the site root.
        downGotoUrl = 'http://www.cssmoban.com' + _cutBetween(anchor, 'href="', '">')
        downHtml = getUrlContent(downGotoUrl)

        downTitleList = getDownTitle(downHtml)
        downTitle = downTitleList[0] if downTitleList else ''

        downUrlList = getDownUrl(downHtml)
        downUrl = ''
        if downUrlList:
            downUrl = _cutBetween(downUrlList[0], 'href="', '" target')

        i('開始下載:%s,文件名稱:%s' % (downUrl, downTitle))
        download(downTitle, downUrl)
        i('%s下載完畢。保存文件名稱:%s' % (downUrl, downTitle))
    except Exception as err:
        # The exception must not be bound to the name ``e``: that would hide
        # the error-logging function ``e`` called right here.
        e('地址:%s下載失敗,失敗信息:' % (downGotoUrl))
        e(str(err))


def main():
    """Walk the listing pages from ``URL`` and download every template found."""
    html = getUrlContent(URL)
    i('開始下載:%s' % (URL))
    while True:
        # Handle the current page before looking for the next one, so the
        # last listing page is downloaded too.
        for anchor in getAllUrl(html):
            _downloadTemplate(anchor)

        nextPage = getNextUrl(html)
        if not nextPage:
            e('地址:%s。未找到下一頁,程序退出' % (nextPage))
            break
        nextUrl = URL + '/' + _cutBetween(nextPage[0], 'href="', '" target')

        i('-----------------------------------------')
        i('運行下一頁:%s' % (nextUrl))
        html = getUrlContent(nextUrl)


if __name__ == '__main__':
    main()



無比強大!Python抓取cssmoban站點的模版並下載