python 批量爬取部落格資料(僅供學習)
阿新 • 發佈:2019-02-13
# -*- coding: utf-8 -*-
"""Batch-download blog articles from a Sina blog article list (learning only).

Walks pages 1..7 of the article-list index, extracts up to 50 article URLs
per page, and saves each article's raw HTML under ``hanhan/<page>/``.
"""
import os
import time
import urllib.request

# Article-list URL template; the page number is substituted before '.html'.
LIST_URL = 'http://blog.sina.com.cn/s/articlelist_1191258123_0_{}.html'
ARTICLES_PER_PAGE = 50  # each list page holds at most 50 articles


def find_article_urls(page_html, limit=ARTICLES_PER_PAGE):
    """Extract up to *limit* article URLs from a list page's HTML.

    Scans for the pattern ``<a title=`` ... ``href="<url>.html`` and returns
    the URLs in page order.  Returns an empty list when no anchor matches.
    """
    urls = []
    pos = 0
    while len(urls) < limit:
        title = page_html.find('<a title=', pos)
        if title == -1:
            break
        href = page_html.find('href=', title)
        if href == -1:
            break
        end = page_html.find('.html', href)
        if end == -1:
            break
        # href+6 skips 'href="'; end+5 keeps the '.html' suffix.
        urls.append(page_html[href + 6:end + 5])
        pos = end  # resume the scan after this match
    return urls


def save_articles(urls, page):
    """Download each article URL and save its raw bytes under hanhan/<page>/.

    Writes in binary mode so the server's encoding is preserved verbatim,
    and closes every handle via context managers.
    """
    path = 'hanhan/' + str(page) + '/'
    os.makedirs(path, exist_ok=True)  # replaces the racy isdir check
    for u in urls:
        with urllib.request.urlopen(u) as resp:
            content = resp.read()
        # The last 26 chars of the URL serve as a unique-enough file name.
        with open(path + u[-26:], 'wb') as f:
            f.write(content)
        time.sleep(1)  # be polite to the server


def main():
    """Crawl list pages 1..7 and store every article found."""
    for page in range(1, 8):
        with urllib.request.urlopen(LIST_URL.format(page)) as resp:
            # Replace undecodable bytes rather than crash on a bad page.
            page_html = resp.read().decode('utf-8', errors='replace')
        urls = find_article_urls(page_html)
        for u in urls:
            print(u)
        print('end page=', page)
        save_articles(urls, page)
        print('download')
    print('all find end')


# Guard so importing this module does not start a crawl.
if __name__ == '__main__':
    main()