python網路爬蟲磁碟快取資料
阿新 • 發佈:2018-11-14
import os
import re
import urllib.parse
import pickle


class DiskCache:
    """Disk-backed cache for a web crawler: maps URLs to pickled values.

    Each URL is converted to a filesystem path under ``cache_dir``; values
    are stored as pickle files at that path.
    """

    def __init__(self, cache_dir='cache'):
        self.cache_dir = cache_dir

    def __getitem__(self, item):
        """Return the cached value for URL *item*; raise KeyError if absent."""
        path = self.url_to_path(item)
        if os.path.exists(path):
            # BUGFIX: must open in 'rb' — the original used 'wb', which
            # truncates the cache file and then fails to read from it.
            with open(path, 'rb') as fp:
                return pickle.load(fp)
        # BUGFIX: the original referenced an undefined name `url` here
        # (NameError on the miss path); the parameter is `item`.
        raise KeyError(item + ' does not exist!')

    def __setitem__(self, key, value):
        """Pickle *value* to the file derived from URL *key*."""
        path = self.url_to_path(key)
        folder = os.path.dirname(path)  # parent directory of the cache file
        if not os.path.exists(folder):  # create parent directories on demand
            os.makedirs(folder)
        with open(path, 'wb') as fp:  # write the pickled value
            fp.write(pickle.dumps(value))

    def url_to_path(self, url):
        """Map *url* to a safe filesystem path under ``cache_dir``."""
        components = urllib.parse.urlparse(url)
        path = components.path
        filename = components.netloc + path + components.query
        # Replace characters that are not filesystem-safe with '_'
        # (raw string: '\-' is an invalid escape in a plain string literal).
        filename = re.sub(r'[^/0-9a-zA-Z\-.,;_]', '_', filename)
        # Cap every path segment at 255 chars (common filesystem limit);
        # '/' separators become nested directories.
        filename = '/'.join(segment[:255] for segment in filename.split('/'))
        return os.path.join(self.cache_dir, filename)

# Possible follow-ups (from the original notes): add compression to reduce
# disk usage, and add cache expiry — both implemented in the second version.
import os
import re
import urllib.parse
import pickle
import zlib
import datetime
from datetime import timedelta


class DiskCache:
    """Disk cache with zlib compression and time-based expiry.

    Values are stored as ``zlib.compress(pickle.dumps((value, timestamp)))``;
    entries older than ``expire`` raise KeyError on lookup.
    """

    def __init__(self, cache_dir='cache', expire=timedelta(days=30)):
        self.cache_dir = cache_dir
        self.expire = expire  # maximum age of a cache entry

    def __getitem__(self, item):
        """Return the cached value for URL *item*; KeyError if absent/expired."""
        path = self.url_to_path(item)
        if not os.path.exists(path):
            # BUGFIX: the original referenced an undefined name `url` here.
            raise KeyError(item + ' does not exist!')
        # BUGFIX: open in 'rb' (the original's 'wb' truncates the file), and
        # use pickle.loads — pickle.load expects a file object, not bytes.
        with open(path, 'rb') as fp:
            result, timestamp = pickle.loads(zlib.decompress(fp.read()))
        if self.has_expire(timestamp):
            raise KeyError(item + ' has expired!')
        return result

    def __setitem__(self, key, value):
        """Store *value* (compressed, with a UTC timestamp) under URL *key*."""
        path = self.url_to_path(key)
        folder = os.path.dirname(path)  # parent directory of the cache file
        if not os.path.exists(folder):  # create parent directories on demand
            os.makedirs(folder)
        timestamp = datetime.datetime.utcnow()
        # BUGFIX: pickle the (value, timestamp) tuple — the original called
        # pickle.dumps(value, time), which passes the timestamp as the pickle
        # *protocol* argument and never stores it, breaking __getitem__'s
        # `result, time = ...` unpacking.
        with open(path, 'wb') as fp:
            fp.write(zlib.compress(pickle.dumps((value, timestamp))))

    def has_expire(self, timestamp):
        """True when *timestamp* is more than ``self.expire`` in the past."""
        return datetime.datetime.utcnow() > timestamp + self.expire

    def url_to_path(self, url):
        """Map *url* to a safe filesystem path under ``cache_dir``."""
        components = urllib.parse.urlparse(url)
        path = components.path
        filename = components.netloc + path + components.query
        # Replace characters that are not filesystem-safe with '_'
        # (raw string: '\-' is an invalid escape in a plain string literal).
        filename = re.sub(r'[^/0-9a-zA-Z\-.,;_]', '_', filename)
        # Cap every path segment at 255 chars (common filesystem limit).
        filename = '/'.join(segment[:255] for segment in filename.split('/'))
        return os.path.join(self.cache_dir, filename)