python實戰之原生爬蟲(爬取熊貓主播排行榜)
阿新 • • 發佈:2018-08-09
ref png ret spider find end mod int tps
""" this is a module,多行註釋 """ import re from urllib import request # BeautifulSoup:解析數據結構 推薦庫 Scrapy:爬蟲框架 #爬蟲,反爬蟲,反反爬蟲 #ip 封 #代理ip庫 class Spider(): url=‘https://www.panda.tv/cate/lol‘ root_pattern=‘<div class="video-info">([\s\S]*?)</div>‘ name_pattern=‘</i>([\s\S]*?)</span>‘ number_pattern=‘<span class="video-number">([\s\S]*?)</span>‘ def __fetch_content(self): r=request.urlopen(Spider.url) htmls=r.read() htmls=str(htmls,encoding=‘utf-8‘) return htmls a=1 def __analysis(self,htmls): root_html=re.findall(Spider.root_pattern,htmls) anchors=[] for html in root_html: name=re.findall(Spider.name_pattern,html) number=re.findall(Spider.number_pattern,html) anchor={‘name‘:name,‘number‘:number} anchors.append(anchor) return anchors def __refine(self,achors): l=lambda anchor:{‘name‘:anchor[‘name‘][0].strip(),‘number‘:anchor[‘number‘][0]} return map(l,achors) def __sort(self,anchors): anchors=sorted(anchors,key=self.__sord_seed,reverse=True) return anchors def __show(self,anchors): for rank in range(0,len(anchors)): print(‘rank ‘+str(rank+1)+‘:‘+anchors[rank][‘name‘] +‘ ‘+anchors[rank][‘number‘] ) def __sord_seed(self,anchor): r=re.findall(‘\d*‘,anchor[‘number‘]) number= float(r[0]) if ‘萬‘ in anchor[‘number‘]: number*=10000 return number def go(self): htmls=self.__fetch_content() anchors=self.__analysis(htmls) anchors=list(self.__refine(anchors)) anchors=self.__sort(anchors) self.__show(anchors) splider=Spider() splider.go()
python實戰之原生爬蟲(爬取熊貓主播排行榜)