反爬蟲-python3.6抓取貓眼電影資訊
阿新 • • 發佈:2018-12-15
思路分解:
1.頁面資訊
url:http://maoyan.com/cinema/24311?poi=164257570
檢視資訊發現價格存在亂碼現象:
重新整理頁面,找到造成亂碼的字型檔(woff 格式)的 URL 並下載。方法:複製該 URL,右鍵單擊選擇「轉到」,下載完成後的檔案即為程式碼中的 baseprice.woff;再次重新整理網頁,用同樣的方法下載新的字型 URL,作為用來匹配的 woff 檔案,即為程式碼中的 maoprice.woff。
用下面這個網址(FontEditor)開啟儲存的 baseprice.woff 檔案,如下圖:
FontEditor
fontstore.baidu.com
與程式碼行對應:
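反爬蟲字型解析原理:先在網頁上下載造成亂碼的字型檔 baseprice.woff,可以轉化為 xml,用 pycharm 開啟就能看到其中的資訊;重新整理頁面後再下載一份 maoprice.woff。比較兩個檔案可以看到二者有對應的關係:字形(glyph)相同的項代表同一個字元,只是名稱(uniXXXX)不同。依此把 maoprice.woff 中的每個字形與 baseprice.woff 中已知的字形逐一比對,就可以編寫程式碼還原出真實的數字。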
進群“960410445 ” 即可獲取數十套PDF哦!@
二者的對應關係:
2.字型解析程式碼:
# Reference font saved by hand; the character drawn by each of its glyphs is
# listed in baseNumList / baseUniCode below.
# Raw string: in a plain literal '\U' starts a Unicode escape, which is a
# SyntaxError on Python 3 (and '\n' / '\b' would be rewritten silently).
baseFont = TTFont(r'C:\Users\nanafighting\Desktop\baseprice.woff')
# Font that belongs to the page being decoded.
maoyanFont = TTFont('maoprice.woff')
# Same glyph order the original reached through cmap.tables[0].ttFont.
maoyan_unicode_list = maoyanFont.getGlyphOrder()

# baseNumList[k] is the character drawn by the glyph named baseUniCode[k]
# in baseFont.
baseNumList = ['.', '6', '4', '7', '5', '2', '8', '0', '1', '9', '3']
baseUniCode = ['x', 'uniF76E', 'uniEACB', 'uniE8D1', 'uniE737', 'uniE9B7',
               'uniF098', 'uniF4DC', 'uniF85E', 'uniE2F1', 'uniEE4E']

# Parallel lists: utf8List[k] is an obfuscated character (UTF-8 bytes) and
# maoyan_num_list[k] is the real character it stands for.  Both are appended
# together, only when a glyph matches, so they can never get out of step.
maoyan_num_list = []
utf8List = []

base_glyphs = baseFont['glyf']
page_glyphs = maoyanFont['glyf']
# Entry 0 of the glyph order is skipped, as in the original loop
# (presumably the placeholder glyph - confirm against the font).
# Slicing instead of range(1, 12) avoids an IndexError on a shorter font.
for glyph_name in maoyan_unicode_list[1:12]:
    maoyanGlyph = page_glyphs[glyph_name]
    for base_name, base_char in zip(baseUniCode, baseNumList):
        if maoyanGlyph == base_glyphs[base_name]:
            # 'x' is the plain name of U+0078; every other name handled here
            # is 'uni' followed by the hexadecimal code point.
            code_hex = '0078' if glyph_name == 'x' else glyph_name[3:]
            # chr(int(..., 16)) builds the character without calling eval()
            # on text that comes from a downloaded file.
            utf8List.append(chr(int(code_hex, 16)).encode('utf-8'))
            maoyan_num_list.append(base_char)
            break
3.程式碼中容易出錯的地方:字串的轉換
# Excerpt from the price-decoding loop of getNote(): mw, i, utf8List and
# maoyan_num_list are defined by the surrounding code.
# Stay in str the whole way.  Turning the bytes into concatenated decimal
# numbers before calling replace() is ambiguous (digit runs can match across
# byte boundaries) and leaves every untouched character as a number.
moviewish = mw[i].get_text()
for encoded_char, real_char in zip(utf8List, maoyan_num_list):
    moviewish = moviewish.replace(encoded_char.decode('utf-8'), real_char)


# Complete code
import re

import requests
from bs4 import BeautifulSoup as bs
from fontTools.ttLib import TTFont
from lxml import html  # imported by the original listing; not used below


# Scrape show times and prices from a Maoyan cinema page
class MaoyanSpider:
    """Fetch one Maoyan cinema page and print its shows with decoded prices.

    The page draws its prices with a web font whose glyph names differ from
    the reference font.  Each glyph of the page font is compared with the
    glyphs of a hand-saved reference font whose characters are known, and the
    resulting table is used to translate the price text.
    """

    # Reference font downloaded by hand.  Raw string: '\U' in a plain literal
    # is a SyntaxError on Python 3.
    BASE_FONT_PATH = r'C:\Users\nanafighting\Desktop\baseprice.woff'
    # The page font is saved to, and parsed from, this one path.  The original
    # wrote 'maoyanprice.woff' but then opened 'maoprice.woff'.
    PAGE_FONT_PATH = 'maoprice.woff'
    # BASE_CHARS[k] is the character drawn by glyph BASE_GLYPHS[k] in the
    # reference font.
    BASE_CHARS = ['.', '6', '4', '7', '5', '2', '8', '0', '1', '9', '3']
    BASE_GLYPHS = ['x', 'uniF76E', 'uniEACB', 'uniE8D1', 'uniE737', 'uniE9B7',
                   'uniF098', 'uniF4DC', 'uniF85E', 'uniE2F1', 'uniEE4E']
    # Parentheses and the dot are escaped so they match literally; unescaped
    # they are regex groups and the pattern never matches the stylesheet.
    FONT_URL_RE = re.compile(r",\s*url\('(//[^')]+\.woff)'\)\s*format\('woff'\)")
    # Seconds to wait for each HTTP request instead of blocking forever.
    TIMEOUT = 10

    # Page initialisation
    def __init__(self):
        """Set the browser-like request headers sent with the page request."""
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36"
        }

    @staticmethod
    def _glyph_char(glyph_name):
        """Return the character a glyph name stands for, or None if unknown.

        'uniXXXX' names carry the code point in hexadecimal; a one-character
        name such as 'x' is the character itself.
        """
        if glyph_name.startswith('uni'):
            try:
                # Replaces eval() on text read from a downloaded file.
                return chr(int(glyph_name[3:], 16))
            except ValueError:
                return None
        return glyph_name if len(glyph_name) == 1 else None

    def _build_translation(self):
        """Return a str.translate table: obfuscated code point -> real character."""
        base_glyphs = TTFont(self.BASE_FONT_PATH)['glyf']
        page_font = TTFont(self.PAGE_FONT_PATH)
        page_glyphs = page_font['glyf']
        table = {}
        # Entry 0 of the glyph order is skipped, as in the original loop
        # (presumably the placeholder glyph - confirm against the font).
        for glyph_name in page_font.getGlyphOrder()[1:]:
            obfuscated = self._glyph_char(glyph_name)
            if obfuscated is None:
                continue
            glyph = page_glyphs[glyph_name]
            for base_name, real_char in zip(self.BASE_GLYPHS, self.BASE_CHARS):
                if glyph == base_glyphs[base_name]:
                    # Recorded only on a match, so a glyph without a match
                    # cannot shift the mapping of the glyphs after it.
                    table[ord(obfuscated)] = real_char
                    break
        return table

    # Fetch the shows and prices
    def getNote(self):
        """Download the cinema page and its font, then print every show.

        Prints one line per show: movie name, hall, start time, decoded price.

        Raises:
            IndexError: if no .woff font URL is found in the page.
            requests.RequestException: if a download fails or times out.
        """
        url = 'http://maoyan.com/cinema/24311?poi=164257570'
        # 'Referer' is the actual header name; the original sent 'refer'.
        host = {'host': 'maoyan.com', 'Referer': 'http://maoyan.com/news'}
        # Merge the dicts (dict(a.items() + b.items()) fails on Python 3)
        headers = {**self.headers, **host}

        # Fetch the page content
        r = requests.get(url, headers=headers, timeout=self.TIMEOUT)
        r.raise_for_status()
        u = r.text

        # Locate and download the font used by this very page
        font_urls = self.FONT_URL_RE.findall(u)
        if not font_urls:
            raise IndexError('no .woff font URL found in the page')
        ttf = requests.get("http:" + font_urls[0], stream=True,
                           timeout=self.TIMEOUT)
        ttf.raise_for_status()
        with open(self.PAGE_FONT_PATH, "wb") as font_file:
            for chunk in ttf.iter_content(chunk_size=1024):
                if chunk:
                    font_file.write(chunk)

        # Parse both font files and build the decoding table
        translation = self._build_translation()

        # Parse the show listings
        soup = bs(u, "html.parser")
        print('---------------Prices-----------------')
        # Search inside each show-list block.  The original searched the whole
        # page once per block, so the same rows were printed repeatedly.
        for show_block in soup.find_all('div', {'class': 'show-list'}):
            # NOTE(review): assumes one movie-name heading per show-list block
            # followed by one hall / begin-time / stonefont span per show -
            # confirm against the live markup.
            name_tag = show_block.find('h3', {'class': 'movie-name'})
            moviename = name_tag.get_text() if name_tag else ''
            halls = show_block.find_all('span', {'class': 'hall'})
            times = show_block.find_all('span', {'class': 'begin-time'})
            prices = show_block.find_all('span', {'class': 'stonefont'})
            for hall, begin, price in zip(halls, times, prices):
                film_ting = hall.get_text()
                movietime = begin.get_text()
                # One pass over the str replaces every obfuscated character.
                moviewish = price.get_text().translate(translation)
                print(moviename, film_ting, movietime, moviewish)


if __name__ == "__main__":
    spider = MaoyanSpider()
    # getNote() prints its rows and returns nothing, so it is not wrapped in
    # print() (which only added a trailing "None").
    spider.getNote()
執行結果: