Python 抓取旅遊資訊
阿新 • • 發佈:2019-02-01
#coding=UTF-8
"""Scrape attraction listings from bytravel.cn.

Walks the paginated index at http://www.bytravel.cn/view/index109_list*.html,
and for every attraction appends its title, link and summary text to the
file '北京景點.txt' (UTF-8, append mode).
"""
from urllib.request import Request, urlopen
from urllib.error import URLError
from bs4 import BeautifulSoup as BS

# Browser-like User-Agent. Note: the header VALUE must not repeat the
# "User-Agent:" field name (the original code sent a malformed value).
_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
}


def __searchUrls(pageCur, pageTotal):
    """Fetch listing pages pageCur..pageTotal and record each attraction.

    For every <table id="tjtable"> entry on a page, the attraction title,
    detail-page link and summary (via getContextByurl) are printed and
    appended to '北京景點.txt'. URL errors are reported and the page skipped.
    """
    # 'with' guarantees the output file is closed even if a page raises;
    # the original opened it per page and never closed it.
    with open('北京景點.txt', "a+", encoding='utf-8') as f:
        # Iterate instead of recursing: recursion depth grew with pageTotal.
        for page in range(pageCur, pageTotal + 1):
            if page == 1:
                url = 'http://www.bytravel.cn/view/index109_list.html'
            else:
                # Page N (N >= 2) lives at ...index109_list<N-1>.html.
                url = 'http://www.bytravel.cn/view/index109_list' + str(page - 1) + '.html'
            try:
                req = Request(url, headers=_HEADERS)
                response = urlopen(req)
                # The site serves GB2312; 'ignore' drops undecodable bytes.
                content = response.read().decode('gb2312', 'ignore')
                # Explicit parser avoids bs4's "no parser specified" warning.
                soup = BS(content, 'html.parser')
                print("★ 上海旅遊第【" + str(page) + "】頁" + url)
                for result_table in soup.findAll("table", {"id": "tjtable"}):
                    title_div = result_table.find("div", {"id": "tctitle"})
                    if title_div is None or title_div.a is None:
                        continue  # malformed row: no title/link to record
                    link = 'http://www.bytravel.cn' + title_div.a['href']
                    title = title_div.text
                    text = getContextByurl(link)
                    print('[' + title + ']' + ':' + link)
                    print('簡介:' + text + '\n' + '---------------------------------------------' + '\n')
                    f.write('[' + title + ']' + ':' + link)
                    f.write('簡介:' + text + '\n' + '---------------------------------------------' + '\n')
            except URLError as e:
                if hasattr(e, 'reason'):
                    print('We failed to reach a server.')
                    print('Reason: ', e.reason)
                elif hasattr(e, 'code'):
                    print('The server couldn\'t fulfill the request.')
                    print('Error code: ', e.code)


def getContextByurl(url):
    """Fetch an attraction detail page and return its description text.

    Returns the text of the page's <div class="f14"> block, or '' when the
    div is missing or a URLError occurs (the original could return None or
    raise AttributeError in those cases).
    """
    try:
        req = Request(url, headers=_HEADERS)
        response = urlopen(req)
        html = response.read().decode('gb2312', 'ignore')
        soup = BS(html, 'html.parser')
        div_text = soup.find("div", {"class": "f14"})
        # Detail pages without the expected div yield an empty summary.
        return div_text.text if div_text is not None else ''
    except URLError as e:
        if hasattr(e, 'reason'):
            print('We failed to reach a server.')
            print('Reason: ', e.reason)
        else:
            print('The server couldn\'t fulfill the request.')
            print('Error code: ', e.code)
        return ''  # always return a str so callers can concatenate safely


if __name__ == '__main__':
    __searchUrls(1, 20)  # scrape listing pages 1 through 20
    # getContextByurl('http://www.bytravel.cn/Landscape/70/maominglu.html')
#coding=UTF-8
"""Scrape the Shanghai travel listing from meet99.com.

Prints, for every <li class="box"> attraction on the listing page, its
title, detail link, and the 'been there' / 'want to go' counters.
"""
from urllib.request import Request, urlopen
from urllib.error import URLError
from bs4 import BeautifulSoup as BS

# Browser-like User-Agent. Note: the header VALUE must not repeat the
# "User-Agent:" field name (the original code sent a malformed value).
_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
}


def __searchUrls(pageCur, pageTotal):
    """Fetch listing pages pageCur..pageTotal and print each attraction.

    Only page 1 points at meet99.com; URL errors are reported and the
    page skipped.
    """
    # Iterate instead of recursing: recursion depth grew with pageTotal.
    for page in range(pageCur, pageTotal + 1):
        if page == 1:
            url = 'https://www.meet99.com/lvyou-shanghai.html'
        else:
            # NOTE(review): leftover from the bytravel.cn version of this
            # script — pages > 1 still target bytravel.cn. Unreachable with
            # the __main__ call (pageTotal=1); confirm the real meet99
            # pagination URL before scraping more than one page.
            url = 'http://www.bytravel.cn/view/index109_list' + str(page - 1) + '.html'
        try:
            req = Request(url, headers=_HEADERS)
            response = urlopen(req)
            content = response.read().decode('utf-8', 'ignore')
            # Explicit parser avoids bs4's "no parser specified" warning.
            soup = BS(content, 'html.parser')
            print("★ 上海旅遊第【" + str(page) + "】頁" + url)
            for box in soup.findAll("li", {"class": "box"}):
                title_div = box.find("div", {"class": "img"})
                # "want to go" (never) / "been there" (ever) counters.
                bar = box.find("div", {"class": "bar"})
                never_cnt = ""
                ever_cnt = ""
                if bar is not None:
                    # Guard each anchor: the original dereferenced .text
                    # unconditionally and could raise AttributeError.
                    never_a = bar.find("a", {"class": "never"})
                    ever_a = bar.find("a", {"class": "ever"})
                    never_cnt = never_a.text if never_a is not None else ""
                    ever_cnt = ever_a.text if ever_a is not None else ""
                if title_div is None or title_div.a is None:
                    continue  # malformed row: no title/link to print
                link = title_div.a['href']
                title = title_div.a.text
                print('[' + title + ']' + ':https://www.meet99.com' + link)
                print(ever_cnt + ' ' + never_cnt)
        except URLError as e:
            if hasattr(e, 'reason'):
                print('We failed to reach a server.')
                print('Reason: ', e.reason)
            elif hasattr(e, 'code'):
                print('The server couldn\'t fulfill the request.')
                print('Error code: ', e.code)


if __name__ == '__main__':
    __searchUrls(1, 1)