程式人生 >> Python 抓取旅遊資訊

Python 抓取旅遊資訊

#coding=UTF-8 
from urllib.request import Request, urlopen,quote
from urllib.error import URLError
import chardet
from bs4 import BeautifulSoup as BS

import sys 
import re
# from readability.readability import Document 
# from html2text import html2text
def __searchUrls(pageCur, pageTotal):
    """Recursively fetch attraction-list pages from bytravel.cn and append
    each attraction's title, link and summary to a local text file.

    Args:
        pageCur:   1-based index of the page to fetch on this call.
        pageTotal: last page number to fetch (inclusive).
    """
    # Stop before doing any network work once past the last page.
    if pageCur > pageTotal:
        return
    if pageCur == 1:
        url = 'http://www.bytravel.cn/view/index109_list.html'
    else:
        # Page N (N >= 2) is served as ...list<N-1>.html on this site.
        url = 'http://www.bytravel.cn/view/index109_list' + str(pageCur-1) + '.html'
    try:
        headers = {
            'User-Agent':
            'User-Agent:Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
        }
        req = Request(url, headers=headers)
        response = urlopen(req)
        # The site is served as gb2312; drop undecodable bytes.
        content = response.read().decode('gb2312', 'ignore')
        # Explicit parser: bare BS(content) emits a warning and may pick
        # different parsers on different machines.
        soup = BS(content, 'html.parser')
        # NOTE(review): the banner says 上海 (Shanghai) but this scrapes the
        # index109 list and writes 北京景點.txt (Beijing) — looks like a
        # copy-paste leftover; confirm which city is intended.
        # 'with' guarantees the file is closed even if parsing raises
        # (the original leaked the handle on any non-URLError exception).
        with open('北京景點.txt', "a+", encoding='utf-8') as f:
            print("★ 上海旅遊第【"+str(pageCur)+"】頁"+url)
            for result_table in soup.findAll("table", {"id": "tjtable"}):
                title_div = result_table.find("div", {"id": "tctitle"})
                # Skip malformed rows instead of crashing with AttributeError.
                if title_div is None or title_div.a is None:
                    continue
                link = 'http://www.bytravel.cn' + title_div.a['href']
                title = title_div.text
                # Fetch the per-attraction detail page for its summary text.
                text = getContextByurl(link)
                print('['+title+']'+':'+link)
                print('簡介:'+text+'\n'+'---------------------------------------------'+'\n')
                f.write('['+title+']'+':'+link)
                f.write('簡介:'+text+'\n'+'---------------------------------------------'+'\n')
    except URLError as e:
        if hasattr(e, 'reason'):
            print('We failed to reach a server.')
            print('Reason: ', e.reason)
        elif hasattr(e, 'code'):
            print('The server couldn\'t fulfill the request.')
            print('Error code: ', e.code)
    # Recurse to the next page; errors on one page do not stop the crawl.
    __searchUrls(pageCur + 1, pageTotal)

def getContextByurl(url):
    """Download an attraction detail page and return its summary text.

    Args:
        url: absolute URL of the detail page.

    Returns:
        Text of the first <div class="f14"> on the page, or '' when the
        page cannot be fetched or contains no such div.
    """
    try:
        headers = {
            'User-Agent':
            'User-Agent:Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
            }
        req = Request(url, headers=headers)
        response = urlopen(req)
        # Site serves gb2312; ignore undecodable bytes.
        html = response.read().decode('gb2312', 'ignore')
        soup = BS(html, 'html.parser')  # explicit parser avoids bs4 warning
        div_text = soup.find("div", {"class": "f14"})
        # Guard: pages without the expected div made the original crash
        # with AttributeError on None.text.
        return div_text.text if div_text is not None else ''
    except URLError as e:
        # URLError always carries .reason (HTTPError additionally has .code),
        # so the first branch handles both; .code branch kept as a fallback.
        if hasattr(e, 'reason'):
            print('We failed to reach a server.')
            print('Reason: ', e.reason)
        elif hasattr(e, 'code'):
            print('The server couldn\'t fulfill the request.')
            print('Error code: ', e.code)
        # Always return a string so callers can concatenate safely
        # (the original could implicitly return None).
        return ''

if __name__ == '__main__':
    # Scrape list pages 1 through 20.
    # NOTE(review): the original comment said "pages 1 to 10" but the call
    # passes 20 — confirm which range is intended.
    __searchUrls(1,20)
    # getContextByurl('http://www.bytravel.cn/Landscape/70/maominglu.html')
#coding=UTF-8 
from urllib.request import Request, urlopen,quote
from urllib.error import URLError
import chardet
from bs4 import BeautifulSoup as BS

import sys 
import re
# from readability.readability import Document 
# from html2text import html2text
def __searchUrls(pageCur, pageTotal):
    """Fetch meet99.com Shanghai travel-list pages and print each
    attraction's title, link and its "been there"/"want to go" counters.

    Args:
        pageCur:   1-based index of the page to fetch on this call.
        pageTotal: last page number to fetch (inclusive).
    """
    # Stop before doing any network work once past the last page.
    if pageCur > pageTotal:
        return
    if pageCur == 1:
        url = 'https://www.meet99.com/lvyou-shanghai.html'
    else:
        # NOTE(review): this URL still points at bytravel.cn — almost
        # certainly a copy-paste leftover from the previous script, so any
        # page beyond the first fetches the wrong site. Confirm meet99's
        # real pagination pattern before crawling more than one page.
        url = 'http://www.bytravel.cn/view/index109_list' + str(pageCur-1) + '.html'
    try:
        headers = {
            'User-Agent':
            'User-Agent:Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
        }
        req = Request(url, headers=headers)
        response = urlopen(req)
        content = response.read().decode('utf-8', 'ignore')
        soup = BS(content, 'html.parser')  # explicit parser avoids bs4 warning
        print("★ 上海旅遊第【"+str(pageCur)+"】頁"+url)
        for item in soup.findAll("li", {"class": "box"}):
            title_div = item.find("div", {"class": "img"})
            bar_div = item.find("div", {"class": "bar"})  # "want to go" / "been there" bar
            never_cnt = ""
            ever_cnt = ""
            if bar_div is not None:
                # Guard each anchor individually: either counter may be
                # missing, and .text on None crashed the original.
                never_a = bar_div.find("a", {"class": "never"})
                ever_a = bar_div.find("a", {"class": "ever"})
                never_cnt = never_a.text if never_a is not None else ""
                ever_cnt = ever_a.text if ever_a is not None else ""
            # Skip entries without a linked image block.
            if title_div is None or title_div.a is None:
                continue
            link = title_div.a['href']
            title = title_div.a.text
            print('['+title+']'+':https://www.meet99.com'+link)
            print(ever_cnt + ' ' + never_cnt)
    except URLError as e:
        if hasattr(e, 'reason'):
            print('We failed to reach a server.')
            print('Reason: ', e.reason)
        elif hasattr(e, 'code'):
            print('The server couldn\'t fulfill the request.')
            print('Error code: ', e.code)
    # Recurse to the next page; errors on one page do not stop the crawl.
    __searchUrls(pageCur + 1, pageTotal)
 
if __name__ == '__main__':
    # Fetch only the first list page (page 1 of 1).
    __searchUrls(1,1)