1. 程式人生 > >Python爬取網頁資料並匯入表格

Python爬取網頁資料並匯入表格

import requests
import time
import random
import socket
import http.client
from bs4 import BeautifulSoup
import csv

def getContent(url , data = None):
    header={
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.235'
    } # request 的請求頭
    timeout = random.choice(range(80, 180))
    while True:
        try:
            rep = requests.get(url,headers = header,timeout = timeout) #請求url地址,獲得返回 response 資訊
            rep.encoding = 'utf-8'
            break
        except socket.timeout as e: # 以下都是異常處理
            print( '3:', e)
            time.sleep(random.choice(range(8,15)))

        except socket.error as e:
            print( '4:', e)
            time.sleep(random.choice(range(20, 60)))

        except http.client.BadStatusLine as e:
            print( '5:', e)
            time.sleep(random.choice(range(30, 80)))

        except http.client.IncompleteRead as e:
            print( '6:', e)
            time.sleep(random.choice(range(5, 15)))
    print('request success')
    return rep.text # 返回的 Html 全文

if __name__ == '__main__':
    url ='http://wsb.wuhan.gov.cn/html/friendly/201602/t20160203_45633.shtml'
    html = getContent(url) # 呼叫獲取網頁資訊
    print('my frist python file')

def getData(html_text):
    final = []
    bs = BeautifulSoup(html_text, "html.parser")  # 建立BeautifulSoup物件
    body = bs.body #獲取body
    href = body.find('div',{'id': 'nav'})
    ul = href.find('ul')
    li = ul.find_all('li')

    for nav in li:
        temp = []
        href = nav.find('h1').string
        temp.append(href)
        inf = nav.find_all('p')
        weather = inf[0].string  # 天氣
        temp.append(weather)
        temperature_highest = inf[1].find('span').string  # 最高溫度,夜間可能沒有這個元素,需要注意
        temperature_low = inf[1].find('i').string  # 最低溫度
        temp.append(temperature_low)
        temp.append(temperature_highest)
    final.append(temp)
    print('getDate success')
    return final


if __name__ == '__main__':
    url ='http://wsb.wuhan.gov.cn/html/friendly/201602/t20160203_45633.shtml'
    html = getContent(url)    # 獲取網頁資訊
    result = getData(html)  # 解析網頁資訊,拿到需要的資料
    print('my frist python file')

def writeData(data, name):
    with open(name, 'a', errors='ignore', newline='') as f:
        f_csv =csv.writer(f)
        f_csv.writerows(data)
    print('write_csv success')

if __name__ == '__main__':
        url = 'http://www.weather.com.cn/weather/101210101.shtml'
        html = getContent(url)  # 獲取網頁資訊
        result = getData(html)  # 解析網頁資訊,拿到需要的資料
        writeData(result, 'E:\地理國情監測\e.csv')  # 資料寫入到 csv文件中
        print('my frist python file')

報錯:

C:\Users\jpy\PycharmProjects\venv\Scripts\python.exe C:/Users/jpy/PycharmProjects/test1.py request success my frist python file request success Traceback (most recent call last):   File "C:/Users/jpy/PycharmProjects/test1.py", line 73, in <module>     result = getData(html)  # 解析網頁資訊,拿到需要的資料   File "C:/Users/jpy/PycharmProjects/test1.py", line 56, in getData     href = nav.find('h1').string AttributeError: 'NoneType' object has no attribute 'string'

Process finished with exit code 1