R語言:使用 rvest 爬取中國天氣網所有城市未來七天天氣資料並寫入 Oracle 資料庫
本文使用R語言的 rvest 套件爬取中國天氣網所有城市未來七天天氣資料並寫入oracle資料庫,其中包括了如何使用R語言連線oracle資料庫,以及爬取時候的簡單策略,最後將爬取到的資料組裝成資料框並寫入資料庫,可以作為R語言初中級愛好者們很好的參考例子。當然,這是我很久以前寫的程式碼,很多地方沒有進行優化,比如使用了for迴圈,其實應該封裝到函式中使用sapply來處理迴圈以提高效率。感興趣的朋友想與我交流的話可以加群R語言&大資料分析456726635,或者加群Python & Spark 636866908。下面廢話不多說了,直接貼程式碼。
library(rvest)
library(ROracle)
library(curl)
library(stats)
library(lubridate)
library(xts)
library(zoo)
library(TTR)
library(forecast)
library(fGarch)
library(tseries)
library(FinTS)
library(rugarch)
# ---- Oracle connection settings ----
# NOTE(review): host, SID and credentials are hard-coded in the script.
localdrv <- dbDriver("Oracle")
localhost <- "192.168.11.170"
localport <- 1521
sid <- "BIFORECAST"
# Assemble the Oracle connect descriptor from host, port and SID.
localstring <- paste0(
  "(DESCRIPTION=",
  "(ADDRESS=(PROTOCOL=tcp)(HOST=", localhost, ")(PORT=", localport, "))",
  "(CONNECT_DATA=(SID=", sid, ")))"
)
# Open the connection with username/password authentication.
localcon <- dbConnect(
  localdrv,
  username = "bi",
  password = "bi",
  dbname = localstring
)
# Step 1: define the URL-reading helper functions
#
# Read the region-level URLs from a forecast index page.
#
# Args:
#   url: URL of the page to parse.
# Returns:
#   Character vector with the href attribute of every <a> element found
#   below div.maptabboxinBox / div.maptabbox / h4.
# Errors:
#   Propagates the read_html() error when the page cannot be downloaded
#   on the second attempt either.
read.region.url <- function(url) {
  # Download the page once and reuse the parsed result. The earlier version
  # fetched the page a second time even when the first attempt succeeded.
  web <- try(read_html(url, encoding = "utf8"), silent = TRUE)
  if (inherits(web, "try-error")) {
    # One retry; a second failure is raised to the caller.
    web <- read_html(url, encoding = "utf8")
  }
  web %>%
    html_nodes("div.maptabboxinBox") %>%
    html_nodes("div.maptabbox") %>%
    html_nodes("h4") %>%
    html_nodes("a") %>%
    html_attr("href")
}
# Read the province-level URLs from one region page.
#
# Args:
#   region.url: URL of a region page.
# Returns:
#   Character vector of unique province URLs, each prefixed with
#   "http://www.weather.com.cn"; character(0) when the page has no
#   matching link.
# Errors:
#   Propagates the read_html() error when the page cannot be downloaded
#   on the second attempt either.
read.province.url <- function(region.url) {
  # Download the page once and reuse the parsed result. The earlier version
  # fetched the page a second time even when the first attempt succeeded.
  web <- try(read_html(region.url, encoding = "utf8"), silent = TRUE)
  if (inherits(web, "try-error")) {
    # One retry; a second failure is raised to the caller.
    web <- read_html(region.url, encoding = "utf8")
  }
  province.url <- web %>%
    html_nodes("div") %>%
    html_nodes("tr") %>%
    html_nodes("td.rowsPan") %>%
    html_nodes("a") %>%
    html_attr("href") %>%
    unique()
  # html_attr() yields NA for <a> elements without an href; pasting those
  # would produce a bogus "...cnNA" URL.
  province.url <- province.url[!is.na(province.url)]
  # Pasting a prefix onto a zero-length vector returns the bare prefix, so
  # an empty result must be returned before the prefix is added.
  if (length(province.url) == 0) {
    return(character(0))
  }
  paste0("http://www.weather.com.cn", province.url)
}
# Read the city-level URLs from one province page.
#
# Args:
#   province.url: URL of a province page.
# Returns:
#   Character vector with the unique href attributes of the <a> elements
#   found inside table cells (div / tr / td / a).
# Errors:
#   Propagates the read_html() error when the page cannot be downloaded
#   on the second attempt either.
read.city.url <- function(province.url) {
  # Download the page once and reuse the parsed result. The earlier version
  # fetched the page a second time even when the first attempt succeeded.
  web <- try(read_html(province.url, encoding = "utf8"), silent = TRUE)
  if (inherits(web, "try-error")) {
    # One retry; a second failure is raised to the caller.
    web <- read_html(province.url, encoding = "utf8")
  }
  web %>%
    html_nodes("div") %>%
    html_nodes("tr") %>%
    html_nodes("td") %>%
    html_nodes("a") %>%
    html_attr("href") %>%
    unique()
}
# Step 2: collect the URLs
# Forecast index page of the China weather site.
url <- "http://www.weather.com.cn/forecast/index.shtml"
# Region URLs listed on the index page.
region.url <- read.region.url(url)
# Province URLs gathered from every region page, flattened into one vector.
province.url <- region.url %>%
  lapply(read.province.url) %>%
  unlist()
# City URLs gathered from every province page, flattened into one vector.
city.urls <- province.url %>%
  lapply(read.city.url) %>%
  unlist()
# Step 3: scrape the forecast of every city and write it to the database
# Uncomment to try the script on the first few cities only.
# city.urls <- head(city.urls)

# Number of forecast rows expected on a city page; one date is attached per
# row, starting with today.
forecast.days <- 7L

# Turn the newline-split text of one forecast <li> into
# c(weather, temperature, wind). Items with more than five pieces carry the
# temperature in two pieces (4 and 5), which are glued back together.
parse.forecast.item <- function(parts) {
  if (length(parts) > 5) {
    c(parts[3], paste0(parts[4], parts[5]), parts[6])
  } else {
    c(parts[3], parts[4], parts[5])
  }
}

# Request counter used for throttling.
ii <- 1
# The loop header was missing, which left `i` undefined and `next` outside
# of any loop; iterate over every city URL collected in step 2.
for (i in city.urls) {
  print(ii)
  # Take it slowly: pause 20 seconds once more than 1500 requests have been
  # made, then restart the count.
  if (ii > 1500) {
    Sys.sleep(20)
    ii <- 1
  }
  # Count every attempted request, including cities skipped below.
  ii <- ii + 1
  print(i)

  # Strategy for cities that cannot be scraped: skip them. The page is
  # downloaded once (with a browser user agent) and the result is reused,
  # instead of probing with one request and downloading again with another.
  web <- try(
    read_html(
      curl(i, handle = new_handle("useragent" = "Mozilla/5.0")),
      encoding = "utf8"
    ),
    silent = TRUE
  )
  if (inherits(web, "try-error")) {
    print("Error con111")
    next
  }

  # Breadcrumb of the page: link texts and span texts.
  crumbs <- web %>%
    html_nodes("div") %>%
    html_nodes("div") %>%
    html_nodes("div") %>%
    html_nodes("div.crumbs.fl")
  crumb.links <- crumbs %>%
    html_nodes("a") %>%
    html_text()
  crumb.spans <- crumbs %>%
    html_nodes("span") %>%
    html_text()

  # First link is the province; the second link, when present, is the city.
  province <- crumb.links[1]
  city <- if (length(crumb.links) > 1) crumb.links[2] else crumb.links

  # Confirm the city name with the last breadcrumb span. Province/city pages
  # and municipality/urban-district pages are handled differently: the span
  # replaces the city unless it is the urban-district label.
  # NOTE(review): the label literal is written in traditional characters;
  # confirm it matches the text actually served by the site.
  last.span <- crumb.spans[length(crumb.spans)]
  # The length check avoids the "argument is of length zero" error that the
  # bare comparison raised when the page has no breadcrumb span.
  if (length(last.span) == 1 && last.span != "城區") {
    city <- last.span
  }
  if (length(city) != 1) {
    print("Skipped: city name not found in breadcrumb")
    next
  }
  print(city)

  # One list element per forecast <li>, each split into its text lines.
  forecast.items <- web %>%
    html_nodes("div") %>%
    html_nodes("ul.t.clearfix") %>%
    html_nodes("li") %>%
    html_text() %>%
    strsplit(split = "[\n]+")
  # Exactly one row per prepared date is required; anything else previously
  # crashed the script or silently recycled the dates.
  if (length(forecast.items) != forecast.days) {
    print(paste0(
      "Skipped: unexpected number of forecast rows: ",
      length(forecast.items)
    ))
    next
  }
  # 3 x forecast.days character matrix: weather, temperature, wind. Building
  # it in one go avoids rbind() of per-row data frames whose column names
  # differed between the two parsing branches.
  fields <- vapply(forecast.items, parse.forecast.item, character(3))

  # The data frame is complete: one row per day, dates starting today.
  writedata <- data.frame(
    CITY = city,
    PERIOD_SDATE = as.character(Sys.Date() + seq_len(forecast.days) - 1),
    WEATHER = fields[1, ],
    TEMPERATURE = fields[2, ],
    WIND = fields[3, ],
    PROVINCE_NAME = province,
    stringsAsFactors = FALSE
  )
  print(writedata)
  # Sys.setenv(TZ = "GMT")
  # Sys.setenv(ORA_SDTZ = "GMT")

  # Write to the database: the same rows are appended to both tables.
  dbWriteTable(localcon, "WECHAT_NEW_WEATHER_FORECAST", writedata,
               row.names = FALSE, append = TRUE, ora.number = FALSE)
  dbWriteTable(localcon, "WECHAT_NEW_WEATHER_FORECAST_TT", writedata,
               row.names = FALSE, append = TRUE, ora.number = FALSE)
}