1. 程式人生 > >R語言學習筆記之七

R語言學習筆記之七

head ace row any type 分析 min 記錄 ted

摘要: 僅用於記錄R語言學習過程:

內容提要:

缺失值的識別與處理;

異常值與重復值的處理

正文:

缺失值的識別與處理

導讀:

> x <- c(1,2,3,NA,NA,4)

> mean(x)

[1] NA

> sum(x)

[1] NA

> mean(x,na.rm = TRUE)

[1] 2.5

> sum(x,na.rm = TRUE)

[1] 10

> is.na(x)

[1] FALSE FALSE FALSE TRUE TRUE FALSE

> sum(is.na(x))

[1] 2

n 用[! ]去掉缺失值

u 示例:

> x[!is.na(x)]

[1] 1 2 3 4

> iris_na <- iris

> for (i in 1:4) {

+ iris_na[sample(1:nrow(iris),5),i] = NA

+ }

>

> sapply(iris_na[,1:4],function(x)sum(is.na(x)))

我 是 中 國

5 5 5 5

> sapply(iris_na[,1:4],function(x)which(is.na(x))) #which返回的是位置

我 是 中 國

[1,] 10 11 6 28

[2,] 60 52 21 108

[3,] 92 54 32 111

[4,] 108 113 79 124

[5,] 144 136 83 135

u psych擴展包裏describe()函數,用於描述數據集的基本統計值

u 計算缺失值的比例:

l 示例:

> sapply(iris_na[,1:4],function(x)sum(is.na(x)/nrow(iris_na)))

我 是 中 國

0.03333333 0.03333333 0.03333333 0.03333333

n 缺失值對於回歸分析的影響

u 示例:

u lm(Sepal.length~Sepal.Width,data = iris_na,na.action = na.omit) #在進行回歸分析是也要去除缺失值,用na.action,但其默認也是na.action=na.omit

n 缺失值得填補:一般采用中數和中位數,具體如下

u 示例1:

mean_value <- sapply(iris_na[,1:4],mean,na.rm = TRUE)

for (i in 1:4) {

iris_na[is.na(iris_na[,i],i)] = mean_value[i]

}

u 示例2:

> cancer[sample(1:1000,90),2] <- NA #生成缺失值

> mean_value <- tapply(cancer$sur_days,list(cancer$type,cancer$treatment),mean,na.rm = TRUE)

> mean_value #求均值

chemo sugr

colon 523.6489 539.9329

liver 530.1656 576.6056

lung 547.7427 553.1241

> for (i in 1:3){

+ for (j in 1:2){

+ cancer$sur_days[is.na(cancer$sur_days) & cancer$type == rownames(mean_value)[i]

+ & cancer$treatment == colnames(mean_value)[j]] =mean_value[i,j]

+ }

+ } #填補缺失值

> summary(cancer)

id sur_days type treatment

  1. Min. : 1.0 Min. : 100.0 colon:320 chemo:497

1st Qu.: 250.8 1st Qu.: 335.8 liver:330 sugr :503

Median : 500.5 Median : 547.7 lung :350

Mean : 500.5 Mean : 545.4

3rd Qu.: 750.2 3rd Qu.: 738.5

  1. Max. :1000.0 Max. :1000.0

> describe(cancer)

vars n mean sd median trimmed mad min max range

id 1 1000 500.50 288.82 500.50 500.50 370.65 1 1000 999

sur_days 2 1000 545.45 247.96 547.74 544.41 300.23 100 1000 900

type* 3 1000 2.03 0.82 2.00 2.04 1.48 1 3 2

treatment* 4 1000 1.50 0.50 2.00 1.50 0.00 1 2 1

skew kurtosis se

id 0.00 -1.20 9.13

sur_days 0.03 -1.00 7.84

type* -0.06 -1.51 0.03

treatment* -0.01 -2.00 0.02

n 用R包對缺失值進行填補

u 安裝mlbench包

l 示例:

install.packages(‘mlbench‘)

library(mlbench)

data(‘BostonHousing‘)

original_data <- BostonHousing

set.seed(2018)

BostonHousing[sample(1:nrow(BostonHousing),80),‘rad‘] <- NA #在rad列生成缺失值

BostonHousing[sample(1:nrow(BostonHousing),80),‘ptratio‘] <- NA #在ptratio列生成缺失值

u 安裝mice包,利用裏面的md.pattern()函數查看缺失值得模式。md.pattern(BostonHousing數據集名稱) ,0代表缺失,1表示不缺失,最後一行表示缺失總數。

u 安裝Hmisc包,利用其中的impute()插補:參數設置:需要插補的對象,擬插補的參數

n 示例:

im_mean <- impute(BostonHousing$ptratio,mean) #mean可以換成median或者固定的數字

BostonHousing$ptratio <- NULL

BostonHousing$im_mean <- im_mean

u 用mice包(鏈式方程多元插值)進行缺失值填補,基本思想為利用一種模型,把缺失值的變量作為因變量,把其他不缺失的變量作為自變量進行回歸分析,以預測缺失值

n 示例:

mice_mod <- mice(BostonHousing[,!names(BostonHousing)%in% ‘medv‘],method =‘rf‘) #rf 隨機森林

mice_output <- complete(mice_mod) #complete函數,完整顯示

anyNA(mice_output) #是否還有NA值

FALSE

n 示例2:查看預測的精度

actuals <- original_data$rad[is.na(BostonHousing$rad)] #查看原始值

predicts <- mice_out[is.na(BostonHousing$rad),‘rad‘] #查看預測值

mean(actuals != predicts) #原始值不等於預測值的比例

0.3 #預測精度為70%

u VIM包:可以對缺失值進行可視化;也可以對數據進行插補

n 數據可視化:

u 示例:

md.pattern(airquality)

aggr_plot <- aggr(airquality,col = c(‘red‘,‘green‘),numbers = TRUE,sortVars = T,labels = names(airquality),cex.axis = 0.7,gap = 3)

marginplot(airquality[1:2]

n 用線性回歸模型進行插補:

n 示例:

data(sleep)

head(sleep)

BodyWgt BrainWgt NonD Dream Sleep Span Gest Pred Exp Danger

1 6654.000 5712.0 NA NA 3.3 38.6 645 3 5 3

2 1.000 6.6 6.3 2.0 8.3 4.5 42 3 1 3

3 3.385 44.5 NA NA 12.5 14.0 60 1 1 1

4 0.920 5.7 NA NA 16.5 NA 25 5 2 3

5 2547.000 4603.0 2.1 1.8 3.9 69.0 624 3 5 4

6 10.550 179.5 9.1 0.7 9.8 27.0 180 4 4 4

sleepIm <- regressionImp(Sleep + Gest + Span + Dream + NonD ~ BodyWgt+BrainWgt,data = sleep) # 第一個為缺失值變量,~後面為不包含缺失值的變量,data為數據集名,也可以再考慮family 參數,可選auto

head(sleepIm) #不含有缺失值了,TRUE代表該位置原來為缺失值。

BodyWgt BrainWgt NonD Dream Sleep Span Gest Pred Exp Danger

1 6654.000 5712.0 -11.732867 -0.6897314 3.3 38.60000 645 3 5 3

2 1.000 6.6 6.300000 2.0000000 8.3 4.50000 42 3 1 3

3 3.385 44.5 8.987353 2.0132372 12.5 14.00000 60 1 1 1

4 0.920 5.7 9.017324 2.0148478 16.5 15.50179 25 5 2 3

5 2547.000 4603.0 2.100000 1.8000000 3.9 69.00000 624 3 5 4

6 10.550 179.5 9.100000 0.7000000 9.8 27.00000 180 4 4 4

Sleep_imp Gest_imp Span_imp Dream_imp NonD_imp

1 FALSE FALSE FALSE TRUE TRUE

2 FALSE FALSE FALSE FALSE FALSE

3 FALSE FALSE FALSE TRUE TRUE

4 FALSE FALSE TRUE TRUE TRUE

5 FALSE FALSE FALSE FALSE FALSE

6 FALSE FALSE FALSE FALSE FALSE

異常值與重復值的處理

n 異常值的處理:原則:根據數據的實際背景來判斷

u 一些基礎值:

> set.seed(2017)

> mmhg <- sample(60:250,1000, replace = TRUE)

> range(mmhg)

[1] 60 250

> min(mmhg)

[1] 60

> max(mmhg)

[1] 250

> quantile(mmhg) #四分位數

0% 25% 50% 75% 100%

60.00 104.75 154.00 199.00 250.00

> fivenum(mmhg) #四分位數

[1] 60.0 104.5 154.0 199.0 250.0

u 自定義函數:

> outlierKD <- function(dt,var){

+ var_name <-eval(substitute(var),eval(dt))

+ tot <- sum(!is.na(var_name))

+ na1 <- sum(is.na(var_name))

+ m1 <- mean(var_name,na.rm = T)

+ par(mfrow = c(2,2),oma = c(0,0,3,0))

+ boxplot(var_name,main =‘With outliers‘)

+ hist(var_name,main =‘With outliers‘,xlab = NA,ylab = NA)

+ outlier <- var_name[var_name >230]

+ mo <- mean(outlier)

+ var_name <- ifelse(var_name %in% outlier, NA, var_name)

+ boxplot(var_name,main =‘Without outliers‘)

+ hist(var_name,main =‘Without outliers‘,xlab = NA,ylab = NA)

+ title(‘Outlier Check‘,outer = T)

+ na2 <- sum(is.na(var_name))

+ cat(‘Outliers identified:‘,na2 - na1,‘\n‘)

+ cat(‘Propotion (%) of outliers:‘,round((na2 - na1)/ tot * 100,1),‘\n‘)

+ cat(‘Mean of the outliers:‘,round(mo,2),‘\n‘)

+ m2 <- mean(var_name,na.rm = T)

+ cat(‘Mean without removing outliers:‘,round(m1,2),‘\n‘)

+ cat(‘Mean if we remove outliers:‘,round(m2,2),‘\n‘)

+ response <- readline(prompt = ‘Do you want to remove outliers

+ and to replace with NA? [yes/no]:‘)

+ if (response == ‘y‘ | response == ‘yes‘){

+ dt[as.character(substitute(var))] <- invisible(var_name)

+ assign(as.character(as.list(match.call())$dt),dt,envir = .GlobalEnv)

+ cat(‘Outliers successfully removed‘,‘\n‘)

+ return(invisible(dt))

+ } else{

+ cat(‘Nothing changed‘,‘\n‘)

+ return(invisible(var_name))

+ }

+ }

> set.seed(2017)

> df <- data.frame(bp = c(sample(80:250,1000,replace = T),NA,390,100))

> outlierKD(df,bp)

Outliers identified: 126

Propotion (%) of outliers: 12.6

Mean of the outliers: NA

Mean without removing outliers: 163.79

Mean if we remove outliers: 152.71

Do you want to remove outliers

and to replace with NA? [yes/no]:

同時也會生成可視化的圖。

n 重復值的處理:

u unique()函數:返回值 針對向量

> x<- c(1,2,3,4,5,1,2,3)

> unique(x)

[1] 1 2 3 4 5

u duplicated()函數:返回邏輯值T和F,可用於提取子集 針對向量

> x<- c(1,2,3,4,5,1,2,3)

> duplicated(x)

[1] FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE

> x[duplicated(x)] #如要返回非重復值,前面加!

[1] 1 2 3

u anyDuplicated()函數:返回第一個重復值出現的位置 針對向量

> anyDuplicated(x)

[1] 6

u 對數據框進行重復值的處理

l 自己設置條件

library(readxl)

mydata <- read_excel(‘absolute path/filename.xlsx‘)

mydata[!(duplicated(mydata$name)& duplicated(mydata$birthday)),]

l 用函數實現:paste()函數:主要是對字符串進行操作

library(readxl)

mydata <- read_excel(‘absolute path/filename.xlsx‘)

mydata$test <- paste(mydata$name,mydata$birthday) # 把兩個變量粘在一起

newdata <- mydata[!duplicated(mydata$test),]

R語言學習筆記之七