用R語言進行基本統計分析
阿新 • • 發佈:2018-12-24
1. 描述性統計分析
使用自帶的summary()函式
> myvars <- c("mpg","hp","wt") > summary(mtcars[myvars]) mpg hp wt Min. :10.40 Min. : 52.0 Min. :1.513 1st Qu.:15.43 1st Qu.: 96.5 1st Qu.:2.581 Median :19.20 Median :123.0 Median :3.325 Mean :20.09 Mean :146.7 Mean :3.217 3rd Qu.:22.80 3rd Qu.:180.0 3rd Qu.:3.610 Max. :33.90 Max. :335.0 Max. :5.424
使用apply()或sapply()函式計算所選擇的任意描述性統計量,函式形式為:
sapply(x, FUN, options)
這裡插入的典型函式有mean()、sd()、var()、min()、max()、median()、length()、range()和quantile()。函式fivenum()可返回圖基五數。
基礎安裝並沒有封裝偏度和峰度的計算函式,這裡手擼一個。
> mystats <- function(x, na.omit = FALSE){ + if (na.omit) + x <- x[!is.na(x)] + m <- mean(x) + n <- length(x) + s <- sd(x) + skew <- sum((x-m)^3/s^3)/n + kurt <- sum((x-m)^4/s^4)/n - 3 + return(c(n=n, mean=m, stdev=s, skew=skew,kurtosis=kurt)) + } > > myvars <- c("mpg","hp","wt") > sapply(mtcars[myvars], mystats) mpg hp wt n 32.000000 32.0000000 32.00000000 mean 20.090625 146.6875000 3.21725000 stdev 6.026948 68.5628685 0.97845744 skew 0.610655 0.7260237 0.42314646 kurtosis -0.372766 -0.1355511 -0.02271075
比較全面的有pastecs包,其中的describe()函式:
> library(psych) > > myvars <- c("mpg","hp","wt") > > describe(mtcars[myvars]) vars n mean sd median trimmed mad min max range skew kurtosis mpg 1 32 20.09 6.03 19.20 19.70 5.41 10.40 33.90 23.50 0.61 -0.37 hp 2 32 146.69 68.56 123.00 141.19 77.10 52.00 335.00 283.00 0.73 -0.14 wt 3 32 3.22 0.98 3.33 3.15 0.77 1.51 5.42 3.91 0.42 -0.02 se mpg 1.07 hp 12.12 wt 0.17
分組計算描述性統計量
可以使用aggregate()函式來分組獲取描述性統計量。
> myvars <- c("mpg","hp","wt")
>
> aggregate(mtcars[myvars], by=list(am=mtcars$am), mean)
> #使用by=list(am=mtcars$am)可以使新列被標註為am而不是Group.1
am mpg hp wt
1 0 17.14737 160.2632 3.768895
2 1 24.39231 126.8462 2.411000
但是aggregate()函式無法一次返回若干個統計量。一種解決的辦法是使用by()函式。
by(data, INDICES, FUN)
> myvars <- c("mpg","hp","wt")
>
> mystats <- function(x, na.omit = FALSE){
+ if (na.omit)
+ x <- x[!is.na(x)]
+ m <- mean(x)
+ n <- length(x)
+ s <- sd(x)
+ skew <- sum((x-m)^3/s^3)/n
+ kurt <- sum((x-m)^4/s^4)/n - 3
+ return(c(n=n, mean=m, stdev=s, skew=skew,kurtosis=kurt))
+ }
>
> dstats <- function(x)sapply(x, mystats)
>
> by(mtcars[myvars], mtcars$am, dstats)
mtcars$am: 0
mpg hp wt
n 19.00000000 19.00000000 19.0000000
mean 17.14736842 160.26315789 3.7688947
stdev 3.83396639 53.90819573 0.7774001
skew 0.01395038 -0.01422519 0.9759294
kurtosis -0.80317826 -1.20969733 0.1415676
---------------------------------------------------------------
mtcars$am: 1
mpg hp wt
n 13.00000000 13.0000000 13.0000000
mean 24.39230769 126.8461538 2.4110000
stdev 6.16650381 84.0623243 0.6169816
skew 0.05256118 1.3598859 0.2103128
kurtosis -1.45535200 0.5634635 -1.1737358
也可以利用doBy()包的summaryBy()函式分組計算概述統計量。
> library(doBy)
>
> myvars <- c("mpg","hp","wt")
>
> mystats <- function(x, na.omit = FALSE){
+ if (na.omit)
+ x <- x[!is.na(x)]
+ m <- mean(x)
+ n <- length(x)
+ s <- sd(x)
+ skew <- sum((x-m)^3/s^3)/n
+ kurt <- sum((x-m)^4/s^4)/n - 3
+ return(c(n=n, mean=m, stdev=s, skew=skew,kurtosis=kurt))
+ }
>
> summary(summaryBy(mpg+hp+wt~am, data=mtcars, FUN=mystats))
am mpg.n mpg.mean mpg.stdev mpg.skew mpg.kurtosis
Min. :0.00 Min. :13.0 Min. :17.15 Min. :3.834 Min. :0.01395 Min. :-1.4554
1st Qu.:0.25 1st Qu.:14.5 1st Qu.:18.96 1st Qu.:4.417 1st Qu.:0.02360 1st Qu.:-1.2923
Median :0.50 Median :16.0 Median :20.77 Median :5.000 Median :0.03326 Median :-1.1293
Mean :0.50 Mean :16.0 Mean :20.77 Mean :5.000 Mean :0.03326 Mean :-1.1293
3rd Qu.:0.75 3rd Qu.:17.5 3rd Qu.:22.58 3rd Qu.:5.583 3rd Qu.:0.04291 3rd Qu.:-0.9662
Max. :1.00 Max. :19.0 Max. :24.39 Max. :6.167 Max. :0.05256 Max. :-0.8032
hp.n hp.mean hp.stdev hp.skew hp.kurtosis
Min. :13.0 Min. :126.8 Min. :53.91 Min. :-0.01423 Min. :-1.2097
1st Qu.:14.5 1st Qu.:135.2 1st Qu.:61.45 1st Qu.: 0.32930 1st Qu.:-0.7664
Median :16.0 Median :143.6 Median :68.99 Median : 0.67283 Median :-0.3231
Mean :16.0 Mean :143.6 Mean :68.99 Mean : 0.67283 Mean :-0.3231
3rd Qu.:17.5 3rd Qu.:151.9 3rd Qu.:76.52 3rd Qu.: 1.01636 3rd Qu.: 0.1202
Max. :19.0 Max. :160.3 Max. :84.06 Max. : 1.35989 Max. : 0.5635
wt.n wt.mean wt.stdev wt.skew wt.kurtosis
Min. :13.0 Min. :2.411 Min. :0.6170 Min. :0.2103 Min. :-1.1737
1st Qu.:14.5 1st Qu.:2.750 1st Qu.:0.6571 1st Qu.:0.4017 1st Qu.:-0.8449
Median :16.0 Median :3.090 Median :0.6972 Median :0.5931 Median :-0.5161
Mean :16.0 Mean :3.090 Mean :0.6972 Mean :0.5931 Mean :-0.5161
3rd Qu.:17.5 3rd Qu.:3.429 3rd Qu.:0.7373 3rd Qu.:0.7845 3rd Qu.:-0.1873
Max. :19.0 Max. :3.769 Max. :0.7774 Max. :0.9759 Max. : 0.1416
2. 頻數表和列聯表
library(vcd)
head(Arthritis)
#一維列聯表
mytable <- with(Arthritis,table(Improved))
mytable
#二維列聯表
mytable <- xtabs(~ Treatment+Improved, data = Arthritis)
mytable
#使用margin.table()和prop.table()函式分別生成邊際頻數和比例
margin.table(mytable,1) #下標1指代table()語句中的第一個變數
prop.table(mytable,1)
margin.table(mytable,2)
prop.table(mytable,2)
#多維列聯表
mytable <- xtabs(~ Treatment+Sex+Improved, data = Arthritis)
mytable