1. 程式人生 > >用R語言進行基本統計分析

用R語言進行基本統計分析

1. 描述性統計分析

使用自帶的summary()函式

> myvars <- c("mpg","hp","wt")
> summary(mtcars[myvars])
      mpg              hp              wt       
 Min.   :10.40   Min.   : 52.0   Min.   :1.513  
 1st Qu.:15.43   1st Qu.: 96.5   1st Qu.:2.581  
 Median :19.20   Median :123.0   Median :3.325  
 Mean   :20.09   Mean   :146.7   Mean   :3.217  
 3rd Qu.:22.80   3rd Qu.:180.0   3rd Qu.:3.610  
 Max.   :33.90   Max.   :335.0   Max.   :5.424 

使用apply()或sapply()函式計算所選擇的任意描述性統計量,函式形式為:

sapply(x, FUN, options)

這裡插入的典型函式有mean()、sd()、var()、min()、max()、median()、length()、range()和quantile()。函式fivenum()可返回圖基五數。

基礎安裝並沒有封裝偏度和峰度的計算函式,這裡手擼一個。

> mystats <- function(x, na.omit = FALSE){
+   if (na.omit)
+     x <- x[!is.na(x)]
+   m <- mean(x)
+   n <- length(x)
+   s <- sd(x)
+   skew <- sum((x-m)^3/s^3)/n
+   kurt <- sum((x-m)^4/s^4)/n - 3
+   return(c(n=n, mean=m, stdev=s, skew=skew,kurtosis=kurt))
+ }
> 
> myvars <- c("mpg","hp","wt")
> sapply(mtcars[myvars], mystats)
               mpg          hp          wt
n        32.000000  32.0000000 32.00000000
mean     20.090625 146.6875000  3.21725000
stdev     6.026948  68.5628685  0.97845744
skew      0.610655   0.7260237  0.42314646
kurtosis -0.372766  -0.1355511 -0.02271075

比較全面的有pastecs包,其中的describe()函式:

> library(psych)
> 
> myvars <- c("mpg","hp","wt")
> 
> describe(mtcars[myvars])
    vars  n   mean    sd median trimmed   mad   min    max  range skew kurtosis
mpg    1 32  20.09  6.03  19.20   19.70  5.41 10.40  33.90  23.50 0.61    -0.37
hp     2 32 146.69 68.56 123.00  141.19 77.10 52.00 335.00 283.00 0.73    -0.14
wt     3 32   3.22  0.98   3.33    3.15  0.77  1.51   5.42   3.91 0.42    -0.02
       se
mpg  1.07
hp  12.12
wt   0.17

分組計算描述性統計量

可以使用aggregate()函式來分組獲取描述性統計量。

> myvars <- c("mpg","hp","wt")
> 
> aggregate(mtcars[myvars], by=list(am=mtcars$am), mean)
> #使用by=list(am=mtcars$am)可以使新列被標註為am而不是Group.1


  am      mpg       hp       wt
1  0 17.14737 160.2632 3.768895
2  1 24.39231 126.8462 2.411000

但是aggregate()函式無法一次返回若干個統計量。一種解決的辦法是使用by()函式。

by(data, INDICES, FUN)

> myvars <- c("mpg","hp","wt")
> 
> mystats <- function(x, na.omit = FALSE){
+      if (na.omit)
+          x <- x[!is.na(x)]
+        m <- mean(x)
+        n <- length(x)
+        s <- sd(x)
+        skew <- sum((x-m)^3/s^3)/n
+        kurt <- sum((x-m)^4/s^4)/n - 3
+        return(c(n=n, mean=m, stdev=s, skew=skew,kurtosis=kurt))
+      }
> 
> dstats <- function(x)sapply(x, mystats)
> 
> by(mtcars[myvars], mtcars$am, dstats)
mtcars$am: 0
                 mpg           hp         wt
n        19.00000000  19.00000000 19.0000000
mean     17.14736842 160.26315789  3.7688947
stdev     3.83396639  53.90819573  0.7774001
skew      0.01395038  -0.01422519  0.9759294
kurtosis -0.80317826  -1.20969733  0.1415676
--------------------------------------------------------------- 
mtcars$am: 1
                 mpg          hp         wt
n        13.00000000  13.0000000 13.0000000
mean     24.39230769 126.8461538  2.4110000
stdev     6.16650381  84.0623243  0.6169816
skew      0.05256118   1.3598859  0.2103128
kurtosis -1.45535200   0.5634635 -1.1737358

也可以利用doBy()包的summaryBy()函式分組計算概述統計量。

> library(doBy)
> 
> myvars <- c("mpg","hp","wt")
> 
> mystats <- function(x, na.omit = FALSE){
+           if (na.omit)
+                 x <- x[!is.na(x)]
+               m <- mean(x)
+               n <- length(x)
+               s <- sd(x)
+               skew <- sum((x-m)^3/s^3)/n
+               kurt <- sum((x-m)^4/s^4)/n - 3
+               return(c(n=n, mean=m, stdev=s, skew=skew,kurtosis=kurt))
+ }
> 
> summary(summaryBy(mpg+hp+wt~am, data=mtcars, FUN=mystats))



       am           mpg.n         mpg.mean       mpg.stdev        mpg.skew        mpg.kurtosis    
 Min.   :0.00   Min.   :13.0   Min.   :17.15   Min.   :3.834   Min.   :0.01395   Min.   :-1.4554  
 1st Qu.:0.25   1st Qu.:14.5   1st Qu.:18.96   1st Qu.:4.417   1st Qu.:0.02360   1st Qu.:-1.2923  
 Median :0.50   Median :16.0   Median :20.77   Median :5.000   Median :0.03326   Median :-1.1293  
 Mean   :0.50   Mean   :16.0   Mean   :20.77   Mean   :5.000   Mean   :0.03326   Mean   :-1.1293  
 3rd Qu.:0.75   3rd Qu.:17.5   3rd Qu.:22.58   3rd Qu.:5.583   3rd Qu.:0.04291   3rd Qu.:-0.9662  
 Max.   :1.00   Max.   :19.0   Max.   :24.39   Max.   :6.167   Max.   :0.05256   Max.   :-0.8032  
      hp.n         hp.mean         hp.stdev        hp.skew          hp.kurtosis     
 Min.   :13.0   Min.   :126.8   Min.   :53.91   Min.   :-0.01423   Min.   :-1.2097  
 1st Qu.:14.5   1st Qu.:135.2   1st Qu.:61.45   1st Qu.: 0.32930   1st Qu.:-0.7664  
 Median :16.0   Median :143.6   Median :68.99   Median : 0.67283   Median :-0.3231  
 Mean   :16.0   Mean   :143.6   Mean   :68.99   Mean   : 0.67283   Mean   :-0.3231  
 3rd Qu.:17.5   3rd Qu.:151.9   3rd Qu.:76.52   3rd Qu.: 1.01636   3rd Qu.: 0.1202  
 Max.   :19.0   Max.   :160.3   Max.   :84.06   Max.   : 1.35989   Max.   : 0.5635  
      wt.n         wt.mean         wt.stdev         wt.skew        wt.kurtosis     
 Min.   :13.0   Min.   :2.411   Min.   :0.6170   Min.   :0.2103   Min.   :-1.1737  
 1st Qu.:14.5   1st Qu.:2.750   1st Qu.:0.6571   1st Qu.:0.4017   1st Qu.:-0.8449  
 Median :16.0   Median :3.090   Median :0.6972   Median :0.5931   Median :-0.5161  
 Mean   :16.0   Mean   :3.090   Mean   :0.6972   Mean   :0.5931   Mean   :-0.5161  
 3rd Qu.:17.5   3rd Qu.:3.429   3rd Qu.:0.7373   3rd Qu.:0.7845   3rd Qu.:-0.1873  
 Max.   :19.0   Max.   :3.769   Max.   :0.7774   Max.   :0.9759   Max.   : 0.1416 

2. 頻數表和列聯表

library(vcd)
head(Arthritis)

#一維列聯表
mytable <- with(Arthritis,table(Improved))
mytable

#二維列聯表
mytable <- xtabs(~ Treatment+Improved, data = Arthritis)
mytable

#使用margin.table()和prop.table()函式分別生成邊際頻數和比例
margin.table(mytable,1)     #下標1指代table()語句中的第一個變數
prop.table(mytable,1)
margin.table(mytable,2)
prop.table(mytable,2)

#多維列聯表
mytable <- xtabs(~ Treatment+Sex+Improved, data = Arthritis)
mytable