1. 程式人生 > >ML: 聚類算法R包-對比

ML: 聚類算法R包-對比

rar spl stat ecs ror .cn cnblogs add run

測試驗證環境


數據: 7 萬餘條(共 70143 條),數據結構如下:

> head(car.train)
   DV    DC     RV   RC   SOC   HV   LV HT LT                                 Type TypeName
1 379 85.09   0.00  0.0 62.99 3.99 0.00 12  0 10f689e8-e6cc-47a3-be5a-dbc3833428ef    EV200
2 379 85.09 370.89 59.9 63.99 4.01 0.00 12  0 10f689e8-e6cc-47a3-be5a-dbc3833428ef    EV200
3 379 85.09   0.00  0.0 64.99 4.01 0.00 12  0 10f689e8-e6cc-47a3-be5a-dbc3833428ef    EV200
4 379 85.09   0.00  0.0 66.00 4.03 1.55 12 11 10f689e8-e6cc-47a3-be5a-dbc3833428ef    EV200
5 379 85.09   0.00  0.0 67.00 4.03 0.00 12  0 10f689e8-e6cc-47a3-be5a-dbc3833428ef    EV200
6 379 85.09   0.00  0.0 68.00 4.05 0.00 13  0 10f689e8-e6cc-47a3-be5a-dbc3833428ef    EV200

機器配置:

技術分享

R version:

> version
               _                           
platform       x86_64-w64-mingw32          
arch           x86_64                      
os             mingw32                     
system         x86_64, mingw32             
status                                     
major          3
minor          2.5
year           2016
month          04
day            14
svn rev        70478
language       R
version.string R version 3.2.5 (2016-04-14)
nickname       Very, Very Secure Dishes

R包性能對比


全局函數及參數設置

技術分享
## ---------------------- Global settings -------------------------------
# Start from an empty workspace so the run does not depend on leftover objects.
# NOTE(review): this wipes the global environment, and the setwd() below
# changes the session's working directory. That is tolerable for a one-off
# benchmark script, but this file should not be sourced from other code.
remove(list = ls())

# Working directory for the benchmark; the relative path "core.R" sourced
# further down is resolved against it.
space_path <- "E:\\RScore\\kmeans\\"
setwd(space_path)

# Switch every locale category to Chinese.
# The argument is spelled out as `locale`; the earlier `local=` spelling only
# worked through partial argument matching.
Sys.setlocale(category = "LC_ALL", locale = "chinese")

## Pivot a two-way contingency table into wide form and display it.
##
## tbl: a two-way table, e.g. table(x, y). The body relies on
##      as.data.frame(tbl) yielding the columns Var2 and Freq.
## Returns whatever datatable() returns for the widened data frame
## (one column per Var2 level, cells filled from Freq).
tblView <- function(tbl) {
  ## install.packages("tidyr")
  library(tidyr)
  # NOTE(review): datatable() is neither defined nor attached in this
  # snippet -- presumably DT::datatable made available elsewhere; confirm.
  datatable(
    spread(data = as.data.frame(tbl), key = Var2, value = Freq)
  )
}

## Shared helpers: data read/write and computation (defined in core.R)
source(file = "core.R", encoding = "utf-8")
teld.ml.init()

## Training samples
car.train <- teld.ml.rQuery("D_Cluster")
# Only the first eight columns are passed to the clustering functions.
newdata <- car.train[seq_len(8)]
View Code

stats::kmeans


source code:

技術分享
> ################################################stats::kmeans######################################
> startTime <- Sys.time();
> 
> library(stats)
> kc <- kmeans(x=newdata, centers = 13)
> #plot(newdata[,c("DV","DC")],col=kc$cluster)
> tbl <- table(car.train$TypeName,kc$cluster)
> tblView(tbl)
> 
> ##耗時間
> endTime <- Sys.time()
> difTime <- difftime(endTime,startTime,units = "secs")
> print(paste0("stats::kmeans total time:", difTime))
[1] "stats::kmeans total time:0.195545196533203"
View Code
stats::kmeans total time:0.195545196533203, result view:

技術分享

fpc::kmeansruns


source code:

技術分享
> ################################################fpc::kmeansruns######################################
> startTime <- Sys.time();
> 
> library(fpc)
> kc1 <- kmeansruns(data = newdata,krange = 1:15,critout = TRUE)
2  clusters  9394.437 
3  clusters  185919.7 
4  clusters  482630.4 
5  clusters  414875.3 
6  clusters  376338 
7  clusters  334493.6 
8  clusters  303976.7 
9  clusters  279036.3 
10  clusters  432009.9 
11  clusters  363074.8 
12  clusters  405784.7 
13  clusters  397422.8 
14  clusters  371842.5 
15  clusters  408561.7 
Warning messages:
1: Quick-TRANSfer stage steps exceeded maximum (= 3507150) 
2: Quick-TRANSfer stage steps exceeded maximum (= 3507150) 
3: Quick-TRANSfer stage steps exceeded maximum (= 3507150) 
> tbl<- table(car.train$TypeName,kc1$cluster)
> tblView(tbl)
> 
> ##耗時間
> endTime <- Sys.time()
> difTime <- difftime(endTime,startTime,units = "secs")
> print(paste0("fpc::kmeansruns total time:", difTime))
[1] "fpc::kmeansruns total time:107.454074859619"
View Code
[1] "fpc::kmeansruns total time:107.454074859619"  result view:

技術分享

cluster::pam


source code

技術分享
> ################################################cluster::pam######################################
> 
> library(cluster)
> cPam <- pam(x=newdata,k=13)
Error in pam(x = newdata, k = 13) : 
  have 70143 observations, but not more than 65536 are allowed
View Code

Error: 待確認(依錯誤訊息,cluster::pam 最多只接受 65536 筆觀測值,本數據共 70143 筆,超過上限;大數據集可考慮改用 cluster::clara)

fpc::pamk


source code

技術分享
> ################################################fpc::pamk######################################
> 
> library(fpc)
> fPamk <- pamk(newdata,krang=1:15)
Error in pam(sdata, k, diss = diss, ...) : 
  have 70143 observations, but not more than 65536 are allowed
View Code

Error: 待確認(fpc::pamk 內部調用 pam,因此同樣受 65536 筆觀測值上限限制,本數據共 70143 筆)

stats::hclust


source code:

技術分享
################################################fpc::pamk######################################
> 
> library(fpc)
> fPamk <- pamk(newdata,krang=1:15)
Error in pam(sdata, k, diss = diss, ...) : 
  have 70143 observations, but not more than 65536 are allowed
View Code

Error: 待確認

mclust::Mclust


source code:

技術分享
> ################################################mclust::Mclust######################################
> library(mclust)
> EM<-Mclust(newdata)
Error in hcVVV(data = c(379, 379, 379, 379, 379, 379, 379, 379, 379, 379,  : 
  NAs in foreign function call (arg 13)
In addition: Warning message:
In hcVVV(data = c(379, 379, 379, 379, 379, 379, 379, 379, 379, 379,  :
  NAs introduced by coercion to integer range
View Code

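Error: 待確認(推測:Mclust 預設以層次聚類 hc 做初始化,70143 筆數據所需的 n(n-1)/2 ≈ 2.46×10^9 已超過 32 位整數上限 2^31-1,故轉換為整數時產生 NA;可嘗試先抽樣再建模)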

cluster::fanny


source code:

技術分享
> ################################################cluster::fanny######################################
> library(cluster)
> fannyz=fanny(newdata,13,metric="SqEuclidean")
Error in fanny(newdata, 13, metric = "SqEuclidean") : 
  long vectors (argument 5) are not supported in .Fortran
View Code

Error: 待確認(推測:fanny 需要長度為 n(n-1)/2 ≈ 2.46×10^9 的相異度向量,超過 2^31-1,屬於 long vector,而 .Fortran 接口不支持)

e1071::cmeans


source code:

技術分享
> ################################################e1071::cmeans######################################
> startTime <- Sys.time();
> 
> library("e1071")
> eCm<-cmeans(newdata,15)
> tbl <- table(car.train$TypeName,eCm$cluster)
> tblView(tbl)
> 
> ##耗時間
> endTime <- Sys.time()
> difTime <- difftime(endTime,startTime,units = "secs")
> print(paste0("e1071::cmeans total time:", difTime))
[1] "e1071::cmeans total time:8.7237401008606"
View Code

[1] "e1071::cmeans total time:8.7237401008606" result view:

技術分享

待驗證


ML: 聚類算法R包-對比