利用R語言進行基本資料管理
阿新 • • 發佈:2018-12-24
# Basic data management in R: a walkthrough of common data-frame tasks
# (creating and recoding variables, renaming, missing values, dates,
# type checks, sorting, merging, subsetting, sampling and SQL queries).
# Each section is a small self-contained demo that prints its result.

# Build the leadership data frame ----
manager <- c(1, 2, 3, 4, 5)
date <- c("10/24/08", "10/28/08", "10/1/08", "10/12/08", "5/1/09")
country <- c("US", "US", "UK", "UK", "UK")
gender <- c("M", "F", "F", "M", "F")
age <- c(32, 45, 25, 39, 99)
q1 <- c(5, 3, 3, 3, 2)
q2 <- c(4, 5, 5, 3, 2)
q3 <- c(5, 2, 5, 4, 1)
q4 <- c(5, 5, 5, NA, 2)
q5 <- c(5, 5, 2, NA, 1)
leadership <- data.frame(
  manager, date, country, gender, age, q1, q2, q3, q4, q5,
  stringsAsFactors = FALSE
)

# Create new variables ----
my_data <- data.frame(x1 = c(2, 2, 6, 4), x2 = c(3, 4, 2, 8))
my_data <- transform(my_data, sum_x = x1 + x2, mean_x = (x1 + x2) / 2)
my_data

# Recode variables ----
leadership
# 99 is used as the missing-value code for age.
leadership$age[leadership$age == 99] <- NA
# Write the categories to a separate column: assigning strings into `age`
# itself would coerce it to character and break every later numeric
# comparison on age (including the subset() call further down).
leadership$agecat <- NA_character_
leadership$agecat[leadership$age > 75] <- "Elder"
leadership$agecat[leadership$age >= 55 & leadership$age <= 75] <- "Middle Aged"
leadership$agecat[leadership$age < 55] <- "Young"
# Equivalent, using within() to avoid repeating the data frame name.
leadership <- within(leadership, {
  agecat <- rep(NA_character_, length(age))
  agecat[age > 75] <- "Elder"
  agecat[age >= 55 & age <= 75] <- "Middle Aged"
  agecat[age < 55] <- "Young"
})

# Rename variables ----
names(leadership)
names(leadership)[2] <- "test_date"

# Missing values ----
is.na(leadership[, 6:10])
# Exclude missing values
x <- c(1, 2, NA, 3)
y <- sum(x, na.rm = TRUE)
(new_data <- na.omit(leadership))

# Date values ----
str_date <- c("01/05/1965", "08/16/1975")
(dates <- as.Date(str_date, "%m/%d/%Y"))
# Compute a time interval
today <- Sys.Date()
dob <- as.Date("1990-8-30")
difftime(today, dob, units = "days")

# Type checks ----
a <- c(1, 2, 3)
is.numeric(a)
is.character(a)
is.vector(a)
is.matrix(a)
is.data.frame(a)
is.factor(a)
is.logical(a)

# Sorting ----
# Sort by gender ascending, then by q1 descending; only printed, not stored.
with(leadership, print(leadership[order(gender, -q1), ]))

# Combining data sets ----
# Placeholder inputs so the examples below can run; the two frames share
# the key columns used by merge() and have identical column names, which
# rbind() requires.
dataframe1 <- data.frame(
  ID = c(1, 2, 3),
  country = c("US", "UK", "UK"),
  score = c(10, 20, 30),
  stringsAsFactors = FALSE
)
dataframe2 <- data.frame(
  ID = c(2, 3, 4),
  country = c("UK", "UK", "US"),
  score = c(25, 35, 45),
  stringsAsFactors = FALSE
)
# Add columns to a data frame (join on the shared key columns)
total <- merge(dataframe1, dataframe2, by = c("ID", "country"))
# Add rows to a data frame
total <- rbind(dataframe1, dataframe2)

# Subsetting ----
# Keep variables
my_vars <- c("q1", "q2")
(new_data <- leadership[my_vars])
# Drop variables
my_vars <- names(leadership) %in% c("q3", "q4")
(new_data <- leadership[!my_vars])
# The most convenient way: subset()
new_data <- subset(leadership, age >= 35 | age < 24, select = c(q1, q2))
new_data
# Random sample of three rows, without replacement
(my_sample <- leadership[
  sample(seq_len(nrow(leadership)), 3, replace = FALSE),
])

# Query a data frame with SQL ----
# sqldf is an optional third-party package; skip the demo when it is absent
# instead of aborting the whole script.
if (requireNamespace("sqldf", quietly = TRUE)) {
  library(sqldf)
  (new_df <- sqldf(
    "select * from mtcars where carb=1 order by mpg",
    row.names = TRUE
  ))
} else {
  message("Package 'sqldf' is not installed; skipping the SQL example.")
}