1. 程式人生 > >資料分析學習體驗——實際案例_邏輯迴歸&線性迴歸

資料分析學習體驗——實際案例_邏輯迴歸&線性迴歸

作者: 江俊
日期: 2018年3月27日

主要介紹批量生成profiling圖以及五數概括的自建函式。

專案背景

某保養品公司目前有一款產品線銷售情況一直不景氣,公司預算有限,希望在現有的客戶中挖掘出最有可能在30天內購買該產品的使用者群

使用語言

R語言

使用模型

邏輯迴歸+線性迴歸

建模步驟

一、 瞭解資料

  • 資料結構
  • Y變數定義
  • X變數型別
  • 響應率情況
  • 花費金額分佈

程式碼:

rm(list=ls())
setwd("./") #change the location
getwd()      #check the location
list.files() #list the files under your location
######################################################################### ######################## Part1 read data ########################### ######################################################################### filepath<-"./Exercise_Response_data.csv" raw<-read.csv(filepath,stringsAsFactors = F
) dim(raw) str(raw) summary(raw) var<-data.frame(var=colnames(raw),type=sapply(raw,class)) # 將結果匯出到 xlsx表格 require(XLConnect) #xlsx <- loadWorkbook('Correlation.xlsx',create=TRUE) xlsx <- loadWorkbook('myhomework.xlsx',create = T) createSheet(xlsx,name='variable') #name the worksheet as 'correlation'
writeWorksheet(xlsx,var,'variable',startRow=1,startCol=1, header=TRUE) #define the startrow,startcol,header saveWorkbook(xlsx) # dv_revenue summary(raw$dv_revenue) raw$dv_revenue<-ifelse(is.na(raw$dv_revenue),0,raw$dv_revenue) # table 自動忽略缺失值 View(table(raw$dv_revenue)) hist(raw$dv_revenue) # dv_revenue hist quantile(raw$dv_revenue,(1:20)/20,na.rm = T) # dv_revenue quantile View(t(mean_rev<-quantile(raw$dv_revenue,c(0,0.01,0.1,0.25,0.5,0.75,0.9,0.99,1),na.rm = T))) hist(raw[raw$dv_revenue>0 & raw$dv_revenue<=50,"dv_revenue"],main="dev_revenue <=50",xlab = "dev_revenue") # dv_response table(raw$dv_response) prop.table(table(raw$dv_response))

執行結果:
這裡寫圖片描述

 0     1 
22878  1220 

         0          1 
0.94937339 0.05062661 

二、拆分資料

  • train:訓練集
  • test:驗證集
    程式碼:
#########################################################################
########################   Part2 split into two    ######################
#########################################################################
# modeling segments
  table(raw$segment)
  prop.table(table(raw$segment))

#separate build sample
  train<-raw[raw$segment=="build",]
  table(train$segment)

#separate inval sample
  test<-raw[raw$segment=="inval",]
  table(test$segment)

執行結果:

build inval 
16898  7200 

  build   inval 
0.70122 0.29878 

build 
16898 

inval 
 7200 

三、 探索資料

  • 分型別、數值型
  • X內部表現
  • X與Y關係
  • 缺失值

批量生成profiling圖

程式碼:

#########################################################################
########################   Part3 profile     ############################
#########################################################################
  #overall performance
  #總體人數,計算總體樣本響應情況
  overrall<-dim(train)[1]
  #相應人數,因為響應的記為1,所以可以直接使用sum()求和
  over_responder<-sum(train$dv_response)
  #responder<-length(train$dv_response[train$dv_response==1])
  #響應率
  over_response_rate<-over_responder/overrall
  overall_perf<-data.frame(overrall,responder=over_responder,response_rate=over_response_rate)
  overall_perf

  #variable type  
  data.frame(table(sapply(train[,4:27],class)))

  #character
  #檢視資料型別為某種的資料名,類似的有:is.character,is.numeric,is.factor
  chavar_name<-colnames(train[,4:27])[unlist(lapply(train[,4:27],is.character))]
  #字元型資料索引
  charater_index<-which(colnames(train) %in% chavar_name)
  lapply(train[,chavar_name],table)

  #整數型,注意可能是分型別數值
  intvar_name<-colnames(train[,4:27])[unlist(lapply(train[4:27],is.integer))]
  summary(train[,intvar_name])

  #根據結果記錄分型別數值的變數名
  var_fenlei<-c(chavar_name,"Occupation","Education","Frequency_of_last_mth")
  lapply(train[,var_fenlei],table)
  #根據分型別和連續型將原資料集分成兩類,方便後續profile的批量處理
  #分型別數值的索引
  fenlei_index<-which(colnames(train) %in% var_fenlei)

  #除開id列,響應變數列,字元型,分型別數值以外的連續數值型變數
  #which(colnames(train[,4:27]) %in% c("rid","dv_response","dv_revenue"))
  numvar_name<-colnames(train[,-c(1:3,fenlei_index,28:ncol(train))])

  #數值型數值的索引
  lianxu_index<-which(colnames(train) %in% numvar_name)

  ############################################### 1. Profiling for category variables####################################################
  #install.packages('plyr')
  library(plyr)

  ###################################### 1.profile 分型別數值 #########################################

  #封裝函式,分型別數值
  #資料集,索引,索引長度
  profile_fenlei<-function(x,y,n){
    results<-data.frame(var=NA,category=NA,count=NA,responder=NA,
                        percent=NA,response_rate=NA,index=NA)
    for(i in 1:n){
      prof<-ddply(x,.(x[,y[i]]),summarise,count=length(id),responder=sum(dv_response)) #group by hh_gender_m_flg
      #prof
      #新增百分比結果
      propf<-within(prof,{
        index<-responder/count/over_response_rate*100
        response_rate<-responder/count*100
        percent<-count/overrall*100
      })  #add response_rate,index, percentage
      propf<-data.frame(var=colnames(train)[y[i]],propf)   
      colnames(propf)[2]<-"category"
      #行連線
      results<-rbind(results,propf)
    }
    #去除首行的空值
    results<-results[-1,]
    row.names(results)<-1:nrow(results)
    return(results)
  }
  #分類數值的profile
  results_fenlei<-profile_fenlei(train[,1:28],fenlei_index,length(fenlei_index))
  results_fenlei$category[is.na(results_fenlei$category)]<-"unknown"
  results_fenlei$category[results_fenlei$category==""]<-"unknown"
  View(results_fenlei)

  # #xlsx <- loadWorkbook('Correlation.xlsx',create=TRUE)
  # xlsx <- loadWorkbook('myhomework.xlsx')
  # createSheet(xlsx,name='profile')  #name the worksheet as 'correlation'
  # writeWorksheet(xlsx,results_fenlei,'profile',startRow=1,startCol=1, header=TRUE)  #define the startrow,startcol,header
  # saveWorkbook(xlsx)
  # 
  # 
  ###################################### 1.profile 分型別數值 #########################################

  #####################################  2.profile 連續型數值 #########################################

  ######封裝函式
  #資料集,索引,索引長度,分段個數
  profile_lianxu<-function(x,y,n,m){
    var_data=x
    results<-data.frame(var=NA,category=NA,count=NA,responder=NA,
                        percent=NA,response_rate=NA,index=NA)
    for(i in 1:n){
      #分離成兩部分:缺失值和無缺失值
      nomissing<-data.frame(var_data[!is.na(var_data[,y[i]]),]) #select the no missing value records 
      missing<-data.frame(var_data[is.na(var_data[,y[i]]),])    #select the missing value records
      ##################3.2.1 numeric Profiling:missing part 
      missing2<-ddply(missing,.(missing[,y[i]]),summarise,count=length(id),responder=sum(dv_response)) #group by pos_revenue_base_sp_6mo
      colnames(missing2)[1]<-"category"
      #View(missing2)
      missing_perf<-within(missing2,{
        index<-responder/count/over_response_rate*100
        response_rate<-responder/count*100
        percent<-count/overrall*100
      })   
      #View(missing_perf)
      nomissing_value<-nomissing[,y[i]]  #put the nomissing values into a variable

      nomissing$category<-cut(nomissing_value,unique(quantile(nomissing_value,(0:m)/m)),include.lowest = T) #separte into 10 groups
      #View(table(nomissing$var_category))  #take a look at the 10 category
      prof2<-ddply(nomissing,.(category),summarise,count=length(id),responder=sum(dv_response))#group by the 10 groups
      #View(prof2)
      nonmissing_perf<-within(prof2,{
        index<-responder/count/over_response_rate*100
        response_rate<-responder/count*100
        percent<-count/overrall*100
      })#add avg_revenue,index,percent
      #View(nonmissing_perf)
      #set missing_perf and non-missing_Perf together
      #View(missing_perf)
      #View(nonmissing_perf)
      #colnames(nonmissing_perf)[3]<-"responder"
      lastprofile<-rbind(nonmissing_perf,missing_perf) #set 2 data together
      lastprofile<-data.frame(var=colnames(train)[y[i]],lastprofile)
      #行連線
      results<-rbind(results,lastprofile)
    }
    #去除首行的空值
    results<-results[-1,]
    row.names(results)<-1:nrow(results)
    return(results)
  }
  #連續數值的profile
  results_lianxu<-profile_lianxu(train[,1:34],lianxu_index,length(lianxu_index),10)
  results_lianxu$category[is.na(results_lianxu$category)]<-"unknown"
  View(results_lianxu)
  ######封裝函式
#####################################  2.profile 連續型數值 #########################################

    #將兩個 profile 合成一個整體,輸出到xlsx表格

  #xlsx <- loadWorkbook('Correlation.xlsx',create=TRUE)
  final_profile<-rbind(results_fenlei,results_lianxu)
  View(final_profile)
  xlsx <- loadWorkbook('myhomework.xlsx')
  createSheet(xlsx,name='profile')  #name the worksheet as 'correlation'
  writeWorksheet(xlsx,final_profile,'profile',startRow=1,startCol=1, header=T)  #define the startrow,startcol,header
  saveWorkbook(xlsx)

執行結果(部分截圖):
所有變數的profiling圖

生成連續型數值的五數概括

程式碼:

#########################################################################
########################   Part4 means     ##############################
#########################################################################
  # 連續性資料的五數概括
  dat_n<-train[,lianxu_index]
  mean_var<-data.frame(var=1:ncol(dat_n),mean=NA,median=NA,"0%"=NA,
                       "1%"=NA,"10%"=NA,"25%"=NA,"50%"=NA,
                       "75%"=NA,"90%"=NA,"99%"=NA,"100%"=NA,
                       max=NA,missing=NA)
  colnames(mean_var)[4:12]<-c("Minimum","1st Pthl","10th Pctl","25th Pctl","50th Pctl","75th Pctl","90th Pctl",
                              "99th Pctl","Maximum")
  for(i in 1:ncol(dat_n)){
    mean_var$var[i]=colnames(dat_n)[i]
    mean_var$mean[i]=mean(dat_n[,i],na.rm=TRUE)   #na.rm=TRUE去除NA的影響
    mean_var$median[i]=median(dat_n[,i],na.rm=TRUE)
    mean_var[i,4:12]=quantile(dat_n[,i],c(0,0.01,0.1,0.25,0.5,0.75,0.9,0.99,1),na.rm=TRUE)
    mean_var$max[i]=max(dat_n[,i],na.rm=TRUE)
    mean_var$missing[i]=sum(is.na(dat_n[,i]))
  }
  # #銷燬臨時變數
  # dat_n<-NULL
  #在列表中檢視數值變數的統計資訊
  View(mean_var)

  # 匯出到 xlsx 表格
  xlsx <- loadWorkbook('myhomework.xlsx')
  createSheet(xlsx,name='means')  #name the worksheet as 'correlation'
  writeWorksheet(xlsx,mean_var,'means',startRow=1,startCol=1, header=T)  #define the startrow,startcol,header
  saveWorkbook(xlsx)

執行結果(部分截圖):
僅連續型數值變數的五數概括

未完待續。。。
轉載請註明出處