Python資料分析與機器學習-使用者流失預警
阿新 • • 發佈:2019-02-10
"""Customer-churn early warning: compare classifiers with 5-fold cross-validation.

Reads ``churn.csv``, builds a standardized numeric feature matrix, reports the
cross-validated accuracy of an SVM, a random forest and a k-nearest-neighbors
classifier, and finally tabulates how well the random forest's predicted churn
probabilities match the observed churn rate.
"""
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier as RF
# KFold lives in sklearn.model_selection; sklearn.cross_validation was removed.
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# NOTE: the 'display.height' option no longer exists in pandas and setting it
# raises OptionError, so only the still-supported display options are set.
pd.set_option('display.max_rows', 9999)
pd.set_option('display.max_columns', 9999)
pd.set_option('display.width', 9999)

churn_df = pd.read_csv('churn.csv')
# Sample of churn_df.head():
#   State  Account Length  Area Code     Phone Int'l Plan VMail Plan  VMail Message  Day Mins  Day Calls  Day Charge  Eve Mins  Eve Calls  Eve Charge  Night Mins  Night Calls  Night Charge  Intl Mins  Intl Calls  Intl Charge  CustServ Calls  Churn?
# 0    KS             128        415  382-4657         no        yes             25     265.1        110       45.07     197.4         99       16.78       244.7           91         11.01       10.0           3         2.70               1  False.
# 1    OH             107        415  371-7191         no        yes             26     161.6        123       27.47     195.5        103       16.62       254.4          103         11.45       13.7           3         3.70               1  False.
# 2    NJ             137        415  358-1921         no         no              0     243.4        114       41.38     121.2        110       10.30       162.6          104          7.32       12.2           5         3.29               0  False.
# 3    OH              84        408  375-9999        yes         no              0     299.4         71       50.90      61.9         88        5.26       196.9           89          8.86        6.6           7         1.78               2  False.
# 4    OK              75        415  330-6626        yes         no              0     166.7        113       28.34     148.3        122       12.61       186.9          121          8.41       10.1           3         2.73               3  False.

# Drop identifier-like columns and the label; what remains is the feature space.
churn_feat_space = churn_df.drop(['State', 'Area Code', 'Phone', 'Churn?'], axis=1)

# Turn the 'yes'/'no' string columns into booleans.
yes_no_cols = ["Int'l Plan", "VMail Plan"]
churn_feat_space[yes_no_cols] = churn_feat_space[yes_no_cols] == 'yes'
# Sample of churn_feat_space.head():
#    Account Length  Int'l Plan  VMail Plan  VMail Message  Day Mins  Day Calls  Day Charge  Eve Mins  Eve Calls  Eve Charge  Night Mins  Night Calls  Night Charge  Intl Mins  Intl Calls  Intl Charge  CustServ Calls
# 0             128       False        True             25     265.1        110       45.07     197.4         99       16.78       244.7           91         11.01       10.0           3         2.70               1
# 1             107       False        True             26     161.6        123       27.47     195.5        103       16.62       254.4          103         11.45       13.7           3         3.70               1
# 2             137       False       False              0     243.4        114       41.38     121.2        110       10.30       162.6          104          7.32       12.2           5         3.29               0
# 3              84        True       False              0     299.4         71       50.90      61.9         88        5.26       196.9           89          8.86        6.6           7         1.78               2
# 4              75        True       False              0     166.7        113       28.34     148.3        122       12.61       186.9          121          8.41       10.1           3         2.73               3

# DataFrame.as_matrix() and the np.float alias have both been removed;
# astype(float).to_numpy() yields the same float64 matrix.
X = churn_feat_space.astype(float).to_numpy()

# Labels: 1 where the 'Churn?' column reads 'True.', otherwise 0.
churn_result = churn_df['Churn?']
y = np.where(churn_result == 'True.', 1, 0)

# Standardize every feature column.
scaler = StandardScaler()
X = scaler.fit_transform(X)
# Sample of X[0]:
# [ 0.67648946 -0.32758048  1.6170861   1.23488274  1.56676695  0.47664315
#   1.56703625 -0.07060962 -0.05594035 -0.07042665  0.86674322 -0.46549436
#   0.86602851 -0.08500823 -0.60119509 -0.0856905  -0.42793202]


def run_cv(X, y, clf_class, **kwargs):
    """Generic cross-validation helper returning out-of-fold class predictions.

    Args:
        X: Feature matrix, indexable by an array of row indices.
        y: Label array aligned with the rows of ``X``.
        clf_class: Classifier class; a fresh instance is built for every fold.
        **kwargs: Keyword arguments forwarded to ``clf_class``.

    Returns:
        A copy of ``y`` in which every entry has been overwritten with the
        prediction made by the model that did not see that row in training.
    """
    # Shuffled 5-fold split; every row lands in exactly one test fold.
    kf = KFold(n_splits=5, shuffle=True)
    y_pred = y.copy()
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # New classifier per fold so no state leaks between folds.
        clf = clf_class(**kwargs)
        clf.fit(X_train, y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred


def accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_true`` equals ``y_pred``."""
    # NumPy interprets True and False as 1. and 0.
    return np.mean(y_true == y_pred)


print("Support vector machines:")
print("%.3f" % accuracy(y, run_cv(X, y, SVC)))
print("Random forest:")
print("%.3f" % accuracy(y, run_cv(X, y, RF)))
print("K-nearest-neighbors:")
print("%.3f" % accuracy(y, run_cv(X, y, KNN)))


def run_prob_cv(X, y, clf_class, **kwargs):
    """Cross-validated class probabilities (probability of customer churn).

    Args:
        X: Feature matrix, indexable by an array of row indices.
        y: Binary label array aligned with the rows of ``X``.
        clf_class: Classifier class exposing ``predict_proba``.
        **kwargs: Keyword arguments forwarded to ``clf_class``.

    Returns:
        Array of shape ``(len(y), 2)`` holding, for every row, the
        ``predict_proba`` output of the model that did not train on that row.
    """
    kf = KFold(n_splits=5, shuffle=True)
    y_prob = np.zeros((len(y), 2))
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        clf = clf_class(**kwargs)
        clf.fit(X_train, y_train)
        # Predict probabilities, not classes.
        y_prob[test_index] = clf.predict_proba(X_test)
    return y_prob


# Use 10 estimators so predictions are all multiples of 0.1.
pred_prob = run_prob_cv(X, y, RF, n_estimators=10)
pred_churn = pred_prob[:, 1]
is_churn = y == 1

# Number of times each predicted probability is assigned to an observation.
# (Series.value_counts replaces the deprecated top-level pd.value_counts.)
counts = pd.Series(pred_churn).value_counts()

# Observed churn rate among the rows that received each predicted probability.
# Built in counts.index order so both series share an identical index.
true_prob = pd.Series(
    {prob: np.mean(is_churn[pred_churn == prob]) for prob in counts.index}
)

# Side-by-side table: predicted probability, how often it occurs, actual rate.
counts = pd.concat([counts, true_prob], axis=1).reset_index()
counts.columns = ['pred_prob', 'count', 'true_prob']
print(counts)