# 評分卡模型(二資料清洗) — scorecard model, part 2: data cleaning
# -*- coding: utf-8 -*-
"""
Created on Sun Sep 16 19:04:53 2018

@author: wangxihe
"""
# Standard library imports first, third-party after (PEP 8: one import per line).
import os
import numbers

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#%% Load the feature table produced by stage one (feature construction).
os.chdir(r'E:\spyderwork\評分卡模型\一特徵構建')
allData = pd.read_csv('Idx0.csv', header=0, encoding='gbk')
allData.shape

#%% Switch to the cleaning working directory and eyeball summary statistics.
os.chdir(r'E:\spyderwork\評分卡模型\二特徵清洗')
describeDf = allData.describe().T
def MissingCategorial(df, col):
    """Return the fraction of missing values in categorical column ``col``.

    Uses ``isnull()`` instead of the original ``x != x`` trick: ``x != x``
    only detects float NaN and silently misses ``None`` in object-dtype
    columns, under-counting missing categorical values.
    """
    return float(df[col].isnull().sum()) / df.shape[0]
def MissingContinuous(df, col):
    """Return the fraction of missing values in numeric column ``col``.

    Uses ``isnull()`` instead of ``np.isnan``: ``np.isnan`` raises
    ``TypeError`` if the column contains non-float entries (e.g. strings in
    an object column), whereas ``isnull()`` handles any dtype. Also keeps
    this helper consistent with ``MissingCategorial``.
    """
    return float(df[col].isnull().sum()) / df.shape[0]
#%% Build the candidate feature list, excluding the target label and ID columns.
allFeatures = list(allData.columns)
allFeatures.remove('target')
if 'Idx' in allFeatures:
    allFeatures.remove('Idx')
# Guarded the same way as 'Idx' above: an unconditional remove() raises
# ValueError when the column is absent.
if 'ListingInfo' in allFeatures:
    allFeatures.remove('ListingInfo')
len(allFeatures)

#%% Check for constant columns, and classify each variable as
# categorical or numerical.
numerical_var = []
# Iterate over a snapshot: removing an item from the list being iterated
# makes the for-loop skip the element immediately after each deletion.
for col in list(allFeatures):
    if len(set(allData[col])) == 1:
        print(' {} :此列為常數所以刪除'.format(col))
        del allData[col]
        allFeatures.remove(col)
    else:
        # Keep non-missing values only (x == x is False for float NaN).
        uniq_valid_vals = [i for i in allData[col] if i == i]
        uniq_valid_vals = list(set(uniq_valid_vals))
        # Treat as numerical when there are >= 10 distinct real-valued levels.
        if len(uniq_valid_vals) >= 10 and isinstance(uniq_valid_vals[0], numbers.Real):
            numerical_var.append(col)

categorical_var = [i for i in allFeatures if i not in numerical_var]
len(numerical_var)
len(categorical_var)

#%% For every feature, record the share of its most frequent value and
# which value that is.
records_count = allData.shape[0]
col_most_values, col_large_value = {}, {}
for col in allFeatures:
    value_count = allData[col].groupby(allData[col]).count()
    col_most_values[col] = max(value_count) / records_count
    # The level attaining the maximum count (ties: first in index order).
    large_value = value_count[value_count == max(value_count)].index[0]
    col_large_value[col] = large_value

col_most_values_df = pd.DataFrame.from_dict(col_most_values, orient='index')
col_most_values_df.columns = ['max percent']
col_most_values_df = col_most_values_df.sort_values(by='max percent', ascending=False)
pcnt = list(col_most_values_df[:180]['max percent'])
# Renamed from `vars`, which shadowed the `vars` builtin.
top_vars = list(col_most_values_df[:180].index)
plt.bar(range(len(pcnt)), height=pcnt)
plt.title('Largest Percentage of Single Value in Each Variable')
len(col_most_values)
len(col_large_value)

#%% For columns whose majority value covers >= 90% of rows, test whether the
# minority values show a significantly higher bad-sample rate than the
# majority value.
large_percent_cols = list(col_most_values_df[col_most_values_df['max percent'] >= 0.9].index)
bad_rate_diff = {}
for col in large_percent_cols:
    large_value = col_large_value[col]
    # .copy() avoids SettingWithCopyWarning: allData[[col, 'target']] is a
    # slice, and the original assigned into it via chained indexing.
    temp = allData[[col, 'target']].copy()
    temp[col] = temp.apply(lambda x: int(x[col] == large_value), axis=1)
    # Group 0 = minority values, group 1 = the majority value.
    bad_rate = temp.groupby(col).mean()
    if bad_rate.iloc[0]['target'] == 0:
        bad_rate_diff[col] = 0
        continue
    bad_rate_diff[col] = np.log(bad_rate.iloc[0]['target'] / bad_rate.iloc[1]['target'])

bad_rate_diff_sorted = sorted(bad_rate_diff.items(), key=lambda x: x[1], reverse=True)
bad_rate_diff_sorted_values = [x[1] for x in bad_rate_diff_sorted]
plt.bar(x=range(len(bad_rate_diff_sorted_values)), height=bad_rate_diff_sorted_values)
len(bad_rate_diff)

#%% None of the minority values showed a significantly higher bad rate, so
# these near-constant columns are dropped outright.
for col in large_percent_cols:
    if col in numerical_var:
        numerical_var.remove(col)
    else:
        categorical_var.remove(col)
    del allData[col]

#%% Imputation values are recorded here so the same fills can be reused at
# scoring time (persisted to var_Fill.pkl further down).
missvalue = {}
# Categorical columns missing more than 80% of values are dropped.
missing_pcnt_threshould_1 = 0.8

#%% Fill categorical variables: drop columns above the missing-rate
# threshold, otherwise impute with the most frequent level (the mode).
# (A commented-out earlier draft that encoded missing as a special state
# was removed here.)
for col in list(categorical_var):  # iterate a copy: the list is mutated below
    missingRate = MissingCategorial(allData, col)
    print('{0} has missing rate as {1}'.format(col, missingRate))
    if missingRate > missing_pcnt_threshould_1:
        categorical_var.remove(col)
        del allData[col]
    else:
        temp = allData[col].value_counts()
        max1 = max(temp)
        maxmiss = temp[temp == max1].index[0]
        missvalue[col] = maxmiss
        allData[col].fillna(maxmiss, inplace=True)
#sum(pd.isnull(allData['WeblogInfo_19']))
#%% Check the numerical variables
#%% Fill numerical variables: drop columns above the missing-rate threshold,
# otherwise impute with the column mean. (An earlier commented-out draft
# that imputed by random sampling from the non-missing values was removed.)
len(numerical_var)
missing_pcnt_threshould_2 = 0.8
deleted_var = []
for col in numerical_var:
    missingRate = MissingContinuous(allData, col)
    print('{0} 該列缺失率為 {1}'.format(col, missingRate))
    if missingRate > missing_pcnt_threshould_2:
        # Deletion is deferred so numerical_var is not mutated mid-iteration.
        deleted_var.append(col)
        print('將刪除變數 {} 因為該變數缺失率高於設定的閾值'.format(col))
    else:
        if missingRate > 0:
            meanmiss = allData[col].mean()
            # Round so the stored fill value matches what is written to allData.
            missvalue[col] = round(meanmiss, 6)
            allData[col].fillna(round(meanmiss, 6), inplace=True)

import pickle  # used by the var_Fill.pkl dump below
# Persist the imputation values so the same fills can be re-applied at
# scoring time. BUG FIX: the original pickled `missingRate` (the scalar
# missing rate left over from the last loop iteration) instead of the
# `missvalue` {column: fill value} dict built above.
with open('var_Fill.pkl', "wb") as f:
    pickle.dump(missvalue, f)
#%% Remove the numerical columns flagged for deletion during imputation.
# Iterating an empty list is a no-op, so no emptiness guard is needed.
for dropped in deleted_var:
    numerical_var.remove(dropped)
    del allData[dropped]
#%% Write the cleaned data set out for the next stage of the pipeline.
allData.to_csv('Idx1.csv', columns=allData.columns, header=True, index=False, encoding='gbk')
#%% #L0['Update_90_rate']=round(L0['Update_90_rate'],4) #L0set=set(L0.columns) #L1=pd.read_csv('allData_1.csv',encoding='gbk') #L1set=set(L1.columns)#L1['Update_90_rate']=round(L1['Update_90_rate'],4) #L0set==L1set # #L2=L0.append(L1) ##del L2['ListingInfo'] #L3=L2.drop_duplicates()