
Scorecard Model (Part 2: Data Cleaning)

# -*- coding: utf-8 -*-
"""
Created on Sun Sep 16 19:04:53 2018

@author: wangxihe
"""

import os
import pandas as pd
import numbers
import numpy as np
import matplotlib.pyplot as plt
#%%
os.chdir(r'E:\spyderwork\評分卡模型\一特徵構建')
allData = pd.read_csv('Idx0.csv', header=0, encoding='gbk')
allData.shape
#%%
os.chdir(r'E:\spyderwork\評分卡模型\二特徵清洗')
describeDf = allData.describe().T

def MissingCategorial(df, col):
    # NaN is the only value for which x != x is True, so this counts missing entries
    missing_vals = df[col].map(lambda x: int(x != x))
    return sum(missing_vals) * 1.0 / df.shape[0]

def MissingContinuous(df, col):
    # np.isnan flags missing entries in numerical columns
    missing_vals = df[col].map(lambda x: int(np.isnan(x)))
    return sum(missing_vals) * 1.0 / df.shape[0]
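Both helpers compute a column's missing rate; in pandas the same figure comes from df[col].isnull().mean(). A minimal sketch on a toy frame (the toy data is illustrative only, reusing the imports above):

# Toy frame: one NaN out of four rows in each column
toy = pd.DataFrame({'cat': ['a', 'b', np.nan, 'a'],
                    'num': [1.0, np.nan, 3.0, 4.0]})
print(MissingCategorial(toy, 'cat'))   # 0.25
print(MissingContinuous(toy, 'num'))   # 0.25
print(toy['num'].isnull().mean())      # 0.25, the idiomatic equivalent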

#%%
allFeatures = list(allData.columns)
allFeatures.remove('target')
if 'Idx' in allFeatures:
    allFeatures.remove('Idx')
allFeatures.remove('ListingInfo')

len(allFeatures)
#%%
# Check for constant columns, and classify each remaining variable as categorical or numerical

numerical_var = []
for col in list(allFeatures):   # iterate over a copy: removing while iterating skips elements
    if len(set(allData[col])) == 1:
        print('{}: constant column, removed'.format(col))
        del allData[col]
        allFeatures.remove(col)
    else:
        # keep only non-missing values (NaN != NaN), then deduplicate
        uniq_valid_vals = [i for i in allData[col] if i == i]
        uniq_valid_vals = list(set(uniq_valid_vals))
        # at least 10 distinct numeric values -> treat as numerical
        if len(uniq_valid_vals) >= 10 and isinstance(uniq_valid_vals[0], numbers.Real):
            numerical_var.append(col)

categorical_var = [i for i in allFeatures if i not in numerical_var]
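For comparison, the same split can be sketched with pandas built-ins; this is an alternative, not the author's method, and may classify low-cardinality numeric codes differently:

# Alternative: numeric dtype with at least 10 distinct non-null values counts as numerical
numerical_alt = [c for c in allFeatures
                 if pd.api.types.is_numeric_dtype(allData[c]) and allData[c].nunique() >= 10]
categorical_alt = [c for c in allFeatures if c not in numerical_alt]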

len(numerical_var)
len(categorical_var)
#%%
# For each variable, check the share taken by its most frequent value, and record that value
records_count = allData.shape[0]
col_most_values, col_large_value = {}, {}
for col in allFeatures:
    value_count = allData[col].groupby(allData[col]).count()
    col_most_values[col] = max(value_count) / records_count
    large_value = value_count[value_count == max(value_count)].index[0]
    col_large_value[col] = large_value
col_most_values_df = pd.DataFrame.from_dict(col_most_values, orient='index')
col_most_values_df.columns = ['max percent']
col_most_values_df = col_most_values_df.sort_values(by='max percent', ascending=False)
pcnt = list(col_most_values_df[:180]['max percent'])
vars = list(col_most_values_df[:180].index)
plt.bar(range(len(pcnt)), height=pcnt)
plt.title('Largest Percentage of Single Value in Each Variable')
len(col_most_values)
len(col_large_value)
#%%
# For columns whose majority value covers at least 90% of samples, test whether the
# minority values' bad-sample rate is significantly higher than the majority value's
# (see the worked log-ratio example after this cell)
large_percent_cols = list(col_most_values_df[col_most_values_df['max percent'] >= 0.9].index)
bad_rate_diff = {}
for col in large_percent_cols:
    large_value = col_large_value[col]
    temp = allData[[col, 'target']].copy()   # copy to avoid SettingWithCopyWarning
    temp[col] = temp.apply(lambda x: int(x[col] == large_value), axis=1)
    bad_rate = temp.groupby(col).mean()      # row 0 = minority values, row 1 = majority value
    if bad_rate.iloc[0]['target'] == 0:
        bad_rate_diff[col] = 0
        continue
    bad_rate_diff[col] = np.log(bad_rate.iloc[0]['target'] / bad_rate.iloc[1]['target'])
bad_rate_diff_sorted = sorted(bad_rate_diff.items(), key=lambda x: x[1], reverse=True)
bad_rate_diff_sorted_values = [x[1] for x in bad_rate_diff_sorted]
plt.bar(x=range(len(bad_rate_diff_sorted_values)), height=bad_rate_diff_sorted_values)
len(bad_rate_diff)
#%%
# None of the minority values show a significantly higher bad rate than the majority
# value, so these near-constant variables can be dropped outright
for col in large_percent_cols:
    if col in numerical_var:
        numerical_var.remove(col)
    else:
        categorical_var.remove(col)
    del allData[col]
#%%
missvalue = {}
# Categorical variables: drop the column if more than 80% is missing; otherwise either
# treat missing as a special state (the commented-out approach below) or fill with the
# most frequent value (the approach actually used)
missing_pcnt_threshould_1 = 0.8
#%%
# for col in categorical_var:
#     missingRate = MissingCategorial(allData, col)
#     print('{0} has missing rate as {1}'.format(col, missingRate))
#     if missingRate > missing_pcnt_threshould_1:
#         categorical_var.remove(col)
#         del allData[col]
#     if 0 < missingRate < missing_pcnt_threshould_1:
#         uniq_valid_vals = [i for i in allData[col] if i == i]
#         uniq_valid_vals = list(set(uniq_valid_vals))
#         if isinstance(uniq_valid_vals[0], numbers.Real):
#             missing_position = allData.loc[allData[col] != allData[col]][col].index
#             not_missing_sample = [-1] * len(missing_position)
#             allData.loc[missing_position, col] = not_missing_sample
#         else:
#             allData[col] = allData[col].map(lambda x: str(x).upper())
#%%
for col in list(categorical_var):   # iterate over a copy: the list is modified in the loop
    missingRate = MissingCategorial(allData, col)
    print('{0} has missing rate as {1}'.format(col, missingRate))
    if missingRate > missing_pcnt_threshould_1:
        categorical_var.remove(col)
        del allData[col]
    else:
        # fill with the most frequent value and remember it for scoring time
        temp = allData[col].value_counts()
        max1 = max(temp)
        maxmiss = temp[temp == max1].index[0]
        missvalue[col] = maxmiss
        allData[col].fillna(maxmiss, inplace=True)
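Returning to the bad-rate comparison above: the statistic is the log of the minority bad rate over the majority bad rate, so a value near 0 means the minority values carry no extra risk signal. A worked toy calculation (the rates are made up for illustration):

# Minority bad rate 0.10 vs majority bad rate 0.05: minority is clearly riskier
print(np.log(0.10 / 0.05))   # ~0.693
# Equal bad rates: no signal in the minority values, the variable is safe to drop
print(np.log(0.05 / 0.05))   # 0.0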

# sum(pd.isnull(allData['WeblogInfo_19']))
#%%
# Check numerical variables

# len(numerical_var)
# missing_pcnt_threshould_2 = 0.8
# deleted_var = []
# import random   # needed if using this random-sampling approach
# for col in numerical_var:
#     missingRate = MissingContinuous(allData, col)
#     print('{0} missing rate: {1}'.format(col, missingRate))
#     if missingRate > missing_pcnt_threshould_2:
#         deleted_var.append(col)
#         print('Variable {} will be dropped: its missing rate exceeds the threshold'.format(col))
#     else:
#         if missingRate > 0:
#             not_missing = allData.loc[allData[col] == allData[col]][col]
#             # makeuped = allData[col].map(lambda x: MakeupRandom(x, list(not_missing)))
#             missing_position = allData.loc[allData[col] != allData[col]][col].index
#             # fill the gaps by random sampling from the non-missing values
#             not_missing_sample = random.sample(list(not_missing), len(missing_position))
#             allData.loc[missing_position, col] = not_missing_sample
#             # del allData[col]
#             # allData[col] = makeuped
#             missingRate2 = MissingContinuous(allData, col)
#             print('missing rate after making up is: {}'.format(str(missingRate2)))
#%%
len(numerical_var)
missing_pcnt_threshould_2 = 0.8
deleted_var = []
for col in numerical_var:
    missingRate = MissingContinuous(allData, col)
    print('{0} missing rate: {1}'.format(col, missingRate))
    if missingRate > missing_pcnt_threshould_2:
        deleted_var.append(col)
        print('Variable {} will be dropped: its missing rate exceeds the threshold'.format(col))
    else:
        if missingRate > 0:
            # fill with the column mean (rounded to 6 decimals) and remember it
            meanmiss = allData[col].mean()
            missvalue[col] = round(meanmiss, 6)
            allData[col].fillna(round(meanmiss, 6), inplace=True)

import pickle

with open('var_Fill.pkl', 'wb') as f:
    # persist the per-column fill values so the same imputation can be replayed at scoring time
    pickle.dump(missvalue, f)
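A sketch of how the persisted fill values would be replayed on new data at scoring time; newData and its file name are hypothetical:

with open('var_Fill.pkl', 'rb') as f:
    fill_values = pickle.load(f)
# newData = pd.read_csv('Idx0_new.csv', header=0, encoding='gbk')   # hypothetical file
# newData.fillna(fill_values, inplace=True)   # dict maps column name -> fill value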

#%%
if deleted_var != []:
    for col in deleted_var:
        numerical_var.remove(col)
        del allData[col]

#%%
allData.to_csv('Idx1.csv', header=True, encoding='gbk', columns=allData.columns, index=False)
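As a quick sanity check (a sketch, assuming Idx1.csv was just written as above), the saved file should read back with the same shape and no remaining missing values:

check = pd.read_csv('Idx1.csv', header=0, encoding='gbk')
print(check.shape == allData.shape)       # True expected
print(int(check.isnull().sum().sum()))    # 0 expected: every gap was filled or the column dropped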

#%%
# Scratch comparison against an earlier output (L0 is a DataFrame defined elsewhere):
# L0['Update_90_rate'] = round(L0['Update_90_rate'], 4)
# L0set = set(L0.columns)
# L1 = pd.read_csv('allData_1.csv', encoding='gbk')
# L1set = set(L1.columns)
# L1['Update_90_rate'] = round(L1['Update_90_rate'], 4)
# L0set == L1set
#
# L2 = L0.append(L1)
# # del L2['ListingInfo']
# L3 = L2.drop_duplicates()