評分卡模型-(一)特徵構建

評分卡模型-(一特徵構建)

# -*- coding: utf-8 -*-
"""
Created on Sun Sep 16 09:24:18 2018

@author: wangxihe

Scorecard model, part 1: feature construction.
Builds behavioural features from the PPD credit training data set.
"""

import os
import datetime
import collections

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# All input CSVs and output files live in the project data directory.
os.chdir(r'E:\spyderwork\評分卡模型\一特徵構建')

# Render Chinese labels and minus signs correctly in matplotlib.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# %% Load the three raw tables (all GBK-encoded CSVs).
MasterData = pd.read_csv('PPD_Training_Master_GBK_3_1_Training_Set.csv', encoding='gbk')
LoginData = pd.read_csv('PPD_LogInfo_3_1_Training_Set.csv', encoding='gbk')
UpdateData = pd.read_csv('PPD_Userupdate_Info_3_1_Training_Set.csv', encoding='gbk')

# %% Parse the login table's date columns ('%Y-%m-%d' strings).
def _parse_dash_date(s):
    # One named helper instead of two identical lambdas.
    return datetime.datetime.strptime(s, '%Y-%m-%d')

LoginData['Listinginfo1'] = LoginData['Listinginfo1'].apply(_parse_dash_date)
LoginData['LogInfo3'] = LoginData['LogInfo3'].apply(_parse_dash_date)

# Gap in days between the listing date and each login event.
LoginData['LogDay'] = (LoginData['Listinginfo1'] - LoginData['LogInfo3']).dt.days

# %% Distribution of the login-day gap (pandas plot).
LoginData['LogDay'].plot(kind='hist', bins=200)

# %% Same distribution via pyplot, with a title.
plt.hist(LoginData['LogDay'], bins=300)
plt.title('登入天數分佈')

# %% Explore login-channel (LogInfo2) frequencies.
LoginData['LogInfo2'].value_counts()
LoginData['LogInfo2'].value_counts().sort_values().plot(kind='barh')

# %%
def MyDiv(x, y):
    """Return x / y as a float, or 0 when y is None or zero.

    Used for ratio features where the denominator (a distinct-value
    count) is legitimately zero for customers with no activity in
    the time window.
    """
    # BUG FIX (idiom): the original tested `y == None`; identity
    # (`is None`) is the correct None check — `==` can be overridden
    # and behaves surprisingly on pandas objects.
    if y is None or y == 0:
        return 0
    return x * 1.0 / y

# %% Time windows (days before listing) and login columns to aggregate.
Tw = [7, 30, 60, 90, 120, 150, 180]
cols = ['LogInfo1', 'LogInfo2']

# One row per distinct customer Idx; feature columns are appended below.
LoginIdxDf = pd.DataFrame({'Idx': LoginData['Idx'].drop_duplicates()})

# For every window `day` and every login column `var`, build:
#   <var>_<day>_totalnum : number of login records inside the window
#   <var>_<day>_unique   : number of distinct <var> values inside the window
#   <var>_<day>_rate     : totalnum / unique (0 when there is no activity)
for day in Tw:
    LoginData['LogTime'] = LoginData['Listinginfo1'] - datetime.timedelta(days=day)
    TempDf = LoginData[LoginData['LogInfo3'] >= LoginData['LogTime']]
    for var in cols:
        # Build each column name once instead of re-concatenating it
        # in every expression that uses it.
        total_col = '%s_%d_totalnum' % (var, day)
        unique_col = '%s_%d_unique' % (var, day)
        rate_col = '%s_%d_rate' % (var, day)

        # Total login records in the window, per customer.
        total_dict = TempDf.groupby('Idx')[var].count().to_dict()
        LoginIdxDf[total_col] = LoginIdxDf['Idx'].apply(lambda x: total_dict.get(x, 0))

        # Distinct values of <var> in the window, per customer.
        union_df = TempDf[['Idx', var]].drop_duplicates()
        unique_dict = union_df.groupby('Idx')[var].count().to_dict()
        LoginIdxDf[unique_col] = LoginIdxDf['Idx'].apply(lambda x: unique_dict.get(x, 0))

        # Ratio feature; MyDiv guards against a zero denominator.
        LoginIdxDf[rate_col] = LoginIdxDf[[total_col, unique_col]].apply(
            lambda x: MyDiv(x[0], x[1]), axis=1)

# BUG FIX: the original called to_csv inside the inner loop, rewriting the
# file on every (day, var) iteration (14 times); write it once at the end.
LoginIdxDf.to_csv('Log.csv')

# %% Parse the update table's date columns ('%Y/%m/%d' — note the
# separator differs from the login table's '%Y-%m-%d').
def _parse_slash_date(s):
    return datetime.datetime.strptime(s, '%Y/%m/%d')

UpdateData['ListingInfo1'] = UpdateData['ListingInfo1'].apply(_parse_slash_date)
UpdateData['UserupdateInfo2'] = UpdateData['UserupdateInfo2'].apply(_parse_slash_date)

# %% Explore which profile fields customers update.
UpdateData['UserupdateInfo1'].value_counts().sort_values(ascending=False)
len(UpdateData['UserupdateInfo1'].value_counts())
# Renamed from `updateTop10`: the name contradicted the content (head(20)).
update_top = UpdateData['UserupdateInfo1'].value_counts().sort_values(ascending=False).head(20).copy()
update_top.sort_values().plot(kind='barh')

# %% Upper-case field names to merge case variants of the same field.
UpdateData['UserupdateInfo1'] = UpdateData['UserupdateInfo1'].apply(lambda x: x.upper())
len(UpdateData['UserupdateInfo1'].value_counts())

def updateNumber(x):
    """Map the mobile-phone field name onto the generic phone field."""
    if x == '_MOBILEPHONE':
        return '_PHONE'
    return x

# NOTE(review): in the flattened original this .apply looked indented
# inside updateNumber (i.e. unreachable after the returns); applying it
# at module level is the only reading that makes the '_PHONE' flag
# feature below meaningful — confirm against the source notebook.
UpdateData['UserupdateInfo1'] = UpdateData['UserupdateInfo1'].apply(updateNumber)

# %% One row per distinct customer Idx for the update features.
UpdateIdxDf = pd.DataFrame({'Idx': UpdateData['Idx'].drop_duplicates()})

for day in Tw:
    UpdateData['LogTime'] = UpdateData['ListingInfo1'] - datetime.timedelta(days=day)
    TempDf = UpdateData[UpdateData['UserupdateInfo2'] >= UpdateData['LogTime']]

    freq_col = 'Update_%d_freq' % day
    unique_col = 'Update_%d_unique' % day
    rate_col = 'Update_%d_rate' % day

    # Total number of update records in the window, per customer.
    freq_dict = TempDf.groupby('Idx')['UserupdateInfo1'].count().to_dict()
    UpdateIdxDf[freq_col] = UpdateIdxDf['Idx'].apply(lambda x: freq_dict.get(x, 0))

    # Number of distinct fields updated in the window, per customer.
    UnionTempDf = TempDf[['Idx', 'UserupdateInfo1']].drop_duplicates()
    unique_dict = UnionTempDf.groupby('Idx')['UserupdateInfo1'].count().to_dict()
    UpdateIdxDf[unique_col] = UpdateIdxDf['Idx'].apply(lambda x: unique_dict.get(x, 0))

    # Ratio feature; MyDiv guards against a zero denominator.
    UpdateIdxDf[rate_col] = UpdateIdxDf[[freq_col, unique_col]].apply(
        lambda x: MyDiv(x[0], x[1]), axis=1)

    # Flags for updates of important fields: string .sum() concatenates the
    # distinct field names per customer, and the flag is 1 when the field
    # name occurs as a substring of that concatenation.
    TempsumDict = UnionTempDf.groupby('Idx')['UserupdateInfo1'].sum()
    for item in ['_IDNUMBER', '_HASBUYCAR', '_MARRIAGESTATUSID', '_PHONE']:
        item_dict = TempsumDict.map(lambda x: int(item in x)).to_dict()
        UpdateIdxDf['UserupdateInfo_' + str(day) + str(item)] = \
            UpdateIdxDf['Idx'].apply(lambda x: item_dict.get(x, 0))

# BUG FIX: write once after the loop instead of rewriting the file on
# every window iteration as in the original.
UpdateIdxDf.to_csv('update.csv')

# %% Flag whether the four location fields agree, then drop the raw columns.
MasterData['city_match'] = MasterData.apply(
    lambda x: int(x.UserInfo_2 == x.UserInfo_4 == x.UserInfo_8 == x.UserInfo_20),
    axis=1)
del MasterData['UserInfo_2']
del MasterData['UserInfo_4']
del MasterData['UserInfo_8']
del MasterData['UserInfo_20']

# Persist the cleaned master table.
MasterData.to_csv('master.csv', encoding='gbk')

# %% Join master data with the login and update features on the Idx key.
allData_0 = pd.concat(
    [MasterData.set_index('Idx'),
     UpdateIdxDf.set_index('Idx'),
     LoginIdxDf.set_index('Idx')],
    axis=1)
allData_0.to_csv('Idx0.csv', encoding='gbk')

# %% Gap in days between listing and each login.
# PERF: vectorised datetime subtraction replaces the original row-wise
# .apply — same values, one C-level pass.
LoginData['MinueDays'] = (LoginData['Listinginfo1'] - LoginData['LogInfo3']).dt.days

def TimeWindowSelection(df, col, tw):
    """Count rows of `df` whose `df[col]` value is <= each threshold in `tw`.

    Returns a dict mapping threshold -> cumulative row count; used to pick
    sensible time windows for the behavioural features.
    """
    return {day: len(df[df[col] <= day]) for day in tw}

tw_dict = TimeWindowSelection(LoginData, 'MinueDays', [7, 15, 30, 60, 90, 120, 150, 180])
tw_df = pd.DataFrame.from_dict(tw_dict, orient='index')

tw_df.plot(kind='bar')

#%%

# PERF: vectorised subtraction (ListingInfo1 - UserupdateInfo2, matching
# the original x[1]-x[0] order) replaces the row-wise .apply.
UpdateData['MinueDays'] = (UpdateData['ListingInfo1'] - UpdateData['UserupdateInfo2']).dt.days
t = collections.Counter(UpdateData['MinueDays'])

# Histogram of the gaps with cumulative frequency / percentage columns.
freq, edges = np.histogram(UpdateData['MinueDays'])
hist_ListingGap = pd.DataFrame({'Freq': freq, 'gap': edges[1:]})
hist_ListingGap['CumFreq'] = hist_ListingGap['Freq'].cumsum()
# Hoist the loop-invariant grand total out of the per-row lambda.
_total = hist_ListingGap.iloc[-1]['CumFreq']
hist_ListingGap['CumPercent'] = hist_ListingGap['CumFreq'].map(lambda x: x * 1.0 / _total)

#%%

# Techniques used above: groupby, collections.Counter, np.histogram, concat, merge