1. 程式人生 > python中的pandas包的資料清洗能力

python中的pandas包的資料清洗能力

pandas 很強大。前幾天公司要求根據已知的使用者身份證號、手機號,得出客戶的星座、性別、年齡等相關資訊,用來做使用者畫像。一開始想到用 R 語言來實現,但考慮到日後的效能問題就放棄了;由於公司沒有 SAS,最後改用 pandas 快速實現。參考程式碼如下:

# coding: UTF-8
'''
Created on 2015年8月25日

@author: ZHOUMEIXU204
'''
import  MySQLdb
import pandas as pd
import time
import  datetime
# Load the lookup workbook from a hard-coded local path.
# NOTE(review): presumably column 'BM' holds region codes and 'DQ' the
# matching region names -- confirm against the workbook.
Table_id =pd.read_excel(u'D:\\Users\\zhoumeixu204\\Desktop\\全國身份證歸屬地資料庫.xlsx')
print(Table_id.head())
# Helpers: stringify a value, and keep only its first six characters.
num_str = lambda x: str(x)
num_sub = lambda x:x[0:6]
# Map the first six characters of each stringified 'BM' value to its 'DQ' value.
id_dict = dict(zip(Table_id['BM'].apply(num_str).apply(num_sub), Table_id['DQ']))
# Two connections to the same server: `con` is read from, `con_dev` is the
# one later used for writing.
# NOTE(review): host, user and password are hard-coded in source; consider
# moving them to configuration or environment variables.
con = MySQLdb.connect(host="202.69.27.239", port=8443, user="root", passwd="Pa123456!", db="analyse", use_unicode=True, charset="utf8")
con_dev = MySQLdb.connect(host="202.69.27.239", port=8443, user="root", passwd="Pa123456!", db="analyse_dev", use_unicode=True, charset="utf8")
# Pull the whole `table_id` table into a DataFrame.
table_id_decode = pd.read_sql("select * from table_id", con)
# First six characters of an ID string (same slice as `num_sub`).
f = lambda x: x[0:6]
# Look up each ID's six-character prefix in id_dict; prefixes with no entry
# get the placeholder string.
table_id_decode['cert_address'] = table_id_decode['cert_id'].apply(f).map(id_dict).fillna(u'無法匹配')
# Inclusive (first MMDD, last MMDD, label) ranges, in calendar order.
# NOTE(review): two labels use nonstandard spellings (the usual forms are
# 天秤座 and 摩羯座). They are kept byte-for-byte because the values are
# written to the database; correct them together with any downstream consumer.
_CONSTELLATION_RANGES = (
    (101, 119, u'魔蠍座'),
    (120, 218, u'水瓶座'),
    (219, 320, u'雙魚座'),
    (321, 419, u'白羊座'),
    (420, 520, u'金牛座'),
    (521, 621, u'雙子座'),
    (622, 722, u'巨蟹座'),
    (723, 822, u'獅子座'),
    (823, 922, u'處女座'),
    (923, 1023, u'天枰座'),
    (1024, 1121, u'天蠍座'),
    (1122, 1221, u'射手座'),
    (1222, 1231, u'魔蠍座'),
)


def constellation(x):
    """Return the Western zodiac sign label for an ID-number string.

    x: the ID number as a string. A 15-character ID stores the birth date
       as YYMMDD at offsets 6-11; longer IDs store YYYYMMDD at offsets 6-13.

    Returns the sign's label, u'其他' when the month/day value falls outside
    101..1231, or u'無法識別' when the ID is shorter than 15 characters.
    Raises ValueError if the month/day characters are not digits.
    """
    if len(x) < 15:
        return u'無法識別'

    # The month/day digits sit two positions earlier in the 15-character
    # format because its year has only two digits.
    if len(x) == 15:
        monthday = int(x[8:12])
    else:
        monthday = int(x[10:14])

    for first, last, label in _CONSTELLATION_RANGES:
        if first <= monthday <= last:
            return label
    return u'其他'

# Chinese zodiac animal indexed by (Gregorian year % 12); e.g. 2016 % 12 == 0.
_ZODIAC_BY_YEAR_MOD_12 = (
    u'猴', u'雞', u'狗', u'豬', u'鼠', u'牛',
    u'虎', u'兔', u'龍', u'蛇', u'馬', u'羊',
)


def zodiac(x):
    """Return the Chinese zodiac animal for the birth year in an ID number.

    x: the ID number as a string. A 15-character ID stores a two-digit year
       at offsets 6-7 (taken as 19YY); longer IDs store a four-digit year at
       offsets 6-9.

    Returns the animal's label, or u'無法獲得' when the ID is shorter than
    15 characters. Raises ValueError if the year characters are not digits.

    The animal is chosen from the Gregorian year alone, so births in January
    or February before the Lunar New Year are attributed to the following
    zodiac year.
    """
    if len(x) < 15:
        return u'無法獲得'

    if len(x) == 15:
        year = 1900 + int(x[6:8])
    else:
        year = int(x[6:10])

    # The 12-year cycle repeats, so the remainder selects the animal; the
    # original compared the full year against 0..11 and never matched.
    return _ZODIAC_BY_YEAR_MOD_12[year % 12]
def sex(x):
    """Return the gender label encoded in an ID-number string.

    For a 15-character ID the last character is inspected; for an
    18-character ID the second-to-last character is inspected. An odd digit
    yields u'男', an even digit u'女'. Any other length yields u'無法識別'.
    Raises ValueError if the inspected character is not a digit.
    """
    # Offset (from the end) of the character to inspect, keyed by ID length.
    offset_by_length = {15: -1, 18: -2}
    offset = offset_by_length.get(len(x))
    if offset is None:
        return u'無法識別'
    return u'男' if int(x[offset]) % 2 else u'女'
# Pause the script for half a second before continuing.
# NOTE(review): nothing visible here depends on this delay -- confirm whether
# it can be removed.
time.sleep(0.5)
def birthday(x):
    """Return the birth date in an ID-number string as 'YYYY-MM-DD'.

    x: the ID number as a string. A 15-character ID stores YYMMDD at offsets
       6-11 (the year is taken as 19YY); longer IDs store YYYYMMDD at
       offsets 6-13.

    Returns the formatted date string, or u'無法獲得' when the ID is shorter
    than 15 characters. The characters are copied as-is; no check is made
    that they form a real calendar date.
    """
    if len(x) < 15:
        return u'無法獲得'
    if len(x) == 15:
        # Two-digit year: prepend the century so both formats give YYYY-MM-DD.
        return "19" + x[6:8] + "-" + x[8:10] + "-" + x[10:12]
    return x[6:10] + "-" + x[10:12] + "-" + x[12:14]
def age(x):
    """Return the age implied by an ID-number string, as a string.

    x: the ID number as a string. A 15-character ID stores a two-digit birth
       year at offsets 6-7 (taken as 19YY); longer IDs store a four-digit
       year at offsets 6-9.

    Returns str(current year - birth year), or u'無法獲得' when the ID is
    shorter than 15 characters. Raises ValueError if the year characters are
    not digits.

    Only the years are compared, so the result does not account for whether
    this year's birthday has already passed.
    """
    if len(x) < 15:
        return u'無法獲得'

    if len(x) == 15:
        birth_year = 1900 + int(x[6:8])
    else:
        birth_year = int(x[6:10])

    return str(datetime.datetime.now().year - birth_year)
# Derive one output column per helper from the ID-number column, write the
# result to the `cert_info` table, and always release both connections.
try:
    table_id_decode['zodiac'] = table_id_decode['cert_id'].apply(zodiac)
    table_id_decode['constellation'] = table_id_decode['cert_id'].apply(constellation)
    table_id_decode['sex'] = table_id_decode['cert_id'].apply(sex)
    table_id_decode['birthday'] = table_id_decode['cert_id'].apply(birthday)
    table_id_decode['age'] = table_id_decode['cert_id'].apply(age)

    # The 'usr_name' column is dropped before the data is written out.
    cert_address = table_id_decode.drop('usr_name', axis=1)

    # if_exists='replace' drops and recreates `cert_info` on every run.
    # NOTE(review): the `flavor` argument was deprecated and later removed
    # from pandas' to_sql, and recent pandas expects a SQLAlchemy connectable
    # for MySQL -- confirm the pandas version this script runs on.
    cert_address.to_sql("cert_info", con_dev, flavor="mysql", if_exists='replace', index=False)
    con.commit()
    con_dev.commit()
finally:
    # Close the connections even when a step above raises, so a failed run
    # does not leave them open on the server.
    con.close()
    con_dev.close()
print("suceess")