1. 程式人生 > pandas處理資料例子

pandas處理資料例子

import  pandas as pd
import os
import numpy as np
# Function intended for groupby(...).apply: processes one group at a time.
def add_prop(group):
    """Return *group* with a ``prop`` column holding each row's share of births.

    Args:
        group: DataFrame with a numeric ``births`` column.

    Returns:
        A new DataFrame with the same rows and columns as *group* plus
        ``prop`` = births / (sum of births in *group*). The input frame is
        left unmodified.
    """
    # Cast to float so the division is a true division even for integer counts.
    births = group.births.astype(float)
    # assign() builds a new frame instead of writing into the object that
    # groupby.apply handed in; pandas does not support UDFs that mutate it.
    return group.assign(prop=births / births.sum())
# Rows of one group with the largest births (top 1000 by default).
def top1000(group, n=1000):
    """Return the first *n* rows of *group* after sorting by ``births`` descending.

    Args:
        group: DataFrame with a ``births`` column.
        n: Number of rows to keep; defaults to 1000. Extra keyword arguments
            given to ``groupby(...).apply`` are forwarded, so
            ``grouped.apply(top1000, n=100)`` selects a different cut-off.

    Returns:
        A DataFrame with at most *n* rows, largest ``births`` first.
    """
    ranked = group.sort_values(by="births", ascending=False)
    return ranked[:n]
def main():
    """Load names/yob<year>.txt for 1880-2010 and print per-group statistics.

    Steps: read every yearly file that exists, concatenate them, plot total
    births per year and sex, add each row's share of its (years, sex) group
    via ``add_prop``, check that the shares sum to 1 in every group, and print
    the rows of the per-group top 1000 whose ``years`` column equals 2000.

    Raises:
        SystemExit: if none of the expected input files exists.
    """
    columns = ["name", "sex", "births"]
    pieces = []

    # Read the files one year at a time; missing years are reported and skipped.
    for year in range(1880, 2011):
        path = "names/yob{year}.txt".format(year=year)
        if not os.path.exists(path):
            print("{path} does not exists".format(path=path))
            continue
        print("begin to read {path}".format(path=path))

        # Load the data; column names are supplied explicitly.
        frame = pd.read_table(path, names=columns, sep=',')
        frame['years'] = year
        pieces.append(frame)

    if not pieces:
        # pd.concat([]) would raise a bare ValueError; fail with a clear message.
        raise SystemExit("no names/yob<year>.txt files found for 1880-2010")

    names = pd.concat(pieces, ignore_index=True)

    # Aggregate: total births per year, one column per sex.
    total_births = names.pivot_table(
        "births", index="years", columns="sex", aggfunc="sum")
    total_births.plot(title="births")

    # groupby combined with apply: add the per-group proportion column.
    # group_keys=False keeps the original flat row index, so 'years' and
    # 'sex' remain plain columns and can be grouped on again below.
    names = names.groupby(['years', 'sex'], group_keys=False).apply(add_prop)
    print(names[:10])

    # Verify: the proportions inside every (years, sex) group sum to 1.
    print(np.allclose(names.groupby(['years', 'sex']).prop.sum(), 1))

    # Top 1000 rows by births for every (years, sex) group. The result gets
    # its own name so the top1000 function is not shadowed.
    top_names = names.groupby(['years', 'sex']).apply(top1000)
    # .loc with a boolean mask replaces the removed .ix indexer.
    print(top_names.loc[top_names['years'] == 2000])


if __name__ == "__main__":
    main()