pandas處理資料例子
阿新 • • 發佈:2018-12-31
import pandas as pd
import os
import numpy as np
# Function meant to be passed to groupby(...).apply; it handles one group.
def add_prop(group):
    """Return a copy of *group* with a ``prop`` column added.

    Args:
        group: DataFrame holding a numeric ``births`` column.

    Returns:
        A new DataFrame with the columns of *group* plus ``prop``, where
        ``prop`` is each row's ``births`` divided by the sum of ``births``
        over the whole frame. The frame passed in is not modified.
    """
    # Cast up front so the division is float division even when the counts
    # are integers.
    births = group["births"].astype(float)
    # assign() builds a new frame; writing the column into `group` itself
    # would mutate the object handed in by groupby.apply.
    return group.assign(prop=births / births.sum())
# Largest-`births` rows of one group (1000 by default).
def top1000(group, n=1000):
    """Return the *n* rows of *group* with the highest ``births``.

    Args:
        group: DataFrame holding a ``births`` column.
        n: Maximum number of rows to keep. Defaults to 1000.

    Returns:
        A DataFrame sorted by ``births`` in descending order and cut to at
        most *n* rows. *group* itself is not modified.
    """
    ranked = group.sort_values(by="births", ascending=False)
    return ranked[:n]
def main():
    """Load the yearly baby-name files and print a few grouped summaries.

    Reads ``names/yob<year>.txt`` for each year from 1880 to 2010 (missing
    files are reported and skipped), concatenates them into one frame with
    an added ``years`` column, then:

    * builds and plots total births per year and sex,
    * adds a per-(years, sex) ``prop`` column via ``add_prop``,
    * prints whether ``prop`` sums to 1 within every group,
    * prints the year-2000 rows of the per-group top-1000 table.

    Raises:
        SystemExit: if none of the expected input files exists.
    """
    years = range(1880, 2011)
    columns = ["name", "sex", "births"]
    pieces = []

    # Read the per-year files in sequence.
    for year in years:
        path = "names/yob{year}.txt".format(year=year)
        if not os.path.exists(path):
            print("{path} does not exists".format(path=path))
            continue
        print("begin to read {path}".format(path=path))
        # The files are comma-separated and have no header row.
        frame = pd.read_csv(path, names=columns)
        frame["years"] = year
        pieces.append(frame)

    # pd.concat raises an opaque ValueError on an empty list; fail clearly.
    if not pieces:
        raise SystemExit("no names/yob<year>.txt files found; nothing to analyse")

    names = pd.concat(pieces, ignore_index=True)

    # Aggregate: total births per year, one column per sex.
    total_births = names.pivot_table(
        values="births", index="years", columns="sex", aggfunc="sum"
    )
    # NOTE: this only creates the figure; the script never shows or saves it.
    total_births.plot(title="births")

    # groupby + apply: add each name's share of births within its group.
    # group_keys=False keeps the original row index, so 'years' and 'sex'
    # remain ordinary columns and can be grouped on again below.
    names = names.groupby(["years", "sex"], group_keys=False).apply(add_prop)
    print(names[:10])

    # Verify: the shares must sum to 1 inside every (years, sex) group.
    print(np.allclose(names.groupby(["years", "sex"])["prop"].sum(), 1))

    # Top 1000 names for every (years, sex) group. The result gets its own
    # name so it does not shadow the top1000 function.
    top_by_group = names.groupby(["years", "sex"]).apply(top1000)
    print(top_by_group.loc[top_by_group["years"] == 2000])


if __name__ == "__main__":
    main()