
Notes on "Python for Data Analysis" --- Chapter 9: Data Aggregation and Group Operations

A few words up front:

One thing I must point out: I'm running Python 2.7. Some of the code in the book contains errors, so I reworked it until everything ran under my own 2.7 setup.
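One example of the kind of adjustment involved: Windows paths passed to read_csv are safest written as raw strings, so that sequences like \t in a filename are not interpreted as escape characters. A minimal sketch (the path is the one used later in these notes):

# without the r prefix, '...\tips.csv' would turn \t into a tab character
tips = pd.read_csv(r'D:\Source Code\pydata-book-master\ch08\tips.csv')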

# coding: utf-8
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

df = DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                'key2': ['one', 'two', 'one', 'two', 'one'],
                'data1': np.random.randn(5),
                'data2': np.random.randn(5)})
df

# group a single column by another column
grouped = df['data1'].groupby(df['key1'])
grouped
grouped.mean()

# group by two keys; the result has a hierarchical index
means = df['data1'].groupby([df['key1'], df['key2']]).mean()
means
means.unstack()

# any equal-length arrays work as group keys
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
df['data1'].groupby([states, years]).mean()

# column names can be passed directly as keys
df.groupby('key1').mean()
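As a side note, the same result as the states/years grouping can be reached by first attaching the arrays as columns and grouping by name. A small sketch of my own (the column names state and year are not from the book):

df2 = df.copy()
df2['state'] = states
df2['year'] = years
df2.groupby(['state', 'year'])['data1'].mean()   # same values as grouping by the raw arrays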
df.groupby(['key1', 'key2']).mean()
df.groupby(['key1', 'key2']).size()

# iterating over groups
for name, group in df.groupby('key1'):
    print name
    print group

for (k1, k2), group in df.groupby(['key1', 'key2']):
    print k1, k2
    print group

# materialize the groups as a dict
pieces = dict(list(df.groupby('key1')))
pieces['b']

# grouping the columns by dtype (axis=1)
df.dtypes
grouped = df.groupby(df.dtypes, axis=1)
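Rather than materializing every group with dict(list(...)), a single group can also be pulled out directly; a minimal sketch, assuming GroupBy.get_group is available in the installed pandas:

df.groupby('key1').get_group('b')   # same frame as pieces['b'] above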
dict(list(grouped))

# selecting a column or subset of columns
df.groupby('key1')['data1']
df.groupby('key1')[['data1']]
df.groupby(['key1', 'key2'])[['data2']].mean()
s_grouped = df.groupby(['key1', 'key2'])['data2']
s_grouped
s_grouped.mean()

# grouping with dicts and Series
people = DataFrame(np.random.randn(5, 5), columns=['a', 'b', 'c', 'd', 'e'],
                   index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people.ix[2:3, ['b', 'c']] = np.nan
people
mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f': 'orange'}
by_column = people.groupby(mapping, axis=1)
by_column.sum()
map_series = Series(mapping)
map_series
people.groupby(map_series, axis=1).count()

# grouping with functions
people.groupby(len).sum()
key_list = ['one', 'one', 'one', 'two', 'two']
people.groupby([len, key_list]).min()

# grouping by index levels
columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'],
                                     [1, 3, 5, 1, 3]], names=['cty', 'tenor'])
hier_df = DataFrame(np.random.randn(4, 5), columns=columns)
hier_df
hier_df.groupby(level='cty', axis=1).count()
hier_df.groupby(level='tenor', axis=1).count()
hier_df.groupby(level=['cty', 'tenor'], axis=1).count()

# data aggregation
df
grouped = df.groupby('key1')
grouped['data1'].quantile(0.9)

def peak_to_peak(arr):
    return arr.max() - arr.min()

grouped.agg(peak_to_peak)
grouped.describe()

# the tips dataset
tips = pd.read_csv(r'D:\Source Code\pydata-book-master\ch08\tips.csv')
tips['tip_pct'] = tips['tip'] / tips['total_bill']
tips.head()

# column-wise and multiple-function application
grouped = tips.groupby(['sex', 'smoker'])
grouped_pct = grouped['tip_pct']
grouped_pct.agg('mean')
grouped_pct.agg(['mean', 'std', peak_to_peak])
grouped_pct.agg([('foo', 'mean'), ('bar', np.std)])   # (name, function) tuples
functions = ['count', 'mean', 'max']
result = grouped['tip_pct', 'total_bill'].agg(functions)
result
result['tip_pct']
ftuples = [('Durchschnitt', 'mean'), ('Abweichung', np.var)]
grouped['tip_pct', 'total_bill'].agg(ftuples)
grouped.agg({'tip': np.max, 'size': sum})
grouped.agg({'tip': ['min', 'max', 'mean', 'std'], 'size': sum})
tips.groupby(['sex', 'smoker'], as_index=False).mean()

# group-wise operations and transformations
df
k1_means = df.groupby('key1').mean().add_prefix('mean_')
k1_means
pd.merge(df, k1_means, left_on='key1', right_index=True)
people = DataFrame(np.random.randn(5, 5), columns=['a', 'b', 'c', 'd', 'e'],
                   index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people
key = ['one', 'two', 'one', 'two', 'one']
people.groupby(key).mean()
people.groupby(key).transform(np.mean)

def demean(arr):
    return arr - arr.mean()

demeaned = people.groupby(key).transform(demean)
demeaned
demeaned.groupby(key).mean()   # group means are now ~0

# apply: general split-apply-combine
def top(df, n=5, column='tip_pct'):
    return df.sort_index(by=column)[-n:]

top(tips, n=6)
tips.groupby('smoker').apply(top)
tips.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')
result = tips.groupby('smoker')['tip_pct'].describe()
result
result.unstack('smoker')
f = lambda x: x.describe()
tips.groupby('smoker')['tip_pct'].apply(f)
tips.groupby('smoker').apply(f)
tips.groupby('smoker', group_keys=False).apply(top)

# quantile and bucket analysis
frame = DataFrame({'data1': np.random.randn(1000), 'data2': np.random.randn(1000)})
frame.head()
factor = pd.cut(frame.data1, 4)
factor[:10]

def get_stats(group):
    return {'min': group.min(), 'max': group.max(),
            'count': group.count(), 'mean': group.mean()}

grouped = frame.data2.groupby(factor)
grouped.apply(get_stats)
grouped.apply(get_stats).unstack()
grouping = pd.qcut(frame.data1, 10)
grouping = pd.qcut(frame.data1, 10, labels=False)   # quantile numbers instead of interval labels
grouping
grouped = frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()

# example: group weighted average
df = DataFrame({'category': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'],
                'data': np.random.randn(8),
                'weights': np.random.randn(8)})
df
grouped = df.groupby('category')
get_wavg = lambda g: np.average(g['data'], weights=g['weights'])
grouped.apply(get_wavg)
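Before moving on to the stock example, one more apply idiom from this part of the chapter: filling missing values with group-specific means. A short sketch of my own along those lines:

s = Series(np.random.randn(6))
s[::2] = np.nan   # knock out every other value
group_key = ['one', 'one', 'two', 'two', 'three', 'three']
s.groupby(group_key).apply(lambda g: g.fillna(g.mean()))   # each NaN gets its group's mean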
# example: group correlations with stock returns
close_px = pd.read_csv(r'D:\Source Code\pydata-book-master\ch09\stock_px.csv',
                       parse_dates=True, index_col=0)
close_px
close_px[-4:]
rets = close_px.pct_change().dropna()

# yearly correlation of each stock with the S&P 500
spx_corr = lambda x: x.corrwith(x['SPX'])
by_year = rets.groupby(lambda x: x.year)   # group by the year of each date in the index
by_year.apply(spx_corr)
by_year.apply(lambda g: g['AAPL'].corr(g['MSFT']))

# example: group-wise linear regression
import statsmodels.api as sm

def regress(data, yvar, xvars):
    Y = data[yvar]
    X = data[xvars]
    X['intercept'] = 1
    result = sm.OLS(Y, X).fit()
    return result.params

by_year.apply(regress, 'AAPL', ['SPX'])

# example: 2012 Federal Election Commission database
fec = pd.read_csv(r'D:\Source Code\pydata-book-master\ch09\P00000001-ALL.csv')
fec
fec.ix[123456]
unique_cands = fec.cand_nm.unique()
unique_cands
unique_cands[2]
parties = {'Bachmann, Michelle': 'Republican',
           'Cain, Herman': 'Republican',
           'Gingrich, Newt': 'Republican',
           'Huntsman, Jon': 'Republican',
           'Johnson, Gary Earl': 'Republican',
           'McCotter, Thaddeus G': 'Republican',
           'Obama, Barack': 'Democrat',
           'Paul, Ron': 'Republican',
           'Pawlenty, Timothy': 'Republican',
           'Perry, Rick': 'Republican',
           "Roemer, Charles E. 'Buddy' III": 'Republican',
           'Romney, Mitt': 'Republican',
           'Santorum, Rick': 'Republican'}
fec.cand_nm[123456:123461]
fec.cand_nm[123456:123461].map(parties)
fec['party'] = fec.cand_nm.map(parties)
fec['party'].value_counts()

# restrict to positive contributions and the two main candidates
(fec.contb_receipt_amt > 0).value_counts()
fec = fec[fec.contb_receipt_amt > 0]
fec_mrbo = fec[fec.cand_nm.isin(['Obama, Barack', 'Romney, Mitt'])]
fec_mrbo

# donation statistics by occupation and employer
fec.contbr_occupation.value_counts()[:10]
occ_mapping = {
    'INFORMATION REQUESTED PER BEST EFFORTS': 'NOT PROVIDED',
    'INFORMATION REQUESTED': 'NOT PROVIDED',
    'INFORMATION REQUESTED (BEST EFFORTS)': 'NOT PROVIDED',
    'C.E.O': 'CEO'
}
f = lambda x: occ_mapping.get(x, x)   # fall back to the original value if unmapped
fec.contbr_occupation = fec.contbr_occupation.map(f)
emp_mapping = {
    'INFORMATION REQUESTED PER BEST EFFORTS': 'NOT PROVIDED',
    'INFORMATION REQUESTED': 'NOT PROVIDED',
    'SELF': 'SELF-EMPLOYED',
    'SELF EMPLOYED': 'SELF-EMPLOYED'
}
f = lambda x: emp_mapping.get(x, x)
fec.contbr_employer = fec.contbr_employer.map(f)
by_occupation = fec.pivot_table('contb_receipt_amt', rows='contbr_occupation',
                                cols='party', aggfunc=sum)
by_occupation.head()
over_2mm = by_occupation[by_occupation.sum(1) > 2000000]
over_2mm
over_2mm.plot(kind='barh')

# top donor occupations / employers per candidate
def get_top_amounts(group, key, n=5):
    totals = group.groupby(key)['contb_receipt_amt'].sum()
    return totals.order(ascending=False)[:n]

grouped = fec_mrbo.groupby('cand_nm')
grouped.apply(get_top_amounts, 'contbr_occupation', n=7)
fec_mrbo.groupby(['cand_nm', 'contbr_occupation'])['contb_receipt_amt'].sum()
grouped.apply(get_top_amounts, 'contbr_employer', n=10)
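The chapter also closes this example by bucketing contribution sizes with cut and counting donations per bucket for each candidate. A brief sketch from memory of that part (treat the bin edges as illustrative):

bins = np.array([0, 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000])
labels = pd.cut(fec_mrbo.contb_receipt_amt, bins)   # label each donation with its size bucket
grouped = fec_mrbo.groupby(['cand_nm', labels])
grouped.size().unstack(0)   # donation counts per bucket, one column per candidate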