1. 程式人生 > >第九章 資料分組與聚合(下)

第九章 資料分組與聚合(下)

import pandas as pd
from pandas import Series
fec=pd.read_csv("e:/P00000001-ALL.csv")
fec[:2]
     cmte_id    cand_id             cand_nm        contbr_nm contbr_city  \
0  C00410118  P20002978  Bachmann, Michelle  HARVEY, WILLIAM      MOBILE   
1  C00410118  P20002978  Bachmann, Michelle  HARVEY, WILLIAM      MOBILE   

  contbr_st  contbr_zip contbr_employer contbr_occupation  contb_receipt_amt  \
0        AL  3.6601e+08         RETIRED           RETIRED              250.0   
1        AL  3.6601e+08         RETIRED           RETIRED               50.0   

  contb_receipt_dt receipt_desc memo_cd memo_text form_tp  file_num  
0        20-JUN-11          NaN     NaN       NaN   SA17A    736166  
1        23-JUN-11          N
fec.ix[123456]
unique_cands
cmte_id                             C00431445
cand_id                             P80003338
cand_nm                         Obama, Barack
contbr_nm                         ELLMAN, IRA
contbr_city                             TEMPE
contbr_st                                  AZ
contbr_zip                          852816719
contbr_employer      ARIZONA STATE UNIVERSITY
contbr_occupation                   PROFESSOR
contb_receipt_amt                          50
contb_receipt_dt                    01-DEC-11
receipt_desc                              NaN
memo_cd                                   NaN
memo_text                                 NaN
form_tp                                 SA17A
file_num                               772372
Name: 123456, dtype: object
unique_cands=fec.cand_nm.unique()
patries={'Bachmann, Michelle':'Republican','Cain, Herman':'Republican',
         'Gingrich, Newt':'Republican','Huntsman, Jon':'Republican',
         'Johnson, Gary Earl':'Republican','McCotter, Thaddeus G':'Republican',
         'Obama, Barack':'Democrat','Paul, Ron':'Republican',
         'Pawlenty, Timothy':'Republican','Perry, Rick':'Republican',
         "Roemer, Charles E. 'Buddy' III":'Republican',
         'Romney, Mitt':'Republican',
         'Santorum, Rick':'Republican'
         }
unique_cands[2]
'Obama, Barack'fec.cand_nm[123456:123461]
123456    Obama, Barack
123457    Obama, Barack
123458    Obama, Barack
123459    Obama, Barack
123460    Obama, Barack
Name: cand_nm, dtype: object
fec.cand_nm[123456:123461].map(patries)#通過這個對映以及Series物件的map方法,根據候選人姓名得到黨派資訊123456    Democrat
123457    Democrat
123458    Democrat
123459    Democrat
123460    Democrat
Name: cand_nm, dtype: object
#新增一個新列fec['party']=fec.cand_nm.map(patries)fec['party'].value_counts()
Democrat      593746
Republican    407985
Name: party, dtype: int64
#這裡需要注意:該數字既包括贊助也包括退款(負的出資額),限定該資料集只能有正的出資額

fec=fec[fec.contb_receipt_amt>0]

#由於Barack Obama和Mitt Romney是最主要的兩名候選人,建立一個子集只包含針對他們兩人的競選活動的贊助資訊

fec_mrbo=fec[fec.cand_nm.isin(['Obama, Barack','Romney, Mitt'])]





根據職業和僱主統計資訊

#首先根據職業計算總出資額

fec.contbr_occupation.value_counts()[:10]
Out[30]: 
RETIRED                                   233990
INFORMATION REQUESTED                      35107
ATTORNEY                                   34286
HOMEMAKER                                  29931
PHYSICIAN                                  23432
INFORMATION REQUESTED PER BEST EFFORTS     21138
ENGINEER                                   14334
TEACHER                                    13990
CONSULTANT                                 13273
PROFESSOR                                  12555
Name: contbr_occupation, dtype: int64

#巧妙運用dict.get,允許沒有對映關係的職業也能通過,處理職業資訊
occ_mapping={
             'INFORMATION REQUESTED PER BEST EFFORTS':'NOT PROVIDED',
             'INFORMATION REQUESTED':'NOT PROVIDED',
             'INFORMATION REQUESTED(BEST EFFORTS)':'NOT PROVIDED',
             'C.E.O.':'CEO'
             }
#如果沒有提供相關對映,則返回x
f=lambda x:occ_mapping.get(x,x)
fec.contbr_employer=fec.contbr_employer.map(f)

#通過pivot_table根據黨派和職業對資料進行聚合,過濾掉總出資額不足200萬美元的資料

by_occupation=fec.pivot_table('contb_receipt_amt',index='contbr_occupation',
                              columns='party',aggfunc='sum')

over_2mm=by_occupation[by_occupation.sum(1)>2000000]

over_2mm
Out[42]: 
party                                      Democrat    Republican
contbr_occupation                                                
ATTORNEY                                11141982.97  7.477194e+06
C.E.O.                                      1690.00  2.592983e+06
CEO                                      2073284.79  1.618057e+06
CONSULTANT                               2459912.71  2.544725e+06
ENGINEER                                  951525.55  1.818374e+06
EXECUTIVE                                1355161.05  4.138850e+06
HOMEMAKER                                4248875.80  1.363428e+07
INFORMATION REQUESTED                    4866973.96  3.896616e+06
INFORMATION REQUESTED PER BEST EFFORTS          NaN  1.634053e+07
INVESTOR                                  884133.00  2.431769e+06
LAWYER                                   3160478.87  3.912243e+05
MANAGER                                   762883.22  1.444532e+06
OWNER                                    1001567.36  2.408287e+06
PHYSICIAN                                3735124.94  3.594320e+06
PRESIDENT                                1878509.95  4.720924e+06
PROFESSOR                                2165071.08  2.967027e+05
REAL ESTATE                               528902.09  1.625902e+06
RETIRED                                 25305116.38  2.356124e+07
SELF-EMPLOYED                             672393.40  1.640253e+06
import matplotlib
%matplotlib inline
over_2mm.plot(kind='barh')




def get_top_amounts(group,key,n=5):
    totals=group.groupby(key)['contb_receipt_amt'].sum()

#    根據key對totals進行降序排列
    return totals.sort_values(ascending=False)[n:]
#根據職業和僱主進行聚合

grouped=fec_mrbo.groupby('cand_nm')

grouped.apply(get_top_amounts,'contbr_occupation',n=7)
Out[56]: 
cand_nm        contbr_occupation                     
Obama, Barack  PROFESSOR                                 2165071.08
               CEO                                       2073284.79
               PRESIDENT                                 1878509.95
               NOT EMPLOYED                              1709188.20
               EXECUTIVE                                 1355161.05
               TEACHER                                   1250969.15
               WRITER                                    1084188.88
               OWNER                                     1001567.36
               ENGINEER                                   951525.55
               INVESTOR                                   884133.00
               ARTIST                                     763125.00
               MANAGER                                    762883.22
               SELF-EMPLOYED                              672393.40
               STUDENT                                    628099.75
               REAL ESTATE                                528902.09
               CHAIRMAN                                   496547.00
               ARCHITECT                                  483859.89
               DIRECTOR                                   471741.73
               BUSINESS OWNER                             449979.30
               EDUCATOR                                   436600.89
               PSYCHOLOGIST                               427299.92
               SOFTWARE ENGINEER                          396985.65
               PARTNER                                    395759.50
               SALES                                      392886.91
               EXECUTIVE DIRECTOR                         348180.94
               MANAGING DIRECTOR                          329688.25
               SOCIAL WORKER                              326844.43
               VICE PRESIDENT                             325647.15
               ADMINISTRATOR                              323079.26
               SCIENTIST                                  319227.88
   
Romney, Mitt   NON-PROFIT VETERANS ORG. CHAIR/ANNUITA         10.00
               PARAPLANNER                                    10.00
               APPRAISAL                                      10.00
               SIGN CONTRACTOR                                10.00
               POLITICAL OPERATIVE                            10.00
               PORT MGT                                       10.00
               PRESIDENT EMERITUS                             10.00
               CONTRACTS SPECIALIST                            9.00
               TEACHER & FREE-LANCE JOURNALIST                 9.00
               FOUNDATION CONSULTANT                           6.00
               MAIL HANDLER                                    6.00
               TREASURER & DIRECTOR OF FINANCE                 6.00
               SECRETARY/BOOKKEPPER                            6.00
               ELAYNE WELLS HARMER                             6.00
               CHICKEN GRADER                                  5.00
               DIRECTOR REISCHAUER CENTER FOR EAST A           5.00
               SCOTT GREENBAUM                                 5.00
               EDUCATION ADMIN                                 5.00
               ENGINEER/RISK EXPERT                            5.00
               PLANNING AND OPERATIONS ANALYST                 5.00
               VILLA NOVA                                      5.00
               FINANCIAL INSTITUTION - CEO                     5.00
               HORTICULTURIST                                  5.00
               MD - UROLOGIST                                  5.00
               DISTRICT REPRESENTATIVE                         5.00
               INDEPENDENT PROFESSIONAL                        3.00
               REMODELER & SEMI RETIRED                        3.00
               AFFORDABLE REAL ESTATE DEVELOPER                3.00
               IFC CONTRACTING SOLUTIONS                       3.00
               3RD GENERATION FAMILY BUSINESS OWNER            3.00
Name: contb_receipt_amt, dtype: float64
grouped.apply(get_top_amounts,'contbr_occupation',n=10)
Out[57]: 
cand_nm        contbr_occupation                     
Obama, Barack  NOT EMPLOYED                              1709188.20
               EXECUTIVE                                 1355161.05
               TEACHER                                   1250969.15
               WRITER                                    1084188.88
               OWNER                                     1001567.36
               ENGINEER                                   951525.55
               INVESTOR                                   884133.00
               ARTIST                                     763125.00
               MANAGER                                    762883.22
               SELF-EMPLOYED                              672393.40
               STUDENT                                    628099.75
               REAL ESTATE                                528902.09
               CHAIRMAN                                   496547.00
               ARCHITECT                                  483859.89
               DIRECTOR                                   471741.73
               BUSINESS OWNER                             449979.30
               EDUCATOR                                   436600.89
               PSYCHOLOGIST                               427299.92
               SOFTWARE ENGINEER                          396985.65
               PARTNER                                    395759.50
               SALES                                      392886.91
               EXECUTIVE DIRECTOR                         348180.94
               MANAGING DIRECTOR                          329688.25
               SOCIAL WORKER                              326844.43
               VICE PRESIDENT                             325647.15
               ADMINISTRATOR                              323079.26
               SCIENTIST                                  319227.88
               VOLUNTEER                                  305233.64
               FINANCE                                    296031.40
               MARKETING                                  263610.68
   
Romney, Mitt   NON-PROFIT VETERANS ORG. CHAIR/ANNUITA         10.00
               PARAPLANNER                                    10.00
               APPRAISAL                                      10.00
               SIGN CONTRACTOR                                10.00
               POLITICAL OPERATIVE                            10.00
               PORT MGT                                       10.00
               PRESIDENT EMERITUS                             10.00
               CONTRACTS SPECIALIST                            9.00
               TEACHER & FREE-LANCE JOURNALIST                 9.00
               FOUNDATION CONSULTANT                           6.00
               MAIL HANDLER                                    6.00
               TREASURER & DIRECTOR OF FINANCE                 6.00
               SECRETARY/BOOKKEPPER                            6.00
               ELAYNE WELLS HARMER                             6.00
               CHICKEN GRADER                                  5.00
               DIRECTOR REISCHAUER CENTER FOR EAST A           5.00
               SCOTT GREENBAUM                                 5.00
               EDUCATION ADMIN                                 5.00
               ENGINEER/RISK EXPERT                            5.00
               PLANNING AND OPERATIONS ANALYST                 5.00
               VILLA NOVA                                      5.00
               FINANCIAL INSTITUTION - CEO                     5.00
               HORTICULTURIST                                  5.00
               MD - UROLOGIST                                  5.00
               DISTRICT REPRESENTATIVE                         5.00
               INDEPENDENT PROFESSIONAL                        3.00
               REMODELER & SEMI RETIRED                        3.00
               AFFORDABLE REAL ESTATE DEVELOPER                3.00
               IFC CONTRACTING SOLUTIONS                       3.00
               3RD GENERATION FAMILY BUSINESS OWNER            3.00
Name: contb_receipt_amt, dtype: float64
對出資額分組
import numpy as np
bins=np.array([0,1,10,100,1000,10000,100000,1000000,10000000])
labels=pd.cut(fec_mrbo.contb_receipt_amt,bins)
labels
411           (10, 100]
412         (100, 1000]
413         (100, 1000]
414           (10, 100]
415           (10, 100]
416           (10, 100]
417         (100, 1000]
418           (10, 100]
419         (100, 1000]
420           (10, 100]
421           (10, 100]
422         (100, 1000]
423         (100, 1000]
424         (100, 1000]
425         (100, 1000]
426         (100, 1000]
427       (1000, 10000]
428         (100, 1000]
429         (100, 1000]
430           (10, 100]
431       (1000, 10000]
432         (100, 1000]
433         (100, 1000]
434         (100, 1000]
435         (100, 1000]
436         (100, 1000]
437           (10, 100]
438         (100, 1000]
439         (100, 1000]
440           (10, 100]
     
701356        (10, 100]
701357          (1, 10]
701358        (10, 100]
701359        (10, 100]
701360        (10, 100]
701361        (10, 100]
701362      (100, 1000]
701363        (10, 100]
701364        (10, 100]
701365        (10, 100]
701366        (10, 100]
701367        (10, 100]
701368      (100, 1000]
701369        (10, 100]
701370        (10, 100]
701371        (10, 100]
701372        (10, 100]
701373        (10, 100]
701374        (10, 100]
701375        (10, 100]
701376    (1000, 10000]
701377        (10, 100]
701378        (10, 100]
701379      (100, 1000]
701380    (1000, 10000]
701381        (10, 100]
701382      (100, 1000]
701383          (1, 10]
701384        (10, 100]
701385      (100, 1000]
Name: contb_receipt_amt, dtype: category
Categories (8, object): [(0, 1] < (1, 10] < (10, 100] < (100, 1000] < (1000, 10000] < (10000, 100000] < (100000, 1000000] < (1000000, 10000000]]
#根據候選人姓名以及面元標籤對資料進行分組
grouped=fec_mrbo.groupby(['cand_nm',labels])
grouped.size().unstack(0)
Out[67]: 
cand_nm              Obama, Barack  Romney, Mitt
contb_receipt_amt                               
(0, 1]                       493.0          77.0
(1, 10]                    40070.0        3681.0
(10, 100]                 372280.0       31853.0
(100, 1000]               153991.0       43357.0
(1000, 10000]              22284.0       26186.0
(10000, 100000]                2.0           1.0
(100000, 1000000]              3.0           NaN
(1000000, 10000000]            4.0           NaN
#對出資額求合併在面元內規格化,以便圖形化顯示兩位候選人各種贊助額度
bucket_sums
Out[69]: 
cand_nm              Obama, Barack  Romney, Mitt
contb_receipt_amt                               
(0, 1]                      318.24         77.00
(1, 10]                  337267.62      29819.66
(10, 100]              20288981.41    1987783.76
(100, 1000]            54798531.46   22363381.69
(1000, 10000]          51753705.67   63942145.42
(10000, 100000]           59100.00      12700.00
(100000, 1000000]       1490683.08           NaN
(1000000, 10000000]     7148839.76           NaN
normed_sums=bucket_sums.div(bucket_sums.sum(axis=1),axis=0)
normed_sums
Out[71]: 
cand_nm              Obama, Barack  Romney, Mitt
contb_receipt_amt                               
(0, 1]                    0.805182      0.194818
(1, 10]                   0.918767      0.081233
(10, 100]                 0.910769      0.089231
(100, 1000]               0.710176      0.289824
(1000, 10000]             0.447326      0.552674
(10000, 100000]           0.823120      0.176880
(100000, 1000000]         1.000000           NaN
(1000000, 10000000]       1.000000           NaN
normed_sums[:-2].plot(kind='barh',stacked=True)


#根據州統計贊助資訊