第九章 資料分組與聚合(下)
阿新 • • 發佈:2019-01-09
import pandas as pd from pandas import Series fec=pd.read_csv("e:/P00000001-ALL.csv") fec[:2] cmte_id cand_id cand_nm contbr_nm contbr_city \ 0 C00410118 P20002978 Bachmann, Michelle HARVEY, WILLIAM MOBILE 1 C00410118 P20002978 Bachmann, Michelle HARVEY, WILLIAM MOBILE contbr_st contbr_zip contbr_employer contbr_occupation contb_receipt_amt \ 0 AL 3.6601e+08 RETIRED RETIRED 250.0 1 AL 3.6601e+08 RETIRED RETIRED 50.0 contb_receipt_dt receipt_desc memo_cd memo_text form_tp file_num 0 20-JUN-11 NaN NaN NaN SA17A 736166 1 23-JUN-11 N
fec.ix[123456]
unique_candscmte_id C00431445
cand_id P80003338
cand_nm Obama, Barack
contbr_nm ELLMAN, IRA
contbr_city TEMPE
contbr_st AZ
contbr_zip 852816719
contbr_employer ARIZONA STATE UNIVERSITY
contbr_occupation PROFESSOR
contb_receipt_amt 50
contb_receipt_dt 01-DEC-11
receipt_desc NaN
memo_cd NaN
memo_text NaN
form_tp SA17A
file_num 772372
Name: 123456, dtype: object
unique_cands=fec.cand_nm.unique()
unique_cands[2]patries={'Bachmann, Michelle':'Republican','Cain, Herman':'Republican', 'Gingrich, Newt':'Republican','Huntsman, Jon':'Republican', 'Johnson, Gary Earl':'Republican','McCotter, Thaddeus G':'Republican', 'Obama, Barack':'Democrat','Paul, Ron':'Republican', 'Pawlenty, Timothy':'Republican','Perry, Rick':'Republican', "Roemer, Charles E. 'Buddy' III":'Republican', 'Romney, Mitt':'Republican', 'Santorum, Rick':'Republican' }
'Obama, Barack'fec.cand_nm[123456:123461]
123456 Obama, Barack
123457 Obama, Barack
123458 Obama, Barack
123459 Obama, Barack
123460 Obama, Barack
Name: cand_nm, dtype: object
fec.cand_nm[123456:123461].map(patries)#通過這個對映以及Series物件的map方法,根據候選人姓名得到黨派資訊123456 Democrat
123457 Democrat
123458 Democrat
123459 Democrat
123460 Democrat
Name: cand_nm, dtype: object
#新增一個新列fec['party']=fec.cand_nm.map(patries)fec['party'].value_counts()
Democrat 593746
Republican 407985
Name: party, dtype: int64
#這裡需要注意:該數字既包括贊助也包括退款(負的出資額),限定該資料集只能有正的出資額
fec=fec[fec.contb_receipt_amt>0]
#由於Barack Obama和Mitt Romney是最主要的兩名候選人,建立一個子集只包含針對他們兩人的競選活動的贊助資訊
fec_mrbo=fec[fec.cand_nm.isin(['Obama, Barack','Romney, Mitt'])]
根據職業和僱主統計資訊
#首先根據職業計算總出資額
fec.contbr_occupation.value_counts()[:10]
Out[30]:
RETIRED 233990
INFORMATION REQUESTED 35107
ATTORNEY 34286
HOMEMAKER 29931
PHYSICIAN 23432
INFORMATION REQUESTED PER BEST EFFORTS 21138
ENGINEER 14334
TEACHER 13990
CONSULTANT 13273
PROFESSOR 12555
Name: contbr_occupation, dtype: int64
#巧妙運用dict.get,允許沒有對映關係的職業也能通過,處理職業資訊
occ_mapping={
'INFORMATION REQUESTED PER BEST EFFORTS':'NOT PROVIDED',
'INFORMATION REQUESTED':'NOT PROVIDED',
'INFORMATION REQUESTED(BEST EFFORTS)':'NOT PROVIDED',
'C.E.O.':'CEO'
}
#如果沒有提供相關對映,則返回x
f=lambda x:occ_mapping.get(x,x)
fec.contbr_employer=fec.contbr_employer.map(f)
#通過pivot_table根據黨派和職業對資料進行聚合,過濾掉總出資額不足200萬美元的資料
by_occupation=fec.pivot_table('contb_receipt_amt',index='contbr_occupation',
columns='party',aggfunc='sum')
over_2mm=by_occupation[by_occupation.sum(1)>2000000]
over_2mm
Out[42]:
party Democrat Republican
contbr_occupation
ATTORNEY 11141982.97 7.477194e+06
C.E.O. 1690.00 2.592983e+06
CEO 2073284.79 1.618057e+06
CONSULTANT 2459912.71 2.544725e+06
ENGINEER 951525.55 1.818374e+06
EXECUTIVE 1355161.05 4.138850e+06
HOMEMAKER 4248875.80 1.363428e+07
INFORMATION REQUESTED 4866973.96 3.896616e+06
INFORMATION REQUESTED PER BEST EFFORTS NaN 1.634053e+07
INVESTOR 884133.00 2.431769e+06
LAWYER 3160478.87 3.912243e+05
MANAGER 762883.22 1.444532e+06
OWNER 1001567.36 2.408287e+06
PHYSICIAN 3735124.94 3.594320e+06
PRESIDENT 1878509.95 4.720924e+06
PROFESSOR 2165071.08 2.967027e+05
REAL ESTATE 528902.09 1.625902e+06
RETIRED 25305116.38 2.356124e+07
SELF-EMPLOYED 672393.40 1.640253e+06
import matplotlib
%matplotlib inline
over_2mm.plot(kind='barh')
def get_top_amounts(group,key,n=5):
totals=group.groupby(key)['contb_receipt_amt'].sum()
# 根據key對totals進行降序排列
return totals.sort_values(ascending=False)[n:]
#根據職業和僱主進行聚合
grouped=fec_mrbo.groupby('cand_nm')
grouped.apply(get_top_amounts,'contbr_occupation',n=7)
Out[56]:
cand_nm contbr_occupation
Obama, Barack PROFESSOR 2165071.08
CEO 2073284.79
PRESIDENT 1878509.95
NOT EMPLOYED 1709188.20
EXECUTIVE 1355161.05
TEACHER 1250969.15
WRITER 1084188.88
OWNER 1001567.36
ENGINEER 951525.55
INVESTOR 884133.00
ARTIST 763125.00
MANAGER 762883.22
SELF-EMPLOYED 672393.40
STUDENT 628099.75
REAL ESTATE 528902.09
CHAIRMAN 496547.00
ARCHITECT 483859.89
DIRECTOR 471741.73
BUSINESS OWNER 449979.30
EDUCATOR 436600.89
PSYCHOLOGIST 427299.92
SOFTWARE ENGINEER 396985.65
PARTNER 395759.50
SALES 392886.91
EXECUTIVE DIRECTOR 348180.94
MANAGING DIRECTOR 329688.25
SOCIAL WORKER 326844.43
VICE PRESIDENT 325647.15
ADMINISTRATOR 323079.26
SCIENTIST 319227.88
Romney, Mitt NON-PROFIT VETERANS ORG. CHAIR/ANNUITA 10.00
PARAPLANNER 10.00
APPRAISAL 10.00
SIGN CONTRACTOR 10.00
POLITICAL OPERATIVE 10.00
PORT MGT 10.00
PRESIDENT EMERITUS 10.00
CONTRACTS SPECIALIST 9.00
TEACHER & FREE-LANCE JOURNALIST 9.00
FOUNDATION CONSULTANT 6.00
MAIL HANDLER 6.00
TREASURER & DIRECTOR OF FINANCE 6.00
SECRETARY/BOOKKEPPER 6.00
ELAYNE WELLS HARMER 6.00
CHICKEN GRADER 5.00
DIRECTOR REISCHAUER CENTER FOR EAST A 5.00
SCOTT GREENBAUM 5.00
EDUCATION ADMIN 5.00
ENGINEER/RISK EXPERT 5.00
PLANNING AND OPERATIONS ANALYST 5.00
VILLA NOVA 5.00
FINANCIAL INSTITUTION - CEO 5.00
HORTICULTURIST 5.00
MD - UROLOGIST 5.00
DISTRICT REPRESENTATIVE 5.00
INDEPENDENT PROFESSIONAL 3.00
REMODELER & SEMI RETIRED 3.00
AFFORDABLE REAL ESTATE DEVELOPER 3.00
IFC CONTRACTING SOLUTIONS 3.00
3RD GENERATION FAMILY BUSINESS OWNER 3.00
Name: contb_receipt_amt, dtype: float64
grouped.apply(get_top_amounts,'contbr_occupation',n=10)
Out[57]:
cand_nm contbr_occupation
Obama, Barack NOT EMPLOYED 1709188.20
EXECUTIVE 1355161.05
TEACHER 1250969.15
WRITER 1084188.88
OWNER 1001567.36
ENGINEER 951525.55
INVESTOR 884133.00
ARTIST 763125.00
MANAGER 762883.22
SELF-EMPLOYED 672393.40
STUDENT 628099.75
REAL ESTATE 528902.09
CHAIRMAN 496547.00
ARCHITECT 483859.89
DIRECTOR 471741.73
BUSINESS OWNER 449979.30
EDUCATOR 436600.89
PSYCHOLOGIST 427299.92
SOFTWARE ENGINEER 396985.65
PARTNER 395759.50
SALES 392886.91
EXECUTIVE DIRECTOR 348180.94
MANAGING DIRECTOR 329688.25
SOCIAL WORKER 326844.43
VICE PRESIDENT 325647.15
ADMINISTRATOR 323079.26
SCIENTIST 319227.88
VOLUNTEER 305233.64
FINANCE 296031.40
MARKETING 263610.68
Romney, Mitt NON-PROFIT VETERANS ORG. CHAIR/ANNUITA 10.00
PARAPLANNER 10.00
APPRAISAL 10.00
SIGN CONTRACTOR 10.00
POLITICAL OPERATIVE 10.00
PORT MGT 10.00
PRESIDENT EMERITUS 10.00
CONTRACTS SPECIALIST 9.00
TEACHER & FREE-LANCE JOURNALIST 9.00
FOUNDATION CONSULTANT 6.00
MAIL HANDLER 6.00
TREASURER & DIRECTOR OF FINANCE 6.00
SECRETARY/BOOKKEPPER 6.00
ELAYNE WELLS HARMER 6.00
CHICKEN GRADER 5.00
DIRECTOR REISCHAUER CENTER FOR EAST A 5.00
SCOTT GREENBAUM 5.00
EDUCATION ADMIN 5.00
ENGINEER/RISK EXPERT 5.00
PLANNING AND OPERATIONS ANALYST 5.00
VILLA NOVA 5.00
FINANCIAL INSTITUTION - CEO 5.00
HORTICULTURIST 5.00
MD - UROLOGIST 5.00
DISTRICT REPRESENTATIVE 5.00
INDEPENDENT PROFESSIONAL 3.00
REMODELER & SEMI RETIRED 3.00
AFFORDABLE REAL ESTATE DEVELOPER 3.00
IFC CONTRACTING SOLUTIONS 3.00
3RD GENERATION FAMILY BUSINESS OWNER 3.00
Name: contb_receipt_amt, dtype: float64
對出資額分組
import numpy as np
bins=np.array([0,1,10,100,1000,10000,100000,1000000,10000000])
labels=pd.cut(fec_mrbo.contb_receipt_amt,bins)
labels
411 (10, 100]
412 (100, 1000]
413 (100, 1000]
414 (10, 100]
415 (10, 100]
416 (10, 100]
417 (100, 1000]
418 (10, 100]
419 (100, 1000]
420 (10, 100]
421 (10, 100]
422 (100, 1000]
423 (100, 1000]
424 (100, 1000]
425 (100, 1000]
426 (100, 1000]
427 (1000, 10000]
428 (100, 1000]
429 (100, 1000]
430 (10, 100]
431 (1000, 10000]
432 (100, 1000]
433 (100, 1000]
434 (100, 1000]
435 (100, 1000]
436 (100, 1000]
437 (10, 100]
438 (100, 1000]
439 (100, 1000]
440 (10, 100]
701356 (10, 100]
701357 (1, 10]
701358 (10, 100]
701359 (10, 100]
701360 (10, 100]
701361 (10, 100]
701362 (100, 1000]
701363 (10, 100]
701364 (10, 100]
701365 (10, 100]
701366 (10, 100]
701367 (10, 100]
701368 (100, 1000]
701369 (10, 100]
701370 (10, 100]
701371 (10, 100]
701372 (10, 100]
701373 (10, 100]
701374 (10, 100]
701375 (10, 100]
701376 (1000, 10000]
701377 (10, 100]
701378 (10, 100]
701379 (100, 1000]
701380 (1000, 10000]
701381 (10, 100]
701382 (100, 1000]
701383 (1, 10]
701384 (10, 100]
701385 (100, 1000]
Name: contb_receipt_amt, dtype: category
Categories (8, object): [(0, 1] < (1, 10] < (10, 100] < (100, 1000] < (1000, 10000] < (10000, 100000] < (100000, 1000000] < (1000000, 10000000]]
#根據候選人姓名以及面元標籤對資料進行分組
grouped=fec_mrbo.groupby(['cand_nm',labels])
grouped.size().unstack(0)
Out[67]:
cand_nm Obama, Barack Romney, Mitt
contb_receipt_amt
(0, 1] 493.0 77.0
(1, 10] 40070.0 3681.0
(10, 100] 372280.0 31853.0
(100, 1000] 153991.0 43357.0
(1000, 10000] 22284.0 26186.0
(10000, 100000] 2.0 1.0
(100000, 1000000] 3.0 NaN
(1000000, 10000000] 4.0 NaN
#對出資額求合併在面元內規格化,以便圖形化顯示兩位候選人各種贊助額度
bucket_sums
Out[69]:
cand_nm Obama, Barack Romney, Mitt
contb_receipt_amt
(0, 1] 318.24 77.00
(1, 10] 337267.62 29819.66
(10, 100] 20288981.41 1987783.76
(100, 1000] 54798531.46 22363381.69
(1000, 10000] 51753705.67 63942145.42
(10000, 100000] 59100.00 12700.00
(100000, 1000000] 1490683.08 NaN
(1000000, 10000000] 7148839.76 NaN
normed_sums=bucket_sums.div(bucket_sums.sum(axis=1),axis=0)
normed_sums
Out[71]:
cand_nm Obama, Barack Romney, Mitt
contb_receipt_amt
(0, 1] 0.805182 0.194818
(1, 10] 0.918767 0.081233
(10, 100] 0.910769 0.089231
(100, 1000] 0.710176 0.289824
(1000, 10000] 0.447326 0.552674
(10000, 100000] 0.823120 0.176880
(100000, 1000000] 1.000000 NaN
(1000000, 10000000] 1.000000 NaN
normed_sums[:-2].plot(kind='barh',stacked=True)#根據州統計贊助資訊