《利用Python進行資料分析》————2012年聯邦選舉委員會資料庫
阿新 • • 發佈:2018-11-27
[本次資料分析所用到的資料集連結](http://github.com/wesm/pydata-book)
# Exploratory analysis of the 2012 Federal Election Commission contribution
# data (file P00000001-ALL.csv from the pydata-book data sets).
#
# Running this file as a script loads the CSV, tags every contribution with
# the candidate's party, normalises occupation/employer labels and then
# prints/plots donation statistics by occupation, employer, donation size
# and state.
import numpy as np
import pandas as pd

# Location of the raw contribution file, relative to the working directory.
DATA_PATH = 'datasets/fec/P00000001-ALL.csv'

# Party affiliation of each candidate, keyed by the value of `cand_nm`.
parties = {
    'Bachmann, Michelle': 'Republican',
    'Cain, Herman': 'Republican',
    'Gingrich, Newt': 'Republican',
    'Huntsman, Jon': 'Republican',
    'Johnson, Gary Earl': 'Republican',
    'McCotter, Thaddeus G': 'Republican',
    'Obama, Barack': 'Democrat',
    'Paul, Ron': 'Republican',
    'Pawlenty, Timothy': 'Republican',
    'Perry, Rick': 'Republican',
    "Roemer, Charles E. 'Buddy' III": 'Republican',
    'Romney, Mitt': 'Republican',
    'Santorum, Rick': 'Republican',
}

# Occupation labels that are merged into one canonical spelling.
occ_mapping = {
    'INFORMATION REQUESTED PER BEST EFFORTS': 'NOT PROVIDED',
    'INFORMATION REQUESTED': 'NOT PROVIDED',
    'INFORMATION REQUESTED (BEST EFFORTS)': 'NOT PROVIDED',
    'C.E.O': 'CEO',
}

# Employer labels that are merged into one canonical spelling.
emp_mapping = {
    'INFORMATION REQUESTED PER BEST EFFORTS': 'NOT PROVIDED',
    'INFORMATION REQUESTED': 'NOT PROVIDED',
    'SELF': 'SELF-EMPLOYED',
    'SELF EMPLOYED': 'SELF-EMPLOYED',
}


def get_top_amounts(group, key, n=5):
    """Return the ``n`` largest contribution totals of ``group`` per ``key``.

    Args:
        group: DataFrame with a ``contb_receipt_amt`` column and a ``key``
            column.
        key: Name of the column to total the contributions by.
        n: Number of largest totals to keep.

    Returns:
        Series of summed ``contb_receipt_amt`` indexed by ``key``, limited to
        the ``n`` largest totals in descending order.
    """
    totals = group.groupby(key)['contb_receipt_amt'].sum()
    return totals.nlargest(n)


def main():
    """Run the full analysis: load the data, print summaries, show plots."""
    # Imported here so that importing this module (e.g. to reuse
    # get_top_amounts) does not require a plotting backend.
    import matplotlib.pyplot as plt

    # Load the data and take a first look at it.
    fec = pd.read_csv(DATA_PATH)
    fec.info()  # info() prints by itself and returns None
    print(fec.iloc[123456])

    # All distinct candidate names present in the data.
    unique_cands = fec.cand_nm.unique()
    print(unique_cands)

    # Derive the party from the candidate name and store it as a column.
    print(fec.cand_nm[123456:123461].map(parties))
    fec['party'] = fec.cand_nm.map(parties)
    print(fec['party'].value_counts())

    # The data holds both contributions and refunds (negative amounts);
    # restrict the analysis to positive contributions. The copy makes the
    # column assignments below act on an independent frame.
    print((fec.contb_receipt_amt > 0).value_counts())
    fec = fec[fec.contb_receipt_amt > 0].copy()

    # Number of contributions per occupation, before label clean-up.
    print(fec.contbr_occupation.value_counts().head(10))

    # Normalise occupation and employer labels; labels without a mapping
    # are kept unchanged.
    fec['contbr_occupation'] = fec['contbr_occupation'].map(
        lambda occupation: occ_mapping.get(occupation, occupation))
    fec['contbr_employer'] = fec['contbr_employer'].map(
        lambda employer: emp_mapping.get(employer, employer))

    # Subset for the two main candidates. Taken after the label clean-up so
    # the per-candidate rankings below use the normalised labels.
    fec_mrbo = fec[fec.cand_nm.isin(['Obama, Barack', 'Romney, Mitt'])]

    # Total donations by occupation and party, keeping only occupations
    # that donated more than 2 million dollars overall.
    by_occupation = fec.pivot_table(values='contb_receipt_amt',
                                    index='contbr_occupation',
                                    columns='party',
                                    aggfunc='sum')
    over_2mm = by_occupation[by_occupation.sum(axis=1) > 2000000]
    over_2mm.plot(kind='barh')
    plt.show()

    # Top donating occupations and employers for each main candidate.
    grouped = fec_mrbo.groupby('cand_nm')
    print(grouped.apply(get_top_amounts, 'contbr_occupation', n=7))
    print(grouped.apply(get_top_amounts, 'contbr_employer', n=10))

    # Discretise the contribution amounts into buckets by size.
    bins = np.array([0, 1, 10, 100, 1000, 10000,
                     100000, 1000000, 10000000])
    labels = pd.cut(fec_mrbo.contb_receipt_amt, bins)
    print(labels)

    # Group by candidate and bucket to get a histogram of donation sizes.
    grouped = fec_mrbo.groupby(['cand_nm', labels])
    print(grouped.size().unstack(0))

    # Share of each bucket's total amount received by each candidate.
    bucket_sums = grouped['contb_receipt_amt'].sum().unstack(0)
    normed_sums = bucket_sums.div(bucket_sums.sum(axis=1), axis=0)
    print(normed_sums)

    # The two largest buckets are left out of the plot because they are not
    # donations made by individuals.
    normed_sums.iloc[:-2].plot(kind='barh')
    plt.show()

    # Totals by candidate and state, keeping states with more than
    # 100,000 dollars donated overall.
    grouped = fec_mrbo.groupby(['cand_nm', 'contbr_st'])
    totals = grouped['contb_receipt_amt'].sum().unstack(0).fillna(0)
    totals = totals[totals.sum(axis=1) > 100000]
    print(totals.head(10))

    # Dividing each row by its total gives each candidate's relative share
    # of the donations per state.
    percent = totals.div(totals.sum(axis=1), axis=0)
    print(percent.head(10))


if __name__ == '__main__':
    main()