1. 程式人生 > >pandas縱向學習之10 minutes to pandas(三)

pandas縱向學習之10 minutes to pandas(三)

操作

數學統計

df.mean() #檢視每列的平均值
df.mean(1) #檢視每行的平均值

#將 df 的每一列與 Series s 按行索引對齊後相減(s 經 shift(2) 後前兩個值為 NaN,對應的行結果也是 NaN)
s = pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2)
s
df.sub(s, axis='index')
	A	B	C	D	F
2013-01-01	NaN	NaN	NaN	NaN	NaN
2013-01-02	NaN	NaN	NaN	NaN	NaN
2013-01-03	-1.158085	-1.262675	-1.465764	-6.0	-3.0
2013-01-04	-3.679138	-3.191328	-4.159281	-8.0	-6.0
2013-01-05	-5.007158	-6.672655	-5.091954	-10.0	-9.0
2013-01-06	NaN	NaN	NaN	NaN	NaN

應用函式

#對每一列應用累計函式
df.apply(np.cumsum)
	A	B	C	D	F
2013-01-01	-0.001431	-0.908440	-0.851724	-5	NaN
2013-01-02	-1.093717	-2.312200	-1.815194	-10	-1.0
2013-01-03	-1.251802	-2.574875	-2.280958	-15	-3.0
2013-01-04	-1.930940	-2.766203	-3.440239	-20	-6.0
2013-01-05	-1.938097	-4.438858	-3.532193	-25	-10.0
2013-01-06	-2.051573	-4.438876	-5.427721	-30	-15.0

#每一列的極差
df.apply(lambda x: x.max()-x.min())
A    1.090854
B    1.672638
C    1.803573
D    0.000000
F    4.000000
dtype: float64

數量統計

#統計每一種元素各出現了幾次
s = pd.Series(np.random.randint(0, 7, size=10))
s
0    2
1    0
2    4
3    5
4    0
5    2
6    6
7    3
8    3
9    5
dtype: int32
s.value_counts()
5    2
3    2
2    2
0    2
6    1
4    1
dtype: int64

字串方法

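#.str 是 Series 的存取器,DataFrame 沒有 .str 屬性,需對單一列(Series)使用
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s.str.lower()	#小寫
s.str.upper()	#大寫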

合併

concat方法

df = pd.DataFrame(np.random.randn(10, 4))
pieces = [df[:3], df[3:7], df[7:]]
pd.concat(pieces)

join方法

比較兩種型別的合併:

left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
left
	key	lval
0	foo	1
1	foo	2
right
	key	rval
0	foo	4
1	foo	5
pd.merge(left, right, on='key')
	key	lval	rval
0	foo	1	4
1	foo	1	5
2	foo	2	4
3	foo	2	5
left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})
left
   key  lval
0  foo     1
1  bar     2
right
   key  rval
0  foo     4
1  bar     5
pd.merge(left, right, on='key')
   key  lval  rval
0  foo     1     4
1  bar     2     5

增加行

df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])
df
	A	B	C	D
0	-1.221865	-0.313737	0.813024	-2.067007
1	-0.833239	-1.123765	-0.580756	-1.618360
2	0.780570	0.057091	1.610320	1.198047
3	1.306492	-0.657629	0.946997	0.064994
4	-0.104776	-0.300427	-0.226296	-0.638638
5	-0.215063	-0.443774	1.900574	-0.392732
6	-0.108958	0.813018	-0.316127	-1.677159
7	0.678901	0.164350	-1.391680	0.434714
s = df.iloc[3]
#注意:DataFrame.append 已在 pandas 2.0 移除,新版本請改用 pd.concat([df, s.to_frame().T], ignore_index=True)
df.append(s, ignore_index=True)
	A	B	C	D
0	-1.221865	-0.313737	0.813024	-2.067007
1	-0.833239	-1.123765	-0.580756	-1.618360
2	0.780570	0.057091	1.610320	1.198047
3	1.306492	-0.657629	0.946997	0.064994
4	-0.104776	-0.300427	-0.226296	-0.638638
5	-0.215063	-0.443774	1.900574	-0.392732
6	-0.108958	0.813018	-0.316127	-1.677159
7	0.678901	0.164350	-1.391680	0.434714
8	1.306492	-0.657629	0.946997	0.064994

分組

df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                   'B' : ['one', 'one', 'two', 'three',
                          'two', 'two', 'one', 'three'],
                   'C' : np.random.randn(8),
                   'D' : np.random.randn(8)})
df
	A	B	C	D
0	foo	one	1.981136	1.652507
1	bar	one	2.676476	-1.424416
2	foo	two	-0.975054	-0.711273
3	bar	three	-0.366664	1.363469
4	foo	two	-1.447261	-0.122510
5	bar	two	0.138113	-0.559464
6	foo	one	-1.292988	-0.375974
7	foo	three	-0.533342	1.218957
df.groupby('A').sum()
	C	D
A		
bar	2.447925	-0.620411
foo	-2.267508	1.661708
df.groupby(['A', 'B']).sum()
		C	D
A	B		
bar	one	2.676476	-1.424416
	three	-0.366664	1.363469
	two	0.138113	-0.559464
foo	one	0.688148	1.276533
	three	-0.533342	1.218957
	two	-2.422314	-0.833782

重塑

堆疊

In [95]: tuples = list(zip(*[['bar', 'bar', 'baz', 'baz',
   ....:                      'foo', 'foo', 'qux', 'qux'],
   ....:                     ['one', 'two', 'one', 'two',
   ....:                      'one', 'two', 'one', 'two']]))
   ....: 

In [96]: index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])

In [97]: df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])

In [98]: df2 = df[:4]

In [99]: df2
Out[99]: 
                     A         B
first second                    
bar   one     0.029399 -0.542108
      two     0.282696 -0.087302
baz   one    -1.575170  1.771208
      two     0.816482  1.100230
In [100]: stacked = df2.stack()

In [101]: stacked
Out[101]: 
first  second   
bar    one     A    0.029399
               B   -0.542108
       two     A    0.282696
               B   -0.087302
baz    one     A   -1.575170
               B    1.771208
       two     A    0.816482
               B    1.100230
dtype: float64
In [102]: stacked.unstack()
Out[102]: 
                     A         B
first second                    
bar   one     0.029399 -0.542108
      two     0.282696 -0.087302
baz   one    -1.575170  1.771208
      two     0.816482  1.100230

In [103]: stacked.unstack(1)
Out[103]: 
second        one       two
first                      
bar   A  0.029399  0.282696
      B -0.542108 -0.087302
baz   A -1.575170  0.816482
      B  1.771208  1.100230

In [104]: stacked.unstack(0)
Out[104]: 
first          bar       baz
second                      
one    A  0.029399 -1.575170
       B -0.542108  1.771208
two    A  0.282696  0.816482
       B -0.087302  1.100230

資料透視表

In [105]: df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
   .....:                    'B' : ['A', 'B', 'C'] * 4,
   .....:                    'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
   .....:                    'D' : np.random.randn(12),
   .....:                    'E' : np.random.randn(12)})
   .....: 

In [106]: df
Out[106]: 
        A  B    C         D         E
0     one  A  foo  1.418757 -0.179666
1     one  B  foo -1.879024  1.291836
2     two  C  foo  0.536826 -0.009614
3   three  A  bar  1.006160  0.392149
4     one  B  bar -0.029716  0.264599
5     one  C  bar -1.146178 -0.057409
6     two  A  foo  0.100900 -1.425638
7   three  B  foo -1.035018  1.024098
8     one  C  foo  0.314665 -0.106062
9     one  A  bar -0.773723  1.824375
10    two  B  bar -1.170653  0.595974
11  three  C  bar  0.648740  1.167115

#資料透視表
In [107]: pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])
Out[107]: 
C             bar       foo
A     B                    
one   A -0.773723  1.418757
      B -0.029716 -1.879024
      C -1.146178  0.314665
three A  1.006160       NaN
      B       NaN -1.035018
      C  0.648740       NaN
two   A       NaN  0.100900
      B -1.170653       NaN
      C       NaN  0.536826