1. 程式人生 > >Python資料分析主力Pandas

Python資料分析主力Pandas

Python資料分析主力Pandas

 

Pandas的資料結構

  • Series:Series是一種類似於一維陣列的物件,它由一組資料(各種NumPy資料型別),以及一組與之相關的資料標籤(即索引)組成。 Series的字串表現形式為:索引在左邊,值在右邊。
  • DataFrame: DataFrame是一個表格型的資料結構,它含有一組有序的列,每列可以是不同的值型別(數值、字串、布林值等)。 DataFrame既有行索引也有列索引,它可以被看做由Series組成的字典(共用同一個索引)。
  • 索引物件Index:pandas的索引物件負責管理軸標籤和其他元資料(比如軸名稱等)。構建Series或DataFrame時,所用到的任何陣列或其他序列的標籤都會被轉換成一個Index。 Index物件是不可修改的(immutable),因此使用者不能對其進行修改。不可修改性非常重要,因為這樣才能使Index物件在多個數據結構之間安全共享。

Series的基本操作

# -*- coding: utf-8 -*- 

from pandas import Series

# Series basics: construction, label-based access, alignment and naming.

# Build a Series from a plain list; no index is passed here.
print('用陣列生成Series')
obj = Series([4, 7, -5, 3])
print(obj)
print(obj.values)
print(obj.index)
# A bare `print` is a no-op expression in Python 3; call it to emit the
# blank separator line.
print()

# Build a Series with explicit string labels.
print('指定Series的index')
obj2 = Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
print(obj2)
print(obj2.index)
print(obj2['a'])
obj2['d'] = 6  # assign through a label
print(obj2[['c', 'a', 'd']])  # select several labels at once
print(obj2[obj2 > 0])  # keep only the elements greater than 0
print('b' in obj2)  # `in` tests whether a label exists in the index
print('e' in obj2)
print()

# A dict becomes a Series whose labels are the dict keys.
print('使用字典生成Series')
sdata = {'Ohio': 45000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = Series(sdata)
print(obj3)
print()

# 'California' is not a key of sdata, so its value is NaN; 'Utah' is not in
# the requested index, so it is left out.
print('使用字典生成Series,並額外指定index,不匹配部分為NaN。')
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = Series(sdata, index=states)
print(obj4)
print()

# Arithmetic aligns on labels; labels present on only one side give NaN.
print('Series相加,相同索引部分相加。')
print(obj3 + obj4)
print()

# Both the Series and its index can carry a name.
print('指定Series及其索引的名字')
obj4.name = 'population'
obj4.index.name = 'state'
print(obj4)
print()

# The whole index can be replaced by assignment.
print('替換index')
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
print(obj)

DataFrame基本操作

# -*- coding: utf-8 -*- 
import numpy as np
from pandas import Series, DataFrame

# DataFrame basics: construction, column access, assignment and transpose.

# Dict of equal-length lists: each key becomes a column.
print('用字典生成DataFrame,key為列的名字。')
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}
print(DataFrame(data))
print(DataFrame(data, columns=['year', 'state', 'pop']))  # explicit column order
# A bare `print` is a no-op expression in Python 3; call it to emit the
# blank separator line.
print()

# 'debt' is not a key of `data`, so that column is filled with NaN.
print('指定索引,在列中指定不存在的列,預設資料用NaN。')
frame2 = DataFrame(
    data,
    columns=['year', 'state', 'pop', 'debt'],
    index=['one', 'two', 'three', 'four', 'five'],
)
print(frame2)
print(frame2['state'])
print(frame2.state)
print(frame2['year'])
print(frame2.year)
# `.ix` was removed in pandas 1.0; `.loc` is the label-based row accessor.
print(frame2.loc['three'])
frame2['debt'] = 16.5  # a scalar is broadcast to the whole column
print(frame2)
frame2.debt = np.arange(5)  # overwrite the existing column with a NumPy array
print(frame2)
print()

# Assigning a Series aligns on the row labels: 'six' is not a row of frame2
# and is ignored, rows without a matching label become NaN.
print('用Series指定要修改的索引及其對應的值,沒有指定的預設資料用NaN。')
val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'six'])
print(val)
frame2['debt'] = val
print(frame2)
print()

# Assigning to a column name that does not exist creates the column.
print('賦值給新列')
frame2['eastern'] = (frame2.state == 'Ohio')  # True where state equals 'Ohio'
print(frame2)
print(frame2.columns)
print()

print('DataFrame轉置')
# Dict of dicts: outer keys become columns, inner keys become row labels.
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
frame3 = DataFrame(pop)
print(frame3)
print(frame3.T)
print()

print('指定索引順序,以及使用切片初始化資料。')
print(DataFrame(pop, index=[2001, 2002, 2003]))
# `.iloc` makes it explicit that these slices are positional, not by year.
pdata = {
    'Ohio': frame3['Ohio'].iloc[:-1],
    'Nevada': frame3['Nevada'].iloc[:2],
}
print(DataFrame(pdata))
print()

print('指定索引和列的名稱')
frame3.index.name = 'year'
frame3.columns.name = 'state'
print(frame3)
print(frame3.values)
print(frame2.values)

索引Index基本操作

# -*- coding: utf-8 -*- 
import numpy as np
import pandas as pd
import sys
from pandas import Series, DataFrame, Index

# Index basics: immutability, sharing an Index, and membership tests.

print('獲取index')
obj = Series(range(3), index=['a', 'b', 'c'])
index = obj.index
print(index[:2])
try:
    index[1] = 'd'  # Index objects are immutable, so item assignment raises
except TypeError as exc:
    # Catch only the expected error instead of a bare `except:`.
    print(type(exc))
# A bare `print` is a no-op expression in Python 3; call it to emit the
# blank separator line.
print()

# An explicitly built Index can be handed to the Series constructor.
print('使用Index物件')
index = Index(np.arange(3))
obj2 = Series([1.5, -2.5, 0], index=index)
print(obj2)
print(obj2.index is index)
print()

print('判斷列和索引是否存在')
# The original inner key 20001 was a typo for the year 2001.
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
frame3 = DataFrame(pop)
print('Ohio' in frame3.columns)
# The row labels are integers, so test an integer year rather than the
# string '2003' (which could never match).
print(2003 in frame3.index)

重新索引

# -*- coding: utf-8 -*- 
import numpy as np
from pandas import DataFrame, Series

# Reindexing: reorder labels, introduce new ones, and fill the gaps.

print('重新指定索引及順序')
obj = Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
print(obj)
obj2 = obj.reindex(['a', 'b', 'd', 'c', 'e'])  # 'e' is new, so it becomes NaN
print(obj2)
# fill_value supplies the value used for labels that did not exist before.
print(obj.reindex(['a', 'b', 'd', 'c', 'e'], fill_value=0))
# A bare `print` is a no-op expression in Python 3; call it to emit the
# blank separator line.
print()

print('重新指定索引並指定填元素充方法')
obj3 = Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
print(obj3)
# method='ffill' carries the last known value forward into the new labels.
print(obj3.reindex(range(6), method='ffill'))
print()

print('對DataFrame重新指定索引')
frame = DataFrame(
    np.arange(9).reshape(3, 3),
    index=['a', 'c', 'd'],
    columns=['Ohio', 'Texas', 'California'],
)
print(frame)
frame2 = frame.reindex(['a', 'b', 'c', 'd'])  # row 'b' is new and all NaN
print(frame2)
print()

print('重新指定column')
states = ['Texas', 'Utah', 'California']
print(frame.reindex(columns=states))
print()

print('對DataFrame重新指定索引並指定填元素充方法')
# Reindex rows and columns together, then forward-fill down each column.
# (The original also made a bare `frame.reindex()` call whose result was
# discarded; it has been dropped.)
print(frame.reindex(index=['a', 'b', 'c', 'd'], columns=states).ffill())
# `.ix` was removed in pandas 1.0, and `.loc` raises KeyError for the
# missing label 'b', so reindex is used to select with missing labels.
print(frame.reindex(index=['a', 'b', 'd', 'c'], columns=states))

丟棄指定軸上的項

# -*- coding: utf-8 -*- 
import numpy as np
from pandas import Series, DataFrame

# Dropping entries from an axis; drop() returns a new object each time.

print('Series根據索引刪除元素')
# np.arange(5.) yields floats because the argument is a float.
obj = Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
new_obj = obj.drop('c')
print(new_obj)
print(obj.drop(['d', 'c']))
# A bare `print` is a no-op expression in Python 3; call it to emit the
# blank separator line.
print()

print('DataFrame刪除元素,可指定索引或列。')
data = DataFrame(
    np.arange(16).reshape((4, 4)),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
print(data)
print(data.drop(['Colorado', 'Ohio']))  # rows are dropped by default
print(data.drop('two', axis=1))  # axis=1 drops columns instead
print(data.drop(['two', 'four'], axis=1))

基本功能 索引、選取和過濾

  1. Series索引(obj[...])的工作方式類似於NumPy陣列的索引,只不過Series的索引值不只是整數。
  2. 利用標籤的切片運算與普通的Python切片運算不同,其末端是包含的(inclusive)。
  3. 對DataFrame進行索引其實就是獲取一個或多個列
  4. 為了在DataFrame的行上進行標籤索引,請使用專門的索引器loc(按標籤)與iloc(按位置);早期的索引欄位ix自pandas 0.20起已棄用,並於1.0移除。
# -*- coding: utf-8 -*- 
import numpy as np
from pandas import Series, DataFrame

# Indexing, selection and filtering on Series and DataFrame.

print('Series的索引,預設數字索引可以工作。')
obj = Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
print(obj['b'])
# Positional access on a string-labelled Series goes through `.iloc`;
# relying on `obj[3]` falling back to positions is deprecated in pandas.
print(obj.iloc[3])
print(obj.iloc[[1, 3]])
print(obj[obj < 2])
print("\n")

print('Series的陣列切片')
print(obj['b':'c'])  # label slices include the end label
obj['b':'c'] = 5
print(obj)
print("\n")

print('DataFrame的索引')
data = DataFrame(
    np.arange(16).reshape((4, 4)),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
print(data)
print(data['two'])  # a single column
print(data[['three', 'one']])
print(data[:2])  # first two rows
# `.ix` was removed in pandas 1.0: use `.loc` for labels, `.iloc` for positions.
print(data.loc['Colorado', ['two', 'three']])  # row label plus column labels
# Row labels combined with column positions: translate the positions to
# column labels first so that a single `.loc` call can be used.
print(data.loc[['Colorado', 'Utah'], data.columns[[3, 0, 1]]])
print(data)
print(data.iloc[2])  # third row (positions start at 0)
print(data.loc[:'Utah', 'two'])  # rows up to and including 'Utah', column 'two'
print("\n")

print('根據條件選擇')
print(data[data.three > 5])
print(data < 5)  # element-wise boolean DataFrame
data[data < 5] = 0  # boolean-mask assignment
print(data)

算術運算和資料對齊

# -*- coding: utf-8 -*- 
import numpy as np
from pandas import Series, DataFrame
# Arithmetic and data alignment between Series and DataFrame objects.

print('加法')
s1 = Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])
print(s1)
print(s2)
print(s1 + s2)  # labels present on only one side give NaN
print("\n")

print('DataFrame加法,索引和列都必須匹配。')
df1 = DataFrame(
    np.arange(9.).reshape((3, 3)),
    columns=list('bcd'),
    index=['Ohio', 'Texas', 'Colorado'],
)
df2 = DataFrame(
    np.arange(12).reshape((4, 3)),
    columns=list('bde'),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
)
print(df1)
print(df2)
print(df1 + df2)  # aligned on both rows and columns
print("\n")

print('資料填充')
df1 = DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd'))
df2 = DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))
print(df1)
print(df2)
# fill_value stands in for a cell that is missing on one side only.
print(df1.add(df2, fill_value=0))
print(df1.reindex(columns=df2.columns, fill_value=0))
print("\n")

print('DataFrame與Series之間的操作')
arr = np.arange(12.).reshape((3, 4))
print(arr)
print(arr[0])
print(arr - arr[0])  # NumPy subtracts the first row from every row
frame = DataFrame(
    np.arange(12).reshape((4, 3)),
    columns=list('bde'),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
)
# `.ix` was removed in pandas 1.0; `.iloc[0]` selects the first row by position.
series = frame.iloc[0]
print(frame)
print(series)
print(frame - series)  # the Series labels are matched against the columns
series2 = Series(range(3), index=list('bef'))
print(frame + series2)  # columns without a match on both sides become NaN
series3 = frame['d']
print(frame.sub(series3, axis=0))  # axis=0 matches on the row labels instead

函式應用和對映

# -*- coding: utf-8 -*- 

import numpy as np
from pandas import Series, DataFrame

# Function application and mapping on DataFrame and Series.

print('函式')
frame = DataFrame(
    np.random.randn(4, 3),
    columns=list('bde'),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
)
print(frame)
print(np.abs(frame))  # NumPy ufuncs work element-wise on a DataFrame
# A bare `print` is a no-op expression in Python 3; call it to emit the
# blank separator line.
print()

print('lambda以及應用')


def value_range(x):
    """Return the spread (max minus min) of the Series *x*."""
    return x.max() - x.min()


def min_max(x):
    """Return the min and max of *x* as a Series labelled 'min' and 'max'."""
    return Series([x.min(), x.max()], index=['min', 'max'])


# The original bound the name `f` to a lambda and then rebound it with a
# `def`; two distinctly named functions avoid that shadowing.
print(frame.apply(value_range))  # once per column
print(frame.apply(value_range, axis=1))  # once per row
print(frame.apply(min_max))  # returning a Series yields one row per label
print()

print('applymap和map')


def _format(x):
    """Format *x* with two decimal places."""
    return f'{x:.2f}'


# DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map; pick whichever this pandas version provides.
elementwise = frame.map if hasattr(frame, 'map') else frame.applymap
print(elementwise(_format))
print(frame['e'].map(_format))  # Series.map is the element-wise Series form

排序和排名

# -*- coding: utf-8 -*- 

import numpy as np
from pandas import Series, DataFrame

# Sorting by index or by value, and ranking.

print('根據索引排序,對於DataFrame可以指定軸。')
obj = Series(range(4), index=['d', 'a', 'b', 'c'])
print(obj.sort_index())
frame = DataFrame(
    np.arange(8).reshape((2, 4)),
    index=['three', 'one'],
    columns=list('dabc'),
)
print(frame.sort_index())  # sort by row labels
print(frame.sort_index(axis=1))  # sort by column labels
print(frame.sort_index(axis=1, ascending=False))  # descending
# A bare `print` is a no-op expression in Python 3; call it to emit the
# blank separator line.
print()

print('根據值排序')
obj = Series([4, 7, -3, 2])
print(obj.sort_values())  # replaces the obsolete Series.order()
print()

print('DataFrame指定列排序')
frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
print(frame)
print(frame.sort_values(by='b'))  # replaces the obsolete sort_index(by=...)
print(frame.sort_values(by=['a', 'b']))  # sort by 'a' first, then by 'b'
print()

print('rank,求排名的平均位置(從1開始)')
obj = Series([7, -5, 7, 4, 2, 0, 4])
# Sorted positions: -5(1), 0(2), 2(3), 4(4), 4(5), 7(6), 7(7); by default
# tied values share the average of their positions.
print(obj.rank())
print(obj.rank(method='first'))  # ties ranked by order of appearance, not averaged
# Descending order, ties take the largest rank of their group; -5 therefore
# gets rank 7.
print(obj.rank(ascending=False, method='max'))
frame = DataFrame({
    'b': [4.3, 7, -3, 2],
    'a': [0, 1, 0, 1],
    'c': [-2, 5, 8, -2.5],
})
print(frame)
print(frame.rank(axis=1))  # rank across the columns within each row

帶有重複值的索引

# -*- coding: utf-8 -*- 
import numpy as np
from pandas import Series, DataFrame
# NOTE(review): this snippet sits under the "index with duplicate values"
# heading, but it is identical to the summary-statistics snippet that
# follows and never builds an index with repeated labels. Confirm whether
# a duplicate-index example was meant to go here.

print('求和')
df = DataFrame(
    [[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
    index=['a', 'b', 'c', 'd'],
    columns=['one', 'two'],
)
print(df)
print(df.sum())  # column sums
print(df.sum(axis=1))  # row sums
# A bare `print` is a no-op expression in Python 3; call it to emit the
# blank separator line.
print()

print('平均數')
print(df.mean(axis=1, skipna=False))  # any NaN in a row makes its mean NaN
print(df.mean(axis=1))  # default: NaN values are skipped
print()

print('其它')
print(df.idxmax())  # row label of each column's maximum
print(df.cumsum())  # running total down each column
print(df.describe())
obj = Series(['a', 'a', 'b', 'c'] * 4)
print(obj.describe())  # describe() on a Series of strings

彙總和計算描述統計

# -*- coding: utf-8 -*- 
import numpy as np
from pandas import Series, DataFrame
# Summarising and computing descriptive statistics.

print('求和')
df = DataFrame(
    [[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
    index=['a', 'b', 'c', 'd'],
    columns=['one', 'two'],
)
print(df)
print(df.sum())  # column sums
print(df.sum(axis=1))  # row sums
# A bare `print` is a no-op expression in Python 3; call it to emit the
# blank separator line.
print()

print('平均數')
print(df.mean(axis=1, skipna=False))  # any NaN in a row makes its mean NaN
print(df.mean(axis=1))  # default: NaN values are skipped
print()

print('其它')
print(df.idxmax())  # row label of each column's maximum
print(df.cumsum())  # running total down each column
print(df.describe())
obj = Series(['a', 'a', 'b', 'c'] * 4)
print(obj.describe())  # describe() on a Series of strings