Python數據分析(二)pandas缺失值處理
阿新 • • 發佈:2018-07-19
taf spa 3.0 .data float 數據分析 pandas panda pri
# pandas missing-value handling walkthrough: detect, filter and fill NaN.
#
# The bare triple-quoted strings below are not executed logic; each one
# records the console output of the statements just above it from one sample
# run. The numeric values come from np.random.randn and differ on every run.
import numpy as np
import pandas as pd

# Build a 5x3 frame of random values, then reindex onto 8 labels so that the
# labels missing from the original index ('b', 'd', 'g') become all-NaN rows.
df = pd.DataFrame(
    np.random.randn(5, 3),
    index=['a', 'c', 'e', 'f', 'h'],
    columns=['one', 'two', 'three'],
)
df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
print(df)

print('################缺失值判斷######################')

# Missing-value test on a single column (Series): True where the value is NaN.
print('--------Series的缺失值判斷---------')
print(df['one'].isnull())
'''
--------Series的缺失值判斷---------
a False
b True
c False
d True
e False
f False
g True
h False
Name: one, dtype: bool
'''

# Use the boolean mask to select only the missing entries with their labels.
print('---------輸出Series缺失值和索引--------')
print(df['one'][df['one'].isnull()])
'''
---------輸出Series缺失值和索引--------
b NaN
d NaN
g NaN
Name: one, dtype: float64
'''

# Element-wise missing-value test on the whole DataFrame.
print('--------dataframe的缺失值判斷---------')
print(df.isnull())
'''
--------dataframe的缺失值判斷---------
one two three
a False False False
b True True True
c False False False
d True True True
e False False False
f False False False
g True True True
h False False False
'''

# Select every row that contains at least one NaN. A per-row mask keeps each
# matching row exactly once, so no index de-duplication step is required.
print('--------輸出dataframe的缺失值和索引---------')
data = df[df.isnull().any(axis=1)]
print(data)
'''
--------輸出dataframe的缺失值和索引---------
one two three
b NaN NaN NaN
d NaN NaN NaN
g NaN NaN NaN
'''

# Per-column flag: True when the column holds at least one NaN.
print('--------輸出dataframe的有缺失值的列---------')
print(df.isnull().any())
'''
--------輸出dataframe的有缺失值的列---------
one True
two True
three True
dtype: bool
'''

print('################缺失值過濾######################')

# Show the mask again before filtering the Series.
print('--------Series的缺失值過濾---------')
print(df['one'].isnull())
'''
################缺失值過濾######################
--------Series的缺失值過濾---------
a False
b True
c False
d True
e False
f False
g True
h False
Name: one, dtype: bool
'''

# dropna() returns a new Series without the NaN entries; df is not modified.
print('--------使用dropna方法刪除缺失數據,返回一個刪除後的Series--------')
print(df['one'].dropna())
'''
--------使用dropna方法刪除缺失數據,返回一個刪除後的Series--------
a -0.211055
c -0.870090
e -0.203259
f 0.490568
h 1.437819
Name: one, dtype: float64
'''

# DataFrame.dropna() with the default how="any" drops rows holding any NaN.
print('--------dataframe的缺失值過濾---------')
print(df.dropna())
'''
--------dataframe的缺失值過濾---------
one two three
a -0.211055 -2.869212 0.022179
c -0.870090 -0.878423 1.071588
e -0.203259 0.315897 0.495306
f 0.490568 -0.968058 -0.999899
h 1.437819 -0.370934 -0.482307
'''

# how="all" drops a row only when every value in it is NaN. Here the result
# matches how="any" because the NaN rows in df are entirely NaN.
print('-------當行全為NaN的時候,才刪除,參數how默認是any,含有缺失值就刪除--------')
print(df.dropna(how="all"))
'''
-------當行全為NaN的時候,才刪除,參數how默認是any,含有缺失值就刪除--------
one two three
a -0.211055 -2.869212 0.022179
c -0.870090 -0.878423 1.071588
e -0.203259 0.315897 0.495306
f 0.490568 -0.968058 -0.999899
h 1.437819 -0.370934 -0.482307
'''

print('################缺失值填充######################')

# Replace every NaN with one constant.
print('------指定特殊值填充缺失值-------')
print(df.fillna(0))
'''
################缺失值填充######################
------指定特殊值填充缺失值-------
one two three
a -0.211055 -2.869212 0.022179
b 0.000000 0.000000 0.000000
c -0.870090 -0.878423 1.071588
d 0.000000 0.000000 0.000000
e -0.203259 0.315897 0.495306
f 0.490568 -0.968058 -0.999899
g 0.000000 0.000000 0.000000
h 1.437819 -0.370934 -0.482307
'''

# A dict maps column name -> fill value, so each column gets its own constant.
print('------不同的列用不同的值填充------')
print(df.fillna({'one': 1, 'two': 2, 'three': 3}))
'''
------不同的列用不同的值填充------
one two three
a -0.211055 -2.869212 0.022179
b 1.000000 2.000000 3.000000
c -0.870090 -0.878423 1.071588
d 1.000000 2.000000 3.000000
e -0.203259 0.315897 0.495306
f 0.490568 -0.968058 -0.999899
g 1.000000 2.000000 3.000000
h 1.437819 -0.370934 -0.482307
'''

# Forward fill: propagate the previous row's values into NaN rows.
# ffill() is used instead of fillna(method="ffill"), which newer pandas
# releases deprecate.
print('------前向填充------')
print(df.ffill())
'''
------前向填充------
one two three
a -0.211055 -2.869212 0.022179
b -0.211055 -2.869212 0.022179
c -0.870090 -0.878423 1.071588
d -0.870090 -0.878423 1.071588
e -0.203259 0.315897 0.495306
f 0.490568 -0.968058 -0.999899
g 0.490568 -0.968058 -0.999899
h 1.437819 -0.370934 -0.482307
'''

# Backward fill: propagate the next row's values into NaN rows.
# bfill() is used instead of fillna(method="bfill") for the same reason.
print('------後向填充------')
print(df.bfill())
'''
------後向填充------
one two three
a -0.211055 -2.869212 0.022179
b -0.870090 -0.878423 1.071588
c -0.870090 -0.878423 1.071588
d -0.203259 0.315897 0.495306
e -0.203259 0.315897 0.495306
f 0.490568 -0.968058 -0.999899
g 1.437819 -0.370934 -0.482307
h 1.437819 -0.370934 -0.482307
'''

# Fill each column's NaN with that column's mean (df.mean() skips NaN).
print('------平均值填充------')
print(df.fillna(df.mean()))
'''
------平均值填充------
one two three
a -0.211055 -2.869212 0.022179
b 0.128797 -0.954146 0.021373
c -0.870090 -0.878423 1.071588
d 0.128797 -0.954146 0.021373
e -0.203259 0.315897 0.495306
f 0.490568 -0.968058 -0.999899
g 0.128797 -0.954146 0.021373
h 1.437819 -0.370934 -0.482307
'''
Python數據分析(二)pandas缺失值處理