1. 程式人生 > >python筆記----pandas部分(2)

python筆記----pandas部分(2)

# coding=utf-8
import numpy as np
import pandas as pd

# Titanic survivor statistics example: load the training data and preview it.
titanic_survival = pd.read_csv("titanic_train.csv")
print(titanic_survival.head())

# Missing values
age = titanic_survival["Age"]
# .loc slices by label and includes both endpoints, so this shows labels 0
# through 10 (eleven entries with a default integer index).
print(age.loc[0:10])
# Element-wise missing-value test: one boolean per entry of the column.
age_is_null = pd.isnull(age)
print(age_is_null)
# Boolean-mask selection keeps only the entries whose Age is missing.
age_null_is_true = age[age_is_null]
print(age_null_is_true)
print(len(age_null_is_true))  # number of missing Age values

# Dropping missing data
# axis=1 removes every COLUMN that contains at least one missing value.
drop_na_columns = titanic_survival.dropna(axis=1)
print(drop_na_columns)
# axis=0 with subset removes every ROW that is missing "Age" or "Sex".
new_titanic_survival = titanic_survival.dropna(axis=0, subset=["Age", "Sex"])
print(new_titanic_survival)

print("--------------------------------------")

# Average passenger age
# Naive attempt: the built-in sum() propagates NaN, so the result is nan
# whenever the Age column contains missing values.
mean_age = sum(titanic_survival["Age"]) / len(titanic_survival["Age"])
print(mean_age)
# Keep only the non-missing ages; age_is_null is the boolean mask built
# earlier in this script with pd.isnull().
good_ages = titanic_survival["Age"][~age_is_null]
correct_mean_age = sum(good_ages) / len(good_ages)
print(correct_mean_age)
# pandas ships a ready-made mean() that skips missing values by default.
print(titanic_survival["Age"].mean())

print("--------------------------------------")

# Quick aggregate statistics with pivot_table
#   index:   column whose distinct values become the groups (rows of the result)
#   values:  column(s) to aggregate
#   aggfunc: how to aggregate; string names are used instead of np.mean/np.sum
#            because recent pandas versions warn when given the NumPy callables.

# Survival rate per cabin class: "Survived" is 0 (not rescued) or 1 (rescued),
# so its mean within each Pclass (1, 2, 3) is the rescued fraction.
passenger_survival = titanic_survival.pivot_table(
    index="Pclass", values="Survived", aggfunc="mean"
)
print(passenger_survival)

# Number of rescued passengers per cabin class.
passenger_survival_sum = titanic_survival.pivot_table(
    index="Pclass", values="Survived", aggfunc="sum"
)
print(passenger_survival_sum)

# Average age of ALL passengers per cabin class (not only the rescued ones).
passenger_survival_age = titanic_survival.pivot_table(
    index="Pclass", values="Age", aggfunc="mean"
)
print(passenger_survival_age)

# Per embarkation port: total fare collected and number of rescued passengers
# (one grouping key related to two value columns).
port_stats = titanic_survival.pivot_table(
    index="Embarked", values=["Fare", "Survived"], aggfunc="sum"
)
print(port_stats)

# Same grouping, but report both the sum and the mean of each value column.
port_stats2 = titanic_survival.pivot_table(
    index="Embarked", values=["Fare", "Survived"], aggfunc=["sum", "mean"]
)
print(port_stats2)

print("--------------------------------------")

# Look up a single value: the "Age" entry of the row whose index label is 83.
print(titanic_survival.loc[83, "Age"])

print("--------------------------------------")

# Sorting rows: order by "Age" from largest to smallest and show the first ten.
new_titanic_survival = titanic_survival.sort_values("Age", ascending=False)
print(new_titanic_survival[0:10])

# After sorting, the rows still carry their old index labels. reset_index
# renumbers them from 0; drop=True discards the old labels instead of keeping
# them as a new column.
new_titanic_survival_index = new_titanic_survival.reset_index(drop=True)
print(new_titanic_survival_index)

print("--------------------------------------")

# Custom functions: DataFrame.apply(func) runs a user-defined function over
# the frame (once per column with the default axis).
def hundredth_row(column):
    """Return the entry of ``column`` stored under index label 99."""
    return column.loc[99]


# NOTE: this rebinds the name ``hundredth_row`` from the function to its
# result, so the helper can no longer be called after this line. The binding
# is kept as in the original script.
hundredth_row = titanic_survival.apply(hundredth_row)
print(hundredth_row)

print("--------------------------------------")