1. 程式人生 > >003.python科學計算庫pandas(上)

003.python科學計算庫pandas(上)

import pandas

food_info = pandas.read_csv("food_info.csv")
# read_csv loads the CSV file into a pandas DataFrame
print(type(food_info))
print("---1")
# dtypes returns a Series that maps every column name to its data type;
# food_info.dtypes['NDB_No'] looks up the dtype of the NDB_No column
print(food_info.dtypes)
print("---2")
# head(n) returns the first n rows; n defaults to 5 when omitted
first_rows = food_info.head(n=3)
print(first_rows)
print("---3")
# columns is an Index object holding all of the column labels
print(first_rows.columns)
print("---4")
# shape is (row count, column count); the author recorded (3, 36) here
print(first_rows.shape)

loc

import pandas

food_info = pandas.read_csv("food_info.csv")
# loc[i] selects the row whose index label is i and returns it as a Series,
# so one field can be read with food_info.loc[i]['columnName'].
# The index labels start at 0.
first_row = food_info.loc[0]
print(first_row)
# Asking for a label that is not in the index fails with
# KeyError: 'the label [9999999] is not in the [index]'
# e.g. food_info.loc[9999999]

import pandas

food_info = pandas.read_csv("food_info.csv")
# A loc slice includes both endpoints: this yields a DataFrame holding the
# rows at index 3, 4, 5 and 6
food_info.loc[3:6]
# A list of labels yields a DataFrame holding exactly those rows (2, 5, 10).
# The list can be written inline or passed through a variable; both forms
# are equivalent.
food_info.loc[[2, 5, 10]]
two_five_ten = [2, 5, 10]
food_info.loc[two_five_ten]

column

import pandas

food_info = pandas.read_csv("food_info.csv")
# Indexing the DataFrame with a column label selects that single column
ndb_col = food_info["NDB_No"]
# Alternatively, the column can be accessed by passing in a string variable
col_name = "NDB_No"
ndb_col = food_info[col_name]
print(ndb_col)

import pandas

food_info = pandas.read_csv("food_info.csv")
# Indexing with a list of labels selects several columns at once.
# The list can be written inline, skipping the intermediate variable ...
zinc_copper = food_info[["Zinc_(mg)", "Copper_(mg)"]]
# ... or stored in a variable first and then passed in
columns = ["Zinc_(mg)", "Copper_(mg)"]
zinc_copper = food_info[columns]
print(zinc_copper)

tolist

import pandas

food_info = pandas.read_csv("food_info.csv")
# columns holds all of the column labels in an Index object
column_index = food_info.columns
print(type(column_index))
print(column_index)
# An Index cannot be modified in place: food_info.columns[0] = 'sadfaf'
# raises a TypeError ("Index does not support mutable operations")
# tolist() returns the labels as a plain Python list, which can be edited
col_names = column_index.tolist()
col_names[0] = "sdfafg"
print(col_names)
print(type(col_names))

for

import pandas

food_info = pandas.read_csv("food_info.csv")
col_names = food_info.columns.tolist()
gram_columns = []
for c in col_names:
    # Skip every column whose name does not end with "(g)" ...
    if not c.endswith("(g)"):
        continue
    # ... and collect the remaining ones in gram_columns
    gram_columns.append(c)
# Select the collected columns into gram_df
gram_df = food_info[gram_columns]
# Show the first 3 rows of gram_df
print(gram_df.head(n=3))

列的算術運算

import pandas
import numpy

food_info = pandas.read_csv("food_info.csv")
iron_mg = food_info["Iron_(mg)"]
print(iron_mg[0:3])
# Arithmetic between a column and a scalar is applied to every row.
# Divide every value in the column by 1000
div_1000 = iron_mg / 1000
print(div_1000[0:3])
print()
# Add 100 to every value in the column
add_100 = iron_mg + 100
print(add_100[0:3])
print()
# Subtract 50 from every value in the column
# (named sub_50 so the variable matches the operand actually used)
sub_50 = iron_mg - 50
print(sub_50[0:3])
print()
# Multiply every value in the column by 2
mult_2 = iron_mg * 2
print(mult_2[0:3])
# hstack (the joining function from the previous article) concatenates the
# two 3-element slices; reshape then lays the 6 values out as 2 rows x 3 cols
print(numpy.hstack((iron_mg[0:3], mult_2[0:3])).reshape(2, 3))

import pandas

food_info = pandas.read_csv("food_info.csv")
# An arithmetic operator between two columns is applied to the first value
# of each column, then the second value of each column, and so on
water_g = food_info["Water_(g)"]
energy_kcal = food_info["Energ_Kcal"]
print(water_g[0:3])
print(energy_kcal[0:3])
print("-------------------------------")
water_energy = water_g * energy_kcal
print(water_energy[0:3])
print("-------------------------------")
# Convert iron from milligrams to grams and store it as a new column
iron_grams = food_info["Iron_(mg)"] / 1000
food_info["Iron_(g)"] = iron_grams
print(food_info["Iron_(g)"][0:3])

import pandas

food_info = pandas.read_csv("food_info.csv")
# initial_rating = 2 * Protein_(g) column  -  0.75 * Lipid_Tot_(g) column
protein_g = food_info["Protein_(g)"]
fat_g = food_info["Lipid_Tot_(g)"]
weighted_protein = protein_g * 2
print(weighted_protein[0:3])
print("------------------------------")
# The fat weight is negative, so adding it below performs the subtraction
weighted_fat = -0.75 * fat_g
print(weighted_fat[0:3])
print("------------------------------")
initial_rating = weighted_protein + weighted_fat
print(initial_rating[0:3])

max

import pandas

food_info = pandas.read_csv("food_info.csv")
# Take the Energ_Kcal column and find its largest value
energy_kcal = food_info["Energ_Kcal"]
max_calories = energy_kcal.max()
print(max_calories)

sort_values

import pandas

food_info = pandas.read_csv("food_info.csv")
# By default pandas sorts the data by the given column in ascending order
# and returns a new DataFrame. The defaults noted by the author:
#   inplace=False       -> a new DataFrame is returned
#   ascending=True      -> ascending order
#   kind=quicksort      -> the quicksort algorithm is used
#   na_position=last    -> NaN values go last ("first" puts them first)
# inplace=True sorts the DataFrame in place instead of returning a new one.
sort_column = "Sodium_(mg)"
food_info.sort_values(sort_column, inplace=True)
print(food_info[sort_column][0:5])
# Sort in descending order instead of ascending order
food_info.sort_values(sort_column, ascending=False, inplace=True)
print(food_info[sort_column][0:5])