1. 程式人生 > >吳裕雄 資料探勘與分析案例實戰(4)——python資料處理工具:Pandas

吳裕雄 資料探勘與分析案例實戰(4)——python資料處理工具:Pandas

# 匯入模組
import pandas as pd
import numpy as np

# Build a Series from a plain list; pandas assigns the default 0..n-1 integer index.
gdp1 = pd.Series([2.8,3.01,8.99,8.59,5.18])
print(gdp1)
# Take the first, fourth and fifth elements (here the integer labels equal the positions).
print('行號風格的序列:\n',gdp1[[0,3,4]])
# Element-wise natural logarithm through numpy
print('通過numpy函式:\n',np.log(gdp1))
# Mean GDP, once through numpy and once through the Series method
print('通過numpy函式:\n',np.mean(gdp1))
print('通過序列的方法:\n',gdp1.mean())

# Build a Series from a dict: the keys become the index labels.
gdp2 = pd.Series({'北京':2.8,'上海':3.01,'廣東':8.99,'江蘇':8.59,'浙江':5.18})
print(gdp2)
# Take the first, fourth and fifth elements BY POSITION.
# The index holds strings, so positional access must go through .iloc;
# passing integers to [] on a label-indexed Series is deprecated in pandas.
print('行名稱風格的序列:\n',gdp2.iloc[[0,3,4]])
# Take the GDP values of Shanghai, Jiangsu and Zhejiang by label
print('行名稱風格的序列:\n',gdp2[['上海','江蘇','浙江']])

# Build a Series from a one-dimensional numpy array
gdp3 = pd.Series(np.array((2.8,3.01,8.99,8.59,5.18)))
print(gdp3)

# Build DataFrames from three different kinds of input.
# Shared raw records, one row per person: name, age, gender.
_person_rows = [['張三',23,'男'],['李四',27,'女'],['王二',26,'女']]

# From a nested list: columns receive default integer labels.
df1 = pd.DataFrame(data=_person_rows)
print('巢狀列表構造資料框:\n', df1)

# From a dict of equal-length lists: the keys become the column names.
df2 = pd.DataFrame(data={
    '姓名': ['張三','李四','王二'],
    '年齡': [23,27,26],
    '性別': ['男','女','女'],
})
print('字典構造資料框:\n', df2)

# From a two-dimensional numpy array built out of the same records.
df3 = pd.DataFrame(data=np.array(_person_rows))
print('二維陣列構造資料框:\n', df3)

# Read a comma-separated text file into a DataFrame.
#   parse_dates : combine columns 0, 1 and 2 into a single 'birthday' date column
#   skiprows    : skip the first 2 lines of the file
#   skipfooter  : ignore the last 3 lines of the file
#   comment     : drop everything after a '#' on a line
#   thousands   : treat '&' as the thousands separator in numbers
#   engine      : skipfooter is only implemented by the Python parser, so request it
#                 explicitly instead of relying on pandas' warning-and-fallback path
user_income = pd.read_table(r'F:\\python_Data_analysis_and_mining\\05\\data_test01.txt', sep = ',',
                            parse_dates={'birthday':[0,1,2]}, skiprows=2, skipfooter=3,
                            comment='#', encoding='utf8', thousands='&',
                            engine='python')
print(user_income)

# Read a header-less Excel sheet: supply the four column names explicitly and
# run the first column (position 0) through str via the converters mapping.
child_cloth = pd.read_excel(io = r'F:\\python_Data_analysis_and_mining\\05\\data_test02.xlsx', header = None,
names = ['Prod_Id','Prod_Name','Prod_Color','Prod_Price'], converters = {0:str})
print(child_cloth)

# Read spreadsheet data (same column names, no converters).
# NOTE(review): the returned DataFrame is not assigned or printed, so this call
# has no visible effect when run as a script — presumably left over from an
# interactive session; confirm whether it is still needed.
pd.read_excel(io = r'C:\Users\Administrator\Desktop\data_test02.xlsx', header = None,
names = ['Prod_Id','Prod_Name','Prod_Color','Prod_Price'])

# 匯入模組
import pymysql
# Connect to the MySQL database.
# NOTE(review): the credentials are hard-coded in source; consider loading them
# from configuration or the environment instead.
conn = pymysql.connect(host='localhost', user='root', password='1q2w3e4r',
                       database='test', port=3306, charset='utf8')
try:
    # Read the whole table into a DataFrame
    user = pd.read_sql('select * from topy', conn)
finally:
    # Always release the connection, even when the query fails
    conn.close()
# Show the data (a bare expression prints nothing outside an interactive session)
print(user)

# 匯入第三方模組
import pymssql
# Connect to the SQL Server database (empty user/password, as in the original call)
connect = pymssql.connect(server = 'localhost', user = '', password = '',
                          database = 'train', charset = 'utf8')
try:
    # Read only the rows whose direction equals '朝南'
    data = pd.read_sql("select * from sec_buildings where direction = '朝南'", con=connect)
finally:
    # Always release the connection, even when the query fails
    connect.close()
# Show the first rows (a bare expression prints nothing outside an interactive session)
print(data.head())

import numpy as np
import pandas as pd

# Data type conversion and descriptive statistics
# Load the data set from a comma-separated file
sec_cars = pd.read_table(r'F:\\python_Data_analysis_and_mining\\05\\sec_cars.csv', sep = ',')
# Preview the first five rows
print(sec_cars.head())
# Show the number of rows and columns
print('資料集的行列數:\n',sec_cars.shape)
# Show the data type of every variable
print('各變數的資料型別:\n',sec_cars.dtypes)
# Convert the Boarding_time column to datetime; the source text is formatted as '<year>年<month>月'
sec_cars.Boarding_time = pd.to_datetime(sec_cars.Boarding_time, format = '%Y年%m月')
# Preview the first five rows again after the conversion
print(sec_cars.head())
# Convert the New_price column to float (currently disabled)
# sec_cars.New_price = sec_cars.New_price.str[:-1].astype('float')
# Re-check the data type of every variable
print(sec_cars.dtypes)
# Descriptive statistics of the data
print(sec_cars.describe())
# Shape characteristics of the data
# Select the columns whose dtype is not 'object', then drop the first of them with [1:]
# NOTE(review): presumably the dropped column is the datetime Boarding_time — confirm against the file's column order
num_variables = sec_cars.columns[sec_cars.dtypes !='object'][1:]
print(num_variables)

# 自定義函式,計算偏度和峰度
def skew_kurt(x):
    """Return the skewness and kurtosis of *x* as a two-element Series.

    Parameters
    ----------
    x : object exposing ``skew()`` and ``kurt()`` (e.g. a pandas Series)
        The values to summarise.

    Returns
    -------
    pandas.Series
        The two statistics, indexed by ``'Skew'`` and ``'Kurt'``.
    """
    skewness = x.skew()
    kurtosis = x.kurt()
    # Return both values under stable labels so DataFrame.apply can stack them
    return pd.Series([skewness, kurtosis], index=['Skew', 'Kurt'])

# Apply skew_kurt to each selected numeric column (column-wise is the default axis)
print(sec_cars[num_variables].apply(skew_kurt))
# Summary statistics of the discrete (object-typed) variables, preceded by a blank line
print()
print(sec_cars.describe(include=['object']))

# Frequency count of the discrete variable Discharge
Freq = sec_cars['Discharge'].value_counts()
print(Freq)
print(sec_cars.shape)
print(len(sec_cars))
# Relative frequency: each count divided by the number of rows
Freq_ratio = Freq / len(sec_cars)
print(Freq_ratio)

# Put counts and ratios side by side in one table
Freq_df = pd.DataFrame({'Freq': Freq, 'Freq_ratio': Freq_ratio})
print(Freq_df.head())
# Turn the row index back into an ordinary column
Freq_df = Freq_df.reset_index()
print(Freq_df.head())

# Load the data
df = pd.read_excel(r'F:\\python_Data_analysis_and_mining\\05\\data_test03.xlsx')
# Data type of every variable
print(df.dtypes)
# Convert the birthday variable to datetime
df.birthday = pd.to_datetime(df.birthday, format = '%Y/%m/%d')
print(df.birthday)
# Convert the phone number to a string
df.tel = df.tel.astype('str')
print(df.dtypes)
print(df.birthday.dt.year)
print(df.start_work.dt.year)
# Add the age and working-age columns.
# pd.datetime was removed from pandas; pd.Timestamp.today() gives the same current year.
current_year = pd.Timestamp.today().year
df['age'] = current_year - df.birthday.dt.year
df['workage'] = current_year - df.start_work.dt.year
print(df.head())
# Hide the middle four digits of the phone number.
# Slice by position: str.replace(x[3:7], ...) would also mask any other place
# where the same four digits happen to occur in the number.
df.tel = df.tel.apply(func = lambda x : x[:3] + '****' + x[7:])
print(df.head())
# Extract the domain of the e-mail address (the part after '@')
df['email_domain'] = df.email.apply(func = lambda x : x.split('@')[1])
print(df.head())
# Extract the profession text from the free-form 'other' column (findall yields a list per row)
df['profession'] = df.other.str.findall('專業:(.*?),')
print(df.head())
# Drop the birthday, start_work and other variables
df.drop(['birthday','start_work','other'], axis = 1, inplace = True)
print(df.head())

# Common date-handling accessors.
# The two inputs do not share one layout (the second has no time part), so a single
# strict format string cannot parse both; parse each value on its own instead.
dates = pd.Series([pd.Timestamp(s) for s in ['1989-8-18 13:14:55','1995-2-16']])
print('返回日期值:\n',dates.dt.date)
print('返回季度:\n',dates.dt.quarter)
print('返回幾點鐘:\n',dates.dt.hour)
print('返回年中的天:\n',dates.dt.dayofyear)
# dt.weekofyear was removed from pandas; the ISO calendar week is its replacement
print('返回年中的周:\n',dates.dt.isocalendar().week)
# dt.weekday_name was removed from pandas; day_name() is its replacement
print('返回星期幾的名稱:\n',dates.dt.day_name())
print('返回月份的天數:\n',dates.dt.days_in_month)

# Data cleaning
# Load the data
df = pd.read_excel(r'F:\\python_Data_analysis_and_mining\\05\\data_test04.xlsx')
print(df)
# Detect duplicated observations: True when at least one row repeats an earlier one
print('資料集中是否存在重複觀測:\n', df.duplicated().any())
# Remove the duplicated rows, keeping the result under the same name
df = df.drop_duplicates()
print(df)

# Load the data
df = pd.read_excel(r'F:\\python_Data_analysis_and_mining\\05\\data_test05.xlsx')
print(df)
# Detect missing observations.
# Reduce over the cell values: the builtin any(df.isnull()) iterates the column
# labels instead and therefore reports True for any frame with truthy column names.
print('資料集中是否存在缺失值:\n', df.isnull().values.any())
# Deletion method, by record (result is not assigned, so df itself is unchanged)
df.dropna()
# print(df)
# Deletion method, by variable (result is not assigned, so df itself is unchanged)
df.drop('age', axis = 1)
# print(df)
# Replacement method: forward fill, in place (fillna(method=...) is deprecated)
df.ffill(inplace = True)
# print(df)
# Replacement method: backward fill, in place
df.bfill(inplace = True)
# print(df)
# Replacement method: constant (result is not assigned, so df itself is unchanged)
df.fillna(value = 0)
# print(df)
# Replacement method: per-column statistics. Because the forward/backward fills
# above already ran in place, this only affects values that are still missing.
df = df.fillna(value = {'gender':df.gender.mode()[0], 'age':df.age.mean(), 'income':df.income.median()})
print(df)

# Load the data
sunspots = pd.read_table(r'F:\\python_Data_analysis_and_mining\05\\sunspots.csv', sep = ',')
print(sunspots.shape)

# Outlier detection, standard-deviation rule: look for values beyond mean +/- 2 std
xbar = sunspots['counts'].mean()
print(xbar)
xstd = sunspots['counts'].std()
print(xstd)
print('標準差法異常值上限檢測:\n', (sunspots['counts'] > xbar + 2 * xstd).any())
print('標準差法異常值下限檢測:\n', (sunspots['counts'] < xbar - 2 * xstd).any())

# Outlier detection, box-plot rule: look for values beyond the quartiles by 1.5 * IQR
Q1 = sunspots['counts'].quantile(0.25)
print(Q1)
Q3 = sunspots['counts'].quantile(0.75)
print(Q3)
IQR = Q3 - Q1
print(IQR)
print('箱線圖法異常值上限檢測:\n', (sunspots['counts'] > Q3 + 1.5 * IQR).any())
print('箱線圖法異常值下限檢測:\n', (sunspots['counts'] < Q1 - 1.5 * IQR).any())

# 匯入繪圖模組
import matplotlib.pyplot as plt
# Set the plotting style
plt.style.use('ggplot')
# Draw the histogram as a density ('normed' was removed from matplotlib; 'density' replaces it)
sunspots.counts.plot(kind = 'hist', bins = 30, density = True)
# Overlay the kernel density curve
sunspots.counts.plot(kind = 'kde')
# Show the figure
plt.show()
# Handle the outliers by replacement
print('異常值替換前的資料統計特徵:\n',sunspots.counts.describe())
# Upper fence of the box-plot rule
UL = Q3 + 1.5 * IQR
print('判別異常值的上限臨界值:\n',UL)
# Largest observed value that is still below the upper fence
replace_value = sunspots.counts[sunspots.counts < UL].max()
print('用以替換異常值的資料:\n',replace_value)
# Replace the values above the upper fence.
# Assign through .loc on the frame: chained assignment (frame.col[mask] = value)
# may write to a temporary copy and leave the frame unchanged.
sunspots.loc[sunspots.counts > UL, 'counts'] = replace_value
print('異常值替換後的資料統計特徵:\n',sunspots.counts.describe())

df1 = pd.DataFrame({'name':['張三','李四','王二','丁一','李五'],
                    'gender':['男','女','女','女','男'],
                    'age':[23,26,22,25,27]}, columns = ['name','gender','age'])
print(df1)
# Take the middle three rows (all the women) and return the name and age columns.
# iloc: purely positional, end of the slice excluded
a = df1.iloc[1:4,[0,2]]
print(a)
# loc: purely label-based, end of the slice included
b = df1.loc[1:3, ['name','age']]
print(b)
# The removed .ix indexer mixed both: row labels 1..3 with column positions 0 and 2.
# The same selection is spelled with loc plus a positional column lookup.
c = df1.loc[1:3, df1.columns[[0,2]]]
print(c)
# Use the employee name as the row label
df2 = df1.set_index('name')
print(df2)
# Take the middle three rows of the name-indexed frame
a = df2.iloc[1:4,:]
print(a)
b = df2.loc[['李四','王二','丁一'],:]
print(b)
# With a string index .ix treated the integer slice as positions, i.e. iloc
c = df2.iloc[1:4,:]
print(c)
# Filter with a condition: name and age of all the men.
# A boolean Series mask goes through loc; iloc does not accept one.
a = df1.loc[df1.gender == '男',['name','age']]
print(a)
# Formerly written with .ix; loc is the equivalent for a boolean mask plus column labels
b = df1.loc[df1.gender == '男',['name','age']]
print(b)

# Load the data from a comma-separated file
diamonds = pd.read_table(r'F:\\python_Data_analysis_and_mining\\05\\diamonds.csv', sep = ',')
print(diamonds.shape)
print(diamonds.head())
# Mean statistics of price for a single grouping variable (color); no aggfunc is
# passed, so pivot_table's default aggregation applies. margins adds a '總計' row.
a = pd.pivot_table(data = diamonds, index = 'color', values = 'price', margins = True, margins_name = '總計')
print(a)
# Contingency table of two grouping variables
# Import the numpy module
import numpy as np

# Rows are clarity, columns are cut; np.size is applied to the carat values in
# each cell, and margins adds the '總計' row and column.
b = pd.pivot_table(data = diamonds, index = 'clarity', columns = 'cut', values = 'carat',
aggfunc = np.size,margins = True, margins_name = '總計')
print(b)

# Build the two data sets df1 and df2
df1 = pd.DataFrame({
    'name': ['張三','李四','王二'],
    'age': [21,25,22],
    'gender': ['男','女','男'],
})
print(df1)
df2 = pd.DataFrame({
    'name': ['丁一','趙五'],
    'age': [23,22],
    'gender': ['女','女'],
})
print(df2)
# Stack the two frames vertically; keys adds an outer index level naming the source frame
a = pd.concat(objs=[df1, df2], keys=['df1', 'df2'])
print(a)
# Now suppose the name column of df2 is called 'Name' instead
df2 = pd.DataFrame({
    'Name': ['丁一','趙五'],
    'age': [23,22],
    'gender': ['女','女'],
})
# Stack vertically again: the columns no longer line up
b = pd.concat(objs=[df1, df2])
print(b)

# Build the three data sets
df3 = pd.DataFrame({
    'id': [1,2,3,4,5],
    'name': ['張三','李四','王二','丁一','趙五'],
    'age': [27,24,25,23,25],
    'gender': ['男','男','男','女','女'],
})
print(df3)
df4 = pd.DataFrame({
    'Id': [1,2,2,4,4,4,5],
    'kemu': ['科目1','科目1','科目2','科目1','科目2','科目3','科目1'],
    'score': [83,81,87,75,86,74,88],
})
print(df4)
df5 = pd.DataFrame({
    'id': [1,3,5],
    'name': ['張三','王二','趙五'],
    'income': [13500,18000,15000],
})
print(df5)
# Join the three tables
# First left-join df3 with df4; the key is spelled 'id' on the left and 'Id' on the right
merge1 = df3.merge(df4, how='left', left_on='id', right_on='Id')
print(merge1)
# Then left-join that result with df5; no key is given, so the shared column names are used
merge2 = merge1.merge(df5, how='left')
print(merge2)

# Load the data
diamonds = pd.read_table(r'F:\\python_Data_analysis_and_mining\\05\\diamonds.csv', sep = ',')
print(diamonds.shape)
print(diamonds.head())
# Group by color and cut (the original built and printed this same groupby twice)
grouped = diamonds.groupby(by = ['color','cut'])
print(grouped)
# Summarise each group: group size, minimum carat, mean price, maximum face_width
result = grouped.aggregate({'color':np.size, 'carat':np.min, 'price':np.mean, 'face_width':np.max})
# Fix the order of the summary columns
result = pd.DataFrame(result, columns=['color','carat','price','face_width'])
# Rename the summary columns. The aggregated 'color' column must be renamed before
# reset_index, otherwise it would clash with the 'color' index level.
result.rename(columns={'color':'counts','carat':'min_weight','price':'avg_price','face_width':'max_face_width'}, inplace=True)
# Turn the group keys in the row index back into ordinary columns
result.reset_index(inplace=True)
# Show the summary (a bare expression prints nothing outside an interactive session)
print(result)