《Python資料分析與挖掘實戰》第7章——kmeans
阿新 • • 發佈:2019-01-07
本文是基於《Python資料分析與挖掘實戰》的實戰部分的第七章的資料——《航空公司客戶價值分析》做的分析。
旨在補充原文中的細節程式碼,並給出文中涉及到的內容的完整程式碼。
1)在資料預處理部分增加了屬性規約、資料變換的程式碼
2)在模型構建的部分增加了一個畫出雷達圖的函式程式碼
1 背景與目標分析
此專案旨在根據航空公司提供的資料,對其客戶進行分類,並比較不同類別客戶的價值,以便為客戶提供更好的個性化服務作參考。
2 資料探索
2.1 資料質量分析
# Basic data exploration: for every attribute, report the number of missing
# values together with the maximum and the minimum.
import pandas as pd

datafile = 'air_data.csv'  # raw airline data; the first row holds the attribute names
result = 'explore.xlsx'  # workbook that receives the exploration summary

data = pd.read_csv(datafile, encoding='utf-8')

# include='all' describes every column, not only the numeric ones; the
# transpose puts one attribute per row.
explore = data.describe(percentiles=[], include='all').T

# 'count' holds the number of non-null entries, so the difference to the
# total row count is the number of nulls.
explore['null'] = len(data) - explore['count']

# Keep the three statistics of interest and give them their report headers.
explore1 = explore[['null', 'max', 'min']].rename(
    columns={'null': u'空值數', 'max': u'最大值', 'min': u'最小值'})
explore1.to_excel(result)
部分截圖如下
3 資料預處理
3.1 資料清洗
# Data cleaning: drop records with missing fares and records whose fares are
# zero although the customer actually flew.
datafile = 'air_data.csv'  # raw airline data; the first row holds the attribute names
data = pd.read_csv(datafile, encoding='utf-8')

cleanedfile = 'cleaned.xlsx'

# Keep only the records where both yearly fares are present.
# `&` is the element-wise boolean AND for masks.
data1 = data[data['SUM_YR_1'].notnull() & data['SUM_YR_2'].notnull()]

# Keep a record when either yearly fare is non-zero, or when the total
# mileage and the average discount are both zero.
index1 = data1['SUM_YR_1'] != 0
index2 = data1['SUM_YR_2'] != 0
index3 = (data1['SEG_KM_SUM'] == 0) & (data1['avg_discount'] == 0)
data1 = data1[index1 | index2 | index3]  # logical OR of the three conditions
data1.to_excel(cleanedfile)

# Attribute reduction: keep only the columns needed to build the model features.
data2 = data1[['LOAD_TIME', 'FFP_DATE', 'LAST_TO_END',
               'FLIGHT_COUNT', 'SEG_KM_SUM', 'avg_discount']]
# index=False: the next step removes the first two columns by position, so
# the row index must not be written as an extra leading column.
data2.to_excel('datadecrese.xlsx', index=False)
3.2 資料規約及屬性構造
import numpy as np

# Length of an average Gregorian month in seconds (365.2425 days / 12).
# Dividing by np.timedelta64(1, 'M') is rejected by pandas >= 2.0 because the
# 'M' unit is ambiguous; this constant keeps the same scale explicitly.
SECONDS_PER_MONTH = 2629746

data = pd.read_excel('datadecrese.xlsx')

# Membership length: time between joining (FFP_DATE) and the end of the
# observation window (LOAD_TIME), expressed in months.
membership = pd.to_datetime(data['LOAD_TIME']) - pd.to_datetime(data['FFP_DATE'])
# round() keeps the float dtype; formatting with '%.2f' would yield strings.
data['L3'] = (membership.dt.total_seconds() / SECONDS_PER_MONTH).round(2)

# Approximates a month as 30 days, so this value is only approximate.
data['LAST_TO_END'] = (data['LAST_TO_END'] / 30).round(2)
data['avg_discount'] = data['avg_discount'].round(2)

# Select the model attributes by name instead of dropping the first two
# columns by position, so an extra leading column (for example a saved row
# index) cannot shift which columns are removed.
data = data[['LAST_TO_END', 'FLIGHT_COUNT', 'SEG_KM_SUM', 'avg_discount', 'L3']]
data = data.rename(columns={'LAST_TO_END': 'R', 'FLIGHT_COUNT': 'F',
                            'SEG_KM_SUM': 'M', 'avg_discount': 'C', 'L3': 'L'})
data.to_excel('sxgz.xlsx', index=False)
def f(x):
    """Return the minimum and maximum of *x* as a two-element Series.

    Meant to be passed to ``DataFrame.apply`` so that each column is reduced
    to a ``'min'`` row and a ``'max'`` row.

    Parameters
    ----------
    x : object providing ``min()`` and ``max()`` (e.g. a pandas Series)

    Returns
    -------
    pandas.Series
        Indexed by ``['min', 'max']``.
    """
    # pd.Series rather than a bare Series: the bare name is only imported
    # further down the file.
    return pd.Series([x.min(), x.max()], index=['min', 'max'])
# Reduce every column of the constructed table to its min/max pair
# (column-wise application) and save the resulting summary.
d = data.apply(f, axis=0)
d.to_excel('summary_data.xlsx')
如下表:
標準化
# 3> Data standardisation (z-score)
d1 = pd.read_excel('sxgz.xlsx')

# NOTE: pandas' std() is the sample standard deviation (ddof=1), whereas
# sklearn's StandardScaler divides by the population one (ddof=0), so the two
# results are close but not identical.
d2 = (d1 - d1.mean()) / d1.std()

# Reorder to L, R, F, M, C by name (not by position) so the result does not
# depend on the column order of the workbook, then prefix every header with 'Z'.
d1 = d2[['L', 'R', 'F', 'M', 'C']].add_prefix('Z')
d1.to_excel('sjbzh.xlsx', index=False)
4 模型構建
#使用K-means聚類演算法分類並分析每類的特徵
import pandas as pd
from pandas import DataFrame,Series
from sklearn.cluster import KMeans #匯入K均值聚類演算法
k = 5  # number of clusters
d3 = pd.read_excel('sjbzh.xlsx')

# Run K-means on the standardised attributes.
# The former `n_jobs` argument is not passed: it was deprecated in
# scikit-learn 0.23 and removed in 1.0, where it raises a TypeError.
kmodel = KMeans(n_clusters=k)
kmodel.fit(d3)

labels = kmodel.labels_  # cluster label of every sample
demo = DataFrame(labels, columns=['numbers'])
demo1 = DataFrame(kmodel.cluster_centers_, columns=d3.columns)  # cluster centres
demo2 = demo['numbers'].value_counts()  # number of samples per cluster

# Both pieces are indexed by cluster label, so concat lines each count up
# with the matching centre.
demo4 = pd.concat([demo2, demo1], axis=1)
demo4.index.name = 'labels'
demo4.to_excel('kmeansresults.xlsx')

# print(...) with a single argument works on both Python 2 and Python 3.
print(kmodel.cluster_centers_)  # cluster centres
print(kmodel.labels_)  # cluster label of every sample
畫雷達圖
# Radar chart of the customer-group profiles (one polygon per cluster centre).
subset = demo1.copy()
subset = subset.round(3)
subset.to_excel('testradar.xlsx')

# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# NumPy array on old and new versions.
data = subset.values

from radar1 import drawRader  # radar-plot helper kept in a separate module

title = 'RadarPicture'
rgrids = [0.5, 1, 1.5, 2, 2.5]
itemnames = ['ZL', 'ZR', 'ZF', 'ZM', 'ZC']
labels = list('abcde')  # legend entries, one per cluster
drawRader(itemnames=itemnames, data=data, title=title, labels=labels,
          saveas='2.jpg', rgrids=rgrids)
其詳細程式碼如下:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.path import Path
from matplotlib.spines import Spine
from matplotlib.projections.polar import PolarAxes
from matplotlib.projections import register_projection
def radar_factory(num_vars, frame='circle'):
    """Create a radar chart with `num_vars` axes.

    This function creates a RadarAxes projection and registers it under the
    projection name ``'radar'``.

    Parameters
    ----------
    num_vars : int
        Number of variables for radar chart.
    frame : {'circle' | 'polygon'}
        Shape of frame surrounding axes.

    Returns
    -------
    theta : numpy.ndarray
        The `num_vars` evenly spaced spoke angles in radians, rotated by
        pi/2 so that the first spoke points to the top.

    Raises
    ------
    ValueError
        If `frame` is neither ``'circle'`` nor ``'polygon'``.
    """
    # calculate evenly-spaced axis angles
    theta = np.linspace(0, 2*np.pi, num_vars, endpoint=False)
    # rotate theta such that the first axis is at the top
    theta += np.pi/2

    def draw_poly_patch(self):
        verts = unit_poly_verts(theta)
        return plt.Polygon(verts, closed=True, edgecolor='k')

    def draw_circle_patch(self):
        # circle of radius 0.5 centered on (0.5, 0.5)
        return plt.Circle((0.5, 0.5), 0.5)

    patch_dict = {'polygon': draw_poly_patch, 'circle': draw_circle_patch}
    if frame not in patch_dict:
        raise ValueError('unknown value for `frame`: %s' % frame)

    class RadarAxes(PolarAxes):
        """Polar axes whose lines and fills are closed into polygons."""

        name = 'radar'
        # use 1 line segment to connect specified points
        RESOLUTION = 1
        # patch factory chosen from `frame`; called as a method below
        draw_patch = patch_dict[frame]

        def fill(self, *args, **kwargs):
            """Override fill so that line is closed by default"""
            closed = kwargs.pop('closed', True)
            return super(RadarAxes, self).fill(closed=closed, *args, **kwargs)

        def plot(self, *args, **kwargs):
            """Override plot so that line is closed by default"""
            lines = super(RadarAxes, self).plot(*args, **kwargs)
            for line in lines:
                self._close_line(line)
            # Hand the created lines back to the caller, as the parent
            # class's plot() does.
            return lines

        def _close_line(self, line):
            """Append the first point to `line` unless it is already closed."""
            x, y = line.get_data()
            # FIXME: markers at x[0], y[0] get doubled-up
            if x[0] != x[-1]:
                x = np.concatenate((x, [x[0]]))
                y = np.concatenate((y, [y[0]]))
                line.set_data(x, y)

        def set_varlabels(self, labels):
            """Label the spokes with `labels`, one per angle in `theta`."""
            self.set_thetagrids(np.degrees(theta), labels)

        def _gen_axes_patch(self):
            return self.draw_patch()

        def _gen_axes_spines(self):
            if frame == 'circle':
                return PolarAxes._gen_axes_spines(self)
            # The following is a hack to get the spines (i.e. the axes frame)
            # to draw correctly for a polygon frame.

            # spine_type must be 'left', 'right', 'top', 'bottom', or `circle`.
            spine_type = 'circle'
            verts = unit_poly_verts(theta)
            # close off polygon by repeating first vertex
            verts.append(verts[0])
            path = Path(verts)

            spine = Spine(self, spine_type, path)
            spine.set_transform(self.transAxes)
            return {'polar': spine}

    register_projection(RadarAxes)
    return theta
def unit_poly_verts(theta):
    """Return vertices of polygon for subplot axes.

    Each angle in `theta` (radians) yields one vertex on the circle of
    radius 0.5 centered at (0.5, 0.5).

    Parameters
    ----------
    theta : iterable of float
        Vertex angles in radians.

    Returns
    -------
    list of (x, y) tuples
        One vertex per angle, in the order given. A fresh list is returned,
        so callers may append to it.
    """
    x0, y0, r = 0.5, 0.5, 0.5
    return [(r * np.cos(t) + x0, r * np.sin(t) + y0) for t in theta]
def example_data():
    """Return the sample data drawn by the demo at the bottom of the file.

    Returns
    -------
    list
        The first element is the list of spoke labels. Every following
        element is a ``(title, rows)`` tuple, where `rows` holds one list of
        five values (one per spoke) for each polygon to draw.
    """
    # NOTE(review): the rows look like the rounded K-means cluster centres of
    # the standardised attributes -- confirm against kmeansresults.xlsx.
    data1 = [
        ['ZL', 'ZR', 'ZF', 'ZM', 'ZC'],
        ('R',
         [[0.063, -0.004, -0.226, -0.229, 2.195],
          [1.161, -0.377, -0.087, -0.095, -0.159],
          [0.483, -0.799, 2.483, 2.425, 0.308],
          [-0.314, 1.686, -0.574, -0.537, -0.173],
          [-0.7, -0.415, -0.161, -0.161, -0.253]]
         ),
    ]
    return data1
if __name__ == '__main__':
    # Configure fonts up front. SimHei is a CJK-capable font and must be
    # installed; unicode_minus=False draws minus signs as ASCII hyphens.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    N = 5
    theta = radar_factory(N, frame='polygon')

    data = example_data()
    spoke_labels = data.pop(0)

    # One radar axes per case, at most two per row, instead of a fixed 2x2
    # grid that leaves empty frames when there are fewer than four cases.
    n_cases = max(len(data), 1)
    n_cols = min(n_cases, 2)
    n_rows = -(-n_cases // n_cols)  # ceiling division
    # squeeze=False keeps `axes` two-dimensional even for a single subplot.
    fig, axes = plt.subplots(figsize=(9, 9), nrows=n_rows, ncols=n_cols,
                             squeeze=False,
                             subplot_kw=dict(projection='radar'))
    fig.subplots_adjust(wspace=0.25, hspace=0.20, top=0.85, bottom=0.05)

    colors = ['b', 'r', 'g', 'm', 'y']
    # Plot every case from the example data on its own axes
    for ax, (title, case_data) in zip(axes.flatten(), data):
        ax.set_rgrids([0.5, 1, 1.5, 2, 2.5])
        ax.set_title(title, weight='bold', size='medium', position=(0.5, 1.1),
                     horizontalalignment='center', verticalalignment='center')
        for d, color in zip(case_data, colors):
            ax.plot(theta, d, color=color)
            ax.fill(theta, d, facecolor=color, alpha=0.25)
        ax.set_varlabels(spoke_labels)

    # add legend relative to top-left plot
    ax = axes[0, 0]
    labels = list('abcde')
    legend = ax.legend(labels, loc=(0.9, .95),
                       labelspacing=0.1, fontsize='small')

    fig.text(0.5, 0.965, '5-Factor Solution Profiles Across Four Scenarios',
             horizontalalignment='center', color='black', weight='bold',
             size='large')
    plt.show()
此章中,由於matplotlib官網上的範例是一次畫出一組共四個雷達圖,為了改成只畫出一個雷達圖,可花了博主不少時間(掩面笑哭。。。)
備註:本章節完整程式碼詳見點選開啟連結