1. 程式人生 > 資料清洗 -> 資料入庫 -> 資料視覺化的簡單專案

資料清洗-> 資料入庫-> 資料視覺化 的 簡單專案

資料是從同事那裡拿來的,大約 60 萬條、幾百 MB,是市面上某款保險櫃的資料,現在要分析這批資料。

資料清洗:略

資料入庫:略

資料視覺化:

#!/usr/bin/python3
 
import pymysql


# Message types defined by the business; used as the reference list when
# filling in types that have no rows in the queried data.
type_list = [
    "userInfoSync",
    "alertReport",
    "changeNetwork",
    "closeDoor",
    "dataSync",
    "deleteFP",
    "dynPwd",
    "dynPwdSecond",
    "formatDevice",
    "heartbeat",
    "lock_activation",
    "network",
    "openDoor",
    "readStatus",
    "regFP",
    "resetting",
    "setCtlPwd",
    "updateFirmware",
]


def get_type_counts():
    """Return the number of rows per message type in the ``dictionary`` table.

    The query groups rows by ``type`` and leaves out rows whose type is the
    string ``'NULL'`` or ``'networkStatus'``.

    Returns:
        dict: maps each ``type`` value to its row count. If the query fails,
        the error is printed and the counts gathered so far (possibly an
        empty dict) are returned.

    Raises:
        Any exception raised by ``pymysql.connect`` propagates to the caller,
        because no counts can be produced without a connection.
    """
    config = {
        "mysql_config": {
            "host": "***",
            "user": "***",
            "password": "***",
            "database": "***",
        }
    }
    mysql_config = config["mysql_config"]

    type_counts_dict = {}

    # SQL query
    sql = "SELECT type,count(*) as freq  FROM dictionary WHERE type != 'NULL' and type != 'networkStatus' group by type ;"

    # Open the database connection. Keyword arguments are used because
    # PyMySQL 1.0+ no longer accepts positional connection parameters.
    db = pymysql.connect(
        host=mysql_config["host"],
        user=mysql_config["user"],
        password=mysql_config["password"],
        database=mysql_config["database"],
        charset='utf8',
    )
    try:
        cursor = db.cursor()
        try:
            cursor.execute(sql)
            # Each row is (type, freq), in the column order of the SELECT.
            for type_name, freq in cursor.fetchall():
                type_counts_dict[type_name] = freq
        finally:
            cursor.close()
    except pymysql.MySQLError as exc:
        # Best effort: report the failure and return what was collected.
        print("Error: unable to fetch data:", exc)
    finally:
        # Close the connection whether or not the query succeeded.
        db.close()
    return type_counts_dict

def fill_null_type(type_counts_dict, type_list):
    """Give every expected type that is absent from the data a count of 0.

    The comparison is made on the type names themselves rather than on the
    number of entries, so missing types are filled in even when the data
    also contains types that are not in ``type_list``.

    Args:
        type_counts_dict: dict mapping type -> count; updated in place.
        type_list: the types the business expects to see.

    Returns:
        The same ``type_counts_dict`` object, with a zero entry added for
        each expected type it did not contain. Unexpected types are kept.
    """
    expected = set(type_list)
    # dict.fromkeys drops duplicates while keeping type_list's order.
    null_type = [t for t in dict.fromkeys(type_list) if t not in type_counts_dict]
    unexpected = [t for t in type_counts_dict if t not in expected]

    if null_type:
        print(null_type)
        for name in null_type:
            type_counts_dict[name] = 0

    if unexpected:
        print("Error: Data type is larger than business type!!!", unexpected)
    elif not null_type:
        print("Info: Data type is equals  business type!!!")
    return type_counts_dict

def data_visualization(type_counts_dict):
    """Show a horizontal bar chart of the count for each type.

    Bars are ordered by count, largest first; ``barh`` places position 0 at
    the bottom, so the largest bar is the lowest one.

    Args:
        type_counts_dict: dict mapping type name -> count.

    Returns:
        None. If the dict is empty a message is printed and nothing is drawn.
    """
    import matplotlib
    import matplotlib.pyplot as plt

    if not type_counts_dict:
        # max() further down raises ValueError on an empty sequence.
        print("Error: no data to plot")
        return

    # NOTE(review): SimHei is presumably configured so CJK labels render;
    # the labels used here are ASCII - confirm it is still needed.
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False

    # Sort (count, name) pairs in descending order; ties fall back to name.
    ranked = sorted(zip(type_counts_dict.values(), type_counts_dict.keys()), reverse=True)
    datas = [count for count, _ in ranked]
    type_name = [name for _, name in ranked]
    positions = range(len(datas))

    # barh(y positions, bar lengths): drawn from the bottom upwards.
    plt.barh(positions, datas, height=0.5, color='steelblue', alpha=0.8)
    plt.yticks(positions, type_name)
    # Leave room to the right of the longest bar for its value label.
    plt.xlim(0, max(datas) + 1000)
    plt.xlabel("Data Proportion")
    plt.title("Different types of data volume")
    for position, count in enumerate(datas):
        # Write the count just past the end of its bar.
        plt.text(count + 1/2, position - 0.1, '%s' % count)
    plt.show()


# Fetch the per-type counts from the database
type_counts_dict = get_type_counts()
# Add the types the business requires that are absent from the data
type_counts_dict = fill_null_type(type_counts_dict,type_list)   
# Display the result
data_visualization(type_counts_dict)

橫條形圖

#!/usr/bin/python3
 
import pymysql
import json

  

# Fetch the data
def get_type_counts():
    """Count ``openDoor`` messages by the ``openDoorType`` in their payload.

    Reads the ``msg`` column of every ``dictionary`` row whose type is
    ``'openDoor'``, parses it as JSON and tallies ``["data"]["openDoorType"]``.

    Returns:
        dict: maps each ``openDoorType`` value to the number of messages
        carrying it. If the query fails, the error is printed and an empty
        dict is returned. Messages that cannot be parsed are skipped and
        their number is printed.

    Raises:
        Any exception raised by ``pymysql.connect`` propagates to the caller.
    """
    config = {
        "mysql_config": {
            "host": "****",
            "user": "***",
            "password": "***.***",
            "database": "****",
        }
    }
    mysql_config = config["mysql_config"]

    open_Doortype_counts_dict = {}

    # SQL query
    sql = "SELECT  msg  FROM dictionary WHERE type = 'openDoor';"

    # Open the database connection. Keyword arguments are used because
    # PyMySQL 1.0+ no longer accepts positional connection parameters.
    db = pymysql.connect(
        host=mysql_config["host"],
        user=mysql_config["user"],
        password=mysql_config["password"],
        database=mysql_config["database"],
        charset='utf8',
    )
    try:
        cursor = db.cursor()
        try:
            cursor.execute(sql)
            results = cursor.fetchall()
        finally:
            cursor.close()
    except pymysql.MySQLError as exc:
        print("Error: unable to fetch data:", exc)
        return open_Doortype_counts_dict
    finally:
        # Close the connection whether or not the query succeeded.
        db.close()

    skipped = 0
    for row in results:
        try:
            # Parse the column value itself, not the repr of the row tuple;
            # json.loads ignores surrounding whitespace such as a trailing
            # newline.
            open_Doortype = json.loads(row[0])["data"]["openDoorType"]
        except (ValueError, KeyError, TypeError):
            # One malformed message should not discard the whole tally.
            skipped += 1
            continue
        open_Doortype_counts_dict[open_Doortype] = (
            open_Doortype_counts_dict.get(open_Doortype, 0) + 1
        )

    if skipped:
        print("Error: unable to parse %d openDoor message(s)" % skipped)
    return open_Doortype_counts_dict


# Fetch the open-door counts from the database
open_Doortype_counts_dict = get_type_counts()

# Debug print and a sample of its output:
#print(open_Doortype_counts_dict)
#{'3': 2191, '1': 1275}





# Fill in the missing categories
def fill_null_type(open_Doortype_counts_dict):
    """Give every known open-door type that is absent from the data a count of 0.

    The known types are the strings ``"0"`` to ``"4"``. The comparison is
    made on the keys themselves rather than on the number of entries.

    Args:
        open_Doortype_counts_dict: dict mapping open-door type -> count;
            updated in place.

    Returns:
        The same dict object, with a zero entry added for each known type it
        did not contain. Keys outside the known types are kept.
    """
    type_list = ["0", "1", "2", "3", "4"]
    null_type = [t for t in type_list if t not in open_Doortype_counts_dict]
    unexpected = [t for t in open_Doortype_counts_dict if t not in type_list]

    if null_type:
        print(null_type)
        for name in null_type:
            open_Doortype_counts_dict[name] = 0

    if unexpected:
        print("Error: Data type is larger than business type!!!", unexpected)
    elif not null_type:
        print("Info: Data type is equals  business type!!!")
    return open_Doortype_counts_dict

# Fill in zero counts for the open-door types that have no data
open_Doortype_counts_dict = fill_null_type(open_Doortype_counts_dict)

# Data visualization
def data_visualization(open_Doortype_counts_dict):
    """Show a pie chart of how often each open-door type occurs.

    Args:
        open_Doortype_counts_dict: dict mapping open-door type code
            (``'0'`` to ``'4'``) -> count. Codes without a display name are
            ignored.

    Returns:
        None. If there is nothing to plot (no known codes, or all counts are
        zero) a message is printed and no figure is shown.
    """
    import numpy as np
    import matplotlib.pyplot as plt

    open_Doortype_name_dict = {'0': "Bluetooth opening", '1': "Open the door remotely", '2': "Password open", '3': "Fingerprint opening", '4': "Dynamic cipher"}

    # Translate codes to display names, e.g. '0' -> "Bluetooth opening".
    type_name = []
    datas = []
    for code, count in open_Doortype_counts_dict.items():
        if code in open_Doortype_name_dict:
            type_name.append(open_Doortype_name_dict[code])
            datas.append(count)

    if not datas or sum(datas) <= 0:
        # A pie needs a positive total to compute wedge fractions.
        print("Error: no open-door data to plot")
        return

    _fig, ax = plt.subplots(figsize=(9, 20), subplot_kw=dict(aspect="equal"))

    def func(pct, allvals):
        # Round rather than truncate: pct is a float, so pct/100*total can
        # land just below the true integer count.
        absolute = int(np.round(pct / 100. * np.sum(allvals)))
        return "{:.1f}%\n({:d} )".format(pct, absolute)

    wedges, texts, autotexts = ax.pie(datas, autopct=lambda pct: func(pct, datas),
                                      textprops=dict(color="w"))

    # Legend placed to the right of the pie
    ax.legend(wedges, type_name,
              title="Open door type",
              loc="center left",
              bbox_to_anchor=(1, 0, 0.5, 0.5))
    # Text drawn on the wedges
    plt.setp(autotexts, size=20, weight="bold")
    # Title
    ax.set_title("Open Door Type Proportion", size=20)

    plt.show()
    
# Draw the pie chart of open-door types
data_visualization(open_Doortype_counts_dict)

 

 

 

就先這樣吧。