1. 程式人生 > >python使用dataframe統計的一個小指令碼:

python使用dataframe統計的一個小指令碼:

 此指令碼讀取JSON檔案(每行一個JSON物件),將每行解析為字典後轉成dataframe,再通過列名逐列進行統計,最後把統計結果輸出到結果檔案。

from __future__ import division  # must come before every other statement

import json
import os

try:
    import ConfigParser  # Python 2 module name
except ImportError:
    import configparser as ConfigParser  # the module was renamed in Python 3

import pandas as pd

_metaclass_=type

# Load the input and output locations from the [config] section of config.conf.
section = "config"
cf = ConfigParser.ConfigParser()
cf.read("config.conf")  # path of the configuration file
inpath = cf.get(section, "inpath")
outpath = cf.get(section, "outpath")
print(inpath)
# Walk the tree rooted at inpath and gather the full path of every file found.
result = []
for maindir, _subdirs, file_name_list in os.walk(inpath):
    result.extend(os.path.join(maindir, filename) for filename in file_name_list)
print(result)
# Seed the report with a human-readable title row. The CSV is written later
# with header=False, so this first data row serves as the visible header.
results = pd.DataFrame([['表名稱','欄位名稱','總行數','空值行數','空值佔比','去重後行數','樣例資料','樣例資料對應條數']],columns=['tablename','columnName','總行數','無效記錄數','無效佔比','去重行數','樣例資料','樣例資料對應條數'])

# Parse every collected file (one JSON object per line) into one-row frames
# and combine them once at the end; growing a DataFrame row by row is
# quadratic, and DataFrame.append no longer exists in pandas 2.x.
frames = []
# NOTE: the loop deliberately rebinds `inpath`; the statistics section that
# follows reads it after the loop.
for inpath in result:
    if "_SUCCESS" in inpath:  # marker file, not a data file
        continue
    with open(inpath) as f:  # closed even if a line fails to parse
        for line in f:
            if not line.strip():  # tolerate blank / trailing empty lines
                continue
            dic = json.loads(line)
            frames.append(pd.DataFrame(dic, index=['1']))
df = pd.concat(frames) if frames else pd.DataFrame()
# Compute per-column statistics over the combined frame and append them,
# together with the title row already held in `results`, to the CSV at outpath.
names = df.columns

# Name the report after the last data file. "_SUCCESS" marker files are not
# loaded, so they must not supply the table name either. os.path.basename
# copes with the platform's own separator instead of assuming '\\'.
data_files = [path for path in result if "_SUCCESS" not in path]
tablename = os.path.basename(data_files[-1] if data_files else inpath)

sumcounts = len(df)
print(sumcounts)

report_columns = ['tablename','columnName','總行數','無效記錄數','無效佔比','去重行數','樣例資料','樣例資料對應條數']
summary_frames = [results]
for name in names:
    column = df[name]
    # Five most frequent values and their counts, computed once. Reading the
    # Series index/values directly avoids depending on the column labels that
    # reset_index() produces, which differ between pandas versions.
    top_values = column.value_counts().head(5)
    ylsj = top_values.index.tolist()
    ylsjcount = top_values.tolist()
    discount = len(column.drop_duplicates())  # distinct values in this column
    nullcounts = int(column.isin(['', 'NULL']).sum())  # rows holding '' or 'NULL'
    ratio = float(nullcounts) / sumcounts * 100  # percentage of such rows
    row = pd.DataFrame([[tablename,name,sumcounts,nullcounts,ratio,discount,ylsj,ylsjcount]],columns=report_columns)
    print(row)
    summary_frames.append(row)

# Assemble the report in one concat instead of appending inside the loop.
results = pd.concat(summary_frames, ignore_index=True)
results.to_csv(outpath, index=False,mode='a', header=False )

 版權所有!