1. 程式人生 > Python 資料處理中會用到的一般操作

Python 資料處理中會用到的一般操作

xlsx檔案轉為csv

import pandas as pd  # DataFrame I/O: reads the workbook, writes the CSV
import numpy as np  # imported by the original snippet; not used in this step

# Directory that holds the workbook, followed by the full workbook path.
path = "/home/public/GFQ/math_model/"
filepath_poi = f"{path}data.xlsx"

# Load the workbook into ``data1`` and write it back out as a UTF-8 CSV
# without the DataFrame index column.
data1 = pd.read_excel(filepath_poi)
data1.to_csv('select.csv', encoding='utf-8', index=False)

按概率統計(把某一欄的類別代碼替換為該代碼在全部資料中出現的頻率)

import csv
from collections import Counter

# Divisor that turns a raw count into a frequency; this is the value the
# original snippet hard-coded.
TOTAL_ROWS = 114184
# Zero-based index of the column holding the integer code to replace.
CODE_COLUMN = 8


def count_codes(csv_path, column=CODE_COLUMN):
    """Count the data rows of ``csv_path`` per integer code in ``column``.

    The first row is treated as a header and skipped.  A ``Counter`` replaces
    the fixed 1005-slot array of the original snippet, so any integer code is
    accepted: codes of 1005 or more no longer raise ``IndexError`` and
    negative codes no longer wrap around to the end of the array.

    Args:
        csv_path: Path of the CSV file to scan.
        column: Zero-based index of the column holding the code.

    Returns:
        A ``Counter`` mapping each code to its number of occurrences.

    Raises:
        ValueError: If a cell in ``column`` cannot be parsed by ``int``.
        IndexError: If a row is too short to contain ``column``.
    """
    # newline='' is what the csv module requires of its file objects; UTF-8
    # matches the ``to_csv(..., encoding='utf-8')`` calls elsewhere in this
    # file.
    with open(csv_path, newline='', encoding='utf-8') as src:
        reader = csv.reader(src)
        next(reader, None)  # skip the header row; tolerate an empty file
        return Counter(int(row[column]) for row in reader)


def write_code_frequencies(src_path, dst_path, column=CODE_COLUMN,
                           total=TOTAL_ROWS):
    """Copy ``src_path`` to ``dst_path`` with ``column`` turned into frequencies.

    The header row is copied unchanged.  In every other row the integer code
    in ``column`` is replaced by ``count(code) / total``.

    Args:
        src_path: Path of the CSV file to read.
        dst_path: Path of the CSV file to create (overwritten if it exists).
        column: Zero-based index of the column holding the code.
        total: Divisor applied to each count.

    Returns:
        The ``Counter`` of codes that was used for the replacement.
    """
    counts = count_codes(src_path, column)
    with open(src_path, newline='', encoding='utf-8') as src, \
            open(dst_path, mode='w', newline='', encoding='utf-8') as dst:
        reader = csv.reader(src)
        writer = csv.writer(dst)
        header = next(reader, None)
        if header is not None:
            writer.writerow(header)
        for row in reader:
            row[column] = counts[int(row[column])] / total
            writer.writerow(row)
    return counts


if __name__ == "__main__":
    # NOTE(review): ``csv_file`` is not defined anywhere in this snippet; set
    # it to the path of the input CSV before running.
    # ``data`` keeps the name of the original lookup table (code -> count).
    data = write_code_frequencies(csv_file, 'pre.csv')

或者(改用字典計數,一次處理索引為 31 與 32 的兩欄)

import csv
from collections import Counter

# Divisor that turns a raw count into a frequency; this is the value the
# original snippet hard-coded.
TOTAL_ROWS = 114184
# Zero-based indices of the columns whose values are replaced by frequencies.
FREQUENCY_COLUMNS = (31, 32)


def write_value_frequencies(src_path, dst_path, columns=FREQUENCY_COLUMNS,
                            total=TOTAL_ROWS):
    """Copy ``src_path`` to ``dst_path`` with ``columns`` turned into frequencies.

    The file is read twice: a first pass counts, per column, how often each
    distinct cell value (compared as a string) occurs; a second pass copies
    the rows, replacing each cell in ``columns`` by ``count(value) / total``.
    The header row is copied unchanged and is not counted.

    Args:
        src_path: Path of the CSV file to read.
        dst_path: Path of the CSV file to create (overwritten if it exists).
        columns: Zero-based indices of the columns to replace.
        total: Divisor applied to each count.

    Returns:
        A dict mapping each column index to the ``Counter`` of its values.

    Raises:
        IndexError: If a row is too short to contain one of ``columns``.
    """
    counts = {column: Counter() for column in columns}

    # newline='' is what the csv module requires of its file objects; UTF-8
    # matches the ``to_csv(..., encoding='utf-8')`` calls elsewhere in this
    # file.
    with open(src_path, newline='', encoding='utf-8') as src:
        reader = csv.reader(src)
        next(reader, None)  # skip the header row; tolerate an empty file
        for row in reader:
            for column in columns:
                counts[column][row[column]] += 1

    with open(src_path, newline='', encoding='utf-8') as src, \
            open(dst_path, mode='w', newline='', encoding='utf-8') as dst:
        reader = csv.reader(src)
        writer = csv.writer(dst)
        header = next(reader, None)
        if header is not None:
            writer.writerow(header)
        for row in reader:
            for column in columns:
                row[column] = counts[column][row[column]] / total
            writer.writerow(row)
    return counts


if __name__ == "__main__":
    column_counts = write_value_frequencies('one1.csv', 'one2.csv')
    # Same names as the original lookups; the values are now plain counts
    # rather than one-element lists.
    d = column_counts[31]
    e = column_counts[32]

缺失值處理(處理完記得檢查缺失值是否真的補上了,例如 np.isnan(train['doubtterr']).any() 應回傳 False)

import pandas as pd
import numpy as np
import csv

path = "/home/public/GFQ/math_model/"

# Columns whose gaps are filled with the column mean.
MEAN_FILLED_COLUMNS = (
    'nkill',
    'nkillus',
    'nkillter',
    'nwound',
    'nwoundus',
    'nwoundte',
    'nperpcap',
)
# Columns whose gaps are filled with 0.
ZERO_FILLED_COLUMNS = ('multiple',)
# Columns whose gaps are filled with the most frequent value.
MODE_FILLED_COLUMNS = (
    'guncertain1',
    'doubtterr',
    'gname',
    'claimmode',
    'ishostkid',
)


def fill_missing_values(frame):
    """Return a copy of ``frame`` with the known columns' missing values filled.

    Three strategies are applied, one per column group:

    * ``MEAN_FILLED_COLUMNS`` - filled with the column mean.
    * ``ZERO_FILLED_COLUMNS`` - filled with ``0``.
    * ``MODE_FILLED_COLUMNS`` - filled with the most frequent value; a column
      with no non-missing value at all is left unchanged.

    Args:
        frame: DataFrame containing every column named in the three groups.

    Returns:
        A new DataFrame; ``frame`` itself is not modified.

    Raises:
        KeyError: If one of the expected columns is absent.
    """
    frame = frame.copy()

    for column in MEAN_FILLED_COLUMNS:
        frame[column] = frame[column].fillna(frame[column].mean())

    for column in ZERO_FILLED_COLUMNS:
        # The original assigned ``nwoundte`` here, which overwrote the whole
        # column with another column's data; fill the column's own gaps.
        frame[column] = frame[column].fillna(0)

    for column in MODE_FILLED_COLUMNS:
        modes = frame[column].mode()
        # ``mode()`` returns a Series (ties give several values, an all-missing
        # column gives none).  Passing that Series to ``fillna`` aligns it on
        # the index and leaves most gaps unfilled, so use the first mode as a
        # scalar instead.
        if not modes.empty:
            frame[column] = frame[column].fillna(modes.iloc[0])

    return frame


if __name__ == "__main__":
    train = pd.read_excel(path + 'data2.xlsx')
    train = fill_missing_values(train)

資料歸一化(此處的程式實際上是對 propextent 欄做 one-hot 編碼)

import pandas as pd
import numpy as np
import csv

# Directory that holds the input workbook.
path = "/home/public/GFQ/math_model/"

# Load the workbook that contains the ``propextent`` column.
train = pd.read_excel(f"{path}one.xlsx")

# Expand ``propextent`` into indicator columns whose names carry the
# 'propextent' prefix.
dum_propextent = pd.get_dummies(data=train['propextent'], prefix='propextent')

# Preview of the first three indicator rows; the result is not stored, so it
# is only visible when the snippet is run interactively.
dum_propextent.head(n=3)

# Append the indicator columns to the right of the original columns.
df = pd.concat(objs=[train, dum_propextent], axis=1)

# Write the widened table as UTF-8 CSV without the index column.
df.to_csv('one1.csv', encoding='utf-8', index=False)