Python 對於資料處理所會用到的一般操作
阿新 • • 發佈:2018-11-01
xlsx檔案轉為csv
import pandas as pd#需要用到的包
import numpy as np#需要用到的包
# Directory that holds the input spreadsheet.
path = "/home/public/GFQ/math_model/"
# Directory followed by the file name.
filepath_poi = "{}data.xlsx".format(path)
# Read the Excel file into a DataFrame stored in data1.
data1 = pd.read_excel(io=filepath_poi)
# Write the frame out as UTF-8 CSV, leaving out the index column.
data1.to_csv(path_or_buf='select.csv', encoding='utf-8', index=False)
按概率統計
import csv  # csv.reader / csv.writer are used below but the module was never imported here

# Fixed denominator used by the original snippet.
# NOTE(review): presumably the total number of records in the dataset - confirm.
TOTAL_ROWS = 114184

# Per-code occurrence counter, indexed by the integer found in column 8
# (the original note calls this the "country probability").
data = np.zeros((1005))

# First pass: count how often each integer code appears in column 8.
# NOTE(review): csv_file is not defined in this snippet; it must be set beforehand.
# newline='' is what the csv module requires for files handed to reader/writer.
with open(csv_file, newline='') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row (tolerates an empty file)
    for row in reader:
        data[int(row[8])] += 1

# Second pass: copy the file to pre.csv, replacing column 8 with
# count / TOTAL_ROWS for that row's code.
with open('pre.csv', mode="w", newline='') as f, open(csv_file, newline='') as f01:
    writer = csv.writer(f)
    reader = csv.reader(f01)
    first_row = next(reader, None)
    if first_row is not None:
        writer.writerow(first_row)  # header is copied through unchanged
    for row in reader:
        row[8] = data[int(row[8])] * 1.0 / TOTAL_ROWS
        writer.writerow(row)
或者
import csv  # csv.reader / csv.writer are used below but the module was never imported here
from collections import Counter

# Fixed denominator used by the original snippet.
# NOTE(review): presumably the total number of records in the dataset - confirm.
TOTAL_ROWS = 114184

# Occurrence counts of each distinct value in column 31 (d) and column 32 (e).
# These are Counters now: d[value] is the count itself rather than a
# one-element list holding it.
d = Counter()
e = Counter()

# First pass: tally the distinct values of both columns.
# one1.csv is written as UTF-8 elsewhere in this file, so read it the same
# way; newline='' is what the csv module requires for its file objects.
with open('one1.csv', newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row (tolerates an empty file)
    for row in reader:
        d[row[31]] += 1
        e[row[32]] += 1

# Second pass: copy one1.csv to one2.csv, replacing columns 31 and 32 with
# count / TOTAL_ROWS for the value each row holds.
# (Original note here, meaning unclear: "city control, fill from the front".)
with open('one2.csv', mode="w", newline='', encoding='utf-8') as f, \
        open('one1.csv', newline='', encoding='utf-8') as f01:
    writer = csv.writer(f)
    reader = csv.reader(f01)
    first_row = next(reader, None)
    if first_row is not None:
        writer.writerow(first_row)  # header is copied through unchanged
    for row in reader:
        row[31] = d[row[31]] * 1.0 / TOTAL_ROWS
        row[32] = e[row[32]] * 1.0 / TOTAL_ROWS
        writer.writerow(row)
缺失值處理(處理完記得檢查缺失值是否確實補上,例如 np.isnan(train['doubtterr']).any() 應為 False)
import pandas as pd
import numpy as np
import csv

path = "/home/public/GFQ/math_model/"
train = pd.read_excel(path + 'data2.xlsx')

# Columns whose missing entries are replaced by the column mean.
# (The original listed 'nkillus' twice; once is enough.)
mean_filled_columns = [
    'nkill',
    'nkillus',
    'nkillter',
    'nwound',
    'nwoundus',
    'nwoundte',
    'nperpcap',
]
for column in mean_filled_columns:
    train[column] = train[column].fillna(train[column].mean())

# Missing 'multiple' entries become 0. The original assigned the 'nwoundte'
# column here, which overwrote 'multiple' with unrelated data.
train['multiple'] = train['multiple'].fillna(0)

# Columns whose missing entries are replaced by the most frequent value (mode).
mode_filled_columns = [
    'guncertain1',
    'doubtterr',
    'gname',
    'claimmode',
    'ishostkid',
]
for column in mode_filled_columns:
    # Series.mode() returns a Series; passing it straight to fillna aligns it
    # by index and leaves most gaps unfilled, so take the first mode as a scalar.
    modes = train[column].mode()
    if not modes.empty:  # a column with no non-missing values has no mode
        train[column] = train[column].fillna(modes.iloc[0])
資料歸一化(此處以 one-hot 編碼處理類別欄位 propextent)
import pandas as pd
import numpy as np
import csv
# Directory that holds the input spreadsheet.
path = "/home/public/GFQ/math_model/"
train = pd.read_excel("{}one.xlsx".format(path))
# Indicator (dummy) columns for the values of 'propextent', each named with
# the 'propextent' prefix.
dum_propextent = pd.get_dummies(data=train['propextent'], prefix='propextent')
# Result is discarded here; this only shows something in an interactive session.
dum_propextent.head(3)
# Put the indicator columns next to the original columns.
df = pd.concat(objs=[train, dum_propextent], axis=1)
# Save the combined frame as UTF-8 CSV, leaving out the index column.
df.to_csv(path_or_buf='one1.csv', encoding='utf-8', index=False)