1. 程式人生 > 特徵工程(五)length

特徵工程(五)length

"""
Compute the length feature (number of words) of the raw data's `word_seg`
column and save the results to local CSV files.

The article feature can be processed in a similar way.
"""
# `pd` is used throughout this script; import it here so the script runs.
import pandas as pd

# Raw training and test sets, read from the current working directory.
df_train = pd.read_csv('train_set.csv')
df_test = pd.read_csv('test_set.csv')


def get_word_len(df_series):
    """Return the number of space-separated tokens in each row.

    Args:
        df_series: Iterable of strings (for example a pandas Series of
            space-delimited text).

    Returns:
        A list of ints, one per row and in input order, giving the number
        of pieces produced by splitting the row on a single space.

    Note:
        Splitting uses ``split(' ')``, not ``split()``: an empty string
        counts as 1 and consecutive spaces yield empty pieces that are
        counted as well.
    """
    return [len(row.split(' ')) for row in df_series]

# Build one (id, word_len) table per dataset: training set first, then test set.
df_train_word, df_test_word = (
    pd.DataFrame({
        'id': frame['id'].values.tolist(),
        'word_len': get_word_len(frame['word_seg']),
    })
    for frame in (df_train, df_test)
)

# Write both feature tables to CSV without the DataFrame index column.
df_train_word.to_csv('./train_word_len.csv', index=False)
df_test_word.to_csv('./test_word_len.csv', index=False)