
Intent Recognition with LSTM and Softmax

The script below (Python 2, Keras 1.x-era API) classifies user queries into three intents: QA question, music, and radio station. Queries are segmented with jieba, mapped to indices from a pre-trained Word2Vec model, fed through an Embedding layer and a single LSTM, and classified by a softmax output layer.

```python
# -*- coding: utf-8 -*-
import yaml
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
import multiprocessing
import numpy as np
import jieba
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder
from gensim.models.word2vec import Word2Vec
from gensim.corpora.dictionary import Dictionary
from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.models import Sequential, model_from_yaml
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Dropout, Activation

np.random.seed(1337)  # For Reproducibility
sys.setrecursionlimit(1000000)

# set parameters:
vocab_dim = 100        # dimensionality of the word vectors
maxlen = 100           # every sentence is padded/truncated to this many tokens
n_iterations = 1       # ideally more..
n_exposures = 10       # ignore words that appear fewer than 10 times
window_size = 7
batch_size = 32
n_epoch = 15
input_length = 100
cpu_count = multiprocessing.cpu_count()

# Load the training files: one intent per file, one query per line
def loadfile():
    fopen = open('data/question_query.txt', 'r')
    question = []
    for line in fopen:
        question.append(line)

    fopen = open('data/music_query.txt', 'r')
    music = []
    for line in fopen:
        music.append(line)

    fopen = open('data/station_query.txt', 'r')
    station = []
    for line in fopen:
        station.append(line)

    # keep the sample order aligned with the label order built below
    combined = np.concatenate((question, station, music))
    question_array = np.array([-1] * len(question), dtype=int)
    station_array = np.array([0] * len(station), dtype=int)
    music_array = np.array([1] * len(music), dtype=int)
    y = np.hstack((question_array, station_array, music_array))
    print "y is:"
    print y.size
    print "combined is:"
    print combined.size
    return combined, y

# Segment each sentence with jieba and strip the newline characters
def tokenizer(document):
    '''Simple parser: segments every query with jieba and returns
    space-joined token strings.
    '''
    result_list = []
    for text in document:
        result_list.append(' '.join(jieba.cut(text)).encode('utf-8').strip())
    return result_list

# Build the vocabulary: return each word's index, its word vector, and
# the padded index sequence for every sentence
def create_dictionaries(model=None, combined=None):
    '''The function does a number of jobs:
    1- creates a word-to-index mapping
    2- creates a word-to-vector mapping
    3- transforms the training and testing corpora into index sequences
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        # indices of all words whose frequency exceeds n_exposures (10);
        # index 0 is reserved for rare/unknown words
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}
        # word vectors of all words whose frequency exceeds 10
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(combined):
            '''Words become integers'''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence.split(' '):
                    try:
                        word = unicode(word, errors='ignore')
                        new_txt.append(w2indx[word])
                    except KeyError:
                        new_txt.append(0)  # out-of-vocabulary word
                data.append(new_txt)
            return data

        combined = parse_dataset(combined)
        # pad/truncate every index sequence to maxlen; rare words keep index 0
        combined = sequence.pad_sequences(combined, maxlen=maxlen)
        return w2indx, w2vec, combined
    else:
        print 'No data provided...'
```
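The post only ever loads `lstm_data/model/Word2vec_model.pkl` and never shows how it was built, yet several hyperparameters above (`n_exposures`, `window_size`, `n_iterations`, `cpu_count`) go unused. A plausible reading is that they belonged to the Word2Vec training step. The sketch below is an assumption, not the author's code, and `train_word2vec` is a hypothetical helper; it uses the gensim 1.x-3.x API that matches the rest of the script:

```python
# Assumed sketch: train and save the Word2Vec model the script loads,
# reusing the otherwise-unused hyperparameters defined above.
def train_word2vec(tokenized):
    # tokenizer() returns space-joined strings; gensim expects token lists
    sentences = [text.split(' ') for text in tokenized]
    model = Word2Vec(size=vocab_dim,          # 100-dimensional word vectors
                     min_count=n_exposures,   # drop words seen fewer than 10 times
                     window=window_size,
                     workers=cpu_count,
                     iter=n_iterations)
    model.build_vocab(sentences)
    model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
    model.save('lstm_data/model/Word2vec_model.pkl')
    return model
```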
```python
# Load the pre-trained Word2Vec model and index the corpus with it
def word2vec_train(combined):
    model = Word2Vec.load('lstm_data/model/Word2vec_model.pkl')
    index_dict, word_vectors, combined = create_dictionaries(model=model, combined=combined)
    return index_dict, word_vectors, combined

# Build the embedding matrix and the train/test split
def get_data(index_dict, word_vectors, combined, y):
    n_symbols = len(index_dict) + 1  # +1 because index 0 is reserved for rare words
    embedding_weights = np.zeros((n_symbols, vocab_dim))  # row 0 (rare words) stays all-zero
    for word, index in index_dict.items():  # fill in one vector per indexed word
        embedding_weights[index, :] = word_vectors[word]
    x_train, x_test, y_train, y_test = train_test_split(combined, y, test_size=0.2)
    # encode class values as integers
    encoder = LabelEncoder()
    encoded_y_train = encoder.fit_transform(y_train)
    encoded_y_test = encoder.transform(y_test)  # reuse the fitted encoder on the test labels
    # convert integers to dummy variables (one-hot encoding)
    y_train = np_utils.to_categorical(encoded_y_train)
    y_test = np_utils.to_categorical(encoded_y_test)
    print x_train.shape, y_train.shape
    return n_symbols, embedding_weights, x_train, y_train, x_test, y_test

# Define the network structure, train, evaluate, and save the model
def train_lstm(n_symbols, embedding_weights, x_train, y_train, x_test, y_test):
    nb_classes = 3
    print 'Defining a Simple Keras Model...'
    model = Sequential()
    # the Embedding layer turns the variable-length index sequences into
    # fixed-length sequences of vocab_dim-dimensional vectors
    model.add(Embedding(output_dim=vocab_dim,
                        input_dim=n_symbols,
                        mask_zero=True,
                        weights=[embedding_weights],
                        input_length=input_length))
    # a single LSTM layer with 50-dimensional output and relu activation
    model.add(LSTM(output_dim=50, activation='relu', inner_activation='hard_sigmoid'))
    model.add(Dropout(0.5))
    # softmax on top for the final 3-way classification
    model.add(Dense(output_dim=nb_classes, input_dim=50, activation='softmax'))
    print 'Compiling the Model...'
    # the optimiser is adam
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam', metrics=['accuracy'])

    print "Train..."
    print y_train
    model.fit(x_train, y_train, batch_size=batch_size, nb_epoch=n_epoch,
              verbose=1, validation_data=(x_test, y_test))
    print "Evaluate..."
    score = model.evaluate(x_test, y_test, batch_size=batch_size)
    yaml_string = model.to_yaml()
    with open('lstm_data/lstm_koubei.yml', 'w') as outfile:
        outfile.write(yaml.dump(yaml_string, default_flow_style=True))
    model.save_weights('lstm_data/lstm_koubei.h5')
    print 'Test score:', score

# Train the classifier end-to-end, and save it
def self_train():
    print 'Loading Data...'
    combined, y = loadfile()
    print len(combined), len(y)
    print 'Tokenising...'
    combined = tokenizer(combined)
    print 'Loading the Word2vec model...'
    index_dict, word_vectors, combined = word2vec_train(combined)
    print 'Setting up Arrays for Keras Embedding Layer...'
    n_symbols, embedding_weights, x_train, y_train, x_test, y_test = get_data(index_dict, word_vectors, combined, y)
    print x_train.shape, y_train.shape
    train_lstm(n_symbols, embedding_weights, x_train, y_train, x_test, y_test)

# Turn a single query string into the padded index matrix the model expects
def input_transform(string):
    words = ' '.join(jieba.cut(string)).encode('utf-8').strip()
    tmp_list = []
    tmp_list.append(words)
    model = Word2Vec.load('lstm_data/model/Word2vec_model.pkl')
    _, _, combined = create_dictionaries(model, tmp_list)
    return combined
```
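The source listing omits a block of code between `input_transform` and the `__main__` guard. Given the otherwise-unused `model_from_yaml` import and the artifacts that `train_lstm` writes to disk, the missing piece was presumably a prediction helper. The following is a hedged reconstruction, not the author's code; note that `train_lstm` passes `model.to_yaml()` through `yaml.dump`, so loading has to undo that extra encoding step:

```python
# Hypothetical reconstruction of the elided prediction code: load the saved
# architecture and weights, then classify a single query string.
def lstm_predict(string):
    with open('lstm_data/lstm_koubei.yml', 'r') as f:
        # train_lstm() wrapped model.to_yaml() in yaml.dump(), so one
        # yaml.load() here recovers the architecture string Keras expects
        yaml_string = yaml.load(f.read())
    model = model_from_yaml(yaml_string)
    model.load_weights('lstm_data/lstm_koubei.h5')
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam', metrics=['accuracy'])
    data = input_transform(string)         # shape (1, maxlen) index matrix
    classes = model.predict_classes(data)  # index of the highest softmax column
    return classes[0]
```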
```python
if __name__ == '__main__':
    self_train()
```
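One detail worth calling out at prediction time: `LabelEncoder` assigns integer classes in sorted order of the raw labels, so the class index the model returns maps back to intents deterministically. A small check, using only what the script already imports:

```python
from sklearn.preprocessing import LabelEncoder
import numpy as np

# LabelEncoder sorts the distinct raw labels, so with the labels built in
# loadfile() the softmax columns mean:
#   0 -> -1 (question), 1 -> 0 (station), 2 -> 1 (music)
encoder = LabelEncoder()
print encoder.fit_transform(np.array([-1, 0, 1]))  # prints [0 1 2]
```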