
Intent Recognition with LSTM and Softmax

The script below (Python 2, Keras 1.x-era API) classifies user queries into three intents: QA question, music, and radio station. Queries are segmented with jieba, mapped to indices from a pre-trained Word2Vec model, fed through an Embedding layer and a single LSTM, and classified by a softmax output layer.

```python
# -*- coding: utf-8 -*-
import yaml
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
import multiprocessing
import numpy as np
import jieba
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder
from gensim.models.word2vec import Word2Vec
from gensim.corpora.dictionary import Dictionary
from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.models import Sequential, model_from_yaml
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Dropout, Activation

np.random.seed(1337)  # For Reproducibility
sys.setrecursionlimit(1000000)

# set parameters:
vocab_dim = 100        # dimensionality of the word vectors
maxlen = 100           # every sentence is padded/truncated to this many tokens
n_iterations = 1       # ideally more..
n_exposures = 10       # ignore words that appear fewer than 10 times
window_size = 7
batch_size = 32
n_epoch = 15
input_length = 100
cpu_count = multiprocessing.cpu_count()

# Load the training files: one intent per file, one query per line
def loadfile():
    fopen = open('data/question_query.txt', 'r')
    question = []
    for line in fopen:
        question.append(line)

    fopen = open('data/music_query.txt', 'r')
    music = []
    for line in fopen:
        music.append(line)

    fopen = open('data/station_query.txt', 'r')
    station = []
    for line in fopen:
        station.append(line)

    # keep the sample order aligned with the label order built below
    combined = np.concatenate((question, station, music))
    question_array = np.array([-1] * len(question), dtype=int)
    station_array = np.array([0] * len(station), dtype=int)
    music_array = np.array([1] * len(music), dtype=int)
    y = np.hstack((question_array, station_array, music_array))
    print "y is:"
    print y.size
    print "combined is:"
    print combined.size
    return combined, y

# Segment each sentence with jieba and strip the newline characters
def tokenizer(document):
    '''Simple parser: segments every query with jieba and returns
    space-joined token strings.
    '''
    result_list = []
    for text in document:
        result_list.append(' '.join(jieba.cut(text)).encode('utf-8').strip())
    return result_list

# Build the vocabulary: return each word's index, its word vector, and
# the padded index sequence for every sentence
def create_dictionaries(model=None, combined=None):
    '''The function does a number of jobs:
    1- creates a word-to-index mapping
    2- creates a word-to-vector mapping
    3- transforms the training and testing corpora into index sequences
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        # indices of all words whose frequency exceeds n_exposures (10);
        # index 0 is reserved for rare/unknown words
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}
        # word vectors of all words whose frequency exceeds 10
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(combined):
            '''Words become integers'''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence.split(' '):
                    try:
                        word = unicode(word, errors='ignore')
                        new_txt.append(w2indx[word])
                    except KeyError:
                        new_txt.append(0)  # out-of-vocabulary word
                data.append(new_txt)
            return data

        combined = parse_dataset(combined)
        # pad/truncate every index sequence to maxlen; rare words keep index 0
        combined = sequence.pad_sequences(combined, maxlen=maxlen)
        return w2indx, w2vec, combined
    else:
        print 'No data provided...'
```
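The post only ever loads `lstm_data/model/Word2vec_model.pkl` and never shows how it was built, yet several hyperparameters above (`n_exposures`, `window_size`, `n_iterations`, `cpu_count`) go unused. A plausible reading is that they belonged to the Word2Vec training step. The sketch below is an assumption, not the author's code, and `train_word2vec` is a hypothetical helper; it uses the gensim 1.x-3.x API that matches the rest of the script:

```python
# Assumed sketch: train and save the Word2Vec model the script loads,
# reusing the otherwise-unused hyperparameters defined above.
def train_word2vec(tokenized):
    # tokenizer() returns space-joined strings; gensim expects token lists
    sentences = [text.split(' ') for text in tokenized]
    model = Word2Vec(size=vocab_dim,          # 100-dimensional word vectors
                     min_count=n_exposures,   # drop words seen fewer than 10 times
                     window=window_size,
                     workers=cpu_count,
                     iter=n_iterations)
    model.build_vocab(sentences)
    model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
    model.save('lstm_data/model/Word2vec_model.pkl')
    return model
```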
```python
# Load the pre-trained Word2Vec model and index the corpus with it
def word2vec_train(combined):
    model = Word2Vec.load('lstm_data/model/Word2vec_model.pkl')
    index_dict, word_vectors, combined = create_dictionaries(model=model, combined=combined)
    return index_dict, word_vectors, combined

# Build the embedding matrix and the train/test split
def get_data(index_dict, word_vectors, combined, y):
    n_symbols = len(index_dict) + 1  # +1 because index 0 is reserved for rare words
    embedding_weights = np.zeros((n_symbols, vocab_dim))  # row 0 (rare words) stays all-zero
    for word, index in index_dict.items():  # fill in one vector per indexed word
        embedding_weights[index, :] = word_vectors[word]
    x_train, x_test, y_train, y_test = train_test_split(combined, y, test_size=0.2)
    # encode class values as integers
    encoder = LabelEncoder()
    encoded_y_train = encoder.fit_transform(y_train)
    encoded_y_test = encoder.transform(y_test)  # reuse the fitted encoder on the test labels
    # convert integers to dummy variables (one-hot encoding)
    y_train = np_utils.to_categorical(encoded_y_train)
    y_test = np_utils.to_categorical(encoded_y_test)
    print x_train.shape, y_train.shape
    return n_symbols, embedding_weights, x_train, y_train, x_test, y_test

# Define the network structure, train, evaluate, and save the model
def train_lstm(n_symbols, embedding_weights, x_train, y_train, x_test, y_test):
    nb_classes = 3
    print 'Defining a Simple Keras Model...'
    model = Sequential()
    # the Embedding layer turns the variable-length index sequences into
    # fixed-length sequences of vocab_dim-dimensional vectors
    model.add(Embedding(output_dim=vocab_dim,
                        input_dim=n_symbols,
                        mask_zero=True,
                        weights=[embedding_weights],
                        input_length=input_length))
    # a single LSTM layer with 50-dimensional output and relu activation
    model.add(LSTM(output_dim=50, activation='relu', inner_activation='hard_sigmoid'))
    model.add(Dropout(0.5))
    # softmax on top for the final 3-way classification
    model.add(Dense(output_dim=nb_classes, input_dim=50, activation='softmax'))
    print 'Compiling the Model...'
    # the optimiser is adam
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam', metrics=['accuracy'])

    print "Train..."
    print y_train
    model.fit(x_train, y_train, batch_size=batch_size, nb_epoch=n_epoch,
              verbose=1, validation_data=(x_test, y_test))
    print "Evaluate..."
    score = model.evaluate(x_test, y_test, batch_size=batch_size)
    yaml_string = model.to_yaml()
    with open('lstm_data/lstm_koubei.yml', 'w') as outfile:
        outfile.write(yaml.dump(yaml_string, default_flow_style=True))
    model.save_weights('lstm_data/lstm_koubei.h5')
    print 'Test score:', score

# Train the classifier end-to-end, and save it
def self_train():
    print 'Loading Data...'
    combined, y = loadfile()
    print len(combined), len(y)
    print 'Tokenising...'
    combined = tokenizer(combined)
    print 'Loading the Word2vec model...'
    index_dict, word_vectors, combined = word2vec_train(combined)
    print 'Setting up Arrays for Keras Embedding Layer...'
    n_symbols, embedding_weights, x_train, y_train, x_test, y_test = get_data(index_dict, word_vectors, combined, y)
    print x_train.shape, y_train.shape
    train_lstm(n_symbols, embedding_weights, x_train, y_train, x_test, y_test)

# Turn a single query string into the padded index matrix the model expects
def input_transform(string):
    words = ' '.join(jieba.cut(string)).encode('utf-8').strip()
    tmp_list = []
    tmp_list.append(words)
    model = Word2Vec.load('lstm_data/model/Word2vec_model.pkl')
    _, _, combined = create_dictionaries(model, tmp_list)
    return combined
```
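The source listing omits a block of code between `input_transform` and the `__main__` guard. Given the otherwise-unused `model_from_yaml` import and the artifacts that `train_lstm` writes to disk, the missing piece was presumably a prediction helper. The following is a hedged reconstruction, not the author's code; note that `train_lstm` passes `model.to_yaml()` through `yaml.dump`, so loading has to undo that extra encoding step:

```python
# Hypothetical reconstruction of the elided prediction code: load the saved
# architecture and weights, then classify a single query string.
def lstm_predict(string):
    with open('lstm_data/lstm_koubei.yml', 'r') as f:
        # train_lstm() wrapped model.to_yaml() in yaml.dump(), so one
        # yaml.load() here recovers the architecture string Keras expects
        yaml_string = yaml.load(f.read())
    model = model_from_yaml(yaml_string)
    model.load_weights('lstm_data/lstm_koubei.h5')
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam', metrics=['accuracy'])
    data = input_transform(string)         # shape (1, maxlen) index matrix
    classes = model.predict_classes(data)  # index of the highest softmax column
    return classes[0]
```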
```python
if __name__ == '__main__':
    self_train()
```
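One detail worth calling out at prediction time: `LabelEncoder` assigns integer classes in sorted order of the raw labels, so the class index the model returns maps back to intents deterministically. A small check, using only what the script already imports:

```python
from sklearn.preprocessing import LabelEncoder
import numpy as np

# LabelEncoder sorts the distinct raw labels, so with the labels built in
# loadfile() the softmax columns mean:
#   0 -> -1 (question), 1 -> 0 (station), 2 -> 1 (music)
encoder = LabelEncoder()
print encoder.fit_transform(np.array([-1, 0, 1]))  # prints [0 1 2]
```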