# 唐宇迪Seq2Seq程式碼+註釋(tensorflow1.2版本)
# 阿新 • • 發佈:2018-12-15
import re
import time

import numpy as np
import pandas as pd
import tensorflow as tf

# Raw string: the path contains backslashes ("\D", "\R") that are not valid
# escape sequences in a normal string literal.
filename = r'E:\DataSets\Reviews.csv\Reviews.csv'
reviews = pd.read_csv(filename)
# print(reviews.isnull().sum())

# Drop rows that contain missing values.
reviews = reviews.dropna()

# Drop the columns that are not needed. `axis` is passed by keyword because
# newer pandas releases no longer accept it positionally.
reviews = reviews.drop(
    ['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
     'HelpfulnessDenominator', 'Score', 'Time'],
    axis=1)

# Renumber the index after rows were dropped.
reviews = reviews.reset_index(drop=True)
# print(reviews.head())  # peek at the first rows of what is left

# Lookup table used to expand English contractions (keys are lowercase).
contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    # Was mapped to "could not have", which is the expansion of "couldn't've".
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that'd": "that would",
    "that's": "that is",
    "there'd": "there had",
    "there's": "there is",
    "they'd": "they would",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what've": "what have",
    "what'd": "what did",
    "where's": "where is",
    "who'll": "who will",
    "who's": "who is",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
}
# Stopword sets keyed by file path, so the stopword file is read once instead
# of once per cleaned review.
_STOPWORDS_CACHE = {}


def _load_stopwords(path="Englishstopwords.txt"):
    """Return the stopwords stored on the first line of *path* as a set.

    The first line is split on single spaces and the last token is dropped,
    exactly as the original inline code did. An empty file yields an empty set.
    """
    if path not in _STOPWORDS_CACHE:
        with open(path, 'r') as handle:
            lines = handle.readlines()
        tokens = lines[0].split(" ")[0:-1] if lines else []
        _STOPWORDS_CACHE[path] = set(tokens)
    return _STOPWORDS_CACHE[path]


def clean_text(text, remove_stopwords=True):
    """Lowercase *text*, expand contractions and strip markup/punctuation.

    Args:
        text: raw review text or summary (str).
        remove_stopwords: when true, words listed in ``Englishstopwords.txt``
            are removed as a final step.

    Returns:
        The cleaned text as a single space-joined string, e.g.
        ``'i want to rock you'``.
    """
    text = text.lower()

    # Expand contractions word by word, then rebuild the sentence.
    text = " ".join(contractions.get(word, word) for word in text.split())

    # Strip URLs, HTML remnants and special characters.
    # NOTE(review): whitespace was collapsed above, so ".*" here removes
    # everything after a URL up to the end of the text - confirm that is wanted.
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    # Remove the HTML entity only; a bare '&' is turned into a space by the
    # character class below (a bare '&' pattern here would glue words together).
    text = re.sub(r'&amp;', '', text)
    # '$@' restored: the scraped copy had them replaced by an e-mail-protection
    # placeholder and a line break inside the character class.
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)

    if remove_stopwords:
        stops = _load_stopwords()
        text = " ".join(w for w in text.split() if w not in stops)
    return text


# clean_summaries and clean_texts look like
# ['i want to rock you', 'you will win the championship', ...]
clean_summaries = [clean_text(summary, remove_stopwords=False)
                   for summary in reviews.Summary]
print("Summaries are completed")

clean_texts = [clean_text(text, remove_stopwords=True)
               for text in reviews.Text]
print("Texts are completed")


def count_words(count_dict, text):
    """Add the word frequencies of every sentence in *text* to *count_dict*.

    Args:
        count_dict: dict mapping word -> count; updated in place.
        text: iterable of whitespace-tokenisable sentences.
    """
    for sentence in text:
        for word in sentence.split():
            count_dict[word] = count_dict.get(word, 0) + 1


# word_counts maps each distinct word to its frequency, e.g. {'many': 897, ...}
word_counts = {}
count_words(word_counts, clean_summaries)
count_words(word_counts, clean_texts)
print("Size of Vocabulary :", len(word_counts))

embedding_dim = 300

# Load the pre-trained vectors: embeddings_index maps word -> float32 vector.
embeddings_index = {}
with open(r'E:\word2vecmodel\numberbatch-en-17.04b.txt', 'r', encoding='utf8') as f:
    for line in f:
        values = line.rstrip().split(' ')
        # Skip rows that do not carry a full vector (e.g. a header line);
        # they could not be stored in the embedding matrix anyway.
        if len(values) != embedding_dim + 1:
            continue
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

# Words seen fewer than `threshold` times are not kept unless they have a
# pre-trained vector.
threshold = 20

# Frequent words that have no pre-trained vector. Uses the same ">=" test as
# the vocabulary selection below so the statistic matches what is kept.
missing_words = sum(
    1 for word, count in word_counts.items()
    if count >= threshold and word not in embeddings_index)
missing_ratio = round(missing_words / len(word_counts) * 100, 2)
print("Number of words missing from CN:", missing_words)
print("Percent of words that are missing from vocabulary: {}%".format(missing_ratio))

# Map every kept word to a consecutive integer id.
vocab_to_int = {}
for word, count in word_counts.items():
    if count >= threshold or word in embeddings_index:
        vocab_to_int[word] = len(vocab_to_int)

# Special tokens get the ids following the regular vocabulary.
codes = ["<UNK>", "<PAD>", "<EOS>", "<GO>"]
for code in codes:
    vocab_to_int[code] = len(vocab_to_int)

# Reverse mapping: id -> word.
int_to_vocab = {index: word for word, index in vocab_to_int.items()}

usage_ratio = round(len(vocab_to_int) / len(word_counts) * 100, 2)
print("Total number of unique words:", len(word_counts))
print("Number of words we will use:", len(vocab_to_int))
print("Percent of words we will use: {}%".format(usage_ratio))

nb_words = len(vocab_to_int)
# Embedding matrix of shape (nb_words, embedding_dim); rows are indexed by
# word id. Words without a pre-trained vector get a random uniform one.
word_embedding_matrix = np.zeros((nb_words, embedding_dim), dtype=np.float32)
for word, i in vocab_to_int.items():
    if word in embeddings_index:
        word_embedding_matrix[i] = embeddings_index[word]
    else:
        new_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dim))
        embeddings_index[word] = new_embedding
        word_embedding_matrix[i] = new_embedding


def convert_to_ints(text, word_count, unk_count, eos=False):
    """Convert sentences to lists of word ids.

    Args:
        text: iterable of cleaned sentences.
        word_count: running total of words seen so far.
        unk_count: running total of words mapped to ``<UNK>`` so far.
        eos: when true, append the ``<EOS>`` id to every sentence.

    Returns:
        ``(ints, word_count, unk_count)`` where ``ints`` looks like
        ``[[1, 234, 7687, ...], [345, 908, ...], ...]`` and the two counters
        include the words processed by this call.
    """
    unk_id = vocab_to_int['<UNK>']
    ints = []
    for sentence in text:
        sentence_ints = []
        for word in sentence.split():
            word_count += 1
            if word in vocab_to_int:
                sentence_ints.append(vocab_to_int[word])
            else:
                sentence_ints.append(unk_id)
                unk_count += 1
        if eos:
            sentence_ints.append(vocab_to_int['<EOS>'])
        ints.append(sentence_ints)
    return ints, word_count, unk_count


word_count = 0
unk_count = 0
# int_summaries and int_texts (built next) look like
# [[1, 234, 7687, 23, ...], [345, 908, 2359, 11234, ...], ...]
int_summaries, word_count, unk_count = convert_to_ints(clean_summaries, word_count, unk_count)
int_texts, word_count, unk_count = convert_to_ints(clean_texts, word_count, unk_count, eos=True)


def create_lengths(text):
    """Return a DataFrame with one ``counts`` column holding each item's length."""
    return pd.DataFrame([len(sentence) for sentence in text], columns=['counts'])


lengths_summaries = create_lengths(int_summaries)
lengths_texts = create_lengths(int_texts)

# Length distribution of the texts.
print(np.percentile(lengths_texts.counts, 90))
print(np.percentile(lengths_texts.counts, 95))
print(np.percentile(lengths_texts.counts, 99))

# Length distribution of the summaries.
print(np.percentile(lengths_summaries.counts, 90))
print(np.percentile(lengths_summaries.counts, 95))
print(np.percentile(lengths_summaries.counts, 99))


def unk_counter(setence):
    """Return how many ids in *setence* equal the ``<UNK>`` id."""
    unk_id = vocab_to_int['<UNK>']
    return sum(1 for word in setence if word == unk_id)


max_text_length = 84
max_summary_length = 13
min_length = 2
unk_text_limit = 1
unk_summary_limit = 0

# Keep only the usable (summary, text) pairs and order them by text length.
# One filtering pass plus a stable sort gives the same order as scanning the
# whole dataset once per candidate length, without the repeated work.
# NOTE(review): the text-side tests are strict ("> min_length",
# "< unk_text_limit") while the summary-side ones are inclusive; kept exactly
# as written - confirm this asymmetry is intended.
_kept_pairs = []
for _summary_ints, _text_ints in zip(int_summaries, int_texts):
    if not (min_length <= len(_summary_ints) <= max_summary_length):
        continue
    if not (min_length < len(_text_ints) < max_text_length):
        continue
    if unk_counter(_summary_ints) > unk_summary_limit:
        continue
    if not unk_counter(_text_ints) < unk_text_limit:
        continue
    _kept_pairs.append((_summary_ints, _text_ints))
_kept_pairs.sort(key=lambda pair: len(pair[1]))

sorted_summaries = [pair[0] for pair in _kept_pairs]
sorted_texts = [pair[1] for pair in _kept_pairs]
# sorted_texts / sorted_summaries are now in ascending text-length order.


def model_inputs():
    """Create the graph placeholders.

    Returns:
        ``(input_data, targets, lr, keep_prob, summary_length,
        max_summary_length, text_length)``. ``max_summary_length`` is the
        ``reduce_max`` of the ``summary_length`` placeholder; every other item
        is a placeholder named ``input``, ``targets``, ``learning_rate``,
        ``keep_prob``, ``summary_length`` and ``text_length`` respectively.
    """
    input_data = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    lr = tf.placeholder(tf.float32, name='learning_rate')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    summary_length = tf.placeholder(tf.int32, (None, ), name='summary_length')
    max_summary_length = tf.reduce_max(summary_length, name='max_dec_len')
    text_length = tf.placeholder(tf.int32, (None, ), name='text_length')
    return input_data, targets, lr, keep_prob, summary_length, max_summary_length, text_length


def process_encoding_input(target_data, vocab_to_int, batch_size):
    """Build the decoder input from the target (summary) batch.

    Drops the last column of *target_data* and prepends a column filled with
    the ``<GO>`` id.
    """
    ending = tf.strided_slice(target_data, [0, 0], [batch_size, -1], [1, 1])
    dec_input = tf.concat([tf.fill([batch_size, 1], vocab_to_int['<GO>']), ending], 1)
    return dec_input


def encoding_layer(rnn_size, sequence_length, num_layers, rnn_inputs, keep_prob):
    """Build a stack of bidirectional LSTM encoder layers.

    Args:
        rnn_size: number of units of each LSTM cell.
        sequence_length: tensor with the length of every input sequence.
        num_layers: number of bidirectional layers to stack.
        rnn_inputs: embedded encoder input.
        keep_prob: input keep probability for dropout.

    Returns:
        ``(enc_output, enc_state)`` of the last layer; ``enc_output`` is the
        forward and backward outputs concatenated along axis 2.
    """
    layer_inputs = rnn_inputs
    for layer in range(num_layers):
        with tf.variable_scope('encoder_{}'.format(layer)):
            cell_fw = tf.nn.rnn_cell.BasicLSTMCell(rnn_size)
            cell_fw = tf.nn.rnn_cell.DropoutWrapper(cell_fw, input_keep_prob=keep_prob)
            cell_bw = tf.nn.rnn_cell.BasicLSTMCell(rnn_size)
            cell_bw = tf.nn.rnn_cell.DropoutWrapper(cell_bw, input_keep_prob=keep_prob)
            enc_output, enc_state = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, layer_inputs, sequence_length, dtype=tf.float32)
            enc_output = tf.concat(enc_output, 2)
            # Feed this layer's output to the next one. Previously every layer
            # read the raw inputs, so only the last layer had any effect.
            layer_inputs = enc_output
    return enc_output, enc_state


def training_decoding_layer(dec_embed_input, summary_length, dec_cell, initial_state, output_layer,
                            vocab_size, max_summary_length):
    """Build the training decoder and return the result of ``dynamic_decode``.

    The decoder reads the embedded target sequence through a
    ``TrainingHelper``; decoding stops after ``max_summary_length`` steps.
    """
    training_helper = tf.contrib.seq2seq.TrainingHelper(
        inputs=dec_embed_input, sequence_length=summary_length, time_major=False)
    training_decoder = tf.contrib.seq2seq.BasicDecoder(
        dec_cell, training_helper, initial_state, output_layer)
    training_logits, _ = tf.contrib.seq2seq.dynamic_decode(
        training_decoder, output_time_major=False, impute_finished=True,
        maximum_iterations=max_summary_length)
    return training_logits


def inference_decoding_layer(embeddings, start_token, end_token, dec_cell, initial_state, output_layer,
                             max_summary_length, batch_size):
    """Build the greedy inference decoder and return the ``dynamic_decode`` result.

    Decoding starts from *start_token* (tiled to *batch_size*) and uses
    *end_token* as the end symbol of the ``GreedyEmbeddingHelper``.
    """
    start_tokens = tf.tile(tf.constant([start_token], dtype=tf.int32), [batch_size], name='start_token')
    inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embeddings, start_tokens, end_token)
    inference_decoder = tf.contrib.seq2seq.BasicDecoder(
        dec_cell, inference_helper, initial_state, output_layer)
    inference_logits, _ = tf.contrib.seq2seq.dynamic_decode(
        inference_decoder, output_time_major=False, impute_finished=True,
        maximum_iterations=max_summary_length)
    return inference_logits


def decoding_layer(dec_embed_input, embeddings, enc_output, enc_state, vocab_size, text_length, summary_length,
                   max_summary_length, rnn_size, vocab_to_int, keep_prob, batch_size, num_layers):
    """Build the attention decoder for training and for inference.

    Returns:
        ``(training_logits, inference_logits)`` as produced by
        ``training_decoding_layer`` and ``inference_decoding_layer``; both
        share variables through the ``decode`` scope.
    """
    # NOTE(review): each iteration overwrites dec_cell, so only the cell of
    # the last iteration is used - the decoder is effectively single-layer.
    for layer in range(num_layers):
        with tf.variable_scope('decoder_{}'.format(layer)):
            lstm = tf.nn.rnn_cell.LSTMCell(
                rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            dec_cell = tf.nn.rnn_cell.DropoutWrapper(lstm, input_keep_prob=keep_prob)

    # Dense projection from the decoder output to vocabulary logits.
    output_layer = tf.layers.Dense(
        vocab_size, kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1))

    # Bahdanau attention over the encoder outputs.
    attn_mech = tf.contrib.seq2seq.BahdanauAttention(rnn_size, enc_output, text_length, normalize=False)
    dec_cell = tf.contrib.seq2seq.DynamicAttentionWrapper(dec_cell, attn_mech, rnn_size)

    # The wrapper state needs both the cell state and an initial attention
    # vector; the attention argument was missing. Start from zeros with the
    # wrapper's attention size (rnn_size).
    # NOTE(review): DynamicAttentionWrapper* are early tf.contrib.seq2seq
    # names - confirm they exist in the installed TensorFlow version.
    initial_state = tf.contrib.seq2seq.DynamicAttentionWrapperState(
        cell_state=enc_state[0],
        attention=tf.zeros([batch_size, rnn_size], dtype=tf.float32))

    with tf.variable_scope("decode"):
        training_logits = training_decoding_layer(
            dec_embed_input, summary_length, dec_cell, initial_state, output_layer,
            vocab_size, max_summary_length)
    with tf.variable_scope("decode", reuse=True):
        inference_logits = inference_decoding_layer(
            embeddings, vocab_to_int['<GO>'], vocab_to_int['<EOS>'], dec_cell, initial_state,
            output_layer, max_summary_length, batch_size)
    return training_logits, inference_logits
def seq2seq_model(input_data, target_data, keep_prob, text_length, summary_length, max_summary_length,
                  vocab_size, rnn_size, num_layers, vocab_to_int, batch_size):
    """Wire the encoder and decoder together.

    Looks up embeddings for the input and for the ``<GO>``-prefixed targets in
    ``word_embedding_matrix``, runs ``encoding_layer`` and ``decoding_layer``.

    Returns:
        ``(training_logits, inference_logits)`` from ``decoding_layer``.
    """
    embeddings = word_embedding_matrix
    enc_embed_input = tf.nn.embedding_lookup(embeddings, input_data)
    enc_output, enc_state = encoding_layer(rnn_size, text_length, num_layers, enc_embed_input, keep_prob)
    dec_input = process_encoding_input(target_data, vocab_to_int, batch_size)
    dec_embed_input = tf.nn.embedding_lookup(embeddings, dec_input)
    training_logits, inference_logits = decoding_layer(
        dec_embed_input, embeddings, enc_output, enc_state, vocab_size, text_length, summary_length,
        max_summary_length, rnn_size, vocab_to_int, keep_prob, batch_size, num_layers)
    return training_logits, inference_logits


def pad_sentence_batch(sentence_batch):
    """Right-pad every sentence with ``<PAD>`` ids to the longest one in the batch."""
    max_sentence = max([len(sentence) for sentence in sentence_batch])
    pad_id = vocab_to_int['<PAD>']
    return [sentence + [pad_id] * (max_sentence - len(sentence)) for sentence in sentence_batch]


def get_batches(summaries, texts, batch_size):
    """Yield padded batches of summaries and texts with their lengths.

    Only full batches are produced. Each yielded item is
    ``(pad_summaries_batch, pad_texts_batch, pad_summaries_lengths,
    pad_texts_lengths)``; the lengths are those of the padded rows.
    """
    for batch_i in range(0, len(texts) // batch_size):
        start_i = batch_i * batch_size
        summaries_batch = summaries[start_i:start_i + batch_size]
        texts_batch = texts[start_i:start_i + batch_size]
        pad_summaries_batch = np.array(pad_sentence_batch(summaries_batch))
        pad_texts_batch = np.array(pad_sentence_batch(texts_batch))

        pad_summaries_lengths = [len(summary) for summary in pad_summaries_batch]
        # Text lengths go into their own list; they used to be appended to the
        # summary lengths, leaving this list empty.
        pad_texts_lengths = [len(text) for text in pad_texts_batch]

        yield pad_summaries_batch, pad_texts_batch, pad_summaries_lengths, pad_texts_lengths


epochs = 100
batch_size = 64
rnn_size = 256
num_layers = 2
learning_rate = 0.005
keep_probability = 0.75

train_graph = tf.Graph()
with train_graph.as_default():
    # model_inputs() returns seven values, including the max_summary_length
    # tensor that the decoder and the loss mask need.
    (input_data, targets, lr, keep_prob, summary_length,
     max_summary_length, text_length) = model_inputs()

    # The input sequence is reversed along its last axis before encoding.
    training_logits, inference_logits = seq2seq_model(
        tf.reverse(input_data, [-1]), targets, keep_prob, text_length, summary_length,
        max_summary_length, len(vocab_to_int), rnn_size, num_layers, vocab_to_int, batch_size)

    # Named tensors so they can be fetched again after reloading the graph.
    training_logits = tf.identity(training_logits.rnn_output, 'logits')
    inference_logits = tf.identity(inference_logits.sample_id, name='predictions')

    # Mask so that padded positions do not contribute to the loss.
    masks = tf.sequence_mask(summary_length, max_summary_length, dtype=tf.float32, name='masks')

    with tf.name_scope("optimization"):
        cost = tf.contrib.seq2seq.sequence_loss(training_logits, targets, masks)
        # Use the `lr` placeholder so the decayed rate fed at every step is
        # actually applied (the Python float was baked in before).
        optimizer = tf.train.AdamOptimizer(lr)
        gradients = optimizer.compute_gradients(cost)
        # Clip every gradient element to [-5, 5].
        capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var)
                            for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)

    # One saver for the whole run instead of a new one at every checkpoint.
    saver = tf.train.Saver()
print("Graph is built")

# Train on a 5000-example slice of the length-sorted data.
start = 200000
end = start + 5000
sorted_summaries_short = sorted_summaries[start:end]
sorted_texts_short = sorted_texts[start:end]

learning_rate_decay = 0.95
min_learning_rate = 0.0005
display_step = 20
stop_early = 0
stop = 3
per_epoch = 3
# At least 1 so the modulo below is well defined on small datasets.
update_check = max(1, (len(sorted_texts_short) // batch_size // per_epoch) - 1)
update_loss = 0
batch_loss = 0
summary_update_loss = []
checkpoint = "best_model.ckpt"

with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    for epoch_i in range(1, epochs + 1):
        update_loss = 0
        batch_loss = 0
        for batch_i, (summaries_batch, texts_batch, summaries_lengths, texts_lengths) in enumerate(
                get_batches(sorted_summaries_short, sorted_texts_short, batch_size)):
            start_time = time.time()
            _, loss = sess.run(
                [train_op, cost],
                {input_data: texts_batch,
                 targets: summaries_batch,
                 lr: learning_rate,
                 summary_length: summaries_lengths,
                 text_length: texts_lengths,
                 keep_prob: keep_probability})
            batch_loss += loss
            update_loss += loss
            end_time = time.time()
            batch_time = end_time - start_time

            if batch_i % display_step == 0 and batch_i > 0:
                print('Epoch{:>3}/{} Batch {:>4}/{} - Loss: {:>6.3f}, Seconds:{:>4.2f}'.format(
                    epoch_i, epochs, batch_i, len(sorted_texts_short) // batch_size,
                    batch_loss / display_step, batch_time * display_step))
                # Start a fresh window so the next report is an average over
                # display_step batches, not a running total.
                batch_loss = 0

            if batch_i % update_check == 0 and batch_i > 0:
                print("Average loss for this update:", round(update_loss / update_check, 3))
                summary_update_loss.append(update_loss)
                # Save the model whenever the update loss reaches a new minimum.
                if update_loss <= min(summary_update_loss):
                    print('New Record')
                    stop_early = 0
                    saver.save(sess, checkpoint)
                else:
                    print('No Improvement')
                    stop_early += 1
                    if stop_early == stop:
                        break
                update_loss = 0

        # Decay the learning rate after every epoch, but not below the minimum.
        learning_rate *= learning_rate_decay
        if learning_rate < min_learning_rate:
            learning_rate = min_learning_rate

        if stop_early == stop:
            print("Stopping Training")
            break


# Try the trained model on one randomly chosen text.
def text_to_seq(text):
    """Clean *text* and convert it to a list of word ids (unknown words -> ``<UNK>``)."""
    text = clean_text(text)
    return [vocab_to_int.get(word, vocab_to_int['<UNK>']) for word in text.split()]


random = np.random.randint(0, len(clean_texts))
input_sentence = clean_texts[random]
text = text_to_seq(clean_texts[random])

checkpoint = './best_model.ckpt'
loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    loader = tf.train.import_meta_graph(checkpoint + '.meta')
    loader.restore(sess, checkpoint)

    input_data = loaded_graph.get_tensor_by_name('input:0')
    logits = loaded_graph.get_tensor_by_name('predictions:0')
    text_length = loaded_graph.get_tensor_by_name('text_length:0')
    summary_length = loaded_graph.get_tensor_by_name('summary_length:0')
    # Was fetched as 'input:0', which made the feed below overwrite the input.
    keep_prob = loaded_graph.get_tensor_by_name('keep_prob:0')

    # The graph was built for a fixed batch size, so the single text is
    # repeated batch_size times; one length entry is supplied per row.
    answer_logits = sess.run(
        logits,
        {input_data: [text] * batch_size,
         summary_length: [np.random.randint(5, 8)],
         text_length: [len(text)] * batch_size,
         keep_prob: 1.0})[0]

pad = vocab_to_int["<PAD>"]
print('Original Text:', input_sentence)
print('\nText')
print('Word Ids: {}'.format([i for i in text]))
print('Input Words: {}'.format(" ".join([int_to_vocab[i] for i in text])))
print("\nSummary")
print('Word Ids: {}'.format([i for i in answer_logits if i != pad]))
print('Response Words: {}'.format(" ".join([int_to_vocab[i] for i in answer_logits if i != pad])))