
唐宇迪 Seq2Seq code with comments (TensorFlow 1.2)

import pandas as pd
import re
import numpy as np
import tensorflow as tf
import time

filename = r'E:\DataSets\Reviews.csv\Reviews.csv'  # raw string so the backslashes are not treated as escapes
reviews = pd.read_csv(filename)
# print(reviews.isnull().sum())
# Drop rows with missing values
reviews = reviews.dropna()
# Drop the columns that are not needed
reviews = reviews.drop(
    ['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Score', 'Time'], 1)
# Re-index the remaining rows
reviews = reviews.reset_index(drop=True)
# print(reviews.head())  # show the first few Text and Summary rows after cleaning
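# After the drop/reset, only the two columns used below should remain
# (a quick check, not part of the original post):
#   print(reviews.columns)  # expected: Index(['Summary', 'Text'], dtype='object')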
# Dictionary used to expand English contractions
contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that'd": "that would",
    "that's": "that is",
    "there'd": "there had",
    "there's": "there is",
    "they'd": "they would",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what've": "what have",
    "what'd": "what did",
    "where's": "where is",
    "who'll": "who will",
    "who's": "who is",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are"
}
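# A quick illustration (not part of the original post; the sample sentence is made up) of how
# this table is applied token by token inside clean_text() below:
#   sample = "i've heard it's great but they'd disagree"
#   " ".join(contractions.get(w, w) for w in sample.split())
#   -> "i have heard it is great but they would disagree"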

# Clean the text: lower-case everything, expand contractions and strip noise,
# so the result is a plain string like 'i want to rock you'
def clean_text(text, remove_stopwords=True):
    text = text.lower()
    # Expand contractions word by word
    text = text.split()
    new_text = []
    for word in text:
        if word in contractions:
            new_text.append(contractions[word])
        else:
            new_text.append(word)
    # Re-join the tokens into a single string
    text = " ".join(new_text)
    # Strip URLs, HTML fragments and special characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    # Load the stop-word list (one space-separated line in Englishstopwords.txt)
    words = open("Englishstopwords.txt", 'r')
    stop = words.readlines()
    stopwords = stop[0].split(" ")
    stopwords = stopwords[0:-1]
    if remove_stopwords:
        text = text.split()
        stops = set(stopwords)
        text = [w for w in text if w not in stops]
        # Re-join the tokens into a single string
        text = " ".join(text)
    return text

clean_summaries = []
# clean_summaries and clean_texts hold plain strings, e.g. ['i want to rock you', 'you will win the championship', ...]
for summary in reviews.Summary:
    clean_summaries.append(clean_text(summary, remove_stopwords=False))
print("Summaries are completed")

clean_texts = []
for text in reviews.Text:
    clean_texts.append(clean_text(text, remove_stopwords=True))
print("Texts are completed")

# Build a word-frequency dictionary of the form {'many': 897, 'hate': 234, ...}
def count_words(count_dict, text):
    for sentence in text:
        for word in sentence.split():
            if word not in count_dict:
                count_dict[word] = 1
            else:
                count_dict[word] += 1

# word_counts stores the frequency of every unique word
word_counts = {}
count_words(word_counts, clean_summaries)
count_words(word_counts, clean_texts)
print("Size of Vocabulary :", len(word_counts))

embeddings_index = {}
# Load the pre-trained word vectors into a dictionary: {'a': (vector), 'money': (vector), ...}
with open(r'E:\word2vecmodel\numberbatch-en-17.04b.txt', 'r', encoding='utf8') as f:
    for line in f:
        values = line.split(' ')
        word = values[0]
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embedding

missing_words = 0
threshold = 20  # words that occur fewer than 20 times are dropped

for word, count in word_counts.items():
    if count > threshold:
        if word not in embeddings_index:
            missing_words += 1  # frequent words that have no pre-trained vector

missing_ratio = round(missing_words / len(word_counts), 4) * 100
print("Number of words missing from CN:", missing_words)
print("Percent of words that are missing from vocabulary: {}%".format(missing_ratio))

vocab_to_int = {}
value = 0
# Map every kept word to an integer id
for word, count in word_counts.items():
    if count >= threshold or word in embeddings_index:
        vocab_to_int[word] = value
        value += 1

# Special tokens
codes = ["<UNK>", "<PAD>", "<EOS>", "<GO>"]
for code in codes:
    vocab_to_int[code] = len(vocab_to_int)

# Invert vocab_to_int
int_to_vocab = {}
for word, value in vocab_to_int.items():
    int_to_vocab[value] = word

usage_ratio = round(len(vocab_to_int) / len(word_counts), 4) * 100
print("Total number of unique words:", len(word_counts))
print("Number of words we will use:", len(vocab_to_int))
print("Percent of words we will use: {}%".format(usage_ratio))

embedding_dim = 300
nb_words = len(vocab_to_int)
# Build the embedding matrix; word_embedding_matrix has shape nb_words * 300
word_embedding_matrix = np.zeros((nb_words, embedding_dim), dtype=np.float32)
for word, i in vocab_to_int.items():
    if word in embeddings_index:
        word_embedding_matrix[i] = embeddings_index[word]
    else:
        # Words without a pre-trained vector get a random one
        new_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dim))
        embeddings_index[word] = new_embedding
        word_embedding_matrix[i] = new_embedding

# Convert every sentence into a list of ids, e.g. [[1, 234, 7687, 23, ...], [345, 908, 2359, 11234, ...], ...]
def convert_to_ints(text, word_count, unk_count, eos=False):
    ints = []
    for sentence in text:
        sentence_ints = []
        for word in sentence.split():
            word_count += 1
            if word in vocab_to_int:
                sentence_ints.append(vocab_to_int[word])
            else:
                sentence_ints.append(vocab_to_int['<UNK>'])
                unk_count += 1
        if eos:
            sentence_ints.append(vocab_to_int['<EOS>'])
        ints.append(sentence_ints)
    return ints, word_count, unk_count

word_count = 0
unk_count = 0
# int_summaries and int_texts have the form [[1, 234, 7687, 23, ...], [345, 908, 2359, 11234, ...], ...]
int_summaries, word_count, unk_count = convert_to_ints(clean_summaries, word_count, unk_count)
int_texts, word_count, unk_count = convert_to_ints(clean_texts, word_count, unk_count, eos=True)

# Collect sentence lengths into a DataFrame so percentiles are easy to compute
def create_lengths(text):
    lengths = []
    for sentence in text:
        lengths.append(len(sentence))
    return pd.DataFrame(lengths, columns=['counts'])

lengths_summaries = create_lengths(int_summaries)
lengths_texts = create_lengths(int_texts)

# Length percentiles of the texts
print(np.percentile(lengths_texts.counts, 90))
print(np.percentile(lengths_texts.counts, 95))
print(np.percentile(lengths_texts.counts, 99))
# Length percentiles of the summaries
print(np.percentile(lengths_summaries.counts, 90))
print(np.percentile(lengths_summaries.counts, 95))
print(np.percentile(lengths_summaries.counts, 99))

# Count <UNK> tokens in a sentence, used below to filter the training set
def unk_counter(sentence):
    unk_count = 0
    for word in sentence:
        if word == vocab_to_int['<UNK>']:
            unk_count += 1
    return unk_count

sorted_summaries = []
sorted_texts = []
max_text_length = 84
max_summary_length = 13
min_length = 2
unk_text_limit = 1
unk_summary_limit = 0

# Sort by text length; `count` is the example index inside the loop
for length in range(min(lengths_texts.counts), max_text_length):
    for count, words in enumerate(int_summaries):
        if (len(int_summaries[count]) >= min_length and
                len(int_summaries[count]) <= max_summary_length and
                len(int_texts[count]) > min_length and
                unk_counter(int_summaries[count]) <= unk_summary_limit and
                unk_counter(int_texts[count]) < unk_text_limit and
                length == len(int_texts[count])):
            sorted_summaries.append(int_summaries[count])
            sorted_texts.append(int_texts[count])
# sorted_texts and sorted_summaries now hold the filtered examples, sorted by ascending text length

# Placeholders for the model inputs
def model_inputs():
    input_data = tf.placeholder(tf.int32, [None, None], name='input')          # batch_size * sequence length
    targets = tf.placeholder(tf.int32, [None, None], name='targets')           # batch_size * sequence length
    lr = tf.placeholder(tf.float32, name='learning_rate')                      # the learning rate should probably be a bit smaller
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')                   # dropout keep probability, to reduce over-fitting
    summary_length = tf.placeholder(tf.int32, (None,), name='summary_length')  # summary lengths in the batch
    max_summary_length = tf.reduce_max(summary_length, name='max_dec_len')     # longest summary in the batch
    text_length = tf.placeholder(tf.int32, (None,), name='text_length')        # text lengths in the batch
    return input_data, targets, lr, keep_prob, summary_length, max_summary_length, text_length

# Prepend <GO> to every target sequence in the batch
def process_encoding_input(target_data, vocab_to_int, batch_size):  # target_data is the summary batch
    # Drop the last token of each row, then concatenate <GO> at the front
    ending = tf.strided_slice(target_data, [0, 0], [batch_size, -1], [1, 1])
    dec_input = tf.concat([tf.fill([batch_size, 1], vocab_to_int['<GO>']), ending], 1)
    return dec_input

# Encoder: stacked bidirectional LSTM layers
def encoding_layer(rnn_size, sequence_length, num_layers, rnn_inputs, keep_prob):
    for layer in range(num_layers):
        with tf.variable_scope('encoder_{}'.format(layer)):
            cell_fw = tf.nn.rnn_cell.BasicLSTMCell(rnn_size)
            cell_fw = tf.nn.rnn_cell.DropoutWrapper(cell_fw, input_keep_prob=keep_prob)
            cell_bw = tf.nn.rnn_cell.BasicLSTMCell(rnn_size)
            cell_bw = tf.nn.rnn_cell.DropoutWrapper(cell_bw, input_keep_prob=keep_prob)
            enc_output, enc_state = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, rnn_inputs,
                                                                    sequence_length, dtype=tf.float32)
    # Concatenate the forward and backward outputs; this is the encoded representation
    enc_output = tf.concat(enc_output, 2)
    return enc_output, enc_state

# Decoder used at training time
def training_decoding_layer(dec_embed_input, summary_length, dec_cell, initial_state, output_layer,
                            vocab_size, max_summary_length):
    # TrainingHelper feeds the ground-truth summary tokens to the decoder
    training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=dec_embed_input,
                                                        sequence_length=summary_length,
                                                        time_major=False)
    training_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell, training_helper, initial_state, output_layer)
    # dynamic_decode returns (final_outputs, final_state, final_sequence_lengths);
    # final_outputs is a namedtuple of (rnn_output, sample_id)
    training_logits, _ = tf.contrib.seq2seq.dynamic_decode(training_decoder,
                                                           output_time_major=False,
                                                           impute_finished=True,
                                                           maximum_iterations=max_summary_length)
    return training_logits

# Decoder used at inference time: starts from <GO> and stops at <EOS>
def inference_decoding_layer(embeddings, start_token, end_token, dec_cell, initial_state, output_layer,
                             max_summary_length, batch_size):
    # Tile the start token so every sequence in the batch begins with <GO>
    start_token = tf.tile(tf.constant([start_token], dtype=tf.int32), [batch_size], name='start_token')
    # GreedyEmbeddingHelper feeds the previous prediction back in at every step
    inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embeddings, start_token, end_token)
    inference_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell, inference_helper, initial_state, output_layer)
    inference_logits, _ = tf.contrib.seq2seq.dynamic_decode(inference_decoder,
                                                            output_time_major=False,
                                                            impute_finished=True,
                                                            maximum_iterations=max_summary_length)
    return inference_logits

# Decoding layer: LSTM decoder with Bahdanau attention over the encoder outputs
def decoding_layer(dec_embed_input, embeddings, enc_output, enc_state, vocab_size, text_length, summary_length,
                   max_summary_length, rnn_size, vocab_to_int, keep_prob, batch_size, num_layers):
    for layer in range(num_layers):
        with tf.variable_scope('decoder_{}'.format(layer)):
            lstm = tf.nn.rnn_cell.LSTMCell(rnn_size,
                                           initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            dec_cell = tf.nn.rnn_cell.DropoutWrapper(lstm, input_keep_prob=keep_prob)
    # Fully connected layer projecting onto the vocabulary
    # (whether vocab_size should be len(vocab_to_int) + 1 is left open in the author's note)
    output_layer = tf.layers.Dense(vocab_size,
                                   kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1))
    # Attention mechanism
    attn_mech = tf.contrib.seq2seq.BahdanauAttention(rnn_size, enc_output, text_length, normalize=False)
    dec_cell = tf.contrib.seq2seq.DynamicAttentionWrapper(dec_cell, attn_mech, rnn_size)
    # Use only the first (forward) encoder state to initialise the attention wrapper state
    initial_state = tf.contrib.seq2seq.DynamicAttentionWrapperState(enc_state[0])
    with tf.variable_scope("decode"):
        training_logits = training_decoding_layer(dec_embed_input, summary_length, dec_cell, initial_state,
                                                  output_layer, vocab_size, max_summary_length)
    with tf.variable_scope("decode", reuse=True):
        inference_logits = inference_decoding_layer(embeddings, vocab_to_int['<GO>'], vocab_to_int['<EOS>'],
                                                    dec_cell, initial_state, output_layer,
                                                    max_summary_length, batch_size)
    return training_logits, inference_logits

# Assemble the full seq2seq model
def seq2seq_model(input_data, target_data, keep_prob, text_length, summary_length, max_summary_length,
                  vocab_size, rnn_size, num_layers, vocab_to_int, batch_size):
    # Use the pre-built embedding matrix; it covers the whole vocabulary because every word can be predicted
    embeddings = word_embedding_matrix
    enc_embed_input = tf.nn.embedding_lookup(embeddings, input_data)
    enc_output, enc_state = encoding_layer(rnn_size, text_length, num_layers, enc_embed_input, keep_prob)
    dec_input = process_encoding_input(target_data, vocab_to_int, batch_size)
    dec_embed_input = tf.nn.embedding_lookup(embeddings, dec_input)
    training_logits, inference_logits = decoding_layer(dec_embed_input, embeddings, enc_output, enc_state,
                                                       vocab_size, text_length, summary_length,
                                                       max_summary_length, rnn_size, vocab_to_int,
                                                       keep_prob, batch_size, num_layers)
    return training_logits, inference_logits

# Pad every sentence in a batch to the length of the longest one
def pad_sentence_batch(sentence_batch):
    max_sentence = max([len(sentence) for sentence in sentence_batch])
    return [sentence + [vocab_to_int['<PAD>']] * (max_sentence - len(sentence)) for sentence in sentence_batch]

# Yield padded batches of summaries and texts together with their lengths
def get_batches(summaries, texts, batch_size):
    for batch_i in range(0, len(texts) // batch_size):
        start_i = batch_i * batch_size
        summaries_batch = summaries[start_i:start_i + batch_size]
        texts_batch = texts[start_i:start_i + batch_size]
        pad_summaries_batch = np.array(pad_sentence_batch(summaries_batch))
        pad_texts_batch = np.array(pad_sentence_batch(texts_batch))
        pad_summaries_lengths = []
        for summary in pad_summaries_batch:
            pad_summaries_lengths.append(len(summary))
        pad_texts_lengths = []
        for text in pad_texts_batch:
            pad_texts_lengths.append(len(text))
        yield pad_summaries_batch, pad_texts_batch, pad_summaries_lengths, pad_texts_lengths

# Hyperparameters
epochs = 100
batch_size = 64
rnn_size = 256
num_layers = 2
learning_rate = 0.005
keep_probability = 0.75

train_graph = tf.Graph()
with train_graph.as_default():
    input_data, targets, lr, keep_prob, summary_length, max_summary_length, text_length = model_inputs()
    # The input is reversed along the last axis before it is fed to the encoder
    training_logits, inference_logits = seq2seq_model(tf.reverse(input_data, [-1]), targets, keep_prob,
                                                      text_length, summary_length, max_summary_length,
                                                      len(vocab_to_int), rnn_size, num_layers,
                                                      vocab_to_int, batch_size)
    training_logits = tf.identity(training_logits.rnn_output, 'logits')             # per-word scores, used for the loss
    inference_logits = tf.identity(inference_logits.sample_id, name='predictions')  # predicted word ids
    # sequence_mask turns the 1-D length vector into a (batch, max_summary_length) mask
    masks = tf.sequence_mask(summary_length, max_summary_length, dtype=tf.float32, name='masks')
    with tf.name_scope("optimization"):
        # sequence_loss computes the seq2seq loss; the mask weights out the padded positions
        cost = tf.contrib.seq2seq.sequence_loss(training_logits, targets, masks)
        optimizer = tf.train.AdamOptimizer(learning_rate)
        gradients = optimizer.compute_gradients(cost)
        # Clip every gradient to [-5, 5] to guard against exploding gradients
        capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)
print("Graph is built")

# Train on a subset of the sorted data
start = 200000
end = start + 5000
sorted_summaries_short = sorted_summaries[start:end]
sorted_texts_short = sorted_texts[start:end]

learning_rate_decay = 0.95
min_learning_rate = 0.0005
display_step = 20   # print the loss every 20 batches
stop_early = 0
stop = 3            # stop after 3 checks without improvement
per_epoch = 3       # check the loss 3 times per epoch
update_check = (len(sorted_texts_short) // batch_size // per_epoch) - 1
update_loss = 0
batch_loss = 0
summary_update_loss = []

checkpoint = "best_model.ckpt"
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    for epoch_i in range(1, epochs + 1):
        update_loss = 0
        batch_loss = 0
        for batch_i, (summaries_batch, texts_batch, summaries_lengths, texts_lengths) in enumerate(
                get_batches(sorted_summaries_short, sorted_texts_short, batch_size)):
            start_time = time.time()
            _, loss = sess.run([train_op, cost],
                               {input_data: texts_batch,
                                targets: summaries_batch,
                                lr: learning_rate,
                                summary_length: summaries_lengths,
                                text_length: texts_lengths,
                                keep_prob: keep_probability})
            batch_loss += loss
            update_loss += loss
            end_time = time.time()
            batch_time = end_time - start_time
            if batch_i % display_step == 0 and batch_i > 0:
                print('Epoch {:>3}/{} Batch {:>4}/{} - Loss: {:>6.3f}, Seconds: {:>4.2f}'
                      .format(epoch_i, epochs, batch_i, len(sorted_texts_short) // batch_size,
                              batch_loss / display_step, batch_time * display_step))
                batch_loss = 0
            if batch_i % update_check == 0 and batch_i > 0:
                print("Average loss for this update:", round(update_loss / update_check, 3))
                summary_update_loss.append(update_loss)
                # Save the model whenever update_loss reaches a new minimum
                if update_loss <= min(summary_update_loss):
                    print('New Record')
                    stop_early = 0
                    saver = tf.train.Saver()
                    saver.save(sess, checkpoint)
                else:
                    print('No Improvement')
                    stop_early += 1
                    if stop_early == stop:
                        break
                update_loss = 0
        # Decay the learning rate after every epoch, but not below its minimum
        learning_rate *= learning_rate_decay
        if learning_rate < min_learning_rate:
            learning_rate = min_learning_rate
        if stop_early == stop:
            print("Stopping Training")
            break

# Try the trained model on a random review
def text_to_seq(text):
    text = clean_text(text)
    return [vocab_to_int.get(word, vocab_to_int['<UNK>']) for word in text.split()]

random = np.random.randint(0, len(clean_texts))
input_sentence = clean_texts[random]
text = text_to_seq(clean_texts[random])

checkpoint = './best_model.ckpt'
loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load the saved graph and weights
    loader = tf.train.import_meta_graph(checkpoint + '.meta')
    loader.restore(sess, checkpoint)
    input_data = loaded_graph.get_tensor_by_name('input:0')
    logits = loaded_graph.get_tensor_by_name('predictions:0')
    text_length = loaded_graph.get_tensor_by_name('text_length:0')
    summary_length = loaded_graph.get_tensor_by_name('summary_length:0')
    keep_prob = loaded_graph.get_tensor_by_name('keep_prob:0')
    # Repeat the example batch_size times so the feed matches the graph's batch shape
    answer_logits = sess.run(logits, {input_data: [text] * batch_size,
                                      summary_length: [np.random.randint(5, 8)],
                                      text_length: [len(text)] * batch_size,
                                      keep_prob: 1.0})[0]

pad = vocab_to_int["<PAD>"]
print('Original Text:', input_sentence)
print('\nText')
print('Word Ids: {}'.format([i for i in text]))
print('Input Words: {}'.format(" ".join([int_to_vocab[i] for i in text])))
print("\nSummary")
print('Word Ids: {}'.format([i for i in answer_logits if i != pad]))
print('Response Words: {}'.format(" ".join([int_to_vocab[i] for i in answer_logits if i != pad])))
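# Optional usage note (not part of the original post): to summarise a sentence of your own
# instead of a random review, replace the random pick above with, for example:
#   input_sentence = "the coffee tasted great and the shipping was fast"
#   text = text_to_seq(input_sentence)
# text_to_seq() runs clean_text() first, so raw, uncleaned input is fine here.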