keras快速上手-基於python的深度學習實踐_第8章_文字生成源代碼
阿新 • • 發佈:2019-01-03
app 取出 武俠小說 ica 內存 helper per charset bat 源代碼如下,但質量較差
# -*- coding: utf-8 -*-
#!/usr/bin/env python
# coding: utf-8

# # Sequence models

# In[1]:
import gc
import io

import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
from keras.models import Sequential
from keras.models import load_model
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
from sklearn.preprocessing import MinMaxScaler

plt.rcParams['figure.figsize'] = (20, 10)

# In[2]:
# Fixed seed so that shuffling and sampling are repeatable between runs.
np.random.seed(82832)

# We use the novel "Four Generations Under One Roof" as the training set.
# Readers may pick another long novel, or crawl online news instead. Long
# novels with a distinctive sentence style and voice usually train into good
# results more easily -- much as reading martial-arts novels makes that style
# easier to imitate -- so the collected works of a well-known wuxia author
# (for example Jin Yong) are a reasonable choice too. Crawled news offers a
# large volume of text in a consistent style and is also suitable.

# In[3]:
# The line-by-line view does not fit the fixed-sentence-length setup used
# below, but it is what the average sentence length is computed from.
with io.open("new.txt", encoding='utf-8') as fo:
    alltext0 = fo.readlines()

# In[4]:
# The whole corpus as one string. Joining the lines (which keep their line
# endings) gives the same text as a second read() of the file, without
# opening the file again and leaving that handle unclosed.
alltext = ''.join(alltext0)

# In[5]:
# Notebook cell: number of distinct characters in the corpus.
len(set(alltext))

# We model at the single-character level first. Start by extracting every
# distinct character.

# In[6]:
# A more naive way to build the character table, kept for reference only:
#
#     charset = {}
#     id = 0
#     for line in alltext:
#         length = len(line)
#         for k in range(length):
#             w = line[k]
#             if not w in charset:
#                 charset[w] = id
#                 id += 1
#     print(len(charset))

# In[7]:
# Sorted vocabulary plus the two lookup tables (char -> index, index -> char).
sortedcharset = sorted(set(alltext))
char_indices = {c: i for i, c in enumerate(sortedcharset)}
indices_char = {i: c for i, c in enumerate(sortedcharset)}

# Now cut the text into "virtual sentences" of a fixed length. That length is
# usually chosen as the average number of characters per sentence.

# In[8]:
# Mean line length (line endings included) and the number of lines.
k = len(alltext0)
sentencelength = sum(len(line) for line in alltext0) / k if k else 0
print(sentencelength)
print(k)

# In[9]:
maxlen = 40  # characters per virtual sentence
step = 3     # stride between the starts of consecutive windows
# Every window of `maxlen` characters is one sample; the character that
# immediately follows the window is its prediction target.
window_starts = range(0, len(alltext) - maxlen, step)
sentences = [alltext[i: i + maxlen] for i in window_starts]
next_chars = [alltext[i + maxlen] for i in window_starts]
print('nb sequences:', len(sentences))

# Next, turn the virtual sentences into matrices.

# In[10]:
#
# However, building the matrices directly like this produces dense arrays that
# waste a great deal of space: they take roughly 30 GB of memory, and with a
# slightly longer sentence length the code will not run on many machines.
# Data this large also cannot be handed to the GPU in one piece; a small batch
# has to be fetched for each computation step. That calls for fit_generator
# rather than fit: fit_generator reads one batch at a time, turns it from the
# compact source representation into a dense matrix for the current batch,
# and then computes. This lowers the memory pressure considerably.

# In[12]:
# data generator for fit_generator method
def data_generator(X, y, batch_size):
    """Yield shuffled ``(X_batch, y_batch)`` float32 mini-batches forever.

    Args:
        X: Feature array indexed as ``X[rows, :, :]`` (three axes).
        y: Target array indexed as ``y[rows, :]`` (two axes); row ``i``
            belongs to ``X[i]``.
        batch_size: Rows per batch. Values below 1 fall back to 256.

    Yields:
        Tuples ``(X_batch, y_batch)``, both cast to ``float32``.

    Raises:
        ValueError: If ``X`` has no rows.
    """
    if batch_size < 1:
        batch_size = 256
    n_samples = X.shape[0]
    if n_samples == 0:
        raise ValueError('data_generator needs at least one sample')
    # At least one batch per pass, even when there are fewer samples than
    # batch_size; otherwise every batch after the first would be empty.
    number_of_batches = max(1, n_samples // batch_size)
    counter = 0
    shuffle_index = np.arange(n_samples)
    np.random.shuffle(shuffle_index)
    while True:
        start = batch_size * counter
        index_batch = shuffle_index[start:start + batch_size]
        X_batch = X[index_batch, :, :].astype('float32')
        y_batch = y[index_batch, :].astype('float32')
        yield X_batch, y_batch
        counter += 1
        # Reshuffle and start over only once every batch of the current pass
        # has been served. (Testing `counter < number_of_batches` here would
        # reset after every single batch and only ever use slice 0.)
        if counter >= number_of_batches:
            np.random.shuffle(shuffle_index)
            counter = 0


# In[19]:
# Demonstration: build the one-hot tensors one large batch at a time instead
# of for the whole corpus at once.
batch_size = 10240
number_of_batches = len(sentences) // batch_size
shuffle_index = np.arange(len(sentences))
np.random.shuffle(shuffle_index)
for counter in range(number_of_batches):
    index_batch = shuffle_index[batch_size * counter:batch_size * (counter + 1)]
    X = np.zeros((batch_size, maxlen, len(sortedcharset)), dtype=bool)
    y = np.zeros((batch_size, len(sortedcharset)), dtype=bool)
    for j, s in enumerate(index_batch):
        for t in range(maxlen):
            X[j, t, char_indices[sentences[s][t]]] = 1
        # The target must be looked up with the shuffled sentence index `s`,
        # not the row number `j`, or inputs and labels no longer match.
        y[j, char_indices[next_chars[s]]] = 1
    X = X.astype('float32')
    y = y.astype('float32')
    print((X.shape, y.shape))

# This approach still needs the huge feature and target matrices to be
# generated up front. We can move the construction of both matrices into the
# data generator itself: nothing large has to sit in memory waiting for the
# GPU; each step only takes what it needs, builds the matching matrices and
# feeds them to the GPU right away.

# In[10]:
# build the model: a single LSTM
batch_size = 300
print('Build model...')
model = Sequential()
model.add(LSTM(256, input_shape=(maxlen, len(sortedcharset)),
               recurrent_dropout=0.1, dropout=0.1))
# Optional wider head, currently disabled:
# model.add(Dense(1024, activation='relu'))
# model.add(Dropout(0.25))
model.add(Dense(len(sortedcharset)))
model.add(Activation('softmax'))
# Alternative optimizer, currently disabled:
# optimizer = RMSprop(lr=0.01)
# NOTE(review): `lr` is the argument name used by the Keras release this
# script targets; newer releases call it `learning_rate` -- confirm on upgrade.
adamoptimizer = keras.optimizers.Adam(lr=1e-4)
model.compile(loss='categorical_crossentropy', optimizer=adamoptimizer)
print('Finished compiling')
model.summary()


# In[13]:
def data_generator2(sentences, sortedcharset, char_indices, maxlen=40,
                    batch_size=256, targets=None):
    """Yield one-hot ``(X, y)`` float32 batches built on the fly, forever.

    Args:
        sentences: Sequence of strings, each at least ``maxlen`` long.
        sortedcharset: The vocabulary; its length is the one-hot width.
        char_indices: Mapping from character to one-hot column.
        maxlen: Number of characters encoded per sentence.
        batch_size: Sentences per batch. Values below 1 fall back to 256.
        targets: Sequence aligned with ``sentences`` holding the character
            to predict for each one. Defaults to the module-level
            ``next_chars`` list.

    Yields:
        ``(X, y)`` with ``X`` shaped ``(rows, maxlen, len(sortedcharset))``
        and ``y`` shaped ``(rows, len(sortedcharset))``, where ``rows`` is
        the number of sentences in the batch.

    Raises:
        ValueError: If ``sentences`` is empty.
    """
    if batch_size < 1:
        batch_size = 256
    labels = next_chars if targets is None else targets
    n_samples = len(sentences)
    if n_samples == 0:
        raise ValueError('data_generator2 needs at least one sentence')
    vocab_size = len(sortedcharset)
    # At least one batch per pass, even for fewer sentences than batch_size.
    number_of_batches = max(1, n_samples // batch_size)
    counter = 0
    shuffle_index = np.arange(n_samples)
    np.random.shuffle(shuffle_index)
    while True:
        start = batch_size * counter
        index_batch = shuffle_index[start:start + batch_size]
        # Size the tensors by the rows actually drawn so a short batch does
        # not carry all-zero padding samples.
        X = np.zeros((len(index_batch), maxlen, vocab_size), dtype='float32')
        y = np.zeros((len(index_batch), vocab_size), dtype='float32')
        for j, s in enumerate(index_batch):
            sentence = sentences[s]
            for t in range(maxlen):
                X[j, t, char_indices[sentence[t]]] = 1
            # Look the target up with the shuffled sentence index `s`; using
            # the row number `j` would pair inputs with the wrong labels.
            y[j, char_indices[labels[s]]] = 1
        yield X, y
        counter += 1
        # Reshuffle only after a full pass. (Testing `counter <
        # number_of_batches` would reset after every single batch.)
        if counter >= number_of_batches:
            np.random.shuffle(shuffle_index)
            counter = 0


# In[14]:
# NOTE(review): fit_generator is the API of the Keras release this script
# targets; newer releases accept generators in fit() -- confirm on upgrade.
model.fit_generator(
    data_generator2(sentences, sortedcharset, char_indices,
                    maxlen=maxlen, batch_size=batch_size,
                    targets=next_chars),
    # Never ask for zero steps when the corpus is smaller than one batch.
    steps_per_epoch=max(1, len(sentences) // batch_size),
    epochs=25)

# In[20]:
model.save('whk.h5')


def sample(preds, temperature=1.0):
    """Sample an index from a probability array after temperature scaling.

    Args:
        preds: 1-D array-like of probabilities.
        temperature: Divisor applied to the log-probabilities; values above
            1 flatten the distribution, values below 1 sharpen it.

    Returns:
        The sampled index as an ``int``.
    """
    preds = np.asarray(preds).astype('float64')
    # log(0) is -inf, which exp() maps back to probability 0; silence only
    # that expected warning.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift by the maximum so exp() cannot underflow every entry to 0 at low
    # temperatures; the normalised probabilities are unchanged.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return int(np.argmax(probas))


def _encode_window(window):
    """One-hot encode ``window`` as a ``(1, maxlen, vocab)`` float32 array."""
    x = np.zeros((1, maxlen, len(sortedcharset)), dtype='float32')
    for t, char in enumerate(window):
        x[0, t, char_indices[char]] = 1.
    return x


# In[25]:
def GenSentence(original, length=20, temperature=1.20):
    """Generate ``length`` characters that continue ``original``.

    Each step encodes the current window, samples the next character from
    the model's prediction, then slides the window one character forward.

    Args:
        original: Seed text; its characters must be keys of
            ``char_indices`` and it must not be longer than ``maxlen``.
        length: Number of characters to generate.
        temperature: Sampling temperature passed to ``sample``.

    Returns:
        The generated characters as one string (seed not included).
    """
    sentence = original
    generated = []
    for _ in range(length):
        # Re-encode on every step so the prediction sees the characters
        # generated so far.
        preds = model.predict(_encode_window(sentence), verbose=0)[0]
        next_char = indices_char[sample(preds, temperature)]
        generated.append(next_char)
        sentence = sentence[1:] + next_char
    return ''.join(generated)


# First generation demo. The window is re-encoded for every generated
# character; encoding it once before the loop would feed the model the same
# input 20 times.
start_index = 1
sentence0 = alltext[start_index: start_index + maxlen]
generated = GenSentence(sentence0, temperature=1.1)
print(sentence0)
print("=================")
print(' '.join(generated))

# In[26]:
start_index = 3
sentence0 = alltext[start_index: start_index + maxlen]
generated0 = GenSentence(sentence0)
print(sentence0+"----->"+generated0)
print("==========")
# Feed the generated text back in as the next seed.
generated1 = GenSentence(generated0)
print(generated0+"------>"+generated1)

# In[27]:
# Drop the large objects one by one: a single `del(X, y, model)` stops at the
# first missing name and would leave the remaining ones alive.
_cleanup_names = ('X', 'y', 'model')
_missing = [_name for _name in _cleanup_names if _name not in globals()]
for _name in _cleanup_names:
    globals().pop(_name, None)
if _missing:
    print('Objects not found...')
for i in range(10):
    gc.collect()
keras快速上手-基於python的深度學習實踐_第8章_文字生成源代碼