Keras Exploration: NLP - Movie Review Classification
阿新 · Published 2018-12-13
Source: Deep Learning with Python (Keras)
from keras.datasets import imdb
from keras import models
from keras import layers
from keras import optimizers
import matplotlib.pyplot as plt
import numpy as np

def vectorize_sequences(sequences, dimension=4000):
    # Create an all-zero matrix of shape (len(sequences), dimension)
    results = np.zeros((len(sequences), dimension), dtype=int)
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1  # set the indices of the words present to 1
    return results

# Keep only the 4000 most frequent words
# (lowered from the book's 10000 to reduce memory use; see the summary below).
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=4000)

# word_index is a dictionary mapping words to an integer index.
word_index = imdb.get_word_index()
# Reverse it, mapping integer indices back to words.
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
# Decode the first review as an example; the indices are offset by 3 because
# 0, 1 and 2 are reserved for "padding", "start of sequence" and "unknown".
decoded_review = ' '.join([reverse_word_index.get(i - 3, '?') for i in train_data[0]])

# Vectorized training data: 25000 reviews
x_train = vectorize_sequences(train_data)
# Vectorized test data: 25000 reviews
x_test = vectorize_sequences(test_data)
# Vectorized labels
y_train = np.asarray(train_labels).astype('float32')
y_test = np.asarray(test_labels).astype('float32')

model = models.Sequential()
model.add(layers.Dense(64, activation='relu', input_shape=(4000,)))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
# 'lr' was the learning-rate argument name in Keras of this era
# (newer versions call it 'learning_rate').
model.compile(optimizer=optimizers.RMSprop(lr=0.001),
              loss='binary_crossentropy',
              metrics=['acc'])

# Hold out the first 10000 training samples for validation.
x_val = x_train[:10000]
partial_x_train = x_train[10000:]
y_val = y_train[:10000]
partial_y_train = y_train[10000:]

history = model.fit(partial_x_train,
                    partial_y_train,
                    epochs=20,
                    batch_size=512,
                    validation_data=(x_val, y_val))

##########################################################
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)

# 'bo' plots blue dots; 'b' plots a solid blue line.
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

plt.clf()  # clear the figure
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

predictions = model.predict(x_test)
output = model.evaluate(x_test, y_test)
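The final model.predict call returns one sigmoid probability per test review. As a quick follow-up (a minimal sketch; predicted_labels is an illustrative name, not from the original post), the probabilities can be thresholded at 0.5 into hard 0/1 labels and compared with the ground truth:

# predictions[i] is P(positive) for review i; threshold at 0.5 for a hard label
predicted_labels = (predictions > 0.5).astype(int).ravel()
for i in range(5):
    print('review %d: p=%.3f -> predicted %d, actual %d'
          % (i, predictions[i][0], predicted_labels[i], int(y_test[i])))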
Summary:
1. For a neural network to process natural language as tensors, the text tokens must first be turned into vectors; only then can a network model be built on top of them (see the vectorization sketch after this list).
2. NLP vectors are usually numericalized with a dictionary built from word-frequency statistics, which raises an encoding question. With one-hot style encoding, even keeping only the 10000 most frequent words consumes a great deal of memory, so a 'Memory Error' is nothing to worry about: just lower num_words, at some cost in accuracy. The back-of-envelope calculation after this list shows why.
3. This project/experiment is only meant to get familiar with the Keras framework; the accuracy is not high and the network structure is not tuned, so don't read too much into the numbers.
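To make point 1 concrete, here is a minimal standalone sketch of what vectorize_sequences from the code above does to a toy "review" (the word indices are made up for illustration):

import numpy as np

def vectorize_sequences(sequences, dimension=10):
    # one row per sequence, one column per word in the vocabulary
    results = np.zeros((len(sequences), dimension), dtype=int)
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1
    return results

# a toy review containing word indices 1, 3 and 7
print(vectorize_sequences([[1, 3, 7]], dimension=10))
# [[0 1 0 1 0 0 0 1 0 0]]  -- a multi-hot vector: 1 wherever a word occurs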
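And to see why point 2 leads to a 'Memory Error', consider the size of the multi-hot matrix: 25000 reviews times num_words columns, one entry each (a rough sketch; the exact footprint depends on the platform's default integer width, and the test matrix doubles it):

# memory footprint of the multi-hot training matrix (25000 reviews)
for num_words in (10000, 4000):
    entries = 25000 * num_words
    # dtype=int is typically int64 (8 bytes) on 64-bit platforms
    print('num_words=%5d: %.1f GB as int64, %.2f GB as float32'
          % (num_words, entries * 8 / 1e9, entries * 4 / 1e9))
# num_words=10000: 2.0 GB as int64, 1.00 GB as float32
# num_words= 4000: 0.8 GB as int64, 0.40 GB as float32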