1. 程式人生 > 如何使用“預訓練的詞向量”做文字分類

如何使用“預訓練的詞向量”,做文字分類

廢話不多說,直接看程式碼。(以下程式碼使用 gensim 3.x 與 Keras 的 API,並假設 `inPath`、`TEXT_MAXLEN`、`df_data` 以及相關的 import 已事先定義。)

def train_W2V(w2vCorpus, size=100):
    """Train a Word2Vec model on ``w2vCorpus`` and save it to disk.

    Args:
        w2vCorpus: Passed to ``Word2Vec`` as ``sentences``.
        size: Passed to ``Word2Vec`` as ``size`` (embedding dimension).

    Returns:
        The trained ``Word2Vec`` model. As a side effect the model is saved
        to ``inPath + 'w2vModel.model'`` (``inPath`` is a module-level name).
    """
    trained = Word2Vec(
        sentences=w2vCorpus,
        size=size,
        window=8,
        min_count=5,
        hs=0,
        negative=5,
        iter=1,
    )
    # Plain string concatenation: inPath must already end with a separator.
    model_file = inPath + 'w2vModel.model'
    trained.save(model_file)
    return trained

def load_W2V(W2V_path, loader_mySelf=1):
    """Load word vectors from files located under ``W2V_path``.

    Args:
        W2V_path: Path prefix. File names are appended by plain string
            concatenation, so it must already end with a path separator.
        loader_mySelf: Truthy loads the saved model ``w2vModel.model``
            (self-trained vectors); falsy loads the text-format
            ``w2v_embedding_tengxun`` file (Tencent pre-trained vectors).

    Returns:
        The object returned by ``Word2Vec.load`` or by
        ``KeyedVectors.load_word2vec_format``, depending on the branch.
    """
    if loader_mySelf:
        print('use my w2vModel')
        # Use the self-trained word vectors.
        w2vModel = Word2Vec.load(W2V_path + 'w2vModel.model')
    else:
        # Load the word vectors trained by Tencent.
        print('use other w2vModel')
        w2vModel = gensim.models.KeyedVectors.load_word2vec_format(
            W2V_path + 'w2v_embedding_tengxun', binary=False)
    return w2vModel


def make_word2idx_embedMatrix(w2vModel):
    """Build the word -> index mapping and the matching embedding matrix.

    Index 0 is reserved for the ``"_PAD"`` entry and its matrix row stays all
    zeros; vocabulary words receive indices 1..N in vocabulary iteration
    order, and row ``i`` of the matrix holds the vector of the word mapped
    to ``i``.

    Args:
        w2vModel: Either an object exposing its vectors under ``.wv`` or an
            object that is itself the vector store. It must provide
            ``vector_size``; the vector store must provide ``vocab`` and
            ``[word]`` lookup.

    Returns:
        Tuple ``(word2idx, embeddings_matrix)`` where ``embeddings_matrix``
        has shape ``(len(vocab) + 1, vector_size)``.
    """
    # Fall back to the object itself when it has no ``.wv`` attribute, so the
    # function accepts both kinds of object returned by load_W2V.
    keyed_vectors = getattr(w2vModel, 'wv', w2vModel)
    vocab = keyed_vectors.vocab

    word2idx = {"_PAD": 0}
    embeddings_matrix = np.zeros((len(vocab) + 1, w2vModel.vector_size))
    for idx, word in enumerate(vocab, start=1):
        word2idx[word] = idx
        embeddings_matrix[idx] = keyed_vectors[word]
    return word2idx, embeddings_matrix


def make_deepLearn_data(w2vCorpus, word2idx):
    """Convert tokenized sentences into a padded array of word indices.

    Words missing from ``word2idx`` are mapped to 0. Sequences are passed
    through ``sequence.pad_sequences`` with ``maxlen=TEXT_MAXLEN``
    (``TEXT_MAXLEN`` is a module-level name).

    Args:
        w2vCorpus: Iterable of sentences, each an iterable of words.
        word2idx: Mapping from word to integer index.

    Returns:
        ``np.array`` of the padded index sequences.
    """
    X_train = [[word2idx.get(w, 0) for w in sen] for sen in w2vCorpus]
    # The result must be an np.array.
    return np.array(sequence.pad_sequences(X_train, maxlen=TEXT_MAXLEN))


def Lstm_model():
    """Build and compile the LSTM binary classifier.

    Reads the module-level names ``embeddings_matrix`` and ``TEXT_MAXLEN``.

    NOTE: the function must not share a name with a library symbol; naming it
    ``LSTM()`` previously caused serious errors.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Embedding(
        input_dim=len(embeddings_matrix),      # mind these arguments
        output_dim=len(embeddings_matrix[0]),
        input_length=TEXT_MAXLEN,
        weights=[embeddings_matrix],           # use the pre-trained vectors directly
        trainable=False,                       # do not fine-tune the word vectors
    ))
    model.add(LSTM(units=20, return_sequences=False))  # units: output dimension
    model.add(Dropout(0.5))
    model.add(Dense(units=1, activation="sigmoid"))    # fully connected layer
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


if __name__ == '__main__':
    df_data_ = df_data[0: 10000]  # load the raw data
    w2vCorpus = [sen.split(' ') for sen in df_data_.分析欄位]  # build the W2V corpus
    w2vModel = train_W2V(w2vCorpus, size=100)  # train the W2V model (also saves it)
    # NOTE: with loader_mySelf=0 the model trained above is replaced by the
    # pre-trained vectors loaded from disk.
    w2vModel = load_W2V(inPath, loader_mySelf=0)  # load w2vModel
    # Build word2idx and the embedding matrix.
    word2idx, embeddings_matrix = make_word2idx_embedMatrix(w2vModel)
    X_train = make_deepLearn_data(w2vCorpus, word2idx)  # build the model input data
    y_train = np.array(df_data_.特徵型別)  # must be an np.array

    model = Lstm_model()
    # The last 2000 samples are held out for evaluation.
    model.fit(X_train[0: -2000], y_train[0: -2000], epochs=2, batch_size=10, verbose=1)
    score = model.evaluate(X_train[-2000:], y_train[-2000:])
    print(score)