
Text classification with LSTM using tflearn

Model training code

The script below walks a corpus stored as one sub-directory per class (each file is one document), segments the text with jieba, builds and saves a word-to-id vocabulary while dropping stop words, holds out roughly the first 10% of each class as a test set plus a further 10% of the remaining data for validation, pads every document to 30 word ids, and trains an embedding -> LSTM -> softmax network with tflearn.
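Two helpers from tflearn.data_utils do the final tensor shaping, so it is worth seeing them in isolation first. A minimal standalone sketch, with made-up word ids and labels purely for illustration:

from tflearn.data_utils import to_categorical, pad_sequences

# two documents of different length, encoded as word-id lists
docs = [[4, 8, 15], [16, 23, 42, 7, 9]]
X = pad_sequences(docs, maxlen=4, value=0.)   # zero-pad / truncate to shape (2, 4)
Y = to_categorical([0, 2], nb_classes=3)      # one-hot: [[1,0,0],[0,0,1]]
print(X)
print(Y)

The full training script follows; it uses maxlen=30 and nb_classes equal to the number of class directories.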

# -*- coding: utf-8 -*-
from __future__ import division, print_function, absolute_import
import tflearn
import os
import numpy
import jieba
import re
import fire
from tflearn.data_utils import to_categorical, pad_sequences

def load_data1(keywordPath, stopwords_set, filepath, dictfilepath, n_words,
               valid_portion=0.1, sort_by_len=True):
    jieba.load_userdict(keywordPath)  # domain keywords for the jieba segmenter
    pathDir = os.listdir(filepath)
    data_set = []
    train_set_x = []
    train_set_y = []
    test_set_x = []
    test_set_y = []
    # build the stop-word lookup table
    stopwords = {}
    fstop = open(stopwords_set, 'rb')
    for eachWord in fstop:
        word = eachWord.strip().decode('utf-8', 'ignore')
        stopwords[word] = word
    fstop.close()
    # the vocabulary (word -> id) is written out so later runs can reuse it
    f1 = open(dictfilepath, 'w', encoding='UTF-8')
    dic = dict()
    i = 0  # running word id
    j = 0  # number of documents processed
    # walk one sub-directory per class, building the vocabulary and
    # encoding every document as a list of word ids
    for allDir in pathDir:
        child = filepath + allDir
        if os.path.isdir(child):
            pathSubDir = os.listdir(child)
            k = 1
            for subDir in pathSubDir:
                des = child + os.sep + subDir
                invert = []  # this document as word ids, e.g. [22, 123, 424, ...]
                fOpen = open(des, "r", encoding='UTF-8')
                for eachLine in fOpen:
                    line = eachLine.strip()
                    # strip whitespace and Chinese/ASCII punctuation
                    line1 = re.sub(r"[\s+\.\!\/_,$%^*()?;;:-【】+\"\']+|[+——!,;:。?、~@#¥%……&*()]+", "", line)
                    wordList = list(jieba.cut(line1))
                    for word in wordList:
                        if word not in stopwords:
                            data_set.append(word)
                            if word not in dic:
                                i = i + 1
                                dic[word] = i
                                invert.append(dic[word])
                                if re.match('[^ \t\n\x0B\f\r]', word, flags=0):
                                    f1.write(word + " " + str(i))
                                    f1.write("\n")
                            else:
                                invert.append(dic[word])
                j = j + 1
                # roughly the first 10% of each class goes to the test set;
                # the directory name is used as the class label and is assumed
                # to be an integer id in [0, classnum)
                n = len(pathSubDir)
                if k <= n * 0.1:
                    print(str(j) + " test " + allDir)
                    test_set_x.append(invert)
                    test_set_y.append(int(allDir))
                else:
                    print(str(j) + " train " + allDir)
                    train_set_x.append(invert)
                    train_set_y.append(int(allDir))
                k += 1
                fOpen.close()
    f1.close()
    print("the number of words : " + str(i))
    # split a validation set off the shuffled training data
    n_samples = len(train_set_x)
    sidx = numpy.random.permutation(n_samples)
    n_train = int(numpy.round(n_samples * (1. - valid_portion)))
    valid_set_x = [train_set_x[s] for s in sidx[n_train:]]
    valid_set_y = [train_set_y[s] for s in sidx[n_train:]]
    train_set_x = [train_set_x[s] for s in sidx[:n_train]]
    train_set_y = [train_set_y[s] for s in sidx[:n_train]]

    def remove_unk(x):
        # ids outside the vocabulary are mapped to 1 (the "unknown" id)
        return [[1 if w >= n_words else w for w in sen] for sen in x]

    train_set_x = remove_unk(train_set_x)
    valid_set_x = remove_unk(valid_set_x)
    test_set_x = remove_unk(test_set_x)

    def len_argsort(seq):
        return sorted(range(len(seq)), key=lambda x: len(seq[x]))

    # sorting by length groups similarly sized documents together
    if sort_by_len:
        sorted_index = len_argsort(test_set_x)
        test_set_x = [test_set_x[i] for i in sorted_index]
        test_set_y = [test_set_y[i] for i in sorted_index]
        sorted_index = len_argsort(valid_set_x)
        valid_set_x = [valid_set_x[i] for i in sorted_index]
        valid_set_y = [valid_set_y[i] for i in sorted_index]
        sorted_index = len_argsort(train_set_x)
        train_set_x = [train_set_x[i] for i in sorted_index]
        train_set_y = [train_set_y[i] for i in sorted_index]
    train = (train_set_x, train_set_y)
    valid = (valid_set_x, valid_set_y)
    test = (test_set_x, test_set_y)
    return train, valid, test


def train(keywordPath, dictPath, modelPath, stopword_setPath, classnum):
    # the five arguments are supplied on the command line via fire
    print("#######################")
    print("#        train        #")
    print("#######################")
    classnum = int(classnum)
    # vocabulary size = number of lines in the dictionary file written by a
    # previous run of load_data1 (which rewrites it)
    words = []
    f = open(dictPath, "r", encoding="utf-8")
    for line in f:
        words.append(line)
    f.close()
    word_num = len(words)
    s = os.sep  # platform path separator
    dataPath = "d:" + s + "data"  # hard-coded corpus root, one sub-directory per class
    train, valid, test = load_data1(keywordPath=keywordPath,
                                    stopwords_set=stopword_setPath,
                                    filepath=dataPath,
                                    dictfilepath=dictPath,
                                    n_words=word_num,
                                    valid_portion=0.1)
    trainX, trainY = train
    valX, valY = valid
    # pad/truncate every document to a fixed length of 30 word ids
    trainX = pad_sequences(trainX, maxlen=30, value=0.)
    valX = pad_sequences(valX, maxlen=30, value=0.)
    # one-hot encode the integer class labels
    trainY = to_categorical(trainY, nb_classes=classnum)
    valY = to_categorical(valY, nb_classes=classnum)
    # network: embedding -> LSTM -> softmax classifier
    net = tflearn.input_data([None, 30])
    net = tflearn.embedding(net, input_dim=word_num, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, classnum, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.01,
                             loss='categorical_crossentropy')
    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.fit(trainX, trainY, n_epoch=1, validation_set=(valX, valY),
              show_metric=True, batch_size=256)
    model.save(modelPath)


if __name__ == '__main__':
    fire.Fire(train)
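With fire, the entry point takes its five arguments from the command line, for example: python lstm_train.py keyword.dict vocab.dict model.tfl stopwords.txt 9 (the script name, file names, and class count here are all placeholders). The post stops at training; a hedged sketch of what prediction could look like, assuming the vocabulary file written by load_data1 and integer class labels, is:

# A sketch, not from the original post: reload the saved model and classify
# one new document. All paths are hypothetical; the network definition must
# match train() exactly, since tflearn's model.load only restores weights.
import jieba
import tflearn
from tflearn.data_utils import pad_sequences

def predict_one(text, keywordPath, dictPath, modelPath, classnum):
    jieba.load_userdict(keywordPath)
    # rebuild the word -> id mapping from the "word id" lines of the vocabulary file
    dic = {}
    with open(dictPath, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.split()
            if len(parts) == 2:
                dic[parts[0]] = int(parts[1])
    word_num = len(dic)  # assumption: one word per line, ids 1..word_num
    # identical architecture to train()
    net = tflearn.input_data([None, 30])
    net = tflearn.embedding(net, input_dim=word_num, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, classnum, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.01,
                             loss='categorical_crossentropy')
    model = tflearn.DNN(net)
    model.load(modelPath)
    # encode the document the same way load_data1 does, then clip unknown ids
    ids = [dic[w] for w in jieba.cut(text) if w in dic]
    ids = [1 if w >= word_num else w for w in ids]
    x = pad_sequences([ids], maxlen=30, value=0.)
    probs = model.predict(x)[0]
    return max(range(classnum), key=lambda c: probs[c])  # predicted class id

Rebuilding the identical graph before model.load is a tflearn requirement: the checkpoint stores only the weights, not the network structure.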