1. 程式人生 > >Python使用doc2vec和LR進行文字分類

Python使用doc2vec和LR進行文字分類

(1)資料預處理
a.對文字資料進行貼標籤處理,標籤資料類似如下:

平素體質:健康狀況:良,既往有“高血壓病史”多年。#1

其中1表示患有高血壓,0表示沒有患有高血壓。
然後進行分開,文字儲存在一個檔案,標籤儲存在一個檔案,文字內容和標籤行對行對應。
b.對文字檔案的內容進行分詞。

import jieba

# Read the labelled corpus (one "<text>#<label>" record per line) and split it
# into two line-aligned files: the jieba-segmented text and the labels.
# The context managers close all three files even if a record fails to parse.
# The outputs are written as UTF-8 explicitly because they are later read back
# with encoding='utf-8'.
with open(u'/home/ubuntu/file/資料平衡分類', encoding='utf-8') as file, \
        open(u'/home/ubuntu/file/資料平衡無分類', 'w', encoding='utf-8') as filenoclass, \
        open(u'/home/ubuntu/file/資料平衡分類結果', 'w', encoding='utf-8') as fileclass:
    for line_no, lines in enumerate(file, 1):
        record = lines.strip()
        if not record:
            # Skip blank lines so they do not produce unaligned output rows.
            continue
        # Split on the LAST '#': the label is the trailing field, so a '#'
        # inside the text itself must not be mistaken for the separator.
        content, separator, label = record.rpartition('#')
        if not separator:
            raise ValueError(
                'line %d has no "#<label>" suffix: %r' % (line_no, record)
            )
        # One output line per record: tokens separated (and followed) by a space.
        filenoclass.write(''.join(seg + " " for seg in jieba.cut(content)) + '\n')
        fileclass.write(label + '\n')

(2)訓練doc2vec得到文字向量

import gensim
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.model_selection import train_test_split

# Open the segmented corpus and the label file. `fileclass` stays open
# because getData() reads the labels from it later.
file = open(u'/home/ubuntu/file/資料平衡無分類', encoding='utf-8')
fileclass = open(u'/home/ubuntu/file/資料平衡分類結果', encoding='utf-8')
# Each line of the corpus file is fed to Doc2Vec as one document.
documents = gensim.models.doc2vec.TaggedLineDocument(file)
# NOTE(review): gensim >= 4.0 names this parameter `vector_size` and exposes
# the vectors as `model.dv` -- confirm against the installed gensim version.
model = gensim.models.Doc2Vec(documents, size=100, window=8, min_count=100, workers=8)
# Show one learned document vector as a sanity check. This call used to be
# swallowed by the preceding comment and never ran.
print(model.docvecs[1])

(3)準備進行分類的資料

def getData():
    """Build the train/test split from the doc2vec vectors and the label file.

    Reads one label per line from the module-level ``fileclass`` handle and
    one vector per document from the module-level ``model``; row ``i`` of the
    feature frame (index ``'p<i>'``) is paired with label line ``i``.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)`` from a 60/40 split with
        ``random_state=0``. The feature frames hold only the vector
        components; the labels are Series named ``'class0'``.

    Raises:
        ValueError: If the number of labels differs from the number of
            document vectors.
    """
    # Rewind so the function can be called more than once on the same handle.
    fileclass.seek(0)
    tigs = [tig.strip() for tig in fileclass]
    data_dict = {
        'p' + str(i): model.docvecs[i] for i in range(len(model.docvecs))
    }
    print(tigs)
    print(data_dict)
    # The dict maps document -> vector, i.e. one column per document;
    # transpose so that each row is a document.
    features = pd.DataFrame(data_dict).T
    # Keep the labels OUT of the feature frame: splitting a frame that still
    # contains 'class0' hands the classifier the answer as an input feature.
    labels = pd.Series(tigs, index=features.index, name='class0')
    X_train1, X_test1, y_train1, y_test1 = train_test_split(
        features, labels, test_size=0.4, random_state=0
    )
    return X_train1, y_train1, X_test1, y_test1

(4)準備測試方法

def getRecognitionRate(testPre, testClass):
    """Return the fraction of predictions that equal the true labels.

    Args:
        testPre: Predicted labels.
        testClass: True labels in the same order as ``testPre``. May be a
            list, an array or a pandas Series; it is read positionally, so a
            Series whose index is shuffled or non-integer is handled.

    Returns:
        The accuracy as a float; ``0.0`` when ``testPre`` is empty.

    Raises:
        IndexError: If ``testClass`` has fewer entries than ``testPre``.
    """
    # Materialise both inputs so the comparison is by position, never by
    # index label (``series[i]`` is a label lookup on an integer index).
    predicted = list(testPre)
    actual = list(testClass)
    testNum = len(predicted)
    if testNum == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    if len(actual) < testNum:
        raise IndexError(
            'testClass has %d labels but testPre has %d predictions'
            % (len(actual), testNum)
        )
    rightNum = sum(1 for truth, guess in zip(actual, predicted) if truth == guess)
    return float(rightNum) / float(testNum)

(5)進行模型訓練

import gensim
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.model_selection import train_test_split

# Segmented corpus (one document per line) and its line-aligned label file.
# `fileclass` stays open because getData() reads the labels from it.
file = open(u'/home/ubuntu/file/資料平衡無分類', encoding='utf-8')
fileclass = open(u'/home/ubuntu/file/資料平衡分類結果', encoding='utf-8')

# Each line of the corpus file is fed to Doc2Vec as one document.
documents = gensim.models.doc2vec.TaggedLineDocument(file)
model = gensim.models.Doc2Vec(
    documents,
    size=100,
    window=8,
    min_count=100,
    workers=8,
)

# Print one learned document vector as a sanity check.
print(model.docvecs[1])
# Logistic regression is the classifier used for prediction.
def LR():
    """Return a new, unfitted LogisticRegression built with default arguments."""
    return LogisticRegression()
def getRecognitionRate(testPre, testClass):
    """Return the fraction of predictions that equal the true labels.

    Args:
        testPre: Predicted labels.
        testClass: True labels in the same order as ``testPre``. May be a
            list, an array or a pandas Series; it is read positionally, so a
            Series whose index is shuffled or non-integer is handled.

    Returns:
        The accuracy as a float; ``0.0`` when ``testPre`` is empty.

    Raises:
        IndexError: If ``testClass`` has fewer entries than ``testPre``.
    """
    # Materialise both inputs so the comparison is by position, never by
    # index label (``series[i]`` is a label lookup on an integer index).
    predicted = list(testPre)
    actual = list(testClass)
    testNum = len(predicted)
    if testNum == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    if len(actual) < testNum:
        raise IndexError(
            'testClass has %d labels but testPre has %d predictions'
            % (len(actual), testNum)
        )
    rightNum = sum(1 for truth, guess in zip(actual, predicted) if truth == guess)
    return float(rightNum) / float(testNum)
def getData():
    """Build the train/test split from the doc2vec vectors and the label file.

    Reads one label per line from the module-level ``fileclass`` handle and
    one vector per document from the module-level ``model``; row ``i`` of the
    feature frame (index ``'p<i>'``) is paired with label line ``i``.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)`` from a 60/40 split with
        ``random_state=0``. The feature frames hold only the vector
        components; the labels are Series named ``'class0'``.

    Raises:
        ValueError: If the number of labels differs from the number of
            document vectors.
    """
    # Rewind so the function can be called more than once on the same handle.
    fileclass.seek(0)
    tigs = [tig.strip() for tig in fileclass]
    data_dict = {
        'p' + str(i): model.docvecs[i] for i in range(len(model.docvecs))
    }
    print(tigs)
    print(data_dict)
    # The dict maps document -> vector, i.e. one column per document;
    # transpose so that each row is a document.
    features = pd.DataFrame(data_dict).T
    # Keep the labels OUT of the feature frame: splitting a frame that still
    # contains 'class0' hands the classifier the answer as an input feature.
    labels = pd.Series(tigs, index=features.index, name='class0')
    X_train1, X_test1, y_train1, y_test1 = train_test_split(
        features, labels, test_size=0.4, random_state=0
    )
    return X_train1, y_train1, X_test1, y_test1
# Split the data, fit the classifier on the training part and report the
# share of correctly predicted labels on the held-out part.
T = getData()
trainMatrix, trainClass, testMatrix, testClass = T

clf_LR = LR()
clf_LR.fit(trainMatrix, trainClass)

testPredictions = clf_LR.predict(testMatrix)
print(
    'Logistic Regression recognition rate: ',
    getRecognitionRate(testPredictions, testClass),
)