
[Natural Language Processing] Language Detection with Naive Bayes

First, let's take a look at the dataset:

Each line is essentially one sentence followed by its language label. The dataset covers six languages: English, French, German, Spanish, Italian, and Dutch.
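For illustration, each row looks roughly like the following (these two rows and their label codes are made up for this example, not taken from the actual file):

today is a beautiful day,en
het is vandaag een mooie dag,nl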

Start by importing the required packages:

import os
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

Next, read in the dataset:

def get_train_test_data():
    # Directory containing this file
    path_dir = os.path.dirname(os.path.abspath(__file__))
    # Path to the dataset
    data_path = os.path.join(path_dir, "Database", "data.csv")
    # Sentences
    data = []
    # Language labels
    label = []
    with open(data_path, 'r', encoding='utf-8') as fp:
        for line in fp:
            line = line.strip()
            if not line:
                continue
            # Split on the last comma so sentences containing commas stay intact
            sentence, lang = line.rsplit(",", 1)
            data.append(sentence)
            label.append(lang.strip())
    # Split into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(data, label, random_state=1)
    return x_train, x_test, y_train, y_test
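As a quick sanity check, note that train_test_split defaults to a 75/25 split (a sketch; the printed counts depend on the size of your CSV):

x_train, x_test, y_train, y_test = get_train_test_data()
print(len(x_train), len(x_test))   # roughly a 3:1 train/test ratio
print(x_train[0], y_train[0])      # one sentence and its language label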

Then filter out some of the noise:

\w matches any word character (letters, digits, and the underscore), \S matches any non-whitespace character, and + matches one or more repetitions of the preceding element.

def remove_noise(document):
    # Strip URLs, @mentions, and #hashtags before vectorising
    noise_pattern = re.compile("|".join([r"http\S+", r"@\w+", r"#\w+"]))
    clean_text = re.sub(noise_pattern, "", document)
    return clean_text.strip()
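A quick check on a made-up input shows the URL, mention, and hashtag being stripped:

print(remove_noise("Trump images are now more popular than cat gifs. @trump #trump2016 http://www.trumped.com"))
# Trump images are now more popular than cat gifs.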

The next step is to extract useful features from the denoised text: character 1-gram and 2-gram count statistics.

vec = CountVectorizer(
    lowercase=True,     # lowercase the text
    analyzer='char_wb', # tokenise by character ngrams
    ngram_range=(1,2),  # use ngrams of size 1 and 2
    max_features=1000,  # keep the most common 1000 ngrams
    preprocessor=remove_noise
)
vec.fit(x_train)
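To get a feel for what char_wb n-grams are, you can inspect a toy vectorizer's vocabulary (a sketch; get_feature_names_out requires scikit-learn >= 1.0, older versions expose get_feature_names instead):

toy = CountVectorizer(analyzer='char_wb', ngram_range=(1, 2))
toy.fit(["hello world"])
print(toy.get_feature_names_out())
# single characters plus pairs such as ' h', 'he', 'o ' -- the padding
# spaces mark word boundaries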

def get_features(x):
    return vec.transform(x)

Finally, train the classifier and evaluate it:

classifier = MultinomialNB()
classifier.fit(vec.transform(x_train), y_train)
print(classifier.score(vec.transform(x_test), y_test))
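A single accuracy number can hide per-language differences; a minimal sketch of a per-language breakdown using scikit-learn's classification_report:

from sklearn.metrics import classification_report

y_pred = classifier.predict(vec.transform(x_test))
print(classification_report(y_test, y_pred))  # precision, recall, and F1 per language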

Now let's put all of the above together into a single class:

import os
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

def get_train_test_data():
    # Directory containing this file
    path_dir = os.path.dirname(os.path.abspath(__file__))
    # Path to the dataset
    data_path = os.path.join(path_dir, "Database", "data.csv")
    # Sentences
    data = []
    # Language labels
    label = []
    with open(data_path, 'r', encoding='utf-8') as fp:
        for line in fp:
            line = line.strip()
            if not line:
                continue
            # Split on the last comma so sentences containing commas stay intact
            sentence, lang = line.rsplit(",", 1)
            data.append(sentence)
            label.append(lang.strip())
    # Split into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(data, label, random_state=1)
    return x_train, x_test, y_train, y_test

class LanguageDetector:
    def __init__(self, classifier=MultinomialNB()):
        self.classifier = classifier
        self.vectorizer = CountVectorizer(
            lowercase=True,
            analyzer='char_wb',
            ngram_range=(1, 2),
            max_features=1000,
            preprocessor=self._remove_noise,
        )

    def _remove_noise(self, document):
        # Strip URLs, @mentions, and #hashtags before vectorising
        noise_pattern = re.compile("|".join([r"http\S+", r"@\w+", r"#\w+"]))
        clean_text = re.sub(noise_pattern, "", document)
        return clean_text.strip()

    def features(self, X):
        return self.vectorizer.transform(X)

    def fit(self, X, y):
        self.vectorizer.fit(X)
        self.classifier.fit(self.features(X), y)

    def predict(self, x):
        return self.classifier.predict(self.features([x]))

    def score(self, X, y):
        return self.classifier.score(self.features(X), y)


language_detector = LanguageDetector()
x_train,x_test,y_train,y_test = get_train_test_data()
language_detector.fit(x_train, y_train)
print(language_detector.predict('This is an English sentence'))
print(language_detector.score(x_test, y_test))

Final result: running the script prints the predicted label for the English sentence, followed by the accuracy on the test set.
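If the trained detector needs to be reused without retraining, it can be persisted; a minimal sketch assuming joblib is installed (the filename is arbitrary):

import joblib

joblib.dump(language_detector, "language_detector.joblib")
loaded = joblib.load("language_detector.joblib")
print(loaded.predict("Ceci est une phrase française"))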

Data and code: https://pan.baidu.com/s/1tjHcnZuEdGpDb9vtCHYRWA (extraction code: aqfs)