# 程式人生 > 對文字抽取詞袋模型特徵



from sklearn.feature_extraction.text import CountVectorizer


# Bag-of-words vectorizer working on whole words (not character n-grams).
vec = CountVectorizer(
    analyzer='word',     # tokenise the text into words
    max_features=4000,   # keep only the 4000 most frequent words
)

# Learn the vocabulary from the training texts; transform() cannot be
# called on a vectorizer that has not been fitted.
vec.fit(x_train)





from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes classifier trained on the bag-of-words counts.
classifier = MultinomialNB()

# vec.transform(x_train) converts the training samples into a count matrix
# of shape [n_samples, n_features], where n_features is capped by the
# vectorizer's max_features setting.
classifier.fit(vec.transform(x_train), y_train)



# Replacement vectorizer that counts word n-grams instead of single words:
#   analyzer='word'     -> tokens are words, so n-grams are word sequences
#   ngram_range=(1, 4)  -> use every n-gram of 1 up to 4 consecutive words
#   max_features=20000  -> cap the vocabulary at the 20000 most frequent n-grams
vec = CountVectorizer(analyzer='word', ngram_range=(1, 4), max_features=20000)



# StratifiedKFold lives in sklearn.model_selection; the older
# sklearn.cross_validation module no longer exists in scikit-learn >= 0.20.
from sklearn.model_selection import StratifiedKFold

# x is the training data and y holds the labels. Each fold uses
# (n_folds - 1) parts for training and 1 part for testing, i.e. the
# train_index : test_index ratio is 4:1 when n_folds is 5.
#
# The splitter is configured with n_splits/shuffle and the labels are passed
# to split(). The result is materialised as a list so that stratifiedk_fold
# is a reusable sequence of (train_index, test_index) pairs that can be
# iterated directly.
stratifiedk_fold = list(
    StratifiedKFold(n_splits=n_folds, shuffle=shuffle).split(x, y)
)

    # NOTE(review): this loop is indented one level although the preceding
    # statement is at column 0 -- presumably it belongs to a function body
    # whose `def` line is not visible here; confirm before running as-is.
    # Each item of stratifiedk_fold is unpacked into a pair of index
    # collections: one for the training rows, one for the held-out rows.
    for train_index, test_index in stratifiedk_fold:

        # Pick the training and test rows of x for this fold.
        # Assumes x and y support indexing by an index array (e.g. NumPy
        # arrays) -- TODO confirm against where x and y are created.
        X_train, X_test = x[train_index], x[test_index]

        # Labels that correspond to the training rows of this fold.
        y_train = y[train_index]