1. 程式人生 > >利用skift實現fasttext模型

利用skift實現fasttext模型

skift: 用於Python fastText的scikit-learn 包裝器

什麼是 skift?

skift包括幾個scikit-learn相容包裝器,裡面封裝了fasttext模型,fasttext原理類似於word2vec,主要用於文字快速分類。其優勢在於分類速度快,使用n-gram特徵容易獲得文字句子區域性資訊、構造新詞。缺點是隨著語料的增長,記憶體需求也會增長。那麼如何解決記憶體問題呢?fasttext提出了三種解決方法,包括

  1. 過濾掉出現次數少的詞;
  2. 使用Hash儲存
  3. 採用word粒度,而非char粒度
    例如句子: 我喜歡去中國, 如果採用char粒度,則使用2-gram的話,產生的特徵為
    我喜 喜歡 歡去 去中 中國
    如果採用word粒度的話,產生的特徵為
    我喜歡 喜歡去 去中國

關於fasttext原理比較好的參考有《FastText文字分類演算法學習筆記》、《FastText的內部機制》,這裡不再詳細闡述。

下面使用skift實現fastText來做細粒度情感分析

from tqdm import tqdm
from skift import FirstColFtClassifier
from sklearn.model_selection import KFold
import numpy as np
import os
import pickle

class BasicModel(object):
    """Base class for the multi-aspect text classifiers in this pipeline.

    Subclasses override ``create_model`` and reuse the shared mini-batch
    generator and micro-F1 scoring helpers defined here.
    """

    def __init__(self):
        pass

    def create_model(self, kfold_X_train, y_train, kfold_X_test, y_test, test):
        """Build and return the underlying estimator; overridden by subclasses."""
        pass

    # Generate batches
    def batch_iter(self, data, batch_size, num_epochs=1, shuffle=True):
        """Yield mini-batches of ``data`` for ``num_epochs`` epochs.

        When ``shuffle`` is true the order is re-permuted at the start of every
        epoch. The last batch of an epoch may be shorter than ``batch_size``.
        """
        data = np.array(data)
        data_size = len(data)
        num_batches_per_epoch = int((data_size - 1) / batch_size) + 1
        for epoch in range(num_epochs):
            if shuffle:
                shuffle_indices = np.random.permutation(np.arange(data_size))
                shuffled_data = data[shuffle_indices]
            else:
                shuffled_data = data
            for batch_num in range(num_batches_per_epoch):
                start_index = batch_num * batch_size
                end_index = min((1 + batch_num) * batch_size, data_size)
                yield shuffled_data[start_index:end_index]

    def get_f1_score(self, x, y, verbose=False):
        """Return an F1 score between predictions ``x`` and labels ``y``.

        Label 0 is treated as the negative/background class: true positives
        are exact matches on non-zero labels; false positives count non-zero
        predictions that are spurious or disagree with a non-zero label;
        false negatives count missed non-zero labels. Small epsilons guard
        against division by zero.
        """
        tp = np.sum(np.logical_and(y > 0, x == y))
        # Over-prediction (y == 0 but x > 0) plus wrong non-zero prediction.
        fp = np.sum(np.logical_and(x > 0, y == 0)) + np.sum(np.logical_and(x * y > 0, y != x))
        # Missed detection: non-zero label predicted as 0.
        fn = np.sum(np.logical_and(y > 0, x == 0))
        P = float(tp) / (float(tp + fp) + 1e-8)
        R = float(tp) / (float(tp + fn) + 1e-8)
        F = 2 * P * R / (P + R + 1e-8)
        if verbose:
            print('P->', P)
            print('R->', R)
            print('F->', F)
        return F


class BasicStaticModel(BasicModel):
    """K-fold cross-validation driver for the 10-aspect, 4-class task."""

    def __init__(self, config=None, n_folds=5, name='BasicStaticModel'):
        super().__init__()
        self.n_folds = n_folds
        self.name = name
        self.config = config
        # Fixed random_state keeps fold assignment reproducible across runs.
        self.kf = KFold(n_splits=n_folds, shuffle=True, random_state=10)

    def train_predict(self, train, train_y, test, option=None):
        """Train one model per aspect per fold; pickle OOF and test predictions.

        Parameters
        ----------
        train, test : feature arrays (indexable by row).
        train_y : label array of shape (n_samples, 10) — one column per aspect.
        option : unused; kept for interface compatibility.

        Side effects: writes two pickles under ``../data/result-ml/`` named
        after the model and its mean F1, and prints progress throughout.
        """
        name = self.name
        # 10 aspects, 4 classes each.
        predict = np.zeros((test.shape[0], 10, 4))
        oof_predict = np.zeros((train.shape[0], 10, 4))
        scores_f1 = []
        for train_index, dev_index in self.kf.split(train):
            kfold_X_train, kfold_X_val = train[train_index], train[dev_index]
            y_train, y_dev = train_y[train_index], train_y[dev_index]
            model_dict = {}
            print('start train model:')
            for idx in tqdm(range(10)):
                label = y_train[:, idx]
                model = self.create_model()
                model.fit(kfold_X_train, label)
                model_dict[idx] = model
            print('complete train model')
            print('start validate model')
            f1_scores = []
            for idx in tqdm(range(10)):
                label_dev = y_dev[:, idx]
                model = model_dict[idx]
                dev_prob = model.predict_proba(kfold_X_val)
                test_prob = model.predict_proba(test)
                oof_predict[dev_index, idx] = dev_prob
                # Average test-set probabilities across folds.
                predict[:, idx] += test_prob / self.n_folds
                dev_predict = np.argmax(dev_prob, 1)
                f1_scores.append(self.get_f1_score(dev_predict, label_dev))
            f1_score = np.mean(f1_scores)
            scores_f1.append(f1_score)
            print('f1_scores-> ', f1_scores)
            print('f1_score: ', f1_score)
            # Debug mode: stop after the first fold. Fix: the original
            # `self.config.is_debug == True` raised AttributeError when
            # config was None (the default); guard explicitly instead.
            # NOTE: breaking early leaves `predict` under-scaled (only
            # 1/n_folds of the probabilities accumulated).
            if self.config is not None and getattr(self.config, 'is_debug', False):
                break
        print('Total f1->', scores_f1)
        print("Total f1'mean is ", np.mean(scores_f1))
        # Persist results (out-of-fold and averaged test predictions).
        os.makedirs('../data/result-ml', exist_ok=True)
        with open('../data/result-ml/{}_oof_f1_{}.pkl'.format(name, str(np.mean(scores_f1))), 'wb') as f:
            pickle.dump(oof_predict, f)
        with open('../data/result-ml/{}_pre_f1_{}.pkl'.format(name, str(np.mean(scores_f1))), 'wb') as f:
            pickle.dump(predict, f)
        print('done')


class Fasttext(BasicStaticModel):
    """fastText classifier built on skift's scikit-learn wrapper."""

    def __init__(self, name='basicModel', n_folds=5, config=None):
        super().__init__(name=name, n_folds=n_folds, config=config)

    def create_model(self):
        # Override: FirstColFtClassifier reads the text from the first
        # column of the input; unigram features, min word count 5.
        sk_clf = FirstColFtClassifier(lr=1.0, epoch=10, wordNgrams=1, minCount=5, verbose=2)
        return sk_clf