1. 程式人生 > >python--電影評論文字情感分類

python--電影評論文字情感分類

本文用於記錄 Kaggle 學習心得。

本文參考了以下文章與書籍：

1.http://www.cnblogs.com/lijingpeng/p/5787549.html

2.《Python機器學習及實戰》

import re

import nltk
from bs4 import BeautifulSoup
from sklearn.datasets import fetch_20newsgroups

# Load the dataset before touching it: the original order read `news`
# ahead of the line that assigns it, which raises NameError on a fresh run.
news = fetch_20newsgroups(subset='all')
X, y = news.data, news.target

# Inspect the length of X, of its first element, and of that element's
# first item.
print(len(X), len(X[0]), len(X[0][0]))
def news_to_sentences(news):
    """Split one document into sentences, each a list of lowercase words.

    Args:
        news: Text of a single document; any HTML markup is stripped first.

    Returns:
        A list with one entry per sentence produced by the punkt tokenizer.
        Each entry is the list of whitespace-separated tokens that remain
        after lowercasing the sentence and replacing every character
        outside a-z / A-Z with a space.
    """
    # Drop HTML tags and keep only the text content.
    # NOTE(review): no parser is named here, so bs4 chooses one itself;
    # consider passing a parser explicitly for reproducible results.
    news_text = BeautifulSoup(news).get_text()
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sentences = tokenizer.tokenize(news_text)
    # Lowercase each sentence, use the regex to keep only the letters that
    # match the expected form, and split the result into a word list.
    return [
        re.sub('[^a-zA-Z]', ' ', sent.lower().strip()).split()
        for sent in raw_sentences
    ]
# Flatten every document's sentence list into one corpus-wide list.
sentences = []
for x in X:
    sentences.extend(news_to_sentences(x))
from gensim.models import word2vec


# Word2Vec training settings; the trailing note names the keyword argument
# each value is passed as when the model is built.
num_features = 300     # size
min_word_count = 20    # min_count
num_workers = 2        # workers
context = 5            # window
downsampling = 1e-3    # sample


from gensim.models import word2vec

# Train the word vectors on the tokenised sentences.
# NOTE(review): `size=` and `model.most_similar` are the pre-4.0 gensim
# spellings (4.x uses `vector_size=` and `model.wv.most_similar`); confirm
# the installed gensim version before running.
model = word2vec.Word2Vec(
    sentences,
    size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)

model.init_sims(replace=True)
# The result is not stored or printed, so it is only visible in an
# interactive session.
model.most_similar('morning')
import re

import nltk
from bs4 import BeautifulSoup
from sklearn.datasets import fetch_20newsgroups

# Assign `news` before reading it so this snippet runs on its own; the
# original order used `news` ahead of the line that fetches it.
news = fetch_20newsgroups(subset='all')
X, y = news.data, news.target

# Inspect the length of X, of its first element, and of that element's
# first item.
print(len(X), len(X[0]), len(X[0][0]))
def news_to_sentences(news):
    """Turn one document into a list of sentences made of lowercase words.

    Args:
        news: Text of a single document, handed to BeautifulSoup so that
            only its text content is kept.

    Returns:
        One word list per sentence found by the punkt tokenizer. Words are
        the whitespace-separated pieces left after lowercasing and
        replacing every character outside a-z / A-Z with a space.
    """
    plain_text = BeautifulSoup(news).get_text()
    splitter = nltk.data.load('tokenizers/punkt/english.pickle')

    def to_words(sentence):
        # Non-letters become spaces so split() yields alphabetic tokens only.
        return re.sub('[^a-zA-Z]', ' ', sentence.lower().strip()).split()

    return [to_words(raw) for raw in splitter.tokenize(plain_text)]
# Collect the tokenised sentences of every document into a single list.
sentences = []
for x in X:
    sentences.extend(news_to_sentences(x))
from gensim.models import word2vec


# Settings handed to Word2Vec below; the keyword each one feeds is noted
# alongside.
num_features = 300     # size
min_word_count = 20    # min_count
num_workers = 2        # workers
context = 5            # window
downsampling = 1e-3    # sample


from gensim.models import word2vec

# Build the Word2Vec model from the sentence list using the settings above.
# NOTE(review): written against the pre-4.0 gensim API (`size=`,
# `model.most_similar`); confirm the installed version supports these.
model = word2vec.Word2Vec(
    sentences,
    size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)

model.init_sims(replace=True)
# Bare expression: the neighbours of 'morning' are shown only when this is
# run interactively.
model.most_similar('morning')