1. 程式人生 > >使用樸素貝葉斯進行社會媒體挖掘之推特

使用樸素貝葉斯進行社會媒體挖掘之推特

前言:
本文參考《python資料探勘入門與實踐》第六章,進行twitter社會媒體挖掘。學完後感覺對社會媒體挖掘領域很有興趣,往後會深入研究。還有就是本文採用ipython編輯,程式碼後面緊跟著輸出,注意不要混淆。

正文:

## 下載新的Twitter語料 授權令牌資訊獲取網址:https://apps.twitter.com/ 具體操作可google。 以下XXXX為需填入的資訊,如果不想自己下載可以在文末網盤找到我的資料,跳過此步驟。 匯入twitter庫,設定授權令牌
import twitter

# Twitter API credentials — create an app at https://apps.twitter.com/
# and paste the four tokens below (XXX placeholders).
consumer_key = "XXX"
consumer_secret = "XXXX"
access_token = "XXXX"
access_token_secret = "XXXX"
# OAuth object used by every subsequent Twitter API call.
# Argument order required by the `twitter` package: token, token secret,
# then consumer key, consumer secret.
authorization = twitter.OAuth(access_token, access_token_secret,
                              consumer_key, consumer_secret)
使用twitter庫提供的search函式查詢“python”的訊息 PS:一定要開vpn的全域性模式,否則會一直出現連線超時錯誤(錯誤碼10060)
# Where to store the downloaded tweets.
import os
import json

# NOTE(review): the original wrote to a directory WITHOUT the "chapter_6"
# prefix, but every later cell reads from the "chapter_6..." directory —
# the downloaded file would never be found. Use the same path as the readers.
output_filename = os.path.join(
    "E:\DataMining\Project\dataming_with_python\chapter_6樸素貝葉斯社會媒體挖掘",
    "python_tweets.json")

# Object used to read data from Twitter (authenticated with `authorization`).
t = twitter.Twitter(auth=authorization)
with open(output_filename, 'a') as output_file:
    # Only the "statuses" part of the search response is needed.
    search_results = t.search.tweets(q="python", count=100)['statuses']
    for tweet in search_results:
        # Only objects containing "text" are actual tweet messages.
        if 'text' in tweet:
            output_file.write(json.dumps(tweet))
            # Blank line between tweets; the loader skips blank lines.
            output_file.write("\n\n")
## 載入資料集並對其進行分類
import os
import json

input_filename = os.path.join(
    "E:\DataMining\Project\dataming_with_python\chapter_6樸素貝葉斯社會媒體挖掘",
    "python_tweets.json")
classes_filename = os.path.join(
    "E:\DataMining\Project\dataming_with_python\chapter_6樸素貝葉斯社會媒體挖掘",
    "python_classes.json")

# Load the dataset: one JSON-encoded tweet per non-blank line.
tweets = []
with open(input_filename) as tweet_file:
    for raw_line in tweet_file:
        if not raw_line.strip():
            # Blank separator line written between tweets — skip it.
            continue
        # Decode the JSON string back into a Python object.
        tweets.append(json.loads(raw_line))
print("Loaded {} tweets".format(len(tweets)))
Loaded 95 tweets
tweet_sample = tweets

# Reload any labels saved by a previous labelling session; otherwise
# start with an empty label list.
labels = []
if os.path.exists(classes_filename):
    with open(classes_filename) as saved:
        labels = json.load(saved)
print(len(labels))
0
def get_tweet():
    """Return the text of the next tweet awaiting a label.

    The next tweet is the one at index ``len(labels)``: each labelled
    tweet appends one entry to ``labels``, so the label count is also
    the position of the first unlabelled tweet.
    """
    next_index = len(labels)
    return tweet_sample[next_index]['text']
使用魔法函式%%html 在ipython中進行html/javascript程式碼從而實現互動
%%html
<!-- Interactive labelling widget for the IPython notebook: shows one tweet
     at a time and records a 0/1 relevance label on each keypress. -->
<div name="tweetbox">
    Instructions: Click in text box. Enter a 1 if the tweet is relevant, enter 0 otherwise.<br>
    Tweet: <div id="tweet_text" value="text"></div><br>
    <input type=text id="capture"></input><br>
</div>

<script>
// Append the chosen label to the Python-side `labels` list via the
// notebook kernel, then advance to the next tweet.
function set_label(label){
    var kernel = IPython.notebook.kernel;
    kernel.execute("labels.append(" + label + ")");
    load_next_tweet();
}

// Ask the kernel to evaluate get_tweet(); the result arrives
// asynchronously on the iopub channel and is handled by handle_output.
function load_next_tweet(){
    console.log("1");
   var code_input = "get_tweet()";
    console.log("2");
   var kernel = IPython.notebook.kernel;
    console.log("3");
   var callbacks = { 'iopub' : {'output' : handle_output}};
    console.log("4");
   kernel.execute(code_input, callbacks, {silent:false});
    console.log("5");
}

// Display the kernel's text/plain result in the tweet_text div.
function handle_output(out){
   console.log(out);
   var res = out.content.data["text/plain"];
   $("div#tweet_text").html(res);
}

// Key handler: '0' (keycode 48) = not relevant, '1' (keycode 49) = relevant.
// The input box is cleared after each keypress.
$("input#capture").keypress(function(e) {
    console.log(e);
  if(e.which == 48) {
    // 0 pressed
    set_label(0);
    $("input#capture").val("");
  }else if (e.which == 49){
    // 1 pressed
    set_label(1);  
    $("input#capture").val("");
  }
});

// Kick off the loop by showing the first unlabelled tweet.
load_next_tweet();
</script>
Instructions: Click in text box. Enter a 1 if the tweet is relevant, enter 0 otherwise.
Tweet:

function set_label(label){ var kernel = IPython.notebook.kernel; kernel.execute("labels.append(" + label + ")"); load_next_tweet(); } function load_next_tweet(){ console.log("1"); var code_input = "get_tweet()"; console.log("2"); var kernel = IPython.notebook.kernel; console.log("3"); var callbacks = { 'iopub' : {'output' : handle_output}}; console.log("4"); kernel.execute(code_input, callbacks, {silent:false}); console.log("5"); } function handle_output(out){ console.log(out); var res = out.content.data["text/plain"]; ("div#tweet_text").html(res);}("input#capture").keypress(function(e) { console.log(e); if(e.which == 48) { // 0 pressed set_label(0); ("input#capture").val("");  }else if (e.which == 49){    // 1 pressed    set_label(1);("input#capture").val(""); } }); load_next_tweet();
print(len(labels))
97

將類別以json儲存

# Persist the collected labels as JSON so the session can be resumed later.
with open(classes_filename, 'w') as label_file:
    json.dump(labels, label_file)

貝葉斯定理,用二值化的詞袋模型來分類

建立流水線,接收訊息,僅根據訊息內容,判斷是否跟程式設計python有關

使用NLTK抽取單詞是否出現作為特徵

# Base transformer (needed because NLTK's interface is not consistent
# with scikit-learn's transformer interface).
from sklearn.base import TransformerMixin
from nltk import word_tokenize
import nltk
nltk.download('punkt')  # punkt tokenizer models required by word_tokenize

class NLTKBOW(TransformerMixin):
    """Binary bag-of-words transformer backed by NLTK's tokenizer.

    transform() maps each document to a dict whose keys are the tokens
    that occur in it and whose values are all True (word presence only,
    no counts) — the shape expected by DictVectorizer downstream.
    """

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn from the data.
        return self

    def transform(self, X):
        # One dict per document; every token that appears maps to True.
        feature_dicts = []
        for document in X:
            feature_dicts.append(dict.fromkeys(word_tokenize(document), True))
        return feature_dicts
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.

將字典轉換為矩陣,DictVectorizerl類

from sklearn.feature_extraction import DictVectorizer

訓練樸素貝葉斯分類器,這裡用專門用於二值特徵分類的BernoulliNB分類器

from sklearn.naive_bayes import BernoulliNB

組裝成流水線,用F1值評估,F1 = 2*precision*recall/(precision+recall)

import os
import json

# sklearn renamed `cross_validation` to `model_selection` in 0.18 and
# removed the old module in 0.20; try the current name first so the
# script runs on both old and new sklearn versions.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

input_filename = os.path.join(
    "E:\DataMining\Project\dataming_with_python\chapter_6樸素貝葉斯社會媒體挖掘",
    "python_tweets.json")
classes_filename = os.path.join(
    "E:\DataMining\Project\dataming_with_python\chapter_6樸素貝葉斯社會媒體挖掘",
    "python_classes.json")

# Extract only the 'text' value of each stored tweet.
tweets = []
with open(input_filename) as inf:
    for line in inf:
        if len(line.strip()) == 0:  # skip blank separator lines
            continue
        tweets.append(json.loads(line)['text'])
with open(classes_filename) as inf:
    labels = json.load(inf)  # fixed misspelling: was `lables`

# Pipeline: tokenize -> dict-to-matrix -> BernoulliNB (suited to the
# binary word-presence features produced by NLTKBOW).
from sklearn.pipeline import Pipeline
pipeline = Pipeline([('bag-of-words', NLTKBOW()),
                     ('vectorizer', DictVectorizer()),
                     ('naive-bayes', BernoulliNB())])
# Cross-validated F1 score: F1 = 2*precision*recall / (precision+recall).
scores = cross_val_score(pipeline, tweets, labels, scoring='f1')
import numpy as np
print("Score: {:.3f}".format(np.mean(scores)))
Score: 0.223

最後:
用到的資料,有用的話點個讚唄

 連結:https://pan.baidu.com/s/1kRhameSX96GXYWlw4E226g 密碼:847k

———關注我的公眾號,一起學資料探勘————
這裡寫圖片描述