
NLP Assignment A2


Assignment: link

Unigram model

1. Creating the word_to_index dictionary

Read the vocabulary txt file into a dictionary, then write it back out to another txt file.

import codecs
from generate import GENERATE

# TODO: read brown_vocab_100.txt into word_index_dict
vocabs = codecs.open("brown_vocab_100.txt", "r", "utf-16")
word_index_dict = {i.rstrip(): index for index, i in enumerate(vocabs.readlines())}

# TODO: write word_index_dict to word_to_index_100.txt
with open("word_to_index_100.txt", "w") as wf:
    for index, i in enumerate(word_index_dict.items()):
        c = i[0] + ' ' + str(i[1]) + '\n'
        wf.write(c)

print(word_index_dict['all'])
print(word_index_dict['resolution'])
print(len(word_index_dict))

2. Building an MLE unigram model

Count the word frequencies and compute the unigram probabilities.

import numpy as np

vocab = codecs.open("brown_vocab_100.txt", "r", encoding="utf-16")
word_index_dict = {i.rstrip(): index for index, i in enumerate(vocab.readlines())}
with codecs.open("brown_100.txt", "r", 'utf-16') as f:
    text = f.read().lower()

# TODO: iterate through file and update counts
# surround each word with spaces so that e.g. 'all' is not matched inside 'allen';
# sentences in the text end with ' . ' (space, period, space)
counts = np.array([text.count(' ' + word + ' ') if not word == '<s>' else text.count(word) for word in word_index_dict])
# TODO: normalize and write out counts.
prob = counts / counts.sum()

with open("unigram_probs.txt", "w") as wf:
    for index, i in enumerate(prob):
        # look the key (word) up from its value (index)
        word = list(word_index_dict.keys())[list(word_index_dict.values()).index(index)]
        c = word + ' ' + str(prob[index]) + '\n'
        wf.write(c)

This is actually somewhat risky: what if enumerate(word_index_dict) iterates in an unexpected order? (CPython 3.7+ dicts preserve insertion order, but relying on that implicitly is fragile.)
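
A minimal, order-independent variant of the write-out loop (a sketch that only uses names already defined above) avoids the value-to-key search entirely by iterating the dictionary itself:

# write word/probability pairs straight from the dictionary, so the output
# never depends on recovering a key from its value
with open("unigram_probs.txt", "w") as wf:
    for word, index in word_index_dict.items():
        wf.write(word + ' ' + str(prob[index]) + '\n')

The assignment also comes with a script for generating sentences from the model: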

returnSTR = ""
index_word_dict = {v: k for k, v in word_index_dict.items()}
num_words = 0
max_words = 20
probs = prob
# using https://stackoverflow.com/questions/483666/python-reverse-invert-a-mapping
while(True):
    # sample the next word according to the unigram probabilities
    wordIndex = np.random.choice(len(word_index_dict), 1, p=list(probs))  # returns an array like [index], hence wordIndex[0]
    word = index_word_dict[wordIndex[0]]
    returnSTR += word + " "
    num_words += 1
    if word == "</s>" or num_words == max_words:
        break
print(returnSTR)

Two sentences generated at random; they are complete nonsense:

not worth , or . the , receives its the this term for or superintendent the or as on i

he i in . end wife i it can force . details i he these i he by despite a 

As for the sub-question in the assignment: compared with a large corpus, will a small corpus have a higher or lower proportion of words that occur only once?

sum(prob == 1/counts.sum()) / len(word_index_dict)  # output = 0.5633

The proportion of once-only words seems awfully high. Because this dictionary is incomplete (it was built from this same small corpus), $p_{zero} = 0$ here; on a larger corpus, $p_{once}$ would certainly decrease as well.

import matplotlib.pyplot as plt
from matplotlib import rcParams
vocab = codecs.open("brown_vocab_100.txt", "r", encoding="utf-16")

word_index_dict = {i.rstrip():index for index,i in enumerate(vocab.readlines())}
with codecs.open("brown_100.txt", "r",'utf-16') as f:
    text = f.read().lower()
#TODO: iterate through file and update counts
divide = [0.25, 0.5, 0.75, 1]
prob_1 = []
prob_0 = []
for i in divide:
    text2 = text[:round(len(text)*i)]
    counts = np.array([text2.count(' '+word+' ') for word in word_index_dict])
    #TODO: normalize and writeout counts. 
    prob = counts/counts.sum()
    prob_1.append(sum(prob==1/counts.sum())/len(word_index_dict))
    prob_0.append(sum(prob==0/counts.sum())/len(word_index_dict))
prob_0 = np.asarray(prob_0)
prob_1 = np.asarray(prob_1)

# plot the results
plt.rcParams['figure.figsize'] = (9.0, 10.0)
def plot_result(y,x,xlabel='Number of words in the corpus',ylabel = 'Prob',title='Probabilities of the word occurred X times'):
    y = np.array(y)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.grid(True)
    plt.plot(x, y)
    # plt.show()
    
x = np.asarray([round(len(text)*i) for i in divide])

plt.subplot(2,1,1)
plot_result(prob_1,x)
plot_result(prob_0,x)
plt.legend(['Once','Zero'], loc='best')

plt.subplot(2,1,2)
plot_result(prob_1+prob_0,x,ylabel='Sum of probs',title='The probabilities of the word occurred zero and one times')

plt.tight_layout()
plt.savefig('result.png')
plt.show()

[Figure result.png: probabilities of a word occurring once and zero times, and their sum, versus corpus size]

Bigram models

3. Building an MLE bigram model

The bigram model:

import codecs
import numpy as np
from sklearn.preprocessing import normalize
from generate import GENERATE
import random

# the bigram model is handled quite differently from the unigram model
# load the indices dictionary
with codecs.open("brown_vocab_100.txt", "r", encoding="utf-16") as vocab:
    Dict = {i.rstrip(): index for index, i in enumerate(vocab.readlines())}
# build an 813x813 matrix of bigram strings; when counting, a trailing space is
# appended so that 'all the' is not confused with 'all there'
Dict2 = np.asarray([j[0] + ' ' + j[1] for word in Dict for j in zip([word] * len(Dict), Dict)]).reshape([813] * 2)


#TODO: iterate through file and update counts
with codecs.open("brown_100.txt", "r",'utf-16') as f:
    text = f.read().lower()


# 多維array裡,一個個元素迭代,readwrite允許讀寫
it = np.nditer(Dict2, flags=['multi_index'])
count2 = []


while not it.finished:
    # </s> \r\n<s> 句尾和另一句的接頭是這樣子,不在字典內
    if '<s>' in it.value.tolist(): 
        count = text.count(Dict2[it.multi_index]+' ')
    else:   
        count = text.count(' '+Dict2[it.multi_index]+' ')
    
    count2.append(count)
    it.iternext()
# 去掉了
count2 = np.asarray(count2).reshape([813]*2)

#TODO: normalize counts
probs = normalize(count2, norm='l1', axis=1)
# p(the | all)
print(probs[Dict2 == 'all the'])
# p(jury | the)
print(probs[Dict2 == 'the jury'])
# p(campaign | the)
print(probs[Dict2 == 'the campaign'])
# p(calls | anonymous)
print(probs[Dict2 == 'anonymous calls'])
[1.]
[0.08333333]
[0.00641026]
[0.33333333]
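
As an aside, the loop above calls text.count once for each of the 813×813 candidate bigrams, which is slow. A minimal alternative sketch (my own, assuming tokens are whitespace-separated and the sentence markers are part of the vocabulary; counts at sentence boundaries may differ slightly from the substring-matching approach) scans adjacent token pairs instead, and indexes entries by row and column rather than with the boolean mask probs[Dict2 == 'all the']:

# count bigrams by scanning adjacent token pairs line by line
count2_alt = np.zeros((len(Dict), len(Dict)))
for line in text.splitlines():
    tokens = line.split()
    for w1, w2 in zip(tokens, tokens[1:]):
        if w1 in Dict and w2 in Dict:
            count2_alt[Dict[w1], Dict[w2]] += 1

probs_alt = normalize(count2_alt, norm='l1', axis=1)
print(probs_alt[Dict['all'], Dict['the']])   # p(the | all)
print(probs_alt[Dict['the'], Dict['jury']])  # p(jury | the)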

Generating from the model:

# def GENERATE(word_index_dict, probs, model_type, max_words, start_word):
start_word = "<s>"
max_words = 20
returnSTR = ""
index_word_dict = {v: k for k, v in Dict.items()}
num_words = 0

returnSTR = start_word + " "
prevWord = start_word
while(True):
    # sample the next word from the bigram-matrix row indexed by the previous word
    wordIndex = np.random.choice(len(Dict), 1, p=list(probs[Dict[prevWord]]))
    word = index_word_dict[wordIndex[0]]
    returnSTR += word + " "
    prevWord = word
    num_words += 1
    if word == "</s>" or num_words == max_words:
        break
print(returnSTR)

Still not entirely coherent (though the first sentence is actually pretty good), but far better than the unigram model:

<s> it was the county democratic executive committee . </s> 
<s> the size of sunday night in a proportionate distribution of this problem . </s>

4. Add-α smoothing the bigram model

Add-one (Laplace) smoothing and add-0.1 smoothing:

# Laplace smoothing
count2_laplace = count2+1
probs_laplace = normalize(count2_laplace, norm='l1', axis=1)


# p(the | all)
print(probs_laplace[Dict2 == 'all the'])
# p(jury | the)
print(probs_laplace[Dict2 == 'the jury'])
# p(campaign | the)
print(probs_laplace[Dict2 == 'the campaign'])
# p(calls | anonymous)
print(probs_laplace[Dict2 == 'anonymous calls'])
[0.002457]
[0.01444788]
[0.00206398]
[0.00245098]
# add-α smoothing
count2_alpha = count2+0.1
probs_alpha = normalize(count2_alpha, norm='l1', axis=1)


# p(the | all)
print(probs_alpha[Dict2 == 'all the'])
# p(jury | the)
print(probs_alpha[Dict2 == 'the jury'])
# p(campaign | the)
print(probs_alpha[Dict2 == 'the campaign'])
# p(calls | anonymous)
print(probs_alpha[Dict2 == 'anonymous calls'])
[0.01336574]
[0.05520438]
[0.00463548]
[0.01304864]
                        Original       Laplace smoothing (add one)   α smoothing (add 0.1)
p(the | all)            1.0            0.002457                      0.01336574
p(jury | the)           0.08333333     0.01444788                    0.05520438
p(campaign | the)       0.00641026     0.00206398                    0.00463548
p(calls | anonymous)    0.33333333     0.00245098                    0.01304864

Q: Why did all four probabilities go down in the smoothed model? Notice also that the probabilities did not all decrease by the same amount: the two probabilities conditioned on 'the' dropped only slightly, while the other two (conditioned on 'all' and on 'anonymous') dropped quite dramatically. Q: Why does add-α smoothing lower the probabilities conditioned on 'the' less than the others, and why is that behaviour a good thing? To work this out, it may help to look at the relevant rows of the count matrix (before adding 0.1) and see how they differ. In numpy you can inspect the n-th row of the counts matrix with counts[n,].

A: Bigrams whose first word is 'the' are clearly far more numerous, so adding α to that row changes it only slightly; but for a first word like 'anonymous', the whole corpus contains only 3 such bigrams, so the effect is naturally much larger.
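
To make the arithmetic concrete, with V = 813 word types the smoothed estimate is

$$P_{\text{add-}\alpha}(w_2 \mid w_1) = \frac{c(w_1, w_2) + \alpha}{c(w_1) + \alpha V}.$$

The row totals below are backed out from the printed probabilities rather than read off the corpus, so treat them as a consistency check: 'all' appears as a bigram context only once and is always followed by 'the', so the MLE 1/1 = 1 collapses to (1+1)/(1+813) ≈ 0.002457 under add-one; 'anonymous' appears 3 times, so 1/3 falls to 2/816 ≈ 0.00245; but 'the' appears about 156 times with c(the, jury) = 13, so 13/156 ≈ 0.0833 only drops to 14/969 ≈ 0.01445. The αV pseudo-counts swamp a row whose total is 1 or 3 and barely dent one whose total is 156.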

Using n-gram models

5. Experimenting with an MLE trigram model

Obtain a single $P(w_2 \mid w_1, w_0)$:

def triFinder(_input, Dict):
    if type(_input) == str:
        # string input: look up the index of each of the three words
        a, b, c = _input.split(' ')
        return np.array([Dict.get(a), Dict.get(b), Dict.get(c)])
    else:
        # index input: look the words back up from their indices
        key = np.array(list(Dict.keys()))
        return key[_input[0]] + ' ' + key[_input[1]]
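
The section then needs the trigram probabilities themselves. A minimal sketch of the MLE estimate (my own illustration, not the assignment's reference code; it assumes text is the lower-cased brown_100 text loaded earlier, and the query words are hypothetical):

from collections import Counter

def trigram_prob(w0, w1, w2, text):
    # MLE estimate of P(w2 | w0 w1) = count(w0 w1 w2) / count(w0 w1)
    tri_counts, bi_counts = Counter(), Counter()
    for line in text.splitlines():
        tokens = line.split()
        bi_counts.update(zip(tokens, tokens[1:]))
        tri_counts.update(zip(tokens, tokens[1:], tokens[2:]))
    if bi_counts[(w0, w1)] == 0:
        return 0.0
    return tri_counts[(w0, w1, w2)] / bi_counts[(w0, w1)]

# hypothetical query: P(past | in, the)
# print(trigram_prob('in', 'the', 'past', text))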
            
           
