1. 程式人生 > >自然語言處理作業A2


自然語言處理 作業A2

作業地址: link

Unigram model

1. Creating the word_to_index dictionary


# Build the word -> index dictionary from brown_vocab_100.txt: each line holds
# one word, and a word's index is its 0-based line number.
import codecs

from generate import GENERATE

# `with` guarantees the vocabulary file is closed once it has been read.
with codecs.open("brown_vocab_100.txt", "r", "utf-16") as vocabs:
    word_index_dict = {line.rstrip(): index for index, line in enumerate(vocabs.readlines())}

# Write word_index_dict to word_to_index_100.txt, one "<word> <index>" per line.
with open("word_to_index_100.txt", "w") as wf:
    for word, index in word_index_dict.items():
        wf.write(word + ' ' + str(index) + '\n')

print(word_index_dict['all'])
print(word_index_dict['resolution'])
print(len(word_index_dict))

2. Building an MLE unigram model


# Build the MLE unigram model: P(w) = count(w) / sum of counts over the vocabulary.
import codecs
from collections import Counter

import numpy as np

with codecs.open("brown_vocab_100.txt", "r", encoding="utf-16") as vocab:
    word_index_dict = {line.rstrip(): index for index, line in enumerate(vocab.readlines())}

with codecs.open("brown_100.txt", "r", 'utf-16') as f:
    text = f.read().lower()

# Count whole whitespace-separated tokens. This still keeps "all" from being
# found inside "allen", but unlike substring counting with surrounding spaces
# it also counts tokens at line starts/ends (e.g. "<s>") and adjacent repeats
# such as "the the", which non-overlapping str.count would miss.
token_counts = Counter(text.split())
counts = np.array([token_counts[word] for word in word_index_dict])

# Normalise the counts into probabilities.
prob = counts / counts.sum()

# Write one "<word> <probability>" line per vocabulary word. `counts` (and so
# `prob`) was built by iterating word_index_dict, so zipping the two pairs each
# word with its own probability without any reverse lookup.
with open("unigram_probs.txt", "w") as wf:
    for word, p in zip(word_index_dict, prob):
        wf.write(word + ' ' + str(p) + '\n')

這樣做其實挺危險的:如果 `enumerate(word_index_dict)` 裡面的順序亂了怎麼辦?作業裡面還附帶了一個生成模型:

# Sample a sentence from the unigram model: draw words independently according
# to `prob` until "</s>" is drawn or max_words words have been produced.
returnSTR = ""
# Invert word -> index into index -> word
# (idiom from https://stackoverflow.com/questions/483666/python-reverse-invert-a-mapping).
index_word_dict = {v: k for k, v in word_index_dict.items()}
num_words = 0
max_words = 20
probs = prob
while True:
    # Draw the next word's index from the unigram distribution.
    # np.random.choice returns a length-1 array here, hence wordIndex[0].
    wordIndex = np.random.choice(len(word_index_dict), 1, p=list(probs))
    word = index_word_dict[wordIndex[0]]
    returnSTR += word + " "
    num_words += 1
    if word == "</s>" or num_words == max_words:
        break
print(returnSTR)


not worth , or . the , receives its the this term for or superintendent the or as on i

he i in . end wife i it can force . details i he these i he by despite a 


# Share of vocabulary entries whose probability equals that of a single
# occurrence (1 / total count). The post reports 0.5633 for this value.
np.sum(prob == 1 / counts.sum()) / len(word_index_dict)

出現一次的比重未免太高了吧。因為這是一個不全的 dictionary,所以最後 $p_{zero}=0$;在大的資料集中,$p_{once}$ 肯定也會減小。

import matplotlib.pyplot as plt
from matplotlib import rcParams
# For growing prefixes of the corpus, record the share of vocabulary words
# that occur exactly once (prob_1) and not at all (prob_0).
import codecs
from collections import Counter

import numpy as np

with codecs.open("brown_vocab_100.txt", "r", encoding="utf-16") as vocab:
    word_index_dict = {line.rstrip(): index for index, line in enumerate(vocab.readlines())}

with codecs.open("brown_100.txt", "r", 'utf-16') as f:
    text = f.read().lower()

divide = [0.25, 0.5, 0.75, 1]
prob_1 = []
prob_0 = []
vocab_size = len(word_index_dict)
for fraction in divide:
    # Character prefix of the corpus; the last token may be cut mid-word.
    text2 = text[:round(len(text) * fraction)]
    # Whole-token counts, so "<s>" and other line-boundary tokens are counted too.
    token_counts = Counter(text2.split())
    counts = np.array([token_counts[word] for word in word_index_dict])
    # Unigram probabilities for this prefix (kept as a module-level name).
    prob = counts / counts.sum()
    # Without these appends both lists stay empty and nothing can be plotted.
    prob_1.append(np.sum(counts == 1) / vocab_size)
    prob_0.append(np.sum(counts == 0) / vocab_size)
prob_0 = np.asarray(prob_0)
prob_1 = np.asarray(prob_1)

# 畫圖
plt.rcParams['figure.figsize'] = (9.0, 10.0)


def plot_result(y, x, xlabel='Number of words in the corpus', ylabel='Prob', title='Probabilities of the word occurred X times'):
    """Plot y against x on the current matplotlib axes and label the figure.

    Args:
        y: Sequence of values to plot; converted with ``np.array``.
        x: Sequence of x positions, one per value in ``y``.
        xlabel: Label for the x axis.
        ylabel: Label for the y axis.
        title: Title of the axes.
    """
    y = np.array(y)
    plt.plot(x, y)
    # Apply the labelling arguments; previously they were accepted but ignored.
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    # plt.show()
# x positions: size of each corpus prefix.
# NOTE(review): this is len(text) scaled by each fraction, i.e. a character
# count, while plot_result's default x label says "Number of words" - confirm.
x = np.asarray([round(len(text) * i) for i in divide])

# Draw both curves first so the legend entries have lines to attach to.
plot_result(prob_1, x)
plot_result(prob_0, x)
plt.legend(['Once', 'Zero'], loc='best')

# Separate figure for the combined share of words seen zero or one times.
plt.figure()
plot_result(prob_1 + prob_0, x, ylabel='Sum of probs', title='The probabilities of the word occurred zero and one times')



Bigram models

3. Building an MLE bigram model


import codecs
import numpy as np
from sklearn.preprocessing import normalize
from generate import GENERATE
import random

# The bigram model behaves very differently from the unigram one.
# Load the word -> index dictionary.
with codecs.open("brown_vocab_100.txt", "r", encoding="utf-16") as vocab:
    Dict = {line.rstrip(): index for index, line in enumerate(vocab.readlines())}

# Derive the matrix size from the vocabulary instead of hard-coding 813.
vocab_size = len(Dict)

# Dict2 holds the text "<first> <second>" for every ordered word pair, laid out
# as a (vocab_size, vocab_size) grid in Dict's iteration order; it is used
# below to select a bigram's cell by its text.
Dict2 = np.asarray([first + ' ' + second for first in Dict for second in Dict]).reshape(vocab_size, vocab_size)

with codecs.open("brown_100.txt", "r", 'utf-16') as f:
    text = f.read().lower()

# Count bigrams in a single pass over the corpus. Working on whole tokens keeps
# "all the" distinct from "all there", and pairing tokens within a line only
# means the "</s>" of one sentence is never paired with the "<s>" of the next.
count2 = np.zeros((vocab_size, vocab_size), dtype=int)
for line in text.splitlines():
    tokens = line.split()
    for prev_word, word in zip(tokens, tokens[1:]):
        # Pairs containing an out-of-vocabulary token have no cell to update.
        if prev_word in Dict and word in Dict:
            count2[Dict[prev_word], Dict[word]] += 1

# L1-normalise every row of the count matrix so each row sums to one.
probs = normalize(count2, axis=1, norm='l1')
# Print, in order: p(the | all), p(jury | the), p(campaign | the),
# p(calls | anonymous).
for _bigram in ('all the', 'the jury', 'the campaign', 'anonymous calls'):
    print(probs[Dict2 == _bigram])


# Inline bigram version of
# GENERATE(word_index_dict, probs, model_type, max_words, start_word):
# starting from start_word, repeatedly sample the next word from the row of
# `probs` belonging to the previous word, until "</s>" is drawn or max_words
# words have been generated.
start_word = "<s>"
max_words = 20
index_word_dict = {v: k for k, v in Dict.items()}
num_words = 0

returnSTR = start_word + " "
prevWord = start_word
while True:
    # Use this snippet's own dictionary (Dict) for both the vocabulary size
    # and the row lookup. np.random.choice returns a length-1 array here.
    wordIndex = np.random.choice(len(Dict), 1, p=list(probs[Dict[prevWord]]))
    word = index_word_dict[wordIndex[0]]
    returnSTR += word + " "
    prevWord = word
    num_words += 1
    if word == "</s>" or num_words == max_words:
        break
print(returnSTR)


<s> it was the county democratic executive committee . </s> 
<s> the size of sunday night in a proportionate distribution of this problem . </s>

###4. Add-α smoothing the bigram model

# Laplace (add-one) smoothing: add 1 to every bigram count, then L1-normalise
# each row.
count2_laplace = 1 + count2
probs_laplace = normalize(count2_laplace, axis=1, norm='l1')

# Print, in order: p(the | all), p(jury | the), p(campaign | the),
# p(calls | anonymous).
for _bigram in ('all the', 'the jury', 'the campaign', 'anonymous calls'):
    print(probs_laplace[Dict2 == _bigram])

# Add-alpha smoothing with alpha = 0.1: same procedure with a smaller constant.
count2_alpha = 0.1 + count2
probs_alpha = normalize(count2_alpha, axis=1, norm='l1')

# Same four conditional probabilities, in the same order.
for _bigram in ('all the', 'the jury', 'the campaign', 'anonymous calls'):
    print(probs_alpha[Dict2 == _bigram])
| | Original | Laplace smoothing (add one) | $\alpha$ smoothing (add point one) |
|---|---|---|---|
| $p(\text{the}\mid\text{all})$ | 1. | 0.002457 | 0.01336574 |
| $p(\text{jury}\mid\text{the})$ | 0.08333333 | 0.01444788 | 0.05520438 |
| $p(\text{campaign}\mid\text{the})$ | 0.00641026 | 0.00206398 | 0.00463548 |
| $p(\text{calls}\mid\text{anonymous})$ | 0.33333333 | 0.00245098 | 0.01304864 |

問:為什麼平滑模型中所有四個概率都下降了?現在請注意,概率並沒有全部減少相同的數量。特別是,以’the’為條件的兩個概率僅略微下降,而另外兩個概率(以’all’和’anonymous’為條件)相當顯著地下降。問:為什麼add-α平滑導致以’the’為條件的概率比其他的更低?為什麼這種行為(導致’the’的概率低於其他因素)是一件好事?在弄清楚這一點時,您可能會發現檢視計數矩陣的相關各行(在新增0.1之前)以檢視它們的不同之處是有用的。在numpy中,你可以看看第n行counts 矩陣使用counts[n,]。

A: 以 the 為前一個詞的 bigram 明顯比較多,此時增加 $\alpha$ 的影響就小;但是像以 anonymous 為前一個詞的,全語料庫就只有 3 個,所以影響當然大了。

Using n-gram models

5. Experimenting with a MLE trigram model

獲得單獨的 $P(w_{2}\mid w_{1},w_{0})$

def triFinder(_input, Dict):
    """Convert between a space-separated word string and vocabulary indices.

    Args:
        _input: Either a string of exactly three words separated by single
            spaces, or an indexable sequence of integer positions.
        Dict: Mapping from word to index.

    Returns:
        For a string: a NumPy array with the three words' values from
        ``Dict`` (``None`` for a word that is not a key).
        Otherwise: the string "<word> <word>" built from the keys found at
        positions ``_input[0]`` and ``_input[1]`` of ``Dict``'s key order.

    Raises:
        ValueError: If a string input does not split into exactly three parts.
    """
    if isinstance(_input, str):
        # Words -> indices.
        a, b, c = _input.split(' ')
        return np.array([Dict.get(a), Dict.get(b), Dict.get(c)])
    # Indices -> words. This branch used to sit after the `return` above and
    # could never run.
    # NOTE(review): only the first two indices are used, and positions refer to
    # Dict's key order - this assumes each word's value equals its insertion
    # position; confirm against the caller.
    key = np.array(list(Dict.keys()))
    return key[_input[0]] + ' ' + key[_input[1]]



