Part 1: 詞向量運算

歡迎來到本週第一個作業。

由於詞嵌入的訓練計算量龐大且耗費時間長，絕大部分機器學習人員都會匯入一個預訓練的詞嵌入模型。

你將學到：

載入預訓練單詞向量，使用餘弦測量相似度
使用詞嵌入解決類比問題，比如 “Man is to Woman as King is to __”
修改詞嵌入以減少其中的性別偏見

導包

import numpy as np
from w2v_utils import *

w2v_utils 中有用的函式

from keras.models import Model
from keras.layers import Input, Dense, Reshape, merge
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import skipgrams
from keras.preprocessing import sequence

import urllib.request
import collections
import os
import zipfile

import numpy as np
import tensorflow as tf

# Module-level settings. window_size, vector_dim and epochs are not referenced
# in the visible code -- presumably consumed by training code elsewhere
# (TODO confirm).
window_size = 3
vector_dim = 300
epochs = 1000

valid_size = 16  # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
# Draw valid_size distinct integers from range(valid_window); SimilarityCallback
# uses them as word indices.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

def maybe_download(filename, url, expected_bytes):
    """Fetch ``url + filename`` unless the file already exists, then size-check it.

    Arguments:
    filename -- local path to look for; it is also appended to ``url`` to form
                the download address
    url -- prefix of the download address
    expected_bytes -- size in bytes that the file on disk must have

    Returns:
    filename -- path of the verified file (the path reported by
                ``urlretrieve`` when a download took place)

    Raises:
    Exception -- when the size on disk differs from ``expected_bytes``; the
                 actual size is printed before raising
    """
    already_present = os.path.exists(filename)
    if not already_present:
        filename, _ = urllib.request.urlretrieve(url + filename, filename)

    actual_bytes = os.stat(filename).st_size
    if actual_bytes != expected_bytes:
        # Print the size we actually got to help diagnose a bad download.
        print(actual_bytes)
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')

    print('Found and verified', filename)
    return filename


# Read the data into a list of strings.
def read_data(filename):
    """Return the whitespace-separated words of the first member of a zip file.

    Arguments:
    filename -- path to a zip archive

    Returns:
    data -- list of strings obtained by decoding the first archive member as
            UTF-8 and splitting it on whitespace

    Raises:
    IndexError -- if the archive contains no members
    """
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        raw = archive.read(first_member)
    # Decode with the standard library: this is the same UTF-8 conversion the
    # previous tf.compat.as_str call performed, without needing TensorFlow
    # just to turn bytes into text.
    return raw.decode('utf-8').split()


def build_dataset(words, n_words):
    """Map a word sequence to integer ids over a capped vocabulary.

    Arguments:
    words -- sequence of word strings (iterated more than once)
    n_words -- vocabulary size including the 'UNK' entry at index 0

    Returns:
    data -- list with one id per input word; words outside the vocabulary get 0
    count -- list starting with ['UNK', <number of out-of-vocabulary words>]
             followed by (word, frequency) pairs from Counter.most_common
    dictionary -- dict mapping word -> id
    reversed_dictionary -- dict mapping id -> word
    """
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1))

    # Ids are handed out in order of appearance in `count`, so 'UNK' gets 0.
    dictionary = {}
    for token, _freq in count:
        dictionary[token] = len(dictionary)

    data = [dictionary.get(token, 0) for token in words]
    # Replace the -1 placeholder with the number of out-of-vocabulary words.
    count[0][1] = sum(1 for token in words if token not in dictionary)

    reversed_dictionary = {index: token for token, index in dictionary.items()}
    return data, count, dictionary, reversed_dictionary

def collect_data(vocabulary_size=10000):
    """Download text8.zip if needed and convert it into an id-encoded dataset.

    Arguments:
    vocabulary_size -- vocabulary cap passed on to build_dataset

    Returns:
    data, count, dictionary, reverse_dictionary -- the four values produced by
    build_dataset for the downloaded corpus
    """
    archive_path = maybe_download('text8.zip', 'http://mattmahoney.net/dc/', 31344016)
    corpus = read_data(archive_path)
    print(corpus[:7])
    dataset = build_dataset(corpus, vocabulary_size)
    del corpus  # Hint to reduce memory.
    data, count, dictionary, reverse_dictionary = dataset
    return data, count, dictionary, reverse_dictionary

class SimilarityCallback:
    """Print the nearest neighbours of the validation words.

    Relies on module-level names that are not defined in this part of the file:
    ``reverse_dictionary`` (id -> word), ``vocab_size`` and
    ``validation_model``.
    NOTE(review): validation_model is presumably a Keras model that scores a
    (target id, context id) pair -- confirm against where it is built.
    """

    def run_sim(self):
        """Print, for each validation word, its 8 highest-scoring other words."""
        top_k = 8  # number of nearest neighbors
        for position in range(valid_size):
            query_idx = valid_examples[position]
            valid_word = reverse_dictionary[query_idx]
            scores = self._get_sim(query_idx)
            # Sort descending by score and skip the first entry.
            # NOTE(review): presumably the first entry is the query word itself -- confirm.
            nearest = (-scores).argsort()[1:top_k + 1]
            pieces = ['Nearest to %s:' % valid_word]
            for rank in range(top_k):
                pieces.append('%s,' % (reverse_dictionary[nearest[rank]],))
            print(' '.join(pieces))

    @staticmethod
    def _get_sim(valid_word_idx):
        """Return an array of validation_model outputs, one per vocabulary id."""
        scores = np.zeros((vocab_size,))
        target = np.zeros((1,))
        context = np.zeros((1,))
        target[0,] = valid_word_idx
        for candidate in range(vocab_size):
            # The same one-element buffer is refilled for every candidate id.
            context[0,] = candidate
            scores[candidate] = validation_model.predict_on_batch([target, context])
        return scores


def read_glove_vecs(glove_file):
    """Load word vectors from a text file with one "word v1 v2 ..." entry per line.

    Arguments:
    glove_file -- path to the vector file, read as UTF-8

    Returns:
    words -- set of all words found in the file
    word_to_vec_map -- dict mapping each word to a float64 numpy array built
                       from the remaining fields of its line
    """
    words = set()
    word_to_vec_map = {}

    # Decode explicitly as UTF-8 so that non-ASCII tokens load the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Skip blank lines (e.g. a trailing newline) instead of
                # failing on fields[0].
                continue
            curr_word = fields[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(fields[1:], dtype=np.float64)

    return words, word_to_vec_map

def relu(x):
    """
    Rectified linear unit: element-wise max(0, x).

    Arguments:
    x -- a scalar or numpy array of any size

    Returns:
    the value of np.maximum(0, x), with the same shape as x
    """
    return np.maximum(0, x)


def initialize_parameters(vocab_size, n_h):
    """
    Create the weights and biases of a two-layer network.

    Note: this reseeds numpy's global random generator with 3 on every call,
    so the returned weights are the same for the same arguments.

    Arguments:
    vocab_size -- size of the vocabulary (input and output dimension)
    n_h -- size of the hidden layer

    Returns:
    parameters -- python dictionary with keys "W1", "b1", "W2", "b2":
                    W1 -- weight matrix of shape (n_h, vocab_size)
                    b1 -- zero bias vector of shape (n_h, 1)
                    W2 -- weight matrix of shape (vocab_size, n_h)
                    b2 -- zero bias vector of shape (vocab_size, 1)
    """
    np.random.seed(3)

    # Draw W1 before W2 so the random stream is consumed in a fixed order.
    w_in = np.random.randn(n_h, vocab_size) / np.sqrt(vocab_size)
    w_out = np.random.randn(vocab_size, n_h) / np.sqrt(n_h)

    return {
        'W1': w_in,
        'b1': np.zeros((n_h, 1)),
        'W2': w_out,
        'b2': np.zeros((vocab_size, 1)),
    }

def softmax(x):
    """Return exp(x - max(x)) normalized by its sum over all elements of x."""
    # Subtracting the maximum keeps the exponentials from overflowing.
    shifted = x - np.max(x)
    weights = np.exp(shifted)
    return weights / weights.sum()

本作業中，我們使用50維的 Glove 向量來表示詞。匯入資料：

# Load the word set and the word -> vector mapping from the local GloVe file.
words, word_to_vec_map = read_glove_vecs('data/glove.6B.50d.txt')

其中

words: 詞典中的詞集合
word_to_vec_map: 表示單詞到向量對映的map。

one-hot向量不擅長表示向量相似度(內積為0), Glove 向量包含了單詞更多的資訊，下面看看如何使用 Glove 向量計算相似度。

1 餘弦相似度

為了測量兩個詞的相似程度，需要測量兩個詞嵌入向量之間的相似程度。給定兩個向量u和v，餘弦相似度定義如下：

$$\text{CosineSimilarity}(u, v) = \frac{u \cdot v}{\|u\|_2 \, \|v\|_2} = \cos(\theta)$$

分子表示兩個向量的內積，分母是兩個向量的模（L2 範數）的乘積，$\theta$ 表示兩個向量的夾角：向量方向越接近，夾角越小，cos 值越大。

練習：實現cosine_similarity()方法來測量向量相似度

謹記：向量的模：向量每項平方加和再開方

# GRADED FUNCTION: cosine_similarity

def cosine_similarity(u, v):
    """
    Cosine similarity reflects the degree of similarity between u and v.

    Arguments:
        u -- a word vector of shape (n,); any array-like is accepted
        v -- a word vector of shape (n,); any array-like is accepted

    Returns:
        the cosine similarity between u and v: dot(u, v) / (||u||_2 * ||v||_2).
        If either vector has zero norm the denominator is 0 and the result is
        not a finite number.
    """
    # Accept plain Python sequences as well; numpy arrays pass through unchanged.
    u = np.asarray(u)
    v = np.asarray(v)

    # Dot product between u and v.
    dot = np.dot(u, v)

    # L2 norms: square root of the sum of squared components.
    norm_u = np.sqrt(np.sum(u**2))
    norm_v = np.sqrt(np.sum(v**2))

    return dot / (norm_u * norm_v)

#############################################

# Look up the embeddings used by the similarity checks below.
father = word_to_vec_map["father"]
mother = word_to_vec_map["mother"]
ball = word_to_vec_map["ball"]
crocodile = word_to_vec_map["crocodile"]
france = word_to_vec_map["france"]
italy = word_to_vec_map["italy"]
paris = word_to_vec_map["paris"]
rome = word_to_vec_map["rome"]

# Three comparisons: two word pairs, then two difference vectors taken in
# opposite order (country - capital versus capital - country).
print("cosine_similarity(father, mother) = ", cosine_similarity(father, mother))
print("cosine_similarity(ball, crocodile) = ",cosine_similarity(ball, crocodile))
print("cosine_similarity(france - paris, rome - italy) = ",cosine_similarity(france - paris, rome - italy))

# cosine_similarity(father, mother) =  0.890903844289
# cosine_similarity(ball, crocodile) =  0.274392462614
# cosine_similarity(france - paris, rome - italy) =  -0.675147930817

期待的輸出

key	value
cosine_similarity(father, mother)	0.890903844289
cosine_similarity(ball, crocodile)	0.274392462614
cosine_similarity(france - paris, rome - italy)	-0.675147930817

2 單詞類比推理

類比推理任務需要補全 “a is to b as c is to __”，比如 “man is to woman as king is to queen”。我們需要找到單詞 d，使得 $e_b - e_a \approx e_d - e_c$，
也就是兩組的差向量應該相似（仍然用餘弦相似度來衡量）。

練習：實現類比推理

# GRADED FUNCTION: complete_analogy

def complete_analogy(word_a, word_b, word_c, word_to_vec_map):
    """
    Performs the word analogy task: a is to b as c is to ____.

    Arguments:
    word_a -- a word, string
    word_b -- a word, string
    word_c -- a word, string
    word_to_vec_map -- dictionary that maps words to their corresponding vectors.

    Returns:
    best_word -- the word such that v_b - v_a is close to v_best_word - v_c, as
                 measured by cosine similarity; None if no candidate scores
                 above the initial threshold (e.g. the map holds only the
                 three input words)

    Raises:
    KeyError -- if any lower-cased input word is missing from word_to_vec_map
    """
    # convert words to lower case
    word_a, word_b, word_c = word_a.lower(), word_b.lower(), word_c.lower()

    # Get the word embeddings e_a, e_b and e_c
    e_a, e_b, e_c = word_to_vec_map[word_a], word_to_vec_map[word_b], word_to_vec_map[word_c]

    # The a -> b offset is the same for every candidate, so compute it once
    # instead of once per vocabulary word.
    target_offset = e_b - e_a
    input_words = (word_a, word_b, word_c)

    max_cosine_sim = -100              # below any real cosine value, which lies in [-1, 1]
    best_word = None                   # tracks the best candidate seen so far

    # loop over the whole word vector set
    for w, e_w in word_to_vec_map.items():
        # to avoid best_word being one of the input words, pass on them.
        if w in input_words:
            continue

        # Similarity between the a -> b offset and the c -> w offset.
        cosine_sim = cosine_similarity(target_offset, e_w - e_c)

        # Keep the candidate with the strictly highest similarity so far.
        if cosine_sim > max_cosine_sim:
            max_cosine_sim = cosine_sim
            best_word = w

    return best_word

####################################################

# Each triad is (a, b, c); complete_analogy fills in the fourth word.
triads_to_try = [('italy', 'italian', 'spain'), ('india', 'delhi', 'japan'), ('man', 'woman', 'boy'), ('small', 'smaller', 'large')]
for triad in triads_to_try:
    # Print "a -> b :: c -> d" with d being the predicted word.
    print ('{} -> {} :: {} -> {}'.format( *triad, complete_analogy(*triad,word_to_vec_map)))

# italy -> italian :: spain -> spanish
# india -> delhi :: japan -> tokyo
# man -> woman :: boy -> girl
# small -> smaller :: large -> larger

期待的輸出

key	value
italy -> italian	spain -> spanish
india -> delhi	japan -> tokyo
man -> woman	boy -> girl
small -> smaller	large -> larger

你可以自己試試：small->smaller as big->?

謹記

cos 是衡量向量相似度的好方法
對於 NLP 應用，使用一個預訓練的模型開始工作是一個不錯的選擇

3 消除詞向量偏見 (可選)

在下面的練習中，你將檢查詞嵌入中的性別偏見，並研究減少偏見的演算法。這部分涉及到一些線性代數，不要害怕，都是比較簡單的。

先看看與性別有關的 Glove 詞嵌入。令 $g = e_{woman} - e_{man}$，那麼可以把 g 向量近似看作代表「性別」方向的向量。

# Difference of the "woman" and "man" embeddings, used below as the gender direction.
g = word_to_vec_map['woman'] - word_to_vec_map['man']
print(g)

# [-0.087144    0.2182     -0.40986    -0.03922    -0.1032      0.94165
#  -0.06042     0.32988     0.46144    -0.35962     0.31102    -0.86824
#   0.96006     0.01073     0.24337     0.08193    -1.02722    -0.21122
#   0.695044   -0.00222     0.29106     0.5053     -0.099454    0.40445
#   0.30181     0.1355     -0.0606     -0.07131    -0.19245    -0.06115
#  -0.3204      0.07165    -0.13337    -0.25068714 -0.14293    -0.224957
#  -0.149       0.048882    0.12191    -0.27362    -0.165476   -0.20426
#   0.54376    -0.271425   -0.10245    -0.32108     0.2516     -0.33455
#  -0.04371     0.01258   ]

然後利用 cos 計算不同單詞的差向量與 g 向量的相似度，考慮什麼是正相關什麼是負相關。

print ('List of names and their similarities with constructed vector:')

# A mix of female and male first names.
name_list = ['john', 'marie', 'sophie', 'ronaldo', 'priya', 'rahul', 'danielle', 'reza', 'katy', 'yasmin']

# Print each name with the cosine similarity between its embedding and g.
for w in name_list:
    print (w, cosine_similarity(word_to_vec_map[w], g))

# List of names and their similarities with constructed vector:
# john -0.23163356146
# marie 0.315597935396
# sophie 0.318687898594
# ronaldo -0.312447968503
# priya 0.17632041839
# rahul -0.169154710392
# danielle 0.243932992163
# reza -0.079304296722
# katy 0.283106865957
# yasmin 0.233138577679

注意到女性名稱與 g 正相關多一些，男性名字與 g負相關多一些。

再試試其他的詞

# Repeat the comparison with g for a list of common nouns.
print('Other words and their similarities:')
word_list = ['lipstick', 'guns', 'science', 'arts', 'literature', 'warrior','doctor', 'tree', 'receptionist', 
             'technology',  'fashion', 'teacher', 'engineer', 'pilot', 'computer', 'singer']
for w in word_list:
    print (w, cosine_similarity(word_to_vec_map[w], g))

# Other words and their similarities:
# lipstick 0.276919162564
# guns -0.18884855679
# science -0.0608290654093
# arts 0.00818931238588
# literature 0.0647250443346
# warrior -0.209201646411
# doctor 0.118952894109
# tree -0.0708939917548
# receptionist 0.330779417506
# technology -0.131937324476
# fashion 0.0356389462577
# teacher 0.179209234318
# engineer -0.0803928049452
# pilot 0.00107644989919
# computer -0.103303588739
# singer 0.185005181365

令人驚訝的是，這些本應中立的詞也帶有不同程度的性別偏見，例如，“電腦”更接近“男人”，而“文學”更接近“女人”。

下面使用 Bolukbasi 等人 2016 年提出的演算法來減少向量的偏見。應該保留性別相關的單詞(actor/actress 或 grandmother/grandfather)，中和那些與性別無關的單詞(receptionist/technology)。

我們將採用不同的方式來處理這兩種不同型別的詞對。

3.1 中和無關性別的單詞偏見

對於一個無關性別的單詞，應該將其性別偏見消除，也就是將詞向量分解為 g 方向和 g⊥方向，我們消除 g 方向的分量，僅保持g⊥方向的分量即可。

練習：實現 neutralize() 函式消除性別無關單詞的性別偏見

給定一個詞嵌入向量 e, 如下計算消除偏見後的向量。

先計算 e 向量在 g 方向上的分量
再用 e 減去上述分量即為無偏方向上的分量

$$e^{bias\_component} = \frac{e \cdot g}{\|g\|_2^2} \, g$$

$$e^{debiased} = e - e^{bias\_component}$$

def neutralize(word, g, word_to_vec_map):
    """
    Remove the component of a word's embedding that lies along the bias axis g.

    The result is the projection of the embedding onto the space orthogonal
    to g. The stored embedding itself is not modified.

    Arguments:
        word -- string indicating the word to debias
        g -- numpy array corresponding to the bias axis (such as gender), with
             the same length as the word vectors
        word_to_vec_map -- dictionary mapping words to their corresponding vectors.

    Returns:
        e_debiased -- neutralized word vector representation of the input "word"
    """
    embedding = word_to_vec_map[word]

    # Projection coefficient of the embedding onto g: (e . g) / ||g||^2.
    projection_scale = np.dot(embedding, g) / np.square(np.linalg.norm(g))
    bias_component = projection_scale * g

    # Subtracting the projection leaves only the part orthogonal to g.
    return embedding - bias_component

######################################################

# Compare the similarity of "receptionist" with g before and after neutralizing.
e = "receptionist"
print("cosine similarity between " + e + " and g, before neutralizing: ", cosine_similarity(word_to_vec_map["receptionist"], g))

e_debiased = neutralize("receptionist", g, word_to_vec_map)
print("cosine similarity between " + e + " and g, after neutralizing: ", cosine_similarity(e_debiased, g))

# cosine similarity between receptionist and g, before neutralizing:  0.330779417506
# cosine similarity between receptionist and g, after neutralizing:  -3.26732746085e-17

期待的輸出：

key	value
cosine similarity between receptionist and g, before neutralizing:	0.330779417506
cosine similarity between receptionist and g, after neutralizing:	-3.26732746085e-17

第二個值非常小，近似為0

3.2 性別相關詞彙的均衡演算法

接下來，我們看看消除偏見如何應用於性別單詞對：比如 actress/actor。

首先，希望將性別單詞對向量設定為僅性別不同，所以應該在g⊥分量上相等。
其次，為了保證與中和過的無關向量距離相等，需要設定為在g分量上基於對稱軸g⊥對稱。

吳恩達Coursera深度學習課程 deeplearning.ai (5-2) 自然語言處理與詞嵌入--程式設計作業(一)：詞向量運算