
TensorFlow's official word2vec_basic.py, annotated

Latent semantic models in practice: training word2vec on the Text8 dataset (http://mattmahoney.net/dc/textdata).
1. Import the libraries

# With from __future__ import print_function at the top of the file, print must be called as a function (with parentheses) even under Python 2.x, just as in Python 3.x.
from __future__ import print_function   
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import zipfile
from matplotlib import pylab
from six.moves import range
from six.moves.urllib.request import urlretrieve
from sklearn.manifold import TSNE
print('check:libs well prepared')

2. Download the data and read it from the zip

url = 'http://mattmahoney.net/dc/'

def maybe_download(filename,expected_bytes):
    # Only download if the file is not already present
    if not os.path.exists(filename):
        # Download from the URL above
        print('download...')
        filename, _ = urlretrieve(url + filename,filename)
    # Verify the downloaded file by its size
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified %s' % filename)
    else:
        print('exception %s' % statinfo.st_size)
    return filename

filename = maybe_download('text8.zip',31344016)

Output: Found and verified text8.zip

def read_data(filename):
    with zipfile.ZipFile(filename) as f:
        # tf.compat.as_str converts the bytes read from the zip into a string; split() then yields a list of words
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data

words = read_data(filename)
print('Data size %d' % len(words))

Output: Data size 17005207
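
Text8 is cleaned-up Wikipedia text containing only lowercase letters and spaces, stored as one long whitespace-separated string in a single file inside the zip. As a quick sanity check on the list just built:

print(words[:5])   # ['anarchism', 'originated', 'as', 'a', 'term']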
3. Encode words as ids and replace rare words with UNK

vocabulary_size = 50000

def build_dataset(words):
    # All rare words will be replaced by the token UNK; its count is initialised to -1 and filled in later
    count = [['UNK',-1]]
    # Count word frequencies and keep the vocabulary_size-1 most common words, together with their counts, in count;
    # everything else will later be mapped to UNK. count is ordered from most to least frequent, so a smaller index means a more frequent word.
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dictionary = dict()
    # Build the word-to-id mapping
    for word,_ in count:
        # Assign each word in count an id equal to the current size of the dictionary, starting from 0 (so UNK gets id 0)
        dictionary[word] = len(dictionary)
        
    data = list()
    unk_count = 0
    
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
        
    # Record how many tokens were replaced by UNK
    count[0][1] = unk_count
    
    # Reverse mapping: id -> word
    reverse_dictionary = dict(zip(dictionary.values(),dictionary.keys()))
    return data,count,dictionary,reverse_dictionary 

# data holds the training corpus with every word replaced by its id
data, count, dictionary, reverse_dictionary = build_dataset(words)

print('Most common words (+UNK)', count[:5])
print('original data', words[:10])
print('training data', data[:10])

Output:
Most common words (+UNK) [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]
original data ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']
training data [5236, 3082, 12, 6, 195, 2, 3137, 46, 59, 156]
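
As a quick check of the mapping just built (a minimal sketch using the dictionaries above): ids follow frequency order, id 0 is reserved for UNK, and any word outside the 50000-word vocabulary falls back to id 0. The made-up token below is only there to illustrate the UNK fallback.

print(dictionary['the'])                             # 1: 'the' is the most frequent real word
print(reverse_dictionary[0], reverse_dictionary[1])  # UNK the
print(dictionary.get('qwertyuiopasd', 0))            # 0: a made-up word outside the vocabulary maps to UNK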

4. Generate skip-gram training batches

# For each centre word in data, this function samples surrounding words from a window around it to build one batch of (centre, context) training pairs.
# skip_window is the number of words taken on each side of the centre word; num_skips is how many context words are sampled per centre word.
def generate_batch(batch_size,num_skips,skip_window):
    # data_index is a global cursor into data, preserved across calls
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    # Centre-word ids for this batch
    batch = np.ndarray(shape=(batch_size),dtype = np.int32)
    # (batch_size, 1) array of context-word ids; batch[i] and labels[i] together form one training pair
    labels = np.ndarray(shape=(batch_size,1),dtype = np.int32)
    # Total window size, structured as [ skip_window  target  skip_window ], i.e. 2*skip_window + 1 words
    span = 2*skip_window + 1
    # Double-ended queue used as a sliding buffer over data; it never holds more than span words
    buffer = collections.deque(maxlen=span) 
    for _ in range(span):
        buffer.append(data[data_index])
        # Wrap around when the end of data is reached
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size // num_skips):
        # target starts at the centre of the window, i.e. the current word itself
        target = skip_window
        # The centre word must not be used as its own context, and each window position is used at most once
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            # Randomly pick a position in the window that has not been used yet
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            # Mark this position as used
            targets_to_avoid.append(target)
            # The input of the pair is always the centre word
            batch[i * num_skips + j] = buffer[skip_window]
            # The label is the word at the sampled position inside the window
            labels[i * num_skips + j, 0] = buffer[target]
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels
    
    
print('data:', [reverse_dictionary[di] for di in data[:8]])
data_index = 0
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=2)
print('batch:', [reverse_dictionary[bi] for bi in batch])
print('labels:', [reverse_dictionary[li] for li in labels.reshape(8)])

Output:
data: ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first']
batch: ['as', 'as', 'a', 'a', 'term', 'term', 'of', 'of']
labels: ['term', 'anarchism', 'as', 'term', 'abuse', 'of', 'abuse', 'term']
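
For intuition: with skip_window=2 the window around the first centre word 'as' is ['anarchism', 'originated', 'as', 'a', 'term'], and num_skips=2 means two of the four surrounding words are sampled as labels for that centre word. A minimal sketch (reusing generate_batch above) that regenerates the batch with skip_window=1, so each centre word is paired only with its immediate neighbours; since num_skips == 2*skip_window, every context word is used and only their order is random:

data_index = 0
batch1, labels1 = generate_batch(batch_size=8, num_skips=2, skip_window=1)
print(list(zip([reverse_dictionary[b] for b in batch1],
               [reverse_dictionary[l] for l in labels1.reshape(8)])))
# e.g. ('originated', 'anarchism'), ('originated', 'as'), ('as', 'originated'), ('as', 'a'), ...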

5. Define the network

batch_size = 128
embedding_size = 128    #Dimension of the embedding vector.
skip_window = 1         # How many words to consider left and right.
num_skips = 2           # How many times to reuse an input to generate a label
# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16         # Random set of words to evaluate similarity on.
valid_window = 100      # Only pick dev samples in the head of the distribution.
valid_example = np.array(random.sample(range(valid_window),valid_size))
num_sampled = 64        # Number of negative examples to sample.

graph = tf.Graph()


with graph.as_default(),tf.device('/cpu:0'):
    # Input data
    # One batch of training inputs: the dictionary ids of the centre words
    train_dataset = tf.placeholder(tf.int32,shape=[batch_size])
    # Labels for the batch: for each centre word, the id of one surrounding word
    train_labels = tf.placeholder(tf.int32,shape=[batch_size,1])
    # The ids of the 16 validation words sampled above from the 100 most frequent words
    valid_dataset = tf.constant(valid_example,tf.int32)
    
    
    # Embedding matrix: one embedding_size-dimensional vector per vocabulary word, initialised uniformly in [-1, 1)
    embeddings = tf.Variable(tf.random_uniform([vocabulary_size,embedding_size],-1.0,1.0))
    # Weights and biases of the sampled-softmax output layer
    softmax_weight = tf.Variable(tf.truncated_normal([vocabulary_size,embedding_size],stddev=1.0/math.sqrt(embedding_size)))
    softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))
    
    # Look up the embeddings of the centre words in this batch
    embed = tf.nn.embedding_lookup(embeddings,train_dataset)
    # Compute the average loss for the batch.
    # tf.nn.sampled_softmax_loss draws a fresh sample of negative classes each time the loss is evaluated.

    # Candidate sampling: num_sampled negative classes are drawn with a frequency-like (log-uniform)
    # distribution, which works because smaller ids correspond to more frequent words. Together with the
    # true label they form a small candidate set; the rows of the softmax weights for these candidates are
    # multiplied with the input embedding to get logits, and a softmax cross-entropy over just this subset
    # approximates the full softmax over the whole vocabulary. Minimising it updates both the softmax
    # weights/biases and the input embeddings, pulling the centre word towards its real context words and
    # away from the sampled negatives.

    loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(weights=softmax_weight,
                                                     biases=softmax_biases,
                                                     inputs=embed,
                                                     labels=train_labels,
                                                     num_sampled = num_sampled,
                                                     num_classes=vocabulary_size))
    # Minimise the loss. Note that a learning rate of 1.0 is far too large for Adam and the loss diverges
    # in the run below; a much smaller learning rate, or tf.train.AdagradOptimizer(1.0) as in the original
    # example, behaves better here.
    optimizer = tf.train.AdamOptimizer(1.0).minimize(loss)
    
    
    # L2-normalise the embeddings so that dot products become cosine similarities.
    # In tf.reduce_sum, the second argument selects the axis: 1 sums across each row (one row per word),
    # 0 would sum down each column, and omitting it sums every element of the matrix.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings),1,keepdims=True))
    
    normalized_embeddings = embeddings / norm
    
    # Similarity between the validation words and every word in the vocabulary; since rows are unit-length, the matmul below is a cosine similarity
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,valid_dataset)
    similarity = tf.matmul(valid_embeddings,tf.transpose(normalized_embeddings))
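
Because every row of normalized_embeddings has unit L2 norm, the matmul above computes cosine similarities directly. A small NumPy sketch of the same idea, for illustration only and independent of the TensorFlow graph:

v = np.random.rand(5, 8)                                       # 5 fake embeddings of size 8
v_unit = v / np.sqrt(np.sum(np.square(v), 1, keepdims=True))   # L2-normalise each row
sim_np = np.matmul(v_unit, v_unit.T)                           # sim_np[i, j] == cosine similarity of rows i and j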

6. Run the training

num_steps = 100000
with tf.Session(graph=graph) as session:
    # Add variable initializer.
    tf.global_variables_initializer().run()
    print('Initialized')
    average_loss = 0
    for step in range(num_steps+1):
        # Generate one batch of training data
        batch_data,batch_labels = generate_batch(batch_size,num_skips,skip_window)
        
        feed_dict = {train_dataset : batch_data, train_labels : batch_labels}
        _,loss_val = session.run([optimizer,loss],feed_dict = feed_dict)
        average_loss += loss_val
        # Print the average loss over the last 2000 steps
        if step % 2000 == 0:
            if step > 0:
                average_loss = average_loss / 2000
            print('Average loss at step %d: %f' % (step, average_loss))
            average_loss = 0
        
        # Print the validation results
        if step % 10000 == 0:
            # Every 10000 steps, evaluate the similarity between the validation words and the whole vocabulary;
            # sim[i, j] is the similarity between validation word i and vocabulary word j
            sim = similarity.eval()
            for i in range(valid_size):
                # Map the id back to the word
                valid_word = reverse_dictionary[valid_example[i]]
                # The 5 most similar words (index 0 of the sorted list is the word itself, so it is skipped)
                top_k = 5
                nearest = (-sim[i, :]).argsort()[1:top_k+1]
                log = 'Nearest to %s:' % valid_word
                for k in range(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log = '%s %s,' % (log, close_word)
                print(log)
                
    # Evaluate the normalised embeddings once, after training has finished
    final_embeddings = normalized_embeddings.eval()

Output:

Initialized
Average loss at step 0: 8.377776
Nearest to new: reformer, inventor, interrupts, cranmer, shih,
Nearest to may: misandry, gaius, supplement, inappropriate, kaposi,
Nearest to he: glycogen, iic, cuzco, cranmer, deeds,
Nearest to be: overload, liquidity, litt, lux, sirens,
Nearest to in: norman, still, wa, arctocephalus, nitrogenous,
Nearest to b: separable, jurisprudence, flo, endogamous, aviator,
Nearest to eight: appropriating, curl, blogging, comecon, vikernes,
Nearest to who: topper, sena, disprove, capitoline, netsplit,
Nearest to known: prix, attributes, accelerates, excused, part,
Nearest to but: basayev, valens, landon, dojos, masking,
Nearest to also: sword, suffolk, nl, augmenting, ipcc,
Nearest to six: transitory, gras, championing, misuse, acoustical,
Nearest to will: potsdam, fangio, roskilde, obtainable, surpassed,
Nearest to many: paradiso, polysaccharides, adolphus, krone, framing,
Nearest to into: bruun, interim, dns, attractors, hangings,
Nearest to to: dimers, womanizer, unemployment, hoo, necessitate,
Average loss at step 2000: 5653.813801
Average loss at step 4000: 10679.583946
Average loss at step 6000: 12639.168628
Average loss at step 8000: 13891.868634
Average loss at step 10000: 14616.583857
Nearest to new: financial, write, phospholipids, fta, narbonne,
Nearest to may: phallic, supervises, catches, paw, specification,
Nearest to he: capitalised, macao, exhibited, mathrm, commemorates,
Nearest to be: feininger, lyrics, lps, jenny, fermentation,
Nearest to in: powder, c, daniel, compounds, duplicating,
Nearest to b: script, igbo, grace, collection, marlowe,
Nearest to eight: edmonton, jennie, de, nims, lucretia,
Nearest to who: writeup, vocally, fuller, banach, responded,
Nearest to known: actions, observances, pistols, wherein, wedding,
Nearest to but: drummer, dime, passion, glowing, miniature,
Nearest to also: mechanism, replicating, domesticated, euston, vindication,
Nearest to six: physik, parted, election, predominantly, ca,
Nearest to will: worsen, derives, allophones, pseudopods, jermaine,
Nearest to many: individualists, elders, living, montenegro, location,
Nearest to into: uses, lay, albums, astronaut, approximated,
Nearest to to: denounced, exhibited, boomerangs, topics, wept,
Average loss at step 12000: 15368.244784
...
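
Once training is finished, final_embeddings (whose rows are already L2-normalised) can be queried directly with NumPy, outside the TensorFlow session. A minimal sketch that looks up the nearest neighbours of an arbitrary in-vocabulary word:

query_id = dictionary['one']                                    # any word from the 50000-word vocabulary
sims = np.dot(final_embeddings, final_embeddings[query_id])     # cosine similarity to every word
print([reverse_dictionary[i] for i in (-sims).argsort()[1:6]])  # 5 nearest neighbours, skipping the word itself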

7. Visualise the embeddings with t-SNE

num_points = 400
# perplexity (float, default 30): larger datasets usually need a larger perplexity; a value between 5 and 50 is
# reasonable, and since t-SNE is not very sensitive to this parameter the exact choice is not critical.
# n_components (int, default 2): dimensionality of the embedded space.
# init (string, default 'random'): initialisation of the embedding, 'random' or 'pca'; PCA initialisation is usually
# more globally stable than random initialisation (it cannot be used with precomputed distances).
# n_iter (int, default 1000): maximum number of optimisation iterations; should be at least 200.
tsne = TSNE(perplexity=30,n_components=2,init='pca',n_iter=5000)
two_d_embeddings = tsne.fit_transform(final_embeddings[1:num_points+1,:])
def plot(embeddings, labels):
    assert embeddings.shape[0] >= len(labels), 'More labels than embeddings'
    # Figure size in inches
    pylab.figure(figsize=(20, 20))
    for i, label in enumerate(labels):
        x, y = embeddings[i, :]
        pylab.scatter(x, y)
        pylab.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points',
                       ha='right', va='bottom')
    pylab.show()

words = [reverse_dictionary[i] for i in range(1, num_points+1)]
plot(two_d_embeddings, words)
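
If the script is run in a non-interactive environment (an assumption about the setup, not part of the original code), the pylab.show() call inside plot can be replaced with a call that writes the figure to disk:

pylab.savefig('tsne_word2vec.png', dpi=150)   # hypothetical file name; writes the current figure to a file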

(Figure: 2-D t-SNE projection of the embeddings of the 400 most frequent words, each point labelled with its word)