
TensorFlow Word2vec from Scratch

Word2vec: the Skip-Gram Model
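The Skip-Gram model learns word embeddings by training each word to predict the words that appear within a small window around it in the corpus; the script below builds such (target, context) pairs from the text8 corpus and optimizes the embeddings with a sampled NCE loss instead of a full softmax over the whole vocabulary. As a quick, standalone illustration of the kind of pairs the batch generator will produce (the toy sentence and window size below are made up for this example and are not part of the script):

sentence = ['the', 'quick', 'brown', 'fox', 'jumps']
window = 1  # one word of context on each side of the target

pairs = []
for i, target in enumerate(sentence):
    for j in range(max(0, i - window), min(len(sentence), i + window + 1)):
        if j != i:
            pairs.append((target, sentence[j]))

print(pairs)
# [('the', 'quick'), ('quick', 'the'), ('quick', 'brown'), ('brown', 'quick'), ...]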

from __future__ import division, print_function, absolute_import

import collections
import os
import random
import urllib.request
import zipfile

import numpy as np
import tensorflow as tf

learning_rate = 0.1   # SGD learning rate
batch_size = 128      # (target, context) pairs per training step
num_steps = 3000000   # total training steps
display_step = 10000  # print the average loss every this many steps
eval_step = 200000    # run the nearest-neighbour evaluation every this many steps
# Words whose nearest neighbours are printed during evaluation
eval_words = [b'five', b'of', b'going', b'hardware', b'american', b'britain']

# Word2vec parameters
embedding_size = 200         # dimension of the embedding vectors
max_vocabulary_size = 50000  # keep at most this many distinct words
min_occurrence = 10          # discard words seen fewer than this many times
skip_window = 3              # words considered on each side of the target
num_skips = 2                # (target, context) pairs drawn per window
num_sampled = 64             # negative examples sampled for the NCE loss

url = 'http://mattmahoney.net/dc/text8.zip'  # download the file yourself beforehand
data_path = r'E:\learn\pc_code\tensorflow\12.24\text8.zip'
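The url variable and the urllib.request import are otherwise unused, because the script expects text8.zip to have been downloaded manually. If you prefer, the file can be fetched automatically with a few lines like the following (a sketch, not part of the original script):

# Optional: fetch text8.zip if it is not already on disk
if not os.path.exists(data_path):
    print('Downloading text8.zip ...')
    urllib.request.urlretrieve(url, data_path)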
# Load the corpus: text8 is a single file of lower-cased, space-separated words
with zipfile.ZipFile(data_path) as f:
    text_words = f.read(f.namelist()[0]).lower().split()

# Build the vocabulary, reserving index 0 for the 'UNK' (unknown) token
count = [('UNK', -1)]
count.extend(collections.Counter(text_words).most_common(max_vocabulary_size - 1))

# Remove words that occur fewer than min_occurrence times
for i in range(len(count) - 1, -1, -1):
    if count[i][1] < min_occurrence:
        count.pop(i)
    else:
        break
vocabulary_size = len(count)
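The backwards scan above works because collections.Counter.most_common returns entries sorted by descending count, so popping from the tail and breaking at the first word that meets min_occurrence removes exactly the rare tail. A minimal illustration (the toy list is made up):

from collections import Counter

toy = ['a'] * 3 + ['b'] * 2 + ['c']
print(Counter(toy).most_common())  # [('a', 3), ('b', 2), ('c', 1)] -- already sorted by count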
# Map each word to an integer id; out-of-vocabulary words map to 0 ('UNK')
word2id = dict()
for i, (word, _) in enumerate(count):
    word2id[word] = i

data = list()
unk_count = 0
for word in text_words:
    index = word2id.get(word, 0)
    if index == 0:
        unk_count += 1
    data.append(index)
count[0] = ('UNK', unk_count)
id2word = dict(zip(word2id.values(), word2id.keys()))

print("Words count:", len(text_words))
print("Unique words:", len(set(text_words)))
print("vocabulary_size:", vocabulary_size)
print("Most common words:", count[:10])

data_index = 0

def next_batch(batch_size, num_skips, skip_window):
    """Generate a batch of (target, context) pairs for the skip-gram model."""
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    # The sliding window spans skip_window words on each side of the target
    span = 2 * skip_window + 1
    buffer = collections.deque(maxlen=span)
    if data_index + span > len(data):
        data_index = 0
    buffer.extend(data[data_index:data_index + span])
    data_index += span
    for i in range(batch_size // num_skips):
        context_words = [w for w in range(span) if w != skip_window]
        words_to_use = random.sample(context_words, num_skips)
        for j, context_word in enumerate(words_to_use):
            batch[i * num_skips + j] = buffer[skip_window]       # target word
            labels[i * num_skips + j, 0] = buffer[context_word]  # one of its context words
        if data_index == len(data):
            buffer.extend(data[0:span])
            data_index = span
        else:
            buffer.append(data[data_index])
            data_index += 1
    # Step back a little so words at the end of a batch are reused in the next one
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels

# Target word ids and context word ids
X = tf.placeholder(tf.int32, shape=[None])
Y = tf.placeholder(tf.int32, shape=[None, 1])

with tf.device('/cpu:0'):
    # Embedding matrix: one embedding_size vector per vocabulary word
    embedding = tf.Variable(tf.random_normal([vocabulary_size, embedding_size]))
    X_embed = tf.nn.embedding_lookup(embedding, X)
    # Weights and biases for the NCE loss
    nce_weights = tf.Variable(tf.random_normal([vocabulary_size, embedding_size]))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

# Noise-contrastive estimation: num_sampled negative classes per example
loss_op = tf.reduce_mean(
    tf.nn.nce_loss(
        weights=nce_weights,
        biases=nce_biases,
        labels=Y,
        inputs=X_embed,
        num_sampled=num_sampled,
        num_classes=vocabulary_size
    )
)

optimizer = tf.train.GradientDescentOptimizer(learning_rate)
train_op = optimizer.minimize(loss_op)

# Cosine similarity between the evaluation embeddings and the full embedding matrix
X_embed_norm = X_embed / tf.sqrt(tf.reduce_sum(tf.square(X_embed)))
embedding_norm = embedding / tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keepdims=True))
cosine_sim_op = tf.matmul(X_embed_norm, embedding_norm, transpose_b=True)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    x_test = np.array([word2id[w] for w in eval_words])
    average_loss = 0
    for step in range(1, num_steps + 1):
        batch_x, batch_y = next_batch(batch_size, num_skips, skip_window)
        _, loss = sess.run([train_op, loss_op], feed_dict={X: batch_x, Y: batch_y})
        average_loss += loss
        if step % display_step == 0 or step == 1:
            if step > 1:
                average_loss /= display_step
            print("Step " + str(step) + ", Average Loss= " +
                  "{:.4f}".format(average_loss))
            average_loss = 0
        # Evaluation
        if step % eval_step == 0 or step == 1:
            print("Evaluation...")
            sim = sess.run(cosine_sim_op, feed_dict={X: x_test})
            for i in range(len(eval_words)):
                top_k = 8  # number of nearest neighbors
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = '"%s" nearest neighbors:' % eval_words[i]
                for k in range(top_k):
                    log_str = '%s %s,' % (log_str, id2word[nearest[k]])
                print(log_str)
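If the learned vectors are to be reused later, the normalized embedding matrix can be pulled out of the session once training finishes. This is a sketch that is not part of the original script; it assumes the lines are placed inside the with tf.Session() block, right after the training loop, and the output file name is only an example:

    # still inside the Session block, after the training loop
    final_embeddings = sess.run(embedding_norm)        # shape: (vocabulary_size, embedding_size)
    np.save('text8_embeddings.npy', final_embeddings)  # hypothetical output file name

    # example lookup: the embedding vector of the word b'five'
    five_vec = final_embeddings[word2id[b'five']]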