1. one-hot編碼

# 字符級的 one-hot 編碼
import string

# Character-level one-hot encoding: each sample becomes a
# (max_length, vocabulary_size) matrix with a single 1 per encoded character.
import numpy as np  # needed here; the file only imports numpy further down

samples = ['zzh is a pig','he loves himself very much','pig pig han']
characters = string.printable  # the ASCII characters considered printable
# Map character -> 1-based index so a character can be looked up directly.
# Index 0 is never assigned to any character.
token_index = dict(zip(characters, range(1, len(characters) + 1)))

max_length = 20  # only the first max_length characters of a sample are encoded
results = np.zeros((len(samples), max_length, max(token_index.values()) + 1))
for i, sample in enumerate(samples):
    # Slice before enumerating so samples longer than max_length cannot
    # index past the second axis of `results`.
    for j, character in enumerate(sample[:max_length]):
        index = token_index.get(character)
        if index is None:
            # Character outside `characters`: leave its row all zeros.
            # Indexing with None would insert a new axis and set the whole row.
            continue
        results[i, j, index] = 1

characters = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'


# keras實現單詞級的one-hot編碼
from keras.preprocessing.text import Tokenizer
# Word-level one-hot encoding using the Keras Tokenizer.
samples = ['zzh is a pig','he loves himself very much','pig pig han']

# num_words caps the vocabulary used when encoding (most frequent words first;
# see the Keras Tokenizer documentation for the exact cut-off).
tokenizer = Tokenizer(num_words=100)
# Build the word index. This must run before any texts_to_* call, otherwise
# the tokenizer has no vocabulary to encode with.
tokenizer.fit_on_texts(samples)

# Each sample as a list of integer word indices.
sequences = tokenizer.texts_to_sequences(samples)

# One row per sample; mode='binary' marks which word indices occur in it.
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')
# one_hot_results.shape --> (3, 100)

# Mapping word -> integer index computed by fit_on_texts.
word_index = tokenizer.word_index
# Use % formatting: passing the count as a second print() argument would
# print the literal "%s" followed by the number.
print('發現%s個unique標記' % len(word_index))


sequences = [[2, 3, 4, 1],
[5, 6, 7, 8, 9],
[1, 1, 10]]


word_index = {'pig': 1, 'zzh': 2, 'is': 3, 'a': 4, 'he': 5,
'loves': 6,'himself': 7, 'very': 8, 'much': 9,
'han': 10}

one-hot 編碼的一種變體是 one-hot 雜湊技巧(one-hot hashing trick):不為每個單詞維護顯式的索引,而是用雜湊函數把單詞直接映射到固定長度向量中的某個位置。






import numpy as np

# Word-level one-hot encoding with the hashing trick: no explicit word index
# is kept; each word is hashed straight into one of `dimensionality` buckets.
import zlib

samples = ['the cat sat on the mat the cat sat on the mat the cat sat on the mat','the dog ate my homowork']
dimensionality = 1000  # each word is stored as a 1000-dimensional vector
max_length = 10  # only the first max_length words of a sample are encoded

# results[i, j, k] == 1 means word j of sample i hashed to bucket k.
results = np.zeros((len(samples), max_length, dimensionality))
for i, sample in enumerate(samples):
    for j, word in enumerate(sample.split()[:max_length]):
        # zlib.crc32 instead of the built-in hash(): str hashes are salted per
        # process, so hash() would put the same word in a different bucket on
        # every run. Distinct words can still collide in one bucket.
        index = zlib.crc32(word.encode('utf-8')) % dimensionality
        results[i, j, index] = 1




2. 詞嵌入