1. 程式人生 > >CNTK API文件翻譯(18)——多對多神經網路處理文字資料(2)

CNTK API文件翻譯(18)——多對多神經網路處理文字資料(2)




def create_model_train(s2smodel):
    """Wrap ``s2smodel`` for training, where the decoder history is taken from the labels.

    Args:
        s2smodel: the sequence-to-sequence model, called as
            ``s2smodel(history, input)``.

    Returns:
        A CNTK function ``model_train(input, labels)`` that feeds the
        one-step-delayed labels to ``s2smodel`` as its history.

    Note:
        The labels must NOT contain the initial <s>.
    """
    @C.Function
    def model_train(input, labels):  # (input*, labels*) --> (word_logp*)
        # The input to the decoder always starts with the special label
        # sequence start token. Then, use the previous value of the label
        # sequence (for training) or the output (for execution).
        past_labels = C.layers.Delay(initial_state=sentence_start)(labels)
        return s2smodel(past_labels, input)

    return model_train

上面我們又使用@Function裝飾器建立了一個CNTK函式物件model_train。這個函式的引數是輸入序列input和輸出序列labels。past_labels變數使用Delay層儲存了我們先前建立的模型的歷史記錄。這會返回之前單位時間的輸入labels。因此,如果我們將labels設定為[‘a’, ‘b’, ‘c’],past_labels的值將會是[‘<s>’, ‘a’, ‘b’, ‘c’],然後返回呼叫past_labels和input的模型。


def create_model_greedy(s2smodel):
    """Wrap ``s2smodel`` for greedy decoding, where the history is the decoder's own output.

    Args:
        s2smodel: the sequence-to-sequence model, called as
            ``s2smodel(history, input)``.

    Returns:
        A CNTK function ``model_greedy(input)``: (input*) --> (word_sequence*).
    """
    # model used in (greedy) decoding (history is decoder's own output)
    @C.Function
    @C.layers.Signature(InputSequence[C.layers.Tensor[input_vocab_dim]])
    def model_greedy(input):  # (input*) --> (word_sequence*)
        # Decoding is an unfold() operation starting from sentence_start.
        # We must transform s2smodel (history*, input* -> word_logp*) into a
        # generator (history* -> output*) which holds 'input' in its closure.
        unfold = C.layers.UnfoldFrom(
            lambda history: s2smodel(history, input) >> C.hardmax,
            # stop once sentence_end_index was max-scoring output
            until_predicate=lambda w: w[..., sentence_end_index],
            length_increase=length_increase)

        return unfold(initial_state=sentence_start, dynamic_axes_like=input)

    return model_greedy



def create_criterion_function(model):
    """Build the training criterion (loss and error metric) for ``model``.

    Args:
        model: the training wrapper, called as ``model(input, labels)``.

    Returns:
        A CNTK function ``criterion(input, labels)`` returning the pair
        ``(cross entropy with softmax, classification error)``.
    """
    # The criterion has to be a CNTK Function (not a plain Python function):
    # the training loop reads criterion.arguments and hands it to C.Trainer.
    # NOTE(review): LabelSequence and label_vocab_dim are expected to be
    # defined next to InputSequence / input_vocab_dim in the model-definition
    # part of this tutorial -- confirm they are in scope.
    @C.Function
    @C.layers.Signature(input=InputSequence[C.layers.Tensor[input_vocab_dim]],
                        labels=LabelSequence[C.layers.Tensor[label_vocab_dim]])
    def criterion(input, labels):
        # criterion function must drop the <s> from the labels
        # <s> A B C </s> --> A B C </s>
        postprocessed_labels = C.sequence.slice(labels, 1, 0)
        z = model(input, postprocessed_labels)
        ce = C.cross_entropy_with_softmax(z, postprocessed_labels)
        errs = C.classification_error(z, postprocessed_labels)
        return (ce, errs)

    return criterion



def train(train_reader, valid_reader, vocab, i2w, s2smodel, max_epochs, epoch_size):
    """Train ``s2smodel`` and periodically print a greedy decoding of a validation sample.

    Args:
        train_reader: minibatch source for the training data; must provide
            ``next_minibatch`` and ``streams.features`` / ``streams.labels``.
        valid_reader: minibatch source used only for the progress samples.
        vocab: not used by this function (kept for interface compatibility).
        i2w: index-to-token mapping handed to ``format_sequences``.
        s2smodel: the sequence-to-sequence model to train.
        max_epochs: number of epochs to run.
        epoch_size: number of label samples that make up one epoch.

    Side effects:
        Prints progress to stdout and saves the trained model to
        ``model_<last epoch index>.cmf``.
    """
    # create the training wrapper for the s2smodel, as well as the criterion function
    model_train = create_model_train(s2smodel)
    criterion = create_criterion_function(model_train)

    # also wire in a greedy decoder so that we can properly log progress on a validation example
    # This is not used for the actual training process.
    model_greedy = create_model_greedy(s2smodel)

    # Instantiate the trainer object to drive the model training
    minibatch_size = 72
    lr = 0.001 if use_attention else 0.005
    learner = C.fsadagrad(model_train.parameters,
                          lr=C.learning_rate_schedule([lr]*2 + [lr/2]*3 + [lr/4], C.UnitType.sample, epoch_size),
                          momentum=C.momentum_as_time_constant_schedule(1100),
                          gradient_clipping_threshold_per_sample=2.3,
                          gradient_clipping_with_truncation=True)
    trainer = C.Trainer(None, criterion, learner)

    # Get minibatches of sequences to train with and perform model training
    total_samples = 0
    mbs = 0
    eval_freq = 100

    # print out some useful training information
    C.logging.log_number_of_parameters(model_train)
    print()
    progress_printer = C.logging.ProgressPrinter(freq=30, tag='Training')

    # a hack to allow us to print sparse vectors
    sparse_to_dense = create_sparse_to_dense(input_vocab_dim)

    for epoch in range(max_epochs):
        # An epoch ends once epoch_size label samples have been consumed.
        while total_samples < (epoch+1) * epoch_size:
            # get next minibatch of training data
            mb_train = train_reader.next_minibatch(minibatch_size)

            # do the training
            trainer.train_minibatch({criterion.arguments[0]: mb_train[train_reader.streams.features],
                                     criterion.arguments[1]: mb_train[train_reader.streams.labels]})

            # log progress
            progress_printer.update_with_trainer(trainer, with_metric=True)

            # every N MBs evaluate on a test sequence to visually show how we're doing
            if mbs % eval_freq == 0:
                mb_valid = valid_reader.next_minibatch(1)

                # run an eval on the decoder output model (i.e. don't use the groundtruth)
                e = model_greedy(mb_valid[valid_reader.streams.features])
                print(format_sequences(sparse_to_dense(mb_valid[valid_reader.streams.features]), i2w))
                print(format_sequences(e, i2w))

                # visualizing attention window
                if use_attention:
                    debug_attention(model_greedy, mb_valid[valid_reader.streams.features])

            total_samples += mb_train[train_reader.streams.labels].num_samples
            mbs += 1

        # log a summary of the stats for the epoch
        progress_printer.epoch_summary(with_metric=True)

    # done: save the final model
    model_path = "model_%d.cmf" % epoch
    print("Saving final model to '%s'" % model_path)
    s2smodel.save(model_path)
    print("%d epochs complete." % max_epochs)




# dummy for printing the input sequence below. Currently needed because input is sparse.
def create_sparse_to_dense(input_vocab_dim):
    """Return a CNTK function that turns a sparse one-hot sequence into a dense one.

    The conversion is a multiplication with an identity matrix of size
    ``input_vocab_dim``; the values are unchanged, only the storage becomes
    dense so that the sequence can be printed.

    Args:
        input_vocab_dim: size of the input vocabulary (side of the identity matrix).
    """
    I = C.Constant(np.eye(input_vocab_dim))

    # Declared as a CNTK Function with a sparse input signature so that it can
    # be applied directly to minibatch data.
    @C.Function
    @C.layers.Signature(InputSequence[C.layers.SparseTensor[input_vocab_dim]])
    def no_op(input):
        return C.times(input, I)

    return no_op



# Given a vocab and tensor, print the output
def format_sequences(sequences, i2w):
    """Render each sequence of score vectors as a space-separated token string.

    For every step of every sequence, the index of the largest entry is
    looked up in ``i2w``; the resulting tokens of one sequence are joined
    with single spaces.

    Args:
        sequences: iterable of sequences, each an iterable of score vectors.
        i2w: mapping from index to token string.

    Returns:
        A list with one string per input sequence.
    """
    rendered = []
    for sequence in sequences:
        tokens = (i2w[np.argmax(step)] for step in sequence)
        rendered.append(" ".join(tokens))
    return rendered

# to help debug the attention window
def debug_attention(model, input):
    """Print the attention weights produced while decoding ``input``.

    Args:
        model: greedy decoding function; must expose
            ``attention_model.attention_weights``.
        input: the input sequence(s) to decode.

    Side effects:
        Prints the weight matrix with 5 decimals; the previous NumPy print
        options are restored afterwards.
    """
    q = C.combine([model, model.attention_model.attention_weights])
    #words, p = q(input) # Python 3
    words_p = q(input)
    words = words_p[0]
    p     = words_p[1]
    seq_len = words[0].shape[attention_axis-1]
    #attention_span  #7 # test sentence is 7 tokens long
    span = 7
    # (batch, len, attention_span, 1, vector_dim)
    p_sq = np.squeeze(p[0][:seq_len,:span,0,:])
    opts = np.get_printoptions()
    try:
        np.set_printoptions(precision=5)
        print(p_sq)
    finally:
        # never leave the global print options modified
        np.set_printoptions(**opts)


# Build the model and run a short demonstration training pass: a single epoch
# that ends once 25000 label samples have been consumed.
model = create_model()
train(train_reader, valid_reader, vocab, i2w, model, max_epochs=1, epoch_size=25000)


['<s> A B A D I </s>']
['O O ~K ~K X X X X ~JH ~JH ~JH']
[[ 0.14327  0.14396  0.14337  0.14305  0.14248  0.1422   0.14166]
 [ 0.14327  0.14395  0.14337  0.14305  0.14248  0.1422   0.14166]
 [ 0.14327  0.14396  0.14337  0.14305  0.14248  0.1422   0.14166]
 [ 0.14328  0.14395  0.14337  0.14305  0.14248  0.1422   0.14166]
 [ 0.14327  0.14395  0.14337  0.14305  0.14248  0.1422   0.14166]
 [ 0.14327  0.14395  0.14337  0.14305  0.14248  0.1422   0.14166]
 [ 0.14327  0.14396  0.14337  0.14305  0.14248  0.1422   0.14166]
 [ 0.14327  0.14396  0.14337  0.14305  0.14248  0.1422   0.14166]
 [ 0.14327  0.14395  0.14337  0.14305  0.14248  0.1422   0.14166]
 [ 0.14327  0.14395  0.14337  0.14305  0.14248  0.1422   0.14166]
 [ 0.14327  0.14396  0.14337  0.14305  0.14248  0.1422   0.14166]]
 Minibatch[   1-  30]: loss = 4.145903 * 1601, metric = 87.32% * 1601;
 Minibatch[  31-  60]: loss = 3.648827 * 1601, metric = 86.45% * 1601;
 Minibatch[  61-  90]: loss = 3.320400 * 1548, metric = 88.44% * 1548;
['<s> A B A D I </s>']
['~N ~N </s>']
[[ 0.14276  0.14348  0.14298  0.1428   0.1425   0.14266  0.14281]
 [ 0.14276  0.14348  0.14298  0.14281  0.1425   0.14266  0.14281]
 [ 0.14276  0.14348  0.14298  0.14281  0.1425   0.14266  0.14281]]
 Minibatch[  91- 120]: loss = 3.231915 * 1567, metric = 86.02% * 1567;
 Minibatch[ 121- 150]: loss = 3.212445 * 1580, metric = 83.54% * 1580;
 Minibatch[ 151- 180]: loss = 3.214926 * 1544, metric = 84.26% * 1544;
['<s> A B A D I </s>']
['~R ~R ~AH ~AH ~AH </s>']
[[ 0.14293  0.14362  0.14306  0.14283  0.14246  0.14252  0.14259]
 [ 0.14293  0.14362  0.14306  0.14283  0.14246  0.14252  0.14259]
 [ 0.14293  0.14362  0.14306  0.14283  0.14246  0.14252  0.14259]
 [ 0.14293  0.14362  0.14306  0.14283  0.14246  0.14252  0.14259]
 [ 0.14293  0.14362  0.14306  0.14283  0.14246  0.14252  0.14259]
 [ 0.14293  0.14362  0.14306  0.14283  0.14246  0.14252  0.14259]]
 Minibatch[ 181- 210]: loss = 3.144272 * 1565, metric = 82.75% * 1565;
 Minibatch[ 211- 240]: loss = 3.185484 * 1583, metric = 83.20% * 1583;
 Minibatch[ 241- 270]: loss = 3.126284 * 1562, metric = 83.03% * 1562;
 Minibatch[ 271- 300]: loss = 3.150704 * 1551, metric = 83.56% * 1551;
['<s> A B A D I </s>']
['~R ~R ~R ~AH </s>']
[[ 0.14318  0.14385  0.14318  0.14286  0.14238  0.1423   0.14224]
 [ 0.14318  0.14385  0.14318  0.14286  0.14238  0.1423   0.14224]
 [ 0.14318  0.14385  0.14318  0.14287  0.14238  0.1423   0.14224]
 [ 0.14318  0.14385  0.14318  0.14287  0.14239  0.1423   0.14224]
 [ 0.14318  0.14385  0.14318  0.14287  0.14239  0.1423   0.14224]]
 Minibatch[ 301- 330]: loss = 3.131863 * 1575, metric = 82.41% * 1575;
 Minibatch[ 331- 360]: loss = 3.095721 * 1569, metric = 82.98% * 1569;
 Minibatch[ 361- 390]: loss = 3.098615 * 1567, metric = 82.32% * 1567;
['<s> A B A D I </s>']
['~K ~R ~R ~AH </s>']
[[ 0.14352  0.14416  0.14335  0.14292  0.1423   0.14201  0.14173]
 [ 0.1435   0.14414  0.14335  0.14293  0.14231  0.14202  0.14174]
 [ 0.14351  0.14415  0.14335  0.14293  0.1423   0.14202  0.14174]
 [ 0.14351  0.14415  0.14335  0.14293  0.1423   0.14202  0.14174]
 [ 0.14351  0.14415  0.14335  0.14293  0.1423   0.14202  0.14174]]
 Minibatch[ 391- 420]: loss = 3.115971 * 1601, metric = 81.70% * 1601;
Finished Epoch[1 of 300]: [Training] loss = 3.274279 * 22067, metric = 84.14% * 22067 64.263s (343.4 samples/s);
Saving final model to 'model_0.cmf'
1 epochs complete.


# Uncomment the line below to train the model for a full epoch
#train(train_reader, valid_reader, vocab, i2w, model, max_epochs=1, epoch_size=908241)




# load the model for epoch 0
# ("model_0.cmf" is the file name train() reports for epoch index 0)
model_path = "model_0.cmf"
model = C.Function.load(model_path)

# create a reader pointing at our testing data
# NOTE(review): the second argument is presumably an "is training" flag -- confirm against create_reader
test_reader = create_reader(dataPath['testing'], False)


# This decodes the test set and counts the string error rate.
def evaluate_decoding(reader, s2smodel, i2w):
    """Decode the whole test set greedily and report the string error rate.

    A sample counts as wrong when its decoded string (with "<s> " prepended)
    differs from the formatted label string.

    Args:
        reader: minibatch source; must provide ``next_minibatch`` and
            ``streams.features`` / ``streams.labels``.
        s2smodel: the trained sequence-to-sequence model.
        i2w: index-to-token mapping handed to ``format_sequences``.

    Returns:
        ``num_wrong / num_total`` over all decoded samples.
    """
    # wrap the greedy decoder around the model
    model_decoding = create_model_greedy(s2smodel)

    progress_printer = C.logging.ProgressPrinter(tag='Evaluation')

    sparse_to_dense = create_sparse_to_dense(input_vocab_dim)

    minibatch_size = 512
    num_total = 0
    num_wrong = 0
    while True:
        mb = reader.next_minibatch(minibatch_size)
        # finish when end of test set reached
        if not mb:
            break
        e = model_decoding(mb[reader.streams.features])
        outputs = format_sequences(e, i2w)
        labels  = format_sequences(sparse_to_dense(mb[reader.streams.labels]), i2w)
        # prepend sentence start for comparison
        outputs = ["<s> " + output for output in outputs]

        num_total += len(outputs)
        num_wrong += sum([label != output for output, label in zip(outputs, labels)])

    rate = num_wrong / num_total
    print("string error rate of {:.1f}% in {} samples".format(100 * rate, num_total))
    return rate


Finished Epoch[1 of 300]: [Training] loss = 0.878420 * 799303, metric = 26.23% * 799303 1755.985s (455.2 samples/s);


# print the string error rate: the fraction of test samples whose decoded
# string differs from the reference label string
evaluate_decoding(test_reader, model, i2w)


# This decodes the test set and counts the string error rate.
def evaluate_decoding(reader, s2smodel, i2w):
    """Decode the whole test set greedily and report a position-wise error rate.

    Every position of every label string is counted; a position counts as
    wrong when the decoded string has a different element at that position.

    NOTE(review): ``format_sequences`` returns space-joined strings, so the
    positions compared here are characters of those strings rather than
    whole phoneme tokens -- confirm this is the intended metric.

    Args:
        reader: minibatch source; must provide ``next_minibatch`` and
            ``streams.features`` / ``streams.labels``.
        s2smodel: the trained sequence-to-sequence model.
        i2w: index-to-token mapping handed to ``format_sequences``.

    Returns:
        ``num_wrong / num_total`` over all compared positions.
    """
    # wrap the greedy decoder around the model
    model_decoding = create_model_greedy(s2smodel)

    progress_printer = C.logging.ProgressPrinter(tag='Evaluation')

    sparse_to_dense = create_sparse_to_dense(input_vocab_dim)

    minibatch_size = 512
    num_total = 0
    num_wrong = 0
    while True:
        mb = reader.next_minibatch(minibatch_size)
        # finish when end of test set reached
        if not mb:
            break
        e = model_decoding(mb[reader.streams.features])
        outputs = format_sequences(e, i2w)
        labels  = format_sequences(sparse_to_dense(mb[reader.streams.labels]), i2w)
        # prepend sentence start for comparison
        outputs = ["<s> " + output for output in outputs]

        for s in range(len(labels)):
            for w in range(len(labels[s])):
                num_total += 1
                # Guard against a prediction that is shorter than the label:
                # positions beyond its end are counted in the total but are
                # not counted as wrong.
                if w < len(outputs[s]):
                    if outputs[s][w] != labels[s][w]:
                        num_wrong += 1

    rate = num_wrong / num_total
    print("{:.1f}".format(100 * rate))
    return rate

# print the phoneme error rate
# NOTE(review): the reader is re-created here, presumably because the previous
# evaluation already consumed the test set -- confirm
test_reader = create_reader(dataPath['testing'], False)
evaluate_decoding(test_reader, model, i2w)





# imports required for showing the attention weight heatmap
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

def translate(tokens, model_decoding, vocab, i2w, show_attention=False):
    """Decode one token sequence and return the predicted output tokens.

    Args:
        tokens: list of input tokens (each must be an entry of ``vocab``).
        model_decoding: greedy decoding version of the model.
        vocab: vocabulary; a token's index is its position in ``vocab``.
        i2w: index-to-token mapping used for the output.
        show_attention: when true (and attention is enabled), display a
            heatmap of the attention weights.

    Returns:
        The list of predicted tokens, or ``[]`` (after printing a message)
        when the input contains a token that is not in ``vocab``.
    """
    vdict = {v: i for i, v in enumerate(vocab)}
    try:
        w = [vdict["<s>"]] + [vdict[c] for c in tokens] + [vdict["</s>"]]
    except KeyError:
        print('Input contains an unexpected token.')
        return []

    # convert to one_hot
    query = C.Value.one_hot([w], len(vdict))
    pred = model_decoding(query)
    # first sequence (we only have one) -> [len, vocab size]
    pred = pred[0]
    if use_attention:
        # attention has extra dimensions
        pred = pred[:,0,0,:]

    # print out translation and stop at the sequence-end tag
    prediction = np.argmax(pred, axis=-1)
    translation = [i2w[i] for i in prediction]

    # show attention window (requires matplotlib, seaborn, and pandas)
    if use_attention and show_attention:
        q = C.combine([model_decoding.attention_model.attention_weights])
        att_value = q(query)

        # get the attention data up to the length of the output (subset of the full window)
        # -> (len, span)
        att_value = att_value[0][0:len(prediction),0:len(w),0,0]

        # set up the actual words/letters for the heatmap axis labels
        columns = [i2w[ww] for ww in prediction]
        index = [i2w[ww] for ww in w]

        dframe = pd.DataFrame(data=np.fliplr(att_value.T), columns=columns, index=index)
        # render the frame; without these calls the heatmap is never displayed
        sns.heatmap(dframe)
        plt.show()

    return translation

上面的translate函式的引數有tokens(使用者輸入的字元列表),model_decoding(我們模型的貪婪解碼版本),vocab(詞彙表),i2w(vocab的索引對映),show_attention (決定是否顯示注意力向量)




def interactive_session(s2smodel, vocab, i2w, show_attention=False):
    """Read words from the user and print their phonetic transcription.

    The loop ends when the user enters "quit". When ``isTest()`` is true a
    fixed word is transcribed once and the loop ends without prompting.

    Args:
        s2smodel: the trained sequence-to-sequence model.
        vocab: vocabulary handed to ``translate``.
        i2w: index-to-token mapping handed to ``translate``.
        show_attention: forwarded to ``translate`` to request the attention
            heatmap.
    """
    # wrap the greedy decoder around the model
    model_decoding = create_model_greedy(s2smodel)

    import sys

    print('Enter one or more words to see their phonetic transcription.')
    while True:
        # Testing a prefilled text for routine testing
        if isTest():
            line = "psychology"
        else:
            line = input("> ")
        if line.lower() == "quit":
            break
        # tokenize. Our task is letter to sound.
        out_line = []
        for word in line.split():
            in_tokens = [c.upper() for c in word]
            # honour the caller's choice instead of always forcing the heatmap
            out_tokens = translate(in_tokens, model_decoding, vocab, i2w, show_attention=show_attention)
            out_line.extend(out_tokens)
        # the sequence-end tag becomes a word separator; every other token
        # loses its first character
        out_line = [" " if tok == '</s>' else tok[1:] for tok in out_line]
        print("=", " ".join(out_line))
        sys.stdout.flush()
        #If test environment we will test the translation only once
        if isTest():
            break


# Start the interactive transcription loop, asking for the attention display.
interactive_session(model, vocab, i2w, show_attention=True)


