1. 程式人生 > >TF從文件中讀取數據

TF從文件中讀取數據

pytho value extern dea __name__ which detail 二進制 learning

從文件中讀取數據

在TensorFlow中進行模型訓練時,官網給出了三種數據讀取方式,其中最好的方式是利用隊列從文件中讀取,步驟有兩步:

  1. 把樣本數據寫入TFRecords二進制文件
  2. 從隊列中讀取

TFRecords二進制文件,能夠更好的利用內存,更方便的移動和復制,並且不需要單獨的標記文件
下面是官網給出的、對mnist文件進行操作的代碼,具體代碼請參考:tensorflow-master\tensorflow\examples\how_tos\reading_data\convert_to_records.py
(https://www.sogou.com/link?url=DSOYnZeCC_pKZzihDKzFgzQoUkRGi7SFyAyslJcA_SlXxobSKiNyJA..)

生成TFRecords文件

定義主函數,給訓練、驗證、測試數據集做轉換:

def main(unused_argv):
  """Load MNIST and write each split out as a TFRecords file.

  Args:
    unused_argv: Leftover command-line arguments; ignored.
  """
  # Keep pixels as uint8 and leave images un-flattened (reshape=False) so
  # that convert_to() can read rows/cols/depth from the image array shape.
  data_sets = mnist.read_data_sets(
      FLAGS.directory,
      dtype=tf.uint8,
      reshape=False,
      validation_size=FLAGS.validation_size)

  # One <split>.tfrecords file per split, written in this order.
  for split_name in ('train', 'validation', 'test'):
    convert_to(getattr(data_sets, split_name), split_name)

轉換函數convert_to的主要功能是:將數據填入協議緩沖區(protocol buffer),序列化為一個字符串,然後寫入TFRecords文件。


def convert_to(data_set, name):
  """Converts a dataset to tfrecords.

  Serializes every (image, label) pair of `data_set` into a
  tf.train.Example and writes it to `<FLAGS.directory>/<name>.tfrecords`.

  Args:
    data_set: Dataset object exposing `images`, `labels` and `num_examples`.
      `images` is indexed as [index, rows, cols, depth].
    name: Base name of the output file (without the '.tfrecords' suffix).

  Raises:
    ValueError: If the number of images differs from `num_examples`.
  """
  images = data_set.images
  labels = data_set.labels
  num_examples = data_set.num_examples

  if images.shape[0] != num_examples:
    raise ValueError('Images size %d does not match label size %d.' %
                     (images.shape[0], num_examples))
  rows = images.shape[1]   # 28
  cols = images.shape[2]   # 28
  depth = images.shape[3]  # 1: grayscale image, so a single channel

  filename = os.path.join(FLAGS.directory, name + '.tfrecords')
  print('Writing', filename)
  # The context manager closes the writer even when building or writing an
  # example fails part-way, so the file handle is never leaked.
  with tf.python_io.TFRecordWriter(filename) as writer:
    for index in range(num_examples):
      # tobytes() is the supported spelling of the deprecated tostring().
      image_raw = images[index].tobytes()

      # Fill the protocol buffer: height, width, depth and label are stored
      # as int64 features, the raw pixels as a bytes feature.
      example = tf.train.Example(features=tf.train.Features(feature={
          'height': _int64_feature(rows),
          'width': _int64_feature(cols),
          'depth': _int64_feature(depth),
          'label': _int64_feature(int(labels[index])),
          'image_raw': _bytes_feature(image_raw)}))
      writer.write(example.SerializeToString())  # serialize to a string

編碼函數如下:

def _int64_feature(value):
  """Wrap a single integer `value` in a tf.train.Feature (int64_list)."""
  int64_list = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=int64_list)


def _bytes_feature(value):
  """Wrap a single bytes `value` in a tf.train.Feature (bytes_list)."""
  bytes_list = tf.train.BytesList(value=[value])
  return tf.train.Feature(bytes_list=bytes_list)

完整代碼:

import tensorflow as tf
import os
import argparse
import sys

# Sets TF_CPP_MIN_LOG_LEVEL; presumably meant to quiet TensorFlow's native
# logging output -- confirm the exact level semantics against the TF docs.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# 1.0 Generate the TFRecords files
from tensorflow.contrib.learn.python.learn.datasets import mnist

# Parsed command-line flags; assigned in the __main__ block.
FLAGS = None

# Feature-encoding helpers:
def _int64_feature(value):
  """Wrap a single integer `value` in a tf.train.Feature (int64_list)."""
  int64_list = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=int64_list)


def _bytes_feature(value):
  """Wrap a single bytes `value` in a tf.train.Feature (bytes_list)."""
  bytes_list = tf.train.BytesList(value=[value])
  return tf.train.Feature(bytes_list=bytes_list)


def convert_to(data_set, name):
  """Converts a dataset to tfrecords.

  Serializes every (image, label) pair of `data_set` into a
  tf.train.Example and writes it to `<FLAGS.directory>/<name>.tfrecords`.

  Args:
    data_set: Dataset object exposing `images`, `labels` and `num_examples`.
      `images` is indexed as [index, rows, cols, depth].
    name: Base name of the output file (without the '.tfrecords' suffix).

  Raises:
    ValueError: If the number of images differs from `num_examples`.
  """
  images = data_set.images
  labels = data_set.labels
  num_examples = data_set.num_examples

  if images.shape[0] != num_examples:
    raise ValueError('Images size %d does not match label size %d.' %
                     (images.shape[0], num_examples))
  rows = images.shape[1]   # 28
  cols = images.shape[2]   # 28
  depth = images.shape[3]  # 1: grayscale image, so a single channel

  filename = os.path.join(FLAGS.directory, name + '.tfrecords')
  print('Writing', filename)
  # The context manager closes the writer even when building or writing an
  # example fails part-way, so the file handle is never leaked.
  with tf.python_io.TFRecordWriter(filename) as writer:
    for index in range(num_examples):
      # tobytes() is the supported spelling of the deprecated tostring().
      image_raw = images[index].tobytes()

      # Fill the protocol buffer: height, width, depth and label are stored
      # as int64 features, the raw pixels as a bytes feature.
      example = tf.train.Example(features=tf.train.Features(feature={
          'height': _int64_feature(rows),
          'width': _int64_feature(cols),
          'depth': _int64_feature(depth),
          'label': _int64_feature(int(labels[index])),
          'image_raw': _bytes_feature(image_raw)}))
      writer.write(example.SerializeToString())  # serialize to a string


def main(unused_argv):
  """Load MNIST and write each split out as a TFRecords file.

  Args:
    unused_argv: Leftover command-line arguments; ignored.
  """
  # Keep pixels as uint8 and leave images un-flattened (reshape=False) so
  # that convert_to() can read rows/cols/depth from the image array shape.
  data_sets = mnist.read_data_sets(
      FLAGS.directory,
      dtype=tf.uint8,
      reshape=False,
      validation_size=FLAGS.validation_size)

  # One <split>.tfrecords file per split, written in this order.
  for split_name in ('train', 'validation', 'test'):
    convert_to(getattr(data_sets, split_name), split_name)

if __name__ == '__main__':
  # Command-line interface: where to put the data and how many training
  # examples to set aside for validation.
  cli = argparse.ArgumentParser()
  cli.add_argument(
      '--directory', type=str, default='MNIST_data/',
      help='Directory to download data files and write the converted result')
  cli.add_argument(
      '--validation_size', type=int, default=5000,
      help="""      Number of examples to separate from the training data for the validation
      set.      """)
  # Recognized flags land in the module-level FLAGS; anything else is passed
  # through to tf.app.run together with the program name.
  FLAGS, unparsed = cli.parse_known_args()
  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)

運行結束後,在 --directory 參數指定的目錄(默認為 MNIST_data/)下生成3個文件,即train.tfrecords,validation.tfrecords和test.tfrecords。註意:下文讀取腳本的 --train_dir 默認為 /tmp/data,兩個目錄需要保持一致。

從隊列中讀取

讀取TFRecords文件步驟

使用隊列讀取TFRecords文件數據的步驟

  1. 創建張量,從二進制文件讀取一個樣本數據
  2. 創建張量,從二進制文件隨機讀取一個mini-batch
  3. 把每一批張量傳入網絡作為輸入點

TensorFlow使用TFRecords文件訓練樣本的步驟
在生成文件名的序列中,設定epoch數量
訓練時,設定為無窮循環
在讀取數據時,如果捕捉到錯誤,終止

source code:tensorflow-master\tensorflow\examples\how_tos\reading_data\fully_connected_reader.py(1.2.1)
(https://blog.csdn.net/fontthrone/article/details/76728083 )


import tensorflow as tf
import os

# from tensorflow.contrib.learn.python.learn.datasets import mnist
# Note: the mnist module in the commented-out import above is NOT the same as
# the one under tensorflow.examples; this file must use the one imported below.

# Sets TF_CPP_MIN_LOG_LEVEL; presumably meant to quiet TensorFlow's native
# logging output -- confirm the exact level semantics against the TF docs.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import argparse
import os.path
import sys
import time

from tensorflow.examples.tutorials.mnist import mnist

# Basic model parameters as external flags.
# Parsed command-line flags; assigned in the __main__ block.
FLAGS = None

# This part of the code is added by FontTian, which comes from the source code of tensorflow.examples.tutorials.mnist
# The MNIST images are always 28x28 pixels.
# IMAGE_SIZE = 28
# IMAGE_PIXELS = IMAGE_SIZE * IMAGE_SIZE

# Constants used for dealing with the files, matches convert_to_records.
TRAIN_FILE = 'train.tfrecords'
VALIDATION_FILE = 'validation.tfrecords'

def read_and_decode(filename_queue):
    """Read one serialized Example from the queue and decode it.

    Args:
      filename_queue: Queue of TFRecords file names to read from.

    Returns:
      A tuple (image, label): `image` is a float32 tensor with static shape
      [mnist.IMAGE_PIXELS] scaled into [-0.5, 0.5]; `label` is an int32
      scalar tensor.
    """
    record_reader = tf.TFRecordReader()
    _, serialized_example = record_reader.read(filename_queue)

    # Defaults are not specified since both keys are required. The key names
    # have to be spelled out exactly as they appear in the stored features.
    feature_spec = {
        'image_raw': tf.FixedLenFeature([], tf.string),
        'label': tf.FixedLenFeature([], tf.int64),
    }
    parsed = tf.parse_single_example(serialized_example,
                                     features=feature_spec)

    # 'image_raw' is a scalar (rank-0) string tensor; decoding it as uint8
    # yields a rank-1 tensor, whose static length is then pinned to
    # mnist.IMAGE_PIXELS.
    pixels = tf.decode_raw(parsed['image_raw'], tf.uint8)
    pixels.set_shape([mnist.IMAGE_PIXELS])

    # OPTIONAL: could reshape into a 28x28 image and apply distortions here.
    # No distortions are applied in this example and the next step expects a
    # flattened vector, so the image is left flat.

    # Convert from [0, 255] -> [-0.5, 0.5] floats.
    image = tf.cast(pixels, tf.float32) * (1. / 255) - 0.5

    # The label is parsed as an int64 scalar; narrow it to int32.
    label = tf.cast(parsed['label'], tf.int32)

    return image, label


# Uses tf.train.shuffle_batch to shuffle the decoded examples and assemble
# them into mini-batch tensors.
def inputs(train, batch_size, num_epochs):
    """Reads input data num_epochs times.

    Args:
      train: Selects between the training (True) and validation (False) data.
      batch_size: Number of examples per returned batch.
      num_epochs: Number of times to read the input data, or 0/None to
         train forever.

    Returns:
      A tuple (images, labels), where:
      * images is a float tensor with shape [batch_size, mnist.IMAGE_PIXELS]
        in the range [-0.5, 0.5].
      * labels is an int32 tensor with shape [batch_size] with the true label,
        a number in the range [0, mnist.NUM_CLASSES).
      Note that an tf.train.QueueRunner is added to the graph, which
      must be run using e.g. tf.train.start_queue_runners().
    """
    # 0 and None both mean "no epoch limit".
    num_epochs = num_epochs or None

    split_file = TRAIN_FILE if train else VALIDATION_FILE
    filename = os.path.join(FLAGS.train_dir, split_file)

    # Number of examples kept in the shuffle queue after each dequeue, which
    # guarantees a minimum amount of shuffling.
    min_queue_examples = 1000

    with tf.name_scope('input'):
        # A single file here; a large dataset could be split into several
        # files and the whole list of names passed in instead.
        filename_queue = tf.train.string_input_producer(
            [filename], num_epochs=num_epochs)

        # Even when reading in multiple threads, share the filename queue.
        image, label = read_and_decode(filename_queue)

        # Shuffle the examples and collect them into batch_size batches.
        # (Internally uses a RandomShuffleQueue.)
        # Two threads are used so that batching does not become a bottleneck.
        images, sparse_labels = tf.train.shuffle_batch(
            [image, label],
            batch_size=batch_size,
            num_threads=2,
            capacity=min_queue_examples + 3 * batch_size,
            min_after_dequeue=min_queue_examples)

    return images, sparse_labels

def run_training():
    """Train MNIST until the input pipeline runs out of data.

    Builds the input pipeline, the inference model, the loss and the training
    op in a fresh graph, then runs the training op in a loop. The loop ends
    when the input queue raises tf.errors.OutOfRangeError or the coordinator
    is asked to stop.

    Reads FLAGS.batch_size, FLAGS.num_epochs, FLAGS.hidden1, FLAGS.hidden2
    and FLAGS.learning_rate. The session is closed and the queue-runner
    threads are joined on every exit path, including unexpected errors.
    """
    # Tell TensorFlow that the model will be built into the default Graph.
    with tf.Graph().as_default():
        # Input images and labels.
        images, labels = inputs(train=True, batch_size=FLAGS.batch_size,
                                num_epochs=FLAGS.num_epochs)

        # Build a graph that computes predictions from the inference model.
        logits = mnist.inference(images,
                                 FLAGS.hidden1,
                                 FLAGS.hidden2)

        # Add to the Graph the loss calculation.
        loss = mnist.loss(logits, labels)

        # Add to the Graph the ops that apply the training step.
        train_op = mnist.training(loss, FLAGS.learning_rate)

        # Initializes global and local variables. Note:
        # string_input_producer creates an epoch counter internally, which
        # is why the local initializer is grouped in as well.
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())

        # The context manager closes the session even if training fails.
        with tf.Session() as sess:
            sess.run(init_op)

            # Start input enqueue threads.
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)

            step = 0
            try:
                while not coord.should_stop():
                    start_time = time.time()

                    # Run one step of the model. The value returned for
                    # `train_op` is discarded; only the loss is kept.
                    _, loss_value = sess.run([train_op, loss])

                    duration = time.time() - start_time

                    # Print an overview fairly often.
                    if step % 100 == 0:
                        print('Step %d: loss = %.2f (%.3f sec)' %
                              (step, loss_value, duration))
                    step += 1
            except tf.errors.OutOfRangeError:
                # Raised once the input queue has served all epochs.
                print('Done training for %d epochs, %d steps.' %
                      (FLAGS.num_epochs, step))
            finally:
                # Ask the other threads to stop, then wait for them while
                # the session they use is still open.
                coord.request_stop()
                coord.join(threads)

def main(_):
    """Entry point for tf.app.run; the argv argument is ignored."""
    run_training()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # (flag, type, default, help) for every supported command-line option,
    # registered in this order.
    flag_specs = (
        ('--learning_rate', float, 0.01, 'Initial learning rate.'),
        ('--num_epochs', int, 2, 'Number of epochs to run trainer.'),
        ('--hidden1', int, 128, 'Number of units in hidden layer 1.'),
        ('--hidden2', int, 32, 'Number of units in hidden layer 2.'),
        ('--batch_size', int, 100, 'Batch size.'),
        ('--train_dir', str, '/tmp/data', 'Directory with the training data.'),
    )
    for flag, flag_type, default, help_text in flag_specs:
        parser.add_argument(flag, type=flag_type, default=default,
                            help=help_text)

    # Recognized flags land in the module-level FLAGS; anything else is
    # passed through to tf.app.run together with the program name.
    FLAGS, unparsed = parser.parse_known_args()
    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)

TF從文件中讀取數據