Tensorflow教程學習筆記（一）----將自己的資料集轉換成TFRecord

阿新 • • 發佈：2019-01-28

import tensorflow as tf
import numpy as np
import os
import matplotlib.pyplot as plt
import skimage.io as io
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# %%

def get_file(file_dir):
    '''Collect image paths and integer class labels from a class-per-folder dataset.

    Every file found inside a folder below ``file_dir`` is treated as one
    image and is labelled from the name of the folder that directly contains
    it: 'A' -> 1, 'B' -> 2, ..., 'I' -> 9, and any other folder name -> 10.
    Files lying directly in ``file_dir`` belong to no class folder and are
    skipped.

    Args:
        file_dir: root directory of the dataset, e.g. './notMNIST_small/'
    Returns:
        images: image paths, list of str, in shuffled order
        labels: labels, list of int; labels[k] is the label of images[k]
    '''
    # Folder name -> label for 'A'..'I'; every other folder name falls back to 10.
    letter_to_label = {letter: index
                       for index, letter in enumerate('ABCDEFGHI', start=1)}
    default_label = 10

    images = []
    labels = []

    walker = os.walk(file_dir)
    # The first entry yielded by a top-down walk is file_dir itself; files
    # stored there have no class folder, so that entry is dropped.
    next(walker, None)
    for root, _, files in walker:
        # The label is taken from the containing folder at the moment each
        # path is recorded, so images and labels can never get out of step.
        # basename() follows the platform's separator rules instead of
        # assuming '/'.
        label = letter_to_label.get(os.path.basename(root), default_label)
        for name in files:
            images.append(os.path.join(root, name))
            labels.append(label)

    # Shuffle both lists with the same random permutation.
    order = np.random.permutation(len(images))
    image_list = [images[k] for k in order]
    label_list = [labels[k] for k in order]

    return image_list, label_list


# %%
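# Both the image and its label are stored as features; because their types differ,
# labels are stored as int64 values and images as raw bytes.
# Build an integer (int64) feature.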
def int64_feature(value):
    """Build an int64 Feature for an Example proto from an int or a list of ints."""
    # A single value is wrapped so that Int64List always receives a list.
    values = value if isinstance(value, list) else [value]
    int64_list = tf.train.Int64List(value=values)
    return tf.train.Feature(int64_list=int64_list)

# Build a bytes (string) feature.
def bytes_feature(value):
    """Build a bytes Feature for an Example proto holding the single item ``value``."""
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)


# %%

def convert_to_tfrecord(images, labels, save_dir, name):
    '''Convert all images and labels to one tfrecord file.

    Each image is read from disk and written, together with its label, as
    one Example with the features 'label' (int64) and 'image_raw' (bytes).
    An image that raises IOError is reported on stdout and skipped.

    Args:
        images: list of image directories, string type
        labels: list of labels, int type
        save_dir: the directory to save tfrecord file, e.g.: '/home/folder1/'
        name: the name of tfrecord file, string type, e.g.: 'train'
    Return:
        no return
    Raises:
        ValueError: if images and labels do not have the same length
    Note:
        converting needs some time, be patient...
    '''
    # path of the tfrecords file to create
    filename = os.path.join(save_dir, name + '.tfrecords')
    n_samples = len(labels)
    n_images = len(images)

    # images is a plain list, so its size comes from len(), not from .shape
    if n_images != n_samples:
        raise ValueError('Images size %d does not match label size %d.' % (n_images, n_samples))

    # wait some time here, transforming need some time based on the size of your data.
    writer = tf.python_io.TFRecordWriter(filename)
    print('\nTransform start......')
    try:
        for image_path, raw_label in zip(images, labels):
            try:
                # read the image; it has to be an array to be serialised
                image = io.imread(image_path)  # type(image) must be array!
                # raw bytes of the image array
                image_raw = image.tobytes()
                label = int(raw_label)
                # one Example per image, holding the image and its label
                example = tf.train.Example(features=tf.train.Features(feature={
                    'label': int64_feature(label),
                    'image_raw': bytes_feature(image_raw)}))
                writer.write(example.SerializeToString())
            # a damaged image is reported and skipped
            except IOError as e:
                print('Could not read:', image_path)
                print('error: %s' % e)
                print('Skip it!\n')
    finally:
        # close the file even when an unexpected error interrupts the loop
        writer.close()
    print('Transform done!')


# %%
#讀取tfrecord檔案，並生成批次
def read_and_decode(tfrecords_file, batch_size, image_shape=(28, 28)):
    '''Read and decode a tfrecord file, generate (image, label) batches.

    Args:
        tfrecords_file: the directory of tfrecord file
        batch_size: number of images in each batch
        image_shape: shape every decoded image is reshaped to; the default
            (28, 28) is the notMNIST image size, pass another shape for
            other datasets
    Returns:
        image: uint8 tensor - [batch_size] + image_shape,
            i.e. [batch_size, 28, 28] by default
        label: 1D int32 tensor - [batch_size]
    '''
    # make an input queue from the tfrecord file
    filename_queue = tf.train.string_input_producer([tfrecords_file])
    # reader for TFRecord files
    reader = tf.TFRecordReader()
    # read one serialized example from the queue
    _, serialized_example = reader.read(filename_queue)
    # parse the single example into its two features
    img_features = tf.parse_single_example(
        serialized_example,
        features={
            'label': tf.FixedLenFeature([], tf.int64),
            'image_raw': tf.FixedLenFeature([], tf.string),
        })
    # decode the byte string into a flat uint8 pixel vector
    image = tf.decode_raw(img_features['image_raw'], tf.uint8)

    ##########################################################
    # you can put data augmentation here, I didn't use it
    ##########################################################

    # restore the image shape that was lost when the array was serialised
    image = tf.reshape(image, list(image_shape))
    label = tf.cast(img_features['label'], tf.int32)
    image_batch, label_batch = tf.train.batch([image, label],
                                              batch_size=batch_size,
                                              num_threads=64,
                                              capacity=2000)
    return image_batch, tf.reshape(label_batch, [batch_size])


# %% Convert data to TFRecord

# Dataset root that is scanned for images, and the directory that receives the .tfrecords file.
test_dir = 'F://TensorFlow--middleteach//TFRecord_notmnist//notMNIST_small//'
save_dir = 'F://TensorFlow--middleteach//TFRecord_notmnist//'
# Number of images per batch when the records are read back and plotted.
BATCH_SIZE = 25

# Convert test data: you just need to run it ONCE !
name_test = 'test'
# The dataset is scanned on every run; only the conversion below is skipped
# when the record file already exists.
images, labels = get_file(test_dir)
# Same location that convert_to_tfrecord() builds from save_dir and name_test.
tfrecords_file_dir='F://TensorFlow--middleteach//TFRecord_notmnist//test.tfrecords'
if not os.path.exists(tfrecords_file_dir):
    convert_to_tfrecord(images, labels, save_dir, name_test)


# %% Test the generated test.tfrecords file
# One batch holds 25 images, shown as 5 rows by 5 columns.
def plot_images(images, labels):
    '''Plot one batch of images in a grid, each titled with its class letter.

    The grid is sized from the number of images received (25 images give
    5 rows by 5 columns), so the function does not depend on a global batch
    size.

    Args:
        images: sequence of images, each one is passed to plt.imshow
        labels: sequence of int labels, 1 -> 'A', 2 -> 'B', ...
    '''
    n_images = len(images)
    if n_images == 0:
        # nothing to draw
        return

    # smallest near-square grid with room for every image
    n_cols = int(np.ceil(np.sqrt(n_images)))
    n_rows = int(np.ceil(n_images / float(n_cols)))

    for position, (image, label) in enumerate(zip(images, labels), start=1):
        plt.subplot(n_rows, n_cols, position)
        # hide the axes
        plt.axis('off')
        # ord() gives the code point of a character and chr() is its inverse,
        # so label 1 becomes 'A', label 2 becomes 'B', and so on
        plt.title(chr(ord('A') + int(label) - 1), fontsize=14)
        # plt.subplots_adjust(top=0.5)
        plt.imshow(image)
    plt.show()


# Build the input pipeline that yields (image, label) batches from the record file.
tfrecords_file = 'F://TensorFlow--middleteach//TFRecord_notmnist//test.tfrecords'
image_batch, label_batch = read_and_decode(tfrecords_file, batch_size=BATCH_SIZE)

with tf.Session()  as sess:
    # counts the batches plotted so far
    i = 0
    # start the queue-runner threads that feed the input pipeline
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)

    try:
        while not coord.should_stop() and i < 1:
            # just plot one batch size
            image, label = sess.run([image_batch, label_batch])
            plot_images(image, label)
            i += 1

    except tf.errors.OutOfRangeError:
        print('done!')
    finally:
        # ask the queue-runner threads to stop, whatever happened above
        coord.request_stop()
    # wait for the threads to finish before the session is closed
    coord.join(threads)

Tensorflow教程學習筆記（一）----將自己的資料集轉換成TFRecord

import tensorflow as tf import numpy as np import os import matplotlib.pyplot as plt import skimage.io as io os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # %%

TensorFlow官方教程學習筆記（一）——起步

TensorFlow可以拆成兩個詞：Tensor（張量）和Flow（流），Tensor代表最底層的資料結構，每一個Tensor可以簡易的理解為一個多維陣列，類似於Caffe中的Blob，不過與Blob不同的是，對於一張圖片，Tensor的四個維度分別是[batch, h

HIVE簡明教程學習筆記（一）——資料庫及表的操作HIVE DDL

1.建立資料庫 create database if not exists aa_db; 2.檢視資料庫定義 describe database aa_db; 3.檢視資料庫列表 show databases; 4.刪除資料庫 drop database if exists testdb casca

UFLDL 教程學習筆記（一）神經網路

UFLDL（Unsupervised Feature Learning and Deep Learning）Tutorial 是由 Stanford 大學的 Andrew Ng 教授及其團隊

vue餓了嗎學習筆記（一）配置mock資料的Router

vue餓了嗎學習筆記（一）最近在學習慕課網的vue.js高仿餓了麼外賣APP，在專案準備時就遇到了webpack版本問題。視訊中使用的webpack是1.12.2，而現在的webpack 版本已經到了3.6，原先的程式碼已經不適用了。言歸正傳，當我們想

Swift學習筆記（一）——Swift的資料型別

Swift的資料型別 Swift 的型別是在 C 和 Objective-C 的基礎上提出的，Int是整型；Double和Float是浮點型；Bool是布林型；String是字串。Swift 還有兩個有用的集合型別， Array和Dictionary，除了我們熟悉的型別

Python學習筆記（一）：基本資料型別

一、數字型別整數：int浮點數：float注：python不同於其它語言，int不區分short、int、long型別，只有一種型別int；浮點數不區分float與double型別，只有一種型別float，在python中float就表示double注：1/2得到的結果是0.

kinect 2.0 SDK學習筆記（一）--獲得原始資料

工程配置包含目錄加上: $(KINECTSDK20_DIR)\inc; 庫目錄加上： $(KINECTSDK20_DIR)\lib\x64;（注意選擇x64/x86）其中KINECTSDK20_DIR是安裝kinect SDK時的安裝目錄

JVM學習筆記（一）執行時資料區

執行時資料區 java虛擬機器在執行java程式的過程中會把所管理的記憶體劃分為若干個不同的資料區域。這些區域都有各自的用途，以及建立和銷燬的時間。有的區域隨著虛擬機器程序的啟動而存在，隨虛擬機器程序的退出而銷燬；有的區域則依賴使用者執行緒的啟

TCP/IP學習筆記（11）-tcp互動資料流，成塊資料流

目前建立在TCP協議上的網路協議特別多，有telnet，ssh，有ftp，有http等等。這些協議又可以根據資料吞吐量來大致分成兩大類：

Tensorflow 學習筆記（一）mac os 安裝 tensorflow

trap ras str ons bre col hello 指定 any Homebrew 安裝python brew install python 安裝pip curl ‘https://bootstrap.pypa.io/get-pip.py’ > get

TensorFlow學習筆記（一）-- Softmax迴歸模型識別MNIST

最近學習Tensorflow，特此筆記，學習資料為21個專案玩轉深度學習基於TensorFlow的實踐詳解 Softmax迴歸是一個線性的多分類模型，它是從Logistic迴歸模型轉化而來的，不同的是Logistic迴歸模型是一個二分類模型，而Softmax迴歸模型是一個多分類模型

GitHub超過4700星的TensorFlow（Amirsina Torfi博士）程式碼學習筆記（一）

用TensorFlow的應該都知道，git上的一個大神弗吉尼亞理工博士Amirsina Torfi在GitHub上貢獻了一個新的教程，星星數當天就破千，現在已經4721了，估計這個文章寫完又得漲點。完整程式碼連結（1積分）：https://download.csdn.net/downloa

《Tensorflow實戰》學習筆記（一）

深度學習基本四步驟：（1）定義演算法公式，也就是神經網路forward時的計算（2）定義loss，選定優化器，並指定優化器優化loss （3）迭代對資料進行訓練（4）在測試集合對準確率進行評測有用的類 tf.placeholder() tf.Variable(

Tensorflow學習筆記（一）Tensorflow入門

Tensorflow入門前言：本文是閱讀《TensorFlow：實戰Google深度學習框架》第三章提煉出來的筆記，非本人原創。這一章主要介紹： TensorFlow 名字說明最重要兩個概念：Tensor(張量)，Flow(流)。 tensor張量可以理解

tensorflow學習筆記（一）

TensorFlow是谷歌基於DistBelief進行研發的第二代人工智慧學習系統，其命名來源於本身的執行原理。Tensor（張量）意味著N維陣列，Flow（流）意味著基於資料流圖的計算，TensorFlow為張量從流圖的一端流動到另一端計算過程。TensorFlow是將

NVIDIA CUDA初級教程視訊 - 學習筆記（一）CPU體系架構概述

一句話：CPU大量用於資料搬運而非數值運算，用流水線和分支提高效率周斌老師：NVIDIA CUDA初級教程視訊第2集【計算機組成原理】 Q：什麼是CPU? A：是執行指令、處理資料的器件，用於完成基本的邏輯和算術指令，現在增加了複雜功能（記憶體介面、外部裝置介面），包含大量電晶體（上

C# WPF 基礎教程視訊學習筆記（一）

1.[STAThread()] 代表單執行緒 2.using語句允許程式設計師指定使用資源的物件應當何時釋放資源 3.Border 一般用於裝載面板 Padding 邊框和內部內容中間新增空間 CornerRadius可以使邊框具有一個圓角 4.DockP

【TensorFlow學習筆記（一）】利用Anaconda安裝TensorFlow（windows系統）

1.安裝Anaconda Anaconda官網由於檔案很大，所以下載速度會很慢，可以採用映象下載下載完之後，如果你的電腦系統時win8+，一定要以管理員身份執行安裝包。有一個地方需要注意下：第一個勾是是否把Anaconda加入環境變數，這涉及到

Git教程-廖雪峰——學習筆記（一）

按照教程學習了git工具的簡單使用，目前最先進的分散式版本控制系統，無論是從GitHub上學習他人的程式碼，還是儲存自己的程式碼都非常有用。按照教程練習了一遍程式碼，這裡總結一下：一.建立版本庫 1.開啟git工具，用下面的指令可以

Tensorflow教程學習筆記（一）----將自己的資料集轉換成TFRecord

相關推薦