Mask R-CNN訓練自己的資料集在win10上的踩坑全過程:CUDA9.0+CUDNN7.1.4+Tensorflow-gpu1.9.0+keras-gpu2.2.4
阿新 • • 發佈:2018-12-17
基礎配置
- 首先你需要在win10上下載Git(用於我們在github上面下載原始碼)和MinGW(方便我們在win10上也能用linux的make操作命令)。
- 接著你要下載cuda9.0和cudnn7.1來繫結你的windows的Nvidia
- 接著你需要在win10上面安裝anaconda3(切記,python用的是3.6.x,目前的tensorflow-gpu 1.9.0只能匹配這個版本)
- 然後在現有的base環境下(或者配置新環境),按照順序依次用conda install cudatoolkit==9.0->conda install cudnn==7.1.4->conda install tensorflow-gpu==1.9.0->conda install keras-gpu==2.2.4
- 然後記住要裝一些其他需要的包,比如(matplotlib、openCV、cython、scikit-image等等),總之就是按照提示conda install或者pip install
前期需要下載的內容
- 首先大家用在github上把mask r-cnn的專案下載到本地;
- 然後要把cocoapi下載到本地;
- 接著是把預權重mask_rcnn_coco.h5放到mask r-cnn檔案下面的新建資料夾logs下面;
- 最後是開啟anaconda prompt,cd到cocoapi/PythonAPI的目錄,依次輸入make和make -j8;
- 把PythonAPI下面更新的pycocotools放到mask r-cnn檔案的samples/coco/裡;
需要注意的地方
- mask r-cnn在訓練的時候,gpu視訊記憶體會被一直佔用,但是利用率只會偶爾上升是正常現象,大部分時間gpu利用率還是比較低,只要gpu視訊記憶體被佔用就說明後方是在用gpu加速的。
- 同時cpu記憶體也會被佔用一些,利用率大概在百分之三十多。
先附上mask-rcnn的官方測試程式碼:
# Official Mask R-CNN demo, adapted: load the COCO weights, run detection on
# one image and print how long detection plus visualisation took.
import os
import sys
import random
import math
import numpy as np
import skimage.io
import matplotlib
import matplotlib.pyplot as plt
import cv2
import time

# Raw string so the backslash in the Windows path is never read as an escape.
ROOT_DIR = os.path.abspath(r"G:\Mask_RCNN")
sys.path.append(ROOT_DIR)  # must run before the mrcnn imports below
from mrcnn import utils
import mrcnn.model as modellib
from mrcnn import visualize

sys.path.append(os.path.join(ROOT_DIR, "samples/coco/"))
import coco

MODEL_DIR = os.path.join(ROOT_DIR, "logs")
COCO_MODEL_PATH = os.path.join(MODEL_DIR, "mask_rcnn_coco.h5")
if not os.path.exists(COCO_MODEL_PATH):
    utils.download_trained_weights(COCO_MODEL_PATH)
print("cuiwei***********************")
IMAGE_DIR = os.path.join(ROOT_DIR, "images")


class InferenceConfig(coco.CocoConfig):
    """COCO configuration restricted to one image per detect() call."""

    GPU_COUNT = 1
    IMAGES_PER_GPU = 1


config = InferenceConfig()
config.display()

model = modellib.MaskRCNN(mode="inference", model_dir=MODEL_DIR, config=config)
# model.load_weights(COCO_MODEL_PATH, by_name=True)
# NOTE(review): the layers named in `exclude` are skipped while loading, so
# they presumably keep their initial (untrained) weights -- confirm that this
# is really wanted for an inference-only run.
model.load_weights(COCO_MODEL_PATH, by_name=True,
                   exclude=["mrcnn_class_logits", "mrcnn_bbox_fc"])

# Full COCO label list, kept for reference:
# class_names = ['BG', 'person', 'bicycle', 'car', 'motorcycle', 'airplane',
#                'bus', 'train', 'truck', 'boat', 'traffic light',
#                'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird',
#                'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear',
#                'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie',
#                'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
#                'kite', 'baseball bat', 'baseball glove', 'skateboard',
#                'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
#                'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
#                'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
#                'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed',
#                'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
#                'keyboard', 'cell phone', 'microwave', 'oven', 'toaster',
#                'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
#                'teddy bear', 'hair drier', 'toothbrush']
# NOTE(review): only two names are listed here; a predicted class id of 2 or
# more would index past the end of this list -- verify against the config.
class_names = ['BG', 'person']

# file_names = next(os.walk(IMAGE_DIR))[2]
# image = skimage.io.imread(os.path.join(IMAGE_DIR, random.choice(file_names)))
IMAGE_PATH = "G:\\Mask_RCNN\\images\\25691390_f9944f61b5_z.jpg"
image = cv2.imread(IMAGE_PATH)
if image is None:
    # cv2.imread signals an unreadable or missing file by returning None.
    raise FileNotFoundError("Cannot read image: " + IMAGE_PATH)
# OpenCV decodes to BGR channel order; convert to RGB, the order produced by
# the skimage loader used in the commented-out alternative above.
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# cap = cv2.VideoCapture(0)
i = 0
while i < 1:
    i += 1
    # ret, frame = cap.read()
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # recommended high-resolution timer for measuring intervals.
    start = time.perf_counter()
    # results = model.detect([frame], verbose=1)
    results = model.detect([image], verbose=1)
    r = results[0]
    # visualize.display_instances(frame, r['rois'], r['masks'], r['class_ids'], class_names, r['scores'])
    visualize.display_instances(image, r['rois'], r['masks'], r['class_ids'],
                                class_names, r['scores'])
    end = time.perf_counter()
    print(end - start)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break
# cap.release()
cv2.destroyAllWindows()
在測試程式碼中會踩到的坑
Traceback (most recent call last):
File "<ipython-input-1-fdee81fb82fb>", line 1, in <module>
runfile('G:/labelme/test.py', wdir='G:/labelme')
File "C:\Users\34905\Anaconda3\envs\cv2\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 704, in runfile
execfile(filename, namespace)
File "C:\Users\34905\Anaconda3\envs\cv2\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 108, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "G:/labelme/test.py", line 48, in <module>
model.load_weights(COCO_MODEL_PATH, by_name=True)
File "G:\Mask_RCNN\mrcnn\model.py", line 2131, in load_weights
saving.load_weights_from_hdf5_group_by_name(f, layers)
File "C:\Users\34905\Anaconda3\envs\cv2\lib\site-packages\keras\engine\saving.py", line 1149, in load_weights_from_hdf5_group_by_name
str(weight_values[i].shape) + '.')
ValueError: Layer #389 (named "mrcnn_bbox_fc"), weight <tf.Variable 'mrcnn_bbox_fc/kernel:0' shape=(1024, 8) dtype=float32_ref> has shape (1024, 8), but the saved weight has shape (1024, 324).
如果你不想測試coco裡預設的81類,只想測試2類,那一定記住要把model.load_weights(COCO_MODEL_PATH, by_name=True)改為model.load_weights(COCO_MODEL_PATH, by_name=True, exclude=["mrcnn_class_logits", "mrcnn_bbox_fc","mrcnn_bbox", "mrcnn_mask"])。注意:被exclude的這幾層不會載入預訓練權重,需要先用自己的資料訓練之後,檢測結果才有意義。
附上訓練程式碼
# Training script preamble: imports, project paths and pre-trained weights.
import os
import sys
import random
import math
import re
import time
import numpy as np
import cv2
import matplotlib
import matplotlib.pyplot as plt
import tensorflow as tf

# ROOT_DIR = os.getcwd()
# Raw string so the backslash in the Windows path is never read as an escape
# ("\M" is an invalid escape sequence in a normal string literal).
ROOT_DIR = os.path.abspath(r"G:\Mask_RCNN")
sys.path.append(ROOT_DIR)  # must run before the mrcnn imports below
from mrcnn.config import Config
# import utils
from mrcnn import model as modellib, utils
from mrcnn import visualize
import yaml
from mrcnn.model import log
from PIL import Image

# Directory holding the pre-trained weights; also passed to the model as model_dir.
MODEL_DIR = os.path.join(ROOT_DIR, "logs")
iter_num = 0

# Fetch the COCO weights only when they are not already on disk.
COCO_MODEL_PATH = os.path.join(MODEL_DIR, "mask_rcnn_coco.h5")
if not os.path.exists(COCO_MODEL_PATH):
    utils.download_trained_weights(COCO_MODEL_PATH)
class ShapesConfig(Config):
    """Training configuration for the single-class "pods" dataset.

    Only the attributes listed here are overridden; every other setting is
    inherited from the base ``Config``.
    """

    # Name given to this configuration.
    NAME = "pods"

    # Hardware / batch layout.
    GPU_COUNT = 1
    IMAGES_PER_GPU = 2

    # Two classes in total (written as 1 + 1 in the original recipe).
    NUM_CLASSES = 2

    # Image size limits.
    IMAGE_MIN_DIM = 320
    IMAGE_MAX_DIM = 384

    # Region-proposal-network anchor settings.
    # Alternative scales tried earlier: (8 * 6, 16 * 6, 32 * 6, 64 * 6, 128 * 6)
    RPN_ANCHOR_SCALES = (32, 64, 128, 256, 512)
    RPN_ANCHOR_RATIOS = [0.5, 1, 2]
    RPN_ANCHOR_STRIDE = 1
    RPN_NMS_THRESHOLD = 0.7
    RPN_TRAIN_ANCHORS_PER_IMAGE = 256

    # Per-image training limits.
    TRAIN_ROIS_PER_IMAGE = 100
    MAX_GT_INSTANCES = 100

    # Schedule lengths.
    STEPS_PER_EPOCH = 28
    VALIDATION_STEPS = 5
# Instantiate the training configuration and call its display() method.
config = ShapesConfig()
config.display()
class DrugDataset(utils.Dataset):
    """Dataset of labelme-annotated images with a single "pod" class.

    Every image record stores the path of a label PNG (pixel value ``k``
    marks instance ``k``; other values are ignored) and the path of the
    ``info.yaml`` file that lists the label names.
    """

    def get_obj_index(self, image):
        """Return the largest pixel value of ``image``.

        The caller uses this value as the number of instances in the mask.
        """
        return np.max(image)

    def from_yaml_get_class(self, image_id):
        """Return the label names of an image without the first entry.

        Args:
            image_id: index into ``self.image_info``.

        Returns:
            The ``label_names`` list read from the image's ``yaml_path``
            file, with its first element dropped.
        """
        info = self.image_info[image_id]
        with open(info['yaml_path']) as f:
            # safe_load only builds plain data; yaml.load without an explicit
            # Loader can construct arbitrary objects and is rejected by
            # recent PyYAML releases.
            temp = yaml.safe_load(f)
        return temp['label_names'][1:]

    def draw_mask(self, num_obj, mask, image, image_id):
        """Fill ``mask`` so that channel ``k`` marks pixels equal to ``k + 1``.

        Args:
            num_obj: number of mask channels to fill.
            mask: array indexed as ``[row, column, channel]``; modified in
                place and also returned.
            image: label image (a PIL image in the caller).
            image_id: index into ``self.image_info``; its ``width`` and
                ``height`` bound the region that is scanned.

        Returns:
            The same ``mask`` array.
        """
        info = self.image_info[image_id]
        pixels = np.asarray(image)
        if pixels.ndim != 2:
            # A multi-band image yields one tuple per pixel, which never
            # equals an integer instance id, so nothing would be marked.
            return mask
        pixels = pixels[:info['height'], :info['width']]
        for index in range(num_obj):
            # Whole-array comparison replaces the per-pixel getpixel() loop.
            mask[:, :, index][pixels == index + 1] = 1
        return mask

    def load_shapes(self, count, img_folder, mask_folder, imglist, dataset_root_path):
        """Register the "pod" class and the first ``count`` images of ``imglist``.

        Args:
            count: number of entries of ``imglist`` to register.
            img_folder: folder containing the source pictures.
            mask_folder: folder containing one ``<name>.png`` mask per picture.
            imglist: picture file names.
            dataset_root_path: dataset root, expected to end with a path
                separator because sub-paths are appended by concatenation.

        Raises:
            FileNotFoundError: if an image's ``img.png`` cannot be read.
        """
        self.add_class("shapes", 1, "pod")
        for i in range(count):
            filestr = imglist[i].split(".")[0]
            json_dir = dataset_root_path + "labelme_json/" + filestr + "_json/"
            mask_path = mask_folder + "/" + filestr + ".png"
            yaml_path = json_dir + "info.yaml"
            img_path = json_dir + "img.png"
            print(img_path)
            # The image is read only to obtain its width and height.
            cv_img = cv2.imread(img_path)
            if cv_img is None:
                # cv2.imread returns None instead of raising on failure.
                raise FileNotFoundError("Cannot read image: " + img_path)
            self.add_image("shapes", image_id=i, path=img_folder + "/" + imglist[i],
                           width=cv_img.shape[1], height=cv_img.shape[0],
                           mask_path=mask_path, yaml_path=yaml_path)

    def load_mask(self, image_id):
        """Build the instance masks and class ids for one image.

        Args:
            image_id: index into ``self.image_info``.

        Returns:
            A tuple ``(mask, class_ids)``: ``mask`` is a uint8 array of shape
            ``[height, width, instance count]`` and ``class_ids`` is an int32
            array with one entry per label that contains "pod".
        """
        print("image_id", image_id)
        info = self.image_info[image_id]
        # The context manager closes the mask file once the pixels are read.
        with Image.open(info['mask_path']) as img:
            num_obj = int(self.get_obj_index(img))
            mask = np.zeros([info['height'], info['width'], num_obj], dtype=np.uint8)
            mask = self.draw_mask(num_obj, mask, img, image_id)
        # No occlusion pass: each pixel of the label image holds a single
        # value, so the channels cannot overlap. The earlier pass never ran
        # (its counter was fixed at 1) and its mask[:, :, -1] lookup failed
        # for images without any instance.
        labels = self.from_yaml_get_class(image_id)
        labels_form = ["pod" for label in labels if "pod" in label]
        class_ids = np.array([self.class_names.index(s) for s in labels_form])
        return mask, class_ids.astype(np.int32)
def get_ax(rows=1, cols=1, size=8):
    """Create a ``rows`` x ``cols`` subplot grid and return its axes.

    The figure is ``size * cols`` wide and ``size * rows`` tall.
    """
    figure_size = (size * cols, size * rows)
    _figure, axes = plt.subplots(rows, cols, figsize=figure_size)
    return axes
# Build the datasets, create the model and run the two training stages.
# dataset_root_path = "train_data/"
# The trailing slash matters: sub-paths are appended by string concatenation.
dataset_root_path = os.path.join(ROOT_DIR, "train_data/")
img_folder = dataset_root_path + "pic"
mask_folder = dataset_root_path + "cv2_mask"
imglist = os.listdir(img_folder)
count = len(imglist)

# Training set: every picture in the folder.
dataset_train = DrugDataset()
dataset_train.load_shapes(count, img_folder, mask_folder, imglist, dataset_root_path)
dataset_train.prepare()

# Validation set: the first (up to) 56 pictures of the same list. The count
# is clamped so a smaller dataset no longer indexes past the end of imglist.
# NOTE(review): these pictures are also part of the training set.
val_count = min(56, count)
dataset_val = DrugDataset()
dataset_val.load_shapes(val_count, img_folder, mask_folder, imglist, dataset_root_path)
dataset_val.prepare()

model = modellib.MaskRCNN(mode="training", config=config, model_dir=MODEL_DIR)

# Which weights to start from: "imagenet", "coco" or "last".
init_with = "coco"
if init_with == "imagenet":
    model.load_weights(model.get_imagenet_weights(), by_name=True)
elif init_with == "coco":
    # Skip the layers listed in `exclude` when loading the COCO weights.
    model.load_weights(COCO_MODEL_PATH, by_name=True,
                       exclude=["mrcnn_class_logits", "mrcnn_bbox_fc",
                                "mrcnn_bbox", "mrcnn_mask"])
elif init_with == "last":
    # Depending on the Mask_RCNN version, find_last() appears to return either
    # the checkpoint path or a (directory, checkpoint) pair -- accept both.
    last_weights = model.find_last()
    if isinstance(last_weights, tuple):
        last_weights = last_weights[1]
    model.load_weights(last_weights, by_name=True)

# Stage 1: train the head layers up to epoch 20.
model.train(dataset_train, dataset_val,
            learning_rate=config.LEARNING_RATE,
            epochs=20, layers='heads')
# Stage 2: fine-tune all layers at a lower learning rate. `epochs` is the
# epoch number to stop at, not the number of extra epochs, so it must exceed
# the 20 already completed; 40 gives 20 further epochs.
model.train(dataset_train, dataset_val,
            learning_rate=config.LEARNING_RATE / 10,
            epochs=40, layers="all")
訓練程式碼中遇到的坑
Traceback (most recent call last):
File "<ipython-input-10-4a900268c3f6>", line 1, in <module>
runfile('D:/Mask R-CNN/train.py', wdir='D:/Mask R-CNN')
File "D:\Anaconda3\Anaconda3-5.3.0\envs\cv2\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 704, in runfile
execfile(filename, namespace)
File "D:\Anaconda3\Anaconda3-5.3.0\envs\cv2\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 108, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "D:/Mask R-CNN/train.py", line 158, in <module>
model.load_weights(COCO_MODEL_PATH, by_name=True,exclude=["mrcnn_class_logits", "mrcnn_bbox_fc","mrcnn_bbox", "mrcnn_mask"])
File "D:\Mask R-CNN\mrcnn\model.py", line 2131, in load_weights
saving.load_weights_from_hdf5_group_by_name(f, layers)
File "D:\Anaconda3\Anaconda3-5.3.0\envs\cv2\lib\site-packages\keras\engine\saving.py", line 1104, in load_weights_from_hdf5_group_by_name
g = f[name]
File "h5py\_objects.pyx", line 54, in h5py._objects.with_phil.wrapper
File "h5py\_objects.pyx", line 55, in h5py._objects.with_phil.wrapper
File "D:\Anaconda3\Anaconda3-5.3.0\envs\cv2\lib\site-packages\h5py\_hl\group.py", line 177, in __getitem__
oid = h5o.open(self.id, self._e(name), lapl=self._lapl)
File "h5py\_objects.pyx", line 54, in h5py._objects.with_phil.wrapper
File "h5py\_objects.pyx", line 55, in h5py._objects.with_phil.wrapper
File "h5py\h5o.pyx", line 190, in h5py.h5o.open
KeyError: 'Unable to open object (wrong B-tree signature)'
這個bug是我目前遇到最難察覺問題的bug,而且網上關於這個bug的解決方法很少,經過我仔細的排查,我發現是我的mask_rcnn_coco.h5檔案有問題。我重新下載後,就解決了這個bug,建議大家在重新搭建平臺的時候,不要用上一個平臺已經用過的預權重 mask_rcnn_coco.h5,會發生很多不可知的錯誤。最好還是重新去網上下載一個。
測試自己的訓練程式碼
# Inference script preamble: imports and the paths of the trained weights.
import os
import sys
import random
import math
import numpy as np
import skimage.io
import matplotlib
import matplotlib.pyplot as plt
import cv2
import time

# Raw string so the backslash in the Windows path is never read as an escape
# ("\M" is an invalid escape sequence in a normal string literal).
ROOT_DIR = os.path.abspath(r"G:\Mask_RCNN")
sys.path.append(ROOT_DIR)  # must run before the mrcnn imports below
from mrcnn.config import Config
from datetime import datetime
from mrcnn import utils
import mrcnn.model as modellib
from mrcnn import visualize
sys.path.append(os.path.join(ROOT_DIR, "samples/coco/"))
from samples.coco import coco

# Log folder of the training run whose checkpoint is evaluated.
MODEL_DIR = os.path.join(ROOT_DIR, "logs/shapes20181122T2257")
# MODEL_DIR = os.path.join(ROOT_DIR, "logs")
# Despite its name, this points at the self-trained checkpoint, not the COCO weights.
COCO_MODEL_PATH = os.path.join(MODEL_DIR, "mask_rcnn_shapes_0011.h5")
# COCO_MODEL_PATH = os.path.join(MODEL_DIR, "mask_rcnn_coco.h5")
# Folder containing the pictures to run detection on.
IMAGE_DIR = os.path.join(ROOT_DIR, "train_data/pic")
# IMAGE_DIR = os.path.join(ROOT_DIR, "images")
class ShapesConfig(Config):
    """Base configuration used when testing the self-trained "shapes" model.

    Only the attributes listed here are overridden; every other setting is
    inherited from the base ``Config``.
    """

    # Name given to this configuration.
    NAME = "shapes"

    # Hardware / batch layout.
    GPU_COUNT = 1
    IMAGES_PER_GPU = 1

    # Two classes in total (written as 1 + 1 in the original recipe).
    NUM_CLASSES = 2

    # Image size limits.
    IMAGE_MIN_DIM = 320
    IMAGE_MAX_DIM = 384

    # Anchor scales: 6 times (8, 16, 32, 64, 128).
    # NOTE(review): the training configuration shown earlier uses
    # (32, 64, 128, 256, 512) -- confirm which scales the checkpoint was
    # actually trained with.
    RPN_ANCHOR_SCALES = (48, 96, 192, 384, 768)

    TRAIN_ROIS_PER_IMAGE = 100

    # Schedule lengths.
    STEPS_PER_EPOCH = 10
    VALIDATION_STEPS = 10
class InferenceConfig(ShapesConfig):
    """Inference-time settings layered on top of ``ShapesConfig``."""

    # Re-declared explicitly: the script passes a single image per detect() call.
    IMAGES_PER_GPU = 1
    GPU_COUNT = 1
# Load the trained weights, run detection on one picture and show the result.
config = InferenceConfig()
# Build the inference model once (it was previously constructed twice in a row).
model = modellib.MaskRCNN(mode="inference", model_dir=MODEL_DIR, config=config)
model.load_weights(COCO_MODEL_PATH, by_name=True)
class_names = ['BG', 'pod']

# Listing of the picture folder; only needed by the random-choice alternative below.
file_names = next(os.walk(IMAGE_DIR))[2]

i = 1
while i < 2:
    i = i + 1
    # image = skimage.io.imread(os.path.join(IMAGE_DIR, random.choice(file_names)))
    image = skimage.io.imread(os.path.join(IMAGE_DIR, "1001-01b.jpg"))
    a = datetime.now()
    results = model.detect([image], verbose=1)
    b = datetime.now()
    # total_seconds() keeps the fractional part; .seconds would truncate a
    # sub-second detection to 0.
    print("time:", (b - a).total_seconds())
    r = results[0]
    visualize.display_instances(image, r['rois'], r['masks'], r['class_ids'],
                                class_names, r['scores'])
同樣的在Linux的Ubuntu 16.04上測試我的程式碼的時候遇到了這個問題。
遇到這個問題大家不要慌,這是因為大家設定了多執行緒,但是執行緒不同步造成的。大家耐心等一會,就會出現loss了。同時如果大家不希望出現多執行緒,大家可以改為單執行緒。更改方法如下:在Mask RCNN\mrcnn\model.py中
# Excerpt of the training call in mrcnn/model.py (the surrounding method is
# not shown here). Training starts at self.epoch and stops at `epochs`.
self.keras_model.fit_generator(
    train_generator,
    initial_epoch=self.epoch,
    epochs=epochs,
    steps_per_epoch=self.config.STEPS_PER_EPOCH,
    callbacks=callbacks,
    validation_data=val_generator,
    validation_steps=self.config.VALIDATION_STEPS,
    max_queue_size=100,
    # `workers` is defined outside this excerpt.
    workers=workers,
    # Toggle between the active line and the commented line below to switch
    # multiprocessing on or off for the data generators.
    use_multiprocessing=True,
    # use_multiprocessing=False,
)
大家如果只是想要把多執行緒改為單執行緒,就要把use_multiprocessing設為False,同時讓workers=1,因為在單執行緒的情況下讓workers大於1會報錯!
有什麼其他問題歡迎大家在下方留言私信我!