主要分几个程序:
1、config.py : 保存了整个项目的大部分参数;
2、calculate_IOU.py : 计算预选框和真值框的IOU值,用于筛选正负样本;以及定义了对坐标进行encode和decode的函数;
3、nms.py : 定义了非极大值抑制函数;
4、random_crop.py : 定义了一个Cropper类,通过随机裁剪和随机翻转进行数据增强;
5、read_data.py : 定义了一个Reader类,用于读取VOC2012数据集;
6、anchors.py : 对不同特征层生成相应大小和数目的default box;
7、label_anchors.py : 将不同的default box与真值框(true boxes)进行匹配;
8、network.py : 定义了一个Net类,并定义了SSD网络结构,用于训练并保存模型;
9、loss_function.py : 定义了损失函数,其中包含对正样本和负样本1:3比例的取样;
10、SSD_API.py : 定义了SSD_detector类,用于加载模型并输入图片进行目标检测;
1、config.py
保存了这个项目的参数,先上代码:
# config.py
import numpy as np
import os
# ---------------------------------------------------------------------------
# Project-wide settings. Other modules read them via `import config as cfg`.
# ---------------------------------------------------------------------------

NMS_THRESHOLD = 0.3  # threshold used by non-maximum suppression
DATA_PATH = '../VOC2012'  # dataset root
# directory holding the image coordinate and class information
ImageSets_PATH = os.path.join(DATA_PATH, 'ImageSets')

# names of the feature layers that are tapped for detection
BLOCKS = [
    'block4', 'block7', 'block8',
    'block9', 'block10', 'block11', 'block12',
]

MAX_SIZE = 1000  # maximum image side length
MIN_SIZE = 600  # minimum image side length
EPOCHES = 2000  # number of training epochs
BATCHES = 64  # number of batches iterated per epoch
THRESHOLD = 0.5  # matching threshold separating positive and negative samples
SCORE_THRESHOLD = 0.997  # minimum score for a positive detection at test time
MIN_CROP_RATIO = 0.6  # smallest ratio used by random cropping
MAX_CROP_RATIO = 1.0  # largest ratio used by random cropping
MODEL_PATH = './model/'  # where model checkpoints are saved
LEARNING_RATE = 2e-4  # learning rate

# object classes; index 0 (the empty string) is the background class
CLASSES = [
    '', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus',
    'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
    'motorbike', 'person', 'pottedplant', 'sheep', 'sofa',
    'train', 'tvmonitor',
]

# per-channel pixel means of the images
PIXEL_MEANS = np.array([[[122.7717, 115.9465, 102.9801]]])

# aspect ratios of the extra default boxes, one entry per feature layer
RATIOS = [
    [2, .5],
    [2, .5, 3, 1./3],
    [2, .5, 3, 1./3],
    [2, .5, 3, 1./3],
    [2, .5, 3, 1./3],
    [2, .5],
    [2, .5],
]

# stride of each feature layer
STRIDES = [8, 16, 32, 64, 128, 256, 512]

# the "s" of the SSD paper, taken here as the relative side length of the
# default boxes of each layer
S = [0.04, 0.1, 0.26, 0.42, 0.58, 0.74, 0.9, 1.06]

# side length of the default boxes of each layer; the second element of a
# pair is the side length used by the next layer (numerically S[k] * 512)
Sk = [
    (20.48, 51.2),
    (51.2, 133.12),
    (133.12, 215.04),
    (215.04, 296.96),
    (296.96, 378.88),
    (378.88, 460.8),
    (460.8, 542.72),
]

# scales the box-regression values to adjust their weight in the loss
PRIOT_SCALING = (0.1, 0.1, 0.2, 0.2)
参数都有备注,就不多说啦,挑几个比较重要的吧:
1、BLOCKS: BLOCKS保存了我们需要提取的特征层的名称(共七个),其中第一个特征层'block4'是VGG的一个中间层,其余六个特征层是SSD在VGG层之后额外添加的,每层的步长见'STRIDES'参数;
2、RATIOS: RATIOS保存了七个层default box的几个长宽比,比如第一层有[2, 0.5]两个长宽比,代表第一个特征层每个特征点有长宽比分别为2, 0.5的额外两个default box;
3、Sk: Sk保存了每个特征层的default box的边长,注意这里的边长大小跟原论文不太一样;
然后config.py中的参数通过 import config as cfg 引用,参量用cfg.参数名即可。
2、calculate_IOU.py
这里定义了计算预选框和真值框的IOU值的函数,用于筛选正负样本;以及定义了对坐标进行encode和decode的函数;
先上代码:
# calculate_IOU.py
import numpy as np
import config as cfg
def encode_targets(true_box, anchors, prior_scaling=cfg.PRIOT_SCALING):
    """Encode ground-truth boxes as regression targets relative to anchors.

    Both inputs are indexed as 2-D arrays whose columns are read as
    (y_min, x_min, y_max, x_max); row i of `true_box` is paired with row i
    of `anchors`.

    Args:
        true_box: ground-truth boxes, one row per anchor.
        anchors: anchor boxes.
        prior_scaling: four divisors applied to (dy, dx, dh, dw).

    Returns:
        Array of shape (-1, 4) holding (dy, dx, dh, dw) divided by
        `prior_scaling`.
    """
    def _centre_and_size(boxes):
        # Split corner-format boxes into centre coordinates and extents.
        y_lo, x_lo, y_hi, x_hi = (boxes[:, col] for col in range(4))
        return (y_hi + y_lo) / 2, (x_hi + x_lo) / 2, y_hi - y_lo, x_hi - x_lo

    a_cy, a_cx, a_h, a_w = _centre_and_size(anchors)
    t_cy, t_cx, t_h, t_w = _centre_and_size(true_box)

    offsets = np.stack(
        [
            (t_cy - a_cy) / a_h,   # centre shift, in units of anchor height
            (t_cx - a_cx) / a_w,   # centre shift, in units of anchor width
            np.log(t_h / a_h),     # log height ratio
            np.log(t_w / a_w),     # log width ratio
        ],
        axis=1,
    )
    return np.reshape(offsets, (-1, 4)) / prior_scaling
def decode_targets(anchors, targets, image_shape, prior_scaling=cfg.PRIOT_SCALING):
    """Turn regression targets back into boxes clipped to the image.

    Args:
        anchors: 2-D array whose columns are read as
            (y_min, x_min, y_max, x_max).
        targets: 2-D array whose columns are read as (dy, dx, dh, dw),
            already divided by `prior_scaling`.
        image_shape: sequence whose first two entries are (height, width).
        prior_scaling: four factors multiplied back onto `targets`.

    Returns:
        Array with one (y_min, x_min, y_max, x_max) row per anchor, with y
        clipped to [0, height] and x clipped to [0, width].
    """
    img_h, img_w = image_shape[:2]

    a_y0, a_x0, a_y1, a_x1 = (anchors[:, col] for col in range(4))
    a_h = a_y1 - a_y0
    a_w = a_x1 - a_x0
    a_cy = (a_y1 + a_y0) / 2
    a_cx = (a_x1 + a_x0) / 2

    # Undo the scaling applied when the targets were encoded.
    deltas = targets * prior_scaling

    box_cy = deltas[:, 0] * a_h + a_cy
    box_cx = deltas[:, 1] * a_w + a_cx
    box_h = a_h * np.exp(deltas[:, 2])
    box_w = a_w * np.exp(deltas[:, 3])

    return np.stack(
        [
            np.clip(box_cy - box_h / 2, 0, img_h),
            np.clip(box_cx - box_w / 2, 0, img_w),
            np.clip(box_cy + box_h / 2, 0, img_h),
            np.clip(box_cx + box_w / 2, 0, img_w),
        ],
        axis=1,
    )
def fast_bbox_overlaps(holdon_anchor, true_boxes):
    """Compute the pairwise IoU between anchors and ground-truth boxes.

    Every box is a row of (y_min, x_min, y_max, x_max).

    Args:
        holdon_anchor: array of shape (n, 4) with the candidate anchors.
        true_boxes: array of shape (m, 4) with the ground-truth boxes.

    Returns:
        float32 array of shape (n, m). Entry [i, j] is the IoU of anchor i
        with ground-truth box j, and 0 where the two boxes do not share a
        region of positive height and width.
    """
    holdon_anchor = np.asarray(holdon_anchor)
    true_boxes = np.asarray(true_boxes)

    # Ground-truth coordinates are kept as (m, 1) columns and anchor
    # coordinates as (n,) rows, so every binary operation below broadcasts
    # to an (m, n) grid.
    true_y_min = true_boxes[:, 0:1]
    true_x_min = true_boxes[:, 1:2]
    true_y_max = true_boxes[:, 2:3]
    true_x_max = true_boxes[:, 3:4]

    anchor_y_min = holdon_anchor[:, 0]
    anchor_x_min = holdon_anchor[:, 1]
    anchor_y_max = holdon_anchor[:, 2]
    anchor_x_max = holdon_anchor[:, 3]

    true_area = (true_y_max - true_y_min) * (true_x_max - true_x_min)
    anchor_area = (anchor_y_max - anchor_y_min) * (anchor_x_max - anchor_x_min)

    # Extent of the intersection along each axis; negative when disjoint.
    inter_h = (np.minimum(true_y_max, anchor_y_max)
               - np.maximum(true_y_min, anchor_y_min))
    inter_w = (np.minimum(true_x_max, anchor_x_max)
               - np.maximum(true_x_min, anchor_x_min))

    # Both extents must be positive: two negative extents would otherwise
    # multiply into a bogus positive "area".
    overlaps = np.logical_and(inter_h > 0, inter_w > 0)

    inter_area = inter_h * inter_w
    union_area = true_area + anchor_area - inter_area

    # Divide only where the boxes really overlap. There the union is at
    # least as large as either box, hence strictly positive, so the division
    # is always well defined; everywhere else the IoU stays 0.
    iou = np.divide(
        inter_area,
        union_area,
        out=np.zeros(inter_area.shape, dtype=np.float64),
        where=overlaps,
    )
    return np.transpose(iou).astype(np.float32)  # (m, n) -> (n, m)
if __name__ == "__main__":
    # This module only provides helpers; nothing runs when it is executed
    # directly.
    pass
3、nms.py
非极大值抑制(Non-Maximum Suppression,NMS),功能是去除冗余的检测框,保留最好的一个。
如果不进行NMS,效果是这样的:
上代码(注意:下面贴出的实际上是 SSD_API.py 的代码,它从 nms.py 中导入并调用了 py_cpu_nms;py_cpu_nms 本身的定义并没有出现在这段代码里):
import tensorflow as tf
from network import Net
import config as cfg
import cv2
import numpy as np
from label_anchors import decode_targets
import matplotlib.pyplot as plt
from nms import py_cpu_nms
class SSD_detector(object):
    """Load a trained SSD model and run object detection on image files.

    The network is built once in inference mode. `test_ssd` restores the
    latest checkpoint found under `model_path`, runs every image through the
    network and shows the detections before and after non-maximum
    suppression.
    """

    def __init__(self):
        self.net = Net(is_training=False)
        self.model_path = cfg.MODEL_PATH
        self.pixel_means = cfg.PIXEL_MEANS
        self.min_size = cfg.MIN_SIZE
        self.pred_loc, self.pred_cls = self.net.output
        self.score_threshold = cfg.SCORE_THRESHOLD

    def pre_process(self, image_path):
        """Read an image from disk and rescale it for the network.

        Args:
            image_path: path of the image file.

        Returns:
            dict with the resized float image ('image'), the resize factor
            ('scale') and the original path ('image_path').

        Raises:
            IOError: if OpenCV cannot read the file.
        """
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals failure by returning None, not by raising.
            raise IOError('Cannot read image: {}'.format(image_path))
        # np.float was only an alias of the builtin float (float64) and has
        # been removed from NumPy; name the dtype explicitly.
        image = image.astype(np.float64)
        image, scale = self.resize_image(image)
        return {'image': image, 'scale': scale, 'image_path': image_path}

    def resize_image(self, image):
        """Rescale `image` so that its shorter side equals `self.min_size`.

        Returns:
            (resized_image, scale), where `scale` is the factor applied to
            both axes.
        """
        size_min = np.min(image.shape[:2])
        scale = float(self.min_size) / float(size_min)
        image = cv2.resize(image, dsize=(0, 0), fx=scale, fy=scale)
        return image, scale

    def test_ssd(self, image_paths):
        """Run detection on one image path or a list of paths.

        For every image two figures are shown: all boxes above the score
        threshold, then the boxes kept by non-maximum suppression.
        """
        if isinstance(image_paths, str):
            image_paths = [image_paths]

        # tf.compat.v1.Session matches the tf.compat.v1 initializer below.
        with tf.compat.v1.Session() as sess:
            sess.run(tf.compat.v1.global_variables_initializer())

            ckpt = tf.train.get_checkpoint_state(self.model_path)
            if ckpt and ckpt.model_checkpoint_path:
                # Restore the trained weights from the latest checkpoint.
                self.net.saver.restore(sess, ckpt.model_checkpoint_path)
                print('Model Reload Successfully!')
            else:
                print('Warning: no checkpoint found in {}; running with '
                      'freshly initialised weights.'.format(self.model_path))

            for path in image_paths:
                value = self.pre_process(path)
                image = value['image'] - self.pixel_means
                feed_dict = {self.net.x: image}
                pred_loc, pred_cls, layer_anchors = sess.run(
                    [self.pred_loc, self.pred_cls, self.net.anchors],
                    feed_dict
                )

                pos_loc, pos_cls, pos_anchors, pos_scores = self.decode_output(
                    pred_loc, pred_cls, layer_anchors)
                pos_boxes = decode_targets(pos_anchors, pos_loc, image.shape)
                pos_scores = np.expand_dims(pos_scores, axis=-1)

                # Detections before non-maximum suppression.
                self.draw_result(
                    value['image'], pos_boxes, pos_cls, value['scale']
                )

                # Detections after non-maximum suppression.
                keep_index = py_cpu_nms(np.hstack([pos_boxes, pos_scores]))
                self.draw_result(
                    value['image'], pos_boxes[keep_index],
                    pos_cls[keep_index], value['scale']
                )

    def draw_result(self, image, pos_boxes, pos_cls, scale, font=cv2.FONT_HERSHEY_SIMPLEX):
        """Draw labelled boxes on `image` and display it with matplotlib.

        Args:
            image: the resized BGR image produced by `pre_process`.
            pos_boxes: (k, 4) boxes as (y_min, x_min, y_max, x_max) in the
                coordinates of the resized image.
            pos_cls: k indices into `cfg.CLASSES`.
            scale: factor that was applied by `resize_image`; it is undone
                here so the picture is shown at its original size.
            font: OpenCV font used for the labels.
        """
        image = cv2.resize(image, dsize=(0, 0), fx=1/scale, fy=1/scale)
        # uint8 is a pixel type both the OpenCV drawing calls and
        # matplotlib accept; the removed np.int alias mapped to a
        # platform-dependent integer type.
        image = np.clip(image, 0, 255).astype(np.uint8)
        pos_boxes = pos_boxes * (1/scale)

        for bbox, cls_index in zip(pos_boxes, pos_cls):
            label = cfg.CLASSES[cls_index]
            # OpenCV expects plain integer pixel coordinates.
            y_min, x_min, y_max, x_max = (int(v) for v in bbox)
            cv2.rectangle(image, (x_min, y_min),
                          (x_max, y_max), (0, 0, 255), thickness=2)
            cv2.putText(image, label, (x_min+20, y_min+20),
                        font, 1, (255, 0, 0), thickness=2)

        # Reverse the channel order (BGR -> RGB) for matplotlib.
        plt.imshow(image[:, :, [2, 1, 0]])
        plt.show()

    def decode_output(self, pred_loc, pred_cls, layer_anchors):
        """Keep the predictions whose best non-background score passes the threshold.

        Args:
            pred_loc: per-layer arrays of box regression values.
            pred_cls: per-layer arrays of class scores; column 0 is the
                background class.
            layer_anchors: per-layer anchor arrays, reshaped here to (-1, 4).

        Returns:
            (pos_loc, pos_cls, pos_anchors, pos_scores) concatenated over
            all layers: regression values, class indices, anchors and scores
            of the retained predictions.
        """
        pos_loc, pos_cls, pos_anchors, pos_scores = [], [], [], []
        for loc_, cls_, anchors in zip(pred_loc, pred_cls, layer_anchors):
            anchors = anchors.reshape((-1, 4))
            foreground = cls_[:, 1:]  # drop the background column
            max_scores = np.max(foreground, axis=-1)
            # Take the label from the same non-background columns as the
            # score (+1 restores the CLASSES index), so a kept detection can
            # never be labelled as background.
            labels = np.argmax(foreground, axis=-1) + 1
            pos_index = np.where(max_scores > self.score_threshold)[0]

            pos_loc.append(loc_[pos_index])
            pos_cls.append(labels[pos_index])
            pos_anchors.append(anchors[pos_index])
            pos_scores.append(max_scores[pos_index])

        pos_loc = np.vstack(pos_loc)
        pos_cls = np.hstack(pos_cls)
        pos_anchors = np.vstack(pos_anchors)
        pos_scores = np.hstack(pos_scores)
        return pos_loc, pos_cls, pos_anchors, pos_scores
if __name__ == "__main__":
    # Demo run: build the detector and show the detections for a sample
    # image located next to this script.
    detector = SSD_detector()
    detector.test_ssd('./1.jpg')
|
|