YOLO v2 usage notes

Date: 2024-01-05 18:43:38

Everything below is based on YOLO v2. The current code base is v3; you can clone it and then git checkout back to a v2-era version.
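For example (a rough sketch; <v2-commit> is a placeholder for whatever v2-era commit you pick out of the log):

git clone https://github.com/AlexeyAB/darknet
cd darknet
git log --oneline
git checkout <v2-commit>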

After playing with YOLO for three or four months, I found its numbers rather unstable; YOLO is only good for small-scale experiments.

Weights trained with v2 give different results when used for prediction with v3.

My environment is Windows 10 + CUDA 9.0 + OpenCV 3.4.0 + VS2015.

First download the source from: https://github.com/AlexeyAB/darknet

After downloading, open darknet.vcxproj in a text editor and change the two occurrences of cuda9.1 to cuda9.0.

You also need to copy the two OpenCV DLLs into the x64 folder.

Open darknet.sln with VS2015 and build it; you will get darknet.exe under darknet-master\build\darknet\x64.

At this point you can already use a pretrained model to predict the classes it was trained on.
Of course, download the corresponding weights file first.
darknet.exe detector test cfg/combine9k.data yolo9000.cfg yolo9000.weights data/dog.jpg

First, prepare the training images!

Be sure to rename the files first! Otherwise adding, changing, or removing samples later is painful.

Then annotate them with the Windows build of labelImg,
modify the relevant configuration files, and start training.

Before training, download a darknet19_448.conv.23 file.

The training command is as follows:
darknet.exe detector train cfg/voc.data cfg/yolo-voc.cfg cfg/darknet19_448.conv.23
Prepare the .data and .cfg files, as well as the training dataset.

If training complains that it cannot find the txt files, just copy the txt files into the image folder.

At the end you get your own weights file, which can be used to predict your own classes.

2018-08-24 08:58:19

To summarize a fairly complete workflow, there are roughly the following steps:

1. Continuous image collection, and continuous renaming of the newly collected images.

The collected images may be named by seconds, by date, or by some other convention.

Later, during annotation and when testing after training, if some sample images turn out to be poor and need to be removed or skipped, names that are hard to look up make that a pain.

2. Annotation technique. The number of xml files actually annotated ends up smaller than the number of collected images, because some images have a poor angle or poor lighting.

3. Offline data augmentation.

YOLO has some built-in online augmentation, but without reading the code in depth it is hard to modify, and the provided augmentations are limited. So I wrote a simple offline augmentation.

4. Preprocessing the training data, editing the cfg file, and so on.

5. Training and testing.

6. Calling the model from OpenCV, in both C++ and Python, plus batch cropping of detections.

=====================================================

1. Renaming: rename.py

import os

def getFilenames(filepath):
    '''
    Return all file names in a folder, without extensions, ignoring subfolders.
    '''
    filenames = []
    for file in os.listdir(filepath):
        pt = os.path.join(filepath, file)
        if os.path.isfile(pt):
            filename = os.path.splitext(file)[0]
            filenames.append(filename)
    return filenames

filepath = "origin_img"
filenames = getFilenames(filepath)
print(filenames)

# Only rename images newly added to the folder (incremental renaming),
# so first find the largest number already used as a file name.
def get_max_num(filenames):
    max_num = 0
    for name in filenames:
        if name.isdigit() and int(name) < 10000:
            if int(name) > max_num:
                max_num = int(name)
    return max_num

print("max num:", get_max_num(filenames))
renameCount = get_max_num(filenames) + 1
#renameCount = 1

for file in os.listdir(filepath):
    if os.path.isfile(os.path.join(filepath, file)):      # regular files only
        filename = os.path.splitext(file)[0]              # name without extension
        if not filename.isdigit():                        # not numeric yet: rename it
            print("rename count 1:", renameCount)
            os.rename(os.path.join(filepath, file),
                      os.path.join(filepath, str('%03d' % renameCount) + ".jpeg"))
            renameCount += 1
        if filename.isdigit() and int(filename) > 10000:  # e.g. named by a timestamp
            print("rename count 2:", renameCount)
            os.rename(os.path.join(filepath, file),
                      os.path.join(filepath, str('%03d' % renameCount) + ".jpeg"))
            renameCount += 1

2. Annotation technique

With labelImg, think in terms of features: frame the most distinctive features of the target object. Images that differ too much from the conditions the program will actually run under should not be annotated at all. If you do annotate samples whose features are not very distinct, the sample count has to go up to compensate.

Fewer xml files get annotated than there are images, so the annotated images need to be pulled out into their own folder:

import os
import shutil

def getFilenames(filepath):
    '''Return all file names in a folder, without extensions.
    '''
    filelist = os.listdir(filepath)
    filenames = []
    for files in filelist:
        filename = os.path.splitext(files)[0]
        # print(files)
        # print(filename)
        filenames.append(filename)
    return filenames

xmlpath = 'xml'              # folder holding the finished annotation xml files
imgpath = 'img'              # folder used during annotation; holds more images than xml files
img_write_path = 'img_less'  # the annotated images get copied here

filenames = getFilenames(xmlpath)
for i in range(len(filenames)):
    filename = filenames[i]
    # print(filename)
    jpgpath = imgpath + "/" + str(filename) + ".jpeg"
    # print(jpgpath)
    jpg_wrt_path = img_write_path + "/" + str(filename) + ".jpg"  # note: reads .jpeg, writes .jpg
    shutil.copy(jpgpath, jpg_wrt_path)

3. Offline data augmentation

Some augmentations require editing the xml, such as horizontal flips, rotations, and crops; some do not, such as perturbing color or lighting.

I originally wrote both horizontal flip and random crop, but since YOLO resizes images to 416 anyway, random cropping turned out to be pretty useless.

When augmenting, be careful not to create a pile of artificial features for YOLO to learn; augment sensibly.

Only the horizontal flip is shown here.

util.py

import numpy as np
import cv2
import matplotlib.pyplot as plt
import random
import os

def showimg(img):
    channelNum = len(img.shape)
    if channelNum == 3:
        fig = plt.subplots(1), plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    if channelNum == 2:
        fig = plt.subplots(1), plt.imshow(img)

def scaleimg(img, scale=1.0):
    H, W, C = img.shape
    size = (int(scale * W), int(scale * H))
    img = cv2.resize(img, size, interpolation=cv2.INTER_AREA)
    del H, W, C, size, scale
    return img.copy()

# img = rotateimg(image, angle)
def rotateimg(image, angle, center=None, scale=1.0):
    # get the image size
    (h, w) = image.shape[:2]
    # if no rotation center is given, use the image center
    if center is None:
        center = (w / 2, h / 2)
    # perform the rotation; a positive angle rotates counter-clockwise,
    # and angle is in degrees, not radians
    M = cv2.getRotationMatrix2D(center, angle, scale)
    rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC)
    # rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return rotated

'''
1. Read an xml file and return its contents.
   Input:  CLASS_NAMES tuple, xml path
   Output: (boxes, H, W); boxes is a 2-D np array with 6 columns:
           id classid xmin xmax ymin ymax
           0  1       2    3    4    5
2. Write boxes / CLASS_NAMES / H / W into an xml file.
   Input:  boxes, CLASS_NAMES, H, W, xml path
   Output: an xml file on disk
3. Draw an image with its boxes, given img, boxes and class_names.
'''
import xml.etree.ElementTree as ET
import numpy as np

#CLASS_NAMES = ('person', 'dog')  # ids start at 0; order is optional here, but best kept consistent

# boxes columns: id classid xmin xmax ymin ymax
#                0  1       2    3    4    5
def xml2boxes(xmlpath, CLASS_NAMES):
    print("xmlpath:", xmlpath)
    cls_to_idx = dict(zip(CLASS_NAMES, range(len(CLASS_NAMES))))
    idx_to_cls = dict(zip(range(len(CLASS_NAMES)), CLASS_NAMES))
    # print(cls_to_idx)
    # print(idx_to_cls)
    annotations = ET.parse(xmlpath)
    # read H, W, C
    size = annotations.find('size')
    W = int(size.find('width').text)
    H = int(size.find('height').text)
    C = int(size.find('depth').text)
    # read the class and coordinates of every object
    bbox = list()
    count = 1
    for obj in annotations.iter('object'):
        line = []
        bndbox_anno = obj.find('bndbox')
        # xmin etc. are 1-based
        tmp = map(int, [bndbox_anno.find('xmin').text,
                        bndbox_anno.find('xmax').text,
                        bndbox_anno.find('ymin').text,
                        bndbox_anno.find('ymax').text])
        tmp = list(tmp)  # 1 x 4
        name = obj.find('name').text.lower().strip()
        line.append(count)
        line.append(cls_to_idx[name])
        line.append(tmp[0])
        line.append(tmp[1])
        line.append(tmp[2])
        line.append(tmp[3])
        count = count + 1
        # print(line)
        bbox.append(line)
    boxes = np.stack(bbox).astype(np.int32)
    return boxes, H, W

#boxes, H, W = xml2boxes("1.xml", CLASS_NAMES)
#print("boxes:\n", boxes)

# With a single class, CLASS_NAMES still needs an extra dummy string,
# e.g. CLASS_NAMES = ("apple", "xxxx"). This is a bug, not fixed yet.
from lxml.etree import Element, SubElement, tostring
from xml.dom.minidom import parseString

######################################################
# boxes2xml_labelImg(boxes, CLASS_NAMES, H, W, xmlpath, wrtin_img_folder_name, imgName, img_fullpath)
def boxes2xml_labelImg(boxes, CLASS_NAMES, H, W, xmlpath, wrtin_img_folder_name,
                       imgName, img_fullpath):
    '''
    Writes an xml that labelImg can open. Here CLASS_NAMES must be ordered:
    its indices have to match the class ids in the second column of boxes.
    '''
    cls_to_idx = dict(zip(CLASS_NAMES, range(len(CLASS_NAMES))))
    idx_to_cls = dict(zip(range(len(CLASS_NAMES)), CLASS_NAMES))

    node_annotation = Element('annotation')
    #################################################
    node_folder = SubElement(node_annotation, 'folder')
    node_filename = SubElement(node_annotation, 'filename')
    node_path = SubElement(node_annotation, 'path')
    node_source = SubElement(node_annotation, 'source')
    node_database = SubElement(node_source, 'database')
    node_folder.text = wrtin_img_folder_name  # fixed: assigned once, never changes
    node_filename.text = imgName              # image file name without extension
    node_path.text = img_fullpath             # changes with every file
    node_database.text = "Unknown"
    node_size = SubElement(node_annotation, 'size')
    #################################################
    # node_size
    node_width = SubElement(node_size, 'width')
    node_height = SubElement(node_size, 'height')
    node_depth = SubElement(node_size, 'depth')
    node_width.text = str(W)
    node_height.text = str(H)
    node_depth.text = str(3)  # assume color images
    #################################################
    node_segmented = SubElement(node_annotation, 'segmented')
    node_segmented.text = ""
    #################################################
    # one node_object per box; loop over the boxes
    for i in range(boxes.shape[0]):
        node_object = SubElement(node_annotation, 'object')
        classid = boxes[i, 1]
        # print(idx_to_cls[classid])
        node_name = SubElement(node_object, 'name')
        node_name.text = idx_to_cls[classid]
        node_pose = SubElement(node_object, 'pose')
        node_truncated = SubElement(node_object, 'truncated')
        node_Difficult = SubElement(node_object, 'Difficult')
        node_pose.text = "Unspecified"
        node_truncated.text = ""
        node_Difficult.text = ""
        node_bndbox = SubElement(node_object, 'bndbox')
        node_xmin = SubElement(node_bndbox, 'xmin')
        node_ymin = SubElement(node_bndbox, 'ymin')
        node_xmax = SubElement(node_bndbox, 'xmax')
        node_ymax = SubElement(node_bndbox, 'ymax')
        node_xmin.text = str(boxes[i, 2])
        node_xmax.text = str(boxes[i, 3])
        node_ymin.text = str(boxes[i, 4])
        node_ymax.text = str(boxes[i, 5])
    ###################
    xml = tostring(node_annotation, pretty_print=True)  # pretty-print with line breaks
    dom = parseString(xml)
    test_string = xml.decode('utf-8')
    #print('test:\n', test_string)
    with open(xmlpath, "w") as text_file:
        text_file.write(test_string)

######################################################
def drawboxes(imgpath, boxes, CLASS_NAMES):
    import matplotlib.pyplot as plt
    import matplotlib.patches as patches
    import cv2
    cls_to_idx = dict(zip(CLASS_NAMES, range(len(CLASS_NAMES))))
    idx_to_cls = dict(zip(range(len(CLASS_NAMES)), CLASS_NAMES))
    if isinstance(imgpath, str):
        img = cv2.imread(imgpath)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    if isinstance(imgpath, np.ndarray):
        img = imgpath
    fig, ax = plt.subplots(1)
    for i in range(boxes.shape[0]):
        bndbox = list(boxes[i, :])
        x = bndbox[2]
        y = bndbox[4]
        w = bndbox[3] - bndbox[2]
        h = bndbox[5] - bndbox[4]
        rect = patches.Rectangle((x, y), w, h, linewidth=1, edgecolor='yellow', facecolor='none')
        ax.add_patch(rect)
        name = idx_to_cls[boxes[i, 1]]
        ax.text(x - 5, y - 5, name, style='italic', color='yellow', fontsize=12)
    ax.imshow(img)

#drawboxes("1.jpg", boxes, CLASS_NAMES)

##################################
def getFilenames(filepath):
    '''Return all file names in a folder, without extensions.
    '''
    filelist = os.listdir(filepath)
    filenames = []
    for files in filelist:
        filename = os.path.splitext(files)[0]
        # print(files)
        # print(filename)
        filenames.append(filename)
    return filenames

def fliplr_boxes(boxes, W):
    '''Flip boxes horizontally.'''
    boxes_copy = boxes.copy()
    xmin = boxes[:, 2].copy()
    xmax = boxes[:, 3].copy()
    boxes_copy[:, 3] = W - 1 - xmin  # note: column 3 gets W-1-xmin and column 2 gets W-1-xmax,
    boxes_copy[:, 2] = W - 1 - xmax  # not the other way around, otherwise xmin would exceed xmax
    return boxes_copy
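
A quick sanity check of the flip arithmetic (numbers are made up for illustration): with W = 100, a box spanning xmin = 10, xmax = 50 maps to xmin = 100 - 1 - 50 = 49 and xmax = 100 - 1 - 10 = 89.

import numpy as np
from util import fliplr_boxes  # assumes util.py above is importable

# one box, columns: id classid xmin xmax ymin ymax
boxes = np.array([[1, 0, 10, 50, 20, 60]], dtype=np.int32)
print(fliplr_boxes(boxes, 100))
# [[ 1  0 49 89 20 60]]  -- x columns mirrored, y columns untouched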

main.py

import os
import cv2
import numpy as np
import random
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from util import *

img_read_path = "img_less"
xml_read_path = "xml"
img_write_path = "fliped_img"  # flipped images are written here
xml_write_path = "fliped_xml"  # flipped xml files are written here

filenames = getFilenames(xml_read_path)
CLASS_NAMES = ('person', 'aa')  # known bug, too lazy to fix: a dummy second name is needed even with one class
count = 201                     # starting index for the flipped files
wrtin_img_folder_name = "fliped_img"

for i in range(len(filenames)):
    name = filenames[i]
    imgname = img_read_path + "/" + str(name) + ".jpg"
    img = cv2.imread(imgname)
    xmlname = xml_read_path + "/" + str(name) + ".xml"
    boxes, H, W = xml2boxes(xmlname, CLASS_NAMES)
    # print("xmlname:", xmlname)
    H, W, C = img.shape
    ##############################
    fliped_boxes = fliplr_boxes(boxes, W)
    fliped_img = cv2.flip(img, 1)
    ##############################
    FileName = str(count)
    jpgpath = img_write_path + "/" + FileName + ".jpg"
    cv2.imwrite(jpgpath, fliped_img)
    xmlpath = xml_write_path + "/" + FileName + ".xml"
    boxes2xml_labelImg(fliped_boxes, CLASS_NAMES, H, W, xmlpath,
                       wrtin_img_folder_name, FileName, jpgpath)
    count = count + 1

4. Preprocessing the training data and editing the cfg file

The following two scripts must be run; I copied them from elsewhere.

trans1.py

import os
import shutil

savepath = os.getcwd()
img_path = savepath + "/img_less"  # folder holding the training images
xml_path = savepath + "/xml"       # X images were annotated in total

val_num = 10  # size of the validation set; adjust as needed

# the 4 directories below get created
validateImage_path = savepath + "/validateImage"
trainImage_path = savepath + "/trainImage"
if os.path.exists(validateImage_path) == False:
    os.mkdir(validateImage_path)
if os.path.exists(trainImage_path) == False:
    os.mkdir(trainImage_path)

validateImageXML_path = savepath + "/validateImageXML"
trainImageXML_path = savepath + "/trainImageXML"
if os.path.exists(validateImageXML_path) == False:
    os.mkdir(validateImageXML_path)
if os.path.exists(trainImageXML_path) == False:
    os.mkdir(trainImageXML_path)

#=================================================
filelist = os.listdir(xml_path)  # the xml folder determines the sample count
count = 0
for files in filelist:
    filename = os.path.splitext(files)[0]  # file name
    origin_jpg_name = os.path.join(img_path, filename + '.jpg')
    validateImage_jpg_name = os.path.join(validateImage_path, filename + '.jpg')
    trainImage_jpg_name = os.path.join(trainImage_path, filename + '.jpg')
    # print(validateImage_jpg_name)
    if count < val_num:
        shutil.copy(origin_jpg_name, validateImage_jpg_name)  # copy validate image
        xml_olddir = os.path.join(xml_path, filename + ".xml")
        xml_newdir = os.path.join(validateImageXML_path, filename + ".xml")
        shutil.copyfile(xml_olddir, xml_newdir)               # copy validate xml
    else:
        shutil.copy(origin_jpg_name, trainImage_jpg_name)
        xml_olddir = os.path.join(xml_path, filename + ".xml")
        xml_newdir = os.path.join(trainImageXML_path, filename + ".xml")
        shutil.copyfile(xml_olddir, xml_newdir)
    count = count + 1

validate_txtpath = savepath + "/validateImageId.txt"
train_txtpath = savepath + "/trainImageId.txt"

def listname(path, idtxtpath):
    filelist = os.listdir(path)  # everything in the folder, including subfolders
    f = open(idtxtpath, 'w')
    for files in filelist:       # iterate over all files
        Olddir = os.path.join(path, files)  # original path
        if os.path.isdir(Olddir):           # skip folders
            continue
        filename = os.path.splitext(files)[0]  # file name
        f.write(filename)
        f.write('\n')
    f.close()

listname(validateImage_path, validate_txtpath)
listname(trainImage_path, train_txtpath)

trans2.py

import xml.etree.ElementTree as ET
import pickle
import string
import os
import shutil
from os import listdir, getcwd
from os.path import join

sets = [('', 'train')]
classes = ["person"]

def convert(size, box):
    dw = 1. / size[0]
    dh = 1. / size[1]
    x = (box[0] + box[1]) / 2.0
    y = (box[2] + box[3]) / 2.0
    w = box[1] - box[0]
    h = box[3] - box[2]
    x = x * dw
    w = w * dw
    y = y * dh
    h = h * dh
    return (x, y, w, h)

def convert_annotation(image_id, flag, savepath):
    if flag == 0:
        in_file = open(savepath + '/trainImageXML/%s.xml' % (image_id))
        labeltxt = savepath + '/trainImageLabelTxt'
        if os.path.exists(labeltxt) == False:
            os.mkdir(labeltxt)
        out_file = open(savepath + '/trainImageLabelTxt/%s.txt' % (image_id), 'w')
        tree = ET.parse(in_file)
        root = tree.getroot()
        size = root.find('size')
        w = int(size.find('width').text)
        h = int(size.find('height').text)
    elif flag == 1:
        in_file = open(savepath + '/validateImageXML/%s.xml' % (image_id))
        labeltxt = savepath + '/validateImageLabelTxt'
        if os.path.exists(labeltxt) == False:
            os.mkdir(labeltxt)
        out_file = open(savepath + '/validateImageLabelTxt/%s.txt' % (image_id), 'w')
        tree = ET.parse(in_file)
        root = tree.getroot()
        size = root.find('size')
        w = int(size.find('width').text)
        h = int(size.find('height').text)

    for obj in root.iter('object'):
        # difficult = obj.find('difficult').text
        cls = obj.find('name').text
        # if cls not in classes or int(difficult) == 1:
        #     continue
        cls_id = classes.index(cls)
        xmlbox = obj.find('bndbox')
        b = (float(xmlbox.find('xmin').text), float(xmlbox.find('xmax').text),
             float(xmlbox.find('ymin').text), float(xmlbox.find('ymax').text))
        bb = convert((w, h), b)
        out_file.write(str(cls_id) + " " + " ".join([str(a) for a in bb]) + '\n')

wd = getcwd()

for year, image_set in sets:
    savepath = os.getcwd()
    idtxt = savepath + "/validateImageId.txt"
    pathtxt = savepath + "/validateImagePath.txt"
    image_ids = open(idtxt).read().strip().split()
    list_file = open(pathtxt, 'w')
    s = '\xef\xbb\xbf'  # UTF-8 BOM bytes; strip them if an id starts with one
    for image_id in image_ids:
        nPos = image_id.find(s)
        if nPos >= 0:
            image_id = image_id[3:]
        list_file.write('%s/validateImage/%s.jpg\n' % (wd, image_id))
        print(image_id)
        convert_annotation(image_id, 1, savepath)
    list_file.close()

    idtxt = savepath + "/trainImageId.txt"
    pathtxt = savepath + "/trainImagePath.txt"
    image_ids = open(idtxt).read().strip().split()
    list_file = open(pathtxt, 'w')
    s = '\xef\xbb\xbf'
    for image_id in image_ids:
        nPos = image_id.find(s)
        if nPos >= 0:
            image_id = image_id[3:]
        list_file.write('%s/trainImage/%s.jpg\n' % (wd, image_id))
        print(image_id)
        convert_annotation(image_id, 0, savepath)
    list_file.close()
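
To make the generated label format concrete, here is a worked example of what convert() computes (all numbers are illustrative): a box with xmin=100, xmax=300, ymin=50, ymax=250 in a 640x480 image.

# mirrors convert((w, h), (xmin, xmax, ymin, ymax)) with illustrative numbers
size = (640, 480)                      # image width, height
box = (100.0, 300.0, 50.0, 250.0)      # xmin, xmax, ymin, ymax
x = (box[0] + box[1]) / 2.0 / size[0]  # 0.3125   normalized center x
y = (box[2] + box[3]) / 2.0 / size[1]  # 0.3125   normalized center y
w = (box[1] - box[0]) / size[0]        # 0.3125   normalized width
h = (box[3] - box[2]) / size[1]        # ~0.41667 normalized height
print(0, x, y, w, h)  # label line for class 0: "0 0.3125 0.3125 0.3125 0.4166..."

So each line of a label txt file is: classid x_center y_center width height, all normalized to [0, 1].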

For training: once compiled, you have darknet.exe on Windows or the darknet executable on Ubuntu.

Then, in the same directory as darknet.exe, create a folder, say train_person for pedestrian detection. Inside it create a backup folder and a data folder. Copy the augmented image folder img_less and the xml folder into data, put trans1.py and trans2.py into data as well, run trans1.py first and then trans2.py, and finally copy the generated txt files into the training image directory.

As for editing the cfg file, see other blog posts. I will just mention the anchors: faster-rcnn uses 9 anchors with fixed ratios, whereas YOLO clusters the annotated boxes of the training set. I only changed mine after verifying that it actually helped.

# coding=utf-8
# k-means++ for YOLOv2 anchors
# computes the anchor sizes YOLOv2 needs via k-means++
import numpy as np

# Box class describing the coordinates of a bounding box
class Box():
    def __init__(self, x, y, w, h):
        self.x = x
        self.y = y
        self.w = w
        self.h = h

# overlap of two boxes along one axis
# x1: center of box1 on that axis; len1: length of box1 on that axis
# x2: center of box2 on that axis; len2: length of box2 on that axis
# returns the overlap length on that axis
def overlap(x1, len1, x2, len2):
    len1_half = len1 / 2
    len2_half = len2 / 2
    left = max(x1 - len1_half, x2 - len2_half)
    right = min(x1 + len1_half, x2 + len2_half)
    return right - left

# intersection area of Box a and Box b
def box_intersection(a, b):
    w = overlap(a.x, a.w, b.x, b.w)
    h = overlap(a.y, a.h, b.y, b.h)
    if w < 0 or h < 0:
        return 0
    area = w * h
    return area

# union area of Box a and Box b
def box_union(a, b):
    i = box_intersection(a, b)
    #print a.w,a.h,b.w,b.h
    u = a.w * a.h + b.w * b.h - i
    return u

# IoU of Box a and Box b
def box_iou(a, b):
    #print box_union(a, b)
    return box_intersection(a, b) / box_union(a, b)

# initialize centroids with k-means++ to reduce the influence of random init
# boxes: list of Box objects for all bounding boxes
# n_anchors: the k of k-means
# returns the n_anchors initial centroids
def init_centroids(boxes, n_anchors):
    centroids = []
    boxes_num = len(boxes)
    centroid_index = np.random.choice(boxes_num, 1)
    centroids.append(boxes[centroid_index[0]])  # index with [0]: a 1-element array cannot index a list
    print(centroids[0].w, centroids[0].h)
    for centroid_index in range(0, n_anchors - 1):
        sum_distance = 0
        distance_thresh = 0
        distance_list = []
        cur_sum = 0
        for box in boxes:
            min_distance = 1
            for centroid_i, centroid in enumerate(centroids):
                distance = (1 - box_iou(box, centroid))
                if distance < min_distance:
                    min_distance = distance
            sum_distance += min_distance
            distance_list.append(min_distance)
        distance_thresh = sum_distance * np.random.random()
        for i in range(0, boxes_num):
            cur_sum += distance_list[i]
            if cur_sum > distance_thresh:
                centroids.append(boxes[i])
                print(boxes[i].w, boxes[i].h)
                break
    return centroids

# run one k-means step to compute new centroids
# boxes: list of Box objects for all bounding boxes
# n_anchors: the k of k-means
# centroids: the current cluster centers
# returns new_centroids (the recomputed centers), groups (the boxes in each
# of the n_anchors clusters) and loss (the summed distance of every box to
# its nearest centroid)
def do_kmeans(n_anchors, boxes, centroids):
    loss = 0
    groups = []
    new_centroids = []
    for i in range(n_anchors):
        groups.append([])
        new_centroids.append(Box(0, 0, 0, 0))
    for box in boxes:
        min_distance = 1
        group_index = 0
        for centroid_index, centroid in enumerate(centroids):
            distance = (1 - box_iou(box, centroid))
            if distance < min_distance:
                min_distance = distance
                group_index = centroid_index
        groups[group_index].append(box)
        loss += min_distance
        new_centroids[group_index].w += box.w
        new_centroids[group_index].h += box.h
    for i in range(n_anchors):
        new_centroids[i].w /= len(groups[i])
        new_centroids[i].h /= len(groups[i])
    return new_centroids, groups, loss

# compute n_anchors centroids for the given bounding boxes
# label_path: path of the training-list file
# n_anchors: number of anchors
# loss_convergence: smallest allowed change of the loss
# grid_size * grid_size is the number of grid cells
# iterations_num: maximum number of iterations
# plus = 1 enables k-means++ initialization of the centroids
def compute_centroids(label_path, n_anchors, loss_convergence, grid_size, iterations_num, plus):
    boxes = []
    label_files = []
    f = open(label_path)
    for line in f:
        label_path = line.rstrip().replace('images', 'labels')
        label_path = label_path.replace('JPEGImages', 'labels')
        label_path = label_path.replace('.jpg', '.txt')
        label_path = label_path.replace('.JPEG', '.txt')
        label_files.append(label_path)
    f.close()

    for label_file in label_files:
        f = open(label_file)
        for line in f:
            temp = line.strip().split(" ")
            if len(temp) > 1:
                boxes.append(Box(0, 0, float(temp[3]), float(temp[4])))
                print(temp[3], temp[4])
                if float(temp[3]) < 0:
                    print(label_file)
    print('done')

    if plus:
        centroids = init_centroids(boxes, n_anchors)
    else:
        centroid_indices = np.random.choice(len(boxes), n_anchors)
        centroids = []
        for centroid_index in centroid_indices:
            centroids.append(boxes[centroid_index])

    # iterate k-means
    centroids, groups, old_loss = do_kmeans(n_anchors, boxes, centroids)
    iterations = 1
    while (True):
        centroids, groups, loss = do_kmeans(n_anchors, boxes, centroids)
        iterations = iterations + 1
        print("loss = %f" % loss)
        if abs(old_loss - loss) < loss_convergence or iterations > iterations_num:
            break
        old_loss = loss
        for centroid in centroids:
            print(centroid.w * grid_size, centroid.h * grid_size)

    # print result
    for centroid in centroids:
        #print("k-means result:\t ", centroid.w * grid_size, ",", centroid.h * grid_size)
        #str('%.03f'%maxVal)
        print("k-means result:\t ", str('%.06f' % (centroid.w * grid_size)), ",",
              str('%.06f' % (centroid.h * grid_size)))

label_path = "xx/train_person/data/trainImagePath.txt"
n_anchors = 5
loss_convergence = 1e-3
grid_size = 13
iterations_num = 10000000
plus = 0
compute_centroids(label_path, n_anchors, loss_convergence, grid_size, iterations_num, plus)
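
The five printed w,h pairs (in units of the 13x13 grid) replace the anchors line in the [region] section of the v2 cfg. A sketch of what that section looks like; the anchor values below are placeholders, not computed results, and classes/num have to match your own setup:

[region]
anchors = 1.08,1.19, 3.42,4.41, 6.63,11.38, 9.42,5.11, 16.62,10.52
classes=1
num=5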

5. Training and testing

Just use the command line. If there is enough memory, open a second command-line window during training so you can occasionally check the prediction quality while training runs.
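
For example, reusing the file names from the VOC example earlier (backup/yolo-voc_final.weights assumes darknet's default naming for weights written into the backup folder):

darknet.exe detector train cfg/voc.data cfg/yolo-voc.cfg cfg/darknet19_448.conv.23
darknet.exe detector test cfg/voc.data cfg/yolo-voc.cfg backup/yolo-voc_final.weights data/dog.jpg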

6. Calling from OpenCV

C++ version

#include <opencv2/opencv.hpp>
#include <opencv2/dnn.hpp>

#include <fstream>
#include <iostream>
#include <algorithm>
#include <cstdlib>

using namespace std;
using namespace cv;
using namespace cv::dnn;

//char* jpgpath = "";
//char* cfgpath = "";
//char* weightspath = "";
//char* namespath = "";
//Mat boxes = yoloMultiPredict(jpgpath, cfgpath, weightspath, namespath);
//cout << "boxes:\n" << boxes << endl;

// returned rows: class prob xmin xmax ymin ymax
Mat yoloMultiPredict(char* jpgpath, char* cfgpath, char* weightspath, char* namespath)
{
    Mat boxes = Mat::zeros(0, 6, CV_16SC1);  // 6 columns: class prob xmin xmax ymin ymax
    Mat frame = imread(jpgpath);

    dnn::Net net = readNetFromDarknet(cfgpath, weightspath);
    if (net.empty())
    {
        printf("Could not load net...\n");
    }

    // read the class names
    {
        ifstream classNamesFile(namespath);
        vector<string> classNamesVec;
        if (classNamesFile.is_open())
        {
            string className = "";
            while (std::getline(classNamesFile, className))
                classNamesVec.push_back(className);
        }
        for (int i = 0; i < classNamesVec.size(); i++)
            cout << i << "\t" << classNamesVec[i] << endl;
        cout << endl;
    }

    // 416x416 matches the network input size in the cfg
    Mat inputBlob = blobFromImage(frame, 1 / 255.F, Size(416, 416), Scalar(), true, false);
    net.setInput(inputBlob, "data");

    // detect
    Mat detectionMat = net.forward("detection_out");
    //cout << "forward" << endl;

    // collect the results; each row is x y w h objectness followed by the class scores
    for (int i = 0; i < detectionMat.rows; i++)
    {
        const int probability_index = 5;
        const int probability_size = detectionMat.cols - probability_index;
        float *prob_array_ptr = &detectionMat.at<float>(i, probability_index);
        size_t objectClass = max_element(prob_array_ptr, prob_array_ptr + probability_size) - prob_array_ptr;
        float confidence = detectionMat.at<float>(i, (int)objectClass + probability_index);
        if (confidence > 0.24)
        {
            float x = detectionMat.at<float>(i, 0);
            float y = detectionMat.at<float>(i, 1);
            float width = detectionMat.at<float>(i, 2);
            float height = detectionMat.at<float>(i, 3);
            int xmin = static_cast<int>((x - width / 2) * frame.cols);
            int xmax = static_cast<int>((x + width / 2) * frame.cols);
            int ymin = static_cast<int>((y - height / 2) * frame.rows);
            int ymax = static_cast<int>((y + height / 2) * frame.rows);
            // clip
            if (xmin < 0)
                xmin = 0;
            if (xmax > frame.cols)
                xmax = frame.cols - 1;
            if (ymin < 0)
                ymin = 0;
            if (ymax > frame.rows)
                ymax = frame.rows - 1;

            //rectangle(frame, cvPoint(xmin, ymin), cvPoint(xmax, ymax), Scalar(0, 0, 255), 4, 1, 0);
            //cout << "x y w h\t" << x << "\t" << y << "\t" << width << "\t" << height << endl;

            // class prob xmin xmax ymin ymax; the probability is stored as a
            // percentage here (assumed scale) so it fits into a short
            Mat L = (Mat_<short>(1, 6) << (short)objectClass, (short)(confidence * 100), xmin, xmax, ymin, ymax);
            //cout << "L:" << L << endl;
            boxes.push_back(L);
        }
    }
    return boxes;
}

OpenCV builds from after 2018-04-18 can already load YOLO v3 training files.

Predictions made directly from the command line and predictions from the same config files loaded into OpenCV differ, sometimes by a lot. Not something to rely on.

Whatever framework you train in, use the model in that same framework; otherwise numerical consistency is not guaranteed.

Python version

Adapted from the example under OpenCV's samples/dnn.

import cv2
import numpy as np
import os

cwd = os.path.split(os.path.realpath(__file__))[0]

def darknetPredict(jpgpathOrMat, cfgpath, wtspath):
    net = cv2.dnn.readNetFromDarknet(cfgpath, wtspath)
    confThreshold = 0.24
    nmsThreshold = 0.4

    def getOutputsNames(net):
        layersNames = net.getLayerNames()
        return [layersNames[i[0] - 1] for i in net.getUnconnectedOutLayers()]

    if isinstance(jpgpathOrMat, str):
        frame = cv2.imread(jpgpathOrMat)
    if isinstance(jpgpathOrMat, np.ndarray):
        frame = jpgpathOrMat

    frameHeight = frame.shape[0]
    frameWidth = frame.shape[1]

    # Create a 4D blob from a frame.
    inpW = 416
    inpH = 416
    blob = cv2.dnn.blobFromImage(frame, 1.0 / 255, (inpW, inpH), (0, 0, 0), swapRB=True, crop=False)
    net.setInput(blob)

    # Run the model
    outs = net.forward(getOutputsNames(net))

    classIds = []
    confidences = []
    boxes = []
    for out in outs:
        # print('out:', out)
        for detection in out:
            scores = detection[5:]
            classId = np.argmax(scores)
            confidence = scores[classId]
            if confidence > confThreshold:
                center_x = int(detection[0] * frameWidth)
                center_y = int(detection[1] * frameHeight)
                width = int(detection[2] * frameWidth)
                height = int(detection[3] * frameHeight)
                left = int(center_x - width / 2)
                top = int(center_y - height / 2)
                classIds.append(classId)
                confidences.append(float(confidence))
                boxes.append([left, top, width, height])

    rst_boxes = []
    indices = cv2.dnn.NMSBoxes(boxes, confidences, confThreshold, nmsThreshold)
    for i in indices:
        i = i[0]
        box = boxes[i]
        # box columns: left(xmin) top(ymin) w h
        #              0          1         2 3
        left = box[0]
        top = box[1]
        width = box[2]
        height = box[3]
        # print("confidences:", confidences[i])
        xmin, ymin, xmax, ymax = [left, top, left + width, top + height]
        xmin = np.clip(xmin, 0, frameWidth - 1)
        xmax = np.clip(xmax, 0, frameWidth - 1)
        ymin = np.clip(ymin, 0, frameHeight - 1)
        ymax = np.clip(ymax, 0, frameHeight - 1)
        # result columns: classid prob xmin ymin xmax ymax
        #                 0       1    2    3    4    5
        line = [classIds[i], confidences[i], xmin, ymin, xmax, ymax]
        rst_boxes.append(line)

    rst_boxes = np.asarray(rst_boxes)
    return rst_boxes
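
A minimal usage sketch (the three paths are placeholders for your own files):

# minimal usage sketch; all three paths below are placeholders
boxes = darknetPredict("test.jpg", "yolo-voc.cfg", "yolo-voc_final.weights")
for classid, prob, xmin, ymin, xmax, ymax in boxes:
    print(int(classid), prob, int(xmin), int(ymin), int(xmax), int(ymax))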

That's about it.