# YOLOv3: Implementing an Object Detection Model in PyTorch

Date: 2025-05-14 14:26:25

Introduction

YOLOv3 (You Only Look Once, version 3) is a popular single-stage object detection algorithm that predicts bounding boxes and class probabilities directly from the input image. Its combination of speed and accuracy makes it well suited to real-time detection tasks. This article walks through a PyTorch implementation of the YOLOv3 model and provides the complete code.

1. Overview of YOLOv3

YOLOv3 is the third version of the YOLO family of algorithms. It improves on the previous two versions in both detection accuracy and speed. Its main characteristics are:

  • Single-stage detection: YOLOv3 predicts bounding boxes and class probabilities directly from the input image, without first generating region proposals.
  • Multi-scale detection: YOLOv3 detects on feature maps at three different scales, so it can handle objects of different sizes (see the short calculation after this list).
  • Efficiency: the design allows YOLOv3 to run fast enough for real-time applications.
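
For a standard 416×416 input, the three detection scales correspond to feature-map strides of 32, 16, and 8, which gives grids of 13×13, 26×26, and 52×52 cells. A quick illustrative calculation (not part of the original code):

# Grid sizes of the three YOLOv3 detection scales for a 416x416 input
img_size = 416
for stride in (32, 16, 8):
    cells = img_size // stride
    print(f"stride {stride}: {cells} x {cells} grid")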

2. Environment Setup

Before starting, make sure the following dependencies are installed (the utils.parse_config and utils.utils modules imported below belong to the project code, not to pip packages):

pip install torch numpy matplotlib
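
A quick optional sanity check confirms that PyTorch imports correctly and reports whether a CUDA GPU is available:

import torch

print(torch.__version__)          # installed PyTorch version
print(torch.cuda.is_available())  # True if a CUDA-capable GPU can be used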

3. Implementation

3.1 Importing the Required Libraries

from __future__ import division

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np

from utils.parse_config import *  # helpers that parse the Darknet .cfg model configuration
from utils.utils import build_targets, to_cpu, non_max_suppression  # target building and post-processing

import matplotlib.pyplot as plt
import matplotlib.patches as patches

3.2 Building the Module List

The create_modules function builds the network layers from the parsed configuration file:

def create_modules(module_defs):
    """
    Constructs module list of layer blocks from module configuration in module_defs
    """
    hyperparams = module_defs.pop(0)  # the first block holds the network hyperparameters
    output_filters = [int(hyperparams["channels"])]  # output channels of each layer, starting with the input channels
    module_list = nn.ModuleList()  # ModuleList that stores every layer block

    for module_i, module_def in enumerate(module_defs):
        modules = nn.Sequential()  # sequential container for the layers of this block

        if module_def["type"] == "convolutional":
            # read the convolution parameters
            bn = int(module_def.get("batch_normalize", 0))  # default to 0 if the key is absent
            filters = int(module_def["filters"])  # number of convolution kernels (output channels)
            kernel_size = int(module_def["size"])
            pad = (kernel_size - 1) // 2
            # add the convolution layer
            modules.add_module(
                f"conv_{module_i}",  # layer name
                nn.Conv2d(
                    in_channels=output_filters[-1],  # number of input channels
                    out_channels=filters,  # number of output channels
                    kernel_size=kernel_size,  # kernel size
                    stride=int(module_def["stride"]),  # convolution stride
                    padding=pad,  # amount of zero padding
                    bias=not bn,  # no bias when batch normalization follows
                ),
            )
            if bn:
                # add the batch-normalization layer
                modules.add_module(f"batch_norm_{module_i}", nn.BatchNorm2d(filters, momentum=0.9))
            if module_def["activation"] == "leaky":
                # add the LeakyReLU activation
                modules.add_module(f"leaky_{module_i}", nn.LeakyReLU(0.1))

        elif module_def["type"] == "maxpool":
            # read the max-pooling parameters
            kernel_size = int(module_def["size"])
            stride = int(module_def["stride"])
            if kernel_size == 2 and stride == 1:
                # add zero padding so the output keeps the input resolution
                modules.add_module(f"_debug_padding_{module_i}", nn.ZeroPad2d((0, 1, 0, 1)))
            # add the max-pooling layer
            maxpool = nn.MaxPool2d(kernel_size=kernel_size, stride=stride, padding=int((kernel_size - 1) // 2))
            modules.add_module(f"maxpool_{module_i}", maxpool)

        elif module_def["type"] == "upsample":
            # add the upsampling layer
            upsample = Upsample(scale_factor=int(module_def["stride"]), mode="nearest")
            modules.add_module(f"upsample_{module_i}", upsample)

        elif module_def["type"] == "route":
            # route layer: concatenate feature maps from earlier layers
            layers = [int(x) for x in module_def["layers"].split(",")]
            filters = sum([output_filters[1:][i] for i in layers])
            modules.add_module(f"route_{module_i}", EmptyLayer())  # placeholder layer

        elif module_def["type"] == "shortcut":
            # shortcut (residual) layer
            filters = output_filters[1:][int(module_def["from"])]
            modules.add_module(f"shortcut_{module_i}", EmptyLayer())  # placeholder layer

        elif module_def["type"] == "yolo":
            # read the YOLO layer parameters
            anchor_idxs = [int(x) for x in module_def["mask"].split(",")]
            anchors = [int(x) for x in module_def["anchors"].split(",")]
            anchors = [(anchors[i], anchors[i + 1]) for i in range(0, len(anchors), 2)]
            anchors = [anchors[i] for i in anchor_idxs]
            num_classes = int(module_def["classes"])
            img_size = int(hyperparams["height"])
            # define the detection layer
            yolo_layer = YOLOLayer(anchors, num_classes, img_size)
            modules.add_module(f"yolo_{module_i}", yolo_layer)

        # append the block to the module list
        module_list.append(modules)
        output_filters.append(filters)  # record the number of output channels of this block

    return hyperparams, module_list
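
For reference, module_defs is the list of dictionaries produced by the configuration parser in utils/parse_config (each block of the .cfg file becomes one dict of string values). The following hand-written miniature example is only a sketch to show the expected structure; the real yolov3.cfg contains far more blocks:

# Hypothetical, hand-written module_defs that mimics the parser output.
# The first entry holds the network hyperparameters; the rest describe layers.
module_defs = [
    {"type": "net", "channels": "3", "height": "416", "width": "416"},
    {"type": "convolutional", "batch_normalize": "1", "filters": "32",
     "size": "3", "stride": "1", "activation": "leaky"},
    {"type": "convolutional", "batch_normalize": "1", "filters": "64",
     "size": "3", "stride": "2", "activation": "leaky"},
]

hyperparams, module_list = create_modules(module_defs)
print(len(module_list))  # 2 -> one Sequential block per layer definition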

3.3 Upsample Layer

The Upsample class wraps F.interpolate to perform upsampling (the original code notes that nn.Upsample is deprecated):

class Upsample(nn.Module):
    """ nn.Upsample is deprecated """

    def __init__(self, scale_factor, mode="nearest"):
        super(Upsample, self).__init__()
        self.scale_factor = scale_factor  # upsampling factor
        self.mode = mode  # interpolation mode

    def forward(self, x):
        # upsample with PyTorch's F.interpolate
        x = F.interpolate(x, scale_factor=self.scale_factor, mode=self.mode)
        return x
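
As a quick illustrative check (a sketch, not part of the original code), nearest-neighbor upsampling with scale_factor=2 doubles the spatial dimensions of a feature map:

# A 1x256x13x13 feature map becomes 1x256x26x26
up = Upsample(scale_factor=2, mode="nearest")
feat = torch.zeros(1, 256, 13, 13)
print(up(feat).shape)  # torch.Size([1, 256, 26, 26])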

3.4 Empty Layer

The EmptyLayer class is a placeholder used by the route and shortcut layers:

class EmptyLayer(nn.Module):
    """Placeholder for 'route' and 'shortcut' layers"""

    def __init__(self):
        super(EmptyLayer, self).__init__()
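
EmptyLayer performs no computation of its own; the actual concatenation (route) and residual addition (shortcut) happen in the Darknet forward pass, which keeps a list of every layer's output. The following hypothetical helper is only a sketch of how that dispatch typically looks, assuming layer_outputs is that list of cached feature maps:

def resolve_route_shortcut(module_def, layer_outputs):
    """Sketch of the route/shortcut dispatch performed in the model's forward pass."""
    if module_def["type"] == "route":
        # concatenate the selected earlier feature maps along the channel dimension
        return torch.cat([layer_outputs[int(i)] for i in module_def["layers"].split(",")], 1)
    if module_def["type"] == "shortcut":
        # residual connection: add an earlier feature map to the previous layer's output
        return layer_outputs[-1] + layer_outputs[int(module_def["from"])]
    raise ValueError("not a route/shortcut block")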

3.5 YOLO Detection Layer

The YOLOLayer class predicts bounding boxes, objectness confidence, and class probabilities:

class YOLOLayer(nn.Module):
    """Detection layer"""

    def __init__(self, anchors, num_classes, img_dim=416):
        super(YOLOLayer, self).__init__()
        self.anchors = anchors  # anchor boxes (w, h) for this detection scale
        self.num_anchors = len(anchors)  # number of anchors
        self.num_classes = num_classes  # number of classes
        self.ignore_thres = 0.5  # IoU threshold above which anchors are ignored in the no-object loss
        self.mse_loss = nn.MSELoss()  # mean squared error loss for box regression
        self.bce_loss = nn.BCELoss()  # binary cross-entropy loss for objectness and class scores
        self.obj_scale = 1  # weight of the objectness loss for cells that contain an object
        self.noobj_scale = 100  # weight of the objectness loss for cells without an object
        self.metrics = {}  # evaluation metrics collected during training

        self.img_dim = img_dim  # input image size
        self.grid_size = 0  # current grid size (set in compute_grid_offsets)

    def compute_grid_offsets(self, grid_size, cuda=True):
        """计算网格偏移量"""
        self.grid_size = grid_size
        g = self.grid_size
        FloatTensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor
        self.stride = self.img_dim / self.grid_size  # 每个网格的像素大小
        # 计算每个网格的偏移量
        self.grid_x = torch.arange(g).repeat(g, 1).view([1, 1, g, g]).type(FloatTensor)
        self.grid_y = torch.arange(g).repeat(g, 1).t().view([1, 1, g, g]).type(FloatTensor)
        self.scaled_anchors = FloatTensor([(a_w / self.stride, a_h / self.stride) for a_w, a_h in self.anchors])
        self.anchor_w = self.scaled_anchors[:, 0:1].view((1, self.num_anchors, 1, 1))
        self.anchor_h = self.scaled_anchors[:, 1:2].view((1, self.num_anchors, 1, 1))

    def forward(self, x, targets=None, img_dim=None):
        """前向传播"""
        FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor
        LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor
        ByteTensor = torch.cuda.ByteTensor if x.is_cuda else torch.ByteTensor

        self.img_dim = img_dim
        num_samples = x.size(0)
        grid_size = x.size(2)

        # reshape the raw output to (batch, anchors, grid, grid, 5 + num_classes)
        prediction = (
            x.view(num_samples, self.num_anchors, self.num_classes + 5, grid_size, grid_size)
            .permute(0, 1, 3, 4, 2)
            .contiguous()
        )
        # split the prediction into its components
        x = torch.sigmoid(prediction[..., 0])  # box center x
        y = torch.sigmoid(prediction[..., 1])  # box center y
        w = prediction[..., 2]  # box width
        h = prediction[..., 3]  # box height
        pred_conf = torch.sigmoid(prediction[..., 4])  # objectness confidence
        pred_cls = torch.sigmoid(prediction[..., 5:])  # class predictions

        # recompute the grid offsets if the grid size has changed
        if grid_size != self.grid_size:
            self.compute_grid_offsets(grid_size, cuda=x.is_cuda)

        # add the cell offsets and scale the anchors
        pred_boxes = FloatTensor(prediction[..., :4].shape)
        pred_boxes[..., 0] = x.data + self.grid_x
        pred_boxes[..., 1] = y.data + self.grid_y
        pred_boxes[..., 2] = torch.exp(w.data) * self.anchor_w
        pred_boxes[..., 3] = torch.exp(h.data) * self.anchor_h

        # concatenate the final output (boxes rescaled to pixels, confidence, class scores)
        output = torch.cat(
            (
                pred_boxes.view(num_samples, -1, 4) * self.stride,
                pred_conf.view(num_samples, -1, 1),
                pred_cls.view(num_samples, -1, self.num_classes),
            ),
            -1,
        )

        if targets is None:
            return output, 0
        else:
            # build the target tensors from the ground-truth boxes
            iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf = build_targets(
                pred_boxes=pred_boxes,
                pred_cls=pred_cls,
                target=targets,
                anchors=self.scaled_anchors,
                ignore_thres=self.ignore_thres,
            )
            # compute the loss terms (box and class losses only use cells that contain an object)
            loss_x = self.mse_loss(x[obj_mask], tx[obj_mask])
            loss_y = self.mse_loss(y[obj_mask], ty[obj_mask])
            loss_w = self.mse_loss(w[obj_mask], tw[obj_mask])
            loss_h = self.mse_loss(h[obj_mask], th[obj_mask])
            loss_conf_obj = self.bce_loss(pred_conf[obj_mask], tconf[obj_mask])
            loss_conf_noobj = self.bce_loss(pred_conf[noobj_mask], tconf[noobj_mask])
            loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj
            loss_cls = self.bce_loss(pred_cls[obj_mask], tcls[obj_mask])
            total_loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls

            # compute evaluation metrics
            cls_acc = 100 * class_mask[obj_mask].mean()
            conf_obj = pred_conf[obj_mask].mean()
            conf_noobj = pred_conf[noobj_mask].mean()
            conf50 = (pred_conf > 0.5).float()
            iou50 = (iou_scores > 0.5).float()
            iou75 = (iou_scores > 0.75).float()
            detected_mask = conf50 * class_mask * tconf
            precision = torch.sum(iou50 * detected_mask) / (conf50.sum() + 1e-16)
            recall50 = torch.sum(iou50 * detected_mask) / (obj_mask.sum() + 1e-16)
            recall75 = torch.sum(iou75 * detected_mask) / (obj_mask.sum() + 1e-16)

            self.metrics = {
                "loss": to_cpu(total_loss).item(),
                "x": to_cpu(loss_x).item(),
                "y": to_cpu(loss_y).item(),
                "w": to_cpu(loss_w).item(),
                "h": to_cpu(loss_h).item(),
                "conf": to_cpu(loss_conf).item(),
                "cls": to_cpu(loss_cls).item(),
                "cls_acc": to_cpu(cls_acc).item(),
                "recall50": to_cpu(recall50).item(),
                "recall75": to_cpu(recall75).item(),
                "precision": to_cpu(precision).item(),
                "conf_obj": to_cpu(conf_obj).item(),
                "conf_noobj": to_cpu(conf_noobj).item(),
                "grid_size": grid_size,
            }

            return output, total_loss
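
To make the tensor shapes concrete, here is a small smoke test (a sketch, not part of the original code). It assumes 80 classes and uses the anchor values commonly listed in yolov3.cfg for the coarsest 13×13 scale:

# Each anchor predicts 5 + 80 values, so the input feature map needs
# 3 * 85 = 255 channels; at stride 32 a 416x416 image gives a 13x13 grid.
layer = YOLOLayer(anchors=[(116, 90), (156, 198), (373, 326)], num_classes=80, img_dim=416)
dummy = torch.randn(2, 255, 13, 13)
output, loss = layer(dummy, targets=None, img_dim=416)
print(output.shape)  # torch.Size([2, 507, 85]) -> 13 * 13 * 3 predictions per image
print(loss)          # 0 when no targets are given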

3.6 The YOLOv3 Model

The Darknet class is the main body of the YOLOv3 model: it loads the configuration file, builds the network, runs the forward pass, and loads and saves weights:

class Darknet(nn.Module):
    """YOLOv3 object detection model"""

    def __init__(self, config_path, img_size=416):
        super(Dark