YOLOv5 Improvement Series (32): Replacing the Backbone with PKINet (CVPR 2024 | a backbone for oriented remote sensing detection that effectively captures dense texture features across scales)

Posted: 2025-04-02 09:12:29
import math
from typing import Optional, Union, Sequence

import torch
import torch.nn as nn
from torch.nn.modules.batchnorm import _BatchNorm
from mmcv.cnn import ConvModule, build_norm_layer
from mmcv.cnn.bricks import DropPath
from mmengine.model import BaseModule, constant_init
from mmengine.model.weight_init import trunc_normal_init, normal_init
from mmengine.logging import MMLogger

from models.common import C3, Conv

# from mmrotate.models.builder import ROTATED_BACKBONES
# from utils import autopad, make_divisible, BHWC2BCHW, BCHW2BHWC


def autopad(kernel_size: int, padding: int = None, dilation: int = 1):
    """Compute 'same' padding for an odd kernel size, taking dilation into account."""
    assert kernel_size % 2 == 1, 'if use autopad, kernel size must be odd'
    if dilation > 1:
        kernel_size = dilation * (kernel_size - 1) + 1
    if padding is None:
        padding = kernel_size // 2
    return padding

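# Illustrative sanity check (not part of the original code): autopad returns the
# padding that keeps the spatial size unchanged for stride-1 convs with odd kernels.
#   autopad(11)          -> 5   (plain 11x11 kernel)
#   autopad(3, None, 2)  -> 2   (3x3 kernel with dilation 2 has an effective size of 5)
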
def make_divisible(value, divisor, min_value=None, min_ratio=0.9):
    """Make divisible function.

    This function rounds the channel number to the nearest value that can be
    divisible by the divisor. It is taken from the original tf repo. It ensures
    that all layers have a channel number that is divisible by divisor. It can
    be seen here: https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py  # noqa

    Args:
        value (int, float): The original channel number.
        divisor (int): The divisor to fully divide the channel number.
        min_value (int): The minimum value of the output channel.
            Default: None, means that the minimum value equal to the divisor.
        min_ratio (float): The minimum ratio of the rounded channel number to
            the original channel number. Default: 0.9.

    Returns:
        int: The modified output channel number.
    """
    if min_value is None:
        min_value = divisor
    new_value = max(min_value, int(value + divisor / 2) // divisor * divisor)
    # Make sure that round down does not go down by more than (1-min_ratio).
    if new_value < min_ratio * value:
        new_value += divisor
    return new_value

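# Illustrative values (not part of the original code), showing the rounding behaviour:
#   make_divisible(64 * 0.5, 8)  -> 32   (already a multiple of 8)
#   make_divisible(100, 8)       -> 104  (rounds to the nearest multiple of 8)
#   make_divisible(62.5, 8)      -> 64   (64 >= 0.9 * 62.5, so no extra divisor is added)
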
class BCHW2BHWC(nn.Module):
    """Permute a tensor from (B, C, H, W) to (B, H, W, C)."""
    def __init__(self):
        super().__init__()

    @staticmethod
    def forward(x):
        return x.permute([0, 2, 3, 1])


class BHWC2BCHW(nn.Module):
    """Permute a tensor from (B, H, W, C) to (B, C, H, W)."""
    def __init__(self):
        super().__init__()

    @staticmethod
    def forward(x):
        return x.permute([0, 3, 1, 2])

class GSiLU(BaseModule):
    """Global Sigmoid-Gated Linear Unit, reproduced from paper <SIMPLE CNN FOR VISION>"""
    def __init__(self):
        super().__init__()
        self.adpool = nn.AdaptiveAvgPool2d(1)

    def forward(self, x):
        # Gate the features with a sigmoid of their global average.
        return x * torch.sigmoid(self.adpool(x))

class CAA(BaseModule):
    """Context Anchor Attention"""
    def __init__(
            self,
            channels: int,
            h_kernel_size: int = 11,
            v_kernel_size: int = 11,
            norm_cfg: Optional[dict] = dict(type='BN', momentum=0.03, eps=0.001),
            act_cfg: Optional[dict] = dict(type='SiLU'),
            init_cfg: Optional[dict] = None,
    ):
        super().__init__(init_cfg)
        self.avg_pool = nn.AvgPool2d(7, 1, 3)
        self.conv1 = ConvModule(channels, channels, 1, 1, 0,
                                norm_cfg=norm_cfg, act_cfg=act_cfg)
        self.h_conv = ConvModule(channels, channels, (1, h_kernel_size), 1,
                                 (0, h_kernel_size // 2), groups=channels,
                                 norm_cfg=None, act_cfg=None)
        self.v_conv = ConvModule(channels, channels, (v_kernel_size, 1), 1,
                                 (v_kernel_size // 2, 0), groups=channels,
                                 norm_cfg=None, act_cfg=None)
        self.conv2 = ConvModule(channels, channels, 1, 1, 0,
                                norm_cfg=norm_cfg, act_cfg=act_cfg)
        self.act = nn.Sigmoid()

    def forward(self, x):
        # avg pool -> 1x1 conv -> horizontal strip conv -> vertical strip conv -> 1x1 conv -> sigmoid
        attn_factor = self.act(self.conv2(self.v_conv(self.h_conv(self.conv1(self.avg_pool(x))))))
        return attn_factor

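# Illustrative usage (not part of the original code): CAA produces a sigmoid
# attention map with the same shape as its input, which the caller multiplies
# back onto the features (see InceptionBottleneck below). Assuming mmcv is installed:
#   caa = CAA(64)
#   attn = caa(torch.randn(1, 64, 32, 32))   # -> shape (1, 64, 32, 32), values in (0, 1)
#   out = attn * features                    # element-wise gating of the features
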
class ConvFFN(BaseModule):
    """Multi-layer perceptron implemented with ConvModule"""
    def __init__(
            self,
            in_channels: int,
            out_channels: Optional[int] = None,
            hidden_channels_scale: float = 4.0,
            hidden_kernel_size: int = 3,
            dropout_rate: float = 0.,
            add_identity: bool = True,
            norm_cfg: Optional[dict] = dict(type='BN', momentum=0.03, eps=0.001),
            act_cfg: Optional[dict] = dict(type='SiLU'),
            init_cfg: Optional[dict] = None,
    ):
        super().__init__(init_cfg)
        out_channels = out_channels or in_channels
        hidden_channels = int(in_channels * hidden_channels_scale)

        self.ffn_layers = nn.Sequential(
            BCHW2BHWC(),
            nn.LayerNorm(in_channels),
            BHWC2BCHW(),
            ConvModule(in_channels, hidden_channels, kernel_size=1, stride=1, padding=0,
                       norm_cfg=norm_cfg, act_cfg=act_cfg),
            ConvModule(hidden_channels, hidden_channels, kernel_size=hidden_kernel_size, stride=1,
                       padding=hidden_kernel_size // 2, groups=hidden_channels,
                       norm_cfg=norm_cfg, act_cfg=None),
            GSiLU(),
            nn.Dropout(dropout_rate),
            ConvModule(hidden_channels, out_channels, kernel_size=1, stride=1, padding=0,
                       norm_cfg=norm_cfg, act_cfg=act_cfg),
            nn.Dropout(dropout_rate),
        )
        self.add_identity = add_identity

    def forward(self, x):
        x = x + self.ffn_layers(x) if self.add_identity else self.ffn_layers(x)
        return x

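# Reading aid (not part of the original code): ConvFFN is a channel MLP in conv
# form, i.e. LayerNorm (applied over channels via the permute wrappers) ->
# 1x1 expansion -> depthwise conv -> GSiLU gate -> 1x1 projection. With the
# defaults it preserves the input shape, e.g.
#   ConvFFN(64)(torch.randn(1, 64, 32, 32)).shape  ->  torch.Size([1, 64, 32, 32])
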
class Stem(BaseModule):
    """Stem layer"""
    def __init__(
            self,
            in_channels: int,
            out_channels: int,
            expansion: float = 1.0,
            norm_cfg: Optional[dict] = dict(type='BN', momentum=0.03, eps=0.001),
            act_cfg: Optional[dict] = dict(type='SiLU'),
            init_cfg: Optional[dict] = None,
    ):
        super().__init__(init_cfg)
        hidden_channels = make_divisible(int(out_channels * expansion), 8)

        self.down_conv = ConvModule(in_channels, hidden_channels, kernel_size=3, stride=2, padding=1,
                                    norm_cfg=norm_cfg, act_cfg=act_cfg)
        self.conv1 = ConvModule(hidden_channels, hidden_channels, kernel_size=3, stride=1, padding=1,
                                norm_cfg=norm_cfg, act_cfg=act_cfg)
        self.conv2 = ConvModule(hidden_channels, out_channels, kernel_size=3, stride=1, padding=1,
                                norm_cfg=norm_cfg, act_cfg=act_cfg)

    def forward(self, x):
        return self.conv2(self.conv1(self.down_conv(x)))


class DownSamplingLayer(BaseModule):
    """Down sampling layer"""
    def __init__(
            self,
            in_channels: int,
            out_channels: Optional[int] = None,
            norm_cfg: Optional[dict] = dict(type='BN', momentum=0.03, eps=0.001),
            act_cfg: Optional[dict] = dict(type='SiLU'),
            init_cfg: Optional[dict] = None,
    ):
        super().__init__(init_cfg)
        out_channels = out_channels or (in_channels * 2)

        self.down_conv = ConvModule(in_channels, out_channels, kernel_size=3, stride=2, padding=1,
                                    norm_cfg=norm_cfg, act_cfg=act_cfg)

    def forward(self, x):
        return self.down_conv(x)

class InceptionBottleneck(BaseModule):
    """Bottleneck with Inception module"""
    def __init__(
            self,
            in_channels: int,
            out_channels: Optional[int] = None,
            kernel_sizes: Sequence[int] = (3, 5, 7, 9, 11),
            dilations: Sequence[int] = (1, 1, 1, 1, 1),
            expansion: float = 1.0,
            add_identity: bool = True,
            with_caa: bool = True,
            caa_kernel_size: int = 11,
            norm_cfg: Optional[dict] = dict(type='BN', momentum=0.03, eps=0.001),
            act_cfg: Optional[dict] = dict(type='SiLU'),
            init_cfg: Optional[dict] = None,
    ):
        super().__init__(init_cfg)
        out_channels = out_channels or in_channels
        hidden_channels = make_divisible(int(out_channels * expansion), 8)

        self.pre_conv = ConvModule(in_channels, hidden_channels, 1, 1, 0, 1,
                                   norm_cfg=norm_cfg, act_cfg=act_cfg)

        self.dw_conv = ConvModule(hidden_channels, hidden_channels, kernel_sizes[0], 1,
                                  autopad(kernel_sizes[0], None, dilations[0]), dilations[0],
                                  groups=hidden_channels, norm_cfg=None, act_cfg=None)
        self.dw_conv1 = ConvModule(hidden_channels, hidden_channels, kernel_sizes[1], 1,
                                   autopad(kernel_sizes[1], None, dilations[1]), dilations[1],
                                   groups=hidden_channels, norm_cfg=None, act_cfg=None)
        self.dw_conv2 = ConvModule(hidden_channels, hidden_channels, kernel_sizes[2], 1,
                                   autopad(kernel_sizes[2], None, dilations[2]), dilations[2],
                                   groups=hidden_channels, norm_cfg=None, act_cfg=None)
        self.dw_conv3 = ConvModule(hidden_channels, hidden_channels, kernel_sizes[3], 1,
                                   autopad(kernel_sizes[3], None, dilations[3]), dilations[3],
                                   groups=hidden_channels, norm_cfg=None, act_cfg=None)
        self.dw_conv4 = ConvModule(hidden_channels, hidden_channels, kernel_sizes[4], 1,
                                   autopad(kernel_sizes[4], None, dilations[4]), dilations[4],
                                   groups=hidden_channels, norm_cfg=None, act_cfg=None)
        self.pw_conv = ConvModule(hidden_channels, hidden_channels, 1, 1, 0, 1,
                                  norm_cfg=norm_cfg, act_cfg=act_cfg)

        if with_caa:
            self.caa_factor = CAA(hidden_channels, caa_kernel_size, caa_kernel_size, None, None)
        else:
            self.caa_factor = None

        self.add_identity = add_identity and in_channels == out_channels

        self.post_conv = ConvModule(hidden_channels, out_channels, 1, 1, 0, 1,
                                    norm_cfg=norm_cfg, act_cfg=act_cfg)

    def forward(self, x):
        x = self.pre_conv(x)

        y = x  # if there is an inplace operation of x, use y = x.clone() instead of y = x
        x = self.dw_conv(x)
        x = x + self.dw_conv1(x) + self.dw_conv2(x) + self.dw_conv3(x) + self.dw_conv4(x)
        x = self.pw_conv(x)
        if self.caa_factor is not None:
            y = self.caa_factor(y)
        if self.add_identity:
            y = x * y
            x = x + y
        else:
            x = x * y
        x = self.post_conv(x)
        return x

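# Illustrative shape check (not part of the original code). The five depthwise
# convs (3/5/7/9/11 by default) all use 'same' padding, so their outputs can be
# summed element-wise before the CAA attention map reweights the pre-conv features:
#   ib = InceptionBottleneck(64)
#   ib(torch.randn(1, 64, 32, 32)).shape   # -> torch.Size([1, 64, 32, 32])
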
class PKIBlock(BaseModule):
    """Poly Kernel Inception Block"""
    def __init__(
            self,
            in_channels: int,
            out_channels: Optional[int] = None,
            kernel_sizes: Sequence[int] = (3, 5, 7, 9, 11),
            dilations: Sequence[int] = (1, 1, 1, 1, 1),
            with_caa: bool = True,
            caa_kernel_size: int = 11,
            expansion: float = 1.0,
            ffn_scale: float = 4.0,
            ffn_kernel_size: int = 3,
            dropout_rate: float = 0.,
            drop_path_rate: float = 0.,
            layer_scale: Optional[float] = 1.0,
            add_identity: bool = True,
            norm_cfg: Optional[dict] = dict(type='BN', momentum=0.03, eps=0.001),
            act_cfg: Optional[dict] = dict(type='SiLU'),
            init_cfg: Optional[dict] = None,
    ):
        super().__init__(init_cfg)
        out_channels = out_channels or in_channels
        hidden_channels = make_divisible(int(out_channels * expansion), 8)

        if norm_cfg is not None:
            self.norm1 = build_norm_layer(norm_cfg, in_channels)[1]
            self.norm2 = build_norm_layer(norm_cfg, hidden_channels)[1]
        else:
            self.norm1 = nn.BatchNorm2d(in_channels)
            self.norm2 = nn.BatchNorm2d(hidden_channels)

        self.block = InceptionBottleneck(in_channels, hidden_channels, kernel_sizes, dilations,
                                         expansion=1.0, add_identity=True,
                                         with_caa=with_caa, caa_kernel_size=caa_kernel_size,
                                         norm_cfg=norm_cfg, act_cfg=act_cfg)
        self.ffn = ConvFFN(hidden_channels, out_channels, ffn_scale, ffn_kernel_size, dropout_rate,
                           add_identity=False, norm_cfg=None, act_cfg=None)
        self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity()

        self.layer_scale = layer_scale
        if self.layer_scale:
            self.gamma1 = nn.Parameter(layer_scale * torch.ones(hidden_channels), requires_grad=True)
            self.gamma2 = nn.Parameter(layer_scale * torch.ones(out_channels), requires_grad=True)
        self.add_identity = add_identity and in_channels == out_channels

    def forward(self, x):
        if self.layer_scale:
            if self.add_identity:
                x = x + self.drop_path(self.gamma1.unsqueeze(-1).unsqueeze(-1) * self.block(self.norm1(x)))
                x = x + self.drop_path(self.gamma2.unsqueeze(-1).unsqueeze(-1) * self.ffn(self.norm2(x)))
            else:
                x = self.drop_path(self.gamma1.unsqueeze(-1).unsqueeze(-1) * self.block(self.norm1(x)))
                x = self.drop_path(self.gamma2.unsqueeze(-1).unsqueeze(-1) * self.ffn(self.norm2(x)))
        else:
            if self.add_identity:
                x = x + self.drop_path(self.block(self.norm1(x)))
                x = x + self.drop_path(self.ffn(self.norm2(x)))
            else:
                x = self.drop_path(self.block(self.norm1(x)))
                x = self.drop_path(self.ffn(self.norm2(x)))
        return x

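# Reading aid (not part of the original code): PKIBlock is a pre-norm,
# transformer-style block: x + DropPath(gamma1 * InceptionBottleneck(norm1(x)))
# followed by x + DropPath(gamma2 * ConvFFN(norm2(x))), where gamma1/gamma2 are
# per-channel layer-scale parameters and the residuals are only used when
# in_channels == out_channels. Shape-wise, for example:
#   PKIBlock(64, 64)(torch.randn(1, 64, 32, 32)).shape  ->  torch.Size([1, 64, 32, 32])
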
class PKIStage(BaseModule):
    """Poly Kernel Inception Stage"""
    def __init__(
            self,
            in_channels: int,
            out_channels: int,
            num_blocks: int,
            kernel_sizes: Sequence[int] = (3, 5, 7, 9, 11),
            dilations: Sequence[int] = (1, 1, 1, 1, 1),
            expansion: float = 0.5,
            ffn_scale: float = 4.0,
            ffn_kernel_size: int = 3,
            dropout_rate: float = 0.,
            drop_path_rate: Union[float, list] = 0.,
            layer_scale: Optional[float] = 1.0,
            shortcut_with_ffn: bool = True,
            shortcut_ffn_scale: float = 4.0,
            shortcut_ffn_kernel_size: int = 5,
            add_identity: bool = True,
            with_caa: bool = True,
            caa_kernel_size: int = 11,
            norm_cfg: Optional[dict] = dict(type='BN', momentum=0.03, eps=0.001),
            act_cfg: Optional[dict] = dict(type='SiLU'),
            init_cfg: Optional[dict] = None,
    ):
        super().__init__(init_cfg)
        hidden_channels = make_divisible(int(out_channels * expansion), 8)

        self.downsample = DownSamplingLayer(in_channels, out_channels, norm_cfg, act_cfg)

        self.conv1 = ConvModule(out_channels, 2 * hidden_channels, kernel_size=1, stride=1, padding=0, dilation=1,
                                norm_cfg=norm_cfg, act_cfg=act_cfg)
        self.conv2 = ConvModule(2 * hidden_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1,
                                norm_cfg=norm_cfg, act_cfg=act_cfg)
        self.conv3 = ConvModule(out_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1,
                                norm_cfg=norm_cfg, act_cfg=act_cfg)

        self.ffn = ConvFFN(hidden_channels, hidden_channels, shortcut_ffn_scale, shortcut_ffn_kernel_size, 0.,
                           add_identity=True, norm_cfg=None, act_cfg=None) if shortcut_with_ffn else None

        self.blocks = nn.ModuleList([
            PKIBlock(hidden_channels, hidden_channels, kernel_sizes, dilations, with_caa,
                     caa_kernel_size + 2 * i, 1.0, ffn_scale, ffn_kernel_size, dropout_rate,
                     drop_path_rate[i] if isinstance(drop_path_rate, list) else drop_path_rate,
                     layer_scale, add_identity, norm_cfg, act_cfg) for i in range(num_blocks)
        ])

    def forward(self, x):
        x = self.downsample(x)
        # Split into a shortcut branch (x) and a block branch (y).
        x, y = list(self.conv1(x).chunk(2, 1))
        if self.ffn is not None:
            x = self.ffn(x)
        z = [x]
        t = torch.zeros(y.shape, device=y.device)
        for block in self.blocks:
            t = t + block(y)
        z.append(t)
        z = torch.cat(z, dim=1)
        z = self.conv2(z)
        z = self.conv3(z)
        return z

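# Minimal smoke test (my addition, not part of the original blog post or repo code).
# It assumes mmcv and mmengine are installed, and simply checks that one PKIStage
# halves the spatial resolution and maps in_channels -> out_channels.
if __name__ == '__main__':
    stage = PKIStage(in_channels=64, out_channels=128, num_blocks=2)
    dummy = torch.randn(1, 64, 64, 64)
    out = stage(dummy)
    print(out.shape)  # expected: torch.Size([1, 128, 32, 32])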