YOLOv5 Improvement Series (32): Replacing the Backbone with PKINet (CVPR 2024 | a backbone for oriented remote sensing detection that effectively captures dense texture features across scales)

Posted: 2025-04-02 09:12:29
import math
from typing import Optional, Union, Sequence

import torch
import torch.nn as nn
from torch.nn.modules.batchnorm import _BatchNorm
from mmcv.cnn import ConvModule, build_norm_layer
from mmcv.cnn.bricks import DropPath
from mmengine.model import BaseModule, constant_init
from mmengine.model.weight_init import trunc_normal_init, normal_init
from mmengine.logging import MMLogger

from models.common import C3, Conv

# from mmrotate.models.builder import ROTATED_BACKBONES
# from utils import autopad, make_divisible, BHWC2BCHW, BCHW2BHWC


def autopad(kernel_size: int, padding: int = None, dilation: int = 1):
    """Compute 'same' padding for an odd kernel size, taking dilation into account."""
    assert kernel_size % 2 == 1, 'if use autopad, kernel size must be odd'
    if dilation > 1:
        kernel_size = dilation * (kernel_size - 1) + 1
    if padding is None:
        padding = kernel_size // 2
    return padding

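# Illustrative sanity check (not part of the original code): autopad returns the
# padding that keeps the spatial size unchanged for stride-1 convs with odd kernels.
#   autopad(11)          -> 5   (plain 11x11 kernel)
#   autopad(3, None, 2)  -> 2   (3x3 kernel with dilation 2 has an effective size of 5)
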
def make_divisible(value, divisor, min_value=None, min_ratio=0.9):
    """Make divisible function.

    This function rounds the channel number to the nearest value that can be
    divisible by the divisor. It is taken from the original tf repo. It ensures
    that all layers have a channel number that is divisible by divisor. It can
    be seen here: https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py  # noqa

    Args:
        value (int, float): The original channel number.
        divisor (int): The divisor to fully divide the channel number.
        min_value (int): The minimum value of the output channel.
            Default: None, means that the minimum value equal to the divisor.
        min_ratio (float): The minimum ratio of the rounded channel number to
            the original channel number. Default: 0.9.

    Returns:
        int: The modified output channel number.
    """
    if min_value is None:
        min_value = divisor
    new_value = max(min_value, int(value + divisor / 2) // divisor * divisor)
    # Make sure that round down does not go down by more than (1-min_ratio).
    if new_value < min_ratio * value:
        new_value += divisor
    return new_value

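# Illustrative values (not part of the original code), showing the rounding behaviour:
#   make_divisible(64 * 0.5, 8)  -> 32   (already a multiple of 8)
#   make_divisible(100, 8)       -> 104  (rounds to the nearest multiple of 8)
#   make_divisible(62.5, 8)      -> 64   (64 >= 0.9 * 62.5, so no extra divisor is added)
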
class BCHW2BHWC(nn.Module):
    """Permute a tensor from (B, C, H, W) to (B, H, W, C)."""
    def __init__(self):
        super().__init__()

    @staticmethod
    def forward(x):
        return x.permute([0, 2, 3, 1])


class BHWC2BCHW(nn.Module):
    """Permute a tensor from (B, H, W, C) to (B, C, H, W)."""
    def __init__(self):
        super().__init__()

    @staticmethod
    def forward(x):
        return x.permute([0, 3, 1, 2])

class GSiLU(BaseModule):
    """Global Sigmoid-Gated Linear Unit, reproduced from paper <SIMPLE CNN FOR VISION>"""
    def __init__(self):
        super().__init__()
        self.adpool = nn.AdaptiveAvgPool2d(1)

    def forward(self, x):
        # Gate the features with a sigmoid of their global average.
        return x * torch.sigmoid(self.adpool(x))

class CAA(BaseModule):
    """Context Anchor Attention"""
    def __init__(
            self,
            channels: int,
            h_kernel_size: int = 11,
            v_kernel_size: int = 11,
            norm_cfg: Optional[dict] = dict(type='BN', momentum=0.03, eps=0.001),
            act_cfg: Optional[dict] = dict(type='SiLU'),
            init_cfg: Optional[dict] = None,
    ):
        super().__init__(init_cfg)
        self.avg_pool = nn.AvgPool2d(7, 1, 3)
        self.conv1 = ConvModule(channels, channels, 1, 1, 0,
                                norm_cfg=norm_cfg, act_cfg=act_cfg)
        self.h_conv = ConvModule(channels, channels, (1, h_kernel_size), 1,
                                 (0, h_kernel_size // 2), groups=channels,
                                 norm_cfg=None, act_cfg=None)
        self.v_conv = ConvModule(channels, channels, (v_kernel_size, 1), 1,
                                 (v_kernel_size // 2, 0), groups=channels,
                                 norm_cfg=None, act_cfg=None)
        self.conv2 = ConvModule(channels, channels, 1, 1, 0,
                                norm_cfg=norm_cfg, act_cfg=act_cfg)
        self.act = nn.Sigmoid()

    def forward(self, x):
        # avg pool -> 1x1 conv -> horizontal strip conv -> vertical strip conv -> 1x1 conv -> sigmoid
        attn_factor = self.act(self.conv2(self.v_conv(self.h_conv(self.conv1(self.avg_pool(x))))))
        return attn_factor

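# Illustrative usage (not part of the original code): CAA produces a sigmoid
# attention map with the same shape as its input, which the caller multiplies
# back onto the features (see InceptionBottleneck below). Assuming mmcv is installed:
#   caa = CAA(64)
#   attn = caa(torch.randn(1, 64, 32, 32))   # -> shape (1, 64, 32, 32), values in (0, 1)
#   out = attn * features                    # element-wise gating of the features
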
class ConvFFN(BaseModule):
    """Multi-layer perceptron implemented with ConvModule"""
    def __init__(
            self,
            in_channels: int,
            out_channels: Optional[int] = None,
            hidden_channels_scale: float = 4.0,
            hidden_kernel_size: int = 3,
            dropout_rate: float = 0.,
            add_identity: bool = True,
            norm_cfg: Optional[dict] = dict(type='BN', momentum=0.03, eps=0.001),
            act_cfg: Optional[dict] = dict(type='SiLU'),
            init_cfg: Optional[dict] = None,
    ):
        super().__init__(init_cfg)
        out_channels = out_channels or in_channels
        hidden_channels = int(in_channels * hidden_channels_scale)

        self.ffn_layers = nn.Sequential(
            BCHW2BHWC(),
            nn.LayerNorm(in_channels),
            BHWC2BCHW(),
            ConvModule(in_channels, hidden_channels, kernel_size=1, stride=1, padding=0,
                       norm_cfg=norm_cfg, act_cfg=act_cfg),
            ConvModule(hidden_channels, hidden_channels, kernel_size=hidden_kernel_size, stride=1,
                       padding=hidden_kernel_size // 2, groups=hidden_channels,
                       norm_cfg=norm_cfg, act_cfg=None),
            GSiLU(),
            nn.Dropout(dropout_rate),
            ConvModule(hidden_channels, out_channels, kernel_size=1, stride=1, padding=0,
                       norm_cfg=norm_cfg, act_cfg=act_cfg),
            nn.Dropout(dropout_rate),
        )
        self.add_identity = add_identity

    def forward(self, x):
        x = x + self.ffn_layers(x) if self.add_identity else self.ffn_layers(x)
        return x

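# Reading aid (not part of the original code): ConvFFN is a channel MLP in conv
# form, i.e. LayerNorm (applied over channels via the permute wrappers) ->
# 1x1 expansion -> depthwise conv -> GSiLU gate -> 1x1 projection. With the
# defaults it preserves the input shape, e.g.
#   ConvFFN(64)(torch.randn(1, 64, 32, 32)).shape  ->  torch.Size([1, 64, 32, 32])
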
class Stem(BaseModule):
    """Stem layer"""
    def __init__(
            self,
            in_channels: int,
            out_channels: int,
            expansion: float = 1.0,
            norm_cfg: Optional[dict] = dict(type='BN', momentum=0.03, eps=0.001),
            act_cfg: Optional[dict] = dict(type='SiLU'),
            init_cfg: Optional[dict] = None,
    ):
        super().__init__(init_cfg)
        hidden_channels = make_divisible(int(out_channels * expansion), 8)

        self.down_conv = ConvModule(in_channels, hidden_channels, kernel_size=3, stride=2, padding=1,
                                    norm_cfg=norm_cfg, act_cfg=act_cfg)
        self.conv1 = ConvModule(hidden_channels, hidden_channels, kernel_size=3, stride=1, padding=1,
                                norm_cfg=norm_cfg, act_cfg=act_cfg)
        self.conv2 = ConvModule(hidden_channels, out_channels, kernel_size=3, stride=1, padding=1,
                                norm_cfg=norm_cfg, act_cfg=act_cfg)

    def forward(self, x):
        return self.conv2(self.conv1(self.down_conv(x)))


class DownSamplingLayer(BaseModule):
    """Down sampling layer"""
    def __init__(
            self,
            in_channels: int,
            out_channels: Optional[int] = None,
            norm_cfg: Optional[dict] = dict(type='BN', momentum=0.03, eps=0.001),
            act_cfg: Optional[dict] = dict(type='SiLU'),
            init_cfg: Optional[dict] = None,
    ):
        super().__init__(init_cfg)
        out_channels = out_channels or (in_channels * 2)

        self.down_conv = ConvModule(in_channels, out_channels, kernel_size=3, stride=2, padding=1,
                                    norm_cfg=norm_cfg, act_cfg=act_cfg)

    def forward(self, x):
        return self.down_conv(x)

class InceptionBottleneck(BaseModule):
    """Bottleneck with Inception module"""
    def __init__(
            self,
            in_channels: int,
            out_channels: Optional[int] = None,
            kernel_sizes: Sequence[int] = (3, 5, 7, 9, 11),
            dilations: Sequence[int] = (1, 1, 1, 1, 1),
            expansion: float = 1.0,
            add_identity: bool = True,
            with_caa: bool = True,
            caa_kernel_size: int = 11,
            norm_cfg: Optional[dict] = dict(type='BN', momentum=0.03, eps=0.001),
            act_cfg: Optional[dict] = dict(type='SiLU'),
            init_cfg: Optional[dict] = None,
    ):
        super().__init__(init_cfg)
        out_channels = out_channels or in_channels
        hidden_channels = make_divisible(int(out_channels * expansion), 8)

        self.pre_conv = ConvModule(in_channels, hidden_channels, 1, 1, 0, 1,
                                   norm_cfg=norm_cfg, act_cfg=act_cfg)

        self.dw_conv = ConvModule(hidden_channels, hidden_channels, kernel_sizes[0], 1,
                                  autopad(kernel_sizes[0], None, dilations[0]), dilations[0],
                                  groups=hidden_channels, norm_cfg=None, act_cfg=None)
        self.dw_conv1 = ConvModule(hidden_channels, hidden_channels, kernel_sizes[1], 1,
                                   autopad(kernel_sizes[1], None, dilations[1]), dilations[1],
                                   groups=hidden_channels, norm_cfg=None, act_cfg=None)
        self.dw_conv2 = ConvModule(hidden_channels, hidden_channels, kernel_sizes[2], 1,
                                   autopad(kernel_sizes[2], None, dilations[2]), dilations[2],
                                   groups=hidden_channels, norm_cfg=None, act_cfg=None)
        self.dw_conv3 = ConvModule(hidden_channels, hidden_channels, kernel_sizes[3], 1,
                                   autopad(kernel_sizes[3], None, dilations[3]), dilations[3],
                                   groups=hidden_channels, norm_cfg=None, act_cfg=None)
        self.dw_conv4 = ConvModule(hidden_channels, hidden_channels, kernel_sizes[4], 1,
                                   autopad(kernel_sizes[4], None, dilations[4]), dilations[4],
                                   groups=hidden_channels, norm_cfg=None, act_cfg=None)
        self.pw_conv = ConvModule(hidden_channels, hidden_channels, 1, 1, 0, 1,
                                  norm_cfg=norm_cfg, act_cfg=act_cfg)

        if with_caa:
            self.caa_factor = CAA(hidden_channels, caa_kernel_size, caa_kernel_size, None, None)
        else:
            self.caa_factor = None

        self.add_identity = add_identity and in_channels == out_channels

        self.post_conv = ConvModule(hidden_channels, out_channels, 1, 1, 0, 1,
                                    norm_cfg=norm_cfg, act_cfg=act_cfg)

    def forward(self, x):
        x = self.pre_conv(x)

        y = x  # if there is an inplace operation of x, use y = x.clone() instead of y = x
        x = self.dw_conv(x)
        x = x + self.dw_conv1(x) + self.dw_conv2(x) + self.dw_conv3(x) + self.dw_conv4(x)
        x = self.pw_conv(x)
        if self.caa_factor is not None:
            y = self.caa_factor(y)
        if self.add_identity:
            y = x * y
            x = x + y
        else:
            x = x * y
        x = self.post_conv(x)
        return x

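# Illustrative shape check (not part of the original code). The five depthwise
# convs (3/5/7/9/11 by default) all use 'same' padding, so their outputs can be
# summed element-wise before the CAA attention map reweights the pre-conv features:
#   ib = InceptionBottleneck(64)
#   ib(torch.randn(1, 64, 32, 32)).shape   # -> torch.Size([1, 64, 32, 32])
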
class PKIBlock(BaseModule):
    """Poly Kernel Inception Block"""
    def __init__(
            self,
            in_channels: int,
            out_channels: Optional[int] = None,
            kernel_sizes: Sequence[int] = (3, 5, 7, 9, 11),
            dilations: Sequence[int] = (1, 1, 1, 1, 1),
            with_caa: bool = True,
            caa_kernel_size: int = 11,
            expansion: float = 1.0,
            ffn_scale: float = 4.0,
            ffn_kernel_size: int = 3,
            dropout_rate: float = 0.,
            drop_path_rate: float = 0.,
            layer_scale: Optional[float] = 1.0,
            add_identity: bool = True,
            norm_cfg: Optional[dict] = dict(type='BN', momentum=0.03, eps=0.001),
            act_cfg: Optional[dict] = dict(type='SiLU'),
            init_cfg: Optional[dict] = None,
    ):
        super().__init__(init_cfg)
        out_channels = out_channels or in_channels
        hidden_channels = make_divisible(int(out_channels * expansion), 8)

        if norm_cfg is not None:
            self.norm1 = build_norm_layer(norm_cfg, in_channels)[1]
            self.norm2 = build_norm_layer(norm_cfg, hidden_channels)[1]
        else:
            self.norm1 = nn.BatchNorm2d(in_channels)
            self.norm2 = nn.BatchNorm2d(hidden_channels)

        self.block = InceptionBottleneck(in_channels, hidden_channels, kernel_sizes, dilations,
                                         expansion=1.0, add_identity=True,
                                         with_caa=with_caa, caa_kernel_size=caa_kernel_size,
                                         norm_cfg=norm_cfg, act_cfg=act_cfg)
        self.ffn = ConvFFN(hidden_channels, out_channels, ffn_scale, ffn_kernel_size, dropout_rate,
                           add_identity=False, norm_cfg=None, act_cfg=None)
        self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity()

        self.layer_scale = layer_scale
        if self.layer_scale:
            self.gamma1 = nn.Parameter(layer_scale * torch.ones(hidden_channels), requires_grad=True)
            self.gamma2 = nn.Parameter(layer_scale * torch.ones(out_channels), requires_grad=True)
        self.add_identity = add_identity and in_channels == out_channels

    def forward(self, x):
        if self.layer_scale:
            if self.add_identity:
                x = x + self.drop_path(self.gamma1.unsqueeze(-1).unsqueeze(-1) * self.block(self.norm1(x)))
                x = x + self.drop_path(self.gamma2.unsqueeze(-1).unsqueeze(-1) * self.ffn(self.norm2(x)))
            else:
                x = self.drop_path(self.gamma1.unsqueeze(-1).unsqueeze(-1) * self.block(self.norm1(x)))
                x = self.drop_path(self.gamma2.unsqueeze(-1).unsqueeze(-1) * self.ffn(self.norm2(x)))
        else:
            if self.add_identity:
                x = x + self.drop_path(self.block(self.norm1(x)))
                x = x + self.drop_path(self.ffn(self.norm2(x)))
            else:
                x = self.drop_path(self.block(self.norm1(x)))
                x = self.drop_path(self.ffn(self.norm2(x)))
        return x

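# Reading aid (not part of the original code): PKIBlock is a pre-norm,
# transformer-style block: x + DropPath(gamma1 * InceptionBottleneck(norm1(x)))
# followed by x + DropPath(gamma2 * ConvFFN(norm2(x))), where gamma1/gamma2 are
# per-channel layer-scale parameters and the residuals are only used when
# in_channels == out_channels. Shape-wise, for example:
#   PKIBlock(64, 64)(torch.randn(1, 64, 32, 32)).shape  ->  torch.Size([1, 64, 32, 32])
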
class PKIStage(BaseModule):
    """Poly Kernel Inception Stage"""
    def __init__(
            self,
            in_channels: int,
            out_channels: int,
            num_blocks: int,
            kernel_sizes: Sequence[int] = (3, 5, 7, 9, 11),
            dilations: Sequence[int] = (1, 1, 1, 1, 1),
            expansion: float = 0.5,
            ffn_scale: float = 4.0,
            ffn_kernel_size: int = 3,
            dropout_rate: float = 0.,
            drop_path_rate: Union[float, list] = 0.,
            layer_scale: Optional[float] = 1.0,
            shortcut_with_ffn: bool = True,
            shortcut_ffn_scale: float = 4.0,
            shortcut_ffn_kernel_size: int = 5,
            add_identity: bool = True,
            with_caa: bool = True,
            caa_kernel_size: int = 11,
            norm_cfg: Optional[dict] = dict(type='BN', momentum=0.03, eps=0.001),
            act_cfg: Optional[dict] = dict(type='SiLU'),
            init_cfg: Optional[dict] = None,
    ):
        super().__init__(init_cfg)
        hidden_channels = make_divisible(int(out_channels * expansion), 8)

        self.downsample = DownSamplingLayer(in_channels, out_channels, norm_cfg, act_cfg)

        self.conv1 = ConvModule(out_channels, 2 * hidden_channels, kernel_size=1, stride=1, padding=0, dilation=1,
                                norm_cfg=norm_cfg, act_cfg=act_cfg)
        self.conv2 = ConvModule(2 * hidden_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1,
                                norm_cfg=norm_cfg, act_cfg=act_cfg)
        self.conv3 = ConvModule(out_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1,
                                norm_cfg=norm_cfg, act_cfg=act_cfg)

        self.ffn = ConvFFN(hidden_channels, hidden_channels, shortcut_ffn_scale, shortcut_ffn_kernel_size, 0.,
                           add_identity=True, norm_cfg=None, act_cfg=None) if shortcut_with_ffn else None

        self.blocks = nn.ModuleList([
            PKIBlock(hidden_channels, hidden_channels, kernel_sizes, dilations, with_caa,
                     caa_kernel_size + 2 * i, 1.0, ffn_scale, ffn_kernel_size, dropout_rate,
                     drop_path_rate[i] if isinstance(drop_path_rate, list) else drop_path_rate,
                     layer_scale, add_identity, norm_cfg, act_cfg) for i in range(num_blocks)
        ])

    def forward(self, x):
        x = self.downsample(x)
        # Split into a shortcut branch (x) and a block branch (y).
        x, y = list(self.conv1(x).chunk(2, 1))
        if self.ffn is not None:
            x = self.ffn(x)
        z = [x]
        t = torch.zeros(y.shape, device=y.device)
        for block in self.blocks:
            t = t + block(y)
        z.append(t)
        z = torch.cat(z, dim=1)
        z = self.conv2(z)
        z = self.conv3(z)
        return z

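# Minimal smoke test (my addition, not part of the original blog post or repo code).
# It assumes mmcv and mmengine are installed, and simply checks that one PKIStage
# halves the spatial resolution and maps in_channels -> out_channels.
if __name__ == '__main__':
    stage = PKIStage(in_channels=64, out_channels=128, num_blocks=2)
    dummy = torch.randn(1, 64, 64, 64)
    out = stage(dummy)
    print(out.shape)  # expected: torch.Size([1, 128, 32, 32])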