YOLOv5 Improvement Series (29): Adding the DilateFormer (MSDA) Attention Mechanism (CAS Q1 Top Journal | Plug-and-Play Multi-Scale Dilated Attention)

Date: 2025-04-02 09:13:40
import torch
import torch.nn as nn
from functools import partial
from timm.models.layers import DropPath, to_2tuple, trunc_normal_
class Mlp(nn.Module):
    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x
class DilateAttention(nn.Module):
    "Implementation of Dilate-attention"
    def __init__(self, head_dim, qk_scale=None, attn_drop=0, kernel_size=3, dilation=1):
        super().__init__()
        self.head_dim = head_dim
        self.scale = qk_scale or head_dim ** -0.5
        self.kernel_size = kernel_size
        # Unfold gathers a dilated k*k neighborhood around every query position
        self.unfold = nn.Unfold(kernel_size, dilation, dilation * (kernel_size - 1) // 2, 1)
        self.attn_drop = nn.Dropout(attn_drop)

    def forward(self, q, k, v):
        # q, k, v: B, C//3, H, W
        B, d, H, W = q.shape
        q = q.reshape([B, d // self.head_dim, self.head_dim, 1, H * W]).permute(0, 1, 4, 3, 2)  # B, h, N, 1, d
        k = self.unfold(k).reshape([B, d // self.head_dim, self.head_dim, self.kernel_size * self.kernel_size, H * W]).permute(0, 1, 4, 2, 3)  # B, h, N, d, k*k
        attn = (q @ k) * self.scale  # B, h, N, 1, k*k
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)
        v = self.unfold(v).reshape([B, d // self.head_dim, self.head_dim, self.kernel_size * self.kernel_size, H * W]).permute(0, 1, 4, 3, 2)  # B, h, N, k*k, d
        x = (attn @ v).transpose(1, 2).reshape(B, H, W, d)
        return x
class MultiDilatelocalAttention(nn.Module):
    "Implementation of Multi-Scale Dilated Attention (MSDA)"
    def __init__(self, dim, num_heads=4, qkv_bias=False, qk_scale=None,
                 attn_drop=0., proj_drop=0., kernel_size=3, dilation=[1, 2]):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.dilation = dilation
        self.kernel_size = kernel_size
        self.scale = qk_scale or head_dim ** -0.5
        self.num_dilation = len(dilation)
        assert num_heads % self.num_dilation == 0, f"num_heads ({num_heads}) must be a multiple of num_dilation ({self.num_dilation})"
        self.qkv = nn.Conv2d(dim, dim * 3, 1, bias=qkv_bias)
        # One DilateAttention branch per dilation rate; heads are split evenly across branches
        self.dilate_attention = nn.ModuleList(
            [DilateAttention(head_dim, qk_scale, attn_drop, kernel_size, dilation[i])
             for i in range(self.num_dilation)])
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        x = x.permute(0, 3, 1, 2)  # B, C, H, W
        B, C, H, W = x.shape
        qkv = self.qkv(x).reshape(B, 3, self.num_dilation, C // self.num_dilation, H, W).permute(2, 1, 0, 3, 4, 5)
        # num_dilation, 3, B, C//num_dilation, H, W
        x = x.reshape(B, self.num_dilation, C // self.num_dilation, H, W).permute(1, 0, 3, 4, 2)
        # num_dilation, B, H, W, C//num_dilation
        for i in range(self.num_dilation):
            x[i] = self.dilate_attention[i](qkv[i][0], qkv[i][1], qkv[i][2])  # B, H, W, C//num_dilation
        x = x.permute(1, 2, 3, 0, 4).reshape(B, H, W, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
class DilateBlock(nn.Module):
    "Implementation of Dilate-attention block"
    def __init__(self, dim, num_heads=4, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, kernel_size=3, dilation=[1, 2],
                 cpe_per_block=False):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.mlp_ratio = mlp_ratio
        self.kernel_size = kernel_size
        self.dilation = dilation
        self.cpe_per_block = cpe_per_block
        if self.cpe_per_block:
            self.pos_embed = nn.Conv2d(dim, dim, 3, padding=1, groups=dim)
        self.norm1 = norm_layer(dim)
        self.attn = MultiDilatelocalAttention(dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
                                              attn_drop=attn_drop, kernel_size=kernel_size, dilation=dilation)
        self.drop_path = DropPath(
            drop_path) if drop_path > 0. else nn.Identity()

    def forward(self, x):
        x = x.permute(0, 3, 2, 1)  # to channels-last for LayerNorm and MSDA
        x = x + self.drop_path(self.attn(self.norm1(x)))
        x = x.permute(0, 3, 2, 1)
        # B, C, H, W
        return x
def autopad(k, p=None, d=1):  # kernel, padding, dilation
    # Pad to 'same' shape outputs
    if d > 1:
        k = d * (k - 1) + 1 if isinstance(k, int) else [d * (x - 1) + 1 for x in k]  # actual kernel-size
    if p is None:
        p = k // 2 if isinstance(k, int) else [x // 2 for x in k]  # auto-pad
    return p
class Conv(nn.Module):
    # Standard convolution with args(ch_in, ch_out, kernel, stride, padding, groups, dilation, activation)
    default_act = nn.SiLU()  # default activation

    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True):
        super().__init__()
        self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p, d), groups=g, dilation=d, bias=False)
        self.bn = nn.BatchNorm2d(c2)
        self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity()

    def forward(self, x):
        return self.act(self.bn(self.conv(x)))

    def forward_fuse(self, x):
        return self.act(self.conv(x))
class C3_DilateBlock(nn.Module):
    # CSP Bottleneck with 3 convolutions
    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, number, shortcut, groups, expansion
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c1, c_, 1, 1)
        self.cv3 = Conv(2 * c_, c2, 1)  # optional act=FReLU(c2)
        self.m = nn.Sequential(*(DilateBlock(c_) for _ in range(n)))

    def forward(self, x):
        return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), 1))
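
With all the modules defined, a quick way to verify the integration is a standalone shape check. The sketch below is not from the original article; it assumes the classes above are in scope (for example, pasted into one file or added to models/common.py) and uses arbitrary example tensor sizes. It confirms that C3_DilateBlock preserves the (B, C, H, W) shape of a feature map, which is what makes MSDA plug-and-play in a YOLOv5 backbone, and that MultiDilatelocalAttention maps a channels-last tensor (B, H, W, C) back to the same shape.

# Minimal shape sanity check for the modules above (illustrative sketch, not from the article).
if __name__ == "__main__":
    with torch.no_grad():
        x = torch.randn(1, 64, 32, 32)  # example B, C, H, W feature map from a backbone stage

        # C3_DilateBlock keeps the channels-first layout and spatial size,
        # so it can stand in for a C3 block of the same channel width.
        block = C3_DilateBlock(c1=64, c2=64, n=1)
        print(block(x).shape)  # torch.Size([1, 64, 32, 32])

        # Standalone MSDA takes a channels-last tensor; dim must be divisible by num_heads,
        # and num_heads must be divisible by the number of dilation rates.
        attn = MultiDilatelocalAttention(dim=64, num_heads=4, dilation=[1, 2])
        print(attn(x.permute(0, 2, 3, 1)).shape)  # torch.Size([1, 32, 32, 64])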