当前位置：首页 > news >正文

精读双模态目标检测论文系列三｜恶劣环境下的PE-Det创新全解析（附可运行代码 + 二次顶刊创新思路）

news 2026/6/7 22:04:47

大家好，这里是双模态遥感目标检测精读系列第三篇！本期精读 Expert Systems With Applications（2026，IF≈8.5+）顶刊论文 PE-Det，聚焦红外-可见光（IR-VIS）双模态目标检测，从论文创新、模块拆解、可运行代码到顶刊二次创新思路全覆盖，适合科研党、算法工程师直接复用～

论文标题如下：

PE-Det: Prior-Guided visible preconditioning and routed expert fusion for robust infrared-visible object detection

论文代码如下：

https://github.com/601140736/PE-Det

PE-Det 是一篇发表于Expert Systems With Applications（2026，IF≈8.5+）的红外 - 可见光（IR-VIS）双模态目标检测论文，聚焦恶劣环境下可见光严重退化（低光照、雾霾散射、对比度崩塌）导致的跨模态不一致、固定融合策略失效问题，提出退化感知自适应融合框架，通过先验引导可见光预处理（PVP）、多尺度动态专家融合（MDE）、跨尺度特征聚合颈部（GS-SSFF）、核心聚焦边界框回归损失（CFI‑MPD‑IoU）四大协同模块，系统性解决退化引发的误差传播问题。

论文在 FLIR、M3FD 两大权威双模态数据集上全面超越 YOLOv8 (Dual)、SLF‑YOLO (Dual) 及十余种主流融合检测器，在 mAP@0.5:0.95（严格定位指标）上提升显著；跨数据集泛化实验验证了其对未知退化与域偏移的鲁棒性。

创新点一：先验引导可见光预处理模块（PVP）

该创新点首次提出面向红外 - 可见光检测的不对称可微分预处理机制，仅针对易退化的可见光模态进行增强处理，保持红外模态原始稳定的热特征不变，通过集成物理驱动的逆散射去雾、递归式光照曲线校正与可学习拉普拉斯边缘增强三类先验算子，以残差耦合方式逐级稳定可见光图像的光度统计特性并强化结构相关特征，从输入源头降低退化引发的跨模态分布差异，为后续模态交互提供更可靠的特征基础，区别于传统视觉增强与检测任务脱节、双模态同步处理的低效方式，实现检测导向的精准预处理。

import torch
import torch.nn as nn
import torch.nn.functional as F


# ---- Model: low-light enhancement ----
class Low_enhance_net(nn.Module):
    """Predict eight single-channel curve maps for iterative light correction.

    Three 3x3 convolutions (stride 1, padding 1, so H and W are preserved)
    map the input to 8 channels, which are returned as a list of eight
    ``(B, 1, H, W)`` tensors consumed by :func:`low_enhance_feature`.
    """

    def __init__(self, in_channels):
        super(Low_enhance_net, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, 4, kernel_size=3, padding=1, stride=1)
        self.conv2 = nn.Conv2d(4, 8, kernel_size=3, padding=1, stride=1)
        self.conv5 = nn.Conv2d(8, 8, kernel_size=3, padding=1, stride=1)
        self.leaky_relu = nn.LeakyReLU()

    def forward(self, x):
        x = self.leaky_relu(self.conv1(x))
        x = self.leaky_relu(self.conv2(x))
        x = self.leaky_relu(self.conv5(x))
        # One curve map per output channel (8 in total).
        return list(torch.split(x, 1, dim=1))


def low_enhance_feature(low_light_image, r):
    """Apply the curve maps in ``r`` to ``low_light_image`` one after another.

    Each step computes ``x + sigmoid(r_i) * (x**2 - x + 1e-6)`` and clamps the
    result to ``[0, 1]``. Every ``r_i`` must broadcast against the image.

    NOTE(review): for x in [0, 1] the term ``x**2 - x`` is <= 0 and the
    sigmoid gate is positive, so each step can only lower (or keep) pixel
    values. Zero-DCE style curves use a signed gate (tanh) so that they can
    also brighten -- confirm that the sigmoid here is intended.
    """
    for r_it in r:
        # Squash the raw curve map into (0, 1).
        r_it = torch.sigmoid(r_it)
        # The 1e-6 offset is kept from the original implementation.
        low_light_image = low_light_image + r_it * (
            torch.pow(low_light_image, 2) - low_light_image + 1e-6
        )
        # Keep the intermediate image inside the valid intensity range.
        low_light_image = torch.clamp(low_light_image, min=0.0, max=1.0)
    return low_light_image


# ---- Model: predict the transmission map and the atmospheric light ----
class DehazeNet(nn.Module):
    """Estimate a transmission map and an atmospheric-light value.

    Returns ``(transmission, A)`` where ``transmission`` is ``(B, 1, H, W)``
    in (0, 1) and ``A`` is ``(B, 1)`` in (0, 1), i.e. a single value per
    sample shared by all colour channels.
    """

    def __init__(self, in_channel):
        super(DehazeNet, self).__init__()
        # Convolutional feature extractor.
        self.conv1 = nn.Conv2d(in_channel, 4, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(4, 8, kernel_size=3, padding=1)
        self.conv5 = nn.Conv2d(8, 1, kernel_size=3, padding=1)  # 1 channel: transmission map
        # Fully connected head that estimates the atmospheric light.
        self.fc1 = nn.Linear(8, 128)
        self.fc2 = nn.Linear(128, 1)

    def forward(self, x):
        # Functional LeakyReLU (default slope 0.01) avoids building a new
        # nn.LeakyReLU module on every forward pass.
        x = F.leaky_relu(self.conv1(x))
        x = F.leaky_relu(self.conv2(x))
        transmission = torch.sigmoid(self.conv5(x))

        # Global average pooling of the 8-channel features -> (B, 8).
        pooled = F.adaptive_avg_pool2d(x, (1, 1))
        pooled = pooled.view(pooled.size(0), -1)
        A = torch.sigmoid(self.fc1(pooled))
        A = torch.sigmoid(self.fc2(A))
        return transmission, A


# ---- Dehazing with the atmospheric scattering model ----
def dehaze_feature(hazy_image, transmission, A, t0=0.01):
    """Invert the scattering model: ``J = (I - A) / max(t, t0) + A``.

    Args:
        hazy_image: image tensor of shape ``(B, C, H, W)``.
        transmission: transmission map that broadcasts against ``hazy_image``.
        A: atmospheric light with a leading batch dimension; it is reshaped to
            ``(B, k, 1, 1)`` and expanded to the image shape, so ``k`` must be
            1 or ``C``.
        t0: lower bound on the transmission, which avoids division by zero.

    The output is intentionally not clamped to ``[0, 1]``.
    """
    transmission = torch.clamp(transmission, min=t0)
    # reshape (rather than view with A.size(1)) also accepts 1-D or
    # non-contiguous A.
    A = A.reshape(A.size(0), -1, 1, 1).expand_as(hazy_image)
    return (hazy_image - A) / transmission + A


class IN_poir(nn.Module):
    """Asymmetric input pre-processing: enhance the visible image only.

    The input is split into its first 3 channels (``vi``) and the remaining
    channels (``ir``). ``vi`` is blended with its dehazed version using ``w1``
    and then with its light-corrected version using ``w2``; ``ir`` is passed
    through unchanged and concatenated back.

    NOTE(review): ``w1`` and ``w2`` are unconstrained and drawn from
    ``randn``, so a blend weight may start outside ``[0, 1]`` -- confirm this
    is intended.
    """

    def __init__(self):
        super(IN_poir, self).__init__()
        self.L_N = Low_enhance_net(in_channels=3)
        self.F_N = DehazeNet(in_channel=3)
        self.w1 = nn.Parameter(torch.randn(1))
        self.w2 = nn.Parameter(torch.randn(1))

    def forward(self, x):
        vi, ir = x[:, :3, :, :], x[:, 3:, :, :]
        # Both sets of priors are predicted from the raw visible image.
        r = self.L_N(vi)
        t, a = self.F_N(vi)
        vi = vi * (1 - self.w1) + self.w1 * dehaze_feature(vi, t, a)
        vi = vi * (1 - self.w2) + self.w2 * low_enhance_feature(vi, r)
        return torch.cat([vi, ir], dim=1)

创新点二：多尺度动态专家融合模块（MDE）

这个创新点为跨模态特征融合打开了新的思路：将几个经典的融合模块组合成一个“专家池”，再由网络自适应地选择使用哪一个“专家融合模块”。

该创新点首次构建基于金字塔层级的路由式专家融合机制，针对不同场景与目标尺度下模态可靠性差异大、固定融合策略失效的问题，设计包含全局令牌融合、跨模态交互、掩码引导局部互补与直接拼接四类互补融合专家的共享池，在 P2 至 P5 每个特征金字塔层级独立计算跨模态差异并生成路由决策，采用硬 Top-1 选择机制动态激活当前尺度最优的融合专家，使模型能够根据场景复杂度与尺度依赖的模态差异自适应调整融合行为，突破传统单一融合方式无法适配全场景与全尺度目标的局限，实现精细化的退化感知自适应融合。

# YOLOv5 common modules
import math
from copy import copy
from pathlib import Path

import numpy as np
import pandas as pd
import requests
import torch
import torch.nn as nn
from PIL import Image
from torch.cuda import amp
import torch.nn.functional as F
from torch.nn import init, Sequential


def autopad(k, p=None):  # kernel, padding
    """Return ``p`` if given, otherwise the 'same' padding ``k // 2`` (per element for a sequence)."""
    if p is None:
        p = k // 2 if isinstance(k, int) else [x // 2 for x in k]  # auto-pad
    return p


def DWConv(c1, c2, k=1, s=1, act=True):
    """Depthwise convolution: a ``Conv`` with ``groups = gcd(c1, c2)``."""
    return Conv(c1, c2, k, s, g=math.gcd(c1, c2), act=act)


class Conv(nn.Module):
    """Standard convolution block: Conv2d (no bias) -> BatchNorm2d -> activation."""

    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):  # ch_in, ch_out, kernel, stride, padding, groups
        super(Conv, self).__init__()
        self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=False)
        self.bn = nn.BatchNorm2d(c2)
        # act=True -> SiLU; an nn.Module is used as-is; anything else -> identity.
        self.act = nn.SiLU() if act is True else (act if isinstance(act, nn.Module) else nn.Identity())

    def forward(self, x):
        return self.act(self.bn(self.conv(x)))

    def fuseforward(self, x):
        """Forward pass that skips the BatchNorm layer."""
        return self.act(self.conv(x))


class TransformerLayer(nn.Module):
    """Transformer layer https://arxiv.org/abs/2010.11929 (LayerNorm layers removed for better performance)."""

    def __init__(self, c, num_heads):
        super().__init__()
        self.q = nn.Linear(c, c, bias=False)
        self.k = nn.Linear(c, c, bias=False)
        self.v = nn.Linear(c, c, bias=False)
        self.ma = nn.MultiheadAttention(embed_dim=c, num_heads=num_heads)
        self.fc1 = nn.Linear(c, c, bias=False)
        self.fc2 = nn.Linear(c, c, bias=False)

    def forward(self, x):
        x = self.ma(self.q(x), self.k(x), self.v(x))[0] + x  # attention + residual
        x = self.fc2(self.fc1(x)) + x  # feed-forward + residual
        return x


class TransformerBlock(nn.Module):
    """Vision Transformer block https://arxiv.org/abs/2010.11929 applied to a feature map."""

    def __init__(self, c1, c2, num_heads, num_layers):
        super().__init__()
        self.conv = None
        if c1 != c2:
            self.conv = Conv(c1, c2)
        self.linear = nn.Linear(c2, c2)  # learnable position embedding
        self.tr = nn.Sequential(*[TransformerLayer(c2, num_heads) for _ in range(num_layers)])
        self.c2 = c2

    def forward(self, x):
        if self.conv is not None:
            x = self.conv(x)
        b, _, w, h = x.shape
        # (b, c, w, h) -> (w*h, b, c): sequence-first layout for MultiheadAttention.
        p = x.flatten(2)
        p = p.unsqueeze(0)
        p = p.transpose(0, 3)
        p = p.squeeze(3)
        e = self.linear(p)
        x = p + e

        x = self.tr(x)
        # (w*h, b, c) -> (b, c, w, h)
        x = x.unsqueeze(3)
        x = x.transpose(0, 3)
        x = x.reshape(b, self.c2, w, h)
        return x


class Bottleneck(nn.Module):
    """Standard bottleneck: 1x1 conv -> 3x3 conv, with an optional residual add."""

    def __init__(self, c1, c2, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, shortcut, groups, expansion
        super(Bottleneck, self).__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c_, c2, 3, 1, g=g)
        self.add = shortcut and c1 == c2  # residual only when shapes match

    def forward(self, x):
        return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))


class BottleneckCSP(nn.Module):
    """CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks"""

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, number, shortcut, groups, expansion
        super(BottleneckCSP, self).__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = nn.Conv2d(c1, c_, 1, 1, bias=False)
        self.cv3 = nn.Conv2d(c_, c_, 1, 1, bias=False)
        self.cv4 = Conv(2 * c_, c2, 1, 1)
        self.bn = nn.BatchNorm2d(2 * c_)  # applied to cat(cv2, cv3)
        self.act = nn.LeakyReLU(0.1, inplace=True)
        self.m = nn.Sequential(*[Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)])

    def forward(self, x):
        y1 = self.cv3(self.m(self.cv1(x)))
        y2 = self.cv2(x)
        return self.cv4(self.act(self.bn(torch.cat((y1, y2), dim=1))))


class C3(nn.Module):
    """CSP Bottleneck with 3 convolutions."""

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, number, shortcut, groups, expansion
        super(C3, self).__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c1, c_, 1, 1)
        self.cv3 = Conv(2 * c_, c2, 1)
        self.m = nn.Sequential(*[Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)])

    def forward(self, x):
        return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), dim=1))


class C3TR(C3):
    """C3 module whose inner stack is a TransformerBlock with 4 heads."""

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
        super().__init__(c1, c2, n, shortcut, g, e)
        c_ = int(c2 * e)
        self.m = TransformerBlock(c_, c_, 4, n)


class SPP(nn.Module):
    """Spatial pyramid pooling layer used in YOLOv3-SPP."""

    def __init__(self, c1, c2, k=(5, 9, 13)):
        super(SPP, self).__init__()
        c_ = c1 // 2  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c_ * (len(k) + 1), c2, 1, 1)
        self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k])

    def forward(self, x):
        x = self.cv1(x)
        return self.cv2(torch.cat([x] + [m(x) for m in self.m], 1))


class Focus(nn.Module):
    """Focus wh information into c-space: x(b,c,w,h) -> y(b,4c,w/2,h/2) -> Conv."""

    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):  # ch_in, ch_out, kernel, stride, padding, groups
        super(Focus, self).__init__()
        self.conv = Conv(c1 * 4, c2, k, s, p, g, act)

    def forward(self, x):
        # Stack the four pixel-interleaved sub-images along the channel axis.
        return self.conv(torch.cat([x[..., ::2, ::2], x[..., 1::2, ::2], x[..., ::2, 1::2], x[..., 1::2, 1::2]], 1))


class Contract(nn.Module):
    """Contract width-height into channels, i.e. x(1,64,80,80) to x(1,256,40,40)."""

    def __init__(self, gain=2):
        super().__init__()
        self.gain = gain

    def forward(self, x):
        N, C, H, W = x.size()  # H and W are expected to be divisible by gain
        s = self.gain
        x = x.view(N, C, H // s, s, W // s, s)  # x(1,64,40,2,40,2)
        x = x.permute(0, 3, 5, 1, 2, 4).contiguous()  # x(1,2,2,64,40,40)
        return x.view(N, C * s * s, H // s, W // s)  # x(1,256,40,40)


class Expand(nn.Module):
    """Expand channels into width-height, i.e. x(1,64,80,80) to x(1,16,160,160)."""

    def __init__(self, gain=2):
        super().__init__()
        self.gain = gain

    def forward(self, x):
        N, C, H, W = x.size()  # C is expected to be divisible by gain ** 2
        s = self.gain
        x = x.view(N, s, s, C // s ** 2, H, W)  # x(1,2,2,16,80,80)
        x = x.permute(0, 3, 4, 1, 5, 2).contiguous()  # x(1,16,80,2,80,2)
        return x.view(N, C // s ** 2, H * s, W * s)  # x(1,16,160,160)


class Concat(nn.Module):
    """Concatenate a list of tensors along a dimension."""

    def __init__(self, dimension=1):
        super(Concat, self).__init__()
        self.d = dimension

    def forward(self, x):
        return torch.cat(x, self.d)


class Add(nn.Module):
    """Add two tensors: ``x[0] + x[1]``. ``arg`` is stored but not used in forward."""

    def __init__(self, arg):
        super(Add, self).__init__()
        self.arg = arg

    def forward(self, x):
        return torch.add(x[0], x[1])


class Add2(nn.Module):
    """Return ``x[0] + x[1][index]``, where ``index`` is 0 or 1. ``c1`` is unused."""

    def __init__(self, c1, index):
        super().__init__()
        self.index = index

    def forward(self, x):
        if self.index == 0:
            return torch.add(x[0], x[1][0])
        if self.index == 1:
            return torch.add(x[0], x[1][1])
        # Previously this fell through and silently returned None.
        raise ValueError(f"Add2 index must be 0 or 1, got {self.index!r}")


class Classify(nn.Module):
    """Classification head, i.e. x(b,c1,20,20) to x(b,c2)."""

    def __init__(self, c1, c2, k=1, s=1, p=None, g=1):  # ch_in, ch_out, kernel, stride, padding, groups
        super(Classify, self).__init__()
        self.aap = nn.AdaptiveAvgPool2d(1)  # to x(b,c1,1,1)
        self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g)  # to x(b,c2,1,1)
        self.flat = nn.Flatten()

    def forward(self, x):
        z = torch.cat([self.aap(y) for y in (x if isinstance(x, list) else [x])], 1)  # cat if list
        return self.flat(self.conv(z))  # flatten to x(b,c2)


class SelfAttention(nn.Module):
    """
    Multi-head masked self-attention layer
    """

    def __init__(self, d_model, d_k, d_v, h, attn_pdrop=.1, resid_pdrop=.1):
        '''
        :param d_model: Output dimensionality of the model
        :param d_k: Dimensionality of queries and keys (only checked for
            divisibility by h; the per-head size actually used is d_model // h)
        :param d_v: Dimensionality of values (not used; d_model // h is used)
        :param h: Number of heads
        '''
        super(SelfAttention, self).__init__()
        assert d_k % h == 0
        self.d_model = d_model
        self.d_k = d_model // h
        self.d_v = d_model // h
        self.h = h

        # key, query, value projections for all heads
        self.que_proj = nn.Linear(d_model, h * self.d_k)  # query projection
        self.key_proj = nn.Linear(d_model, h * self.d_k)  # key projection
        self.val_proj = nn.Linear(d_model, h * self.d_v)  # value projection
        self.out_proj = nn.Linear(h * self.d_v, d_model)  # output projection

        # regularization
        self.attn_drop = nn.Dropout(attn_pdrop)
        self.resid_drop = nn.Dropout(resid_pdrop)

        self.init_weights()

    def init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                init.kaiming_normal_(m.weight, mode='fan_out')
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                init.constant_(m.weight, 1)
                init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                init.normal_(m.weight, std=0.001)
                if m.bias is not None:
                    init.constant_(m.bias, 0)

    def forward(self, x, attention_mask=None, attention_weights=None):
        '''
        Computes Self-Attention
        Args:
            x (tensor): input (token) dim:(b_s, nx, c),
                b_s means batch size
                nx means length, for CNN, equals H*W, i.e. the length of feature maps
                c means channel, i.e. the channel of feature maps
            attention_mask: Mask over attention values (b_s, h, nq, nk). True indicates masking.
            attention_weights: Multiplicative weights for attention values (b_s, h, nq, nk).
        Return:
            output (tensor): dim:(b_s, nx, c)
        '''
        b_s, nq = x.shape[:2]
        nk = x.shape[1]
        q = self.que_proj(x).view(b_s, nq, self.h, self.d_k).permute(0, 2, 1, 3)  # (b_s, h, nq, d_k)
        k = self.key_proj(x).view(b_s, nk, self.h, self.d_k).permute(0, 2, 3, 1)  # (b_s, h, d_k, nk) K^T
        v = self.val_proj(x).view(b_s, nk, self.h, self.d_v).permute(0, 2, 1, 3)  # (b_s, h, nk, d_v)

        # Attention(Q, K, V) = Softmax(Q K^T / sqrt(d_k)) V
        att = torch.matmul(q, k) / np.sqrt(self.d_k)  # (b_s, h, nq, nk)

        # weight and mask
        if attention_weights is not None:
            att = att * attention_weights
        if attention_mask is not None:
            att = att.masked_fill(attention_mask, -np.inf)

        # get attention matrix
        att = torch.softmax(att, -1)
        att = self.attn_drop(att)

        # output
        out = torch.matmul(att, v).permute(0, 2, 1, 3).contiguous().view(b_s, nq, self.h * self.d_v)  # (b_s, nq, h*d_v)
        out = self.resid_drop(self.out_proj(out))  # (b_s, nq, d_model)
        return out


class myTransformerBlock(nn.Module):
    """ Transformer block """

    def __init__(self, d_model, d_k, d_v, h, block_exp, attn_pdrop, resid_pdrop):
        """
        :param d_model: Output dimensionality of the model
        :param d_k: Dimensionality of queries and keys
        :param d_v: Dimensionality of values
        :param h: Number of heads
        :param block_exp: Expansion factor for MLP (feed forward network)
        """
        super().__init__()
        self.ln_input = nn.LayerNorm(d_model)
        self.ln_output = nn.LayerNorm(d_model)
        self.sa = SelfAttention(d_model, d_k, d_v, h, attn_pdrop, resid_pdrop)
        self.mlp = nn.Sequential(
            nn.Linear(d_model, block_exp * d_model),
            nn.GELU(),
            nn.Linear(block_exp * d_model, d_model),
            nn.Dropout(resid_pdrop),
        )

    def forward(self, x):
        bs, nx, c = x.size()  # values unused; the unpack requires a 3-D input

        x = x + self.sa(self.ln_input(x))  # pre-norm attention + residual
        x = x + self.mlp(self.ln_output(x))  # pre-norm MLP + residual
        return x


class GPT(nn.Module):
    """Cross-modal token fusion: both feature maps are pooled to a fixed
    anchor grid, processed jointly by a transformer stack, upsampled back to
    the input size and concatenated along the channel axis."""

    def __init__(self, d_model, h=8, block_exp=4,
                 n_layer=8, vert_anchors=8, horz_anchors=8,
                 embd_pdrop=0.1, attn_pdrop=0.1, resid_pdrop=0.1):
        super().__init__()

        self.n_embd = d_model
        self.vert_anchors = vert_anchors
        self.horz_anchors = horz_anchors

        d_k = d_model
        d_v = d_model

        # positional embedding parameter (learnable), rgb_fea + ir_fea
        self.pos_emb = nn.Parameter(torch.zeros(1, 2 * vert_anchors * horz_anchors, self.n_embd))

        # transformer
        self.trans_blocks = nn.Sequential(*[myTransformerBlock(d_model, d_k, d_v, h, block_exp, attn_pdrop, resid_pdrop)
                                            for layer in range(n_layer)])

        # decoder head
        self.ln_f = nn.LayerNorm(self.n_embd)

        # regularization
        self.drop = nn.Dropout(embd_pdrop)

        # avgpool
        self.avgpool = nn.AdaptiveAvgPool2d((self.vert_anchors, self.horz_anchors))

        # init weights
        self.apply(self._init_weights)

    @staticmethod
    def _init_weights(module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, x):
        """
        Args:
            x: pair ``(rgb_fea, ir_fea)``, each of dim (B, C, H, W)
        Returns:
            tensor of dim (B, 2*C, H, W): cat of the fused rgb and ir maps
        """
        rgb_fea = x[0]  # rgb_fea (tensor): dim:(B, C, H, W)
        ir_fea = x[1]  # ir_fea (tensor): dim:(B, C, H, W)
        assert rgb_fea.shape[0] == ir_fea.shape[0]
        bs, c, h, w = rgb_fea.shape

        # AvgPooling reduces the token count because attention is expensive.
        rgb_fea = self.avgpool(rgb_fea)
        ir_fea = self.avgpool(ir_fea)

        # Flatten both maps and stack them along the token dimension.
        rgb_fea_flat = rgb_fea.view(bs, c, -1)
        ir_fea_flat = ir_fea.view(bs, c, -1)
        token_embeddings = torch.cat([rgb_fea_flat, ir_fea_flat], dim=2)
        token_embeddings = token_embeddings.permute(0, 2, 1).contiguous()  # dim:(B, 2*H*W, C)

        # transformer
        x = self.drop(self.pos_emb + token_embeddings)  # sum positional embedding and token dim:(B, 2*H*W, C)
        x = self.trans_blocks(x)  # dim:(B, 2*H*W, C)

        # decoder head
        x = self.ln_f(x)  # dim:(B, 2*H*W, C)
        x = x.view(bs, 2, self.vert_anchors, self.horz_anchors, self.n_embd)
        x = x.permute(0, 1, 4, 2, 3)  # dim:(B, 2, C, H, W)

        # Open question from the original author: would a learned mapping be
        # more reasonable than slicing the two halves apart like this?
        rgb_fea_out = x[:, 0, :, :, :].contiguous().view(bs, self.n_embd, self.vert_anchors, self.horz_anchors)
        ir_fea_out = x[:, 1, :, :, :].contiguous().view(bs, self.n_embd, self.vert_anchors, self.horz_anchors)

        # Interpolate (upsample) back to the input resolution.
        rgb_fea_out = F.interpolate(rgb_fea_out, size=([h, w]), mode='bilinear')
        ir_fea_out = F.interpolate(ir_fea_out, size=([h, w]), mode='bilinear')

        return torch.cat([rgb_fea_out, ir_fea_out], dim=1)


class fusion(nn.Module):
    """Mask-guided fusion of two feature maps followed by an SE block.

    NOTE(review): ``SE_Block`` is not defined in this file view; it must be
    provided elsewhere.
    """

    def __init__(self, channel, reduction=16):
        super().__init__()
        self.channel = channel
        self.mask_map_r = nn.Conv2d(channel, 1, 1, 1, 0, bias=True)
        self.mask_map_i = nn.Conv2d(channel, 1, 1, 1, 0, bias=True)
        self.softmax = nn.Softmax(-1)  # not used in forward
        self.bottleneck1 = nn.Conv2d(channel, channel, 3, 1, 1, bias=False)
        self.bottleneck2 = nn.Conv2d(channel, channel, 3, 1, 1, bias=False)
        self.se = SE_Block(channel * 2, reduction)

    def forward(self, x):
        x_left_ori, x_right_ori = x[0], x[1]
        x_left, x_right = x_left_ori * 0.5, x_right_ori * 0.5

        # A 1-channel mask from a 1x1 conv re-weights each half-scaled input.
        x_mask_left = torch.mul(self.mask_map_r(x_left), x_left)
        x_mask_right = torch.mul(self.mask_map_i(x_right), x_right)

        out_IR = self.bottleneck1(x_mask_right + x_right_ori)
        out_RGB = self.bottleneck2(x_mask_left + x_left_ori)
        out = self.se(torch.cat([out_RGB, out_IR], 1))
        return out


# ---------- Fusion module 1: CVCI ----------
class CVCI(nn.Module):
    """Cross-modal interaction: each branch is patch-embedded and refined by
    ``CDAM_Block`` layers that also receive the other branch's embedding.

    Only ``in_chans``, ``embed_dims``, ``img_size``, ``num_heads[1]``,
    ``mlp_ratios[1]``, ``qkv_bias``, ``qk_scale``, ``drop_rate``,
    ``attn_drop_rate``, ``drop_path_rate``, ``depths``, ``qk_ratio`` and
    ``sr_ratios[1]`` are read; the other parameters are accepted but unused.
    Sequence defaults are tuples (they are only indexed and summed) so that no
    mutable default is shared between instances.

    NOTE(review): ``PatchEmbed`` and ``CDAM_Block`` are not defined in this
    file view; they must be provided elsewhere.
    """

    def __init__(self, in_chans=3, embed_dims=32, img_size=16, num_classes=1000, stem_channel=16, fc_dim=1280,
                 num_heads=(1, 2), mlp_ratios=(3.6, 3.6), qkv_bias=True, qk_scale=None, representation_size=None,
                 drop_rate=0., attn_drop_rate=0., drop_path_rate=0., hybrid_backbone=None, norm_layer=None,
                 depths=(0, 1), qk_ratio=1, sr_ratios=(8, 4), dp=0.1):
        super().__init__()
        # Optional relative-position tensors; empty by default, in which case
        # the blocks are called without one.
        self.out_dict = {}

        # -------------------- ir transformer --------------------
        self.ir_patch_embed_b = PatchEmbed(
            img_size=img_size, patch_size=1, in_chans=in_chans, embed_dim=embed_dims)
        ir_dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule
        ir_cur = 0
        ir_cur += depths[0]  # stage "a" is disabled; skip its drop-path slots
        self.ir_blocks_b = nn.ModuleList([
            CDAM_Block(
                dim=embed_dims, num_heads=num_heads[1], mlp_ratio=mlp_ratios[1], qkv_bias=qkv_bias,
                qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=ir_dpr[ir_cur + i],
                qk_ratio=qk_ratio, sr_ratio=sr_ratios[1])
            for i in range(depths[1])])

        # -------------------- vis transformer --------------------
        self.vis_patch_embed_b = PatchEmbed(
            img_size=img_size, patch_size=1, in_chans=in_chans, embed_dim=embed_dims)
        vis_dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule
        vis_cur = 0
        vis_cur += depths[0]  # stage "a" is disabled; skip its drop-path slots
        self.vis_blocks_b = nn.ModuleList([
            CDAM_Block(
                dim=embed_dims, num_heads=num_heads[1], mlp_ratio=mlp_ratios[1], qkv_bias=qkv_bias,
                qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=vis_dpr[vis_cur + i],
                qk_ratio=qk_ratio, sr_ratio=sr_ratios[1])
            for i in range(depths[1])])

    def forward(self, x):
        # NOTE(review): x[0] goes through the "ir" branch and x[1] through the
        # "vis" branch, whereas GPT and fusion treat x[0] as RGB -- confirm
        # the intended input order.
        x, y = x[0], x[1]
        x, (H, W) = self.ir_patch_embed_b(x)
        y, (H, W) = self.vis_patch_embed_b(y)
        # Keep the un-refined embeddings as the cross-modal reference.
        A = x
        B = y

        if self.out_dict:
            for blk in self.ir_blocks_b:
                x = blk(x, B, H, W, self.out_dict["ir_relative_pos_b"])
        else:
            for blk in self.ir_blocks_b:
                x = blk(x, B, H, W)

        if self.out_dict:
            for blk in self.vis_blocks_b:
                y = blk(y, A, H, W, self.out_dict["vis_relative_pos_b"])
        else:
            for blk in self.vis_blocks_b:
                y = blk(y, A, H, W)

        # (B, N, C) tokens -> (B, C, H, W) feature maps. B is rebound here to
        # the batch size.
        B, N, C = x.shape
        x = x.permute(0, 2, 1).reshape(B, C, H, W)
        y = y.permute(0, 2, 1).reshape(B, C, H, W)

        out_feature = torch.cat((x, y), dim=1)
        return out_feature


class MFusion(nn.Module):
    """Routed mixture of fusion experts with hard top-k (k = 1) selection.

    A linear gate over the globally pooled absolute difference of the two
    inputs scores the four experts; for each sample the top-scoring expert's
    output is scaled by its softmax weight.

    NOTE(review): ``concat`` (lower-case) is not defined in this file view.
    The ``Concat`` class defined here performs ``torch.cat(x, 1)`` -- confirm
    which one is intended.
    """

    def __init__(self, dim_in):
        super(MFusion, self).__init__()
        dim = dim_in // 4
        self.dim = dim
        self.num_experts = 4
        self.top_k = 1

        # Expert pool; every expert receives the same pair of feature maps.
        self.experts = nn.ModuleList([
            CVCI(in_chans=dim, embed_dims=dim),
            fusion(channel=dim),
            GPT(d_model=dim),
            concat(),
        ])

        # Gating network
        self.gating_network = nn.Linear(dim, self.num_experts, bias=False)

    def forward(self, x):
        # The cross-modal discrepancy drives the routing decision.
        xf = torch.abs(x[0] - x[1])
        B, C, H, W = xf.shape
        pool_x = F.adaptive_avg_pool2d(xf, (1, 1)).flatten(1)  # (B, C)

        gate_logits = self.gating_network(pool_x)  # (B, num_experts)
        gate_weights = F.softmax(gate_logits, dim=-1)  # (B, num_experts)

        # Select the top-k experts per sample.
        topk_values, topk_indices = torch.topk(gate_weights, self.top_k, dim=-1)  # (B, top_k)

        moe_output = torch.zeros_like(torch.cat([x[0], x[1]], dim=1))

        for i in range(self.top_k):
            expert_idx = topk_indices[:, i]  # (B,)
            weight = topk_values[:, i].view(B, 1, 1, 1)  # per-sample scale

            for j in range(self.num_experts):
                mask = (expert_idx == j).view(B, 1, 1, 1)  # samples routed to expert j
                if mask.any():
                    # The expert runs on the whole batch; the mask zeroes the
                    # samples that were routed to a different expert.
                    moe_output += weight * mask * self.experts[j](x)

        return moe_output

大家可以将自己的融合方式放进专家池中，也可以放入笔者之前为大家提供的融合方式；此外，笔者还在下面的链接中为大家提供了其他几种融合方式：

https://blog.csdn.net/2201_75517551/article/details/159799348?spm=1001.2014.3001.5502

class MFusion_1(nn.Module):
    """Single-expert variant of MFusion that always fuses with CVCI.

    ``num_experts`` and ``top_k`` are stored but never read by this class.
    """

    def __init__(self, dim_in):
        super().__init__()
        expert_dim = dim_in // 4
        self.dim = expert_dim
        self.num_experts = 4
        self.top_k = 1
        self.fusion = CVCI(in_chans=expert_dim, embed_dims=expert_dim)

    def forward(self, x):
        fused = self.fusion(x)
        return fused


class MFusion_2(nn.Module):
    """Single-expert variant of MFusion that always uses the ``fusion`` module.

    ``num_experts`` and ``top_k`` are stored but never read by this class.
    """

    def __init__(self, dim_in):
        super().__init__()
        expert_dim = dim_in // 4
        self.dim = expert_dim
        self.num_experts = 4
        self.top_k = 1
        self.fusion = fusion(channel=expert_dim)

    def forward(self, x):
        fused = self.fusion(x)
        return fused


class MFusion_3(nn.Module):
    """Single-expert variant of MFusion that always fuses with GPT.

    ``num_experts`` and ``top_k`` are stored but never read by this class.
    """

    def __init__(self, dim_in):
        super().__init__()
        expert_dim = dim_in // 4
        self.dim = expert_dim
        self.num_experts = 4
        self.top_k = 1
        self.fusion = GPT(d_model=expert_dim)

    def forward(self, x):
        fused = self.fusion(x)
        return fused

创新点三：CFI-MPD-IoU 边界框回归损失

该创新点提出聚焦目标核心区域的鲁棒边界框优化损失，针对恶劣环境下目标边界模糊导致传统 IoU 类损失梯度不稳定、定位精度下降的问题，通过收缩边界计算核心区域重叠度弱化模糊边界的干扰，同时引入 MPD 角点距离约束强化预测框与真实框的几何对齐一致性，以闭合可微分形式完成计算，在不增加推理开销的前提下提升边界框优化的稳定性与一致性，有效改善退化场景下目标定位不准、训练收敛震荡的问题，为检测模型提供更可靠的定位监督信号。

后续将进行更新！！！！以及进行二次创新，发顶刊必备。。。敬请关注！！！

笔者整理双模态检测的专属论文资料，免费分享给粉丝，需要关注后领取。

查看全文

http://www.jsqmd.com/news/609335/