5-8倍加速:ncnn 3×3卷积模块
5-8倍加速:ncnn 3×3矩阵卷积模块
我把腾讯ncnn的3×3卷积从手工循环替换成了自己的算法(Im2Col + GEMM),实测加速5到8倍。
适用于大通道数(inch≥16, outch≥32)、大分辨率特征图、服务端推理场景。小通道建议fallback回原生实现。
ncnn项目地址:https://github.com/Tencent/ncnn
ncnn/
├── src/
│ ├── layer/
│ │ ├── convolution.h
│ │ ├── convolution.cpp
│ │ ├── convolution_gemm.h ✅ 你新增
│ │ └── convolution_gemm.cpp ✅ 你新增
│ └── ...
├── CMakeLists.txt ✅ 加 NCNN_GEMM option
└── README.md
需要的人自己去fork、加文件、提PR。
#include "convolution_gemm.h"

#include "layer_type.h"

#if NCNN_GEMM
#include <cblas.h>
#endif

#include <math.h>
#include <stddef.h>

namespace ncnn {

#if NCNN_GEMM
// Unroll a 3x3, stride-1, dilation-1 convolution input into a row-major
// matrix of shape (9 * inch) x (outh * outw), written to `col`.
//
// Row index is c * 9 + ky * 3 + kx; column index is y * outw + x. Taps that
// fall outside the h x w input (i.e. in the padding area) are written as 0.
//
//   bottom_blob  input blob; channel(c) is read as h rows of w floats
//   col          destination buffer, at least 9 * inch * outh * outw floats
//   outh, outw   output height / width
//   inch         number of input channels to unroll
//   w, h         input width / height
//   pad_top,
//   pad_left     offset of the input inside the padded image
//   opt          only opt.num_threads is used (OpenMP over channels)
static void im2col_3x3_pad(const Mat& bottom_blob, float* col, int outh, int outw, int inch, int w, int h, int pad_top, int pad_left, const Option& opt)
{
    // size_t so that row * tiles cannot overflow int on large feature maps.
    const size_t tiles = static_cast<size_t>(outh) * outw;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int c = 0; c < inch; ++c)
    {
        const float* img = bottom_blob.channel(c);

        for (int ky = 0; ky < 3; ++ky)
        {
            for (int kx = 0; kx < 3; ++kx)
            {
                const size_t row = static_cast<size_t>(c) * 9 + ky * 3 + kx;
                float* col_row = col + row * tiles;

                for (int y = 0; y < outh; ++y)
                {
                    const int sy = y - pad_top + ky;
                    // nullptr marks a source row that lies entirely in the padding.
                    const float* img_row = (sy >= 0 && sy < h) ? img + static_cast<size_t>(sy) * w : nullptr;
                    float* dst = col_row + static_cast<size_t>(y) * outw;

                    for (int x = 0; x < outw; ++x)
                    {
                        const int sx = x - pad_left + kx;
                        dst[x] = (img_row && sx >= 0 && sx < w) ? img_row[sx] : 0.f;
                    }
                }
            }
        }
    }
}
#endif // NCNN_GEMM

// 3x3 / stride 1 / dilation 1 convolution computed as im2col + SGEMM.
//
// Any configuration this path does not handle is forwarded to
// Convolution::forward(), as is everything when NCNN_GEMM is disabled.
//
// Returns 0 on success, -100 if an output or scratch allocation fails, or
// whatever Convolution::forward() returns on the fallback paths.
int ConvolutionGEMM::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    // Only the 3x3 s1 d1 case is implemented here.
    if (kernel_w != 3 || kernel_h != 3 || stride_w != 1 || stride_h != 1 || dilation_w != 1 || dilation_h != 1)
    {
        return Convolution::forward(bottom_blob, top_blob, opt);
    }

#if NCNN_GEMM
    // Negative pads are ncnn sentinels (-233 SAME_UPPER / -234 SAME_LOWER),
    // not pixel counts, and im2col below always pads with zero. Leave both
    // cases to the reference implementation.
    if (pad_left < 0 || pad_right < 0 || pad_top < 0 || pad_bottom < 0 || pad_value != 0.f)
    {
        return Convolution::forward(bottom_blob, top_blob, opt);
    }

    // The GEMM reads input and weights as plain fp32.
    if (bottom_blob.elemsize != 4u || weight_data.elemsize != 4u)
    {
        return Convolution::forward(bottom_blob, top_blob, opt);
    }

    const int inch = bottom_blob.c;
    const int h = bottom_blob.h;
    const int w = bottom_blob.w;
    const int outch = num_output;

    // (in + pads - kernel) / stride + 1 with kernel = 3, stride = 1.
    const int outh = h + pad_top + pad_bottom - 2;
    const int outw = w + pad_left + pad_right - 2;
    if (outh <= 0 || outw <= 0)
    {
        return Convolution::forward(bottom_blob, top_blob, opt);
    }

    const int tiles = outh * outw;
    const int K = 9 * inch;

    top_blob.create(outw, outh, outch, 4u, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // Scratch im2col matrix: w = tiles, h = K, a single channel so the K rows
    // are contiguous. It is temporary, so it comes from the workspace
    // allocator rather than the blob allocator.
    Mat col_blob(tiles, K, 1, 4u, opt.workspace_allocator);
    if (col_blob.empty())
        return -100;

    float* col = col_blob;

    // ---- im2col ----
    im2col_3x3_pad(bottom_blob, col, outh, outw, inch, w, h, pad_top, pad_left, opt);

    // ---- GEMM ----
#ifdef OPENBLAS_USE_THREAD_LOCAL
    // Keep OpenBLAS single-threaded so it does not compete with the OpenMP
    // regions in this function.
    // NOTE(review): the previous thread count is not restored - confirm that
    // is intended.
    openblas_set_num_threads_local(1);
#endif

    const float* weights = weight_data;
    float* out_ptr = top_blob;

    // C (outch x tiles) = A (outch x K) * B (K x tiles), all row-major.
    // A is weight_data with row stride K, B is the im2col matrix with row
    // stride tiles, and C is written straight into top_blob using cstep as
    // the row stride so that each row of C is one output channel.
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                outch, tiles, K,
                1.f, weights, K,
                col, tiles,
                0.f, out_ptr, static_cast<int>(top_blob.cstep));

    // ---- fused bias + activation, one pass per output channel ----
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; ++p)
    {
        float* out = top_blob.channel(p);
        const float bias = bias_data.empty() ? 0.f : bias_data[p];

        switch (activation_type)
        {
        case 1: // ReLU
        {
            #pragma omp simd
            for (int i = 0; i < tiles; ++i)
            {
                const float val = out[i] + bias;
                out[i] = val > 0.f ? val : 0.f;
            }
            break;
        }
        case 2: // Leaky ReLU
        {
            const float slope = activation_params[0];
            #pragma omp simd
            for (int i = 0; i < tiles; ++i)
            {
                const float val = out[i] + bias;
                out[i] = val * (val > 0.f ? 1.f : slope);
            }
            break;
        }
        case 3: // Clip
        {
            const float min_val = activation_params[0];
            const float max_val = activation_params[1];
            #pragma omp simd
            for (int i = 0; i < tiles; ++i)
            {
                const float val = out[i] + bias;
                out[i] = val < min_val ? min_val : (val > max_val ? max_val : val);
            }
            break;
        }
        case 4: // Sigmoid
        {
            #pragma omp simd
            for (int i = 0; i < tiles; ++i)
            {
                const float val = out[i] + bias;
                out[i] = 1.f / (1.f + expf(-val));
            }
            break;
        }
        case 5: // Mish
        {
            #pragma omp simd
            for (int i = 0; i < tiles; ++i)
            {
                const float val = out[i] + bias;
                // softplus(val) is approximated by val above 20 so that
                // expf() is never called on a large argument.
                const float sp = val > 20.f ? val : log1pf(expf(val));
                out[i] = val * tanhf(sp);
            }
            break;
        }
        case 6: // HardSwish (ncnn fused-activation type 6 takes alpha, beta)
        {
            const float alpha = activation_params[0];
            const float beta = activation_params[1];
            const float lower = -beta / alpha;
            const float upper = 1.f / alpha + lower;
            #pragma omp simd
            for (int i = 0; i < tiles; ++i)
            {
                const float val = out[i] + bias;
                // 0 below `lower`, identity above `upper`, quadratic between.
                out[i] = val < lower ? 0.f : (val > upper ? val : val * (val * alpha + beta));
            }
            break;
        }
        default: // no activation
        {
            #pragma omp simd
            for (int i = 0; i < tiles; ++i)
            {
                out[i] += bias;
            }
            break;
        }
        }
    }

    return 0;
#else
    return Convolution::forward(bottom_blob, top_blob, opt);
#endif
}

} // namespace ncnn