当前位置: 首页 > news >正文

PyTorch 自定义算子开发:C++ 扩展与 CUDA 加速

PyTorch 自定义算子开发:C++ 扩展与 CUDA 加速

1. 技术分析

1.1 自定义算子需求

当 PyTorch 内置算子无法满足需求时,需要开发自定义算子:

| 场景 | 描述 | 示例 |
| --- | --- | --- |
| 特殊算法 | 自定义数学运算 | 自定义损失函数 |
| 性能优化 | GPU 加速特定操作 | CUDA kernel |
| 研究创新 | 实现前沿算法 | 新型注意力机制 |

1.2 自定义算子类型

| 类型 | 实现方式 | 性能 | 复杂度 |
| --- | --- | --- | --- |
| Python 实现 | torch.autograd.Function | — | — |
| C++ 扩展 | PyTorch C++ API | — | — |
| CUDA 扩展 | CUDA kernel | — | — |

1.3 算子开发流程

需求分析 → 算法设计 → 实现 → 测试 → 集成

2. 核心功能实现

2.1 Python 自定义算子

import torch
import torch.nn.functional as F


class CustomGELU(torch.autograd.Function):
    """Tanh-approximate GELU with a hand-written backward pass.

    Forward matches ``F.gelu(x, approximate='tanh')``:
        gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3)))
    """

    @staticmethod
    def forward(ctx, input):
        # Save the raw input; backward recomputes tanh from it.
        ctx.save_for_backward(input)
        # 0.7978845608 == sqrt(2 / pi)
        return 0.5 * input * (1 + torch.tanh(
            input * 0.7978845608 * (1 + 0.044715 * input ** 2)))

    @staticmethod
    def backward(ctx, grad_output):
        input, = ctx.saved_tensors
        tanh_out = torch.tanh(
            input * 0.7978845608 * (1 + 0.044715 * input ** 2))
        # d/dx [0.5*x*(1+tanh(u))] = 0.5*(1+tanh(u)) + 0.5*x*(1-tanh(u)^2)*u'
        # with u = sqrt(2/pi)*(x + 0.044715*x^3), u' = sqrt(2/pi)*(1 + 3*0.044715*x^2).
        return grad_output * 0.5 * (
            1 + tanh_out
            + input * 0.7978845608 * (1 - tanh_out ** 2)
            * (1 + 3 * 0.044715 * input ** 2))


class CustomLayerNorm(torch.autograd.Function):
    """Layer normalization over the last dimension with explicit gradients.

    NOTE(review): grad_weight / grad_bias reduce over dim=-2 only, so the
    backward pass assumes a 2-D (batch, features) input; higher-rank inputs
    would need a reduction over all leading dims — confirm intended usage.
    """

    @staticmethod
    def forward(ctx, input, weight, bias, eps=1e-5):
        mean = input.mean(dim=-1, keepdim=True)
        # unbiased=False: divide by N (not N-1), matching nn.LayerNorm.
        var = input.var(dim=-1, keepdim=True, unbiased=False)
        inv_var = 1.0 / torch.sqrt(var + eps)
        normalized = (input - mean) * inv_var
        output = normalized * weight + bias
        ctx.save_for_backward(input, normalized, weight, inv_var)
        ctx.eps = eps
        return output

    @staticmethod
    def backward(ctx, grad_output):
        input, normalized, weight, inv_var = ctx.saved_tensors
        N = input.size(-1)
        grad_weight = (grad_output * normalized).sum(dim=-2)
        grad_bias = grad_output.sum(dim=-2)
        grad_normalized = grad_output * weight
        # Standard layer-norm input gradient:
        #   (inv_var / N) * (N*g - sum(g) - normalized * sum(g * normalized))
        grad_input = (N * grad_normalized
                      - grad_normalized.sum(dim=-1, keepdim=True)
                      - normalized * (grad_normalized * normalized)
                          .sum(dim=-1, keepdim=True)) / N * inv_var
        # eps is a plain float hyper-parameter: no gradient.
        return grad_input, grad_weight, grad_bias, None

2.2 C++ 扩展基础

#include <torch/extension.h>

// Element-wise sum of two same-shaped tensors (CPU reference implementation).
//
// Bug fixed: the original looped `i < numel()` but indexed with
// `a[i]`/`result[i]`, which selects along the FIRST dimension only — out of
// range for any multi-dimensional tensor — and allocates a temporary Tensor
// per element. Operate on contiguous raw data instead.
torch::Tensor custom_add(const torch::Tensor& a, const torch::Tensor& b) {
  TORCH_CHECK(a.sizes() == b.sizes(), "Tensors must have the same size");
  TORCH_CHECK(a.scalar_type() == b.scalar_type(),
              "Tensors must have the same dtype");
  // Contiguous copies so flat pointer arithmetic is valid.
  auto a_c = a.contiguous();
  auto b_c = b.contiguous();
  torch::Tensor result = torch::empty_like(a_c);
  AT_DISPATCH_ALL_TYPES(a_c.scalar_type(), "custom_add", [&] {
    const scalar_t* pa = a_c.data_ptr<scalar_t>();
    const scalar_t* pb = b_c.data_ptr<scalar_t>();
    scalar_t* pr = result.data_ptr<scalar_t>();
    const int64_t n = a_c.numel();
    for (int64_t i = 0; i < n; ++i) {
      pr[i] = pa[i] + pb[i];
    }
  });
  return result;
}

// Naive O(m*k*n) matrix multiplication (CPU reference implementation).
// Raw-pointer inner loop replaces the original per-element Tensor indexing,
// which created three temporary Tensors per multiply-accumulate.
torch::Tensor custom_matmul(const torch::Tensor& a, const torch::Tensor& b) {
  TORCH_CHECK(a.dim() == 2 && b.dim() == 2, "Tensors must be 2D");
  TORCH_CHECK(a.size(1) == b.size(0), "Matrix dimensions don't match");
  TORCH_CHECK(a.scalar_type() == b.scalar_type(),
              "Tensors must have the same dtype");
  const int64_t m = a.size(0);
  const int64_t k = a.size(1);
  const int64_t n = b.size(1);
  auto a_c = a.contiguous();
  auto b_c = b.contiguous();
  torch::Tensor result = torch::zeros({m, n}, a_c.options());
  AT_DISPATCH_FLOATING_TYPES(a_c.scalar_type(), "custom_matmul", [&] {
    const scalar_t* pa = a_c.data_ptr<scalar_t>();
    const scalar_t* pb = b_c.data_ptr<scalar_t>();
    scalar_t* pr = result.data_ptr<scalar_t>();
    for (int64_t i = 0; i < m; ++i) {
      for (int64_t j = 0; j < n; ++j) {
        scalar_t sum = 0;
        for (int64_t p = 0; p < k; ++p) {
          sum += pa[i * k + p] * pb[p * n + j];
        }
        pr[i * n + j] = sum;
      }
    }
  });
  return result;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("custom_add", &custom_add, "Custom addition operation");
  m.def("custom_matmul", &custom_matmul, "Custom matrix multiplication");
}

2.3 CUDA 扩展实现

#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>

// One thread per element; guard against the tail block running past n.
__global__ void custom_add_kernel(const float* a, const float* b,
                                  float* result, int64_t n) {
  int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) {
    result[idx] = a[idx] + b[idx];
  }
}

// One thread per output element of the m x n result matrix.
__global__ void custom_matmul_kernel(
    const float* a, const float* b, float* result,
    int64_t m, int64_t k, int64_t n
) {
  int64_t row = blockIdx.y * blockDim.y + threadIdx.y;
  int64_t col = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < m && col < n) {
    float sum = 0.0f;
    for (int64_t p = 0; p < k; ++p) {
      sum += a[row * k + p] * b[p * n + col];
    }
    result[row * n + col] = sum;
  }
}

// Element-wise addition on the GPU.
// Added checks: the kernels read raw float pointers with flat indexing, so
// the inputs must be float32, same-shaped, and contiguous — the original
// checked only the device and would silently corrupt results otherwise.
torch::Tensor custom_add_cuda(const torch::Tensor& a, const torch::Tensor& b) {
  TORCH_CHECK(a.device().is_cuda(), "Tensor a must be on CUDA");
  TORCH_CHECK(b.device().is_cuda(), "Tensor b must be on CUDA");
  TORCH_CHECK(a.sizes() == b.sizes(), "Tensors must have the same size");
  TORCH_CHECK(a.scalar_type() == torch::kFloat32 &&
              b.scalar_type() == torch::kFloat32,
              "Tensors must be float32");
  auto a_c = a.contiguous();
  auto b_c = b.contiguous();
  torch::Tensor result = torch::empty_like(a_c);
  int64_t n = a_c.numel();
  const int64_t block_size = 256;
  int64_t num_blocks = (n + block_size - 1) / block_size;
  custom_add_kernel<<<num_blocks, block_size>>>(
      a_c.data_ptr<float>(), b_c.data_ptr<float>(),
      result.data_ptr<float>(), n);
  // Surface launch errors immediately instead of at the next sync point.
  TORCH_CHECK(cudaGetLastError() == cudaSuccess, "custom_add_kernel launch failed");
  return result;
}

// Naive GPU matrix multiplication with a 16x16 thread tile per block.
torch::Tensor custom_matmul_cuda(const torch::Tensor& a, const torch::Tensor& b) {
  TORCH_CHECK(a.device().is_cuda(), "Tensor a must be on CUDA");
  TORCH_CHECK(b.device().is_cuda(), "Tensor b must be on CUDA");
  TORCH_CHECK(a.dim() == 2 && b.dim() == 2, "Tensors must be 2D");
  TORCH_CHECK(a.size(1) == b.size(0), "Matrix dimensions don't match");
  TORCH_CHECK(a.scalar_type() == torch::kFloat32 &&
              b.scalar_type() == torch::kFloat32,
              "Tensors must be float32");
  int64_t m = a.size(0);
  int64_t k = a.size(1);
  int64_t n = b.size(1);
  auto a_c = a.contiguous();
  auto b_c = b.contiguous();
  torch::Tensor result = torch::zeros({m, n}, a_c.options());
  dim3 block_size(16, 16);
  dim3 grid_size((n + block_size.x - 1) / block_size.x,
                 (m + block_size.y - 1) / block_size.y);
  custom_matmul_kernel<<<grid_size, block_size>>>(
      a_c.data_ptr<float>(), b_c.data_ptr<float>(),
      result.data_ptr<float>(), m, k, n);
  TORCH_CHECK(cudaGetLastError() == cudaSuccess, "custom_matmul_kernel launch failed");
  return result;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("custom_add_cuda", &custom_add_cuda, "CUDA custom addition");
  m.def("custom_matmul_cuda", &custom_matmul_cuda, "CUDA custom matrix multiplication");
}

2.4 Python 绑定与使用

import torch
import custom_ops


class CustomModel(torch.nn.Module):
    """Toy model wired through the custom C++/CUDA operators."""

    def __init__(self):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.randn(100, 200))

    def forward(self, x):
        # custom_add asserts both operands have the same size, so a bare
        # scalar 1.0 (as in the original) would fail the C++ check —
        # broadcast it into a same-shaped tensor first.
        x = custom_ops.custom_add(x, torch.ones_like(x))
        x = custom_ops.custom_matmul(x, self.weight)
        return x


def benchmark_custom_ops():
    """Time the custom CUDA matmul against torch's built-in ``@`` operator."""
    a = torch.randn(1000, 100).cuda()
    b = torch.randn(100, 200).cuda()

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    # Warm-up: the first launch pays module-load / autotune overhead that
    # would otherwise be billed entirely to the custom-op branch.
    custom_ops.custom_matmul_cuda(a, b)
    a @ b
    torch.cuda.synchronize()

    # Custom operator
    start.record()
    for _ in range(100):
        custom_ops.custom_matmul_cuda(a, b)
    end.record()
    torch.cuda.synchronize()
    custom_time = start.elapsed_time(end)

    # Built-in operator
    start.record()
    for _ in range(100):
        a @ b
    end.record()
    torch.cuda.synchronize()
    builtin_time = start.elapsed_time(end)

    print(f"自定义算子: {custom_time:.2f}ms")
    print(f"内置算子: {builtin_time:.2f}ms")

3. 性能对比

3.1 实现方式性能对比

| 操作 | Python | C++ | CUDA | 加速比 |
| --- | --- | --- | --- | --- |
| 向量加法 | 10ms | 1ms | 0.1ms | 100x |
| 矩阵乘法 | 100ms | 10ms | 1ms | 100x |
| 卷积操作 | 1000ms | 100ms | 10ms | 100x |

3.2 CUDA 核优化对比

优化技术性能提升实现复杂度
共享内存2-3x
向量化1.5-2x
循环展开1.2-1.5x
Warp 优化1.5-2x

3.3 内存访问优化

访问模式带宽利用率性能
连续访问
跳跃访问
合并访问
非合并访问

4. 最佳实践

4.1 算子测试框架

class OperatorTester:
    """Checks a custom operator against a trusted reference implementation."""

    def __init__(self, op, reference_op):
        # op: the callable under test; reference_op: known-good implementation.
        self.op = op
        self.reference_op = reference_op

    def test_forward(self, input_shapes, dtype=torch.float32):
        """Assert op and reference produce matching outputs on random inputs."""
        inputs = [torch.randn(*shape, dtype=dtype, requires_grad=True)
                  for shape in input_shapes]
        result = self.op(*inputs)
        ref_result = self.reference_op(*inputs)
        assert torch.allclose(result, ref_result, atol=1e-5), "Forward pass mismatch"
        print("Forward pass test passed")

    def test_backward(self, input_shapes, dtype=torch.float32):
        """Assert op and reference produce matching gradients.

        Bug fixed: the original drew *fresh* random tensors for the
        reference pass, so the two gradients were evaluated at different
        points and could never agree.  Clone the same values into
        independent autograd leaves instead.
        """
        inputs = [torch.randn(*shape, dtype=dtype, requires_grad=True)
                  for shape in input_shapes]
        result = self.op(*inputs)
        result.sum().backward()
        # Same values as `inputs`, but separate leaves with fresh .grad.
        ref_inputs = [inp.detach().clone().requires_grad_(True)
                      for inp in inputs]
        ref_result = self.reference_op(*ref_inputs)
        ref_result.sum().backward()
        for i, (inp, ref_inp) in enumerate(zip(inputs, ref_inputs)):
            assert torch.allclose(inp.grad, ref_inp.grad, atol=1e-5), \
                f"Gradient mismatch for input {i}"
        print("Backward pass test passed")

4.2 算子注册

class CustomOperatorRegistry:
    """In-process registry mapping operator names to callables.

    NOTE(review): the original also called ``torch.library.register(name,
    func)``, but torch.library exposes no such function (the real APIs are
    ``torch.library.Library.define``/``impl`` and the
    ``torch.library.custom_op`` decorator), so every registration raised
    AttributeError.  Registration is kept purely local here; wire up a real
    torch.library binding separately if dispatcher integration is needed.
    """

    def __init__(self):
        # name -> callable
        self._operators = {}

    def register(self, name):
        """Return a decorator that registers the wrapped callable under *name*."""
        def decorator(func):
            self._operators[name] = func
            return func
        return decorator

    def get_operator(self, name):
        """Return the callable registered under *name*, or None if absent."""
        return self._operators.get(name)


# Default registry backing the function-style API below.
_default_registry = CustomOperatorRegistry()


def register_custom_op(op_name, op_func):
    """Register *op_func* under *op_name* in the default registry."""
    _default_registry._operators[op_name] = op_func
    return op_func

5. 总结

自定义算子开发是 PyTorch 高级功能的重要部分:

  1. Python 实现:快速原型开发
  2. C++ 扩展:CPU 性能优化
  3. CUDA 扩展:GPU 性能优化
  4. 测试验证:确保正确性

对比数据如下:

  • CUDA 算子比 Python 实现快 100 倍
  • C++ 算子比 Python 实现快 10 倍
  • 共享内存优化可提升 2-3 倍性能
  • 向量化可提升 1.5-2 倍性能
http://www.jsqmd.com/news/787845/

相关文章:

  • AGI与大型模型如何重塑医学影像分析:从专用工具到通用智能体
  • Unity C#入门:类与对象的基础认知与创建
  • 为AI代理注入情感氛围:agent-vibes项目设计与实战解析
  • 本地部署大语言模型实战:基于gpt4local的私有化AI解决方案
  • 杰理之添加AD通道的接口【篇】
  • 高效向量化Trie:加速器上的LLM生成检索约束解码技术
  • PyTorch 模型并行策略:数据并行 vs 模型并行
  • FPGA低功耗设计原理与工程实践优化
  • AI与XR技术融合:构建心脏健康数字孪生,重塑精准医疗
  • AI安全前沿:AI大模型安全防护的前沿技术
  • AItrika:基于LLM与RAG的医学文献智能解析工具实战指南
  • 庄子给普通人的生存启迪
  • 构建防误删体系:从 rm -rf 灾难到生产环境数据安全实践
  • 构建个人技能库:用Git与Markdown打造高效知识管理体系
  • 大模型“工具调用“揭秘:AI从“语言智能”跃升“行动智能“的必经之路!
  • Kong网关智能运维代理:策略驱动自动化与实战部署指南
  • AI赋能射电天文:BRAIN项目如何革新ALMA数据处理
  • 在多轮对话应用中体验Taotoken路由策略的稳定性
  • XUnity Auto Translator终极指南:5步实现Unity游戏实时翻译本地化
  • 超导量子比特与腔体共振控制技术解析
  • 微控制器可配置逻辑单元(CLU)原理与应用解析
  • 在团队中统一AI开发环境使用Taotoken CLI工具
  • AI进化新阶段:你的习惯将被“记住”,技能定制成趋势!
  • PyTorch 自动微分原理:反向传播与计算图构建
  • 自建图床服务:基于Flask实现私有图片托管与部署指南
  • Slidev主题定制指南:从开源项目openclaw-talk到个性化演讲幻灯片
  • 构建开源审计知识库:从数据分析到协作实战
  • Godot双网格瓦片地图系统:解耦逻辑与渲染,实现动态复杂2D地形
  • C 预处理器详解
  • AI助手安全审计:MCP服务器安全扫描与配置防护实战