当前位置: 首页 > news >正文

Ascend C 新一代算子编程语言

Ascend C 新一代算子编程语言

Ascend C 概述

Ascend C 是华为推出的新一代昇腾算子编程语言,基于标准 C++ 语法扩展,是 TBE TIK 的官方替代方案。

核心优势

  • 标准 C++ 语法,学习曲线低
  • 支持 CPU 侧仿真调试,无需真实硬件
  • 编译器自动优化,性能接近手写汇编
  • 与 Triton/TileLang 等开源工具链兼容

编程模型

核函数(Kernel Function)

Ascend C 的核心是 __global__ 修饰的核函数,运行在 AI Core 上:

// 核函数声明
extern "C" __global__ __aicore__ void add_custom(GM_ADDR x,      // 输入1(全局内存地址)GM_ADDR y,      // 输入2GM_ADDR z       // 输出
) {// 核函数实现KernelAdd op;op.Init(x, y, z);op.Process();
}

核函数调用(Host 侧)

// Host 侧调用核函数
// 参数:<<<blockDim, l2ctrl, stream>>>
add_custom<<<blockDim, l2ctrl, stream>>>(x_gm, y_gm, z_gm);

核心 API 体系

内存层次 API

// 全局内存(HBM)访问
GlobalTensor<half> xGlobal;
xGlobal.SetGlobalBuffer((__gm__ half*)x, totalLength);// 本地内存(UB)声明
TBuf<TPosition::VECCALC> tmpBuf;  // 向量计算缓冲区
LocalTensor<half> xLocal = tmpBuf.Get<half>();

数据搬运 API

// DataCopy:GM → UB(数据搬入)
DataCopy(xLocal, xGlobal[offset], copyLength);// DataCopy:UB → GM(数据搬出)
DataCopy(zGlobal[offset], zLocal, copyLength);

向量计算 API

// 逐元素加法
Add(zLocal, xLocal, yLocal, length);// 逐元素乘法
Mul(zLocal, xLocal, yLocal, length);// ReLU
Relu(zLocal, xLocal, length);// 最大值
Max(zLocal, xLocal, yLocal, length);// 数据类型转换
Cast(zLocalFP32, xLocalFP16, RoundMode::CAST_NONE, length);

矩阵计算 API

// 矩阵乘法(Cube 单元)
Mmad(cLocal, aLocal, bLocal, M, K, N, false);

同步 API

// 设置 Pipe 间同步标志
SetFlag<HardEvent::V_MTE2>(eventId);   // 向量计算完成通知 MTE2
WaitFlag<HardEvent::MTE2_V>(eventId);  // 等待 MTE2 完成// 流水线同步
PipeBarrier<PIPE_ALL>();  // 等待所有 Pipe 完成

完整示例:Add 算子

#include "kernel_operator.h"// 算子实现类
class KernelAdd {
public:__aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z,uint32_t totalLength, uint32_t tileLength) {// 保存参数this->totalLength = totalLength;this->tileLength = tileLength;this->tileNum = totalLength / tileLength;// 初始化全局内存张量xGm.SetGlobalBuffer((__gm__ half*)x, totalLength);yGm.SetGlobalBuffer((__gm__ half*)y, totalLength);zGm.SetGlobalBuffer((__gm__ half*)z, totalLength);// 初始化 Pipe(内存管理器)pipe.InitBuffer(inQueueX, BUFFER_NUM, tileLength * sizeof(half));pipe.InitBuffer(inQueueY, BUFFER_NUM, tileLength * sizeof(half));pipe.InitBuffer(outQueueZ, BUFFER_NUM, tileLength * sizeof(half));}__aicore__ inline void Process() {// 分 Tile 处理for (uint32_t i = 0; i < tileNum; i++) {CopyIn(i);Compute(i);CopyOut(i);}}private:__aicore__ inline void CopyIn(uint32_t progress) {// 从 GM 搬入数据到 UBLocalTensor<half> xLocal = inQueueX.AllocTensor<half>();LocalTensor<half> yLocal = inQueueY.AllocTensor<half>();DataCopy(xLocal, xGm[progress * tileLength], tileLength);DataCopy(yLocal, yGm[progress * tileLength], tileLength);inQueueX.EnQue(xLocal);inQueueY.EnQue(yLocal);}__aicore__ inline void Compute(uint32_t progress) {// 执行向量加法LocalTensor<half> xLocal = inQueueX.DeQue<half>();LocalTensor<half> yLocal = inQueueY.DeQue<half>();LocalTensor<half> zLocal = outQueueZ.AllocTensor<half>();Add(zLocal, xLocal, yLocal, tileLength);outQueueZ.EnQue<half>(zLocal);inQueueX.FreeTensor(xLocal);inQueueY.FreeTensor(yLocal);}__aicore__ inline void CopyOut(uint32_t progress) {// 将结果从 UB 搬出到 GMLocalTensor<half> zLocal = outQueueZ.DeQue<half>();DataCopy(zGm[progress * tileLength], zLocal, tileLength);outQueueZ.FreeTensor(zLocal);}private:// 全局内存张量GlobalTensor<half> xGm, yGm, zGm;// 队列(管理 UB 内存)TQue<QuePosition::VECIN, BUFFER_NUM> inQueueX, inQueueY;TQue<QuePosition::VECOUT, BUFFER_NUM> outQueueZ;// Pipe(内存分配器)TPipe pipe;// 参数uint32_t totalLength, tileLength, tileNum;static constexpr int32_t BUFFER_NUM = 2;  // 双缓冲
};// 核函数入口
extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z,GM_ADDR workspace, GM_ADDR tiling
) {GET_TILING_DATA(tilingData, tiling);KernelAdd op;op.Init(x, y, z, tilingData.totalLength, tilingData.tileLength);op.Process();
}

Tiling 机制

Tiling 是 Ascend C 算子开发的核心,决定如何将大数据分块处理。

Tiling 参数定义

// tiling.h - 定义 Tiling 数据结构
#include "register/tilingdata_base.h"BEGIN_TILING_DATA_DEF(AddTilingData)TILING_DATA_FIELD_DEF(uint32_t, totalLength);  // 总元素数TILING_DATA_FIELD_DEF(uint32_t, tileLength);   // 每个 Tile 的元素数TILING_DATA_FIELD_DEF(uint32_t, tileNum);      // Tile 数量
END_TILING_DATA_DEFREGISTER_TILING_DATA_CLASS(AddCustom, AddTilingData)

Tiling 计算函数(Host 侧)

// tiling.cpp - 在 Host 侧计算 Tiling 参数
#include "tiling.h"namespace optiling {static ge::graphStatus TilingFunc(gert::TilingContext* context) {AddTilingData tiling;// 获取输入形状auto x_shape = context->GetInputShape(0)->GetStorageShape();uint32_t totalLength = x_shape.GetShapeSize();// 计算 Tile 大小(根据 UB 容量)// UB 容量约 256KB,FP16 占 2 字节// 双缓冲需要 2 倍空间uint32_t ubSize = 256 * 1024;  // 256KBuint32_t tileLength = ubSize / sizeof(uint16_t) / 2 / 3;  // 3个队列tileLength = (tileLength / 128) * 128;  // 128 对齐tiling.set_totalLength(totalLength);tiling.set_tileLength(tileLength);tiling.set_tileNum(totalLength / tileLength);// 设置 block 数量(多核并行)context->SetBlockDim(8);  // 使用 8 个 AI Coretiling.SaveToBuffer(context->GetRawTilingData()->GetData(),context->GetRawTilingData()->GetCapacity());context->GetRawTilingData()->SetDataSize(tiling.GetDataSize());return ge::GRAPH_SUCCESS;
}REGISTER_OP_TILING_FUNC_BUFFERED(AddCustom, TilingFunc);}  // namespace optiling

多核并行

Ascend C 支持多个 AI Core 并行执行:

extern "C" __global__ __aicore__ void add_custom_multicore(GM_ADDR x, GM_ADDR y, GM_ADDR z,GM_ADDR workspace, GM_ADDR tiling
) {GET_TILING_DATA(tilingData, tiling);// 获取当前 Core 的 IDuint32_t coreId = GetBlockIdx();uint32_t coreNum = GetBlockNum();// 每个 Core 处理一部分数据uint32_t perCoreLength = tilingData.totalLength / coreNum;uint32_t offset = coreId * perCoreLength;KernelAdd op;op.Init(x + offset * sizeof(half),y + offset * sizeof(half),z + offset * sizeof(half),perCoreLength, tilingData.tileLength);op.Process();
}

CPU 仿真调试

Ascend C 支持在 CPU 上仿真执行,方便调试:

// cpu_main.cpp - CPU 仿真入口
#include "kernel_operator.h"int main() {// 准备测试数据const int N = 1024;std::vector<half> x(N, 1.0f);std::vector<half> y(N, 2.0f);std::vector<half> z(N, 0.0f);// CPU 仿真调用AscendC::SetKernelMode(KernelMode::AIV_MODE);add_custom<<<1, nullptr, nullptr>>>(x.data(), y.data(), z.data(), nullptr, nullptr);// 验证结果for (int i = 0; i < N; i++) {assert(abs((float)z[i] - 3.0f) < 1e-3);}printf("CPU simulation passed!\n");return 0;
}
# 编译 CPU 仿真版本
cmake .. -DASCEND_COMPUTE_UNIT=cpu -DCMAKE_BUILD_TYPE=Debug
make -j4
./add_custom_test

项目结构

custom_add_op/
├── CMakeLists.txt
├── op_kernel/
│   └── add_custom.cpp      # Kernel 实现
├── op_host/
│   ├── add_custom_tiling.h # Tiling 数据结构
│   └── add_custom.cpp      # Tiling 计算 + 算子注册
└── test/├── cpu_test.cpp         # CPU 仿真测试└── npu_test.cpp         # NPU 真实测试
# CMakeLists.txt
cmake_minimum_required(VERSION 3.14)
project(add_custom)set(ASCEND_PATH $ENV{ASCEND_TOOLKIT_HOME})# Kernel 编译(NPU 侧)
add_custom_target(kernelCOMMAND ${ASCEND_PATH}/bin/ccec_compiler--cce-aicore-arch=dav-c220-O2-o add_custom.o${CMAKE_SOURCE_DIR}/op_kernel/add_custom.cpp
)# Host 编译
add_library(add_custom_host SHARED op_host/add_custom.cpp)
target_include_directories(add_custom_host PRIVATE ${ASCEND_PATH}/include)
target_link_libraries(add_custom_host ${ASCEND_PATH}/lib64/libascendcl.so)