当前位置: 首页 > news >正文

CANN/asc-devkit TensorTrait样例

更多样例

【免费下载链接】asc-devkit本项目是CANN 推出的昇腾AI处理器专用的算子程序开发语言,原生支持C和C++标准规范,主要由类库和语言扩展层构成,提供多层级API,满足多维场景算子开发诉求。项目地址: https://gitcode.com/cann/asc-devkit

  • 矢量计算基础API使用TensorTrait样例

    #include "kernel_operator.h" class KernelBinaryScalarTrait { public: __aicore__ inline KernelBinaryScalarTrait() {} __aicore__ inline void Init(__gm__ uint8_t* src, __gm__ uint8_t* dstGm) { srcGlobal.SetGlobalBuffer((__gm__ int16_t*)src); dstGlobal.SetGlobalBuffer((__gm__ int16_t*)dstGm); pipe.InitBuffer(inQueueSrc, 1, 512 * sizeof(int16_t)); pipe.InitBuffer(outQueueDst, 1, 512 * sizeof(int16_t)); } __aicore__ inline void Process() { CopyIn(); Compute(); CopyOut(); } private: __aicore__ inline void CopyIn() { AscendC::LocalTensor<AscendC::TensorTrait<int16_t>> srcLocal = inQueueSrc.AllocTensor<AscendC::TensorTrait<int16_t>>(); AscendC::DataCopy(srcLocal, srcGlobal, 512); inQueueSrc.EnQue(srcLocal); } __aicore__ inline void Compute() { AscendC::LocalTensor<AscendC::TensorTrait<int16_t>> srcLocal = inQueueSrc.DeQue<AscendC::TensorTrait<int16_t>>(); AscendC::LocalTensor<AscendC::TensorTrait<int16_t>> dstLocal = outQueueDst.AllocTensor<AscendC::TensorTrait<int16_t>>(); uint64_t mask = 128; int16_t scalar = 2; // repeatTime = 4, 128 elements one repeat, 512 elements total // dstBlkStride, srcBlkStride = 1, no gap between blocks in one repeat // dstRepStride, srcRepStride =8, no gap between repeats AscendC::Adds(dstLocal, srcLocal, scalar, mask, 4, {1, 1, 8, 8}); outQueueDst.EnQue(dstLocal); inQueueSrc.FreeTensor(srcLocal); } __aicore__ inline void CopyOut() { AscendC::LocalTensor<AscendC::TensorTrait<int16_t>> dstLocal = outQueueDst.DeQue<AscendC::TensorTrait<int16_t>>(); AscendC::DataCopy(dstGlobal, dstLocal, 512); outQueueDst.FreeTensor(dstLocal); } private: AscendC::TPipe pipe; AscendC::TQue<AscendC::TPosition::VECIN, 1> inQueueSrc; AscendC::TQue<AscendC::TPosition::VECOUT, 1> outQueueDst; AscendC::GlobalTensor<AscendC::TensorTrait<int16_t>> srcGlobal, dstGlobal; }; extern "C" __global__ __aicore__ void binary_scalar_trait_kernel(__gm__ uint8_t* src, __gm__ uint8_t* dstGm) { KernelBinaryScalarTrait op; op.Init(src, dstGm); op.Process(); }
  • 矩阵计算基础API使用TensorTrait样例

    #include "kernel_operator.h" template <typename dst_T, typename fmap_T, typename weight_T, typename dstCO1_T, typename bias_T> class KernelMatmul { public: __aicore__ inline KernelMatmul(uint16_t mIn, uint8_t kIn, uint8_t nIn, bool initl1In, bool initl0In) { m = mIn; k = kIn; n = nIn; aSize = m * k; bSize = k * n; cSize = m * n; initl0 = initl0In; initl1 = initl1In; } __aicore__ inline void Init(__gm__ uint8_t *a, __gm__ uint8_t *b, __gm__ uint8_t *c) { aGM.SetGlobalBuffer((__gm__ fmap_T *)a); bGM.SetGlobalBuffer((__gm__ weight_T *)b); cGM.SetGlobalBuffer((__gm__ dstCO1_T *)c); pipe.InitBuffer(inQueueA1, 1, aSize * sizeof(fmap_T)); pipe.InitBuffer(inQueueA2, 1, aSize * sizeof(fmap_T)); pipe.InitBuffer(inQueueB1, 1, bSize * sizeof(weight_T)); pipe.InitBuffer(inQueueB2, 2, bSize * sizeof(weight_T)); pipe.InitBuffer(outQueueCO1, 1, cSize * sizeof(dstCO1_T)); } __aicore__ inline void Process() { CopyIn(); SplitA(); SplitB(); Compute(); CopyOut(); } private: __aicore__ inline void CopyIn() { AscendC::LocalTensor<AscendC::TensorTrait<fmap_T>> a1Local = inQueueA1.AllocTensor<AscendC::TensorTrait<fmap_T>>(); AscendC::LocalTensor<AscendC::TensorTrait<weight_T>> b1Local = inQueueB1.AllocTensor<AscendC::TensorTrait<weight_T>>(); if(initl1 == true) { AscendC::Fill(a1Local, {static_cast<uint16_t>(m * k * sizeof(fmap_T) / 32), 1, 0, 1}); AscendC::Fill(b1Local, {static_cast<uint16_t>(k * n * sizeof(weight_T) / 32), 1, 0, 1}); } else { AscendC::DataCopy(a1Local, aGM, aSize); AscendC::DataCopy(b1Local, bGM, bSize); } inQueueA1.EnQue(a1Local); inQueueB1.EnQue(b1Local); } __aicore__ inline void SplitA() { AscendC::LocalTensor<AscendC::TensorTrait<fmap_T>> a1Local = inQueueA1.DeQue<AscendC::TensorTrait<fmap_T>>(); AscendC::LocalTensor<AscendC::TensorTrait<fmap_T>> a2Local = inQueueA2.AllocTensor<AscendC::TensorTrait<fmap_T>>(); // 1、load2d L1->L0A AscendC::LoadData2dParams loadL0AParams; loadL0AParams.repeatTimes = m * k * sizeof(fmap_T) / 512; loadL0AParams.srcStride = 1; loadL0AParams.dstGap = 0; if (initl0 == true) { Fill(a2Local, {static_cast<uint16_t>(m * k * sizeof(fmap_T) / 512), 1, 0, 1}); } else{ LoadData(a2Local, a1Local, loadL0AParams); } inQueueA2.EnQue<AscendC::TensorTrait<fmap_T>>(a2Local); inQueueA1.FreeTensor(a1Local); } __aicore__ inline void SplitB() { AscendC::LocalTensor<AscendC::TensorTrait<weight_T>> b1Local = inQueueB1.DeQue<AscendC::TensorTrait<weight_T>>(); AscendC::LocalTensor<AscendC::TensorTrait<weight_T>> b2Local = inQueueB2.AllocTensor<AscendC::TensorTrait<weight_T>>(); // 2、load2d L1->L0B AscendC::LoadData2dParams loadL0BParams; loadL0BParams.repeatTimes = k * n * sizeof(weight_T) / 512; loadL0BParams.srcStride = 1; loadL0BParams.dstGap = 0; if (initl0 == true) { AscendC::Fill(b2Local, {static_cast<uint16_t>(k * n * sizeof(weight_T) / 512), 1, 0, 1}); } else{ AscendC::LoadData(b2Local, b1Local, loadL0BParams); } inQueueB1.FreeTensor(b1Local); inQueueB2.EnQue<AscendC::TensorTrait<weight_T>>(b2Local); } __aicore__ inline void Compute() { AscendC::LocalTensor<AscendC::TensorTrait<fmap_T>> a2Local = inQueueA2.DeQue<AscendC::TensorTrait<fmap_T>>(); AscendC::LocalTensor<AscendC::TensorTrait<weight_T>> b2Local = inQueueB2.DeQue<AscendC::TensorTrait<weight_T>>(); AscendC::LocalTensor<AscendC::TensorTrait<dstCO1_T>> c1Local = outQueueCO1.AllocTensor<AscendC::TensorTrait<dstCO1_T>>(); mmadParams.isBias = false; mmadParams.m = m; mmadParams.n = n; mmadParams.k = k; AscendC::Mmad(c1Local, a2Local, b2Local, mmadParams); // m*n outQueueCO1.EnQue<AscendC::TensorTrait<dstCO1_T>>(c1Local); inQueueA2.FreeTensor(a2Local); inQueueB2.FreeTensor(b2Local); } #if __NPU_ARCH__ <= 2002 __aicore__ inline void CopyOut() { AscendC::LocalTensor<AscendC::TensorTrait<dstCO1_T>> c1Local = outQueueCO1.DeQue<AscendC::TensorTrait<dstCO1_T>>(); uint16_t M_ = Ceil(m, 16) * 16; AscendC::LocalTensor<AscendC::TensorTrait<dst_T>> ublocal; AscendC::TBuffAddr tbufublocal; tbufublocal.logicPos = (uint8_t)AscendC::TPosition::C1; ublocal.SetAddr(tbufublocal); ublocal.InitBuffer(0, M_ * n); DataCopyParams dataCopyParams; dataCopyParams.blockCount = 1; dataCopyParams.blockLen = Ceil(M_ * n * 4, 1024); DataCopyEnhancedParams enhancedParams; enhancedParams.blockMode = AscendC::BlockMode::BLOCK_MODE_MATRIX; AscendC::DataCopy(ublocal, c1Local, dataCopyParams, enhancedParams); PipeBarrier<PIPE_ALL>(); outQueueCO1.FreeTensor(c1Local); dataCopyParams.blockCount = 1; dataCopyParams.blockLen = m * n *sizeof(dstCO1_T) / ONE_BLK_SIZE; dataCopyParams.srcStride = 0; dataCopyParams.dstStride = 0; AscendC::DataCopy(cGM, ublocal, dataCopyParams); } #else __aicore__ inline void CopyOut() { AscendC::LocalTensor<AscendC::TensorTrait<dstCO1_T>> c1Local = outQueueCO1.DeQue<AscendC::TensorTrait<dstCO1_T>>(); AscendC::FixpipeParamsV220 fixpipeParams; fixpipeParams.nSize = n; fixpipeParams.mSize = m; fixpipeParams.srcStride = m; fixpipeParams.dstStride = n; fixpipeParams.ndNum = 1; fixpipeParams.srcNdStride = 0; fixpipeParams.dstNdStride = 0; AscendC::Fixpipe(cGM, c1Local, fixpipeParams); outQueueCO1.FreeTensor(c1Local); } #endif private: AscendC::TPipe pipe; AscendC::TQue<AscendC::TPosition::A1, 1> inQueueA1; AscendC::TQue<AscendC::TPosition::A2, 1> inQueueA2; AscendC::TQue<AscendC::TPosition::B1, 1> inQueueB1; AscendC::TQue<AscendC::TPosition::B2, 1> inQueueB2; // dst queue AscendC::TQue<AscendC::TPosition::CO1, 1> outQueueCO1; AscendC::GlobalTensor<AscendC::TensorTrait<fmap_T>> aGM; AscendC::GlobalTensor<AscendC::TensorTrait<weight_T>> bGM; AscendC::GlobalTensor<AscendC::TensorTrait<dst_T>> cGM; uint16_t m, k, n; bool initl0, initl1; uint16_t aSize, bSize, cSize, b2Size; AscendC::MmadParams mmadParams; }; extern "C" __global__ __aicore__ void cube_initconstvalue_simple_operator_half_16_32_16_true_false( __gm__ uint8_t *a, __gm__ uint8_t *b, __gm__ uint8_t *c) { if ASCEND_IS_AIV { return; } KernelMatmul<float, half, half, float, half> op(16, 32, 16, true, false); op.Init(a, b, c); op.Process(); }

【免费下载链接】asc-devkit本项目是CANN 推出的昇腾AI处理器专用的算子程序开发语言,原生支持C和C++标准规范,主要由类库和语言扩展层构成,提供多层级API,满足多维场景算子开发诉求。项目地址: https://gitcode.com/cann/asc-devkit

创作声明:本文部分内容由AI辅助生成(AIGC),仅供参考

http://www.jsqmd.com/news/851375/

相关文章:

  • FreeRTOS队列机制深度解析:嵌入式实时系统任务通信的核心枢纽
  • CANN/ops-nn Swish激活函数
  • Perplexity医疗搜索效能跃迁(FDA黑框警告级误检率下降76%实测报告)
  • Windows 11性能监控终极指南:实时跟踪系统资源使用情况的完整教程
  • CANN/asc-devkit AllGather通信接口
  • AI招聘工具怎么选?直接推荐前程无忧的3个理由
  • elec-ops-simulation实战教程:5步实现电网稳态运行仿真
  • KDiff3文件比较与合并工具:从新手到高手的完整指南
  • 无王无帝定乾坤,来自田间第一人:凰标永存昭后世
  • 别再乱设时钟裕量了!手把手教你用set_clock_uncertainty搞定DC/PT时序收敛
  • 终极指南:如何使用Harepacker复活版轻松打造你的MapleStory游戏世界
  • 3DSident深度技术解析:逆向工程工具与硬件诊断套件的系统级实现
  • NS-USBLoader终极指南:一站式解决Switch玩家的三大痛点
  • Codex 安装与 VS Code 联动:手把手配置指南
  • 【HarmonyOS 6.1 全场景实战】《灵犀厨房》实战(十七):【语音识别】免提声控启动播报——动口不动手
  • CANN/asc-devkit HCCL批量写入接口
  • 终极指南:如何用YOLOv8 AI自瞄系统快速提升游戏瞄准精度
  • 终极知识管理模板:快速搭建你的Obsidian笔记系统
  • CANN/Ascend C GroupBarrier Arrive函数
  • KDiff3技术深度解析:高效文件比较与合并的架构设计与算法实现
  • 用Simulink复现VSG自适应控制:从理论模型到完整仿真(附2018b+源码)
  • 通过 TaoToken CLI 工具一键配置开发环境提升团队协作效率
  • Perplexity体育新闻搜索失效真相大起底(2024赛季高频故障TOP5深度归因)
  • 如何零成本获取全球金融数据?开源工具AKShare终极指南
  • 告别‘听完再说’:聊聊LAS语音识别模型为啥不能实时转文字,以及现在有啥新方案
  • 3步精通FanControl:打造Windows平台智能风扇控制系统
  • 【Perplexity语法查询终极指南】:20年DBA亲授5大隐藏技巧,90%开发者至今不知!
  • 2026年青岛欧式起重机制造厂优选榜单揭晓 - 品牌企业推荐师(官方)
  • 在自动化工作流中集成Taotoken为OpenClaw提供稳定模型服务
  • CANN/asc-devkit Tiling注册API