当前位置: 首页 > news >正文

高层次综合设计乒乓buffer(double-buffer/pingpong-buffer)

一、数组优化为乒乓缓存或者FIFO
1.double-buffer/ping-pong buffer
2.FIFO
3.流水线设计,吞吐量优化

二、乒乓缓存ping-pong buffer
1.通常用于将一块内存分成两个区域或者两个独立内存,
在数据处理的时候交替读写;使得读写可以同时进行;
2.在vivado hls中,把数组显式配置为乒乓缓存ping-pong buffer或者FIFO,
消除访存瓶颈,实现任务流水线和数据驱动;

三、将数组实现为乒乓缓存
1.需要在一块存储区上同时进行load和compute计算;
需要compute和store,并且这些操作之间有数据依赖;
2.vivado hls没有专门的ping_pong指令,需要通过数组分区+手动索引切换来改造,
并配合dataflow实现读写重叠;

四、乒乓buffer的实现方式
1.可以使用array_partition+block方式,将一个大数组切成两个独立的物理内存;
2.通过index索引进行读写目标的切换;
3.使用dataflow指令让生产者和消费者函数重叠运行,各自访问不同的内存块;


五、双缓冲处理的循环
1.案例一
#define N 1024
// Case 1: double buffering written as a single loop over one 2 x N array.
// Each iteration calls load() on one half of `buf` and process() on the other
// half, selecting the halves with an if/else on the parity of the loop index.
// NOTE(review): `M`, `load` and `process` are not defined in this snippet;
// M is presumably the number of blocks - confirm.
void top(int in[N], int out[N]) {
#pragma HLS dataflow
int buf[2][N]; // two buffers
#pragma HLS ARRAY_PARTITION variable=buf complete dim=1
// Note: dim=1 fully partitions the first dimension (size 2), giving two independent RAMs, buf0[N] and buf1[N]

// First pass: load into buf0 while processing buf1 (the first process call works on data that is not valid yet; the main loop has to account for that)
// Ping-pong loop model:
// NOTE(review): block 0 is loaded here and then loaded again by the i == 0 iteration below.
load(in, buf[0], 0); // load block 0
for (int i = 0; i < M; i++) {
if (i % 2 == 0) {
load(in, buf[0], i); // producer writes buf0
process(buf[1], out, i); // consumer processes buf1
} else {
load(in, buf[1], i);
process(buf[0], out, i);
}
}
}


2.案例二
// Forward declarations of the producer and consumer used by case 2.
// Their definitions are not part of this snippet.
void load(int* in, int* buf, int block_idx);
void process(int* buf, int* out, int block_idx);

// Case 2: same loop as case 1, but with two separately declared arrays
// instead of one partitioned 2 x N array. The buffer roles are again swapped
// with an if/else on the parity of the loop index.
// NOTE(review): `BLOCKS` is not defined in this snippet, and the `...` in the
// two STREAM pragmas below are placeholders from the original text, not
// option values.
void top(int in[N], int out[N]) {
#pragma HLS dataflow
int buf0[N], buf1[N]; // two independent arrays
#pragma HLS STREAM variable=buf0 depth=... // does not have to be a stream; may be kept as BRAM
#pragma HLS STREAM variable=buf1 ...
for (int i = 0; i < BLOCKS; i++) {
if (i % 2 == 0) {
load(in, buf0, i);
process(buf1, out, i); // the previous block is ready
} else {
load(in, buf1, i);
process(buf0, out, i);
}
}
}

上述两个案例中,load和process没办法重叠:
两者被放在同一个循环的if-else分支里顺序调用,dataflow指令无法作用到load和process上,因此不能让它们并行执行!

3.如何解决问题?
此写法不能让 load 和 process 真正重叠,因为它们在同一个循环内顺序调用。要实现重叠,必须启动两个并行执行的函数体,这正是 dataflow 擅长的地方。可以使用两个“永久循环”或更实际的:手动将乒乓缓冲写成显式两段流水线,通过 hls::stream 传递块索引同步。

实现真正重叠的改写方案,采用双进程 + 令牌同步模式,在 Vivado HLS dataflow 区域中让 load 与 process 完全并发执行。


#include <hls_stream.h>

#define N 1024
#define NUM_BLOCKS 10

// Load function: reads from input, writes to buffer, sends token
// Producer side of the token handshake.
//
// For each of the NUM_BLOCKS blocks:
//   1. read a buffer id from `load_sync` (0 selects buf0, any other value
//      selects buf1),
//   2. copy N words of `in`, starting at offset block * N, into that buffer,
//   3. write the same id to `process_sync`.
//
// in           : source data, indexed from 0 to NUM_BLOCKS * N - 1
// buf0, buf1   : the two N-word buffers that can be filled
// load_sync    : stream this function reads buffer ids from
// process_sync : stream this function writes filled-buffer ids to
void load_process(int* in, int buf0[N], int buf1[N],
                  hls::stream<int>& load_sync,
                  hls::stream<int>& process_sync) {
    int block = 0;
    while (block < NUM_BLOCKS) {
        // Obtain the id of the buffer to fill next.
        const int slot = load_sync.read();

        int* dst = buf1;
        if (slot == 0) {
            dst = buf0;
        }

        // Copy one block of input into the selected buffer.
        const int base = block * N;
        for (int k = 0; k != N; ++k) {
            dst[k] = in[base + k];
        }

        // Hand the filled buffer over by forwarding its id.
        process_sync.write(slot);
        ++block;
    }
}

// Process function: waits for token, processes buffer, sends back token
// Consumer side of the token handshake.
//
// For each of the NUM_BLOCKS blocks:
//   1. read a buffer id from `process_sync` (0 selects buf0, any other value
//      selects buf1),
//   2. write every word of that buffer, multiplied by 2, to `out` starting at
//      offset block * N,
//   3. write the same id to `load_sync`.
//
// out          : destination, indexed from 0 to NUM_BLOCKS * N - 1
// buf0, buf1   : the two N-word buffers that can be read
// process_sync : stream this function reads filled-buffer ids from
// load_sync    : stream this function writes released-buffer ids to
void process_process(int* out, int buf0[N], int buf1[N],
                     hls::stream<int>& process_sync,
                     hls::stream<int>& load_sync) {
    int block = 0;
    while (block < NUM_BLOCKS) {
        // Obtain the id of the buffer to consume next.
        const int slot = process_sync.read();

        int* src = buf1;
        if (slot == 0) {
            src = buf0;
        }

        // Example computation: double every element of the block.
        const int base = block * N;
        for (int k = 0; k != N; ++k) {
            out[base + k] = src[k] * 2;
        }

        // Return the buffer id so it can be refilled.
        load_sync.write(slot);
        ++block;
    }
}

// Top level of the token-synchronised double-buffer example.
//
// Declares two N-word buffers and two token streams, queues one token per
// buffer (ids 0 and 1) on `proc2load`, then calls load_process and
// process_process on the shared buffers and streams.
//
// in  : NUM_BLOCKS * N input words
// out : NUM_BLOCKS * N output words
//
// Changes from the original snippet:
//   * The STREAM / ARRAY_PARTITION pragmas on buf0 contained literal `...`
//     placeholders and were described as optional; they are removed so the
//     buffers stay plain arrays.
//   * Both token streams get an explicit depth of 2, because two tokens are
//     queued on proc2load before either function is called.
//
// NOTE(review): when this function is executed as ordinary sequential C++,
// load_process runs all NUM_BLOCKS iterations before process_process starts,
// so only the two tokens seeded below are available to it and no buffer is
// consumed in between. Confirm how hls::stream behaves on a read from an
// empty stream, and whether the tool accepts the proc2load feedback channel
// and the stream writes placed directly in a dataflow region, before relying
// on this pattern.
void top(int in[N * NUM_BLOCKS], int out[N * NUM_BLOCKS]) {
#pragma HLS dataflow
    int buf0[N], buf1[N];

    // Synchronisation streams.
    hls::stream<int> load2proc;  // ids of buffers that have been loaded
    hls::stream<int> proc2load;  // ids of buffers that have been processed and are free
#pragma HLS STREAM variable=load2proc depth=2
#pragma HLS STREAM variable=proc2load depth=2

    // Initially both buffers are free: queue one token for each.
    proc2load.write(0);
    proc2load.write(1);

    // Producer and consumer share the buffers and exchange buffer ids.
    load_process(in, buf0, buf1, proc2load, load2proc);
    process_process(out, buf0, buf1, load2proc, proc2load);
}

或者
#include <hls_stream.h>

#define N 1024
#define NUM_BLOCKS 10

// Load function: reads from input, writes to buffer, sends token
// Producer side of the token handshake.
//
// For each of the NUM_BLOCKS blocks:
//   1. read a buffer id from `load_sync` (0 selects buf0, any other value
//      selects buf1),
//   2. copy N words of `in`, starting at offset block * N, into that buffer,
//   3. write the same id to `process_sync`.
//
// in           : source data, indexed from 0 to NUM_BLOCKS * N - 1
// buf0, buf1   : the two N-word buffers that can be filled
// load_sync    : stream this function reads buffer ids from
// process_sync : stream this function writes filled-buffer ids to
void load_process(int* in, int buf0[N], int buf1[N],
                  hls::stream<int>& load_sync,
                  hls::stream<int>& process_sync) {
    int block = 0;
    while (block < NUM_BLOCKS) {
        // Obtain the id of the buffer to fill next.
        const int slot = load_sync.read();

        int* dst = buf1;
        if (slot == 0) {
            dst = buf0;
        }

        // Copy one block of input into the selected buffer.
        const int base = block * N;
        for (int k = 0; k != N; ++k) {
            dst[k] = in[base + k];
        }

        // Hand the filled buffer over by forwarding its id.
        process_sync.write(slot);
        ++block;
    }
}

// Process function: waits for token, processes buffer, sends back token
// Consumer side of the token handshake.
//
// For each of the NUM_BLOCKS blocks:
//   1. read a buffer id from `process_sync` (0 selects buf0, any other value
//      selects buf1),
//   2. write every word of that buffer, multiplied by 2, to `out` starting at
//      offset block * N,
//   3. write the same id to `load_sync`.
//
// out          : destination, indexed from 0 to NUM_BLOCKS * N - 1
// buf0, buf1   : the two N-word buffers that can be read
// process_sync : stream this function reads filled-buffer ids from
// load_sync    : stream this function writes released-buffer ids to
void process_process(int* out, int buf0[N], int buf1[N],
                     hls::stream<int>& process_sync,
                     hls::stream<int>& load_sync) {
    int block = 0;
    while (block < NUM_BLOCKS) {
        // Obtain the id of the buffer to consume next.
        const int slot = process_sync.read();

        int* src = buf1;
        if (slot == 0) {
            src = buf0;
        }

        // Example computation: double every element of the block.
        const int base = block * N;
        for (int k = 0; k != N; ++k) {
            out[base + k] = src[k] * 2;
        }

        // Return the buffer id so it can be refilled.
        load_sync.write(slot);
        ++block;
    }
}

// Top level of the token-synchronised double-buffer example.
//
// Declares two N-word buffers and two token streams, queues one token per
// buffer (ids 0 and 1) on `proc2load`, then calls load_process and
// process_process on the shared buffers and streams.
//
// in  : NUM_BLOCKS * N input words
// out : NUM_BLOCKS * N output words
//
// Changes from the original snippet:
//   * The STREAM / ARRAY_PARTITION pragmas on buf0 contained literal `...`
//     placeholders and were described as optional; they are removed so the
//     buffers stay plain arrays.
//   * Both token streams get an explicit depth of 2, because two tokens are
//     queued on proc2load before either function is called.
//
// NOTE(review): when this function is executed as ordinary sequential C++,
// load_process runs all NUM_BLOCKS iterations before process_process starts,
// so only the two tokens seeded below are available to it and no buffer is
// consumed in between. Confirm how hls::stream behaves on a read from an
// empty stream, and whether the tool accepts the proc2load feedback channel
// and the stream writes placed directly in a dataflow region, before relying
// on this pattern.
void top(int in[N * NUM_BLOCKS], int out[N * NUM_BLOCKS]) {
#pragma HLS dataflow
    int buf0[N], buf1[N];

    // Synchronisation streams.
    hls::stream<int> load2proc;  // ids of buffers that have been loaded
    hls::stream<int> proc2load;  // ids of buffers that have been processed and are free
#pragma HLS STREAM variable=load2proc depth=2
#pragma HLS STREAM variable=proc2load depth=2

    // Initially both buffers are free: queue one token for each.
    proc2load.write(0);
    proc2load.write(1);

    // Producer and consumer share the buffers and exchange buffer ids.
    load_process(in, buf0, buf1, proc2load, load2proc);
    process_process(out, buf0, buf1, load2proc, proc2load);
}


六、使用ap_memory + FSM实现乒乓buffer
#include <hls_stream.h>

// Compile-time configuration.
#define BLOCK_SIZE 1024 // number of words moved per block
#define NUM_BLOCKS 4    // maximum number of blocks handled per call

// States of the hand-written sequencer: load or process either buffer, or stop.
enum State { LOAD_BUF0, LOAD_BUF1, PROC_BUF0, PROC_BUF1, IDLE };

// Single-FSM double-buffer example.
//
// Copies `in` to `out` block by block through two local buffers, multiplying
// every word by 2. One word is moved per loop iteration. The buffers are used
// alternately (buf0, buf1, buf0, ...), but because there is only one state
// machine, loading and processing are strictly sequential and never overlap.
//
// in         : input words (ap_memory interface)
// out        : output words (ap_memory interface)
// total_size : number of words to process. Values above
//              BLOCK_SIZE * NUM_BLOCKS are clamped to that capacity; values
//              <= 0 process nothing. A final block shorter than BLOCK_SIZE
//              is handled.
//
// Fixes relative to the original snippet:
//   * The loop used to spin forever whenever total_size was smaller than
//     BLOCK_SIZE * NUM_BLOCKS: the IDLE state never advanced the block
//     counter, and a partial block never reached BLOCK_SIZE. The loop now
//     ends when the FSM reaches IDLE.
//   * PROC_BUF0 used to go back to LOAD_BUF0, so buf1 and the *_BUF1 states
//     were never used. The buffers now really alternate.
void pingpong_fsm(
    int* in,
    int* out,
    int total_size
) {
    // Top-level ports as ap_memory (BRAM-style port: addr, ce, d, q).
#pragma HLS INTERFACE ap_memory port=in
#pragma HLS INTERFACE ap_memory port=out
#pragma HLS INTERFACE s_axilite port=total_size
#pragma HLS INTERFACE s_axilite port=return

    // Local ping-pong buffers, requested as single-port block RAM.
    int buf0[BLOCK_SIZE];
    int buf1[BLOCK_SIZE];
#pragma HLS RESOURCE variable=buf0 core=RAM_1P_BRAM
#pragma HLS RESOURCE variable=buf1 core=RAM_1P_BRAM

    // The FSM reads and writes each buffer in separate phases; tell the tool
    // there is no loop-carried dependence through the buffers.
#pragma HLS dependence variable=buf0 inter false
#pragma HLS dependence variable=buf1 inter false

    // Number of words actually processed: total_size clamped to [0, capacity].
    const int capacity = BLOCK_SIZE * NUM_BLOCKS;
    int limit = total_size;
    if (limit > capacity) {
        limit = capacity;
    }
    if (limit < 0) {
        limit = 0;
    }

    // Sequencer registers.
    State state = (limit > 0) ? LOAD_BUF0 : IDLE;
    int in_addr = 0;    // next word to read from `in`
    int out_addr = 0;   // next word to write to `out`
    int local_addr = 0; // position inside the active buffer
    int block_len = 0;  // number of valid words in the block just loaded

    // One read or one write per iteration. A LOAD state is only entered while
    // in_addr < limit, so every block holds at least one word.
    while (state != IDLE) {
        // Target one FSM step per clock cycle.
#pragma HLS pipeline II=1
        switch (state) {
        case LOAD_BUF0: {
            buf0[local_addr] = in[in_addr];
            in_addr++;
            local_addr++;
            // Block full, or input exhausted (short final block).
            if (local_addr == BLOCK_SIZE || in_addr == limit) {
                block_len = local_addr;
                local_addr = 0;
                state = PROC_BUF0;
            }
            break;
        }
        case LOAD_BUF1: {
            buf1[local_addr] = in[in_addr];
            in_addr++;
            local_addr++;
            if (local_addr == BLOCK_SIZE || in_addr == limit) {
                block_len = local_addr;
                local_addr = 0;
                state = PROC_BUF1;
            }
            break;
        }
        case PROC_BUF0: {
            out[out_addr] = buf0[local_addr] * 2; // example processing: multiply by 2
            out_addr++;
            local_addr++;
            if (local_addr == block_len) {
                local_addr = 0;
                // Switch to the other buffer for the next block, if any.
                state = (in_addr < limit) ? LOAD_BUF1 : IDLE;
            }
            break;
        }
        case PROC_BUF1: {
            out[out_addr] = buf1[local_addr] * 2;
            out_addr++;
            local_addr++;
            if (local_addr == block_len) {
                local_addr = 0;
                state = (in_addr < limit) ? LOAD_BUF0 : IDLE;
            }
            break;
        }
        default: // IDLE: unreachable inside the loop, kept for completeness
            break;
        }
    }
}

http://www.jsqmd.com/news/1007629/

相关文章:

  • 2026年6月诚信的马弗炉供应商口碑分析,高精度测硫仪/环保型对辊破碎机/实验室小型对辊破碎机,马弗炉制造企业推荐 - 品牌推荐师
  • 少走弯路:盘点2026年王者级的AI论文写作工具
  • 怎么编写一个shell脚本,用户输入软件包自动识别系统,然后安装
  • FPGA时序收敛实战:手把手教你用Vivado正确处理时钟域与生成时钟
  • 2026手机端外语口音语音克隆工具实测:口音还原、语种覆盖选型指南 - GrowthUME
  • 5G URLLC低时延保障:深入解析PUSCH Repetition Type B与无效符号处理机制
  • 嵌入式开发硬核技能:SPI与Quad Timer寄存器级编程实战解析
  • 别光看理论!拆解MIPS指令字:LW、SW这些信号在CPU单总线里到底怎么‘蹦’出来的?
  • Xilinx FPGA上跑的8路并行低通滤波器工程包(含MATLAB信号生成与频谱分析)
  • 手把手复现:用Python仿真验证电容容抗公式1/(j*2*pi*f*C),附代码与波形分析
  • 2026科技驱动型EMBA客观测评:理性选型与项目对比 - 品牌2026推荐
  • 【jupyter notebook】中文符号需要按两次才能输入
  • 高数期末救命!72道不定积分题里,这5类‘换元法’套路必须掌握(附解题模板)
  • 别再只盯着准确率了!手把手教你用颜色矩+SVM做图像分类时的模型调优与评估陷阱
  • 2026年6月评价高的电加热器实力厂家哪家靠谱,小型导热油加热器/反应釜油加热器/空气电加热器,电加热器企业哪家强 - 品牌推荐师
  • MyBatis-Plus动态查询实战:用QueryWrapper的and()和or()优雅构建商品筛选与权限查询
  • 深度解析发酵饲料:核心原理、应用价值与养殖实践 - 速递信息
  • 终端与IDE形态的vibe coding实测:两款AI编程工具迭代能力对比
  • LangChain工程化实践:从提示词到AI原生架构
  • 2026靠前境内外EMBA客观测评:理性择校全指南 - 品牌2026推荐
  • 别再死记硬背了!用RTA-OS配置Task优先级和调度策略,看完这篇就够了
  • 告别日志混乱!用CAPL的setLogFileName和writeToLogEx打造自动化测试日志系统(Vector CANoe实战)
  • 郑州黄金珠宝回收哪家靠谱?24 小时上门、无套路变现,本地人可参考这两家! - 同城好物推荐官
  • 2026年6月在线浊度计知名品牌排行榜:国产力量崛起与技术格局重塑 - 液体流量液位品牌推荐
  • ParsecVDisplay虚拟显示器实战指南:3个高级技巧打造专业级多屏工作站
  • Spring MVC传统XML配置版登录注册实战项目(含MySQL建表脚本与完整工程结构)
  • 从IG发送器到CAPL脚本:手把手调试CAN(FD)报文属性(BRS/FDF/BitCount)
  • i.MX21 GPIO与PWM寄存器深度解析与嵌入式开发实战指南
  • 深度探索:解锁联想刃7000k隐藏性能的实战之旅
  • 手把手教你玩转CAPL Message:从IG发送器触发到自定义报文解析的完整流程