当前位置：首页 > news >正文

RTL设计和HLS高层次设计

news 2026/6/16 15:45:26

一、RTL设计和HLS高层次设计
1.RTL设计需要关注微架构层面的决策；高层次设计不需要制定微架构决策，关注的是宏架构设计；
2.FSM状态机的创建、数据路径、寄存器流水线等细节交给HLS工具（编译器）来处理；
3.高层次综合根据用户提供的约束来生成优化的RTL；
4.高层次综合在宏架构层面给出解决方案，在性能和面积之间做权衡和取舍。

二、任务级别的并行度
1.可以使用dataflow编译指令，或者使用hls::task对象来显式创建并行度；
2.存储器架构：
访问全局存储器会产生更高的时延成本，可能耗费大量周期；而访问本地存储器通常十分快速，只需一个或多个周期即可。

三、函数级别并行设计
1.在函数级别实现任务级并行度：为实现任务级并行度，需将各个循环拆分到多个独立的函数中，即把原始 compute() 函数拆分为多个子函数。根据经验法则，顺序调用的函数可以并发执行，顺序执行的循环则可以流水线化（流水打拍）。

2.没有优化之前的代码：
// Baseline (unoptimized) version: four sequential loops over the same index
// range that communicate through two local scratch arrays.
//
//   in  - input words, totalNumWords elements
//   Out - output words, totalNumWords elements
void compute(data_t in[totalNumWords], data_t Out[totalNumWords]) {
    data_t tmp1[totalNumWords], tmp2[totalNumWords];

    // A: fan-out. Both branches start from the same scaled input.
    A: for (int i = 0; i < totalNumWords; ++i) {
        tmp1[i] = in[i] * 3;
        tmp2[i] = in[i] * 3;
    }

    // B: first branch, updated in place.
    B: for (int i = 0; i < totalNumWords; ++i) {
        tmp1[i] = tmp1[i] + 25;
    }

    // C: second branch, updated in place.
    C: for (int i = 0; i < totalNumWords; ++i) {
        tmp2[i] = tmp2[i] * 2;
    }

    // D: fan-in. The result is written through the declared parameter `Out`;
    // the previous spelling `out` did not name any declared variable.
    D: for (int i = 0; i < totalNumWords; ++i) {
        Out[i] = tmp1[i] + tmp2[i] * 2;
    }
}

3.优化设计

#include "diamond.h"
// NOTE(review): NUM_WORDS is not referenced by any code shown in this file.
// Presumably it is the number of words packed into one vecOf16Words (compare
// the literal 16 in diamond()'s assert) - confirm before relying on it.
#define NUM_WORDS 16
extern "C" {
/*
 * Top-level function of the diamond-shaped pipeline:
 *
 *     load -> compute_A -+-> compute_B -+-> compute_D -> store
 *                        +-> compute_C -+
 *
 * vecIn  - source buffer, read by load()
 * vecOut - destination buffer, written by store()
 * size   - iteration count handed to every stage; must be a multiple of 16
 */
void diamond(vecOf16Words* vecIn, vecOf16Words* vecOut, int size)
{
    // One stream per edge of the graph above.
    hls::stream<vecOf16Words> c0;  // load      -> compute_A
    hls::stream<vecOf16Words> c1;  // compute_A -> compute_B
    hls::stream<vecOf16Words> c2;  // compute_A -> compute_C
    hls::stream<vecOf16Words> c3;  // compute_B -> compute_D
    hls::stream<vecOf16Words> c4;  // compute_C -> compute_D
    hls::stream<vecOf16Words> c5;  // compute_D -> store

    assert(size % 16 == 0);

#pragma HLS dataflow
    load(vecIn, c0, size);
    compute_A(c0, c1, c2, size);
    compute_B(c1, c3, size);
    compute_C(c2, c4, size);
    compute_D(c3, c4, c5, size);
    store(c5, vecOut, size);
}
}
/*
 * Source stage: pushes in[0] .. in[size-1], in index order, onto `out`.
 */
void load(vecOf16Words *in, hls::stream<vecOf16Words >& out, int size)
{
Loop0:
    for (int idx = 0; idx < size; ++idx)
    {
#pragma HLS PERFORMANCE target_ti=32
#pragma HLS LOOP_TRIPCOUNT max=32
        const vecOf16Words word = in[idx];
        out.write(word);
    }
}
/*
 * Stage A (fan-out): for each of `size` iterations, reads one value from
 * `in`, multiplies it by 3 and writes the product to both `out1` and `out2`.
 */
void compute_A(hls::stream<vecOf16Words >& in, hls::stream<vecOf16Words >&
out1, hls::stream<vecOf16Words >& out2, int size)
{
Loop0:
    for (int idx = 0; idx < size; ++idx)
    {
#pragma HLS PERFORMANCE target_ti=32
#pragma HLS LOOP_TRIPCOUNT max=32
        const vecOf16Words word = in.read();
        // Both consumers receive the same scaled value.
        const vecOf16Words tripled = word * 3;
        out1.write(tripled);
        out2.write(tripled);
    }
}
/*
 * Stage B: for each of `size` iterations, reads one value from `in`, adds 25
 * and writes the sum to `out`.
 */
void compute_B(hls::stream<vecOf16Words >& in, hls::stream<vecOf16Words >&
out, int size)
{
Loop0:
    for (int idx = 0; idx < size; ++idx)
    {
#pragma HLS PERFORMANCE target_ti=32
#pragma HLS LOOP_TRIPCOUNT max=32
        const vecOf16Words word = in.read();
        out.write(word + 25);
    }
}
/*
 * Stage C: for each of `size` iterations, reads one value from `in`, doubles
 * it and writes the result to `out`.
 */
void compute_C(hls::stream<vecOf16Words >& in, hls::stream<vecOf16Words >&
out, int size)
{
    // The counter is a plain int, matching `size` and the loops in load(),
    // compute_A(), compute_B() and store(). It was previously declared as
    // data_t, the payload type, which is not guaranteed to cover the range
    // of `size`.
Loop0:
    for (int i = 0; i < size; i++)
    {
#pragma HLS PERFORMANCE target_ti=32
#pragma HLS LOOP_TRIPCOUNT max=32
        out.write(in.read() * 2);
    }
}
/*
 * Stage D (fan-in): for each of `size` iterations, reads one value from `in1`
 * and one from `in2` and writes their sum to `out`.
 */
void compute_D(hls::stream<vecOf16Words >& in1, hls::stream<vecOf16Words >&
in2, hls::stream<vecOf16Words >& out, int size)
{
    // The counter is a plain int, matching `size` and the loops in load(),
    // compute_A(), compute_B() and store(). It was previously declared as
    // data_t, the payload type, which is not guaranteed to cover the range
    // of `size`.
Loop0:
    for (int i = 0; i < size; i++)
    {
#pragma HLS PERFORMANCE target_ti=32
#pragma HLS LOOP_TRIPCOUNT max=32
        // Each stream is read in its own statement so the read order is
        // fixed (in1 first, then in2); inside a single `a + b` expression
        // the operand evaluation order is unspecified.
        const vecOf16Words lhs = in1.read();
        const vecOf16Words rhs = in2.read();
        // NOTE(review): loop D of the array-based compute() earlier in this
        // file evaluates tmp1[i] + tmp2[i] * 2, whereas this stage adds the
        // two inputs without the extra "* 2". Confirm which is intended.
        out.write(lhs + rhs);
    }
}
/*
 * Sink stage: pops `size` values from `in` and stores them, in arrival
 * order, to out[0] .. out[size-1].
 */
void store(hls::stream<vecOf16Words >& in, vecOf16Words *out, int size)
{
Loop0:
    for (int idx = 0; idx < size; ++idx)
    {
#pragma HLS PERFORMANCE target_ti=32
#pragma HLS LOOP_TRIPCOUNT max=32
        const vecOf16Words word = in.read();
        out[idx] = word;
    }
}