YOLO26N 姿态估计 TensorRT 部署:Jetson 实时推理
1. TensorRT 转换
fromultralyticsimportYOLO model=YOLO("yolo26n-pose.pt")model.export(format="engine",imgsz=640,half=True,batch=1)
# 或 trtexec/usr/src/tensorrt/bin/trtexec\--onnx=yolo26n-pose.onnx\--saveEngine=yolo26n-pose.engine\--fp16\--workspace=2048
2. TensorRT 推理封装
#!/usr/bin/env python3"""trt_pose.py - TensorRT 姿态估计推理"""importtensorrtastrtimportpycuda.driverascudaimportpycuda.autoinitimportnumpyasnpimportcv2importtimeclassTRTPoseDetector:def__init__(self,engine_path,conf_thresh=0.3):self.conf_thresh=conf_thresh logger=trt.Logger(trt.Logger.WARNING)runtime=trt.Runtime(logger)withopen(engine_path,"rb")asf:self.engine=runtime.deserialize_cuda_engine(f.read())self.context=self.engine.create_execution_context()self.inputs=[]self.outputs=[]self.bindings=[]self.stream=cuda.Stream()foriinrange(self.engine.num_io_tensors):name=self.engine.get_tensor_name(i)dtype=trt.nptype(self.engine.get_tensor_dtype(name))shape=self.engine.get_tensor_shape(name)shape=tuple(max(1,s)ifs>=0else1forsinshape)size=trt.volume(shape)host_mem=cuda.pagelocked_empty(size,dtype)device_mem=cuda.mem_alloc(host_mem.nbytes)self.bindings.append(int(device_mem))info={"name":name,"host":host_mem,"device":device_mem,"shape":shape}ifself.engine.get_tensor_mode(name)==trt.TensorIOMode.INPUT:self.inputs.append(info)else:self.outputs.append(info)defpreprocess(self,image):h,w=image.shape[:2]scale=min(640/h,640/w)new_h,new_w=int(h*scale),int(w*scale)resized=cv2.resize(image,(new_w,new_h))canvas=np.full((640,640,3),114,dtype=np.uint8)dy,dx=(640-new_h)//2,(640-new_w)//2canvas[dy:dy+new_h,dx:dx+new_w]=resized blob=canvas[:,:,::-1].transpose(2,0,1).astype(np.float32)/255.0returnnp.expand_dims(blob,axis=0),scale,(dy,dx)defdetect(self,image):blob,scale,pad=self.preprocess(image)np.copyto(self.inputs[0]["host"],blob.ravel())cuda.memcpy_htod_async(self.inputs[0]["device"],self.inputs[0]["host"],self.stream)self.context.set_input_shape(self.inputs[0]["name"],blob.shape)self.context.execute_async_v2(bindings=self.bindings,stream_handle=self.stream.handle)cuda.memcpy_dtoh_async(self.outputs[0]["host"],self.outputs[0]["device"],self.stream)self.stream.synchronize()output=self.outputs[0]["host"].reshape(self.outputs[0]["shape"])returnself.postprocess(output,scale,pad,image.shape[:2])defpostprocess(self,output,scale,pad,orig_shape):predictions=output[0].T# [8400, 56]boxes=predictions[:,:4]scores=predictions[:,4]kpts=predictions[:,6:].reshape(-1,17,3)mask=scores>self.conf_thresh boxes=boxes[mask]scores=scores[mask]kpts=kpts[mask]# 坐标还原dy,dx=pad boxes[:,[0,2]]=(boxes[:,[0,2]]-dx)/scale boxes[:,[1,3]]=(boxes[:,[1,3]]-dy)/scale kpts[:,:,0]=(kpts[:,:,0]-dx)/scale kpts[:,:,1]=(kpts[:,:,1]-dy)/scale results=[]foriinrange(len(boxes)):results.append({'bbox':boxes[i].tolist(),'score':float(scores[i]),'keypoints':kpts[i].tolist(),})returnresults# 骨架连接SKELETON=[(0,1),(0,2),(1,3),(2,4),(5,6),(5,7),(6,8),(7,9),(8,10),(11,12),(11,13),(12,14),(13,15),(14,16),(5,11),(6,12),]defdraw_pose(image,detections):fordetindetections:kpts=det['keypoints']for(i,j)inSKELETON:ifkpts[i][2]>0.3andkpts[j][2]>0.3:pt1=(int(kpts[i][0]),int(kpts[i][1]))pt2=(int(kpts[j][0]),int(kpts[j][1]))cv2.line(image,pt1,pt2,(0,255,0),2)for(x,y,vis)inkpts:ifvis>0.3:cv2.circle(image,(int(x),int(y)),3,(0,0,255),-1)returnimageif__name__=="__main__":model=TRTPoseDetector("yolo26n-pose.engine")cap=cv2.VideoCapture(0)fps_count,fps_start=0,time.time()whileTrue:ret,frame=cap.read()ifnotret:breakdetections=model.detect(frame)frame=draw_pose(frame,detections)fps_count+=1iftime.time()-fps_start>=1.0:fps=fps_count/(time.time()-fps_start)fps_count,fps_start=0,time.time()cv2.putText(frame,f"FPS:{fps:.0f}",(10,30),cv2.FONT_HERSHEY_SIMPLEX,1,(0,255,0),2)cv2.imshow("Pose",frame)ifcv2.waitKey(1)&0xFF==ord('q'):breakcap.release()cv2.destroyAllWindows()
3. 性能基准
YOLO26N-Pose TensorRT 性能(640x640): ┌──────────────────┬──────────┬──────────┐ │ 设备 │ FP16 │ INT8 │ ├──────────────────┼──────────┼──────────┤ │ Jetson Orin NX │ 5.2ms │ 3.8ms │ │ Jetson Orin Nano │ 9.5ms │ 6.5ms │ │ RTX 4090 │ 1.5ms │ 1.1ms │ │ RTX 3060 │ 4.2ms │ 3.0ms │ └──────────────────┴──────────┴──────────┘
总结
| 步骤 | 工具 | 输出 |
|---|
| 导出 ONNX | Ultralytics | .onnx |
| 转 TensorRT | trtexec / Ultralytics | .engine |
| 推理 | TRTPoseDetector | 关键点 |
| 可视化 | draw_pose | 骨架图 |