
Commit c663f24

update
1 parent c3f3dc0 commit c663f24

File tree: 13 files changed, +517 −11 lines


CMakeLists.txt

Lines changed: 2 additions & 1 deletion
@@ -58,7 +58,8 @@ cuda_add_library(utils_cu_cpp SHARED ${cpp_cuda_srcs})
 # add_executable(infer mains/main_yolov8_det.cpp)
 # add_executable(infer mains/main_yolov8_seg.cpp)
-add_executable(infer mains/main_rtdetr.cpp)
+add_executable(infer mains/main_yolov8_pose.cpp)
+# add_executable(infer mains/main_rtdetr.cpp)
 # 8. Link all of the required .so libraries
 target_link_libraries(infer
     utils_cu_cpp # the .so library compiled above

README.md

Lines changed: 4 additions & 3 deletions
@@ -9,7 +9,7 @@
 - [YOLOv8-Engine model export tutorial](application/yolov8_app/README.md)
 - [yolov8-detection CUDA version](application/yolov8_app/yolov8_det_cuda)
 - [yolov8-segment CUDA version](application/yolov8_app/yolov8_seg_cuda)
-- [yolov8-pose CUDA version](coming soon)
+- [yolov8-pose CUDA version](application/yolov8_app/yolov8_pose_cuda)
 ## Inference code for other backends
 - [ Openvino ] coming soon
 - [ NCNN ] coming soon
@@ -31,8 +31,8 @@ AiInfer
 |--memory.hpp # utility class for allocating and freeing cpu/gpu memory
 |--model_info.hpp # common pre/post-processing parameters for the models, e.g. mean/std, nms thresholds
 |--utils.hpp # common cpp helper functions: timing, mkdir, etc.
-|--post_process # post-processing directory, including cpp and cuda-accelerated post-processing
-|--pre_process # pre-processing directory, including cpp and cuda-accelerated pre-processing
+|--post_process # post-processing directory, cuda-accelerated; custom post-processing can also go here
+|--pre_process # pre-processing directory, cuda-accelerated; custom pre-processing can also go here
 |--workspaces # working directory; put test images/videos and models here and use relative paths in main.cpp
 |--mains # the collection of main.cpp files, one per app for clarity; putting them all in one file would be too cluttered
 ```
@@ -43,6 +43,7 @@ AiInfer

 - VSCode is recommended on linux, Visual Studio 2019 on windows
 - Install the GPU driver, cuda, cudnn, opencv, tensorrt [installation tutorial](https://zhuanlan.zhihu.com/p/624170244)
+
 </details>

 <details>

application/yolov8_app/README.md

Lines changed: 10 additions & 2 deletions
@@ -10,6 +10,12 @@ if dynamic:
     dynamic['output1'] = {0: 'batch', 2: 'mask_height', 3: 'mask_width'}
 elif isinstance(self.model, DetectionModel):
     dynamic['output0'] = {0: 'batch'}
+
+# Note: exporting the yolov8-pose task has a small catch. The author applies sigmoid to the
+# pose-branch scores in place, via tensor.sigmoid_(); ONNX export does not record this
+# in-place call as a Sigmoid op, so the exported pose-score branch is wrong. Fix: in the
+# kpts_decode method of the Pose class in ultralytics/nn/modules.py, change
+y[:, 2::3].sigmoid_()  # change this line into the form below
+y[:, 2::3] = y[:, 2::3].sigmoid()
 ```
 - Then export each yolov8 task model with the command below; note that the default imgsz is 640x640, change it to match your actual model
 ```bash
@@ -25,7 +31,7 @@ yolo export \
 - The yolov8 detection branch exports ONNX with shape=[-1, box_num+cls_num, 8400]; with the box dimension last, the memory of a single box is not contiguous. Fix:
 ```bash
 # Background: yolov3/4/5/x/6/7 all output [-1, 8400, box_num+cls_num]; yolov8 alone puts the box dimension last, so it has to be transposed
-# Use assets/yolov8_onnx_trans.py to transpose the last layer directly; applies to detect and segment, pose does not need it
+# Use assets/yolov8_onnx_trans.py to transpose the last layer [detect, segment and pose all need the transform], i.e. move the 8400 dimension to the front
 ```
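A minimal C++ sketch (illustrative only, not part of this commit) of why the transposed layout decodes more cheaply: after the transform each candidate's values sit in one contiguous row, while the raw yolov8 layout forces strided reads. All names below are made up for the example.

```cpp
#include <cstddef>

// After yolov8_onnx_trans.py: layout [num_anchors, num_elems], e.g. 8400 rows.
void decode_transposed(const float *output, int num_anchors, int num_elems)
{
    for (int i = 0; i < num_anchors; ++i)
    {
        const float *pbox = output + (size_t)i * num_elems; // one contiguous row per candidate
        (void)pbox; // pbox[0..3] = box, the rest = scores (and keypoints for pose)
    }
}

// Raw yolov8 layout [num_elems, num_anchors]: attribute k of candidate i lives at
// output[k * num_anchors + i], so reading one candidate touches num_elems strided
// locations -- cache-unfriendly on the CPU and uncoalesced on the GPU.
void decode_raw(const float *output, int num_anchors, int num_elems)
{
    for (int i = 0; i < num_anchors; ++i)
        for (int k = 0; k < num_elems; ++k)
        {
            float v = output[(size_t)k * num_anchors + i];
            (void)v;
        }
}
```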
### Generating an engine file from the yolov8 ONNX
- The fp16 command is below; its precision loss is small, so trtexec can be used directly
@@ -42,4 +48,6 @@ trtexec --onnx=xxx_det_seg_pose_trans.onnx \
 - [SenseTime's ppq int8 quantization tool, supporting tensorrt|openvino|mnn|ncnn|...](https://github.com/openppl-public/ppq)
 - [If you are unsure how to use ppq, see the yolov6 quantization tutorial:](https://github.com/meituan/YOLOv6/tree/main/tools/quantization/ppq)

-**Then feed the generated engine model into this project for inference**
+**Then feed the generated engine model into this project for inference**
+### Inference results produced with this project
+![yolov8](../../assets/yolov8_det_seg_pose_res.png)
application/yolov8_app/yolov8_pose_cuda/yolov8_pose.cpp

Lines changed: 228 additions & 0 deletions
@@ -0,0 +1,228 @@
#include "yolov8_pose.hpp"

namespace tensorrt_infer
{
    namespace yolov8_cuda
    {
        void YOLOv8Pose::initParameters(const std::string &engine_file, float score_thr, float nms_thr)
        {
            if (!file_exist(engine_file))
            {
                INFO("Error: engine_file does not exist!!!");
                exit(0);
            }

            this->model_info = std::make_shared<ModelInfo>();
            // store the configuration passed in as parameters
            model_info->m_modelPath = engine_file;
            model_info->m_postProcCfg.confidence_threshold_ = score_thr;
            model_info->m_postProcCfg.nms_threshold_ = nms_thr;

            this->model_ = trt::infer::load(engine_file); // load the infer object
            this->model_->print();                        // print basic engine info

            // fetch the input shape info
            auto input_dim = this->model_->get_network_dims(0); // input dimensions
            model_info->m_preProcCfg.infer_batch_size = input_dim[0];
            model_info->m_preProcCfg.network_input_channels_ = input_dim[1];
            model_info->m_preProcCfg.network_input_height_ = input_dim[2];
            model_info->m_preProcCfg.network_input_width_ = input_dim[3];
            model_info->m_preProcCfg.network_input_numel = input_dim[1] * input_dim[2] * input_dim[3];
            model_info->m_preProcCfg.isdynamic_model_ = this->model_->has_dynamic_dim();
            // configure the image pre-processing: yolov8 divides by 255 and takes RGB input
            model_info->m_preProcCfg.normalize_ = Norm::alpha_beta(1 / 255.0f, 0.0f, ChannelType::RGB);

            // fetch the output shape info
            auto output_dim = this->model_->get_network_dims(1);
            model_info->m_postProcCfg.bbox_head_dims_ = output_dim;
            model_info->m_postProcCfg.bbox_head_dims_output_numel_ = output_dim[1] * output_dim[2];
            if (model_info->m_postProcCfg.pose_num_ == 0)
                model_info->m_postProcCfg.pose_num_ = (int)((output_dim[2] - 5) / 3); // yolov8 pose, 5: xmin,ymin,xmax,ymax,score
            model_info->m_postProcCfg.NUM_BOX_ELEMENT += model_info->m_postProcCfg.pose_num_ * 3; // 3: pose_x,pose_y,pose_score
            model_info->m_postProcCfg.IMAGE_MAX_BOXES_ADD_ELEMENT = model_info->m_postProcCfg.MAX_IMAGE_BOXES * model_info->m_postProcCfg.NUM_BOX_ELEMENT;
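            // Worked example (assuming a COCO-style 17-keypoint model, which this file does not
            // state explicitly): output_dim[2] = 4 box coords + 1 score + 17 * 3 keypoint values
            // = 56, so pose_num_ = (56 - 5) / 3 = 17 and NUM_BOX_ELEMENT grows by 17 * 3 = 51
            // floats per decoded box.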
            CHECK(cudaStreamCreate(&cu_stream)); // create the cuda stream
        }

        YOLOv8Pose::~YOLOv8Pose()
        {
            CHECK(cudaStreamDestroy(cu_stream)); // destroy the cuda stream
        }

        void YOLOv8Pose::adjust_memory(int batch_size)
        {
            // allocate the memory needed for the model input and output
            input_buffer_.gpu(batch_size * model_info->m_preProcCfg.network_input_numel);           // gpu memory for batch_size model inputs
            bbox_predict_.gpu(batch_size * model_info->m_postProcCfg.bbox_head_dims_output_numel_); // gpu memory for batch_size model outputs

            // memory for the boxes parsed from the model output; +32 because the first element
            // holds the box count, guarding against overflow
            output_boxarray_.gpu(batch_size * (32 + model_info->m_postProcCfg.IMAGE_MAX_BOXES_ADD_ELEMENT));
            output_boxarray_.cpu(batch_size * (32 + model_info->m_postProcCfg.IMAGE_MAX_BOXES_ADD_ELEMENT));

            if ((int)preprocess_buffers_.size() < batch_size)
            {
                for (int i = preprocess_buffers_.size(); i < batch_size; ++i)
                    preprocess_buffers_.push_back(make_shared<Memory<unsigned char>>()); // add batch_size Memory objects
            }

            // allocate batch_size affine matrices; the batch is dynamic as well, so it is handled right here
            if ((int)affine_matrixs.size() < batch_size)
            {
                for (int i = affine_matrixs.size(); i < batch_size; ++i)
                    affine_matrixs.push_back(AffineMatrix()); // add batch_size AffineMatrix objects
            }
        }
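        // Assumption about the Memory helper (see common/memory.hpp): gpu()/cpu() with a size
        // argument only reallocates when the request exceeds the current capacity, so calling
        // adjust_memory() on every forwards() is cheap once the largest batch has been seen.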
        void YOLOv8Pose::preprocess_gpu(int ibatch, const Image &image,
                                        shared_ptr<Memory<unsigned char>> preprocess_buffer, AffineMatrix &affine,
                                        cudaStream_t stream_)
        {
            if (image.channels != model_info->m_preProcCfg.network_input_channels_)
            {
                INFO("Warning: the number of channels wanted differs from the number of channels in the actual image\n");
                exit(-1);
            }

            affine.compute(make_tuple(image.width, image.height),
                           make_tuple(model_info->m_preProcCfg.network_input_width_, model_info->m_preProcCfg.network_input_height_));
            float *input_device = input_buffer_.gpu() + ibatch * model_info->m_preProcCfg.network_input_numel; // gpu pointer for the current batch item
            size_t size_image = image.width * image.height * image.channels;
            size_t size_matrix = upbound(sizeof(affine.d2i), 32);                      // round up to a multiple of 32
            uint8_t *gpu_workspace = preprocess_buffer->gpu(size_matrix + size_image); // allocate gpu memory for the affine matrix + image together
            float *affine_matrix_device = (float *)gpu_workspace;
            uint8_t *image_device = gpu_workspace + size_matrix; // the image part starts right after the affine matrix

            // same as above, but allocating cpu memory
            uint8_t *cpu_workspace = preprocess_buffer->cpu(size_matrix + size_image);
            float *affine_matrix_host = (float *)cpu_workspace;
            uint8_t *image_host = cpu_workspace + size_matrix;

            // these copies are not redundant: they move the data from pageable to pinned memory,
            // which speeds up the subsequent transfer to gpu memory
            memcpy(image_host, image.bgrptr, size_image);               // copy the image
            memcpy(affine_matrix_host, affine.d2i, sizeof(affine.d2i)); // copy the affine matrix

            // cpu --> gpu; image_host could be replaced with image.bgrptr (deleting the lines above), but that is ~0.02 ms slower
            checkRuntime(cudaMemcpyAsync(image_device, image_host, size_image, cudaMemcpyHostToDevice, stream_)); // upload the image to gpu memory
            checkRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, sizeof(affine.d2i),
                                         cudaMemcpyHostToDevice, stream_)); // upload the affine matrix to gpu memory
            // perform resize + fill[114]
            warp_affine_bilinear_and_normalize_plane(image_device, image.width * image.channels, image.width,
                                                     image.height, input_device, model_info->m_preProcCfg.network_input_width_,
                                                     model_info->m_preProcCfg.network_input_height_, affine_matrix_device, const_value,
                                                     model_info->m_preProcCfg.normalize_, stream_);
        }
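        // Note (an assumption based on similar tensorRT_Pro-style pipelines, not stated in this
        // file): affine.d2i is the inverse letterbox mapping, taking a coordinate in the
        // network-input plane back to the original image. The warp kernel uses it to sample
        // source pixels, and the decode kernel below reuses the same matrix to map boxes and
        // keypoints back to original-image coordinates.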
        void YOLOv8Pose::postprocess_gpu(int ibatch, cudaStream_t stream_)
        {
            // boxarray_device: gpu pointer where the parsed inference results are stored
            float *boxarray_device = output_boxarray_.gpu() + ibatch * (32 + model_info->m_postProcCfg.IMAGE_MAX_BOXES_ADD_ELEMENT);
            // affine_matrix_device: gpu pointer to the affine matrix (+ size_image); used to scale the decoded boxes back to the original image size
            float *affine_matrix_device = (float *)preprocess_buffers_[ibatch]->gpu();
            // image_based_bbox_output: gpu pointer to all predicted boxes produced by inference
            float *image_based_bbox_output = bbox_predict_.gpu() + ibatch * model_info->m_postProcCfg.bbox_head_dims_output_numel_;

            checkRuntime(cudaMemsetAsync(boxarray_device, 0, sizeof(int), stream_)); // zero the box counter in the first element
            decode_pose_yolov8_kernel_invoker(image_based_bbox_output, model_info->m_postProcCfg.bbox_head_dims_[1], model_info->m_postProcCfg.pose_num_,
                                              model_info->m_postProcCfg.bbox_head_dims_[2], model_info->m_postProcCfg.confidence_threshold_,
                                              affine_matrix_device, boxarray_device, model_info->m_postProcCfg.MAX_IMAGE_BOXES,
                                              model_info->m_postProcCfg.NUM_BOX_ELEMENT, stream_);

            // run nms on the score-filtered boxes
            nms_kernel_invoker(boxarray_device, model_info->m_postProcCfg.nms_threshold_, model_info->m_postProcCfg.MAX_IMAGE_BOXES,
                               model_info->m_postProcCfg.NUM_BOX_ELEMENT, stream_);
        }
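        // Layout of one image's slot in output_boxarray_, as read back by parser_box below:
        //   [0] = box count, then per box: x1, y1, x2, y2, score, label, keepflag,
        //   followed by pose_num_ * (pose_x, pose_y, pose_score) triplets.
        // keepflag marks whether the box survived nms; only keepflag == 1 boxes are kept.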
        BatchPoseBoxArray YOLOv8Pose::parser_box(int num_image)
        {
            BatchPoseBoxArray arrout(num_image);
            for (int ib = 0; ib < num_image; ++ib)
            {
                float *parray = output_boxarray_.cpu() + ib * (32 + model_info->m_postProcCfg.IMAGE_MAX_BOXES_ADD_ELEMENT);
                int count = min(model_info->m_postProcCfg.MAX_IMAGE_BOXES, (int)*parray);
                PoseBoxArray &output = arrout[ib];
                output.reserve(count); // raise the vector capacity to at least count
                for (int i = 0; i < count; ++i)
                {
                    float *pbox = parray + 1 + i * model_info->m_postProcCfg.NUM_BOX_ELEMENT;
                    int label = pbox[5];
                    int keepflag = pbox[6];
                    if (keepflag == 1)
                    {
                        PoseBox result_object_box(pbox[0], pbox[1], pbox[2], pbox[3], pbox[4], label);
                        result_object_box.pose = make_shared<InstancePose>();
                        for (int pindex = 7; pindex < model_info->m_postProcCfg.NUM_BOX_ELEMENT; pindex += 3)
                            result_object_box.pose->pose_data.push_back({pbox[pindex], pbox[pindex + 1], pbox[pindex + 2]});
                        output.emplace_back(result_object_box);
                    }
                }
            }

            return arrout;
        }

        PoseBoxArray YOLOv8Pose::forward(const Image &image)
        {
            auto output = forwards({image});
            if (output.empty())
                return {};
            return output[0];
        }

        BatchPoseBoxArray YOLOv8Pose::forwards(const std::vector<Image> &images)
        {
            int num_image = images.size();
            if (num_image == 0)
                return {};

            // set the batch size dynamically
            auto input_dims = model_->get_network_dims(0);
            if (model_info->m_preProcCfg.infer_batch_size != num_image)
            {
                if (model_info->m_preProcCfg.isdynamic_model_)
                {
                    model_info->m_preProcCfg.infer_batch_size = num_image;
                    input_dims[0] = num_image;
                    if (!model_->set_network_dims(0, input_dims)) // rebind the input batch; the return type is bool
                        return {};
                }
                else
                {
                    if (model_info->m_preProcCfg.infer_batch_size < num_image)
                    {
                        INFO(
                            "When using static shape model, number of images[%d] must be "
                            "less than or equal to the maximum batch[%d].",
                            num_image, model_info->m_preProcCfg.infer_batch_size);
                        return {};
                    }
                }
            }

            // the batch size is dynamic, so gpu/cpu memory is allocated on demand
            adjust_memory(model_info->m_preProcCfg.infer_batch_size);

            // pre-process the images
            for (int i = 0; i < num_image; ++i)
                preprocess_gpu(i, images[i], preprocess_buffers_[i], affine_matrixs[i], cu_stream); // input_buffer_ receives the pre-processed images

            // run the model
            float *bbox_output_device = bbox_predict_.gpu();                  // gpu pointer that receives the inference result
            vector<void *> bindings{input_buffer_.gpu(), bbox_output_device}; // bindings passed as input to forward
            if (!model_->forward(bindings, cu_stream))
            {
                INFO("Failed to tensorRT forward.");
                return {};
            }

            // parse the inference results
            for (int ib = 0; ib < num_image; ++ib)
                postprocess_gpu(ib, cu_stream);

            // copy the nms-ed boxes from gpu to cpu memory
            checkRuntime(cudaMemcpyAsync(output_boxarray_.cpu(), output_boxarray_.gpu(),
                                         output_boxarray_.gpu_bytes(), cudaMemcpyDeviceToHost, cu_stream));
            checkRuntime(cudaStreamSynchronize(cu_stream)); // block on the async stream until all queued work has finished

            return parser_box(num_image);
        }
    }
}
application/yolov8_app/yolov8_pose_cuda/yolov8_pose.hpp

Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@
#ifndef _YOLOV8_POSE_CUDA_HPP_
#define _YOLOV8_POSE_CUDA_HPP_

#include <memory>
#include "backend/tensorrt/trt_infer.hpp"
#include "common/model_info.hpp"
#include "common/utils.hpp"
#include "common/cv_cpp_utils.hpp"
#include "common/memory.hpp"
#include "pre_process/pre_process.cuh"
#include "post_process/post_process.cuh"

namespace tensorrt_infer
{
    namespace yolov8_cuda
    {
        using namespace ai::modelInfo;
        using namespace ai::utils;
        using namespace ai::cvUtil;
        using namespace ai::memory;
        using namespace ai::preprocess;
        using namespace ai::postprocess;

        class YOLOv8Pose
        {
        public:
            YOLOv8Pose() = default;
            ~YOLOv8Pose();
            void initParameters(const std::string &engine_file, float score_thr = 0.5f,
                                float nms_thr = 0.45f); // initialize the parameters
            void adjust_memory(int batch_size);         // the batch size is dynamic, so gpu/cpu memory must be allocated on demand

            // forward
            PoseBoxArray forward(const Image &image);
            BatchPoseBoxArray forwards(const std::vector<Image> &images);

            // model pre/post-processing
            void preprocess_gpu(int ibatch, const Image &image,
                                shared_ptr<Memory<unsigned char>> preprocess_buffer, AffineMatrix &affine,
                                cudaStream_t stream_);
            void postprocess_gpu(int ibatch, cudaStream_t stream_);
            BatchPoseBoxArray parser_box(int num_image);

        private:
            std::shared_ptr<ai::backend::Infer> model_;
            std::shared_ptr<ModelInfo> model_info = nullptr;

            // affine matrices
            std::vector<AffineMatrix> affine_matrixs;
            const uint8_t const_value = 114; // padding value used when resizing the image

            // the custom Memory class handles gpu/cpu allocations
            std::vector<std::shared_ptr<Memory<unsigned char>>> preprocess_buffers_;
            Memory<float> input_buffer_, bbox_predict_, output_boxarray_;

            // the cuda stream all operations run on
            cudaStream_t cu_stream;

            // time
            Timer timer;
        };
    }
}

#endif // _YOLOV8_POSE_CUDA_HPP_
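For orientation, a minimal sketch (not part of this commit) of how a driver such as mains/main_yolov8_pose.cpp might use the class. The engine path, the test image, and the way an Image is built from a cv::Mat are assumptions for illustration; see common/cv_cpp_utils.hpp for the repo's actual conversion helper.

```cpp
#include <cstdio>
#include <opencv2/opencv.hpp>
#include "yolov8_pose.hpp"

int main()
{
    using namespace tensorrt_infer::yolov8_cuda;

    YOLOv8Pose pose_infer;
    // hypothetical engine path; produce the engine with the trtexec command above
    pose_infer.initParameters("workspaces/yolov8n-pose_trans.engine", 0.5f, 0.45f);

    cv::Mat frame = cv::imread("workspaces/test.jpg"); // hypothetical test image
    if (frame.empty())
        return -1;

    // Assumption: Image wraps an existing BGR buffer (bgrptr/width/height/channels);
    // the actual constructor signature may differ.
    Image image(frame.data, frame.cols, frame.rows, frame.channels());

    // each returned box carries x1,y1,x2,y2,score,label plus pose->pose_data keypoint triplets
    PoseBoxArray persons = pose_infer.forward(image);
    std::printf("detected %zu persons\n", persons.size());
    return 0;
}
```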

assets/yolov8_det_seg_pose_res.png

1.48 MB
