-
Notifications
You must be signed in to change notification settings - Fork 199
Description
//在类中 cudaStream_t m_stream; // 新增CUDA流成员
//std::vector<std::vector<ObjectSeg>> m_objectsBatch; // 保存提取到的分割目标框
调用方式
auto start = std::chrono::system_clock::now();
//yolo.copy(imgs_batch);
// utils::DeviceTimer d_t1; yolo.preprocess(imgs_batch); float t1 = d_t1.getUsedTime();
// utils::DeviceTimer d_t2; yolo.infer(); float t2 = d_t2.getUsedTime();
// utils::DeviceTimer d_t3; yolo.postprocess(imgs_batch); float t3 = d_t3.getUsedTime();
// yolo.extractObjects(imgs_batch);
void YOLOv8Seg::extractObjects(std::vectorcv::Mat& imgsBatch) {
m_objectsBatch.clear();
m_objectsBatch.resize(imgsBatch.size());
// 确保 CUDA 流已初始化
if (!m_stream) {
CHECK(cudaStreamCreate(&m_stream));
}
// 分配设备端缓冲区
float* d_seg_output;
float* d_objects;
float* d_mask_output;
int seg_size = m_param.batch_size * 160 * 160 * 32 * sizeof(float); // [batch, 160*160, 32]
int obj_size = m_param.batch_size * m_param.topK * 39 * sizeof(float);
int mask_size = m_param.batch_size * m_param.topK * 160 * 160 * sizeof(float);
CHECK(cudaMalloc(&d_seg_output, seg_size));
CHECK(cudaMalloc(&d_objects, obj_size));
CHECK(cudaMalloc(&d_mask_output, mask_size));
CHECK(cudaMemsetAsync(d_mask_output, 0, mask_size, m_stream));
// 将主机端数据拷贝到设备端
CHECK(cudaMemcpyAsync(d_seg_output, m_output_seg_host, seg_size, cudaMemcpyHostToDevice, m_stream));
CHECK(cudaMemcpyAsync(d_objects, m_output_objects_host, obj_size, cudaMemcpyHostToDevice, m_stream));
// 调用 CUDA 函数生成掩码
yolov8seg::maskDevice(m_param, d_seg_output, d_objects, d_mask_output, m_stream);
// 回传掩码数据
std::vector<float> h_mask_output(m_param.batch_size * m_param.topK * 160 * 160);
CHECK(cudaMemcpyAsync(h_mask_output.data(), d_mask_output, mask_size, cudaMemcpyDeviceToHost, m_stream));
cudaStreamSynchronize(m_stream);
// 处理每张图像,与原版逻辑一致
for (size_t bi = 0; bi < imgsBatch.size(); bi++) {
int num_boxes = std::min((int)(m_output_objects_host + bi * m_output_obj_area)[0], m_param.topK);
int m_output_obj_area_bi = bi * m_output_obj_area;
for (size_t i = 0; i < num_boxes; i++) {
float* ptr = m_output_objects_host + m_output_obj_area_bi + m_output_objects_width * i + 1;
if (ptr[6]) { // 检查 keepflag
ObjectSeg obj;
obj.label = static_cast<int>(ptr[5]);
obj.prob = ptr[4];
// 计算原始图像上的边界框 (roisrc)
int x_lt_src = std::round(m_dst2src.v0 * ptr[0] + m_dst2src.v1 * ptr[1] + m_dst2src.v2);
int y_lt_src = std::round(m_dst2src.v3 * ptr[0] + m_dst2src.v4 * ptr[1] + m_dst2src.v5);
int x_rb_src = std::round(m_dst2src.v0 * ptr[2] + m_dst2src.v1 * ptr[3] + m_dst2src.v2);
int y_rb_src = std::round(m_dst2src.v3 * ptr[2] + m_dst2src.v4 * ptr[3] + m_dst2src.v5);
cv::Rect roisrc = cv::Rect(x_lt_src, y_lt_src, x_rb_src - x_lt_src, y_rb_src - y_lt_src) & m_thresh_roisrc;
if (roisrc.width <= 0 || roisrc.height <= 0) continue;
obj.rect = roisrc;
// 计算 160x160 上的边界框 (roi160)
int x_lt_160 = std::round(ptr[0] * m_downsample_scale);
int y_lt_160 = std::round(ptr[1] * m_downsample_scale);
int x_rb_160 = std::round(ptr[2] * m_downsample_scale);
int y_rb_160 = std::round(ptr[3] * m_downsample_scale);
cv::Rect roi160(x_lt_160, y_lt_160, x_rb_160 - x_lt_160, y_rb_160 - y_lt_160);
roi160 &= m_thresh_roi160;
if (roi160.width <= 0 || roi160.height <= 0) continue;
// 从主机端掩码数据中提取当前目标的掩码
float* mask_ptr = h_mask_output.data() + bi * m_param.topK * 160 * 160 + i * 160 * 160;
cv::Mat mask_160(160, 160, CV_32F, mask_ptr);
// 裁剪并缩放到 roisrc,与原版完全一致
cv::Mat mask_instance;
cv::resize(cv::Mat(mask_160, roi160), mask_instance, cv::Size(roisrc.width, roisrc.height), cv::INTER_LINEAR);
mask_instance = mask_instance > 0.5f;
obj.boxMask = mask_instance;
// 计算掩码面积
obj.mask_area = cv::countNonZero(mask_instance);
// 计算掩码主方向角度和带角度的长宽
std::vector<cv::Point> mask_points;
cv::findNonZero(mask_instance, mask_points); // 获取掩码中的非零点
if (mask_points.size() >= 5) { // 至少需要5个点以拟合椭圆
cv::RotatedRect min_rect = cv::minAreaRect(mask_points);
obj.mask_angle = min_rect.angle; // 主方向角度(度)
obj.mask_dims = min_rect.size; // 长宽(长度 x 宽度)
}
else {
obj.mask_angle = 0.0f; // 默认角度
obj.mask_dims = cv::Size2f(0.0f, 0.0f); // 默认尺寸
}
m_objectsBatch[bi].push_back(obj);
}
}
}
CHECK(cudaFree(d_seg_output));
CHECK(cudaFree(d_objects));
CHECK(cudaFree(d_mask_output));
}
cu 头文件也增加接口
//下面结构放在utils.h 中
//struct ObjectSeg {
// cv::Rect_<float> rect;
// int label = 0;
// float prob = 0.0;
// cv::Mat boxMask;
//};
// Device-side entry points for YOLOv8-Seg post-processing.
namespace yolov8seg
{
// Declared here; the implementation is not part of this excerpt.
void decodeDevice(utils::InitParameter param, float* src, int srcWidth, int srcHeight, int srcLength, float* dst, int dstWidth, int dstHeight);
// Declared here; the implementation is not part of this excerpt.
void transposeDevice(utils::InitParameter param, float* src, int srcWidth, int srcHeight, int srcArea, float* dst, int dstWidth, int dstHeight);
// Newly added mask-generation interface.
// Launches the mask kernel on `stream` using param.batch_size and param.topK;
// the call is asynchronous, so results are only valid after the stream has
// been synchronised. All three pointers are dereferenced by the kernel and
// must therefore be device pointers.
void maskDevice(
utils::InitParameter param,
float* seg_output, // [batch, 160*160, 32]: the kernel reads 32 consecutive floats per pixel
float* objects, // [batch, topK, 39]: per row, [6] is tested as keep flag and [7..38] are the 32 coefficients
float* mask_output, // [batch, topK, 160*160]: sigmoid mask values, written only for kept objects
cudaStream_t stream
);
}
cu源文件
// --- 以下为新增的 YOLOv8Seg 相关实现 ---
// 新增掩码生成核函数
global void yolov8seg_mask_kernel(
int batch_size,
int topK,
int seg_w, // 160
int seg_h, // 160
float* seg_output, // [batch, 160160, 32]
float objects, // [batch, topK, 39]
float* mask_output // [batch, topK, 160*160]
) {
const int batch_idx = blockIdx.z;
const int obj_idx = blockIdx.y * blockDim.y + threadIdx.y;
const int pixel_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (batch_idx >= batch_size || obj_idx >= topK || pixel_idx >= seg_w * seg_h) return;
float* obj_ptr = objects + batch_idx * topK * 39 + obj_idx * 39;
if (obj_ptr[6] < 0.5f) return;
// 双精度累加
double sum = 0.0;
int seg_base = batch_idx * seg_w * seg_h * 32 + pixel_idx * 32;
for (int c = 0; c < 32; ++c) {
sum += static_cast<double>(seg_output[seg_base + c]) * static_cast<double>(obj_ptr[7 + c]);
}
// 优化 Sigmoid 计算,模拟 cv::exp 的单精度行为
float sum_float = static_cast<float>(sum); // 转换为单精度
float exp_neg_sum = expf(-sum_float); // 使用单精度 expf
float mask_val = 1.0f / (1.0f + exp_neg_sum);
mask_output[batch_idx * topK * seg_w * seg_h + obj_idx * seg_w * seg_h + pixel_idx] = mask_val;
}
namespace yolov8seg {
/**
 * Host-side launcher for yolov8seg_mask_kernel.
 *
 * Queues one kernel on `stream` that fills mask_output for every kept object
 * of every image. The call is asynchronous: it returns without waiting for the
 * kernel, so the caller must synchronise the stream before reading the result.
 *
 * Launch layout: 32x32 threads per block; grid.x covers the 160*160 pixels,
 * grid.y covers param.topK object slots, grid.z is param.batch_size.
 *
 * A kernel launch does not return an error code; callers that want to detect
 * a failed launch should query cudaGetLastError() after this function.
 *
 * @param param        supplies batch_size and topK
 * @param seg_output   device pointer, forwarded to the kernel
 * @param objects      device pointer, forwarded to the kernel
 * @param mask_output  device pointer, forwarded to the kernel
 * @param stream       CUDA stream the kernel is enqueued on
 */
void maskDevice(
    utils::InitParameter param,
    float* seg_output,
    float* objects,
    float* mask_output,
    cudaStream_t stream
) {
    // A zero-sized grid dimension is an invalid launch configuration, so an
    // empty batch or topK == 0 is treated as "nothing to do".
    if (param.batch_size <= 0 || param.topK <= 0) {
        return;
    }

    const int seg_w = 160;
    const int seg_h = 160;

    const dim3 block(32, 32, 1);
    const unsigned int pixel_blocks = (static_cast<unsigned int>(seg_w * seg_h) + block.x - 1) / block.x;
    const unsigned int object_blocks = (static_cast<unsigned int>(param.topK) + block.y - 1) / block.y;
    const dim3 grid(pixel_blocks, object_blocks, static_cast<unsigned int>(param.batch_size));

    yolov8seg_mask_kernel<<<grid, block, 0, stream>>>(
        param.batch_size,
        param.topK,
        seg_w,
        seg_h,
        seg_output,
        objects,
        mask_output
    );
}
}