Skip to content

yolov8-seg部分,如果目标很多,拷贝时间很久,我改进了一下 #161

@zouwen198317

Description

@zouwen198317

//在类中 cudaStream_t m_stream; // 新增CUDA流成员
//std::vector<std::vector<ObjectSeg>> m_objectsBatch; // 保存提取到的分割目标框
调用方式
auto start = std::chrono::system_clock::now();
//yolo.copy(imgs_batch);
// utils::DeviceTimer d_t1; yolo.preprocess(imgs_batch); float t1 = d_t1.getUsedTime();
// utils::DeviceTimer d_t2; yolo.infer(); float t2 = d_t2.getUsedTime();
// utils::DeviceTimer d_t3; yolo.postprocess(imgs_batch); float t3 = d_t3.getUsedTime();
// yolo.extractObjects(imgs_batch);
/**
 * Builds the per-image segmentation results (m_objectsBatch) for a batch.
 *
 * All instance masks are computed on the GPU by yolov8seg::maskDevice in a single
 * launch, copied back once, and then cropped/resized per object on the CPU.
 *
 * Changes relative to the previous version:
 *  - Detection records are uploaded in the dense [batch, topK, 39] layout that
 *    maskDevice indexes. The host buffer stores a leading count float per image
 *    (records start at offset +1 and use stride m_output_objects_width), so a
 *    flat copy from the buffer start shifted every field the kernel reads.
 *  - Byte sizes are computed in size_t instead of int to avoid 32-bit overflow.
 *  - The kernel launch and the stream synchronisation are error-checked.
 *  - cv::resize receives the interpolation flag in the interpolation slot
 *    (it was previously passed as the fx scale factor).
 *  - The box count is clamped to [0, topK]; images beyond m_param.batch_size
 *    are left with an empty result instead of being read out of bounds.
 *
 * @param imgsBatch Input images; only its size is used here. One result vector
 *                  is produced per image.
 *
 * NOTE(review): the three device buffers are allocated and freed on every call;
 * cudaMalloc/cudaFree are synchronous, so caching them as members would be faster.
 * NOTE(review): the `!m_stream` test assumes m_stream is null-initialised by the
 * constructor - confirm.
 */
void YOLOv8Seg::extractObjects(std::vector<cv::Mat>& imgsBatch) {
    // Layout constants shared with the mask kernel.
    constexpr int kSegW = 160;         // prototype mask width
    constexpr int kSegH = 160;         // prototype mask height
    constexpr int kSegChannels = 32;   // mask coefficients per object / values per pixel
    constexpr int kObjWidth = 39;      // floats per detection record on the device
    constexpr size_t kMaskArea = static_cast<size_t>(kSegW) * kSegH;

    m_objectsBatch.clear();
    m_objectsBatch.resize(imgsBatch.size());

    if (imgsBatch.empty() || m_param.batch_size <= 0 || m_param.topK <= 0) {
        return;  // nothing to extract; also avoids zero-sized allocations and launches
    }

    // Make sure the CUDA stream exists.
    if (!m_stream) {
        CHECK(cudaStreamCreate(&m_stream));
    }

    const size_t batch = static_cast<size_t>(m_param.batch_size);
    const size_t top_k = static_cast<size_t>(m_param.topK);
    // Never index past the buffers, which are sized for m_param.batch_size images.
    const size_t batch_count = std::min(imgsBatch.size(), batch);

    const size_t seg_bytes = batch * kMaskArea * kSegChannels * sizeof(float);  // [batch, 160*160, 32]
    const size_t obj_row_bytes = kObjWidth * sizeof(float);
    const size_t obj_bytes = batch * top_k * obj_row_bytes;                     // [batch, topK, 39]
    const size_t mask_count = batch * top_k * kMaskArea;                        // [batch, topK, 160*160]
    const size_t mask_bytes = mask_count * sizeof(float);

    // Number of valid records per image: first float of each image's block, clamped to [0, topK].
    std::vector<int> box_counts(batch_count, 0);
    for (size_t bi = 0; bi < batch_count; ++bi) {
        const int reported = static_cast<int>(m_output_objects_host[bi * m_output_obj_area]);
        box_counts[bi] = std::max(0, std::min(reported, m_param.topK));
    }

    // Allocate device buffers.
    float* d_seg_output = nullptr;
    float* d_objects = nullptr;
    float* d_mask_output = nullptr;
    CHECK(cudaMalloc(&d_seg_output, seg_bytes));
    CHECK(cudaMalloc(&d_objects, obj_bytes));
    CHECK(cudaMalloc(&d_mask_output, mask_bytes));
    // Zeroed records have keep flag 0, so the kernel skips every slot that is not uploaded below.
    CHECK(cudaMemsetAsync(d_objects, 0, obj_bytes, m_stream));
    CHECK(cudaMemsetAsync(d_mask_output, 0, mask_bytes, m_stream));

    // Upload the prototype masks.
    CHECK(cudaMemcpyAsync(d_seg_output, m_output_seg_host, seg_bytes, cudaMemcpyHostToDevice, m_stream));

    // Upload the detection records of each image, skipping the leading count float and
    // repacking rows from the host stride (m_output_objects_width) to the dense 39-float
    // rows the kernel expects. Only the rows the CPU loop below will read are copied.
    for (size_t bi = 0; bi < batch_count; ++bi) {
        if (box_counts[bi] == 0) continue;
        const float* src_rows = m_output_objects_host + bi * m_output_obj_area + 1;
        CHECK(cudaMemcpy2DAsync(
            d_objects + bi * top_k * kObjWidth, obj_row_bytes,
            src_rows, static_cast<size_t>(m_output_objects_width) * sizeof(float),
            obj_row_bytes, static_cast<size_t>(box_counts[bi]),
            cudaMemcpyHostToDevice, m_stream));
    }

    // Generate all masks on the GPU.
    yolov8seg::maskDevice(m_param, d_seg_output, d_objects, d_mask_output, m_stream);
    CHECK(cudaGetLastError());  // surfaces an invalid launch configuration immediately

    // Copy the masks back and wait for the whole pipeline.
    std::vector<float> h_mask_output(mask_count);
    CHECK(cudaMemcpyAsync(h_mask_output.data(), d_mask_output, mask_bytes, cudaMemcpyDeviceToHost, m_stream));
    CHECK(cudaStreamSynchronize(m_stream));

    // Device buffers are no longer needed once the masks are on the host.
    CHECK(cudaFree(d_seg_output));
    CHECK(cudaFree(d_objects));
    CHECK(cudaFree(d_mask_output));

    // Per-image CPU post-processing.
    for (size_t bi = 0; bi < batch_count; ++bi) {
        const float* image_records = m_output_objects_host + bi * m_output_obj_area + 1;

        for (int i = 0; i < box_counts[bi]; ++i) {
            const float* ptr = image_records + static_cast<size_t>(i) * m_output_objects_width;
            if (ptr[6] == 0.0f) continue;  // keep flag cleared

            ObjectSeg obj;
            obj.label = static_cast<int>(ptr[5]);
            obj.prob = ptr[4];

            // Bounding box in source-image coordinates (affine m_dst2src), clipped to the image.
            const int x_lt_src = static_cast<int>(std::round(m_dst2src.v0 * ptr[0] + m_dst2src.v1 * ptr[1] + m_dst2src.v2));
            const int y_lt_src = static_cast<int>(std::round(m_dst2src.v3 * ptr[0] + m_dst2src.v4 * ptr[1] + m_dst2src.v5));
            const int x_rb_src = static_cast<int>(std::round(m_dst2src.v0 * ptr[2] + m_dst2src.v1 * ptr[3] + m_dst2src.v2));
            const int y_rb_src = static_cast<int>(std::round(m_dst2src.v3 * ptr[2] + m_dst2src.v4 * ptr[3] + m_dst2src.v5));
            const cv::Rect roisrc =
                cv::Rect(x_lt_src, y_lt_src, x_rb_src - x_lt_src, y_rb_src - y_lt_src) & m_thresh_roisrc;
            if (roisrc.width <= 0 || roisrc.height <= 0) continue;
            obj.rect = roisrc;

            // Bounding box on the 160x160 mask grid, clipped to the grid.
            const int x_lt_160 = static_cast<int>(std::round(ptr[0] * m_downsample_scale));
            const int y_lt_160 = static_cast<int>(std::round(ptr[1] * m_downsample_scale));
            const int x_rb_160 = static_cast<int>(std::round(ptr[2] * m_downsample_scale));
            const int y_rb_160 = static_cast<int>(std::round(ptr[3] * m_downsample_scale));
            cv::Rect roi160(x_lt_160, y_lt_160, x_rb_160 - x_lt_160, y_rb_160 - y_lt_160);
            roi160 &= m_thresh_roi160;
            if (roi160.width <= 0 || roi160.height <= 0) continue;

            // View (no copy) of this object's 160x160 mask inside the host buffer.
            float* mask_ptr = h_mask_output.data() + (bi * top_k + static_cast<size_t>(i)) * kMaskArea;
            cv::Mat mask_160(kSegH, kSegW, CV_32F, mask_ptr);

            // Crop to the box, scale to the source-image box size, then binarise at 0.5.
            cv::Mat mask_instance;
            cv::resize(cv::Mat(mask_160, roi160), mask_instance,
                       cv::Size(roisrc.width, roisrc.height), 0, 0, cv::INTER_LINEAR);
            mask_instance = mask_instance > 0.5f;
            obj.boxMask = mask_instance;

            // Mask area in pixels.
            obj.mask_area = cv::countNonZero(mask_instance);

            // Orientation and extent of the mask from its minimum-area rotated rectangle.
            // The 5-point threshold is kept from the original code (its comment cited
            // ellipse fitting, although cv::minAreaRect is what is called).
            std::vector<cv::Point> mask_points;
            cv::findNonZero(mask_instance, mask_points);
            if (mask_points.size() >= 5) {
                const cv::RotatedRect min_rect = cv::minAreaRect(mask_points);
                obj.mask_angle = min_rect.angle;  // degrees
                obj.mask_dims = min_rect.size;
            } else {
                obj.mask_angle = 0.0f;
                obj.mask_dims = cv::Size2f(0.0f, 0.0f);
            }

            m_objectsBatch[bi].push_back(obj);
        }
    }
}

cu头文件也增加接口
//下面结构放在utils.h 中
//struct ObjectSeg {
// cv::Rect_<float> rect;
// int label = 0;
// float prob = 0.0;
// cv::Mat boxMask;
//};

namespace yolov8seg
{
void decodeDevice(utils::InitParameter param, float* src, int srcWidth, int srcHeight, int srcLength, float* dst, int dstWidth, int dstHeight);
void transposeDevice(utils::InitParameter param, float* src, int srcWidth, int srcHeight, int srcArea, float* dst, int dstWidth, int dstHeight);

// Mask-generation entry point (new): launches the mask kernel on `stream`.
// All three pointers are passed straight to the kernel, so they must be device pointers.
// The layouts noted below follow how the kernel in the .cu source indexes each buffer.
void maskDevice(
utils::InitParameter param,
float* seg_output, // [batch, 160*160, 32] - the 32 values of one pixel are contiguous
float* objects, // [batch, topK, 39] - per record: [6] is the keep flag, [7..38] are the 32 mask coefficients
float* mask_output, // [batch, topK, 160*160] - only written for records whose keep flag is >= 0.5
cudaStream_t stream
);

}

cu源文件
// --- 以下为新增的 YOLOv8Seg 相关实现 ---

// Mask-generation kernel for YOLOv8-seg.
//
// Each thread produces one mask value: for one (image, object, pixel) triple it takes
// the dot product of the pixel's 32 prototype values with the object's 32 mask
// coefficients and applies a sigmoid.
//
// Launch layout:
//   x dimension -> pixel index in [0, seg_w * seg_h)
//   y dimension -> object index in [0, topK)
//   blockIdx.z  -> image index in [0, batch_size)
// No shared memory is used. Threads outside the valid range exit immediately.
//
// Objects whose keep flag (record element 6) is below 0.5 are skipped, and their
// mask_output slots are left untouched, so the caller must pre-initialise mask_output
// if it reads those slots.
//
// Flat offsets are computed in size_t so large batch_size * topK values cannot
// overflow 32-bit arithmetic.
__global__ void yolov8seg_mask_kernel(
    int batch_size,
    int topK,
    int seg_w,                 // 160
    int seg_h,                 // 160
    const float* seg_output,   // [batch, seg_w*seg_h, 32]
    const float* objects,      // [batch, topK, 39]
    float* mask_output         // [batch, topK, seg_w*seg_h]
) {
    constexpr int kObjWidth = 39;     // floats per detection record
    constexpr int kKeepFlagIdx = 6;   // keep flag within a record
    constexpr int kCoeffOffset = 7;   // first mask coefficient within a record
    constexpr int kSegChannels = 32;  // prototype values per pixel / coefficients per object

    const int batch_idx = blockIdx.z;
    const int obj_idx = blockIdx.y * blockDim.y + threadIdx.y;
    const int pixel_idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int pixel_count = seg_w * seg_h;

    if (batch_idx >= batch_size || obj_idx >= topK || pixel_idx >= pixel_count) return;

    // Flat index of this object over [batch, topK].
    const size_t obj_slot = static_cast<size_t>(batch_idx) * topK + obj_idx;
    const float* obj_ptr = objects + obj_slot * kObjWidth;
    if (obj_ptr[kKeepFlagIdx] < 0.5f) return;

    const float* proto =
        seg_output + (static_cast<size_t>(batch_idx) * pixel_count + pixel_idx) * kSegChannels;

    // Accumulate in double precision (kept from the original implementation).
    double sum = 0.0;
    for (int c = 0; c < kSegChannels; ++c) {
        sum += static_cast<double>(proto[c]) * static_cast<double>(obj_ptr[kCoeffOffset + c]);
    }

    // Sigmoid evaluated in single precision; the original notes this is meant to
    // mimic the single-precision behaviour of cv::exp.
    const float sum_float = static_cast<float>(sum);
    const float exp_neg_sum = expf(-sum_float);
    const float mask_val = 1.0f / (1.0f + exp_neg_sum);

    mask_output[obj_slot * pixel_count + pixel_idx] = mask_val;
}

namespace yolov8seg {

// Launches yolov8seg_mask_kernel on `stream` for a whole batch.
//
// Parameters:
//   param        - supplies batch_size and topK (the only fields read here).
//   seg_output   - prototype values, passed through to the kernel.
//   objects      - detection records, passed through to the kernel.
//   mask_output  - destination for the mask values, passed through to the kernel.
//   stream       - CUDA stream the kernel is enqueued on.
//
// The launch is asynchronous and this function does not synchronise or report errors:
// callers should check cudaGetLastError() after the call and synchronise `stream`
// before reading mask_output.
//
// Launch shape: 32x32 threads per block; grid.x covers the 160*160 pixels,
// grid.y covers topK objects, grid.z is the batch size.
void maskDevice(
    utils::InitParameter param,
    float* seg_output,
    float* objects,
    float* mask_output,
    cudaStream_t stream
) {
    // A zero (or negative) grid dimension is an invalid launch configuration;
    // with no work to do, simply return.
    if (param.batch_size <= 0 || param.topK <= 0) {
        return;
    }

    const int seg_w = 160;
    const int seg_h = 160;

    const dim3 block(32, 32, 1);
    const dim3 grid(
        (seg_w * seg_h + block.x - 1) / block.x,  // ceil-div over pixels
        (param.topK + block.y - 1) / block.y,     // ceil-div over objects
        param.batch_size
    );

    yolov8seg_mask_kernel<<<grid, block, 0, stream>>>(
        param.batch_size,
        param.topK,
        seg_w,
        seg_h,
        seg_output,
        objects,
        mask_output
    );
}

}  // namespace yolov8seg

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions