Skip to content
Open
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion oneflow/core/functional/functional_api.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2789,7 +2789,7 @@
bind_python: False

- name: "nms"
signature: "Tensor (Tensor x, Float iou_threshold, Int32 keep_n=-1) => Nms"
signature: "Tensor (Tensor x, Tensor scores=None, Tensor input_indices=None, Float iou_threshold, Int32 keep_n=-1) => Nms"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

看看是否需要/方便,为npu的nms导出独立的api/functor?

bind_python: True

- name: "roi_align"
Expand Down
20 changes: 19 additions & 1 deletion oneflow/core/functional/impl/array_functor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -588,7 +588,25 @@ class ArgWhereFunctor {
const Symbol<DType>& dtype) const {
auto& attrs = THREAD_CACHED_MUTABLE_ATTR_MAP("dtype");
attrs.SetAllAttrs(dtype->data_type());
return OpInterpUtil::Dispatch<TensorTuple>(*op_, {x}, attrs);

auto device_type = DeviceType::kCPU;
if (x->is_global()) {
device_type = JUST(x->parallel_desc())->device_type();
} else {
device_type = JUST(x->device())->enum_type();
}

if (device_type == DeviceType::kNPU) {
// NOTE: use cpu argwhere when device="npu"
auto cpu_tensor = JUST(one::functional::To(x, "cpu"));
auto result = JUST(OpInterpUtil::Dispatch<TensorTuple>(*op_, {cpu_tensor}, attrs));
for (int i = 0; i < result->size(); ++i) {
(*result)[i] = JUST(one::functional::To((*result)[i], "npu"));
}
return result;
} else {
return OpInterpUtil::Dispatch<TensorTuple>(*op_, {x}, attrs);
}
}

private:
Expand Down
27 changes: 24 additions & 3 deletions oneflow/core/functional/impl/nn_functor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4014,17 +4014,38 @@ class PariticalFCSampleDisableBoxing {

class NmsFunctor {
public:
NmsFunctor() { op_ = CHECK_JUST(one::OpBuilder("nms").Input("in").Output("out").Build()); }
NmsFunctor() {
op_ = CHECK_JUST(one::OpBuilder("nms").Input("in").Output("out").Build());
fused_op_ = CHECK_JUST(one::OpBuilder("nms")
.Input("in")
.Input("scores")
.Input("input_indices")
.Output("out")
.Build());
}

Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& x, const float& iou_threshold,
Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& x,
const Optional<one::Tensor>& scores,
const Optional<one::Tensor>& input_indices, const float& iou_threshold,
const int32_t& keep_n) const {
auto& attrs = THREAD_CACHED_MUTABLE_ATTR_MAP("iou_threshold", "keep_n");
attrs.SetAllAttrs(iou_threshold, keep_n);
return OpInterpUtil::Dispatch<Tensor>(*op_, {x}, attrs);
DeviceType device_type = JUST(x->device())->enum_type();
if (device_type == DeviceType::kNPU) {
if (scores) {
Copy link

Copilot AI Jun 1, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The fused NMS path checks only scores; it should verify both scores and input_indices are provided to avoid passing a null optional downstream.

Suggested change
if (scores) {
if (scores && input_indices) {

Copilot uses AI. Check for mistakes.
return OpInterpUtil::Dispatch<Tensor>(*fused_op_, {x, JUST(scores), JUST(input_indices)},
attrs);
} else {
return OpInterpUtil::Dispatch<Tensor>(*op_, {x}, attrs);
}
} else {
return OpInterpUtil::Dispatch<Tensor>(*op_, {x}, attrs);
}
}

private:
std::shared_ptr<OpExpr> op_;
std::shared_ptr<OpExpr> fused_op_;
};

class RoiAlignFunctor {
Expand Down
4 changes: 3 additions & 1 deletion oneflow/ir/include/OneFlow/OneFlowUserOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -1886,7 +1886,9 @@ def OneFlow_InTopKOp : OneFlow_BaseOp<"in_top_k", [NoMemoryEffect, NoGrad, Decla

def OneFlow_NmsOp : OneFlow_BaseOp<"nms", [NoMemoryEffect, DeclareOpInterfaceMethods<UserOpCompatibleInterface>]> {
let input = (ins
OneFlow_Tensor:$in
OneFlow_Tensor:$in,
Optional<OneFlow_Tensor>:$scores,
Optional<OneFlow_Tensor>:$input_indices
);
let output = (outs
OneFlow_Tensor:$out
Expand Down
6 changes: 5 additions & 1 deletion oneflow/user/ops/nms_op.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,11 @@ Maybe<void> InferNmsTensorDesc(user_op::InferContext* ctx) {
}

// Infers the output dtype of the nms op. The NPU path produces int32 output;
// every other device keeps the original int8 mask dtype.
Maybe<void> InferNmsDataType(user_op::InferContext* ctx) {
  const bool is_npu = ctx->parallel_desc().device_type() == DeviceType::kNPU;
  ctx->SetOutputDType("out", 0, is_npu ? DataType::kInt32 : DataType::kInt8);
  return Maybe<void>::Ok();
}

Expand Down
10 changes: 8 additions & 2 deletions python/oneflow/nn/modules/nms.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,13 @@

def nms_op(boxes, scores, iou_threshold: float):
    """Non-maximum suppression.

    Suppresses boxes whose IoU with a higher-scoring kept box exceeds
    ``iou_threshold`` and returns the kept indices, expressed in the
    original (unsorted) ``boxes``/``scores`` order.
    """
    # Indices of boxes ordered by descending score.
    order = flow.argsort(scores, dim=0, descending=True)
    if boxes.device == flow.device("npu"):
        # NPU path: the nms op itself consumes scores and original indices,
        # so boxes are passed through un-gathered.
        desc_scores = flow.gather(scores, dim=0, index=order)
        keep = flow._C.nms(
            boxes, desc_scores, order.to(flow.int32), iou_threshold=iou_threshold
        )
    else:
        ordered_boxes = flow._C.gather(boxes, order, axis=0)
        keep = flow._C.nms(ordered_boxes, iou_threshold=iou_threshold)
    kept_pos = flow.squeeze(flow.argwhere(keep), dim=[1])
    # Map positions in the sorted order back to original box indices.
    return flow._C.gather(order, kept_pos, axis=0)
Loading