@@ -649,6 +649,13 @@ class reduction_impl_algo : public reduction_impl_common<T, BinaryOperation> {
     }
   }
 
+  template <class _T = T, int D = buffer_dim>
+  auto &getTempBuffer(size_t Size, handler &CGH) {
+    auto Buffer = std::make_shared<buffer<_T, D>>(range<1>(Size));
+    CGH.addReduction(Buffer);
+    return *Buffer;
+  }
+
   /// Returns an accessor accessing the memory that will hold the reduction
   /// partial sums.
   /// If \p Size is equal to one, then the reduction result is the final and
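
A note on the hunk above: `getTempBuffer` returns a reference into a `shared_ptr`-owned buffer, which is only safe because `CGH.addReduction(Buffer)` stores the `shared_ptr` inside the handler, extending the allocation's lifetime past the enclosing scope. A minimal sketch of that ownership pattern, with hypothetical names (`FakeHandler`, `makeTemp`) standing in for the internal API:

```cpp
#include <cstddef>
#include <memory>
#include <vector>

struct FakeHandler {
  // Type-erased keep-alive list, playing the role of handler::addReduction.
  std::vector<std::shared_ptr<const void>> MKeepAlive;
  void addReduction(std::shared_ptr<const void> P) {
    MKeepAlive.push_back(std::move(P));
  }
};

template <typename T>
std::vector<T> &makeTemp(FakeHandler &CGH, std::size_t N) {
  auto P = std::make_shared<std::vector<T>>(N);
  CGH.addReduction(P); // the handler now co-owns the allocation
  return *P;           // returning a reference is safe: P is kept alive
}
```
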
@@ -708,15 +715,28 @@ class reduction_impl_algo : public reduction_impl_common<T, BinaryOperation> {
     return {*CounterBuf, CGH};
   }
 
-  RedOutVar &getUserRedVar() { return MRedOut; }
-
-  static inline result_type *getOutPointer(const rw_accessor_type &OutAcc) {
-    return OutAcc.get_pointer().get();
+  // On discrete (vs. integrated) GPUs it's faster to initialize memory with an
+  // extra kernel than copy it from the host.
+  template <typename Name> auto getGroupsCounterAccDiscrete(handler &CGH) {
+    auto &Buf = getTempBuffer<int, 1>(1, CGH);
+    std::shared_ptr<detail::queue_impl> QueueCopy = CGH.MQueue;
+    auto Event = CGH.withAuxHandler(QueueCopy, [&](handler &InitHandler) {
+      auto Acc = accessor{Buf, InitHandler, sycl::write_only, sycl::no_init};
+      InitHandler.single_task<Name>([=]() { Acc[0] = 0; });
+    });
+    CGH.depends_on(Event);
+    return accessor{Buf, CGH};
   }
 
+  RedOutVar &getUserRedVar() { return MRedOut; }
+
   static inline result_type *getOutPointer(result_type *OutPtr) {
     return OutPtr;
   }
+  template <class AccessorType>
+  static inline result_type *getOutPointer(const AccessorType &OutAcc) {
+    return OutAcc.get_pointer().get();
+  }
 
 private:
   template <typename BufferT>
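
The new `getGroupsCounterAccDiscrete` relies on internal machinery (`withAuxHandler`, `CGH.MQueue`), but the same idea is expressible with public SYCL 2020 APIs. A hedged sketch: a separate init kernel zeroes a one-element counter on the device, and the main submission orders after it via `depends_on`:

```cpp
#include <sycl/sycl.hpp>

// Zero the counter on the device with a tiny kernel; on discrete GPUs
// this avoids a host-to-device transfer on the critical path.
sycl::event initCounter(sycl::queue &Q, sycl::buffer<int, 1> &Counter) {
  return Q.submit([&](sycl::handler &CGH) {
    sycl::accessor Acc{Counter, CGH, sycl::write_only, sycl::no_init};
    CGH.single_task([=] { Acc[0] = 0; });
  });
}

void launch(sycl::queue &Q) {
  sycl::buffer<int, 1> Counter{sycl::range<1>{1}};
  sycl::event Init = initCounter(Q, Counter);
  Q.submit([&](sycl::handler &CGH) {
    CGH.depends_on(Init);             // run after the init kernel
    sycl::accessor Acc{Counter, CGH}; // read-write in the main kernel
    CGH.parallel_for(sycl::range<1>{64}, [=](sycl::id<1>) {
      (void)Acc; // the main kernel would use the counter here
    });
  });
}
```
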
@@ -892,7 +912,7 @@ template <class KernelName> struct RangeFastAtomics;
 } // namespace main_krn
 } // namespace reduction
 template <typename KernelName, typename KernelType, int Dims, class Reduction>
-void reduCGFuncForRangeFastAtomics(handler &CGH, KernelType KernelFunc,
+bool reduCGFuncForRangeFastAtomics(handler &CGH, KernelType KernelFunc,
                                    const range<Dims> &Range,
                                    const nd_range<1> &NDRange,
                                    Reduction &Redu) {
@@ -927,29 +947,43 @@ void reduCGFuncForRangeFastAtomics(handler &CGH, KernelType KernelFunc,
       Reducer.template atomic_combine(Reduction::getOutPointer(Out));
     }
   });
+  return Reduction::is_usm || Redu.initializeToIdentity();
 }
 
 namespace reduction {
 namespace main_krn {
 template <class KernelName> struct RangeFastReduce;
 } // namespace main_krn
+namespace init_krn {
+template <class KernelName> struct GroupCounter;
+}
 } // namespace reduction
 template <typename KernelName, typename KernelType, int Dims, class Reduction>
-void reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
+bool reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
                                   const range<Dims> &Range,
                                   const nd_range<1> &NDRange, Reduction &Redu) {
   constexpr size_t NElements = Reduction::num_elements;
   size_t WGSize = NDRange.get_local_range().size();
   size_t NWorkGroups = NDRange.get_group_range().size();
 
+  auto &Out = Redu.getUserRedVar();
+  if constexpr (Reduction::is_acc)
+    associateWithHandler(CGH, &Out, access::target::device);
+
+  auto &PartialSumsBuf = Redu.getTempBuffer(NWorkGroups * NElements, CGH);
+  accessor PartialSums(PartialSumsBuf, CGH, sycl::read_write, sycl::no_init);
+
   bool IsUpdateOfUserVar = !Reduction::is_usm && !Redu.initializeToIdentity();
-  auto PartialSums =
-      Redu.getWriteAccForPartialReds(NWorkGroups * NElements, CGH);
-  auto Out = (NWorkGroups == 1)
-                 ? PartialSums
-                 : Redu.getWriteAccForPartialReds(NElements, CGH);
+  using InitName =
+      __sycl_reduction_kernel<reduction::init_krn::GroupCounter, KernelName>;
+
+  // Integrated and discrete GPUs have different fast paths.
   auto NWorkGroupsFinished =
-      Redu.getReadWriteAccessorToInitializedGroupsCounter(CGH);
+      sycl::detail::getDeviceFromHandler(CGH)
+              .get_info<info::device::host_unified_memory>()
+          ? Redu.getReadWriteAccessorToInitializedGroupsCounter(CGH)
+          : Redu.template getGroupsCounterAccDiscrete<InitName>(CGH);
+
   auto DoReducePartialSumsInLastWG =
       Reduction::template getReadWriteLocalAcc<int>(1, CGH);
@@ -967,50 +1001,57 @@ void reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
     // reduce_over_group is only defined for each T, not for span<T, ...>
     size_t LID = NDId.get_local_id(0);
     for (int E = 0; E < NElements; ++E) {
-      Reducer.getElement(E) =
-          reduce_over_group(Group, Reducer.getElement(E), BOp);
-
+      auto &RedElem = Reducer.getElement(E);
+      RedElem = reduce_over_group(Group, RedElem, BOp);
       if (LID == 0) {
-        if (NWorkGroups == 1 && IsUpdateOfUserVar)
-          Reducer.getElement(E) =
-              BOp(Reducer.getElement(E), Reduction::getOutPointer(Out)[E]);
-
-        // if NWorkGroups == 1, then PartialsSum and Out point to same memory.
-        Reduction::getOutPointer(
-            PartialSums)[NDId.get_group_linear_id() * NElements + E] =
-            Reducer.getElement(E);
+        if (NWorkGroups == 1) {
+          auto &OutElem = Reduction::getOutPointer(Out)[E];
+          // Skip the partial sums and write the final result immediately.
+          if (IsUpdateOfUserVar)
+            RedElem = BOp(RedElem, OutElem);
+          OutElem = RedElem;
+        } else {
+          PartialSums[NDId.get_group_linear_id() * NElements + E] =
+              Reducer.getElement(E);
+        }
       }
     }
 
+    if (NWorkGroups == 1)
+      // We're done.
+      return;
+
     // Signal this work-group has finished after all values are reduced
     if (LID == 0) {
       auto NFinished =
           sycl::atomic_ref<int, memory_order::relaxed, memory_scope::device,
                            access::address_space::global_space>(
               NWorkGroupsFinished[0]);
-      DoReducePartialSumsInLastWG[0] =
-          ++NFinished == NWorkGroups && NWorkGroups > 1;
+      DoReducePartialSumsInLastWG[0] = ++NFinished == NWorkGroups;
     }
 
     sycl::detail::workGroupBarrier();
     if (DoReducePartialSumsInLastWG[0]) {
       // Reduce each result separately
-      // TODO: Opportunity to parallelize across elements
+      // TODO: Opportunity to parallelize across elements.
       for (int E = 0; E < NElements; ++E) {
+        auto &OutElem = Reduction::getOutPointer(Out)[E];
         auto LocalSum = Reducer.getIdentity();
         for (size_t I = LID; I < NWorkGroups; I += WGSize)
           LocalSum = BOp(LocalSum, PartialSums[I * NElements + E]);
-        Reducer.getElement(E) = reduce_over_group(Group, LocalSum, BOp);
+        auto Result = reduce_over_group(Group, LocalSum, BOp);
 
         if (LID == 0) {
           if (IsUpdateOfUserVar)
-            Reducer.getElement(E) =
-                BOp(Reducer.getElement(E), Reduction::getOutPointer(Out)[E]);
-          Reduction::getOutPointer(Out)[E] = Reducer.getElement(E);
+            Result = BOp(Result, OutElem);
+          OutElem = Result;
         }
       }
     }
   });
+
+  // We've already updated the user's variable; no extra work is needed.
+  return false;
 }
 
 namespace reduction {
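
The kernel body above uses the classic "last work-group finishes the job" handshake: the leader of each group bumps a device-scope counter, and the group whose increment brings the counter to `NWorkGroups` reduces the partial sums. A condensed, hedged sketch of just that handshake (the helper name `lastGroupWins` is hypothetical; `Counter` must start at zero and `Flag` is one `int` of local memory):

```cpp
#include <sycl/sycl.hpp>

// Returns true on every work-item of the last work-group to arrive.
bool lastGroupWins(sycl::nd_item<1> It, int *Counter, int NWorkGroups,
                   sycl::local_accessor<int, 1> Flag) {
  if (It.get_local_id(0) == 0) {
    sycl::atomic_ref<int, sycl::memory_order::relaxed,
                     sycl::memory_scope::device,
                     sycl::access::address_space::global_space>
        NFinished(*Counter);
    // fetch_add returns the old value; exactly one group sees the maximum.
    Flag[0] = (NFinished.fetch_add(1) + 1 == NWorkGroups);
  }
  sycl::group_barrier(It.get_group()); // publish Flag group-wide
  return Flag[0] != 0;
}
```
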
@@ -1019,7 +1060,7 @@ template <class KernelName> struct RangeBasic;
 } // namespace main_krn
 } // namespace reduction
 template <typename KernelName, typename KernelType, int Dims, class Reduction>
-void reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
+bool reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
                              const range<Dims> &Range,
                              const nd_range<1> &NDRange, Reduction &Redu) {
   constexpr size_t NElements = Reduction::num_elements;
@@ -1125,10 +1166,13 @@ void reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
       }
     }
   });
+  return Reduction::is_usm || Reduction::is_dw_acc;
 }
 
+/// Returns "true" if the result has to be saved to the user's variable by
+/// reduSaveFinalResultToUserMem.
 template <typename KernelName, typename KernelType, int Dims, class Reduction>
-void reduCGFuncForRange(handler &CGH, KernelType KernelFunc,
+bool reduCGFuncForRange(handler &CGH, KernelType KernelFunc,
                         const range<Dims> &Range, size_t MaxWGSize,
                         uint32_t NumConcurrentWorkGroups, Reduction &Redu) {
   size_t NWorkItems = Range.size();
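
The doc comment above implies a caller-side contract that sits outside this diff: whoever submits the kernel inspects the returned flag and, when it is true, schedules `reduSaveFinalResultToUserMem` as a follow-up. A hypothetical sketch of such a call site (the wrapper name and the exact follow-up call are assumptions, not shown in this commit):

```cpp
#include <cstdint>
#include <sycl/sycl.hpp>

// Forward declaration matching the signature introduced in this commit.
template <typename KernelName, typename KernelType, int Dims, class Reduction>
bool reduCGFuncForRange(sycl::handler &CGH, KernelType KernelFunc,
                        const sycl::range<Dims> &Range, size_t MaxWGSize,
                        uint32_t NumConcurrentWorkGroups, Reduction &Redu);

template <typename KernelName, typename KernelType, int Dims, class Reduction>
void submitRangeReduction(sycl::handler &CGH, KernelType KernelFunc,
                          const sycl::range<Dims> &Range, size_t MaxWGSize,
                          uint32_t NumConcurrentWorkGroups, Reduction &Redu) {
  bool NeedsCopyBack = reduCGFuncForRange<KernelName>(
      CGH, KernelFunc, Range, MaxWGSize, NumConcurrentWorkGroups, Redu);
  if (NeedsCopyBack) {
    // Schedule reduSaveFinalResultToUserMem(...) in a follow-up command
    // group here; its exact shape is not part of this diff.
  }
}
```
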
@@ -1141,16 +1185,15 @@ void reduCGFuncForRange(handler &CGH, KernelType KernelFunc,
   size_t NDRItems = NWorkGroups * WGSize;
   nd_range<1> NDRange{range<1>{NDRItems}, range<1>{WGSize}};
 
-  if constexpr (Reduction::has_fast_atomics) {
-    reduCGFuncForRangeFastAtomics<KernelName>(CGH, KernelFunc, Range, NDRange,
-                                              Redu);
-
-  } else if constexpr (Reduction::has_fast_reduce) {
-    reduCGFuncForRangeFastReduce<KernelName>(CGH, KernelFunc, Range, NDRange,
-                                             Redu);
-  } else {
-    reduCGFuncForRangeBasic<KernelName>(CGH, KernelFunc, Range, NDRange, Redu);
-  }
+  if constexpr (Reduction::has_fast_atomics)
+    return reduCGFuncForRangeFastAtomics<KernelName>(CGH, KernelFunc, Range,
+                                                     NDRange, Redu);
+  else if constexpr (Reduction::has_fast_reduce)
+    return reduCGFuncForRangeFastReduce<KernelName>(CGH, KernelFunc, Range,
+                                                    NDRange, Redu);
+  else
+    return reduCGFuncForRangeBasic<KernelName>(CGH, KernelFunc, Range, NDRange,
+                                               Redu);
 }
 
 namespace reduction {
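
Finally, the dispatch rewrite works because `if constexpr` discards the untaken branches at compile time, so each strategy can return its own flag directly and the dead branches are never instantiated. A self-contained toy of that shape (the trait values and returned booleans are illustrative placeholders, not the real rules from the functions above):

```cpp
#include <iostream>

template <class Traits> bool runReduction() {
  if constexpr (Traits::has_fast_atomics)
    return false; // placeholder: pretend atomics wrote the result in place
  else if constexpr (Traits::has_fast_reduce)
    return Traits::is_usm; // placeholder return value
  else
    return true; // placeholder: pretend the basic path defers the final write
}

struct FastAtomicsTraits {
  static constexpr bool has_fast_atomics = true;
  static constexpr bool has_fast_reduce = false;
  static constexpr bool is_usm = false;
};

int main() {
  std::cout << std::boolalpha << runReduction<FastAtomicsTraits>() << '\n';
}
```
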