@@ -68,9 +68,9 @@ struct ArithmeticConfiguration
// Channel-stride constants. This excerpt is a diff: a line marked '-' is the removed version and
// the '+' line that follows it is its replacement.
//  - VirtualInvocationsAtLevel1: LevelInputCount_1 divided by ItemsPerInvocation_1.
//  - __padding: selected by conditional_value on `LevelCount==3` between `SubgroupSize-1` and `0`
//    (presumably the first value when the condition holds -- TODO confirm against conditional_value).
//  - __channelStride_1: the '+' version moves `+ __padding` outside the conditional_value selection,
//    so the padding is added to whichever base (VirtualInvocationsAtLevel1 or SubgroupSize) is selected.
//  - __channelStride_2: selected on the same `LevelCount==3` condition between `SubgroupSize` and `0`.
//  - ChannelStride: the '+' version places a `__padding` entry ahead of the two strides, giving the
//    tuple three entries. It is read elsewhere in this struct via `tuple_element<level,ChannelStride>`.
//    NOTE(review): presumably the extra leading entry lets `level` 1 and 2 select __channelStride_1
//    and __channelStride_2 directly -- verify that tuple_element is zero-indexed.
6868 NBL_CONSTEXPR_STATIC_INLINE uint16_t VirtualInvocationsAtLevel1 = LevelInputCount_1 / ItemsPerInvocation_1;
6969
7070 NBL_CONSTEXPR_STATIC_INLINE uint16_t __padding = conditional_value<LevelCount==3 ,uint16_t,SubgroupSize-1 ,0 >::value;
71- NBL_CONSTEXPR_STATIC_INLINE uint16_t __channelStride_1 = conditional_value<LevelCount==3 ,uint16_t,VirtualInvocationsAtLevel1+__padding ,SubgroupSize>::value;
71+ NBL_CONSTEXPR_STATIC_INLINE uint16_t __channelStride_1 = conditional_value<LevelCount==3 ,uint16_t,VirtualInvocationsAtLevel1,SubgroupSize>::value + __padding ;
7272 NBL_CONSTEXPR_STATIC_INLINE uint16_t __channelStride_2 = conditional_value<LevelCount==3 ,uint16_t,SubgroupSize,0 >::value;
73- using ChannelStride = tuple<integral_constant<uint16_t,__channelStride_1>,integral_constant<uint16_t,__channelStride_2> >;
73+ using ChannelStride = tuple<integral_constant<uint16_t,__padding>,integral_constant<uint16_t, __channelStride_1>,integral_constant<uint16_t,__channelStride_2> >; // we don't use stride 0
7474
7575 // user specified the shared mem size of Scalars
7676 NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedScratchElementCount = conditional_value<LevelCount==1 ,uint16_t,
@@ -101,17 +101,17 @@ struct ArithmeticConfiguration
101101 {
102102 const uint16_t ItemsPerNextInvocation = tuple_element<level,ItemsPerInvocation>::type::value;
103103 const uint16_t outChannel = virtualSubgroupID & (ItemsPerNextInvocation-uint16_t (1u));
104- const uint16_t outInvocation = virtualSubgroupID/ ItemsPerNextInvocation;
104+ const uint16_t outInvocation = virtualSubgroupID / ItemsPerNextInvocation;
105105 const uint16_t localOffset = outChannel * tuple_element<level,ChannelStride>::type::value + outInvocation;
106106
107107 if (level==2 )
108108 {
109- const uint16_t baseOffset = LevelInputCount_1 + (SubgroupSize- uint16_t (1u)) * ItemsPerNextInvocation ;
109+ const uint16_t baseOffset = LevelInputCount_1 + (SubgroupSize - uint16_t (1u)) * ItemsPerInvocation_1 ;
110110 return baseOffset + localOffset;
111111 }
112112 else
113113 {
114- const uint16_t paddingOffset = virtualSubgroupID/ (SubgroupSize* ItemsPerInvocation_1);
114+ const uint16_t paddingOffset = virtualSubgroupID / (SubgroupSize * ItemsPerInvocation_1);
115115 return localOffset + paddingOffset;
116116 }
117117 }
@@ -128,11 +128,11 @@ struct ArithmeticConfiguration
128128 static uint16_t sharedLoadIndex (const uint16_t invocationIndex, const uint16_t component)
129129 {
130130 const uint16_t localOffset = component * tuple_element<level,ChannelStride>::type::value + invocationIndex;
131- const uint16_t paddingOffset = invocationIndex/ SubgroupSize;
131+ const uint16_t paddingOffset = invocationIndex / SubgroupSize;
132132
133133 if (level==2 )
134134 {
135- const uint16_t baseOffset = LevelInputCount_1 + (SubgroupSize- uint16_t (1u)) * ItemsPerInvocation_1;
135+ const uint16_t baseOffset = LevelInputCount_1 + (SubgroupSize - uint16_t (1u)) * ItemsPerInvocation_1;
136136 return baseOffset + localOffset + paddingOffset;
137137 }
138138 else
0 commit comments