28 #ifndef VC_COMMON_GATHERIMPLEMENTATION_H_
29 #define VC_COMMON_GATHERIMPLEMENTATION_H_
33 namespace Vc_VERSIONED_NAMESPACE
// Selects which strategy an executeGather overload uses.
// NOTE(review): the enumerator list is not visible in this view; the aliases
// below reference the enumerators SimpleLoop, SetIndexZero, BitScanLoop and
// PopcntSwitch.
38 enum class GatherScatterImplementation : int {
// Tag types: each alias wraps one enumerator in a std::integral_constant so
// that the executeGather overloads can be selected by their first argument.
45 using SimpleLoopT = std::integral_constant<GatherScatterImplementation, GatherScatterImplementation::SimpleLoop>;
46 using SetIndexZeroT = std::integral_constant<GatherScatterImplementation, GatherScatterImplementation::SetIndexZero>;
47 using BitScanLoopT = std::integral_constant<GatherScatterImplementation, GatherScatterImplementation::BitScanLoop>;
48 using PopcntSwitchT = std::integral_constant<GatherScatterImplementation, GatherScatterImplementation::PopcntSwitch>;
// Gather overload selected by the SetIndexZeroT tag.
//
// Visible steps: take a local value of the index argument (copied or moved,
// depending on IT), call setZeroInverted() on it with the mask converted to
// the type of `!indexes`, construct a temporary V from (mem, indexes), and
// assign that temporary to v through where(mask).
//
// NOTE(review): the parameter lines between the tag and `mask` (the body uses
// v, mem and indexes_) and the braces are not visible in this view.
50 template <
typename V,
typename MT,
typename IT>
51 Vc_ALWAYS_INLINE
void executeGather(SetIndexZeroT,
55 typename V::MaskArgument mask)
57 auto indexes = std::forward<IT>(indexes_);
// presumably zeroes the index entries whose mask bit is clear, so that the
// mask-less construction of tmp below reads a valid address for those lanes
// -- TODO confirm against the index vector's setZeroInverted.
58 indexes.setZeroInverted(
static_cast<decltype(!indexes)
>(mask));
59 const V tmp(mem, indexes);
// Only lanes selected by mask are written to v.
60 where(mask) | v = tmp;
// Gather overload selected by the SimpleLoopT tag: visits every lane index in
// [0, V::Size) through Common::unrolled_loop and loads mem[indexes[i]].
//
// v       - destination vector, written lane by lane
// mem     - base pointer the indexes are applied to
// indexes - per-lane offsets into mem
// mask    - checked with isEmpty() up front
//
// NOTE(review): several lines (braces, the body of the isEmpty() branch, the
// #else/#endif of the conditional, and whatever guards each per-lane store)
// are not visible in this view; presumably each store is conditional on the
// lane's mask bit -- confirm.
63 template <
typename V,
typename MT,
typename IT>
64 Vc_ALWAYS_INLINE
void executeGather(SimpleLoopT, V &v,
const MT *mem,
const IT &indexes,
65 const typename V::MaskArgument mask)
// An empty mask is treated as the unlikely case.
67 if (Vc_IS_UNLIKELY(mask.isEmpty())) {
// Path for GCC >= 4.9: operate on a GNU vector_size builtin copy of v.data()
// and write the whole vector back once after the loop.
70 #if defined Vc_GCC && Vc_GCC >= 0x40900
72 constexpr std::size_t Sizeof =
sizeof(V);
73 using Builtin [[gnu::vector_size(Sizeof)]] =
typename V::value_type;
74 Builtin tmp =
reinterpret_cast<Builtin
>(v.data());
75 Common::unrolled_loop<std::size_t, 0, V::Size>([&](std::size_t i) {
77 tmp[i] = mem[indexes[i]];
80 v.data() =
reinterpret_cast<typename V::VectorType
>(tmp);
// Other path: store through V's subscript operator directly.
82 Common::unrolled_loop<std::size_t, 0, V::Size>([&](std::size_t i) {
84 v[i] = mem[indexes[i]];
// Gather overload selected by the BitScanLoopT tag: converts the mask to an
// integer bit field with toInt() and loads only the lanes whose bit is set.
//
// Two variants are visible: one using inline x86 assembly and one using the
// _bit_scan_forward intrinsic.
//
// NOTE(review): the parameter lines between the tag and `mask` (the body uses
// v, mem and indexes), the declarations of i and j, the preprocessor
// conditional separating the two variants, and the loop around the second
// variant are not visible in this view.
89 template <
typename V,
typename MT,
typename IT>
90 Vc_ALWAYS_INLINE
void executeGather(BitScanLoopT,
94 typename V::MaskArgument mask)
97 size_t bits = mask.toInt();
// Each iteration handles two lanes: bsf yields the lowest set bit (i), bsr the
// highest set bit (j), and the two btr instructions clear both bits in `bits`.
// When a single bit remains, i == j and the same lane is loaded twice.
98 while (Vc_IS_LIKELY(bits > 0)) {
100 asm(
"bsf %[bits],%[i]\n\t"
101 "bsr %[bits],%[j]\n\t"
102 "btr %[i],%[bits]\n\t"
103 "btr %[j],%[bits]\n\t"
104 : [i]
"=r"(i), [j]
"=r"(j), [bits]
"+r"(bits));
105 v[i] = mem[indexes[i]];
106 v[j] = mem[indexes[j]];
// Intrinsic variant: one lane at a time, starting from the lowest set bit.
110 int bits = mask.toInt();
112 const int i = _bit_scan_forward(bits);
114 v[i] = mem[indexes[i]];
// Gather overload selected by the PopcntSwitchT tag for vectors with
// V::Size == 16 (chosen via the enable_if default argument).
//
// Switches on the number of set mask bits (popcnt16). One branch calls
// v.gather(mem, indexes) without a mask; the remaining visible statements
// alternate between loading the lowest set lane (_bit_scan_forward) and the
// highest set lane (_bit_scan_reverse), removing handled bits from `bits`
// with XOR in between.
//
// NOTE(review): the `case` labels, `break`s, braces and the parameter lines
// between the tag and `mask` are not visible in this view; the statement order
// suggests a fall-through switch -- confirm.
// NOTE(review): as visible here, `high` holds the bit index returned by
// _bit_scan_reverse when it is XORed into `bits`, whereas `low` is shifted
// into a mask first. A line converting `high` to a mask may be missing from
// this view -- confirm before relying on `bits ^= high | (1 << low)`.
119 template <
typename V,
typename MT,
typename IT>
120 Vc_ALWAYS_INLINE
void executeGather(PopcntSwitchT,
124 typename V::MaskArgument mask,
125 enable_if<V::Size == 16> = nullarg)
127 unsigned int bits = mask.toInt();
128 unsigned int low, high = 0;
129 switch (Vc::Detail::popcnt16(bits)) {
131 v.gather(mem, indexes);
134 low = _bit_scan_forward(bits);
136 v[low] = mem[indexes[low]];
139 high = _bit_scan_reverse(bits);
140 v[high] = mem[indexes[high]];
144 low = _bit_scan_forward(bits);
145 bits ^= high | (1 << low);
146 v[low] = mem[indexes[low]];
149 high = _bit_scan_reverse(bits);
150 v[high] = mem[indexes[high]];
154 low = _bit_scan_forward(bits);
155 bits ^= high | (1 << low);
156 v[low] = mem[indexes[low]];
159 high = _bit_scan_reverse(bits);
160 v[high] = mem[indexes[high]];
164 low = _bit_scan_forward(bits);
165 bits ^= high | (1 << low);
166 v[low] = mem[indexes[low]];
169 high = _bit_scan_reverse(bits);
170 v[high] = mem[indexes[high]];
174 low = _bit_scan_forward(bits);
175 bits ^= high | (1 << low);
176 v[low] = mem[indexes[low]];
179 high = _bit_scan_reverse(bits);
180 v[high] = mem[indexes[high]];
184 low = _bit_scan_forward(bits);
185 bits ^= high | (1 << low);
186 v[low] = mem[indexes[low]];
189 high = _bit_scan_reverse(bits);
190 v[high] = mem[indexes[high]];
194 low = _bit_scan_forward(bits);
195 bits ^= high | (1 << low);
196 v[low] = mem[indexes[low]];
199 high = _bit_scan_reverse(bits);
200 v[high] = mem[indexes[high]];
203 low = _bit_scan_forward(bits);
204 v[low] = mem[indexes[low]];
// Gather overload selected by the PopcntSwitchT tag for vectors with
// V::Size == 8 (chosen via the enable_if default argument).
//
// Switches on the number of set mask bits (popcnt8). One branch calls
// v.gather(mem, indexes) without a mask; the remaining visible statements
// alternate between loading the lowest set lane (_bit_scan_forward) and the
// highest set lane (_bit_scan_reverse), removing handled bits from `bits`
// with XOR in between.
//
// NOTE(review): the `case` labels, `break`s, braces and the parameter lines
// between the tag and `mask` are not visible in this view; the statement order
// suggests a fall-through switch -- confirm.
// NOTE(review): as visible here, `high` holds a bit index (not a mask) when it
// is XORed into `bits`; a converting line may be missing from this view.
210 template <
typename V,
typename MT,
typename IT>
211 Vc_ALWAYS_INLINE
void executeGather(PopcntSwitchT,
215 typename V::MaskArgument mask,
216 enable_if<V::Size == 8> = nullarg)
218 unsigned int bits = mask.toInt();
219 unsigned int low, high = 0;
220 switch (Vc::Detail::popcnt8(bits)) {
222 v.gather(mem, indexes);
225 low = _bit_scan_forward(bits);
227 v[low] = mem[indexes[low]];
230 high = _bit_scan_reverse(bits);
231 v[high] = mem[indexes[high]];
235 low = _bit_scan_forward(bits);
236 bits ^= high | (1 << low);
237 v[low] = mem[indexes[low]];
240 high = _bit_scan_reverse(bits);
241 v[high] = mem[indexes[high]];
245 low = _bit_scan_forward(bits);
246 bits ^= high | (1 << low);
247 v[low] = mem[indexes[low]];
250 high = _bit_scan_reverse(bits);
251 v[high] = mem[indexes[high]];
254 low = _bit_scan_forward(bits);
255 v[low] = mem[indexes[low]];
// Gather overload selected by the PopcntSwitchT tag for vectors with
// V::Size == 4 (chosen via the enable_if default argument).
//
// Switches on the number of set mask bits (popcnt4). One branch calls
// v.gather(mem, indexes) without a mask; the other visible statements load
// the lowest set lane (_bit_scan_forward) and the highest set lane
// (_bit_scan_reverse) individually.
//
// NOTE(review): the `case` labels, `break`s, braces, any statements that
// update `bits` between the scans, and the parameter lines between the tag
// and `mask` are not visible in this view.
261 template <
typename V,
typename MT,
typename IT>
262 Vc_ALWAYS_INLINE
void executeGather(PopcntSwitchT,
266 typename V::MaskArgument mask,
267 enable_if<V::Size == 4> = nullarg)
269 unsigned int bits = mask.toInt();
270 unsigned int low, high = 0;
271 switch (Vc::Detail::popcnt4(bits)) {
273 v.gather(mem, indexes);
276 low = _bit_scan_forward(bits);
278 v[low] = mem[indexes[low]];
281 high = _bit_scan_reverse(bits);
282 v[high] = mem[indexes[high]];
285 low = _bit_scan_forward(bits);
286 v[low] = mem[indexes[low]];
// Gather overload selected by the PopcntSwitchT tag for vectors with
// V::Size == 2 (chosen via the enable_if default argument).
//
// Switches on the number of set mask bits. One branch calls
// v.gather(mem, indexes) without a mask; the other visible branch loads only
// the lowest set lane found by _bit_scan_forward.
//
// NOTE(review): the declaration of `low`, the `case` labels, `break`s, braces
// and the parameter lines between the tag and `mask` are not visible in this
// view.
// NOTE(review): popcnt4 is used although V::Size is 2; presumably fine because
// a 2-lane mask only sets the low two bits -- confirm against mask.toInt().
292 template <
typename V,
typename MT,
typename IT>
293 Vc_ALWAYS_INLINE
void executeGather(PopcntSwitchT,
297 typename V::MaskArgument mask,
298 enable_if<V::Size == 2> = nullarg)
300 unsigned int bits = mask.toInt();
302 switch (Vc::Detail::popcnt4(bits)) {
304 v.gather(mem, indexes);
307 low = _bit_scan_forward(bits);
308 v[low] = mem[indexes[low]];
318 #endif // VC_COMMON_GATHERIMPLEMENTATION_H_