Vc  1.4.2
SIMD Vector Classes for C++
gatherimplementation.h
1 /* This file is part of the Vc library. {{{
2 Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>
3 
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are met:
6  * Redistributions of source code must retain the above copyright
7  notice, this list of conditions and the following disclaimer.
8  * Redistributions in binary form must reproduce the above copyright
9  notice, this list of conditions and the following disclaimer in the
10  documentation and/or other materials provided with the distribution.
11  * Neither the names of contributing organizations nor the
12  names of its contributors may be used to endorse or promote products
13  derived from this software without specific prior written permission.
14 
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
19 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 
26 }}}*/
27 
28 #ifndef VC_COMMON_GATHERIMPLEMENTATION_H_
29 #define VC_COMMON_GATHERIMPLEMENTATION_H_
30 
31 #include "macros.h"
32 
33 namespace Vc_VERSIONED_NAMESPACE
34 {
35 namespace Common
36 {
37 
// Selects which executeGather overload (see the tag types and overloads below) is used
// to perform a masked gather. The name suggests the same selector is shared with the
// scatter code; that use is not visible in this header.
enum class GatherScatterImplementation : int {
    // Visit every lane and load the ones whose mask entry is set.
    SimpleLoop = 0,
    // Zero the indexes of inactive lanes, gather all lanes, then blend under the mask.
    SetIndexZero = 1,
    // Walk only the set bits of the mask using bit-scan instructions.
    BitScanLoop = 2,
    // Switch on the number of set mask bits and run an unrolled load sequence.
    PopcntSwitch = 3
};
44 
45 using SimpleLoopT = std::integral_constant<GatherScatterImplementation, GatherScatterImplementation::SimpleLoop>;
46 using SetIndexZeroT = std::integral_constant<GatherScatterImplementation, GatherScatterImplementation::SetIndexZero>;
47 using BitScanLoopT = std::integral_constant<GatherScatterImplementation, GatherScatterImplementation::BitScanLoop>;
48 using PopcntSwitchT = std::integral_constant<GatherScatterImplementation, GatherScatterImplementation::PopcntSwitch>;
49 
50 template <typename V, typename MT, typename IT>
51 Vc_ALWAYS_INLINE void executeGather(SetIndexZeroT,
52  V &v,
53  const MT *mem,
54  IT &&indexes_,
55  typename V::MaskArgument mask)
56 {
57  auto indexes = std::forward<IT>(indexes_);
58  indexes.setZeroInverted(static_cast<decltype(!indexes)>(mask));
59  const V tmp(mem, indexes);
60  where(mask) | v = tmp;
61 }
62 
63 template <typename V, typename MT, typename IT>
64 Vc_ALWAYS_INLINE void executeGather(SimpleLoopT, V &v, const MT *mem, const IT &indexes,
65  const typename V::MaskArgument mask)
66 {
67  if (Vc_IS_UNLIKELY(mask.isEmpty())) {
68  return;
69  }
70 #if defined Vc_GCC && Vc_GCC >= 0x40900
71  // GCC 4.8 doesn't support dependent type and constexpr vector_size argument
72  constexpr std::size_t Sizeof = sizeof(V);
73  using Builtin [[gnu::vector_size(Sizeof)]] = typename V::value_type;
74  Builtin tmp = reinterpret_cast<Builtin>(v.data());
75  Common::unrolled_loop<std::size_t, 0, V::Size>([&](std::size_t i) {
76  if (mask[i]) {
77  tmp[i] = mem[indexes[i]];
78  }
79  });
80  v.data() = reinterpret_cast<typename V::VectorType>(tmp);
81 #else
82  Common::unrolled_loop<std::size_t, 0, V::Size>([&](std::size_t i) {
83  if (mask[i])
84  v[i] = mem[indexes[i]];
85  });
86 #endif
87 }
88 
89 template <typename V, typename MT, typename IT>
90 Vc_ALWAYS_INLINE void executeGather(BitScanLoopT,
91  V &v,
92  const MT *mem,
93  const IT &indexes,
94  typename V::MaskArgument mask)
95 {
96 #ifdef Vc_GNU_ASM
97  size_t bits = mask.toInt();
98  while (Vc_IS_LIKELY(bits > 0)) {
99  size_t i, j;
100  asm("bsf %[bits],%[i]\n\t"
101  "bsr %[bits],%[j]\n\t"
102  "btr %[i],%[bits]\n\t"
103  "btr %[j],%[bits]\n\t"
104  : [i] "=r"(i), [j] "=r"(j), [bits] "+r"(bits));
105  v[i] = mem[indexes[i]];
106  v[j] = mem[indexes[j]];
107  }
108 #else
109  // Alternative from Vc::SSE (0.7)
110  int bits = mask.toInt();
111  while (bits) {
112  const int i = _bit_scan_forward(bits);
113  bits &= bits - 1;
114  v[i] = mem[indexes[i]];
115  }
116 #endif // Vc_GNU_ASM
117 }
118 
119 template <typename V, typename MT, typename IT>
120 Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT,
121  V &v,
122  const MT *mem,
123  const IT &indexes,
124  typename V::MaskArgument mask,
125  enable_if<V::Size == 16> = nullarg)
126 {
127  unsigned int bits = mask.toInt();
128  unsigned int low, high = 0;
129  switch (Vc::Detail::popcnt16(bits)) {
130  case 16:
131  v.gather(mem, indexes);
132  break;
133  case 15:
134  low = _bit_scan_forward(bits);
135  bits ^= 1 << low;
136  v[low] = mem[indexes[low]];
137  // fallthrough
138  case 14:
139  high = _bit_scan_reverse(bits);
140  v[high] = mem[indexes[high]];
141  high = (1 << high);
142  // fallthrough
143  case 13:
144  low = _bit_scan_forward(bits);
145  bits ^= high | (1 << low);
146  v[low] = mem[indexes[low]];
147  // fallthrough
148  case 12:
149  high = _bit_scan_reverse(bits);
150  v[high] = mem[indexes[high]];
151  high = (1 << high);
152  // fallthrough
153  case 11:
154  low = _bit_scan_forward(bits);
155  bits ^= high | (1 << low);
156  v[low] = mem[indexes[low]];
157  // fallthrough
158  case 10:
159  high = _bit_scan_reverse(bits);
160  v[high] = mem[indexes[high]];
161  high = (1 << high);
162  // fallthrough
163  case 9:
164  low = _bit_scan_forward(bits);
165  bits ^= high | (1 << low);
166  v[low] = mem[indexes[low]];
167  // fallthrough
168  case 8:
169  high = _bit_scan_reverse(bits);
170  v[high] = mem[indexes[high]];
171  high = (1 << high);
172  // fallthrough
173  case 7:
174  low = _bit_scan_forward(bits);
175  bits ^= high | (1 << low);
176  v[low] = mem[indexes[low]];
177  // fallthrough
178  case 6:
179  high = _bit_scan_reverse(bits);
180  v[high] = mem[indexes[high]];
181  high = (1 << high);
182  // fallthrough
183  case 5:
184  low = _bit_scan_forward(bits);
185  bits ^= high | (1 << low);
186  v[low] = mem[indexes[low]];
187  // fallthrough
188  case 4:
189  high = _bit_scan_reverse(bits);
190  v[high] = mem[indexes[high]];
191  high = (1 << high);
192  // fallthrough
193  case 3:
194  low = _bit_scan_forward(bits);
195  bits ^= high | (1 << low);
196  v[low] = mem[indexes[low]];
197  // fallthrough
198  case 2:
199  high = _bit_scan_reverse(bits);
200  v[high] = mem[indexes[high]];
201  // fallthrough
202  case 1:
203  low = _bit_scan_forward(bits);
204  v[low] = mem[indexes[low]];
205  // fallthrough
206  case 0:
207  break;
208  }
209 }
210 template <typename V, typename MT, typename IT>
211 Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT,
212  V &v,
213  const MT *mem,
214  const IT &indexes,
215  typename V::MaskArgument mask,
216  enable_if<V::Size == 8> = nullarg)
217 {
218  unsigned int bits = mask.toInt();
219  unsigned int low, high = 0;
220  switch (Vc::Detail::popcnt8(bits)) {
221  case 8:
222  v.gather(mem, indexes);
223  break;
224  case 7:
225  low = _bit_scan_forward(bits);
226  bits ^= 1 << low;
227  v[low] = mem[indexes[low]];
228  // fallthrough
229  case 6:
230  high = _bit_scan_reverse(bits);
231  v[high] = mem[indexes[high]];
232  high = (1 << high);
233  // fallthrough
234  case 5:
235  low = _bit_scan_forward(bits);
236  bits ^= high | (1 << low);
237  v[low] = mem[indexes[low]];
238  // fallthrough
239  case 4:
240  high = _bit_scan_reverse(bits);
241  v[high] = mem[indexes[high]];
242  high = (1 << high);
243  // fallthrough
244  case 3:
245  low = _bit_scan_forward(bits);
246  bits ^= high | (1 << low);
247  v[low] = mem[indexes[low]];
248  // fallthrough
249  case 2:
250  high = _bit_scan_reverse(bits);
251  v[high] = mem[indexes[high]];
252  // fallthrough
253  case 1:
254  low = _bit_scan_forward(bits);
255  v[low] = mem[indexes[low]];
256  // fallthrough
257  case 0:
258  break;
259  }
260 }
261 template <typename V, typename MT, typename IT>
262 Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT,
263  V &v,
264  const MT *mem,
265  const IT &indexes,
266  typename V::MaskArgument mask,
267  enable_if<V::Size == 4> = nullarg)
268 {
269  unsigned int bits = mask.toInt();
270  unsigned int low, high = 0;
271  switch (Vc::Detail::popcnt4(bits)) {
272  case 4:
273  v.gather(mem, indexes);
274  break;
275  case 3:
276  low = _bit_scan_forward(bits);
277  bits ^= 1 << low;
278  v[low] = mem[indexes[low]];
279  // fallthrough
280  case 2:
281  high = _bit_scan_reverse(bits);
282  v[high] = mem[indexes[high]];
283  // fallthrough
284  case 1:
285  low = _bit_scan_forward(bits);
286  v[low] = mem[indexes[low]];
287  // fallthrough
288  case 0:
289  break;
290  }
291 }
292 template <typename V, typename MT, typename IT>
293 Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT,
294  V &v,
295  const MT *mem,
296  const IT &indexes,
297  typename V::MaskArgument mask,
298  enable_if<V::Size == 2> = nullarg)
299 {
300  unsigned int bits = mask.toInt();
301  unsigned int low;
302  switch (Vc::Detail::popcnt4(bits)) {
303  case 2:
304  v.gather(mem, indexes);
305  break;
306  case 1:
307  low = _bit_scan_forward(bits);
308  v[low] = mem[indexes[low]];
309  // fallthrough
310  case 0:
311  break;
312  }
313 }
314 
315 } // namespace Common
316 } // namespace Vc
317 
318 #endif // VC_COMMON_GATHERIMPLEMENTATION_H_
Vc::where
constexpr WhereImpl::WhereMask< M > where(const M &mask)
Definition: where.h:265