Vc  1.4.3
SIMD Vector Classes for C++
global.h
1 /* This file is part of the Vc library. {{{
2 Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
3 
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are met:
6  * Redistributions of source code must retain the above copyright
7  notice, this list of conditions and the following disclaimer.
8  * Redistributions in binary form must reproduce the above copyright
9  notice, this list of conditions and the following disclaimer in the
10  documentation and/or other materials provided with the distribution.
11  * Neither the names of contributing organizations nor the
12  names of its contributors may be used to endorse or promote products
13  derived from this software without specific prior written permission.
14 
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
19 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 
26 }}}*/
27 
28 #ifndef VC_GLOBAL_H_
29 #define VC_GLOBAL_H_
30 
31 #include <cstdint>
32 #include "fwddecl.h"
33 
34 #ifdef DOXYGEN
35 
// This branch is only active when DOXYGEN is defined. Each compiler identification
// macro is defined and then immediately undefined again.
48 #define Vc_ICC __INTEL_COMPILER_BUILD_DATE
49 #undef Vc_ICC
57 #define Vc_CLANG (__clang_major__ * 0x10000 + __clang_minor__ * 0x100 + __clang_patchlevel__)
58 #undef Vc_CLANG
66 #define Vc_APPLECLANG (__clang_major__ * 0x10000 + __clang_minor__ * 0x100 + __clang_patchlevel__)
67 #undef Vc_APPLECLANG
// NOTE(review): unlike its siblings, Vc_GCC is not followed by an #undef in this
// listing — confirm whether that is intentional.
75 #define Vc_GCC (__GNUC__ * 0x10000 + __GNUC_MINOR__ * 0x100 + __GNUC_PATCHLEVEL__)
83 #define Vc_MSVC _MSC_FULL_VER
84 #undef Vc_MSVC
86 
87 #else // DOXYGEN
88 
89 // Compiler defines
// At most one of Vc_ICC, Vc_APPLECLANG, Vc_CLANG, Vc_GCC and Vc_MSVC is defined; the
// first matching test wins. If none matches, Vc_UNSUPPORTED_COMPILER is set instead.
// The Clang and GCC values encode the version as major * 0x10000 + minor * 0x100 + patch,
// so e.g. 0x60000 stands for version 6.0.0.
// NOTE(review): the Intel and Clang tests presumably precede the __GNUC__ test because
// those compilers may define __GNUC__ as well — confirm.
90 #ifdef __INTEL_COMPILER
91 #define Vc_ICC __INTEL_COMPILER_BUILD_DATE
92 #elif defined(__clang__) && defined(__apple_build_version__)
93 #define Vc_APPLECLANG (__clang_major__ * 0x10000 + __clang_minor__ * 0x100 + __clang_patchlevel__)
94 #elif defined(__clang__)
95 #define Vc_CLANG (__clang_major__ * 0x10000 + __clang_minor__ * 0x100 + __clang_patchlevel__)
96 #elif defined(__GNUC__)
97 #define Vc_GCC (__GNUC__ * 0x10000 + __GNUC_MINOR__ * 0x100 + __GNUC_PATCHLEVEL__)
98 #elif defined(_MSC_VER)
99 #define Vc_MSVC _MSC_FULL_VER
100 #else
101 #define Vc_UNSUPPORTED_COMPILER 1
102 #endif
103 
// GCC 6 and newer: save the current diagnostic state and silence -Wignored-attributes
// from here on. Vc_RESET_DIAGNOSTICS expands to the pragma that restores the saved
// state. For every other compiler Vc_RESET_DIAGNOSTICS expands to nothing.
104 #if defined Vc_GCC && Vc_GCC >= 0x60000
105 #define Vc_RESET_DIAGNOSTICS _Pragma("GCC diagnostic pop")
106 #pragma GCC diagnostic push
107 #pragma GCC diagnostic ignored "-Wignored-attributes"
108 #else
109 #define Vc_RESET_DIAGNOSTICS
110 #endif
111 
112 #if defined Vc_ICC
113 // 'warning #2922: template parameter "<unnamed>" cannot be used because it follows a
114 // parameter pack and cannot be deduced from the parameters of function template'
115 // This warning is unhelpful: the parameter is unnamed because it is intentionally unused.
116 // There is no known workaround other than disabling the warning. Sadly, it does not
117 // suffice to disable it for the Vc headers; it must also be disabled wherever Vc types are used.
// Note that the pragma below is not paired with a restore in this block.
118 #pragma warning disable 2922
119 #endif
120 
// Require C++11. The __cplusplus test is waived for MSVC when _MSC_VER >= 1900
// (presumably because that compiler does not report a C++11 __cplusplus value — confirm).
// Vc_CXX14 is set for __cplusplus >= 201402L, Vc_CXX17 additionally for __cplusplus > 201700L.
121 #if __cplusplus < 201103 && (!defined Vc_MSVC || _MSC_VER < 1900)
122 # error "Vc requires support for C++11."
123 #elif __cplusplus >= 201402L
124 # define Vc_CXX14 1
125 # if __cplusplus > 201700L
126 # define Vc_CXX17 1
127 # endif
128 #endif
129 
// Vc_GNU_ASM is set for every compiler that defines __GNUC__ (not only Vc_GCC), unless
// the user opts out by defining Vc_NO_INLINE_ASM.
130 #if defined(__GNUC__) && !defined(Vc_NO_INLINE_ASM)
131 #define Vc_GNU_ASM 1
132 #endif
133 
// Decide whether Vc_HAVE_STD_MAX_ALIGN_T or Vc_HAVE_MAX_ALIGN_T is defined. In some
// configurations (e.g. GCC >= 7 on i386 without glibc >= 2.26, or Vc_CLANG / Vc_ICC)
// neither macro is defined.
134 #ifdef Vc_GCC
135 # if Vc_GCC >= 0x70000 && defined __i386__
136  // GCC 7 changed alignof(max_align_t) to 16. glibc followed with its malloc in 2.26.
137  // 1. If GCC >= 7 and libc is not glibc, max_align_t and malloc mismatch.
138  // 2. If GCC >= 7 and libc is glibc < 2.26, max_align_t and malloc mismatch.
 // Therefore the macro is only set when __GLIBC_PREREQ(2,26) holds.
139 # ifdef __GLIBC_PREREQ
140 # if __GLIBC_PREREQ(2,26)
141 # define Vc_HAVE_STD_MAX_ALIGN_T 1
142 # endif
143 # endif
144 # elif Vc_GCC >= 0x40900
145 # define Vc_HAVE_STD_MAX_ALIGN_T 1
146 # else
 // GCC older than 4.9.0
147 # define Vc_HAVE_MAX_ALIGN_T 1
148 # endif
149 #elif !defined(Vc_CLANG) && !defined(Vc_ICC)
150 // Clang/ICC don't provide max_align_t at all
151 // TODO: Clang defines max_align_t since 3.5.0. Whether std::max_align_t is defined depends on the
152 // standard library version.
// NOTE(review): Vc_APPLECLANG is not excluded by the condition above, so Apple Clang
// builds take this branch together with Vc_MSVC — confirm this is intended.
153 # define Vc_HAVE_STD_MAX_ALIGN_T 1
154 #endif
155 
// Vc_USE_BUILTIN_VECTOR_TYPES is set for GCC, Clang and Apple Clang only.
156 #if defined(Vc_GCC) || defined(Vc_CLANG) || defined Vc_APPLECLANG
157 #define Vc_USE_BUILTIN_VECTOR_TYPES 1
158 #endif
159 
// Calling-convention macros: with MSVC they expand to __cdecl and __vectorcall; with
// every other compiler they expand to nothing.
160 #ifdef Vc_MSVC
161 # define Vc_CDECL __cdecl
162 # define Vc_VDECL __vectorcall
163 #else
164 # define Vc_CDECL
165 # define Vc_VDECL
166 #endif
167 
168 /* Define the following strings to a unique integer, which is the only type the preprocessor can
169  * compare. This allows using -DVc_IMPL=SSE3. The preprocessor will then consider Vc_IMPL and SSE3
170  * to be equal. Of course, it is important to undefine the strings later on!
171  */
// Base implementations: one value each, stored in the bits covered by IMPL_MASK.
172 #define Scalar 0x00100000
173 #define SSE 0x00200000
174 #define SSE2 0x00300000
175 #define SSE3 0x00400000
176 #define SSSE3 0x00500000
177 #define SSE4_1 0x00600000
178 #define SSE4_2 0x00700000
179 #define AVX 0x00800000
180 #define AVX2 0x00900000
181 
// Extensions: one distinct bit each, so several can be combined with a base implementation.
182 #define XOP 0x00000001
183 #define FMA4 0x00000002
184 #define F16C 0x00000004
185 #define POPCNT 0x00000008
186 #define SSE4a 0x00000010
187 #define FMA 0x00000020
188 #define BMI2 0x00000040
189 
// IMPL_MASK selects the base implementation bits, EXT_MASK the extension bits. In the
// visible part of this file EXT_MASK is only defined and later undefined, never tested.
190 #define IMPL_MASK 0xFFF00000
191 #define EXT_MASK 0x000FFFFF
192 
// MSVC only: define __SSE__ / __SSE2__ (unless already defined) so that the selection
// logic below can test them. With _M_IX86_FP defined, a value >= 1 yields __SSE__ and
// a value >= 2 additionally yields __SSE2__; otherwise _M_AMD64 yields both.
193 #ifdef Vc_MSVC
194 # ifdef _M_IX86_FP
195 # if _M_IX86_FP >= 1
196 # ifndef __SSE__
197 # define __SSE__ 1
198 # endif
199 # endif
200 # if _M_IX86_FP >= 2
201 # ifndef __SSE2__
202 # define __SSE2__ 1
203 # endif
204 # endif
205 # elif defined(_M_AMD64)
206 // If the target is x86_64 then SSE2 is guaranteed
207 # ifndef __SSE__
208 # define __SSE__ 1
209 # endif
210 # ifndef __SSE2__
211 # define __SSE2__ 1
212 # endif
213 # endif
214 #endif
215 
// ICC only: if __POPCNT__ is not defined yet, define it whenever __SSE4_2__ or
// __SSE4A__ is defined.
216 #if defined Vc_ICC && !defined __POPCNT__
217 # if defined __SSE4_2__ || defined __SSE4A__
218 # define __POPCNT__ 1
219 # endif
220 #endif
221 
// Reject the legacy upper-case VC_IMPL macro with an explicit error instead of
// silently ignoring it.
222 #ifdef VC_IMPL
223 #error "You are using the old VC_IMPL macro. Since Vc 1.0 all Vc macros start with Vc_, i.e. a lower-case 'c'"
224 #endif
225 
// Derive the Vc_IMPL_* feature macros. Without a user-supplied Vc_IMPL the compiler's
// predefined target macros (__AVX2__, __SSE2__, ...) decide; otherwise the bits of
// Vc_IMPL decide.
226 #ifndef Vc_IMPL
227 
// Automatic selection: AVX2 and AVX take precedence. Otherwise every SSE level from
// SSE2 upwards that the compiler reports is recorded individually.
228 # if defined(__AVX2__)
229 # define Vc_IMPL_AVX2 1
230 # define Vc_IMPL_AVX 1
231 # elif defined(__AVX__)
232 # define Vc_IMPL_AVX 1
233 # else
234 # if defined(__SSE4_2__)
235 # define Vc_IMPL_SSE 1
236 # define Vc_IMPL_SSE4_2 1
237 # endif
238 # if defined(__SSE4_1__)
239 # define Vc_IMPL_SSE 1
240 # define Vc_IMPL_SSE4_1 1
241 # endif
242 # if defined(__SSE3__)
243 # define Vc_IMPL_SSE 1
244 # define Vc_IMPL_SSE3 1
245 # endif
246 # if defined(__SSSE3__)
247 # define Vc_IMPL_SSE 1
248 # define Vc_IMPL_SSSE3 1
249 # endif
250 # if defined(__SSE2__)
251 # define Vc_IMPL_SSE 1
252 # define Vc_IMPL_SSE2 1
253 # endif
254 
255 # if defined(Vc_IMPL_SSE)
256  // nothing
257 # else
 // None of the tests above matched (a target with only __SSE__ also ends up here):
 // select the scalar implementation.
258 # define Vc_IMPL_Scalar 1
259 # endif
260 # endif
// Extension macros are only picked up from the compiler for non-scalar builds.
261 # if !defined(Vc_IMPL_Scalar)
262 # ifdef __FMA4__
263 # define Vc_IMPL_FMA4 1
264 # endif
265 # ifdef __XOP__
266 # define Vc_IMPL_XOP 1
267 # endif
268 # ifdef __F16C__
269 # define Vc_IMPL_F16C 1
270 # endif
271 # ifdef __POPCNT__
272 # define Vc_IMPL_POPCNT 1
273 # endif
274 # ifdef __SSE4A__
275 # define Vc_IMPL_SSE4a 1
276 # endif
277 # ifdef __FMA__
278 # define Vc_IMPL_FMA 1
279 # endif
280 # ifdef __BMI2__
281 # define Vc_IMPL_BMI2 1
282 # endif
283 # endif
284 
285 #else // Vc_IMPL
286 
// Explicit selection: the IMPL_MASK bits of Vc_IMPL name the base implementation.
// Naming a specific SSE level (SSE2 ... SSE4_2) also defines every lower SSE level.
287 # if (Vc_IMPL & IMPL_MASK) == AVX2 // AVX2 supersedes SSE
288 # define Vc_IMPL_AVX2 1
289 # define Vc_IMPL_AVX 1
290 # elif (Vc_IMPL & IMPL_MASK) == AVX // AVX supersedes SSE
291 # define Vc_IMPL_AVX 1
292 # elif (Vc_IMPL & IMPL_MASK) == Scalar
293 # define Vc_IMPL_Scalar 1
294 # elif (Vc_IMPL & IMPL_MASK) == SSE4_2
295 # define Vc_IMPL_SSE4_2 1
296 # define Vc_IMPL_SSE4_1 1
297 # define Vc_IMPL_SSSE3 1
298 # define Vc_IMPL_SSE3 1
299 # define Vc_IMPL_SSE2 1
300 # define Vc_IMPL_SSE 1
301 # elif (Vc_IMPL & IMPL_MASK) == SSE4_1
302 # define Vc_IMPL_SSE4_1 1
303 # define Vc_IMPL_SSSE3 1
304 # define Vc_IMPL_SSE3 1
305 # define Vc_IMPL_SSE2 1
306 # define Vc_IMPL_SSE 1
307 # elif (Vc_IMPL & IMPL_MASK) == SSSE3
308 # define Vc_IMPL_SSSE3 1
309 # define Vc_IMPL_SSE3 1
310 # define Vc_IMPL_SSE2 1
311 # define Vc_IMPL_SSE 1
312 # elif (Vc_IMPL & IMPL_MASK) == SSE3
313 # define Vc_IMPL_SSE3 1
314 # define Vc_IMPL_SSE2 1
315 # define Vc_IMPL_SSE 1
316 # elif (Vc_IMPL & IMPL_MASK) == SSE2
317 # define Vc_IMPL_SSE2 1
318 # define Vc_IMPL_SSE 1
319 # elif (Vc_IMPL & IMPL_MASK) == SSE
 // Generic "SSE": the individual SSE levels are taken from the compiler's predefined macros.
320 # define Vc_IMPL_SSE 1
321 # if defined(__SSE4_2__)
322 # define Vc_IMPL_SSE4_2 1
323 # endif
324 # if defined(__SSE4_1__)
325 # define Vc_IMPL_SSE4_1 1
326 # endif
327 # if defined(__SSE3__)
328 # define Vc_IMPL_SSE3 1
329 # endif
330 # if defined(__SSSE3__)
331 # define Vc_IMPL_SSSE3 1
332 # endif
333 # if defined(__SSE2__)
334 # define Vc_IMPL_SSE2 1
335 # endif
336 # elif (Vc_IMPL & IMPL_MASK) == 0 && (Vc_IMPL & SSE4a)
337  // this is for backward compatibility only where SSE4a was included in the main
338  // line of available SIMD instruction sets
339 # define Vc_IMPL_SSE3 1
340 # define Vc_IMPL_SSE2 1
341 # define Vc_IMPL_SSE 1
342 # endif
// Extension flags are read from the individual extension bits of Vc_IMPL.
343 # if (Vc_IMPL & XOP)
344 # define Vc_IMPL_XOP 1
345 # endif
346 # if (Vc_IMPL & FMA4)
347 # define Vc_IMPL_FMA4 1
348 # endif
349 # if (Vc_IMPL & F16C)
350 # define Vc_IMPL_F16C 1
351 # endif
// POPCNT is the exception: besides the Vc_IMPL bit, it is also enabled when the
// compiler defines __POPCNT__ and the build is not scalar.
352 # if (!defined(Vc_IMPL_Scalar) && defined(__POPCNT__)) || (Vc_IMPL & POPCNT)
353 # define Vc_IMPL_POPCNT 1
354 # endif
355 # if (Vc_IMPL & SSE4a)
356 # define Vc_IMPL_SSE4a 1
357 # endif
358 # if (Vc_IMPL & FMA)
359 # define Vc_IMPL_FMA 1
360 # endif
361 # if (Vc_IMPL & BMI2)
362 # define Vc_IMPL_BMI2 1
363 # endif
// Vc_IMPL has been decoded into Vc_IMPL_* macros and is removed here.
364 # undef Vc_IMPL
365 
366 #endif // Vc_IMPL
367 
368 // If AVX is enabled in the compiler it will use VEX coding for the SIMD instructions.
// This depends on the compiler's __AVX__ macro, not on the Vc_IMPL_* macros selected above.
369 #ifdef __AVX__
370 # define Vc_USE_VEX_CODING 1
371 #endif
372 
373 #ifdef Vc_IMPL_AVX
374 // if we have AVX then we also have all SSE intrinsics
// (the AVX branches of the selection logic above define no SSE macros themselves)
375 # define Vc_IMPL_SSE4_2 1
376 # define Vc_IMPL_SSE4_1 1
377 # define Vc_IMPL_SSSE3 1
378 # define Vc_IMPL_SSE3 1
379 # define Vc_IMPL_SSE2 1
380 # define Vc_IMPL_SSE 1
381 #endif
382 
// Clang 3.6.x only (0x30600 <= Vc_CLANG < 0x30700): emit a warning and drop
// Vc_IMPL_AVX / Vc_IMPL_AVX2. The SSE macros defined for AVX builds just above remain
// set, so the build continues with the SSE implementation.
383 #if defined(Vc_CLANG) && Vc_CLANG >= 0x30600 && Vc_CLANG < 0x30700
384 # if defined(Vc_IMPL_AVX)
385 # warning "clang 3.6.x miscompiles AVX code, frequently losing 50% of the data. Vc will fall back to SSE4 instead."
386 # undef Vc_IMPL_AVX
387 # if defined(Vc_IMPL_AVX2)
388 # undef Vc_IMPL_AVX2
389 # endif
390 # endif
391 #endif
392 
// Sanity checks on the outcome of the selection: at least one of Scalar, SSE or AVX
// must be selected, and an SSE selection must include SSE2.
393 # if !defined(Vc_IMPL_Scalar) && !defined(Vc_IMPL_SSE) && !defined(Vc_IMPL_AVX)
394 # error "No suitable Vc implementation was selected! Probably Vc_IMPL was set to an invalid value."
395 # elif defined(Vc_IMPL_SSE) && !defined(Vc_IMPL_SSE2)
396 # error "SSE requested but no SSE2 support. Vc needs at least SSE2!"
397 # endif
398 
// Remove the temporary constants that were defined only to decode Vc_IMPL, so that
// these short names do not remain defined as macros after this header.
399 #undef Scalar
400 #undef SSE
401 #undef SSE2
402 #undef SSE3
403 #undef SSSE3
404 #undef SSE4_1
405 #undef SSE4_2
406 #undef AVX
407 #undef AVX2
408 
409 #undef XOP
410 #undef FMA4
411 #undef F16C
412 #undef POPCNT
413 #undef SSE4a
414 #undef FMA
415 #undef BMI2
416 
417 #undef IMPL_MASK
418 #undef EXT_MASK
419 
// Define exactly one Vc_DEFAULT_IMPL_* macro for the strongest selected implementation,
// tested in the order AVX2, AVX, SSE, Scalar. The macros are defined without a value.
420 #if defined Vc_IMPL_AVX2
421 #define Vc_DEFAULT_IMPL_AVX2
422 #elif defined Vc_IMPL_AVX
423 #define Vc_DEFAULT_IMPL_AVX
424 #elif defined Vc_IMPL_SSE
425 #define Vc_DEFAULT_IMPL_SSE
426 #elif defined Vc_IMPL_Scalar
427 #define Vc_DEFAULT_IMPL_Scalar
428 #else
// Not reachable if the sanity check after the implementation selection passed.
429 #error "Preprocessor logic broken. Please report a bug."
430 #endif
431 
432 #endif // DOXYGEN
433 
434 namespace Vc_VERSIONED_NAMESPACE
435 {
436 
// Integer aliases declared inside the Vc namespace, spelled with fundamental types.
// NOTE(review): the names promise fixed widths, but nothing here checks that char,
// short, int and long long are 8, 16, 32 and 64 bits wide on the target — confirm.
437 typedef signed char int8_t;
438 typedef unsigned char uint8_t;
439 typedef signed short int16_t;
440 typedef unsigned short uint16_t;
441 typedef signed int int32_t;
442 typedef unsigned int uint32_t;
443 typedef signed long long int64_t;
444 typedef unsigned long long uint64_t;
445 
471 };
472 
// Enum to identify a certain SIMD instruction set.
// NOTE(review): this listing omits the enumerators between lines 482 and 501; only the
// mask is shown here.
482 enum Implementation : std::uint_least32_t { // TODO: make enum class
// Selects the bits of a feature word that hold the Implementation value
// (see ImplementationT::current()).
501  ImplementationMask = 0xfff
502 };
503 
// Flags for instruction-set extensions that do not fit a linear list of instruction
// sets. Each visible enumerator occupies its own bit, so flags can be combined.
// NOTE(review): this listing omits lines 516 and 522 of the header, so the enumerators
// defined there are not shown.
514 enum ExtraInstructions : std::uint_least32_t { // TODO: make enum class
518  Fma4Instructions = 0x02000,
520  XopInstructions = 0x04000,
524  Sse4aInstructions = 0x10000,
526  FmaInstructions = 0x20000,
528  VexInstructions = 0x40000,
530  Bmi2Instructions = 0x80000,
531  // PclmulqdqInstructions,
532  // AesInstructions,
533  // RdrandInstructions
// Selects the bits of a feature word that hold ExtraInstructions flags; it is the
// complement of ImplementationMask (0xfff) within 32 bits.
534  ExtraInstructionsMask = 0xfffff000u
535 };
536 
// Compile-time description of an implementation. Features is a bit field: the bits
// selected by ImplementationMask hold an Implementation value, the bits selected by
// ExtraInstructionsMask hold ExtraInstructions flags.
546 template <unsigned int Features> struct ImplementationT {
// Returns the Implementation encoded in Features.
548  static constexpr Implementation current()
549  {
550  return static_cast<Implementation>(Features & ImplementationMask);
551  }
// Returns whether impl equals the Implementation encoded in Features.
553  static constexpr bool is(Implementation impl)
554  {
555  return static_cast<unsigned int>(impl) == current();
556  }
// Returns whether current() lies in the inclusive range [low, high].
561  static constexpr bool is_between(Implementation low, Implementation high)
562  {
563  return static_cast<unsigned int>(low) <= current() &&
564  static_cast<unsigned int>(high) >= current();
565  }
// Returns true when every ExtraInstructions flag set in Features is also set in
// extraInstructions, i.e. the given instruction set covers all extensions this
// implementation relies on. Bits outside ExtraInstructionsMask are ignored.
569  static constexpr bool runs_on(unsigned int extraInstructions)
570  {
571  return (extraInstructions & Features & ExtraInstructionsMask) ==
572  (Features & ExtraInstructionsMask);
573  }
574 };
582 #ifdef Vc_IMPL_Scalar
583  ScalarImpl
584 #elif defined(Vc_IMPL_AVX2)
585  AVX2Impl
586 #elif defined(Vc_IMPL_AVX)
587  AVXImpl
588 #elif defined(Vc_IMPL_SSE4_2)
589  SSE42Impl
590 #elif defined(Vc_IMPL_SSE4_1)
591  SSE41Impl
592 #elif defined(Vc_IMPL_SSSE3)
593  SSSE3Impl
594 #elif defined(Vc_IMPL_SSE3)
595  SSE3Impl
596 #elif defined(Vc_IMPL_SSE2)
597  SSE2Impl
598 #endif
599 #ifdef Vc_IMPL_SSE4a
601 #ifdef Vc_IMPL_XOP
603 #ifdef Vc_IMPL_FMA4
605 #endif
606 #endif
607 #endif
608 #ifdef Vc_IMPL_POPCNT
610 #endif
611 #ifdef Vc_IMPL_FMA
613 #endif
614 #ifdef Vc_IMPL_BMI2
616 #endif
617 #ifdef Vc_USE_VEX_CODING
619 #endif
620  >;
621 
622 } // namespace Vc
623 
624 #include "version.h"
625 
626 #endif // VC_GLOBAL_H_
627 
628 // vim: foldmethod=marker
Implementation
Enum to identify a certain SIMD instruction set.
Definition: global.h:482
ExtraInstructions
The list of available instructions is not easily described by a linear list of instruction sets.
Definition: global.h:514
MallocAlignment
Enum that specifies the alignment and padding restrictions to use for memory allocation with Vc::mall...
Definition: global.h:452
@ AVX2Impl
x86 AVX + AVX2
Definition: global.h:498
@ MICImpl
Intel Xeon Phi.
Definition: global.h:500
@ SSE41Impl
x86 SSE + SSE2 + SSE3 + SSSE3 + SSE4.1
Definition: global.h:492
@ AVXImpl
x86 AVX
Definition: global.h:496
@ SSSE3Impl
x86 SSE + SSE2 + SSE3 + SSSE3
Definition: global.h:490
@ SSE3Impl
x86 SSE + SSE2 + SSE3
Definition: global.h:488
@ SSE2Impl
x86 SSE + SSE2
Definition: global.h:486
@ SSE42Impl
x86 SSE + SSE2 + SSE3 + SSSE3 + SSE4.1 + SSE4.2
Definition: global.h:494
@ ScalarImpl
uses only fundamental types
Definition: global.h:484
@ Bmi2Instructions
Support for BMI2 instructions.
Definition: global.h:530
@ XopInstructions
Support for XOP instructions.
Definition: global.h:520
@ Fma4Instructions
Support for FMA4 instructions.
Definition: global.h:518
@ FmaInstructions
Support for FMA instructions (3 operand variant)
Definition: global.h:526
@ VexInstructions
Support for ternary instruction coding (VEX)
Definition: global.h:528
@ Float16cInstructions
Support for float16 conversions in hardware.
Definition: global.h:516
@ PopcntInstructions
Support for the population count instruction.
Definition: global.h:522
@ Sse4aInstructions
Support for SSE4a instructions.
Definition: global.h:524
@ AlignOnPage
Align on boundary of page sizes (e.g. ...)
Definition: global.h:470
@ AlignOnCacheline
Align on boundary of cache line sizes (e.g. ...)
Definition: global.h:464
@ AlignOnVector
Align on boundary of vector sizes (e.g. ...)
Definition: global.h:458
This class identifies the specific implementation Vc uses in the current translation unit in terms of...
Definition: global.h:546
static constexpr Implementation current()
Returns the currently used Vc::Implementation.
Definition: global.h:548
static constexpr bool runs_on(unsigned int extraInstructions)
Returns whether the current code would run on a CPU providing extraInstructions.
Definition: global.h:569
static constexpr bool is_between(Implementation low, Implementation high)
Returns whether the current Vc::Implementation implements at least low and at most high.
Definition: global.h:561
static constexpr bool is(Implementation impl)
Returns whether impl is the current Vc::Implementation.
Definition: global.h:553