Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions doc/release/1.15.0-notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,11 @@ Improvements
This means it takes all the same arguments, making more code written for
ndarray work for masked array too.

Enable AVX2/AVX512 at compile time
----------------------------------
``simd.inc.src`` now selects AVX2 or AVX512 code paths at compile time. This
closes the gap where compiling NumPy for AVX2 (or AVX512) with
``-march=native`` still produced the SSE implementations of the simd
functions, even though the rest of the code was built with AVX2.

Changes
=======
199 changes: 199 additions & 0 deletions numpy/core/src/umath/simd.inc.src
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,12 @@
#include "numpy/npy_math.h"
#ifdef NPY_HAVE_SSE2_INTRINSICS
#include <emmintrin.h>
#if !defined(_MSC_VER) || _MSC_VER >= 1600
#include <immintrin.h>
#else
#undef __AVX2__
#undef __AVX512F__
#endif
#endif
#include <assert.h>
#include <stdlib.h>
Expand Down Expand Up @@ -401,7 +407,11 @@ static NPY_INLINE npy_double sse2_horizontal_@VOP@___m128d(__m128d v)
* #scalarf = npy_sqrtf, npy_sqrt#
* #c = f, #
* #vtype = __m128, __m128d#
* #vtype256 = __m256, __m256d#
* #vtype512 = __m512, __m512d#
* #vpre = _mm, _mm#
* #vpre256 = _mm256, _mm256#
* #vpre512 = _mm512, _mm512#
* #vsuf = ps, pd#
* #vsufs = ss, sd#
* #nan = NPY_NANF, NPY_NAN#
Expand All @@ -420,6 +430,115 @@ static NPY_INLINE npy_double sse2_horizontal_@VOP@___m128d(__m128d v)
static void
sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
#ifdef __AVX512F__
LOOP_BLOCK_ALIGN_VAR(op, @type@, 64)
op[i] = ip1[i] @OP@ ip2[i];
/* lots of specializations, to squeeze out max performance */
if (npy_is_aligned(&ip1[i], 64) && npy_is_aligned(&ip2[i], 64)) {
if (ip1 == ip2) {
LOOP_BLOCKED(@type@, 64) {
@vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
@vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a);
@vpre512@_store_@vsuf@(&op[i], c);
}
}
else {
LOOP_BLOCKED(@type@, 64) {
@vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
@vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
@vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
@vpre512@_store_@vsuf@(&op[i], c);
}
}
}
else if (npy_is_aligned(&ip1[i], 64)) {
LOOP_BLOCKED(@type@, 64) {
@vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
@vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
@vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
@vpre512@_store_@vsuf@(&op[i], c);
}
}
else if (npy_is_aligned(&ip2[i], 64)) {
LOOP_BLOCKED(@type@, 64) {
@vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
@vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
@vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
@vpre512@_store_@vsuf@(&op[i], c);
}
}
else {
if (ip1 == ip2) {
LOOP_BLOCKED(@type@, 64) {
@vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
@vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a);
@vpre512@_store_@vsuf@(&op[i], c);
}
}
else {
LOOP_BLOCKED(@type@, 64) {
@vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
@vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
@vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
@vpre512@_store_@vsuf@(&op[i], c);
}
}
}
#elif __AVX2__
LOOP_BLOCK_ALIGN_VAR(op, @type@, 32)
op[i] = ip1[i] @OP@ ip2[i];
/* lots of specializations, to squeeze out max performance */
if (npy_is_aligned(&ip1[i], 32) && npy_is_aligned(&ip2[i], 32)) {
if (ip1 == ip2) {
LOOP_BLOCKED(@type@, 32) {
@vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
@vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a);
@vpre256@_store_@vsuf@(&op[i], c);
}
}
else {
LOOP_BLOCKED(@type@, 32) {
@vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
@vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
@vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
@vpre256@_store_@vsuf@(&op[i], c);
}
}
}
else if (npy_is_aligned(&ip1[i], 32)) {
LOOP_BLOCKED(@type@, 32) {
@vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
@vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
@vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
@vpre256@_store_@vsuf@(&op[i], c);
}
}
else if (npy_is_aligned(&ip2[i], 32)) {
LOOP_BLOCKED(@type@, 32) {
@vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
@vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
@vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
@vpre256@_store_@vsuf@(&op[i], c);
}
}
else {
if (ip1 == ip2) {
LOOP_BLOCKED(@type@, 32) {
@vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
@vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a);
@vpre256@_store_@vsuf@(&op[i], c);
}
}
else {
LOOP_BLOCKED(@type@, 32) {
@vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
@vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
@vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
@vpre256@_store_@vsuf@(&op[i], c);
}
}
}
#else
LOOP_BLOCK_ALIGN_VAR(op, @type@, 16)
op[i] = ip1[i] @OP@ ip2[i];
/* lots of specializations, to squeeze out max performance */
Expand Down Expand Up @@ -473,6 +592,7 @@ sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
}
}
}
#endif
LOOP_BLOCKED_END {
op[i] = ip1[i] @OP@ ip2[i];
}
Expand All @@ -482,6 +602,45 @@ sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
static void
sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
#ifdef __AVX512F__
const @vtype512@ a = @vpre512@_set1_@vsuf@(ip1[0]);
LOOP_BLOCK_ALIGN_VAR(op, @type@, 64)
op[i] = ip1[0] @OP@ ip2[i];
if (npy_is_aligned(&ip2[i], 64)) {
LOOP_BLOCKED(@type@, 64) {
@vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
@vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
@vpre512@_store_@vsuf@(&op[i], c);
}
}
else {
LOOP_BLOCKED(@type@, 64) {
@vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
@vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
@vpre512@_store_@vsuf@(&op[i], c);
}
}


#elif __AVX2__
const @vtype256@ a = @vpre256@_set1_@vsuf@(ip1[0]);
LOOP_BLOCK_ALIGN_VAR(op, @type@, 32)
op[i] = ip1[0] @OP@ ip2[i];
if (npy_is_aligned(&ip2[i], 32)) {
LOOP_BLOCKED(@type@, 32) {
@vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
@vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
@vpre256@_store_@vsuf@(&op[i], c);
}
}
else {
LOOP_BLOCKED(@type@, 32) {
@vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
@vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
@vpre256@_store_@vsuf@(&op[i], c);
}
}
#else
const @vtype@ a = @vpre@_set1_@vsuf@(ip1[0]);
LOOP_BLOCK_ALIGN_VAR(op, @type@, 16)
op[i] = ip1[0] @OP@ ip2[i];
Expand All @@ -499,6 +658,7 @@ sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i
@vpre@_store_@vsuf@(&op[i], c);
}
}
#endif
LOOP_BLOCKED_END {
op[i] = ip1[0] @OP@ ip2[i];
}
Expand All @@ -508,6 +668,44 @@ sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i
static void
sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
#ifdef __AVX512F__
const @vtype512@ b = @vpre512@_set1_@vsuf@(ip2[0]);
LOOP_BLOCK_ALIGN_VAR(op, @type@, 64)
op[i] = ip1[i] @OP@ ip2[0];
if (npy_is_aligned(&ip1[i], 64)) {
LOOP_BLOCKED(@type@, 64) {
@vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
@vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
@vpre512@_store_@vsuf@(&op[i], c);
}
}
else {
LOOP_BLOCKED(@type@, 64) {
@vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
@vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
@vpre512@_store_@vsuf@(&op[i], c);
}
}

#elif __AVX2__
const @vtype256@ b = @vpre256@_set1_@vsuf@(ip2[0]);
LOOP_BLOCK_ALIGN_VAR(op, @type@, 32)
op[i] = ip1[i] @OP@ ip2[0];
if (npy_is_aligned(&ip1[i], 32)) {
LOOP_BLOCKED(@type@, 32) {
@vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
@vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
@vpre256@_store_@vsuf@(&op[i], c);
}
}
else {
LOOP_BLOCKED(@type@, 32) {
@vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
@vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
@vpre256@_store_@vsuf@(&op[i], c);
}
}
#else
const @vtype@ b = @vpre@_set1_@vsuf@(ip2[0]);
LOOP_BLOCK_ALIGN_VAR(op, @type@, 16)
op[i] = ip1[i] @OP@ ip2[0];
Expand All @@ -525,6 +723,7 @@ sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i
@vpre@_store_@vsuf@(&op[i], c);
}
}
#endif
LOOP_BLOCKED_END {
op[i] = ip1[i] @OP@ ip2[0];
}
Expand Down