[X86] Extend alignedstore PatFrag to cover atomic_store (#197861)
Conversation
|
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-backend-x86 Author: jofrn Changes: This commit casts floats to ints in an atomic store during AtomicExpand. Smaller FP vectors (`<N x half>`, `<N x bfloat>`) are left to the DAG widen path. Store-side counterpart to #148899. Stacked on top of #197860. 5 Files Affected:
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index 35848f76897b3..60ed4ed1a410f 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -2393,6 +2393,20 @@ def atomic_store_128 :
let MemoryVT = i128;
}
+def atomic_store_128_v2i64 :
+ PatFrag<(ops node:$val, node:$ptr),
+ (atomic_store node:$val, node:$ptr)> {
+ let IsAtomic = true;
+ let MemoryVT = v2i64;
+}
+
+def atomic_store_128_v4i32 :
+ PatFrag<(ops node:$val, node:$ptr),
+ (atomic_store node:$val, node:$ptr)> {
+ let IsAtomic = true;
+ let MemoryVT = v4i32;
+}
+
//===----------------------------------------------------------------------===//
// Selection DAG Pattern Support.
//
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index fea1caf0854f5..064a1e7c138fc 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -32967,6 +32967,19 @@ X86TargetLowering::shouldCastAtomicLoadInIR(LoadInst *LI) const {
return AtomicExpansionKind::None;
}
+TargetLowering::AtomicExpansionKind
+X86TargetLowering::shouldCastAtomicStoreInIR(StoreInst *SI) const {
+ Type *Ty = SI->getValueOperand()->getType();
+ if (!Ty->getScalarType()->isFloatingPointTy())
+ return AtomicExpansionKind::None;
+ // Sub-128-bit FP vectors codegen better when DAG widening folds the value
+ // into an extractelt-from-XMM pattern, instead of an IR-level bitcast to a
+ // scalar integer (which the type legalizer scalarizes).
+ if (Ty->isVectorTy() && Ty->getPrimitiveSizeInBits() < 128)
+ return AtomicExpansionKind::None;
+ return AtomicExpansionKind::CastToInteger;
+}
+
LoadInst *
X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 9a958525057b6..b26f95ddea388 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -894,6 +894,8 @@ namespace llvm {
shouldExpandLogicAtomicRMWInIR(const AtomicRMWInst *AI) const;
TargetLoweringBase::AtomicExpansionKind
shouldCastAtomicLoadInIR(LoadInst *LI) const override;
+ TargetLoweringBase::AtomicExpansionKind
+ shouldCastAtomicStoreInIR(StoreInst *SI) const override;
void emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const override;
void emitCmpArithAtomicRMWIntrinsic(AtomicRMWInst *AI) const override;
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index b2a7bce8d7571..f40edbf911e6f 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -1242,6 +1242,21 @@ def : Pat<(v4i32 (atomic_load_128_v4i32 addr:$src)),
def : Pat<(v4i32 (atomic_load_128_v4i32 addr:$src)),
(VMOVAPDZ128rm addr:$src)>, Requires<[HasAVX512]>;
+// store atomic <2 x i64>
+def : Pat<(atomic_store_128_v2i64 (v2i64 VR128:$src), addr:$dst),
+ (MOVAPDmr addr:$dst, VR128:$src)>, Requires<[UseSSE2]>;
+def : Pat<(atomic_store_128_v2i64 (v2i64 VR128:$src), addr:$dst),
+ (VMOVAPDmr addr:$dst, VR128:$src)>, Requires<[UseAVX]>;
+def : Pat<(atomic_store_128_v2i64 (v2i64 VR128X:$src), addr:$dst),
+ (VMOVAPDZ128mr addr:$dst, VR128X:$src)>, Requires<[HasAVX512]>;
+// store atomic <4 x i32>
+def : Pat<(atomic_store_128_v4i32 (v4i32 VR128:$src), addr:$dst),
+ (MOVAPDmr addr:$dst, VR128:$src)>, Requires<[UseSSE2]>;
+def : Pat<(atomic_store_128_v4i32 (v4i32 VR128:$src), addr:$dst),
+ (VMOVAPDmr addr:$dst, VR128:$src)>, Requires<[UseAVX]>;
+def : Pat<(atomic_store_128_v4i32 (v4i32 VR128X:$src), addr:$dst),
+ (VMOVAPDZ128mr addr:$dst, VR128X:$src)>, Requires<[HasAVX512]>;
+
// store atomic <2 x i8>
def : Pat<(atomic_store_16
(i16 (trunc (i32 (extractelt
@@ -1293,6 +1308,37 @@ def : Pat<(atomic_store_32
(v4i32 (bitconvert (v16i8 VR128X:$src))), (iPTR 0))),
addr:$dst),
(VMOVPDI2DIZmr addr:$dst, VR128X:$src)>, Requires<[HasAVX512]>;
+// store atomic <2 x half>, <2 x bfloat> (via widened v8f16, v8bf16)
+def : Pat<(atomic_store_32
+ (i32 (extractelt
+ (v4i32 (bitconvert (v8f16 VR128:$src))), (iPTR 0))),
+ addr:$dst),
+ (MOVPDI2DImr addr:$dst, VR128:$src)>, Requires<[UseSSE2]>;
+def : Pat<(atomic_store_32
+ (i32 (extractelt
+ (v4i32 (bitconvert (v8bf16 VR128:$src))), (iPTR 0))),
+ addr:$dst),
+ (MOVPDI2DImr addr:$dst, VR128:$src)>, Requires<[UseSSE2]>;
+def : Pat<(atomic_store_32
+ (i32 (extractelt
+ (v4i32 (bitconvert (v8f16 VR128:$src))), (iPTR 0))),
+ addr:$dst),
+ (VMOVPDI2DImr addr:$dst, VR128:$src)>, Requires<[UseAVX]>;
+def : Pat<(atomic_store_32
+ (i32 (extractelt
+ (v4i32 (bitconvert (v8bf16 VR128:$src))), (iPTR 0))),
+ addr:$dst),
+ (VMOVPDI2DImr addr:$dst, VR128:$src)>, Requires<[UseAVX]>;
+def : Pat<(atomic_store_32
+ (i32 (extractelt
+ (v4i32 (bitconvert (v8f16 VR128X:$src))), (iPTR 0))),
+ addr:$dst),
+ (VMOVPDI2DIZmr addr:$dst, VR128X:$src)>, Requires<[HasAVX512]>;
+def : Pat<(atomic_store_32
+ (i32 (extractelt
+ (v4i32 (bitconvert (v8bf16 VR128X:$src))), (iPTR 0))),
+ addr:$dst),
+ (VMOVPDI2DIZmr addr:$dst, VR128X:$src)>, Requires<[HasAVX512]>;
// store atomic <2 x i32,float>, <4 x i16>, <2 x ptr addrspace(270)>
def : Pat<(atomic_store_64
@@ -1340,6 +1386,37 @@ def : Pat<(atomic_store_64
(v2i64 (bitconvert (v8i16 VR128X:$src))), (iPTR 0))),
addr:$dst),
(VMOVPQI2QIZmr addr:$dst, VR128X:$src)>, Requires<[HasAVX512]>;
+// store atomic <4 x half>, <4 x bfloat> (via widened v8f16, v8bf16)
+def : Pat<(atomic_store_64
+ (i64 (extractelt
+ (v2i64 (bitconvert (v8f16 VR128:$src))), (iPTR 0))),
+ addr:$dst),
+ (MOVPQI2QImr addr:$dst, VR128:$src)>, Requires<[UseSSE2]>;
+def : Pat<(atomic_store_64
+ (i64 (extractelt
+ (v2i64 (bitconvert (v8bf16 VR128:$src))), (iPTR 0))),
+ addr:$dst),
+ (MOVPQI2QImr addr:$dst, VR128:$src)>, Requires<[UseSSE2]>;
+def : Pat<(atomic_store_64
+ (i64 (extractelt
+ (v2i64 (bitconvert (v8f16 VR128:$src))), (iPTR 0))),
+ addr:$dst),
+ (VMOVPQI2QImr addr:$dst, VR128:$src)>, Requires<[UseAVX]>;
+def : Pat<(atomic_store_64
+ (i64 (extractelt
+ (v2i64 (bitconvert (v8bf16 VR128:$src))), (iPTR 0))),
+ addr:$dst),
+ (VMOVPQI2QImr addr:$dst, VR128:$src)>, Requires<[UseAVX]>;
+def : Pat<(atomic_store_64
+ (i64 (extractelt
+ (v2i64 (bitconvert (v8f16 VR128X:$src))), (iPTR 0))),
+ addr:$dst),
+ (VMOVPQI2QIZmr addr:$dst, VR128X:$src)>, Requires<[HasAVX512]>;
+def : Pat<(atomic_store_64
+ (i64 (extractelt
+ (v2i64 (bitconvert (v8bf16 VR128X:$src))), (iPTR 0))),
+ addr:$dst),
+ (VMOVPQI2QIZmr addr:$dst, VR128X:$src)>, Requires<[HasAVX512]>;
// Floating point loads/stores.
def : Pat<(atomic_store_32 (i32 (bitconvert (f32 FR32:$src))), addr:$dst),
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 066842739fb61..1d0d0d4dc5c6b 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -765,8 +765,7 @@ define void @store_atomic_vec2_half(ptr %x, <2 x half> %v) {
;
; CHECK-AVX-O3-LABEL: store_atomic_vec2_half:
; CHECK-AVX-O3: # %bb.0:
-; CHECK-AVX-O3-NEXT: vmovd %xmm0, %eax
-; CHECK-AVX-O3-NEXT: movl %eax, (%rdi)
+; CHECK-AVX-O3-NEXT: vmovss %xmm0, (%rdi)
; CHECK-AVX-O3-NEXT: retq
;
; CHECK-SSE-O0-LABEL: store_atomic_vec2_half:
@@ -788,8 +787,7 @@ define void @store_atomic_vec2_half(ptr %x, <2 x half> %v) {
;
; CHECK-AVX-O0-LABEL: store_atomic_vec2_half:
; CHECK-AVX-O0: # %bb.0:
-; CHECK-AVX-O0-NEXT: vmovd %xmm0, %eax
-; CHECK-AVX-O0-NEXT: movl %eax, (%rdi)
+; CHECK-AVX-O0-NEXT: vmovd %xmm0, (%rdi)
; CHECK-AVX-O0-NEXT: retq
store atomic <2 x half> %v, ptr %x release, align 4
ret void
@@ -809,8 +807,7 @@ define void @store_atomic_vec2_bfloat(ptr %x, <2 x bfloat> %v) {
;
; CHECK-AVX-O3-LABEL: store_atomic_vec2_bfloat:
; CHECK-AVX-O3: # %bb.0:
-; CHECK-AVX-O3-NEXT: vmovd %xmm0, %eax
-; CHECK-AVX-O3-NEXT: movl %eax, (%rdi)
+; CHECK-AVX-O3-NEXT: vmovss %xmm0, (%rdi)
; CHECK-AVX-O3-NEXT: retq
;
; CHECK-SSE-O0-LABEL: store_atomic_vec2_bfloat:
@@ -932,8 +929,7 @@ define void @store_atomic_vec4_half(ptr %x, <4 x half> %v) nounwind {
;
; CHECK-AVX-O3-LABEL: store_atomic_vec4_half:
; CHECK-AVX-O3: # %bb.0:
-; CHECK-AVX-O3-NEXT: vmovq %xmm0, %rax
-; CHECK-AVX-O3-NEXT: movq %rax, (%rdi)
+; CHECK-AVX-O3-NEXT: vmovlps %xmm0, (%rdi)
; CHECK-AVX-O3-NEXT: retq
;
; CHECK-SSE2-O0-LABEL: store_atomic_vec4_half:
@@ -1007,8 +1003,7 @@ define void @store_atomic_vec4_half(ptr %x, <4 x half> %v) nounwind {
;
; CHECK-AVX-O0-LABEL: store_atomic_vec4_half:
; CHECK-AVX-O0: # %bb.0:
-; CHECK-AVX-O0-NEXT: vmovq %xmm0, %rax
-; CHECK-AVX-O0-NEXT: movq %rax, (%rdi)
+; CHECK-AVX-O0-NEXT: vmovq %xmm0, (%rdi)
; CHECK-AVX-O0-NEXT: retq
store atomic <4 x half> %v, ptr %x release, align 8
ret void
@@ -1060,8 +1055,7 @@ define void @store_atomic_vec4_bfloat(ptr %x, <4 x bfloat> %v) nounwind {
;
; CHECK-AVX-O3-LABEL: store_atomic_vec4_bfloat:
; CHECK-AVX-O3: # %bb.0:
-; CHECK-AVX-O3-NEXT: vmovq %xmm0, %rax
-; CHECK-AVX-O3-NEXT: movq %rax, (%rdi)
+; CHECK-AVX-O3-NEXT: vmovlps %xmm0, (%rdi)
; CHECK-AVX-O3-NEXT: retq
;
; CHECK-SSE-O0-LABEL: store_atomic_vec4_bfloat:
@@ -1201,6 +1195,87 @@ define void @store_atomic_vec4_bfloat(ptr %x, <4 x bfloat> %v) nounwind {
ret void
}
+define void @store_atomic_vec4_float_align(ptr %x, <4 x float> %v) nounwind {
+; CHECK-SSE2-O3-LABEL: store_atomic_vec4_float_align:
+; CHECK-SSE2-O3: # %bb.0:
+; CHECK-SSE2-O3-NEXT: pushq %rax
+; CHECK-SSE2-O3-NEXT: movq %xmm0, %rsi
+; CHECK-SSE2-O3-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-SSE2-O3-NEXT: movq %xmm0, %rdx
+; CHECK-SSE2-O3-NEXT: movl $3, %ecx
+; CHECK-SSE2-O3-NEXT: callq __atomic_store_16@PLT
+; CHECK-SSE2-O3-NEXT: popq %rax
+; CHECK-SSE2-O3-NEXT: retq
+;
+; CHECK-SSE4-O3-LABEL: store_atomic_vec4_float_align:
+; CHECK-SSE4-O3: # %bb.0:
+; CHECK-SSE4-O3-NEXT: pushq %rbx
+; CHECK-SSE4-O3-NEXT: pextrq $1, %xmm0, %rcx
+; CHECK-SSE4-O3-NEXT: movq %xmm0, %rbx
+; CHECK-SSE4-O3-NEXT: movq (%rdi), %rax
+; CHECK-SSE4-O3-NEXT: movq 8(%rdi), %rdx
+; CHECK-SSE4-O3-NEXT: .p2align 4
+; CHECK-SSE4-O3-NEXT: .LBB39_1: # %atomicrmw.start
+; CHECK-SSE4-O3-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-SSE4-O3-NEXT: lock cmpxchg16b (%rdi)
+; CHECK-SSE4-O3-NEXT: jne .LBB39_1
+; CHECK-SSE4-O3-NEXT: # %bb.2: # %atomicrmw.end
+; CHECK-SSE4-O3-NEXT: popq %rbx
+; CHECK-SSE4-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: store_atomic_vec4_float_align:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: vmovaps %xmm0, (%rdi)
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-SSE2-O0-LABEL: store_atomic_vec4_float_align:
+; CHECK-SSE2-O0: # %bb.0:
+; CHECK-SSE2-O0-NEXT: pushq %rax
+; CHECK-SSE2-O0-NEXT: movq %xmm0, %rsi
+; CHECK-SSE2-O0-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-SSE2-O0-NEXT: movq %xmm0, %rdx
+; CHECK-SSE2-O0-NEXT: movl $3, %ecx
+; CHECK-SSE2-O0-NEXT: callq __atomic_store_16@PLT
+; CHECK-SSE2-O0-NEXT: popq %rax
+; CHECK-SSE2-O0-NEXT: retq
+;
+; CHECK-SSE4-O0-LABEL: store_atomic_vec4_float_align:
+; CHECK-SSE4-O0: # %bb.0:
+; CHECK-SSE4-O0-NEXT: pushq %rbx
+; CHECK-SSE4-O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-SSE4-O0-NEXT: pextrq $1, %xmm0, %rax
+; CHECK-SSE4-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-SSE4-O0-NEXT: movq %xmm0, %rax
+; CHECK-SSE4-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-SSE4-O0-NEXT: movq (%rdi), %rax
+; CHECK-SSE4-O0-NEXT: movq 8(%rdi), %rdx
+; CHECK-SSE4-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-SSE4-O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-SSE4-O0-NEXT: jmp .LBB39_1
+; CHECK-SSE4-O0-NEXT: .LBB39_1: # %atomicrmw.start
+; CHECK-SSE4-O0-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-SSE4-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; CHECK-SSE4-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; CHECK-SSE4-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; CHECK-SSE4-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; CHECK-SSE4-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; CHECK-SSE4-O0-NEXT: lock cmpxchg16b (%rsi)
+; CHECK-SSE4-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-SSE4-O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-SSE4-O0-NEXT: jne .LBB39_1
+; CHECK-SSE4-O0-NEXT: jmp .LBB39_2
+; CHECK-SSE4-O0-NEXT: .LBB39_2: # %atomicrmw.end
+; CHECK-SSE4-O0-NEXT: popq %rbx
+; CHECK-SSE4-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: store_atomic_vec4_float_align:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: vmovaps %xmm0, (%rdi)
+; CHECK-AVX-O0-NEXT: retq
+ store atomic <4 x float> %v, ptr %x release, align 16
+ ret void
+}
+
define <2 x half> @atomic_vec2_half(ptr %x) {
; CHECK-SSE-O3-LABEL: atomic_vec2_half:
; CHECK-SSE-O3: # %bb.0:
|
|
|
| def : Pat<(v4i32 (atomic_load_128_v4i32 addr:$src)), | ||
| (VMOVAPDZ128rm addr:$src)>, Requires<[HasAVX512]>; | ||
|
|
||
| // store atomic <2 x i64> |
There was a problem hiding this comment.
This shouldn't require duplicating all of these patterns. There surely must be existing store patterns that can be shared?
There was a problem hiding this comment.
Right, I was wondering the same. I'll take a closer look. Thanks.
There was a problem hiding this comment.
I expect you to need a PatFrag common to store + atomic_store, and update the patterns to use that
There was a problem hiding this comment.
Did this one. Thanks.
| // Sub-128-bit FP vectors codegen better when DAG widening folds the value | ||
| // into an extractelt-from-XMM pattern, instead of an IR-level bitcast to a | ||
| // scalar integer (which the type legalizer scalarizes). |
There was a problem hiding this comment.
I think this is thinking too much about this. The goal is to fully remove shouldCastAtomicStoreInIR
There was a problem hiding this comment.
Mm hm, I can't help but utilize it for now. Let me look at this one more closely as well. Thank you.
01c3c98 to
e0dca2b
Compare
6b8f49c to
48ba9b2
Compare
e0dca2b to
e4c9611
Compare
48ba9b2 to
b110a11
Compare
RKSimon
left a comment
There was a problem hiding this comment.
Do you have ALIGNED xmm/ymm/zmm atomic stores? I can't see any in this version of atomic-load-store.ll
e.g. all I can find is store atomic <8 x double> %v, ptr %x release, align 4
|
|
||
| multiclass avx512_store<bits<8> opc, string OpcodeStr, string BaseName, | ||
| X86VectorVTInfo _, PatFrag st_frag, PatFrag mstore, | ||
| X86VectorVTInfo _, PatFrags st_frag, PatFrag mstore, |
There was a problem hiding this comment.
Usually this would be SDPatternOperator
b110a11 to
e65b07f
Compare
e4c9611 to
a839f91
Compare
e65b07f to
4314cab
Compare
4314cab to
07d1319
Compare
9a15cc0 to
9f6ca39
Compare
07d1319 to
1d7267a
Compare
9f6ca39 to
98275c5
Compare
1d7267a to
0d958f9
Compare
RKSimon
left a comment
There was a problem hiding this comment.
Do we have any aligned vector stores wider than 128 bits?
Those will be translated to library calls as per the X86TargetLowering::X86TargetLowering ctor. |
And we do have test coverage to show them doing that? |
Nope; and I believe we are missing them for loads as well. Perhaps they can be separate commits. |
| auto *St = cast<MemSDNode>(N); | ||
| return St->getAlign() >= St->getMemoryVT().getStoreSize(); | ||
| }]> { | ||
| let GISelPredicateCode = [{ |
There was a problem hiding this comment.
These custom alignment predicates should be removed. This predates having a dedicated MinAlignment field on PatFrags. This would also provide DAG + gisel support without maintaining these 2 custom predicates
There was a problem hiding this comment.
Alright, let me go about seeking how to implement this. Without the frag, alignedstore or MinAlignment; for instance, vmovaps is selected for in AVX checks, and it would not be otherwise. Thanks.
4cdbb5c to
ae93fed
Compare
🪟 Windows x64 Test Results
✅ The build succeeded and all tests passed. |
Vector types that aren't widened are split so that a single ATOMIC_STORE is issued for the entire vector at once. This enables SelectionDAG to translate vectors with element type bfloat or half. Store-side counterpart to llvm#165818. Stacked on top of llvm#197619 and below llvm#197861.
ae93fed to
e2d74b1
Compare
Smaller FP vectors (`<N x half>`, `<N x bfloat>`) are left to the DAG widen path on subtargets without native FP16/BF16 support; the v8f16/v8bf16 bitconvert variants added to the Atomic Store Split commit's patterns let the widened path collapse to a single instruction on AVX+ targets.
Store-side counterpart to #148899. Stacked on top of #201980 and below #201566.