[SelectionDAG] Split vector types for atomic store#197860
Conversation
|
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-backend-x86 Author: jofrn Changes: Vector types that aren't widened are split so that a single ATOMIC_STORE is issued for the entire vector at once. This enables SelectionDAG to translate vectors with type bfloat and half. Store-side counterpart to #165818. Stacked on top of #197619. Patch is 22.65 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/197860.diff 3 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index a1c0e68049544..450eba435cc0b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -986,6 +986,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
SDValue SplitVecOp_ExtVecInRegOp(SDNode *N);
SDValue SplitVecOp_FAKE_USE(SDNode *N);
SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo);
+ SDValue SplitVecOp_ATOMIC_STORE(AtomicSDNode *N);
SDValue SplitVecOp_VP_STORE(VPStoreSDNode *N, unsigned OpNo);
SDValue SplitVecOp_VP_STRIDED_STORE(VPStridedStoreSDNode *N, unsigned OpNo);
SDValue SplitVecOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 1cc78382f025f..66574e6cea263 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -3767,6 +3767,9 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
case ISD::STORE:
Res = SplitVecOp_STORE(cast<StoreSDNode>(N), OpNo);
break;
+ case ISD::ATOMIC_STORE:
+ Res = SplitVecOp_ATOMIC_STORE(cast<AtomicSDNode>(N));
+ break;
case ISD::VP_STORE:
Res = SplitVecOp_VP_STORE(cast<VPStoreSDNode>(N), OpNo);
break;
@@ -4704,6 +4707,23 @@ SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) {
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
}
+SDValue DAGTypeLegalizer::SplitVecOp_ATOMIC_STORE(AtomicSDNode *N) {
+ SDLoc DL(N);
+ SDValue StVal = N->getVal();
+ EVT VT = StVal.getValueType();
+
+ // Emit one ATOMIC_STORE covering the whole vector rather than one per half.
+ // The value is bitcast to a same-width integer (IntVT); the memory type is
+ // an integer as wide as the node's memory VT (MemIntVT). NOTE(review): the
+ // legality of IntVT is not checked here -- later legalization must handle it.
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+ EVT MemIntVT =
+ EVT::getIntegerVT(*DAG.getContext(), N->getMemoryVT().getSizeInBits());
+ SDValue AsInt = DAG.getBitcast(IntVT, StVal);
+ return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MemIntVT, N->getChain(), AsInt,
+ N->getBasePtr(), N->getMemOperand());
+}
+
SDValue DAGTypeLegalizer::SplitVecOp_CONCAT_VECTORS(SDNode *N) {
SDLoc DL(N);
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 91c4d0a3d8c1c..066842739fb61 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -751,6 +751,456 @@ define <2 x float> @atomic_vec2_float_align(ptr %x) {
ret <2 x float> %ret
}
+define void @store_atomic_vec2_half(ptr %x, <2 x half> %v) {
+; CHECK-SSE-O3-LABEL: store_atomic_vec2_half:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: pextrw $0, %xmm0, %eax
+; CHECK-SSE-O3-NEXT: psrld $16, %xmm0
+; CHECK-SSE-O3-NEXT: pextrw $0, %xmm0, %ecx
+; CHECK-SSE-O3-NEXT: shll $16, %ecx
+; CHECK-SSE-O3-NEXT: movzwl %ax, %eax
+; CHECK-SSE-O3-NEXT: orl %ecx, %eax
+; CHECK-SSE-O3-NEXT: movl %eax, (%rdi)
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: store_atomic_vec2_half:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: vmovd %xmm0, %eax
+; CHECK-AVX-O3-NEXT: movl %eax, (%rdi)
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: store_atomic_vec2_half:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: movaps %xmm0, %xmm1
+; CHECK-SSE-O0-NEXT: movaps %xmm1, %xmm0
+; CHECK-SSE-O0-NEXT: psrld $16, %xmm1
+; CHECK-SSE-O0-NEXT: pextrw $0, %xmm1, %eax
+; CHECK-SSE-O0-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-SSE-O0-NEXT: # implicit-def: $ecx
+; CHECK-SSE-O0-NEXT: movw %ax, %cx
+; CHECK-SSE-O0-NEXT: shll $16, %ecx
+; CHECK-SSE-O0-NEXT: pextrw $0, %xmm0, %eax
+; CHECK-SSE-O0-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-SSE-O0-NEXT: movzwl %ax, %eax
+; CHECK-SSE-O0-NEXT: orl %ecx, %eax
+; CHECK-SSE-O0-NEXT: movl %eax, (%rdi)
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: store_atomic_vec2_half:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: vmovd %xmm0, %eax
+; CHECK-AVX-O0-NEXT: movl %eax, (%rdi)
+; CHECK-AVX-O0-NEXT: retq
+ store atomic <2 x half> %v, ptr %x release, align 4
+ ret void
+}
+
+define void @store_atomic_vec2_bfloat(ptr %x, <2 x bfloat> %v) {
+; CHECK-SSE-O3-LABEL: store_atomic_vec2_bfloat:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: pextrw $0, %xmm0, %eax
+; CHECK-SSE-O3-NEXT: psrld $16, %xmm0
+; CHECK-SSE-O3-NEXT: pextrw $0, %xmm0, %ecx
+; CHECK-SSE-O3-NEXT: shll $16, %ecx
+; CHECK-SSE-O3-NEXT: movzwl %ax, %eax
+; CHECK-SSE-O3-NEXT: orl %ecx, %eax
+; CHECK-SSE-O3-NEXT: movl %eax, (%rdi)
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: store_atomic_vec2_bfloat:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: vmovd %xmm0, %eax
+; CHECK-AVX-O3-NEXT: movl %eax, (%rdi)
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: store_atomic_vec2_bfloat:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: subq $24, %rsp
+; CHECK-SSE-O0-NEXT: .cfi_def_cfa_offset 32
+; CHECK-SSE-O0-NEXT: movaps %xmm0, %xmm1
+; CHECK-SSE-O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-SSE-O0-NEXT: pextrw $1, %xmm1, %eax
+; CHECK-SSE-O0-NEXT: shll $16, %eax
+; CHECK-SSE-O0-NEXT: movd %eax, %xmm0
+; CHECK-SSE-O0-NEXT: movd %xmm1, %eax
+; CHECK-SSE-O0-NEXT: shll $16, %eax
+; CHECK-SSE-O0-NEXT: movd %eax, %xmm1
+; CHECK-SSE-O0-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE-O0-NEXT: callq __truncsfbf2@PLT
+; CHECK-SSE-O0-NEXT: movaps %xmm0, %xmm1
+; CHECK-SSE-O0-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; CHECK-SSE-O0-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-SSE-O0-NEXT: pextrw $0, %xmm1, %eax
+; CHECK-SSE-O0-NEXT: movw %ax, %cx
+; CHECK-SSE-O0-NEXT: # implicit-def: $eax
+; CHECK-SSE-O0-NEXT: movw %cx, %ax
+; CHECK-SSE-O0-NEXT: shll $16, %eax
+; CHECK-SSE-O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE-O0-NEXT: callq __truncsfbf2@PLT
+; CHECK-SSE-O0-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-SSE-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; CHECK-SSE-O0-NEXT: pextrw $0, %xmm0, %eax
+; CHECK-SSE-O0-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-SSE-O0-NEXT: movzwl %ax, %eax
+; CHECK-SSE-O0-NEXT: orl %ecx, %eax
+; CHECK-SSE-O0-NEXT: movl %eax, (%rdi)
+; CHECK-SSE-O0-NEXT: addq $24, %rsp
+; CHECK-SSE-O0-NEXT: .cfi_def_cfa_offset 8
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: store_atomic_vec2_bfloat:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: subq $24, %rsp
+; CHECK-AVX-O0-NEXT: .cfi_def_cfa_offset 32
+; CHECK-AVX-O0-NEXT: vmovaps %xmm0, %xmm1
+; CHECK-AVX-O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-AVX-O0-NEXT: vpextrw $1, %xmm1, %eax
+; CHECK-AVX-O0-NEXT: shll $16, %eax
+; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm0
+; CHECK-AVX-O0-NEXT: vmovd %xmm1, %eax
+; CHECK-AVX-O0-NEXT: shll $16, %eax
+; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm1
+; CHECK-AVX-O0-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-AVX-O0-NEXT: callq __truncsfbf2@PLT
+; CHECK-AVX-O0-NEXT: vmovaps %xmm0, %xmm1
+; CHECK-AVX-O0-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; CHECK-AVX-O0-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX-O0-NEXT: vpextrw $0, %xmm1, %eax
+; CHECK-AVX-O0-NEXT: movw %ax, %cx
+; CHECK-AVX-O0-NEXT: # implicit-def: $eax
+; CHECK-AVX-O0-NEXT: movw %cx, %ax
+; CHECK-AVX-O0-NEXT: shll $16, %eax
+; CHECK-AVX-O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-AVX-O0-NEXT: callq __truncsfbf2@PLT
+; CHECK-AVX-O0-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-AVX-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; CHECK-AVX-O0-NEXT: vpextrw $0, %xmm0, %eax
+; CHECK-AVX-O0-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-AVX-O0-NEXT: movzwl %ax, %eax
+; CHECK-AVX-O0-NEXT: orl %ecx, %eax
+; CHECK-AVX-O0-NEXT: movl %eax, (%rdi)
+; CHECK-AVX-O0-NEXT: addq $24, %rsp
+; CHECK-AVX-O0-NEXT: .cfi_def_cfa_offset 8
+; CHECK-AVX-O0-NEXT: retq
+ store atomic <2 x bfloat> %v, ptr %x release, align 4
+ ret void
+}
+
+define void @store_atomic_vec4_half(ptr %x, <4 x half> %v) nounwind {
+; CHECK-SSE2-O3-LABEL: store_atomic_vec4_half:
+; CHECK-SSE2-O3: # %bb.0:
+; CHECK-SSE2-O3-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-O3-NEXT: psrld $16, %xmm1
+; CHECK-SSE2-O3-NEXT: pextrw $0, %xmm1, %eax
+; CHECK-SSE2-O3-NEXT: pextrw $0, %xmm0, %ecx
+; CHECK-SSE2-O3-NEXT: shll $16, %eax
+; CHECK-SSE2-O3-NEXT: movzwl %cx, %ecx
+; CHECK-SSE2-O3-NEXT: orl %eax, %ecx
+; CHECK-SSE2-O3-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-O3-NEXT: psrlq $48, %xmm1
+; CHECK-SSE2-O3-NEXT: pextrw $0, %xmm1, %eax
+; CHECK-SSE2-O3-NEXT: shll $16, %eax
+; CHECK-SSE2-O3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; CHECK-SSE2-O3-NEXT: pextrw $0, %xmm0, %edx
+; CHECK-SSE2-O3-NEXT: movzwl %dx, %edx
+; CHECK-SSE2-O3-NEXT: orl %eax, %edx
+; CHECK-SSE2-O3-NEXT: shlq $32, %rdx
+; CHECK-SSE2-O3-NEXT: orq %rcx, %rdx
+; CHECK-SSE2-O3-NEXT: movq %rdx, (%rdi)
+; CHECK-SSE2-O3-NEXT: retq
+;
+; CHECK-SSE4-O3-LABEL: store_atomic_vec4_half:
+; CHECK-SSE4-O3: # %bb.0:
+; CHECK-SSE4-O3-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE4-O3-NEXT: psrld $16, %xmm1
+; CHECK-SSE4-O3-NEXT: pextrw $0, %xmm1, %eax
+; CHECK-SSE4-O3-NEXT: shll $16, %eax
+; CHECK-SSE4-O3-NEXT: pextrw $0, %xmm0, %ecx
+; CHECK-SSE4-O3-NEXT: movzwl %cx, %ecx
+; CHECK-SSE4-O3-NEXT: orl %eax, %ecx
+; CHECK-SSE4-O3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE4-O3-NEXT: psrlq $48, %xmm0
+; CHECK-SSE4-O3-NEXT: pextrw $0, %xmm0, %eax
+; CHECK-SSE4-O3-NEXT: shll $16, %eax
+; CHECK-SSE4-O3-NEXT: pextrw $0, %xmm1, %edx
+; CHECK-SSE4-O3-NEXT: movzwl %dx, %edx
+; CHECK-SSE4-O3-NEXT: orl %eax, %edx
+; CHECK-SSE4-O3-NEXT: shlq $32, %rdx
+; CHECK-SSE4-O3-NEXT: orq %rcx, %rdx
+; CHECK-SSE4-O3-NEXT: movq %rdx, (%rdi)
+; CHECK-SSE4-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: store_atomic_vec4_half:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: vmovq %xmm0, %rax
+; CHECK-AVX-O3-NEXT: movq %rax, (%rdi)
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-SSE2-O0-LABEL: store_atomic_vec4_half:
+; CHECK-SSE2-O0: # %bb.0:
+; CHECK-SSE2-O0-NEXT: movaps %xmm0, %xmm3
+; CHECK-SSE2-O0-NEXT: movaps %xmm3, %xmm2
+; CHECK-SSE2-O0-NEXT: movaps %xmm3, %xmm1
+; CHECK-SSE2-O0-NEXT: psrlq $48, %xmm1
+; CHECK-SSE2-O0-NEXT: movaps %xmm3, %xmm0
+; CHECK-SSE2-O0-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; CHECK-SSE2-O0-NEXT: psrld $16, %xmm3
+; CHECK-SSE2-O0-NEXT: pextrw $0, %xmm3, %eax
+; CHECK-SSE2-O0-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-SSE2-O0-NEXT: # implicit-def: $ecx
+; CHECK-SSE2-O0-NEXT: movw %ax, %cx
+; CHECK-SSE2-O0-NEXT: shll $16, %ecx
+; CHECK-SSE2-O0-NEXT: pextrw $0, %xmm2, %eax
+; CHECK-SSE2-O0-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-SSE2-O0-NEXT: movzwl %ax, %eax
+; CHECK-SSE2-O0-NEXT: orl %ecx, %eax
+; CHECK-SSE2-O0-NEXT: # kill: def $rax killed $eax
+; CHECK-SSE2-O0-NEXT: pextrw $0, %xmm1, %ecx
+; CHECK-SSE2-O0-NEXT: movw %cx, %dx
+; CHECK-SSE2-O0-NEXT: # implicit-def: $ecx
+; CHECK-SSE2-O0-NEXT: movw %dx, %cx
+; CHECK-SSE2-O0-NEXT: shll $16, %ecx
+; CHECK-SSE2-O0-NEXT: pextrw $0, %xmm0, %edx
+; CHECK-SSE2-O0-NEXT: # kill: def $dx killed $dx killed $edx
+; CHECK-SSE2-O0-NEXT: movzwl %dx, %edx
+; CHECK-SSE2-O0-NEXT: orl %ecx, %edx
+; CHECK-SSE2-O0-NEXT: # implicit-def: $rcx
+; CHECK-SSE2-O0-NEXT: movl %edx, %ecx
+; CHECK-SSE2-O0-NEXT: shlq $32, %rcx
+; CHECK-SSE2-O0-NEXT: orq %rcx, %rax
+; CHECK-SSE2-O0-NEXT: movq %rax, (%rdi)
+; CHECK-SSE2-O0-NEXT: retq
+;
+; CHECK-SSE4-O0-LABEL: store_atomic_vec4_half:
+; CHECK-SSE4-O0: # %bb.0:
+; CHECK-SSE4-O0-NEXT: movaps %xmm0, %xmm3
+; CHECK-SSE4-O0-NEXT: movaps %xmm3, %xmm2
+; CHECK-SSE4-O0-NEXT: movaps %xmm3, %xmm1
+; CHECK-SSE4-O0-NEXT: psrlq $48, %xmm1
+; CHECK-SSE4-O0-NEXT: movshdup {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; CHECK-SSE4-O0-NEXT: psrld $16, %xmm3
+; CHECK-SSE4-O0-NEXT: pextrw $0, %xmm3, %eax
+; CHECK-SSE4-O0-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-SSE4-O0-NEXT: # implicit-def: $ecx
+; CHECK-SSE4-O0-NEXT: movw %ax, %cx
+; CHECK-SSE4-O0-NEXT: shll $16, %ecx
+; CHECK-SSE4-O0-NEXT: pextrw $0, %xmm2, %eax
+; CHECK-SSE4-O0-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-SSE4-O0-NEXT: movzwl %ax, %eax
+; CHECK-SSE4-O0-NEXT: orl %ecx, %eax
+; CHECK-SSE4-O0-NEXT: # kill: def $rax killed $eax
+; CHECK-SSE4-O0-NEXT: pextrw $0, %xmm1, %ecx
+; CHECK-SSE4-O0-NEXT: movw %cx, %dx
+; CHECK-SSE4-O0-NEXT: # implicit-def: $ecx
+; CHECK-SSE4-O0-NEXT: movw %dx, %cx
+; CHECK-SSE4-O0-NEXT: shll $16, %ecx
+; CHECK-SSE4-O0-NEXT: pextrw $0, %xmm0, %edx
+; CHECK-SSE4-O0-NEXT: # kill: def $dx killed $dx killed $edx
+; CHECK-SSE4-O0-NEXT: movzwl %dx, %edx
+; CHECK-SSE4-O0-NEXT: orl %ecx, %edx
+; CHECK-SSE4-O0-NEXT: # implicit-def: $rcx
+; CHECK-SSE4-O0-NEXT: movl %edx, %ecx
+; CHECK-SSE4-O0-NEXT: shlq $32, %rcx
+; CHECK-SSE4-O0-NEXT: orq %rcx, %rax
+; CHECK-SSE4-O0-NEXT: movq %rax, (%rdi)
+; CHECK-SSE4-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: store_atomic_vec4_half:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: vmovq %xmm0, %rax
+; CHECK-AVX-O0-NEXT: movq %rax, (%rdi)
+; CHECK-AVX-O0-NEXT: retq
+ store atomic <4 x half> %v, ptr %x release, align 8
+ ret void
+}
+
+define void @store_atomic_vec4_bfloat(ptr %x, <4 x bfloat> %v) nounwind {
+; CHECK-SSE2-O3-LABEL: store_atomic_vec4_bfloat:
+; CHECK-SSE2-O3: # %bb.0:
+; CHECK-SSE2-O3-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-O3-NEXT: psrld $16, %xmm1
+; CHECK-SSE2-O3-NEXT: pextrw $0, %xmm1, %eax
+; CHECK-SSE2-O3-NEXT: pextrw $0, %xmm0, %ecx
+; CHECK-SSE2-O3-NEXT: shll $16, %eax
+; CHECK-SSE2-O3-NEXT: movzwl %cx, %ecx
+; CHECK-SSE2-O3-NEXT: orl %eax, %ecx
+; CHECK-SSE2-O3-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-O3-NEXT: psrlq $48, %xmm1
+; CHECK-SSE2-O3-NEXT: pextrw $0, %xmm1, %eax
+; CHECK-SSE2-O3-NEXT: shll $16, %eax
+; CHECK-SSE2-O3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; CHECK-SSE2-O3-NEXT: pextrw $0, %xmm0, %edx
+; CHECK-SSE2-O3-NEXT: movzwl %dx, %edx
+; CHECK-SSE2-O3-NEXT: orl %eax, %edx
+; CHECK-SSE2-O3-NEXT: shlq $32, %rdx
+; CHECK-SSE2-O3-NEXT: orq %rcx, %rdx
+; CHECK-SSE2-O3-NEXT: movq %rdx, (%rdi)
+; CHECK-SSE2-O3-NEXT: retq
+;
+; CHECK-SSE4-O3-LABEL: store_atomic_vec4_bfloat:
+; CHECK-SSE4-O3: # %bb.0:
+; CHECK-SSE4-O3-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE4-O3-NEXT: psrld $16, %xmm1
+; CHECK-SSE4-O3-NEXT: pextrw $0, %xmm1, %eax
+; CHECK-SSE4-O3-NEXT: shll $16, %eax
+; CHECK-SSE4-O3-NEXT: pextrw $0, %xmm0, %ecx
+; CHECK-SSE4-O3-NEXT: movzwl %cx, %ecx
+; CHECK-SSE4-O3-NEXT: orl %eax, %ecx
+; CHECK-SSE4-O3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE4-O3-NEXT: psrlq $48, %xmm0
+; CHECK-SSE4-O3-NEXT: pextrw $0, %xmm0, %eax
+; CHECK-SSE4-O3-NEXT: shll $16, %eax
+; CHECK-SSE4-O3-NEXT: pextrw $0, %xmm1, %edx
+; CHECK-SSE4-O3-NEXT: movzwl %dx, %edx
+; CHECK-SSE4-O3-NEXT: orl %eax, %edx
+; CHECK-SSE4-O3-NEXT: shlq $32, %rdx
+; CHECK-SSE4-O3-NEXT: orq %rcx, %rdx
+; CHECK-SSE4-O3-NEXT: movq %rdx, (%rdi)
+; CHECK-SSE4-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: store_atomic_vec4_bfloat:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: vmovq %xmm0, %rax
+; CHECK-AVX-O3-NEXT: movq %rax, (%rdi)
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: store_atomic_vec4_bfloat:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: subq $40, %rsp
+; CHECK-SSE-O0-NEXT: movaps %xmm0, %xmm1
+; CHECK-SSE-O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-SSE-O0-NEXT: pextrw $3, %xmm1, %eax
+; CHECK-SSE-O0-NEXT: shll $16, %eax
+; CHECK-SSE-O0-NEXT: movd %eax, %xmm0
+; CHECK-SSE-O0-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE-O0-NEXT: pextrw $2, %xmm1, %eax
+; CHECK-SSE-O0-NEXT: shll $16, %eax
+; CHECK-SSE-O0-NEXT: movd %eax, %xmm0
+; CHECK-SSE-O0-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE-O0-NEXT: pextrw $1, %xmm1, %eax
+; CHECK-SSE-O0-NEXT: shll $16, %eax
+; CHECK-SSE-O0-NEXT: movd %eax, %xmm0
+; CHECK-SSE-O0-NEXT: movd %xmm1, %eax
+; CHECK-SSE-O0-NEXT: shll $16, %eax
+; CHECK-SSE-O0-NEXT: movd %eax, %xmm1
+; CHECK-SSE-O0-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE-O0-NEXT: callq __truncsfbf2@PLT
+; CHECK-SSE-O0-NEXT: movaps %xmm0, %xmm1
+; CHECK-SSE-O0-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; CHECK-SSE-O0-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-SSE-O0-NEXT: pextrw $0, %xmm1, %eax
+; CHECK-SSE-O0-NEXT: movw %ax, %cx
+; CHECK-SSE-O0-NEXT: # implicit-def: $eax
+; CHECK-SSE-O0-NEXT: movw %cx, %ax
+; CHECK-SSE-O0-NEXT: shll $16, %eax
+; CHECK-SSE-O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE-O0-NEXT: callq __truncsfbf2@PLT
+; CHECK-SSE-O0-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-SSE-O0-NEXT: movaps %xmm0, %xmm1
+; CHECK-SSE-O0-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; CHECK-SSE-O0-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-SSE-O0-NEXT: pextrw $0, %xmm1, %eax
+; CHECK-SSE-O0-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-SSE-O0-NEXT: movzwl %ax, %eax
+; CHECK-SSE-O0-NEXT: orl %ecx, %eax
+; CHECK-SSE-O0-NEXT: # kill: def $rax killed $eax
+; CHECK-SSE-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-SSE-O0-NEXT: callq __truncsfbf2@PLT
+; CHECK-SSE-O0-NEXT: movaps %xmm0, %xmm1
+; CHECK-SSE-O0-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; CHECK-SSE-O0-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-SSE-O0-NEXT: pextrw $0, %xmm1, %eax
+; CHECK-SSE-O0-NEXT: movw %ax, %cx
+; CHECK-SSE-O0-NEXT: # implicit-def: $eax
+; CHECK-SSE-O0-NEXT: movw %cx, %ax
+; CHECK-SSE-O0-NEXT: shll $16, %eax
+; CHECK-SSE-O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE-O0-NEXT: callq __truncsfbf2@PLT
+; CHECK-SSE-O0-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-SSE-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; CHECK-SSE-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; CHECK-SSE-O0-NEXT: pextrw $0, %xmm0, %edx
+; CHECK-SSE-O0-NEXT: # kill: def $dx killed $dx killed $edx
+; CHECK-SSE-O0-NEXT: movzwl %dx, %edx
+; CHECK-SSE-O0-NEXT: orl %ecx, %edx
+; CHECK-SSE-O0-NEXT: # implicit-def: $rcx
+; CHECK-SSE-O0-NEXT: movl %edx, %ecx
+; CHECK-SSE-O0-NEXT: shlq $32, %rcx
+; CHECK-SSE-O0-NEXT: orq %rcx, %rax
+; CHECK-SSE-O0-NEXT: movq %rax, (%rdi)
+; CHECK-SSE-O0-NEXT: addq $40, %rsp
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: store_atomic_vec4_bfloat:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: subq $40, %rsp
+; CHECK-AVX-O0-NEXT: vmovaps %xmm0, %xmm1
+; CHECK-AVX-O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-AVX-O0-NEXT: vpextrw $3, %xmm1, %eax
+; CHECK-AVX-O0-NEXT: shll $16, %eax
+; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm0
+; CHECK-AVX-O0-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-AVX-O0-NEXT: vpextrw $2, %xmm1, %eax
+; CHECK-AVX-O0-NEXT: shll $16, %eax
+; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm0
+; CHECK-AVX-O0-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-AVX-O0-NEXT: vpextrw $1, %xmm1, %eax
+; CHECK-AVX-O0-NEXT: shll $16, %eax
+; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm0
+; CHECK-AVX-O0-NEXT: vmovd %xmm1, %eax
+; CHECK-AVX-O0-NEXT: shll $16, %eax
+; CHECK-AVX-O0-NEXT: vmov...
[truncated]
|
|
|
| // width. Bitcasting the (illegal) vector value to that integer lets the | ||
| // type legalizer further legalize the BITCAST input as needed, while the | ||
| // ATOMIC_STORE itself uses only the legal integer type. | ||
| EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); |
There was a problem hiding this comment.
Does it apply to types such as <2 x x86_fp80> or <2 x fp128>, which have no corresponding legal integer type?
There was a problem hiding this comment.
It does apply to those. However, the lack of a legal integer type does not matter in that case: AtomicExpand.cpp's atomicSizeSupported check (via TLI->getMaxAtomicSizeInBitsSupported() / 8) causes these types to be lowered to library calls instead, i.e. before the DAG lowers them.
| ; CHECK-SSE-O0-LABEL: store_atomic_vec2_bfloat: | ||
| ; CHECK-SSE-O0: # %bb.0: | ||
| ; CHECK-SSE-O0-NEXT: subq $24, %rsp | ||
| ; CHECK-SSE-O0-NEXT: .cfi_def_cfa_offset 32 |
ac8361d to
a730eaf
Compare
01c3c98 to
e0dca2b
Compare
🐧 Linux x64 Test Results
✅ The build succeeded and all tests passed. |
🪟 Windows x64 Test Results
✅ The build succeeded and all tests passed. |
e0dca2b to
e4c9611
Compare
b20b8d4 to
16bfe6a
Compare
e4c9611 to
a839f91
Compare
16bfe6a to
7b09891
Compare
9a15cc0 to
9f6ca39
Compare
7b09891 to
1613d11
Compare
Vector types with two elements must be widened. This change widens the vector operand of atomic stores in SelectionDAG so that aligned vectors of size greater than one can be translated.
This change adds patterns to optimize out an extra MOV present after widening the atomic store. Covers <2 x i8> (SSE4.1+), <2 x i16>, <4 x i8>, <2 x i32>, <2 x float>, <4 x i16>, <2 x ptr addrspace(270)>.
Vector types that aren't widened are split so that a single ATOMIC_STORE is issued for the entire vector at once. This enables SelectionDAG to translate vectors with type bfloat and half.
9f6ca39 to
98275c5
Compare
1613d11 to
7fb4fcf
Compare
149e8c0 to
27aec96
Compare
This change adds patterns to optimize out an extra MOV present after widening the atomic store. Covers `<2 x i8>` (SSE4.1+), `<2 x i16>`, `<4 x i8>`, `<2 x i32>`, `<2 x float>`, `<4 x i16>`, `<2 x ptr addrspace(270)>`. Store-side counterpart to #148898. Stacked on top of #197618 and below #197860.
Vector types that aren't widened are split so that a single ATOMIC_STORE is issued for the entire vector at once. This enables SelectionDAG to translate vectors with type bfloat and half. Store-side counterpart to llvm#165818. Stacked on top of llvm#197619 and below llvm#197861.
Vector types that aren't widened are split so that a single ATOMIC_STORE
is issued for the entire vector at once. This enables SelectionDAG to
translate vectors with type bfloat and half.
Store-side counterpart to #165818. Stacked on top of #197619 and below #197861.