[SelectionDAG] Split vector types for atomic load#165818
Conversation
|
@llvm/pr-subscribers-backend-x86 @llvm/pr-subscribers-llvm-selectiondag Author: None (jofrn) Changes: Vector types that aren't widened are split so that a single ATOMIC_LOAD is issued for the entire vector at once. This change utilizes the load vectorization infrastructure in SelectionDAG in order to group the vectors. This enables SelectionDAG to translate vectors with type bfloat,half. Patch is 24.21 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/165818.diff 3 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index ed2c30be7d71d..9028ff4d3401c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -978,6 +978,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
void SplitVecRes_FPOp_MultiType(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_IS_FPCLASS(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_ATOMIC_LOAD(AtomicSDNode *LD, SDValue &Lo, SDValue &Hi);
void SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, SDValue &Hi);
void SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo, SDValue &Hi);
void SplitVecRes_VP_LOAD_FF(VPLoadFFSDNode *LD, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 22f9fd548f52b..e34b9fa8e787c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1226,6 +1226,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
SplitVecRes_STEP_VECTOR(N, Lo, Hi);
break;
case ISD::SIGN_EXTEND_INREG: SplitVecRes_InregOp(N, Lo, Hi); break;
+ case ISD::ATOMIC_LOAD:
+ SplitVecRes_ATOMIC_LOAD(cast<AtomicSDNode>(N), Lo, Hi);
+ break;
case ISD::LOAD:
SplitVecRes_LOAD(cast<LoadSDNode>(N), Lo, Hi);
break;
@@ -2202,6 +2205,40 @@ void DAGTypeLegalizer::SplitVecRes_VP_SPLAT(SDNode *N, SDValue &Lo,
Hi = DAG.getNode(N->getOpcode(), dl, HiVT, N->getOperand(0), MaskHi, EVLHi);
}
+/// Split the vector result of an ATOMIC_LOAD into two halves.
+///
+/// The vector is loaded with a single integer-typed ATOMIC_LOAD that covers
+/// the whole value, so only one atomic access is emitted. The two halves are
+/// then carved out of that integer (TRUNCATE for the low bits, SRL + TRUNCATE
+/// for the high bits) and bitcast to the split vector types.
+///
+/// \param LD The atomic load being legalized; must be a non-extending load.
+/// \param Lo Receives the low-order bits of the loaded integer as LoVT.
+/// \param Hi Receives the remaining high-order bits as HiVT.
+void DAGTypeLegalizer::SplitVecRes_ATOMIC_LOAD(AtomicSDNode *LD, SDValue &Lo,
+                                               SDValue &Hi) {
+  assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
+         "Extended load during type legalization!");
+  SDLoc dl(LD);
+  EVT VT = LD->getValueType(0);
+  EVT LoVT, HiVT;
+  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+
+  SDValue Ch = LD->getChain();
+  SDValue Ptr = LD->getBasePtr();
+
+  // Re-issue the load as one integer of the same width as the vector, reusing
+  // the original memory operand.
+  EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+  EVT MemIntVT =
+      EVT::getIntegerVT(*DAG.getContext(), LD->getMemoryVT().getSizeInBits());
+  SDValue ALD = DAG.getAtomicLoad(ISD::NON_EXTLOAD, dl, MemIntVT, IntVT, Ch,
+                                  Ptr, LD->getMemOperand());
+
+  EVT LoIntVT = EVT::getIntegerVT(*DAG.getContext(), LoVT.getSizeInBits());
+  EVT HiIntVT = EVT::getIntegerVT(*DAG.getContext(), HiVT.getSizeInBits());
+
+  // NOTE(review): Lo is taken from the low-order bits of the integer, which
+  // corresponds to the leading elements only on little-endian targets -
+  // confirm whether a big-endian target can reach this path.
+  SDValue ExtractLo = DAG.getNode(ISD::TRUNCATE, dl, LoIntVT, ALD);
+
+  // Shift by the width of the low part rather than by half of the full width:
+  // the two are equal for an even split, but only the former stays correct if
+  // the split types differ in size. The amount must use the shift-amount type
+  // for IntVT, not the pointer type.
+  SDValue ExtractHi = DAG.getNode(
+      ISD::SRL, dl, IntVT, ALD,
+      DAG.getShiftAmountConstant(LoVT.getFixedSizeInBits(), IntVT, dl));
+  ExtractHi = DAG.getNode(ISD::TRUNCATE, dl, HiIntVT, ExtractHi);
+
+  Lo = DAG.getBitcast(LoVT, ExtractLo);
+  Hi = DAG.getBitcast(HiVT, ExtractHi);
+
+  // Legalize the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(LD, 1), ALD.getValue(1));
+}
+
void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo,
SDValue &Hi) {
assert(ISD::isUNINDEXEDLoad(LD) && "Indexed load during type legalization!");
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 9ea21cae97f32..286799f36e80a 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -565,6 +565,180 @@ define <2 x float> @atomic_vec2_float_align(ptr %x) {
ret <2 x float> %ret
}
+define <2 x half> @atomic_vec2_half(ptr %x) {
+; CHECK-O3-LABEL: atomic_vec2_half:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: movl (%rdi), %eax
+; CHECK-O3-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-O3-NEXT: shrl $16, %eax
+; CHECK-O3-NEXT: pinsrw $0, %eax, %xmm1
+; CHECK-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec2_half:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: movl (%rdi), %eax
+; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O3-NEXT: shrl $16, %eax
+; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm1
+; CHECK-SSE-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec2_half:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-AVX512-O3-LABEL: atomic_vec2_half:
+; CHECK-AVX512-O3: # %bb.0:
+; CHECK-AVX512-O3-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX512-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec2_half:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: movl (%rdi), %eax
+; CHECK-O0-NEXT: movl %eax, %ecx
+; CHECK-O0-NEXT: shrl $16, %ecx
+; CHECK-O0-NEXT: movw %cx, %dx
+; CHECK-O0-NEXT: # implicit-def: $ecx
+; CHECK-O0-NEXT: movw %dx, %cx
+; CHECK-O0-NEXT: # implicit-def: $xmm1
+; CHECK-O0-NEXT: pinsrw $0, %ecx, %xmm1
+; CHECK-O0-NEXT: movw %ax, %cx
+; CHECK-O0-NEXT: # implicit-def: $eax
+; CHECK-O0-NEXT: movw %cx, %ax
+; CHECK-O0-NEXT: # implicit-def: $xmm0
+; CHECK-O0-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec2_half:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: movl (%rdi), %eax
+; CHECK-SSE-O0-NEXT: movl %eax, %ecx
+; CHECK-SSE-O0-NEXT: shrl $16, %ecx
+; CHECK-SSE-O0-NEXT: movw %cx, %dx
+; CHECK-SSE-O0-NEXT: # implicit-def: $ecx
+; CHECK-SSE-O0-NEXT: movw %dx, %cx
+; CHECK-SSE-O0-NEXT: # implicit-def: $xmm1
+; CHECK-SSE-O0-NEXT: pinsrw $0, %ecx, %xmm1
+; CHECK-SSE-O0-NEXT: movw %ax, %cx
+; CHECK-SSE-O0-NEXT: # implicit-def: $eax
+; CHECK-SSE-O0-NEXT: movw %cx, %ax
+; CHECK-SSE-O0-NEXT: # implicit-def: $xmm0
+; CHECK-SSE-O0-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec2_half:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX-O0-NEXT: retq
+;
+; CHECK-AVX512-O0-LABEL: atomic_vec2_half:
+; CHECK-AVX512-O0: # %bb.0:
+; CHECK-AVX512-O0-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX512-O0-NEXT: retq
+ %ret = load atomic <2 x half>, ptr %x acquire, align 4
+ ret <2 x half> %ret
+}
+define <2 x bfloat> @atomic_vec2_bfloat(ptr %x) {
+; CHECK-O3-LABEL: atomic_vec2_bfloat:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: movl (%rdi), %eax
+; CHECK-O3-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-O3-NEXT: shrl $16, %eax
+; CHECK-O3-NEXT: pinsrw $0, %eax, %xmm1
+; CHECK-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec2_bfloat:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: movl (%rdi), %eax
+; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O3-NEXT: shrl $16, %eax
+; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm1
+; CHECK-SSE-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec2_bfloat:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: movl (%rdi), %eax
+; CHECK-AVX-O3-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O3-NEXT: shrl $16, %eax
+; CHECK-AVX-O3-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O3-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-AVX512-O3-LABEL: atomic_vec2_bfloat:
+; CHECK-AVX512-O3: # %bb.0:
+; CHECK-AVX512-O3-NEXT: movl (%rdi), %eax
+; CHECK-AVX512-O3-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O3-NEXT: shrl $16, %eax
+; CHECK-AVX512-O3-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O3-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-AVX512-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec2_bfloat:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: movl (%rdi), %eax
+; CHECK-O0-NEXT: movl %eax, %ecx
+; CHECK-O0-NEXT: shrl $16, %ecx
+; CHECK-O0-NEXT: # kill: def $cx killed $cx killed $ecx
+; CHECK-O0-NEXT: movw %ax, %dx
+; CHECK-O0-NEXT: # implicit-def: $eax
+; CHECK-O0-NEXT: movw %dx, %ax
+; CHECK-O0-NEXT: # implicit-def: $xmm0
+; CHECK-O0-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-O0-NEXT: # implicit-def: $eax
+; CHECK-O0-NEXT: movw %cx, %ax
+; CHECK-O0-NEXT: # implicit-def: $xmm1
+; CHECK-O0-NEXT: pinsrw $0, %eax, %xmm1
+; CHECK-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec2_bfloat:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: movl (%rdi), %eax
+; CHECK-SSE-O0-NEXT: movl %eax, %ecx
+; CHECK-SSE-O0-NEXT: shrl $16, %ecx
+; CHECK-SSE-O0-NEXT: # kill: def $cx killed $cx killed $ecx
+; CHECK-SSE-O0-NEXT: movw %ax, %dx
+; CHECK-SSE-O0-NEXT: # implicit-def: $eax
+; CHECK-SSE-O0-NEXT: movw %dx, %ax
+; CHECK-SSE-O0-NEXT: # implicit-def: $xmm0
+; CHECK-SSE-O0-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O0-NEXT: # implicit-def: $eax
+; CHECK-SSE-O0-NEXT: movw %cx, %ax
+; CHECK-SSE-O0-NEXT: # implicit-def: $xmm1
+; CHECK-SSE-O0-NEXT: pinsrw $0, %eax, %xmm1
+; CHECK-SSE-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec2_bfloat:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: movl (%rdi), %eax
+; CHECK-AVX-O0-NEXT: movw %ax, %cx
+; CHECK-AVX-O0-NEXT: movw %cx, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O0-NEXT: shrl $16, %eax
+; CHECK-AVX-O0-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-AVX-O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O0-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-AVX-O0-NEXT: retq
+;
+; CHECK-AVX512-O0-LABEL: atomic_vec2_bfloat:
+; CHECK-AVX512-O0: # %bb.0:
+; CHECK-AVX512-O0-NEXT: movl (%rdi), %eax
+; CHECK-AVX512-O0-NEXT: movw %ax, %cx
+; CHECK-AVX512-O0-NEXT: movw %cx, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O0-NEXT: shrl $16, %eax
+; CHECK-AVX512-O0-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-AVX512-O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O0-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-AVX512-O0-NEXT: retq
+ %ret = load atomic <2 x bfloat>, ptr %x acquire, align 4
+ ret <2 x bfloat> %ret
+}
define <1 x ptr> @atomic_vec1_ptr(ptr %x) nounwind {
; CHECK-O3-LABEL: atomic_vec1_ptr:
; CHECK-O3: # %bb.0:
@@ -1205,6 +1379,305 @@ define <4 x i16> @atomic_vec4_i16(ptr %x) nounwind {
ret <4 x i16> %ret
}
+define <4 x half> @atomic_vec4_half(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec4_half:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: movq (%rdi), %rax
+; CHECK-O3-NEXT: movl %eax, %ecx
+; CHECK-O3-NEXT: shrl $16, %ecx
+; CHECK-O3-NEXT: pinsrw $0, %ecx, %xmm1
+; CHECK-O3-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-O3-NEXT: movq %rax, %rcx
+; CHECK-O3-NEXT: shrq $32, %rcx
+; CHECK-O3-NEXT: pinsrw $0, %ecx, %xmm2
+; CHECK-O3-NEXT: shrq $48, %rax
+; CHECK-O3-NEXT: pinsrw $0, %eax, %xmm3
+; CHECK-O3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; CHECK-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-O3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec4_half:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: movq (%rdi), %rax
+; CHECK-SSE-O3-NEXT: movl %eax, %ecx
+; CHECK-SSE-O3-NEXT: shrl $16, %ecx
+; CHECK-SSE-O3-NEXT: pinsrw $0, %ecx, %xmm1
+; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O3-NEXT: movq %rax, %rcx
+; CHECK-SSE-O3-NEXT: shrq $32, %rcx
+; CHECK-SSE-O3-NEXT: pinsrw $0, %ecx, %xmm2
+; CHECK-SSE-O3-NEXT: shrq $48, %rax
+; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm3
+; CHECK-SSE-O3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; CHECK-SSE-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-SSE-O3-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec4_half:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: vmovq (%rdi), %xmm0
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-AVX512-O3-LABEL: atomic_vec4_half:
+; CHECK-AVX512-O3: # %bb.0:
+; CHECK-AVX512-O3-NEXT: vmovq (%rdi), %xmm0
+; CHECK-AVX512-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec4_half:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: movq (%rdi), %rax
+; CHECK-O0-NEXT: movl %eax, %ecx
+; CHECK-O0-NEXT: shrl $16, %ecx
+; CHECK-O0-NEXT: movw %cx, %dx
+; CHECK-O0-NEXT: # implicit-def: $ecx
+; CHECK-O0-NEXT: movw %dx, %cx
+; CHECK-O0-NEXT: # implicit-def: $xmm2
+; CHECK-O0-NEXT: pinsrw $0, %ecx, %xmm2
+; CHECK-O0-NEXT: movw %ax, %dx
+; CHECK-O0-NEXT: # implicit-def: $ecx
+; CHECK-O0-NEXT: movw %dx, %cx
+; CHECK-O0-NEXT: # implicit-def: $xmm0
+; CHECK-O0-NEXT: pinsrw $0, %ecx, %xmm0
+; CHECK-O0-NEXT: movq %rax, %rcx
+; CHECK-O0-NEXT: shrq $32, %rcx
+; CHECK-O0-NEXT: movw %cx, %dx
+; CHECK-O0-NEXT: # implicit-def: $ecx
+; CHECK-O0-NEXT: movw %dx, %cx
+; CHECK-O0-NEXT: # implicit-def: $xmm1
+; CHECK-O0-NEXT: pinsrw $0, %ecx, %xmm1
+; CHECK-O0-NEXT: shrq $48, %rax
+; CHECK-O0-NEXT: movw %ax, %cx
+; CHECK-O0-NEXT: # implicit-def: $eax
+; CHECK-O0-NEXT: movw %cx, %ax
+; CHECK-O0-NEXT: # implicit-def: $xmm3
+; CHECK-O0-NEXT: pinsrw $0, %eax, %xmm3
+; CHECK-O0-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; CHECK-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-O0-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec4_half:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: movq (%rdi), %rax
+; CHECK-SSE-O0-NEXT: movl %eax, %ecx
+; CHECK-SSE-O0-NEXT: shrl $16, %ecx
+; CHECK-SSE-O0-NEXT: movw %cx, %dx
+; CHECK-SSE-O0-NEXT: # implicit-def: $ecx
+; CHECK-SSE-O0-NEXT: movw %dx, %cx
+; CHECK-SSE-O0-NEXT: # implicit-def: $xmm2
+; CHECK-SSE-O0-NEXT: pinsrw $0, %ecx, %xmm2
+; CHECK-SSE-O0-NEXT: movw %ax, %dx
+; CHECK-SSE-O0-NEXT: # implicit-def: $ecx
+; CHECK-SSE-O0-NEXT: movw %dx, %cx
+; CHECK-SSE-O0-NEXT: # implicit-def: $xmm0
+; CHECK-SSE-O0-NEXT: pinsrw $0, %ecx, %xmm0
+; CHECK-SSE-O0-NEXT: movq %rax, %rcx
+; CHECK-SSE-O0-NEXT: shrq $32, %rcx
+; CHECK-SSE-O0-NEXT: movw %cx, %dx
+; CHECK-SSE-O0-NEXT: # implicit-def: $ecx
+; CHECK-SSE-O0-NEXT: movw %dx, %cx
+; CHECK-SSE-O0-NEXT: # implicit-def: $xmm1
+; CHECK-SSE-O0-NEXT: pinsrw $0, %ecx, %xmm1
+; CHECK-SSE-O0-NEXT: shrq $48, %rax
+; CHECK-SSE-O0-NEXT: movw %ax, %cx
+; CHECK-SSE-O0-NEXT: # implicit-def: $eax
+; CHECK-SSE-O0-NEXT: movw %cx, %ax
+; CHECK-SSE-O0-NEXT: # implicit-def: $xmm3
+; CHECK-SSE-O0-NEXT: pinsrw $0, %eax, %xmm3
+; CHECK-SSE-O0-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; CHECK-SSE-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-SSE-O0-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec4_half:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: vmovq (%rdi), %xmm0
+; CHECK-AVX-O0-NEXT: retq
+;
+; CHECK-AVX512-O0-LABEL: atomic_vec4_half:
+; CHECK-AVX512-O0: # %bb.0:
+; CHECK-AVX512-O0-NEXT: vmovq (%rdi), %xmm0
+; CHECK-AVX512-O0-NEXT: retq
+ %ret = load atomic <4 x half>, ptr %x acquire, align 8
+ ret <4 x half> %ret
+}
+define <4 x bfloat> @atomic_vec4_bfloat(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec4_bfloat:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: movq (%rdi), %rax
+; CHECK-O3-NEXT: movq %rax, %rcx
+; CHECK-O3-NEXT: movq %rax, %rdx
+; CHECK-O3-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-O3-NEXT: # kill: def $eax killed $eax killed $rax
+; CHECK-O3-NEXT: shrl $16, %eax
+; CHECK-O3-NEXT: shrq $32, %rcx
+; CHECK-O3-NEXT: shrq $48, %rdx
+; CHECK-O3-NEXT: pinsrw $0, %edx, %xmm1
+; CHECK-O3-NEXT: pinsrw $0, %ecx, %xmm2
+; CHECK-O3-NEXT: pinsrw $0, %eax, %xmm3
+; CHECK-O3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; CHECK-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; CHECK-O3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec4_bfloat:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: movq (%rdi), %rax
+; CHECK-SSE-O3-NEXT: movq %rax, %rcx
+; CHECK-SSE-O3-NEXT: movq %rax, %rdx
+; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O3-NEXT: # kill: def $eax killed $eax killed $rax
+; CHECK-SSE-O3-NEXT: shrl $16, %eax
+; CHECK-SSE-O3-NEXT: shrq $32, %rcx
+; CHECK-SSE-O3-NEXT: shrq $48, %rdx
+; CHECK-SSE-O3-NEXT: pinsrw $0, %edx, %xmm1
+; CHECK-SSE-O3-NEXT: pinsrw $0, %ecx, %xmm2
+; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm3
+; CHECK-SSE-O3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; CHECK-SSE-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; CHECK-SSE-O3-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec4_bfloat:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: movq (%rdi), %rax
+; CHECK-AVX-O3-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O3-NEXT: movq %rax, %rcx
+; CHECK-AVX-O3-NEXT: shrq $48, %rcx
+; CHECK-AVX-O3-NEXT: movw %cx, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O3-NEXT: movq %rax, %rcx
+; CHECK-AVX-O3-NEXT: shrq $32, %rcx
+; CHECK-AVX-O3-NEXT: movw %cx, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O3-NEXT: shrl $16, %eax
+; CHECK-AVX-O3-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O3-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-AVX512-O3-LABEL: atomic_vec4_bfloat:
+; CHECK-AVX512-O3: # %bb.0:
+; CHECK-AVX512-O3-NEXT: movq (%rdi), %rax
+; CHECK-AVX512-O3-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O3-NEXT: movq %rax, %rcx
+; CHECK-AVX512-O3-NEXT: shrq $48, %rcx
+; CHECK-AVX512-O3-NEXT: movw %cx, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O3-NEXT: movq %rax, %rcx
+; CHECK-AVX512-O3-NEXT: shrq $32, %rcx
+; CHECK-AVX512-O3-NEXT: movw %cx, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O3-NEXT: shrl $16, %eax
+; CHECK-AVX512-O3-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O3-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-AVX512-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec4_bfloat:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: movq (%rdi), %rax
+; CHECK-O0-NEXT: movl %eax, %ecx
+; CHECK-O0-NEXT: shrl $16, %ecx
+; CHECK-O0-NEXT: # kill: def $cx killed $cx killed $ecx
+; CHECK-O0-NEXT: movw %ax, %dx
+; CHECK-O0-NEXT: movq %rax, %rsi
+; CHECK-O0-NEXT: shrq $32, %rsi
+; CHECK-O0-NEXT: # kill: def $si killed $si killed $rsi
+; CHECK-O0-NEXT: shrq $48, %rax
+; CHECK-O0-NEXT: movw %ax, %di
+; CHECK-O0-NEXT: # implicit-def: $eax
+; CHECK-O0-NEXT: movw %di, %ax
+; CHECK-O0-NEXT: # implicit-def: $xmm0
+; CHECK-O0-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-O0-NEXT: # implicit-def: $eax
+; CHECK-O0-NEXT: movw %si, %ax
+; CHECK-O0-NEXT: # implicit-def: $xmm1
+; CHECK-O0-NEXT: pinsrw $0, %eax, %xmm1
+; CHECK-O0-NEXT: punpcklwd {{.*#+}} x...
[truncated]
|
81c146e to
a0038c0
Compare
226394d to
55ec858
Compare
RKSimon
left a comment
There was a problem hiding this comment.
please regenerate the test checks
a0038c0 to
5c2428c
Compare
| SDValue ExtractHi = | ||
| DAG.getNode(ISD::SRL, dl, IntVT, ALD, | ||
| DAG.getIntPtrConstant(VT.getSizeInBits() / 2, dl)); | ||
| ExtractHi = DAG.getNode(ISD::TRUNCATE, dl, HiIntVT, ExtractHi); |
| assert(LD->getExtensionType() == ISD::NON_EXTLOAD && | ||
| "Extended load during type legalization!"); |
There was a problem hiding this comment.
Wouldn't this just be a matter of passing through the extension type instead of hardcoding NON_EXTLOAD below?
| SDValue ExtractLo = DAG.getNode(ISD::TRUNCATE, dl, LoIntVT, ALD); | ||
| SDValue ExtractHi = | ||
| DAG.getNode(ISD::SRL, dl, IntVT, ALD, | ||
| DAG.getIntPtrConstant(VT.getSizeInBits() / 2, dl)); |
There was a problem hiding this comment.
getIntPtrConstant produces a constant of the wrong type for a shift amount; use getShiftAmountConstant instead.
5c2428c to
9411926
Compare
55ec858 to
b92b6da
Compare
1434bcf to
8466578
Compare
862a75a to
6de523d
Compare
d37ac30 to
acda4fa
Compare
|
I think this is stuck in the broken graphite rebase loop (#151135) |
🐧 Linux x64 Test Results
✅ The build succeeded and all tests passed. |
🪟 Windows x64 Test Results
✅ The build succeeded and all tests passed. |
| // that is the parent of the matched node. | ||
| bit WantsParent = false; | ||
| } | ||
| //===- TargetSelectionDAG.td - Common code for DAG isels ---*- tablegen -*-===// |
There was a problem hiding this comment.
This reformatted the whole file or something?
There was a problem hiding this comment.
Yes, the AI/UI hid it. We don't want this. Let me see how to revert it...
8909241 to
8466578
Compare
|
@jofrn any chance we can get these atomic patches finished soon please? |
Vector types that aren't widened are split so that a single ATOMIC_LOAD is issued for the entire vector at once. This change utilizes the load vectorization infrastructure in SelectionDAG in order to group the vectors. This enables SelectionDAG to translate vectors with type bfloat,half.
8466578 to
72a340b
Compare
Yes, working on it now. : ) |
| (atomic_load node:$ptr)> { | ||
| let IsAtomic = true; | ||
| let MemoryVT = v4i32; | ||
| } |
There was a problem hiding this comment.
are these relevant to THIS patch? I can't see any uses of them
There was a problem hiding this comment.
My concern has been the glacial speed at which this patch stack has been committed - I don't want to see dead code sitting there for months and somebody coming along and removing it (as they should).
There was a problem hiding this comment.
glacial speed; lol; not a concern anymore. Will get the stores going after this also by the way; I'll add you to review those ones as well.
Vector types that aren't widened are split so that a single ATOMIC_LOAD is issued for the entire vector at once. This change utilizes the load vectorization infrastructure in SelectionDAG in order to group the vectors. This enables SelectionDAG to translate vectors with type bfloat,half.
Vector types that aren't widened are split so that a single ATOMIC_LOAD is issued for the entire vector at once. This change utilizes the load vectorization infrastructure in SelectionDAG in order to group the vectors. This enables SelectionDAG to translate vectors with type bfloat,half.
Vector types that aren't widened are split so that a single ATOMIC_STORE is issued for the entire vector at once. This enables SelectionDAG to translate vectors with type bfloat,half. Store-side counterpart to llvm#165818. Stacked on top of llvm#197619 and below llvm#197861.

Vector types that aren't widened are split so that a single ATOMIC_LOAD is issued for the entire vector at once. This change utilizes the load vectorization infrastructure in SelectionDAG in order to group the vectors. This enables SelectionDAG to translate vectors with type bfloat,half.