[SelectionDAG] Split vector types for atomic load by jofrn · Pull Request #165818 · llvm/llvm-project

jofrn · 2025-10-31T02:26:18Z

Vector types that aren't widened are split so that a single ATOMIC_LOAD is issued for the entire vector at once. This change utilizes the load vectorization infrastructure in SelectionDAG in order to group the vectors. This enables SelectionDAG to translate vectors with type bfloat,half.

llvmbot · 2025-10-31T02:26:47Z

@llvm/pr-subscribers-backend-x86

@llvm/pr-subscribers-llvm-selectiondag

Author: None (jofrn)

Changes

Vector types that aren't widened are split so that a single ATOMIC_LOAD is issued for the entire vector at once. This change utilizes the load vectorization infrastructure in SelectionDAG in order to group the vectors. This enables SelectionDAG to translate vectors with type bfloat,half.

Patch is 24.21 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/165818.diff

3 Files Affected:

(modified) llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h (+1)
(modified) llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp (+37)
(modified) llvm/test/CodeGen/X86/atomic-load-store.ll (+473)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index ed2c30be7d71d..9028ff4d3401c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -978,6 +978,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   void SplitVecRes_FPOp_MultiType(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_IS_FPCLASS(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_ATOMIC_LOAD(AtomicSDNode *LD, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_VP_LOAD_FF(VPLoadFFSDNode *LD, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 22f9fd548f52b..e34b9fa8e787c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1226,6 +1226,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
     SplitVecRes_STEP_VECTOR(N, Lo, Hi);
     break;
   case ISD::SIGN_EXTEND_INREG: SplitVecRes_InregOp(N, Lo, Hi); break;
+  case ISD::ATOMIC_LOAD:
+    SplitVecRes_ATOMIC_LOAD(cast<AtomicSDNode>(N), Lo, Hi);
+    break;
   case ISD::LOAD:
     SplitVecRes_LOAD(cast<LoadSDNode>(N), Lo, Hi);
     break;
@@ -2202,6 +2205,40 @@ void DAGTypeLegalizer::SplitVecRes_VP_SPLAT(SDNode *N, SDValue &Lo,
   Hi = DAG.getNode(N->getOpcode(), dl, HiVT, N->getOperand(0), MaskHi, EVLHi);
 }
 
+void DAGTypeLegalizer::SplitVecRes_ATOMIC_LOAD(AtomicSDNode *LD, SDValue &Lo,
+                                               SDValue &Hi) {
+  assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
+         "Extended load during type legalization!");
+  SDLoc dl(LD);
+  EVT VT = LD->getValueType(0);
+  EVT LoVT, HiVT;
+  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+
+  SDValue Ch = LD->getChain();
+  SDValue Ptr = LD->getBasePtr();
+
+  EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+  EVT MemIntVT =
+      EVT::getIntegerVT(*DAG.getContext(), LD->getMemoryVT().getSizeInBits());
+  SDValue ALD = DAG.getAtomicLoad(ISD::NON_EXTLOAD, dl, MemIntVT, IntVT, Ch,
+                                  Ptr, LD->getMemOperand());
+
+  EVT LoIntVT = EVT::getIntegerVT(*DAG.getContext(), LoVT.getSizeInBits());
+  EVT HiIntVT = EVT::getIntegerVT(*DAG.getContext(), HiVT.getSizeInBits());
+  SDValue ExtractLo = DAG.getNode(ISD::TRUNCATE, dl, LoIntVT, ALD);
+  SDValue ExtractHi =
+      DAG.getNode(ISD::SRL, dl, IntVT, ALD,
+                  DAG.getIntPtrConstant(VT.getSizeInBits() / 2, dl));
+  ExtractHi = DAG.getNode(ISD::TRUNCATE, dl, HiIntVT, ExtractHi);
+
+  Lo = DAG.getBitcast(LoVT, ExtractLo);
+  Hi = DAG.getBitcast(HiVT, ExtractHi);
+
+  // Legalize the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(LD, 1), ALD.getValue(1));
+}
+
 void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo,
                                         SDValue &Hi) {
   assert(ISD::isUNINDEXEDLoad(LD) && "Indexed load during type legalization!");
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 9ea21cae97f32..286799f36e80a 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -565,6 +565,180 @@ define <2 x float> @atomic_vec2_float_align(ptr %x) {
   ret <2 x float> %ret
 }
 
+define <2 x half> @atomic_vec2_half(ptr %x) {
+; CHECK-O3-LABEL: atomic_vec2_half:
+; CHECK-O3:       # %bb.0:
+; CHECK-O3-NEXT:    movl (%rdi), %eax
+; CHECK-O3-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-O3-NEXT:    shrl $16, %eax
+; CHECK-O3-NEXT:    pinsrw $0, %eax, %xmm1
+; CHECK-O3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-O3-NEXT:    retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec2_half:
+; CHECK-SSE-O3:       # %bb.0:
+; CHECK-SSE-O3-NEXT:    movl (%rdi), %eax
+; CHECK-SSE-O3-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O3-NEXT:    shrl $16, %eax
+; CHECK-SSE-O3-NEXT:    pinsrw $0, %eax, %xmm1
+; CHECK-SSE-O3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-SSE-O3-NEXT:    retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec2_half:
+; CHECK-AVX-O3:       # %bb.0:
+; CHECK-AVX-O3-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX-O3-NEXT:    retq
+;
+; CHECK-AVX512-O3-LABEL: atomic_vec2_half:
+; CHECK-AVX512-O3:       # %bb.0:
+; CHECK-AVX512-O3-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX512-O3-NEXT:    retq
+;
+; CHECK-O0-LABEL: atomic_vec2_half:
+; CHECK-O0:       # %bb.0:
+; CHECK-O0-NEXT:    movl (%rdi), %eax
+; CHECK-O0-NEXT:    movl %eax, %ecx
+; CHECK-O0-NEXT:    shrl $16, %ecx
+; CHECK-O0-NEXT:    movw %cx, %dx
+; CHECK-O0-NEXT:    # implicit-def: $ecx
+; CHECK-O0-NEXT:    movw %dx, %cx
+; CHECK-O0-NEXT:    # implicit-def: $xmm1
+; CHECK-O0-NEXT:    pinsrw $0, %ecx, %xmm1
+; CHECK-O0-NEXT:    movw %ax, %cx
+; CHECK-O0-NEXT:    # implicit-def: $eax
+; CHECK-O0-NEXT:    movw %cx, %ax
+; CHECK-O0-NEXT:    # implicit-def: $xmm0
+; CHECK-O0-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-O0-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-O0-NEXT:    retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec2_half:
+; CHECK-SSE-O0:       # %bb.0:
+; CHECK-SSE-O0-NEXT:    movl (%rdi), %eax
+; CHECK-SSE-O0-NEXT:    movl %eax, %ecx
+; CHECK-SSE-O0-NEXT:    shrl $16, %ecx
+; CHECK-SSE-O0-NEXT:    movw %cx, %dx
+; CHECK-SSE-O0-NEXT:    # implicit-def: $ecx
+; CHECK-SSE-O0-NEXT:    movw %dx, %cx
+; CHECK-SSE-O0-NEXT:    # implicit-def: $xmm1
+; CHECK-SSE-O0-NEXT:    pinsrw $0, %ecx, %xmm1
+; CHECK-SSE-O0-NEXT:    movw %ax, %cx
+; CHECK-SSE-O0-NEXT:    # implicit-def: $eax
+; CHECK-SSE-O0-NEXT:    movw %cx, %ax
+; CHECK-SSE-O0-NEXT:    # implicit-def: $xmm0
+; CHECK-SSE-O0-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O0-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-SSE-O0-NEXT:    retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec2_half:
+; CHECK-AVX-O0:       # %bb.0:
+; CHECK-AVX-O0-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX-O0-NEXT:    retq
+;
+; CHECK-AVX512-O0-LABEL: atomic_vec2_half:
+; CHECK-AVX512-O0:       # %bb.0:
+; CHECK-AVX512-O0-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX512-O0-NEXT:    retq
+  %ret = load atomic <2 x half>, ptr %x acquire, align 4
+  ret <2 x half> %ret
+}
+define <2 x bfloat> @atomic_vec2_bfloat(ptr %x) {
+; CHECK-O3-LABEL: atomic_vec2_bfloat:
+; CHECK-O3:       # %bb.0:
+; CHECK-O3-NEXT:    movl (%rdi), %eax
+; CHECK-O3-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-O3-NEXT:    shrl $16, %eax
+; CHECK-O3-NEXT:    pinsrw $0, %eax, %xmm1
+; CHECK-O3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-O3-NEXT:    retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec2_bfloat:
+; CHECK-SSE-O3:       # %bb.0:
+; CHECK-SSE-O3-NEXT:    movl (%rdi), %eax
+; CHECK-SSE-O3-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O3-NEXT:    shrl $16, %eax
+; CHECK-SSE-O3-NEXT:    pinsrw $0, %eax, %xmm1
+; CHECK-SSE-O3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-SSE-O3-NEXT:    retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec2_bfloat:
+; CHECK-AVX-O3:       # %bb.0:
+; CHECK-AVX-O3-NEXT:    movl (%rdi), %eax
+; CHECK-AVX-O3-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O3-NEXT:    shrl $16, %eax
+; CHECK-AVX-O3-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O3-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-AVX-O3-NEXT:    retq
+;
+; CHECK-AVX512-O3-LABEL: atomic_vec2_bfloat:
+; CHECK-AVX512-O3:       # %bb.0:
+; CHECK-AVX512-O3-NEXT:    movl (%rdi), %eax
+; CHECK-AVX512-O3-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O3-NEXT:    shrl $16, %eax
+; CHECK-AVX512-O3-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O3-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-AVX512-O3-NEXT:    retq
+;
+; CHECK-O0-LABEL: atomic_vec2_bfloat:
+; CHECK-O0:       # %bb.0:
+; CHECK-O0-NEXT:    movl (%rdi), %eax
+; CHECK-O0-NEXT:    movl %eax, %ecx
+; CHECK-O0-NEXT:    shrl $16, %ecx
+; CHECK-O0-NEXT:    # kill: def $cx killed $cx killed $ecx
+; CHECK-O0-NEXT:    movw %ax, %dx
+; CHECK-O0-NEXT:    # implicit-def: $eax
+; CHECK-O0-NEXT:    movw %dx, %ax
+; CHECK-O0-NEXT:    # implicit-def: $xmm0
+; CHECK-O0-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-O0-NEXT:    # implicit-def: $eax
+; CHECK-O0-NEXT:    movw %cx, %ax
+; CHECK-O0-NEXT:    # implicit-def: $xmm1
+; CHECK-O0-NEXT:    pinsrw $0, %eax, %xmm1
+; CHECK-O0-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-O0-NEXT:    retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec2_bfloat:
+; CHECK-SSE-O0:       # %bb.0:
+; CHECK-SSE-O0-NEXT:    movl (%rdi), %eax
+; CHECK-SSE-O0-NEXT:    movl %eax, %ecx
+; CHECK-SSE-O0-NEXT:    shrl $16, %ecx
+; CHECK-SSE-O0-NEXT:    # kill: def $cx killed $cx killed $ecx
+; CHECK-SSE-O0-NEXT:    movw %ax, %dx
+; CHECK-SSE-O0-NEXT:    # implicit-def: $eax
+; CHECK-SSE-O0-NEXT:    movw %dx, %ax
+; CHECK-SSE-O0-NEXT:    # implicit-def: $xmm0
+; CHECK-SSE-O0-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O0-NEXT:    # implicit-def: $eax
+; CHECK-SSE-O0-NEXT:    movw %cx, %ax
+; CHECK-SSE-O0-NEXT:    # implicit-def: $xmm1
+; CHECK-SSE-O0-NEXT:    pinsrw $0, %eax, %xmm1
+; CHECK-SSE-O0-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-SSE-O0-NEXT:    retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec2_bfloat:
+; CHECK-AVX-O0:       # %bb.0:
+; CHECK-AVX-O0-NEXT:    movl (%rdi), %eax
+; CHECK-AVX-O0-NEXT:    movw %ax, %cx
+; CHECK-AVX-O0-NEXT:    movw %cx, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O0-NEXT:    shrl $16, %eax
+; CHECK-AVX-O0-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-AVX-O0-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O0-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-AVX-O0-NEXT:    retq
+;
+; CHECK-AVX512-O0-LABEL: atomic_vec2_bfloat:
+; CHECK-AVX512-O0:       # %bb.0:
+; CHECK-AVX512-O0-NEXT:    movl (%rdi), %eax
+; CHECK-AVX512-O0-NEXT:    movw %ax, %cx
+; CHECK-AVX512-O0-NEXT:    movw %cx, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O0-NEXT:    shrl $16, %eax
+; CHECK-AVX512-O0-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-AVX512-O0-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O0-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-AVX512-O0-NEXT:    retq
+  %ret = load atomic <2 x bfloat>, ptr %x acquire, align 4
+  ret <2 x bfloat> %ret
+}
 define <1 x ptr> @atomic_vec1_ptr(ptr %x) nounwind {
 ; CHECK-O3-LABEL: atomic_vec1_ptr:
 ; CHECK-O3:       # %bb.0:
@@ -1205,6 +1379,305 @@ define <4 x i16> @atomic_vec4_i16(ptr %x) nounwind {
   ret <4 x i16> %ret
 }
 
+define <4 x half> @atomic_vec4_half(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec4_half:
+; CHECK-O3:       # %bb.0:
+; CHECK-O3-NEXT:    movq (%rdi), %rax
+; CHECK-O3-NEXT:    movl %eax, %ecx
+; CHECK-O3-NEXT:    shrl $16, %ecx
+; CHECK-O3-NEXT:    pinsrw $0, %ecx, %xmm1
+; CHECK-O3-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-O3-NEXT:    movq %rax, %rcx
+; CHECK-O3-NEXT:    shrq $32, %rcx
+; CHECK-O3-NEXT:    pinsrw $0, %ecx, %xmm2
+; CHECK-O3-NEXT:    shrq $48, %rax
+; CHECK-O3-NEXT:    pinsrw $0, %eax, %xmm3
+; CHECK-O3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; CHECK-O3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-O3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-O3-NEXT:    retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec4_half:
+; CHECK-SSE-O3:       # %bb.0:
+; CHECK-SSE-O3-NEXT:    movq (%rdi), %rax
+; CHECK-SSE-O3-NEXT:    movl %eax, %ecx
+; CHECK-SSE-O3-NEXT:    shrl $16, %ecx
+; CHECK-SSE-O3-NEXT:    pinsrw $0, %ecx, %xmm1
+; CHECK-SSE-O3-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O3-NEXT:    movq %rax, %rcx
+; CHECK-SSE-O3-NEXT:    shrq $32, %rcx
+; CHECK-SSE-O3-NEXT:    pinsrw $0, %ecx, %xmm2
+; CHECK-SSE-O3-NEXT:    shrq $48, %rax
+; CHECK-SSE-O3-NEXT:    pinsrw $0, %eax, %xmm3
+; CHECK-SSE-O3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; CHECK-SSE-O3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-SSE-O3-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero
+; CHECK-SSE-O3-NEXT:    retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec4_half:
+; CHECK-AVX-O3:       # %bb.0:
+; CHECK-AVX-O3-NEXT:    vmovq (%rdi), %xmm0
+; CHECK-AVX-O3-NEXT:    retq
+;
+; CHECK-AVX512-O3-LABEL: atomic_vec4_half:
+; CHECK-AVX512-O3:       # %bb.0:
+; CHECK-AVX512-O3-NEXT:    vmovq (%rdi), %xmm0
+; CHECK-AVX512-O3-NEXT:    retq
+;
+; CHECK-O0-LABEL: atomic_vec4_half:
+; CHECK-O0:       # %bb.0:
+; CHECK-O0-NEXT:    movq (%rdi), %rax
+; CHECK-O0-NEXT:    movl %eax, %ecx
+; CHECK-O0-NEXT:    shrl $16, %ecx
+; CHECK-O0-NEXT:    movw %cx, %dx
+; CHECK-O0-NEXT:    # implicit-def: $ecx
+; CHECK-O0-NEXT:    movw %dx, %cx
+; CHECK-O0-NEXT:    # implicit-def: $xmm2
+; CHECK-O0-NEXT:    pinsrw $0, %ecx, %xmm2
+; CHECK-O0-NEXT:    movw %ax, %dx
+; CHECK-O0-NEXT:    # implicit-def: $ecx
+; CHECK-O0-NEXT:    movw %dx, %cx
+; CHECK-O0-NEXT:    # implicit-def: $xmm0
+; CHECK-O0-NEXT:    pinsrw $0, %ecx, %xmm0
+; CHECK-O0-NEXT:    movq %rax, %rcx
+; CHECK-O0-NEXT:    shrq $32, %rcx
+; CHECK-O0-NEXT:    movw %cx, %dx
+; CHECK-O0-NEXT:    # implicit-def: $ecx
+; CHECK-O0-NEXT:    movw %dx, %cx
+; CHECK-O0-NEXT:    # implicit-def: $xmm1
+; CHECK-O0-NEXT:    pinsrw $0, %ecx, %xmm1
+; CHECK-O0-NEXT:    shrq $48, %rax
+; CHECK-O0-NEXT:    movw %ax, %cx
+; CHECK-O0-NEXT:    # implicit-def: $eax
+; CHECK-O0-NEXT:    movw %cx, %ax
+; CHECK-O0-NEXT:    # implicit-def: $xmm3
+; CHECK-O0-NEXT:    pinsrw $0, %eax, %xmm3
+; CHECK-O0-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; CHECK-O0-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-O0-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-O0-NEXT:    retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec4_half:
+; CHECK-SSE-O0:       # %bb.0:
+; CHECK-SSE-O0-NEXT:    movq (%rdi), %rax
+; CHECK-SSE-O0-NEXT:    movl %eax, %ecx
+; CHECK-SSE-O0-NEXT:    shrl $16, %ecx
+; CHECK-SSE-O0-NEXT:    movw %cx, %dx
+; CHECK-SSE-O0-NEXT:    # implicit-def: $ecx
+; CHECK-SSE-O0-NEXT:    movw %dx, %cx
+; CHECK-SSE-O0-NEXT:    # implicit-def: $xmm2
+; CHECK-SSE-O0-NEXT:    pinsrw $0, %ecx, %xmm2
+; CHECK-SSE-O0-NEXT:    movw %ax, %dx
+; CHECK-SSE-O0-NEXT:    # implicit-def: $ecx
+; CHECK-SSE-O0-NEXT:    movw %dx, %cx
+; CHECK-SSE-O0-NEXT:    # implicit-def: $xmm0
+; CHECK-SSE-O0-NEXT:    pinsrw $0, %ecx, %xmm0
+; CHECK-SSE-O0-NEXT:    movq %rax, %rcx
+; CHECK-SSE-O0-NEXT:    shrq $32, %rcx
+; CHECK-SSE-O0-NEXT:    movw %cx, %dx
+; CHECK-SSE-O0-NEXT:    # implicit-def: $ecx
+; CHECK-SSE-O0-NEXT:    movw %dx, %cx
+; CHECK-SSE-O0-NEXT:    # implicit-def: $xmm1
+; CHECK-SSE-O0-NEXT:    pinsrw $0, %ecx, %xmm1
+; CHECK-SSE-O0-NEXT:    shrq $48, %rax
+; CHECK-SSE-O0-NEXT:    movw %ax, %cx
+; CHECK-SSE-O0-NEXT:    # implicit-def: $eax
+; CHECK-SSE-O0-NEXT:    movw %cx, %ax
+; CHECK-SSE-O0-NEXT:    # implicit-def: $xmm3
+; CHECK-SSE-O0-NEXT:    pinsrw $0, %eax, %xmm3
+; CHECK-SSE-O0-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; CHECK-SSE-O0-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-SSE-O0-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
+; CHECK-SSE-O0-NEXT:    retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec4_half:
+; CHECK-AVX-O0:       # %bb.0:
+; CHECK-AVX-O0-NEXT:    vmovq (%rdi), %xmm0
+; CHECK-AVX-O0-NEXT:    retq
+;
+; CHECK-AVX512-O0-LABEL: atomic_vec4_half:
+; CHECK-AVX512-O0:       # %bb.0:
+; CHECK-AVX512-O0-NEXT:    vmovq (%rdi), %xmm0
+; CHECK-AVX512-O0-NEXT:    retq
+  %ret = load atomic <4 x half>, ptr %x acquire, align 8
+  ret <4 x half> %ret
+}
+define <4 x bfloat> @atomic_vec4_bfloat(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec4_bfloat:
+; CHECK-O3:       # %bb.0:
+; CHECK-O3-NEXT:    movq (%rdi), %rax
+; CHECK-O3-NEXT:    movq %rax, %rcx
+; CHECK-O3-NEXT:    movq %rax, %rdx
+; CHECK-O3-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
+; CHECK-O3-NEXT:    shrl $16, %eax
+; CHECK-O3-NEXT:    shrq $32, %rcx
+; CHECK-O3-NEXT:    shrq $48, %rdx
+; CHECK-O3-NEXT:    pinsrw $0, %edx, %xmm1
+; CHECK-O3-NEXT:    pinsrw $0, %ecx, %xmm2
+; CHECK-O3-NEXT:    pinsrw $0, %eax, %xmm3
+; CHECK-O3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; CHECK-O3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; CHECK-O3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-O3-NEXT:    retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec4_bfloat:
+; CHECK-SSE-O3:       # %bb.0:
+; CHECK-SSE-O3-NEXT:    movq (%rdi), %rax
+; CHECK-SSE-O3-NEXT:    movq %rax, %rcx
+; CHECK-SSE-O3-NEXT:    movq %rax, %rdx
+; CHECK-SSE-O3-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O3-NEXT:    # kill: def $eax killed $eax killed $rax
+; CHECK-SSE-O3-NEXT:    shrl $16, %eax
+; CHECK-SSE-O3-NEXT:    shrq $32, %rcx
+; CHECK-SSE-O3-NEXT:    shrq $48, %rdx
+; CHECK-SSE-O3-NEXT:    pinsrw $0, %edx, %xmm1
+; CHECK-SSE-O3-NEXT:    pinsrw $0, %ecx, %xmm2
+; CHECK-SSE-O3-NEXT:    pinsrw $0, %eax, %xmm3
+; CHECK-SSE-O3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; CHECK-SSE-O3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; CHECK-SSE-O3-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero
+; CHECK-SSE-O3-NEXT:    retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec4_bfloat:
+; CHECK-AVX-O3:       # %bb.0:
+; CHECK-AVX-O3-NEXT:    movq (%rdi), %rax
+; CHECK-AVX-O3-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O3-NEXT:    movq %rax, %rcx
+; CHECK-AVX-O3-NEXT:    shrq $48, %rcx
+; CHECK-AVX-O3-NEXT:    movw %cx, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O3-NEXT:    movq %rax, %rcx
+; CHECK-AVX-O3-NEXT:    shrq $32, %rcx
+; CHECK-AVX-O3-NEXT:    movw %cx, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O3-NEXT:    shrl $16, %eax
+; CHECK-AVX-O3-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O3-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-AVX-O3-NEXT:    retq
+;
+; CHECK-AVX512-O3-LABEL: atomic_vec4_bfloat:
+; CHECK-AVX512-O3:       # %bb.0:
+; CHECK-AVX512-O3-NEXT:    movq (%rdi), %rax
+; CHECK-AVX512-O3-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O3-NEXT:    movq %rax, %rcx
+; CHECK-AVX512-O3-NEXT:    shrq $48, %rcx
+; CHECK-AVX512-O3-NEXT:    movw %cx, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O3-NEXT:    movq %rax, %rcx
+; CHECK-AVX512-O3-NEXT:    shrq $32, %rcx
+; CHECK-AVX512-O3-NEXT:    movw %cx, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O3-NEXT:    shrl $16, %eax
+; CHECK-AVX512-O3-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O3-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-AVX512-O3-NEXT:    retq
+;
+; CHECK-O0-LABEL: atomic_vec4_bfloat:
+; CHECK-O0:       # %bb.0:
+; CHECK-O0-NEXT:    movq (%rdi), %rax
+; CHECK-O0-NEXT:    movl %eax, %ecx
+; CHECK-O0-NEXT:    shrl $16, %ecx
+; CHECK-O0-NEXT:    # kill: def $cx killed $cx killed $ecx
+; CHECK-O0-NEXT:    movw %ax, %dx
+; CHECK-O0-NEXT:    movq %rax, %rsi
+; CHECK-O0-NEXT:    shrq $32, %rsi
+; CHECK-O0-NEXT:    # kill: def $si killed $si killed $rsi
+; CHECK-O0-NEXT:    shrq $48, %rax
+; CHECK-O0-NEXT:    movw %ax, %di
+; CHECK-O0-NEXT:    # implicit-def: $eax
+; CHECK-O0-NEXT:    movw %di, %ax
+; CHECK-O0-NEXT:    # implicit-def: $xmm0
+; CHECK-O0-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-O0-NEXT:    # implicit-def: $eax
+; CHECK-O0-NEXT:    movw %si, %ax
+; CHECK-O0-NEXT:    # implicit-def: $xmm1
+; CHECK-O0-NEXT:    pinsrw $0, %eax, %xmm1
+; CHECK-O0-NEXT:    punpcklwd {{.*#+}} x...
[truncated]

jofrn · 2025-10-31T05:13:23Z

RKSimon

please regenerate the test checks

RKSimon · 2025-10-31T16:19:04Z

+  SDValue ExtractHi =
+      DAG.getNode(ISD::SRL, dl, IntVT, ALD,
+                  DAG.getIntPtrConstant(VT.getSizeInBits() / 2, dl));
+  ExtractHi = DAG.getNode(ISD::TRUNCATE, dl, HiIntVT, ExtractHi);


Why not use SplitInteger?

arsenm · 2025-10-31T19:52:55Z

+  assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
+         "Extended load during type legalization!");


Wouldn't this just be a matter of passing through the extension type instead of hardcoding NON_EXTLOAD below?

arsenm · 2025-10-31T19:53:20Z

+  SDValue ExtractLo = DAG.getNode(ISD::TRUNCATE, dl, LoIntVT, ALD);
+  SDValue ExtractHi =
+      DAG.getNode(ISD::SRL, dl, IntVT, ALD,
+                  DAG.getIntPtrConstant(VT.getSizeInBits() / 2, dl));


getIntPtrConstant is the wrong type to use, getShiftAmountConstant

RKSimon

LGTM

jofrn · 2026-01-20T10:02:18Z

Merge activity

Jan 20, 10:02 AM UTC: A user started a stack merge that includes this pull request via Graphite.
Jan 20, 1:10 PM UTC: Graphite couldn't merge this PR because it had merge conflicts.

RKSimon

rebase against trunk?

arsenm · 2026-01-20T13:38:53Z

I think this is stuck in the broken graphite rebase loop (#151135)

github-actions · 2026-01-20T15:36:26Z

🐧 Linux x64 Test Results

195097 tests passed
5196 tests skipped

✅ The build succeeded and all tests passed.

github-actions · 2026-01-20T15:36:26Z

🪟 Windows x64 Test Results

134492 tests passed
3259 tests skipped

✅ The build succeeded and all tests passed.

arsenm · 2026-01-20T16:01:13Z

-  // that is the parent of the matched node.
-  bit WantsParent = false;
-}
+//===- TargetSelectionDAG.td - Common code for DAG isels ---*- tablegen -*-===//


This reformatted the whole file or something?

Yes, the AI/UI hid it. We don't want this. Let me see how to revert it...

RKSimon · 2026-03-25T16:28:25Z

@jofrn any chance we can get these atomic patches finished soon please?

Vector types that aren't widened are split so that a single ATOMIC_LOAD is issued for the entire vector at once. This change utilizes the load vectorization infrastructure in SelectionDAG in order to group the vectors. This enables SelectionDAG to translate vectors with type bfloat,half.

jofrn · 2026-05-09T21:10:17Z

@jofrn any chance we can get these atomic patches finished soon please?

Yes, working on it now. : )

RKSimon · 2026-05-10T09:37:14Z

+          (atomic_load node:$ptr)> {
+  let IsAtomic = true;
+  let MemoryVT = v4i32;
+}


are these relevant to THIS patch? I can't see any uses of them

I think these can be moved to #148899 ?

They can be. Although, having the split intrinsics (e.g. atomic_load_128_v2i64) here makes #165818 strictly target-independent and #148899 strictly target-dependent, of which the latter, target-dependent one, is an optimization of the former.

My concern has been the glacial speed this patch stack has been committed - I don't want to see dead code sitting there for months and somebody coming along and removing it (as they should).

glacial speed; lol; not a concern anymore. Will get the stores going after this also by the way; I'll add you to review those ones as well.

RKSimon

LGTM - assuming #148899 will get landed shortly afterward

jofrn · 2026-05-11T08:25:15Z

Thanks! Working on #148899 now ; just waiting for the retesting, @RKSimon.

Vector types that aren't widened are split so that a single ATOMIC_LOAD is issued for the entire vector at once. This change utilizes the load vectorization infrastructure in SelectionDAG in order to group the vectors. This enables SelectionDAG to translate vectors with type bfloat,half.

Vector types that aren't widened are split so that a single ATOMIC_STORE is issued for the entire vector at once. This enables SelectionDAG to translate vectors with type bfloat,half. Store-side counterpart to #165818. Stacked on top of #197619; and below of #197861.

Vector types that aren't widened are split so that a single ATOMIC_STORE is issued for the entire vector at once. This enables SelectionDAG to translate vectors with type bfloat,half. Store-side counterpart to llvm#165818. Stacked on top of llvm#197619; and below of llvm#197861.

llvmbot added backend:X86 llvm:SelectionDAG SelectionDAGISel as well labels Oct 31, 2025

jofrn force-pushed the users/jofrn/gt/07-15-_selectiondag_split_vector_types_for_atomic_load branch from 81c146e to a0038c0 Compare October 31, 2025 05:12

jofrn force-pushed the users/jofrn/gt/07-15-_x86_remove_extra_mov_after_widening_atomic_load branch from 226394d to 55ec858 Compare October 31, 2025 05:12

RKSimon requested changes Oct 31, 2025

View reviewed changes

jofrn force-pushed the users/jofrn/gt/07-15-_selectiondag_split_vector_types_for_atomic_load branch from a0038c0 to 5c2428c Compare October 31, 2025 16:05

RKSimon reviewed Oct 31, 2025

View reviewed changes

arsenm reviewed Oct 31, 2025

View reviewed changes

jofrn force-pushed the users/jofrn/gt/07-15-_selectiondag_split_vector_types_for_atomic_load branch from 5c2428c to 9411926 Compare November 6, 2025 16:24

jofrn force-pushed the users/jofrn/gt/07-15-_x86_remove_extra_mov_after_widening_atomic_load branch from 55ec858 to b92b6da Compare November 6, 2025 17:45

jofrn force-pushed the users/jofrn/gt/07-15-_selectiondag_split_vector_types_for_atomic_load branch 2 times, most recently from 1434bcf to 8466578 Compare November 6, 2025 18:19

arsenm approved these changes Nov 7, 2025

View reviewed changes

RKSimon approved these changes Nov 19, 2025

View reviewed changes

jofrn force-pushed the users/jofrn/gt/07-15-_x86_remove_extra_mov_after_widening_atomic_load branch 5 times, most recently from 862a75a to 6de523d Compare January 20, 2026 11:04

jofrn force-pushed the users/jofrn/gt/07-15-_x86_remove_extra_mov_after_widening_atomic_load branch 5 times, most recently from d37ac30 to acda4fa Compare January 20, 2026 12:28

Base automatically changed from users/jofrn/gt/07-15-_x86_remove_extra_mov_after_widening_atomic_load to main January 20, 2026 13:05

RKSimon requested changes Jan 20, 2026

View reviewed changes

arsenm reviewed Jan 20, 2026

View reviewed changes

jofrn force-pushed the users/jofrn/gt/07-15-_selectiondag_split_vector_types_for_atomic_load branch from 8909241 to 8466578 Compare January 28, 2026 15:50

jofrn force-pushed the users/jofrn/gt/07-15-_selectiondag_split_vector_types_for_atomic_load branch from 8466578 to 72a340b Compare May 9, 2026 21:09

jofrn requested a review from RKSimon May 9, 2026 22:37

RKSimon reviewed May 10, 2026

View reviewed changes

jofrn requested a review from RKSimon May 10, 2026 23:51

RKSimon approved these changes May 11, 2026

View reviewed changes

jofrn merged commit b71b576 into main May 11, 2026
10 checks passed

jofrn deleted the users/jofrn/gt/07-15-_selectiondag_split_vector_types_for_atomic_load branch May 11, 2026 08:18

jofrn mentioned this pull request May 15, 2026

[SelectionDAG] Split vector types for atomic store #197860

Merged

This was referenced May 23, 2026

[PowerPC][AIX] Support #pragma comment copyright for AIX tonykuttai/llvm-project#6

Closed

[Clang][Modules] Fix -Wunused-variable (#196577) tonykuttai/llvm-project#7

Closed

		assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
		"Extended load during type legalization!");

Conversation

jofrn commented Oct 31, 2025

Uh oh!

llvmbot commented Oct 31, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

jofrn commented Oct 31, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

RKSimon left a comment

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

RKSimon left a comment

Choose a reason for hiding this comment

Uh oh!

jofrn commented Jan 20, 2026 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Merge activity

Uh oh!

RKSimon left a comment

Choose a reason for hiding this comment

Uh oh!

arsenm commented Jan 20, 2026

Uh oh!

github-actions Bot commented Jan 20, 2026 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

🐧 Linux x64 Test Results

Uh oh!

github-actions Bot commented Jan 20, 2026 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

🪟 Windows x64 Test Results

Uh oh!

Choose a reason for hiding this comment

Uh oh!

jofrn Jan 20, 2026 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

RKSimon commented Mar 25, 2026

Uh oh!

jofrn commented May 9, 2026

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

jofrn May 10, 2026 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

RKSimon left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

jofrn commented May 11, 2026

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

4 participants

llvmbot commented Oct 31, 2025 •

edited

Loading

jofrn commented Oct 31, 2025 •

edited

Loading

jofrn commented Jan 20, 2026 •

edited

Loading

github-actions Bot commented Jan 20, 2026 •

edited

Loading

github-actions Bot commented Jan 20, 2026 •

edited

Loading

jofrn Jan 20, 2026 •

edited

Loading

jofrn May 10, 2026 •

edited

Loading