[SelectionDAG] Split vector types for atomic store by jofrn · Pull Request #197860 · llvm/llvm-project

jofrn · 2026-05-15T03:47:40Z

Vector types that aren't widened are split so that a single ATOMIC_STORE
is issued for the entire vector at once. This enables SelectionDAG to
translate vectors with type bfloat,half.

Store-side counterpart to #165818. Stacked on top of #197619; and below of #197861.

llvmorg-github-actions · 2026-05-15T03:48:14Z

@llvm/pr-subscribers-llvm-selectiondag

@llvm/pr-subscribers-backend-x86

Author: jofrn

Changes

Vector types that aren't widened are split so that a single ATOMIC_STORE
is issued for the entire vector at once. This enables SelectionDAG to
translate vectors with type bfloat,half.

Store-side counterpart to #165818. Stacked on top of #197619.

Patch is 22.65 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/197860.diff

3 Files Affected:

(modified) llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h (+1)
(modified) llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp (+20)
(modified) llvm/test/CodeGen/X86/atomic-load-store.ll (+450)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index a1c0e68049544..450eba435cc0b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -986,6 +986,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue SplitVecOp_ExtVecInRegOp(SDNode *N);
   SDValue SplitVecOp_FAKE_USE(SDNode *N);
   SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo);
+  SDValue SplitVecOp_ATOMIC_STORE(AtomicSDNode *N);
   SDValue SplitVecOp_VP_STORE(VPStoreSDNode *N, unsigned OpNo);
   SDValue SplitVecOp_VP_STRIDED_STORE(VPStridedStoreSDNode *N, unsigned OpNo);
   SDValue SplitVecOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 1cc78382f025f..66574e6cea263 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -3767,6 +3767,9 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
   case ISD::STORE:
     Res = SplitVecOp_STORE(cast<StoreSDNode>(N), OpNo);
     break;
+  case ISD::ATOMIC_STORE:
+    Res = SplitVecOp_ATOMIC_STORE(cast<AtomicSDNode>(N));
+    break;
   case ISD::VP_STORE:
     Res = SplitVecOp_VP_STORE(cast<VPStoreSDNode>(N), OpNo);
     break;
@@ -4704,6 +4707,23 @@ SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) {
   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
 }
 
+SDValue DAGTypeLegalizer::SplitVecOp_ATOMIC_STORE(AtomicSDNode *N) {
+  SDLoc DL(N);
+  SDValue StVal = N->getVal();
+  EVT VT = StVal.getValueType();
+
+  // Issue a single atomic store of an integer that spans the full memory
+  // width. Bitcasting the (illegal) vector value to that integer lets the
+  // type legalizer further legalize the BITCAST input as needed, while the
+  // ATOMIC_STORE itself uses only the legal integer type.
+  EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+  EVT MemIntVT =
+      EVT::getIntegerVT(*DAG.getContext(), N->getMemoryVT().getSizeInBits());
+  SDValue AsInt = DAG.getBitcast(IntVT, StVal);
+  return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MemIntVT, N->getChain(), AsInt,
+                       N->getBasePtr(), N->getMemOperand());
+}
+
 SDValue DAGTypeLegalizer::SplitVecOp_CONCAT_VECTORS(SDNode *N) {
   SDLoc DL(N);
 
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 91c4d0a3d8c1c..066842739fb61 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -751,6 +751,456 @@ define <2 x float> @atomic_vec2_float_align(ptr %x) {
   ret <2 x float> %ret
 }
 
+define void @store_atomic_vec2_half(ptr %x, <2 x half> %v) {
+; CHECK-SSE-O3-LABEL: store_atomic_vec2_half:
+; CHECK-SSE-O3:       # %bb.0:
+; CHECK-SSE-O3-NEXT:    pextrw $0, %xmm0, %eax
+; CHECK-SSE-O3-NEXT:    psrld $16, %xmm0
+; CHECK-SSE-O3-NEXT:    pextrw $0, %xmm0, %ecx
+; CHECK-SSE-O3-NEXT:    shll $16, %ecx
+; CHECK-SSE-O3-NEXT:    movzwl %ax, %eax
+; CHECK-SSE-O3-NEXT:    orl %ecx, %eax
+; CHECK-SSE-O3-NEXT:    movl %eax, (%rdi)
+; CHECK-SSE-O3-NEXT:    retq
+;
+; CHECK-AVX-O3-LABEL: store_atomic_vec2_half:
+; CHECK-AVX-O3:       # %bb.0:
+; CHECK-AVX-O3-NEXT:    vmovd %xmm0, %eax
+; CHECK-AVX-O3-NEXT:    movl %eax, (%rdi)
+; CHECK-AVX-O3-NEXT:    retq
+;
+; CHECK-SSE-O0-LABEL: store_atomic_vec2_half:
+; CHECK-SSE-O0:       # %bb.0:
+; CHECK-SSE-O0-NEXT:    movaps %xmm0, %xmm1
+; CHECK-SSE-O0-NEXT:    movaps %xmm1, %xmm0
+; CHECK-SSE-O0-NEXT:    psrld $16, %xmm1
+; CHECK-SSE-O0-NEXT:    pextrw $0, %xmm1, %eax
+; CHECK-SSE-O0-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-SSE-O0-NEXT:    # implicit-def: $ecx
+; CHECK-SSE-O0-NEXT:    movw %ax, %cx
+; CHECK-SSE-O0-NEXT:    shll $16, %ecx
+; CHECK-SSE-O0-NEXT:    pextrw $0, %xmm0, %eax
+; CHECK-SSE-O0-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-SSE-O0-NEXT:    movzwl %ax, %eax
+; CHECK-SSE-O0-NEXT:    orl %ecx, %eax
+; CHECK-SSE-O0-NEXT:    movl %eax, (%rdi)
+; CHECK-SSE-O0-NEXT:    retq
+;
+; CHECK-AVX-O0-LABEL: store_atomic_vec2_half:
+; CHECK-AVX-O0:       # %bb.0:
+; CHECK-AVX-O0-NEXT:    vmovd %xmm0, %eax
+; CHECK-AVX-O0-NEXT:    movl %eax, (%rdi)
+; CHECK-AVX-O0-NEXT:    retq
+  store atomic <2 x half> %v, ptr %x release, align 4
+  ret void
+}
+
+define void @store_atomic_vec2_bfloat(ptr %x, <2 x bfloat> %v) {
+; CHECK-SSE-O3-LABEL: store_atomic_vec2_bfloat:
+; CHECK-SSE-O3:       # %bb.0:
+; CHECK-SSE-O3-NEXT:    pextrw $0, %xmm0, %eax
+; CHECK-SSE-O3-NEXT:    psrld $16, %xmm0
+; CHECK-SSE-O3-NEXT:    pextrw $0, %xmm0, %ecx
+; CHECK-SSE-O3-NEXT:    shll $16, %ecx
+; CHECK-SSE-O3-NEXT:    movzwl %ax, %eax
+; CHECK-SSE-O3-NEXT:    orl %ecx, %eax
+; CHECK-SSE-O3-NEXT:    movl %eax, (%rdi)
+; CHECK-SSE-O3-NEXT:    retq
+;
+; CHECK-AVX-O3-LABEL: store_atomic_vec2_bfloat:
+; CHECK-AVX-O3:       # %bb.0:
+; CHECK-AVX-O3-NEXT:    vmovd %xmm0, %eax
+; CHECK-AVX-O3-NEXT:    movl %eax, (%rdi)
+; CHECK-AVX-O3-NEXT:    retq
+;
+; CHECK-SSE-O0-LABEL: store_atomic_vec2_bfloat:
+; CHECK-SSE-O0:       # %bb.0:
+; CHECK-SSE-O0-NEXT:    subq $24, %rsp
+; CHECK-SSE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SSE-O0-NEXT:    movaps %xmm0, %xmm1
+; CHECK-SSE-O0-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-SSE-O0-NEXT:    pextrw $1, %xmm1, %eax
+; CHECK-SSE-O0-NEXT:    shll $16, %eax
+; CHECK-SSE-O0-NEXT:    movd %eax, %xmm0
+; CHECK-SSE-O0-NEXT:    movd %xmm1, %eax
+; CHECK-SSE-O0-NEXT:    shll $16, %eax
+; CHECK-SSE-O0-NEXT:    movd %eax, %xmm1
+; CHECK-SSE-O0-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE-O0-NEXT:    callq __truncsfbf2@PLT
+; CHECK-SSE-O0-NEXT:    movaps %xmm0, %xmm1
+; CHECK-SSE-O0-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; CHECK-SSE-O0-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; CHECK-SSE-O0-NEXT:    pextrw $0, %xmm1, %eax
+; CHECK-SSE-O0-NEXT:    movw %ax, %cx
+; CHECK-SSE-O0-NEXT:    # implicit-def: $eax
+; CHECK-SSE-O0-NEXT:    movw %cx, %ax
+; CHECK-SSE-O0-NEXT:    shll $16, %eax
+; CHECK-SSE-O0-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE-O0-NEXT:    callq __truncsfbf2@PLT
+; CHECK-SSE-O0-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-SSE-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; CHECK-SSE-O0-NEXT:    pextrw $0, %xmm0, %eax
+; CHECK-SSE-O0-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-SSE-O0-NEXT:    movzwl %ax, %eax
+; CHECK-SSE-O0-NEXT:    orl %ecx, %eax
+; CHECK-SSE-O0-NEXT:    movl %eax, (%rdi)
+; CHECK-SSE-O0-NEXT:    addq $24, %rsp
+; CHECK-SSE-O0-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-SSE-O0-NEXT:    retq
+;
+; CHECK-AVX-O0-LABEL: store_atomic_vec2_bfloat:
+; CHECK-AVX-O0:       # %bb.0:
+; CHECK-AVX-O0-NEXT:    subq $24, %rsp
+; CHECK-AVX-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-AVX-O0-NEXT:    vmovaps %xmm0, %xmm1
+; CHECK-AVX-O0-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-AVX-O0-NEXT:    vpextrw $1, %xmm1, %eax
+; CHECK-AVX-O0-NEXT:    shll $16, %eax
+; CHECK-AVX-O0-NEXT:    vmovd %eax, %xmm0
+; CHECK-AVX-O0-NEXT:    vmovd %xmm1, %eax
+; CHECK-AVX-O0-NEXT:    shll $16, %eax
+; CHECK-AVX-O0-NEXT:    vmovd %eax, %xmm1
+; CHECK-AVX-O0-NEXT:    vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-AVX-O0-NEXT:    callq __truncsfbf2@PLT
+; CHECK-AVX-O0-NEXT:    vmovaps %xmm0, %xmm1
+; CHECK-AVX-O0-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; CHECK-AVX-O0-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX-O0-NEXT:    vpextrw $0, %xmm1, %eax
+; CHECK-AVX-O0-NEXT:    movw %ax, %cx
+; CHECK-AVX-O0-NEXT:    # implicit-def: $eax
+; CHECK-AVX-O0-NEXT:    movw %cx, %ax
+; CHECK-AVX-O0-NEXT:    shll $16, %eax
+; CHECK-AVX-O0-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-AVX-O0-NEXT:    callq __truncsfbf2@PLT
+; CHECK-AVX-O0-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-AVX-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; CHECK-AVX-O0-NEXT:    vpextrw $0, %xmm0, %eax
+; CHECK-AVX-O0-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-AVX-O0-NEXT:    movzwl %ax, %eax
+; CHECK-AVX-O0-NEXT:    orl %ecx, %eax
+; CHECK-AVX-O0-NEXT:    movl %eax, (%rdi)
+; CHECK-AVX-O0-NEXT:    addq $24, %rsp
+; CHECK-AVX-O0-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-AVX-O0-NEXT:    retq
+  store atomic <2 x bfloat> %v, ptr %x release, align 4
+  ret void
+}
+
+define void @store_atomic_vec4_half(ptr %x, <4 x half> %v) nounwind {
+; CHECK-SSE2-O3-LABEL: store_atomic_vec4_half:
+; CHECK-SSE2-O3:       # %bb.0:
+; CHECK-SSE2-O3-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-SSE2-O3-NEXT:    psrld $16, %xmm1
+; CHECK-SSE2-O3-NEXT:    pextrw $0, %xmm1, %eax
+; CHECK-SSE2-O3-NEXT:    pextrw $0, %xmm0, %ecx
+; CHECK-SSE2-O3-NEXT:    shll $16, %eax
+; CHECK-SSE2-O3-NEXT:    movzwl %cx, %ecx
+; CHECK-SSE2-O3-NEXT:    orl %eax, %ecx
+; CHECK-SSE2-O3-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-SSE2-O3-NEXT:    psrlq $48, %xmm1
+; CHECK-SSE2-O3-NEXT:    pextrw $0, %xmm1, %eax
+; CHECK-SSE2-O3-NEXT:    shll $16, %eax
+; CHECK-SSE2-O3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; CHECK-SSE2-O3-NEXT:    pextrw $0, %xmm0, %edx
+; CHECK-SSE2-O3-NEXT:    movzwl %dx, %edx
+; CHECK-SSE2-O3-NEXT:    orl %eax, %edx
+; CHECK-SSE2-O3-NEXT:    shlq $32, %rdx
+; CHECK-SSE2-O3-NEXT:    orq %rcx, %rdx
+; CHECK-SSE2-O3-NEXT:    movq %rdx, (%rdi)
+; CHECK-SSE2-O3-NEXT:    retq
+;
+; CHECK-SSE4-O3-LABEL: store_atomic_vec4_half:
+; CHECK-SSE4-O3:       # %bb.0:
+; CHECK-SSE4-O3-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-SSE4-O3-NEXT:    psrld $16, %xmm1
+; CHECK-SSE4-O3-NEXT:    pextrw $0, %xmm1, %eax
+; CHECK-SSE4-O3-NEXT:    shll $16, %eax
+; CHECK-SSE4-O3-NEXT:    pextrw $0, %xmm0, %ecx
+; CHECK-SSE4-O3-NEXT:    movzwl %cx, %ecx
+; CHECK-SSE4-O3-NEXT:    orl %eax, %ecx
+; CHECK-SSE4-O3-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE4-O3-NEXT:    psrlq $48, %xmm0
+; CHECK-SSE4-O3-NEXT:    pextrw $0, %xmm0, %eax
+; CHECK-SSE4-O3-NEXT:    shll $16, %eax
+; CHECK-SSE4-O3-NEXT:    pextrw $0, %xmm1, %edx
+; CHECK-SSE4-O3-NEXT:    movzwl %dx, %edx
+; CHECK-SSE4-O3-NEXT:    orl %eax, %edx
+; CHECK-SSE4-O3-NEXT:    shlq $32, %rdx
+; CHECK-SSE4-O3-NEXT:    orq %rcx, %rdx
+; CHECK-SSE4-O3-NEXT:    movq %rdx, (%rdi)
+; CHECK-SSE4-O3-NEXT:    retq
+;
+; CHECK-AVX-O3-LABEL: store_atomic_vec4_half:
+; CHECK-AVX-O3:       # %bb.0:
+; CHECK-AVX-O3-NEXT:    vmovq %xmm0, %rax
+; CHECK-AVX-O3-NEXT:    movq %rax, (%rdi)
+; CHECK-AVX-O3-NEXT:    retq
+;
+; CHECK-SSE2-O0-LABEL: store_atomic_vec4_half:
+; CHECK-SSE2-O0:       # %bb.0:
+; CHECK-SSE2-O0-NEXT:    movaps %xmm0, %xmm3
+; CHECK-SSE2-O0-NEXT:    movaps %xmm3, %xmm2
+; CHECK-SSE2-O0-NEXT:    movaps %xmm3, %xmm1
+; CHECK-SSE2-O0-NEXT:    psrlq $48, %xmm1
+; CHECK-SSE2-O0-NEXT:    movaps %xmm3, %xmm0
+; CHECK-SSE2-O0-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; CHECK-SSE2-O0-NEXT:    psrld $16, %xmm3
+; CHECK-SSE2-O0-NEXT:    pextrw $0, %xmm3, %eax
+; CHECK-SSE2-O0-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-SSE2-O0-NEXT:    # implicit-def: $ecx
+; CHECK-SSE2-O0-NEXT:    movw %ax, %cx
+; CHECK-SSE2-O0-NEXT:    shll $16, %ecx
+; CHECK-SSE2-O0-NEXT:    pextrw $0, %xmm2, %eax
+; CHECK-SSE2-O0-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-SSE2-O0-NEXT:    movzwl %ax, %eax
+; CHECK-SSE2-O0-NEXT:    orl %ecx, %eax
+; CHECK-SSE2-O0-NEXT:    # kill: def $rax killed $eax
+; CHECK-SSE2-O0-NEXT:    pextrw $0, %xmm1, %ecx
+; CHECK-SSE2-O0-NEXT:    movw %cx, %dx
+; CHECK-SSE2-O0-NEXT:    # implicit-def: $ecx
+; CHECK-SSE2-O0-NEXT:    movw %dx, %cx
+; CHECK-SSE2-O0-NEXT:    shll $16, %ecx
+; CHECK-SSE2-O0-NEXT:    pextrw $0, %xmm0, %edx
+; CHECK-SSE2-O0-NEXT:    # kill: def $dx killed $dx killed $edx
+; CHECK-SSE2-O0-NEXT:    movzwl %dx, %edx
+; CHECK-SSE2-O0-NEXT:    orl %ecx, %edx
+; CHECK-SSE2-O0-NEXT:    # implicit-def: $rcx
+; CHECK-SSE2-O0-NEXT:    movl %edx, %ecx
+; CHECK-SSE2-O0-NEXT:    shlq $32, %rcx
+; CHECK-SSE2-O0-NEXT:    orq %rcx, %rax
+; CHECK-SSE2-O0-NEXT:    movq %rax, (%rdi)
+; CHECK-SSE2-O0-NEXT:    retq
+;
+; CHECK-SSE4-O0-LABEL: store_atomic_vec4_half:
+; CHECK-SSE4-O0:       # %bb.0:
+; CHECK-SSE4-O0-NEXT:    movaps %xmm0, %xmm3
+; CHECK-SSE4-O0-NEXT:    movaps %xmm3, %xmm2
+; CHECK-SSE4-O0-NEXT:    movaps %xmm3, %xmm1
+; CHECK-SSE4-O0-NEXT:    psrlq $48, %xmm1
+; CHECK-SSE4-O0-NEXT:    movshdup {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; CHECK-SSE4-O0-NEXT:    psrld $16, %xmm3
+; CHECK-SSE4-O0-NEXT:    pextrw $0, %xmm3, %eax
+; CHECK-SSE4-O0-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-SSE4-O0-NEXT:    # implicit-def: $ecx
+; CHECK-SSE4-O0-NEXT:    movw %ax, %cx
+; CHECK-SSE4-O0-NEXT:    shll $16, %ecx
+; CHECK-SSE4-O0-NEXT:    pextrw $0, %xmm2, %eax
+; CHECK-SSE4-O0-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-SSE4-O0-NEXT:    movzwl %ax, %eax
+; CHECK-SSE4-O0-NEXT:    orl %ecx, %eax
+; CHECK-SSE4-O0-NEXT:    # kill: def $rax killed $eax
+; CHECK-SSE4-O0-NEXT:    pextrw $0, %xmm1, %ecx
+; CHECK-SSE4-O0-NEXT:    movw %cx, %dx
+; CHECK-SSE4-O0-NEXT:    # implicit-def: $ecx
+; CHECK-SSE4-O0-NEXT:    movw %dx, %cx
+; CHECK-SSE4-O0-NEXT:    shll $16, %ecx
+; CHECK-SSE4-O0-NEXT:    pextrw $0, %xmm0, %edx
+; CHECK-SSE4-O0-NEXT:    # kill: def $dx killed $dx killed $edx
+; CHECK-SSE4-O0-NEXT:    movzwl %dx, %edx
+; CHECK-SSE4-O0-NEXT:    orl %ecx, %edx
+; CHECK-SSE4-O0-NEXT:    # implicit-def: $rcx
+; CHECK-SSE4-O0-NEXT:    movl %edx, %ecx
+; CHECK-SSE4-O0-NEXT:    shlq $32, %rcx
+; CHECK-SSE4-O0-NEXT:    orq %rcx, %rax
+; CHECK-SSE4-O0-NEXT:    movq %rax, (%rdi)
+; CHECK-SSE4-O0-NEXT:    retq
+;
+; CHECK-AVX-O0-LABEL: store_atomic_vec4_half:
+; CHECK-AVX-O0:       # %bb.0:
+; CHECK-AVX-O0-NEXT:    vmovq %xmm0, %rax
+; CHECK-AVX-O0-NEXT:    movq %rax, (%rdi)
+; CHECK-AVX-O0-NEXT:    retq
+  store atomic <4 x half> %v, ptr %x release, align 8
+  ret void
+}
+
+define void @store_atomic_vec4_bfloat(ptr %x, <4 x bfloat> %v) nounwind {
+; CHECK-SSE2-O3-LABEL: store_atomic_vec4_bfloat:
+; CHECK-SSE2-O3:       # %bb.0:
+; CHECK-SSE2-O3-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-SSE2-O3-NEXT:    psrld $16, %xmm1
+; CHECK-SSE2-O3-NEXT:    pextrw $0, %xmm1, %eax
+; CHECK-SSE2-O3-NEXT:    pextrw $0, %xmm0, %ecx
+; CHECK-SSE2-O3-NEXT:    shll $16, %eax
+; CHECK-SSE2-O3-NEXT:    movzwl %cx, %ecx
+; CHECK-SSE2-O3-NEXT:    orl %eax, %ecx
+; CHECK-SSE2-O3-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-SSE2-O3-NEXT:    psrlq $48, %xmm1
+; CHECK-SSE2-O3-NEXT:    pextrw $0, %xmm1, %eax
+; CHECK-SSE2-O3-NEXT:    shll $16, %eax
+; CHECK-SSE2-O3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; CHECK-SSE2-O3-NEXT:    pextrw $0, %xmm0, %edx
+; CHECK-SSE2-O3-NEXT:    movzwl %dx, %edx
+; CHECK-SSE2-O3-NEXT:    orl %eax, %edx
+; CHECK-SSE2-O3-NEXT:    shlq $32, %rdx
+; CHECK-SSE2-O3-NEXT:    orq %rcx, %rdx
+; CHECK-SSE2-O3-NEXT:    movq %rdx, (%rdi)
+; CHECK-SSE2-O3-NEXT:    retq
+;
+; CHECK-SSE4-O3-LABEL: store_atomic_vec4_bfloat:
+; CHECK-SSE4-O3:       # %bb.0:
+; CHECK-SSE4-O3-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-SSE4-O3-NEXT:    psrld $16, %xmm1
+; CHECK-SSE4-O3-NEXT:    pextrw $0, %xmm1, %eax
+; CHECK-SSE4-O3-NEXT:    shll $16, %eax
+; CHECK-SSE4-O3-NEXT:    pextrw $0, %xmm0, %ecx
+; CHECK-SSE4-O3-NEXT:    movzwl %cx, %ecx
+; CHECK-SSE4-O3-NEXT:    orl %eax, %ecx
+; CHECK-SSE4-O3-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE4-O3-NEXT:    psrlq $48, %xmm0
+; CHECK-SSE4-O3-NEXT:    pextrw $0, %xmm0, %eax
+; CHECK-SSE4-O3-NEXT:    shll $16, %eax
+; CHECK-SSE4-O3-NEXT:    pextrw $0, %xmm1, %edx
+; CHECK-SSE4-O3-NEXT:    movzwl %dx, %edx
+; CHECK-SSE4-O3-NEXT:    orl %eax, %edx
+; CHECK-SSE4-O3-NEXT:    shlq $32, %rdx
+; CHECK-SSE4-O3-NEXT:    orq %rcx, %rdx
+; CHECK-SSE4-O3-NEXT:    movq %rdx, (%rdi)
+; CHECK-SSE4-O3-NEXT:    retq
+;
+; CHECK-AVX-O3-LABEL: store_atomic_vec4_bfloat:
+; CHECK-AVX-O3:       # %bb.0:
+; CHECK-AVX-O3-NEXT:    vmovq %xmm0, %rax
+; CHECK-AVX-O3-NEXT:    movq %rax, (%rdi)
+; CHECK-AVX-O3-NEXT:    retq
+;
+; CHECK-SSE-O0-LABEL: store_atomic_vec4_bfloat:
+; CHECK-SSE-O0:       # %bb.0:
+; CHECK-SSE-O0-NEXT:    subq $40, %rsp
+; CHECK-SSE-O0-NEXT:    movaps %xmm0, %xmm1
+; CHECK-SSE-O0-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-SSE-O0-NEXT:    pextrw $3, %xmm1, %eax
+; CHECK-SSE-O0-NEXT:    shll $16, %eax
+; CHECK-SSE-O0-NEXT:    movd %eax, %xmm0
+; CHECK-SSE-O0-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE-O0-NEXT:    pextrw $2, %xmm1, %eax
+; CHECK-SSE-O0-NEXT:    shll $16, %eax
+; CHECK-SSE-O0-NEXT:    movd %eax, %xmm0
+; CHECK-SSE-O0-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE-O0-NEXT:    pextrw $1, %xmm1, %eax
+; CHECK-SSE-O0-NEXT:    shll $16, %eax
+; CHECK-SSE-O0-NEXT:    movd %eax, %xmm0
+; CHECK-SSE-O0-NEXT:    movd %xmm1, %eax
+; CHECK-SSE-O0-NEXT:    shll $16, %eax
+; CHECK-SSE-O0-NEXT:    movd %eax, %xmm1
+; CHECK-SSE-O0-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE-O0-NEXT:    callq __truncsfbf2@PLT
+; CHECK-SSE-O0-NEXT:    movaps %xmm0, %xmm1
+; CHECK-SSE-O0-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; CHECK-SSE-O0-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; CHECK-SSE-O0-NEXT:    pextrw $0, %xmm1, %eax
+; CHECK-SSE-O0-NEXT:    movw %ax, %cx
+; CHECK-SSE-O0-NEXT:    # implicit-def: $eax
+; CHECK-SSE-O0-NEXT:    movw %cx, %ax
+; CHECK-SSE-O0-NEXT:    shll $16, %eax
+; CHECK-SSE-O0-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE-O0-NEXT:    callq __truncsfbf2@PLT
+; CHECK-SSE-O0-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-SSE-O0-NEXT:    movaps %xmm0, %xmm1
+; CHECK-SSE-O0-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; CHECK-SSE-O0-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; CHECK-SSE-O0-NEXT:    pextrw $0, %xmm1, %eax
+; CHECK-SSE-O0-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-SSE-O0-NEXT:    movzwl %ax, %eax
+; CHECK-SSE-O0-NEXT:    orl %ecx, %eax
+; CHECK-SSE-O0-NEXT:    # kill: def $rax killed $eax
+; CHECK-SSE-O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-SSE-O0-NEXT:    callq __truncsfbf2@PLT
+; CHECK-SSE-O0-NEXT:    movaps %xmm0, %xmm1
+; CHECK-SSE-O0-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; CHECK-SSE-O0-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; CHECK-SSE-O0-NEXT:    pextrw $0, %xmm1, %eax
+; CHECK-SSE-O0-NEXT:    movw %ax, %cx
+; CHECK-SSE-O0-NEXT:    # implicit-def: $eax
+; CHECK-SSE-O0-NEXT:    movw %cx, %ax
+; CHECK-SSE-O0-NEXT:    shll $16, %eax
+; CHECK-SSE-O0-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE-O0-NEXT:    callq __truncsfbf2@PLT
+; CHECK-SSE-O0-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-SSE-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; CHECK-SSE-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; CHECK-SSE-O0-NEXT:    pextrw $0, %xmm0, %edx
+; CHECK-SSE-O0-NEXT:    # kill: def $dx killed $dx killed $edx
+; CHECK-SSE-O0-NEXT:    movzwl %dx, %edx
+; CHECK-SSE-O0-NEXT:    orl %ecx, %edx
+; CHECK-SSE-O0-NEXT:    # implicit-def: $rcx
+; CHECK-SSE-O0-NEXT:    movl %edx, %ecx
+; CHECK-SSE-O0-NEXT:    shlq $32, %rcx
+; CHECK-SSE-O0-NEXT:    orq %rcx, %rax
+; CHECK-SSE-O0-NEXT:    movq %rax, (%rdi)
+; CHECK-SSE-O0-NEXT:    addq $40, %rsp
+; CHECK-SSE-O0-NEXT:    retq
+;
+; CHECK-AVX-O0-LABEL: store_atomic_vec4_bfloat:
+; CHECK-AVX-O0:       # %bb.0:
+; CHECK-AVX-O0-NEXT:    subq $40, %rsp
+; CHECK-AVX-O0-NEXT:    vmovaps %xmm0, %xmm1
+; CHECK-AVX-O0-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-AVX-O0-NEXT:    vpextrw $3, %xmm1, %eax
+; CHECK-AVX-O0-NEXT:    shll $16, %eax
+; CHECK-AVX-O0-NEXT:    vmovd %eax, %xmm0
+; CHECK-AVX-O0-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-AVX-O0-NEXT:    vpextrw $2, %xmm1, %eax
+; CHECK-AVX-O0-NEXT:    shll $16, %eax
+; CHECK-AVX-O0-NEXT:    vmovd %eax, %xmm0
+; CHECK-AVX-O0-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-AVX-O0-NEXT:    vpextrw $1, %xmm1, %eax
+; CHECK-AVX-O0-NEXT:    shll $16, %eax
+; CHECK-AVX-O0-NEXT:    vmovd %eax, %xmm0
+; CHECK-AVX-O0-NEXT:    vmovd %xmm1, %eax
+; CHECK-AVX-O0-NEXT:    shll $16, %eax
+; CHECK-AVX-O0-NEXT:    vmov...
[truncated]

github-actions · 2026-05-15T03:48:39Z

⚠️ We detected that you are using a GitHub private e-mail address to contribute to the repo.
Please turn off Keep my email addresses private setting in your account.
See LLVM Developer Policy and LLVM Discourse for more information.

phoebewang · 2026-05-15T08:07:37Z

+  // width. Bitcasting the (illegal) vector value to that integer lets the
+  // type legalizer further legalize the BITCAST input as needed, while the
+  // ATOMIC_STORE itself uses only the legal integer type.
+  EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());


Does it apply to <2 x x86_fp80>, <2 x fp128> etc. that no corresponding legal integer type?

It does apply to those; however, it doesn't matter per se that there is no legal type if so since AtomicExpand.cpp's atomicSizeSupported's call to TLI->getMaxAtomicSizeInBitsSupported() / 8 will cause lowering of these types to library calls instead, i.e. before the DAG lowers them.

arsenm · 2026-05-15T12:12:55Z

+; CHECK-SSE-O0-LABEL: store_atomic_vec2_bfloat:
+; CHECK-SSE-O0:       # %bb.0:
+; CHECK-SSE-O0-NEXT:    subq $24, %rsp
+; CHECK-SSE-O0-NEXT:    .cfi_def_cfa_offset 32


Add nounwind to functions

github-actions · 2026-05-16T13:35:23Z

🐧 Linux x64 Test Results

196087 tests passed
5300 tests skipped

✅ The build succeeded and all tests passed.

github-actions · 2026-05-16T13:35:23Z

🪟 Windows x64 Test Results

135312 tests passed
3349 tests skipped

✅ The build succeeded and all tests passed.

RKSimon

LGTM

Vector types of 2 elements must be widened. This change does this for vector types of atomic store in SelectionDAG so that it can translate aligned vectors of >1 size.

This change adds patterns to optimize out an extra MOV present after widening the atomic store. Covers <2 x i8> (SSE4.1+), <2 x i16>, <4 x i8>, <2 x i32>, <2 x float>, <4 x i16>, <2 x ptr addrspace(270)>.

Vector types that aren't widened are split so that a single ATOMIC_STORE is issued for the entire vector at once. This enables SelectionDAG to translate vectors with type bfloat,half.

This change adds patterns to optimize out an extra MOV present after widening the atomic store. Covers `<2 x i8>` (SSE4.1+), `<2 x i16>`, `<4 x i8>`, `<2 x i32>`, `<2 x float>`, `<4 x i16>`, `<2 x ptr addrspace(270)>`. Store-side counterpart to #148898. Stacked on top of #197618; and below of #197860.

Vector types that aren't widened are split so that a single ATOMIC_STORE is issued for the entire vector at once. This enables SelectionDAG to translate vectors with type bfloat,half. Store-side counterpart to llvm#165818. Stacked on top of llvm#197619; and below of llvm#197861.

llvmorg-github-actions Bot added backend:X86 llvm:SelectionDAG SelectionDAGISel as well labels May 15, 2026

jofrn mentioned this pull request May 15, 2026

[X86] Extend alignedstore PatFrag to cover atomic_store #197861

Open

RKSimon self-requested a review May 15, 2026 07:05

phoebewang reviewed May 15, 2026

View reviewed changes

jofrn mentioned this pull request May 15, 2026

[X86] Remove extra MOV after widening atomic store #197619

Merged

arsenm approved these changes May 15, 2026

View reviewed changes

jofrn force-pushed the users/jofrn/x86-remove-extra-mov-atomic-store branch from ac8361d to a730eaf Compare May 16, 2026 13:00

jofrn force-pushed the users/jofrn/split-vec-atomic-store branch from 01c3c98 to e0dca2b Compare May 16, 2026 13:00

jofrn force-pushed the users/jofrn/split-vec-atomic-store branch from e0dca2b to e4c9611 Compare May 16, 2026 17:00

RKSimon approved these changes May 17, 2026

View reviewed changes

jofrn force-pushed the users/jofrn/x86-remove-extra-mov-atomic-store branch 2 times, most recently from b20b8d4 to 16bfe6a Compare May 19, 2026 08:08

jofrn force-pushed the users/jofrn/split-vec-atomic-store branch from e4c9611 to a839f91 Compare May 19, 2026 10:41

jofrn force-pushed the users/jofrn/x86-remove-extra-mov-atomic-store branch from 16bfe6a to 7b09891 Compare May 19, 2026 13:28

jofrn force-pushed the users/jofrn/split-vec-atomic-store branch 2 times, most recently from 9a15cc0 to 9f6ca39 Compare May 20, 2026 21:49

jofrn force-pushed the users/jofrn/x86-remove-extra-mov-atomic-store branch from 7b09891 to 1613d11 Compare May 20, 2026 21:49

jofrn added 3 commits May 20, 2026 14:56

[SelectionDAG] Widen <2 x T> vector types for atomic store

63ef83c

Vector types of 2 elements must be widened. This change does this for vector types of atomic store in SelectionDAG so that it can translate aligned vectors of >1 size.

[X86] Remove extra MOV after widening atomic store

7fb4fcf

This change adds patterns to optimize out an extra MOV present after widening the atomic store. Covers <2 x i8> (SSE4.1+), <2 x i16>, <4 x i8>, <2 x i32>, <2 x float>, <4 x i16>, <2 x ptr addrspace(270)>.

[SelectionDAG] Split vector types for atomic store

98275c5

Vector types that aren't widened are split so that a single ATOMIC_STORE is issued for the entire vector at once. This enables SelectionDAG to translate vectors with type bfloat,half.

jofrn force-pushed the users/jofrn/split-vec-atomic-store branch from 9f6ca39 to 98275c5 Compare May 20, 2026 21:59

jofrn force-pushed the users/jofrn/x86-remove-extra-mov-atomic-store branch from 1613d11 to 7fb4fcf Compare May 20, 2026 21:59

jofrn force-pushed the users/jofrn/x86-remove-extra-mov-atomic-store branch 3 times, most recently from 149e8c0 to 27aec96 Compare June 1, 2026 20:51

Base automatically changed from users/jofrn/x86-remove-extra-mov-atomic-store to main June 1, 2026 21:33

Merge branch 'main' into users/jofrn/split-vec-atomic-store

83cff25

jofrn merged commit 9b92e0f into main Jun 2, 2026
10 checks passed

jofrn deleted the users/jofrn/split-vec-atomic-store branch June 2, 2026 21:25

adams381 mentioned this pull request Jun 4, 2026

[CIR] Lower constant block addresses for goto #201644

Open

jofrn mentioned this pull request Jun 6, 2026

[X86] Replace alignedstore/alignedload custom predicates with MinAlignment. NFC #201980

Open

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[SelectionDAG] Split vector types for atomic store#197860

[SelectionDAG] Split vector types for atomic store#197860
jofrn merged 4 commits into
mainfrom
users/jofrn/split-vec-atomic-store

jofrn commented May 15, 2026 •

edited

Loading

Uh oh!

llvmorg-github-actions Bot commented May 15, 2026 •

edited

Loading

Uh oh!

github-actions Bot commented May 15, 2026

Uh oh!

phoebewang May 15, 2026

Uh oh!

jofrn May 15, 2026 •

edited

Loading

Uh oh!

arsenm May 15, 2026

Uh oh!

github-actions Bot commented May 16, 2026 •

edited

Loading

Uh oh!

github-actions Bot commented May 16, 2026 •

edited

Loading

Uh oh!

RKSimon left a comment

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

4 participants

Conversation

jofrn commented May 15, 2026 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmorg-github-actions Bot commented May 15, 2026 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

github-actions Bot commented May 15, 2026

Uh oh!

phoebewang May 15, 2026

Choose a reason for hiding this comment

Uh oh!

jofrn May 15, 2026 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

arsenm May 15, 2026

Choose a reason for hiding this comment

Uh oh!

github-actions Bot commented May 16, 2026 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

🐧 Linux x64 Test Results

Uh oh!

github-actions Bot commented May 16, 2026 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

🪟 Windows x64 Test Results

Uh oh!

RKSimon left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

4 participants

jofrn commented May 15, 2026 •

edited

Loading

llvmorg-github-actions Bot commented May 15, 2026 •

edited

Loading

jofrn May 15, 2026 •

edited

Loading

github-actions Bot commented May 16, 2026 •

edited

Loading

github-actions Bot commented May 16, 2026 •

edited

Loading