[X86] Cast atomic vectors in IR to support floats#148899
Merged
jofrn merged 1 commit into main on May 11, 2026
Merged
Conversation
This was referenced Jul 15, 2025
Contributor
Author
Member
|
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-backend-x86 Author: None (jofrn) Changes: This commit casts floats to ints in an atomic load during AtomicExpand to support floating-point types. It is also required to support 128-bit vectors in SSE/AVX. 3 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 776d3c0a42e2f..3debf30da0a29 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -32070,6 +32070,13 @@ X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
}
}
+TargetLowering::AtomicExpansionKind
+X86TargetLowering::shouldCastAtomicLoadInIR(LoadInst *LI) const {
+ if (LI->getType()->getScalarType()->isFloatingPointTy())
+ return AtomicExpansionKind::CastToInteger;
+ return AtomicExpansionKind::None;
+}
+
LoadInst *
X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 5cb6b3e493a32..43cddb2b53bd6 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1839,6 +1839,8 @@ namespace llvm {
shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
TargetLoweringBase::AtomicExpansionKind
shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const;
+ TargetLoweringBase::AtomicExpansionKind
+ shouldCastAtomicLoadInIR(LoadInst *LI) const override;
void emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const override;
void emitCmpArithAtomicRMWIntrinsic(AtomicRMWInst *AI) const override;
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 4b818b6cfa57e..039edcbf83544 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -207,19 +207,19 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) {
; CHECK-O3-LABEL: atomic_vec1_bfloat:
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: movzwl (%rdi), %eax
-; CHECK-O3-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-O3-NEXT: movd %eax, %xmm0
; CHECK-O3-NEXT: retq
;
; CHECK-SSE-O3-LABEL: atomic_vec1_bfloat:
; CHECK-SSE-O3: # %bb.0:
; CHECK-SSE-O3-NEXT: movzwl (%rdi), %eax
-; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O3-NEXT: movd %eax, %xmm0
; CHECK-SSE-O3-NEXT: retq
;
; CHECK-AVX-O3-LABEL: atomic_vec1_bfloat:
; CHECK-AVX-O3: # %bb.0:
; CHECK-AVX-O3-NEXT: movzwl (%rdi), %eax
-; CHECK-AVX-O3-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; CHECK-AVX-O3-NEXT: vmovd %eax, %xmm0
; CHECK-AVX-O3-NEXT: retq
;
; CHECK-O0-LABEL: atomic_vec1_bfloat:
@@ -227,8 +227,7 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) {
; CHECK-O0-NEXT: movw (%rdi), %cx
; CHECK-O0-NEXT: # implicit-def: $eax
; CHECK-O0-NEXT: movw %cx, %ax
-; CHECK-O0-NEXT: # implicit-def: $xmm0
-; CHECK-O0-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-O0-NEXT: movd %eax, %xmm0
; CHECK-O0-NEXT: retq
;
; CHECK-SSE-O0-LABEL: atomic_vec1_bfloat:
@@ -236,8 +235,7 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) {
; CHECK-SSE-O0-NEXT: movw (%rdi), %cx
; CHECK-SSE-O0-NEXT: # implicit-def: $eax
; CHECK-SSE-O0-NEXT: movw %cx, %ax
-; CHECK-SSE-O0-NEXT: # implicit-def: $xmm0
-; CHECK-SSE-O0-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O0-NEXT: movd %eax, %xmm0
; CHECK-SSE-O0-NEXT: retq
;
; CHECK-AVX-O0-LABEL: atomic_vec1_bfloat:
@@ -245,8 +243,7 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) {
; CHECK-AVX-O0-NEXT: movw (%rdi), %cx
; CHECK-AVX-O0-NEXT: # implicit-def: $eax
; CHECK-AVX-O0-NEXT: movw %cx, %ax
-; CHECK-AVX-O0-NEXT: # implicit-def: $xmm0
-; CHECK-AVX-O0-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm0
; CHECK-AVX-O0-NEXT: retq
%ret = load atomic <1 x bfloat>, ptr %x acquire, align 2
ret <1 x bfloat> %ret
@@ -377,6 +374,74 @@ define <2 x float> @atomic_vec2_float_align(ptr %x) {
ret <2 x float> %ret
}
+define <2 x half> @atomic_vec2_half(ptr %x) {
+; CHECK-O3-LABEL: atomic_vec2_half:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec2_half:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec2_half:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec2_half:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec2_half:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec2_half:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <2 x half>, ptr %x acquire, align 4
+ ret <2 x half> %ret
+}
+
+define <2 x bfloat> @atomic_vec2_bfloat(ptr %x) {
+; CHECK-O3-LABEL: atomic_vec2_bfloat:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec2_bfloat:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec2_bfloat:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec2_bfloat:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec2_bfloat:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec2_bfloat:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <2 x bfloat>, ptr %x acquire, align 4
+ ret <2 x bfloat> %ret
+}
+
define <1 x ptr> @atomic_vec1_ptr(ptr %x) nounwind {
; CHECK-O3-LABEL: atomic_vec1_ptr:
; CHECK-O3: # %bb.0:
@@ -457,19 +522,19 @@ define <1 x half> @atomic_vec1_half(ptr %x) {
; CHECK-O3-LABEL: atomic_vec1_half:
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: movzwl (%rdi), %eax
-; CHECK-O3-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-O3-NEXT: movd %eax, %xmm0
; CHECK-O3-NEXT: retq
;
; CHECK-SSE-O3-LABEL: atomic_vec1_half:
; CHECK-SSE-O3: # %bb.0:
; CHECK-SSE-O3-NEXT: movzwl (%rdi), %eax
-; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O3-NEXT: movd %eax, %xmm0
; CHECK-SSE-O3-NEXT: retq
;
; CHECK-AVX-O3-LABEL: atomic_vec1_half:
; CHECK-AVX-O3: # %bb.0:
; CHECK-AVX-O3-NEXT: movzwl (%rdi), %eax
-; CHECK-AVX-O3-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; CHECK-AVX-O3-NEXT: vmovd %eax, %xmm0
; CHECK-AVX-O3-NEXT: retq
;
; CHECK-O0-LABEL: atomic_vec1_half:
@@ -477,8 +542,7 @@ define <1 x half> @atomic_vec1_half(ptr %x) {
; CHECK-O0-NEXT: movw (%rdi), %cx
; CHECK-O0-NEXT: # implicit-def: $eax
; CHECK-O0-NEXT: movw %cx, %ax
-; CHECK-O0-NEXT: # implicit-def: $xmm0
-; CHECK-O0-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-O0-NEXT: movd %eax, %xmm0
; CHECK-O0-NEXT: retq
;
; CHECK-SSE-O0-LABEL: atomic_vec1_half:
@@ -486,8 +550,7 @@ define <1 x half> @atomic_vec1_half(ptr %x) {
; CHECK-SSE-O0-NEXT: movw (%rdi), %cx
; CHECK-SSE-O0-NEXT: # implicit-def: $eax
; CHECK-SSE-O0-NEXT: movw %cx, %ax
-; CHECK-SSE-O0-NEXT: # implicit-def: $xmm0
-; CHECK-SSE-O0-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O0-NEXT: movd %eax, %xmm0
; CHECK-SSE-O0-NEXT: retq
;
; CHECK-AVX-O0-LABEL: atomic_vec1_half:
@@ -495,8 +558,7 @@ define <1 x half> @atomic_vec1_half(ptr %x) {
; CHECK-AVX-O0-NEXT: movw (%rdi), %cx
; CHECK-AVX-O0-NEXT: # implicit-def: $eax
; CHECK-AVX-O0-NEXT: movw %cx, %ax
-; CHECK-AVX-O0-NEXT: # implicit-def: $xmm0
-; CHECK-AVX-O0-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm0
; CHECK-AVX-O0-NEXT: retq
%ret = load atomic <1 x half>, ptr %x acquire, align 2
ret <1 x half> %ret
@@ -841,6 +903,89 @@ define <4 x i16> @atomic_vec4_i16(ptr %x) nounwind {
ret <4 x i16> %ret
}
+define <4 x half> @atomic_vec4_half(ptr %x) nounwind {
+; CHECK-LABEL: atomic_vec4_half:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq (%rdi), %xmm0
+; CHECK-NEXT: retq
+ %ret = load atomic <4 x half>, ptr %x acquire, align 8
+ ret <4 x half> %ret
+}
+
+define <4 x bfloat> @atomic_vec4_bfloat(ptr %x) nounwind {
+; CHECK-LABEL: atomic_vec4_bfloat:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq (%rdi), %xmm0
+; CHECK-NEXT: retq
+ %ret = load atomic <4 x bfloat>, ptr %x acquire, align 8
+ ret <4 x bfloat> %ret
+}
+
+define <4 x float> @atomic_vec4_float_align(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec4_float_align:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: pushq %rax
+; CHECK-O3-NEXT: movl $2, %esi
+; CHECK-O3-NEXT: callq __atomic_load_16@PLT
+; CHECK-O3-NEXT: movq %rdx, %xmm1
+; CHECK-O3-NEXT: movq %rax, %xmm0
+; CHECK-O3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-O3-NEXT: popq %rax
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec4_float_align:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: pushq %rbx
+; CHECK-SSE-O3-NEXT: xorl %eax, %eax
+; CHECK-SSE-O3-NEXT: xorl %edx, %edx
+; CHECK-SSE-O3-NEXT: xorl %ecx, %ecx
+; CHECK-SSE-O3-NEXT: xorl %ebx, %ebx
+; CHECK-SSE-O3-NEXT: lock cmpxchg16b (%rdi)
+; CHECK-SSE-O3-NEXT: movq %rdx, %xmm1
+; CHECK-SSE-O3-NEXT: movq %rax, %xmm0
+; CHECK-SSE-O3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-SSE-O3-NEXT: popq %rbx
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec4_float_align:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: vmovaps (%rdi), %xmm0
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec4_float_align:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: pushq %rax
+; CHECK-O0-NEXT: movl $2, %esi
+; CHECK-O0-NEXT: callq __atomic_load_16@PLT
+; CHECK-O0-NEXT: movq %rdx, %xmm1
+; CHECK-O0-NEXT: movq %rax, %xmm0
+; CHECK-O0-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-O0-NEXT: popq %rax
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec4_float_align:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: pushq %rbx
+; CHECK-SSE-O0-NEXT: xorl %eax, %eax
+; CHECK-SSE-O0-NEXT: movl %eax, %ebx
+; CHECK-SSE-O0-NEXT: movq %rbx, %rax
+; CHECK-SSE-O0-NEXT: movq %rbx, %rdx
+; CHECK-SSE-O0-NEXT: movq %rbx, %rcx
+; CHECK-SSE-O0-NEXT: lock cmpxchg16b (%rdi)
+; CHECK-SSE-O0-NEXT: movq %rdx, %xmm1
+; CHECK-SSE-O0-NEXT: movq %rax, %xmm0
+; CHECK-SSE-O0-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-SSE-O0-NEXT: popq %rbx
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec4_float_align:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: vmovaps (%rdi), %xmm0
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <4 x float>, ptr %x acquire, align 16
+ ret <4 x float> %ret
+}
+
define <4 x float> @atomic_vec4_float(ptr %x) nounwind {
; CHECK-O3-LABEL: atomic_vec4_float:
; CHECK-O3: # %bb.0:
|
b8727ee to
ff8979e
Compare
b30786f to
23b20f1
Compare
ff8979e to
aab7a4e
Compare
23b20f1 to
038ab35
Compare
aab7a4e to
74eb40e
Compare
038ab35 to
1f63bc7
Compare
74eb40e to
fc3ec7b
Compare
02b47f7 to
c5418c9
Compare
fc3ec7b to
15e8347
Compare
c5418c9 to
d81b453
Compare
15e8347 to
28b4f4a
Compare
d81b453 to
daca511
Compare
28b4f4a to
1bcf1f1
Compare
daca511 to
cce2b9d
Compare
1bcf1f1 to
98b88dc
Compare
cce2b9d to
02c0d96
Compare
a0038c0 to
5c2428c
Compare
edc1e28 to
35e2752
Compare
5c2428c to
9411926
Compare
4d1cdad to
f9b99b9
Compare
1434bcf to
8466578
Compare
f9b99b9 to
23fb928
Compare
Contributor
Author
Merge activity
|
8466578 to
72a340b
Compare
449e57c to
ad92988
Compare
Base automatically changed from
users/jofrn/gt/07-15-_selectiondag_split_vector_types_for_atomic_load
to
main
May 11, 2026 08:18
This commit casts floats to ints in an atomic load during AtomicExpand to support floating-point types. It is also required to support 128-bit vectors in SSE/AVX.
ad92988 to
ef6fd03
Compare
EuphoricThinking
pushed a commit
to EuphoricThinking/llvm-project
that referenced
this pull request
May 14, 2026
This commit casts floats to ints in an atomic load during AtomicExpand to support floating-point types. It is also required to support 128-bit vectors in SSE/AVX.
jofrn
added a commit
that referenced
this pull request
May 16, 2026
Extend the X86 `alignedstore` PatFrag to also match `atomic_store` with vector-size alignment, so existing MOVAPS/MOVAPD/MOVDQA-family aligned-store patterns cover 128-bit aligned vector atomic stores on SSE/AVX/AVX-512 without per-type duplicates. `<4 x float>`, `<2 x double>`, `<2 x i64>`, `<4 x i32>`, `<8 x half>`, `<8 x bfloat>` all codegen to a single `movaps`/`movapd` on AVX+ via this. Adds v8f16/v8bf16 bitconvert variants to the widen-path `atomic_store_32` / `atomic_store_64` patterns so `<2 x half>`, `<2 x bfloat>`, `<4 x half>`, `<4 x bfloat>` atomic stores reaching the PR4 widen path also collapse to a single instruction on AVX+ targets. Vectors whose `getTypeAction` is split rather than widen still rely on PR6's `SplitVecOp_ATOMIC_STORE` — that path bitcasts the vector to a scalar integer and issues an integer `atomic_store_N`, picked up by the pre-existing scalar atomic-store patterns. The two legalization paths together cover the full vector-atomic-store matrix. Store-side counterpart to #148899.
jofrn
added a commit
that referenced
this pull request
May 16, 2026
Extend the X86 `alignedstore` PatFrag to also match `atomic_store` with vector-size alignment, so existing MOVAPS/MOVAPD/MOVDQA-family aligned-store patterns cover 128-bit aligned vector atomic stores on SSE/AVX/AVX-512 without per-type duplicates. `<4 x float>`, `<2 x double>`, `<2 x i64>`, `<4 x i32>`, `<8 x half>`, `<8 x bfloat>` all codegen to a single `movaps`/`movapd` on AVX+ via this. Adds v8f16/v8bf16 bitconvert variants to the widen-path `atomic_store_32` / `atomic_store_64` patterns so `<2 x half>`, `<2 x bfloat>`, `<4 x half>`, `<4 x bfloat>` atomic stores reaching the PR4 widen path also collapse to a single instruction on AVX+ targets. Vectors whose `getTypeAction` is split rather than widen still rely on PR6's `SplitVecOp_ATOMIC_STORE` — that path bitcasts the vector to a scalar integer and issues an integer `atomic_store_N`, picked up by the pre-existing scalar atomic-store patterns. The two legalization paths together cover the full vector-atomic-store matrix. Store-side counterpart to #148899.
pedroMVicente
pushed a commit
to pedroMVicente/llvm-project
that referenced
this pull request
May 19, 2026
This commit casts floats to ints in an atomic load during AtomicExpand to support floating-point types. It is also required to support 128-bit vectors in SSE/AVX.
jofrn
added a commit
that referenced
this pull request
May 20, 2026
…dded in #148899) So that atomic floating-point and FP-vector loads are no longer bitcast to an integer at the IR level by AtomicExpand.
This was referenced May 23, 2026
Contributor
Author
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit. This suggestion is invalid because no changes were made to the code. Suggestions cannot be applied while the pull request is closed. Suggestions cannot be applied while viewing a subset of changes. Only one suggestion per line can be applied in a batch. Add this suggestion to a batch that can be applied as a single commit. Applying suggestions on deleted lines is not supported. You must change the existing code in this line in order to create a valid suggestion. Outdated suggestions cannot be applied. This suggestion has been applied or marked resolved. Suggestions cannot be applied from pending reviews. Suggestions cannot be applied on multi-line comments. Suggestions cannot be applied while the pull request is queued to merge. Suggestion cannot be applied right now. Please check back later.

This commit casts floats to ints in an atomic load during AtomicExpand to support
floating-point types. It is also required to support 128-bit vectors in SSE/AVX.