Given
https://godbolt.org/z/PGc51qhcT
; Reproducer, variant 1: the stack temporary is declared as an i128 alloca.
; When %c is true, 16 bytes are copied from %src into %tmp; block %b then
; loads an i128 through a phi that selects either %tmp or %other.
define i128 @typed(ptr %src, ptr %other, i1 %c) {
%tmp = alloca i128, align 16
br i1 %c, label %a, label %b
a:
; Copy 16 bytes from %src into the stack temporary (non-volatile memcpy).
call void @llvm.memcpy.p0.p0.i64(ptr %tmp, ptr %src, i64 16, i1 false)
br label %b
b:
; %0 is the implicit name of the unlabeled entry block: arriving from there
; selects %other, arriving from %a selects the freshly written %tmp.
%p = phi ptr [ %tmp, %a ], [ %other, %0 ]
%v = load i128, ptr %p, align 16
ret i128 %v
}
; Reproducer, variant 2: same control flow and memory operations as @typed,
; but the stack temporary is declared as a [16 x i8] alloca instead of i128.
define i128 @bytes(ptr %src, ptr %other, i1 %c) {
%tmp = alloca [16 x i8], align 16
br i1 %c, label %a, label %b
a:
; Copy 16 bytes from %src into the stack temporary (non-volatile memcpy).
call void @llvm.memcpy.p0.p0.i64(ptr %tmp, ptr %src, i64 16, i1 false)
br label %b
b:
; %0 is the implicit name of the unlabeled entry block: arriving from there
; selects %other, arriving from %a selects the freshly written %tmp.
%p = phi ptr [ %tmp, %a ], [ %other, %0 ]
%v = load i128, ptr %p, align 16
ret i128 %v
}
The only difference here is the type of the alloca. In the latter case the memcpy is not optimized out:
# Compiler output for @typed: no stack copy is emitted; one of the two
# incoming pointers is selected and the result is loaded directly through it.
typed:
# Test bit 0 of dl (presumably %c, the third argument under SysV AMD64).
test dl, 1
# If that bit is clear, overwrite rdi with rsi (presumably %src -> %other).
cmove rdi, rsi
# Load the 16-byte value as two 8-byte halves into the rax/rdx pair.
mov rax, qword ptr [rdi]
mov rdx, qword ptr [rdi + 8]
ret
# Compiler output for @bytes: the 16-byte copy into a stack slot is still
# emitted on the path where bit 0 of dl is set.
bytes:
test dl, 1
# Bit clear: skip the copy and load through rsi unchanged.
je .LBB1_2
# The surviving memcpy: unaligned 16-byte load from [rdi], aligned 16-byte
# store to the slot at [rsp - 24] (below rsp; presumably the SysV red zone).
movups xmm0, xmmword ptr [rdi]
movaps xmmword ptr [rsp - 24], xmm0
# Point rsi at the stack copy so both paths load through rsi.
lea rsi, [rsp - 24]
.LBB1_2:
# Load the 16-byte value as two 8-byte halves into the rax/rdx pair.
mov rax, qword ptr [rsi]
mov rdx, qword ptr [rsi + 8]
ret
The branch/phi appears to be important.
This might be related to #164308, but their example is quite messy, so hopefully a smaller reproducer helps in any case.
Given
https://godbolt.org/z/PGc51qhcT
The only difference here is the type of the alloca. In the latter case the memcpy is not optimized out.
The branch/phi appears to be important.
This might be related to #164308, but their example is quite messy, so hopefully a smaller reproducer helps in any case.