Skip to content

[Arm64] addressing mode inefficiencies in Guid:op_Equality(Guid,Guid):bool #35622

@BruceForstall

Description

@BruceForstall

The arm64 code generated for Guid::op_Equality() could be improved by (1) folding the fp-relative address calculation into the ldr addressing modes, and (2) not using the stack at all.

The code:

// Guid equality operator: compares the two values as four consecutive
// int-sized elements, starting at field _a and stepping by element offsets
// 1..3 via Unsafe.Add. The && chain short-circuits, so a later element is
// only read when every earlier element matched.
// NOTE(review): assumes Guid's fields occupy 16 contiguous bytes starting at
// _a — confirm against the struct layout.
public static bool operator ==(Guid a, Guid b) =>
    a._a == b._a &&
        Unsafe.Add(ref a._a, 1) == Unsafe.Add(ref b._a, 1) &&
        Unsafe.Add(ref a._a, 2) == Unsafe.Add(ref b._a, 2) &&
        Unsafe.Add(ref a._a, 3) == Unsafe.Add(ref b._a, 3);

This code itself is odd: it compares four int values instead of comparing field by field (one int, two shorts, and eight bytes). On 64-bit targets it should compare two longs instead.

The x64 code is a fairly direct translation of this C# code.

arm64 first spills the two 16-byte struct arguments (each passed in a register pair) to the stack, then reloads each 4-byte element one at a time to compare. The base addresses of the stack locals are recomputed before each load, instead of being folded into the addressing mode of the load that adds the offset.

x64 assembly
; JIT-generated x64 code for Guid.op_Equality(Guid, Guid) (verbatim dump).
; rcx and rdx are each dereferenced as the address of one 16-byte operand
; (presumably rcx = &a, rdx = &b — confirm against the calling convention).
; The operands are compared 4 bytes at a time at offsets 0, 4, 8 and 12.
; Result in eax: the sete outcome on the all-compared path (IG03/IG04),
; 0 on any early mismatch (IG05/IG06).
G_M24558_IG01:
                        ;; bbWeight=1    PerfScore 0.00
G_M24558_IG02:
       ; element 0: bail out to the "return false" path on mismatch
       8B01                 mov      eax, dword ptr [rcx]
       3B02                 cmp      eax, dword ptr [rdx]
       751D                 jne      SHORT G_M24558_IG05
                        ;; bbWeight=1    PerfScore 5.00
G_M24558_IG03:
       ; elements 1 and 2: the offset is folded directly into each memory operand
       8B4104               mov      eax, dword ptr [rcx+4]
       3B4204               cmp      eax, dword ptr [rdx+4]
       7515                 jne      SHORT G_M24558_IG05
       8B4108               mov      eax, dword ptr [rcx+8]
       3B4208               cmp      eax, dword ptr [rdx+8]
       750D                 jne      SHORT G_M24558_IG05
       ; element 3: no branch needed, the final compare result is the return value
       8B410C               mov      eax, dword ptr [rcx+12]
       3B420C               cmp      eax, dword ptr [rdx+12]
       0F94C0               sete     al
       0FB6C0               movzx    rax, al
                        ;; bbWeight=0.50 PerfScore 7.63
G_M24558_IG04:
       C3                   ret
                        ;; bbWeight=0.50 PerfScore 0.50
G_M24558_IG05:
       ; mismatch path: return 0 (false)
       33C0                 xor      eax, eax
                        ;; bbWeight=0.50 PerfScore 0.13
G_M24558_IG06:
       C3                   ret
arm64 assembly
; JIT-generated arm64 code for Guid.op_Equality(Guid, Guid) (verbatim dump).
; The two 16-byte operands arrive in register pairs x0:x1 and x2:x3, are
; spilled to a 48-byte frame, and are then reloaded 4 bytes at a time.
; Result in x0: the cset outcome on the all-compared path (IG03/IG04),
; 0 on any early mismatch (IG05/IG06).
G_M24558_IG01:
        ; prolog: allocate 48 bytes, save fp/lr, then spill both operands:
        ;   x0:x1 -> [fp+32 .. fp+47],  x2:x3 -> [fp+16 .. fp+31]
        A9BD7BFD          stp     fp, lr, [sp,#-48]!
        910003FD          mov     fp, sp
        F90013A0          str     x0, [fp,#32]
        F90017A1          str     x1, [fp,#40]
        F9000BA2          str     x2, [fp,#16]
        F9000FA3          str     x3, [fp,#24]
                        ;; bbWeight=1    PerfScore 5.50
G_M24558_IG02:
        ; element 0: loaded with fp-relative addressing directly (no extra add)
        B94023A0          ldr     w0, [fp,#32]
        B94013A1          ldr     w1, [fp,#16]
        6B01001F          cmp     w0, w1
        540002A1          bne     G_M24558_IG05
                        ;; bbWeight=1    PerfScore 5.50
G_M24558_IG03:
        ; elements 1..3: each load is preceded by an add that recomputes the
        ; local's base address (fp+32 or fp+16) before applying the element
        ; offset. These repeated add/ldr pairs are the inefficiency reported here.
        910083A0          add     x0, fp, #32
        B9400400          ldr     w0, [x0,#4]
        910043A1          add     x1, fp, #16
        B9400421          ldr     w1, [x1,#4]
        6B01001F          cmp     w0, w1
        540001E1          bne     G_M24558_IG05
        910083A0          add     x0, fp, #32
        B9400800          ldr     w0, [x0,#8]
        910043A1          add     x1, fp, #16
        B9400821          ldr     w1, [x1,#8]
        6B01001F          cmp     w0, w1
        54000121          bne     G_M24558_IG05
        910083A0          add     x0, fp, #32
        B9400C00          ldr     w0, [x0,#12]
        910043A1          add     x1, fp, #16
        B9400C21          ldr     w1, [x1,#12]
        6B01001F          cmp     w0, w1
        ; final element: the compare result becomes the return value (1 if equal)
        9A9F17E0          cset    x0, eq
                        ;; bbWeight=0.50 PerfScore 12.50
G_M24558_IG04:
        ; epilog: restore fp/lr and release the 48-byte frame
        A8C37BFD          ldp     fp, lr, [sp],#48
        D65F03C0          ret     lr
                        ;; bbWeight=0.50 PerfScore 1.00
G_M24558_IG05:
        ; mismatch path: return 0 (false)
        52800000          mov     w0, #0
                        ;; bbWeight=0.50 PerfScore 0.25
G_M24558_IG06:
        A8C37BFD          ldp     fp, lr, [sp],#48
        D65F03C0          ret     lr
Possible arm64 assembly after fixing address calculations
; Proposed arm64 code (hand-written, not JIT output): same frame layout and
; spills as the current code, but each element load folds the local's base
; offset and the element offset into a single fp-relative ldr.
;   first operand  at fp+32: elements at #32, #36, #40, #44
;   second operand at fp+16: elements at #16, #20, #24, #28
G_M24558_IG01:
        ; prolog: 48-byte frame; spill x0:x1 to fp+32 and x2:x3 to fp+16
        stp     fp, lr, [sp,#-48]!
        mov     fp, sp
        str     x0, [fp,#32]
        str     x1, [fp,#40]
        str     x2, [fp,#16]
        str     x3, [fp,#24]
G_M24558_IG02:
        ; element 0
        ldr     w0, [fp,#32]
        ldr     w1, [fp,#16]
        cmp     w0, w1
        bne     G_M24558_IG05
G_M24558_IG03:
        ; element 1 (base + 4)
        ldr     w0, [fp,#36]
        ldr     w1, [fp,#20]
        cmp     w0, w1
        bne     G_M24558_IG05
        ; element 2 (base + 8)
        ldr     w0, [fp,#40]
        ldr     w1, [fp,#24]
        cmp     w0, w1
        bne     G_M24558_IG05
        ; element 3 (base + 12): the compare result is the return value
        ldr     w0, [fp,#44]
        ldr     w1, [fp,#28]
        cmp     w0, w1
        cset    x0, eq
G_M24558_IG04:
        ldp     fp, lr, [sp],#48
        ret     lr
G_M24558_IG05:
        ; mismatch path: return 0 (false)
        mov     w0, #0
G_M24558_IG06:
        ldp     fp, lr, [sp],#48
        ret     lr

The JIT shouldn't need to put the argument structs on the stack at all, in which case we could generate code like the following (also assuming we can compare full registers rather than 4 bytes at a time).

Possible arm64 assembly fully optimized
; Proposed arm64 code (hand-written, not JIT output): the operands are never
; spilled. The incoming register pairs are compared directly, 8 bytes at a
; time (x0 vs x2, then x1 vs x3). The frame shrinks to the 16 bytes that
; hold fp/lr.
; NOTE(review): a conditional compare (cmp x0, x2 / ccmp x1, x3, #0, eq /
; cset) could presumably make this branch-free — not part of the original
; proposal; confirm the JIT can emit it.
G_M24558_IG01:
        stp     fp, lr, [sp,#-16]!
        mov     fp, sp
G_M24558_IG02:
        ; first 8 bytes of each operand
        cmp     x0, x2
        bne     G_M24558_IG05
        ; second 8 bytes: the compare result is the return value (1 if equal)
        cmp     x1, x3
        cset    x0, eq
G_M24558_IG04:
        ldp     fp, lr, [sp],#16
        ret     lr
G_M24558_IG05:
        ; mismatch path: return 0 (false)
        mov     w0, #0
G_M24558_IG06:
        ldp     fp, lr, [sp],#16
        ret     lr

category:cq
theme:optimization
skill-level:intermediate
cost:medium

Metadata

Metadata

Assignees

Labels

Type

No type

Projects

No projects

Relationships

None yet

Development

No branches or pull requests

Issue actions