Skip to content

CINC/CSEL not emitted inside the loops instead of jumps over single instruction blocks #96380

@neon-sunset

Description

@neon-sunset

Description

It appears that for a straightforward compare -> increment, .NET emits a branch instead of cinc when a method containing such code is inlined inside a loop body.

Analysis

Given a method RuneLength:

/// <summary>
/// Returns the number of leading one bits in <paramref name="value"/>,
/// except that a count of 0 is reported as 1.
/// </summary>
/// <param name="value">The byte to inspect, passed by read-only reference.</param>
/// <returns>The leading-one-bit count of the byte, or 1 when the top bit is clear.</returns>
/// <remarks>
/// NOTE(review): the name suggests the result is the byte length of a UTF-8
/// sequence taken from its lead byte — confirm against callers. No validation
/// of the byte is performed here.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int RuneLength(in byte value)
{
    // Shift the byte into the top 8 bits of a uint and invert it, so the byte's
    // leading ones become the uint's leading zeros (the low 24 bits become ones).
    var lzcnt = (uint)BitOperations.LeadingZeroCount(~((uint)value << 24));
    // Compare-with-zero followed by an increment: bumps a count of 0 up to 1.
    if (lzcnt is 0) lzcnt++;

    return (int)lzcnt;
}

This compiles to:

G_M000_IG01:                ;; offset=0x0000
            stp     fp, lr, [sp, #-0x10]!
            mov     fp, sp
 
G_M000_IG02:                ;; offset=0x0008
            ldrb    w0, [x0]
            lsl     w0, w0, #24
            mvn     w0, w0
            clz     w0, w0
            mov     w1, #1
            cmp     w0, #0
            csel    w0, w0, w1, ne ;; <-- Could have been just cinc with mov 1 elided - we're incrementing zero
 
G_M000_IG03:                ;; offset=0x0024
            ldp     fp, lr, [sp], #0x10
            ret     lr
 
; Total bytes of code 44

We can see that .NET emits mov + cmp + csel here. This isn't cinc but it's a good start.

However, if the method is inlined inside a loop body, the codegen is different.
Consider the below method:

// Walks forward from ptr while its address is below end, advancing by
// RuneLength(ptr) bytes per step, and returns the sum of those lengths.
static int Iterate(ref byte ptr, ref byte end)
{
    var acc = 0;
    // Loop condition compares the two managed references by address.
    while (Unsafe.IsAddressLessThan(ref ptr, ref end))
    {
        // RuneLength is marked AggressiveInlining, so its compare + increment
        // is expected to end up inside this loop body.
        var length = RuneLength(in ptr);
        acc += length;
        // Ref-reassign ptr to point `length` bytes further on; nothing here
        // checks that the new address stays at or below end.
        ptr = ref Unsafe.Add(ref ptr, length);
    }

    return acc;
}

Instead of the pattern above, the codegen changes to cbnz label followed by mov 1. This is more compact, but it is more expensive: it uses the branch execution units, which have lower throughput per cycle than the integer units that csel uses (on modern ARM cores like Firestorm or Cortex-X1):

G_M000_IG01:                ;; offset=0x0000
            stp     fp, lr, [sp, #-0x10]!
            mov     fp, sp
 
G_M000_IG02:                ;; offset=0x0008
            mov     w2, wzr
            cmp     x0, x1
            bhs     G_M000_IG06
            align   [0 bytes for IG03]
            align   [0 bytes]
            align   [0 bytes]
            align   [0 bytes]
 
G_M000_IG03:                ;; offset=0x0014
            ldrb    w3, [x0]
            lsl     w3, w3, #24
            mvn     w3, w3
            clz     w3, w3
            cbnz    w3, G_M000_IG05 ;; <-- Could have been cmp + cinc
 
G_M000_IG04:                ;; offset=0x0028
            mov     w3, #1
 
G_M000_IG05:                ;; offset=0x002C
            add     w2, w2, w3
            sxtw    x3, w3
            add     x0, x0, x3
            cmp     x0, x1
            blo     G_M000_IG03
 
G_M000_IG06:                ;; offset=0x0040
            mov     w0, w2
 
G_M000_IG07:                ;; offset=0x0044
            ldp     fp, lr, [sp], #0x10
            ret     lr
 
; Total bytes of code 76

Configuration

.NET SDK:
 Version:           8.0.100
 Commit:            57efcf1350
 Workload version:  8.0.100-manifests.71b9f198

Runtime Environment:
 OS Name:     Mac OS X
 OS Version:  14.1
 OS Platform: Darwin
 RID:         osx-arm64
 Base Path:   /usr/local/share/dotnet/sdk/8.0.100/

Regression?

No

Happy New Year!

Metadata

Metadata

Labels

area-CodeGen-coreclr: CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI
tenet-performance: Performance related issue

Type

No type

Projects

No projects

Milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions