Skip to content

JIT: Suboptimal codegen when using array instead of span #12549

@Thealexbarney

Description

@Thealexbarney

In the following situation, changing the type of a local variable from Span<Vector128<byte>> to Vector128<byte>[] results in performance that is ~80% of the original.

The standard benchmark results come from passing a large array to the encryption function. The blocked results come from calling the encryption function on a single 0x10-byte AES block at a time.

Span:
ECB encryption: 5.708 GB/s
ECB encryption blocked: 775.686 MB/s

Array:
ECB encryption: 4.549 GB/s
ECB encryption blocked: 799.295 MB/s

// Get-only array of round keys; EncryptAesEcb reads elements 0 through 10 of it.
private Vector128<byte>[] RoundKeys { get; }

/// <summary>
/// Encrypts <paramref name="data"/> in place, one 128-bit vector block at a time,
/// using the keys in RoundKeys (Span variant of the repro).
/// </summary>
/// <param name="data">Byte buffer that is viewed as 128-bit vectors and overwritten with the result.</param>
public void EncryptAesEcb(Span<byte> data)
{
    // Span variant: the round-key array is accessed through a Span.
    Span<Vector128<byte>> keys = RoundKeys;
    // View the byte buffer as 128-bit vectors so that each element is one block.
    Span<Vector128<byte>> blocks = MemoryMarshal.Cast<byte, Vector128<byte>>(data);

    for (int i = 0; i < blocks.Length; i++)
    {
        Vector128<byte> b = blocks[i];

        // XOR with keys[0], then nine Aes.Encrypt steps with keys[1]..keys[9],
        // then Aes.EncryptLast with keys[10].
        b = Sse2.Xor(b, keys[0]);
        b = Aes.Encrypt(b, keys[1]);
        b = Aes.Encrypt(b, keys[2]);
        b = Aes.Encrypt(b, keys[3]);
        b = Aes.Encrypt(b, keys[4]);
        b = Aes.Encrypt(b, keys[5]);
        b = Aes.Encrypt(b, keys[6]);
        b = Aes.Encrypt(b, keys[7]);
        b = Aes.Encrypt(b, keys[8]);
        b = Aes.Encrypt(b, keys[9]);
        b = Aes.EncryptLast(b, keys[10]);

        // Store the result back over the input block.
        blocks[i] = b;
    }
}

Replace Span<Vector128<byte>> keys = RoundKeys; with Vector128<byte>[] keys = RoundKeys; to get the second set of results.

In addition to that, inserting the line Vector128<byte> key10 = keys[10]; somewhere before the loop gives the performance:
ECB encryption: 5.833 GB/s
ECB encryption blocked: 802.582 MB/s

Here's the complete code and JIT output:

Span
ECB encryption: 5.708 GB/s
ECB encryption blocked: 775.686 MB/s
ECB decryption: 5.394 GB/s
ECB decryption blocked: 765.59 MB/s

/// <summary>
/// Encrypts <paramref name="data"/> in place, one 128-bit vector block at a time,
/// using the keys in RoundKeys (Span variant of the repro).
/// </summary>
/// <param name="data">Byte buffer that is viewed as 128-bit vectors and overwritten with the result.</param>
public void EncryptEcb(Span<byte> data)
{
	// Span variant: the round-key array is accessed through a Span. In the JIT listing
	// that follows, the length checks for keys[0]..keys[10] appear before the loop body.
	Span<Vector128<byte>> keys = RoundKeys;
	// View the byte buffer as 128-bit vectors so that each element is one block.
	Span<Vector128<byte>> blocks = MemoryMarshal.Cast<byte, Vector128<byte>>(data);

	for (int i = 0; i < blocks.Length; i++)
	{
		Vector128<byte> b = blocks[i];

		// XOR with keys[0], then nine Aes.Encrypt steps with keys[1]..keys[9],
		// then Aes.EncryptLast with keys[10].
		b = Sse2.Xor(b, keys[0]);
		b = Aes.Encrypt(b, keys[1]);
		b = Aes.Encrypt(b, keys[2]);
		b = Aes.Encrypt(b, keys[3]);
		b = Aes.Encrypt(b, keys[4]);
		b = Aes.Encrypt(b, keys[5]);
		b = Aes.Encrypt(b, keys[6]);
		b = Aes.Encrypt(b, keys[7]);
		b = Aes.Encrypt(b, keys[8]);
		b = Aes.Encrypt(b, keys[9]);
		b = Aes.EncryptLast(b, keys[10]);

		// Store the result back over the input block.
		blocks[i] = b;
	}
}


; Assembly listing for method AesContext:EncryptEcb(struct):this
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; fully interruptible
; Final local variable assignments
;
;  V00 this         [V00,T08] (  3,  3   )     ref  ->  rcx         this class-hnd
;  V01 arg1         [V01,T05] (  4,  8   )   byref  ->  rdx        
;* V02 loc0         [V02    ] (  0,  0   )  struct (16) zero-ref    ld-addr-op
;* V03 loc1         [V03    ] (  0,  0   )  struct (16) zero-ref    ld-addr-op
;  V04 loc2         [V04,T01] (  5, 17   )     int  ->   r9        
;  V05 loc3         [V05,T00] ( 24, 96   )  simd16  ->  mm0        
;  V06 OutArgs      [V06    ] (  1,  1   )  lclBlk (32) [rsp+0x00]   "OutgoingArgSpace"
;  V07 tmp1         [V07,T06] (  4,  6   )     ref  ->  rax         class-hnd "Inlining Arg"
;* V08 tmp2         [V08    ] (  0,  0   )  struct (16) zero-ref    "NewObj constructor temp"
;* V09 tmp3         [V09    ] (  0,  0   )  simd16  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp"
;* V10 tmp4         [V10    ] (  0,  0   )  struct ( 8) zero-ref    "NewObj constructor temp"
;  V11 tmp5         [V11,T23] (  2,  2   )   byref  ->  rcx         "Inlining Arg"
;* V12 tmp6         [V12,T32] (  0,  0   )     int  ->  zero-ref    "Inline stloc first use temp"
;* V13 tmp7         [V13,T33] (  0,  0   )     int  ->  zero-ref    "Inline stloc first use temp"
;* V14 tmp8         [V14    ] (  0,  0   )  struct (16) zero-ref    ld-addr-op "Inlining Arg"
;  V15 tmp9         [V15,T27] (  2,  2   )     int  ->  rdx         "Inline stloc first use temp"
;* V16 tmp10        [V16    ] (  0,  0   )    long  ->  zero-ref    "Inline stloc first use temp"
;  V17 tmp11        [V17,T28] (  2,  2   )     int  ->  rdx         "Inline stloc first use temp"
;* V18 tmp12        [V18    ] (  0,  0   )  struct (16) zero-ref    "NewObj constructor temp"
;* V19 tmp13        [V19    ] (  0,  0   )   byref  ->  zero-ref    "Inlining Arg"
;* V20 tmp14        [V20    ] (  0,  0   )  struct ( 8) zero-ref    "NewObj constructor temp"
;* V21 tmp15        [V21    ] (  0,  0   )   byref  ->  zero-ref    V35._pointer(offs=0x00) P-INDEP "field V01._pointer (fldOffset=0x0)"
;* V22 tmp16        [V22    ] (  0,  0   )     int  ->  zero-ref    V35._length(offs=0x08) P-INDEP "field V01._length (fldOffset=0x8)"
;  V23 tmp17        [V23,T02] ( 12, 12   )   byref  ->  rcx         V02._pointer(offs=0x00) P-INDEP "field V02._pointer (fldOffset=0x0)"
;  V24 tmp18        [V24,T03] ( 12, 12   )     int  ->   r8         V02._length(offs=0x08) P-INDEP "field V02._length (fldOffset=0x8)"
;  V25 tmp19        [V25,T09] (  2,  5   )   byref  ->  rax         V03._pointer(offs=0x00) P-INDEP "field V03._pointer (fldOffset=0x0)"
;  V26 tmp20        [V26,T07] (  3,  6   )     int  ->  rdx         V03._length(offs=0x08) P-INDEP "field V03._length (fldOffset=0x8)"
;  V27 tmp21        [V27,T21] (  3,  2   )   byref  ->  rcx         V08._pointer(offs=0x00) P-INDEP "field V08._pointer (fldOffset=0x0)"
;  V28 tmp22        [V28,T22] (  3,  2   )     int  ->   r8         V08._length(offs=0x08) P-INDEP "field V08._length (fldOffset=0x8)"
;  V29 tmp23        [V29,T31] (  2,  1   )   byref  ->  rcx         V10._value(offs=0x00) P-INDEP "field V10._value (fldOffset=0x0)"
;  V30 tmp24        [V30,T24] (  2,  2   )   byref  ->  rax         V14._pointer(offs=0x00) P-INDEP "field V14._pointer (fldOffset=0x0)"
;  V31 tmp25        [V31,T29] (  2,  2   )     int  ->  rdx         V14._length(offs=0x08) P-INDEP "field V14._length (fldOffset=0x8)"
;  V32 tmp26        [V32,T25] (  2,  2   )   byref  ->  rax         V18._pointer(offs=0x00) P-INDEP "field V18._pointer (fldOffset=0x0)"
;  V33 tmp27        [V33,T30] (  2,  2   )     int  ->  rdx         V18._length(offs=0x08) P-INDEP "field V18._length (fldOffset=0x8)"
;  V34 tmp28        [V34,T26] (  2,  2   )   byref  ->  rax         V20._value(offs=0x00) P-INDEP "field V20._value (fldOffset=0x0)"
;* V35 tmp29        [V35    ] (  0,  0   )  struct (16) zero-ref    "Promoted implicit byref"
;  V36 cse0         [V36,T10] (  2,  5   )   byref  ->  r11         "ValNumCSE"
;  V37 cse1         [V37,T11] (  2,  5   )   byref  ->  rsi         "ValNumCSE"
;  V38 cse2         [V38,T12] (  2,  5   )   byref  ->  rdi         "ValNumCSE"
;  V39 cse3         [V39,T13] (  2,  5   )   byref  ->  rbx         "ValNumCSE"
;  V40 cse4         [V40,T14] (  2,  5   )   byref  ->  rbp         "ValNumCSE"
;  V41 cse5         [V41,T15] (  2,  5   )   byref  ->  r14         "ValNumCSE"
;  V42 cse6         [V42,T16] (  2,  5   )   byref  ->  r15         "ValNumCSE"
;  V43 cse7         [V43,T17] (  2,  5   )   byref  ->  r12         "ValNumCSE"
;  V44 cse8         [V44,T18] (  2,  5   )   byref  ->  r13         "ValNumCSE"
;  V45 cse9         [V45,T19] (  2,  5   )   byref  ->  rcx         "ValNumCSE"
;  V46 cse10        [V46,T20] (  2,  5   )   byref  ->  r10         "ValNumCSE"
;  V47 cse11        [V47,T04] (  3, 12   )   byref  ->   r8         "ValNumCSE"
;* V48 cse12        [V48    ] (  0,  0   )    long  ->  zero-ref    "ValNumCSE"
;
; Lcl frame size = 40

G_M18346_IG02:
       mov      rax, gword ptr [rcx+8]
       test     rax, rax
       jne      SHORT G_M18346_IG03
       xor      rcx, rcx
       xor      r8d, r8d
       jmp      SHORT G_M18346_IG04

G_M18346_IG03:
       lea      rcx, bword ptr [rax+16]
       mov      r8d, dword ptr [rax+8]

G_M18346_IG04:
       mov      rax, bword ptr [rdx]
       mov      edx, dword ptr [rdx+8]
       shr      edx, 4
       xor      r9d, r9d
       test     edx, edx
       jle      G_M18346_IG06
       cmp      r8d, 0
       jbe      G_M18346_IG08
       mov      r10, rcx
       cmp      r8d, 1
       jbe      G_M18346_IG08
       lea      r11, bword ptr [rcx+16]
       cmp      r8d, 2
       jbe      G_M18346_IG08
       lea      rsi, bword ptr [rcx+32]
       cmp      r8d, 3
       jbe      G_M18346_IG08
       lea      rdi, bword ptr [rcx+48]
       cmp      r8d, 4
       jbe      G_M18346_IG08
       lea      rbx, bword ptr [rcx+64]
       cmp      r8d, 5
       jbe      G_M18346_IG08
       lea      rbp, bword ptr [rcx+80]
       cmp      r8d, 6
       jbe      G_M18346_IG08
       lea      r14, bword ptr [rcx+96]
       cmp      r8d, 7
       jbe      G_M18346_IG08
       lea      r15, bword ptr [rcx+112]
       cmp      r8d, 8
       jbe      G_M18346_IG08
       lea      r12, bword ptr [rcx+128]
       cmp      r8d, 9
       jbe      G_M18346_IG08
       lea      r13, bword ptr [rcx+144]
       cmp      r8d, 10
       jbe      SHORT G_M18346_IG08
       add      rcx, 160

G_M18346_IG05:
       movsxd   r8, r9d
       shl      r8, 4
       add      r8, rax
       vmovupd  xmm0, xmmword ptr [r8]
       vpxor    xmm0, xmm0, xmmword ptr [r10]
       vaesenc  xmm0, xmm0, xmmword ptr [r11]
       vaesenc  xmm0, xmm0, xmmword ptr [rsi]
       vaesenc  xmm0, xmm0, xmmword ptr [rdi]
       vaesenc  xmm0, xmm0, xmmword ptr [rbx]
       vaesenc  xmm0, xmm0, xmmword ptr [rbp]
       vaesenc  xmm0, xmm0, xmmword ptr [r14]
       vaesenc  xmm0, xmm0, xmmword ptr [r15]
       vaesenc  xmm0, xmm0, xmmword ptr [r12]
       vaesenc  xmm0, xmm0, xmmword ptr [r13]
       vaesenclast xmm0, xmm0, xmmword ptr [rcx]
       vmovupd  xmmword ptr [r8], xmm0
       inc      r9d
       cmp      r9d, edx
       jl       SHORT G_M18346_IG05

G_M18346_IG06:
       add      rsp, 40
       pop      rbx
       pop      rbp
       pop      rsi
       pop      rdi
       pop      r12
       pop      r13
       pop      r14
       pop      r15
       ret      

G_M18346_IG07:
       call     CORINFO_HELP_OVERFLOW
       int3     

; Total bytes of code 336, prolog size 19 for method AesContext:EncryptEcb(struct):this
; ============================================================
Array
ECB encryption: 4.549 GB/s
ECB encryption blocked: 799.295 MB/s
ECB decryption: 5.416 GB/s
ECB decryption blocked: 800.834 MB/s

/// <summary>
/// Encrypts <paramref name="data"/> in place, one 128-bit vector block at a time,
/// using the keys in RoundKeys (array variant of the repro).
/// </summary>
/// <param name="data">Byte buffer that is viewed as 128-bit vectors and overwritten with the result.</param>
public void EncryptEcb(Span<byte> data)
{
	// Array variant: keys is the array itself rather than a Span over it. In the JIT
	// listing that follows, a length check precedes every keys[n] access inside the loop.
	Vector128<byte>[] keys = RoundKeys;
	// View the byte buffer as 128-bit vectors so that each element is one block.
	Span<Vector128<byte>> blocks = MemoryMarshal.Cast<byte, Vector128<byte>>(data);

	for (int i = 0; i < blocks.Length; i++)
	{
		Vector128<byte> b = blocks[i];

		// XOR with keys[0], then nine Aes.Encrypt steps with keys[1]..keys[9],
		// then Aes.EncryptLast with keys[10].
		b = Sse2.Xor(b, keys[0]);
		b = Aes.Encrypt(b, keys[1]);
		b = Aes.Encrypt(b, keys[2]);
		b = Aes.Encrypt(b, keys[3]);
		b = Aes.Encrypt(b, keys[4]);
		b = Aes.Encrypt(b, keys[5]);
		b = Aes.Encrypt(b, keys[6]);
		b = Aes.Encrypt(b, keys[7]);
		b = Aes.Encrypt(b, keys[8]);
		b = Aes.Encrypt(b, keys[9]);
		b = Aes.EncryptLast(b, keys[10]);

		// Store the result back over the input block.
		blocks[i] = b;
	}
}


; Assembly listing for method AesContext:EncryptEcb(struct):this
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; fully interruptible
; Final local variable assignments
;
;  V00 this         [V00,T07] (  3,  3   )     ref  ->  rcx         this class-hnd
;  V01 arg1         [V01,T05] (  4,  8   )   byref  ->  rdx        
;  V02 loc0         [V02,T01] ( 13, 46   )     ref  ->  rax         class-hnd
;* V03 loc1         [V03    ] (  0,  0   )  struct (16) zero-ref    ld-addr-op
;  V04 loc2         [V04,T03] (  5, 17   )     int  ->   r8        
;  V05 loc3         [V05,T00] ( 24, 96   )  simd16  ->  mm0        
;  V06 OutArgs      [V06    ] (  1,  1   )  lclBlk (32) [rsp+0x00]   "OutgoingArgSpace"
;* V07 tmp1         [V07,T16] (  0,  0   )     int  ->  zero-ref    "Inline stloc first use temp"
;* V08 tmp2         [V08,T17] (  0,  0   )     int  ->  zero-ref    "Inline stloc first use temp"
;* V09 tmp3         [V09    ] (  0,  0   )  struct (16) zero-ref    ld-addr-op "Inlining Arg"
;  V10 tmp4         [V10,T12] (  2,  2   )     int  ->  rdx         "Inline stloc first use temp"
;* V11 tmp5         [V11    ] (  0,  0   )    long  ->  zero-ref    "Inline stloc first use temp"
;  V12 tmp6         [V12,T13] (  2,  2   )     int  ->  rdx         "Inline stloc first use temp"
;* V13 tmp7         [V13    ] (  0,  0   )  struct (16) zero-ref    "NewObj constructor temp"
;* V14 tmp8         [V14    ] (  0,  0   )   byref  ->  zero-ref    "Inlining Arg"
;* V15 tmp9         [V15    ] (  0,  0   )  struct ( 8) zero-ref    "NewObj constructor temp"
;* V16 tmp10        [V16    ] (  0,  0   )   byref  ->  zero-ref    V25._pointer(offs=0x00) P-INDEP "field V01._pointer (fldOffset=0x0)"
;* V17 tmp11        [V17    ] (  0,  0   )     int  ->  zero-ref    V25._length(offs=0x08) P-INDEP "field V01._length (fldOffset=0x8)"
;  V18 tmp12        [V18,T08] (  2,  5   )   byref  ->  rcx         V03._pointer(offs=0x00) P-INDEP "field V03._pointer (fldOffset=0x0)"
;  V19 tmp13        [V19,T06] (  3,  6   )     int  ->  rdx         V03._length(offs=0x08) P-INDEP "field V03._length (fldOffset=0x8)"
;  V20 tmp14        [V20,T09] (  2,  2   )   byref  ->  rcx         V09._pointer(offs=0x00) P-INDEP "field V09._pointer (fldOffset=0x0)"
;  V21 tmp15        [V21,T14] (  2,  2   )     int  ->  rdx         V09._length(offs=0x08) P-INDEP "field V09._length (fldOffset=0x8)"
;  V22 tmp16        [V22,T10] (  2,  2   )   byref  ->  rcx         V13._pointer(offs=0x00) P-INDEP "field V13._pointer (fldOffset=0x0)"
;  V23 tmp17        [V23,T15] (  2,  2   )     int  ->  rdx         V13._length(offs=0x08) P-INDEP "field V13._length (fldOffset=0x8)"
;  V24 tmp18        [V24,T11] (  2,  2   )   byref  ->  rcx         V15._value(offs=0x00) P-INDEP "field V15._value (fldOffset=0x0)"
;* V25 tmp19        [V25    ] (  0,  0   )  struct (16) zero-ref    "Promoted implicit byref"
;  V26 cse0         [V26,T04] (  3, 12   )   byref  ->  r10         "ValNumCSE"
;* V27 cse1         [V27    ] (  0,  0   )    long  ->  zero-ref    "ValNumCSE"
;  V28 cse2         [V28,T02] ( 12, 45   )     int  ->   r9         "ValNumCSE"
;
; Lcl frame size = 40

G_M18342_IG02:
       mov      rax, gword ptr [rcx+8]
       mov      rcx, bword ptr [rdx]
       mov      edx, dword ptr [rdx+8]
       shr      edx, 4
       xor      r8d, r8d
       test     edx, edx
       jle      G_M18342_IG04
       mov      r9d, dword ptr [rax+8]

G_M18342_IG03:
       movsxd   r10, r8d
       shl      r10, 4
       add      r10, rcx
       vmovupd  xmm0, xmmword ptr [r10]
       cmp      r9d, 0
       jbe      G_M18342_IG06
       vpxor    xmm0, xmm0, xmmword ptr [rax+16]
       cmp      r9d, 1
       jbe      G_M18342_IG06
       vaesenc  xmm0, xmm0, xmmword ptr [rax+32]
       cmp      r9d, 2
       jbe      G_M18342_IG06
       vaesenc  xmm0, xmm0, xmmword ptr [rax+48]
       cmp      r9d, 3
       jbe      G_M18342_IG06
       vaesenc  xmm0, xmm0, xmmword ptr [rax+64]
       cmp      r9d, 4
       jbe      SHORT G_M18342_IG06
       vaesenc  xmm0, xmm0, xmmword ptr [rax+80]
       cmp      r9d, 5
       jbe      SHORT G_M18342_IG06
       vaesenc  xmm0, xmm0, xmmword ptr [rax+96]
       cmp      r9d, 6
       jbe      SHORT G_M18342_IG06
       vaesenc  xmm0, xmm0, xmmword ptr [rax+112]
       cmp      r9d, 7
       jbe      SHORT G_M18342_IG06
       vaesenc  xmm0, xmm0, xmmword ptr [rax+128]
       cmp      r9d, 8
       jbe      SHORT G_M18342_IG06
       vaesenc  xmm0, xmm0, xmmword ptr [rax+144]
       cmp      r9d, 9
       jbe      SHORT G_M18342_IG06
       vaesenc  xmm0, xmm0, xmmword ptr [rax+160]
       cmp      r9d, 10
       jbe      SHORT G_M18342_IG06
       vaesenclast xmm0, xmm0, xmmword ptr [rax+176]
       vmovupd  xmmword ptr [r10], xmm0
       inc      r8d
       cmp      r8d, edx
       jl       G_M18342_IG03

G_M18342_IG04:
       add      rsp, 40
       ret      

G_M18342_IG05:
       call     CORINFO_HELP_OVERFLOW
       int3     

; Total bytes of code 243, prolog size 7 for method AesContext:EncryptEcb(struct):this
; ============================================================
Array with early access
ECB encryption: 5.833 GB/s
ECB encryption blocked: 802.582 MB/s
ECB decryption: 5.74 GB/s
ECB decryption blocked: 802.291 MB/s

/// <summary>
/// Encrypts <paramref name="data"/> in place, one 128-bit vector block at a time,
/// using the keys in RoundKeys (array variant with an up-front read of keys[10]).
/// </summary>
/// <param name="data">Byte buffer that is viewed as 128-bit vectors and overwritten with the result.</param>
public void EncryptEcb(Span<byte> data)
{
	// Array variant: keys is the array itself rather than a Span over it.
	Vector128<byte>[] keys = RoundKeys;
	// View the byte buffer as 128-bit vectors so that each element is one block.
	Span<Vector128<byte>> blocks = MemoryMarshal.Cast<byte, Vector128<byte>>(data);

	// Makes the JIT remove all the other range checks on keys
	// (key10 is never used afterwards; only the access to the highest index matters).
	Vector128<byte> key10 = keys[10];

	for (int i = 0; i < blocks.Length; i++)
	{
		Vector128<byte> b = blocks[i];

		// XOR with keys[0], then nine Aes.Encrypt steps with keys[1]..keys[9],
		// then Aes.EncryptLast with keys[10].
		b = Sse2.Xor(b, keys[0]);
		b = Aes.Encrypt(b, keys[1]);
		b = Aes.Encrypt(b, keys[2]);
		b = Aes.Encrypt(b, keys[3]);
		b = Aes.Encrypt(b, keys[4]);
		b = Aes.Encrypt(b, keys[5]);
		b = Aes.Encrypt(b, keys[6]);
		b = Aes.Encrypt(b, keys[7]);
		b = Aes.Encrypt(b, keys[8]);
		b = Aes.Encrypt(b, keys[9]);
		b = Aes.EncryptLast(b, keys[10]);

		// Store the result back over the input block.
		blocks[i] = b;
	}
}


; Assembly listing for method AesContext:EncryptEcb(struct):this
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; fully interruptible
; Final local variable assignments
;
;  V00 this         [V00,T06] (  3,  3   )     ref  ->  rcx         this class-hnd
;  V01 arg1         [V01,T04] (  4,  8   )   byref  ->  rdx        
;  V02 loc0         [V02,T01] ( 13, 46   )     ref  ->  rax         class-hnd
;* V03 loc1         [V03    ] (  0,  0   )  struct (16) zero-ref    ld-addr-op
;  V04 loc2         [V04,T02] (  5, 17   )     int  ->   r8        
;  V05 loc3         [V05,T00] ( 24, 96   )  simd16  ->  mm0        
;  V06 OutArgs      [V06    ] (  1,  1   )  lclBlk (32) [rsp+0x00]   "OutgoingArgSpace"
;* V07 tmp1         [V07,T16] (  0,  0   )     int  ->  zero-ref    "Inline stloc first use temp"
;* V08 tmp2         [V08,T17] (  0,  0   )     int  ->  zero-ref    "Inline stloc first use temp"
;* V09 tmp3         [V09    ] (  0,  0   )  struct (16) zero-ref    ld-addr-op "Inlining Arg"
;  V10 tmp4         [V10,T11] (  2,  2   )     int  ->  rdx         "Inline stloc first use temp"
;* V11 tmp5         [V11    ] (  0,  0   )    long  ->  zero-ref    "Inline stloc first use temp"
;  V12 tmp6         [V12,T12] (  2,  2   )     int  ->  rdx         "Inline stloc first use temp"
;* V13 tmp7         [V13    ] (  0,  0   )  struct (16) zero-ref    "NewObj constructor temp"
;* V14 tmp8         [V14    ] (  0,  0   )   byref  ->  zero-ref    "Inlining Arg"
;* V15 tmp9         [V15    ] (  0,  0   )  struct ( 8) zero-ref    "NewObj constructor temp"
;* V16 tmp10        [V16    ] (  0,  0   )   byref  ->  zero-ref    V25._pointer(offs=0x00) P-INDEP "field V01._pointer (fldOffset=0x0)"
;* V17 tmp11        [V17    ] (  0,  0   )     int  ->  zero-ref    V25._length(offs=0x08) P-INDEP "field V01._length (fldOffset=0x8)"
;  V18 tmp12        [V18,T07] (  2,  5   )   byref  ->  rcx         V03._pointer(offs=0x00) P-INDEP "field V03._pointer (fldOffset=0x0)"
;  V19 tmp13        [V19,T05] (  3,  6   )     int  ->  rdx         V03._length(offs=0x08) P-INDEP "field V03._length (fldOffset=0x8)"
;  V20 tmp14        [V20,T08] (  2,  2   )   byref  ->  rcx         V09._pointer(offs=0x00) P-INDEP "field V09._pointer (fldOffset=0x0)"
;  V21 tmp15        [V21,T13] (  2,  2   )     int  ->  rdx         V09._length(offs=0x08) P-INDEP "field V09._length (fldOffset=0x8)"
;  V22 tmp16        [V22,T09] (  2,  2   )   byref  ->  rcx         V13._pointer(offs=0x00) P-INDEP "field V13._pointer (fldOffset=0x0)"
;  V23 tmp17        [V23,T14] (  2,  2   )     int  ->  rdx         V13._length(offs=0x08) P-INDEP "field V13._length (fldOffset=0x8)"
;  V24 tmp18        [V24,T10] (  2,  2   )   byref  ->  rcx         V15._value(offs=0x00) P-INDEP "field V15._value (fldOffset=0x0)"
;* V25 tmp19        [V25    ] (  0,  0   )  struct (16) zero-ref    "Promoted implicit byref"
;  V26 cse0         [V26,T03] (  3, 12   )   byref  ->   r9         "ValNumCSE"
;* V27 cse1         [V27    ] (  0,  0   )    long  ->  zero-ref    "ValNumCSE"
;  V28 cse2         [V28,T15] (  2,  2   )     int  ->   r8         "ValNumCSE"
;
; Lcl frame size = 40

G_M18342_IG02:
       mov      rax, gword ptr [rcx+8]
       mov      rcx, bword ptr [rdx]
       mov      edx, dword ptr [rdx+8]
       shr      edx, 4
       mov      r8d, dword ptr [rax+8]
       cmp      r8d, 10
       jbe      G_M18342_IG06
       xor      r8d, r8d
       test     edx, edx
       jle      SHORT G_M18342_IG04

G_M18342_IG03:
       movsxd   r9, r8d
       shl      r9, 4
       add      r9, rcx
       vmovupd  xmm0, xmmword ptr [r9]
       vpxor    xmm0, xmm0, xmmword ptr [rax+16]
       vaesenc  xmm0, xmm0, xmmword ptr [rax+32]
       vaesenc  xmm0, xmm0, xmmword ptr [rax+48]
       vaesenc  xmm0, xmm0, xmmword ptr [rax+64]
       vaesenc  xmm0, xmm0, xmmword ptr [rax+80]
       vaesenc  xmm0, xmm0, xmmword ptr [rax+96]
       vaesenc  xmm0, xmm0, xmmword ptr [rax+112]
       vaesenc  xmm0, xmm0, xmmword ptr [rax+128]
       vaesenc  xmm0, xmm0, xmmword ptr [rax+144]
       vaesenc  xmm0, xmm0, xmmword ptr [rax+160]
       vaesenclast xmm0, xmm0, xmmword ptr [rax+176]
       vmovupd  xmmword ptr [r9], xmm0
       inc      r8d
       cmp      r8d, edx
       jl       SHORT G_M18342_IG03

G_M18342_IG04:
       add      rsp, 40
       ret      

G_M18342_IG05:
       call     CORINFO_HELP_OVERFLOW
       int3     

; Total bytes of code 163, prolog size 7 for method AesContext:EncryptEcb(struct):this
; ============================================================

In the last example the JIT omits the range checks for all subsequent array accesses. Getting a 25% speedup from inserting a functionally useless line may be unintuitive to those unfamiliar with the language.

On the other hand, if the compiled code automatically checked the entire range beforehand, it might change the code's behavior if an invalid access was attempted. It would also be difficult to prove that only doing one range check would result in functionally equivalent code.

category:cq
theme:bounds-checks
skill-level:intermediate
cost:medium
impact:medium

Metadata

Metadata

Assignees

No one assigned

    Labels

    area-CodeGen-coreclr (CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI), optimization, tenet-performance (Performance related issue)

    Type

    No type

    Projects

    No projects

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions