-
Notifications
You must be signed in to change notification settings - Fork 5.4k
Description
In the following situation, changing the type of a local variable from Span<Vector128<byte>> to Vector128<byte>[] results in performance that is ~80% of the original.
The standard benchmark results are obtained by passing one large array to the encryption function. The "blocked" results are obtained by calling the encryption function on a single 0x10-byte AES block at a time.
Span:
ECB encryption: 5.708 GB/s
ECB encryption blocked: 775.686 MB/s
Array:
ECB encryption: 4.549 GB/s
ECB encryption blocked: 799.295 MB/s
// Round keys consumed by the encryption routine below, which reads entries 0 through 10.
private Vector128<byte>[] RoundKeys { get; }
/// <summary>
/// Encrypts <paramref name="data"/> in place, one 16-byte block per loop iteration.
/// This is the variant in which <c>keys</c> is typed as a span.
/// </summary>
/// <param name="data">Buffer that is reinterpreted as 128-bit vectors and overwritten with the result.</param>
public void EncryptAesEcb(Span<byte> data)
{
// Implicit array-to-span conversion; the type of this local is the only thing the report varies.
Span<Vector128<byte>> keys = RoundKeys;
// View the byte buffer as 128-bit blocks without copying.
// NOTE(review): the JIT listing for this method computes the block count as length >> 4,
// so trailing bytes that do not fill a whole block appear to be left untouched - confirm intended.
Span<Vector128<byte>> blocks = MemoryMarshal.Cast<byte, Vector128<byte>>(data);
for (int i = 0; i < blocks.Length; i++)
{
Vector128<byte> b = blocks[i];
// Combine the block with keys[0], apply Aes.Encrypt with keys[1]..keys[9],
// then Aes.EncryptLast with keys[10]. Every keys[n] access is a range-checked index.
b = Sse2.Xor(b, keys[0]);
b = Aes.Encrypt(b, keys[1]);
b = Aes.Encrypt(b, keys[2]);
b = Aes.Encrypt(b, keys[3]);
b = Aes.Encrypt(b, keys[4]);
b = Aes.Encrypt(b, keys[5]);
b = Aes.Encrypt(b, keys[6]);
b = Aes.Encrypt(b, keys[7]);
b = Aes.Encrypt(b, keys[8]);
b = Aes.Encrypt(b, keys[9]);
b = Aes.EncryptLast(b, keys[10]);
// Write the processed block back over the input.
blocks[i] = b;
}
}
Replace Span<Vector128<byte>> keys = RoundKeys; with Vector128<byte>[] keys = RoundKeys; to get the second set of results.
In addition to that, inserting the line Vector128<byte> key10 = keys[10]; somewhere before the loop gives the performance:
ECB encryption: 5.833 GB/s
ECB encryption blocked: 802.582 MB/s
Here's the complete code and JIT output:
Span
ECB encryption: 5.708 GB/s
ECB encryption blocked: 775.686 MB/s
ECB decryption: 5.394 GB/s
ECB decryption blocked: 765.59 MB/s
/// <summary>
/// Span variant: encrypts <paramref name="data"/> in place, one 16-byte block per iteration,
/// reading the round keys through a <c>Span&lt;Vector128&lt;byte&gt;&gt;</c> local.
/// </summary>
/// <param name="data">Buffer that is reinterpreted as 128-bit vectors and overwritten with the result.</param>
public void EncryptEcb(Span<byte> data)
{
// Implicit array-to-span conversion of the RoundKeys array.
Span<Vector128<byte>> keys = RoundKeys;
// View the byte buffer as 128-bit blocks without copying.
Span<Vector128<byte>> blocks = MemoryMarshal.Cast<byte, Vector128<byte>>(data);
for (int i = 0; i < blocks.Length; i++)
{
Vector128<byte> b = blocks[i];
// keys[0] is XORed in, keys[1]..keys[9] go through Aes.Encrypt, keys[10] through Aes.EncryptLast.
// In the listing that follows, the range checks for these eleven accesses sit before the loop.
b = Sse2.Xor(b, keys[0]);
b = Aes.Encrypt(b, keys[1]);
b = Aes.Encrypt(b, keys[2]);
b = Aes.Encrypt(b, keys[3]);
b = Aes.Encrypt(b, keys[4]);
b = Aes.Encrypt(b, keys[5]);
b = Aes.Encrypt(b, keys[6]);
b = Aes.Encrypt(b, keys[7]);
b = Aes.Encrypt(b, keys[8]);
b = Aes.Encrypt(b, keys[9]);
b = Aes.EncryptLast(b, keys[10]);
// Write the processed block back over the input.
blocks[i] = b;
}
}
; Assembly listing for method AesContext:EncryptEcb(struct):this
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; fully interruptible
; Final local variable assignments
;
; V00 this [V00,T08] ( 3, 3 ) ref -> rcx this class-hnd
; V01 arg1 [V01,T05] ( 4, 8 ) byref -> rdx
;* V02 loc0 [V02 ] ( 0, 0 ) struct (16) zero-ref ld-addr-op
;* V03 loc1 [V03 ] ( 0, 0 ) struct (16) zero-ref ld-addr-op
; V04 loc2 [V04,T01] ( 5, 17 ) int -> r9
; V05 loc3 [V05,T00] ( 24, 96 ) simd16 -> mm0
; V06 OutArgs [V06 ] ( 1, 1 ) lclBlk (32) [rsp+0x00] "OutgoingArgSpace"
; V07 tmp1 [V07,T06] ( 4, 6 ) ref -> rax class-hnd "Inlining Arg"
;* V08 tmp2 [V08 ] ( 0, 0 ) struct (16) zero-ref "NewObj constructor temp"
;* V09 tmp3 [V09 ] ( 0, 0 ) simd16 -> zero-ref ld-addr-op "Inline ldloca(s) first use temp"
;* V10 tmp4 [V10 ] ( 0, 0 ) struct ( 8) zero-ref "NewObj constructor temp"
; V11 tmp5 [V11,T23] ( 2, 2 ) byref -> rcx "Inlining Arg"
;* V12 tmp6 [V12,T32] ( 0, 0 ) int -> zero-ref "Inline stloc first use temp"
;* V13 tmp7 [V13,T33] ( 0, 0 ) int -> zero-ref "Inline stloc first use temp"
;* V14 tmp8 [V14 ] ( 0, 0 ) struct (16) zero-ref ld-addr-op "Inlining Arg"
; V15 tmp9 [V15,T27] ( 2, 2 ) int -> rdx "Inline stloc first use temp"
;* V16 tmp10 [V16 ] ( 0, 0 ) long -> zero-ref "Inline stloc first use temp"
; V17 tmp11 [V17,T28] ( 2, 2 ) int -> rdx "Inline stloc first use temp"
;* V18 tmp12 [V18 ] ( 0, 0 ) struct (16) zero-ref "NewObj constructor temp"
;* V19 tmp13 [V19 ] ( 0, 0 ) byref -> zero-ref "Inlining Arg"
;* V20 tmp14 [V20 ] ( 0, 0 ) struct ( 8) zero-ref "NewObj constructor temp"
;* V21 tmp15 [V21 ] ( 0, 0 ) byref -> zero-ref V35._pointer(offs=0x00) P-INDEP "field V01._pointer (fldOffset=0x0)"
;* V22 tmp16 [V22 ] ( 0, 0 ) int -> zero-ref V35._length(offs=0x08) P-INDEP "field V01._length (fldOffset=0x8)"
; V23 tmp17 [V23,T02] ( 12, 12 ) byref -> rcx V02._pointer(offs=0x00) P-INDEP "field V02._pointer (fldOffset=0x0)"
; V24 tmp18 [V24,T03] ( 12, 12 ) int -> r8 V02._length(offs=0x08) P-INDEP "field V02._length (fldOffset=0x8)"
; V25 tmp19 [V25,T09] ( 2, 5 ) byref -> rax V03._pointer(offs=0x00) P-INDEP "field V03._pointer (fldOffset=0x0)"
; V26 tmp20 [V26,T07] ( 3, 6 ) int -> rdx V03._length(offs=0x08) P-INDEP "field V03._length (fldOffset=0x8)"
; V27 tmp21 [V27,T21] ( 3, 2 ) byref -> rcx V08._pointer(offs=0x00) P-INDEP "field V08._pointer (fldOffset=0x0)"
; V28 tmp22 [V28,T22] ( 3, 2 ) int -> r8 V08._length(offs=0x08) P-INDEP "field V08._length (fldOffset=0x8)"
; V29 tmp23 [V29,T31] ( 2, 1 ) byref -> rcx V10._value(offs=0x00) P-INDEP "field V10._value (fldOffset=0x0)"
; V30 tmp24 [V30,T24] ( 2, 2 ) byref -> rax V14._pointer(offs=0x00) P-INDEP "field V14._pointer (fldOffset=0x0)"
; V31 tmp25 [V31,T29] ( 2, 2 ) int -> rdx V14._length(offs=0x08) P-INDEP "field V14._length (fldOffset=0x8)"
; V32 tmp26 [V32,T25] ( 2, 2 ) byref -> rax V18._pointer(offs=0x00) P-INDEP "field V18._pointer (fldOffset=0x0)"
; V33 tmp27 [V33,T30] ( 2, 2 ) int -> rdx V18._length(offs=0x08) P-INDEP "field V18._length (fldOffset=0x8)"
; V34 tmp28 [V34,T26] ( 2, 2 ) byref -> rax V20._value(offs=0x00) P-INDEP "field V20._value (fldOffset=0x0)"
;* V35 tmp29 [V35 ] ( 0, 0 ) struct (16) zero-ref "Promoted implicit byref"
; V36 cse0 [V36,T10] ( 2, 5 ) byref -> r11 "ValNumCSE"
; V37 cse1 [V37,T11] ( 2, 5 ) byref -> rsi "ValNumCSE"
; V38 cse2 [V38,T12] ( 2, 5 ) byref -> rdi "ValNumCSE"
; V39 cse3 [V39,T13] ( 2, 5 ) byref -> rbx "ValNumCSE"
; V40 cse4 [V40,T14] ( 2, 5 ) byref -> rbp "ValNumCSE"
; V41 cse5 [V41,T15] ( 2, 5 ) byref -> r14 "ValNumCSE"
; V42 cse6 [V42,T16] ( 2, 5 ) byref -> r15 "ValNumCSE"
; V43 cse7 [V43,T17] ( 2, 5 ) byref -> r12 "ValNumCSE"
; V44 cse8 [V44,T18] ( 2, 5 ) byref -> r13 "ValNumCSE"
; V45 cse9 [V45,T19] ( 2, 5 ) byref -> rcx "ValNumCSE"
; V46 cse10 [V46,T20] ( 2, 5 ) byref -> r10 "ValNumCSE"
; V47 cse11 [V47,T04] ( 3, 12 ) byref -> r8 "ValNumCSE"
;* V48 cse12 [V48 ] ( 0, 0 ) long -> zero-ref "ValNumCSE"
;
; Lcl frame size = 40
; Span variant. The prolog is not part of this listing. Per the local variable
; table above: rcx = this, rdx = address of the Span<byte> argument.
G_M18346_IG02:
; rax = object reference loaded from this+8 (presumably the RoundKeys array).
mov rax, gword ptr [rcx+8]
; Array-to-span conversion: a null reference becomes pointer 0 / length 0.
test rax, rax
jne SHORT G_M18346_IG03
xor rcx, rcx
xor r8d, r8d
jmp SHORT G_M18346_IG04
G_M18346_IG03:
; Non-null: rcx = keys pointer (reference + 16), r8d = keys length (reference + 8).
lea rcx, bword ptr [rax+16]
mov r8d, dword ptr [rax+8]
G_M18346_IG04:
; rax = data pointer; edx = data length >> 4, i.e. the number of 16-byte blocks.
mov rax, bword ptr [rdx]
mov edx, dword ptr [rdx+8]
shr edx, 4
; r9d = loop index i = 0; jump straight to the epilog when there are no blocks.
xor r9d, r9d
test edx, edx
jle G_M18346_IG06
; The eleven range checks on keys (indices 0..10 against the length in r8d) are
; all performed once here, ahead of the loop. After each passing check the
; element address is kept in its own register for use inside the loop.
; The failure target G_M18346_IG08 is not included in this listing.
cmp r8d, 0
jbe G_M18346_IG08
mov r10, rcx
cmp r8d, 1
jbe G_M18346_IG08
lea r11, bword ptr [rcx+16]
cmp r8d, 2
jbe G_M18346_IG08
lea rsi, bword ptr [rcx+32]
cmp r8d, 3
jbe G_M18346_IG08
lea rdi, bword ptr [rcx+48]
cmp r8d, 4
jbe G_M18346_IG08
lea rbx, bword ptr [rcx+64]
cmp r8d, 5
jbe G_M18346_IG08
lea rbp, bword ptr [rcx+80]
cmp r8d, 6
jbe G_M18346_IG08
lea r14, bword ptr [rcx+96]
cmp r8d, 7
jbe G_M18346_IG08
lea r15, bword ptr [rcx+112]
cmp r8d, 8
jbe G_M18346_IG08
lea r12, bword ptr [rcx+128]
cmp r8d, 9
jbe G_M18346_IG08
lea r13, bword ptr [rcx+144]
cmp r8d, 10
jbe SHORT G_M18346_IG08
; rcx now addresses keys[10]; r10 keeps the address of keys[0].
add rcx, 160
G_M18346_IG05:
; r8 = rax + i * 16 = address of blocks[i]. The keys length that lived in r8d
; is no longer needed at this point, so the register is reused.
movsxd r8, r9d
shl r8, 4
add r8, rax
; Check-free loop body: load the block, XOR with keys[0], nine vaesenc steps
; with keys[1]..keys[9], vaesenclast with keys[10], then store the block back.
vmovupd xmm0, xmmword ptr [r8]
vpxor xmm0, xmm0, xmmword ptr [r10]
vaesenc xmm0, xmm0, xmmword ptr [r11]
vaesenc xmm0, xmm0, xmmword ptr [rsi]
vaesenc xmm0, xmm0, xmmword ptr [rdi]
vaesenc xmm0, xmm0, xmmword ptr [rbx]
vaesenc xmm0, xmm0, xmmword ptr [rbp]
vaesenc xmm0, xmm0, xmmword ptr [r14]
vaesenc xmm0, xmm0, xmmword ptr [r15]
vaesenc xmm0, xmm0, xmmword ptr [r12]
vaesenc xmm0, xmm0, xmmword ptr [r13]
vaesenclast xmm0, xmm0, xmmword ptr [rcx]
vmovupd xmmword ptr [r8], xmm0
; i++ and repeat while i < block count.
inc r9d
cmp r9d, edx
jl SHORT G_M18346_IG05
G_M18346_IG06:
; Epilog: release the 40-byte frame and restore the eight callee-saved
; registers that were used above to hold key element addresses.
add rsp, 40
pop rbx
pop rbp
pop rsi
pop rdi
pop r12
pop r13
pop r14
pop r15
ret
G_M18346_IG07:
; Overflow helper call; no branch to this label is visible in the listing.
call CORINFO_HELP_OVERFLOW
int3
; Total bytes of code 336, prolog size 19 for method AesContext:EncryptEcb(struct):this
; ============================================================
Array
ECB encryption: 4.549 GB/s
ECB encryption blocked: 799.295 MB/s
ECB decryption: 5.416 GB/s
ECB decryption blocked: 800.834 MB/s
/// <summary>
/// Array variant: encrypts <paramref name="data"/> in place, one 16-byte block per iteration,
/// reading the round keys directly from the <c>Vector128&lt;byte&gt;[]</c> array.
/// </summary>
/// <param name="data">Buffer that is reinterpreted as 128-bit vectors and overwritten with the result.</param>
public void EncryptEcb(Span<byte> data)
{
// Plain array reference; no span is created for the keys in this variant.
Vector128<byte>[] keys = RoundKeys;
// View the byte buffer as 128-bit blocks without copying.
Span<Vector128<byte>> blocks = MemoryMarshal.Cast<byte, Vector128<byte>>(data);
for (int i = 0; i < blocks.Length; i++)
{
Vector128<byte> b = blocks[i];
// keys[0] is XORed in, keys[1]..keys[9] go through Aes.Encrypt, keys[10] through Aes.EncryptLast.
// In the listing that follows, each of these eleven accesses keeps its own range check
// inside the loop.
b = Sse2.Xor(b, keys[0]);
b = Aes.Encrypt(b, keys[1]);
b = Aes.Encrypt(b, keys[2]);
b = Aes.Encrypt(b, keys[3]);
b = Aes.Encrypt(b, keys[4]);
b = Aes.Encrypt(b, keys[5]);
b = Aes.Encrypt(b, keys[6]);
b = Aes.Encrypt(b, keys[7]);
b = Aes.Encrypt(b, keys[8]);
b = Aes.Encrypt(b, keys[9]);
b = Aes.EncryptLast(b, keys[10]);
// Write the processed block back over the input.
blocks[i] = b;
}
}
; Assembly listing for method AesContext:EncryptEcb(struct):this
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; fully interruptible
; Final local variable assignments
;
; V00 this [V00,T07] ( 3, 3 ) ref -> rcx this class-hnd
; V01 arg1 [V01,T05] ( 4, 8 ) byref -> rdx
; V02 loc0 [V02,T01] ( 13, 46 ) ref -> rax class-hnd
;* V03 loc1 [V03 ] ( 0, 0 ) struct (16) zero-ref ld-addr-op
; V04 loc2 [V04,T03] ( 5, 17 ) int -> r8
; V05 loc3 [V05,T00] ( 24, 96 ) simd16 -> mm0
; V06 OutArgs [V06 ] ( 1, 1 ) lclBlk (32) [rsp+0x00] "OutgoingArgSpace"
;* V07 tmp1 [V07,T16] ( 0, 0 ) int -> zero-ref "Inline stloc first use temp"
;* V08 tmp2 [V08,T17] ( 0, 0 ) int -> zero-ref "Inline stloc first use temp"
;* V09 tmp3 [V09 ] ( 0, 0 ) struct (16) zero-ref ld-addr-op "Inlining Arg"
; V10 tmp4 [V10,T12] ( 2, 2 ) int -> rdx "Inline stloc first use temp"
;* V11 tmp5 [V11 ] ( 0, 0 ) long -> zero-ref "Inline stloc first use temp"
; V12 tmp6 [V12,T13] ( 2, 2 ) int -> rdx "Inline stloc first use temp"
;* V13 tmp7 [V13 ] ( 0, 0 ) struct (16) zero-ref "NewObj constructor temp"
;* V14 tmp8 [V14 ] ( 0, 0 ) byref -> zero-ref "Inlining Arg"
;* V15 tmp9 [V15 ] ( 0, 0 ) struct ( 8) zero-ref "NewObj constructor temp"
;* V16 tmp10 [V16 ] ( 0, 0 ) byref -> zero-ref V25._pointer(offs=0x00) P-INDEP "field V01._pointer (fldOffset=0x0)"
;* V17 tmp11 [V17 ] ( 0, 0 ) int -> zero-ref V25._length(offs=0x08) P-INDEP "field V01._length (fldOffset=0x8)"
; V18 tmp12 [V18,T08] ( 2, 5 ) byref -> rcx V03._pointer(offs=0x00) P-INDEP "field V03._pointer (fldOffset=0x0)"
; V19 tmp13 [V19,T06] ( 3, 6 ) int -> rdx V03._length(offs=0x08) P-INDEP "field V03._length (fldOffset=0x8)"
; V20 tmp14 [V20,T09] ( 2, 2 ) byref -> rcx V09._pointer(offs=0x00) P-INDEP "field V09._pointer (fldOffset=0x0)"
; V21 tmp15 [V21,T14] ( 2, 2 ) int -> rdx V09._length(offs=0x08) P-INDEP "field V09._length (fldOffset=0x8)"
; V22 tmp16 [V22,T10] ( 2, 2 ) byref -> rcx V13._pointer(offs=0x00) P-INDEP "field V13._pointer (fldOffset=0x0)"
; V23 tmp17 [V23,T15] ( 2, 2 ) int -> rdx V13._length(offs=0x08) P-INDEP "field V13._length (fldOffset=0x8)"
; V24 tmp18 [V24,T11] ( 2, 2 ) byref -> rcx V15._value(offs=0x00) P-INDEP "field V15._value (fldOffset=0x0)"
;* V25 tmp19 [V25 ] ( 0, 0 ) struct (16) zero-ref "Promoted implicit byref"
; V26 cse0 [V26,T04] ( 3, 12 ) byref -> r10 "ValNumCSE"
;* V27 cse1 [V27 ] ( 0, 0 ) long -> zero-ref "ValNumCSE"
; V28 cse2 [V28,T02] ( 12, 45 ) int -> r9 "ValNumCSE"
;
; Lcl frame size = 40
; Array variant. The prolog is not part of this listing. Per the local variable
; table above: rcx = this, rdx = address of the Span<byte> argument.
G_M18342_IG02:
; rax = keys array reference loaded from this+8.
mov rax, gword ptr [rcx+8]
; rcx = data pointer; edx = data length >> 4, i.e. the number of 16-byte blocks.
mov rcx, bword ptr [rdx]
mov edx, dword ptr [rdx+8]
shr edx, 4
; r8d = loop index i = 0; jump straight to the epilog when there are no blocks.
xor r8d, r8d
test edx, edx
jle G_M18342_IG04
; r9d = keys length (array reference + 8), read once before the loop.
mov r9d, dword ptr [rax+8]
G_M18342_IG03:
; r10 = rcx + i * 16 = address of blocks[i].
movsxd r10, r8d
shl r10, 4
add r10, rcx
vmovupd xmm0, xmmword ptr [r10]
; Unlike the span variant, every key access below is preceded by its own
; cmp/jbe range check against r9d, and the whole sequence of eleven checks
; is repeated on each loop iteration. Element n is read from
; [rax + 16 + n * 16]. The failure target G_M18342_IG06 is not included in
; this listing.
cmp r9d, 0
jbe G_M18342_IG06
vpxor xmm0, xmm0, xmmword ptr [rax+16]
cmp r9d, 1
jbe G_M18342_IG06
vaesenc xmm0, xmm0, xmmword ptr [rax+32]
cmp r9d, 2
jbe G_M18342_IG06
vaesenc xmm0, xmm0, xmmword ptr [rax+48]
cmp r9d, 3
jbe G_M18342_IG06
vaesenc xmm0, xmm0, xmmword ptr [rax+64]
cmp r9d, 4
jbe SHORT G_M18342_IG06
vaesenc xmm0, xmm0, xmmword ptr [rax+80]
cmp r9d, 5
jbe SHORT G_M18342_IG06
vaesenc xmm0, xmm0, xmmword ptr [rax+96]
cmp r9d, 6
jbe SHORT G_M18342_IG06
vaesenc xmm0, xmm0, xmmword ptr [rax+112]
cmp r9d, 7
jbe SHORT G_M18342_IG06
vaesenc xmm0, xmm0, xmmword ptr [rax+128]
cmp r9d, 8
jbe SHORT G_M18342_IG06
vaesenc xmm0, xmm0, xmmword ptr [rax+144]
cmp r9d, 9
jbe SHORT G_M18342_IG06
vaesenc xmm0, xmm0, xmmword ptr [rax+160]
cmp r9d, 10
jbe SHORT G_M18342_IG06
vaesenclast xmm0, xmm0, xmmword ptr [rax+176]
; Store the processed block back, then i++ and repeat while i < block count.
vmovupd xmmword ptr [r10], xmm0
inc r8d
cmp r8d, edx
jl G_M18342_IG03
G_M18342_IG04:
; Epilog: only the 40-byte frame is released; no registers are popped here.
add rsp, 40
ret
G_M18342_IG05:
; Overflow helper call; no branch to this label is visible in the listing.
call CORINFO_HELP_OVERFLOW
int3
; Total bytes of code 243, prolog size 7 for method AesContext:EncryptEcb(struct):this
; ============================================================
Array with early access
ECB encryption: 5.833 GB/s
ECB encryption blocked: 802.582 MB/s
ECB decryption: 5.74 GB/s
ECB decryption blocked: 802.291 MB/s
/// <summary>
/// Array variant with an early read of <c>keys[10]</c>: encrypts <paramref name="data"/> in place,
/// one 16-byte block per iteration.
/// </summary>
/// <param name="data">Buffer that is reinterpreted as 128-bit vectors and overwritten with the result.</param>
public void EncryptEcb(Span<byte> data)
{
// Plain array reference; no span is created for the keys in this variant.
Vector128<byte>[] keys = RoundKeys;
// View the byte buffer as 128-bit blocks without copying.
Span<Vector128<byte>> blocks = MemoryMarshal.Cast<byte, Vector128<byte>>(data);
// Makes the JIT remove all the other range checks on keys
// The value of key10 is never used afterwards. Because this access sits before the loop,
// its range check runs even when data contains no complete block.
Vector128<byte> key10 = keys[10];
for (int i = 0; i < blocks.Length; i++)
{
Vector128<byte> b = blocks[i];
// keys[0] is XORed in, keys[1]..keys[9] go through Aes.Encrypt, keys[10] through Aes.EncryptLast.
// In the listing that follows, none of these accesses has a range check inside the loop.
b = Sse2.Xor(b, keys[0]);
b = Aes.Encrypt(b, keys[1]);
b = Aes.Encrypt(b, keys[2]);
b = Aes.Encrypt(b, keys[3]);
b = Aes.Encrypt(b, keys[4]);
b = Aes.Encrypt(b, keys[5]);
b = Aes.Encrypt(b, keys[6]);
b = Aes.Encrypt(b, keys[7]);
b = Aes.Encrypt(b, keys[8]);
b = Aes.Encrypt(b, keys[9]);
b = Aes.EncryptLast(b, keys[10]);
// Write the processed block back over the input.
blocks[i] = b;
}
}
; Assembly listing for method AesContext:EncryptEcb(struct):this
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; fully interruptible
; Final local variable assignments
;
; V00 this [V00,T06] ( 3, 3 ) ref -> rcx this class-hnd
; V01 arg1 [V01,T04] ( 4, 8 ) byref -> rdx
; V02 loc0 [V02,T01] ( 13, 46 ) ref -> rax class-hnd
;* V03 loc1 [V03 ] ( 0, 0 ) struct (16) zero-ref ld-addr-op
; V04 loc2 [V04,T02] ( 5, 17 ) int -> r8
; V05 loc3 [V05,T00] ( 24, 96 ) simd16 -> mm0
; V06 OutArgs [V06 ] ( 1, 1 ) lclBlk (32) [rsp+0x00] "OutgoingArgSpace"
;* V07 tmp1 [V07,T16] ( 0, 0 ) int -> zero-ref "Inline stloc first use temp"
;* V08 tmp2 [V08,T17] ( 0, 0 ) int -> zero-ref "Inline stloc first use temp"
;* V09 tmp3 [V09 ] ( 0, 0 ) struct (16) zero-ref ld-addr-op "Inlining Arg"
; V10 tmp4 [V10,T11] ( 2, 2 ) int -> rdx "Inline stloc first use temp"
;* V11 tmp5 [V11 ] ( 0, 0 ) long -> zero-ref "Inline stloc first use temp"
; V12 tmp6 [V12,T12] ( 2, 2 ) int -> rdx "Inline stloc first use temp"
;* V13 tmp7 [V13 ] ( 0, 0 ) struct (16) zero-ref "NewObj constructor temp"
;* V14 tmp8 [V14 ] ( 0, 0 ) byref -> zero-ref "Inlining Arg"
;* V15 tmp9 [V15 ] ( 0, 0 ) struct ( 8) zero-ref "NewObj constructor temp"
;* V16 tmp10 [V16 ] ( 0, 0 ) byref -> zero-ref V25._pointer(offs=0x00) P-INDEP "field V01._pointer (fldOffset=0x0)"
;* V17 tmp11 [V17 ] ( 0, 0 ) int -> zero-ref V25._length(offs=0x08) P-INDEP "field V01._length (fldOffset=0x8)"
; V18 tmp12 [V18,T07] ( 2, 5 ) byref -> rcx V03._pointer(offs=0x00) P-INDEP "field V03._pointer (fldOffset=0x0)"
; V19 tmp13 [V19,T05] ( 3, 6 ) int -> rdx V03._length(offs=0x08) P-INDEP "field V03._length (fldOffset=0x8)"
; V20 tmp14 [V20,T08] ( 2, 2 ) byref -> rcx V09._pointer(offs=0x00) P-INDEP "field V09._pointer (fldOffset=0x0)"
; V21 tmp15 [V21,T13] ( 2, 2 ) int -> rdx V09._length(offs=0x08) P-INDEP "field V09._length (fldOffset=0x8)"
; V22 tmp16 [V22,T09] ( 2, 2 ) byref -> rcx V13._pointer(offs=0x00) P-INDEP "field V13._pointer (fldOffset=0x0)"
; V23 tmp17 [V23,T14] ( 2, 2 ) int -> rdx V13._length(offs=0x08) P-INDEP "field V13._length (fldOffset=0x8)"
; V24 tmp18 [V24,T10] ( 2, 2 ) byref -> rcx V15._value(offs=0x00) P-INDEP "field V15._value (fldOffset=0x0)"
;* V25 tmp19 [V25 ] ( 0, 0 ) struct (16) zero-ref "Promoted implicit byref"
; V26 cse0 [V26,T03] ( 3, 12 ) byref -> r9 "ValNumCSE"
;* V27 cse1 [V27 ] ( 0, 0 ) long -> zero-ref "ValNumCSE"
; V28 cse2 [V28,T15] ( 2, 2 ) int -> r8 "ValNumCSE"
;
; Lcl frame size = 40
; Array variant with the early keys[10] access. The prolog is not part of this
; listing. Per the local variable table above: rcx = this, rdx = address of the
; Span<byte> argument.
G_M18342_IG02:
; rax = keys array reference loaded from this+8.
mov rax, gword ptr [rcx+8]
; rcx = data pointer; edx = data length >> 4, i.e. the number of 16-byte blocks.
mov rcx, bword ptr [rdx]
mov edx, dword ptr [rdx+8]
shr edx, 4
; The only range check in the method: keys length must exceed 10. It comes
; from the keys[10] read placed before the loop and runs before the block
; count is tested. The element itself is not loaded here. The failure target
; G_M18342_IG06 is not included in this listing.
mov r8d, dword ptr [rax+8]
cmp r8d, 10
jbe G_M18342_IG06
; r8d is reused as loop index i = 0; skip the loop when there are no blocks.
xor r8d, r8d
test edx, edx
jle SHORT G_M18342_IG04
G_M18342_IG03:
; r9 = rcx + i * 16 = address of blocks[i].
movsxd r9, r8d
shl r9, 4
add r9, rcx
; Check-free loop body: load the block, XOR with keys[0], nine vaesenc steps
; with keys[1]..keys[9], vaesenclast with keys[10], then store the block back.
; Element n is read from [rax + 16 + n * 16].
vmovupd xmm0, xmmword ptr [r9]
vpxor xmm0, xmm0, xmmword ptr [rax+16]
vaesenc xmm0, xmm0, xmmword ptr [rax+32]
vaesenc xmm0, xmm0, xmmword ptr [rax+48]
vaesenc xmm0, xmm0, xmmword ptr [rax+64]
vaesenc xmm0, xmm0, xmmword ptr [rax+80]
vaesenc xmm0, xmm0, xmmword ptr [rax+96]
vaesenc xmm0, xmm0, xmmword ptr [rax+112]
vaesenc xmm0, xmm0, xmmword ptr [rax+128]
vaesenc xmm0, xmm0, xmmword ptr [rax+144]
vaesenc xmm0, xmm0, xmmword ptr [rax+160]
vaesenclast xmm0, xmm0, xmmword ptr [rax+176]
vmovupd xmmword ptr [r9], xmm0
; i++ and repeat while i < block count.
inc r8d
cmp r8d, edx
jl SHORT G_M18342_IG03
G_M18342_IG04:
; Epilog: only the 40-byte frame is released; no registers are popped here.
add rsp, 40
ret
G_M18342_IG05:
; Overflow helper call; no branch to this label is visible in the listing.
call CORINFO_HELP_OVERFLOW
int3
; Total bytes of code 163, prolog size 7 for method AesContext:EncryptEcb(struct):this
; ============================================================
In the last example the JIT omits the range checks for all subsequent array accesses. Getting a 25% speedup from inserting a functionally useless line may be unintuitive to those unfamiliar with the language.
On the other hand, if the compiled code automatically checked the entire range beforehand, it might change the code's behavior if an invalid access was attempted. It would also be difficult to prove that only doing one range check would result in functionally equivalent code.
category:cq
theme:bounds-checks
skill-level:intermediate
cost:medium
impact:medium