Unroll SpanHelpers.Reverse() for Vector128 #76076
Conversation
Tagging subscribers to this area: @dotnet/area-system-memory
We see the following performance results on an N1 machine: Byte: Char: Int32: These numbers fluctuated from run to run on the given N1 machine. However, the rationale behind this PR is to see whether we want such unrolling changes, which may improve performance at the cost of readability. If this turns out to be useful, it may be worth exploring doing this in the JIT itself in the future.
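For reference, here is a minimal C# sketch of the kind of unrolling being proposed: two Vector128 pairs are processed per iteration, one pair from each end of the buffer. This is illustrative only (the helper name, the `ushort` element type, and the tail handling are assumptions, not the PR's exact code); the offsets and shuffle mask follow the hunks quoted further down in this review.

```csharp
using System.Runtime.Intrinsics;

static class ReverseSketch
{
    // Illustrative sketch: reverse a ushort buffer, handling four Vector128s per iteration.
    public static void ReverseVector128Unrolled(ref ushort buf, nuint length)
    {
        // Byte-shuffle mask that reverses the 8 ushort lanes of a Vector128.
        Vector128<byte> mask = Vector128.Create(
            (byte)14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);

        nuint numElements = (nuint)Vector128<ushort>.Count;   // 8
        nuint numIters = (length / numElements) / 4;          // 4 vectors consumed per iteration

        for (nuint i = 0; i < numIters; i++)
        {
            nuint firstOffset = i * numElements * 2;
            nuint secondOffset = firstOffset + numElements;
            nuint lastOffset = length - firstOffset - numElements;
            nuint secondLastOffset = lastOffset - numElements;

            Vector128<ushort> first = Vector128.LoadUnsafe(ref buf, firstOffset);
            Vector128<ushort> second = Vector128.LoadUnsafe(ref buf, secondOffset);
            Vector128<ushort> last = Vector128.LoadUnsafe(ref buf, lastOffset);
            Vector128<ushort> secondLast = Vector128.LoadUnsafe(ref buf, secondLastOffset);

            // Reverse the lanes within each vector...
            first = Vector128.Shuffle(first.AsByte(), mask).AsUInt16();
            second = Vector128.Shuffle(second.AsByte(), mask).AsUInt16();
            last = Vector128.Shuffle(last.AsByte(), mask).AsUInt16();
            secondLast = Vector128.Shuffle(secondLast.AsByte(), mask).AsUInt16();

            // ...then store them back swapped end-for-end.
            last.StoreUnsafe(ref buf, firstOffset);
            secondLast.StoreUnsafe(ref buf, secondOffset);
            first.StoreUnsafe(ref buf, lastOffset);
            second.StoreUnsafe(ref buf, secondLastOffset);
        }

        // The remaining middle elements would fall through to the single-vector
        // and scalar paths (omitted here).
    }
}
```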
See #66993. In general, I still think this is worth doing for a few of the most important perf methods. And to repeat some of the statements from that issue, the question is "where to draw the line":
PS: is
That said, this particular PR seems to remove more code than it adds 👍 (by replacing specializations with a generic function).
Thanks for your comments, @EgorBo, I second your questions.
I tried to write a generic version of reverse, which looks quite elegant too, but couldn't get it to work yet. The issue is that the generic version required an approximate Shuffle<T> implementation:
Approx Shuffle<T> Impl
Earlier, we had the type checks before calling
Hi @kunalspathak, I struggled to get assembly for the intrinsic using the usual way we used in the past, i.e., using a checked clrjit and release libraries. It could dump the assembly for the IndexOfAny benchmarks but couldn't do it for Reverse. It only dumped the assembly for System.MemoryExtensions:Reverse:

; Assembly listing for method System.MemoryExtensions:Reverse[ushort](System.Span`1[ushort])
; Emitting BLENDED_CODE for generic ARM64 CPU - Unix
; optimized code
; fp based frame
; partially interruptible
; No PGO data
; 0 inlinees with PGO data; 5 single block inlinees; 3 inlinees without PGO data
; Final local variable assignments
;
;* V00 arg0 [V00 ] ( 0, 0 ) struct (16) zero-ref multireg-arg ld-addr-op single-def
;# V01 OutArgs [V01 ] ( 1, 1 ) lclBlk ( 0) [sp+00H] "OutgoingArgSpace"
; V02 tmp1 [V02,T03] ( 2, 2 ) byref -> x19 single-def "impAppendStmt"
;* V03 tmp2 [V03 ] ( 0, 0 ) struct (16) zero-ref ld-addr-op "Inlining Arg"
; V04 tmp3 [V04,T02] ( 3, 3 ) long -> x20 "Inlining Arg"
;* V05 tmp4 [V05 ] ( 0, 0 ) bool -> zero-ref "Inlining Arg"
;* V06 tmp5 [V06 ] ( 0, 0 ) bool -> zero-ref "Inlining Arg"
;* V07 tmp6 [V07 ] ( 0, 0 ) byref -> zero-ref "Inline stloc first use temp"
;* V08 tmp7 [V08 ] ( 0, 0 ) byref -> zero-ref "Inline stloc first use temp"
;* V09 tmp8 [V09 ] ( 0, 0 ) ushort -> zero-ref "Inline stloc first use temp"
; V10 tmp9 [V10,T01] ( 2, 1.50) byref -> x0 single-def V00._reference(offs=0x00) P-INDEP "field V00._reference (fldOffset=0x0)"
; V11 tmp10 [V11,T00] ( 3, 2.50) int -> x1 single-def V00._length(offs=0x08) P-INDEP "field V00._length (fldOffset=0x8)"
;* V12 tmp11 [V12 ] ( 0, 0 ) byref -> zero-ref single-def V03._reference(offs=0x00) P-INDEP "field V03._reference (fldOffset=0x0)"
;* V13 tmp12 [V13 ] ( 0, 0 ) int -> zero-ref V03._length(offs=0x08) P-INDEP "field V03._length (fldOffset=0x8)"
; V14 cse0 [V14,T04] ( 3, 1.50) ref -> x1 "CSE - moderate"
;
; Lcl frame size = 0
G_M40471_IG01:
stp fp, lr, [sp, #-0x20]!
stp x19, x20, [sp, #0x10]
mov fp, sp
;; size=12 bbWeight=1 PerfScore 2.50
G_M40471_IG02:
cmp w1, #1
ble G_M40471_IG05
;; size=8 bbWeight=1 PerfScore 1.50
G_M40471_IG03:
mov x19, x0
sxtw x20, w1
cbnz x20, G_M40471_IG04
movz x1, #0xD1FFAB1E
movk x1, #0xD1FFAB1E LSL #16
movk x1, #0xD1FFAB1E LSL #32
mov x0, x1
movz x2, #0xD1FFAB1E // code for System.Diagnostics.Debug:Fail
movk x2, #0xD1FFAB1E LSL #16
movk x2, #0xD1FFAB1E LSL #32
ldr x2, [x2]
blr x2
;; size=48 bbWeight=0.50 PerfScore 4.75
G_M40471_IG04:
mov x0, x19
mov x1, x20
movz x2, #0xD1FFAB1E // code for System.SpanHelpers:ReverseSIMD
movk x2, #0xD1FFAB1E LSL #16
movk x2, #0xD1FFAB1E LSL #32
ldr x2, [x2]
blr x2
;; size=28 bbWeight=0.50 PerfScore 3.25
G_M40471_IG05:
ldp x19, x20, [sp, #0x10]
ldp fp, lr, [sp], #0x20
ret lr
;; size=12 bbWeight=1 PerfScore 3.00
; Total bytes of code 108, prolog size 12, PerfScore 25.80, instruction count 27, allocated bytes for code 108 (MethodHash=512a61e8) for method System.MemoryExtensions:Reverse[ushort](System.Span`1[ushort])
; ============================================================

Using gdb, I could extract the following assembly while running a program that calls Reverse. It matches the implementation of the intrinsic. A quick scan shows that the offset calculation is not hoisted out of the loop, which could be improved. Ideally, that should be done in the JIT, but I'm not sure how easy or hard it would be to incorporate.

Assembly from GDB:

stp x29, x30, [sp, #-16]!
mov x29, sp
cmp x1, #0x10
b.cc 0xffff6f66a2c8 // b.lo, b.ul, b.last
cmp x1, #0x20
b.cc 0xffff6f66a25c // b.lo, b.ul, b.last
lsr x2, x1, #3
lsr x2, x2, #2
mov x3, xzr
cmp x2, #0x0
b.ls 0xffff6f66a248 // b.plast
ldr q16, 0xffff6f66a320
lsl x4, x3, #3
lsl x4, x4, #1
mov x5, x4
add x6, x5, #0x8
sub x4, x1, x4
sub x4, x4, #0x8
sub x7, x4, #0x8
lsl x5, x5, #1
add x5, x0, x5
ld1 {v17.8h}, [x5]
lsl x6, x6, #1
add x6, x0, x6
ld1 {v18.8h}, [x6]
lsl x4, x4, #1
add x4, x0, x4
ld1 {v19.8h}, [x4]
lsl x7, x7, #1
add x7, x0, x7
ld1 {v20.8h}, [x7]
tbl v17.16b, {v17.16b}, v16.16b
tbl v18.16b, {v18.16b}, v16.16b
tbl v19.16b, {v19.16b}, v16.16b
tbl v20.16b, {v20.16b}, v16.16b
st1 {v19.8h}, [x5]
st1 {v20.8h}, [x6]
st1 {v17.8h}, [x4]
st1 {v18.8h}, [x7]
add x3, x3, #0x1
cmp x3, x2
b.cc 0xffff6f66a1d0 // b.lo, b.ul, b.last
0xffff6f66a248: lsl x2, x2, #3
lsl x3, x2, #1
lsl x3, x3, #1
add x0, x3, x0
sub x1, x1, x2, lsl #2
0xffff6f66a25c: cmp x1, #0x10
b.cc 0xffff6f66a2c8 // b.lo, b.ul, b.last
lsr x2, x1, #3
lsr x2, x2, #1
mov x3, xzr
cmp x2, #0x0
b.ls 0xffff6f66a2b8 // b.plast
ldr q16, 0xffff6f66a320
0xffff6f66a27c: lsl x4, x3, #3
add x3, x3, #0x1
sub x5, x1, x3, lsl #3
lsl x4, x4, #1
add x4, x0, x4
ld1 {v17.8h}, [x4]
lsl x5, x5, #1
add x5, x0, x5
ld1 {v18.8h}, [x5]
tbl v17.16b, {v17.16b}, v16.16b
tbl v18.16b, {v18.16b}, v16.16b
st1 {v18.8h}, [x4]
st1 {v17.8h}, [x5]
cmp x3, x2
b.cc 0xffff6f66a27c // b.lo, b.ul, b.last
lsl x2, x2, #3
lsl x3, x2, #1
add x0, x3, x0
sub x1, x1, x3
0xffff6f66a2c8: cmp x1, #0x1
b.ls 0xffff6f66a2fc // b.plast
sbfiz x1, x1, #1, #32
add x1, x0, x1
sub x1, x1, #0x2
0xffff6f66a2dc: ldrsh w2, [x0]
ldrsh w3, [x1]
strh w3, [x0]
strh w2, [x1]
add x0, x0, #0x2
sub x1, x1, #0x2
cmp x0, x1
b.cc 0xffff6f66a2dc // b.lo, b.ul, b.last
0xffff6f66a2fc: ldp x29, x30, [sp], #16
ret

Would you suggest manually hoisting part of the offset calculation out of the loop?
It was supposed to get inlined since it is marked as such. Also, I wanted to see the before vs. after disassembly, and I can take a look at the hoisting.
Hi Kunal,

Assembly for the main branch (without the patch):

stp x29, x30, [sp, #-16]!
mov x29, sp
cmp x1, #0x10
b.cc 0xffff4a060c74 // b.lo, b.ul, b.last
lsr x2, x1, #3
lsr x2, x2, #1
mov x3, xzr
cmp x2, #0x0
b.ls 0xffff4a060c5c // b.plast
lsl x4, x3, #3
add x3, x3, #0x1
lsl x5, x3, #3
sub x5, x1, x5
lsl x4, x4, #1
add x4, x0, x4
ld1 {v16.8h}, [x4]
lsl x5, x5, #1
add x5, x0, x5
ld1 {v17.8h}, [x5]
ldr q18, 0xffff4a060cb0
tbl v16.16b, {v16.16b}, v18.16b
ldr q18, 0xffff4a060cb0
tbl v17.16b, {v17.16b}, v18.16b
st1 {v17.8h}, [x4]
st1 {v16.8h}, [x5]
cmp x3, x2
b.cc 0xffff4a060c14 // b.lo, b.ul, b.last
lsl x3, x2, #3
lsl x3, x3, #1
add x0, x3, x0
lsl x2, x2, #3
lsl x2, x2, #1
sub x1, x1, x2
cmp x1, #0x1
b.ls 0xffff4a060ca8 // b.plast
sbfiz x1, x1, #1, #32
add x1, x0, x1
sub x1, x1, #0x2
ldrh w2, [x0]
ldrh w3, [x1]
strh w3, [x0]
strh w2, [x1]
add x0, x0, #0x2
sub x1, x1, #0x2
cmp x0, x1
b.cc 0xffff4a060c88 // b.lo, b.ul, b.last
ldp x29, x30, [sp], #16
ret
0xffff4a060cb0 -> 0x0e 0x0f 0x0c 0x0d 0x0a 0x0b 0x08 0x09 0x06 0x07 0x04 0x05 0x02 0x03 0x00 0x01
Let me check on my end.
Before:

; Assembly listing for method System.SpanHelpers:Reverse(byref,ulong)
; Emitting BLENDED_CODE for generic ARM64 CPU - Unix
; optimized code
; fp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 0 single block inlinees; 1 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T03] ( 7, 11.50) byref -> x0
; V01 arg1 [V01,T04] ( 9, 10 ) long -> x1
;* V02 loc0 [V02 ] ( 0, 0 ) ref -> zero-ref class-hnd
;* V03 loc1 [V03 ] ( 0, 0 ) ref -> zero-ref class-hnd
;* V04 loc2 [V04 ] ( 0, 0 ) ref -> zero-ref class-hnd
;* V05 loc3 [V05 ] ( 0, 0 ) ref -> zero-ref class-hnd
;* V06 loc4 [V06 ] ( 0, 0 ) ref -> zero-ref class-hnd
;* V07 loc5 [V07 ] ( 0, 0 ) ref -> zero-ref class-hnd
;* V08 loc6 [V08 ] ( 0, 0 ) ref -> zero-ref class-hnd
;* V09 loc7 [V09 ] ( 0, 0 ) ref -> zero-ref class-hnd
;* V10 loc8 [V10,T13] ( 0, 0 ) long -> zero-ref
; V11 loc9 [V11,T11] ( 4, 5.50) long -> x2
; V12 loc10 [V12,T02] ( 5, 16.50) long -> x3
; V13 loc11 [V13,T08] ( 2, 8 ) long -> x4
; V14 loc12 [V14,T09] ( 2, 8 ) long -> x5
; V15 loc13 [V15,T14] ( 4, 16 ) simd16 -> d17 HFA(simd16)
; V16 loc14 [V16,T15] ( 4, 16 ) simd16 -> d18 HFA(simd16)
;# V17 OutArgs [V17 ] ( 1, 1 ) lclBlk ( 0) [sp+00H] "OutgoingArgSpace"
; V18 tmp1 [V18,T00] ( 7, 21 ) byref -> x0 "Inline stloc first use temp"
; V19 tmp2 [V19,T01] ( 6, 20.50) byref -> x1 "Inline stloc first use temp"
; V20 tmp3 [V20,T10] ( 2, 8 ) int -> x2 "Inline stloc first use temp"
; V21 cse0 [V21,T12] ( 3, 1.50) long -> x3 "CSE - moderate"
; V22 cse1 [V22,T05] ( 3, 12 ) byref -> x4 "CSE - aggressive"
; V23 cse2 [V23,T06] ( 3, 12 ) byref -> x5 "CSE - aggressive"
; V24 cse3 [V24,T16] ( 3, 8.50) simd16 -> d16 HFA(simd16) "CSE - aggressive"
; V25 cse4 [V25,T07] ( 3, 12 ) long -> x3 "CSE - aggressive"
;
; Lcl frame size = 0
G_M13866_IG01:
stp fp, lr, [sp, #-0x10]!
mov fp, sp
;; size=8 bbWeight=1 PerfScore 1.50
G_M13866_IG02:
cmp x1, #8
blo G_M13866_IG06
;; size=8 bbWeight=1 PerfScore 1.50
G_M13866_IG03:
lsr x2, x1, #2
lsr x2, x2, #1
mov x3, xzr
cmp x2, #0
bls G_M13866_IG05
ldr q16, [@RWD00]
align [0 bytes for IG04]
align [0 bytes]
align [0 bytes]
align [0 bytes]
;; size=24 bbWeight=0.50 PerfScore 3.00
G_M13866_IG04:
lsl x4, x3, #2
add x3, x3, #1
sub x5, x1, x3, LSL #2
lsl x4, x4, #2
add x4, x0, x4
ld1 {v17.4s}, [x4]
lsl x5, x5, #2
add x5, x0, x5
ld1 {v18.4s}, [x5]
tbl v17.16b, {v17.16b}, v16.16b
tbl v18.16b, {v18.16b}, v16.16b
st1 {v18.4s}, [x4]
st1 {v17.4s}, [x5]
cmp x3, x2
blo G_M13866_IG04
;; size=60 bbWeight=4 PerfScore 64.00
G_M13866_IG05:
lsl x3, x2, #2
lsl x2, x3, #2
add x0, x2, x0
sub x1, x1, x3, LSL #1
;; size=16 bbWeight=0.50 PerfScore 1.75
G_M13866_IG06:
cmp x1, #1
bls G_M13866_IG09
;; size=8 bbWeight=1 PerfScore 1.50
G_M13866_IG07:
sbfiz x1, x1, #2, #32
add x1, x0, x1
sub x1, x1, #4
align [0 bytes for IG08]
align [0 bytes]
align [0 bytes]
align [0 bytes]
;; size=12 bbWeight=0.50 PerfScore 1.00
G_M13866_IG08:
ldr w2, [x0]
ldr w3, [x1]
str w3, [x0]
str w2, [x1]
add x0, x0, #4
sub x1, x1, #4
cmp x0, x1
blo G_M13866_IG08
;; size=32 bbWeight=4 PerfScore 42.00
G_M13866_IG09:
ldp fp, lr, [sp], #0x10
ret lr
;; size=8 bbWeight=1 PerfScore 2.00
RWD00 dq 0B0A09080F0E0D0Ch, 0302010007060504h
; Total bytes of code 176, prolog size 8, PerfScore 135.85, instruction count 52, allocated bytes for code 176 (MethodHash=86c6c9d5) for method System.SpanHelpers:Reverse(byref,ulong)
; ============================================================
Done
After:

; Assembly listing for method System.SpanHelpers:ReverseSIMD[int](byref,ulong)
; Emitting BLENDED_CODE for generic ARM64 CPU - Unix
; optimized code
; fp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 0 single block inlinees; 1 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T00] ( 13, 28.50) byref -> x0
; V01 arg1 [V01,T03] ( 15, 16.50) long -> x1
;* V02 loc0 [V02 ] ( 0, 0 ) ref -> zero-ref class-hnd
;* V03 loc1 [V03 ] ( 0, 0 ) ref -> zero-ref class-hnd
;* V04 loc2 [V04 ] ( 0, 0 ) ref -> zero-ref class-hnd
;* V05 loc3 [V05 ] ( 0, 0 ) ref -> zero-ref class-hnd
;* V06 loc4 [V06 ] ( 0, 0 ) ref -> zero-ref class-hnd
;* V07 loc5 [V07 ] ( 0, 0 ) ref -> zero-ref class-hnd
;* V08 loc6 [V08 ] ( 0, 0 ) ref -> zero-ref class-hnd
;* V09 loc7 [V09,T24] ( 0, 0 ) long -> zero-ref
; V10 loc8 [V10,T16] ( 8, 11 ) long -> x2
; V11 loc9 [V11,T04] ( 5, 16.50) long -> x3
; V12 loc10 [V12,T12] ( 3, 12 ) long -> x5
; V13 loc11 [V13,T17] ( 2, 8 ) long -> x6
; V14 loc12 [V14,T13] ( 3, 12 ) long -> x4
; V15 loc13 [V15,T18] ( 2, 8 ) long -> x7
; V16 loc14 [V16,T26] ( 4, 16 ) simd16 -> d17 HFA(simd16)
; V17 loc15 [V17,T27] ( 4, 16 ) simd16 -> d18 HFA(simd16)
; V18 loc16 [V18,T28] ( 4, 16 ) simd16 -> d19 HFA(simd16)
; V19 loc17 [V19,T29] ( 4, 16 ) simd16 -> d20 HFA(simd16)
; V20 loc18 [V20,T05] ( 5, 16.50) long -> x3
; V21 loc19 [V21,T19] ( 2, 8 ) long -> x4
; V22 loc20 [V22,T20] ( 2, 8 ) long -> x5
; V23 loc21 [V23,T30] ( 4, 16 ) simd16 -> d17 HFA(simd16)
; V24 loc22 [V24,T31] ( 4, 16 ) simd16 -> d18 HFA(simd16)
;# V25 OutArgs [V25 ] ( 1, 1 ) lclBlk ( 0) [sp+00H] "OutgoingArgSpace"
; V26 tmp1 [V26,T01] ( 7, 21 ) byref -> x0 "Inline stloc first use temp"
; V27 tmp2 [V27,T02] ( 6, 20.50) byref -> x1 "Inline stloc first use temp"
; V28 tmp3 [V28,T21] ( 2, 8 ) int -> x2 "Inline stloc first use temp"
; V29 cse0 [V29,T14] ( 3, 12 ) long -> x4 "CSE - moderate"
; V30 cse1 [V30,T22] ( 3, 1.50) long -> x2 "CSE - moderate"
; V31 cse2 [V31,T23] ( 3, 1.50) long -> x2 "CSE - moderate"
; V32 cse3 [V32,T06] ( 3, 12 ) byref -> x5 "CSE - moderate"
; V33 cse4 [V33,T07] ( 3, 12 ) byref -> x6 "CSE - moderate"
; V34 cse5 [V34,T08] ( 3, 12 ) byref -> x4 "CSE - moderate"
; V35 cse6 [V35,T09] ( 3, 12 ) byref -> x7 "CSE - moderate"
; V36 cse7 [V36,T10] ( 3, 12 ) byref -> x4 "CSE - moderate"
; V37 cse8 [V37,T11] ( 3, 12 ) byref -> x5 "CSE - moderate"
; V38 cse9 [V38,T25] ( 8, 25 ) simd16 -> d16 HFA(simd16) "CSE - aggressive"
; V39 cse10 [V39,T15] ( 3, 12 ) long -> x3 "CSE - moderate"
;
; Lcl frame size = 0
G_M58668_IG01:
stp fp, lr, [sp, #-0x10]!
mov fp, sp
;; size=8 bbWeight=1 PerfScore 1.50
G_M58668_IG02:
cmp x1, #8
blo G_M58668_IG09
;; size=8 bbWeight=1 PerfScore 1.50
G_M58668_IG03:
cmp x1, #16
blo G_M58668_IG06
lsr x2, x1, #2
lsr x2, x2, #2
mov x3, xzr
cmp x2, #0
bls G_M58668_IG05
ldr q16, [@RWD00]
align [0 bytes for IG04]
align [0 bytes]
align [0 bytes]
align [0 bytes]
;; size=32 bbWeight=0.50 PerfScore 3.75
G_M58668_IG04:
lsl x4, x3, #2
lsl x4, x4, #1
mov x5, x4
add x6, x5, #4
sub x4, x1, x4
sub x4, x4, #4
sub x7, x4, #4
lsl x5, x5, #2
add x5, x0, x5
ld1 {v17.4s}, [x5]
lsl x6, x6, #2
add x6, x0, x6
ld1 {v18.4s}, [x6]
lsl x4, x4, #2
add x4, x0, x4
ld1 {v19.4s}, [x4]
lsl x7, x7, #2
add x7, x0, x7
ld1 {v20.4s}, [x7]
tbl v17.16b, {v17.16b}, v16.16b
tbl v18.16b, {v18.16b}, v16.16b
tbl v19.16b, {v19.16b}, v16.16b
tbl v20.16b, {v20.16b}, v16.16b
st1 {v19.4s}, [x5]
st1 {v20.4s}, [x6]
st1 {v17.4s}, [x4]
st1 {v18.4s}, [x7]
add x3, x3, #1
cmp x3, x2
blo G_M58668_IG04
;; size=120 bbWeight=4 PerfScore 122.00
G_M58668_IG05:
lsl x2, x2, #2
lsl x3, x2, #1
lsl x3, x3, #2
add x0, x3, x0
sub x1, x1, x2, LSL #2
;; size=20 bbWeight=0.50 PerfScore 2.25
G_M58668_IG06:
cmp x1, #8
blo G_M58668_IG09
lsr x2, x1, #2
lsr x2, x2, #1
mov x3, xzr
cmp x2, #0
bls G_M58668_IG08
ldr q16, [@RWD00]
align [4 bytes for IG07]
align [0 bytes]
align [0 bytes]
align [0 bytes]
;; size=36 bbWeight=0.50 PerfScore 4.00
G_M58668_IG07:
lsl x4, x3, #2
add x3, x3, #1
sub x5, x1, x3, LSL #2
lsl x4, x4, #2
add x4, x0, x4
ld1 {v17.4s}, [x4]
lsl x5, x5, #2
add x5, x0, x5
ld1 {v18.4s}, [x5]
tbl v17.16b, {v17.16b}, v16.16b
tbl v18.16b, {v18.16b}, v16.16b
st1 {v18.4s}, [x4]
st1 {v17.4s}, [x5]
cmp x3, x2
blo G_M58668_IG07
;; size=60 bbWeight=4 PerfScore 64.00
G_M58668_IG08:
lsl x2, x2, #2
lsl x3, x2, #2
add x0, x3, x0
sub x1, x1, x2, LSL #1
;; size=16 bbWeight=0.50 PerfScore 1.75
G_M58668_IG09:
cmp x1, #1
bls G_M58668_IG12
;; size=8 bbWeight=1 PerfScore 1.50
G_M58668_IG10:
sbfiz x1, x1, #2, #32
add x1, x0, x1
sub x1, x1, #4
align [0 bytes for IG11]
align [0 bytes]
align [0 bytes]
align [0 bytes]
;; size=12 bbWeight=0.50 PerfScore 1.00
G_M58668_IG11:
ldr w2, [x0]
ldr w3, [x1]
str w3, [x0]
str w2, [x1]
add x0, x0, #4
sub x1, x1, #4
cmp x0, x1
blo G_M58668_IG11
;; size=32 bbWeight=4 PerfScore 42.00
G_M58668_IG12:
ldp fp, lr, [sp], #0x10
ret lr
;; size=8 bbWeight=1 PerfScore 2.00
G_M58668_IG13:
bl CORINFO_HELP_THROWDIVZERO
brk_unix #0
;; size=8 bbWeight=0 PerfScore 0.00
RWD00 dq 0B0A09080F0E0D0Ch, 0302010007060504h
; Total bytes of code 368, prolog size 8, PerfScore 284.05, instruction count 103, allocated bytes for code 368 (MethodHash=d8861ad3) for method System.SpanHelpers:ReverseSIMD[int](byref,ulong)

That's interesting. Can you share the steps to reproduce it? I would like to know what I missed.

Just did a checked build of before vs. after your changes.

I will wait for @tannergooding until Monday, in case he has anything to add.
if (length <= 1)
    return;
nit: While the coding style doesn't require braces for these scenarios, having them is preferred:
Current:
if (length <= 1)
    return;

Suggested:
if (length <= 1)
{
    return;
}
if (length <= 1)
    return;
ref T first = ref elements;
ref T last = ref Unsafe.Subtract(ref Unsafe.Add(ref first, (int)length), 1);
This type of logic is incredibly hard to read/understand and has led to bugs in the past.
It is likely better to just pin and use pointers here instead. CC @jkotas
Notably in this particular case, it would be much more readable as just ref Unsafe.Add(ref first, length - 1) (especially since we have an overload that takes nuint)
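For illustration, the two formulations side by side, as a sketch under the assumption of a non-empty buffer (the helper names here are made up, not part of the PR):

```csharp
using System.Runtime.CompilerServices;

static class LastRefSketch
{
    // Current PR shape: add the full length, then step back by one element.
    public static ref T LastCurrent<T>(ref T first, nuint length)
        => ref Unsafe.Subtract(ref Unsafe.Add(ref first, (int)length), 1);

    // Suggested shape: a single Unsafe.Add using the nuint overload.
    public static ref T LastSuggested<T>(ref T first, nuint length)
        => ref Unsafe.Add(ref first, length - 1);
}
```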
ref Unsafe.Add(ref first, length - 1)
I agree that this is more readable.
It is likely better to just pin and use pointers here instead. CC @jkotas
I am ok with using byref arithmetic in small methods like this one where it is easy to review the code for correctness.
else if (Unsafe.SizeOf<T>() == sizeof(long))
{
    ReverseSIMD(ref Unsafe.As<T, long>(ref elements), length);
    return;
}
We could also handle sizeof(Vector128<byte>) which would cover Guid and other cases.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void ReverseSIMD<T>(ref T elements, nuint length) where T : struct
{
    if (Avx2.IsSupported && (nuint)Vector256<T>.Count * 2 <= length)
What's the perf impact of this unrolling for small inputs?
We've measured the unrolling to be less efficient in many common cases, even where it makes micro-benchmarks look good.
nuint numElements = (nuint)Vector256<T>.Count;
nuint numIters = (length / numElements) / 2;
I imagine this will end up being at best a (length >> cns) >> cns rather than the direct (length >> cns) you'd get if you pre-multiplied by 2 instead.
Would be good to validate the JIT output, but nuint numElements = (nuint)Vector256<T>.Count * 2 or nuint numIters = (length / (numElements * 2)) might be better anyway.
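A hedged sketch of what the suggested rewrite could look like (shown here for a concrete int element type so the constants are visible; names are illustrative, not the PR's code):

```csharp
using System.Runtime.Intrinsics;

static class NumItersSketch
{
    // Pre-multiplying by 2 makes the divisor a single JIT-time constant,
    // so the division folds to one shift instead of two.
    public static nuint ComputeNumIters(nuint length)
    {
        nuint perIteration = (nuint)Vector256<int>.Count * 2; // 8 * 2 = 16
        return length / perIteration;                         // a single length >> 4
    }
}
```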
tempFirst = Vector128.Shuffle(tempFirst.As<T, byte>(), Vector128.Create(
    (byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)).As<byte, T>();
tempSecond = Vector128.Shuffle(tempSecond.As<T, byte>(), Vector128.Create(
    (byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)).As<byte, T>();
tempLast = Vector128.Shuffle(tempLast.As<T, byte>(), Vector128.Create(
    (byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)).As<byte, T>();
tempSecondLast = Vector128.Shuffle(tempSecondLast.As<T, byte>(), Vector128.Create(
    (byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)).As<byte, T>();
Is this really better? We only get 2 shuffle ports on most modern hardware so we were already saturating the loop with two shuffles. The smaller loop should also have meant everything stays in the micro-op cache and a single decode window.
You're right about the two shuffle ports. We were seeing an improvement when run in the test suite, but it had a lot of variance. To confirm this, I tested this in isolation using an assembly version in a loop to avoid VM noise, and also via a CPU model. In both instances, there were no improvements.
The executions have a high degree of variance when executed on the VM. The following example demonstrates variations between successive runs while reversing a span of 512 ints.
Five consecutive runs on an N1 machine.
1) Span<Int32>.Reverse:
| Method | Job | Toolchain | Size | Mean | Error | StdDev | Median | Min | Max | Ratio | MannWhitney(2%) | Allocated | Alloc Ratio |
|-------- |----------- |-------------- |----- |---------:|--------:|--------:|---------:|---------:|---------:|------:|---------------- |----------:|------------:|
| Reverse | Job-XGQNIM | /base_src/ | 512 | 132.7 ns | 0.61 ns | 0.54 ns | 133.0 ns | 131.8 ns | 133.4 ns | 1.00 | Base | - | NA |
| Reverse | Job-HMTPKD | /runtime_src/ | 512 | 139.9 ns | 0.34 ns | 0.30 ns | 139.9 ns | 139.6 ns | 140.6 ns | 1.05 | Slower | - | NA |
2) Span<Int32>.Reverse:
| Method | Job | Toolchain | Size | Mean | Error | StdDev | Median | Min | Max | Ratio | MannWhitney(2%) | Allocated | Alloc Ratio |
|-------- |----------- |-------------- |----- |----------:|---------:|---------:|----------:|----------:|----------:|------:|---------------- |----------:|------------:|
| Reverse | Job-WZDKZQ | /base_src/ | 512 | 138.21 ns | 0.193 ns | 0.171 ns | 138.22 ns | 137.81 ns | 138.47 ns | 1.00 | Base | - | NA |
| Reverse | Job-EFPABP | /runtime_src/ | 512 | 86.81 ns | 0.107 ns | 0.095 ns | 86.77 ns | 86.71 ns | 87.04 ns | 0.63 | Faster | - | NA |
3) Span<Int32>.Reverse:
| Method | Job | Toolchain | Size | Mean | Error | StdDev | Median | Min | Max | Ratio | MannWhitney(2%) | Allocated | Alloc Ratio |
|-------- |----------- |-------------- |----- |----------:|---------:|---------:|----------:|----------:|----------:|------:|---------------- |----------:|------------:|
| Reverse | Job-WXAQSQ | /base_src/ | 512 | 93.83 ns | 0.352 ns | 0.312 ns | 93.90 ns | 93.42 ns | 94.39 ns | 1.00 | Base | - | NA |
| Reverse | Job-TWOFWQ | /runtime_src/ | 512 | 121.68 ns | 0.207 ns | 0.184 ns | 121.62 ns | 121.48 ns | 122.12 ns | 1.30 | Slower | - | NA |
4) Span<Int32>.Reverse:
| Method | Job | Toolchain | Size | Mean | Error | StdDev | Median | Min | Max | Ratio | MannWhitney(2%) | Allocated | Alloc Ratio |
|-------- |----------- |-------------- |----- |---------:|---------:|---------:|---------:|---------:|---------:|------:|---------------- |----------:|------------:|
| Reverse | Job-KMVVOJ | /base_src/ | 512 | 95.07 ns | 0.133 ns | 0.118 ns | 95.06 ns | 94.92 ns | 95.34 ns | 1.00 | Base | - | NA |
| Reverse | Job-ZIDWKX | /runtime_src/ | 512 | 86.95 ns | 0.369 ns | 0.327 ns | 86.98 ns | 86.52 ns | 87.48 ns | 0.91 | Faster | - | NA |
5) Span<Int32>.Reverse:
| Method | Job | Toolchain | Size | Mean | Error | StdDev | Median | Min | Max | Ratio | MannWhitney(2%) | Allocated | Alloc Ratio |
|-------- |----------- |-------------- |----- |---------:|---------:|---------:|---------:|---------:|---------:|------:|---------------- |----------:|------------:|
| Reverse | Job-JHDIRD | /base_src/ | 512 | 94.40 ns | 0.175 ns | 0.146 ns | 94.39 ns | 94.20 ns | 94.68 ns | 1.00 | Base | - | NA |
| Reverse | Job-OCMKTU | /runtime_src/ | 512 | 87.87 ns | 0.211 ns | 0.176 ns | 87.79 ns | 87.71 ns | 88.26 ns | 0.93 | Faster | - | NA |
We'd like to get to the reasons for the variations, but I don't think it's due to the unrolling.
In addition, the existing version cannot benefit from the micro-op cache. The micro-op cache is 32 bytes, so it can hold up to 8 instructions; however, the loop currently has 15 instructions, so it cannot fit in the cache. We could improve the instruction sequence with LD1 + post-increment/decrement instructions (and LD2 in the future).
If you still want the refactoring part of this patch, then I'm happy to remove the unrolling part. Otherwise, we can close this PR.
Status update: once #70944 gets merged (it was opened before this PR), I am going to resolve the merge conflicts for this PR, re-run the benchmarks, and, if everything looks good, provide the review and hopefully merge it. Apologies for the delay.
nuint firstOffset = i * numElements * 2;
nuint secondOffset = firstOffset + numElements;
nuint lastOffset = length - (i * numElements * 2) - numElements;
nuint secondLastOffset = lastOffset - numElements;
Haven't checked the impact on performance or code quality of such patterns:
- Reuse the computation, e.g., nuint lastOffset = length - firstOffset - numElements
- Extract the computation out of the loop and just increment/decrement the offsets by Vector128<T>.Count (see the sketch after this list)
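A rough sketch of the second suggestion, assuming the pairwise (non-unrolled) Vector128 loop shape: the offsets become running values stepped by Vector128<T>.Count rather than being recomputed from the induction variable each iteration. The helper name and the ushort element type are illustrative assumptions.

```csharp
using System.Runtime.Intrinsics;

static class RunningOffsetSketch
{
    // Illustrative: step the offsets instead of recomputing them from i every iteration.
    public static void ReverseWithRunningOffsets(ref ushort buf, nuint length)
    {
        nuint n = (nuint)Vector128<ushort>.Count;              // 8
        if (length < 2 * n)
            return;                                            // caller guarantees this in practice

        Vector128<byte> mask = Vector128.Create(
            (byte)14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);

        nuint numIters = (length / n) / 2;
        nuint firstOffset = 0;
        nuint lastOffset = length - n;

        for (nuint i = 0; i < numIters; i++)
        {
            Vector128<ushort> first = Vector128.LoadUnsafe(ref buf, firstOffset);
            Vector128<ushort> last = Vector128.LoadUnsafe(ref buf, lastOffset);

            first = Vector128.Shuffle(first.AsByte(), mask).AsUInt16();
            last = Vector128.Shuffle(last.AsByte(), mask).AsUInt16();

            last.StoreUnsafe(ref buf, firstOffset);
            first.StoreUnsafe(ref buf, lastOffset);

            firstOffset += n;   // no per-iteration multiply by i
            lastOffset -= n;
        }
        // The middle remainder would be handled by the scalar path (omitted).
    }
}
```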
Hi @adamsitnik, thanks for taking a look. Please let me know if I can be of any help to push this forward 🙂
…-unroll # Conflicts: # src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs # src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs # src/libraries/System.Private.CoreLib/src/System/SpanHelpers.T.cs # src/libraries/System.Private.CoreLib/src/System/SpanHelpers.cs
adamsitnik left a comment
I've solved the merge conflict and re-run the benchmarks against the latest main. It seems that, in its current shape, the PR would regress performance.
I can restore the byte fast path added in #70944, but @SwapnilGaikwad it would be great if you could re-run these benchmarks on a different arm64 machine and try to confirm or deny the regression for other types for larger buffers. This is the change I've applied to the benchmarks to benchmark more test cases: adamsitnik/performance@0eaae47
Details
BenchmarkDotNet=v0.13.2.1950-nightly, OS=ubuntu 20.04
Unknown processor
.NET SDK=8.0.100-alpha.1.22567.28
[Host] : .NET 8.0.0 (8.0.22.55902), Arm64 RyuJIT AdvSIMD
LaunchCount=1 MemoryRandomization=True

| Type | Job | Size | Mean | Ratio |
|---|---|---|---|---|
| Span<Byte> | PR | 4 | 9.191 ns | 1.43 |
| Span<Byte> | main | 4 | 6.414 ns | 1.00 |
| Span<Char> | PR | 4 | 5.172 ns | 0.91 |
| Span<Char> | main | 4 | 5.654 ns | 1.00 |
| Span<Int32> | PR | 4 | 5.364 ns | 0.99 |
| Span<Int32> | main | 4 | 5.502 ns | 1.00 |
| Span<Int64> | PR | 4 | 6.950 ns | 1.27 |
| Span<Int64> | main | 4 | 5.463 ns | 1.00 |
| Span<Byte> | PR | 8 | 10.164 ns | 1.52 |
| Span<Byte> | main | 8 | 6.734 ns | 1.00 |
| Span<Char> | PR | 8 | 6.566 ns | 1.20 |
| Span<Char> | main | 8 | 5.461 ns | 1.00 |
| Span<Int32> | PR | 8 | 6.716 ns | 1.21 |
| Span<Int32> | main | 8 | 5.548 ns | 1.00 |
| Span<Int64> | PR | 8 | 8.436 ns | 1.18 |
| Span<Int64> | main | 8 | 7.173 ns | 1.00 |
| Span<Byte> | PR | 16 | 12.346 ns | 1.58 |
| Span<Byte> | main | 16 | 8.017 ns | 1.00 |
| Span<Char> | PR | 16 | 7.968 ns | 1.47 |
| Span<Char> | main | 16 | 5.435 ns | 1.00 |
| Span<Int32> | PR | 16 | 7.972 ns | 1.13 |
| Span<Int32> | main | 16 | 7.058 ns | 1.00 |
| Span<Int64> | PR | 16 | 13.088 ns | 1.15 |
| Span<Int64> | main | 16 | 11.362 ns | 1.00 |
| Span<Byte> | PR | 32 | 7.636 ns | 1.09 |
| Span<Byte> | main | 32 | 7.042 ns | 1.00 |
| Span<Char> | PR | 32 | 7.918 ns | 1.07 |
| Span<Char> | main | 32 | 7.394 ns | 1.00 |
| Span<Int32> | PR | 32 | 12.459 ns | 1.09 |
| Span<Int32> | main | 32 | 11.429 ns | 1.00 |
| Span<Int64> | PR | 32 | 22.891 ns | 1.15 |
| Span<Int64> | main | 32 | 19.879 ns | 1.00 |
| Span<Byte> | PR | 64 | 9.282 ns | 1.04 |
| Span<Byte> | main | 64 | 8.957 ns | 1.00 |
| Span<Char> | PR | 64 | 12.515 ns | 1.01 |
| Span<Char> | main | 64 | 12.418 ns | 1.00 |
| Span<Int32> | PR | 64 | 22.076 ns | 1.12 |
| Span<Int32> | main | 64 | 19.699 ns | 1.00 |
| Span<Int64> | PR | 64 | 42.038 ns | 1.14 |
| Span<Int64> | main | 64 | 36.882 ns | 1.00 |
| Span<Byte> | PR | 512 | 37.273 ns | 1.07 |
| Span<Byte> | main | 512 | 34.769 ns | 1.00 |
| Span<Char> | PR | 512 | 78.840 ns | 1.12 |
| Span<Char> | main | 512 | 70.473 ns | 1.00 |
| Span<Int32> | PR | 512 | 152.160 ns | 1.12 |
| Span<Int32> | main | 512 | 136.060 ns | 1.00 |
| Span<Int64> | PR | 512 | 306.989 ns | 1.10 |
| Span<Int64> | main | 512 | 278.162 ns | 1.00 |
Sure. Will do, but unfortunately it will be early next week.
Hi @adamsitnik, give me a shout whenever you'd like to benchmark a newer version. The results are for an Arm64 N1 system. Span<Byte>.Reverse: Span<Char>.Reverse: Span<Int32>.Reverse:
…-unroll # Conflicts: # src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs # src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs # src/libraries/System.Private.CoreLib/src/System/SpanHelpers.cs
force-pushed from 6ac82f3 to a7a2ccb
adamsitnik left a comment
I've solved the merge conflicts and re-run the benchmarks on an x64 AMD machine (Vector256 code path).
To ensure that the results are not affected by random memory alignment, I've enabled memory randomization (BDN feature) and forced the harness to spawn not 1, but 3 processes (--memoryRandomization true --launchCount 3).
There are multiple test cases where performance has slightly regressed. While we could fairly easily solve the regression for small spans of bytes by copying part of the code from #70944, I don't see an easy way to solve the regression for integers as well.
Since Tanner and others pointed out other disadvantages of this approach, I am inclined to close this PR.
@SwapnilGaikwad thank you for your contribution!
Details
BenchmarkDotNet=v0.13.2.2043-nightly, OS=Windows 11 (10.0.22621.963)
AMD Ryzen Threadripper PRO 3945WX 12-Cores, 1 CPU, 24 logical and 12 physical cores
.NET SDK=8.0.100-alpha.1.22622.3
[Host] : .NET 8.0.0 (8.0.22.60501), X64 RyuJIT AVX2
Job-ARMDKF : .NET 8.0.0 (42.42.42.42424), X64 RyuJIT AVX2
Job-URHJCS : .NET 8.0.0 (42.42.42.42424), X64 RyuJIT AVX2
LaunchCount=3 MemoryRandomization=True

| Type | Job | Size | Mean | Ratio |
|---|---|---|---|---|
| Span<Byte> | PR | 4 | 4.147 ns | 1.14 |
| Span<Byte> | main | 4 | 3.651 ns | 1.00 |
| Span<Char> | PR | 4 | 4.230 ns | 1.07 |
| Span<Char> | main | 4 | 3.961 ns | 1.00 |
| Span<Int32> | PR | 4 | 3.930 ns | 1.03 |
| Span<Int32> | main | 4 | 3.805 ns | 1.00 |
| Span<Byte> | PR | 8 | 5.487 ns | 1.51 |
| Span<Byte> | main | 8 | 3.641 ns | 1.00 |
| Span<Char> | PR | 8 | 6.073 ns | 1.06 |
| Span<Char> | main | 8 | 5.726 ns | 1.00 |
| Span<Int32> | PR | 8 | 3.939 ns | 1.11 |
| Span<Int32> | main | 8 | 3.537 ns | 1.00 |
| Span<Byte> | PR | 14 | 9.856 ns | 2.62 |
| Span<Byte> | main | 14 | 3.787 ns | 1.00 |
| Span<Char> | PR | 14 | 11.027 ns | 0.95 |
| Span<Char> | main | 14 | 11.647 ns | 1.00 |
| Span<Int32> | PR | 14 | 10.734 ns | 2.55 |
| Span<Int32> | main | 14 | 4.210 ns | 1.00 |
| Span<Byte> | PR | 33 | 4.547 ns | 1.20 |
| Span<Byte> | main | 33 | 3.802 ns | 1.00 |
| Span<Char> | PR | 33 | 4.293 ns | 1.10 |
| Span<Char> | main | 33 | 3.886 ns | 1.00 |
| Span<Int32> | PR | 33 | 5.079 ns | 1.22 |
| Span<Int32> | main | 33 | 4.158 ns | 1.00 |
| Span<Byte> | PR | 40 | 7.326 ns | 1.68 |
| Span<Byte> | main | 40 | 4.353 ns | 1.00 |
| Span<Char> | PR | 40 | 7.413 ns | 1.04 |
| Span<Char> | main | 40 | 7.146 ns | 1.00 |
| Span<Int32> | PR | 40 | 8.120 ns | 1.63 |
| Span<Int32> | main | 40 | 5.012 ns | 1.00 |
| Span<Byte> | PR | 512 | 11.273 ns | 1.14 |
| Span<Byte> | main | 512 | 9.892 ns | 1.00 |
| Span<Char> | PR | 512 | 19.102 ns | 1.19 |
| Span<Char> | main | 512 | 16.119 ns | 1.00 |
| Span<Int32> | PR | 512 | 30.309 ns | 1.01 |
| Span<Int32> | main | 512 | 30.154 ns | 1.00 |
Thanks @adamsitnik for taking this PR to conclusion.
No description provided.