Skip to content

Commit 13840dd

Browse files
ebiggers authored and Boringssl LUCI CQ committed
Add explicit prefetching to the new AES-GCM code
Add explicit prefetching to the main loop of the new AES-GCM code, following the same rationale as change I6312e01ff0da70cc52f09194846b82cc6b69d37a. For now, the same prefetch distance of 512 bytes is used.

Change-Id: Ib57affb414e88675f3a4c8e124728a0cf412bc0a
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/75267
Reviewed-by: David Benjamin <[email protected]>
Commit-Queue: David Benjamin <[email protected]>
1 parent 21f54b2 commit 13840dd

8 files changed: 49 additions and 0 deletions.

crypto/fipsmodule/modes/asm/aes-gcm-avx10-x86_64.pl

+7
Original file line number | Diff line number | Diff line change
@@ -1118,6 +1118,13 @@ sub _aes_gcm_update {
11181118
.Laes128$local_label_suffix:
11191119
___
11201120

1121+
# Prefetch the source data 512 bytes ahead into the L1 data cache, to
1122+
# improve performance when the hardware prefetcher is disabled. Assumes the
1123+
# L1 data cache line size is 64 bytes (de facto standard on x86_64).
1124+
for ( my $i = 0 ; $i < 4 * $VL ; $i += 64 ) {
1125+
$code .= "prefetcht0 512+$i($SRC)\n";
1126+
}
1127+
11211128
# Finish the AES encryption of the counter blocks in V0-V3, interleaved
11221129
# with the GHASH update of the ciphertext blocks in GHASHDATA[0-3].
11231130
for my $i ( reverse 1 .. 9 ) {

crypto/fipsmodule/modes/asm/aes-gcm-avx2-x86_64.pl

+6
Original file line number | Diff line number | Diff line change
@@ -805,6 +805,12 @@ sub _aes_gcm_update {
805805
.Laes128$local_label_suffix:
806806
___
807807

808+
# Prefetch the source data 512 bytes ahead into the L1 data cache, to
809+
# improve performance when the hardware prefetcher is disabled. Assumes the
810+
# L1 data cache line size is 64 bytes (de facto standard on x86_64).
811+
$code .= "prefetcht0 512($SRC)\n";
812+
$code .= "prefetcht0 512+64($SRC)\n";
813+
808814
# Finish the AES encryption of the counter blocks in AESDATA[0-3],
809815
# interleaved with the GHASH update of the ciphertext blocks.
810816
for my $i ( reverse 1 .. 9 ) {

gen/bcm/aes-gcm-avx10-x86_64-apple.S

+8
Original file line number | Diff line number | Diff line change
@@ -512,6 +512,10 @@ L$aes192__func1:
512512
vaesenc %zmm9,%zmm3,%zmm3
513513

514514
L$aes128__func1:
515+
prefetcht0 512+0(%rdi)
516+
prefetcht0 512+64(%rdi)
517+
prefetcht0 512+128(%rdi)
518+
prefetcht0 512+192(%rdi)
515519
vpshufb %zmm8,%zmm4,%zmm4
516520
vpxord %zmm10,%zmm4,%zmm4
517521
vpshufb %zmm8,%zmm5,%zmm5
@@ -953,6 +957,10 @@ L$aes192__func2:
953957
vaesenc %zmm9,%zmm3,%zmm3
954958

955959
L$aes128__func2:
960+
prefetcht0 512+0(%rdi)
961+
prefetcht0 512+64(%rdi)
962+
prefetcht0 512+128(%rdi)
963+
prefetcht0 512+192(%rdi)
956964
vpshufb %zmm8,%zmm4,%zmm4
957965
vpxord %zmm10,%zmm4,%zmm4
958966
vpshufb %zmm8,%zmm5,%zmm5

gen/bcm/aes-gcm-avx10-x86_64-linux.S

+8
Original file line number | Diff line number | Diff line change
@@ -514,6 +514,10 @@ _CET_ENDBR
514514
vaesenc %zmm9,%zmm3,%zmm3
515515

516516
.Laes128__func1:
517+
prefetcht0 512+0(%rdi)
518+
prefetcht0 512+64(%rdi)
519+
prefetcht0 512+128(%rdi)
520+
prefetcht0 512+192(%rdi)
517521
vpshufb %zmm8,%zmm4,%zmm4
518522
vpxord %zmm10,%zmm4,%zmm4
519523
vpshufb %zmm8,%zmm5,%zmm5
@@ -957,6 +961,10 @@ _CET_ENDBR
957961
vaesenc %zmm9,%zmm3,%zmm3
958962

959963
.Laes128__func2:
964+
prefetcht0 512+0(%rdi)
965+
prefetcht0 512+64(%rdi)
966+
prefetcht0 512+128(%rdi)
967+
prefetcht0 512+192(%rdi)
960968
vpshufb %zmm8,%zmm4,%zmm4
961969
vpxord %zmm10,%zmm4,%zmm4
962970
vpshufb %zmm8,%zmm5,%zmm5

gen/bcm/aes-gcm-avx10-x86_64-win.asm

+8
Original file line number | Diff line number | Diff line change
@@ -579,6 +579,10 @@ $L$aes192__func1:
579579
vaesenc zmm3,zmm3,zmm9
580580

581581
$L$aes128__func1:
582+
prefetcht0 [((512+0))+rcx]
583+
prefetcht0 [((512+64))+rcx]
584+
prefetcht0 [((512+128))+rcx]
585+
prefetcht0 [((512+192))+rcx]
582586
vpshufb zmm4,zmm4,zmm8
583587
vpxord zmm4,zmm4,zmm10
584588
vpshufb zmm5,zmm5,zmm8
@@ -1061,6 +1065,10 @@ $L$aes192__func2:
10611065
vaesenc zmm3,zmm3,zmm9
10621066

10631067
$L$aes128__func2:
1068+
prefetcht0 [((512+0))+rcx]
1069+
prefetcht0 [((512+64))+rcx]
1070+
prefetcht0 [((512+128))+rcx]
1071+
prefetcht0 [((512+192))+rcx]
10641072
vpshufb zmm4,zmm4,zmm8
10651073
vpxord zmm4,zmm4,zmm10
10661074
vpshufb zmm5,zmm5,zmm8

gen/bcm/aes-gcm-avx2-x86_64-apple.S

+4
Original file line number | Diff line number | Diff line change
@@ -498,6 +498,8 @@ L$aes192__func1:
498498
vaesenc %ymm2,%ymm15,%ymm15
499499

500500
L$aes128__func1:
501+
prefetcht0 512(%rdi)
502+
prefetcht0 512+64(%rdi)
501503

502504
vmovdqu 0(%rsi),%ymm3
503505
vpshufb %ymm0,%ymm3,%ymm3
@@ -983,6 +985,8 @@ L$aes192__func2:
983985
vaesenc %ymm2,%ymm15,%ymm15
984986

985987
L$aes128__func2:
988+
prefetcht0 512(%rdi)
989+
prefetcht0 512+64(%rdi)
986990

987991
vmovdqu 0(%rdi),%ymm3
988992
vpshufb %ymm0,%ymm3,%ymm3

gen/bcm/aes-gcm-avx2-x86_64-linux.S

+4
Original file line number | Diff line number | Diff line change
@@ -500,6 +500,8 @@ _CET_ENDBR
500500
vaesenc %ymm2,%ymm15,%ymm15
501501

502502
.Laes128__func1:
503+
prefetcht0 512(%rdi)
504+
prefetcht0 512+64(%rdi)
503505

504506
vmovdqu 0(%rsi),%ymm3
505507
vpshufb %ymm0,%ymm3,%ymm3
@@ -987,6 +989,8 @@ _CET_ENDBR
987989
vaesenc %ymm2,%ymm15,%ymm15
988990

989991
.Laes128__func2:
992+
prefetcht0 512(%rdi)
993+
prefetcht0 512+64(%rdi)
990994

991995
vmovdqu 0(%rdi),%ymm3
992996
vpshufb %ymm0,%ymm3,%ymm3

gen/bcm/aes-gcm-avx2-x86_64-win.asm

+4
Original file line number | Diff line number | Diff line change
@@ -559,6 +559,8 @@ $L$aes192__func1:
559559
vaesenc ymm15,ymm15,ymm2
560560

561561
$L$aes128__func1:
562+
prefetcht0 [512+rcx]
563+
prefetcht0 [((512+64))+rcx]
562564

563565
vmovdqu ymm3,YMMWORD[rdx]
564566
vpshufb ymm3,ymm3,ymm0
@@ -1085,6 +1087,8 @@ $L$aes192__func2:
10851087
vaesenc ymm15,ymm15,ymm2
10861088

10871089
$L$aes128__func2:
1090+
prefetcht0 [512+rcx]
1091+
prefetcht0 [((512+64))+rcx]
10881092

10891093
vmovdqu ymm3,YMMWORD[rcx]
10901094
vpshufb ymm3,ymm3,ymm0

0 commit comments

Comments
 (0)