Skip to content

Commit 0ed27fb

Browse files
committed
Always end BN_mod_exp_mont_consttime with normal Montgomery reduction.
This partially fixes a bug where, on x86_64, BN_mod_exp_mont_consttime would sometimes return m, the modulus, when it should have returned zero. Thanks to Guido Vranken for reporting it. It is only a partial fix because the same bug also exists in the "rsaz" codepath. The bug only affects zero outputs (with non-zero inputs), so we believe it has no security impact on our cryptographic functions. The fx is to delete lowercase bn_from_montgomery altogether, and have the mont5 path use the same BN_from_montgomery ending as the non-mont5 path. This only impacts the final step of the whole exponentiation and has no measurable perf impact. See the original BoringSSL commit https://boringssl.googlesource.com/boringssl/+/13c9d5c69d04485a7a8840c12185c832026c8315 for further analysis. Original-author: David Benjamin <[email protected]> Reviewed-by: Matt Caswell <[email protected]> Reviewed-by: Paul Dale <[email protected]> (Merged from openssl#18511)
1 parent 8f07881 commit 0ed27fb

File tree

3 files changed

+93
-214
lines changed

3 files changed

+93
-214
lines changed

crypto/bn/asm/x86_64-mont5.pl

Lines changed: 0 additions & 196 deletions
Original file line numberDiff line numberDiff line change
@@ -2101,193 +2101,6 @@
21012101
.size __bn_post4x_internal,.-__bn_post4x_internal
21022102
___
21032103
}
2104-
{
2105-
$code.=<<___;
2106-
.globl bn_from_montgomery
2107-
.type bn_from_montgomery,\@abi-omnipotent
2108-
.align 32
2109-
bn_from_montgomery:
2110-
.cfi_startproc
2111-
testl \$7,`($win64?"48(%rsp)":"%r9d")`
2112-
jz bn_from_mont8x
2113-
xor %eax,%eax
2114-
ret
2115-
.cfi_endproc
2116-
.size bn_from_montgomery,.-bn_from_montgomery
2117-
2118-
.type bn_from_mont8x,\@function,6
2119-
.align 32
2120-
bn_from_mont8x:
2121-
.cfi_startproc
2122-
.byte 0x67
2123-
mov %rsp,%rax
2124-
.cfi_def_cfa_register %rax
2125-
push %rbx
2126-
.cfi_push %rbx
2127-
push %rbp
2128-
.cfi_push %rbp
2129-
push %r12
2130-
.cfi_push %r12
2131-
push %r13
2132-
.cfi_push %r13
2133-
push %r14
2134-
.cfi_push %r14
2135-
push %r15
2136-
.cfi_push %r15
2137-
.Lfrom_prologue:
2138-
2139-
shl \$3,${num}d # convert $num to bytes
2140-
lea ($num,$num,2),%r10 # 3*$num in bytes
2141-
neg $num
2142-
mov ($n0),$n0 # *n0
2143-
2144-
##############################################################
2145-
# Ensure that stack frame doesn't alias with $rptr+3*$num
2146-
# modulo 4096, which covers ret[num], am[num] and n[num]
2147-
# (see bn_exp.c). The stack is allocated to aligned with
2148-
# bn_power5's frame, and as bn_from_montgomery happens to be
2149-
# last operation, we use the opportunity to cleanse it.
2150-
#
2151-
lea -320(%rsp,$num,2),%r11
2152-
mov %rsp,%rbp
2153-
sub $rptr,%r11
2154-
and \$4095,%r11
2155-
cmp %r11,%r10
2156-
jb .Lfrom_sp_alt
2157-
sub %r11,%rbp # align with $aptr
2158-
lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
2159-
jmp .Lfrom_sp_done
2160-
2161-
.align 32
2162-
.Lfrom_sp_alt:
2163-
lea 4096-320(,$num,2),%r10
2164-
lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
2165-
sub %r10,%r11
2166-
mov \$0,%r10
2167-
cmovc %r10,%r11
2168-
sub %r11,%rbp
2169-
.Lfrom_sp_done:
2170-
and \$-64,%rbp
2171-
mov %rsp,%r11
2172-
sub %rbp,%r11
2173-
and \$-4096,%r11
2174-
lea (%rbp,%r11),%rsp
2175-
mov (%rsp),%r10
2176-
cmp %rbp,%rsp
2177-
ja .Lfrom_page_walk
2178-
jmp .Lfrom_page_walk_done
2179-
2180-
.Lfrom_page_walk:
2181-
lea -4096(%rsp),%rsp
2182-
mov (%rsp),%r10
2183-
cmp %rbp,%rsp
2184-
ja .Lfrom_page_walk
2185-
.Lfrom_page_walk_done:
2186-
2187-
mov $num,%r10
2188-
neg $num
2189-
2190-
##############################################################
2191-
# Stack layout
2192-
#
2193-
# +0 saved $num, used in reduction section
2194-
# +8 &t[2*$num], used in reduction section
2195-
# +32 saved *n0
2196-
# +40 saved %rsp
2197-
# +48 t[2*$num]
2198-
#
2199-
mov $n0, 32(%rsp)
2200-
mov %rax, 40(%rsp) # save original %rsp
2201-
.cfi_cfa_expression %rsp+40,deref,+8
2202-
.Lfrom_body:
2203-
mov $num,%r11
2204-
lea 48(%rsp),%rax
2205-
pxor %xmm0,%xmm0
2206-
jmp .Lmul_by_1
2207-
2208-
.align 32
2209-
.Lmul_by_1:
2210-
movdqu ($aptr),%xmm1
2211-
movdqu 16($aptr),%xmm2
2212-
movdqu 32($aptr),%xmm3
2213-
movdqa %xmm0,(%rax,$num)
2214-
movdqu 48($aptr),%xmm4
2215-
movdqa %xmm0,16(%rax,$num)
2216-
.byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 # lea 64($aptr),$aptr
2217-
movdqa %xmm1,(%rax)
2218-
movdqa %xmm0,32(%rax,$num)
2219-
movdqa %xmm2,16(%rax)
2220-
movdqa %xmm0,48(%rax,$num)
2221-
movdqa %xmm3,32(%rax)
2222-
movdqa %xmm4,48(%rax)
2223-
lea 64(%rax),%rax
2224-
sub \$64,%r11
2225-
jnz .Lmul_by_1
2226-
2227-
movq $rptr,%xmm1
2228-
movq $nptr,%xmm2
2229-
.byte 0x67
2230-
mov $nptr,%rbp
2231-
movq %r10, %xmm3 # -num
2232-
___
2233-
$code.=<<___ if ($addx);
2234-
mov OPENSSL_ia32cap_P+8(%rip),%r11d
2235-
and \$0x80108,%r11d
2236-
cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1
2237-
jne .Lfrom_mont_nox
2238-
2239-
lea (%rax,$num),$rptr
2240-
call __bn_sqrx8x_reduction
2241-
call __bn_postx4x_internal
2242-
2243-
pxor %xmm0,%xmm0
2244-
lea 48(%rsp),%rax
2245-
jmp .Lfrom_mont_zero
2246-
2247-
.align 32
2248-
.Lfrom_mont_nox:
2249-
___
2250-
$code.=<<___;
2251-
call __bn_sqr8x_reduction
2252-
call __bn_post4x_internal
2253-
2254-
pxor %xmm0,%xmm0
2255-
lea 48(%rsp),%rax
2256-
jmp .Lfrom_mont_zero
2257-
2258-
.align 32
2259-
.Lfrom_mont_zero:
2260-
mov 40(%rsp),%rsi # restore %rsp
2261-
.cfi_def_cfa %rsi,8
2262-
movdqa %xmm0,16*0(%rax)
2263-
movdqa %xmm0,16*1(%rax)
2264-
movdqa %xmm0,16*2(%rax)
2265-
movdqa %xmm0,16*3(%rax)
2266-
lea 16*4(%rax),%rax
2267-
sub \$32,$num
2268-
jnz .Lfrom_mont_zero
2269-
2270-
mov \$1,%rax
2271-
mov -48(%rsi),%r15
2272-
.cfi_restore %r15
2273-
mov -40(%rsi),%r14
2274-
.cfi_restore %r14
2275-
mov -32(%rsi),%r13
2276-
.cfi_restore %r13
2277-
mov -24(%rsi),%r12
2278-
.cfi_restore %r12
2279-
mov -16(%rsi),%rbp
2280-
.cfi_restore %rbp
2281-
mov -8(%rsi),%rbx
2282-
.cfi_restore %rbx
2283-
lea (%rsi),%rsp
2284-
.cfi_def_cfa_register %rsp
2285-
.Lfrom_epilogue:
2286-
ret
2287-
.cfi_endproc
2288-
.size bn_from_mont8x,.-bn_from_mont8x
2289-
___
2290-
}
22912104
}}}
22922105

22932106
if ($addx) {{{
@@ -3894,10 +3707,6 @@
38943707
.rva .LSEH_begin_bn_power5
38953708
.rva .LSEH_end_bn_power5
38963709
.rva .LSEH_info_bn_power5
3897-
3898-
.rva .LSEH_begin_bn_from_mont8x
3899-
.rva .LSEH_end_bn_from_mont8x
3900-
.rva .LSEH_info_bn_from_mont8x
39013710
___
39023711
$code.=<<___ if ($addx);
39033712
.rva .LSEH_begin_bn_mulx4x_mont_gather5
@@ -3929,11 +3738,6 @@
39293738
.byte 9,0,0,0
39303739
.rva mul_handler
39313740
.rva .Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue # HandlerData[]
3932-
.align 8
3933-
.LSEH_info_bn_from_mont8x:
3934-
.byte 9,0,0,0
3935-
.rva mul_handler
3936-
.rva .Lfrom_prologue,.Lfrom_body,.Lfrom_epilogue # HandlerData[]
39373741
___
39383742
$code.=<<___ if ($addx);
39393743
.align 8

crypto/bn/bn_exp.c

Lines changed: 26 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -900,14 +900,21 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
900900
#if defined(OPENSSL_BN_ASM_MONT5)
901901
if (window == 5 && top > 1) {
902902
/*
903-
* This optimization uses ideas from http://eprint.iacr.org/2011/239,
904-
* specifically optimization of cache-timing attack countermeasures
905-
* and pre-computation optimization.
906-
*/
907-
908-
/*
909-
* Dedicated window==4 case improves 512-bit RSA sign by ~15%, but as
910-
* 512-bit RSA is hardly relevant, we omit it to spare size...
903+
* This optimization uses ideas from https://eprint.iacr.org/2011/239,
904+
* specifically optimization of cache-timing attack countermeasures,
905+
* pre-computation optimization, and Almost Montgomery Multiplication.
906+
*
907+
* The paper discusses a 4-bit window to optimize 512-bit modular
908+
* exponentiation, used in RSA-1024 with CRT, but RSA-1024 is no longer
909+
* important.
910+
*
911+
* |bn_mul_mont_gather5| and |bn_power5| implement the "almost"
912+
* reduction variant, so the values here may not be fully reduced.
913+
* They are bounded by R (i.e. they fit in |top| words), not |m|.
914+
* Additionally, we pass these "almost" reduced inputs into
915+
* |bn_mul_mont|, which implements the normal reduction variant.
916+
* Given those inputs, |bn_mul_mont| may not give reduced
917+
* output, but it will still produce "almost" reduced output.
911918
*/
912919
void bn_mul_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap,
913920
const void *table, const BN_ULONG *np,
@@ -919,9 +926,6 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
919926
const void *table, const BN_ULONG *np,
920927
const BN_ULONG *n0, int num, int power);
921928
int bn_get_bits5(const BN_ULONG *ap, int off);
922-
int bn_from_montgomery(BN_ULONG *rp, const BN_ULONG *ap,
923-
const BN_ULONG *not_used, const BN_ULONG *np,
924-
const BN_ULONG *n0, int num);
925929

926930
BN_ULONG *n0 = mont->n0, *np;
927931

@@ -1010,14 +1014,18 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
10101014
}
10111015
}
10121016

1013-
ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np, n0, top);
10141017
tmp.top = top;
1015-
bn_correct_top(&tmp);
1016-
if (ret) {
1017-
if (!BN_copy(rr, &tmp))
1018-
ret = 0;
1019-
goto err; /* non-zero ret means it's not error */
1020-
}
1018+
/*
1019+
* The result is now in |tmp| in Montgomery form, but it may not be
1020+
* fully reduced. This is within bounds for |BN_from_montgomery|
1021+
* (tmp < R <= m*R) so it will, when converting from Montgomery form,
1022+
* produce a fully reduced result.
1023+
*
1024+
* This differs from Figure 2 of the paper, which uses AMM(h, 1) to
1025+
* convert from Montgomery form with unreduced output, followed by an
1026+
* extra reduction step. In the paper's terminology, we replace
1027+
* steps 9 and 10 with MM(h, 1).
1028+
*/
10211029
} else
10221030
#endif
10231031
{

0 commit comments

Comments
 (0)