Fix #10513 - powmod is slow for ulong#10688
Conversation
|
Thanks for your pull request and interest in making D better, @Aditya-132! We are looking forward to reviewing it, and you should be hearing from a maintainer soon.
Please see CONTRIBUTING.md for more information. If you have addressed all reviews or aren't sure how to proceed, don't hesitate to ping us with a simple comment. Bugzilla referencesYour PR doesn't reference any Bugzilla issue. If your PR contains non-trivial changes, please reference a Bugzilla issue or create a manual changelog. Testing this PR locallyIf you don't have a local development environment setup, you can use Digger to test this PR: dub run digger -- build "master + phobos#10688" |
|
@Aditya-132 before I go through it, if you can run and post benchmarks, that'll help with review. Out of curiosity, would you also be able to.do gcc-style asm for gdc and ldc? |
sure |
|
Please give the PR and commit titles a description based on the issues this solves e.g. |
can you provide any example which is already present in code |
instead of modifying powmod we can diredctly modify mulmod |
So, unlike dmd iasm which As all you're doing is mul, and taking the result from two registers, the equivalent should be |
TESTING RESULTScript which i use for testingimport std.stdio;
import std.datetime.stopwatch;
import std.random;
import std.traits;
import std.meta;
import std.algorithm;
import core.time;
// First implementation from the pasted function
static T mulmod1(T)(T a, T b, T c) {
static if (T.sizeof == 8) {
version (D_InlineAsm_X86_64) {
// Fast path for small values
ulong low = void;
ulong high = void;
// Perform 128-bit multiplication: a * b = [high:low]
asm pure @trusted nothrow @nogc
{
mov RAX, a; // Load a into RAX
mul b; // Multiply by b (RDX:RAX = a * b)
mov low, RAX; // Store low 64 bits
mov high, RDX; // Store high 64 bits
}
// Handle overflow for large values
if (high >= c) {
high %= c;
}
if (high == 0) {
return low % c;
}
// Reduce the high 64 bits modulo mod
asm pure @trusted nothrow @nogc
{
mov RAX, high; // Load high part
xor RDX, RDX; // Clear RDX for division
div c; // RAX = high / mod, RDX = high % mod
mov high, RDX; // Store remainder (high % mod)
mov RAX, low; // Load low part
mov RDX, high; // Load reduced high part
div c; // Divide full 128-bit number by mod
mov low, RDX; // Store remainder (final result)
}
return low; // Return (a * b) % mod
}
else {
// Fallback for non-x86_64 platforms
T result = 0;
a %= c;
b %= c;
while (b > 0) {
if (b & 1) {
result = (result + a) % c;
}
a = (a * 2) % c;
b >>= 1;
}
return result;
}
}
else static if (T.sizeof == 4) {
version (D_InlineAsm_X86) {
// Fast path for small values
uint low = void; // Changed to uint for 32-bit
uint high = void; // Changed to uint for 32-bit
// Perform 64-bit multiplication: a * b = [high:low]
asm pure @trusted nothrow @nogc
{
mov EAX, a; // Load a into EAX
mul b; // Multiply by b
mov low, EAX; // Store low bits
mov high, EDX; // Store high bits
}
// Handle overflow for large values
if (high >= c) {
high %= c;
}
if (high == 0) {
return low % c;
}
// Reduce the high bits modulo mod
asm pure @trusted nothrow @nogc
{
mov EAX, high; // Load high part
xor EDX, EDX; // Clear EDX for division
div c; // EAX = high / mod, EDX = high % mod
mov high, EDX; // Store remainder (high % mod)
mov EAX, low; // Load low part
mov EDX, high; // Load reduced high part
div c; // Divide full number by mod
mov low, EDX; // Store remainder (final result)
}
return low; // Return (a * b) % mod
}
else {
// Use 64-bit type for the calculation
ulong result = (cast(ulong)a * cast(ulong)b) % c;
return cast(T)result;
}
}
else {
// Fallback for other sizes
import std.bigint;
return cast(T)((BigInt(a) * BigInt(b)) % BigInt(c));
}
}
// Second implementation from function provided in the query
template Largest(T...) {
static if (T.length == 1)
alias Largest = T[0];
else static if (T[0].sizeof > Largest!(T[1..$]).sizeof)
alias Largest = T[0];
else
alias Largest = Largest!(T[1..$]);
}
static T mulmod2(T)(T a, T b, T c) {
static if (T.sizeof == 8) {
static T addmod(T a, T b, T c) {
b = c - b;
if (a >= b)
return a - b;
else
return c - b + a;
}
T result = 0;
b %= c;
while (a > 0) {
if (a & 1)
result = addmod(result, b, c);
a >>= 1;
b = addmod(b, b, c);
}
return result;
}
else {
alias DoubleT = AliasSeq!(void, ushort, uint, void, ulong)[T.sizeof];
DoubleT result = cast(DoubleT) (cast(DoubleT) a * cast(DoubleT) b);
return result % c;
}
}
// FIX: Explicitly specify the return type
F powmod1(F, G, H)(F x, G n, H m)
if (isUnsigned!F && isUnsigned!G && isUnsigned!H)
{
alias ReturnT = std.traits.Unqual!(Largest!(F, H));
ReturnT base = cast(ReturnT)(x % m);
ReturnT result = 1;
ReturnT modulus = cast(ReturnT)m;
std.traits.Unqual!G exponent = n;
while (exponent > 0) {
if (exponent & 1)
result = mulmod1!ReturnT(result, base, modulus);
base = mulmod1!ReturnT(base, base, modulus);
exponent >>= 1;
}
return cast(F)result;
}
// FIX: Explicitly specify the return type
F powmod2(F, G, H)(F x, G n, H m)
if (isUnsigned!F && isUnsigned!G && isUnsigned!H)
{
alias ReturnT = std.traits.Unqual!(Largest!(F, H));
ReturnT base = cast(ReturnT)(x % m);
ReturnT result = 1;
ReturnT modulus = cast(ReturnT)m;
std.traits.Unqual!G exponent = n;
while (exponent > 0) {
if (exponent & 1)
result = mulmod2!ReturnT(result, base, modulus);
base = mulmod2!ReturnT(base, base, modulus);
exponent >>= 1;
}
return cast(F)result;
}
// DELETED conflicting Unqual template
void main() {
// Run tests first to ensure correctness
writeln("VERIFICATION TESTS:");
runTests();
// Test for 32-bit integers
writeln("\nBENCHMARK 1: Testing mulmod implementations");
writeln("Testing with 32-bit integers:");
benchmarkMulmod!(uint)();
// Test for 64-bit integers
writeln("\nTesting with 64-bit integers:");
benchmarkMulmod!(ulong)();
// Test powmod implementations
writeln("\n\nBENCHMARK 2: Testing powmod implementations");
writeln("Testing with 32-bit integers:");
benchmarkPowmod!(uint)();
writeln("\nTesting with 64-bit integers:");
benchmarkPowmod!(ulong)();
}
void runTests() {
writeln("Running unit tests...");
// Test cases provided in the query
// First set of tests
assert(powmod1(1U, 10U, 3U) == 1);
assert(powmod1(3U, 2U, 6U) == 3);
assert(powmod1(5U, 5U, 15U) == 5);
assert(powmod1(2U, 3U, 5U) == 3);
assert(powmod1(2U, 4U, 5U) == 1);
assert(powmod1(2U, 5U, 5U) == 2);
// Second implementation
assert(powmod2(1U, 10U, 3U) == 1);
assert(powmod2(3U, 2U, 6U) == 3);
assert(powmod2(5U, 5U, 15U) == 5);
assert(powmod2(2U, 3U, 5U) == 3);
assert(powmod2(2U, 4U, 5U) == 1);
assert(powmod2(2U, 5U, 5U) == 2);
// Second set of tests - ulong
ulong a = 18446744073709551615u, b = 20u, c = 18446744073709551610u;
assert(powmod1(a, b, c) == 95367431640625u);
a = 100; b = 7919; c = 18446744073709551557u;
assert(powmod1(a, b, c) == 18223853583554725198u);
a = 117; b = 7919; c = 18446744073709551557u;
assert(powmod1(a, b, c) == 11493139548346411394u);
a = 134; b = 7919; c = 18446744073709551557u;
assert(powmod1(a, b, c) == 10979163786734356774u);
a = 151; b = 7919; c = 18446744073709551557u;
assert(powmod1(a, b, c) == 7023018419737782840u);
a = 168; b = 7919; c = 18446744073709551557u;
assert(powmod1(a, b, c) == 58082701842386811u);
a = 185; b = 7919; c = 18446744073709551557u;
assert(powmod1(a, b, c) == 17423478386299876798u);
a = 202; b = 7919; c = 18446744073709551557u;
assert(powmod1(a, b, c) == 5522733478579799075u);
a = 219; b = 7919; c = 18446744073709551557u;
assert(powmod1(a, b, c) == 15230218982491623487u);
a = 236; b = 7919; c = 18446744073709551557u;
assert(powmod1(a, b, c) == 5198328724976436000u);
a = 0; b = 7919; c = 18446744073709551557u;
assert(powmod1(a, b, c) == 0);
a = 123; b = 0; c = 18446744073709551557u;
assert(powmod1(a, b, c) == 1);
immutable ulong a1 = 253, b1 = 7919, c1 = 18446744073709551557u;
assert(powmod1(a1, b1, c1) == 3883707345459248860u);
// Second implementation
a = 18446744073709551615u; b = 20u; c = 18446744073709551610u;
assert(powmod2(a, b, c) == 95367431640625u);
a = 100; b = 7919; c = 18446744073709551557u;
assert(powmod2(a, b, c) == 18223853583554725198u);
a = 117; b = 7919; c = 18446744073709551557u;
assert(powmod2(a, b, c) == 11493139548346411394u);
a = 134; b = 7919; c = 18446744073709551557u;
assert(powmod2(a, b, c) == 10979163786734356774u);
a = 151; b = 7919; c = 18446744073709551557u;
assert(powmod2(a, b, c) == 7023018419737782840u);
a = 168; b = 7919; c = 18446744073709551557u;
assert(powmod2(a, b, c) == 58082701842386811u);
a = 185; b = 7919; c = 18446744073709551557u;
assert(powmod2(a, b, c) == 17423478386299876798u);
a = 202; b = 7919; c = 18446744073709551557u;
assert(powmod2(a, b, c) == 5522733478579799075u);
a = 219; b = 7919; c = 18446744073709551557u;
assert(powmod2(a, b, c) == 15230218982491623487u);
a = 236; b = 7919; c = 18446744073709551557u;
assert(powmod2(a, b, c) == 5198328724976436000u);
a = 0; b = 7919; c = 18446744073709551557u;
assert(powmod2(a, b, c) == 0);
a = 123; b = 0; c = 18446744073709551557u;
assert(powmod2(a, b, c) == 1);
assert(powmod2(a1, b1, c1) == 3883707345459248860u);
// Third set of tests - uint
uint x = 100, y = 7919, z = 1844674407u;
assert(powmod1(x, y, z) == 1613100340u);
x = 134; y = 7919; z = 1844674407u;
assert(powmod1(x, y, z) == 734956622u);
x = 151; y = 7919; z = 1844674407u;
assert(powmod1(x, y, z) == 1738696945u);
x = 168; y = 7919; z = 1844674407u;
assert(powmod1(x, y, z) == 1247580927u);
x = 185; y = 7919; z = 1844674407u;
assert(powmod1(x, y, z) == 1293855176u);
x = 202; y = 7919; z = 1844674407u;
assert(powmod1(x, y, z) == 1566963682u);
x = 219; y = 7919; z = 1844674407u;
assert(powmod1(x, y, z) == 181227807u);
x = 236; y = 7919; z = 1844674407u;
assert(powmod1(x, y, z) == 217988321u);
x = 253; y = 7919; z = 1844674407u;
assert(powmod1(x, y, z) == 1588843243u);
x = 0; y = 7919; z = 184467u;
assert(powmod1(x, y, z) == 0);
x = 123; y = 0; z = 1844674u;
assert(powmod1(x, y, z) == 1);
// Second implementation
x = 100; y = 7919; z = 1844674407u;
assert(powmod2(x, y, z) == 1613100340u);
x = 134; y = 7919; z = 1844674407u;
assert(powmod2(x, y, z) == 734956622u);
x = 151; y = 7919; z = 1844674407u;
assert(powmod2(x, y, z) == 1738696945u);
x = 168; y = 7919; z = 1844674407u;
assert(powmod2(x, y, z) == 1247580927u);
x = 185; y = 7919; z = 1844674407u;
assert(powmod2(x, y, z) == 1293855176u);
x = 202; y = 7919; z = 1844674407u;
assert(powmod2(x, y, z) == 1566963682u);
x = 219; y = 7919; z = 1844674407u;
assert(powmod2(x, y, z) == 181227807u);
x = 236; y = 7919; z = 1844674407u;
assert(powmod2(x, y, z) == 217988321u);
x = 253; y = 7919; z = 1844674407u;
assert(powmod2(x, y, z) == 1588843243u);
x = 0; y = 7919; z = 184467u;
assert(powmod2(x, y, z) == 0);
x = 123; y = 0; z = 1844674u;
assert(powmod2(x, y, z) == 1);
writeln("All tests passed!");
}
void benchmarkMulmod(T)() {
// Number of iterations for benchmarking
immutable iterations = 1_000_000;
// Generate random test cases
T[] a_values = new T[iterations];
T[] b_values = new T[iterations];
T[] m_values = new T[iterations];
auto rnd = Random(unpredictableSeed);
// Fill arrays with random values
for (size_t i = 0; i < iterations; i++) {
static if (is(T == ulong)) {
a_values[i] = uniform!("[]")(T.min, T.max / 2, rnd);
b_values[i] = uniform!("[]")(T.min, T.max / 2, rnd);
// Avoid modulus being too small or zero
m_values[i] = uniform!("[]")(T.max / 4, T.max - 1, rnd);
} else {
a_values[i] = uniform!("[]")(T.min, T.max, rnd);
b_values[i] = uniform!("[]")(T.min, T.max, rnd);
// Avoid modulus being too small or zero
m_values[i] = uniform!("[]")(T.max / 2, T.max - 1, rnd);
}
}
// Variables to store results to prevent compiler optimizations
T result1 = 0;
T result2 = 0;
// Test first implementation
writeln("Testing first implementation (ASM-based):");
auto sw1 = StopWatch(AutoStart.yes);
for (size_t i = 0; i < iterations; i++) {
result1 ^= mulmod1!T(a_values[i], b_values[i], m_values[i]);
}
sw1.stop();
// Test second implementation
writeln("Testing second implementation (Standard D):");
auto sw2 = StopWatch(AutoStart.yes);
for (size_t i = 0; i < iterations; i++) {
result2 ^= mulmod2!T(a_values[i], b_values[i], m_values[i]);
}
sw2.stop();
// Print results
writefln("First implementation (ASM): %s", sw1.peek());
writefln("Second implementation (D): %s", sw2.peek());
double ratio = (sw2.peek().total!"usecs" * 100.0 / sw1.peek().total!"usecs");
writefln("Performance ratio: %.2f%% (higher means ASM is faster)", ratio);
// Print dummy value to prevent optimization
writefln("Verification values: %s %s", result1, result2);
}
void benchmarkPowmod(T)() {
// Number of iterations for benchmarking
immutable iterations = 10_000;
// Generate random test cases
T[] x_values = new T[iterations];
T[] n_values = new T[iterations];
T[] m_values = new T[iterations];
auto rnd = Random(unpredictableSeed);
// Fill arrays with random values - using realistic ranges
for (size_t i = 0; i < iterations; i++) {
static if (is(T == ulong)) {
x_values[i] = uniform!("[]")(2, 1000000, rnd);
n_values[i] = uniform!("[]")(10, 10000, rnd);
// Avoid modulus being too small or zero
m_values[i] = uniform!("[]")(1000, 10000000, rnd);
} else {
x_values[i] = uniform!("[]")(2, 1000000, rnd);
n_values[i] = uniform!("[]")(10, 10000, rnd);
// Avoid modulus being too small or zero
m_values[i] = uniform!("[]")(1000, 10000000, rnd);
}
}
// Variables to store results to prevent compiler optimizations
T result1 = 0;
T result2 = 0;
// Test first implementation
writeln("Testing first powmod implementation (with ASM mulmod):");
auto sw1 = StopWatch(AutoStart.yes);
for (size_t i = 0; i < iterations; i++) {
result1 ^= powmod1!T(x_values[i], n_values[i], m_values[i]);
}
sw1.stop();
// Test second implementation
writeln("Testing second powmod implementation (with standard D mulmod):");
auto sw2 = StopWatch(AutoStart.yes);
for (size_t i = 0; i < iterations; i++) {
result2 ^= powmod2!T(x_values[i], n_values[i], m_values[i]);
}
sw2.stop();
// Print results
writefln("First implementation (ASM): %s", sw1.peek());
writefln("Second implementation (D): %s", sw2.peek());
double ratio = (sw2.peek().total!"usecs" * 100.0 / sw1.peek().total!"usecs");
writefln("Performance ratio: %.2f%% (higher means ASM is faster)", ratio);
// Print dummy value to prevent optimization
writefln("Verification values: %s %s", result1, result2);
} |
|
@ibuclaw can you explain why a = 100; b = 7919; c = 18446744073709551557u; |
|
Anyone has solution of it please tell me i cant unterstand this errors on why they are coming on x86 systems how much i figureout is that this occured due to 32 bits x86 system the value overflowed |
8c7d8c0 to
43c35de
Compare
Yes. The new fallback code is prone to overflow. The original implementation can't be made simpler by removing the |
i have a solution for that like we do in assembly we treat 16bit number in two register(in 8085) instaed of one and then perform operations on it . we can do same here by implementing 128 bits multipication using more registers in 32 bits (its complex) |
|
I don't understand why you need inline assembler for 32bit powmod, when the original D code should be sufficient? |
ok then i will make changes for it |
|
For the slow path, the Running the benchmark locally, the D code measured faster than the inline asm when the modulus was within this bounds. static T mulmod2(T)(T a, T b, T c) {
static if (T.sizeof == 8) {
static T addmod(T a, T b, T c) {
b = c - b;
if (a >= b)
return a - b;
else
return c - b + a;
}
T result = void; // <---- start new snip
if (c <= 0x100000000)
{
result = a * b;
return result % c;
}
result = 0; // <---- end new snip
b %= c;
while (a > 0) {
if (a & 1)
result = addmod(result, b, c);
a >>= 1;
b = addmod(b, b, c);
}
return result;
}
else {
alias DoubleT = AliasSeq!(void, ushort, uint, void, ulong)[T.sizeof];
DoubleT result = cast(DoubleT) (cast(DoubleT) a * cast(DoubleT) b);
return result % c;
}
} |
ok i agree then what is better way of optimizing it |
|
So I think it should be sufficient to optimize mulmod in the following way: static T mulmod(T a, T b, T c)
{
static if (T.sizeof == 8)
{
if (c <= 0x100000000)
{
T result = a * b;
return result % c;
}
T result = 0;
version (D_InlineAsm_X86_64)
{
// 128-bit mul with asm
}
else version (GNU_OR_LDC_X86_64)
{
// 128-bit mul with gcc extended asm
}
else
{
// slow addmod() method (does pragma(inline, true) help?)
}
return result;
}
else
{
// DoubleT method
}
}Does this make sense? |
It might even help when |
i think you are correct here this looks similiar to other open source language iplementation of function |
|
Sorry for inconvenience |
|
sorry for wasting your time with inLine ASM should have thought of using core.int128 earlier |
Co-authored-by: Iain Buclaw <[email protected]>
Co-authored-by: Iain Buclaw <[email protected]>
6edc6ab to
ef8a738
Compare
std/math/exponential.d
Outdated
| const product128 = mul(Cent(a), Cent(b)); | ||
| Cent centRemainder; | ||
| udivmod(product128, Cent(c), centRemainder); | ||
| return cast(T)(centRemainder.lo); |
There was a problem hiding this comment.
Sorry @Aditya-132, looks like @kinke wasn't done with changes to core.int128.
With that PR merged, the correct overloads to call are:
| const product128 = mul(Cent(a), Cent(b)); | |
| Cent centRemainder; | |
| udivmod(product128, Cent(c), centRemainder); | |
| return cast(T)(centRemainder.lo); | |
| const product128 = mul(a, b); | |
| T remainder = void; | |
| udivmod(product128, c, remainder); | |
| return remainder; |
Thanks for the patience. 🙇
There was a problem hiding this comment.
UFCS should also be possible
T remainder = void;
mul(a, b).udivmod(c, remainder);
return remainder;
There was a problem hiding this comment.
UFCS should also be possible
T remainder = void; mul(a, b).udivmod(c, remainder); return remainder;
its giving error
There was a problem hiding this comment.
Suggested change are also giving error
There was a problem hiding this comment.
DMD PR is merged now, please rebase on top of phobos/master.
There was a problem hiding this comment.
So the error is floating point exception?
There was overflow code between the mul and div once, but that disappeared for some reason?
auto product = mul(a, b);
if (product.hi >= c)
product.hi %= c;
T remainder = void;
udivmod(product, c, remainder);
return remainder;
There was a problem hiding this comment.
should i make above changes in code
There was a problem hiding this comment.
Yes, adding the overflow condition should allow you to use 64bit udivmod.
Co-authored-by: Iain Buclaw <[email protected]>
| else | ||
| { | ||
| if (a & 1) | ||
| result = addmod(result, b, c); | ||
|
|
||
| a >>= 1; | ||
| b = addmod(b, b, c); | ||
| import core.int128 : Cent, mul, udivmod; | ||
| auto product = mul(a, b); | ||
| if (product.hi >= c) | ||
| product.hi %= c; | ||
| T remainder = void; | ||
| udivmod(product, c, remainder); | ||
| return remainder; | ||
| } |
There was a problem hiding this comment.
Is this being rendered wrong or is indentation missing?
There was a problem hiding this comment.
done with correction
|
Is there any specific reason why it fails for [Main / FreeBSD 13.2 x64 (pull_request) |
Unrelated. |
Co-authored-by: Iain Buclaw <[email protected]>
The key optimizations I've made are:
128-bit multiplication via inline assembly:
For 64-bit types on x86_64 platforms, I've added direct use of hardware multiplication that produces a 128-bit result (stored in high:low registers)
This avoids the repeated addition loop in the original algorithm
Efficient modular reduction:
Implemented a reduction algorithm that handles the 128-bit result efficiently
Uses a simplified approach for when the high bits are zero
Early reduction:
The base value is reduced modulo m at the beginning, which improves performance when the base is large and modulus is small
Special case handling:
Added explicit handling for modulus=1 and exponent=0 cases up front
Version checks:
The code falls back to the original algorithm if inline assembly is not available
This ensures portability while providing optimizations where possible