-
Notifications
You must be signed in to change notification settings - Fork 3
Example transforms
Below is examples of how the output assembly code is transformed. I used the following compiler options to compile the C source code.
clang-10 -O3 -ffast-math -S example.c
clang-10 -Xclang -load -Xclang libMathPeephole.so -O3 -ffast-math -S example.c
This rewriting rule substitutes a/b + c/d with (ad + bc) / (bd).
For the following input C code,
double reduction_0(double a0, double a1, double a2, double a3) {
return a0 / a1 + a2 / a3;
}
Clang without the plugin generates assembly code that directly corresponds to the C source code, as shown below.
reduction_0: // @reduction_0
fdiv d0, d0, d1
fdiv d1, d2, d3
fadd d0, d1, d0
ret
By using MathPeephole optimization pass, the number of divisions is reduced to 1.
reduction_0: // @reduction_0
fmul d0, d3, d0
fmul d3, d3, d1
fmadd d0, d2, d1, d0
fdiv d0, d0, d3
ret
Below is another example.
double reduction_1(double a0, double a1, double a2, double a3, double a4, double a5) {
return a0 / a1 + a2 / a3 + a4 / a5;
}
Clang-10 without the plugin generates assembly code with 3 division instructions.
reduction_1: // @reduction_1
fdiv d0, d0, d1
fdiv d1, d2, d3
fdiv d2, d4, d5
fadd d0, d0, d2
fadd d0, d1, d0
ret
Only one division is required after the transform by MathPeephole.
reduction_1: // @reduction_1
fmul d0, d3, d0
fmadd d0, d2, d1, d0
fmul d3, d3, d1
fmul d0, d5, d0
fmul d1, d5, d3
fmadd d0, d4, d3, d0
fdiv d0, d0, d1
ret
This rewriting rule substitutes x/y + z > a/b + c with (xb - ay) / (yb) + z > c, and a/b + c > d with b < 0 ? (a - b * (d - c) < 0) : (-(a - b * (d - c)) < 0).
For the following input C code,
bool cmpdiv_0(double a0, double a1, double a2, double a3) {
return a0 / a1 + a2 < a3;
}
the normal clang generates assembly code that includes a division instruction.
cmpdiv_0: // @cmpdiv_0
fdiv d0, d0, d1
fadd d0, d0, d2
fcmp d0, d3
cset w0, lt
ret
MathPeephole can completely eliminate the division instruction in cases like this.
cmpdiv_0: // @cmpdiv_0
fsub d2, d3, d2
fmov x8, d0
fmul d0, d1, d2
fmov x9, d1
and x9, x9, #0x8000000000000000
fmov x10, d0
eor x8, x8, x9
eor x9, x10, x9
fmov d0, x8
fmov d1, x9
fcmp d0, d1
cset w0, lt
ret
The corresponding IR is shown below.
define dso_local zeroext i1 @cmpdiv_0(double %0, double %1, double %2, double %3) local_unnamed_addr #1 {
%5 = bitcast double %1 to i64
%6 = and i64 %5, -9223372036854775808
%7 = fsub fast double %3, %2
%8 = fmul fast double %7, %1
%9 = bitcast double %0 to i64
%10 = xor i64 %6, %9
%11 = bitcast i64 %10 to double
%12 = bitcast double %8 to i64
%13 = xor i64 %6, %12
%14 = bitcast i64 %13 to double
%15 = fcmp fast olt double %11, %14
ret i1 %15
}
Below is another example C code.
bool cmpdiv_1(double a0, double a1, double a2, double a3, double a4, double a5) {
return a0 / a1 + a2 / a3 < a4 / a5;
}
This is the code generated by clang-10 without a plugin.
cmpdiv_1: // @cmpdiv_1
fdiv d0, d0, d1
fdiv d1, d2, d3
fadd d0, d1, d0
fdiv d1, d4, d5
fcmp d0, d1
cset w0, lt
ret
There is no division instruction in the code generated with MathPeephole.
cmpdiv_1: // @cmpdiv_1
fmul d2, d5, d2
fmul d5, d5, d3
fnmsub d2, d4, d3, d2
fmul d0, d5, d0
fmul d3, d5, d1
fnmsub d0, d2, d1, d0
fmov x8, d3
and x8, x8, #0x8000000000000000
fmov x9, d0
eor x8, x9, x8
fmov d0, x8
fcmp d0, #0.0
cset w0, gt
ret
This rewriting rule substitutes w sqrt(x) + y > z with w >= 0 ? ((z < y) | (wwx > (z-y)(z-y))) : ((z <= y) & (wwx < (z-y)(z-y))). Although the resulting code is substantially longer than the original code, we are seeing the resulting code to be more efficient in many cases.
Below is an example including comparison of sqrt.
bool cmpsqrt_0(double a0, double a1) {
return 1.1 * sqrt(a0) + 2.2 < a1;
}
Below is the code generated by clang-10 without a plugin.
cmpsqrt_0: // @cmpsqrt_0
adrp x8, .LCPI4_0
adrp x9, .LCPI4_1
ldr d2, [x8, :lo12:.LCPI4_0]
ldr d3, [x9, :lo12:.LCPI4_1]
fsqrt d0, d0
fmadd d0, d0, d2, d3
fcmp d0, d1
cset w0, lt
ret
And this is the code generated with MathPeephole. Sqrt is eliminated.
cmpsqrt_0: // @cmpsqrt_0
adrp x8, .LCPI4_0
adrp x9, .LCPI4_1
ldr d2, [x8, :lo12:.LCPI4_0]
ldr d3, [x9, :lo12:.LCPI4_1]
fadd d1, d1, d2
fmul d0, d0, d3
fcmp d1, #0.0
fmul d1, d1, d1
cset w8, ge
fcmp d0, d1
cset w9, lt
and w0, w8, w9
ret
The corresponding IR is shown below.
define dso_local zeroext i1 @cmpsqrt_0(double %0, double %1) local_unnamed_addr #0 {
%3 = fadd fast double %1, -2.200000e+00
%4 = fcmp fast oge double %3, 0.000000e+00
%5 = fmul fast double %3, %3
%6 = fmul fast double %0, 0x3FF35C28F5C28F5D
%7 = fcmp fast olt double %6, %5
%8 = and i1 %4, %7
ret i1 %8
}
Below is another example including comparison of sqrt.
bool complex_0(double a0, double a1, double a2, double a3, double a4, double a5) {
return a0 / sqrt(a1) + a2 < a3;
}
And this is the code generated with MathPeephole. Both division and sqrt are eliminated.
complex_0: // @complex_0
fsub d2, d3, d2
fmul d1, d2, d1
fmov x9, d2
fmov x8, d0
fmul d1, d2, d1
and x10, x9, #0x8000000000000000
eor x8, x8, x10
fmov x11, d1
eor x10, x11, x10
fmov d1, x8
fcmp d1, #0.0
fmul d0, d1, d0
fmov d1, x10
asr x9, x9, #63
cset w8, lt
cset w10, ge
fcmp d1, d0
cset w11, gt
cmp x9, #0 // =0
and w9, w10, w11
cset w10, eq
and w8, w8, w10
orr w0, w9, w8
ret
Vectorized C source code can also be transformed.
typedef float float4 __attribute__((ext_vector_type(4)));
typedef int int4 __attribute__((ext_vector_type(4)));
static __attribute__((always_inline)) float4 sqrtf4(float4 x) {
return (float4) { sqrtf(x[0]), sqrtf(x[1]), sqrtf(x[2]), sqrtf(x[3]) };
}
int4 complexv_1(float4 a0, float4 a1, float4 a2, float4 a3, float4 a4, float4 a5) {
return a0 / a1 + sqrtf4(a2 + 1.1f) < a3 / (a4 + 1.2f) + (a5 + 1.3f) / a0;
}
complexv_1: // @complexv_1
mov w8, #52429
movk w8, #16268, lsl #16
dup v6.4s, w8
mov w8, #39322
movk w8, #16281, lsl #16
dup v7.4s, w8
mov w8, #26214
movk w8, #16294, lsl #16
fadd v4.4s, v4.4s, v7.4s
dup v7.4s, w8
fadd v2.4s, v2.4s, v6.4s
fmul v6.4s, v0.4s, v1.4s
fadd v5.4s, v5.4s, v7.4s
movi v7.4s, #128, lsl #24
fmul v1.4s, v5.4s, v1.4s
fmul v5.4s, v6.4s, v4.4s
fmul v3.4s, v6.4s, v3.4s
fmls v1.4s, v0.4s, v0.4s
and v0.16b, v5.16b, v7.16b
fmla v3.4s, v1.4s, v4.4s
eor v1.16b, v5.16b, v0.16b
eor v0.16b, v3.16b, v0.16b
and v3.16b, v1.16b, v7.16b
fmul v2.4s, v1.4s, v2.4s
ushr v4.4s, v1.4s, #31
eor v5.16b, v0.16b, v3.16b
fmul v1.4s, v1.4s, v2.4s
fmul v0.4s, v5.4s, v0.4s
eor v1.16b, v1.16b, v3.16b
fcmlt v2.4s, v5.4s, #0.0
fcmge v6.4s, v5.4s, #0.0
fcmgt v0.4s, v0.4s, v1.4s
and v0.16b, v6.16b, v0.16b
and v1.16b, v2.16b, v4.16b
orr v0.16b, v0.16b, v1.16b
shl v0.4s, v0.4s, #31
sshr v0.4s, v0.4s, #31
ret
The corresponding IR is shown below.
define dso_local <4 x i32> @complexv_1(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3, <4 x float> %4, <4 x float> %5) local_unnamed_addr #2 {
%7 = fadd fast <4 x float> %2, <float 0x3FF19999A0000000, float 0x3FF19999A0000000, float 0x3FF19999A0000000, float 0x3FF19999A0000000>
%8 = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %7)
%9 = fadd fast <4 x float> %4, <float 0x3FF3333340000000, float 0x3FF3333340000000, float 0x3FF3333340000000, float 0x3FF3333340000000>
%10 = fadd fast <4 x float> %5, <float 0x3FF4CCCCC0000000, float 0x3FF4CCCCC0000000, float 0x3FF4CCCCC0000000, float 0x3FF4CCCCC0000000>
%11 = fmul fast <4 x float> %10, %1
%12 = fmul fast <4 x float> %0, %0
%13 = fmul fast <4 x float> %0, %1
%14 = fsub fast <4 x float> %11, %12
%15 = fmul fast <4 x float> %14, %9
%16 = fmul fast <4 x float> %13, %3
%17 = fmul fast <4 x float> %13, %9
%18 = fadd fast <4 x float> %15, %16
%19 = bitcast <4 x float> %17 to <4 x i32>
%20 = and <4 x i32> %19, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
%21 = fmul fast <4 x float> %17, %8
%22 = bitcast <4 x float> %18 to <4 x i32>
%23 = xor <4 x i32> %20, %22
%24 = bitcast <4 x i32> %23 to <4 x float>
%25 = bitcast <4 x float> %21 to <4 x i32>
%26 = xor <4 x i32> %20, %25
%27 = bitcast <4 x i32> %26 to <4 x float>
%28 = fcmp fast ogt <4 x float> %24, %27
%29 = sext <4 x i1> %28 to <4 x i32>
ret <4 x i32> %29
}