Skip to content

Example transforms

Naoki Shibata edited this page Aug 17, 2020 · 1 revision

Below is examples of how the output assembly code is transformed. I used the following compiler options to compile the C source code.

clang-10 -O3 -ffast-math -S example.c
clang-10 -Xclang -load -Xclang libMathPeephole.so -O3 -ffast-math -S example.c

Eliminating FP division instructions by reducing fractions

This rewriting rule substitutes a/b + c/d with (ad + bc) / (bd).

For the following input C code,

double reduction_0(double a0, double a1, double a2, double a3) {
  return a0 / a1 + a2 / a3;
}

Clang without the plugin generates assembly code that directly corresponds to the C source code, as shown below.

reduction_0:                            // @reduction_0
        fdiv    d0, d0, d1
        fdiv    d1, d2, d3
        fadd    d0, d1, d0
        ret

By using MathPeephole optimization pass, the number of divisions is reduced to 1.

reduction_0:                            // @reduction_0
        fmul    d0, d3, d0
        fmul    d3, d3, d1
        fmadd   d0, d2, d1, d0
        fdiv    d0, d0, d3
        ret

Below is another example.

double reduction_1(double a0, double a1, double a2, double a3, double a4, double a5) {
  return a0 / a1 + a2 / a3 + a4 / a5;
}

Clang-10 without the plugin generates assembly code with 3 division instructions.

reduction_1:                            // @reduction_1
        fdiv    d0, d0, d1
        fdiv    d1, d2, d3
        fdiv    d2, d4, d5
        fadd    d0, d0, d2
        fadd    d0, d1, d0
        ret

Only one division is required after the transform by MathPeephole.

reduction_1:                            // @reduction_1
        fmul    d0, d3, d0
        fmadd   d0, d2, d1, d0
        fmul    d3, d3, d1
        fmul    d0, d5, d0
        fmul    d1, d5, d3
        fmadd   d0, d4, d3, d0
        fdiv    d0, d0, d1
        ret

Eliminating FP division instructions with comparison

This rewriting rule substitutes x/y + z > a/b + c with (xb - ay) / (yb) + z > c, and a/b + c > d with b < 0 ? (a - b * (d - c) < 0) : (-(a - b * (d - c)) < 0).

For the following input C code,

bool cmpdiv_0(double a0, double a1, double a2, double a3) {
  return a0 / a1 + a2 < a3;
}

the normal clang generates assembly code that includes a division instruction.

cmpdiv_0:                               // @cmpdiv_0
        fdiv    d0, d0, d1
        fadd    d0, d0, d2
        fcmp    d0, d3
        cset    w0, lt
        ret

MathPeephole can completely eliminate the division instruction in cases like this.

cmpdiv_0:                               // @cmpdiv_0
        fsub    d2, d3, d2
        fmov    x8, d0
        fmul    d0, d1, d2
        fmov    x9, d1
        and     x9, x9, #0x8000000000000000
        fmov    x10, d0
        eor     x8, x8, x9
        eor     x9, x10, x9
        fmov    d0, x8
        fmov    d1, x9
        fcmp    d0, d1
        cset    w0, lt
        ret

The corresponding IR is shown below.

define dso_local zeroext i1 @cmpdiv_0(double %0, double %1, double %2, double %3) local_unnamed_addr #1 {
  %5 = bitcast double %1 to i64
  %6 = and i64 %5, -9223372036854775808
  %7 = fsub fast double %3, %2
  %8 = fmul fast double %7, %1
  %9 = bitcast double %0 to i64
  %10 = xor i64 %6, %9
  %11 = bitcast i64 %10 to double
  %12 = bitcast double %8 to i64
  %13 = xor i64 %6, %12
  %14 = bitcast i64 %13 to double
  %15 = fcmp fast olt double %11, %14
  ret i1 %15
}

Below is another example C code.

bool cmpdiv_1(double a0, double a1, double a2, double a3, double a4, double a5) {
  return a0 / a1 + a2 / a3 < a4 / a5;
}

This is the code generated by clang-10 without a plugin.

cmpdiv_1:                               // @cmpdiv_1
        fdiv    d0, d0, d1
        fdiv    d1, d2, d3
        fadd    d0, d1, d0
        fdiv    d1, d4, d5
        fcmp    d0, d1
        cset    w0, lt
        ret

There is no division instruction in the code generated with MathPeephole.

cmpdiv_1:                               // @cmpdiv_1
        fmul    d2, d5, d2
        fmul    d5, d5, d3
        fnmsub  d2, d4, d3, d2
        fmul    d0, d5, d0
        fmul    d3, d5, d1
        fnmsub  d0, d2, d1, d0
        fmov    x8, d3
        and     x8, x8, #0x8000000000000000
        fmov    x9, d0
        eor     x8, x9, x8
        fmov    d0, x8
        fcmp    d0, #0.0
        cset    w0, gt
        ret

Eliminating FP sqrt instructions with comparison

This rewriting rule substitutes w sqrt(x) + y > z with w >= 0 ? ((z < y) | (wwx > (z-y)(z-y))) : ((z <= y) & (wwx < (z-y)(z-y))). Although the resulting code is substantially longer than the original code, we are seeing the resulting code to be more efficient in many cases.

Below is an example including comparison of sqrt.

bool cmpsqrt_0(double a0, double a1) {
  return 1.1 * sqrt(a0) + 2.2 < a1;
}

Below is the code generated by clang-10 without a plugin.

cmpsqrt_0:                              // @cmpsqrt_0
        adrp    x8, .LCPI4_0
        adrp    x9, .LCPI4_1
        ldr     d2, [x8, :lo12:.LCPI4_0]
        ldr     d3, [x9, :lo12:.LCPI4_1]
        fsqrt   d0, d0
        fmadd   d0, d0, d2, d3
        fcmp    d0, d1
        cset    w0, lt
        ret

And this is the code generated with MathPeephole. Sqrt is eliminated.

cmpsqrt_0:                              // @cmpsqrt_0
        adrp    x8, .LCPI4_0
        adrp    x9, .LCPI4_1
        ldr     d2, [x8, :lo12:.LCPI4_0]
        ldr     d3, [x9, :lo12:.LCPI4_1]
        fadd    d1, d1, d2
        fmul    d0, d0, d3
        fcmp    d1, #0.0
        fmul    d1, d1, d1
        cset    w8, ge
        fcmp    d0, d1
        cset    w9, lt
        and     w0, w8, w9
        ret

The corresponding IR is shown below.

define dso_local zeroext i1 @cmpsqrt_0(double %0, double %1) local_unnamed_addr #0 {
  %3 = fadd fast double %1, -2.200000e+00
  %4 = fcmp fast oge double %3, 0.000000e+00
  %5 = fmul fast double %3, %3
  %6 = fmul fast double %0, 0x3FF35C28F5C28F5D
  %7 = fcmp fast olt double %6, %5
  %8 = and i1 %4, %7
  ret i1 %8
}

Below is another example including comparison of sqrt.

bool complex_0(double a0, double a1, double a2, double a3, double a4, double a5) {
  return  a0 / sqrt(a1) + a2 < a3;
}

And this is the code generated with MathPeephole. Both division and sqrt are eliminated.

complex_0:                              // @complex_0
        fsub    d2, d3, d2
        fmul    d1, d2, d1
        fmov    x9, d2
        fmov    x8, d0
        fmul    d1, d2, d1
        and     x10, x9, #0x8000000000000000
        eor     x8, x8, x10
        fmov    x11, d1
        eor     x10, x11, x10
        fmov    d1, x8
        fcmp    d1, #0.0
        fmul    d0, d1, d0
        fmov    d1, x10
        asr     x9, x9, #63
        cset    w8, lt
        cset    w10, ge
        fcmp    d1, d0
        cset    w11, gt
        cmp     x9, #0                  // =0
        and     w9, w10, w11
        cset    w10, eq
        and     w8, w8, w10
        orr     w0, w9, w8
        ret

Vectorized C source code can also be transformed.

typedef float float4 __attribute__((ext_vector_type(4)));
typedef int int4 __attribute__((ext_vector_type(4)));

static __attribute__((always_inline)) float4 sqrtf4(float4 x) {
  return (float4) { sqrtf(x[0]), sqrtf(x[1]), sqrtf(x[2]), sqrtf(x[3]) };
}

int4 complexv_1(float4 a0, float4 a1, float4 a2, float4 a3, float4 a4, float4 a5) {
  return a0 / a1 + sqrtf4(a2 + 1.1f) < a3 / (a4 + 1.2f) + (a5 + 1.3f) / a0;
}
complexv_1:                             // @complexv_1
        mov     w8, #52429
        movk    w8, #16268, lsl #16
        dup     v6.4s, w8
        mov     w8, #39322
        movk    w8, #16281, lsl #16
        dup     v7.4s, w8
        mov     w8, #26214
        movk    w8, #16294, lsl #16
        fadd    v4.4s, v4.4s, v7.4s
        dup     v7.4s, w8
        fadd    v2.4s, v2.4s, v6.4s
        fmul    v6.4s, v0.4s, v1.4s
        fadd    v5.4s, v5.4s, v7.4s
        movi    v7.4s, #128, lsl #24
        fmul    v1.4s, v5.4s, v1.4s
        fmul    v5.4s, v6.4s, v4.4s
        fmul    v3.4s, v6.4s, v3.4s
        fmls    v1.4s, v0.4s, v0.4s
        and     v0.16b, v5.16b, v7.16b
        fmla    v3.4s, v1.4s, v4.4s
        eor     v1.16b, v5.16b, v0.16b
        eor     v0.16b, v3.16b, v0.16b
        and     v3.16b, v1.16b, v7.16b
        fmul    v2.4s, v1.4s, v2.4s
        ushr    v4.4s, v1.4s, #31
        eor     v5.16b, v0.16b, v3.16b
        fmul    v1.4s, v1.4s, v2.4s
        fmul    v0.4s, v5.4s, v0.4s
        eor     v1.16b, v1.16b, v3.16b
        fcmlt   v2.4s, v5.4s, #0.0
        fcmge   v6.4s, v5.4s, #0.0
        fcmgt   v0.4s, v0.4s, v1.4s
        and     v0.16b, v6.16b, v0.16b
        and     v1.16b, v2.16b, v4.16b
        orr     v0.16b, v0.16b, v1.16b
        shl     v0.4s, v0.4s, #31
        sshr    v0.4s, v0.4s, #31
        ret

The corresponding IR is shown below.

define dso_local <4 x i32> @complexv_1(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3, <4 x float> %4, <4 x float> %5) local_unnamed_addr #2 {
  %7 = fadd fast <4 x float> %2, <float 0x3FF19999A0000000, float 0x3FF19999A0000000, float 0x3FF19999A0000000, float 0x3FF19999A0000000>
  %8 = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %7)
  %9 = fadd fast <4 x float> %4, <float 0x3FF3333340000000, float 0x3FF3333340000000, float 0x3FF3333340000000, float 0x3FF3333340000000>
  %10 = fadd fast <4 x float> %5, <float 0x3FF4CCCCC0000000, float 0x3FF4CCCCC0000000, float 0x3FF4CCCCC0000000, float 0x3FF4CCCCC0000000>
  %11 = fmul fast <4 x float> %10, %1
  %12 = fmul fast <4 x float> %0, %0
  %13 = fmul fast <4 x float> %0, %1
  %14 = fsub fast <4 x float> %11, %12
  %15 = fmul fast <4 x float> %14, %9
  %16 = fmul fast <4 x float> %13, %3
  %17 = fmul fast <4 x float> %13, %9
  %18 = fadd fast <4 x float> %15, %16
  %19 = bitcast <4 x float> %17 to <4 x i32>
  %20 = and <4 x i32> %19, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
  %21 = fmul fast <4 x float> %17, %8
  %22 = bitcast <4 x float> %18 to <4 x i32>
  %23 = xor <4 x i32> %20, %22
  %24 = bitcast <4 x i32> %23 to <4 x float>
  %25 = bitcast <4 x float> %21 to <4 x i32>
  %26 = xor <4 x i32> %20, %25
  %27 = bitcast <4 x i32> %26 to <4 x float>
  %28 = fcmp fast ogt <4 x float> %24, %27
  %29 = sext <4 x i1> %28 to <4 x i32>
  ret <4 x i32> %29
}

Clone this wiki locally