Here's a small repro:
#include "Halide.h"
using namespace Halide;
// Minimal Halide generator used as the repro: a 1-D float pipeline whose
// definition subtracts two float-cast comparison results, scheduled on the
// GPU with a vectorized thread dimension.
class MyGen : public Generator<MyGen> {
public:
// One-dimensional float input buffer, named "input".
Input<Buffer<float, 1>> in{"input"};
// One-dimensional float output buffer, named "output".
Output<Buffer<float, 1>> out{"output"};
// x is the pure variable of the output; tx is the extra variable handed to gpu_tile.
Var x, tx;
void generate() {
// Sign-like function: each comparison result is cast to float and the two
// are subtracted, giving +1 where in(x) >= 0 and -1 where in(x) < 0.
out(x) = cast<float>(in(x) >= 0) - cast<float>(in(x) < 0);
// Tile x for the GPU with a tile size of 16 (tx being the per-tile variable),
// then vectorize tx by a factor of 4. Both steps are kept exactly as written
// because this schedule is part of the repro.
out.gpu_tile(x, tx, 16)
.vectorize(tx, 4);
}
};
// Registers MyGen under the generator name "mygen".
HALIDE_REGISTER_GENERATOR(MyGen, mygen)
Building for a GPU target (Metal, in this case) results in:
Unhandled exception: Error: Can't represent an integer with this many bits in Metal C: int4x4
Drilling down a bit, what I'm seeing is that the IR after FindIntrinsics includes expressions like:
output[ramp(((.__thread_id_x*4) + output.extent.0) + -16, 1, 4)] =
float32x4((int4x4)widening_sub(int2x4((x4(0.000000f) <= t19)), int2x4((t19 < x4(0.000000f)))))