310 changes: 310 additions & 0 deletions aten/src/ATen/native/quantized/cpu/ACLUtils.cpp
@@ -0,0 +1,310 @@
#include <ATen/native/quantized/cpu/ACLUtils.h>

#if AT_MKLDNN_ACL_ENABLED()

#include <ATen/Parallel.h>
#include <ATen/ops/empty.h>
#include <arm_compute/core/Helpers.h>
#include <arm_compute/core/Types.h>
#include <arm_compute/core/Utils.h>
#include <arm_compute/core/utils/quantization/AsymmHelpers.h>

namespace at::native::acl_utils {

QuantMatmul::QuantMatmul(
int64_t weight_dim_0,
int64_t weight_dim_1,
double weight_scale,
int64_t weight_offset,
int8_t* weight_ptr,
std::optional<float*> bias_ptr,
const QuantMatmulCacheKey& cache_key)
: key(cache_key) {
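// Wrap the prepacked int8 weights in an ACL tensor. ACL only references the
// weight memory; it does not take ownership of it or copy it.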
auto wei_q_tensor_info = arm_compute::TensorInfo(
arm_compute::TensorShape(weight_dim_1, weight_dim_0),
1,
arm_compute::DataType::QASYMM8_SIGNED,
arm_compute::QuantizationInfo(weight_scale, -weight_offset, false));
wei_q_tensor_info.set_are_values_constant(true);
wei_q_tensor_.allocator()->init(wei_q_tensor_info);
wei_q_tensor_.allocator()->import_memory(weight_ptr);

if (bias_ptr.has_value()) {
auto bia_tensor_info = arm_compute::TensorInfo(
arm_compute::TensorShape(1, weight_dim_1),
1,
arm_compute::DataType::F32);
bia_tensor_ = arm_compute::Tensor();

bia_tensor_->allocator()->init(bia_tensor_info);
bia_tensor_->allocator()->import_memory(bias_ptr.value());
}
const bool fuse_relu =
std::get<static_cast<int>(QuantMatmulCacheKeyIndex::FUSE_RELU)>(key);
if (fuse_relu) {
relu_info_ =
arm_compute::ActivationLayerInfo(arm_compute::ActivationFunction::RELU);
}
}

QuantMatmul::~QuantMatmul() {
// This does not free any memory; it just tells ACL that we're no longer
// using the pointer.
wei_q_tensor_.allocator()->free();
if (bia_tensor_.has_value()) {
bia_tensor_->allocator()->free();
}
}

DynamicQuantMatmul::DynamicQuantMatmul(
int64_t weight_dim_0,
int64_t weight_dim_1,
double weight_scale,
int64_t weight_offset,
int8_t* weight_ptr,
std::optional<float*> bias_ptr,
const QuantMatmulCacheKey& cache_key)
: QuantMatmul(
weight_dim_0,
weight_dim_1,
weight_scale,
weight_offset,
weight_ptr,
bias_ptr,
cache_key) {
int64_t m = std::get<static_cast<int>(QuantMatmulCacheKeyIndex::M)>(key);

auto src_q_tensor_info = arm_compute::TensorInfo(
arm_compute::TensorShape(weight_dim_0, m),
1,
// ACL dynamically quantized matmuls only support signed int8_t
arm_compute::DataType::QASYMM8_SIGNED,
// TODO: we set the initial offset to int8_t max rather than zero because
// ACL currently skips the MatrixBReduction calculation if the source
// offset is zero at configuration time. This is fixed by
// https://review.mlplatform.org/c/ml/ComputeLibrary/+/12820/8
// The offset will be set to the actual source offset value at runtime.
arm_compute::QuantizationInfo(
/*scale=*/1.0,
/*offset=*/std::numeric_limits<int8_t>::max(),
/*is_dynamic=*/true));
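// Unlike the constant weights, the source values change on every call, so
// they are marked as non-constant.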
src_q_tensor_info.set_are_values_constant(false);

auto src_tensor_info = arm_compute::TensorInfo(
arm_compute::TensorShape(weight_dim_0, m), arm_compute::Format::F32);
src_tensor_info.set_are_values_constant(false);

auto dst_tensor_info = arm_compute::TensorInfo(
arm_compute::TensorShape(weight_dim_1, m), arm_compute::Format::F32);

src_q_tensor.allocator()->init(src_q_tensor_info);
src_tensor.allocator()->init(src_tensor_info);
dst_tensor.allocator()->init(dst_tensor_info);

src_q_tensor_orig_ =
at::empty({m, weight_dim_0}, at::device(c10::kCPU).dtype(c10::kQInt8));
// Allocate the backing memory with ATen, then let ACL import (but not own) it.
src_q_tensor.allocator()->import_memory(src_q_tensor_orig_.data_ptr());

if (relu_info_.has_value()) {
relu = arm_compute::NEActivationLayer();
}
}

DynamicQuantMatmul::~DynamicQuantMatmul() {
// This does not free any memory; it just tells ACL that we're no longer
// using the pointer.
src_q_tensor.allocator()->free();
Contributor:
Why don't we need to do the same for src_tensor and dst_tensor?

Collaborator Author (@fadara01, Mar 17, 2025):
These frees just set the pointer inside ACL to null, i.e. they tell ACL that we're no longer using the pointer; they do not free any memory. We only need to call tensor.allocator()->free() for tensors on which we called tensor.allocator()->import_memory(). Here the memory for src_q_tensor was imported in the constructor, so the destructor is the natural place to tell ACL to drop that pointer. For src_tensor and dst_tensor, the memory is imported at run time, which is why we also ask ACL to drop those references at run time.
}
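For illustration, a minimal sketch of the import/free lifecycle described in the thread above (the function and variable names are illustrative only, not part of this PR):

void import_free_example(const arm_compute::TensorInfo& info, float* external_ptr) {
  arm_compute::Tensor t;
  // init() only records shape/type metadata; no memory is allocated here.
  t.allocator()->init(info);
  // import_memory() makes ACL wrap caller-owned memory instead of allocating its own.
  t.allocator()->import_memory(external_ptr);
  // ... configure and run ACL functions that read from / write to t ...
  // free() on an imported buffer only drops ACL's reference;
  // the caller still owns external_ptr.
  t.allocator()->free();
}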

arm_compute::Status DynamicQuantMatmul::validate() {
if (relu_info_.has_value()) {
auto relu_status = arm_compute::NEActivationLayer::validate(
dst_tensor.info(), dst_tensor.info(), relu_info_.value());
if (relu_status.error_code() != arm_compute::ErrorCode::OK) {
return relu_status;
}
}
auto quant_status = arm_compute::NEQuantizationLayer::validate(
src_tensor.info(), src_q_tensor.info());
if (quant_status.error_code() != arm_compute::ErrorCode::OK) {
return quant_status;
}
return arm_compute::NEGEMMLowpMatrixMultiplyCore::validate(
src_q_tensor.info(),
wei_q_tensor_.info(),
bia_tensor_.has_value() ? bia_tensor_.value().info() : nullptr,
dst_tensor.info(),
gemm_info_);
}

void DynamicQuantMatmul::configure() {
quant.configure(&src_tensor, &src_q_tensor);
gemm.configure(
&src_q_tensor,
&wei_q_tensor_,
bia_tensor_.has_value() ? &bia_tensor_.value() : nullptr,
Contributor:
Nit

Suggested change
bia_tensor_.has_value() ? &bia_tensor_.value() : nullptr,
bia_tensor_.value_or(nullptr),

Contributor:
Sorry, I made a mistake there; you'll need to keep the original logic, as you are passing a pointer to the tensor or null.
&dst_tensor,
gemm_info_);
if (relu.has_value()) {
relu->configure(&dst_tensor, &dst_tensor, relu_info_.value());
}
}
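// Note (illustrative summary, not new API): once configured, the caller is
// expected to import the fp32 source/destination buffers into src_tensor and
// dst_tensor at run time, set src_q_tensor's quantization parameters to the
// actual source scale/offset, run quant, gemm and (if fused) relu, and then
// free() the imported pointers, as described in the review discussion above.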

StaticQuantMatmul::StaticQuantMatmul(
int64_t weight_dim_0,
int64_t weight_dim_1,
double weight_scale,
int64_t weight_offset,
int8_t* weight_ptr,
std::optional<float*> bias_ptr,
const QuantMatmulCacheKey& cache_key)
: QuantMatmul(
weight_dim_0,
weight_dim_1,
weight_scale,
weight_offset,
weight_ptr,
bias_ptr,
cache_key) {
const int64_t m =
std::get<static_cast<int>(QuantMatmulCacheKeyIndex::M)>(key);
const int64_t input_zero_point =
std::get<static_cast<int>(QuantMatmulCacheKeyIndex::INPUT_OFFSET)>(key);
const double input_scale =
std::get<static_cast<int>(QuantMatmulCacheKeyIndex::INPUT_SCALE)>(key);
const int64_t output_zero_point =
std::get<static_cast<int>(QuantMatmulCacheKeyIndex::OUTPUT_OFFSET)>(key);
const double output_scale =
std::get<static_cast<int>(QuantMatmulCacheKeyIndex::OUTPUT_SCALE)>(key);
const bool signed_input =
std::get<static_cast<int>(QuantMatmulCacheKeyIndex::SIGNED_INPUT)>(key);

const auto input_acl_datatype = signed_input
? arm_compute::DataType::QASYMM8_SIGNED
: arm_compute::DataType::QASYMM8;

auto src_q_tensor_info = arm_compute::TensorInfo(
arm_compute::TensorShape(weight_dim_0, m),
1,
input_acl_datatype,
arm_compute::QuantizationInfo(input_scale, -input_zero_point, false));
src_q_tensor_info.set_are_values_constant(false);
src_q_tensor.allocator()->init(src_q_tensor_info);

if (bias_ptr.has_value()) {
auto bia_q_tensor_info = arm_compute::TensorInfo(
arm_compute::TensorShape(1, weight_dim_1),
1,
arm_compute::DataType::S32,
arm_compute::QuantizationInfo(
1 / (input_scale * weight_scale), 0, false));
bia_q_tensor_ = arm_compute::Tensor();
bia_q_tensor_.value().allocator()->init(bia_q_tensor_info);

float* bias_fp32_buffer = (float*)bia_tensor_.value().buffer();
// One int32 value per output channel.
bia_q_tensor_orig_ =
at::empty({1, weight_dim_1}, at::device(c10::kCPU).dtype(c10::kQInt32));
int32_t* bias_s32_buffer = (int32_t*)bia_q_tensor_orig_.value().data_ptr();
const float bias_scale =
bia_q_tensor_info.quantization_info().uniform().scale;
// Quantize the bias to int32_t. It makes sense to do it here rather than in
// the prepack phase because dynamically quantized ACL matmuls don't need the
// bias in int32_t.
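// Concretely: bias_s32[i] = round(bias_fp32[i] / (input_scale * weight_scale)),
// which expresses the bias in the same scale as the int32 GEMM accumulator.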
at::parallel_for(0, weight_dim_1, 1, [&](int64_t start, int64_t end) {
for (int64_t i = start; i < end; ++i) {
bias_s32_buffer[i] =
int32_t(std::round(bias_fp32_buffer[i] * bias_scale));
}
});
bia_q_tensor_.value().allocator()->import_memory(bias_s32_buffer);
}
auto dst_q_tensor_info = arm_compute::TensorInfo(
arm_compute::TensorShape(weight_dim_1, m),
1,
input_acl_datatype,
arm_compute::QuantizationInfo(output_scale, output_zero_point, false));
dst_q_tensor.allocator()->init(dst_q_tensor_info);

// Setup lowp_gemm output stage
int output_multiplier;
int output_shift;
float multiplier = (input_scale * weight_scale) / output_scale;
arm_compute::quantization::calculate_quantized_multiplier_less_than_one(
multiplier, &output_multiplier, &output_shift);
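// output_multiplier and output_shift form a fixed-point approximation of the
// real requantization factor (input_scale * weight_scale) / output_scale,
// which the QUANTIZE_DOWN_FIXEDPOINT output stage applies to the int32
// accumulators.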

arm_compute::GEMMLowpOutputStageInfo output_stage_info;
output_stage_info.type =
arm_compute::GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
output_stage_info.gemmlowp_multiplier = output_multiplier;
output_stage_info.gemmlowp_shift = output_shift;
output_stage_info.gemmlowp_offset = output_zero_point;

int32_t min_activation = signed_input ? std::numeric_limits<int8_t>::min()
: std::numeric_limits<uint8_t>::min();
int32_t max_activation = signed_input ? std::numeric_limits<int8_t>::max()
: std::numeric_limits<uint8_t>::max();

if (relu_info_.has_value()) {
// figure out min, max values for ReLU
const arm_compute::UniformQuantizationInfo uqinfo =
dst_q_tensor_info.quantization_info().uniform();
std::tie(min_activation, max_activation) =
arm_compute::get_quantized_activation_min_max(
relu_info_.value(), src_q_tensor_info.data_type(), uqinfo);
// fuse ReLU with the GEMM
gemm_info_.set_activation_info(relu_info_.value());
}
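// The bounds below clamp the requantized output to the plain int8/uint8 range,
// or to the quantized ReLU range when ReLU is fused.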
output_stage_info.gemmlowp_min_bound = min_activation;
output_stage_info.gemmlowp_max_bound = max_activation;
output_stage_info.output_data_type = dst_q_tensor_info.data_type();

gemm_info_.set_gemmlowp_output_stage(output_stage_info);
}

StaticQuantMatmul::~StaticQuantMatmul() {
// This does not free any memory; it just tells ACL that we're no longer
// using the pointer.
if (bia_q_tensor_.has_value()) {
bia_q_tensor_.value().allocator()->free();
}
}

arm_compute::Status StaticQuantMatmul::validate() {
return arm_compute::NEGEMMLowpMatrixMultiplyCore::validate(
src_q_tensor.info(),
wei_q_tensor_.info(),
bia_q_tensor_.has_value() ? bia_q_tensor_.value().info() : nullptr,
dst_q_tensor.info(),
gemm_info_);
}

void StaticQuantMatmul::configure() {
gemm.configure(
&src_q_tensor,
&wei_q_tensor_,
bia_q_tensor_.has_value() ? &bia_q_tensor_.value() : nullptr,
&dst_q_tensor,
gemm_info_);
}

} // namespace at::native::acl_utils

PackedLinearWeightsACL::PackedLinearWeightsACL(
std::unique_ptr<ideep::tensor> weight,
std::optional<ideep::tensor> bias,
at::Tensor orig_weight,
std::optional<at::Tensor> orig_bias)
: PackedLinearWeightsOnednn(
std::move(weight),
std::move(bias),
std::move(orig_weight),
std::move(orig_bias)) {
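// Cache the weight dimensions and quantization parameters needed to set up
// the ACL quantized matmuls.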
auto w = *(weight_.get());
k_ = w.get_dim(0);
n_ = w.get_dim(1);
weight_zero_point_ = orig_weight_.q_zero_point();
weight_scale_ = orig_weight_.q_scale();
}

#endif // AT_MKLDNN_ACL_ENABLED()