Commit b331752

David Dang authored and facebook-github-bot committed

[Quant] Implemented 4 bit embedding op support; added corresponding test case (#69768)

Summary: Pull Request resolved: #69768

Support for the 4 bit embedding operator has been added. The support is analogous to the preexisting support for byte/8 bit embedding. A corresponding test case was added to test_quantized_embedding_op.py.

Test Plan: In the pytorch main dir, execute

```
python test/test_quantization.py TestStaticQuantizedModule.test_embedding_api
```

to run the series of tests, including the newly added test_embedding_4bit function.

Imported from OSS

Reviewed By: jbschlosser

Differential Revision: D33152673

fbshipit-source-id: bdcc2eb2e37de38fda3461ff3ebf1d2fb5e58071

1 parent 94abf12 · commit b331752

5 files changed: +77 -59

aten/src/ATen/native/quantized/cpu/embedding_packed_params.h

Lines changed: 2 additions & 1 deletion

```diff
@@ -19,7 +19,8 @@ struct EmbeddingPackedParamsBase : public torch::jit::CustomClassHolder {
       bool pruned_weights,
       const c10::optional<at::Tensor>& per_sample_weights_,
       const c10::optional<at::Tensor>& compressed_indices_mapping,
-      bool include_last_offset) = 0;
+      bool include_last_offset,
+      bool is_embedding_op) = 0;
 
   virtual at::Tensor unpack() = 0;
 
```

aten/src/ATen/native/quantized/cpu/fbgemm_utils.h

Lines changed: 2 additions & 1 deletion

```diff
@@ -389,5 +389,6 @@ struct TORCH_API PackedEmbeddingBagWeight : public EmbeddingPackedParamsBase {
       bool pruned_weights,
       const c10::optional<at::Tensor>& per_sample_weights_,
       const c10::optional<at::Tensor>& compressed_indices_mapping,
-      bool include_last_offset) override;
+      bool include_last_offset,
+      bool is_embedding_op) override;
 };
```

aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp

Lines changed: 44 additions & 24 deletions

```diff
@@ -180,7 +180,8 @@ at::Tensor& embedding_bag_nbit_impl(
     bool pruned_weights,
     const c10::optional<at::Tensor>& per_sample_weights_,
     const c10::optional<at::Tensor>& compressed_indices_mapping,
-    bool include_last_offset) {
+    bool include_last_offset,
+    bool is_embedding_op) {
   TORCH_CHECK(weight.dim() == 2);
   TORCH_CHECK(offsets.dim() == 1);
 
@@ -226,11 +227,14 @@ at::Tensor& embedding_bag_nbit_impl(
     offsets_include_last_val[M] = indices.numel();
     offsets_data = offsets_include_last_val.data();
   }
-
-  const std::vector<int64_t> shape = {output_size, D};
+  std::vector<int64_t> shape;
+  if(indices.dim() == 2 && is_embedding_op) {
+    const auto indices_sizes = indices.sizes();
+    shape = {indices_sizes[0], indices_sizes[1], D};
+  } else {
+    shape = {output_size, D};
+  }
   at::native::resize_(output, shape, c10::nullopt);
-
-
 #ifdef USE_FBGEMM
   const auto indices_data = indices.data_ptr<IndexType>();
   const auto weight_data = weight.data_ptr<uint8_t>();
@@ -506,7 +510,6 @@ at::Tensor& embedding_bag_byte_helper(
         "embedding_bag_byte operator: input is 2D, then offsets has to be None, as input is treated is a mini-batch of fixed length sequences.");
 
     offsets = c10::MaybeOwned<at::Tensor>::owned(at::arange(0, indices.numel(), indices.sizes()[1], indices.scalar_type()));
-
   } else {
     TORCH_CHECK(
         offsets_in.has_value(),
@@ -590,7 +593,8 @@ at::Tensor& _embedding_bag_nbit_helper(
     bool pruned_weights,
    const c10::optional<at::Tensor>& per_sample_weights_,
    const c10::optional<at::Tensor>& compressed_indices_mapping,
-    bool include_last_offset) {
+    bool include_last_offset,
+    bool is_embedding_op) {
   c10::MaybeOwned<at::Tensor> offsets;
   TORCH_CHECK(
       bit_width == 4 || bit_width == 2,
@@ -603,7 +607,7 @@ at::Tensor& _embedding_bag_nbit_helper(
 
   // For embedding_bag operator with 2D indices, we need to set the offsets
   // explicitly here.
-  if (indices.dim() == 2) {
+  if (indices.dim() == 2 && !is_embedding_op) {
     TORCH_CHECK(
         !offsets_in.has_value(),
         "embedding_bag_4bit/embedding_bag_2bit operator: input is 2D, then offsets has to be None, as input is treated is a mini-batch of fixed length sequences.");
@@ -644,7 +648,8 @@ at::Tensor& _embedding_bag_nbit_helper(
         pruned_weights,
         per_sample_weights_,
         compressed_indices_mapping,
-        include_last_offset);
+        include_last_offset,
+        is_embedding_op);
   } else if (
       indices.scalar_type() == at::kInt && offsets->scalar_type() == at::kLong) {
     return embedding_bag_nbit_impl<int, int64_t>(
@@ -656,7 +661,8 @@ at::Tensor& _embedding_bag_nbit_helper(
         pruned_weights,
         per_sample_weights_,
         compressed_indices_mapping,
-        include_last_offset);
+        include_last_offset,
+        is_embedding_op);
   } else if (
       indices.scalar_type() == at::kLong && offsets->scalar_type() == at::kInt) {
     return embedding_bag_nbit_impl<int64_t, int>(
@@ -668,7 +674,8 @@ at::Tensor& _embedding_bag_nbit_helper(
         pruned_weights,
         per_sample_weights_,
         compressed_indices_mapping,
-        include_last_offset);
+        include_last_offset,
+        is_embedding_op);
   }
   return embedding_bag_nbit_impl<int64_t, int64_t>(
       output,
@@ -679,7 +686,8 @@ at::Tensor& _embedding_bag_nbit_helper(
       pruned_weights,
      per_sample_weights_,
      compressed_indices_mapping,
-      include_last_offset);
+      include_last_offset,
+      is_embedding_op);
 }
 } // namespace
 
@@ -710,7 +718,8 @@ at::Tensor PackedEmbeddingBagWeight::embeddingbag_4bit(
     bool pruned_weights,
     const c10::optional<at::Tensor>& per_sample_weights_,
     const c10::optional<at::Tensor>& compressed_indices_mapping,
-    bool include_last_offset) {
+    bool include_last_offset,
+    bool is_embedding_op) {
   if (per_sample_weights_.has_value()) {
     TORCH_CHECK(
         (per_sample_weights_.value().scalar_type() == at::kFloat ||
@@ -732,7 +741,8 @@ at::Tensor PackedEmbeddingBagWeight::embeddingbag_4bit(
           ? per_sample_weights_.value().to(at::kFloat)
          : per_sample_weights_,
       compressed_indices_mapping,
-      include_last_offset);
+      include_last_offset,
+      is_embedding_op);
 }
 
 namespace at {
@@ -792,7 +802,8 @@ Tensor& embedding_bag_4bit_rowwise_offsets_out(
           ? per_sample_weights_.value().to(at::kFloat)
          : per_sample_weights_,
       compressed_indices_mapping,
-      include_last_offset);
+      include_last_offset,
+      false);
 }
 
 Tensor& embedding_bag_2bit_rowwise_offsets_out(
@@ -826,7 +837,8 @@ Tensor& embedding_bag_2bit_rowwise_offsets_out(
           ? per_sample_weights_.value().to(at::kFloat)
          : per_sample_weights_,
       compressed_indices_mapping,
-      include_last_offset);
+      include_last_offset,
+      false);
 }
 
 namespace {
@@ -874,7 +886,6 @@ Tensor embedding_bag_4bit_rowwise_offsets(
     const c10::optional<Tensor>& per_sample_weights_,
     const c10::optional<Tensor>& compressed_indices_mapping,
     bool include_last_offset) {
-
   auto output = create_empty_from(weight, at::kFloat);
   embedding_bag_4bit_rowwise_offsets_out(
       output,
@@ -886,8 +897,7 @@ Tensor embedding_bag_4bit_rowwise_offsets(
       pruned_weights,
       per_sample_weights_,
       compressed_indices_mapping,
-      include_last_offset
-  );
+      include_last_offset);
   return output;
 }
 
@@ -901,7 +911,6 @@ Tensor embedding_bag_2bit_rowwise_offsets(
     const c10::optional<Tensor>& per_sample_weights_,
     const c10::optional<Tensor>& compressed_indices_mapping,
     bool include_last_offset) {
-
   auto output = create_empty_from(weight, at::kFloat);
   embedding_bag_2bit_rowwise_offsets_out(
       output,
@@ -913,8 +922,7 @@ Tensor embedding_bag_2bit_rowwise_offsets(
       pruned_weights,
       per_sample_weights_,
       compressed_indices_mapping,
-      include_last_offset
-  );
+      include_last_offset);
   return output;
 }
 
@@ -947,7 +955,8 @@ class QEmbeddingBag final {
           pruned_weights,
           per_sample_weights_,
          compressed_indices_mapping,
-          include_last_offset);
+          include_last_offset,
+          false);
     } else {
       TORCH_INTERNAL_ASSERT(
           "Currently only support 8-bit embedding_bag quantization");
@@ -975,7 +984,15 @@ class QEmbedding final {
           c10::nullopt,
           false /* include_last_offset */,
           true /* is_embedding_op */);
-
+    } else if (bit_rate == 4) {
+      return packed_weight->embeddingbag_4bit(
+          indices,
+          offsets,
+          pruned_weights,
+          c10::nullopt,
+          c10::nullopt,
+          false,
+          true);
     } else {
       TORCH_INTERNAL_ASSERT(
           "Currently only support 8-bit embedding quantization");
@@ -995,6 +1012,9 @@ TORCH_LIBRARY_IMPL(quantized, CPU, m) {
   m.impl(
       TORCH_SELECTIVE_NAME("quantized::embedding_byte"),
       TORCH_FN(QEmbedding<8>::run));
+  m.impl(
+      TORCH_SELECTIVE_NAME("quantized::embedding_4bit"),
+      TORCH_FN(QEmbedding<4>::run));
 
   // Functions that work on at::Tensor packed weight.
   m.impl(
```
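The behavioral core of this file's change is the output-shape selection in embedding_bag_nbit_impl: when is_embedding_op is true and the indices are 2D, the output keeps the indices' batch shape instead of the pooled bag shape, and _embedding_bag_nbit_helper correspondingly skips the implicit-offsets path (indices.dim() == 2 && !is_embedding_op). A minimal Python sketch of the equivalent shape logic (the helper name and keyword arguments are illustrative, mirroring the C++ locals):

```python
def nbit_output_shape(indices_shape, output_size, D, is_embedding_op):
    # Mirrors the shape selection added to embedding_bag_nbit_impl: a plain
    # embedding lookup on 2D indices yields [batch, seq_len, D], while
    # embedding_bag pools rows of indices into bags of shape [output_size, D].
    if len(indices_shape) == 2 and is_embedding_op:
        return [indices_shape[0], indices_shape[1], D]
    return [output_size, D]

# A (3, 7) batch of indices with 16-dim embeddings:
assert nbit_output_shape((3, 7), output_size=3, D=16, is_embedding_op=False) == [3, 16]
assert nbit_output_shape((3, 7), output_size=21, D=16, is_embedding_op=True) == [3, 7, 16]
```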

aten/src/ATen/native/quantized/library.cpp

Lines changed: 1 addition & 0 deletions

```diff
@@ -140,6 +140,7 @@ TORCH_LIBRARY(quantized, m) {
   m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_byte(__torch__.torch.classes.quantized.EmbeddingPackedParamsBase weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool pruned_weights=False, Tensor? per_sample_weights=None, Tensor? compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor"));
   m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_4bit(__torch__.torch.classes.quantized.EmbeddingPackedParamsBase weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool pruned_weights=False, Tensor? per_sample_weights=None, Tensor? compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor"));
   m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_byte(__torch__.torch.classes.quantized.EmbeddingPackedParamsBase weight, Tensor indices, bool pruned_weights=False) -> Tensor"));
+  m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_4bit(__torch__.torch.classes.quantized.EmbeddingPackedParamsBase weight, Tensor indices, bool pruned_weights=False) -> Tensor"));
   m.def(TORCH_SELECTIVE_SCHEMA("quantized::celu(Tensor self, float output_scale, int output_zero_point, Scalar alpha=1) -> Tensor"));
   m.def(TORCH_SELECTIVE_SCHEMA("quantized::group_norm(Tensor input, int num_groups, Tensor? weight, Tensor? bias, float eps, float output_scale, int output_zero_point) -> Tensor"));
   m.def(TORCH_SELECTIVE_SCHEMA("quantized::hardswish(Tensor input, float output_scale, int output_zero_point) -> Tensor"));
```
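With the schema above and the CPU kernel registered in qembeddingbag.cpp, the new op is reachable from Python through torch.ops. A minimal usage sketch, assuming per-channel torch.quint4x2 quantization exactly as the new test does (the observer import path varies across releases):

```python
import torch
# In releases of this era the observer is also importable from torch.quantization.
from torch.ao.quantization import PerChannelMinMaxObserver

weights = torch.randn(10, 12) + 1  # 10 embeddings, dim 12 (a multiple of 4, as 4 bit packing expects)

# Row-wise float qparams, matching the qscheme used by the new test_embedding test.
obs = PerChannelMinMaxObserver(dtype=torch.quint4x2,
                               qscheme=torch.per_channel_affine_float_qparams,
                               ch_axis=0)
obs(weights)
scales, zero_points = obs.calculate_qparams()
qweight = torch.quantize_per_channel(weights, scales, zero_points,
                                     axis=0, dtype=torch.quint4x2)

# Pack the 4 bit rows, then look up embeddings with the newly registered op.
packed = torch.ops.quantized.embedding_bag_prepack(qweight)
indices = torch.tensor([0, 3, 9], dtype=torch.int64)
out = torch.ops.quantized.embedding_4bit(packed, indices, pruned_weights=False)

# out approximates the float lookup to within 4 bit quantization error.
ref = torch.embedding(weights, indices)
```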

test/quantization/core/test_quantized_op.py

Lines changed: 28 additions & 33 deletions

```diff
@@ -3292,7 +3292,6 @@ def test_qlinear(self, batch_size, input_channels, output_channels, use_bias,
            use_channelwise=st.booleans())
     @override_qengines
     def test_qlinear_unpack(self, W, use_channelwise):
-
         W, (W_scale, W_zp, torch_type) = W
         if use_channelwise:
             output_channels = W.shape[0]
@@ -3328,7 +3327,6 @@ def test_qlinear_unpack(self, W, use_channelwise):
         np.testing.assert_equal(
             W_q.q_zero_point(), W_q_origin.q_zero_point())
 
-
 @unittest.skipIf(IS_MACOS, "Known test failure on Mac.")
 @unittest.skipIf(not BUILD_WITH_CAFFE2, "Test needs Caffe2")
 class TestQuantizedEmbeddingOps(TestCase):
@@ -3578,8 +3576,6 @@ def get_reference_result(
                                   include_last_offset=include_last_offset)
         torch.testing.assert_close(reference_result, result, atol=atol, rtol=rtol)
 
-
-
     """ Tests the correctness of the embedding_bag_8bit quantized operator """
     @given(num_embeddings=st.integers(10, 100),
            embedding_dim=st.integers(5, 50).filter(lambda x: x % 4 == 0),
@@ -3659,38 +3655,38 @@ def test_embedding_bag_2bit(self, num_embeddings,
                                       sparsity=sparsity,
                                       atol=1.0, rtol=1e-1)
 
-    """ Tests the correctness of the quantized embedding lookup operator """
+    """ Tests the correctness of the quantized 8 bit/4 bit embedding lookup operator """
     @given(num_embeddings=st.integers(10, 100),
            embedding_dim=st.integers(5, 50).filter(lambda x: x % 4 == 0))
-    def test_embedding_byte(self, num_embeddings, embedding_dim):
-        quant_op = torch.ops.quantized.embedding_byte
-        prepack_op = torch.ops.quantized.embedding_bag_prepack
-
-        weights = torch.from_numpy((np.random.random_sample((
-            num_embeddings, embedding_dim)) + 1).astype(np.float32))
-
-        obs = PerChannelMinMaxObserver(dtype=torch.quint8, qscheme=torch.per_channel_affine_float_qparams, ch_axis=0)
-        obs(weights)
-        # Get the scale and zero point for the weight tensor
-        qparams = obs.calculate_qparams()
-
-        # Quantize the weights to 8bits
-        qweight = torch.quantize_per_channel(weights, qparams[0], qparams[1], axis=0, dtype=torch.quint8)
-        max_segments = 5
-        max_segment_length = 20
-        num_lengths = np.random.randint(1, max_segments + 1)
-        lengths = np.random.randint(1, max_segment_length + 1,
-                                    size=num_lengths).astype(np.int32)
-        num_indices = np.sum(lengths)
-        indices = torch.from_numpy(np.random.randint(
-            low=0, high=num_embeddings, size=num_indices, dtype=np.int64))
-
-        packed_weight = prepack_op(qweight)
-        qresult = quant_op(packed_weight, indices, pruned_weights=False)
-
-        ref = torch.embedding(weights, indices, padding_idx=-1, scale_grad_by_freq=False, sparse=False)
-        torch.testing.assert_close(ref, qresult, atol=0.005, rtol=1e-3)
+    def test_embedding(self, num_embeddings, embedding_dim):
+        dtypes = [torch.quint8, torch.quint4x2]
+        quant_ops = [torch.ops.quantized.embedding_byte, torch.ops.quantized.embedding_4bit]
+        prepack_op = torch.ops.quantized.embedding_bag_prepack
+        for quant_op, dtype in zip(quant_ops, dtypes):
+            weights = torch.from_numpy((np.random.random_sample((
+                num_embeddings, embedding_dim)) + 1).astype(np.float32))
+
+            obs = PerChannelMinMaxObserver(dtype=dtype, qscheme=torch.per_channel_affine_float_qparams, ch_axis=0)
+            obs(weights)
+            # Get the scale and zero point for the weight tensor
+            qparams = obs.calculate_qparams()
 
+            # Quantize the weights to the target bit width
+            qweight = torch.quantize_per_channel(weights, qparams[0], qparams[1], axis=0, dtype=dtype)
+            max_segments = 5
+            max_segment_length = 20
+            num_lengths = np.random.randint(1, max_segments + 1)
+            lengths = np.random.randint(1, max_segment_length + 1,
+                                        size=num_lengths).astype(np.int32)
+            num_indices = np.sum(lengths)
+            indices = torch.from_numpy(np.random.randint(
+                low=0, high=num_embeddings, size=num_indices, dtype=np.int64))
+
+            packed_weight = prepack_op(qweight)
+            qresult = quant_op(packed_weight, indices, pruned_weights=False)
+
+            ref = torch.embedding(weights, indices, padding_idx=-1, scale_grad_by_freq=False, sparse=False)
+            torch.testing.assert_close(ref, qresult, atol=0.005, rtol=1e-3)
 
     def test_embedding_2d_indices(self):
         """
```

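For intuition on the test's tolerances: with per_channel_affine_float_qparams, each weight row is stored as 4 bit codes in [0, 15] plus a floating-point scale and zero point, and dequantization computes (q - zero_point) * scale. A hedged sketch of the row-wise round trip — an illustration of the arithmetic only, not the observer's exact algorithm or fbgemm's packed layout:

```python
import numpy as np

def fake_quant_row_4bit(row):
    # 4 bits give 16 levels; spread the row's value range across them.
    scale = max((row.max() - row.min()) / 15.0, 1e-8)
    zero_point = -row.min() / scale          # float zero point maps row.min() to code 0
    q = np.clip(np.round(row / scale + zero_point), 0, 15)
    return (q - zero_point) * scale          # dequantized approximation

row = np.random.rand(16).astype(np.float32) + 1
approx = fake_quant_row_4bit(row)
# Rounding error is at most half a quantization step per element.
assert np.abs(approx - row).max() <= 0.5 * (row.max() - row.min()) / 15.0 + 1e-5
```

With weights spanning roughly one unit, a 4 bit step is about 1/15, versus 1/255 for the byte variant, so the 4 bit path is inherently coarser.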