Skip to content

Commit 341d70f

Browse files
Google APIs (copybara-github)
authored and committed
feat: add ability to request compressed ReadRowsResponse rows
This change allows the client to request raw LZ4 compression of the ReadRowsResponse rows data for both ArrowRecordBatches and Avro rows.

PiperOrigin-RevId: 597000088
1 parent 91fb1b8 commit 341d70f

2 files changed

Lines changed: 37 additions & 0 deletions

File tree

google/cloud/bigquery/storage/v1/storage.proto

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,23 @@ message ReadRowsResponse {
348348
// Output only. Arrow schema.
349349
ArrowSchema arrow_schema = 8 [(google.api.field_behavior) = OUTPUT_ONLY];
350350
}
351+
352+
// Optional. If the row data in this ReadRowsResponse is compressed, then
353+
// uncompressed byte size is the original size of the uncompressed row data.
354+
// If it is set to a value greater than 0, then decompress into a buffer of
355+
// size uncompressed_byte_size using the compression codec that was requested
356+
// during session creation time and which is specified in
357+
// TableReadOptions.response_compression_codec in ReadSession.
358+
// This value is not set if no response_compression_codec was requested
359+
// and it is -1 if the requested compression would not have reduced the size
360+
// of this ReadRowsResponse's row data. This attempts to match Apache Arrow's
361+
// behavior described here https://github.com/apache/arrow/issues/15102 where
362+
// the uncompressed length may be set to -1 to indicate that the data that
363+
// follows is not compressed, which can be useful for cases where compression
364+
// does not yield appreciable savings. When uncompressed_byte_size is not
365+
// greater than 0, the client should skip decompression.
366+
optional int64 uncompressed_byte_size = 9
367+
[(google.api.field_behavior) = OPTIONAL];
351368
}
352369

353370
// Request message for `SplitReadStream`.

google/cloud/bigquery/storage/v1/stream.proto

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,21 @@ message ReadSession {
5959

6060
// Options dictating how we read a table.
6161
message TableReadOptions {
62+
// Specifies which compression codec to attempt on the entire serialized
63+
// response payload (either Arrow record batch or Avro rows). This is
64+
// not to be confused with the Apache Arrow native compression codecs
65+
// specified in ArrowSerializationOptions. For performance reasons, when
66+
// creating a read session requesting Arrow responses, setting both native
67+
// Arrow compression and application-level response compression will not be
68+
// allowed - choose, at most, one kind of compression.
69+
enum ResponseCompressionCodec {
70+
// Default is no compression.
71+
RESPONSE_COMPRESSION_CODEC_UNSPECIFIED = 0;
72+
73+
// Use raw LZ4 compression.
74+
RESPONSE_COMPRESSION_CODEC_LZ4 = 2;
75+
}
76+
6277
// Optional. The names of the fields in the table to be returned. If no
6378
// field names are specified, then all fields in the table are returned.
6479
//
@@ -138,6 +153,11 @@ message ReadSession {
138153
// https://cloud.google.com/bigquery/docs/table-sampling)
139154
optional double sample_percentage = 5
140155
[(google.api.field_behavior) = OPTIONAL];
156+
157+
// Optional. Set response_compression_codec when creating a read session to
158+
// enable application-level compression of ReadRows responses.
159+
optional ResponseCompressionCodec response_compression_codec = 6
160+
[(google.api.field_behavior) = OPTIONAL];
141161
}
142162

143163
// Output only. Unique identifier for the session, in the form

0 commit comments

Comments
 (0)