feat: publish row_key_schema fields in table proto and relevant admin APIs to setup a table with a row_key_schema

Google APIs · copybara-github · commit 33b23a795cf6 · 2025-02-28T11:12:54.000-08:00
PiperOrigin-RevId: 732197624
diff --git a/google/bigtable/admin/v2/bigtable_table_admin.proto b/google/bigtable/admin/v2/bigtable_table_admin.proto
@@ -688,11 +688,15 @@ message UpdateTableRequest {
   // * `change_stream_config`
   // * `change_stream_config.retention_period`
   // * `deletion_protection`
+  // * `row_key_schema`
   //
   // If `column_families` is set in `update_mask`, it will return an
   // UNIMPLEMENTED error.
   google.protobuf.FieldMask update_mask = 2
       [(google.api.field_behavior) = REQUIRED];
+
+  // Optional. If true, ignore safety checks when updating the table.
+  bool ignore_warnings = 3 [(google.api.field_behavior) = OPTIONAL];
 }
 
 // Metadata type for the operation returned by
diff --git a/google/bigtable/admin/v2/table.proto b/google/bigtable/admin/v2/table.proto
@@ -204,6 +204,64 @@ message Table {
     // Otherwise, automated backups are disabled.
     AutomatedBackupPolicy automated_backup_policy = 13;
   }
+
+  // The row key schema for this table. The schema is used to decode the raw row
+  // key bytes into a structured format. The order of field declarations in this
+  // schema is important, as it reflects how the raw row key bytes are
+  // structured. Currently, this only affects how the key is read via a
+  // GoogleSQL query from the ExecuteQuery API.
+  //
+  // For a SQL query, the _key column is still read as raw bytes. But queries
+  // can reference the key fields by name, which will be decoded from _key using
+  // provided type and encoding. Queries that reference key fields will fail if
+  // they encounter an invalid row key.
+  //
+  // For example, if _key = "some_id#2024-04-30#\x00\x13\x00\xf3" with the
+  // following schema:
+  // {
+  //   fields {
+  //     field_name: "id"
+  //     type { string { encoding: utf8_bytes {} } }
+  //   }
+  //   fields {
+  //     field_name: "date"
+  //     type { string { encoding: utf8_bytes {} } }
+  //   }
+  //   fields {
+  //     field_name: "product_code"
+  //     type { int64 { encoding: big_endian_bytes {} } }
+  //   }
+  //   encoding { delimited_bytes { delimiter: "#" } }
+  // }
+  //
+  // The decoded key parts would be:
+  //   id = "some_id", date = "2024-04-30", product_code = 1245427
+  // The query "SELECT _key, product_code FROM table" will return two columns:
+  // /------------------------------------------------------\
+  // |              _key                     | product_code |
+  // | --------------------------------------|--------------|
+  // | "some_id#2024-04-30#\x00\x13\x00\xf3" |   1245427    |
+  // \------------------------------------------------------/
+  //
+  // The schema has the following invariants:
+  // (1) The decoded field values are order-preserved. For read, the field
+  // values will be decoded in sorted mode from the raw bytes.
+  // (2) Every field in the schema must specify a non-empty name.
+  // (3) Every field must specify a type with an associated encoding. The type
+  // is limited to scalar types only: Array, Map, Aggregate, and Struct are not
+  // allowed.
+  // (4) The field names must not collide with existing column family
+  // names or the reserved keywords "_key" and "_timestamp".
+  //
+  // The following update operations are allowed for row_key_schema:
+  // - Update from an empty schema to a new schema.
+  // - Remove the existing schema. This operation requires setting the
+  //   `ignore_warnings` flag to `true`, since it might be a backward
+  //   incompatible change. Without the flag, the update request will fail with
+  //   an INVALID_ARGUMENT error.
+  // Any other row key schema update operation (e.g. updating existing schema
+  // column names or types) is currently unsupported.
+  Type.Struct row_key_schema = 15;
 }
 
 // AuthorizedViews represent subsets of a particular Cloud Bigtable table. Users
diff --git a/google/bigtable/admin/v2/types.proto b/google/bigtable/admin/v2/types.proto
@@ -31,35 +31,34 @@ option ruby_package = "Google::Cloud::Bigtable::Admin::V2";
 // familiarity and consistency across products and features.
 //
 // For compatibility with Bigtable's existing untyped APIs, each `Type` includes
-// an `Encoding` which describes how to convert to/from the underlying data.
+// an `Encoding` which describes how to convert to or from the underlying data.
 //
-// Each encoding also defines the following properties:
+// Each encoding can operate in one of two modes:
 //
-//  * Order-preserving: Does the encoded value sort consistently with the
-//    original typed value? Note that Bigtable will always sort data based on
-//    the raw encoded value, *not* the decoded type.
-//     - Example: BYTES values sort in the same order as their raw encodings.
-//     - Counterexample: Encoding INT64 as a fixed-width decimal string does
-//       *not* preserve sort order when dealing with negative numbers.
-//       `INT64(1) > INT64(-1)`, but `STRING("-00001") > STRING("00001)`.
-//  * Self-delimiting: If we concatenate two encoded values, can we always tell
-//    where the first one ends and the second one begins?
-//     - Example: If we encode INT64s to fixed-width STRINGs, the first value
-//       will always contain exactly N digits, possibly preceded by a sign.
-//     - Counterexample: If we concatenate two UTF-8 encoded STRINGs, we have
-//       no way to tell where the first one ends.
-//  * Compatibility: Which other systems have matching encoding schemes? For
-//    example, does this encoding have a GoogleSQL equivalent? HBase? Java?
+//  - Sorted: In this mode, Bigtable guarantees that `Encode(X) <= Encode(Y)`
+//    if and only if `X <= Y`. This is useful anywhere sort order is important,
+//    for example when encoding keys.
+//  - Distinct: In this mode, Bigtable guarantees that if `X != Y` then
+//    `Encode(X) != Encode(Y)`. However, the converse is not guaranteed. For
+//    example, both "{'foo': '1', 'bar': '2'}" and "{'bar': '2', 'foo': '1'}"
+//    are valid encodings of the same JSON value.
+//
+// The API clearly documents which mode is used wherever an encoding can be
+// configured. Each encoding also documents which values are supported in which
+// modes. For example, when encoding INT64 as a numeric STRING, negative numbers
+// cannot be encoded in sorted mode. This is because `INT64(1) > INT64(-1)`, but
+// `STRING("-00001") > STRING("00001")`.
 message Type {
   // Bytes
   // Values of type `Bytes` are stored in `Value.bytes_value`.
   message Bytes {
-    // Rules used to convert to/from lower level types.
+    // Rules used to convert to or from lower level types.
     message Encoding {
-      // Leaves the value "as-is"
-      // * Order-preserving? Yes
-      // * Self-delimiting? No
-      // * Compatibility? N/A
+      // Leaves the value as-is.
+      //
+      // Sorted mode: all values are supported.
+      //
+      // Distinct mode: all values are supported.
       message Raw {}
 
       // Which encoding to use.
@@ -69,27 +68,33 @@ message Type {
       }
     }
 
-    // The encoding to use when converting to/from lower level types.
+    // The encoding to use when converting to or from lower level types.
     Encoding encoding = 1;
   }
 
   // String
   // Values of type `String` are stored in `Value.string_value`.
   message String {
-    // Rules used to convert to/from lower level types.
+    // Rules used to convert to or from lower level types.
     message Encoding {
       // Deprecated: prefer the equivalent `Utf8Bytes`.
       message Utf8Raw {
         option deprecated = true;
       }
 
-      // UTF-8 encoding
-      // * Order-preserving? Yes (code point order)
-      // * Self-delimiting? No
-      // * Compatibility?
-      //    - BigQuery Federation `TEXT` encoding
-      //    - HBase `Bytes.toBytes`
-      //    - Java `String#getBytes(StandardCharsets.UTF_8)`
+      // UTF-8 encoding.
+      //
+      // Sorted mode:
+      //  - All values are supported.
+      //  - Code point order is preserved.
+      //
+      // Distinct mode: all values are supported.
+      //
+      // Compatible with:
+      //
+      //  - BigQuery `TEXT` encoding
+      //  - HBase `Bytes.toBytes`
+      //  - Java `String#getBytes(StandardCharsets.UTF_8)`
       message Utf8Bytes {}
 
       // Which encoding to use.
@@ -102,36 +107,50 @@ message Type {
       }
     }
 
-    // The encoding to use when converting to/from lower level types.
+    // The encoding to use when converting to or from lower level types.
     Encoding encoding = 1;
   }
 
   // Int64
   // Values of type `Int64` are stored in `Value.int_value`.
   message Int64 {
-    // Rules used to convert to/from lower level types.
+    // Rules used to convert to or from lower level types.
     message Encoding {
-      // Encodes the value as an 8-byte big endian twos complement `Bytes`
-      // value.
-      // * Order-preserving? No (positive values only)
-      // * Self-delimiting? Yes
-      // * Compatibility?
-      //    - BigQuery Federation `BINARY` encoding
-      //    - HBase `Bytes.toBytes`
-      //    - Java `ByteBuffer.putLong()` with `ByteOrder.BIG_ENDIAN`
+      // Encodes the value as an 8-byte big-endian two's complement value.
+      //
+      // Sorted mode: non-negative values are supported.
+      //
+      // Distinct mode: all values are supported.
+      //
+      // Compatible with:
+      //
+      //  - BigQuery `BINARY` encoding
+      //  - HBase `Bytes.toBytes`
+      //  - Java `ByteBuffer.putLong()` with `ByteOrder.BIG_ENDIAN`
       message BigEndianBytes {
         // Deprecated: ignored if set.
-        Bytes bytes_type = 1;
+        Bytes bytes_type = 1 [deprecated = true];
       }
 
+      // Encodes the value in a variable length binary format of up to 10 bytes.
+      // Values that are closer to zero use fewer bytes.
+      //
+      // Sorted mode: all values are supported.
+      //
+      // Distinct mode: all values are supported.
+      message OrderedCodeBytes {}
+
       // Which encoding to use.
       oneof encoding {
         // Use `BigEndianBytes` encoding.
         BigEndianBytes big_endian_bytes = 1;
+
+        // Use `OrderedCodeBytes` encoding.
+        OrderedCodeBytes ordered_code_bytes = 2;
       }
     }
 
-    // The encoding to use when converting to/from lower level types.
+    // The encoding to use when converting to or from lower level types.
     Encoding encoding = 1;
   }
 
@@ -149,7 +168,24 @@ message Type {
 
   // Timestamp
   // Values of type `Timestamp` are stored in `Value.timestamp_value`.
-  message Timestamp {}
+  message Timestamp {
+    // Rules used to convert to or from lower level types.
+    message Encoding {
+      // Which encoding to use.
+      oneof encoding {
+        // Encodes the number of microseconds since the Unix epoch using the
+        // given `Int64` encoding. Values must be microsecond-aligned.
+        //
+        // Compatible with:
+        //
+        //  - Java `Instant.truncatedTo()` with `ChronoUnit.MICROS`
+        Int64.Encoding unix_micros_int64 = 1;
+      }
+    }
+
+    // The encoding to use when converting to or from lower level types.
+    Encoding encoding = 1;
+  }
 
   // Date
   // Values of type `Date` are stored in `Value.date_value`.
@@ -170,8 +206,95 @@ message Type {
       Type type = 2;
     }
 
+    // Rules used to convert to or from lower level types.
+    message Encoding {
+      // Uses the encoding of `fields[0].type` as-is.
+      // Only valid if `fields.size == 1`.
+      message Singleton {}
+
+      // Fields are encoded independently and concatenated with a configurable
+      // `delimiter` in between.
+      //
+      // A struct with no fields defined is encoded as a single `delimiter`.
+      //
+      // Sorted mode:
+      //
+      //  - Fields are encoded in sorted mode.
+      //  - Encoded field values must not contain any bytes <= `delimiter[0]`.
+      //  - Element-wise order is preserved: `A < B` if `A[0] < B[0]`, or if
+      //    `A[0] == B[0] && A[1] < B[1]`, etc. Strict prefixes sort first.
+      //
+      // Distinct mode:
+      //
+      //  - Fields are encoded in distinct mode.
+      //  - Encoded field values must not contain `delimiter[0]`.
+      message DelimitedBytes {
+        // Byte sequence used to delimit concatenated fields. The delimiter must
+        // contain at least 1 byte and at most 50 bytes.
+        bytes delimiter = 1;
+      }
+
+      // Fields are encoded independently and concatenated with the fixed byte
+      // pair {0x00, 0x01} in between.
+      //
+      // Any null (0x00) byte in an encoded field is replaced by the fixed byte
+      // pair {0x00, 0xFF}.
+      //
+      // Fields that encode to the empty string "" have special handling:
+      //
+      //  - If *every* field encodes to "", or if the STRUCT has no fields
+      //    defined, then the STRUCT is encoded as the fixed byte pair
+      //    {0x00, 0x00}.
+      //  - Otherwise, the STRUCT only encodes until the last non-empty field,
+      //    omitting any trailing empty fields. Any empty fields that aren't
+      //    omitted are replaced with the fixed byte pair {0x00, 0x00}.
+      //
+      // Examples:
+      //
+      //  - STRUCT()             -> "\00\00"
+      //  - STRUCT("")           -> "\00\00"
+      //  - STRUCT("", "")       -> "\00\00"
+      //  - STRUCT("", "B")      -> "\00\00" + "\00\01" + "B"
+      //  - STRUCT("A", "")      -> "A"
+      //  - STRUCT("", "B", "")  -> "\00\00" + "\00\01" + "B"
+      //  - STRUCT("A", "", "C") -> "A" + "\00\01" + "\00\00" + "\00\01" + "C"
+      //
+      //
+      // Since null bytes are always escaped, this encoding can cause size
+      // blowup for encodings like `Int64.BigEndianBytes` that are likely to
+      // produce many such bytes.
+      //
+      // Sorted mode:
+      //
+      //  - Fields are encoded in sorted mode.
+      //  - All values supported by the field encodings are allowed.
+      //  - Element-wise order is preserved: `A < B` if `A[0] < B[0]`, or if
+      //    `A[0] == B[0] && A[1] < B[1]`, etc. Strict prefixes sort first.
+      //
+      // Distinct mode:
+      //
+      //  - Fields are encoded in distinct mode.
+      //  - All values supported by the field encodings are allowed.
+      message OrderedCodeBytes {}
+
+      // Which encoding to use.
+      oneof encoding {
+        // Use `Singleton` encoding.
+        Singleton singleton = 1;
+
+        // Use `DelimitedBytes` encoding.
+        DelimitedBytes delimited_bytes = 2;
+
+        // Use `OrderedCodeBytes` encoding.
+        OrderedCodeBytes ordered_code_bytes = 3;
+      }
+    }
+
     // The names and types of the fields in this struct.
     repeated Field fields = 1;
+
+    // The encoding to use when converting to or from lower level types.
+    Encoding encoding = 2;
   }
 
   // An ordered list of elements of a given type.