@@ -31,35 +31,34 @@ option ruby_package = "Google::Cloud::Bigtable::Admin::V2";
3131// familiarity and consistency across products and features.
3232//
3333// For compatibility with Bigtable's existing untyped APIs, each `Type` includes
34- // an `Encoding` which describes how to convert to/ from the underlying data.
34+ // an `Encoding` which describes how to convert to or from the underlying data.
3535//
36- // Each encoding also defines the following properties :
36+ // Each encoding can operate in one of two modes:
3737//
38- // * Order-preserving: Does the encoded value sort consistently with the
39- // original typed value? Note that Bigtable will always sort data based on
40- // the raw encoded value, *not* the decoded type.
41- // - Example: BYTES values sort in the same order as their raw encodings.
42- // - Counterexample: Encoding INT64 as a fixed-width decimal string does
43- // *not* preserve sort order when dealing with negative numbers.
44- // `INT64(1) > INT64(-1)`, but `STRING("-00001") > STRING("00001)`.
45- // * Self-delimiting: If we concatenate two encoded values, can we always tell
46- // where the first one ends and the second one begins?
47- // - Example: If we encode INT64s to fixed-width STRINGs, the first value
48- // will always contain exactly N digits, possibly preceded by a sign.
49- // - Counterexample: If we concatenate two UTF-8 encoded STRINGs, we have
50- // no way to tell where the first one ends.
51- // * Compatibility: Which other systems have matching encoding schemes? For
52- // example, does this encoding have a GoogleSQL equivalent? HBase? Java?
38+ // - Sorted: In this mode, Bigtable guarantees that `Encode(X) <= Encode(Y)`
39+ // if and only if `X <= Y`. This is useful anywhere sort order is important,
40+ // for example when encoding keys.
41+ // - Distinct: In this mode, Bigtable guarantees that if `X != Y` then
42+ // `Encode(X) != Encode(Y)`. However, the converse is not guaranteed. For
43+ // example, both "{'foo': '1', 'bar': '2'}" and "{'bar': '2', 'foo': '1'}"
44+ // are valid encodings of the same JSON value.
45+ //
46+ // The API clearly documents which mode is used wherever an encoding can be
47+ // configured. Each encoding also documents which values are supported in which
48+ // modes. For example, when encoding INT64 as a numeric STRING, negative numbers
49+ // cannot be encoded in sorted mode. This is because `INT64(-1) > INT64(-2)`, but
50+ // `STRING("-00001") < STRING("-00002")`.
5351message Type {
5452 // Bytes
5553 // Values of type `Bytes` are stored in `Value.bytes_value`.
5654 message Bytes {
57- // Rules used to convert to/ from lower level types.
55+ // Rules used to convert to or from lower level types.
5856 message Encoding {
59- // Leaves the value "as-is"
60- // * Order-preserving? Yes
61- // * Self-delimiting? No
62- // * Compatibility? N/A
57+ // Leaves the value as-is.
58+ //
59+ // Sorted mode: all values are supported.
60+ //
61+ // Distinct mode: all values are supported.
6362 message Raw {}
6463
6564 // Which encoding to use.
@@ -69,27 +68,33 @@ message Type {
6968 }
7069 }
7170
72- // The encoding to use when converting to/ from lower level types.
71+ // The encoding to use when converting to or from lower level types.
7372 Encoding encoding = 1 ;
7473 }
7574
7675 // String
7776 // Values of type `String` are stored in `Value.string_value`.
7877 message String {
79- // Rules used to convert to/ from lower level types.
78+ // Rules used to convert to or from lower level types.
8079 message Encoding {
8180 // Deprecated: prefer the equivalent `Utf8Bytes`.
8281 message Utf8Raw {
8382 option deprecated = true ;
8483 }
8584
86- // UTF-8 encoding
87- // * Order-preserving? Yes (code point order)
88- // * Self-delimiting? No
89- // * Compatibility?
90- // - BigQuery Federation `TEXT` encoding
91- // - HBase `Bytes.toBytes`
92- // - Java `String#getBytes(StandardCharsets.UTF_8)`
85+ // UTF-8 encoding.
86+ //
87+ // Sorted mode:
88+ // - All values are supported.
89+ // - Code point order is preserved.
90+ //
91+ // Distinct mode: all values are supported.
92+ //
93+ // Compatible with:
94+ //
95+ // - BigQuery `TEXT` encoding
96+ // - HBase `Bytes.toBytes`
97+ // - Java `String#getBytes(StandardCharsets.UTF_8)`
9398 message Utf8Bytes {}
9499
95100 // Which encoding to use.
@@ -102,36 +107,50 @@ message Type {
102107 }
103108 }
104109
105- // The encoding to use when converting to/ from lower level types.
110+ // The encoding to use when converting to or from lower level types.
106111 Encoding encoding = 1 ;
107112 }
108113
109114 // Int64
110115 // Values of type `Int64` are stored in `Value.int_value`.
111116 message Int64 {
112- // Rules used to convert to/ from lower level types.
117+ // Rules used to convert to or from lower level types.
113118 message Encoding {
114- // Encodes the value as an 8-byte big endian twos complement `Bytes`
115- // value.
116- // * Order-preserving? No (positive values only)
117- // * Self-delimiting? Yes
118- // * Compatibility?
119- // - BigQuery Federation `BINARY` encoding
120- // - HBase `Bytes.toBytes`
121- // - Java `ByteBuffer.putLong()` with `ByteOrder.BIG_ENDIAN`
119+ // Encodes the value as an 8-byte big-endian two's complement value.
120+ //
121+ // Sorted mode: non-negative values are supported.
122+ //
123+ // Distinct mode: all values are supported.
124+ //
125+ // Compatible with:
126+ //
127+ // - BigQuery `BINARY` encoding
128+ // - HBase `Bytes.toBytes`
129+ // - Java `ByteBuffer.putLong()` with `ByteOrder.BIG_ENDIAN`
122130 message BigEndianBytes {
123131 // Deprecated: ignored if set.
124- Bytes bytes_type = 1 ;
132+ Bytes bytes_type = 1 [ deprecated = true ] ;
125133 }
126134
135+ // Encodes the value in a variable-length binary format of up to 10 bytes.
136+ // Values that are closer to zero use fewer bytes.
137+ //
138+ // Sorted mode: all values are supported.
139+ //
140+ // Distinct mode: all values are supported.
141+ message OrderedCodeBytes {}
142+
127143 // Which encoding to use.
128144 oneof encoding {
129145 // Use `BigEndianBytes` encoding.
130146 BigEndianBytes big_endian_bytes = 1 ;
147+
148+ // Use `OrderedCodeBytes` encoding.
149+ OrderedCodeBytes ordered_code_bytes = 2 ;
131150 }
132151 }
133152
134- // The encoding to use when converting to/ from lower level types.
153+ // The encoding to use when converting to or from lower level types.
135154 Encoding encoding = 1 ;
136155 }
137156
@@ -149,7 +168,24 @@ message Type {
149168
150169 // Timestamp
151170 // Values of type `Timestamp` are stored in `Value.timestamp_value`.
152- message Timestamp {}
171+ message Timestamp {
172+ // Rules used to convert to or from lower level types.
173+ message Encoding {
174+ // Which encoding to use.
175+ oneof encoding {
176+ // Encodes the number of microseconds since the Unix epoch using the
177+ // given `Int64` encoding. Values must be microsecond-aligned.
178+ //
179+ // Compatible with:
180+ //
181+ // - Java `Instant.truncatedTo()` with `ChronoUnit.MICROS`
182+ Int64.Encoding unix_micros_int64 = 1 ;
183+ }
184+ }
185+
186+ // The encoding to use when converting to or from lower level types.
187+ Encoding encoding = 1 ;
188+ }
153189
154190 // Date
155191 // Values of type `Date` are stored in `Value.date_value`.
@@ -170,8 +206,95 @@ message Type {
170206 Type type = 2 ;
171207 }
172208
209+ // Rules used to convert to or from lower level types.
210+ message Encoding {
211+ // Uses the encoding of `fields[0].type` as-is.
212+ // Only valid if `fields.size == 1`.
213+ message Singleton {}
214+
215+ // Fields are encoded independently and concatenated with a configurable
216+ // `delimiter` in between.
217+ //
218+ // A struct with no fields defined is encoded as a single `delimiter`.
219+ //
220+ // Sorted mode:
221+ //
222+ // - Fields are encoded in sorted mode.
223+ // - Encoded field values must not contain any bytes <= `delimiter[0]`.
224+ // - Element-wise order is preserved: `A < B` if `A[0] < B[0]`, or if
225+ // `A[0] == B[0] && A[1] < B[1]`, etc. Strict prefixes sort first.
226+ //
227+ // Distinct mode:
228+ //
229+ // - Fields are encoded in distinct mode.
230+ // - Encoded field values must not contain `delimiter[0]`.
231+ message DelimitedBytes {
232+ // Byte sequence used to delimit concatenated fields. The delimiter must
233+ // contain at least 1 byte and at most 50 bytes.
234+ bytes delimiter = 1 ;
235+ }
236+
237+ // Fields are encoded independently and concatenated with the fixed byte
238+ // pair {0x00, 0x01} in between.
239+ //
240+ // Any null (0x00) byte in an encoded field is replaced by the fixed byte
241+ // pair {0x00, 0xFF}.
242+ //
243+ // Fields that encode to the empty string "" have special handling:
244+ //
245+ // - If *every* field encodes to "", or if the STRUCT has no fields
246+ // defined, then the STRUCT is encoded as the fixed byte pair
247+ // {0x00, 0x00}.
248+ // - Otherwise, the STRUCT only encodes until the last non-empty field,
249+ // omitting any trailing empty fields. Any empty fields that aren't
250+ // omitted are replaced with the fixed byte pair {0x00, 0x00}.
251+ //
252+ // Examples:
253+ //
254+ // - STRUCT() -> "\00\00"
255+ // - STRUCT("") -> "\00\00"
256+ // - STRUCT("", "") -> "\00\00"
257+ // - STRUCT("", "B") -> "\00\00" + "\00\01" + "B"
258+ // - STRUCT("A", "") -> "A"
259+ // - STRUCT("", "B", "") -> "\00\00" + "\00\01" + "B"
260+ // - STRUCT("A", "", "C") -> "A" + "\00\01" + "\00\00" + "\00\01" + "C"
261+ //
262+ //
263+ // Since null bytes are always escaped, this encoding can cause size
264+ // blowup for encodings like `Int64.BigEndianBytes` that are likely to
265+ // produce many such bytes.
266+ //
267+ // Sorted mode:
268+ //
269+ // - Fields are encoded in sorted mode.
270+ // - All values supported by the field encodings are allowed.
271+ // - Element-wise order is preserved: `A < B` if `A[0] < B[0]`, or if
272+ // `A[0] == B[0] && A[1] < B[1]`, etc. Strict prefixes sort first.
273+ //
274+ // Distinct mode:
275+ //
276+ // - Fields are encoded in distinct mode.
277+ // - All values supported by the field encodings are allowed.
278+ message OrderedCodeBytes {}
279+
280+ // Which encoding to use.
281+ oneof encoding {
282+ // Use `Singleton` encoding.
283+ Singleton singleton = 1 ;
284+
285+ // Use `DelimitedBytes` encoding.
286+ DelimitedBytes delimited_bytes = 2 ;
287+
288+ // Use `OrderedCodeBytes` encoding.
289+ OrderedCodeBytes ordered_code_bytes = 3 ;
290+ }
291+ }
292+
173293 // The names and types of the fields in this struct.
174294 repeated Field fields = 1 ;
295+
296+ // The encoding to use when converting to or from lower level types.
297+ Encoding encoding = 2 ;
175298 }
176299
177300 // An ordered list of elements of a given type.
0 commit comments