Skip to content

Commit d714d86

Browse files
shollyman authored and noahdietz committed
feat(bigquery/storage/managedwriter/adapt): support packed field option (#6312)
* feat(bigquery/storage/managedwriter/adapt): support packed field option. This PR adds the "packed" field option for repeated numeric scalar types when converting from table schema to proto descriptor. For large repetitions, this can yield wire size encoding benefits. This option is only relevant for proto2 descriptors; proto3 packs by default.
1 parent 455ae02 commit d714d86

2 files changed

Lines changed: 71 additions & 8 deletions

File tree

bigquery/storage/managedwriter/adapt/protoconversion.go

Lines changed: 42 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -63,6 +63,27 @@ var bqTypeToFieldTypeMap = map[storagepb.TableFieldSchema_Type]descriptorpb.Fiel
6363
storagepb.TableFieldSchema_TIMESTAMP: descriptorpb.FieldDescriptorProto_TYPE_INT64,
6464
}
6565

66+
// Primitive types which can leverage packed encoding when repeated/arrays.
67+
//
68+
// Note: many/most of these aren't used when doing schema to proto conversion, but
69+
// are included for completeness.
70+
var packedTypes = []descriptorpb.FieldDescriptorProto_Type{
71+
descriptorpb.FieldDescriptorProto_TYPE_INT32,
72+
descriptorpb.FieldDescriptorProto_TYPE_INT64,
73+
descriptorpb.FieldDescriptorProto_TYPE_UINT32,
74+
descriptorpb.FieldDescriptorProto_TYPE_UINT64,
75+
descriptorpb.FieldDescriptorProto_TYPE_SINT32,
76+
descriptorpb.FieldDescriptorProto_TYPE_SINT64,
77+
descriptorpb.FieldDescriptorProto_TYPE_FIXED32,
78+
descriptorpb.FieldDescriptorProto_TYPE_FIXED64,
79+
descriptorpb.FieldDescriptorProto_TYPE_SFIXED32,
80+
descriptorpb.FieldDescriptorProto_TYPE_SFIXED64,
81+
descriptorpb.FieldDescriptorProto_TYPE_FLOAT,
82+
descriptorpb.FieldDescriptorProto_TYPE_DOUBLE,
83+
descriptorpb.FieldDescriptorProto_TYPE_BOOL,
84+
descriptorpb.FieldDescriptorProto_TYPE_ENUM,
85+
}
86+
6687
// For TableFieldSchema OPTIONAL mode, we use the wrapper types to allow for the
6788
// proper representation of NULL values, as proto3 semantics would just use default value.
6889
var bqTypeToWrapperMap = map[storagepb.TableFieldSchema_Type]string{
@@ -85,7 +106,7 @@ var wellKnownTypesWrapperName = "google/protobuf/wrappers.proto"
85106

86107
// dependencyCache is used to reduce the number of unique messages we generate by caching based on the tableschema.
87108
//
88-
// keys are based on the base64-encoded serialized tableschema value.
109+
// Keys are based on the base64-encoded serialized tableschema value.
89110
type dependencyCache map[string]protoreflect.MessageDescriptor
90111

91112
func (dm dependencyCache) get(schema *storagepb.TableSchema) protoreflect.MessageDescriptor {
@@ -143,7 +164,7 @@ func StorageSchemaToProto3Descriptor(inSchema *storagepb.TableSchema, scope stri
143164
return storageSchemaToDescriptorInternal(inSchema, scope, &dc, true)
144165
}
145166

146-
// internal implementation of the conversion code.
167+
// Internal implementation of the conversion code.
147168
func storageSchemaToDescriptorInternal(inSchema *storagepb.TableSchema, scope string, cache *dependencyCache, useProto3 bool) (protoreflect.MessageDescriptor, error) {
148169
if inSchema == nil {
149170
return nil, newConversionError(scope, fmt.Errorf("no input schema was provided"))
@@ -170,11 +191,11 @@ func storageSchemaToDescriptorInternal(inSchema *storagepb.TableSchema, scope st
170191
break
171192
}
172193
}
173-
// if dep is missing, add to current dependencies
194+
// If dep is missing, add to current dependencies.
174195
if !haveDep {
175196
deps = append(deps, foundDesc.ParentFile())
176197
}
177-
// construct field descriptor for the message
198+
// Construct field descriptor for the message.
178199
fdp, err := tableFieldSchemaToFieldDescriptorProto(f, fNumber, string(foundDesc.FullName()), useProto3)
179200
if err != nil {
180201
return nil, newConversionError(scope, fmt.Errorf("couldn't convert field to FieldDescriptorProto: %w", err))
@@ -277,12 +298,25 @@ func tableFieldSchemaToFieldDescriptorProto(field *storagepb.TableFieldSchema, i
277298

278299
// For (REQUIRED||REPEATED) fields for proto3, or all cases for proto2, we can use the expected scalar types.
279300
if field.GetMode() != storagepb.TableFieldSchema_NULLABLE || !useProto3 {
280-
return &descriptorpb.FieldDescriptorProto{
301+
outType := bqTypeToFieldTypeMap[field.GetType()]
302+
fdp := &descriptorpb.FieldDescriptorProto{
281303
Name: proto.String(name),
282304
Number: proto.Int32(idx),
283-
Type: bqTypeToFieldTypeMap[field.GetType()].Enum(),
305+
Type: outType.Enum(),
284306
Label: convertModeToLabel(field.GetMode(), useProto3),
285-
}, nil
307+
}
308+
// Special case: proto2 repeated fields may benefit from using packed annotation.
309+
if field.GetMode() == storagepb.TableFieldSchema_REPEATED && !useProto3 {
310+
for _, v := range packedTypes {
311+
if outType == v {
312+
fdp.Options = &descriptorpb.FieldOptions{
313+
Packed: proto.Bool(true),
314+
}
315+
break
316+
}
317+
}
318+
}
319+
return fdp, nil
286320
}
287321
// For NULLABLE proto3 fields, use a wrapper type.
288322
return &descriptorpb.FieldDescriptorProto{
@@ -445,7 +479,7 @@ func normalizeName(in string) string {
445479
return strings.Replace(in, ".", "_", -1)
446480
}
447481

448-
// these types don't get normalized into the fully-contained structure.
482+
// These types don't get normalized into the fully-contained structure.
449483
var normalizationSkipList = []string{
450484
/*
451485
TODO: when backend supports resolving well known types, this list should be enabled.

bigquery/storage/managedwriter/adapt/protoconversion_test.go

Lines changed: 29 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -384,6 +384,35 @@ func TestSchemaToProtoConversion(t *testing.T) {
384384
},
385385
},
386386
},
387+
{
388+
description: "repeated w/packed",
389+
bq: &storagepb.TableSchema{
390+
Fields: []*storagepb.TableFieldSchema{
391+
{Name: "name", Type: storagepb.TableFieldSchema_STRING, Mode: storagepb.TableFieldSchema_NULLABLE},
392+
{Name: "some_lengths", Type: storagepb.TableFieldSchema_INT64, Mode: storagepb.TableFieldSchema_REPEATED},
393+
{Name: "nicknames", Type: storagepb.TableFieldSchema_STRING, Mode: storagepb.TableFieldSchema_REPEATED},
394+
}},
395+
wantProto2: &descriptorpb.DescriptorProto{
396+
Name: proto.String("root"),
397+
Field: []*descriptorpb.FieldDescriptorProto{
398+
{
399+
Name: proto.String("name"),
400+
Number: proto.Int32(1),
401+
Type: descriptorpb.FieldDescriptorProto_TYPE_STRING.Enum(),
402+
Label: descriptorpb.FieldDescriptorProto_LABEL_OPTIONAL.Enum()},
403+
{
404+
Name: proto.String("some_lengths"),
405+
Number: proto.Int32(2),
406+
Type: descriptorpb.FieldDescriptorProto_TYPE_INT64.Enum(),
407+
Label: descriptorpb.FieldDescriptorProto_LABEL_REPEATED.Enum(),
408+
Options: &descriptorpb.FieldOptions{
409+
Packed: proto.Bool(true),
410+
},
411+
},
412+
{Name: proto.String("nicknames"), Number: proto.Int32(3), Type: descriptorpb.FieldDescriptorProto_TYPE_STRING.Enum(), Label: descriptorpb.FieldDescriptorProto_LABEL_REPEATED.Enum()},
413+
},
414+
},
415+
},
387416
}
388417
for _, tc := range testCases {
389418
// Proto2

0 commit comments

Comments (0)