Skip to content

Commit cac736f

Browse files
Google APIs authored and
copybara-github committed
feat: Added SummaryOptions to ProcessOptions for the Summarizer processor
feat: Added `ListDocuments()` method for Document AI Workbench training documents PiperOrigin-RevId: 567684912
1 parent 54e225e commit cac736f

7 files changed

Lines changed: 216 additions & 28 deletions

File tree

google/cloud/documentai/v1beta3/document.proto

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -955,7 +955,7 @@ message Document {
955955
message RevisionRef {
956956
// Some predefined revision cases.
957957
enum RevisionCase {
958-
// Unspecified case, fallback to read the LATEST_HUMAN_REVIEW.
958+
// Unspecified case, fall back to read the `LATEST_HUMAN_REVIEW`.
959959
REVISION_CASE_UNSPECIFIED = 0;
960960

961961
// The latest revision made by a human.

google/cloud/documentai/v1beta3/document_io.proto

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,13 @@ message RawDocument {
3434
// An IANA MIME type (RFC6838) indicating the nature and format of the
3535
// [content][google.cloud.documentai.v1beta3.RawDocument.content].
3636
string mime_type = 2;
37+
38+
// The display name of the document. It supports all Unicode characters except
39+
// the following, which are reserved:
40+
// `*`, `?`, `[`, `]`, `%`, `{`, `}`, `'`, `\"`, `,`,
41+
// `~`, `=` and `:`.
42+
// If not specified, a default ID is generated.
43+
string display_name = 3;
3744
}
3845

3946
// Specifies a document stored on Cloud Storage.
@@ -117,8 +124,8 @@ message OcrConfig {
117124

118125
// Configurations for premium OCR features.
119126
message PremiumFeatures {
120-
// Turn on selection mark detector in OCR engine. Only available in OCR 2.0+
121-
// processors.
127+
// Turn on selection mark detector in OCR engine. Only available in OCR 2.0
128+
// (and later) processors.
122129
bool enable_selection_mark_detection = 3;
123130

124131
// Turn on font identification model and return font style information.
@@ -159,7 +166,7 @@ message OcrConfig {
159166
bool compute_style_info = 8 [deprecated = true];
160167

161168
// Turn off character box detector in OCR engine. Character box detection is
162-
// enabled by default in OCR 2.0+ processors.
169+
// enabled by default in OCR 2.0 (and later) processors.
163170
bool disable_character_boxes_detection = 10;
164171

165172
// Configurations for premium OCR features.

google/cloud/documentai/v1beta3/document_processor_service.proto

Lines changed: 25 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -348,16 +348,19 @@ message ProcessOptions {
348348
repeated int32 pages = 1 [(google.api.field_behavior) = OPTIONAL];
349349
}
350350

351-
// A subset of pages to process. If not specified, all pages will be
352-
// processed. NOTICE: If any of the page range is set, we will extract and
353-
// process only the given pages from the document. In the output document,
354-
// the page_number is referring to the page number in the original document.
351+
// A subset of pages to process. If not specified, all pages are processed.
352+
// If a page range is set, only the given pages are extracted and processed
353+
// from the document. In the output document,
354+
// [Document.Page.page_number][google.cloud.documentai.v1beta3.Document.Page.page_number]
355+
// refers to the page number in the original document. This configuration
356+
// only applies to sync requests. `page_range` can be only one of the
357+
// following:
355358
oneof page_range {
356359
// Which pages to process (1-indexed).
357360
IndividualPageSelector individual_page_selector = 5;
358361

359-
// Only process certain pages from the start, process all if the document
360-
// has less pages.
362+
// Only process certain pages from the start. Process all if the document
363+
// has fewer pages.
361364
int32 from_start = 6;
362365

363366
// Only process certain pages from the end, same as above.
@@ -367,6 +370,13 @@ message ProcessOptions {
367370
// Only applicable to `OCR_PROCESSOR`. Returns error if set on other
368371
// processor types.
369372
OcrConfig ocr_config = 1;
373+
374+
// Optional. Override the schema of the
375+
// [ProcessorVersion][google.cloud.documentai.v1beta3.ProcessorVersion]. Will
376+
// return an Invalid Argument error if this field is set when the underlying
377+
// [ProcessorVersion][google.cloud.documentai.v1beta3.ProcessorVersion]
378+
// doesn't support schema override.
379+
DocumentSchema schema_override = 8 [(google.api.field_behavior) = OPTIONAL];
370380
}
371381

372382
// Request message for the
@@ -1003,8 +1013,8 @@ message TrainProcessorVersionRequest {
10031013
// Options to control the training of the Custom Document Extraction (CDE)
10041014
// Processor.
10051015
message CustomDocumentExtractionOptions {
1006-
// Training Method for CDE. TRAINING_METHOD_UNSPECIFIED will fallback to
1007-
// MODEL_BASED.
1016+
// Training Method for CDE. `TRAINING_METHOD_UNSPECIFIED` will fall back to
1017+
// `MODEL_BASED`.
10081018
enum TrainingMethod {
10091019
TRAINING_METHOD_UNSPECIFIED = 0;
10101020

@@ -1289,9 +1299,9 @@ message ListEvaluationsResponse {
12891299

12901300
// The request message for the
12911301
// [ImportProcessorVersion][google.cloud.documentai.v1beta3.DocumentProcessorService.ImportProcessorVersion]
1292-
// method. Requirements:
1302+
// method.
12931303
//
1294-
// - The Document AI [Service
1304+
// The Document AI [Service
12951305
// Agent](https://cloud.google.com/iam/docs/service-agents) of the destination
12961306
// project must have [Document AI Editor
12971307
// role](https://cloud.google.com/document-ai/docs/access-control/iam-roles) on
@@ -1300,8 +1310,10 @@ message ListEvaluationsResponse {
13001310
// The destination project is specified as part of the
13011311
// [parent][google.cloud.documentai.v1beta3.ImportProcessorVersionRequest.parent]
13021312
// field. The source project is specified as part of the
1303-
// [source][ImportProcessorVersionRequest.processor_version_source or
1304-
// ImportProcessorVersionRequest.external_processor_version_source] field.
1313+
// [source][google.cloud.documentai.v1beta3.ImportProcessorVersionRequest.processor_version_source]
1314+
// or
1315+
// [external_processor_version_source][google.cloud.documentai.v1beta3.ImportProcessorVersionRequest.external_processor_version_source]
1316+
// field.
13051317
message ImportProcessorVersionRequest {
13061318
// The external source processor version.
13071319
message ExternalProcessorVersionSource {
@@ -1321,7 +1333,7 @@ message ImportProcessorVersionRequest {
13211333
type: "documentai.googleapis.com/ProcessorVersion"
13221334
}];
13231335

1324-
// The source processor version to import from, and can be from different
1336+
// The source processor version to import from. It can be from a different
13251337
// environment and region than the destination processor.
13261338
ExternalProcessorVersionSource external_processor_version_source = 3;
13271339
}

google/cloud/documentai/v1beta3/document_schema.proto

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,55 @@ option java_package = "com.google.cloud.documentai.v1beta3";
2424
option php_namespace = "Google\\Cloud\\DocumentAI\\V1beta3";
2525
option ruby_package = "Google::Cloud::DocumentAI::V1beta3";
2626

27+
// Metadata for document summarization.
28+
message SummaryOptions {
29+
// The Length enum.
30+
enum Length {
31+
// Default.
32+
LENGTH_UNSPECIFIED = 0;
33+
34+
// A brief summary of one or two sentences.
35+
BRIEF = 1;
36+
37+
// A paragraph-length summary.
38+
MODERATE = 2;
39+
40+
// The longest option available.
41+
COMPREHENSIVE = 3;
42+
}
43+
44+
// The Format enum.
45+
enum Format {
46+
// Default.
47+
FORMAT_UNSPECIFIED = 0;
48+
49+
// Format the output in paragraphs.
50+
PARAGRAPH = 1;
51+
52+
// Format the output in bullets.
53+
BULLETS = 2;
54+
}
55+
56+
// How long the summary should be.
57+
Length length = 1;
58+
59+
// The format the summary should be in.
60+
Format format = 2;
61+
}
62+
63+
// Metadata for how this field value is extracted.
64+
message FieldExtractionMetadata {
65+
// Summary options config.
66+
SummaryOptions summary_options = 2;
67+
}
68+
2769
// Metadata about a property.
2870
message PropertyMetadata {
2971
// Whether the property should be considered as "inactive".
3072
bool inactive = 3;
73+
74+
// Field extraction metadata on the property.
75+
FieldExtractionMetadata field_extraction_metadata = 9;
3176
}
3277

3378
// Metadata about an entity type.

google/cloud/documentai/v1beta3/document_service.proto

Lines changed: 120 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,15 @@ service DocumentService {
7878
option (google.api.method_signature) = "dataset";
7979
}
8080

81+
// Returns a list of documents present in the dataset.
82+
rpc ListDocuments(ListDocumentsRequest) returns (ListDocumentsResponse) {
83+
option (google.api.http) = {
84+
post: "/v1beta3/{dataset=projects/*/locations/*/processors/*/dataset}:listDocuments"
85+
body: "*"
86+
};
87+
option (google.api.method_signature) = "dataset";
88+
}
89+
8190
// Deletes a set of documents.
8291
rpc BatchDeleteDocuments(BatchDeleteDocumentsRequest)
8392
returns (google.longrunning.Operation) {
@@ -114,7 +123,6 @@ service DocumentService {
114123
// referred to as splits: train, test.
115124
enum DatasetSplitType {
116125
// Default value if the enum is not set.
117-
// go/protodosdonts#do-include-an-unspecified-value-in-an-enum
118126
DATASET_SPLIT_TYPE_UNSPECIFIED = 0;
119127

120128
// Identifies the train documents.
@@ -127,6 +135,21 @@ enum DatasetSplitType {
127135
DATASET_SPLIT_UNASSIGNED = 3;
128136
}
129137

138+
// Describes the labelling status of a document.
139+
enum DocumentLabelingState {
140+
// Default value if the enum is not set.
141+
DOCUMENT_LABELING_STATE_UNSPECIFIED = 0;
142+
143+
// Document has been labelled.
144+
DOCUMENT_LABELED = 1;
145+
146+
// Document has not been labelled.
147+
DOCUMENT_UNLABELED = 2;
148+
149+
// Document has been auto-labelled.
150+
DOCUMENT_AUTO_LABELED = 3;
151+
}
152+
130153
message UpdateDatasetRequest {
131154
// Required. The `name` field of the `Dataset` is used to identify the
132155
// resource to be updated.
@@ -137,7 +160,7 @@ message UpdateDatasetRequest {
137160
}
138161

139162
message UpdateDatasetOperationMetadata {
140-
// The basic metadata of the long running operation.
163+
// The basic metadata of the long-running operation.
141164
CommonOperationMetadata common_metadata = 1;
142165
}
143166

@@ -198,9 +221,9 @@ message ImportDocumentsMetadata {
198221
DocumentId output_document_id = 4;
199222
}
200223

201-
// The validation status of each import config. Status is set to errors if
202-
// there is no documents to import in the import_config, or OK if the
203-
// operation will try to proceed at least one document.
224+
// The validation status of each import config. Status is set to an error if
225+
// there are no documents to import in the `import_config`, or `OK` if the
226+
// operation will try to proceed with at least one document.
204227
message ImportConfigValidationResult {
205228
// The source Cloud Storage URI specified in the import config.
206229
string input_gcs_source = 1;
@@ -209,7 +232,7 @@ message ImportDocumentsMetadata {
209232
google.rpc.Status status = 2;
210233
}
211234

212-
// The basic metadata of the long running operation.
235+
// The basic metadata of the long-running operation.
213236
CommonOperationMetadata common_metadata = 1;
214237

215238
// The list of response details of each document.
@@ -249,6 +272,78 @@ message GetDocumentResponse {
249272
Document document = 1;
250273
}
251274

275+
message ListDocumentsRequest {
276+
// Required. The resource name of the dataset to be listed.
277+
// Format:
278+
// projects/{project}/locations/{location}/processors/{processor}/dataset
279+
string dataset = 1 [
280+
(google.api.field_behavior) = REQUIRED,
281+
(google.api.resource_reference) = {
282+
type: "documentai.googleapis.com/Dataset"
283+
}
284+
];
285+
286+
// The maximum number of documents to return. The service may return
287+
// fewer than this value.
288+
// If unspecified, at most 20 documents will be returned.
289+
// The maximum value is 100; values above 100 will be coerced to 100.
290+
int32 page_size = 2;
291+
292+
// A page token, received from a previous `ListDocuments` call.
293+
// Provide this to retrieve the subsequent page.
294+
//
295+
// When paginating, all other parameters provided to `ListDocuments`
296+
// must match the call that provided the page token.
297+
string page_token = 3;
298+
299+
// Optional. Query to filter the documents based on
300+
// https://google.aip.dev/160.
301+
// ## Currently supported query strings are:
302+
//
303+
// - `SplitType=DATASET_SPLIT_TEST|DATASET_SPLIT_TRAIN|DATASET_SPLIT_UNASSIGNED`
304+
// - `LabelingState=DOCUMENT_LABELED|DOCUMENT_UNLABELED|DOCUMENT_AUTO_LABELED`
305+
// - `DisplayName=\"file_name.pdf\"`
306+
// - `EntityType=abc/def`
307+
// - `TagName=\"auto-labeling-running\"|\"sampled\"`
308+
//
309+
// Note:
310+
// - Only `AND`, `=` and `!=` are supported.
311+
// e.g. `DisplayName=file_name AND EntityType!=abc` IS supported.
312+
// - Wildcard `*` is supported only in the `DisplayName` filter.
313+
// - No duplicate filter keys are allowed,
314+
// e.g. `EntityType=a AND EntityType=b` is NOT supported.
315+
// - String match is case sensitive (for filter `DisplayName` & `EntityType`).
316+
string filter = 4 [(google.api.field_behavior) = OPTIONAL];
317+
318+
// Optional. Controls if the ListDocuments request requires a total size
319+
// of matched documents. See ListDocumentsResponse.total_size.
320+
//
321+
// Enabling this flag may adversely impact performance.
322+
//
323+
// Defaults to false.
324+
bool return_total_size = 6 [(google.api.field_behavior) = OPTIONAL];
325+
326+
// Optional. Number of results to skip beginning from the `page_token` if
327+
// provided. https://google.aip.dev/158#skipping-results. It must be a
328+
// non-negative integer. Negative values will be rejected. Note that this is
329+
// not the number of pages to skip. If this value causes the cursor to move
330+
// past the end of results, `ListDocumentsResponse.document_metadata` and
331+
// `ListDocumentsResponse.next_page_token` will be empty.
332+
int32 skip = 8 [(google.api.field_behavior) = OPTIONAL];
333+
}
334+
335+
message ListDocumentsResponse {
336+
// Document metadata corresponding to the listed documents.
337+
repeated DocumentMetadata document_metadata = 1;
338+
339+
// A token, which can be sent as `page_token` to retrieve the next page.
340+
// If this field is omitted, there are no subsequent pages.
341+
string next_page_token = 2;
342+
343+
// Total count of documents queried.
344+
int32 total_size = 3;
345+
}
346+
252347
message BatchDeleteDocumentsRequest {
253348
// Required. The dataset resource name.
254349
// Format:
@@ -276,7 +371,7 @@ message BatchDeleteDocumentsMetadata {
276371
google.rpc.Status status = 2;
277372
}
278373

279-
// The basic metadata of the long running operation.
374+
// The basic metadata of the long-running operation.
280375
CommonOperationMetadata common_metadata = 1;
281376

282377
// The list of response details of each document.
@@ -323,3 +418,21 @@ message DocumentPageRange {
323418
// Last page number (one-based index) to be returned.
324419
int32 end = 2;
325420
}
421+
422+
// Metadata about a document.
423+
message DocumentMetadata {
424+
// Document identifier.
425+
DocumentId document_id = 1;
426+
427+
// Number of pages in the document.
428+
int32 page_count = 2;
429+
430+
// Type of the dataset split to which the document belongs.
431+
DatasetSplitType dataset_type = 3;
432+
433+
// Labelling state of the document.
434+
DocumentLabelingState labeling_state = 5;
435+
436+
// The display name of the document.
437+
string display_name = 6;
438+
}

google/cloud/documentai/v1beta3/documentai_v1beta3.yaml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -102,10 +102,6 @@ publishing:
102102
github_label: 'api: documentai'
103103
organization: CLOUD
104104
library_settings:
105-
- version: google.cloud.documentai.v1
106-
dotnet_settings:
107-
ignored_resources:
108-
- documentai.googleapis.com/Location
109105
- version: google.cloud.documentai.v1beta3
110106
dotnet_settings:
111107
ignored_resources:

0 commit comments

Comments
 (0)