@@ -78,6 +78,15 @@ service DocumentService {
7878 option (google.api.method_signature ) = "dataset" ;
7979 }
8080
// Lists the documents that are present in the given dataset.
//
// Exposed over HTTP as the custom `:listDocuments` POST method on the
// dataset resource; the entire request message is carried in the body.
rpc ListDocuments(ListDocumentsRequest) returns (ListDocumentsResponse) {
  option (google.api.http) = {
    post: "/v1beta3/{dataset=projects/*/locations/*/processors/*/dataset}:listDocuments"
    body: "*"
  };
  option (google.api.method_signature) = "dataset";
}
89+
8190 // Deletes a set of documents.
8291 rpc BatchDeleteDocuments (BatchDeleteDocumentsRequest )
8392 returns (google.longrunning.Operation ) {
@@ -114,7 +123,6 @@ service DocumentService {
114123// referred to as splits: train, test.
115124enum DatasetSplitType {
116125 // Default value if the enum is not set.
117- // go/protodosdonts#do-include-an-unspecified-value-in-an-enum
118126 DATASET_SPLIT_TYPE_UNSPECIFIED = 0 ;
119127
120128 // Identifies the train documents.
@@ -127,6 +135,21 @@ enum DatasetSplitType {
127135 DATASET_SPLIT_UNASSIGNED = 3 ;
128136}
129137
// Labeling status of a document.
enum DocumentLabelingState {
  // NOTE(review): the non-zero value names below are not prefixed with
  // `DOCUMENT_LABELING_STATE_`. They are kept as-is because these exact
  // names are also what the `LabelingState` list filter accepts; renaming
  // them would be a breaking change.

  // Unset. This is the default when no labeling state is provided.
  DOCUMENT_LABELING_STATE_UNSPECIFIED = 0;

  // The document has been labeled.
  DOCUMENT_LABELED = 1;

  // The document has not been labeled.
  DOCUMENT_UNLABELED = 2;

  // The document has been auto-labeled.
  DOCUMENT_AUTO_LABELED = 3;
}
152+
130153message UpdateDatasetRequest {
131154 // Required. The `name` field of the `Dataset` is used to identify the
132155 // resource to be updated.
@@ -137,7 +160,7 @@ message UpdateDatasetRequest {
137160}
138161
139162message UpdateDatasetOperationMetadata {
140- // The basic metadata of the long running operation.
163+ // The basic metadata of the long-running operation.
141164 CommonOperationMetadata common_metadata = 1 ;
142165}
143166
@@ -198,9 +221,9 @@ message ImportDocumentsMetadata {
198221 DocumentId output_document_id = 4 ;
199222 }
200223
201- // The validation status of each import config. Status is set to errors if
202- // there is no documents to import in the import_config, or OK if the
203- // operation will try to proceed at least one document.
224+ // The validation status of each import config. Status is set to an error if
225+ // there are no documents to import in the `import_config`, or `OK` if the
226+ // operation will try to proceed with at least one document.
204227 message ImportConfigValidationResult {
205228 // The source Cloud Storage URI specified in the import config.
206229 string input_gcs_source = 1 ;
@@ -209,7 +232,7 @@ message ImportDocumentsMetadata {
209232 google.rpc.Status status = 2 ;
210233 }
211234
212- // The basic metadata of the long running operation.
235+ // The basic metadata of the long-running operation.
213236 CommonOperationMetadata common_metadata = 1 ;
214237
215238 // The list of response details of each document.
@@ -249,6 +272,78 @@ message GetDocumentResponse {
249272 Document document = 1 ;
250273}
251274
// Request message for `DocumentService.ListDocuments`.
message ListDocumentsRequest {
  // Required. The resource name of the dataset whose documents are listed.
  // Format:
  // projects/{project}/locations/{location}/processors/{processor}/dataset
  string dataset = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "documentai.googleapis.com/Dataset"
    }
  ];

  // Optional. The maximum number of documents to return. The service may
  // return fewer than this value.
  // If unspecified, at most 20 documents will be returned.
  // The maximum value is 100; values above 100 will be coerced to 100.
  int32 page_size = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. A page token, received from a previous `ListDocuments` call.
  // Provide this to retrieve the subsequent page.
  //
  // When paginating, all other parameters provided to `ListDocuments`
  // must match the call that provided the page token.
  string page_token = 3 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Query to filter the documents, following
  // https://google.aip.dev/160.
  //
  // Currently supported query strings are:
  //
  // - `SplitType=DATASET_SPLIT_TEST|DATASET_SPLIT_TRAIN|DATASET_SPLIT_UNASSIGNED`
  // - `LabelingState=DOCUMENT_LABELED|DOCUMENT_UNLABELED|DOCUMENT_AUTO_LABELED`
  // - `DisplayName=\"file_name.pdf\"`
  // - `EntityType=abc/def`
  // - `TagName=\"auto-labeling-running\"|\"sampled\"`
  //
  // Note:
  //
  // - Only `AND`, `=` and `!=` are supported,
  //   e.g. `DisplayName=file_name AND EntityType!=abc` IS supported.
  // - Wildcard `*` is supported only in the `DisplayName` filter.
  // - No duplicate filter keys are allowed,
  //   e.g. `EntityType=a AND EntityType=b` is NOT supported.
  // - String match is case sensitive (for filters `DisplayName` and
  //   `EntityType`).
  string filter = 4 [(google.api.field_behavior) = OPTIONAL];

  // NOTE(review): field numbers 5 and 7 are not used by any field visible in
  // this message; confirm whether they should be declared `reserved`.

  // Optional. Controls whether the `ListDocuments` request requires a total
  // size of matched documents. See `ListDocumentsResponse.total_size`.
  //
  // Enabling this flag may adversely impact performance.
  //
  // Defaults to false.
  bool return_total_size = 6 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Number of results to skip, counted from the position given by
  // `page_token` if one is provided. See
  // https://google.aip.dev/158#skipping-results. It must be a non-negative
  // integer; negative values will be rejected. Note that this is a number of
  // results, not a number of pages, to skip. If this value causes the cursor
  // to move past the end of the results,
  // `ListDocumentsResponse.document_metadata` and
  // `ListDocumentsResponse.next_page_token` will be empty.
  int32 skip = 8 [(google.api.field_behavior) = OPTIONAL];
}
334+
// Response message for `DocumentService.ListDocuments`.
message ListDocumentsResponse {
  // Metadata for each of the documents being listed.
  repeated DocumentMetadata document_metadata = 1;

  // Token to send as `page_token` in a follow-up request in order to fetch
  // the next page. When this field is omitted, no further pages exist.
  string next_page_token = 2;

  // Total count of documents queried. See
  // `ListDocumentsRequest.return_total_size`.
  int32 total_size = 3;
}
346+
252347message BatchDeleteDocumentsRequest {
253348 // Required. The dataset resource name.
254349 // Format:
@@ -276,7 +371,7 @@ message BatchDeleteDocumentsMetadata {
276371 google.rpc.Status status = 2 ;
277372 }
278373
279- // The basic metadata of the long running operation.
374+ // The basic metadata of the long-running operation.
280375 CommonOperationMetadata common_metadata = 1 ;
281376
282377 // The list of response details of each document.
@@ -323,3 +418,21 @@ message DocumentPageRange {
323418 // Last page number (one-based index) to be returned.
324419 int32 end = 2 ;
325420}
421+
// Summary information describing a single document.
message DocumentMetadata {
  // Identifier of the document.
  DocumentId document_id = 1;

  // How many pages the document contains.
  int32 page_count = 2;

  // The dataset split to which the document belongs.
  DatasetSplitType dataset_type = 3;

  // NOTE(review): field number 4 is not used by any field visible in this
  // message; confirm whether it should be declared `reserved`.

  // Labeling state of the document.
  DocumentLabelingState labeling_state = 5;

  // Display name of the document.
  string display_name = 6;
}
0 commit comments