Skip to content

Commit 53f9b45

Browse files
authored
Merge pull request #10543 from IQSS/10169-JSON-schema-validation
Improved JSON Schema validation for datasets
2 parents 7d4d534 + 55a8bce commit 53f9b45

File tree

9 files changed

+667
-12
lines changed

9 files changed

+667
-12
lines changed
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
### Improved JSON Schema validation for datasets
2+
3+
JSON schema validation has been enhanced with checks for required and allowed child objects, and with type checking for the field types `primitive`, `compound` and `controlledVocabulary`. Error messages are now more user-friendly, to help pinpoint issues in the dataset JSON. See [Retrieve a Dataset JSON Schema for a Collection](https://guides.dataverse.org/en/6.3/api/native-api.html#retrieve-a-dataset-json-schema-for-a-collection) in the API Guide and PR #10543.

doc/sphinx-guides/source/api/native-api.rst

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -566,9 +566,7 @@ The fully expanded example above (without environment variables) looks like this
566566
Retrieve a Dataset JSON Schema for a Collection
567567
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
568568

569-
Retrieves a JSON schema customized for a given collection in order to validate a dataset JSON file prior to creating the dataset. This
570-
first version of the schema only includes required elements and fields. In the future we plan to improve the schema by adding controlled
571-
vocabulary and more robust dataset field format testing:
569+
Retrieves a JSON schema customized for a given collection in order to validate a dataset JSON file prior to creating the dataset:
572570

573571
.. code-block:: bash
574572
@@ -593,8 +591,22 @@ While it is recommended to download a copy of the JSON Schema from the collectio
593591
Validate Dataset JSON File for a Collection
594592
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
595593

596-
Validates a dataset JSON file customized for a given collection prior to creating the dataset. The validation only tests for json formatting
597-
and the presence of required elements:
594+
Validates a dataset JSON file against the JSON schema customized for a given collection, prior to creating the dataset.
595+
596+
The validation tests for:
597+
598+
- JSON formatting
599+
- required fields
600+
- typeClass must follow these rules:
601+
602+
- if multiple = true then value must be a list
603+
- if typeClass = ``primitive`` the value object is a String or a List of Strings depending on the multiple flag
604+
- if typeClass = ``compound`` the value object is a FieldDTO or a List of FieldDTOs depending on the multiple flag
605+
- if typeClass = ``controlledVocabulary`` the values are checked against the list of allowed values stored in the database
606+
- typeName validations (child objects with their required and allowed typeNames are configured automatically by the database schema). Examples include:
607+
608+
- dsDescription validation includes checks for typeName = ``dsDescriptionValue`` (required) and ``dsDescriptionDate`` (optional)
609+
- datasetContact validation includes checks for typeName = ``datasetContactName`` (required), plus ``datasetContactEmail`` and ``datasetContactAffiliation`` (optional)
598610

599611
.. code-block:: bash
600612
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
{
2+
"datasetVersion": {
3+
"license": {
4+
"name": "CC0 1.0",
5+
"uri": "http://creativecommons.org/publicdomain/zero/1.0"
6+
},
7+
"metadataBlocks": {
8+
"citation": {
9+
"fields": [
10+
{
11+
"value": "HTML & More",
12+
"typeClass": "primitive",
13+
"multiple": false,
14+
"typeName": "title"
15+
},
16+
{
17+
"value": [
18+
{
19+
"authorName": {
20+
"value": "Markup, Marty",
21+
"typeClass": "primitive",
22+
"multiple": false,
23+
"typeName": "authorName"
24+
},
25+
"authorAffiliation": {
26+
"value": "W4C",
27+
"typeClass": "primitive",
28+
"multiple": false,
29+
"typeName": "authorAffiliation"
30+
}
31+
}
32+
],
33+
"typeClass": "compound",
34+
"multiple": true,
35+
"typeName": "author"
36+
},
37+
{
38+
"value": [
39+
{
40+
"datasetContactEmail": {
41+
"typeClass": "primitive",
42+
"multiple": false,
43+
"typeName": "datasetContactEmail",
44+
"value": "[email protected]"
45+
},
46+
"datasetContactName": {
47+
"typeClass": "primitive",
48+
"multiple": false,
49+
"typeName": "datasetContactName",
50+
"value": "Markup, Marty"
51+
}
52+
}
53+
],
54+
"typeClass": "compound",
55+
"multiple": true,
56+
"typeName": "datasetContact"
57+
},
58+
{
59+
"value": [
60+
{
61+
"dsDescriptionValue": {
62+
"value": "BEGIN<br></br>END",
63+
"multiple": false,
64+
"typeClass": "primitive",
65+
"typeName": "dsDescriptionValue"
66+
},
67+
"dsDescriptionDate": {
68+
"typeName": "dsDescriptionDate",
69+
"multiple": false,
70+
"typeClass": "primitive",
71+
"value": "2021-07-13"
72+
}
73+
}
74+
],
75+
"typeClass": "compound",
76+
"multiple": true,
77+
"typeName": "dsDescription"
78+
},
79+
{
80+
"value": [
81+
"Medicine, Health and Life Sciences"
82+
],
83+
"typeClass": "controlledVocabulary",
84+
"multiple": true,
85+
"typeName": "subject"
86+
},
87+
{
88+
"typeName": "language",
89+
"multiple": true,
90+
"typeClass": "controlledVocabulary",
91+
"value": [
92+
"English",
93+
"Afar",
94+
"aar"
95+
]
96+
}
97+
],
98+
"displayName": "Citation Metadata"
99+
}
100+
}
101+
}
102+
}

src/main/java/edu/harvard/iq/dataverse/DataverseServiceBean.java

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
import edu.harvard.iq.dataverse.storageuse.StorageQuota;
2323
import edu.harvard.iq.dataverse.util.StringUtil;
2424
import edu.harvard.iq.dataverse.util.SystemConfig;
25-
import edu.harvard.iq.dataverse.util.json.JsonUtil;
25+
2626
import java.io.File;
2727
import java.io.IOException;
2828
import java.sql.Timestamp;
@@ -34,6 +34,7 @@
3434
import java.util.logging.Logger;
3535
import java.util.Properties;
3636

37+
import edu.harvard.iq.dataverse.validation.JSONDataValidation;
3738
import jakarta.ejb.EJB;
3839
import jakarta.ejb.Stateless;
3940
import jakarta.inject.Inject;
@@ -888,14 +889,16 @@ public List<Object[]> getDatasetTitlesWithinDataverse(Long dataverseId) {
888889
return em.createNativeQuery(cqString).getResultList();
889890
}
890891

891-
892892
public String getCollectionDatasetSchema(String dataverseAlias) {
893+
return getCollectionDatasetSchema(dataverseAlias, null);
894+
}
895+
public String getCollectionDatasetSchema(String dataverseAlias, Map<String, Map<String,List<String>>> schemaChildMap) {
893896

894897
Dataverse testDV = this.findByAlias(dataverseAlias);
895898

896899
while (!testDV.isMetadataBlockRoot()) {
897900
if (testDV.getOwner() == null) {
898-
break; // we are at the root; which by defintion is metadata blcok root, regarldess of the value
901+
break; // we are at the root; which by definition is metadata block root, regardless of the value
899902
}
900903
testDV = testDV.getOwner();
901904
}
@@ -932,6 +935,8 @@ public String getCollectionDatasetSchema(String dataverseAlias) {
932935
dsft.setRequiredDV(dsft.isRequired());
933936
dsft.setInclude(true);
934937
}
938+
List<String> childrenRequired = new ArrayList<>();
939+
List<String> childrenAllowed = new ArrayList<>();
935940
if (dsft.isHasChildren()) {
936941
for (DatasetFieldType child : dsft.getChildDatasetFieldTypes()) {
937942
DataverseFieldTypeInputLevel dsfIlChild = dataverseFieldTypeInputLevelService.findByDataverseIdDatasetFieldTypeId(testDV.getId(), child.getId());
@@ -944,8 +949,18 @@ public String getCollectionDatasetSchema(String dataverseAlias) {
944949
child.setRequiredDV(child.isRequired() && dsft.isRequired());
945950
child.setInclude(true);
946951
}
952+
if (child.isRequired()) {
953+
childrenRequired.add(child.getName());
954+
}
955+
childrenAllowed.add(child.getName());
947956
}
948957
}
958+
if (schemaChildMap != null) {
959+
Map<String, List<String>> map = new HashMap<>();
960+
map.put("required", childrenRequired);
961+
map.put("allowed", childrenAllowed);
962+
schemaChildMap.put(dsft.getName(), map);
963+
}
949964
if(dsft.isRequiredDV()){
950965
requiredDSFT.add(dsft);
951966
}
@@ -1021,11 +1036,13 @@ private String getCustomMDBSchema (MetadataBlock mdb, List<DatasetFieldType> req
10211036
}
10221037

10231038
public String isDatasetJsonValid(String dataverseAlias, String jsonInput) {
1024-
JSONObject rawSchema = new JSONObject(new JSONTokener(getCollectionDatasetSchema(dataverseAlias)));
1039+
Map<String, Map<String,List<String>>> schemaChildMap = new HashMap<>();
1040+
JSONObject rawSchema = new JSONObject(new JSONTokener(getCollectionDatasetSchema(dataverseAlias, schemaChildMap)));
10251041

1026-
try {
1042+
try {
10271043
Schema schema = SchemaLoader.load(rawSchema);
10281044
schema.validate(new JSONObject(jsonInput)); // throws a ValidationException if this object is invalid
1045+
JSONDataValidation.validate(schema, schemaChildMap, jsonInput); // throws a ValidationException if any objects are invalid
10291046
} catch (ValidationException vx) {
10301047
logger.info(BundleUtil.getStringFromBundle("dataverses.api.validate.json.failed") + " " + vx.getErrorMessage());
10311048
String accumulatedexceptions = "";

src/main/java/edu/harvard/iq/dataverse/api/Datasets.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
package edu.harvard.iq.dataverse.api;
22

33
import com.amazonaws.services.s3.model.PartETag;
4-
54
import edu.harvard.iq.dataverse.*;
65
import edu.harvard.iq.dataverse.DatasetLock.Reason;
76
import edu.harvard.iq.dataverse.actionlogging.ActionLogRecord;

0 commit comments

Comments
 (0)