Commit 06b2156
refactor: Address PR comments.
1 parent: d6e63dc

5 files changed: +40, -102 lines


src/main/java/com/conveyal/datatools/manager/jobs/MergeFeedsJob.java
Lines changed: 5 additions & 76 deletions
@@ -55,7 +55,8 @@
  * found in any other feed version. Note: There is absolutely no attempt to merge
  * entities based on either expected shared IDs or entity location (e.g., stop
  * coordinates).
- * - {@link MergeFeedsType#SERVICE_PERIOD}: this strategy is defined in detail at https://github.com/conveyal/datatools-server/issues/185,
+ * - {@link MergeFeedsType#SERVICE_PERIOD}:
+ *   this strategy is defined in detail at https://github.com/conveyal/datatools-server/issues/185,
  * but in essence, this strategy attempts to merge an active and future feed into
  * a combined file. For certain entities (specifically stops and routes) it uses
  * alternate fields as primary keys (stop_code and route_short_name) if they are
@@ -66,55 +67,6 @@
  * prefer entities from the active version, so that entities edited in Data Tools would override the values found
  * in the "future" file, which may have limited data attributes due to being exported from scheduling software with
  * limited GTFS support.
- *
- * Reproduced from https://github.com/conveyal/datatools-server/issues/185 on 2019/04/23:
- *
- * 1. When a new GTFS+ feed is loaded in TDM, check as part of the loading and validation process if
- *    the dataset is for a future date. (If all services start in the future, consider the dataset
- *    to be for the future).
- * 2. If it is a future dataset, automatically notify the user that the feed needs to be merged with
- *    most recent active version or a selected one in order to further process the feed.
- * 3. Use the chosen version to merge the future feed. The merging process needs to be efficient so
- *    that the user doesn’t need to wait more than a tolerable time.
- * 4. The merge process shall compare the active and future datasets, validate the following rules
- *    and generate the Merge Validation Report:
- *    i. Merging will be based on route_short_name in the active and future datasets. All matching
- *    route_short_names between the datasets shall be considered same route. Any route_short_name
- *    in active data not present in the future will be appended to the future routes file.
- *    ii. Future feed_info.txt file should get priority over active feed file when difference is
- *    identified.
- *    iii. When difference is found in agency.txt file between active and future feeds, the future
- *    agency.txt file data should be used. Possible issue with missing agency_id referenced by routes
- *    iv. When stop_code is included, stop merging will be based on that. If stop_code is not
- *    included, it will be based on stop_id. All stops in future data will be carried forward and
- *    any stops found in active data that are not in the future data shall be appended. If one
- *    of the feed is missing stop_code, merge fails with a notification to the user with
- *    suggestion that the feed with missing stop_code must be fixed with stop_code.
- *    v. If any service_id in the active feed matches with the future feed, it should be modified
- *    and all associated trip records must also be changed with the modified service_id.
- *    If a service_id from the active calendar has both the start_date and end_date in the
- *    future, the service shall not be appended to the merged file. Records in trips,
- *    calendar_dates, and calendar_attributes referencing this service_id shall also be
- *    removed/ignored. Stop_time records for the ignored trips shall also be removed.
- *    If a service_id from the active calendar has only the end_date in the future, the end_date
- *    shall be set to one day prior to the earliest start_date in future dataset before appending
- *    the calendar record to the merged file.
- *    trip_ids between active and future datasets must not match. If any trip_id is found to be
- *    matching, the merge should fail with appropriate notification to user with the cause of the
- *    failure. Notification should include all matched trip_ids.
- *    vi. New shape_ids in the future datasets should be appended in the merged feed.
- *    vii. Merging fare_attributes will be based on fare_id in the active and future datasets. All
- *    matching fare_ids between the datasets shall be considered same fare. Any fare_id in active
- *    data not present in the future will be appended to the future fare_attributes file.
- *    viii. All fare rules from the future dataset will be included. Any identical fare rules from
- *    the active dataset will be discarded. Any fare rules unique to the active dataset will be
- *    appended to the future file.
- *    ix. All transfers.txt entries with unique stop pairs (from - to) from both the future and
- *    active datasets will be included in the merged file. Entries with duplicate stop pairs from
- *    the active dataset will be discarded.
- *    x. All GTFS+ files should be merged based on how the associated base GTFS file is merged. For
- *    example, directions for routes that are not in the future routes.txt file should be appended
- *    to the future directions.txt file in the merged feed.
  */
 public class MergeFeedsJob extends FeedSourceJob {
@@ -143,31 +95,13 @@ public class MergeFeedsJob extends FeedSourceJob {
     // Variables used for a service period merge.
     private FeedMergeContext feedMergeContext;

-    public MergeFeedsJob(Auth0UserProfile owner, Set<FeedVersion> feedVersions, String file, MergeFeedsType mergeType) {
-        this(owner, feedVersions, file, mergeType, true);
-    }
-
-    /** Shorthand method to get the future feed during a service period merge */
-    @BsonIgnore @JsonIgnore
-    public FeedToMerge getFutureFeed() {
-        return feedMergeContext.futureFeedToMerge;
-    }
-
-    /** Shorthand method to get the active feed during a service period merge */
-    @BsonIgnore @JsonIgnore
-    public FeedToMerge getActiveFeed() {
-        return feedMergeContext.activeFeedToMerge;
-    }
-
     /**
      * @param owner user ID that initiated job
      * @param feedVersions set of feed versions to merge
      * @param file resulting merge filename (without .zip)
      * @param mergeType the type of merge to perform {@link MergeFeedsType}
-     * @param storeNewVersion whether to store merged feed as new version
      */
-    public MergeFeedsJob(Auth0UserProfile owner, Set<FeedVersion> feedVersions, String file,
-        MergeFeedsType mergeType, boolean storeNewVersion) {
+    public MergeFeedsJob(Auth0UserProfile owner, Set<FeedVersion> feedVersions, String file, MergeFeedsType mergeType) {
         super(owner, mergeType.equals(REGIONAL) ? "Merging project feeds" : "Merging feed versions",
             JobType.MERGE_FEED_VERSIONS);
         this.feedVersions = feedVersions;
@@ -182,7 +116,7 @@ public MergeFeedsJob(Auth0UserProfile owner, Set<FeedVersion> feedVersions, Stri
         // Grab parent feed source depending on merge type.
         FeedSource regionalFeedSource = null;
         // If storing a regional merge as a new version, find the feed source designated by the project.
-        if (mergeType.equals(REGIONAL) && storeNewVersion) {
+        if (mergeType.equals(REGIONAL)) {
             regionalFeedSource = Persistence.feedSources.getById(project.regionalFeedSourceId);
             // Create new feed source if this is the first regional merge.
             if (regionalFeedSource == null) {
@@ -201,7 +135,7 @@ public MergeFeedsJob(Auth0UserProfile owner, Set<FeedVersion> feedVersions, Stri
             : feedVersions.iterator().next().parentFeedSource();
         // Assuming job is successful, mergedVersion will contain the resulting feed version.
         // Merged version will be null if the new version should not be stored.
-        this.mergedVersion = getMergedVersion(this, storeNewVersion);
+        this.mergedVersion = getMergedVersion(this, true);
         this.mergeFeedsResult = new MergeFeedsResult(mergeType);
     }
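The two hunks above finish removing the `storeNewVersion` flag: every remaining caller passed `true`, so the parameter is dropped and the constant is inlined at the one remaining use. A minimal sketch of that pattern, using hypothetical stand-in names rather than the real datatools classes:

```java
// Hedged sketch (hypothetical MergeJob/getMergedVersion stand-ins, not the
// real datatools-server classes): once every caller passes true for a
// boolean flag, the flag parameter can be removed and the constant inlined.
class MergeJob {
    final String mergedVersion;

    // After the refactor there is a single constructor; the old
    // (file, storeNewVersion) overload that forwarded `true` is gone.
    MergeJob(String file) {
        this.mergedVersion = getMergedVersion(file, true);
    }

    static String getMergedVersion(String file, boolean storeNewVersion) {
        // Returns null when the merged feed should not be stored.
        return storeNewVersion ? file + ".zip" : null;
    }
}
```

The upside is a smaller public surface; the trade-off is that re-introducing an opt-out later means touching the constructor signature again.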

@@ -210,11 +144,6 @@ public Set<FeedVersion> getFeedVersions() {
         return this.feedVersions;
     }

-    @BsonIgnore @JsonIgnore
-    public List<FeedToMerge> getFeedsToMerge() {
-        return this.feedMergeContext.feedsToMerge;
-    }
-
     /**
      * The final stage handles clean up (deleting temp file) and adding the next job to process the
      * new merged version (assuming the merge did not fail).

src/main/java/com/conveyal/datatools/manager/jobs/feedmerge/MergeFeedsResult.java
Lines changed: 0 additions & 2 deletions

@@ -18,8 +18,6 @@ public class MergeFeedsResult implements Serializable {
     /** Type of merge operation performed */
     public MergeFeedsType type;
     public MergeStrategy mergeStrategy = MergeStrategy.DEFAULT;
-    /** Contains a set of strings for which there were error-causing duplicate values */
-    public Set<String> idConflicts = new HashSet<>();
     /** Contains the set of IDs for records that were excluded in the merged feed */
     public Set<String> skippedIds = new HashSet<>();
     /**

src/main/java/com/conveyal/datatools/manager/jobs/feedmerge/MergeLineContext.java
Lines changed: 12 additions & 13 deletions

@@ -41,7 +41,6 @@
 public class MergeLineContext {
     protected static final String AGENCY_ID = "agency_id";
     protected static final String SERVICE_ID = "service_id";
-    private static final String STOPS = "stops";
     private static final Logger LOG = LoggerFactory.getLogger(MergeLineContext.class);
     protected final MergeFeedsJob job;
     private final ZipOutputStream out;
@@ -91,7 +90,7 @@ public static MergeLineContext create(MergeFeedsJob job, Table table, ZipOutputS
             return new RoutesMergeLineContext(job, table, out);
         case "shapes":
             return new ShapesMergeLineContext(job, table, out);
-        case STOPS:
+        case "stops":
             return new StopsMergeLineContext(job, table, out);
         case "trips":
             return new TripsMergeLineContext(job, table, out);
@@ -268,21 +267,17 @@ public void checkFieldsForMergeConflicts(Set<NewGTFSError> idErrors) throws IOEx
     }

     private Set<NewGTFSError> getIdErrors() {
-        Set<NewGTFSError> idErrors;
-        Field field = fieldContext.getField();
+        String fieldValue;
         // If analyzing the second feed (active feed), the service_id always gets feed scoped.
         // See https://github.com/ibi-group/datatools-server/issues/244
         if (handlingActiveFeed && fieldNameEquals(SERVICE_ID)) {
             updateAndRemapOutput();
-            idErrors = referenceTracker
-                .checkReferencesAndUniqueness(keyValue, lineNumber, field, fieldContext.getValueToWrite(),
-                    table, keyField, orderField);
+            fieldValue = fieldContext.getValueToWrite();
         } else {
-            idErrors = referenceTracker
-                .checkReferencesAndUniqueness(keyValue, lineNumber, field, fieldContext.getValue(),
-                    table, keyField, orderField);
+            fieldValue = fieldContext.getValue();
         }
-        return idErrors;
+        return referenceTracker.checkReferencesAndUniqueness(keyValue, lineNumber, fieldContext.getField(),
+            fieldValue, table, keyField, orderField);
     }

     protected void checkRoutesAndStopsIds(Set<NewGTFSError> idErrors) throws IOException {
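The `getIdErrors` rewrite above removes duplication by selecting the one argument that differs between the branches first, then making a single call. A hedged sketch of the same pattern with stand-in names (not the actual `referenceTracker` API):

```java
import java.util.HashSet;
import java.util.Set;

// Hedged sketch of the getIdErrors simplification (stand-in names, not the
// datatools-server API): rather than repeating the whole checker call in
// both branches, pick the one differing value first and call once.
class IdErrorCheck {
    static String getValue() { return "route_1"; }             // raw field value
    static String getValueToWrite() { return "feed:route_1"; } // feed-scoped value

    static Set<String> getIdErrors(boolean handlingActiveFeed) {
        String fieldValue = handlingActiveFeed ? getValueToWrite() : getValue();
        return checkReferencesAndUniqueness(fieldValue);
    }

    // Stand-in for referenceTracker.checkReferencesAndUniqueness(...).
    static Set<String> checkReferencesAndUniqueness(String value) {
        Set<String> errors = new HashSet<>();
        if (value.isEmpty()) {
            errors.add("missing id");
        }
        return errors;
    }
}
```

Keeping a single call site also means a later change to the checker's argument list only has to be made once.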
@@ -345,7 +340,11 @@ protected void checkRoutesAndStopsIds(Set<NewGTFSError> idErrors) throws IOExcep
         // where two routes have different short_names, but share the same route_id. We want
         // both of these routes to end up in the merged feed in this case because we're
         // matching on short name, so we must modify the route_id.
-        if (!skipRecord && !referenceTracker.transitIds.contains(String.join(":", keyField, keyValue)) && hasDuplicateError(primaryKeyErrors)) {
+        if (
+            !skipRecord &&
+            !referenceTracker.transitIds.contains(String.join(":", keyField, keyValue)) &&
+            hasDuplicateError(primaryKeyErrors)
+        ) {
             // Modify route_id and ensure that referencing trips
             // have route_id updated.
             updateAndRemapOutput();
@@ -398,7 +397,7 @@ public boolean storeRowAndStopValues() {
             // Store row values for route or stop ID (or alternative ID field) in order
             // to check for ID conflicts. NOTE: This is only intended to be used for
             // routes and stops. Otherwise, this might (will) consume too much memory.
-            case STOPS:
+            case "stops":
             case "routes":
                 // FIXME: This should be revised for tables with order fields, but it should work fine for its
                 // primary purposes: to detect exact copy rows and to temporarily hold the data in case a reference

src/main/java/com/conveyal/datatools/manager/jobs/feedmerge/StopsMergeLineContext.java
Lines changed: 6 additions & 2 deletions

@@ -25,15 +25,19 @@ public StopsMergeLineContext(MergeFeedsJob job, Table table, ZipOutputStream out

     @Override
     public void checkFirstLineConditions() throws IOException {
-        checkStopCodeStuff();
+        checkThatStopCodesArePopulatedWhereRequired();
     }

     @Override
     public void checkFieldsForMergeConflicts(Set<NewGTFSError> idErrors) throws IOException {
         checkRoutesAndStopsIds(idErrors);
     }

-    private void checkStopCodeStuff() throws IOException {
+    /**
+     * Checks that the stop_code field of the Stop entities to merge is populated where required.
+     * @throws IOException
+     */
+    private void checkThatStopCodesArePopulatedWhereRequired() throws IOException {
         if (shouldCheckStopCodes()) {
             // Before reading any lines in stops.txt, first determine whether all records contain
             // properly filled stop_codes. The rules governing this logic are as follows:
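The rename above documents what the check does: a service-period merge keys stops on stop_code, so records that need one must have it filled in. A minimal sketch of such a check, with hypothetical types (the real method reads stops.txt directly and applies additional rules not shown here):

```java
import java.util.List;
import java.util.Map;

// Hedged sketch of a "stop codes populated where required" check
// (hypothetical row representation, not the real datatools-server logic,
// which has further rules for feeds that omit stop_code entirely).
class StopCodeCheck {
    /** Returns true only if every stop record carries a non-blank stop_code. */
    static boolean stopCodesPopulated(List<Map<String, String>> stopRows) {
        return stopRows.stream()
            .map(row -> row.get("stop_code"))           // may be null if column absent
            .allMatch(code -> code != null && !code.isBlank());
    }
}
```

Failing this check up front, before any lines are merged, is what lets the job report "fix the feed's stop_codes" instead of producing a half-merged file.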

src/main/java/com/conveyal/datatools/manager/utils/MergeFeedUtils.java
Lines changed: 17 additions & 9 deletions

@@ -50,11 +50,16 @@ public static Set<String> getIdsForTable(ZipFile zipFile, Table table) throws IO
             LOG.warn("Table {} not found in zip file: {}", table.name, zipFile.getName());
             return ids;
         }
-        Field[] fieldsFoundInZip = table.getFieldsFromFieldHeaders(csvReader.getHeaders(), null);
-        // Get the key field (id value) for each row.
-        int keyFieldIndex = getFieldIndex(fieldsFoundInZip, keyField);
-        while (csvReader.readRecord()) ids.add(csvReader.get(keyFieldIndex));
-        csvReader.close();
+        try {
+            Field[] fieldsFoundInZip = table.getFieldsFromFieldHeaders(csvReader.getHeaders(), null);
+            // Get the key field (id value) for each row.
+            int keyFieldIndex = getFieldIndex(fieldsFoundInZip, keyField);
+            while (csvReader.readRecord()) {
+                ids.add(csvReader.get(keyFieldIndex));
+            }
+        } finally {
+            csvReader.close();
+        }
         return ids;
     }
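The hunk above wraps the read loop in try/finally so the CSV reader is closed even when header parsing or `readRecord` throws. With a `java.io` reader (which, unlike some CSV readers, implements `AutoCloseable`), try-with-resources gives the same guarantee; a self-contained sketch under that assumption:

```java
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;

// Sketch of the close-on-all-paths guarantee the try/finally above provides,
// expressed with try-with-resources. Assumes a plain comma-separated input
// whose first column is the key field, loosely mirroring getIdsForTable.
class IdReader {
    static Set<String> readIds(String csv) throws IOException {
        Set<String> ids = new HashSet<>();
        try (BufferedReader reader = new BufferedReader(new StringReader(csv))) {
            String line;
            while ((line = reader.readLine()) != null) {
                ids.add(line.split(",")[0]); // first column holds the id
            }
        } // reader is closed here even if readLine throws
        return ids;
    }
}
```

The commit uses explicit try/finally rather than try-with-resources, presumably because the reader type in use does not implement `AutoCloseable`; both forms prevent the leaked-file-handle bug the original straight-line code had.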

@@ -117,10 +122,13 @@ public static Set<Field> getAllFields(List<FeedToMerge> feedsToMerge, Table tabl
         if (csvReader == null) {
             continue;
         }
-        // Get fields found from headers and add them to the shared fields set.
-        Field[] fieldsFoundInZip = table.getFieldsFromFieldHeaders(csvReader.getHeaders(), null);
-        sharedFields.addAll(Arrays.asList(fieldsFoundInZip));
-        csvReader.close();
+        try {
+            // Get fields found from headers and add them to the shared fields set.
+            Field[] fieldsFoundInZip = table.getFieldsFromFieldHeaders(csvReader.getHeaders(), null);
+            sharedFields.addAll(Arrays.asList(fieldsFoundInZip));
+        } finally {
+            csvReader.close();
+        }
     }
     return sharedFields;
 }
