[core] Fix memory usage regression in CPD (#5090)

adangel · adangel · commit 3222807dec42 · 2024-06-28T08:28:08.000+02:00
Merge pull request #5090 from Monits:issue-5066
diff --git a/docs/pages/release_notes.md b/docs/pages/release_notes.md
@@ -40,6 +40,7 @@ See also [Maven PMD Plugin]({{ baseurl }}pmd_userdocs_tools_maven.html).
   * [#2827](https://github.com/pmd/pmd/issues/2827): \[cli] Consider processing errors in exit status
 * core
   * [#4992](https://github.com/pmd/pmd/pull/4992): \[core] CPD: Include processing errors in XML report
+  * [#5066](https://github.com/pmd/pmd/issues/5066): \[core] CPD throws java.lang.OutOfMemoryError: Java heap space (since 7.1.0)
 * apex
   * [#4922](https://github.com/pmd/pmd/issues/4922): \[apex] SOQL syntax error with TYPEOF in sub-query
   * [#5053](https://github.com/pmd/pmd/issues/5053): \[apex] CPD fails to parse string literals with escaped characters
diff --git a/pmd-core/src/main/java/net/sourceforge/pmd/cpd/MatchCollector.java b/pmd-core/src/main/java/net/sourceforge/pmd/cpd/MatchCollector.java
@@ -68,15 +68,15 @@ private void reportMatch(TokenEntry mark1, TokenEntry mark2, int dupes) {
          *  - BC
          * It should be reduced to a single match with 3 marks
          */
-        if (tokenMatchSets.computeIfAbsent(mark1.getIndex(), HashSet::new).contains(mark2.getIndex())) {
+        if (tokenMatchSets.computeIfAbsent(mark1.getIndex(), (i) -> new HashSet<>()).contains(mark2.getIndex())) {
             return;
         }
 
         // This may not be a "new match", but actually a sub-match of a larger one.
         // always rely on the lowest mark index, as that's the order in which process them
         final int lowestKey = tokenMatchSets.get(mark1.getIndex()).stream().reduce(mark1.getIndex(), Math::min);
 
-        List<Match> matches = matchTree.computeIfAbsent(lowestKey, ArrayList::new);
+        List<Match> matches = matchTree.computeIfAbsent(lowestKey, (i) -> new ArrayList<>());
         Iterator<Match> matchIterator = matches.iterator();
         while (matchIterator.hasNext()) {
             Match m = matchIterator.next();
@@ -116,8 +116,8 @@ private void reportMatch(TokenEntry mark1, TokenEntry mark2, int dupes) {
     }
 
     private void registerTokenMatch(TokenEntry mark1, TokenEntry mark2) {
-        tokenMatchSets.computeIfAbsent(mark1.getIndex(), HashSet::new).add(mark2.getIndex());
-        tokenMatchSets.computeIfAbsent(mark2.getIndex(), HashSet::new).add(mark1.getIndex());
+        tokenMatchSets.computeIfAbsent(mark1.getIndex(), (i) -> new HashSet<>()).add(mark2.getIndex());
+        tokenMatchSets.computeIfAbsent(mark2.getIndex(), (i) -> new HashSet<>()).add(mark1.getIndex());
     }
 
     List<Match> getMatches() {