Fix potentially cubic-time regex in parsePatch (kpdecker#647)

ExplodingCabbage · web-flow · commit de95cca53297 · 2026-01-07T14:00:27.000Z
* Add a test for the existing handling of Mercurial patches (This is the feature involved in kpdecker#644, and I want to try to avoid regressions. It previously had no automated tests.) * Fix ReDOS * Add release notes
diff --git a/release-notes.md b/release-notes.md
@@ -5,6 +5,7 @@
 - [#631](https://github.com/kpdecker/jsdiff/pull/631) - **fix support for using an `Intl.Segmenter` with `diffWords`**. This has been almost completely broken since the feature was added in v6.0.0, since it would outright crash on any text that featured two consecutive newlines between a pair of words (a very common case).
 - [#635](https://github.com/kpdecker/jsdiff/pull/635) - **small tweaks to tokenization behaviour of `diffWords`** when used *without* an `Intl.Segmenter`. Specifically, the soft hyphen (U+00AD) is no longer considered to be a word break, and the multiplication and division signs (`×` and `÷`) are now treated as punctuation instead of as letters / word characters.
 - [#641](https://github.com/kpdecker/jsdiff/pull/641) - **the format of file headers in `createPatch` etc. patches can now be customised somewhat**. It now takes a `headerOptions` option that can be used to disable the file headers entirely, or omit the `Index:` line and/or the underline. In particular, this was motivated by a request to make jsdiff patches compatible with react-diff-view, which they now are if produced with `headerOptions: FILE_HEADERS_ONLY`.
+- [#647](https://github.com/kpdecker/jsdiff/pull/647) and [#TODO] - **fix ReDOS vulnerabilities in `parsePatch`**. Previously, adversarially-crafted patch headers could take cubic time to parse; now, `parsePatch` should reliably take linear time. (Handling of headers that include the line break characters `\r`, `\u2028`, or `\u2029` in non-trailing positions is also now more reasonable as a side effect of the fix.)
 
 ## 8.0.2
 
diff --git a/src/patch/parse.ts b/src/patch/parse.ts
@@ -23,10 +23,27 @@ export function parsePatch(uniDiff: string): StructuredPatch[] {
         break;
       }
 
-      // Diff index
-      const header = (/^(?:Index:|diff(?: -r \w+)+)\s+(.+?)\s*$/).exec(line);
-      if (header) {
-        index.index = header[1];
+      // Try to parse the line as a diff header, like
+      //     Index: README.md
+      // or
+      //     diff -r 9117c6561b0b -r 273ce12ad8f1 .hgignore
+      // or
+      //     Index: something with multiple words
+      // and extract the filename (or whatever else is used as an index name)
+      // from the end (i.e. 'README.md', '.hgignore', or
+      // 'something with multiple words' in the examples above).
+      //
+      // TODO: It seems awkward that we indiscriminately trim off trailing
+      //       whitespace here. Theoretically, couldn't that be meaningful -
+      //       e.g. if the patch represents a diff of a file whose name ends
+      //       with a space? Seems wrong to nuke it.
+      //       But this behaviour has been around since v2.2.1 in 2015, so if
+      //       it's going to change, it should be done cautiously and in a new
+      //       major release, for backwards-compat reasons.
+      //       -- ExplodingCabbage
+      const headerMatch = (/^(?:Index:|diff(?: -r \w+)+)\s+/).exec(line);
+      if (headerMatch) {
+        index.index = line.substring(headerMatch[0].length).trim();
       }
 
       i++;
diff --git a/test/patch/parse.js b/test/patch/parse.js
@@ -172,6 +172,133 @@ Index: test2
         }]);
     });
 
+    it('should parse the header format used by Mercurial', function() {
+      // (At least, Mercurial is the only tool I could find that uses this
+      // format. Claude was unable to suggest any other tool that would produce
+      // this format, and I don't know of any either. See
+      // https://claude.ai/share/51e202d0-9da0-4dfa-a4a4-d6c6b476b300.)
+      //
+      // Support for this got added by Kevin in commit:
+      // 0c9dd6d0e622d8a32b441b45baa797a7e86a4c55
+      //
+      // I find it a bit odd that (at the time of adding this test) our header
+      // parsing has special handling for Mercurial's diff format but does not
+      // support Git's format (given Git is much more popular). I also find it
+      // a bit odd that we discard the information in the header about what
+      // revisions are being diffed and preserve only the filename (which is
+      // available anyway via the lines below, and exposed by us in the
+      // oldFileName and newFileName fields). But for now I am just trying to
+      // document and test the current state of things.
+      //
+      // -- ExplodingCabbage
+
+      // (Patch below was produced by running `hg diff -r 0 -r 1` in the
+      // Mercurial repo for Mercurial itself.)
+      const patchStr = `diff -r 9117c6561b0b -r 273ce12ad8f1 .hgignore
+--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
++++ b/.hgignore	Tue May 03 13:27:13 2005 -0800
+@@ -0,0 +1,1 @@
++.*~
+diff -r 9117c6561b0b -r 273ce12ad8f1 README
+--- a/README	Tue May 03 13:16:10 2005 -0800
++++ b/README	Tue May 03 13:27:13 2005 -0800
+@@ -69,6 +69,10 @@
+
+ Network support (highly experimental):
+
++ # pull the self-hosting hg repo
++ foo$ hg init
++ foo$ hg merge http://selenic.com/hg/
++
+  # export your .hg directory as a directory on your webserver
+  foo$ ln -s .hg ~/public_html/hg-linux
+
+@@ -76,5 +80,10 @@
+  bar$ hg merge http://foo/~user/hg-linux
+
+  This is just a proof of concept of grabbing byte ranges, and is not
+- expected to perform well.
++ expected to perform well. Fixing this needs some pipelining to reduce
++ the number of round trips. See zsync for a similar approach.
+
++ Another approach which does perform well right now is to use rsync.
++ Simply rsync the remote repo to a read-only local copy and then do a
++ local pull.
++
+`;
+
+      const patchObj = parsePatch(patchStr);
+      expect(patchObj).to.deep.equals([
+        {
+          // Parsed from line `diff -r 9117c6561b0b -r 273ce12ad8f1 .hgignore`:
+          index: '.hgignore',
+          // Parsed from line `--- /dev/null	Thu Jan 01 00:00:00 1970 +0000`:
+          oldFileName: '/dev/null',
+          oldHeader: 'Thu Jan 01 00:00:00 1970 +0000',
+          // Parsed from line `+++ b/.hgignore	Tue May 03 13:27:13 2005 -0800`:
+          newFileName: 'b/.hgignore',
+          newHeader: 'Tue May 03 13:27:13 2005 -0800',
+          hunks: [
+            {
+              oldStart: 1,
+              oldLines: 0,
+              newStart: 1,
+              newLines: 1,
+              lines: [
+                '+.*~'
+              ]
+            }
+          ]
+        },
+        {
+          index: 'README',
+          oldFileName: 'a/README',
+          oldHeader: 'Tue May 03 13:16:10 2005 -0800',
+          newFileName: 'b/README',
+          newHeader: 'Tue May 03 13:27:13 2005 -0800',
+          hunks: [
+            {
+              oldStart: 69,
+              oldLines: 6,
+              newStart: 69,
+              newLines: 10,
+              lines: [
+                '',
+                ' Network support (highly experimental):',
+                '',
+                '+ # pull the self-hosting hg repo',
+                '+ foo$ hg init',
+                '+ foo$ hg merge http://selenic.com/hg/',
+                '+',
+                '  # export your .hg directory as a directory on your webserver',
+                '  foo$ ln -s .hg ~/public_html/hg-linux',
+                ''
+              ]
+            },
+            {
+              oldStart: 76,
+              oldLines: 5,
+              newStart: 80,
+              newLines: 10,
+              lines: [
+                '  bar$ hg merge http://foo/~user/hg-linux',
+                '',
+                '  This is just a proof of concept of grabbing byte ranges, and is not',
+                '- expected to perform well.',
+                '+ expected to perform well. Fixing this needs some pipelining to reduce',
+                '+ the number of round trips. See zsync for a similar approach.',
+                '',
+                '+ Another approach which does perform well right now is to use rsync.',
+                '+ Simply rsync the remote repo to a read-only local copy and then do a',
+                '+ local pull.',
+                '+'
+              ]
+            }
+          ]
+        }
+      ]);
+    });
+
     it('should parse multiple files without the Index line', function() {
       expect(parsePatch(
 `--- from\theader1