Fix another bug in diffWords's "intlSegmenter" mode (kpdecker#667)

ExplodingCabbage · web-flow · commit 4b5d1800370b · 2026-02-18T16:43:38.000Z
* Add test showing the bug

* Fix the bug

* Add one more test of another weird case

* Add release notes

* Fix test bug the AI found

* Fix another test bug
diff --git a/release-notes.md b/release-notes.md
@@ -1,5 +1,9 @@
 # Release Notes
 
+## 8.0.4 (prerelease)
+
+- [#667](https://github.com/kpdecker/jsdiff/pull/667) - **fix another bug in `diffWords` when used with an `Intl.Segmenter`**. If the text to be diffed included a combining mark after a whitespace character (i.e. roughly speaking, an accented space), `diffWords` would previously crash. Now this case is handled correctly.
+
 ## 8.0.3
 
 - [#631](https://github.com/kpdecker/jsdiff/pull/631) - **fix support for using an `Intl.Segmenter` with `diffWords`**. This has been almost completely broken since the feature was added in v6.0.0, since it would outright crash on any text that featured two consecutive newlines between a pair of words (a very common case).
diff --git a/src/diff/word.ts b/src/diff/word.ts
@@ -1,6 +1,6 @@
 import Diff from './base.js';
 import type { ChangeObject, CallbackOptionAbortable, CallbackOptionNonabortable, DiffCallbackNonabortable, DiffWordsOptionsAbortable, DiffWordsOptionsNonabortable} from '../types.js';
-import { longestCommonPrefix, longestCommonSuffix, replacePrefix, replaceSuffix, removePrefix, removeSuffix, maximumOverlap, leadingWs, trailingWs } from '../util/string.js';
+import { longestCommonPrefix, longestCommonSuffix, replacePrefix, replaceSuffix, removePrefix, removeSuffix, maximumOverlap, leadingWs, trailingWs, leadingAndTrailingWs, segment } from '../util/string.js';
 
 // Based on https://en.wikipedia.org/wiki/Latin_script_in_Unicode
 //
@@ -72,21 +72,9 @@ class WordDiff extends Diff<string, string> {
       // We want `parts` to be an array whose elements alternate between being
       // pure whitespace and being pure non-whitespace. This is ALMOST what the
       // segments returned by a word-based Intl.Segmenter already look like,
-      // and therefore we can ALMOST get what we want by simply doing...
-      //     parts = Array.from(segmenter.segment(value), segment => segment.segment);
-      // ... but not QUITE, because there's of one annoying special case: every
-      // newline character gets its own segment, instead of sharing a segment
-      // with other surrounding whitespace. We therefore need to manually merge
-      // consecutive segments of whitespace into a single part:
-      parts = [];
-      for (const segmentObj of Array.from(segmenter.segment(value))) {
-        const segment = segmentObj.segment;
-        if (parts.length && (/\s/).test(parts[parts.length - 1]) && (/\s/).test(segment)) {
-          parts[parts.length - 1] += segment;
-        } else {
-          parts.push(segment);
-        }
-      }
+      // but not quite - see explanation in the docs of our custom segment()
+      // function.
+      parts = segment(value, segmenter);
     } else {
       parts = value.match(tokenizeIncludingWhitespace) || [];
     }
@@ -146,15 +134,15 @@ class WordDiff extends Diff<string, string> {
         deletion = change;
       } else {
         if (insertion || deletion) { // May be false at start of text
-          dedupeWhitespaceInChangeObjects(lastKeep, deletion, insertion, change);
+          dedupeWhitespaceInChangeObjects(lastKeep, deletion, insertion, change, options.intlSegmenter);
         }
         lastKeep = change;
         insertion = null;
         deletion = null;
       }
     });
     if (insertion || deletion) {
-      dedupeWhitespaceInChangeObjects(lastKeep, deletion, insertion, null);
+      dedupeWhitespaceInChangeObjects(lastKeep, deletion, insertion, null, options.intlSegmenter);
     }
     return changes;
   }
@@ -209,7 +197,8 @@ function dedupeWhitespaceInChangeObjects(
   startKeep: ChangeObject<string> | null,
   deletion: ChangeObject<string> | null,
   insertion: ChangeObject<string> | null,
-  endKeep: ChangeObject<string> | null
+  endKeep: ChangeObject<string> | null,
+  segmenter?: Intl.Segmenter
 ) {
   // Before returning, we tidy up the leading and trailing whitespace of the
   // change objects to eliminate cases where trailing whitespace in one object
@@ -254,10 +243,8 @@ function dedupeWhitespaceInChangeObjects(
   // * Just a "delete"
   // We handle the three cases separately.
   if (deletion && insertion) {
-    const oldWsPrefix = leadingWs(deletion.value);
-    const oldWsSuffix = trailingWs(deletion.value);
-    const newWsPrefix = leadingWs(insertion.value);
-    const newWsSuffix = trailingWs(insertion.value);
+    const [oldWsPrefix, oldWsSuffix] = leadingAndTrailingWs(deletion.value, segmenter);
+    const [newWsPrefix, newWsSuffix] = leadingAndTrailingWs(insertion.value, segmenter);
 
     if (startKeep) {
       const commonWsPrefix = longestCommonPrefix(oldWsPrefix, newWsPrefix);
@@ -279,18 +266,17 @@ function dedupeWhitespaceInChangeObjects(
     // whitespace and deleting duplicate leading whitespace where
     // present.
     if (startKeep) {
-      const ws = leadingWs(insertion.value);
+      const ws = leadingWs(insertion.value, segmenter);
       insertion.value = insertion.value.substring(ws.length);
     }
     if (endKeep) {
-      const ws = leadingWs(endKeep.value);
+      const ws = leadingWs(endKeep.value, segmenter);
       endKeep.value = endKeep.value.substring(ws.length);
     }
   // otherwise we've got a deletion and no insertion
   } else if (startKeep && endKeep) {
-    const newWsFull = leadingWs(endKeep.value),
-        delWsStart = leadingWs(deletion!.value),
-        delWsEnd = trailingWs(deletion!.value);
+    const newWsFull = leadingWs(endKeep.value, segmenter),
+        [delWsStart, delWsEnd] = leadingAndTrailingWs(deletion!.value, segmenter);
 
     // Any whitespace that comes straight after startKeep in both the old and
     // new texts, assign to startKeep and remove from the deletion.
@@ -318,16 +304,16 @@ function dedupeWhitespaceInChangeObjects(
     // We are at the start of the text. Preserve all the whitespace on
     // endKeep, and just remove whitespace from the end of deletion to the
     // extent that it overlaps with the start of endKeep.
-    const endKeepWsPrefix = leadingWs(endKeep.value);
-    const deletionWsSuffix = trailingWs(deletion!.value);
+    const endKeepWsPrefix = leadingWs(endKeep.value, segmenter);
+    const deletionWsSuffix = trailingWs(deletion!.value, segmenter);
     const overlap = maximumOverlap(deletionWsSuffix, endKeepWsPrefix);
     deletion!.value = removeSuffix(deletion!.value, overlap);
   } else if (startKeep) {
     // We are at the END of the text. Preserve all the whitespace on
     // startKeep, and just remove whitespace from the start of deletion to
     // the extent that it overlaps with the end of startKeep.
-    const startKeepWsSuffix = trailingWs(startKeep.value);
-    const deletionWsPrefix = leadingWs(deletion!.value);
+    const startKeepWsSuffix = trailingWs(startKeep.value, segmenter);
+    const deletionWsPrefix = leadingWs(deletion!.value, segmenter);
     const overlap = maximumOverlap(startKeepWsSuffix, deletionWsPrefix);
     deletion!.value = removePrefix(deletion!.value, overlap);
   }
diff --git a/src/util/string.ts b/src/util/string.ts
@@ -102,7 +102,48 @@ export function hasOnlyUnixLineEndings(string: string): boolean {
   return !string.includes('\r\n') && string.includes('\n');
 }
 
-export function trailingWs(string: string): string {
+/**
+ * Tokenize `string` with a word-granularity Intl.Segmenter, then glue together
+ * any run of neighbouring segments that each contain whitespace, so that the
+ * result never has two whitespace parts side by side. Such neighbours arise
+ * in two situations:
+ * - each newline character is emitted as a segment of its own
+ * - a diacritic attached to a whitespace character closes the segment right
+ *   after the diacritic, so e.g. " \u0300 " comes back as two segments.
+ * Returns the merged parts in their original order.
+ */
+export function segment(string: string, segmenter: Intl.Segmenter): string[] {
+  const merged: string[] = [];
+  for (const { segment: piece } of Array.from(segmenter.segment(string))) {
+    const last = merged.length - 1;
+    if (last < 0 || !(/\s/).test(merged[last]) || !(/\s/).test(piece)) {
+      merged.push(piece);
+    } else {
+      merged[last] += piece;
+    }
+  }
+  return merged;
+}
+
+// The functions below take a `segmenter` argument so that, when called from
+// diffWords when it is using a segmenter, they can use a notion of what
+// constitutes "whitespace" that is consistent with the segmenter.
+//
+// USUALLY this will be identical to the result of the non-segmenter-based
+// logic, but it differs in at least one case: when whitespace characters are
+// modified by diacritics. A word segmenter considers these diacritics to be
+// part of the whitespace, whereas our non-segmenter-based logic does not.
+//
+// Because the segmenter-based approach necessarily requires segmenting the
+// entire string, we offer a leadingAndTrailingWs function to allow getting the
+// whitespace prefix AND whitespace suffix with a single call to the segmenter,
+// for efficiency's sake.
+
+export function trailingWs(string: string, segmenter?: Intl.Segmenter): string {
+  if (segmenter) {
+    return leadingAndTrailingWs(string, segmenter)[1];
+  }
+
   // Yes, this looks overcomplicated and dumb - why not replace the whole function with
   //     return string.match(/\s*$/)[0]
   // you ask? Because:
@@ -123,8 +164,32 @@ export function trailingWs(string: string): string {
   return string.substring(i + 1);
 }
 
-export function leadingWs(string: string): string {
+export function leadingWs(string: string, segmenter?: Intl.Segmenter): string {
+  if (segmenter) {
+    return leadingAndTrailingWs(string, segmenter)[0];
+  }
+
   // Thankfully the annoying considerations described in trailingWs don't apply here:
   const match = string.match(/^\s*/);
   return match ? match[0] : '';
 }
+
+/**
+ * Returns [leadingWs, trailingWs] of `string`. Given a `segmenter` (which must
+ * have "word" granularity, else this throws), whitespace means a whole merged
+ * segment containing whitespace, so diacritics attached to spaces are included.
+ */
+export function leadingAndTrailingWs(string: string, segmenter?: Intl.Segmenter): [string, string] {
+  if (!segmenter) {
+    return [leadingWs(string), trailingWs(string)];
+  }
+  if (segmenter.resolvedOptions().granularity !== 'word') {
+    throw new Error('The segmenter passed must have a granularity of "word"');
+  }
+  const segments = segment(string, segmenter);
+  if (segments.length === 0) { // empty input: don't regex-test `undefined`
+    return ['', ''];
+  }
+  const [firstSeg, lastSeg] = [segments[0], segments[segments.length - 1]];
+  return [(/\s/).test(firstSeg) ? firstSeg : '', (/\s/).test(lastSeg) ? lastSeg : ''];
+}
diff --git a/test/diff/word.js b/test/diff/word.js
@@ -6,11 +6,8 @@ import {expect} from 'chai';
 describe('WordDiff', function() {
   describe('#tokenize', function() {
     it('should give each word & punctuation mark its own token, including leading and trailing whitespace', function() {
-      expect(
-        wordDiff.tokenize(
-          'foo bar baz jurídica wir üben    bla\t\t \txyzáxyz  \n\n\n  animá-los\r\n\r\n(wibbly wobbly)().'
-        )
-      ).to.deep.equal([
+      const string = 'foo bar baz jurídica wir üben    bla\t\t \txyzáxyz  \n\n\n  animá-los\r\n\r\n(wibbly wobbly)().';
+      const expectedResult = [
         'foo ',
         ' bar ',
         ' baz ',
@@ -29,7 +26,12 @@ describe('WordDiff', function() {
         '(',
         ')',
         '.'
-      ]);
+      ];
+      expect(wordDiff.tokenize(string)).to.deep.equal(expectedResult);
+      expect(wordDiff.tokenize(
+        string,
+        { intlSegmenter: new Intl.Segmenter('en', { granularity: 'word' }) }
+      )).to.deep.equal(expectedResult);
     });
 
     // Test for bug reported at https://github.com/kpdecker/jsdiff/issues/553
@@ -379,6 +381,78 @@ describe('WordDiff', function() {
         '<del>A</del><ins>B</ins>\n\nX'
       );
     });
+
+    it('handles diacritics on whitespace differently in Segmenter mode vs normal mode', () => {
+      // Regression test for https://github.com/kpdecker/jsdiff/issues/664 (a combining mark, U+0300, right after a space)
+      const oldString = 'abc \u0300X def';
+      const newString = 'abc \u0300Y ghi';
+
+      expect(diffWords(oldString, newString)).to.deep.equal([
+        {
+          count: 2, // normal mode: the combining mark is counted as its own token
+          added: false,
+          removed: false,
+          value: 'abc \u0300'
+        },
+        {
+          count: 2,
+          added: false,
+          removed: true,
+          value: 'X def'
+        },
+        {
+          count: 2,
+          added: true,
+          removed: false,
+          value: 'Y ghi'
+        }
+      ]);
+
+      expect(diffWords(oldString, newString, { intlSegmenter: new Intl.Segmenter('en', { granularity: 'word' }) })).to.deep.equal([
+        {
+          // count is 1, not 2: in segmenter mode ' \u0300' is considered pure
+          // whitespace, so the combining mark is not a token of its own
+          count: 1,
+          added: false,
+          removed: false,
+          value: 'abc \u0300'
+        },
+        {
+          count: 2,
+          added: false,
+          removed: true,
+          value: 'X def'
+        },
+        {
+          count: 2,
+          added: true,
+          removed: false,
+          value: 'Y ghi'
+        }
+      ]);
+    });
+
+    it('handles orphaned diacritics after newlines acceptably', () => {
+      // An Intl.Segmenter in word mode seems to think that diacritics can
+      // modify spaces, but not newlines, so a diacritic modifier character
+      // after a newline is always a standalone segment. This sanity-checks
+      // that we handle such segments reasonably. (Expected values spell the
+      // mark as \u0300; a raw combining character is too easy to mangle.)
+      expect(
+        diffWords(
+          'abc \n\u0300 X \n\u0300def',
+          'abc \n\u0300 Y def',
+          { intlSegmenter: new Intl.Segmenter('en', { granularity: 'word' }) }
+        )
+      ).to.deep.equal(
+        [
+          { count: 2, added: false, removed: false, value: 'abc \n\u0300 ' },
+          { count: 2, added: false, removed: true, value: 'X \n\u0300' },
+          { count: 1, added: true, removed: false, value: 'Y ' },
+          { count: 1, added: false, removed: false, value: 'def' }
+        ]
+      );
+    });
   });
 
   describe('#diffWordsWithSpace', function() {
diff --git a/test/util/string.js b/test/util/string.js
@@ -1,4 +1,4 @@
-import {longestCommonPrefix, longestCommonSuffix, replacePrefix, replaceSuffix, removePrefix, removeSuffix, maximumOverlap} from '../../libesm/util/string.js';
+import {longestCommonPrefix, longestCommonSuffix, replacePrefix, replaceSuffix, removePrefix, removeSuffix, maximumOverlap, leadingWs, trailingWs} from '../../libesm/util/string.js';
 import {expect} from 'chai';
 
 describe('#longestCommonPrefix', function() {
@@ -88,3 +88,14 @@ describe('#maximumOverlap', function() {
     expect(maximumOverlap('', '')).to.equal('');
   });
 });
+
+describe('leadingWs & trailingWs', function() {
+  it('returns leading/trailing whitespace (with diacritics on whitespace considered part of the whitespace iff we are in segmenter mode)', () => {
+    const sample = '\t\u0300  foo bar\n baz qux\t \u0300 ';
+    const wordSegmenter = new Intl.Segmenter('en', { granularity: 'word' });
+    const plain = [leadingWs(sample), trailingWs(sample)];
+    const segmented = [leadingWs(sample, wordSegmenter), trailingWs(sample, wordSegmenter)];
+    expect(plain).to.deep.equal(['\t', ' ']);
+    expect(segmented).to.deep.equal(['\t\u0300  ', '\t \u0300 ']);
+  });
+});