Skip to content

Commit 4b5d180

Browse files
Fix another bug in diffWords's "intlSegmenter" mode (kpdecker#667)
* Add test showing the bug * Fix the bug * Add one more test of another weird case * Add release notes * Fix test bug the AI found * Fix another test bug
1 parent 10da50c commit 4b5d180

5 files changed

Lines changed: 181 additions & 41 deletions

File tree

release-notes.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
# Release Notes
22

3+
## 8.0.4 (prerelease)
4+
5+
- [#667](https://github.com/kpdecker/jsdiff/pull/667) - **fix another bug in `diffWords` when used with an `Intl.Segmenter`**. If the text to be diffed included a combining mark after a whitespace character (i.e. roughly speaking, an accented space), `diffWords` would previously crash. Now this case is handled correctly.
6+
37
## 8.0.3
48

59
- [#631](https://github.com/kpdecker/jsdiff/pull/631) - **fix support for using an `Intl.Segmenter` with `diffWords`**. This has been almost completely broken since the feature was added in v6.0.0, since it would outright crash on any text that featured two consecutive newlines between a pair of words (a very common case).

src/diff/word.ts

Lines changed: 18 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import Diff from './base.js';
22
import type { ChangeObject, CallbackOptionAbortable, CallbackOptionNonabortable, DiffCallbackNonabortable, DiffWordsOptionsAbortable, DiffWordsOptionsNonabortable} from '../types.js';
3-
import { longestCommonPrefix, longestCommonSuffix, replacePrefix, replaceSuffix, removePrefix, removeSuffix, maximumOverlap, leadingWs, trailingWs } from '../util/string.js';
3+
import { longestCommonPrefix, longestCommonSuffix, replacePrefix, replaceSuffix, removePrefix, removeSuffix, maximumOverlap, leadingWs, trailingWs, leadingAndTrailingWs, segment } from '../util/string.js';
44

55
// Based on https://en.wikipedia.org/wiki/Latin_script_in_Unicode
66
//
@@ -72,21 +72,9 @@ class WordDiff extends Diff<string, string> {
7272
// We want `parts` to be an array whose elements alternate between being
7373
// pure whitespace and being pure non-whitespace. This is ALMOST what the
7474
// segments returned by a word-based Intl.Segmenter already look like,
75-
// and therefore we can ALMOST get what we want by simply doing...
76-
// parts = Array.from(segmenter.segment(value), segment => segment.segment);
77-
// ... but not QUITE, because there's of one annoying special case: every
78-
// newline character gets its own segment, instead of sharing a segment
79-
// with other surrounding whitespace. We therefore need to manually merge
80-
// consecutive segments of whitespace into a single part:
81-
parts = [];
82-
for (const segmentObj of Array.from(segmenter.segment(value))) {
83-
const segment = segmentObj.segment;
84-
if (parts.length && (/\s/).test(parts[parts.length - 1]) && (/\s/).test(segment)) {
85-
parts[parts.length - 1] += segment;
86-
} else {
87-
parts.push(segment);
88-
}
89-
}
75+
// but not quite - see explanation in the docs of our custom segment()
76+
// function.
77+
parts = segment(value, segmenter);
9078
} else {
9179
parts = value.match(tokenizeIncludingWhitespace) || [];
9280
}
@@ -146,15 +134,15 @@ class WordDiff extends Diff<string, string> {
146134
deletion = change;
147135
} else {
148136
if (insertion || deletion) { // May be false at start of text
149-
dedupeWhitespaceInChangeObjects(lastKeep, deletion, insertion, change);
137+
dedupeWhitespaceInChangeObjects(lastKeep, deletion, insertion, change, options.intlSegmenter);
150138
}
151139
lastKeep = change;
152140
insertion = null;
153141
deletion = null;
154142
}
155143
});
156144
if (insertion || deletion) {
157-
dedupeWhitespaceInChangeObjects(lastKeep, deletion, insertion, null);
145+
dedupeWhitespaceInChangeObjects(lastKeep, deletion, insertion, null, options.intlSegmenter);
158146
}
159147
return changes;
160148
}
@@ -209,7 +197,8 @@ function dedupeWhitespaceInChangeObjects(
209197
startKeep: ChangeObject<string> | null,
210198
deletion: ChangeObject<string> | null,
211199
insertion: ChangeObject<string> | null,
212-
endKeep: ChangeObject<string> | null
200+
endKeep: ChangeObject<string> | null,
201+
segmenter?: Intl.Segmenter
213202
) {
214203
// Before returning, we tidy up the leading and trailing whitespace of the
215204
// change objects to eliminate cases where trailing whitespace in one object
@@ -254,10 +243,8 @@ function dedupeWhitespaceInChangeObjects(
254243
// * Just a "delete"
255244
// We handle the three cases separately.
256245
if (deletion && insertion) {
257-
const oldWsPrefix = leadingWs(deletion.value);
258-
const oldWsSuffix = trailingWs(deletion.value);
259-
const newWsPrefix = leadingWs(insertion.value);
260-
const newWsSuffix = trailingWs(insertion.value);
246+
const [oldWsPrefix, oldWsSuffix] = leadingAndTrailingWs(deletion.value, segmenter);
247+
const [newWsPrefix, newWsSuffix] = leadingAndTrailingWs(insertion.value, segmenter);
261248

262249
if (startKeep) {
263250
const commonWsPrefix = longestCommonPrefix(oldWsPrefix, newWsPrefix);
@@ -279,18 +266,17 @@ function dedupeWhitespaceInChangeObjects(
279266
// whitespace and deleting duplicate leading whitespace where
280267
// present.
281268
if (startKeep) {
282-
const ws = leadingWs(insertion.value);
269+
const ws = leadingWs(insertion.value, segmenter);
283270
insertion.value = insertion.value.substring(ws.length);
284271
}
285272
if (endKeep) {
286-
const ws = leadingWs(endKeep.value);
273+
const ws = leadingWs(endKeep.value, segmenter);
287274
endKeep.value = endKeep.value.substring(ws.length);
288275
}
289276
// otherwise we've got a deletion and no insertion
290277
} else if (startKeep && endKeep) {
291-
const newWsFull = leadingWs(endKeep.value),
292-
delWsStart = leadingWs(deletion!.value),
293-
delWsEnd = trailingWs(deletion!.value);
278+
const newWsFull = leadingWs(endKeep.value, segmenter),
279+
[delWsStart, delWsEnd] = leadingAndTrailingWs(deletion!.value, segmenter);
294280

295281
// Any whitespace that comes straight after startKeep in both the old and
296282
// new texts, assign to startKeep and remove from the deletion.
@@ -318,16 +304,16 @@ function dedupeWhitespaceInChangeObjects(
318304
// We are at the start of the text. Preserve all the whitespace on
319305
// endKeep, and just remove whitespace from the end of deletion to the
320306
// extent that it overlaps with the start of endKeep.
321-
const endKeepWsPrefix = leadingWs(endKeep.value);
322-
const deletionWsSuffix = trailingWs(deletion!.value);
307+
const endKeepWsPrefix = leadingWs(endKeep.value, segmenter);
308+
const deletionWsSuffix = trailingWs(deletion!.value, segmenter);
323309
const overlap = maximumOverlap(deletionWsSuffix, endKeepWsPrefix);
324310
deletion!.value = removeSuffix(deletion!.value, overlap);
325311
} else if (startKeep) {
326312
// We are at the END of the text. Preserve all the whitespace on
327313
// startKeep, and just remove whitespace from the start of deletion to
328314
// the extent that it overlaps with the end of startKeep.
329-
const startKeepWsSuffix = trailingWs(startKeep.value);
330-
const deletionWsPrefix = leadingWs(deletion!.value);
315+
const startKeepWsSuffix = trailingWs(startKeep.value, segmenter);
316+
const deletionWsPrefix = leadingWs(deletion!.value, segmenter);
331317
const overlap = maximumOverlap(startKeepWsSuffix, deletionWsPrefix);
332318
deletion!.value = removePrefix(deletion!.value, overlap);
333319
}

src/util/string.ts

Lines changed: 67 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,48 @@ export function hasOnlyUnixLineEndings(string: string): boolean {
102102
return !string.includes('\r\n') && string.includes('\n');
103103
}
104104

105-
export function trailingWs(string: string): string {
105+
/**
106+
* Split a string into segments using a word segmenter, merging consecutive
107+
* segments if they are both whitespace segments. Whitespace segments can
108+
* appear adjacent to one another for two reasons:
109+
* - newlines always get their own segment
110+
* - where a diacritic is attached to a whitespace character in the text, the
111+
* segment ends after the diacritic, so e.g. " \u0300 " becomes two segments.
112+
* This function therefore runs the segmenter's .segment() method and then
113+
* merges consecutive segments of whitespace into a single part.
114+
*/
115+
export function segment(string: string, segmenter: Intl.Segmenter): string[] {
116+
const parts = [];
117+
for (const segmentObj of Array.from(segmenter.segment(string))) {
118+
const segment = segmentObj.segment;
119+
if (parts.length && (/\s/).test(parts[parts.length - 1]) && (/\s/).test(segment)) {
120+
parts[parts.length - 1] += segment;
121+
} else {
122+
parts.push(segment);
123+
}
124+
}
125+
return parts;
126+
}
127+
128+
// The functions below take a `segmenter` argument so that, when called from
129+
// diffWords when it is using a segmenter, they can use a notion of what
130+
// constitutes "whitespace" that is consistent with the segmenter.
131+
//
132+
// USUALLY this will be identical to the result of the non-segmenter-based
133+
// logic, but it differs in at least one case: when whitespace characters are
134+
// modified by diacritics. A word segmenter considers these diacritics to be
135+
// part of the whitespace, whereas our non-segmenter-based logic does not.
136+
//
137+
// Because the segmenter-based approach necessarily requires segmenting the
138+
// entire string, we offer a leadingAndTrailingWs function to allow getting the
139+
// whitespace prefix AND whitespace suffix with a single call to the segmenter,
140+
// for efficiency's sake.
141+
142+
export function trailingWs(string: string, segmenter?: Intl.Segmenter): string {
143+
if (segmenter) {
144+
return leadingAndTrailingWs(string, segmenter)[1];
145+
}
146+
106147
// Yes, this looks overcomplicated and dumb - why not replace the whole function with
107148
// return string.match(/\s*$/)[0]
108149
// you ask? Because:
@@ -123,8 +164,32 @@ export function trailingWs(string: string): string {
123164
return string.substring(i + 1);
124165
}
125166

126-
export function leadingWs(string: string): string {
167+
export function leadingWs(string: string, segmenter?: Intl.Segmenter): string {
168+
if (segmenter) {
169+
return leadingAndTrailingWs(string, segmenter)[0];
170+
}
171+
127172
// Thankfully the annoying considerations described in trailingWs don't apply here:
128173
const match = string.match(/^\s*/);
129174
return match ? match[0] : '';
130175
}
176+
177+
export function leadingAndTrailingWs(
178+
string: string,
179+
segmenter?: Intl.Segmenter
180+
): [string, string] {
181+
if (!segmenter) {
182+
return [leadingWs(string), trailingWs(string)];
183+
}
184+
185+
if (segmenter.resolvedOptions().granularity != 'word') {
186+
throw new Error('The segmenter passed must have a granularity of "word"');
187+
}
188+
189+
const segments = segment(string, segmenter);
190+
const firstSeg = segments[0];
191+
const lastSeg = segments[segments.length - 1];
192+
const head = (/\s/).test(firstSeg) ? firstSeg : '';
193+
const tail = (/\s/).test(lastSeg) ? lastSeg : '';
194+
return [head, tail];
195+
}

test/diff/word.js

Lines changed: 80 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,8 @@ import {expect} from 'chai';
66
describe('WordDiff', function() {
77
describe('#tokenize', function() {
88
it('should give each word & punctuation mark its own token, including leading and trailing whitespace', function() {
9-
expect(
10-
wordDiff.tokenize(
11-
'foo bar baz jurídica wir üben bla\t\t \txyzáxyz \n\n\n animá-los\r\n\r\n(wibbly wobbly)().'
12-
)
13-
).to.deep.equal([
9+
const string = 'foo bar baz jurídica wir üben bla\t\t \txyzáxyz \n\n\n animá-los\r\n\r\n(wibbly wobbly)().';
10+
const expectedResult = [
1411
'foo ',
1512
' bar ',
1613
' baz ',
@@ -29,7 +26,12 @@ describe('WordDiff', function() {
2926
'(',
3027
')',
3128
'.'
32-
]);
29+
];
30+
expect(wordDiff.tokenize(string)).to.deep.equal(expectedResult);
31+
expect(wordDiff.tokenize(
32+
string,
33+
{ intlSegmenter: new Intl.Segmenter('en', { granularity: 'word' }) }
34+
)).to.deep.equal(expectedResult);
3335
});
3436

3537
// Test for bug reported at https://github.com/kpdecker/jsdiff/issues/553
@@ -379,6 +381,78 @@ describe('WordDiff', function() {
379381
'<del>A</del><ins>B</ins>\n\nX'
380382
);
381383
});
384+
385+
it('handles diacritics on whitespace differently in Segmenter mode vs normal mode', () => {
386+
// Regression test for https://github.com/kpdecker/jsdiff/issues/664
387+
const oldString = 'abc \u0300X def';
388+
const newString = 'abc \u0300Y ghi';
389+
390+
expect(diffWords(oldString, newString)).to.deep.equal([
391+
{
392+
count: 2,
393+
added: false,
394+
removed: false,
395+
value: 'abc \u0300'
396+
},
397+
{
398+
count: 2,
399+
added: false,
400+
removed: true,
401+
value: 'X def'
402+
},
403+
{
404+
count: 2,
405+
added: true,
406+
removed: false,
407+
value: 'Y ghi'
408+
}
409+
]);
410+
411+
expect(diffWords(oldString, newString, { intlSegmenter: new Intl.Segmenter('en', { granularity: 'word' }) })).to.deep.equal([
412+
{
413+
// Note this is ONE token in segmenter mode, because ' \u0300' is
414+
// considered pure whitespace
415+
count: 1,
416+
added: false,
417+
removed: false,
418+
value: 'abc \u0300'
419+
},
420+
{
421+
count: 2,
422+
added: false,
423+
removed: true,
424+
value: 'X def'
425+
},
426+
{
427+
count: 2,
428+
added: true,
429+
removed: false,
430+
value: 'Y ghi'
431+
}
432+
]);
433+
});
434+
435+
it('handles orphaned diacritics after newlines acceptably', () => {
436+
// Oddly enough, an Intl.Segmenter in word mode seems to think that
437+
// diacritics can modify spaces, but not newlines. So a diacritic
438+
// modifier character after a newline is always a standalone segment.
439+
// This test sanity-checks that we behave reasonably when encountering
440+
// such segments.
441+
expect(
442+
diffWords(
443+
'abc \n\u0300 X \n\u0300def',
444+
'abc \n\u0300 Y def',
445+
{ intlSegmenter: new Intl.Segmenter('en', { granularity: 'word' }) }
446+
)
447+
).to.deep.equal(
448+
[
449+
{ count: 2, added: false, removed: false, value: 'abc \ǹ ' },
450+
{ count: 2, added: false, removed: true, value: 'X \ǹ' },
451+
{ count: 1, added: true, removed: false, value: 'Y ' },
452+
{ count: 1, added: false, removed: false, value: 'def' }
453+
]
454+
);
455+
});
382456
});
383457

384458
describe('#diffWordsWithSpace', function() {

test/util/string.js

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import {longestCommonPrefix, longestCommonSuffix, replacePrefix, replaceSuffix, removePrefix, removeSuffix, maximumOverlap} from '../../libesm/util/string.js';
1+
import {longestCommonPrefix, longestCommonSuffix, replacePrefix, replaceSuffix, removePrefix, removeSuffix, maximumOverlap, leadingWs, trailingWs} from '../../libesm/util/string.js';
22
import {expect} from 'chai';
33

44
describe('#longestCommonPrefix', function() {
@@ -88,3 +88,14 @@ describe('#maximumOverlap', function() {
8888
expect(maximumOverlap('', '')).to.equal('');
8989
});
9090
});
91+
92+
describe('leadingWs & trailingWs', function() {
93+
it('returns leading/trailing whitespace (with diacritics on whitespace considered part of the whitespace iff we are in segmenter mode)', () => {
94+
const segmenter = new Intl.Segmenter('en', { granularity: 'word' });
95+
const text = '\t\u0300 foo bar\n baz qux\t \u0300 ';
96+
expect(leadingWs(text)).to.equal('\t');
97+
expect(leadingWs(text, segmenter)).to.equal('\t\u0300 ');
98+
expect(trailingWs(text)).to.equal(' ');
99+
expect(trailingWs(text, segmenter)).to.equal('\t \u0300 ');
100+
});
101+
});

0 commit comments

Comments
 (0)