11import Diff from './base.js' ;
22import type { ChangeObject , CallbackOptionAbortable , CallbackOptionNonabortable , DiffCallbackNonabortable , DiffWordsOptionsAbortable , DiffWordsOptionsNonabortable } from '../types.js' ;
3- import { longestCommonPrefix , longestCommonSuffix , replacePrefix , replaceSuffix , removePrefix , removeSuffix , maximumOverlap , leadingWs , trailingWs } from '../util/string.js' ;
3+ import { longestCommonPrefix , longestCommonSuffix , replacePrefix , replaceSuffix , removePrefix , removeSuffix , maximumOverlap , leadingWs , trailingWs , leadingAndTrailingWs , segment } from '../util/string.js' ;
44
55// Based on https://en.wikipedia.org/wiki/Latin_script_in_Unicode
66//
@@ -72,21 +72,9 @@ class WordDiff extends Diff<string, string> {
7272 // We want `parts` to be an array whose elements alternate between being
7373 // pure whitespace and being pure non-whitespace. This is ALMOST what the
7474 // segments returned by a word-based Intl.Segmenter already look like,
75- // and therefore we can ALMOST get what we want by simply doing...
76- // parts = Array.from(segmenter.segment(value), segment => segment.segment);
77- // ... but not QUITE, because there's of one annoying special case: every
78- // newline character gets its own segment, instead of sharing a segment
79- // with other surrounding whitespace. We therefore need to manually merge
80- // consecutive segments of whitespace into a single part:
81- parts = [ ] ;
82- for ( const segmentObj of Array . from ( segmenter . segment ( value ) ) ) {
83- const segment = segmentObj . segment ;
84- if ( parts . length && ( / \s / ) . test ( parts [ parts . length - 1 ] ) && ( / \s / ) . test ( segment ) ) {
85- parts [ parts . length - 1 ] += segment ;
86- } else {
87- parts . push ( segment ) ;
88- }
89- }
75+ // but not quite - see explanation in the docs of our custom segment()
76+ // function.
77+ parts = segment ( value , segmenter ) ;
9078 } else {
9179 parts = value . match ( tokenizeIncludingWhitespace ) || [ ] ;
9280 }
@@ -146,15 +134,15 @@ class WordDiff extends Diff<string, string> {
146134 deletion = change ;
147135 } else {
148136 if ( insertion || deletion ) { // May be false at start of text
149- dedupeWhitespaceInChangeObjects ( lastKeep , deletion , insertion , change ) ;
137+ dedupeWhitespaceInChangeObjects ( lastKeep , deletion , insertion , change , options . intlSegmenter ) ;
150138 }
151139 lastKeep = change ;
152140 insertion = null ;
153141 deletion = null ;
154142 }
155143 } ) ;
156144 if ( insertion || deletion ) {
157- dedupeWhitespaceInChangeObjects ( lastKeep , deletion , insertion , null ) ;
145+ dedupeWhitespaceInChangeObjects ( lastKeep , deletion , insertion , null , options . intlSegmenter ) ;
158146 }
159147 return changes ;
160148 }
@@ -209,7 +197,8 @@ function dedupeWhitespaceInChangeObjects(
209197 startKeep : ChangeObject < string > | null ,
210198 deletion : ChangeObject < string > | null ,
211199 insertion : ChangeObject < string > | null ,
212- endKeep : ChangeObject < string > | null
200+ endKeep : ChangeObject < string > | null ,
201+ segmenter ?: Intl . Segmenter
213202) {
214203 // Before returning, we tidy up the leading and trailing whitespace of the
215204 // change objects to eliminate cases where trailing whitespace in one object
@@ -254,10 +243,8 @@ function dedupeWhitespaceInChangeObjects(
254243 // * Just a "delete"
255244 // We handle the three cases separately.
256245 if ( deletion && insertion ) {
257- const oldWsPrefix = leadingWs ( deletion . value ) ;
258- const oldWsSuffix = trailingWs ( deletion . value ) ;
259- const newWsPrefix = leadingWs ( insertion . value ) ;
260- const newWsSuffix = trailingWs ( insertion . value ) ;
246+ const [ oldWsPrefix , oldWsSuffix ] = leadingAndTrailingWs ( deletion . value , segmenter ) ;
247+ const [ newWsPrefix , newWsSuffix ] = leadingAndTrailingWs ( insertion . value , segmenter ) ;
261248
262249 if ( startKeep ) {
263250 const commonWsPrefix = longestCommonPrefix ( oldWsPrefix , newWsPrefix ) ;
@@ -279,18 +266,17 @@ function dedupeWhitespaceInChangeObjects(
279266 // whitespace and deleting duplicate leading whitespace where
280267 // present.
281268 if ( startKeep ) {
282- const ws = leadingWs ( insertion . value ) ;
269+ const ws = leadingWs ( insertion . value , segmenter ) ;
283270 insertion . value = insertion . value . substring ( ws . length ) ;
284271 }
285272 if ( endKeep ) {
286- const ws = leadingWs ( endKeep . value ) ;
273+ const ws = leadingWs ( endKeep . value , segmenter ) ;
287274 endKeep . value = endKeep . value . substring ( ws . length ) ;
288275 }
289276 // otherwise we've got a deletion and no insertion
290277 } else if ( startKeep && endKeep ) {
291- const newWsFull = leadingWs ( endKeep . value ) ,
292- delWsStart = leadingWs ( deletion ! . value ) ,
293- delWsEnd = trailingWs ( deletion ! . value ) ;
278+ const newWsFull = leadingWs ( endKeep . value , segmenter ) ,
279+ [ delWsStart , delWsEnd ] = leadingAndTrailingWs ( deletion ! . value , segmenter ) ;
294280
295281 // Any whitespace that comes straight after startKeep in both the old and
296282 // new texts, assign to startKeep and remove from the deletion.
@@ -318,16 +304,16 @@ function dedupeWhitespaceInChangeObjects(
318304 // We are at the start of the text. Preserve all the whitespace on
319305 // endKeep, and just remove whitespace from the end of deletion to the
320306 // extent that it overlaps with the start of endKeep.
321- const endKeepWsPrefix = leadingWs ( endKeep . value ) ;
322- const deletionWsSuffix = trailingWs ( deletion ! . value ) ;
307+ const endKeepWsPrefix = leadingWs ( endKeep . value , segmenter ) ;
308+ const deletionWsSuffix = trailingWs ( deletion ! . value , segmenter ) ;
323309 const overlap = maximumOverlap ( deletionWsSuffix , endKeepWsPrefix ) ;
324310 deletion ! . value = removeSuffix ( deletion ! . value , overlap ) ;
325311 } else if ( startKeep ) {
326312 // We are at the END of the text. Preserve all the whitespace on
327313 // startKeep, and just remove whitespace from the start of deletion to
328314 // the extent that it overlaps with the end of startKeep.
329- const startKeepWsSuffix = trailingWs ( startKeep . value ) ;
330- const deletionWsPrefix = leadingWs ( deletion ! . value ) ;
315+ const startKeepWsSuffix = trailingWs ( startKeep . value , segmenter ) ;
316+ const deletionWsPrefix = leadingWs ( deletion ! . value , segmenter ) ;
331317 const overlap = maximumOverlap ( startKeepWsSuffix , deletionWsPrefix ) ;
332318 deletion ! . value = removePrefix ( deletion ! . value , overlap ) ;
333319 }
0 commit comments