Skip to content

Commit a6a73e5

Browse files
authored
Merge pull request microsoft#17454 from Microsoft/ben/17408
Search does not work in UTF-16 LE encoded files (fixes microsoft#17408)
2 parents 95fa550 + 00d483b commit a6a73e5

3 files changed

Lines changed: 43 additions & 17 deletions

File tree

src/vs/base/node/encoding.ts

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,18 @@ export const UTF8_with_bom = 'utf8bom';
1313
export const UTF16be = 'utf16be';
1414
export const UTF16le = 'utf16le';
1515

16+
/**
 * Returns the length in bytes of the byte order mark (BOM) for the given encoding.
 *
 * @param encoding One of the encoding identifiers exported by this module.
 * @returns 3 for `UTF8`, 2 for `UTF16be` and `UTF16le`, and 0 for any other value.
 *
 * NOTE(review): `UTF8_with_bom` is not matched here and yields 0; presumably
 * callers pass `UTF8` only when a BOM was actually detected - confirm.
 */
export function bomLength(encoding: string): number {
17+
switch (encoding) {
18+
case UTF8:
19+
return 3;
20+
case UTF16be:
21+
case UTF16le:
22+
return 2;
23+
}
24+
25+
return 0;
26+
}
27+
1628
/**
 * Decodes a buffer into a string by delegating to `iconv.decode`.
 *
 * @param buffer The raw bytes to decode.
 * @param encoding Encoding identifier; it is translated with `toNodeEncoding` before being handed to iconv.
 * @param options Optional value forwarded unchanged as the third argument of `iconv.decode`.
 * @returns The decoded string produced by `iconv.decode`.
 */
export function decode(buffer: NodeBuffer, encoding: string, options?: any): string {
1729
return iconv.decode(buffer, toNodeEncoding(encoding), options);
1830
}

src/vs/workbench/services/search/node/worker/searchWorker.ts

Lines changed: 28 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ import { TPromise } from 'vs/base/common/winjs.base';
1515
import { ISerializedFileMatch } from '../search';
1616
import * as baseMime from 'vs/base/common/mime';
1717
import { ILineMatch } from 'vs/platform/search/common/search';
18-
import { UTF16le, UTF16be, UTF8, UTF8_with_bom, encodingExists, decode } from 'vs/base/node/encoding';
18+
import { UTF16le, UTF16be, UTF8, UTF8_with_bom, encodingExists, decode, bomLength } from 'vs/base/node/encoding';
1919
import { detectMimeAndEncodingFromBuffer } from 'vs/base/node/mime';
2020

2121
import { ISearchWorker, ISearchWorkerSearchArgs, ISearchWorkerSearchResult } from './searchWorkerIpc';
@@ -66,6 +66,9 @@ interface IFileSearchResult {
6666
limitReached?: boolean;
6767
}
6868

69+
// Byte values for line feed (0x0a) and carriage return (0x0d); the search loop
// compares individual buffer bytes against these to find line boundaries.
const LF = 0x0a;
70+
const CR = 0x0d;
71+
6972
export class SearchWorkerEngine {
7073
private nextSearch = TPromise.wrap(null);
7174
private isCanceled = false;
@@ -205,31 +208,42 @@ export class SearchWorkerEngine {
205208

206209
// Detect encoding and mime when this is the beginning of the file
207210
if (isFirstRead) {
208-
let mimeAndEncoding = detectMimeAndEncodingFromBuffer(buffer, bytesRead);
211+
const mimeAndEncoding = detectMimeAndEncodingFromBuffer(buffer, bytesRead);
209212
if (mimeAndEncoding.mimes[mimeAndEncoding.mimes.length - 1] !== baseMime.MIME_TEXT) {
210213
return clb(null); // skip files that seem binary
211214
}
212215

213216
// Check for BOM offset
214217
switch (mimeAndEncoding.encoding) {
215218
case UTF8:
216-
pos = i = 3;
219+
pos = i = bomLength(UTF8);
217220
options.encoding = UTF8;
218221
break;
219222
case UTF16be:
220-
pos = i = 2;
223+
pos = i = bomLength(UTF16be);
221224
options.encoding = UTF16be;
222225
break;
223226
case UTF16le:
224-
pos = i = 2;
227+
pos = i = bomLength(UTF16le);
225228
options.encoding = UTF16le;
226229
break;
227230
}
228231
}
229232

233+
// when we are running with UTF16le, LF and CR are encoded as
234+
// 0A 00 (LF) and 0D 00 (CR). the zero bytes are at the end
235+
// due to little endianness. since we want to split our buffer
236+
// into lines, we need to skip over the 00 bytes after LF and CR
237+
// so UTF16-LE gets a multiplier of 2, otherwise we would include
238+
// bad 00 bytes in our resulting buffer.
239+
let byteOffsetMultiplier = 1;
240+
if (options.encoding === UTF16le) {
241+
byteOffsetMultiplier = 2;
242+
}
243+
230244
if (lastBufferHadTraillingCR) {
231-
if (buffer[i] === 0x0a) { // LF (Line Feed)
232-
lineFinished(1);
245+
if (buffer[i] === LF) {
246+
lineFinished(1 * byteOffsetMultiplier);
233247
i++;
234248
} else {
235249
lineFinished(0);
@@ -239,16 +253,16 @@ export class SearchWorkerEngine {
239253
}
240254

241255
for (; i < bytesRead; ++i) {
242-
if (buffer[i] === 0x0a) { // LF (Line Feed)
243-
lineFinished(1);
244-
} else if (buffer[i] === 0x0d) { // CR (Carriage Return)
256+
if (buffer[i] === LF) {
257+
lineFinished(1 * byteOffsetMultiplier);
258+
} else if (buffer[i] === CR) { // CR (Carriage Return)
245259
if (i + 1 === bytesRead) {
246260
lastBufferHadTraillingCR = true;
247-
} else if (buffer[i + 1] === 0x0a) { // LF (Line Feed)
248-
lineFinished(2);
261+
} else if (buffer[i + 1] === LF) {
262+
lineFinished(2 * byteOffsetMultiplier);
249263
i++;
250264
} else {
251-
lineFinished(1);
265+
lineFinished(1 * byteOffsetMultiplier);
252266
}
253267
}
254268
}
@@ -339,7 +353,7 @@ export class LineMatch implements ILineMatch {
339353
}
340354

341355
serialize(): ILineMatch {
342-
let result = {
356+
const result = {
343357
preview: this.preview,
344358
lineNumber: this.lineNumber,
345359
offsetAndLengths: this.offsetAndLengths

src/vs/workbench/services/search/test/node/search.test.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -696,7 +696,7 @@ suite('Search', () => {
696696
}
697697
}, () => { }, (error) => {
698698
assert.ok(!error);
699-
assert.equal(c, 2);
699+
assert.equal(c, 3);
700700
done();
701701
});
702702
});
@@ -717,7 +717,7 @@ suite('Search', () => {
717717
}
718718
}, (result) => { }, (error) => {
719719
assert.ok(!error);
720-
assert.equal(c, 748);
720+
assert.equal(c, 776);
721721
done();
722722
});
723723
});
@@ -739,7 +739,7 @@ suite('Search', () => {
739739
}
740740
}, (result) => { }, (error) => {
741741
assert.ok(!error);
742-
assert.equal(c, 366);
742+
assert.equal(c, 394);
743743
done();
744744
});
745745
});

0 commit comments

Comments
 (0)