forked from microsoft/vscode
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathencoding.ts
More file actions
171 lines (137 loc) · 4.31 KB
/
encoding.ts
File metadata and controls
171 lines (137 loc) · 4.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
/*---------------------------------------------------------------------------------------------
* Copyright (c) Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See License.txt in the project root for license information.
*--------------------------------------------------------------------------------------------*/
'use strict';
import stream = require('vs/base/node/stream');
import iconv = require('iconv-lite');
import { TPromise } from 'vs/base/common/winjs.base';
// Encoding identifiers used by the functions in this module.
export const UTF8 = 'utf8';
// Identifier for UTF-8 with a byte order mark; toNodeEncoding() replaces it with UTF8 before iconv is called.
export const UTF8_with_bom = 'utf8bom';
export const UTF16be = 'utf16be';
export const UTF16le = 'utf16le';
/**
 * Returns how many bytes the byte order mark occupies for an encoding identifier.
 *
 * Note that detectEncodingByBOMFromBuffer reports a UTF-8 BOM as UTF8, so its
 * result can be passed here directly.
 *
 * @param encoding One of the encoding identifiers exported by this module.
 * @returns 3 for UTF8, 2 for UTF16be and UTF16le, 0 for any other value.
 */
export function bomLength(encoding: string): number {
	if (encoding === UTF8) {
		return 3;
	}

	if (encoding === UTF16be || encoding === UTF16le) {
		return 2;
	}

	return 0;
}
/**
 * Decodes a buffer into a string through iconv-lite.
 *
 * @param buffer The bytes to decode.
 * @param encoding Encoding identifier; UTF8_with_bom is handed to iconv as UTF8.
 * @param options Forwarded to iconv.decode unchanged.
 * @returns The decoded text.
 */
export function decode(buffer: NodeBuffer, encoding: string, options?: any): string {
	const nodeEncoding = toNodeEncoding(encoding);

	return iconv.decode(buffer, nodeEncoding, options);
}
/**
 * Encodes a string into a buffer through iconv-lite.
 *
 * @param content The text to encode.
 * @param encoding Encoding identifier; UTF8_with_bom is handed to iconv as UTF8.
 * @param options Forwarded to iconv.encode unchanged.
 * @returns The encoded bytes.
 */
export function encode(content: string, encoding: string, options?: any): NodeBuffer {
	const nodeEncoding = toNodeEncoding(encoding);

	return iconv.encode(content, nodeEncoding, options);
}
/**
 * Asks iconv-lite whether it knows the given encoding.
 *
 * @param encoding Encoding identifier; UTF8_with_bom is checked as UTF8.
 * @returns The result of iconv.encodingExists for the translated name.
 */
export function encodingExists(encoding: string): boolean {
	const nodeEncoding = toNodeEncoding(encoding);

	return iconv.encodingExists(nodeEncoding);
}
/**
 * Creates an iconv-lite decoding stream for the given encoding.
 *
 * @param encoding Encoding identifier; UTF8_with_bom is handed to iconv as UTF8.
 * @returns The stream produced by iconv.decodeStream.
 */
export function decodeStream(encoding: string): NodeJS.ReadWriteStream {
	const nodeEncoding = toNodeEncoding(encoding);

	return iconv.decodeStream(nodeEncoding);
}
/**
 * Creates an iconv-lite encoding stream for the given encoding.
 *
 * @param encoding Encoding identifier; UTF8_with_bom is handed to iconv as UTF8.
 * @returns The stream produced by iconv.encodeStream.
 */
export function encodeStream(encoding: string): NodeJS.ReadWriteStream {
	const nodeEncoding = toNodeEncoding(encoding);

	return iconv.encodeStream(nodeEncoding);
}
/**
 * Translates one of this module's encoding identifiers into the name handed to iconv.
 *
 * iconv does not distinguish UTF-8 with or without BOM, so UTF8_with_bom is
 * collapsed into UTF8. Every other value is returned untouched.
 *
 * @param enc The encoding identifier to translate.
 * @returns UTF8 when enc is UTF8_with_bom, otherwise enc itself.
 */
function toNodeEncoding(enc: string): string {
	return enc === UTF8_with_bom ? UTF8 : enc;
}
/**
 * Inspects the leading bytes of a buffer for a byte order mark.
 *
 * @param buffer The bytes to inspect; a falsy value yields null.
 * @param bytesRead Number of bytes at the start of buffer to consider, as
 * reported by the read that filled it (see detectEncodingByBOM).
 * @returns UTF16be, UTF16le or UTF8 when the matching BOM is found, otherwise null.
 */
export function detectEncodingByBOMFromBuffer(buffer: NodeBuffer, bytesRead: number): string {
	// The shortest BOM recognised here is two bytes long
	if (!buffer || bytesRead < 2) {
		return null;
	}

	// Combine the first two bytes so each two-byte signature is a single comparison
	const first = buffer.readUInt8(0);
	const second = buffer.readUInt8(1);
	const leadingPair = (first << 8) | second;

	switch (leadingPair) {
		case 0xFEFF: // UTF-16 BE
			return UTF16be;
		case 0xFFFE: // UTF-16 LE
			return UTF16le;
	}

	// The UTF-8 BOM needs a third byte
	if (bytesRead < 3) {
		return null;
	}

	const third = buffer.readUInt8(2);
	const isUtf8Bom = leadingPair === 0xEFBB && third === 0xBF;

	return isUtf8Bom ? UTF8 : null;
}
/**
 * Detects the byte order mark of a given file.
 *
 * Requests 3 bytes of the file from stream.readExactlyByFile and hands the
 * result to detectEncodingByBOMFromBuffer.
 *
 * @param file The file to inspect, passed to stream.readExactlyByFile.
 * @returns A promise of the encoding identifier matching the BOM, or of null
 * when no BOM is detected.
 */
export function detectEncodingByBOM(file: string): TPromise<string> {
	return stream.readExactlyByFile(file, 3).then(result => {
		return detectEncodingByBOMFromBuffer(result.buffer, result.bytesRead);
	});
}
// Assigned to jschardet.Constants.MINIMUM_THRESHOLD before each detection in guessEncodingByBuffer.
const MINIMUM_THRESHOLD = 0.2;
// Lowercased jschardet results that guessEncodingByBuffer discards (it returns null for them).
const IGNORE_ENCODINGS = ['ascii', 'utf-8', 'utf-16', 'utf-32'];
// Overrides applied by toIconvLiteEncoding: normalized (alphanumeric-only, lowercase) name -> name to return.
const MAPPED_ENCODINGS = {
'ibm866': 'cp866'
};
/**
 * Guesses the encoding of a buffer using jschardet.
 *
 * @param buffer The bytes to analyse.
 * @returns A promise of the guessed encoding name, converted by
 * toIconvLiteEncoding, or of null when jschardet offers no guess or the guess
 * is listed in IGNORE_ENCODINGS.
 */
export async function guessEncodingByBuffer(buffer: NodeBuffer): TPromise<string> {
	const detector = await import('jschardet');

	// Applied on every call, before detection runs
	detector.Constants.MINIMUM_THRESHOLD = MINIMUM_THRESHOLD;

	const guess = detector.detect(buffer);
	const guessedName = guess && guess.encoding;
	if (!guessedName) {
		return null;
	}

	// Ignore encodings that cannot guess correctly
	// (http://chardet.readthedocs.io/en/latest/supported-encodings.html)
	const isIgnored = IGNORE_ENCODINGS.indexOf(guessedName.toLowerCase()) >= 0;

	return isIgnored ? null : toIconvLiteEncoding(guessedName);
}
/**
 * Converts an encoding name, as reported by jschardet, into the form returned
 * by guessEncodingByBuffer.
 *
 * The name is stripped of every non-alphanumeric character and lowercased. If
 * the result is a key of MAPPED_ENCODINGS, the mapped value is returned instead.
 *
 * @param encodingName The encoding name to convert.
 * @returns The mapped name when one exists, otherwise the normalized name.
 */
function toIconvLiteEncoding(encodingName: string): string {
	const normalizedEncodingName = encodingName.replace(/[^a-zA-Z0-9]/g, '').toLowerCase();

	// Consult own keys only: a plain index would also find members inherited from
	// Object.prototype (e.g. 'constructor') and hand back a function instead of a name.
	const mapped = Object.prototype.hasOwnProperty.call(MAPPED_ENCODINGS, normalizedEncodingName)
		? MAPPED_ENCODINGS[normalizedEncodingName]
		: undefined;

	return mapped || normalizedEncodingName;
}
/**
 * Labels accepted in a settings file, keyed to the canonical label each one stands for.
 */
const CANONICAL_LABELS: { [label: string]: string } = {
	'shiftjis': 'shift-jis',
	'utf16le': 'utf-16le',
	'utf16be': 'utf-16be',
	'big5hkcs': 'big5-hkcs',
	'eucjp': 'euc-jp',
	'euckr': 'euc-kr',
	'koi8r': 'koi8-r',
	'koi8u': 'koi8-u',
	'macroman': 'x-mac-roman',
	'utf8bom': 'utf8'
};

/**
 * Maps an encoding label as allowed in a settings file to its canonical form.
 *
 * The labels allowed in settings do not match the canonical encoding labels
 * specified by WHATWG (https://encoding.spec.whatwg.org/#names-and-labels).
 * Iconv-lite strips all non-alphanumeric characters, but ripgrep doesn't, so
 * for backcompat the stripped labels are still accepted and translated here.
 *
 * @param enc The encoding label to translate.
 * @returns The canonical label for known labels, 'windows-<digits>' for labels
 * containing 'windows<digits>', and enc itself for everything else.
 */
export function toCanonicalName(enc: string): string {
	// Own keys only, so labels such as 'constructor' fall through like any unknown label
	const canonical = Object.prototype.hasOwnProperty.call(CANONICAL_LABELS, enc)
		? CANONICAL_LABELS[enc]
		: undefined;
	if (canonical !== undefined) {
		return canonical;
	}

	// Code pages written without the hyphen, e.g. 'windows1252'
	const windowsMatch = enc.match(/windows(\d+)/);

	return windowsMatch ? 'windows-' + windowsMatch[1] : enc;
}