From 973a4a89b6ea99a8e999d73aa7997c52932914f1 Mon Sep 17 00:00:00 2001 From: Bit0r Date: Tue, 14 Apr 2020 10:46:11 +0800 Subject: [PATCH 1/3] =?UTF-8?q?=F0=9F=9A=91=20=E8=A1=A5=E5=85=85=E7=BF=BB?= =?UTF-8?q?=E8=AF=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../article.md | 88 +++++++++---------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/9-regular-expressions/08-regexp-character-sets-and-ranges/article.md b/9-regular-expressions/08-regexp-character-sets-and-ranges/article.md index 751399f61c..b880c4c110 100644 --- a/9-regular-expressions/08-regexp-character-sets-and-ranges/article.md +++ b/9-regular-expressions/08-regexp-character-sets-and-ranges/article.md @@ -42,39 +42,39 @@ alert( "Voila".match(/V[oi]la/) ); // null,并没有匹配上 alert( "Exception 0xAF".match(/x[0-9A-F][0-9A-F]/g) ); // xAF ``` -Here `pattern:[0-9A-F]` has two ranges: it searches for a character that is either a digit from `0` to `9` or a letter from `A` to `F`. +`pattern:[0-9A-F]` 表示两个范围:它搜索一个字符,该字符是数字 `0` 到 `9` 或字母 `A` 到 `F`。 -If we'd like to look for lowercase letters as well, we can add the range `a-f`: `pattern:[0-9A-Fa-f]`. Or add the flag `pattern:i`. +如果我们还想查找小写字母,则可以添加范围 `a-f`:`pattern:[0-9A-Fa-f]`。或添加标志 `pattern:i`。 -We can also use character classes inside `[…]`. +我们也可以在 `[…]` 里面使用字符类。 -For instance, if we'd like to look for a wordly character `pattern:\w` or a hyphen `pattern:-`, then the set is `pattern:[\w-]`. +例如,如果我们想要查找单词字符 `pattern:\w` 或连字符 `pattern:-`,则该集合为 `pattern:[\w-]`。 -Combining multiple classes is also possible, e.g. `pattern:[\s\d]` means "a space character or a digit". 
+也可以组合多个类,例如 `pattern:[\s\d]` 表示 “空格字符或数字”。 -```smart header="Character classes are shorthands for certain character sets" -For instance: -- **\d** -- is the same as `pattern:[0-9]`, -- **\w** -- is the same as `pattern:[a-zA-Z0-9_]`, -- **\s** -- is the same as `pattern:[\t\n\v\f\r ]`, plus few other rare unicode space characters. +```smart header="字符类是某些字符集的简写" +例如: +* **\d** —— 和 `pattern:[0-9]` 相同, +* **\w** —— 和 `pattern:[a-zA-Z0-9_]` 相同, +* **\s** —— 和 `pattern:[\t\n\v\f\r ]` 外加少量罕见的 unicode 空格字符相同。 ``` -### Example: multi-language \w +### 示例:多语言 \w -As the character class `pattern:\w` is a shorthand for `pattern:[a-zA-Z0-9_]`, it can't find Chinese hieroglyphs, Cyrillic letters, etc. +由于字符类 `pattern:\w` 是 `pattern:[a-zA-Z0-9_]` 的简写,因此无法找到中文象形文字、西里尔字母等。 -We can write a more universal pattern, that looks for wordly characters in any language. That's easy with unicode properties: `pattern:[\p{Alpha}\p{M}\p{Nd}\p{Pc}\p{Join_C}]`. +我们可以编写一个更通用的模式,该模式可以查找任何语言中的文字字符。使用 Unicode 属性很容易做到这一点:`pattern:[\p{Alpha}\p{M}\p{Nd}\p{Pc}\p{Join_C}]`。 -Let's decipher it. Similar to `pattern:\w`, we're making a set of our own that includes characters with following unicode properties: +让我们理解它。类似于 `pattern:\w`,我们正在制作一个包括以下 unicode 字符的字符集: -- `Alphabetic` (`Alpha`) - for letters, -- `Mark` (`M`) - for accents, -- `Decimal_Number` (`Nd`) - for digits, -- `Connector_Punctuation` (`Pc`) - for the underscore `'_'` and similar characters, -- `Join_Control` (`Join_C`) - two special codes `200c` and `200d`, used in ligatures, e.g. in Arabic. 
+* `Alphabetic` (`Alpha`) —— 字母, +* `Mark` (`M`) —— 重读, +* `Decimal_Number` (`Nd`) —— 数字, +* `Connector_Punctuation` (`Pc`) —— 下划线 `'_'` 和类似的字符, +* `Join_Control` (`Join_C`) —— 两个特殊代码 `200c` 和 `200d`,用于连字,例如阿拉伯语。 -An example of use: +使用示例: ```js run let regexp = /[\p{Alpha}\p{M}\p{Nd}\p{Pc}\p{Join_C}]/gu; @@ -85,12 +85,12 @@ let str = `Hi 你好 12`; alert( str.match(regexp) ); // H,i,你,好,1,2 ``` -Of course, we can edit this pattern: add unicode properties or remove them. Unicode properties are covered in more details in the article <info:regexp-unicode>. +当然,我们可以编辑此模式:添加 unicode 属性或删除它们。文章 <info:regexp-unicode> 中包含了更多 Unicode 属性的细节。 -```warn header="Unicode properties aren't supported in Edge and Firefox" -Unicode properties `pattern:p{…}` are not yet implemented in Edge and Firefox. If we really need them, we can use library [XRegExp](http://xregexp.com/). +```warn header="Edge 和 Firefox 不支持 Unicode 属性" +Edge 和 Firefox 尚未实现 Unicode 属性 `pattern:\p{…}`。如果确实需要它们,可以使用库 [XRegExp](http://xregexp.com/)。 -Or just use ranges of characters in a language that interests us, e.g. `pattern:[а-я]` for Cyrillic letters. +或者只使用我们感兴趣的语言中的字符范围,例如用于西里尔字母的 `pattern:[а-я]`。 ``` ## 排除范围 @@ -146,26 +146,26 @@ let reg = /[\-\(\)\.\^\+]/g; alert( "1 + 2 - 3".match(reg) ); // 仍能正常工作:+,- ``` -## Ranges and flag "u" +## 范围和标志“u” -If there are surrogate pairs in the set, flag `pattern:u` is required for them to work correctly. +如果集合中有代理对,则需要标志 `pattern:u` 以使其正常工作。 -For instance, let's look for `pattern:[𝒳𝒴]` in the string `subject:𝒳`: +例如,让我们在字符串 `subject:𝒳` 中查找 `pattern:[𝒳𝒴]`: ```js run -alert( '𝒳'.match(/[𝒳𝒴]/) ); // shows a strange character, like [?] -// (the search was performed incorrectly, half-character returned) +alert( '𝒳'.match(/[𝒳𝒴]/) ); // 显示一个奇怪的字符,像 [?] +//(搜索执行不正确,返回了半个字符) ``` -The result is incorrect, because by default regular expressions "don't know" about surrogate pairs. +结果不正确,因为默认情况下正则表达式“不知道”代理对。 -The regular expression engine thinks that `[𝒳𝒴]` -- are not two, but four characters: -1. left half of `𝒳` `(1)`, -2. 
right half of `𝒳` `(2)`, -3. left half of `𝒴` `(3)`, -4. right half of `𝒴` `(4)`. +正则表达式引擎认为 `[𝒳𝒴]` 不是两个字符,而是四个字符: +1. `𝒳` `(1)` 的左半部分, +2. `𝒳` `(2)` 的右半部分, +3. `𝒴` `(3)` 的左半部分, +4. `𝒴` `(4)` 的右半部分。 -We can see their codes like this: +我们可以看到它们的代码,如下所示: ```js run for(let i=0; i<'𝒳𝒴'.length; i++) { @@ -173,27 +173,27 @@ for(let i=0; i<'𝒳𝒴'.length; i++) { }; ``` -So, the example above finds and shows the left half of `𝒳`. +因此,以上示例查找并显示了 `𝒳` 的左半部分。 -If we add flag `pattern:u`, then the behavior will be correct: +如果我们添加标志 `pattern:u`,那么行为将是正确的: ```js run alert( '𝒳'.match(/[𝒳𝒴]/u) ); // 𝒳 ``` -The similar situation occurs when looking for a range, such as `[𝒳-𝒴]`. +当我们查找范围时也会出现类似的情况,例如 `[𝒳-𝒴]`。 -If we forget to add flag `pattern:u`, there will be an error: +如果我们忘记添加标志 `pattern:u`,则会出现错误: ```js run -'𝒳'.match(/[𝒳-𝒴]/); // Error: Invalid regular expression +'𝒳'.match(/[𝒳-𝒴]/); // 错误:无效的正则表达式 ``` -The reason is that without flag `pattern:u` surrogate pairs are perceived as two characters, so `[𝒳-𝒴]` is interpreted as `[<55349><56499>-<55349><56500>]` (every surrogate pair is replaced with its codes). Now it's easy to see that the range `56499-55349` is invalid: its starting code `56499` is greater than the end `55349`. That's the formal reason for the error. 
+原因是,如果没有标志 `pattern:u`,代理对会被视为两个字符,因此 `[𝒳-𝒴]` 被解释为 `[<55349><56499>-<55349><56500>]`(每个代理对都替换为其代码)。现在很容易看出范围 `56499-55349` 是无效的:其起始代码 `56499` 大于终止代码 `55349`。这就是错误的原因。 -With the flag `pattern:u` the pattern works correctly: +使用标志 `pattern:u`,该模式可以正常工作: ```js run -// look for characters from 𝒳 to 𝒵 +// 查找字符从 𝒳 到 𝒵 alert( '𝒴'.match(/[𝒳-𝒵]/u) ); // 𝒴 ``` From 711e328bdc482b6c9ba2354171165438041cfa01 Mon Sep 17 00:00:00 2001 From: Bit0r Date: Tue, 21 Apr 2020 09:55:28 +0800 Subject: [PATCH 2/3] =?UTF-8?q?=E7=BF=BB=E8=AF=91=E6=B6=A6=E8=89=B2?= =?UTF-8?q?=F0=9F=8E=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../08-regexp-character-sets-and-ranges/article.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/9-regular-expressions/08-regexp-character-sets-and-ranges/article.md b/9-regular-expressions/08-regexp-character-sets-and-ranges/article.md index b880c4c110..e7726eb2d9 100644 --- a/9-regular-expressions/08-regexp-character-sets-and-ranges/article.md +++ b/9-regular-expressions/08-regexp-character-sets-and-ranges/article.md @@ -42,7 +42,7 @@ alert( "Voila".match(/V[oi]la/) ); // null,并没有匹配上 alert( "Exception 0xAF".match(/x[0-9A-F][0-9A-F]/g) ); // xAF ``` -`pattern:[0-9A-F]` 表示两个范围:它搜索一个字符,该字符是数字 `0` 到 `9` 或字母 `A` 到 `F`。 +`pattern:[0-9A-F]` 表示两个范围:它搜索一个字符,满足数字 `0` 到 `9` 或字母 `A` 到 `F`。 如果我们还想查找小写字母,则可以添加范围 `a-f`:`pattern:[0-9A-Fa-f]`。或添加标志 `pattern:i`。 @@ -66,7 +66,7 @@ alert( "Exception 0xAF".match(/x[0-9A-F][0-9A-F]/g) ); // xAF 我们可以编写一个更通用的模式,该模式可以查找任何语言中的文字字符。使用 Unicode 属性很容易做到这一点:`pattern:[\p{Alpha}\p{M}\p{Nd}\p{Pc}\p{Join_C}]`。 -让我们理解它。类似于 `pattern:\w`,我们正在制作一个包括以下 unicode 字符的字符集: +让我们理解它。类似于 `pattern:\w`,我们在制作自己的一套字符集,包括以下 unicode 字符: * `Alphabetic` (`Alpha`) —— 字母, * `Mark` (`M`) —— 重读, @@ -191,7 +191,7 @@ alert( '𝒳'.match(/[𝒳𝒴]/u) ); // 𝒳 原因是,如果没有标志 `pattern:u`,代理对会被视为两个字符,因此 `[𝒳-𝒴]` 被解释为 `[<55349><56499>-<55349><56500>]`(每个代理对都替换为其代码)。现在很容易看出范围 `56499-55349` 是无效的:其起始代码 `56499` 大于终止代码 
`55349`。这就是错误的原因。 -使用标志 `pattern:u`,该模式可以正常工作: +使用标志 `pattern:u`,该模式可以正常匹配: ```js run // 查找字符从 𝒳 到 𝒵 From 7a46d4cfd81e601bbbd040a78694a37a846d1ed1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=97=BA?= Date: Fri, 24 Apr 2020 15:54:33 +0800 Subject: [PATCH 3/3] Update 9-regular-expressions/08-regexp-character-sets-and-ranges/article.md Co-Authored-By: Martin --- .../08-regexp-character-sets-and-ranges/article.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/9-regular-expressions/08-regexp-character-sets-and-ranges/article.md b/9-regular-expressions/08-regexp-character-sets-and-ranges/article.md index e7726eb2d9..9afc10ec4b 100644 --- a/9-regular-expressions/08-regexp-character-sets-and-ranges/article.md +++ b/9-regular-expressions/08-regexp-character-sets-and-ranges/article.md @@ -148,7 +148,7 @@ alert( "1 + 2 - 3".match(reg) ); // 仍能正常工作:+,- ## 范围和标志“u” -如果集合中有代理对,则需要标志 `pattern:u` 以使其正常工作。 +如果集合中有代理对(surrogate pairs),则需要标志 `pattern:u` 以使其正常工作。 例如,让我们在字符串 `subject:𝒳` 中查找 `pattern:[𝒳𝒴]`: