Skip to content

Commit 3a816d6

Browse files
authored
BAEL-6967-decode-unicode-in-string (#14901)
* BAEL-6967-decode-unicode-in-string * update unit test --------- Co-authored-by: tienvn <tienvn@>
1 parent e62aba1 commit 3a816d6

3 files changed

Lines changed: 74 additions & 0 deletions

File tree

libraries-apache-commons-2/pom.xml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,18 @@
2828
<artifactId>commons-vfs2</artifactId>
2929
<version>${commons-vfs2.version}</version>
3030
</dependency>
31+
<dependency>
32+
<groupId>org.apache.commons</groupId>
33+
<artifactId>commons-text</artifactId>
34+
<version>${apache-commons-text.version}</version>
35+
</dependency>
3136
</dependencies>
3237

3338
<properties>
3439
<commons-compress.version>1.23.0</commons-compress.version>
3540
<ant.version>1.10.13</ant.version>
3641
<commons-vfs2.version>2.9.0</commons-vfs2.version>
42+
<apache-commons-text.version>1.10.0</apache-commons-text.version>
3743
</properties>
3844

3945
</project>
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
package com.baeldung.commons.convertunicode;
2+
3+
import org.apache.commons.text.StringEscapeUtils;
4+
5+
import java.util.regex.Matcher;
6+
import java.util.regex.Pattern;
7+
8+
public class UnicodeConverterUtil {
9+
10+
public static String decodeWithApacheCommons(String input) {
11+
return StringEscapeUtils.unescapeJava(input);
12+
}
13+
14+
public static String decodeWithPlainJava(String input) {
15+
Pattern pattern = Pattern.compile("\\\\u[0-9a-fA-F]{4}");
16+
Matcher matcher = pattern.matcher(input);
17+
18+
StringBuilder decodedString = new StringBuilder();
19+
20+
while (matcher.find()) {
21+
String unicodeSequence = matcher.group();
22+
char unicodeChar = (char) Integer.parseInt(unicodeSequence.substring(2), 16);
23+
matcher.appendReplacement(decodedString, Character.toString(unicodeChar));
24+
}
25+
26+
matcher.appendTail(decodedString);
27+
return decodedString.toString();
28+
}
29+
}
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
package com.baeldung.commons.convertunicode;
2+
3+
import org.junit.jupiter.api.Test;
4+
5+
import static org.junit.jupiter.api.Assertions.assertEquals;
6+
7+
public class UnicodeConverterUnitTest {
8+
9+
@Test
10+
public void whenInputHaveUnicodeSequences_ThenDecode() {
11+
String encodedString = "\\u0048\\u0065\\u006C\\u006C\\u006F World";
12+
String expectedDecodedString = "Hello World";
13+
assertEquals(expectedDecodedString, UnicodeConverterUtil.decodeWithApacheCommons(encodedString));
14+
assertEquals(expectedDecodedString, UnicodeConverterUtil.decodeWithPlainJava(encodedString));
15+
}
16+
17+
@Test
18+
public void whenInputHaveNoUnicodeSequences_ThenDoNothing() {
19+
String inputString = "Hello World";
20+
assertEquals(inputString, UnicodeConverterUtil.decodeWithApacheCommons(inputString));
21+
assertEquals(inputString, UnicodeConverterUtil.decodeWithPlainJava(inputString));
22+
}
23+
24+
@Test
25+
public void whenInputHaveUnicodeSequencesInMiddle_ThenDecode() {
26+
String encodedString = "This is a test \\u0069\\u006E the middle.";
27+
String expectedDecodedString = "This is a test in the middle.";
28+
assertEquals(expectedDecodedString, UnicodeConverterUtil.decodeWithApacheCommons(encodedString));
29+
assertEquals(expectedDecodedString, UnicodeConverterUtil.decodeWithPlainJava(encodedString));
30+
}
31+
32+
@Test
33+
public void whenInputHaveMultipleUnicodeSequences_ThenDecode() {
34+
String encodedString = "Unicode: \\u0048\\u0065\\u006C\\u006C\\u006F \\u0057\\u006F\\u0072\\u006C\\u0064";
35+
String expectedDecodedString = "Unicode: Hello World";
36+
assertEquals(expectedDecodedString, UnicodeConverterUtil.decodeWithApacheCommons(encodedString));
37+
assertEquals(expectedDecodedString, UnicodeConverterUtil.decodeWithPlainJava(encodedString));
38+
}
39+
}

0 commit comments

Comments
 (0)