utf8_kunit.c source code [linux/fs/unicode/tests/utf8_kunit.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* KUnit tests for utf-8 support.
4	*
5	* Copyright 2017 Collabora Ltd.
6	*/
7
8	#include <linux/unicode.h>
9	#include <kunit/test.h>
10
11	#include "../utf8n.h"
12
/*
 * Test vectors for the UTF8_NFDI checks in this file: normalizing .str is
 * expected to produce exactly the bytes in .dec.
 */
static const struct {
	/* UTF-8 strings in this vector _must_ be NUL-terminated. */
	unsigned char str[10];
	unsigned char dec[10];
} nfdi_test_data[] = {
	/* Trivial sequence */
	{
		/* "aBba" decomposes to itself */
		.str = "aBba",
		.dec = "aBba",
	},
	/* Simple equivalent sequences */
	{
		/*
		 * 'VULGAR FRACTION ONE QUARTER' cannot decompose to
		 * 'NUMBER 1' + 'FRACTION SLASH' + 'NUMBER 4' on
		 * canonical decomposition
		 */
		.str = {0xc2, 0xbc, 0x00},
		.dec = {0xc2, 0xbc, 0x00},
	},
	{
		/*
		 * 'LATIN SMALL LETTER A WITH DIAERESIS' decomposes to
		 * 'LETTER A' + 'COMBINING DIAERESIS'
		 */
		.str = {0xc3, 0xa4, 0x00},
		.dec = {0x61, 0xcc, 0x88, 0x00},
	},
	{
		/*
		 * 'LATIN SMALL LETTER LJ' can't decompose to
		 * 'LETTER L' + 'LETTER J' on canonical decomposition
		 */
		.str = {0xC7, 0x89, 0x00},
		.dec = {0xC7, 0x89, 0x00},
	},
	{
		/* GREEK ANO TELEIA decomposes to MIDDLE DOT */
		.str = {0xCE, 0x87, 0x00},
		.dec = {0xC2, 0xB7, 0x00}
	},
	/* Canonical ordering */
	{
		/*
		 * A + 'COMBINING ACUTE ACCENT' + 'COMBINING OGONEK' decomposes
		 * to A + 'COMBINING OGONEK' + 'COMBINING ACUTE ACCENT'
		 */
		.str = {0x41, 0xcc, 0x81, 0xcc, 0xa8, 0x0},
		.dec = {0x41, 0xcc, 0xa8, 0xcc, 0x81, 0x0},
	},
	{
		/*
		 * 'LATIN SMALL LETTER A WITH DIAERESIS' + 'COMBINING OGONEK'
		 * decomposes to
		 * 'LETTER A' + 'COMBINING OGONEK' + 'COMBINING DIAERESIS'
		 */
		.str = {0xc3, 0xa4, 0xCC, 0xA8, 0x00},

		.dec = {0x61, 0xCC, 0xA8, 0xcc, 0x88, 0x00},
	},

};
66
/*
 * Test vectors for the UTF8_NFDICF (case-folding) checks in this file:
 * normalizing .str is expected to produce exactly the bytes in .ncf.
 */
static const struct {
	/* UTF-8 strings in this vector _must_ be NUL-terminated. */
	unsigned char str[30];
	unsigned char ncf[30];
} nfdicf_test_data[] = {
	/* Trivial sequences */
	{
		/* "ABba" folds to lowercase */
		.str = {0x41, 0x42, 0x62, 0x61, 0x00},
		.ncf = {0x61, 0x62, 0x62, 0x61, 0x00},
	},
	{
		/* All ASCII folds to lower-case */
		.str = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0.1",
		.ncf = "abcdefghijklmnopqrstuvwxyz0.1",
	},
	{
		/*
		 * LATIN SMALL LETTER SHARP S folds to
		 * LATIN SMALL LETTER S + LATIN SMALL LETTER S
		 */
		.str = {0xc3, 0x9f, 0x00},
		.ncf = {0x73, 0x73, 0x00},
	},
	{
		/*
		 * LATIN CAPITAL LETTER A WITH RING ABOVE folds to
		 * LATIN SMALL LETTER A + COMBINING RING ABOVE
		 */
		.str = {0xC3, 0x85, 0x00},
		.ncf = {0x61, 0xcc, 0x8a, 0x00},
	},
	/* Introduced by Unicode 8.0.0. */
	/*
	 * Cherokee letters are interesting test-cases because they fold
	 * to upper-case.  Before 8.0.0, Cherokee lowercase were
	 * undefined, thus, the folding from LC is not stable between
	 * 7.0.0 -> 8.0.0, but it is from UC.
	 */
	{
		/* CHEROKEE SMALL LETTER A folds to CHEROKEE LETTER A */
		.str = {0xea, 0xad, 0xb0, 0x00},
		.ncf = {0xe1, 0x8e, 0xa0, 0x00},
	},
	{
		/* CHEROKEE SMALL LETTER YE folds to CHEROKEE LETTER YE */
		.str = {0xe1, 0x8f, 0xb8, 0x00},
		.ncf = {0xe1, 0x8f, 0xb0, 0x00},
	},
	{
		/*
		 * OLD HUNGARIAN CAPITAL LETTER AMB folds to
		 * OLD HUNGARIAN SMALL LETTER AMB
		 */
		.str = {0xf0, 0x90, 0xb2, 0x83, 0x00},
		.ncf = {0xf0, 0x90, 0xb3, 0x83, 0x00},
	},
	/* Introduced by Unicode 9.0.0. */
	{
		/*
		 * OSAGE CAPITAL LETTER CHA folds to
		 * OSAGE SMALL LETTER CHA
		 */
		.str = {0xf0, 0x90, 0x92, 0xb5, 0x00},
		.ncf = {0xf0, 0x90, 0x93, 0x9d, 0x00},
	},
	{
		/*
		 * LATIN CAPITAL LETTER SMALL CAPITAL I folds to
		 * LATIN LETTER SMALL CAPITAL I
		 */
		.str = {0xea, 0x9e, 0xae, 0x00},
		.ncf = {0xc9, 0xaa, 0x00},
	},
	/* Introduced by Unicode 11.0.0. */
	{
		/*
		 * GEORGIAN MTAVRULI CAPITAL LETTER AN (U+1C90) folds to
		 * GEORGIAN LETTER AN (U+10D0)
		 */
		.str = {0xe1, 0xb2, 0x90, 0x00},
		.ncf = {0xe1, 0x83, 0x90, 0x00},
	}
};
137
138	static ssize_t utf8len(const struct unicode_map um, enum* utf8_normalization n,
139	const char *s)
140	{
141	return utf8nlen(um, n, s, len: (size_t)-`1`);
142	}
143
144	static int utf8cursor(struct utf8cursor u8c, const* struct unicode_map *um,
145	enum utf8_normalization n, const char *s)
146	{
147	return utf8ncursor(u8c, um, n, s, len: (unsigned int)-`1`);
148	}
149
150	static void check_utf8_nfdi(struct kunit *test)
151	{
152	int i;
153	struct utf8cursor u8c;
154	struct unicode_map *um = test->priv;
155
156	for (i = `0`; i < ARRAY_SIZE(nfdi_test_data); i++) {
157	int len = strlen(nfdi_test_data[i].str);
158	int nlen = strlen(nfdi_test_data[i].dec);
159	int j = `0`;
160	unsigned char c;
161	int ret;
162
163	KUNIT_EXPECT_EQ(test, utf8len(um, UTF8_NFDI, nfdi_test_data[i].str), nlen);
164	KUNIT_EXPECT_EQ(test, utf8nlen(um, UTF8_NFDI, nfdi_test_data[i].str, len),
165	nlen);
166
167
168	ret = utf8cursor(u8c: &u8c, um, n: UTF8_NFDI, s: nfdi_test_data[i].str);
169	KUNIT_EXPECT_TRUE_MSG(test, ret >= `0`, "Can't create cursor\n");
170
171	while ((c = utf8byte(u8c: &u8c)) > `0`) {
172	KUNIT_EXPECT_EQ_MSG(test, c, nfdi_test_data[i].dec[j],
173	"Unexpected byte 0x%x should be 0x%x\n",
174	c, nfdi_test_data[i].dec[j]);
175	j++;
176	}
177
178	KUNIT_EXPECT_EQ(test, j, nlen);
179	}
180	}
181
182	static void check_utf8_nfdicf(struct kunit *test)
183	{
184	int i;
185	struct utf8cursor u8c;
186	struct unicode_map *um = test->priv;
187
188	for (i = `0`; i < ARRAY_SIZE(nfdicf_test_data); i++) {
189	int len = strlen(nfdicf_test_data[i].str);
190	int nlen = strlen(nfdicf_test_data[i].ncf);
191	int j = `0`;
192	int ret;
193	unsigned char c;
194
195	KUNIT_EXPECT_EQ(test, utf8len(um, UTF8_NFDICF, nfdicf_test_data[i].str),
196	nlen);
197	KUNIT_EXPECT_EQ(test, utf8nlen(um, UTF8_NFDICF, nfdicf_test_data[i].str, len),
198	nlen);
199
200	ret = utf8cursor(u8c: &u8c, um, n: UTF8_NFDICF, s: nfdicf_test_data[i].str);
201	KUNIT_EXPECT_TRUE_MSG(test, ret >= `0`, "Can't create cursor\n");
202
203	while ((c = utf8byte(u8c: &u8c)) > `0`) {
204	KUNIT_EXPECT_EQ_MSG(test, c, nfdicf_test_data[i].ncf[j],
205	"Unexpected byte 0x%x should be 0x%x\n",
206	c, nfdicf_test_data[i].ncf[j]);
207	j++;
208	}
209
210	KUNIT_EXPECT_EQ(test, j, nlen);
211	}
212	}
213
214	static void check_utf8_comparisons(struct kunit *test)
215	{
216	int i;
217	struct unicode_map *um = test->priv;
218
219	for (i = `0`; i < ARRAY_SIZE(nfdi_test_data); i++) {
220	const struct qstr s1 = {.name = nfdi_test_data[i].str,
221	.len = sizeof(nfdi_test_data[i].str)};
222	const struct qstr s2 = {.name = nfdi_test_data[i].dec,
223	.len = sizeof(nfdi_test_data[i].dec)};
224
225	/ strncmp returns 0 when strings are equal /
226	KUNIT_EXPECT_TRUE_MSG(test, utf8_strncmp(um, &s1, &s2) == `0`,
227	"%s %s comparison mismatch\n", s1.name, s2.name);
228	}
229
230	for (i = `0`; i < ARRAY_SIZE(nfdicf_test_data); i++) {
231	const struct qstr s1 = {.name = nfdicf_test_data[i].str,
232	.len = sizeof(nfdicf_test_data[i].str)};
233	const struct qstr s2 = {.name = nfdicf_test_data[i].ncf,
234	.len = sizeof(nfdicf_test_data[i].ncf)};
235
236	/ strncasecmp returns 0 when strings are equal /
237	KUNIT_EXPECT_TRUE_MSG(test, utf8_strncasecmp(um, &s1, &s2) == `0`,
238	"%s %s comparison mismatch\n", s1.name, s2.name);
239	}
240	}
241
/*
 * Query utf8version_is_supported() on the map stored in test->priv for a
 * few versions that are expected to be present and a few that are not.
 */
static void check_supported_versions(struct kunit *test)
{
	struct unicode_map *um = test->priv;
	/* Unicode 7.0.0 should be supported. */
	KUNIT_EXPECT_TRUE(test, utf8version_is_supported(um, UNICODE_AGE(7, 0, 0)));

	/* Unicode 9.0.0 should be supported. */
	KUNIT_EXPECT_TRUE(test, utf8version_is_supported(um, UNICODE_AGE(9, 0, 0)));

	/* The latest version (UTF8_LATEST) should be supported. */
	KUNIT_EXPECT_TRUE(test, utf8version_is_supported(um, UTF8_LATEST));

	/*
	 * Versions that are expected to be absent.
	 * NOTE(review): 13.0.0 is hard-coded as "newer than the latest";
	 * revisit when the bundled Unicode data is upgraded.
	 */
	KUNIT_EXPECT_FALSE(test, utf8version_is_supported(um, UNICODE_AGE(13, 0, 0)));
	KUNIT_EXPECT_FALSE(test, utf8version_is_supported(um, UNICODE_AGE(0, 0, 0)));
	KUNIT_EXPECT_FALSE(test, utf8version_is_supported(um, UNICODE_AGE(-1, -1, -1)));
}
259
/*
 * Test cases belonging to the unicode_normalization suite.  The trailing
 * empty entry terminates the list.
 */
static struct kunit_case unicode_normalization_test_cases[] = {
	KUNIT_CASE(check_supported_versions),
	KUNIT_CASE(check_utf8_comparisons),
	KUNIT_CASE(check_utf8_nfdicf),
	KUNIT_CASE(check_utf8_nfdi),
	{}
};
267
268	static int init_test_ucd(struct kunit *test)
269	{
270	struct unicode_map *um = utf8_load(UTF8_LATEST);
271
272	test->priv = um;
273
274	KUNIT_EXPECT_EQ_MSG(test, IS_ERR(um), `0`,
275	"%s: Unable to load utf8 table.\n", __func__);
276
277	return `0`;
278	}
279
280	static void exit_test_ucd(struct kunit *test)
281	{
282	utf8_unload(um: test->priv);
283	}
284
285	static struct kunit_suite unicode_normalization_test_suite = {
286	.name = "unicode_normalization",
287	.test_cases = unicode_normalization_test_cases,
288	.init = init_test_ucd,
289	.exit = exit_test_ucd,
290	};
291
292	kunit_test_suite(unicode_normalization_test_suite);
293
294
295	MODULE_AUTHOR("Gabriel Krisman Bertazi <krisman@collabora.co.uk>");
296	MODULE_DESCRIPTION("KUnit tests for utf-8 support.");
297	MODULE_LICENSE("GPL");
298

source code of linux/fs/unicode/tests/utf8_kunit.c