fs/unicode/utf8-selftest.c

0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  * Kernel module for testing utf-8 support.
0004  *
0005  * Copyright 2017 Collabora Ltd.
0006  */
0007
0008 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
0009
0010 #include <linux/module.h>
0011 #include <linux/printk.h>
0012 #include <linux/unicode.h>
0013 #include <linux/dcache.h>
0014
0015 #include "utf8n.h"
0016
/*
 * Counters for one self-test run.  They are incremented by the _test()
 * macro and reset and reported by init_test_ucd().  Nothing in this file
 * needs them to be visible to other translation units, so keep them
 * file-local instead of exporting two generic names from the module.
 */
static unsigned int failed_tests;
static unsigned int total_tests;

/* Unicode version whose tables are loaded and exercised by the tests. */
#define UTF8_LATEST UNICODE_AGE(12, 1, 0)
0022
/*
 * Self-test assertion helpers.
 *
 * _test(cond, func, line, fmt, ...) counts one test in total_tests and, when
 * @cond is false, counts a failure in failed_tests and logs the function
 * name, the line and the stringified condition, followed by the optional
 * printf-style message @fmt.
 *
 * @cond is parenthesized before negation so that callers may pass a plain
 * expression such as "a == b" without wrapping it in extra parentheses.
 *
 * @fmt must be a string literal (possibly empty).  Its first character is
 * inspected to decide whether a message follows: testing the literal itself
 * would always be true, because a string literal is never a null pointer.
 *
 * test_f(cond, fmt, ...) - check @cond, with an explanatory message.
 * test(cond)             - check @cond, without a message.
 */
#define _test(cond, func, line, fmt, ...) do {				\
		total_tests++;						\
		if (!(cond)) {						\
			failed_tests++;					\
			pr_err("test %s:%d Failed: %s%s",		\
			       func, line, #cond,			\
			       ((fmt)[0] ? ":" : "."));			\
			if ((fmt)[0])					\
				pr_err(fmt, ##__VA_ARGS__);		\
		}							\
	} while (0)
#define test_f(cond, fmt, ...) _test(cond, __func__, __LINE__, fmt, ##__VA_ARGS__)
#define test(cond) _test(cond, __func__, __LINE__, "")
0035
/*
 * Vectors for canonical decomposition: .str is the input and .dec is the
 * byte sequence the tests expect to obtain from it.
 *
 * Both members are consumed with strlen() and byte-by-byte comparison, so
 * every sequence must be NUL-terminated within its 10 bytes.
 */
static const struct {
	unsigned char str[10];
	unsigned char dec[10];
} nfdi_test_data[] = {
	/* Trivial sequence: "aBba" decomposes to itself. */
	{
		.str = "aBba",
		.dec = "aBba",
	},

	/* Simple equivalent sequences. */

	/*
	 * 'VULGAR FRACTION ONE QUARTER' must not decompose to
	 * 'NUMBER 1' + 'FRACTION SLASH' + 'NUMBER 4' under canonical
	 * decomposition.
	 */
	{
		.str = { 0xc2, 0xbc, 0x00 },
		.dec = { 0xc2, 0xbc, 0x00 },
	},
	/*
	 * 'LATIN SMALL LETTER A WITH DIAERESIS' decomposes to
	 * 'LETTER A' + 'COMBINING DIAERESIS'.
	 */
	{
		.str = { 0xc3, 0xa4, 0x00 },
		.dec = { 0x61, 0xcc, 0x88, 0x00 },
	},
	/*
	 * 'LATIN SMALL LETTER LJ' must not decompose to
	 * 'LETTER L' + 'LETTER J' under canonical decomposition.
	 */
	{
		.str = { 0xc7, 0x89, 0x00 },
		.dec = { 0xc7, 0x89, 0x00 },
	},
	/* 'GREEK ANO TELEIA' decomposes to 'MIDDLE DOT'. */
	{
		.str = { 0xce, 0x87, 0x00 },
		.dec = { 0xc2, 0xb7, 0x00 },
	},

	/* Canonical ordering. */

	/*
	 * A + 'COMBINING ACUTE ACCENT' + 'COMBINING OGONEK' is reordered to
	 * A + 'COMBINING OGONEK' + 'COMBINING ACUTE ACCENT'.
	 */
	{
		.str = { 0x41, 0xcc, 0x81, 0xcc, 0xa8, 0x00 },
		.dec = { 0x41, 0xcc, 0xa8, 0xcc, 0x81, 0x00 },
	},
	/*
	 * 'LATIN SMALL LETTER A WITH DIAERESIS' + 'COMBINING OGONEK'
	 * decomposes to
	 * 'LETTER A' + 'COMBINING OGONEK' + 'COMBINING DIAERESIS'.
	 */
	{
		.str = { 0xc3, 0xa4, 0xcc, 0xa8, 0x00 },
		.dec = { 0x61, 0xcc, 0xa8, 0xcc, 0x88, 0x00 },
	},
};
0089
/*
 * Vectors for decomposition combined with case folding: .str is the input
 * and .ncf is the byte sequence the tests expect to obtain from it.
 *
 * Both members are consumed with strlen() and byte-by-byte comparison, so
 * every sequence must be NUL-terminated within its 30 bytes.
 */
static const struct {
	unsigned char str[30];
	unsigned char ncf[30];
} nfdicf_test_data[] = {
	/* Trivial sequences. */

	/* "ABba" folds to lower-case. */
	{
		.str = "ABba",
		.ncf = "abba",
	},
	/* All ASCII folds to lower-case (29 characters plus the NUL). */
	{
		.str = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0.1",
		.ncf = "abcdefghijklmnopqrstuvwxyz0.1",
	},
	/*
	 * LATIN SMALL LETTER SHARP S folds to
	 * LATIN SMALL LETTER S + LATIN SMALL LETTER S.
	 */
	{
		.str = { 0xc3, 0x9f, 0x00 },
		.ncf = { 0x73, 0x73, 0x00 },
	},
	/*
	 * LATIN CAPITAL LETTER A WITH RING ABOVE folds to
	 * LATIN SMALL LETTER A + COMBINING RING ABOVE.
	 */
	{
		.str = { 0xc3, 0x85, 0x00 },
		.ncf = { 0x61, 0xcc, 0x8a, 0x00 },
	},

	/*
	 * Introduced by Unicode 8.0.0.
	 *
	 * Cherokee letters are interesting test cases because they fold to
	 * upper-case.  Before 8.0.0 the Cherokee lower-case letters were
	 * undefined, so folding from lower-case is not stable across
	 * 7.0.0 -> 8.0.0, but folding from upper-case is.
	 */

	/* CHEROKEE SMALL LETTER A folds to CHEROKEE LETTER A. */
	{
		.str = { 0xea, 0xad, 0xb0, 0x00 },
		.ncf = { 0xe1, 0x8e, 0xa0, 0x00 },
	},
	/* CHEROKEE SMALL LETTER YE folds to CHEROKEE LETTER YE. */
	{
		.str = { 0xe1, 0x8f, 0xb8, 0x00 },
		.ncf = { 0xe1, 0x8f, 0xb0, 0x00 },
	},
	/*
	 * OLD HUNGARIAN CAPITAL LETTER AMB folds to
	 * OLD HUNGARIAN SMALL LETTER AMB.
	 */
	{
		.str = { 0xf0, 0x90, 0xb2, 0x83, 0x00 },
		.ncf = { 0xf0, 0x90, 0xb3, 0x83, 0x00 },
	},

	/* Introduced by Unicode 9.0.0. */

	/* OSAGE CAPITAL LETTER CHA folds to OSAGE SMALL LETTER CHA. */
	{
		.str = { 0xf0, 0x90, 0x92, 0xb5, 0x00 },
		.ncf = { 0xf0, 0x90, 0x93, 0x9d, 0x00 },
	},
	/*
	 * LATIN CAPITAL LETTER SMALL CAPITAL I folds to
	 * LATIN LETTER SMALL CAPITAL I.
	 */
	{
		.str = { 0xea, 0x9e, 0xae, 0x00 },
		.ncf = { 0xc9, 0xaa, 0x00 },
	},

	/* Introduced by Unicode 11.0.0. */

	/*
	 * GEORGIAN MTAVRULI CAPITAL LETTER AN (U+1C90) folds to
	 * GEORGIAN LETTER AN (U+10D0).
	 */
	{
		.str = { 0xe1, 0xb2, 0x90, 0x00 },
		.ncf = { 0xe1, 0x83, 0x90, 0x00 },
	},
};
0160
0161 static ssize_t utf8len(const struct unicode_map *um, enum utf8_normalization n,
0162         const char *s)
0163 {
0164     return utf8nlen(um, n, s, (size_t)-1);
0165 }
0166
0167 static int utf8cursor(struct utf8cursor *u8c, const struct unicode_map *um,
0168         enum utf8_normalization n, const char *s)
0169 {
0170     return utf8ncursor(u8c, um, n, s, (unsigned int)-1);
0171 }
0172
0173 static void check_utf8_nfdi(struct unicode_map *um)
0174 {
0175     int i;
0176     struct utf8cursor u8c;
0177
0178     for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) {
0179         int len = strlen(nfdi_test_data[i].str);
0180         int nlen = strlen(nfdi_test_data[i].dec);
0181         int j = 0;
0182         unsigned char c;
0183
0184         test((utf8len(um, UTF8_NFDI, nfdi_test_data[i].str) == nlen));
0185         test((utf8nlen(um, UTF8_NFDI, nfdi_test_data[i].str, len) ==
0186             nlen));
0187
0188         if (utf8cursor(&u8c, um, UTF8_NFDI, nfdi_test_data[i].str) < 0)
0189             pr_err("can't create cursor\n");
0190
0191         while ((c = utf8byte(&u8c)) > 0) {
0192             test_f((c == nfdi_test_data[i].dec[j]),
0193                    "Unexpected byte 0x%x should be 0x%x\n",
0194                    c, nfdi_test_data[i].dec[j]);
0195             j++;
0196         }
0197
0198         test((j == nlen));
0199     }
0200 }
0201
0202 static void check_utf8_nfdicf(struct unicode_map *um)
0203 {
0204     int i;
0205     struct utf8cursor u8c;
0206
0207     for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) {
0208         int len = strlen(nfdicf_test_data[i].str);
0209         int nlen = strlen(nfdicf_test_data[i].ncf);
0210         int j = 0;
0211         unsigned char c;
0212
0213         test((utf8len(um, UTF8_NFDICF, nfdicf_test_data[i].str) ==
0214                 nlen));
0215         test((utf8nlen(um, UTF8_NFDICF, nfdicf_test_data[i].str, len) ==
0216                 nlen));
0217
0218         if (utf8cursor(&u8c, um, UTF8_NFDICF,
0219                 nfdicf_test_data[i].str) < 0)
0220             pr_err("can't create cursor\n");
0221
0222         while ((c = utf8byte(&u8c)) > 0) {
0223             test_f((c == nfdicf_test_data[i].ncf[j]),
0224                    "Unexpected byte 0x%x should be 0x%x\n",
0225                    c, nfdicf_test_data[i].ncf[j]);
0226             j++;
0227         }
0228
0229         test((j == nlen));
0230     }
0231 }
0232
0233 static void check_utf8_comparisons(struct unicode_map *table)
0234 {
0235     int i;
0236
0237     for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) {
0238         const struct qstr s1 = {.name = nfdi_test_data[i].str,
0239                     .len = sizeof(nfdi_test_data[i].str)};
0240         const struct qstr s2 = {.name = nfdi_test_data[i].dec,
0241                     .len = sizeof(nfdi_test_data[i].dec)};
0242
0243         test_f(!utf8_strncmp(table, &s1, &s2),
0244                "%s %s comparison mismatch\n", s1.name, s2.name);
0245     }
0246
0247     for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) {
0248         const struct qstr s1 = {.name = nfdicf_test_data[i].str,
0249                     .len = sizeof(nfdicf_test_data[i].str)};
0250         const struct qstr s2 = {.name = nfdicf_test_data[i].ncf,
0251                     .len = sizeof(nfdicf_test_data[i].ncf)};
0252
0253         test_f(!utf8_strncasecmp(table, &s1, &s2),
0254                "%s %s comparison mismatch\n", s1.name, s2.name);
0255     }
0256 }
0257
0258 static void check_supported_versions(struct unicode_map *um)
0259 {
0260     /* Unicode 7.0.0 should be supported. */
0261     test(utf8version_is_supported(um, UNICODE_AGE(7, 0, 0)));
0262
0263     /* Unicode 9.0.0 should be supported. */
0264     test(utf8version_is_supported(um, UNICODE_AGE(9, 0, 0)));
0265
0266     /* Unicode 1x.0.0 (the latest version) should be supported. */
0267     test(utf8version_is_supported(um, UTF8_LATEST));
0268
0269     /* Next versions don't exist. */
0270     test(!utf8version_is_supported(um, UNICODE_AGE(13, 0, 0)));
0271     test(!utf8version_is_supported(um, UNICODE_AGE(0, 0, 0)));
0272     test(!utf8version_is_supported(um, UNICODE_AGE(-1, -1, -1)));
0273 }
0274
0275 static int __init init_test_ucd(void)
0276 {
0277     struct unicode_map *um;
0278
0279     failed_tests = 0;
0280     total_tests = 0;
0281
0282     um = utf8_load(UTF8_LATEST);
0283     if (IS_ERR(um)) {
0284         pr_err("%s: Unable to load utf8 table.\n", __func__);
0285         return PTR_ERR(um);
0286     }
0287
0288     check_supported_versions(um);
0289     check_utf8_nfdi(um);
0290     check_utf8_nfdicf(um);
0291     check_utf8_comparisons(um);
0292
0293     if (!failed_tests)
0294         pr_info("All %u tests passed\n", total_tests);
0295     else
0296         pr_err("%u out of %u tests failed\n", failed_tests,
0297                total_tests);
0298     utf8_unload(um);
0299     return 0;
0300 }
0301
0302 static void __exit exit_test_ucd(void)
0303 {
0304 }
0305
0306 module_init(init_test_ucd);
0307 module_exit(exit_test_ucd);
0308
0309 MODULE_AUTHOR("Gabriel Krisman Bertazi <krisman@collabora.co.uk>");
0310 MODULE_LICENSE("GPL");