fs/unicode/utf8-norm.c

0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  * Copyright (c) 2014 SGI.
0004  * All rights reserved.
0005  */
0006
0007 #include "utf8n.h"
0008
0009 int utf8version_is_supported(const struct unicode_map *um, unsigned int version)
0010 {
0011     int i = um->tables->utf8agetab_size - 1;
0012
0013     while (i >= 0 && um->tables->utf8agetab[i] != 0) {
0014         if (version == um->tables->utf8agetab[i])
0015             return 1;
0016         i--;
0017     }
0018     return 0;
0019 }
0020
0021 /*
0022  * UTF-8 valid ranges.
0023  *
0024  * The UTF-8 encoding spreads the bits of a 32bit word over several
0025  * bytes. This table gives the ranges that can be held and how they'd
0026  * be represented.
0027  *
0028  * 0x00000000 0x0000007F: 0xxxxxxx
0029  * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
0030  * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
0031  * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
0032  * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
0033  * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
0034  *
0035  * There is an additional requirement on UTF-8, in that only the
0036  * shortest representation of a 32bit value is to be used.  A decoder
0037  * must not decode sequences that do not satisfy this requirement.
0038  * Thus the allowed ranges have a lower bound.
0039  *
0040  * 0x00000000 0x0000007F: 0xxxxxxx
0041  * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
0042  * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
0043  * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
0044  * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
0045  * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
0046  *
0047  * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
0048  * 17 planes of 65536 values.  This limits the sequences actually seen
0049  * even more, to just the following.
0050  *
0051  *          0 -     0x7F: 0                   - 0x7F
0052  *       0x80 -    0x7FF: 0xC2 0x80           - 0xDF 0xBF
0053  *      0x800 -   0xFFFF: 0xE0 0xA0 0x80      - 0xEF 0xBF 0xBF
0054  *    0x10000 - 0x10FFFF: 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF
0055  *
0056  * Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed.
0057  *
0058  * Note that the longest sequence seen with valid usage is 4 bytes,
0059  * the same a single UTF-32 character.  This makes the UTF-8
0060  * representation of Unicode strictly smaller than UTF-32.
0061  *
0062  * The shortest sequence requirement was introduced by:
0063  *    Corrigendum #1: UTF-8 Shortest Form
0064  * It can be found here:
0065  *    http://www.unicode.org/versions/corrigendum1.html
0066  *
0067  */
0068
0069 /*
0070  * Return the number of bytes used by the current UTF-8 sequence.
0071  * Assumes the input points to the first byte of a valid UTF-8
0072  * sequence.
0073  */
0074 static inline int utf8clen(const char *s)
0075 {
0076     unsigned char c = *s;
0077
0078     return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
0079 }
0080
0081 /*
0082  * Decode a 3-byte UTF-8 sequence.
0083  */
0084 static unsigned int
0085 utf8decode3(const char *str)
0086 {
0087     unsigned int        uc;
0088
0089     uc = *str++ & 0x0F;
0090     uc <<= 6;
0091     uc |= *str++ & 0x3F;
0092     uc <<= 6;
0093     uc |= *str++ & 0x3F;
0094
0095     return uc;
0096 }
0097
0098 /*
0099  * Encode a 3-byte UTF-8 sequence.
0100  */
0101 static int
0102 utf8encode3(char *str, unsigned int val)
0103 {
0104     str[2] = (val & 0x3F) | 0x80;
0105     val >>= 6;
0106     str[1] = (val & 0x3F) | 0x80;
0107     val >>= 6;
0108     str[0] = val | 0xE0;
0109
0110     return 3;
0111 }
0112
0113 /*
0114  * utf8trie_t
0115  *
0116  * A compact binary tree, used to decode UTF-8 characters.
0117  *
0118  * Internal nodes are one byte for the node itself, and up to three
0119  * bytes for an offset into the tree.  The first byte contains the
0120  * following information:
0121  *  NEXTBYTE  - flag        - advance to next byte if set
0122  *  BITNUM    - 3 bit field - the bit number to tested
0123  *  OFFLEN    - 2 bit field - number of bytes in the offset
0124  * if offlen == 0 (non-branching node)
0125  *  RIGHTPATH - 1 bit field - set if the following node is for the
0126  *                            right-hand path (tested bit is set)
0127  *  TRIENODE  - 1 bit field - set if the following node is an internal
0128  *                            node, otherwise it is a leaf node
0129  * if offlen != 0 (branching node)
0130  *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
0131  *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
0132  *
0133  * Due to the way utf8 works, there cannot be branching nodes with
0134  * NEXTBYTE set, and moreover those nodes always have a righthand
0135  * descendant.
0136  */
0137 typedef const unsigned char utf8trie_t;
0138 #define BITNUM      0x07
0139 #define NEXTBYTE    0x08
0140 #define OFFLEN      0x30
0141 #define OFFLEN_SHIFT    4
0142 #define RIGHTPATH   0x40
0143 #define TRIENODE    0x80
0144 #define RIGHTNODE   0x40
0145 #define LEFTNODE    0x80
0146
0147 /*
0148  * utf8leaf_t
0149  *
0150  * The leaves of the trie are embedded in the trie, and so the same
0151  * underlying datatype: unsigned char.
0152  *
0153  * leaf[0]: The unicode version, stored as a generation number that is
0154  *          an index into ->utf8agetab[].  With this we can filter code
0155  *          points based on the unicode version in which they were
0156  *          defined.  The CCC of a non-defined code point is 0.
0157  * leaf[1]: Canonical Combining Class. During normalization, we need
0158  *          to do a stable sort into ascending order of all characters
0159  *          with a non-zero CCC that occur between two characters with
0160  *          a CCC of 0, or at the begin or end of a string.
0161  *          The unicode standard guarantees that all CCC values are
0162  *          between 0 and 254 inclusive, which leaves 255 available as
0163  *          a special value.
0164  *          Code points with CCC 0 are known as stoppers.
0165  * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
0166  *          start of a NUL-terminated string that is the decomposition
0167  *          of the character.
0168  *          The CCC of a decomposable character is the same as the CCC
0169  *          of the first character of its decomposition.
0170  *          Some characters decompose as the empty string: these are
0171  *          characters with the Default_Ignorable_Code_Point property.
0172  *          These do affect normalization, as they all have CCC 0.
0173  *
0174  * The decompositions in the trie have been fully expanded, with the
0175  * exception of Hangul syllables, which are decomposed algorithmically.
0176  *
0177  * Casefolding, if applicable, is also done using decompositions.
0178  *
0179  * The trie is constructed in such a way that leaves exist for all
0180  * UTF-8 sequences that match the criteria from the "UTF-8 valid
0181  * ranges" comment above, and only for those sequences.  Therefore a
0182  * lookup in the trie can be used to validate the UTF-8 input.
0183  */
0184 typedef const unsigned char utf8leaf_t;
0185
0186 #define LEAF_GEN(LEAF)  ((LEAF)[0])
0187 #define LEAF_CCC(LEAF)  ((LEAF)[1])
0188 #define LEAF_STR(LEAF)  ((const char *)((LEAF) + 2))
0189
0190 #define MINCCC      (0)
0191 #define MAXCCC      (254)
0192 #define STOPPER     (0)
0193 #define DECOMPOSE   (255)
0194
0195 /* Marker for hangul syllable decomposition. */
0196 #define HANGUL      ((char)(255))
0197 /* Size of the synthesized leaf used for Hangul syllable decomposition. */
0198 #define UTF8HANGULLEAF  (12)
0199
0200 /*
0201  * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
0202  *
0203  * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
0204  * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
0205  *
0206  * SBase = 0xAC00
0207  * LBase = 0x1100
0208  * VBase = 0x1161
0209  * TBase = 0x11A7
0210  * LCount = 19
0211  * VCount = 21
0212  * TCount = 28
0213  * NCount = 588 (VCount * TCount)
0214  * SCount = 11172 (LCount * NCount)
0215  *
0216  * Decomposition:
0217  *   SIndex = s - SBase
0218  *
0219  * LV (Canonical/Full)
0220  *   LIndex = SIndex / NCount
0221  *   VIndex = (Sindex % NCount) / TCount
0222  *   LPart = LBase + LIndex
0223  *   VPart = VBase + VIndex
0224  *
0225  * LVT (Canonical)
0226  *   LVIndex = (SIndex / TCount) * TCount
0227  *   TIndex = (Sindex % TCount)
0228  *   LVPart = SBase + LVIndex
0229  *   TPart = TBase + TIndex
0230  *
0231  * LVT (Full)
0232  *   LIndex = SIndex / NCount
0233  *   VIndex = (Sindex % NCount) / TCount
0234  *   TIndex = (Sindex % TCount)
0235  *   LPart = LBase + LIndex
0236  *   VPart = VBase + VIndex
0237  *   if (TIndex == 0) {
0238  *          d = <LPart, VPart>
0239  *   } else {
0240  *          TPart = TBase + TIndex
0241  *          d = <LPart, TPart, VPart>
0242  *   }
0243  */
0244
0245 /* Constants */
0246 #define SB  (0xAC00)
0247 #define LB  (0x1100)
0248 #define VB  (0x1161)
0249 #define TB  (0x11A7)
0250 #define LC  (19)
0251 #define VC  (21)
0252 #define TC  (28)
0253 #define NC  (VC * TC)
0254 #define SC  (LC * NC)
0255
0256 /* Algorithmic decomposition of hangul syllable. */
0257 static utf8leaf_t *
0258 utf8hangul(const char *str, unsigned char *hangul)
0259 {
0260     unsigned int    si;
0261     unsigned int    li;
0262     unsigned int    vi;
0263     unsigned int    ti;
0264     unsigned char   *h;
0265
0266     /* Calculate the SI, LI, VI, and TI values. */
0267     si = utf8decode3(str) - SB;
0268     li = si / NC;
0269     vi = (si % NC) / TC;
0270     ti = si % TC;
0271
0272     /* Fill in base of leaf. */
0273     h = hangul;
0274     LEAF_GEN(h) = 2;
0275     LEAF_CCC(h) = DECOMPOSE;
0276     h += 2;
0277
0278     /* Add LPart, a 3-byte UTF-8 sequence. */
0279     h += utf8encode3((char *)h, li + LB);
0280
0281     /* Add VPart, a 3-byte UTF-8 sequence. */
0282     h += utf8encode3((char *)h, vi + VB);
0283
0284     /* Add TPart if required, also a 3-byte UTF-8 sequence. */
0285     if (ti)
0286         h += utf8encode3((char *)h, ti + TB);
0287
0288     /* Terminate string. */
0289     h[0] = '\0';
0290
0291     return hangul;
0292 }
0293
0294 /*
0295  * Use trie to scan s, touching at most len bytes.
0296  * Returns the leaf if one exists, NULL otherwise.
0297  *
0298  * A non-NULL return guarantees that the UTF-8 sequence starting at s
0299  * is well-formed and corresponds to a known unicode code point.  The
0300  * shorthand for this will be "is valid UTF-8 unicode".
0301  */
0302 static utf8leaf_t *utf8nlookup(const struct unicode_map *um,
0303         enum utf8_normalization n, unsigned char *hangul, const char *s,
0304         size_t len)
0305 {
0306     utf8trie_t  *trie = um->tables->utf8data + um->ntab[n]->offset;
0307     int     offlen;
0308     int     offset;
0309     int     mask;
0310     int     node;
0311
0312     if (len == 0)
0313         return NULL;
0314
0315     node = 1;
0316     while (node) {
0317         offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
0318         if (*trie & NEXTBYTE) {
0319             if (--len == 0)
0320                 return NULL;
0321             s++;
0322         }
0323         mask = 1 << (*trie & BITNUM);
0324         if (*s & mask) {
0325             /* Right leg */
0326             if (offlen) {
0327                 /* Right node at offset of trie */
0328                 node = (*trie & RIGHTNODE);
0329                 offset = trie[offlen];
0330                 while (--offlen) {
0331                     offset <<= 8;
0332                     offset |= trie[offlen];
0333                 }
0334                 trie += offset;
0335             } else if (*trie & RIGHTPATH) {
0336                 /* Right node after this node */
0337                 node = (*trie & TRIENODE);
0338                 trie++;
0339             } else {
0340                 /* No right node. */
0341                 return NULL;
0342             }
0343         } else {
0344             /* Left leg */
0345             if (offlen) {
0346                 /* Left node after this node. */
0347                 node = (*trie & LEFTNODE);
0348                 trie += offlen + 1;
0349             } else if (*trie & RIGHTPATH) {
0350                 /* No left node. */
0351                 return NULL;
0352             } else {
0353                 /* Left node after this node */
0354                 node = (*trie & TRIENODE);
0355                 trie++;
0356             }
0357         }
0358     }
0359     /*
0360      * Hangul decomposition is done algorithmically. These are the
0361      * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
0362      * always 3 bytes long, so s has been advanced twice, and the
0363      * start of the sequence is at s-2.
0364      */
0365     if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
0366         trie = utf8hangul(s - 2, hangul);
0367     return trie;
0368 }
0369
0370 /*
0371  * Use trie to scan s.
0372  * Returns the leaf if one exists, NULL otherwise.
0373  *
0374  * Forwards to utf8nlookup().
0375  */
0376 static utf8leaf_t *utf8lookup(const struct unicode_map *um,
0377         enum utf8_normalization n, unsigned char *hangul, const char *s)
0378 {
0379     return utf8nlookup(um, n, hangul, s, (size_t)-1);
0380 }
0381
0382 /*
0383  * Length of the normalization of s, touch at most len bytes.
0384  * Return -1 if s is not valid UTF-8 unicode.
0385  */
0386 ssize_t utf8nlen(const struct unicode_map *um, enum utf8_normalization n,
0387         const char *s, size_t len)
0388 {
0389     utf8leaf_t  *leaf;
0390     size_t      ret = 0;
0391     unsigned char   hangul[UTF8HANGULLEAF];
0392
0393     while (len && *s) {
0394         leaf = utf8nlookup(um, n, hangul, s, len);
0395         if (!leaf)
0396             return -1;
0397         if (um->tables->utf8agetab[LEAF_GEN(leaf)] >
0398             um->ntab[n]->maxage)
0399             ret += utf8clen(s);
0400         else if (LEAF_CCC(leaf) == DECOMPOSE)
0401             ret += strlen(LEAF_STR(leaf));
0402         else
0403             ret += utf8clen(s);
0404         len -= utf8clen(s);
0405         s += utf8clen(s);
0406     }
0407     return ret;
0408 }
0409
0410 /*
0411  * Set up an utf8cursor for use by utf8byte().
0412  *
0413  *   u8c    : pointer to cursor.
0414  *   data   : const struct utf8data to use for normalization.
0415  *   s      : string.
0416  *   len    : length of s.
0417  *
0418  * Returns -1 on error, 0 on success.
0419  */
0420 int utf8ncursor(struct utf8cursor *u8c, const struct unicode_map *um,
0421         enum utf8_normalization n, const char *s, size_t len)
0422 {
0423     if (!s)
0424         return -1;
0425     u8c->um = um;
0426     u8c->n = n;
0427     u8c->s = s;
0428     u8c->p = NULL;
0429     u8c->ss = NULL;
0430     u8c->sp = NULL;
0431     u8c->len = len;
0432     u8c->slen = 0;
0433     u8c->ccc = STOPPER;
0434     u8c->nccc = STOPPER;
0435     /* Check we didn't clobber the maximum length. */
0436     if (u8c->len != len)
0437         return -1;
0438     /* The first byte of s may not be an utf8 continuation. */
0439     if (len > 0 && (*s & 0xC0) == 0x80)
0440         return -1;
0441     return 0;
0442 }
0443
0444 /*
0445  * Get one byte from the normalized form of the string described by u8c.
0446  *
0447  * Returns the byte cast to an unsigned char on succes, and -1 on failure.
0448  *
0449  * The cursor keeps track of the location in the string in u8c->s.
0450  * When a character is decomposed, the current location is stored in
0451  * u8c->p, and u8c->s is set to the start of the decomposition. Note
0452  * that bytes from a decomposition do not count against u8c->len.
0453  *
0454  * Characters are emitted if they match the current CCC in u8c->ccc.
0455  * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
0456  * and the function returns 0 in that case.
0457  *
0458  * Sorting by CCC is done by repeatedly scanning the string.  The
0459  * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
0460  * the start of the scan.  The first pass finds the lowest CCC to be
0461  * emitted and stores it in u8c->nccc, the second pass emits the
0462  * characters with this CCC and finds the next lowest CCC. This limits
0463  * the number of passes to 1 + the number of different CCCs in the
0464  * sequence being scanned.
0465  *
0466  * Therefore:
0467  *  u8c->p  != NULL -> a decomposition is being scanned.
0468  *  u8c->ss != NULL -> this is a repeating scan.
0469  *  u8c->ccc == -1   -> this is the first scan of a repeating scan.
0470  */
0471 int utf8byte(struct utf8cursor *u8c)
0472 {
0473     utf8leaf_t *leaf;
0474     int ccc;
0475
0476     for (;;) {
0477         /* Check for the end of a decomposed character. */
0478         if (u8c->p && *u8c->s == '\0') {
0479             u8c->s = u8c->p;
0480             u8c->p = NULL;
0481         }
0482
0483         /* Check for end-of-string. */
0484         if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
0485             /* There is no next byte. */
0486             if (u8c->ccc == STOPPER)
0487                 return 0;
0488             /* End-of-string during a scan counts as a stopper. */
0489             ccc = STOPPER;
0490             goto ccc_mismatch;
0491         } else if ((*u8c->s & 0xC0) == 0x80) {
0492             /* This is a continuation of the current character. */
0493             if (!u8c->p)
0494                 u8c->len--;
0495             return (unsigned char)*u8c->s++;
0496         }
0497
0498         /* Look up the data for the current character. */
0499         if (u8c->p) {
0500             leaf = utf8lookup(u8c->um, u8c->n, u8c->hangul, u8c->s);
0501         } else {
0502             leaf = utf8nlookup(u8c->um, u8c->n, u8c->hangul,
0503                        u8c->s, u8c->len);
0504         }
0505
0506         /* No leaf found implies that the input is a binary blob. */
0507         if (!leaf)
0508             return -1;
0509
0510         ccc = LEAF_CCC(leaf);
0511         /* Characters that are too new have CCC 0. */
0512         if (u8c->um->tables->utf8agetab[LEAF_GEN(leaf)] >
0513             u8c->um->ntab[u8c->n]->maxage) {
0514             ccc = STOPPER;
0515         } else if (ccc == DECOMPOSE) {
0516             u8c->len -= utf8clen(u8c->s);
0517             u8c->p = u8c->s + utf8clen(u8c->s);
0518             u8c->s = LEAF_STR(leaf);
0519             /* Empty decomposition implies CCC 0. */
0520             if (*u8c->s == '\0') {
0521                 if (u8c->ccc == STOPPER)
0522                     continue;
0523                 ccc = STOPPER;
0524                 goto ccc_mismatch;
0525             }
0526
0527             leaf = utf8lookup(u8c->um, u8c->n, u8c->hangul, u8c->s);
0528             if (!leaf)
0529                 return -1;
0530             ccc = LEAF_CCC(leaf);
0531         }
0532
0533         /*
0534          * If this is not a stopper, then see if it updates
0535          * the next canonical class to be emitted.
0536          */
0537         if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
0538             u8c->nccc = ccc;
0539
0540         /*
0541          * Return the current byte if this is the current
0542          * combining class.
0543          */
0544         if (ccc == u8c->ccc) {
0545             if (!u8c->p)
0546                 u8c->len--;
0547             return (unsigned char)*u8c->s++;
0548         }
0549
0550         /* Current combining class mismatch. */
0551 ccc_mismatch:
0552         if (u8c->nccc == STOPPER) {
0553             /*
0554              * Scan forward for the first canonical class
0555              * to be emitted.  Save the position from
0556              * which to restart.
0557              */
0558             u8c->ccc = MINCCC - 1;
0559             u8c->nccc = ccc;
0560             u8c->sp = u8c->p;
0561             u8c->ss = u8c->s;
0562             u8c->slen = u8c->len;
0563             if (!u8c->p)
0564                 u8c->len -= utf8clen(u8c->s);
0565             u8c->s += utf8clen(u8c->s);
0566         } else if (ccc != STOPPER) {
0567             /* Not a stopper, and not the ccc we're emitting. */
0568             if (!u8c->p)
0569                 u8c->len -= utf8clen(u8c->s);
0570             u8c->s += utf8clen(u8c->s);
0571         } else if (u8c->nccc != MAXCCC + 1) {
0572             /* At a stopper, restart for next ccc. */
0573             u8c->ccc = u8c->nccc;
0574             u8c->nccc = MAXCCC + 1;
0575             u8c->s = u8c->ss;
0576             u8c->p = u8c->sp;
0577             u8c->len = u8c->slen;
0578         } else {
0579             /* All done, proceed from here. */
0580             u8c->ccc = STOPPER;
0581             u8c->nccc = STOPPER;
0582             u8c->sp = NULL;
0583             u8c->ss = NULL;
0584             u8c->slen = 0;
0585         }
0586     }
0587 }
0588
0589 #ifdef CONFIG_UNICODE_NORMALIZATION_SELFTEST_MODULE
0590 EXPORT_SYMBOL_GPL(utf8version_is_supported);
0591 EXPORT_SYMBOL_GPL(utf8nlen);
0592 EXPORT_SYMBOL_GPL(utf8ncursor);
0593 EXPORT_SYMBOL_GPL(utf8byte);
0594 #endif