fs/unicode/mkutf8data.c

0001 /*
0002  * Copyright (c) 2014 SGI.
0003  * All rights reserved.
0004  *
0005  * This program is free software; you can redistribute it and/or
0006  * modify it under the terms of the GNU General Public License as
0007  * published by the Free Software Foundation.
0008  *
0009  * This program is distributed in the hope that it would be useful,
0010  * but WITHOUT ANY WARRANTY; without even the implied warranty of
0011  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
0012  * GNU General Public License for more details.
0013  *
0014  * You should have received a copy of the GNU General Public License
0015  * along with this program; if not, write the Free Software Foundation,
0016  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
0017  */
0018
0019 /* Generator for a compact trie for unicode normalization */
0020
0021 #include <sys/types.h>
0022 #include <stddef.h>
0023 #include <stdlib.h>
0024 #include <stdio.h>
0025 #include <assert.h>
0026 #include <string.h>
0027 #include <unistd.h>
0028 #include <errno.h>
0029
0030 /* Default names of the in- and output files. */
0031
0032 #define AGE_NAME    "DerivedAge.txt"
0033 #define CCC_NAME    "DerivedCombiningClass.txt"
0034 #define PROP_NAME   "DerivedCoreProperties.txt"
0035 #define DATA_NAME   "UnicodeData.txt"
0036 #define FOLD_NAME   "CaseFolding.txt"
0037 #define NORM_NAME   "NormalizationCorrections.txt"
0038 #define TEST_NAME   "NormalizationTest.txt"
0039 #define UTF8_NAME   "utf8data.h"
0040
0041 const char  *age_name  = AGE_NAME;
0042 const char  *ccc_name  = CCC_NAME;
0043 const char  *prop_name = PROP_NAME;
0044 const char  *data_name = DATA_NAME;
0045 const char  *fold_name = FOLD_NAME;
0046 const char  *norm_name = NORM_NAME;
0047 const char  *test_name = TEST_NAME;
0048 const char  *utf8_name = UTF8_NAME;
0049
0050 int verbose = 0;
0051
0052 /* An arbitrary line size limit on input lines. */
0053
0054 #define LINESIZE    1024
0055 char line[LINESIZE];
0056 char buf0[LINESIZE];
0057 char buf1[LINESIZE];
0058 char buf2[LINESIZE];
0059 char buf3[LINESIZE];
0060
0061 const char *argv0;
0062
0063 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
0064
0065 /* ------------------------------------------------------------------ */
0066
0067 /*
0068  * Unicode version numbers consist of three parts: major, minor, and a
0069  * revision.  These numbers are packed into an unsigned int to obtain
0070  * a single version number.
0071  *
0072  * To save space in the generated trie, the unicode version is not
0073  * stored directly, instead we calculate a generation number from the
0074  * unicode versions seen in the DerivedAge file, and use that as an
0075  * index into a table of unicode versions.
0076  */
0077 #define UNICODE_MAJ_SHIFT       (16)
0078 #define UNICODE_MIN_SHIFT       (8)
0079
0080 #define UNICODE_MAJ_MAX         ((unsigned short)-1)
0081 #define UNICODE_MIN_MAX         ((unsigned char)-1)
0082 #define UNICODE_REV_MAX         ((unsigned char)-1)
0083
0084 #define UNICODE_AGE(MAJ,MIN,REV)            \
0085     (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) |   \
0086      ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) |   \
0087      ((unsigned int)(REV)))
0088
0089 unsigned int *ages;
0090 int ages_count;
0091
0092 unsigned int unicode_maxage;
0093
0094 static int age_valid(unsigned int major, unsigned int minor,
0095              unsigned int revision)
0096 {
0097     if (major > UNICODE_MAJ_MAX)
0098         return 0;
0099     if (minor > UNICODE_MIN_MAX)
0100         return 0;
0101     if (revision > UNICODE_REV_MAX)
0102         return 0;
0103     return 1;
0104 }
0105
0106 /* ------------------------------------------------------------------ */
0107
0108 /*
0109  * utf8trie_t
0110  *
0111  * A compact binary tree, used to decode UTF-8 characters.
0112  *
0113  * Internal nodes are one byte for the node itself, and up to three
0114  * bytes for an offset into the tree.  The first byte contains the
0115  * following information:
0116  *  NEXTBYTE  - flag        - advance to next byte if set
0117  *  BITNUM    - 3 bit field - the bit number to tested
0118  *  OFFLEN    - 2 bit field - number of bytes in the offset
0119  * if offlen == 0 (non-branching node)
0120  *  RIGHTPATH - 1 bit field - set if the following node is for the
0121  *                            right-hand path (tested bit is set)
0122  *  TRIENODE  - 1 bit field - set if the following node is an internal
0123  *                            node, otherwise it is a leaf node
0124  * if offlen != 0 (branching node)
0125  *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
0126  *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
0127  *
0128  * Due to the way utf8 works, there cannot be branching nodes with
0129  * NEXTBYTE set, and moreover those nodes always have a righthand
0130  * descendant.
0131  */
0132 typedef unsigned char utf8trie_t;
0133 #define BITNUM      0x07
0134 #define NEXTBYTE    0x08
0135 #define OFFLEN      0x30
0136 #define OFFLEN_SHIFT    4
0137 #define RIGHTPATH   0x40
0138 #define TRIENODE    0x80
0139 #define RIGHTNODE   0x40
0140 #define LEFTNODE    0x80
0141
0142 /*
0143  * utf8leaf_t
0144  *
0145  * The leaves of the trie are embedded in the trie, and so the same
0146  * underlying datatype, unsigned char.
0147  *
0148  * leaf[0]: The unicode version, stored as a generation number that is
0149  *          an index into utf8agetab[].  With this we can filter code
0150  *          points based on the unicode version in which they were
0151  *          defined.  The CCC of a non-defined code point is 0.
0152  * leaf[1]: Canonical Combining Class. During normalization, we need
0153  *          to do a stable sort into ascending order of all characters
0154  *          with a non-zero CCC that occur between two characters with
0155  *          a CCC of 0, or at the begin or end of a string.
0156  *          The unicode standard guarantees that all CCC values are
0157  *          between 0 and 254 inclusive, which leaves 255 available as
0158  *          a special value.
0159  *          Code points with CCC 0 are known as stoppers.
0160  * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
0161  *          start of a NUL-terminated string that is the decomposition
0162  *          of the character.
0163  *          The CCC of a decomposable character is the same as the CCC
0164  *          of the first character of its decomposition.
0165  *          Some characters decompose as the empty string: these are
0166  *          characters with the Default_Ignorable_Code_Point property.
0167  *          These do affect normalization, as they all have CCC 0.
0168  *
0169  * The decompositions in the trie have been fully expanded.
0170  *
0171  * Casefolding, if applicable, is also done using decompositions.
0172  */
0173 typedef unsigned char utf8leaf_t;
0174
0175 #define LEAF_GEN(LEAF)  ((LEAF)[0])
0176 #define LEAF_CCC(LEAF)  ((LEAF)[1])
0177 #define LEAF_STR(LEAF)  ((const char*)((LEAF) + 2))
0178
0179 #define MAXGEN      (255)
0180
0181 #define MINCCC      (0)
0182 #define MAXCCC      (254)
0183 #define STOPPER     (0)
0184 #define DECOMPOSE   (255)
0185 #define HANGUL      ((char)(255))
0186
0187 #define UTF8HANGULLEAF  (12)
0188
0189 struct tree;
0190 static utf8leaf_t *utf8nlookup(struct tree *, unsigned char *,
0191                    const char *, size_t);
0192 static utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *);
0193
0194 unsigned char *utf8data;
0195 size_t utf8data_size;
0196
0197 utf8trie_t *nfdi;
0198 utf8trie_t *nfdicf;
0199
0200 /* ------------------------------------------------------------------ */
0201
0202 /*
0203  * UTF8 valid ranges.
0204  *
0205  * The UTF-8 encoding spreads the bits of a 32bit word over several
0206  * bytes. This table gives the ranges that can be held and how they'd
0207  * be represented.
0208  *
0209  * 0x00000000 0x0000007F: 0xxxxxxx
0210  * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
0211  * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
0212  * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
0213  * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
0214  * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
0215  *
0216  * There is an additional requirement on UTF-8, in that only the
0217  * shortest representation of a 32bit value is to be used.  A decoder
0218  * must not decode sequences that do not satisfy this requirement.
0219  * Thus the allowed ranges have a lower bound.
0220  *
0221  * 0x00000000 0x0000007F: 0xxxxxxx
0222  * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
0223  * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
0224  * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
0225  * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
0226  * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
0227  *
0228  * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
0229  * 17 planes of 65536 values.  This limits the sequences actually seen
0230  * even more, to just the following.
0231  *
0232  *          0 -     0x7f: 0                     0x7f
0233  *       0x80 -    0x7ff: 0xc2 0x80             0xdf 0xbf
0234  *      0x800 -   0xffff: 0xe0 0xa0 0x80        0xef 0xbf 0xbf
0235  *    0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80   0xf4 0x8f 0xbf 0xbf
0236  *
0237  * Even within those ranges not all values are allowed: the surrogates
0238  * 0xd800 - 0xdfff should never be seen.
0239  *
0240  * Note that the longest sequence seen with valid usage is 4 bytes,
0241  * the same a single UTF-32 character.  This makes the UTF-8
0242  * representation of Unicode strictly smaller than UTF-32.
0243  *
0244  * The shortest sequence requirement was introduced by:
0245  *    Corrigendum #1: UTF-8 Shortest Form
0246  * It can be found here:
0247  *    http://www.unicode.org/versions/corrigendum1.html
0248  *
0249  */
0250
0251 #define UTF8_2_BITS     0xC0
0252 #define UTF8_3_BITS     0xE0
0253 #define UTF8_4_BITS     0xF0
0254 #define UTF8_N_BITS     0x80
0255 #define UTF8_2_MASK     0xE0
0256 #define UTF8_3_MASK     0xF0
0257 #define UTF8_4_MASK     0xF8
0258 #define UTF8_N_MASK     0xC0
0259 #define UTF8_V_MASK     0x3F
0260 #define UTF8_V_SHIFT    6
0261
0262 static int utf8encode(char *str, unsigned int val)
0263 {
0264     int len;
0265
0266     if (val < 0x80) {
0267         str[0] = val;
0268         len = 1;
0269     } else if (val < 0x800) {
0270         str[1] = val & UTF8_V_MASK;
0271         str[1] |= UTF8_N_BITS;
0272         val >>= UTF8_V_SHIFT;
0273         str[0] = val;
0274         str[0] |= UTF8_2_BITS;
0275         len = 2;
0276     } else if (val < 0x10000) {
0277         str[2] = val & UTF8_V_MASK;
0278         str[2] |= UTF8_N_BITS;
0279         val >>= UTF8_V_SHIFT;
0280         str[1] = val & UTF8_V_MASK;
0281         str[1] |= UTF8_N_BITS;
0282         val >>= UTF8_V_SHIFT;
0283         str[0] = val;
0284         str[0] |= UTF8_3_BITS;
0285         len = 3;
0286     } else if (val < 0x110000) {
0287         str[3] = val & UTF8_V_MASK;
0288         str[3] |= UTF8_N_BITS;
0289         val >>= UTF8_V_SHIFT;
0290         str[2] = val & UTF8_V_MASK;
0291         str[2] |= UTF8_N_BITS;
0292         val >>= UTF8_V_SHIFT;
0293         str[1] = val & UTF8_V_MASK;
0294         str[1] |= UTF8_N_BITS;
0295         val >>= UTF8_V_SHIFT;
0296         str[0] = val;
0297         str[0] |= UTF8_4_BITS;
0298         len = 4;
0299     } else {
0300         printf("%#x: illegal val\n", val);
0301         len = 0;
0302     }
0303     return len;
0304 }
0305
0306 static unsigned int utf8decode(const char *str)
0307 {
0308     const unsigned char *s = (const unsigned char*)str;
0309     unsigned int unichar = 0;
0310
0311     if (*s < 0x80) {
0312         unichar = *s;
0313     } else if (*s < UTF8_3_BITS) {
0314         unichar = *s++ & 0x1F;
0315         unichar <<= UTF8_V_SHIFT;
0316         unichar |= *s & 0x3F;
0317     } else if (*s < UTF8_4_BITS) {
0318         unichar = *s++ & 0x0F;
0319         unichar <<= UTF8_V_SHIFT;
0320         unichar |= *s++ & 0x3F;
0321         unichar <<= UTF8_V_SHIFT;
0322         unichar |= *s & 0x3F;
0323     } else {
0324         unichar = *s++ & 0x0F;
0325         unichar <<= UTF8_V_SHIFT;
0326         unichar |= *s++ & 0x3F;
0327         unichar <<= UTF8_V_SHIFT;
0328         unichar |= *s++ & 0x3F;
0329         unichar <<= UTF8_V_SHIFT;
0330         unichar |= *s & 0x3F;
0331     }
0332     return unichar;
0333 }
0334
0335 static int utf32valid(unsigned int unichar)
0336 {
0337     return unichar < 0x110000;
0338 }
0339
0340 #define HANGUL_SYLLABLE(U)  ((U) >= 0xAC00 && (U) <= 0xD7A3)
0341
0342 #define NODE 1
0343 #define LEAF 0
0344
0345 struct tree {
0346     void *root;
0347     int childnode;
0348     const char *type;
0349     unsigned int maxage;
0350     struct tree *next;
0351     int (*leaf_equal)(void *, void *);
0352     void (*leaf_print)(void *, int);
0353     int (*leaf_mark)(void *);
0354     int (*leaf_size)(void *);
0355     int *(*leaf_index)(struct tree *, void *);
0356     unsigned char *(*leaf_emit)(void *, unsigned char *);
0357     int leafindex[0x110000];
0358     int index;
0359 };
0360
0361 struct node {
0362     int index;
0363     int offset;
0364     int mark;
0365     int size;
0366     struct node *parent;
0367     void *left;
0368     void *right;
0369     unsigned char bitnum;
0370     unsigned char nextbyte;
0371     unsigned char leftnode;
0372     unsigned char rightnode;
0373     unsigned int keybits;
0374     unsigned int keymask;
0375 };
0376
0377 /*
0378  * Example lookup function for a tree.
0379  */
0380 static void *lookup(struct tree *tree, const char *key)
0381 {
0382     struct node *node;
0383     void *leaf = NULL;
0384
0385     node = tree->root;
0386     while (!leaf && node) {
0387         if (node->nextbyte)
0388             key++;
0389         if (*key & (1 << (node->bitnum & 7))) {
0390             /* Right leg */
0391             if (node->rightnode == NODE) {
0392                 node = node->right;
0393             } else if (node->rightnode == LEAF) {
0394                 leaf = node->right;
0395             } else {
0396                 node = NULL;
0397             }
0398         } else {
0399             /* Left leg */
0400             if (node->leftnode == NODE) {
0401                 node = node->left;
0402             } else if (node->leftnode == LEAF) {
0403                 leaf = node->left;
0404             } else {
0405                 node = NULL;
0406             }
0407         }
0408     }
0409
0410     return leaf;
0411 }
0412
0413 /*
0414  * A simple non-recursive tree walker: keep track of visits to the
0415  * left and right branches in the leftmask and rightmask.
0416  */
0417 static void tree_walk(struct tree *tree)
0418 {
0419     struct node *node;
0420     unsigned int leftmask;
0421     unsigned int rightmask;
0422     unsigned int bitmask;
0423     int indent = 1;
0424     int nodes, singletons, leaves;
0425
0426     nodes = singletons = leaves = 0;
0427
0428     printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root);
0429     if (tree->childnode == LEAF) {
0430         assert(tree->root);
0431         tree->leaf_print(tree->root, indent);
0432         leaves = 1;
0433     } else {
0434         assert(tree->childnode == NODE);
0435         node = tree->root;
0436         leftmask = rightmask = 0;
0437         while (node) {
0438             printf("%*snode @ %p bitnum %d nextbyte %d"
0439                    " left %p right %p mask %x bits %x\n",
0440                 indent, "", node,
0441                 node->bitnum, node->nextbyte,
0442                 node->left, node->right,
0443                 node->keymask, node->keybits);
0444             nodes += 1;
0445             if (!(node->left && node->right))
0446                 singletons += 1;
0447
0448             while (node) {
0449                 bitmask = 1 << node->bitnum;
0450                 if ((leftmask & bitmask) == 0) {
0451                     leftmask |= bitmask;
0452                     if (node->leftnode == LEAF) {
0453                         assert(node->left);
0454                         tree->leaf_print(node->left,
0455                                  indent+1);
0456                         leaves += 1;
0457                     } else if (node->left) {
0458                         assert(node->leftnode == NODE);
0459                         indent += 1;
0460                         node = node->left;
0461                         break;
0462                     }
0463                 }
0464                 if ((rightmask & bitmask) == 0) {
0465                     rightmask |= bitmask;
0466                     if (node->rightnode == LEAF) {
0467                         assert(node->right);
0468                         tree->leaf_print(node->right,
0469                                  indent+1);
0470                         leaves += 1;
0471                     } else if (node->right) {
0472                         assert(node->rightnode == NODE);
0473                         indent += 1;
0474                         node = node->right;
0475                         break;
0476                     }
0477                 }
0478                 leftmask &= ~bitmask;
0479                 rightmask &= ~bitmask;
0480                 node = node->parent;
0481                 indent -= 1;
0482             }
0483         }
0484     }
0485     printf("nodes %d leaves %d singletons %d\n",
0486            nodes, leaves, singletons);
0487 }
0488
0489 /*
0490  * Allocate an initialize a new internal node.
0491  */
0492 static struct node *alloc_node(struct node *parent)
0493 {
0494     struct node *node;
0495     int bitnum;
0496
0497     node = malloc(sizeof(*node));
0498     node->left = node->right = NULL;
0499     node->parent = parent;
0500     node->leftnode = NODE;
0501     node->rightnode = NODE;
0502     node->keybits = 0;
0503     node->keymask = 0;
0504     node->mark = 0;
0505     node->index = 0;
0506     node->offset = -1;
0507     node->size = 4;
0508
0509     if (node->parent) {
0510         bitnum = parent->bitnum;
0511         if ((bitnum & 7) == 0) {
0512             node->bitnum = bitnum + 7 + 8;
0513             node->nextbyte = 1;
0514         } else {
0515             node->bitnum = bitnum - 1;
0516             node->nextbyte = 0;
0517         }
0518     } else {
0519         node->bitnum = 7;
0520         node->nextbyte = 0;
0521     }
0522
0523     return node;
0524 }
0525
0526 /*
0527  * Insert a new leaf into the tree, and collapse any subtrees that are
0528  * fully populated and end in identical leaves. A nextbyte tagged
0529  * internal node will not be removed to preserve the tree's integrity.
0530  * Note that due to the structure of utf8, no nextbyte tagged node
0531  * will be a candidate for removal.
0532  */
0533 static int insert(struct tree *tree, char *key, int keylen, void *leaf)
0534 {
0535     struct node *node;
0536     struct node *parent;
0537     void **cursor;
0538     int keybits;
0539
0540     assert(keylen >= 1 && keylen <= 4);
0541
0542     node = NULL;
0543     cursor = &tree->root;
0544     keybits = 8 * keylen;
0545
0546     /* Insert, creating path along the way. */
0547     while (keybits) {
0548         if (!*cursor)
0549             *cursor = alloc_node(node);
0550         node = *cursor;
0551         if (node->nextbyte)
0552             key++;
0553         if (*key & (1 << (node->bitnum & 7)))
0554             cursor = &node->right;
0555         else
0556             cursor = &node->left;
0557         keybits--;
0558     }
0559     *cursor = leaf;
0560
0561     /* Merge subtrees if possible. */
0562     while (node) {
0563         if (*key & (1 << (node->bitnum & 7)))
0564             node->rightnode = LEAF;
0565         else
0566             node->leftnode = LEAF;
0567         if (node->nextbyte)
0568             break;
0569         if (node->leftnode == NODE || node->rightnode == NODE)
0570             break;
0571         assert(node->left);
0572         assert(node->right);
0573         /* Compare */
0574         if (! tree->leaf_equal(node->left, node->right))
0575             break;
0576         /* Keep left, drop right leaf. */
0577         leaf = node->left;
0578         /* Check in parent */
0579         parent = node->parent;
0580         if (!parent) {
0581             /* root of tree! */
0582             tree->root = leaf;
0583             tree->childnode = LEAF;
0584         } else if (parent->left == node) {
0585             parent->left = leaf;
0586             parent->leftnode = LEAF;
0587             if (parent->right) {
0588                 parent->keymask = 0;
0589                 parent->keybits = 0;
0590             } else {
0591                 parent->keymask |= (1 << node->bitnum);
0592             }
0593         } else if (parent->right == node) {
0594             parent->right = leaf;
0595             parent->rightnode = LEAF;
0596             if (parent->left) {
0597                 parent->keymask = 0;
0598                 parent->keybits = 0;
0599             } else {
0600                 parent->keymask |= (1 << node->bitnum);
0601                 parent->keybits |= (1 << node->bitnum);
0602             }
0603         } else {
0604             /* internal tree error */
0605             assert(0);
0606         }
0607         free(node);
0608         node = parent;
0609     }
0610
0611     /* Propagate keymasks up along singleton chains. */
0612     while (node) {
0613         parent = node->parent;
0614         if (!parent)
0615             break;
0616         /* Nix the mask for parents with two children. */
0617         if (node->keymask == 0) {
0618             parent->keymask = 0;
0619             parent->keybits = 0;
0620         } else if (parent->left && parent->right) {
0621             parent->keymask = 0;
0622             parent->keybits = 0;
0623         } else {
0624             assert((parent->keymask & node->keymask) == 0);
0625             parent->keymask |= node->keymask;
0626             parent->keymask |= (1 << parent->bitnum);
0627             parent->keybits |= node->keybits;
0628             if (parent->right)
0629                 parent->keybits |= (1 << parent->bitnum);
0630         }
0631         node = parent;
0632     }
0633
0634     return 0;
0635 }
0636
0637 /*
0638  * Prune internal nodes.
0639  *
0640  * Fully populated subtrees that end at the same leaf have already
0641  * been collapsed.  There are still internal nodes that have for both
0642  * their left and right branches a sequence of singletons that make
0643  * identical choices and end in identical leaves.  The keymask and
0644  * keybits collected in the nodes describe the choices made in these
0645  * singleton chains.  When they are identical for the left and right
0646  * branch of a node, and the two leaves comare identical, the node in
0647  * question can be removed.
0648  *
0649  * Note that nodes with the nextbyte tag set will not be removed by
0650  * this to ensure tree integrity.  Note as well that the structure of
0651  * utf8 ensures that these nodes would not have been candidates for
0652  * removal in any case.
0653  */
0654 static void prune(struct tree *tree)
0655 {
0656     struct node *node;
0657     struct node *left;
0658     struct node *right;
0659     struct node *parent;
0660     void *leftleaf;
0661     void *rightleaf;
0662     unsigned int leftmask;
0663     unsigned int rightmask;
0664     unsigned int bitmask;
0665     int count;
0666
0667     if (verbose > 0)
0668         printf("Pruning %s_%x\n", tree->type, tree->maxage);
0669
0670     count = 0;
0671     if (tree->childnode == LEAF)
0672         return;
0673     if (!tree->root)
0674         return;
0675
0676     leftmask = rightmask = 0;
0677     node = tree->root;
0678     while (node) {
0679         if (node->nextbyte)
0680             goto advance;
0681         if (node->leftnode == LEAF)
0682             goto advance;
0683         if (node->rightnode == LEAF)
0684             goto advance;
0685         if (!node->left)
0686             goto advance;
0687         if (!node->right)
0688             goto advance;
0689         left = node->left;
0690         right = node->right;
0691         if (left->keymask == 0)
0692             goto advance;
0693         if (right->keymask == 0)
0694             goto advance;
0695         if (left->keymask != right->keymask)
0696             goto advance;
0697         if (left->keybits != right->keybits)
0698             goto advance;
0699         leftleaf = NULL;
0700         while (!leftleaf) {
0701             assert(left->left || left->right);
0702             if (left->leftnode == LEAF)
0703                 leftleaf = left->left;
0704             else if (left->rightnode == LEAF)
0705                 leftleaf = left->right;
0706             else if (left->left)
0707                 left = left->left;
0708             else if (left->right)
0709                 left = left->right;
0710             else
0711                 assert(0);
0712         }
0713         rightleaf = NULL;
0714         while (!rightleaf) {
0715             assert(right->left || right->right);
0716             if (right->leftnode == LEAF)
0717                 rightleaf = right->left;
0718             else if (right->rightnode == LEAF)
0719                 rightleaf = right->right;
0720             else if (right->left)
0721                 right = right->left;
0722             else if (right->right)
0723                 right = right->right;
0724             else
0725                 assert(0);
0726         }
0727         if (! tree->leaf_equal(leftleaf, rightleaf))
0728             goto advance;
0729         /*
0730          * This node has identical singleton-only subtrees.
0731          * Remove it.
0732          */
0733         parent = node->parent;
0734         left = node->left;
0735         right = node->right;
0736         if (parent->left == node)
0737             parent->left = left;
0738         else if (parent->right == node)
0739             parent->right = left;
0740         else
0741             assert(0);
0742         left->parent = parent;
0743         left->keymask |= (1 << node->bitnum);
0744         node->left = NULL;
0745         while (node) {
0746             bitmask = 1 << node->bitnum;
0747             leftmask &= ~bitmask;
0748             rightmask &= ~bitmask;
0749             if (node->leftnode == NODE && node->left) {
0750                 left = node->left;
0751                 free(node);
0752                 count++;
0753                 node = left;
0754             } else if (node->rightnode == NODE && node->right) {
0755                 right = node->right;
0756                 free(node);
0757                 count++;
0758                 node = right;
0759             } else {
0760                 node = NULL;
0761             }
0762         }
0763         /* Propagate keymasks up along singleton chains. */
0764         node = parent;
0765         /* Force re-check */
0766         bitmask = 1 << node->bitnum;
0767         leftmask &= ~bitmask;
0768         rightmask &= ~bitmask;
0769         for (;;) {
0770             if (node->left && node->right)
0771                 break;
0772             if (node->left) {
0773                 left = node->left;
0774                 node->keymask |= left->keymask;
0775                 node->keybits |= left->keybits;
0776             }
0777             if (node->right) {
0778                 right = node->right;
0779                 node->keymask |= right->keymask;
0780                 node->keybits |= right->keybits;
0781             }
0782             node->keymask |= (1 << node->bitnum);
0783             node = node->parent;
0784             /* Force re-check */
0785             bitmask = 1 << node->bitnum;
0786             leftmask &= ~bitmask;
0787             rightmask &= ~bitmask;
0788         }
0789     advance:
0790         bitmask = 1 << node->bitnum;
0791         if ((leftmask & bitmask) == 0 &&
0792             node->leftnode == NODE &&
0793             node->left) {
0794             leftmask |= bitmask;
0795             node = node->left;
0796         } else if ((rightmask & bitmask) == 0 &&
0797                node->rightnode == NODE &&
0798                node->right) {
0799             rightmask |= bitmask;
0800             node = node->right;
0801         } else {
0802             leftmask &= ~bitmask;
0803             rightmask &= ~bitmask;
0804             node = node->parent;
0805         }
0806     }
0807     if (verbose > 0)
0808         printf("Pruned %d nodes\n", count);
0809 }
0810
0811 /*
0812  * Mark the nodes in the tree that lead to leaves that must be
0813  * emitted.
0814  */
0815 static void mark_nodes(struct tree *tree)
0816 {
0817     struct node *node;
0818     struct node *n;
0819     unsigned int leftmask;
0820     unsigned int rightmask;
0821     unsigned int bitmask;
0822     int marked;
0823
0824     marked = 0;
0825     if (verbose > 0)
0826         printf("Marking %s_%x\n", tree->type, tree->maxage);
0827     if (tree->childnode == LEAF)
0828         goto done;
0829
0830     assert(tree->childnode == NODE);
0831     node = tree->root;
0832     leftmask = rightmask = 0;
0833     while (node) {
0834         bitmask = 1 << node->bitnum;
0835         if ((leftmask & bitmask) == 0) {
0836             leftmask |= bitmask;
0837             if (node->leftnode == LEAF) {
0838                 assert(node->left);
0839                 if (tree->leaf_mark(node->left)) {
0840                     n = node;
0841                     while (n && !n->mark) {
0842                         marked++;
0843                         n->mark = 1;
0844                         n = n->parent;
0845                     }
0846                 }
0847             } else if (node->left) {
0848                 assert(node->leftnode == NODE);
0849                 node = node->left;
0850                 continue;
0851             }
0852         }
0853         if ((rightmask & bitmask) == 0) {
0854             rightmask |= bitmask;
0855             if (node->rightnode == LEAF) {
0856                 assert(node->right);
0857                 if (tree->leaf_mark(node->right)) {
0858                     n = node;
0859                     while (n && !n->mark) {
0860                         marked++;
0861                         n->mark = 1;
0862                         n = n->parent;
0863                     }
0864                 }
0865             } else if (node->right) {
0866                 assert(node->rightnode == NODE);
0867                 node = node->right;
0868                 continue;
0869             }
0870         }
0871         leftmask &= ~bitmask;
0872         rightmask &= ~bitmask;
0873         node = node->parent;
0874     }
0875
0876     /* second pass: left siblings and singletons */
0877
0878     assert(tree->childnode == NODE);
0879     node = tree->root;
0880     leftmask = rightmask = 0;
0881     while (node) {
0882         bitmask = 1 << node->bitnum;
0883         if ((leftmask & bitmask) == 0) {
0884             leftmask |= bitmask;
0885             if (node->leftnode == LEAF) {
0886                 assert(node->left);
0887                 if (tree->leaf_mark(node->left)) {
0888                     n = node;
0889                     while (n && !n->mark) {
0890                         marked++;
0891                         n->mark = 1;
0892                         n = n->parent;
0893                     }
0894                 }
0895             } else if (node->left) {
0896                 assert(node->leftnode == NODE);
0897                 node = node->left;
0898                 if (!node->mark && node->parent->mark) {
0899                     marked++;
0900                     node->mark = 1;
0901                 }
0902                 continue;
0903             }
0904         }
0905         if ((rightmask & bitmask) == 0) {
0906             rightmask |= bitmask;
0907             if (node->rightnode == LEAF) {
0908                 assert(node->right);
0909                 if (tree->leaf_mark(node->right)) {
0910                     n = node;
0911                     while (n && !n->mark) {
0912                         marked++;
0913                         n->mark = 1;
0914                         n = n->parent;
0915                     }
0916                 }
0917             } else if (node->right) {
0918                 assert(node->rightnode == NODE);
0919                 node = node->right;
0920                 if (!node->mark && node->parent->mark &&
0921                     !node->parent->left) {
0922                     marked++;
0923                     node->mark = 1;
0924                 }
0925                 continue;
0926             }
0927         }
0928         leftmask &= ~bitmask;
0929         rightmask &= ~bitmask;
0930         node = node->parent;
0931     }
0932 done:
0933     if (verbose > 0)
0934         printf("Marked %d nodes\n", marked);
0935 }
0936
0937 /*
0938  * Compute the index of each node and leaf, which is the offset in the
0939  * emitted trie.  These values must be pre-computed because relative
0940  * offsets between nodes are used to navigate the tree.
0941  */
0942 static int index_nodes(struct tree *tree, int index)
0943 {
0944     struct node *node;
0945     unsigned int leftmask;
0946     unsigned int rightmask;
0947     unsigned int bitmask;
0948     int count;
0949     int indent;
0950
0951     /* Align to a cache line (or half a cache line?). */
0952     while (index % 64)
0953         index++;
0954     tree->index = index;
0955     indent = 1;
0956     count = 0;
0957
0958     if (verbose > 0)
0959         printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index);
0960     if (tree->childnode == LEAF) {
0961         index += tree->leaf_size(tree->root);
0962         goto done;
0963     }
0964
0965     assert(tree->childnode == NODE);
0966     node = tree->root;
0967     leftmask = rightmask = 0;
0968     while (node) {
0969         if (!node->mark)
0970             goto skip;
0971         count++;
0972         if (node->index != index)
0973             node->index = index;
0974         index += node->size;
0975 skip:
0976         while (node) {
0977             bitmask = 1 << node->bitnum;
0978             if (node->mark && (leftmask & bitmask) == 0) {
0979                 leftmask |= bitmask;
0980                 if (node->leftnode == LEAF) {
0981                     assert(node->left);
0982                     *tree->leaf_index(tree, node->left) =
0983                                     index;
0984                     index += tree->leaf_size(node->left);
0985                     count++;
0986                 } else if (node->left) {
0987                     assert(node->leftnode == NODE);
0988                     indent += 1;
0989                     node = node->left;
0990                     break;
0991                 }
0992             }
0993             if (node->mark && (rightmask & bitmask) == 0) {
0994                 rightmask |= bitmask;
0995                 if (node->rightnode == LEAF) {
0996                     assert(node->right);
0997                     *tree->leaf_index(tree, node->right) = index;
0998                     index += tree->leaf_size(node->right);
0999                     count++;
1000                 } else if (node->right) {
1001                     assert(node->rightnode == NODE);
1002                     indent += 1;
1003                     node = node->right;
1004                     break;
1005                 }
1006             }
1007             leftmask &= ~bitmask;
1008             rightmask &= ~bitmask;
1009             node = node->parent;
1010             indent -= 1;
1011         }
1012     }
1013 done:
1014     /* Round up to a multiple of 16 */
1015     while (index % 16)
1016         index++;
1017     if (verbose > 0)
1018         printf("Final index %d\n", index);
1019     return index;
1020 }
1021
1022 /*
1023  * Mark the nodes in a subtree, helper for size_nodes().
1024  */
1025 static int mark_subtree(struct node *node)
1026 {
1027     int changed;
1028
1029     if (!node || node->mark)
1030         return 0;
1031     node->mark = 1;
1032     node->index = node->parent->index;
1033     changed = 1;
1034     if (node->leftnode == NODE)
1035         changed += mark_subtree(node->left);
1036     if (node->rightnode == NODE)
1037         changed += mark_subtree(node->right);
1038     return changed;
1039 }
1040
1041 /*
1042  * Compute the size of nodes and leaves. We start by assuming that
1043  * each node needs to store a three-byte offset. The indexes of the
1044  * nodes are calculated based on that, and then this function is
1045  * called to see if the sizes of some nodes can be reduced.  This is
1046  * repeated until no more changes are seen.
1047  */
1048 static int size_nodes(struct tree *tree)
1049 {
1050     struct tree *next;
1051     struct node *node;
1052     struct node *right;
1053     struct node *n;
1054     unsigned int leftmask;
1055     unsigned int rightmask;
1056     unsigned int bitmask;
1057     unsigned int pathbits;
1058     unsigned int pathmask;
1059     unsigned int nbit;
1060     int changed;
1061     int offset;
1062     int size;
1063     int indent;
1064
1065     indent = 1;
1066     changed = 0;
1067     size = 0;
1068
1069     if (verbose > 0)
1070         printf("Sizing %s_%x\n", tree->type, tree->maxage);
1071     if (tree->childnode == LEAF)
1072         goto done;
1073
1074     assert(tree->childnode == NODE);
1075     pathbits = 0;
1076     pathmask = 0;
1077     node = tree->root;
1078     leftmask = rightmask = 0;
1079     while (node) {
1080         if (!node->mark)
1081             goto skip;
1082         offset = 0;
1083         if (!node->left || !node->right) {
1084             size = 1;
1085         } else {
1086             if (node->rightnode == NODE) {
1087                 /*
1088                  * If the right node is not marked,
1089                  * look for a corresponding node in
1090                  * the next tree.  Such a node need
1091                  * not exist.
1092                  */
1093                 right = node->right;
1094                 next = tree->next;
1095                 while (!right->mark) {
1096                     assert(next);
1097                     n = next->root;
1098                     while (n->bitnum != node->bitnum) {
1099                         nbit = 1 << n->bitnum;
1100                         if (!(pathmask & nbit))
1101                             break;
1102                         if (pathbits & nbit) {
1103                             if (n->rightnode == LEAF)
1104                                 break;
1105                             n = n->right;
1106                         } else {
1107                             if (n->leftnode == LEAF)
1108                                 break;
1109                             n = n->left;
1110                         }
1111                     }
1112                     if (n->bitnum != node->bitnum)
1113                         break;
1114                     n = n->right;
1115                     right = n;
1116                     next = next->next;
1117                 }
1118                 /* Make sure the right node is marked. */
1119                 if (!right->mark)
1120                     changed += mark_subtree(right);
1121                 offset = right->index - node->index;
1122             } else {
1123                 offset = *tree->leaf_index(tree, node->right);
1124                 offset -= node->index;
1125             }
1126             assert(offset >= 0);
1127             assert(offset <= 0xffffff);
1128             if (offset <= 0xff) {
1129                 size = 2;
1130             } else if (offset <= 0xffff) {
1131                 size = 3;
1132             } else { /* offset <= 0xffffff */
1133                 size = 4;
1134             }
1135         }
1136         if (node->size != size || node->offset != offset) {
1137             node->size = size;
1138             node->offset = offset;
1139             changed++;
1140         }
1141 skip:
1142         while (node) {
1143             bitmask = 1 << node->bitnum;
1144             pathmask |= bitmask;
1145             if (node->mark && (leftmask & bitmask) == 0) {
1146                 leftmask |= bitmask;
1147                 if (node->leftnode == LEAF) {
1148                     assert(node->left);
1149                 } else if (node->left) {
1150                     assert(node->leftnode == NODE);
1151                     indent += 1;
1152                     node = node->left;
1153                     break;
1154                 }
1155             }
1156             if (node->mark && (rightmask & bitmask) == 0) {
1157                 rightmask |= bitmask;
1158                 pathbits |= bitmask;
1159                 if (node->rightnode == LEAF) {
1160                     assert(node->right);
1161                 } else if (node->right) {
1162                     assert(node->rightnode == NODE);
1163                     indent += 1;
1164                     node = node->right;
1165                     break;
1166                 }
1167             }
1168             leftmask &= ~bitmask;
1169             rightmask &= ~bitmask;
1170             pathmask &= ~bitmask;
1171             pathbits &= ~bitmask;
1172             node = node->parent;
1173             indent -= 1;
1174         }
1175     }
1176 done:
1177     if (verbose > 0)
1178         printf("Found %d changes\n", changed);
1179     return changed;
1180 }
1181
1182 /*
1183  * Emit a trie for the given tree into the data array.
1184  */
1185 static void emit(struct tree *tree, unsigned char *data)
1186 {
1187     struct node *node;
1188     unsigned int leftmask;
1189     unsigned int rightmask;
1190     unsigned int bitmask;
1191     int offlen;
1192     int offset;
1193     int index;
1194     int indent;
1195     int size;
1196     int bytes;
1197     int leaves;
1198     int nodes[4];
1199     unsigned char byte;
1200
1201     nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0;
1202     leaves = 0;
1203     bytes = 0;
1204     index = tree->index;
1205     data += index;
1206     indent = 1;
1207     if (verbose > 0)
1208         printf("Emitting %s_%x\n", tree->type, tree->maxage);
1209     if (tree->childnode == LEAF) {
1210         assert(tree->root);
1211         tree->leaf_emit(tree->root, data);
1212         size = tree->leaf_size(tree->root);
1213         index += size;
1214         leaves++;
1215         goto done;
1216     }
1217
1218     assert(tree->childnode == NODE);
1219     node = tree->root;
1220     leftmask = rightmask = 0;
1221     while (node) {
1222         if (!node->mark)
1223             goto skip;
1224         assert(node->offset != -1);
1225         assert(node->index == index);
1226
1227         byte = 0;
1228         if (node->nextbyte)
1229             byte |= NEXTBYTE;
1230         byte |= (node->bitnum & BITNUM);
1231         if (node->left && node->right) {
1232             if (node->leftnode == NODE)
1233                 byte |= LEFTNODE;
1234             if (node->rightnode == NODE)
1235                 byte |= RIGHTNODE;
1236             if (node->offset <= 0xff)
1237                 offlen = 1;
1238             else if (node->offset <= 0xffff)
1239                 offlen = 2;
1240             else
1241                 offlen = 3;
1242             nodes[offlen]++;
1243             offset = node->offset;
1244             byte |= offlen << OFFLEN_SHIFT;
1245             *data++ = byte;
1246             index++;
1247             while (offlen--) {
1248                 *data++ = offset & 0xff;
1249                 index++;
1250                 offset >>= 8;
1251             }
1252         } else if (node->left) {
1253             if (node->leftnode == NODE)
1254                 byte |= TRIENODE;
1255             nodes[0]++;
1256             *data++ = byte;
1257             index++;
1258         } else if (node->right) {
1259             byte |= RIGHTNODE;
1260             if (node->rightnode == NODE)
1261                 byte |= TRIENODE;
1262             nodes[0]++;
1263             *data++ = byte;
1264             index++;
1265         } else {
1266             assert(0);
1267         }
1268 skip:
1269         while (node) {
1270             bitmask = 1 << node->bitnum;
1271             if (node->mark && (leftmask & bitmask) == 0) {
1272                 leftmask |= bitmask;
1273                 if (node->leftnode == LEAF) {
1274                     assert(node->left);
1275                     data = tree->leaf_emit(node->left,
1276                                    data);
1277                     size = tree->leaf_size(node->left);
1278                     index += size;
1279                     bytes += size;
1280                     leaves++;
1281                 } else if (node->left) {
1282                     assert(node->leftnode == NODE);
1283                     indent += 1;
1284                     node = node->left;
1285                     break;
1286                 }
1287             }
1288             if (node->mark && (rightmask & bitmask) == 0) {
1289                 rightmask |= bitmask;
1290                 if (node->rightnode == LEAF) {
1291                     assert(node->right);
1292                     data = tree->leaf_emit(node->right,
1293                                    data);
1294                     size = tree->leaf_size(node->right);
1295                     index += size;
1296                     bytes += size;
1297                     leaves++;
1298                 } else if (node->right) {
1299                     assert(node->rightnode == NODE);
1300                     indent += 1;
1301                     node = node->right;
1302                     break;
1303                 }
1304             }
1305             leftmask &= ~bitmask;
1306             rightmask &= ~bitmask;
1307             node = node->parent;
1308             indent -= 1;
1309         }
1310     }
1311 done:
1312     if (verbose > 0) {
1313         printf("Emitted %d (%d) leaves",
1314             leaves, bytes);
1315         printf(" %d (%d+%d+%d+%d) nodes",
1316             nodes[0] + nodes[1] + nodes[2] + nodes[3],
1317             nodes[0], nodes[1], nodes[2], nodes[3]);
1318         printf(" %d total\n", index - tree->index);
1319     }
1320 }
1321
1322 /* ------------------------------------------------------------------ */
1323
1324 /*
1325  * Unicode data.
1326  *
1327  * We need to keep track of the Canonical Combining Class, the Age,
1328  * and decompositions for a code point.
1329  *
1330  * For the Age, we store the index into the ages table.  Effectively
1331  * this is a generation number that the table maps to a unicode
1332  * version.
1333  *
1334  * The correction field is used to indicate that this entry is in the
1335  * corrections array, which contains decompositions that were
1336  * corrected in later revisions.  The value of the correction field is
1337  * the Unicode version in which the mapping was corrected.
1338  */
1339 struct unicode_data {
1340     unsigned int code;
1341     int ccc;
1342     int gen;
1343     int correction;
1344     unsigned int *utf32nfdi;
1345     unsigned int *utf32nfdicf;
1346     char *utf8nfdi;
1347     char *utf8nfdicf;
1348 };
1349
1350 struct unicode_data unicode_data[0x110000];
1351 struct unicode_data *corrections;
1352 int    corrections_count;
1353
1354 struct tree *nfdi_tree;
1355 struct tree *nfdicf_tree;
1356
1357 struct tree *trees;
1358 int          trees_count;
1359
1360 /*
1361  * Check the corrections array to see if this entry was corrected at
1362  * some point.
1363  */
1364 static struct unicode_data *corrections_lookup(struct unicode_data *u)
1365 {
1366     int i;
1367
1368     for (i = 0; i != corrections_count; i++)
1369         if (u->code == corrections[i].code)
1370             return &corrections[i];
1371     return u;
1372 }
1373
1374 static int nfdi_equal(void *l, void *r)
1375 {
1376     struct unicode_data *left = l;
1377     struct unicode_data *right = r;
1378
1379     if (left->gen != right->gen)
1380         return 0;
1381     if (left->ccc != right->ccc)
1382         return 0;
1383     if (left->utf8nfdi && right->utf8nfdi &&
1384         strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
1385         return 1;
1386     if (left->utf8nfdi || right->utf8nfdi)
1387         return 0;
1388     return 1;
1389 }
1390
1391 static int nfdicf_equal(void *l, void *r)
1392 {
1393     struct unicode_data *left = l;
1394     struct unicode_data *right = r;
1395
1396     if (left->gen != right->gen)
1397         return 0;
1398     if (left->ccc != right->ccc)
1399         return 0;
1400     if (left->utf8nfdicf && right->utf8nfdicf &&
1401         strcmp(left->utf8nfdicf, right->utf8nfdicf) == 0)
1402         return 1;
1403     if (left->utf8nfdicf && right->utf8nfdicf)
1404         return 0;
1405     if (left->utf8nfdicf || right->utf8nfdicf)
1406         return 0;
1407     if (left->utf8nfdi && right->utf8nfdi &&
1408         strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
1409         return 1;
1410     if (left->utf8nfdi || right->utf8nfdi)
1411         return 0;
1412     return 1;
1413 }
1414
1415 static void nfdi_print(void *l, int indent)
1416 {
1417     struct unicode_data *leaf = l;
1418
1419     printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
1420         leaf->code, leaf->ccc, leaf->gen);
1421
1422     if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
1423         printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
1424     else if (leaf->utf8nfdi)
1425         printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
1426
1427     printf("\n");
1428 }
1429
1430 static void nfdicf_print(void *l, int indent)
1431 {
1432     struct unicode_data *leaf = l;
1433
1434     printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
1435         leaf->code, leaf->ccc, leaf->gen);
1436
1437     if (leaf->utf8nfdicf)
1438         printf(" nfdicf \"%s\"", (const char*)leaf->utf8nfdicf);
1439     else if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
1440         printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
1441     else if (leaf->utf8nfdi)
1442         printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
1443     printf("\n");
1444 }
1445
1446 static int nfdi_mark(void *l)
1447 {
1448     return 1;
1449 }
1450
1451 static int nfdicf_mark(void *l)
1452 {
1453     struct unicode_data *leaf = l;
1454
1455     if (leaf->utf8nfdicf)
1456         return 1;
1457     return 0;
1458 }
1459
1460 static int correction_mark(void *l)
1461 {
1462     struct unicode_data *leaf = l;
1463
1464     return leaf->correction;
1465 }
1466
1467 static int nfdi_size(void *l)
1468 {
1469     struct unicode_data *leaf = l;
1470     int size = 2;
1471
1472     if (HANGUL_SYLLABLE(leaf->code))
1473         size += 1;
1474     else if (leaf->utf8nfdi)
1475         size += strlen(leaf->utf8nfdi) + 1;
1476     return size;
1477 }
1478
1479 static int nfdicf_size(void *l)
1480 {
1481     struct unicode_data *leaf = l;
1482     int size = 2;
1483
1484     if (HANGUL_SYLLABLE(leaf->code))
1485         size += 1;
1486     else if (leaf->utf8nfdicf)
1487         size += strlen(leaf->utf8nfdicf) + 1;
1488     else if (leaf->utf8nfdi)
1489         size += strlen(leaf->utf8nfdi) + 1;
1490     return size;
1491 }
1492
1493 static int *nfdi_index(struct tree *tree, void *l)
1494 {
1495     struct unicode_data *leaf = l;
1496
1497     return &tree->leafindex[leaf->code];
1498 }
1499
1500 static int *nfdicf_index(struct tree *tree, void *l)
1501 {
1502     struct unicode_data *leaf = l;
1503
1504     return &tree->leafindex[leaf->code];
1505 }
1506
1507 static unsigned char *nfdi_emit(void *l, unsigned char *data)
1508 {
1509     struct unicode_data *leaf = l;
1510     unsigned char *s;
1511
1512     *data++ = leaf->gen;
1513
1514     if (HANGUL_SYLLABLE(leaf->code)) {
1515         *data++ = DECOMPOSE;
1516         *data++ = HANGUL;
1517     } else if (leaf->utf8nfdi) {
1518         *data++ = DECOMPOSE;
1519         s = (unsigned char*)leaf->utf8nfdi;
1520         while ((*data++ = *s++) != 0)
1521             ;
1522     } else {
1523         *data++ = leaf->ccc;
1524     }
1525     return data;
1526 }
1527
1528 static unsigned char *nfdicf_emit(void *l, unsigned char *data)
1529 {
1530     struct unicode_data *leaf = l;
1531     unsigned char *s;
1532
1533     *data++ = leaf->gen;
1534
1535     if (HANGUL_SYLLABLE(leaf->code)) {
1536         *data++ = DECOMPOSE;
1537         *data++ = HANGUL;
1538     } else if (leaf->utf8nfdicf) {
1539         *data++ = DECOMPOSE;
1540         s = (unsigned char*)leaf->utf8nfdicf;
1541         while ((*data++ = *s++) != 0)
1542             ;
1543     } else if (leaf->utf8nfdi) {
1544         *data++ = DECOMPOSE;
1545         s = (unsigned char*)leaf->utf8nfdi;
1546         while ((*data++ = *s++) != 0)
1547             ;
1548     } else {
1549         *data++ = leaf->ccc;
1550     }
1551     return data;
1552 }
1553
1554 static void utf8_create(struct unicode_data *data)
1555 {
1556     char utf[18*4+1];
1557     char *u;
1558     unsigned int *um;
1559     int i;
1560
1561     if (data->utf8nfdi) {
1562         assert(data->utf8nfdi[0] == HANGUL);
1563         return;
1564     }
1565
1566     u = utf;
1567     um = data->utf32nfdi;
1568     if (um) {
1569         for (i = 0; um[i]; i++)
1570             u += utf8encode(u, um[i]);
1571         *u = '\0';
1572         data->utf8nfdi = strdup(utf);
1573     }
1574     u = utf;
1575     um = data->utf32nfdicf;
1576     if (um) {
1577         for (i = 0; um[i]; i++)
1578             u += utf8encode(u, um[i]);
1579         *u = '\0';
1580         if (!data->utf8nfdi || strcmp(data->utf8nfdi, utf))
1581             data->utf8nfdicf = strdup(utf);
1582     }
1583 }
1584
1585 static void utf8_init(void)
1586 {
1587     unsigned int unichar;
1588     int i;
1589
1590     for (unichar = 0; unichar != 0x110000; unichar++)
1591         utf8_create(&unicode_data[unichar]);
1592
1593     for (i = 0; i != corrections_count; i++)
1594         utf8_create(&corrections[i]);
1595 }
1596
1597 static void trees_init(void)
1598 {
1599     struct unicode_data *data;
1600     unsigned int maxage;
1601     unsigned int nextage;
1602     int count;
1603     int i;
1604     int j;
1605
1606     /* Count the number of different ages. */
1607     count = 0;
1608     nextage = (unsigned int)-1;
1609     do {
1610         maxage = nextage;
1611         nextage = 0;
1612         for (i = 0; i <= corrections_count; i++) {
1613             data = &corrections[i];
1614             if (nextage < data->correction &&
1615                 data->correction < maxage)
1616                 nextage = data->correction;
1617         }
1618         count++;
1619     } while (nextage);
1620
1621     /* Two trees per age: nfdi and nfdicf */
1622     trees_count = count * 2;
1623     trees = calloc(trees_count, sizeof(struct tree));
1624
1625     /* Assign ages to the trees. */
1626     count = trees_count;
1627     nextage = (unsigned int)-1;
1628     do {
1629         maxage = nextage;
1630         trees[--count].maxage = maxage;
1631         trees[--count].maxage = maxage;
1632         nextage = 0;
1633         for (i = 0; i <= corrections_count; i++) {
1634             data = &corrections[i];
1635             if (nextage < data->correction &&
1636                 data->correction < maxage)
1637                 nextage = data->correction;
1638         }
1639     } while (nextage);
1640
1641     /* The ages assigned above are off by one. */
1642     for (i = 0; i != trees_count; i++) {
1643         j = 0;
1644         while (ages[j] < trees[i].maxage)
1645             j++;
1646         trees[i].maxage = ages[j-1];
1647     }
1648
1649     /* Set up the forwarding between trees. */
1650     trees[trees_count-2].next = &trees[trees_count-1];
1651     trees[trees_count-1].leaf_mark = nfdi_mark;
1652     trees[trees_count-2].leaf_mark = nfdicf_mark;
1653     for (i = 0; i != trees_count-2; i += 2) {
1654         trees[i].next = &trees[trees_count-2];
1655         trees[i].leaf_mark = correction_mark;
1656         trees[i+1].next = &trees[trees_count-1];
1657         trees[i+1].leaf_mark = correction_mark;
1658     }
1659
1660     /* Assign the callouts. */
1661     for (i = 0; i != trees_count; i += 2) {
1662         trees[i].type = "nfdicf";
1663         trees[i].leaf_equal = nfdicf_equal;
1664         trees[i].leaf_print = nfdicf_print;
1665         trees[i].leaf_size = nfdicf_size;
1666         trees[i].leaf_index = nfdicf_index;
1667         trees[i].leaf_emit = nfdicf_emit;
1668
1669         trees[i+1].type = "nfdi";
1670         trees[i+1].leaf_equal = nfdi_equal;
1671         trees[i+1].leaf_print = nfdi_print;
1672         trees[i+1].leaf_size = nfdi_size;
1673         trees[i+1].leaf_index = nfdi_index;
1674         trees[i+1].leaf_emit = nfdi_emit;
1675     }
1676
1677     /* Finish init. */
1678     for (i = 0; i != trees_count; i++)
1679         trees[i].childnode = NODE;
1680 }
1681
1682 static void trees_populate(void)
1683 {
1684     struct unicode_data *data;
1685     unsigned int unichar;
1686     char keyval[4];
1687     int keylen;
1688     int i;
1689
1690     for (i = 0; i != trees_count; i++) {
1691         if (verbose > 0) {
1692             printf("Populating %s_%x\n",
1693                 trees[i].type, trees[i].maxage);
1694         }
1695         for (unichar = 0; unichar != 0x110000; unichar++) {
1696             if (unicode_data[unichar].gen < 0)
1697                 continue;
1698             keylen = utf8encode(keyval, unichar);
1699             data = corrections_lookup(&unicode_data[unichar]);
1700             if (data->correction <= trees[i].maxage)
1701                 data = &unicode_data[unichar];
1702             insert(&trees[i], keyval, keylen, data);
1703         }
1704     }
1705 }
1706
1707 static void trees_reduce(void)
1708 {
1709     int i;
1710     int size;
1711     int changed;
1712
1713     for (i = 0; i != trees_count; i++)
1714         prune(&trees[i]);
1715     for (i = 0; i != trees_count; i++)
1716         mark_nodes(&trees[i]);
1717     do {
1718         size = 0;
1719         for (i = 0; i != trees_count; i++)
1720             size = index_nodes(&trees[i], size);
1721         changed = 0;
1722         for (i = 0; i != trees_count; i++)
1723             changed += size_nodes(&trees[i]);
1724     } while (changed);
1725
1726     utf8data = calloc(size, 1);
1727     utf8data_size = size;
1728     for (i = 0; i != trees_count; i++)
1729         emit(&trees[i], utf8data);
1730
1731     if (verbose > 0) {
1732         for (i = 0; i != trees_count; i++) {
1733             printf("%s_%x idx %d\n",
1734                 trees[i].type, trees[i].maxage, trees[i].index);
1735         }
1736     }
1737
1738     nfdi = utf8data + trees[trees_count-1].index;
1739     nfdicf = utf8data + trees[trees_count-2].index;
1740
1741     nfdi_tree = &trees[trees_count-1];
1742     nfdicf_tree = &trees[trees_count-2];
1743 }
1744
1745 static void verify(struct tree *tree)
1746 {
1747     struct unicode_data *data;
1748     utf8leaf_t  *leaf;
1749     unsigned int    unichar;
1750     char        key[4];
1751     unsigned char   hangul[UTF8HANGULLEAF];
1752     int     report;
1753     int     nocf;
1754
1755     if (verbose > 0)
1756         printf("Verifying %s_%x\n", tree->type, tree->maxage);
1757     nocf = strcmp(tree->type, "nfdicf");
1758
1759     for (unichar = 0; unichar != 0x110000; unichar++) {
1760         report = 0;
1761         data = corrections_lookup(&unicode_data[unichar]);
1762         if (data->correction <= tree->maxage)
1763             data = &unicode_data[unichar];
1764         utf8encode(key,unichar);
1765         leaf = utf8lookup(tree, hangul, key);
1766
1767         if (!leaf) {
1768             if (data->gen != -1)
1769                 report++;
1770             if (unichar < 0xd800 || unichar > 0xdfff)
1771                 report++;
1772         } else {
1773             if (unichar >= 0xd800 && unichar <= 0xdfff)
1774                 report++;
1775             if (data->gen == -1)
1776                 report++;
1777             if (data->gen != LEAF_GEN(leaf))
1778                 report++;
1779             if (LEAF_CCC(leaf) == DECOMPOSE) {
1780                 if (HANGUL_SYLLABLE(data->code)) {
1781                     if (data->utf8nfdi[0] != HANGUL)
1782                         report++;
1783                 } else if (nocf) {
1784                     if (!data->utf8nfdi) {
1785                         report++;
1786                     } else if (strcmp(data->utf8nfdi,
1787                               LEAF_STR(leaf))) {
1788                         report++;
1789                     }
1790                 } else {
1791                     if (!data->utf8nfdicf &&
1792                         !data->utf8nfdi) {
1793                         report++;
1794                     } else if (data->utf8nfdicf) {
1795                         if (strcmp(data->utf8nfdicf,
1796                                LEAF_STR(leaf)))
1797                             report++;
1798                     } else if (strcmp(data->utf8nfdi,
1799                               LEAF_STR(leaf))) {
1800                         report++;
1801                     }
1802                 }
1803             } else if (data->ccc != LEAF_CCC(leaf)) {
1804                 report++;
1805             }
1806         }
1807         if (report) {
1808             printf("%X code %X gen %d ccc %d"
1809                 " nfdi -> \"%s\"",
1810                 unichar, data->code, data->gen,
1811                 data->ccc,
1812                 data->utf8nfdi);
1813             if (leaf) {
1814                 printf(" gen %d ccc %d"
1815                     " nfdi -> \"%s\"",
1816                     LEAF_GEN(leaf),
1817                     LEAF_CCC(leaf),
1818                     LEAF_CCC(leaf) == DECOMPOSE ?
1819                         LEAF_STR(leaf) : "");
1820             }
1821             printf("\n");
1822         }
1823     }
1824 }
1825
1826 static void trees_verify(void)
1827 {
1828     int i;
1829
1830     for (i = 0; i != trees_count; i++)
1831         verify(&trees[i]);
1832 }
1833
1834 /* ------------------------------------------------------------------ */
1835
1836 static void help(void)
1837 {
1838     printf("Usage: %s [options]\n", argv0);
1839     printf("\n");
1840     printf("This program creates an a data trie used for parsing and\n");
1841     printf("normalization of UTF-8 strings. The trie is derived from\n");
1842     printf("a set of input files from the Unicode character database\n");
1843     printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n");
1844     printf("\n");
1845     printf("The generated tree supports two normalization forms:\n");
1846     printf("\n");
1847     printf("\tnfdi:\n");
1848     printf("\t- Apply unicode normalization form NFD.\n");
1849     printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1850     printf("\n");
1851     printf("\tnfdicf:\n");
1852     printf("\t- Apply unicode normalization form NFD.\n");
1853     printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1854     printf("\t- Apply a full casefold (C + F).\n");
1855     printf("\n");
1856     printf("These forms were chosen as being most useful when dealing\n");
1857     printf("with file names: NFD catches most cases where characters\n");
1858     printf("should be considered equivalent. The ignorables are mostly\n");
1859     printf("invisible, making names hard to type.\n");
1860     printf("\n");
1861     printf("The options to specify the files to be used are listed\n");
1862     printf("below with their default values, which are the names used\n");
1863     printf("by version 11.0.0 of the Unicode Character Database.\n");
1864     printf("\n");
1865     printf("The input files:\n");
1866     printf("\t-a %s\n", AGE_NAME);
1867     printf("\t-c %s\n", CCC_NAME);
1868     printf("\t-p %s\n", PROP_NAME);
1869     printf("\t-d %s\n", DATA_NAME);
1870     printf("\t-f %s\n", FOLD_NAME);
1871     printf("\t-n %s\n", NORM_NAME);
1872     printf("\n");
1873     printf("Additionally, the generated tables are tested using:\n");
1874     printf("\t-t %s\n", TEST_NAME);
1875     printf("\n");
1876     printf("Finally, the output file:\n");
1877     printf("\t-o %s\n", UTF8_NAME);
1878     printf("\n");
1879 }
1880
1881 static void usage(void)
1882 {
1883     help();
1884     exit(1);
1885 }
1886
1887 static void open_fail(const char *name, int error)
1888 {
1889     printf("Error %d opening %s: %s\n", error, name, strerror(error));
1890     exit(1);
1891 }
1892
1893 static void file_fail(const char *filename)
1894 {
1895     printf("Error parsing %s\n", filename);
1896     exit(1);
1897 }
1898
1899 static void line_fail(const char *filename, const char *line)
1900 {
1901     printf("Error parsing %s:%s\n", filename, line);
1902     exit(1);
1903 }
1904
1905 /* ------------------------------------------------------------------ */
1906
1907 static void print_utf32(unsigned int *utf32str)
1908 {
1909     int i;
1910
1911     for (i = 0; utf32str[i]; i++)
1912         printf(" %X", utf32str[i]);
1913 }
1914
1915 static void print_utf32nfdi(unsigned int unichar)
1916 {
1917     printf(" %X ->", unichar);
1918     print_utf32(unicode_data[unichar].utf32nfdi);
1919     printf("\n");
1920 }
1921
1922 static void print_utf32nfdicf(unsigned int unichar)
1923 {
1924     printf(" %X ->", unichar);
1925     print_utf32(unicode_data[unichar].utf32nfdicf);
1926     printf("\n");
1927 }
1928
1929 /* ------------------------------------------------------------------ */
1930
1931 static void age_init(void)
1932 {
1933     FILE *file;
1934     unsigned int first;
1935     unsigned int last;
1936     unsigned int unichar;
1937     unsigned int major;
1938     unsigned int minor;
1939     unsigned int revision;
1940     int gen;
1941     int count;
1942     int ret;
1943
1944     if (verbose > 0)
1945         printf("Parsing %s\n", age_name);
1946
1947     file = fopen(age_name, "r");
1948     if (!file)
1949         open_fail(age_name, errno);
1950     count = 0;
1951
1952     gen = 0;
1953     while (fgets(line, LINESIZE, file)) {
1954         ret = sscanf(line, "# Age=V%d_%d_%d",
1955                 &major, &minor, &revision);
1956         if (ret == 3) {
1957             ages_count++;
1958             if (verbose > 1)
1959                 printf(" Age V%d_%d_%d\n",
1960                     major, minor, revision);
1961             if (!age_valid(major, minor, revision))
1962                 line_fail(age_name, line);
1963             continue;
1964         }
1965         ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
1966         if (ret == 2) {
1967             ages_count++;
1968             if (verbose > 1)
1969                 printf(" Age V%d_%d\n", major, minor);
1970             if (!age_valid(major, minor, 0))
1971                 line_fail(age_name, line);
1972             continue;
1973         }
1974     }
1975
1976     /* We must have found something above. */
1977     if (verbose > 1)
1978         printf("%d age entries\n", ages_count);
1979     if (ages_count == 0 || ages_count > MAXGEN)
1980         file_fail(age_name);
1981
1982     /* There is a 0 entry. */
1983     ages_count++;
1984     ages = calloc(ages_count + 1, sizeof(*ages));
1985     /* And a guard entry. */
1986     ages[ages_count] = (unsigned int)-1;
1987
1988     rewind(file);
1989     count = 0;
1990     gen = 0;
1991     while (fgets(line, LINESIZE, file)) {
1992         ret = sscanf(line, "# Age=V%d_%d_%d",
1993                 &major, &minor, &revision);
1994         if (ret == 3) {
1995             ages[++gen] =
1996                 UNICODE_AGE(major, minor, revision);
1997             if (verbose > 1)
1998                 printf(" Age V%d_%d_%d = gen %d\n",
1999                     major, minor, revision, gen);
2000             if (!age_valid(major, minor, revision))
2001                 line_fail(age_name, line);
2002             continue;
2003         }
2004         ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
2005         if (ret == 2) {
2006             ages[++gen] = UNICODE_AGE(major, minor, 0);
2007             if (verbose > 1)
2008                 printf(" Age V%d_%d = %d\n",
2009                     major, minor, gen);
2010             if (!age_valid(major, minor, 0))
2011                 line_fail(age_name, line);
2012             continue;
2013         }
2014         ret = sscanf(line, "%X..%X ; %d.%d #",
2015                  &first, &last, &major, &minor);
2016         if (ret == 4) {
2017             for (unichar = first; unichar <= last; unichar++)
2018                 unicode_data[unichar].gen = gen;
2019             count += 1 + last - first;
2020             if (verbose > 1)
2021                 printf("  %X..%X gen %d\n", first, last, gen);
2022             if (!utf32valid(first) || !utf32valid(last))
2023                 line_fail(age_name, line);
2024             continue;
2025         }
2026         ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor);
2027         if (ret == 3) {
2028             unicode_data[unichar].gen = gen;
2029             count++;
2030             if (verbose > 1)
2031                 printf("  %X gen %d\n", unichar, gen);
2032             if (!utf32valid(unichar))
2033                 line_fail(age_name, line);
2034             continue;
2035         }
2036     }
2037     unicode_maxage = ages[gen];
2038     fclose(file);
2039
2040     /* Nix surrogate block */
2041     if (verbose > 1)
2042         printf(" Removing surrogate block D800..DFFF\n");
2043     for (unichar = 0xd800; unichar <= 0xdfff; unichar++)
2044         unicode_data[unichar].gen = -1;
2045
2046     if (verbose > 0)
2047             printf("Found %d entries\n", count);
2048     if (count == 0)
2049         file_fail(age_name);
2050 }
2051
2052 static void ccc_init(void)
2053 {
2054     FILE *file;
2055     unsigned int first;
2056     unsigned int last;
2057     unsigned int unichar;
2058     unsigned int value;
2059     int count;
2060     int ret;
2061
2062     if (verbose > 0)
2063         printf("Parsing %s\n", ccc_name);
2064
2065     file = fopen(ccc_name, "r");
2066     if (!file)
2067         open_fail(ccc_name, errno);
2068
2069     count = 0;
2070     while (fgets(line, LINESIZE, file)) {
2071         ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value);
2072         if (ret == 3) {
2073             for (unichar = first; unichar <= last; unichar++) {
2074                 unicode_data[unichar].ccc = value;
2075                                 count++;
2076             }
2077             if (verbose > 1)
2078                 printf(" %X..%X ccc %d\n", first, last, value);
2079             if (!utf32valid(first) || !utf32valid(last))
2080                 line_fail(ccc_name, line);
2081             continue;
2082         }
2083         ret = sscanf(line, "%X ; %d #", &unichar, &value);
2084         if (ret == 2) {
2085             unicode_data[unichar].ccc = value;
2086                         count++;
2087             if (verbose > 1)
2088                 printf(" %X ccc %d\n", unichar, value);
2089             if (!utf32valid(unichar))
2090                 line_fail(ccc_name, line);
2091             continue;
2092         }
2093     }
2094     fclose(file);
2095
2096     if (verbose > 0)
2097         printf("Found %d entries\n", count);
2098     if (count == 0)
2099         file_fail(ccc_name);
2100 }
2101
2102 static int ignore_compatibility_form(char *type)
2103 {
2104     int i;
2105     char *ignored_types[] = {"font", "noBreak", "initial", "medial",
2106                  "final", "isolated", "circle", "super",
2107                  "sub", "vertical", "wide", "narrow",
2108                  "small", "square", "fraction", "compat"};
2109
2110     for (i = 0 ; i < ARRAY_SIZE(ignored_types); i++)
2111         if (strcmp(type, ignored_types[i]) == 0)
2112             return 1;
2113     return 0;
2114 }
2115
2116 static void nfdi_init(void)
2117 {
2118     FILE *file;
2119     unsigned int unichar;
2120     unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2121     char *s;
2122     char *type;
2123     unsigned int *um;
2124     int count;
2125     int i;
2126     int ret;
2127
2128     if (verbose > 0)
2129         printf("Parsing %s\n", data_name);
2130     file = fopen(data_name, "r");
2131     if (!file)
2132         open_fail(data_name, errno);
2133
2134     count = 0;
2135     while (fgets(line, LINESIZE, file)) {
2136         ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];",
2137                  &unichar, buf0);
2138         if (ret != 2)
2139             continue;
2140         if (!utf32valid(unichar))
2141             line_fail(data_name, line);
2142
2143         s = buf0;
2144         /* skip over <tag> */
2145         if (*s == '<') {
2146             type = ++s;
2147             while (*++s != '>');
2148             *s++ = '\0';
2149             if(ignore_compatibility_form(type))
2150                 continue;
2151         }
2152         /* decode the decomposition into UTF-32 */
2153         i = 0;
2154         while (*s) {
2155             mapping[i] = strtoul(s, &s, 16);
2156             if (!utf32valid(mapping[i]))
2157                 line_fail(data_name, line);
2158             i++;
2159         }
2160         mapping[i++] = 0;
2161
2162         um = malloc(i * sizeof(unsigned int));
2163         memcpy(um, mapping, i * sizeof(unsigned int));
2164         unicode_data[unichar].utf32nfdi = um;
2165
2166         if (verbose > 1)
2167             print_utf32nfdi(unichar);
2168         count++;
2169     }
2170     fclose(file);
2171     if (verbose > 0)
2172         printf("Found %d entries\n", count);
2173     if (count == 0)
2174         file_fail(data_name);
2175 }
2176
2177 static void nfdicf_init(void)
2178 {
2179     FILE *file;
2180     unsigned int unichar;
2181     unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2182     char status;
2183     char *s;
2184     unsigned int *um;
2185     int i;
2186     int count;
2187     int ret;
2188
2189     if (verbose > 0)
2190         printf("Parsing %s\n", fold_name);
2191     file = fopen(fold_name, "r");
2192     if (!file)
2193         open_fail(fold_name, errno);
2194
2195     count = 0;
2196     while (fgets(line, LINESIZE, file)) {
2197         ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0);
2198         if (ret != 3)
2199             continue;
2200         if (!utf32valid(unichar))
2201             line_fail(fold_name, line);
2202         /* Use the C+F casefold. */
2203         if (status != 'C' && status != 'F')
2204             continue;
2205         s = buf0;
2206         if (*s == '<')
2207             while (*s++ != ' ')
2208                 ;
2209         i = 0;
2210         while (*s) {
2211             mapping[i] = strtoul(s, &s, 16);
2212             if (!utf32valid(mapping[i]))
2213                 line_fail(fold_name, line);
2214             i++;
2215         }
2216         mapping[i++] = 0;
2217
2218         um = malloc(i * sizeof(unsigned int));
2219         memcpy(um, mapping, i * sizeof(unsigned int));
2220         unicode_data[unichar].utf32nfdicf = um;
2221
2222         if (verbose > 1)
2223             print_utf32nfdicf(unichar);
2224         count++;
2225     }
2226     fclose(file);
2227     if (verbose > 0)
2228         printf("Found %d entries\n", count);
2229     if (count == 0)
2230         file_fail(fold_name);
2231 }
2232
2233 static void ignore_init(void)
2234 {
2235     FILE *file;
2236     unsigned int unichar;
2237     unsigned int first;
2238     unsigned int last;
2239     unsigned int *um;
2240     int count;
2241     int ret;
2242
2243     if (verbose > 0)
2244         printf("Parsing %s\n", prop_name);
2245     file = fopen(prop_name, "r");
2246     if (!file)
2247         open_fail(prop_name, errno);
2248     assert(file);
2249     count = 0;
2250     while (fgets(line, LINESIZE, file)) {
2251         ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0);
2252         if (ret == 3) {
2253             if (strcmp(buf0, "Default_Ignorable_Code_Point"))
2254                 continue;
2255             if (!utf32valid(first) || !utf32valid(last))
2256                 line_fail(prop_name, line);
2257             for (unichar = first; unichar <= last; unichar++) {
2258                 free(unicode_data[unichar].utf32nfdi);
2259                 um = malloc(sizeof(unsigned int));
2260                 *um = 0;
2261                 unicode_data[unichar].utf32nfdi = um;
2262                 free(unicode_data[unichar].utf32nfdicf);
2263                 um = malloc(sizeof(unsigned int));
2264                 *um = 0;
2265                 unicode_data[unichar].utf32nfdicf = um;
2266                 count++;
2267             }
2268             if (verbose > 1)
2269                 printf(" %X..%X Default_Ignorable_Code_Point\n",
2270                     first, last);
2271             continue;
2272         }
2273         ret = sscanf(line, "%X ; %s # ", &unichar, buf0);
2274         if (ret == 2) {
2275             if (strcmp(buf0, "Default_Ignorable_Code_Point"))
2276                 continue;
2277             if (!utf32valid(unichar))
2278                 line_fail(prop_name, line);
2279             free(unicode_data[unichar].utf32nfdi);
2280             um = malloc(sizeof(unsigned int));
2281             *um = 0;
2282             unicode_data[unichar].utf32nfdi = um;
2283             free(unicode_data[unichar].utf32nfdicf);
2284             um = malloc(sizeof(unsigned int));
2285             *um = 0;
2286             unicode_data[unichar].utf32nfdicf = um;
2287             if (verbose > 1)
2288                 printf(" %X Default_Ignorable_Code_Point\n",
2289                     unichar);
2290             count++;
2291             continue;
2292         }
2293     }
2294     fclose(file);
2295
2296     if (verbose > 0)
2297         printf("Found %d entries\n", count);
2298     if (count == 0)
2299         file_fail(prop_name);
2300 }
2301
2302 static void corrections_init(void)
2303 {
2304     FILE *file;
2305     unsigned int unichar;
2306     unsigned int major;
2307     unsigned int minor;
2308     unsigned int revision;
2309     unsigned int age;
2310     unsigned int *um;
2311     unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2312     char *s;
2313     int i;
2314     int count;
2315     int ret;
2316
2317     if (verbose > 0)
2318         printf("Parsing %s\n", norm_name);
2319     file = fopen(norm_name, "r");
2320     if (!file)
2321         open_fail(norm_name, errno);
2322
2323     count = 0;
2324     while (fgets(line, LINESIZE, file)) {
2325         ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
2326                 &unichar, buf0, buf1,
2327                 &major, &minor, &revision);
2328         if (ret != 6)
2329             continue;
2330         if (!utf32valid(unichar) || !age_valid(major, minor, revision))
2331             line_fail(norm_name, line);
2332         count++;
2333     }
2334     corrections = calloc(count, sizeof(struct unicode_data));
2335     corrections_count = count;
2336     rewind(file);
2337
2338     count = 0;
2339     while (fgets(line, LINESIZE, file)) {
2340         ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
2341                 &unichar, buf0, buf1,
2342                 &major, &minor, &revision);
2343         if (ret != 6)
2344             continue;
2345         if (!utf32valid(unichar) || !age_valid(major, minor, revision))
2346             line_fail(norm_name, line);
2347         corrections[count] = unicode_data[unichar];
2348         assert(corrections[count].code == unichar);
2349         age = UNICODE_AGE(major, minor, revision);
2350         corrections[count].correction = age;
2351
2352         i = 0;
2353         s = buf0;
2354         while (*s) {
2355             mapping[i] = strtoul(s, &s, 16);
2356             if (!utf32valid(mapping[i]))
2357                 line_fail(norm_name, line);
2358             i++;
2359         }
2360         mapping[i++] = 0;
2361
2362         um = malloc(i * sizeof(unsigned int));
2363         memcpy(um, mapping, i * sizeof(unsigned int));
2364         corrections[count].utf32nfdi = um;
2365
2366         if (verbose > 1)
2367             printf(" %X -> %s -> %s V%d_%d_%d\n",
2368                 unichar, buf0, buf1, major, minor, revision);
2369         count++;
2370     }
2371     fclose(file);
2372
2373     if (verbose > 0)
2374             printf("Found %d entries\n", count);
2375     if (count == 0)
2376         file_fail(norm_name);
2377 }
2378
2379 /* ------------------------------------------------------------------ */
2380
2381 /*
2382  * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
2383  *
2384  * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
2385  * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
2386  *
2387  * SBase = 0xAC00
2388  * LBase = 0x1100
2389  * VBase = 0x1161
2390  * TBase = 0x11A7
2391  * LCount = 19
2392  * VCount = 21
2393  * TCount = 28
2394  * NCount = 588 (VCount * TCount)
2395  * SCount = 11172 (LCount * NCount)
2396  *
2397  * Decomposition:
2398  *   SIndex = s - SBase
2399  *
2400  * LV (Canonical/Full)
2401  *   LIndex = SIndex / NCount
2402  *   VIndex = (Sindex % NCount) / TCount
2403  *   LPart = LBase + LIndex
2404  *   VPart = VBase + VIndex
2405  *
2406  * LVT (Canonical)
2407  *   LVIndex = (SIndex / TCount) * TCount
2408  *   TIndex = (Sindex % TCount)
2409  *   LVPart = SBase + LVIndex
2410  *   TPart = TBase + TIndex
2411  *
2412  * LVT (Full)
2413  *   LIndex = SIndex / NCount
2414  *   VIndex = (Sindex % NCount) / TCount
2415  *   TIndex = (Sindex % TCount)
2416  *   LPart = LBase + LIndex
2417  *   VPart = VBase + VIndex
2418  *   if (TIndex == 0) {
2419  *          d = <LPart, VPart>
2420  *   } else {
2421  *          TPart = TBase + TIndex
2422  *          d = <LPart, VPart, TPart>
2423  *   }
2424  *
2425  */
2426
2427 static void hangul_decompose(void)
2428 {
2429     unsigned int sb = 0xAC00;
2430     unsigned int lb = 0x1100;
2431     unsigned int vb = 0x1161;
2432     unsigned int tb = 0x11a7;
2433     /* unsigned int lc = 19; */
2434     unsigned int vc = 21;
2435     unsigned int tc = 28;
2436     unsigned int nc = (vc * tc);
2437     /* unsigned int sc = (lc * nc); */
2438     unsigned int unichar;
2439     unsigned int mapping[4];
2440     unsigned int *um;
2441         int count;
2442     int i;
2443
2444     if (verbose > 0)
2445         printf("Decomposing hangul\n");
2446     /* Hangul */
2447     count = 0;
2448     for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) {
2449         unsigned int si = unichar - sb;
2450         unsigned int li = si / nc;
2451         unsigned int vi = (si % nc) / tc;
2452         unsigned int ti = si % tc;
2453
2454         i = 0;
2455         mapping[i++] = lb + li;
2456         mapping[i++] = vb + vi;
2457         if (ti)
2458             mapping[i++] = tb + ti;
2459         mapping[i++] = 0;
2460
2461         assert(!unicode_data[unichar].utf32nfdi);
2462         um = malloc(i * sizeof(unsigned int));
2463         memcpy(um, mapping, i * sizeof(unsigned int));
2464         unicode_data[unichar].utf32nfdi = um;
2465
2466         assert(!unicode_data[unichar].utf32nfdicf);
2467         um = malloc(i * sizeof(unsigned int));
2468         memcpy(um, mapping, i * sizeof(unsigned int));
2469         unicode_data[unichar].utf32nfdicf = um;
2470
2471         /*
2472          * Add a cookie as a reminder that the hangul syllable
2473          * decompositions must not be stored in the generated
2474          * trie.
2475          */
2476         unicode_data[unichar].utf8nfdi = malloc(2);
2477         unicode_data[unichar].utf8nfdi[0] = HANGUL;
2478         unicode_data[unichar].utf8nfdi[1] = '\0';
2479
2480         if (verbose > 1)
2481             print_utf32nfdi(unichar);
2482
2483         count++;
2484     }
2485     if (verbose > 0)
2486         printf("Created %d entries\n", count);
2487 }
2488
2489 static void nfdi_decompose(void)
2490 {
2491     unsigned int unichar;
2492     unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2493     unsigned int *um;
2494     unsigned int *dc;
2495     int count;
2496     int i;
2497     int j;
2498     int ret;
2499
2500     if (verbose > 0)
2501         printf("Decomposing nfdi\n");
2502
2503     count = 0;
2504     for (unichar = 0; unichar != 0x110000; unichar++) {
2505         if (!unicode_data[unichar].utf32nfdi)
2506             continue;
2507         for (;;) {
2508             ret = 1;
2509             i = 0;
2510             um = unicode_data[unichar].utf32nfdi;
2511             while (*um) {
2512                 dc = unicode_data[*um].utf32nfdi;
2513                 if (dc) {
2514                     for (j = 0; dc[j]; j++)
2515                         mapping[i++] = dc[j];
2516                     ret = 0;
2517                 } else {
2518                     mapping[i++] = *um;
2519                 }
2520                 um++;
2521             }
2522             mapping[i++] = 0;
2523             if (ret)
2524                 break;
2525             free(unicode_data[unichar].utf32nfdi);
2526             um = malloc(i * sizeof(unsigned int));
2527             memcpy(um, mapping, i * sizeof(unsigned int));
2528             unicode_data[unichar].utf32nfdi = um;
2529         }
2530         /* Add this decomposition to nfdicf if there is no entry. */
2531         if (!unicode_data[unichar].utf32nfdicf) {
2532             um = malloc(i * sizeof(unsigned int));
2533             memcpy(um, mapping, i * sizeof(unsigned int));
2534             unicode_data[unichar].utf32nfdicf = um;
2535         }
2536         if (verbose > 1)
2537             print_utf32nfdi(unichar);
2538         count++;
2539     }
2540     if (verbose > 0)
2541         printf("Processed %d entries\n", count);
2542 }
2543
2544 static void nfdicf_decompose(void)
2545 {
2546     unsigned int unichar;
2547     unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2548     unsigned int *um;
2549     unsigned int *dc;
2550     int count;
2551     int i;
2552     int j;
2553     int ret;
2554
2555     if (verbose > 0)
2556         printf("Decomposing nfdicf\n");
2557     count = 0;
2558     for (unichar = 0; unichar != 0x110000; unichar++) {
2559         if (!unicode_data[unichar].utf32nfdicf)
2560             continue;
2561         for (;;) {
2562             ret = 1;
2563             i = 0;
2564             um = unicode_data[unichar].utf32nfdicf;
2565             while (*um) {
2566                 dc = unicode_data[*um].utf32nfdicf;
2567                 if (dc) {
2568                     for (j = 0; dc[j]; j++)
2569                         mapping[i++] = dc[j];
2570                     ret = 0;
2571                 } else {
2572                     mapping[i++] = *um;
2573                 }
2574                 um++;
2575             }
2576             mapping[i++] = 0;
2577             if (ret)
2578                 break;
2579             free(unicode_data[unichar].utf32nfdicf);
2580             um = malloc(i * sizeof(unsigned int));
2581             memcpy(um, mapping, i * sizeof(unsigned int));
2582             unicode_data[unichar].utf32nfdicf = um;
2583         }
2584         if (verbose > 1)
2585             print_utf32nfdicf(unichar);
2586         count++;
2587     }
2588     if (verbose > 0)
2589         printf("Processed %d entries\n", count);
2590 }
2591
2592 /* ------------------------------------------------------------------ */
2593
2594 int utf8agemax(struct tree *, const char *);
2595 int utf8nagemax(struct tree *, const char *, size_t);
2596 int utf8agemin(struct tree *, const char *);
2597 int utf8nagemin(struct tree *, const char *, size_t);
2598 ssize_t utf8len(struct tree *, const char *);
2599 ssize_t utf8nlen(struct tree *, const char *, size_t);
2600 struct utf8cursor;
2601 int utf8cursor(struct utf8cursor *, struct tree *, const char *);
2602 int utf8ncursor(struct utf8cursor *, struct tree *, const char *, size_t);
2603 int utf8byte(struct utf8cursor *);
2604
2605 /*
2606  * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
2607  *
2608  * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
2609  * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
2610  *
2611  * SBase = 0xAC00
2612  * LBase = 0x1100
2613  * VBase = 0x1161
2614  * TBase = 0x11A7
2615  * LCount = 19
2616  * VCount = 21
2617  * TCount = 28
2618  * NCount = 588 (VCount * TCount)
2619  * SCount = 11172 (LCount * NCount)
2620  *
2621  * Decomposition:
2622  *   SIndex = s - SBase
2623  *
2624  * LV (Canonical/Full)
2625  *   LIndex = SIndex / NCount
2626  *   VIndex = (Sindex % NCount) / TCount
2627  *   LPart = LBase + LIndex
2628  *   VPart = VBase + VIndex
2629  *
2630  * LVT (Canonical)
2631  *   LVIndex = (SIndex / TCount) * TCount
2632  *   TIndex = (Sindex % TCount)
2633  *   LVPart = SBase + LVIndex
2634  *   TPart = TBase + TIndex
2635  *
2636  * LVT (Full)
2637  *   LIndex = SIndex / NCount
2638  *   VIndex = (Sindex % NCount) / TCount
2639  *   TIndex = (Sindex % TCount)
2640  *   LPart = LBase + LIndex
2641  *   VPart = VBase + VIndex
2642  *   if (TIndex == 0) {
2643  *          d = <LPart, VPart>
2644  *   } else {
2645  *          TPart = TBase + TIndex
2646  *          d = <LPart, VPart, TPart>
2647  *   }
2648  */
2649
2650 /* Constants */
2651 #define SB  (0xAC00)
2652 #define LB  (0x1100)
2653 #define VB  (0x1161)
2654 #define TB  (0x11A7)
2655 #define LC  (19)
2656 #define VC  (21)
2657 #define TC  (28)
2658 #define NC  (VC * TC)
2659 #define SC  (LC * NC)
2660
2661 /* Algorithmic decomposition of hangul syllable. */
2662 static utf8leaf_t *utf8hangul(const char *str, unsigned char *hangul)
2663 {
2664     unsigned int    si;
2665     unsigned int    li;
2666     unsigned int    vi;
2667     unsigned int    ti;
2668     unsigned char   *h;
2669
2670     /* Calculate the SI, LI, VI, and TI values. */
2671     si = utf8decode(str) - SB;
2672     li = si / NC;
2673     vi = (si % NC) / TC;
2674     ti = si % TC;
2675
2676     /* Fill in base of leaf. */
2677     h = hangul;
2678     LEAF_GEN(h) = 2;
2679     LEAF_CCC(h) = DECOMPOSE;
2680     h += 2;
2681
2682     /* Add LPart, a 3-byte UTF-8 sequence. */
2683     h += utf8encode((char *)h, li + LB);
2684
2685     /* Add VPart, a 3-byte UTF-8 sequence. */
2686     h += utf8encode((char *)h, vi + VB);
2687
2688     /* Add TPart if required, also a 3-byte UTF-8 sequence. */
2689     if (ti)
2690         h += utf8encode((char *)h, ti + TB);
2691
2692     /* Terminate string. */
2693     h[0] = '\0';
2694
2695     return hangul;
2696 }
2697
2698 /*
2699  * Use trie to scan s, touching at most len bytes.
2700  * Returns the leaf if one exists, NULL otherwise.
2701  *
2702  * A non-NULL return guarantees that the UTF-8 sequence starting at s
2703  * is well-formed and corresponds to a known unicode code point.  The
2704  * shorthand for this will be "is valid UTF-8 unicode".
2705  */
2706 static utf8leaf_t *utf8nlookup(struct tree *tree, unsigned char *hangul,
2707                    const char *s, size_t len)
2708 {
2709     utf8trie_t  *trie;
2710     int     offlen;
2711     int     offset;
2712     int     mask;
2713     int     node;
2714
2715     if (!tree)
2716         return NULL;
2717     if (len == 0)
2718         return NULL;
2719     node = 1;
2720     trie = utf8data + tree->index;
2721     while (node) {
2722         offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
2723         if (*trie & NEXTBYTE) {
2724             if (--len == 0)
2725                 return NULL;
2726             s++;
2727         }
2728         mask = 1 << (*trie & BITNUM);
2729         if (*s & mask) {
2730             /* Right leg */
2731             if (offlen) {
2732                 /* Right node at offset of trie */
2733                 node = (*trie & RIGHTNODE);
2734                 offset = trie[offlen];
2735                 while (--offlen) {
2736                     offset <<= 8;
2737                     offset |= trie[offlen];
2738                 }
2739                 trie += offset;
2740             } else if (*trie & RIGHTPATH) {
2741                 /* Right node after this node */
2742                 node = (*trie & TRIENODE);
2743                 trie++;
2744             } else {
2745                 /* No right node. */
2746                 return NULL;
2747             }
2748         } else {
2749             /* Left leg */
2750             if (offlen) {
2751                 /* Left node after this node. */
2752                 node = (*trie & LEFTNODE);
2753                 trie += offlen + 1;
2754             } else if (*trie & RIGHTPATH) {
2755                 /* No left node. */
2756                 return NULL;
2757             } else {
2758                 /* Left node after this node */
2759                 node = (*trie & TRIENODE);
2760                 trie++;
2761             }
2762         }
2763     }
2764     /*
2765      * Hangul decomposition is done algorithmically. These are the
2766      * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
2767      * always 3 bytes long, so s has been advanced twice, and the
2768      * start of the sequence is at s-2.
2769      */
2770     if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
2771         trie = utf8hangul(s - 2, hangul);
2772     return trie;
2773 }
2774
2775 /*
2776  * Use trie to scan s.
2777  * Returns the leaf if one exists, NULL otherwise.
2778  *
2779  * Forwards to trie_nlookup().
2780  */
2781 static utf8leaf_t *utf8lookup(struct tree *tree, unsigned char *hangul,
2782                   const char *s)
2783 {
2784     return utf8nlookup(tree, hangul, s, (size_t)-1);
2785 }
2786
2787 /*
2788  * Return the number of bytes used by the current UTF-8 sequence.
2789  * Assumes the input points to the first byte of a valid UTF-8
2790  * sequence.
2791  */
2792 static inline int utf8clen(const char *s)
2793 {
2794     unsigned char c = *s;
2795     return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
2796 }
2797
2798 /*
2799  * Maximum age of any character in s.
2800  * Return -1 if s is not valid UTF-8 unicode.
2801  * Return 0 if only non-assigned code points are used.
2802  */
2803 int utf8agemax(struct tree *tree, const char *s)
2804 {
2805     utf8leaf_t  *leaf;
2806     int     age = 0;
2807     int     leaf_age;
2808     unsigned char   hangul[UTF8HANGULLEAF];
2809
2810     if (!tree)
2811         return -1;
2812
2813     while (*s) {
2814         leaf = utf8lookup(tree, hangul, s);
2815         if (!leaf)
2816             return -1;
2817         leaf_age = ages[LEAF_GEN(leaf)];
2818         if (leaf_age <= tree->maxage && leaf_age > age)
2819             age = leaf_age;
2820         s += utf8clen(s);
2821     }
2822     return age;
2823 }
2824
2825 /*
2826  * Minimum age of any character in s.
2827  * Return -1 if s is not valid UTF-8 unicode.
2828  * Return 0 if non-assigned code points are used.
2829  */
2830 int utf8agemin(struct tree *tree, const char *s)
2831 {
2832     utf8leaf_t  *leaf;
2833     int     age;
2834     int     leaf_age;
2835     unsigned char   hangul[UTF8HANGULLEAF];
2836
2837     if (!tree)
2838         return -1;
2839     age = tree->maxage;
2840     while (*s) {
2841         leaf = utf8lookup(tree, hangul, s);
2842         if (!leaf)
2843             return -1;
2844         leaf_age = ages[LEAF_GEN(leaf)];
2845         if (leaf_age <= tree->maxage && leaf_age < age)
2846             age = leaf_age;
2847         s += utf8clen(s);
2848     }
2849     return age;
2850 }
2851
2852 /*
2853  * Maximum age of any character in s, touch at most len bytes.
2854  * Return -1 if s is not valid UTF-8 unicode.
2855  */
2856 int utf8nagemax(struct tree *tree, const char *s, size_t len)
2857 {
2858     utf8leaf_t  *leaf;
2859     int     age = 0;
2860     int     leaf_age;
2861     unsigned char   hangul[UTF8HANGULLEAF];
2862
2863     if (!tree)
2864         return -1;
2865
2866         while (len && *s) {
2867         leaf = utf8nlookup(tree, hangul, s, len);
2868         if (!leaf)
2869             return -1;
2870         leaf_age = ages[LEAF_GEN(leaf)];
2871         if (leaf_age <= tree->maxage && leaf_age > age)
2872             age = leaf_age;
2873         len -= utf8clen(s);
2874         s += utf8clen(s);
2875     }
2876     return age;
2877 }
2878
2879 /*
2880  * Maximum age of any character in s, touch at most len bytes.
2881  * Return -1 if s is not valid UTF-8 unicode.
2882  */
2883 int utf8nagemin(struct tree *tree, const char *s, size_t len)
2884 {
2885     utf8leaf_t  *leaf;
2886     int     leaf_age;
2887     int     age;
2888     unsigned char   hangul[UTF8HANGULLEAF];
2889
2890     if (!tree)
2891         return -1;
2892     age = tree->maxage;
2893         while (len && *s) {
2894         leaf = utf8nlookup(tree, hangul, s, len);
2895         if (!leaf)
2896             return -1;
2897         leaf_age = ages[LEAF_GEN(leaf)];
2898         if (leaf_age <= tree->maxage && leaf_age < age)
2899             age = leaf_age;
2900         len -= utf8clen(s);
2901         s += utf8clen(s);
2902     }
2903     return age;
2904 }
2905
2906 /*
2907  * Length of the normalization of s.
2908  * Return -1 if s is not valid UTF-8 unicode.
2909  *
2910  * A string of Default_Ignorable_Code_Point has length 0.
2911  */
2912 ssize_t utf8len(struct tree *tree, const char *s)
2913 {
2914     utf8leaf_t  *leaf;
2915     size_t      ret = 0;
2916     unsigned char   hangul[UTF8HANGULLEAF];
2917
2918     if (!tree)
2919         return -1;
2920     while (*s) {
2921         leaf = utf8lookup(tree, hangul, s);
2922         if (!leaf)
2923             return -1;
2924         if (ages[LEAF_GEN(leaf)] > tree->maxage)
2925             ret += utf8clen(s);
2926         else if (LEAF_CCC(leaf) == DECOMPOSE)
2927             ret += strlen(LEAF_STR(leaf));
2928         else
2929             ret += utf8clen(s);
2930         s += utf8clen(s);
2931     }
2932     return ret;
2933 }
2934
2935 /*
2936  * Length of the normalization of s, touch at most len bytes.
2937  * Return -1 if s is not valid UTF-8 unicode.
2938  */
2939 ssize_t utf8nlen(struct tree *tree, const char *s, size_t len)
2940 {
2941     utf8leaf_t  *leaf;
2942     size_t      ret = 0;
2943     unsigned char   hangul[UTF8HANGULLEAF];
2944
2945     if (!tree)
2946         return -1;
2947     while (len && *s) {
2948         leaf = utf8nlookup(tree, hangul, s, len);
2949         if (!leaf)
2950             return -1;
2951         if (ages[LEAF_GEN(leaf)] > tree->maxage)
2952             ret += utf8clen(s);
2953         else if (LEAF_CCC(leaf) == DECOMPOSE)
2954             ret += strlen(LEAF_STR(leaf));
2955         else
2956             ret += utf8clen(s);
2957         len -= utf8clen(s);
2958         s += utf8clen(s);
2959     }
2960     return ret;
2961 }
2962
2963 /*
2964  * Cursor structure used by the normalizer.
2965  */
2966 struct utf8cursor {
2967     struct tree *tree;
2968     const char  *s;
2969     const char  *p;
2970     const char  *ss;
2971     const char  *sp;
2972     unsigned int    len;
2973     unsigned int    slen;
2974     short int   ccc;
2975     short int   nccc;
2976     unsigned int    unichar;
2977     unsigned char   hangul[UTF8HANGULLEAF];
2978 };
2979
2980 /*
2981  * Set up an utf8cursor for use by utf8byte().
2982  *
2983  *   s      : string.
2984  *   len    : length of s.
2985  *   u8c    : pointer to cursor.
2986  *   trie   : utf8trie_t to use for normalization.
2987  *
2988  * Returns -1 on error, 0 on success.
2989  */
2990 int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
2991         size_t len)
2992 {
2993     if (!tree)
2994         return -1;
2995     if (!s)
2996         return -1;
2997     u8c->tree = tree;
2998     u8c->s = s;
2999     u8c->p = NULL;
3000     u8c->ss = NULL;
3001     u8c->sp = NULL;
3002     u8c->len = len;
3003     u8c->slen = 0;
3004     u8c->ccc = STOPPER;
3005     u8c->nccc = STOPPER;
3006     u8c->unichar = 0;
3007     /* Check we didn't clobber the maximum length. */
3008     if (u8c->len != len)
3009         return -1;
3010     /* The first byte of s may not be an utf8 continuation. */
3011     if (len > 0 && (*s & 0xC0) == 0x80)
3012         return -1;
3013     return 0;
3014 }
3015
3016 /*
3017  * Set up an utf8cursor for use by utf8byte().
3018  *
3019  *   s      : NUL-terminated string.
3020  *   u8c    : pointer to cursor.
3021  *   trie   : utf8trie_t to use for normalization.
3022  *
3023  * Returns -1 on error, 0 on success.
3024  */
3025 int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s)
3026 {
3027     return utf8ncursor(u8c, tree, s, (unsigned int)-1);
3028 }
3029
3030 /*
3031  * Get one byte from the normalized form of the string described by u8c.
3032  *
3033  * Returns the byte cast to an unsigned char on succes, and -1 on failure.
3034  *
3035  * The cursor keeps track of the location in the string in u8c->s.
3036  * When a character is decomposed, the current location is stored in
3037  * u8c->p, and u8c->s is set to the start of the decomposition. Note
3038  * that bytes from a decomposition do not count against u8c->len.
3039  *
3040  * Characters are emitted if they match the current CCC in u8c->ccc.
3041  * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
3042  * and the function returns 0 in that case.
3043  *
3044  * Sorting by CCC is done by repeatedly scanning the string.  The
3045  * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
3046  * the start of the scan.  The first pass finds the lowest CCC to be
3047  * emitted and stores it in u8c->nccc, the second pass emits the
3048  * characters with this CCC and finds the next lowest CCC. This limits
3049  * the number of passes to 1 + the number of different CCCs in the
3050  * sequence being scanned.
3051  *
3052  * Therefore:
3053  *  u8c->p  != NULL -> a decomposition is being scanned.
3054  *  u8c->ss != NULL -> this is a repeating scan.
3055  *  u8c->ccc == -1  -> this is the first scan of a repeating scan.
3056  */
3057 int utf8byte(struct utf8cursor *u8c)
3058 {
3059     utf8leaf_t *leaf;
3060     int ccc;
3061
3062     for (;;) {
3063         /* Check for the end of a decomposed character. */
3064         if (u8c->p && *u8c->s == '\0') {
3065             u8c->s = u8c->p;
3066             u8c->p = NULL;
3067         }
3068
3069         /* Check for end-of-string. */
3070         if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
3071             /* There is no next byte. */
3072             if (u8c->ccc == STOPPER)
3073                 return 0;
3074             /* End-of-string during a scan counts as a stopper. */
3075             ccc = STOPPER;
3076             goto ccc_mismatch;
3077         } else if ((*u8c->s & 0xC0) == 0x80) {
3078             /* This is a continuation of the current character. */
3079             if (!u8c->p)
3080                 u8c->len--;
3081             return (unsigned char)*u8c->s++;
3082         }
3083
3084         /* Look up the data for the current character. */
3085         if (u8c->p) {
3086             leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
3087         } else {
3088             leaf = utf8nlookup(u8c->tree, u8c->hangul,
3089                        u8c->s, u8c->len);
3090         }
3091
3092         /* No leaf found implies that the input is a binary blob. */
3093         if (!leaf)
3094             return -1;
3095
3096         /* Characters that are too new have CCC 0. */
3097         if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) {
3098             ccc = STOPPER;
3099         } else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) {
3100             u8c->len -= utf8clen(u8c->s);
3101             u8c->p = u8c->s + utf8clen(u8c->s);
3102             u8c->s = LEAF_STR(leaf);
3103             /* Empty decomposition implies CCC 0. */
3104             if (*u8c->s == '\0') {
3105                 if (u8c->ccc == STOPPER)
3106                     continue;
3107                 ccc = STOPPER;
3108                 goto ccc_mismatch;
3109             }
3110             leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
3111             ccc = LEAF_CCC(leaf);
3112         }
3113         u8c->unichar = utf8decode(u8c->s);
3114
3115         /*
3116          * If this is not a stopper, then see if it updates
3117          * the next canonical class to be emitted.
3118          */
3119         if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
3120             u8c->nccc = ccc;
3121
3122         /*
3123          * Return the current byte if this is the current
3124          * combining class.
3125          */
3126         if (ccc == u8c->ccc) {
3127             if (!u8c->p)
3128                 u8c->len--;
3129             return (unsigned char)*u8c->s++;
3130         }
3131
3132         /* Current combining class mismatch. */
3133     ccc_mismatch:
3134         if (u8c->nccc == STOPPER) {
3135             /*
3136              * Scan forward for the first canonical class
3137              * to be emitted.  Save the position from
3138              * which to restart.
3139              */
3140             assert(u8c->ccc == STOPPER);
3141             u8c->ccc = MINCCC - 1;
3142             u8c->nccc = ccc;
3143             u8c->sp = u8c->p;
3144             u8c->ss = u8c->s;
3145             u8c->slen = u8c->len;
3146             if (!u8c->p)
3147                 u8c->len -= utf8clen(u8c->s);
3148             u8c->s += utf8clen(u8c->s);
3149         } else if (ccc != STOPPER) {
3150             /* Not a stopper, and not the ccc we're emitting. */
3151             if (!u8c->p)
3152                 u8c->len -= utf8clen(u8c->s);
3153             u8c->s += utf8clen(u8c->s);
3154         } else if (u8c->nccc != MAXCCC + 1) {
3155             /* At a stopper, restart for next ccc. */
3156             u8c->ccc = u8c->nccc;
3157             u8c->nccc = MAXCCC + 1;
3158             u8c->s = u8c->ss;
3159             u8c->p = u8c->sp;
3160             u8c->len = u8c->slen;
3161         } else {
3162             /* All done, proceed from here. */
3163             u8c->ccc = STOPPER;
3164             u8c->nccc = STOPPER;
3165             u8c->sp = NULL;
3166             u8c->ss = NULL;
3167             u8c->slen = 0;
3168         }
3169     }
3170 }
3171
3172 /* ------------------------------------------------------------------ */
3173
3174 static int normalize_line(struct tree *tree)
3175 {
3176     char *s;
3177     char *t;
3178     int c;
3179     struct utf8cursor u8c;
3180
3181     /* First test: null-terminated string. */
3182     s = buf2;
3183     t = buf3;
3184     if (utf8cursor(&u8c, tree, s))
3185         return -1;
3186     while ((c = utf8byte(&u8c)) > 0)
3187         if (c != (unsigned char)*t++)
3188             return -1;
3189     if (c < 0)
3190         return -1;
3191     if (*t != 0)
3192         return -1;
3193
3194     /* Second test: length-limited string. */
3195     s = buf2;
3196     /* Replace NUL with a value that will cause an error if seen. */
3197     s[strlen(s) + 1] = -1;
3198     t = buf3;
3199     if (utf8cursor(&u8c, tree, s))
3200         return -1;
3201     while ((c = utf8byte(&u8c)) > 0)
3202         if (c != (unsigned char)*t++)
3203             return -1;
3204     if (c < 0)
3205         return -1;
3206     if (*t != 0)
3207         return -1;
3208
3209     return 0;
3210 }
3211
3212 static void normalization_test(void)
3213 {
3214     FILE *file;
3215     unsigned int unichar;
3216     struct unicode_data *data;
3217     char *s;
3218     char *t;
3219     int ret;
3220     int ignorables;
3221     int tests = 0;
3222     int failures = 0;
3223
3224     if (verbose > 0)
3225         printf("Parsing %s\n", test_name);
3226     /* Step one, read data from file. */
3227     file = fopen(test_name, "r");
3228     if (!file)
3229         open_fail(test_name, errno);
3230
3231     while (fgets(line, LINESIZE, file)) {
3232         ret = sscanf(line, "%[^;];%*[^;];%[^;];%*[^;];%*[^;];",
3233                  buf0, buf1);
3234         if (ret != 2 || *line == '#')
3235             continue;
3236         s = buf0;
3237         t = buf2;
3238         while (*s) {
3239             unichar = strtoul(s, &s, 16);
3240             t += utf8encode(t, unichar);
3241         }
3242         *t = '\0';
3243
3244         ignorables = 0;
3245         s = buf1;
3246         t = buf3;
3247         while (*s) {
3248             unichar = strtoul(s, &s, 16);
3249             data = &unicode_data[unichar];
3250             if (data->utf8nfdi && !*data->utf8nfdi)
3251                 ignorables = 1;
3252             else
3253                 t += utf8encode(t, unichar);
3254         }
3255         *t = '\0';
3256
3257         tests++;
3258         if (normalize_line(nfdi_tree) < 0) {
3259             printf("Line %s -> %s", buf0, buf1);
3260             if (ignorables)
3261                 printf(" (ignorables removed)");
3262             printf(" failure\n");
3263             failures++;
3264         }
3265     }
3266     fclose(file);
3267     if (verbose > 0)
3268         printf("Ran %d tests with %d failures\n", tests, failures);
3269     if (failures)
3270         file_fail(test_name);
3271 }
3272
3273 /* ------------------------------------------------------------------ */
3274
3275 static void write_file(void)
3276 {
3277     FILE *file;
3278     int i;
3279     int j;
3280     int t;
3281     int gen;
3282
3283     if (verbose > 0)
3284         printf("Writing %s\n", utf8_name);
3285     file = fopen(utf8_name, "w");
3286     if (!file)
3287         open_fail(utf8_name, errno);
3288
3289     fprintf(file, "/* This file is generated code, do not edit. */\n");
3290     fprintf(file, "\n");
3291     fprintf(file, "#include <linux/module.h>\n");
3292     fprintf(file, "#include <linux/kernel.h>\n");
3293     fprintf(file, "#include \"utf8n.h\"\n");
3294     fprintf(file, "\n");
3295     fprintf(file, "static const unsigned int utf8agetab[] = {\n");
3296     for (i = 0; i != ages_count; i++)
3297         fprintf(file, "\t%#x%s\n", ages[i],
3298             ages[i] == unicode_maxage ? "" : ",");
3299     fprintf(file, "};\n");
3300     fprintf(file, "\n");
3301     fprintf(file, "static const struct utf8data utf8nfdicfdata[] = {\n");
3302     t = 0;
3303     for (gen = 0; gen < ages_count; gen++) {
3304         fprintf(file, "\t{ %#x, %d }%s\n",
3305             ages[gen], trees[t].index,
3306             ages[gen] == unicode_maxage ? "" : ",");
3307         if (trees[t].maxage == ages[gen])
3308             t += 2;
3309     }
3310     fprintf(file, "};\n");
3311     fprintf(file, "\n");
3312     fprintf(file, "static const struct utf8data utf8nfdidata[] = {\n");
3313     t = 1;
3314     for (gen = 0; gen < ages_count; gen++) {
3315         fprintf(file, "\t{ %#x, %d }%s\n",
3316             ages[gen], trees[t].index,
3317             ages[gen] == unicode_maxage ? "" : ",");
3318         if (trees[t].maxage == ages[gen])
3319             t += 2;
3320     }
3321     fprintf(file, "};\n");
3322     fprintf(file, "\n");
3323     fprintf(file, "static const unsigned char utf8data[%zd] = {\n",
3324         utf8data_size);
3325     t = 0;
3326     for (i = 0; i != utf8data_size; i += 16) {
3327         if (i == trees[t].index) {
3328             fprintf(file, "\t/* %s_%x */\n",
3329                 trees[t].type, trees[t].maxage);
3330             if (t < trees_count-1)
3331                 t++;
3332         }
3333         fprintf(file, "\t");
3334         for (j = i; j != i + 16; j++)
3335             fprintf(file, "0x%.2x%s", utf8data[j],
3336                 (j < utf8data_size -1 ? "," : ""));
3337         fprintf(file, "\n");
3338     }
3339     fprintf(file, "};\n");
3340     fprintf(file, "\n");
3341     fprintf(file, "struct utf8data_table utf8_data_table = {\n");
3342     fprintf(file, "\t.utf8agetab = utf8agetab,\n");
3343     fprintf(file, "\t.utf8agetab_size = ARRAY_SIZE(utf8agetab),\n");
3344     fprintf(file, "\n");
3345     fprintf(file, "\t.utf8nfdicfdata = utf8nfdicfdata,\n");
3346     fprintf(file, "\t.utf8nfdicfdata_size = ARRAY_SIZE(utf8nfdicfdata),\n");
3347     fprintf(file, "\n");
3348     fprintf(file, "\t.utf8nfdidata = utf8nfdidata,\n");
3349     fprintf(file, "\t.utf8nfdidata_size = ARRAY_SIZE(utf8nfdidata),\n");
3350     fprintf(file, "\n");
3351     fprintf(file, "\t.utf8data = utf8data,\n");
3352     fprintf(file, "};\n");
3353     fprintf(file, "EXPORT_SYMBOL_GPL(utf8_data_table);");
3354     fprintf(file, "\n");
3355     fprintf(file, "MODULE_LICENSE(\"GPL v2\");\n");
3356     fclose(file);
3357 }
3358
3359 /* ------------------------------------------------------------------ */
3360
3361 int main(int argc, char *argv[])
3362 {
3363     unsigned int unichar;
3364     int opt;
3365
3366     argv0 = argv[0];
3367
3368     while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) {
3369         switch (opt) {
3370         case 'a':
3371             age_name = optarg;
3372             break;
3373         case 'c':
3374             ccc_name = optarg;
3375             break;
3376         case 'd':
3377             data_name = optarg;
3378             break;
3379         case 'f':
3380             fold_name = optarg;
3381             break;
3382         case 'n':
3383             norm_name = optarg;
3384             break;
3385         case 'o':
3386             utf8_name = optarg;
3387             break;
3388         case 'p':
3389             prop_name = optarg;
3390             break;
3391         case 't':
3392             test_name = optarg;
3393             break;
3394         case 'v':
3395             verbose++;
3396             break;
3397         case 'h':
3398             help();
3399             exit(0);
3400         default:
3401             usage();
3402         }
3403     }
3404
3405     if (verbose > 1)
3406         help();
3407     for (unichar = 0; unichar != 0x110000; unichar++)
3408         unicode_data[unichar].code = unichar;
3409     age_init();
3410     ccc_init();
3411     nfdi_init();
3412     nfdicf_init();
3413     ignore_init();
3414     corrections_init();
3415     hangul_decompose();
3416     nfdi_decompose();
3417     nfdicf_decompose();
3418     utf8_init();
3419     trees_init();
3420     trees_populate();
3421     trees_reduce();
3422     trees_verify();
3423     /* Prevent "unused function" warning. */
3424     (void)lookup(nfdi_tree, " ");
3425     if (verbose > 2)
3426         tree_walk(nfdi_tree);
3427     if (verbose > 2)
3428         tree_walk(nfdicf_tree);
3429     normalization_test();
3430     write_file();
3431
3432     return 0;
3433 }