0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021 #include <sys/types.h>
0022 #include <stddef.h>
0023 #include <stdlib.h>
0024 #include <stdio.h>
0025 #include <assert.h>
0026 #include <string.h>
0027 #include <unistd.h>
0028 #include <errno.h>
0029
0030
0031
0032 #define AGE_NAME "DerivedAge.txt"
0033 #define CCC_NAME "DerivedCombiningClass.txt"
0034 #define PROP_NAME "DerivedCoreProperties.txt"
0035 #define DATA_NAME "UnicodeData.txt"
0036 #define FOLD_NAME "CaseFolding.txt"
0037 #define NORM_NAME "NormalizationCorrections.txt"
0038 #define TEST_NAME "NormalizationTest.txt"
0039 #define UTF8_NAME "utf8data.h"
0040
0041 const char *age_name = AGE_NAME;
0042 const char *ccc_name = CCC_NAME;
0043 const char *prop_name = PROP_NAME;
0044 const char *data_name = DATA_NAME;
0045 const char *fold_name = FOLD_NAME;
0046 const char *norm_name = NORM_NAME;
0047 const char *test_name = TEST_NAME;
0048 const char *utf8_name = UTF8_NAME;
0049
0050 int verbose = 0;
0051
0052
0053
0054 #define LINESIZE 1024
0055 char line[LINESIZE];
0056 char buf0[LINESIZE];
0057 char buf1[LINESIZE];
0058 char buf2[LINESIZE];
0059 char buf3[LINESIZE];
0060
0061 const char *argv0;
0062
0063 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
0064
0065
0066
0067
0068
0069
0070
0071
0072
0073
0074
0075
0076
0077 #define UNICODE_MAJ_SHIFT (16)
0078 #define UNICODE_MIN_SHIFT (8)
0079
0080 #define UNICODE_MAJ_MAX ((unsigned short)-1)
0081 #define UNICODE_MIN_MAX ((unsigned char)-1)
0082 #define UNICODE_REV_MAX ((unsigned char)-1)
0083
0084 #define UNICODE_AGE(MAJ,MIN,REV) \
0085 (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) | \
0086 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) | \
0087 ((unsigned int)(REV)))
0088
0089 unsigned int *ages;
0090 int ages_count;
0091
0092 unsigned int unicode_maxage;
0093
0094 static int age_valid(unsigned int major, unsigned int minor,
0095 unsigned int revision)
0096 {
0097 if (major > UNICODE_MAJ_MAX)
0098 return 0;
0099 if (minor > UNICODE_MIN_MAX)
0100 return 0;
0101 if (revision > UNICODE_REV_MAX)
0102 return 0;
0103 return 1;
0104 }
0105
0106
0107
0108
0109
0110
0111
0112
0113
0114
0115
0116
0117
0118
0119
0120
0121
0122
0123
0124
0125
0126
0127
0128
0129
0130
0131
0132 typedef unsigned char utf8trie_t;
0133 #define BITNUM 0x07
0134 #define NEXTBYTE 0x08
0135 #define OFFLEN 0x30
0136 #define OFFLEN_SHIFT 4
0137 #define RIGHTPATH 0x40
0138 #define TRIENODE 0x80
0139 #define RIGHTNODE 0x40
0140 #define LEFTNODE 0x80
0141
0142
0143
0144
0145
0146
0147
0148
0149
0150
0151
0152
0153
0154
0155
0156
0157
0158
0159
0160
0161
0162
0163
0164
0165
0166
0167
0168
0169
0170
0171
0172
0173 typedef unsigned char utf8leaf_t;
0174
0175 #define LEAF_GEN(LEAF) ((LEAF)[0])
0176 #define LEAF_CCC(LEAF) ((LEAF)[1])
0177 #define LEAF_STR(LEAF) ((const char*)((LEAF) + 2))
0178
0179 #define MAXGEN (255)
0180
0181 #define MINCCC (0)
0182 #define MAXCCC (254)
0183 #define STOPPER (0)
0184 #define DECOMPOSE (255)
0185 #define HANGUL ((char)(255))
0186
0187 #define UTF8HANGULLEAF (12)
0188
0189 struct tree;
0190 static utf8leaf_t *utf8nlookup(struct tree *, unsigned char *,
0191 const char *, size_t);
0192 static utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *);
0193
0194 unsigned char *utf8data;
0195 size_t utf8data_size;
0196
0197 utf8trie_t *nfdi;
0198 utf8trie_t *nfdicf;
0199
0200
0201
0202
0203
0204
0205
0206
0207
0208
0209
0210
0211
0212
0213
0214
0215
0216
0217
0218
0219
0220
0221
0222
0223
0224
0225
0226
0227
0228
0229
0230
0231
0232
0233
0234
0235
0236
0237
0238
0239
0240
0241
0242
0243
0244
0245
0246
0247
0248
0249
0250
0251 #define UTF8_2_BITS 0xC0
0252 #define UTF8_3_BITS 0xE0
0253 #define UTF8_4_BITS 0xF0
0254 #define UTF8_N_BITS 0x80
0255 #define UTF8_2_MASK 0xE0
0256 #define UTF8_3_MASK 0xF0
0257 #define UTF8_4_MASK 0xF8
0258 #define UTF8_N_MASK 0xC0
0259 #define UTF8_V_MASK 0x3F
0260 #define UTF8_V_SHIFT 6
0261
0262 static int utf8encode(char *str, unsigned int val)
0263 {
0264 int len;
0265
0266 if (val < 0x80) {
0267 str[0] = val;
0268 len = 1;
0269 } else if (val < 0x800) {
0270 str[1] = val & UTF8_V_MASK;
0271 str[1] |= UTF8_N_BITS;
0272 val >>= UTF8_V_SHIFT;
0273 str[0] = val;
0274 str[0] |= UTF8_2_BITS;
0275 len = 2;
0276 } else if (val < 0x10000) {
0277 str[2] = val & UTF8_V_MASK;
0278 str[2] |= UTF8_N_BITS;
0279 val >>= UTF8_V_SHIFT;
0280 str[1] = val & UTF8_V_MASK;
0281 str[1] |= UTF8_N_BITS;
0282 val >>= UTF8_V_SHIFT;
0283 str[0] = val;
0284 str[0] |= UTF8_3_BITS;
0285 len = 3;
0286 } else if (val < 0x110000) {
0287 str[3] = val & UTF8_V_MASK;
0288 str[3] |= UTF8_N_BITS;
0289 val >>= UTF8_V_SHIFT;
0290 str[2] = val & UTF8_V_MASK;
0291 str[2] |= UTF8_N_BITS;
0292 val >>= UTF8_V_SHIFT;
0293 str[1] = val & UTF8_V_MASK;
0294 str[1] |= UTF8_N_BITS;
0295 val >>= UTF8_V_SHIFT;
0296 str[0] = val;
0297 str[0] |= UTF8_4_BITS;
0298 len = 4;
0299 } else {
0300 printf("%#x: illegal val\n", val);
0301 len = 0;
0302 }
0303 return len;
0304 }
0305
0306 static unsigned int utf8decode(const char *str)
0307 {
0308 const unsigned char *s = (const unsigned char*)str;
0309 unsigned int unichar = 0;
0310
0311 if (*s < 0x80) {
0312 unichar = *s;
0313 } else if (*s < UTF8_3_BITS) {
0314 unichar = *s++ & 0x1F;
0315 unichar <<= UTF8_V_SHIFT;
0316 unichar |= *s & 0x3F;
0317 } else if (*s < UTF8_4_BITS) {
0318 unichar = *s++ & 0x0F;
0319 unichar <<= UTF8_V_SHIFT;
0320 unichar |= *s++ & 0x3F;
0321 unichar <<= UTF8_V_SHIFT;
0322 unichar |= *s & 0x3F;
0323 } else {
0324 unichar = *s++ & 0x0F;
0325 unichar <<= UTF8_V_SHIFT;
0326 unichar |= *s++ & 0x3F;
0327 unichar <<= UTF8_V_SHIFT;
0328 unichar |= *s++ & 0x3F;
0329 unichar <<= UTF8_V_SHIFT;
0330 unichar |= *s & 0x3F;
0331 }
0332 return unichar;
0333 }
0334
0335 static int utf32valid(unsigned int unichar)
0336 {
0337 return unichar < 0x110000;
0338 }
0339
0340 #define HANGUL_SYLLABLE(U) ((U) >= 0xAC00 && (U) <= 0xD7A3)
0341
0342 #define NODE 1
0343 #define LEAF 0
0344
0345 struct tree {
0346 void *root;
0347 int childnode;
0348 const char *type;
0349 unsigned int maxage;
0350 struct tree *next;
0351 int (*leaf_equal)(void *, void *);
0352 void (*leaf_print)(void *, int);
0353 int (*leaf_mark)(void *);
0354 int (*leaf_size)(void *);
0355 int *(*leaf_index)(struct tree *, void *);
0356 unsigned char *(*leaf_emit)(void *, unsigned char *);
0357 int leafindex[0x110000];
0358 int index;
0359 };
0360
0361 struct node {
0362 int index;
0363 int offset;
0364 int mark;
0365 int size;
0366 struct node *parent;
0367 void *left;
0368 void *right;
0369 unsigned char bitnum;
0370 unsigned char nextbyte;
0371 unsigned char leftnode;
0372 unsigned char rightnode;
0373 unsigned int keybits;
0374 unsigned int keymask;
0375 };
0376
0377
0378
0379
0380 static void *lookup(struct tree *tree, const char *key)
0381 {
0382 struct node *node;
0383 void *leaf = NULL;
0384
0385 node = tree->root;
0386 while (!leaf && node) {
0387 if (node->nextbyte)
0388 key++;
0389 if (*key & (1 << (node->bitnum & 7))) {
0390
0391 if (node->rightnode == NODE) {
0392 node = node->right;
0393 } else if (node->rightnode == LEAF) {
0394 leaf = node->right;
0395 } else {
0396 node = NULL;
0397 }
0398 } else {
0399
0400 if (node->leftnode == NODE) {
0401 node = node->left;
0402 } else if (node->leftnode == LEAF) {
0403 leaf = node->left;
0404 } else {
0405 node = NULL;
0406 }
0407 }
0408 }
0409
0410 return leaf;
0411 }
0412
0413
0414
0415
0416
0417 static void tree_walk(struct tree *tree)
0418 {
0419 struct node *node;
0420 unsigned int leftmask;
0421 unsigned int rightmask;
0422 unsigned int bitmask;
0423 int indent = 1;
0424 int nodes, singletons, leaves;
0425
0426 nodes = singletons = leaves = 0;
0427
0428 printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root);
0429 if (tree->childnode == LEAF) {
0430 assert(tree->root);
0431 tree->leaf_print(tree->root, indent);
0432 leaves = 1;
0433 } else {
0434 assert(tree->childnode == NODE);
0435 node = tree->root;
0436 leftmask = rightmask = 0;
0437 while (node) {
0438 printf("%*snode @ %p bitnum %d nextbyte %d"
0439 " left %p right %p mask %x bits %x\n",
0440 indent, "", node,
0441 node->bitnum, node->nextbyte,
0442 node->left, node->right,
0443 node->keymask, node->keybits);
0444 nodes += 1;
0445 if (!(node->left && node->right))
0446 singletons += 1;
0447
0448 while (node) {
0449 bitmask = 1 << node->bitnum;
0450 if ((leftmask & bitmask) == 0) {
0451 leftmask |= bitmask;
0452 if (node->leftnode == LEAF) {
0453 assert(node->left);
0454 tree->leaf_print(node->left,
0455 indent+1);
0456 leaves += 1;
0457 } else if (node->left) {
0458 assert(node->leftnode == NODE);
0459 indent += 1;
0460 node = node->left;
0461 break;
0462 }
0463 }
0464 if ((rightmask & bitmask) == 0) {
0465 rightmask |= bitmask;
0466 if (node->rightnode == LEAF) {
0467 assert(node->right);
0468 tree->leaf_print(node->right,
0469 indent+1);
0470 leaves += 1;
0471 } else if (node->right) {
0472 assert(node->rightnode == NODE);
0473 indent += 1;
0474 node = node->right;
0475 break;
0476 }
0477 }
0478 leftmask &= ~bitmask;
0479 rightmask &= ~bitmask;
0480 node = node->parent;
0481 indent -= 1;
0482 }
0483 }
0484 }
0485 printf("nodes %d leaves %d singletons %d\n",
0486 nodes, leaves, singletons);
0487 }
0488
0489
0490
0491
0492 static struct node *alloc_node(struct node *parent)
0493 {
0494 struct node *node;
0495 int bitnum;
0496
0497 node = malloc(sizeof(*node));
0498 node->left = node->right = NULL;
0499 node->parent = parent;
0500 node->leftnode = NODE;
0501 node->rightnode = NODE;
0502 node->keybits = 0;
0503 node->keymask = 0;
0504 node->mark = 0;
0505 node->index = 0;
0506 node->offset = -1;
0507 node->size = 4;
0508
0509 if (node->parent) {
0510 bitnum = parent->bitnum;
0511 if ((bitnum & 7) == 0) {
0512 node->bitnum = bitnum + 7 + 8;
0513 node->nextbyte = 1;
0514 } else {
0515 node->bitnum = bitnum - 1;
0516 node->nextbyte = 0;
0517 }
0518 } else {
0519 node->bitnum = 7;
0520 node->nextbyte = 0;
0521 }
0522
0523 return node;
0524 }
0525
0526
0527
0528
0529
0530
0531
0532
0533 static int insert(struct tree *tree, char *key, int keylen, void *leaf)
0534 {
0535 struct node *node;
0536 struct node *parent;
0537 void **cursor;
0538 int keybits;
0539
0540 assert(keylen >= 1 && keylen <= 4);
0541
0542 node = NULL;
0543 cursor = &tree->root;
0544 keybits = 8 * keylen;
0545
0546
0547 while (keybits) {
0548 if (!*cursor)
0549 *cursor = alloc_node(node);
0550 node = *cursor;
0551 if (node->nextbyte)
0552 key++;
0553 if (*key & (1 << (node->bitnum & 7)))
0554 cursor = &node->right;
0555 else
0556 cursor = &node->left;
0557 keybits--;
0558 }
0559 *cursor = leaf;
0560
0561
0562 while (node) {
0563 if (*key & (1 << (node->bitnum & 7)))
0564 node->rightnode = LEAF;
0565 else
0566 node->leftnode = LEAF;
0567 if (node->nextbyte)
0568 break;
0569 if (node->leftnode == NODE || node->rightnode == NODE)
0570 break;
0571 assert(node->left);
0572 assert(node->right);
0573
0574 if (! tree->leaf_equal(node->left, node->right))
0575 break;
0576
0577 leaf = node->left;
0578
0579 parent = node->parent;
0580 if (!parent) {
0581
0582 tree->root = leaf;
0583 tree->childnode = LEAF;
0584 } else if (parent->left == node) {
0585 parent->left = leaf;
0586 parent->leftnode = LEAF;
0587 if (parent->right) {
0588 parent->keymask = 0;
0589 parent->keybits = 0;
0590 } else {
0591 parent->keymask |= (1 << node->bitnum);
0592 }
0593 } else if (parent->right == node) {
0594 parent->right = leaf;
0595 parent->rightnode = LEAF;
0596 if (parent->left) {
0597 parent->keymask = 0;
0598 parent->keybits = 0;
0599 } else {
0600 parent->keymask |= (1 << node->bitnum);
0601 parent->keybits |= (1 << node->bitnum);
0602 }
0603 } else {
0604
0605 assert(0);
0606 }
0607 free(node);
0608 node = parent;
0609 }
0610
0611
0612 while (node) {
0613 parent = node->parent;
0614 if (!parent)
0615 break;
0616
0617 if (node->keymask == 0) {
0618 parent->keymask = 0;
0619 parent->keybits = 0;
0620 } else if (parent->left && parent->right) {
0621 parent->keymask = 0;
0622 parent->keybits = 0;
0623 } else {
0624 assert((parent->keymask & node->keymask) == 0);
0625 parent->keymask |= node->keymask;
0626 parent->keymask |= (1 << parent->bitnum);
0627 parent->keybits |= node->keybits;
0628 if (parent->right)
0629 parent->keybits |= (1 << parent->bitnum);
0630 }
0631 node = parent;
0632 }
0633
0634 return 0;
0635 }
0636
0637
0638
0639
0640
0641
0642
0643
0644
0645
0646
0647
0648
0649
0650
0651
0652
0653
0654 static void prune(struct tree *tree)
0655 {
0656 struct node *node;
0657 struct node *left;
0658 struct node *right;
0659 struct node *parent;
0660 void *leftleaf;
0661 void *rightleaf;
0662 unsigned int leftmask;
0663 unsigned int rightmask;
0664 unsigned int bitmask;
0665 int count;
0666
0667 if (verbose > 0)
0668 printf("Pruning %s_%x\n", tree->type, tree->maxage);
0669
0670 count = 0;
0671 if (tree->childnode == LEAF)
0672 return;
0673 if (!tree->root)
0674 return;
0675
0676 leftmask = rightmask = 0;
0677 node = tree->root;
0678 while (node) {
0679 if (node->nextbyte)
0680 goto advance;
0681 if (node->leftnode == LEAF)
0682 goto advance;
0683 if (node->rightnode == LEAF)
0684 goto advance;
0685 if (!node->left)
0686 goto advance;
0687 if (!node->right)
0688 goto advance;
0689 left = node->left;
0690 right = node->right;
0691 if (left->keymask == 0)
0692 goto advance;
0693 if (right->keymask == 0)
0694 goto advance;
0695 if (left->keymask != right->keymask)
0696 goto advance;
0697 if (left->keybits != right->keybits)
0698 goto advance;
0699 leftleaf = NULL;
0700 while (!leftleaf) {
0701 assert(left->left || left->right);
0702 if (left->leftnode == LEAF)
0703 leftleaf = left->left;
0704 else if (left->rightnode == LEAF)
0705 leftleaf = left->right;
0706 else if (left->left)
0707 left = left->left;
0708 else if (left->right)
0709 left = left->right;
0710 else
0711 assert(0);
0712 }
0713 rightleaf = NULL;
0714 while (!rightleaf) {
0715 assert(right->left || right->right);
0716 if (right->leftnode == LEAF)
0717 rightleaf = right->left;
0718 else if (right->rightnode == LEAF)
0719 rightleaf = right->right;
0720 else if (right->left)
0721 right = right->left;
0722 else if (right->right)
0723 right = right->right;
0724 else
0725 assert(0);
0726 }
0727 if (! tree->leaf_equal(leftleaf, rightleaf))
0728 goto advance;
0729
0730
0731
0732
0733 parent = node->parent;
0734 left = node->left;
0735 right = node->right;
0736 if (parent->left == node)
0737 parent->left = left;
0738 else if (parent->right == node)
0739 parent->right = left;
0740 else
0741 assert(0);
0742 left->parent = parent;
0743 left->keymask |= (1 << node->bitnum);
0744 node->left = NULL;
0745 while (node) {
0746 bitmask = 1 << node->bitnum;
0747 leftmask &= ~bitmask;
0748 rightmask &= ~bitmask;
0749 if (node->leftnode == NODE && node->left) {
0750 left = node->left;
0751 free(node);
0752 count++;
0753 node = left;
0754 } else if (node->rightnode == NODE && node->right) {
0755 right = node->right;
0756 free(node);
0757 count++;
0758 node = right;
0759 } else {
0760 node = NULL;
0761 }
0762 }
0763
0764 node = parent;
0765
0766 bitmask = 1 << node->bitnum;
0767 leftmask &= ~bitmask;
0768 rightmask &= ~bitmask;
0769 for (;;) {
0770 if (node->left && node->right)
0771 break;
0772 if (node->left) {
0773 left = node->left;
0774 node->keymask |= left->keymask;
0775 node->keybits |= left->keybits;
0776 }
0777 if (node->right) {
0778 right = node->right;
0779 node->keymask |= right->keymask;
0780 node->keybits |= right->keybits;
0781 }
0782 node->keymask |= (1 << node->bitnum);
0783 node = node->parent;
0784
0785 bitmask = 1 << node->bitnum;
0786 leftmask &= ~bitmask;
0787 rightmask &= ~bitmask;
0788 }
0789 advance:
0790 bitmask = 1 << node->bitnum;
0791 if ((leftmask & bitmask) == 0 &&
0792 node->leftnode == NODE &&
0793 node->left) {
0794 leftmask |= bitmask;
0795 node = node->left;
0796 } else if ((rightmask & bitmask) == 0 &&
0797 node->rightnode == NODE &&
0798 node->right) {
0799 rightmask |= bitmask;
0800 node = node->right;
0801 } else {
0802 leftmask &= ~bitmask;
0803 rightmask &= ~bitmask;
0804 node = node->parent;
0805 }
0806 }
0807 if (verbose > 0)
0808 printf("Pruned %d nodes\n", count);
0809 }
0810
0811
0812
0813
0814
0815 static void mark_nodes(struct tree *tree)
0816 {
0817 struct node *node;
0818 struct node *n;
0819 unsigned int leftmask;
0820 unsigned int rightmask;
0821 unsigned int bitmask;
0822 int marked;
0823
0824 marked = 0;
0825 if (verbose > 0)
0826 printf("Marking %s_%x\n", tree->type, tree->maxage);
0827 if (tree->childnode == LEAF)
0828 goto done;
0829
0830 assert(tree->childnode == NODE);
0831 node = tree->root;
0832 leftmask = rightmask = 0;
0833 while (node) {
0834 bitmask = 1 << node->bitnum;
0835 if ((leftmask & bitmask) == 0) {
0836 leftmask |= bitmask;
0837 if (node->leftnode == LEAF) {
0838 assert(node->left);
0839 if (tree->leaf_mark(node->left)) {
0840 n = node;
0841 while (n && !n->mark) {
0842 marked++;
0843 n->mark = 1;
0844 n = n->parent;
0845 }
0846 }
0847 } else if (node->left) {
0848 assert(node->leftnode == NODE);
0849 node = node->left;
0850 continue;
0851 }
0852 }
0853 if ((rightmask & bitmask) == 0) {
0854 rightmask |= bitmask;
0855 if (node->rightnode == LEAF) {
0856 assert(node->right);
0857 if (tree->leaf_mark(node->right)) {
0858 n = node;
0859 while (n && !n->mark) {
0860 marked++;
0861 n->mark = 1;
0862 n = n->parent;
0863 }
0864 }
0865 } else if (node->right) {
0866 assert(node->rightnode == NODE);
0867 node = node->right;
0868 continue;
0869 }
0870 }
0871 leftmask &= ~bitmask;
0872 rightmask &= ~bitmask;
0873 node = node->parent;
0874 }
0875
0876
0877
0878 assert(tree->childnode == NODE);
0879 node = tree->root;
0880 leftmask = rightmask = 0;
0881 while (node) {
0882 bitmask = 1 << node->bitnum;
0883 if ((leftmask & bitmask) == 0) {
0884 leftmask |= bitmask;
0885 if (node->leftnode == LEAF) {
0886 assert(node->left);
0887 if (tree->leaf_mark(node->left)) {
0888 n = node;
0889 while (n && !n->mark) {
0890 marked++;
0891 n->mark = 1;
0892 n = n->parent;
0893 }
0894 }
0895 } else if (node->left) {
0896 assert(node->leftnode == NODE);
0897 node = node->left;
0898 if (!node->mark && node->parent->mark) {
0899 marked++;
0900 node->mark = 1;
0901 }
0902 continue;
0903 }
0904 }
0905 if ((rightmask & bitmask) == 0) {
0906 rightmask |= bitmask;
0907 if (node->rightnode == LEAF) {
0908 assert(node->right);
0909 if (tree->leaf_mark(node->right)) {
0910 n = node;
0911 while (n && !n->mark) {
0912 marked++;
0913 n->mark = 1;
0914 n = n->parent;
0915 }
0916 }
0917 } else if (node->right) {
0918 assert(node->rightnode == NODE);
0919 node = node->right;
0920 if (!node->mark && node->parent->mark &&
0921 !node->parent->left) {
0922 marked++;
0923 node->mark = 1;
0924 }
0925 continue;
0926 }
0927 }
0928 leftmask &= ~bitmask;
0929 rightmask &= ~bitmask;
0930 node = node->parent;
0931 }
0932 done:
0933 if (verbose > 0)
0934 printf("Marked %d nodes\n", marked);
0935 }
0936
0937
0938
0939
0940
0941
0942 static int index_nodes(struct tree *tree, int index)
0943 {
0944 struct node *node;
0945 unsigned int leftmask;
0946 unsigned int rightmask;
0947 unsigned int bitmask;
0948 int count;
0949 int indent;
0950
0951
0952 while (index % 64)
0953 index++;
0954 tree->index = index;
0955 indent = 1;
0956 count = 0;
0957
0958 if (verbose > 0)
0959 printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index);
0960 if (tree->childnode == LEAF) {
0961 index += tree->leaf_size(tree->root);
0962 goto done;
0963 }
0964
0965 assert(tree->childnode == NODE);
0966 node = tree->root;
0967 leftmask = rightmask = 0;
0968 while (node) {
0969 if (!node->mark)
0970 goto skip;
0971 count++;
0972 if (node->index != index)
0973 node->index = index;
0974 index += node->size;
0975 skip:
0976 while (node) {
0977 bitmask = 1 << node->bitnum;
0978 if (node->mark && (leftmask & bitmask) == 0) {
0979 leftmask |= bitmask;
0980 if (node->leftnode == LEAF) {
0981 assert(node->left);
0982 *tree->leaf_index(tree, node->left) =
0983 index;
0984 index += tree->leaf_size(node->left);
0985 count++;
0986 } else if (node->left) {
0987 assert(node->leftnode == NODE);
0988 indent += 1;
0989 node = node->left;
0990 break;
0991 }
0992 }
0993 if (node->mark && (rightmask & bitmask) == 0) {
0994 rightmask |= bitmask;
0995 if (node->rightnode == LEAF) {
0996 assert(node->right);
0997 *tree->leaf_index(tree, node->right) = index;
0998 index += tree->leaf_size(node->right);
0999 count++;
1000 } else if (node->right) {
1001 assert(node->rightnode == NODE);
1002 indent += 1;
1003 node = node->right;
1004 break;
1005 }
1006 }
1007 leftmask &= ~bitmask;
1008 rightmask &= ~bitmask;
1009 node = node->parent;
1010 indent -= 1;
1011 }
1012 }
1013 done:
1014
1015 while (index % 16)
1016 index++;
1017 if (verbose > 0)
1018 printf("Final index %d\n", index);
1019 return index;
1020 }
1021
1022
1023
1024
1025 static int mark_subtree(struct node *node)
1026 {
1027 int changed;
1028
1029 if (!node || node->mark)
1030 return 0;
1031 node->mark = 1;
1032 node->index = node->parent->index;
1033 changed = 1;
1034 if (node->leftnode == NODE)
1035 changed += mark_subtree(node->left);
1036 if (node->rightnode == NODE)
1037 changed += mark_subtree(node->right);
1038 return changed;
1039 }
1040
1041
1042
1043
1044
1045
1046
1047
1048 static int size_nodes(struct tree *tree)
1049 {
1050 struct tree *next;
1051 struct node *node;
1052 struct node *right;
1053 struct node *n;
1054 unsigned int leftmask;
1055 unsigned int rightmask;
1056 unsigned int bitmask;
1057 unsigned int pathbits;
1058 unsigned int pathmask;
1059 unsigned int nbit;
1060 int changed;
1061 int offset;
1062 int size;
1063 int indent;
1064
1065 indent = 1;
1066 changed = 0;
1067 size = 0;
1068
1069 if (verbose > 0)
1070 printf("Sizing %s_%x\n", tree->type, tree->maxage);
1071 if (tree->childnode == LEAF)
1072 goto done;
1073
1074 assert(tree->childnode == NODE);
1075 pathbits = 0;
1076 pathmask = 0;
1077 node = tree->root;
1078 leftmask = rightmask = 0;
1079 while (node) {
1080 if (!node->mark)
1081 goto skip;
1082 offset = 0;
1083 if (!node->left || !node->right) {
1084 size = 1;
1085 } else {
1086 if (node->rightnode == NODE) {
1087
1088
1089
1090
1091
1092
1093 right = node->right;
1094 next = tree->next;
1095 while (!right->mark) {
1096 assert(next);
1097 n = next->root;
1098 while (n->bitnum != node->bitnum) {
1099 nbit = 1 << n->bitnum;
1100 if (!(pathmask & nbit))
1101 break;
1102 if (pathbits & nbit) {
1103 if (n->rightnode == LEAF)
1104 break;
1105 n = n->right;
1106 } else {
1107 if (n->leftnode == LEAF)
1108 break;
1109 n = n->left;
1110 }
1111 }
1112 if (n->bitnum != node->bitnum)
1113 break;
1114 n = n->right;
1115 right = n;
1116 next = next->next;
1117 }
1118
1119 if (!right->mark)
1120 changed += mark_subtree(right);
1121 offset = right->index - node->index;
1122 } else {
1123 offset = *tree->leaf_index(tree, node->right);
1124 offset -= node->index;
1125 }
1126 assert(offset >= 0);
1127 assert(offset <= 0xffffff);
1128 if (offset <= 0xff) {
1129 size = 2;
1130 } else if (offset <= 0xffff) {
1131 size = 3;
1132 } else {
1133 size = 4;
1134 }
1135 }
1136 if (node->size != size || node->offset != offset) {
1137 node->size = size;
1138 node->offset = offset;
1139 changed++;
1140 }
1141 skip:
1142 while (node) {
1143 bitmask = 1 << node->bitnum;
1144 pathmask |= bitmask;
1145 if (node->mark && (leftmask & bitmask) == 0) {
1146 leftmask |= bitmask;
1147 if (node->leftnode == LEAF) {
1148 assert(node->left);
1149 } else if (node->left) {
1150 assert(node->leftnode == NODE);
1151 indent += 1;
1152 node = node->left;
1153 break;
1154 }
1155 }
1156 if (node->mark && (rightmask & bitmask) == 0) {
1157 rightmask |= bitmask;
1158 pathbits |= bitmask;
1159 if (node->rightnode == LEAF) {
1160 assert(node->right);
1161 } else if (node->right) {
1162 assert(node->rightnode == NODE);
1163 indent += 1;
1164 node = node->right;
1165 break;
1166 }
1167 }
1168 leftmask &= ~bitmask;
1169 rightmask &= ~bitmask;
1170 pathmask &= ~bitmask;
1171 pathbits &= ~bitmask;
1172 node = node->parent;
1173 indent -= 1;
1174 }
1175 }
1176 done:
1177 if (verbose > 0)
1178 printf("Found %d changes\n", changed);
1179 return changed;
1180 }
1181
1182
1183
1184
1185 static void emit(struct tree *tree, unsigned char *data)
1186 {
1187 struct node *node;
1188 unsigned int leftmask;
1189 unsigned int rightmask;
1190 unsigned int bitmask;
1191 int offlen;
1192 int offset;
1193 int index;
1194 int indent;
1195 int size;
1196 int bytes;
1197 int leaves;
1198 int nodes[4];
1199 unsigned char byte;
1200
1201 nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0;
1202 leaves = 0;
1203 bytes = 0;
1204 index = tree->index;
1205 data += index;
1206 indent = 1;
1207 if (verbose > 0)
1208 printf("Emitting %s_%x\n", tree->type, tree->maxage);
1209 if (tree->childnode == LEAF) {
1210 assert(tree->root);
1211 tree->leaf_emit(tree->root, data);
1212 size = tree->leaf_size(tree->root);
1213 index += size;
1214 leaves++;
1215 goto done;
1216 }
1217
1218 assert(tree->childnode == NODE);
1219 node = tree->root;
1220 leftmask = rightmask = 0;
1221 while (node) {
1222 if (!node->mark)
1223 goto skip;
1224 assert(node->offset != -1);
1225 assert(node->index == index);
1226
1227 byte = 0;
1228 if (node->nextbyte)
1229 byte |= NEXTBYTE;
1230 byte |= (node->bitnum & BITNUM);
1231 if (node->left && node->right) {
1232 if (node->leftnode == NODE)
1233 byte |= LEFTNODE;
1234 if (node->rightnode == NODE)
1235 byte |= RIGHTNODE;
1236 if (node->offset <= 0xff)
1237 offlen = 1;
1238 else if (node->offset <= 0xffff)
1239 offlen = 2;
1240 else
1241 offlen = 3;
1242 nodes[offlen]++;
1243 offset = node->offset;
1244 byte |= offlen << OFFLEN_SHIFT;
1245 *data++ = byte;
1246 index++;
1247 while (offlen--) {
1248 *data++ = offset & 0xff;
1249 index++;
1250 offset >>= 8;
1251 }
1252 } else if (node->left) {
1253 if (node->leftnode == NODE)
1254 byte |= TRIENODE;
1255 nodes[0]++;
1256 *data++ = byte;
1257 index++;
1258 } else if (node->right) {
1259 byte |= RIGHTNODE;
1260 if (node->rightnode == NODE)
1261 byte |= TRIENODE;
1262 nodes[0]++;
1263 *data++ = byte;
1264 index++;
1265 } else {
1266 assert(0);
1267 }
1268 skip:
1269 while (node) {
1270 bitmask = 1 << node->bitnum;
1271 if (node->mark && (leftmask & bitmask) == 0) {
1272 leftmask |= bitmask;
1273 if (node->leftnode == LEAF) {
1274 assert(node->left);
1275 data = tree->leaf_emit(node->left,
1276 data);
1277 size = tree->leaf_size(node->left);
1278 index += size;
1279 bytes += size;
1280 leaves++;
1281 } else if (node->left) {
1282 assert(node->leftnode == NODE);
1283 indent += 1;
1284 node = node->left;
1285 break;
1286 }
1287 }
1288 if (node->mark && (rightmask & bitmask) == 0) {
1289 rightmask |= bitmask;
1290 if (node->rightnode == LEAF) {
1291 assert(node->right);
1292 data = tree->leaf_emit(node->right,
1293 data);
1294 size = tree->leaf_size(node->right);
1295 index += size;
1296 bytes += size;
1297 leaves++;
1298 } else if (node->right) {
1299 assert(node->rightnode == NODE);
1300 indent += 1;
1301 node = node->right;
1302 break;
1303 }
1304 }
1305 leftmask &= ~bitmask;
1306 rightmask &= ~bitmask;
1307 node = node->parent;
1308 indent -= 1;
1309 }
1310 }
1311 done:
1312 if (verbose > 0) {
1313 printf("Emitted %d (%d) leaves",
1314 leaves, bytes);
1315 printf(" %d (%d+%d+%d+%d) nodes",
1316 nodes[0] + nodes[1] + nodes[2] + nodes[3],
1317 nodes[0], nodes[1], nodes[2], nodes[3]);
1318 printf(" %d total\n", index - tree->index);
1319 }
1320 }
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339 struct unicode_data {
1340 unsigned int code;
1341 int ccc;
1342 int gen;
1343 int correction;
1344 unsigned int *utf32nfdi;
1345 unsigned int *utf32nfdicf;
1346 char *utf8nfdi;
1347 char *utf8nfdicf;
1348 };
1349
1350 struct unicode_data unicode_data[0x110000];
1351 struct unicode_data *corrections;
1352 int corrections_count;
1353
1354 struct tree *nfdi_tree;
1355 struct tree *nfdicf_tree;
1356
1357 struct tree *trees;
1358 int trees_count;
1359
1360
1361
1362
1363
1364 static struct unicode_data *corrections_lookup(struct unicode_data *u)
1365 {
1366 int i;
1367
1368 for (i = 0; i != corrections_count; i++)
1369 if (u->code == corrections[i].code)
1370 return &corrections[i];
1371 return u;
1372 }
1373
1374 static int nfdi_equal(void *l, void *r)
1375 {
1376 struct unicode_data *left = l;
1377 struct unicode_data *right = r;
1378
1379 if (left->gen != right->gen)
1380 return 0;
1381 if (left->ccc != right->ccc)
1382 return 0;
1383 if (left->utf8nfdi && right->utf8nfdi &&
1384 strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
1385 return 1;
1386 if (left->utf8nfdi || right->utf8nfdi)
1387 return 0;
1388 return 1;
1389 }
1390
1391 static int nfdicf_equal(void *l, void *r)
1392 {
1393 struct unicode_data *left = l;
1394 struct unicode_data *right = r;
1395
1396 if (left->gen != right->gen)
1397 return 0;
1398 if (left->ccc != right->ccc)
1399 return 0;
1400 if (left->utf8nfdicf && right->utf8nfdicf &&
1401 strcmp(left->utf8nfdicf, right->utf8nfdicf) == 0)
1402 return 1;
1403 if (left->utf8nfdicf && right->utf8nfdicf)
1404 return 0;
1405 if (left->utf8nfdicf || right->utf8nfdicf)
1406 return 0;
1407 if (left->utf8nfdi && right->utf8nfdi &&
1408 strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
1409 return 1;
1410 if (left->utf8nfdi || right->utf8nfdi)
1411 return 0;
1412 return 1;
1413 }
1414
1415 static void nfdi_print(void *l, int indent)
1416 {
1417 struct unicode_data *leaf = l;
1418
1419 printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
1420 leaf->code, leaf->ccc, leaf->gen);
1421
1422 if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
1423 printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
1424 else if (leaf->utf8nfdi)
1425 printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
1426
1427 printf("\n");
1428 }
1429
1430 static void nfdicf_print(void *l, int indent)
1431 {
1432 struct unicode_data *leaf = l;
1433
1434 printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
1435 leaf->code, leaf->ccc, leaf->gen);
1436
1437 if (leaf->utf8nfdicf)
1438 printf(" nfdicf \"%s\"", (const char*)leaf->utf8nfdicf);
1439 else if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
1440 printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
1441 else if (leaf->utf8nfdi)
1442 printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
1443 printf("\n");
1444 }
1445
1446 static int nfdi_mark(void *l)
1447 {
1448 return 1;
1449 }
1450
1451 static int nfdicf_mark(void *l)
1452 {
1453 struct unicode_data *leaf = l;
1454
1455 if (leaf->utf8nfdicf)
1456 return 1;
1457 return 0;
1458 }
1459
1460 static int correction_mark(void *l)
1461 {
1462 struct unicode_data *leaf = l;
1463
1464 return leaf->correction;
1465 }
1466
1467 static int nfdi_size(void *l)
1468 {
1469 struct unicode_data *leaf = l;
1470 int size = 2;
1471
1472 if (HANGUL_SYLLABLE(leaf->code))
1473 size += 1;
1474 else if (leaf->utf8nfdi)
1475 size += strlen(leaf->utf8nfdi) + 1;
1476 return size;
1477 }
1478
1479 static int nfdicf_size(void *l)
1480 {
1481 struct unicode_data *leaf = l;
1482 int size = 2;
1483
1484 if (HANGUL_SYLLABLE(leaf->code))
1485 size += 1;
1486 else if (leaf->utf8nfdicf)
1487 size += strlen(leaf->utf8nfdicf) + 1;
1488 else if (leaf->utf8nfdi)
1489 size += strlen(leaf->utf8nfdi) + 1;
1490 return size;
1491 }
1492
1493 static int *nfdi_index(struct tree *tree, void *l)
1494 {
1495 struct unicode_data *leaf = l;
1496
1497 return &tree->leafindex[leaf->code];
1498 }
1499
1500 static int *nfdicf_index(struct tree *tree, void *l)
1501 {
1502 struct unicode_data *leaf = l;
1503
1504 return &tree->leafindex[leaf->code];
1505 }
1506
1507 static unsigned char *nfdi_emit(void *l, unsigned char *data)
1508 {
1509 struct unicode_data *leaf = l;
1510 unsigned char *s;
1511
1512 *data++ = leaf->gen;
1513
1514 if (HANGUL_SYLLABLE(leaf->code)) {
1515 *data++ = DECOMPOSE;
1516 *data++ = HANGUL;
1517 } else if (leaf->utf8nfdi) {
1518 *data++ = DECOMPOSE;
1519 s = (unsigned char*)leaf->utf8nfdi;
1520 while ((*data++ = *s++) != 0)
1521 ;
1522 } else {
1523 *data++ = leaf->ccc;
1524 }
1525 return data;
1526 }
1527
1528 static unsigned char *nfdicf_emit(void *l, unsigned char *data)
1529 {
1530 struct unicode_data *leaf = l;
1531 unsigned char *s;
1532
1533 *data++ = leaf->gen;
1534
1535 if (HANGUL_SYLLABLE(leaf->code)) {
1536 *data++ = DECOMPOSE;
1537 *data++ = HANGUL;
1538 } else if (leaf->utf8nfdicf) {
1539 *data++ = DECOMPOSE;
1540 s = (unsigned char*)leaf->utf8nfdicf;
1541 while ((*data++ = *s++) != 0)
1542 ;
1543 } else if (leaf->utf8nfdi) {
1544 *data++ = DECOMPOSE;
1545 s = (unsigned char*)leaf->utf8nfdi;
1546 while ((*data++ = *s++) != 0)
1547 ;
1548 } else {
1549 *data++ = leaf->ccc;
1550 }
1551 return data;
1552 }
1553
1554 static void utf8_create(struct unicode_data *data)
1555 {
1556 char utf[18*4+1];
1557 char *u;
1558 unsigned int *um;
1559 int i;
1560
1561 if (data->utf8nfdi) {
1562 assert(data->utf8nfdi[0] == HANGUL);
1563 return;
1564 }
1565
1566 u = utf;
1567 um = data->utf32nfdi;
1568 if (um) {
1569 for (i = 0; um[i]; i++)
1570 u += utf8encode(u, um[i]);
1571 *u = '\0';
1572 data->utf8nfdi = strdup(utf);
1573 }
1574 u = utf;
1575 um = data->utf32nfdicf;
1576 if (um) {
1577 for (i = 0; um[i]; i++)
1578 u += utf8encode(u, um[i]);
1579 *u = '\0';
1580 if (!data->utf8nfdi || strcmp(data->utf8nfdi, utf))
1581 data->utf8nfdicf = strdup(utf);
1582 }
1583 }
1584
1585 static void utf8_init(void)
1586 {
1587 unsigned int unichar;
1588 int i;
1589
1590 for (unichar = 0; unichar != 0x110000; unichar++)
1591 utf8_create(&unicode_data[unichar]);
1592
1593 for (i = 0; i != corrections_count; i++)
1594 utf8_create(&corrections[i]);
1595 }
1596
1597 static void trees_init(void)
1598 {
1599 struct unicode_data *data;
1600 unsigned int maxage;
1601 unsigned int nextage;
1602 int count;
1603 int i;
1604 int j;
1605
1606
1607 count = 0;
1608 nextage = (unsigned int)-1;
1609 do {
1610 maxage = nextage;
1611 nextage = 0;
1612 for (i = 0; i <= corrections_count; i++) {
1613 data = &corrections[i];
1614 if (nextage < data->correction &&
1615 data->correction < maxage)
1616 nextage = data->correction;
1617 }
1618 count++;
1619 } while (nextage);
1620
1621
1622 trees_count = count * 2;
1623 trees = calloc(trees_count, sizeof(struct tree));
1624
1625
1626 count = trees_count;
1627 nextage = (unsigned int)-1;
1628 do {
1629 maxage = nextage;
1630 trees[--count].maxage = maxage;
1631 trees[--count].maxage = maxage;
1632 nextage = 0;
1633 for (i = 0; i <= corrections_count; i++) {
1634 data = &corrections[i];
1635 if (nextage < data->correction &&
1636 data->correction < maxage)
1637 nextage = data->correction;
1638 }
1639 } while (nextage);
1640
1641
1642 for (i = 0; i != trees_count; i++) {
1643 j = 0;
1644 while (ages[j] < trees[i].maxage)
1645 j++;
1646 trees[i].maxage = ages[j-1];
1647 }
1648
1649
1650 trees[trees_count-2].next = &trees[trees_count-1];
1651 trees[trees_count-1].leaf_mark = nfdi_mark;
1652 trees[trees_count-2].leaf_mark = nfdicf_mark;
1653 for (i = 0; i != trees_count-2; i += 2) {
1654 trees[i].next = &trees[trees_count-2];
1655 trees[i].leaf_mark = correction_mark;
1656 trees[i+1].next = &trees[trees_count-1];
1657 trees[i+1].leaf_mark = correction_mark;
1658 }
1659
1660
1661 for (i = 0; i != trees_count; i += 2) {
1662 trees[i].type = "nfdicf";
1663 trees[i].leaf_equal = nfdicf_equal;
1664 trees[i].leaf_print = nfdicf_print;
1665 trees[i].leaf_size = nfdicf_size;
1666 trees[i].leaf_index = nfdicf_index;
1667 trees[i].leaf_emit = nfdicf_emit;
1668
1669 trees[i+1].type = "nfdi";
1670 trees[i+1].leaf_equal = nfdi_equal;
1671 trees[i+1].leaf_print = nfdi_print;
1672 trees[i+1].leaf_size = nfdi_size;
1673 trees[i+1].leaf_index = nfdi_index;
1674 trees[i+1].leaf_emit = nfdi_emit;
1675 }
1676
1677
1678 for (i = 0; i != trees_count; i++)
1679 trees[i].childnode = NODE;
1680 }
1681
1682 static void trees_populate(void)
1683 {
1684 struct unicode_data *data;
1685 unsigned int unichar;
1686 char keyval[4];
1687 int keylen;
1688 int i;
1689
1690 for (i = 0; i != trees_count; i++) {
1691 if (verbose > 0) {
1692 printf("Populating %s_%x\n",
1693 trees[i].type, trees[i].maxage);
1694 }
1695 for (unichar = 0; unichar != 0x110000; unichar++) {
1696 if (unicode_data[unichar].gen < 0)
1697 continue;
1698 keylen = utf8encode(keyval, unichar);
1699 data = corrections_lookup(&unicode_data[unichar]);
1700 if (data->correction <= trees[i].maxage)
1701 data = &unicode_data[unichar];
1702 insert(&trees[i], keyval, keylen, data);
1703 }
1704 }
1705 }
1706
1707 static void trees_reduce(void)
1708 {
1709 int i;
1710 int size;
1711 int changed;
1712
1713 for (i = 0; i != trees_count; i++)
1714 prune(&trees[i]);
1715 for (i = 0; i != trees_count; i++)
1716 mark_nodes(&trees[i]);
1717 do {
1718 size = 0;
1719 for (i = 0; i != trees_count; i++)
1720 size = index_nodes(&trees[i], size);
1721 changed = 0;
1722 for (i = 0; i != trees_count; i++)
1723 changed += size_nodes(&trees[i]);
1724 } while (changed);
1725
1726 utf8data = calloc(size, 1);
1727 utf8data_size = size;
1728 for (i = 0; i != trees_count; i++)
1729 emit(&trees[i], utf8data);
1730
1731 if (verbose > 0) {
1732 for (i = 0; i != trees_count; i++) {
1733 printf("%s_%x idx %d\n",
1734 trees[i].type, trees[i].maxage, trees[i].index);
1735 }
1736 }
1737
1738 nfdi = utf8data + trees[trees_count-1].index;
1739 nfdicf = utf8data + trees[trees_count-2].index;
1740
1741 nfdi_tree = &trees[trees_count-1];
1742 nfdicf_tree = &trees[trees_count-2];
1743 }
1744
1745 static void verify(struct tree *tree)
1746 {
1747 struct unicode_data *data;
1748 utf8leaf_t *leaf;
1749 unsigned int unichar;
1750 char key[4];
1751 unsigned char hangul[UTF8HANGULLEAF];
1752 int report;
1753 int nocf;
1754
1755 if (verbose > 0)
1756 printf("Verifying %s_%x\n", tree->type, tree->maxage);
1757 nocf = strcmp(tree->type, "nfdicf");
1758
1759 for (unichar = 0; unichar != 0x110000; unichar++) {
1760 report = 0;
1761 data = corrections_lookup(&unicode_data[unichar]);
1762 if (data->correction <= tree->maxage)
1763 data = &unicode_data[unichar];
1764 utf8encode(key,unichar);
1765 leaf = utf8lookup(tree, hangul, key);
1766
1767 if (!leaf) {
1768 if (data->gen != -1)
1769 report++;
1770 if (unichar < 0xd800 || unichar > 0xdfff)
1771 report++;
1772 } else {
1773 if (unichar >= 0xd800 && unichar <= 0xdfff)
1774 report++;
1775 if (data->gen == -1)
1776 report++;
1777 if (data->gen != LEAF_GEN(leaf))
1778 report++;
1779 if (LEAF_CCC(leaf) == DECOMPOSE) {
1780 if (HANGUL_SYLLABLE(data->code)) {
1781 if (data->utf8nfdi[0] != HANGUL)
1782 report++;
1783 } else if (nocf) {
1784 if (!data->utf8nfdi) {
1785 report++;
1786 } else if (strcmp(data->utf8nfdi,
1787 LEAF_STR(leaf))) {
1788 report++;
1789 }
1790 } else {
1791 if (!data->utf8nfdicf &&
1792 !data->utf8nfdi) {
1793 report++;
1794 } else if (data->utf8nfdicf) {
1795 if (strcmp(data->utf8nfdicf,
1796 LEAF_STR(leaf)))
1797 report++;
1798 } else if (strcmp(data->utf8nfdi,
1799 LEAF_STR(leaf))) {
1800 report++;
1801 }
1802 }
1803 } else if (data->ccc != LEAF_CCC(leaf)) {
1804 report++;
1805 }
1806 }
1807 if (report) {
1808 printf("%X code %X gen %d ccc %d"
1809 " nfdi -> \"%s\"",
1810 unichar, data->code, data->gen,
1811 data->ccc,
1812 data->utf8nfdi);
1813 if (leaf) {
1814 printf(" gen %d ccc %d"
1815 " nfdi -> \"%s\"",
1816 LEAF_GEN(leaf),
1817 LEAF_CCC(leaf),
1818 LEAF_CCC(leaf) == DECOMPOSE ?
1819 LEAF_STR(leaf) : "");
1820 }
1821 printf("\n");
1822 }
1823 }
1824 }
1825
1826 static void trees_verify(void)
1827 {
1828 int i;
1829
1830 for (i = 0; i != trees_count; i++)
1831 verify(&trees[i]);
1832 }
1833
1834
1835
1836 static void help(void)
1837 {
1838 printf("Usage: %s [options]\n", argv0);
1839 printf("\n");
1840 printf("This program creates an a data trie used for parsing and\n");
1841 printf("normalization of UTF-8 strings. The trie is derived from\n");
1842 printf("a set of input files from the Unicode character database\n");
1843 printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n");
1844 printf("\n");
1845 printf("The generated tree supports two normalization forms:\n");
1846 printf("\n");
1847 printf("\tnfdi:\n");
1848 printf("\t- Apply unicode normalization form NFD.\n");
1849 printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1850 printf("\n");
1851 printf("\tnfdicf:\n");
1852 printf("\t- Apply unicode normalization form NFD.\n");
1853 printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1854 printf("\t- Apply a full casefold (C + F).\n");
1855 printf("\n");
1856 printf("These forms were chosen as being most useful when dealing\n");
1857 printf("with file names: NFD catches most cases where characters\n");
1858 printf("should be considered equivalent. The ignorables are mostly\n");
1859 printf("invisible, making names hard to type.\n");
1860 printf("\n");
1861 printf("The options to specify the files to be used are listed\n");
1862 printf("below with their default values, which are the names used\n");
1863 printf("by version 11.0.0 of the Unicode Character Database.\n");
1864 printf("\n");
1865 printf("The input files:\n");
1866 printf("\t-a %s\n", AGE_NAME);
1867 printf("\t-c %s\n", CCC_NAME);
1868 printf("\t-p %s\n", PROP_NAME);
1869 printf("\t-d %s\n", DATA_NAME);
1870 printf("\t-f %s\n", FOLD_NAME);
1871 printf("\t-n %s\n", NORM_NAME);
1872 printf("\n");
1873 printf("Additionally, the generated tables are tested using:\n");
1874 printf("\t-t %s\n", TEST_NAME);
1875 printf("\n");
1876 printf("Finally, the output file:\n");
1877 printf("\t-o %s\n", UTF8_NAME);
1878 printf("\n");
1879 }
1880
1881 static void usage(void)
1882 {
1883 help();
1884 exit(1);
1885 }
1886
1887 static void open_fail(const char *name, int error)
1888 {
1889 printf("Error %d opening %s: %s\n", error, name, strerror(error));
1890 exit(1);
1891 }
1892
1893 static void file_fail(const char *filename)
1894 {
1895 printf("Error parsing %s\n", filename);
1896 exit(1);
1897 }
1898
1899 static void line_fail(const char *filename, const char *line)
1900 {
1901 printf("Error parsing %s:%s\n", filename, line);
1902 exit(1);
1903 }
1904
1905
1906
1907 static void print_utf32(unsigned int *utf32str)
1908 {
1909 int i;
1910
1911 for (i = 0; utf32str[i]; i++)
1912 printf(" %X", utf32str[i]);
1913 }
1914
1915 static void print_utf32nfdi(unsigned int unichar)
1916 {
1917 printf(" %X ->", unichar);
1918 print_utf32(unicode_data[unichar].utf32nfdi);
1919 printf("\n");
1920 }
1921
1922 static void print_utf32nfdicf(unsigned int unichar)
1923 {
1924 printf(" %X ->", unichar);
1925 print_utf32(unicode_data[unichar].utf32nfdicf);
1926 printf("\n");
1927 }
1928
1929
1930
1931 static void age_init(void)
1932 {
1933 FILE *file;
1934 unsigned int first;
1935 unsigned int last;
1936 unsigned int unichar;
1937 unsigned int major;
1938 unsigned int minor;
1939 unsigned int revision;
1940 int gen;
1941 int count;
1942 int ret;
1943
1944 if (verbose > 0)
1945 printf("Parsing %s\n", age_name);
1946
1947 file = fopen(age_name, "r");
1948 if (!file)
1949 open_fail(age_name, errno);
1950 count = 0;
1951
1952 gen = 0;
1953 while (fgets(line, LINESIZE, file)) {
1954 ret = sscanf(line, "# Age=V%d_%d_%d",
1955 &major, &minor, &revision);
1956 if (ret == 3) {
1957 ages_count++;
1958 if (verbose > 1)
1959 printf(" Age V%d_%d_%d\n",
1960 major, minor, revision);
1961 if (!age_valid(major, minor, revision))
1962 line_fail(age_name, line);
1963 continue;
1964 }
1965 ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
1966 if (ret == 2) {
1967 ages_count++;
1968 if (verbose > 1)
1969 printf(" Age V%d_%d\n", major, minor);
1970 if (!age_valid(major, minor, 0))
1971 line_fail(age_name, line);
1972 continue;
1973 }
1974 }
1975
1976
1977 if (verbose > 1)
1978 printf("%d age entries\n", ages_count);
1979 if (ages_count == 0 || ages_count > MAXGEN)
1980 file_fail(age_name);
1981
1982
1983 ages_count++;
1984 ages = calloc(ages_count + 1, sizeof(*ages));
1985
1986 ages[ages_count] = (unsigned int)-1;
1987
1988 rewind(file);
1989 count = 0;
1990 gen = 0;
1991 while (fgets(line, LINESIZE, file)) {
1992 ret = sscanf(line, "# Age=V%d_%d_%d",
1993 &major, &minor, &revision);
1994 if (ret == 3) {
1995 ages[++gen] =
1996 UNICODE_AGE(major, minor, revision);
1997 if (verbose > 1)
1998 printf(" Age V%d_%d_%d = gen %d\n",
1999 major, minor, revision, gen);
2000 if (!age_valid(major, minor, revision))
2001 line_fail(age_name, line);
2002 continue;
2003 }
2004 ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
2005 if (ret == 2) {
2006 ages[++gen] = UNICODE_AGE(major, minor, 0);
2007 if (verbose > 1)
2008 printf(" Age V%d_%d = %d\n",
2009 major, minor, gen);
2010 if (!age_valid(major, minor, 0))
2011 line_fail(age_name, line);
2012 continue;
2013 }
2014 ret = sscanf(line, "%X..%X ; %d.%d #",
2015 &first, &last, &major, &minor);
2016 if (ret == 4) {
2017 for (unichar = first; unichar <= last; unichar++)
2018 unicode_data[unichar].gen = gen;
2019 count += 1 + last - first;
2020 if (verbose > 1)
2021 printf(" %X..%X gen %d\n", first, last, gen);
2022 if (!utf32valid(first) || !utf32valid(last))
2023 line_fail(age_name, line);
2024 continue;
2025 }
2026 ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor);
2027 if (ret == 3) {
2028 unicode_data[unichar].gen = gen;
2029 count++;
2030 if (verbose > 1)
2031 printf(" %X gen %d\n", unichar, gen);
2032 if (!utf32valid(unichar))
2033 line_fail(age_name, line);
2034 continue;
2035 }
2036 }
2037 unicode_maxage = ages[gen];
2038 fclose(file);
2039
2040
2041 if (verbose > 1)
2042 printf(" Removing surrogate block D800..DFFF\n");
2043 for (unichar = 0xd800; unichar <= 0xdfff; unichar++)
2044 unicode_data[unichar].gen = -1;
2045
2046 if (verbose > 0)
2047 printf("Found %d entries\n", count);
2048 if (count == 0)
2049 file_fail(age_name);
2050 }
2051
2052 static void ccc_init(void)
2053 {
2054 FILE *file;
2055 unsigned int first;
2056 unsigned int last;
2057 unsigned int unichar;
2058 unsigned int value;
2059 int count;
2060 int ret;
2061
2062 if (verbose > 0)
2063 printf("Parsing %s\n", ccc_name);
2064
2065 file = fopen(ccc_name, "r");
2066 if (!file)
2067 open_fail(ccc_name, errno);
2068
2069 count = 0;
2070 while (fgets(line, LINESIZE, file)) {
2071 ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value);
2072 if (ret == 3) {
2073 for (unichar = first; unichar <= last; unichar++) {
2074 unicode_data[unichar].ccc = value;
2075 count++;
2076 }
2077 if (verbose > 1)
2078 printf(" %X..%X ccc %d\n", first, last, value);
2079 if (!utf32valid(first) || !utf32valid(last))
2080 line_fail(ccc_name, line);
2081 continue;
2082 }
2083 ret = sscanf(line, "%X ; %d #", &unichar, &value);
2084 if (ret == 2) {
2085 unicode_data[unichar].ccc = value;
2086 count++;
2087 if (verbose > 1)
2088 printf(" %X ccc %d\n", unichar, value);
2089 if (!utf32valid(unichar))
2090 line_fail(ccc_name, line);
2091 continue;
2092 }
2093 }
2094 fclose(file);
2095
2096 if (verbose > 0)
2097 printf("Found %d entries\n", count);
2098 if (count == 0)
2099 file_fail(ccc_name);
2100 }
2101
2102 static int ignore_compatibility_form(char *type)
2103 {
2104 int i;
2105 char *ignored_types[] = {"font", "noBreak", "initial", "medial",
2106 "final", "isolated", "circle", "super",
2107 "sub", "vertical", "wide", "narrow",
2108 "small", "square", "fraction", "compat"};
2109
2110 for (i = 0 ; i < ARRAY_SIZE(ignored_types); i++)
2111 if (strcmp(type, ignored_types[i]) == 0)
2112 return 1;
2113 return 0;
2114 }
2115
2116 static void nfdi_init(void)
2117 {
2118 FILE *file;
2119 unsigned int unichar;
2120 unsigned int mapping[19];
2121 char *s;
2122 char *type;
2123 unsigned int *um;
2124 int count;
2125 int i;
2126 int ret;
2127
2128 if (verbose > 0)
2129 printf("Parsing %s\n", data_name);
2130 file = fopen(data_name, "r");
2131 if (!file)
2132 open_fail(data_name, errno);
2133
2134 count = 0;
2135 while (fgets(line, LINESIZE, file)) {
2136 ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];",
2137 &unichar, buf0);
2138 if (ret != 2)
2139 continue;
2140 if (!utf32valid(unichar))
2141 line_fail(data_name, line);
2142
2143 s = buf0;
2144
2145 if (*s == '<') {
2146 type = ++s;
2147 while (*++s != '>');
2148 *s++ = '\0';
2149 if(ignore_compatibility_form(type))
2150 continue;
2151 }
2152
2153 i = 0;
2154 while (*s) {
2155 mapping[i] = strtoul(s, &s, 16);
2156 if (!utf32valid(mapping[i]))
2157 line_fail(data_name, line);
2158 i++;
2159 }
2160 mapping[i++] = 0;
2161
2162 um = malloc(i * sizeof(unsigned int));
2163 memcpy(um, mapping, i * sizeof(unsigned int));
2164 unicode_data[unichar].utf32nfdi = um;
2165
2166 if (verbose > 1)
2167 print_utf32nfdi(unichar);
2168 count++;
2169 }
2170 fclose(file);
2171 if (verbose > 0)
2172 printf("Found %d entries\n", count);
2173 if (count == 0)
2174 file_fail(data_name);
2175 }
2176
2177 static void nfdicf_init(void)
2178 {
2179 FILE *file;
2180 unsigned int unichar;
2181 unsigned int mapping[19];
2182 char status;
2183 char *s;
2184 unsigned int *um;
2185 int i;
2186 int count;
2187 int ret;
2188
2189 if (verbose > 0)
2190 printf("Parsing %s\n", fold_name);
2191 file = fopen(fold_name, "r");
2192 if (!file)
2193 open_fail(fold_name, errno);
2194
2195 count = 0;
2196 while (fgets(line, LINESIZE, file)) {
2197 ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0);
2198 if (ret != 3)
2199 continue;
2200 if (!utf32valid(unichar))
2201 line_fail(fold_name, line);
2202
2203 if (status != 'C' && status != 'F')
2204 continue;
2205 s = buf0;
2206 if (*s == '<')
2207 while (*s++ != ' ')
2208 ;
2209 i = 0;
2210 while (*s) {
2211 mapping[i] = strtoul(s, &s, 16);
2212 if (!utf32valid(mapping[i]))
2213 line_fail(fold_name, line);
2214 i++;
2215 }
2216 mapping[i++] = 0;
2217
2218 um = malloc(i * sizeof(unsigned int));
2219 memcpy(um, mapping, i * sizeof(unsigned int));
2220 unicode_data[unichar].utf32nfdicf = um;
2221
2222 if (verbose > 1)
2223 print_utf32nfdicf(unichar);
2224 count++;
2225 }
2226 fclose(file);
2227 if (verbose > 0)
2228 printf("Found %d entries\n", count);
2229 if (count == 0)
2230 file_fail(fold_name);
2231 }
2232
2233 static void ignore_init(void)
2234 {
2235 FILE *file;
2236 unsigned int unichar;
2237 unsigned int first;
2238 unsigned int last;
2239 unsigned int *um;
2240 int count;
2241 int ret;
2242
2243 if (verbose > 0)
2244 printf("Parsing %s\n", prop_name);
2245 file = fopen(prop_name, "r");
2246 if (!file)
2247 open_fail(prop_name, errno);
2248 assert(file);
2249 count = 0;
2250 while (fgets(line, LINESIZE, file)) {
2251 ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0);
2252 if (ret == 3) {
2253 if (strcmp(buf0, "Default_Ignorable_Code_Point"))
2254 continue;
2255 if (!utf32valid(first) || !utf32valid(last))
2256 line_fail(prop_name, line);
2257 for (unichar = first; unichar <= last; unichar++) {
2258 free(unicode_data[unichar].utf32nfdi);
2259 um = malloc(sizeof(unsigned int));
2260 *um = 0;
2261 unicode_data[unichar].utf32nfdi = um;
2262 free(unicode_data[unichar].utf32nfdicf);
2263 um = malloc(sizeof(unsigned int));
2264 *um = 0;
2265 unicode_data[unichar].utf32nfdicf = um;
2266 count++;
2267 }
2268 if (verbose > 1)
2269 printf(" %X..%X Default_Ignorable_Code_Point\n",
2270 first, last);
2271 continue;
2272 }
2273 ret = sscanf(line, "%X ; %s # ", &unichar, buf0);
2274 if (ret == 2) {
2275 if (strcmp(buf0, "Default_Ignorable_Code_Point"))
2276 continue;
2277 if (!utf32valid(unichar))
2278 line_fail(prop_name, line);
2279 free(unicode_data[unichar].utf32nfdi);
2280 um = malloc(sizeof(unsigned int));
2281 *um = 0;
2282 unicode_data[unichar].utf32nfdi = um;
2283 free(unicode_data[unichar].utf32nfdicf);
2284 um = malloc(sizeof(unsigned int));
2285 *um = 0;
2286 unicode_data[unichar].utf32nfdicf = um;
2287 if (verbose > 1)
2288 printf(" %X Default_Ignorable_Code_Point\n",
2289 unichar);
2290 count++;
2291 continue;
2292 }
2293 }
2294 fclose(file);
2295
2296 if (verbose > 0)
2297 printf("Found %d entries\n", count);
2298 if (count == 0)
2299 file_fail(prop_name);
2300 }
2301
2302 static void corrections_init(void)
2303 {
2304 FILE *file;
2305 unsigned int unichar;
2306 unsigned int major;
2307 unsigned int minor;
2308 unsigned int revision;
2309 unsigned int age;
2310 unsigned int *um;
2311 unsigned int mapping[19];
2312 char *s;
2313 int i;
2314 int count;
2315 int ret;
2316
2317 if (verbose > 0)
2318 printf("Parsing %s\n", norm_name);
2319 file = fopen(norm_name, "r");
2320 if (!file)
2321 open_fail(norm_name, errno);
2322
2323 count = 0;
2324 while (fgets(line, LINESIZE, file)) {
2325 ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
2326 &unichar, buf0, buf1,
2327 &major, &minor, &revision);
2328 if (ret != 6)
2329 continue;
2330 if (!utf32valid(unichar) || !age_valid(major, minor, revision))
2331 line_fail(norm_name, line);
2332 count++;
2333 }
2334 corrections = calloc(count, sizeof(struct unicode_data));
2335 corrections_count = count;
2336 rewind(file);
2337
2338 count = 0;
2339 while (fgets(line, LINESIZE, file)) {
2340 ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
2341 &unichar, buf0, buf1,
2342 &major, &minor, &revision);
2343 if (ret != 6)
2344 continue;
2345 if (!utf32valid(unichar) || !age_valid(major, minor, revision))
2346 line_fail(norm_name, line);
2347 corrections[count] = unicode_data[unichar];
2348 assert(corrections[count].code == unichar);
2349 age = UNICODE_AGE(major, minor, revision);
2350 corrections[count].correction = age;
2351
2352 i = 0;
2353 s = buf0;
2354 while (*s) {
2355 mapping[i] = strtoul(s, &s, 16);
2356 if (!utf32valid(mapping[i]))
2357 line_fail(norm_name, line);
2358 i++;
2359 }
2360 mapping[i++] = 0;
2361
2362 um = malloc(i * sizeof(unsigned int));
2363 memcpy(um, mapping, i * sizeof(unsigned int));
2364 corrections[count].utf32nfdi = um;
2365
2366 if (verbose > 1)
2367 printf(" %X -> %s -> %s V%d_%d_%d\n",
2368 unichar, buf0, buf1, major, minor, revision);
2369 count++;
2370 }
2371 fclose(file);
2372
2373 if (verbose > 0)
2374 printf("Found %d entries\n", count);
2375 if (count == 0)
2376 file_fail(norm_name);
2377 }
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427 static void hangul_decompose(void)
2428 {
2429 unsigned int sb = 0xAC00;
2430 unsigned int lb = 0x1100;
2431 unsigned int vb = 0x1161;
2432 unsigned int tb = 0x11a7;
2433
2434 unsigned int vc = 21;
2435 unsigned int tc = 28;
2436 unsigned int nc = (vc * tc);
2437
2438 unsigned int unichar;
2439 unsigned int mapping[4];
2440 unsigned int *um;
2441 int count;
2442 int i;
2443
2444 if (verbose > 0)
2445 printf("Decomposing hangul\n");
2446
2447 count = 0;
2448 for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) {
2449 unsigned int si = unichar - sb;
2450 unsigned int li = si / nc;
2451 unsigned int vi = (si % nc) / tc;
2452 unsigned int ti = si % tc;
2453
2454 i = 0;
2455 mapping[i++] = lb + li;
2456 mapping[i++] = vb + vi;
2457 if (ti)
2458 mapping[i++] = tb + ti;
2459 mapping[i++] = 0;
2460
2461 assert(!unicode_data[unichar].utf32nfdi);
2462 um = malloc(i * sizeof(unsigned int));
2463 memcpy(um, mapping, i * sizeof(unsigned int));
2464 unicode_data[unichar].utf32nfdi = um;
2465
2466 assert(!unicode_data[unichar].utf32nfdicf);
2467 um = malloc(i * sizeof(unsigned int));
2468 memcpy(um, mapping, i * sizeof(unsigned int));
2469 unicode_data[unichar].utf32nfdicf = um;
2470
2471
2472
2473
2474
2475
2476 unicode_data[unichar].utf8nfdi = malloc(2);
2477 unicode_data[unichar].utf8nfdi[0] = HANGUL;
2478 unicode_data[unichar].utf8nfdi[1] = '\0';
2479
2480 if (verbose > 1)
2481 print_utf32nfdi(unichar);
2482
2483 count++;
2484 }
2485 if (verbose > 0)
2486 printf("Created %d entries\n", count);
2487 }
2488
2489 static void nfdi_decompose(void)
2490 {
2491 unsigned int unichar;
2492 unsigned int mapping[19];
2493 unsigned int *um;
2494 unsigned int *dc;
2495 int count;
2496 int i;
2497 int j;
2498 int ret;
2499
2500 if (verbose > 0)
2501 printf("Decomposing nfdi\n");
2502
2503 count = 0;
2504 for (unichar = 0; unichar != 0x110000; unichar++) {
2505 if (!unicode_data[unichar].utf32nfdi)
2506 continue;
2507 for (;;) {
2508 ret = 1;
2509 i = 0;
2510 um = unicode_data[unichar].utf32nfdi;
2511 while (*um) {
2512 dc = unicode_data[*um].utf32nfdi;
2513 if (dc) {
2514 for (j = 0; dc[j]; j++)
2515 mapping[i++] = dc[j];
2516 ret = 0;
2517 } else {
2518 mapping[i++] = *um;
2519 }
2520 um++;
2521 }
2522 mapping[i++] = 0;
2523 if (ret)
2524 break;
2525 free(unicode_data[unichar].utf32nfdi);
2526 um = malloc(i * sizeof(unsigned int));
2527 memcpy(um, mapping, i * sizeof(unsigned int));
2528 unicode_data[unichar].utf32nfdi = um;
2529 }
2530
2531 if (!unicode_data[unichar].utf32nfdicf) {
2532 um = malloc(i * sizeof(unsigned int));
2533 memcpy(um, mapping, i * sizeof(unsigned int));
2534 unicode_data[unichar].utf32nfdicf = um;
2535 }
2536 if (verbose > 1)
2537 print_utf32nfdi(unichar);
2538 count++;
2539 }
2540 if (verbose > 0)
2541 printf("Processed %d entries\n", count);
2542 }
2543
2544 static void nfdicf_decompose(void)
2545 {
2546 unsigned int unichar;
2547 unsigned int mapping[19];
2548 unsigned int *um;
2549 unsigned int *dc;
2550 int count;
2551 int i;
2552 int j;
2553 int ret;
2554
2555 if (verbose > 0)
2556 printf("Decomposing nfdicf\n");
2557 count = 0;
2558 for (unichar = 0; unichar != 0x110000; unichar++) {
2559 if (!unicode_data[unichar].utf32nfdicf)
2560 continue;
2561 for (;;) {
2562 ret = 1;
2563 i = 0;
2564 um = unicode_data[unichar].utf32nfdicf;
2565 while (*um) {
2566 dc = unicode_data[*um].utf32nfdicf;
2567 if (dc) {
2568 for (j = 0; dc[j]; j++)
2569 mapping[i++] = dc[j];
2570 ret = 0;
2571 } else {
2572 mapping[i++] = *um;
2573 }
2574 um++;
2575 }
2576 mapping[i++] = 0;
2577 if (ret)
2578 break;
2579 free(unicode_data[unichar].utf32nfdicf);
2580 um = malloc(i * sizeof(unsigned int));
2581 memcpy(um, mapping, i * sizeof(unsigned int));
2582 unicode_data[unichar].utf32nfdicf = um;
2583 }
2584 if (verbose > 1)
2585 print_utf32nfdicf(unichar);
2586 count++;
2587 }
2588 if (verbose > 0)
2589 printf("Processed %d entries\n", count);
2590 }
2591
2592
2593
2594 int utf8agemax(struct tree *, const char *);
2595 int utf8nagemax(struct tree *, const char *, size_t);
2596 int utf8agemin(struct tree *, const char *);
2597 int utf8nagemin(struct tree *, const char *, size_t);
2598 ssize_t utf8len(struct tree *, const char *);
2599 ssize_t utf8nlen(struct tree *, const char *, size_t);
2600 struct utf8cursor;
2601 int utf8cursor(struct utf8cursor *, struct tree *, const char *);
2602 int utf8ncursor(struct utf8cursor *, struct tree *, const char *, size_t);
2603 int utf8byte(struct utf8cursor *);
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651 #define SB (0xAC00)
2652 #define LB (0x1100)
2653 #define VB (0x1161)
2654 #define TB (0x11A7)
2655 #define LC (19)
2656 #define VC (21)
2657 #define TC (28)
2658 #define NC (VC * TC)
2659 #define SC (LC * NC)
2660
2661
2662 static utf8leaf_t *utf8hangul(const char *str, unsigned char *hangul)
2663 {
2664 unsigned int si;
2665 unsigned int li;
2666 unsigned int vi;
2667 unsigned int ti;
2668 unsigned char *h;
2669
2670
2671 si = utf8decode(str) - SB;
2672 li = si / NC;
2673 vi = (si % NC) / TC;
2674 ti = si % TC;
2675
2676
2677 h = hangul;
2678 LEAF_GEN(h) = 2;
2679 LEAF_CCC(h) = DECOMPOSE;
2680 h += 2;
2681
2682
2683 h += utf8encode((char *)h, li + LB);
2684
2685
2686 h += utf8encode((char *)h, vi + VB);
2687
2688
2689 if (ti)
2690 h += utf8encode((char *)h, ti + TB);
2691
2692
2693 h[0] = '\0';
2694
2695 return hangul;
2696 }
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706 static utf8leaf_t *utf8nlookup(struct tree *tree, unsigned char *hangul,
2707 const char *s, size_t len)
2708 {
2709 utf8trie_t *trie;
2710 int offlen;
2711 int offset;
2712 int mask;
2713 int node;
2714
2715 if (!tree)
2716 return NULL;
2717 if (len == 0)
2718 return NULL;
2719 node = 1;
2720 trie = utf8data + tree->index;
2721 while (node) {
2722 offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
2723 if (*trie & NEXTBYTE) {
2724 if (--len == 0)
2725 return NULL;
2726 s++;
2727 }
2728 mask = 1 << (*trie & BITNUM);
2729 if (*s & mask) {
2730
2731 if (offlen) {
2732
2733 node = (*trie & RIGHTNODE);
2734 offset = trie[offlen];
2735 while (--offlen) {
2736 offset <<= 8;
2737 offset |= trie[offlen];
2738 }
2739 trie += offset;
2740 } else if (*trie & RIGHTPATH) {
2741
2742 node = (*trie & TRIENODE);
2743 trie++;
2744 } else {
2745
2746 return NULL;
2747 }
2748 } else {
2749
2750 if (offlen) {
2751
2752 node = (*trie & LEFTNODE);
2753 trie += offlen + 1;
2754 } else if (*trie & RIGHTPATH) {
2755
2756 return NULL;
2757 } else {
2758
2759 node = (*trie & TRIENODE);
2760 trie++;
2761 }
2762 }
2763 }
2764
2765
2766
2767
2768
2769
2770 if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
2771 trie = utf8hangul(s - 2, hangul);
2772 return trie;
2773 }
2774
2775
2776
2777
2778
2779
2780
2781 static utf8leaf_t *utf8lookup(struct tree *tree, unsigned char *hangul,
2782 const char *s)
2783 {
2784 return utf8nlookup(tree, hangul, s, (size_t)-1);
2785 }
2786
2787
2788
2789
2790
2791
2792 static inline int utf8clen(const char *s)
2793 {
2794 unsigned char c = *s;
2795 return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
2796 }
2797
2798
2799
2800
2801
2802
2803 int utf8agemax(struct tree *tree, const char *s)
2804 {
2805 utf8leaf_t *leaf;
2806 int age = 0;
2807 int leaf_age;
2808 unsigned char hangul[UTF8HANGULLEAF];
2809
2810 if (!tree)
2811 return -1;
2812
2813 while (*s) {
2814 leaf = utf8lookup(tree, hangul, s);
2815 if (!leaf)
2816 return -1;
2817 leaf_age = ages[LEAF_GEN(leaf)];
2818 if (leaf_age <= tree->maxage && leaf_age > age)
2819 age = leaf_age;
2820 s += utf8clen(s);
2821 }
2822 return age;
2823 }
2824
2825
2826
2827
2828
2829
2830 int utf8agemin(struct tree *tree, const char *s)
2831 {
2832 utf8leaf_t *leaf;
2833 int age;
2834 int leaf_age;
2835 unsigned char hangul[UTF8HANGULLEAF];
2836
2837 if (!tree)
2838 return -1;
2839 age = tree->maxage;
2840 while (*s) {
2841 leaf = utf8lookup(tree, hangul, s);
2842 if (!leaf)
2843 return -1;
2844 leaf_age = ages[LEAF_GEN(leaf)];
2845 if (leaf_age <= tree->maxage && leaf_age < age)
2846 age = leaf_age;
2847 s += utf8clen(s);
2848 }
2849 return age;
2850 }
2851
2852
2853
2854
2855
2856 int utf8nagemax(struct tree *tree, const char *s, size_t len)
2857 {
2858 utf8leaf_t *leaf;
2859 int age = 0;
2860 int leaf_age;
2861 unsigned char hangul[UTF8HANGULLEAF];
2862
2863 if (!tree)
2864 return -1;
2865
2866 while (len && *s) {
2867 leaf = utf8nlookup(tree, hangul, s, len);
2868 if (!leaf)
2869 return -1;
2870 leaf_age = ages[LEAF_GEN(leaf)];
2871 if (leaf_age <= tree->maxage && leaf_age > age)
2872 age = leaf_age;
2873 len -= utf8clen(s);
2874 s += utf8clen(s);
2875 }
2876 return age;
2877 }
2878
2879
2880
2881
2882
2883 int utf8nagemin(struct tree *tree, const char *s, size_t len)
2884 {
2885 utf8leaf_t *leaf;
2886 int leaf_age;
2887 int age;
2888 unsigned char hangul[UTF8HANGULLEAF];
2889
2890 if (!tree)
2891 return -1;
2892 age = tree->maxage;
2893 while (len && *s) {
2894 leaf = utf8nlookup(tree, hangul, s, len);
2895 if (!leaf)
2896 return -1;
2897 leaf_age = ages[LEAF_GEN(leaf)];
2898 if (leaf_age <= tree->maxage && leaf_age < age)
2899 age = leaf_age;
2900 len -= utf8clen(s);
2901 s += utf8clen(s);
2902 }
2903 return age;
2904 }
2905
2906
2907
2908
2909
2910
2911
2912 ssize_t utf8len(struct tree *tree, const char *s)
2913 {
2914 utf8leaf_t *leaf;
2915 size_t ret = 0;
2916 unsigned char hangul[UTF8HANGULLEAF];
2917
2918 if (!tree)
2919 return -1;
2920 while (*s) {
2921 leaf = utf8lookup(tree, hangul, s);
2922 if (!leaf)
2923 return -1;
2924 if (ages[LEAF_GEN(leaf)] > tree->maxage)
2925 ret += utf8clen(s);
2926 else if (LEAF_CCC(leaf) == DECOMPOSE)
2927 ret += strlen(LEAF_STR(leaf));
2928 else
2929 ret += utf8clen(s);
2930 s += utf8clen(s);
2931 }
2932 return ret;
2933 }
2934
2935
2936
2937
2938
2939 ssize_t utf8nlen(struct tree *tree, const char *s, size_t len)
2940 {
2941 utf8leaf_t *leaf;
2942 size_t ret = 0;
2943 unsigned char hangul[UTF8HANGULLEAF];
2944
2945 if (!tree)
2946 return -1;
2947 while (len && *s) {
2948 leaf = utf8nlookup(tree, hangul, s, len);
2949 if (!leaf)
2950 return -1;
2951 if (ages[LEAF_GEN(leaf)] > tree->maxage)
2952 ret += utf8clen(s);
2953 else if (LEAF_CCC(leaf) == DECOMPOSE)
2954 ret += strlen(LEAF_STR(leaf));
2955 else
2956 ret += utf8clen(s);
2957 len -= utf8clen(s);
2958 s += utf8clen(s);
2959 }
2960 return ret;
2961 }
2962
2963
2964
2965
2966 struct utf8cursor {
2967 struct tree *tree;
2968 const char *s;
2969 const char *p;
2970 const char *ss;
2971 const char *sp;
2972 unsigned int len;
2973 unsigned int slen;
2974 short int ccc;
2975 short int nccc;
2976 unsigned int unichar;
2977 unsigned char hangul[UTF8HANGULLEAF];
2978 };
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990 int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
2991 size_t len)
2992 {
2993 if (!tree)
2994 return -1;
2995 if (!s)
2996 return -1;
2997 u8c->tree = tree;
2998 u8c->s = s;
2999 u8c->p = NULL;
3000 u8c->ss = NULL;
3001 u8c->sp = NULL;
3002 u8c->len = len;
3003 u8c->slen = 0;
3004 u8c->ccc = STOPPER;
3005 u8c->nccc = STOPPER;
3006 u8c->unichar = 0;
3007
3008 if (u8c->len != len)
3009 return -1;
3010
3011 if (len > 0 && (*s & 0xC0) == 0x80)
3012 return -1;
3013 return 0;
3014 }
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025 int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s)
3026 {
3027 return utf8ncursor(u8c, tree, s, (unsigned int)-1);
3028 }
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057 int utf8byte(struct utf8cursor *u8c)
3058 {
3059 utf8leaf_t *leaf;
3060 int ccc;
3061
3062 for (;;) {
3063
3064 if (u8c->p && *u8c->s == '\0') {
3065 u8c->s = u8c->p;
3066 u8c->p = NULL;
3067 }
3068
3069
3070 if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
3071
3072 if (u8c->ccc == STOPPER)
3073 return 0;
3074
3075 ccc = STOPPER;
3076 goto ccc_mismatch;
3077 } else if ((*u8c->s & 0xC0) == 0x80) {
3078
3079 if (!u8c->p)
3080 u8c->len--;
3081 return (unsigned char)*u8c->s++;
3082 }
3083
3084
3085 if (u8c->p) {
3086 leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
3087 } else {
3088 leaf = utf8nlookup(u8c->tree, u8c->hangul,
3089 u8c->s, u8c->len);
3090 }
3091
3092
3093 if (!leaf)
3094 return -1;
3095
3096
3097 if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) {
3098 ccc = STOPPER;
3099 } else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) {
3100 u8c->len -= utf8clen(u8c->s);
3101 u8c->p = u8c->s + utf8clen(u8c->s);
3102 u8c->s = LEAF_STR(leaf);
3103
3104 if (*u8c->s == '\0') {
3105 if (u8c->ccc == STOPPER)
3106 continue;
3107 ccc = STOPPER;
3108 goto ccc_mismatch;
3109 }
3110 leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
3111 ccc = LEAF_CCC(leaf);
3112 }
3113 u8c->unichar = utf8decode(u8c->s);
3114
3115
3116
3117
3118
3119 if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
3120 u8c->nccc = ccc;
3121
3122
3123
3124
3125
3126 if (ccc == u8c->ccc) {
3127 if (!u8c->p)
3128 u8c->len--;
3129 return (unsigned char)*u8c->s++;
3130 }
3131
3132
3133 ccc_mismatch:
3134 if (u8c->nccc == STOPPER) {
3135
3136
3137
3138
3139
3140 assert(u8c->ccc == STOPPER);
3141 u8c->ccc = MINCCC - 1;
3142 u8c->nccc = ccc;
3143 u8c->sp = u8c->p;
3144 u8c->ss = u8c->s;
3145 u8c->slen = u8c->len;
3146 if (!u8c->p)
3147 u8c->len -= utf8clen(u8c->s);
3148 u8c->s += utf8clen(u8c->s);
3149 } else if (ccc != STOPPER) {
3150
3151 if (!u8c->p)
3152 u8c->len -= utf8clen(u8c->s);
3153 u8c->s += utf8clen(u8c->s);
3154 } else if (u8c->nccc != MAXCCC + 1) {
3155
3156 u8c->ccc = u8c->nccc;
3157 u8c->nccc = MAXCCC + 1;
3158 u8c->s = u8c->ss;
3159 u8c->p = u8c->sp;
3160 u8c->len = u8c->slen;
3161 } else {
3162
3163 u8c->ccc = STOPPER;
3164 u8c->nccc = STOPPER;
3165 u8c->sp = NULL;
3166 u8c->ss = NULL;
3167 u8c->slen = 0;
3168 }
3169 }
3170 }
3171
3172
3173
3174 static int normalize_line(struct tree *tree)
3175 {
3176 char *s;
3177 char *t;
3178 int c;
3179 struct utf8cursor u8c;
3180
3181
3182 s = buf2;
3183 t = buf3;
3184 if (utf8cursor(&u8c, tree, s))
3185 return -1;
3186 while ((c = utf8byte(&u8c)) > 0)
3187 if (c != (unsigned char)*t++)
3188 return -1;
3189 if (c < 0)
3190 return -1;
3191 if (*t != 0)
3192 return -1;
3193
3194
3195 s = buf2;
3196
3197 s[strlen(s) + 1] = -1;
3198 t = buf3;
3199 if (utf8cursor(&u8c, tree, s))
3200 return -1;
3201 while ((c = utf8byte(&u8c)) > 0)
3202 if (c != (unsigned char)*t++)
3203 return -1;
3204 if (c < 0)
3205 return -1;
3206 if (*t != 0)
3207 return -1;
3208
3209 return 0;
3210 }
3211
3212 static void normalization_test(void)
3213 {
3214 FILE *file;
3215 unsigned int unichar;
3216 struct unicode_data *data;
3217 char *s;
3218 char *t;
3219 int ret;
3220 int ignorables;
3221 int tests = 0;
3222 int failures = 0;
3223
3224 if (verbose > 0)
3225 printf("Parsing %s\n", test_name);
3226
3227 file = fopen(test_name, "r");
3228 if (!file)
3229 open_fail(test_name, errno);
3230
3231 while (fgets(line, LINESIZE, file)) {
3232 ret = sscanf(line, "%[^;];%*[^;];%[^;];%*[^;];%*[^;];",
3233 buf0, buf1);
3234 if (ret != 2 || *line == '#')
3235 continue;
3236 s = buf0;
3237 t = buf2;
3238 while (*s) {
3239 unichar = strtoul(s, &s, 16);
3240 t += utf8encode(t, unichar);
3241 }
3242 *t = '\0';
3243
3244 ignorables = 0;
3245 s = buf1;
3246 t = buf3;
3247 while (*s) {
3248 unichar = strtoul(s, &s, 16);
3249 data = &unicode_data[unichar];
3250 if (data->utf8nfdi && !*data->utf8nfdi)
3251 ignorables = 1;
3252 else
3253 t += utf8encode(t, unichar);
3254 }
3255 *t = '\0';
3256
3257 tests++;
3258 if (normalize_line(nfdi_tree) < 0) {
3259 printf("Line %s -> %s", buf0, buf1);
3260 if (ignorables)
3261 printf(" (ignorables removed)");
3262 printf(" failure\n");
3263 failures++;
3264 }
3265 }
3266 fclose(file);
3267 if (verbose > 0)
3268 printf("Ran %d tests with %d failures\n", tests, failures);
3269 if (failures)
3270 file_fail(test_name);
3271 }
3272
3273
3274
3275 static void write_file(void)
3276 {
3277 FILE *file;
3278 int i;
3279 int j;
3280 int t;
3281 int gen;
3282
3283 if (verbose > 0)
3284 printf("Writing %s\n", utf8_name);
3285 file = fopen(utf8_name, "w");
3286 if (!file)
3287 open_fail(utf8_name, errno);
3288
3289 fprintf(file, "/* This file is generated code, do not edit. */\n");
3290 fprintf(file, "\n");
3291 fprintf(file, "#include <linux/module.h>\n");
3292 fprintf(file, "#include <linux/kernel.h>\n");
3293 fprintf(file, "#include \"utf8n.h\"\n");
3294 fprintf(file, "\n");
3295 fprintf(file, "static const unsigned int utf8agetab[] = {\n");
3296 for (i = 0; i != ages_count; i++)
3297 fprintf(file, "\t%#x%s\n", ages[i],
3298 ages[i] == unicode_maxage ? "" : ",");
3299 fprintf(file, "};\n");
3300 fprintf(file, "\n");
3301 fprintf(file, "static const struct utf8data utf8nfdicfdata[] = {\n");
3302 t = 0;
3303 for (gen = 0; gen < ages_count; gen++) {
3304 fprintf(file, "\t{ %#x, %d }%s\n",
3305 ages[gen], trees[t].index,
3306 ages[gen] == unicode_maxage ? "" : ",");
3307 if (trees[t].maxage == ages[gen])
3308 t += 2;
3309 }
3310 fprintf(file, "};\n");
3311 fprintf(file, "\n");
3312 fprintf(file, "static const struct utf8data utf8nfdidata[] = {\n");
3313 t = 1;
3314 for (gen = 0; gen < ages_count; gen++) {
3315 fprintf(file, "\t{ %#x, %d }%s\n",
3316 ages[gen], trees[t].index,
3317 ages[gen] == unicode_maxage ? "" : ",");
3318 if (trees[t].maxage == ages[gen])
3319 t += 2;
3320 }
3321 fprintf(file, "};\n");
3322 fprintf(file, "\n");
3323 fprintf(file, "static const unsigned char utf8data[%zd] = {\n",
3324 utf8data_size);
3325 t = 0;
3326 for (i = 0; i != utf8data_size; i += 16) {
3327 if (i == trees[t].index) {
3328 fprintf(file, "\t/* %s_%x */\n",
3329 trees[t].type, trees[t].maxage);
3330 if (t < trees_count-1)
3331 t++;
3332 }
3333 fprintf(file, "\t");
3334 for (j = i; j != i + 16; j++)
3335 fprintf(file, "0x%.2x%s", utf8data[j],
3336 (j < utf8data_size -1 ? "," : ""));
3337 fprintf(file, "\n");
3338 }
3339 fprintf(file, "};\n");
3340 fprintf(file, "\n");
3341 fprintf(file, "struct utf8data_table utf8_data_table = {\n");
3342 fprintf(file, "\t.utf8agetab = utf8agetab,\n");
3343 fprintf(file, "\t.utf8agetab_size = ARRAY_SIZE(utf8agetab),\n");
3344 fprintf(file, "\n");
3345 fprintf(file, "\t.utf8nfdicfdata = utf8nfdicfdata,\n");
3346 fprintf(file, "\t.utf8nfdicfdata_size = ARRAY_SIZE(utf8nfdicfdata),\n");
3347 fprintf(file, "\n");
3348 fprintf(file, "\t.utf8nfdidata = utf8nfdidata,\n");
3349 fprintf(file, "\t.utf8nfdidata_size = ARRAY_SIZE(utf8nfdidata),\n");
3350 fprintf(file, "\n");
3351 fprintf(file, "\t.utf8data = utf8data,\n");
3352 fprintf(file, "};\n");
3353 fprintf(file, "EXPORT_SYMBOL_GPL(utf8_data_table);");
3354 fprintf(file, "\n");
3355 fprintf(file, "MODULE_LICENSE(\"GPL v2\");\n");
3356 fclose(file);
3357 }
3358
3359
3360
3361 int main(int argc, char *argv[])
3362 {
3363 unsigned int unichar;
3364 int opt;
3365
3366 argv0 = argv[0];
3367
3368 while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) {
3369 switch (opt) {
3370 case 'a':
3371 age_name = optarg;
3372 break;
3373 case 'c':
3374 ccc_name = optarg;
3375 break;
3376 case 'd':
3377 data_name = optarg;
3378 break;
3379 case 'f':
3380 fold_name = optarg;
3381 break;
3382 case 'n':
3383 norm_name = optarg;
3384 break;
3385 case 'o':
3386 utf8_name = optarg;
3387 break;
3388 case 'p':
3389 prop_name = optarg;
3390 break;
3391 case 't':
3392 test_name = optarg;
3393 break;
3394 case 'v':
3395 verbose++;
3396 break;
3397 case 'h':
3398 help();
3399 exit(0);
3400 default:
3401 usage();
3402 }
3403 }
3404
3405 if (verbose > 1)
3406 help();
3407 for (unichar = 0; unichar != 0x110000; unichar++)
3408 unicode_data[unichar].code = unichar;
3409 age_init();
3410 ccc_init();
3411 nfdi_init();
3412 nfdicf_init();
3413 ignore_init();
3414 corrections_init();
3415 hangul_decompose();
3416 nfdi_decompose();
3417 nfdicf_decompose();
3418 utf8_init();
3419 trees_init();
3420 trees_populate();
3421 trees_reduce();
3422 trees_verify();
3423
3424 (void)lookup(nfdi_tree, " ");
3425 if (verbose > 2)
3426 tree_walk(nfdi_tree);
3427 if (verbose > 2)
3428 tree_walk(nfdicf_tree);
3429 normalization_test();
3430 write_file();
3431
3432 return 0;
3433 }