Back to home page

LXR

 
 

    


0001 /*
0002  * lib/ts_fsm.c    A naive finite state machine text search approach
0003  *
0004  *      This program is free software; you can redistribute it and/or
0005  *      modify it under the terms of the GNU General Public License
0006  *      as published by the Free Software Foundation; either version
0007  *      2 of the License, or (at your option) any later version.
0008  *
0009  * Authors: Thomas Graf <tgraf@suug.ch>
0010  *
0011  * ==========================================================================
0012  *
0013  *   A finite state machine consists of n states (struct ts_fsm_token)
0014  *   representing the pattern as a finite automation. The data is read
0015  *   sequentially on an octet basis. Every state token specifies the number
0016  *   of recurrences and the type of value accepted which can be either a
0017  *   specific character or ctype based set of characters. The available
0018  *   type of recurrences include 1, (0|1), [0 n], and [1 n].
0019  *
0020  *   The algorithm differs between strict/non-strict mode specifying
0021  *   whether the pattern has to start at the first octet. Strict mode
0022  *   is enabled by default and can be disabled by inserting
0023  *   TS_FSM_HEAD_IGNORE as the first token in the chain.
0024  *
0025  *   The runtime performance of the algorithm should be around O(n),
0026  *   however while in strict mode the average runtime can be better.
0027  */
0028 
0029 #include <linux/module.h>
0030 #include <linux/types.h>
0031 #include <linux/string.h>
0032 #include <linux/ctype.h>
0033 #include <linux/textsearch.h>
0034 #include <linux/textsearch_fsm.h>
0035 
0036 struct ts_fsm
0037 {
0038     unsigned int        ntokens;
0039     struct ts_fsm_token tokens[0];
0040 };
0041 
0042 /* other values derived from ctype.h */
0043 #define _A      0x100 /* ascii */
0044 #define _W      0x200 /* wildcard */
0045 
0046 /* Map to _ctype flags and some magic numbers */
0047 static const u16 token_map[TS_FSM_TYPE_MAX+1] = {
0048     [TS_FSM_SPECIFIC] = 0,
0049     [TS_FSM_WILDCARD] = _W,
0050     [TS_FSM_CNTRL]    = _C,
0051     [TS_FSM_LOWER]    = _L,
0052     [TS_FSM_UPPER]    = _U,
0053     [TS_FSM_PUNCT]    = _P,
0054     [TS_FSM_SPACE]    = _S,
0055     [TS_FSM_DIGIT]    = _D,
0056     [TS_FSM_XDIGIT]   = _D | _X,
0057     [TS_FSM_ALPHA]    = _U | _L,
0058     [TS_FSM_ALNUM]    = _U | _L | _D,
0059     [TS_FSM_PRINT]    = _P | _U | _L | _D | _SP,
0060     [TS_FSM_GRAPH]    = _P | _U | _L | _D,
0061     [TS_FSM_ASCII]    = _A,
0062 };
0063 
0064 static const u16 token_lookup_tbl[256] = {
0065 _W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,        /*   0-  3 */
0066 _W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,        /*   4-  7 */
0067 _W|_A|_C,      _W|_A|_C|_S,  _W|_A|_C|_S,  _W|_A|_C|_S,     /*   8- 11 */
0068 _W|_A|_C|_S,   _W|_A|_C|_S,  _W|_A|_C,     _W|_A|_C,        /*  12- 15 */
0069 _W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,        /*  16- 19 */
0070 _W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,        /*  20- 23 */
0071 _W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,        /*  24- 27 */
0072 _W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,        /*  28- 31 */
0073 _W|_A|_S|_SP,  _W|_A|_P,     _W|_A|_P,     _W|_A|_P,        /*  32- 35 */
0074 _W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,        /*  36- 39 */
0075 _W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,        /*  40- 43 */
0076 _W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,        /*  44- 47 */
0077 _W|_A|_D,      _W|_A|_D,     _W|_A|_D,     _W|_A|_D,        /*  48- 51 */
0078 _W|_A|_D,      _W|_A|_D,     _W|_A|_D,     _W|_A|_D,        /*  52- 55 */
0079 _W|_A|_D,      _W|_A|_D,     _W|_A|_P,     _W|_A|_P,        /*  56- 59 */
0080 _W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,        /*  60- 63 */
0081 _W|_A|_P,      _W|_A|_U|_X,  _W|_A|_U|_X,  _W|_A|_U|_X,     /*  64- 67 */
0082 _W|_A|_U|_X,   _W|_A|_U|_X,  _W|_A|_U|_X,  _W|_A|_U,        /*  68- 71 */
0083 _W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_U,        /*  72- 75 */
0084 _W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_U,        /*  76- 79 */
0085 _W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_U,        /*  80- 83 */
0086 _W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_U,        /*  84- 87 */
0087 _W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_P,        /*  88- 91 */
0088 _W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,        /*  92- 95 */
0089 _W|_A|_P,      _W|_A|_L|_X,  _W|_A|_L|_X,  _W|_A|_L|_X,     /*  96- 99 */
0090 _W|_A|_L|_X,   _W|_A|_L|_X,  _W|_A|_L|_X,  _W|_A|_L,        /* 100-103 */
0091 _W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_L,        /* 104-107 */
0092 _W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_L,        /* 108-111 */
0093 _W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_L,        /* 112-115 */
0094 _W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_L,        /* 116-119 */
0095 _W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_P,        /* 120-123 */
0096 _W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_C,        /* 124-127 */
0097 _W,            _W,           _W,           _W,          /* 128-131 */
0098 _W,            _W,           _W,           _W,          /* 132-135 */
0099 _W,            _W,           _W,           _W,          /* 136-139 */
0100 _W,            _W,           _W,           _W,          /* 140-143 */
0101 _W,            _W,           _W,           _W,          /* 144-147 */
0102 _W,            _W,           _W,           _W,          /* 148-151 */
0103 _W,            _W,           _W,           _W,          /* 152-155 */
0104 _W,            _W,           _W,           _W,          /* 156-159 */
0105 _W|_S|_SP,     _W|_P,        _W|_P,        _W|_P,       /* 160-163 */
0106 _W|_P,         _W|_P,        _W|_P,        _W|_P,       /* 164-167 */
0107 _W|_P,         _W|_P,        _W|_P,        _W|_P,       /* 168-171 */
0108 _W|_P,         _W|_P,        _W|_P,        _W|_P,       /* 172-175 */
0109 _W|_P,         _W|_P,        _W|_P,        _W|_P,       /* 176-179 */
0110 _W|_P,         _W|_P,        _W|_P,        _W|_P,       /* 180-183 */
0111 _W|_P,         _W|_P,        _W|_P,        _W|_P,       /* 184-187 */
0112 _W|_P,         _W|_P,        _W|_P,        _W|_P,       /* 188-191 */
0113 _W|_U,         _W|_U,        _W|_U,        _W|_U,       /* 192-195 */
0114 _W|_U,         _W|_U,        _W|_U,        _W|_U,       /* 196-199 */
0115 _W|_U,         _W|_U,        _W|_U,        _W|_U,       /* 200-203 */
0116 _W|_U,         _W|_U,        _W|_U,        _W|_U,       /* 204-207 */
0117 _W|_U,         _W|_U,        _W|_U,        _W|_U,       /* 208-211 */
0118 _W|_U,         _W|_U,        _W|_U,        _W|_P,       /* 212-215 */
0119 _W|_U,         _W|_U,        _W|_U,        _W|_U,       /* 216-219 */
0120 _W|_U,         _W|_U,        _W|_U,        _W|_L,       /* 220-223 */
0121 _W|_L,         _W|_L,        _W|_L,        _W|_L,       /* 224-227 */
0122 _W|_L,         _W|_L,        _W|_L,        _W|_L,       /* 228-231 */
0123 _W|_L,         _W|_L,        _W|_L,        _W|_L,       /* 232-235 */
0124 _W|_L,         _W|_L,        _W|_L,        _W|_L,       /* 236-239 */
0125 _W|_L,         _W|_L,        _W|_L,        _W|_L,       /* 240-243 */
0126 _W|_L,         _W|_L,        _W|_L,        _W|_P,       /* 244-247 */
0127 _W|_L,         _W|_L,        _W|_L,        _W|_L,       /* 248-251 */
0128 _W|_L,         _W|_L,        _W|_L,        _W|_L};      /* 252-255 */
0129 
0130 static inline int match_token(struct ts_fsm_token *t, u8 d)
0131 {
0132     if (t->type)
0133         return (token_lookup_tbl[d] & t->type) != 0;
0134     else
0135         return t->value == d;
0136 }
0137 
0138 static unsigned int fsm_find(struct ts_config *conf, struct ts_state *state)
0139 {
0140     struct ts_fsm *fsm = ts_config_priv(conf);
0141     struct ts_fsm_token *cur = NULL, *next;
0142     unsigned int match_start, block_idx = 0, tok_idx;
0143     unsigned block_len = 0, strict, consumed = state->offset;
0144     const u8 *data;
0145 
0146 #define GET_NEXT_BLOCK()        \
0147 ({  consumed += block_idx;      \
0148     block_idx = 0;          \
0149     block_len = conf->get_next_block(consumed, &data, conf, state); })
0150 
0151 #define TOKEN_MISMATCH()        \
0152     do {                \
0153         if (strict)     \
0154             goto no_match;  \
0155         block_idx++;        \
0156         goto startover;     \
0157     } while(0)
0158 
0159 #define end_of_data() unlikely(block_idx >= block_len && !GET_NEXT_BLOCK())
0160 
0161     if (end_of_data())
0162         goto no_match;
0163 
0164     strict = fsm->tokens[0].recur != TS_FSM_HEAD_IGNORE;
0165 
0166 startover:
0167     match_start = consumed + block_idx;
0168 
0169     for (tok_idx = 0; tok_idx < fsm->ntokens; tok_idx++) {
0170         cur = &fsm->tokens[tok_idx];
0171 
0172         if (likely(tok_idx < (fsm->ntokens - 1)))
0173             next = &fsm->tokens[tok_idx + 1];
0174         else
0175             next = NULL;
0176 
0177         switch (cur->recur) {
0178         case TS_FSM_SINGLE:
0179             if (end_of_data())
0180                 goto no_match;
0181 
0182             if (!match_token(cur, data[block_idx]))
0183                 TOKEN_MISMATCH();
0184             break;
0185 
0186         case TS_FSM_PERHAPS:
0187             if (end_of_data() ||
0188                 !match_token(cur, data[block_idx]))
0189                 continue;
0190             break;
0191 
0192         case TS_FSM_MULTI:
0193             if (end_of_data())
0194                 goto no_match;
0195 
0196             if (!match_token(cur, data[block_idx]))
0197                 TOKEN_MISMATCH();
0198 
0199             block_idx++;
0200             /* fall through */
0201 
0202         case TS_FSM_ANY:
0203             if (next == NULL)
0204                 goto found_match;
0205 
0206             if (end_of_data())
0207                 continue;
0208 
0209             while (!match_token(next, data[block_idx])) {
0210                 if (!match_token(cur, data[block_idx]))
0211                     TOKEN_MISMATCH();
0212                 block_idx++;
0213                 if (end_of_data())
0214                     goto no_match;
0215             }
0216             continue;
0217 
0218         /*
0219          * Optimization: Prefer small local loop over jumping
0220          * back and forth until garbage at head is munched.
0221          */
0222         case TS_FSM_HEAD_IGNORE:
0223             if (end_of_data())
0224                 continue;
0225 
0226             while (!match_token(next, data[block_idx])) {
0227                 /*
0228                  * Special case, don't start over upon
0229                  * a mismatch, give the user the
0230                  * chance to specify the type of data
0231                  * allowed to be ignored.
0232                  */
0233                 if (!match_token(cur, data[block_idx]))
0234                     goto no_match;
0235 
0236                 block_idx++;
0237                 if (end_of_data())
0238                     goto no_match;
0239             }
0240 
0241             match_start = consumed + block_idx;
0242             continue;
0243         }
0244 
0245         block_idx++;
0246     }
0247 
0248     if (end_of_data())
0249         goto found_match;
0250 
0251 no_match:
0252     return UINT_MAX;
0253 
0254 found_match:
0255     state->offset = consumed + block_idx;
0256     return match_start;
0257 }
0258 
0259 static struct ts_config *fsm_init(const void *pattern, unsigned int len,
0260                     gfp_t gfp_mask, int flags)
0261 {
0262     int i, err = -EINVAL;
0263     struct ts_config *conf;
0264     struct ts_fsm *fsm;
0265     struct ts_fsm_token *tokens = (struct ts_fsm_token *) pattern;
0266     unsigned int ntokens = len / sizeof(*tokens);
0267     size_t priv_size = sizeof(*fsm) + len;
0268 
0269     if (len  % sizeof(struct ts_fsm_token) || ntokens < 1)
0270         goto errout;
0271 
0272     if (flags & TS_IGNORECASE)
0273         goto errout;
0274 
0275     for (i = 0; i < ntokens; i++) {
0276         struct ts_fsm_token *t = &tokens[i];
0277 
0278         if (t->type > TS_FSM_TYPE_MAX || t->recur > TS_FSM_RECUR_MAX)
0279             goto errout;
0280 
0281         if (t->recur == TS_FSM_HEAD_IGNORE &&
0282             (i != 0 || i == (ntokens - 1)))
0283             goto errout;
0284     }
0285 
0286     conf = alloc_ts_config(priv_size, gfp_mask);
0287     if (IS_ERR(conf))
0288         return conf;
0289 
0290     conf->flags = flags;
0291     fsm = ts_config_priv(conf);
0292     fsm->ntokens = ntokens;
0293     memcpy(fsm->tokens, pattern, len);
0294 
0295     for (i = 0; i < fsm->ntokens; i++) {
0296         struct ts_fsm_token *t = &fsm->tokens[i];
0297         t->type = token_map[t->type];
0298     }
0299 
0300     return conf;
0301 
0302 errout:
0303     return ERR_PTR(err);
0304 }
0305 
0306 static void *fsm_get_pattern(struct ts_config *conf)
0307 {
0308     struct ts_fsm *fsm = ts_config_priv(conf);
0309     return fsm->tokens;
0310 }
0311 
0312 static unsigned int fsm_get_pattern_len(struct ts_config *conf)
0313 {
0314     struct ts_fsm *fsm = ts_config_priv(conf);
0315     return fsm->ntokens * sizeof(struct ts_fsm_token);
0316 }
0317 
0318 static struct ts_ops fsm_ops = {
0319     .name         = "fsm",
0320     .find         = fsm_find,
0321     .init         = fsm_init,
0322     .get_pattern      = fsm_get_pattern,
0323     .get_pattern_len  = fsm_get_pattern_len,
0324     .owner        = THIS_MODULE,
0325     .list         = LIST_HEAD_INIT(fsm_ops.list)
0326 };
0327 
0328 static int __init init_fsm(void)
0329 {
0330     return textsearch_register(&fsm_ops);
0331 }
0332 
0333 static void __exit exit_fsm(void)
0334 {
0335     textsearch_unregister(&fsm_ops);
0336 }
0337 
0338 MODULE_LICENSE("GPL");
0339 
0340 module_init(init_fsm);
0341 module_exit(exit_fsm);