Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0-or-later
0002 /*
0003  * lib/ts_fsm.c    A naive finite state machine text search approach
0004  *
0005  * Authors: Thomas Graf <tgraf@suug.ch>
0006  *
0007  * ==========================================================================
0008  *
0009  *   A finite state machine consists of n states (struct ts_fsm_token)
0010  *   representing the pattern as a finite automaton. The data is read
0011  *   sequentially on an octet basis. Every state token specifies the number
0012  *   of recurrences and the type of value accepted which can be either a
0013  *   specific character or ctype based set of characters. The available
0014  *   type of recurrences include 1, (0|1), [0 n], and [1 n].
0015  *
0016  *   The algorithm differs between strict/non-strict mode specifying
0017  *   whether the pattern has to start at the first octet. Strict mode
0018  *   is enabled by default and can be disabled by inserting
0019  *   TS_FSM_HEAD_IGNORE as the first token in the chain.
0020  *
0021  *   The runtime performance of the algorithm should be around O(n),
0022  *   however while in strict mode the average runtime can be better.
0023  */
0024 
0025 #include <linux/module.h>
0026 #include <linux/types.h>
0027 #include <linux/string.h>
0028 #include <linux/ctype.h>
0029 #include <linux/textsearch.h>
0030 #include <linux/textsearch_fsm.h>
0031 
0032 struct ts_fsm
0033 {
0034     unsigned int        ntokens;
0035     struct ts_fsm_token tokens[];
0036 };
0037 
0038 /* other values derived from ctype.h */
0039 #define _A      0x100 /* ascii */
0040 #define _W      0x200 /* wildcard */
0041 
0042 /* Map to _ctype flags and some magic numbers */
0043 static const u16 token_map[TS_FSM_TYPE_MAX+1] = {
0044     [TS_FSM_SPECIFIC] = 0,
0045     [TS_FSM_WILDCARD] = _W,
0046     [TS_FSM_CNTRL]    = _C,
0047     [TS_FSM_LOWER]    = _L,
0048     [TS_FSM_UPPER]    = _U,
0049     [TS_FSM_PUNCT]    = _P,
0050     [TS_FSM_SPACE]    = _S,
0051     [TS_FSM_DIGIT]    = _D,
0052     [TS_FSM_XDIGIT]   = _D | _X,
0053     [TS_FSM_ALPHA]    = _U | _L,
0054     [TS_FSM_ALNUM]    = _U | _L | _D,
0055     [TS_FSM_PRINT]    = _P | _U | _L | _D | _SP,
0056     [TS_FSM_GRAPH]    = _P | _U | _L | _D,
0057     [TS_FSM_ASCII]    = _A,
0058 };
0059 
0060 static const u16 token_lookup_tbl[256] = {
0061 _W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,        /*   0-  3 */
0062 _W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,        /*   4-  7 */
0063 _W|_A|_C,      _W|_A|_C|_S,  _W|_A|_C|_S,  _W|_A|_C|_S,     /*   8- 11 */
0064 _W|_A|_C|_S,   _W|_A|_C|_S,  _W|_A|_C,     _W|_A|_C,        /*  12- 15 */
0065 _W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,        /*  16- 19 */
0066 _W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,        /*  20- 23 */
0067 _W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,        /*  24- 27 */
0068 _W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,        /*  28- 31 */
0069 _W|_A|_S|_SP,  _W|_A|_P,     _W|_A|_P,     _W|_A|_P,        /*  32- 35 */
0070 _W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,        /*  36- 39 */
0071 _W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,        /*  40- 43 */
0072 _W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,        /*  44- 47 */
0073 _W|_A|_D,      _W|_A|_D,     _W|_A|_D,     _W|_A|_D,        /*  48- 51 */
0074 _W|_A|_D,      _W|_A|_D,     _W|_A|_D,     _W|_A|_D,        /*  52- 55 */
0075 _W|_A|_D,      _W|_A|_D,     _W|_A|_P,     _W|_A|_P,        /*  56- 59 */
0076 _W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,        /*  60- 63 */
0077 _W|_A|_P,      _W|_A|_U|_X,  _W|_A|_U|_X,  _W|_A|_U|_X,     /*  64- 67 */
0078 _W|_A|_U|_X,   _W|_A|_U|_X,  _W|_A|_U|_X,  _W|_A|_U,        /*  68- 71 */
0079 _W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_U,        /*  72- 75 */
0080 _W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_U,        /*  76- 79 */
0081 _W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_U,        /*  80- 83 */
0082 _W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_U,        /*  84- 87 */
0083 _W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_P,        /*  88- 91 */
0084 _W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,        /*  92- 95 */
0085 _W|_A|_P,      _W|_A|_L|_X,  _W|_A|_L|_X,  _W|_A|_L|_X,     /*  96- 99 */
0086 _W|_A|_L|_X,   _W|_A|_L|_X,  _W|_A|_L|_X,  _W|_A|_L,        /* 100-103 */
0087 _W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_L,        /* 104-107 */
0088 _W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_L,        /* 108-111 */
0089 _W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_L,        /* 112-115 */
0090 _W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_L,        /* 116-119 */
0091 _W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_P,        /* 120-123 */
0092 _W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_C,        /* 124-127 */
0093 _W,            _W,           _W,           _W,          /* 128-131 */
0094 _W,            _W,           _W,           _W,          /* 132-135 */
0095 _W,            _W,           _W,           _W,          /* 136-139 */
0096 _W,            _W,           _W,           _W,          /* 140-143 */
0097 _W,            _W,           _W,           _W,          /* 144-147 */
0098 _W,            _W,           _W,           _W,          /* 148-151 */
0099 _W,            _W,           _W,           _W,          /* 152-155 */
0100 _W,            _W,           _W,           _W,          /* 156-159 */
0101 _W|_S|_SP,     _W|_P,        _W|_P,        _W|_P,       /* 160-163 */
0102 _W|_P,         _W|_P,        _W|_P,        _W|_P,       /* 164-167 */
0103 _W|_P,         _W|_P,        _W|_P,        _W|_P,       /* 168-171 */
0104 _W|_P,         _W|_P,        _W|_P,        _W|_P,       /* 172-175 */
0105 _W|_P,         _W|_P,        _W|_P,        _W|_P,       /* 176-179 */
0106 _W|_P,         _W|_P,        _W|_P,        _W|_P,       /* 180-183 */
0107 _W|_P,         _W|_P,        _W|_P,        _W|_P,       /* 184-187 */
0108 _W|_P,         _W|_P,        _W|_P,        _W|_P,       /* 188-191 */
0109 _W|_U,         _W|_U,        _W|_U,        _W|_U,       /* 192-195 */
0110 _W|_U,         _W|_U,        _W|_U,        _W|_U,       /* 196-199 */
0111 _W|_U,         _W|_U,        _W|_U,        _W|_U,       /* 200-203 */
0112 _W|_U,         _W|_U,        _W|_U,        _W|_U,       /* 204-207 */
0113 _W|_U,         _W|_U,        _W|_U,        _W|_U,       /* 208-211 */
0114 _W|_U,         _W|_U,        _W|_U,        _W|_P,       /* 212-215 */
0115 _W|_U,         _W|_U,        _W|_U,        _W|_U,       /* 216-219 */
0116 _W|_U,         _W|_U,        _W|_U,        _W|_L,       /* 220-223 */
0117 _W|_L,         _W|_L,        _W|_L,        _W|_L,       /* 224-227 */
0118 _W|_L,         _W|_L,        _W|_L,        _W|_L,       /* 228-231 */
0119 _W|_L,         _W|_L,        _W|_L,        _W|_L,       /* 232-235 */
0120 _W|_L,         _W|_L,        _W|_L,        _W|_L,       /* 236-239 */
0121 _W|_L,         _W|_L,        _W|_L,        _W|_L,       /* 240-243 */
0122 _W|_L,         _W|_L,        _W|_L,        _W|_P,       /* 244-247 */
0123 _W|_L,         _W|_L,        _W|_L,        _W|_L,       /* 248-251 */
0124 _W|_L,         _W|_L,        _W|_L,        _W|_L};      /* 252-255 */
0125 
0126 static inline int match_token(struct ts_fsm_token *t, u8 d)
0127 {
0128     if (t->type)
0129         return (token_lookup_tbl[d] & t->type) != 0;
0130     else
0131         return t->value == d;
0132 }
0133 
0134 static unsigned int fsm_find(struct ts_config *conf, struct ts_state *state)
0135 {
0136     struct ts_fsm *fsm = ts_config_priv(conf);
0137     struct ts_fsm_token *cur = NULL, *next;
0138     unsigned int match_start, block_idx = 0, tok_idx;
0139     unsigned block_len = 0, strict, consumed = state->offset;
0140     const u8 *data;
0141 
0142 #define GET_NEXT_BLOCK()        \
0143 ({  consumed += block_idx;      \
0144     block_idx = 0;          \
0145     block_len = conf->get_next_block(consumed, &data, conf, state); })
0146 
0147 #define TOKEN_MISMATCH()        \
0148     do {                \
0149         if (strict)     \
0150             goto no_match;  \
0151         block_idx++;        \
0152         goto startover;     \
0153     } while(0)
0154 
0155 #define end_of_data() unlikely(block_idx >= block_len && !GET_NEXT_BLOCK())
0156 
0157     if (end_of_data())
0158         goto no_match;
0159 
0160     strict = fsm->tokens[0].recur != TS_FSM_HEAD_IGNORE;
0161 
0162 startover:
0163     match_start = consumed + block_idx;
0164 
0165     for (tok_idx = 0; tok_idx < fsm->ntokens; tok_idx++) {
0166         cur = &fsm->tokens[tok_idx];
0167 
0168         if (likely(tok_idx < (fsm->ntokens - 1)))
0169             next = &fsm->tokens[tok_idx + 1];
0170         else
0171             next = NULL;
0172 
0173         switch (cur->recur) {
0174         case TS_FSM_SINGLE:
0175             if (end_of_data())
0176                 goto no_match;
0177 
0178             if (!match_token(cur, data[block_idx]))
0179                 TOKEN_MISMATCH();
0180             break;
0181 
0182         case TS_FSM_PERHAPS:
0183             if (end_of_data() ||
0184                 !match_token(cur, data[block_idx]))
0185                 continue;
0186             break;
0187 
0188         case TS_FSM_MULTI:
0189             if (end_of_data())
0190                 goto no_match;
0191 
0192             if (!match_token(cur, data[block_idx]))
0193                 TOKEN_MISMATCH();
0194 
0195             block_idx++;
0196             fallthrough;
0197 
0198         case TS_FSM_ANY:
0199             if (next == NULL)
0200                 goto found_match;
0201 
0202             if (end_of_data())
0203                 continue;
0204 
0205             while (!match_token(next, data[block_idx])) {
0206                 if (!match_token(cur, data[block_idx]))
0207                     TOKEN_MISMATCH();
0208                 block_idx++;
0209                 if (end_of_data())
0210                     goto no_match;
0211             }
0212             continue;
0213 
0214         /*
0215          * Optimization: Prefer small local loop over jumping
0216          * back and forth until garbage at head is munched.
0217          */
0218         case TS_FSM_HEAD_IGNORE:
0219             if (end_of_data())
0220                 continue;
0221 
0222             while (!match_token(next, data[block_idx])) {
0223                 /*
0224                  * Special case, don't start over upon
0225                  * a mismatch, give the user the
0226                  * chance to specify the type of data
0227                  * allowed to be ignored.
0228                  */
0229                 if (!match_token(cur, data[block_idx]))
0230                     goto no_match;
0231 
0232                 block_idx++;
0233                 if (end_of_data())
0234                     goto no_match;
0235             }
0236 
0237             match_start = consumed + block_idx;
0238             continue;
0239         }
0240 
0241         block_idx++;
0242     }
0243 
0244     if (end_of_data())
0245         goto found_match;
0246 
0247 no_match:
0248     return UINT_MAX;
0249 
0250 found_match:
0251     state->offset = consumed + block_idx;
0252     return match_start;
0253 }
0254 
0255 static struct ts_config *fsm_init(const void *pattern, unsigned int len,
0256                     gfp_t gfp_mask, int flags)
0257 {
0258     int i, err = -EINVAL;
0259     struct ts_config *conf;
0260     struct ts_fsm *fsm;
0261     struct ts_fsm_token *tokens = (struct ts_fsm_token *) pattern;
0262     unsigned int ntokens = len / sizeof(*tokens);
0263     size_t priv_size = sizeof(*fsm) + len;
0264 
0265     if (len  % sizeof(struct ts_fsm_token) || ntokens < 1)
0266         goto errout;
0267 
0268     if (flags & TS_IGNORECASE)
0269         goto errout;
0270 
0271     for (i = 0; i < ntokens; i++) {
0272         struct ts_fsm_token *t = &tokens[i];
0273 
0274         if (t->type > TS_FSM_TYPE_MAX || t->recur > TS_FSM_RECUR_MAX)
0275             goto errout;
0276 
0277         if (t->recur == TS_FSM_HEAD_IGNORE &&
0278             (i != 0 || i == (ntokens - 1)))
0279             goto errout;
0280     }
0281 
0282     conf = alloc_ts_config(priv_size, gfp_mask);
0283     if (IS_ERR(conf))
0284         return conf;
0285 
0286     conf->flags = flags;
0287     fsm = ts_config_priv(conf);
0288     fsm->ntokens = ntokens;
0289     memcpy(fsm->tokens, pattern, len);
0290 
0291     for (i = 0; i < fsm->ntokens; i++) {
0292         struct ts_fsm_token *t = &fsm->tokens[i];
0293         t->type = token_map[t->type];
0294     }
0295 
0296     return conf;
0297 
0298 errout:
0299     return ERR_PTR(err);
0300 }
0301 
0302 static void *fsm_get_pattern(struct ts_config *conf)
0303 {
0304     struct ts_fsm *fsm = ts_config_priv(conf);
0305     return fsm->tokens;
0306 }
0307 
0308 static unsigned int fsm_get_pattern_len(struct ts_config *conf)
0309 {
0310     struct ts_fsm *fsm = ts_config_priv(conf);
0311     return fsm->ntokens * sizeof(struct ts_fsm_token);
0312 }
0313 
0314 static struct ts_ops fsm_ops = {
0315     .name         = "fsm",
0316     .find         = fsm_find,
0317     .init         = fsm_init,
0318     .get_pattern      = fsm_get_pattern,
0319     .get_pattern_len  = fsm_get_pattern_len,
0320     .owner        = THIS_MODULE,
0321     .list         = LIST_HEAD_INIT(fsm_ops.list)
0322 };
0323 
0324 static int __init init_fsm(void)
0325 {
0326     return textsearch_register(&fsm_ops);
0327 }
0328 
0329 static void __exit exit_fsm(void)
0330 {
0331     textsearch_unregister(&fsm_ops);
0332 }
0333 
0334 MODULE_LICENSE("GPL");
0335 
0336 module_init(init_fsm);
0337 module_exit(exit_fsm);