Back to home page

LXR

 
 

    


0001 /*
0002  * lib/ts_bm.c      Boyer-Moore text search implementation
0003  *
0004  *      This program is free software; you can redistribute it and/or
0005  *      modify it under the terms of the GNU General Public License
0006  *      as published by the Free Software Foundation; either version
0007  *      2 of the License, or (at your option) any later version.
0008  *
0009  * Authors: Pablo Neira Ayuso <pablo@eurodev.net>
0010  *
0011  * ==========================================================================
0012  * 
0013  *   Implements Boyer-Moore string matching algorithm:
0014  *
0015  *   [1] A Fast String Searching Algorithm, R.S. Boyer and Moore.
0016  *       Communications of the Association for Computing Machinery, 
0017  *       20(10), 1977, pp. 762-772.
0018  *       http://www.cs.utexas.edu/users/moore/publications/fstrpos.pdf
0019  *
0020  *   [2] Handbook of Exact String Matching Algorithms, Thierry Lecroq, 2004
0021  *       http://www-igm.univ-mlv.fr/~lecroq/string/string.pdf
0022  *
0023  *   Note: Since Boyer-Moore (BM) performs searches for matchings from right 
0024  *   to left, it's still possible that a matching could be spread over 
0025  *   multiple blocks, in that case this algorithm won't find any coincidence.
0026  *   
0027  *   If you're willing to ensure that such thing won't ever happen, use the
0028  *   Knuth-Pratt-Morris (KMP) implementation instead. In conclusion, choose 
0029  *   the proper string search algorithm depending on your setting. 
0030  *
0031  *   Say you're using the textsearch infrastructure for filtering, NIDS or 
0032  *   any similar security focused purpose, then go KMP. Otherwise, if you 
0033  *   really care about performance, say you're classifying packets to apply
0034  *   Quality of Service (QoS) policies, and you don't mind about possible
0035  *   matchings spread over multiple fragments, then go BM.
0036  */
0037 
0038 #include <linux/kernel.h>
0039 #include <linux/module.h>
0040 #include <linux/types.h>
0041 #include <linux/string.h>
0042 #include <linux/ctype.h>
0043 #include <linux/textsearch.h>
0044 
0045 /* Alphabet size, use ASCII */
0046 #define ASIZE 256
0047 
0048 #if 0
0049 #define DEBUGP printk
0050 #else
0051 #define DEBUGP(args, format...)
0052 #endif
0053 
0054 struct ts_bm
0055 {
0056     u8 *        pattern;
0057     unsigned int    patlen;
0058     unsigned int    bad_shift[ASIZE];
0059     unsigned int    good_shift[0];
0060 };
0061 
0062 static unsigned int bm_find(struct ts_config *conf, struct ts_state *state)
0063 {
0064     struct ts_bm *bm = ts_config_priv(conf);
0065     unsigned int i, text_len, consumed = state->offset;
0066     const u8 *text;
0067     int shift = bm->patlen - 1, bs;
0068     const u8 icase = conf->flags & TS_IGNORECASE;
0069 
0070     for (;;) {
0071         text_len = conf->get_next_block(consumed, &text, conf, state);
0072 
0073         if (unlikely(text_len == 0))
0074             break;
0075 
0076         while (shift < text_len) {
0077             DEBUGP("Searching in position %d (%c)\n", 
0078                 shift, text[shift]);
0079             for (i = 0; i < bm->patlen; i++) 
0080                 if ((icase ? toupper(text[shift-i])
0081                     : text[shift-i])
0082                     != bm->pattern[bm->patlen-1-i])
0083                      goto next;
0084 
0085             /* London calling... */
0086             DEBUGP("found!\n");
0087             return consumed += (shift-(bm->patlen-1));
0088 
0089 next:           bs = bm->bad_shift[text[shift-i]];
0090 
0091             /* Now jumping to... */
0092             shift = max_t(int, shift-i+bs, shift+bm->good_shift[i]);
0093         }
0094         consumed += text_len;
0095     }
0096 
0097     return UINT_MAX;
0098 }
0099 
0100 static int subpattern(u8 *pattern, int i, int j, int g)
0101 {
0102     int x = i+g-1, y = j+g-1, ret = 0;
0103 
0104     while(pattern[x--] == pattern[y--]) {
0105         if (y < 0) {
0106             ret = 1;
0107             break;
0108         }
0109         if (--g == 0) {
0110             ret = pattern[i-1] != pattern[j-1];
0111             break;
0112         }
0113     }
0114 
0115     return ret;
0116 }
0117 
0118 static void compute_prefix_tbl(struct ts_bm *bm, int flags)
0119 {
0120     int i, j, g;
0121 
0122     for (i = 0; i < ASIZE; i++)
0123         bm->bad_shift[i] = bm->patlen;
0124     for (i = 0; i < bm->patlen - 1; i++) {
0125         bm->bad_shift[bm->pattern[i]] = bm->patlen - 1 - i;
0126         if (flags & TS_IGNORECASE)
0127             bm->bad_shift[tolower(bm->pattern[i])]
0128                 = bm->patlen - 1 - i;
0129     }
0130 
0131     /* Compute the good shift array, used to match reocurrences 
0132      * of a subpattern */
0133     bm->good_shift[0] = 1;
0134     for (i = 1; i < bm->patlen; i++)
0135         bm->good_shift[i] = bm->patlen;
0136         for (i = bm->patlen-1, g = 1; i > 0; g++, i--) {
0137         for (j = i-1; j >= 1-g ; j--)
0138             if (subpattern(bm->pattern, i, j, g)) {
0139                 bm->good_shift[g] = bm->patlen-j-g;
0140                 break;
0141             }
0142     }
0143 }
0144 
0145 static struct ts_config *bm_init(const void *pattern, unsigned int len,
0146                  gfp_t gfp_mask, int flags)
0147 {
0148     struct ts_config *conf;
0149     struct ts_bm *bm;
0150     int i;
0151     unsigned int prefix_tbl_len = len * sizeof(unsigned int);
0152     size_t priv_size = sizeof(*bm) + len + prefix_tbl_len;
0153 
0154     conf = alloc_ts_config(priv_size, gfp_mask);
0155     if (IS_ERR(conf))
0156         return conf;
0157 
0158     conf->flags = flags;
0159     bm = ts_config_priv(conf);
0160     bm->patlen = len;
0161     bm->pattern = (u8 *) bm->good_shift + prefix_tbl_len;
0162     if (flags & TS_IGNORECASE)
0163         for (i = 0; i < len; i++)
0164             bm->pattern[i] = toupper(((u8 *)pattern)[i]);
0165     else
0166         memcpy(bm->pattern, pattern, len);
0167     compute_prefix_tbl(bm, flags);
0168 
0169     return conf;
0170 }
0171 
0172 static void *bm_get_pattern(struct ts_config *conf)
0173 {
0174     struct ts_bm *bm = ts_config_priv(conf);
0175     return bm->pattern;
0176 }
0177 
0178 static unsigned int bm_get_pattern_len(struct ts_config *conf)
0179 {
0180     struct ts_bm *bm = ts_config_priv(conf);
0181     return bm->patlen;
0182 }
0183 
0184 static struct ts_ops bm_ops = {
0185     .name         = "bm",
0186     .find         = bm_find,
0187     .init         = bm_init,
0188     .get_pattern      = bm_get_pattern,
0189     .get_pattern_len  = bm_get_pattern_len,
0190     .owner        = THIS_MODULE,
0191     .list         = LIST_HEAD_INIT(bm_ops.list)
0192 };
0193 
0194 static int __init init_bm(void)
0195 {
0196     return textsearch_register(&bm_ops);
0197 }
0198 
0199 static void __exit exit_bm(void)
0200 {
0201     textsearch_unregister(&bm_ops);
0202 }
0203 
0204 MODULE_LICENSE("GPL");
0205 
0206 module_init(init_bm);
0207 module_exit(exit_bm);