#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/rbtree.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>

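/*
 * Per-set private data: the rb-tree root, an rwlock that serialises
 * updates and the slow read path, a seqcount that lets the lockless
 * datapath lookup detect concurrent writers and retry, and delayed work
 * for garbage collection of timed-out elements.
 */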
struct nft_rbtree {
	struct rb_root		root;
	rwlock_t		lock;
	seqcount_rwlock_t	count;
	struct delayed_work	gc_work;
};

struct nft_rbtree_elem {
	struct rb_node		node;
	struct nft_set_ext	ext;
};

static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe)
{
	return nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) &&
	       (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END);
}

static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe)
{
	return !nft_rbtree_interval_end(rbe);
}

static bool nft_rbtree_equal(const struct nft_set *set, const void *this,
			     const struct nft_rbtree_elem *interval)
{
	return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0;
}

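/*
 * Lockless descent used by the datapath. @seq is the seqcount snapshot
 * taken by the caller; if a concurrent writer is detected, the walk bails
 * out so the caller can retry under the read lock. For interval sets, a
 * candidate element is remembered in @interval and, if it turns out to be
 * an active start element, used as the match when no exact key is found.
 */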
static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
				const u32 *key, const struct nft_set_ext **ext,
				unsigned int seq)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	const struct nft_rbtree_elem *rbe, *interval = NULL;
	u8 genmask = nft_genmask_cur(net);
	const struct rb_node *parent;
	const void *this;
	int d;

	parent = rcu_dereference_raw(priv->root.rb_node);
	while (parent != NULL) {
		if (read_seqcount_retry(&priv->count, seq))
			return false;

		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		this = nft_set_ext_key(&rbe->ext);
		d = memcmp(this, key, set->klen);
		if (d < 0) {
			parent = rcu_dereference_raw(parent->rb_left);
			if (interval &&
			    nft_rbtree_equal(set, this, interval) &&
			    nft_rbtree_interval_end(rbe) &&
			    nft_rbtree_interval_start(interval))
				continue;
			interval = rbe;
		} else if (d > 0)
			parent = rcu_dereference_raw(parent->rb_right);
		else {
			if (!nft_set_elem_active(&rbe->ext, genmask)) {
				parent = rcu_dereference_raw(parent->rb_left);
				continue;
			}

			if (nft_set_elem_expired(&rbe->ext))
				return false;

			if (nft_rbtree_interval_end(rbe)) {
				if (nft_set_is_anonymous(set))
					return false;
				parent = rcu_dereference_raw(parent->rb_left);
				interval = NULL;
				continue;
			}

			*ext = &rbe->ext;
			return true;
		}
	}

	if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
	    nft_set_elem_active(&interval->ext, genmask) &&
	    !nft_set_elem_expired(&interval->ext) &&
	    nft_rbtree_interval_start(interval)) {
		*ext = &interval->ext;
		return true;
	}

	return false;
}

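/*
 * Datapath lookup: try a lockless walk first and only fall back to the
 * read lock (with bottom halves disabled) if the walk raced with a writer.
 */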
INDIRECT_CALLABLE_SCOPE
bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
		       const u32 *key, const struct nft_set_ext **ext)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	unsigned int seq = read_seqcount_begin(&priv->count);
	bool ret;

	ret = __nft_rbtree_lookup(net, set, key, ext, seq);
	if (ret || !read_seqcount_retry(&priv->count, seq))
		return ret;

	read_lock_bh(&priv->lock);
	seq = read_seqcount_begin(&priv->count);
	ret = __nft_rbtree_lookup(net, set, key, ext, seq);
	read_unlock_bh(&priv->lock);

	return ret;
}

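/*
 * Control-plane counterpart of __nft_rbtree_lookup(): returns the matching
 * element itself and honours NFT_SET_ELEM_INTERVAL_END in @flags so that
 * either endpoint of an interval can be retrieved.
 */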
static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,
			     const u32 *key, struct nft_rbtree_elem **elem,
			     unsigned int seq, unsigned int flags, u8 genmask)
{
	struct nft_rbtree_elem *rbe, *interval = NULL;
	struct nft_rbtree *priv = nft_set_priv(set);
	const struct rb_node *parent;
	const void *this;
	int d;

	parent = rcu_dereference_raw(priv->root.rb_node);
	while (parent != NULL) {
		if (read_seqcount_retry(&priv->count, seq))
			return false;

		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		this = nft_set_ext_key(&rbe->ext);
		d = memcmp(this, key, set->klen);
		if (d < 0) {
			parent = rcu_dereference_raw(parent->rb_left);
			if (!(flags & NFT_SET_ELEM_INTERVAL_END))
				interval = rbe;
		} else if (d > 0) {
			parent = rcu_dereference_raw(parent->rb_right);
			if (flags & NFT_SET_ELEM_INTERVAL_END)
				interval = rbe;
		} else {
			if (!nft_set_elem_active(&rbe->ext, genmask)) {
				parent = rcu_dereference_raw(parent->rb_left);
				continue;
			}

			if (nft_set_elem_expired(&rbe->ext))
				return false;

			if (!nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) ||
			    (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END) ==
			    (flags & NFT_SET_ELEM_INTERVAL_END)) {
				*elem = rbe;
				return true;
			}

			if (nft_rbtree_interval_end(rbe))
				interval = NULL;

			parent = rcu_dereference_raw(parent->rb_left);
		}
	}

	if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
	    nft_set_elem_active(&interval->ext, genmask) &&
	    !nft_set_elem_expired(&interval->ext) &&
	    ((!nft_rbtree_interval_end(interval) &&
	      !(flags & NFT_SET_ELEM_INTERVAL_END)) ||
	     (nft_rbtree_interval_end(interval) &&
	      (flags & NFT_SET_ELEM_INTERVAL_END)))) {
		*elem = interval;
		return true;
	}

	return false;
}

static void *nft_rbtree_get(const struct net *net, const struct nft_set *set,
			    const struct nft_set_elem *elem, unsigned int flags)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	unsigned int seq = read_seqcount_begin(&priv->count);
	struct nft_rbtree_elem *rbe = ERR_PTR(-ENOENT);
	const u32 *key = (const u32 *)&elem->key.val;
	u8 genmask = nft_genmask_cur(net);
	bool ret;

	ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
	if (ret || !read_seqcount_retry(&priv->count, seq))
		return rbe;

	read_lock_bh(&priv->lock);
	seq = read_seqcount_begin(&priv->count);
	ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
	if (!ret)
		rbe = ERR_PTR(-ENOENT);
	read_unlock_bh(&priv->lock);

	return rbe;
}

static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
			       struct nft_rbtree_elem *new,
			       struct nft_set_ext **ext)
{
	bool overlap = false, dup_end_left = false, dup_end_right = false;
	struct nft_rbtree *priv = nft_set_priv(set);
	u8 genmask = nft_genmask_next(net);
	struct nft_rbtree_elem *rbe;
	struct rb_node *parent, **p;
	int d;

	/*
	 * Walk down to the insertion point while checking the new element
	 * against existing ones for partial overlaps and duplicates. Only
	 * elements that are active in the next generation and not yet
	 * expired take part in the checks. An exact duplicate of such an
	 * element of the same kind fails with -EEXIST; a detected overlap,
	 * including a duplicate end element tracked via dup_end_left and
	 * dup_end_right, fails with -ENOTEMPTY.
	 */
	parent = NULL;
	p = &priv->root.rb_node;
	while (*p != NULL) {
		parent = *p;
		rbe = rb_entry(parent, struct nft_rbtree_elem, node);
		d = memcmp(nft_set_ext_key(&rbe->ext),
			   nft_set_ext_key(&new->ext),
			   set->klen);
		if (d < 0) {
			p = &parent->rb_left;

			if (nft_rbtree_interval_start(new)) {
				if (nft_rbtree_interval_end(rbe) &&
				    nft_set_elem_active(&rbe->ext, genmask) &&
				    !nft_set_elem_expired(&rbe->ext) && !*p)
					overlap = false;
			} else {
				if (dup_end_left && !*p)
					return -ENOTEMPTY;

				overlap = nft_rbtree_interval_end(rbe) &&
					  nft_set_elem_active(&rbe->ext,
							      genmask) &&
					  !nft_set_elem_expired(&rbe->ext);

				if (overlap) {
					dup_end_right = true;
					continue;
				}
			}
		} else if (d > 0) {
			p = &parent->rb_right;

			if (nft_rbtree_interval_end(new)) {
				if (dup_end_right && !*p)
					return -ENOTEMPTY;

				overlap = nft_rbtree_interval_end(rbe) &&
					  nft_set_elem_active(&rbe->ext,
							      genmask) &&
					  !nft_set_elem_expired(&rbe->ext);

				if (overlap) {
					dup_end_left = true;
					continue;
				}
			} else if (nft_set_elem_active(&rbe->ext, genmask) &&
				   !nft_set_elem_expired(&rbe->ext)) {
				overlap = nft_rbtree_interval_end(rbe);
			}
		} else {
			if (nft_rbtree_interval_end(rbe) &&
			    nft_rbtree_interval_start(new)) {
				p = &parent->rb_left;

				if (nft_set_elem_active(&rbe->ext, genmask) &&
				    !nft_set_elem_expired(&rbe->ext))
					overlap = false;
			} else if (nft_rbtree_interval_start(rbe) &&
				   nft_rbtree_interval_end(new)) {
				p = &parent->rb_right;

				if (nft_set_elem_active(&rbe->ext, genmask) &&
				    !nft_set_elem_expired(&rbe->ext))
					overlap = false;
			} else if (nft_set_elem_active(&rbe->ext, genmask) &&
				   !nft_set_elem_expired(&rbe->ext)) {
				*ext = &rbe->ext;
				return -EEXIST;
			} else {
				overlap = false;
				if (nft_rbtree_interval_end(rbe))
					p = &parent->rb_left;
				else
					p = &parent->rb_right;
			}
		}

		dup_end_left = dup_end_right = false;
	}

	if (overlap)
		return -ENOTEMPTY;

	rb_link_node_rcu(&new->node, parent, p);
	rb_insert_color(&new->node, &priv->root);
	return 0;
}

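/*
 * Insertion runs under the write lock and inside the seqcount write
 * section, so concurrent lockless lookups either see a consistent tree or
 * notice the update and retry on the locked path.
 */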
static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
			     const struct nft_set_elem *elem,
			     struct nft_set_ext **ext)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *rbe = elem->priv;
	int err;

	write_lock_bh(&priv->lock);
	write_seqcount_begin(&priv->count);
	err = __nft_rbtree_insert(net, set, rbe, ext);
	write_seqcount_end(&priv->count);
	write_unlock_bh(&priv->lock);

	return err;
}

static void nft_rbtree_remove(const struct net *net,
			      const struct nft_set *set,
			      const struct nft_set_elem *elem)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *rbe = elem->priv;

	write_lock_bh(&priv->lock);
	write_seqcount_begin(&priv->count);
	rb_erase(&rbe->node, &priv->root);
	write_seqcount_end(&priv->count);
	write_unlock_bh(&priv->lock);
}

static void nft_rbtree_activate(const struct net *net,
				const struct nft_set *set,
				const struct nft_set_elem *elem)
{
	struct nft_rbtree_elem *rbe = elem->priv;

	nft_set_elem_change_active(net, set, &rbe->ext);
	nft_set_elem_clear_busy(&rbe->ext);
}

static bool nft_rbtree_flush(const struct net *net,
			     const struct nft_set *set, void *priv)
{
	struct nft_rbtree_elem *rbe = priv;

	if (!nft_set_elem_mark_busy(&rbe->ext) ||
	    !nft_is_active(net, &rbe->ext)) {
		nft_set_elem_change_active(net, set, &rbe->ext);
		return true;
	}
	return false;
}

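/*
 * Find the element matching @elem's key and endpoint type that is still
 * active in the next generation, deactivate it via nft_rbtree_flush() and
 * return it, so the removal takes effect at commit time.
 */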
static void *nft_rbtree_deactivate(const struct net *net,
				   const struct nft_set *set,
				   const struct nft_set_elem *elem)
{
	const struct nft_rbtree *priv = nft_set_priv(set);
	const struct rb_node *parent = priv->root.rb_node;
	struct nft_rbtree_elem *rbe, *this = elem->priv;
	u8 genmask = nft_genmask_next(net);
	int d;

	while (parent != NULL) {
		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		d = memcmp(nft_set_ext_key(&rbe->ext), &elem->key.val,
			   set->klen);
		if (d < 0)
			parent = parent->rb_left;
		else if (d > 0)
			parent = parent->rb_right;
		else {
			if (nft_rbtree_interval_end(rbe) &&
			    nft_rbtree_interval_start(this)) {
				parent = parent->rb_left;
				continue;
			} else if (nft_rbtree_interval_start(rbe) &&
				   nft_rbtree_interval_end(this)) {
				parent = parent->rb_right;
				continue;
			} else if (!nft_set_elem_active(&rbe->ext, genmask)) {
				parent = parent->rb_left;
				continue;
			}
			nft_rbtree_flush(net, set, rbe);
			return rbe;
		}
	}
	return NULL;
}

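/*
 * Iterate over all active, unexpired elements under the read lock,
 * honouring iter->skip and stopping early if the callback returns an
 * error.
 */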
static void nft_rbtree_walk(const struct nft_ctx *ctx,
			    struct nft_set *set,
			    struct nft_set_iter *iter)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *rbe;
	struct nft_set_elem elem;
	struct rb_node *node;

	read_lock_bh(&priv->lock);
	for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
		rbe = rb_entry(node, struct nft_rbtree_elem, node);

		if (iter->count < iter->skip)
			goto cont;
		if (nft_set_elem_expired(&rbe->ext))
			goto cont;
		if (!nft_set_elem_active(&rbe->ext, iter->genmask))
			goto cont;

		elem.priv = rbe;

		iter->err = iter->fn(ctx, set, iter, &elem);
		if (iter->err < 0) {
			read_unlock_bh(&priv->lock);
			return;
		}
cont:
		iter->count++;
	}
	read_unlock_bh(&priv->lock);
}

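/*
 * Periodic garbage collection: under the write lock, collect expired and
 * non-busy start elements together with their interval end elements, erase
 * them from the tree and queue them on a GC batch for deferred destruction,
 * then reschedule at the set's GC interval.
 */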
static void nft_rbtree_gc(struct work_struct *work)
{
	struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL;
	struct nft_set_gc_batch *gcb = NULL;
	struct nft_rbtree *priv;
	struct rb_node *node;
	struct nft_set *set;

	priv = container_of(work, struct nft_rbtree, gc_work.work);
	set  = nft_set_container_of(priv);

	write_lock_bh(&priv->lock);
	write_seqcount_begin(&priv->count);
	for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
		rbe = rb_entry(node, struct nft_rbtree_elem, node);

		if (nft_rbtree_interval_end(rbe)) {
			rbe_end = rbe;
			continue;
		}
		if (!nft_set_elem_expired(&rbe->ext))
			continue;
		if (nft_set_elem_mark_busy(&rbe->ext))
			continue;

		if (rbe_prev) {
			rb_erase(&rbe_prev->node, &priv->root);
			rbe_prev = NULL;
		}
		gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
		if (!gcb)
			break;

		atomic_dec(&set->nelems);
		nft_set_gc_batch_add(gcb, rbe);
		rbe_prev = rbe;

		if (rbe_end) {
			atomic_dec(&set->nelems);
			nft_set_gc_batch_add(gcb, rbe_end);
			rb_erase(&rbe_end->node, &priv->root);
			rbe_end = NULL;
		}
		node = rb_next(node);
		if (!node)
			break;
	}
	if (rbe_prev)
		rb_erase(&rbe_prev->node, &priv->root);
	write_seqcount_end(&priv->count);
	write_unlock_bh(&priv->lock);

	rbe = nft_set_catchall_gc(set);
	if (rbe) {
		gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
		if (gcb)
			nft_set_gc_batch_add(gcb, rbe);
	}
	nft_set_gc_batch_complete(gcb);

	queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
			   nft_set_gc_interval(set));
}

static u64 nft_rbtree_privsize(const struct nlattr * const nla[],
			       const struct nft_set_desc *desc)
{
	return sizeof(struct nft_rbtree);
}

static int nft_rbtree_init(const struct nft_set *set,
			   const struct nft_set_desc *desc,
			   const struct nlattr * const nla[])
{
	struct nft_rbtree *priv = nft_set_priv(set);

	rwlock_init(&priv->lock);
	seqcount_rwlock_init(&priv->count, &priv->lock);
	priv->root = RB_ROOT;

	INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rbtree_gc);
	if (set->flags & NFT_SET_TIMEOUT)
		queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
				   nft_set_gc_interval(set));

	return 0;
}

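/*
 * Set teardown: stop the GC worker, wait for pending RCU callbacks, then
 * erase and free every remaining element.
 */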
static void nft_rbtree_destroy(const struct nft_set *set)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *rbe;
	struct rb_node *node;

	cancel_delayed_work_sync(&priv->gc_work);
	rcu_barrier();
	while ((node = priv->root.rb_node) != NULL) {
		rb_erase(node, &priv->root);
		rbe = rb_entry(node, struct nft_rbtree_elem, node);
		nft_set_elem_destroy(set, rbe, true);
	}
}

static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
				struct nft_set_estimate *est)
{
	if (desc->field_count > 1)
		return false;

	if (desc->size)
		est->size = sizeof(struct nft_rbtree) +
			    desc->size * sizeof(struct nft_rbtree_elem);
	else
		est->size = ~0;

	est->lookup = NFT_SET_CLASS_O_LOG_N;
	est->space  = NFT_SET_CLASS_O_N;

	return true;
}

const struct nft_set_type nft_set_rbtree_type = {
	.features	= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
	.ops		= {
		.privsize	= nft_rbtree_privsize,
		.elemsize	= offsetof(struct nft_rbtree_elem, ext),
		.estimate	= nft_rbtree_estimate,
		.init		= nft_rbtree_init,
		.destroy	= nft_rbtree_destroy,
		.insert		= nft_rbtree_insert,
		.remove		= nft_rbtree_remove,
		.deactivate	= nft_rbtree_deactivate,
		.flush		= nft_rbtree_flush,
		.activate	= nft_rbtree_activate,
		.lookup		= nft_rbtree_lookup,
		.walk		= nft_rbtree_walk,
		.get		= nft_rbtree_get,
	},
};