Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  * xfrm_policy.c
0004  *
0005  * Changes:
0006  *  Mitsuru KANDA @USAGI
0007  *  Kazunori MIYAZAWA @USAGI
0008  *  Kunihiro Ishiguro <kunihiro@ipinfusion.com>
0009  *      IPv6 support
0010  *  Kazunori MIYAZAWA @USAGI
0011  *  YOSHIFUJI Hideaki
0012  *      Split up af-specific portion
0013  *  Derek Atkins <derek@ihtfp.com>      Add the post_input processor
0014  *
0015  */
0016 
0017 #include <linux/err.h>
0018 #include <linux/slab.h>
0019 #include <linux/kmod.h>
0020 #include <linux/list.h>
0021 #include <linux/spinlock.h>
0022 #include <linux/workqueue.h>
0023 #include <linux/notifier.h>
0024 #include <linux/netdevice.h>
0025 #include <linux/netfilter.h>
0026 #include <linux/module.h>
0027 #include <linux/cache.h>
0028 #include <linux/cpu.h>
0029 #include <linux/audit.h>
0030 #include <linux/rhashtable.h>
0031 #include <linux/if_tunnel.h>
0032 #include <net/dst.h>
0033 #include <net/flow.h>
0034 #include <net/inet_ecn.h>
0035 #include <net/xfrm.h>
0036 #include <net/ip.h>
0037 #include <net/gre.h>
0038 #if IS_ENABLED(CONFIG_IPV6_MIP6)
0039 #include <net/mip6.h>
0040 #endif
0041 #ifdef CONFIG_XFRM_STATISTICS
0042 #include <net/snmp.h>
0043 #endif
0044 #ifdef CONFIG_XFRM_ESPINTCP
0045 #include <net/espintcp.h>
0046 #endif
0047 
0048 #include "xfrm_hash.h"
0049 
0050 #define XFRM_QUEUE_TMO_MIN ((unsigned)(HZ/10))
0051 #define XFRM_QUEUE_TMO_MAX ((unsigned)(60*HZ))
0052 #define XFRM_MAX_QUEUE_LEN  100
0053 
/* Bundles the caller's original route with per-lookup flags so both can
 * be handed through the bundle-resolution helpers as a single argument.
 */
struct xfrm_flo {
    struct dst_entry *dst_orig; /* route the caller started from */
    u8 flags;                   /* lookup behavior flags; semantics defined by users elsewhere in this file */
};
0058 
0059 /* prefixes smaller than this are stored in lists, not trees. */
0060 #define INEXACT_PREFIXLEN_IPV4  16
0061 #define INEXACT_PREFIXLEN_IPV6  48
0062 
/* One node of the inexact policy search trees (see the diagram below). */
struct xfrm_pol_inexact_node {
    struct rb_node node;
    union {
        xfrm_address_t addr;    /* sort key while the node is live */
        struct rcu_head rcu;    /* reused for deferred free after unlink */
    };
    u8 prefixlen;               /* significant bits of addr for comparisons */

    /* optional second-level tree (daddr nodes carry a saddr subtree) */
    struct rb_root root;

    /* the policies matching this node, can be empty list */
    struct hlist_head hhead;
};
0076 
0077 /* xfrm inexact policy search tree:
0078  * xfrm_pol_inexact_bin = hash(dir,type,family,if_id);
0079  *  |
0080  * +---- root_d: sorted by daddr:prefix
0081  * |                 |
0082  * |        xfrm_pol_inexact_node
0083  * |                 |
0084  * |                 +- root: sorted by saddr/prefix
0085  * |                 |              |
0086  * |                 |         xfrm_pol_inexact_node
0087  * |                 |              |
0088  * |                 |              + root: unused
0089  * |                 |              |
0090  * |                 |              + hhead: saddr:daddr policies
0091  * |                 |
0092  * |                 +- coarse policies and all any:daddr policies
0093  * |
0094  * +---- root_s: sorted by saddr:prefix
0095  * |                 |
0096  * |        xfrm_pol_inexact_node
0097  * |                 |
0098  * |                 + root: unused
0099  * |                 |
0100  * |                 + hhead: saddr:any policies
0101  * |
0102  * +---- coarse policies and all any:any policies
0103  *
0104  * Lookups return four candidate lists:
0105  * 1. any:any list from top-level xfrm_pol_inexact_bin
0106  * 2. any:daddr list from daddr tree
0107  * 3. saddr:daddr list from 2nd level daddr tree
0108  * 4. saddr:any list from saddr tree
0109  *
0110  * This result set then needs to be searched for the policy with
0111  * the lowest priority.  If two results have same prio, youngest one wins.
0112  */
0113 
/* Hash key identifying one inexact bin:
 * (netns, interface id, address family, direction, policy type).
 */
struct xfrm_pol_inexact_key {
    possible_net_t net;
    u32 if_id;
    u16 family;
    u8 dir, type;
};
0120 
/* Top-level container for all inexact policies sharing one key; lives in
 * the global xfrm_policy_inexact_table rhashtable.
 */
struct xfrm_pol_inexact_bin {
    struct xfrm_pol_inexact_key k;
    struct rhash_head head;     /* linkage in xfrm_policy_inexact_table */
    /* list containing '*:*' policies */
    struct hlist_head hhead;

    /* bumped on tree mutation so lockless readers can retry */
    seqcount_spinlock_t count;
    /* tree sorted by daddr/prefix */
    struct rb_root root_d;

    /* tree sorted by saddr/prefix */
    struct rb_root root_s;

    /* slow path below */
    struct list_head inexact_bins;  /* per-netns list of all bins */
    struct rcu_head rcu;            /* deferred free */
};
0138 
/* Slots of the candidate-list array returned by an inexact lookup.
 * The names mirror which selector halves matched; presumably they map
 * 1:1 onto the four lists described in the diagram above.
 */
enum xfrm_pol_inexact_candidate_type {
    XFRM_POL_CAND_BOTH,
    XFRM_POL_CAND_SADDR,
    XFRM_POL_CAND_DADDR,
    XFRM_POL_CAND_ANY,

    XFRM_POL_CAND_MAX,  /* array size, not a real slot */
};
0147 
/* Result of xfrm_policy_find_inexact_candidates(): one policy list per
 * candidate type; slots with no matching list may be left unset.
 */
struct xfrm_pol_inexact_candidates {
    struct hlist_head *res[XFRM_POL_CAND_MAX];
};
0151 
0152 static DEFINE_SPINLOCK(xfrm_if_cb_lock);
0153 static struct xfrm_if_cb const __rcu *xfrm_if_cb __read_mostly;
0154 
0155 static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock);
0156 static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1]
0157                         __read_mostly;
0158 
0159 static struct kmem_cache *xfrm_dst_cache __ro_after_init;
0160 
0161 static struct rhashtable xfrm_policy_inexact_table;
0162 static const struct rhashtable_params xfrm_pol_inexact_params;
0163 
0164 static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr);
0165 static int stale_bundle(struct dst_entry *dst);
0166 static int xfrm_bundle_ok(struct xfrm_dst *xdst);
0167 static void xfrm_policy_queue_process(struct timer_list *t);
0168 
0169 static void __xfrm_policy_link(struct xfrm_policy *pol, int dir);
0170 static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
0171                         int dir);
0172 
0173 static struct xfrm_pol_inexact_bin *
0174 xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family, u8 dir,
0175                u32 if_id);
0176 
0177 static struct xfrm_pol_inexact_bin *
0178 xfrm_policy_inexact_lookup_rcu(struct net *net,
0179                    u8 type, u16 family, u8 dir, u32 if_id);
0180 static struct xfrm_policy *
0181 xfrm_policy_insert_list(struct hlist_head *chain, struct xfrm_policy *policy,
0182             bool excl);
0183 static void xfrm_policy_insert_inexact_list(struct hlist_head *chain,
0184                         struct xfrm_policy *policy);
0185 
0186 static bool
0187 xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand,
0188                     struct xfrm_pol_inexact_bin *b,
0189                     const xfrm_address_t *saddr,
0190                     const xfrm_address_t *daddr);
0191 
0192 static inline bool xfrm_pol_hold_rcu(struct xfrm_policy *policy)
0193 {
0194     return refcount_inc_not_zero(&policy->refcnt);
0195 }
0196 
0197 static inline bool
0198 __xfrm4_selector_match(const struct xfrm_selector *sel, const struct flowi *fl)
0199 {
0200     const struct flowi4 *fl4 = &fl->u.ip4;
0201 
0202     return  addr4_match(fl4->daddr, sel->daddr.a4, sel->prefixlen_d) &&
0203         addr4_match(fl4->saddr, sel->saddr.a4, sel->prefixlen_s) &&
0204         !((xfrm_flowi_dport(fl, &fl4->uli) ^ sel->dport) & sel->dport_mask) &&
0205         !((xfrm_flowi_sport(fl, &fl4->uli) ^ sel->sport) & sel->sport_mask) &&
0206         (fl4->flowi4_proto == sel->proto || !sel->proto) &&
0207         (fl4->flowi4_oif == sel->ifindex || !sel->ifindex);
0208 }
0209 
0210 static inline bool
0211 __xfrm6_selector_match(const struct xfrm_selector *sel, const struct flowi *fl)
0212 {
0213     const struct flowi6 *fl6 = &fl->u.ip6;
0214 
0215     return  addr_match(&fl6->daddr, &sel->daddr, sel->prefixlen_d) &&
0216         addr_match(&fl6->saddr, &sel->saddr, sel->prefixlen_s) &&
0217         !((xfrm_flowi_dport(fl, &fl6->uli) ^ sel->dport) & sel->dport_mask) &&
0218         !((xfrm_flowi_sport(fl, &fl6->uli) ^ sel->sport) & sel->sport_mask) &&
0219         (fl6->flowi6_proto == sel->proto || !sel->proto) &&
0220         (fl6->flowi6_oif == sel->ifindex || !sel->ifindex);
0221 }
0222 
0223 bool xfrm_selector_match(const struct xfrm_selector *sel, const struct flowi *fl,
0224              unsigned short family)
0225 {
0226     switch (family) {
0227     case AF_INET:
0228         return __xfrm4_selector_match(sel, fl);
0229     case AF_INET6:
0230         return __xfrm6_selector_match(sel, fl);
0231     }
0232     return false;
0233 }
0234 
/* Look up the per-family policy afinfo ops.
 *
 * On success the pointer is returned with rcu_read_lock() HELD; the
 * caller must drop it when done (see __xfrm_dst_lookup()).  On failure
 * (out-of-range or unregistered family) NULL is returned and the RCU
 * read lock is not held.
 */
static const struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family)
{
    const struct xfrm_policy_afinfo *afinfo;

    /* table is sized AF_INET6 + 1; reject anything beyond it */
    if (unlikely(family >= ARRAY_SIZE(xfrm_policy_afinfo)))
        return NULL;
    rcu_read_lock();
    afinfo = rcu_dereference(xfrm_policy_afinfo[family]);
    if (unlikely(!afinfo))
        rcu_read_unlock(); /* only unlock on failure; success keeps it */
    return afinfo;
}
0247 
0248 /* Called with rcu_read_lock(). */
0249 static const struct xfrm_if_cb *xfrm_if_get_cb(void)
0250 {
0251     return rcu_dereference(xfrm_if_cb);
0252 }
0253 
/* Resolve a route from @saddr to @daddr through the address family's
 * dst_lookup() callback.
 *
 * Returns the dst_entry (possibly an ERR_PTR from the callback) or
 * ERR_PTR(-EAFNOSUPPORT) when the family has no registered afinfo.
 * The rcu_read_unlock() below pairs with the rcu_read_lock() taken on
 * success inside xfrm_policy_get_afinfo().
 */
struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif,
                    const xfrm_address_t *saddr,
                    const xfrm_address_t *daddr,
                    int family, u32 mark)
{
    const struct xfrm_policy_afinfo *afinfo;
    struct dst_entry *dst;

    afinfo = xfrm_policy_get_afinfo(family);
    if (unlikely(afinfo == NULL))
        return ERR_PTR(-EAFNOSUPPORT);

    dst = afinfo->dst_lookup(net, tos, oif, saddr, daddr, mark);

    rcu_read_unlock();

    return dst;
}
EXPORT_SYMBOL(__xfrm_dst_lookup);
0273 
/* Route lookup for one state in a bundle.  By default the state's own
 * (props.saddr -> id.daddr) pair is used; the COADDR flags substitute
 * the state's care-of address on one side (presumably for MIPv6-style
 * types — the exact semantics live in the xfrm type implementations).
 * On success, the addresses actually used are copied back into
 * @prev_saddr/@prev_daddr so the next hop of the bundle continues from
 * them.
 */
static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x,
                        int tos, int oif,
                        xfrm_address_t *prev_saddr,
                        xfrm_address_t *prev_daddr,
                        int family, u32 mark)
{
    struct net *net = xs_net(x);
    xfrm_address_t *saddr = &x->props.saddr;
    xfrm_address_t *daddr = &x->id.daddr;
    struct dst_entry *dst;

    if (x->type->flags & XFRM_TYPE_LOCAL_COADDR) {
        saddr = x->coaddr;
        daddr = prev_daddr;
    }
    if (x->type->flags & XFRM_TYPE_REMOTE_COADDR) {
        saddr = prev_saddr;
        daddr = x->coaddr;
    }

    dst = __xfrm_dst_lookup(net, tos, oif, saddr, daddr, family, mark);

    if (!IS_ERR(dst)) {
        /* propagate the substituted endpoints to the caller */
        if (prev_saddr != saddr)
            memcpy(prev_saddr, saddr,  sizeof(*prev_saddr));
        if (prev_daddr != daddr)
            memcpy(prev_daddr, daddr,  sizeof(*prev_daddr));
    }

    return dst;
}
0305 
0306 static inline unsigned long make_jiffies(long secs)
0307 {
0308     if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ)
0309         return MAX_SCHEDULE_TIMEOUT-1;
0310     else
0311         return secs*HZ;
0312 }
0313 
/* Per-policy lifetime timer.  Checks the soft and hard add/use expiry
 * limits: a passed soft limit notifies the key managers and re-arms the
 * timer; a passed hard limit deletes the policy.  An armed timer holds
 * a policy reference which this handler consumes (final xfrm_pol_put).
 */
static void xfrm_policy_timer(struct timer_list *t)
{
    struct xfrm_policy *xp = from_timer(xp, t, timer);
    time64_t now = ktime_get_real_seconds();
    time64_t next = TIME64_MAX; /* earliest future check, if any */
    int warn = 0;               /* set when a soft limit has passed */
    int dir;

    read_lock(&xp->lock);

    /* policy already unlinked; just drop the timer's reference */
    if (unlikely(xp->walk.dead))
        goto out;

    dir = xfrm_policy_id2dir(xp->index);

    if (xp->lft.hard_add_expires_seconds) {
        time64_t tmo = xp->lft.hard_add_expires_seconds +
            xp->curlft.add_time - now;
        if (tmo <= 0)
            goto expired;
        if (tmo < next)
            next = tmo;
    }
    if (xp->lft.hard_use_expires_seconds) {
        /* use_time is 0 until first use; fall back to add_time */
        time64_t tmo = xp->lft.hard_use_expires_seconds +
            (xp->curlft.use_time ? : xp->curlft.add_time) - now;
        if (tmo <= 0)
            goto expired;
        if (tmo < next)
            next = tmo;
    }
    if (xp->lft.soft_add_expires_seconds) {
        time64_t tmo = xp->lft.soft_add_expires_seconds +
            xp->curlft.add_time - now;
        if (tmo <= 0) {
            warn = 1;
            /* re-check after the key-manager grace period */
            tmo = XFRM_KM_TIMEOUT;
        }
        if (tmo < next)
            next = tmo;
    }
    if (xp->lft.soft_use_expires_seconds) {
        time64_t tmo = xp->lft.soft_use_expires_seconds +
            (xp->curlft.use_time ? : xp->curlft.add_time) - now;
        if (tmo <= 0) {
            warn = 1;
            tmo = XFRM_KM_TIMEOUT;
        }
        if (tmo < next)
            next = tmo;
    }

    if (warn)
        km_policy_expired(xp, dir, 0, 0); /* soft expiry notification */
    /* Re-arm: mod_timer() returns 0 when the timer was inactive, in
     * which case the newly pending timer needs its own reference.
     */
    if (next != TIME64_MAX &&
        !mod_timer(&xp->timer, jiffies + make_jiffies(next)))
        xfrm_pol_hold(xp);

out:
    read_unlock(&xp->lock);
    xfrm_pol_put(xp); /* reference this expiry consumed */
    return;

expired:
    read_unlock(&xp->lock);
    if (!xfrm_policy_delete(xp, dir))
        km_policy_expired(xp, dir, 1, 0); /* hard expiry notification */
    xfrm_pol_put(xp);
}
0383 
0384 /* Allocate xfrm_policy. Not used here, it is supposed to be used by pfkeyv2
0385  * SPD calls.
0386  */
0387 
0388 struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp)
0389 {
0390     struct xfrm_policy *policy;
0391 
0392     policy = kzalloc(sizeof(struct xfrm_policy), gfp);
0393 
0394     if (policy) {
0395         write_pnet(&policy->xp_net, net);
0396         INIT_LIST_HEAD(&policy->walk.all);
0397         INIT_HLIST_NODE(&policy->bydst_inexact_list);
0398         INIT_HLIST_NODE(&policy->bydst);
0399         INIT_HLIST_NODE(&policy->byidx);
0400         rwlock_init(&policy->lock);
0401         refcount_set(&policy->refcnt, 1);
0402         skb_queue_head_init(&policy->polq.hold_queue);
0403         timer_setup(&policy->timer, xfrm_policy_timer, 0);
0404         timer_setup(&policy->polq.hold_timer,
0405                 xfrm_policy_queue_process, 0);
0406     }
0407     return policy;
0408 }
0409 EXPORT_SYMBOL(xfrm_policy_alloc);
0410 
0411 static void xfrm_policy_destroy_rcu(struct rcu_head *head)
0412 {
0413     struct xfrm_policy *policy = container_of(head, struct xfrm_policy, rcu);
0414 
0415     security_xfrm_policy_free(policy->security);
0416     kfree(policy);
0417 }
0418 
/* Destroy xfrm_policy: descendant resources must be released to this moment. */

void xfrm_policy_destroy(struct xfrm_policy *policy)
{
    /* only dead (unlinked) policies may be destroyed */
    BUG_ON(!policy->walk.dead);

    /* Pending timers hold references; if del_timer() finds one still
     * armed the refcounting is broken, so crash loudly instead of
     * risking a use-after-free.
     */
    if (del_timer(&policy->timer) || del_timer(&policy->polq.hold_timer))
        BUG();

    /* lockless readers may still hold the pointer; free after grace period */
    call_rcu(&policy->rcu, xfrm_policy_destroy_rcu);
}
EXPORT_SYMBOL(xfrm_policy_destroy);
0431 
/* Rule must be locked. Release descendant resources, announce
 * entry dead. The rule must be unlinked from lists to the moment.
 */

static void xfrm_policy_kill(struct xfrm_policy *policy)
{
    write_lock_bh(&policy->lock);
    policy->walk.dead = 1; /* makes the timer handler bail out */
    write_unlock_bh(&policy->lock);

    /* invalidate anything cached against the old generation */
    atomic_inc(&policy->genid);

    /* each armed timer holds a reference; drop it when we cancel one */
    if (del_timer(&policy->polq.hold_timer))
        xfrm_pol_put(policy);
    skb_queue_purge(&policy->polq.hold_queue);

    if (del_timer(&policy->timer))
        xfrm_pol_put(policy);

    /* drop the list's reference; the final put frees via RCU */
    xfrm_pol_put(policy);
}
0453 
0454 static unsigned int xfrm_policy_hashmax __read_mostly = 1 * 1024 * 1024;
0455 
0456 static inline unsigned int idx_hash(struct net *net, u32 index)
0457 {
0458     return __idx_hash(index, net->xfrm.policy_idx_hmask);
0459 }
0460 
0461 /* calculate policy hash thresholds */
0462 static void __get_hash_thresh(struct net *net,
0463                   unsigned short family, int dir,
0464                   u8 *dbits, u8 *sbits)
0465 {
0466     switch (family) {
0467     case AF_INET:
0468         *dbits = net->xfrm.policy_bydst[dir].dbits4;
0469         *sbits = net->xfrm.policy_bydst[dir].sbits4;
0470         break;
0471 
0472     case AF_INET6:
0473         *dbits = net->xfrm.policy_bydst[dir].dbits6;
0474         *sbits = net->xfrm.policy_bydst[dir].sbits6;
0475         break;
0476 
0477     default:
0478         *dbits = 0;
0479         *sbits = 0;
0480     }
0481 }
0482 
/* Return the bydst hash chain for selector @sel, or NULL when the
 * selector cannot be hashed (presumably because its prefixes are below
 * the per-direction thresholds) and the policy belongs to the inexact
 * lists/trees instead.
 */
static struct hlist_head *policy_hash_bysel(struct net *net,
                        const struct xfrm_selector *sel,
                        unsigned short family, int dir)
{
    unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
    unsigned int hash;
    u8 dbits;
    u8 sbits;

    __get_hash_thresh(net, family, dir, &dbits, &sbits);
    hash = __sel_hash(sel, family, hmask, dbits, sbits);

    /* __sel_hash() uses hmask + 1 as its "not hashable" sentinel */
    if (hash == hmask + 1)
        return NULL;

    return rcu_dereference_check(net->xfrm.policy_bydst[dir].table,
             lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash;
}
0501 
0502 static struct hlist_head *policy_hash_direct(struct net *net,
0503                          const xfrm_address_t *daddr,
0504                          const xfrm_address_t *saddr,
0505                          unsigned short family, int dir)
0506 {
0507     unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
0508     unsigned int hash;
0509     u8 dbits;
0510     u8 sbits;
0511 
0512     __get_hash_thresh(net, family, dir, &dbits, &sbits);
0513     hash = __addr_hash(daddr, saddr, family, hmask, dbits, sbits);
0514 
0515     return rcu_dereference_check(net->xfrm.policy_bydst[dir].table,
0516              lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash;
0517 }
0518 
/* Rehash every policy on @list into @ndsttable (mask @nhashmask),
 * preserving the relative order of policies that land in the same new
 * bucket.  The first policy moved in a pass fixes the target bucket
 * (h0); same-bucket followers are appended behind the previous one,
 * others are skipped and handled by another pass ("redo") until the
 * old chain is drained.
 */
static void xfrm_dst_hash_transfer(struct net *net,
                   struct hlist_head *list,
                   struct hlist_head *ndsttable,
                   unsigned int nhashmask,
                   int dir)
{
    struct hlist_node *tmp, *entry0 = NULL; /* last node moved this pass */
    struct xfrm_policy *pol;
    unsigned int h0 = 0;
    u8 dbits;
    u8 sbits;

redo:
    hlist_for_each_entry_safe(pol, tmp, list, bydst) {
        unsigned int h;

        __get_hash_thresh(net, pol->family, dir, &dbits, &sbits);
        h = __addr_hash(&pol->selector.daddr, &pol->selector.saddr,
                pol->family, nhashmask, dbits, sbits);
        if (!entry0) {
            /* first policy of this pass picks the bucket */
            hlist_del_rcu(&pol->bydst);
            hlist_add_head_rcu(&pol->bydst, ndsttable + h);
            h0 = h;
        } else {
            if (h != h0)
                continue; /* other bucket; later pass */
            hlist_del_rcu(&pol->bydst);
            hlist_add_behind_rcu(&pol->bydst, entry0);
        }
        entry0 = &pol->bydst;
    }
    if (!hlist_empty(list)) {
        entry0 = NULL;
        goto redo;
    }
}
0555 
/* Move every policy on @list into @nidxtable, rehashed by index.
 * hlist_add_head() rewrites the node's link pointers, so the _safe
 * iterator is needed even though nothing is explicitly deleted; the
 * caller discards the old table afterwards.
 */
static void xfrm_idx_hash_transfer(struct hlist_head *list,
                   struct hlist_head *nidxtable,
                   unsigned int nhashmask)
{
    struct hlist_node *tmp;
    struct xfrm_policy *pol;

    hlist_for_each_entry_safe(pol, tmp, list, byidx) {
        unsigned int h;

        h = __idx_hash(pol->index, nhashmask);
        hlist_add_head(&pol->byidx, nidxtable+h);
    }
}
0570 
/* Mask for a table of double the size: one more significant low bit,
 * i.e. 2 * old_hmask + 1.
 */
static unsigned long xfrm_new_hash_mask(unsigned int old_hmask)
{
    return (old_hmask << 1) | 1;
}
0575 
/* Grow the bydst hash table for @dir to double its size and rehash all
 * policies.  Writers are excluded by xfrm_policy_lock; lockless readers
 * observe the resize through the xfrm_policy_hash_generation seqcount
 * and retry.  The old table is freed only after synchronize_rcu(), so
 * in-flight RCU readers never touch freed buckets.
 */
static void xfrm_bydst_resize(struct net *net, int dir)
{
    unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
    unsigned int nhashmask = xfrm_new_hash_mask(hmask);
    unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head);
    struct hlist_head *ndst = xfrm_hash_alloc(nsize);
    struct hlist_head *odst;
    int i;

    /* allocation failure: keep the current table, resize another time */
    if (!ndst)
        return;

    spin_lock_bh(&net->xfrm.xfrm_policy_lock);
    write_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation);

    odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table,
                lockdep_is_held(&net->xfrm.xfrm_policy_lock));

    for (i = hmask; i >= 0; i--)
        xfrm_dst_hash_transfer(net, odst + i, ndst, nhashmask, dir);

    rcu_assign_pointer(net->xfrm.policy_bydst[dir].table, ndst);
    net->xfrm.policy_bydst[dir].hmask = nhashmask;

    write_seqcount_end(&net->xfrm.xfrm_policy_hash_generation);
    spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

    /* wait out readers still walking the old table */
    synchronize_rcu();

    xfrm_hash_free(odst, (hmask + 1) * sizeof(struct hlist_head));
}
0607 
/* Grow the byidx hash table to double its size under xfrm_policy_lock.
 * Note that @total only drove the caller's decision to resize; it is
 * not used here.  Allocation failure leaves the old table in place.
 */
static void xfrm_byidx_resize(struct net *net, int total)
{
    unsigned int hmask = net->xfrm.policy_idx_hmask;
    unsigned int nhashmask = xfrm_new_hash_mask(hmask);
    unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head);
    struct hlist_head *oidx = net->xfrm.policy_byidx;
    struct hlist_head *nidx = xfrm_hash_alloc(nsize);
    int i;

    if (!nidx)
        return;

    spin_lock_bh(&net->xfrm.xfrm_policy_lock);

    for (i = hmask; i >= 0; i--)
        xfrm_idx_hash_transfer(oidx + i, nidx, nhashmask);

    net->xfrm.policy_byidx = nidx;
    net->xfrm.policy_idx_hmask = nhashmask;

    spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

    xfrm_hash_free(oidx, (hmask + 1) * sizeof(struct hlist_head));
}
0632 
0633 static inline int xfrm_bydst_should_resize(struct net *net, int dir, int *total)
0634 {
0635     unsigned int cnt = net->xfrm.policy_count[dir];
0636     unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
0637 
0638     if (total)
0639         *total += cnt;
0640 
0641     if ((hmask + 1) < xfrm_policy_hashmax &&
0642         cnt > hmask)
0643         return 1;
0644 
0645     return 0;
0646 }
0647 
0648 static inline int xfrm_byidx_should_resize(struct net *net, int total)
0649 {
0650     unsigned int hmask = net->xfrm.policy_idx_hmask;
0651 
0652     if ((hmask + 1) < xfrm_policy_hashmax &&
0653         total > hmask)
0654         return 1;
0655 
0656     return 0;
0657 }
0658 
/* Report SPD statistics into @si: per-direction policy counts (the
 * *scnt variants read the slots at dir + XFRM_POLICY_MAX, presumably
 * the per-socket policies) plus the byidx hash geometry.
 */
void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si)
{
    si->incnt = net->xfrm.policy_count[XFRM_POLICY_IN];
    si->outcnt = net->xfrm.policy_count[XFRM_POLICY_OUT];
    si->fwdcnt = net->xfrm.policy_count[XFRM_POLICY_FWD];
    si->inscnt = net->xfrm.policy_count[XFRM_POLICY_IN+XFRM_POLICY_MAX];
    si->outscnt = net->xfrm.policy_count[XFRM_POLICY_OUT+XFRM_POLICY_MAX];
    si->fwdscnt = net->xfrm.policy_count[XFRM_POLICY_FWD+XFRM_POLICY_MAX];
    si->spdhcnt = net->xfrm.policy_idx_hmask;
    si->spdhmcnt = xfrm_policy_hashmax;
}
EXPORT_SYMBOL(xfrm_spd_getinfo);
0671 
0672 static DEFINE_MUTEX(hash_resize_mutex);
0673 static void xfrm_hash_resize(struct work_struct *work)
0674 {
0675     struct net *net = container_of(work, struct net, xfrm.policy_hash_work);
0676     int dir, total;
0677 
0678     mutex_lock(&hash_resize_mutex);
0679 
0680     total = 0;
0681     for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
0682         if (xfrm_bydst_should_resize(net, dir, &total))
0683             xfrm_bydst_resize(net, dir);
0684     }
0685     if (xfrm_byidx_should_resize(net, total))
0686         xfrm_byidx_resize(net, total);
0687 
0688     mutex_unlock(&hash_resize_mutex);
0689 }
0690 
/* Make sure *pol can be inserted into fastbin.
 * Useful to check that later insert requests will be successful
 * (provided xfrm_policy_lock is held throughout).
 *
 * Returns the existing or newly created bin for (family, type, dir,
 * if_id, net), or NULL on allocation/table error.
 */
static struct xfrm_pol_inexact_bin *
xfrm_policy_inexact_alloc_bin(const struct xfrm_policy *pol, u8 dir)
{
    struct xfrm_pol_inexact_bin *bin, *prev;
    struct xfrm_pol_inexact_key k = {
        .family = pol->family,
        .type = pol->type,
        .dir = dir,
        .if_id = pol->if_id,
    };
    struct net *net = xp_net(pol);

    lockdep_assert_held(&net->xfrm.xfrm_policy_lock);

    write_pnet(&k.net, net);
    /* fast path: bin already exists */
    bin = rhashtable_lookup_fast(&xfrm_policy_inexact_table, &k,
                     xfrm_pol_inexact_params);
    if (bin)
        return bin;

    /* GFP_ATOMIC: we are under the policy spinlock */
    bin = kzalloc(sizeof(*bin), GFP_ATOMIC);
    if (!bin)
        return NULL;

    bin->k = k;
    INIT_HLIST_HEAD(&bin->hhead);
    bin->root_d = RB_ROOT;
    bin->root_s = RB_ROOT;
    seqcount_spinlock_init(&bin->count, &net->xfrm.xfrm_policy_lock);

    /* atomically insert, or get back the bin someone inserted first */
    prev = rhashtable_lookup_get_insert_key(&xfrm_policy_inexact_table,
                        &bin->k, &bin->head,
                        xfrm_pol_inexact_params);
    if (!prev) {
        list_add(&bin->inexact_bins, &net->xfrm.inexact_bins);
        return bin;
    }

    kfree(bin);

    /* prev is the pre-existing bin, or an ERR_PTR on table failure */
    return IS_ERR(prev) ? NULL : prev;
}
0737 
0738 static bool xfrm_pol_inexact_addr_use_any_list(const xfrm_address_t *addr,
0739                            int family, u8 prefixlen)
0740 {
0741     if (xfrm_addr_any(addr, family))
0742         return true;
0743 
0744     if (family == AF_INET6 && prefixlen < INEXACT_PREFIXLEN_IPV6)
0745         return true;
0746 
0747     if (family == AF_INET && prefixlen < INEXACT_PREFIXLEN_IPV4)
0748         return true;
0749 
0750     return false;
0751 }
0752 
0753 static bool
0754 xfrm_policy_inexact_insert_use_any_list(const struct xfrm_policy *policy)
0755 {
0756     const xfrm_address_t *addr;
0757     bool saddr_any, daddr_any;
0758     u8 prefixlen;
0759 
0760     addr = &policy->selector.saddr;
0761     prefixlen = policy->selector.prefixlen_s;
0762 
0763     saddr_any = xfrm_pol_inexact_addr_use_any_list(addr,
0764                                policy->family,
0765                                prefixlen);
0766     addr = &policy->selector.daddr;
0767     prefixlen = policy->selector.prefixlen_d;
0768     daddr_any = xfrm_pol_inexact_addr_use_any_list(addr,
0769                                policy->family,
0770                                prefixlen);
0771     return saddr_any && daddr_any;
0772 }
0773 
0774 static void xfrm_pol_inexact_node_init(struct xfrm_pol_inexact_node *node,
0775                        const xfrm_address_t *addr, u8 prefixlen)
0776 {
0777     node->addr = *addr;
0778     node->prefixlen = prefixlen;
0779 }
0780 
0781 static struct xfrm_pol_inexact_node *
0782 xfrm_pol_inexact_node_alloc(const xfrm_address_t *addr, u8 prefixlen)
0783 {
0784     struct xfrm_pol_inexact_node *node;
0785 
0786     node = kzalloc(sizeof(*node), GFP_ATOMIC);
0787     if (node)
0788         xfrm_pol_inexact_node_init(node, addr, prefixlen);
0789 
0790     return node;
0791 }
0792 
/* memcmp()-like three-way comparison of @a and @b considering only the
 * leading @prefixlen bits.  Returns <0, 0 or >0; addresses of an
 * unhandled family always compare equal.  This is the sort key of the
 * inexact policy rb-trees.
 */
static int xfrm_policy_addr_delta(const xfrm_address_t *a,
                  const xfrm_address_t *b,
                  u8 prefixlen, u16 family)
{
    u32 ma, mb, mask;
    unsigned int pdw, pbi;
    int delta = 0;

    switch (family) {
    case AF_INET:
        if (prefixlen == 0)
            return 0; /* /0 matches everything */
        /* prefixlen is 1..32 here, so the shift is well-defined */
        mask = ~0U << (32 - prefixlen);
        ma = ntohl(a->a4) & mask;
        mb = ntohl(b->a4) & mask;
        if (ma < mb)
            delta = -1;
        else if (ma > mb)
            delta = 1;
        break;
    case AF_INET6:
        pdw = prefixlen >> 5;   /* whole 32-bit words covered */
        pbi = prefixlen & 0x1f; /* remaining bits in the next word */

        if (pdw) {
            delta = memcmp(a->a6, b->a6, pdw << 2);
            if (delta)
                return delta;
        }
        if (pbi) {
            mask = ~0U << (32 - pbi);
            ma = ntohl(a->a6[pdw]) & mask;
            mb = ntohl(b->a6[pdw]) & mask;
            if (ma < mb)
                delta = -1;
            else if (ma > mb)
                delta = 1;
        }
        break;
    default:
        break;
    }

    return delta;
}
0838 
/* Re-insert every policy flagged bydst_reinsert into @n's policy list,
 * sorted by ascending priority and, for equal priorities, ascending
 * position.  Walking policy_all in reverse combined with the insertion
 * scan reproduces the original relative ordering.
 */
static void xfrm_policy_inexact_list_reinsert(struct net *net,
                          struct xfrm_pol_inexact_node *n,
                          u16 family)
{
    unsigned int matched_s, matched_d;
    struct xfrm_policy *policy, *p;

    matched_s = 0;
    matched_d = 0;

    list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) {
        struct hlist_node *newpos = NULL;
        bool matches_s, matches_d;

        if (!policy->bydst_reinsert)
            continue;

        WARN_ON_ONCE(policy->family != family);

        policy->bydst_reinsert = false;
        /* find the last entry this policy must sort behind */
        hlist_for_each_entry(p, &n->hhead, bydst) {
            if (policy->priority > p->priority)
                newpos = &p->bydst;
            else if (policy->priority == p->priority &&
                 policy->pos > p->pos)
                newpos = &p->bydst;
            else
                break;
        }

        if (newpos)
            hlist_add_behind_rcu(&policy->bydst, newpos);
        else
            hlist_add_head_rcu(&policy->bydst, &n->hhead);

        /* paranoia checks follow.
         * Check that the reinserted policy matches at least
         * saddr or daddr for current node prefix.
         *
         * Matching both is fine, matching saddr in one policy
         * (but not daddr) and then matching only daddr in another
         * is a bug.
         */
        matches_s = xfrm_policy_addr_delta(&policy->selector.saddr,
                           &n->addr,
                           n->prefixlen,
                           family) == 0;
        matches_d = xfrm_policy_addr_delta(&policy->selector.daddr,
                           &n->addr,
                           n->prefixlen,
                           family) == 0;
        if (matches_s && matches_d)
            continue;

        WARN_ON_ONCE(!matches_s && !matches_d);
        if (matches_s)
            matched_s++;
        if (matches_d)
            matched_d++;
        WARN_ON_ONCE(matched_s && matched_d);
    }
}
0901 
/* Insert node @n into rb-tree @new, keyed by address/prefix.
 *
 * If an existing node compares equal under the shorter of the two
 * prefixes, the nodes are merged: the surviving node's prefixlen is
 * clamped to the common prefix, @n's policies are re-inserted into it
 * and @n is freed.  When the prefix lengths differed, clamping may have
 * changed the survivor's sort position, so it is unlinked and the
 * insertion restarts with it.
 */
static void xfrm_policy_inexact_node_reinsert(struct net *net,
                          struct xfrm_pol_inexact_node *n,
                          struct rb_root *new,
                          u16 family)
{
    struct xfrm_pol_inexact_node *node;
    struct rb_node **p, *parent;

    /* we should not have another subtree here */
    WARN_ON_ONCE(!RB_EMPTY_ROOT(&n->root));
restart:
    parent = NULL;
    p = &new->rb_node;
    while (*p) {
        u8 prefixlen;
        int delta;

        parent = *p;
        node = rb_entry(*p, struct xfrm_pol_inexact_node, node);

        /* compare on the shorter (coarser) of the two prefixes */
        prefixlen = min(node->prefixlen, n->prefixlen);

        delta = xfrm_policy_addr_delta(&n->addr, &node->addr,
                           prefixlen, family);
        if (delta < 0) {
            p = &parent->rb_left;
        } else if (delta > 0) {
            p = &parent->rb_right;
        } else {
            bool same_prefixlen = node->prefixlen == n->prefixlen;
            struct xfrm_policy *tmp;

            /* move @n's policies over to the existing node */
            hlist_for_each_entry(tmp, &n->hhead, bydst) {
                tmp->bydst_reinsert = true;
                hlist_del_rcu(&tmp->bydst);
            }

            node->prefixlen = prefixlen;

            xfrm_policy_inexact_list_reinsert(net, node, family);

            if (same_prefixlen) {
                kfree_rcu(n, rcu);
                return;
            }

            /* prefixlen changed: re-place the merged node */
            rb_erase(*p, new);
            kfree_rcu(n, rcu);
            n = node;
            goto restart;
        }
    }

    rb_link_node_rcu(&n->node, parent, p);
    rb_insert_color(&n->node, new);
}
0958 
/* Merge node @v into node @n.
 *
 * Every node in v's subtree is erased and re-inserted into n->root,
 * and the policies hanging off v->hhead are flagged for re-insertion
 * and moved over via xfrm_policy_inexact_list_reinsert().  The caller
 * is responsible for freeing @v afterwards (see
 * xfrm_policy_inexact_insert_node()).
 */
static void xfrm_policy_inexact_node_merge(struct net *net,
                       struct xfrm_pol_inexact_node *v,
                       struct xfrm_pol_inexact_node *n,
                       u16 family)
{
    struct xfrm_pol_inexact_node *node;
    struct xfrm_policy *tmp;
    struct rb_node *rnode;

    /* To-be-merged node v has a subtree.
     *
     * Dismantle it and insert its nodes to n->root.
     */
    while ((rnode = rb_first(&v->root)) != NULL) {
        node = rb_entry(rnode, struct xfrm_pol_inexact_node, node);
        rb_erase(&node->node, &v->root);
        xfrm_policy_inexact_node_reinsert(net, node, &n->root,
                          family);
    }

    /* unhook v's own policies; bydst_reinsert tells the list
     * reinsert helper which entries need a new home
     */
    hlist_for_each_entry(tmp, &v->hhead, bydst) {
        tmp->bydst_reinsert = true;
        hlist_del_rcu(&tmp->bydst);
    }

    xfrm_policy_inexact_list_reinsert(net, n, family);
}
0987 
/* Find or create the tree node for @addr/@prefixlen in @root.
 *
 * Returns an existing node when one already covers the address with an
 * equal-or-shorter prefix.  Otherwise a node is (re)built: any existing
 * nodes that fall inside the new, wider prefix are erased from the tree
 * and merged into a single "cached" node which is finally linked in.
 * Returns NULL only on allocation failure.
 */
static struct xfrm_pol_inexact_node *
xfrm_policy_inexact_insert_node(struct net *net,
                struct rb_root *root,
                xfrm_address_t *addr,
                u16 family, u8 prefixlen, u8 dir)
{
    struct xfrm_pol_inexact_node *cached = NULL;
    struct rb_node **p, *parent = NULL;
    struct xfrm_pol_inexact_node *node;

    p = &root->rb_node;
    while (*p) {
        int delta;

        parent = *p;
        node = rb_entry(*p, struct xfrm_pol_inexact_node, node);

        delta = xfrm_policy_addr_delta(addr, &node->addr,
                           node->prefixlen,
                           family);
        if (delta == 0 && prefixlen >= node->prefixlen) {
            WARN_ON_ONCE(cached); /* ipsec policies got lost */
            return node;
        }

        if (delta < 0)
            p = &parent->rb_left;
        else
            p = &parent->rb_right;

        if (prefixlen < node->prefixlen) {
            delta = xfrm_policy_addr_delta(addr, &node->addr,
                               prefixlen,
                               family);
            if (delta)
                continue;

            /* This node is a subnet of the new prefix. It needs
             * to be removed and re-inserted with the smaller
             * prefix and all nodes that are now also covered
             * by the reduced prefixlen.
             */
            rb_erase(&node->node, root);

            if (!cached) {
                /* first covered node: repurpose it as the
                 * node for the new, wider prefix
                 */
                xfrm_pol_inexact_node_init(node, addr,
                               prefixlen);
                cached = node;
            } else {
                /* This node also falls within the new
                 * prefixlen. Merge the to-be-reinserted
                 * node and this one.
                 */
                xfrm_policy_inexact_node_merge(net, node,
                                   cached, family);
                kfree_rcu(node, rcu);
            }

            /* restart */
            p = &root->rb_node;
            parent = NULL;
        }
    }

    node = cached;
    if (!node) {
        node = xfrm_pol_inexact_node_alloc(addr, prefixlen);
        if (!node)
            return NULL;
    }

    rb_link_node_rcu(&node->node, parent, p);
    rb_insert_color(&node->node, root);

    return node;
}
1064 
1065 static void xfrm_policy_inexact_gc_tree(struct rb_root *r, bool rm)
1066 {
1067     struct xfrm_pol_inexact_node *node;
1068     struct rb_node *rn = rb_first(r);
1069 
1070     while (rn) {
1071         node = rb_entry(rn, struct xfrm_pol_inexact_node, node);
1072 
1073         xfrm_policy_inexact_gc_tree(&node->root, rm);
1074         rn = rb_next(rn);
1075 
1076         if (!hlist_empty(&node->hhead) || !RB_EMPTY_ROOT(&node->root)) {
1077             WARN_ON_ONCE(rm);
1078             continue;
1079         }
1080 
1081         rb_erase(&node->node, r);
1082         kfree_rcu(node, rcu);
1083     }
1084 }
1085 
/* Garbage-collect bin @b and free it if it became completely empty.
 *
 * Tree GC runs inside the bin's seqcount write section so lockless
 * readers retry.  When both trees and the any:any list are empty, the
 * bin is removed from the global rhashtable and freed after a grace
 * period.  @net_exit is set on netns teardown, when leftovers indicate
 * a bug.  Callers hold net->xfrm.xfrm_policy_lock.
 */
static void __xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b, bool net_exit)
{
    write_seqcount_begin(&b->count);
    xfrm_policy_inexact_gc_tree(&b->root_d, net_exit);
    xfrm_policy_inexact_gc_tree(&b->root_s, net_exit);
    write_seqcount_end(&b->count);

    /* still in use: keep the bin */
    if (!RB_EMPTY_ROOT(&b->root_d) || !RB_EMPTY_ROOT(&b->root_s) ||
        !hlist_empty(&b->hhead)) {
        WARN_ON_ONCE(net_exit);
        return;
    }

    if (rhashtable_remove_fast(&xfrm_policy_inexact_table, &b->head,
                   xfrm_pol_inexact_params) == 0) {
        list_del(&b->inexact_bins);
        kfree_rcu(b, rcu);
    }
}
1105 
/* Locked wrapper around __xfrm_policy_inexact_prune_bin(): the bin
 * carries its owning netns in its key, which provides the policy lock.
 */
static void xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b)
{
    struct net *net = read_pnet(&b->k.net);

    spin_lock_bh(&net->xfrm.xfrm_policy_lock);
    __xfrm_policy_inexact_prune_bin(b, false);
    spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
}
1114 
/* Prune every inexact bin of @net; empty bins are freed.
 * Caller must hold net->xfrm.xfrm_policy_lock (asserted below).
 */
static void __xfrm_policy_inexact_flush(struct net *net)
{
    struct xfrm_pol_inexact_bin *bin, *t;

    lockdep_assert_held(&net->xfrm.xfrm_policy_lock);

    list_for_each_entry_safe(bin, t, &net->xfrm.inexact_bins, inexact_bins)
        __xfrm_policy_inexact_prune_bin(bin, false);
}
1124 
/* Pick (and if needed create) the hlist chain inside @bin that will
 * hold @policy, based on which selector halves are wildcards:
 *
 *   - both addresses "any"      -> bin->hhead
 *   - daddr "any", saddr fixed  -> node in bin->root_s keyed by saddr
 *   - daddr fixed, saddr "any"  -> node in bin->root_d keyed by daddr
 *   - both fixed                -> saddr node nested in the daddr
 *                                  node's subtree
 *
 * Tree mutations run inside the bin's seqcount write section.
 * Returns NULL when a needed tree node could not be allocated.
 */
static struct hlist_head *
xfrm_policy_inexact_alloc_chain(struct xfrm_pol_inexact_bin *bin,
                struct xfrm_policy *policy, u8 dir)
{
    struct xfrm_pol_inexact_node *n;
    struct net *net;

    net = xp_net(policy);
    lockdep_assert_held(&net->xfrm.xfrm_policy_lock);

    if (xfrm_policy_inexact_insert_use_any_list(policy))
        return &bin->hhead;

    if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.daddr,
                           policy->family,
                           policy->selector.prefixlen_d)) {
        write_seqcount_begin(&bin->count);
        n = xfrm_policy_inexact_insert_node(net,
                            &bin->root_s,
                            &policy->selector.saddr,
                            policy->family,
                            policy->selector.prefixlen_s,
                            dir);
        write_seqcount_end(&bin->count);
        if (!n)
            return NULL;

        return &n->hhead;
    }

    /* daddr is fixed */
    write_seqcount_begin(&bin->count);
    n = xfrm_policy_inexact_insert_node(net,
                        &bin->root_d,
                        &policy->selector.daddr,
                        policy->family,
                        policy->selector.prefixlen_d, dir);
    write_seqcount_end(&bin->count);
    if (!n)
        return NULL;

    /* saddr is wildcard */
    if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.saddr,
                           policy->family,
                           policy->selector.prefixlen_s))
        return &n->hhead;

    /* both fixed: saddr subtree hangs off the daddr node */
    write_seqcount_begin(&bin->count);
    n = xfrm_policy_inexact_insert_node(net,
                        &n->root,
                        &policy->selector.saddr,
                        policy->family,
                        policy->selector.prefixlen_s, dir);
    write_seqcount_end(&bin->count);
    if (!n)
        return NULL;

    return &n->hhead;
}
1184 
/* Insert @policy into the inexact (non-hashable) storage for @dir.
 *
 * Returns the policy it replaced (same type/if_id/selector/mark/ctx),
 * NULL when nothing was replaced, ERR_PTR(-ENOMEM) on allocation
 * failure, or ERR_PTR(-EEXIST) when @excl is set and a matching policy
 * already exists.  On any failure the freshly touched bin is pruned so
 * no empty nodes linger.
 */
static struct xfrm_policy *
xfrm_policy_inexact_insert(struct xfrm_policy *policy, u8 dir, int excl)
{
    struct xfrm_pol_inexact_bin *bin;
    struct xfrm_policy *delpol;
    struct hlist_head *chain;
    struct net *net;

    bin = xfrm_policy_inexact_alloc_bin(policy, dir);
    if (!bin)
        return ERR_PTR(-ENOMEM);

    net = xp_net(policy);
    lockdep_assert_held(&net->xfrm.xfrm_policy_lock);

    chain = xfrm_policy_inexact_alloc_chain(bin, policy, dir);
    if (!chain) {
        __xfrm_policy_inexact_prune_bin(bin, false);
        return ERR_PTR(-ENOMEM);
    }

    delpol = xfrm_policy_insert_list(chain, policy, excl);
    if (delpol && excl) {
        __xfrm_policy_inexact_prune_bin(bin, false);
        return ERR_PTR(-EEXIST);
    }

    /* also link into the per-direction flat inexact list, which
     * assigns the ->pos ordering used by lookups
     */
    chain = &net->xfrm.policy_inexact[dir];
    xfrm_policy_insert_inexact_list(chain, policy);

    if (delpol)
        __xfrm_policy_inexact_prune_bin(bin, false);

    return delpol;
}
1220 
/* Worker: rebuild the bydst policy hash tables after the prefixlen
 * thresholds changed (scheduled via xfrm_policy_hash_rebuild()).
 *
 * Three phases under the policy lock and hash-generation seqcount:
 *  1. pre-allocate inexact bins/chains for every policy that will no
 *     longer be hashable, so the destructive part cannot fail;
 *  2. unlink all policies from the bydst tables and inexact lists and
 *     store the new per-direction bit thresholds;
 *  3. re-insert every live policy in creation order, either into its
 *     hash chain (priority-ordered) or into the inexact storage.
 */
static void xfrm_hash_rebuild(struct work_struct *work)
{
    struct net *net = container_of(work, struct net,
                       xfrm.policy_hthresh.work);
    unsigned int hmask;
    struct xfrm_policy *pol;
    struct xfrm_policy *policy;
    struct hlist_head *chain;
    struct hlist_head *odst;
    struct hlist_node *newpos;
    int i;
    int dir;
    unsigned seq;
    u8 lbits4, rbits4, lbits6, rbits6;

    mutex_lock(&hash_resize_mutex);

    /* read selector prefixlen thresholds */
    do {
        seq = read_seqbegin(&net->xfrm.policy_hthresh.lock);

        lbits4 = net->xfrm.policy_hthresh.lbits4;
        rbits4 = net->xfrm.policy_hthresh.rbits4;
        lbits6 = net->xfrm.policy_hthresh.lbits6;
        rbits6 = net->xfrm.policy_hthresh.rbits6;
    } while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq));

    spin_lock_bh(&net->xfrm.xfrm_policy_lock);
    write_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation);

    /* make sure that we can insert the indirect policies again before
     * we start with destructive action.
     */
    list_for_each_entry(policy, &net->xfrm.policy_all, walk.all) {
        struct xfrm_pol_inexact_bin *bin;
        u8 dbits, sbits;

        dir = xfrm_policy_id2dir(policy->index);
        if (policy->walk.dead || dir >= XFRM_POLICY_MAX)
            continue;

        /* map local/remote thresholds onto src/dst per direction */
        if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) {
            if (policy->family == AF_INET) {
                dbits = rbits4;
                sbits = lbits4;
            } else {
                dbits = rbits6;
                sbits = lbits6;
            }
        } else {
            if (policy->family == AF_INET) {
                dbits = lbits4;
                sbits = rbits4;
            } else {
                dbits = lbits6;
                sbits = rbits6;
            }
        }

        /* prefix too short to hash -> will go to inexact storage */
        if (policy->selector.prefixlen_d < dbits ||
            policy->selector.prefixlen_s < sbits)
            continue;

        bin = xfrm_policy_inexact_alloc_bin(policy, dir);
        if (!bin)
            goto out_unlock;

        if (!xfrm_policy_inexact_alloc_chain(bin, policy, dir))
            goto out_unlock;
    }

    /* reset the bydst and inexact table in all directions */
    for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
        struct hlist_node *n;

        hlist_for_each_entry_safe(policy, n,
                      &net->xfrm.policy_inexact[dir],
                      bydst_inexact_list) {
            hlist_del_rcu(&policy->bydst);
            hlist_del_init(&policy->bydst_inexact_list);
        }

        hmask = net->xfrm.policy_bydst[dir].hmask;
        odst = net->xfrm.policy_bydst[dir].table;
        for (i = hmask; i >= 0; i--) {
            hlist_for_each_entry_safe(policy, n, odst + i, bydst)
                hlist_del_rcu(&policy->bydst);
        }
        if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) {
            /* dir out => dst = remote, src = local */
            net->xfrm.policy_bydst[dir].dbits4 = rbits4;
            net->xfrm.policy_bydst[dir].sbits4 = lbits4;
            net->xfrm.policy_bydst[dir].dbits6 = rbits6;
            net->xfrm.policy_bydst[dir].sbits6 = lbits6;
        } else {
            /* dir in/fwd => dst = local, src = remote */
            net->xfrm.policy_bydst[dir].dbits4 = lbits4;
            net->xfrm.policy_bydst[dir].sbits4 = rbits4;
            net->xfrm.policy_bydst[dir].dbits6 = lbits6;
            net->xfrm.policy_bydst[dir].sbits6 = rbits6;
        }
    }

    /* re-insert all policies by order of creation */
    list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) {
        if (policy->walk.dead)
            continue;
        dir = xfrm_policy_id2dir(policy->index);
        if (dir >= XFRM_POLICY_MAX) {
            /* skip socket policies */
            continue;
        }
        newpos = NULL;
        chain = policy_hash_bysel(net, &policy->selector,
                      policy->family, dir);

        if (!chain) {
            /* not hashable; phase 1 guaranteed the chain exists,
             * so failure here would be a bug
             */
            void *p = xfrm_policy_inexact_insert(policy, dir, 0);

            WARN_ONCE(IS_ERR(p), "reinsert: %ld\n", PTR_ERR(p));
            continue;
        }

        /* keep hash chains ordered by ascending priority */
        hlist_for_each_entry(pol, chain, bydst) {
            if (policy->priority >= pol->priority)
                newpos = &pol->bydst;
            else
                break;
        }
        if (newpos)
            hlist_add_behind_rcu(&policy->bydst, newpos);
        else
            hlist_add_head_rcu(&policy->bydst, chain);
    }

out_unlock:
    __xfrm_policy_inexact_flush(net);
    write_seqcount_end(&net->xfrm.xfrm_policy_hash_generation);
    spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

    mutex_unlock(&hash_resize_mutex);
}
1363 
/* Request an asynchronous policy hash rebuild for @net; the scheduled
 * work item runs xfrm_hash_rebuild().
 */
void xfrm_policy_hash_rebuild(struct net *net)
{
    schedule_work(&net->xfrm.policy_hthresh.work);
}
EXPORT_SYMBOL(xfrm_policy_hash_rebuild);
1369 
/* Generate a new index... KAME seems to generate them ordered by cost
 * of an absolute unpredictability of ordering of rules. This will not pass. */
1372 static u32 xfrm_gen_index(struct net *net, int dir, u32 index)
1373 {
1374     static u32 idx_generator;
1375 
1376     for (;;) {
1377         struct hlist_head *list;
1378         struct xfrm_policy *p;
1379         u32 idx;
1380         int found;
1381 
1382         if (!index) {
1383             idx = (idx_generator | dir);
1384             idx_generator += 8;
1385         } else {
1386             idx = index;
1387             index = 0;
1388         }
1389 
1390         if (idx == 0)
1391             idx = 8;
1392         list = net->xfrm.policy_byidx + idx_hash(net, idx);
1393         found = 0;
1394         hlist_for_each_entry(p, list, byidx) {
1395             if (p->index == idx) {
1396                 found = 1;
1397                 break;
1398             }
1399         }
1400         if (!found)
1401             return idx;
1402     }
1403 }
1404 
1405 static inline int selector_cmp(struct xfrm_selector *s1, struct xfrm_selector *s2)
1406 {
1407     u32 *p1 = (u32 *) s1;
1408     u32 *p2 = (u32 *) s2;
1409     int len = sizeof(struct xfrm_selector) / sizeof(u32);
1410     int i;
1411 
1412     for (i = 0; i < len; i++) {
1413         if (p1[i] != p2[i])
1414             return 1;
1415     }
1416 
1417     return 0;
1418 }
1419 
/* Move packets held on @old's policy queue over to @new when @old is
 * being replaced.
 *
 * The two hold-queue locks are taken one after the other, never nested.
 * If @old's hold timer was still pending, the reference it held is
 * dropped; @new's timer is (re)armed to fire immediately with the
 * minimum timeout, taking a reference if it was not already pending.
 */
static void xfrm_policy_requeue(struct xfrm_policy *old,
                struct xfrm_policy *new)
{
    struct xfrm_policy_queue *pq = &old->polq;
    struct sk_buff_head list;

    if (skb_queue_empty(&pq->hold_queue))
        return;

    __skb_queue_head_init(&list);

    spin_lock_bh(&pq->hold_queue.lock);
    skb_queue_splice_init(&pq->hold_queue, &list);
    if (del_timer(&pq->hold_timer))
        xfrm_pol_put(old);
    spin_unlock_bh(&pq->hold_queue.lock);

    pq = &new->polq;

    spin_lock_bh(&pq->hold_queue.lock);
    skb_queue_splice(&list, &pq->hold_queue);
    pq->timeout = XFRM_QUEUE_TMO_MIN;
    if (!mod_timer(&pq->hold_timer, jiffies))
        xfrm_pol_hold(new);
    spin_unlock_bh(&pq->hold_queue.lock);
}
1446 
1447 static inline bool xfrm_policy_mark_match(const struct xfrm_mark *mark,
1448                       struct xfrm_policy *pol)
1449 {
1450     return mark->v == pol->mark.v && mark->m == pol->mark.m;
1451 }
1452 
1453 static u32 xfrm_pol_bin_key(const void *data, u32 len, u32 seed)
1454 {
1455     const struct xfrm_pol_inexact_key *k = data;
1456     u32 a = k->type << 24 | k->dir << 16 | k->family;
1457 
1458     return jhash_3words(a, k->if_id, net_hash_mix(read_pnet(&k->net)),
1459                 seed);
1460 }
1461 
/* rhashtable obj_hashfn: hash an existing bin via its embedded key. */
static u32 xfrm_pol_bin_obj(const void *data, u32 len, u32 seed)
{
    const struct xfrm_pol_inexact_bin *b = data;

    return xfrm_pol_bin_key(&b->k, 0, seed);
}
1468 
1469 static int xfrm_pol_bin_cmp(struct rhashtable_compare_arg *arg,
1470                 const void *ptr)
1471 {
1472     const struct xfrm_pol_inexact_key *key = arg->key;
1473     const struct xfrm_pol_inexact_bin *b = ptr;
1474     int ret;
1475 
1476     if (!net_eq(read_pnet(&b->k.net), read_pnet(&key->net)))
1477         return -1;
1478 
1479     ret = b->k.dir ^ key->dir;
1480     if (ret)
1481         return ret;
1482 
1483     ret = b->k.type ^ key->type;
1484     if (ret)
1485         return ret;
1486 
1487     ret = b->k.family ^ key->family;
1488     if (ret)
1489         return ret;
1490 
1491     return b->k.if_id ^ key->if_id;
1492 }
1493 
/* rhashtable configuration for the global inexact-policy bin table:
 * bins are keyed by (net, type, dir, family, if_id).
 */
static const struct rhashtable_params xfrm_pol_inexact_params = {
    .head_offset        = offsetof(struct xfrm_pol_inexact_bin, head),
    .hashfn         = xfrm_pol_bin_key,
    .obj_hashfn     = xfrm_pol_bin_obj,
    .obj_cmpfn      = xfrm_pol_bin_cmp,
    .automatic_shrinking    = true,
};
1501 
/* Link @policy into the flat per-direction inexact list @chain,
 * ordered by ascending priority, and renumber every entry's ->pos.
 *
 * An existing policy with identical type/if_id/selector/mark/ctx is
 * remembered in @delpol (the caller unlinks it separately); finding a
 * second duplicate would be a bug, hence the WARN_ON.
 */
static void xfrm_policy_insert_inexact_list(struct hlist_head *chain,
                        struct xfrm_policy *policy)
{
    struct xfrm_policy *pol, *delpol = NULL;
    struct hlist_node *newpos = NULL;
    int i = 0;

    hlist_for_each_entry(pol, chain, bydst_inexact_list) {
        if (pol->type == policy->type &&
            pol->if_id == policy->if_id &&
            !selector_cmp(&pol->selector, &policy->selector) &&
            xfrm_policy_mark_match(&policy->mark, pol) &&
            xfrm_sec_ctx_match(pol->security, policy->security) &&
            !WARN_ON(delpol)) {
            delpol = pol;
            if (policy->priority > pol->priority)
                continue;
        } else if (policy->priority >= pol->priority) {
            newpos = &pol->bydst_inexact_list;
            continue;
        }
        if (delpol)
            break;
    }

    if (newpos)
        hlist_add_behind_rcu(&policy->bydst_inexact_list, newpos);
    else
        hlist_add_head_rcu(&policy->bydst_inexact_list, chain);

    /* renumber: ->pos gives each policy its rank within the chain */
    hlist_for_each_entry(pol, chain, bydst_inexact_list) {
        pol->pos = i;
        i++;
    }
}
1537 
/* Insert @policy into hash chain @chain, keeping it sorted by
 * ascending priority.
 *
 * Returns the replaced policy when an entry with identical
 * type/if_id/selector/mark/ctx exists (still linked; caller unlinks),
 * NULL when nothing matched, or ERR_PTR(-EEXIST) when @excl forbids
 * replacing an existing match.
 */
static struct xfrm_policy *xfrm_policy_insert_list(struct hlist_head *chain,
                           struct xfrm_policy *policy,
                           bool excl)
{
    struct xfrm_policy *pol, *newpos = NULL, *delpol = NULL;

    hlist_for_each_entry(pol, chain, bydst) {
        if (pol->type == policy->type &&
            pol->if_id == policy->if_id &&
            !selector_cmp(&pol->selector, &policy->selector) &&
            xfrm_policy_mark_match(&policy->mark, pol) &&
            xfrm_sec_ctx_match(pol->security, policy->security) &&
            !WARN_ON(delpol)) {
            if (excl)
                return ERR_PTR(-EEXIST);
            delpol = pol;
            if (policy->priority > pol->priority)
                continue;
        } else if (policy->priority >= pol->priority) {
            newpos = pol;
            continue;
        }
        if (delpol)
            break;
    }

    if (newpos)
        hlist_add_behind_rcu(&policy->bydst, &newpos->bydst);
    else
        hlist_add_head_rcu(&policy->bydst, chain);

    return delpol;
}
1571 
/* Insert @policy for direction @dir, replacing an identical existing
 * policy unless @excl is set.
 *
 * Picks hashed (bydst) or inexact storage depending on whether the
 * selector is hashable.  On success the policy gets an index, is linked
 * into the byidx hash, its timer is armed, and the routing genid of the
 * affected address family is bumped so cached routes are revalidated.
 * Returns 0 or a negative errno (e.g. -EEXIST with @excl).
 */
int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
{
    struct net *net = xp_net(policy);
    struct xfrm_policy *delpol;
    struct hlist_head *chain;

    spin_lock_bh(&net->xfrm.xfrm_policy_lock);
    chain = policy_hash_bysel(net, &policy->selector, policy->family, dir);
    if (chain)
        delpol = xfrm_policy_insert_list(chain, policy, excl);
    else
        delpol = xfrm_policy_inexact_insert(policy, dir, excl);

    if (IS_ERR(delpol)) {
        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
        return PTR_ERR(delpol);
    }

    __xfrm_policy_link(policy, dir);

    /* After previous checking, family can either be AF_INET or AF_INET6 */
    if (policy->family == AF_INET)
        rt_genid_bump_ipv4(net);
    else
        rt_genid_bump_ipv6(net);

    if (delpol) {
        /* hand over any queued packets before killing the old one */
        xfrm_policy_requeue(delpol, policy);
        __xfrm_policy_unlink(delpol, dir);
    }
    policy->index = delpol ? delpol->index : xfrm_gen_index(net, dir, policy->index);
    hlist_add_head(&policy->byidx, net->xfrm.policy_byidx+idx_hash(net, policy->index));
    policy->curlft.add_time = ktime_get_real_seconds();
    policy->curlft.use_time = 0;
    if (!mod_timer(&policy->timer, jiffies + HZ))
        xfrm_pol_hold(policy);
    spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

    if (delpol)
        xfrm_policy_kill(delpol);
    else if (xfrm_bydst_should_resize(net, dir, NULL))
        schedule_work(&net->xfrm.policy_hash_work);

    return 0;
}
EXPORT_SYMBOL(xfrm_policy_insert);
1618 
1619 static struct xfrm_policy *
1620 __xfrm_policy_bysel_ctx(struct hlist_head *chain, const struct xfrm_mark *mark,
1621             u32 if_id, u8 type, int dir, struct xfrm_selector *sel,
1622             struct xfrm_sec_ctx *ctx)
1623 {
1624     struct xfrm_policy *pol;
1625 
1626     if (!chain)
1627         return NULL;
1628 
1629     hlist_for_each_entry(pol, chain, bydst) {
1630         if (pol->type == type &&
1631             pol->if_id == if_id &&
1632             xfrm_policy_mark_match(mark, pol) &&
1633             !selector_cmp(sel, &pol->selector) &&
1634             xfrm_sec_ctx_match(ctx, pol->security))
1635             return pol;
1636     }
1637 
1638     return NULL;
1639 }
1640 
/* Look up (and optionally delete) the policy matching the given
 * selector, mark, context, type and direction.
 *
 * Hashable selectors use the bydst hash; otherwise every candidate
 * chain of the matching inexact bin is scanned and the match with the
 * lowest ->pos (insertion rank) wins.  On @delete the LSM is consulted
 * first; a refusal is reported via *err with the policy still linked.
 * Returns the policy with a reference held, or NULL.
 */
struct xfrm_policy *
xfrm_policy_bysel_ctx(struct net *net, const struct xfrm_mark *mark, u32 if_id,
              u8 type, int dir, struct xfrm_selector *sel,
              struct xfrm_sec_ctx *ctx, int delete, int *err)
{
    struct xfrm_pol_inexact_bin *bin = NULL;
    struct xfrm_policy *pol, *ret = NULL;
    struct hlist_head *chain;

    *err = 0;
    spin_lock_bh(&net->xfrm.xfrm_policy_lock);
    chain = policy_hash_bysel(net, sel, sel->family, dir);
    if (!chain) {
        struct xfrm_pol_inexact_candidates cand;
        int i;

        bin = xfrm_policy_inexact_lookup(net, type,
                         sel->family, dir, if_id);
        if (!bin) {
            spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
            return NULL;
        }

        if (!xfrm_policy_find_inexact_candidates(&cand, bin,
                             &sel->saddr,
                             &sel->daddr)) {
            spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
            return NULL;
        }

        /* several candidate chains may match; prefer the policy
         * inserted earliest (smallest ->pos)
         */
        pol = NULL;
        for (i = 0; i < ARRAY_SIZE(cand.res); i++) {
            struct xfrm_policy *tmp;

            tmp = __xfrm_policy_bysel_ctx(cand.res[i], mark,
                              if_id, type, dir,
                              sel, ctx);
            if (!tmp)
                continue;

            if (!pol || tmp->pos < pol->pos)
                pol = tmp;
        }
    } else {
        pol = __xfrm_policy_bysel_ctx(chain, mark, if_id, type, dir,
                          sel, ctx);
    }

    if (pol) {
        xfrm_pol_hold(pol);
        if (delete) {
            *err = security_xfrm_policy_delete(pol->security);
            if (*err) {
                spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
                return pol;
            }
            __xfrm_policy_unlink(pol, dir);
        }
        ret = pol;
    }
    spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

    if (ret && delete)
        xfrm_policy_kill(ret);
    if (bin && delete)
        xfrm_policy_inexact_prune_bin(bin);
    return ret;
}
EXPORT_SYMBOL(xfrm_policy_bysel_ctx);
1710 
/* Look up (and optionally delete) a policy by its index @id.
 *
 * The direction encoded in the index must agree with @dir, otherwise
 * *err is -ENOENT.  On @delete the LSM may veto; the error is then
 * reported via *err with the policy still linked but referenced.
 * Returns the policy with a reference held, or NULL.
 */
struct xfrm_policy *
xfrm_policy_byid(struct net *net, const struct xfrm_mark *mark, u32 if_id,
         u8 type, int dir, u32 id, int delete, int *err)
{
    struct xfrm_policy *pol, *ret;
    struct hlist_head *chain;

    *err = -ENOENT;
    if (xfrm_policy_id2dir(id) != dir)
        return NULL;

    *err = 0;
    spin_lock_bh(&net->xfrm.xfrm_policy_lock);
    chain = net->xfrm.policy_byidx + idx_hash(net, id);
    ret = NULL;
    hlist_for_each_entry(pol, chain, byidx) {
        if (pol->type == type && pol->index == id &&
            pol->if_id == if_id && xfrm_policy_mark_match(mark, pol)) {
            xfrm_pol_hold(pol);
            if (delete) {
                *err = security_xfrm_policy_delete(
                                pol->security);
                if (*err) {
                    spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
                    return pol;
                }
                __xfrm_policy_unlink(pol, dir);
            }
            ret = pol;
            break;
        }
    }
    spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

    if (ret && delete)
        xfrm_policy_kill(ret);
    return ret;
}
EXPORT_SYMBOL(xfrm_policy_byid);
1750 
#ifdef CONFIG_SECURITY_NETWORK_XFRM
/* Ask the LSM for permission to delete every policy of @type before a
 * flush.  On the first refusal the failed deletion attempt is audited
 * and the error returned, leaving all policies in place.  Returns 0
 * when every policy may be deleted.
 */
static inline int
xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid)
{
    struct xfrm_policy *pol;
    int err = 0;

    list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) {
        /* skip dead entries, socket policies and other types */
        if (pol->walk.dead ||
            xfrm_policy_id2dir(pol->index) >= XFRM_POLICY_MAX ||
            pol->type != type)
            continue;

        err = security_xfrm_policy_delete(pol->security);
        if (err) {
            xfrm_audit_policy_delete(pol, 0, task_valid);
            return err;
        }
    }
    return err;
}
#else
/* No LSM support: flushing is always permitted. */
static inline int
xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid)
{
    return 0;
}
#endif
1779 
/* Delete every policy of @type in @net.
 *
 * After the LSM pre-check, each policy is unlinked under the lock; the
 * lock is then dropped for the audit/kill work and the list walk is
 * restarted from scratch (goto again), since the list may have changed
 * while unlocked.  Returns 0, the LSM error, or -ESRCH when nothing
 * was deleted.
 */
int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
{
    int dir, err = 0, cnt = 0;
    struct xfrm_policy *pol;

    spin_lock_bh(&net->xfrm.xfrm_policy_lock);

    err = xfrm_policy_flush_secctx_check(net, type, task_valid);
    if (err)
        goto out;

again:
    list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) {
        dir = xfrm_policy_id2dir(pol->index);
        if (pol->walk.dead ||
            dir >= XFRM_POLICY_MAX ||
            pol->type != type)
            continue;

        __xfrm_policy_unlink(pol, dir);
        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
        cnt++;
        xfrm_audit_policy_delete(pol, 1, task_valid);
        xfrm_policy_kill(pol);
        spin_lock_bh(&net->xfrm.xfrm_policy_lock);
        goto again;
    }
    if (cnt)
        __xfrm_policy_inexact_flush(net);
    else
        err = -ESRCH;
out:
    spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
    return err;
}
EXPORT_SYMBOL(xfrm_policy_flush);
1816 
/* Resumable iteration over all policies of @net, invoking @func for
 * each live policy of the walker's type.
 *
 * The walker embeds a dead dummy entry in the global list as a cursor:
 * a fresh walk starts at the list head, a resumed walk continues after
 * the cursor.  When @func fails, the cursor is moved behind the current
 * entry so the next call resumes there; on completion the cursor is
 * removed.  Returns 0, @func's error, or -ENOENT when the walk matched
 * nothing at all.
 */
int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk,
             int (*func)(struct xfrm_policy *, int, int, void*),
             void *data)
{
    struct xfrm_policy *pol;
    struct xfrm_policy_walk_entry *x;
    int error = 0;

    if (walk->type >= XFRM_POLICY_TYPE_MAX &&
        walk->type != XFRM_POLICY_TYPE_ANY)
        return -EINVAL;

    /* finished walk being resumed again: nothing left to do */
    if (list_empty(&walk->walk.all) && walk->seq != 0)
        return 0;

    spin_lock_bh(&net->xfrm.xfrm_policy_lock);
    if (list_empty(&walk->walk.all))
        x = list_first_entry(&net->xfrm.policy_all, struct xfrm_policy_walk_entry, all);
    else
        x = list_first_entry(&walk->walk.all,
                     struct xfrm_policy_walk_entry, all);

    list_for_each_entry_from(x, &net->xfrm.policy_all, all) {
        if (x->dead)
            continue;
        pol = container_of(x, struct xfrm_policy, walk);
        if (walk->type != XFRM_POLICY_TYPE_ANY &&
            walk->type != pol->type)
            continue;
        error = func(pol, xfrm_policy_id2dir(pol->index),
                 walk->seq, data);
        if (error) {
            /* park the cursor here for the next resume */
            list_move_tail(&walk->walk.all, &x->all);
            goto out;
        }
        walk->seq++;
    }
    if (walk->seq == 0) {
        error = -ENOENT;
        goto out;
    }
    list_del_init(&walk->walk.all);
out:
    spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
    return error;
}
EXPORT_SYMBOL(xfrm_policy_walk);
1864 
1865 void xfrm_policy_walk_init(struct xfrm_policy_walk *walk, u8 type)
1866 {
1867     INIT_LIST_HEAD(&walk->walk.all);
1868     walk->walk.dead = 1;
1869     walk->type = type;
1870     walk->seq = 0;
1871 }
1872 EXPORT_SYMBOL(xfrm_policy_walk_init);
1873 
1874 void xfrm_policy_walk_done(struct xfrm_policy_walk *walk, struct net *net)
1875 {
1876     if (list_empty(&walk->walk.all))
1877         return;
1878 
1879     spin_lock_bh(&net->xfrm.xfrm_policy_lock); /*FIXME where is net? */
1880     list_del(&walk->walk.all);
1881     spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
1882 }
1883 EXPORT_SYMBOL(xfrm_policy_walk_done);
1884 
/*
 * Check whether policy @pol applies to flow @fl.
 *
 * Returns 0 when the policy matches (and the LSM allows it), -ESRCH
 * when it does not, or another negative errno from the security
 * module lookup.  (@dir is currently unused in this check.)
 */
static int xfrm_policy_match(const struct xfrm_policy *pol,
			     const struct flowi *fl,
			     u8 type, u16 family, int dir, u32 if_id)
{
	const struct xfrm_selector *sel = &pol->selector;
	int ret = -ESRCH;
	bool match;

	/* Cheap field comparisons first; the flow mark must match the
	 * policy's value under the policy's mask.
	 */
	if (pol->family != family ||
	    pol->if_id != if_id ||
	    (fl->flowi_mark & pol->mark.m) != pol->mark.v ||
	    pol->type != type)
		return ret;

	match = xfrm_selector_match(sel, fl, family);
	if (match)
		ret = security_xfrm_policy_lookup(pol->security, fl->flowi_secid);
	return ret;
}
1909 
/* Look up the inexact-policy tree node whose prefix covers @addr in
 * tree @r.
 *
 * Runs locklessly under RCU; @count is the seqcount guarding tree
 * mutations, so a failed descent is retried when a writer changed the
 * tree underneath us.  Returns NULL when no prefix covers @addr.
 */
static struct xfrm_pol_inexact_node *
xfrm_policy_lookup_inexact_addr(const struct rb_root *r,
				seqcount_spinlock_t *count,
				const xfrm_address_t *addr, u16 family)
{
	const struct rb_node *parent;
	int seq;

again:
	seq = read_seqcount_begin(count);

	parent = rcu_dereference_raw(r->rb_node);
	while (parent) {
		struct xfrm_pol_inexact_node *node;
		int delta;

		node = rb_entry(parent, struct xfrm_pol_inexact_node, node);

		delta = xfrm_policy_addr_delta(addr, &node->addr,
					       node->prefixlen, family);
		if (delta < 0) {
			parent = rcu_dereference_raw(parent->rb_left);
			continue;
		} else if (delta > 0) {
			parent = rcu_dereference_raw(parent->rb_right);
			continue;
		}

		return node;
	}

	/* A miss may be due to a concurrent rebalance; retry if the
	 * tree changed while we walked it.
	 */
	if (read_seqcount_retry(count, seq))
		goto again;

	return NULL;
}
1946 
/* Fill @cand with the up-to-four hlist heads that may hold policies
 * matching @saddr/@daddr in bin @b: the any/any list plus the
 * per-daddr, per-saddr and daddr+saddr tree nodes.
 *
 * Returns false only when @b is NULL.
 */
static bool
xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand,
				    struct xfrm_pol_inexact_bin *b,
				    const xfrm_address_t *saddr,
				    const xfrm_address_t *daddr)
{
	struct xfrm_pol_inexact_node *n;
	u16 family;

	if (!b)
		return false;

	family = b->k.family;
	memset(cand, 0, sizeof(*cand));
	cand->res[XFRM_POL_CAND_ANY] = &b->hhead;

	n = xfrm_policy_lookup_inexact_addr(&b->root_d, &b->count, daddr,
					    family);
	if (n) {
		cand->res[XFRM_POL_CAND_DADDR] = &n->hhead;
		/* the saddr tree for "both" is nested under the
		 * matching daddr node
		 */
		n = xfrm_policy_lookup_inexact_addr(&n->root, &b->count, saddr,
						    family);
		if (n)
			cand->res[XFRM_POL_CAND_BOTH] = &n->hhead;
	}

	n = xfrm_policy_lookup_inexact_addr(&b->root_s, &b->count, saddr,
					    family);
	if (n)
		cand->res[XFRM_POL_CAND_SADDR] = &n->hhead;

	return true;
}
1980 
/* RCU lookup of the inexact-policy bin keyed by
 * (net, family, type, dir, if_id).  Caller must be in an RCU
 * read-side critical section.
 */
static struct xfrm_pol_inexact_bin *
xfrm_policy_inexact_lookup_rcu(struct net *net, u8 type, u16 family,
			       u8 dir, u32 if_id)
{
	struct xfrm_pol_inexact_key k = {
		.family = family,
		.type = type,
		.dir = dir,
		.if_id = if_id,
	};

	write_pnet(&k.net, net);

	return rhashtable_lookup(&xfrm_policy_inexact_table, &k,
				 xfrm_pol_inexact_params);
}
1997 
/* Locked variant of the inexact bin lookup: requires the per-netns
 * xfrm_policy_lock, which keeps the returned bin from being freed
 * while the caller uses it.
 */
static struct xfrm_pol_inexact_bin *
xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family,
			   u8 dir, u32 if_id)
{
	struct xfrm_pol_inexact_bin *bin;

	lockdep_assert_held(&net->xfrm.xfrm_policy_lock);

	rcu_read_lock();
	bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id);
	rcu_read_unlock();

	return bin;
}
2012 
/* Scan one candidate chain for a policy matching @fl that beats
 * @prefer.  Chains are ordered by ascending ->priority, so the scan
 * stops at the first entry with a worse priority than @prefer; among
 * equal priorities the older policy (smaller ->pos) wins.
 *
 * Returns the winning policy, NULL when nothing on this chain wins,
 * or an ERR_PTR propagated from the security match.
 */
static struct xfrm_policy *
__xfrm_policy_eval_candidates(struct hlist_head *chain,
			      struct xfrm_policy *prefer,
			      const struct flowi *fl,
			      u8 type, u16 family, int dir, u32 if_id)
{
	u32 priority = prefer ? prefer->priority : ~0u;
	struct xfrm_policy *pol;

	if (!chain)
		return NULL;

	hlist_for_each_entry_rcu(pol, chain, bydst) {
		int err;

		if (pol->priority > priority)
			break;

		err = xfrm_policy_match(pol, fl, type, family, dir, if_id);
		if (err) {
			if (err != -ESRCH)
				return ERR_PTR(err);

			/* -ESRCH: no match, keep scanning */
			continue;
		}

		if (prefer) {
			/* matches.  Is it older than *prefer? */
			if (pol->priority == priority &&
			    prefer->pos < pol->pos)
				return prefer;
		}

		return pol;
	}

	return NULL;
}
2051 
2052 static struct xfrm_policy *
2053 xfrm_policy_eval_candidates(struct xfrm_pol_inexact_candidates *cand,
2054                 struct xfrm_policy *prefer,
2055                 const struct flowi *fl,
2056                 u8 type, u16 family, int dir, u32 if_id)
2057 {
2058     struct xfrm_policy *tmp;
2059     int i;
2060 
2061     for (i = 0; i < ARRAY_SIZE(cand->res); i++) {
2062         tmp = __xfrm_policy_eval_candidates(cand->res[i],
2063                             prefer,
2064                             fl, type, family, dir,
2065                             if_id);
2066         if (!tmp)
2067             continue;
2068 
2069         if (IS_ERR(tmp))
2070             return tmp;
2071         prefer = tmp;
2072     }
2073 
2074     return prefer;
2075 }
2076 
/* Find the best policy of @type for flow @fl, consulting both the
 * exact hash table and the inexact candidate lists.
 *
 * Runs under RCU; the whole lookup restarts if the policy hash tables
 * were resized concurrently, or if the refcount of the chosen policy
 * could not be taken (it was being freed).  Returns NULL (no match),
 * a policy with its refcount held, or an ERR_PTR.
 */
static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
						     const struct flowi *fl,
						     u16 family, u8 dir,
						     u32 if_id)
{
	struct xfrm_pol_inexact_candidates cand;
	const xfrm_address_t *daddr, *saddr;
	struct xfrm_pol_inexact_bin *bin;
	struct xfrm_policy *pol, *ret;
	struct hlist_head *chain;
	unsigned int sequence;
	int err;

	daddr = xfrm_flowi_daddr(fl, family);
	saddr = xfrm_flowi_saddr(fl, family);
	if (unlikely(!daddr || !saddr))
		return NULL;

	rcu_read_lock();
 retry:
	/* Sample a stable view of the hash chain across table resizes. */
	do {
		sequence = read_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation);
		chain = policy_hash_direct(net, daddr, saddr, family, dir);
	} while (read_seqcount_retry(&net->xfrm.xfrm_policy_hash_generation, sequence));

	ret = NULL;
	hlist_for_each_entry_rcu(pol, chain, bydst) {
		err = xfrm_policy_match(pol, fl, type, family, dir, if_id);
		if (err) {
			if (err == -ESRCH)
				continue;
			else {
				ret = ERR_PTR(err);
				goto fail;
			}
		} else {
			ret = pol;
			break;
		}
	}
	/* An inexact policy may still beat the exact match, so evaluate
	 * the candidate lists with the current best as the baseline.
	 */
	bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id);
	if (!bin || !xfrm_policy_find_inexact_candidates(&cand, bin, saddr,
							 daddr))
		goto skip_inexact;

	pol = xfrm_policy_eval_candidates(&cand, ret, fl, type,
					  family, dir, if_id);
	if (pol) {
		ret = pol;
		if (IS_ERR(pol))
			goto fail;
	}

skip_inexact:
	if (read_seqcount_retry(&net->xfrm.xfrm_policy_hash_generation, sequence))
		goto retry;

	/* lost the refcount race: the policy is going away, restart */
	if (ret && !xfrm_pol_hold_rcu(ret))
		goto retry;
fail:
	rcu_read_unlock();

	return ret;
}
2141 
2142 static struct xfrm_policy *xfrm_policy_lookup(struct net *net,
2143                           const struct flowi *fl,
2144                           u16 family, u8 dir, u32 if_id)
2145 {
2146 #ifdef CONFIG_XFRM_SUB_POLICY
2147     struct xfrm_policy *pol;
2148 
2149     pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family,
2150                     dir, if_id);
2151     if (pol != NULL)
2152         return pol;
2153 #endif
2154     return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family,
2155                      dir, if_id);
2156 }
2157 
/* Look up the socket-attached policy of @sk for direction @dir under
 * RCU.
 *
 * Returns NULL when no socket policy applies, a policy with its
 * refcount held on success, or an ERR_PTR from the security lookup.
 * Restarts if the refcount race is lost (policy being freed).
 */
static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
						 const struct flowi *fl,
						 u16 family, u32 if_id)
{
	struct xfrm_policy *pol;

	rcu_read_lock();
 again:
	pol = rcu_dereference(sk->sk_policy[dir]);
	if (pol != NULL) {
		bool match;
		int err = 0;

		if (pol->family != family) {
			pol = NULL;
			goto out;
		}

		match = xfrm_selector_match(&pol->selector, fl, family);
		if (match) {
			/* the socket mark must match under the
			 * policy's mask, and the interface id must
			 * match exactly
			 */
			if ((sk->sk_mark & pol->mark.m) != pol->mark.v ||
			    pol->if_id != if_id) {
				pol = NULL;
				goto out;
			}
			err = security_xfrm_policy_lookup(pol->security,
							  fl->flowi_secid);
			if (!err) {
				if (!xfrm_pol_hold_rcu(pol))
					goto again;
			} else if (err == -ESRCH) {
				pol = NULL;
			} else {
				pol = ERR_PTR(err);
			}
		} else
			pol = NULL;
	}
out:
	rcu_read_unlock();
	return pol;
}
2200 
/* Add @pol to the per-netns walk list, bump the per-direction count
 * and take a reference.  NOTE(review): visible callers (e.g.
 * xfrm_sk_policy_insert) hold net->xfrm.xfrm_policy_lock around this.
 */
static void __xfrm_policy_link(struct xfrm_policy *pol, int dir)
{
	struct net *net = xp_net(pol);

	list_add(&pol->walk.all, &net->xfrm.policy_all);
	net->xfrm.policy_count[dir]++;
	xfrm_pol_hold(pol);
}
2209 
/* Remove @pol from the lookup structures.
 *
 * Returns @pol when it was linked — the caller must then drop the
 * reference (typically via xfrm_policy_kill()) — or NULL when it was
 * already unlinked.
 */
static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
						int dir)
{
	struct net *net = xp_net(pol);

	if (list_empty(&pol->walk.all))
		return NULL;

	/* Socket policies are not hashed. */
	if (!hlist_unhashed(&pol->bydst)) {
		hlist_del_rcu(&pol->bydst);
		hlist_del_init(&pol->bydst_inexact_list);
		hlist_del(&pol->byidx);
	}

	list_del_init(&pol->walk.all);
	net->xfrm.policy_count[dir]--;

	return pol;
}
2230 
/* Socket policies are counted past the XFRM_POLICY_MAX direction
 * slots.
 */
static void xfrm_sk_policy_link(struct xfrm_policy *pol, int dir)
{
	__xfrm_policy_link(pol, XFRM_POLICY_MAX + dir);
}
2235 
/* Counterpart of xfrm_sk_policy_link(); the unlink result is
 * intentionally ignored here (see the comment in
 * xfrm_sk_policy_insert()).
 */
static void xfrm_sk_policy_unlink(struct xfrm_policy *pol, int dir)
{
	__xfrm_policy_unlink(pol, XFRM_POLICY_MAX + dir);
}
2240 
2241 int xfrm_policy_delete(struct xfrm_policy *pol, int dir)
2242 {
2243     struct net *net = xp_net(pol);
2244 
2245     spin_lock_bh(&net->xfrm.xfrm_policy_lock);
2246     pol = __xfrm_policy_unlink(pol, dir);
2247     spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
2248     if (pol) {
2249         xfrm_policy_kill(pol);
2250         return 0;
2251     }
2252     return -ENOENT;
2253 }
2254 EXPORT_SYMBOL(xfrm_policy_delete);
2255 
/* Install (or replace) socket policy @pol in slot @dir of @sk.
 * Any previous policy is requeued onto the new one, unlinked and
 * killed.  With sub-policies enabled, only MAIN-type policies may be
 * attached to a socket.  Returns 0 or -EINVAL.
 */
int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
{
	struct net *net = sock_net(sk);
	struct xfrm_policy *old_pol;

#ifdef CONFIG_XFRM_SUB_POLICY
	if (pol && pol->type != XFRM_POLICY_TYPE_MAIN)
		return -EINVAL;
#endif

	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
	old_pol = rcu_dereference_protected(sk->sk_policy[dir],
				lockdep_is_held(&net->xfrm.xfrm_policy_lock));
	if (pol) {
		pol->curlft.add_time = ktime_get_real_seconds();
		pol->index = xfrm_gen_index(net, XFRM_POLICY_MAX+dir, 0);
		xfrm_sk_policy_link(pol, dir);
	}
	rcu_assign_pointer(sk->sk_policy[dir], pol);
	if (old_pol) {
		if (pol)
			xfrm_policy_requeue(old_pol, pol);

		/* Unlinking succeeds always. This is the only function
		 * allowed to delete or replace socket policy.
		 */
		xfrm_sk_policy_unlink(old_pol, dir);
	}
	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

	/* kill outside the lock */
	if (old_pol) {
		xfrm_policy_kill(old_pol);
	}
	return 0;
}
2291 
/* Duplicate socket policy @old (for a child socket) and link the
 * clone into direction @dir.  Returns the clone, or NULL on
 * allocation or security-clone failure.
 */
static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir)
{
	struct xfrm_policy *newp = xfrm_policy_alloc(xp_net(old), GFP_ATOMIC);
	struct net *net = xp_net(old);

	if (newp) {
		newp->selector = old->selector;
		if (security_xfrm_policy_clone(old->security,
					       &newp->security)) {
			kfree(newp);
			return NULL;  /* ENOMEM */
		}
		newp->lft = old->lft;
		newp->curlft = old->curlft;
		newp->mark = old->mark;
		newp->if_id = old->if_id;
		newp->action = old->action;
		newp->flags = old->flags;
		newp->xfrm_nr = old->xfrm_nr;
		newp->index = old->index;
		newp->type = old->type;
		newp->family = old->family;
		memcpy(newp->xfrm_vec, old->xfrm_vec,
		       newp->xfrm_nr*sizeof(struct xfrm_tmpl));
		spin_lock_bh(&net->xfrm.xfrm_policy_lock);
		xfrm_sk_policy_link(newp, dir);
		spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
		/* the link above took its own reference; drop the
		 * allocation reference
		 */
		xfrm_pol_put(newp);
	}
	return newp;
}
2323 
/* Clone both socket policies (slots 0 and 1) from @osk onto @sk.
 * Returns 0, or -ENOMEM when a clone failed; policies cloned before
 * the failure remain assigned.
 */
int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk)
{
	const struct xfrm_policy *p;
	struct xfrm_policy *np;
	int i, ret = 0;

	rcu_read_lock();
	for (i = 0; i < 2; i++) {
		p = rcu_dereference(osk->sk_policy[i]);
		if (p) {
			np = clone_policy(p, i);
			if (unlikely(!np)) {
				ret = -ENOMEM;
				break;
			}
			rcu_assign_pointer(sk->sk_policy[i], np);
		}
	}
	rcu_read_unlock();
	return ret;
}
2345 
/* Resolve a local source address toward @remote via the per-family
 * afinfo.  NOTE(review): the bare rcu_read_unlock() pairs with a read
 * lock taken inside xfrm_policy_get_afinfo() — confirm against that
 * helper.
 */
static int
xfrm_get_saddr(struct net *net, int oif, xfrm_address_t *local,
	       xfrm_address_t *remote, unsigned short family, u32 mark)
{
	int err;
	const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);

	if (unlikely(afinfo == NULL))
		return -EINVAL;
	err = afinfo->get_saddr(net, oif, local, remote, mark);
	rcu_read_unlock();
	return err;
}
2359 
/* Resolve list of templates for the flow, given policy. */

/* Find a VALID xfrm_state for each template of @policy and store them
 * in @xfrm.  Returns the number of states found, or a negative errno
 * (missing states for non-optional templates map to -EAGAIN so the
 * caller can wait for larval state resolution).  On failure all state
 * references taken so far are dropped.
 */
static int
xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl,
		      struct xfrm_state **xfrm, unsigned short family)
{
	struct net *net = xp_net(policy);
	int nx;
	int i, error;
	xfrm_address_t *daddr = xfrm_flowi_daddr(fl, family);
	xfrm_address_t *saddr = xfrm_flowi_saddr(fl, family);
	xfrm_address_t tmp;

	for (nx = 0, i = 0; i < policy->xfrm_nr; i++) {
		struct xfrm_state *x;
		xfrm_address_t *remote = daddr;
		xfrm_address_t *local  = saddr;
		struct xfrm_tmpl *tmpl = &policy->xfrm_vec[i];

		if (tmpl->mode == XFRM_MODE_TUNNEL ||
		    tmpl->mode == XFRM_MODE_BEET) {
			remote = &tmpl->id.daddr;
			local = &tmpl->saddr;
			/* wildcard source address: ask routing for one */
			if (xfrm_addr_any(local, tmpl->encap_family)) {
				error = xfrm_get_saddr(net, fl->flowi_oif,
						       &tmp, remote,
						       tmpl->encap_family, 0);
				if (error)
					goto fail;
				local = &tmp;
			}
		}

		x = xfrm_state_find(remote, local, fl, tmpl, policy, &error,
				    family, policy->if_id);

		if (x && x->km.state == XFRM_STATE_VALID) {
			xfrm[nx++] = x;
			/* this template's endpoints become the inner
			 * addresses for the next template
			 */
			daddr = remote;
			saddr = local;
			continue;
		}
		if (x) {
			error = (x->km.state == XFRM_STATE_ERROR ?
				 -EINVAL : -EAGAIN);
			xfrm_state_put(x);
		} else if (error == -ESRCH) {
			error = -EAGAIN;
		}

		if (!tmpl->optional)
			goto fail;
	}
	return nx;

fail:
	/* unwind the references already taken */
	for (nx--; nx >= 0; nx--)
		xfrm_state_put(xfrm[nx]);
	return error;
}
2420 
/* Resolve templates for all @npols policies into @xfrm.  With more
 * than one policy, states are gathered in a temporary array and
 * sorted for outbound processing.  Returns the number of states or a
 * negative errno; on error all acquired references are dropped.
 */
static int
xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl,
		  struct xfrm_state **xfrm, unsigned short family)
{
	struct xfrm_state *tp[XFRM_MAX_DEPTH];
	struct xfrm_state **tpp = (npols > 1) ? tp : xfrm;
	int cnx = 0;
	int error;
	int ret;
	int i;

	for (i = 0; i < npols; i++) {
		/* never overrun the fixed-size state array */
		if (cnx + pols[i]->xfrm_nr >= XFRM_MAX_DEPTH) {
			error = -ENOBUFS;
			goto fail;
		}

		ret = xfrm_tmpl_resolve_one(pols[i], fl, &tpp[cnx], family);
		if (ret < 0) {
			error = ret;
			goto fail;
		} else
			cnx += ret;
	}

	/* found states are sorted for outbound processing */
	if (npols > 1)
		xfrm_state_sort(xfrm, tpp, cnx, family);

	return cnx;

 fail:
	for (cnx--; cnx >= 0; cnx--)
		xfrm_state_put(tpp[cnx]);
	return error;

}
2458 
2459 static int xfrm_get_tos(const struct flowi *fl, int family)
2460 {
2461     if (family == AF_INET)
2462         return IPTOS_RT_MASK & fl->u.ip4.flowi4_tos;
2463 
2464     return 0;
2465 }
2466 
/* Allocate an xfrm_dst from the per-netns dst_ops of @family; the
 * xfrm_dst-private part after u.dst is zeroed.  Returns an ERR_PTR
 * on unsupported family or allocation failure.  NOTE(review): the
 * bare rcu_read_unlock() pairs with a lock taken inside
 * xfrm_policy_get_afinfo() — confirm against that helper.
 */
static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
{
	const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
	struct dst_ops *dst_ops;
	struct xfrm_dst *xdst;

	if (!afinfo)
		return ERR_PTR(-EINVAL);

	switch (family) {
	case AF_INET:
		dst_ops = &net->xfrm.xfrm4_dst_ops;
		break;
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		dst_ops = &net->xfrm.xfrm6_dst_ops;
		break;
#endif
	default:
		/* afinfo exists only for supported families */
		BUG();
	}
	xdst = dst_alloc(dst_ops, NULL, 1, DST_OBSOLETE_NONE, 0);

	if (likely(xdst)) {
		/* clear everything past the embedded dst_entry */
		memset_after(xdst, 0, u.dst);
	} else
		xdst = ERR_PTR(-ENOBUFS);

	rcu_read_unlock();

	return xdst;
}
2499 
2500 static void xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst,
2501                int nfheader_len)
2502 {
2503     if (dst->ops->family == AF_INET6) {
2504         struct rt6_info *rt = (struct rt6_info *)dst;
2505         path->path_cookie = rt6_get_cookie(rt);
2506         path->u.rt6.rt6i_nfheader_len = nfheader_len;
2507     }
2508 }
2509 
/* Let the per-family afinfo finish initializing @xdst for device
 * @dev.  NOTE(review): the bare rcu_read_unlock() pairs with a lock
 * taken inside xfrm_policy_get_afinfo() — confirm against that
 * helper.
 */
static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
				const struct flowi *fl)
{
	const struct xfrm_policy_afinfo *afinfo =
		xfrm_policy_get_afinfo(xdst->u.dst.ops->family);
	int err;

	if (!afinfo)
		return -EINVAL;

	err = afinfo->fill_dst(xdst, dev, fl);

	rcu_read_unlock();

	return err;
}
2526 
2527 
/* Allocate chain of dst_entry's, attach known xfrm's, calculate
 * all the metrics... Shortly, bundle a bundle.
 */

/* Build the dst chain for @policy over the @nx resolved states in
 * @xfrm, on top of the original route @dst.  Each state gets its own
 * xfrm_dst, linked parent->child; the original route terminates the
 * chain.  Returns the head of the chain or an ERR_PTR; on failure the
 * unconsumed state references and any partially built chain are
 * released.
 */
static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
					    struct xfrm_state **xfrm,
					    struct xfrm_dst **bundle,
					    int nx,
					    const struct flowi *fl,
					    struct dst_entry *dst)
{
	const struct xfrm_state_afinfo *afinfo;
	const struct xfrm_mode *inner_mode;
	struct net *net = xp_net(policy);
	unsigned long now = jiffies;
	struct net_device *dev;
	struct xfrm_dst *xdst_prev = NULL;
	struct xfrm_dst *xdst0 = NULL;
	int i = 0;
	int err;
	int header_len = 0;
	int nfheader_len = 0;
	int trailer_len = 0;
	int tos;
	int family = policy->selector.family;
	xfrm_address_t saddr, daddr;

	xfrm_flowi_addr_get(fl, &saddr, &daddr, family);

	tos = xfrm_get_tos(fl, family);

	dst_hold(dst);

	for (; i < nx; i++) {
		struct xfrm_dst *xdst = xfrm_alloc_dst(net, family);
		struct dst_entry *dst1 = &xdst->u.dst;

		err = PTR_ERR(xdst);
		if (IS_ERR(xdst)) {
			dst_release(dst);
			goto put_states;
		}

		bundle[i] = xdst;
		if (!xdst_prev)
			xdst0 = xdst;
		else
			/* Ref count is taken during xfrm_alloc_dst()
			 * No need to do dst_clone() on dst1
			 */
			xfrm_dst_set_child(xdst_prev, &xdst->u.dst);

		/* wildcard selector: derive the inner mode from the
		 * address family of this layer
		 */
		if (xfrm[i]->sel.family == AF_UNSPEC) {
			inner_mode = xfrm_ip2inner_mode(xfrm[i],
							xfrm_af2proto(family));
			if (!inner_mode) {
				err = -EAFNOSUPPORT;
				dst_release(dst);
				goto put_states;
			}
		} else
			inner_mode = &xfrm[i]->inner_mode;

		xdst->route = dst;
		dst_copy_metrics(dst1, dst);

		if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) {
			__u32 mark = 0;
			int oif;

			if (xfrm[i]->props.smark.v || xfrm[i]->props.smark.m)
				mark = xfrm_smark_get(fl->flowi_mark, xfrm[i]);

			/* non-transport mode: re-route toward the
			 * state's outer endpoint/family
			 */
			family = xfrm[i]->props.family;
			oif = fl->flowi_oif ? : fl->flowi_l3mdev;
			dst = xfrm_dst_lookup(xfrm[i], tos, oif,
					      &saddr, &daddr, family, mark);
			err = PTR_ERR(dst);
			if (IS_ERR(dst))
				goto put_states;
		} else
			dst_hold(dst);

		dst1->xfrm = xfrm[i];
		xdst->xfrm_genid = xfrm[i]->genid;

		dst1->obsolete = DST_OBSOLETE_FORCE_CHK;
		dst1->lastuse = now;

		dst1->input = dst_discard;

		rcu_read_lock();
		afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family);
		if (likely(afinfo))
			dst1->output = afinfo->output;
		else
			dst1->output = dst_discard_out;
		rcu_read_unlock();

		xdst_prev = xdst;

		/* accumulate per-state header/trailer space */
		header_len += xfrm[i]->props.header_len;
		if (xfrm[i]->type->flags & XFRM_TYPE_NON_FRAGMENT)
			nfheader_len += xfrm[i]->props.header_len;
		trailer_len += xfrm[i]->props.trailer_len;
	}

	/* terminate the chain with the original route */
	xfrm_dst_set_child(xdst_prev, dst);
	xdst0->path = dst;

	err = -ENODEV;
	dev = dst->dev;
	if (!dev)
		goto free_dst;

	xfrm_init_path(xdst0, dst, nfheader_len);
	xfrm_init_pmtu(bundle, nx);

	/* distribute the header/trailer budgets down the chain */
	for (xdst_prev = xdst0; xdst_prev != (struct xfrm_dst *)dst;
	     xdst_prev = (struct xfrm_dst *) xfrm_dst_child(&xdst_prev->u.dst)) {
		err = xfrm_fill_dst(xdst_prev, dev, fl);
		if (err)
			goto free_dst;

		xdst_prev->u.dst.header_len = header_len;
		xdst_prev->u.dst.trailer_len = trailer_len;
		header_len -= xdst_prev->u.dst.xfrm->props.header_len;
		trailer_len -= xdst_prev->u.dst.xfrm->props.trailer_len;
	}

	return &xdst0->u.dst;

put_states:
	/* states from index i on were never attached to an xfrm_dst */
	for (; i < nx; i++)
		xfrm_state_put(xfrm[i]);
free_dst:
	if (xdst0)
		dst_release_immediate(&xdst0->u.dst);

	return ERR_PTR(err);
}
2669 
/* Validate the primary looked-up policy in pols[0] and, when
 * sub-policies are enabled, also fetch the MAIN policy that
 * complements a SUB one into pols[1].
 *
 * On return *num_pols is the count of valid entries in pols[] and
 * *num_xfrms the summed template count — or -1 if any policy is not
 * XFRM_POLICY_ALLOW.  Returns 0, or a negative errno from an ERR_PTR
 * policy (with *num_pols reset to 0).
 */
static int xfrm_expand_policies(const struct flowi *fl, u16 family,
				struct xfrm_policy **pols,
				int *num_pols, int *num_xfrms)
{
	int i;

	if (*num_pols == 0 || !pols[0]) {
		*num_pols = 0;
		*num_xfrms = 0;
		return 0;
	}
	if (IS_ERR(pols[0])) {
		*num_pols = 0;
		return PTR_ERR(pols[0]);
	}

	*num_xfrms = pols[0]->xfrm_nr;

#ifdef CONFIG_XFRM_SUB_POLICY
	if (pols[0]->action == XFRM_POLICY_ALLOW &&
	    pols[0]->type != XFRM_POLICY_TYPE_MAIN) {
		pols[1] = xfrm_policy_lookup_bytype(xp_net(pols[0]),
						    XFRM_POLICY_TYPE_MAIN,
						    fl, family,
						    XFRM_POLICY_OUT,
						    pols[0]->if_id);
		if (pols[1]) {
			if (IS_ERR(pols[1])) {
				xfrm_pols_put(pols, *num_pols);
				*num_pols = 0;
				return PTR_ERR(pols[1]);
			}
			(*num_pols)++;
			(*num_xfrms) += pols[1]->xfrm_nr;
		}
	}
#endif
	/* a non-ALLOW policy anywhere disables template resolution */
	for (i = 0; i < *num_pols; i++) {
		if (pols[i]->action != XFRM_POLICY_ALLOW) {
			*num_xfrms = -1;
			break;
		}
	}

	return 0;

}
2717 
/* Resolve states for @pols and build an output bundle on top of
 * @dst_orig.  Returns NULL when the policies need no templates, an
 * ERR_PTR on failure (counted in MIB stats, except -EAGAIN), or the
 * new bundle with policy references recorded in it.
 */
static struct xfrm_dst *
xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
			       const struct flowi *fl, u16 family,
			       struct dst_entry *dst_orig)
{
	struct net *net = xp_net(pols[0]);
	struct xfrm_state *xfrm[XFRM_MAX_DEPTH];
	struct xfrm_dst *bundle[XFRM_MAX_DEPTH];
	struct xfrm_dst *xdst;
	struct dst_entry *dst;
	int err;

	/* Try to instantiate a bundle */
	err = xfrm_tmpl_resolve(pols, num_pols, fl, xfrm, family);
	if (err <= 0) {
		if (err == 0)
			return NULL;

		/* -EAGAIN (waiting for states) is not an error stat */
		if (err != -EAGAIN)
			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
		return ERR_PTR(err);
	}

	dst = xfrm_bundle_create(pols[0], xfrm, bundle, err, fl, dst_orig);
	if (IS_ERR(dst)) {
		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR);
		return ERR_CAST(dst);
	}

	/* remember which policies/states produced this bundle so its
	 * validity can be checked later
	 */
	xdst = (struct xfrm_dst *)dst;
	xdst->num_xfrms = err;
	xdst->num_pols = num_pols;
	memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols);
	xdst->policy_genid = atomic_read(&pols[0]->genid);

	return xdst;
}
2755 
/* Hold-queue timer callback: try to re-resolve the route for packets
 * parked on the policy's hold queue.  While the lookup still yields a
 * queueing dummy bundle (DST_XFRM_QUEUE), the timeout is doubled and
 * the timer re-armed, up to XFRM_QUEUE_TMO_MAX; once a real route
 * exists, all queued packets are re-routed and transmitted.  On
 * failure or timeout the queue is purged.
 */
static void xfrm_policy_queue_process(struct timer_list *t)
{
	struct sk_buff *skb;
	struct sock *sk;
	struct dst_entry *dst;
	struct xfrm_policy *pol = from_timer(pol, t, polq.hold_timer);
	struct net *net = xp_net(pol);
	struct xfrm_policy_queue *pq = &pol->polq;
	struct flowi fl;
	struct sk_buff_head list;
	__u32 skb_mark;

	spin_lock(&pq->hold_queue.lock);
	skb = skb_peek(&pq->hold_queue);
	if (!skb) {
		spin_unlock(&pq->hold_queue.lock);
		goto out;
	}
	dst = skb_dst(skb);
	sk = skb->sk;

	/* Fixup the mark to support VTI. */
	skb_mark = skb->mark;
	skb->mark = pol->mark.v;
	xfrm_decode_session(skb, &fl, dst->ops->family);
	skb->mark = skb_mark;
	spin_unlock(&pq->hold_queue.lock);

	dst_hold(xfrm_dst_path(dst));
	dst = xfrm_lookup(net, xfrm_dst_path(dst), &fl, sk, XFRM_LOOKUP_QUEUE);
	if (IS_ERR(dst))
		goto purge_queue;

	if (dst->flags & DST_XFRM_QUEUE) {
		/* still no usable states: back off exponentially */
		dst_release(dst);

		if (pq->timeout >= XFRM_QUEUE_TMO_MAX)
			goto purge_queue;

		pq->timeout = pq->timeout << 1;
		if (!mod_timer(&pq->hold_timer, jiffies + pq->timeout))
			xfrm_pol_hold(pol);
		goto out;
	}

	dst_release(dst);

	/* route resolved: drain the hold queue onto a private list and
	 * send everything out
	 */
	__skb_queue_head_init(&list);

	spin_lock(&pq->hold_queue.lock);
	pq->timeout = 0;
	skb_queue_splice_init(&pq->hold_queue, &list);
	spin_unlock(&pq->hold_queue.lock);

	while (!skb_queue_empty(&list)) {
		skb = __skb_dequeue(&list);

		/* Fixup the mark to support VTI. */
		skb_mark = skb->mark;
		skb->mark = pol->mark.v;
		xfrm_decode_session(skb, &fl, skb_dst(skb)->ops->family);
		skb->mark = skb_mark;

		dst_hold(xfrm_dst_path(skb_dst(skb)));
		dst = xfrm_lookup(net, xfrm_dst_path(skb_dst(skb)), &fl, skb->sk, 0);
		if (IS_ERR(dst)) {
			kfree_skb(skb);
			continue;
		}

		nf_reset_ct(skb);
		skb_dst_drop(skb);
		skb_dst_set(skb, dst);

		dst_output(net, skb->sk, skb);
	}

out:
	/* drop the reference taken when the timer was armed */
	xfrm_pol_put(pol);
	return;

purge_queue:
	pq->timeout = 0;
	skb_queue_purge(&pq->hold_queue);
	xfrm_pol_put(pol);
}
2842 
/* Output hook of a dummy (DST_XFRM_QUEUE) bundle: park @skb on the
 * policy's hold queue and (re)arm the hold timer.  The skb is dropped
 * when the queue exceeds XFRM_MAX_QUEUE_LEN or its fast clone is
 * still in use.
 */
static int xdst_queue_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	unsigned long sched_next;
	struct dst_entry *dst = skb_dst(skb);
	struct xfrm_dst *xdst = (struct xfrm_dst *) dst;
	struct xfrm_policy *pol = xdst->pols[0];
	struct xfrm_policy_queue *pq = &pol->polq;

	if (unlikely(skb_fclone_busy(sk, skb))) {
		kfree_skb(skb);
		return 0;
	}

	if (pq->hold_queue.qlen > XFRM_MAX_QUEUE_LEN) {
		kfree_skb(skb);
		return -EAGAIN;
	}

	skb_dst_force(skb);

	spin_lock_bh(&pq->hold_queue.lock);

	if (!pq->timeout)
		pq->timeout = XFRM_QUEUE_TMO_MIN;

	sched_next = jiffies + pq->timeout;

	/* keep the earlier of the pending and the new expiry; a
	 * successfully deleted pending timer gives back its policy ref
	 */
	if (del_timer(&pq->hold_timer)) {
		if (time_before(pq->hold_timer.expires, sched_next))
			sched_next = pq->hold_timer.expires;
		xfrm_pol_put(pol);
	}

	__skb_queue_tail(&pq->hold_queue, skb);
	if (!mod_timer(&pq->hold_timer, sched_next))
		xfrm_pol_hold(pol);

	spin_unlock_bh(&pq->hold_queue.lock);

	return 0;
}
2884 
/* Build a dummy bundle that queues packets (via xdst_queue_output)
 * until the required states become available.  When queueing is not
 * wanted — caller didn't ask for it, larval drop is enabled, or there
 * are no templates — the bare xdst is returned instead.  Returns an
 * ERR_PTR on allocation or fill failure.
 */
static struct xfrm_dst *xfrm_create_dummy_bundle(struct net *net,
						 struct xfrm_flo *xflo,
						 const struct flowi *fl,
						 int num_xfrms,
						 u16 family)
{
	int err;
	struct net_device *dev;
	struct dst_entry *dst;
	struct dst_entry *dst1;
	struct xfrm_dst *xdst;

	xdst = xfrm_alloc_dst(net, family);
	if (IS_ERR(xdst))
		return xdst;

	if (!(xflo->flags & XFRM_LOOKUP_QUEUE) ||
	    net->xfrm.sysctl_larval_drop ||
	    num_xfrms <= 0)
		return xdst;

	dst = xflo->dst_orig;
	dst1 = &xdst->u.dst;
	dst_hold(dst);
	xdst->route = dst;

	dst_copy_metrics(dst1, dst);

	/* DST_XFRM_QUEUE marks this as a queueing dummy for the
	 * re-lookup in xfrm_policy_queue_process()
	 */
	dst1->obsolete = DST_OBSOLETE_FORCE_CHK;
	dst1->flags |= DST_XFRM_QUEUE;
	dst1->lastuse = jiffies;

	dst1->input = dst_discard;
	dst1->output = xdst_queue_output;

	dst_hold(dst);
	xfrm_dst_set_child(xdst, dst);
	xdst->path = dst;

	xfrm_init_path((struct xfrm_dst *)dst1, dst, 0);

	err = -ENODEV;
	dev = dst->dev;
	if (!dev)
		goto free_dst;

	err = xfrm_fill_dst(xdst, dev, fl);
	if (err)
		goto free_dst;

out:
	return xdst;

free_dst:
	dst_release(dst1);
	xdst = ERR_PTR(err);
	goto out;
}
2943 
/* Look up the policies matching @fl and resolve them into a bundle.
 * Returns a real bundle on success, a dummy (queueing) bundle when the
 * needed states are not yet available, NULL when no policy matches or on
 * -EREMOTE (caller blackholes the flow), or ERR_PTR() on error.
 */
static struct xfrm_dst *xfrm_bundle_lookup(struct net *net,
                       const struct flowi *fl,
                       u16 family, u8 dir,
                       struct xfrm_flo *xflo, u32 if_id)
{
    struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
    int num_pols = 0, num_xfrms = 0, err;
    struct xfrm_dst *xdst;

    /* Resolve policies to use if we couldn't get them from
     * previous cache entry */
    num_pols = 1;
    pols[0] = xfrm_policy_lookup(net, fl, family, dir, if_id);
    err = xfrm_expand_policies(fl, family, pols,
                   &num_pols, &num_xfrms);
    if (err < 0)
        goto inc_error;
    if (num_pols == 0)
        return NULL;
    if (num_xfrms <= 0)
        goto make_dummy_bundle;

    xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family,
                          xflo->dst_orig);
    if (IS_ERR(xdst)) {
        err = PTR_ERR(xdst);
        if (err == -EREMOTE) {
            /* Caller will blackhole the flow; not an error. */
            xfrm_pols_put(pols, num_pols);
            return NULL;
        }

        if (err != -EAGAIN)
            goto error;
        goto make_dummy_bundle;
    } else if (xdst == NULL) {
        num_xfrms = 0;
        goto make_dummy_bundle;
    }

    return xdst;

make_dummy_bundle:
    /* We found policies, but there's no bundles to instantiate:
     * either because the policy blocks, has no transformations or
     * we could not build template (no xfrm_states).*/
    xdst = xfrm_create_dummy_bundle(net, xflo, fl, num_xfrms, family);
    if (IS_ERR(xdst)) {
        xfrm_pols_put(pols, num_pols);
        return ERR_CAST(xdst);
    }
    /* The dummy bundle takes over the policy references. */
    xdst->num_pols = num_pols;
    xdst->num_xfrms = num_xfrms;
    memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols);

    return xdst;

inc_error:
    XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
error:
    xfrm_pols_put(pols, num_pols);
    return ERR_PTR(err);
}
3006 
/* Replace @dst_orig with the address family's blackhole route, used when
 * output must be held back (-EREMOTE) until SAs are negotiated.
 */
static struct dst_entry *make_blackhole(struct net *net, u16 family,
                    struct dst_entry *dst_orig)
{
    const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
    struct dst_entry *ret;

    if (!afinfo) {
        dst_release(dst_orig);
        return ERR_PTR(-EINVAL);
    } else {
        ret = afinfo->blackhole_route(net, dst_orig);
    }
    /* NOTE(review): no rcu_read_lock() in sight — xfrm_policy_get_afinfo()
     * apparently returns with the RCU read lock held on success and drops
     * it itself when it returns NULL, hence the unlock only here. Confirm
     * against the helper's definition. */
    rcu_read_unlock();

    return ret;
}
3023 
3024 /* Finds/creates a bundle for given flow and if_id
3025  *
3026  * At the moment we eat a raw IP route. Mostly to speed up lookups
3027  * on interfaces with disabled IPsec.
3028  *
3029  * xfrm_lookup uses an if_id of 0 by default, and is provided for
3030  * compatibility
3031  */
struct dst_entry *xfrm_lookup_with_ifid(struct net *net,
                    struct dst_entry *dst_orig,
                    const struct flowi *fl,
                    const struct sock *sk,
                    int flags, u32 if_id)
{
    struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
    struct xfrm_dst *xdst;
    struct dst_entry *dst, *route;
    u16 family = dst_orig->ops->family;
    u8 dir = XFRM_POLICY_OUT;
    /* drop_pols: how many policy refs the success path must drop */
    int i, err, num_pols, num_xfrms = 0, drop_pols = 0;

    dst = NULL;
    xdst = NULL;
    route = NULL;

    /* Socket policies take precedence over the global database. */
    sk = sk_const_to_full_sk(sk);
    if (sk && sk->sk_policy[XFRM_POLICY_OUT]) {
        num_pols = 1;
        pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl, family,
                        if_id);
        err = xfrm_expand_policies(fl, family, pols,
                       &num_pols, &num_xfrms);
        if (err < 0)
            goto dropdst;

        if (num_pols) {
            if (num_xfrms <= 0) {
                /* Nothing to transform (or flow blocked
                 * when num_xfrms < 0). */
                drop_pols = num_pols;
                goto no_transform;
            }

            xdst = xfrm_resolve_and_create_bundle(
                    pols, num_pols, fl,
                    family, dst_orig);

            if (IS_ERR(xdst)) {
                xfrm_pols_put(pols, num_pols);
                err = PTR_ERR(xdst);
                if (err == -EREMOTE)
                    goto nopol;

                goto dropdst;
            } else if (xdst == NULL) {
                num_xfrms = 0;
                drop_pols = num_pols;
                goto no_transform;
            }

            route = xdst->route;
        }
    }

    if (xdst == NULL) {
        struct xfrm_flo xflo;

        xflo.dst_orig = dst_orig;
        xflo.flags = flags;

        /* To accelerate a bit...  */
        if (!if_id && ((dst_orig->flags & DST_NOXFRM) ||
                   !net->xfrm.policy_count[XFRM_POLICY_OUT]))
            goto nopol;

        xdst = xfrm_bundle_lookup(net, fl, family, dir, &xflo, if_id);
        if (xdst == NULL)
            goto nopol;
        if (IS_ERR(xdst)) {
            err = PTR_ERR(xdst);
            goto dropdst;
        }

        num_pols = xdst->num_pols;
        num_xfrms = xdst->num_xfrms;
        memcpy(pols, xdst->pols, sizeof(struct xfrm_policy *) * num_pols);
        route = xdst->route;
    }

    dst = &xdst->u.dst;
    if (route == NULL && num_xfrms > 0) {
        /* The only case when xfrm_bundle_lookup() returns a
         * bundle with null route, is when the template could
         * not be resolved. It means policies are there, but
         * bundle could not be created, since we don't yet
         * have the xfrm_state's. We need to wait for KM to
         * negotiate new SA's or bail out with error.*/
        if (net->xfrm.sysctl_larval_drop) {
            XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
            err = -EREMOTE;
            goto error;
        }

        err = -EAGAIN;

        XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
        goto error;
    }

no_transform:
    if (num_pols == 0)
        goto nopol;

    if ((flags & XFRM_LOOKUP_ICMP) &&
        !(pols[0]->flags & XFRM_POLICY_ICMP)) {
        err = -ENOENT;
        goto error;
    }

    for (i = 0; i < num_pols; i++)
        pols[i]->curlft.use_time = ktime_get_real_seconds();

    if (num_xfrms < 0) {
        /* Prohibit the flow */
        XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLBLOCK);
        err = -EPERM;
        goto error;
    } else if (num_xfrms > 0) {
        /* Flow transformed */
        dst_release(dst_orig);
    } else {
        /* Flow passes untransformed */
        dst_release(dst);
        dst = dst_orig;
    }
ok:
    xfrm_pols_put(pols, drop_pols);
    if (dst && dst->xfrm &&
        dst->xfrm->props.mode == XFRM_MODE_TUNNEL)
        dst->flags |= DST_XFRM_TUNNEL;
    return dst;

nopol:
    /* No matching policy: the netns default may still block the flow
     * for non-loopback destinations. */
    if ((!dst_orig->dev || !(dst_orig->dev->flags & IFF_LOOPBACK)) &&
        net->xfrm.policy_default[dir] == XFRM_USERPOLICY_BLOCK) {
        err = -EPERM;
        goto error;
    }
    if (!(flags & XFRM_LOOKUP_ICMP)) {
        dst = dst_orig;
        goto ok;
    }
    err = -ENOENT;
error:
    dst_release(dst);
dropdst:
    if (!(flags & XFRM_LOOKUP_KEEP_DST_REF))
        dst_release(dst_orig);
    xfrm_pols_put(pols, drop_pols);
    return ERR_PTR(err);
}
EXPORT_SYMBOL(xfrm_lookup_with_ifid);
3184 
3185 /* Main function: finds/creates a bundle for given flow.
3186  *
3187  * At the moment we eat a raw IP route. Mostly to speed up lookups
3188  * on interfaces with disabled IPsec.
3189  */
struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
                  const struct flowi *fl, const struct sock *sk,
                  int flags)
{
    /* The plain lookup is the if_id == 0 case of the interface-aware
     * variant above. */
    return xfrm_lookup_with_ifid(net, dst_orig, fl, sk, flags, 0);
}
EXPORT_SYMBOL(xfrm_lookup);
3197 
3198 /* Callers of xfrm_lookup_route() must ensure a call to dst_output().
3199  * Otherwise we may send out blackholed packets.
3200  */
3201 struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig,
3202                     const struct flowi *fl,
3203                     const struct sock *sk, int flags)
3204 {
3205     struct dst_entry *dst = xfrm_lookup(net, dst_orig, fl, sk,
3206                         flags | XFRM_LOOKUP_QUEUE |
3207                         XFRM_LOOKUP_KEEP_DST_REF);
3208 
3209     if (PTR_ERR(dst) == -EREMOTE)
3210         return make_blackhole(net, dst_orig->ops->family, dst_orig);
3211 
3212     if (IS_ERR(dst))
3213         dst_release(dst_orig);
3214 
3215     return dst;
3216 }
3217 EXPORT_SYMBOL(xfrm_lookup_route);
3218 
3219 static inline int
3220 xfrm_secpath_reject(int idx, struct sk_buff *skb, const struct flowi *fl)
3221 {
3222     struct sec_path *sp = skb_sec_path(skb);
3223     struct xfrm_state *x;
3224 
3225     if (!sp || idx < 0 || idx >= sp->len)
3226         return 0;
3227     x = sp->xvec[idx];
3228     if (!x->type->reject)
3229         return 0;
3230     return x->type->reject(x, skb, fl);
3231 }
3232 
3233 /* When skb is transformed back to its "native" form, we have to
3234  * check policy restrictions. At the moment we make this in maximally
3235  * stupid way. Shame on me. :-) Of course, connected sockets must
3236  * have policy cached at them.
3237  */
3238 
/* Return true when state @x satisfies template @tmpl.  Kernel-created
 * states only satisfy optional templates whose addresses agree; for the
 * rest, proto, spi and reqid must match (zero in the template acts as a
 * wildcard), the modes must be equal, the auth-algorithm mask is applied
 * per the template's flags, and for non-transport modes the addresses
 * must match too.
 */
static inline int
xfrm_state_ok(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x,
          unsigned short family)
{
    if (xfrm_state_kern(x))
        return tmpl->optional && !xfrm_state_addr_cmp(tmpl, x, tmpl->encap_family);
    return  x->id.proto == tmpl->id.proto &&
        (x->id.spi == tmpl->id.spi || !tmpl->id.spi) &&
        (x->props.reqid == tmpl->reqid || !tmpl->reqid) &&
        x->props.mode == tmpl->mode &&
        (tmpl->allalgs || (tmpl->aalgos & (1<<x->props.aalgo)) ||
         !(xfrm_id_proto_match(tmpl->id.proto, IPSEC_PROTO_ANY))) &&
        !(x->props.mode != XFRM_MODE_TRANSPORT &&
          xfrm_state_addr_cmp(tmpl, x, family));
}
3254 
/*
 * 0 or more than 0 is returned when validation succeeded (either bypass
 * because of optional transport mode, or the next index of the matched
 * secpath state with the template).
 * -1 is returned when no matching template is found.
 * Otherwise "-2 - errored_index" is returned.
 */
static inline int
xfrm_policy_ok(const struct xfrm_tmpl *tmpl, const struct sec_path *sp, int start,
           unsigned short family)
{
    int idx = start;

    /* An optional transport-mode template matches vacuously. */
    if (tmpl->optional) {
        if (tmpl->mode == XFRM_MODE_TRANSPORT)
            return start;
    } else
        start = -1;     /* mandatory: no match until one is found */
    for (; idx < sp->len; idx++) {
        if (xfrm_state_ok(tmpl, sp->xvec[idx], family))
            return ++idx;   /* next index after the match */
        if (sp->xvec[idx]->props.mode != XFRM_MODE_TRANSPORT) {
            /* Non-transport state without a match: encode the
             * errored index as "-2 - idx" (see comment above). */
            if (start == -1)
                start = -2-idx;
            break;
        }
    }
    return start;
}
3284 
/* Extract a flowi4 from an IPv4 packet for policy lookup.  When @reverse
 * is set, addresses and ports are swapped to describe the return
 * direction.  Transport fields are only read from first fragments.
 */
static void
decode_session4(struct sk_buff *skb, struct flowi *fl, bool reverse)
{
    const struct iphdr *iph = ip_hdr(skb);
    int ihl = iph->ihl;
    u8 *xprth = skb_network_header(skb) + ihl * 4;
    struct flowi4 *fl4 = &fl->u.ip4;
    int oif = 0;

    if (skb_dst(skb) && skb_dst(skb)->dev)
        oif = skb_dst(skb)->dev->ifindex;

    memset(fl4, 0, sizeof(struct flowi4));
    fl4->flowi4_mark = skb->mark;
    fl4->flowi4_oif = reverse ? skb->skb_iif : oif;

    fl4->flowi4_proto = iph->protocol;
    fl4->daddr = reverse ? iph->saddr : iph->daddr;
    fl4->saddr = reverse ? iph->daddr : iph->saddr;
    fl4->flowi4_tos = iph->tos & ~INET_ECN_MASK;

    /* Non-first fragments carry no transport header at all. */
    if (!ip_is_fragment(iph)) {
        switch (iph->protocol) {
        case IPPROTO_UDP:
        case IPPROTO_UDPLITE:
        case IPPROTO_TCP:
        case IPPROTO_SCTP:
        case IPPROTO_DCCP:
            if (xprth + 4 < skb->data ||
                pskb_may_pull(skb, xprth + 4 - skb->data)) {
                __be16 *ports;

                /* Re-read: pskb_may_pull() may have
                 * reallocated the header. */
                xprth = skb_network_header(skb) + ihl * 4;
                ports = (__be16 *)xprth;

                fl4->fl4_sport = ports[!!reverse];
                fl4->fl4_dport = ports[!reverse];
            }
            break;
        case IPPROTO_ICMP:
            if (xprth + 2 < skb->data ||
                pskb_may_pull(skb, xprth + 2 - skb->data)) {
                u8 *icmp;

                xprth = skb_network_header(skb) + ihl * 4;
                icmp = xprth;

                fl4->fl4_icmp_type = icmp[0];
                fl4->fl4_icmp_code = icmp[1];
            }
            break;
        case IPPROTO_GRE:
            if (xprth + 12 < skb->data ||
                pskb_may_pull(skb, xprth + 12 - skb->data)) {
                __be16 *greflags;
                __be32 *gre_hdr;

                xprth = skb_network_header(skb) + ihl * 4;
                greflags = (__be16 *)xprth;
                gre_hdr = (__be32 *)xprth;

                if (greflags[0] & GRE_KEY) {
                    /* The key word follows the optional
                     * checksum word. */
                    if (greflags[0] & GRE_CSUM)
                        gre_hdr++;
                    fl4->fl4_gre_key = gre_hdr[1];
                }
            }
            break;
        default:
            break;
        }
    }
}
3358 
#if IS_ENABLED(CONFIG_IPV6)
/* Extract a flowi6 from an IPv6 packet for policy lookup, walking the
 * extension-header chain to reach the transport protocol.  When @reverse
 * is set, addresses and ports are swapped for the return direction.
 */
static void
decode_session6(struct sk_buff *skb, struct flowi *fl, bool reverse)
{
    struct flowi6 *fl6 = &fl->u.ip6;
    int onlyproto = 0;  /* set after a fragment header: skip ports */
    const struct ipv6hdr *hdr = ipv6_hdr(skb);
    u32 offset = sizeof(*hdr);
    struct ipv6_opt_hdr *exthdr;
    const unsigned char *nh = skb_network_header(skb);
    u16 nhoff = IP6CB(skb)->nhoff;
    int oif = 0;
    u8 nexthdr;

    if (!nhoff)
        nhoff = offsetof(struct ipv6hdr, nexthdr);

    nexthdr = nh[nhoff];

    if (skb_dst(skb) && skb_dst(skb)->dev)
        oif = skb_dst(skb)->dev->ifindex;

    memset(fl6, 0, sizeof(struct flowi6));
    fl6->flowi6_mark = skb->mark;
    fl6->flowi6_oif = reverse ? skb->skb_iif : oif;

    fl6->daddr = reverse ? hdr->saddr : hdr->daddr;
    fl6->saddr = reverse ? hdr->daddr : hdr->saddr;

    while (nh + offset + sizeof(*exthdr) < skb->data ||
           pskb_may_pull(skb, nh + offset + sizeof(*exthdr) - skb->data)) {
        /* Re-read: pskb_may_pull() may have reallocated the header. */
        nh = skb_network_header(skb);
        exthdr = (struct ipv6_opt_hdr *)(nh + offset);

        switch (nexthdr) {
        case NEXTHDR_FRAGMENT:
            onlyproto = 1;
            fallthrough;
        case NEXTHDR_ROUTING:
        case NEXTHDR_HOP:
        case NEXTHDR_DEST:
            /* Step over the extension header and keep walking. */
            offset += ipv6_optlen(exthdr);
            nexthdr = exthdr->nexthdr;
            break;
        case IPPROTO_UDP:
        case IPPROTO_UDPLITE:
        case IPPROTO_TCP:
        case IPPROTO_SCTP:
        case IPPROTO_DCCP:
            if (!onlyproto && (nh + offset + 4 < skb->data ||
                 pskb_may_pull(skb, nh + offset + 4 - skb->data))) {
                __be16 *ports;

                nh = skb_network_header(skb);
                ports = (__be16 *)(nh + offset);
                fl6->fl6_sport = ports[!!reverse];
                fl6->fl6_dport = ports[!reverse];
            }
            fl6->flowi6_proto = nexthdr;
            return;
        case IPPROTO_ICMPV6:
            if (!onlyproto && (nh + offset + 2 < skb->data ||
                pskb_may_pull(skb, nh + offset + 2 - skb->data))) {
                u8 *icmp;

                nh = skb_network_header(skb);
                icmp = (u8 *)(nh + offset);
                fl6->fl6_icmp_type = icmp[0];
                fl6->fl6_icmp_code = icmp[1];
            }
            fl6->flowi6_proto = nexthdr;
            return;
        case IPPROTO_GRE:
            if (!onlyproto &&
                (nh + offset + 12 < skb->data ||
                 pskb_may_pull(skb, nh + offset + 12 - skb->data))) {
                struct gre_base_hdr *gre_hdr;
                __be32 *gre_key;

                nh = skb_network_header(skb);
                gre_hdr = (struct gre_base_hdr *)(nh + offset);
                gre_key = (__be32 *)(gre_hdr + 1);

                if (gre_hdr->flags & GRE_KEY) {
                    /* The key word follows the optional
                     * checksum word. */
                    if (gre_hdr->flags & GRE_CSUM)
                        gre_key++;
                    fl6->fl6_gre_key = *gre_key;
                }
            }
            fl6->flowi6_proto = nexthdr;
            return;

#if IS_ENABLED(CONFIG_IPV6_MIP6)
        case IPPROTO_MH:
            offset += ipv6_optlen(exthdr);
            if (!onlyproto && (nh + offset + 3 < skb->data ||
                pskb_may_pull(skb, nh + offset + 3 - skb->data))) {
                struct ip6_mh *mh;

                nh = skb_network_header(skb);
                mh = (struct ip6_mh *)(nh + offset);
                fl6->fl6_mh_type = mh->ip6mh_type;
            }
            fl6->flowi6_proto = nexthdr;
            return;
#endif
        default:
            fl6->flowi6_proto = nexthdr;
            return;
        }
    }
}
#endif
3472 
3473 int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl,
3474               unsigned int family, int reverse)
3475 {
3476     switch (family) {
3477     case AF_INET:
3478         decode_session4(skb, fl, reverse);
3479         break;
3480 #if IS_ENABLED(CONFIG_IPV6)
3481     case AF_INET6:
3482         decode_session6(skb, fl, reverse);
3483         break;
3484 #endif
3485     default:
3486         return -EAFNOSUPPORT;
3487     }
3488 
3489     return security_xfrm_decode_session(skb, &fl->flowi_secid);
3490 }
3491 EXPORT_SYMBOL(__xfrm_decode_session);
3492 
3493 static inline int secpath_has_nontransport(const struct sec_path *sp, int k, int *idxp)
3494 {
3495     for (; k < sp->len; k++) {
3496         if (sp->xvec[k]->props.mode != XFRM_MODE_TRANSPORT) {
3497             *idxp = k;
3498             return 1;
3499         }
3500     }
3501 
3502     return 0;
3503 }
3504 
/* Input-path policy check: verify that the transformations recorded in
 * @skb's secpath satisfy the applicable IN policy templates.  The high
 * bits of @dir may carry the reverse-decode flag.  Returns 1 when the
 * packet is acceptable, 0 when it must be dropped.
 */
int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
            unsigned short family)
{
    struct net *net = dev_net(skb->dev);
    struct xfrm_policy *pol;
    struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
    int npols = 0;
    int xfrm_nr;
    int pi;
    int reverse;
    struct flowi fl;
    int xerr_idx = -1;
    const struct xfrm_if_cb *ifcb;
    struct sec_path *sp;
    struct xfrm_if *xi;
    u32 if_id = 0;

    rcu_read_lock();
    ifcb = xfrm_if_get_cb();

    /* An xfrm interface may redirect the lookup to its own netns
     * and interface id. */
    if (ifcb) {
        xi = ifcb->decode_session(skb, family);
        if (xi) {
            if_id = xi->p.if_id;
            net = xi->net;
        }
    }
    rcu_read_unlock();

    reverse = dir & ~XFRM_POLICY_MASK;
    dir &= XFRM_POLICY_MASK;

    if (__xfrm_decode_session(skb, &fl, family, reverse) < 0) {
        XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR);
        return 0;
    }

    nf_nat_decode_session(skb, &fl, family);

    /* First, check used SA against their selectors. */
    sp = skb_sec_path(skb);
    if (sp) {
        int i;

        for (i = sp->len - 1; i >= 0; i--) {
            struct xfrm_state *x = sp->xvec[i];
            if (!xfrm_selector_match(&x->sel, &fl, family)) {
                XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMISMATCH);
                return 0;
            }
        }
    }

    /* Socket policy first, then the global policy database. */
    pol = NULL;
    sk = sk_to_full_sk(sk);
    if (sk && sk->sk_policy[dir]) {
        pol = xfrm_sk_policy_lookup(sk, dir, &fl, family, if_id);
        if (IS_ERR(pol)) {
            XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
            return 0;
        }
    }

    if (!pol)
        pol = xfrm_policy_lookup(net, &fl, family, dir, if_id);

    if (IS_ERR(pol)) {
        XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
        return 0;
    }

    if (!pol) {
        /* No policy: accept only if the netns default allows it and
         * nothing beyond transport mode was applied to the packet. */
        if (net->xfrm.policy_default[dir] == XFRM_USERPOLICY_BLOCK) {
            XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS);
            return 0;
        }

        if (sp && secpath_has_nontransport(sp, 0, &xerr_idx)) {
            xfrm_secpath_reject(xerr_idx, skb, &fl);
            XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS);
            return 0;
        }
        return 1;
    }

    pol->curlft.use_time = ktime_get_real_seconds();

    pols[0] = pol;
    npols++;
#ifdef CONFIG_XFRM_SUB_POLICY
    /* A sub-type policy is complemented by the main-type policy. */
    if (pols[0]->type != XFRM_POLICY_TYPE_MAIN) {
        pols[1] = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN,
                            &fl, family,
                            XFRM_POLICY_IN, if_id);
        if (pols[1]) {
            if (IS_ERR(pols[1])) {
                XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
                xfrm_pol_put(pols[0]);
                return 0;
            }
            pols[1]->curlft.use_time = ktime_get_real_seconds();
            npols++;
        }
    }
#endif

    if (pol->action == XFRM_POLICY_ALLOW) {
        static struct sec_path dummy;
        struct xfrm_tmpl *tp[XFRM_MAX_DEPTH];
        struct xfrm_tmpl *stp[XFRM_MAX_DEPTH];
        struct xfrm_tmpl **tpp = tp;
        int ti = 0;
        int i, k;

        sp = skb_sec_path(skb);
        if (!sp)
            sp = &dummy;    /* empty secpath */

        /* Gather the templates of every matched policy. */
        for (pi = 0; pi < npols; pi++) {
            if (pols[pi] != pol &&
                pols[pi]->action != XFRM_POLICY_ALLOW) {
                XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK);
                goto reject;
            }
            if (ti + pols[pi]->xfrm_nr >= XFRM_MAX_DEPTH) {
                XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR);
                goto reject_error;
            }
            for (i = 0; i < pols[pi]->xfrm_nr; i++)
                tpp[ti++] = &pols[pi]->xfrm_vec[i];
        }
        xfrm_nr = ti;

        if (net->xfrm.policy_default[dir] == XFRM_USERPOLICY_BLOCK &&
            !xfrm_nr) {
            XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES);
            goto reject;
        }

        if (npols > 1) {
            xfrm_tmpl_sort(stp, tpp, xfrm_nr, family);
            tpp = stp;
        }

        /* For each tunnel xfrm, find the first matching tmpl.
         * For each tmpl before that, find corresponding xfrm.
         * Order is _important_. Later we will implement
         * some barriers, but at the moment barriers
         * are implied between each two transformations.
         */
        for (i = xfrm_nr-1, k = 0; i >= 0; i--) {
            k = xfrm_policy_ok(tpp[i], sp, k, family);
            if (k < 0) {
                if (k < -1)
                    /* "-2 - errored_index" returned */
                    xerr_idx = -(2+k);
                XFRM_INC_STATS(net, LINUX_MIB_XFRMINTMPLMISMATCH);
                goto reject;
            }
        }

        /* Anything left on the secpath past the templates must be
         * transport mode only. */
        if (secpath_has_nontransport(sp, k, &xerr_idx)) {
            XFRM_INC_STATS(net, LINUX_MIB_XFRMINTMPLMISMATCH);
            goto reject;
        }

        xfrm_pols_put(pols, npols);
        return 1;
    }
    XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK);

reject:
    xfrm_secpath_reject(xerr_idx, skb, &fl);
reject_error:
    xfrm_pols_put(pols, npols);
    return 0;
}
EXPORT_SYMBOL(__xfrm_policy_check);
3683 
/* Forwarding-path hook: rerun the route lookup through xfrm for a
 * forwarded packet.  Returns 1 when the packet may proceed (its dst is
 * replaced with the xfrm result), 0 on failure (dst cleared).
 */
int __xfrm_route_forward(struct sk_buff *skb, unsigned short family)
{
    struct net *net = dev_net(skb->dev);
    struct flowi fl;
    struct dst_entry *dst;
    int res = 1;

    if (xfrm_decode_session(skb, &fl, family) < 0) {
        XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR);
        return 0;
    }

    skb_dst_force(skb);
    if (!skb_dst(skb)) {
        /* skb_dst_force() clears the dst when it cannot take a
         * reference on it. */
        XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR);
        return 0;
    }

    dst = xfrm_lookup(net, skb_dst(skb), &fl, NULL, XFRM_LOOKUP_QUEUE);
    if (IS_ERR(dst)) {
        res = 0;
        dst = NULL;
    }
    skb_dst_set(skb, dst);
    return res;
}
EXPORT_SYMBOL(__xfrm_route_forward);
3711 
3712 /* Optimize later using cookies and generation ids. */
3713 
/* dst_ops->check hook (@cookie is unused here): return @dst while its
 * bundle is still usable, NULL once it has gone stale. */
static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie)
{
    /* Code (such as __xfrm4_bundle_create()) sets dst->obsolete
     * to DST_OBSOLETE_FORCE_CHK to force all XFRM destinations to
     * get validated by dst_ops->check on every use.  We do this
     * because when a normal route referenced by an XFRM dst is
     * obsoleted we do not go looking around for all parent
     * referencing XFRM dsts so that we can invalidate them.  It
     * is just too much work.  Instead we make the checks here on
     * every use.  For example:
     *
     *  XFRM dst A --> IPv4 dst X
     *
     * X is the "xdst->route" of A (X is also the "dst->path" of A
     * in this example).  If X is marked obsolete, "A" will not
     * notice.  That's what we are validating here via the
     * stale_bundle() check.
     *
     * When a dst is removed from the fib tree, DST_OBSOLETE_DEAD will
     * be marked on it.
     * This will force stale_bundle() to fail on any xdst bundle with
     * this dst linked in it.
     */
    if (dst->obsolete < 0 && !stale_bundle(dst))
        return dst;

    return NULL;
}
3742 
/* A bundle is stale exactly when xfrm_bundle_ok() fails for it. */
static int stale_bundle(struct dst_entry *dst)
{
    return !xfrm_bundle_ok((struct xfrm_dst *)dst);
}
3747 
/* @dev is going away: repoint every xfrm child dst in the chain that
 * still references it at the blackhole device, transferring the device
 * reference (hold the new one before dropping the old). */
void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev)
{
    while ((dst = xfrm_dst_child(dst)) && dst->xfrm && dst->dev == dev) {
        dst->dev = blackhole_netdev;
        dev_hold(dst->dev);
        dev_put(dev);
    }
}
EXPORT_SYMBOL(xfrm_dst_ifdown);
3757 
/* dst_ops->link_failure hook: deliberately a no-op. */
static void xfrm_link_failure(struct sk_buff *skb)
{
    /* Impossible. Such dst must be popped before reaches point of failure. */
}
3762 
3763 static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst)
3764 {
3765     if (dst) {
3766         if (dst->obsolete) {
3767             dst_release(dst);
3768             dst = NULL;
3769         }
3770     }
3771     return dst;
3772 }
3773 
3774 static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr)
3775 {
3776     while (nr--) {
3777         struct xfrm_dst *xdst = bundle[nr];
3778         u32 pmtu, route_mtu_cached;
3779         struct dst_entry *dst;
3780 
3781         dst = &xdst->u.dst;
3782         pmtu = dst_mtu(xfrm_dst_child(dst));
3783         xdst->child_mtu_cached = pmtu;
3784 
3785         pmtu = xfrm_state_mtu(dst->xfrm, pmtu);
3786 
3787         route_mtu_cached = dst_mtu(xdst->route);
3788         xdst->route_mtu_cached = route_mtu_cached;
3789 
3790         if (pmtu > route_mtu_cached)
3791             pmtu = route_mtu_cached;
3792 
3793         dst_metric_set(dst, RTAX_MTU, pmtu);
3794     }
3795 }
3796 
3797 /* Check that the bundle accepts the flow and its components are
3798  * still valid.
3799  */
3800 
static int xfrm_bundle_ok(struct xfrm_dst *first)
{
    struct xfrm_dst *bundle[XFRM_MAX_DEPTH];
    struct dst_entry *dst = &first->u.dst;
    struct xfrm_dst *xdst;
    int start_from, nr;
    u32 mtu;

    /* The bundle is unusable if the underlying path route has been
     * invalidated or its output device is no longer running.
     */
    if (!dst_check(xfrm_dst_path(dst), ((struct xfrm_dst *)dst)->path_cookie) ||
        (dst->dev && !netif_running(dst->dev)))
        return 0;

    /* Queueing dsts only park packets while SAs resolve; they carry no
     * per-level state of their own to validate.
     */
    if (dst->flags & DST_XFRM_QUEUE)
        return 1;

    /* Walk every xfrm_dst in the bundle, validating state and genids
     * and refreshing cached MTUs.  start_from records (one past) the
     * deepest level whose cached MTU changed, so the fixup loop below
     * knows where to begin recomputing RTAX_MTU.
     */
    start_from = nr = 0;
    do {
        /* Intentionally shadows the outer 'xdst'; the outer variable
         * is only used by the MTU fixup loop after this walk.
         */
        struct xfrm_dst *xdst = (struct xfrm_dst *)dst;

        if (dst->xfrm->km.state != XFRM_STATE_VALID)
            return 0;
        if (xdst->xfrm_genid != dst->xfrm->genid)
            return 0;
        if (xdst->num_pols > 0 &&
            xdst->policy_genid != atomic_read(&xdst->pols[0]->genid))
            return 0;

        bundle[nr++] = xdst;

        mtu = dst_mtu(xfrm_dst_child(dst));
        if (xdst->child_mtu_cached != mtu) {
            start_from = nr;
            xdst->child_mtu_cached = mtu;
        }

        if (!dst_check(xdst->route, xdst->route_cookie))
            return 0;
        mtu = dst_mtu(xdst->route);
        if (xdst->route_mtu_cached != mtu) {
            start_from = nr;
            xdst->route_mtu_cached = mtu;
        }

        dst = xfrm_dst_child(dst);
    } while (dst->xfrm);

    if (likely(!start_from))
        return 1;

    /* A cached MTU changed: re-derive RTAX_MTU bottom-up, from the
     * deepest changed level towards the head of the bundle, clamping
     * at each level to the cached route MTU.
     */
    xdst = bundle[start_from - 1];
    mtu = xdst->child_mtu_cached;
    while (start_from--) {
        dst = &xdst->u.dst;

        mtu = xfrm_state_mtu(dst->xfrm, mtu);
        if (mtu > xdst->route_mtu_cached)
            mtu = xdst->route_mtu_cached;
        dst_metric_set(dst, RTAX_MTU, mtu);
        if (!start_from)
            break;

        xdst = bundle[start_from - 1];
        xdst->child_mtu_cached = mtu;
    }

    return 1;
}
3868 
/* dst_ops->default_advmss hook: delegate to the advmss metric of the
 * route underlying this xfrm bundle.
 */
static unsigned int xfrm_default_advmss(const struct dst_entry *dst)
{
    const struct dst_entry *path = xfrm_dst_path(dst);

    return dst_metric_advmss(path);
}
3873 
3874 static unsigned int xfrm_mtu(const struct dst_entry *dst)
3875 {
3876     unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
3877 
3878     return mtu ? : dst_mtu(xfrm_dst_path(dst));
3879 }
3880 
3881 static const void *xfrm_get_dst_nexthop(const struct dst_entry *dst,
3882                     const void *daddr)
3883 {
3884     while (dst->xfrm) {
3885         const struct xfrm_state *xfrm = dst->xfrm;
3886 
3887         dst = xfrm_dst_child(dst);
3888 
3889         if (xfrm->props.mode == XFRM_MODE_TRANSPORT)
3890             continue;
3891         if (xfrm->type->flags & XFRM_TYPE_REMOTE_COADDR)
3892             daddr = xfrm->coaddr;
3893         else if (!(xfrm->type->flags & XFRM_TYPE_LOCAL_COADDR))
3894             daddr = &xfrm->id.daddr;
3895     }
3896     return daddr;
3897 }
3898 
3899 static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst,
3900                        struct sk_buff *skb,
3901                        const void *daddr)
3902 {
3903     const struct dst_entry *path = xfrm_dst_path(dst);
3904 
3905     if (!skb)
3906         daddr = xfrm_get_dst_nexthop(dst, daddr);
3907     return path->ops->neigh_lookup(path, skb, daddr);
3908 }
3909 
3910 static void xfrm_confirm_neigh(const struct dst_entry *dst, const void *daddr)
3911 {
3912     const struct dst_entry *path = xfrm_dst_path(dst);
3913 
3914     daddr = xfrm_get_dst_nexthop(dst, daddr);
3915     path->ops->confirm_neigh(path, daddr);
3916 }
3917 
/**
 * xfrm_policy_register_afinfo - register per-address-family policy ops
 * @afinfo: family callbacks and dst_ops to install
 * @family: address family index (e.g. AF_INET, AF_INET6)
 *
 * Fills any dst_ops hooks the family left NULL with the generic xfrm
 * implementations, then publishes @afinfo for RCU readers.
 *
 * Return: 0 on success, -EAFNOSUPPORT if @family is out of range, or
 * -EEXIST if the family is already registered.
 */
int xfrm_policy_register_afinfo(const struct xfrm_policy_afinfo *afinfo, int family)
{
    int err = 0;

    if (WARN_ON(family >= ARRAY_SIZE(xfrm_policy_afinfo)))
        return -EAFNOSUPPORT;

    spin_lock(&xfrm_policy_afinfo_lock);
    if (unlikely(xfrm_policy_afinfo[family] != NULL))
        err = -EEXIST;
    else {
        struct dst_ops *dst_ops = afinfo->dst_ops;
        /* Only install defaults for hooks the family did not supply. */
        if (likely(dst_ops->kmem_cachep == NULL))
            dst_ops->kmem_cachep = xfrm_dst_cache;
        if (likely(dst_ops->check == NULL))
            dst_ops->check = xfrm_dst_check;
        if (likely(dst_ops->default_advmss == NULL))
            dst_ops->default_advmss = xfrm_default_advmss;
        if (likely(dst_ops->mtu == NULL))
            dst_ops->mtu = xfrm_mtu;
        if (likely(dst_ops->negative_advice == NULL))
            dst_ops->negative_advice = xfrm_negative_advice;
        if (likely(dst_ops->link_failure == NULL))
            dst_ops->link_failure = xfrm_link_failure;
        if (likely(dst_ops->neigh_lookup == NULL))
            dst_ops->neigh_lookup = xfrm_neigh_lookup;
        if (likely(!dst_ops->confirm_neigh))
            dst_ops->confirm_neigh = xfrm_confirm_neigh;
        /* Publish only after the ops are fully set up. */
        rcu_assign_pointer(xfrm_policy_afinfo[family], afinfo);
    }
    spin_unlock(&xfrm_policy_afinfo_lock);

    return err;
}
EXPORT_SYMBOL(xfrm_policy_register_afinfo);
3953 
/**
 * xfrm_policy_unregister_afinfo - unpublish per-family policy ops
 * @afinfo: the afinfo previously registered
 *
 * Clears the RCU-published slot and waits for all readers to drain
 * before resetting the dst_ops hooks.
 *
 * NOTE(review): only a subset of the hooks installed by
 * xfrm_policy_register_afinfo() is cleared here (mtu, default_advmss,
 * neigh_lookup and confirm_neigh remain set).  Presumably harmless
 * since re-registration would install the same generic functions —
 * confirm whether this asymmetry is intentional.
 */
void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo)
{
    struct dst_ops *dst_ops = afinfo->dst_ops;
    int i;

    for (i = 0; i < ARRAY_SIZE(xfrm_policy_afinfo); i++) {
        if (xfrm_policy_afinfo[i] != afinfo)
            continue;
        RCU_INIT_POINTER(xfrm_policy_afinfo[i], NULL);
        break;
    }

    /* Wait for concurrent RCU readers of xfrm_policy_afinfo[]. */
    synchronize_rcu();

    dst_ops->kmem_cachep = NULL;
    dst_ops->check = NULL;
    dst_ops->negative_advice = NULL;
    dst_ops->link_failure = NULL;
}
EXPORT_SYMBOL(xfrm_policy_unregister_afinfo);
3974 
/* Publish the xfrm interface (xfrm_if) callback set for RCU readers.
 * The spinlock serializes concurrent registrations.
 */
void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb)
{
    spin_lock(&xfrm_if_cb_lock);
    rcu_assign_pointer(xfrm_if_cb, ifcb);
    spin_unlock(&xfrm_if_cb_lock);
}
EXPORT_SYMBOL(xfrm_if_register_cb);
3982 
/* Withdraw the xfrm interface callbacks and wait for in-flight RCU
 * readers to finish before the caller tears the module down.
 */
void xfrm_if_unregister_cb(void)
{
    RCU_INIT_POINTER(xfrm_if_cb, NULL);
    synchronize_rcu();
}
EXPORT_SYMBOL(xfrm_if_unregister_cb);
3989 
#ifdef CONFIG_XFRM_STATISTICS
/* Allocate the per-cpu xfrm MIB counters and register the /proc entry
 * for this netns.  Returns 0 or a negative errno; the counters are
 * freed again if /proc registration fails.
 */
static int __net_init xfrm_statistics_init(struct net *net)
{
    int rv;
    net->mib.xfrm_statistics = alloc_percpu(struct linux_xfrm_mib);
    if (!net->mib.xfrm_statistics)
        return -ENOMEM;
    rv = xfrm_proc_init(net);
    if (rv < 0)
        free_percpu(net->mib.xfrm_statistics);
    return rv;
}

/* Tear down in reverse order of xfrm_statistics_init(). */
static void xfrm_statistics_fini(struct net *net)
{
    xfrm_proc_fini(net);
    free_percpu(net->mib.xfrm_statistics);
}
#else
/* No-op stubs when CONFIG_XFRM_STATISTICS is disabled. */
static int __net_init xfrm_statistics_init(struct net *net)
{
    return 0;
}

static void xfrm_statistics_fini(struct net *net)
{
}
#endif
4018 
/* Per-netns initialisation of the policy database: the "by index" hash,
 * one "by destination" hash per direction, and the inexact policy
 * lists.  The dst kmem cache and the inexact rhashtable are global and
 * are set up only when the initial namespace comes up.
 * Returns 0 or -ENOMEM.
 */
static int __net_init xfrm_policy_init(struct net *net)
{
    unsigned int hmask, sz;
    int dir, err;

    if (net_eq(net, &init_net)) {
        /* Global, boot-time-only state: allocation failure here is
         * fatal by design (SLAB_PANIC / BUG_ON).
         */
        xfrm_dst_cache = kmem_cache_create("xfrm_dst_cache",
                       sizeof(struct xfrm_dst),
                       0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
                       NULL);
        err = rhashtable_init(&xfrm_policy_inexact_table,
                      &xfrm_pol_inexact_params);
        BUG_ON(err);
    }

    /* Start with 8 buckets; xfrm_hash_resize() grows the tables lazily
     * as policies are inserted.
     */
    hmask = 8 - 1;
    sz = (hmask+1) * sizeof(struct hlist_head);

    net->xfrm.policy_byidx = xfrm_hash_alloc(sz);
    if (!net->xfrm.policy_byidx)
        goto out_byidx;
    net->xfrm.policy_idx_hmask = hmask;

    for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
        struct xfrm_policy_hash *htab;

        net->xfrm.policy_count[dir] = 0;
        net->xfrm.policy_count[XFRM_POLICY_MAX + dir] = 0;
        INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]);

        htab = &net->xfrm.policy_bydst[dir];
        htab->table = xfrm_hash_alloc(sz);
        if (!htab->table)
            goto out_bydst;
        htab->hmask = hmask;
        /* Hash on full prefix lengths until xfrm_hash_rebuild()
         * computes better thresholds.
         */
        htab->dbits4 = 32;
        htab->sbits4 = 32;
        htab->dbits6 = 128;
        htab->sbits6 = 128;
    }
    net->xfrm.policy_hthresh.lbits4 = 32;
    net->xfrm.policy_hthresh.rbits4 = 32;
    net->xfrm.policy_hthresh.lbits6 = 128;
    net->xfrm.policy_hthresh.rbits6 = 128;

    seqlock_init(&net->xfrm.policy_hthresh.lock);

    INIT_LIST_HEAD(&net->xfrm.policy_all);
    INIT_LIST_HEAD(&net->xfrm.inexact_bins);
    INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize);
    INIT_WORK(&net->xfrm.policy_hthresh.work, xfrm_hash_rebuild);
    return 0;

out_bydst:
    /* Unwind only the per-direction tables allocated so far. */
    for (dir--; dir >= 0; dir--) {
        struct xfrm_policy_hash *htab;

        htab = &net->xfrm.policy_bydst[dir];
        xfrm_hash_free(htab->table, sz);
    }
    xfrm_hash_free(net->xfrm.policy_byidx, sz);
out_byidx:
    return -ENOMEM;
}
4083 
/* Per-netns teardown: flush all policies, then free the (expected to be
 * empty) hash tables and prune any remaining inexact bins.
 */
static void xfrm_policy_fini(struct net *net)
{
    struct xfrm_pol_inexact_bin *b, *t;
    unsigned int sz;
    int dir;

    /* Ensure no deferred resize is still touching the tables. */
    flush_work(&net->xfrm.policy_hash_work);
#ifdef CONFIG_XFRM_SUB_POLICY
    xfrm_policy_flush(net, XFRM_POLICY_TYPE_SUB, false);
#endif
    xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, false);

    WARN_ON(!list_empty(&net->xfrm.policy_all));

    for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
        struct xfrm_policy_hash *htab;

        WARN_ON(!hlist_empty(&net->xfrm.policy_inexact[dir]));

        htab = &net->xfrm.policy_bydst[dir];
        sz = (htab->hmask + 1) * sizeof(struct hlist_head);
        WARN_ON(!hlist_empty(htab->table));
        xfrm_hash_free(htab->table, sz);
    }

    sz = (net->xfrm.policy_idx_hmask + 1) * sizeof(struct hlist_head);
    WARN_ON(!hlist_empty(net->xfrm.policy_byidx));
    xfrm_hash_free(net->xfrm.policy_byidx, sz);

    /* Drop whatever inexact bins the flush left behind. */
    spin_lock_bh(&net->xfrm.xfrm_policy_lock);
    list_for_each_entry_safe(b, t, &net->xfrm.inexact_bins, inexact_bins)
        __xfrm_policy_inexact_prune_bin(b, true);
    spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
}
4118 
/* Bring up all xfrm state for a new network namespace.  Sub-systems are
 * initialised in order (statistics, SAD, SPD, sysctl) and rolled back
 * in reverse via the goto ladder on failure.
 * Returns 0 or a negative errno.
 */
static int __net_init xfrm_net_init(struct net *net)
{
    int rv;

    /* Initialize the per-net locks here */
    spin_lock_init(&net->xfrm.xfrm_state_lock);
    spin_lock_init(&net->xfrm.xfrm_policy_lock);
    seqcount_spinlock_init(&net->xfrm.xfrm_policy_hash_generation, &net->xfrm.xfrm_policy_lock);
    mutex_init(&net->xfrm.xfrm_cfg_mutex);
    /* Default policy is accept in all directions until configured. */
    net->xfrm.policy_default[XFRM_POLICY_IN] = XFRM_USERPOLICY_ACCEPT;
    net->xfrm.policy_default[XFRM_POLICY_FWD] = XFRM_USERPOLICY_ACCEPT;
    net->xfrm.policy_default[XFRM_POLICY_OUT] = XFRM_USERPOLICY_ACCEPT;

    rv = xfrm_statistics_init(net);
    if (rv < 0)
        goto out_statistics;
    rv = xfrm_state_init(net);
    if (rv < 0)
        goto out_state;
    rv = xfrm_policy_init(net);
    if (rv < 0)
        goto out_policy;
    rv = xfrm_sysctl_init(net);
    if (rv < 0)
        goto out_sysctl;

    return 0;

out_sysctl:
    xfrm_policy_fini(net);
out_policy:
    xfrm_state_fini(net);
out_state:
    xfrm_statistics_fini(net);
out_statistics:
    return rv;
}
4156 
/* Tear down per-netns xfrm state, strictly in reverse order of
 * xfrm_net_init().
 */
static void __net_exit xfrm_net_exit(struct net *net)
{
    xfrm_sysctl_fini(net);
    xfrm_policy_fini(net);
    xfrm_state_fini(net);
    xfrm_statistics_fini(net);
}
4164 
/* Per-network-namespace lifecycle hooks for the xfrm subsystem. */
static struct pernet_operations __net_initdata xfrm_net_ops = {
    .init = xfrm_net_init,
    .exit = xfrm_net_exit,
};
4169 
/* Boot-time initialisation of the xfrm subsystem.
 * NOTE(review): the return value of register_pernet_subsys() is not
 * checked; presumably acceptable at __init time — confirm.
 */
void __init xfrm_init(void)
{
    register_pernet_subsys(&xfrm_net_ops);
    xfrm_dev_init();
    xfrm_input_init();

#ifdef CONFIG_XFRM_ESPINTCP
    espintcp_init();
#endif
}
4180 
4181 #ifdef CONFIG_AUDITSYSCALL
4182 static void xfrm_audit_common_policyinfo(struct xfrm_policy *xp,
4183                      struct audit_buffer *audit_buf)
4184 {
4185     struct xfrm_sec_ctx *ctx = xp->security;
4186     struct xfrm_selector *sel = &xp->selector;
4187 
4188     if (ctx)
4189         audit_log_format(audit_buf, " sec_alg=%u sec_doi=%u sec_obj=%s",
4190                  ctx->ctx_alg, ctx->ctx_doi, ctx->ctx_str);
4191 
4192     switch (sel->family) {
4193     case AF_INET:
4194         audit_log_format(audit_buf, " src=%pI4", &sel->saddr.a4);
4195         if (sel->prefixlen_s != 32)
4196             audit_log_format(audit_buf, " src_prefixlen=%d",
4197                      sel->prefixlen_s);
4198         audit_log_format(audit_buf, " dst=%pI4", &sel->daddr.a4);
4199         if (sel->prefixlen_d != 32)
4200             audit_log_format(audit_buf, " dst_prefixlen=%d",
4201                      sel->prefixlen_d);
4202         break;
4203     case AF_INET6:
4204         audit_log_format(audit_buf, " src=%pI6", sel->saddr.a6);
4205         if (sel->prefixlen_s != 128)
4206             audit_log_format(audit_buf, " src_prefixlen=%d",
4207                      sel->prefixlen_s);
4208         audit_log_format(audit_buf, " dst=%pI6", sel->daddr.a6);
4209         if (sel->prefixlen_d != 128)
4210             audit_log_format(audit_buf, " dst_prefixlen=%d",
4211                      sel->prefixlen_d);
4212         break;
4213     }
4214 }
4215 
4216 void xfrm_audit_policy_add(struct xfrm_policy *xp, int result, bool task_valid)
4217 {
4218     struct audit_buffer *audit_buf;
4219 
4220     audit_buf = xfrm_audit_start("SPD-add");
4221     if (audit_buf == NULL)
4222         return;
4223     xfrm_audit_helper_usrinfo(task_valid, audit_buf);
4224     audit_log_format(audit_buf, " res=%u", result);
4225     xfrm_audit_common_policyinfo(xp, audit_buf);
4226     audit_log_end(audit_buf);
4227 }
4228 EXPORT_SYMBOL_GPL(xfrm_audit_policy_add);
4229 
4230 void xfrm_audit_policy_delete(struct xfrm_policy *xp, int result,
4231                   bool task_valid)
4232 {
4233     struct audit_buffer *audit_buf;
4234 
4235     audit_buf = xfrm_audit_start("SPD-delete");
4236     if (audit_buf == NULL)
4237         return;
4238     xfrm_audit_helper_usrinfo(task_valid, audit_buf);
4239     audit_log_format(audit_buf, " res=%u", result);
4240     xfrm_audit_common_policyinfo(xp, audit_buf);
4241     audit_log_end(audit_buf);
4242 }
4243 EXPORT_SYMBOL_GPL(xfrm_audit_policy_delete);
4244 #endif
4245 
4246 #ifdef CONFIG_XFRM_MIGRATE
4247 static bool xfrm_migrate_selector_match(const struct xfrm_selector *sel_cmp,
4248                     const struct xfrm_selector *sel_tgt)
4249 {
4250     if (sel_cmp->proto == IPSEC_ULPROTO_ANY) {
4251         if (sel_tgt->family == sel_cmp->family &&
4252             xfrm_addr_equal(&sel_tgt->daddr, &sel_cmp->daddr,
4253                     sel_cmp->family) &&
4254             xfrm_addr_equal(&sel_tgt->saddr, &sel_cmp->saddr,
4255                     sel_cmp->family) &&
4256             sel_tgt->prefixlen_d == sel_cmp->prefixlen_d &&
4257             sel_tgt->prefixlen_s == sel_cmp->prefixlen_s) {
4258             return true;
4259         }
4260     } else {
4261         if (memcmp(sel_tgt, sel_cmp, sizeof(*sel_tgt)) == 0) {
4262             return true;
4263         }
4264     }
4265     return false;
4266 }
4267 
/* Look up the policy a MIGRATE request refers to: first probe the exact
 * "by destination" hash, then scan the inexact list, stopping as soon
 * as remaining entries cannot beat the priority of a hashed match.
 * Runs under xfrm_policy_lock; returns the policy with a reference
 * held, or NULL if no policy of the right type/selector/if_id exists.
 */
static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector *sel,
                            u8 dir, u8 type, struct net *net, u32 if_id)
{
    struct xfrm_policy *pol, *ret = NULL;
    struct hlist_head *chain;
    u32 priority = ~0U;

    spin_lock_bh(&net->xfrm.xfrm_policy_lock);
    chain = policy_hash_direct(net, &sel->daddr, &sel->saddr, sel->family, dir);
    hlist_for_each_entry(pol, chain, bydst) {
        /* if_id == 0 acts as a wildcard. */
        if ((if_id == 0 || pol->if_id == if_id) &&
            xfrm_migrate_selector_match(sel, &pol->selector) &&
            pol->type == type) {
            ret = pol;
            priority = ret->priority;
            break;
        }
    }
    chain = &net->xfrm.policy_inexact[dir];
    hlist_for_each_entry(pol, chain, bydst_inexact_list) {
        /* List is priority-ordered: once we can no longer improve on
         * the hashed match, stop scanning.
         */
        if ((pol->priority >= priority) && ret)
            break;

        if ((if_id == 0 || pol->if_id == if_id) &&
            xfrm_migrate_selector_match(sel, &pol->selector) &&
            pol->type == type) {
            ret = pol;
            break;
        }
    }

    /* xfrm_pol_hold() tolerates NULL. */
    xfrm_pol_hold(ret);

    spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

    return ret;
}
4305 
4306 static int migrate_tmpl_match(const struct xfrm_migrate *m, const struct xfrm_tmpl *t)
4307 {
4308     int match = 0;
4309 
4310     if (t->mode == m->mode && t->id.proto == m->proto &&
4311         (m->reqid == 0 || t->reqid == m->reqid)) {
4312         switch (t->mode) {
4313         case XFRM_MODE_TUNNEL:
4314         case XFRM_MODE_BEET:
4315             if (xfrm_addr_equal(&t->id.daddr, &m->old_daddr,
4316                         m->old_family) &&
4317                 xfrm_addr_equal(&t->saddr, &m->old_saddr,
4318                         m->old_family)) {
4319                 match = 1;
4320             }
4321             break;
4322         case XFRM_MODE_TRANSPORT:
4323             /* in case of transport mode, template does not store
4324                any IP addresses, hence we just compare mode and
4325                protocol */
4326             match = 1;
4327             break;
4328         default:
4329             break;
4330         }
4331     }
4332     return match;
4333 }
4334 
/* Update the endpoint address(es) of every template in @pol matched by
 * one of the @num_migrate entries in @m, bumping the policy genid so
 * stale bundles are rebuilt.  Returns 0 on success, -ENOENT if the
 * policy died under us, or -ENODATA if nothing matched.
 */
static int xfrm_policy_migrate(struct xfrm_policy *pol,
                   struct xfrm_migrate *m, int num_migrate)
{
    struct xfrm_migrate *mp;
    int i, j, n = 0;  /* n counts templates matched by any entry */

    write_lock_bh(&pol->lock);
    if (unlikely(pol->walk.dead)) {
        /* target policy has been deleted */
        write_unlock_bh(&pol->lock);
        return -ENOENT;
    }

    for (i = 0; i < pol->xfrm_nr; i++) {
        for (j = 0, mp = m; j < num_migrate; j++, mp++) {
            if (!migrate_tmpl_match(mp, &pol->xfrm_vec[i]))
                continue;
            n++;
            /* Only tunnel-like templates carry addresses to rewrite;
             * transport matches still count towards n above.
             */
            if (pol->xfrm_vec[i].mode != XFRM_MODE_TUNNEL &&
                pol->xfrm_vec[i].mode != XFRM_MODE_BEET)
                continue;
            /* update endpoints */
            memcpy(&pol->xfrm_vec[i].id.daddr, &mp->new_daddr,
                   sizeof(pol->xfrm_vec[i].id.daddr));
            memcpy(&pol->xfrm_vec[i].saddr, &mp->new_saddr,
                   sizeof(pol->xfrm_vec[i].saddr));
            pol->xfrm_vec[i].encap_family = mp->new_family;
            /* flush bundles */
            atomic_inc(&pol->genid);
        }
    }

    write_unlock_bh(&pol->lock);

    if (!n)
        return -ENODATA;

    return 0;
}
4375 
4376 static int xfrm_migrate_check(const struct xfrm_migrate *m, int num_migrate)
4377 {
4378     int i, j;
4379 
4380     if (num_migrate < 1 || num_migrate > XFRM_MAX_DEPTH)
4381         return -EINVAL;
4382 
4383     for (i = 0; i < num_migrate; i++) {
4384         if (xfrm_addr_any(&m[i].new_daddr, m[i].new_family) ||
4385             xfrm_addr_any(&m[i].new_saddr, m[i].new_family))
4386             return -EINVAL;
4387 
4388         /* check if there is any duplicated entry */
4389         for (j = i + 1; j < num_migrate; j++) {
4390             if (!memcmp(&m[i].old_daddr, &m[j].old_daddr,
4391                     sizeof(m[i].old_daddr)) &&
4392                 !memcmp(&m[i].old_saddr, &m[j].old_saddr,
4393                     sizeof(m[i].old_saddr)) &&
4394                 m[i].proto == m[j].proto &&
4395                 m[i].mode == m[j].mode &&
4396                 m[i].reqid == m[j].reqid &&
4397                 m[i].old_family == m[j].old_family)
4398                 return -EINVAL;
4399         }
4400     }
4401 
4402     return 0;
4403 }
4404 
4405 int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
4406          struct xfrm_migrate *m, int num_migrate,
4407          struct xfrm_kmaddress *k, struct net *net,
4408          struct xfrm_encap_tmpl *encap, u32 if_id)
4409 {
4410     int i, err, nx_cur = 0, nx_new = 0;
4411     struct xfrm_policy *pol = NULL;
4412     struct xfrm_state *x, *xc;
4413     struct xfrm_state *x_cur[XFRM_MAX_DEPTH];
4414     struct xfrm_state *x_new[XFRM_MAX_DEPTH];
4415     struct xfrm_migrate *mp;
4416 
4417     /* Stage 0 - sanity checks */
4418     if ((err = xfrm_migrate_check(m, num_migrate)) < 0)
4419         goto out;
4420 
4421     if (dir >= XFRM_POLICY_MAX) {
4422         err = -EINVAL;
4423         goto out;
4424     }
4425 
4426     /* Stage 1 - find policy */
4427     if ((pol = xfrm_migrate_policy_find(sel, dir, type, net, if_id)) == NULL) {
4428         err = -ENOENT;
4429         goto out;
4430     }
4431 
4432     /* Stage 2 - find and update state(s) */
4433     for (i = 0, mp = m; i < num_migrate; i++, mp++) {
4434         if ((x = xfrm_migrate_state_find(mp, net, if_id))) {
4435             x_cur[nx_cur] = x;
4436             nx_cur++;
4437             xc = xfrm_state_migrate(x, mp, encap);
4438             if (xc) {
4439                 x_new[nx_new] = xc;
4440                 nx_new++;
4441             } else {
4442                 err = -ENODATA;
4443                 goto restore_state;
4444             }
4445         }
4446     }
4447 
4448     /* Stage 3 - update policy */
4449     if ((err = xfrm_policy_migrate(pol, m, num_migrate)) < 0)
4450         goto restore_state;
4451 
4452     /* Stage 4 - delete old state(s) */
4453     if (nx_cur) {
4454         xfrm_states_put(x_cur, nx_cur);
4455         xfrm_states_delete(x_cur, nx_cur);
4456     }
4457 
4458     /* Stage 5 - announce */
4459     km_migrate(sel, dir, type, m, num_migrate, k, encap);
4460 
4461     xfrm_pol_put(pol);
4462 
4463     return 0;
4464 out:
4465     return err;
4466 
4467 restore_state:
4468     if (pol)
4469         xfrm_pol_put(pol);
4470     if (nx_cur)
4471         xfrm_states_put(x_cur, nx_cur);
4472     if (nx_new)
4473         xfrm_states_delete(x_new, nx_new);
4474 
4475     return err;
4476 }
4477 EXPORT_SYMBOL(xfrm_migrate);
4478 #endif