0001 // SPDX-License-Identifier: GPL-2.0-or-later
0002 /*
0003  *  IPv6 output functions
0004  *  Linux INET6 implementation
0005  *
0006  *  Authors:
0007  *  Pedro Roque     <roque@di.fc.ul.pt>
0008  *
0009  *  Based on linux/net/ipv4/ip_output.c
0010  *
0011  *  Changes:
0012  *  A.N.Kuznetsov   :   arithmetic in fragmentation.
0013  *              extension headers are implemented.
0014  *              route changes now work.
0015  *              ip6_forward does not confuse sniffers.
0016  *              etc.
0017  *
0018  *      H. von Brand    :       Added missing #include <linux/string.h>
0019  *  Imran Patel :   frag id should be in NBO
0020  *      Kazunori MIYAZAWA @USAGI
0021  *          :       add ip6_append_data and related functions
0022  *              for datagram xmit
0023  */
0024 
0025 #include <linux/errno.h>
0026 #include <linux/kernel.h>
0027 #include <linux/string.h>
0028 #include <linux/socket.h>
0029 #include <linux/net.h>
0030 #include <linux/netdevice.h>
0031 #include <linux/if_arp.h>
0032 #include <linux/in6.h>
0033 #include <linux/tcp.h>
0034 #include <linux/route.h>
0035 #include <linux/module.h>
0036 #include <linux/slab.h>
0037 
0038 #include <linux/bpf-cgroup.h>
0039 #include <linux/netfilter.h>
0040 #include <linux/netfilter_ipv6.h>
0041 
0042 #include <net/sock.h>
0043 #include <net/snmp.h>
0044 
0045 #include <net/ipv6.h>
0046 #include <net/ndisc.h>
0047 #include <net/protocol.h>
0048 #include <net/ip6_route.h>
0049 #include <net/addrconf.h>
0050 #include <net/rawv6.h>
0051 #include <net/icmp.h>
0052 #include <net/xfrm.h>
0053 #include <net/checksum.h>
0054 #include <linux/mroute6.h>
0055 #include <net/l3mdev.h>
0056 #include <net/lwtunnel.h>
0057 #include <net/ip_tunnels.h>
0058 
0059 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
0060 {
0061     struct dst_entry *dst = skb_dst(skb);
0062     struct net_device *dev = dst->dev;
0063     struct inet6_dev *idev = ip6_dst_idev(dst);
0064     unsigned int hh_len = LL_RESERVED_SPACE(dev);
0065     const struct in6_addr *daddr, *nexthop;
0066     struct ipv6hdr *hdr;
0067     struct neighbour *neigh;
0068     int ret;
0069 
0070     /* Be paranoid, rather than too clever. */
0071     if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
0072         skb = skb_expand_head(skb, hh_len);
0073         if (!skb) {
0074             IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
0075             return -ENOMEM;
0076         }
0077     }
0078 
0079     hdr = ipv6_hdr(skb);
0080     daddr = &hdr->daddr;
0081     if (ipv6_addr_is_multicast(daddr)) {
0082         if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
0083             ((mroute6_is_socket(net, skb) &&
0084              !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
0085              ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
0086             struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
0087 
0088             /* Do not check for IFF_ALLMULTI; multicast routing
0089                is not supported in any case.
0090              */
0091             if (newskb)
0092                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
0093                     net, sk, newskb, NULL, newskb->dev,
0094                     dev_loopback_xmit);
0095 
0096             if (hdr->hop_limit == 0) {
0097                 IP6_INC_STATS(net, idev,
0098                           IPSTATS_MIB_OUTDISCARDS);
0099                 kfree_skb(skb);
0100                 return 0;
0101             }
0102         }
0103 
0104         IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
0105         if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
0106             !(dev->flags & IFF_LOOPBACK)) {
0107             kfree_skb(skb);
0108             return 0;
0109         }
0110     }
0111 
0112     if (lwtunnel_xmit_redirect(dst->lwtstate)) {
0113         int res = lwtunnel_xmit(skb);
0114 
0115         if (res < 0 || res == LWTUNNEL_XMIT_DONE)
0116             return res;
0117     }
0118 
0119     rcu_read_lock_bh();
0120     nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
0121     neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
0122 
0123     if (unlikely(IS_ERR_OR_NULL(neigh))) {
0124         if (unlikely(!neigh))
0125             neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
0126         if (IS_ERR(neigh)) {
0127             rcu_read_unlock_bh();
0128             IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
0129             kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
0130             return -EINVAL;
0131         }
0132     }
0133     sock_confirm_neigh(skb, neigh);
0134     ret = neigh_output(neigh, skb, false);
0135     rcu_read_unlock_bh();
0136     return ret;
0137 }
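
/* Note (editorial, not in the upstream file): ip6_finish_output2() is
 * the last IPv6-specific step on the output path: it grows headroom
 * for the link-layer header when needed, loops multicast copies back
 * to the local stack where required, lets lightweight tunnels take
 * over transmission, and finally resolves the nexthop neighbour and
 * hands the skb to neigh_output().
 */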
0138 
0139 static int
0140 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
0141                     struct sk_buff *skb, unsigned int mtu)
0142 {
0143     struct sk_buff *segs, *nskb;
0144     netdev_features_t features;
0145     int ret = 0;
0146 
0147     /* Please see corresponding comment in ip_finish_output_gso
0148      * describing the cases where GSO segment length exceeds the
0149      * egress MTU.
0150      */
0151     features = netif_skb_features(skb);
0152     segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
0153     if (IS_ERR_OR_NULL(segs)) {
0154         kfree_skb(skb);
0155         return -ENOMEM;
0156     }
0157 
0158     consume_skb(skb);
0159 
0160     skb_list_walk_safe(segs, segs, nskb) {
0161         int err;
0162 
0163         skb_mark_not_on_list(segs);
0164         err = ip6_fragment(net, sk, segs, ip6_finish_output2);
0165         if (err && ret == 0)
0166             ret = err;
0167     }
0168 
0169     return ret;
0170 }
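
/* Note (editorial): the "GSO slowpath drop" above software-segments an
 * oversized GSO skb with skb_gso_segment(), then pushes each resulting
 * packet through ip6_fragment(); the first error seen is what gets
 * reported to the caller.
 */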
0171 
0172 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
0173 {
0174     unsigned int mtu;
0175 
0176 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
0177     /* Policy lookup after SNAT yielded a new policy */
0178     if (skb_dst(skb)->xfrm) {
0179         IP6CB(skb)->flags |= IP6SKB_REROUTED;
0180         return dst_output(net, sk, skb);
0181     }
0182 #endif
0183 
0184     mtu = ip6_skb_dst_mtu(skb);
0185     if (skb_is_gso(skb) &&
0186         !(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
0187         !skb_gso_validate_network_len(skb, mtu))
0188         return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
0189 
0190     if ((skb->len > mtu && !skb_is_gso(skb)) ||
0191         dst_allfrag(skb_dst(skb)) ||
0192         (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
0193         return ip6_fragment(net, sk, skb, ip6_finish_output2);
0194     else
0195         return ip6_finish_output2(net, sk, skb);
0196 }
0197 
0198 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
0199 {
0200     int ret;
0201 
0202     ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
0203     switch (ret) {
0204     case NET_XMIT_SUCCESS:
0205     case NET_XMIT_CN:
0206         return __ip6_finish_output(net, sk, skb) ? : ret;
0207     default:
0208         kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
0209         return ret;
0210     }
0211 }
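
/* Note (editorial): the BPF cgroup egress program may veto the packet;
 * NET_XMIT_SUCCESS and NET_XMIT_CN both mean "transmit", and a nonzero
 * return from __ip6_finish_output() takes precedence over the
 * informational NET_XMIT_CN value.
 */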
0212 
0213 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
0214 {
0215     struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
0216     struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
0217 
0218     skb->protocol = htons(ETH_P_IPV6);
0219     skb->dev = dev;
0220 
0221     if (unlikely(idev->cnf.disable_ipv6)) {
0222         IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
0223         kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
0224         return 0;
0225     }
0226 
0227     return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
0228                 net, sk, skb, indev, dev,
0229                 ip6_finish_output,
0230                 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
0231 }
0232 EXPORT_SYMBOL(ip6_output);
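
/* Note (editorial): ip6_output() is the output handler installed on
 * routed IPv6 dst entries, i.e. the usual target of dst_output(). The
 * NF_HOOK_COND() above skips NF_INET_POST_ROUTING when the packet was
 * already rerouted (IP6SKB_REROUTED), so the hook is not run twice.
 */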
0233 
0234 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
0235 {
0236     if (!np->autoflowlabel_set)
0237         return ip6_default_np_autolabel(net);
0238     else
0239         return np->autoflowlabel;
0240 }
0241 
0242 /*
0243  * xmit an sk_buff (used by TCP, SCTP and DCCP)
0244  * Note : socket lock is not held for SYNACK packets, but might be modified
0245  * by calls to skb_set_owner_w() and ipv6_local_error(),
0246  * which are using proper atomic operations or spinlocks.
0247  */
0248 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
0249          __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
0250 {
0251     struct net *net = sock_net(sk);
0252     const struct ipv6_pinfo *np = inet6_sk(sk);
0253     struct in6_addr *first_hop = &fl6->daddr;
0254     struct dst_entry *dst = skb_dst(skb);
0255     struct net_device *dev = dst->dev;
0256     struct inet6_dev *idev = ip6_dst_idev(dst);
0257     struct hop_jumbo_hdr *hop_jumbo;
0258     int hoplen = sizeof(*hop_jumbo);
0259     unsigned int head_room;
0260     struct ipv6hdr *hdr;
0261     u8  proto = fl6->flowi6_proto;
0262     int seg_len = skb->len;
0263     int hlimit = -1;
0264     u32 mtu;
0265 
0266     head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
0267     if (opt)
0268         head_room += opt->opt_nflen + opt->opt_flen;
0269 
0270     if (unlikely(head_room > skb_headroom(skb))) {
0271         skb = skb_expand_head(skb, head_room);
0272         if (!skb) {
0273             IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
0274             return -ENOBUFS;
0275         }
0276     }
0277 
0278     if (opt) {
0279         seg_len += opt->opt_nflen + opt->opt_flen;
0280 
0281         if (opt->opt_flen)
0282             ipv6_push_frag_opts(skb, opt, &proto);
0283 
0284         if (opt->opt_nflen)
0285             ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
0286                          &fl6->saddr);
0287     }
0288 
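    /* Note (editorial): for payloads above IPV6_MAXPLEN (65535) the
     * block below builds an RFC 2675 Hop-by-Hop Jumbo Payload option
     * carrying the 32-bit length; the fixed header's payload_len is
     * then left at zero (seg_len = 0), as jumbograms require.
     */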
0289     if (unlikely(seg_len > IPV6_MAXPLEN)) {
0290         hop_jumbo = skb_push(skb, hoplen);
0291 
0292         hop_jumbo->nexthdr = proto;
0293         hop_jumbo->hdrlen = 0;
0294         hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
0295         hop_jumbo->tlv_len = 4;
0296         hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);
0297 
0298         proto = IPPROTO_HOPOPTS;
0299         seg_len = 0;
0300         IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
0301     }
0302 
0303     skb_push(skb, sizeof(struct ipv6hdr));
0304     skb_reset_network_header(skb);
0305     hdr = ipv6_hdr(skb);
0306 
0307     /*
0308      *  Fill in the IPv6 header
0309      */
0310     if (np)
0311         hlimit = np->hop_limit;
0312     if (hlimit < 0)
0313         hlimit = ip6_dst_hoplimit(dst);
0314 
0315     ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
0316                 ip6_autoflowlabel(net, np), fl6));
0317 
0318     hdr->payload_len = htons(seg_len);
0319     hdr->nexthdr = proto;
0320     hdr->hop_limit = hlimit;
0321 
0322     hdr->saddr = fl6->saddr;
0323     hdr->daddr = *first_hop;
0324 
0325     skb->protocol = htons(ETH_P_IPV6);
0326     skb->priority = priority;
0327     skb->mark = mark;
0328 
0329     mtu = dst_mtu(dst);
0330     if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
0331         IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
0332 
0333         /* if egress device is enslaved to an L3 master device pass the
0334          * skb to its handler for processing
0335          */
0336         skb = l3mdev_ip6_out((struct sock *)sk, skb);
0337         if (unlikely(!skb))
0338             return 0;
0339 
0340         /* hooks should never assume socket lock is held.
0341          * we promote our socket to non const
0342          */
0343         return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
0344                    net, (struct sock *)sk, skb, NULL, dev,
0345                    dst_output);
0346     }
0347 
0348     skb->dev = dev;
0349     /* ipv6_local_error() does not require socket lock,
0350      * we promote our socket to non const
0351      */
0352     ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
0353 
0354     IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
0355     kfree_skb(skb);
0356     return -EMSGSIZE;
0357 }
0358 EXPORT_SYMBOL(ip6_xmit);
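
/* Usage sketch (editorial, hypothetical caller): a stream protocol
 * routes the socket first, attaches the dst to the skb and then calls
 * ip6_xmit(), roughly:
 *
 *      skb_dst_set(skb, dst);
 *      err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, tclass,
 *                     sk->sk_priority);
 *
 * The argument choices (sk_mark, tclass, sk_priority) are modelled on
 * the TCP caller but are illustrative here.
 */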
0359 
0360 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
0361 {
0362     struct ip6_ra_chain *ra;
0363     struct sock *last = NULL;
0364 
0365     read_lock(&ip6_ra_lock);
0366     for (ra = ip6_ra_chain; ra; ra = ra->next) {
0367         struct sock *sk = ra->sk;
0368         if (sk && ra->sel == sel &&
0369             (!sk->sk_bound_dev_if ||
0370              sk->sk_bound_dev_if == skb->dev->ifindex)) {
0371             struct ipv6_pinfo *np = inet6_sk(sk);
0372 
0373             if (np && np->rtalert_isolate &&
0374                 !net_eq(sock_net(sk), dev_net(skb->dev))) {
0375                 continue;
0376             }
0377             if (last) {
0378                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
0379                 if (skb2)
0380                     rawv6_rcv(last, skb2);
0381             }
0382             last = sk;
0383         }
0384     }
0385 
0386     if (last) {
0387         rawv6_rcv(last, skb);
0388         read_unlock(&ip6_ra_lock);
0389         return 1;
0390     }
0391     read_unlock(&ip6_ra_lock);
0392     return 0;
0393 }
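
/* Note (editorial): ip6_call_ra_chain() implements Router Alert
 * delivery. Every matching raw socket except the last receives a
 * clone; the last one consumes the original skb, and returning 1
 * tells ip6_forward() that the packet has been diverted.
 */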
0394 
0395 static int ip6_forward_proxy_check(struct sk_buff *skb)
0396 {
0397     struct ipv6hdr *hdr = ipv6_hdr(skb);
0398     u8 nexthdr = hdr->nexthdr;
0399     __be16 frag_off;
0400     int offset;
0401 
0402     if (ipv6_ext_hdr(nexthdr)) {
0403         offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
0404         if (offset < 0)
0405             return 0;
0406     } else
0407         offset = sizeof(struct ipv6hdr);
0408 
0409     if (nexthdr == IPPROTO_ICMPV6) {
0410         struct icmp6hdr *icmp6;
0411 
0412         if (!pskb_may_pull(skb, (skb_network_header(skb) +
0413                      offset + 1 - skb->data)))
0414             return 0;
0415 
0416         icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
0417 
0418         switch (icmp6->icmp6_type) {
0419         case NDISC_ROUTER_SOLICITATION:
0420         case NDISC_ROUTER_ADVERTISEMENT:
0421         case NDISC_NEIGHBOUR_SOLICITATION:
0422         case NDISC_NEIGHBOUR_ADVERTISEMENT:
0423         case NDISC_REDIRECT:
0424             /* For reactions involving unicast neighbor discovery
0425              * messages destined to the proxied address, pass them to
0426              * the input function.
0427              */
0428             return 1;
0429         default:
0430             break;
0431         }
0432     }
0433 
0434     /*
0435      * The proxying router can't forward traffic sent to a link-local
0436      * address, so signal the sender and discard the packet. This
0437      * behavior is clarified by the MIPv6 specification.
0438      */
0439     if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
0440         dst_link_failure(skb);
0441         return -1;
0442     }
0443 
0444     return 0;
0445 }
0446 
0447 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
0448                      struct sk_buff *skb)
0449 {
0450     struct dst_entry *dst = skb_dst(skb);
0451 
0452     __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
0453     __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
0454 
0455 #ifdef CONFIG_NET_SWITCHDEV
0456     if (skb->offload_l3_fwd_mark) {
0457         consume_skb(skb);
0458         return 0;
0459     }
0460 #endif
0461 
0462     skb_clear_tstamp(skb);
0463     return dst_output(net, sk, skb);
0464 }
0465 
0466 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
0467 {
0468     if (skb->len <= mtu)
0469         return false;
0470 
0471     /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
0472     if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
0473         return true;
0474 
0475     if (skb->ignore_df)
0476         return false;
0477 
0478     if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
0479         return false;
0480 
0481     return true;
0482 }
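
/* Note (editorial): ip6_pkt_too_big() decides whether a forwarded
 * packet must trigger ICMPV6_PKT_TOOBIG: locally defragmented packets
 * are judged by the largest fragment received (frag_max_size),
 * ignore_df suppresses the check, and a GSO packet passes as long as
 * every segment it will produce fits the MTU.
 */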
0483 
0484 int ip6_forward(struct sk_buff *skb)
0485 {
0486     struct dst_entry *dst = skb_dst(skb);
0487     struct ipv6hdr *hdr = ipv6_hdr(skb);
0488     struct inet6_skb_parm *opt = IP6CB(skb);
0489     struct net *net = dev_net(dst->dev);
0490     struct inet6_dev *idev;
0491     SKB_DR(reason);
0492     u32 mtu;
0493 
0494     idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
0495     if (net->ipv6.devconf_all->forwarding == 0)
0496         goto error;
0497 
0498     if (skb->pkt_type != PACKET_HOST)
0499         goto drop;
0500 
0501     if (unlikely(skb->sk))
0502         goto drop;
0503 
0504     if (skb_warn_if_lro(skb))
0505         goto drop;
0506 
0507     if (!net->ipv6.devconf_all->disable_policy &&
0508         (!idev || !idev->cnf.disable_policy) &&
0509         !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
0510         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
0511         goto drop;
0512     }
0513 
0514     skb_forward_csum(skb);
0515 
0516     /*
0517      *  We DO NOT do any processing on
0518      *  RA packets; we push them to user level AS IS
0519      *  without any WARRANTY that the application will be able
0520      *  to interpret them. The reason is that we
0521      *  cannot do anything clever here.
0522      *
0523      *  We are not an end node, so if the packet contains
0524      *  AH/ESP we cannot do anything.
0525      *  Defragmentation would also be a mistake; RA packets
0526      *  cannot be fragmented, because there is no guarantee
0527      *  that different fragments will follow the same path. --ANK
0528      */
0529     if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
0530         if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
0531             return 0;
0532     }
0533 
0534     /*
0535      *  check and decrement ttl
0536      */
0537     if (hdr->hop_limit <= 1) {
0538         icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
0539         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
0540 
0541         kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
0542         return -ETIMEDOUT;
0543     }
0544 
0545     /* XXX: idev->cnf.proxy_ndp? */
0546     if (net->ipv6.devconf_all->proxy_ndp &&
0547         pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
0548         int proxied = ip6_forward_proxy_check(skb);
0549         if (proxied > 0) {
0550             hdr->hop_limit--;
0551             return ip6_input(skb);
0552         } else if (proxied < 0) {
0553             __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
0554             goto drop;
0555         }
0556     }
0557 
0558     if (!xfrm6_route_forward(skb)) {
0559         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
0560         SKB_DR_SET(reason, XFRM_POLICY);
0561         goto drop;
0562     }
0563     dst = skb_dst(skb);
0564 
0565     /* IPv6 specs say nothing about it, but it is clear that we cannot
0566        send redirects to source routed frames.
0567        We don't send redirects to frames decapsulated from IPsec.
0568      */
0569     if (IP6CB(skb)->iif == dst->dev->ifindex &&
0570         opt->srcrt == 0 && !skb_sec_path(skb)) {
0571         struct in6_addr *target = NULL;
0572         struct inet_peer *peer;
0573         struct rt6_info *rt;
0574 
0575         /*
0576          *  the incoming and outgoing devices are the same;
0577          *  send a redirect.
0578          */
0579 
0580         rt = (struct rt6_info *) dst;
0581         if (rt->rt6i_flags & RTF_GATEWAY)
0582             target = &rt->rt6i_gateway;
0583         else
0584             target = &hdr->daddr;
0585 
0586         peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
0587 
0588         /* Limit redirects both by destination (here)
0589            and by source (inside ndisc_send_redirect)
0590          */
0591         if (inet_peer_xrlim_allow(peer, 1*HZ))
0592             ndisc_send_redirect(skb, target);
0593         if (peer)
0594             inet_putpeer(peer);
0595     } else {
0596         int addrtype = ipv6_addr_type(&hdr->saddr);
0597 
0598         /* This check is security critical. */
0599         if (addrtype == IPV6_ADDR_ANY ||
0600             addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
0601             goto error;
0602         if (addrtype & IPV6_ADDR_LINKLOCAL) {
0603             icmpv6_send(skb, ICMPV6_DEST_UNREACH,
0604                     ICMPV6_NOT_NEIGHBOUR, 0);
0605             goto error;
0606         }
0607     }
0608 
0609     mtu = ip6_dst_mtu_maybe_forward(dst, true);
0610     if (mtu < IPV6_MIN_MTU)
0611         mtu = IPV6_MIN_MTU;
0612 
0613     if (ip6_pkt_too_big(skb, mtu)) {
0614         /* Again, force OUTPUT device used as source address */
0615         skb->dev = dst->dev;
0616         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
0617         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
0618         __IP6_INC_STATS(net, ip6_dst_idev(dst),
0619                 IPSTATS_MIB_FRAGFAILS);
0620         kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
0621         return -EMSGSIZE;
0622     }
0623 
0624     if (skb_cow(skb, dst->dev->hard_header_len)) {
0625         __IP6_INC_STATS(net, ip6_dst_idev(dst),
0626                 IPSTATS_MIB_OUTDISCARDS);
0627         goto drop;
0628     }
0629 
0630     hdr = ipv6_hdr(skb);
0631 
0632     /* Mangling hops number delayed to point after skb COW */
0633 
0634     hdr->hop_limit--;
0635 
0636     return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
0637                net, NULL, skb, skb->dev, dst->dev,
0638                ip6_forward_finish);
0639 
0640 error:
0641     __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
0642     SKB_DR_SET(reason, IP_INADDRERRORS);
0643 drop:
0644     kfree_skb_reason(skb, reason);
0645     return -EINVAL;
0646 }
0647 
0648 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
0649 {
0650     to->pkt_type = from->pkt_type;
0651     to->priority = from->priority;
0652     to->protocol = from->protocol;
0653     skb_dst_drop(to);
0654     skb_dst_set(to, dst_clone(skb_dst(from)));
0655     to->dev = from->dev;
0656     to->mark = from->mark;
0657 
0658     skb_copy_hash(to, from);
0659 
0660 #ifdef CONFIG_NET_SCHED
0661     to->tc_index = from->tc_index;
0662 #endif
0663     nf_copy(to, from);
0664     skb_ext_copy(to, from);
0665     skb_copy_secmark(to, from);
0666 }
0667 
0668 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
0669               u8 nexthdr, __be32 frag_id,
0670               struct ip6_fraglist_iter *iter)
0671 {
0672     unsigned int first_len;
0673     struct frag_hdr *fh;
0674 
0675     /* BUILD HEADER */
0676     *prevhdr = NEXTHDR_FRAGMENT;
0677     iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
0678     if (!iter->tmp_hdr)
0679         return -ENOMEM;
0680 
0681     iter->frag = skb_shinfo(skb)->frag_list;
0682     skb_frag_list_init(skb);
0683 
0684     iter->offset = 0;
0685     iter->hlen = hlen;
0686     iter->frag_id = frag_id;
0687     iter->nexthdr = nexthdr;
0688 
0689     __skb_pull(skb, hlen);
0690     fh = __skb_push(skb, sizeof(struct frag_hdr));
0691     __skb_push(skb, hlen);
0692     skb_reset_network_header(skb);
0693     memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
0694 
0695     fh->nexthdr = nexthdr;
0696     fh->reserved = 0;
0697     fh->frag_off = htons(IP6_MF);
0698     fh->identification = frag_id;
0699 
0700     first_len = skb_pagelen(skb);
0701     skb->data_len = first_len - skb_headlen(skb);
0702     skb->len = first_len;
0703     ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
0704 
0705     return 0;
0706 }
0707 EXPORT_SYMBOL(ip6_fraglist_init);
0708 
0709 void ip6_fraglist_prepare(struct sk_buff *skb,
0710               struct ip6_fraglist_iter *iter)
0711 {
0712     struct sk_buff *frag = iter->frag;
0713     unsigned int hlen = iter->hlen;
0714     struct frag_hdr *fh;
0715 
0716     frag->ip_summed = CHECKSUM_NONE;
0717     skb_reset_transport_header(frag);
0718     fh = __skb_push(frag, sizeof(struct frag_hdr));
0719     __skb_push(frag, hlen);
0720     skb_reset_network_header(frag);
0721     memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
0722     iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
0723     fh->nexthdr = iter->nexthdr;
0724     fh->reserved = 0;
0725     fh->frag_off = htons(iter->offset);
0726     if (frag->next)
0727         fh->frag_off |= htons(IP6_MF);
0728     fh->identification = iter->frag_id;
0729     ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
0730     ip6_copy_metadata(frag, skb);
0731 }
0732 EXPORT_SYMBOL(ip6_fraglist_prepare);
0733 
0734 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
0735            unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
0736            u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
0737 {
0738     state->prevhdr = prevhdr;
0739     state->nexthdr = nexthdr;
0740     state->frag_id = frag_id;
0741 
0742     state->hlen = hlen;
0743     state->mtu = mtu;
0744 
0745     state->left = skb->len - hlen;  /* Space per frame */
0746     state->ptr = hlen;      /* Where to start from */
0747 
0748     state->hroom = hdr_room;
0749     state->troom = needed_tailroom;
0750 
0751     state->offset = 0;
0752 }
0753 EXPORT_SYMBOL(ip6_frag_init);
0754 
0755 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
0756 {
0757     u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
0758     struct sk_buff *frag;
0759     struct frag_hdr *fh;
0760     unsigned int len;
0761 
0762     len = state->left;
0763     /* IF: it doesn't fit, use 'mtu' - the data space left */
0764     if (len > state->mtu)
0765         len = state->mtu;
0766     /* IF: we are not sending up to and including the packet end
0767        then align the next start on an eight byte boundary */
0768     if (len < state->left)
0769         len &= ~7;
0770 
0771     /* Allocate buffer */
0772     frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
0773              state->hroom + state->troom, GFP_ATOMIC);
0774     if (!frag)
0775         return ERR_PTR(-ENOMEM);
0776 
0777     /*
0778      *  Set up data on packet
0779      */
0780 
0781     ip6_copy_metadata(frag, skb);
0782     skb_reserve(frag, state->hroom);
0783     skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
0784     skb_reset_network_header(frag);
0785     fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
0786     frag->transport_header = (frag->network_header + state->hlen +
0787                   sizeof(struct frag_hdr));
0788 
0789     /*
0790      *  Charge the memory for the fragment to any owner
0791      *  it might possess
0792      */
0793     if (skb->sk)
0794         skb_set_owner_w(frag, skb->sk);
0795 
0796     /*
0797      *  Copy the packet header into the new buffer.
0798      */
0799     skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
0800 
0801     fragnexthdr_offset = skb_network_header(frag);
0802     fragnexthdr_offset += prevhdr - skb_network_header(skb);
0803     *fragnexthdr_offset = NEXTHDR_FRAGMENT;
0804 
0805     /*
0806      *  Build fragment header.
0807      */
0808     fh->nexthdr = state->nexthdr;
0809     fh->reserved = 0;
0810     fh->identification = state->frag_id;
0811 
0812     /*
0813      *  Copy a block of the IP datagram.
0814      */
0815     BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
0816                  len));
0817     state->left -= len;
0818 
0819     fh->frag_off = htons(state->offset);
0820     if (state->left > 0)
0821         fh->frag_off |= htons(IP6_MF);
0822     ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
0823 
0824     state->ptr += len;
0825     state->offset += len;
0826 
0827     return frag;
0828 }
0829 EXPORT_SYMBOL(ip6_frag_next);
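
/* Usage sketch (editorial) for the slow-path helpers, mirroring the
 * loop in ip6_fragment() below:
 *
 *      ip6_frag_init(skb, hlen, mtu, tailroom, hroom,
 *                    prevhdr, nexthdr, frag_id, &state);
 *      while (state.left > 0) {
 *              frag = ip6_frag_next(skb, &state);
 *              if (IS_ERR(frag))
 *                      return PTR_ERR(frag);
 *              err = output(net, sk, frag);
 *      }
 */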
0830 
0831 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
0832          int (*output)(struct net *, struct sock *, struct sk_buff *))
0833 {
0834     struct sk_buff *frag;
0835     struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
0836     struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
0837                 inet6_sk(skb->sk) : NULL;
0838     bool mono_delivery_time = skb->mono_delivery_time;
0839     struct ip6_frag_state state;
0840     unsigned int mtu, hlen, nexthdr_offset;
0841     ktime_t tstamp = skb->tstamp;
0842     int hroom, err = 0;
0843     __be32 frag_id;
0844     u8 *prevhdr, nexthdr = 0;
0845 
0846     err = ip6_find_1stfragopt(skb, &prevhdr);
0847     if (err < 0)
0848         goto fail;
0849     hlen = err;
0850     nexthdr = *prevhdr;
0851     nexthdr_offset = prevhdr - skb_network_header(skb);
0852 
0853     mtu = ip6_skb_dst_mtu(skb);
0854 
0855     /* We must not fragment if the socket is set to force MTU discovery
0856      * or if the skb is not generated by a local socket.
0857      */
0858     if (unlikely(!skb->ignore_df && skb->len > mtu))
0859         goto fail_toobig;
0860 
0861     if (IP6CB(skb)->frag_max_size) {
0862         if (IP6CB(skb)->frag_max_size > mtu)
0863             goto fail_toobig;
0864 
0865         /* don't send fragments larger than what we received */
0866         mtu = IP6CB(skb)->frag_max_size;
0867         if (mtu < IPV6_MIN_MTU)
0868             mtu = IPV6_MIN_MTU;
0869     }
0870 
0871     if (np && np->frag_size < mtu) {
0872         if (np->frag_size)
0873             mtu = np->frag_size;
0874     }
0875     if (mtu < hlen + sizeof(struct frag_hdr) + 8)
0876         goto fail_toobig;
0877     mtu -= hlen + sizeof(struct frag_hdr);
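    /* Worked example (editorial): with mtu = 1280 and hlen = 40, each
     * fragment can carry 1280 - 40 - 8 = 1232 bytes of payload;
     * ip6_frag_next() additionally rounds non-final fragments down to
     * a multiple of 8 (1232 is already aligned).
     */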
0878 
0879     frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
0880                     &ipv6_hdr(skb)->saddr);
0881 
0882     if (skb->ip_summed == CHECKSUM_PARTIAL &&
0883         (err = skb_checksum_help(skb)))
0884         goto fail;
0885 
0886     prevhdr = skb_network_header(skb) + nexthdr_offset;
0887     hroom = LL_RESERVED_SPACE(rt->dst.dev);
0888     if (skb_has_frag_list(skb)) {
0889         unsigned int first_len = skb_pagelen(skb);
0890         struct ip6_fraglist_iter iter;
0891         struct sk_buff *frag2;
0892 
0893         if (first_len - hlen > mtu ||
0894             ((first_len - hlen) & 7) ||
0895             skb_cloned(skb) ||
0896             skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
0897             goto slow_path;
0898 
0899         skb_walk_frags(skb, frag) {
0900             /* Correct geometry. */
0901             if (frag->len > mtu ||
0902                 ((frag->len & 7) && frag->next) ||
0903                 skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
0904                 goto slow_path_clean;
0905 
0906             /* Partially cloned skb? */
0907             if (skb_shared(frag))
0908                 goto slow_path_clean;
0909 
0910             BUG_ON(frag->sk);
0911             if (skb->sk) {
0912                 frag->sk = skb->sk;
0913                 frag->destructor = sock_wfree;
0914             }
0915             skb->truesize -= frag->truesize;
0916         }
0917 
0918         err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
0919                     &iter);
0920         if (err < 0)
0921             goto fail;
0922 
0923         for (;;) {
0924             /* Prepare the header of the next frame
0925              * before the previous one goes down. */
0926             if (iter.frag)
0927                 ip6_fraglist_prepare(skb, &iter);
0928 
0929             skb_set_delivery_time(skb, tstamp, mono_delivery_time);
0930             err = output(net, sk, skb);
0931             if (!err)
0932                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
0933                           IPSTATS_MIB_FRAGCREATES);
0934 
0935             if (err || !iter.frag)
0936                 break;
0937 
0938             skb = ip6_fraglist_next(&iter);
0939         }
0940 
0941         kfree(iter.tmp_hdr);
0942 
0943         if (err == 0) {
0944             IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
0945                       IPSTATS_MIB_FRAGOKS);
0946             return 0;
0947         }
0948 
0949         kfree_skb_list(iter.frag);
0950 
0951         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
0952                   IPSTATS_MIB_FRAGFAILS);
0953         return err;
0954 
0955 slow_path_clean:
0956         skb_walk_frags(skb, frag2) {
0957             if (frag2 == frag)
0958                 break;
0959             frag2->sk = NULL;
0960             frag2->destructor = NULL;
0961             skb->truesize += frag2->truesize;
0962         }
0963     }
0964 
0965 slow_path:
0966     /*
0967      *  Fragment the datagram.
0968      */
0969 
0970     ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
0971               LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
0972               &state);
0973 
0974     /*
0975      *  Keep copying data until we run out.
0976      */
0977 
0978     while (state.left > 0) {
0979         frag = ip6_frag_next(skb, &state);
0980         if (IS_ERR(frag)) {
0981             err = PTR_ERR(frag);
0982             goto fail;
0983         }
0984 
0985         /*
0986          *  Put this fragment into the sending queue.
0987          */
0988         skb_set_delivery_time(frag, tstamp, mono_delivery_time);
0989         err = output(net, sk, frag);
0990         if (err)
0991             goto fail;
0992 
0993         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
0994                   IPSTATS_MIB_FRAGCREATES);
0995     }
0996     IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
0997               IPSTATS_MIB_FRAGOKS);
0998     consume_skb(skb);
0999     return err;
1000 
1001 fail_toobig:
1002     if (skb->sk && dst_allfrag(skb_dst(skb)))
1003         sk_gso_disable(skb->sk);
1004 
1005     icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1006     err = -EMSGSIZE;
1007 
1008 fail:
1009     IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1010               IPSTATS_MIB_FRAGFAILS);
1011     kfree_skb(skb);
1012     return err;
1013 }
1014 
1015 static inline int ip6_rt_check(const struct rt6key *rt_key,
1016                    const struct in6_addr *fl_addr,
1017                    const struct in6_addr *addr_cache)
1018 {
1019     return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1020         (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1021 }
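
/* Note (editorial): ip6_rt_check() returns nonzero when the cached
 * route can NOT be trusted for fl_addr: it is neither a /128 host
 * route for that address nor confirmed by the socket's cached
 * destination (addr_cache).
 */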
1022 
1023 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1024                       struct dst_entry *dst,
1025                       const struct flowi6 *fl6)
1026 {
1027     struct ipv6_pinfo *np = inet6_sk(sk);
1028     struct rt6_info *rt;
1029 
1030     if (!dst)
1031         goto out;
1032 
1033     if (dst->ops->family != AF_INET6) {
1034         dst_release(dst);
1035         return NULL;
1036     }
1037 
1038     rt = (struct rt6_info *)dst;
1039     /* Yes, checking route validity in the not connected
1040      * case is not very simple. Take into account
1041      * that we do not support routing by source, TOS,
1042      * or MSG_DONTROUTE        --ANK (980726)
1043      *
1044      * 1. ip6_rt_check(): If the route was a host route,
1045      *    check that the cached destination is current.
1046      *    If it is a network route, we still may
1047      *    check its validity using a saved pointer
1048      *    to the last used address: daddr_cache.
1049      *    We do not want to save the whole address now
1050      *    (because the main consumer of this service
1051      *    is TCP, which does not have this problem),
1052      *    so the last trick works only on connected
1053      *    sockets.
1054      * 2. oif also should be the same.
1055      */
1056     if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1057 #ifdef CONFIG_IPV6_SUBTREES
1058         ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1059 #endif
1060        (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
1061         dst_release(dst);
1062         dst = NULL;
1063     }
1064 
1065 out:
1066     return dst;
1067 }
1068 
1069 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1070                    struct dst_entry **dst, struct flowi6 *fl6)
1071 {
1072 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1073     struct neighbour *n;
1074     struct rt6_info *rt;
1075 #endif
1076     int err;
1077     int flags = 0;
1078 
1079     /* The correct way to handle this would be to do
1080      * ip6_route_get_saddr, and then ip6_route_output; however,
1081      * the route-specific preferred source forces the
1082      * ip6_route_output call _before_ ip6_route_get_saddr.
1083      *
1084      * In source specific routing (no src=any default route),
1085      * ip6_route_output will fail given src=any saddr, though, so
1086      * that's why we try it again later.
1087      */
1088     if (ipv6_addr_any(&fl6->saddr)) {
1089         struct fib6_info *from;
1090         struct rt6_info *rt;
1091 
1092         *dst = ip6_route_output(net, sk, fl6);
1093         rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1094 
1095         rcu_read_lock();
1096         from = rt ? rcu_dereference(rt->from) : NULL;
1097         err = ip6_route_get_saddr(net, from, &fl6->daddr,
1098                       sk ? inet6_sk(sk)->srcprefs : 0,
1099                       &fl6->saddr);
1100         rcu_read_unlock();
1101 
1102         if (err)
1103             goto out_err_release;
1104 
1105         /* If we had an erroneous initial result, pretend it
1106          * never existed and let the SA-enabled version take
1107          * over.
1108          */
1109         if ((*dst)->error) {
1110             dst_release(*dst);
1111             *dst = NULL;
1112         }
1113 
1114         if (fl6->flowi6_oif)
1115             flags |= RT6_LOOKUP_F_IFACE;
1116     }
1117 
1118     if (!*dst)
1119         *dst = ip6_route_output_flags(net, sk, fl6, flags);
1120 
1121     err = (*dst)->error;
1122     if (err)
1123         goto out_err_release;
1124 
1125 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1126     /*
1127      * Here, if the dst entry we've looked up
1128      * has a neighbour entry that is in the INCOMPLETE
1129      * state and the src address from the flow is
1130      * marked as OPTIMISTIC, we release the found
1131      * dst entry and replace it with the
1132      * dst entry of the nexthop router.
1133      */
1134     rt = (struct rt6_info *) *dst;
1135     rcu_read_lock_bh();
1136     n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1137                       rt6_nexthop(rt, &fl6->daddr));
1138     err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1139     rcu_read_unlock_bh();
1140 
1141     if (err) {
1142         struct inet6_ifaddr *ifp;
1143         struct flowi6 fl_gw6;
1144         int redirect;
1145 
1146         ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1147                       (*dst)->dev, 1);
1148 
1149         redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1150         if (ifp)
1151             in6_ifa_put(ifp);
1152 
1153         if (redirect) {
1154             /*
1155              * We need to get the dst entry for the
1156              * default router instead
1157              */
1158             dst_release(*dst);
1159             memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1160             memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1161             *dst = ip6_route_output(net, sk, &fl_gw6);
1162             err = (*dst)->error;
1163             if (err)
1164                 goto out_err_release;
1165         }
1166     }
1167 #endif
1168     if (ipv6_addr_v4mapped(&fl6->saddr) &&
1169         !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1170         err = -EAFNOSUPPORT;
1171         goto out_err_release;
1172     }
1173 
1174     return 0;
1175 
1176 out_err_release:
1177     dst_release(*dst);
1178     *dst = NULL;
1179 
1180     if (err == -ENETUNREACH)
1181         IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1182     return err;
1183 }
1184 
1185 /**
1186  *  ip6_dst_lookup - perform route lookup on flow
1187  *  @net: Network namespace to perform lookup in
1188  *  @sk: socket which provides route info
1189  *  @dst: pointer to dst_entry * for result
1190  *  @fl6: flow to lookup
1191  *
1192  *  This function performs a route lookup on the given flow.
1193  *
1194  *  It returns zero on success, or a standard errno code on error.
1195  */
1196 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1197            struct flowi6 *fl6)
1198 {
1199     *dst = NULL;
1200     return ip6_dst_lookup_tail(net, sk, dst, fl6);
1201 }
1202 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1203 
1204 /**
1205  *  ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1206  *  @net: Network namespace to perform lookup in
1207  *  @sk: socket which provides route info
1208  *  @fl6: flow to lookup
1209  *  @final_dst: final destination address for ipsec lookup
1210  *
1211  *  This function performs a route lookup on the given flow.
1212  *
1213  *  It returns a valid dst pointer on success, or a pointer encoded
1214  *  error code.
1215  */
1216 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1217                       const struct in6_addr *final_dst)
1218 {
1219     struct dst_entry *dst = NULL;
1220     int err;
1221 
1222     err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1223     if (err)
1224         return ERR_PTR(err);
1225     if (final_dst)
1226         fl6->daddr = *final_dst;
1227 
1228     return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1229 }
1230 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1231 
1232 /**
1233  *  ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1234  *  @sk: socket which provides the dst cache and route info
1235  *  @fl6: flow to lookup
1236  *  @final_dst: final destination address for ipsec lookup
1237  *  @connected: whether @sk is connected or not
1238  *
1239  *  This function performs a route lookup on the given flow with the
1240  *  possibility of using the cached route in the socket if it is valid.
1241  *  It will take the socket dst lock when operating on the dst cache.
1242  *  As a result, this function can only be used in process context.
1243  *
1244  *  In addition, for a connected socket, cache the dst in the socket
1245  *  if the current cache is not valid.
1246  *
1247  *  It returns a valid dst pointer on success, or a pointer encoded
1248  *  error code.
1249  */
1250 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1251                      const struct in6_addr *final_dst,
1252                      bool connected)
1253 {
1254     struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1255 
1256     dst = ip6_sk_dst_check(sk, dst, fl6);
1257     if (dst)
1258         return dst;
1259 
1260     dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1261     if (connected && !IS_ERR(dst))
1262         ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1263 
1264     return dst;
1265 }
1266 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1267 
1268 /**
1269  *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
1270  *      @skb: Packet for which lookup is done
1271  *      @dev: Tunnel device
1272  *      @net: Network namespace of tunnel device
1273  *      @sock: Socket which provides route info
1274  *      @saddr: Memory to store the src ip address
1275  *      @info: Tunnel information
1276  *      @protocol: IP protocol
1277  *      @use_cache: Flag to enable cache usage
1278  *      This function performs a route lookup on a tunnel
1279  *
1280  *      It returns a valid dst pointer and stores src address to be used in
1281  *      tunnel in param saddr on success, else a pointer encoded error code.
1282  */
1283 
1284 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1285                     struct net_device *dev,
1286                     struct net *net,
1287                     struct socket *sock,
1288                     struct in6_addr *saddr,
1289                     const struct ip_tunnel_info *info,
1290                     u8 protocol,
1291                     bool use_cache)
1292 {
1293     struct dst_entry *dst = NULL;
1294 #ifdef CONFIG_DST_CACHE
1295     struct dst_cache *dst_cache;
1296 #endif
1297     struct flowi6 fl6;
1298     __u8 prio;
1299 
1300 #ifdef CONFIG_DST_CACHE
1301     dst_cache = (struct dst_cache *)&info->dst_cache;
1302     if (use_cache) {
1303         dst = dst_cache_get_ip6(dst_cache, saddr);
1304         if (dst)
1305             return dst;
1306     }
1307 #endif
1308     memset(&fl6, 0, sizeof(fl6));
1309     fl6.flowi6_mark = skb->mark;
1310     fl6.flowi6_proto = protocol;
1311     fl6.daddr = info->key.u.ipv6.dst;
1312     fl6.saddr = info->key.u.ipv6.src;
1313     prio = info->key.tos;
1314     fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label);
1315 
1316     dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1317                           NULL);
1318     if (IS_ERR(dst)) {
1319         netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1320         return ERR_PTR(-ENETUNREACH);
1321     }
1322     if (dst->dev == dev) { /* is this necessary? */
1323         netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1324         dst_release(dst);
1325         return ERR_PTR(-ELOOP);
1326     }
1327 #ifdef CONFIG_DST_CACHE
1328     if (use_cache)
1329         dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1330 #endif
1331     *saddr = fl6.saddr;
1332     return dst;
1333 }
1334 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
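
/* Note (editorial): with use_cache set, the tunnel's dst_cache
 * short-circuits the route lookup entirely; on a fresh lookup the
 * chosen source address is returned through @saddr and stored in the
 * cache together with the dst.
 */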
1335 
1336 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1337                            gfp_t gfp)
1338 {
1339     return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1340 }
1341 
1342 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1343                         gfp_t gfp)
1344 {
1345     return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1346 }
1347 
1348 static void ip6_append_data_mtu(unsigned int *mtu,
1349                 int *maxfraglen,
1350                 unsigned int fragheaderlen,
1351                 struct sk_buff *skb,
1352                 struct rt6_info *rt,
1353                 unsigned int orig_mtu)
1354 {
1355     if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1356         if (!skb) {
1357             /* first fragment, reserve header_len */
1358             *mtu = orig_mtu - rt->dst.header_len;
1359 
1360         } else {
1361             /*
1362              * this fragment is not the first; the header
1363              * space is regarded as data space.
1364              */
1365             *mtu = orig_mtu;
1366         }
1367         *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1368                   + fragheaderlen - sizeof(struct frag_hdr);
1369     }
1370 }
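
/* Worked example (editorial): maxfraglen caps skb->len so that, once
 * the 8-byte fragment header is inserted, the fragment fits the MTU
 * and the fragmentable part stays 8-byte aligned. With mtu = 1500 and
 * fragheaderlen = 40:
 *
 *      ((1500 - 40) & ~7) + 40 - 8 = 1456 + 32 = 1488
 *
 * and 1488 + 8 = 1496 <= 1500.
 */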
1371 
1372 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1373               struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1374               struct rt6_info *rt)
1375 {
1376     struct ipv6_pinfo *np = inet6_sk(sk);
1377     unsigned int mtu;
1378     struct ipv6_txoptions *nopt, *opt = ipc6->opt;
1379 
1380     /* callers pass dst together with a reference, set it first so
1381      * ip6_cork_release() can put it down even in case of an error.
1382      */
1383     cork->base.dst = &rt->dst;
1384 
1385     /*
1386      * setup for corking
1387      */
1388     if (opt) {
1389         if (WARN_ON(v6_cork->opt))
1390             return -EINVAL;
1391 
1392         nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1393         if (unlikely(!nopt))
1394             return -ENOBUFS;
1395 
1396         nopt->tot_len = sizeof(*opt);
1397         nopt->opt_flen = opt->opt_flen;
1398         nopt->opt_nflen = opt->opt_nflen;
1399 
1400         nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
1401         if (opt->dst0opt && !nopt->dst0opt)
1402             return -ENOBUFS;
1403 
1404         nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
1405         if (opt->dst1opt && !nopt->dst1opt)
1406             return -ENOBUFS;
1407 
1408         nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
1409         if (opt->hopopt && !nopt->hopopt)
1410             return -ENOBUFS;
1411 
1412         nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
1413         if (opt->srcrt && !nopt->srcrt)
1414             return -ENOBUFS;
1415 
1416         /* need source address above --miyazawa */
1417     }
1418     v6_cork->hop_limit = ipc6->hlimit;
1419     v6_cork->tclass = ipc6->tclass;
1420     if (rt->dst.flags & DST_XFRM_TUNNEL)
1421         mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1422               READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1423     else
1424         mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1425             READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1426     if (np->frag_size < mtu) {
1427         if (np->frag_size)
1428             mtu = np->frag_size;
1429     }
1430     cork->base.fragsize = mtu;
1431     cork->base.gso_size = ipc6->gso_size;
1432     cork->base.tx_flags = 0;
1433     cork->base.mark = ipc6->sockc.mark;
1434     sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1435 
1436     if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1437         cork->base.flags |= IPCORK_ALLFRAG;
1438     cork->base.length = 0;
1439 
1440     cork->base.transmit_time = ipc6->sockc.transmit_time;
1441 
1442     return 0;
1443 }
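
/* Note (editorial): ip6_setup_cork() deep-copies the tx options using
 * the socket's allocation flags so the cork can outlive the caller's
 * ipc6->opt; the dst is stored first so that ip6_cork_release() can
 * drop the reference even when a later duplication step fails.
 */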
1444 
1445 static int __ip6_append_data(struct sock *sk,
1446                  struct sk_buff_head *queue,
1447                  struct inet_cork_full *cork_full,
1448                  struct inet6_cork *v6_cork,
1449                  struct page_frag *pfrag,
1450                  int getfrag(void *from, char *to, int offset,
1451                      int len, int odd, struct sk_buff *skb),
1452                  void *from, size_t length, int transhdrlen,
1453                  unsigned int flags, struct ipcm6_cookie *ipc6)
1454 {
1455     struct sk_buff *skb, *skb_prev = NULL;
1456     struct inet_cork *cork = &cork_full->base;
1457     struct flowi6 *fl6 = &cork_full->fl.u.ip6;
1458     unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1459     struct ubuf_info *uarg = NULL;
1460     int exthdrlen = 0;
1461     int dst_exthdrlen = 0;
1462     int hh_len;
1463     int copy;
1464     int err;
1465     int offset = 0;
1466     bool zc = false;
1467     u32 tskey = 0;
1468     struct rt6_info *rt = (struct rt6_info *)cork->dst;
1469     struct ipv6_txoptions *opt = v6_cork->opt;
1470     int csummode = CHECKSUM_NONE;
1471     unsigned int maxnonfragsize, headersize;
1472     unsigned int wmem_alloc_delta = 0;
1473     bool paged, extra_uref = false;
1474 
1475     skb = skb_peek_tail(queue);
1476     if (!skb) {
1477         exthdrlen = opt ? opt->opt_flen : 0;
1478         dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1479     }
1480 
1481     paged = !!cork->gso_size;
1482     mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1483     orig_mtu = mtu;
1484 
1485     if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1486         sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1487         tskey = atomic_inc_return(&sk->sk_tskey) - 1;
1488 
1489     hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1490 
1491     fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1492             (opt ? opt->opt_nflen : 0);
1493 
1494     headersize = sizeof(struct ipv6hdr) +
1495              (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1496              (dst_allfrag(&rt->dst) ?
1497               sizeof(struct frag_hdr) : 0) +
1498              rt->rt6i_nfheader_len;
1499 
1500     if (mtu <= fragheaderlen ||
1501         ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1502         goto emsgsize;
1503 
1504     maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1505              sizeof(struct frag_hdr);
1506 
1507     /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1508      * the first fragment
1509      */
1510     if (headersize + transhdrlen > mtu)
1511         goto emsgsize;
1512 
1513     if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1514         (sk->sk_protocol == IPPROTO_UDP ||
1515          sk->sk_protocol == IPPROTO_ICMPV6 ||
1516          sk->sk_protocol == IPPROTO_RAW)) {
1517         ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1518                 sizeof(struct ipv6hdr));
1519         goto emsgsize;
1520     }
1521 
1522     if (ip6_sk_ignore_df(sk))
1523         maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1524     else
1525         maxnonfragsize = mtu;
1526 
1527     if (cork->length + length > maxnonfragsize - headersize) {
1528 emsgsize:
1529         pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1530         ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1531         return -EMSGSIZE;
1532     }
1533 
1534     /* CHECKSUM_PARTIAL only with no extension headers and when
1535      * we are not going to fragment
1536      */
1537     if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1538         headersize == sizeof(struct ipv6hdr) &&
1539         length <= mtu - headersize &&
1540         (!(flags & MSG_MORE) || cork->gso_size) &&
1541         rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1542         csummode = CHECKSUM_PARTIAL;
1543 
1544     if ((flags & MSG_ZEROCOPY) && length) {
1545         struct msghdr *msg = from;
1546 
1547         if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
1548             if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
1549                 return -EINVAL;
1550 
1551             /* Leave uarg NULL if can't zerocopy, callers should
1552              * be able to handle it.
1553              */
1554             if ((rt->dst.dev->features & NETIF_F_SG) &&
1555                 csummode == CHECKSUM_PARTIAL) {
1556                 paged = true;
1557                 zc = true;
1558                 uarg = msg->msg_ubuf;
1559             }
1560         } else if (sock_flag(sk, SOCK_ZEROCOPY)) {
1561             uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1562             if (!uarg)
1563                 return -ENOBUFS;
1564             extra_uref = !skb_zcopy(skb);   /* only ref on new uarg */
1565             if (rt->dst.dev->features & NETIF_F_SG &&
1566                 csummode == CHECKSUM_PARTIAL) {
1567                 paged = true;
1568                 zc = true;
1569             } else {
1570                 uarg->zerocopy = 0;
1571                 skb_zcopy_set(skb, uarg, &extra_uref);
1572             }
1573         }
1574     }
1575 
1576     /*
1577      * Let's try using as much space as possible.
1578      * Use MTU if total length of the message fits into the MTU.
1579      * Otherwise, we need to reserve fragment header and
1580      * fragment alignment (= 8-15 octects, in total).
1581      *
1582      * Note that we may need to "move" the data from the tail
1583      * of the buffer to the new fragment when we split
1584      * the message.
1585      *
1586      * FIXME: It may be fragmented into multiple chunks
1587      *        at once if non-fragmentable extension headers
1588      *        are too large.
1589      * --yoshfuji
1590      */
1591 
1592     cork->length += length;
1593     if (!skb)
1594         goto alloc_new_skb;
1595 
1596     while (length > 0) {
1597         /* Check if the remaining data fits into current packet. */
1598         copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1599         if (copy < length)
1600             copy = maxfraglen - skb->len;
1601 
1602         if (copy <= 0) {
1603             char *data;
1604             unsigned int datalen;
1605             unsigned int fraglen;
1606             unsigned int fraggap;
1607             unsigned int alloclen, alloc_extra;
1608             unsigned int pagedlen;
1609 alloc_new_skb:
1610             /* There's no room in the current skb */
1611             if (skb)
1612                 fraggap = skb->len - maxfraglen;
1613             else
1614                 fraggap = 0;
1615             /* update mtu and maxfraglen if necessary */
1616             if (!skb || !skb_prev)
1617                 ip6_append_data_mtu(&mtu, &maxfraglen,
1618                             fragheaderlen, skb, rt,
1619                             orig_mtu);
1620 
1621             skb_prev = skb;
1622 
1623             /*
1624              * If remaining data exceeds the mtu,
1625              * we know we need more fragment(s).
1626              */
1627             datalen = length + fraggap;
1628 
1629             if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1630                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1631             fraglen = datalen + fragheaderlen;
1632             pagedlen = 0;
1633 
1634             alloc_extra = hh_len;
1635             alloc_extra += dst_exthdrlen;
1636             alloc_extra += rt->dst.trailer_len;
1637 
1638             /* We just reserve space for the fragment header.
1639              * Note: this may be an overallocation if the message
1640              * (without MSG_MORE) fits into the MTU.
1641              */
1642             alloc_extra += sizeof(struct frag_hdr);
1643 
1644             if ((flags & MSG_MORE) &&
1645                 !(rt->dst.dev->features&NETIF_F_SG))
1646                 alloclen = mtu;
1647             else if (!paged &&
1648                  (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1649                   !(rt->dst.dev->features & NETIF_F_SG)))
1650                 alloclen = fraglen;
1651             else if (!zc) {
1652                 alloclen = min_t(int, fraglen, MAX_HEADER);
1653                 pagedlen = fraglen - alloclen;
1654             } else {
1655                 alloclen = fragheaderlen + transhdrlen;
1656                 pagedlen = datalen - transhdrlen;
1657             }
1658             alloclen += alloc_extra;
1659 
1660             if (datalen != length + fraggap) {
1661                 /*
1662                  * This is not the last fragment; the trailer
1663                  * space is regarded as data space.
1664                  */
1665                 datalen += rt->dst.trailer_len;
1666             }
1667 
1668             fraglen = datalen + fragheaderlen;
1669 
1670             copy = datalen - transhdrlen - fraggap - pagedlen;
1671             if (copy < 0) {
1672                 err = -EINVAL;
1673                 goto error;
1674             }
1675             if (transhdrlen) {
1676                 skb = sock_alloc_send_skb(sk, alloclen,
1677                         (flags & MSG_DONTWAIT), &err);
1678             } else {
1679                 skb = NULL;
1680                 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1681                     2 * sk->sk_sndbuf)
1682                     skb = alloc_skb(alloclen,
1683                             sk->sk_allocation);
1684                 if (unlikely(!skb))
1685                     err = -ENOBUFS;
1686             }
1687             if (!skb)
1688                 goto error;
1689             /*
1690              *  Fill in the control structures
1691              */
1692             skb->protocol = htons(ETH_P_IPV6);
1693             skb->ip_summed = csummode;
1694             skb->csum = 0;
1695             /* reserve headroom for the fragment header and IPsec headers */
1696             skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1697                     dst_exthdrlen);
1698 
1699             /*
1700              *  Find where to start putting bytes
1701              */
1702             data = skb_put(skb, fraglen - pagedlen);
1703             skb_set_network_header(skb, exthdrlen);
1704             data += fragheaderlen;
1705             skb->transport_header = (skb->network_header +
1706                          fragheaderlen);
1707             if (fraggap) {
1708                 skb->csum = skb_copy_and_csum_bits(
1709                     skb_prev, maxfraglen,
1710                     data + transhdrlen, fraggap);
1711                 skb_prev->csum = csum_sub(skb_prev->csum,
1712                               skb->csum);
1713                 data += fraggap;
1714                 pskb_trim_unique(skb_prev, maxfraglen);
1715             }
1716             if (copy > 0 &&
1717                 getfrag(from, data + transhdrlen, offset,
1718                     copy, fraggap, skb) < 0) {
1719                 err = -EFAULT;
1720                 kfree_skb(skb);
1721                 goto error;
1722             }
1723 
1724             offset += copy;
1725             length -= copy + transhdrlen;
1726             transhdrlen = 0;
1727             exthdrlen = 0;
1728             dst_exthdrlen = 0;
1729 
1730             /* Only the initial fragment is timestamped */
1731             skb_shinfo(skb)->tx_flags = cork->tx_flags;
1732             cork->tx_flags = 0;
1733             skb_shinfo(skb)->tskey = tskey;
1734             tskey = 0;
1735             skb_zcopy_set(skb, uarg, &extra_uref);
1736 
1737             if ((flags & MSG_CONFIRM) && !skb_prev)
1738                 skb_set_dst_pending_confirm(skb, 1);
1739 
1740             /*
1741              * Put the packet on the pending queue
1742              */
1743             if (!skb->destructor) {
1744                 skb->destructor = sock_wfree;
1745                 skb->sk = sk;
1746                 wmem_alloc_delta += skb->truesize;
1747             }
1748             __skb_queue_tail(queue, skb);
1749             continue;
1750         }
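        /* Editor's note (summary of the branch above): each newly
         * allocated skb reserves hh_len + sizeof(struct frag_hdr) +
         * dst_exthdrlen of headroom and lays out
         * [ext hdrs][transport hdr][data] linearly, leaving any
         * pagedlen overflow for the copy path below to attach as page
         * frags.  fraggap bytes are pulled back from the previous skb
         * so that every fragment but the last keeps its 8-octet
         * alignment.
         */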
1751 
1752         if (copy > length)
1753             copy = length;
1754 
1755         if (!(rt->dst.dev->features&NETIF_F_SG) &&
1756             skb_tailroom(skb) >= copy) {
1757             unsigned int off;
1758 
1759             off = skb->len;
1760             if (getfrag(from, skb_put(skb, copy),
1761                         offset, copy, off, skb) < 0) {
1762                 __skb_trim(skb, off);
1763                 err = -EFAULT;
1764                 goto error;
1765             }
1766         } else if (!zc) {
1767             int i = skb_shinfo(skb)->nr_frags;
1768 
1769             err = -ENOMEM;
1770             if (!sk_page_frag_refill(sk, pfrag))
1771                 goto error;
1772 
1773             skb_zcopy_downgrade_managed(skb);
1774             if (!skb_can_coalesce(skb, i, pfrag->page,
1775                           pfrag->offset)) {
1776                 err = -EMSGSIZE;
1777                 if (i == MAX_SKB_FRAGS)
1778                     goto error;
1779 
1780                 __skb_fill_page_desc(skb, i, pfrag->page,
1781                              pfrag->offset, 0);
1782                 skb_shinfo(skb)->nr_frags = ++i;
1783                 get_page(pfrag->page);
1784             }
1785             copy = min_t(int, copy, pfrag->size - pfrag->offset);
1786             if (getfrag(from,
1787                     page_address(pfrag->page) + pfrag->offset,
1788                     offset, copy, skb->len, skb) < 0)
1789                 goto error_efault;
1790 
1791             pfrag->offset += copy;
1792             skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1793             skb->len += copy;
1794             skb->data_len += copy;
1795             skb->truesize += copy;
1796             wmem_alloc_delta += copy;
1797         } else {
1798             err = skb_zerocopy_iter_dgram(skb, from, copy);
1799             if (err < 0)
1800                 goto error;
1801         }
1802         offset += copy;
1803         length -= copy;
1804     }
1805 
1806     if (wmem_alloc_delta)
1807         refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1808     return 0;
1809 
1810 error_efault:
1811     err = -EFAULT;
1812 error:
1813     net_zcopy_put_abort(uarg, extra_uref);
1814     cork->length -= length;
1815     IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1816     refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1817     return err;
1818 }
1819 
1820 int ip6_append_data(struct sock *sk,
1821             int getfrag(void *from, char *to, int offset, int len,
1822                 int odd, struct sk_buff *skb),
1823             void *from, size_t length, int transhdrlen,
1824             struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1825             struct rt6_info *rt, unsigned int flags)
1826 {
1827     struct inet_sock *inet = inet_sk(sk);
1828     struct ipv6_pinfo *np = inet6_sk(sk);
1829     int exthdrlen;
1830     int err;
1831 
1832     if (flags&MSG_PROBE)
1833         return 0;
1834     if (skb_queue_empty(&sk->sk_write_queue)) {
1835         /*
1836          * setup for corking
1837          */
1838         dst_hold(&rt->dst);
1839         err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1840                      ipc6, rt);
1841         if (err)
1842             return err;
1843 
1844         inet->cork.fl.u.ip6 = *fl6;
1845         exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1846         length += exthdrlen;
1847         transhdrlen += exthdrlen;
1848     } else {
1849         transhdrlen = 0;
1850     }
1851 
1852     return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
1853                  &np->cork, sk_page_frag(sk), getfrag,
1854                  from, length, transhdrlen, flags, ipc6);
1855 }
1856 EXPORT_SYMBOL_GPL(ip6_append_data);
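/* Editor's note (hedged sketch of the calling convention, modelled on
 * udp_v6_sendmsg; locking and error handling heavily simplified):
 *
 *     lock_sock(sk);
 *     err = ip6_append_data(sk, getfrag, msg, ulen,
 *                           sizeof(struct udphdr), &ipc6, &fl6,
 *                           rt, msg->msg_flags);
 *     if (err)
 *         ip6_flush_pending_frames(sk);
 *     else if (!(msg->msg_flags & MSG_MORE))
 *         err = ip6_push_pending_frames(sk);
 *     release_sock(sk);
 *
 * Data accumulates on sk->sk_write_queue across MSG_MORE sends and is
 * only turned into (possibly fragmented) packets when the cork is
 * released.  The real UDP code pushes through a protocol-specific
 * helper that prepends the UDP header first.
 */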
1857 
1858 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
1859 {
1860     struct dst_entry *dst = cork->base.dst;
1861 
1862     cork->base.dst = NULL;
1863     cork->base.flags &= ~IPCORK_ALLFRAG;
1864     skb_dst_set(skb, dst);
1865 }
1866 
1867 static void ip6_cork_release(struct inet_cork_full *cork,
1868                  struct inet6_cork *v6_cork)
1869 {
1870     if (v6_cork->opt) {
1871         struct ipv6_txoptions *opt = v6_cork->opt;
1872 
1873         kfree(opt->dst0opt);
1874         kfree(opt->dst1opt);
1875         kfree(opt->hopopt);
1876         kfree(opt->srcrt);
1877         kfree(opt);
1878         v6_cork->opt = NULL;
1879     }
1880 
1881     if (cork->base.dst) {
1882         dst_release(cork->base.dst);
1883         cork->base.dst = NULL;
1884         cork->base.flags &= ~IPCORK_ALLFRAG;
1885     }
1886 }
1887 
1888 struct sk_buff *__ip6_make_skb(struct sock *sk,
1889                    struct sk_buff_head *queue,
1890                    struct inet_cork_full *cork,
1891                    struct inet6_cork *v6_cork)
1892 {
1893     struct sk_buff *skb, *tmp_skb;
1894     struct sk_buff **tail_skb;
1895     struct in6_addr *final_dst;
1896     struct ipv6_pinfo *np = inet6_sk(sk);
1897     struct net *net = sock_net(sk);
1898     struct ipv6hdr *hdr;
1899     struct ipv6_txoptions *opt = v6_cork->opt;
1900     struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1901     struct flowi6 *fl6 = &cork->fl.u.ip6;
1902     unsigned char proto = fl6->flowi6_proto;
1903 
1904     skb = __skb_dequeue(queue);
1905     if (!skb)
1906         goto out;
1907     tail_skb = &(skb_shinfo(skb)->frag_list);
1908 
1909     /* move skb->data to the IP header, away from the ext header */
1910     if (skb->data < skb_network_header(skb))
1911         __skb_pull(skb, skb_network_offset(skb));
1912     while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1913         __skb_pull(tmp_skb, skb_network_header_len(skb));
1914         *tail_skb = tmp_skb;
1915         tail_skb = &(tmp_skb->next);
1916         skb->len += tmp_skb->len;
1917         skb->data_len += tmp_skb->len;
1918         skb->truesize += tmp_skb->truesize;
1919         tmp_skb->destructor = NULL;
1920         tmp_skb->sk = NULL;
1921     }
1922 
1923     /* Allow local fragmentation. */
1924     skb->ignore_df = ip6_sk_ignore_df(sk);
1925     __skb_pull(skb, skb_network_header_len(skb));
1926 
1927     final_dst = &fl6->daddr;
1928     if (opt && opt->opt_flen)
1929         ipv6_push_frag_opts(skb, opt, &proto);
1930     if (opt && opt->opt_nflen)
1931         ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1932 
1933     skb_push(skb, sizeof(struct ipv6hdr));
1934     skb_reset_network_header(skb);
1935     hdr = ipv6_hdr(skb);
1936 
1937     ip6_flow_hdr(hdr, v6_cork->tclass,
1938              ip6_make_flowlabel(net, skb, fl6->flowlabel,
1939                     ip6_autoflowlabel(net, np), fl6));
1940     hdr->hop_limit = v6_cork->hop_limit;
1941     hdr->nexthdr = proto;
1942     hdr->saddr = fl6->saddr;
1943     hdr->daddr = *final_dst;
1944 
1945     skb->priority = sk->sk_priority;
1946     skb->mark = cork->base.mark;
1947     skb->tstamp = cork->base.transmit_time;
1948 
1949     ip6_cork_steal_dst(skb, cork);
1950     IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1951     if (proto == IPPROTO_ICMPV6) {
1952         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1953 
1954         ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1955         ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1956     }
1957 
1958     ip6_cork_release(cork, v6_cork);
1959 out:
1960     return skb;
1961 }
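/* Editor's note: __ip6_make_skb() collapses the pending queue into a
 * single skb whose frag_list chains the remaining queue members.
 * ip6_fragment() can later use that chain as ready-made fragments (its
 * "fast path"), assuming nothing reshapes the skb in between.
 */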
1962 
1963 int ip6_send_skb(struct sk_buff *skb)
1964 {
1965     struct net *net = sock_net(skb->sk);
1966     struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1967     int err;
1968 
1969     err = ip6_local_out(net, skb->sk, skb);
1970     if (err) {
1971         if (err > 0)
1972             err = net_xmit_errno(err);
1973         if (err)
1974             IP6_INC_STATS(net, rt->rt6i_idev,
1975                       IPSTATS_MIB_OUTDISCARDS);
1976     }
1977 
1978     return err;
1979 }
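/* Editor's note: net_xmit_errno() maps qdisc return codes to errno
 * values; in current kernels it is effectively
 *
 *     #define net_xmit_errno(e) ((e) != NET_XMIT_CN ? -ENOBUFS : 0)
 *
 * so congestion notification (NET_XMIT_CN) is reported as success and
 * only genuine drops are counted as IPSTATS_MIB_OUTDISCARDS above.
 */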
1980 
1981 int ip6_push_pending_frames(struct sock *sk)
1982 {
1983     struct sk_buff *skb;
1984 
1985     skb = ip6_finish_skb(sk);
1986     if (!skb)
1987         return 0;
1988 
1989     return ip6_send_skb(skb);
1990 }
1991 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1992 
1993 static void __ip6_flush_pending_frames(struct sock *sk,
1994                        struct sk_buff_head *queue,
1995                        struct inet_cork_full *cork,
1996                        struct inet6_cork *v6_cork)
1997 {
1998     struct sk_buff *skb;
1999 
2000     while ((skb = __skb_dequeue_tail(queue)) != NULL) {
2001         if (skb_dst(skb))
2002             IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
2003                       IPSTATS_MIB_OUTDISCARDS);
2004         kfree_skb(skb);
2005     }
2006 
2007     ip6_cork_release(cork, v6_cork);
2008 }
2009 
2010 void ip6_flush_pending_frames(struct sock *sk)
2011 {
2012     __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
2013                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
2014 }
2015 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
2016 
2017 struct sk_buff *ip6_make_skb(struct sock *sk,
2018                  int getfrag(void *from, char *to, int offset,
2019                      int len, int odd, struct sk_buff *skb),
2020                  void *from, size_t length, int transhdrlen,
2021                  struct ipcm6_cookie *ipc6, struct rt6_info *rt,
2022                  unsigned int flags, struct inet_cork_full *cork)
2023 {
2024     struct inet6_cork v6_cork;
2025     struct sk_buff_head queue;
2026     int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
2027     int err;
2028 
2029     if (flags & MSG_PROBE) {
2030         dst_release(&rt->dst);
2031         return NULL;
2032     }
2033 
2034     __skb_queue_head_init(&queue);
2035 
2036     cork->base.flags = 0;
2037     cork->base.addr = 0;
2038     cork->base.opt = NULL;
2039     v6_cork.opt = NULL;
2040     err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
2041     if (err) {
2042         ip6_cork_release(cork, &v6_cork);
2043         return ERR_PTR(err);
2044     }
2045     if (ipc6->dontfrag < 0)
2046         ipc6->dontfrag = inet6_sk(sk)->dontfrag;
2047 
2048     err = __ip6_append_data(sk, &queue, cork, &v6_cork,
2049                 &current->task_frag, getfrag, from,
2050                 length + exthdrlen, transhdrlen + exthdrlen,
2051                 flags, ipc6);
2052     if (err) {
2053         __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2054         return ERR_PTR(err);
2055     }
2056 
2057     return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2058 }
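/* Editor's note (hedged): ip6_make_skb() is the uncorked counterpart
 * of ip6_append_data() + __ip6_make_skb().  It drives the same append
 * machinery against an on-stack queue and a caller-supplied cork, so a
 * sender that is not corking (e.g. UDP without MSG_MORE) can build the
 * whole datagram in one call and hand the result to ip6_send_skb()
 * without touching sk->sk_write_queue.
 */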