/*
 * PF_PACKET protocol family: raw packet sockets (SOCK_RAW / SOCK_DGRAM and
 * the legacy SOCK_PACKET interface) plus the TPACKET_V1/V2/V3 memory-mapped
 * receive and transmit rings and fanout groups.
 */
0049 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
0050
0051 #include <linux/ethtool.h>
0052 #include <linux/filter.h>
0053 #include <linux/types.h>
0054 #include <linux/mm.h>
0055 #include <linux/capability.h>
0056 #include <linux/fcntl.h>
0057 #include <linux/socket.h>
0058 #include <linux/in.h>
0059 #include <linux/inet.h>
0060 #include <linux/netdevice.h>
0061 #include <linux/if_packet.h>
0062 #include <linux/wireless.h>
0063 #include <linux/kernel.h>
0064 #include <linux/kmod.h>
0065 #include <linux/slab.h>
0066 #include <linux/vmalloc.h>
0067 #include <net/net_namespace.h>
0068 #include <net/ip.h>
0069 #include <net/protocol.h>
0070 #include <linux/skbuff.h>
0071 #include <net/sock.h>
0072 #include <linux/errno.h>
0073 #include <linux/timer.h>
0074 #include <linux/uaccess.h>
0075 #include <asm/ioctls.h>
0076 #include <asm/page.h>
0077 #include <asm/cacheflush.h>
0078 #include <asm/io.h>
0079 #include <linux/proc_fs.h>
0080 #include <linux/seq_file.h>
0081 #include <linux/poll.h>
0082 #include <linux/module.h>
0083 #include <linux/init.h>
0084 #include <linux/mutex.h>
0085 #include <linux/if_vlan.h>
0086 #include <linux/virtio_net.h>
0087 #include <linux/errqueue.h>
0088 #include <linux/net_tstamp.h>
0089 #include <linux/percpu.h>
0090 #ifdef CONFIG_INET
0091 #include <net/inet_common.h>
0092 #endif
0093 #include <linux/bpf.h>
0094 #include <net/compat.h>
0095 #include <linux/netfilter_netdev.h>
0096
0097 #include "internal.h"
0098
/*
 * Link-layer header handling: SOCK_RAW packet sockets get the link-layer
 * header pushed back in front of the data before a frame is queued (see
 * packet_rcv() and tpacket_rcv()), while SOCK_DGRAM sockets receive only the
 * payload above it together with a sockaddr_ll describing the interface and
 * hardware address.  On transmit the same split applies: SOCK_RAW callers
 * must supply the complete link-layer frame themselves.
 */

0159 struct packet_mreq_max {
0160 int mr_ifindex;
0161 unsigned short mr_type;
0162 unsigned short mr_alen;
0163 unsigned char mr_address[MAX_ADDR_LEN];
0164 };
0165
0166 union tpacket_uhdr {
0167 struct tpacket_hdr *h1;
0168 struct tpacket2_hdr *h2;
0169 struct tpacket3_hdr *h3;
0170 void *raw;
0171 };
0172
0173 static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
0174 int closing, int tx_ring);
0175
0176 #define V3_ALIGNMENT (8)
0177
0178 #define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
0179
0180 #define BLK_PLUS_PRIV(sz_of_priv) \
0181 (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
0182
0183 #define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
0184 #define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
0185 #define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
0186 #define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
0187 #define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
0188 #define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
0189
0190 struct packet_sock;
0191 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
0192 struct packet_type *pt, struct net_device *orig_dev);
0193
0194 static void *packet_previous_frame(struct packet_sock *po,
0195 struct packet_ring_buffer *rb,
0196 int status);
0197 static void packet_increment_head(struct packet_ring_buffer *buff);
0198 static int prb_curr_blk_in_use(struct tpacket_block_desc *);
0199 static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
0200 struct packet_sock *);
0201 static void prb_retire_current_block(struct tpacket_kbdq_core *,
0202 struct packet_sock *, unsigned int status);
0203 static int prb_queue_frozen(struct tpacket_kbdq_core *);
0204 static void prb_open_block(struct tpacket_kbdq_core *,
0205 struct tpacket_block_desc *);
0206 static void prb_retire_rx_blk_timer_expired(struct timer_list *);
0207 static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
0208 static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
0209 static void prb_clear_rxhash(struct tpacket_kbdq_core *,
0210 struct tpacket3_hdr *);
0211 static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
0212 struct tpacket3_hdr *);
0213 static void packet_flush_mclist(struct sock *sk);
0214 static u16 packet_pick_tx_queue(struct sk_buff *skb);
0215
0216 struct packet_skb_cb {
0217 union {
0218 struct sockaddr_pkt pkt;
0219 union {
			/*
			 * origlen shares storage with the first bytes of ll
			 * (sll_family/sll_protocol): packet_rcv() stashes the
			 * original skb length here and it is consumed before
			 * those fields are filled in for userspace.
			 */
0224 unsigned int origlen;
0225 struct sockaddr_ll ll;
0226 };
0227 } sa;
0228 };
0229
0230 #define vio_le() virtio_legacy_is_little_endian()
0231
0232 #define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
0233
0234 #define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
0235 #define GET_PBLOCK_DESC(x, bid) \
0236 ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
0237 #define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
0238 ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
0239 #define GET_NEXT_PRB_BLK_NUM(x) \
0240 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
0241 ((x)->kactive_blk_num+1) : 0)
0242
0243 static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
0244 static void __fanout_link(struct sock *sk, struct packet_sock *po);
0245
0246 #ifdef CONFIG_NETFILTER_EGRESS
0247 static noinline struct sk_buff *nf_hook_direct_egress(struct sk_buff *skb)
0248 {
0249 struct sk_buff *next, *head = NULL, *tail;
0250 int rc;
0251
0252 rcu_read_lock();
0253 for (; skb != NULL; skb = next) {
0254 next = skb->next;
0255 skb_mark_not_on_list(skb);
0256
0257 if (!nf_hook_egress(skb, &rc, skb->dev))
0258 continue;
0259
0260 if (!head)
0261 head = skb;
0262 else
0263 tail->next = skb;
0264
0265 tail = skb;
0266 }
0267 rcu_read_unlock();
0268
0269 return head;
0270 }
0271 #endif
0272
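/*
 * Transmit a frame straight on the queue picked by packet_pick_tx_queue(),
 * bypassing the qdisc layer; used when the socket has opted into direct
 * transmission (po->xmit == packet_direct_xmit).  Egress netfilter hooks are
 * still honoured when they are active.
 */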
0273 static int packet_direct_xmit(struct sk_buff *skb)
0274 {
0275 #ifdef CONFIG_NETFILTER_EGRESS
0276 if (nf_hook_egress_active()) {
0277 skb = nf_hook_direct_egress(skb);
0278 if (!skb)
0279 return NET_XMIT_DROP;
0280 }
0281 #endif
0282 return dev_direct_xmit(skb, packet_pick_tx_queue(skb));
0283 }
0284
0285 static struct net_device *packet_cached_dev_get(struct packet_sock *po)
0286 {
0287 struct net_device *dev;
0288
0289 rcu_read_lock();
0290 dev = rcu_dereference(po->cached_dev);
0291 dev_hold(dev);
0292 rcu_read_unlock();
0293
0294 return dev;
0295 }
0296
0297 static void packet_cached_dev_assign(struct packet_sock *po,
0298 struct net_device *dev)
0299 {
0300 rcu_assign_pointer(po->cached_dev, dev);
0301 }
0302
0303 static void packet_cached_dev_reset(struct packet_sock *po)
0304 {
0305 RCU_INIT_POINTER(po->cached_dev, NULL);
0306 }
0307
0308 static bool packet_use_direct_xmit(const struct packet_sock *po)
0309 {
0310 return po->xmit == packet_direct_xmit;
0311 }
0312
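/*
 * Queue selection for direct transmission: honour the driver's
 * ndo_select_queue() (clamped to the device's real queue count) when it
 * provides one, otherwise fall back to the generic netdev_pick_tx().
 */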
0313 static u16 packet_pick_tx_queue(struct sk_buff *skb)
0314 {
0315 struct net_device *dev = skb->dev;
0316 const struct net_device_ops *ops = dev->netdev_ops;
0317 int cpu = raw_smp_processor_id();
0318 u16 queue_index;
0319
0320 #ifdef CONFIG_XPS
0321 skb->sender_cpu = cpu + 1;
0322 #endif
0323 skb_record_rx_queue(skb, cpu % dev->real_num_tx_queues);
0324 if (ops->ndo_select_queue) {
0325 queue_index = ops->ndo_select_queue(dev, skb, NULL);
0326 queue_index = netdev_cap_txqueue(dev, queue_index);
0327 } else {
0328 queue_index = netdev_pick_tx(dev, skb, NULL);
0329 }
0330
0331 return queue_index;
0332 }
0333
/*
 * __register_prot_hook() must be called with po->bind_lock held (see
 * register_prot_hook()) or from a context where no concurrent access to the
 * packet socket is possible yet.
 */
0338 static void __register_prot_hook(struct sock *sk)
0339 {
0340 struct packet_sock *po = pkt_sk(sk);
0341
0342 if (!po->running) {
0343 if (po->fanout)
0344 __fanout_link(sk, po);
0345 else
0346 dev_add_pack(&po->prot_hook);
0347
0348 sock_hold(sk);
0349 po->running = 1;
0350 }
0351 }
0352
0353 static void register_prot_hook(struct sock *sk)
0354 {
0355 lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
0356 __register_prot_hook(sk);
0357 }
0358
/*
 * Undo __register_prot_hook().  The caller must hold po->bind_lock; when
 * @sync is true the lock is dropped temporarily so synchronize_net() can
 * wait for any receivers still running on the old hook.
 */
0365 static void __unregister_prot_hook(struct sock *sk, bool sync)
0366 {
0367 struct packet_sock *po = pkt_sk(sk);
0368
0369 lockdep_assert_held_once(&po->bind_lock);
0370
0371 po->running = 0;
0372
0373 if (po->fanout)
0374 __fanout_unlink(sk, po);
0375 else
0376 __dev_remove_pack(&po->prot_hook);
0377
0378 __sock_put(sk);
0379
0380 if (sync) {
0381 spin_unlock(&po->bind_lock);
0382 synchronize_net();
0383 spin_lock(&po->bind_lock);
0384 }
0385 }
0386
0387 static void unregister_prot_hook(struct sock *sk, bool sync)
0388 {
0389 struct packet_sock *po = pkt_sk(sk);
0390
0391 if (po->running)
0392 __unregister_prot_hook(sk, sync);
0393 }
0394
0395 static inline struct page * __pure pgv_to_page(void *addr)
0396 {
0397 if (is_vmalloc_addr(addr))
0398 return vmalloc_to_page(addr);
0399 return virt_to_page(addr);
0400 }
0401
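/*
 * Publish a ring frame's status word.  The smp_wmb() pairs with the
 * smp_rmb() in __packet_get_status(): frame contents written before the
 * status change must be visible to whoever observes the new status.
 */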
0402 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
0403 {
0404 union tpacket_uhdr h;
0405
0406 h.raw = frame;
0407 switch (po->tp_version) {
0408 case TPACKET_V1:
0409 h.h1->tp_status = status;
0410 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
0411 break;
0412 case TPACKET_V2:
0413 h.h2->tp_status = status;
0414 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
0415 break;
0416 case TPACKET_V3:
0417 h.h3->tp_status = status;
0418 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
0419 break;
0420 default:
0421 WARN(1, "TPACKET version not supported.\n");
0422 BUG();
0423 }
0424
0425 smp_wmb();
0426 }
0427
0428 static int __packet_get_status(const struct packet_sock *po, void *frame)
0429 {
0430 union tpacket_uhdr h;
0431
0432 smp_rmb();
0433
0434 h.raw = frame;
0435 switch (po->tp_version) {
0436 case TPACKET_V1:
0437 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
0438 return h.h1->tp_status;
0439 case TPACKET_V2:
0440 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
0441 return h.h2->tp_status;
0442 case TPACKET_V3:
0443 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
0444 return h.h3->tp_status;
0445 default:
0446 WARN(1, "TPACKET version not supported.\n");
0447 BUG();
0448 return 0;
0449 }
0450 }
0451
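/*
 * Convert the skb's timestamp into a timespec64 according to the requested
 * SOF_TIMESTAMPING flags and return the matching TP_STATUS_TS_* bit, or 0
 * when no usable timestamp is available.
 */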
0452 static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec64 *ts,
0453 unsigned int flags)
0454 {
0455 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
0456
0457 if (shhwtstamps &&
0458 (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
0459 ktime_to_timespec64_cond(shhwtstamps->hwtstamp, ts))
0460 return TP_STATUS_TS_RAW_HARDWARE;
0461
0462 if ((flags & SOF_TIMESTAMPING_SOFTWARE) &&
0463 ktime_to_timespec64_cond(skb_tstamp(skb), ts))
0464 return TP_STATUS_TS_SOFTWARE;
0465
0466 return 0;
0467 }
0468
0469 static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
0470 struct sk_buff *skb)
0471 {
0472 union tpacket_uhdr h;
0473 struct timespec64 ts;
0474 __u32 ts_status;
0475
0476 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
0477 return 0;
0478
0479 h.raw = frame;
0480
	/*
	 * Only the per-version tp_sec/tp_nsec (tp_usec for V1) fields are
	 * written here; the TP_STATUS_TS_* bit describing the timestamp
	 * source is returned so the caller can fold it into tp_status.
	 */
0487 switch (po->tp_version) {
0488 case TPACKET_V1:
0489 h.h1->tp_sec = ts.tv_sec;
0490 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
0491 break;
0492 case TPACKET_V2:
0493 h.h2->tp_sec = ts.tv_sec;
0494 h.h2->tp_nsec = ts.tv_nsec;
0495 break;
0496 case TPACKET_V3:
0497 h.h3->tp_sec = ts.tv_sec;
0498 h.h3->tp_nsec = ts.tv_nsec;
0499 break;
0500 default:
0501 WARN(1, "TPACKET version not supported.\n");
0502 BUG();
0503 }
0504
0505
0506 flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
0507 smp_wmb();
0508
0509 return ts_status;
0510 }
0511
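/*
 * Map a V1/V2 ring frame index to its kernel address; the frame is returned
 * only if its current status matches @status (e.g. TP_STATUS_KERNEL for a
 * slot the kernel may fill), otherwise NULL.
 */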
0512 static void *packet_lookup_frame(const struct packet_sock *po,
0513 const struct packet_ring_buffer *rb,
0514 unsigned int position,
0515 int status)
0516 {
0517 unsigned int pg_vec_pos, frame_offset;
0518 union tpacket_uhdr h;
0519
0520 pg_vec_pos = position / rb->frames_per_block;
0521 frame_offset = position % rb->frames_per_block;
0522
0523 h.raw = rb->pg_vec[pg_vec_pos].buffer +
0524 (frame_offset * rb->frame_size);
0525
0526 if (status != __packet_get_status(po, h.raw))
0527 return NULL;
0528
0529 return h.raw;
0530 }
0531
0532 static void *packet_current_frame(struct packet_sock *po,
0533 struct packet_ring_buffer *rb,
0534 int status)
0535 {
0536 return packet_lookup_frame(po, rb, rb->head, status);
0537 }
0538
0539 static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
0540 {
0541 del_timer_sync(&pkc->retire_blk_timer);
0542 }
0543
0544 static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
0545 struct sk_buff_head *rb_queue)
0546 {
0547 struct tpacket_kbdq_core *pkc;
0548
0549 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
0550
0551 spin_lock_bh(&rb_queue->lock);
0552 pkc->delete_blk_timer = 1;
0553 spin_unlock_bh(&rb_queue->lock);
0554
0555 prb_del_retire_blk_timer(pkc);
0556 }
0557
0558 static void prb_setup_retire_blk_timer(struct packet_sock *po)
0559 {
0560 struct tpacket_kbdq_core *pkc;
0561
0562 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
0563 timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
0564 0);
0565 pkc->retire_blk_timer.expires = jiffies;
0566 }
0567
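/*
 * Derive a block-retire timeout from the link speed: roughly how long it
 * takes to fill one block at line rate (in ms), so fast links retire blocks
 * sooner.  Unknown or sub-gigabit speeds use DEFAULT_PRB_RETIRE_TOV.
 */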
0568 static int prb_calc_retire_blk_tmo(struct packet_sock *po,
0569 int blk_size_in_bytes)
0570 {
0571 struct net_device *dev;
0572 unsigned int mbits, div;
0573 struct ethtool_link_ksettings ecmd;
0574 int err;
0575
0576 rtnl_lock();
0577 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
0578 if (unlikely(!dev)) {
0579 rtnl_unlock();
0580 return DEFAULT_PRB_RETIRE_TOV;
0581 }
0582 err = __ethtool_get_link_ksettings(dev, &ecmd);
0583 rtnl_unlock();
0584 if (err)
0585 return DEFAULT_PRB_RETIRE_TOV;
0586
	/* Links slower than 1Gb/s (or with unknown speed) keep the default. */
0590 if (ecmd.base.speed < SPEED_1000 ||
0591 ecmd.base.speed == SPEED_UNKNOWN)
0592 return DEFAULT_PRB_RETIRE_TOV;
0593
0594 div = ecmd.base.speed / 1000;
0595 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
0596
0597 if (div)
0598 mbits /= div;
0599
0600 if (div)
0601 return mbits + 1;
0602 return mbits;
0603 }
0604
0605 static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
0606 union tpacket_req_u *req_u)
0607 {
0608 p1->feature_req_word = req_u->req3.tp_feature_req_word;
0609 }
0610
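/*
 * Initialise the TPACKET_V3 block-descriptor queue from the user's
 * tpacket_req3: block geometry, retire timeout, per-block private area, and
 * open the first block for filling.
 */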
0611 static void init_prb_bdqc(struct packet_sock *po,
0612 struct packet_ring_buffer *rb,
0613 struct pgv *pg_vec,
0614 union tpacket_req_u *req_u)
0615 {
0616 struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
0617 struct tpacket_block_desc *pbd;
0618
0619 memset(p1, 0x0, sizeof(*p1));
0620
0621 p1->knxt_seq_num = 1;
0622 p1->pkbdq = pg_vec;
0623 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
0624 p1->pkblk_start = pg_vec[0].buffer;
0625 p1->kblk_size = req_u->req3.tp_block_size;
0626 p1->knum_blocks = req_u->req3.tp_block_nr;
0627 p1->hdrlen = po->tp_hdrlen;
0628 p1->version = po->tp_version;
0629 p1->last_kactive_blk_num = 0;
0630 po->stats.stats3.tp_freeze_q_cnt = 0;
0631 if (req_u->req3.tp_retire_blk_tov)
0632 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
0633 else
0634 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
0635 req_u->req3.tp_block_size);
0636 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
0637 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
0638 rwlock_init(&p1->blk_fill_in_prog_lock);
0639
0640 p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
0641 prb_init_ft_ops(p1, req_u);
0642 prb_setup_retire_blk_timer(po);
0643 prb_open_block(p1, pbd);
0644 }
0645
/*
 * Re-arm the block-retire timer and remember which block was active when it
 * was armed, so the expiry handler can tell whether any progress was made.
 */
0649 static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
0650 {
0651 mod_timer(&pkc->retire_blk_timer,
0652 jiffies + pkc->tov_in_jiffies);
0653 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
0654 }
0655
/*
 * Expiry handler for the TPACKET_V3 block-retire timer.  If the active block
 * has seen no progress since the timer was armed it is closed and handed to
 * user space with TP_STATUS_BLK_TMO, so the application never waits forever
 * on a partially filled block.  If the queue is frozen because user space
 * still owns the block, either re-open it once it has been released or just
 * re-arm the timer and try again later.
 */
0679 static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
0680 {
0681 struct packet_sock *po =
0682 from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer);
0683 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
0684 unsigned int frozen;
0685 struct tpacket_block_desc *pbd;
0686
0687 spin_lock(&po->sk.sk_receive_queue.lock);
0688
0689 frozen = prb_queue_frozen(pkc);
0690 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
0691
0692 if (unlikely(pkc->delete_blk_timer))
0693 goto out;

	/*
	 * If the block already holds packets, a CPU may still be copying data
	 * into it (frame fills hold blk_fill_in_prog_lock for reading).
	 * Taking and releasing the write lock simply waits for any in-flight
	 * fill to finish before the block can be retired.
	 */
0704 if (BLOCK_NUM_PKTS(pbd)) {
0705
0706 write_lock(&pkc->blk_fill_in_prog_lock);
0707 write_unlock(&pkc->blk_fill_in_prog_lock);
0708 }
0709
0710 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
0711 if (!frozen) {
0712 if (!BLOCK_NUM_PKTS(pbd)) {
0713
0714 goto refresh_timer;
0715 }
0716 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
0717 if (!prb_dispatch_next_block(pkc, po))
0718 goto refresh_timer;
0719 else
0720 goto out;
0721 } else {
			/*
			 * Queue is frozen: user space owns the current block.
			 */
0725 if (prb_curr_blk_in_use(pbd)) {
				/*
				 * User space still hasn't released it; keep
				 * waiting and re-arm the timer.
				 */
0730 goto refresh_timer;
0731 } else {
				/*
				 * User space released the block but no packet
				 * has arrived to re-open it; open it here,
				 * which also thaws the queue and re-arms the
				 * timer.
				 */
0739 prb_open_block(pkc, pbd);
0740 goto out;
0741 }
0742 }
0743 }
0744
0745 refresh_timer:
0746 _prb_refresh_rx_retire_blk_timer(pkc);
0747
0748 out:
0749 spin_unlock(&po->sk.sk_receive_queue.lock);
0750 }
0751
0752 static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
0753 struct tpacket_block_desc *pbd1, __u32 status)
0754 {
	/* Flush everything minus the block header. */

0757 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
0758 u8 *start, *end;
0759
0760 start = (u8 *)pbd1;

	/* Skip the block header; it lives entirely in the first page. */
0763 start += PAGE_SIZE;
0764
0765 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
0766 for (; start < end; start += PAGE_SIZE)
0767 flush_dcache_page(pgv_to_page(start));
0768
0769 smp_wmb();
0770 #endif

	/* Now update the block status word. */

0774 BLOCK_STATUS(pbd1) = status;
0775

	/* Finally flush the page holding the block header itself. */
0778 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
0779 start = (u8 *)pbd1;
0780 flush_dcache_page(pgv_to_page(start));
0781
0782 smp_wmb();
0783 #endif
0784 }
0785
/*
 * Close the currently active block: terminate the frame chain, record the
 * timestamp of the last packet (or the close time for an empty block),
 * publish the block to user space with TP_STATUS_USER and wake the socket.
 */
0795 static void prb_close_block(struct tpacket_kbdq_core *pkc1,
0796 struct tpacket_block_desc *pbd1,
0797 struct packet_sock *po, unsigned int stat)
0798 {
0799 __u32 status = TP_STATUS_USER | stat;
0800
0801 struct tpacket3_hdr *last_pkt;
0802 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
0803 struct sock *sk = &po->sk;
0804
0805 if (atomic_read(&po->tp_drops))
0806 status |= TP_STATUS_LOSING;
0807
0808 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
0809 last_pkt->tp_next_offset = 0;
0810
0811
0812 if (BLOCK_NUM_PKTS(pbd1)) {
0813 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
0814 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
0815 } else {
0816
0817
0818
0819
0820
0821 struct timespec64 ts;
0822 ktime_get_real_ts64(&ts);
0823 h1->ts_last_pkt.ts_sec = ts.tv_sec;
0824 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
0825 }
0826
0827 smp_wmb();
0828
0829
0830 prb_flush_block(pkc1, pbd1, status);
0831
0832 sk->sk_data_ready(sk);
0833
0834 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
0835 }
0836
0837 static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
0838 {
0839 pkc->reset_pending_on_curr_blk = 0;
0840 }
0841
/*
 * Open a block for kernel filling: assign its sequence number, reset its
 * counters, stamp the open time and point nxt_offset past the block header
 * and private area.  Opening a block also thaws the queue and re-arms the
 * retire timer.
 */
0849 static void prb_open_block(struct tpacket_kbdq_core *pkc1,
0850 struct tpacket_block_desc *pbd1)
0851 {
0852 struct timespec64 ts;
0853 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
0854
0855 smp_rmb();
0856
0857
0858
0859
0860
0861 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
0862 BLOCK_NUM_PKTS(pbd1) = 0;
0863 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
0864
0865 ktime_get_real_ts64(&ts);
0866
0867 h1->ts_first_pkt.ts_sec = ts.tv_sec;
0868 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
0869
0870 pkc1->pkblk_start = (char *)pbd1;
0871 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
0872
0873 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
0874 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
0875
0876 pbd1->version = pkc1->version;
0877 pkc1->prev = pkc1->nxt_offset;
0878 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
0879
0880 prb_thaw_queue(pkc1);
0881 _prb_refresh_rx_retire_blk_timer(pkc1);
0882
0883 smp_wmb();
0884 }
0885
/*
 * Queue freeze/thaw: when the kernel wraps around and finds the next block
 * still owned by user space (TP_STATUS_USER), it "freezes" the queue and
 * starts dropping packets instead of overwriting data the application has
 * not consumed yet.  Once user space releases a block, the next captured
 * packet (or the retire timer) re-opens it and thaws the queue.
 */
0909 static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
0910 struct packet_sock *po)
0911 {
0912 pkc->reset_pending_on_curr_blk = 1;
0913 po->stats.stats3.tp_freeze_q_cnt++;
0914 }
0915
0916 #define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))

/*
 * Dispatch the next block for filling.  If it is still owned by user space
 * the queue is frozen and NULL is returned; the caller must treat that as
 * "no room" and drop the packet.
 */
0924 static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
0925 struct packet_sock *po)
0926 {
0927 struct tpacket_block_desc *pbd;
0928
0929 smp_rmb();
0930
0931
0932 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
0933
0934
0935 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
0936 prb_freeze_queue(pkc, po);
0937 return NULL;
0938 }
0939
0940
0941
0942
0943
0944
0945 prb_open_block(pkc, pbd);
0946 return (void *)pkc->nxt_offset;
0947 }
0948
0949 static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
0950 struct packet_sock *po, unsigned int status)
0951 {
0952 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
0953
0954
0955 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {

		/*
		 * Unless we came from the retire timer (which already waited),
		 * make sure no CPU is still filling a frame in this block
		 * before it is handed over to user space.
		 */
0965 if (!(status & TP_STATUS_BLK_TMO)) {
0966
0967 write_lock(&pkc->blk_fill_in_prog_lock);
0968 write_unlock(&pkc->blk_fill_in_prog_lock);
0969 }
0970 prb_close_block(pkc, pbd, po, status);
0971 return;
0972 }
0973 }
0974
0975 static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd)
0976 {
0977 return TP_STATUS_USER & BLOCK_STATUS(pbd);
0978 }
0979
0980 static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
0981 {
0982 return pkc->reset_pending_on_curr_blk;
0983 }
0984
0985 static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
0986 __releases(&pkc->blk_fill_in_prog_lock)
0987 {
0988 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
0989
0990 read_unlock(&pkc->blk_fill_in_prog_lock);
0991 }
0992
0993 static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
0994 struct tpacket3_hdr *ppd)
0995 {
0996 ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
0997 }
0998
0999 static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
1000 struct tpacket3_hdr *ppd)
1001 {
1002 ppd->hv1.tp_rxhash = 0;
1003 }
1004
1005 static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
1006 struct tpacket3_hdr *ppd)
1007 {
1008 if (skb_vlan_tag_present(pkc->skb)) {
1009 ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
1010 ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
1011 ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
1012 } else {
1013 ppd->hv1.tp_vlan_tci = 0;
1014 ppd->hv1.tp_vlan_tpid = 0;
1015 ppd->tp_status = TP_STATUS_AVAILABLE;
1016 }
1017 }
1018
1019 static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
1020 struct tpacket3_hdr *ppd)
1021 {
1022 ppd->hv1.tp_padding = 0;
1023 prb_fill_vlan_info(pkc, ppd);
1024
1025 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
1026 prb_fill_rxhash(pkc, ppd);
1027 else
1028 prb_clear_rxhash(pkc, ppd);
1029 }
1030
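/*
 * Reserve an aligned slot of @len bytes in the current block for a new frame
 * and take blk_fill_in_prog_lock for reading; it is dropped again in
 * prb_clear_blk_fill_status() once the frame data has been copied in.
 */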
1031 static void prb_fill_curr_block(char *curr,
1032 struct tpacket_kbdq_core *pkc,
1033 struct tpacket_block_desc *pbd,
1034 unsigned int len)
1035 __acquires(&pkc->blk_fill_in_prog_lock)
1036 {
1037 struct tpacket3_hdr *ppd;
1038
1039 ppd = (struct tpacket3_hdr *)curr;
1040 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
1041 pkc->prev = curr;
1042 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
1043 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
1044 BLOCK_NUM_PKTS(pbd) += 1;
1045 read_lock(&pkc->blk_fill_in_prog_lock);
1046 prb_run_all_ft_ops(pkc, ppd);
1047 }
1048
/* Assumes the caller holds the sk_receive_queue lock. */
1050 static void *__packet_lookup_frame_in_block(struct packet_sock *po,
1051 struct sk_buff *skb,
1052 unsigned int len
1053 )
1054 {
1055 struct tpacket_kbdq_core *pkc;
1056 struct tpacket_block_desc *pbd;
1057 char *curr, *end;
1058
1059 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
1060 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1061
1062
1063 if (prb_queue_frozen(pkc)) {
		/*
		 * The queue froze because user space fell behind.  If the
		 * current block is still theirs, keep dropping; otherwise it
		 * has been released and can be re-opened below.
		 */
1068 if (prb_curr_blk_in_use(pbd)) {
1069
1070 return NULL;
1071 } else {
			/*
			 * User space released the block in the meantime:
			 * re-open it (thawing the queue) and fall through to
			 * fill it.
			 */
1078 prb_open_block(pkc, pbd);
1079 }
1080 }
1081
1082 smp_mb();
1083 curr = pkc->nxt_offset;
1084 pkc->skb = skb;
1085 end = (char *)pbd + pkc->kblk_size;
1086
1087
1088 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1089 prb_fill_curr_block(curr, pkc, pbd, len);
1090 return (void *)curr;
1091 }
1092
1093
1094 prb_retire_current_block(pkc, po, 0);
1095
1096
1097 curr = (char *)prb_dispatch_next_block(pkc, po);
1098 if (curr) {
1099 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1100 prb_fill_curr_block(curr, pkc, pbd, len);
1101 return (void *)curr;
1102 }
1103
	/*
	 * No free block either: the queue is now frozen and the caller will
	 * drop this packet.
	 */
1108 return NULL;
1109 }
1110
1111 static void *packet_current_rx_frame(struct packet_sock *po,
1112 struct sk_buff *skb,
1113 int status, unsigned int len)
1114 {
1115 char *curr = NULL;
1116 switch (po->tp_version) {
1117 case TPACKET_V1:
1118 case TPACKET_V2:
1119 curr = packet_lookup_frame(po, &po->rx_ring,
1120 po->rx_ring.head, status);
1121 return curr;
1122 case TPACKET_V3:
1123 return __packet_lookup_frame_in_block(po, skb, len);
1124 default:
1125 WARN(1, "TPACKET version not supported\n");
1126 BUG();
1127 return NULL;
1128 }
1129 }
1130
1131 static void *prb_lookup_block(const struct packet_sock *po,
1132 const struct packet_ring_buffer *rb,
1133 unsigned int idx,
1134 int status)
1135 {
1136 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
1137 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
1138
1139 if (status != BLOCK_STATUS(pbd))
1140 return NULL;
1141 return pbd;
1142 }
1143
1144 static int prb_previous_blk_num(struct packet_ring_buffer *rb)
1145 {
1146 unsigned int prev;
1147 if (rb->prb_bdqc.kactive_blk_num)
1148 prev = rb->prb_bdqc.kactive_blk_num-1;
1149 else
1150 prev = rb->prb_bdqc.knum_blocks-1;
1151 return prev;
1152 }
1153
1154
1155 static void *__prb_previous_block(struct packet_sock *po,
1156 struct packet_ring_buffer *rb,
1157 int status)
1158 {
1159 unsigned int previous = prb_previous_blk_num(rb);
1160 return prb_lookup_block(po, rb, previous, status);
1161 }
1162
1163 static void *packet_previous_rx_frame(struct packet_sock *po,
1164 struct packet_ring_buffer *rb,
1165 int status)
1166 {
1167 if (po->tp_version <= TPACKET_V2)
1168 return packet_previous_frame(po, rb, status);
1169
1170 return __prb_previous_block(po, rb, status);
1171 }
1172
1173 static void packet_increment_rx_head(struct packet_sock *po,
1174 struct packet_ring_buffer *rb)
1175 {
1176 switch (po->tp_version) {
1177 case TPACKET_V1:
1178 case TPACKET_V2:
1179 return packet_increment_head(rb);
1180 case TPACKET_V3:
1181 default:
1182 WARN(1, "TPACKET version not supported.\n");
1183 BUG();
1184 return;
1185 }
1186 }
1187
1188 static void *packet_previous_frame(struct packet_sock *po,
1189 struct packet_ring_buffer *rb,
1190 int status)
1191 {
1192 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1193 return packet_lookup_frame(po, rb, previous, status);
1194 }
1195
1196 static void packet_increment_head(struct packet_ring_buffer *buff)
1197 {
1198 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1199 }
1200
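/*
 * Per-cpu count of tx-ring frames handed to the driver but not yet freed.
 * tpacket_destruct_skb() drops the count and completes po->skb_completion
 * when it reaches zero, so a sender can wait for all outstanding frames.
 */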
1201 static void packet_inc_pending(struct packet_ring_buffer *rb)
1202 {
1203 this_cpu_inc(*rb->pending_refcnt);
1204 }
1205
1206 static void packet_dec_pending(struct packet_ring_buffer *rb)
1207 {
1208 this_cpu_dec(*rb->pending_refcnt);
1209 }
1210
1211 static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
1212 {
1213 unsigned int refcnt = 0;
1214 int cpu;
1215
1216
1217 if (rb->pending_refcnt == NULL)
1218 return 0;
1219
1220 for_each_possible_cpu(cpu)
1221 refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
1222
1223 return refcnt;
1224 }
1225
1226 static int packet_alloc_pending(struct packet_sock *po)
1227 {
1228 po->rx_ring.pending_refcnt = NULL;
1229
1230 po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
1231 if (unlikely(po->tx_ring.pending_refcnt == NULL))
1232 return -ENOBUFS;
1233
1234 return 0;
1235 }
1236
1237 static void packet_free_pending(struct packet_sock *po)
1238 {
1239 free_percpu(po->tx_ring.pending_refcnt);
1240 }
1241
1242 #define ROOM_POW_OFF 2
1243 #define ROOM_NONE 0x0
1244 #define ROOM_LOW 0x1
1245 #define ROOM_NORMAL 0x2
1246
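/*
 * Ring-occupancy probes used by the fanout code: look ahead a fraction
 * (len >> pow_off) of the ring (or of the V3 block ring) and report whether
 * that slot is still owned by the kernel, yielding ROOM_NORMAL, ROOM_LOW or
 * ROOM_NONE.
 */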
1247 static bool __tpacket_has_room(const struct packet_sock *po, int pow_off)
1248 {
1249 int idx, len;
1250
1251 len = READ_ONCE(po->rx_ring.frame_max) + 1;
1252 idx = READ_ONCE(po->rx_ring.head);
1253 if (pow_off)
1254 idx += len >> pow_off;
1255 if (idx >= len)
1256 idx -= len;
1257 return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1258 }
1259
1260 static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off)
1261 {
1262 int idx, len;
1263
1264 len = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks);
1265 idx = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num);
1266 if (pow_off)
1267 idx += len >> pow_off;
1268 if (idx >= len)
1269 idx -= len;
1270 return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1271 }
1272
1273 static int __packet_rcv_has_room(const struct packet_sock *po,
1274 const struct sk_buff *skb)
1275 {
1276 const struct sock *sk = &po->sk;
1277 int ret = ROOM_NONE;
1278
1279 if (po->prot_hook.func != tpacket_rcv) {
1280 int rcvbuf = READ_ONCE(sk->sk_rcvbuf);
1281 int avail = rcvbuf - atomic_read(&sk->sk_rmem_alloc)
1282 - (skb ? skb->truesize : 0);
1283
1284 if (avail > (rcvbuf >> ROOM_POW_OFF))
1285 return ROOM_NORMAL;
1286 else if (avail > 0)
1287 return ROOM_LOW;
1288 else
1289 return ROOM_NONE;
1290 }
1291
1292 if (po->tp_version == TPACKET_V3) {
1293 if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
1294 ret = ROOM_NORMAL;
1295 else if (__tpacket_v3_has_room(po, 0))
1296 ret = ROOM_LOW;
1297 } else {
1298 if (__tpacket_has_room(po, ROOM_POW_OFF))
1299 ret = ROOM_NORMAL;
1300 else if (__tpacket_has_room(po, 0))
1301 ret = ROOM_LOW;
1302 }
1303
1304 return ret;
1305 }
1306
1307 static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1308 {
1309 int pressure, ret;
1310
1311 ret = __packet_rcv_has_room(po, skb);
1312 pressure = ret != ROOM_NORMAL;
1313
1314 if (READ_ONCE(po->pressure) != pressure)
1315 WRITE_ONCE(po->pressure, pressure);
1316
1317 return ret;
1318 }
1319
1320 static void packet_rcv_try_clear_pressure(struct packet_sock *po)
1321 {
1322 if (READ_ONCE(po->pressure) &&
1323 __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
1324 WRITE_ONCE(po->pressure, 0);
1325 }
1326
1327 static void packet_sock_destruct(struct sock *sk)
1328 {
1329 skb_queue_purge(&sk->sk_error_queue);
1330
1331 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
1332 WARN_ON(refcount_read(&sk->sk_wmem_alloc));
1333
1334 if (!sock_flag(sk, SOCK_DEAD)) {
1335 pr_err("Attempt to release alive packet socket: %p\n", sk);
1336 return;
1337 }
1338
1339 sk_refcnt_debug_dec(sk);
1340 }
1341
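/*
 * Rollover heuristic: keep a small history of recent rx hashes and treat a
 * flow as "huge" when it fills more than half of that window, so a single
 * elephant flow does not push every packet onto other group members the
 * moment the ring runs low.
 */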
1342 static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
1343 {
1344 u32 *history = po->rollover->history;
1345 u32 victim, rxhash;
1346 int i, count = 0;
1347
1348 rxhash = skb_get_hash(skb);
1349 for (i = 0; i < ROLLOVER_HLEN; i++)
1350 if (READ_ONCE(history[i]) == rxhash)
1351 count++;
1352
1353 victim = prandom_u32() % ROLLOVER_HLEN;
1354
1355
1356 if (READ_ONCE(history[victim]) != rxhash)
1357 WRITE_ONCE(history[victim], rxhash);
1358
1359 return count > (ROLLOVER_HLEN >> 1);
1360 }
1361
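/*
 * PACKET_FANOUT demultiplexers: each returns the index of the group member
 * that should receive the skb, chosen by flow hash, round-robin, receiving
 * cpu, random pick, recorded queue mapping or an attached (e)BPF program.
 */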
1362 static unsigned int fanout_demux_hash(struct packet_fanout *f,
1363 struct sk_buff *skb,
1364 unsigned int num)
1365 {
1366 return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
1367 }
1368
1369 static unsigned int fanout_demux_lb(struct packet_fanout *f,
1370 struct sk_buff *skb,
1371 unsigned int num)
1372 {
1373 unsigned int val = atomic_inc_return(&f->rr_cur);
1374
1375 return val % num;
1376 }
1377
1378 static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1379 struct sk_buff *skb,
1380 unsigned int num)
1381 {
1382 return smp_processor_id() % num;
1383 }
1384
1385 static unsigned int fanout_demux_rnd(struct packet_fanout *f,
1386 struct sk_buff *skb,
1387 unsigned int num)
1388 {
1389 return prandom_u32_max(num);
1390 }
1391
1392 static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1393 struct sk_buff *skb,
1394 unsigned int idx, bool try_self,
1395 unsigned int num)
1396 {
1397 struct packet_sock *po, *po_next, *po_skip = NULL;
1398 unsigned int i, j, room = ROOM_NONE;
1399
1400 po = pkt_sk(rcu_dereference(f->arr[idx]));
1401
1402 if (try_self) {
1403 room = packet_rcv_has_room(po, skb);
1404 if (room == ROOM_NORMAL ||
1405 (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
1406 return idx;
1407 po_skip = po;
1408 }
1409
1410 i = j = min_t(int, po->rollover->sock, num - 1);
1411 do {
1412 po_next = pkt_sk(rcu_dereference(f->arr[i]));
1413 if (po_next != po_skip && !READ_ONCE(po_next->pressure) &&
1414 packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
1415 if (i != j)
1416 po->rollover->sock = i;
1417 atomic_long_inc(&po->rollover->num);
1418 if (room == ROOM_LOW)
1419 atomic_long_inc(&po->rollover->num_huge);
1420 return i;
1421 }
1422
1423 if (++i == num)
1424 i = 0;
1425 } while (i != j);
1426
1427 atomic_long_inc(&po->rollover->num_failed);
1428 return idx;
1429 }
1430
1431 static unsigned int fanout_demux_qm(struct packet_fanout *f,
1432 struct sk_buff *skb,
1433 unsigned int num)
1434 {
1435 return skb_get_queue_mapping(skb) % num;
1436 }
1437
1438 static unsigned int fanout_demux_bpf(struct packet_fanout *f,
1439 struct sk_buff *skb,
1440 unsigned int num)
1441 {
1442 struct bpf_prog *prog;
1443 unsigned int ret = 0;
1444
1445 rcu_read_lock();
1446 prog = rcu_dereference(f->bpf_prog);
1447 if (prog)
1448 ret = bpf_prog_run_clear_cb(prog, skb) % num;
1449 rcu_read_unlock();
1450
1451 return ret;
1452 }
1453
1454 static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1455 {
1456 return f->flags & (flag >> 8);
1457 }
1458
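/*
 * Receive entry point for a fanout group: optionally defragment IP, select
 * a member with the group's demux mode (falling back to rollover when the
 * selected socket has no room and the flag is set) and hand the skb to that
 * member's regular packet_rcv/tpacket_rcv handler.
 */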
1459 static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1460 struct packet_type *pt, struct net_device *orig_dev)
1461 {
1462 struct packet_fanout *f = pt->af_packet_priv;
1463 unsigned int num = READ_ONCE(f->num_members);
1464 struct net *net = read_pnet(&f->net);
1465 struct packet_sock *po;
1466 unsigned int idx;
1467
1468 if (!net_eq(dev_net(dev), net) || !num) {
1469 kfree_skb(skb);
1470 return 0;
1471 }
1472
1473 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
1474 skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
1475 if (!skb)
1476 return 0;
1477 }
1478 switch (f->type) {
1479 case PACKET_FANOUT_HASH:
1480 default:
1481 idx = fanout_demux_hash(f, skb, num);
1482 break;
1483 case PACKET_FANOUT_LB:
1484 idx = fanout_demux_lb(f, skb, num);
1485 break;
1486 case PACKET_FANOUT_CPU:
1487 idx = fanout_demux_cpu(f, skb, num);
1488 break;
1489 case PACKET_FANOUT_RND:
1490 idx = fanout_demux_rnd(f, skb, num);
1491 break;
1492 case PACKET_FANOUT_QM:
1493 idx = fanout_demux_qm(f, skb, num);
1494 break;
1495 case PACKET_FANOUT_ROLLOVER:
1496 idx = fanout_demux_rollover(f, skb, 0, false, num);
1497 break;
1498 case PACKET_FANOUT_CBPF:
1499 case PACKET_FANOUT_EBPF:
1500 idx = fanout_demux_bpf(f, skb, num);
1501 break;
1502 }
1503
1504 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
1505 idx = fanout_demux_rollover(f, skb, idx, true, num);
1506
1507 po = pkt_sk(rcu_dereference(f->arr[idx]));
1508 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1509 }
1510
1511 DEFINE_MUTEX(fanout_mutex);
1512 EXPORT_SYMBOL_GPL(fanout_mutex);
1513 static LIST_HEAD(fanout_list);
1514 static u16 fanout_next_id;
1515
1516 static void __fanout_link(struct sock *sk, struct packet_sock *po)
1517 {
1518 struct packet_fanout *f = po->fanout;
1519
1520 spin_lock(&f->lock);
1521 rcu_assign_pointer(f->arr[f->num_members], sk);
1522 smp_wmb();
1523 f->num_members++;
1524 if (f->num_members == 1)
1525 dev_add_pack(&f->prot_hook);
1526 spin_unlock(&f->lock);
1527 }
1528
1529 static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1530 {
1531 struct packet_fanout *f = po->fanout;
1532 int i;
1533
1534 spin_lock(&f->lock);
1535 for (i = 0; i < f->num_members; i++) {
1536 if (rcu_dereference_protected(f->arr[i],
1537 lockdep_is_held(&f->lock)) == sk)
1538 break;
1539 }
1540 BUG_ON(i >= f->num_members);
1541 rcu_assign_pointer(f->arr[i],
1542 rcu_dereference_protected(f->arr[f->num_members - 1],
1543 lockdep_is_held(&f->lock)));
1544 f->num_members--;
1545 if (f->num_members == 0)
1546 __dev_remove_pack(&f->prot_hook);
1547 spin_unlock(&f->lock);
1548 }
1549
1550 static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
1551 {
1552 if (sk->sk_family != PF_PACKET)
1553 return false;
1554
1555 return ptype->af_packet_priv == pkt_sk(sk)->fanout;
1556 }
1557
1558 static void fanout_init_data(struct packet_fanout *f)
1559 {
1560 switch (f->type) {
1561 case PACKET_FANOUT_LB:
1562 atomic_set(&f->rr_cur, 0);
1563 break;
1564 case PACKET_FANOUT_CBPF:
1565 case PACKET_FANOUT_EBPF:
1566 RCU_INIT_POINTER(f->bpf_prog, NULL);
1567 break;
1568 }
1569 }
1570
1571 static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
1572 {
1573 struct bpf_prog *old;
1574
1575 spin_lock(&f->lock);
1576 old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
1577 rcu_assign_pointer(f->bpf_prog, new);
1578 spin_unlock(&f->lock);
1579
1580 if (old) {
1581 synchronize_net();
1582 bpf_prog_destroy(old);
1583 }
1584 }
1585
1586 static int fanout_set_data_cbpf(struct packet_sock *po, sockptr_t data,
1587 unsigned int len)
1588 {
1589 struct bpf_prog *new;
1590 struct sock_fprog fprog;
1591 int ret;
1592
1593 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1594 return -EPERM;
1595
1596 ret = copy_bpf_fprog_from_user(&fprog, data, len);
1597 if (ret)
1598 return ret;
1599
1600 ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
1601 if (ret)
1602 return ret;
1603
1604 __fanout_set_data_bpf(po->fanout, new);
1605 return 0;
1606 }
1607
1608 static int fanout_set_data_ebpf(struct packet_sock *po, sockptr_t data,
1609 unsigned int len)
1610 {
1611 struct bpf_prog *new;
1612 u32 fd;
1613
1614 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1615 return -EPERM;
1616 if (len != sizeof(fd))
1617 return -EINVAL;
1618 if (copy_from_sockptr(&fd, data, len))
1619 return -EFAULT;
1620
1621 new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
1622 if (IS_ERR(new))
1623 return PTR_ERR(new);
1624
1625 __fanout_set_data_bpf(po->fanout, new);
1626 return 0;
1627 }
1628
1629 static int fanout_set_data(struct packet_sock *po, sockptr_t data,
1630 unsigned int len)
1631 {
1632 switch (po->fanout->type) {
1633 case PACKET_FANOUT_CBPF:
1634 return fanout_set_data_cbpf(po, data, len);
1635 case PACKET_FANOUT_EBPF:
1636 return fanout_set_data_ebpf(po, data, len);
1637 default:
1638 return -EINVAL;
1639 }
1640 }
1641
1642 static void fanout_release_data(struct packet_fanout *f)
1643 {
1644 switch (f->type) {
1645 case PACKET_FANOUT_CBPF:
1646 case PACKET_FANOUT_EBPF:
1647 __fanout_set_data_bpf(f, NULL);
1648 }
1649 }
1650
1651 static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
1652 {
1653 struct packet_fanout *f;
1654
1655 list_for_each_entry(f, &fanout_list, list) {
1656 if (f->id == candidate_id &&
1657 read_pnet(&f->net) == sock_net(sk)) {
1658 return false;
1659 }
1660 }
1661 return true;
1662 }
1663
1664 static bool fanout_find_new_id(struct sock *sk, u16 *new_id)
1665 {
1666 u16 id = fanout_next_id;
1667
1668 do {
1669 if (__fanout_id_is_free(sk, id)) {
1670 *new_id = id;
1671 fanout_next_id = id + 1;
1672 return true;
1673 }
1674
1675 id++;
1676 } while (id != fanout_next_id);
1677
1678 return false;
1679 }
1680
1681 static int fanout_add(struct sock *sk, struct fanout_args *args)
1682 {
1683 struct packet_rollover *rollover = NULL;
1684 struct packet_sock *po = pkt_sk(sk);
1685 u16 type_flags = args->type_flags;
1686 struct packet_fanout *f, *match;
1687 u8 type = type_flags & 0xff;
1688 u8 flags = type_flags >> 8;
1689 u16 id = args->id;
1690 int err;
1691
1692 switch (type) {
1693 case PACKET_FANOUT_ROLLOVER:
1694 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1695 return -EINVAL;
1696 break;
1697 case PACKET_FANOUT_HASH:
1698 case PACKET_FANOUT_LB:
1699 case PACKET_FANOUT_CPU:
1700 case PACKET_FANOUT_RND:
1701 case PACKET_FANOUT_QM:
1702 case PACKET_FANOUT_CBPF:
1703 case PACKET_FANOUT_EBPF:
1704 break;
1705 default:
1706 return -EINVAL;
1707 }
1708
1709 mutex_lock(&fanout_mutex);
1710
1711 err = -EALREADY;
1712 if (po->fanout)
1713 goto out;
1714
1715 if (type == PACKET_FANOUT_ROLLOVER ||
1716 (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
1717 err = -ENOMEM;
1718 rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
1719 if (!rollover)
1720 goto out;
1721 atomic_long_set(&rollover->num, 0);
1722 atomic_long_set(&rollover->num_huge, 0);
1723 atomic_long_set(&rollover->num_failed, 0);
1724 }
1725
1726 if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) {
1727 if (id != 0) {
1728 err = -EINVAL;
1729 goto out;
1730 }
1731 if (!fanout_find_new_id(sk, &id)) {
1732 err = -ENOMEM;
1733 goto out;
1734 }
1735
1736 flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8);
1737 }
1738
1739 match = NULL;
1740 list_for_each_entry(f, &fanout_list, list) {
1741 if (f->id == id &&
1742 read_pnet(&f->net) == sock_net(sk)) {
1743 match = f;
1744 break;
1745 }
1746 }
1747 err = -EINVAL;
1748 if (match) {
1749 if (match->flags != flags)
1750 goto out;
1751 if (args->max_num_members &&
1752 args->max_num_members != match->max_num_members)
1753 goto out;
1754 } else {
1755 if (args->max_num_members > PACKET_FANOUT_MAX)
1756 goto out;
1757 if (!args->max_num_members)
1758
1759 args->max_num_members = 256;
1760 err = -ENOMEM;
1761 match = kvzalloc(struct_size(match, arr, args->max_num_members),
1762 GFP_KERNEL);
1763 if (!match)
1764 goto out;
1765 write_pnet(&match->net, sock_net(sk));
1766 match->id = id;
1767 match->type = type;
1768 match->flags = flags;
1769 INIT_LIST_HEAD(&match->list);
1770 spin_lock_init(&match->lock);
1771 refcount_set(&match->sk_ref, 0);
1772 fanout_init_data(match);
1773 match->prot_hook.type = po->prot_hook.type;
1774 match->prot_hook.dev = po->prot_hook.dev;
1775 match->prot_hook.func = packet_rcv_fanout;
1776 match->prot_hook.af_packet_priv = match;
1777 match->prot_hook.af_packet_net = read_pnet(&match->net);
1778 match->prot_hook.id_match = match_fanout_group;
1779 match->max_num_members = args->max_num_members;
1780 list_add(&match->list, &fanout_list);
1781 }
1782 err = -EINVAL;
1783
1784 spin_lock(&po->bind_lock);
1785 if (po->running &&
1786 match->type == type &&
1787 match->prot_hook.type == po->prot_hook.type &&
1788 match->prot_hook.dev == po->prot_hook.dev) {
1789 err = -ENOSPC;
1790 if (refcount_read(&match->sk_ref) < match->max_num_members) {
1791 __dev_remove_pack(&po->prot_hook);
1792
1793
1794 WRITE_ONCE(po->fanout, match);
1795
1796 po->rollover = rollover;
1797 rollover = NULL;
1798 refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
1799 __fanout_link(sk, po);
1800 err = 0;
1801 }
1802 }
1803 spin_unlock(&po->bind_lock);
1804
1805 if (err && !refcount_read(&match->sk_ref)) {
1806 list_del(&match->list);
1807 kvfree(match);
1808 }
1809
1810 out:
1811 kfree(rollover);
1812 mutex_unlock(&fanout_mutex);
1813 return err;
1814 }
1815
/*
 * Detach the socket from its fanout group.  If this was the last member the
 * group is unlinked from fanout_list and returned so the caller can free it;
 * otherwise NULL is returned.
 */
1821 static struct packet_fanout *fanout_release(struct sock *sk)
1822 {
1823 struct packet_sock *po = pkt_sk(sk);
1824 struct packet_fanout *f;
1825
1826 mutex_lock(&fanout_mutex);
1827 f = po->fanout;
1828 if (f) {
1829 po->fanout = NULL;
1830
1831 if (refcount_dec_and_test(&f->sk_ref))
1832 list_del(&f->list);
1833 else
1834 f = NULL;
1835 }
1836 mutex_unlock(&fanout_mutex);
1837
1838 return f;
1839 }
1840
1841 static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
1842 struct sk_buff *skb)
1843 {
	/*
	 * The length check allowed an extra VLAN_HLEN; only accept the
	 * oversize frame if it really is an Ethernet frame carrying an
	 * 802.1Q tag.
	 */
1848 if (unlikely(dev->type != ARPHRD_ETHER))
1849 return false;
1850
1851 skb_reset_mac_header(skb);
1852 return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
1853 }
1854
1855 static const struct proto_ops packet_ops;
1856
1857 static const struct proto_ops packet_ops_spkt;
1858
1859 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1860 struct packet_type *pt, struct net_device *orig_dev)
1861 {
1862 struct sock *sk;
1863 struct sockaddr_pkt *spkt;
1864
	/*
	 * The socket was stashed in pt->af_packet_priv when the protocol hook
	 * was registered.
	 */
1870 sk = pt->af_packet_priv;
1871
	/*
	 * Incoming frames arrive with the link-layer header already pulled;
	 * it is pushed back below so SOCK_PACKET callers see the raw frame.
	 * For outgoing frames skb->data already points at the mac header, so
	 * the push is a no-op.
	 */
1883 if (skb->pkt_type == PACKET_LOOPBACK)
1884 goto out;
1885
1886 if (!net_eq(dev_net(dev), sock_net(sk)))
1887 goto out;
1888
1889 skb = skb_share_check(skb, GFP_ATOMIC);
1890 if (skb == NULL)
1891 goto oom;
1892
1893
1894 skb_dst_drop(skb);
1895
1896
1897 nf_reset_ct(skb);
1898
1899 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1900
1901 skb_push(skb, skb->data - skb_mac_header(skb));
1902
	/*
	 * Fill in the minimal sockaddr_pkt: originating device name, device
	 * type and protocol.
	 */
1907 spkt->spkt_family = dev->type;
1908 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1909 spkt->spkt_protocol = skb->protocol;
1910
	/*
	 * Queue the frame.  sock_queue_rcv_skb() charges it against the
	 * socket's receive buffer, so a slow reader cannot pin unbounded
	 * amounts of memory.
	 */
1916 if (sock_queue_rcv_skb(sk, skb) == 0)
1917 return 0;
1918
1919 out:
1920 kfree_skb(skb);
1921 oom:
1922 return 0;
1923 }
1924
1925 static void packet_parse_headers(struct sk_buff *skb, struct socket *sock)
1926 {
1927 int depth;
1928
1929 if ((!skb->protocol || skb->protocol == htons(ETH_P_ALL)) &&
1930 sock->type == SOCK_RAW) {
1931 skb_reset_mac_header(skb);
1932 skb->protocol = dev_parse_header_protocol(skb);
1933 }
1934
1935
1936 if (likely(skb->dev->type == ARPHRD_ETHER) &&
1937 eth_type_vlan(skb->protocol) &&
1938 __vlan_get_protocol(skb, skb->protocol, &depth) != 0) {
1939 if (pskb_may_pull(skb, depth))
1940 skb_set_network_header(skb, depth);
1941 }
1942
1943 skb_probe_transport_header(skb);
1944 }
1945
/*
 * Output a raw frame on a SOCK_PACKET socket.  This bypasses all other
 * protocol layers, so the caller must supply a complete link-layer frame.
 */
1951 static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
1952 size_t len)
1953 {
1954 struct sock *sk = sock->sk;
1955 DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
1956 struct sk_buff *skb = NULL;
1957 struct net_device *dev;
1958 struct sockcm_cookie sockc;
1959 __be16 proto = 0;
1960 int err;
1961 int extra_len = 0;
1962
1963
1964
1965
1966
1967 if (saddr) {
1968 if (msg->msg_namelen < sizeof(struct sockaddr))
1969 return -EINVAL;
1970 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1971 proto = saddr->spkt_protocol;
1972 } else
1973 return -ENOTCONN;
1974
1975
1976
1977
1978
1979 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1980 retry:
1981 rcu_read_lock();
1982 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1983 err = -ENODEV;
1984 if (dev == NULL)
1985 goto out_unlock;
1986
1987 err = -ENETDOWN;
1988 if (!(dev->flags & IFF_UP))
1989 goto out_unlock;
1990
	/*
	 * SOCK_NOFCS senders provide their own frame check sequence: allow
	 * the extra 4 bytes and, further down, mark the skb so the NIC does
	 * not append one, provided the device supports that.
	 */
1996 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1997 if (!netif_supports_nofcs(dev)) {
1998 err = -EPROTONOSUPPORT;
1999 goto out_unlock;
2000 }
2001 extra_len = 4;
2002 }
2003
2004 err = -EMSGSIZE;
2005 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
2006 goto out_unlock;
2007
2008 if (!skb) {
2009 size_t reserved = LL_RESERVED_SPACE(dev);
2010 int tlen = dev->needed_tailroom;
2011 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
2012
2013 rcu_read_unlock();
2014 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
2015 if (skb == NULL)
2016 return -ENOBUFS;
2017
2018
2019
2020
2021 skb_reserve(skb, reserved);
2022 skb_reset_network_header(skb);
2023
2024
2025 if (hhlen) {
2026 skb->data -= hhlen;
2027 skb->tail -= hhlen;
2028 if (len < hhlen)
2029 skb_reset_network_header(skb);
2030 }
2031 err = memcpy_from_msg(skb_put(skb, len), msg, len);
2032 if (err)
2033 goto out_free;
2034 goto retry;
2035 }
2036
2037 if (!dev_validate_header(dev, skb->data, len)) {
2038 err = -EINVAL;
2039 goto out_unlock;
2040 }
2041 if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
2042 !packet_extra_vlan_len_allowed(dev, skb)) {
2043 err = -EMSGSIZE;
2044 goto out_unlock;
2045 }
2046
2047 sockcm_init(&sockc, sk);
2048 if (msg->msg_controllen) {
2049 err = sock_cmsg_send(sk, msg, &sockc);
2050 if (unlikely(err))
2051 goto out_unlock;
2052 }
2053
2054 skb->protocol = proto;
2055 skb->dev = dev;
2056 skb->priority = sk->sk_priority;
2057 skb->mark = sk->sk_mark;
2058 skb->tstamp = sockc.transmit_time;
2059
2060 skb_setup_tx_timestamp(skb, sockc.tsflags);
2061
2062 if (unlikely(extra_len == 4))
2063 skb->no_fcs = 1;
2064
2065 packet_parse_headers(skb, sock);
2066
2067 dev_queue_xmit(skb);
2068 rcu_read_unlock();
2069 return len;
2070
2071 out_unlock:
2072 rcu_read_unlock();
2073 out_free:
2074 kfree_skb(skb);
2075 return err;
2076 }
2077
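/*
 * Run the socket's attached BPF filter over the skb; the result is the
 * number of bytes to keep (the snap length), 0 meaning drop.
 */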
2078 static unsigned int run_filter(struct sk_buff *skb,
2079 const struct sock *sk,
2080 unsigned int res)
2081 {
2082 struct sk_filter *filter;
2083
2084 rcu_read_lock();
2085 filter = rcu_dereference(sk->sk_filter);
2086 if (filter != NULL)
2087 res = bpf_prog_run_clear_cb(filter->prog, skb);
2088 rcu_read_unlock();
2089
2090 return res;
2091 }
2092
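/*
 * For PACKET_VNET_HDR sockets: copy a virtio_net_hdr describing the skb's
 * offload state into the receive message and shrink the remaining length
 * budget accordingly.
 */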
2093 static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
2094 size_t *len)
2095 {
2096 struct virtio_net_hdr vnet_hdr;
2097
2098 if (*len < sizeof(vnet_hdr))
2099 return -EINVAL;
2100 *len -= sizeof(vnet_hdr);
2101
2102 if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0))
2103 return -EINVAL;
2104
2105 return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
2106 }
2107
/*
 * Receive path for normal (non-mmap) AF_PACKET sockets.  Cloning is done
 * lazily in the hope that most packets are discarded by the attached BPF
 * filter.  Note that a shared skb's data/len/cb are temporarily mangled and
 * restored on exit; this is safe because frames arriving here are owned by
 * the current CPU and processed sequentially.
 */
2120 static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
2121 struct packet_type *pt, struct net_device *orig_dev)
2122 {
2123 struct sock *sk;
2124 struct sockaddr_ll *sll;
2125 struct packet_sock *po;
2126 u8 *skb_head = skb->data;
2127 int skb_len = skb->len;
2128 unsigned int snaplen, res;
2129 bool is_drop_n_account = false;
2130
2131 if (skb->pkt_type == PACKET_LOOPBACK)
2132 goto drop;
2133
2134 sk = pt->af_packet_priv;
2135 po = pkt_sk(sk);
2136
2137 if (!net_eq(dev_net(dev), sock_net(sk)))
2138 goto drop;
2139
2140 skb->dev = dev;
2141
2142 if (dev_has_header(dev)) {
		/*
		 * The device exposes a link-layer header to higher levels.
		 * SOCK_RAW sockets get it pushed back in front of the data;
		 * SOCK_DGRAM sockets never see it.
		 */
2150 if (sk->sk_type != SOCK_DGRAM)
2151 skb_push(skb, skb->data - skb_mac_header(skb));
2152 else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Outgoing frames still carry the link-layer header at the head; pull past it. */
2154 skb_pull(skb, skb_network_offset(skb));
2155 }
2156 }
2157
2158 snaplen = skb->len;
2159
2160 res = run_filter(skb, sk, snaplen);
2161 if (!res)
2162 goto drop_n_restore;
2163 if (snaplen > res)
2164 snaplen = res;
2165
2166 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
2167 goto drop_n_acct;
2168
2169 if (skb_shared(skb)) {
2170 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
2171 if (nskb == NULL)
2172 goto drop_n_acct;
2173
2174 if (skb_head != skb->data) {
2175 skb->data = skb_head;
2176 skb->len = skb_len;
2177 }
2178 consume_skb(skb);
2179 skb = nskb;
2180 }
2181
2182 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
2183
2184 sll = &PACKET_SKB_CB(skb)->sa.ll;
2185 sll->sll_hatype = dev->type;
2186 sll->sll_pkttype = skb->pkt_type;
2187 if (unlikely(po->origdev))
2188 sll->sll_ifindex = orig_dev->ifindex;
2189 else
2190 sll->sll_ifindex = dev->ifindex;
2191
2192 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
2193
	/*
	 * sll_family and sll_protocol live in the same bytes as origlen (see
	 * struct packet_skb_cb); they are filled in later, at recvmsg time,
	 * after origlen has been read.
	 */
2197 PACKET_SKB_CB(skb)->sa.origlen = skb->len;
2198
2199 if (pskb_trim(skb, snaplen))
2200 goto drop_n_acct;
2201
2202 skb_set_owner_r(skb, sk);
2203 skb->dev = NULL;
2204 skb_dst_drop(skb);
2205
	/* Drop any conntrack reference. */
2207 nf_reset_ct(skb);
2208
2209 spin_lock(&sk->sk_receive_queue.lock);
2210 po->stats.stats1.tp_packets++;
2211 sock_skb_set_dropcount(sk, skb);
2212 skb_clear_delivery_time(skb);
2213 __skb_queue_tail(&sk->sk_receive_queue, skb);
2214 spin_unlock(&sk->sk_receive_queue.lock);
2215 sk->sk_data_ready(sk);
2216 return 0;
2217
2218 drop_n_acct:
2219 is_drop_n_account = true;
2220 atomic_inc(&po->tp_drops);
2221 atomic_inc(&sk->sk_drops);
2222
2223 drop_n_restore:
2224 if (skb_head != skb->data && skb_shared(skb)) {
2225 skb->data = skb_head;
2226 skb->len = skb_len;
2227 }
2228 drop:
2229 if (!is_drop_n_account)
2230 consume_skb(skb);
2231 else
2232 kfree_skb(skb);
2233 return 0;
2234 }
2235
2236 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
2237 struct packet_type *pt, struct net_device *orig_dev)
2238 {
2239 struct sock *sk;
2240 struct packet_sock *po;
2241 struct sockaddr_ll *sll;
2242 union tpacket_uhdr h;
2243 u8 *skb_head = skb->data;
2244 int skb_len = skb->len;
2245 unsigned int snaplen, res;
2246 unsigned long status = TP_STATUS_USER;
2247 unsigned short macoff, hdrlen;
2248 unsigned int netoff;
2249 struct sk_buff *copy_skb = NULL;
2250 struct timespec64 ts;
2251 __u32 ts_status;
2252 bool is_drop_n_account = false;
2253 unsigned int slot_id = 0;
2254 bool do_vnet = false;
2255
	/*
	 * The offset arithmetic below relies on the padded sizes of the V2/V3
	 * frame headers; catch any layout change at build time.
	 */
2260 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
2261 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
2262
2263 if (skb->pkt_type == PACKET_LOOPBACK)
2264 goto drop;
2265
2266 sk = pt->af_packet_priv;
2267 po = pkt_sk(sk);
2268
2269 if (!net_eq(dev_net(dev), sock_net(sk)))
2270 goto drop;
2271
2272 if (dev_has_header(dev)) {
2273 if (sk->sk_type != SOCK_DGRAM)
2274 skb_push(skb, skb->data - skb_mac_header(skb));
2275 else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Outgoing frames still carry the link-layer header at the head; pull past it. */
2277 skb_pull(skb, skb_network_offset(skb));
2278 }
2279 }
2280
2281 snaplen = skb->len;
2282
2283 res = run_filter(skb, sk, snaplen);
2284 if (!res)
2285 goto drop_n_restore;
2286
2287
2288 if (__packet_rcv_has_room(po, skb) == ROOM_NONE) {
2289 atomic_inc(&po->tp_drops);
2290 goto drop_n_restore;
2291 }
2292
2293 if (skb->ip_summed == CHECKSUM_PARTIAL)
2294 status |= TP_STATUS_CSUMNOTREADY;
2295 else if (skb->pkt_type != PACKET_OUTGOING &&
2296 (skb->ip_summed == CHECKSUM_COMPLETE ||
2297 skb_csum_unnecessary(skb)))
2298 status |= TP_STATUS_CSUM_VALID;
2299
2300 if (snaplen > res)
2301 snaplen = res;
2302
2303 if (sk->sk_type == SOCK_DGRAM) {
2304 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2305 po->tp_reserve;
2306 } else {
2307 unsigned int maclen = skb_network_offset(skb);
2308 netoff = TPACKET_ALIGN(po->tp_hdrlen +
2309 (maclen < 16 ? 16 : maclen)) +
2310 po->tp_reserve;
2311 if (po->has_vnet_hdr) {
2312 netoff += sizeof(struct virtio_net_hdr);
2313 do_vnet = true;
2314 }
2315 macoff = netoff - maclen;
2316 }
2317 if (netoff > USHRT_MAX) {
2318 atomic_inc(&po->tp_drops);
2319 goto drop_n_restore;
2320 }
2321 if (po->tp_version <= TPACKET_V2) {
2322 if (macoff + snaplen > po->rx_ring.frame_size) {
2323 if (po->copy_thresh &&
2324 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
2325 if (skb_shared(skb)) {
2326 copy_skb = skb_clone(skb, GFP_ATOMIC);
2327 } else {
2328 copy_skb = skb_get(skb);
2329 skb_head = skb->data;
2330 }
2331 if (copy_skb) {
2332 memset(&PACKET_SKB_CB(copy_skb)->sa.ll, 0,
2333 sizeof(PACKET_SKB_CB(copy_skb)->sa.ll));
2334 skb_set_owner_r(copy_skb, sk);
2335 }
2336 }
2337 snaplen = po->rx_ring.frame_size - macoff;
2338 if ((int)snaplen < 0) {
2339 snaplen = 0;
2340 do_vnet = false;
2341 }
2342 }
2343 } else if (unlikely(macoff + snaplen >
2344 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2345 u32 nval;
2346
2347 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2348 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2349 snaplen, nval, macoff);
2350 snaplen = nval;
2351 if (unlikely((int)snaplen < 0)) {
2352 snaplen = 0;
2353 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
2354 do_vnet = false;
2355 }
2356 }
2357 spin_lock(&sk->sk_receive_queue.lock);
2358 h.raw = packet_current_rx_frame(po, skb,
2359 TP_STATUS_KERNEL, (macoff+snaplen));
2360 if (!h.raw)
2361 goto drop_n_account;
2362
2363 if (po->tp_version <= TPACKET_V2) {
2364 slot_id = po->rx_ring.head;
2365 if (test_bit(slot_id, po->rx_ring.rx_owner_map))
2366 goto drop_n_account;
2367 __set_bit(slot_id, po->rx_ring.rx_owner_map);
2368 }
2369
2370 if (do_vnet &&
2371 virtio_net_hdr_from_skb(skb, h.raw + macoff -
2372 sizeof(struct virtio_net_hdr),
2373 vio_le(), true, 0)) {
2374 if (po->tp_version == TPACKET_V3)
2375 prb_clear_blk_fill_status(&po->rx_ring);
2376 goto drop_n_account;
2377 }
2378
2379 if (po->tp_version <= TPACKET_V2) {
2380 packet_increment_rx_head(po, &po->rx_ring);

		/*
		 * Report TP_STATUS_LOSING as long as there are unread drops;
		 * the drop counter is only cleared when the application reads
		 * the socket statistics.
		 */
2387 if (atomic_read(&po->tp_drops))
2388 status |= TP_STATUS_LOSING;
2389 }
2390
2391 po->stats.stats1.tp_packets++;
2392 if (copy_skb) {
2393 status |= TP_STATUS_COPY;
2394 skb_clear_delivery_time(copy_skb);
2395 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2396 }
2397 spin_unlock(&sk->sk_receive_queue.lock);
2398
2399 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
2400
	/*
	 * Always stamp the frame: prefer a timestamp the skb already carries
	 * (hardware or software), otherwise fall back to the current time.
	 */
2404 ts_status = tpacket_get_timestamp(skb, &ts,
2405 po->tp_tstamp | SOF_TIMESTAMPING_SOFTWARE);
2406 if (!ts_status)
2407 ktime_get_real_ts64(&ts);
2408
2409 status |= ts_status;
2410
2411 switch (po->tp_version) {
2412 case TPACKET_V1:
2413 h.h1->tp_len = skb->len;
2414 h.h1->tp_snaplen = snaplen;
2415 h.h1->tp_mac = macoff;
2416 h.h1->tp_net = netoff;
2417 h.h1->tp_sec = ts.tv_sec;
2418 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
2419 hdrlen = sizeof(*h.h1);
2420 break;
2421 case TPACKET_V2:
2422 h.h2->tp_len = skb->len;
2423 h.h2->tp_snaplen = snaplen;
2424 h.h2->tp_mac = macoff;
2425 h.h2->tp_net = netoff;
2426 h.h2->tp_sec = ts.tv_sec;
2427 h.h2->tp_nsec = ts.tv_nsec;
2428 if (skb_vlan_tag_present(skb)) {
2429 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
2430 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2431 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
2432 } else {
2433 h.h2->tp_vlan_tci = 0;
2434 h.h2->tp_vlan_tpid = 0;
2435 }
2436 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
2437 hdrlen = sizeof(*h.h2);
2438 break;
2439 case TPACKET_V3:
2440 /* tp_next_offset and the VLAN fields of the v3 header were already
2441  * populated when the frame was claimed, so OR in the status bits
2442  * instead of overwriting the header. */
2443 h.h3->tp_status |= status;
2444 h.h3->tp_len = skb->len;
2445 h.h3->tp_snaplen = snaplen;
2446 h.h3->tp_mac = macoff;
2447 h.h3->tp_net = netoff;
2448 h.h3->tp_sec = ts.tv_sec;
2449 h.h3->tp_nsec = ts.tv_nsec;
2450 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
2451 hdrlen = sizeof(*h.h3);
2452 break;
2453 default:
2454 BUG();
2455 }
2456
2457 sll = h.raw + TPACKET_ALIGN(hdrlen);
2458 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
2459 sll->sll_family = AF_PACKET;
2460 sll->sll_hatype = dev->type;
2461 sll->sll_protocol = skb->protocol;
2462 sll->sll_pkttype = skb->pkt_type;
2463 if (unlikely(po->origdev))
2464 sll->sll_ifindex = orig_dev->ifindex;
2465 else
2466 sll->sll_ifindex = dev->ifindex;
2467
2468 smp_mb();
2469
2470 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
2471 if (po->tp_version <= TPACKET_V2) {
2472 u8 *start, *end;
2473
2474 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2475 macoff + snaplen);
2476
2477 for (start = h.raw; start < end; start += PAGE_SIZE)
2478 flush_dcache_page(pgv_to_page(start));
2479 }
2480 smp_wmb();
2481 #endif
2482
2483 if (po->tp_version <= TPACKET_V2) {
2484 spin_lock(&sk->sk_receive_queue.lock);
2485 __packet_set_status(po, h.raw, status);
2486 __clear_bit(slot_id, po->rx_ring.rx_owner_map);
2487 spin_unlock(&sk->sk_receive_queue.lock);
2488 sk->sk_data_ready(sk);
2489 } else if (po->tp_version == TPACKET_V3) {
2490 prb_clear_blk_fill_status(&po->rx_ring);
2491 }
2492
2493 drop_n_restore:
2494 if (skb_head != skb->data && skb_shared(skb)) {
2495 skb->data = skb_head;
2496 skb->len = skb_len;
2497 }
2498 drop:
2499 if (!is_drop_n_account)
2500 consume_skb(skb);
2501 else
2502 kfree_skb(skb);
2503 return 0;
2504
2505 drop_n_account:
2506 spin_unlock(&sk->sk_receive_queue.lock);
2507 atomic_inc(&po->tp_drops);
2508 is_drop_n_account = true;
2509
2510 sk->sk_data_ready(sk);
2511 kfree_skb(copy_skb);
2512 goto drop_n_restore;
2513 }
2514
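/*
 * skb destructor for TX ring frames: once the device is done with the
 * skb, hand the slot back to user space by setting TP_STATUS_AVAILABLE
 * (plus any timestamp status), drop the pending count and wake up a
 * sender blocked in tpacket_snd().
 */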
2515 static void tpacket_destruct_skb(struct sk_buff *skb)
2516 {
2517 struct packet_sock *po = pkt_sk(skb->sk);
2518
2519 if (likely(po->tx_ring.pg_vec)) {
2520 void *ph;
2521 __u32 ts;
2522
2523 ph = skb_zcopy_get_nouarg(skb);
2524 packet_dec_pending(&po->tx_ring);
2525
2526 ts = __packet_set_timestamp(po, ph, skb);
2527 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
2528
2529 if (!packet_read_pending(&po->tx_ring))
2530 complete(&po->skb_completion);
2531 }
2532
2533 sock_wfree(skb);
2534 }
2535
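/*
 * Sanity-check a virtio_net_hdr supplied by user space: grow hdr_len so
 * it covers the checksum start/offset area if needed, then reject the
 * header if hdr_len claims more data than was actually provided.
 */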
2536 static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
2537 {
2538 if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2539 (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2540 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
2541 __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
2542 vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
2543 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2544 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
2545
2546 if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
2547 return -EINVAL;
2548
2549 return 0;
2550 }
2551
2552 static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
2553 struct virtio_net_hdr *vnet_hdr)
2554 {
2555 if (*len < sizeof(*vnet_hdr))
2556 return -EINVAL;
2557 *len -= sizeof(*vnet_hdr);
2558
2559 if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
2560 return -EFAULT;
2561
2562 return __packet_snd_vnet_parse(vnet_hdr, *len);
2563 }
2564
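/*
 * Build an skb for one TX ring frame.  The link-layer header (or the
 * first copylen bytes) is copied into the linear area so it can be
 * validated; the remainder is attached as page fragments that point
 * straight into the ring pages, avoiding an extra copy.
 */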
2565 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
2566 void *frame, struct net_device *dev, void *data, int tp_len,
2567 __be16 proto, unsigned char *addr, int hlen, int copylen,
2568 const struct sockcm_cookie *sockc)
2569 {
2570 union tpacket_uhdr ph;
2571 int to_write, offset, len, nr_frags, len_max;
2572 struct socket *sock = po->sk.sk_socket;
2573 struct page *page;
2574 int err;
2575
2576 ph.raw = frame;
2577
2578 skb->protocol = proto;
2579 skb->dev = dev;
2580 skb->priority = po->sk.sk_priority;
2581 skb->mark = po->sk.sk_mark;
2582 skb->tstamp = sockc->transmit_time;
2583 skb_setup_tx_timestamp(skb, sockc->tsflags);
2584 skb_zcopy_set_nouarg(skb, ph.raw);
2585
2586 skb_reserve(skb, hlen);
2587 skb_reset_network_header(skb);
2588
2589 to_write = tp_len;
2590
2591 if (sock->type == SOCK_DGRAM) {
2592 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2593 NULL, tp_len);
2594 if (unlikely(err < 0))
2595 return -EINVAL;
2596 } else if (copylen) {
2597 int hdrlen = min_t(int, copylen, tp_len);
2598
2599 skb_push(skb, dev->hard_header_len);
2600 skb_put(skb, copylen - dev->hard_header_len);
2601 err = skb_store_bits(skb, 0, data, hdrlen);
2602 if (unlikely(err))
2603 return err;
2604 if (!dev_validate_header(dev, skb->data, hdrlen))
2605 return -EINVAL;
2606
2607 data += hdrlen;
2608 to_write -= hdrlen;
2609 }
2610
2611 offset = offset_in_page(data);
2612 len_max = PAGE_SIZE - offset;
2613 len = ((to_write > len_max) ? len_max : to_write);
2614
2615 skb->data_len = to_write;
2616 skb->len += to_write;
2617 skb->truesize += to_write;
2618 refcount_add(to_write, &po->sk.sk_wmem_alloc);
2619
2620 while (likely(to_write)) {
2621 nr_frags = skb_shinfo(skb)->nr_frags;
2622
2623 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
2624 pr_err("Packet exceeds the number of skb frags (%lu)\n",
2625 (unsigned long)MAX_SKB_FRAGS);
2626 return -EFAULT;
2627 }
2628
2629 page = pgv_to_page(data);
2630 data += len;
2631 flush_dcache_page(page);
2632 get_page(page);
2633 skb_fill_page_desc(skb, nr_frags, page, offset, len);
2634 to_write -= len;
2635 offset = 0;
2636 len_max = PAGE_SIZE;
2637 len = ((to_write > len_max) ? len_max : to_write);
2638 }
2639
2640 packet_parse_headers(skb, sock);
2641
2642 return tp_len;
2643 }
2644
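/*
 * Read tp_len and the data offset out of a TX ring frame header,
 * honouring user-supplied offsets when PACKET_TX_HAS_OFF is set.
 * Returns the frame length or a negative error.
 */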
2645 static int tpacket_parse_header(struct packet_sock *po, void *frame,
2646 int size_max, void **data)
2647 {
2648 union tpacket_uhdr ph;
2649 int tp_len, off;
2650
2651 ph.raw = frame;
2652
2653 switch (po->tp_version) {
2654 case TPACKET_V3:
2655 if (ph.h3->tp_next_offset != 0) {
2656 pr_warn_once("variable sized slot not supported\n");
2657 return -EINVAL;
2658 }
2659 tp_len = ph.h3->tp_len;
2660 break;
2661 case TPACKET_V2:
2662 tp_len = ph.h2->tp_len;
2663 break;
2664 default:
2665 tp_len = ph.h1->tp_len;
2666 break;
2667 }
2668 if (unlikely(tp_len > size_max)) {
2669 pr_err("packet size is too large (%d > %d)\n", tp_len, size_max);
2670 return -EMSGSIZE;
2671 }
2672
2673 if (unlikely(po->tp_tx_has_off)) {
2674 int off_min, off_max;
2675
2676 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2677 off_max = po->tx_ring.frame_size - tp_len;
2678 if (po->sk.sk_type == SOCK_DGRAM) {
2679 switch (po->tp_version) {
2680 case TPACKET_V3:
2681 off = ph.h3->tp_net;
2682 break;
2683 case TPACKET_V2:
2684 off = ph.h2->tp_net;
2685 break;
2686 default:
2687 off = ph.h1->tp_net;
2688 break;
2689 }
2690 } else {
2691 switch (po->tp_version) {
2692 case TPACKET_V3:
2693 off = ph.h3->tp_mac;
2694 break;
2695 case TPACKET_V2:
2696 off = ph.h2->tp_mac;
2697 break;
2698 default:
2699 off = ph.h1->tp_mac;
2700 break;
2701 }
2702 }
2703 if (unlikely((off < off_min) || (off_max < off)))
2704 return -EINVAL;
2705 } else {
2706 off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2707 }
2708
2709 *data = frame + off;
2710 return tp_len;
2711 }
2712
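/*
 * sendmsg() path for a socket with a TX ring mapped: walk the ring,
 * turn every frame in TP_STATUS_SEND_REQUEST state into an skb and
 * transmit it, waiting for completions unless MSG_DONTWAIT was set.
 */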
2713 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2714 {
2715 struct sk_buff *skb = NULL;
2716 struct net_device *dev;
2717 struct virtio_net_hdr *vnet_hdr = NULL;
2718 struct sockcm_cookie sockc;
2719 __be16 proto;
2720 int err, reserve = 0;
2721 void *ph;
2722 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
2723 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
2724 unsigned char *addr = NULL;
2725 int tp_len, size_max;
2726 void *data;
2727 int len_sum = 0;
2728 int status = TP_STATUS_AVAILABLE;
2729 int hlen, tlen, copylen = 0;
2730 long timeo = 0;
2731
2732 mutex_lock(&po->pg_vec_lock);
2733
2734
2735 /* packet_sendmsg() looked at tx_ring.pg_vec without a lock; re-check
2736  * it here under pg_vec_lock before trusting it. */
2737 if (unlikely(!po->tx_ring.pg_vec)) {
2738 err = -EBUSY;
2739 goto out;
2740 }
2741 if (likely(saddr == NULL)) {
2742 dev = packet_cached_dev_get(po);
2743 proto = READ_ONCE(po->num);
2744 } else {
2745 err = -EINVAL;
2746 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2747 goto out;
2748 if (msg->msg_namelen < (saddr->sll_halen
2749 + offsetof(struct sockaddr_ll,
2750 sll_addr)))
2751 goto out;
2752 proto = saddr->sll_protocol;
2753 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
2754 if (po->sk.sk_socket->type == SOCK_DGRAM) {
2755 if (dev && msg->msg_namelen < dev->addr_len +
2756 offsetof(struct sockaddr_ll, sll_addr))
2757 goto out_put;
2758 addr = saddr->sll_addr;
2759 }
2760 }
2761
2762 err = -ENXIO;
2763 if (unlikely(dev == NULL))
2764 goto out;
2765 err = -ENETDOWN;
2766 if (unlikely(!(dev->flags & IFF_UP)))
2767 goto out_put;
2768
2769 sockcm_init(&sockc, &po->sk);
2770 if (msg->msg_controllen) {
2771 err = sock_cmsg_send(&po->sk, msg, &sockc);
2772 if (unlikely(err))
2773 goto out_put;
2774 }
2775
2776 if (po->sk.sk_socket->type == SOCK_RAW)
2777 reserve = dev->hard_header_len;
2778 size_max = po->tx_ring.frame_size
2779 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
2780
2781 if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
2782 size_max = dev->mtu + reserve + VLAN_HLEN;
2783
2784 reinit_completion(&po->skb_completion);
2785
2786 do {
2787 ph = packet_current_frame(po, &po->tx_ring,
2788 TP_STATUS_SEND_REQUEST);
2789 if (unlikely(ph == NULL)) {
2790 if (need_wait && skb) {
2791 timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
2792 timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);
2793 if (timeo <= 0) {
2794 err = !timeo ? -ETIMEDOUT : -ERESTARTSYS;
2795 goto out_put;
2796 }
2797 }
2798
2799 continue;
2800 }
2801
2802 skb = NULL;
2803 tp_len = tpacket_parse_header(po, ph, size_max, &data);
2804 if (tp_len < 0)
2805 goto tpacket_error;
2806
2807 status = TP_STATUS_SEND_REQUEST;
2808 hlen = LL_RESERVED_SPACE(dev);
2809 tlen = dev->needed_tailroom;
2810 if (po->has_vnet_hdr) {
2811 vnet_hdr = data;
2812 data += sizeof(*vnet_hdr);
2813 tp_len -= sizeof(*vnet_hdr);
2814 if (tp_len < 0 ||
2815 __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
2816 tp_len = -EINVAL;
2817 goto tpacket_error;
2818 }
2819 copylen = __virtio16_to_cpu(vio_le(),
2820 vnet_hdr->hdr_len);
2821 }
2822 copylen = max_t(int, copylen, dev->hard_header_len);
2823 skb = sock_alloc_send_skb(&po->sk,
2824 hlen + tlen + sizeof(struct sockaddr_ll) +
2825 (copylen - dev->hard_header_len),
2826 !need_wait, &err);
2827
2828 if (unlikely(skb == NULL)) {
2829 /* report what has already been sent rather than the allocation error */
2830 if (likely(len_sum > 0))
2831 err = len_sum;
2832 goto out_status;
2833 }
2834 tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
2835 addr, hlen, copylen, &sockc);
2836 if (likely(tp_len >= 0) &&
2837 tp_len > dev->mtu + reserve &&
2838 !po->has_vnet_hdr &&
2839 !packet_extra_vlan_len_allowed(dev, skb))
2840 tp_len = -EMSGSIZE;
2841
2842 if (unlikely(tp_len < 0)) {
2843 tpacket_error:
2844 if (po->tp_loss) {
2845 __packet_set_status(po, ph,
2846 TP_STATUS_AVAILABLE);
2847 packet_increment_head(&po->tx_ring);
2848 kfree_skb(skb);
2849 continue;
2850 } else {
2851 status = TP_STATUS_WRONG_FORMAT;
2852 err = tp_len;
2853 goto out_status;
2854 }
2855 }
2856
2857 if (po->has_vnet_hdr) {
2858 if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
2859 tp_len = -EINVAL;
2860 goto tpacket_error;
2861 }
2862 virtio_net_hdr_set_proto(skb, vnet_hdr);
2863 }
2864
2865 skb->destructor = tpacket_destruct_skb;
2866 __packet_set_status(po, ph, TP_STATUS_SENDING);
2867 packet_inc_pending(&po->tx_ring);
2868
2869 status = TP_STATUS_SEND_REQUEST;
2870 err = po->xmit(skb);
2871 if (unlikely(err != 0)) {
2872 if (err > 0)
2873 err = net_xmit_errno(err);
2874 if (err && __packet_get_status(po, ph) ==
2875 TP_STATUS_AVAILABLE) {
2876 /* the destructor already ran and released this frame */
2877 skb = NULL;
2878 goto out_status;
2879 }
2880
2881 /* The skb was dropped further down the stack but its destructor has
2882  * not run yet; treat this like congestion and keep going.
2883  */
2884 err = 0;
2885 }
2886 packet_increment_head(&po->tx_ring);
2887 len_sum += tp_len;
2888 } while (likely((ph != NULL) ||
2889
2890 /* packet_read_pending() sums a per-cpu counter and can be slow, but
2891  * it is only evaluated when no frame was ready and the sender is
2892  * willing to wait for outstanding completions.
2893  */
2894
2895 (need_wait && packet_read_pending(&po->tx_ring))));
2896
2897 err = len_sum;
2898 goto out_put;
2899
2900 out_status:
2901 __packet_set_status(po, ph, status);
2902 kfree_skb(skb);
2903 out_put:
2904 dev_put(dev);
2905 out:
2906 mutex_unlock(&po->pg_vec_lock);
2907 return err;
2908 }
2909
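/*
 * Allocate an skb for the non-ring transmit path: packets smaller than
 * a page stay fully linear, larger ones get a linear header part plus
 * paged data.
 */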
2910 static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2911 size_t reserve, size_t len,
2912 size_t linear, int noblock,
2913 int *err)
2914 {
2915 struct sk_buff *skb;
2916
2917
2918 if (prepad + len < PAGE_SIZE || !linear)
2919 linear = len;
2920
2921 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
2922 err, 0);
2923 if (!skb)
2924 return NULL;
2925
2926 skb_reserve(skb, reserve);
2927 skb_put(skb, linear);
2928 skb->data_len = len - linear;
2929 skb->len += len - linear;
2930
2931 return skb;
2932 }
2933
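/*
 * Plain (non-ring) sendmsg(): resolve the output device, copy the
 * payload from user space into a freshly allocated skb, apply the
 * optional virtio_net_hdr and hand the result to po->xmit().
 */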
2934 static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
2935 {
2936 struct sock *sk = sock->sk;
2937 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
2938 struct sk_buff *skb;
2939 struct net_device *dev;
2940 __be16 proto;
2941 unsigned char *addr = NULL;
2942 int err, reserve = 0;
2943 struct sockcm_cookie sockc;
2944 struct virtio_net_hdr vnet_hdr = { 0 };
2945 int offset = 0;
2946 struct packet_sock *po = pkt_sk(sk);
2947 bool has_vnet_hdr = false;
2948 int hlen, tlen, linear;
2949 int extra_len = 0;
2950
2951
2952 /*
2953  * Get and verify the destination address.
2954  */
2955 if (likely(saddr == NULL)) {
2956 dev = packet_cached_dev_get(po);
2957 proto = READ_ONCE(po->num);
2958 } else {
2959 err = -EINVAL;
2960 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2961 goto out;
2962 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2963 goto out;
2964 proto = saddr->sll_protocol;
2965 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
2966 if (sock->type == SOCK_DGRAM) {
2967 if (dev && msg->msg_namelen < dev->addr_len +
2968 offsetof(struct sockaddr_ll, sll_addr))
2969 goto out_unlock;
2970 addr = saddr->sll_addr;
2971 }
2972 }
2973
2974 err = -ENXIO;
2975 if (unlikely(dev == NULL))
2976 goto out_unlock;
2977 err = -ENETDOWN;
2978 if (unlikely(!(dev->flags & IFF_UP)))
2979 goto out_unlock;
2980
2981 sockcm_init(&sockc, sk);
2982 sockc.mark = sk->sk_mark;
2983 if (msg->msg_controllen) {
2984 err = sock_cmsg_send(sk, msg, &sockc);
2985 if (unlikely(err))
2986 goto out_unlock;
2987 }
2988
2989 if (sock->type == SOCK_RAW)
2990 reserve = dev->hard_header_len;
2991 if (po->has_vnet_hdr) {
2992 err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
2993 if (err)
2994 goto out_unlock;
2995 has_vnet_hdr = true;
2996 }
2997
2998 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2999 if (!netif_supports_nofcs(dev)) {
3000 err = -EPROTONOSUPPORT;
3001 goto out_unlock;
3002 }
3003 extra_len = 4;
3004 }
3005
3006 err = -EMSGSIZE;
3007 if (!vnet_hdr.gso_type &&
3008 (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
3009 goto out_unlock;
3010
3011 err = -ENOBUFS;
3012 hlen = LL_RESERVED_SPACE(dev);
3013 tlen = dev->needed_tailroom;
3014 linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
3015 linear = max(linear, min_t(int, len, dev->hard_header_len));
3016 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
3017 msg->msg_flags & MSG_DONTWAIT, &err);
3018 if (skb == NULL)
3019 goto out_unlock;
3020
3021 skb_reset_network_header(skb);
3022
3023 err = -EINVAL;
3024 if (sock->type == SOCK_DGRAM) {
3025 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
3026 if (unlikely(offset < 0))
3027 goto out_free;
3028 } else if (reserve) {
3029 skb_reserve(skb, -reserve);
3030 if (len < reserve + sizeof(struct ipv6hdr) &&
3031 dev->min_header_len != dev->hard_header_len)
3032 skb_reset_network_header(skb);
3033 }
3034
3035
3036 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
3037 if (err)
3038 goto out_free;
3039
3040 if ((sock->type == SOCK_RAW &&
3041 !dev_validate_header(dev, skb->data, len)) || !skb->len) {
3042 err = -EINVAL;
3043 goto out_free;
3044 }
3045
3046 skb_setup_tx_timestamp(skb, sockc.tsflags);
3047
3048 if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
3049 !packet_extra_vlan_len_allowed(dev, skb)) {
3050 err = -EMSGSIZE;
3051 goto out_free;
3052 }
3053
3054 skb->protocol = proto;
3055 skb->dev = dev;
3056 skb->priority = sk->sk_priority;
3057 skb->mark = sockc.mark;
3058 skb->tstamp = sockc.transmit_time;
3059
3060 if (unlikely(extra_len == 4))
3061 skb->no_fcs = 1;
3062
3063 packet_parse_headers(skb, sock);
3064
3065 if (has_vnet_hdr) {
3066 err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
3067 if (err)
3068 goto out_free;
3069 len += sizeof(vnet_hdr);
3070 virtio_net_hdr_set_proto(skb, &vnet_hdr);
3071 }
3072
3073 err = po->xmit(skb);
3074 if (unlikely(err != 0)) {
3075 if (err > 0)
3076 err = net_xmit_errno(err);
3077 if (err)
3078 goto out_unlock;
3079 }
3080
3081 dev_put(dev);
3082
3083 return len;
3084
3085 out_free:
3086 kfree_skb(skb);
3087 out_unlock:
3088 dev_put(dev);
3089 out:
3090 return err;
3091 }
3092
3093 static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
3094 {
3095 struct sock *sk = sock->sk;
3096 struct packet_sock *po = pkt_sk(sk);
3097
3098
3099 /* Reading tx_ring.pg_vec here is lockless on purpose; tpacket_snd()
3100  * re-checks it under pg_vec_lock before using the ring. */
3101 if (data_race(po->tx_ring.pg_vec))
3102 return tpacket_snd(po, msg);
3103
3104 return packet_snd(sock, msg, len);
3105 }
3106
3107
3108
3109
3110
3111
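/*
 * Close a PACKET socket: unhook the protocol handler, drop multicast
 * subscriptions, tear down any mapped rings and fanout state, then
 * release the socket itself.
 */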
3112 static int packet_release(struct socket *sock)
3113 {
3114 struct sock *sk = sock->sk;
3115 struct packet_sock *po;
3116 struct packet_fanout *f;
3117 struct net *net;
3118 union tpacket_req_u req_u;
3119
3120 if (!sk)
3121 return 0;
3122
3123 net = sock_net(sk);
3124 po = pkt_sk(sk);
3125
3126 mutex_lock(&net->packet.sklist_lock);
3127 sk_del_node_init_rcu(sk);
3128 mutex_unlock(&net->packet.sklist_lock);
3129
3130 sock_prot_inuse_add(net, sk->sk_prot, -1);
3131
3132 spin_lock(&po->bind_lock);
3133 unregister_prot_hook(sk, false);
3134 packet_cached_dev_reset(po);
3135
3136 if (po->prot_hook.dev) {
3137 netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker);
3138 po->prot_hook.dev = NULL;
3139 }
3140 spin_unlock(&po->bind_lock);
3141
3142 packet_flush_mclist(sk);
3143
3144 lock_sock(sk);
3145 if (po->rx_ring.pg_vec) {
3146 memset(&req_u, 0, sizeof(req_u));
3147 packet_set_ring(sk, &req_u, 1, 0);
3148 }
3149
3150 if (po->tx_ring.pg_vec) {
3151 memset(&req_u, 0, sizeof(req_u));
3152 packet_set_ring(sk, &req_u, 1, 1);
3153 }
3154 release_sock(sk);
3155
3156 f = fanout_release(sk);
3157
3158 synchronize_net();
3159
3160 kfree(po->rollover);
3161 if (f) {
3162 fanout_release_data(f);
3163 kvfree(f);
3164 }
3165
3166
3167 /* The socket is dead now: no more input will arrive. */
3168 sock_orphan(sk);
3169 sock->sk = NULL;
3170
3171
3172 /* Purge whatever is still queued. */
3173 skb_queue_purge(&sk->sk_receive_queue);
3174 packet_free_pending(po);
3175 sk_refcnt_debug_release(sk);
3176
3177 sock_put(sk);
3178 return 0;
3179 }
3180
3181
3182
3183
3184
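/*
 * Attach (or re-attach) the socket to a device and protocol.  Callers
 * pass either an interface name (SOCK_PACKET bind) or an ifindex.
 */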
3185 static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
3186 __be16 proto)
3187 {
3188 struct packet_sock *po = pkt_sk(sk);
3189 struct net_device *dev = NULL;
3190 bool unlisted = false;
3191 bool need_rehook;
3192 int ret = 0;
3193
3194 lock_sock(sk);
3195 spin_lock(&po->bind_lock);
3196 rcu_read_lock();
3197
3198 if (po->fanout) {
3199 ret = -EINVAL;
3200 goto out_unlock;
3201 }
3202
3203 if (name) {
3204 dev = dev_get_by_name_rcu(sock_net(sk), name);
3205 if (!dev) {
3206 ret = -ENODEV;
3207 goto out_unlock;
3208 }
3209 } else if (ifindex) {
3210 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3211 if (!dev) {
3212 ret = -ENODEV;
3213 goto out_unlock;
3214 }
3215 }
3216
3217 need_rehook = po->prot_hook.type != proto || po->prot_hook.dev != dev;
3218
3219 if (need_rehook) {
3220 dev_hold(dev);
3221 if (po->running) {
3222 rcu_read_unlock();
3223
3224 /* Clearing po->num stops packet_notifier() from re-registering the
3225  * hook while we rebind. */
3226 WRITE_ONCE(po->num, 0);
3227 __unregister_prot_hook(sk, true);
3228 rcu_read_lock();
3229 if (dev)
3230 unlisted = !dev_get_by_index_rcu(sock_net(sk),
3231 dev->ifindex);
3232 }
3233
3234 BUG_ON(po->running);
3235 WRITE_ONCE(po->num, proto);
3236 po->prot_hook.type = proto;
3237
3238 netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker);
3239
3240 if (unlikely(unlisted)) {
3241 po->prot_hook.dev = NULL;
3242 WRITE_ONCE(po->ifindex, -1);
3243 packet_cached_dev_reset(po);
3244 } else {
3245 netdev_hold(dev, &po->prot_hook.dev_tracker,
3246 GFP_ATOMIC);
3247 po->prot_hook.dev = dev;
3248 WRITE_ONCE(po->ifindex, dev ? dev->ifindex : 0);
3249 packet_cached_dev_assign(po, dev);
3250 }
3251 dev_put(dev);
3252 }
3253
3254 if (proto == 0 || !need_rehook)
3255 goto out_unlock;
3256
3257 if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
3258 register_prot_hook(sk);
3259 } else {
3260 sk->sk_err = ENETDOWN;
3261 if (!sock_flag(sk, SOCK_DEAD))
3262 sk_error_report(sk);
3263 }
3264
3265 out_unlock:
3266 rcu_read_unlock();
3267 spin_unlock(&po->bind_lock);
3268 release_sock(sk);
3269 return ret;
3270 }
3271
3272
3273
3274
3275
3276 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
3277 int addr_len)
3278 {
3279 struct sock *sk = sock->sk;
3280 char name[sizeof(uaddr->sa_data) + 1];
3281
3282
3283
3284
3285
3286 if (addr_len != sizeof(struct sockaddr))
3287 return -EINVAL;
3288
3289 /* uaddr->sa_data comes from user space and is not guaranteed to be
3290  * NUL-terminated, so copy it into a terminated buffer first. */
3291 memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
3292 name[sizeof(uaddr->sa_data)] = 0;
3293
3294 return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
3295 }
3296
3297 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
3298 {
3299 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
3300 struct sock *sk = sock->sk;
3301
3302
3303
3304
3305
3306 if (addr_len < sizeof(struct sockaddr_ll))
3307 return -EINVAL;
3308 if (sll->sll_family != AF_PACKET)
3309 return -EINVAL;
3310
3311 return packet_do_bind(sk, NULL, sll->sll_ifindex,
3312 sll->sll_protocol ? : pkt_sk(sk)->num);
3313 }
3314
3315 static struct proto packet_proto = {
3316 .name = "PACKET",
3317 .owner = THIS_MODULE,
3318 .obj_size = sizeof(struct packet_sock),
3319 };
3320
3321
3322
3323
3324
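/*
 * Create a new PF_PACKET socket.  Requires CAP_NET_RAW in the owning
 * user namespace; a non-zero protocol registers the receive hook
 * immediately.
 */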
3325 static int packet_create(struct net *net, struct socket *sock, int protocol,
3326 int kern)
3327 {
3328 struct sock *sk;
3329 struct packet_sock *po;
3330 __be16 proto = (__force __be16)protocol;
3331 int err;
3332
3333 if (!ns_capable(net->user_ns, CAP_NET_RAW))
3334 return -EPERM;
3335 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
3336 sock->type != SOCK_PACKET)
3337 return -ESOCKTNOSUPPORT;
3338
3339 sock->state = SS_UNCONNECTED;
3340
3341 err = -ENOBUFS;
3342 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
3343 if (sk == NULL)
3344 goto out;
3345
3346 sock->ops = &packet_ops;
3347 if (sock->type == SOCK_PACKET)
3348 sock->ops = &packet_ops_spkt;
3349
3350 sock_init_data(sock, sk);
3351
3352 po = pkt_sk(sk);
3353 init_completion(&po->skb_completion);
3354 sk->sk_family = PF_PACKET;
3355 po->num = proto;
3356 po->xmit = dev_queue_xmit;
3357
3358 err = packet_alloc_pending(po);
3359 if (err)
3360 goto out2;
3361
3362 packet_cached_dev_reset(po);
3363
3364 sk->sk_destruct = packet_sock_destruct;
3365 sk_refcnt_debug_inc(sk);
3366
3367
3368
3369
3370
3371 spin_lock_init(&po->bind_lock);
3372 mutex_init(&po->pg_vec_lock);
3373 po->rollover = NULL;
3374 po->prot_hook.func = packet_rcv;
3375
3376 if (sock->type == SOCK_PACKET)
3377 po->prot_hook.func = packet_rcv_spkt;
3378
3379 po->prot_hook.af_packet_priv = sk;
3380 po->prot_hook.af_packet_net = sock_net(sk);
3381
3382 if (proto) {
3383 po->prot_hook.type = proto;
3384 __register_prot_hook(sk);
3385 }
3386
3387 mutex_lock(&net->packet.sklist_lock);
3388 sk_add_node_tail_rcu(sk, &net->packet.sklist);
3389 mutex_unlock(&net->packet.sklist_lock);
3390
3391 sock_prot_inuse_add(net, &packet_proto, 1);
3392
3393 return 0;
3394 out2:
3395 sk_free(sk);
3396 out:
3397 return err;
3398 }
3399
3400
3401
3402
3403
3404
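/*
 * Pull a packet from the receive queue and copy it to user space,
 * together with the source address, optional PACKET_AUXDATA ancillary
 * data and, for PACKET_VNET_HDR sockets, a virtio_net_hdr.
 */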
3405 static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
3406 int flags)
3407 {
3408 struct sock *sk = sock->sk;
3409 struct sk_buff *skb;
3410 int copied, err;
3411 int vnet_hdr_len = 0;
3412 unsigned int origlen = 0;
3413
3414 err = -EINVAL;
3415 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
3416 goto out;
3417
3418 #if 0
3419
3420 if (pkt_sk(sk)->ifindex < 0)
3421 return -ENODEV;
3422 #endif
3423
3424 if (flags & MSG_ERRQUEUE) {
3425 err = sock_recv_errqueue(sk, msg, len,
3426 SOL_PACKET, PACKET_TX_TIMESTAMP);
3427 goto out;
3428 }
3429
3430
3431 /*
3432  * Call the generic datagram receiver: it handles blocking, signals
3433  * and the nasty re-entrancy issues so this layer does not have to.
3434  */
3435
3436
3437
3438
3439 skb = skb_recv_datagram(sk, flags, &err);
3440
3441
3442 /* A NULL skb simply means an error which skb_recv_datagram() already
3443  * stored in 'err'; blocking and retries were handled for us.
3444  */
3445
3446
3447 if (skb == NULL)
3448 goto out;
3449
3450 packet_rcv_try_clear_pressure(pkt_sk(sk));
3451
3452 if (pkt_sk(sk)->has_vnet_hdr) {
3453 err = packet_rcv_vnet(msg, skb, &len);
3454 if (err)
3455 goto out_free;
3456 vnet_hdr_len = sizeof(struct virtio_net_hdr);
3457 }
3458
3459
3460 /* Data beyond the caller's buffer is lost; MSG_TRUNC tells the caller
3461  * how big the packet really was.
3462  */
3463 copied = skb->len;
3464 if (copied > len) {
3465 copied = len;
3466 msg->msg_flags |= MSG_TRUNC;
3467 }
3468
3469 err = skb_copy_datagram_msg(skb, 0, msg, copied);
3470 if (err)
3471 goto out_free;
3472
3473 if (sock->type != SOCK_PACKET) {
3474 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3475
3476 /* the original packet length was stashed in the skb control block */
3477 origlen = PACKET_SKB_CB(skb)->sa.origlen;
3478 sll->sll_family = AF_PACKET;
3479 sll->sll_protocol = skb->protocol;
3480 }
3481
3482 sock_recv_cmsgs(msg, sk, skb);
3483
3484 if (msg->msg_name) {
3485 const size_t max_len = min(sizeof(skb->cb),
3486 sizeof(struct sockaddr_storage));
3487 int copy_len;
3488
3489
3490 /* Copy the source address out to the caller, being careful never to
3491  * read past the cb area it was stored in. */
3492 if (sock->type == SOCK_PACKET) {
3493 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
3494 msg->msg_namelen = sizeof(struct sockaddr_pkt);
3495 copy_len = msg->msg_namelen;
3496 } else {
3497 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3498
3499 msg->msg_namelen = sll->sll_halen +
3500 offsetof(struct sockaddr_ll, sll_addr);
3501 copy_len = msg->msg_namelen;
3502 if (msg->msg_namelen < sizeof(struct sockaddr_ll)) {
3503 memset(msg->msg_name +
3504 offsetof(struct sockaddr_ll, sll_addr),
3505 0, sizeof(sll->sll_addr));
3506 msg->msg_namelen = sizeof(struct sockaddr_ll);
3507 }
3508 }
3509 if (WARN_ON_ONCE(copy_len > max_len)) {
3510 copy_len = max_len;
3511 msg->msg_namelen = copy_len;
3512 }
3513 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len);
3514 }
3515
3516 if (pkt_sk(sk)->auxdata) {
3517 struct tpacket_auxdata aux;
3518
3519 aux.tp_status = TP_STATUS_USER;
3520 if (skb->ip_summed == CHECKSUM_PARTIAL)
3521 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
3522 else if (skb->pkt_type != PACKET_OUTGOING &&
3523 (skb->ip_summed == CHECKSUM_COMPLETE ||
3524 skb_csum_unnecessary(skb)))
3525 aux.tp_status |= TP_STATUS_CSUM_VALID;
3526
3527 aux.tp_len = origlen;
3528 aux.tp_snaplen = skb->len;
3529 aux.tp_mac = 0;
3530 aux.tp_net = skb_network_offset(skb);
3531 if (skb_vlan_tag_present(skb)) {
3532 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
3533 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3534 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
3535 } else {
3536 aux.tp_vlan_tci = 0;
3537 aux.tp_vlan_tpid = 0;
3538 }
3539 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
3540 }
3541
3542
3543 /* Return the amount of data that was (or, with MSG_TRUNC, would have
3544  * been) delivered, plus the length of any vnet header.
3545  */
3546 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
3547
3548 out_free:
3549 skb_free_datagram(sk, skb);
3550 out:
3551 return err;
3552 }
3553
3554 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
3555 int peer)
3556 {
3557 struct net_device *dev;
3558 struct sock *sk = sock->sk;
3559
3560 if (peer)
3561 return -EOPNOTSUPP;
3562
3563 uaddr->sa_family = AF_PACKET;
3564 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
3565 rcu_read_lock();
3566 dev = dev_get_by_index_rcu(sock_net(sk), READ_ONCE(pkt_sk(sk)->ifindex));
3567 if (dev)
3568 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
3569 rcu_read_unlock();
3570
3571 return sizeof(*uaddr);
3572 }
3573
3574 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
3575 int peer)
3576 {
3577 struct net_device *dev;
3578 struct sock *sk = sock->sk;
3579 struct packet_sock *po = pkt_sk(sk);
3580 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
3581 int ifindex;
3582
3583 if (peer)
3584 return -EOPNOTSUPP;
3585
3586 ifindex = READ_ONCE(po->ifindex);
3587 sll->sll_family = AF_PACKET;
3588 sll->sll_ifindex = ifindex;
3589 sll->sll_protocol = READ_ONCE(po->num);
3590 sll->sll_pkttype = 0;
3591 rcu_read_lock();
3592 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3593 if (dev) {
3594 sll->sll_hatype = dev->type;
3595 sll->sll_halen = dev->addr_len;
3596 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
3597 } else {
3598 sll->sll_hatype = 0;
3599 sll->sll_halen = 0;
3600 }
3601 rcu_read_unlock();
3602
3603 return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
3604 }
3605
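/*
 * Apply one packet_mclist entry to a device: add or remove a multicast
 * or unicast address, or adjust the promiscuity/allmulti counters.
 * 'what' is +1 to take the reference and -1 to drop it.
 */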
3606 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3607 int what)
3608 {
3609 switch (i->type) {
3610 case PACKET_MR_MULTICAST:
3611 if (i->alen != dev->addr_len)
3612 return -EINVAL;
3613 if (what > 0)
3614 return dev_mc_add(dev, i->addr);
3615 else
3616 return dev_mc_del(dev, i->addr);
3617 break;
3618 case PACKET_MR_PROMISC:
3619 return dev_set_promiscuity(dev, what);
3620 case PACKET_MR_ALLMULTI:
3621 return dev_set_allmulti(dev, what);
3622 case PACKET_MR_UNICAST:
3623 if (i->alen != dev->addr_len)
3624 return -EINVAL;
3625 if (what > 0)
3626 return dev_uc_add(dev, i->addr);
3627 else
3628 return dev_uc_del(dev, i->addr);
3629 break;
3630 default:
3631 break;
3632 }
3633 return 0;
3634 }
3635
3636 static void packet_dev_mclist_delete(struct net_device *dev,
3637 struct packet_mclist **mlp)
3638 {
3639 struct packet_mclist *ml;
3640
3641 while ((ml = *mlp) != NULL) {
3642 if (ml->ifindex == dev->ifindex) {
3643 packet_dev_mc(dev, ml, -1);
3644 *mlp = ml->next;
3645 kfree(ml);
3646 } else
3647 mlp = &ml->next;
3648 }
3649 }
3650
3651 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
3652 {
3653 struct packet_sock *po = pkt_sk(sk);
3654 struct packet_mclist *ml, *i;
3655 struct net_device *dev;
3656 int err;
3657
3658 rtnl_lock();
3659
3660 err = -ENODEV;
3661 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
3662 if (!dev)
3663 goto done;
3664
3665 err = -EINVAL;
3666 if (mreq->mr_alen > dev->addr_len)
3667 goto done;
3668
3669 err = -ENOBUFS;
3670 i = kmalloc(sizeof(*i), GFP_KERNEL);
3671 if (i == NULL)
3672 goto done;
3673
3674 err = 0;
3675 for (ml = po->mclist; ml; ml = ml->next) {
3676 if (ml->ifindex == mreq->mr_ifindex &&
3677 ml->type == mreq->mr_type &&
3678 ml->alen == mreq->mr_alen &&
3679 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3680 ml->count++;
3681 /* already subscribed: just bump the refcount and drop the new entry */
3682 kfree(i);
3683 goto done;
3684 }
3685 }
3686
3687 i->type = mreq->mr_type;
3688 i->ifindex = mreq->mr_ifindex;
3689 i->alen = mreq->mr_alen;
3690 memcpy(i->addr, mreq->mr_address, i->alen);
3691 memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
3692 i->count = 1;
3693 i->next = po->mclist;
3694 po->mclist = i;
3695 err = packet_dev_mc(dev, i, 1);
3696 if (err) {
3697 po->mclist = i->next;
3698 kfree(i);
3699 }
3700
3701 done:
3702 rtnl_unlock();
3703 return err;
3704 }
3705
3706 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
3707 {
3708 struct packet_mclist *ml, **mlp;
3709
3710 rtnl_lock();
3711
3712 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3713 if (ml->ifindex == mreq->mr_ifindex &&
3714 ml->type == mreq->mr_type &&
3715 ml->alen == mreq->mr_alen &&
3716 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3717 if (--ml->count == 0) {
3718 struct net_device *dev;
3719 *mlp = ml->next;
3720 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3721 if (dev)
3722 packet_dev_mc(dev, ml, -1);
3723 kfree(ml);
3724 }
3725 break;
3726 }
3727 }
3728 rtnl_unlock();
3729 return 0;
3730 }
3731
3732 static void packet_flush_mclist(struct sock *sk)
3733 {
3734 struct packet_sock *po = pkt_sk(sk);
3735 struct packet_mclist *ml;
3736
3737 if (!po->mclist)
3738 return;
3739
3740 rtnl_lock();
3741 while ((ml = po->mclist) != NULL) {
3742 struct net_device *dev;
3743
3744 po->mclist = ml->next;
3745 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3746 if (dev != NULL)
3747 packet_dev_mc(dev, ml, -1);
3748 kfree(ml);
3749 }
3750 rtnl_unlock();
3751 }
3752
3753 static int
3754 packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval,
3755 unsigned int optlen)
3756 {
3757 struct sock *sk = sock->sk;
3758 struct packet_sock *po = pkt_sk(sk);
3759 int ret;
3760
3761 if (level != SOL_PACKET)
3762 return -ENOPROTOOPT;
3763
3764 switch (optname) {
3765 case PACKET_ADD_MEMBERSHIP:
3766 case PACKET_DROP_MEMBERSHIP:
3767 {
3768 struct packet_mreq_max mreq;
3769 int len = optlen;
3770 memset(&mreq, 0, sizeof(mreq));
3771 if (len < sizeof(struct packet_mreq))
3772 return -EINVAL;
3773 if (len > sizeof(mreq))
3774 len = sizeof(mreq);
3775 if (copy_from_sockptr(&mreq, optval, len))
3776 return -EFAULT;
3777 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3778 return -EINVAL;
3779 if (optname == PACKET_ADD_MEMBERSHIP)
3780 ret = packet_mc_add(sk, &mreq);
3781 else
3782 ret = packet_mc_drop(sk, &mreq);
3783 return ret;
3784 }
3785
3786 case PACKET_RX_RING:
3787 case PACKET_TX_RING:
3788 {
3789 union tpacket_req_u req_u;
3790 int len;
3791
3792 lock_sock(sk);
3793 switch (po->tp_version) {
3794 case TPACKET_V1:
3795 case TPACKET_V2:
3796 len = sizeof(req_u.req);
3797 break;
3798 case TPACKET_V3:
3799 default:
3800 len = sizeof(req_u.req3);
3801 break;
3802 }
3803 if (optlen < len) {
3804 ret = -EINVAL;
3805 } else {
3806 if (copy_from_sockptr(&req_u.req, optval, len))
3807 ret = -EFAULT;
3808 else
3809 ret = packet_set_ring(sk, &req_u, 0,
3810 optname == PACKET_TX_RING);
3811 }
3812 release_sock(sk);
3813 return ret;
3814 }
3815 case PACKET_COPY_THRESH:
3816 {
3817 int val;
3818
3819 if (optlen != sizeof(val))
3820 return -EINVAL;
3821 if (copy_from_sockptr(&val, optval, sizeof(val)))
3822 return -EFAULT;
3823
3824 pkt_sk(sk)->copy_thresh = val;
3825 return 0;
3826 }
3827 case PACKET_VERSION:
3828 {
3829 int val;
3830
3831 if (optlen != sizeof(val))
3832 return -EINVAL;
3833 if (copy_from_sockptr(&val, optval, sizeof(val)))
3834 return -EFAULT;
3835 switch (val) {
3836 case TPACKET_V1:
3837 case TPACKET_V2:
3838 case TPACKET_V3:
3839 break;
3840 default:
3841 return -EINVAL;
3842 }
3843 lock_sock(sk);
3844 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3845 ret = -EBUSY;
3846 } else {
3847 po->tp_version = val;
3848 ret = 0;
3849 }
3850 release_sock(sk);
3851 return ret;
3852 }
3853 case PACKET_RESERVE:
3854 {
3855 unsigned int val;
3856
3857 if (optlen != sizeof(val))
3858 return -EINVAL;
3859 if (copy_from_sockptr(&val, optval, sizeof(val)))
3860 return -EFAULT;
3861 if (val > INT_MAX)
3862 return -EINVAL;
3863 lock_sock(sk);
3864 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3865 ret = -EBUSY;
3866 } else {
3867 po->tp_reserve = val;
3868 ret = 0;
3869 }
3870 release_sock(sk);
3871 return ret;
3872 }
3873 case PACKET_LOSS:
3874 {
3875 unsigned int val;
3876
3877 if (optlen != sizeof(val))
3878 return -EINVAL;
3879 if (copy_from_sockptr(&val, optval, sizeof(val)))
3880 return -EFAULT;
3881
3882 lock_sock(sk);
3883 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3884 ret = -EBUSY;
3885 } else {
3886 po->tp_loss = !!val;
3887 ret = 0;
3888 }
3889 release_sock(sk);
3890 return ret;
3891 }
3892 case PACKET_AUXDATA:
3893 {
3894 int val;
3895
3896 if (optlen < sizeof(val))
3897 return -EINVAL;
3898 if (copy_from_sockptr(&val, optval, sizeof(val)))
3899 return -EFAULT;
3900
3901 lock_sock(sk);
3902 po->auxdata = !!val;
3903 release_sock(sk);
3904 return 0;
3905 }
3906 case PACKET_ORIGDEV:
3907 {
3908 int val;
3909
3910 if (optlen < sizeof(val))
3911 return -EINVAL;
3912 if (copy_from_sockptr(&val, optval, sizeof(val)))
3913 return -EFAULT;
3914
3915 lock_sock(sk);
3916 po->origdev = !!val;
3917 release_sock(sk);
3918 return 0;
3919 }
3920 case PACKET_VNET_HDR:
3921 {
3922 int val;
3923
3924 if (sock->type != SOCK_RAW)
3925 return -EINVAL;
3926 if (optlen < sizeof(val))
3927 return -EINVAL;
3928 if (copy_from_sockptr(&val, optval, sizeof(val)))
3929 return -EFAULT;
3930
3931 lock_sock(sk);
3932 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3933 ret = -EBUSY;
3934 } else {
3935 po->has_vnet_hdr = !!val;
3936 ret = 0;
3937 }
3938 release_sock(sk);
3939 return ret;
3940 }
3941 case PACKET_TIMESTAMP:
3942 {
3943 int val;
3944
3945 if (optlen != sizeof(val))
3946 return -EINVAL;
3947 if (copy_from_sockptr(&val, optval, sizeof(val)))
3948 return -EFAULT;
3949
3950 po->tp_tstamp = val;
3951 return 0;
3952 }
3953 case PACKET_FANOUT:
3954 {
3955 struct fanout_args args = { 0 };
3956
3957 if (optlen != sizeof(int) && optlen != sizeof(args))
3958 return -EINVAL;
3959 if (copy_from_sockptr(&args, optval, optlen))
3960 return -EFAULT;
3961
3962 return fanout_add(sk, &args);
3963 }
3964 case PACKET_FANOUT_DATA:
3965 {
3966 /* paired with the store of po->fanout in fanout_add() */
3967 if (!READ_ONCE(po->fanout))
3968 return -EINVAL;
3969
3970 return fanout_set_data(po, optval, optlen);
3971 }
3972 case PACKET_IGNORE_OUTGOING:
3973 {
3974 int val;
3975
3976 if (optlen != sizeof(val))
3977 return -EINVAL;
3978 if (copy_from_sockptr(&val, optval, sizeof(val)))
3979 return -EFAULT;
3980 if (val < 0 || val > 1)
3981 return -EINVAL;
3982
3983 po->prot_hook.ignore_outgoing = !!val;
3984 return 0;
3985 }
3986 case PACKET_TX_HAS_OFF:
3987 {
3988 unsigned int val;
3989
3990 if (optlen != sizeof(val))
3991 return -EINVAL;
3992 if (copy_from_sockptr(&val, optval, sizeof(val)))
3993 return -EFAULT;
3994
3995 lock_sock(sk);
3996 if (!po->rx_ring.pg_vec && !po->tx_ring.pg_vec)
3997 po->tp_tx_has_off = !!val;
3998
3999 release_sock(sk);
4000 return 0;
4001 }
4002 case PACKET_QDISC_BYPASS:
4003 {
4004 int val;
4005
4006 if (optlen != sizeof(val))
4007 return -EINVAL;
4008 if (copy_from_sockptr(&val, optval, sizeof(val)))
4009 return -EFAULT;
4010
4011 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
4012 return 0;
4013 }
4014 default:
4015 return -ENOPROTOOPT;
4016 }
4017 }
4018
4019 static int packet_getsockopt(struct socket *sock, int level, int optname,
4020 char __user *optval, int __user *optlen)
4021 {
4022 int len;
4023 int val, lv = sizeof(val);
4024 struct sock *sk = sock->sk;
4025 struct packet_sock *po = pkt_sk(sk);
4026 void *data = &val;
4027 union tpacket_stats_u st;
4028 struct tpacket_rollover_stats rstats;
4029 int drops;
4030
4031 if (level != SOL_PACKET)
4032 return -ENOPROTOOPT;
4033
4034 if (get_user(len, optlen))
4035 return -EFAULT;
4036
4037 if (len < 0)
4038 return -EINVAL;
4039
4040 switch (optname) {
4041 case PACKET_STATISTICS:
4042 spin_lock_bh(&sk->sk_receive_queue.lock);
4043 memcpy(&st, &po->stats, sizeof(st));
4044 memset(&po->stats, 0, sizeof(po->stats));
4045 spin_unlock_bh(&sk->sk_receive_queue.lock);
4046 drops = atomic_xchg(&po->tp_drops, 0);
4047
4048 if (po->tp_version == TPACKET_V3) {
4049 lv = sizeof(struct tpacket_stats_v3);
4050 st.stats3.tp_drops = drops;
4051 st.stats3.tp_packets += drops;
4052 data = &st.stats3;
4053 } else {
4054 lv = sizeof(struct tpacket_stats);
4055 st.stats1.tp_drops = drops;
4056 st.stats1.tp_packets += drops;
4057 data = &st.stats1;
4058 }
4059
4060 break;
4061 case PACKET_AUXDATA:
4062 val = po->auxdata;
4063 break;
4064 case PACKET_ORIGDEV:
4065 val = po->origdev;
4066 break;
4067 case PACKET_VNET_HDR:
4068 val = po->has_vnet_hdr;
4069 break;
4070 case PACKET_VERSION:
4071 val = po->tp_version;
4072 break;
4073 case PACKET_HDRLEN:
4074 if (len > sizeof(int))
4075 len = sizeof(int);
4076 if (len < sizeof(int))
4077 return -EINVAL;
4078 if (copy_from_user(&val, optval, len))
4079 return -EFAULT;
4080 switch (val) {
4081 case TPACKET_V1:
4082 val = sizeof(struct tpacket_hdr);
4083 break;
4084 case TPACKET_V2:
4085 val = sizeof(struct tpacket2_hdr);
4086 break;
4087 case TPACKET_V3:
4088 val = sizeof(struct tpacket3_hdr);
4089 break;
4090 default:
4091 return -EINVAL;
4092 }
4093 break;
4094 case PACKET_RESERVE:
4095 val = po->tp_reserve;
4096 break;
4097 case PACKET_LOSS:
4098 val = po->tp_loss;
4099 break;
4100 case PACKET_TIMESTAMP:
4101 val = po->tp_tstamp;
4102 break;
4103 case PACKET_FANOUT:
4104 val = (po->fanout ?
4105 ((u32)po->fanout->id |
4106 ((u32)po->fanout->type << 16) |
4107 ((u32)po->fanout->flags << 24)) :
4108 0);
4109 break;
4110 case PACKET_IGNORE_OUTGOING:
4111 val = po->prot_hook.ignore_outgoing;
4112 break;
4113 case PACKET_ROLLOVER_STATS:
4114 if (!po->rollover)
4115 return -EINVAL;
4116 rstats.tp_all = atomic_long_read(&po->rollover->num);
4117 rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
4118 rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
4119 data = &rstats;
4120 lv = sizeof(rstats);
4121 break;
4122 case PACKET_TX_HAS_OFF:
4123 val = po->tp_tx_has_off;
4124 break;
4125 case PACKET_QDISC_BYPASS:
4126 val = packet_use_direct_xmit(po);
4127 break;
4128 default:
4129 return -ENOPROTOOPT;
4130 }
4131
4132 if (len > lv)
4133 len = lv;
4134 if (put_user(len, optlen))
4135 return -EFAULT;
4136 if (copy_to_user(optval, data, len))
4137 return -EFAULT;
4138 return 0;
4139 }
4140
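/*
 * netdevice notifier: drop multicast references and unhook sockets
 * bound to a device that goes down or unregisters, and re-register the
 * protocol hook when the device comes back up.
 */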
4141 static int packet_notifier(struct notifier_block *this,
4142 unsigned long msg, void *ptr)
4143 {
4144 struct sock *sk;
4145 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4146 struct net *net = dev_net(dev);
4147
4148 rcu_read_lock();
4149 sk_for_each_rcu(sk, &net->packet.sklist) {
4150 struct packet_sock *po = pkt_sk(sk);
4151
4152 switch (msg) {
4153 case NETDEV_UNREGISTER:
4154 if (po->mclist)
4155 packet_dev_mclist_delete(dev, &po->mclist);
4156 fallthrough;
4157
4158 case NETDEV_DOWN:
4159 if (dev->ifindex == po->ifindex) {
4160 spin_lock(&po->bind_lock);
4161 if (po->running) {
4162 __unregister_prot_hook(sk, false);
4163 sk->sk_err = ENETDOWN;
4164 if (!sock_flag(sk, SOCK_DEAD))
4165 sk_error_report(sk);
4166 }
4167 if (msg == NETDEV_UNREGISTER) {
4168 packet_cached_dev_reset(po);
4169 WRITE_ONCE(po->ifindex, -1);
4170 netdev_put(po->prot_hook.dev,
4171 &po->prot_hook.dev_tracker);
4172 po->prot_hook.dev = NULL;
4173 }
4174 spin_unlock(&po->bind_lock);
4175 }
4176 break;
4177 case NETDEV_UP:
4178 if (dev->ifindex == po->ifindex) {
4179 spin_lock(&po->bind_lock);
4180 if (po->num)
4181 register_prot_hook(sk);
4182 spin_unlock(&po->bind_lock);
4183 }
4184 break;
4185 }
4186 }
4187 rcu_read_unlock();
4188 return NOTIFY_DONE;
4189 }
4190
4191
4192 static int packet_ioctl(struct socket *sock, unsigned int cmd,
4193 unsigned long arg)
4194 {
4195 struct sock *sk = sock->sk;
4196
4197 switch (cmd) {
4198 case SIOCOUTQ:
4199 {
4200 int amount = sk_wmem_alloc_get(sk);
4201
4202 return put_user(amount, (int __user *)arg);
4203 }
4204 case SIOCINQ:
4205 {
4206 struct sk_buff *skb;
4207 int amount = 0;
4208
4209 spin_lock_bh(&sk->sk_receive_queue.lock);
4210 skb = skb_peek(&sk->sk_receive_queue);
4211 if (skb)
4212 amount = skb->len;
4213 spin_unlock_bh(&sk->sk_receive_queue.lock);
4214 return put_user(amount, (int __user *)arg);
4215 }
4216 #ifdef CONFIG_INET
4217 case SIOCADDRT:
4218 case SIOCDELRT:
4219 case SIOCDARP:
4220 case SIOCGARP:
4221 case SIOCSARP:
4222 case SIOCGIFADDR:
4223 case SIOCSIFADDR:
4224 case SIOCGIFBRDADDR:
4225 case SIOCSIFBRDADDR:
4226 case SIOCGIFNETMASK:
4227 case SIOCSIFNETMASK:
4228 case SIOCGIFDSTADDR:
4229 case SIOCSIFDSTADDR:
4230 case SIOCSIFFLAGS:
4231 return inet_dgram_ops.ioctl(sock, cmd, arg);
4232 #endif
4233
4234 default:
4235 return -ENOIOCTLCMD;
4236 }
4237 return 0;
4238 }
4239
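/*
 * poll() for mmap'ed sockets: on top of the usual datagram_poll() bits,
 * report EPOLLIN once the previous RX ring frame is no longer owned by
 * the kernel and EPOLLOUT while a TX ring frame is still available for
 * a new send request.
 */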
4240 static __poll_t packet_poll(struct file *file, struct socket *sock,
4241 poll_table *wait)
4242 {
4243 struct sock *sk = sock->sk;
4244 struct packet_sock *po = pkt_sk(sk);
4245 __poll_t mask = datagram_poll(file, sock, wait);
4246
4247 spin_lock_bh(&sk->sk_receive_queue.lock);
4248 if (po->rx_ring.pg_vec) {
4249 if (!packet_previous_rx_frame(po, &po->rx_ring,
4250 TP_STATUS_KERNEL))
4251 mask |= EPOLLIN | EPOLLRDNORM;
4252 }
4253 packet_rcv_try_clear_pressure(po);
4254 spin_unlock_bh(&sk->sk_receive_queue.lock);
4255 spin_lock_bh(&sk->sk_write_queue.lock);
4256 if (po->tx_ring.pg_vec) {
4257 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
4258 mask |= EPOLLOUT | EPOLLWRNORM;
4259 }
4260 spin_unlock_bh(&sk->sk_write_queue.lock);
4261 return mask;
4262 }
4263
4264
4265
4266
4267
4268
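/*
 * Keep po->mapped in sync with the number of live user mappings so the
 * rings cannot be resized or freed while they are still mapped.
 */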
4269 static void packet_mm_open(struct vm_area_struct *vma)
4270 {
4271 struct file *file = vma->vm_file;
4272 struct socket *sock = file->private_data;
4273 struct sock *sk = sock->sk;
4274
4275 if (sk)
4276 atomic_inc(&pkt_sk(sk)->mapped);
4277 }
4278
4279 static void packet_mm_close(struct vm_area_struct *vma)
4280 {
4281 struct file *file = vma->vm_file;
4282 struct socket *sock = file->private_data;
4283 struct sock *sk = sock->sk;
4284
4285 if (sk)
4286 atomic_dec(&pkt_sk(sk)->mapped);
4287 }
4288
4289 static const struct vm_operations_struct packet_mmap_ops = {
4290 .open = packet_mm_open,
4291 .close = packet_mm_close,
4292 };
4293
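/*
 * Free a pg_vec: each block was either page-allocated or vmalloc'ed,
 * so pick the matching free routine per entry.
 */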
4294 static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
4295 unsigned int len)
4296 {
4297 int i;
4298
4299 for (i = 0; i < len; i++) {
4300 if (likely(pg_vec[i].buffer)) {
4301 if (is_vmalloc_addr(pg_vec[i].buffer))
4302 vfree(pg_vec[i].buffer);
4303 else
4304 free_pages((unsigned long)pg_vec[i].buffer,
4305 order);
4306 pg_vec[i].buffer = NULL;
4307 }
4308 }
4309 kfree(pg_vec);
4310 }
4311
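/*
 * Allocate one pg_vec block: try a cheap high-order page allocation
 * first, fall back to vzalloc(), and finally retry the page allocator
 * without __GFP_NORETRY before giving up.
 */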
4312 static char *alloc_one_pg_vec_page(unsigned long order)
4313 {
4314 char *buffer;
4315 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
4316 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
4317
4318 buffer = (char *) __get_free_pages(gfp_flags, order);
4319 if (buffer)
4320 return buffer;
4321
4322
4323 buffer = vzalloc(array_size((1 << order), PAGE_SIZE));
4324 if (buffer)
4325 return buffer;
4326
4327
4328 gfp_flags &= ~__GFP_NORETRY;
4329 buffer = (char *) __get_free_pages(gfp_flags, order);
4330 if (buffer)
4331 return buffer;
4332
4333
4334 return NULL;
4335 }
4336
4337 static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4338 {
4339 unsigned int block_nr = req->tp_block_nr;
4340 struct pgv *pg_vec;
4341 int i;
4342
4343 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN);
4344 if (unlikely(!pg_vec))
4345 goto out;
4346
4347 for (i = 0; i < block_nr; i++) {
4348 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
4349 if (unlikely(!pg_vec[i].buffer))
4350 goto out_free_pgvec;
4351 }
4352
4353 out:
4354 return pg_vec;
4355
4356 out_free_pgvec:
4357 free_pg_vec(pg_vec, order, block_nr);
4358 pg_vec = NULL;
4359 goto out;
4360 }
4361
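/*
 * Set up (or, when closing, tear down) an RX or TX ring described by a
 * tpacket_req/tpacket_req3.  The buffer is allocated as an array of
 * page-order blocks (pg_vec) that user space later maps with mmap().
 *
 * Illustrative user-space sequence (a sketch only, not taken from this
 * file; see Documentation/networking/packet_mmap.rst for the
 * authoritative description).  tp_retire_blk_tov is in milliseconds:
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	int ver = TPACKET_V3;
 *	struct tpacket_req3 req = {
 *		.tp_block_size     = 1 << 22,
 *		.tp_frame_size     = 1 << 11,
 *		.tp_block_nr       = 64,
 *		.tp_frame_nr       = ((1 << 22) / (1 << 11)) * 64,
 *		.tp_retire_blk_tov = 60,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	void *ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */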
4362 static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
4363 int closing, int tx_ring)
4364 {
4365 struct pgv *pg_vec = NULL;
4366 struct packet_sock *po = pkt_sk(sk);
4367 unsigned long *rx_owner_map = NULL;
4368 int was_running, order = 0;
4369 struct packet_ring_buffer *rb;
4370 struct sk_buff_head *rb_queue;
4371 __be16 num;
4372 int err;
4373
4374 struct tpacket_req *req = &req_u->req;
4375
4376 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
4377 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
4378
4379 err = -EBUSY;
4380 if (!closing) {
4381 if (atomic_read(&po->mapped))
4382 goto out;
4383 if (packet_read_pending(rb))
4384 goto out;
4385 }
4386
4387 if (req->tp_block_nr) {
4388 unsigned int min_frame_size;
4389
4390 /* Sanity-check the request and work out the ring geometry. */
4391 err = -EBUSY;
4392 if (unlikely(rb->pg_vec))
4393 goto out;
4394
4395 switch (po->tp_version) {
4396 case TPACKET_V1:
4397 po->tp_hdrlen = TPACKET_HDRLEN;
4398 break;
4399 case TPACKET_V2:
4400 po->tp_hdrlen = TPACKET2_HDRLEN;
4401 break;
4402 case TPACKET_V3:
4403 po->tp_hdrlen = TPACKET3_HDRLEN;
4404 break;
4405 }
4406
4407 err = -EINVAL;
4408 if (unlikely((int)req->tp_block_size <= 0))
4409 goto out;
4410 if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
4411 goto out;
4412 min_frame_size = po->tp_hdrlen + po->tp_reserve;
4413 if (po->tp_version >= TPACKET_V3 &&
4414 req->tp_block_size <
4415 BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size)
4416 goto out;
4417 if (unlikely(req->tp_frame_size < min_frame_size))
4418 goto out;
4419 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
4420 goto out;
4421
4422 rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
4423 if (unlikely(rb->frames_per_block == 0))
4424 goto out;
4425 if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr))
4426 goto out;
4427 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
4428 req->tp_frame_nr))
4429 goto out;
4430
4431 err = -ENOMEM;
4432 order = get_order(req->tp_block_size);
4433 pg_vec = alloc_pg_vec(req, order);
4434 if (unlikely(!pg_vec))
4435 goto out;
4436 switch (po->tp_version) {
4437 case TPACKET_V3:
4438 /* Block-based transmit is not supported yet. */
4439 if (!tx_ring) {
4440 init_prb_bdqc(po, rb, pg_vec, req_u);
4441 } else {
4442 struct tpacket_req3 *req3 = &req_u->req3;
4443
4444 if (req3->tp_retire_blk_tov ||
4445 req3->tp_sizeof_priv ||
4446 req3->tp_feature_req_word) {
4447 err = -EINVAL;
4448 goto out_free_pg_vec;
4449 }
4450 }
4451 break;
4452 default:
4453 if (!tx_ring) {
4454 rx_owner_map = bitmap_alloc(req->tp_frame_nr,
4455 GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO);
4456 if (!rx_owner_map)
4457 goto out_free_pg_vec;
4458 }
4459 break;
4460 }
4461 }
4462
4463 else {
4464 err = -EINVAL;
4465 if (unlikely(req->tp_frame_nr))
4466 goto out;
4467 }
4468
4469
4470 /* Detach the socket from the network while the rings are swapped. */
4471 spin_lock(&po->bind_lock);
4472 was_running = po->running;
4473 num = po->num;
4474 if (was_running) {
4475 WRITE_ONCE(po->num, 0);
4476 __unregister_prot_hook(sk, false);
4477 }
4478 spin_unlock(&po->bind_lock);
4479
4480 synchronize_net();
4481
4482 err = -EBUSY;
4483 mutex_lock(&po->pg_vec_lock);
4484 if (closing || atomic_read(&po->mapped) == 0) {
4485 err = 0;
4486 spin_lock_bh(&rb_queue->lock);
4487 swap(rb->pg_vec, pg_vec);
4488 if (po->tp_version <= TPACKET_V2)
4489 swap(rb->rx_owner_map, rx_owner_map);
4490 rb->frame_max = (req->tp_frame_nr - 1);
4491 rb->head = 0;
4492 rb->frame_size = req->tp_frame_size;
4493 spin_unlock_bh(&rb_queue->lock);
4494
4495 swap(rb->pg_vec_order, order);
4496 swap(rb->pg_vec_len, req->tp_block_nr);
4497
4498 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
4499 po->prot_hook.func = (po->rx_ring.pg_vec) ?
4500 tpacket_rcv : packet_rcv;
4501 skb_queue_purge(rb_queue);
4502 if (atomic_read(&po->mapped))
4503 pr_err("packet_mmap: vma is busy: %d\n",
4504 atomic_read(&po->mapped));
4505 }
4506 mutex_unlock(&po->pg_vec_lock);
4507
4508 spin_lock(&po->bind_lock);
4509 if (was_running) {
4510 WRITE_ONCE(po->num, num);
4511 register_prot_hook(sk);
4512 }
4513 spin_unlock(&po->bind_lock);
4514 if (pg_vec && (po->tp_version > TPACKET_V2)) {
4515 /* only the RX ring runs a retire timer; block-based TX is unsupported */
4516 if (!tx_ring)
4517 prb_shutdown_retire_blk_timer(po, rb_queue);
4518 }
4519
4520 out_free_pg_vec:
4521 if (pg_vec) {
4522 bitmap_free(rx_owner_map);
4523 free_pg_vec(pg_vec, order, req->tp_block_nr);
4524 }
4525 out:
4526 return err;
4527 }
4528
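/*
 * mmap() handler: map the RX ring followed by the TX ring (when both
 * exist) into one contiguous user mapping, page by page.  The requested
 * size must match the combined ring size exactly.
 */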
4529 static int packet_mmap(struct file *file, struct socket *sock,
4530 struct vm_area_struct *vma)
4531 {
4532 struct sock *sk = sock->sk;
4533 struct packet_sock *po = pkt_sk(sk);
4534 unsigned long size, expected_size;
4535 struct packet_ring_buffer *rb;
4536 unsigned long start;
4537 int err = -EINVAL;
4538 int i;
4539
4540 if (vma->vm_pgoff)
4541 return -EINVAL;
4542
4543 mutex_lock(&po->pg_vec_lock);
4544
4545 expected_size = 0;
4546 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4547 if (rb->pg_vec) {
4548 expected_size += rb->pg_vec_len
4549 * rb->pg_vec_pages
4550 * PAGE_SIZE;
4551 }
4552 }
4553
4554 if (expected_size == 0)
4555 goto out;
4556
4557 size = vma->vm_end - vma->vm_start;
4558 if (size != expected_size)
4559 goto out;
4560
4561 start = vma->vm_start;
4562 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4563 if (rb->pg_vec == NULL)
4564 continue;
4565
4566 for (i = 0; i < rb->pg_vec_len; i++) {
4567 struct page *page;
4568 void *kaddr = rb->pg_vec[i].buffer;
4569 int pg_num;
4570
4571 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
4572 page = pgv_to_page(kaddr);
4573 err = vm_insert_page(vma, start, page);
4574 if (unlikely(err))
4575 goto out;
4576 start += PAGE_SIZE;
4577 kaddr += PAGE_SIZE;
4578 }
4579 }
4580 }
4581
4582 atomic_inc(&po->mapped);
4583 vma->vm_ops = &packet_mmap_ops;
4584 err = 0;
4585
4586 out:
4587 mutex_unlock(&po->pg_vec_lock);
4588 return err;
4589 }
4590
4591 static const struct proto_ops packet_ops_spkt = {
4592 .family = PF_PACKET,
4593 .owner = THIS_MODULE,
4594 .release = packet_release,
4595 .bind = packet_bind_spkt,
4596 .connect = sock_no_connect,
4597 .socketpair = sock_no_socketpair,
4598 .accept = sock_no_accept,
4599 .getname = packet_getname_spkt,
4600 .poll = datagram_poll,
4601 .ioctl = packet_ioctl,
4602 .gettstamp = sock_gettstamp,
4603 .listen = sock_no_listen,
4604 .shutdown = sock_no_shutdown,
4605 .sendmsg = packet_sendmsg_spkt,
4606 .recvmsg = packet_recvmsg,
4607 .mmap = sock_no_mmap,
4608 .sendpage = sock_no_sendpage,
4609 };
4610
4611 static const struct proto_ops packet_ops = {
4612 .family = PF_PACKET,
4613 .owner = THIS_MODULE,
4614 .release = packet_release,
4615 .bind = packet_bind,
4616 .connect = sock_no_connect,
4617 .socketpair = sock_no_socketpair,
4618 .accept = sock_no_accept,
4619 .getname = packet_getname,
4620 .poll = packet_poll,
4621 .ioctl = packet_ioctl,
4622 .gettstamp = sock_gettstamp,
4623 .listen = sock_no_listen,
4624 .shutdown = sock_no_shutdown,
4625 .setsockopt = packet_setsockopt,
4626 .getsockopt = packet_getsockopt,
4627 .sendmsg = packet_sendmsg,
4628 .recvmsg = packet_recvmsg,
4629 .mmap = packet_mmap,
4630 .sendpage = sock_no_sendpage,
4631 };
4632
4633 static const struct net_proto_family packet_family_ops = {
4634 .family = PF_PACKET,
4635 .create = packet_create,
4636 .owner = THIS_MODULE,
4637 };
4638
4639 static struct notifier_block packet_netdev_notifier = {
4640 .notifier_call = packet_notifier,
4641 };
4642
4643 #ifdef CONFIG_PROC_FS
4644
4645 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
4646 __acquires(RCU)
4647 {
4648 struct net *net = seq_file_net(seq);
4649
4650 rcu_read_lock();
4651 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
4652 }
4653
4654 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4655 {
4656 struct net *net = seq_file_net(seq);
4657 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
4658 }
4659
4660 static void packet_seq_stop(struct seq_file *seq, void *v)
4661 __releases(RCU)
4662 {
4663 rcu_read_unlock();
4664 }
4665
4666 static int packet_seq_show(struct seq_file *seq, void *v)
4667 {
4668 if (v == SEQ_START_TOKEN)
4669 seq_printf(seq,
4670 "%*sRefCnt Type Proto Iface R Rmem User Inode\n",
4671 IS_ENABLED(CONFIG_64BIT) ? -17 : -9, "sk");
4672 else {
4673 struct sock *s = sk_entry(v);
4674 const struct packet_sock *po = pkt_sk(s);
4675
4676 seq_printf(seq,
4677 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
4678 s,
4679 refcount_read(&s->sk_refcnt),
4680 s->sk_type,
4681 ntohs(READ_ONCE(po->num)),
4682 READ_ONCE(po->ifindex),
4683 po->running,
4684 atomic_read(&s->sk_rmem_alloc),
4685 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
4686 sock_i_ino(s));
4687 }
4688
4689 return 0;
4690 }
4691
4692 static const struct seq_operations packet_seq_ops = {
4693 .start = packet_seq_start,
4694 .next = packet_seq_next,
4695 .stop = packet_seq_stop,
4696 .show = packet_seq_show,
4697 };
4698 #endif
4699
4700 static int __net_init packet_net_init(struct net *net)
4701 {
4702 mutex_init(&net->packet.sklist_lock);
4703 INIT_HLIST_HEAD(&net->packet.sklist);
4704
4705 #ifdef CONFIG_PROC_FS
4706 if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops,
4707 sizeof(struct seq_net_private)))
4708 return -ENOMEM;
4709 #endif
4710
4711 return 0;
4712 }
4713
4714 static void __net_exit packet_net_exit(struct net *net)
4715 {
4716 remove_proc_entry("packet", net->proc_net);
4717 WARN_ON_ONCE(!hlist_empty(&net->packet.sklist));
4718 }
4719
4720 static struct pernet_operations packet_net_ops = {
4721 .init = packet_net_init,
4722 .exit = packet_net_exit,
4723 };
4724
4725
4726 static void __exit packet_exit(void)
4727 {
4728 unregister_netdevice_notifier(&packet_netdev_notifier);
4729 unregister_pernet_subsys(&packet_net_ops);
4730 sock_unregister(PF_PACKET);
4731 proto_unregister(&packet_proto);
4732 }
4733
4734 static int __init packet_init(void)
4735 {
4736 int rc;
4737
4738 rc = proto_register(&packet_proto, 0);
4739 if (rc)
4740 goto out;
4741 rc = sock_register(&packet_family_ops);
4742 if (rc)
4743 goto out_proto;
4744 rc = register_pernet_subsys(&packet_net_ops);
4745 if (rc)
4746 goto out_sock;
4747 rc = register_netdevice_notifier(&packet_netdev_notifier);
4748 if (rc)
4749 goto out_pernet;
4750
4751 return 0;
4752
4753 out_pernet:
4754 unregister_pernet_subsys(&packet_net_ops);
4755 out_sock:
4756 sock_unregister(PF_PACKET);
4757 out_proto:
4758 proto_unregister(&packet_proto);
4759 out:
4760 return rc;
4761 }
4762
4763 module_init(packet_init);
4764 module_exit(packet_exit);
4765 MODULE_LICENSE("GPL");
4766 MODULE_ALIAS_NETPROTO(PF_PACKET);