0001 // SPDX-License-Identifier: GPL-2.0
0002 /* Copyright(c) 2013 - 2018 Intel Corporation. */
0003 
0004 #include <linux/prefetch.h>
0005 
0006 #include "iavf.h"
0007 #include "iavf_trace.h"
0008 #include "iavf_prototype.h"
0009 
0010 static inline __le64 build_ctob(u32 td_cmd, u32 td_offset, unsigned int size,
0011                 u32 td_tag)
0012 {
0013     return cpu_to_le64(IAVF_TX_DESC_DTYPE_DATA |
0014                ((u64)td_cmd  << IAVF_TXD_QW1_CMD_SHIFT) |
0015                ((u64)td_offset << IAVF_TXD_QW1_OFFSET_SHIFT) |
0016                ((u64)size  << IAVF_TXD_QW1_TX_BUF_SZ_SHIFT) |
0017                ((u64)td_tag  << IAVF_TXD_QW1_L2TAG1_SHIFT));
0018 }
0019 
0020 #define IAVF_TXD_CMD (IAVF_TX_DESC_CMD_EOP | IAVF_TX_DESC_CMD_RS)
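/* For illustration (hypothetical values): the last data descriptor of a plain
 * 1500 byte frame with no offloads or VLAN tag would be built roughly as
 *
 *	build_ctob(IAVF_TXD_CMD, 0, 1500, 0)
 *
 * i.e. DTYPE_DATA in the low bits, EOP|RS in the command field, and the
 * buffer length in the buffer-size field of descriptor quadword 1.
 */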
0021 
0022 /**
0023  * iavf_unmap_and_free_tx_resource - Release a Tx buffer
0024  * @ring:      the ring that owns the buffer
0025  * @tx_buffer: the buffer to free
0026  **/
0027 static void iavf_unmap_and_free_tx_resource(struct iavf_ring *ring,
0028                         struct iavf_tx_buffer *tx_buffer)
0029 {
0030     if (tx_buffer->skb) {
0031         if (tx_buffer->tx_flags & IAVF_TX_FLAGS_FD_SB)
0032             kfree(tx_buffer->raw_buf);
0033         else
0034             dev_kfree_skb_any(tx_buffer->skb);
0035         if (dma_unmap_len(tx_buffer, len))
0036             dma_unmap_single(ring->dev,
0037                      dma_unmap_addr(tx_buffer, dma),
0038                      dma_unmap_len(tx_buffer, len),
0039                      DMA_TO_DEVICE);
0040     } else if (dma_unmap_len(tx_buffer, len)) {
0041         dma_unmap_page(ring->dev,
0042                    dma_unmap_addr(tx_buffer, dma),
0043                    dma_unmap_len(tx_buffer, len),
0044                    DMA_TO_DEVICE);
0045     }
0046 
0047     tx_buffer->next_to_watch = NULL;
0048     tx_buffer->skb = NULL;
0049     dma_unmap_len_set(tx_buffer, len, 0);
0050     /* tx_buffer must be completely set up in the transmit path */
0051 }
0052 
0053 /**
0054  * iavf_clean_tx_ring - Free all Tx buffers in a ring
0055  * @tx_ring: ring to be cleaned
0056  **/
0057 void iavf_clean_tx_ring(struct iavf_ring *tx_ring)
0058 {
0059     unsigned long bi_size;
0060     u16 i;
0061 
0062     /* ring already cleared, nothing to do */
0063     if (!tx_ring->tx_bi)
0064         return;
0065 
0066     /* Free all the Tx ring sk_buffs */
0067     for (i = 0; i < tx_ring->count; i++)
0068         iavf_unmap_and_free_tx_resource(tx_ring, &tx_ring->tx_bi[i]);
0069 
0070     bi_size = sizeof(struct iavf_tx_buffer) * tx_ring->count;
0071     memset(tx_ring->tx_bi, 0, bi_size);
0072 
0073     /* Zero out the descriptor ring */
0074     memset(tx_ring->desc, 0, tx_ring->size);
0075 
0076     tx_ring->next_to_use = 0;
0077     tx_ring->next_to_clean = 0;
0078 
0079     if (!tx_ring->netdev)
0080         return;
0081 
0082     /* cleanup Tx queue statistics */
0083     netdev_tx_reset_queue(txring_txq(tx_ring));
0084 }
0085 
0086 /**
0087  * iavf_free_tx_resources - Free Tx resources per queue
0088  * @tx_ring: Tx descriptor ring for a specific queue
0089  *
0090  * Free all transmit software resources
0091  **/
0092 void iavf_free_tx_resources(struct iavf_ring *tx_ring)
0093 {
0094     iavf_clean_tx_ring(tx_ring);
0095     kfree(tx_ring->tx_bi);
0096     tx_ring->tx_bi = NULL;
0097 
0098     if (tx_ring->desc) {
0099         dma_free_coherent(tx_ring->dev, tx_ring->size,
0100                   tx_ring->desc, tx_ring->dma);
0101         tx_ring->desc = NULL;
0102     }
0103 }
0104 
0105 /**
0106  * iavf_get_tx_pending - how many Tx descriptors not processed
0107  * @ring: the ring of descriptors
0108  * @in_sw: is tx_pending being checked in SW or HW
0109  *
0110  * Since there is no access to the ring head register
0111  * in XL710, we need to use our local copies
0112  **/
0113 u32 iavf_get_tx_pending(struct iavf_ring *ring, bool in_sw)
0114 {
0115     u32 head, tail;
0116 
0117     /* underlying hardware might not allow access and/or always return
0118      * 0 for the head/tail registers so just use the cached values
0119      */
0120     head = ring->next_to_clean;
0121     tail = ring->next_to_use;
0122 
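    /* e.g. with a 256 descriptor ring, head = 250 and tail = 10, the wrap
     * case below reports 10 + 256 - 250 = 16 descriptors still pending.
     */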
0123     if (head != tail)
0124         return (head < tail) ?
0125             tail - head : (tail + ring->count - head);
0126 
0127     return 0;
0128 }
0129 
0130 /**
0131  * iavf_detect_recover_hung - detect hung Tx queues and trigger recovery
0132  * @vsi:  pointer to vsi struct with tx queues
0133  *
0134  * The VSI has a netdev and the netdev has Tx queues. This function checks
0135  * each Tx queue and, if it appears hung, triggers recovery via a SW interrupt.
0136  **/
0137 void iavf_detect_recover_hung(struct iavf_vsi *vsi)
0138 {
0139     struct iavf_ring *tx_ring = NULL;
0140     struct net_device *netdev;
0141     unsigned int i;
0142     int packets;
0143 
0144     if (!vsi)
0145         return;
0146 
0147     if (test_bit(__IAVF_VSI_DOWN, vsi->state))
0148         return;
0149 
0150     netdev = vsi->netdev;
0151     if (!netdev)
0152         return;
0153 
0154     if (!netif_carrier_ok(netdev))
0155         return;
0156 
0157     for (i = 0; i < vsi->back->num_active_queues; i++) {
0158         tx_ring = &vsi->back->tx_rings[i];
0159         if (tx_ring && tx_ring->desc) {
0160             /* If packet counter has not changed the queue is
0161              * likely stalled, so force an interrupt for this
0162              * queue.
0163              *
0164              * prev_pkt_ctr would be negative if there was no
0165              * pending work.
0166              */
0167             packets = tx_ring->stats.packets & INT_MAX;
0168             if (tx_ring->tx_stats.prev_pkt_ctr == packets) {
0169                 iavf_force_wb(vsi, tx_ring->q_vector);
0170                 continue;
0171             }
0172 
0173             /* Memory barrier between read of packet count and call
0174              * to iavf_get_tx_pending()
0175              */
0176             smp_rmb();
0177             tx_ring->tx_stats.prev_pkt_ctr =
0178               iavf_get_tx_pending(tx_ring, true) ? packets : -1;
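            /* i.e. re-arm the counter with the current packet count only
             * while descriptors are still outstanding; -1 disarms the hung
             * check for this queue until new work shows up.
             */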
0179         }
0180     }
0181 }
0182 
0183 #define WB_STRIDE 4
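/* Writeback is only forced below when fewer than WB_STRIDE (4) descriptors
 * are still pending, i.e. (pending / WB_STRIDE) == 0 && pending > 0.
 */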
0184 
0185 /**
0186  * iavf_clean_tx_irq - Reclaim resources after transmit completes
0187  * @vsi: the VSI we care about
0188  * @tx_ring: Tx ring to clean
0189  * @napi_budget: Used to determine if we are in netpoll
0190  *
0191  * Returns true if there's any budget left (e.g. the clean is finished)
0192  **/
0193 static bool iavf_clean_tx_irq(struct iavf_vsi *vsi,
0194                   struct iavf_ring *tx_ring, int napi_budget)
0195 {
0196     int i = tx_ring->next_to_clean;
0197     struct iavf_tx_buffer *tx_buf;
0198     struct iavf_tx_desc *tx_desc;
0199     unsigned int total_bytes = 0, total_packets = 0;
0200     unsigned int budget = IAVF_DEFAULT_IRQ_WORK;
0201 
0202     tx_buf = &tx_ring->tx_bi[i];
0203     tx_desc = IAVF_TX_DESC(tx_ring, i);
0204     i -= tx_ring->count;
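    /* i is biased downward by the ring count so the wrap checks below reduce
     * to the cheap "!i" test; it is re-biased (i += count) before being
     * written back to next_to_clean.
     */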
0205 
0206     do {
0207         struct iavf_tx_desc *eop_desc = tx_buf->next_to_watch;
0208 
0209         /* if next_to_watch is not set then there is no work pending */
0210         if (!eop_desc)
0211             break;
0212 
0213         /* prevent any other reads prior to eop_desc */
0214         smp_rmb();
0215 
0216         iavf_trace(clean_tx_irq, tx_ring, tx_desc, tx_buf);
0217         /* if the descriptor isn't done, no work yet to do */
0218         if (!(eop_desc->cmd_type_offset_bsz &
0219               cpu_to_le64(IAVF_TX_DESC_DTYPE_DESC_DONE)))
0220             break;
0221 
0222         /* clear next_to_watch to prevent false hangs */
0223         tx_buf->next_to_watch = NULL;
0224 
0225         /* update the statistics for this packet */
0226         total_bytes += tx_buf->bytecount;
0227         total_packets += tx_buf->gso_segs;
0228 
0229         /* free the skb */
0230         napi_consume_skb(tx_buf->skb, napi_budget);
0231 
0232         /* unmap skb header data */
0233         dma_unmap_single(tx_ring->dev,
0234                  dma_unmap_addr(tx_buf, dma),
0235                  dma_unmap_len(tx_buf, len),
0236                  DMA_TO_DEVICE);
0237 
0238         /* clear tx_buffer data */
0239         tx_buf->skb = NULL;
0240         dma_unmap_len_set(tx_buf, len, 0);
0241 
0242         /* unmap remaining buffers */
0243         while (tx_desc != eop_desc) {
0244             iavf_trace(clean_tx_irq_unmap,
0245                    tx_ring, tx_desc, tx_buf);
0246 
0247             tx_buf++;
0248             tx_desc++;
0249             i++;
0250             if (unlikely(!i)) {
0251                 i -= tx_ring->count;
0252                 tx_buf = tx_ring->tx_bi;
0253                 tx_desc = IAVF_TX_DESC(tx_ring, 0);
0254             }
0255 
0256             /* unmap any remaining paged data */
0257             if (dma_unmap_len(tx_buf, len)) {
0258                 dma_unmap_page(tx_ring->dev,
0259                            dma_unmap_addr(tx_buf, dma),
0260                            dma_unmap_len(tx_buf, len),
0261                            DMA_TO_DEVICE);
0262                 dma_unmap_len_set(tx_buf, len, 0);
0263             }
0264         }
0265 
0266         /* move us one more past the eop_desc for start of next pkt */
0267         tx_buf++;
0268         tx_desc++;
0269         i++;
0270         if (unlikely(!i)) {
0271             i -= tx_ring->count;
0272             tx_buf = tx_ring->tx_bi;
0273             tx_desc = IAVF_TX_DESC(tx_ring, 0);
0274         }
0275 
0276         prefetch(tx_desc);
0277 
0278         /* update budget accounting */
0279         budget--;
0280     } while (likely(budget));
0281 
0282     i += tx_ring->count;
0283     tx_ring->next_to_clean = i;
0284     u64_stats_update_begin(&tx_ring->syncp);
0285     tx_ring->stats.bytes += total_bytes;
0286     tx_ring->stats.packets += total_packets;
0287     u64_stats_update_end(&tx_ring->syncp);
0288     tx_ring->q_vector->tx.total_bytes += total_bytes;
0289     tx_ring->q_vector->tx.total_packets += total_packets;
0290 
0291     if (tx_ring->flags & IAVF_TXR_FLAGS_WB_ON_ITR) {
0292         /* check to see if there are < 4 descriptors
0293          * waiting to be written back, then kick the hardware to force
0294          * them to be written back in case we stay in NAPI.
0295          * In this mode on X722 we do not enable the interrupt.
0296          */
0297         unsigned int j = iavf_get_tx_pending(tx_ring, false);
0298 
0299         if (budget &&
0300             ((j / WB_STRIDE) == 0) && (j > 0) &&
0301             !test_bit(__IAVF_VSI_DOWN, vsi->state) &&
0302             (IAVF_DESC_UNUSED(tx_ring) != tx_ring->count))
0303             tx_ring->arm_wb = true;
0304     }
0305 
0306     /* notify netdev of completed buffers */
0307     netdev_tx_completed_queue(txring_txq(tx_ring),
0308                   total_packets, total_bytes);
0309 
0310 #define TX_WAKE_THRESHOLD ((s16)(DESC_NEEDED * 2))
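    /* DESC_NEEDED approximates the worst-case descriptor count for a single
     * frame, so the queue is only restarted once roughly two more maximally
     * fragmented frames would fit.
     */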
0311     if (unlikely(total_packets && netif_carrier_ok(tx_ring->netdev) &&
0312              (IAVF_DESC_UNUSED(tx_ring) >= TX_WAKE_THRESHOLD))) {
0313         /* Make sure that anybody stopping the queue after this
0314          * sees the new next_to_clean.
0315          */
0316         smp_mb();
0317         if (__netif_subqueue_stopped(tx_ring->netdev,
0318                          tx_ring->queue_index) &&
0319            !test_bit(__IAVF_VSI_DOWN, vsi->state)) {
0320             netif_wake_subqueue(tx_ring->netdev,
0321                         tx_ring->queue_index);
0322             ++tx_ring->tx_stats.restart_queue;
0323         }
0324     }
0325 
0326     return !!budget;
0327 }
0328 
0329 /**
0330  * iavf_enable_wb_on_itr - Arm hardware to do a wb, interrupts are not enabled
0331  * @vsi: the VSI we care about
0332  * @q_vector: the vector on which to enable writeback
0333  *
0334  **/
0335 static void iavf_enable_wb_on_itr(struct iavf_vsi *vsi,
0336                   struct iavf_q_vector *q_vector)
0337 {
0338     u16 flags = q_vector->tx.ring[0].flags;
0339     u32 val;
0340 
0341     if (!(flags & IAVF_TXR_FLAGS_WB_ON_ITR))
0342         return;
0343 
0344     if (q_vector->arm_wb_state)
0345         return;
0346 
0347     val = IAVF_VFINT_DYN_CTLN1_WB_ON_ITR_MASK |
0348           IAVF_VFINT_DYN_CTLN1_ITR_INDX_MASK; /* set noitr */
0349 
0350     wr32(&vsi->back->hw,
0351          IAVF_VFINT_DYN_CTLN1(q_vector->reg_idx), val);
0352     q_vector->arm_wb_state = true;
0353 }
0354 
0355 /**
0356  * iavf_force_wb - Issue SW Interrupt so HW does a wb
0357  * @vsi: the VSI we care about
0358  * @q_vector: the vector  on which to force writeback
0359  *
0360  **/
0361 void iavf_force_wb(struct iavf_vsi *vsi, struct iavf_q_vector *q_vector)
0362 {
0363     u32 val = IAVF_VFINT_DYN_CTLN1_INTENA_MASK |
0364           IAVF_VFINT_DYN_CTLN1_ITR_INDX_MASK | /* set noitr */
0365           IAVF_VFINT_DYN_CTLN1_SWINT_TRIG_MASK |
0366           IAVF_VFINT_DYN_CTLN1_SW_ITR_INDX_ENA_MASK
0367           /* allow 00 to be written to the index */;
0368 
0369     wr32(&vsi->back->hw,
0370          IAVF_VFINT_DYN_CTLN1(q_vector->reg_idx),
0371          val);
0372 }
0373 
0374 static inline bool iavf_container_is_rx(struct iavf_q_vector *q_vector,
0375                     struct iavf_ring_container *rc)
0376 {
0377     return &q_vector->rx == rc;
0378 }
0379 
0380 #define IAVF_AIM_MULTIPLIER_100G    2560
0381 #define IAVF_AIM_MULTIPLIER_50G     1280
0382 #define IAVF_AIM_MULTIPLIER_40G     1024
0383 #define IAVF_AIM_MULTIPLIER_20G     512
0384 #define IAVF_AIM_MULTIPLIER_10G     256
0385 #define IAVF_AIM_MULTIPLIER_1G      32
0386 
0387 static unsigned int iavf_mbps_itr_multiplier(u32 speed_mbps)
0388 {
0389     switch (speed_mbps) {
0390     case SPEED_100000:
0391         return IAVF_AIM_MULTIPLIER_100G;
0392     case SPEED_50000:
0393         return IAVF_AIM_MULTIPLIER_50G;
0394     case SPEED_40000:
0395         return IAVF_AIM_MULTIPLIER_40G;
0396     case SPEED_25000:
0397     case SPEED_20000:
0398         return IAVF_AIM_MULTIPLIER_20G;
0399     case SPEED_10000:
0400     default:
0401         return IAVF_AIM_MULTIPLIER_10G;
0402     case SPEED_1000:
0403     case SPEED_100:
0404         return IAVF_AIM_MULTIPLIER_1G;
0405     }
0406 }
0407 
0408 static unsigned int
0409 iavf_virtchnl_itr_multiplier(enum virtchnl_link_speed speed_virtchnl)
0410 {
0411     switch (speed_virtchnl) {
0412     case VIRTCHNL_LINK_SPEED_40GB:
0413         return IAVF_AIM_MULTIPLIER_40G;
0414     case VIRTCHNL_LINK_SPEED_25GB:
0415     case VIRTCHNL_LINK_SPEED_20GB:
0416         return IAVF_AIM_MULTIPLIER_20G;
0417     case VIRTCHNL_LINK_SPEED_10GB:
0418     default:
0419         return IAVF_AIM_MULTIPLIER_10G;
0420     case VIRTCHNL_LINK_SPEED_1GB:
0421     case VIRTCHNL_LINK_SPEED_100MB:
0422         return IAVF_AIM_MULTIPLIER_1G;
0423     }
0424 }
0425 
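/* Scale the smallest ITR step by a per-link-speed multiplier; e.g. with
 * ADV_LINK_SUPPORT and a 40Gb link this returns
 * IAVF_ITR_ADAPTIVE_MIN_INC * IAVF_AIM_MULTIPLIER_40G.
 */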
0426 static unsigned int iavf_itr_divisor(struct iavf_adapter *adapter)
0427 {
0428     if (ADV_LINK_SUPPORT(adapter))
0429         return IAVF_ITR_ADAPTIVE_MIN_INC *
0430             iavf_mbps_itr_multiplier(adapter->link_speed_mbps);
0431     else
0432         return IAVF_ITR_ADAPTIVE_MIN_INC *
0433             iavf_virtchnl_itr_multiplier(adapter->link_speed);
0434 }
0435 
0436 /**
0437  * iavf_update_itr - update the dynamic ITR value based on statistics
0438  * @q_vector: structure containing interrupt and ring information
0439  * @rc: structure containing ring performance data
0440  *
0441  * Stores a new ITR value based on packets and byte
0442  * counts during the last interrupt.  The advantage of per interrupt
0443  * computation is faster updates and more accurate ITR for the current
0444  * traffic pattern.  Constants in this function were computed
0445  * based on theoretical maximum wire speed and thresholds were set based
0446  * on testing data as well as attempting to minimize response time
0447  * while increasing bulk throughput.
0448  **/
0449 static void iavf_update_itr(struct iavf_q_vector *q_vector,
0450                 struct iavf_ring_container *rc)
0451 {
0452     unsigned int avg_wire_size, packets, bytes, itr;
0453     unsigned long next_update = jiffies;
0454 
0455     /* If we don't have any rings just leave ourselves set for maximum
0456      * possible latency so we take ourselves out of the equation.
0457      */
0458     if (!rc->ring || !ITR_IS_DYNAMIC(rc->ring->itr_setting))
0459         return;
0460 
0461     /* For Rx we want to push the delay up and default to low latency.
0462      * for Tx we want to pull the delay down and default to high latency.
0463      */
0464     itr = iavf_container_is_rx(q_vector, rc) ?
0465           IAVF_ITR_ADAPTIVE_MIN_USECS | IAVF_ITR_ADAPTIVE_LATENCY :
0466           IAVF_ITR_ADAPTIVE_MAX_USECS | IAVF_ITR_ADAPTIVE_LATENCY;
0467 
0468     /* If we didn't update within up to 1 - 2 jiffies we can assume
0469      * that either packets are coming in so slow there hasn't been
0470      * any work, or that there is so much work that NAPI is dealing
0471      * with interrupt moderation and we don't need to do anything.
0472      */
0473     if (time_after(next_update, rc->next_update))
0474         goto clear_counts;
0475 
0476     /* If itr_countdown is set it means we programmed an ITR within
0477      * the last 4 interrupt cycles. This has a side effect of us
0478      * potentially firing an early interrupt. In order to work around
0479      * this we need to throw out any data received for a few
0480      * interrupts following the update.
0481      */
0482     if (q_vector->itr_countdown) {
0483         itr = rc->target_itr;
0484         goto clear_counts;
0485     }
0486 
0487     packets = rc->total_packets;
0488     bytes = rc->total_bytes;
0489 
0490     if (iavf_container_is_rx(q_vector, rc)) {
0491         /* For Rx, if there are 1 to 4 packets and fewer than 9000 bytes,
0492          * assume there is insufficient data to use the bulk rate limiting
0493          * approach unless Tx is already in bulk rate limiting. We are
0494          * likely latency driven.
0495          */
0496         if (packets && packets < 4 && bytes < 9000 &&
0497             (q_vector->tx.target_itr & IAVF_ITR_ADAPTIVE_LATENCY)) {
0498             itr = IAVF_ITR_ADAPTIVE_LATENCY;
0499             goto adjust_by_size;
0500         }
0501     } else if (packets < 4) {
0502         /* If we have Tx and Rx ITR maxed and Tx ITR is running in
0503          * bulk mode and we are receiving 4 or fewer packets just
0504          * reset the ITR_ADAPTIVE_LATENCY bit for latency mode so
0505          * that the Rx can relax.
0506          */
0507         if (rc->target_itr == IAVF_ITR_ADAPTIVE_MAX_USECS &&
0508             (q_vector->rx.target_itr & IAVF_ITR_MASK) ==
0509              IAVF_ITR_ADAPTIVE_MAX_USECS)
0510             goto clear_counts;
0511     } else if (packets > 32) {
0512         /* If we have processed over 32 packets in a single interrupt
0513          * for Tx assume we need to switch over to "bulk" mode.
0514          */
0515         rc->target_itr &= ~IAVF_ITR_ADAPTIVE_LATENCY;
0516     }
0517 
0518     /* We have no packets to actually measure against. This means
0519      * either one of the other queues on this vector is active or
0520      * we are a Tx queue doing TSO with too high of an interrupt rate.
0521      *
0522      * Between 4 and 56 we can assume that our current interrupt delay
0523      * is only slightly too low. As such we should increase it by a small
0524      * fixed amount.
0525      */
0526     if (packets < 56) {
0527         itr = rc->target_itr + IAVF_ITR_ADAPTIVE_MIN_INC;
0528         if ((itr & IAVF_ITR_MASK) > IAVF_ITR_ADAPTIVE_MAX_USECS) {
0529             itr &= IAVF_ITR_ADAPTIVE_LATENCY;
0530             itr += IAVF_ITR_ADAPTIVE_MAX_USECS;
0531         }
0532         goto clear_counts;
0533     }
0534 
0535     if (packets <= 256) {
0536         itr = min(q_vector->tx.current_itr, q_vector->rx.current_itr);
0537         itr &= IAVF_ITR_MASK;
0538 
0539         /* Between 56 and 112 is our "goldilocks" zone where we are
0540          * working out "just right". Just report that our current
0541          * ITR is good for us.
0542          */
0543         if (packets <= 112)
0544             goto clear_counts;
0545 
0546         /* If packet count is 128 or greater we are likely looking
0547          * at a slight overrun of the delay we want. Try halving
0548          * our delay to see if that will cut the number of packets
0549          * in half per interrupt.
0550          */
0551         itr /= 2;
0552         itr &= IAVF_ITR_MASK;
0553         if (itr < IAVF_ITR_ADAPTIVE_MIN_USECS)
0554             itr = IAVF_ITR_ADAPTIVE_MIN_USECS;
0555 
0556         goto clear_counts;
0557     }
0558 
0559     /* The paths below assume we are dealing with a bulk ITR since
0560      * number of packets is greater than 256. We are just going to have
0561      * to compute a value and try to bring the count under control,
0562      * though for smaller packet sizes there isn't much we can do as
0563      * NAPI polling will likely be kicking in sooner rather than later.
0564      */
0565     itr = IAVF_ITR_ADAPTIVE_BULK;
0566 
0567 adjust_by_size:
0568     /* If packet counts are 256 or greater we can assume we have a gross
0569      * overestimation of what the rate should be. Instead of trying to fine
0570      * tune it just use the formula below to try and dial in an exact value
0571      * given the current packet size of the frame.
0572      */
0573     avg_wire_size = bytes / packets;
0574 
0575     /* The following is a crude approximation of:
0576      *  wmem_default / (size + overhead) = desired_pkts_per_int
0577      *  rate / bits_per_byte / (size + ethernet overhead) = pkt_rate
0578      *  (desired_pkt_rate / pkt_rate) * usecs_per_sec = ITR value
0579      *
0580      * Assuming wmem_default is 212992 and overhead is 640 bytes per
0581      * packet, (256 skb, 64 headroom, 320 shared info), we can reduce the
0582      * formula down to
0583      *
0584      *  (170 * (size + 24)) / (size + 640) = ITR
0585      *
0586      * We first do some math on the packet size and then finally bitshift
0587      * by 8 after rounding up. We also have to account for PCIe link speed
0588      * difference as ITR scales based on this.
0589      */
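    /* Worked example (assuming IAVF_ITR_ADAPTIVE_MIN_INC is 2): a 60 byte
     * average frame maps to avg_wire_size = 4096 below; on a 40Gb link the
     * divisor is 2 * 1024, so the ITR contribution works out to
     * DIV_ROUND_UP(4096, 2048) * 2 = 4 usecs, i.e. roughly 250K ints/sec.
     */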
0590     if (avg_wire_size <= 60) {
0591         /* Start at 250k ints/sec */
0592         avg_wire_size = 4096;
0593     } else if (avg_wire_size <= 380) {
0594         /* 250K ints/sec to 60K ints/sec */
0595         avg_wire_size *= 40;
0596         avg_wire_size += 1696;
0597     } else if (avg_wire_size <= 1084) {
0598         /* 60K ints/sec to 36K ints/sec */
0599         avg_wire_size *= 15;
0600         avg_wire_size += 11452;
0601     } else if (avg_wire_size <= 1980) {
0602         /* 36K ints/sec to 30K ints/sec */
0603         avg_wire_size *= 5;
0604         avg_wire_size += 22420;
0605     } else {
0606         /* plateau at a limit of 30K ints/sec */
0607         avg_wire_size = 32256;
0608     }
0609 
0610     /* If we are in low latency mode halve our delay which doubles the
0611      * rate to somewhere between 100K to 16K ints/sec
0612      */
0613     if (itr & IAVF_ITR_ADAPTIVE_LATENCY)
0614         avg_wire_size /= 2;
0615 
0616     /* Resultant value is 256 times larger than it needs to be. This
0617      * gives us room to adjust the value as needed to either increase
0618      * or decrease the value based on link speeds of 10G, 2.5G, 1G, etc.
0619      *
0620      * Use addition as we have already recorded the new latency flag
0621      * for the ITR value.
0622      */
0623     itr += DIV_ROUND_UP(avg_wire_size,
0624                 iavf_itr_divisor(q_vector->adapter)) *
0625         IAVF_ITR_ADAPTIVE_MIN_INC;
0626 
0627     if ((itr & IAVF_ITR_MASK) > IAVF_ITR_ADAPTIVE_MAX_USECS) {
0628         itr &= IAVF_ITR_ADAPTIVE_LATENCY;
0629         itr += IAVF_ITR_ADAPTIVE_MAX_USECS;
0630     }
0631 
0632 clear_counts:
0633     /* write back value */
0634     rc->target_itr = itr;
0635 
0636     /* next update should occur within next jiffy */
0637     rc->next_update = next_update + 1;
0638 
0639     rc->total_bytes = 0;
0640     rc->total_packets = 0;
0641 }
0642 
0643 /**
0644  * iavf_setup_tx_descriptors - Allocate the Tx descriptors
0645  * @tx_ring: the tx ring to set up
0646  *
0647  * Return 0 on success, negative on error
0648  **/
0649 int iavf_setup_tx_descriptors(struct iavf_ring *tx_ring)
0650 {
0651     struct device *dev = tx_ring->dev;
0652     int bi_size;
0653 
0654     if (!dev)
0655         return -ENOMEM;
0656 
0657     /* warn if we are about to overwrite the pointer */
0658     WARN_ON(tx_ring->tx_bi);
0659     bi_size = sizeof(struct iavf_tx_buffer) * tx_ring->count;
0660     tx_ring->tx_bi = kzalloc(bi_size, GFP_KERNEL);
0661     if (!tx_ring->tx_bi)
0662         goto err;
0663 
0664     /* round up to nearest 4K */
0665     tx_ring->size = tx_ring->count * sizeof(struct iavf_tx_desc);
0666     tx_ring->size = ALIGN(tx_ring->size, 4096);
0667     tx_ring->desc = dma_alloc_coherent(dev, tx_ring->size,
0668                        &tx_ring->dma, GFP_KERNEL);
0669     if (!tx_ring->desc) {
0670         dev_info(dev, "Unable to allocate memory for the Tx descriptor ring, size=%d\n",
0671              tx_ring->size);
0672         goto err;
0673     }
0674 
0675     tx_ring->next_to_use = 0;
0676     tx_ring->next_to_clean = 0;
0677     tx_ring->tx_stats.prev_pkt_ctr = -1;
0678     return 0;
0679 
0680 err:
0681     kfree(tx_ring->tx_bi);
0682     tx_ring->tx_bi = NULL;
0683     return -ENOMEM;
0684 }
0685 
0686 /**
0687  * iavf_clean_rx_ring - Free Rx buffers
0688  * @rx_ring: ring to be cleaned
0689  **/
0690 void iavf_clean_rx_ring(struct iavf_ring *rx_ring)
0691 {
0692     unsigned long bi_size;
0693     u16 i;
0694 
0695     /* ring already cleared, nothing to do */
0696     if (!rx_ring->rx_bi)
0697         return;
0698 
0699     if (rx_ring->skb) {
0700         dev_kfree_skb(rx_ring->skb);
0701         rx_ring->skb = NULL;
0702     }
0703 
0704     /* Free all the Rx ring sk_buffs */
0705     for (i = 0; i < rx_ring->count; i++) {
0706         struct iavf_rx_buffer *rx_bi = &rx_ring->rx_bi[i];
0707 
0708         if (!rx_bi->page)
0709             continue;
0710 
0711         /* Invalidate cache lines that may have been written to by
0712          * device so that we avoid corrupting memory.
0713          */
0714         dma_sync_single_range_for_cpu(rx_ring->dev,
0715                           rx_bi->dma,
0716                           rx_bi->page_offset,
0717                           rx_ring->rx_buf_len,
0718                           DMA_FROM_DEVICE);
0719 
0720         /* free resources associated with mapping */
0721         dma_unmap_page_attrs(rx_ring->dev, rx_bi->dma,
0722                      iavf_rx_pg_size(rx_ring),
0723                      DMA_FROM_DEVICE,
0724                      IAVF_RX_DMA_ATTR);
0725 
0726         __page_frag_cache_drain(rx_bi->page, rx_bi->pagecnt_bias);
0727 
0728         rx_bi->page = NULL;
0729         rx_bi->page_offset = 0;
0730     }
0731 
0732     bi_size = sizeof(struct iavf_rx_buffer) * rx_ring->count;
0733     memset(rx_ring->rx_bi, 0, bi_size);
0734 
0735     /* Zero out the descriptor ring */
0736     memset(rx_ring->desc, 0, rx_ring->size);
0737 
0738     rx_ring->next_to_alloc = 0;
0739     rx_ring->next_to_clean = 0;
0740     rx_ring->next_to_use = 0;
0741 }
0742 
0743 /**
0744  * iavf_free_rx_resources - Free Rx resources
0745  * @rx_ring: ring to clean the resources from
0746  *
0747  * Free all receive software resources
0748  **/
0749 void iavf_free_rx_resources(struct iavf_ring *rx_ring)
0750 {
0751     iavf_clean_rx_ring(rx_ring);
0752     kfree(rx_ring->rx_bi);
0753     rx_ring->rx_bi = NULL;
0754 
0755     if (rx_ring->desc) {
0756         dma_free_coherent(rx_ring->dev, rx_ring->size,
0757                   rx_ring->desc, rx_ring->dma);
0758         rx_ring->desc = NULL;
0759     }
0760 }
0761 
0762 /**
0763  * iavf_setup_rx_descriptors - Allocate Rx descriptors
0764  * @rx_ring: Rx descriptor ring (for a specific queue) to setup
0765  *
0766  * Returns 0 on success, negative on failure
0767  **/
0768 int iavf_setup_rx_descriptors(struct iavf_ring *rx_ring)
0769 {
0770     struct device *dev = rx_ring->dev;
0771     int bi_size;
0772 
0773     /* warn if we are about to overwrite the pointer */
0774     WARN_ON(rx_ring->rx_bi);
0775     bi_size = sizeof(struct iavf_rx_buffer) * rx_ring->count;
0776     rx_ring->rx_bi = kzalloc(bi_size, GFP_KERNEL);
0777     if (!rx_ring->rx_bi)
0778         goto err;
0779 
0780     u64_stats_init(&rx_ring->syncp);
0781 
0782     /* Round up to nearest 4K */
0783     rx_ring->size = rx_ring->count * sizeof(union iavf_32byte_rx_desc);
0784     rx_ring->size = ALIGN(rx_ring->size, 4096);
0785     rx_ring->desc = dma_alloc_coherent(dev, rx_ring->size,
0786                        &rx_ring->dma, GFP_KERNEL);
0787 
0788     if (!rx_ring->desc) {
0789         dev_info(dev, "Unable to allocate memory for the Rx descriptor ring, size=%d\n",
0790              rx_ring->size);
0791         goto err;
0792     }
0793 
0794     rx_ring->next_to_alloc = 0;
0795     rx_ring->next_to_clean = 0;
0796     rx_ring->next_to_use = 0;
0797 
0798     return 0;
0799 err:
0800     kfree(rx_ring->rx_bi);
0801     rx_ring->rx_bi = NULL;
0802     return -ENOMEM;
0803 }
0804 
0805 /**
0806  * iavf_release_rx_desc - Store the new tail and head values
0807  * @rx_ring: ring to bump
0808  * @val: new head index
0809  **/
0810 static inline void iavf_release_rx_desc(struct iavf_ring *rx_ring, u32 val)
0811 {
0812     rx_ring->next_to_use = val;
0813 
0814     /* update next to alloc since we have filled the ring */
0815     rx_ring->next_to_alloc = val;
0816 
0817     /* Force memory writes to complete before letting h/w
0818      * know there are new descriptors to fetch.  (Only
0819      * applicable for weak-ordered memory model archs,
0820      * such as IA-64).
0821      */
0822     wmb();
0823     writel(val, rx_ring->tail);
0824 }
0825 
0826 /**
0827  * iavf_rx_offset - Return expected offset into page to access data
0828  * @rx_ring: Ring we are requesting offset of
0829  *
0830  * Returns the offset into the page at which packet data begins for this ring.
0831  */
0832 static inline unsigned int iavf_rx_offset(struct iavf_ring *rx_ring)
0833 {
0834     return ring_uses_build_skb(rx_ring) ? IAVF_SKB_PAD : 0;
0835 }
0836 
0837 /**
0838  * iavf_alloc_mapped_page - recycle or make a new page
0839  * @rx_ring: ring to use
0840  * @bi: rx_buffer struct to modify
0841  *
0842  * Returns true if the page was successfully allocated or
0843  * reused.
0844  **/
0845 static bool iavf_alloc_mapped_page(struct iavf_ring *rx_ring,
0846                    struct iavf_rx_buffer *bi)
0847 {
0848     struct page *page = bi->page;
0849     dma_addr_t dma;
0850 
0851     /* since we are recycling buffers we should seldom need to alloc */
0852     if (likely(page)) {
0853         rx_ring->rx_stats.page_reuse_count++;
0854         return true;
0855     }
0856 
0857     /* alloc new page for storage */
0858     page = dev_alloc_pages(iavf_rx_pg_order(rx_ring));
0859     if (unlikely(!page)) {
0860         rx_ring->rx_stats.alloc_page_failed++;
0861         return false;
0862     }
0863 
0864     /* map page for use */
0865     dma = dma_map_page_attrs(rx_ring->dev, page, 0,
0866                  iavf_rx_pg_size(rx_ring),
0867                  DMA_FROM_DEVICE,
0868                  IAVF_RX_DMA_ATTR);
0869 
0870     /* if mapping failed free memory back to system since
0871      * there isn't much point in holding memory we can't use
0872      */
0873     if (dma_mapping_error(rx_ring->dev, dma)) {
0874         __free_pages(page, iavf_rx_pg_order(rx_ring));
0875         rx_ring->rx_stats.alloc_page_failed++;
0876         return false;
0877     }
0878 
0879     bi->dma = dma;
0880     bi->page = page;
0881     bi->page_offset = iavf_rx_offset(rx_ring);
0882 
0883     /* initialize pagecnt_bias to 1 representing we fully own page */
0884     bi->pagecnt_bias = 1;
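    /* pagecnt_bias tracks how many of the page's references the driver still
     * owns; iavf_can_reuse_rx_page() re-stocks it (and the page refcount)
     * with USHRT_MAX once it has been drained.
     */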
0885 
0886     return true;
0887 }
0888 
0889 /**
0890  * iavf_receive_skb - Send a completed packet up the stack
0891  * @rx_ring:  rx ring in play
0892  * @skb: packet to send up
0893  * @vlan_tag: vlan tag for packet
0894  **/
0895 static void iavf_receive_skb(struct iavf_ring *rx_ring,
0896                  struct sk_buff *skb, u16 vlan_tag)
0897 {
0898     struct iavf_q_vector *q_vector = rx_ring->q_vector;
0899 
0900     if ((rx_ring->netdev->features & NETIF_F_HW_VLAN_CTAG_RX) &&
0901         (vlan_tag & VLAN_VID_MASK))
0902         __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tag);
0903     else if ((rx_ring->netdev->features & NETIF_F_HW_VLAN_STAG_RX) &&
0904          vlan_tag & VLAN_VID_MASK)
0905         __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021AD), vlan_tag);
0906 
0907     napi_gro_receive(&q_vector->napi, skb);
0908 }
0909 
0910 /**
0911  * iavf_alloc_rx_buffers - Replace used receive buffers
0912  * @rx_ring: ring to place buffers on
0913  * @cleaned_count: number of buffers to replace
0914  *
0915  * Returns false if all allocations were successful, true if any fail
0916  **/
0917 bool iavf_alloc_rx_buffers(struct iavf_ring *rx_ring, u16 cleaned_count)
0918 {
0919     u16 ntu = rx_ring->next_to_use;
0920     union iavf_rx_desc *rx_desc;
0921     struct iavf_rx_buffer *bi;
0922 
0923     /* do nothing if no valid netdev defined */
0924     if (!rx_ring->netdev || !cleaned_count)
0925         return false;
0926 
0927     rx_desc = IAVF_RX_DESC(rx_ring, ntu);
0928     bi = &rx_ring->rx_bi[ntu];
0929 
0930     do {
0931         if (!iavf_alloc_mapped_page(rx_ring, bi))
0932             goto no_buffers;
0933 
0934         /* sync the buffer for use by the device */
0935         dma_sync_single_range_for_device(rx_ring->dev, bi->dma,
0936                          bi->page_offset,
0937                          rx_ring->rx_buf_len,
0938                          DMA_FROM_DEVICE);
0939 
0940         /* Refresh the desc even if buffer_addrs didn't change
0941          * because each write-back erases this info.
0942          */
0943         rx_desc->read.pkt_addr = cpu_to_le64(bi->dma + bi->page_offset);
0944 
0945         rx_desc++;
0946         bi++;
0947         ntu++;
0948         if (unlikely(ntu == rx_ring->count)) {
0949             rx_desc = IAVF_RX_DESC(rx_ring, 0);
0950             bi = rx_ring->rx_bi;
0951             ntu = 0;
0952         }
0953 
0954         /* clear the status bits for the next_to_use descriptor */
0955         rx_desc->wb.qword1.status_error_len = 0;
0956 
0957         cleaned_count--;
0958     } while (cleaned_count);
0959 
0960     if (rx_ring->next_to_use != ntu)
0961         iavf_release_rx_desc(rx_ring, ntu);
0962 
0963     return false;
0964 
0965 no_buffers:
0966     if (rx_ring->next_to_use != ntu)
0967         iavf_release_rx_desc(rx_ring, ntu);
0968 
0969     /* make sure to come back via polling to try again after
0970      * allocation failure
0971      */
0972     return true;
0973 }
0974 
0975 /**
0976  * iavf_rx_checksum - Indicate in skb if hw indicated a good cksum
0977  * @vsi: the VSI we care about
0978  * @skb: skb currently being received and modified
0979  * @rx_desc: the receive descriptor
0980  **/
0981 static inline void iavf_rx_checksum(struct iavf_vsi *vsi,
0982                     struct sk_buff *skb,
0983                     union iavf_rx_desc *rx_desc)
0984 {
0985     struct iavf_rx_ptype_decoded decoded;
0986     u32 rx_error, rx_status;
0987     bool ipv4, ipv6;
0988     u8 ptype;
0989     u64 qword;
0990 
0991     qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
0992     ptype = (qword & IAVF_RXD_QW1_PTYPE_MASK) >> IAVF_RXD_QW1_PTYPE_SHIFT;
0993     rx_error = (qword & IAVF_RXD_QW1_ERROR_MASK) >>
0994            IAVF_RXD_QW1_ERROR_SHIFT;
0995     rx_status = (qword & IAVF_RXD_QW1_STATUS_MASK) >>
0996             IAVF_RXD_QW1_STATUS_SHIFT;
0997     decoded = decode_rx_desc_ptype(ptype);
0998 
0999     skb->ip_summed = CHECKSUM_NONE;
1000 
1001     skb_checksum_none_assert(skb);
1002 
1003     /* Rx csum enabled and ip headers found? */
1004     if (!(vsi->netdev->features & NETIF_F_RXCSUM))
1005         return;
1006 
1007     /* did the hardware decode the packet and checksum? */
1008     if (!(rx_status & BIT(IAVF_RX_DESC_STATUS_L3L4P_SHIFT)))
1009         return;
1010 
1011     /* both known and outer_ip must be set for the below code to work */
1012     if (!(decoded.known && decoded.outer_ip))
1013         return;
1014 
1015     ipv4 = (decoded.outer_ip == IAVF_RX_PTYPE_OUTER_IP) &&
1016            (decoded.outer_ip_ver == IAVF_RX_PTYPE_OUTER_IPV4);
1017     ipv6 = (decoded.outer_ip == IAVF_RX_PTYPE_OUTER_IP) &&
1018            (decoded.outer_ip_ver == IAVF_RX_PTYPE_OUTER_IPV6);
1019 
1020     if (ipv4 &&
1021         (rx_error & (BIT(IAVF_RX_DESC_ERROR_IPE_SHIFT) |
1022              BIT(IAVF_RX_DESC_ERROR_EIPE_SHIFT))))
1023         goto checksum_fail;
1024 
1025     /* likely incorrect csum if alternate IP extension headers found */
1026     if (ipv6 &&
1027         rx_status & BIT(IAVF_RX_DESC_STATUS_IPV6EXADD_SHIFT))
1028         /* don't increment checksum err here, non-fatal err */
1029         return;
1030 
1031     /* there was some L4 error, count error and punt packet to the stack */
1032     if (rx_error & BIT(IAVF_RX_DESC_ERROR_L4E_SHIFT))
1033         goto checksum_fail;
1034 
1035     /* handle packets that were not able to be checksummed due
1036      * to arrival speed, in this case the stack can compute
1037      * the csum.
1038      */
1039     if (rx_error & BIT(IAVF_RX_DESC_ERROR_PPRS_SHIFT))
1040         return;
1041 
1042     /* Only report checksum unnecessary for TCP, UDP, or SCTP */
1043     switch (decoded.inner_prot) {
1044     case IAVF_RX_PTYPE_INNER_PROT_TCP:
1045     case IAVF_RX_PTYPE_INNER_PROT_UDP:
1046     case IAVF_RX_PTYPE_INNER_PROT_SCTP:
1047         skb->ip_summed = CHECKSUM_UNNECESSARY;
1048         fallthrough;
1049     default:
1050         break;
1051     }
1052 
1053     return;
1054 
1055 checksum_fail:
1056     vsi->back->hw_csum_rx_error++;
1057 }
1058 
1059 /**
1060  * iavf_ptype_to_htype - get a hash type
1061  * @ptype: the ptype value from the descriptor
1062  *
1063  * Returns a hash type to be used by skb_set_hash
1064  **/
1065 static inline int iavf_ptype_to_htype(u8 ptype)
1066 {
1067     struct iavf_rx_ptype_decoded decoded = decode_rx_desc_ptype(ptype);
1068 
1069     if (!decoded.known)
1070         return PKT_HASH_TYPE_NONE;
1071 
1072     if (decoded.outer_ip == IAVF_RX_PTYPE_OUTER_IP &&
1073         decoded.payload_layer == IAVF_RX_PTYPE_PAYLOAD_LAYER_PAY4)
1074         return PKT_HASH_TYPE_L4;
1075     else if (decoded.outer_ip == IAVF_RX_PTYPE_OUTER_IP &&
1076          decoded.payload_layer == IAVF_RX_PTYPE_PAYLOAD_LAYER_PAY3)
1077         return PKT_HASH_TYPE_L3;
1078     else
1079         return PKT_HASH_TYPE_L2;
1080 }
1081 
1082 /**
1083  * iavf_rx_hash - set the hash value in the skb
1084  * @ring: descriptor ring
1085  * @rx_desc: specific descriptor
1086  * @skb: skb currently being received and modified
1087  * @rx_ptype: Rx packet type
1088  **/
1089 static inline void iavf_rx_hash(struct iavf_ring *ring,
1090                 union iavf_rx_desc *rx_desc,
1091                 struct sk_buff *skb,
1092                 u8 rx_ptype)
1093 {
1094     u32 hash;
1095     const __le64 rss_mask =
1096         cpu_to_le64((u64)IAVF_RX_DESC_FLTSTAT_RSS_HASH <<
1097                 IAVF_RX_DESC_STATUS_FLTSTAT_SHIFT);
1098 
1099     if (!(ring->netdev->features & NETIF_F_RXHASH))
1100         return;
1101 
1102     if ((rx_desc->wb.qword1.status_error_len & rss_mask) == rss_mask) {
1103         hash = le32_to_cpu(rx_desc->wb.qword0.hi_dword.rss);
1104         skb_set_hash(skb, hash, iavf_ptype_to_htype(rx_ptype));
1105     }
1106 }
1107 
1108 /**
1109  * iavf_process_skb_fields - Populate skb header fields from Rx descriptor
1110  * @rx_ring: rx descriptor ring packet is being transacted on
1111  * @rx_desc: pointer to the EOP Rx descriptor
1112  * @skb: pointer to current skb being populated
1113  * @rx_ptype: the packet type decoded by hardware
1114  *
1115  * This function checks the ring, descriptor, and packet information in
1116  * order to populate the hash, checksum, VLAN, protocol, and
1117  * other fields within the skb.
1118  **/
1119 static inline
1120 void iavf_process_skb_fields(struct iavf_ring *rx_ring,
1121                  union iavf_rx_desc *rx_desc, struct sk_buff *skb,
1122                  u8 rx_ptype)
1123 {
1124     iavf_rx_hash(rx_ring, rx_desc, skb, rx_ptype);
1125 
1126     iavf_rx_checksum(rx_ring->vsi, skb, rx_desc);
1127 
1128     skb_record_rx_queue(skb, rx_ring->queue_index);
1129 
1130     /* modifies the skb - consumes the enet header */
1131     skb->protocol = eth_type_trans(skb, rx_ring->netdev);
1132 }
1133 
1134 /**
1135  * iavf_cleanup_headers - Correct empty headers
1136  * @rx_ring: rx descriptor ring packet is being transacted on
1137  * @skb: pointer to current skb being fixed
1138  *
1139  * Also address the case where we are pulling data in on pages only
1140  * and as such no data is present in the skb header.
1141  *
1142  * In addition if skb is not at least 60 bytes we need to pad it so that
1143  * it is large enough to qualify as a valid Ethernet frame.
1144  *
1145  * Returns true if an error was encountered and skb was freed.
1146  **/
1147 static bool iavf_cleanup_headers(struct iavf_ring *rx_ring, struct sk_buff *skb)
1148 {
1149     /* if eth_skb_pad returns an error the skb was freed */
1150     if (eth_skb_pad(skb))
1151         return true;
1152 
1153     return false;
1154 }
1155 
1156 /**
1157  * iavf_reuse_rx_page - page flip buffer and store it back on the ring
1158  * @rx_ring: rx descriptor ring to store buffers on
1159  * @old_buff: donor buffer to have page reused
1160  *
1161  * Synchronizes page for reuse by the adapter
1162  **/
1163 static void iavf_reuse_rx_page(struct iavf_ring *rx_ring,
1164                    struct iavf_rx_buffer *old_buff)
1165 {
1166     struct iavf_rx_buffer *new_buff;
1167     u16 nta = rx_ring->next_to_alloc;
1168 
1169     new_buff = &rx_ring->rx_bi[nta];
1170 
1171     /* update, and store next to alloc */
1172     nta++;
1173     rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
1174 
1175     /* transfer page from old buffer to new buffer */
1176     new_buff->dma       = old_buff->dma;
1177     new_buff->page      = old_buff->page;
1178     new_buff->page_offset   = old_buff->page_offset;
1179     new_buff->pagecnt_bias  = old_buff->pagecnt_bias;
1180 }
1181 
1182 /**
1183  * iavf_can_reuse_rx_page - Determine if this page can be reused by
1184  * the adapter for another receive
1185  *
1186  * @rx_buffer: buffer containing the page
1187  *
1188  * If page is reusable, rx_buffer->page_offset is adjusted to point to
1189  * an unused region in the page.
1190  *
1191  * For small pages, @truesize will be a constant value, half the size
1192  * of the memory at page.  We'll attempt to alternate between high and
1193  * low halves of the page, with one half ready for use by the hardware
1194  * and the other half being consumed by the stack.  We use the page
1195  * ref count to determine whether the stack has finished consuming the
1196  * portion of this page that was passed up with a previous packet.  If
1197  * the page ref count is >1, we'll assume the "other" half page is
1198  * still busy, and this page cannot be reused.
1199  *
1200  * For larger pages, @truesize will be the actual space used by the
1201  * received packet (adjusted upward to an even multiple of the cache
1202  * line size).  This will advance through the page by the amount
1203  * actually consumed by the received packets while there is still
1204  * space for a buffer.  Each region of larger pages will be used at
1205  * most once, after which the page will not be reused.
1206  *
1207  * In either case, if the page is reusable its refcount is increased.
1208  **/
1209 static bool iavf_can_reuse_rx_page(struct iavf_rx_buffer *rx_buffer)
1210 {
1211     unsigned int pagecnt_bias = rx_buffer->pagecnt_bias;
1212     struct page *page = rx_buffer->page;
1213 
1214     /* Is any reuse possible? */
1215     if (!dev_page_is_reusable(page))
1216         return false;
1217 
1218 #if (PAGE_SIZE < 8192)
1219     /* if we are only owner of page we can reuse it */
1220     if (unlikely((page_count(page) - pagecnt_bias) > 1))
1221         return false;
1222 #else
1223 #define IAVF_LAST_OFFSET \
1224     (SKB_WITH_OVERHEAD(PAGE_SIZE) - IAVF_RXBUFFER_2048)
1225     if (rx_buffer->page_offset > IAVF_LAST_OFFSET)
1226         return false;
1227 #endif
1228 
1229     /* If we have drained the page fragment pool we need to update
1230      * the pagecnt_bias and page count so that we fully restock the
1231      * number of references the driver holds.
1232      */
1233     if (unlikely(!pagecnt_bias)) {
1234         page_ref_add(page, USHRT_MAX);
1235         rx_buffer->pagecnt_bias = USHRT_MAX;
1236     }
1237 
1238     return true;
1239 }
1240 
1241 /**
1242  * iavf_add_rx_frag - Add contents of Rx buffer to sk_buff
1243  * @rx_ring: rx descriptor ring to transact packets on
1244  * @rx_buffer: buffer containing page to add
1245  * @skb: sk_buff to place the data into
1246  * @size: packet length from rx_desc
1247  *
1248  * This function will add the data contained in rx_buffer->page to the skb.
1249  * It will just attach the page as a frag to the skb.
1250  *
1251  * The function will then update the page offset.
1252  **/
1253 static void iavf_add_rx_frag(struct iavf_ring *rx_ring,
1254                  struct iavf_rx_buffer *rx_buffer,
1255                  struct sk_buff *skb,
1256                  unsigned int size)
1257 {
1258 #if (PAGE_SIZE < 8192)
1259     unsigned int truesize = iavf_rx_pg_size(rx_ring) / 2;
1260 #else
1261     unsigned int truesize = SKB_DATA_ALIGN(size + iavf_rx_offset(rx_ring));
1262 #endif
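    /* On small-page systems truesize is half the Rx page and the offset
     * update below XORs between the two halves; on larger pages the offset
     * simply advances by the aligned packet size.
     */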
1263 
1264     if (!size)
1265         return;
1266 
1267     skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_buffer->page,
1268             rx_buffer->page_offset, size, truesize);
1269 
1270     /* page is being used so we must update the page offset */
1271 #if (PAGE_SIZE < 8192)
1272     rx_buffer->page_offset ^= truesize;
1273 #else
1274     rx_buffer->page_offset += truesize;
1275 #endif
1276 }
1277 
1278 /**
1279  * iavf_get_rx_buffer - Fetch Rx buffer and synchronize data for use
1280  * @rx_ring: rx descriptor ring to transact packets on
1281  * @size: size of buffer to add to skb
1282  *
1283  * This function will pull an Rx buffer from the ring and synchronize it
1284  * for use by the CPU.
1285  */
1286 static struct iavf_rx_buffer *iavf_get_rx_buffer(struct iavf_ring *rx_ring,
1287                          const unsigned int size)
1288 {
1289     struct iavf_rx_buffer *rx_buffer;
1290 
1291     rx_buffer = &rx_ring->rx_bi[rx_ring->next_to_clean];
1292     prefetchw(rx_buffer->page);
1293     if (!size)
1294         return rx_buffer;
1295 
1296     /* we are reusing so sync this buffer for CPU use */
1297     dma_sync_single_range_for_cpu(rx_ring->dev,
1298                       rx_buffer->dma,
1299                       rx_buffer->page_offset,
1300                       size,
1301                       DMA_FROM_DEVICE);
1302 
1303     /* We have pulled a buffer for use, so decrement pagecnt_bias */
1304     rx_buffer->pagecnt_bias--;
1305 
1306     return rx_buffer;
1307 }
1308 
1309 /**
1310  * iavf_construct_skb - Allocate skb and populate it
1311  * @rx_ring: rx descriptor ring to transact packets on
1312  * @rx_buffer: rx buffer to pull data from
1313  * @size: size of buffer to add to skb
1314  *
1315  * This function allocates an skb.  It then populates it with the page
1316  * data from the current receive descriptor, taking care to set up the
1317  * skb correctly.
1318  */
1319 static struct sk_buff *iavf_construct_skb(struct iavf_ring *rx_ring,
1320                       struct iavf_rx_buffer *rx_buffer,
1321                       unsigned int size)
1322 {
1323     void *va;
1324 #if (PAGE_SIZE < 8192)
1325     unsigned int truesize = iavf_rx_pg_size(rx_ring) / 2;
1326 #else
1327     unsigned int truesize = SKB_DATA_ALIGN(size);
1328 #endif
1329     unsigned int headlen;
1330     struct sk_buff *skb;
1331 
1332     if (!rx_buffer)
1333         return NULL;
1334     /* prefetch first cache line of first page */
1335     va = page_address(rx_buffer->page) + rx_buffer->page_offset;
1336     net_prefetch(va);
1337 
1338     /* allocate a skb to store the frags */
1339     skb = __napi_alloc_skb(&rx_ring->q_vector->napi,
1340                    IAVF_RX_HDR_SIZE,
1341                    GFP_ATOMIC | __GFP_NOWARN);
1342     if (unlikely(!skb))
1343         return NULL;
1344 
1345     /* Determine available headroom for copy */
1346     headlen = size;
1347     if (headlen > IAVF_RX_HDR_SIZE)
1348         headlen = eth_get_headlen(skb->dev, va, IAVF_RX_HDR_SIZE);
1349 
1350     /* align pull length to size of long to optimize memcpy performance */
1351     memcpy(__skb_put(skb, headlen), va, ALIGN(headlen, sizeof(long)));
1352 
1353     /* update all of the pointers */
1354     size -= headlen;
1355     if (size) {
1356         skb_add_rx_frag(skb, 0, rx_buffer->page,
1357                 rx_buffer->page_offset + headlen,
1358                 size, truesize);
1359 
1360         /* buffer is used by skb, update page_offset */
1361 #if (PAGE_SIZE < 8192)
1362         rx_buffer->page_offset ^= truesize;
1363 #else
1364         rx_buffer->page_offset += truesize;
1365 #endif
1366     } else {
1367         /* buffer is unused, reset bias back to rx_buffer */
1368         rx_buffer->pagecnt_bias++;
1369     }
1370 
1371     return skb;
1372 }
1373 
1374 /**
1375  * iavf_build_skb - Build skb around an existing buffer
1376  * @rx_ring: Rx descriptor ring to transact packets on
1377  * @rx_buffer: Rx buffer to pull data from
1378  * @size: size of buffer to add to skb
1379  *
1380  * This function builds an skb around an existing Rx buffer, taking care
1381  * to set up the skb correctly and avoid any memcpy overhead.
1382  */
1383 static struct sk_buff *iavf_build_skb(struct iavf_ring *rx_ring,
1384                       struct iavf_rx_buffer *rx_buffer,
1385                       unsigned int size)
1386 {
1387     void *va;
1388 #if (PAGE_SIZE < 8192)
1389     unsigned int truesize = iavf_rx_pg_size(rx_ring) / 2;
1390 #else
1391     unsigned int truesize = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) +
1392                 SKB_DATA_ALIGN(IAVF_SKB_PAD + size);
1393 #endif
1394     struct sk_buff *skb;
1395 
1396     if (!rx_buffer || !size)
1397         return NULL;
1398     /* prefetch first cache line of first page */
1399     va = page_address(rx_buffer->page) + rx_buffer->page_offset;
1400     net_prefetch(va);
1401 
1402     /* build an skb around the page buffer */
1403     skb = napi_build_skb(va - IAVF_SKB_PAD, truesize);
1404     if (unlikely(!skb))
1405         return NULL;
1406 
1407     /* update pointers within the skb to store the data */
1408     skb_reserve(skb, IAVF_SKB_PAD);
1409     __skb_put(skb, size);
1410 
1411     /* buffer is used by skb, update page_offset */
1412 #if (PAGE_SIZE < 8192)
1413     rx_buffer->page_offset ^= truesize;
1414 #else
1415     rx_buffer->page_offset += truesize;
1416 #endif
1417 
1418     return skb;
1419 }
1420 
1421 /**
1422  * iavf_put_rx_buffer - Clean up used buffer and either recycle or free
1423  * @rx_ring: rx descriptor ring to transact packets on
1424  * @rx_buffer: rx buffer to pull data from
1425  *
1426  * This function will clean up the contents of the rx_buffer.  It will
1427  * either recycle the buffer or unmap it and free the associated resources.
1428  */
1429 static void iavf_put_rx_buffer(struct iavf_ring *rx_ring,
1430                    struct iavf_rx_buffer *rx_buffer)
1431 {
1432     if (!rx_buffer)
1433         return;
1434 
1435     if (iavf_can_reuse_rx_page(rx_buffer)) {
1436         /* hand second half of page back to the ring */
1437         iavf_reuse_rx_page(rx_ring, rx_buffer);
1438         rx_ring->rx_stats.page_reuse_count++;
1439     } else {
1440         /* we are not reusing the buffer so unmap it */
1441         dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma,
1442                      iavf_rx_pg_size(rx_ring),
1443                      DMA_FROM_DEVICE, IAVF_RX_DMA_ATTR);
1444         __page_frag_cache_drain(rx_buffer->page,
1445                     rx_buffer->pagecnt_bias);
1446     }
1447 
1448     /* clear contents of buffer_info */
1449     rx_buffer->page = NULL;
1450 }
1451 
1452 /**
1453  * iavf_is_non_eop - process handling of non-EOP buffers
1454  * @rx_ring: Rx ring being processed
1455  * @rx_desc: Rx descriptor for current buffer
1456  * @skb: Current socket buffer containing buffer in progress
1457  *
1458  * This function updates next to clean.  If the buffer is an EOP buffer
1459  * this function exits returning false, otherwise it will place the
1460  * sk_buff in the next buffer to be chained and return true indicating
1461  * that this is in fact a non-EOP buffer.
1462  **/
1463 static bool iavf_is_non_eop(struct iavf_ring *rx_ring,
1464                 union iavf_rx_desc *rx_desc,
1465                 struct sk_buff *skb)
1466 {
1467     u32 ntc = rx_ring->next_to_clean + 1;
1468 
1469     /* fetch, update, and store next to clean */
1470     ntc = (ntc < rx_ring->count) ? ntc : 0;
1471     rx_ring->next_to_clean = ntc;
1472 
1473     prefetch(IAVF_RX_DESC(rx_ring, ntc));
1474 
1475     /* if we are the last buffer then there is nothing else to do */
1476 #define IAVF_RXD_EOF BIT(IAVF_RX_DESC_STATUS_EOF_SHIFT)
1477     if (likely(iavf_test_staterr(rx_desc, IAVF_RXD_EOF)))
1478         return false;
1479 
1480     rx_ring->rx_stats.non_eop_descs++;
1481 
1482     return true;
1483 }
1484 
1485 /**
1486  * iavf_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf
1487  * @rx_ring: rx descriptor ring to transact packets on
1488  * @budget: Total limit on number of packets to process
1489  *
1490  * This function provides a "bounce buffer" approach to Rx interrupt
1491  * processing.  The advantage to this is that on systems that have
1492  * expensive overhead for IOMMU access this provides a means of avoiding
1493  * it by maintaining the mapping of the page to the system.
1494  *
1495  * Returns amount of work completed
1496  **/
1497 static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget)
1498 {
1499     unsigned int total_rx_bytes = 0, total_rx_packets = 0;
1500     struct sk_buff *skb = rx_ring->skb;
1501     u16 cleaned_count = IAVF_DESC_UNUSED(rx_ring);
1502     bool failure = false;
1503 
1504     while (likely(total_rx_packets < (unsigned int)budget)) {
1505         struct iavf_rx_buffer *rx_buffer;
1506         union iavf_rx_desc *rx_desc;
1507         unsigned int size;
1508         u16 vlan_tag = 0;
1509         u8 rx_ptype;
1510         u64 qword;
1511 
1512         /* return some buffers to hardware, one at a time is too slow */
1513         if (cleaned_count >= IAVF_RX_BUFFER_WRITE) {
1514             failure = failure ||
1515                   iavf_alloc_rx_buffers(rx_ring, cleaned_count);
1516             cleaned_count = 0;
1517         }
1518 
1519         rx_desc = IAVF_RX_DESC(rx_ring, rx_ring->next_to_clean);
1520 
1521         /* status_error_len will always be zero for unused descriptors
1522          * because it's cleared in cleanup, and overlaps with hdr_addr
1523          * which is always zero because packet split isn't used; if the
1524          * hardware wrote DD then the length will be non-zero
1525          */
1526         qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
1527 
1528         /* This memory barrier is needed to keep us from reading
1529          * any other fields out of the rx_desc until we have
1530          * verified the descriptor has been written back.
1531          */
1532         dma_rmb();
1533 #define IAVF_RXD_DD BIT(IAVF_RX_DESC_STATUS_DD_SHIFT)
1534         if (!iavf_test_staterr(rx_desc, IAVF_RXD_DD))
1535             break;
1536 
1537         size = (qword & IAVF_RXD_QW1_LENGTH_PBUF_MASK) >>
1538                IAVF_RXD_QW1_LENGTH_PBUF_SHIFT;
1539 
1540         iavf_trace(clean_rx_irq, rx_ring, rx_desc, skb);
1541         rx_buffer = iavf_get_rx_buffer(rx_ring, size);
1542 
1543         /* retrieve a buffer from the ring */
1544         if (skb)
1545             iavf_add_rx_frag(rx_ring, rx_buffer, skb, size);
1546         else if (ring_uses_build_skb(rx_ring))
1547             skb = iavf_build_skb(rx_ring, rx_buffer, size);
1548         else
1549             skb = iavf_construct_skb(rx_ring, rx_buffer, size);
1550 
1551         /* exit if we failed to retrieve a buffer */
1552         if (!skb) {
1553             rx_ring->rx_stats.alloc_buff_failed++;
1554             if (rx_buffer && size)
1555                 rx_buffer->pagecnt_bias++;
1556             break;
1557         }
1558 
1559         iavf_put_rx_buffer(rx_ring, rx_buffer);
1560         cleaned_count++;
1561 
1562         if (iavf_is_non_eop(rx_ring, rx_desc, skb))
1563             continue;
1564 
1565         /* ERR_MASK will only have valid bits if EOP set, and
1566          * what we are doing here is actually checking
1567          * IAVF_RX_DESC_ERROR_RXE_SHIFT, since it is the zeroth bit in
1568          * the error field
1569          */
1570         if (unlikely(iavf_test_staterr(rx_desc, BIT(IAVF_RXD_QW1_ERROR_SHIFT)))) {
1571             dev_kfree_skb_any(skb);
1572             skb = NULL;
1573             continue;
1574         }
1575 
1576         if (iavf_cleanup_headers(rx_ring, skb)) {
1577             skb = NULL;
1578             continue;
1579         }
1580 
1581         /* probably a little skewed due to removing CRC */
1582         total_rx_bytes += skb->len;
1583 
1584         qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
1585         rx_ptype = (qword & IAVF_RXD_QW1_PTYPE_MASK) >>
1586                IAVF_RXD_QW1_PTYPE_SHIFT;
1587 
1588         /* populate checksum, VLAN, and protocol */
1589         iavf_process_skb_fields(rx_ring, rx_desc, skb, rx_ptype);
1590 
1591         if (qword & BIT(IAVF_RX_DESC_STATUS_L2TAG1P_SHIFT) &&
1592             rx_ring->flags & IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1)
1593             vlan_tag = le16_to_cpu(rx_desc->wb.qword0.lo_dword.l2tag1);
1594         if (rx_desc->wb.qword2.ext_status &
1595             cpu_to_le16(BIT(IAVF_RX_DESC_EXT_STATUS_L2TAG2P_SHIFT)) &&
1596             rx_ring->flags & IAVF_RXR_FLAGS_VLAN_TAG_LOC_L2TAG2_2)
1597             vlan_tag = le16_to_cpu(rx_desc->wb.qword2.l2tag2_2);
1598 
1599         iavf_trace(clean_rx_irq_rx, rx_ring, rx_desc, skb);
1600         iavf_receive_skb(rx_ring, skb, vlan_tag);
1601         skb = NULL;
1602 
1603         /* update budget accounting */
1604         total_rx_packets++;
1605     }
1606 
1607     rx_ring->skb = skb;
1608 
1609     u64_stats_update_begin(&rx_ring->syncp);
1610     rx_ring->stats.packets += total_rx_packets;
1611     rx_ring->stats.bytes += total_rx_bytes;
1612     u64_stats_update_end(&rx_ring->syncp);
1613     rx_ring->q_vector->rx.total_packets += total_rx_packets;
1614     rx_ring->q_vector->rx.total_bytes += total_rx_bytes;
1615 
1616     /* guarantee a trip back through this routine if there was a failure */
1617     return failure ? budget : (int)total_rx_packets;
1618 }
1619 
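/* Editor's illustrative sketch (not part of the driver): cleaned_count above
 * starts from the number of descriptors available for refill.  For a ring
 * tracked by next_to_clean (ntc) and next_to_use (ntu), that count is
 * conventionally derived as below; the driver's actual IAVF_DESC_UNUSED()
 * macro is defined in its headers and may differ in detail.
 */
#include <stdint.h>

static uint16_t demo_ring_unused(uint16_t ntc, uint16_t ntu, uint16_t count)
{
    /* descriptors between next_to_use and next_to_clean, minus one so
     * the ring is never reported as completely full
     */
    return ((ntc > ntu) ? 0 : count) + ntc - ntu - 1;
}
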
1620 static inline u32 iavf_buildreg_itr(const int type, u16 itr)
1621 {
1622     u32 val;
1623 
1624     /* We don't bother with setting the CLEARPBA bit as the data sheet
1625      * points out doing so is "meaningless since it was already
1626      * auto-cleared". The auto-clearing happens when the interrupt is
1627      * asserted.
1628      *
1629      * Hardware errata 28 also indicates that writing to a
1630      * xxINT_DYN_CTLx CSR with INTENA_MSK (bit 31) set to 0 will clear
1631      * an event in the PBA anyway, so we need to rely on the automask
1632      * to hold pending events for us until the interrupt is re-enabled.
1633      *
1634      * The itr value is reported in microseconds, and the register
1635      * value is recorded in 2 microsecond units. For this reason we
1636      * only need to shift by the interval shift - 1 instead of the
1637      * full value.
1638      */
1639     itr &= IAVF_ITR_MASK;
1640 
1641     val = IAVF_VFINT_DYN_CTLN1_INTENA_MASK |
1642           (type << IAVF_VFINT_DYN_CTLN1_ITR_INDX_SHIFT) |
1643           (itr << (IAVF_VFINT_DYN_CTLN1_INTERVAL_SHIFT - 1));
1644 
1645     return val;
1646 }
1647 
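/* Editor's illustrative sketch (not part of the driver): the register value
 * packs an interrupt-enable bit, the ITR index and the interval in 2 usec
 * units, which is why the usec value above is shifted by one bit less than
 * the interval field position.  The DEMO_* constants below are placeholders,
 * not the real IAVF_VFINT_DYN_CTLN1_* definitions.
 */
#include <stdint.h>

#define DEMO_INTENA_MASK    0x1u    /* placeholder interrupt-enable bit */
#define DEMO_ITR_INDX_SHIFT 3       /* placeholder ITR index position   */
#define DEMO_INTERVAL_SHIFT 5       /* placeholder interval position    */

static uint32_t demo_buildreg_itr(int type, uint16_t itr_usecs)
{
    /* e.g. itr_usecs = 50 encodes an interval field of 25 (2 usec units) */
    return DEMO_INTENA_MASK |
           ((uint32_t)type << DEMO_ITR_INDX_SHIFT) |
           ((uint32_t)itr_usecs << (DEMO_INTERVAL_SHIFT - 1));
}
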
1648 /* a small macro to shorten up some long lines */
1649 #define INTREG IAVF_VFINT_DYN_CTLN1
1650 
1651 /* The act of updating the ITR will cause it to immediately trigger. In order
1652  * to prevent this from throwing off adaptive update statistics we defer the
1653  * update so that it can only happen so often. So after either Tx or Rx are
1654  * updated we make the adaptive scheme wait until either the ITR completely
1655  * expires via the next_update expiration or we have been through at least
1656  * 3 interrupts.
1657  */
1658 #define ITR_COUNTDOWN_START 3
1659 
1660 /**
1661  * iavf_update_enable_itr - Update itr and re-enable MSIX interrupt
1662  * @vsi: the VSI we care about
1663  * @q_vector: q_vector for which itr is being updated and interrupt enabled
1664  *
1665  **/
1666 static inline void iavf_update_enable_itr(struct iavf_vsi *vsi,
1667                       struct iavf_q_vector *q_vector)
1668 {
1669     struct iavf_hw *hw = &vsi->back->hw;
1670     u32 intval;
1671 
1672     /* These will do nothing if dynamic updates are not enabled */
1673     iavf_update_itr(q_vector, &q_vector->tx);
1674     iavf_update_itr(q_vector, &q_vector->rx);
1675 
1676     /* This block of logic allows us to get away with only updating
1677      * one ITR value with each interrupt. The idea is to perform a
1678      * pseudo-lazy update with the following criteria.
1679      *
1680      * 1. Rx is given higher priority than Tx if both are in the same state
1681      * 2. If we must reduce an ITR, that reduction is given the highest priority.
1682      * 3. We then give priority to increasing the ITR based on amount.
1683      */
1684     if (q_vector->rx.target_itr < q_vector->rx.current_itr) {
1685         /* Rx ITR needs to be reduced, this is highest priority */
1686         intval = iavf_buildreg_itr(IAVF_RX_ITR,
1687                        q_vector->rx.target_itr);
1688         q_vector->rx.current_itr = q_vector->rx.target_itr;
1689         q_vector->itr_countdown = ITR_COUNTDOWN_START;
1690     } else if ((q_vector->tx.target_itr < q_vector->tx.current_itr) ||
1691            ((q_vector->rx.target_itr - q_vector->rx.current_itr) <
1692             (q_vector->tx.target_itr - q_vector->tx.current_itr))) {
1693         /* Tx ITR needs to be reduced, this is second priority
1694          * Tx ITR needs to be increased more than Rx, fourth priority
1695          */
1696         intval = iavf_buildreg_itr(IAVF_TX_ITR,
1697                        q_vector->tx.target_itr);
1698         q_vector->tx.current_itr = q_vector->tx.target_itr;
1699         q_vector->itr_countdown = ITR_COUNTDOWN_START;
1700     } else if (q_vector->rx.current_itr != q_vector->rx.target_itr) {
1701         /* Rx ITR needs to be increased, third priority */
1702         intval = iavf_buildreg_itr(IAVF_RX_ITR,
1703                        q_vector->rx.target_itr);
1704         q_vector->rx.current_itr = q_vector->rx.target_itr;
1705         q_vector->itr_countdown = ITR_COUNTDOWN_START;
1706     } else {
1707         /* No ITR update, lowest priority */
1708         intval = iavf_buildreg_itr(IAVF_ITR_NONE, 0);
1709         if (q_vector->itr_countdown)
1710             q_vector->itr_countdown--;
1711     }
1712 
1713     if (!test_bit(__IAVF_VSI_DOWN, vsi->state))
1714         wr32(hw, INTREG(q_vector->reg_idx), intval);
1715 }
1716 
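/* Editor's illustrative sketch (not part of the driver): the priority order
 * applied above, reduced to a pure decision over the current and target ITR
 * values.  All names here are placeholders rather than driver identifiers.
 */
enum demo_itr_choice { DEMO_PICK_RX, DEMO_PICK_TX, DEMO_PICK_NONE };

static enum demo_itr_choice demo_pick_itr(int rx_cur, int rx_tgt,
                                          int tx_cur, int tx_tgt)
{
    if (rx_tgt < rx_cur)
        return DEMO_PICK_RX;    /* Rx reduction, highest priority */
    if (tx_tgt < tx_cur || (rx_tgt - rx_cur) < (tx_tgt - tx_cur))
        return DEMO_PICK_TX;    /* Tx reduction, or larger Tx increase */
    if (rx_cur != rx_tgt)
        return DEMO_PICK_RX;    /* Rx increase */
    return DEMO_PICK_NONE;      /* nothing to update */
}
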
1717 /**
1718  * iavf_napi_poll - NAPI polling Rx/Tx cleanup routine
1719  * @napi: napi struct with our devices info in it
1720  * @budget: amount of work driver is allowed to do this pass, in packets
1721  *
1722  * This function will clean all queues associated with a q_vector.
1723  *
1724  * Returns the amount of work done
1725  **/
1726 int iavf_napi_poll(struct napi_struct *napi, int budget)
1727 {
1728     struct iavf_q_vector *q_vector =
1729                    container_of(napi, struct iavf_q_vector, napi);
1730     struct iavf_vsi *vsi = q_vector->vsi;
1731     struct iavf_ring *ring;
1732     bool clean_complete = true;
1733     bool arm_wb = false;
1734     int budget_per_ring;
1735     int work_done = 0;
1736 
1737     if (test_bit(__IAVF_VSI_DOWN, vsi->state)) {
1738         napi_complete(napi);
1739         return 0;
1740     }
1741 
1742     /* Since the actual Tx work is minimal, we can give the Tx a larger
1743      * budget and be more aggressive about cleaning up the Tx descriptors.
1744      */
1745     iavf_for_each_ring(ring, q_vector->tx) {
1746         if (!iavf_clean_tx_irq(vsi, ring, budget)) {
1747             clean_complete = false;
1748             continue;
1749         }
1750         arm_wb |= ring->arm_wb;
1751         ring->arm_wb = false;
1752     }
1753 
1754     /* Handle case where we are called by netpoll with a budget of 0 */
1755     if (budget <= 0)
1756         goto tx_only;
1757 
1758     /* We attempt to distribute budget to each Rx queue fairly, but don't
1759      * allow the budget to go below 1 because that would exit polling early.
1760      */
1761     budget_per_ring = max(budget/q_vector->num_ringpairs, 1);
1762 
1763     iavf_for_each_ring(ring, q_vector->rx) {
1764         int cleaned = iavf_clean_rx_irq(ring, budget_per_ring);
1765 
1766         work_done += cleaned;
1767         /* if we clean as many as budgeted, we must not be done */
1768         if (cleaned >= budget_per_ring)
1769             clean_complete = false;
1770     }
1771 
1772     /* If work not completed, return budget and polling will return */
1773     if (!clean_complete) {
1774         int cpu_id = smp_processor_id();
1775 
1776         /* It is possible that the interrupt affinity has changed but,
1777          * if the cpu is pegged at 100%, polling will never exit while
1778          * traffic continues and the interrupt will be stuck on this
1779          * cpu.  We check to make sure affinity is correct before we
1780          * continue to poll, otherwise we must stop polling so the
1781          * interrupt can move to the correct cpu.
1782          */
1783         if (!cpumask_test_cpu(cpu_id, &q_vector->affinity_mask)) {
1784             /* Tell napi that we are done polling */
1785             napi_complete_done(napi, work_done);
1786 
1787             /* Force an interrupt */
1788             iavf_force_wb(vsi, q_vector);
1789 
1790             /* Return budget-1 so that polling stops */
1791             return budget - 1;
1792         }
1793 tx_only:
1794         if (arm_wb) {
1795             q_vector->tx.ring[0].tx_stats.tx_force_wb++;
1796             iavf_enable_wb_on_itr(vsi, q_vector);
1797         }
1798         return budget;
1799     }
1800 
1801     if (vsi->back->flags & IAVF_TXR_FLAGS_WB_ON_ITR)
1802         q_vector->arm_wb_state = false;
1803 
1804     /* Exit the polling mode, but don't re-enable interrupts if stack might
1805      * poll us due to busy-polling
1806      */
1807     if (likely(napi_complete_done(napi, work_done)))
1808         iavf_update_enable_itr(vsi, q_vector);
1809 
1810     return min_t(int, work_done, budget - 1);
1811 }
1812 
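/* Editor's illustrative sketch (not part of the driver): the Rx budget split
 * used above, shown stand-alone.  Each ring gets an equal share of the NAPI
 * budget, but never less than 1, since a zero share would end polling for
 * that ring prematurely.
 */
static int demo_budget_per_ring(int budget, int num_ringpairs)
{
    int share = budget / num_ringpairs;

    return share > 0 ? share : 1;   /* e.g. 64 / 3 -> 21, 2 / 4 -> 1 */
}
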
1813 /**
1814  * iavf_tx_prepare_vlan_flags - prepare generic TX VLAN tagging flags for HW
1815  * @skb:     send buffer
1816  * @tx_ring: ring to send buffer on
1817  * @flags:   the tx flags to be set
1818  *
1819  * Checks the skb and sets up the generic transmit flags related to VLAN
1820  * tagging for the HW, such as VLAN, DCB, etc.
1821  *
1822  * Does not return a value: on success the VLAN-related flags are written
1823  * to @flags; for an unsupported tag location @flags is left untouched.
1824  **/
1825 static void iavf_tx_prepare_vlan_flags(struct sk_buff *skb,
1826                        struct iavf_ring *tx_ring, u32 *flags)
1827 {
1828     u32  tx_flags = 0;
1829 
1830 
1831     /* stack will only request hardware VLAN insertion offload for protocols
1832      * that the driver supports and has enabled
1833      */
1834     if (!skb_vlan_tag_present(skb))
1835         return;
1836 
1837     tx_flags |= skb_vlan_tag_get(skb) << IAVF_TX_FLAGS_VLAN_SHIFT;
1838     if (tx_ring->flags & IAVF_TXR_FLAGS_VLAN_TAG_LOC_L2TAG2) {
1839         tx_flags |= IAVF_TX_FLAGS_HW_OUTER_SINGLE_VLAN;
1840     } else if (tx_ring->flags & IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1) {
1841         tx_flags |= IAVF_TX_FLAGS_HW_VLAN;
1842     } else {
1843         dev_dbg(tx_ring->dev, "Unsupported Tx VLAN tag location requested\n");
1844         return;
1845     }
1846 
1847     *flags = tx_flags;
1848 }
1849 
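/* Editor's illustrative sketch (not part of the driver): the VLAN TCI is
 * carried in the upper bits of tx_flags so it can later be dropped into the
 * descriptor's L2TAG field.  DEMO_VLAN_SHIFT is a placeholder for the
 * driver's IAVF_TX_FLAGS_VLAN_SHIFT.
 */
#include <stdint.h>

#define DEMO_VLAN_SHIFT 16  /* placeholder shift */

static uint32_t demo_pack_vlan(uint32_t tx_flags, uint16_t vlan_tci)
{
    /* keep the low flag bits, store the tag above them */
    return tx_flags | ((uint32_t)vlan_tci << DEMO_VLAN_SHIFT);
}
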
1850 /**
1851  * iavf_tso - set up the tso context descriptor
1852  * @first:    pointer to first Tx buffer for xmit
1853  * @hdr_len:  ptr to the size of the packet header
1854  * @cd_type_cmd_tso_mss: Quad Word 1
1855  *
1856  * Returns 0 if no TSO can happen, 1 if tso is going, or error
1857  **/
1858 static int iavf_tso(struct iavf_tx_buffer *first, u8 *hdr_len,
1859             u64 *cd_type_cmd_tso_mss)
1860 {
1861     struct sk_buff *skb = first->skb;
1862     u64 cd_cmd, cd_tso_len, cd_mss;
1863     union {
1864         struct iphdr *v4;
1865         struct ipv6hdr *v6;
1866         unsigned char *hdr;
1867     } ip;
1868     union {
1869         struct tcphdr *tcp;
1870         struct udphdr *udp;
1871         unsigned char *hdr;
1872     } l4;
1873     u32 paylen, l4_offset;
1874     u16 gso_segs, gso_size;
1875     int err;
1876 
1877     if (skb->ip_summed != CHECKSUM_PARTIAL)
1878         return 0;
1879 
1880     if (!skb_is_gso(skb))
1881         return 0;
1882 
1883     err = skb_cow_head(skb, 0);
1884     if (err < 0)
1885         return err;
1886 
1887     ip.hdr = skb_network_header(skb);
1888     l4.hdr = skb_transport_header(skb);
1889 
1890     /* initialize outer IP header fields */
1891     if (ip.v4->version == 4) {
1892         ip.v4->tot_len = 0;
1893         ip.v4->check = 0;
1894     } else {
1895         ip.v6->payload_len = 0;
1896     }
1897 
1898     if (skb_shinfo(skb)->gso_type & (SKB_GSO_GRE |
1899                      SKB_GSO_GRE_CSUM |
1900                      SKB_GSO_IPXIP4 |
1901                      SKB_GSO_IPXIP6 |
1902                      SKB_GSO_UDP_TUNNEL |
1903                      SKB_GSO_UDP_TUNNEL_CSUM)) {
1904         if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) &&
1905             (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM)) {
1906             l4.udp->len = 0;
1907 
1908             /* determine offset of outer transport header */
1909             l4_offset = l4.hdr - skb->data;
1910 
1911             /* remove payload length from outer checksum */
1912             paylen = skb->len - l4_offset;
1913             csum_replace_by_diff(&l4.udp->check,
1914                          (__force __wsum)htonl(paylen));
1915         }
1916 
1917         /* reset pointers to inner headers */
1918         ip.hdr = skb_inner_network_header(skb);
1919         l4.hdr = skb_inner_transport_header(skb);
1920 
1921         /* initialize inner IP header fields */
1922         if (ip.v4->version == 4) {
1923             ip.v4->tot_len = 0;
1924             ip.v4->check = 0;
1925         } else {
1926             ip.v6->payload_len = 0;
1927         }
1928     }
1929 
1930     /* determine offset of inner transport header */
1931     l4_offset = l4.hdr - skb->data;
1932     /* remove payload length from inner checksum */
1933     paylen = skb->len - l4_offset;
1934 
1935     if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) {
1936         csum_replace_by_diff(&l4.udp->check,
1937                      (__force __wsum)htonl(paylen));
1938         /* compute length of UDP segmentation header */
1939         *hdr_len = (u8)(sizeof(*l4.udp) + l4_offset);
1940     } else {
1941         csum_replace_by_diff(&l4.tcp->check,
1942                      (__force __wsum)htonl(paylen));
1943         /* compute length of TCP segmentation header */
1944         *hdr_len = (u8)((l4.tcp->doff * 4) + l4_offset);
1945     }
1946 
1947     /* pull values out of skb_shinfo */
1948     gso_size = skb_shinfo(skb)->gso_size;
1949     gso_segs = skb_shinfo(skb)->gso_segs;
1950 
1951     /* update GSO size and bytecount with header size */
1952     first->gso_segs = gso_segs;
1953     first->bytecount += (first->gso_segs - 1) * *hdr_len;
1954 
1955     /* find the field values */
1956     cd_cmd = IAVF_TX_CTX_DESC_TSO;
1957     cd_tso_len = skb->len - *hdr_len;
1958     cd_mss = gso_size;
1959     *cd_type_cmd_tso_mss |= (cd_cmd << IAVF_TXD_CTX_QW1_CMD_SHIFT) |
1960                 (cd_tso_len << IAVF_TXD_CTX_QW1_TSO_LEN_SHIFT) |
1961                 (cd_mss << IAVF_TXD_CTX_QW1_MSS_SHIFT);
1962     return 1;
1963 }
1964 
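/* Editor's illustrative sketch (not part of the driver): with TSO the wire
 * carries one header per segment, so the bytecount update above adds one
 * extra header for every segment after the first.  For example, a 9000 byte
 * skb split into 7 segments with 66 byte headers is accounted as
 * 9000 + 6 * 66 = 9396 bytes.
 */
#include <stdint.h>

static uint32_t demo_tso_bytecount(uint32_t skb_len, uint16_t gso_segs,
                                   uint8_t hdr_len)
{
    return skb_len + (uint32_t)(gso_segs - 1) * hdr_len;
}
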
1965 /**
1966  * iavf_tx_enable_csum - Enable Tx checksum offloads
1967  * @skb: send buffer
1968  * @tx_flags: pointer to Tx flags currently set
1969  * @td_cmd: Tx descriptor command bits to set
1970  * @td_offset: Tx descriptor header offsets to set
1971  * @tx_ring: Tx descriptor ring
1972  * @cd_tunneling: ptr to context desc bits
1973  **/
1974 static int iavf_tx_enable_csum(struct sk_buff *skb, u32 *tx_flags,
1975                    u32 *td_cmd, u32 *td_offset,
1976                    struct iavf_ring *tx_ring,
1977                    u32 *cd_tunneling)
1978 {
1979     union {
1980         struct iphdr *v4;
1981         struct ipv6hdr *v6;
1982         unsigned char *hdr;
1983     } ip;
1984     union {
1985         struct tcphdr *tcp;
1986         struct udphdr *udp;
1987         unsigned char *hdr;
1988     } l4;
1989     unsigned char *exthdr;
1990     u32 offset, cmd = 0;
1991     __be16 frag_off;
1992     u8 l4_proto = 0;
1993 
1994     if (skb->ip_summed != CHECKSUM_PARTIAL)
1995         return 0;
1996 
1997     ip.hdr = skb_network_header(skb);
1998     l4.hdr = skb_transport_header(skb);
1999 
2000     /* compute outer L2 header size */
2001     offset = ((ip.hdr - skb->data) / 2) << IAVF_TX_DESC_LENGTH_MACLEN_SHIFT;
2002 
2003     if (skb->encapsulation) {
2004         u32 tunnel = 0;
2005         /* define outer network header type */
2006         if (*tx_flags & IAVF_TX_FLAGS_IPV4) {
2007             tunnel |= (*tx_flags & IAVF_TX_FLAGS_TSO) ?
2008                   IAVF_TX_CTX_EXT_IP_IPV4 :
2009                   IAVF_TX_CTX_EXT_IP_IPV4_NO_CSUM;
2010 
2011             l4_proto = ip.v4->protocol;
2012         } else if (*tx_flags & IAVF_TX_FLAGS_IPV6) {
2013             tunnel |= IAVF_TX_CTX_EXT_IP_IPV6;
2014 
2015             exthdr = ip.hdr + sizeof(*ip.v6);
2016             l4_proto = ip.v6->nexthdr;
2017             if (l4.hdr != exthdr)
2018                 ipv6_skip_exthdr(skb, exthdr - skb->data,
2019                          &l4_proto, &frag_off);
2020         }
2021 
2022         /* define outer transport */
2023         switch (l4_proto) {
2024         case IPPROTO_UDP:
2025             tunnel |= IAVF_TXD_CTX_UDP_TUNNELING;
2026             *tx_flags |= IAVF_TX_FLAGS_VXLAN_TUNNEL;
2027             break;
2028         case IPPROTO_GRE:
2029             tunnel |= IAVF_TXD_CTX_GRE_TUNNELING;
2030             *tx_flags |= IAVF_TX_FLAGS_VXLAN_TUNNEL;
2031             break;
2032         case IPPROTO_IPIP:
2033         case IPPROTO_IPV6:
2034             *tx_flags |= IAVF_TX_FLAGS_VXLAN_TUNNEL;
2035             l4.hdr = skb_inner_network_header(skb);
2036             break;
2037         default:
2038             if (*tx_flags & IAVF_TX_FLAGS_TSO)
2039                 return -1;
2040 
2041             skb_checksum_help(skb);
2042             return 0;
2043         }
2044 
2045         /* compute outer L3 header size */
2046         tunnel |= ((l4.hdr - ip.hdr) / 4) <<
2047               IAVF_TXD_CTX_QW0_EXT_IPLEN_SHIFT;
2048 
2049         /* switch IP header pointer from outer to inner header */
2050         ip.hdr = skb_inner_network_header(skb);
2051 
2052         /* compute tunnel header size */
2053         tunnel |= ((ip.hdr - l4.hdr) / 2) <<
2054               IAVF_TXD_CTX_QW0_NATLEN_SHIFT;
2055 
2056         /* indicate if we need to offload outer UDP header */
2057         if ((*tx_flags & IAVF_TX_FLAGS_TSO) &&
2058             !(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) &&
2059             (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM))
2060             tunnel |= IAVF_TXD_CTX_QW0_L4T_CS_MASK;
2061 
2062         /* record tunnel offload values */
2063         *cd_tunneling |= tunnel;
2064 
2065         /* switch L4 header pointer from outer to inner */
2066         l4.hdr = skb_inner_transport_header(skb);
2067         l4_proto = 0;
2068 
2069         /* reset type as we transition from outer to inner headers */
2070         *tx_flags &= ~(IAVF_TX_FLAGS_IPV4 | IAVF_TX_FLAGS_IPV6);
2071         if (ip.v4->version == 4)
2072             *tx_flags |= IAVF_TX_FLAGS_IPV4;
2073         if (ip.v6->version == 6)
2074             *tx_flags |= IAVF_TX_FLAGS_IPV6;
2075     }
2076 
2077     /* Enable IP checksum offloads */
2078     if (*tx_flags & IAVF_TX_FLAGS_IPV4) {
2079         l4_proto = ip.v4->protocol;
2080         /* the stack computes the IP header already, the only time we
2081          * need the hardware to recompute it is in the case of TSO.
2082          */
2083         cmd |= (*tx_flags & IAVF_TX_FLAGS_TSO) ?
2084                IAVF_TX_DESC_CMD_IIPT_IPV4_CSUM :
2085                IAVF_TX_DESC_CMD_IIPT_IPV4;
2086     } else if (*tx_flags & IAVF_TX_FLAGS_IPV6) {
2087         cmd |= IAVF_TX_DESC_CMD_IIPT_IPV6;
2088 
2089         exthdr = ip.hdr + sizeof(*ip.v6);
2090         l4_proto = ip.v6->nexthdr;
2091         if (l4.hdr != exthdr)
2092             ipv6_skip_exthdr(skb, exthdr - skb->data,
2093                      &l4_proto, &frag_off);
2094     }
2095 
2096     /* compute inner L3 header size */
2097     offset |= ((l4.hdr - ip.hdr) / 4) << IAVF_TX_DESC_LENGTH_IPLEN_SHIFT;
2098 
2099     /* Enable L4 checksum offloads */
2100     switch (l4_proto) {
2101     case IPPROTO_TCP:
2102         /* enable checksum offloads */
2103         cmd |= IAVF_TX_DESC_CMD_L4T_EOFT_TCP;
2104         offset |= l4.tcp->doff << IAVF_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
2105         break;
2106     case IPPROTO_SCTP:
2107         /* enable SCTP checksum offload */
2108         cmd |= IAVF_TX_DESC_CMD_L4T_EOFT_SCTP;
2109         offset |= (sizeof(struct sctphdr) >> 2) <<
2110               IAVF_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
2111         break;
2112     case IPPROTO_UDP:
2113         /* enable UDP checksum offload */
2114         cmd |= IAVF_TX_DESC_CMD_L4T_EOFT_UDP;
2115         offset |= (sizeof(struct udphdr) >> 2) <<
2116               IAVF_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
2117         break;
2118     default:
2119         if (*tx_flags & IAVF_TX_FLAGS_TSO)
2120             return -1;
2121         skb_checksum_help(skb);
2122         return 0;
2123     }
2124 
2125     *td_cmd |= cmd;
2126     *td_offset |= offset;
2127 
2128     return 1;
2129 }
2130 
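/* Editor's illustrative sketch (not part of the driver): the header length
 * fields built above are encoded in hardware units, the MAC length in 2 byte
 * words and the IP length in 4 byte words, so a 14 byte Ethernet header and
 * a 20 byte IPv4 header are programmed as 7 and 5.  The DEMO_* shifts are
 * placeholders for the IAVF_TX_DESC_LENGTH_* definitions.
 */
#include <stdint.h>

#define DEMO_MACLEN_SHIFT   0   /* placeholder */
#define DEMO_IPLEN_SHIFT    7   /* placeholder */

static uint32_t demo_len_offsets(uint32_t mac_hdr_len, uint32_t ip_hdr_len)
{
    return ((mac_hdr_len / 2) << DEMO_MACLEN_SHIFT) |
           ((ip_hdr_len / 4) << DEMO_IPLEN_SHIFT);
}
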
2131 /**
2132  * iavf_create_tx_ctx - Build the Tx context descriptor
2133  * @tx_ring:  ring to create the descriptor on
2134  * @cd_type_cmd_tso_mss: Quad Word 1
2135  * @cd_tunneling: Quad Word 0 - bits 0-31
2136  * @cd_l2tag2: Quad Word 0 - bits 32-63
2137  **/
2138 static void iavf_create_tx_ctx(struct iavf_ring *tx_ring,
2139                    const u64 cd_type_cmd_tso_mss,
2140                    const u32 cd_tunneling, const u32 cd_l2tag2)
2141 {
2142     struct iavf_tx_context_desc *context_desc;
2143     int i = tx_ring->next_to_use;
2144 
2145     if ((cd_type_cmd_tso_mss == IAVF_TX_DESC_DTYPE_CONTEXT) &&
2146         !cd_tunneling && !cd_l2tag2)
2147         return;
2148 
2149     /* grab the next descriptor */
2150     context_desc = IAVF_TX_CTXTDESC(tx_ring, i);
2151 
2152     i++;
2153     tx_ring->next_to_use = (i < tx_ring->count) ? i : 0;
2154 
2155     /* cpu_to_le32 and assign to struct fields */
2156     context_desc->tunneling_params = cpu_to_le32(cd_tunneling);
2157     context_desc->l2tag2 = cpu_to_le16(cd_l2tag2);
2158     context_desc->rsvd = cpu_to_le16(0);
2159     context_desc->type_cmd_tso_mss = cpu_to_le64(cd_type_cmd_tso_mss);
2160 }
2161 
2162 /**
2163  * __iavf_chk_linearize - Check if there are more than 8 buffers per packet
2164  * @skb:      send buffer
2165  *
2166  * Note: Our HW can't DMA more than 8 buffers to build a packet on the wire
2167  * and so we need to figure out the cases where we need to linearize the skb.
2168  *
2169  * For TSO we need to count the TSO header and segment payload separately.
2170  * As such we need to check cases where we have 7 fragments or more as we
2171  * can potentially require 9 DMA transactions, 1 for the TSO header, 1 for
2172  * the segment payload in the first descriptor, and another 7 for the
2173  * fragments.
2174  **/
2175 bool __iavf_chk_linearize(struct sk_buff *skb)
2176 {
2177     const skb_frag_t *frag, *stale;
2178     int nr_frags, sum;
2179 
2180     /* no need to check if number of frags is less than 7 */
2181     nr_frags = skb_shinfo(skb)->nr_frags;
2182     if (nr_frags < (IAVF_MAX_BUFFER_TXD - 1))
2183         return false;
2184 
2185     /* We need to walk through the list and validate that each group
2186      * of 6 fragments totals at least gso_size.
2187      */
2188     nr_frags -= IAVF_MAX_BUFFER_TXD - 2;
2189     frag = &skb_shinfo(skb)->frags[0];
2190 
2191     /* Initialize sum to one minus gso_size (i.e. -(gso_size - 1)).  We
2192      * use this as the worst case scenario in which the frag ahead
2193      * of us only provides one byte which is why we are limited to 6
2194      * descriptors for a single transmit as the header and previous
2195      * fragment are already consuming 2 descriptors.
2196      */
2197     sum = 1 - skb_shinfo(skb)->gso_size;
2198 
2199     /* Add size of frags 0 through 4 to create our initial sum */
2200     sum += skb_frag_size(frag++);
2201     sum += skb_frag_size(frag++);
2202     sum += skb_frag_size(frag++);
2203     sum += skb_frag_size(frag++);
2204     sum += skb_frag_size(frag++);
2205 
2206     /* Walk through fragments adding latest fragment, testing it, and
2207      * then removing stale fragments from the sum.
2208      */
2209     for (stale = &skb_shinfo(skb)->frags[0];; stale++) {
2210         int stale_size = skb_frag_size(stale);
2211 
2212         sum += skb_frag_size(frag++);
2213 
2214         /* The stale fragment may present us with a smaller
2215          * descriptor than the actual fragment size. To account
2216          * for that we need to remove all the data on the front and
2217          * figure out what the remainder would be in the last
2218          * descriptor associated with the fragment.
2219          */
2220         if (stale_size > IAVF_MAX_DATA_PER_TXD) {
2221             int align_pad = -(skb_frag_off(stale)) &
2222                     (IAVF_MAX_READ_REQ_SIZE - 1);
2223 
2224             sum -= align_pad;
2225             stale_size -= align_pad;
2226 
2227             do {
2228                 sum -= IAVF_MAX_DATA_PER_TXD_ALIGNED;
2229                 stale_size -= IAVF_MAX_DATA_PER_TXD_ALIGNED;
2230             } while (stale_size > IAVF_MAX_DATA_PER_TXD);
2231         }
2232 
2233         /* if sum is negative we failed to make sufficient progress */
2234         if (sum < 0)
2235             return true;
2236 
2237         if (!nr_frags--)
2238             break;
2239 
2240         sum -= stale_size;
2241     }
2242 
2243     return false;
2244 }
2245 
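/* Editor's illustrative sketch (not part of the driver): the same sliding
 * window test written over a plain array of fragment sizes.  It ignores the
 * splitting of fragments larger than IAVF_MAX_DATA_PER_TXD and recomputes
 * each window instead of keeping a rolling sum, but captures the same idea:
 * linearize when some run of (max_bufs - 2) consecutive fragments does not
 * cover a full gso_size.  max_bufs stands in for IAVF_MAX_BUFFER_TXD.
 */
#include <stdbool.h>

static bool demo_needs_linearize(const int *frag_size, int nr_frags,
                                 int gso_size, int max_bufs)
{
    int window = max_bufs - 2;  /* header and carried-over fragment
                                 * already consume two descriptors
                                 */
    int i, j, sum;

    /* with fewer than (max_bufs - 1) fragments the limit can't be hit */
    if (nr_frags < max_bufs - 1)
        return false;

    for (i = 0; i + window <= nr_frags; i++) {
        sum = 0;
        for (j = 0; j < window; j++)
            sum += frag_size[i + j];
        if (sum < gso_size)
            return true;
    }
    return false;
}
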
2246 /**
2247  * __iavf_maybe_stop_tx - 2nd level check for tx stop conditions
2248  * @tx_ring: the ring to be checked
2249  * @size:    the number of descriptors we want to assure is available
2250  *
2251  * Returns -EBUSY if a stop is needed, else 0
2252  **/
2253 int __iavf_maybe_stop_tx(struct iavf_ring *tx_ring, int size)
2254 {
2255     netif_stop_subqueue(tx_ring->netdev, tx_ring->queue_index);
2256     /* Memory barrier before checking head and tail */
2257     smp_mb();
2258 
2259     /* Check again in case another CPU has just made room available. */
2260     if (likely(IAVF_DESC_UNUSED(tx_ring) < size))
2261         return -EBUSY;
2262 
2263     /* A reprieve! - use start_queue because it doesn't call schedule */
2264     netif_start_subqueue(tx_ring->netdev, tx_ring->queue_index);
2265     ++tx_ring->tx_stats.restart_queue;
2266     return 0;
2267 }
2268 
2269 /**
2270  * iavf_tx_map - Build the Tx descriptor
2271  * @tx_ring:  ring to send buffer on
2272  * @skb:      send buffer
2273  * @first:    first buffer info buffer to use
2274  * @tx_flags: collected send information
2275  * @hdr_len:  size of the packet header
2276  * @td_cmd:   the command field in the descriptor
2277  * @td_offset: offset for checksum or crc
2278  **/
2279 static inline void iavf_tx_map(struct iavf_ring *tx_ring, struct sk_buff *skb,
2280                    struct iavf_tx_buffer *first, u32 tx_flags,
2281                    const u8 hdr_len, u32 td_cmd, u32 td_offset)
2282 {
2283     unsigned int data_len = skb->data_len;
2284     unsigned int size = skb_headlen(skb);
2285     skb_frag_t *frag;
2286     struct iavf_tx_buffer *tx_bi;
2287     struct iavf_tx_desc *tx_desc;
2288     u16 i = tx_ring->next_to_use;
2289     u32 td_tag = 0;
2290     dma_addr_t dma;
2291 
2292     if (tx_flags & IAVF_TX_FLAGS_HW_VLAN) {
2293         td_cmd |= IAVF_TX_DESC_CMD_IL2TAG1;
2294         td_tag = (tx_flags & IAVF_TX_FLAGS_VLAN_MASK) >>
2295              IAVF_TX_FLAGS_VLAN_SHIFT;
2296     }
2297 
2298     first->tx_flags = tx_flags;
2299 
2300     dma = dma_map_single(tx_ring->dev, skb->data, size, DMA_TO_DEVICE);
2301 
2302     tx_desc = IAVF_TX_DESC(tx_ring, i);
2303     tx_bi = first;
2304 
2305     for (frag = &skb_shinfo(skb)->frags[0];; frag++) {
2306         unsigned int max_data = IAVF_MAX_DATA_PER_TXD_ALIGNED;
2307 
2308         if (dma_mapping_error(tx_ring->dev, dma))
2309             goto dma_error;
2310 
2311         /* record length, and DMA address */
2312         dma_unmap_len_set(tx_bi, len, size);
2313         dma_unmap_addr_set(tx_bi, dma, dma);
2314 
2315         /* align size to end of page */
2316         max_data += -dma & (IAVF_MAX_READ_REQ_SIZE - 1);
2317         tx_desc->buffer_addr = cpu_to_le64(dma);
2318 
2319         while (unlikely(size > IAVF_MAX_DATA_PER_TXD)) {
2320             tx_desc->cmd_type_offset_bsz =
2321                 build_ctob(td_cmd, td_offset,
2322                        max_data, td_tag);
2323 
2324             tx_desc++;
2325             i++;
2326 
2327             if (i == tx_ring->count) {
2328                 tx_desc = IAVF_TX_DESC(tx_ring, 0);
2329                 i = 0;
2330             }
2331 
2332             dma += max_data;
2333             size -= max_data;
2334 
2335             max_data = IAVF_MAX_DATA_PER_TXD_ALIGNED;
2336             tx_desc->buffer_addr = cpu_to_le64(dma);
2337         }
2338 
2339         if (likely(!data_len))
2340             break;
2341 
2342         tx_desc->cmd_type_offset_bsz = build_ctob(td_cmd, td_offset,
2343                               size, td_tag);
2344 
2345         tx_desc++;
2346         i++;
2347 
2348         if (i == tx_ring->count) {
2349             tx_desc = IAVF_TX_DESC(tx_ring, 0);
2350             i = 0;
2351         }
2352 
2353         size = skb_frag_size(frag);
2354         data_len -= size;
2355 
2356         dma = skb_frag_dma_map(tx_ring->dev, frag, 0, size,
2357                        DMA_TO_DEVICE);
2358 
2359         tx_bi = &tx_ring->tx_bi[i];
2360     }
2361 
2362     netdev_tx_sent_queue(txring_txq(tx_ring), first->bytecount);
2363 
2364     i++;
2365     if (i == tx_ring->count)
2366         i = 0;
2367 
2368     tx_ring->next_to_use = i;
2369 
2370     iavf_maybe_stop_tx(tx_ring, DESC_NEEDED);
2371 
2372     /* write last descriptor with RS and EOP bits */
2373     td_cmd |= IAVF_TXD_CMD;
2374     tx_desc->cmd_type_offset_bsz =
2375             build_ctob(td_cmd, td_offset, size, td_tag);
2376 
2377     skb_tx_timestamp(skb);
2378 
2379     /* Force memory writes to complete before letting h/w know there
2380      * are new descriptors to fetch.
2381      *
2382      * We also use this memory barrier to make certain all of the
2383      * status bits have been updated before next_to_watch is written.
2384      */
2385     wmb();
2386 
2387     /* set next_to_watch value indicating a packet is present */
2388     first->next_to_watch = tx_desc;
2389 
2390     /* notify HW of packet */
2391     if (netif_xmit_stopped(txring_txq(tx_ring)) || !netdev_xmit_more()) {
2392         writel(i, tx_ring->tail);
2393     }
2394 
2395     return;
2396 
2397 dma_error:
2398     dev_info(tx_ring->dev, "TX DMA map failed\n");
2399 
2400     /* clear dma mappings for failed tx_bi map */
2401     for (;;) {
2402         tx_bi = &tx_ring->tx_bi[i];
2403         iavf_unmap_and_free_tx_resource(tx_ring, tx_bi);
2404         if (tx_bi == first)
2405             break;
2406         if (i == 0)
2407             i = tx_ring->count;
2408         i--;
2409     }
2410 
2411     tx_ring->next_to_use = i;
2412 }
2413 
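/* Editor's illustrative sketch (not part of the driver): the inner while
 * loop above carves one mapped buffer into descriptors no larger than the
 * hardware limit.  Counting those descriptors, ignoring the read-request
 * alignment of max_data, reduces to the following; max_data stands in for
 * IAVF_MAX_DATA_PER_TXD.
 */
static unsigned int demo_descs_for_buffer(unsigned int size,
                                          unsigned int max_data)
{
    unsigned int descs = 0;

    while (size > max_data) {
        size -= max_data;   /* one full-size descriptor */
        descs++;
    }
    return descs + 1;       /* remainder goes in the last one */
}
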
2414 /**
2415  * iavf_xmit_frame_ring - Sends buffer on Tx ring
2416  * @skb:     send buffer
2417  * @tx_ring: ring to send buffer on
2418  *
2419  * Returns NETDEV_TX_OK if sent, else an error code
2420  **/
2421 static netdev_tx_t iavf_xmit_frame_ring(struct sk_buff *skb,
2422                     struct iavf_ring *tx_ring)
2423 {
2424     u64 cd_type_cmd_tso_mss = IAVF_TX_DESC_DTYPE_CONTEXT;
2425     u32 cd_tunneling = 0, cd_l2tag2 = 0;
2426     struct iavf_tx_buffer *first;
2427     u32 td_offset = 0;
2428     u32 tx_flags = 0;
2429     __be16 protocol;
2430     u32 td_cmd = 0;
2431     u8 hdr_len = 0;
2432     int tso, count;
2433 
2434     /* prefetch the data, we'll need it later */
2435     prefetch(skb->data);
2436 
2437     iavf_trace(xmit_frame_ring, skb, tx_ring);
2438 
2439     count = iavf_xmit_descriptor_count(skb);
2440     if (iavf_chk_linearize(skb, count)) {
2441         if (__skb_linearize(skb)) {
2442             dev_kfree_skb_any(skb);
2443             return NETDEV_TX_OK;
2444         }
2445         count = iavf_txd_use_count(skb->len);
2446         tx_ring->tx_stats.tx_linearize++;
2447     }
2448 
2449     /* need: 1 descriptor per page * PAGE_SIZE/IAVF_MAX_DATA_PER_TXD,
2450      *       + 1 desc for skb_head_len/IAVF_MAX_DATA_PER_TXD,
2451      *       + 4 desc gap to avoid the cache line where head is,
2452      *       + 1 desc for context descriptor,
2453      * otherwise try next time
2454      */
2455     if (iavf_maybe_stop_tx(tx_ring, count + 4 + 1)) {
2456         tx_ring->tx_stats.tx_busy++;
2457         return NETDEV_TX_BUSY;
2458     }
2459 
2460     /* record the location of the first descriptor for this packet */
2461     first = &tx_ring->tx_bi[tx_ring->next_to_use];
2462     first->skb = skb;
2463     first->bytecount = skb->len;
2464     first->gso_segs = 1;
2465 
2466     /* prepare the xmit flags */
2467     iavf_tx_prepare_vlan_flags(skb, tx_ring, &tx_flags);
2468     if (tx_flags & IAVF_TX_FLAGS_HW_OUTER_SINGLE_VLAN) {
2469         cd_type_cmd_tso_mss |= IAVF_TX_CTX_DESC_IL2TAG2 <<
2470             IAVF_TXD_CTX_QW1_CMD_SHIFT;
2471         cd_l2tag2 = (tx_flags & IAVF_TX_FLAGS_VLAN_MASK) >>
2472             IAVF_TX_FLAGS_VLAN_SHIFT;
2473     }
2474 
2475     /* obtain protocol of skb */
2476     protocol = vlan_get_protocol(skb);
2477 
2478     /* setup IPv4/IPv6 offloads */
2479     if (protocol == htons(ETH_P_IP))
2480         tx_flags |= IAVF_TX_FLAGS_IPV4;
2481     else if (protocol == htons(ETH_P_IPV6))
2482         tx_flags |= IAVF_TX_FLAGS_IPV6;
2483 
2484     tso = iavf_tso(first, &hdr_len, &cd_type_cmd_tso_mss);
2485 
2486     if (tso < 0)
2487         goto out_drop;
2488     else if (tso)
2489         tx_flags |= IAVF_TX_FLAGS_TSO;
2490 
2491     /* Always offload the checksum, since it's in the data descriptor */
2492     tso = iavf_tx_enable_csum(skb, &tx_flags, &td_cmd, &td_offset,
2493                   tx_ring, &cd_tunneling);
2494     if (tso < 0)
2495         goto out_drop;
2496 
2497     /* always enable CRC insertion offload */
2498     td_cmd |= IAVF_TX_DESC_CMD_ICRC;
2499 
2500     iavf_create_tx_ctx(tx_ring, cd_type_cmd_tso_mss,
2501                cd_tunneling, cd_l2tag2);
2502 
2503     iavf_tx_map(tx_ring, skb, first, tx_flags, hdr_len,
2504             td_cmd, td_offset);
2505 
2506     return NETDEV_TX_OK;
2507 
2508 out_drop:
2509     iavf_trace(xmit_frame_ring_drop, first->skb, tx_ring);
2510     dev_kfree_skb_any(first->skb);
2511     first->skb = NULL;
2512     return NETDEV_TX_OK;
2513 }
2514 
2515 /**
2516  * iavf_xmit_frame - Selects the correct VSI and Tx queue to send buffer
2517  * @skb:    send buffer
2518  * @netdev: network interface device structure
2519  *
2520  * Returns NETDEV_TX_OK if sent, else an error code
2521  **/
2522 netdev_tx_t iavf_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
2523 {
2524     struct iavf_adapter *adapter = netdev_priv(netdev);
2525     struct iavf_ring *tx_ring = &adapter->tx_rings[skb->queue_mapping];
2526 
2527     /* hardware can't handle really short frames, hardware padding works
2528      * beyond this point
2529      */
2530     if (unlikely(skb->len < IAVF_MIN_TX_LEN)) {
2531         if (skb_pad(skb, IAVF_MIN_TX_LEN - skb->len))
2532             return NETDEV_TX_OK;
2533         skb->len = IAVF_MIN_TX_LEN;
2534         skb_set_tail_pointer(skb, IAVF_MIN_TX_LEN);
2535     }
2536 
2537     return iavf_xmit_frame_ring(skb, tx_ring);
2538 }