// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/* Google virtual Ethernet (gve) driver
 *
 * Copyright (C) 2015-2021 Google, Inc.
 */

#include "gve.h"
#include "gve_adminq.h"
#include "gve_utils.h"
#include "gve_dqo.h"
#include <linux/tcp.h>
#include <linux/slab.h>
#include <linux/skbuff.h>

/* Returns true if a gve_tx_pending_packet_dqo object is available. */
static bool gve_has_pending_packet(struct gve_tx_ring *tx)
{
    /* Check TX path's list. */
    if (tx->dqo_tx.free_pending_packets != -1)
        return true;

    /* Check completion handler's list. */
    if (atomic_read_acquire(&tx->dqo_compl.free_pending_packets) != -1)
        return true;

    return false;
}

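/* Allocates a pending packet slot, first from the TX path's free list and
 * then, if that is empty, by stealing the list recycled by the completion
 * handler. Returns NULL if no slot is available.
 */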
static struct gve_tx_pending_packet_dqo *
gve_alloc_pending_packet(struct gve_tx_ring *tx)
{
    struct gve_tx_pending_packet_dqo *pending_packet;
    s16 index;

    index = tx->dqo_tx.free_pending_packets;

    /* No pending_packets available, try to steal the list from the
     * completion handler.
     */
    if (unlikely(index == -1)) {
        tx->dqo_tx.free_pending_packets =
            atomic_xchg(&tx->dqo_compl.free_pending_packets, -1);
        index = tx->dqo_tx.free_pending_packets;

        if (unlikely(index == -1))
            return NULL;
    }

    pending_packet = &tx->dqo.pending_packets[index];

    /* Remove pending_packet from free list */
    tx->dqo_tx.free_pending_packets = pending_packet->next;
    pending_packet->state = GVE_PACKET_STATE_PENDING_DATA_COMPL;

    return pending_packet;
}

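/* Pushes a pending packet slot back onto the completion handler's lock-free
 * free list so the TX path can reclaim it later.
 */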
static void
gve_free_pending_packet(struct gve_tx_ring *tx,
            struct gve_tx_pending_packet_dqo *pending_packet)
{
    s16 index = pending_packet - tx->dqo.pending_packets;

    pending_packet->state = GVE_PACKET_STATE_UNALLOCATED;
    while (true) {
        s16 old_head = atomic_read_acquire(&tx->dqo_compl.free_pending_packets);

        pending_packet->next = old_head;
        if (atomic_cmpxchg(&tx->dqo_compl.free_pending_packets,
                   old_head, index) == old_head) {
            break;
        }
    }
}

/* gve_tx_clean_pending_packets - Cleans up all pending tx requests and
 * buffers.
 */
static void gve_tx_clean_pending_packets(struct gve_tx_ring *tx)
{
    int i;

    for (i = 0; i < tx->dqo.num_pending_packets; i++) {
        struct gve_tx_pending_packet_dqo *cur_state =
            &tx->dqo.pending_packets[i];
        int j;

        for (j = 0; j < cur_state->num_bufs; j++) {
            if (j == 0) {
                dma_unmap_single(tx->dev,
                    dma_unmap_addr(cur_state, dma[j]),
                    dma_unmap_len(cur_state, len[j]),
                    DMA_TO_DEVICE);
            } else {
                dma_unmap_page(tx->dev,
                    dma_unmap_addr(cur_state, dma[j]),
                    dma_unmap_len(cur_state, len[j]),
                    DMA_TO_DEVICE);
            }
        }
        if (cur_state->skb) {
            dev_consume_skb_any(cur_state->skb);
            cur_state->skb = NULL;
        }
    }
}

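/* Frees the TX descriptor ring, completion ring, queue resources and pending
 * packet array of TX queue @idx.
 */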
static void gve_tx_free_ring_dqo(struct gve_priv *priv, int idx)
{
    struct gve_tx_ring *tx = &priv->tx[idx];
    struct device *hdev = &priv->pdev->dev;
    size_t bytes;

    gve_tx_remove_from_block(priv, idx);

    if (tx->q_resources) {
        dma_free_coherent(hdev, sizeof(*tx->q_resources),
                  tx->q_resources, tx->q_resources_bus);
        tx->q_resources = NULL;
    }

    if (tx->dqo.compl_ring) {
        bytes = sizeof(tx->dqo.compl_ring[0]) *
            (tx->dqo.complq_mask + 1);
        dma_free_coherent(hdev, bytes, tx->dqo.compl_ring,
                  tx->complq_bus_dqo);
        tx->dqo.compl_ring = NULL;
    }

    if (tx->dqo.tx_ring) {
        bytes = sizeof(tx->dqo.tx_ring[0]) * (tx->mask + 1);
        dma_free_coherent(hdev, bytes, tx->dqo.tx_ring, tx->bus);
        tx->dqo.tx_ring = NULL;
    }

    kvfree(tx->dqo.pending_packets);
    tx->dqo.pending_packets = NULL;

    netif_dbg(priv, drv, priv->dev, "freed tx queue %d\n", idx);
}

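/* Allocates the descriptor ring, completion ring, queue resources and pending
 * packet state for TX queue @idx. Returns 0 on success or -ENOMEM on failure.
 */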
static int gve_tx_alloc_ring_dqo(struct gve_priv *priv, int idx)
{
    struct gve_tx_ring *tx = &priv->tx[idx];
    struct device *hdev = &priv->pdev->dev;
    int num_pending_packets;
    size_t bytes;
    int i;

    memset(tx, 0, sizeof(*tx));
    tx->q_num = idx;
    tx->dev = &priv->pdev->dev;
    tx->netdev_txq = netdev_get_tx_queue(priv->dev, idx);
    atomic_set_release(&tx->dqo_compl.hw_tx_head, 0);

    /* Queue sizes must be a power of 2 */
    tx->mask = priv->tx_desc_cnt - 1;
    tx->dqo.complq_mask = priv->options_dqo_rda.tx_comp_ring_entries - 1;

    /* The max number of pending packets determines the maximum number of
     * descriptors which may be written to the completion queue.
     *
     * We must set the number small enough to make sure we never overrun the
     * completion queue.
     */
    num_pending_packets = tx->dqo.complq_mask + 1;

    /* Reserve space for descriptor completions, which will be reported at
     * most every GVE_TX_MIN_RE_INTERVAL packets.
     */
    num_pending_packets -=
        (tx->dqo.complq_mask + 1) / GVE_TX_MIN_RE_INTERVAL;

    /* Each packet may have at most 2 buffer completions if it receives both
     * a miss and reinjection completion.
     */
    num_pending_packets /= 2;

    tx->dqo.num_pending_packets = min_t(int, num_pending_packets, S16_MAX);
    tx->dqo.pending_packets = kvcalloc(tx->dqo.num_pending_packets,
                       sizeof(tx->dqo.pending_packets[0]),
                       GFP_KERNEL);
    if (!tx->dqo.pending_packets)
        goto err;

    /* Set up linked list of pending packets */
    for (i = 0; i < tx->dqo.num_pending_packets - 1; i++)
        tx->dqo.pending_packets[i].next = i + 1;

    tx->dqo.pending_packets[tx->dqo.num_pending_packets - 1].next = -1;
    atomic_set_release(&tx->dqo_compl.free_pending_packets, -1);
    tx->dqo_compl.miss_completions.head = -1;
    tx->dqo_compl.miss_completions.tail = -1;
    tx->dqo_compl.timed_out_completions.head = -1;
    tx->dqo_compl.timed_out_completions.tail = -1;

    bytes = sizeof(tx->dqo.tx_ring[0]) * (tx->mask + 1);
    tx->dqo.tx_ring = dma_alloc_coherent(hdev, bytes, &tx->bus, GFP_KERNEL);
    if (!tx->dqo.tx_ring)
        goto err;

    bytes = sizeof(tx->dqo.compl_ring[0]) * (tx->dqo.complq_mask + 1);
    tx->dqo.compl_ring = dma_alloc_coherent(hdev, bytes,
                        &tx->complq_bus_dqo,
                        GFP_KERNEL);
    if (!tx->dqo.compl_ring)
        goto err;

    tx->q_resources = dma_alloc_coherent(hdev, sizeof(*tx->q_resources),
                         &tx->q_resources_bus, GFP_KERNEL);
    if (!tx->q_resources)
        goto err;

    gve_tx_add_to_block(priv, idx);

    return 0;

err:
    gve_tx_free_ring_dqo(priv, idx);
    return -ENOMEM;
}

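/* Allocates every configured TX queue, unwinding already allocated queues on
 * failure.
 */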
int gve_tx_alloc_rings_dqo(struct gve_priv *priv)
{
    int err = 0;
    int i;

    for (i = 0; i < priv->tx_cfg.num_queues; i++) {
        err = gve_tx_alloc_ring_dqo(priv, i);
        if (err) {
            netif_err(priv, drv, priv->dev,
                  "Failed to alloc tx ring=%d: err=%d\n",
                  i, err);
            goto err;
        }
    }

    return 0;

err:
    for (i--; i >= 0; i--)
        gve_tx_free_ring_dqo(priv, i);

    return err;
}

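/* Drains outstanding completions, cleans up still-pending packets and frees
 * every configured TX queue.
 */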
void gve_tx_free_rings_dqo(struct gve_priv *priv)
{
    int i;

    for (i = 0; i < priv->tx_cfg.num_queues; i++) {
        struct gve_tx_ring *tx = &priv->tx[i];

        gve_clean_tx_done_dqo(priv, tx, /*napi=*/NULL);
        netdev_tx_reset_queue(tx->netdev_txq);
        gve_tx_clean_pending_packets(tx);

        gve_tx_free_ring_dqo(priv, i);
    }
}

/* Returns the number of slots available in the ring */
static u32 num_avail_tx_slots(const struct gve_tx_ring *tx)
{
    u32 num_used = (tx->dqo_tx.tail - tx->dqo_tx.head) & tx->mask;

    return tx->mask - num_used;
}

/* Stops the queue if the number of available descriptors is less than 'count'.
 * Return: 0 if stop is not required.
 */
static int gve_maybe_stop_tx_dqo(struct gve_tx_ring *tx, int count)
{
    if (likely(gve_has_pending_packet(tx) &&
           num_avail_tx_slots(tx) >= count))
        return 0;

    /* Update cached TX head pointer */
    tx->dqo_tx.head = atomic_read_acquire(&tx->dqo_compl.hw_tx_head);

    if (likely(gve_has_pending_packet(tx) &&
           num_avail_tx_slots(tx) >= count))
        return 0;

    /* No space, so stop the queue */
    tx->stop_queue++;
    netif_tx_stop_queue(tx->netdev_txq);

    /* Sync with restarting queue in `gve_tx_poll_dqo()` */
    mb();

    /* After stopping the queue, check again whether we can transmit, to
     * avoid a TOCTOU race.
     */
    tx->dqo_tx.head = atomic_read_acquire(&tx->dqo_compl.hw_tx_head);

    if (likely(!gve_has_pending_packet(tx) ||
           num_avail_tx_slots(tx) < count))
        return -EBUSY;

    netif_tx_start_queue(tx->netdev_txq);
    tx->wake_queue++;
    return 0;
}

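/* Fills the TX metadata with a non-zero 15-bit path hash derived from the
 * skb's L4 hash, when one is available.
 */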
static void gve_extract_tx_metadata_dqo(const struct sk_buff *skb,
                    struct gve_tx_metadata_dqo *metadata)
{
    memset(metadata, 0, sizeof(*metadata));
    metadata->version = GVE_TX_METADATA_VERSION_DQO;

    if (skb->l4_hash) {
        u16 path_hash = skb->hash ^ (skb->hash >> 16);

        path_hash &= (1 << 15) - 1;
        if (unlikely(path_hash == 0))
            path_hash = ~path_hash;

        metadata->path_hash = path_hash;
    }
}

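/* Writes packet (data) descriptors for the buffer at @addr, splitting it into
 * chunks of at most GVE_TX_MAX_BUF_SIZE_DQO bytes and advancing *desc_idx.
 */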
static void gve_tx_fill_pkt_desc_dqo(struct gve_tx_ring *tx, u32 *desc_idx,
                     struct sk_buff *skb, u32 len, u64 addr,
                     s16 compl_tag, bool eop, bool is_gso)
{
    const bool checksum_offload_en = skb->ip_summed == CHECKSUM_PARTIAL;

    while (len > 0) {
        struct gve_tx_pkt_desc_dqo *desc =
            &tx->dqo.tx_ring[*desc_idx].pkt;
        u32 cur_len = min_t(u32, len, GVE_TX_MAX_BUF_SIZE_DQO);
        bool cur_eop = eop && cur_len == len;

        *desc = (struct gve_tx_pkt_desc_dqo){
            .buf_addr = cpu_to_le64(addr),
            .dtype = GVE_TX_PKT_DESC_DTYPE_DQO,
            .end_of_packet = cur_eop,
            .checksum_offload_enable = checksum_offload_en,
            .compl_tag = cpu_to_le16(compl_tag),
            .buf_size = cur_len,
        };

        addr += cur_len;
        len -= cur_len;
        *desc_idx = (*desc_idx + 1) & tx->mask;
    }
}

/* Validates and prepares `skb` for TSO.
 *
 * Returns header length, or < 0 if invalid.
 */
static int gve_prep_tso(struct sk_buff *skb)
{
    struct tcphdr *tcp;
    int header_len;
    u32 paylen;
    int err;

    /* Note: HW requires MSS (gso_size) to be <= 9728 and the total length
     * of the TSO to be <= 262143.
     *
     * However, we don't validate these because:
     * - Hypervisor enforces a limit of 9K MTU
     * - Kernel will not produce a TSO larger than 64k
     */

    if (unlikely(skb_shinfo(skb)->gso_size < GVE_TX_MIN_TSO_MSS_DQO))
        return -1;

    /* Needed because we will modify header. */
    err = skb_cow_head(skb, 0);
    if (err < 0)
        return err;

    tcp = tcp_hdr(skb);

    /* Remove payload length from checksum. */
    paylen = skb->len - skb_transport_offset(skb);

    switch (skb_shinfo(skb)->gso_type) {
    case SKB_GSO_TCPV4:
    case SKB_GSO_TCPV6:
        csum_replace_by_diff(&tcp->check,
                     (__force __wsum)htonl(paylen));

        /* Compute length of segmentation header. */
        header_len = skb_tcp_all_headers(skb);
        break;
    default:
        return -EINVAL;
    }

    if (unlikely(header_len > GVE_TX_MAX_HDR_SIZE_DQO))
        return -EINVAL;

    return header_len;
}

static void gve_tx_fill_tso_ctx_desc(struct gve_tx_tso_context_desc_dqo *desc,
                     const struct sk_buff *skb,
                     const struct gve_tx_metadata_dqo *metadata,
                     int header_len)
{
    *desc = (struct gve_tx_tso_context_desc_dqo){
        .header_len = header_len,
        .cmd_dtype = {
            .dtype = GVE_TX_TSO_CTX_DESC_DTYPE_DQO,
            .tso = 1,
        },
        .flex0 = metadata->bytes[0],
        .flex5 = metadata->bytes[5],
        .flex6 = metadata->bytes[6],
        .flex7 = metadata->bytes[7],
        .flex8 = metadata->bytes[8],
        .flex9 = metadata->bytes[9],
        .flex10 = metadata->bytes[10],
        .flex11 = metadata->bytes[11],
    };
    desc->tso_total_len = skb->len - header_len;
    desc->mss = skb_shinfo(skb)->gso_size;
}

static void
gve_tx_fill_general_ctx_desc(struct gve_tx_general_context_desc_dqo *desc,
                 const struct gve_tx_metadata_dqo *metadata)
{
    *desc = (struct gve_tx_general_context_desc_dqo){
        .flex0 = metadata->bytes[0],
        .flex1 = metadata->bytes[1],
        .flex2 = metadata->bytes[2],
        .flex3 = metadata->bytes[3],
        .flex4 = metadata->bytes[4],
        .flex5 = metadata->bytes[5],
        .flex6 = metadata->bytes[6],
        .flex7 = metadata->bytes[7],
        .flex8 = metadata->bytes[8],
        .flex9 = metadata->bytes[9],
        .flex10 = metadata->bytes[10],
        .flex11 = metadata->bytes[11],
        .cmd_dtype = {.dtype = GVE_TX_GENERAL_CTX_DESC_DTYPE_DQO},
    };
}

/* Returns 0 on success, or < 0 on error.
 *
 * Before this function is called, the caller must ensure
 * gve_has_pending_packet(tx) returns true.
 */
static int gve_tx_add_skb_no_copy_dqo(struct gve_tx_ring *tx,
                      struct sk_buff *skb)
{
    const struct skb_shared_info *shinfo = skb_shinfo(skb);
    const bool is_gso = skb_is_gso(skb);
    u32 desc_idx = tx->dqo_tx.tail;

    struct gve_tx_pending_packet_dqo *pkt;
    struct gve_tx_metadata_dqo metadata;
    s16 completion_tag;
    int i;

    pkt = gve_alloc_pending_packet(tx);
    pkt->skb = skb;
    pkt->num_bufs = 0;
    completion_tag = pkt - tx->dqo.pending_packets;

    gve_extract_tx_metadata_dqo(skb, &metadata);
    if (is_gso) {
        int header_len = gve_prep_tso(skb);

        if (unlikely(header_len < 0))
            goto err;

        gve_tx_fill_tso_ctx_desc(&tx->dqo.tx_ring[desc_idx].tso_ctx,
                     skb, &metadata, header_len);
        desc_idx = (desc_idx + 1) & tx->mask;
    }

    gve_tx_fill_general_ctx_desc(&tx->dqo.tx_ring[desc_idx].general_ctx,
                     &metadata);
    desc_idx = (desc_idx + 1) & tx->mask;

    /* Note: HW requires that the size of a non-TSO packet be within the
     * range of [17, 9728].
     *
     * We don't double check because
     * - We limited `netdev->min_mtu` to ETH_MIN_MTU.
     * - Hypervisor won't allow MTU larger than 9216.
     */

    /* Map the linear portion of skb */
    {
        u32 len = skb_headlen(skb);
        dma_addr_t addr;

        addr = dma_map_single(tx->dev, skb->data, len, DMA_TO_DEVICE);
        if (unlikely(dma_mapping_error(tx->dev, addr)))
            goto err;

        dma_unmap_len_set(pkt, len[pkt->num_bufs], len);
        dma_unmap_addr_set(pkt, dma[pkt->num_bufs], addr);
        ++pkt->num_bufs;

        gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, skb, len, addr,
                     completion_tag,
                     /*eop=*/shinfo->nr_frags == 0, is_gso);
    }

    for (i = 0; i < shinfo->nr_frags; i++) {
        const skb_frag_t *frag = &shinfo->frags[i];
        bool is_eop = i == (shinfo->nr_frags - 1);
        u32 len = skb_frag_size(frag);
        dma_addr_t addr;

        addr = skb_frag_dma_map(tx->dev, frag, 0, len, DMA_TO_DEVICE);
        if (unlikely(dma_mapping_error(tx->dev, addr)))
            goto err;

        dma_unmap_len_set(pkt, len[pkt->num_bufs], len);
        dma_unmap_addr_set(pkt, dma[pkt->num_bufs], addr);
        ++pkt->num_bufs;

        gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, skb, len, addr,
                     completion_tag, is_eop, is_gso);
    }

    /* Commit the changes to our state */
    tx->dqo_tx.tail = desc_idx;

    /* Request a descriptor completion on the last descriptor of the
     * packet if we are allowed to by the HW enforced interval.
     */
    {
        u32 last_desc_idx = (desc_idx - 1) & tx->mask;
        u32 last_report_event_interval =
            (last_desc_idx - tx->dqo_tx.last_re_idx) & tx->mask;

        if (unlikely(last_report_event_interval >=
                 GVE_TX_MIN_RE_INTERVAL)) {
            tx->dqo.tx_ring[last_desc_idx].pkt.report_event = true;
            tx->dqo_tx.last_re_idx = last_desc_idx;
        }
    }

    return 0;

err:
    for (i = 0; i < pkt->num_bufs; i++) {
        if (i == 0) {
            dma_unmap_single(tx->dev,
                     dma_unmap_addr(pkt, dma[i]),
                     dma_unmap_len(pkt, len[i]),
                     DMA_TO_DEVICE);
        } else {
            dma_unmap_page(tx->dev,
                       dma_unmap_addr(pkt, dma[i]),
                       dma_unmap_len(pkt, len[i]),
                       DMA_TO_DEVICE);
        }
    }

    pkt->skb = NULL;
    pkt->num_bufs = 0;
    gve_free_pending_packet(tx, pkt);

    return -1;
}

static int gve_num_descs_per_buf(size_t size)
{
    return DIV_ROUND_UP(size, GVE_TX_MAX_BUF_SIZE_DQO);
}

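/* Returns the number of data descriptors needed for the skb's linear data and
 * all of its page fragments.
 */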
static int gve_num_buffer_descs_needed(const struct sk_buff *skb)
{
    const struct skb_shared_info *shinfo = skb_shinfo(skb);
    int num_descs;
    int i;

    num_descs = gve_num_descs_per_buf(skb_headlen(skb));

    for (i = 0; i < shinfo->nr_frags; i++) {
        unsigned int frag_size = skb_frag_size(&shinfo->frags[i]);

        num_descs += gve_num_descs_per_buf(frag_size);
    }

    return num_descs;
}

/* Returns true if HW is capable of sending TSO represented by `skb`.
 *
 * Each segment must not span more than GVE_TX_MAX_DATA_DESCS buffers.
 * - The header is counted as one buffer for every single segment.
 * - A buffer which is split between two segments is counted for both.
 * - If a buffer contains both header and payload, it is counted as two buffers.
 */
static bool gve_can_send_tso(const struct sk_buff *skb)
{
    const int max_bufs_per_seg = GVE_TX_MAX_DATA_DESCS - 1;
    const struct skb_shared_info *shinfo = skb_shinfo(skb);
    const int header_len = skb_tcp_all_headers(skb);
    const int gso_size = shinfo->gso_size;
    int cur_seg_num_bufs;
    int cur_seg_size;
    int i;

    cur_seg_size = skb_headlen(skb) - header_len;
    cur_seg_num_bufs = cur_seg_size > 0;

    for (i = 0; i < shinfo->nr_frags; i++) {
        if (cur_seg_size >= gso_size) {
            cur_seg_size %= gso_size;
            cur_seg_num_bufs = cur_seg_size > 0;
        }

        if (unlikely(++cur_seg_num_bufs > max_bufs_per_seg))
            return false;

        cur_seg_size += skb_frag_size(&shinfo->frags[i]);
    }

    return true;
}

/* Attempt to transmit specified SKB.
 *
 * Returns 0 if the SKB was transmitted or dropped.
 * Returns -1 if there is not currently enough space to transmit the SKB.
 */
static int gve_try_tx_skb(struct gve_priv *priv, struct gve_tx_ring *tx,
              struct sk_buff *skb)
{
    int num_buffer_descs;
    int total_num_descs;

    if (skb_is_gso(skb)) {
        /* If TSO doesn't meet HW requirements, attempt to linearize the
         * packet.
         */
        if (unlikely(!gve_can_send_tso(skb) &&
                 skb_linearize(skb) < 0)) {
            net_err_ratelimited("%s: Failed to transmit TSO packet\n",
                        priv->dev->name);
            goto drop;
        }

        num_buffer_descs = gve_num_buffer_descs_needed(skb);
    } else {
        num_buffer_descs = gve_num_buffer_descs_needed(skb);

        if (unlikely(num_buffer_descs > GVE_TX_MAX_DATA_DESCS)) {
            if (unlikely(skb_linearize(skb) < 0))
                goto drop;

            num_buffer_descs = 1;
        }
    }

    /* Metadata + (optional TSO) + data descriptors. */
    total_num_descs = 1 + skb_is_gso(skb) + num_buffer_descs;
    if (unlikely(gve_maybe_stop_tx_dqo(tx, total_num_descs +
            GVE_TX_MIN_DESC_PREVENT_CACHE_OVERLAP))) {
        return -1;
    }

    if (unlikely(gve_tx_add_skb_no_copy_dqo(tx, skb) < 0))
        goto drop;

    netdev_tx_sent_queue(tx->netdev_txq, skb->len);
    skb_tx_timestamp(skb);
    return 0;

drop:
    tx->dropped_pkt++;
    dev_kfree_skb_any(skb);
    return 0;
}

/* Transmit a given skb and ring the doorbell. */
netdev_tx_t gve_tx_dqo(struct sk_buff *skb, struct net_device *dev)
{
    struct gve_priv *priv = netdev_priv(dev);
    struct gve_tx_ring *tx;

    tx = &priv->tx[skb_get_queue_mapping(skb)];
    if (unlikely(gve_try_tx_skb(priv, tx, skb) < 0)) {
        /* We need to ring the txq doorbell -- we have stopped the Tx
         * queue for want of resources, but prior calls to gve_tx()
         * may have added descriptors without ringing the doorbell.
         */
        gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail);
        return NETDEV_TX_BUSY;
    }

    if (!netif_xmit_stopped(tx->netdev_txq) && netdev_xmit_more())
        return NETDEV_TX_OK;

    gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail);
    return NETDEV_TX_OK;
}

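/* Appends @pending_packet to the tail of the index-based @list. */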
static void add_to_list(struct gve_tx_ring *tx, struct gve_index_list *list,
            struct gve_tx_pending_packet_dqo *pending_packet)
{
    s16 old_tail, index;

    index = pending_packet - tx->dqo.pending_packets;
    old_tail = list->tail;
    list->tail = index;
    if (old_tail == -1)
        list->head = index;
    else
        tx->dqo.pending_packets[old_tail].next = index;

    pending_packet->next = -1;
    pending_packet->prev = old_tail;
}

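/* Unlinks @pkt from @list, updating the list head and tail as needed. */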
static void remove_from_list(struct gve_tx_ring *tx,
                 struct gve_index_list *list,
                 struct gve_tx_pending_packet_dqo *pkt)
{
    s16 prev_index, next_index;

    prev_index = pkt->prev;
    next_index = pkt->next;

    if (prev_index == -1) {
        /* Node is head */
        list->head = next_index;
    } else {
        tx->dqo.pending_packets[prev_index].next = next_index;
    }
    if (next_index == -1) {
        /* Node is tail */
        list->tail = prev_index;
    } else {
        tx->dqo.pending_packets[next_index].prev = prev_index;
    }
}

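/* DMA-unmaps every buffer recorded for @pkt and resets its buffer count. */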
static void gve_unmap_packet(struct device *dev,
                 struct gve_tx_pending_packet_dqo *pkt)
{
    int i;

    /* SKB linear portion is guaranteed to be mapped */
    dma_unmap_single(dev, dma_unmap_addr(pkt, dma[0]),
             dma_unmap_len(pkt, len[0]), DMA_TO_DEVICE);
    for (i = 1; i < pkt->num_bufs; i++) {
        dma_unmap_page(dev, dma_unmap_addr(pkt, dma[i]),
                   dma_unmap_len(pkt, len[i]), DMA_TO_DEVICE);
    }
    pkt->num_bufs = 0;
}

/* Completion types and expected behavior:
 * No Miss compl + Packet compl = Packet completed normally.
 * Miss compl + Re-inject compl = Packet completed normally.
 * No Miss compl + Re-inject compl = Skipped i.e. packet not completed.
 * Miss compl + Packet compl = Skipped i.e. packet not completed.
 */
static void gve_handle_packet_completion(struct gve_priv *priv,
                     struct gve_tx_ring *tx, bool is_napi,
                     u16 compl_tag, u64 *bytes, u64 *pkts,
                     bool is_reinjection)
{
    struct gve_tx_pending_packet_dqo *pending_packet;

    if (unlikely(compl_tag >= tx->dqo.num_pending_packets)) {
        net_err_ratelimited("%s: Invalid TX completion tag: %d\n",
                    priv->dev->name, (int)compl_tag);
        return;
    }

    pending_packet = &tx->dqo.pending_packets[compl_tag];

    if (unlikely(is_reinjection)) {
        if (unlikely(pending_packet->state ==
                 GVE_PACKET_STATE_TIMED_OUT_COMPL)) {
            net_err_ratelimited("%s: Re-injection completion: %d received after timeout.\n",
                        priv->dev->name, (int)compl_tag);
            /* Packet was already completed as a result of timeout,
             * so just remove from list and free pending packet.
             */
            remove_from_list(tx,
                     &tx->dqo_compl.timed_out_completions,
                     pending_packet);
            gve_free_pending_packet(tx, pending_packet);
            return;
        }
        if (unlikely(pending_packet->state !=
                 GVE_PACKET_STATE_PENDING_REINJECT_COMPL)) {
            /* The packet is allocated but has no outstanding miss
             * completion, which means it received a re-injection
             * completion without a prior miss completion. Return
             * without completing the packet.
             */
            net_err_ratelimited("%s: Re-injection completion received without corresponding miss completion: %d\n",
                        priv->dev->name, (int)compl_tag);
            return;
        }
        remove_from_list(tx, &tx->dqo_compl.miss_completions,
                 pending_packet);
    } else {
        /* Packet is allocated but not a pending data completion. */
        if (unlikely(pending_packet->state !=
                 GVE_PACKET_STATE_PENDING_DATA_COMPL)) {
            net_err_ratelimited("%s: No pending data completion: %d\n",
                        priv->dev->name, (int)compl_tag);
            return;
        }
    }
    gve_unmap_packet(tx->dev, pending_packet);

    *bytes += pending_packet->skb->len;
    (*pkts)++;
    napi_consume_skb(pending_packet->skb, is_napi);
    pending_packet->skb = NULL;
    gve_free_pending_packet(tx, pending_packet);
}

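/* Handles a miss completion: marks the packet as awaiting a re-injection
 * completion, arms its timeout and credits its bytes/packets toward the
 * caller's completion counters.
 */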
static void gve_handle_miss_completion(struct gve_priv *priv,
                       struct gve_tx_ring *tx, u16 compl_tag,
                       u64 *bytes, u64 *pkts)
{
    struct gve_tx_pending_packet_dqo *pending_packet;

    if (unlikely(compl_tag >= tx->dqo.num_pending_packets)) {
        net_err_ratelimited("%s: Invalid TX completion tag: %d\n",
                    priv->dev->name, (int)compl_tag);
        return;
    }

    pending_packet = &tx->dqo.pending_packets[compl_tag];
    if (unlikely(pending_packet->state !=
                GVE_PACKET_STATE_PENDING_DATA_COMPL)) {
        net_err_ratelimited("%s: Unexpected packet state: %d for completion tag: %d\n",
                    priv->dev->name, (int)pending_packet->state,
                    (int)compl_tag);
        return;
    }

    pending_packet->state = GVE_PACKET_STATE_PENDING_REINJECT_COMPL;
    /* jiffies can wrap around, but the time comparison macros handle overflow. */
    pending_packet->timeout_jiffies =
            jiffies +
            msecs_to_jiffies(GVE_REINJECT_COMPL_TIMEOUT *
                     MSEC_PER_SEC);
    add_to_list(tx, &tx->dqo_compl.miss_completions, pending_packet);

    *bytes += pending_packet->skb->len;
    (*pkts)++;
}

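/* Times out packets whose re-injection completion never arrived. The skb is
 * freed and counted as dropped, but the completion tag stays allocated in
 * case a late completion still shows up.
 */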
static void remove_miss_completions(struct gve_priv *priv,
                    struct gve_tx_ring *tx)
{
    struct gve_tx_pending_packet_dqo *pending_packet;
    s16 next_index;

    next_index = tx->dqo_compl.miss_completions.head;
    while (next_index != -1) {
        pending_packet = &tx->dqo.pending_packets[next_index];
        next_index = pending_packet->next;
        /* Break early because packets should time out in order. */
        if (time_is_after_jiffies(pending_packet->timeout_jiffies))
            break;

        remove_from_list(tx, &tx->dqo_compl.miss_completions,
                 pending_packet);
        /* Unmap the buffers and free the skb but do not release the
         * pending packet, i.e. the completion tag is not freed, so that
         * the driver can take appropriate action if a corresponding
         * valid completion is received later.
         */
        gve_unmap_packet(tx->dev, pending_packet);
        /* This indicates the packet was dropped. */
        dev_kfree_skb_any(pending_packet->skb);
        pending_packet->skb = NULL;
        tx->dropped_pkt++;
        net_err_ratelimited("%s: No reinjection completion was received for: %d.\n",
                    priv->dev->name,
                    (int)(pending_packet - tx->dqo.pending_packets));

        pending_packet->state = GVE_PACKET_STATE_TIMED_OUT_COMPL;
        pending_packet->timeout_jiffies =
                jiffies +
                msecs_to_jiffies(GVE_DEALLOCATE_COMPL_TIMEOUT *
                         MSEC_PER_SEC);
        /* Keep the pending packet on another list so it can be freed at
         * a later time.
         */
        add_to_list(tx, &tx->dqo_compl.timed_out_completions,
                pending_packet);
    }
}

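/* Releases the completion tags of timed-out packets once their deallocation
 * timeout has also expired.
 */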
static void remove_timed_out_completions(struct gve_priv *priv,
                     struct gve_tx_ring *tx)
{
    struct gve_tx_pending_packet_dqo *pending_packet;
    s16 next_index;

    next_index = tx->dqo_compl.timed_out_completions.head;
    while (next_index != -1) {
        pending_packet = &tx->dqo.pending_packets[next_index];
        next_index = pending_packet->next;
        /* Break early because packets should time out in order. */
        if (time_is_after_jiffies(pending_packet->timeout_jiffies))
            break;

        remove_from_list(tx, &tx->dqo_compl.timed_out_completions,
                 pending_packet);
        gve_free_pending_packet(tx, pending_packet);
    }
}

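/* Processes TX completion descriptors and returns the number of descriptors
 * cleaned. @napi may be NULL when called outside of NAPI context.
 */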
int gve_clean_tx_done_dqo(struct gve_priv *priv, struct gve_tx_ring *tx,
              struct napi_struct *napi)
{
    u64 reinject_compl_bytes = 0;
    u64 reinject_compl_pkts = 0;
    int num_descs_cleaned = 0;
    u64 miss_compl_bytes = 0;
    u64 miss_compl_pkts = 0;
    u64 pkt_compl_bytes = 0;
    u64 pkt_compl_pkts = 0;

    /* Limit in order to avoid blocking for too long */
    while (!napi || pkt_compl_pkts < napi->weight) {
        struct gve_tx_compl_desc *compl_desc =
            &tx->dqo.compl_ring[tx->dqo_compl.head];
        u16 type;

        if (compl_desc->generation == tx->dqo_compl.cur_gen_bit)
            break;

        /* Prefetch the next descriptor. */
        prefetch(&tx->dqo.compl_ring[(tx->dqo_compl.head + 1) &
                tx->dqo.complq_mask]);

        /* Do not read data until we own the descriptor */
        dma_rmb();
        type = compl_desc->type;

        if (type == GVE_COMPL_TYPE_DQO_DESC) {
            /* This is the last descriptor fetched by HW plus one */
            u16 tx_head = le16_to_cpu(compl_desc->tx_head);

            atomic_set_release(&tx->dqo_compl.hw_tx_head, tx_head);
        } else if (type == GVE_COMPL_TYPE_DQO_PKT) {
            u16 compl_tag = le16_to_cpu(compl_desc->completion_tag);

            gve_handle_packet_completion(priv, tx, !!napi,
                             compl_tag,
                             &pkt_compl_bytes,
                             &pkt_compl_pkts,
                             /*is_reinjection=*/false);
        } else if (type == GVE_COMPL_TYPE_DQO_MISS) {
            u16 compl_tag = le16_to_cpu(compl_desc->completion_tag);

            gve_handle_miss_completion(priv, tx, compl_tag,
                           &miss_compl_bytes,
                           &miss_compl_pkts);
        } else if (type == GVE_COMPL_TYPE_DQO_REINJECTION) {
            u16 compl_tag = le16_to_cpu(compl_desc->completion_tag);

            gve_handle_packet_completion(priv, tx, !!napi,
                             compl_tag,
                             &reinject_compl_bytes,
                             &reinject_compl_pkts,
                             /*is_reinjection=*/true);
        }

        tx->dqo_compl.head =
            (tx->dqo_compl.head + 1) & tx->dqo.complq_mask;
        /* Flip the generation bit when we wrap around */
        tx->dqo_compl.cur_gen_bit ^= tx->dqo_compl.head == 0;
        num_descs_cleaned++;
    }

    netdev_tx_completed_queue(tx->netdev_txq,
                  pkt_compl_pkts + miss_compl_pkts,
                  pkt_compl_bytes + miss_compl_bytes);

    remove_miss_completions(priv, tx);
    remove_timed_out_completions(priv, tx);

    u64_stats_update_begin(&tx->statss);
    tx->bytes_done += pkt_compl_bytes + reinject_compl_bytes;
    tx->pkt_done += pkt_compl_pkts + reinject_compl_pkts;
    u64_stats_update_end(&tx->statss);
    return num_descs_cleaned;
}

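/* TX portion of the NAPI poll handler. Optionally cleans completions and
 * wakes the queue; returns true if there is still completion work to do.
 */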
bool gve_tx_poll_dqo(struct gve_notify_block *block, bool do_clean)
{
    struct gve_tx_compl_desc *compl_desc;
    struct gve_tx_ring *tx = block->tx;
    struct gve_priv *priv = block->priv;

    if (do_clean) {
        int num_descs_cleaned = gve_clean_tx_done_dqo(priv, tx,
                                  &block->napi);

        /* Sync with queue being stopped in `gve_maybe_stop_tx_dqo()` */
        mb();

        if (netif_tx_queue_stopped(tx->netdev_txq) &&
            num_descs_cleaned > 0) {
            tx->wake_queue++;
            netif_tx_wake_queue(tx->netdev_txq);
        }
    }

    /* Return true if we still have work. */
    compl_desc = &tx->dqo.compl_ring[tx->dqo_compl.head];
    return compl_desc->generation != tx->dqo_compl.cur_gen_bit;
}