0001 // SPDX-License-Identifier: (GPL-2.0 OR MIT)
0002 /* Google virtual Ethernet (gve) driver
0003  *
0004  * Copyright (C) 2015-2021 Google, Inc.
0005  */
0006 
0007 #include <linux/cpumask.h>
0008 #include <linux/etherdevice.h>
0009 #include <linux/interrupt.h>
0010 #include <linux/module.h>
0011 #include <linux/pci.h>
0012 #include <linux/sched.h>
0013 #include <linux/timer.h>
0014 #include <linux/workqueue.h>
0015 #include <net/sch_generic.h>
0016 #include "gve.h"
0017 #include "gve_dqo.h"
0018 #include "gve_adminq.h"
0019 #include "gve_register.h"
0020 
0021 #define GVE_DEFAULT_RX_COPYBREAK    (256)
0022 
0023 #define DEFAULT_MSG_LEVEL   (NETIF_MSG_DRV | NETIF_MSG_LINK)
0024 #define GVE_VERSION     "1.0.0"
0025 #define GVE_VERSION_PREFIX  "GVE-"
0026 
0027 /* Minimum amount of time between queue kicks in msec (10 seconds) */
0028 #define MIN_TX_TIMEOUT_GAP (1000 * 10)
0029 
0030 const char gve_version_str[] = GVE_VERSION;
0031 static const char gve_version_prefix[] = GVE_VERSION_PREFIX;
0032 
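/* ndo_start_xmit: hand the skb to the GQI or DQO transmit path, depending
 * on the queue format negotiated with the device.
 */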
0033 static netdev_tx_t gve_start_xmit(struct sk_buff *skb, struct net_device *dev)
0034 {
0035     struct gve_priv *priv = netdev_priv(dev);
0036 
0037     if (gve_is_gqi(priv))
0038         return gve_tx(skb, dev);
0039     else
0040         return gve_tx_dqo(skb, dev);
0041 }
0042 
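/* ndo_get_stats64: fold the per-ring rx/tx packet and byte counters into
 * @s.  The u64_stats fetch/retry loops give a consistent 64-bit snapshot
 * of each ring's counters, even on 32-bit hosts.
 */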
0043 static void gve_get_stats(struct net_device *dev, struct rtnl_link_stats64 *s)
0044 {
0045     struct gve_priv *priv = netdev_priv(dev);
0046     unsigned int start;
0047     u64 packets, bytes;
0048     int ring;
0049 
0050     if (priv->rx) {
0051         for (ring = 0; ring < priv->rx_cfg.num_queues; ring++) {
0052             do {
0053                 start =
0054                   u64_stats_fetch_begin_irq(&priv->rx[ring].statss);
0055                 packets = priv->rx[ring].rpackets;
0056                 bytes = priv->rx[ring].rbytes;
0057             } while (u64_stats_fetch_retry_irq(&priv->rx[ring].statss,
0058                                start));
0059             s->rx_packets += packets;
0060             s->rx_bytes += bytes;
0061         }
0062     }
0063     if (priv->tx) {
0064         for (ring = 0; ring < priv->tx_cfg.num_queues; ring++) {
0065             do {
0066                 start =
0067                   u64_stats_fetch_begin_irq(&priv->tx[ring].statss);
0068                 packets = priv->tx[ring].pkt_done;
0069                 bytes = priv->tx[ring].bytes_done;
0070             } while (u64_stats_fetch_retry_irq(&priv->tx[ring].statss,
0071                                start));
0072             s->tx_packets += packets;
0073             s->tx_bytes += bytes;
0074         }
0075     }
0076 }
0077 
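/* Allocate the DMA-coherent event counter array that is later handed to
 * the device in gve_setup_device_resources().
 */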
0078 static int gve_alloc_counter_array(struct gve_priv *priv)
0079 {
0080     priv->counter_array =
0081         dma_alloc_coherent(&priv->pdev->dev,
0082                    priv->num_event_counters *
0083                    sizeof(*priv->counter_array),
0084                    &priv->counter_array_bus, GFP_KERNEL);
0085     if (!priv->counter_array)
0086         return -ENOMEM;
0087 
0088     return 0;
0089 }
0090 
0091 static void gve_free_counter_array(struct gve_priv *priv)
0092 {
0093     if (!priv->counter_array)
0094         return;
0095 
0096     dma_free_coherent(&priv->pdev->dev,
0097               priv->num_event_counters *
0098               sizeof(*priv->counter_array),
0099               priv->counter_array, priv->counter_array_bus);
0100     priv->counter_array = NULL;
0101 }
0102 
0103 /* NIC requests to report stats */
0104 static void gve_stats_report_task(struct work_struct *work)
0105 {
0106     struct gve_priv *priv = container_of(work, struct gve_priv,
0107                          stats_report_task);
0108     if (gve_get_do_report_stats(priv)) {
0109         gve_handle_report_stats(priv);
0110         gve_clear_do_report_stats(priv);
0111     }
0112 }
0113 
0114 static void gve_stats_report_schedule(struct gve_priv *priv)
0115 {
0116     if (!gve_get_probe_in_progress(priv) &&
0117         !gve_get_reset_in_progress(priv)) {
0118         gve_set_do_report_stats(priv);
0119         queue_work(priv->gve_wq, &priv->stats_report_task);
0120     }
0121 }
0122 
0123 static void gve_stats_report_timer(struct timer_list *t)
0124 {
0125     struct gve_priv *priv = from_timer(priv, t, stats_report_timer);
0126 
0127     mod_timer(&priv->stats_report_timer,
0128           round_jiffies(jiffies +
0129           msecs_to_jiffies(priv->stats_report_timer_period)));
0130     gve_stats_report_schedule(priv);
0131 }
0132 
0133 static int gve_alloc_stats_report(struct gve_priv *priv)
0134 {
0135     int tx_stats_num, rx_stats_num;
0136 
0137     tx_stats_num = (GVE_TX_STATS_REPORT_NUM + NIC_TX_STATS_REPORT_NUM) *
0138                priv->tx_cfg.num_queues;
0139     rx_stats_num = (GVE_RX_STATS_REPORT_NUM + NIC_RX_STATS_REPORT_NUM) *
0140                priv->rx_cfg.num_queues;
0141     priv->stats_report_len = struct_size(priv->stats_report, stats,
0142                          tx_stats_num + rx_stats_num);
0143     priv->stats_report =
0144         dma_alloc_coherent(&priv->pdev->dev, priv->stats_report_len,
0145                    &priv->stats_report_bus, GFP_KERNEL);
0146     if (!priv->stats_report)
0147         return -ENOMEM;
0148     /* Set up timer for the report-stats task */
0149     timer_setup(&priv->stats_report_timer, gve_stats_report_timer, 0);
0150     priv->stats_report_timer_period = GVE_STATS_REPORT_TIMER_PERIOD;
0151     return 0;
0152 }
0153 
0154 static void gve_free_stats_report(struct gve_priv *priv)
0155 {
0156     if (!priv->stats_report)
0157         return;
0158 
0159     del_timer_sync(&priv->stats_report_timer);
0160     dma_free_coherent(&priv->pdev->dev, priv->stats_report_len,
0161               priv->stats_report, priv->stats_report_bus);
0162     priv->stats_report = NULL;
0163 }
0164 
0165 static irqreturn_t gve_mgmnt_intr(int irq, void *arg)
0166 {
0167     struct gve_priv *priv = arg;
0168 
0169     queue_work(priv->gve_wq, &priv->service_task);
0170     return IRQ_HANDLED;
0171 }
0172 
0173 static irqreturn_t gve_intr(int irq, void *arg)
0174 {
0175     struct gve_notify_block *block = arg;
0176     struct gve_priv *priv = block->priv;
0177 
0178     iowrite32be(GVE_IRQ_MASK, gve_irq_doorbell(priv, block));
0179     napi_schedule_irqoff(&block->napi);
0180     return IRQ_HANDLED;
0181 }
0182 
0183 static irqreturn_t gve_intr_dqo(int irq, void *arg)
0184 {
0185     struct gve_notify_block *block = arg;
0186 
0187     /* Interrupts are automatically masked */
0188     napi_schedule_irqoff(&block->napi);
0189     return IRQ_HANDLED;
0190 }
0191 
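/* NAPI handler for the GQI queue format.  Poll TX then RX for this
 * notification block; if the budget was exhausted, stay scheduled.
 * Otherwise complete NAPI, ack/unmask the interrupt via the doorbell,
 * and recheck for work that raced with the unmask.
 */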
0192 static int gve_napi_poll(struct napi_struct *napi, int budget)
0193 {
0194     struct gve_notify_block *block;
0195     __be32 __iomem *irq_doorbell;
0196     bool reschedule = false;
0197     struct gve_priv *priv;
0198     int work_done = 0;
0199 
0200     block = container_of(napi, struct gve_notify_block, napi);
0201     priv = block->priv;
0202 
0203     if (block->tx)
0204         reschedule |= gve_tx_poll(block, budget);
0205     if (block->rx) {
0206         work_done = gve_rx_poll(block, budget);
0207         reschedule |= work_done == budget;
0208     }
0209 
0210     if (reschedule)
0211         return budget;
0212 
0213     /* Complete processing - don't unmask irq if busy polling is enabled */
0214     if (likely(napi_complete_done(napi, work_done))) {
0215         irq_doorbell = gve_irq_doorbell(priv, block);
0216         iowrite32be(GVE_IRQ_ACK | GVE_IRQ_EVENT, irq_doorbell);
0217 
0218         /* Ensure the IRQ ACK is visible before we check for pending work.
0219          * Any updates the queue has issued will then be visible as well.
0220          */
0221         mb();
0222 
0223         if (block->tx)
0224             reschedule |= gve_tx_clean_pending(priv, block->tx);
0225         if (block->rx)
0226             reschedule |= gve_rx_work_pending(block->rx);
0227 
0228         if (reschedule && napi_reschedule(napi))
0229             iowrite32be(GVE_IRQ_MASK, irq_doorbell);
0230     }
0231     return work_done;
0232 }
0233 
0234 static int gve_napi_poll_dqo(struct napi_struct *napi, int budget)
0235 {
0236     struct gve_notify_block *block =
0237         container_of(napi, struct gve_notify_block, napi);
0238     struct gve_priv *priv = block->priv;
0239     bool reschedule = false;
0240     int work_done = 0;
0241 
0242     /* Clear PCI MSI-X Pending Bit Array (PBA)
0243      *
0244      * This bit is set if an interrupt event occurs while the vector is
0245      * masked. If this bit is set and we reenable the interrupt, it will
0246      * fire again. Since we're just about to poll the queue state, we don't
0247      * need it to fire again.
0248      *
0249      * Under high softirq load, it's possible that the interrupt condition
0250      * is triggered twice before we get the chance to process it.
0251      */
0252     gve_write_irq_doorbell_dqo(priv, block,
0253                    GVE_ITR_NO_UPDATE_DQO | GVE_ITR_CLEAR_PBA_BIT_DQO);
0254 
0255     if (block->tx)
0256         reschedule |= gve_tx_poll_dqo(block, /*do_clean=*/true);
0257 
0258     if (block->rx) {
0259         work_done = gve_rx_poll_dqo(block, budget);
0260         reschedule |= work_done == budget;
0261     }
0262 
0263     if (reschedule)
0264         return budget;
0265 
0266     if (likely(napi_complete_done(napi, work_done))) {
0267         /* Enable interrupts again.
0268          *
0269          * We don't need to repoll afterwards because HW supports the
0270          * PCI MSI-X PBA feature.
0271          *
0272          * Another interrupt would be triggered if a new event came in
0273          * since the last one.
0274          */
0275         gve_write_irq_doorbell_dqo(priv, block,
0276                        GVE_ITR_NO_UPDATE_DQO | GVE_ITR_ENABLE_BIT_DQO);
0277     }
0278 
0279     return work_done;
0280 }
0281 
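/* Set up MSI-X: one vector per notification block plus one management
 * vector.  If fewer vectors are granted, shrink the number of blocks and
 * cap the TX/RX queue counts to match, then request the IRQs and spread
 * their affinity across the online CPUs.
 */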
0282 static int gve_alloc_notify_blocks(struct gve_priv *priv)
0283 {
0284     int num_vecs_requested = priv->num_ntfy_blks + 1;
0285     char *name = priv->dev->name;
0286     unsigned int active_cpus;
0287     int vecs_enabled;
0288     int i, j;
0289     int err;
0290 
0291     priv->msix_vectors = kvcalloc(num_vecs_requested,
0292                       sizeof(*priv->msix_vectors), GFP_KERNEL);
0293     if (!priv->msix_vectors)
0294         return -ENOMEM;
0295     for (i = 0; i < num_vecs_requested; i++)
0296         priv->msix_vectors[i].entry = i;
0297     vecs_enabled = pci_enable_msix_range(priv->pdev, priv->msix_vectors,
0298                          GVE_MIN_MSIX, num_vecs_requested);
0299     if (vecs_enabled < 0) {
0300         dev_err(&priv->pdev->dev, "Could not enable min msix %d/%d\n",
0301             GVE_MIN_MSIX, vecs_enabled);
0302         err = vecs_enabled;
0303         goto abort_with_msix_vectors;
0304     }
0305     if (vecs_enabled != num_vecs_requested) {
0306         int new_num_ntfy_blks = (vecs_enabled - 1) & ~0x1;
0307         int vecs_per_type = new_num_ntfy_blks / 2;
0308         int vecs_left = new_num_ntfy_blks % 2;
0309 
0310         priv->num_ntfy_blks = new_num_ntfy_blks;
0311         priv->mgmt_msix_idx = priv->num_ntfy_blks;
0312         priv->tx_cfg.max_queues = min_t(int, priv->tx_cfg.max_queues,
0313                         vecs_per_type);
0314         priv->rx_cfg.max_queues = min_t(int, priv->rx_cfg.max_queues,
0315                         vecs_per_type + vecs_left);
0316         dev_err(&priv->pdev->dev,
0317             "Could not enable desired msix, only enabled %d, adjusting tx max queues to %d, and rx max queues to %d\n",
0318             vecs_enabled, priv->tx_cfg.max_queues,
0319             priv->rx_cfg.max_queues);
0320         if (priv->tx_cfg.num_queues > priv->tx_cfg.max_queues)
0321             priv->tx_cfg.num_queues = priv->tx_cfg.max_queues;
0322         if (priv->rx_cfg.num_queues > priv->rx_cfg.max_queues)
0323             priv->rx_cfg.num_queues = priv->rx_cfg.max_queues;
0324     }
0325     /* Half the notification blocks go to TX and half to RX */
0326     active_cpus = min_t(int, priv->num_ntfy_blks / 2, num_online_cpus());
0327 
0328     /* Setup Management Vector  - the last vector */
0329     snprintf(priv->mgmt_msix_name, sizeof(priv->mgmt_msix_name), "%s-mgmnt",
0330          name);
0331     err = request_irq(priv->msix_vectors[priv->mgmt_msix_idx].vector,
0332               gve_mgmnt_intr, 0, priv->mgmt_msix_name, priv);
0333     if (err) {
0334         dev_err(&priv->pdev->dev, "Did not receive management vector.\n");
0335         goto abort_with_msix_enabled;
0336     }
0337     priv->irq_db_indices =
0338         dma_alloc_coherent(&priv->pdev->dev,
0339                    priv->num_ntfy_blks *
0340                    sizeof(*priv->irq_db_indices),
0341                    &priv->irq_db_indices_bus, GFP_KERNEL);
0342     if (!priv->irq_db_indices) {
0343         err = -ENOMEM;
0344         goto abort_with_mgmt_vector;
0345     }
0346 
0347     priv->ntfy_blocks = kvzalloc(priv->num_ntfy_blks *
0348                      sizeof(*priv->ntfy_blocks), GFP_KERNEL);
0349     if (!priv->ntfy_blocks) {
0350         err = -ENOMEM;
0351         goto abort_with_irq_db_indices;
0352     }
0353 
0354     /* Setup the other blocks - the first n-1 vectors */
0355     for (i = 0; i < priv->num_ntfy_blks; i++) {
0356         struct gve_notify_block *block = &priv->ntfy_blocks[i];
0357         int msix_idx = i;
0358 
0359         snprintf(block->name, sizeof(block->name), "%s-ntfy-block.%d",
0360              name, i);
0361         block->priv = priv;
0362         err = request_irq(priv->msix_vectors[msix_idx].vector,
0363                   gve_is_gqi(priv) ? gve_intr : gve_intr_dqo,
0364                   0, block->name, block);
0365         if (err) {
0366             dev_err(&priv->pdev->dev,
0367                 "Failed to receive msix vector %d\n", i);
0368             goto abort_with_some_ntfy_blocks;
0369         }
0370         irq_set_affinity_hint(priv->msix_vectors[msix_idx].vector,
0371                       get_cpu_mask(i % active_cpus));
0372         block->irq_db_index = &priv->irq_db_indices[i].index;
0373     }
0374     return 0;
0375 abort_with_some_ntfy_blocks:
0376     for (j = 0; j < i; j++) {
0377         struct gve_notify_block *block = &priv->ntfy_blocks[j];
0378         int msix_idx = j;
0379 
0380         irq_set_affinity_hint(priv->msix_vectors[msix_idx].vector,
0381                       NULL);
0382         free_irq(priv->msix_vectors[msix_idx].vector, block);
0383     }
0384     kvfree(priv->ntfy_blocks);
0385     priv->ntfy_blocks = NULL;
0386 abort_with_irq_db_indices:
0387     dma_free_coherent(&priv->pdev->dev, priv->num_ntfy_blks *
0388               sizeof(*priv->irq_db_indices),
0389               priv->irq_db_indices, priv->irq_db_indices_bus);
0390     priv->irq_db_indices = NULL;
0391 abort_with_mgmt_vector:
0392     free_irq(priv->msix_vectors[priv->mgmt_msix_idx].vector, priv);
0393 abort_with_msix_enabled:
0394     pci_disable_msix(priv->pdev);
0395 abort_with_msix_vectors:
0396     kvfree(priv->msix_vectors);
0397     priv->msix_vectors = NULL;
0398     return err;
0399 }
0400 
0401 static void gve_free_notify_blocks(struct gve_priv *priv)
0402 {
0403     int i;
0404 
0405     if (!priv->msix_vectors)
0406         return;
0407 
0408     /* Free the irqs */
0409     for (i = 0; i < priv->num_ntfy_blks; i++) {
0410         struct gve_notify_block *block = &priv->ntfy_blocks[i];
0411         int msix_idx = i;
0412 
0413         irq_set_affinity_hint(priv->msix_vectors[msix_idx].vector,
0414                       NULL);
0415         free_irq(priv->msix_vectors[msix_idx].vector, block);
0416     }
0417     free_irq(priv->msix_vectors[priv->mgmt_msix_idx].vector, priv);
0418     kvfree(priv->ntfy_blocks);
0419     priv->ntfy_blocks = NULL;
0420     dma_free_coherent(&priv->pdev->dev, priv->num_ntfy_blks *
0421               sizeof(*priv->irq_db_indices),
0422               priv->irq_db_indices, priv->irq_db_indices_bus);
0423     priv->irq_db_indices = NULL;
0424     pci_disable_msix(priv->pdev);
0425     kvfree(priv->msix_vectors);
0426     priv->msix_vectors = NULL;
0427 }
0428 
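/* Allocate the counter array, notification blocks and stats report, then
 * describe them to the device over the admin queue.  DQO-RDA devices also
 * need the packet type lookup table at this point.
 */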
0429 static int gve_setup_device_resources(struct gve_priv *priv)
0430 {
0431     int err;
0432 
0433     err = gve_alloc_counter_array(priv);
0434     if (err)
0435         return err;
0436     err = gve_alloc_notify_blocks(priv);
0437     if (err)
0438         goto abort_with_counter;
0439     err = gve_alloc_stats_report(priv);
0440     if (err)
0441         goto abort_with_ntfy_blocks;
0442     err = gve_adminq_configure_device_resources(priv,
0443                             priv->counter_array_bus,
0444                             priv->num_event_counters,
0445                             priv->irq_db_indices_bus,
0446                             priv->num_ntfy_blks);
0447     if (unlikely(err)) {
0448         dev_err(&priv->pdev->dev,
0449             "could not setup device_resources: err=%d\n", err);
0450         err = -ENXIO;
0451         goto abort_with_stats_report;
0452     }
0453 
0454     if (priv->queue_format == GVE_DQO_RDA_FORMAT) {
0455         priv->ptype_lut_dqo = kvzalloc(sizeof(*priv->ptype_lut_dqo),
0456                            GFP_KERNEL);
0457         if (!priv->ptype_lut_dqo) {
0458             err = -ENOMEM;
0459             goto abort_with_stats_report;
0460         }
0461         err = gve_adminq_get_ptype_map_dqo(priv, priv->ptype_lut_dqo);
0462         if (err) {
0463             dev_err(&priv->pdev->dev,
0464                 "Failed to get ptype map: err=%d\n", err);
0465             goto abort_with_ptype_lut;
0466         }
0467     }
0468 
0469     err = gve_adminq_report_stats(priv, priv->stats_report_len,
0470                       priv->stats_report_bus,
0471                       GVE_STATS_REPORT_TIMER_PERIOD);
0472     if (err)
0473         dev_err(&priv->pdev->dev,
0474             "Failed to report stats: err=%d\n", err);
0475     gve_set_device_resources_ok(priv);
0476     return 0;
0477 
0478 abort_with_ptype_lut:
0479     kvfree(priv->ptype_lut_dqo);
0480     priv->ptype_lut_dqo = NULL;
0481 abort_with_stats_report:
0482     gve_free_stats_report(priv);
0483 abort_with_ntfy_blocks:
0484     gve_free_notify_blocks(priv);
0485 abort_with_counter:
0486     gve_free_counter_array(priv);
0487 
0488     return err;
0489 }
0490 
0491 static void gve_trigger_reset(struct gve_priv *priv);
0492 
0493 static void gve_teardown_device_resources(struct gve_priv *priv)
0494 {
0495     int err;
0496 
0497     /* Tell device its resources are being freed */
0498     if (gve_get_device_resources_ok(priv)) {
0499         /* detach the stats report */
0500         err = gve_adminq_report_stats(priv, 0, 0x0, GVE_STATS_REPORT_TIMER_PERIOD);
0501         if (err) {
0502             dev_err(&priv->pdev->dev,
0503                 "Failed to detach stats report: err=%d\n", err);
0504             gve_trigger_reset(priv);
0505         }
0506         err = gve_adminq_deconfigure_device_resources(priv);
0507         if (err) {
0508             dev_err(&priv->pdev->dev,
0509                 "Could not deconfigure device resources: err=%d\n",
0510                 err);
0511             gve_trigger_reset(priv);
0512         }
0513     }
0514 
0515     kvfree(priv->ptype_lut_dqo);
0516     priv->ptype_lut_dqo = NULL;
0517 
0518     gve_free_counter_array(priv);
0519     gve_free_notify_blocks(priv);
0520     gve_free_stats_report(priv);
0521     gve_clear_device_resources_ok(priv);
0522 }
0523 
0524 static void gve_add_napi(struct gve_priv *priv, int ntfy_idx,
0525              int (*gve_poll)(struct napi_struct *, int))
0526 {
0527     struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx];
0528 
0529     netif_napi_add(priv->dev, &block->napi, gve_poll,
0530                NAPI_POLL_WEIGHT);
0531 }
0532 
0533 static void gve_remove_napi(struct gve_priv *priv, int ntfy_idx)
0534 {
0535     struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx];
0536 
0537     netif_napi_del(&block->napi);
0538 }
0539 
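/* Register every allocated queue page list with the device.  Failures are
 * left to the reset path to clean up, as noted below.
 */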
0540 static int gve_register_qpls(struct gve_priv *priv)
0541 {
0542     int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv);
0543     int err;
0544     int i;
0545 
0546     for (i = 0; i < num_qpls; i++) {
0547         err = gve_adminq_register_page_list(priv, &priv->qpls[i]);
0548         if (err) {
0549             netif_err(priv, drv, priv->dev,
0550                   "failed to register queue page list %d\n",
0551                   priv->qpls[i].id);
0552             /* This failure will trigger a reset - no need to clean
0553              * up
0554              */
0555             return err;
0556         }
0557     }
0558     return 0;
0559 }
0560 
0561 static int gve_unregister_qpls(struct gve_priv *priv)
0562 {
0563     int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv);
0564     int err;
0565     int i;
0566 
0567     for (i = 0; i < num_qpls; i++) {
0568         err = gve_adminq_unregister_page_list(priv, priv->qpls[i].id);
0569         /* This failure will trigger a reset - no need to clean up */
0570         if (err) {
0571             netif_err(priv, drv, priv->dev,
0572                   "Failed to unregister queue page list %d\n",
0573                   priv->qpls[i].id);
0574             return err;
0575         }
0576     }
0577     return 0;
0578 }
0579 
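/* Ask the device to create the TX and RX queues, then make RX buffers
 * available: GQI rings the RX doorbell (buffers were prefilled at alloc
 * time), DQO posts buffers explicitly.
 */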
0580 static int gve_create_rings(struct gve_priv *priv)
0581 {
0582     int err;
0583     int i;
0584 
0585     err = gve_adminq_create_tx_queues(priv, priv->tx_cfg.num_queues);
0586     if (err) {
0587         netif_err(priv, drv, priv->dev, "failed to create %d tx queues\n",
0588               priv->tx_cfg.num_queues);
0589         /* This failure will trigger a reset - no need to clean
0590          * up
0591          */
0592         return err;
0593     }
0594     netif_dbg(priv, drv, priv->dev, "created %d tx queues\n",
0595           priv->tx_cfg.num_queues);
0596 
0597     err = gve_adminq_create_rx_queues(priv, priv->rx_cfg.num_queues);
0598     if (err) {
0599         netif_err(priv, drv, priv->dev, "failed to create %d rx queues\n",
0600               priv->rx_cfg.num_queues);
0601         /* This failure will trigger a reset - no need to clean
0602          * up
0603          */
0604         return err;
0605     }
0606     netif_dbg(priv, drv, priv->dev, "created %d rx queues\n",
0607           priv->rx_cfg.num_queues);
0608 
0609     if (gve_is_gqi(priv)) {
0610         /* Rx data ring has been prefilled with packet buffers at queue
0611          * allocation time.
0612          *
0613          * Write the doorbell to provide descriptor slots and packet
0614          * buffers to the NIC.
0615          */
0616         for (i = 0; i < priv->rx_cfg.num_queues; i++)
0617             gve_rx_write_doorbell(priv, &priv->rx[i]);
0618     } else {
0619         for (i = 0; i < priv->rx_cfg.num_queues; i++) {
0620             /* Post buffers and ring doorbell. */
0621             gve_rx_post_buffers_dqo(&priv->rx[i]);
0622         }
0623     }
0624 
0625     return 0;
0626 }
0627 
0628 static void add_napi_init_sync_stats(struct gve_priv *priv,
0629                      int (*napi_poll)(struct napi_struct *napi,
0630                               int budget))
0631 {
0632     int i;
0633 
0634     /* Add tx napi & init sync stats */
0635     for (i = 0; i < priv->tx_cfg.num_queues; i++) {
0636         int ntfy_idx = gve_tx_idx_to_ntfy(priv, i);
0637 
0638         u64_stats_init(&priv->tx[i].statss);
0639         priv->tx[i].ntfy_id = ntfy_idx;
0640         gve_add_napi(priv, ntfy_idx, napi_poll);
0641     }
0642     /* Add rx napi & init sync stats */
0643     for (i = 0; i < priv->rx_cfg.num_queues; i++) {
0644         int ntfy_idx = gve_rx_idx_to_ntfy(priv, i);
0645 
0646         u64_stats_init(&priv->rx[i].statss);
0647         priv->rx[i].ntfy_id = ntfy_idx;
0648         gve_add_napi(priv, ntfy_idx, napi_poll);
0649     }
0650 }
0651 
0652 static void gve_tx_free_rings(struct gve_priv *priv)
0653 {
0654     if (gve_is_gqi(priv)) {
0655         gve_tx_free_rings_gqi(priv);
0656     } else {
0657         gve_tx_free_rings_dqo(priv);
0658     }
0659 }
0660 
0661 static int gve_alloc_rings(struct gve_priv *priv)
0662 {
0663     int err;
0664 
0665     /* Setup tx rings */
0666     priv->tx = kvcalloc(priv->tx_cfg.num_queues, sizeof(*priv->tx),
0667                 GFP_KERNEL);
0668     if (!priv->tx)
0669         return -ENOMEM;
0670 
0671     if (gve_is_gqi(priv))
0672         err = gve_tx_alloc_rings(priv);
0673     else
0674         err = gve_tx_alloc_rings_dqo(priv);
0675     if (err)
0676         goto free_tx;
0677 
0678     /* Setup rx rings */
0679     priv->rx = kvcalloc(priv->rx_cfg.num_queues, sizeof(*priv->rx),
0680                 GFP_KERNEL);
0681     if (!priv->rx) {
0682         err = -ENOMEM;
0683         goto free_tx_queue;
0684     }
0685 
0686     if (gve_is_gqi(priv))
0687         err = gve_rx_alloc_rings(priv);
0688     else
0689         err = gve_rx_alloc_rings_dqo(priv);
0690     if (err)
0691         goto free_rx;
0692 
0693     if (gve_is_gqi(priv))
0694         add_napi_init_sync_stats(priv, gve_napi_poll);
0695     else
0696         add_napi_init_sync_stats(priv, gve_napi_poll_dqo);
0697 
0698     return 0;
0699 
0700 free_rx:
0701     kvfree(priv->rx);
0702     priv->rx = NULL;
0703 free_tx_queue:
0704     gve_tx_free_rings(priv);
0705 free_tx:
0706     kvfree(priv->tx);
0707     priv->tx = NULL;
0708     return err;
0709 }
0710 
0711 static int gve_destroy_rings(struct gve_priv *priv)
0712 {
0713     int err;
0714 
0715     err = gve_adminq_destroy_tx_queues(priv, priv->tx_cfg.num_queues);
0716     if (err) {
0717         netif_err(priv, drv, priv->dev,
0718               "failed to destroy tx queues\n");
0719         /* This failure will trigger a reset - no need to clean up */
0720         return err;
0721     }
0722     netif_dbg(priv, drv, priv->dev, "destroyed tx queues\n");
0723     err = gve_adminq_destroy_rx_queues(priv, priv->rx_cfg.num_queues);
0724     if (err) {
0725         netif_err(priv, drv, priv->dev,
0726               "failed to destroy rx queues\n");
0727         /* This failure will trigger a reset - no need to clean up */
0728         return err;
0729     }
0730     netif_dbg(priv, drv, priv->dev, "destroyed rx queues\n");
0731     return 0;
0732 }
0733 
0734 static void gve_rx_free_rings(struct gve_priv *priv)
0735 {
0736     if (gve_is_gqi(priv))
0737         gve_rx_free_rings_gqi(priv);
0738     else
0739         gve_rx_free_rings_dqo(priv);
0740 }
0741 
0742 static void gve_free_rings(struct gve_priv *priv)
0743 {
0744     int ntfy_idx;
0745     int i;
0746 
0747     if (priv->tx) {
0748         for (i = 0; i < priv->tx_cfg.num_queues; i++) {
0749             ntfy_idx = gve_tx_idx_to_ntfy(priv, i);
0750             gve_remove_napi(priv, ntfy_idx);
0751         }
0752         gve_tx_free_rings(priv);
0753         kvfree(priv->tx);
0754         priv->tx = NULL;
0755     }
0756     if (priv->rx) {
0757         for (i = 0; i < priv->rx_cfg.num_queues; i++) {
0758             ntfy_idx = gve_rx_idx_to_ntfy(priv, i);
0759             gve_remove_napi(priv, ntfy_idx);
0760         }
0761         gve_rx_free_rings(priv);
0762         kvfree(priv->rx);
0763         priv->rx = NULL;
0764     }
0765 }
0766 
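/* Allocate a page and DMA-map it for the device, bumping the relevant
 * failure counter if either step fails.
 */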
0767 int gve_alloc_page(struct gve_priv *priv, struct device *dev,
0768            struct page **page, dma_addr_t *dma,
0769            enum dma_data_direction dir, gfp_t gfp_flags)
0770 {
0771     *page = alloc_page(gfp_flags);
0772     if (!*page) {
0773         priv->page_alloc_fail++;
0774         return -ENOMEM;
0775     }
0776     *dma = dma_map_page(dev, *page, 0, PAGE_SIZE, dir);
0777     if (dma_mapping_error(dev, *dma)) {
0778         priv->dma_mapping_error++;
0779         put_page(*page);
0780         return -ENOMEM;
0781     }
0782     return 0;
0783 }
0784 
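/* Populate queue page list @id with @pages freshly allocated, DMA-mapped
 * pages, honoring the device's limit on registered pages.  Partial
 * failures are cleaned up by the caller.
 */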
0785 static int gve_alloc_queue_page_list(struct gve_priv *priv, u32 id,
0786                      int pages)
0787 {
0788     struct gve_queue_page_list *qpl = &priv->qpls[id];
0789     int err;
0790     int i;
0791 
0792     if (pages + priv->num_registered_pages > priv->max_registered_pages) {
0793         netif_err(priv, drv, priv->dev,
0794               "Reached max number of registered pages %llu > %llu\n",
0795               pages + priv->num_registered_pages,
0796               priv->max_registered_pages);
0797         return -EINVAL;
0798     }
0799 
0800     qpl->id = id;
0801     qpl->num_entries = 0;
0802     qpl->pages = kvcalloc(pages, sizeof(*qpl->pages), GFP_KERNEL);
0803     /* caller handles clean up */
0804     if (!qpl->pages)
0805         return -ENOMEM;
0806     qpl->page_buses = kvcalloc(pages, sizeof(*qpl->page_buses), GFP_KERNEL);
0807     /* caller handles clean up */
0808     if (!qpl->page_buses)
0809         return -ENOMEM;
0810 
0811     for (i = 0; i < pages; i++) {
0812         err = gve_alloc_page(priv, &priv->pdev->dev, &qpl->pages[i],
0813                      &qpl->page_buses[i],
0814                      gve_qpl_dma_dir(priv, id), GFP_KERNEL);
0815         /* caller handles clean up */
0816         if (err)
0817             return -ENOMEM;
0818         qpl->num_entries++;
0819     }
0820     priv->num_registered_pages += pages;
0821 
0822     return 0;
0823 }
0824 
0825 void gve_free_page(struct device *dev, struct page *page, dma_addr_t dma,
0826            enum dma_data_direction dir)
0827 {
0828     if (!dma_mapping_error(dev, dma))
0829         dma_unmap_page(dev, dma, PAGE_SIZE, dir);
0830     if (page)
0831         put_page(page);
0832 }
0833 
0834 static void gve_free_queue_page_list(struct gve_priv *priv, u32 id)
0835 {
0836     struct gve_queue_page_list *qpl = &priv->qpls[id];
0837     int i;
0838 
0839     if (!qpl->pages)
0840         return;
0841     if (!qpl->page_buses)
0842         goto free_pages;
0843 
0844     for (i = 0; i < qpl->num_entries; i++)
0845         gve_free_page(&priv->pdev->dev, qpl->pages[i],
0846                   qpl->page_buses[i], gve_qpl_dma_dir(priv, id));
0847 
0848     kvfree(qpl->page_buses);
0849 free_pages:
0850     kvfree(qpl->pages);
0851     priv->num_registered_pages -= qpl->num_entries;
0852 }
0853 
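/* Allocate the queue page lists: one per TX queue and one per RX queue
 * when the queue format uses QPLs (num_qpls is zero otherwise), plus the
 * bitmap that tracks which QPLs are in use.
 */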
0854 static int gve_alloc_qpls(struct gve_priv *priv)
0855 {
0856     int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv);
0857     int i, j;
0858     int err;
0859 
0860     if (num_qpls == 0)
0861         return 0;
0862 
0863     priv->qpls = kvcalloc(num_qpls, sizeof(*priv->qpls), GFP_KERNEL);
0864     if (!priv->qpls)
0865         return -ENOMEM;
0866 
0867     for (i = 0; i < gve_num_tx_qpls(priv); i++) {
0868         err = gve_alloc_queue_page_list(priv, i,
0869                         priv->tx_pages_per_qpl);
0870         if (err)
0871             goto free_qpls;
0872     }
0873     for (; i < num_qpls; i++) {
0874         err = gve_alloc_queue_page_list(priv, i,
0875                         priv->rx_data_slot_cnt);
0876         if (err)
0877             goto free_qpls;
0878     }
0879 
0880     priv->qpl_cfg.qpl_map_size = BITS_TO_LONGS(num_qpls) *
0881                      sizeof(unsigned long) * BITS_PER_BYTE;
0882     priv->qpl_cfg.qpl_id_map = kvcalloc(BITS_TO_LONGS(num_qpls),
0883                         sizeof(unsigned long), GFP_KERNEL);
0884     if (!priv->qpl_cfg.qpl_id_map) {
0885         err = -ENOMEM;
0886         goto free_qpls;
0887     }
0888 
0889     return 0;
0890 
0891 free_qpls:
0892     for (j = 0; j <= i; j++)
0893         gve_free_queue_page_list(priv, j);
0894     kvfree(priv->qpls);
0895     return err;
0896 }
0897 
0898 static void gve_free_qpls(struct gve_priv *priv)
0899 {
0900     int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv);
0901     int i;
0902 
0903     if (num_qpls == 0)
0904         return;
0905 
0906     kvfree(priv->qpl_cfg.qpl_id_map);
0907 
0908     for (i = 0; i < num_qpls; i++)
0909         gve_free_queue_page_list(priv, i);
0910 
0911     kvfree(priv->qpls);
0912 }
0913 
0914 /* Use this to schedule a reset when the device is capable of continuing
0915  * to handle other requests in its current state. If it is not, do a reset
0916  * in thread instead.
0917  */
0918 void gve_schedule_reset(struct gve_priv *priv)
0919 {
0920     gve_set_do_reset(priv);
0921     queue_work(priv->gve_wq, &priv->service_task);
0922 }
0923 
0924 static void gve_reset_and_teardown(struct gve_priv *priv, bool was_up);
0925 static int gve_reset_recovery(struct gve_priv *priv, bool was_up);
0926 static void gve_turndown(struct gve_priv *priv);
0927 static void gve_turnup(struct gve_priv *priv);
0928 
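/* ndo_open: allocate QPLs and rings, register the QPLs and create the
 * rings on the device, then bring NAPI and the TX queues up.  Failures
 * that leave the device in an unknown state fall through to a reset.
 */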
0929 static int gve_open(struct net_device *dev)
0930 {
0931     struct gve_priv *priv = netdev_priv(dev);
0932     int err;
0933 
0934     err = gve_alloc_qpls(priv);
0935     if (err)
0936         return err;
0937 
0938     err = gve_alloc_rings(priv);
0939     if (err)
0940         goto free_qpls;
0941 
0942     err = netif_set_real_num_tx_queues(dev, priv->tx_cfg.num_queues);
0943     if (err)
0944         goto free_rings;
0945     err = netif_set_real_num_rx_queues(dev, priv->rx_cfg.num_queues);
0946     if (err)
0947         goto free_rings;
0948 
0949     err = gve_register_qpls(priv);
0950     if (err)
0951         goto reset;
0952 
0953     if (!gve_is_gqi(priv)) {
0954         /* Hard code this for now. This may be tuned in the future for
0955          * performance.
0956          */
0957         priv->data_buffer_size_dqo = GVE_RX_BUFFER_SIZE_DQO;
0958     }
0959     err = gve_create_rings(priv);
0960     if (err)
0961         goto reset;
0962 
0963     gve_set_device_rings_ok(priv);
0964 
0965     if (gve_get_report_stats(priv))
0966         mod_timer(&priv->stats_report_timer,
0967               round_jiffies(jiffies +
0968                 msecs_to_jiffies(priv->stats_report_timer_period)));
0969 
0970     gve_turnup(priv);
0971     queue_work(priv->gve_wq, &priv->service_task);
0972     priv->interface_up_cnt++;
0973     return 0;
0974 
0975 free_rings:
0976     gve_free_rings(priv);
0977 free_qpls:
0978     gve_free_qpls(priv);
0979     return err;
0980 
0981 reset:
0982     /* This must have been called from a reset due to the rtnl lock
0983      * so just return at this point.
0984      */
0985     if (gve_get_reset_in_progress(priv))
0986         return err;
0987     /* Otherwise reset before returning */
0988     gve_reset_and_teardown(priv, true);
0989     /* if this fails there is nothing we can do so just ignore the return */
0990     gve_reset_recovery(priv, false);
0991     /* return the original error */
0992     return err;
0993 }
0994 
0995 static int gve_close(struct net_device *dev)
0996 {
0997     struct gve_priv *priv = netdev_priv(dev);
0998     int err;
0999 
1000     netif_carrier_off(dev);
1001     if (gve_get_device_rings_ok(priv)) {
1002         gve_turndown(priv);
1003         err = gve_destroy_rings(priv);
1004         if (err)
1005             goto err;
1006         err = gve_unregister_qpls(priv);
1007         if (err)
1008             goto err;
1009         gve_clear_device_rings_ok(priv);
1010     }
1011     del_timer_sync(&priv->stats_report_timer);
1012 
1013     gve_free_rings(priv);
1014     gve_free_qpls(priv);
1015     priv->interface_down_cnt++;
1016     return 0;
1017 
1018 err:
1019     /* This must have been called from a reset due to the rtnl lock
1020      * so just return at this point.
1021      */
1022     if (gve_get_reset_in_progress(priv))
1023         return err;
1024     /* Otherwise reset before returning */
1025     gve_reset_and_teardown(priv, true);
1026     return gve_reset_recovery(priv, false);
1027 }
1028 
1029 int gve_adjust_queues(struct gve_priv *priv,
1030               struct gve_queue_config new_rx_config,
1031               struct gve_queue_config new_tx_config)
1032 {
1033     int err;
1034 
1035     if (netif_carrier_ok(priv->dev)) {
1036         /* To make this process as simple as possible we teardown the
1037          * device, set the new configuration, and then bring the device
1038          * up again.
1039          */
1040         err = gve_close(priv->dev);
1041         /* we have already tried to reset in close,
1042          * just fail at this point
1043          */
1044         if (err)
1045             return err;
1046         priv->tx_cfg = new_tx_config;
1047         priv->rx_cfg = new_rx_config;
1048 
1049         err = gve_open(priv->dev);
1050         if (err)
1051             goto err;
1052 
1053         return 0;
1054     }
1055     /* Set the config for the next up. */
1056     priv->tx_cfg = new_tx_config;
1057     priv->rx_cfg = new_rx_config;
1058 
1059     return 0;
1060 err:
1061     netif_err(priv, drv, priv->dev,
1062           "Adjust queues failed! !!! DISABLING ALL QUEUES !!!\n");
1063     gve_turndown(priv);
1064     return err;
1065 }
1066 
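/* Quiesce the data path: drop the carrier, disable NAPI on every
 * notification block, stop the TX queues and clear the napi-enabled and
 * report-stats flags.
 */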
1067 static void gve_turndown(struct gve_priv *priv)
1068 {
1069     int idx;
1070 
1071     if (netif_carrier_ok(priv->dev))
1072         netif_carrier_off(priv->dev);
1073 
1074     if (!gve_get_napi_enabled(priv))
1075         return;
1076 
1077     /* Disable napi to prevent more work from coming in */
1078     for (idx = 0; idx < priv->tx_cfg.num_queues; idx++) {
1079         int ntfy_idx = gve_tx_idx_to_ntfy(priv, idx);
1080         struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx];
1081 
1082         napi_disable(&block->napi);
1083     }
1084     for (idx = 0; idx < priv->rx_cfg.num_queues; idx++) {
1085         int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);
1086         struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx];
1087 
1088         napi_disable(&block->napi);
1089     }
1090 
1091     /* Stop tx queues */
1092     netif_tx_disable(priv->dev);
1093 
1094     gve_clear_napi_enabled(priv);
1095     gve_clear_report_stats(priv);
1096 }
1097 
1098 static void gve_turnup(struct gve_priv *priv)
1099 {
1100     int idx;
1101 
1102     /* Start the tx queues */
1103     netif_tx_start_all_queues(priv->dev);
1104 
1105     /* Enable napi and unmask interrupts for all queues */
1106     for (idx = 0; idx < priv->tx_cfg.num_queues; idx++) {
1107         int ntfy_idx = gve_tx_idx_to_ntfy(priv, idx);
1108         struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx];
1109 
1110         napi_enable(&block->napi);
1111         if (gve_is_gqi(priv)) {
1112             iowrite32be(0, gve_irq_doorbell(priv, block));
1113         } else {
1114             gve_set_itr_coalesce_usecs_dqo(priv, block,
1115                                priv->tx_coalesce_usecs);
1116         }
1117     }
1118     for (idx = 0; idx < priv->rx_cfg.num_queues; idx++) {
1119         int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);
1120         struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx];
1121 
1122         napi_enable(&block->napi);
1123         if (gve_is_gqi(priv)) {
1124             iowrite32be(0, gve_irq_doorbell(priv, block));
1125         } else {
1126             gve_set_itr_coalesce_usecs_dqo(priv, block,
1127                                priv->rx_coalesce_usecs);
1128         }
1129     }
1130 
1131     gve_set_napi_enabled(priv);
1132 }
1133 
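/* ndo_tx_timeout: if the queue has missed completions and has not been
 * kicked within MIN_TX_TIMEOUT_GAP, mask the interrupt and reschedule
 * NAPI to kick it; otherwise schedule a device reset.
 */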
1134 static void gve_tx_timeout(struct net_device *dev, unsigned int txqueue)
1135 {
1136     struct gve_notify_block *block;
1137     struct gve_tx_ring *tx = NULL;
1138     struct gve_priv *priv;
1139     u32 last_nic_done;
1140     u32 current_time;
1141     u32 ntfy_idx;
1142 
1143     netdev_info(dev, "Timeout on tx queue, %d", txqueue);
1144     priv = netdev_priv(dev);
1145     if (txqueue > priv->tx_cfg.num_queues)
1146         goto reset;
1147 
1148     ntfy_idx = gve_tx_idx_to_ntfy(priv, txqueue);
1149     if (ntfy_idx >= priv->num_ntfy_blks)
1150         goto reset;
1151 
1152     block = &priv->ntfy_blocks[ntfy_idx];
1153     tx = block->tx;
1154 
1155     current_time = jiffies_to_msecs(jiffies);
1156     if (tx->last_kick_msec + MIN_TX_TIMEOUT_GAP > current_time)
1157         goto reset;
1158 
1159     /* Check to see if there are missed completions, which will allow us to
1160      * kick the queue.
1161      */
1162     last_nic_done = gve_tx_load_event_counter(priv, tx);
1163     if (last_nic_done - tx->done) {
1164         netdev_info(dev, "Kicking queue %d", txqueue);
1165         iowrite32be(GVE_IRQ_MASK, gve_irq_doorbell(priv, block));
1166         napi_schedule(&block->napi);
1167         tx->last_kick_msec = current_time;
1168         goto out;
1169     } // Else reset.
1170 
1171 reset:
1172     gve_schedule_reset(priv);
1173 
1174 out:
1175     if (tx)
1176         tx->queue_timeout++;
1177     priv->tx_timeo_cnt++;
1178 }
1179 
1180 static int gve_set_features(struct net_device *netdev,
1181                 netdev_features_t features)
1182 {
1183     const netdev_features_t orig_features = netdev->features;
1184     struct gve_priv *priv = netdev_priv(netdev);
1185     int err;
1186 
1187     if ((netdev->features & NETIF_F_LRO) != (features & NETIF_F_LRO)) {
1188         netdev->features ^= NETIF_F_LRO;
1189         if (netif_carrier_ok(netdev)) {
1190             /* To make this process as simple as possible we
1191              * teardown the device, set the new configuration,
1192              * and then bring the device up again.
1193              */
1194             err = gve_close(netdev);
1195             /* We have already tried to reset in close, just fail
1196              * at this point.
1197              */
1198             if (err)
1199                 goto err;
1200 
1201             err = gve_open(netdev);
1202             if (err)
1203                 goto err;
1204         }
1205     }
1206 
1207     return 0;
1208 err:
1209     /* Reverts the change on error. */
1210     netdev->features = orig_features;
1211     netif_err(priv, drv, netdev,
1212           "Set features failed! !!! DISABLING ALL QUEUES !!!\n");
1213     return err;
1214 }
1215 
1216 static const struct net_device_ops gve_netdev_ops = {
1217     .ndo_start_xmit     =   gve_start_xmit,
1218     .ndo_open       =   gve_open,
1219     .ndo_stop       =   gve_close,
1220     .ndo_get_stats64    =   gve_get_stats,
1221     .ndo_tx_timeout         =       gve_tx_timeout,
1222     .ndo_set_features   =   gve_set_features,
1223 };
1224 
1225 static void gve_handle_status(struct gve_priv *priv, u32 status)
1226 {
1227     if (GVE_DEVICE_STATUS_RESET_MASK & status) {
1228         dev_info(&priv->pdev->dev, "Device requested reset.\n");
1229         gve_set_do_reset(priv);
1230     }
1231     if (GVE_DEVICE_STATUS_REPORT_STATS_MASK & status) {
1232         priv->stats_report_trigger_cnt++;
1233         gve_set_do_report_stats(priv);
1234     }
1235 }
1236 
1237 static void gve_handle_reset(struct gve_priv *priv)
1238 {
1239     /* A service task will be scheduled at the end of probe to catch any
1240      * resets that need to happen, and we don't want to reset until
1241      * probe is done.
1242      */
1243     if (gve_get_probe_in_progress(priv))
1244         return;
1245 
1246     if (gve_get_do_reset(priv)) {
1247         rtnl_lock();
1248         gve_reset(priv, false);
1249         rtnl_unlock();
1250     }
1251 }
1252 
1253 void gve_handle_report_stats(struct gve_priv *priv)
1254 {
1255     struct stats *stats = priv->stats_report->stats;
1256     int idx, stats_idx = 0;
1257     unsigned int start = 0;
1258     u64 tx_bytes;
1259 
1260     if (!gve_get_report_stats(priv))
1261         return;
1262 
1263     be64_add_cpu(&priv->stats_report->written_count, 1);
1264     /* tx stats */
1265     if (priv->tx) {
1266         for (idx = 0; idx < priv->tx_cfg.num_queues; idx++) {
1267             u32 last_completion = 0;
1268             u32 tx_frames = 0;
1269 
1270             /* DQO doesn't currently support these metrics. */
1271             if (gve_is_gqi(priv)) {
1272                 last_completion = priv->tx[idx].done;
1273                 tx_frames = priv->tx[idx].req;
1274             }
1275 
1276             do {
1277                 start = u64_stats_fetch_begin_irq(&priv->tx[idx].statss);
1278                 tx_bytes = priv->tx[idx].bytes_done;
1279             } while (u64_stats_fetch_retry_irq(&priv->tx[idx].statss, start));
1280             stats[stats_idx++] = (struct stats) {
1281                 .stat_name = cpu_to_be32(TX_WAKE_CNT),
1282                 .value = cpu_to_be64(priv->tx[idx].wake_queue),
1283                 .queue_id = cpu_to_be32(idx),
1284             };
1285             stats[stats_idx++] = (struct stats) {
1286                 .stat_name = cpu_to_be32(TX_STOP_CNT),
1287                 .value = cpu_to_be64(priv->tx[idx].stop_queue),
1288                 .queue_id = cpu_to_be32(idx),
1289             };
1290             stats[stats_idx++] = (struct stats) {
1291                 .stat_name = cpu_to_be32(TX_FRAMES_SENT),
1292                 .value = cpu_to_be64(tx_frames),
1293                 .queue_id = cpu_to_be32(idx),
1294             };
1295             stats[stats_idx++] = (struct stats) {
1296                 .stat_name = cpu_to_be32(TX_BYTES_SENT),
1297                 .value = cpu_to_be64(tx_bytes),
1298                 .queue_id = cpu_to_be32(idx),
1299             };
1300             stats[stats_idx++] = (struct stats) {
1301                 .stat_name = cpu_to_be32(TX_LAST_COMPLETION_PROCESSED),
1302                 .value = cpu_to_be64(last_completion),
1303                 .queue_id = cpu_to_be32(idx),
1304             };
1305             stats[stats_idx++] = (struct stats) {
1306                 .stat_name = cpu_to_be32(TX_TIMEOUT_CNT),
1307                 .value = cpu_to_be64(priv->tx[idx].queue_timeout),
1308                 .queue_id = cpu_to_be32(idx),
1309             };
1310         }
1311     }
1312     /* rx stats */
1313     if (priv->rx) {
1314         for (idx = 0; idx < priv->rx_cfg.num_queues; idx++) {
1315             stats[stats_idx++] = (struct stats) {
1316                 .stat_name = cpu_to_be32(RX_NEXT_EXPECTED_SEQUENCE),
1317                 .value = cpu_to_be64(priv->rx[idx].desc.seqno),
1318                 .queue_id = cpu_to_be32(idx),
1319             };
1320             stats[stats_idx++] = (struct stats) {
1321                 .stat_name = cpu_to_be32(RX_BUFFERS_POSTED),
1322                 .value = cpu_to_be64(priv->rx[idx].fill_cnt),
1323                 .queue_id = cpu_to_be32(idx),
1324             };
1325         }
1326     }
1327 }
1328 
1329 static void gve_handle_link_status(struct gve_priv *priv, bool link_status)
1330 {
1331     if (!gve_get_napi_enabled(priv))
1332         return;
1333 
1334     if (link_status == netif_carrier_ok(priv->dev))
1335         return;
1336 
1337     if (link_status) {
1338         netdev_info(priv->dev, "Device link is up.\n");
1339         netif_carrier_on(priv->dev);
1340     } else {
1341         netdev_info(priv->dev, "Device link is down.\n");
1342         netif_carrier_off(priv->dev);
1343     }
1344 }
1345 
1346 /* Handle NIC status register changes, reset requests and report stats */
1347 static void gve_service_task(struct work_struct *work)
1348 {
1349     struct gve_priv *priv = container_of(work, struct gve_priv,
1350                          service_task);
1351     u32 status = ioread32be(&priv->reg_bar0->device_status);
1352 
1353     gve_handle_status(priv, status);
1354 
1355     gve_handle_reset(priv);
1356     gve_handle_link_status(priv, GVE_DEVICE_STATUS_LINK_STATUS_MASK & status);
1357 }
1358 
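/* Bring up the admin queue and, unless skip_describe_device is set, query
 * the device description, size the notification blocks from the available
 * MSI-X vectors (one is reserved for management) and derive the default
 * queue counts before setting up device resources.
 */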
1359 static int gve_init_priv(struct gve_priv *priv, bool skip_describe_device)
1360 {
1361     int num_ntfy;
1362     int err;
1363 
1364     /* Set up the adminq */
1365     err = gve_adminq_alloc(&priv->pdev->dev, priv);
1366     if (err) {
1367         dev_err(&priv->pdev->dev,
1368             "Failed to alloc admin queue: err=%d\n", err);
1369         return err;
1370     }
1371 
1372     if (skip_describe_device)
1373         goto setup_device;
1374 
1375     priv->queue_format = GVE_QUEUE_FORMAT_UNSPECIFIED;
1376     /* Get the initial information we need from the device */
1377     err = gve_adminq_describe_device(priv);
1378     if (err) {
1379         dev_err(&priv->pdev->dev,
1380             "Could not get device information: err=%d\n", err);
1381         goto err;
1382     }
1383     priv->dev->mtu = priv->dev->max_mtu;
1384     num_ntfy = pci_msix_vec_count(priv->pdev);
1385     if (num_ntfy <= 0) {
1386         dev_err(&priv->pdev->dev,
1387             "could not count MSI-x vectors: err=%d\n", num_ntfy);
1388         err = num_ntfy;
1389         goto err;
1390     } else if (num_ntfy < GVE_MIN_MSIX) {
1391         dev_err(&priv->pdev->dev, "gve needs at least %d MSI-x vectors, but only has %d\n",
1392             GVE_MIN_MSIX, num_ntfy);
1393         err = -EINVAL;
1394         goto err;
1395     }
1396 
1397     priv->num_registered_pages = 0;
1398     priv->rx_copybreak = GVE_DEFAULT_RX_COPYBREAK;
1399     /* gvnic has one Notification Block per MSI-x vector, except for the
1400      * management vector
1401      */
1402     priv->num_ntfy_blks = (num_ntfy - 1) & ~0x1;
1403     priv->mgmt_msix_idx = priv->num_ntfy_blks;
1404 
1405     priv->tx_cfg.max_queues =
1406         min_t(int, priv->tx_cfg.max_queues, priv->num_ntfy_blks / 2);
1407     priv->rx_cfg.max_queues =
1408         min_t(int, priv->rx_cfg.max_queues, priv->num_ntfy_blks / 2);
1409 
1410     priv->tx_cfg.num_queues = priv->tx_cfg.max_queues;
1411     priv->rx_cfg.num_queues = priv->rx_cfg.max_queues;
1412     if (priv->default_num_queues > 0) {
1413         priv->tx_cfg.num_queues = min_t(int, priv->default_num_queues,
1414                         priv->tx_cfg.num_queues);
1415         priv->rx_cfg.num_queues = min_t(int, priv->default_num_queues,
1416                         priv->rx_cfg.num_queues);
1417     }
1418 
1419     dev_info(&priv->pdev->dev, "TX queues %d, RX queues %d\n",
1420          priv->tx_cfg.num_queues, priv->rx_cfg.num_queues);
1421     dev_info(&priv->pdev->dev, "Max TX queues %d, Max RX queues %d\n",
1422          priv->tx_cfg.max_queues, priv->rx_cfg.max_queues);
1423 
1424     if (!gve_is_gqi(priv)) {
1425         priv->tx_coalesce_usecs = GVE_TX_IRQ_RATELIMIT_US_DQO;
1426         priv->rx_coalesce_usecs = GVE_RX_IRQ_RATELIMIT_US_DQO;
1427     }
1428 
1429 setup_device:
1430     err = gve_setup_device_resources(priv);
1431     if (!err)
1432         return 0;
1433 err:
1434     gve_adminq_free(&priv->pdev->dev, priv);
1435     return err;
1436 }
1437 
1438 static void gve_teardown_priv_resources(struct gve_priv *priv)
1439 {
1440     gve_teardown_device_resources(priv);
1441     gve_adminq_free(&priv->pdev->dev, priv);
1442 }
1443 
1444 static void gve_trigger_reset(struct gve_priv *priv)
1445 {
1446     /* Reset the device by releasing the AQ */
1447     gve_adminq_release(priv);
1448 }
1449 
1450 static void gve_reset_and_teardown(struct gve_priv *priv, bool was_up)
1451 {
1452     gve_trigger_reset(priv);
1453     /* With the reset having already happened, close cannot fail */
1454     if (was_up)
1455         gve_close(priv->dev);
1456     gve_teardown_priv_resources(priv);
1457 }
1458 
1459 static int gve_reset_recovery(struct gve_priv *priv, bool was_up)
1460 {
1461     int err;
1462 
1463     err = gve_init_priv(priv, true);
1464     if (err)
1465         goto err;
1466     if (was_up) {
1467         err = gve_open(priv->dev);
1468         if (err)
1469             goto err;
1470     }
1471     return 0;
1472 err:
1473     dev_err(&priv->pdev->dev, "Reset failed! !!! DISABLING ALL QUEUES !!!\n");
1474     gve_turndown(priv);
1475     return err;
1476 }
1477 
1478 int gve_reset(struct gve_priv *priv, bool attempt_teardown)
1479 {
1480     bool was_up = netif_carrier_ok(priv->dev);
1481     int err;
1482 
1483     dev_info(&priv->pdev->dev, "Performing reset\n");
1484     gve_clear_do_reset(priv);
1485     gve_set_reset_in_progress(priv);
1486     /* If we aren't attempting to teardown normally, just go turndown and
1487      * reset right away.
1488      */
1489     if (!attempt_teardown) {
1490         gve_turndown(priv);
1491         gve_reset_and_teardown(priv, was_up);
1492     } else {
1493         /* Otherwise attempt to close normally */
1494         if (was_up) {
1495             err = gve_close(priv->dev);
1496             /* If that fails reset as we did above */
1497             if (err)
1498                 gve_reset_and_teardown(priv, was_up);
1499         }
1500         /* Clean up any remaining resources */
1501         gve_teardown_priv_resources(priv);
1502     }
1503 
1504     /* Set it all back up */
1505     err = gve_reset_recovery(priv, was_up);
1506     gve_clear_reset_in_progress(priv);
1507     priv->reset_cnt++;
1508     priv->interface_up_cnt = 0;
1509     priv->interface_down_cnt = 0;
1510     priv->stats_report_trigger_cnt = 0;
1511     return err;
1512 }
1513 
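/* Write the driver version string ("GVE-" prefix followed by the version,
 * terminated by a newline) to the version register one byte at a time.
 */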
1514 static void gve_write_version(u8 __iomem *driver_version_register)
1515 {
1516     const char *c = gve_version_prefix;
1517 
1518     while (*c) {
1519         writeb(*c, driver_version_register);
1520         c++;
1521     }
1522 
1523     c = gve_version_str;
1524     while (*c) {
1525         writeb(*c, driver_version_register);
1526         c++;
1527     }
1528     writeb('\n', driver_version_register);
1529 }
1530 
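/* PCI probe: enable the device, map the register and doorbell BARs, write
 * the driver version, read the max queue counts, allocate the netdev and
 * workqueue, initialize the private state and register the netdev.  Error
 * paths unwind in reverse order.
 */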
1531 static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
1532 {
1533     int max_tx_queues, max_rx_queues;
1534     struct net_device *dev;
1535     __be32 __iomem *db_bar;
1536     struct gve_registers __iomem *reg_bar;
1537     struct gve_priv *priv;
1538     int err;
1539 
1540     err = pci_enable_device(pdev);
1541     if (err)
1542         return err;
1543 
1544     err = pci_request_regions(pdev, "gvnic-cfg");
1545     if (err)
1546         goto abort_with_enabled;
1547 
1548     pci_set_master(pdev);
1549 
1550     err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
1551     if (err) {
1552         dev_err(&pdev->dev, "Failed to set dma mask: err=%d\n", err);
1553         goto abort_with_pci_region;
1554     }
1555 
1556     reg_bar = pci_iomap(pdev, GVE_REGISTER_BAR, 0);
1557     if (!reg_bar) {
1558         dev_err(&pdev->dev, "Failed to map pci bar!\n");
1559         err = -ENOMEM;
1560         goto abort_with_pci_region;
1561     }
1562 
1563     db_bar = pci_iomap(pdev, GVE_DOORBELL_BAR, 0);
1564     if (!db_bar) {
1565         dev_err(&pdev->dev, "Failed to map doorbell bar!\n");
1566         err = -ENOMEM;
1567         goto abort_with_reg_bar;
1568     }
1569 
1570     gve_write_version(&reg_bar->driver_version);
1571     /* Get max queues to alloc etherdev */
1572     max_tx_queues = ioread32be(&reg_bar->max_tx_queues);
1573     max_rx_queues = ioread32be(&reg_bar->max_rx_queues);
1574     /* Alloc and setup the netdev and priv */
1575     dev = alloc_etherdev_mqs(sizeof(*priv), max_tx_queues, max_rx_queues);
1576     if (!dev) {
1577         dev_err(&pdev->dev, "could not allocate netdev\n");
1578         err = -ENOMEM;
1579         goto abort_with_db_bar;
1580     }
1581     SET_NETDEV_DEV(dev, &pdev->dev);
1582     pci_set_drvdata(pdev, dev);
1583     dev->ethtool_ops = &gve_ethtool_ops;
1584     dev->netdev_ops = &gve_netdev_ops;
1585 
1586     /* Set default and supported features.
1587      *
1588      * Features might be set in other locations as well (such as
1589      * `gve_adminq_describe_device`).
1590      */
1591     dev->hw_features = NETIF_F_HIGHDMA;
1592     dev->hw_features |= NETIF_F_SG;
1593     dev->hw_features |= NETIF_F_HW_CSUM;
1594     dev->hw_features |= NETIF_F_TSO;
1595     dev->hw_features |= NETIF_F_TSO6;
1596     dev->hw_features |= NETIF_F_TSO_ECN;
1597     dev->hw_features |= NETIF_F_RXCSUM;
1598     dev->hw_features |= NETIF_F_RXHASH;
1599     dev->features = dev->hw_features;
1600     dev->watchdog_timeo = 5 * HZ;
1601     dev->min_mtu = ETH_MIN_MTU;
1602     netif_carrier_off(dev);
1603 
1604     priv = netdev_priv(dev);
1605     priv->dev = dev;
1606     priv->pdev = pdev;
1607     priv->msg_enable = DEFAULT_MSG_LEVEL;
1608     priv->reg_bar0 = reg_bar;
1609     priv->db_bar2 = db_bar;
1610     priv->service_task_flags = 0x0;
1611     priv->state_flags = 0x0;
1612     priv->ethtool_flags = 0x0;
1613 
1614     gve_set_probe_in_progress(priv);
1615     priv->gve_wq = alloc_ordered_workqueue("gve", 0);
1616     if (!priv->gve_wq) {
1617         dev_err(&pdev->dev, "Could not allocate workqueue");
1618         err = -ENOMEM;
1619         goto abort_with_netdev;
1620     }
1621     INIT_WORK(&priv->service_task, gve_service_task);
1622     INIT_WORK(&priv->stats_report_task, gve_stats_report_task);
1623     priv->tx_cfg.max_queues = max_tx_queues;
1624     priv->rx_cfg.max_queues = max_rx_queues;
1625 
1626     err = gve_init_priv(priv, false);
1627     if (err)
1628         goto abort_with_wq;
1629 
1630     err = register_netdev(dev);
1631     if (err)
1632         goto abort_with_gve_init;
1633 
1634     dev_info(&pdev->dev, "GVE version %s\n", gve_version_str);
1635     dev_info(&pdev->dev, "GVE queue format %d\n", (int)priv->queue_format);
1636     gve_clear_probe_in_progress(priv);
1637     queue_work(priv->gve_wq, &priv->service_task);
1638     return 0;
1639 
1640 abort_with_gve_init:
1641     gve_teardown_priv_resources(priv);
1642 
1643 abort_with_wq:
1644     destroy_workqueue(priv->gve_wq);
1645 
1646 abort_with_netdev:
1647     free_netdev(dev);
1648 
1649 abort_with_db_bar:
1650     pci_iounmap(pdev, db_bar);
1651 
1652 abort_with_reg_bar:
1653     pci_iounmap(pdev, reg_bar);
1654 
1655 abort_with_pci_region:
1656     pci_release_regions(pdev);
1657 
1658 abort_with_enabled:
1659     pci_disable_device(pdev);
1660     return err;
1661 }
1662 
1663 static void gve_remove(struct pci_dev *pdev)
1664 {
1665     struct net_device *netdev = pci_get_drvdata(pdev);
1666     struct gve_priv *priv = netdev_priv(netdev);
1667     __be32 __iomem *db_bar = priv->db_bar2;
1668     void __iomem *reg_bar = priv->reg_bar0;
1669 
1670     unregister_netdev(netdev);
1671     gve_teardown_priv_resources(priv);
1672     destroy_workqueue(priv->gve_wq);
1673     free_netdev(netdev);
1674     pci_iounmap(pdev, db_bar);
1675     pci_iounmap(pdev, reg_bar);
1676     pci_release_regions(pdev);
1677     pci_disable_device(pdev);
1678 }
1679 
1680 static void gve_shutdown(struct pci_dev *pdev)
1681 {
1682     struct net_device *netdev = pci_get_drvdata(pdev);
1683     struct gve_priv *priv = netdev_priv(netdev);
1684     bool was_up = netif_carrier_ok(priv->dev);
1685 
1686     rtnl_lock();
1687     if (was_up && gve_close(priv->dev)) {
1688         /* If the dev was up, attempt to close, if close fails, reset */
1689         gve_reset_and_teardown(priv, was_up);
1690     } else {
1691         /* If the dev wasn't up or close worked, finish tearing down */
1692         gve_teardown_priv_resources(priv);
1693     }
1694     rtnl_unlock();
1695 }
1696 
1697 #ifdef CONFIG_PM
1698 static int gve_suspend(struct pci_dev *pdev, pm_message_t state)
1699 {
1700     struct net_device *netdev = pci_get_drvdata(pdev);
1701     struct gve_priv *priv = netdev_priv(netdev);
1702     bool was_up = netif_carrier_ok(priv->dev);
1703 
1704     priv->suspend_cnt++;
1705     rtnl_lock();
1706     if (was_up && gve_close(priv->dev)) {
1707         /* If the dev was up, attempt to close, if close fails, reset */
1708         gve_reset_and_teardown(priv, was_up);
1709     } else {
1710         /* If the dev wasn't up or close worked, finish tearing down */
1711         gve_teardown_priv_resources(priv);
1712     }
1713     priv->up_before_suspend = was_up;
1714     rtnl_unlock();
1715     return 0;
1716 }
1717 
1718 static int gve_resume(struct pci_dev *pdev)
1719 {
1720     struct net_device *netdev = pci_get_drvdata(pdev);
1721     struct gve_priv *priv = netdev_priv(netdev);
1722     int err;
1723 
1724     priv->resume_cnt++;
1725     rtnl_lock();
1726     err = gve_reset_recovery(priv, priv->up_before_suspend);
1727     rtnl_unlock();
1728     return err;
1729 }
1730 #endif /* CONFIG_PM */
1731 
1732 static const struct pci_device_id gve_id_table[] = {
1733     { PCI_DEVICE(PCI_VENDOR_ID_GOOGLE, PCI_DEV_ID_GVNIC) },
1734     { }
1735 };
1736 
1737 static struct pci_driver gvnic_driver = {
1738     .name       = "gvnic",
1739     .id_table   = gve_id_table,
1740     .probe      = gve_probe,
1741     .remove     = gve_remove,
1742     .shutdown   = gve_shutdown,
1743 #ifdef CONFIG_PM
1744     .suspend        = gve_suspend,
1745     .resume         = gve_resume,
1746 #endif
1747 };
1748 
1749 module_pci_driver(gvnic_driver);
1750 
1751 MODULE_DEVICE_TABLE(pci, gve_id_table);
1752 MODULE_AUTHOR("Google, Inc.");
1753 MODULE_DESCRIPTION("gVNIC Driver");
1754 MODULE_LICENSE("Dual MIT/GPL");
1755 MODULE_VERSION(GVE_VERSION);