0001 /*
0002  * This file is part of the Chelsio T4 Ethernet driver for Linux.
0003  *
0004  * Copyright (c) 2003-2014 Chelsio Communications, Inc. All rights reserved.
0005  *
0006  * This software is available to you under a choice of one of two
0007  * licenses.  You may choose to be licensed under the terms of the GNU
0008  * General Public License (GPL) Version 2, available from the file
0009  * COPYING in the main directory of this source tree, or the
0010  * OpenIB.org BSD license below:
0011  *
0012  *     Redistribution and use in source and binary forms, with or
0013  *     without modification, are permitted provided that the following
0014  *     conditions are met:
0015  *
0016  *      - Redistributions of source code must retain the above
0017  *        copyright notice, this list of conditions and the following
0018  *        disclaimer.
0019  *
0020  *      - Redistributions in binary form must reproduce the above
0021  *        copyright notice, this list of conditions and the following
0022  *        disclaimer in the documentation and/or other materials
0023  *        provided with the distribution.
0024  *
0025  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
0026  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
0027  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
0028  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
0029  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
0030  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
0031  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
0032  * SOFTWARE.
0033  */
0034 
0035 #include <linux/skbuff.h>
0036 #include <linux/netdevice.h>
0037 #include <linux/etherdevice.h>
0038 #include <linux/if_vlan.h>
0039 #include <linux/ip.h>
0040 #include <linux/dma-mapping.h>
0041 #include <linux/jiffies.h>
0042 #include <linux/prefetch.h>
0043 #include <linux/export.h>
0044 #include <net/xfrm.h>
0045 #include <net/ipv6.h>
0046 #include <net/tcp.h>
0047 #include <net/busy_poll.h>
0048 #ifdef CONFIG_CHELSIO_T4_FCOE
0049 #include <scsi/fc/fc_fcoe.h>
0050 #endif /* CONFIG_CHELSIO_T4_FCOE */
0051 #include "cxgb4.h"
0052 #include "t4_regs.h"
0053 #include "t4_values.h"
0054 #include "t4_msg.h"
0055 #include "t4fw_api.h"
0056 #include "cxgb4_ptp.h"
0057 #include "cxgb4_uld.h"
0058 #include "cxgb4_tc_mqprio.h"
0059 #include "sched.h"
0060 
0061 /*
0062  * Rx buffer size.  We use largish buffers if possible but settle for single
0063  * pages under memory shortage.
0064  */
0065 #if PAGE_SHIFT >= 16
0066 # define FL_PG_ORDER 0
0067 #else
0068 # define FL_PG_ORDER (16 - PAGE_SHIFT)
0069 #endif
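/* Illustrative note (assuming a 4KB PAGE_SIZE, i.e. PAGE_SHIFT == 12):
 * FL_PG_ORDER works out to 16 - 12 = 4, so "large" Free List buffers are
 * PAGE_SIZE << 4 = 64KB compound pages, while systems with 64KB pages
 * simply use order-0 pages.
 */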
0070 
0071 /* RX_PULL_LEN should be <= RX_COPY_THRES */
0072 #define RX_COPY_THRES    256
0073 #define RX_PULL_LEN      128
0074 
0075 /*
0076  * Main body length for sk_buffs used for Rx Ethernet packets with fragments.
0077  * Should be >= RX_PULL_LEN but possibly bigger to give pskb_may_pull some room.
0078  */
0079 #define RX_PKT_SKB_LEN   512
0080 
0081 /*
0082  * Max number of Tx descriptors we clean up at a time.  Should be modest as
0083  * freeing skbs isn't cheap and it happens while holding locks.  We just need
0084  * to free packets faster than they arrive; we'll eventually catch up and keep
0085  * the amortized cost reasonable.  Must be >= 2 * TXQ_STOP_THRES.  It should
0086  * also match the CIDX Flush Threshold.
0087  */
0088 #define MAX_TX_RECLAIM 32
0089 
0090 /*
0091  * Max number of Rx buffers we replenish at a time.  Again, keep this modest;
0092  * allocating buffers isn't cheap either.
0093  */
0094 #define MAX_RX_REFILL 16U
0095 
0096 /*
0097  * Period of the Rx queue check timer.  This timer is infrequent as it has
0098  * something to do only when the system experiences severe memory shortage.
0099  */
0100 #define RX_QCHECK_PERIOD (HZ / 2)
0101 
0102 /*
0103  * Period of the Tx queue check timer.
0104  */
0105 #define TX_QCHECK_PERIOD (HZ / 2)
0106 
0107 /*
0108  * Max number of Tx descriptors to be reclaimed by the Tx timer.
0109  */
0110 #define MAX_TIMER_TX_RECLAIM 100
0111 
0112 /*
0113  * Timer index used when backing off due to memory shortage.
0114  */
0115 #define NOMEM_TMR_IDX (SGE_NTIMERS - 1)
0116 
0117 /*
0118  * Suspension threshold for non-Ethernet Tx queues.  We require enough room
0119  * for a full sized WR.
0120  */
0121 #define TXQ_STOP_THRES (SGE_MAX_WR_LEN / sizeof(struct tx_desc))
0122 
0123 /*
0124  * Max Tx descriptor space we allow for an Ethernet packet to be inlined
0125  * into a WR.
0126  */
0127 #define MAX_IMM_TX_PKT_LEN 256
0128 
0129 /*
0130  * Max size of a WR sent through a control Tx queue.
0131  */
0132 #define MAX_CTRL_WR_LEN SGE_MAX_WR_LEN
0133 
0134 struct rx_sw_desc {                /* SW state per Rx descriptor */
0135     struct page *page;
0136     dma_addr_t dma_addr;
0137 };
0138 
0139 /*
0140  * Rx buffer sizes for "useskbs" Free List buffers (one ingress packet per skb
0141  * buffer).  We currently only support two sizes for 1500- and 9000-byte MTUs.
0142  * We could easily support more but there doesn't seem to be much need for
0143  * that ...
0144  */
0145 #define FL_MTU_SMALL 1500
0146 #define FL_MTU_LARGE 9000
0147 
0148 static inline unsigned int fl_mtu_bufsize(struct adapter *adapter,
0149                       unsigned int mtu)
0150 {
0151     struct sge *s = &adapter->sge;
0152 
0153     return ALIGN(s->pktshift + ETH_HLEN + VLAN_HLEN + mtu, s->fl_align);
0154 }
0155 
0156 #define FL_MTU_SMALL_BUFSIZE(adapter) fl_mtu_bufsize(adapter, FL_MTU_SMALL)
0157 #define FL_MTU_LARGE_BUFSIZE(adapter) fl_mtu_bufsize(adapter, FL_MTU_LARGE)
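/* Worked example with illustrative SGE parameters (a 2-byte packet shift and
 * 64-byte Free List alignment):
 *
 *	FL_MTU_SMALL_BUFSIZE = ALIGN(2 + 14 + 4 + 1500, 64) = 1536 bytes
 *	FL_MTU_LARGE_BUFSIZE = ALIGN(2 + 14 + 4 + 9000, 64) = 9024 bytes
 */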
0158 
0159 /*
0160  * Bits 0..3 of rx_sw_desc.dma_addr have special meaning.  The hardware uses
0161  * these to specify the buffer size as an index into the SGE Free List Buffer
0162  * Size register array.  We also use bit 4, when the buffer has been unmapped
0163  * for DMA, but this is of course never sent to the hardware and is only used
0164  * to prevent double unmappings.  All of the above requires that the Free List
0165  * Buffers which we allocate have the bottom 5 bits free (0) -- i.e. are
0166  * 32-byte aligned or a power of 2 greater in alignment.  Since the SGE's minimal
0167  * Free List Buffer alignment is 32 bytes, this works out for us ...
0168  */
0169 enum {
0170     RX_BUF_FLAGS     = 0x1f,   /* bottom five bits are special */
0171     RX_BUF_SIZE      = 0x0f,   /* bottom four bits are for buf sizes */
0172     RX_UNMAPPED_BUF  = 0x10,   /* buffer is not mapped */
0173 
0174     /*
0175      * XXX We shouldn't depend on being able to use these indices.
0176      * XXX Especially when some other Master PF has initialized the
0177      * XXX adapter or we use the Firmware Configuration File.  We
0178      * XXX should really search through the Host Buffer Size register
0179      * XXX array for the appropriately sized buffer indices.
0180      */
0181     RX_SMALL_PG_BUF  = 0x0,   /* small (PAGE_SIZE) page buffer */
0182     RX_LARGE_PG_BUF  = 0x1,   /* large (FL_PG_ORDER) page buffer */
0183 
0184     RX_SMALL_MTU_BUF = 0x2,   /* small MTU buffer */
0185     RX_LARGE_MTU_BUF = 0x3,   /* large MTU buffer */
0186 };
0187 
0188 static int timer_pkt_quota[] = {1, 1, 2, 3, 4, 5};
0189 #define MIN_NAPI_WORK  1
0190 
0191 static inline dma_addr_t get_buf_addr(const struct rx_sw_desc *d)
0192 {
0193     return d->dma_addr & ~(dma_addr_t)RX_BUF_FLAGS;
0194 }
0195 
0196 static inline bool is_buf_mapped(const struct rx_sw_desc *d)
0197 {
0198     return !(d->dma_addr & RX_UNMAPPED_BUF);
0199 }
0200 
0201 /**
0202  *  txq_avail - return the number of available slots in a Tx queue
0203  *  @q: the Tx queue
0204  *
0205  *  Returns the number of descriptors in a Tx queue available to write new
0206  *  packets.
0207  */
0208 static inline unsigned int txq_avail(const struct sge_txq *q)
0209 {
0210     return q->size - 1 - q->in_use;
0211 }
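/* Note: the "- 1" keeps one descriptor permanently unused so a completely
 * full ring is never confused with an empty one, analogous to the
 * unpopulated descriptor that fl_cap() reserves for Free Lists below.
 */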
0212 
0213 /**
0214  *  fl_cap - return the capacity of a free-buffer list
0215  *  @fl: the FL
0216  *
0217  *  Returns the capacity of a free-buffer list.  The capacity is less than
0218  *  the size because one descriptor needs to be left unpopulated, otherwise
0219  *  HW will think the FL is empty.
0220  */
0221 static inline unsigned int fl_cap(const struct sge_fl *fl)
0222 {
0223     return fl->size - 8;   /* 1 descriptor = 8 buffers */
0224 }
0225 
0226 /**
0227  *  fl_starving - return whether a Free List is starving.
0228  *  @adapter: pointer to the adapter
0229  *  @fl: the Free List
0230  *
0231  *  Tests specified Free List to see whether the number of buffers
0232  *  available to the hardware has fallen below our "starvation"
0233  *  threshold.
0234  */
0235 static inline bool fl_starving(const struct adapter *adapter,
0236                    const struct sge_fl *fl)
0237 {
0238     const struct sge *s = &adapter->sge;
0239 
0240     return fl->avail - fl->pend_cred <= s->fl_starve_thres;
0241 }
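/* Example with illustrative numbers: if fl_starve_thres is 4 and a Free List
 * has avail = 10 buffers of which pend_cred = 7 have not yet been announced
 * to the hardware via the doorbell, only 3 buffers are usable by the SGE and
 * the list is reported as starving.
 */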
0242 
0243 int cxgb4_map_skb(struct device *dev, const struct sk_buff *skb,
0244           dma_addr_t *addr)
0245 {
0246     const skb_frag_t *fp, *end;
0247     const struct skb_shared_info *si;
0248 
0249     *addr = dma_map_single(dev, skb->data, skb_headlen(skb), DMA_TO_DEVICE);
0250     if (dma_mapping_error(dev, *addr))
0251         goto out_err;
0252 
0253     si = skb_shinfo(skb);
0254     end = &si->frags[si->nr_frags];
0255 
0256     for (fp = si->frags; fp < end; fp++) {
0257         *++addr = skb_frag_dma_map(dev, fp, 0, skb_frag_size(fp),
0258                        DMA_TO_DEVICE);
0259         if (dma_mapping_error(dev, *addr))
0260             goto unwind;
0261     }
0262     return 0;
0263 
0264 unwind:
0265     while (fp-- > si->frags)
0266         dma_unmap_page(dev, *--addr, skb_frag_size(fp), DMA_TO_DEVICE);
0267 
0268     dma_unmap_single(dev, addr[-1], skb_headlen(skb), DMA_TO_DEVICE);
0269 out_err:
0270     return -ENOMEM;
0271 }
0272 EXPORT_SYMBOL(cxgb4_map_skb);
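/* Note: @addr must provide one dma_addr_t slot per mapping -- the linear
 * data plus each page fragment, i.e. 1 + skb_shinfo(skb)->nr_frags entries
 * (callers typically size it for 1 + MAX_SKB_FRAGS).  On failure, every
 * mapping created so far is unwound before -ENOMEM is returned.
 */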
0273 
0274 static void unmap_skb(struct device *dev, const struct sk_buff *skb,
0275               const dma_addr_t *addr)
0276 {
0277     const skb_frag_t *fp, *end;
0278     const struct skb_shared_info *si;
0279 
0280     dma_unmap_single(dev, *addr++, skb_headlen(skb), DMA_TO_DEVICE);
0281 
0282     si = skb_shinfo(skb);
0283     end = &si->frags[si->nr_frags];
0284     for (fp = si->frags; fp < end; fp++)
0285         dma_unmap_page(dev, *addr++, skb_frag_size(fp), DMA_TO_DEVICE);
0286 }
0287 
0288 #ifdef CONFIG_NEED_DMA_MAP_STATE
0289 /**
0290  *  deferred_unmap_destructor - unmap a packet when it is freed
0291  *  @skb: the packet
0292  *
0293  *  This is the packet destructor used for Tx packets that need to remain
0294  *  mapped until they are freed rather than until their Tx descriptors are
0295  *  freed.
0296  */
0297 static void deferred_unmap_destructor(struct sk_buff *skb)
0298 {
0299     unmap_skb(skb->dev->dev.parent, skb, (dma_addr_t *)skb->head);
0300 }
0301 #endif
0302 
0303 /**
0304  *  free_tx_desc - reclaims Tx descriptors and their buffers
0305  *  @adap: the adapter
0306  *  @q: the Tx queue to reclaim descriptors from
0307  *  @n: the number of descriptors to reclaim
0308  *  @unmap: whether the buffers should be unmapped for DMA
0309  *
0310  *  Reclaims Tx descriptors from an SGE Tx queue and frees the associated
0311  *  Tx buffers.  Called with the Tx queue lock held.
0312  */
0313 void free_tx_desc(struct adapter *adap, struct sge_txq *q,
0314           unsigned int n, bool unmap)
0315 {
0316     unsigned int cidx = q->cidx;
0317     struct tx_sw_desc *d;
0318 
0319     d = &q->sdesc[cidx];
0320     while (n--) {
0321         if (d->skb) {                       /* an SGL is present */
0322             if (unmap && d->addr[0]) {
0323                 unmap_skb(adap->pdev_dev, d->skb, d->addr);
0324                 memset(d->addr, 0, sizeof(d->addr));
0325             }
0326             dev_consume_skb_any(d->skb);
0327             d->skb = NULL;
0328         }
0329         ++d;
0330         if (++cidx == q->size) {
0331             cidx = 0;
0332             d = q->sdesc;
0333         }
0334     }
0335     q->cidx = cidx;
0336 }
0337 
0338 /*
0339  * Return the number of reclaimable descriptors in a Tx queue.
0340  */
0341 static inline int reclaimable(const struct sge_txq *q)
0342 {
0343     int hw_cidx = ntohs(READ_ONCE(q->stat->cidx));
0344     hw_cidx -= q->cidx;
0345     return hw_cidx < 0 ? hw_cidx + q->size : hw_cidx;
0346 }
0347 
0348 /**
0349  *  reclaim_completed_tx - reclaims completed TX Descriptors
0350  *  @adap: the adapter
0351  *  @q: the Tx queue to reclaim completed descriptors from
0352  *  @maxreclaim: the maximum number of TX Descriptors to reclaim or -1
0353  *  @unmap: whether the buffers should be unmapped for DMA
0354  *
0355  *  Reclaims Tx Descriptors that the SGE has indicated it has processed,
0356  *  and frees the associated buffers if possible.  If @maxreclaim == -1, then
0357  *  we'll use a default maximum.  Called with the TX Queue locked.
0358  */
0359 static inline int reclaim_completed_tx(struct adapter *adap, struct sge_txq *q,
0360                        int maxreclaim, bool unmap)
0361 {
0362     int reclaim = reclaimable(q);
0363 
0364     if (reclaim) {
0365         /*
0366          * Limit the amount of clean up work we do at a time to keep
0367          * the Tx lock hold time O(1).
0368          */
0369         if (maxreclaim < 0)
0370             maxreclaim = MAX_TX_RECLAIM;
0371         if (reclaim > maxreclaim)
0372             reclaim = maxreclaim;
0373 
0374         free_tx_desc(adap, q, reclaim, unmap);
0375         q->in_use -= reclaim;
0376     }
0377 
0378     return reclaim;
0379 }
0380 
0381 /**
0382  *  cxgb4_reclaim_completed_tx - reclaims completed Tx descriptors
0383  *  @adap: the adapter
0384  *  @q: the Tx queue to reclaim completed descriptors from
0385  *  @unmap: whether the buffers should be unmapped for DMA
0386  *
0387  *  Reclaims Tx descriptors that the SGE has indicated it has processed,
0388  *  and frees the associated buffers if possible.  Called with the Tx
0389  *  queue locked.
0390  */
0391 void cxgb4_reclaim_completed_tx(struct adapter *adap, struct sge_txq *q,
0392                 bool unmap)
0393 {
0394     (void)reclaim_completed_tx(adap, q, -1, unmap);
0395 }
0396 EXPORT_SYMBOL(cxgb4_reclaim_completed_tx);
0397 
0398 static inline int get_buf_size(struct adapter *adapter,
0399                    const struct rx_sw_desc *d)
0400 {
0401     struct sge *s = &adapter->sge;
0402     unsigned int rx_buf_size_idx = d->dma_addr & RX_BUF_SIZE;
0403     int buf_size;
0404 
0405     switch (rx_buf_size_idx) {
0406     case RX_SMALL_PG_BUF:
0407         buf_size = PAGE_SIZE;
0408         break;
0409 
0410     case RX_LARGE_PG_BUF:
0411         buf_size = PAGE_SIZE << s->fl_pg_order;
0412         break;
0413 
0414     case RX_SMALL_MTU_BUF:
0415         buf_size = FL_MTU_SMALL_BUFSIZE(adapter);
0416         break;
0417 
0418     case RX_LARGE_MTU_BUF:
0419         buf_size = FL_MTU_LARGE_BUFSIZE(adapter);
0420         break;
0421 
0422     default:
0423         BUG();
0424     }
0425 
0426     return buf_size;
0427 }
0428 
0429 /**
0430  *  free_rx_bufs - free the Rx buffers on an SGE free list
0431  *  @adap: the adapter
0432  *  @q: the SGE free list to free buffers from
0433  *  @n: how many buffers to free
0434  *
0435  *  Release the next @n buffers on an SGE free-buffer Rx queue.   The
0436  *  buffers must be made inaccessible to HW before calling this function.
0437  */
0438 static void free_rx_bufs(struct adapter *adap, struct sge_fl *q, int n)
0439 {
0440     while (n--) {
0441         struct rx_sw_desc *d = &q->sdesc[q->cidx];
0442 
0443         if (is_buf_mapped(d))
0444             dma_unmap_page(adap->pdev_dev, get_buf_addr(d),
0445                        get_buf_size(adap, d),
0446                        DMA_FROM_DEVICE);
0447         put_page(d->page);
0448         d->page = NULL;
0449         if (++q->cidx == q->size)
0450             q->cidx = 0;
0451         q->avail--;
0452     }
0453 }
0454 
0455 /**
0456  *  unmap_rx_buf - unmap the current Rx buffer on an SGE free list
0457  *  @adap: the adapter
0458  *  @q: the SGE free list
0459  *
0460  *  Unmap the current buffer on an SGE free-buffer Rx queue.   The
0461  *  buffer must be made inaccessible to HW before calling this function.
0462  *
0463  *  This is similar to @free_rx_bufs above but does not free the buffer.
0464  *  Do note that the FL still loses any further access to the buffer.
0465  */
0466 static void unmap_rx_buf(struct adapter *adap, struct sge_fl *q)
0467 {
0468     struct rx_sw_desc *d = &q->sdesc[q->cidx];
0469 
0470     if (is_buf_mapped(d))
0471         dma_unmap_page(adap->pdev_dev, get_buf_addr(d),
0472                    get_buf_size(adap, d), DMA_FROM_DEVICE);
0473     d->page = NULL;
0474     if (++q->cidx == q->size)
0475         q->cidx = 0;
0476     q->avail--;
0477 }
0478 
0479 static inline void ring_fl_db(struct adapter *adap, struct sge_fl *q)
0480 {
0481     if (q->pend_cred >= 8) {
0482         u32 val = adap->params.arch.sge_fl_db;
0483 
0484         if (is_t4(adap->params.chip))
0485             val |= PIDX_V(q->pend_cred / 8);
0486         else
0487             val |= PIDX_T5_V(q->pend_cred / 8);
0488 
0489         /* Make sure all memory writes to the Free List queue are
0490          * committed before we tell the hardware about them.
0491          */
0492         wmb();
0493 
0494         /* If we don't have access to the new User Doorbell (T5+), use
0495          * the old doorbell mechanism; otherwise use the new BAR2
0496          * mechanism.
0497          */
0498         if (unlikely(q->bar2_addr == NULL)) {
0499             t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL_A),
0500                      val | QID_V(q->cntxt_id));
0501         } else {
0502             writel(val | QID_V(q->bar2_qid),
0503                    q->bar2_addr + SGE_UDB_KDOORBELL);
0504 
0505             /* This Write Memory Barrier will force the write to
0506              * the User Doorbell area to be flushed.
0507              */
0508             wmb();
0509         }
0510         q->pend_cred &= 7;
0511     }
0512 }
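/* Note: Free List credits are given to the hardware in whole descriptors of
 * 8 buffers each, so the doorbell above advances PIDX by pend_cred / 8 and
 * the remainder (pend_cred & 7) is carried over to the next refill.
 */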
0513 
0514 static inline void set_rx_sw_desc(struct rx_sw_desc *sd, struct page *pg,
0515                   dma_addr_t mapping)
0516 {
0517     sd->page = pg;
0518     sd->dma_addr = mapping;      /* includes size low bits */
0519 }
0520 
0521 /**
0522  *  refill_fl - refill an SGE Rx buffer ring
0523  *  @adap: the adapter
0524  *  @q: the ring to refill
0525  *  @n: the number of new buffers to allocate
0526  *  @gfp: the gfp flags for the allocations
0527  *
0528  *  (Re)populate an SGE free-buffer queue with up to @n new packet buffers,
0529  *  allocated with the supplied gfp flags.  The caller must assure that
0530  *  @n does not exceed the queue's capacity.  If afterwards the queue is
0531  *  found critically low, mark it as starving in the bitmap of starving FLs.
0532  *
0533  *  Returns the number of buffers allocated.
0534  */
0535 static unsigned int refill_fl(struct adapter *adap, struct sge_fl *q, int n,
0536                   gfp_t gfp)
0537 {
0538     struct sge *s = &adap->sge;
0539     struct page *pg;
0540     dma_addr_t mapping;
0541     unsigned int cred = q->avail;
0542     __be64 *d = &q->desc[q->pidx];
0543     struct rx_sw_desc *sd = &q->sdesc[q->pidx];
0544     int node;
0545 
0546 #ifdef CONFIG_DEBUG_FS
0547     if (test_bit(q->cntxt_id - adap->sge.egr_start, adap->sge.blocked_fl))
0548         goto out;
0549 #endif
0550 
0551     gfp |= __GFP_NOWARN;
0552     node = dev_to_node(adap->pdev_dev);
0553 
0554     if (s->fl_pg_order == 0)
0555         goto alloc_small_pages;
0556 
0557     /*
0558      * Prefer large buffers
0559      */
0560     while (n) {
0561         pg = alloc_pages_node(node, gfp | __GFP_COMP, s->fl_pg_order);
0562         if (unlikely(!pg)) {
0563             q->large_alloc_failed++;
0564             break;       /* fall back to single pages */
0565         }
0566 
0567         mapping = dma_map_page(adap->pdev_dev, pg, 0,
0568                        PAGE_SIZE << s->fl_pg_order,
0569                        DMA_FROM_DEVICE);
0570         if (unlikely(dma_mapping_error(adap->pdev_dev, mapping))) {
0571             __free_pages(pg, s->fl_pg_order);
0572             q->mapping_err++;
0573             goto out;   /* do not try small pages for this error */
0574         }
0575         mapping |= RX_LARGE_PG_BUF;
0576         *d++ = cpu_to_be64(mapping);
0577 
0578         set_rx_sw_desc(sd, pg, mapping);
0579         sd++;
0580 
0581         q->avail++;
0582         if (++q->pidx == q->size) {
0583             q->pidx = 0;
0584             sd = q->sdesc;
0585             d = q->desc;
0586         }
0587         n--;
0588     }
0589 
0590 alloc_small_pages:
0591     while (n--) {
0592         pg = alloc_pages_node(node, gfp, 0);
0593         if (unlikely(!pg)) {
0594             q->alloc_failed++;
0595             break;
0596         }
0597 
0598         mapping = dma_map_page(adap->pdev_dev, pg, 0, PAGE_SIZE,
0599                        DMA_FROM_DEVICE);
0600         if (unlikely(dma_mapping_error(adap->pdev_dev, mapping))) {
0601             put_page(pg);
0602             q->mapping_err++;
0603             goto out;
0604         }
0605         *d++ = cpu_to_be64(mapping);
0606 
0607         set_rx_sw_desc(sd, pg, mapping);
0608         sd++;
0609 
0610         q->avail++;
0611         if (++q->pidx == q->size) {
0612             q->pidx = 0;
0613             sd = q->sdesc;
0614             d = q->desc;
0615         }
0616     }
0617 
0618 out:    cred = q->avail - cred;
0619     q->pend_cred += cred;
0620     ring_fl_db(adap, q);
0621 
0622     if (unlikely(fl_starving(adap, q))) {
0623         smp_wmb();
0624         q->low++;
0625         set_bit(q->cntxt_id - adap->sge.egr_start,
0626             adap->sge.starving_fl);
0627     }
0628 
0629     return cred;
0630 }
0631 
0632 static inline void __refill_fl(struct adapter *adap, struct sge_fl *fl)
0633 {
0634     refill_fl(adap, fl, min(MAX_RX_REFILL, fl_cap(fl) - fl->avail),
0635           GFP_ATOMIC);
0636 }
0637 
0638 /**
0639  *  alloc_ring - allocate resources for an SGE descriptor ring
0640  *  @dev: the PCI device's core device
0641  *  @nelem: the number of descriptors
0642  *  @elem_size: the size of each descriptor
0643  *  @sw_size: the size of the SW state associated with each ring element
0644  *  @phys: the physical address of the allocated ring
0645  *  @metadata: address of the array holding the SW state for the ring
0646  *  @stat_size: extra space in HW ring for status information
0647  *  @node: preferred node for memory allocations
0648  *
0649  *  Allocates resources for an SGE descriptor ring, such as Tx queues,
0650  *  free buffer lists, or response queues.  Each SGE ring requires
0651  *  space for its HW descriptors plus, optionally, space for the SW state
0652  *  associated with each HW entry (the metadata).  The function returns
0653  *  three values: the virtual address for the HW ring (the return value
0654  *  of the function), the bus address of the HW ring, and the address
0655  *  of the SW ring.
0656  */
0657 static void *alloc_ring(struct device *dev, size_t nelem, size_t elem_size,
0658             size_t sw_size, dma_addr_t *phys, void *metadata,
0659             size_t stat_size, int node)
0660 {
0661     size_t len = nelem * elem_size + stat_size;
0662     void *s = NULL;
0663     void *p = dma_alloc_coherent(dev, len, phys, GFP_KERNEL);
0664 
0665     if (!p)
0666         return NULL;
0667     if (sw_size) {
0668         s = kcalloc_node(sw_size, nelem, GFP_KERNEL, node);
0669 
0670         if (!s) {
0671             dma_free_coherent(dev, len, p, *phys);
0672             return NULL;
0673         }
0674     }
0675     if (metadata)
0676         *(void **)metadata = s;
0677     return p;
0678 }
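/* Illustrative use (a hypothetical sketch, not a call made at this point in
 * the file): a Free List pairs one __be64 hardware descriptor with one
 * struct rx_sw_desc of software state, so its ring would be allocated
 * roughly as
 *
 *	fl->desc = alloc_ring(adap->pdev_dev, fl->size, sizeof(__be64),
 *			      sizeof(struct rx_sw_desc), &fl->addr,
 *			      &fl->sdesc, stat_len, node);
 *
 * with the virtual address returned in fl->desc, the bus address in
 * fl->addr and the software ring in fl->sdesc.
 */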
0679 
0680 /**
0681  *  sgl_len - calculates the size of an SGL of the given capacity
0682  *  @n: the number of SGL entries
0683  *
0684  *  Calculates the number of flits needed for a scatter/gather list that
0685  *  can hold the given number of entries.
0686  */
0687 static inline unsigned int sgl_len(unsigned int n)
0688 {
0689     /* A Direct Scatter Gather List uses 32-bit lengths and 64-bit PCI DMA
0690      * addresses.  The DSGL Work Request starts off with a 32-bit DSGL
0691      * ULPTX header, then Length0, then Address0, then, for 1 <= i <= N,
0692      * repeated sequences of { Length[i], Length[i+1], Address[i],
0693      * Address[i+1] } (this ensures that all addresses are on 64-bit
0694      * boundaries).  If N is even, then Length[N+1] should be set to 0 and
0695      * Address[N+1] is omitted.
0696      *
0697      * The following calculation incorporates all of the above.  It's
0698      * somewhat hard to follow but, briefly: the "+2" accounts for the
0699      * first two flits which include the DSGL header, Length0 and
0700      * Address0; the "(3*(n-1))/2" covers the main body of list entries (3
0701      * flits for every pair of the remaining N) +1 if (n-1) is odd; and
0702      * finally the "+((n-1)&1)" adds the one remaining flit needed if
0703      * (n-1) is odd ...
0704      */
0705     n--;
0706     return (3 * n) / 2 + (n & 1) + 2;
0707 }
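/* Worked example: for n = 4 SGL entries the formula gives, after the
 * decrement, (3 * 3) / 2 + (3 & 1) + 2 = 4 + 1 + 2 = 7 flits: two flits for
 * the DSGL header plus Length0/Address0, three for the next pair of entries,
 * and two for the final odd entry (whose unused partner length is written
 * as 0).
 */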
0708 
0709 /**
0710  *  flits_to_desc - returns the num of Tx descriptors for the given flits
0711  *  @n: the number of flits
0712  *
0713  *  Returns the number of Tx descriptors needed for the supplied number
0714  *  of flits.
0715  */
0716 static inline unsigned int flits_to_desc(unsigned int n)
0717 {
0718     BUG_ON(n > SGE_MAX_WR_LEN / 8);
0719     return DIV_ROUND_UP(n, 8);
0720 }
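/* Each Tx descriptor holds 8 flits (64 bytes), so the 7-flit SGL from the
 * example above fits in one descriptor while, say, 9 flits would need two.
 */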
0721 
0722 /**
0723  *  is_eth_imm - can an Ethernet packet be sent as immediate data?
0724  *  @skb: the packet
0725  *  @chip_ver: chip version
0726  *
0727  *  Returns whether an Ethernet packet is small enough to fit as
0728  *  immediate data. Return value corresponds to headroom required.
0729  */
0730 static inline int is_eth_imm(const struct sk_buff *skb, unsigned int chip_ver)
0731 {
0732     int hdrlen = 0;
0733 
0734     if (skb->encapsulation && skb_shinfo(skb)->gso_size &&
0735         chip_ver > CHELSIO_T5) {
0736         hdrlen = sizeof(struct cpl_tx_tnl_lso);
0737         hdrlen += sizeof(struct cpl_tx_pkt_core);
0738     } else if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) {
0739         return 0;
0740     } else {
0741         hdrlen = skb_shinfo(skb)->gso_size ?
0742              sizeof(struct cpl_tx_pkt_lso_core) : 0;
0743         hdrlen += sizeof(struct cpl_tx_pkt);
0744     }
0745     if (skb->len <= MAX_IMM_TX_PKT_LEN - hdrlen)
0746         return hdrlen;
0747     return 0;
0748 }
0749 
0750 /**
0751  *  calc_tx_flits - calculate the number of flits for a packet Tx WR
0752  *  @skb: the packet
0753  *  @chip_ver: chip version
0754  *
0755  *  Returns the number of flits needed for a Tx WR for the given Ethernet
0756  *  packet, including the needed WR and CPL headers.
0757  */
0758 static inline unsigned int calc_tx_flits(const struct sk_buff *skb,
0759                      unsigned int chip_ver)
0760 {
0761     unsigned int flits;
0762     int hdrlen = is_eth_imm(skb, chip_ver);
0763 
0764     /* If the skb is small enough, we can pump it out as a work request
0765      * with only immediate data.  In that case we just have to have the
0766      * TX Packet header plus the skb data in the Work Request.
0767      */
0768 
0769     if (hdrlen)
0770         return DIV_ROUND_UP(skb->len + hdrlen, sizeof(__be64));
0771 
0772     /* Otherwise, we're going to have to construct a Scatter gather list
0773      * of the skb body and fragments.  We also include the flits necessary
0774      * for the TX Packet Work Request and CPL.  We always have a firmware
0775      * Write Header (incorporated as part of the cpl_tx_pkt_lso and
0776      * cpl_tx_pkt structures), followed by either a TX Packet Write CPL
0777      * message or, if we're doing a Large Send Offload, an LSO CPL message
0778      * with an embedded TX Packet Write CPL message.
0779      */
0780     flits = sgl_len(skb_shinfo(skb)->nr_frags + 1);
0781     if (skb_shinfo(skb)->gso_size) {
0782         if (skb->encapsulation && chip_ver > CHELSIO_T5) {
0783             hdrlen = sizeof(struct fw_eth_tx_pkt_wr) +
0784                  sizeof(struct cpl_tx_tnl_lso);
0785         } else if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) {
0786             u32 pkt_hdrlen;
0787 
0788             pkt_hdrlen = eth_get_headlen(skb->dev, skb->data,
0789                              skb_headlen(skb));
0790             hdrlen = sizeof(struct fw_eth_tx_eo_wr) +
0791                  round_up(pkt_hdrlen, 16);
0792         } else {
0793             hdrlen = sizeof(struct fw_eth_tx_pkt_wr) +
0794                  sizeof(struct cpl_tx_pkt_lso_core);
0795         }
0796 
0797         hdrlen += sizeof(struct cpl_tx_pkt_core);
0798         flits += (hdrlen / sizeof(__be64));
0799     } else {
0800         flits += (sizeof(struct fw_eth_tx_pkt_wr) +
0801               sizeof(struct cpl_tx_pkt_core)) / sizeof(__be64);
0802     }
0803     return flits;
0804 }
0805 
0806 /**
0807  *  calc_tx_descs - calculate the number of Tx descriptors for a packet
0808  *  @skb: the packet
0809  *  @chip_ver: chip version
0810  *
0811  *  Returns the number of Tx descriptors needed for the given Ethernet
0812  *  packet, including the needed WR and CPL headers.
0813  */
0814 static inline unsigned int calc_tx_descs(const struct sk_buff *skb,
0815                      unsigned int chip_ver)
0816 {
0817     return flits_to_desc(calc_tx_flits(skb, chip_ver));
0818 }
0819 
0820 /**
0821  *  cxgb4_write_sgl - populate a scatter/gather list for a packet
0822  *  @skb: the packet
0823  *  @q: the Tx queue we are writing into
0824  *  @sgl: starting location for writing the SGL
0825  *  @end: points right after the end of the SGL
0826  *  @start: start offset into skb main-body data to include in the SGL
0827  *  @addr: the list of bus addresses for the SGL elements
0828  *
0829  *  Generates a gather list for the buffers that make up a packet.
0830  *  The caller must provide adequate space for the SGL that will be written.
0831  *  The SGL includes all of the packet's page fragments and the data in its
0832  *  main body except for the first @start bytes.  @sgl must be 16-byte
0833  *  aligned and within a Tx descriptor with available space.  @end points
0834  *  right after the end of the SGL but does not account for any potential
0835  *  wrap around, i.e., @end > @sgl.
0836  */
0837 void cxgb4_write_sgl(const struct sk_buff *skb, struct sge_txq *q,
0838              struct ulptx_sgl *sgl, u64 *end, unsigned int start,
0839              const dma_addr_t *addr)
0840 {
0841     unsigned int i, len;
0842     struct ulptx_sge_pair *to;
0843     const struct skb_shared_info *si = skb_shinfo(skb);
0844     unsigned int nfrags = si->nr_frags;
0845     struct ulptx_sge_pair buf[MAX_SKB_FRAGS / 2 + 1];
0846 
0847     len = skb_headlen(skb) - start;
0848     if (likely(len)) {
0849         sgl->len0 = htonl(len);
0850         sgl->addr0 = cpu_to_be64(addr[0] + start);
0851         nfrags++;
0852     } else {
0853         sgl->len0 = htonl(skb_frag_size(&si->frags[0]));
0854         sgl->addr0 = cpu_to_be64(addr[1]);
0855     }
0856 
0857     sgl->cmd_nsge = htonl(ULPTX_CMD_V(ULP_TX_SC_DSGL) |
0858                   ULPTX_NSGE_V(nfrags));
0859     if (likely(--nfrags == 0))
0860         return;
0861     /*
0862      * Most of the complexity below deals with the possibility we hit the
0863      * end of the queue in the middle of writing the SGL.  For this case
0864      * only we create the SGL in a temporary buffer and then copy it.
0865      */
0866     to = (u8 *)end > (u8 *)q->stat ? buf : sgl->sge;
0867 
0868     for (i = (nfrags != si->nr_frags); nfrags >= 2; nfrags -= 2, to++) {
0869         to->len[0] = cpu_to_be32(skb_frag_size(&si->frags[i]));
0870         to->len[1] = cpu_to_be32(skb_frag_size(&si->frags[++i]));
0871         to->addr[0] = cpu_to_be64(addr[i]);
0872         to->addr[1] = cpu_to_be64(addr[++i]);
0873     }
0874     if (nfrags) {
0875         to->len[0] = cpu_to_be32(skb_frag_size(&si->frags[i]));
0876         to->len[1] = cpu_to_be32(0);
0877         to->addr[0] = cpu_to_be64(addr[i + 1]);
0878     }
0879     if (unlikely((u8 *)end > (u8 *)q->stat)) {
0880         unsigned int part0 = (u8 *)q->stat - (u8 *)sgl->sge, part1;
0881 
0882         if (likely(part0))
0883             memcpy(sgl->sge, buf, part0);
0884         part1 = (u8 *)end - (u8 *)q->stat;
0885         memcpy(q->desc, (u8 *)buf + part0, part1);
0886         end = (void *)q->desc + part1;
0887     }
0888     if ((uintptr_t)end & 8)           /* 0-pad to multiple of 16 */
0889         *end = 0;
0890 }
0891 EXPORT_SYMBOL(cxgb4_write_sgl);
0892 
0893 /*  cxgb4_write_partial_sgl - populate SGL for partial packet
0894  *  @skb: the packet
0895  *  @q: the Tx queue we are writing into
0896  *  @sgl: starting location for writing the SGL
0897  *  @end: points right after the end of the SGL
0898  *  @addr: the list of bus addresses for the SGL elements
0899  *  @start: start offset in the SKB where partial data starts
0900  *  @len: length of data from @start to send out
0901  *
0902  *  This API will handle sending out partial data of a skb if required.
0903  *  Unlike cxgb4_write_sgl, @start can be any offset into the skb data,
0904  *  and @len will decide how much data after @start offset to send out.
0905  */
0906 void cxgb4_write_partial_sgl(const struct sk_buff *skb, struct sge_txq *q,
0907                  struct ulptx_sgl *sgl, u64 *end,
0908                  const dma_addr_t *addr, u32 start, u32 len)
0909 {
0910     struct ulptx_sge_pair buf[MAX_SKB_FRAGS / 2 + 1] = {0}, *to;
0911     u32 frag_size, skb_linear_data_len = skb_headlen(skb);
0912     struct skb_shared_info *si = skb_shinfo(skb);
0913     u8 i = 0, frag_idx = 0, nfrags = 0;
0914     skb_frag_t *frag;
0915 
0916     /* Fill the first SGL either from linear data or from partial
0917      * frag based on @start.
0918      */
0919     if (unlikely(start < skb_linear_data_len)) {
0920         frag_size = min(len, skb_linear_data_len - start);
0921         sgl->len0 = htonl(frag_size);
0922         sgl->addr0 = cpu_to_be64(addr[0] + start);
0923         len -= frag_size;
0924         nfrags++;
0925     } else {
0926         start -= skb_linear_data_len;
0927         frag = &si->frags[frag_idx];
0928         frag_size = skb_frag_size(frag);
0929         /* find the first frag */
0930         while (start >= frag_size) {
0931             start -= frag_size;
0932             frag_idx++;
0933             frag = &si->frags[frag_idx];
0934             frag_size = skb_frag_size(frag);
0935         }
0936 
0937         frag_size = min(len, skb_frag_size(frag) - start);
0938         sgl->len0 = cpu_to_be32(frag_size);
0939         sgl->addr0 = cpu_to_be64(addr[frag_idx + 1] + start);
0940         len -= frag_size;
0941         nfrags++;
0942         frag_idx++;
0943     }
0944 
0945     /* If the entire partial data fit in one SGL, then send it out
0946      * now.
0947      */
0948     if (!len)
0949         goto done;
0950 
0951     /* Most of the complexity below deals with the possibility we hit the
0952      * end of the queue in the middle of writing the SGL.  For this case
0953      * only we create the SGL in a temporary buffer and then copy it.
0954      */
0955     to = (u8 *)end > (u8 *)q->stat ? buf : sgl->sge;
0956 
0957     /* If the skb couldn't fit in first SGL completely, fill the
0958      * rest of the frags in subsequent SGLs. Note that each SGL
0959      * pair can store 2 frags.
0960      */
0961     while (len) {
0962         frag_size = min(len, skb_frag_size(&si->frags[frag_idx]));
0963         to->len[i & 1] = cpu_to_be32(frag_size);
0964         to->addr[i & 1] = cpu_to_be64(addr[frag_idx + 1]);
0965         if (i && (i & 1))
0966             to++;
0967         nfrags++;
0968         frag_idx++;
0969         i++;
0970         len -= frag_size;
0971     }
0972 
0973     /* If we ended in an odd boundary, then set the second SGL's
0974      * length in the pair to 0.
0975      */
0976     if (i & 1)
0977         to->len[1] = cpu_to_be32(0);
0978 
0979     /* Copy from temporary buffer to Tx ring, in case we hit the
0980      * end of the queue in the middle of writing the SGL.
0981      */
0982     if (unlikely((u8 *)end > (u8 *)q->stat)) {
0983         u32 part0 = (u8 *)q->stat - (u8 *)sgl->sge, part1;
0984 
0985         if (likely(part0))
0986             memcpy(sgl->sge, buf, part0);
0987         part1 = (u8 *)end - (u8 *)q->stat;
0988         memcpy(q->desc, (u8 *)buf + part0, part1);
0989         end = (void *)q->desc + part1;
0990     }
0991 
0992     /* 0-pad to multiple of 16 */
0993     if ((uintptr_t)end & 8)
0994         *end = 0;
0995 done:
0996     sgl->cmd_nsge = htonl(ULPTX_CMD_V(ULP_TX_SC_DSGL) |
0997             ULPTX_NSGE_V(nfrags));
0998 }
0999 EXPORT_SYMBOL(cxgb4_write_partial_sgl);
1000 
1001 /* This function copies a 64-byte coalesced work request to
1002  * memory-mapped BAR2 space.  For a coalesced WR, the SGE fetches
1003  * data from the FIFO instead of from the host.
1004  */
1005 static void cxgb_pio_copy(u64 __iomem *dst, u64 *src)
1006 {
1007     int count = 8;
1008 
1009     while (count) {
1010         writeq(*src, dst);
1011         src++;
1012         dst++;
1013         count--;
1014     }
1015 }
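/* Note: the loop above writes 8 quadwords, i.e. exactly the 64 bytes of one
 * Tx descriptor, into the Write Combining Gather Buffer in BAR2 space.
 */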
1016 
1017 /**
1018  *  cxgb4_ring_tx_db - check and potentially ring a Tx queue's doorbell
1019  *  @adap: the adapter
1020  *  @q: the Tx queue
1021  *  @n: number of new descriptors to give to HW
1022  *
1023  *  Ring the doorbell for a Tx queue.
1024  */
1025 inline void cxgb4_ring_tx_db(struct adapter *adap, struct sge_txq *q, int n)
1026 {
1027     /* Make sure that all writes to the TX Descriptors are committed
1028      * before we tell the hardware about them.
1029      */
1030     wmb();
1031 
1032     /* If we don't have access to the new User Doorbell (T5+), use the old
1033      * doorbell mechanism; otherwise use the new BAR2 mechanism.
1034      */
1035     if (unlikely(q->bar2_addr == NULL)) {
1036         u32 val = PIDX_V(n);
1037         unsigned long flags;
1038 
1039         /* For T4 we need to participate in the Doorbell Recovery
1040          * mechanism.
1041          */
1042         spin_lock_irqsave(&q->db_lock, flags);
1043         if (!q->db_disabled)
1044             t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL_A),
1045                      QID_V(q->cntxt_id) | val);
1046         else
1047             q->db_pidx_inc += n;
1048         q->db_pidx = q->pidx;
1049         spin_unlock_irqrestore(&q->db_lock, flags);
1050     } else {
1051         u32 val = PIDX_T5_V(n);
1052 
1053         /* T4 and later chips share the same PIDX field offset within
1054          * the doorbell, but T5 and later shrank the field in order to
1055          * gain a bit for Doorbell Priority.  The field was absurdly
1056          * large in the first place (14 bits) so we just use the T5
1057          * and later limits and warn if a Queue ID is too large.
1058          */
1059         WARN_ON(val & DBPRIO_F);
1060 
1061         /* If we're only writing a single TX Descriptor and we can use
1062          * Inferred QID registers, we can use the Write Combining
1063          * Gather Buffer; otherwise we use the simple doorbell.
1064          */
1065         if (n == 1 && q->bar2_qid == 0) {
1066             int index = (q->pidx
1067                      ? (q->pidx - 1)
1068                      : (q->size - 1));
1069             u64 *wr = (u64 *)&q->desc[index];
1070 
1071             cxgb_pio_copy((u64 __iomem *)
1072                       (q->bar2_addr + SGE_UDB_WCDOORBELL),
1073                       wr);
1074         } else {
1075             writel(val | QID_V(q->bar2_qid),
1076                    q->bar2_addr + SGE_UDB_KDOORBELL);
1077         }
1078 
1079         /* This Write Memory Barrier will force the write to the User
1080          * Doorbell area to be flushed.  This is needed to prevent
1081          * writes on different CPUs for the same queue from hitting
1082          * the adapter out of order.  This is required when some Work
1083          * Requests take the Write Combine Gather Buffer path (user
1084          * doorbell area offset [SGE_UDB_WCDOORBELL..+63]) and some
1085          * take the traditional path where we simply increment the
1086          * PIDX (User Doorbell area SGE_UDB_KDOORBELL) and have the
1087          * hardware DMA read the actual Work Request.
1088          */
1089         wmb();
1090     }
1091 }
1092 EXPORT_SYMBOL(cxgb4_ring_tx_db);
1093 
1094 /**
1095  *  cxgb4_inline_tx_skb - inline a packet's data into Tx descriptors
1096  *  @skb: the packet
1097  *  @q: the Tx queue where the packet will be inlined
1098  *  @pos: starting position in the Tx queue where to inline the packet
1099  *
1100  *  Inline a packet's contents directly into Tx descriptors, starting at
1101  *  the given position within the Tx DMA ring.
1102  *  Most of the complexity of this operation is dealing with wrap arounds
1103  *  in the middle of the packet we want to inline.
1104  */
1105 void cxgb4_inline_tx_skb(const struct sk_buff *skb,
1106              const struct sge_txq *q, void *pos)
1107 {
1108     int left = (void *)q->stat - pos;
1109     u64 *p;
1110 
1111     if (likely(skb->len <= left)) {
1112         if (likely(!skb->data_len))
1113             skb_copy_from_linear_data(skb, pos, skb->len);
1114         else
1115             skb_copy_bits(skb, 0, pos, skb->len);
1116         pos += skb->len;
1117     } else {
1118         skb_copy_bits(skb, 0, pos, left);
1119         skb_copy_bits(skb, left, q->desc, skb->len - left);
1120         pos = (void *)q->desc + (skb->len - left);
1121     }
1122 
1123     /* 0-pad to multiple of 16 */
1124     p = PTR_ALIGN(pos, 8);
1125     if ((uintptr_t)p & 8)
1126         *p = 0;
1127 }
1128 EXPORT_SYMBOL(cxgb4_inline_tx_skb);
1129 
1130 static void *inline_tx_skb_header(const struct sk_buff *skb,
1131                   const struct sge_txq *q,  void *pos,
1132                   int length)
1133 {
1134     u64 *p;
1135     int left = (void *)q->stat - pos;
1136 
1137     if (likely(length <= left)) {
1138         memcpy(pos, skb->data, length);
1139         pos += length;
1140     } else {
1141         memcpy(pos, skb->data, left);
1142         memcpy(q->desc, skb->data + left, length - left);
1143         pos = (void *)q->desc + (length - left);
1144     }
1145     /* 0-pad to multiple of 16 */
1146     p = PTR_ALIGN(pos, 8);
1147     if ((uintptr_t)p & 8) {
1148         *p = 0;
1149         return p + 1;
1150     }
1151     return p;
1152 }
1153 
1154 /*
1155  * Figure out what HW csum a packet wants and return the appropriate control
1156  * bits.
1157  */
1158 static u64 hwcsum(enum chip_type chip, const struct sk_buff *skb)
1159 {
1160     int csum_type;
1161     bool inner_hdr_csum = false;
1162     u16 proto, ver;
1163 
1164     if (skb->encapsulation &&
1165         (CHELSIO_CHIP_VERSION(chip) > CHELSIO_T5))
1166         inner_hdr_csum = true;
1167 
1168     if (inner_hdr_csum) {
1169         ver = inner_ip_hdr(skb)->version;
1170         proto = (ver == 4) ? inner_ip_hdr(skb)->protocol :
1171             inner_ipv6_hdr(skb)->nexthdr;
1172     } else {
1173         ver = ip_hdr(skb)->version;
1174         proto = (ver == 4) ? ip_hdr(skb)->protocol :
1175             ipv6_hdr(skb)->nexthdr;
1176     }
1177 
1178     if (ver == 4) {
1179         if (proto == IPPROTO_TCP)
1180             csum_type = TX_CSUM_TCPIP;
1181         else if (proto == IPPROTO_UDP)
1182             csum_type = TX_CSUM_UDPIP;
1183         else {
1184 nocsum:         /*
1185              * unknown protocol, disable HW csum
1186              * and hope a bad packet is detected
1187              */
1188             return TXPKT_L4CSUM_DIS_F;
1189         }
1190     } else {
1191         /*
1192          * this doesn't work with extension headers
1193          */
1194         if (proto == IPPROTO_TCP)
1195             csum_type = TX_CSUM_TCPIP6;
1196         else if (proto == IPPROTO_UDP)
1197             csum_type = TX_CSUM_UDPIP6;
1198         else
1199             goto nocsum;
1200     }
1201 
1202     if (likely(csum_type >= TX_CSUM_TCPIP)) {
1203         int eth_hdr_len, l4_len;
1204         u64 hdr_len;
1205 
1206         if (inner_hdr_csum) {
1207             /* This allows checksum offload for all encapsulated
1208              * packets like GRE etc..
1209              */
1210             l4_len = skb_inner_network_header_len(skb);
1211             eth_hdr_len = skb_inner_network_offset(skb) - ETH_HLEN;
1212         } else {
1213             l4_len = skb_network_header_len(skb);
1214             eth_hdr_len = skb_network_offset(skb) - ETH_HLEN;
1215         }
1216         hdr_len = TXPKT_IPHDR_LEN_V(l4_len);
1217 
1218         if (CHELSIO_CHIP_VERSION(chip) <= CHELSIO_T5)
1219             hdr_len |= TXPKT_ETHHDR_LEN_V(eth_hdr_len);
1220         else
1221             hdr_len |= T6_TXPKT_ETHHDR_LEN_V(eth_hdr_len);
1222         return TXPKT_CSUM_TYPE_V(csum_type) | hdr_len;
1223     } else {
1224         int start = skb_transport_offset(skb);
1225 
1226         return TXPKT_CSUM_TYPE_V(csum_type) |
1227             TXPKT_CSUM_START_V(start) |
1228             TXPKT_CSUM_LOC_V(start + skb->csum_offset);
1229     }
1230 }
1231 
1232 static void eth_txq_stop(struct sge_eth_txq *q)
1233 {
1234     netif_tx_stop_queue(q->txq);
1235     q->q.stops++;
1236 }
1237 
1238 static inline void txq_advance(struct sge_txq *q, unsigned int n)
1239 {
1240     q->in_use += n;
1241     q->pidx += n;
1242     if (q->pidx >= q->size)
1243         q->pidx -= q->size;
1244 }
1245 
1246 #ifdef CONFIG_CHELSIO_T4_FCOE
1247 static inline int
1248 cxgb_fcoe_offload(struct sk_buff *skb, struct adapter *adap,
1249           const struct port_info *pi, u64 *cntrl)
1250 {
1251     const struct cxgb_fcoe *fcoe = &pi->fcoe;
1252 
1253     if (!(fcoe->flags & CXGB_FCOE_ENABLED))
1254         return 0;
1255 
1256     if (skb->protocol != htons(ETH_P_FCOE))
1257         return 0;
1258 
1259     skb_reset_mac_header(skb);
1260     skb->mac_len = sizeof(struct ethhdr);
1261 
1262     skb_set_network_header(skb, skb->mac_len);
1263     skb_set_transport_header(skb, skb->mac_len + sizeof(struct fcoe_hdr));
1264 
1265     if (!cxgb_fcoe_sof_eof_supported(adap, skb))
1266         return -ENOTSUPP;
1267 
1268     /* FC CRC offload */
1269     *cntrl = TXPKT_CSUM_TYPE_V(TX_CSUM_FCOE) |
1270              TXPKT_L4CSUM_DIS_F | TXPKT_IPCSUM_DIS_F |
1271              TXPKT_CSUM_START_V(CXGB_FCOE_TXPKT_CSUM_START) |
1272              TXPKT_CSUM_END_V(CXGB_FCOE_TXPKT_CSUM_END) |
1273              TXPKT_CSUM_LOC_V(CXGB_FCOE_TXPKT_CSUM_END);
1274     return 0;
1275 }
1276 #endif /* CONFIG_CHELSIO_T4_FCOE */
1277 
1278 /* Returns the tunnel type if the hardware supports offloading it.
1279  * It is called only for T5 and onwards.
1280  */
1281 enum cpl_tx_tnl_lso_type cxgb_encap_offload_supported(struct sk_buff *skb)
1282 {
1283     u8 l4_hdr = 0;
1284     enum cpl_tx_tnl_lso_type tnl_type = TX_TNL_TYPE_OPAQUE;
1285     struct port_info *pi = netdev_priv(skb->dev);
1286     struct adapter *adapter = pi->adapter;
1287 
1288     if (skb->inner_protocol_type != ENCAP_TYPE_ETHER ||
1289         skb->inner_protocol != htons(ETH_P_TEB))
1290         return tnl_type;
1291 
1292     switch (vlan_get_protocol(skb)) {
1293     case htons(ETH_P_IP):
1294         l4_hdr = ip_hdr(skb)->protocol;
1295         break;
1296     case htons(ETH_P_IPV6):
1297         l4_hdr = ipv6_hdr(skb)->nexthdr;
1298         break;
1299     default:
1300         return tnl_type;
1301     }
1302 
1303     switch (l4_hdr) {
1304     case IPPROTO_UDP:
1305         if (adapter->vxlan_port == udp_hdr(skb)->dest)
1306             tnl_type = TX_TNL_TYPE_VXLAN;
1307         else if (adapter->geneve_port == udp_hdr(skb)->dest)
1308             tnl_type = TX_TNL_TYPE_GENEVE;
1309         break;
1310     default:
1311         return tnl_type;
1312     }
1313 
1314     return tnl_type;
1315 }
1316 
1317 static inline void t6_fill_tnl_lso(struct sk_buff *skb,
1318                    struct cpl_tx_tnl_lso *tnl_lso,
1319                    enum cpl_tx_tnl_lso_type tnl_type)
1320 {
1321     u32 val;
1322     int in_eth_xtra_len;
1323     int l3hdr_len = skb_network_header_len(skb);
1324     int eth_xtra_len = skb_network_offset(skb) - ETH_HLEN;
1325     const struct skb_shared_info *ssi = skb_shinfo(skb);
1326     bool v6 = (ip_hdr(skb)->version == 6);
1327 
1328     val = CPL_TX_TNL_LSO_OPCODE_V(CPL_TX_TNL_LSO) |
1329           CPL_TX_TNL_LSO_FIRST_F |
1330           CPL_TX_TNL_LSO_LAST_F |
1331           (v6 ? CPL_TX_TNL_LSO_IPV6OUT_F : 0) |
1332           CPL_TX_TNL_LSO_ETHHDRLENOUT_V(eth_xtra_len / 4) |
1333           CPL_TX_TNL_LSO_IPHDRLENOUT_V(l3hdr_len / 4) |
1334           (v6 ? 0 : CPL_TX_TNL_LSO_IPHDRCHKOUT_F) |
1335           CPL_TX_TNL_LSO_IPLENSETOUT_F |
1336           (v6 ? 0 : CPL_TX_TNL_LSO_IPIDINCOUT_F);
1337     tnl_lso->op_to_IpIdSplitOut = htonl(val);
1338 
1339     tnl_lso->IpIdOffsetOut = 0;
1340 
1341     /* Get the tunnel header length */
1342     val = skb_inner_mac_header(skb) - skb_mac_header(skb);
1343     in_eth_xtra_len = skb_inner_network_header(skb) -
1344               skb_inner_mac_header(skb) - ETH_HLEN;
1345 
1346     switch (tnl_type) {
1347     case TX_TNL_TYPE_VXLAN:
1348     case TX_TNL_TYPE_GENEVE:
1349         tnl_lso->UdpLenSetOut_to_TnlHdrLen =
1350             htons(CPL_TX_TNL_LSO_UDPCHKCLROUT_F |
1351             CPL_TX_TNL_LSO_UDPLENSETOUT_F);
1352         break;
1353     default:
1354         tnl_lso->UdpLenSetOut_to_TnlHdrLen = 0;
1355         break;
1356     }
1357 
1358     tnl_lso->UdpLenSetOut_to_TnlHdrLen |=
1359          htons(CPL_TX_TNL_LSO_TNLHDRLEN_V(val) |
1360                CPL_TX_TNL_LSO_TNLTYPE_V(tnl_type));
1361 
1362     tnl_lso->r1 = 0;
1363 
1364     val = CPL_TX_TNL_LSO_ETHHDRLEN_V(in_eth_xtra_len / 4) |
1365           CPL_TX_TNL_LSO_IPV6_V(inner_ip_hdr(skb)->version == 6) |
1366           CPL_TX_TNL_LSO_IPHDRLEN_V(skb_inner_network_header_len(skb) / 4) |
1367           CPL_TX_TNL_LSO_TCPHDRLEN_V(inner_tcp_hdrlen(skb) / 4);
1368     tnl_lso->Flow_to_TcpHdrLen = htonl(val);
1369 
1370     tnl_lso->IpIdOffset = htons(0);
1371 
1372     tnl_lso->IpIdSplit_to_Mss = htons(CPL_TX_TNL_LSO_MSS_V(ssi->gso_size));
1373     tnl_lso->TCPSeqOffset = htonl(0);
1374     tnl_lso->EthLenOffset_Size = htonl(CPL_TX_TNL_LSO_SIZE_V(skb->len));
1375 }
1376 
1377 static inline void *write_tso_wr(struct adapter *adap, struct sk_buff *skb,
1378                  struct cpl_tx_pkt_lso_core *lso)
1379 {
1380     int eth_xtra_len = skb_network_offset(skb) - ETH_HLEN;
1381     int l3hdr_len = skb_network_header_len(skb);
1382     const struct skb_shared_info *ssi;
1383     bool ipv6 = false;
1384 
1385     ssi = skb_shinfo(skb);
1386     if (ssi->gso_type & SKB_GSO_TCPV6)
1387         ipv6 = true;
1388 
1389     lso->lso_ctrl = htonl(LSO_OPCODE_V(CPL_TX_PKT_LSO) |
1390                   LSO_FIRST_SLICE_F | LSO_LAST_SLICE_F |
1391                   LSO_IPV6_V(ipv6) |
1392                   LSO_ETHHDR_LEN_V(eth_xtra_len / 4) |
1393                   LSO_IPHDR_LEN_V(l3hdr_len / 4) |
1394                   LSO_TCPHDR_LEN_V(tcp_hdr(skb)->doff));
1395     lso->ipid_ofst = htons(0);
1396     lso->mss = htons(ssi->gso_size);
1397     lso->seqno_offset = htonl(0);
1398     if (is_t4(adap->params.chip))
1399         lso->len = htonl(skb->len);
1400     else
1401         lso->len = htonl(LSO_T5_XFER_SIZE_V(skb->len));
1402 
1403     return (void *)(lso + 1);
1404 }
1405 
1406 /**
1407  *  t4_sge_eth_txq_egress_update - handle Ethernet TX Queue update
1408  *  @adap: the adapter
1409  *  @eq: the Ethernet TX Queue
1410  *  @maxreclaim: the maximum number of TX Descriptors to reclaim or -1
1411  *
1412  *  We're typically called here to update the state of an Ethernet TX
1413  *  Queue with respect to the hardware's progress in consuming the TX
1414  *  Work Requests that we've put on that Egress Queue.  This happens
1415  *  when we get Egress Queue Update messages and also prophylactically
1416  *  in regular timer-based Ethernet TX Queue maintenance.
1417  */
1418 int t4_sge_eth_txq_egress_update(struct adapter *adap, struct sge_eth_txq *eq,
1419                  int maxreclaim)
1420 {
1421     unsigned int reclaimed, hw_cidx;
1422     struct sge_txq *q = &eq->q;
1423     int hw_in_use;
1424 
1425     if (!q->in_use || !__netif_tx_trylock(eq->txq))
1426         return 0;
1427 
1428     /* Reclaim pending completed TX Descriptors. */
1429     reclaimed = reclaim_completed_tx(adap, &eq->q, maxreclaim, true);
1430 
1431     hw_cidx = ntohs(READ_ONCE(q->stat->cidx));
1432     hw_in_use = q->pidx - hw_cidx;
1433     if (hw_in_use < 0)
1434         hw_in_use += q->size;
1435 
1436     /* If the TX Queue is currently stopped and there's now more than half
1437      * the queue available, restart it.  Otherwise bail out since the rest
1438      * of what we want do here is with the possibility of shipping any
1439      * currently buffered Coalesced TX Work Request.
1440      */
1441     if (netif_tx_queue_stopped(eq->txq) && hw_in_use < (q->size / 2)) {
1442         netif_tx_wake_queue(eq->txq);
1443         eq->q.restarts++;
1444     }
1445 
1446     __netif_tx_unlock(eq->txq);
1447     return reclaimed;
1448 }
1449 
1450 static inline int cxgb4_validate_skb(struct sk_buff *skb,
1451                      struct net_device *dev,
1452                      u32 min_pkt_len)
1453 {
1454     u32 max_pkt_len;
1455 
1456     /* The chip min packet length is 10 octets but some firmware
1457      * commands have a minimum packet length requirement. So, play
1458      * safe and reject anything shorter than @min_pkt_len.
1459      */
1460     if (unlikely(skb->len < min_pkt_len))
1461         return -EINVAL;
1462 
1463     /* Discard the packet if the length is greater than mtu */
1464     max_pkt_len = ETH_HLEN + dev->mtu;
1465 
1466     if (skb_vlan_tagged(skb))
1467         max_pkt_len += VLAN_HLEN;
1468 
1469     if (!skb_shinfo(skb)->gso_size && (unlikely(skb->len > max_pkt_len)))
1470         return -EINVAL;
1471 
1472     return 0;
1473 }
1474 
1475 static void *write_eo_udp_wr(struct sk_buff *skb, struct fw_eth_tx_eo_wr *wr,
1476                  u32 hdr_len)
1477 {
1478     wr->u.udpseg.type = FW_ETH_TX_EO_TYPE_UDPSEG;
1479     wr->u.udpseg.ethlen = skb_network_offset(skb);
1480     wr->u.udpseg.iplen = cpu_to_be16(skb_network_header_len(skb));
1481     wr->u.udpseg.udplen = sizeof(struct udphdr);
1482     wr->u.udpseg.rtplen = 0;
1483     wr->u.udpseg.r4 = 0;
1484     if (skb_shinfo(skb)->gso_size)
1485         wr->u.udpseg.mss = cpu_to_be16(skb_shinfo(skb)->gso_size);
1486     else
1487         wr->u.udpseg.mss = cpu_to_be16(skb->len - hdr_len);
1488     wr->u.udpseg.schedpktsize = wr->u.udpseg.mss;
1489     wr->u.udpseg.plen = cpu_to_be32(skb->len - hdr_len);
1490 
1491     return (void *)(wr + 1);
1492 }
1493 
1494 /**
1495  *  cxgb4_eth_xmit - add a packet to an Ethernet Tx queue
1496  *  @skb: the packet
1497  *  @dev: the egress net device
1498  *
1499  *  Add a packet to an SGE Ethernet Tx queue.  Runs with softirqs disabled.
1500  */
1501 static netdev_tx_t cxgb4_eth_xmit(struct sk_buff *skb, struct net_device *dev)
1502 {
1503     enum cpl_tx_tnl_lso_type tnl_type = TX_TNL_TYPE_OPAQUE;
1504     bool ptp_enabled = is_ptp_enabled(skb, dev);
1505     unsigned int last_desc, flits, ndesc;
1506     u32 wr_mid, ctrl0, op, sgl_off = 0;
1507     const struct skb_shared_info *ssi;
1508     int len, qidx, credits, ret, left;
1509     struct tx_sw_desc *sgl_sdesc;
1510     struct fw_eth_tx_eo_wr *eowr;
1511     struct fw_eth_tx_pkt_wr *wr;
1512     struct cpl_tx_pkt_core *cpl;
1513     const struct port_info *pi;
1514     bool immediate = false;
1515     u64 cntrl, *end, *sgl;
1516     struct sge_eth_txq *q;
1517     unsigned int chip_ver;
1518     struct adapter *adap;
1519 
1520     ret = cxgb4_validate_skb(skb, dev, ETH_HLEN);
1521     if (ret)
1522         goto out_free;
1523 
1524     pi = netdev_priv(dev);
1525     adap = pi->adapter;
1526     ssi = skb_shinfo(skb);
1527 #if IS_ENABLED(CONFIG_CHELSIO_IPSEC_INLINE)
1528     if (xfrm_offload(skb) && !ssi->gso_size)
1529         return adap->uld[CXGB4_ULD_IPSEC].tx_handler(skb, dev);
1530 #endif /* CHELSIO_IPSEC_INLINE */
1531 
1532 #if IS_ENABLED(CONFIG_CHELSIO_TLS_DEVICE)
1533     if (cxgb4_is_ktls_skb(skb) &&
1534         (skb->len - skb_tcp_all_headers(skb)))
1535         return adap->uld[CXGB4_ULD_KTLS].tx_handler(skb, dev);
1536 #endif /* CHELSIO_TLS_DEVICE */
1537 
1538     qidx = skb_get_queue_mapping(skb);
1539     if (ptp_enabled) {
1540         if (!(adap->ptp_tx_skb)) {
1541             skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
1542             adap->ptp_tx_skb = skb_get(skb);
1543         } else {
1544             goto out_free;
1545         }
1546         q = &adap->sge.ptptxq;
1547     } else {
1548         q = &adap->sge.ethtxq[qidx + pi->first_qset];
1549     }
1550     skb_tx_timestamp(skb);
1551 
1552     reclaim_completed_tx(adap, &q->q, -1, true);
1553     cntrl = TXPKT_L4CSUM_DIS_F | TXPKT_IPCSUM_DIS_F;
1554 
1555 #ifdef CONFIG_CHELSIO_T4_FCOE
1556     ret = cxgb_fcoe_offload(skb, adap, pi, &cntrl);
1557     if (unlikely(ret == -EOPNOTSUPP))
1558         goto out_free;
1559 #endif /* CONFIG_CHELSIO_T4_FCOE */
1560 
1561     chip_ver = CHELSIO_CHIP_VERSION(adap->params.chip);
1562     flits = calc_tx_flits(skb, chip_ver);
1563     ndesc = flits_to_desc(flits);
1564     credits = txq_avail(&q->q) - ndesc;
1565 
1566     if (unlikely(credits < 0)) {
1567         eth_txq_stop(q);
1568         dev_err(adap->pdev_dev,
1569             "%s: Tx ring %u full while queue awake!\n",
1570             dev->name, qidx);
1571         return NETDEV_TX_BUSY;
1572     }
1573 
1574     if (is_eth_imm(skb, chip_ver))
1575         immediate = true;
1576 
1577     if (skb->encapsulation && chip_ver > CHELSIO_T5)
1578         tnl_type = cxgb_encap_offload_supported(skb);
1579 
1580     last_desc = q->q.pidx + ndesc - 1;
1581     if (last_desc >= q->q.size)
1582         last_desc -= q->q.size;
1583     sgl_sdesc = &q->q.sdesc[last_desc];
1584 
1585     if (!immediate &&
1586         unlikely(cxgb4_map_skb(adap->pdev_dev, skb, sgl_sdesc->addr) < 0)) {
1587         memset(sgl_sdesc->addr, 0, sizeof(sgl_sdesc->addr));
1588         q->mapping_err++;
1589         goto out_free;
1590     }
1591 
1592     wr_mid = FW_WR_LEN16_V(DIV_ROUND_UP(flits, 2));
1593     if (unlikely(credits < ETHTXQ_STOP_THRES)) {
1594         /* After we're done injecting the Work Request for this
1595          * packet, we'll be below our "stop threshold" so stop the TX
1596          * Queue now and schedule a request for an SGE Egress Queue
1597          * Update message. The queue will get started later on when
1598          * the firmware processes this Work Request and sends us an
1599          * Egress Queue Status Update message indicating that space
1600          * has opened up.
1601          */
1602         eth_txq_stop(q);
1603         if (chip_ver > CHELSIO_T5)
1604             wr_mid |= FW_WR_EQUEQ_F | FW_WR_EQUIQ_F;
1605     }
1606 
1607     wr = (void *)&q->q.desc[q->q.pidx];
1608     eowr = (void *)&q->q.desc[q->q.pidx];
1609     wr->equiq_to_len16 = htonl(wr_mid);
1610     wr->r3 = cpu_to_be64(0);
1611     if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
1612         end = (u64 *)eowr + flits;
1613     else
1614         end = (u64 *)wr + flits;
1615 
1616     len = immediate ? skb->len : 0;
1617     len += sizeof(*cpl);
1618     if (ssi->gso_size && !(ssi->gso_type & SKB_GSO_UDP_L4)) {
1619         struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
1620         struct cpl_tx_tnl_lso *tnl_lso = (void *)(wr + 1);
1621 
1622         if (tnl_type)
1623             len += sizeof(*tnl_lso);
1624         else
1625             len += sizeof(*lso);
1626 
1627         wr->op_immdlen = htonl(FW_WR_OP_V(FW_ETH_TX_PKT_WR) |
1628                        FW_WR_IMMDLEN_V(len));
1629         if (tnl_type) {
1630             struct iphdr *iph = ip_hdr(skb);
1631 
1632             t6_fill_tnl_lso(skb, tnl_lso, tnl_type);
1633             cpl = (void *)(tnl_lso + 1);
1634             /* Driver is expected to compute partial checksum that
1635              * does not include the IP Total Length.
1636              */
1637             if (iph->version == 4) {
1638                 iph->check = 0;
1639                 iph->tot_len = 0;
1640                 iph->check = ~ip_fast_csum((u8 *)iph, iph->ihl);
1641             }
1642             if (skb->ip_summed == CHECKSUM_PARTIAL)
1643                 cntrl = hwcsum(adap->params.chip, skb);
1644         } else {
1645             cpl = write_tso_wr(adap, skb, lso);
1646             cntrl = hwcsum(adap->params.chip, skb);
1647         }
1648         sgl = (u64 *)(cpl + 1); /* SGL starts here */
1649         q->tso++;
1650         q->tx_cso += ssi->gso_segs;
1651     } else if (ssi->gso_size) {
1652         u64 *start;
1653         u32 hdrlen;
1654 
1655         hdrlen = eth_get_headlen(dev, skb->data, skb_headlen(skb));
1656         len += hdrlen;
1657         wr->op_immdlen = cpu_to_be32(FW_WR_OP_V(FW_ETH_TX_EO_WR) |
1658                          FW_ETH_TX_EO_WR_IMMDLEN_V(len));
1659         cpl = write_eo_udp_wr(skb, eowr, hdrlen);
1660         cntrl = hwcsum(adap->params.chip, skb);
1661 
1662         start = (u64 *)(cpl + 1);
1663         sgl = (u64 *)inline_tx_skb_header(skb, &q->q, (void *)start,
1664                           hdrlen);
1665         if (unlikely(start > sgl)) {
1666             left = (u8 *)end - (u8 *)q->q.stat;
1667             end = (void *)q->q.desc + left;
1668         }
1669         sgl_off = hdrlen;
1670         q->uso++;
1671         q->tx_cso += ssi->gso_segs;
1672     } else {
1673         if (ptp_enabled)
1674             op = FW_PTP_TX_PKT_WR;
1675         else
1676             op = FW_ETH_TX_PKT_WR;
1677         wr->op_immdlen = htonl(FW_WR_OP_V(op) |
1678                        FW_WR_IMMDLEN_V(len));
1679         cpl = (void *)(wr + 1);
1680         sgl = (u64 *)(cpl + 1);
1681         if (skb->ip_summed == CHECKSUM_PARTIAL) {
1682             cntrl = hwcsum(adap->params.chip, skb) |
1683                 TXPKT_IPCSUM_DIS_F;
1684             q->tx_cso++;
1685         }
1686     }
1687 
1688     if (unlikely((u8 *)sgl >= (u8 *)q->q.stat)) {
1689         /* If the current position is already at the end of the
1690          * txq, reset it to point to the start of the queue
1691          * and update the end pointer as well.
1692          */
1693         left = (u8 *)end - (u8 *)q->q.stat;
1694         end = (void *)q->q.desc + left;
1695         sgl = (void *)q->q.desc;
1696     }
1697 
1698     if (skb_vlan_tag_present(skb)) {
1699         q->vlan_ins++;
1700         cntrl |= TXPKT_VLAN_VLD_F | TXPKT_VLAN_V(skb_vlan_tag_get(skb));
1701 #ifdef CONFIG_CHELSIO_T4_FCOE
1702         if (skb->protocol == htons(ETH_P_FCOE))
1703             cntrl |= TXPKT_VLAN_V(
1704                  ((skb->priority & 0x7) << VLAN_PRIO_SHIFT));
1705 #endif /* CONFIG_CHELSIO_T4_FCOE */
1706     }
1707 
1708     ctrl0 = TXPKT_OPCODE_V(CPL_TX_PKT_XT) | TXPKT_INTF_V(pi->tx_chan) |
1709         TXPKT_PF_V(adap->pf);
1710     if (ptp_enabled)
1711         ctrl0 |= TXPKT_TSTAMP_F;
1712 #ifdef CONFIG_CHELSIO_T4_DCB
1713     if (is_t4(adap->params.chip))
1714         ctrl0 |= TXPKT_OVLAN_IDX_V(q->dcb_prio);
1715     else
1716         ctrl0 |= TXPKT_T5_OVLAN_IDX_V(q->dcb_prio);
1717 #endif
1718     cpl->ctrl0 = htonl(ctrl0);
1719     cpl->pack = htons(0);
1720     cpl->len = htons(skb->len);
1721     cpl->ctrl1 = cpu_to_be64(cntrl);
1722 
1723     if (immediate) {
1724         cxgb4_inline_tx_skb(skb, &q->q, sgl);
1725         dev_consume_skb_any(skb);
1726     } else {
1727         cxgb4_write_sgl(skb, &q->q, (void *)sgl, end, sgl_off,
1728                 sgl_sdesc->addr);
1729         skb_orphan(skb);
1730         sgl_sdesc->skb = skb;
1731     }
1732 
1733     txq_advance(&q->q, ndesc);
1734 
1735     cxgb4_ring_tx_db(adap, &q->q, ndesc);
1736     return NETDEV_TX_OK;
1737 
1738 out_free:
1739     dev_kfree_skb_any(skb);
1740     return NETDEV_TX_OK;
1741 }
1742 
1743 /* Constants ... */
1744 enum {
1745     /* Egress Queue sizes, producer and consumer indices are all in units
1746      * of Egress Context Units (bytes).  Note that as far as the hardware is
1747      * concerned, the free list is an Egress Queue (the host produces free
1748      * buffers which the hardware consumes) and free list entries are
1749      * 64-bit PCI DMA addresses.
1750      */
1751     EQ_UNIT = SGE_EQ_IDXSIZE,
1752     FL_PER_EQ_UNIT = EQ_UNIT / sizeof(__be64),
1753     TXD_PER_EQ_UNIT = EQ_UNIT / sizeof(__be64),
1754 
1755     T4VF_ETHTXQ_MAX_HDR = (sizeof(struct fw_eth_tx_pkt_vm_wr) +
1756                    sizeof(struct cpl_tx_pkt_lso_core) +
1757                    sizeof(struct cpl_tx_pkt_core)) / sizeof(__be64),
1758 };
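
/* Worked example of the unit arithmetic above, assuming the usual
 * SGE_EQ_IDXSIZE of 64 bytes (t4_hw.h holds the authoritative value):
 *
 *     EQ_UNIT         = 64 bytes
 *     FL_PER_EQ_UNIT  = 64 / sizeof(__be64) = 8 free-list pointers
 *     TXD_PER_EQ_UNIT = 64 / sizeof(__be64) = 8 flits
 *
 * so every producer/consumer index step the hardware reports corresponds
 * to eight 64-bit words of ring space.
 */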
1759 
1760 /**
1761  *  t4vf_is_eth_imm - can an Ethernet packet be sent as immediate data?
1762  *  @skb: the packet
1763  *
1764  *  Returns whether an Ethernet packet is small enough to fit completely as
1765  *  immediate data.
1766  */
1767 static inline int t4vf_is_eth_imm(const struct sk_buff *skb)
1768 {
1769     /* The VF Driver uses the FW_ETH_TX_PKT_VM_WR firmware Work Request
1770      * which does not accommodate immediate data.  We could dike out all
1771      * of the support code for immediate data but that would tie our hands
1772      * too much if we ever want to enhance the firmware.  It would also
1773      * create more differences between the PF and VF Drivers.
1774      */
1775     return false;
1776 }
1777 
1778 /**
1779  *  t4vf_calc_tx_flits - calculate the number of flits for a packet TX WR
1780  *  @skb: the packet
1781  *
1782  *  Returns the number of flits needed for a TX Work Request for the
1783  *  given Ethernet packet, including the needed WR and CPL headers.
1784  */
1785 static inline unsigned int t4vf_calc_tx_flits(const struct sk_buff *skb)
1786 {
1787     unsigned int flits;
1788 
1789     /* If the skb is small enough, we can pump it out as a work request
1790      * with only immediate data.  In that case we just have to have the
1791      * TX Packet header plus the skb data in the Work Request.
1792      */
1793     if (t4vf_is_eth_imm(skb))
1794         return DIV_ROUND_UP(skb->len + sizeof(struct cpl_tx_pkt),
1795                     sizeof(__be64));
1796 
1797     /* Otherwise, we're going to have to construct a Scatter/Gather List
1798      * of the skb body and fragments.  We also include the flits necessary
1799      * for the TX Packet Work Request and CPL.  We always have a firmware
1800      * Write Header (incorporated as part of the cpl_tx_pkt_lso and
1801      * cpl_tx_pkt structures), followed by either a TX Packet Write CPL
1802      * message or, if we're doing a Large Send Offload, an LSO CPL message
1803      * with an embedded TX Packet Write CPL message.
1804      */
1805     flits = sgl_len(skb_shinfo(skb)->nr_frags + 1);
1806     if (skb_shinfo(skb)->gso_size)
1807         flits += (sizeof(struct fw_eth_tx_pkt_vm_wr) +
1808               sizeof(struct cpl_tx_pkt_lso_core) +
1809               sizeof(struct cpl_tx_pkt_core)) / sizeof(__be64);
1810     else
1811         flits += (sizeof(struct fw_eth_tx_pkt_vm_wr) +
1812               sizeof(struct cpl_tx_pkt_core)) / sizeof(__be64);
1813     return flits;
1814 }
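
/* Worked example (illustrative, not driver code): a TSO skb whose headers
 * sit in the linear area and whose payload spans two page fragments needs
 * an SGL covering three DMA segments.  With the usual ULPTX SGL packing
 * that sgl_len() is expected to implement -- two flits for the command
 * plus first length/address pair, then 1.5 flits per further pair --
 * that SGL costs sgl_len(3) = 5 flits.  Assuming the VM WR + LSO CPL +
 * TX Packet CPL overhead comes to 8 flits (T4VF_ETHTXQ_MAX_HDR with the
 * usual structure sizes), the total is 13 flits, which flits_to_desc()
 * rounds up to two Tx descriptors.  The hypothetical helper below redoes
 * the arithmetic in plain C.
 */
static inline unsigned int example_vf_tso_flits(unsigned int nsegs)
{
    /* ULPTX SGL: 2 flits for the command + first pair, then 3 flits for
     * every further two pairs (1.5 flits per pair, rounded up).
     */
    unsigned int sgl_flits = (3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1) + 2;

    /* Assumed fixed overhead: VM WR + LSO CPL + TX Packet CPL = 8 flits */
    return sgl_flits + 8;
}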
1815 
1816 /**
1817  *  cxgb4_vf_eth_xmit - add a packet to an Ethernet TX queue
1818  *  @skb: the packet
1819  *  @dev: the egress net device
1820  *
1821  *  Add a packet to an SGE Ethernet TX queue.  Runs with softirqs disabled.
1822  */
1823 static netdev_tx_t cxgb4_vf_eth_xmit(struct sk_buff *skb,
1824                      struct net_device *dev)
1825 {
1826     unsigned int last_desc, flits, ndesc;
1827     const struct skb_shared_info *ssi;
1828     struct fw_eth_tx_pkt_vm_wr *wr;
1829     struct tx_sw_desc *sgl_sdesc;
1830     struct cpl_tx_pkt_core *cpl;
1831     const struct port_info *pi;
1832     struct sge_eth_txq *txq;
1833     struct adapter *adapter;
1834     int qidx, credits, ret;
1835     size_t fw_hdr_copy_len;
1836     unsigned int chip_ver;
1837     u64 cntrl, *end;
1838     u32 wr_mid;
1839 
1840     /* The chip minimum packet length is 10 octets but the firmware
1841      * command that we are using requires that we copy the Ethernet header
1842      * (including the VLAN tag) into the header so we reject anything
1843      * smaller than that ...
1844      */
1845     BUILD_BUG_ON(sizeof(wr->firmware) !=
1846              (sizeof(wr->ethmacdst) + sizeof(wr->ethmacsrc) +
1847               sizeof(wr->ethtype) + sizeof(wr->vlantci)));
1848     fw_hdr_copy_len = sizeof(wr->firmware);
1849     ret = cxgb4_validate_skb(skb, dev, fw_hdr_copy_len);
1850     if (ret)
1851         goto out_free;
1852 
1853     /* Figure out which TX Queue we're going to use. */
1854     pi = netdev_priv(dev);
1855     adapter = pi->adapter;
1856     qidx = skb_get_queue_mapping(skb);
1857     WARN_ON(qidx >= pi->nqsets);
1858     txq = &adapter->sge.ethtxq[pi->first_qset + qidx];
1859 
1860     /* Take this opportunity to reclaim any TX Descriptors whose DMA
1861      * transfers have completed.
1862      */
1863     reclaim_completed_tx(adapter, &txq->q, -1, true);
1864 
1865     /* Calculate the number of flits and TX Descriptors we're going to
1866      * need along with how many TX Descriptors will be left over after
1867      * we inject our Work Request.
1868      */
1869     flits = t4vf_calc_tx_flits(skb);
1870     ndesc = flits_to_desc(flits);
1871     credits = txq_avail(&txq->q) - ndesc;
1872 
1873     if (unlikely(credits < 0)) {
1874         /* Not enough room for this packet's Work Request.  Stop the
1875          * TX Queue and return a "busy" condition.  The queue will get
1876          * started later on when the firmware informs us that space
1877          * has opened up.
1878          */
1879         eth_txq_stop(txq);
1880         dev_err(adapter->pdev_dev,
1881             "%s: TX ring %u full while queue awake!\n",
1882             dev->name, qidx);
1883         return NETDEV_TX_BUSY;
1884     }
1885 
1886     last_desc = txq->q.pidx + ndesc - 1;
1887     if (last_desc >= txq->q.size)
1888         last_desc -= txq->q.size;
1889     sgl_sdesc = &txq->q.sdesc[last_desc];
1890 
1891     if (!t4vf_is_eth_imm(skb) &&
1892         unlikely(cxgb4_map_skb(adapter->pdev_dev, skb,
1893                    sgl_sdesc->addr) < 0)) {
1894         /* We need to map the skb into PCI DMA space (because it can't
1895          * be in-lined directly into the Work Request) and the mapping
1896          * operation failed.  Record the error and drop the packet.
1897          */
1898         memset(sgl_sdesc->addr, 0, sizeof(sgl_sdesc->addr));
1899         txq->mapping_err++;
1900         goto out_free;
1901     }
1902 
1903     chip_ver = CHELSIO_CHIP_VERSION(adapter->params.chip);
1904     wr_mid = FW_WR_LEN16_V(DIV_ROUND_UP(flits, 2));
1905     if (unlikely(credits < ETHTXQ_STOP_THRES)) {
1906         /* After we're done injecting the Work Request for this
1907          * packet, we'll be below our "stop threshold" so stop the TX
1908          * Queue now and schedule a request for an SGE Egress Queue
1909          * Update message.  The queue will get started later on when
1910          * the firmware processes this Work Request and sends us an
1911          * Egress Queue Status Update message indicating that space
1912          * has opened up.
1913          */
1914         eth_txq_stop(txq);
1915         if (chip_ver > CHELSIO_T5)
1916             wr_mid |= FW_WR_EQUEQ_F | FW_WR_EQUIQ_F;
1917     }
1918 
1919     /* Start filling in our Work Request.  Note that we do _not_ handle
1920      * the WR Header wrapping around the TX Descriptor Ring.  If our
1921      * maximum header size ever exceeds one TX Descriptor, we'll need to
1922      * do something else here.
1923      */
1924     WARN_ON(DIV_ROUND_UP(T4VF_ETHTXQ_MAX_HDR, TXD_PER_EQ_UNIT) > 1);
1925     wr = (void *)&txq->q.desc[txq->q.pidx];
1926     wr->equiq_to_len16 = cpu_to_be32(wr_mid);
1927     wr->r3[0] = cpu_to_be32(0);
1928     wr->r3[1] = cpu_to_be32(0);
1929     skb_copy_from_linear_data(skb, &wr->firmware, fw_hdr_copy_len);
1930     end = (u64 *)wr + flits;
1931 
1932     /* If this is a Large Send Offload packet we'll put in an LSO CPL
1933      * message with an encapsulated TX Packet CPL message.  Otherwise we
1934      * just use a TX Packet CPL message.
1935      */
1936     ssi = skb_shinfo(skb);
1937     if (ssi->gso_size) {
1938         struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
1939         bool v6 = (ssi->gso_type & SKB_GSO_TCPV6) != 0;
1940         int l3hdr_len = skb_network_header_len(skb);
1941         int eth_xtra_len = skb_network_offset(skb) - ETH_HLEN;
1942 
1943         wr->op_immdlen =
1944             cpu_to_be32(FW_WR_OP_V(FW_ETH_TX_PKT_VM_WR) |
1945                     FW_WR_IMMDLEN_V(sizeof(*lso) +
1946                             sizeof(*cpl)));
1947          /* Fill in the LSO CPL message. */
1948         lso->lso_ctrl =
1949             cpu_to_be32(LSO_OPCODE_V(CPL_TX_PKT_LSO) |
1950                     LSO_FIRST_SLICE_F |
1951                     LSO_LAST_SLICE_F |
1952                     LSO_IPV6_V(v6) |
1953                     LSO_ETHHDR_LEN_V(eth_xtra_len / 4) |
1954                     LSO_IPHDR_LEN_V(l3hdr_len / 4) |
1955                     LSO_TCPHDR_LEN_V(tcp_hdr(skb)->doff));
1956         lso->ipid_ofst = cpu_to_be16(0);
1957         lso->mss = cpu_to_be16(ssi->gso_size);
1958         lso->seqno_offset = cpu_to_be32(0);
1959         if (is_t4(adapter->params.chip))
1960             lso->len = cpu_to_be32(skb->len);
1961         else
1962             lso->len = cpu_to_be32(LSO_T5_XFER_SIZE_V(skb->len));
1963 
1964         /* Set up TX Packet CPL pointer, control word and perform
1965          * accounting.
1966          */
1967         cpl = (void *)(lso + 1);
1968 
1969         if (chip_ver <= CHELSIO_T5)
1970             cntrl = TXPKT_ETHHDR_LEN_V(eth_xtra_len);
1971         else
1972             cntrl = T6_TXPKT_ETHHDR_LEN_V(eth_xtra_len);
1973 
1974         cntrl |= TXPKT_CSUM_TYPE_V(v6 ?
1975                        TX_CSUM_TCPIP6 : TX_CSUM_TCPIP) |
1976              TXPKT_IPHDR_LEN_V(l3hdr_len);
1977         txq->tso++;
1978         txq->tx_cso += ssi->gso_segs;
1979     } else {
1980         int len;
1981 
1982         len = (t4vf_is_eth_imm(skb)
1983                ? skb->len + sizeof(*cpl)
1984                : sizeof(*cpl));
1985         wr->op_immdlen =
1986             cpu_to_be32(FW_WR_OP_V(FW_ETH_TX_PKT_VM_WR) |
1987                     FW_WR_IMMDLEN_V(len));
1988 
1989         /* Set up TX Packet CPL pointer, control word and perform
1990          * accounting.
1991          */
1992         cpl = (void *)(wr + 1);
1993         if (skb->ip_summed == CHECKSUM_PARTIAL) {
1994             cntrl = hwcsum(adapter->params.chip, skb) |
1995                 TXPKT_IPCSUM_DIS_F;
1996             txq->tx_cso++;
1997         } else {
1998             cntrl = TXPKT_L4CSUM_DIS_F | TXPKT_IPCSUM_DIS_F;
1999         }
2000     }
2001 
2002     /* If there's a VLAN tag present, add that to the list of things to
2003      * do in this Work Request.
2004      */
2005     if (skb_vlan_tag_present(skb)) {
2006         txq->vlan_ins++;
2007         cntrl |= TXPKT_VLAN_VLD_F | TXPKT_VLAN_V(skb_vlan_tag_get(skb));
2008     }
2009 
2010      /* Fill in the TX Packet CPL message header. */
2011     cpl->ctrl0 = cpu_to_be32(TXPKT_OPCODE_V(CPL_TX_PKT_XT) |
2012                  TXPKT_INTF_V(pi->port_id) |
2013                  TXPKT_PF_V(0));
2014     cpl->pack = cpu_to_be16(0);
2015     cpl->len = cpu_to_be16(skb->len);
2016     cpl->ctrl1 = cpu_to_be64(cntrl);
2017 
2018     /* Fill in the body of the TX Packet CPL message with either in-lined
2019      * data or a Scatter/Gather List.
2020      */
2021     if (t4vf_is_eth_imm(skb)) {
2022         /* In-line the packet's data and free the skb since we don't
2023          * need it any longer.
2024          */
2025         cxgb4_inline_tx_skb(skb, &txq->q, cpl + 1);
2026         dev_consume_skb_any(skb);
2027     } else {
2028         /* Write the skb's Scatter/Gather list into the TX Packet CPL
2029          * message and retain a pointer to the skb so we can free it
2030          * later when its DMA completes.  (We store the skb pointer
2031          * in the Software Descriptor corresponding to the last TX
2032          * Descriptor used by the Work Request.)
2033          *
2034          * The retained skb will be freed when the corresponding TX
2035          * Descriptors are reclaimed after their DMAs complete.
2036          * However, this could take quite a while since, in general,
2037          * the hardware is set up to be lazy about sending DMA
2038          * completion notifications to us and we mostly perform TX
2039          * reclaims in the transmit routine.
2040          *
2041          * This is good for performance but means that we rely on new
2042          * TX packets arriving to run the destructors of completed
2043          * packets, which open up space in their sockets' send queues.
2044          * Sometimes we do not get such new packets, causing TX to
2045          * stall.  A single UDP transmitter is a good example of this
2046          * situation.  We have a clean up timer that periodically
2047          * reclaims completed packets but it doesn't run often enough
2048          * (nor do we want it to) to prevent lengthy stalls.  A
2049          * solution to this problem is to run the destructor early,
2050          * after the packet is queued but before it's DMAd.  A con is
2051          * that we lie to socket memory accounting, but the amount of
2052          * extra memory is reasonable (limited by the number of TX
2053          * descriptors), the packets do actually get freed quickly by
2054          * new packets almost always, and for protocols like TCP that
2055          * wait for acks to really free up the data the extra memory
2056          * is even less.  On the positive side we run the destructors
2057          * on the sending CPU rather than on a potentially different
2058          * completing CPU, usually a good thing.
2059          *
2060          * Run the destructor before telling the DMA engine about the
2061          * packet to make sure it doesn't complete and get freed
2062          * prematurely.
2063          */
2064         struct ulptx_sgl *sgl = (struct ulptx_sgl *)(cpl + 1);
2065         struct sge_txq *tq = &txq->q;
2066 
2067         /* If the Work Request header was an exact multiple of our TX
2068          * Descriptor length, then it's possible that the starting SGL
2069          * pointer lines up exactly with the end of our TX Descriptor
2070          * ring.  If that's the case, wrap around to the beginning
2071          * here ...
2072          */
2073         if (unlikely((void *)sgl == (void *)tq->stat)) {
2074             sgl = (void *)tq->desc;
2075             end = (void *)((void *)tq->desc +
2076                        ((void *)end - (void *)tq->stat));
2077         }
2078 
2079         cxgb4_write_sgl(skb, tq, sgl, end, 0, sgl_sdesc->addr);
2080         skb_orphan(skb);
2081         sgl_sdesc->skb = skb;
2082     }
2083 
2084     /* Advance our internal TX Queue state, tell the hardware about
2085      * the new TX descriptors and return success.
2086      */
2087     txq_advance(&txq->q, ndesc);
2088 
2089     cxgb4_ring_tx_db(adapter, &txq->q, ndesc);
2090     return NETDEV_TX_OK;
2091 
2092 out_free:
2093     /* An error of some sort happened.  Free the TX skb and tell the
2094      * OS that we've "dealt" with the packet ...
2095      */
2096     dev_kfree_skb_any(skb);
2097     return NETDEV_TX_OK;
2098 }
2099 
2100 /**
2101  * reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
2102  * @q: the SGE control Tx queue
2103  *
2104  * This is a variant of cxgb4_reclaim_completed_tx() that is used
2105  * for Tx queues that send only immediate data (presently just
2106  * the control queues) and  thus do not have any sk_buffs to release.
2107  */
2108 static inline void reclaim_completed_tx_imm(struct sge_txq *q)
2109 {
2110     int hw_cidx = ntohs(READ_ONCE(q->stat->cidx));
2111     int reclaim = hw_cidx - q->cidx;
2112 
2113     if (reclaim < 0)
2114         reclaim += q->size;
2115 
2116     q->in_use -= reclaim;
2117     q->cidx = hw_cidx;
2118 }
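
/* Minimal sketch of the modular index arithmetic used above: the number
 * of descriptors the hardware has consumed is the distance from the
 * software consumer index to the hardware one, wrapped at the ring size.
 * Standalone illustration only; the names are hypothetical.
 */
static inline unsigned int example_ring_distance(unsigned int sw_cidx,
                                                 unsigned int hw_cidx,
                                                 unsigned int ring_size)
{
    int delta = (int)hw_cidx - (int)sw_cidx;

    return delta < 0 ? delta + ring_size : delta;
}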
2119 
2120 static inline void eosw_txq_advance_index(u32 *idx, u32 n, u32 max)
2121 {
2122     u32 val = *idx + n;
2123 
2124     if (val >= max)
2125         val -= max;
2126 
2127     *idx = val;
2128 }
2129 
2130 void cxgb4_eosw_txq_free_desc(struct adapter *adap,
2131                   struct sge_eosw_txq *eosw_txq, u32 ndesc)
2132 {
2133     struct tx_sw_desc *d;
2134 
2135     d = &eosw_txq->desc[eosw_txq->last_cidx];
2136     while (ndesc--) {
2137         if (d->skb) {
2138             if (d->addr[0]) {
2139                 unmap_skb(adap->pdev_dev, d->skb, d->addr);
2140                 memset(d->addr, 0, sizeof(d->addr));
2141             }
2142             dev_consume_skb_any(d->skb);
2143             d->skb = NULL;
2144         }
2145         eosw_txq_advance_index(&eosw_txq->last_cidx, 1,
2146                        eosw_txq->ndesc);
2147         d = &eosw_txq->desc[eosw_txq->last_cidx];
2148     }
2149 }
2150 
2151 static inline void eosw_txq_advance(struct sge_eosw_txq *eosw_txq, u32 n)
2152 {
2153     eosw_txq_advance_index(&eosw_txq->pidx, n, eosw_txq->ndesc);
2154     eosw_txq->inuse += n;
2155 }
2156 
2157 static inline int eosw_txq_enqueue(struct sge_eosw_txq *eosw_txq,
2158                    struct sk_buff *skb)
2159 {
2160     if (eosw_txq->inuse == eosw_txq->ndesc)
2161         return -ENOMEM;
2162 
2163     eosw_txq->desc[eosw_txq->pidx].skb = skb;
2164     return 0;
2165 }
2166 
2167 static inline struct sk_buff *eosw_txq_peek(struct sge_eosw_txq *eosw_txq)
2168 {
2169     return eosw_txq->desc[eosw_txq->last_pidx].skb;
2170 }
2171 
2172 static inline u8 ethofld_calc_tx_flits(struct adapter *adap,
2173                        struct sk_buff *skb, u32 hdr_len)
2174 {
2175     u8 flits, nsgl = 0;
2176     u32 wrlen;
2177 
2178     wrlen = sizeof(struct fw_eth_tx_eo_wr) + sizeof(struct cpl_tx_pkt_core);
2179     if (skb_shinfo(skb)->gso_size &&
2180         !(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4))
2181         wrlen += sizeof(struct cpl_tx_pkt_lso_core);
2182 
2183     wrlen += roundup(hdr_len, 16);
2184 
2185     /* Packet headers + WR + CPLs */
2186     flits = DIV_ROUND_UP(wrlen, 8);
2187 
2188     if (skb_shinfo(skb)->nr_frags > 0) {
2189         if (skb_headlen(skb) - hdr_len)
2190             nsgl = sgl_len(skb_shinfo(skb)->nr_frags + 1);
2191         else
2192             nsgl = sgl_len(skb_shinfo(skb)->nr_frags);
2193     } else if (skb->len - hdr_len) {
2194         nsgl = sgl_len(1);
2195     }
2196 
2197     return flits + nsgl;
2198 }
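
/* Illustrative sketch of the calculation above (hypothetical names): the
 * flit count is the fixed WR/CPL (and, for TSO, LSO CPL) overhead plus
 * the packet headers rounded up to 16 bytes, expressed in 8-byte flits,
 * plus however many SGL flits the payload needs.  Headers are inlined,
 * so payload-free skbs contribute no SGL flits at all.
 */
static inline unsigned int example_eo_flits(unsigned int wr_cpl_bytes,
                                            unsigned int hdr_len,
                                            unsigned int sgl_flits)
{
    unsigned int wrlen = wr_cpl_bytes + ((hdr_len + 15) & ~15U);

    return (wrlen + 7) / 8 + sgl_flits;    /* DIV_ROUND_UP(wrlen, 8) */
}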
2199 
2200 static void *write_eo_wr(struct adapter *adap, struct sge_eosw_txq *eosw_txq,
2201              struct sk_buff *skb, struct fw_eth_tx_eo_wr *wr,
2202              u32 hdr_len, u32 wrlen)
2203 {
2204     const struct skb_shared_info *ssi = skb_shinfo(skb);
2205     struct cpl_tx_pkt_core *cpl;
2206     u32 immd_len, wrlen16;
2207     bool compl = false;
2208     u8 ver, proto;
2209 
2210     ver = ip_hdr(skb)->version;
2211     proto = (ver == 6) ? ipv6_hdr(skb)->nexthdr : ip_hdr(skb)->protocol;
2212 
2213     wrlen16 = DIV_ROUND_UP(wrlen, 16);
2214     immd_len = sizeof(struct cpl_tx_pkt_core);
2215     if (skb_shinfo(skb)->gso_size &&
2216         !(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4))
2217         immd_len += sizeof(struct cpl_tx_pkt_lso_core);
2218     immd_len += hdr_len;
2219 
2220     if (!eosw_txq->ncompl ||
2221         (eosw_txq->last_compl + wrlen16) >=
2222         (adap->params.ofldq_wr_cred / 2)) {
2223         compl = true;
2224         eosw_txq->ncompl++;
2225         eosw_txq->last_compl = 0;
2226     }
2227 
2228     wr->op_immdlen = cpu_to_be32(FW_WR_OP_V(FW_ETH_TX_EO_WR) |
2229                      FW_ETH_TX_EO_WR_IMMDLEN_V(immd_len) |
2230                      FW_WR_COMPL_V(compl));
2231     wr->equiq_to_len16 = cpu_to_be32(FW_WR_LEN16_V(wrlen16) |
2232                      FW_WR_FLOWID_V(eosw_txq->hwtid));
2233     wr->r3 = 0;
2234     if (proto == IPPROTO_UDP) {
2235         cpl = write_eo_udp_wr(skb, wr, hdr_len);
2236     } else {
2237         wr->u.tcpseg.type = FW_ETH_TX_EO_TYPE_TCPSEG;
2238         wr->u.tcpseg.ethlen = skb_network_offset(skb);
2239         wr->u.tcpseg.iplen = cpu_to_be16(skb_network_header_len(skb));
2240         wr->u.tcpseg.tcplen = tcp_hdrlen(skb);
2241         wr->u.tcpseg.tsclk_tsoff = 0;
2242         wr->u.tcpseg.r4 = 0;
2243         wr->u.tcpseg.r5 = 0;
2244         wr->u.tcpseg.plen = cpu_to_be32(skb->len - hdr_len);
2245 
2246         if (ssi->gso_size) {
2247             struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
2248 
2249             wr->u.tcpseg.mss = cpu_to_be16(ssi->gso_size);
2250             cpl = write_tso_wr(adap, skb, lso);
2251         } else {
2252             wr->u.tcpseg.mss = cpu_to_be16(0xffff);
2253             cpl = (void *)(wr + 1);
2254         }
2255     }
2256 
2257     eosw_txq->cred -= wrlen16;
2258     eosw_txq->last_compl += wrlen16;
2259     return cpl;
2260 }
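
/* Sketch of the completion-request cadence used by write_eo_wr() above
 * (standalone, hypothetical names): request a firmware completion on the
 * first outstanding WR and again whenever roughly half of the offload
 * queue's WR credits have been consumed since the last request, so that
 * credit updates keep flowing back while traffic is continuous.
 */
static inline bool example_want_completion(unsigned int ncompl,
                                           unsigned int cred_since_compl,
                                           unsigned int wr16_this_wr,
                                           unsigned int total_wr_cred)
{
    return !ncompl ||
           cred_since_compl + wr16_this_wr >= total_wr_cred / 2;
}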
2261 
2262 static int ethofld_hard_xmit(struct net_device *dev,
2263                  struct sge_eosw_txq *eosw_txq)
2264 {
2265     struct port_info *pi = netdev2pinfo(dev);
2266     struct adapter *adap = netdev2adap(dev);
2267     u32 wrlen, wrlen16, hdr_len, data_len;
2268     enum sge_eosw_state next_state;
2269     u64 cntrl, *start, *end, *sgl;
2270     struct sge_eohw_txq *eohw_txq;
2271     struct cpl_tx_pkt_core *cpl;
2272     struct fw_eth_tx_eo_wr *wr;
2273     bool skip_eotx_wr = false;
2274     struct tx_sw_desc *d;
2275     struct sk_buff *skb;
2276     int left, ret = 0;
2277     u8 flits, ndesc;
2278 
2279     eohw_txq = &adap->sge.eohw_txq[eosw_txq->hwqid];
2280     spin_lock(&eohw_txq->lock);
2281     reclaim_completed_tx_imm(&eohw_txq->q);
2282 
2283     d = &eosw_txq->desc[eosw_txq->last_pidx];
2284     skb = d->skb;
2285     skb_tx_timestamp(skb);
2286 
2287     wr = (struct fw_eth_tx_eo_wr *)&eohw_txq->q.desc[eohw_txq->q.pidx];
2288     if (unlikely(eosw_txq->state != CXGB4_EO_STATE_ACTIVE &&
2289              eosw_txq->last_pidx == eosw_txq->flowc_idx)) {
2290         hdr_len = skb->len;
2291         data_len = 0;
2292         flits = DIV_ROUND_UP(hdr_len, 8);
2293         if (eosw_txq->state == CXGB4_EO_STATE_FLOWC_OPEN_SEND)
2294             next_state = CXGB4_EO_STATE_FLOWC_OPEN_REPLY;
2295         else
2296             next_state = CXGB4_EO_STATE_FLOWC_CLOSE_REPLY;
2297         skip_eotx_wr = true;
2298     } else {
2299         hdr_len = eth_get_headlen(dev, skb->data, skb_headlen(skb));
2300         data_len = skb->len - hdr_len;
2301         flits = ethofld_calc_tx_flits(adap, skb, hdr_len);
2302     }
2303     ndesc = flits_to_desc(flits);
2304     wrlen = flits * 8;
2305     wrlen16 = DIV_ROUND_UP(wrlen, 16);
2306 
2307     left = txq_avail(&eohw_txq->q) - ndesc;
2308 
2309     /* If there are no descriptors left in hardware queues or no
2310      * CPL credits left in software queues, then wait for them
2311      * to come back and retry again. Note that we always request
2312      * for credits update via interrupt for every half credits
2313      * consumed. So, the interrupt will eventually restore the
2314      * credits and invoke the Tx path again.
2315      */
2316     if (unlikely(left < 0 || wrlen16 > eosw_txq->cred)) {
2317         ret = -ENOMEM;
2318         goto out_unlock;
2319     }
2320 
2321     if (unlikely(skip_eotx_wr)) {
2322         start = (u64 *)wr;
2323         eosw_txq->state = next_state;
2324         eosw_txq->cred -= wrlen16;
2325         eosw_txq->ncompl++;
2326         eosw_txq->last_compl = 0;
2327         goto write_wr_headers;
2328     }
2329 
2330     cpl = write_eo_wr(adap, eosw_txq, skb, wr, hdr_len, wrlen);
2331     cntrl = hwcsum(adap->params.chip, skb);
2332     if (skb_vlan_tag_present(skb))
2333         cntrl |= TXPKT_VLAN_VLD_F | TXPKT_VLAN_V(skb_vlan_tag_get(skb));
2334 
2335     cpl->ctrl0 = cpu_to_be32(TXPKT_OPCODE_V(CPL_TX_PKT_XT) |
2336                  TXPKT_INTF_V(pi->tx_chan) |
2337                  TXPKT_PF_V(adap->pf));
2338     cpl->pack = 0;
2339     cpl->len = cpu_to_be16(skb->len);
2340     cpl->ctrl1 = cpu_to_be64(cntrl);
2341 
2342     start = (u64 *)(cpl + 1);
2343 
2344 write_wr_headers:
2345     sgl = (u64 *)inline_tx_skb_header(skb, &eohw_txq->q, (void *)start,
2346                       hdr_len);
2347     if (data_len) {
2348         ret = cxgb4_map_skb(adap->pdev_dev, skb, d->addr);
2349         if (unlikely(ret)) {
2350             memset(d->addr, 0, sizeof(d->addr));
2351             eohw_txq->mapping_err++;
2352             goto out_unlock;
2353         }
2354 
2355         end = (u64 *)wr + flits;
2356         if (unlikely(start > sgl)) {
2357             left = (u8 *)end - (u8 *)eohw_txq->q.stat;
2358             end = (void *)eohw_txq->q.desc + left;
2359         }
2360 
2361         if (unlikely((u8 *)sgl >= (u8 *)eohw_txq->q.stat)) {
2362             /* If the current position is already at the end of the
2363              * txq, reset it to point to the start of the queue
2364              * and update the end pointer as well.
2365              */
2366             left = (u8 *)end - (u8 *)eohw_txq->q.stat;
2367 
2368             end = (void *)eohw_txq->q.desc + left;
2369             sgl = (void *)eohw_txq->q.desc;
2370         }
2371 
2372         cxgb4_write_sgl(skb, &eohw_txq->q, (void *)sgl, end, hdr_len,
2373                 d->addr);
2374     }
2375 
2376     if (skb_shinfo(skb)->gso_size) {
2377         if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
2378             eohw_txq->uso++;
2379         else
2380             eohw_txq->tso++;
2381         eohw_txq->tx_cso += skb_shinfo(skb)->gso_segs;
2382     } else if (skb->ip_summed == CHECKSUM_PARTIAL) {
2383         eohw_txq->tx_cso++;
2384     }
2385 
2386     if (skb_vlan_tag_present(skb))
2387         eohw_txq->vlan_ins++;
2388 
2389     txq_advance(&eohw_txq->q, ndesc);
2390     cxgb4_ring_tx_db(adap, &eohw_txq->q, ndesc);
2391     eosw_txq_advance_index(&eosw_txq->last_pidx, 1, eosw_txq->ndesc);
2392 
2393 out_unlock:
2394     spin_unlock(&eohw_txq->lock);
2395     return ret;
2396 }
2397 
2398 static void ethofld_xmit(struct net_device *dev, struct sge_eosw_txq *eosw_txq)
2399 {
2400     struct sk_buff *skb;
2401     int pktcount, ret;
2402 
2403     switch (eosw_txq->state) {
2404     case CXGB4_EO_STATE_ACTIVE:
2405     case CXGB4_EO_STATE_FLOWC_OPEN_SEND:
2406     case CXGB4_EO_STATE_FLOWC_CLOSE_SEND:
2407         pktcount = eosw_txq->pidx - eosw_txq->last_pidx;
2408         if (pktcount < 0)
2409             pktcount += eosw_txq->ndesc;
2410         break;
2411     case CXGB4_EO_STATE_FLOWC_OPEN_REPLY:
2412     case CXGB4_EO_STATE_FLOWC_CLOSE_REPLY:
2413     case CXGB4_EO_STATE_CLOSED:
2414     default:
2415         return;
2416     }
2417 
2418     while (pktcount--) {
2419         skb = eosw_txq_peek(eosw_txq);
2420         if (!skb) {
2421             eosw_txq_advance_index(&eosw_txq->last_pidx, 1,
2422                            eosw_txq->ndesc);
2423             continue;
2424         }
2425 
2426         ret = ethofld_hard_xmit(dev, eosw_txq);
2427         if (ret)
2428             break;
2429     }
2430 }
2431 
2432 static netdev_tx_t cxgb4_ethofld_xmit(struct sk_buff *skb,
2433                       struct net_device *dev)
2434 {
2435     struct cxgb4_tc_port_mqprio *tc_port_mqprio;
2436     struct port_info *pi = netdev2pinfo(dev);
2437     struct adapter *adap = netdev2adap(dev);
2438     struct sge_eosw_txq *eosw_txq;
2439     u32 qid;
2440     int ret;
2441 
2442     ret = cxgb4_validate_skb(skb, dev, ETH_HLEN);
2443     if (ret)
2444         goto out_free;
2445 
2446     tc_port_mqprio = &adap->tc_mqprio->port_mqprio[pi->port_id];
2447     qid = skb_get_queue_mapping(skb) - pi->nqsets;
2448     eosw_txq = &tc_port_mqprio->eosw_txq[qid];
2449     spin_lock_bh(&eosw_txq->lock);
2450     if (eosw_txq->state != CXGB4_EO_STATE_ACTIVE)
2451         goto out_unlock;
2452 
2453     ret = eosw_txq_enqueue(eosw_txq, skb);
2454     if (ret)
2455         goto out_unlock;
2456 
2457     /* SKB is queued for processing until credits are available.
2458      * So, call the destructor now and we'll free the skb later
2459      * after it has been successfully transmitted.
2460      */
2461     skb_orphan(skb);
2462 
2463     eosw_txq_advance(eosw_txq, 1);
2464     ethofld_xmit(dev, eosw_txq);
2465     spin_unlock_bh(&eosw_txq->lock);
2466     return NETDEV_TX_OK;
2467 
2468 out_unlock:
2469     spin_unlock_bh(&eosw_txq->lock);
2470 out_free:
2471     dev_kfree_skb_any(skb);
2472     return NETDEV_TX_OK;
2473 }
2474 
2475 netdev_tx_t t4_start_xmit(struct sk_buff *skb, struct net_device *dev)
2476 {
2477     struct port_info *pi = netdev_priv(dev);
2478     u16 qid = skb_get_queue_mapping(skb);
2479 
2480     if (unlikely(pi->eth_flags & PRIV_FLAG_PORT_TX_VM))
2481         return cxgb4_vf_eth_xmit(skb, dev);
2482 
2483     if (unlikely(qid >= pi->nqsets))
2484         return cxgb4_ethofld_xmit(skb, dev);
2485 
2486     if (is_ptp_enabled(skb, dev)) {
2487         struct adapter *adap = netdev2adap(dev);
2488         netdev_tx_t ret;
2489 
2490         spin_lock(&adap->ptp_lock);
2491         ret = cxgb4_eth_xmit(skb, dev);
2492         spin_unlock(&adap->ptp_lock);
2493         return ret;
2494     }
2495 
2496     return cxgb4_eth_xmit(skb, dev);
2497 }
2498 
2499 static void eosw_txq_flush_pending_skbs(struct sge_eosw_txq *eosw_txq)
2500 {
2501     int pktcount = eosw_txq->pidx - eosw_txq->last_pidx;
2502     int pidx = eosw_txq->pidx;
2503     struct sk_buff *skb;
2504 
2505     if (!pktcount)
2506         return;
2507 
2508     if (pktcount < 0)
2509         pktcount += eosw_txq->ndesc;
2510 
2511     while (pktcount--) {
2512         pidx--;
2513         if (pidx < 0)
2514             pidx += eosw_txq->ndesc;
2515 
2516         skb = eosw_txq->desc[pidx].skb;
2517         if (skb) {
2518             dev_consume_skb_any(skb);
2519             eosw_txq->desc[pidx].skb = NULL;
2520             eosw_txq->inuse--;
2521         }
2522     }
2523 
2524     eosw_txq->pidx = eosw_txq->last_pidx + 1;
2525 }
2526 
2527 /**
2528  * cxgb4_ethofld_send_flowc - Send ETHOFLD flowc request to bind eotid to tc.
2529  * @dev: netdevice
2530  * @eotid: ETHOFLD tid to bind/unbind
2531  * @tc: traffic class. If set to FW_SCHED_CLS_NONE, then unbinds the @eotid
2532  *
2533  * Send a FLOWC work request to bind an ETHOFLD TID to a traffic class.
2534  * If @tc is set to FW_SCHED_CLS_NONE, then the @eotid is unbound from
2535  * a traffic class.
2536  */
2537 int cxgb4_ethofld_send_flowc(struct net_device *dev, u32 eotid, u32 tc)
2538 {
2539     struct port_info *pi = netdev2pinfo(dev);
2540     struct adapter *adap = netdev2adap(dev);
2541     enum sge_eosw_state next_state;
2542     struct sge_eosw_txq *eosw_txq;
2543     u32 len, len16, nparams = 6;
2544     struct fw_flowc_wr *flowc;
2545     struct eotid_entry *entry;
2546     struct sge_ofld_rxq *rxq;
2547     struct sk_buff *skb;
2548     int ret = 0;
2549 
2550     len = struct_size(flowc, mnemval, nparams);
2551     len16 = DIV_ROUND_UP(len, 16);
2552 
2553     entry = cxgb4_lookup_eotid(&adap->tids, eotid);
2554     if (!entry)
2555         return -ENOMEM;
2556 
2557     eosw_txq = (struct sge_eosw_txq *)entry->data;
2558     if (!eosw_txq)
2559         return -ENOMEM;
2560 
2561     if (!(adap->flags & CXGB4_FW_OK)) {
2562         /* Don't stall caller when access to FW is lost */
2563         complete(&eosw_txq->completion);
2564         return -EIO;
2565     }
2566 
2567     skb = alloc_skb(len, GFP_KERNEL);
2568     if (!skb)
2569         return -ENOMEM;
2570 
2571     spin_lock_bh(&eosw_txq->lock);
2572     if (tc != FW_SCHED_CLS_NONE) {
2573         if (eosw_txq->state != CXGB4_EO_STATE_CLOSED)
2574             goto out_free_skb;
2575 
2576         next_state = CXGB4_EO_STATE_FLOWC_OPEN_SEND;
2577     } else {
2578         if (eosw_txq->state != CXGB4_EO_STATE_ACTIVE)
2579             goto out_free_skb;
2580 
2581         next_state = CXGB4_EO_STATE_FLOWC_CLOSE_SEND;
2582     }
2583 
2584     flowc = __skb_put(skb, len);
2585     memset(flowc, 0, len);
2586 
2587     rxq = &adap->sge.eohw_rxq[eosw_txq->hwqid];
2588     flowc->flowid_len16 = cpu_to_be32(FW_WR_LEN16_V(len16) |
2589                       FW_WR_FLOWID_V(eosw_txq->hwtid));
2590     flowc->op_to_nparams = cpu_to_be32(FW_WR_OP_V(FW_FLOWC_WR) |
2591                        FW_FLOWC_WR_NPARAMS_V(nparams) |
2592                        FW_WR_COMPL_V(1));
2593     flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
2594     flowc->mnemval[0].val = cpu_to_be32(FW_PFVF_CMD_PFN_V(adap->pf));
2595     flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
2596     flowc->mnemval[1].val = cpu_to_be32(pi->tx_chan);
2597     flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
2598     flowc->mnemval[2].val = cpu_to_be32(pi->tx_chan);
2599     flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
2600     flowc->mnemval[3].val = cpu_to_be32(rxq->rspq.abs_id);
2601     flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS;
2602     flowc->mnemval[4].val = cpu_to_be32(tc);
2603     flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_EOSTATE;
2604     flowc->mnemval[5].val = cpu_to_be32(tc == FW_SCHED_CLS_NONE ?
2605                         FW_FLOWC_MNEM_EOSTATE_CLOSING :
2606                         FW_FLOWC_MNEM_EOSTATE_ESTABLISHED);
2607 
2608     /* Free up any pending skbs to ensure there's room for
2609      * termination FLOWC.
2610      */
2611     if (tc == FW_SCHED_CLS_NONE)
2612         eosw_txq_flush_pending_skbs(eosw_txq);
2613 
2614     ret = eosw_txq_enqueue(eosw_txq, skb);
2615     if (ret)
2616         goto out_free_skb;
2617 
2618     eosw_txq->state = next_state;
2619     eosw_txq->flowc_idx = eosw_txq->pidx;
2620     eosw_txq_advance(eosw_txq, 1);
2621     ethofld_xmit(dev, eosw_txq);
2622 
2623     spin_unlock_bh(&eosw_txq->lock);
2624     return 0;
2625 
2626 out_free_skb:
2627     dev_consume_skb_any(skb);
2628     spin_unlock_bh(&eosw_txq->lock);
2629     return ret;
2630 }
2631 
2632 /**
2633  *  is_imm - check whether a packet can be sent as immediate data
2634  *  @skb: the packet
2635  *
2636  *  Returns true if a packet can be sent as a WR with immediate data.
2637  */
2638 static inline int is_imm(const struct sk_buff *skb)
2639 {
2640     return skb->len <= MAX_CTRL_WR_LEN;
2641 }
2642 
2643 /**
2644  *  ctrlq_check_stop - check if a control queue is full and should stop
2645  *  @q: the queue
2646  *  @wr: most recent WR written to the queue
2647  *
2648  *  Check if a control queue has become full and should be stopped.
2649  *  We clean up control queue descriptors very lazily, only when we run out.
2650  *  If the queue is still full after reclaiming any completed descriptors
2651  *  we suspend it and have the last WR wake it up.
2652  */
2653 static void ctrlq_check_stop(struct sge_ctrl_txq *q, struct fw_wr_hdr *wr)
2654 {
2655     reclaim_completed_tx_imm(&q->q);
2656     if (unlikely(txq_avail(&q->q) < TXQ_STOP_THRES)) {
2657         wr->lo |= htonl(FW_WR_EQUEQ_F | FW_WR_EQUIQ_F);
2658         q->q.stops++;
2659         q->full = 1;
2660     }
2661 }
2662 
2663 #define CXGB4_SELFTEST_LB_STR "CHELSIO_SELFTEST"
2664 
2665 int cxgb4_selftest_lb_pkt(struct net_device *netdev)
2666 {
2667     struct port_info *pi = netdev_priv(netdev);
2668     struct adapter *adap = pi->adapter;
2669     struct cxgb4_ethtool_lb_test *lb;
2670     int ret, i = 0, pkt_len, credits;
2671     struct fw_eth_tx_pkt_wr *wr;
2672     struct cpl_tx_pkt_core *cpl;
2673     u32 ctrl0, ndesc, flits;
2674     struct sge_eth_txq *q;
2675     u8 *sgl;
2676 
2677     pkt_len = ETH_HLEN + sizeof(CXGB4_SELFTEST_LB_STR);
2678 
2679     flits = DIV_ROUND_UP(pkt_len + sizeof(*cpl) + sizeof(*wr),
2680                  sizeof(__be64));
2681     ndesc = flits_to_desc(flits);
2682 
2683     lb = &pi->ethtool_lb;
2684     lb->loopback = 1;
2685 
2686     q = &adap->sge.ethtxq[pi->first_qset];
2687     __netif_tx_lock(q->txq, smp_processor_id());
2688 
2689     reclaim_completed_tx(adap, &q->q, -1, true);
2690     credits = txq_avail(&q->q) - ndesc;
2691     if (unlikely(credits < 0)) {
2692         __netif_tx_unlock(q->txq);
2693         return -ENOMEM;
2694     }
2695 
2696     wr = (void *)&q->q.desc[q->q.pidx];
2697     memset(wr, 0, sizeof(struct tx_desc));
2698 
2699     wr->op_immdlen = htonl(FW_WR_OP_V(FW_ETH_TX_PKT_WR) |
2700                    FW_WR_IMMDLEN_V(pkt_len +
2701                    sizeof(*cpl)));
2702     wr->equiq_to_len16 = htonl(FW_WR_LEN16_V(DIV_ROUND_UP(flits, 2)));
2703     wr->r3 = cpu_to_be64(0);
2704 
2705     cpl = (void *)(wr + 1);
2706     sgl = (u8 *)(cpl + 1);
2707 
2708     ctrl0 = TXPKT_OPCODE_V(CPL_TX_PKT_XT) | TXPKT_PF_V(adap->pf) |
2709         TXPKT_INTF_V(pi->tx_chan + 4);
2710 
2711     cpl->ctrl0 = htonl(ctrl0);
2712     cpl->pack = htons(0);
2713     cpl->len = htons(pkt_len);
2714     cpl->ctrl1 = cpu_to_be64(TXPKT_L4CSUM_DIS_F | TXPKT_IPCSUM_DIS_F);
2715 
2716     eth_broadcast_addr(sgl);
2717     i += ETH_ALEN;
2718     ether_addr_copy(&sgl[i], netdev->dev_addr);
2719     i += ETH_ALEN;
2720 
2721     snprintf(&sgl[i], sizeof(CXGB4_SELFTEST_LB_STR), "%s",
2722          CXGB4_SELFTEST_LB_STR);
2723 
2724     init_completion(&lb->completion);
2725     txq_advance(&q->q, ndesc);
2726     cxgb4_ring_tx_db(adap, &q->q, ndesc);
2727     __netif_tx_unlock(q->txq);
2728 
2729     /* wait for the pkt to return */
2730     ret = wait_for_completion_timeout(&lb->completion, 10 * HZ);
2731     if (!ret)
2732         ret = -ETIMEDOUT;
2733     else
2734         ret = lb->result;
2735 
2736     lb->loopback = 0;
2737 
2738     return ret;
2739 }
2740 
2741 /**
2742  *  ctrl_xmit - send a packet through an SGE control Tx queue
2743  *  @q: the control queue
2744  *  @skb: the packet
2745  *
2746  *  Send a packet through an SGE control Tx queue.  Packets sent through
2747  *  a control queue must fit entirely as immediate data.
2748  */
2749 static int ctrl_xmit(struct sge_ctrl_txq *q, struct sk_buff *skb)
2750 {
2751     unsigned int ndesc;
2752     struct fw_wr_hdr *wr;
2753 
2754     if (unlikely(!is_imm(skb))) {
2755         WARN_ON(1);
2756         dev_kfree_skb(skb);
2757         return NET_XMIT_DROP;
2758     }
2759 
2760     ndesc = DIV_ROUND_UP(skb->len, sizeof(struct tx_desc));
2761     spin_lock(&q->sendq.lock);
2762 
2763     if (unlikely(q->full)) {
2764         skb->priority = ndesc;                  /* save for restart */
2765         __skb_queue_tail(&q->sendq, skb);
2766         spin_unlock(&q->sendq.lock);
2767         return NET_XMIT_CN;
2768     }
2769 
2770     wr = (struct fw_wr_hdr *)&q->q.desc[q->q.pidx];
2771     cxgb4_inline_tx_skb(skb, &q->q, wr);
2772 
2773     txq_advance(&q->q, ndesc);
2774     if (unlikely(txq_avail(&q->q) < TXQ_STOP_THRES))
2775         ctrlq_check_stop(q, wr);
2776 
2777     cxgb4_ring_tx_db(q->adap, &q->q, ndesc);
2778     spin_unlock(&q->sendq.lock);
2779 
2780     kfree_skb(skb);
2781     return NET_XMIT_SUCCESS;
2782 }
2783 
2784 /**
2785  *  restart_ctrlq - restart a suspended control queue
2786  *  @t: pointer to the tasklet associated with this handler
2787  *
2788  *  Resumes transmission on a suspended Tx control queue.
2789  */
2790 static void restart_ctrlq(struct tasklet_struct *t)
2791 {
2792     struct sk_buff *skb;
2793     unsigned int written = 0;
2794     struct sge_ctrl_txq *q = from_tasklet(q, t, qresume_tsk);
2795 
2796     spin_lock(&q->sendq.lock);
2797     reclaim_completed_tx_imm(&q->q);
2798     BUG_ON(txq_avail(&q->q) < TXQ_STOP_THRES);  /* q should be empty */
2799 
2800     while ((skb = __skb_dequeue(&q->sendq)) != NULL) {
2801         struct fw_wr_hdr *wr;
2802         unsigned int ndesc = skb->priority;     /* previously saved */
2803 
2804         written += ndesc;
2805         /* Write descriptors and free skbs outside the lock to limit
2806          * wait times.  q->full is still set so new skbs will be queued.
2807          */
2808         wr = (struct fw_wr_hdr *)&q->q.desc[q->q.pidx];
2809         txq_advance(&q->q, ndesc);
2810         spin_unlock(&q->sendq.lock);
2811 
2812         cxgb4_inline_tx_skb(skb, &q->q, wr);
2813         kfree_skb(skb);
2814 
2815         if (unlikely(txq_avail(&q->q) < TXQ_STOP_THRES)) {
2816             unsigned long old = q->q.stops;
2817 
2818             ctrlq_check_stop(q, wr);
2819             if (q->q.stops != old) {          /* suspended anew */
2820                 spin_lock(&q->sendq.lock);
2821                 goto ringdb;
2822             }
2823         }
2824         if (written > 16) {
2825             cxgb4_ring_tx_db(q->adap, &q->q, written);
2826             written = 0;
2827         }
2828         spin_lock(&q->sendq.lock);
2829     }
2830     q->full = 0;
2831 ringdb:
2832     if (written)
2833         cxgb4_ring_tx_db(q->adap, &q->q, written);
2834     spin_unlock(&q->sendq.lock);
2835 }
2836 
2837 /**
2838  *  t4_mgmt_tx - send a management message
2839  *  @adap: the adapter
2840  *  @skb: the packet containing the management message
2841  *
2842  *  Send a management message through control queue 0.
2843  */
2844 int t4_mgmt_tx(struct adapter *adap, struct sk_buff *skb)
2845 {
2846     int ret;
2847 
2848     local_bh_disable();
2849     ret = ctrl_xmit(&adap->sge.ctrlq[0], skb);
2850     local_bh_enable();
2851     return ret;
2852 }
2853 
2854 /**
2855  *  is_ofld_imm - check whether a packet can be sent as immediate data
2856  *  @skb: the packet
2857  *
2858  *  Returns true if a packet can be sent as an offload WR with immediate
2859  *  data.
2860  *  FW_OFLD_TX_DATA_WR limits the payload to 255 bytes due to its 8-bit
2861  *  length field.  However, FW_ULPTX_WR commands have a 256-byte
2862  *  immediate-only payload limit.
2863  */
2864 static inline int is_ofld_imm(const struct sk_buff *skb)
2865 {
2866     struct work_request_hdr *req = (struct work_request_hdr *)skb->data;
2867     unsigned long opcode = FW_WR_OP_G(ntohl(req->wr_hi));
2868 
2869     if (unlikely(opcode == FW_ULPTX_WR))
2870         return skb->len <= MAX_IMM_ULPTX_WR_LEN;
2871     else if (opcode == FW_CRYPTO_LOOKASIDE_WR)
2872         return skb->len <= SGE_MAX_WR_LEN;
2873     else
2874         return skb->len <= MAX_IMM_OFLD_TX_DATA_WR_LEN;
2875 }
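
/* Minimal sketch of the decision above (hypothetical helper): the WR
 * opcode is peeked from the first word of the already-built work request
 * and compared against that opcode family's immediate-payload ceiling.
 * The limits are passed in rather than hard-coded here because their
 * values live in the driver and firmware headers.
 */
static inline bool example_fits_as_immediate(unsigned int wr_opcode,
                                             unsigned int wr_len,
                                             unsigned int ulptx_limit,
                                             unsigned int crypto_limit,
                                             unsigned int ofld_limit)
{
    if (wr_opcode == FW_ULPTX_WR)
        return wr_len <= ulptx_limit;
    if (wr_opcode == FW_CRYPTO_LOOKASIDE_WR)
        return wr_len <= crypto_limit;
    return wr_len <= ofld_limit;
}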
2876 
2877 /**
2878  *  calc_tx_flits_ofld - calculate # of flits for an offload packet
2879  *  @skb: the packet
2880  *
2881  *  Returns the number of flits needed for the given offload packet.
2882  *  These packets are already fully constructed and no additional headers
2883  *  will be added.
2884  */
2885 static inline unsigned int calc_tx_flits_ofld(const struct sk_buff *skb)
2886 {
2887     unsigned int flits, cnt;
2888 
2889     if (is_ofld_imm(skb))
2890         return DIV_ROUND_UP(skb->len, 8);
2891 
2892     flits = skb_transport_offset(skb) / 8U;   /* headers */
2893     cnt = skb_shinfo(skb)->nr_frags;
2894     if (skb_tail_pointer(skb) != skb_transport_header(skb))
2895         cnt++;
2896     return flits + sgl_len(cnt);
2897 }
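
/* Worked example (assuming the ULPTX SGL packing described earlier): an
 * offload skb whose pre-built WR/CPL headers occupy the first 40 bytes of
 * the linear area contributes 40 / 8 = 5 header flits.  If payload also
 * remains in the linear tail and two page fragments, the SGL must cover
 * cnt = 3 segments and costs sgl_len(3) = 5 flits, for 10 flits in total.
 * A packet that instead passes is_ofld_imm() is simply
 * DIV_ROUND_UP(skb->len, 8) flits of inline data.
 */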
2898 
2899 /**
2900  *  txq_stop_maperr - stop a Tx queue due to I/O MMU exhaustion
2901  *  @q: the queue to stop
2902  *
2903  *  Mark a Tx queue stopped due to I/O MMU exhaustion and resulting
2904  *  inability to map packets.  A periodic timer attempts to restart
2905  *  queues so marked.
2906  */
2907 static void txq_stop_maperr(struct sge_uld_txq *q)
2908 {
2909     q->mapping_err++;
2910     q->q.stops++;
2911     set_bit(q->q.cntxt_id - q->adap->sge.egr_start,
2912         q->adap->sge.txq_maperr);
2913 }
2914 
2915 /**
2916  *  ofldtxq_stop - stop an offload Tx queue that has become full
2917  *  @q: the queue to stop
2918  *  @wr: the Work Request causing the queue to become full
2919  *
2920  *  Stops an offload Tx queue that has become full and modifies the packet
2921  *  being written to request a wakeup.
2922  */
2923 static void ofldtxq_stop(struct sge_uld_txq *q, struct fw_wr_hdr *wr)
2924 {
2925     wr->lo |= htonl(FW_WR_EQUEQ_F | FW_WR_EQUIQ_F);
2926     q->q.stops++;
2927     q->full = 1;
2928 }
2929 
2930 /**
2931  *  service_ofldq - service/restart a suspended offload queue
2932  *  @q: the offload queue
2933  *
2934  *  Services an offload Tx queue by moving packets from its Pending Send
2935  *  Queue to the Hardware TX ring.  The function starts and ends with the
2936  *  Send Queue locked, but drops the lock while putting the skb at the
2937  *  head of the Send Queue onto the Hardware TX Ring.  Dropping the lock
2938  *  allows more skbs to be added to the Send Queue by other threads.
2939  *  The packet being processed at the head of the Pending Send Queue is
2940  *  left on the queue in case we experience DMA Mapping errors, etc.
2941  *  and need to give up and restart later.
2942  *
2943  *  service_ofldq() can be thought of as a task which opportunistically
2944  *  uses other threads' execution contexts.  We use the Offload Queue
2945  *  boolean "service_ofldq_running" to make sure that only one instance
2946  *  is ever running at a time ...
2947  */
2948 static void service_ofldq(struct sge_uld_txq *q)
2949     __must_hold(&q->sendq.lock)
2950 {
2951     u64 *pos, *before, *end;
2952     int credits;
2953     struct sk_buff *skb;
2954     struct sge_txq *txq;
2955     unsigned int left;
2956     unsigned int written = 0;
2957     unsigned int flits, ndesc;
2958 
2959     /* If another thread is currently in service_ofldq() processing the
2960      * Pending Send Queue then there's nothing to do. Otherwise, flag
2961      * that we're doing the work and continue.  Examining/modifying
2962      * the Offload Queue boolean "service_ofldq_running" must be done
2963      * while holding the Pending Send Queue Lock.
2964      */
2965     if (q->service_ofldq_running)
2966         return;
2967     q->service_ofldq_running = true;
2968 
2969     while ((skb = skb_peek(&q->sendq)) != NULL && !q->full) {
2970         /* We drop the lock while we're working with the skb at the
2971          * head of the Pending Send Queue.  This allows more skbs to
2972          * be added to the Pending Send Queue while we're working on
2973          * this one.  We don't need to lock to guard the TX Ring
2974          * updates because only one thread of execution is ever
2975          * allowed into service_ofldq() at a time.
2976          */
2977         spin_unlock(&q->sendq.lock);
2978 
2979         cxgb4_reclaim_completed_tx(q->adap, &q->q, false);
2980 
2981         flits = skb->priority;                /* previously saved */
2982         ndesc = flits_to_desc(flits);
2983         credits = txq_avail(&q->q) - ndesc;
2984         BUG_ON(credits < 0);
2985         if (unlikely(credits < TXQ_STOP_THRES))
2986             ofldtxq_stop(q, (struct fw_wr_hdr *)skb->data);
2987 
2988         pos = (u64 *)&q->q.desc[q->q.pidx];
2989         if (is_ofld_imm(skb))
2990             cxgb4_inline_tx_skb(skb, &q->q, pos);
2991         else if (cxgb4_map_skb(q->adap->pdev_dev, skb,
2992                        (dma_addr_t *)skb->head)) {
2993             txq_stop_maperr(q);
2994             spin_lock(&q->sendq.lock);
2995             break;
2996         } else {
2997             int last_desc, hdr_len = skb_transport_offset(skb);
2998 
2999             /* The WR headers may not fit within one descriptor.
3000              * So we need to deal with wrap-around here.
3001              */
3002             before = (u64 *)pos;
3003             end = (u64 *)pos + flits;
3004             txq = &q->q;
3005             pos = (void *)inline_tx_skb_header(skb, &q->q,
3006                                (void *)pos,
3007                                hdr_len);
3008             if (before > (u64 *)pos) {
3009                 left = (u8 *)end - (u8 *)txq->stat;
3010                 end = (void *)txq->desc + left;
3011             }
3012 
3013             /* If the current position is already at the end of the
3014              * ofld queue, reset it to point to the
3015              * start of the queue and update the end pointer as well.
3016              */
3017             if (pos == (u64 *)txq->stat) {
3018                 left = (u8 *)end - (u8 *)txq->stat;
3019                 end = (void *)txq->desc + left;
3020                 pos = (void *)txq->desc;
3021             }
3022 
3023             cxgb4_write_sgl(skb, &q->q, (void *)pos,
3024                     end, hdr_len,
3025                     (dma_addr_t *)skb->head);
3026 #ifdef CONFIG_NEED_DMA_MAP_STATE
3027             skb->dev = q->adap->port[0];
3028             skb->destructor = deferred_unmap_destructor;
3029 #endif
3030             last_desc = q->q.pidx + ndesc - 1;
3031             if (last_desc >= q->q.size)
3032                 last_desc -= q->q.size;
3033             q->q.sdesc[last_desc].skb = skb;
3034         }
3035 
3036         txq_advance(&q->q, ndesc);
3037         written += ndesc;
3038         if (unlikely(written > 32)) {
3039             cxgb4_ring_tx_db(q->adap, &q->q, written);
3040             written = 0;
3041         }
3042 
3043         /* Reacquire the Pending Send Queue Lock so we can unlink the
3044          * skb we've just successfully transferred to the TX Ring and
3045          * loop for the next skb which may be at the head of the
3046          * Pending Send Queue.
3047          */
3048         spin_lock(&q->sendq.lock);
3049         __skb_unlink(skb, &q->sendq);
3050         if (is_ofld_imm(skb))
3051             kfree_skb(skb);
3052     }
3053     if (likely(written))
3054         cxgb4_ring_tx_db(q->adap, &q->q, written);
3055 
3056     /* Indicate that no thread is processing the Pending Send Queue
3057      * currently.
3058      */
3059     q->service_ofldq_running = false;
3060 }
3061 
3062 /**
3063  *  ofld_xmit - send a packet through an offload queue
3064  *  @q: the Tx offload queue
3065  *  @skb: the packet
3066  *
3067  *  Send an offload packet through an SGE offload queue.
3068  */
3069 static int ofld_xmit(struct sge_uld_txq *q, struct sk_buff *skb)
3070 {
3071     skb->priority = calc_tx_flits_ofld(skb);       /* save for restart */
3072     spin_lock(&q->sendq.lock);
3073 
3074     /* Queue the new skb onto the Offload Queue's Pending Send Queue.  If
3075      * that results in this new skb being the only one on the queue, start
3076      * servicing it.  If there are other skbs already on the list, then
3077      * either the queue is currently being processed or it's been stopped
3078      * for some reason and it'll be restarted at a later time.  Restart
3079      * paths are triggered by events like experiencing a DMA Mapping Error
3080      * or filling the Hardware TX Ring.
3081      */
3082     __skb_queue_tail(&q->sendq, skb);
3083     if (q->sendq.qlen == 1)
3084         service_ofldq(q);
3085 
3086     spin_unlock(&q->sendq.lock);
3087     return NET_XMIT_SUCCESS;
3088 }
3089 
3090 /**
3091  *  restart_ofldq - restart a suspended offload queue
3092  *  @t: pointer to the tasklet associated with this handler
3093  *
3094  *  Resumes transmission on a suspended Tx offload queue.
3095  */
3096 static void restart_ofldq(struct tasklet_struct *t)
3097 {
3098     struct sge_uld_txq *q = from_tasklet(q, t, qresume_tsk);
3099 
3100     spin_lock(&q->sendq.lock);
3101     q->full = 0;            /* the queue actually is completely empty now */
3102     service_ofldq(q);
3103     spin_unlock(&q->sendq.lock);
3104 }
3105 
3106 /**
3107  *  skb_txq - return the Tx queue an offload packet should use
3108  *  @skb: the packet
3109  *
3110  *  Returns the Tx queue an offload packet should use as indicated by bits
3111  *  1-15 in the packet's queue_mapping.
3112  */
3113 static inline unsigned int skb_txq(const struct sk_buff *skb)
3114 {
3115     return skb->queue_mapping >> 1;
3116 }
3117 
3118 /**
3119  *  is_ctrl_pkt - return whether an offload packet is a control packet
3120  *  @skb: the packet
3121  *
3122  *  Returns whether an offload packet should use an OFLD or a CTRL
3123  *  Tx queue as indicated by bit 0 in the packet's queue_mapping.
3124  */
3125 static inline unsigned int is_ctrl_pkt(const struct sk_buff *skb)
3126 {
3127     return skb->queue_mapping & 1;
3128 }
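/* Editor's illustrative sketch, not part of the driver source: given the
 * queue_mapping layout decoded by skb_txq() and is_ctrl_pkt() above (bit 0
 * selects CTRL vs. OFLD, bits 1-15 carry the queue index), a ULD could
 * encode its choice like this.  The helper name is hypothetical.
 */
static inline void example_set_ofld_queue_mapping(struct sk_buff *skb,
                                                  unsigned int txq_idx,
                                                  bool want_ctrl_queue)
{
    /* bits 1-15: queue index, bit 0: CTRL (1) vs. OFLD (0) queue */
    skb->queue_mapping = (txq_idx << 1) | (want_ctrl_queue ? 1 : 0);
}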
3129 
3130 static inline int uld_send(struct adapter *adap, struct sk_buff *skb,
3131                unsigned int tx_uld_type)
3132 {
3133     struct sge_uld_txq_info *txq_info;
3134     struct sge_uld_txq *txq;
3135     unsigned int idx = skb_txq(skb);
3136 
3137     if (unlikely(is_ctrl_pkt(skb))) {
3138         /* Single ctrl queue is a requirement for LE workaround path */
3139         if (adap->tids.nsftids)
3140             idx = 0;
3141         return ctrl_xmit(&adap->sge.ctrlq[idx], skb);
3142     }
3143 
3144     txq_info = adap->sge.uld_txq_info[tx_uld_type];
3145     if (unlikely(!txq_info)) {
3146         WARN_ON(true);
3147         kfree_skb(skb);
3148         return NET_XMIT_DROP;
3149     }
3150 
3151     txq = &txq_info->uldtxq[idx];
3152     return ofld_xmit(txq, skb);
3153 }
3154 
3155 /**
3156  *  t4_ofld_send - send an offload packet
3157  *  @adap: the adapter
3158  *  @skb: the packet
3159  *
3160  *  Sends an offload packet.  We use the packet queue_mapping to select the
3161  *  appropriate Tx queue as follows: bit 0 indicates whether the packet
3162  *  should be sent as regular or control, bits 1-15 select the queue.
3163  */
3164 int t4_ofld_send(struct adapter *adap, struct sk_buff *skb)
3165 {
3166     int ret;
3167 
3168     local_bh_disable();
3169     ret = uld_send(adap, skb, CXGB4_TX_OFLD);
3170     local_bh_enable();
3171     return ret;
3172 }
3173 
3174 /**
3175  *  cxgb4_ofld_send - send an offload packet
3176  *  @dev: the net device
3177  *  @skb: the packet
3178  *
3179  *  Sends an offload packet.  This is an exported version of @t4_ofld_send,
3180  *  intended for ULDs.
3181  */
3182 int cxgb4_ofld_send(struct net_device *dev, struct sk_buff *skb)
3183 {
3184     return t4_ofld_send(netdev2adap(dev), skb);
3185 }
3186 EXPORT_SYMBOL(cxgb4_ofld_send);
3187 
3188 static void *inline_tx_header(const void *src,
3189                   const struct sge_txq *q,
3190                   void *pos, int length)
3191 {
3192     int left = (void *)q->stat - pos;
3193     u64 *p;
3194 
3195     if (likely(length <= left)) {
3196         memcpy(pos, src, length);
3197         pos += length;
3198     } else {
3199         memcpy(pos, src, left);
3200         memcpy(q->desc, src + left, length - left);
3201         pos = (void *)q->desc + (length - left);
3202     }
3203     /* 0-pad to multiple of 16 */
3204     p = PTR_ALIGN(pos, 8);
3205     if ((uintptr_t)p & 8) {
3206         *p = 0;
3207         return p + 1;
3208     }
3209     return p;
3210 }
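/* Worked example for the wrap-around copy above (illustrative, assumed
 * numbers): if only 40 bytes remain between "pos" and the status page at
 * q->stat but "length" is 100, the first 40 bytes land at pos and the
 * remaining 60 wrap to the start of q->desc; the returned pointer is then
 * rounded up so the next write begins on a 16-byte boundary, with an
 * 8-byte zero pad emitted when needed.
 */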
3211 
3212 /**
3213  *      ofld_xmit_direct - copy a WR into offload queue
3214  *      @q: the Tx offload queue
3215  *      @src: location of WR
3216  *      @len: WR length
3217  *
3218  *      Copy an immediate WR into an uncontended SGE offload queue.
3219  */
3220 static int ofld_xmit_direct(struct sge_uld_txq *q, const void *src,
3221                 unsigned int len)
3222 {
3223     unsigned int ndesc;
3224     int credits;
3225     u64 *pos;
3226 
3227     /* Use the lower limit as the cut-off */
3228     if (len > MAX_IMM_OFLD_TX_DATA_WR_LEN) {
3229         WARN_ON(1);
3230         return NET_XMIT_DROP;
3231     }
3232 
3233     /* Don't return NET_XMIT_CN here as the current
3234      * implementation doesn't queue the request
3235      * using an skb when the following conditions are not met
3236      */
3237     if (!spin_trylock(&q->sendq.lock))
3238         return NET_XMIT_DROP;
3239 
3240     if (q->full || !skb_queue_empty(&q->sendq) ||
3241         q->service_ofldq_running) {
3242         spin_unlock(&q->sendq.lock);
3243         return NET_XMIT_DROP;
3244     }
3245     ndesc = flits_to_desc(DIV_ROUND_UP(len, 8));
3246     credits = txq_avail(&q->q) - ndesc;
3247     pos = (u64 *)&q->q.desc[q->q.pidx];
3248 
3249     /* ofldtxq_stop modifies WR header in-situ */
3250     inline_tx_header(src, &q->q, pos, len);
3251     if (unlikely(credits < TXQ_STOP_THRES))
3252         ofldtxq_stop(q, (struct fw_wr_hdr *)pos);
3253     txq_advance(&q->q, ndesc);
3254     cxgb4_ring_tx_db(q->adap, &q->q, ndesc);
3255 
3256     spin_unlock(&q->sendq.lock);
3257     return NET_XMIT_SUCCESS;
3258 }
3259 
3260 int cxgb4_immdata_send(struct net_device *dev, unsigned int idx,
3261                const void *src, unsigned int len)
3262 {
3263     struct sge_uld_txq_info *txq_info;
3264     struct sge_uld_txq *txq;
3265     struct adapter *adap;
3266     int ret;
3267 
3268     adap = netdev2adap(dev);
3269 
3270     local_bh_disable();
3271     txq_info = adap->sge.uld_txq_info[CXGB4_TX_OFLD];
3272     if (unlikely(!txq_info)) {
3273         WARN_ON(true);
3274         local_bh_enable();
3275         return NET_XMIT_DROP;
3276     }
3277     txq = &txq_info->uldtxq[idx];
3278 
3279     ret = ofld_xmit_direct(txq, src, len);
3280     local_bh_enable();
3281     return net_xmit_eval(ret);
3282 }
3283 EXPORT_SYMBOL(cxgb4_immdata_send);
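/* Editor's usage sketch (assumption, not from the driver): a ULD that has
 * already built a small, fully immediate work request in "wr" could push it
 * through the uncontended fast path like this.  "wr", "wr_len" and
 * "queue_idx" are hypothetical; requests larger than the immediate-data
 * limit, or racing with a busy/backlogged queue, are rejected and must fall
 * back to the skb-based cxgb4_ofld_send() path.
 */
static int example_push_imm_wr(struct net_device *dev, unsigned int queue_idx,
                               const void *wr, unsigned int wr_len)
{
    int ret = cxgb4_immdata_send(dev, queue_idx, wr, wr_len);

    if (ret)
        return ret;    /* not queued; caller retries via cxgb4_ofld_send() */
    return 0;
}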
3284 
3285 /**
3286  *  t4_crypto_send - send crypto packet
3287  *  @adap: the adapter
3288  *  @skb: the packet
3289  *
3290  *  Sends a crypto packet.  We use the packet queue_mapping to select the
3291  *  appropriate Tx queue as follows: bit 0 indicates whether the packet
3292  *  should be sent as regular or control, bits 1-15 select the queue.
3293  */
3294 static int t4_crypto_send(struct adapter *adap, struct sk_buff *skb)
3295 {
3296     int ret;
3297 
3298     local_bh_disable();
3299     ret = uld_send(adap, skb, CXGB4_TX_CRYPTO);
3300     local_bh_enable();
3301     return ret;
3302 }
3303 
3304 /**
3305  *  cxgb4_crypto_send - send crypto packet
3306  *  @dev: the net device
3307  *  @skb: the packet
3308  *
3309  *  Sends a crypto packet.  This is an exported version of @t4_crypto_send,
3310  *  intended for ULDs.
3311  */
3312 int cxgb4_crypto_send(struct net_device *dev, struct sk_buff *skb)
3313 {
3314     return t4_crypto_send(netdev2adap(dev), skb);
3315 }
3316 EXPORT_SYMBOL(cxgb4_crypto_send);
3317 
3318 static inline void copy_frags(struct sk_buff *skb,
3319                   const struct pkt_gl *gl, unsigned int offset)
3320 {
3321     int i;
3322 
3323     /* usually there's just one frag */
3324     __skb_fill_page_desc(skb, 0, gl->frags[0].page,
3325                  gl->frags[0].offset + offset,
3326                  gl->frags[0].size - offset);
3327     skb_shinfo(skb)->nr_frags = gl->nfrags;
3328     for (i = 1; i < gl->nfrags; i++)
3329         __skb_fill_page_desc(skb, i, gl->frags[i].page,
3330                      gl->frags[i].offset,
3331                      gl->frags[i].size);
3332 
3333     /* get a reference to the last page; we don't own it */
3334     get_page(gl->frags[gl->nfrags - 1].page);
3335 }
3336 
3337 /**
3338  *  cxgb4_pktgl_to_skb - build an sk_buff from a packet gather list
3339  *  @gl: the gather list
3340  *  @skb_len: size of sk_buff main body if it carries fragments
3341  *  @pull_len: amount of data to move to the sk_buff's main body
3342  *
3343  *  Builds an sk_buff from the given packet gather list.  Returns the
3344  *  sk_buff or %NULL if sk_buff allocation failed.
3345  */
3346 struct sk_buff *cxgb4_pktgl_to_skb(const struct pkt_gl *gl,
3347                    unsigned int skb_len, unsigned int pull_len)
3348 {
3349     struct sk_buff *skb;
3350 
3351     /*
3352      * Below we rely on RX_COPY_THRES being less than the smallest Rx buffer
3353      * size, which is expected since buffers are at least PAGE_SIZEd.
3354      * In this case packets up to RX_COPY_THRES have only one fragment.
3355      */
3356     if (gl->tot_len <= RX_COPY_THRES) {
3357         skb = dev_alloc_skb(gl->tot_len);
3358         if (unlikely(!skb))
3359             goto out;
3360         __skb_put(skb, gl->tot_len);
3361         skb_copy_to_linear_data(skb, gl->va, gl->tot_len);
3362     } else {
3363         skb = dev_alloc_skb(skb_len);
3364         if (unlikely(!skb))
3365             goto out;
3366         __skb_put(skb, pull_len);
3367         skb_copy_to_linear_data(skb, gl->va, pull_len);
3368 
3369         copy_frags(skb, gl, pull_len);
3370         skb->len = gl->tot_len;
3371         skb->data_len = skb->len - pull_len;
3372         skb->truesize += skb->data_len;
3373     }
3374 out:    return skb;
3375 }
3376 EXPORT_SYMBOL(cxgb4_pktgl_to_skb);
3377 
3378 /**
3379  *  t4_pktgl_free - free a packet gather list
3380  *  @gl: the gather list
3381  *
3382  *  Releases the pages of a packet gather list.  We do not own the last
3383  *  page on the list and do not free it.
3384  */
3385 static void t4_pktgl_free(const struct pkt_gl *gl)
3386 {
3387     int n;
3388     const struct page_frag *p;
3389 
3390     for (p = gl->frags, n = gl->nfrags - 1; n--; p++)
3391         put_page(p->page);
3392 }
3393 
3394 /*
3395  * Process an MPS trace packet.  Give it an unused protocol number so it won't
3396  * be delivered to anyone and send it to the stack for capture.
3397  */
3398 static noinline int handle_trace_pkt(struct adapter *adap,
3399                      const struct pkt_gl *gl)
3400 {
3401     struct sk_buff *skb;
3402 
3403     skb = cxgb4_pktgl_to_skb(gl, RX_PULL_LEN, RX_PULL_LEN);
3404     if (unlikely(!skb)) {
3405         t4_pktgl_free(gl);
3406         return 0;
3407     }
3408 
3409     if (is_t4(adap->params.chip))
3410         __skb_pull(skb, sizeof(struct cpl_trace_pkt));
3411     else
3412         __skb_pull(skb, sizeof(struct cpl_t5_trace_pkt));
3413 
3414     skb_reset_mac_header(skb);
3415     skb->protocol = htons(0xffff);
3416     skb->dev = adap->port[0];
3417     netif_receive_skb(skb);
3418     return 0;
3419 }
3420 
3421 /**
3422  * cxgb4_sgetim_to_hwtstamp - convert sge time stamp to hw time stamp
3423  * @adap: the adapter
3424  * @hwtstamps: time stamp structure to update
3425  * @sgetstamp: 60bit iqe timestamp
3426  *
3427  * Every ingress queue entry carries a 60-bit timestamp in Core Clock ticks;
3428  * convert it to ktime_t and store it in @hwtstamps
3429  **/
3430 static void cxgb4_sgetim_to_hwtstamp(struct adapter *adap,
3431                      struct skb_shared_hwtstamps *hwtstamps,
3432                      u64 sgetstamp)
3433 {
3434     u64 ns;
3435     u64 tmp = (sgetstamp * 1000 * 1000 + adap->params.vpd.cclk / 2);
3436 
3437     ns = div_u64(tmp, adap->params.vpd.cclk);
3438 
3439     memset(hwtstamps, 0, sizeof(*hwtstamps));
3440     hwtstamps->hwtstamp = ns_to_ktime(ns);
3441 }
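/* Worked example (illustrative numbers only): the 1000 * 1000 scaling above
 * treats adap->params.vpd.cclk as a kHz value, so with an assumed 250 MHz
 * Core Clock (cclk == 250000) we get
 * ns = (sgetstamp * 1000000 + 125000) / 250000, i.e. 4 ns per timestamp
 * tick; a tick count of 1000 becomes a 4000 ns ktime_t value.
 */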
3442 
3443 static void do_gro(struct sge_eth_rxq *rxq, const struct pkt_gl *gl,
3444            const struct cpl_rx_pkt *pkt, unsigned long tnl_hdr_len)
3445 {
3446     struct adapter *adapter = rxq->rspq.adap;
3447     struct sge *s = &adapter->sge;
3448     struct port_info *pi;
3449     int ret;
3450     struct sk_buff *skb;
3451 
3452     skb = napi_get_frags(&rxq->rspq.napi);
3453     if (unlikely(!skb)) {
3454         t4_pktgl_free(gl);
3455         rxq->stats.rx_drops++;
3456         return;
3457     }
3458 
3459     copy_frags(skb, gl, s->pktshift);
3460     if (tnl_hdr_len)
3461         skb->csum_level = 1;
3462     skb->len = gl->tot_len - s->pktshift;
3463     skb->data_len = skb->len;
3464     skb->truesize += skb->data_len;
3465     skb->ip_summed = CHECKSUM_UNNECESSARY;
3466     skb_record_rx_queue(skb, rxq->rspq.idx);
3467     pi = netdev_priv(skb->dev);
3468     if (pi->rxtstamp)
3469         cxgb4_sgetim_to_hwtstamp(adapter, skb_hwtstamps(skb),
3470                      gl->sgetstamp);
3471     if (rxq->rspq.netdev->features & NETIF_F_RXHASH)
3472         skb_set_hash(skb, (__force u32)pkt->rsshdr.hash_val,
3473                  PKT_HASH_TYPE_L3);
3474 
3475     if (unlikely(pkt->vlan_ex)) {
3476         __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(pkt->vlan));
3477         rxq->stats.vlan_ex++;
3478     }
3479     ret = napi_gro_frags(&rxq->rspq.napi);
3480     if (ret == GRO_HELD)
3481         rxq->stats.lro_pkts++;
3482     else if (ret == GRO_MERGED || ret == GRO_MERGED_FREE)
3483         rxq->stats.lro_merged++;
3484     rxq->stats.pkts++;
3485     rxq->stats.rx_cso++;
3486 }
3487 
3488 enum {
3489     RX_NON_PTP_PKT = 0,
3490     RX_PTP_PKT_SUC = 1,
3491     RX_PTP_PKT_ERR = 2
3492 };
3493 
3494 /**
3495  *     t4_systim_to_hwstamp - read hardware time stamp
3496  *     @adapter: the adapter
3497  *     @skb: the packet
3498  *
3499  *     Read the timestamp from the MPS packet and insert it into the skb,
3500  *     which is then forwarded to the PTP application
3501  */
3502 static noinline int t4_systim_to_hwstamp(struct adapter *adapter,
3503                      struct sk_buff *skb)
3504 {
3505     struct skb_shared_hwtstamps *hwtstamps;
3506     struct cpl_rx_mps_pkt *cpl = NULL;
3507     unsigned char *data;
3508     int offset;
3509 
3510     cpl = (struct cpl_rx_mps_pkt *)skb->data;
3511     if (!(CPL_RX_MPS_PKT_TYPE_G(ntohl(cpl->op_to_r1_hi)) &
3512          X_CPL_RX_MPS_PKT_TYPE_PTP))
3513         return RX_PTP_PKT_ERR;
3514 
3515     data = skb->data + sizeof(*cpl);
3516     skb_pull(skb, 2 * sizeof(u64) + sizeof(struct cpl_rx_mps_pkt));
3517     offset = ETH_HLEN + IPV4_HLEN(skb->data) + UDP_HLEN;
3518     if (skb->len < offset + OFF_PTP_SEQUENCE_ID + sizeof(short))
3519         return RX_PTP_PKT_ERR;
3520 
3521     hwtstamps = skb_hwtstamps(skb);
3522     memset(hwtstamps, 0, sizeof(*hwtstamps));
3523     hwtstamps->hwtstamp = ns_to_ktime(get_unaligned_be64(data));
3524 
3525     return RX_PTP_PKT_SUC;
3526 }
3527 
3528 /**
3529  *     t4_rx_hststamp - Recv PTP Event Message
3530  *     @adapter: the adapter
3531  *     @rsp: the response queue descriptor holding the RX_PKT message
3532  *     @rxq: the response queue holding the RX_PKT message
3533  *     @skb: the packet
3534  *
3535  *     If PTP is enabled and this is an MPS packet, read the HW timestamp
3536  */
3537 static int t4_rx_hststamp(struct adapter *adapter, const __be64 *rsp,
3538               struct sge_eth_rxq *rxq, struct sk_buff *skb)
3539 {
3540     int ret;
3541 
3542     if (unlikely((*(u8 *)rsp == CPL_RX_MPS_PKT) &&
3543              !is_t4(adapter->params.chip))) {
3544         ret = t4_systim_to_hwstamp(adapter, skb);
3545         if (ret == RX_PTP_PKT_ERR) {
3546             kfree_skb(skb);
3547             rxq->stats.rx_drops++;
3548         }
3549         return ret;
3550     }
3551     return RX_NON_PTP_PKT;
3552 }
3553 
3554 /**
3555  *      t4_tx_hststamp - Loopback PTP Transmit Event Message
3556  *      @adapter: the adapter
3557  *      @skb: the packet
3558  *      @dev: the ingress net device
3559  *
3560  *      Read hardware timestamp for the loopback PTP Tx event message
3561  */
3562 static int t4_tx_hststamp(struct adapter *adapter, struct sk_buff *skb,
3563               struct net_device *dev)
3564 {
3565     struct port_info *pi = netdev_priv(dev);
3566 
3567     if (!is_t4(adapter->params.chip) && adapter->ptp_tx_skb) {
3568         cxgb4_ptp_read_hwstamp(adapter, pi);
3569         kfree_skb(skb);
3570         return 0;
3571     }
3572     return 1;
3573 }
3574 
3575 /**
3576  *  t4_tx_completion_handler - handle CPL_SGE_EGR_UPDATE messages
3577  *  @rspq: Ethernet RX Response Queue associated with Ethernet TX Queue
3578  *  @rsp: Response Entry pointer into Response Queue
3579  *  @gl: Gather List pointer
3580  *
3581  *  For adapters which support the SGE Doorbell Queue Timer facility,
3582  *  we configure the Ethernet TX Queues to send CIDX Updates to the
3583  *  Associated Ethernet RX Response Queue with CPL_SGE_EGR_UPDATE
3584  *  messages.  This adds a small load to PCIe Link RX bandwidth and,
3585  *  potentially, higher CPU Interrupt load, but allows us to respond
3586  *  much more quickly to the CIDX Updates.  This is important for
3587  *  Upper Layer Software which isn't willing to have a large amount
3588  *  of TX Data outstanding before receiving DMA Completions.
3589  */
3590 static void t4_tx_completion_handler(struct sge_rspq *rspq,
3591                      const __be64 *rsp,
3592                      const struct pkt_gl *gl)
3593 {
3594     u8 opcode = ((const struct rss_header *)rsp)->opcode;
3595     struct port_info *pi = netdev_priv(rspq->netdev);
3596     struct adapter *adapter = rspq->adap;
3597     struct sge *s = &adapter->sge;
3598     struct sge_eth_txq *txq;
3599 
3600     /* skip RSS header */
3601     rsp++;
3602 
3603     /* FW can send EGR_UPDATEs encapsulated in a CPL_FW4_MSG.
3604      */
3605     if (unlikely(opcode == CPL_FW4_MSG &&
3606              ((const struct cpl_fw4_msg *)rsp)->type ==
3607                             FW_TYPE_RSSCPL)) {
3608         rsp++;
3609         opcode = ((const struct rss_header *)rsp)->opcode;
3610         rsp++;
3611     }
3612 
3613     if (unlikely(opcode != CPL_SGE_EGR_UPDATE)) {
3614         pr_info("%s: unexpected FW4/CPL %#x on Rx queue\n",
3615             __func__, opcode);
3616         return;
3617     }
3618 
3619     txq = &s->ethtxq[pi->first_qset + rspq->idx];
3620 
3621     /* We've got the Hardware Consumer Index Update in the Egress Update
3622      * message. These Egress Update messages will be our sole CIDX Updates
3623      * we get since we don't want to chew up PCIe bandwidth for both Ingress
3624      * Messages and Status Page writes.  However, the code which manages
3625      * reclaiming successfully DMA'ed TX Work Requests uses the CIDX value
3626      * stored in the Status Page at the end of the TX Queue.  It's easiest
3627      * to simply copy the CIDX Update value from the Egress Update message
3628      * to the Status Page.  Also note that no Endian issues need to be
3629      * considered here since both are Big Endian and we're just copying
3630      * bytes consistently ...
3631      */
3632     if (CHELSIO_CHIP_VERSION(adapter->params.chip) <= CHELSIO_T5) {
3633         struct cpl_sge_egr_update *egr;
3634 
3635         egr = (struct cpl_sge_egr_update *)rsp;
3636         WRITE_ONCE(txq->q.stat->cidx, egr->cidx);
3637     }
3638 
3639     t4_sge_eth_txq_egress_update(adapter, txq, -1);
3640 }
3641 
3642 static int cxgb4_validate_lb_pkt(struct port_info *pi, const struct pkt_gl *si)
3643 {
3644     struct adapter *adap = pi->adapter;
3645     struct cxgb4_ethtool_lb_test *lb;
3646     struct sge *s = &adap->sge;
3647     struct net_device *netdev;
3648     u8 *data;
3649     int i;
3650 
3651     netdev = adap->port[pi->port_id];
3652     lb = &pi->ethtool_lb;
3653     data = si->va + s->pktshift;
3654 
3655     i = ETH_ALEN;
3656     if (!ether_addr_equal(data + i, netdev->dev_addr))
3657         return -1;
3658 
3659     i += ETH_ALEN;
3660     if (strcmp(&data[i], CXGB4_SELFTEST_LB_STR))
3661         lb->result = -EIO;
3662 
3663     complete(&lb->completion);
3664     return 0;
3665 }
3666 
3667 /**
3668  *  t4_ethrx_handler - process an ingress ethernet packet
3669  *  @q: the response queue that received the packet
3670  *  @rsp: the response queue descriptor holding the RX_PKT message
3671  *  @si: the gather list of packet fragments
3672  *
3673  *  Process an ingress ethernet packet and deliver it to the stack.
3674  */
3675 int t4_ethrx_handler(struct sge_rspq *q, const __be64 *rsp,
3676              const struct pkt_gl *si)
3677 {
3678     bool csum_ok;
3679     struct sk_buff *skb;
3680     const struct cpl_rx_pkt *pkt;
3681     struct sge_eth_rxq *rxq = container_of(q, struct sge_eth_rxq, rspq);
3682     struct adapter *adapter = q->adap;
3683     struct sge *s = &q->adap->sge;
3684     int cpl_trace_pkt = is_t4(q->adap->params.chip) ?
3685                 CPL_TRACE_PKT : CPL_TRACE_PKT_T5;
3686     u16 err_vec, tnl_hdr_len = 0;
3687     struct port_info *pi;
3688     int ret = 0;
3689 
3690     pi = netdev_priv(q->netdev);
3691     /* If we're looking at TX Queue CIDX Update, handle that separately
3692      * and return.
3693      */
3694     if (unlikely((*(u8 *)rsp == CPL_FW4_MSG) ||
3695              (*(u8 *)rsp == CPL_SGE_EGR_UPDATE))) {
3696         t4_tx_completion_handler(q, rsp, si);
3697         return 0;
3698     }
3699 
3700     if (unlikely(*(u8 *)rsp == cpl_trace_pkt))
3701         return handle_trace_pkt(q->adap, si);
3702 
3703     pkt = (const struct cpl_rx_pkt *)rsp;
3704     /* Compressed error vector is enabled for T6 only */
3705     if (q->adap->params.tp.rx_pkt_encap) {
3706         err_vec = T6_COMPR_RXERR_VEC_G(be16_to_cpu(pkt->err_vec));
3707         tnl_hdr_len = T6_RX_TNLHDR_LEN_G(ntohs(pkt->err_vec));
3708     } else {
3709         err_vec = be16_to_cpu(pkt->err_vec);
3710     }
3711 
3712     csum_ok = pkt->csum_calc && !err_vec &&
3713           (q->netdev->features & NETIF_F_RXCSUM);
3714 
3715     if (err_vec)
3716         rxq->stats.bad_rx_pkts++;
3717 
3718     if (unlikely(pi->ethtool_lb.loopback && pkt->iff >= NCHAN)) {
3719         ret = cxgb4_validate_lb_pkt(pi, si);
3720         if (!ret)
3721             return 0;
3722     }
3723 
3724     if (((pkt->l2info & htonl(RXF_TCP_F)) ||
3725          tnl_hdr_len) &&
3726         (q->netdev->features & NETIF_F_GRO) && csum_ok && !pkt->ip_frag) {
3727         do_gro(rxq, si, pkt, tnl_hdr_len);
3728         return 0;
3729     }
3730 
3731     skb = cxgb4_pktgl_to_skb(si, RX_PKT_SKB_LEN, RX_PULL_LEN);
3732     if (unlikely(!skb)) {
3733         t4_pktgl_free(si);
3734         rxq->stats.rx_drops++;
3735         return 0;
3736     }
3737 
3738     /* Handle PTP Event Rx packet */
3739     if (unlikely(pi->ptp_enable)) {
3740         ret = t4_rx_hststamp(adapter, rsp, rxq, skb);
3741         if (ret == RX_PTP_PKT_ERR)
3742             return 0;
3743     }
3744     if (likely(!ret))
3745         __skb_pull(skb, s->pktshift); /* remove ethernet header pad */
3746 
3747     /* Handle the PTP Event Tx Loopback packet */
3748     if (unlikely(pi->ptp_enable && !ret &&
3749              (pkt->l2info & htonl(RXF_UDP_F)) &&
3750              cxgb4_ptp_is_ptp_rx(skb))) {
3751         if (!t4_tx_hststamp(adapter, skb, q->netdev))
3752             return 0;
3753     }
3754 
3755     skb->protocol = eth_type_trans(skb, q->netdev);
3756     skb_record_rx_queue(skb, q->idx);
3757     if (skb->dev->features & NETIF_F_RXHASH)
3758         skb_set_hash(skb, (__force u32)pkt->rsshdr.hash_val,
3759                  PKT_HASH_TYPE_L3);
3760 
3761     rxq->stats.pkts++;
3762 
3763     if (pi->rxtstamp)
3764         cxgb4_sgetim_to_hwtstamp(q->adap, skb_hwtstamps(skb),
3765                      si->sgetstamp);
3766     if (csum_ok && (pkt->l2info & htonl(RXF_UDP_F | RXF_TCP_F))) {
3767         if (!pkt->ip_frag) {
3768             skb->ip_summed = CHECKSUM_UNNECESSARY;
3769             rxq->stats.rx_cso++;
3770         } else if (pkt->l2info & htonl(RXF_IP_F)) {
3771             __sum16 c = (__force __sum16)pkt->csum;
3772             skb->csum = csum_unfold(c);
3773 
3774             if (tnl_hdr_len) {
3775                 skb->ip_summed = CHECKSUM_UNNECESSARY;
3776                 skb->csum_level = 1;
3777             } else {
3778                 skb->ip_summed = CHECKSUM_COMPLETE;
3779             }
3780             rxq->stats.rx_cso++;
3781         }
3782     } else {
3783         skb_checksum_none_assert(skb);
3784 #ifdef CONFIG_CHELSIO_T4_FCOE
3785 #define CPL_RX_PKT_FLAGS (RXF_PSH_F | RXF_SYN_F | RXF_UDP_F | \
3786               RXF_TCP_F | RXF_IP_F | RXF_IP6_F | RXF_LRO_F)
3787 
3788         if (!(pkt->l2info & cpu_to_be32(CPL_RX_PKT_FLAGS))) {
3789             if ((pkt->l2info & cpu_to_be32(RXF_FCOE_F)) &&
3790                 (pi->fcoe.flags & CXGB_FCOE_ENABLED)) {
3791                 if (q->adap->params.tp.rx_pkt_encap)
3792                     csum_ok = err_vec &
3793                           T6_COMPR_RXERR_SUM_F;
3794                 else
3795                     csum_ok = err_vec & RXERR_CSUM_F;
3796                 if (!csum_ok)
3797                     skb->ip_summed = CHECKSUM_UNNECESSARY;
3798             }
3799         }
3800 
3801 #undef CPL_RX_PKT_FLAGS
3802 #endif /* CONFIG_CHELSIO_T4_FCOE */
3803     }
3804 
3805     if (unlikely(pkt->vlan_ex)) {
3806         __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(pkt->vlan));
3807         rxq->stats.vlan_ex++;
3808     }
3809     skb_mark_napi_id(skb, &q->napi);
3810     netif_receive_skb(skb);
3811     return 0;
3812 }
3813 
3814 /**
3815  *  restore_rx_bufs - put back a packet's Rx buffers
3816  *  @si: the packet gather list
3817  *  @q: the SGE free list
3818  *  @frags: number of FL buffers to restore
3819  *
3820  *  Puts back on an FL the Rx buffers associated with @si.  The buffers
3821  *  have already been unmapped and are left unmapped; we mark them as such
3822  *  to prevent further unmapping attempts.
3823  *
3824  *  This function undoes a series of @unmap_rx_buf calls when we find out
3825  *  that the current packet can't be processed right away after all and we
3826  *  need to come back to it later.  This is a very rare event and there's
3827  *  no effort to make this particularly efficient.
3828  */
3829 static void restore_rx_bufs(const struct pkt_gl *si, struct sge_fl *q,
3830                 int frags)
3831 {
3832     struct rx_sw_desc *d;
3833 
3834     while (frags--) {
3835         if (q->cidx == 0)
3836             q->cidx = q->size - 1;
3837         else
3838             q->cidx--;
3839         d = &q->sdesc[q->cidx];
3840         d->page = si->frags[frags].page;
3841         d->dma_addr |= RX_UNMAPPED_BUF;
3842         q->avail++;
3843     }
3844 }
3845 
3846 /**
3847  *  is_new_response - check if a response is newly written
3848  *  @r: the response descriptor
3849  *  @q: the response queue
3850  *
3851  *  Returns true if a response descriptor contains a yet unprocessed
3852  *  response.
3853  */
3854 static inline bool is_new_response(const struct rsp_ctrl *r,
3855                    const struct sge_rspq *q)
3856 {
3857     return (r->type_gen >> RSPD_GEN_S) == q->gen;
3858 }
3859 
3860 /**
3861  *  rspq_next - advance to the next entry in a response queue
3862  *  @q: the queue
3863  *
3864  *  Updates the state of a response queue to advance it to the next entry.
3865  */
3866 static inline void rspq_next(struct sge_rspq *q)
3867 {
3868     q->cur_desc = (void *)q->cur_desc + q->iqe_len;
3869     if (unlikely(++q->cidx == q->size)) {
3870         q->cidx = 0;
3871         q->gen ^= 1;
3872         q->cur_desc = q->desc;
3873     }
3874 }
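/* Illustrative note on the generation scheme above: the queue is allocated
 * with gen == 1 (see t4_sge_alloc_rxq() below), hardware stamps each
 * response with the generation it was written under, and rspq_next() flips
 * q->gen every time cidx wraps back to 0.  Stale entries left over from the
 * previous lap therefore fail is_new_response() until hardware overwrites
 * them on its next pass around the ring.
 */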
3875 
3876 /**
3877  *  process_responses - process responses from an SGE response queue
3878  *  @q: the ingress queue to process
3879  *  @budget: how many responses can be processed in this round
3880  *
3881  *  Process responses from an SGE response queue up to the supplied budget.
3882  *  Responses include received packets as well as control messages from FW
3883  *  or HW.
3884  *
3885  *  Additionally choose the interrupt holdoff time for the next interrupt
3886  *  on this queue.  If the system is under memory shortage use a fairly
3887  *  long delay to help recovery.
3888  */
3889 static int process_responses(struct sge_rspq *q, int budget)
3890 {
3891     int ret, rsp_type;
3892     int budget_left = budget;
3893     const struct rsp_ctrl *rc;
3894     struct sge_eth_rxq *rxq = container_of(q, struct sge_eth_rxq, rspq);
3895     struct adapter *adapter = q->adap;
3896     struct sge *s = &adapter->sge;
3897 
3898     while (likely(budget_left)) {
3899         rc = (void *)q->cur_desc + (q->iqe_len - sizeof(*rc));
3900         if (!is_new_response(rc, q)) {
3901             if (q->flush_handler)
3902                 q->flush_handler(q);
3903             break;
3904         }
3905 
3906         dma_rmb();
3907         rsp_type = RSPD_TYPE_G(rc->type_gen);
3908         if (likely(rsp_type == RSPD_TYPE_FLBUF_X)) {
3909             struct page_frag *fp;
3910             struct pkt_gl si;
3911             const struct rx_sw_desc *rsd;
3912             u32 len = ntohl(rc->pldbuflen_qid), bufsz, frags;
3913 
3914             if (len & RSPD_NEWBUF_F) {
3915                 if (likely(q->offset > 0)) {
3916                     free_rx_bufs(q->adap, &rxq->fl, 1);
3917                     q->offset = 0;
3918                 }
3919                 len = RSPD_LEN_G(len);
3920             }
3921             si.tot_len = len;
3922 
3923             /* gather packet fragments */
3924             for (frags = 0, fp = si.frags; ; frags++, fp++) {
3925                 rsd = &rxq->fl.sdesc[rxq->fl.cidx];
3926                 bufsz = get_buf_size(adapter, rsd);
3927                 fp->page = rsd->page;
3928                 fp->offset = q->offset;
3929                 fp->size = min(bufsz, len);
3930                 len -= fp->size;
3931                 if (!len)
3932                     break;
3933                 unmap_rx_buf(q->adap, &rxq->fl);
3934             }
3935 
3936             si.sgetstamp = SGE_TIMESTAMP_G(
3937                     be64_to_cpu(rc->last_flit));
3938             /*
3939              * Last buffer remains mapped so explicitly make it
3940              * coherent for CPU access.
3941              */
3942             dma_sync_single_for_cpu(q->adap->pdev_dev,
3943                         get_buf_addr(rsd),
3944                         fp->size, DMA_FROM_DEVICE);
3945 
3946             si.va = page_address(si.frags[0].page) +
3947                 si.frags[0].offset;
3948             prefetch(si.va);
3949 
3950             si.nfrags = frags + 1;
3951             ret = q->handler(q, q->cur_desc, &si);
3952             if (likely(ret == 0))
3953                 q->offset += ALIGN(fp->size, s->fl_align);
3954             else
3955                 restore_rx_bufs(&si, &rxq->fl, frags);
3956         } else if (likely(rsp_type == RSPD_TYPE_CPL_X)) {
3957             ret = q->handler(q, q->cur_desc, NULL);
3958         } else {
3959             ret = q->handler(q, (const __be64 *)rc, CXGB4_MSG_AN);
3960         }
3961 
3962         if (unlikely(ret)) {
3963             /* couldn't process descriptor, back off for recovery */
3964             q->next_intr_params = QINTR_TIMER_IDX_V(NOMEM_TMR_IDX);
3965             break;
3966         }
3967 
3968         rspq_next(q);
3969         budget_left--;
3970     }
3971 
3972     if (q->offset >= 0 && fl_cap(&rxq->fl) - rxq->fl.avail >= 16)
3973         __refill_fl(q->adap, &rxq->fl);
3974     return budget - budget_left;
3975 }
3976 
3977 /**
3978  *  napi_rx_handler - the NAPI handler for Rx processing
3979  *  @napi: the napi instance
3980  *  @budget: how many packets we can process in this round
3981  *
3982  *  Handler for new data events when using NAPI.  This does not need any
3983  *  locking or protection from interrupts as data interrupts are off at
3984  *  this point and other adapter interrupts do not interfere (the latter
3985  *  is not a concern at all with MSI-X as non-data interrupts then have
3986  *  a separate handler).
3987  */
3988 static int napi_rx_handler(struct napi_struct *napi, int budget)
3989 {
3990     unsigned int params;
3991     struct sge_rspq *q = container_of(napi, struct sge_rspq, napi);
3992     int work_done;
3993     u32 val;
3994 
3995     work_done = process_responses(q, budget);
3996     if (likely(work_done < budget)) {
3997         int timer_index;
3998 
3999         napi_complete_done(napi, work_done);
4000         timer_index = QINTR_TIMER_IDX_G(q->next_intr_params);
4001 
4002         if (q->adaptive_rx) {
4003             if (work_done > max(timer_pkt_quota[timer_index],
4004                         MIN_NAPI_WORK))
4005                 timer_index = (timer_index + 1);
4006             else
4007                 timer_index = timer_index - 1;
4008 
4009             timer_index = clamp(timer_index, 0, SGE_TIMERREGS - 1);
4010             q->next_intr_params =
4011                     QINTR_TIMER_IDX_V(timer_index) |
4012                     QINTR_CNT_EN_V(0);
4013             params = q->next_intr_params;
4014         } else {
4015             params = q->next_intr_params;
4016             q->next_intr_params = q->intr_params;
4017         }
4018     } else
4019         params = QINTR_TIMER_IDX_V(7);
4020 
4021     val = CIDXINC_V(work_done) | SEINTARM_V(params);
4022 
4023     /* If we don't have access to the new User GTS (T5+), use the old
4024      * doorbell mechanism; otherwise use the new BAR2 mechanism.
4025      */
4026     if (unlikely(q->bar2_addr == NULL)) {
4027         t4_write_reg(q->adap, MYPF_REG(SGE_PF_GTS_A),
4028                  val | INGRESSQID_V((u32)q->cntxt_id));
4029     } else {
4030         writel(val | INGRESSQID_V(q->bar2_qid),
4031                q->bar2_addr + SGE_UDB_GTS);
4032         wmb();
4033     }
4034     return work_done;
4035 }
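/* Illustrative trace of the adaptive holdoff logic above (assumed numbers):
 * with adaptive_rx enabled and timer_index currently 2, a poll that
 * completes more packets than max(timer_pkt_quota[2], MIN_NAPI_WORK) bumps
 * the index to 3 (typically a longer holdoff), a lighter poll drops it to 1,
 * and the result is clamped to 0..SGE_TIMERREGS-1 before being folded into
 * next_intr_params for the GTS write.
 */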
4036 
4037 void cxgb4_ethofld_restart(struct tasklet_struct *t)
4038 {
4039     struct sge_eosw_txq *eosw_txq = from_tasklet(eosw_txq, t,
4040                              qresume_tsk);
4041     int pktcount;
4042 
4043     spin_lock(&eosw_txq->lock);
4044     pktcount = eosw_txq->cidx - eosw_txq->last_cidx;
4045     if (pktcount < 0)
4046         pktcount += eosw_txq->ndesc;
4047 
4048     if (pktcount) {
4049         cxgb4_eosw_txq_free_desc(netdev2adap(eosw_txq->netdev),
4050                      eosw_txq, pktcount);
4051         eosw_txq->inuse -= pktcount;
4052     }
4053 
4054     /* There may be some packets waiting for completions. So,
4055      * attempt to send these packets now.
4056      */
4057     ethofld_xmit(eosw_txq->netdev, eosw_txq);
4058     spin_unlock(&eosw_txq->lock);
4059 }
4060 
4061 /* cxgb4_ethofld_rx_handler - Process ETHOFLD Tx completions
4062  * @q: the response queue that received the packet
4063  * @rsp: the response queue descriptor holding the CPL message
4064  * @si: the gather list of packet fragments
4065  *
4066  * Process an ETHOFLD Tx completion. Increment the cidx here, but
4067  * free up the descriptors in a tasklet later.
4068  */
4069 int cxgb4_ethofld_rx_handler(struct sge_rspq *q, const __be64 *rsp,
4070                  const struct pkt_gl *si)
4071 {
4072     u8 opcode = ((const struct rss_header *)rsp)->opcode;
4073 
4074     /* skip RSS header */
4075     rsp++;
4076 
4077     if (opcode == CPL_FW4_ACK) {
4078         const struct cpl_fw4_ack *cpl;
4079         struct sge_eosw_txq *eosw_txq;
4080         struct eotid_entry *entry;
4081         struct sk_buff *skb;
4082         u32 hdr_len, eotid;
4083         u8 flits, wrlen16;
4084         int credits;
4085 
4086         cpl = (const struct cpl_fw4_ack *)rsp;
4087         eotid = CPL_FW4_ACK_FLOWID_G(ntohl(OPCODE_TID(cpl))) -
4088             q->adap->tids.eotid_base;
4089         entry = cxgb4_lookup_eotid(&q->adap->tids, eotid);
4090         if (!entry)
4091             goto out_done;
4092 
4093         eosw_txq = (struct sge_eosw_txq *)entry->data;
4094         if (!eosw_txq)
4095             goto out_done;
4096 
4097         spin_lock(&eosw_txq->lock);
4098         credits = cpl->credits;
4099         while (credits > 0) {
4100             skb = eosw_txq->desc[eosw_txq->cidx].skb;
4101             if (!skb)
4102                 break;
4103 
4104             if (unlikely((eosw_txq->state ==
4105                       CXGB4_EO_STATE_FLOWC_OPEN_REPLY ||
4106                       eosw_txq->state ==
4107                       CXGB4_EO_STATE_FLOWC_CLOSE_REPLY) &&
4108                      eosw_txq->cidx == eosw_txq->flowc_idx)) {
4109                 flits = DIV_ROUND_UP(skb->len, 8);
4110                 if (eosw_txq->state ==
4111                     CXGB4_EO_STATE_FLOWC_OPEN_REPLY)
4112                     eosw_txq->state = CXGB4_EO_STATE_ACTIVE;
4113                 else
4114                     eosw_txq->state = CXGB4_EO_STATE_CLOSED;
4115                 complete(&eosw_txq->completion);
4116             } else {
4117                 hdr_len = eth_get_headlen(eosw_txq->netdev,
4118                               skb->data,
4119                               skb_headlen(skb));
4120                 flits = ethofld_calc_tx_flits(q->adap, skb,
4121                                   hdr_len);
4122             }
4123             eosw_txq_advance_index(&eosw_txq->cidx, 1,
4124                            eosw_txq->ndesc);
4125             wrlen16 = DIV_ROUND_UP(flits * 8, 16);
4126             credits -= wrlen16;
4127         }
4128 
4129         eosw_txq->cred += cpl->credits;
4130         eosw_txq->ncompl--;
4131 
4132         spin_unlock(&eosw_txq->lock);
4133 
4134         /* Schedule a tasklet to reclaim SKBs and restart ETHOFLD Tx,
4135          * if there were packets waiting for completion.
4136          */
4137         tasklet_schedule(&eosw_txq->qresume_tsk);
4138     }
4139 
4140 out_done:
4141     return 0;
4142 }
4143 
4144 /*
4145  * The MSI-X interrupt handler for an SGE response queue.
4146  */
4147 irqreturn_t t4_sge_intr_msix(int irq, void *cookie)
4148 {
4149     struct sge_rspq *q = cookie;
4150 
4151     napi_schedule(&q->napi);
4152     return IRQ_HANDLED;
4153 }
4154 
4155 /*
4156  * Process the indirect interrupt entries in the interrupt queue and kick off
4157  * NAPI for each queue that has generated an entry.
4158  */
4159 static unsigned int process_intrq(struct adapter *adap)
4160 {
4161     unsigned int credits;
4162     const struct rsp_ctrl *rc;
4163     struct sge_rspq *q = &adap->sge.intrq;
4164     u32 val;
4165 
4166     spin_lock(&adap->sge.intrq_lock);
4167     for (credits = 0; ; credits++) {
4168         rc = (void *)q->cur_desc + (q->iqe_len - sizeof(*rc));
4169         if (!is_new_response(rc, q))
4170             break;
4171 
4172         dma_rmb();
4173         if (RSPD_TYPE_G(rc->type_gen) == RSPD_TYPE_INTR_X) {
4174             unsigned int qid = ntohl(rc->pldbuflen_qid);
4175 
4176             qid -= adap->sge.ingr_start;
4177             napi_schedule(&adap->sge.ingr_map[qid]->napi);
4178         }
4179 
4180         rspq_next(q);
4181     }
4182 
4183     val =  CIDXINC_V(credits) | SEINTARM_V(q->intr_params);
4184 
4185     /* If we don't have access to the new User GTS (T5+), use the old
4186      * doorbell mechanism; otherwise use the new BAR2 mechanism.
4187      */
4188     if (unlikely(q->bar2_addr == NULL)) {
4189         t4_write_reg(adap, MYPF_REG(SGE_PF_GTS_A),
4190                  val | INGRESSQID_V(q->cntxt_id));
4191     } else {
4192         writel(val | INGRESSQID_V(q->bar2_qid),
4193                q->bar2_addr + SGE_UDB_GTS);
4194         wmb();
4195     }
4196     spin_unlock(&adap->sge.intrq_lock);
4197     return credits;
4198 }
4199 
4200 /*
4201  * The MSI interrupt handler, which handles data events from SGE response queues
4202  * as well as error and other async events, since they all use the same MSI vector.
4203  */
4204 static irqreturn_t t4_intr_msi(int irq, void *cookie)
4205 {
4206     struct adapter *adap = cookie;
4207 
4208     if (adap->flags & CXGB4_MASTER_PF)
4209         t4_slow_intr_handler(adap);
4210     process_intrq(adap);
4211     return IRQ_HANDLED;
4212 }
4213 
4214 /*
4215  * Interrupt handler for legacy INTx interrupts.
4216  * Handles data events from SGE response queues as well as error and other
4217  * async events, since they all use the same interrupt line.
4218  */
4219 static irqreturn_t t4_intr_intx(int irq, void *cookie)
4220 {
4221     struct adapter *adap = cookie;
4222 
4223     t4_write_reg(adap, MYPF_REG(PCIE_PF_CLI_A), 0);
4224     if (((adap->flags & CXGB4_MASTER_PF) && t4_slow_intr_handler(adap)) |
4225         process_intrq(adap))
4226         return IRQ_HANDLED;
4227     return IRQ_NONE;             /* probably shared interrupt */
4228 }
4229 
4230 /**
4231  *  t4_intr_handler - select the top-level interrupt handler
4232  *  @adap: the adapter
4233  *
4234  *  Selects the top-level interrupt handler based on the type of interrupts
4235  *  (MSI-X, MSI, or INTx).
4236  */
4237 irq_handler_t t4_intr_handler(struct adapter *adap)
4238 {
4239     if (adap->flags & CXGB4_USING_MSIX)
4240         return t4_sge_intr_msix;
4241     if (adap->flags & CXGB4_USING_MSI)
4242         return t4_intr_msi;
4243     return t4_intr_intx;
4244 }
4245 
4246 static void sge_rx_timer_cb(struct timer_list *t)
4247 {
4248     unsigned long m;
4249     unsigned int i;
4250     struct adapter *adap = from_timer(adap, t, sge.rx_timer);
4251     struct sge *s = &adap->sge;
4252 
4253     for (i = 0; i < BITS_TO_LONGS(s->egr_sz); i++)
4254         for (m = s->starving_fl[i]; m; m &= m - 1) {
4255             struct sge_eth_rxq *rxq;
4256             unsigned int id = __ffs(m) + i * BITS_PER_LONG;
4257             struct sge_fl *fl = s->egr_map[id];
4258 
4259             clear_bit(id, s->starving_fl);
4260             smp_mb__after_atomic();
4261 
4262             if (fl_starving(adap, fl)) {
4263                 rxq = container_of(fl, struct sge_eth_rxq, fl);
4264                 if (napi_reschedule(&rxq->rspq.napi))
4265                     fl->starving++;
4266                 else
4267                     set_bit(id, s->starving_fl);
4268             }
4269         }
4270     /* The remainder of the SGE RX Timer Callback routine is dedicated to
4271      * global Master PF activities like checking for chip ingress stalls,
4272      * etc.
4273      */
4274     if (!(adap->flags & CXGB4_MASTER_PF))
4275         goto done;
4276 
4277     t4_idma_monitor(adap, &s->idma_monitor, HZ, RX_QCHECK_PERIOD);
4278 
4279 done:
4280     mod_timer(&s->rx_timer, jiffies + RX_QCHECK_PERIOD);
4281 }
4282 
4283 static void sge_tx_timer_cb(struct timer_list *t)
4284 {
4285     struct adapter *adap = from_timer(adap, t, sge.tx_timer);
4286     struct sge *s = &adap->sge;
4287     unsigned long m, period;
4288     unsigned int i, budget;
4289 
4290     for (i = 0; i < BITS_TO_LONGS(s->egr_sz); i++)
4291         for (m = s->txq_maperr[i]; m; m &= m - 1) {
4292             unsigned long id = __ffs(m) + i * BITS_PER_LONG;
4293             struct sge_uld_txq *txq = s->egr_map[id];
4294 
4295             clear_bit(id, s->txq_maperr);
4296             tasklet_schedule(&txq->qresume_tsk);
4297         }
4298 
4299     if (!is_t4(adap->params.chip)) {
4300         struct sge_eth_txq *q = &s->ptptxq;
4301         int avail;
4302 
4303         spin_lock(&adap->ptp_lock);
4304         avail = reclaimable(&q->q);
4305 
4306         if (avail) {
4307             free_tx_desc(adap, &q->q, avail, false);
4308             q->q.in_use -= avail;
4309         }
4310         spin_unlock(&adap->ptp_lock);
4311     }
4312 
4313     budget = MAX_TIMER_TX_RECLAIM;
4314     i = s->ethtxq_rover;
4315     do {
4316         budget -= t4_sge_eth_txq_egress_update(adap, &s->ethtxq[i],
4317                                budget);
4318         if (!budget)
4319             break;
4320 
4321         if (++i >= s->ethqsets)
4322             i = 0;
4323     } while (i != s->ethtxq_rover);
4324     s->ethtxq_rover = i;
4325 
4326     if (budget == 0) {
4327         /* If we found too many reclaimable packets, schedule a timer
4328          * in the near future to continue where we left off.
4329          */
4330         period = 2;
4331     } else {
4332         /* We reclaimed all reclaimable TX Descriptors, so reschedule
4333          * at the normal period.
4334          */
4335         period = TX_QCHECK_PERIOD;
4336     }
4337 
4338     mod_timer(&s->tx_timer, jiffies + period);
4339 }
4340 
4341 /**
4342  *  bar2_address - return the BAR2 address for an SGE Queue's Registers
4343  *  @adapter: the adapter
4344  *  @qid: the SGE Queue ID
4345  *  @qtype: the SGE Queue Type (Egress or Ingress)
4346  *  @pbar2_qid: BAR2 Queue ID or 0 for Queue ID inferred SGE Queues
4347  *
4348  *  Returns the BAR2 address for the SGE Queue Registers associated with
4349  *  @qid.  If BAR2 SGE Registers aren't available, returns NULL.  Also
4350  *  returns the BAR2 Queue ID to be used with writes to the BAR2 SGE
4351  *  Queue Registers.  If the BAR2 Queue ID is 0, then "Inferred Queue ID"
4352  *  Registers are supported (e.g. the Write Combining Doorbell Buffer).
4353  */
4354 static void __iomem *bar2_address(struct adapter *adapter,
4355                   unsigned int qid,
4356                   enum t4_bar2_qtype qtype,
4357                   unsigned int *pbar2_qid)
4358 {
4359     u64 bar2_qoffset;
4360     int ret;
4361 
4362     ret = t4_bar2_sge_qregs(adapter, qid, qtype, 0,
4363                 &bar2_qoffset, pbar2_qid);
4364     if (ret)
4365         return NULL;
4366 
4367     return adapter->bar2 + bar2_qoffset;
4368 }
4369 
4370 /* @intr_idx: MSI/MSI-X vector if >=0, -(absolute qid + 1) if < 0
4371  * @cong: < 0 -> no congestion feedback, >= 0 -> congestion channel map
4372  */
4373 int t4_sge_alloc_rxq(struct adapter *adap, struct sge_rspq *iq, bool fwevtq,
4374              struct net_device *dev, int intr_idx,
4375              struct sge_fl *fl, rspq_handler_t hnd,
4376              rspq_flush_handler_t flush_hnd, int cong)
4377 {
4378     int ret, flsz = 0;
4379     struct fw_iq_cmd c;
4380     struct sge *s = &adap->sge;
4381     struct port_info *pi = netdev_priv(dev);
4382     int relaxed = !(adap->flags & CXGB4_ROOT_NO_RELAXED_ORDERING);
4383 
4384     /* Size needs to be a multiple of 16, including the status entry. */
4385     iq->size = roundup(iq->size, 16);
4386 
4387     iq->desc = alloc_ring(adap->pdev_dev, iq->size, iq->iqe_len, 0,
4388                   &iq->phys_addr, NULL, 0,
4389                   dev_to_node(adap->pdev_dev));
4390     if (!iq->desc)
4391         return -ENOMEM;
4392 
4393     memset(&c, 0, sizeof(c));
4394     c.op_to_vfn = htonl(FW_CMD_OP_V(FW_IQ_CMD) | FW_CMD_REQUEST_F |
4395                 FW_CMD_WRITE_F | FW_CMD_EXEC_F |
4396                 FW_IQ_CMD_PFN_V(adap->pf) | FW_IQ_CMD_VFN_V(0));
4397     c.alloc_to_len16 = htonl(FW_IQ_CMD_ALLOC_F | FW_IQ_CMD_IQSTART_F |
4398                  FW_LEN16(c));
4399     c.type_to_iqandstindex = htonl(FW_IQ_CMD_TYPE_V(FW_IQ_TYPE_FL_INT_CAP) |
4400         FW_IQ_CMD_IQASYNCH_V(fwevtq) | FW_IQ_CMD_VIID_V(pi->viid) |
4401         FW_IQ_CMD_IQANDST_V(intr_idx < 0) |
4402         FW_IQ_CMD_IQANUD_V(UPDATEDELIVERY_INTERRUPT_X) |
4403         FW_IQ_CMD_IQANDSTINDEX_V(intr_idx >= 0 ? intr_idx :
4404                             -intr_idx - 1));
4405     c.iqdroprss_to_iqesize = htons(FW_IQ_CMD_IQPCIECH_V(pi->tx_chan) |
4406         FW_IQ_CMD_IQGTSMODE_F |
4407         FW_IQ_CMD_IQINTCNTTHRESH_V(iq->pktcnt_idx) |
4408         FW_IQ_CMD_IQESIZE_V(ilog2(iq->iqe_len) - 4));
4409     c.iqsize = htons(iq->size);
4410     c.iqaddr = cpu_to_be64(iq->phys_addr);
4411     if (cong >= 0)
4412         c.iqns_to_fl0congen = htonl(FW_IQ_CMD_IQFLINTCONGEN_F |
4413                 FW_IQ_CMD_IQTYPE_V(cong ? FW_IQ_IQTYPE_NIC
4414                             :  FW_IQ_IQTYPE_OFLD));
4415 
4416     if (fl) {
4417         unsigned int chip_ver =
4418             CHELSIO_CHIP_VERSION(adap->params.chip);
4419 
4420         /* Allocate the ring for the hardware free list (with space
4421          * for its status page) along with the associated software
4422          * descriptor ring.  The free list size needs to be a multiple
4423          * of the Egress Queue Unit and at least 2 Egress Units larger
4424          * than the SGE's Egress Congestion Threshold
4425          * (fl_starve_thres - 1).
4426          */
4427         if (fl->size < s->fl_starve_thres - 1 + 2 * 8)
4428             fl->size = s->fl_starve_thres - 1 + 2 * 8;
4429         fl->size = roundup(fl->size, 8);
4430         fl->desc = alloc_ring(adap->pdev_dev, fl->size, sizeof(__be64),
4431                       sizeof(struct rx_sw_desc), &fl->addr,
4432                       &fl->sdesc, s->stat_len,
4433                       dev_to_node(adap->pdev_dev));
4434         if (!fl->desc)
4435             goto fl_nomem;
4436 
4437         flsz = fl->size / 8 + s->stat_len / sizeof(struct tx_desc);
4438         c.iqns_to_fl0congen |= htonl(FW_IQ_CMD_FL0PACKEN_F |
4439                          FW_IQ_CMD_FL0FETCHRO_V(relaxed) |
4440                          FW_IQ_CMD_FL0DATARO_V(relaxed) |
4441                          FW_IQ_CMD_FL0PADEN_F);
4442         if (cong >= 0)
4443             c.iqns_to_fl0congen |=
4444                 htonl(FW_IQ_CMD_FL0CNGCHMAP_V(cong) |
4445                       FW_IQ_CMD_FL0CONGCIF_F |
4446                       FW_IQ_CMD_FL0CONGEN_F);
4447         /* In T6, for egress queue type FL there is internal overhead
4448          * of 16B for header going into FLM module.  Hence the maximum
4449          * allowed burst size is 448 bytes.  For T4/T5, the hardware
4450          * doesn't coalesce fetch requests if more than 64 bytes of
4451          * Free List pointers are provided, so we use a 128-byte Fetch
4452          * Burst Minimum there (T6 implements coalescing so we can use
4453          * the smaller 64-byte value there).
4454          */
4455         c.fl0dcaen_to_fl0cidxfthresh =
4456             htons(FW_IQ_CMD_FL0FBMIN_V(chip_ver <= CHELSIO_T5 ?
4457                            FETCHBURSTMIN_128B_X :
4458                            FETCHBURSTMIN_64B_T6_X) |
4459                   FW_IQ_CMD_FL0FBMAX_V((chip_ver <= CHELSIO_T5) ?
4460                            FETCHBURSTMAX_512B_X :
4461                            FETCHBURSTMAX_256B_X));
4462         c.fl0size = htons(flsz);
4463         c.fl0addr = cpu_to_be64(fl->addr);
4464     }
4465 
4466     ret = t4_wr_mbox(adap, adap->mbox, &c, sizeof(c), &c);
4467     if (ret)
4468         goto err;
4469 
4470     netif_napi_add(dev, &iq->napi, napi_rx_handler, 64);
4471     iq->cur_desc = iq->desc;
4472     iq->cidx = 0;
4473     iq->gen = 1;
4474     iq->next_intr_params = iq->intr_params;
4475     iq->cntxt_id = ntohs(c.iqid);
4476     iq->abs_id = ntohs(c.physiqid);
4477     iq->bar2_addr = bar2_address(adap,
4478                      iq->cntxt_id,
4479                      T4_BAR2_QTYPE_INGRESS,
4480                      &iq->bar2_qid);
4481     iq->size--;                           /* subtract status entry */
4482     iq->netdev = dev;
4483     iq->handler = hnd;
4484     iq->flush_handler = flush_hnd;
4485 
4486     memset(&iq->lro_mgr, 0, sizeof(struct t4_lro_mgr));
4487     skb_queue_head_init(&iq->lro_mgr.lroq);
4488 
4489     /* set offset to -1 to distinguish ingress queues without FL */
4490     iq->offset = fl ? 0 : -1;
4491 
4492     adap->sge.ingr_map[iq->cntxt_id - adap->sge.ingr_start] = iq;
4493 
4494     if (fl) {
4495         fl->cntxt_id = ntohs(c.fl0id);
4496         fl->avail = fl->pend_cred = 0;
4497         fl->pidx = fl->cidx = 0;
4498         fl->alloc_failed = fl->large_alloc_failed = fl->starving = 0;
4499         adap->sge.egr_map[fl->cntxt_id - adap->sge.egr_start] = fl;
4500 
4501         /* Note, we must initialize the BAR2 Free List User Doorbell
4502          * information before refilling the Free List!
4503          */
4504         fl->bar2_addr = bar2_address(adap,
4505                          fl->cntxt_id,
4506                          T4_BAR2_QTYPE_EGRESS,
4507                          &fl->bar2_qid);
4508         refill_fl(adap, fl, fl_cap(fl), GFP_KERNEL);
4509     }
4510 
4511     /* For T5 and later we attempt to set up the Congestion Manager values
4512      * of the new RX Ethernet Queue.  This should really be handled by
4513      * firmware because it's more complex than any host driver wants to
4514      * get involved with and it's different per chip and this is almost
4515      * certainly wrong.  Firmware would be wrong as well, but it would be
4516      * a lot easier to fix in one place ...  For now we do something very
4517      * simple (and hopefully less wrong).
4518      */
4519     if (!is_t4(adap->params.chip) && cong >= 0) {
4520         u32 param, val, ch_map = 0;
4521         int i;
4522         u16 cng_ch_bits_log = adap->params.arch.cng_ch_bits_log;
4523 
4524         param = (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DMAQ) |
4525              FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_DMAQ_CONM_CTXT) |
4526              FW_PARAMS_PARAM_YZ_V(iq->cntxt_id));
4527         if (cong == 0) {
4528             val = CONMCTXT_CNGTPMODE_V(CONMCTXT_CNGTPMODE_QUEUE_X);
4529         } else {
4530             val =
4531                 CONMCTXT_CNGTPMODE_V(CONMCTXT_CNGTPMODE_CHANNEL_X);
4532             for (i = 0; i < 4; i++) {
4533                 if (cong & (1 << i))
4534                     ch_map |= 1 << (i << cng_ch_bits_log);
4535             }
4536             val |= CONMCTXT_CNGCHMAP_V(ch_map);
4537         }
4538         ret = t4_set_params(adap, adap->mbox, adap->pf, 0, 1,
4539                     &param, &val);
4540         if (ret)
4541             dev_warn(adap->pdev_dev, "Failed to set Congestion"
4542                  " Manager Context for Ingress Queue %d: %d\n",
4543                  iq->cntxt_id, -ret);
4544     }
4545 
4546     return 0;
4547 
4548 fl_nomem:
4549     ret = -ENOMEM;
4550 err:
4551     if (iq->desc) {
4552         dma_free_coherent(adap->pdev_dev, iq->size * iq->iqe_len,
4553                   iq->desc, iq->phys_addr);
4554         iq->desc = NULL;
4555     }
4556     if (fl && fl->desc) {
4557         kfree(fl->sdesc);
4558         fl->sdesc = NULL;
4559         dma_free_coherent(adap->pdev_dev, flsz * sizeof(struct tx_desc),
4560                   fl->desc, fl->addr);
4561         fl->desc = NULL;
4562     }
4563     return ret;
4564 }
4565 
4566 static void init_txq(struct adapter *adap, struct sge_txq *q, unsigned int id)
4567 {
4568     q->cntxt_id = id;
4569     q->bar2_addr = bar2_address(adap,
4570                     q->cntxt_id,
4571                     T4_BAR2_QTYPE_EGRESS,
4572                     &q->bar2_qid);
4573     q->in_use = 0;
4574     q->cidx = q->pidx = 0;
4575     q->stops = q->restarts = 0;
4576     q->stat = (void *)&q->desc[q->size];
4577     spin_lock_init(&q->db_lock);
4578     adap->sge.egr_map[id - adap->sge.egr_start] = q;
4579 }
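/* init_txq() caches the queue's BAR2 User Doorbell address up front so the
 * hot transmit path can notify hardware with a single register write.  A
 * rough sketch of the T5+ pattern used by ring_tx_db() earlier in this file
 * (shown here only for illustration):
 *
 *	wmb();                                  \* commit descriptors first *\
 *	writel(PIDX_T5_V(n) | QID_V(q->bar2_qid),
 *	       q->bar2_addr + SGE_UDB_KDOORBELL);
 *
 * When bar2_addr is NULL (older T4 parts), the driver falls back to the
 * legacy PF doorbell register instead.
 */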
4580 
4581 /**
4582  *  t4_sge_alloc_eth_txq - allocate an Ethernet TX Queue
4583  *  @adap: the adapter
4584  *  @txq: the SGE Ethernet TX Queue to initialize
4585  *  @dev: the Linux Network Device
4586  *  @netdevq: the corresponding Linux TX Queue
4587  *  @iqid: the Ingress Queue to which to deliver CIDX Update messages
4588  *  @dbqt: whether this TX Queue will use the SGE Doorbell Queue Timers
4589  */
4590 int t4_sge_alloc_eth_txq(struct adapter *adap, struct sge_eth_txq *txq,
4591              struct net_device *dev, struct netdev_queue *netdevq,
4592              unsigned int iqid, u8 dbqt)
4593 {
4594     unsigned int chip_ver = CHELSIO_CHIP_VERSION(adap->params.chip);
4595     struct port_info *pi = netdev_priv(dev);
4596     struct sge *s = &adap->sge;
4597     struct fw_eq_eth_cmd c;
4598     int ret, nentries;
4599 
4600     /* Add status entries */
4601     nentries = txq->q.size + s->stat_len / sizeof(struct tx_desc);
4602 
4603     txq->q.desc = alloc_ring(adap->pdev_dev, txq->q.size,
4604             sizeof(struct tx_desc), sizeof(struct tx_sw_desc),
4605             &txq->q.phys_addr, &txq->q.sdesc, s->stat_len,
4606             netdev_queue_numa_node_read(netdevq));
4607     if (!txq->q.desc)
4608         return -ENOMEM;
4609 
4610     memset(&c, 0, sizeof(c));
4611     c.op_to_vfn = htonl(FW_CMD_OP_V(FW_EQ_ETH_CMD) | FW_CMD_REQUEST_F |
4612                 FW_CMD_WRITE_F | FW_CMD_EXEC_F |
4613                 FW_EQ_ETH_CMD_PFN_V(adap->pf) |
4614                 FW_EQ_ETH_CMD_VFN_V(0));
4615     c.alloc_to_len16 = htonl(FW_EQ_ETH_CMD_ALLOC_F |
4616                  FW_EQ_ETH_CMD_EQSTART_F | FW_LEN16(c));
4617 
4618     /* For TX Ethernet Queues using the SGE Doorbell Queue Timer
4619      * mechanism, we use Ingress Queue messages for Hardware Consumer
4620      * Index Updates on the TX Queue.  Otherwise we have the Hardware
4621      * write the CIDX Updates into the Status Page at the end of the
4622      * TX Queue.
4623      */
4624     c.autoequiqe_to_viid = htonl(((chip_ver <= CHELSIO_T5) ?
4625                       FW_EQ_ETH_CMD_AUTOEQUIQE_F :
4626                       FW_EQ_ETH_CMD_AUTOEQUEQE_F) |
4627                      FW_EQ_ETH_CMD_VIID_V(pi->viid));
4628 
4629     c.fetchszm_to_iqid =
4630         htonl(FW_EQ_ETH_CMD_HOSTFCMODE_V((chip_ver <= CHELSIO_T5) ?
4631                          HOSTFCMODE_INGRESS_QUEUE_X :
4632                          HOSTFCMODE_STATUS_PAGE_X) |
4633               FW_EQ_ETH_CMD_PCIECHN_V(pi->tx_chan) |
4634               FW_EQ_ETH_CMD_FETCHRO_F | FW_EQ_ETH_CMD_IQID_V(iqid));
4635 
4636     /* Note that the CIDX Flush Threshold should match MAX_TX_RECLAIM. */
4637     c.dcaen_to_eqsize =
4638         htonl(FW_EQ_ETH_CMD_FBMIN_V(chip_ver <= CHELSIO_T5
4639                         ? FETCHBURSTMIN_64B_X
4640                         : FETCHBURSTMIN_64B_T6_X) |
4641               FW_EQ_ETH_CMD_FBMAX_V(FETCHBURSTMAX_512B_X) |
4642               FW_EQ_ETH_CMD_CIDXFTHRESH_V(CIDXFLUSHTHRESH_32_X) |
4643               FW_EQ_ETH_CMD_CIDXFTHRESHO_V(chip_ver == CHELSIO_T5) |
4644               FW_EQ_ETH_CMD_EQSIZE_V(nentries));
4645 
4646     c.eqaddr = cpu_to_be64(txq->q.phys_addr);
4647 
4648     /* If we're using the SGE Doorbell Queue Timer mechanism, pass in the
4649      * currently configured Timer Index.  This can be changed later via an
4650      * ethtool -C tx-usecs {Timer Val} command.  Note that the SGE
4651      * Doorbell Queue mode is currently automatically enabled in the
4652      * Firmware by setting either AUTOEQUEQE or AUTOEQUIQE ...
4653      */
4654     if (dbqt)
4655         c.timeren_timerix =
4656             cpu_to_be32(FW_EQ_ETH_CMD_TIMEREN_F |
4657                     FW_EQ_ETH_CMD_TIMERIX_V(txq->dbqtimerix));
4658 
4659     ret = t4_wr_mbox(adap, adap->mbox, &c, sizeof(c), &c);
4660     if (ret) {
4661         kfree(txq->q.sdesc);
4662         txq->q.sdesc = NULL;
4663         dma_free_coherent(adap->pdev_dev,
4664                   nentries * sizeof(struct tx_desc),
4665                   txq->q.desc, txq->q.phys_addr);
4666         txq->q.desc = NULL;
4667         return ret;
4668     }
4669 
4670     txq->q.q_type = CXGB4_TXQ_ETH;
4671     init_txq(adap, &txq->q, FW_EQ_ETH_CMD_EQID_G(ntohl(c.eqid_pkd)));
4672     txq->txq = netdevq;
4673     txq->tso = 0;
4674     txq->uso = 0;
4675     txq->tx_cso = 0;
4676     txq->vlan_ins = 0;
4677     txq->mapping_err = 0;
4678     txq->dbqt = dbqt;
4679 
4680     return 0;
4681 }
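/* Illustrative caller sketch (hypothetical variable names; the real callers
 * live elsewhere in the driver): an Ethernet TX queue is normally paired
 * with the ingress queue of its companion RX queue so that CIDX Updates
 * have somewhere sensible to go:
 *
 *	bool use_dbq_timers = false;            (assumed capability check)
 *	int err;
 *
 *	err = t4_sge_alloc_eth_txq(adap, &adap->sge.ethtxq[i], dev,
 *				   netdev_get_tx_queue(dev, i),
 *				   adap->sge.ethrxq[i].rspq.cntxt_id,
 *				   use_dbq_timers);
 */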
4682 
4683 int t4_sge_alloc_ctrl_txq(struct adapter *adap, struct sge_ctrl_txq *txq,
4684               struct net_device *dev, unsigned int iqid,
4685               unsigned int cmplqid)
4686 {
4687     unsigned int chip_ver = CHELSIO_CHIP_VERSION(adap->params.chip);
4688     struct port_info *pi = netdev_priv(dev);
4689     struct sge *s = &adap->sge;
4690     struct fw_eq_ctrl_cmd c;
4691     int ret, nentries;
4692 
4693     /* Add status entries */
4694     nentries = txq->q.size + s->stat_len / sizeof(struct tx_desc);
4695 
4696     txq->q.desc = alloc_ring(adap->pdev_dev, nentries,
4697                  sizeof(struct tx_desc), 0, &txq->q.phys_addr,
4698                  NULL, 0, dev_to_node(adap->pdev_dev));
4699     if (!txq->q.desc)
4700         return -ENOMEM;
4701 
4702     c.op_to_vfn = htonl(FW_CMD_OP_V(FW_EQ_CTRL_CMD) | FW_CMD_REQUEST_F |
4703                 FW_CMD_WRITE_F | FW_CMD_EXEC_F |
4704                 FW_EQ_CTRL_CMD_PFN_V(adap->pf) |
4705                 FW_EQ_CTRL_CMD_VFN_V(0));
4706     c.alloc_to_len16 = htonl(FW_EQ_CTRL_CMD_ALLOC_F |
4707                  FW_EQ_CTRL_CMD_EQSTART_F | FW_LEN16(c));
4708     c.cmpliqid_eqid = htonl(FW_EQ_CTRL_CMD_CMPLIQID_V(cmplqid));
4709     c.physeqid_pkd = htonl(0);
4710     c.fetchszm_to_iqid =
4711         htonl(FW_EQ_CTRL_CMD_HOSTFCMODE_V(HOSTFCMODE_STATUS_PAGE_X) |
4712               FW_EQ_CTRL_CMD_PCIECHN_V(pi->tx_chan) |
4713               FW_EQ_CTRL_CMD_FETCHRO_F | FW_EQ_CTRL_CMD_IQID_V(iqid));
4714     c.dcaen_to_eqsize =
4715         htonl(FW_EQ_CTRL_CMD_FBMIN_V(chip_ver <= CHELSIO_T5
4716                          ? FETCHBURSTMIN_64B_X
4717                          : FETCHBURSTMIN_64B_T6_X) |
4718               FW_EQ_CTRL_CMD_FBMAX_V(FETCHBURSTMAX_512B_X) |
4719               FW_EQ_CTRL_CMD_CIDXFTHRESH_V(CIDXFLUSHTHRESH_32_X) |
4720               FW_EQ_CTRL_CMD_EQSIZE_V(nentries));
4721     c.eqaddr = cpu_to_be64(txq->q.phys_addr);
4722 
4723     ret = t4_wr_mbox(adap, adap->mbox, &c, sizeof(c), &c);
4724     if (ret) {
4725         dma_free_coherent(adap->pdev_dev,
4726                   nentries * sizeof(struct tx_desc),
4727                   txq->q.desc, txq->q.phys_addr);
4728         txq->q.desc = NULL;
4729         return ret;
4730     }
4731 
4732     txq->q.q_type = CXGB4_TXQ_CTRL;
4733     init_txq(adap, &txq->q, FW_EQ_CTRL_CMD_EQID_G(ntohl(c.cmpliqid_eqid)));
4734     txq->adap = adap;
4735     skb_queue_head_init(&txq->sendq);
4736     tasklet_setup(&txq->qresume_tsk, restart_ctrlq);
4737     txq->full = 0;
4738     return 0;
4739 }
4740 
4741 int t4_sge_mod_ctrl_txq(struct adapter *adap, unsigned int eqid,
4742             unsigned int cmplqid)
4743 {
4744     u32 param, val;
4745 
4746     param = (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DMAQ) |
4747          FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_DMAQ_EQ_CMPLIQID_CTRL) |
4748          FW_PARAMS_PARAM_YZ_V(eqid));
4749     val = cmplqid;
4750     return t4_set_params(adap, adap->mbox, adap->pf, 0, 1, &param, &val);
4751 }
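/* t4_sge_mod_ctrl_txq() retargets an already-allocated control EQ at a new
 * completion ingress queue by writing a single DMAQ firmware parameter.  A
 * minimal sketch of rebinding control queue i, where new_cmplqid is assumed
 * to hold the cntxt_id of the ingress queue that should now receive the
 * EQ's completion messages:
 *
 *	struct sge_ctrl_txq *cq = &adap->sge.ctrlq[i];
 *	int ret = t4_sge_mod_ctrl_txq(adap, cq->q.cntxt_id, new_cmplqid);
 */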
4752 
4753 static int t4_sge_alloc_ofld_txq(struct adapter *adap, struct sge_txq *q,
4754                  struct net_device *dev, u32 cmd, u32 iqid)
4755 {
4756     unsigned int chip_ver = CHELSIO_CHIP_VERSION(adap->params.chip);
4757     struct port_info *pi = netdev_priv(dev);
4758     struct sge *s = &adap->sge;
4759     struct fw_eq_ofld_cmd c;
4760     u32 fb_min, nentries;
4761     int ret;
4762 
4763     /* Add status entries */
4764     nentries = q->size + s->stat_len / sizeof(struct tx_desc);
4765     q->desc = alloc_ring(adap->pdev_dev, q->size, sizeof(struct tx_desc),
4766                  sizeof(struct tx_sw_desc), &q->phys_addr,
4767                  &q->sdesc, s->stat_len, NUMA_NO_NODE);
4768     if (!q->desc)
4769         return -ENOMEM;
4770 
4771     if (chip_ver <= CHELSIO_T5)
4772         fb_min = FETCHBURSTMIN_64B_X;
4773     else
4774         fb_min = FETCHBURSTMIN_64B_T6_X;
4775 
4776     memset(&c, 0, sizeof(c));
4777     c.op_to_vfn = htonl(FW_CMD_OP_V(cmd) | FW_CMD_REQUEST_F |
4778                 FW_CMD_WRITE_F | FW_CMD_EXEC_F |
4779                 FW_EQ_OFLD_CMD_PFN_V(adap->pf) |
4780                 FW_EQ_OFLD_CMD_VFN_V(0));
4781     c.alloc_to_len16 = htonl(FW_EQ_OFLD_CMD_ALLOC_F |
4782                  FW_EQ_OFLD_CMD_EQSTART_F | FW_LEN16(c));
4783     c.fetchszm_to_iqid =
4784         htonl(FW_EQ_OFLD_CMD_HOSTFCMODE_V(HOSTFCMODE_STATUS_PAGE_X) |
4785               FW_EQ_OFLD_CMD_PCIECHN_V(pi->tx_chan) |
4786               FW_EQ_OFLD_CMD_FETCHRO_F | FW_EQ_OFLD_CMD_IQID_V(iqid));
4787     c.dcaen_to_eqsize =
4788         htonl(FW_EQ_OFLD_CMD_FBMIN_V(fb_min) |
4789               FW_EQ_OFLD_CMD_FBMAX_V(FETCHBURSTMAX_512B_X) |
4790               FW_EQ_OFLD_CMD_CIDXFTHRESH_V(CIDXFLUSHTHRESH_32_X) |
4791               FW_EQ_OFLD_CMD_EQSIZE_V(nentries));
4792     c.eqaddr = cpu_to_be64(q->phys_addr);
4793 
4794     ret = t4_wr_mbox(adap, adap->mbox, &c, sizeof(c), &c);
4795     if (ret) {
4796         kfree(q->sdesc);
4797         q->sdesc = NULL;
4798         dma_free_coherent(adap->pdev_dev,
4799                   nentries * sizeof(struct tx_desc),
4800                   q->desc, q->phys_addr);
4801         q->desc = NULL;
4802         return ret;
4803     }
4804 
4805     init_txq(adap, q, FW_EQ_OFLD_CMD_EQID_G(ntohl(c.eqid_pkd)));
4806     return 0;
4807 }
4808 
4809 int t4_sge_alloc_uld_txq(struct adapter *adap, struct sge_uld_txq *txq,
4810              struct net_device *dev, unsigned int iqid,
4811              unsigned int uld_type)
4812 {
4813     u32 cmd = FW_EQ_OFLD_CMD;
4814     int ret;
4815 
4816     if (unlikely(uld_type == CXGB4_TX_CRYPTO))
4817         cmd = FW_EQ_CTRL_CMD;
4818 
4819     ret = t4_sge_alloc_ofld_txq(adap, &txq->q, dev, cmd, iqid);
4820     if (ret)
4821         return ret;
4822 
4823     txq->q.q_type = CXGB4_TXQ_ULD;
4824     txq->adap = adap;
4825     skb_queue_head_init(&txq->sendq);
4826     tasklet_setup(&txq->qresume_tsk, restart_ofldq);
4827     txq->full = 0;
4828     txq->mapping_err = 0;
4829     return 0;
4830 }
4831 
4832 int t4_sge_alloc_ethofld_txq(struct adapter *adap, struct sge_eohw_txq *txq,
4833                  struct net_device *dev, u32 iqid)
4834 {
4835     int ret;
4836 
4837     ret = t4_sge_alloc_ofld_txq(adap, &txq->q, dev, FW_EQ_OFLD_CMD, iqid);
4838     if (ret)
4839         return ret;
4840 
4841     txq->q.q_type = CXGB4_TXQ_ULD;
4842     spin_lock_init(&txq->lock);
4843     txq->adap = adap;
4844     txq->tso = 0;
4845     txq->uso = 0;
4846     txq->tx_cso = 0;
4847     txq->vlan_ins = 0;
4848     txq->mapping_err = 0;
4849     return 0;
4850 }
4851 
4852 void free_txq(struct adapter *adap, struct sge_txq *q)
4853 {
4854     struct sge *s = &adap->sge;
4855 
4856     dma_free_coherent(adap->pdev_dev,
4857               q->size * sizeof(struct tx_desc) + s->stat_len,
4858               q->desc, q->phys_addr);
4859     q->cntxt_id = 0;
4860     q->sdesc = NULL;
4861     q->desc = NULL;
4862 }
4863 
4864 void free_rspq_fl(struct adapter *adap, struct sge_rspq *rq,
4865           struct sge_fl *fl)
4866 {
4867     struct sge *s = &adap->sge;
4868     unsigned int fl_id = fl ? fl->cntxt_id : 0xffff;
4869 
4870     adap->sge.ingr_map[rq->cntxt_id - adap->sge.ingr_start] = NULL;
4871     t4_iq_free(adap, adap->mbox, adap->pf, 0, FW_IQ_TYPE_FL_INT_CAP,
4872            rq->cntxt_id, fl_id, 0xffff);
4873     dma_free_coherent(adap->pdev_dev, (rq->size + 1) * rq->iqe_len,
4874               rq->desc, rq->phys_addr);
4875     netif_napi_del(&rq->napi);
4876     rq->netdev = NULL;
4877     rq->cntxt_id = rq->abs_id = 0;
4878     rq->desc = NULL;
4879 
4880     if (fl) {
4881         free_rx_bufs(adap, fl, fl->avail);
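        /* Each Free List descriptor is a single 8-byte (__be64) DMA bus
         * address, hence the fl->size * 8, plus the trailing Status Page.
         */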
4882         dma_free_coherent(adap->pdev_dev, fl->size * 8 + s->stat_len,
4883                   fl->desc, fl->addr);
4884         kfree(fl->sdesc);
4885         fl->sdesc = NULL;
4886         fl->cntxt_id = 0;
4887         fl->desc = NULL;
4888     }
4889 }
4890 
4891 /**
4892  *      t4_free_ofld_rxqs - free a block of consecutive Rx queues
4893  *      @adap: the adapter
4894  *      @n: number of queues
4895  *      @q: pointer to first queue
4896  *
4897  *      Release the resources of a consecutive block of offload Rx queues.
4898  */
4899 void t4_free_ofld_rxqs(struct adapter *adap, int n, struct sge_ofld_rxq *q)
4900 {
4901     for ( ; n; n--, q++)
4902         if (q->rspq.desc)
4903             free_rspq_fl(adap, &q->rspq,
4904                      q->fl.size ? &q->fl : NULL);
4905 }
4906 
4907 void t4_sge_free_ethofld_txq(struct adapter *adap, struct sge_eohw_txq *txq)
4908 {
4909     if (txq->q.desc) {
4910         t4_ofld_eq_free(adap, adap->mbox, adap->pf, 0,
4911                 txq->q.cntxt_id);
4912         free_tx_desc(adap, &txq->q, txq->q.in_use, false);
4913         kfree(txq->q.sdesc);
4914         free_txq(adap, &txq->q);
4915     }
4916 }
4917 
4918 /**
4919  *  t4_free_sge_resources - free SGE resources
4920  *  @adap: the adapter
4921  *
4922  *  Frees resources used by the SGE queue sets.
4923  */
4924 void t4_free_sge_resources(struct adapter *adap)
4925 {
4926     int i;
4927     struct sge_eth_rxq *eq;
4928     struct sge_eth_txq *etq;
4929 
4930     /* stop all Rx queues in order to start them draining */
4931     for (i = 0; i < adap->sge.ethqsets; i++) {
4932         eq = &adap->sge.ethrxq[i];
4933         if (eq->rspq.desc)
4934             t4_iq_stop(adap, adap->mbox, adap->pf, 0,
4935                    FW_IQ_TYPE_FL_INT_CAP,
4936                    eq->rspq.cntxt_id,
4937                    eq->fl.size ? eq->fl.cntxt_id : 0xffff,
4938                    0xffff);
4939     }
4940 
4941     /* clean up Ethernet Tx/Rx queues */
4942     for (i = 0; i < adap->sge.ethqsets; i++) {
4943         eq = &adap->sge.ethrxq[i];
4944         if (eq->rspq.desc)
4945             free_rspq_fl(adap, &eq->rspq,
4946                      eq->fl.size ? &eq->fl : NULL);
4947         if (eq->msix) {
4948             cxgb4_free_msix_idx_in_bmap(adap, eq->msix->idx);
4949             eq->msix = NULL;
4950         }
4951 
4952         etq = &adap->sge.ethtxq[i];
4953         if (etq->q.desc) {
4954             t4_eth_eq_free(adap, adap->mbox, adap->pf, 0,
4955                        etq->q.cntxt_id);
4956             __netif_tx_lock_bh(etq->txq);
4957             free_tx_desc(adap, &etq->q, etq->q.in_use, true);
4958             __netif_tx_unlock_bh(etq->txq);
4959             kfree(etq->q.sdesc);
4960             free_txq(adap, &etq->q);
4961         }
4962     }
4963 
4964     /* clean up control Tx queues */
4965     for (i = 0; i < ARRAY_SIZE(adap->sge.ctrlq); i++) {
4966         struct sge_ctrl_txq *cq = &adap->sge.ctrlq[i];
4967 
4968         if (cq->q.desc) {
4969             tasklet_kill(&cq->qresume_tsk);
4970             t4_ctrl_eq_free(adap, adap->mbox, adap->pf, 0,
4971                     cq->q.cntxt_id);
4972             __skb_queue_purge(&cq->sendq);
4973             free_txq(adap, &cq->q);
4974         }
4975     }
4976 
4977     if (adap->sge.fw_evtq.desc) {
4978         free_rspq_fl(adap, &adap->sge.fw_evtq, NULL);
4979         if (adap->sge.fwevtq_msix_idx >= 0)
4980             cxgb4_free_msix_idx_in_bmap(adap,
4981                             adap->sge.fwevtq_msix_idx);
4982     }
4983 
4984     if (adap->sge.nd_msix_idx >= 0)
4985         cxgb4_free_msix_idx_in_bmap(adap, adap->sge.nd_msix_idx);
4986 
4987     if (adap->sge.intrq.desc)
4988         free_rspq_fl(adap, &adap->sge.intrq, NULL);
4989 
4990     if (!is_t4(adap->params.chip)) {
4991         etq = &adap->sge.ptptxq;
4992         if (etq->q.desc) {
4993             t4_eth_eq_free(adap, adap->mbox, adap->pf, 0,
4994                        etq->q.cntxt_id);
4995             spin_lock_bh(&adap->ptp_lock);
4996             free_tx_desc(adap, &etq->q, etq->q.in_use, true);
4997             spin_unlock_bh(&adap->ptp_lock);
4998             kfree(etq->q.sdesc);
4999             free_txq(adap, &etq->q);
5000         }
5001     }
5002 
5003     /* clear the reverse egress queue map */
5004     memset(adap->sge.egr_map, 0,
5005            adap->sge.egr_sz * sizeof(*adap->sge.egr_map));
5006 }
5007 
5008 void t4_sge_start(struct adapter *adap)
5009 {
5010     adap->sge.ethtxq_rover = 0;
5011     mod_timer(&adap->sge.rx_timer, jiffies + RX_QCHECK_PERIOD);
5012     mod_timer(&adap->sge.tx_timer, jiffies + TX_QCHECK_PERIOD);
5013 }
5014 
5015 /**
5016  *  t4_sge_stop - disable SGE operation
5017  *  @adap: the adapter
5018  *
5019  *  Stop tasklets and timers associated with the DMA engine.  Note that
5020  *  this is effective only if measures have been taken to disable any HW
5021  *  events that may restart them.
5022  */
5023 void t4_sge_stop(struct adapter *adap)
5024 {
5025     int i;
5026     struct sge *s = &adap->sge;
5027 
5028     if (s->rx_timer.function)
5029         del_timer_sync(&s->rx_timer);
5030     if (s->tx_timer.function)
5031         del_timer_sync(&s->tx_timer);
5032 
5033     if (is_offload(adap)) {
5034         struct sge_uld_txq_info *txq_info;
5035 
5036         txq_info = adap->sge.uld_txq_info[CXGB4_TX_OFLD];
5037         if (txq_info) {
5038             struct sge_uld_txq *txq = txq_info->uldtxq;
5039 
5040             for_each_ofldtxq(&adap->sge, i) {
5041                 if (txq->q.desc)
5042                     tasklet_kill(&txq->qresume_tsk);
5043             }
5044         }
5045     }
5046 
5047     if (is_pci_uld(adap)) {
5048         struct sge_uld_txq_info *txq_info;
5049 
5050         txq_info = adap->sge.uld_txq_info[CXGB4_TX_CRYPTO];
5051         if (txq_info) {
5052             struct sge_uld_txq *txq = txq_info->uldtxq;
5053 
5054             for_each_ofldtxq(&adap->sge, i) {
5055                 if (txq->q.desc)
5056                     tasklet_kill(&txq->qresume_tsk);
5057             }
5058         }
5059     }
5060 
5061     for (i = 0; i < ARRAY_SIZE(s->ctrlq); i++) {
5062         struct sge_ctrl_txq *cq = &s->ctrlq[i];
5063 
5064         if (cq->q.desc)
5065             tasklet_kill(&cq->qresume_tsk);
5066     }
5067 }
5068 
5069 /**
5070  *  t4_sge_init_soft - grab core SGE values needed by SGE code
5071  *  @adap: the adapter
5072  *
5073  *  Read the SGE operating parameters that the driver depends on and
5074  *  verify that we can live with them.
5075  */
5076 
5077 static int t4_sge_init_soft(struct adapter *adap)
5078 {
5079     struct sge *s = &adap->sge;
5080     u32 fl_small_pg, fl_large_pg, fl_small_mtu, fl_large_mtu;
5081     u32 timer_value_0_and_1, timer_value_2_and_3, timer_value_4_and_5;
5082     u32 ingress_rx_threshold;
5083 
5084     /*
5085      * Verify that CPL messages are going to the Ingress Queue for
5086      * process_responses() and that only packet data is going to the
5087      * Free Lists.
5088      */
5089     if ((t4_read_reg(adap, SGE_CONTROL_A) & RXPKTCPLMODE_F) !=
5090         RXPKTCPLMODE_V(RXPKTCPLMODE_SPLIT_X)) {
5091         dev_err(adap->pdev_dev, "bad SGE CPL MODE\n");
5092         return -EINVAL;
5093     }
5094 
5095     /*
5096      * Validate the Host Buffer Register Array indices that we want to
5097      * use ...
5098      *
5099      * XXX Note that we should really read through the Host Buffer Size
5100      * XXX register array and find the indices of the Buffer Sizes which
5101      * XXX meet our needs!
5102      */
5103     #define READ_FL_BUF(x) \
5104         t4_read_reg(adap, SGE_FL_BUFFER_SIZE0_A+(x)*sizeof(u32))
5105 
5106     fl_small_pg = READ_FL_BUF(RX_SMALL_PG_BUF);
5107     fl_large_pg = READ_FL_BUF(RX_LARGE_PG_BUF);
5108     fl_small_mtu = READ_FL_BUF(RX_SMALL_MTU_BUF);
5109     fl_large_mtu = READ_FL_BUF(RX_LARGE_MTU_BUF);
5110 
5111     /* We only bother using the Large Page logic if the Large Page Buffer
5112      * is larger than our Page Size Buffer.
5113      */
5114     if (fl_large_pg <= fl_small_pg)
5115         fl_large_pg = 0;
5116 
5117     #undef READ_FL_BUF
5118 
5119     /* The Page Size Buffer must be exactly equal to our Page Size and the
5120      * Large Page Size Buffer should be 0 (per above) or a power of 2.
5121      */
5122     if (fl_small_pg != PAGE_SIZE ||
5123         (fl_large_pg & (fl_large_pg-1)) != 0) {
5124         dev_err(adap->pdev_dev, "bad SGE FL page buffer sizes [%d, %d]\n",
5125             fl_small_pg, fl_large_pg);
5126         return -EINVAL;
5127     }
5128     if (fl_large_pg)
5129         s->fl_pg_order = ilog2(fl_large_pg) - PAGE_SHIFT;
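    /* For example, with 4KB pages (PAGE_SHIFT == 12) and a 64KB Large Page
     * Buffer, fl_pg_order = ilog2(65536) - 12 = 4, i.e. large Free List
     * buffers are order-4 page allocations.
     */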
5130 
5131     if (fl_small_mtu < FL_MTU_SMALL_BUFSIZE(adap) ||
5132         fl_large_mtu < FL_MTU_LARGE_BUFSIZE(adap)) {
5133         dev_err(adap->pdev_dev, "bad SGE FL MTU sizes [%d, %d]\n",
5134             fl_small_mtu, fl_large_mtu);
5135         return -EINVAL;
5136     }
5137 
5138     /*
5139      * Retrieve our RX interrupt holdoff timer values and counter
5140      * threshold values from the SGE parameters.
5141      */
5142     timer_value_0_and_1 = t4_read_reg(adap, SGE_TIMER_VALUE_0_AND_1_A);
5143     timer_value_2_and_3 = t4_read_reg(adap, SGE_TIMER_VALUE_2_AND_3_A);
5144     timer_value_4_and_5 = t4_read_reg(adap, SGE_TIMER_VALUE_4_AND_5_A);
5145     s->timer_val[0] = core_ticks_to_us(adap,
5146         TIMERVALUE0_G(timer_value_0_and_1));
5147     s->timer_val[1] = core_ticks_to_us(adap,
5148         TIMERVALUE1_G(timer_value_0_and_1));
5149     s->timer_val[2] = core_ticks_to_us(adap,
5150         TIMERVALUE2_G(timer_value_2_and_3));
5151     s->timer_val[3] = core_ticks_to_us(adap,
5152         TIMERVALUE3_G(timer_value_2_and_3));
5153     s->timer_val[4] = core_ticks_to_us(adap,
5154         TIMERVALUE4_G(timer_value_4_and_5));
5155     s->timer_val[5] = core_ticks_to_us(adap,
5156         TIMERVALUE5_G(timer_value_4_and_5));
5157 
5158     ingress_rx_threshold = t4_read_reg(adap, SGE_INGRESS_RX_THRESHOLD_A);
5159     s->counter_val[0] = THRESHOLD_0_G(ingress_rx_threshold);
5160     s->counter_val[1] = THRESHOLD_1_G(ingress_rx_threshold);
5161     s->counter_val[2] = THRESHOLD_2_G(ingress_rx_threshold);
5162     s->counter_val[3] = THRESHOLD_3_G(ingress_rx_threshold);
5163 
5164     return 0;
5165 }
5166 
5167 /**
5168  *     t4_sge_init - initialize SGE
5169  *     @adap: the adapter
5170  *
5171  *     Perform low-level SGE code initialization needed every time after a
5172  *     chip reset.
5173  */
5174 int t4_sge_init(struct adapter *adap)
5175 {
5176     struct sge *s = &adap->sge;
5177     u32 sge_control, sge_conm_ctrl;
5178     int ret, egress_threshold;
5179 
5180     /*
5181      * Ingress Padding Boundary and Egress Status Page Size are set up by
5182      * t4_fixup_host_params().
5183      */
5184     sge_control = t4_read_reg(adap, SGE_CONTROL_A);
5185     s->pktshift = PKTSHIFT_G(sge_control);
5186     s->stat_len = (sge_control & EGRSTATUSPAGESIZE_F) ? 128 : 64;
5187 
5188     s->fl_align = t4_fl_pkt_align(adap);
5189     ret = t4_sge_init_soft(adap);
5190     if (ret < 0)
5191         return ret;
5192 
5193     /*
5194      * A FL with <= fl_starve_thres buffers is starving and a periodic
5195      * timer will attempt to refill it.  This needs to be larger than the
5196      * SGE's Egress Congestion Threshold.  If it isn't, then we can get
5197      * stuck waiting for new packets while the SGE is waiting for us to
5198      * give it more Free List entries.  (Note that the SGE's Egress
5199      * Congestion Threshold is in units of 2 Free List pointers.) For T4,
5200      * there was only a single field to control this.  For T5 there's the
5201      * original field which now only applies to Unpacked Mode Free List
5202      * buffers and a new field which only applies to Packed Mode Free List
5203      * buffers.
5204      */
5205     sge_conm_ctrl = t4_read_reg(adap, SGE_CONM_CTRL_A);
5206     switch (CHELSIO_CHIP_VERSION(adap->params.chip)) {
5207     case CHELSIO_T4:
5208         egress_threshold = EGRTHRESHOLD_G(sge_conm_ctrl);
5209         break;
5210     case CHELSIO_T5:
5211         egress_threshold = EGRTHRESHOLDPACKING_G(sge_conm_ctrl);
5212         break;
5213     case CHELSIO_T6:
5214         egress_threshold = T6_EGRTHRESHOLDPACKING_G(sge_conm_ctrl);
5215         break;
5216     default:
5217         dev_err(adap->pdev_dev, "Unsupported Chip version %d\n",
5218             CHELSIO_CHIP_VERSION(adap->params.chip));
5219         return -EINVAL;
5220     }
5221     s->fl_starve_thres = 2*egress_threshold + 1;
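    /* Worked example: if the threshold field reads back as 16, the SGE
     * starts signalling Egress Congestion at 2 * 16 = 32 Free List pointers,
     * so any FL left with 2 * 16 + 1 = 33 or fewer available buffers is
     * treated as starving and topped up from the Rx timer callback.
     */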
5222 
5223     t4_idma_monitor_init(adap, &s->idma_monitor);
5224 
5225     /* Set up timers used for recurring callbacks to process RX and TX
5226      * administrative tasks.
5227      */
5228     timer_setup(&s->rx_timer, sge_rx_timer_cb, 0);
5229     timer_setup(&s->tx_timer, sge_tx_timer_cb, 0);
5230 
5231     spin_lock_init(&s->intrq_lock);
5232 
5233     return 0;
5234 }