0001 /*
0002  * Copyright (c) 2006 Mellanox Technologies. All rights reserved
0003  *
0004  * This software is available to you under a choice of one of two
0005  * licenses.  You may choose to be licensed under the terms of the GNU
0006  * General Public License (GPL) Version 2, available from the file
0007  * COPYING in the main directory of this source tree, or the
0008  * OpenIB.org BSD license below:
0009  *
0010  *     Redistribution and use in source and binary forms, with or
0011  *     without modification, are permitted provided that the following
0012  *     conditions are met:
0013  *
0014  *      - Redistributions of source code must retain the above
0015  *        copyright notice, this list of conditions and the following
0016  *        disclaimer.
0017  *
0018  *      - Redistributions in binary form must reproduce the above
0019  *        copyright notice, this list of conditions and the following
0020  *        disclaimer in the documentation and/or other materials
0021  *        provided with the distribution.
0022  *
0023  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
0024  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
0025  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
0026  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
0027  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
0028  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
0029  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
0030  * SOFTWARE.
0031  */
0032 
0033 #include <rdma/ib_cm.h>
0034 #include <net/dst.h>
0035 #include <net/icmp.h>
0036 #include <linux/icmpv6.h>
0037 #include <linux/delay.h>
0038 #include <linux/slab.h>
0039 #include <linux/vmalloc.h>
0040 #include <linux/moduleparam.h>
0041 #include <linux/sched/signal.h>
0042 #include <linux/sched/mm.h>
0043 
0044 #include "ipoib.h"
0045 
0046 int ipoib_max_conn_qp = 128;
0047 
0048 module_param_named(max_nonsrq_conn_qp, ipoib_max_conn_qp, int, 0444);
0049 MODULE_PARM_DESC(max_nonsrq_conn_qp,
0050          "Max number of connected-mode QPs per interface "
0051          "(applied only if shared receive queue is not available)");
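/*
 * A typical way to raise this limit at module load time (assuming the
 * usual ib_ipoib module name; adjust for your setup):
 *
 *     modprobe ib_ipoib max_nonsrq_conn_qp=64
 *
 * The parameter is created with 0444 permissions, so it is read-only
 * through sysfs and is only consulted when the HCA does not provide a
 * shared receive queue.
 */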
0052 
0053 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
0054 static int data_debug_level;
0055 
0056 module_param_named(cm_data_debug_level, data_debug_level, int, 0644);
0057 MODULE_PARM_DESC(cm_data_debug_level,
0058          "Enable data path debug tracing for connected mode if > 0");
0059 #endif
0060 
0061 #define IPOIB_CM_IETF_ID 0x1000000000000000ULL
0062 
0063 #define IPOIB_CM_RX_UPDATE_TIME (256 * HZ)
0064 #define IPOIB_CM_RX_TIMEOUT     (2 * 256 * HZ)
0065 #define IPOIB_CM_RX_DELAY       (3 * 256 * HZ)
0066 #define IPOIB_CM_RX_UPDATE_MASK (0x3)
0067 
0068 #define IPOIB_CM_RX_RESERVE     (ALIGN(IPOIB_HARD_LEN, 16) - IPOIB_ENCAP_LEN)
0069 
0070 static struct ib_qp_attr ipoib_cm_err_attr = {
0071     .qp_state = IB_QPS_ERR
0072 };
0073 
0074 #define IPOIB_CM_RX_DRAIN_WRID 0xffffffff
0075 
0076 static struct ib_send_wr ipoib_cm_rx_drain_wr = {
0077     .opcode = IB_WR_SEND,
0078 };
0079 
0080 static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
0081                    const struct ib_cm_event *event);
0082 
0083 static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags,
0084                   u64 mapping[IPOIB_CM_RX_SG])
0085 {
0086     int i;
0087 
0088     ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE);
0089 
0090     for (i = 0; i < frags; ++i)
0091         ib_dma_unmap_page(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE);
0092 }
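/*
 * Layout of the per-buffer mapping[] array used here and by the
 * allocation and posting helpers below: mapping[0] holds the DMA
 * address of the linear skb head (IPOIB_CM_HEAD_SIZE bytes) and
 * mapping[1..frags] hold the DMA addresses of the PAGE_SIZE page
 * fragments attached to the skb.
 */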
0093 
0094 static int ipoib_cm_post_receive_srq(struct net_device *dev, int id)
0095 {
0096     struct ipoib_dev_priv *priv = ipoib_priv(dev);
0097     int i, ret;
0098 
0099     priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;
0100 
0101     for (i = 0; i < priv->cm.num_frags; ++i)
0102         priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i];
0103 
0104     ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, NULL);
0105     if (unlikely(ret)) {
0106         ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret);
0107         ipoib_cm_dma_unmap_rx(priv, priv->cm.num_frags - 1,
0108                       priv->cm.srq_ring[id].mapping);
0109         dev_kfree_skb_any(priv->cm.srq_ring[id].skb);
0110         priv->cm.srq_ring[id].skb = NULL;
0111     }
0112 
0113     return ret;
0114 }
0115 
0116 static int ipoib_cm_post_receive_nonsrq(struct net_device *dev,
0117                     struct ipoib_cm_rx *rx,
0118                     struct ib_recv_wr *wr,
0119                     struct ib_sge *sge, int id)
0120 {
0121     struct ipoib_dev_priv *priv = ipoib_priv(dev);
0122     int i, ret;
0123 
0124     wr->wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;
0125 
0126     for (i = 0; i < IPOIB_CM_RX_SG; ++i)
0127         sge[i].addr = rx->rx_ring[id].mapping[i];
0128 
0129     ret = ib_post_recv(rx->qp, wr, NULL);
0130     if (unlikely(ret)) {
0131         ipoib_warn(priv, "post recv failed for buf %d (%d)\n", id, ret);
0132         ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
0133                       rx->rx_ring[id].mapping);
0134         dev_kfree_skb_any(rx->rx_ring[id].skb);
0135         rx->rx_ring[id].skb = NULL;
0136     }
0137 
0138     return ret;
0139 }
0140 
0141 static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev,
0142                          struct ipoib_cm_rx_buf *rx_ring,
0143                          int id, int frags,
0144                          u64 mapping[IPOIB_CM_RX_SG],
0145                          gfp_t gfp)
0146 {
0147     struct ipoib_dev_priv *priv = ipoib_priv(dev);
0148     struct sk_buff *skb;
0149     int i;
0150 
0151     skb = dev_alloc_skb(ALIGN(IPOIB_CM_HEAD_SIZE + IPOIB_PSEUDO_LEN, 16));
0152     if (unlikely(!skb))
0153         return NULL;
0154 
0155     /*
0156      * IPoIB adds an IPOIB_ENCAP_LEN byte header; this aligns the
0157      * IP header to a multiple of 16.
0158      */
0159     skb_reserve(skb, IPOIB_CM_RX_RESERVE);
0160 
0161     mapping[0] = ib_dma_map_single(priv->ca, skb->data, IPOIB_CM_HEAD_SIZE,
0162                        DMA_FROM_DEVICE);
0163     if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0]))) {
0164         dev_kfree_skb_any(skb);
0165         return NULL;
0166     }
0167 
0168     for (i = 0; i < frags; i++) {
0169         struct page *page = alloc_page(gfp);
0170 
0171         if (!page)
0172             goto partial_error;
0173         skb_fill_page_desc(skb, i, page, 0, PAGE_SIZE);
0174 
0175         mapping[i + 1] = ib_dma_map_page(priv->ca, page,
0176                          0, PAGE_SIZE, DMA_FROM_DEVICE);
0177         if (unlikely(ib_dma_mapping_error(priv->ca, mapping[i + 1])))
0178             goto partial_error;
0179     }
0180 
0181     rx_ring[id].skb = skb;
0182     return skb;
0183 
0184 partial_error:
0185 
0186     ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE);
0187 
0188     for (; i > 0; --i)
0189         ib_dma_unmap_page(priv->ca, mapping[i], PAGE_SIZE, DMA_FROM_DEVICE);
0190 
0191     dev_kfree_skb_any(skb);
0192     return NULL;
0193 }
0194 
0195 static void ipoib_cm_free_rx_ring(struct net_device *dev,
0196                   struct ipoib_cm_rx_buf *rx_ring)
0197 {
0198     struct ipoib_dev_priv *priv = ipoib_priv(dev);
0199     int i;
0200 
0201     for (i = 0; i < ipoib_recvq_size; ++i)
0202         if (rx_ring[i].skb) {
0203             ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
0204                           rx_ring[i].mapping);
0205             dev_kfree_skb_any(rx_ring[i].skb);
0206         }
0207 
0208     vfree(rx_ring);
0209 }
0210 
0211 static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv)
0212 {
0213     struct ipoib_cm_rx *p;
0214 
0215     /* We only reserved 1 extra slot in the CQ for drain WRs, so
0216      * make sure we have at most 1 outstanding drain WR. */
0217     if (list_empty(&priv->cm.rx_flush_list) ||
0218         !list_empty(&priv->cm.rx_drain_list))
0219         return;
0220 
0221     /*
0222      * QPs on the flush list are in the error state.  This way, a "flush
0223      * error" WC will be immediately generated for each WR we post.
0224      */
0225     p = list_entry(priv->cm.rx_flush_list.next, typeof(*p), list);
0226     ipoib_cm_rx_drain_wr.wr_id = IPOIB_CM_RX_DRAIN_WRID;
0227     if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, NULL))
0228         ipoib_warn(priv, "failed to post drain wr\n");
0229 
0230     list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list);
0231 }
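/*
 * Drain scheme, completed in ipoib_cm_handle_rx_wc(): QPs that have
 * reported IB_EVENT_QP_LAST_WQE_REACHED sit on rx_flush_list in the
 * error state.  A single send WR tagged IPOIB_CM_RX_DRAIN_WRID is
 * posted on one of them and immediately flushes back as a "flush
 * error" completion on the shared recv_cq.  Once that completion is
 * polled, every QP that was moved onto rx_drain_list is treated as
 * drained and handed to the rx_reap_task.
 */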
0232 
0233 static void ipoib_cm_rx_event_handler(struct ib_event *event, void *ctx)
0234 {
0235     struct ipoib_cm_rx *p = ctx;
0236     struct ipoib_dev_priv *priv = ipoib_priv(p->dev);
0237     unsigned long flags;
0238 
0239     if (event->event != IB_EVENT_QP_LAST_WQE_REACHED)
0240         return;
0241 
0242     spin_lock_irqsave(&priv->lock, flags);
0243     list_move(&p->list, &priv->cm.rx_flush_list);
0244     p->state = IPOIB_CM_RX_FLUSH;
0245     ipoib_cm_start_rx_drain(priv);
0246     spin_unlock_irqrestore(&priv->lock, flags);
0247 }
0248 
0249 static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev,
0250                        struct ipoib_cm_rx *p)
0251 {
0252     struct ipoib_dev_priv *priv = ipoib_priv(dev);
0253     struct ib_qp_init_attr attr = {
0254         .event_handler = ipoib_cm_rx_event_handler,
0255         .send_cq = priv->recv_cq, /* For drain WR */
0256         .recv_cq = priv->recv_cq,
0257         .srq = priv->cm.srq,
0258         .cap.max_send_wr = 1, /* For drain WR */
0259         .cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */
0260         .sq_sig_type = IB_SIGNAL_ALL_WR,
0261         .qp_type = IB_QPT_RC,
0262         .qp_context = p,
0263     };
0264 
0265     if (!ipoib_cm_has_srq(dev)) {
0266         attr.cap.max_recv_wr  = ipoib_recvq_size;
0267         attr.cap.max_recv_sge = IPOIB_CM_RX_SG;
0268     }
0269 
0270     return ib_create_qp(priv->pd, &attr);
0271 }
0272 
0273 static int ipoib_cm_modify_rx_qp(struct net_device *dev,
0274                  struct ib_cm_id *cm_id, struct ib_qp *qp,
0275                  unsigned int psn)
0276 {
0277     struct ipoib_dev_priv *priv = ipoib_priv(dev);
0278     struct ib_qp_attr qp_attr;
0279     int qp_attr_mask, ret;
0280 
0281     qp_attr.qp_state = IB_QPS_INIT;
0282     ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
0283     if (ret) {
0284         ipoib_warn(priv, "failed to init QP attr for INIT: %d\n", ret);
0285         return ret;
0286     }
0287     ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
0288     if (ret) {
0289         ipoib_warn(priv, "failed to modify QP to INIT: %d\n", ret);
0290         return ret;
0291     }
0292     qp_attr.qp_state = IB_QPS_RTR;
0293     ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
0294     if (ret) {
0295         ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret);
0296         return ret;
0297     }
0298     qp_attr.rq_psn = psn;
0299     ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
0300     if (ret) {
0301         ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
0302         return ret;
0303     }
0304 
0305     /*
0306      * Current Mellanox HCA firmware won't generate completions
0307      * with error for drain WRs unless the QP has been moved to
0308      * RTS first. This work-around leaves a window where a QP has
0309      * moved to error asynchronously, but this will eventually get
0310      * fixed in firmware, so let's not error out if modify QP
0311      * fails.
0312      */
0313     qp_attr.qp_state = IB_QPS_RTS;
0314     ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
0315     if (ret) {
0316         ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
0317         return 0;
0318     }
0319     ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
0320     if (ret) {
0321         ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
0322         return 0;
0323     }
0324 
0325     return 0;
0326 }
0327 
0328 static void ipoib_cm_init_rx_wr(struct net_device *dev,
0329                 struct ib_recv_wr *wr,
0330                 struct ib_sge *sge)
0331 {
0332     struct ipoib_dev_priv *priv = ipoib_priv(dev);
0333     int i;
0334 
0335     for (i = 0; i < priv->cm.num_frags; ++i)
0336         sge[i].lkey = priv->pd->local_dma_lkey;
0337 
0338     sge[0].length = IPOIB_CM_HEAD_SIZE;
0339     for (i = 1; i < priv->cm.num_frags; ++i)
0340         sge[i].length = PAGE_SIZE;
0341 
0342     wr->next    = NULL;
0343     wr->sg_list = sge;
0344     wr->num_sge = priv->cm.num_frags;
0345 }
0346 
0347 static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_id,
0348                    struct ipoib_cm_rx *rx)
0349 {
0350     struct ipoib_dev_priv *priv = ipoib_priv(dev);
0351     struct {
0352         struct ib_recv_wr wr;
0353         struct ib_sge sge[IPOIB_CM_RX_SG];
0354     } *t;
0355     int ret;
0356     int i;
0357 
0358     rx->rx_ring = vzalloc(array_size(ipoib_recvq_size,
0359                      sizeof(*rx->rx_ring)));
0360     if (!rx->rx_ring)
0361         return -ENOMEM;
0362 
0363     t = kmalloc(sizeof(*t), GFP_KERNEL);
0364     if (!t) {
0365         ret = -ENOMEM;
0366         goto err_free_1;
0367     }
0368 
0369     ipoib_cm_init_rx_wr(dev, &t->wr, t->sge);
0370 
0371     spin_lock_irq(&priv->lock);
0372 
0373     if (priv->cm.nonsrq_conn_qp >= ipoib_max_conn_qp) {
0374         spin_unlock_irq(&priv->lock);
0375         ib_send_cm_rej(cm_id, IB_CM_REJ_NO_QP, NULL, 0, NULL, 0);
0376         ret = -EINVAL;
0377         goto err_free;
0378     } else
0379         ++priv->cm.nonsrq_conn_qp;
0380 
0381     spin_unlock_irq(&priv->lock);
0382 
0383     for (i = 0; i < ipoib_recvq_size; ++i) {
0384         if (!ipoib_cm_alloc_rx_skb(dev, rx->rx_ring, i, IPOIB_CM_RX_SG - 1,
0385                        rx->rx_ring[i].mapping,
0386                        GFP_KERNEL)) {
0387             ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
0388             ret = -ENOMEM;
0389             goto err_count;
0390         }
0391         ret = ipoib_cm_post_receive_nonsrq(dev, rx, &t->wr, t->sge, i);
0392         if (ret) {
0393             ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq "
0394                    "failed for buf %d\n", i);
0395             ret = -EIO;
0396             goto err_count;
0397         }
0398     }
0399 
0400     rx->recv_count = ipoib_recvq_size;
0401 
0402     kfree(t);
0403 
0404     return 0;
0405 
0406 err_count:
0407     spin_lock_irq(&priv->lock);
0408     --priv->cm.nonsrq_conn_qp;
0409     spin_unlock_irq(&priv->lock);
0410 
0411 err_free:
0412     kfree(t);
0413 
0414 err_free_1:
0415     ipoib_cm_free_rx_ring(dev, rx->rx_ring);
0416 
0417     return ret;
0418 }
0419 
0420 static int ipoib_cm_send_rep(struct net_device *dev, struct ib_cm_id *cm_id,
0421                  struct ib_qp *qp,
0422                  const struct ib_cm_req_event_param *req,
0423                  unsigned int psn)
0424 {
0425     struct ipoib_dev_priv *priv = ipoib_priv(dev);
0426     struct ipoib_cm_data data = {};
0427     struct ib_cm_rep_param rep = {};
0428 
0429     data.qpn = cpu_to_be32(priv->qp->qp_num);
0430     data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE);
0431 
0432     rep.private_data = &data;
0433     rep.private_data_len = sizeof(data);
0434     rep.flow_control = 0;
0435     rep.rnr_retry_count = req->rnr_retry_count;
0436     rep.srq = ipoib_cm_has_srq(dev);
0437     rep.qp_num = qp->qp_num;
0438     rep.starting_psn = psn;
0439     return ib_send_cm_rep(cm_id, &rep);
0440 }
0441 
0442 static int ipoib_cm_req_handler(struct ib_cm_id *cm_id,
0443                 const struct ib_cm_event *event)
0444 {
0445     struct net_device *dev = cm_id->context;
0446     struct ipoib_dev_priv *priv = ipoib_priv(dev);
0447     struct ipoib_cm_rx *p;
0448     unsigned int psn;
0449     int ret;
0450 
0451     ipoib_dbg(priv, "REQ arrived\n");
0452     p = kzalloc(sizeof(*p), GFP_KERNEL);
0453     if (!p)
0454         return -ENOMEM;
0455     p->dev = dev;
0456     p->id = cm_id;
0457     cm_id->context = p;
0458     p->state = IPOIB_CM_RX_LIVE;
0459     p->jiffies = jiffies;
0460     INIT_LIST_HEAD(&p->list);
0461 
0462     p->qp = ipoib_cm_create_rx_qp(dev, p);
0463     if (IS_ERR(p->qp)) {
0464         ret = PTR_ERR(p->qp);
0465         goto err_qp;
0466     }
0467 
0468     psn = prandom_u32() & 0xffffff;
0469     ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn);
0470     if (ret)
0471         goto err_modify;
0472 
0473     if (!ipoib_cm_has_srq(dev)) {
0474         ret = ipoib_cm_nonsrq_init_rx(dev, cm_id, p);
0475         if (ret)
0476             goto err_modify;
0477     }
0478 
0479     spin_lock_irq(&priv->lock);
0480     queue_delayed_work(priv->wq,
0481                &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
0482     /* Add this entry to the head of the passive_ids list, but do not re-add
0483      * it if IB_EVENT_QP_LAST_WQE_REACHED has already moved it to the flush list. */
0484     p->jiffies = jiffies;
0485     if (p->state == IPOIB_CM_RX_LIVE)
0486         list_move(&p->list, &priv->cm.passive_ids);
0487     spin_unlock_irq(&priv->lock);
0488 
0489     ret = ipoib_cm_send_rep(dev, cm_id, p->qp, &event->param.req_rcvd, psn);
0490     if (ret) {
0491         ipoib_warn(priv, "failed to send REP: %d\n", ret);
0492         if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE))
0493             ipoib_warn(priv, "unable to move qp to error state\n");
0494     }
0495     return 0;
0496 
0497 err_modify:
0498     ib_destroy_qp(p->qp);
0499 err_qp:
0500     kfree(p);
0501     return ret;
0502 }
0503 
0504 static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id,
0505                    const struct ib_cm_event *event)
0506 {
0507     struct ipoib_cm_rx *p;
0508     struct ipoib_dev_priv *priv;
0509 
0510     switch (event->event) {
0511     case IB_CM_REQ_RECEIVED:
0512         return ipoib_cm_req_handler(cm_id, event);
0513     case IB_CM_DREQ_RECEIVED:
0514         ib_send_cm_drep(cm_id, NULL, 0);
0515         fallthrough;
0516     case IB_CM_REJ_RECEIVED:
0517         p = cm_id->context;
0518         priv = ipoib_priv(p->dev);
0519         if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE))
0520             ipoib_warn(priv, "unable to move qp to error state\n");
0521         fallthrough;
0522     default:
0523         return 0;
0524     }
0525 }
0526 /* Adjust length of skb with fragments to match received data */
0527 static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space,
0528               unsigned int length, struct sk_buff *toskb)
0529 {
0530     int i, num_frags;
0531     unsigned int size;
0532 
0533     /* put header into skb */
0534     size = min(length, hdr_space);
0535     skb->tail += size;
0536     skb->len += size;
0537     length -= size;
0538 
0539     num_frags = skb_shinfo(skb)->nr_frags;
0540     for (i = 0; i < num_frags; i++) {
0541         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
0542 
0543         if (length == 0) {
0544             /* don't need this page */
0545             skb_fill_page_desc(toskb, i, skb_frag_page(frag),
0546                        0, PAGE_SIZE);
0547             --skb_shinfo(skb)->nr_frags;
0548         } else {
0549             size = min_t(unsigned int, length, PAGE_SIZE);
0550 
0551             skb_frag_size_set(frag, size);
0552             skb->data_len += size;
0553             skb->truesize += size;
0554             skb->len += size;
0555             length -= size;
0556         }
0557     }
0558 }
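/*
 * Note that skb_put_frags() copies no data: it only grows the linear
 * head by up to hdr_space bytes and sets each page fragment's size to
 * match the bytes actually received.  Pages that turn out not to be
 * needed for this packet are transplanted into toskb (the replacement
 * RX buffer) rather than freed, so they can be reused.
 */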
0559 
0560 void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
0561 {
0562     struct ipoib_dev_priv *priv = ipoib_priv(dev);
0563     struct ipoib_cm_rx_buf *rx_ring;
0564     unsigned int wr_id = wc->wr_id & ~(IPOIB_OP_CM | IPOIB_OP_RECV);
0565     struct sk_buff *skb, *newskb;
0566     struct ipoib_cm_rx *p;
0567     unsigned long flags;
0568     u64 mapping[IPOIB_CM_RX_SG];
0569     int frags;
0570     int has_srq;
0571     struct sk_buff *small_skb;
0572 
0573     ipoib_dbg_data(priv, "cm recv completion: id %d, status: %d\n",
0574                wr_id, wc->status);
0575 
0576     if (unlikely(wr_id >= ipoib_recvq_size)) {
0577         if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~(IPOIB_OP_CM | IPOIB_OP_RECV))) {
0578             spin_lock_irqsave(&priv->lock, flags);
0579             list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list);
0580             ipoib_cm_start_rx_drain(priv);
0581             queue_work(priv->wq, &priv->cm.rx_reap_task);
0582             spin_unlock_irqrestore(&priv->lock, flags);
0583         } else
0584             ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
0585                    wr_id, ipoib_recvq_size);
0586         return;
0587     }
0588 
0589     p = wc->qp->qp_context;
0590 
0591     has_srq = ipoib_cm_has_srq(dev);
0592     rx_ring = has_srq ? priv->cm.srq_ring : p->rx_ring;
0593 
0594     skb = rx_ring[wr_id].skb;
0595 
0596     if (unlikely(wc->status != IB_WC_SUCCESS)) {
0597         ipoib_dbg(priv,
0598               "cm recv error (status=%d, wrid=%d vend_err %#x)\n",
0599               wc->status, wr_id, wc->vendor_err);
0600         ++dev->stats.rx_dropped;
0601         if (has_srq)
0602             goto repost;
0603         else {
0604             if (!--p->recv_count) {
0605                 spin_lock_irqsave(&priv->lock, flags);
0606                 list_move(&p->list, &priv->cm.rx_reap_list);
0607                 spin_unlock_irqrestore(&priv->lock, flags);
0608                 queue_work(priv->wq, &priv->cm.rx_reap_task);
0609             }
0610             return;
0611         }
0612     }
0613 
0614     if (unlikely(!(wr_id & IPOIB_CM_RX_UPDATE_MASK))) {
0615         if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) {
0616             spin_lock_irqsave(&priv->lock, flags);
0617             p->jiffies = jiffies;
0618             /* Move this entry to the list head, but do not re-add it
0619              * if it has already been moved off the list. */
0620             if (p->state == IPOIB_CM_RX_LIVE)
0621                 list_move(&p->list, &priv->cm.passive_ids);
0622             spin_unlock_irqrestore(&priv->lock, flags);
0623         }
0624     }
0625 
0626     if (wc->byte_len < IPOIB_CM_COPYBREAK) {
0627         int dlen = wc->byte_len;
0628 
0629         small_skb = dev_alloc_skb(dlen + IPOIB_CM_RX_RESERVE);
0630         if (small_skb) {
0631             skb_reserve(small_skb, IPOIB_CM_RX_RESERVE);
0632             ib_dma_sync_single_for_cpu(priv->ca, rx_ring[wr_id].mapping[0],
0633                            dlen, DMA_FROM_DEVICE);
0634             skb_copy_from_linear_data(skb, small_skb->data, dlen);
0635             ib_dma_sync_single_for_device(priv->ca, rx_ring[wr_id].mapping[0],
0636                               dlen, DMA_FROM_DEVICE);
0637             skb_put(small_skb, dlen);
0638             skb = small_skb;
0639             goto copied;
0640         }
0641     }
0642 
0643     frags = PAGE_ALIGN(wc->byte_len -
0644                min_t(u32, wc->byte_len, IPOIB_CM_HEAD_SIZE)) /
0645         PAGE_SIZE;
0646 
0647     newskb = ipoib_cm_alloc_rx_skb(dev, rx_ring, wr_id, frags,
0648                        mapping, GFP_ATOMIC);
0649     if (unlikely(!newskb)) {
0650         /*
0651          * If we can't allocate a new RX buffer, dump
0652          * this packet and reuse the old buffer.
0653          */
0654         ipoib_dbg(priv, "failed to allocate receive buffer %d\n", wr_id);
0655         ++dev->stats.rx_dropped;
0656         goto repost;
0657     }
0658 
0659     ipoib_cm_dma_unmap_rx(priv, frags, rx_ring[wr_id].mapping);
0660     memcpy(rx_ring[wr_id].mapping, mapping, (frags + 1) * sizeof(*mapping));
0661 
0662     ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
0663                wc->byte_len, wc->slid);
0664 
0665     skb_put_frags(skb, IPOIB_CM_HEAD_SIZE, wc->byte_len, newskb);
0666 
0667 copied:
0668     skb->protocol = ((struct ipoib_header *) skb->data)->proto;
0669     skb_add_pseudo_hdr(skb);
0670 
0671     ++dev->stats.rx_packets;
0672     dev->stats.rx_bytes += skb->len;
0673 
0674     skb->dev = dev;
0675     /* XXX get correct PACKET_ type here */
0676     skb->pkt_type = PACKET_HOST;
0677     netif_receive_skb(skb);
0678 
0679 repost:
0680     if (has_srq) {
0681         if (unlikely(ipoib_cm_post_receive_srq(dev, wr_id)))
0682             ipoib_warn(priv, "ipoib_cm_post_receive_srq failed "
0683                    "for buf %d\n", wr_id);
0684     } else {
0685         if (unlikely(ipoib_cm_post_receive_nonsrq(dev, p,
0686                               &priv->cm.rx_wr,
0687                               priv->cm.rx_sge,
0688                               wr_id))) {
0689             --p->recv_count;
0690             ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq failed "
0691                    "for buf %d\n", wr_id);
0692         }
0693     }
0694 }
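/*
 * Two receive paths are used above: packets shorter than
 * IPOIB_CM_COPYBREAK are copied into a small freshly allocated skb so
 * that the large multi-page RX buffer can be reposted untouched, while
 * larger packets are passed up in the original buffer and a new buffer
 * (allocated with GFP_ATOMIC) takes its slot in the ring.  Either way
 * the slot is reposted to the SRQ or to the per-QP receive ring,
 * matching how it was originally posted.
 */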
0695 
0696 static inline int post_send(struct ipoib_dev_priv *priv,
0697                 struct ipoib_cm_tx *tx,
0698                 unsigned int wr_id,
0699                 struct ipoib_tx_buf *tx_req)
0700 {
0701     ipoib_build_sge(priv, tx_req);
0702 
0703     priv->tx_wr.wr.wr_id    = wr_id | IPOIB_OP_CM;
0704 
0705     return ib_post_send(tx->qp, &priv->tx_wr.wr, NULL);
0706 }
0707 
0708 void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx)
0709 {
0710     struct ipoib_dev_priv *priv = ipoib_priv(dev);
0711     struct ipoib_tx_buf *tx_req;
0712     int rc;
0713     unsigned int usable_sge = tx->max_send_sge - !!skb_headlen(skb);
0714 
0715     if (unlikely(skb->len > tx->mtu)) {
0716         ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
0717                skb->len, tx->mtu);
0718         ++dev->stats.tx_dropped;
0719         ++dev->stats.tx_errors;
0720         ipoib_cm_skb_too_long(dev, skb, tx->mtu - IPOIB_ENCAP_LEN);
0721         return;
0722     }
0723     if (skb_shinfo(skb)->nr_frags > usable_sge) {
0724         if (skb_linearize(skb) < 0) {
0725             ipoib_warn(priv, "skb could not be linearized\n");
0726             ++dev->stats.tx_dropped;
0727             ++dev->stats.tx_errors;
0728             dev_kfree_skb_any(skb);
0729             return;
0730         }
0731         /* Does skb_linearize return ok without reducing nr_frags? */
0732         if (skb_shinfo(skb)->nr_frags > usable_sge) {
0733             ipoib_warn(priv, "too many frags after skb linearize\n");
0734             ++dev->stats.tx_dropped;
0735             ++dev->stats.tx_errors;
0736             dev_kfree_skb_any(skb);
0737             return;
0738         }
0739     }
0740     ipoib_dbg_data(priv, "sending packet: head 0x%x length %d connection 0x%x\n",
0741                tx->tx_head, skb->len, tx->qp->qp_num);
0742 
0743     /*
0744      * We put the skb into the tx_ring _before_ we call post_send()
0745      * because it's entirely possible that the completion handler will
0746      * run before we execute anything after the post_send().  That
0747      * means we have to make sure everything is properly recorded and
0748      * our state is consistent before we call post_send().
0749      */
0750     tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)];
0751     tx_req->skb = skb;
0752 
0753     if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) {
0754         ++dev->stats.tx_errors;
0755         dev_kfree_skb_any(skb);
0756         return;
0757     }
0758 
0759     if ((priv->global_tx_head - priv->global_tx_tail) ==
0760         ipoib_sendq_size - 1) {
0761         ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n",
0762               tx->qp->qp_num);
0763         netif_stop_queue(dev);
0764     }
0765 
0766     skb_orphan(skb);
0767     skb_dst_drop(skb);
0768 
0769     if (netif_queue_stopped(dev)) {
0770         rc = ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP |
0771                       IB_CQ_REPORT_MISSED_EVENTS);
0772         if (unlikely(rc < 0))
0773             ipoib_warn(priv, "IPoIB/CM:request notify on send CQ failed\n");
0774         else if (rc)
0775             napi_schedule(&priv->send_napi);
0776     }
0777 
0778     rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1), tx_req);
0779     if (unlikely(rc)) {
0780         ipoib_warn(priv, "IPoIB/CM:post_send failed, error %d\n", rc);
0781         ++dev->stats.tx_errors;
0782         ipoib_dma_unmap_tx(priv, tx_req);
0783         dev_kfree_skb_any(skb);
0784 
0785         if (netif_queue_stopped(dev))
0786             netif_wake_queue(dev);
0787     } else {
0788         netif_trans_update(dev);
0789         ++tx->tx_head;
0790         ++priv->global_tx_head;
0791     }
0792 }
0793 
0794 void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
0795 {
0796     struct ipoib_dev_priv *priv = ipoib_priv(dev);
0797     struct ipoib_cm_tx *tx = wc->qp->qp_context;
0798     unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM;
0799     struct ipoib_tx_buf *tx_req;
0800     unsigned long flags;
0801 
0802     ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n",
0803                wr_id, wc->status);
0804 
0805     if (unlikely(wr_id >= ipoib_sendq_size)) {
0806         ipoib_warn(priv, "cm send completion event with wrid %d (> %d)\n",
0807                wr_id, ipoib_sendq_size);
0808         return;
0809     }
0810 
0811     tx_req = &tx->tx_ring[wr_id];
0812 
0813     ipoib_dma_unmap_tx(priv, tx_req);
0814 
0815     /* FIXME: is this right? Shouldn't we only increment on success? */
0816     ++dev->stats.tx_packets;
0817     dev->stats.tx_bytes += tx_req->skb->len;
0818 
0819     dev_kfree_skb_any(tx_req->skb);
0820 
0821     netif_tx_lock(dev);
0822 
0823     ++tx->tx_tail;
0824     ++priv->global_tx_tail;
0825 
0826     if (unlikely(netif_queue_stopped(dev) &&
0827              ((priv->global_tx_head - priv->global_tx_tail) <=
0828               ipoib_sendq_size >> 1) &&
0829              test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)))
0830         netif_wake_queue(dev);
0831 
0832     if (wc->status != IB_WC_SUCCESS &&
0833         wc->status != IB_WC_WR_FLUSH_ERR) {
0834         struct ipoib_neigh *neigh;
0835 
0836         /* IB_WC[_RNR]_RETRY_EXC_ERR error is part of the life cycle,
0837          * so don't make waves.
0838          */
0839         if (wc->status == IB_WC_RNR_RETRY_EXC_ERR ||
0840             wc->status == IB_WC_RETRY_EXC_ERR)
0841             ipoib_dbg(priv,
0842                   "%s: failed cm send event (status=%d, wrid=%d vend_err %#x)\n",
0843                    __func__, wc->status, wr_id, wc->vendor_err);
0844         else
0845             ipoib_warn(priv,
0846                     "%s: failed cm send event (status=%d, wrid=%d vend_err %#x)\n",
0847                    __func__, wc->status, wr_id, wc->vendor_err);
0848 
0849         spin_lock_irqsave(&priv->lock, flags);
0850         neigh = tx->neigh;
0851 
0852         if (neigh) {
0853             neigh->cm = NULL;
0854             ipoib_neigh_free(neigh);
0855 
0856             tx->neigh = NULL;
0857         }
0858 
0859         if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
0860             list_move(&tx->list, &priv->cm.reap_list);
0861             queue_work(priv->wq, &priv->cm.reap_task);
0862         }
0863 
0864         clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags);
0865 
0866         spin_unlock_irqrestore(&priv->lock, flags);
0867     }
0868 
0869     netif_tx_unlock(dev);
0870 }
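/*
 * TX ring accounting: ipoib_cm_send() advances tx->tx_head and
 * priv->global_tx_head for every successfully posted WR and stops the
 * netif queue when the global ring is one entry short of full; the
 * completion handler above advances the tail counters and wakes the
 * queue again once the ring is at most half full and the interface is
 * administratively up.
 */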
0871 
0872 int ipoib_cm_dev_open(struct net_device *dev)
0873 {
0874     struct ipoib_dev_priv *priv = ipoib_priv(dev);
0875     int ret;
0876 
0877     if (!IPOIB_CM_SUPPORTED(dev->dev_addr))
0878         return 0;
0879 
0880     priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, dev);
0881     if (IS_ERR(priv->cm.id)) {
0882         pr_warn("%s: failed to create CM ID\n", priv->ca->name);
0883         ret = PTR_ERR(priv->cm.id);
0884         goto err_cm;
0885     }
0886 
0887     ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num),
0888                0);
0889     if (ret) {
0890         pr_warn("%s: failed to listen on ID 0x%llx\n", priv->ca->name,
0891             IPOIB_CM_IETF_ID | priv->qp->qp_num);
0892         goto err_listen;
0893     }
0894 
0895     return 0;
0896 
0897 err_listen:
0898     ib_destroy_cm_id(priv->cm.id);
0899 err_cm:
0900     priv->cm.id = NULL;
0901     return ret;
0902 }
0903 
0904 static void ipoib_cm_free_rx_reap_list(struct net_device *dev)
0905 {
0906     struct ipoib_dev_priv *priv = ipoib_priv(dev);
0907     struct ipoib_cm_rx *rx, *n;
0908     LIST_HEAD(list);
0909 
0910     spin_lock_irq(&priv->lock);
0911     list_splice_init(&priv->cm.rx_reap_list, &list);
0912     spin_unlock_irq(&priv->lock);
0913 
0914     list_for_each_entry_safe(rx, n, &list, list) {
0915         ib_destroy_cm_id(rx->id);
0916         ib_destroy_qp(rx->qp);
0917         if (!ipoib_cm_has_srq(dev)) {
0918             ipoib_cm_free_rx_ring(priv->dev, rx->rx_ring);
0919             spin_lock_irq(&priv->lock);
0920             --priv->cm.nonsrq_conn_qp;
0921             spin_unlock_irq(&priv->lock);
0922         }
0923         kfree(rx);
0924     }
0925 }
0926 
0927 void ipoib_cm_dev_stop(struct net_device *dev)
0928 {
0929     struct ipoib_dev_priv *priv = ipoib_priv(dev);
0930     struct ipoib_cm_rx *p;
0931     unsigned long begin;
0932     int ret;
0933 
0934     if (!IPOIB_CM_SUPPORTED(dev->dev_addr) || !priv->cm.id)
0935         return;
0936 
0937     ib_destroy_cm_id(priv->cm.id);
0938     priv->cm.id = NULL;
0939 
0940     spin_lock_irq(&priv->lock);
0941     while (!list_empty(&priv->cm.passive_ids)) {
0942         p = list_entry(priv->cm.passive_ids.next, typeof(*p), list);
0943         list_move(&p->list, &priv->cm.rx_error_list);
0944         p->state = IPOIB_CM_RX_ERROR;
0945         spin_unlock_irq(&priv->lock);
0946         ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
0947         if (ret)
0948             ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
0949         spin_lock_irq(&priv->lock);
0950     }
0951 
0952     /* Wait for all RX to be drained */
0953     begin = jiffies;
0954 
0955     while (!list_empty(&priv->cm.rx_error_list) ||
0956            !list_empty(&priv->cm.rx_flush_list) ||
0957            !list_empty(&priv->cm.rx_drain_list)) {
0958         if (time_after(jiffies, begin + 5 * HZ)) {
0959             ipoib_warn(priv, "RX drain timing out\n");
0960 
0961             /*
0962              * assume the HW is wedged and just free up everything.
0963              */
0964             list_splice_init(&priv->cm.rx_flush_list,
0965                      &priv->cm.rx_reap_list);
0966             list_splice_init(&priv->cm.rx_error_list,
0967                      &priv->cm.rx_reap_list);
0968             list_splice_init(&priv->cm.rx_drain_list,
0969                      &priv->cm.rx_reap_list);
0970             break;
0971         }
0972         spin_unlock_irq(&priv->lock);
0973         usleep_range(1000, 2000);
0974         ipoib_drain_cq(dev);
0975         spin_lock_irq(&priv->lock);
0976     }
0977 
0978     spin_unlock_irq(&priv->lock);
0979 
0980     ipoib_cm_free_rx_reap_list(dev);
0981 
0982     cancel_delayed_work(&priv->cm.stale_task);
0983 }
0984 
0985 static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id,
0986                 const struct ib_cm_event *event)
0987 {
0988     struct ipoib_cm_tx *p = cm_id->context;
0989     struct ipoib_dev_priv *priv = ipoib_priv(p->dev);
0990     struct ipoib_cm_data *data = event->private_data;
0991     struct sk_buff_head skqueue;
0992     struct ib_qp_attr qp_attr;
0993     int qp_attr_mask, ret;
0994     struct sk_buff *skb;
0995 
0996     p->mtu = be32_to_cpu(data->mtu);
0997 
0998     if (p->mtu <= IPOIB_ENCAP_LEN) {
0999         ipoib_warn(priv, "Rejecting connection: mtu %d <= %d\n",
1000                p->mtu, IPOIB_ENCAP_LEN);
1001         return -EINVAL;
1002     }
1003 
1004     qp_attr.qp_state = IB_QPS_RTR;
1005     ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
1006     if (ret) {
1007         ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret);
1008         return ret;
1009     }
1010 
1011     qp_attr.rq_psn = 0 /* FIXME */;
1012     ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask);
1013     if (ret) {
1014         ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
1015         return ret;
1016     }
1017 
1018     qp_attr.qp_state = IB_QPS_RTS;
1019     ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
1020     if (ret) {
1021         ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
1022         return ret;
1023     }
1024     ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask);
1025     if (ret) {
1026         ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
1027         return ret;
1028     }
1029 
1030     skb_queue_head_init(&skqueue);
1031 
1032     netif_tx_lock_bh(p->dev);
1033     spin_lock_irq(&priv->lock);
1034     set_bit(IPOIB_FLAG_OPER_UP, &p->flags);
1035     if (p->neigh)
1036         while ((skb = __skb_dequeue(&p->neigh->queue)))
1037             __skb_queue_tail(&skqueue, skb);
1038     spin_unlock_irq(&priv->lock);
1039     netif_tx_unlock_bh(p->dev);
1040 
1041     while ((skb = __skb_dequeue(&skqueue))) {
1042         skb->dev = p->dev;
1043         ret = dev_queue_xmit(skb);
1044         if (ret)
1045             ipoib_warn(priv, "%s:dev_queue_xmit failed to re-queue packet, ret:%d\n",
1046                    __func__, ret);
1047     }
1048 
1049     ret = ib_send_cm_rtu(cm_id, NULL, 0);
1050     if (ret) {
1051         ipoib_warn(priv, "failed to send RTU: %d\n", ret);
1052         return ret;
1053     }
1054     return 0;
1055 }
1056 
1057 static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_cm_tx *tx)
1058 {
1059     struct ipoib_dev_priv *priv = ipoib_priv(dev);
1060     struct ib_qp_init_attr attr = {
1061         .send_cq        = priv->send_cq,
1062         .recv_cq        = priv->recv_cq,
1063         .srq            = priv->cm.srq,
1064         .cap.max_send_wr    = ipoib_sendq_size,
1065         .cap.max_send_sge   = 1,
1066         .sq_sig_type        = IB_SIGNAL_ALL_WR,
1067         .qp_type        = IB_QPT_RC,
1068         .qp_context     = tx,
1069         .create_flags       = 0
1070     };
1071     struct ib_qp *tx_qp;
1072 
1073     if (dev->features & NETIF_F_SG)
1074         attr.cap.max_send_sge = min_t(u32, priv->ca->attrs.max_send_sge,
1075                           MAX_SKB_FRAGS + 1);
1076 
1077     tx_qp = ib_create_qp(priv->pd, &attr);
1078     tx->max_send_sge = attr.cap.max_send_sge;
1079     return tx_qp;
1080 }
1081 
1082 static int ipoib_cm_send_req(struct net_device *dev,
1083                  struct ib_cm_id *id, struct ib_qp *qp,
1084                  u32 qpn,
1085                  struct sa_path_rec *pathrec)
1086 {
1087     struct ipoib_dev_priv *priv = ipoib_priv(dev);
1088     struct ipoib_cm_data data = {};
1089     struct ib_cm_req_param req = {};
1090 
1091     data.qpn = cpu_to_be32(priv->qp->qp_num);
1092     data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE);
1093 
1094     req.primary_path        = pathrec;
1095     req.alternate_path      = NULL;
1096     req.service_id          = cpu_to_be64(IPOIB_CM_IETF_ID | qpn);
1097     req.qp_num          = qp->qp_num;
1098     req.qp_type         = qp->qp_type;
1099     req.private_data        = &data;
1100     req.private_data_len        = sizeof(data);
1101     req.flow_control        = 0;
1102 
1103     req.starting_psn        = 0; /* FIXME */
1104 
1105     /*
1106      * Pick some arbitrary defaults here; we could make these
1107      * module parameters if anyone cared about setting them.
1108      */
1109     req.responder_resources     = 4;
1110     req.remote_cm_response_timeout  = 20;
1111     req.local_cm_response_timeout   = 20;
1112     req.retry_count         = 0; /* RFC draft warns against retries */
1113     req.rnr_retry_count     = 0; /* RFC draft warns against retries */
1114     req.max_cm_retries      = 15;
1115     req.srq             = ipoib_cm_has_srq(dev);
1116     return ib_send_cm_req(id, &req);
1117 }
1118 
1119 static int ipoib_cm_modify_tx_init(struct net_device *dev,
1120                   struct ib_cm_id *cm_id, struct ib_qp *qp)
1121 {
1122     struct ipoib_dev_priv *priv = ipoib_priv(dev);
1123     struct ib_qp_attr qp_attr;
1124     int qp_attr_mask, ret;
1125 
1126     qp_attr.pkey_index = priv->pkey_index;
1127     qp_attr.qp_state = IB_QPS_INIT;
1128     qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE;
1129     qp_attr.port_num = priv->port;
1130     qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT;
1131 
1132     ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
1133     if (ret) {
1134         ipoib_warn(priv, "failed to modify tx QP to INIT: %d\n", ret);
1135         return ret;
1136     }
1137     return 0;
1138 }
1139 
1140 static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn,
1141                 struct sa_path_rec *pathrec)
1142 {
1143     struct ipoib_dev_priv *priv = ipoib_priv(p->dev);
1144     unsigned int noio_flag;
1145     int ret;
1146 
1147     noio_flag = memalloc_noio_save();
1148     p->tx_ring = vzalloc(array_size(ipoib_sendq_size, sizeof(*p->tx_ring)));
1149     if (!p->tx_ring) {
1150         memalloc_noio_restore(noio_flag);
1151         ret = -ENOMEM;
1152         goto err_tx;
1153     }
1154 
1155     p->qp = ipoib_cm_create_tx_qp(p->dev, p);
1156     memalloc_noio_restore(noio_flag);
1157     if (IS_ERR(p->qp)) {
1158         ret = PTR_ERR(p->qp);
1159         ipoib_warn(priv, "failed to create tx qp: %d\n", ret);
1160         goto err_qp;
1161     }
1162 
1163     p->id = ib_create_cm_id(priv->ca, ipoib_cm_tx_handler, p);
1164     if (IS_ERR(p->id)) {
1165         ret = PTR_ERR(p->id);
1166         ipoib_warn(priv, "failed to create tx cm id: %d\n", ret);
1167         goto err_id;
1168     }
1169 
1170     ret = ipoib_cm_modify_tx_init(p->dev, p->id,  p->qp);
1171     if (ret) {
1172         ipoib_warn(priv, "failed to modify tx qp to rtr: %d\n", ret);
1173         goto err_modify_send;
1174     }
1175 
1176     ret = ipoib_cm_send_req(p->dev, p->id, p->qp, qpn, pathrec);
1177     if (ret) {
1178         ipoib_warn(priv, "failed to send cm req: %d\n", ret);
1179         goto err_modify_send;
1180     }
1181 
1182     ipoib_dbg(priv, "Request connection 0x%x for gid %pI6 qpn 0x%x\n",
1183           p->qp->qp_num, pathrec->dgid.raw, qpn);
1184 
1185     return 0;
1186 
1187 err_modify_send:
1188     ib_destroy_cm_id(p->id);
1189 err_id:
1190     p->id = NULL;
1191     ib_destroy_qp(p->qp);
1192 err_qp:
1193     p->qp = NULL;
1194     vfree(p->tx_ring);
1195 err_tx:
1196     return ret;
1197 }
1198 
1199 static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
1200 {
1201     struct ipoib_dev_priv *priv = ipoib_priv(p->dev);
1202     struct ipoib_tx_buf *tx_req;
1203     unsigned long begin;
1204 
1205     ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n",
1206           p->qp ? p->qp->qp_num : 0, p->tx_head, p->tx_tail);
1207 
1208     if (p->id)
1209         ib_destroy_cm_id(p->id);
1210 
1211     if (p->tx_ring) {
1212         /* Wait for all sends to complete */
1213         begin = jiffies;
1214         while ((int) p->tx_tail - (int) p->tx_head < 0) {
1215             if (time_after(jiffies, begin + 5 * HZ)) {
1216                 ipoib_warn(priv, "timing out; %d sends not completed\n",
1217                        p->tx_head - p->tx_tail);
1218                 goto timeout;
1219             }
1220 
1221             usleep_range(1000, 2000);
1222         }
1223     }
1224 
1225 timeout:
1226 
1227     while ((int) p->tx_tail - (int) p->tx_head < 0) {
1228         tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
1229         ipoib_dma_unmap_tx(priv, tx_req);
1230         dev_kfree_skb_any(tx_req->skb);
1231         netif_tx_lock_bh(p->dev);
1232         ++p->tx_tail;
1233         ++priv->global_tx_tail;
1234         if (unlikely((priv->global_tx_head - priv->global_tx_tail) <=
1235                  ipoib_sendq_size >> 1) &&
1236             netif_queue_stopped(p->dev) &&
1237             test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
1238             netif_wake_queue(p->dev);
1239         netif_tx_unlock_bh(p->dev);
1240     }
1241 
1242     if (p->qp)
1243         ib_destroy_qp(p->qp);
1244 
1245     vfree(p->tx_ring);
1246     kfree(p);
1247 }
1248 
1249 static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
1250                    const struct ib_cm_event *event)
1251 {
1252     struct ipoib_cm_tx *tx = cm_id->context;
1253     struct ipoib_dev_priv *priv = ipoib_priv(tx->dev);
1254     struct net_device *dev = priv->dev;
1255     struct ipoib_neigh *neigh;
1256     unsigned long flags;
1257     int ret;
1258 
1259     switch (event->event) {
1260     case IB_CM_DREQ_RECEIVED:
1261         ipoib_dbg(priv, "DREQ received.\n");
1262         ib_send_cm_drep(cm_id, NULL, 0);
1263         break;
1264     case IB_CM_REP_RECEIVED:
1265         ipoib_dbg(priv, "REP received.\n");
1266         ret = ipoib_cm_rep_handler(cm_id, event);
1267         if (ret)
1268             ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED,
1269                        NULL, 0, NULL, 0);
1270         break;
1271     case IB_CM_REQ_ERROR:
1272     case IB_CM_REJ_RECEIVED:
1273     case IB_CM_TIMEWAIT_EXIT:
1274         ipoib_dbg(priv, "CM error %d.\n", event->event);
1275         netif_tx_lock_bh(dev);
1276         spin_lock_irqsave(&priv->lock, flags);
1277         neigh = tx->neigh;
1278 
1279         if (neigh) {
1280             neigh->cm = NULL;
1281             ipoib_neigh_free(neigh);
1282 
1283             tx->neigh = NULL;
1284         }
1285 
1286         if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
1287             list_move(&tx->list, &priv->cm.reap_list);
1288             queue_work(priv->wq, &priv->cm.reap_task);
1289         }
1290 
1291         spin_unlock_irqrestore(&priv->lock, flags);
1292         netif_tx_unlock_bh(dev);
1293         break;
1294     default:
1295         break;
1296     }
1297 
1298     return 0;
1299 }
1300 
1301 struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path,
1302                        struct ipoib_neigh *neigh)
1303 {
1304     struct ipoib_dev_priv *priv = ipoib_priv(dev);
1305     struct ipoib_cm_tx *tx;
1306 
1307     tx = kzalloc(sizeof(*tx), GFP_ATOMIC);
1308     if (!tx)
1309         return NULL;
1310 
1311     neigh->cm = tx;
1312     tx->neigh = neigh;
1313     tx->dev = dev;
1314     list_add(&tx->list, &priv->cm.start_list);
1315     set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags);
1316     queue_work(priv->wq, &priv->cm.start_task);
1317     return tx;
1318 }
1319 
1320 void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx)
1321 {
1322     struct ipoib_dev_priv *priv = ipoib_priv(tx->dev);
1323     unsigned long flags;
1324     if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
1325         spin_lock_irqsave(&priv->lock, flags);
1326         list_move(&tx->list, &priv->cm.reap_list);
1327         queue_work(priv->wq, &priv->cm.reap_task);
1328         ipoib_dbg(priv, "Reap connection for gid %pI6\n",
1329               tx->neigh->daddr + 4);
1330         tx->neigh = NULL;
1331         spin_unlock_irqrestore(&priv->lock, flags);
1332     }
1333 }
1334 
1335 #define QPN_AND_OPTIONS_OFFSET  4
1336 
1337 static void ipoib_cm_tx_start(struct work_struct *work)
1338 {
1339     struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
1340                            cm.start_task);
1341     struct net_device *dev = priv->dev;
1342     struct ipoib_neigh *neigh;
1343     struct ipoib_cm_tx *p;
1344     unsigned long flags;
1345     struct ipoib_path *path;
1346     int ret;
1347 
1348     struct sa_path_rec pathrec;
1349     u32 qpn;
1350 
1351     netif_tx_lock_bh(dev);
1352     spin_lock_irqsave(&priv->lock, flags);
1353 
1354     while (!list_empty(&priv->cm.start_list)) {
1355         p = list_entry(priv->cm.start_list.next, typeof(*p), list);
1356         list_del_init(&p->list);
1357         neigh = p->neigh;
1358 
1359         qpn = IPOIB_QPN(neigh->daddr);
1360         /*
1361          * As long as the search is with these 2 locks,
1362          * path existence indicates its validity.
1363          */
1364         path = __path_find(dev, neigh->daddr + QPN_AND_OPTIONS_OFFSET);
1365         if (!path) {
1366             pr_info("%s ignore not valid path %pI6\n",
1367                 __func__,
1368                 neigh->daddr + QPN_AND_OPTIONS_OFFSET);
1369             goto free_neigh;
1370         }
1371         memcpy(&pathrec, &path->pathrec, sizeof(pathrec));
1372 
1373         spin_unlock_irqrestore(&priv->lock, flags);
1374         netif_tx_unlock_bh(dev);
1375 
1376         ret = ipoib_cm_tx_init(p, qpn, &pathrec);
1377 
1378         netif_tx_lock_bh(dev);
1379         spin_lock_irqsave(&priv->lock, flags);
1380 
1381         if (ret) {
1382 free_neigh:
1383             neigh = p->neigh;
1384             if (neigh) {
1385                 neigh->cm = NULL;
1386                 ipoib_neigh_free(neigh);
1387             }
1388             list_del(&p->list);
1389             kfree(p);
1390         }
1391     }
1392 
1393     spin_unlock_irqrestore(&priv->lock, flags);
1394     netif_tx_unlock_bh(dev);
1395 }
1396 
1397 static void ipoib_cm_tx_reap(struct work_struct *work)
1398 {
1399     struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
1400                            cm.reap_task);
1401     struct net_device *dev = priv->dev;
1402     struct ipoib_cm_tx *p;
1403     unsigned long flags;
1404 
1405     netif_tx_lock_bh(dev);
1406     spin_lock_irqsave(&priv->lock, flags);
1407 
1408     while (!list_empty(&priv->cm.reap_list)) {
1409         p = list_entry(priv->cm.reap_list.next, typeof(*p), list);
1410         list_del_init(&p->list);
1411         spin_unlock_irqrestore(&priv->lock, flags);
1412         netif_tx_unlock_bh(dev);
1413         ipoib_cm_tx_destroy(p);
1414         netif_tx_lock_bh(dev);
1415         spin_lock_irqsave(&priv->lock, flags);
1416     }
1417 
1418     spin_unlock_irqrestore(&priv->lock, flags);
1419     netif_tx_unlock_bh(dev);
1420 }
1421 
1422 static void ipoib_cm_skb_reap(struct work_struct *work)
1423 {
1424     struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
1425                            cm.skb_task);
1426     struct net_device *dev = priv->dev;
1427     struct sk_buff *skb;
1428     unsigned long flags;
1429     unsigned int mtu = priv->mcast_mtu;
1430 
1431     netif_tx_lock_bh(dev);
1432     spin_lock_irqsave(&priv->lock, flags);
1433 
1434     while ((skb = skb_dequeue(&priv->cm.skb_queue))) {
1435         spin_unlock_irqrestore(&priv->lock, flags);
1436         netif_tx_unlock_bh(dev);
1437 
1438         if (skb->protocol == htons(ETH_P_IP)) {
1439             memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
1440             icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
1441         }
1442 #if IS_ENABLED(CONFIG_IPV6)
1443         else if (skb->protocol == htons(ETH_P_IPV6)) {
1444             memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
1445             icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1446         }
1447 #endif
1448         dev_kfree_skb_any(skb);
1449 
1450         netif_tx_lock_bh(dev);
1451         spin_lock_irqsave(&priv->lock, flags);
1452     }
1453 
1454     spin_unlock_irqrestore(&priv->lock, flags);
1455     netif_tx_unlock_bh(dev);
1456 }
1457 
1458 void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb,
1459                unsigned int mtu)
1460 {
1461     struct ipoib_dev_priv *priv = ipoib_priv(dev);
1462     int e = skb_queue_empty(&priv->cm.skb_queue);
1463 
1464     skb_dst_update_pmtu(skb, mtu);
1465 
1466     skb_queue_tail(&priv->cm.skb_queue, skb);
1467     if (e)
1468         queue_work(priv->wq, &priv->cm.skb_task);
1469 }
1470 
1471 static void ipoib_cm_rx_reap(struct work_struct *work)
1472 {
1473     ipoib_cm_free_rx_reap_list(container_of(work, struct ipoib_dev_priv,
1474                         cm.rx_reap_task)->dev);
1475 }
1476 
1477 static void ipoib_cm_stale_task(struct work_struct *work)
1478 {
1479     struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
1480                            cm.stale_task.work);
1481     struct ipoib_cm_rx *p;
1482     int ret;
1483 
1484     spin_lock_irq(&priv->lock);
1485     while (!list_empty(&priv->cm.passive_ids)) {
1486         /* List is sorted by LRU, start from tail,
1487          * stop when we see a recently used entry */
1488         p = list_entry(priv->cm.passive_ids.prev, typeof(*p), list);
1489         if (time_before_eq(jiffies, p->jiffies + IPOIB_CM_RX_TIMEOUT))
1490             break;
1491         list_move(&p->list, &priv->cm.rx_error_list);
1492         p->state = IPOIB_CM_RX_ERROR;
1493         spin_unlock_irq(&priv->lock);
1494         ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
1495         if (ret)
1496             ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
1497         spin_lock_irq(&priv->lock);
1498     }
1499 
1500     if (!list_empty(&priv->cm.passive_ids))
1501         queue_delayed_work(priv->wq,
1502                    &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
1503     spin_unlock_irq(&priv->lock);
1504 }
1505 
1506 static ssize_t mode_show(struct device *d, struct device_attribute *attr,
1507              char *buf)
1508 {
1509     struct net_device *dev = to_net_dev(d);
1510     struct ipoib_dev_priv *priv = ipoib_priv(dev);
1511 
1512     if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
1513         return sysfs_emit(buf, "connected\n");
1514     else
1515         return sysfs_emit(buf, "datagram\n");
1516 }
1517 
1518 static ssize_t mode_store(struct device *d, struct device_attribute *attr,
1519               const char *buf, size_t count)
1520 {
1521     struct net_device *dev = to_net_dev(d);
1522     int ret;
1523 
1524     if (!rtnl_trylock()) {
1525         return restart_syscall();
1526     }
1527 
1528     if (dev->reg_state != NETREG_REGISTERED) {
1529         rtnl_unlock();
1530         return -EPERM;
1531     }
1532 
1533     ret = ipoib_set_mode(dev, buf);
1534 
1535     /* The assumption is that the function ipoib_set_mode returned
1536      * with the rtnl held by it, if not the value -EBUSY returned,
1537      * then no need to rtnl_unlock
1538      */
1539     if (ret != -EBUSY)
1540         rtnl_unlock();
1541 
1542     return (!ret || ret == -EBUSY) ? count : ret;
1543 }
1544 
1545 static DEVICE_ATTR_RW(mode);
1546 
1547 int ipoib_cm_add_mode_attr(struct net_device *dev)
1548 {
1549     return device_create_file(&dev->dev, &dev_attr_mode);
1550 }
1551 
1552 static void ipoib_cm_create_srq(struct net_device *dev, int max_sge)
1553 {
1554     struct ipoib_dev_priv *priv = ipoib_priv(dev);
1555     struct ib_srq_init_attr srq_init_attr = {
1556         .srq_type = IB_SRQT_BASIC,
1557         .attr = {
1558             .max_wr  = ipoib_recvq_size,
1559             .max_sge = max_sge
1560         }
1561     };
1562 
1563     priv->cm.srq = ib_create_srq(priv->pd, &srq_init_attr);
1564     if (IS_ERR(priv->cm.srq)) {
1565         if (PTR_ERR(priv->cm.srq) != -EOPNOTSUPP)
1566             pr_warn("%s: failed to allocate SRQ, error %ld\n",
1567                    priv->ca->name, PTR_ERR(priv->cm.srq));
1568         priv->cm.srq = NULL;
1569         return;
1570     }
1571 
1572     priv->cm.srq_ring = vzalloc(array_size(ipoib_recvq_size,
1573                            sizeof(*priv->cm.srq_ring)));
1574     if (!priv->cm.srq_ring) {
1575         ib_destroy_srq(priv->cm.srq);
1576         priv->cm.srq = NULL;
1577         return;
1578     }
1579 
1580 }
1581 
1582 int ipoib_cm_dev_init(struct net_device *dev)
1583 {
1584     struct ipoib_dev_priv *priv = ipoib_priv(dev);
1585     int max_srq_sge, i;
1586     u8 addr;
1587 
1588     INIT_LIST_HEAD(&priv->cm.passive_ids);
1589     INIT_LIST_HEAD(&priv->cm.reap_list);
1590     INIT_LIST_HEAD(&priv->cm.start_list);
1591     INIT_LIST_HEAD(&priv->cm.rx_error_list);
1592     INIT_LIST_HEAD(&priv->cm.rx_flush_list);
1593     INIT_LIST_HEAD(&priv->cm.rx_drain_list);
1594     INIT_LIST_HEAD(&priv->cm.rx_reap_list);
1595     INIT_WORK(&priv->cm.start_task, ipoib_cm_tx_start);
1596     INIT_WORK(&priv->cm.reap_task, ipoib_cm_tx_reap);
1597     INIT_WORK(&priv->cm.skb_task, ipoib_cm_skb_reap);
1598     INIT_WORK(&priv->cm.rx_reap_task, ipoib_cm_rx_reap);
1599     INIT_DELAYED_WORK(&priv->cm.stale_task, ipoib_cm_stale_task);
1600 
1601     skb_queue_head_init(&priv->cm.skb_queue);
1602 
1603     ipoib_dbg(priv, "max_srq_sge=%d\n", priv->ca->attrs.max_srq_sge);
1604 
1605     max_srq_sge = min_t(int, IPOIB_CM_RX_SG, priv->ca->attrs.max_srq_sge);
1606     ipoib_cm_create_srq(dev, max_srq_sge);
1607     if (ipoib_cm_has_srq(dev)) {
1608         priv->cm.max_cm_mtu = max_srq_sge * PAGE_SIZE - 0x10;
1609         priv->cm.num_frags  = max_srq_sge;
1610         ipoib_dbg(priv, "max_cm_mtu = 0x%x, num_frags=%d\n",
1611               priv->cm.max_cm_mtu, priv->cm.num_frags);
1612     } else {
1613         priv->cm.max_cm_mtu = IPOIB_CM_MTU;
1614         priv->cm.num_frags  = IPOIB_CM_RX_SG;
1615     }
1616 
1617     ipoib_cm_init_rx_wr(dev, &priv->cm.rx_wr, priv->cm.rx_sge);
1618 
1619     if (ipoib_cm_has_srq(dev)) {
1620         for (i = 0; i < ipoib_recvq_size; ++i) {
1621             if (!ipoib_cm_alloc_rx_skb(dev, priv->cm.srq_ring, i,
1622                            priv->cm.num_frags - 1,
1623                            priv->cm.srq_ring[i].mapping,
1624                            GFP_KERNEL)) {
1625                 ipoib_warn(priv, "failed to allocate "
1626                        "receive buffer %d\n", i);
1627                 ipoib_cm_dev_cleanup(dev);
1628                 return -ENOMEM;
1629             }
1630 
1631             if (ipoib_cm_post_receive_srq(dev, i)) {
1632                 ipoib_warn(priv, "ipoib_cm_post_receive_srq "
1633                        "failed for buf %d\n", i);
1634                 ipoib_cm_dev_cleanup(dev);
1635                 return -EIO;
1636             }
1637         }
1638     }
1639 
1640     addr = IPOIB_FLAGS_RC;
1641     dev_addr_mod(dev, 0, &addr, 1);
1642     return 0;
1643 }
1644 
1645 void ipoib_cm_dev_cleanup(struct net_device *dev)
1646 {
1647     struct ipoib_dev_priv *priv = ipoib_priv(dev);
1648 
1649     if (!priv->cm.srq)
1650         return;
1651 
1652     ipoib_dbg(priv, "Cleanup ipoib connected mode.\n");
1653 
1654     ib_destroy_srq(priv->cm.srq);
1655     priv->cm.srq = NULL;
1656     if (!priv->cm.srq_ring)
1657         return;
1658 
1659     ipoib_cm_free_rx_ring(dev, priv->cm.srq_ring);
1660     priv->cm.srq_ring = NULL;
1661 }