Back to home page

OSCL-LXR

 
 

    


0001 /*
0002  * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
0003  * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
0004  * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
0005  * Copyright (c) 2004, 2005 Voltaire, Inc. All rights reserved.
0006  *
0007  * This software is available to you under a choice of one of two
0008  * licenses.  You may choose to be licensed under the terms of the GNU
0009  * General Public License (GPL) Version 2, available from the file
0010  * COPYING in the main directory of this source tree, or the
0011  * OpenIB.org BSD license below:
0012  *
0013  *     Redistribution and use in source and binary forms, with or
0014  *     without modification, are permitted provided that the following
0015  *     conditions are met:
0016  *
0017  *      - Redistributions of source code must retain the above
0018  *        copyright notice, this list of conditions and the following
0019  *        disclaimer.
0020  *
0021  *      - Redistributions in binary form must reproduce the above
0022  *        copyright notice, this list of conditions and the following
0023  *        disclaimer in the documentation and/or other materials
0024  *        provided with the distribution.
0025  *
0026  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
0027  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
0028  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
0029  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
0030  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
0031  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
0032  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
0033  * SOFTWARE.
0034  */
0035 
0036 #include <linux/delay.h>
0037 #include <linux/moduleparam.h>
0038 #include <linux/dma-mapping.h>
0039 #include <linux/slab.h>
0040 
0041 #include <linux/ip.h>
0042 #include <linux/tcp.h>
0043 #include <rdma/ib_cache.h>
0044 
0045 #include "ipoib.h"
0046 
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
/* Module parameter controlling data path debug tracing (see description). */
static int data_debug_level;

module_param(data_debug_level, int, 0644);
MODULE_PARM_DESC(data_debug_level, "Enable data path debug tracing if > 0");
#endif
0054 
0055 struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
0056                  struct ib_pd *pd, struct rdma_ah_attr *attr)
0057 {
0058     struct ipoib_ah *ah;
0059     struct ib_ah *vah;
0060 
0061     ah = kmalloc(sizeof(*ah), GFP_KERNEL);
0062     if (!ah)
0063         return ERR_PTR(-ENOMEM);
0064 
0065     ah->dev       = dev;
0066     ah->last_send = 0;
0067     kref_init(&ah->ref);
0068 
0069     vah = rdma_create_ah(pd, attr, RDMA_CREATE_AH_SLEEPABLE);
0070     if (IS_ERR(vah)) {
0071         kfree(ah);
0072         ah = (struct ipoib_ah *)vah;
0073     } else {
0074         ah->ah = vah;
0075         ipoib_dbg(ipoib_priv(dev), "Created ah %p\n", ah->ah);
0076     }
0077 
0078     return ah;
0079 }
0080 
0081 void ipoib_free_ah(struct kref *kref)
0082 {
0083     struct ipoib_ah *ah = container_of(kref, struct ipoib_ah, ref);
0084     struct ipoib_dev_priv *priv = ipoib_priv(ah->dev);
0085 
0086     unsigned long flags;
0087 
0088     spin_lock_irqsave(&priv->lock, flags);
0089     list_add_tail(&ah->list, &priv->dead_ahs);
0090     spin_unlock_irqrestore(&priv->lock, flags);
0091 }
0092 
0093 static void ipoib_ud_dma_unmap_rx(struct ipoib_dev_priv *priv,
0094                   u64 mapping[IPOIB_UD_RX_SG])
0095 {
0096     ib_dma_unmap_single(priv->ca, mapping[0],
0097                 IPOIB_UD_BUF_SIZE(priv->max_ib_mtu),
0098                 DMA_FROM_DEVICE);
0099 }
0100 
0101 static int ipoib_ib_post_receive(struct net_device *dev, int id)
0102 {
0103     struct ipoib_dev_priv *priv = ipoib_priv(dev);
0104     int ret;
0105 
0106     priv->rx_wr.wr_id   = id | IPOIB_OP_RECV;
0107     priv->rx_sge[0].addr = priv->rx_ring[id].mapping[0];
0108     priv->rx_sge[1].addr = priv->rx_ring[id].mapping[1];
0109 
0110 
0111     ret = ib_post_recv(priv->qp, &priv->rx_wr, NULL);
0112     if (unlikely(ret)) {
0113         ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret);
0114         ipoib_ud_dma_unmap_rx(priv, priv->rx_ring[id].mapping);
0115         dev_kfree_skb_any(priv->rx_ring[id].skb);
0116         priv->rx_ring[id].skb = NULL;
0117     }
0118 
0119     return ret;
0120 }
0121 
0122 static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id)
0123 {
0124     struct ipoib_dev_priv *priv = ipoib_priv(dev);
0125     struct sk_buff *skb;
0126     int buf_size;
0127     u64 *mapping;
0128 
0129     buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
0130 
0131     skb = dev_alloc_skb(buf_size + IPOIB_HARD_LEN);
0132     if (unlikely(!skb))
0133         return NULL;
0134 
0135     /*
0136      * the IP header will be at IPOIP_HARD_LEN + IB_GRH_BYTES, that is
0137      * 64 bytes aligned
0138      */
0139     skb_reserve(skb, sizeof(struct ipoib_pseudo_header));
0140 
0141     mapping = priv->rx_ring[id].mapping;
0142     mapping[0] = ib_dma_map_single(priv->ca, skb->data, buf_size,
0143                        DMA_FROM_DEVICE);
0144     if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0])))
0145         goto error;
0146 
0147     priv->rx_ring[id].skb = skb;
0148     return skb;
0149 error:
0150     dev_kfree_skb_any(skb);
0151     return NULL;
0152 }
0153 
0154 static int ipoib_ib_post_receives(struct net_device *dev)
0155 {
0156     struct ipoib_dev_priv *priv = ipoib_priv(dev);
0157     int i;
0158 
0159     for (i = 0; i < ipoib_recvq_size; ++i) {
0160         if (!ipoib_alloc_rx_skb(dev, i)) {
0161             ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
0162             return -ENOMEM;
0163         }
0164         if (ipoib_ib_post_receive(dev, i)) {
0165             ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", i);
0166             return -EIO;
0167         }
0168     }
0169 
0170     return 0;
0171 }
0172 
0173 static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
0174 {
0175     struct ipoib_dev_priv *priv = ipoib_priv(dev);
0176     unsigned int wr_id = wc->wr_id & ~IPOIB_OP_RECV;
0177     struct sk_buff *skb;
0178     u64 mapping[IPOIB_UD_RX_SG];
0179     union ib_gid *dgid;
0180     union ib_gid *sgid;
0181 
0182     ipoib_dbg_data(priv, "recv completion: id %d, status: %d\n",
0183                wr_id, wc->status);
0184 
0185     if (unlikely(wr_id >= ipoib_recvq_size)) {
0186         ipoib_warn(priv, "recv completion event with wrid %d (> %d)\n",
0187                wr_id, ipoib_recvq_size);
0188         return;
0189     }
0190 
0191     skb  = priv->rx_ring[wr_id].skb;
0192 
0193     if (unlikely(wc->status != IB_WC_SUCCESS)) {
0194         if (wc->status != IB_WC_WR_FLUSH_ERR)
0195             ipoib_warn(priv,
0196                    "failed recv event (status=%d, wrid=%d vend_err %#x)\n",
0197                    wc->status, wr_id, wc->vendor_err);
0198         ipoib_ud_dma_unmap_rx(priv, priv->rx_ring[wr_id].mapping);
0199         dev_kfree_skb_any(skb);
0200         priv->rx_ring[wr_id].skb = NULL;
0201         return;
0202     }
0203 
0204     memcpy(mapping, priv->rx_ring[wr_id].mapping,
0205            IPOIB_UD_RX_SG * sizeof(*mapping));
0206 
0207     /*
0208      * If we can't allocate a new RX buffer, dump
0209      * this packet and reuse the old buffer.
0210      */
0211     if (unlikely(!ipoib_alloc_rx_skb(dev, wr_id))) {
0212         ++dev->stats.rx_dropped;
0213         goto repost;
0214     }
0215 
0216     ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
0217                wc->byte_len, wc->slid);
0218 
0219     ipoib_ud_dma_unmap_rx(priv, mapping);
0220 
0221     skb_put(skb, wc->byte_len);
0222 
0223     /* First byte of dgid signals multicast when 0xff */
0224     dgid = &((struct ib_grh *)skb->data)->dgid;
0225 
0226     if (!(wc->wc_flags & IB_WC_GRH) || dgid->raw[0] != 0xff)
0227         skb->pkt_type = PACKET_HOST;
0228     else if (memcmp(dgid, dev->broadcast + 4, sizeof(union ib_gid)) == 0)
0229         skb->pkt_type = PACKET_BROADCAST;
0230     else
0231         skb->pkt_type = PACKET_MULTICAST;
0232 
0233     sgid = &((struct ib_grh *)skb->data)->sgid;
0234 
0235     /*
0236      * Drop packets that this interface sent, ie multicast packets
0237      * that the HCA has replicated.
0238      */
0239     if (wc->slid == priv->local_lid && wc->src_qp == priv->qp->qp_num) {
0240         int need_repost = 1;
0241 
0242         if ((wc->wc_flags & IB_WC_GRH) &&
0243             sgid->global.interface_id != priv->local_gid.global.interface_id)
0244             need_repost = 0;
0245 
0246         if (need_repost) {
0247             dev_kfree_skb_any(skb);
0248             goto repost;
0249         }
0250     }
0251 
0252     skb_pull(skb, IB_GRH_BYTES);
0253 
0254     skb->protocol = ((struct ipoib_header *) skb->data)->proto;
0255     skb_add_pseudo_hdr(skb);
0256 
0257     ++dev->stats.rx_packets;
0258     dev->stats.rx_bytes += skb->len;
0259     if (skb->pkt_type == PACKET_MULTICAST)
0260         dev->stats.multicast++;
0261 
0262     skb->dev = dev;
0263     if ((dev->features & NETIF_F_RXCSUM) &&
0264             likely(wc->wc_flags & IB_WC_IP_CSUM_OK))
0265         skb->ip_summed = CHECKSUM_UNNECESSARY;
0266 
0267     napi_gro_receive(&priv->recv_napi, skb);
0268 
0269 repost:
0270     if (unlikely(ipoib_ib_post_receive(dev, wr_id)))
0271         ipoib_warn(priv, "ipoib_ib_post_receive failed "
0272                "for buf %d\n", wr_id);
0273 }
0274 
0275 int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req)
0276 {
0277     struct sk_buff *skb = tx_req->skb;
0278     u64 *mapping = tx_req->mapping;
0279     int i;
0280     int off;
0281 
0282     if (skb_headlen(skb)) {
0283         mapping[0] = ib_dma_map_single(ca, skb->data, skb_headlen(skb),
0284                            DMA_TO_DEVICE);
0285         if (unlikely(ib_dma_mapping_error(ca, mapping[0])))
0286             return -EIO;
0287 
0288         off = 1;
0289     } else
0290         off = 0;
0291 
0292     for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
0293         const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
0294         mapping[i + off] = ib_dma_map_page(ca,
0295                          skb_frag_page(frag),
0296                          skb_frag_off(frag),
0297                          skb_frag_size(frag),
0298                          DMA_TO_DEVICE);
0299         if (unlikely(ib_dma_mapping_error(ca, mapping[i + off])))
0300             goto partial_error;
0301     }
0302     return 0;
0303 
0304 partial_error:
0305     for (; i > 0; --i) {
0306         const skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
0307 
0308         ib_dma_unmap_page(ca, mapping[i - !off], skb_frag_size(frag), DMA_TO_DEVICE);
0309     }
0310 
0311     if (off)
0312         ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE);
0313 
0314     return -EIO;
0315 }
0316 
0317 void ipoib_dma_unmap_tx(struct ipoib_dev_priv *priv,
0318             struct ipoib_tx_buf *tx_req)
0319 {
0320     struct sk_buff *skb = tx_req->skb;
0321     u64 *mapping = tx_req->mapping;
0322     int i;
0323     int off;
0324 
0325     if (skb_headlen(skb)) {
0326         ib_dma_unmap_single(priv->ca, mapping[0], skb_headlen(skb),
0327                     DMA_TO_DEVICE);
0328         off = 1;
0329     } else
0330         off = 0;
0331 
0332     for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
0333         const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
0334 
0335         ib_dma_unmap_page(priv->ca, mapping[i + off],
0336                   skb_frag_size(frag), DMA_TO_DEVICE);
0337     }
0338 }
0339 
0340 /*
0341  * As the result of a completion error the QP Can be transferred to SQE states.
0342  * The function checks if the (send)QP is in SQE state and
0343  * moves it back to RTS state, that in order to have it functional again.
0344  */
0345 static void ipoib_qp_state_validate_work(struct work_struct *work)
0346 {
0347     struct ipoib_qp_state_validate *qp_work =
0348         container_of(work, struct ipoib_qp_state_validate, work);
0349 
0350     struct ipoib_dev_priv *priv = qp_work->priv;
0351     struct ib_qp_attr qp_attr;
0352     struct ib_qp_init_attr query_init_attr;
0353     int ret;
0354 
0355     ret = ib_query_qp(priv->qp, &qp_attr, IB_QP_STATE, &query_init_attr);
0356     if (ret) {
0357         ipoib_warn(priv, "%s: Failed to query QP ret: %d\n",
0358                __func__, ret);
0359         goto free_res;
0360     }
0361     pr_info("%s: QP: 0x%x is in state: %d\n",
0362         __func__, priv->qp->qp_num, qp_attr.qp_state);
0363 
0364     /* currently support only in SQE->RTS transition*/
0365     if (qp_attr.qp_state == IB_QPS_SQE) {
0366         qp_attr.qp_state = IB_QPS_RTS;
0367 
0368         ret = ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE);
0369         if (ret) {
0370             pr_warn("failed(%d) modify QP:0x%x SQE->RTS\n",
0371                 ret, priv->qp->qp_num);
0372             goto free_res;
0373         }
0374         pr_info("%s: QP: 0x%x moved from IB_QPS_SQE to IB_QPS_RTS\n",
0375             __func__, priv->qp->qp_num);
0376     } else {
0377         pr_warn("QP (%d) will stay in state: %d\n",
0378             priv->qp->qp_num, qp_attr.qp_state);
0379     }
0380 
0381 free_res:
0382     kfree(qp_work);
0383 }
0384 
0385 static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
0386 {
0387     struct ipoib_dev_priv *priv = ipoib_priv(dev);
0388     unsigned int wr_id = wc->wr_id;
0389     struct ipoib_tx_buf *tx_req;
0390 
0391     ipoib_dbg_data(priv, "send completion: id %d, status: %d\n",
0392                wr_id, wc->status);
0393 
0394     if (unlikely(wr_id >= ipoib_sendq_size)) {
0395         ipoib_warn(priv, "send completion event with wrid %d (> %d)\n",
0396                wr_id, ipoib_sendq_size);
0397         return;
0398     }
0399 
0400     tx_req = &priv->tx_ring[wr_id];
0401 
0402     ipoib_dma_unmap_tx(priv, tx_req);
0403 
0404     ++dev->stats.tx_packets;
0405     dev->stats.tx_bytes += tx_req->skb->len;
0406 
0407     dev_kfree_skb_any(tx_req->skb);
0408 
0409     ++priv->tx_tail;
0410     ++priv->global_tx_tail;
0411 
0412     if (unlikely(netif_queue_stopped(dev) &&
0413              ((priv->global_tx_head - priv->global_tx_tail) <=
0414               ipoib_sendq_size >> 1) &&
0415              test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)))
0416         netif_wake_queue(dev);
0417 
0418     if (wc->status != IB_WC_SUCCESS &&
0419         wc->status != IB_WC_WR_FLUSH_ERR) {
0420         struct ipoib_qp_state_validate *qp_work;
0421         ipoib_warn(priv,
0422                "failed send event (status=%d, wrid=%d vend_err %#x)\n",
0423                wc->status, wr_id, wc->vendor_err);
0424         qp_work = kzalloc(sizeof(*qp_work), GFP_ATOMIC);
0425         if (!qp_work)
0426             return;
0427 
0428         INIT_WORK(&qp_work->work, ipoib_qp_state_validate_work);
0429         qp_work->priv = priv;
0430         queue_work(priv->wq, &qp_work->work);
0431     }
0432 }
0433 
0434 static int poll_tx(struct ipoib_dev_priv *priv)
0435 {
0436     int n, i;
0437     struct ib_wc *wc;
0438 
0439     n = ib_poll_cq(priv->send_cq, MAX_SEND_CQE, priv->send_wc);
0440     for (i = 0; i < n; ++i) {
0441         wc = priv->send_wc + i;
0442         if (wc->wr_id & IPOIB_OP_CM)
0443             ipoib_cm_handle_tx_wc(priv->dev, priv->send_wc + i);
0444         else
0445             ipoib_ib_handle_tx_wc(priv->dev, priv->send_wc + i);
0446     }
0447     return n == MAX_SEND_CQE;
0448 }
0449 
0450 int ipoib_rx_poll(struct napi_struct *napi, int budget)
0451 {
0452     struct ipoib_dev_priv *priv =
0453         container_of(napi, struct ipoib_dev_priv, recv_napi);
0454     struct net_device *dev = priv->dev;
0455     int done;
0456     int t;
0457     int n, i;
0458 
0459     done  = 0;
0460 
0461 poll_more:
0462     while (done < budget) {
0463         int max = (budget - done);
0464 
0465         t = min(IPOIB_NUM_WC, max);
0466         n = ib_poll_cq(priv->recv_cq, t, priv->ibwc);
0467 
0468         for (i = 0; i < n; i++) {
0469             struct ib_wc *wc = priv->ibwc + i;
0470 
0471             if (wc->wr_id & IPOIB_OP_RECV) {
0472                 ++done;
0473                 if (wc->wr_id & IPOIB_OP_CM)
0474                     ipoib_cm_handle_rx_wc(dev, wc);
0475                 else
0476                     ipoib_ib_handle_rx_wc(dev, wc);
0477             } else {
0478                 pr_warn("%s: Got unexpected wqe id\n", __func__);
0479             }
0480         }
0481 
0482         if (n != t)
0483             break;
0484     }
0485 
0486     if (done < budget) {
0487         napi_complete(napi);
0488         if (unlikely(ib_req_notify_cq(priv->recv_cq,
0489                           IB_CQ_NEXT_COMP |
0490                           IB_CQ_REPORT_MISSED_EVENTS)) &&
0491             napi_reschedule(napi))
0492             goto poll_more;
0493     }
0494 
0495     return done;
0496 }
0497 
0498 int ipoib_tx_poll(struct napi_struct *napi, int budget)
0499 {
0500     struct ipoib_dev_priv *priv = container_of(napi, struct ipoib_dev_priv,
0501                            send_napi);
0502     struct net_device *dev = priv->dev;
0503     int n, i;
0504     struct ib_wc *wc;
0505 
0506 poll_more:
0507     n = ib_poll_cq(priv->send_cq, MAX_SEND_CQE, priv->send_wc);
0508 
0509     for (i = 0; i < n; i++) {
0510         wc = priv->send_wc + i;
0511         if (wc->wr_id & IPOIB_OP_CM)
0512             ipoib_cm_handle_tx_wc(dev, wc);
0513         else
0514             ipoib_ib_handle_tx_wc(dev, wc);
0515     }
0516 
0517     if (n < budget) {
0518         napi_complete(napi);
0519         if (unlikely(ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP |
0520                           IB_CQ_REPORT_MISSED_EVENTS)) &&
0521             napi_reschedule(napi))
0522             goto poll_more;
0523     }
0524     return n < 0 ? 0 : n;
0525 }
0526 
0527 void ipoib_ib_rx_completion(struct ib_cq *cq, void *ctx_ptr)
0528 {
0529     struct ipoib_dev_priv *priv = ctx_ptr;
0530 
0531     napi_schedule(&priv->recv_napi);
0532 }
0533 
0534 void ipoib_ib_tx_completion(struct ib_cq *cq, void *ctx_ptr)
0535 {
0536     struct ipoib_dev_priv *priv = ctx_ptr;
0537 
0538     napi_schedule(&priv->send_napi);
0539 }
0540 
0541 static inline int post_send(struct ipoib_dev_priv *priv,
0542                 unsigned int wr_id,
0543                 struct ib_ah *address, u32 dqpn,
0544                 struct ipoib_tx_buf *tx_req,
0545                 void *head, int hlen)
0546 {
0547     struct sk_buff *skb = tx_req->skb;
0548 
0549     ipoib_build_sge(priv, tx_req);
0550 
0551     priv->tx_wr.wr.wr_id    = wr_id;
0552     priv->tx_wr.remote_qpn  = dqpn;
0553     priv->tx_wr.ah      = address;
0554 
0555     if (head) {
0556         priv->tx_wr.mss     = skb_shinfo(skb)->gso_size;
0557         priv->tx_wr.header  = head;
0558         priv->tx_wr.hlen    = hlen;
0559         priv->tx_wr.wr.opcode   = IB_WR_LSO;
0560     } else
0561         priv->tx_wr.wr.opcode   = IB_WR_SEND;
0562 
0563     return ib_post_send(priv->qp, &priv->tx_wr.wr, NULL);
0564 }
0565 
0566 int ipoib_send(struct net_device *dev, struct sk_buff *skb,
0567            struct ib_ah *address, u32 dqpn)
0568 {
0569     struct ipoib_dev_priv *priv = ipoib_priv(dev);
0570     struct ipoib_tx_buf *tx_req;
0571     int hlen, rc;
0572     void *phead;
0573     unsigned int usable_sge = priv->max_send_sge - !!skb_headlen(skb);
0574 
0575     if (skb_is_gso(skb)) {
0576         hlen = skb_tcp_all_headers(skb);
0577         phead = skb->data;
0578         if (unlikely(!skb_pull(skb, hlen))) {
0579             ipoib_warn(priv, "linear data too small\n");
0580             ++dev->stats.tx_dropped;
0581             ++dev->stats.tx_errors;
0582             dev_kfree_skb_any(skb);
0583             return -1;
0584         }
0585     } else {
0586         if (unlikely(skb->len > priv->mcast_mtu + IPOIB_ENCAP_LEN)) {
0587             ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
0588                    skb->len, priv->mcast_mtu + IPOIB_ENCAP_LEN);
0589             ++dev->stats.tx_dropped;
0590             ++dev->stats.tx_errors;
0591             ipoib_cm_skb_too_long(dev, skb, priv->mcast_mtu);
0592             return -1;
0593         }
0594         phead = NULL;
0595         hlen  = 0;
0596     }
0597     if (skb_shinfo(skb)->nr_frags > usable_sge) {
0598         if (skb_linearize(skb) < 0) {
0599             ipoib_warn(priv, "skb could not be linearized\n");
0600             ++dev->stats.tx_dropped;
0601             ++dev->stats.tx_errors;
0602             dev_kfree_skb_any(skb);
0603             return -1;
0604         }
0605         /* Does skb_linearize return ok without reducing nr_frags? */
0606         if (skb_shinfo(skb)->nr_frags > usable_sge) {
0607             ipoib_warn(priv, "too many frags after skb linearize\n");
0608             ++dev->stats.tx_dropped;
0609             ++dev->stats.tx_errors;
0610             dev_kfree_skb_any(skb);
0611             return -1;
0612         }
0613     }
0614 
0615     ipoib_dbg_data(priv,
0616                "sending packet, length=%d address=%p dqpn=0x%06x\n",
0617                skb->len, address, dqpn);
0618 
0619     /*
0620      * We put the skb into the tx_ring _before_ we call post_send()
0621      * because it's entirely possible that the completion handler will
0622      * run before we execute anything after the post_send().  That
0623      * means we have to make sure everything is properly recorded and
0624      * our state is consistent before we call post_send().
0625      */
0626     tx_req = &priv->tx_ring[priv->tx_head & (ipoib_sendq_size - 1)];
0627     tx_req->skb = skb;
0628     if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) {
0629         ++dev->stats.tx_errors;
0630         dev_kfree_skb_any(skb);
0631         return -1;
0632     }
0633 
0634     if (skb->ip_summed == CHECKSUM_PARTIAL)
0635         priv->tx_wr.wr.send_flags |= IB_SEND_IP_CSUM;
0636     else
0637         priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM;
0638     /* increase the tx_head after send success, but use it for queue state */
0639     if ((priv->global_tx_head - priv->global_tx_tail) ==
0640         ipoib_sendq_size - 1) {
0641         ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
0642         netif_stop_queue(dev);
0643     }
0644 
0645     skb_orphan(skb);
0646     skb_dst_drop(skb);
0647 
0648     if (netif_queue_stopped(dev))
0649         if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP |
0650                      IB_CQ_REPORT_MISSED_EVENTS) < 0)
0651             ipoib_warn(priv, "request notify on send CQ failed\n");
0652 
0653     rc = post_send(priv, priv->tx_head & (ipoib_sendq_size - 1),
0654                address, dqpn, tx_req, phead, hlen);
0655     if (unlikely(rc)) {
0656         ipoib_warn(priv, "post_send failed, error %d\n", rc);
0657         ++dev->stats.tx_errors;
0658         ipoib_dma_unmap_tx(priv, tx_req);
0659         dev_kfree_skb_any(skb);
0660         if (netif_queue_stopped(dev))
0661             netif_wake_queue(dev);
0662         rc = 0;
0663     } else {
0664         netif_trans_update(dev);
0665 
0666         rc = priv->tx_head;
0667         ++priv->tx_head;
0668         ++priv->global_tx_head;
0669     }
0670     return rc;
0671 }
0672 
0673 static void ipoib_reap_dead_ahs(struct ipoib_dev_priv *priv)
0674 {
0675     struct ipoib_ah *ah, *tah;
0676     unsigned long flags;
0677 
0678     netif_tx_lock_bh(priv->dev);
0679     spin_lock_irqsave(&priv->lock, flags);
0680 
0681     list_for_each_entry_safe(ah, tah, &priv->dead_ahs, list)
0682         if ((int) priv->tx_tail - (int) ah->last_send >= 0) {
0683             list_del(&ah->list);
0684             rdma_destroy_ah(ah->ah, 0);
0685             kfree(ah);
0686         }
0687 
0688     spin_unlock_irqrestore(&priv->lock, flags);
0689     netif_tx_unlock_bh(priv->dev);
0690 }
0691 
0692 void ipoib_reap_ah(struct work_struct *work)
0693 {
0694     struct ipoib_dev_priv *priv =
0695         container_of(work, struct ipoib_dev_priv, ah_reap_task.work);
0696 
0697     ipoib_reap_dead_ahs(priv);
0698 
0699     if (!test_bit(IPOIB_STOP_REAPER, &priv->flags))
0700         queue_delayed_work(priv->wq, &priv->ah_reap_task,
0701                    round_jiffies_relative(HZ));
0702 }
0703 
0704 static void ipoib_start_ah_reaper(struct ipoib_dev_priv *priv)
0705 {
0706     clear_bit(IPOIB_STOP_REAPER, &priv->flags);
0707     queue_delayed_work(priv->wq, &priv->ah_reap_task,
0708                round_jiffies_relative(HZ));
0709 }
0710 
0711 static void ipoib_stop_ah_reaper(struct ipoib_dev_priv *priv)
0712 {
0713     set_bit(IPOIB_STOP_REAPER, &priv->flags);
0714     cancel_delayed_work(&priv->ah_reap_task);
0715     /*
0716      * After ipoib_stop_ah_reaper() we always go through
0717      * ipoib_reap_dead_ahs() which ensures the work is really stopped and
0718      * does a final flush out of the dead_ah's list
0719      */
0720 }
0721 
0722 static int recvs_pending(struct net_device *dev)
0723 {
0724     struct ipoib_dev_priv *priv = ipoib_priv(dev);
0725     int pending = 0;
0726     int i;
0727 
0728     for (i = 0; i < ipoib_recvq_size; ++i)
0729         if (priv->rx_ring[i].skb)
0730             ++pending;
0731 
0732     return pending;
0733 }
0734 
0735 static void check_qp_movement_and_print(struct ipoib_dev_priv *priv,
0736                     struct ib_qp *qp,
0737                     enum ib_qp_state new_state)
0738 {
0739     struct ib_qp_attr qp_attr;
0740     struct ib_qp_init_attr query_init_attr;
0741     int ret;
0742 
0743     ret = ib_query_qp(qp, &qp_attr, IB_QP_STATE, &query_init_attr);
0744     if (ret) {
0745         ipoib_warn(priv, "%s: Failed to query QP\n", __func__);
0746         return;
0747     }
0748     /* print according to the new-state and the previous state.*/
0749     if (new_state == IB_QPS_ERR && qp_attr.qp_state == IB_QPS_RESET)
0750         ipoib_dbg(priv, "Failed modify QP, IB_QPS_RESET to IB_QPS_ERR, acceptable\n");
0751     else
0752         ipoib_warn(priv, "Failed to modify QP to state: %d from state: %d\n",
0753                new_state, qp_attr.qp_state);
0754 }
0755 
0756 static void ipoib_napi_enable(struct net_device *dev)
0757 {
0758     struct ipoib_dev_priv *priv = ipoib_priv(dev);
0759 
0760     napi_enable(&priv->recv_napi);
0761     napi_enable(&priv->send_napi);
0762 }
0763 
0764 static void ipoib_napi_disable(struct net_device *dev)
0765 {
0766     struct ipoib_dev_priv *priv = ipoib_priv(dev);
0767 
0768     napi_disable(&priv->recv_napi);
0769     napi_disable(&priv->send_napi);
0770 }
0771 
0772 int ipoib_ib_dev_stop_default(struct net_device *dev)
0773 {
0774     struct ipoib_dev_priv *priv = ipoib_priv(dev);
0775     struct ib_qp_attr qp_attr;
0776     unsigned long begin;
0777     struct ipoib_tx_buf *tx_req;
0778     int i;
0779 
0780     if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
0781         ipoib_napi_disable(dev);
0782 
0783     ipoib_cm_dev_stop(dev);
0784 
0785     /*
0786      * Move our QP to the error state and then reinitialize in
0787      * when all work requests have completed or have been flushed.
0788      */
0789     qp_attr.qp_state = IB_QPS_ERR;
0790     if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
0791         check_qp_movement_and_print(priv, priv->qp, IB_QPS_ERR);
0792 
0793     /* Wait for all sends and receives to complete */
0794     begin = jiffies;
0795 
0796     while (priv->tx_head != priv->tx_tail || recvs_pending(dev)) {
0797         if (time_after(jiffies, begin + 5 * HZ)) {
0798             ipoib_warn(priv,
0799                    "timing out; %d sends %d receives not completed\n",
0800                    priv->tx_head - priv->tx_tail,
0801                    recvs_pending(dev));
0802 
0803             /*
0804              * assume the HW is wedged and just free up
0805              * all our pending work requests.
0806              */
0807             while ((int)priv->tx_tail - (int)priv->tx_head < 0) {
0808                 tx_req = &priv->tx_ring[priv->tx_tail &
0809                             (ipoib_sendq_size - 1)];
0810                 ipoib_dma_unmap_tx(priv, tx_req);
0811                 dev_kfree_skb_any(tx_req->skb);
0812                 ++priv->tx_tail;
0813                 ++priv->global_tx_tail;
0814             }
0815 
0816             for (i = 0; i < ipoib_recvq_size; ++i) {
0817                 struct ipoib_rx_buf *rx_req;
0818 
0819                 rx_req = &priv->rx_ring[i];
0820                 if (!rx_req->skb)
0821                     continue;
0822                 ipoib_ud_dma_unmap_rx(priv,
0823                               priv->rx_ring[i].mapping);
0824                 dev_kfree_skb_any(rx_req->skb);
0825                 rx_req->skb = NULL;
0826             }
0827 
0828             goto timeout;
0829         }
0830 
0831         ipoib_drain_cq(dev);
0832 
0833         usleep_range(1000, 2000);
0834     }
0835 
0836     ipoib_dbg(priv, "All sends and receives done.\n");
0837 
0838 timeout:
0839     qp_attr.qp_state = IB_QPS_RESET;
0840     if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
0841         ipoib_warn(priv, "Failed to modify QP to RESET state\n");
0842 
0843     ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP);
0844 
0845     return 0;
0846 }
0847 
0848 int ipoib_ib_dev_open_default(struct net_device *dev)
0849 {
0850     struct ipoib_dev_priv *priv = ipoib_priv(dev);
0851     int ret;
0852 
0853     ret = ipoib_init_qp(dev);
0854     if (ret) {
0855         ipoib_warn(priv, "ipoib_init_qp returned %d\n", ret);
0856         return -1;
0857     }
0858 
0859     ret = ipoib_ib_post_receives(dev);
0860     if (ret) {
0861         ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret);
0862         goto out;
0863     }
0864 
0865     ret = ipoib_cm_dev_open(dev);
0866     if (ret) {
0867         ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret);
0868         goto out;
0869     }
0870 
0871     if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
0872         ipoib_napi_enable(dev);
0873 
0874     return 0;
0875 out:
0876     return -1;
0877 }
0878 
0879 int ipoib_ib_dev_open(struct net_device *dev)
0880 {
0881     struct ipoib_dev_priv *priv = ipoib_priv(dev);
0882 
0883     ipoib_pkey_dev_check_presence(dev);
0884 
0885     if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
0886         ipoib_warn(priv, "P_Key 0x%04x is %s\n", priv->pkey,
0887                (!(priv->pkey & 0x7fff) ? "Invalid" : "not found"));
0888         return -1;
0889     }
0890 
0891     ipoib_start_ah_reaper(priv);
0892     if (priv->rn_ops->ndo_open(dev)) {
0893         pr_warn("%s: Failed to open dev\n", dev->name);
0894         goto dev_stop;
0895     }
0896 
0897     set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);
0898 
0899     return 0;
0900 
0901 dev_stop:
0902     ipoib_stop_ah_reaper(priv);
0903     return -1;
0904 }
0905 
0906 void ipoib_ib_dev_stop(struct net_device *dev)
0907 {
0908     struct ipoib_dev_priv *priv = ipoib_priv(dev);
0909 
0910     priv->rn_ops->ndo_stop(dev);
0911 
0912     clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);
0913     ipoib_stop_ah_reaper(priv);
0914 }
0915 
0916 void ipoib_pkey_dev_check_presence(struct net_device *dev)
0917 {
0918     struct ipoib_dev_priv *priv = ipoib_priv(dev);
0919     struct rdma_netdev *rn = netdev_priv(dev);
0920 
0921     if (!(priv->pkey & 0x7fff) ||
0922         ib_find_pkey(priv->ca, priv->port, priv->pkey,
0923              &priv->pkey_index)) {
0924         clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
0925     } else {
0926         if (rn->set_id)
0927             rn->set_id(dev, priv->pkey_index);
0928         set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
0929     }
0930 }
0931 
0932 void ipoib_ib_dev_up(struct net_device *dev)
0933 {
0934     struct ipoib_dev_priv *priv = ipoib_priv(dev);
0935 
0936     ipoib_pkey_dev_check_presence(dev);
0937 
0938     if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
0939         ipoib_dbg(priv, "PKEY is not assigned.\n");
0940         return;
0941     }
0942 
0943     set_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
0944 
0945     ipoib_mcast_start_thread(dev);
0946 }
0947 
0948 void ipoib_ib_dev_down(struct net_device *dev)
0949 {
0950     struct ipoib_dev_priv *priv = ipoib_priv(dev);
0951 
0952     ipoib_dbg(priv, "downing ib_dev\n");
0953 
0954     clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
0955     netif_carrier_off(dev);
0956 
0957     ipoib_mcast_stop_thread(dev);
0958     ipoib_mcast_dev_flush(dev);
0959 
0960     ipoib_flush_paths(dev);
0961 }
0962 
0963 void ipoib_drain_cq(struct net_device *dev)
0964 {
0965     struct ipoib_dev_priv *priv = ipoib_priv(dev);
0966     int i, n;
0967 
0968     /*
0969      * We call completion handling routines that expect to be
0970      * called from the BH-disabled NAPI poll context, so disable
0971      * BHs here too.
0972      */
0973     local_bh_disable();
0974 
0975     do {
0976         n = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc);
0977         for (i = 0; i < n; ++i) {
0978             /*
0979              * Convert any successful completions to flush
0980              * errors to avoid passing packets up the
0981              * stack after bringing the device down.
0982              */
0983             if (priv->ibwc[i].status == IB_WC_SUCCESS)
0984                 priv->ibwc[i].status = IB_WC_WR_FLUSH_ERR;
0985 
0986             if (priv->ibwc[i].wr_id & IPOIB_OP_RECV) {
0987                 if (priv->ibwc[i].wr_id & IPOIB_OP_CM)
0988                     ipoib_cm_handle_rx_wc(dev, priv->ibwc + i);
0989                 else
0990                     ipoib_ib_handle_rx_wc(dev, priv->ibwc + i);
0991             } else {
0992                 pr_warn("%s: Got unexpected wqe id\n", __func__);
0993             }
0994         }
0995     } while (n == IPOIB_NUM_WC);
0996 
0997     while (poll_tx(priv))
0998         ; /* nothing */
0999 
1000     local_bh_enable();
1001 }
1002 
1003 /*
1004  * Takes whatever value which is in pkey index 0 and updates priv->pkey
1005  * returns 0 if the pkey value was changed.
1006  */
1007 static inline int update_parent_pkey(struct ipoib_dev_priv *priv)
1008 {
1009     int result;
1010     u16 prev_pkey;
1011 
1012     prev_pkey = priv->pkey;
1013     result = ib_query_pkey(priv->ca, priv->port, 0, &priv->pkey);
1014     if (result) {
1015         ipoib_warn(priv, "ib_query_pkey port %d failed (ret = %d)\n",
1016                priv->port, result);
1017         return result;
1018     }
1019 
1020     priv->pkey |= 0x8000;
1021 
1022     if (prev_pkey != priv->pkey) {
1023         ipoib_dbg(priv, "pkey changed from 0x%x to 0x%x\n",
1024               prev_pkey, priv->pkey);
1025         /*
1026          * Update the pkey in the broadcast address, while making sure to set
1027          * the full membership bit, so that we join the right broadcast group.
1028          */
1029         priv->dev->broadcast[8] = priv->pkey >> 8;
1030         priv->dev->broadcast[9] = priv->pkey & 0xff;
1031         return 0;
1032     }
1033 
1034     return 1;
1035 }
1036 /*
1037  * returns 0 if pkey value was found in a different slot.
1038  */
1039 static inline int update_child_pkey(struct ipoib_dev_priv *priv)
1040 {
1041     u16 old_index = priv->pkey_index;
1042 
1043     priv->pkey_index = 0;
1044     ipoib_pkey_dev_check_presence(priv->dev);
1045 
1046     if (test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) &&
1047         (old_index == priv->pkey_index))
1048         return 1;
1049     return 0;
1050 }
1051 
1052 /*
1053  * returns true if the device address of the ipoib interface has changed and the
1054  * new address is a valid one (i.e in the gid table), return false otherwise.
1055  */
1056 static bool ipoib_dev_addr_changed_valid(struct ipoib_dev_priv *priv)
1057 {
1058     union ib_gid search_gid;
1059     union ib_gid gid0;
1060     int err;
1061     u16 index;
1062     u32 port;
1063     bool ret = false;
1064 
1065     if (rdma_query_gid(priv->ca, priv->port, 0, &gid0))
1066         return false;
1067 
1068     netif_addr_lock_bh(priv->dev);
1069 
1070     /* The subnet prefix may have changed, update it now so we won't have
1071      * to do it later
1072      */
1073     priv->local_gid.global.subnet_prefix = gid0.global.subnet_prefix;
1074     dev_addr_mod(priv->dev, 4, (u8 *)&gid0.global.subnet_prefix,
1075              sizeof(gid0.global.subnet_prefix));
1076     search_gid.global.subnet_prefix = gid0.global.subnet_prefix;
1077 
1078     search_gid.global.interface_id = priv->local_gid.global.interface_id;
1079 
1080     netif_addr_unlock_bh(priv->dev);
1081 
1082     err = ib_find_gid(priv->ca, &search_gid, &port, &index);
1083 
1084     netif_addr_lock_bh(priv->dev);
1085 
1086     if (search_gid.global.interface_id !=
1087         priv->local_gid.global.interface_id)
1088         /* There was a change while we were looking up the gid, bail
1089          * here and let the next work sort this out
1090          */
1091         goto out;
1092 
1093     /* The next section of code needs some background:
1094      * Per IB spec the port GUID can't change if the HCA is powered on.
1095      * port GUID is the basis for GID at index 0 which is the basis for
1096      * the default device address of a ipoib interface.
1097      *
1098      * so it seems the flow should be:
1099      * if user_changed_dev_addr && gid in gid tbl
1100      *  set bit dev_addr_set
1101      *  return true
1102      * else
1103      *  return false
1104      *
1105      * The issue is that there are devices that don't follow the spec,
1106      * they change the port GUID when the HCA is powered, so in order
1107      * not to break userspace applications, We need to check if the
1108      * user wanted to control the device address and we assume that
1109      * if he sets the device address back to be based on GID index 0,
1110      * he no longer wishs to control it.
1111      *
1112      * If the user doesn't control the device address,
1113      * IPOIB_FLAG_DEV_ADDR_SET is set and ib_find_gid failed it means
1114      * the port GUID has changed and GID at index 0 has changed
1115      * so we need to change priv->local_gid and priv->dev->dev_addr
1116      * to reflect the new GID.
1117      */
1118     if (!test_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags)) {
1119         if (!err && port == priv->port) {
1120             set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags);
1121             if (index == 0)
1122                 clear_bit(IPOIB_FLAG_DEV_ADDR_CTRL,
1123                       &priv->flags);
1124             else
1125                 set_bit(IPOIB_FLAG_DEV_ADDR_CTRL, &priv->flags);
1126             ret = true;
1127         } else {
1128             ret = false;
1129         }
1130     } else {
1131         if (!err && port == priv->port) {
1132             ret = true;
1133         } else {
1134             if (!test_bit(IPOIB_FLAG_DEV_ADDR_CTRL, &priv->flags)) {
1135                 memcpy(&priv->local_gid, &gid0,
1136                        sizeof(priv->local_gid));
1137                 dev_addr_mod(priv->dev, 4, (u8 *)&gid0,
1138                          sizeof(priv->local_gid));
1139                 ret = true;
1140             }
1141         }
1142     }
1143 
1144 out:
1145     netif_addr_unlock_bh(priv->dev);
1146 
1147     return ret;
1148 }
1149 
1150 static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
1151                 enum ipoib_flush_level level,
1152                 int nesting)
1153 {
1154     struct ipoib_dev_priv *cpriv;
1155     struct net_device *dev = priv->dev;
1156     int result;
1157 
1158     down_read_nested(&priv->vlan_rwsem, nesting);
1159 
1160     /*
1161      * Flush any child interfaces too -- they might be up even if
1162      * the parent is down.
1163      */
1164     list_for_each_entry(cpriv, &priv->child_intfs, list)
1165         __ipoib_ib_dev_flush(cpriv, level, nesting + 1);
1166 
1167     up_read(&priv->vlan_rwsem);
1168 
1169     if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags) &&
1170         level != IPOIB_FLUSH_HEAVY) {
1171         /* Make sure the dev_addr is set even if not flushing */
1172         if (level == IPOIB_FLUSH_LIGHT)
1173             ipoib_dev_addr_changed_valid(priv);
1174         ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n");
1175         return;
1176     }
1177 
1178     if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
1179         /* interface is down. update pkey and leave. */
1180         if (level == IPOIB_FLUSH_HEAVY) {
1181             if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags))
1182                 update_parent_pkey(priv);
1183             else
1184                 update_child_pkey(priv);
1185         } else if (level == IPOIB_FLUSH_LIGHT)
1186             ipoib_dev_addr_changed_valid(priv);
1187         ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_ADMIN_UP not set.\n");
1188         return;
1189     }
1190 
1191     if (level == IPOIB_FLUSH_HEAVY) {
1192         /* child devices chase their origin pkey value, while non-child
1193          * (parent) devices should always takes what present in pkey index 0
1194          */
1195         if (test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
1196             result = update_child_pkey(priv);
1197             if (result) {
1198                 /* restart QP only if P_Key index is changed */
1199                 ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n");
1200                 return;
1201             }
1202 
1203         } else {
1204             result = update_parent_pkey(priv);
1205             /* restart QP only if P_Key value changed */
1206             if (result) {
1207                 ipoib_dbg(priv, "Not flushing - P_Key value not changed.\n");
1208                 return;
1209             }
1210         }
1211     }
1212 
1213     if (level == IPOIB_FLUSH_LIGHT) {
1214         int oper_up;
1215         ipoib_mark_paths_invalid(dev);
1216         /* Set IPoIB operation as down to prevent races between:
1217          * the flush flow which leaves MCG and on the fly joins
1218          * which can happen during that time. mcast restart task
1219          * should deal with join requests we missed.
1220          */
1221         oper_up = test_and_clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
1222         ipoib_mcast_dev_flush(dev);
1223         if (oper_up)
1224             set_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
1225         ipoib_reap_dead_ahs(priv);
1226     }
1227 
1228     if (level >= IPOIB_FLUSH_NORMAL)
1229         ipoib_ib_dev_down(dev);
1230 
1231     if (level == IPOIB_FLUSH_HEAVY) {
1232         if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
1233             ipoib_ib_dev_stop(dev);
1234 
1235         if (ipoib_ib_dev_open(dev))
1236             return;
1237 
1238         if (netif_queue_stopped(dev))
1239             netif_start_queue(dev);
1240     }
1241 
1242     /*
1243      * The device could have been brought down between the start and when
1244      * we get here, don't bring it back up if it's not configured up
1245      */
1246     if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
1247         if (level >= IPOIB_FLUSH_NORMAL)
1248             ipoib_ib_dev_up(dev);
1249         if (ipoib_dev_addr_changed_valid(priv))
1250             ipoib_mcast_restart_task(&priv->restart_task);
1251     }
1252 }
1253 
1254 void ipoib_ib_dev_flush_light(struct work_struct *work)
1255 {
1256     struct ipoib_dev_priv *priv =
1257         container_of(work, struct ipoib_dev_priv, flush_light);
1258 
1259     __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_LIGHT, 0);
1260 }
1261 
1262 void ipoib_ib_dev_flush_normal(struct work_struct *work)
1263 {
1264     struct ipoib_dev_priv *priv =
1265         container_of(work, struct ipoib_dev_priv, flush_normal);
1266 
1267     __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_NORMAL, 0);
1268 }
1269 
1270 void ipoib_ib_dev_flush_heavy(struct work_struct *work)
1271 {
1272     struct ipoib_dev_priv *priv =
1273         container_of(work, struct ipoib_dev_priv, flush_heavy);
1274 
1275     rtnl_lock();
1276     __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_HEAVY, 0);
1277     rtnl_unlock();
1278 }
1279 
1280 void ipoib_ib_dev_cleanup(struct net_device *dev)
1281 {
1282     struct ipoib_dev_priv *priv = ipoib_priv(dev);
1283 
1284     ipoib_dbg(priv, "cleaning up ib_dev\n");
1285     /*
1286      * We must make sure there are no more (path) completions
1287      * that may wish to touch priv fields that are no longer valid
1288      */
1289     ipoib_flush_paths(dev);
1290 
1291     ipoib_mcast_stop_thread(dev);
1292     ipoib_mcast_dev_flush(dev);
1293 
1294     /*
1295      * All of our ah references aren't free until after
1296      * ipoib_mcast_dev_flush(), ipoib_flush_paths, and
1297      * the neighbor garbage collection is stopped and reaped.
1298      * That should all be done now, so make a final ah flush.
1299      */
1300     ipoib_reap_dead_ahs(priv);
1301 
1302     clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
1303 
1304     priv->rn_ops->ndo_uninit(dev);
1305 
1306     if (priv->pd) {
1307         ib_dealloc_pd(priv->pd);
1308         priv->pd = NULL;
1309     }
1310 }
1311