/*
 * Copyright (c) 2016 Oracle.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ib_mr.h"

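/* Fast-registration MR (FRMR) state lifecycle as used in this file:
 * an MR starts out FRMR_IS_FREE, moves to FRMR_IS_INUSE when a REG_MR
 * work request is posted, returns to FRMR_IS_FREE when a LOCAL_INV
 * completes successfully, and is marked FRMR_IS_STALE on a posting or
 * completion error so that it is dropped rather than reused.
 */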
static inline void
rds_transition_frwr_state(struct rds_ib_mr *ibmr,
              enum rds_ib_fr_state old_state,
              enum rds_ib_fr_state new_state)
{
    if (cmpxchg(&ibmr->u.frmr.fr_state,
            old_state, new_state) == old_state &&
        old_state == FRMR_IS_INUSE) {
        /* enforce order of ibmr->u.frmr.fr_state update
         * before decrementing i_fastreg_inuse_count
         */
        smp_mb__before_atomic();
        atomic_dec(&ibmr->ic->i_fastreg_inuse_count);
        if (waitqueue_active(&rds_ib_ring_empty_wait))
            wake_up(&rds_ib_ring_empty_wait);
    }
}

static struct rds_ib_mr *rds_ib_alloc_frmr(struct rds_ib_device *rds_ibdev,
                       int npages)
{
    struct rds_ib_mr_pool *pool;
    struct rds_ib_mr *ibmr = NULL;
    struct rds_ib_frmr *frmr;
    int err = 0;

    if (npages <= RDS_MR_8K_MSG_SIZE)
        pool = rds_ibdev->mr_8k_pool;
    else
        pool = rds_ibdev->mr_1m_pool;

    ibmr = rds_ib_try_reuse_ibmr(pool);
    if (ibmr)
        return ibmr;

    ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL,
                rdsibdev_to_node(rds_ibdev));
    if (!ibmr) {
        err = -ENOMEM;
        goto out_no_cigar;
    }

    frmr = &ibmr->u.frmr;
    frmr->mr = ib_alloc_mr(rds_ibdev->pd, IB_MR_TYPE_MEM_REG,
             pool->max_pages);
    if (IS_ERR(frmr->mr)) {
        pr_warn("RDS/IB: %s failed to allocate MR", __func__);
        err = PTR_ERR(frmr->mr);
        goto out_no_cigar;
    }

    ibmr->pool = pool;
    if (pool->pool_type == RDS_IB_MR_8K_POOL)
        rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc);
    else
        rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc);

    if (atomic_read(&pool->item_count) > pool->max_items_soft)
        pool->max_items_soft = pool->max_items;

    frmr->fr_state = FRMR_IS_FREE;
    init_waitqueue_head(&frmr->fr_inv_done);
    init_waitqueue_head(&frmr->fr_reg_done);
    return ibmr;

out_no_cigar:
    kfree(ibmr);
    atomic_dec(&pool->item_count);
    return ERR_PTR(err);
}

static void rds_ib_free_frmr(struct rds_ib_mr *ibmr, bool drop)
{
    struct rds_ib_mr_pool *pool = ibmr->pool;

    if (drop)
        llist_add(&ibmr->llnode, &pool->drop_list);
    else
        llist_add(&ibmr->llnode, &pool->free_list);
    atomic_add(ibmr->sg_len, &pool->free_pinned);
    atomic_inc(&pool->dirty_count);

    /* If we've pinned too many pages, request a flush */
    if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
        atomic_read(&pool->dirty_count) >= pool->max_items / 5)
        queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10);
}

static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr)
{
    struct rds_ib_frmr *frmr = &ibmr->u.frmr;
    struct ib_reg_wr reg_wr;
    int ret, off = 0;

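    /* Throttle on the per-connection work-request budget: each posted
     * REG_MR or LOCAL_INV WR consumes one slot of i_fastreg_wrs, and
     * rds_ib_mr_cqe_handler() returns the slot on completion.
     */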
    while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) {
        atomic_inc(&ibmr->ic->i_fastreg_wrs);
        cpu_relax();
    }

    ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_dma_len,
                &off, PAGE_SIZE);
    if (unlikely(ret != ibmr->sg_dma_len))
        return ret < 0 ? ret : -EINVAL;

    if (cmpxchg(&frmr->fr_state,
            FRMR_IS_FREE, FRMR_IS_INUSE) != FRMR_IS_FREE)
        return -EBUSY;

    atomic_inc(&ibmr->ic->i_fastreg_inuse_count);

    /* Perform a WR for the fast_reg_mr. Each individual page
     * in the sg list is added to the fast reg page list and placed
     * inside the fast_reg_mr WR.  The key used is a rolling 8-bit
     * counter, which should guarantee uniqueness.
     */
    ib_update_fast_reg_key(frmr->mr, ibmr->remap_count++);
    frmr->fr_reg = true;

    memset(&reg_wr, 0, sizeof(reg_wr));
    reg_wr.wr.wr_id = (unsigned long)(void *)ibmr;
    reg_wr.wr.opcode = IB_WR_REG_MR;
    reg_wr.wr.num_sge = 0;
    reg_wr.mr = frmr->mr;
    reg_wr.key = frmr->mr->rkey;
    reg_wr.access = IB_ACCESS_LOCAL_WRITE |
            IB_ACCESS_REMOTE_READ |
            IB_ACCESS_REMOTE_WRITE;
    reg_wr.wr.send_flags = IB_SEND_SIGNALED;

    ret = ib_post_send(ibmr->ic->i_cm_id->qp, &reg_wr.wr, NULL);
    if (unlikely(ret)) {
        /* Failure here can be because of -ENOMEM as well */
        rds_transition_frwr_state(ibmr, FRMR_IS_INUSE, FRMR_IS_STALE);

        atomic_inc(&ibmr->ic->i_fastreg_wrs);
        if (printk_ratelimit())
            pr_warn("RDS/IB: %s returned error(%d)\n",
                __func__, ret);
        goto out;
    }

    /* Wait for the registration to complete in order to prevent an invalid
     * access error caused by the memory region being accessed while its
     * registration is still pending.
     */
    wait_event(frmr->fr_reg_done, !frmr->fr_reg);

out:

    return ret;
}

static int rds_ib_map_frmr(struct rds_ib_device *rds_ibdev,
               struct rds_ib_mr_pool *pool,
               struct rds_ib_mr *ibmr,
               struct scatterlist *sg, unsigned int sg_len)
{
    struct ib_device *dev = rds_ibdev->dev;
    struct rds_ib_frmr *frmr = &ibmr->u.frmr;
    int i;
    u32 len;
    int ret = 0;

    /* Tear down the old ibmr state here and fill it in with the
     * new sg values
     */
    rds_ib_teardown_mr(ibmr);

    ibmr->sg = sg;
    ibmr->sg_len = sg_len;
    ibmr->sg_dma_len = 0;
    frmr->sg_byte_len = 0;
    WARN_ON(ibmr->sg_dma_len);
    ibmr->sg_dma_len = ib_dma_map_sg(dev, ibmr->sg, ibmr->sg_len,
                     DMA_BIDIRECTIONAL);
    if (unlikely(!ibmr->sg_dma_len)) {
        pr_warn("RDS/IB: %s failed!\n", __func__);
        return -EBUSY;
    }

    frmr->sg_byte_len = 0;
    frmr->dma_npages = 0;
    len = 0;

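    /* Walk the DMA-mapped segments and count the pages to register.
     * Only the first segment may start on a non-page boundary and only
     * the last segment may end on one; any other misalignment cannot be
     * described by a single FRMR, so the mapping is rejected.
     */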
    ret = -EINVAL;
    for (i = 0; i < ibmr->sg_dma_len; ++i) {
        unsigned int dma_len = sg_dma_len(&ibmr->sg[i]);
        u64 dma_addr = sg_dma_address(&ibmr->sg[i]);

        frmr->sg_byte_len += dma_len;
        if (dma_addr & ~PAGE_MASK) {
            if (i > 0)
                goto out_unmap;
            else
                ++frmr->dma_npages;
        }

        if ((dma_addr + dma_len) & ~PAGE_MASK) {
            if (i < ibmr->sg_dma_len - 1)
                goto out_unmap;
            else
                ++frmr->dma_npages;
        }

        len += dma_len;
    }
    frmr->dma_npages += len >> PAGE_SHIFT;

    if (frmr->dma_npages > ibmr->pool->max_pages) {
        ret = -EMSGSIZE;
        goto out_unmap;
    }

    ret = rds_ib_post_reg_frmr(ibmr);
    if (ret)
        goto out_unmap;

    if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL)
        rds_ib_stats_inc(s_ib_rdma_mr_8k_used);
    else
        rds_ib_stats_inc(s_ib_rdma_mr_1m_used);

    return ret;

out_unmap:
    ib_dma_unmap_sg(rds_ibdev->dev, ibmr->sg, ibmr->sg_len,
            DMA_BIDIRECTIONAL);
    ibmr->sg_dma_len = 0;
    return ret;
}

static int rds_ib_post_inv(struct rds_ib_mr *ibmr)
{
    struct ib_send_wr *s_wr;
    struct rds_ib_frmr *frmr = &ibmr->u.frmr;
    struct rdma_cm_id *i_cm_id = ibmr->ic->i_cm_id;
    int ret = -EINVAL;

    if (!i_cm_id || !i_cm_id->qp || !frmr->mr)
        goto out;

    if (frmr->fr_state != FRMR_IS_INUSE)
        goto out;

    while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) {
        atomic_inc(&ibmr->ic->i_fastreg_wrs);
        cpu_relax();
    }

    frmr->fr_inv = true;
    s_wr = &frmr->fr_wr;

    memset(s_wr, 0, sizeof(*s_wr));
    s_wr->wr_id = (unsigned long)(void *)ibmr;
    s_wr->opcode = IB_WR_LOCAL_INV;
    s_wr->ex.invalidate_rkey = frmr->mr->rkey;
    s_wr->send_flags = IB_SEND_SIGNALED;

    ret = ib_post_send(i_cm_id->qp, s_wr, NULL);
    if (unlikely(ret)) {
        rds_transition_frwr_state(ibmr, FRMR_IS_INUSE, FRMR_IS_STALE);
        frmr->fr_inv = false;
        /* enforce order of frmr->fr_inv update
         * before incrementing i_fastreg_wrs
         */
        smp_mb__before_atomic();
        atomic_inc(&ibmr->ic->i_fastreg_wrs);
        pr_err("RDS/IB: %s returned error(%d)\n", __func__, ret);
        goto out;
    }

    /* Wait for the FRMR_IS_FREE (or FRMR_IS_STALE) transition in order to
     * 1) avoid a silly bouncing between "clean_list" and "drop_list"
     *    triggered by function "rds_ib_reg_frmr" as it releases frmr
     *    regions whose state is not "FRMR_IS_FREE" right away.
     * 2) prevent an invalid access error in a race
     *    from a pending "IB_WR_LOCAL_INV" operation
     *    with a teardown ("dma_unmap_sg", "put_page")
     *    and de-registration ("ib_dereg_mr") of the corresponding
     *    memory region.
     */
    wait_event(frmr->fr_inv_done, frmr->fr_state != FRMR_IS_INUSE);

out:
    return ret;
}

void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
{
    struct rds_ib_mr *ibmr = (void *)(unsigned long)wc->wr_id;
    struct rds_ib_frmr *frmr = &ibmr->u.frmr;

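    /* This completion handler runs for both REG_MR and LOCAL_INV work
     * requests: a failed completion marks the MR stale, and the waiters
     * in rds_ib_post_reg_frmr()/rds_ib_post_inv() are woken below once
     * the corresponding flag is cleared.
     */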
    if (wc->status != IB_WC_SUCCESS) {
        rds_transition_frwr_state(ibmr, FRMR_IS_INUSE, FRMR_IS_STALE);
        if (rds_conn_up(ic->conn))
            rds_ib_conn_error(ic->conn,
                      "frmr completion <%pI4,%pI4> status %u(%s), vendor_err 0x%x, disconnecting and reconnecting\n",
                      &ic->conn->c_laddr,
                      &ic->conn->c_faddr,
                      wc->status,
                      ib_wc_status_msg(wc->status),
                      wc->vendor_err);
    }

    if (frmr->fr_inv) {
        rds_transition_frwr_state(ibmr, FRMR_IS_INUSE, FRMR_IS_FREE);
        frmr->fr_inv = false;
        wake_up(&frmr->fr_inv_done);
    }

    if (frmr->fr_reg) {
        frmr->fr_reg = false;
        wake_up(&frmr->fr_reg_done);
    }

    /* enforce order of frmr->{fr_reg,fr_inv} update
     * before incrementing i_fastreg_wrs
     */
    smp_mb__before_atomic();
    atomic_inc(&ic->i_fastreg_wrs);
}

void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed,
               unsigned long *unpinned, unsigned int goal)
{
    struct rds_ib_mr *ibmr, *next;
    struct rds_ib_frmr *frmr;
    int ret = 0, ret2;
    unsigned int freed = *nfreed;

    /* Post a LOCAL_INV work request for every MR that still has a DMA mapping */
    list_for_each_entry(ibmr, list, unmap_list) {
        if (ibmr->sg_dma_len) {
            ret2 = rds_ib_post_inv(ibmr);
            if (ret2 && !ret)
                ret = ret2;
        }
    }

    if (ret)
        pr_warn("RDS/IB: %s failed (err=%d)\n", __func__, ret);

    /* Now we can destroy the DMA mapping and unpin any pages */
    list_for_each_entry_safe(ibmr, next, list, unmap_list) {
        *unpinned += ibmr->sg_len;
        frmr = &ibmr->u.frmr;
        __rds_ib_teardown_mr(ibmr);
        if (freed < goal || frmr->fr_state == FRMR_IS_STALE) {
            /* Don't de-allocate if the MR is not free yet */
            if (frmr->fr_state == FRMR_IS_INUSE)
                continue;

            if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL)
                rds_ib_stats_inc(s_ib_rdma_mr_8k_free);
            else
                rds_ib_stats_inc(s_ib_rdma_mr_1m_free);
            list_del(&ibmr->unmap_list);
            if (frmr->mr)
                ib_dereg_mr(frmr->mr);
            kfree(ibmr);
            freed++;
        }
    }
    *nfreed = freed;
}

struct rds_ib_mr *rds_ib_reg_frmr(struct rds_ib_device *rds_ibdev,
                  struct rds_ib_connection *ic,
                  struct scatterlist *sg,
                  unsigned long nents, u32 *key)
{
    struct rds_ib_mr *ibmr = NULL;
    struct rds_ib_frmr *frmr;
    int ret;

    if (!ic) {
        /* TODO: Add FRWR support for RDS_GET_MR using proxy qp */
        return ERR_PTR(-EOPNOTSUPP);
    }

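    /* Keep allocating until we get an MR that is actually in the
     * FRMR_IS_FREE state; anything else is pushed onto the pool's
     * drop list so it will be invalidated and cleaned up later.
     */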
    do {
        if (ibmr)
            rds_ib_free_frmr(ibmr, true);
        ibmr = rds_ib_alloc_frmr(rds_ibdev, nents);
        if (IS_ERR(ibmr))
            return ibmr;
        frmr = &ibmr->u.frmr;
    } while (frmr->fr_state != FRMR_IS_FREE);

    ibmr->ic = ic;
    ibmr->device = rds_ibdev;
    ret = rds_ib_map_frmr(rds_ibdev, ibmr->pool, ibmr, sg, nents);
    if (ret == 0) {
        *key = frmr->mr->rkey;
    } else {
        rds_ib_free_frmr(ibmr, false);
        ibmr = ERR_PTR(ret);
    }

    return ibmr;
}

void rds_ib_free_frmr_list(struct rds_ib_mr *ibmr)
{
    struct rds_ib_mr_pool *pool = ibmr->pool;
    struct rds_ib_frmr *frmr = &ibmr->u.frmr;

    if (frmr->fr_state == FRMR_IS_STALE)
        llist_add(&ibmr->llnode, &pool->drop_list);
    else
        llist_add(&ibmr->llnode, &pool->free_list);
}