0001 /*
0002  * Copyright (c) 2012 Mellanox Technologies. All rights reserved.
0003  *
0004  * This software is available to you under a choice of one of two
0005  * licenses.  You may choose to be licensed under the terms of the GNU
0006  * General Public License (GPL) Version 2, available from the file
0007  * COPYING in the main directory of this source tree, or the
0008  * OpenIB.org BSD license below:
0009  *
0010  *     Redistribution and use in source and binary forms, with or
0011  *     without modification, are permitted provided that the following
0012  *     conditions are met:
0013  *
0014  *      - Redistributions of source code must retain the above
0015  *        copyright notice, this list of conditions and the following
0016  *        disclaimer.
0017  *
0018  *      - Redistributions in binary form must reproduce the above
0019  *        copyright notice, this list of conditions and the following
0020  *        disclaimer in the documentation and/or other materials
0021  *        provided with the distribution.
0022  *
0023  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
0024  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
0025  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
0026  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
0027  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
0028  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
0029  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
0030  * SOFTWARE.
0031  */
0032 
0033 #include <rdma/ib_mad.h>
0034 #include <rdma/ib_smi.h>
0035 #include <rdma/ib_cache.h>
0036 #include <rdma/ib_sa.h>
0037 
0038 #include <linux/mlx4/cmd.h>
0039 #include <linux/rbtree.h>
0040 #include <linux/delay.h>
0041 
0042 #include "mlx4_ib.h"
0043 
0044 #define MAX_VFS     80
0045 #define MAX_PEND_REQS_PER_FUNC 4
0046 #define MAD_TIMEOUT_MS  2000
0047 
0048 #define mcg_warn(fmt, arg...)   pr_warn("MCG WARNING: " fmt, ##arg)
0049 #define mcg_error(fmt, arg...)  pr_err(fmt, ##arg)
0050 #define mcg_warn_group(group, format, arg...) \
0051     pr_warn("%s-%d: %16s (port %d): WARNING: " format, __func__, __LINE__,\
0052     (group)->name, (group)->demux->port, ## arg)
0053 
0054 #define mcg_debug_group(group, format, arg...) \
0055     pr_debug("%s-%d: %16s (port %d): WARNING: " format, __func__, __LINE__,\
0056          (group)->name, (group)->demux->port, ## arg)
0057 
0058 #define mcg_error_group(group, format, arg...) \
0059     pr_err("  %16s: " format, (group)->name, ## arg)
0060 
0061 
0062 static union ib_gid mgid0;
0063 
0064 static struct workqueue_struct *clean_wq;
0065 
0066 enum mcast_state {
0067     MCAST_NOT_MEMBER = 0,
0068     MCAST_MEMBER,
0069 };
0070 
0071 enum mcast_group_state {
0072     MCAST_IDLE,
0073     MCAST_JOIN_SENT,
0074     MCAST_LEAVE_SENT,
0075     MCAST_RESP_READY
0076 };
0077 
0078 struct mcast_member {
0079     enum mcast_state state;
0080     uint8_t         join_state;
0081     int         num_pend_reqs;
0082     struct list_head    pending;
0083 };
0084 
0085 struct ib_sa_mcmember_data {
0086     union ib_gid    mgid;
0087     union ib_gid    port_gid;
0088     __be32      qkey;
0089     __be16      mlid;
0090     u8      mtusel_mtu;
0091     u8      tclass;
0092     __be16      pkey;
0093     u8      ratesel_rate;
0094     u8      lifetmsel_lifetm;
0095     __be32      sl_flowlabel_hoplimit;
0096     u8      scope_join_state;
0097     u8      proxy_join;
0098     u8      reserved[2];
0099 } __packed __aligned(4);
0100 
0101 struct mcast_group {
0102     struct ib_sa_mcmember_data rec;
0103     struct rb_node      node;
0104     struct list_head    mgid0_list;
0105     struct mlx4_ib_demux_ctx *demux;
0106     struct mcast_member func[MAX_VFS];
0107     struct mutex        lock;
0108     struct work_struct  work;
0109     struct list_head    pending_list;
0110     int         members[3];
0111     enum mcast_group_state  state;
0112     enum mcast_group_state  prev_state;
0113     struct ib_sa_mad    response_sa_mad;
0114     __be64          last_req_tid;
0115 
0116     char            name[33]; /* MGID string */
0117     struct device_attribute dentry;
0118 
0119     /* refcount is the reference count for the following:
0120        1. Each queued request
0121        2. Each invocation of the worker thread
0122        3. Membership of the port at the SA
0123     */
0124     atomic_t        refcount;
0125 
0126     /* delayed work to clean pending SM request */
0127     struct delayed_work timeout_work;
0128     struct list_head    cleanup_list;
0129 };
0130 
0131 struct mcast_req {
0132     int         func;
0133     struct ib_sa_mad    sa_mad;
0134     struct list_head    group_list;
0135     struct list_head    func_list;
0136     struct mcast_group  *group;
0137     int         clean;
0138 };
0139 
0140 
0141 #define safe_atomic_dec(ref) \
0142     do {\
0143         if (atomic_dec_and_test(ref)) \
0144             mcg_warn_group(group, "did not expect to reach zero\n"); \
0145     } while (0)
0146 
0147 static const char *get_state_string(enum mcast_group_state state)
0148 {
0149     switch (state) {
0150     case MCAST_IDLE:
0151         return "MCAST_IDLE";
0152     case MCAST_JOIN_SENT:
0153         return "MCAST_JOIN_SENT";
0154     case MCAST_LEAVE_SENT:
0155         return "MCAST_LEAVE_SENT";
0156     case MCAST_RESP_READY:
0157         return "MCAST_RESP_READY";
0158     }
0159     return "Invalid State";
0160 }
0161 
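     /* Look up a multicast group in the per-port rb-tree, keyed by raw MGID. */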
0162 static struct mcast_group *mcast_find(struct mlx4_ib_demux_ctx *ctx,
0163                       union ib_gid *mgid)
0164 {
0165     struct rb_node *node = ctx->mcg_table.rb_node;
0166     struct mcast_group *group;
0167     int ret;
0168 
0169     while (node) {
0170         group = rb_entry(node, struct mcast_group, node);
0171         ret = memcmp(mgid->raw, group->rec.mgid.raw, sizeof *mgid);
0172         if (!ret)
0173             return group;
0174 
0175         if (ret < 0)
0176             node = node->rb_left;
0177         else
0178             node = node->rb_right;
0179     }
0180     return NULL;
0181 }
0182 
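     /* Insert a group into the rb-tree keyed by MGID; returns the already
      * existing group on a collision, or NULL when the insert succeeded. */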
0183 static struct mcast_group *mcast_insert(struct mlx4_ib_demux_ctx *ctx,
0184                     struct mcast_group *group)
0185 {
0186     struct rb_node **link = &ctx->mcg_table.rb_node;
0187     struct rb_node *parent = NULL;
0188     struct mcast_group *cur_group;
0189     int ret;
0190 
0191     while (*link) {
0192         parent = *link;
0193         cur_group = rb_entry(parent, struct mcast_group, node);
0194 
0195         ret = memcmp(group->rec.mgid.raw, cur_group->rec.mgid.raw,
0196                  sizeof group->rec.mgid);
0197         if (ret < 0)
0198             link = &(*link)->rb_left;
0199         else if (ret > 0)
0200             link = &(*link)->rb_right;
0201         else
0202             return cur_group;
0203     }
0204     rb_link_node(&group->node, parent, link);
0205     rb_insert_color(&group->node, &ctx->mcg_table);
0206     return NULL;
0207 }
0208 
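     /* Forward a MAD to the SA over the wire on the master's GSI QP, using the
      * cached SM address handle for the port; fails with -EAGAIN until the port
      * is active and the SM AH is available. */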
0209 static int send_mad_to_wire(struct mlx4_ib_demux_ctx *ctx, struct ib_mad *mad)
0210 {
0211     struct mlx4_ib_dev *dev = ctx->dev;
0212     struct rdma_ah_attr ah_attr;
0213     unsigned long flags;
0214 
0215     spin_lock_irqsave(&dev->sm_lock, flags);
0216     if (!dev->sm_ah[ctx->port - 1]) {
0217         /* port is not yet Active, sm_ah not ready */
0218         spin_unlock_irqrestore(&dev->sm_lock, flags);
0219         return -EAGAIN;
0220     }
0221     mlx4_ib_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr);
0222     spin_unlock_irqrestore(&dev->sm_lock, flags);
0223     return mlx4_ib_send_to_wire(dev, mlx4_master_func_num(dev->dev),
0224                     ctx->port, IB_QPT_GSI, 0, 1, IB_QP1_QKEY,
0225                     &ah_attr, NULL, 0xffff, mad);
0226 }
0227 
0228 static int send_mad_to_slave(int slave, struct mlx4_ib_demux_ctx *ctx,
0229                  struct ib_mad *mad)
0230 {
0231     struct mlx4_ib_dev *dev = ctx->dev;
0232     struct ib_mad_agent *agent = dev->send_agent[ctx->port - 1][1];
0233     struct ib_wc wc;
0234     struct rdma_ah_attr ah_attr;
0235 
0236     /* Our agent might not yet be registered when mads start to arrive */
0237     if (!agent)
0238         return -EAGAIN;
0239 
0240     rdma_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr);
0241 
0242     if (ib_find_cached_pkey(&dev->ib_dev, ctx->port, IB_DEFAULT_PKEY_FULL, &wc.pkey_index))
0243         return -EINVAL;
0244     wc.sl = 0;
0245     wc.dlid_path_bits = 0;
0246     wc.port_num = ctx->port;
0247     wc.slid = rdma_ah_get_dlid(&ah_attr);  /* opensm lid */
0248     wc.src_qp = 1;
0249     return mlx4_ib_send_to_slave(dev, slave, ctx->port, IB_QPT_GSI, &wc, NULL, mad);
0250 }
0251 
0252 static int send_join_to_wire(struct mcast_group *group, struct ib_sa_mad *sa_mad)
0253 {
0254     struct ib_sa_mad mad;
0255     struct ib_sa_mcmember_data *sa_mad_data = (struct ib_sa_mcmember_data *)&mad.data;
0256     int ret;
0257 
0258     /* we rely on the MAD request as it arrived from the VF */
0259     memcpy(&mad, sa_mad, sizeof mad);
0260 
0261     /* fix port GID to be the real one (slave 0) */
0262     sa_mad_data->port_gid.global.interface_id = group->demux->guid_cache[0];
0263 
0264     /* assign our own TID */
0265     mad.mad_hdr.tid = mlx4_ib_get_new_demux_tid(group->demux);
0266     group->last_req_tid = mad.mad_hdr.tid; /* keep it for later validation */
0267 
0268     ret = send_mad_to_wire(group->demux, (struct ib_mad *)&mad);
0269     /* set timeout handler */
0270     if (!ret) {
0271         /* calls mlx4_ib_mcg_timeout_handler */
0272         queue_delayed_work(group->demux->mcg_wq, &group->timeout_work,
0273                 msecs_to_jiffies(MAD_TIMEOUT_MS));
0274     }
0275 
0276     return ret;
0277 }
0278 
0279 static int send_leave_to_wire(struct mcast_group *group, u8 join_state)
0280 {
0281     struct ib_sa_mad mad;
0282     struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)&mad.data;
0283     int ret;
0284 
0285     memset(&mad, 0, sizeof mad);
0286     mad.mad_hdr.base_version = 1;
0287     mad.mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
0288     mad.mad_hdr.class_version = 2;
0289     mad.mad_hdr.method = IB_SA_METHOD_DELETE;
0290     mad.mad_hdr.status = cpu_to_be16(0);
0291     mad.mad_hdr.class_specific = cpu_to_be16(0);
0292     mad.mad_hdr.tid = mlx4_ib_get_new_demux_tid(group->demux);
0293     group->last_req_tid = mad.mad_hdr.tid; /* keep it for later validation */
0294     mad.mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC);
0295     mad.mad_hdr.attr_mod = cpu_to_be32(0);
0296     mad.sa_hdr.sm_key = 0x0;
0297     mad.sa_hdr.attr_offset = cpu_to_be16(7);
0298     mad.sa_hdr.comp_mask = IB_SA_MCMEMBER_REC_MGID |
0299         IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_JOIN_STATE;
0300 
0301     *sa_data = group->rec;
0302     sa_data->scope_join_state = join_state;
0303 
0304     ret = send_mad_to_wire(group->demux, (struct ib_mad *)&mad);
0305     if (ret)
0306         group->state = MCAST_IDLE;
0307 
0308     /* set timeout handler */
0309     if (!ret) {
0310         /* calls mlx4_ib_mcg_timeout_handler */
0311         queue_delayed_work(group->demux->mcg_wq, &group->timeout_work,
0312                 msecs_to_jiffies(MAD_TIMEOUT_MS));
0313     }
0314 
0315     return ret;
0316 }
0317 
0318 static int send_reply_to_slave(int slave, struct mcast_group *group,
0319         struct ib_sa_mad *req_sa_mad, u16 status)
0320 {
0321     struct ib_sa_mad mad;
0322     struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)&mad.data;
0323     struct ib_sa_mcmember_data *req_sa_data = (struct ib_sa_mcmember_data *)&req_sa_mad->data;
0324     int ret;
0325 
0326     memset(&mad, 0, sizeof mad);
0327     mad.mad_hdr.base_version = 1;
0328     mad.mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
0329     mad.mad_hdr.class_version = 2;
0330     mad.mad_hdr.method = IB_MGMT_METHOD_GET_RESP;
0331     mad.mad_hdr.status = cpu_to_be16(status);
0332     mad.mad_hdr.class_specific = cpu_to_be16(0);
0333     mad.mad_hdr.tid = req_sa_mad->mad_hdr.tid;
0334     *(u8 *)&mad.mad_hdr.tid = 0; /* resetting tid to 0 */
0335     mad.mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC);
0336     mad.mad_hdr.attr_mod = cpu_to_be32(0);
0337     mad.sa_hdr.sm_key = req_sa_mad->sa_hdr.sm_key;
0338     mad.sa_hdr.attr_offset = cpu_to_be16(7);
0339     mad.sa_hdr.comp_mask = 0; /* ignored on responses, see IBTA spec */
0340 
0341     *sa_data = group->rec;
0342 
0343     /* reconstruct VF's requested join_state and port_gid */
0344     sa_data->scope_join_state &= 0xf0;
0345     sa_data->scope_join_state |= (group->func[slave].join_state & 0x0f);
0346     memcpy(&sa_data->port_gid, &req_sa_data->port_gid, sizeof req_sa_data->port_gid);
0347 
0348     ret = send_mad_to_slave(slave, group->demux, (struct ib_mad *)&mad);
0349     return ret;
0350 }
0351 
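     /* Check the group's value (src) against the requested value (dst) according
      * to the 2-bit selector encoded in the top bits of dst; returns nonzero if
      * the group record does not satisfy the request. */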
0352 static int check_selector(ib_sa_comp_mask comp_mask,
0353               ib_sa_comp_mask selector_mask,
0354               ib_sa_comp_mask value_mask,
0355               u8 src_value, u8 dst_value)
0356 {
0357     int err;
0358     u8 selector = dst_value >> 6;
0359     dst_value &= 0x3f;
0360     src_value &= 0x3f;
0361 
0362     if (!(comp_mask & selector_mask) || !(comp_mask & value_mask))
0363         return 0;
0364 
0365     switch (selector) {
0366     case IB_SA_GT:
0367         err = (src_value <= dst_value);
0368         break;
0369     case IB_SA_LT:
0370         err = (src_value >= dst_value);
0371         break;
0372     case IB_SA_EQ:
0373         err = (src_value != dst_value);
0374         break;
0375     default:
0376         err = 0;
0377         break;
0378     }
0379 
0380     return err;
0381 }
0382 
0383 static u16 cmp_rec(struct ib_sa_mcmember_data *src,
0384            struct ib_sa_mcmember_data *dst, ib_sa_comp_mask comp_mask)
0385 {
0386     /* src is group record, dst is request record */
0387     /* MGID must already match */
0388     /* Port_GID is always replaced with our own Port_GID, so it always matches */
0389 
0390 #define MAD_STATUS_REQ_INVALID 0x0200
0391     if (comp_mask & IB_SA_MCMEMBER_REC_QKEY && src->qkey != dst->qkey)
0392         return MAD_STATUS_REQ_INVALID;
0393     if (comp_mask & IB_SA_MCMEMBER_REC_MLID && src->mlid != dst->mlid)
0394         return MAD_STATUS_REQ_INVALID;
0395     if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR,
0396                  IB_SA_MCMEMBER_REC_MTU,
0397                  src->mtusel_mtu, dst->mtusel_mtu))
0398         return MAD_STATUS_REQ_INVALID;
0399     if (comp_mask & IB_SA_MCMEMBER_REC_TRAFFIC_CLASS &&
0400         src->tclass != dst->tclass)
0401         return MAD_STATUS_REQ_INVALID;
0402     if (comp_mask & IB_SA_MCMEMBER_REC_PKEY && src->pkey != dst->pkey)
0403         return MAD_STATUS_REQ_INVALID;
0404     if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR,
0405                  IB_SA_MCMEMBER_REC_RATE,
0406                  src->ratesel_rate, dst->ratesel_rate))
0407         return MAD_STATUS_REQ_INVALID;
0408     if (check_selector(comp_mask,
0409                  IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR,
0410                  IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME,
0411                  src->lifetmsel_lifetm, dst->lifetmsel_lifetm))
0412         return MAD_STATUS_REQ_INVALID;
0413     if (comp_mask & IB_SA_MCMEMBER_REC_SL &&
0414             (be32_to_cpu(src->sl_flowlabel_hoplimit) & 0xf0000000) !=
0415             (be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0xf0000000))
0416         return MAD_STATUS_REQ_INVALID;
0417     if (comp_mask & IB_SA_MCMEMBER_REC_FLOW_LABEL &&
0418             (be32_to_cpu(src->sl_flowlabel_hoplimit) & 0x0fffff00) !=
0419             (be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0x0fffff00))
0420         return MAD_STATUS_REQ_INVALID;
0421     if (comp_mask & IB_SA_MCMEMBER_REC_HOP_LIMIT &&
0422             (be32_to_cpu(src->sl_flowlabel_hoplimit) & 0x000000ff) !=
0423             (be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0x000000ff))
0424         return MAD_STATUS_REQ_INVALID;
0425     if (comp_mask & IB_SA_MCMEMBER_REC_SCOPE &&
0426             (src->scope_join_state & 0xf0) !=
0427             (dst->scope_join_state & 0xf0))
0428         return MAD_STATUS_REQ_INVALID;
0429 
0430     /* join_state checked separately, proxy_join ignored */
0431 
0432     return 0;
0433 }
0434 
0435 /* Release a group reference; returns 1 if this was the last reference and the
0436  * group was destroyed. Timeout work is cancelled synchronously. */
0437 static int release_group(struct mcast_group *group, int from_timeout_handler)
0438 {
0439     struct mlx4_ib_demux_ctx *ctx = group->demux;
0440     int nzgroup;
0441 
0442     mutex_lock(&ctx->mcg_table_lock);
0443     mutex_lock(&group->lock);
0444     if (atomic_dec_and_test(&group->refcount)) {
0445         if (!from_timeout_handler) {
0446             if (group->state != MCAST_IDLE &&
0447                 !cancel_delayed_work(&group->timeout_work)) {
0448                 atomic_inc(&group->refcount);
0449                 mutex_unlock(&group->lock);
0450                 mutex_unlock(&ctx->mcg_table_lock);
0451                 return 0;
0452             }
0453         }
0454 
0455         nzgroup = memcmp(&group->rec.mgid, &mgid0, sizeof mgid0);
0456         if (nzgroup)
0457             del_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr);
0458         if (!list_empty(&group->pending_list))
0459             mcg_warn_group(group, "releasing a group with non empty pending list\n");
0460         if (nzgroup)
0461             rb_erase(&group->node, &ctx->mcg_table);
0462         list_del_init(&group->mgid0_list);
0463         mutex_unlock(&group->lock);
0464         mutex_unlock(&ctx->mcg_table_lock);
0465         kfree(group);
0466         return 1;
0467     } else {
0468         mutex_unlock(&group->lock);
0469         mutex_unlock(&ctx->mcg_table_lock);
0470     }
0471     return 0;
0472 }
0473 
0474 static void adjust_membership(struct mcast_group *group, u8 join_state, int inc)
0475 {
0476     int i;
0477 
0478     for (i = 0; i < 3; i++, join_state >>= 1)
0479         if (join_state & 0x1)
0480             group->members[i] += inc;
0481 }
0482 
0483 static u8 get_leave_state(struct mcast_group *group)
0484 {
0485     u8 leave_state = 0;
0486     int i;
0487 
0488     for (i = 0; i < 3; i++)
0489         if (!group->members[i])
0490             leave_state |= (1 << i);
0491 
0492     return leave_state & (group->rec.scope_join_state & 0xf);
0493 }
0494 
0495 static int join_group(struct mcast_group *group, int slave, u8 join_mask)
0496 {
0497     int ret = 0;
0498     u8 join_state;
0499 
0500     /* remove bits that slave is already member of, and adjust */
0501     join_state = join_mask & (~group->func[slave].join_state);
0502     adjust_membership(group, join_state, 1);
0503     group->func[slave].join_state |= join_state;
0504     if (group->func[slave].state != MCAST_MEMBER && join_state) {
0505         group->func[slave].state = MCAST_MEMBER;
0506         ret = 1;
0507     }
0508     return ret;
0509 }
0510 
0511 static int leave_group(struct mcast_group *group, int slave, u8 leave_state)
0512 {
0513     int ret = 0;
0514 
0515     adjust_membership(group, leave_state, -1);
0516     group->func[slave].join_state &= ~leave_state;
0517     if (!group->func[slave].join_state) {
0518         group->func[slave].state = MCAST_NOT_MEMBER;
0519         ret = 1;
0520     }
0521     return ret;
0522 }
0523 
0524 static int check_leave(struct mcast_group *group, int slave, u8 leave_mask)
0525 {
0526     if (group->func[slave].state != MCAST_MEMBER)
0527         return MAD_STATUS_REQ_INVALID;
0528 
0529     /* make sure we're not deleting unset bits */
0530     if (~group->func[slave].join_state & leave_mask)
0531         return MAD_STATUS_REQ_INVALID;
0532 
0533     if (!leave_mask)
0534         return MAD_STATUS_REQ_INVALID;
0535 
0536     return 0;
0537 }
0538 
0539 static void mlx4_ib_mcg_timeout_handler(struct work_struct *work)
0540 {
0541     struct delayed_work *delay = to_delayed_work(work);
0542     struct mcast_group *group;
0543     struct mcast_req *req = NULL;
0544 
0545     group = container_of(delay, typeof(*group), timeout_work);
0546 
0547     mutex_lock(&group->lock);
0548     if (group->state == MCAST_JOIN_SENT) {
0549         if (!list_empty(&group->pending_list)) {
0550             req = list_first_entry(&group->pending_list, struct mcast_req, group_list);
0551             list_del(&req->group_list);
0552             list_del(&req->func_list);
0553             --group->func[req->func].num_pend_reqs;
0554             mutex_unlock(&group->lock);
0555             kfree(req);
0556             if (memcmp(&group->rec.mgid, &mgid0, sizeof mgid0)) {
0557                 if (release_group(group, 1))
0558                     return;
0559             } else {
0560                 kfree(group);
0561                 return;
0562             }
0563             mutex_lock(&group->lock);
0564         } else
0565             mcg_warn_group(group, "DRIVER BUG\n");
0566     } else if (group->state == MCAST_LEAVE_SENT) {
0567         if (group->rec.scope_join_state & 0xf)
0568             group->rec.scope_join_state &= 0xf0;
0569         group->state = MCAST_IDLE;
0570         mutex_unlock(&group->lock);
0571         if (release_group(group, 1))
0572             return;
0573         mutex_lock(&group->lock);
0574     } else
0575         mcg_warn_group(group, "invalid state %s\n", get_state_string(group->state));
0576     group->state = MCAST_IDLE;
0577     atomic_inc(&group->refcount);
0578     if (!queue_work(group->demux->mcg_wq, &group->work))
0579         safe_atomic_dec(&group->refcount);
0580 
0581     mutex_unlock(&group->lock);
0582 }
0583 
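     /* Process a queued leave request: validate it, update membership counters,
      * reply to the VF (unless this is an internal cleanup request) and free the
      * request. Always returns 1 so the caller drops one group reference. */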
0584 static int handle_leave_req(struct mcast_group *group, u8 leave_mask,
0585                 struct mcast_req *req)
0586 {
0587     u16 status;
0588 
0589     if (req->clean)
0590         leave_mask = group->func[req->func].join_state;
0591 
0592     status = check_leave(group, req->func, leave_mask);
0593     if (!status)
0594         leave_group(group, req->func, leave_mask);
0595 
0596     if (!req->clean)
0597         send_reply_to_slave(req->func, group, &req->sa_mad, status);
0598     --group->func[req->func].num_pend_reqs;
0599     list_del(&req->group_list);
0600     list_del(&req->func_list);
0601     kfree(req);
0602     return 1;
0603 }
0604 
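     /* Process a queued join request. If the port's SA membership already covers
      * the requested join state, reply to the VF immediately; otherwise send a
      * join to the SA and leave the request pending. Returns the number of group
      * references the caller should drop. */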
0605 static int handle_join_req(struct mcast_group *group, u8 join_mask,
0606                struct mcast_req *req)
0607 {
0608     u8 group_join_state = group->rec.scope_join_state & 0xf;
0609     int ref = 0;
0610     u16 status;
0611     struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)req->sa_mad.data;
0612 
0613     if (join_mask == (group_join_state & join_mask)) {
0614         /* port's membership need not change */
0615         status = cmp_rec(&group->rec, sa_data, req->sa_mad.sa_hdr.comp_mask);
0616         if (!status)
0617             join_group(group, req->func, join_mask);
0618 
0619         --group->func[req->func].num_pend_reqs;
0620         send_reply_to_slave(req->func, group, &req->sa_mad, status);
0621         list_del(&req->group_list);
0622         list_del(&req->func_list);
0623         kfree(req);
0624         ++ref;
0625     } else {
0626         /* port's membership needs to be updated */
0627         group->prev_state = group->state;
0628         if (send_join_to_wire(group, &req->sa_mad)) {
0629             --group->func[req->func].num_pend_reqs;
0630             list_del(&req->group_list);
0631             list_del(&req->func_list);
0632             kfree(req);
0633             ref = 1;
0634             group->state = group->prev_state;
0635         } else
0636             group->state = MCAST_JOIN_SENT;
0637     }
0638 
0639     return ref;
0640 }
0641 
0642 static void mlx4_ib_mcg_work_handler(struct work_struct *work)
0643 {
0644     struct mcast_group *group;
0645     struct mcast_req *req = NULL;
0646     struct ib_sa_mcmember_data *sa_data;
0647     u8 req_join_state;
0648     int rc = 1; /* release_count - this is for the scheduled work */
0649     u16 status;
0650     u8 method;
0651 
0652     group = container_of(work, typeof(*group), work);
0653 
0654     mutex_lock(&group->lock);
0655 
0656     /* First, check whether an SM response is pending for this group. If so,
0657      * update the group's record. If the response reports failure, the failure
0658      * status may need to be forwarded to a VF waiting for it; if the response
0659      * is good, any waiting VF is answered later in this function. */
0660     if (group->state == MCAST_RESP_READY) {
0661         /* cancels mlx4_ib_mcg_timeout_handler */
0662         cancel_delayed_work(&group->timeout_work);
0663         status = be16_to_cpu(group->response_sa_mad.mad_hdr.status);
0664         method = group->response_sa_mad.mad_hdr.method;
0665         if (group->last_req_tid != group->response_sa_mad.mad_hdr.tid) {
0666             mcg_warn_group(group, "Got MAD response to existing MGID but wrong TID, dropping. Resp TID=%llx, group TID=%llx\n",
0667                 be64_to_cpu(group->response_sa_mad.mad_hdr.tid),
0668                 be64_to_cpu(group->last_req_tid));
0669             group->state = group->prev_state;
0670             goto process_requests;
0671         }
0672         if (status) {
0673             if (!list_empty(&group->pending_list))
0674                 req = list_first_entry(&group->pending_list,
0675                         struct mcast_req, group_list);
0676             if (method == IB_MGMT_METHOD_GET_RESP) {
0677                     if (req) {
0678                         send_reply_to_slave(req->func, group, &req->sa_mad, status);
0679                         --group->func[req->func].num_pend_reqs;
0680                         list_del(&req->group_list);
0681                         list_del(&req->func_list);
0682                         kfree(req);
0683                         ++rc;
0684                     } else
0685                         mcg_warn_group(group, "no request for failed join\n");
0686             } else if (method == IB_SA_METHOD_DELETE_RESP && group->demux->flushing)
0687                 ++rc;
0688         } else {
0689             u8 resp_join_state;
0690             u8 cur_join_state;
0691 
0692             resp_join_state = ((struct ib_sa_mcmember_data *)
0693                         group->response_sa_mad.data)->scope_join_state & 0xf;
0694             cur_join_state = group->rec.scope_join_state & 0xf;
0695 
0696             if (method == IB_MGMT_METHOD_GET_RESP) {
0697                 /* successful join */
0698                 if (!cur_join_state && resp_join_state)
0699                     --rc;
0700             } else if (!resp_join_state)
0701                     ++rc;
0702             memcpy(&group->rec, group->response_sa_mad.data, sizeof group->rec);
0703         }
0704         group->state = MCAST_IDLE;
0705     }
0706 
0707 process_requests:
0708     /* We should now go over pending join/leave requests, as long as we are idle. */
0709     while (!list_empty(&group->pending_list) && group->state == MCAST_IDLE) {
0710         req = list_first_entry(&group->pending_list, struct mcast_req,
0711                        group_list);
0712         sa_data = (struct ib_sa_mcmember_data *)req->sa_mad.data;
0713         req_join_state = sa_data->scope_join_state & 0xf;
0714 
0715         /* For a leave request, we will immediately answer the VF, and
0716          * update our internal counters. The actual leave will be sent
0717          * to SM later, if at all needed. We dequeue the request now. */
0718         if (req->sa_mad.mad_hdr.method == IB_SA_METHOD_DELETE)
0719             rc += handle_leave_req(group, req_join_state, req);
0720         else
0721             rc += handle_join_req(group, req_join_state, req);
0722     }
0723 
0724     /* Handle leaves */
0725     if (group->state == MCAST_IDLE) {
0726         req_join_state = get_leave_state(group);
0727         if (req_join_state) {
0728             group->rec.scope_join_state &= ~req_join_state;
0729             group->prev_state = group->state;
0730             if (send_leave_to_wire(group, req_join_state)) {
0731                 group->state = group->prev_state;
0732                 ++rc;
0733             } else
0734                 group->state = MCAST_LEAVE_SENT;
0735         }
0736     }
0737 
0738     if (!list_empty(&group->pending_list) && group->state == MCAST_IDLE)
0739         goto process_requests;
0740     mutex_unlock(&group->lock);
0741 
0742     while (rc--)
0743         release_group(group, 0);
0744 }
0745 
0746 static struct mcast_group *search_relocate_mgid0_group(struct mlx4_ib_demux_ctx *ctx,
0747                                __be64 tid,
0748                                union ib_gid *new_mgid)
0749 {
0750     struct mcast_group *group = NULL, *cur_group, *n;
0751     struct mcast_req *req;
0752 
0753     mutex_lock(&ctx->mcg_table_lock);
0754     list_for_each_entry_safe(group, n, &ctx->mcg_mgid0_list, mgid0_list) {
0755         mutex_lock(&group->lock);
0756         if (group->last_req_tid == tid) {
0757             if (memcmp(new_mgid, &mgid0, sizeof mgid0)) {
0758                 group->rec.mgid = *new_mgid;
0759                 sprintf(group->name, "%016llx%016llx",
0760                         be64_to_cpu(group->rec.mgid.global.subnet_prefix),
0761                         be64_to_cpu(group->rec.mgid.global.interface_id));
0762                 list_del_init(&group->mgid0_list);
0763                 cur_group = mcast_insert(ctx, group);
0764                 if (cur_group) {
0765                     /* A race between our code and SM. Silently cleaning the new one */
0766                     req = list_first_entry(&group->pending_list,
0767                                    struct mcast_req, group_list);
0768                     --group->func[req->func].num_pend_reqs;
0769                     list_del(&req->group_list);
0770                     list_del(&req->func_list);
0771                     kfree(req);
0772                     mutex_unlock(&group->lock);
0773                     mutex_unlock(&ctx->mcg_table_lock);
0774                     release_group(group, 0);
0775                     return NULL;
0776                 }
0777 
0778                 atomic_inc(&group->refcount);
0779                 add_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr);
0780                 mutex_unlock(&group->lock);
0781                 mutex_unlock(&ctx->mcg_table_lock);
0782                 return group;
0783             } else {
0784                 struct mcast_req *tmp1, *tmp2;
0785 
0786                 list_del(&group->mgid0_list);
0787                 if (!list_empty(&group->pending_list) && group->state != MCAST_IDLE)
0788                     cancel_delayed_work_sync(&group->timeout_work);
0789 
0790                 list_for_each_entry_safe(tmp1, tmp2, &group->pending_list, group_list) {
0791                     list_del(&tmp1->group_list);
0792                     kfree(tmp1);
0793                 }
0794                 mutex_unlock(&group->lock);
0795                 mutex_unlock(&ctx->mcg_table_lock);
0796                 kfree(group);
0797                 return NULL;
0798             }
0799         }
0800         mutex_unlock(&group->lock);
0801     }
0802     mutex_unlock(&ctx->mcg_table_lock);
0803 
0804     return NULL;
0805 }
0806 
0807 static ssize_t sysfs_show_group(struct device *dev,
0808         struct device_attribute *attr, char *buf);
0809 
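     /* Find the group for the given MGID, or allocate and register a new one when
      * 'create' is set. A reference is taken on the returned group. */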
0810 static struct mcast_group *acquire_group(struct mlx4_ib_demux_ctx *ctx,
0811                      union ib_gid *mgid, int create)
0812 {
0813     struct mcast_group *group, *cur_group;
0814     int is_mgid0;
0815     int i;
0816 
0817     is_mgid0 = !memcmp(&mgid0, mgid, sizeof mgid0);
0818     if (!is_mgid0) {
0819         group = mcast_find(ctx, mgid);
0820         if (group)
0821             goto found;
0822     }
0823 
0824     if (!create)
0825         return ERR_PTR(-ENOENT);
0826 
0827     group = kzalloc(sizeof(*group), GFP_KERNEL);
0828     if (!group)
0829         return ERR_PTR(-ENOMEM);
0830 
0831     group->demux = ctx;
0832     group->rec.mgid = *mgid;
0833     INIT_LIST_HEAD(&group->pending_list);
0834     INIT_LIST_HEAD(&group->mgid0_list);
0835     for (i = 0; i < MAX_VFS; ++i)
0836         INIT_LIST_HEAD(&group->func[i].pending);
0837     INIT_WORK(&group->work, mlx4_ib_mcg_work_handler);
0838     INIT_DELAYED_WORK(&group->timeout_work, mlx4_ib_mcg_timeout_handler);
0839     mutex_init(&group->lock);
0840     sprintf(group->name, "%016llx%016llx",
0841             be64_to_cpu(group->rec.mgid.global.subnet_prefix),
0842             be64_to_cpu(group->rec.mgid.global.interface_id));
0843     sysfs_attr_init(&group->dentry.attr);
0844     group->dentry.show = sysfs_show_group;
0845     group->dentry.store = NULL;
0846     group->dentry.attr.name = group->name;
0847     group->dentry.attr.mode = 0400;
0848     group->state = MCAST_IDLE;
0849 
0850     if (is_mgid0) {
0851         list_add(&group->mgid0_list, &ctx->mcg_mgid0_list);
0852         goto found;
0853     }
0854 
0855     cur_group = mcast_insert(ctx, group);
0856     if (cur_group) {
0857         mcg_warn("group just showed up %s - confused\n", cur_group->name);
0858         kfree(group);
0859         return ERR_PTR(-EINVAL);
0860     }
0861 
0862     add_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr);
0863 
0864 found:
0865     atomic_inc(&group->refcount);
0866     return group;
0867 }
0868 
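     /* Queue a VF request on the group's pending lists and schedule the work
      * handler; takes one reference for the request and one for the queued work. */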
0869 static void queue_req(struct mcast_req *req)
0870 {
0871     struct mcast_group *group = req->group;
0872 
0873     atomic_inc(&group->refcount); /* for the request */
0874     atomic_inc(&group->refcount); /* for scheduling the work */
0875     list_add_tail(&req->group_list, &group->pending_list);
0876     list_add_tail(&req->func_list, &group->func[req->func].pending);
0877     /* calls mlx4_ib_mcg_work_handler */
0878     if (!queue_work(group->demux->mcg_wq, &group->work))
0879         safe_atomic_dec(&group->refcount);
0880 }
0881 
0882 int mlx4_ib_mcg_demux_handler(struct ib_device *ibdev, int port, int slave,
0883                   struct ib_sa_mad *mad)
0884 {
0885     struct mlx4_ib_dev *dev = to_mdev(ibdev);
0886     struct ib_sa_mcmember_data *rec = (struct ib_sa_mcmember_data *)mad->data;
0887     struct mlx4_ib_demux_ctx *ctx = &dev->sriov.demux[port - 1];
0888     struct mcast_group *group;
0889 
0890     switch (mad->mad_hdr.method) {
0891     case IB_MGMT_METHOD_GET_RESP:
0892     case IB_SA_METHOD_DELETE_RESP:
0893         mutex_lock(&ctx->mcg_table_lock);
0894         group = acquire_group(ctx, &rec->mgid, 0);
0895         mutex_unlock(&ctx->mcg_table_lock);
0896         if (IS_ERR(group)) {
0897             if (mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP) {
0898                 __be64 tid = mad->mad_hdr.tid;
0899                 *(u8 *)(&tid) = (u8)slave; /* in group we kept the modified TID */
0900                 group = search_relocate_mgid0_group(ctx, tid, &rec->mgid);
0901             } else
0902                 group = NULL;
0903         }
0904 
0905         if (!group)
0906             return 1;
0907 
0908         mutex_lock(&group->lock);
0909         group->response_sa_mad = *mad;
0910         group->prev_state = group->state;
0911         group->state = MCAST_RESP_READY;
0912         /* calls mlx4_ib_mcg_work_handler */
0913         atomic_inc(&group->refcount);
0914         if (!queue_work(ctx->mcg_wq, &group->work))
0915             safe_atomic_dec(&group->refcount);
0916         mutex_unlock(&group->lock);
0917         release_group(group, 0);
0918         return 1; /* consumed */
0919     case IB_MGMT_METHOD_SET:
0920     case IB_SA_METHOD_GET_TABLE:
0921     case IB_SA_METHOD_GET_TABLE_RESP:
0922     case IB_SA_METHOD_DELETE:
0923         return 0; /* not consumed, pass-through to guest over tunnel */
0924     default:
0925         mcg_warn("In demux, port %d: unexpected MCMember method: 0x%x, dropping\n",
0926             port, mad->mad_hdr.method);
0927         return 1; /* consumed */
0928     }
0929 }
0930 
0931 int mlx4_ib_mcg_multiplex_handler(struct ib_device *ibdev, int port,
0932                   int slave, struct ib_sa_mad *sa_mad)
0933 {
0934     struct mlx4_ib_dev *dev = to_mdev(ibdev);
0935     struct ib_sa_mcmember_data *rec = (struct ib_sa_mcmember_data *)sa_mad->data;
0936     struct mlx4_ib_demux_ctx *ctx = &dev->sriov.demux[port - 1];
0937     struct mcast_group *group;
0938     struct mcast_req *req;
0939     int may_create = 0;
0940 
0941     if (ctx->flushing)
0942         return -EAGAIN;
0943 
0944     switch (sa_mad->mad_hdr.method) {
0945     case IB_MGMT_METHOD_SET:
0946         may_create = 1;
0947         fallthrough;
0948     case IB_SA_METHOD_DELETE:
0949         req = kzalloc(sizeof *req, GFP_KERNEL);
0950         if (!req)
0951             return -ENOMEM;
0952 
0953         req->func = slave;
0954         req->sa_mad = *sa_mad;
0955 
0956         mutex_lock(&ctx->mcg_table_lock);
0957         group = acquire_group(ctx, &rec->mgid, may_create);
0958         mutex_unlock(&ctx->mcg_table_lock);
0959         if (IS_ERR(group)) {
0960             kfree(req);
0961             return PTR_ERR(group);
0962         }
0963         mutex_lock(&group->lock);
0964         if (group->func[slave].num_pend_reqs > MAX_PEND_REQS_PER_FUNC) {
0965             mutex_unlock(&group->lock);
0966             mcg_debug_group(group, "Port %d, Func %d has too many pending requests (%d), dropping\n",
0967                     port, slave, MAX_PEND_REQS_PER_FUNC);
0968             release_group(group, 0);
0969             kfree(req);
0970             return -ENOMEM;
0971         }
0972         ++group->func[slave].num_pend_reqs;
0973         req->group = group;
0974         queue_req(req);
0975         mutex_unlock(&group->lock);
0976         release_group(group, 0);
0977         return 1; /* consumed */
0978     case IB_SA_METHOD_GET_TABLE:
0979     case IB_MGMT_METHOD_GET_RESP:
0980     case IB_SA_METHOD_GET_TABLE_RESP:
0981     case IB_SA_METHOD_DELETE_RESP:
0982         return 0; /* not consumed, pass-through */
0983     default:
0984         mcg_warn("In multiplex, port %d, func %d: unexpected MCMember method: 0x%x, dropping\n",
0985             port, slave, sa_mad->mad_hdr.method);
0986         return 1; /* consumed */
0987     }
0988 }
0989 
0990 static ssize_t sysfs_show_group(struct device *dev,
0991                 struct device_attribute *attr, char *buf)
0992 {
0993     struct mcast_group *group =
0994         container_of(attr, struct mcast_group, dentry);
0995     struct mcast_req *req = NULL;
0996     char state_str[40];
0997     char pending_str[40];
0998     int len;
0999     int i;
1000     u32 hoplimit;
1001 
1002     if (group->state == MCAST_IDLE)
1003         scnprintf(state_str, sizeof(state_str), "%s",
1004               get_state_string(group->state));
1005     else
1006         scnprintf(state_str, sizeof(state_str), "%s(TID=0x%llx)",
1007               get_state_string(group->state),
1008               be64_to_cpu(group->last_req_tid));
1009 
1010     if (list_empty(&group->pending_list)) {
1011         scnprintf(pending_str, sizeof(pending_str), "No");
1012     } else {
1013         req = list_first_entry(&group->pending_list, struct mcast_req,
1014                        group_list);
1015         scnprintf(pending_str, sizeof(pending_str), "Yes(TID=0x%llx)",
1016               be64_to_cpu(req->sa_mad.mad_hdr.tid));
1017     }
1018 
1019     len = sysfs_emit(buf, "%1d [%02d,%02d,%02d] %4d %4s %5s     ",
1020              group->rec.scope_join_state & 0xf,
1021              group->members[2],
1022              group->members[1],
1023              group->members[0],
1024              atomic_read(&group->refcount),
1025              pending_str,
1026              state_str);
1027 
1028     for (i = 0; i < MAX_VFS; i++) {
1029         if (group->func[i].state == MCAST_MEMBER)
1030             len += sysfs_emit_at(buf, len, "%d[%1x] ", i,
1031                          group->func[i].join_state);
1032     }
1033 
1034     hoplimit = be32_to_cpu(group->rec.sl_flowlabel_hoplimit);
1035     len += sysfs_emit_at(buf, len,
1036                  "\t\t(%4hx %4x %2x %2x %2x %2x %2x %4x %4x %2x %2x)\n",
1037                  be16_to_cpu(group->rec.pkey),
1038                  be32_to_cpu(group->rec.qkey),
1039                  (group->rec.mtusel_mtu & 0xc0) >> 6,
1040                  (group->rec.mtusel_mtu & 0x3f),
1041                  group->rec.tclass,
1042                  (group->rec.ratesel_rate & 0xc0) >> 6,
1043                  (group->rec.ratesel_rate & 0x3f),
1044                  (hoplimit & 0xf0000000) >> 28,
1045                  (hoplimit & 0x0fffff00) >> 8,
1046                  (hoplimit & 0x000000ff),
1047                  group->rec.proxy_join);
1048 
1049     return len;
1050 }
1051 
1052 int mlx4_ib_mcg_port_init(struct mlx4_ib_demux_ctx *ctx)
1053 {
1054     char name[20];
1055 
1056     atomic_set(&ctx->tid, 0);
1057     sprintf(name, "mlx4_ib_mcg%d", ctx->port);
1058     ctx->mcg_wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM);
1059     if (!ctx->mcg_wq)
1060         return -ENOMEM;
1061 
1062     mutex_init(&ctx->mcg_table_lock);
1063     ctx->mcg_table = RB_ROOT;
1064     INIT_LIST_HEAD(&ctx->mcg_mgid0_list);
1065     ctx->flushing = 0;
1066 
1067     return 0;
1068 }
1069 
1070 static void force_clean_group(struct mcast_group *group)
1071 {
1072     struct mcast_req *req, *tmp;
1073 
1074     list_for_each_entry_safe(req, tmp, &group->pending_list, group_list) {
1075         list_del(&req->group_list);
1076         kfree(req);
1077     }
1078     del_sysfs_port_mcg_attr(group->demux->dev, group->demux->port, &group->dentry.attr);
1079     rb_erase(&group->node, &group->demux->mcg_table);
1080     kfree(group);
1081 }
1082 
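     /* Tear down all multicast state for a port: queue cleanup for every VF, wait
      * (bounded by MAD_TIMEOUT_MS plus a grace period) for groups to drain, then
      * forcibly free whatever remains. */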
1083 static void _mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq)
1084 {
1085     int i;
1086     struct rb_node *p;
1087     struct mcast_group *group;
1088     unsigned long end;
1089     int count;
1090 
1091     for (i = 0; i < MAX_VFS; ++i)
1092         clean_vf_mcast(ctx, i);
1093 
1094     end = jiffies + msecs_to_jiffies(MAD_TIMEOUT_MS + 3000);
1095     do {
1096         count = 0;
1097         mutex_lock(&ctx->mcg_table_lock);
1098         for (p = rb_first(&ctx->mcg_table); p; p = rb_next(p))
1099             ++count;
1100         mutex_unlock(&ctx->mcg_table_lock);
1101         if (!count)
1102             break;
1103 
1104         usleep_range(1000, 2000);
1105     } while (time_after(end, jiffies));
1106 
1107     flush_workqueue(ctx->mcg_wq);
1108     if (destroy_wq)
1109         destroy_workqueue(ctx->mcg_wq);
1110 
1111     mutex_lock(&ctx->mcg_table_lock);
1112     while ((p = rb_first(&ctx->mcg_table)) != NULL) {
1113         group = rb_entry(p, struct mcast_group, node);
1114         if (atomic_read(&group->refcount))
1115             mcg_debug_group(group, "group refcount %d!!! (pointer %p)\n",
1116                     atomic_read(&group->refcount), group);
1117 
1118         force_clean_group(group);
1119     }
1120     mutex_unlock(&ctx->mcg_table_lock);
1121 }
1122 
1123 struct clean_work {
1124     struct work_struct work;
1125     struct mlx4_ib_demux_ctx *ctx;
1126     int destroy_wq;
1127 };
1128 
1129 static void mcg_clean_task(struct work_struct *work)
1130 {
1131     struct clean_work *cw = container_of(work, struct clean_work, work);
1132 
1133     _mlx4_ib_mcg_port_cleanup(cw->ctx, cw->destroy_wq);
1134     cw->ctx->flushing = 0;
1135     kfree(cw);
1136 }
1137 
1138 void mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq)
1139 {
1140     struct clean_work *work;
1141 
1142     if (ctx->flushing)
1143         return;
1144 
1145     ctx->flushing = 1;
1146 
1147     if (destroy_wq) {
1148         _mlx4_ib_mcg_port_cleanup(ctx, destroy_wq);
1149         ctx->flushing = 0;
1150         return;
1151     }
1152 
1153     work = kmalloc(sizeof *work, GFP_KERNEL);
1154     if (!work) {
1155         ctx->flushing = 0;
1156         return;
1157     }
1158 
1159     work->ctx = ctx;
1160     work->destroy_wq = destroy_wq;
1161     INIT_WORK(&work->work, mcg_clean_task);
1162     queue_work(clean_wq, &work->work);
1163 }
1164 
1165 static void build_leave_mad(struct mcast_req *req)
1166 {
1167     struct ib_sa_mad *mad = &req->sa_mad;
1168 
1169     mad->mad_hdr.method = IB_SA_METHOD_DELETE;
1170 }
1171 
1172 
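     /* Drop every pending request queued by the given VF. A request whose join or
      * leave is still outstanding at the SA is left in place if its timeout work
      * could not be cancelled. */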
1173 static void clear_pending_reqs(struct mcast_group *group, int vf)
1174 {
1175     struct mcast_req *req, *tmp, *group_first = NULL;
1176     int clear;
1177     int pend = 0;
1178 
1179     if (!list_empty(&group->pending_list))
1180         group_first = list_first_entry(&group->pending_list, struct mcast_req, group_list);
1181 
1182     list_for_each_entry_safe(req, tmp, &group->func[vf].pending, func_list) {
1183         clear = 1;
1184         if (group_first == req &&
1185             (group->state == MCAST_JOIN_SENT ||
1186              group->state == MCAST_LEAVE_SENT)) {
1187             clear = cancel_delayed_work(&group->timeout_work);
1188             pend = !clear;
1189             group->state = MCAST_IDLE;
1190         }
1191         if (clear) {
1192             --group->func[vf].num_pend_reqs;
1193             list_del(&req->group_list);
1194             list_del(&req->func_list);
1195             kfree(req);
1196             atomic_dec(&group->refcount);
1197         }
1198     }
1199 
1200     if (!pend && (!list_empty(&group->func[vf].pending) || group->func[vf].num_pend_reqs)) {
1201         mcg_warn_group(group, "DRIVER BUG: list_empty %d, num_pend_reqs %d\n",
1202                    list_empty(&group->func[vf].pending), group->func[vf].num_pend_reqs);
1203     }
1204 }
1205 
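     /* Queue a synthetic 'clean' leave request for the VF if it still holds any
      * join state, unless such a request is already pending. */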
1206 static int push_deleteing_req(struct mcast_group *group, int slave)
1207 {
1208     struct mcast_req *req;
1209     struct mcast_req *pend_req;
1210 
1211     if (!group->func[slave].join_state)
1212         return 0;
1213 
1214     req = kzalloc(sizeof *req, GFP_KERNEL);
1215     if (!req)
1216         return -ENOMEM;
1217 
1218     if (!list_empty(&group->func[slave].pending)) {
1219         pend_req = list_entry(group->func[slave].pending.prev, struct mcast_req, group_list);
1220         if (pend_req->clean) {
1221             kfree(req);
1222             return 0;
1223         }
1224     }
1225 
1226     req->clean = 1;
1227     req->func = slave;
1228     req->group = group;
1229     ++group->func[slave].num_pend_reqs;
1230     build_leave_mad(req);
1231     queue_req(req);
1232     return 0;
1233 }
1234 
1235 void clean_vf_mcast(struct mlx4_ib_demux_ctx *ctx, int slave)
1236 {
1237     struct mcast_group *group;
1238     struct rb_node *p;
1239 
1240     mutex_lock(&ctx->mcg_table_lock);
1241     for (p = rb_first(&ctx->mcg_table); p; p = rb_next(p)) {
1242         group = rb_entry(p, struct mcast_group, node);
1243         mutex_lock(&group->lock);
1244         if (atomic_read(&group->refcount)) {
1245             /* clear pending requests of this VF */
1246             clear_pending_reqs(group, slave);
1247             push_deleteing_req(group, slave);
1248         }
1249         mutex_unlock(&group->lock);
1250     }
1251     mutex_unlock(&ctx->mcg_table_lock);
1252 }
1253 
1254 
1255 int mlx4_ib_mcg_init(void)
1256 {
1257     clean_wq = alloc_ordered_workqueue("mlx4_ib_mcg", WQ_MEM_RECLAIM);
1258     if (!clean_wq)
1259         return -ENOMEM;
1260 
1261     return 0;
1262 }
1263 
1264 void mlx4_ib_mcg_destroy(void)
1265 {
1266     destroy_workqueue(clean_wq);
1267 }