0001
0002
0003
0004 #include "health.h"
0005 #include "lib/eq.h"
0006 #include "lib/mlx5.h"
0007
/*
 * Open a named object in @fmsg: start a pair nest called @name and then
 * start an object nest inside it.
 *
 * Returns 0 on success, or the error returned by the first devlink fmsg
 * call that fails (the object nest is not started if the pair nest fails).
 */
int mlx5e_health_fmsg_named_obj_nest_start(struct devlink_fmsg *fmsg, char *name)
{
	int ret = devlink_fmsg_pair_nest_start(fmsg, name);

	if (!ret)
		ret = devlink_fmsg_obj_nest_start(fmsg);

	return ret;
}
0022
/*
 * Close a named object in @fmsg: end the object nest first, then end the
 * enclosing pair nest.
 *
 * Returns 0 on success, or the error returned by the first devlink fmsg
 * call that fails (the pair nest is not ended if ending the object fails).
 */
int mlx5e_health_fmsg_named_obj_nest_end(struct devlink_fmsg *fmsg)
{
	int ret = devlink_fmsg_obj_nest_end(fmsg);

	if (!ret)
		ret = devlink_fmsg_pair_nest_end(fmsg);

	return ret;
}
0037
0038 int mlx5e_health_cq_diag_fmsg(struct mlx5e_cq *cq, struct devlink_fmsg *fmsg)
0039 {
0040 u32 out[MLX5_ST_SZ_DW(query_cq_out)] = {};
0041 u8 hw_status;
0042 void *cqc;
0043 int err;
0044
0045 err = mlx5_core_query_cq(cq->mdev, &cq->mcq, out);
0046 if (err)
0047 return err;
0048
0049 cqc = MLX5_ADDR_OF(query_cq_out, out, cq_context);
0050 hw_status = MLX5_GET(cqc, cqc, status);
0051
0052 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "CQ");
0053 if (err)
0054 return err;
0055
0056 err = devlink_fmsg_u32_pair_put(fmsg, "cqn", cq->mcq.cqn);
0057 if (err)
0058 return err;
0059
0060 err = devlink_fmsg_u8_pair_put(fmsg, "HW status", hw_status);
0061 if (err)
0062 return err;
0063
0064 err = devlink_fmsg_u32_pair_put(fmsg, "ci", mlx5_cqwq_get_ci(&cq->wq));
0065 if (err)
0066 return err;
0067
0068 err = devlink_fmsg_u32_pair_put(fmsg, "size", mlx5_cqwq_get_size(&cq->wq));
0069 if (err)
0070 return err;
0071
0072 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
0073 if (err)
0074 return err;
0075
0076 return 0;
0077 }
0078
0079 int mlx5e_health_cq_common_diag_fmsg(struct mlx5e_cq *cq, struct devlink_fmsg *fmsg)
0080 {
0081 u8 cq_log_stride;
0082 u32 cq_sz;
0083 int err;
0084
0085 cq_sz = mlx5_cqwq_get_size(&cq->wq);
0086 cq_log_stride = mlx5_cqwq_get_log_stride_size(&cq->wq);
0087
0088 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "CQ");
0089 if (err)
0090 return err;
0091
0092 err = devlink_fmsg_u64_pair_put(fmsg, "stride size", BIT(cq_log_stride));
0093 if (err)
0094 return err;
0095
0096 err = devlink_fmsg_u32_pair_put(fmsg, "size", cq_sz);
0097 if (err)
0098 return err;
0099
0100 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
0101 if (err)
0102 return err;
0103
0104 return 0;
0105 }
0106
0107 int mlx5e_health_eq_diag_fmsg(struct mlx5_eq_comp *eq, struct devlink_fmsg *fmsg)
0108 {
0109 int err;
0110
0111 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "EQ");
0112 if (err)
0113 return err;
0114
0115 err = devlink_fmsg_u8_pair_put(fmsg, "eqn", eq->core.eqn);
0116 if (err)
0117 return err;
0118
0119 err = devlink_fmsg_u32_pair_put(fmsg, "irqn", eq->core.irqn);
0120 if (err)
0121 return err;
0122
0123 err = devlink_fmsg_u32_pair_put(fmsg, "vecidx", eq->core.vecidx);
0124 if (err)
0125 return err;
0126
0127 err = devlink_fmsg_u32_pair_put(fmsg, "ci", eq->core.cons_index);
0128 if (err)
0129 return err;
0130
0131 err = devlink_fmsg_u32_pair_put(fmsg, "size", eq_get_size(&eq->core));
0132 if (err)
0133 return err;
0134
0135 return mlx5e_health_fmsg_named_obj_nest_end(fmsg);
0136 }
0137
/*
 * Create the health reporters of @priv: the TX reporter first, then the
 * RX reporter. Creation results are not reported back to the caller.
 */
void mlx5e_health_create_reporters(struct mlx5e_priv *priv)
{
	mlx5e_reporter_tx_create(priv);
	mlx5e_reporter_rx_create(priv);
}
0143
/*
 * Destroy the health reporters of @priv in the reverse of their creation
 * order: the RX reporter first, then the TX reporter.
 */
void mlx5e_health_destroy_reporters(struct mlx5e_priv *priv)
{
	mlx5e_reporter_rx_destroy(priv);
	mlx5e_reporter_tx_destroy(priv);
}
0149
0150 void mlx5e_health_channels_update(struct mlx5e_priv *priv)
0151 {
0152 if (priv->tx_reporter)
0153 devlink_health_reporter_state_update(priv->tx_reporter,
0154 DEVLINK_HEALTH_REPORTER_STATE_HEALTHY);
0155 if (priv->rx_reporter)
0156 devlink_health_reporter_state_update(priv->rx_reporter,
0157 DEVLINK_HEALTH_REPORTER_STATE_HEALTHY);
0158 }
0159
0160 int mlx5e_health_sq_to_ready(struct mlx5_core_dev *mdev, struct net_device *dev, u32 sqn)
0161 {
0162 struct mlx5e_modify_sq_param msp = {};
0163 int err;
0164
0165 msp.curr_state = MLX5_SQC_STATE_ERR;
0166 msp.next_state = MLX5_SQC_STATE_RST;
0167
0168 err = mlx5e_modify_sq(mdev, sqn, &msp);
0169 if (err) {
0170 netdev_err(dev, "Failed to move sq 0x%x to reset\n", sqn);
0171 return err;
0172 }
0173
0174 memset(&msp, 0, sizeof(msp));
0175 msp.curr_state = MLX5_SQC_STATE_RST;
0176 msp.next_state = MLX5_SQC_STATE_RDY;
0177
0178 err = mlx5e_modify_sq(mdev, sqn, &msp);
0179 if (err) {
0180 netdev_err(dev, "Failed to move sq 0x%x to ready\n", sqn);
0181 return err;
0182 }
0183
0184 return 0;
0185 }
0186
0187 int mlx5e_health_recover_channels(struct mlx5e_priv *priv)
0188 {
0189 int err = 0;
0190
0191 rtnl_lock();
0192 mutex_lock(&priv->state_lock);
0193
0194 if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
0195 goto out;
0196
0197 err = mlx5e_safe_reopen_channels(priv);
0198
0199 out:
0200 mutex_unlock(&priv->state_lock);
0201 rtnl_unlock();
0202
0203 return err;
0204 }
0205
0206 int mlx5e_health_channel_eq_recover(struct net_device *dev, struct mlx5_eq_comp *eq,
0207 struct mlx5e_ch_stats *stats)
0208 {
0209 u32 eqe_count;
0210
0211 netdev_err(dev, "EQ 0x%x: Cons = 0x%x, irqn = 0x%x\n",
0212 eq->core.eqn, eq->core.cons_index, eq->core.irqn);
0213
0214 eqe_count = mlx5_eq_poll_irq_disabled(eq);
0215 if (!eqe_count)
0216 return -EIO;
0217
0218 netdev_err(dev, "Recovered %d eqes on EQ 0x%x\n",
0219 eqe_count, eq->core.eqn);
0220
0221 stats->eq_rearm++;
0222 return 0;
0223 }
0224
0225 int mlx5e_health_report(struct mlx5e_priv *priv,
0226 struct devlink_health_reporter *reporter, char *err_str,
0227 struct mlx5e_err_ctx *err_ctx)
0228 {
0229 netdev_err(priv->netdev, "%s\n", err_str);
0230
0231 if (!reporter)
0232 return err_ctx->recover(err_ctx->ctx);
0233
0234 return devlink_health_report(reporter, err_str, err_ctx);
0235 }
0236
0237 #define MLX5_HEALTH_DEVLINK_MAX_SIZE 1024
0238 static int mlx5e_health_rsc_fmsg_binary(struct devlink_fmsg *fmsg,
0239 const void *value, u32 value_len)
0240
0241 {
0242 u32 data_size;
0243 int err = 0;
0244 u32 offset;
0245
0246 for (offset = 0; offset < value_len; offset += data_size) {
0247 data_size = value_len - offset;
0248 if (data_size > MLX5_HEALTH_DEVLINK_MAX_SIZE)
0249 data_size = MLX5_HEALTH_DEVLINK_MAX_SIZE;
0250 err = devlink_fmsg_binary_put(fmsg, value + offset, data_size);
0251 if (err)
0252 break;
0253 }
0254 return err;
0255 }
0256
0257 int mlx5e_health_rsc_fmsg_dump(struct mlx5e_priv *priv, struct mlx5_rsc_key *key,
0258 struct devlink_fmsg *fmsg)
0259 {
0260 struct mlx5_core_dev *mdev = priv->mdev;
0261 struct mlx5_rsc_dump_cmd *cmd;
0262 struct page *page;
0263 int cmd_err, err;
0264 int end_err;
0265 int size;
0266
0267 if (IS_ERR_OR_NULL(mdev->rsc_dump))
0268 return -EOPNOTSUPP;
0269
0270 page = alloc_page(GFP_KERNEL);
0271 if (!page)
0272 return -ENOMEM;
0273
0274 err = devlink_fmsg_binary_pair_nest_start(fmsg, "data");
0275 if (err)
0276 goto free_page;
0277
0278 cmd = mlx5_rsc_dump_cmd_create(mdev, key);
0279 if (IS_ERR(cmd)) {
0280 err = PTR_ERR(cmd);
0281 goto free_page;
0282 }
0283
0284 do {
0285 cmd_err = mlx5_rsc_dump_next(mdev, cmd, page, &size);
0286 if (cmd_err < 0) {
0287 err = cmd_err;
0288 goto destroy_cmd;
0289 }
0290
0291 err = mlx5e_health_rsc_fmsg_binary(fmsg, page_address(page), size);
0292 if (err)
0293 goto destroy_cmd;
0294
0295 } while (cmd_err > 0);
0296
0297 destroy_cmd:
0298 mlx5_rsc_dump_cmd_destroy(cmd);
0299 end_err = devlink_fmsg_binary_pair_nest_end(fmsg);
0300 if (end_err)
0301 err = end_err;
0302 free_page:
0303 __free_page(page);
0304 return err;
0305 }
0306
0307 int mlx5e_health_queue_dump(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg,
0308 int queue_idx, char *lbl)
0309 {
0310 struct mlx5_rsc_key key = {};
0311 int err;
0312
0313 key.rsc = MLX5_SGMT_TYPE_FULL_QPC;
0314 key.index1 = queue_idx;
0315 key.size = PAGE_SIZE;
0316 key.num_of_obj1 = 1;
0317
0318 err = devlink_fmsg_obj_nest_start(fmsg);
0319 if (err)
0320 return err;
0321
0322 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, lbl);
0323 if (err)
0324 return err;
0325
0326 err = devlink_fmsg_u32_pair_put(fmsg, "index", queue_idx);
0327 if (err)
0328 return err;
0329
0330 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
0331 if (err)
0332 return err;
0333
0334 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
0335 if (err)
0336 return err;
0337
0338 return devlink_fmsg_obj_nest_end(fmsg);
0339 }