0001
0002 #include <linux/ceph/ceph_debug.h>
0003
0004 #include <linux/bug.h>
0005 #include <linux/err.h>
0006 #include <linux/random.h>
0007 #include <linux/slab.h>
0008 #include <linux/types.h>
0009
0010 #include <linux/ceph/mdsmap.h>
0011 #include <linux/ceph/messenger.h>
0012 #include <linux/ceph/decode.h>
0013
0014 #include "super.h"
0015
/*
 * True when rank @i of the map `m` (taken from the caller's scope) can be
 * picked: its state must be positive, and it must not be flagged laggy
 * unless @ignore_laggy is set.
 *
 * The state test is grouped explicitly.  Written as
 * "state > 0 && ignore_laggy ? true : !laggy" the conditional operator binds
 * looser than &&, so a rank with state <= 0 that merely is not laggy (e.g. a
 * zero-initialised slot) would be reported as ready.
 */
#define CEPH_MDS_IS_READY(i, ignore_laggy) \
	(m->m_info[(i)].state > 0 && \
	 ((ignore_laggy) || !m->m_info[(i)].laggy))
0018
0019 static int __mdsmap_get_random_mds(struct ceph_mdsmap *m, bool ignore_laggy)
0020 {
0021 int n = 0;
0022 int i, j;
0023
0024
0025 for (i = 0; i < m->possible_max_rank; i++)
0026 if (CEPH_MDS_IS_READY(i, ignore_laggy))
0027 n++;
0028 if (n == 0)
0029 return -1;
0030
0031
0032 n = prandom_u32() % n;
0033 for (j = 0, i = 0; i < m->possible_max_rank; i++) {
0034 if (CEPH_MDS_IS_READY(i, ignore_laggy))
0035 j++;
0036 if (j > n)
0037 break;
0038 }
0039
0040 return i;
0041 }
0042
0043
0044
0045
0046 int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
0047 {
0048 int mds;
0049
0050 mds = __mdsmap_get_random_mds(m, false);
0051 if (mds == m->possible_max_rank || mds == -1)
0052 mds = __mdsmap_get_random_mds(m, true);
0053
0054 return mds == m->possible_max_rank ? -1 : mds;
0055 }
0056
/*
 * Skip one fixed-size value of @type in the buffer.
 *
 * @p is a pointer to the cursor, @end the buffer limit.  Jumps to label
 * @bad, leaving the cursor untouched, when fewer than sizeof(@type) bytes
 * remain.
 */
#define __decode_and_drop_type(p, end, type, bad)		\
	do {							\
		if (*(p) + sizeof(type) > (end))		\
			goto bad;				\
		*(p) += sizeof(type);				\
	} while (0)
0063
/*
 * Skip a counted sequence: a 32-bit element count followed by that many
 * fixed-size elements of @type.  Jumps to label @bad if the count or the
 * elements do not fit before @end.
 */
#define __decode_and_drop_set(p, end, type, bad)		\
	do {							\
		u32 nr_items;					\
		size_t nr_bytes;				\
		ceph_decode_32_safe(p, end, nr_items, bad);	\
		nr_bytes = sizeof(type) * nr_items;		\
		ceph_decode_need(p, end, nr_bytes, bad);	\
		*p += nr_bytes;					\
	} while (0)
0073
/*
 * Skip a counted sequence of pairs: a 32-bit entry count followed by that
 * many (@ktype, @vtype) fixed-size pairs.  Jumps to label @bad if the count
 * or the entries do not fit before @end.
 */
#define __decode_and_drop_map(p, end, ktype, vtype, bad)	\
	do {							\
		u32 nr_pairs;					\
		size_t nr_bytes;				\
		ceph_decode_32_safe(p, end, nr_pairs, bad);	\
		nr_bytes = (sizeof(ktype) + sizeof(vtype)) * nr_pairs; \
		ceph_decode_need(p, end, nr_bytes, bad);	\
		*p += nr_bytes;					\
	} while (0)
0083
0084
0085 static int __decode_and_drop_compat_set(void **p, void* end)
0086 {
0087 int i;
0088
0089 for (i = 0; i < 3; i++) {
0090 u32 n;
0091 ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
0092
0093 *p += sizeof(u64);
0094
0095 n = ceph_decode_32(p);
0096 while (n-- > 0) {
0097 u32 len;
0098 ceph_decode_need(p, end, sizeof(u64) + sizeof(u32),
0099 bad);
0100 *p += sizeof(u64);
0101 len = ceph_decode_32(p);
0102 ceph_decode_need(p, end, len, bad);
0103 *p += len;
0104 }
0105 }
0106 return 0;
0107 bad:
0108 return -1;
0109 }
0110
0111
0112
0113
0114
0115
0116
0117 struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2)
0118 {
0119 struct ceph_mdsmap *m;
0120 const void *start = *p;
0121 int i, j, n;
0122 int err;
0123 u8 mdsmap_v;
0124 u16 mdsmap_ev;
0125 u32 target;
0126
0127 m = kzalloc(sizeof(*m), GFP_NOFS);
0128 if (!m)
0129 return ERR_PTR(-ENOMEM);
0130
0131 ceph_decode_need(p, end, 1 + 1, bad);
0132 mdsmap_v = ceph_decode_8(p);
0133 *p += sizeof(u8);
0134 if (mdsmap_v >= 4) {
0135 u32 mdsmap_len;
0136 ceph_decode_32_safe(p, end, mdsmap_len, bad);
0137 if (end < *p + mdsmap_len)
0138 goto bad;
0139 end = *p + mdsmap_len;
0140 }
0141
0142 ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
0143 m->m_epoch = ceph_decode_32(p);
0144 m->m_client_epoch = ceph_decode_32(p);
0145 m->m_last_failure = ceph_decode_32(p);
0146 m->m_root = ceph_decode_32(p);
0147 m->m_session_timeout = ceph_decode_32(p);
0148 m->m_session_autoclose = ceph_decode_32(p);
0149 m->m_max_file_size = ceph_decode_64(p);
0150 m->m_max_mds = ceph_decode_32(p);
0151
0152
0153
0154
0155
0156
0157
0158 m->m_num_active_mds = n = ceph_decode_32(p);
0159
0160
0161
0162
0163
0164
0165
0166
0167 m->possible_max_rank = max(m->m_num_active_mds, m->m_max_mds);
0168
0169 m->m_info = kcalloc(m->possible_max_rank, sizeof(*m->m_info), GFP_NOFS);
0170 if (!m->m_info)
0171 goto nomem;
0172
0173
0174 for (i = 0; i < n; i++) {
0175 u64 global_id;
0176 u32 namelen;
0177 s32 mds, inc, state;
0178 u8 info_v;
0179 void *info_end = NULL;
0180 struct ceph_entity_addr addr;
0181 u32 num_export_targets;
0182 void *pexport_targets = NULL;
0183 struct ceph_timespec laggy_since;
0184 struct ceph_mds_info *info;
0185 bool laggy;
0186
0187 ceph_decode_need(p, end, sizeof(u64) + 1, bad);
0188 global_id = ceph_decode_64(p);
0189 info_v= ceph_decode_8(p);
0190 if (info_v >= 4) {
0191 u32 info_len;
0192 ceph_decode_need(p, end, 1 + sizeof(u32), bad);
0193 *p += sizeof(u8);
0194 info_len = ceph_decode_32(p);
0195 info_end = *p + info_len;
0196 if (info_end > end)
0197 goto bad;
0198 }
0199
0200 ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
0201 *p += sizeof(u64);
0202 namelen = ceph_decode_32(p);
0203 *p += namelen;
0204
0205 ceph_decode_32_safe(p, end, mds, bad);
0206 ceph_decode_32_safe(p, end, inc, bad);
0207 ceph_decode_32_safe(p, end, state, bad);
0208 *p += sizeof(u64);
0209 if (info_v >= 8)
0210 err = ceph_decode_entity_addrvec(p, end, msgr2, &addr);
0211 else
0212 err = ceph_decode_entity_addr(p, end, &addr);
0213 if (err)
0214 goto corrupt;
0215
0216 ceph_decode_copy_safe(p, end, &laggy_since, sizeof(laggy_since),
0217 bad);
0218 laggy = laggy_since.tv_sec != 0 || laggy_since.tv_nsec != 0;
0219 *p += sizeof(u32);
0220 ceph_decode_32_safe(p, end, namelen, bad);
0221 *p += namelen;
0222 if (info_v >= 2) {
0223 ceph_decode_32_safe(p, end, num_export_targets, bad);
0224 pexport_targets = *p;
0225 *p += num_export_targets * sizeof(u32);
0226 } else {
0227 num_export_targets = 0;
0228 }
0229
0230 if (info_end && *p != info_end) {
0231 if (*p > info_end)
0232 goto bad;
0233 *p = info_end;
0234 }
0235
0236 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s%s\n",
0237 i+1, n, global_id, mds, inc,
0238 ceph_pr_addr(&addr),
0239 ceph_mds_state_name(state),
0240 laggy ? "(laggy)" : "");
0241
0242 if (mds < 0 || mds >= m->possible_max_rank) {
0243 pr_warn("mdsmap_decode got incorrect mds(%d)\n", mds);
0244 continue;
0245 }
0246
0247 if (state <= 0) {
0248 dout("mdsmap_decode got incorrect state(%s)\n",
0249 ceph_mds_state_name(state));
0250 continue;
0251 }
0252
0253 info = &m->m_info[mds];
0254 info->global_id = global_id;
0255 info->state = state;
0256 info->addr = addr;
0257 info->laggy = laggy;
0258 info->num_export_targets = num_export_targets;
0259 if (num_export_targets) {
0260 info->export_targets = kcalloc(num_export_targets,
0261 sizeof(u32), GFP_NOFS);
0262 if (!info->export_targets)
0263 goto nomem;
0264 for (j = 0; j < num_export_targets; j++) {
0265 target = ceph_decode_32(&pexport_targets);
0266 info->export_targets[j] = target;
0267 }
0268 } else {
0269 info->export_targets = NULL;
0270 }
0271 }
0272
0273
0274 ceph_decode_32_safe(p, end, n, bad);
0275 m->m_num_data_pg_pools = n;
0276 m->m_data_pg_pools = kcalloc(n, sizeof(u64), GFP_NOFS);
0277 if (!m->m_data_pg_pools)
0278 goto nomem;
0279 ceph_decode_need(p, end, sizeof(u64)*(n+1), bad);
0280 for (i = 0; i < n; i++)
0281 m->m_data_pg_pools[i] = ceph_decode_64(p);
0282 m->m_cas_pg_pool = ceph_decode_64(p);
0283 m->m_enabled = m->m_epoch > 1;
0284
0285 mdsmap_ev = 1;
0286 if (mdsmap_v >= 2) {
0287 ceph_decode_16_safe(p, end, mdsmap_ev, bad_ext);
0288 }
0289 if (mdsmap_ev >= 3) {
0290 if (__decode_and_drop_compat_set(p, end) < 0)
0291 goto bad_ext;
0292 }
0293
0294 if (mdsmap_ev < 5) {
0295 __decode_and_drop_type(p, end, u32, bad_ext);
0296 } else {
0297 __decode_and_drop_type(p, end, u64, bad_ext);
0298 }
0299
0300
0301 __decode_and_drop_type(p, end, struct ceph_timespec, bad_ext);
0302 __decode_and_drop_type(p, end, struct ceph_timespec, bad_ext);
0303 __decode_and_drop_type(p, end, u32, bad_ext);
0304
0305
0306 {
0307 int num_laggy = 0;
0308 ceph_decode_32_safe(p, end, n, bad_ext);
0309 ceph_decode_need(p, end, sizeof(u32) * n, bad_ext);
0310
0311 for (i = 0; i < n; i++) {
0312 s32 mds = ceph_decode_32(p);
0313 if (mds >= 0 && mds < m->possible_max_rank) {
0314 if (m->m_info[mds].laggy)
0315 num_laggy++;
0316 }
0317 }
0318 m->m_num_laggy = num_laggy;
0319
0320 if (n > m->possible_max_rank) {
0321 void *new_m_info = krealloc(m->m_info,
0322 n * sizeof(*m->m_info),
0323 GFP_NOFS | __GFP_ZERO);
0324 if (!new_m_info)
0325 goto nomem;
0326 m->m_info = new_m_info;
0327 }
0328 m->possible_max_rank = n;
0329 }
0330
0331
0332 __decode_and_drop_map(p, end, u32, u32, bad_ext);
0333
0334 __decode_and_drop_map(p, end, u32, u64, bad_ext);
0335
0336 __decode_and_drop_set(p, end, u32, bad_ext);
0337
0338 __decode_and_drop_set(p, end, u32, bad_ext);
0339
0340 if (mdsmap_ev >= 4) {
0341
0342 __decode_and_drop_type(p, end, u32, bad_ext);
0343 }
0344 if (mdsmap_ev >= 6) {
0345
0346 __decode_and_drop_type(p, end, u8, bad_ext);
0347
0348 __decode_and_drop_type(p, end, u8, bad_ext);
0349 }
0350 if (mdsmap_ev >= 7) {
0351
0352 __decode_and_drop_type(p, end, u8, bad_ext);
0353 }
0354 if (mdsmap_ev >= 8) {
0355
0356 ceph_decode_8_safe(p, end, m->m_enabled, bad_ext);
0357
0358 ceph_decode_skip_string(p, end, bad_ext);
0359 }
0360
0361 if (mdsmap_ev >= 9) {
0362 size_t need;
0363 ceph_decode_32_safe(p, end, n, bad_ext);
0364 need = sizeof(u32) * n;
0365 ceph_decode_need(p, end, need, bad_ext);
0366 *p += need;
0367 m->m_damaged = n > 0;
0368 } else {
0369 m->m_damaged = false;
0370 }
0371 if (mdsmap_ev >= 17) {
0372
0373 ceph_decode_skip_string(p, end, bad_ext);
0374
0375 ceph_decode_skip_32(p, end, bad_ext);
0376
0377 ceph_decode_skip_32(p, end, bad_ext);
0378
0379 ceph_decode_skip_8(p, end, bad_ext);
0380
0381 ceph_decode_skip_set(p, end, 64, bad_ext);
0382 ceph_decode_64_safe(p, end, m->m_max_xattr_size, bad_ext);
0383 } else {
0384
0385 m->m_max_xattr_size = 0;
0386 }
0387 bad_ext:
0388 dout("mdsmap_decode m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n",
0389 !!m->m_enabled, !!m->m_damaged, m->m_num_laggy);
0390 *p = end;
0391 dout("mdsmap_decode success epoch %u\n", m->m_epoch);
0392 return m;
0393 nomem:
0394 err = -ENOMEM;
0395 goto out_err;
0396 corrupt:
0397 pr_err("corrupt mdsmap\n");
0398 print_hex_dump(KERN_DEBUG, "mdsmap: ",
0399 DUMP_PREFIX_OFFSET, 16, 1,
0400 start, end - start, true);
0401 out_err:
0402 ceph_mdsmap_destroy(m);
0403 return ERR_PTR(err);
0404 bad:
0405 err = -EINVAL;
0406 goto corrupt;
0407 }
0408
0409 void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
0410 {
0411 int i;
0412
0413 if (m->m_info) {
0414 for (i = 0; i < m->possible_max_rank; i++)
0415 kfree(m->m_info[i].export_targets);
0416 kfree(m->m_info);
0417 }
0418 kfree(m->m_data_pg_pools);
0419 kfree(m);
0420 }
0421
0422 bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m)
0423 {
0424 int i, nr_active = 0;
0425 if (!m->m_enabled)
0426 return false;
0427 if (m->m_damaged)
0428 return false;
0429 if (m->m_num_laggy == m->m_num_active_mds)
0430 return false;
0431 for (i = 0; i < m->possible_max_rank; i++) {
0432 if (m->m_info[i].state == CEPH_MDS_STATE_ACTIVE)
0433 nr_active++;
0434 }
0435 return nr_active > 0;
0436 }