Back to home page

OSCL-LXR

 
 

    


0001 /* SPDX-License-Identifier: GPL-2.0 */
0002 #ifndef CEPH_CRUSH_CRUSH_H
0003 #define CEPH_CRUSH_CRUSH_H
0004 
0005 #ifdef __KERNEL__
0006 # include <linux/rbtree.h>
0007 # include <linux/types.h>
0008 #else
0009 # include "crush_compat.h"
0010 #endif
0011 
0012 /*
0013  * CRUSH is a pseudo-random data distribution algorithm that
0014  * efficiently distributes input values (typically, data objects)
0015  * across a heterogeneous, structured storage cluster.
0016  *
0017  * The algorithm was originally described in detail in this paper
0018  * (although the algorithm has evolved somewhat since then):
0019  *
0020  *     https://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
0021  *
0022  * LGPL2
0023  */
0024 
0025 
0026 #define CRUSH_MAGIC 0x00010000ul   /* for detecting algorithm revisions */
0027 
0028 #define CRUSH_MAX_DEPTH 10  /* max crush hierarchy depth */
0029 #define CRUSH_MAX_RULESET (1<<8)  /* max crush ruleset number */
0030 #define CRUSH_MAX_RULES CRUSH_MAX_RULESET  /* should be the same as max rulesets */
0031 
0032 #define CRUSH_MAX_DEVICE_WEIGHT (100u * 0x10000u)
0033 #define CRUSH_MAX_BUCKET_WEIGHT (65535u * 0x10000u)
0034 
0035 #define CRUSH_ITEM_UNDEF  0x7ffffffe  /* undefined result (internal use only) */
0036 #define CRUSH_ITEM_NONE   0x7fffffff  /* no result */
0037 
0038 /*
0039  * CRUSH uses user-defined "rules" to describe how inputs should be
0040  * mapped to devices.  A rule consists of sequence of steps to perform
0041  * to generate the set of output devices.
0042  */
0043 struct crush_rule_step {
0044     __u32 op;
0045     __s32 arg1;
0046     __s32 arg2;
0047 };
0048 
0049 /* step op codes */
0050 enum {
0051     CRUSH_RULE_NOOP = 0,
0052     CRUSH_RULE_TAKE = 1,          /* arg1 = value to start with */
0053     CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
0054                       /* arg2 = type */
0055     CRUSH_RULE_CHOOSE_INDEP = 3,  /* same */
0056     CRUSH_RULE_EMIT = 4,          /* no args */
0057     CRUSH_RULE_CHOOSELEAF_FIRSTN = 6,
0058     CRUSH_RULE_CHOOSELEAF_INDEP = 7,
0059 
0060     CRUSH_RULE_SET_CHOOSE_TRIES = 8, /* override choose_total_tries */
0061     CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */
0062     CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10,
0063     CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11,
0064     CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12,
0065     CRUSH_RULE_SET_CHOOSELEAF_STABLE = 13
0066 };
0067 
0068 /*
0069  * for specifying choose num (arg1) relative to the max parameter
0070  * passed to do_rule
0071  */
0072 #define CRUSH_CHOOSE_N            0
0073 #define CRUSH_CHOOSE_N_MINUS(x)   (-(x))
0074 
0075 /*
0076  * The rule mask is used to describe what the rule is intended for.
0077  * Given a ruleset and size of output set, we search through the
0078  * rule list for a matching rule_mask.
0079  */
0080 struct crush_rule_mask {
0081     __u8 ruleset;
0082     __u8 type;
0083     __u8 min_size;
0084     __u8 max_size;
0085 };
0086 
0087 struct crush_rule {
0088     __u32 len;
0089     struct crush_rule_mask mask;
0090     struct crush_rule_step steps[];
0091 };
0092 
0093 #define crush_rule_size(len) (sizeof(struct crush_rule) + \
0094                   (len)*sizeof(struct crush_rule_step))
0095 
0096 
0097 
0098 /*
0099  * A bucket is a named container of other items (either devices or
0100  * other buckets).  Items within a bucket are chosen using one of a
0101  * few different algorithms.  The table summarizes how the speed of
0102  * each option measures up against mapping stability when items are
0103  * added or removed.
0104  *
0105  *  Bucket Alg     Speed       Additions    Removals
0106  *  ------------------------------------------------
0107  *  uniform         O(1)       poor         poor
0108  *  list            O(n)       optimal      poor
0109  *  tree            O(log n)   good         good
0110  *  straw           O(n)       better       better
0111  *  straw2          O(n)       optimal      optimal
0112  */
0113 enum {
0114     CRUSH_BUCKET_UNIFORM = 1,
0115     CRUSH_BUCKET_LIST = 2,
0116     CRUSH_BUCKET_TREE = 3,
0117     CRUSH_BUCKET_STRAW = 4,
0118     CRUSH_BUCKET_STRAW2 = 5,
0119 };
0120 extern const char *crush_bucket_alg_name(int alg);
0121 
0122 /*
0123  * although tree was a legacy algorithm, it has been buggy, so
0124  * exclude it.
0125  */
0126 #define CRUSH_LEGACY_ALLOWED_BUCKET_ALGS (  \
0127         (1 << CRUSH_BUCKET_UNIFORM) |   \
0128         (1 << CRUSH_BUCKET_LIST) |  \
0129         (1 << CRUSH_BUCKET_STRAW))
0130 
0131 struct crush_bucket {
0132     __s32 id;        /* this'll be negative */
0133     __u16 type;      /* non-zero; type=0 is reserved for devices */
0134     __u8 alg;        /* one of CRUSH_BUCKET_* */
0135     __u8 hash;       /* which hash function to use, CRUSH_HASH_* */
0136     __u32 weight;    /* 16-bit fixed point */
0137     __u32 size;      /* num items */
0138     __s32 *items;
0139 
0140 };
0141 
0142 /** @ingroup API
0143  *
0144  * Replacement weights for each item in a bucket. The size of the
0145  * array must be exactly the size of the straw2 bucket, just as the
0146  * item_weights array.
0147  *
0148  */
0149 struct crush_weight_set {
0150     __u32 *weights; /*!< 16.16 fixed point weights
0151                              in the same order as items */
0152     __u32 size;     /*!< size of the __weights__ array */
0153 };
0154 
0155 /** @ingroup API
0156  *
0157  * Replacement weights and ids for a given straw2 bucket, for
0158  * placement purposes.
0159  *
0160  * When crush_do_rule() chooses the Nth item from a straw2 bucket, the
0161  * replacement weights found at __weight_set[N]__ are used instead of
0162  * the weights from __item_weights__. If __N__ is greater than
0163  * __weight_set_size__, the weights found at __weight_set_size-1__ are
0164  * used instead. For instance if __weight_set__ is:
0165  *
0166  *    [ [ 0x10000, 0x20000 ],   // position 0
0167  *      [ 0x20000, 0x40000 ] ]  // position 1
0168  *
0169  * choosing the 0th item will use position 0 weights [ 0x10000, 0x20000 ]
0170  * choosing the 1th item will use position 1 weights [ 0x20000, 0x40000 ]
0171  * choosing the 2th item will use position 1 weights [ 0x20000, 0x40000 ]
0172  * etc.
0173  *
0174  */
0175 struct crush_choose_arg {
0176     __s32 *ids;            /*!< values to use instead of items */
0177     __u32 ids_size;        /*!< size of the __ids__ array */
0178     struct crush_weight_set *weight_set; /*!< weight replacements for
0179                                                   a given position */
0180     __u32 weight_set_size; /*!< size of the __weight_set__ array */
0181 };
0182 
0183 /** @ingroup API
0184  *
0185  * Replacement weights and ids for each bucket in the crushmap. The
0186  * __size__ of the __args__ array must be exactly the same as the
0187  * __map->max_buckets__.
0188  *
0189  * The __crush_choose_arg__ at index N will be used when choosing
0190  * an item from the bucket __map->buckets[N]__ bucket, provided it
0191  * is a straw2 bucket.
0192  *
0193  */
0194 struct crush_choose_arg_map {
0195 #ifdef __KERNEL__
0196     struct rb_node node;
0197     s64 choose_args_index;
0198 #endif
0199     struct crush_choose_arg *args; /*!< replacement for each bucket
0200                                             in the crushmap */
0201     __u32 size;                    /*!< size of the __args__ array */
0202 };
0203 
0204 struct crush_bucket_uniform {
0205     struct crush_bucket h;
0206     __u32 item_weight;  /* 16-bit fixed point; all items equally weighted */
0207 };
0208 
0209 struct crush_bucket_list {
0210     struct crush_bucket h;
0211     __u32 *item_weights;  /* 16-bit fixed point */
0212     __u32 *sum_weights;   /* 16-bit fixed point.  element i is sum
0213                  of weights 0..i, inclusive */
0214 };
0215 
0216 struct crush_bucket_tree {
0217     struct crush_bucket h;  /* note: h.size is _tree_ size, not number of
0218                    actual items */
0219     __u8 num_nodes;
0220     __u32 *node_weights;
0221 };
0222 
0223 struct crush_bucket_straw {
0224     struct crush_bucket h;
0225     __u32 *item_weights;   /* 16-bit fixed point */
0226     __u32 *straws;         /* 16-bit fixed point */
0227 };
0228 
0229 struct crush_bucket_straw2 {
0230     struct crush_bucket h;
0231     __u32 *item_weights;   /* 16-bit fixed point */
0232 };
0233 
0234 
0235 
0236 /*
0237  * CRUSH map includes all buckets, rules, etc.
0238  */
0239 struct crush_map {
0240     struct crush_bucket **buckets;
0241     struct crush_rule **rules;
0242 
0243     __s32 max_buckets;
0244     __u32 max_rules;
0245     __s32 max_devices;
0246 
0247     /* choose local retries before re-descent */
0248     __u32 choose_local_tries;
0249     /* choose local attempts using a fallback permutation before
0250      * re-descent */
0251     __u32 choose_local_fallback_tries;
0252     /* choose attempts before giving up */
0253     __u32 choose_total_tries;
0254     /* attempt chooseleaf inner descent once for firstn mode; on
0255      * reject retry outer descent.  Note that this does *not*
0256      * apply to a collision: in that case we will retry as we used
0257      * to. */
0258     __u32 chooseleaf_descend_once;
0259 
0260     /* if non-zero, feed r into chooseleaf, bit-shifted right by (r-1)
0261      * bits.  a value of 1 is best for new clusters.  for legacy clusters
0262      * that want to limit reshuffling, a value of 3 or 4 will make the
0263      * mappings line up a bit better with previous mappings. */
0264     __u8 chooseleaf_vary_r;
0265 
0266     /* if true, it makes chooseleaf firstn to return stable results (if
0267      * no local retry) so that data migrations would be optimal when some
0268      * device fails. */
0269     __u8 chooseleaf_stable;
0270 
0271     /*
0272      * This value is calculated after decode or construction by
0273      * the builder. It is exposed here (rather than having a
0274      * 'build CRUSH working space' function) so that callers can
0275      * reserve a static buffer, allocate space on the stack, or
0276      * otherwise avoid calling into the heap allocator if they
0277      * want to. The size of the working space depends on the map,
0278      * while the size of the scratch vector passed to the mapper
0279      * depends on the size of the desired result set.
0280      *
0281      * Nothing stops the caller from allocating both in one swell
0282      * foop and passing in two points, though.
0283      */
0284     size_t working_size;
0285 
0286 #ifndef __KERNEL__
0287     /*
0288      * version 0 (original) of straw_calc has various flaws.  version 1
0289      * fixes a few of them.
0290      */
0291     __u8 straw_calc_version;
0292 
0293     /*
0294      * allowed bucket algs is a bitmask, here the bit positions
0295      * are CRUSH_BUCKET_*.  note that these are *bits* and
0296      * CRUSH_BUCKET_* values are not, so we need to or together (1
0297      * << CRUSH_BUCKET_WHATEVER).  The 0th bit is not used to
0298      * minimize confusion (bucket type values start at 1).
0299      */
0300     __u32 allowed_bucket_algs;
0301 
0302     __u32 *choose_tries;
0303 #else
0304     /* device/bucket type id -> type name (CrushWrapper::type_map) */
0305     struct rb_root type_names;
0306 
0307     /* device/bucket id -> name (CrushWrapper::name_map) */
0308     struct rb_root names;
0309 
0310     /* CrushWrapper::choose_args */
0311     struct rb_root choose_args;
0312 #endif
0313 };
0314 
0315 
0316 /* crush.c */
0317 extern int crush_get_bucket_item_weight(const struct crush_bucket *b, int pos);
0318 extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
0319 extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
0320 extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
0321 extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
0322 extern void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b);
0323 extern void crush_destroy_bucket(struct crush_bucket *b);
0324 extern void crush_destroy_rule(struct crush_rule *r);
0325 extern void crush_destroy(struct crush_map *map);
0326 
0327 static inline int crush_calc_tree_node(int i)
0328 {
0329     return ((i+1) << 1)-1;
0330 }
0331 
0332 /*
0333  * These data structures are private to the CRUSH implementation. They
0334  * are exposed in this header file because builder needs their
0335  * definitions to calculate the total working size.
0336  *
0337  * Moving this out of the crush map allow us to treat the CRUSH map as
0338  * immutable within the mapper and removes the requirement for a CRUSH
0339  * map lock.
0340  */
0341 struct crush_work_bucket {
0342     __u32 perm_x; /* @x for which *perm is defined */
0343     __u32 perm_n; /* num elements of *perm that are permuted/defined */
0344     __u32 *perm;  /* Permutation of the bucket's items */
0345 };
0346 
0347 struct crush_work {
0348     struct crush_work_bucket **work; /* Per-bucket working store */
0349 #ifdef __KERNEL__
0350     struct list_head item;
0351 #endif
0352 };
0353 
0354 #ifdef __KERNEL__
0355 /* osdmap.c */
0356 void clear_crush_names(struct rb_root *root);
0357 void clear_choose_args(struct crush_map *c);
0358 #endif
0359 
0360 #endif