/*
 * Copyright (C) 2011 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */
#include "dm-btree.h"
#include "dm-btree-internal.h"
#include "dm-transaction-manager.h"

#include <linux/export.h>
#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "btree"

/*
 * Removing an entry from a btree
 * ==============================
 *
 * A node may not drop below a minimum occupancy (merge_threshold(): a
 * third of max_entries), so removal rebalances on the way down rather
 * than after the fact — we only ever hold locks on two levels of the
 * tree at once, and only modify nodes from the top downwards.
 *
 * While walking the spine (remove_raw()/remove_nearest()),
 * rebalance_children() fixes up the children of the current node before
 * we step into one of them:
 *
 *   - the node has a single entry: copy the only child over it, which
 *     reduces the height of the tree;
 *   - the target child has no left sibling: rebalance2(child, right);
 *   - no right sibling: rebalance2(left, child);
 *   - both siblings: rebalance3(), which either deletes the center node
 *     (delete_center_node()) or shares entries across all three
 *     (redistribute3()), depending on the combined occupancy.
 *
 * Rebalancing is performed on the *children* of the current node so the
 * separator keys in the current node can be corrected in place.
 */

0059 static void node_shift(struct btree_node *n, int shift)
0060 {
0061 uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
0062 uint32_t value_size = le32_to_cpu(n->header.value_size);
0063
0064 if (shift < 0) {
0065 shift = -shift;
0066 BUG_ON(shift > nr_entries);
0067 BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift));
0068 memmove(key_ptr(n, 0),
0069 key_ptr(n, shift),
0070 (nr_entries - shift) * sizeof(__le64));
0071 memmove(value_ptr(n, 0),
0072 value_ptr(n, shift),
0073 (nr_entries - shift) * value_size);
0074 } else {
0075 BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries));
0076 memmove(key_ptr(n, shift),
0077 key_ptr(n, 0),
0078 nr_entries * sizeof(__le64));
0079 memmove(value_ptr(n, shift),
0080 value_ptr(n, 0),
0081 nr_entries * value_size);
0082 }
0083 }
0084
0085 static int node_copy(struct btree_node *left, struct btree_node *right, int shift)
0086 {
0087 uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
0088 uint32_t value_size = le32_to_cpu(left->header.value_size);
0089 if (value_size != le32_to_cpu(right->header.value_size)) {
0090 DMERR("mismatched value size");
0091 return -EILSEQ;
0092 }
0093
0094 if (shift < 0) {
0095 shift = -shift;
0096
0097 if (nr_left + shift > le32_to_cpu(left->header.max_entries)) {
0098 DMERR("bad shift");
0099 return -EINVAL;
0100 }
0101
0102 memcpy(key_ptr(left, nr_left),
0103 key_ptr(right, 0),
0104 shift * sizeof(__le64));
0105 memcpy(value_ptr(left, nr_left),
0106 value_ptr(right, 0),
0107 shift * value_size);
0108 } else {
0109 if (shift > le32_to_cpu(right->header.max_entries)) {
0110 DMERR("bad shift");
0111 return -EINVAL;
0112 }
0113
0114 memcpy(key_ptr(right, 0),
0115 key_ptr(left, nr_left - shift),
0116 shift * sizeof(__le64));
0117 memcpy(value_ptr(right, 0),
0118 value_ptr(left, nr_left - shift),
0119 shift * value_size);
0120 }
0121 return 0;
0122 }
0123
0124
0125
0126
0127 static void delete_at(struct btree_node *n, unsigned index)
0128 {
0129 unsigned nr_entries = le32_to_cpu(n->header.nr_entries);
0130 unsigned nr_to_copy = nr_entries - (index + 1);
0131 uint32_t value_size = le32_to_cpu(n->header.value_size);
0132 BUG_ON(index >= nr_entries);
0133
0134 if (nr_to_copy) {
0135 memmove(key_ptr(n, index),
0136 key_ptr(n, index + 1),
0137 nr_to_copy * sizeof(__le64));
0138
0139 memmove(value_ptr(n, index),
0140 value_ptr(n, index + 1),
0141 nr_to_copy * value_size);
0142 }
0143
0144 n->header.nr_entries = cpu_to_le32(nr_entries - 1);
0145 }
0146
0147 static unsigned merge_threshold(struct btree_node *n)
0148 {
0149 return le32_to_cpu(n->header.max_entries) / 3;
0150 }
0151
0152 struct child {
0153 unsigned index;
0154 struct dm_block *block;
0155 struct btree_node *n;
0156 };
0157
0158 static int init_child(struct dm_btree_info *info, struct dm_btree_value_type *vt,
0159 struct btree_node *parent,
0160 unsigned index, struct child *result)
0161 {
0162 int r, inc;
0163 dm_block_t root;
0164
0165 result->index = index;
0166 root = value64(parent, index);
0167
0168 r = dm_tm_shadow_block(info->tm, root, &btree_node_validator,
0169 &result->block, &inc);
0170 if (r)
0171 return r;
0172
0173 result->n = dm_block_data(result->block);
0174
0175 if (inc)
0176 inc_children(info->tm, result->n, vt);
0177
0178 *((__le64 *) value_ptr(parent, index)) =
0179 cpu_to_le64(dm_block_location(result->block));
0180
0181 return 0;
0182 }
0183
0184 static void exit_child(struct dm_btree_info *info, struct child *c)
0185 {
0186 dm_tm_unlock(info->tm, c->block);
0187 }
0188
0189 static int shift(struct btree_node *left, struct btree_node *right, int count)
0190 {
0191 int r;
0192 uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
0193 uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
0194 uint32_t max_entries = le32_to_cpu(left->header.max_entries);
0195 uint32_t r_max_entries = le32_to_cpu(right->header.max_entries);
0196
0197 if (max_entries != r_max_entries) {
0198 DMERR("node max_entries mismatch");
0199 return -EILSEQ;
0200 }
0201
0202 if (nr_left - count > max_entries) {
0203 DMERR("node shift out of bounds");
0204 return -EINVAL;
0205 }
0206
0207 if (nr_right + count > max_entries) {
0208 DMERR("node shift out of bounds");
0209 return -EINVAL;
0210 }
0211
0212 if (!count)
0213 return 0;
0214
0215 if (count > 0) {
0216 node_shift(right, count);
0217 r = node_copy(left, right, count);
0218 if (r)
0219 return r;
0220 } else {
0221 r = node_copy(left, right, count);
0222 if (r)
0223 return r;
0224 node_shift(right, count);
0225 }
0226
0227 left->header.nr_entries = cpu_to_le32(nr_left - count);
0228 right->header.nr_entries = cpu_to_le32(nr_right + count);
0229
0230 return 0;
0231 }
0232
0233 static int __rebalance2(struct dm_btree_info *info, struct btree_node *parent,
0234 struct child *l, struct child *r)
0235 {
0236 int ret;
0237 struct btree_node *left = l->n;
0238 struct btree_node *right = r->n;
0239 uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
0240 uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
0241
0242
0243
0244
0245
0246
0247 unsigned int threshold = 2 * (merge_threshold(left) + 1);
0248
0249 if (nr_left + nr_right < threshold) {
0250
0251
0252
0253 node_copy(left, right, -nr_right);
0254 left->header.nr_entries = cpu_to_le32(nr_left + nr_right);
0255 delete_at(parent, r->index);
0256
0257
0258
0259
0260
0261 dm_tm_dec(info->tm, dm_block_location(r->block));
0262 } else {
0263
0264
0265
0266 unsigned target_left = (nr_left + nr_right) / 2;
0267 ret = shift(left, right, nr_left - target_left);
0268 if (ret)
0269 return ret;
0270 *key_ptr(parent, r->index) = right->keys[0];
0271 }
0272 return 0;
0273 }
0274
0275 static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
0276 struct dm_btree_value_type *vt, unsigned left_index)
0277 {
0278 int r;
0279 struct btree_node *parent;
0280 struct child left, right;
0281
0282 parent = dm_block_data(shadow_current(s));
0283
0284 r = init_child(info, vt, parent, left_index, &left);
0285 if (r)
0286 return r;
0287
0288 r = init_child(info, vt, parent, left_index + 1, &right);
0289 if (r) {
0290 exit_child(info, &left);
0291 return r;
0292 }
0293
0294 r = __rebalance2(info, parent, &left, &right);
0295
0296 exit_child(info, &left);
0297 exit_child(info, &right);
0298
0299 return r;
0300 }
0301
0302
0303
0304
0305
0306
0307 static int delete_center_node(struct dm_btree_info *info, struct btree_node *parent,
0308 struct child *l, struct child *c, struct child *r,
0309 struct btree_node *left, struct btree_node *center, struct btree_node *right,
0310 uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
0311 {
0312 uint32_t max_entries = le32_to_cpu(left->header.max_entries);
0313 unsigned shift = min(max_entries - nr_left, nr_center);
0314
0315 if (nr_left + shift > max_entries) {
0316 DMERR("node shift out of bounds");
0317 return -EINVAL;
0318 }
0319
0320 node_copy(left, center, -shift);
0321 left->header.nr_entries = cpu_to_le32(nr_left + shift);
0322
0323 if (shift != nr_center) {
0324 shift = nr_center - shift;
0325
0326 if ((nr_right + shift) > max_entries) {
0327 DMERR("node shift out of bounds");
0328 return -EINVAL;
0329 }
0330
0331 node_shift(right, shift);
0332 node_copy(center, right, shift);
0333 right->header.nr_entries = cpu_to_le32(nr_right + shift);
0334 }
0335 *key_ptr(parent, r->index) = right->keys[0];
0336
0337 delete_at(parent, c->index);
0338 r->index--;
0339
0340 dm_tm_dec(info->tm, dm_block_location(c->block));
0341 return __rebalance2(info, parent, l, r);
0342 }
0343
0344
0345
0346
0347 static int redistribute3(struct dm_btree_info *info, struct btree_node *parent,
0348 struct child *l, struct child *c, struct child *r,
0349 struct btree_node *left, struct btree_node *center, struct btree_node *right,
0350 uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
0351 {
0352 int s, ret;
0353 uint32_t max_entries = le32_to_cpu(left->header.max_entries);
0354 unsigned total = nr_left + nr_center + nr_right;
0355 unsigned target_right = total / 3;
0356 unsigned remainder = (target_right * 3) != total;
0357 unsigned target_left = target_right + remainder;
0358
0359 BUG_ON(target_left > max_entries);
0360 BUG_ON(target_right > max_entries);
0361
0362 if (nr_left < nr_right) {
0363 s = nr_left - target_left;
0364
0365 if (s < 0 && nr_center < -s) {
0366
0367 ret = shift(left, center, -nr_center);
0368 if (ret)
0369 return ret;
0370
0371 s += nr_center;
0372 ret = shift(left, right, s);
0373 if (ret)
0374 return ret;
0375
0376 nr_right += s;
0377 } else {
0378 ret = shift(left, center, s);
0379 if (ret)
0380 return ret;
0381 }
0382
0383 ret = shift(center, right, target_right - nr_right);
0384 if (ret)
0385 return ret;
0386 } else {
0387 s = target_right - nr_right;
0388 if (s > 0 && nr_center < s) {
0389
0390 ret = shift(center, right, nr_center);
0391 if (ret)
0392 return ret;
0393 s -= nr_center;
0394 ret = shift(left, right, s);
0395 if (ret)
0396 return ret;
0397 nr_left -= s;
0398 } else {
0399 ret = shift(center, right, s);
0400 if (ret)
0401 return ret;
0402 }
0403
0404 ret = shift(left, center, nr_left - target_left);
0405 if (ret)
0406 return ret;
0407 }
0408
0409 *key_ptr(parent, c->index) = center->keys[0];
0410 *key_ptr(parent, r->index) = right->keys[0];
0411 return 0;
0412 }
0413
0414 static int __rebalance3(struct dm_btree_info *info, struct btree_node *parent,
0415 struct child *l, struct child *c, struct child *r)
0416 {
0417 struct btree_node *left = l->n;
0418 struct btree_node *center = c->n;
0419 struct btree_node *right = r->n;
0420
0421 uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
0422 uint32_t nr_center = le32_to_cpu(center->header.nr_entries);
0423 uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
0424
0425 unsigned threshold = merge_threshold(left) * 4 + 1;
0426
0427 if ((left->header.max_entries != center->header.max_entries) ||
0428 (center->header.max_entries != right->header.max_entries)) {
0429 DMERR("bad btree metadata, max_entries differ");
0430 return -EILSEQ;
0431 }
0432
0433 if ((nr_left + nr_center + nr_right) < threshold) {
0434 return delete_center_node(info, parent, l, c, r, left, center, right,
0435 nr_left, nr_center, nr_right);
0436 }
0437
0438 return redistribute3(info, parent, l, c, r, left, center, right,
0439 nr_left, nr_center, nr_right);
0440 }
0441
0442 static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
0443 struct dm_btree_value_type *vt, unsigned left_index)
0444 {
0445 int r;
0446 struct btree_node *parent = dm_block_data(shadow_current(s));
0447 struct child left, center, right;
0448
0449
0450
0451
0452 r = init_child(info, vt, parent, left_index, &left);
0453 if (r)
0454 return r;
0455
0456 r = init_child(info, vt, parent, left_index + 1, ¢er);
0457 if (r) {
0458 exit_child(info, &left);
0459 return r;
0460 }
0461
0462 r = init_child(info, vt, parent, left_index + 2, &right);
0463 if (r) {
0464 exit_child(info, &left);
0465 exit_child(info, ¢er);
0466 return r;
0467 }
0468
0469 r = __rebalance3(info, parent, &left, ¢er, &right);
0470
0471 exit_child(info, &left);
0472 exit_child(info, ¢er);
0473 exit_child(info, &right);
0474
0475 return r;
0476 }
0477
0478 static int rebalance_children(struct shadow_spine *s,
0479 struct dm_btree_info *info,
0480 struct dm_btree_value_type *vt, uint64_t key)
0481 {
0482 int i, r, has_left_sibling, has_right_sibling;
0483 struct btree_node *n;
0484
0485 n = dm_block_data(shadow_current(s));
0486
0487 if (le32_to_cpu(n->header.nr_entries) == 1) {
0488 struct dm_block *child;
0489 dm_block_t b = value64(n, 0);
0490
0491 r = dm_tm_read_lock(info->tm, b, &btree_node_validator, &child);
0492 if (r)
0493 return r;
0494
0495 memcpy(n, dm_block_data(child),
0496 dm_bm_block_size(dm_tm_get_bm(info->tm)));
0497
0498 dm_tm_dec(info->tm, dm_block_location(child));
0499 dm_tm_unlock(info->tm, child);
0500 return 0;
0501 }
0502
0503 i = lower_bound(n, key);
0504 if (i < 0)
0505 return -ENODATA;
0506
0507 has_left_sibling = i > 0;
0508 has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);
0509
0510 if (!has_left_sibling)
0511 r = rebalance2(s, info, vt, i);
0512
0513 else if (!has_right_sibling)
0514 r = rebalance2(s, info, vt, i - 1);
0515
0516 else
0517 r = rebalance3(s, info, vt, i - 1);
0518
0519 return r;
0520 }
0521
0522 static int do_leaf(struct btree_node *n, uint64_t key, unsigned *index)
0523 {
0524 int i = lower_bound(n, key);
0525
0526 if ((i < 0) ||
0527 (i >= le32_to_cpu(n->header.nr_entries)) ||
0528 (le64_to_cpu(n->keys[i]) != key))
0529 return -ENODATA;
0530
0531 *index = i;
0532
0533 return 0;
0534 }
0535
0536
0537
0538
0539
0540 static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
0541 struct dm_btree_value_type *vt, dm_block_t root,
0542 uint64_t key, unsigned *index)
0543 {
0544 int i = *index, r;
0545 struct btree_node *n;
0546
0547 for (;;) {
0548 r = shadow_step(s, root, vt);
0549 if (r < 0)
0550 break;
0551
0552
0553
0554
0555
0556
0557 if (shadow_has_parent(s)) {
0558 __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
0559 memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
0560 &location, sizeof(__le64));
0561 }
0562
0563 n = dm_block_data(shadow_current(s));
0564
0565 if (le32_to_cpu(n->header.flags) & LEAF_NODE)
0566 return do_leaf(n, key, index);
0567
0568 r = rebalance_children(s, info, vt, key);
0569 if (r)
0570 break;
0571
0572 n = dm_block_data(shadow_current(s));
0573 if (le32_to_cpu(n->header.flags) & LEAF_NODE)
0574 return do_leaf(n, key, index);
0575
0576 i = lower_bound(n, key);
0577
0578
0579
0580
0581
0582
0583 root = value64(n, i);
0584 }
0585
0586 return r;
0587 }
0588
0589 int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
0590 uint64_t *keys, dm_block_t *new_root)
0591 {
0592 unsigned level, last_level = info->levels - 1;
0593 int index = 0, r = 0;
0594 struct shadow_spine spine;
0595 struct btree_node *n;
0596 struct dm_btree_value_type le64_vt;
0597
0598 init_le64_type(info->tm, &le64_vt);
0599 init_shadow_spine(&spine, info);
0600 for (level = 0; level < info->levels; level++) {
0601 r = remove_raw(&spine, info,
0602 (level == last_level ?
0603 &info->value_type : &le64_vt),
0604 root, keys[level], (unsigned *)&index);
0605 if (r < 0)
0606 break;
0607
0608 n = dm_block_data(shadow_current(&spine));
0609 if (level != last_level) {
0610 root = value64(n, index);
0611 continue;
0612 }
0613
0614 BUG_ON(index < 0 || index >= le32_to_cpu(n->header.nr_entries));
0615
0616 if (info->value_type.dec)
0617 info->value_type.dec(info->value_type.context,
0618 value_ptr(n, index), 1);
0619
0620 delete_at(n, index);
0621 }
0622
0623 if (!r)
0624 *new_root = shadow_root(&spine);
0625 exit_shadow_spine(&spine);
0626
0627 return r;
0628 }
0629 EXPORT_SYMBOL_GPL(dm_btree_remove);
0630
0631
0632
0633 static int remove_nearest(struct shadow_spine *s, struct dm_btree_info *info,
0634 struct dm_btree_value_type *vt, dm_block_t root,
0635 uint64_t key, int *index)
0636 {
0637 int i = *index, r;
0638 struct btree_node *n;
0639
0640 for (;;) {
0641 r = shadow_step(s, root, vt);
0642 if (r < 0)
0643 break;
0644
0645
0646
0647
0648
0649
0650 if (shadow_has_parent(s)) {
0651 __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
0652 memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
0653 &location, sizeof(__le64));
0654 }
0655
0656 n = dm_block_data(shadow_current(s));
0657
0658 if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
0659 *index = lower_bound(n, key);
0660 return 0;
0661 }
0662
0663 r = rebalance_children(s, info, vt, key);
0664 if (r)
0665 break;
0666
0667 n = dm_block_data(shadow_current(s));
0668 if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
0669 *index = lower_bound(n, key);
0670 return 0;
0671 }
0672
0673 i = lower_bound(n, key);
0674
0675
0676
0677
0678
0679
0680 root = value64(n, i);
0681 }
0682
0683 return r;
0684 }
0685
0686 static int remove_one(struct dm_btree_info *info, dm_block_t root,
0687 uint64_t *keys, uint64_t end_key,
0688 dm_block_t *new_root, unsigned *nr_removed)
0689 {
0690 unsigned level, last_level = info->levels - 1;
0691 int index = 0, r = 0;
0692 struct shadow_spine spine;
0693 struct btree_node *n;
0694 struct dm_btree_value_type le64_vt;
0695 uint64_t k;
0696
0697 init_le64_type(info->tm, &le64_vt);
0698 init_shadow_spine(&spine, info);
0699 for (level = 0; level < last_level; level++) {
0700 r = remove_raw(&spine, info, &le64_vt,
0701 root, keys[level], (unsigned *) &index);
0702 if (r < 0)
0703 goto out;
0704
0705 n = dm_block_data(shadow_current(&spine));
0706 root = value64(n, index);
0707 }
0708
0709 r = remove_nearest(&spine, info, &info->value_type,
0710 root, keys[last_level], &index);
0711 if (r < 0)
0712 goto out;
0713
0714 n = dm_block_data(shadow_current(&spine));
0715
0716 if (index < 0)
0717 index = 0;
0718
0719 if (index >= le32_to_cpu(n->header.nr_entries)) {
0720 r = -ENODATA;
0721 goto out;
0722 }
0723
0724 k = le64_to_cpu(n->keys[index]);
0725 if (k >= keys[last_level] && k < end_key) {
0726 if (info->value_type.dec)
0727 info->value_type.dec(info->value_type.context,
0728 value_ptr(n, index), 1);
0729
0730 delete_at(n, index);
0731 keys[last_level] = k + 1ull;
0732
0733 } else
0734 r = -ENODATA;
0735
0736 out:
0737 *new_root = shadow_root(&spine);
0738 exit_shadow_spine(&spine);
0739
0740 return r;
0741 }
0742
0743 int dm_btree_remove_leaves(struct dm_btree_info *info, dm_block_t root,
0744 uint64_t *first_key, uint64_t end_key,
0745 dm_block_t *new_root, unsigned *nr_removed)
0746 {
0747 int r;
0748
0749 *nr_removed = 0;
0750 do {
0751 r = remove_one(info, root, first_key, end_key, &root, nr_removed);
0752 if (!r)
0753 (*nr_removed)++;
0754 } while (!r);
0755
0756 *new_root = root;
0757 return r == -ENODATA ? 0 : r;
0758 }
0759 EXPORT_SYMBOL_GPL(dm_btree_remove_leaves);