0001
0002
0003
0004
0005
0006
0007 #include "dm-btree-internal.h"
0008 #include "dm-space-map.h"
0009 #include "dm-transaction-manager.h"
0010
0011 #include <linux/export.h>
0012 #include <linux/device-mapper.h>
0013
0014 #define DM_MSG_PREFIX "btree"
0015
0016
0017
0018
/*
 * Copy @len bytes of @src into on-disk data at @dest, then drop the
 * sparse "blessed for disk" annotation that the caller placed on @src.
 */
static void memcpy_disk(void *dest, const void *src, size_t len)
	__dm_written_to_disk(src)
{
	memcpy(dest, src, len);
	__dm_unbless_for_disk(src);
}
0025
/*
 * Insert @elt at @index within an array of @nr_elts elements, each
 * @elt_size bytes, shifting the tail up by one slot.  The caller must
 * guarantee room for nr_elts + 1 elements.  Consumes the disk
 * blessing on @elt (via memcpy_disk).
 */
static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
			 unsigned index, void *elt)
	__dm_written_to_disk(elt)
{
	if (index < nr_elts)
		/* overlapping move of the tail; memmove is required */
		memmove(base + (elt_size * (index + 1)),
			base + (elt_size * index),
			(nr_elts - index) * elt_size);

	memcpy_disk(base + (elt_size * index), elt, elt_size);
}
0037
0038
0039
0040
0041 static int bsearch(struct btree_node *n, uint64_t key, int want_hi)
0042 {
0043 int lo = -1, hi = le32_to_cpu(n->header.nr_entries);
0044
0045 while (hi - lo > 1) {
0046 int mid = lo + ((hi - lo) / 2);
0047 uint64_t mid_key = le64_to_cpu(n->keys[mid]);
0048
0049 if (mid_key == key)
0050 return mid;
0051
0052 if (mid_key < key)
0053 lo = mid;
0054 else
0055 hi = mid;
0056 }
0057
0058 return want_hi ? hi : lo;
0059 }
0060
/* Index of the largest key <= @key, or -1 when every key is greater. */
int lower_bound(struct btree_node *n, uint64_t key)
{
	return bsearch(n, key, 0);
}
0065
/* Index of the smallest key >= @key, or nr_entries when every key is smaller. */
static int upper_bound(struct btree_node *n, uint64_t key)
{
	return bsearch(n, key, 1);
}
0070
/*
 * Bump the reference counts of everything node @n points at: child
 * blocks for an internal node, or the values themselves (via the value
 * type's inc method, if any) for a leaf.
 */
void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
		  struct dm_btree_value_type *vt)
{
	uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);

	if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
		dm_tm_with_runs(tm, value_ptr(n, 0), nr_entries, dm_tm_inc_range);

	else if (vt->inc)
		vt->inc(vt->context, value_ptr(n, 0), nr_entries);
}
0082
/*
 * Insert (@key, @value) at slot @index of @node, shifting later entries
 * up.  Returns -ENOMEM if the node is full or @index is out of range;
 * in that case the disk blessing on @value is dropped before returning,
 * so the caller must not unbless it again.
 */
static int insert_at(size_t value_size, struct btree_node *node, unsigned index,
		     uint64_t key, void *value)
	__dm_written_to_disk(value)
{
	uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
	uint32_t max_entries = le32_to_cpu(node->header.max_entries);
	__le64 key_le = cpu_to_le64(key);

	if (index > nr_entries ||
	    index >= max_entries ||
	    nr_entries >= max_entries) {
		DMERR("too many entries in btree node for insert");
		__dm_unbless_for_disk(value);
		return -ENOMEM;
	}

	__dm_bless_for_disk(&key_le);

	/* key and value arrays are kept in lockstep */
	array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
	array_insert(value_base(node), value_size, nr_entries, index, value);
	node->header.nr_entries = cpu_to_le32(nr_entries + 1);

	return 0;
}
0107
0108
0109
0110
0111
0112
0113
0114 static uint32_t calc_max_entries(size_t value_size, size_t block_size)
0115 {
0116 uint32_t total, n;
0117 size_t elt_size = sizeof(uint64_t) + value_size;
0118
0119 block_size -= sizeof(struct node_header);
0120 total = block_size / elt_size;
0121 n = total / 3;
0122
0123 return 3 * n;
0124 }
0125
/*
 * Create an empty leaf node to act as the root of a fresh btree.
 * On success *root holds the new node's block number.
 */
int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
{
	int r;
	struct dm_block *b;
	struct btree_node *n;
	size_t block_size;
	uint32_t max_entries;

	r = new_block(info, &b);
	if (r < 0)
		return r;

	block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
	max_entries = calc_max_entries(info->value_type.size, block_size);

	n = dm_block_data(b);
	memset(n, 0, block_size);
	n->header.flags = cpu_to_le32(LEAF_NODE);
	n->header.nr_entries = cpu_to_le32(0);
	n->header.max_entries = cpu_to_le32(max_entries);
	n->header.value_size = cpu_to_le32(info->value_type.size);

	*root = dm_block_location(b);
	unlock_block(info, b);

	return 0;
}
EXPORT_SYMBOL_GPL(dm_btree_empty);
0154
0155
0156
0157
0158
0159
0160
/*
 * Deletion walks the tree with an explicit stack of frames rather than
 * recursion.
 */
#define MAX_SPINE_DEPTH 64
struct frame {
	struct dm_block *b;	/* read-locked block for this node */
	struct btree_node *n;	/* its data */
	unsigned level;		/* which btree level this node belongs to */
	unsigned nr_children;
	unsigned current_child;	/* next child index to visit */
};

struct del_stack {
	struct dm_btree_info *info;
	struct dm_transaction_manager *tm;
	int top;		/* index of top frame; -1 when empty */
	struct frame spine[MAX_SPINE_DEPTH];
};
0176
0177 static int top_frame(struct del_stack *s, struct frame **f)
0178 {
0179 if (s->top < 0) {
0180 DMERR("btree deletion stack empty");
0181 return -EINVAL;
0182 }
0183
0184 *f = s->spine + s->top;
0185
0186 return 0;
0187 }
0188
0189 static int unprocessed_frames(struct del_stack *s)
0190 {
0191 return s->top >= 0;
0192 }
0193
0194 static void prefetch_children(struct del_stack *s, struct frame *f)
0195 {
0196 unsigned i;
0197 struct dm_block_manager *bm = dm_tm_get_bm(s->tm);
0198
0199 for (i = 0; i < f->nr_children; i++)
0200 dm_bm_prefetch(bm, value64(f->n, i));
0201 }
0202
/*
 * Multi-level btrees store the root of another btree in their leaves.
 * True if frame @f belongs to one of those internal (non-last) levels.
 */
static bool is_internal_level(struct dm_btree_info *info, struct frame *f)
{
	return f->level < (info->levels - 1);
}
0207
/*
 * Push block @b onto the deletion stack.  Shared blocks (ref count > 1)
 * are merely dereferenced and never walked into; otherwise the block is
 * read-locked and a new frame set up for its children.
 */
static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
{
	int r;
	uint32_t ref_count;

	if (s->top >= MAX_SPINE_DEPTH - 1) {
		DMERR("btree deletion stack out of memory");
		return -ENOMEM;
	}

	r = dm_tm_ref(s->tm, b, &ref_count);
	if (r)
		return r;

	if (ref_count > 1)
		/*
		 * This is a shared node, so we can just decrement its
		 * reference counter and leave the children alone.
		 */
		dm_tm_dec(s->tm, b);

	else {
		uint32_t flags;
		struct frame *f = s->spine + ++s->top;

		r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
		if (r) {
			/* undo the push on failure */
			s->top--;
			return r;
		}

		f->n = dm_block_data(f->b);
		f->level = level;
		f->nr_children = le32_to_cpu(f->n->header.nr_entries);
		f->current_child = 0;

		flags = le32_to_cpu(f->n->header.flags);
		/* children are blocks in both cases, so prefetching helps */
		if (flags & INTERNAL_NODE || is_internal_level(s->info, f))
			prefetch_children(s, f);
	}

	return 0;
}
0251
/*
 * Pop the top frame: drop our reference on its block and release the
 * read lock.
 */
static void pop_frame(struct del_stack *s)
{
	struct frame *f = s->spine + s->top--;

	dm_tm_dec(s->tm, dm_block_location(f->b));
	dm_tm_unlock(s->tm, f->b);
}
0259
0260 static void unlock_all_frames(struct del_stack *s)
0261 {
0262 struct frame *f;
0263
0264 while (unprocessed_frames(s)) {
0265 f = s->spine + s->top--;
0266 dm_tm_unlock(s->tm, f->b);
0267 }
0268 }
0269
/*
 * Delete the btree rooted at @root, dropping reference counts on all
 * blocks and values it owns.  Subtrees that are shared with other
 * trees are dereferenced but not walked.
 */
int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
{
	int r;
	struct del_stack *s;

	/*
	 * NOTE(review): GFP_NOFS suggests this can be reached from a
	 * filesystem/ioctl context where recursing into the FS for
	 * reclaim would deadlock — confirm against callers.
	 */
	s = kmalloc(sizeof(*s), GFP_NOFS);
	if (!s)
		return -ENOMEM;
	s->info = info;
	s->tm = info->tm;
	s->top = -1;

	r = push_frame(s, root, 0);
	if (r)
		goto out;

	while (unprocessed_frames(s)) {
		uint32_t flags;
		struct frame *f;
		dm_block_t b;

		r = top_frame(s, &f);
		if (r)
			goto out;

		if (f->current_child >= f->nr_children) {
			/* all children visited; release this node */
			pop_frame(s);
			continue;
		}

		flags = le32_to_cpu(f->n->header.flags);
		if (flags & INTERNAL_NODE) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level);
			if (r)
				goto out;

		} else if (is_internal_level(info, f)) {
			/* leaf of an inner level: values are roots of the next level */
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level + 1);
			if (r)
				goto out;

		} else {
			/* bottom-level leaf: release the values themselves */
			if (info->value_type.dec)
				info->value_type.dec(info->value_type.context,
						     value_ptr(f->n, 0), f->nr_children);
			pop_frame(s);
		}
	}
out:
	if (r) {
		/* cleanup all frames of del_stack */
		unlock_all_frames(s);
	}
	kfree(s);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_del);
0337
0338
0339
/*
 * Descend from @block to a leaf, using @search_fn to pick an index at
 * every node.  On success *result_key holds the key found at the leaf
 * and, when @v is non-NULL, @value_size bytes of its value are copied
 * into @v.  Returns -ENODATA when the search falls off either end of a
 * node.  The ro_spine keeps the visited blocks locked.
 */
static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
			    int (*search_fn)(struct btree_node *, uint64_t),
			    uint64_t *result_key, void *v, size_t value_size)
{
	int i, r;
	uint32_t flags, nr_entries;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		i = search_fn(ro_node(s), key);

		flags = le32_to_cpu(ro_node(s)->header.flags);
		nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (i < 0 || i >= nr_entries)
			return -ENODATA;

		if (flags & INTERNAL_NODE)
			block = value64(ro_node(s), i);

	} while (!(flags & LEAF_NODE));

	*result_key = le64_to_cpu(ro_node(s)->keys[i]);
	if (v)
		memcpy(v, value_ptr(ro_node(s), i), value_size);

	return 0;
}
0370
0371 int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
0372 uint64_t *keys, void *value_le)
0373 {
0374 unsigned level, last_level = info->levels - 1;
0375 int r = -ENODATA;
0376 uint64_t rkey;
0377 __le64 internal_value_le;
0378 struct ro_spine spine;
0379
0380 init_ro_spine(&spine, info);
0381 for (level = 0; level < info->levels; level++) {
0382 size_t size;
0383 void *value_p;
0384
0385 if (level == last_level) {
0386 value_p = value_le;
0387 size = info->value_type.size;
0388
0389 } else {
0390 value_p = &internal_value_le;
0391 size = sizeof(uint64_t);
0392 }
0393
0394 r = btree_lookup_raw(&spine, root, keys[level],
0395 lower_bound, &rkey,
0396 value_p, size);
0397
0398 if (!r) {
0399 if (rkey != keys[level]) {
0400 exit_ro_spine(&spine);
0401 return -ENODATA;
0402 }
0403 } else {
0404 exit_ro_spine(&spine);
0405 return r;
0406 }
0407
0408 root = le64_to_cpu(internal_value_le);
0409 }
0410 exit_ro_spine(&spine);
0411
0412 return r;
0413 }
0414 EXPORT_SYMBOL_GPL(dm_btree_lookup);
0415
/*
 * Find the smallest key >= @key within the subtree rooted at @root,
 * recursing through internal nodes.  Returns -ENODATA when no such key
 * exists in this subtree.
 */
static int dm_btree_lookup_next_single(struct dm_btree_info *info, dm_block_t root,
				       uint64_t key, uint64_t *rkey, void *value_le)
{
	int r, i;
	uint32_t flags, nr_entries;
	struct dm_block *node;
	struct btree_node *n;

	r = bn_read_lock(info, root, &node);
	if (r)
		return r;

	n = dm_block_data(node);
	flags = le32_to_cpu(n->header.flags);
	nr_entries = le32_to_cpu(n->header.nr_entries);

	if (flags & INTERNAL_NODE) {
		i = lower_bound(n, key);
		if (i < 0) {
			/*
			 * avoid early -ENODATA return when all entries are
			 * higher than the search @key.
			 */
			i = 0;
		}
		if (i >= nr_entries) {
			r = -ENODATA;
			goto out;
		}

		r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
		if (r == -ENODATA && i < (nr_entries - 1)) {
			/* the next key may live in the adjacent child */
			i++;
			r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
		}

	} else {
		i = upper_bound(n, key);
		if (i < 0 || i >= nr_entries) {
			r = -ENODATA;
			goto out;
		}

		*rkey = le64_to_cpu(n->keys[i]);
		memcpy(value_le, value_ptr(n, i), info->value_type.size);
	}
out:
	dm_tm_unlock(info->tm, node);
	return r;
}
0466
/*
 * Find the first entry whose bottom-level key is >= keys[last level].
 * Higher levels must match keys[] exactly.  On success *rkey and
 * @value_le are filled in from the bottom-level leaf.
 */
int dm_btree_lookup_next(struct dm_btree_info *info, dm_block_t root,
			 uint64_t *keys, uint64_t *rkey, void *value_le)
{
	unsigned level;
	int r = -ENODATA;
	__le64 internal_value_le;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels - 1u; level++) {
		r = btree_lookup_raw(&spine, root, keys[level],
				     lower_bound, rkey,
				     &internal_value_le, sizeof(uint64_t));
		if (r)
			goto out;

		if (*rkey != keys[level]) {
			/* internal levels require an exact match */
			r = -ENODATA;
			goto out;
		}

		root = le64_to_cpu(internal_value_le);
	}

	r = dm_btree_lookup_next_single(info, root, keys[level], rkey, value_le);
out:
	exit_ro_spine(&spine);
	return r;
}

EXPORT_SYMBOL_GPL(dm_btree_lookup_next);
0498
0499
0500
0501
0502
0503
0504
0505 static void copy_entries(struct btree_node *dest, unsigned dest_offset,
0506 struct btree_node *src, unsigned src_offset,
0507 unsigned count)
0508 {
0509 size_t value_size = le32_to_cpu(dest->header.value_size);
0510 memcpy(dest->keys + dest_offset, src->keys + src_offset, count * sizeof(uint64_t));
0511 memcpy(value_ptr(dest, dest_offset), value_ptr(src, src_offset), count * value_size);
0512 }
0513
0514
0515
0516
0517
0518 static void move_entries(struct btree_node *dest, unsigned dest_offset,
0519 struct btree_node *src, unsigned src_offset,
0520 unsigned count)
0521 {
0522 size_t value_size = le32_to_cpu(dest->header.value_size);
0523 memmove(dest->keys + dest_offset, src->keys + src_offset, count * sizeof(uint64_t));
0524 memmove(value_ptr(dest, dest_offset), value_ptr(src, src_offset), count * value_size);
0525 }
0526
0527
0528
0529
0530
/* Erase the first @count entries of @n, shifting the rest down. */
static void shift_down(struct btree_node *n, unsigned count)
{
	move_entries(n, 0, n, count, le32_to_cpu(n->header.nr_entries) - count);
}
0535
0536
0537
0538
0539
/*
 * Open up @count empty slots at the front of @n by shifting everything
 * up.  The caller must know the node has capacity for this.
 */
static void shift_up(struct btree_node *n, unsigned count)
{
	move_entries(n, count, n, 0, le32_to_cpu(n->header.nr_entries));
}
0544
0545
0546
0547
0548
/*
 * Redistribute entries between two adjacent nodes so each ends up with
 * roughly half (left takes the smaller share when the total is odd).
 */
static void redistribute2(struct btree_node *left, struct btree_node *right)
{
	unsigned nr_left = le32_to_cpu(left->header.nr_entries);
	unsigned nr_right = le32_to_cpu(right->header.nr_entries);
	unsigned total = nr_left + nr_right;
	unsigned target_left = total / 2;
	unsigned target_right = total - target_left;

	if (nr_left < target_left) {
		unsigned delta = target_left - nr_left;
		/* pull entries from the front of right */
		copy_entries(left, nr_left, right, 0, delta);
		shift_down(right, delta);
	} else if (nr_left > target_left) {
		unsigned delta = nr_left - target_left;
		/* push left's tail onto the front of right */
		if (nr_right)
			shift_up(right, delta);
		copy_entries(right, 0, left, target_left, delta);
	}

	left->header.nr_entries = cpu_to_le32(target_left);
	right->header.nr_entries = cpu_to_le32(target_right);
}
0571
0572
0573
0574
0575
/*
 * Redistribute entries between three nodes.  Assumes the center node
 * is empty on entry (BUG otherwise); it is filled from left and/or
 * right so each node ends up with roughly a third of the total.
 */
static void redistribute3(struct btree_node *left, struct btree_node *center,
			  struct btree_node *right)
{
	unsigned nr_left = le32_to_cpu(left->header.nr_entries);
	unsigned nr_center = le32_to_cpu(center->header.nr_entries);
	unsigned nr_right = le32_to_cpu(right->header.nr_entries);
	unsigned total, target_left, target_center, target_right;

	BUG_ON(nr_center);

	total = nr_left + nr_right;
	target_left = total / 3;
	target_center = (total - target_left) / 2;
	target_right = (total - target_left - target_center);

	if (nr_left < target_left) {
		unsigned left_short = target_left - nr_left;
		/* left is short: top up left and fill center from right */
		copy_entries(left, nr_left, right, 0, left_short);
		copy_entries(center, 0, right, left_short, target_center);
		shift_down(right, nr_right - target_right);

	} else if (nr_left < (target_left + target_center)) {
		unsigned left_to_center = nr_left - target_left;
		/* center takes left's surplus plus part of right's head */
		copy_entries(center, 0, left, target_left, left_to_center);
		copy_entries(center, left_to_center, right, 0, target_center - left_to_center);
		shift_down(right, nr_right - target_right);

	} else {
		unsigned right_short = target_right - nr_right;
		/* right is short: take from left's tail; rest of surplus goes to center */
		shift_up(right, right_short);
		copy_entries(right, 0, left, nr_left - right_short, right_short);
		copy_entries(center, 0, left, target_left, nr_left - target_left);
	}

	left->header.nr_entries = cpu_to_le32(target_left);
	center->header.nr_entries = cpu_to_le32(target_center);
	right->header.nr_entries = cpu_to_le32(target_right);
}
0614
0615
0616
0617
0618
0619
0620
0621
0622
0623
0624
0625
0626
0627
0628
0629
0630
0631
0632
0633
0634
0635
0636
0637
0638
0639
0640
0641
0642
0643
0644
/*
 * Split the current shadowed node by creating a new sibling and moving
 * half the entries across; the parent (which must have space) gains an
 * entry pointing at the sibling.  On return s->nodes[1] is whichever
 * child now covers @key; the other child is unlocked.
 */
static int split_one_into_two(struct shadow_spine *s, unsigned parent_index,
			      struct dm_btree_value_type *vt, uint64_t key)
{
	int r;
	struct dm_block *left, *right, *parent;
	struct btree_node *ln, *rn, *pn;
	__le64 location;

	left = shadow_current(s);

	r = new_block(s->info, &right);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	rn = dm_block_data(right);

	rn->header.flags = ln->header.flags;
	rn->header.nr_entries = cpu_to_le32(0);
	rn->header.max_entries = ln->header.max_entries;
	rn->header.value_size = ln->header.value_size;
	redistribute2(ln, rn);

	/* patch up the parent */
	parent = shadow_parent(s);
	pn = dm_block_data(parent);

	location = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&location);
	r = insert_at(sizeof(__le64), pn, parent_index + 1,
		      le64_to_cpu(rn->keys[0]), &location);
	if (r) {
		unlock_block(s->info, right);
		return r;
	}

	/* patch up the spine: keep only the child that covers @key locked */
	if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else {
		unlock_block(s->info, left);
		s->nodes[1] = right;
	}

	return 0;
}
0692
0693
0694
0695
0696
0697
/*
 * Shadow the child of @parent at @index so it can be modified.  If the
 * shadow operation copied the block, the children's reference counts
 * are bumped.  The parent's value is updated to point at the (possibly
 * new) block location.
 */
static int shadow_child(struct dm_btree_info *info, struct dm_btree_value_type *vt,
			struct btree_node *parent, unsigned index,
			struct dm_block **result)
{
	int r, inc;
	dm_block_t root;
	struct btree_node *node;

	root = value64(parent, index);

	r = dm_tm_shadow_block(info->tm, root, &btree_node_validator,
			       result, &inc);
	if (r)
		return r;

	node = dm_block_data(*result);

	if (inc)
		inc_children(info->tm, node, vt);

	*((__le64 *) value_ptr(parent, index)) =
		cpu_to_le64(dm_block_location(*result));

	return 0;
}
0723
0724
0725
0726
0727
/*
 * Split two adjacent full nodes into three, creating a new middle node
 * and redistributing entries across all three.  On return s->nodes[1]
 * is whichever node now covers @key; the others are unlocked.
 */
static int split_two_into_three(struct shadow_spine *s, unsigned parent_index,
				struct dm_btree_value_type *vt, uint64_t key)
{
	int r;
	unsigned middle_index;
	struct dm_block *left, *middle, *right, *parent;
	struct btree_node *ln, *rn, *mn, *pn;
	__le64 location;

	parent = shadow_parent(s);
	pn = dm_block_data(parent);

	if (parent_index == 0) {
		/* current node is leftmost; its right sibling joins the split */
		middle_index = 1;
		left = shadow_current(s);
		r = shadow_child(s->info, vt, pn, parent_index + 1, &right);
		if (r)
			return r;
	} else {
		/* otherwise pull in the left sibling */
		middle_index = parent_index;
		right = shadow_current(s);
		r = shadow_child(s->info, vt, pn, parent_index - 1, &left);
		if (r)
			return r;
	}

	r = new_block(s->info, &middle);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	mn = dm_block_data(middle);
	rn = dm_block_data(right);

	mn->header.nr_entries = cpu_to_le32(0);
	mn->header.flags = ln->header.flags;
	mn->header.max_entries = ln->header.max_entries;
	mn->header.value_size = ln->header.value_size;

	redistribute3(ln, mn, rn);

	/* patch up the parent: right's first key moved, and middle is new */
	pn->keys[middle_index] = rn->keys[0];
	location = cpu_to_le64(dm_block_location(middle));
	__dm_bless_for_disk(&location);
	r = insert_at(sizeof(__le64), pn, middle_index,
		      le64_to_cpu(mn->keys[0]), &location);
	if (r) {
		if (shadow_current(s) != left)
			unlock_block(s->info, left);

		unlock_block(s->info, middle);

		if (shadow_current(s) != right)
			unlock_block(s->info, right);

		return r;
	}

	/* patch up the spine: keep only the node covering @key locked */
	if (key < le64_to_cpu(mn->keys[0])) {
		unlock_block(s->info, middle);
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, left);
		unlock_block(s->info, right);
		s->nodes[1] = middle;
	} else {
		unlock_block(s->info, left);
		unlock_block(s->info, middle);
		s->nodes[1] = right;
	}

	return 0;
}
0805
0806
0807
0808
0809
0810
0811
0812
0813
0814
0815
0816
0817
0818
0819
0820
0821
0822
0823
0824
0825
0826
0827
0828
/*
 * Split a full root node in place: its entries are copied out to two
 * new children, and the root itself becomes an internal node with just
 * those two children.  This keeps the root's block number stable.
 */
static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *new_parent;
	struct btree_node *pn, *ln, *rn;
	__le64 val;

	new_parent = shadow_current(s);

	pn = dm_block_data(new_parent);
	size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
		sizeof(__le64) : s->info->value_type.size;

	/* create & init the left child */
	r = new_block(s->info, &left);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	nr_left = le32_to_cpu(pn->header.nr_entries) / 2;

	ln->header.flags = pn->header.flags;
	ln->header.nr_entries = cpu_to_le32(nr_left);
	ln->header.max_entries = pn->header.max_entries;
	ln->header.value_size = pn->header.value_size;
	memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
	memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);

	/* create & init the right child */
	r = new_block(s->info, &right);
	if (r < 0) {
		unlock_block(s->info, left);
		return r;
	}

	rn = dm_block_data(right);
	nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;

	rn->header.flags = pn->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = pn->header.max_entries;
	rn->header.value_size = pn->header.value_size;
	memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));
	memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
	       nr_right * size);

	/* new_parent should just point to l and r now */
	pn->header.flags = cpu_to_le32(INTERNAL_NODE);
	pn->header.nr_entries = cpu_to_le32(2);
	pn->header.max_entries = cpu_to_le32(
		calc_max_entries(sizeof(__le64),
				 dm_bm_block_size(
					 dm_tm_get_bm(s->info->tm))));
	pn->header.value_size = cpu_to_le32(sizeof(__le64));

	val = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&val);
	pn->keys[0] = ln->keys[0];
	memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));

	val = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&val);
	pn->keys[1] = rn->keys[0];
	memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));

	unlock_block(s->info, left);
	unlock_block(s->info, right);
	return 0;
}
0900
0901
0902
0903
0904
0905
/*
 * Redistribute entries with the left sibling of the current node, then
 * leave s->nodes[1] holding whichever of the two now covers @key.
 */
static int rebalance_left(struct shadow_spine *s, struct dm_btree_value_type *vt,
			  unsigned parent_index, uint64_t key)
{
	int r;
	struct dm_block *sib;
	struct btree_node *left, *right, *parent = dm_block_data(shadow_parent(s));

	r = shadow_child(s->info, vt, parent, parent_index - 1, &sib);
	if (r)
		return r;

	left = dm_block_data(sib);
	right = dm_block_data(shadow_current(s));
	redistribute2(left, right);
	/* right's first key changed; update the separator in the parent */
	*key_ptr(parent, parent_index) = right->keys[0];

	if (key < le64_to_cpu(right->keys[0])) {
		unlock_block(s->info, s->nodes[1]);
		s->nodes[1] = sib;
	} else {
		unlock_block(s->info, sib);
	}

	return 0;
}
0931
0932
0933
0934
/*
 * Redistribute entries with the right sibling of the current node, then
 * leave s->nodes[1] holding whichever of the two now covers @key.
 */
static int rebalance_right(struct shadow_spine *s, struct dm_btree_value_type *vt,
			   unsigned parent_index, uint64_t key)
{
	int r;
	struct dm_block *sib;
	struct btree_node *left, *right, *parent = dm_block_data(shadow_parent(s));

	r = shadow_child(s->info, vt, parent, parent_index + 1, &sib);
	if (r)
		return r;

	left = dm_block_data(shadow_current(s));
	right = dm_block_data(sib);
	redistribute2(left, right);
	/* the sibling's first key changed; update the separator in the parent */
	*key_ptr(parent, parent_index + 1) = right->keys[0];

	if (key < le64_to_cpu(right->keys[0])) {
		unlock_block(s->info, sib);
	} else {
		unlock_block(s->info, s->nodes[1]);
		s->nodes[1] = sib;
	}

	return 0;
}
0960
0961
0962
0963
0964 static int get_node_free_space(struct dm_btree_info *info, dm_block_t b, unsigned *space)
0965 {
0966 int r;
0967 unsigned nr_entries;
0968 struct dm_block *block;
0969 struct btree_node *node;
0970
0971 r = bn_read_lock(info, b, &block);
0972 if (r)
0973 return r;
0974
0975 node = dm_block_data(block);
0976 nr_entries = le32_to_cpu(node->header.nr_entries);
0977 *space = le32_to_cpu(node->header.max_entries) - nr_entries;
0978
0979 unlock_block(info, block);
0980 return 0;
0981 }
0982
0983
0984
0985
0986
0987
0988
0989
0990
/*
 * Make room around the current (full) node.  Prefer rebalancing with an
 * unshared sibling that has at least SPACE_THRESHOLD free entries;
 * otherwise split (1 -> 2 when at either end of the parent or when
 * siblings are shared, 2 -> 3 otherwise for better occupancy).
 */
#define SPACE_THRESHOLD 8
static int rebalance_or_split(struct shadow_spine *s, struct dm_btree_value_type *vt,
			      unsigned parent_index, uint64_t key)
{
	int r;
	struct btree_node *parent = dm_block_data(shadow_parent(s));
	unsigned nr_parent = le32_to_cpu(parent->header.nr_entries);
	unsigned free_space;
	int left_shared = 0, right_shared = 0;

	/* Should we move entries to the left sibling? */
	if (parent_index > 0) {
		dm_block_t left_b = value64(parent, parent_index - 1);

		r = dm_tm_block_is_shared(s->info->tm, left_b, &left_shared);
		if (r)
			return r;

		if (!left_shared) {
			r = get_node_free_space(s->info, left_b, &free_space);
			if (r)
				return r;

			if (free_space >= SPACE_THRESHOLD)
				return rebalance_left(s, vt, parent_index, key);
		}
	}

	/* Should we move entries to the right sibling? */
	if (parent_index < (nr_parent - 1)) {
		dm_block_t right_b = value64(parent, parent_index + 1);

		r = dm_tm_block_is_shared(s->info->tm, right_b, &right_shared);
		if (r)
			return r;

		if (!right_shared) {
			r = get_node_free_space(s->info, right_b, &free_space);
			if (r)
				return r;

			if (free_space >= SPACE_THRESHOLD)
				return rebalance_right(s, vt, parent_index, key);
		}
	}

	/*
	 * We need to split.  A 2 -> 3 split needs an unshared sibling on
	 * the appropriate side; fall back to 1 -> 2 otherwise.
	 */
	if (left_shared || right_shared || (nr_parent <= 2) ||
	    (parent_index == 0) || (parent_index + 1 == nr_parent)) {
		return split_one_into_two(s, parent_index, vt, key);
	} else {
		return split_two_into_three(s, parent_index, vt, key);
	}
}
1048
1049
1050
1051
1052 static bool contains_key(struct btree_node *node, uint64_t key)
1053 {
1054 int i = lower_bound(node, key);
1055
1056 if (i >= 0 && le64_to_cpu(node->keys[i]) == key)
1057 return true;
1058
1059 return false;
1060 }
1061
1062
1063
1064
1065
1066
/*
 * Can @node accept an insert of @key without splitting?  A full leaf
 * can still take an overwrite of a key it already contains.
 */
static bool has_space_for_insert(struct btree_node *node, uint64_t key)
{
	/* raw __le32 comparison is fine here: equality is endian-agnostic */
	if (node->header.nr_entries == node->header.max_entries) {
		if (le32_to_cpu(node->header.flags) & LEAF_NODE) {
			/* an overwrite doesn't need extra space */
			return contains_key(node, key);
		}

		return false;
	}

	return true;
}
1080
/*
 * Shadow a path from @root down to the leaf where @key belongs,
 * splitting or rebalancing full nodes on the way.  On return
 * shadow_current(s) is the leaf and *index is the slot at which @key
 * should be inserted (possibly == nr_entries, i.e. append), or the
 * slot of an existing matching key.
 */
static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
			    struct dm_btree_value_type *vt,
			    uint64_t key, unsigned *index)
{
	int r, i = *index, top = 1;
	struct btree_node *node;

	for (;;) {
		r = shadow_step(s, root, vt);
		if (r < 0)
			return r;

		node = dm_block_data(shadow_current(s));

		/*
		 * Shadowing may have relocated the child, so rewrite the
		 * parent's pointer to it.
		 */
		if (shadow_has_parent(s) && i >= 0) {
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

			__dm_bless_for_disk(&location);
			memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
				    &location, sizeof(__le64));
		}

		node = dm_block_data(shadow_current(s));

		if (!has_space_for_insert(node, key)) {
			if (top)
				r = btree_split_beneath(s, key);
			else
				r = rebalance_or_split(s, vt, i, key);

			if (r < 0)
				return r;

			/* making space may have moved us to a sibling node */
			node = dm_block_data(shadow_current(s));
		}

		i = lower_bound(node, key);

		if (le32_to_cpu(node->header.flags) & LEAF_NODE)
			break;

		if (i < 0) {
			/* @key is below all keys: widen the lowest key downwards */
			node->keys[0] = cpu_to_le64(key);
			i = 0;
		}

		root = value64(node, i);
		top = 0;
	}

	/* no exact match at the leaf: insert after the lower bound */
	if (i < 0 || le64_to_cpu(node->keys[i]) != key)
		i++;

	*index = i;
	return 0;
}
1144
/*
 * Shadow a path down to the leaf containing @key, for overwriting an
 * existing entry.  Returns -EINVAL when @key is not present.  Unlike
 * btree_insert_raw() no splitting occurs, since an overwrite never
 * grows a node.
 */
static int __btree_get_overwrite_leaf(struct shadow_spine *s, dm_block_t root,
				      uint64_t key, int *index)
{
	int r, i = -1;
	struct btree_node *node;

	*index = 0;
	for (;;) {
		r = shadow_step(s, root, &s->info->value_type);
		if (r < 0)
			return r;

		node = dm_block_data(shadow_current(s));

		/*
		 * Shadowing may have relocated the child, so rewrite the
		 * parent's pointer to it.
		 */
		if (shadow_has_parent(s) && i >= 0) {
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

			__dm_bless_for_disk(&location);
			memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
				    &location, sizeof(__le64));
		}

		node = dm_block_data(shadow_current(s));
		i = lower_bound(node, key);

		/* overwriting requires the key's path to already exist */
		BUG_ON(i < 0);
		BUG_ON(i >= le32_to_cpu(node->header.nr_entries));

		if (le32_to_cpu(node->header.flags) & LEAF_NODE) {
			if (key != le64_to_cpu(node->keys[i]))
				return -EINVAL;
			break;
		}

		root = value64(node, i);
	}

	*index = i;
	return 0;
}
1190
/*
 * Fetch (write-locked) the leaf that holds @key so the caller can
 * overwrite the value in place.  *new_root receives the new tree root
 * (shadowing may have changed it); *leaf stays locked and must be
 * unlocked by the caller.  Single-level trees only.
 */
int btree_get_overwrite_leaf(struct dm_btree_info *info, dm_block_t root,
			     uint64_t key, int *index,
			     dm_block_t *new_root, struct dm_block **leaf)
{
	int r;
	struct shadow_spine spine;

	BUG_ON(info->levels > 1);
	init_shadow_spine(&spine, info);
	r = __btree_get_overwrite_leaf(&spine, root, key, index);
	if (!r) {
		*new_root = shadow_root(&spine);
		*leaf = shadow_current(&spine);

		/*
		 * Decrement the count so exit_shadow_spine() doesn't
		 * unlock the leaf.
		 */
		spine.count--;
	}
	exit_shadow_spine(&spine);

	return r;
}
1215
1216 static bool need_insert(struct btree_node *node, uint64_t *keys,
1217 unsigned level, unsigned index)
1218 {
1219 return ((index >= le32_to_cpu(node->header.nr_entries)) ||
1220 (le64_to_cpu(node->keys[index]) != keys[level]));
1221 }
1222
/*
 * Insert @value at keys[] in a (possibly multi-level) btree, creating
 * empty subtrees for missing inner-level keys.  *new_root receives the
 * new root and, if @inserted is non-NULL, it is set to 1 for a fresh
 * insert or 0 for an overwrite.  The disk blessing on @value is
 * consumed on success and on every error path.
 */
static int insert(struct dm_btree_info *info, dm_block_t root,
		  uint64_t *keys, void *value, dm_block_t *new_root,
		  int *inserted)
	__dm_written_to_disk(value)
{
	int r;
	unsigned level, index = -1, last_level = info->levels - 1;
	dm_block_t block = root;
	struct shadow_spine spine;
	struct btree_node *n;
	struct dm_btree_value_type le64_type;

	init_le64_type(info->tm, &le64_type);
	init_shadow_spine(&spine, info);

	/* inner levels: values are __le64 roots of the next level's tree */
	for (level = 0; level < (info->levels - 1); level++) {
		r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
		if (r < 0)
			goto bad;

		n = dm_block_data(shadow_current(&spine));

		if (need_insert(n, keys, level, index)) {
			/* no subtree for this key yet; create an empty one */
			dm_block_t new_tree;
			__le64 new_le;

			r = dm_btree_empty(info, &new_tree);
			if (r < 0)
				goto bad;

			new_le = cpu_to_le64(new_tree);
			__dm_bless_for_disk(&new_le);

			r = insert_at(sizeof(uint64_t), n, index,
				      keys[level], &new_le);
			if (r)
				goto bad;
		}

		if (level < last_level)
			block = value64(n, index);
	}

	/* bottom level holds the caller's value */
	r = btree_insert_raw(&spine, block, &info->value_type,
			     keys[level], &index);
	if (r < 0)
		goto bad;

	n = dm_block_data(shadow_current(&spine));

	if (need_insert(n, keys, level, index)) {
		if (inserted)
			*inserted = 1;

		r = insert_at(info->value_type.size, n, index,
			      keys[level], value);
		if (r)
			/* insert_at already unblessed @value */
			goto bad_unblessed;
	} else {
		if (inserted)
			*inserted = 0;

		/* overwrite: release the old value unless it equals the new one */
		if (info->value_type.dec &&
		    (!info->value_type.equal ||
		     !info->value_type.equal(
			     info->value_type.context,
			     value_ptr(n, index),
			     value))) {
			info->value_type.dec(info->value_type.context,
					     value_ptr(n, index), 1);
		}
		memcpy_disk(value_ptr(n, index),
			    value, info->value_type.size);
	}

	*new_root = shadow_root(&spine);
	exit_shadow_spine(&spine);

	return 0;

bad:
	__dm_unbless_for_disk(value);
bad_unblessed:
	exit_shadow_spine(&spine);
	return r;
}
1309
/* Insert @value at keys[]; caller doesn't care whether it was an overwrite. */
int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value, dm_block_t *new_root)
	__dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, NULL);
}
EXPORT_SYMBOL_GPL(dm_btree_insert);
1317
/* As dm_btree_insert(), but reports via *inserted whether a new entry was made. */
int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
			   uint64_t *keys, void *value, dm_block_t *new_root,
			   int *inserted)
	__dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, inserted);
}
EXPORT_SYMBOL_GPL(dm_btree_insert_notify);
1326
1327
1328
/*
 * Walk one btree level to its highest (or lowest) key.  *result_key is
 * set from the extreme leaf entry; if @next_block is non-NULL it
 * receives the corresponding leaf value, interpreted as the root of
 * the next level's tree.  Returns -ENODATA on an empty node.
 */
static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest,
		    uint64_t *result_key, dm_block_t *next_block)
{
	int i, r;
	uint32_t flags;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		flags = le32_to_cpu(ro_node(s)->header.flags);
		i = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (!i)
			return -ENODATA;
		else
			i--;	/* index of the last entry */

		if (find_highest)
			*result_key = le64_to_cpu(ro_node(s)->keys[i]);
		else
			*result_key = le64_to_cpu(ro_node(s)->keys[0]);

		/* descend, or pick up the leaf value for the caller */
		if (next_block || flags & INTERNAL_NODE) {
			if (find_highest)
				block = value64(ro_node(s), i);
			else
				block = value64(ro_node(s), 0);
		}

	} while (flags & INTERNAL_NODE);

	if (next_block)
		*next_block = block;
	return 0;
}
1365
1366 static int dm_btree_find_key(struct dm_btree_info *info, dm_block_t root,
1367 bool find_highest, uint64_t *result_keys)
1368 {
1369 int r = 0, count = 0, level;
1370 struct ro_spine spine;
1371
1372 init_ro_spine(&spine, info);
1373 for (level = 0; level < info->levels; level++) {
1374 r = find_key(&spine, root, find_highest, result_keys + level,
1375 level == info->levels - 1 ? NULL : &root);
1376 if (r == -ENODATA) {
1377 r = 0;
1378 break;
1379
1380 } else if (r)
1381 break;
1382
1383 count++;
1384 }
1385 exit_ro_spine(&spine);
1386
1387 return r ? r : count;
1388 }
1389
/*
 * Fill result_keys[0..levels-1] with the highest key present at each level.
 * Returns the number of levels filled, or a negative errno.
 */
int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
			      uint64_t *result_keys)
{
	return dm_btree_find_key(info, root, true, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);
1396
/*
 * Fill result_keys[0..levels-1] with the lowest key present at each level.
 * Returns the number of levels filled, or a negative errno.
 */
int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root,
			     uint64_t *result_keys)
{
	return dm_btree_find_key(info, root, false, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key);
1403
1404
1405
1406
1407
1408
1409
1410 static int walk_node(struct dm_btree_info *info, dm_block_t block,
1411 int (*fn)(void *context, uint64_t *keys, void *leaf),
1412 void *context)
1413 {
1414 int r;
1415 unsigned i, nr;
1416 struct dm_block *node;
1417 struct btree_node *n;
1418 uint64_t keys;
1419
1420 r = bn_read_lock(info, block, &node);
1421 if (r)
1422 return r;
1423
1424 n = dm_block_data(node);
1425
1426 nr = le32_to_cpu(n->header.nr_entries);
1427 for (i = 0; i < nr; i++) {
1428 if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) {
1429 r = walk_node(info, value64(n, i), fn, context);
1430 if (r)
1431 goto out;
1432 } else {
1433 keys = le64_to_cpu(*key_ptr(n, i));
1434 r = fn(context, &keys, value_ptr(n, i));
1435 if (r)
1436 goto out;
1437 }
1438 }
1439
1440 out:
1441 dm_tm_unlock(info->tm, node);
1442 return r;
1443 }
1444
/*
 * Visit every leaf entry of the tree in key order via @fn.  Only supports
 * single level btrees (enforced by the BUG_ON below).
 */
int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
		  int (*fn)(void *context, uint64_t *keys, void *leaf),
		  void *context)
{
	BUG_ON(info->levels > 1);
	return walk_node(info, root, fn, context);
}
EXPORT_SYMBOL_GPL(dm_btree_walk);
1453
1454
1455
1456 static void prefetch_values(struct dm_btree_cursor *c)
1457 {
1458 unsigned i, nr;
1459 __le64 value_le;
1460 struct cursor_node *n = c->nodes + c->depth - 1;
1461 struct btree_node *bn = dm_block_data(n->b);
1462 struct dm_block_manager *bm = dm_tm_get_bm(c->info->tm);
1463
1464 BUG_ON(c->info->value_type.size != sizeof(value_le));
1465
1466 nr = le32_to_cpu(bn->header.nr_entries);
1467 for (i = 0; i < nr; i++) {
1468 memcpy(&value_le, value_ptr(bn, i), sizeof(value_le));
1469 dm_bm_prefetch(bm, le64_to_cpu(value_le));
1470 }
1471 }
1472
1473 static bool leaf_node(struct dm_btree_cursor *c)
1474 {
1475 struct cursor_node *n = c->nodes + c->depth - 1;
1476 struct btree_node *bn = dm_block_data(n->b);
1477
1478 return le32_to_cpu(bn->header.flags) & LEAF_NODE;
1479 }
1480
1481 static int push_node(struct dm_btree_cursor *c, dm_block_t b)
1482 {
1483 int r;
1484 struct cursor_node *n = c->nodes + c->depth;
1485
1486 if (c->depth >= DM_BTREE_CURSOR_MAX_DEPTH - 1) {
1487 DMERR("couldn't push cursor node, stack depth too high");
1488 return -EINVAL;
1489 }
1490
1491 r = bn_read_lock(c->info, b, &n->b);
1492 if (r)
1493 return r;
1494
1495 n->index = 0;
1496 c->depth++;
1497
1498 if (c->prefetch_leaves || !leaf_node(c))
1499 prefetch_values(c);
1500
1501 return 0;
1502 }
1503
1504 static void pop_node(struct dm_btree_cursor *c)
1505 {
1506 c->depth--;
1507 unlock_block(c->info, c->nodes[c->depth].b);
1508 }
1509
1510 static int inc_or_backtrack(struct dm_btree_cursor *c)
1511 {
1512 struct cursor_node *n;
1513 struct btree_node *bn;
1514
1515 for (;;) {
1516 if (!c->depth)
1517 return -ENODATA;
1518
1519 n = c->nodes + c->depth - 1;
1520 bn = dm_block_data(n->b);
1521
1522 n->index++;
1523 if (n->index < le32_to_cpu(bn->header.nr_entries))
1524 break;
1525
1526 pop_node(c);
1527 }
1528
1529 return 0;
1530 }
1531
1532 static int find_leaf(struct dm_btree_cursor *c)
1533 {
1534 int r = 0;
1535 struct cursor_node *n;
1536 struct btree_node *bn;
1537 __le64 value_le;
1538
1539 for (;;) {
1540 n = c->nodes + c->depth - 1;
1541 bn = dm_block_data(n->b);
1542
1543 if (le32_to_cpu(bn->header.flags) & LEAF_NODE)
1544 break;
1545
1546 memcpy(&value_le, value_ptr(bn, n->index), sizeof(value_le));
1547 r = push_node(c, le64_to_cpu(value_le));
1548 if (r) {
1549 DMERR("push_node failed");
1550 break;
1551 }
1552 }
1553
1554 if (!r && (le32_to_cpu(bn->header.nr_entries) == 0))
1555 return -ENODATA;
1556
1557 return r;
1558 }
1559
/*
 * Initialise @c over the tree at @root and position it on the first leaf
 * entry.  @prefetch_leaves additionally prefetches the blocks referenced by
 * leaf values (only sensible when values are block addresses).
 * Returns -ENODATA for an empty tree.
 */
int dm_btree_cursor_begin(struct dm_btree_info *info, dm_block_t root,
			  bool prefetch_leaves, struct dm_btree_cursor *c)
{
	int r;

	/* the cursor fields must be set before the first push_node() */
	c->info = info;
	c->root = root;
	c->depth = 0;
	c->prefetch_leaves = prefetch_leaves;

	r = push_node(c, root);
	if (r)
		return r;

	/* descend to the leftmost leaf */
	return find_leaf(c);
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_begin);
1577
1578 void dm_btree_cursor_end(struct dm_btree_cursor *c)
1579 {
1580 while (c->depth)
1581 pop_node(c);
1582 }
1583 EXPORT_SYMBOL_GPL(dm_btree_cursor_end);
1584
/*
 * Move the cursor to the next leaf entry.  Returns -ENODATA once the tree
 * is exhausted.
 */
int dm_btree_cursor_next(struct dm_btree_cursor *c)
{
	int r;

	r = inc_or_backtrack(c);
	if (r)
		return r;

	r = find_leaf(c);
	if (r)
		DMERR("find_leaf failed");

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_next);
1597
1598 int dm_btree_cursor_skip(struct dm_btree_cursor *c, uint32_t count)
1599 {
1600 int r = 0;
1601
1602 while (count-- && !r)
1603 r = dm_btree_cursor_next(c);
1604
1605 return r;
1606 }
1607 EXPORT_SYMBOL_GPL(dm_btree_cursor_skip);
1608
1609 int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le)
1610 {
1611 if (c->depth) {
1612 struct cursor_node *n = c->nodes + c->depth - 1;
1613 struct btree_node *bn = dm_block_data(n->b);
1614
1615 if (le32_to_cpu(bn->header.flags) & INTERNAL_NODE)
1616 return -EINVAL;
1617
1618 *key = le64_to_cpu(*key_ptr(bn, n->index));
1619 memcpy(value_le, value_ptr(bn, n->index), c->info->value_type.size);
1620 return 0;
1621
1622 } else
1623 return -ENODATA;
1624 }
1625 EXPORT_SYMBOL_GPL(dm_btree_cursor_get_value);