0001 /*
0002  * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
0003  */
0004 
0005 #include <linux/time.h>
0006 #include <linux/slab.h>
0007 #include <linux/string.h>
0008 #include "reiserfs.h"
0009 #include <linux/buffer_head.h>
0010 
0011 /*
0012  * To make any change in the tree we find a node that contains the item
0013  * to be changed/deleted, or the position in the node where a new item
0014  * will be inserted. We call this node S. To do balancing we need to
0015  * decide what we will shift to the left/right neighbor, or to a new
0016  * node where the new item will go, etc. To make this analysis simpler
0017  * we build a virtual node. The virtual node is an array of items that
0018  * will replace the items of node S. (For instance, if we are going to
0019  * delete an item, the virtual node does not contain it.) The virtual
0020  * node keeps information about item sizes and types, mergeability of
0021  * the first and last items, and sizes of all entries in a directory
0022  * item. We use this array of items when calculating what we can shift
0023  * to the neighbors and how many nodes we need if we do no shifting,
0024  * shift to the left/right neighbor, or shift to both.
0025  */
0026 
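/*
 * For example, if leaf S holds three items [A, B, C]: an M_INSERT at
 * position 1 gives a virtual node of four items [A, new, B, C]; an
 * M_DELETE of B gives two items [A, C]; M_PASTE and M_CUT keep the
 * item count but adjust the length of the affected item by
 * insert_size.  All shift and split decisions below are made against
 * this virtual array.
 */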
0027 /*
0028  * Takes an item number in the virtual node and returns the number
0029  * that item has in the source buffer
0030  */
0031 static inline int old_item_num(int new_num, int affected_item_num, int mode)
0032 {
0033     if (mode == M_PASTE || mode == M_CUT || new_num < affected_item_num)
0034         return new_num;
0035 
0036     if (mode == M_INSERT) {
0037 
0038         RFALSE(new_num == 0,
0039                "vs-8005: for INSERT mode and item number of inserted item");
0040 
0041         return new_num - 1;
0042     }
0043 
0044     RFALSE(mode != M_DELETE,
0045            "vs-8010: old_item_num: mode must be M_DELETE (mode = \'%c\'",
0046            mode);
0047     /* delete mode */
0048     return new_num + 1;
0049 }
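/*
 * For example, with M_INSERT and affected_item_num == 2, virtual items
 * 0 and 1 keep their source numbers, virtual item 2 is the new item
 * (create_virtual_node() never asks for it), and virtual item 3 maps
 * back to source item 2.  With M_DELETE at position 2, virtual item 2
 * maps to source item 3, because source item 2 is gone.
 */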
0050 
0051 static void create_virtual_node(struct tree_balance *tb, int h)
0052 {
0053     struct item_head *ih;
0054     struct virtual_node *vn = tb->tb_vn;
0055     int new_num;
0056     struct buffer_head *Sh; /* this comes from tb->S[h] */
0057 
0058     Sh = PATH_H_PBUFFER(tb->tb_path, h);
0059 
0060     /* size of changed node */
0061     vn->vn_size =
0062         MAX_CHILD_SIZE(Sh) - B_FREE_SPACE(Sh) + tb->insert_size[h];
0063 
0064     /* for internal nodes the array of virtual items is not created */
0065     if (h) {
0066         vn->vn_nr_item = (vn->vn_size - DC_SIZE) / (DC_SIZE + KEY_SIZE);
0067         return;
0068     }
0069 
0070     /* number of items in virtual node  */
0071     vn->vn_nr_item =
0072         B_NR_ITEMS(Sh) + ((vn->vn_mode == M_INSERT) ? 1 : 0) -
0073         ((vn->vn_mode == M_DELETE) ? 1 : 0);
0074 
0075     /* first virtual item */
0076     vn->vn_vi = (struct virtual_item *)(tb->tb_vn + 1);
0077     memset(vn->vn_vi, 0, vn->vn_nr_item * sizeof(struct virtual_item));
0078     vn->vn_free_ptr += vn->vn_nr_item * sizeof(struct virtual_item);
0079 
0080     /* first item in the node */
0081     ih = item_head(Sh, 0);
0082 
0083     /* define the mergeability for 0-th item (if it is not being deleted) */
0084     if (op_is_left_mergeable(&ih->ih_key, Sh->b_size)
0085         && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num))
0086         vn->vn_vi[0].vi_type |= VI_TYPE_LEFT_MERGEABLE;
0087 
0088     /*
0089      * go through all items that remain in the virtual
0090      * node (except for the new (inserted) one)
0091      */
0092     for (new_num = 0; new_num < vn->vn_nr_item; new_num++) {
0093         int j;
0094         struct virtual_item *vi = vn->vn_vi + new_num;
0095         int is_affected =
0096             ((new_num != vn->vn_affected_item_num) ? 0 : 1);
0097 
0098         if (is_affected && vn->vn_mode == M_INSERT)
0099             continue;
0100 
0101         /* get item number in source node */
0102         j = old_item_num(new_num, vn->vn_affected_item_num,
0103                  vn->vn_mode);
0104 
0105         vi->vi_item_len += ih_item_len(ih + j) + IH_SIZE;
0106         vi->vi_ih = ih + j;
0107         vi->vi_item = ih_item_body(Sh, ih + j);
0108         vi->vi_uarea = vn->vn_free_ptr;
0109 
0110         /*
0111          * FIXME: there is no check that item operation did not
0112          * consume too much memory
0113          */
0114         vn->vn_free_ptr +=
0115             op_create_vi(vn, vi, is_affected, tb->insert_size[0]);
0116         if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr)
0117             reiserfs_panic(tb->tb_sb, "vs-8030",
0118                        "virtual node space consumed");
0119 
0120         if (!is_affected)
0121             /* this is not being changed */
0122             continue;
0123 
0124         if (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT) {
0125             vn->vn_vi[new_num].vi_item_len += tb->insert_size[0];
0126             /* pointer to data which is going to be pasted */
0127             vi->vi_new_data = vn->vn_data;
0128         }
0129     }
0130 
0131     /* virtual inserted item is not defined yet */
0132     if (vn->vn_mode == M_INSERT) {
0133         struct virtual_item *vi = vn->vn_vi + vn->vn_affected_item_num;
0134 
0135         RFALSE(vn->vn_ins_ih == NULL,
0136                "vs-8040: item header of inserted item is not specified");
0137         vi->vi_item_len = tb->insert_size[0];
0138         vi->vi_ih = vn->vn_ins_ih;
0139         vi->vi_item = vn->vn_data;
0140         vi->vi_uarea = vn->vn_free_ptr;
0141 
0142         op_create_vi(vn, vi, 0 /*not pasted or cut */ ,
0143                  tb->insert_size[0]);
0144     }
0145 
0146     /*
0147      * to set the right merge flag we take the right delimiting key
0148      * and check whether it is a mergeable item
0149      */
0150     if (tb->CFR[0]) {
0151         struct reiserfs_key *key;
0152 
0153         key = internal_key(tb->CFR[0], tb->rkey[0]);
0154         if (op_is_left_mergeable(key, Sh->b_size)
0155             && (vn->vn_mode != M_DELETE
0156             || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1))
0157             vn->vn_vi[vn->vn_nr_item - 1].vi_type |=
0158                 VI_TYPE_RIGHT_MERGEABLE;
0159 
0160 #ifdef CONFIG_REISERFS_CHECK
0161         if (op_is_left_mergeable(key, Sh->b_size) &&
0162             !(vn->vn_mode != M_DELETE
0163               || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1)) {
0164             /*
0165              * we delete last item and it could be merged
0166              * with right neighbor's first item
0167              */
0168             if (!
0169                 (B_NR_ITEMS(Sh) == 1
0170                  && is_direntry_le_ih(item_head(Sh, 0))
0171                  && ih_entry_count(item_head(Sh, 0)) == 1)) {
0172                 /*
0173                  * node contains more than 1 item, or item
0174                  * is not directory item, or this item
0175                  * contains more than 1 entry
0176                  */
0177                 print_block(Sh, 0, -1, -1);
0178                 reiserfs_panic(tb->tb_sb, "vs-8045",
0179                            "rdkey %k, affected item==%d "
0180                            "(mode==%c) Must be %c",
0181                            key, vn->vn_affected_item_num,
0182                            vn->vn_mode, M_DELETE);
0183             }
0184         }
0185 #endif
0186 
0187     }
0188 }
0189 
0190 /*
0191  * Using the virtual node, check how many items can be
0192  * shifted to the left neighbor
0193  */
0194 static void check_left(struct tree_balance *tb, int h, int cur_free)
0195 {
0196     int i;
0197     struct virtual_node *vn = tb->tb_vn;
0198     struct virtual_item *vi;
0199     int d_size, ih_size;
0200 
0201     RFALSE(cur_free < 0, "vs-8050: cur_free (%d) < 0", cur_free);
0202 
0203     /* internal level */
0204     if (h > 0) {
0205         tb->lnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
0206         return;
0207     }
0208 
0209     /* leaf level */
0210 
0211     if (!cur_free || !vn->vn_nr_item) {
0212         /* no free space or nothing to move */
0213         tb->lnum[h] = 0;
0214         tb->lbytes = -1;
0215         return;
0216     }
0217 
0218     RFALSE(!PATH_H_PPARENT(tb->tb_path, 0),
0219            "vs-8055: parent does not exist or invalid");
0220 
0221     vi = vn->vn_vi;
0222     if ((unsigned int)cur_free >=
0223         (vn->vn_size -
0224          ((vi->vi_type & VI_TYPE_LEFT_MERGEABLE) ? IH_SIZE : 0))) {
0225         /* all contents of S[0] fits into L[0] */
0226 
0227         RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
0228                "vs-8055: invalid mode or balance condition failed");
0229 
0230         tb->lnum[0] = vn->vn_nr_item;
0231         tb->lbytes = -1;
0232         return;
0233     }
0234 
0235     d_size = 0, ih_size = IH_SIZE;
0236 
0237     /* first item may be merged with the last item in the left neighbor */
0238     if (vi->vi_type & VI_TYPE_LEFT_MERGEABLE)
0239         d_size = -((int)IH_SIZE), ih_size = 0;
0240 
0241     tb->lnum[0] = 0;
0242     for (i = 0; i < vn->vn_nr_item;
0243          i++, ih_size = IH_SIZE, d_size = 0, vi++) {
0244         d_size += vi->vi_item_len;
0245         if (cur_free >= d_size) {
0246             /* the item can be shifted entirely */
0247             cur_free -= d_size;
0248             tb->lnum[0]++;
0249             continue;
0250         }
0251 
0252         /* the item cannot be shifted entirely, try to split it */
0253         /*
0254          * check whether L[0] can hold ih and at least one byte
0255          * of the item body
0256          */
0257 
0258         /* cannot shift even a part of the current item */
0259         if (cur_free <= ih_size) {
0260             tb->lbytes = -1;
0261             return;
0262         }
0263         cur_free -= ih_size;
0264 
0265         tb->lbytes = op_check_left(vi, cur_free, 0, 0);
0266         if (tb->lbytes != -1)
0267             /* count partially shifted item */
0268             tb->lnum[0]++;
0269 
0270         break;
0271     }
0272 
0273     return;
0274 }
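/*
 * On return, tb->lnum[0] is the number of items that fit into L[0]
 * (counting a partially shifted item, if any) and tb->lbytes is the
 * number of units of that split item which fit, or -1 if no item is
 * split.  check_right() below fills tb->rnum[0] and tb->rbytes the
 * same way for the right neighbor.
 */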
0275 
0276 /*
0277  * Using the virtual node, check how many items can be
0278  * shifted to the right neighbor
0279  */
0280 static void check_right(struct tree_balance *tb, int h, int cur_free)
0281 {
0282     int i;
0283     struct virtual_node *vn = tb->tb_vn;
0284     struct virtual_item *vi;
0285     int d_size, ih_size;
0286 
0287     RFALSE(cur_free < 0, "vs-8070: cur_free < 0");
0288 
0289     /* internal level */
0290     if (h > 0) {
0291         tb->rnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
0292         return;
0293     }
0294 
0295     /* leaf level */
0296 
0297     if (!cur_free || !vn->vn_nr_item) {
0298         /* no free space or nothing to move */
0299         tb->rnum[h] = 0;
0300         tb->rbytes = -1;
0301         return;
0302     }
0303 
0304     RFALSE(!PATH_H_PPARENT(tb->tb_path, 0),
0305            "vs-8075: parent does not exist or invalid");
0306 
0307     vi = vn->vn_vi + vn->vn_nr_item - 1;
0308     if ((unsigned int)cur_free >=
0309         (vn->vn_size -
0310          ((vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) ? IH_SIZE : 0))) {
0311         /* all contents of S[0] fits into R[0] */
0312 
0313         RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
0314                "vs-8080: invalid mode or balance condition failed");
0315 
0316         tb->rnum[h] = vn->vn_nr_item;
0317         tb->rbytes = -1;
0318         return;
0319     }
0320 
0321     d_size = 0, ih_size = IH_SIZE;
0322 
0323     /* last item may be merged with the first item in the right neighbor */
0324     if (vi->vi_type & VI_TYPE_RIGHT_MERGEABLE)
0325         d_size = -(int)IH_SIZE, ih_size = 0;
0326 
0327     tb->rnum[0] = 0;
0328     for (i = vn->vn_nr_item - 1; i >= 0;
0329          i--, d_size = 0, ih_size = IH_SIZE, vi--) {
0330         d_size += vi->vi_item_len;
0331         if (cur_free >= d_size) {
0332             /* the item can be shifted entirely */
0333             cur_free -= d_size;
0334             tb->rnum[0]++;
0335             continue;
0336         }
0337 
0338         /*
0339          * check whether R[0] can hold ih and at least one
0340          * byte of the item body
0341          */
0342 
0343         /* cannot shift even a part of the current item */
0344         if (cur_free <= ih_size) {
0345             tb->rbytes = -1;
0346             return;
0347         }
0348 
0349         /*
0350          * R[0] can hold the header of the item and at least
0351          * one byte of its body
0352          */
0353         cur_free -= ih_size;    /* cur_free is still > 0 */
0354 
0355         tb->rbytes = op_check_right(vi, cur_free);
0356         if (tb->rbytes != -1)
0357             /* count partially shifted item */
0358             tb->rnum[0]++;
0359 
0360         break;
0361     }
0362 
0363     return;
0364 }
0365 
0366 /*
0367  * from - number of items, which are shifted to left neighbor entirely
0368  * to - number of item, which are shifted to right neighbor entirely
0369  * from_bytes - number of bytes of boundary item (or directory entries)
0370  *              which are shifted to left neighbor
0371  * to_bytes - number of bytes of boundary item (or directory entries)
0372  *            which are shifted to right neighbor
0373  */
0374 static int get_num_ver(int mode, struct tree_balance *tb, int h,
0375                int from, int from_bytes,
0376                int to, int to_bytes, short *snum012, int flow)
0377 {
0378     int i;
0379     int units;
0380     struct virtual_node *vn = tb->tb_vn;
0381     int total_node_size, max_node_size, current_item_size;
0382     int needed_nodes;
0383 
0384     /* position of item we start filling node from */
0385     int start_item;
0386 
0387     /* position of item we finish filling node by */
0388     int end_item;
0389 
0390     /*
0391      * number of first bytes (entries for directory) of start_item-th item
0392      * we do not include into node that is being filled
0393      */
0394     int start_bytes;
0395 
0396     /*
0397      * number of last bytes (entries for directory) of end_item-th item
0398      * we do not include into node that is being filled
0399      */
0400     int end_bytes;
0401 
0402     /*
0403      * these are positions in the virtual node of the items that are
0404      * split between S[0] and S1new, and between S1new and S2new
0405      */
0406     int split_item_positions[2];
0407 
0408     split_item_positions[0] = -1;
0409     split_item_positions[1] = -1;
0410 
0411     /*
0412      * We only create additional nodes if we are in insert or paste mode
0413      * or we are in replace mode at the internal level. If h is 0 and
0414      * the mode is M_REPLACE then in fix_nodes we change the mode to
0415      * paste or insert before we get here in the code.
0416      */
0417     RFALSE(tb->insert_size[h] < 0 || (mode != M_INSERT && mode != M_PASTE),
0418            "vs-8100: insert_size < 0 in overflow");
0419 
0420     max_node_size = MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, h));
0421 
0422     /*
0423      * snum012 [0-2] - number of items that go
0424      * to S[0], the first new node and the second new node
0425      */
0426     snum012[3] = -1;    /* s1bytes */
0427     snum012[4] = -1;    /* s2bytes */
0428 
0429     /* internal level */
0430     if (h > 0) {
0431         i = ((to - from) * (KEY_SIZE + DC_SIZE) + DC_SIZE);
0432         if (i == max_node_size)
0433             return 1;
0434         return (i / max_node_size + 1);
0435     }
0436 
0437     /* leaf level */
0438     needed_nodes = 1;
0439     total_node_size = 0;
0440 
0441     /* start from 'from'-th item */
0442     start_item = from;
0443     /* skip its first 'start_bytes' units */
0444     start_bytes = ((from_bytes != -1) ? from_bytes : 0);
0445 
0446     /* last included item is the 'end_item'-th one */
0447     end_item = vn->vn_nr_item - to - 1;
0448     /* do not count last 'end_bytes' units of 'end_item'-th item */
0449     end_bytes = (to_bytes != -1) ? to_bytes : 0;
0450 
0451     /*
0452      * go through all items beginning from the start_item-th item
0453      * and ending with the end_item-th item. Do not count the first
0454      * 'start_bytes' units of the 'start_item'-th item and the last
0455      * 'end_bytes' of 'end_item'-th item
0456      */
0457     for (i = start_item; i <= end_item; i++) {
0458         struct virtual_item *vi = vn->vn_vi + i;
0459         int skip_from_end = ((i == end_item) ? end_bytes : 0);
0460 
0461         RFALSE(needed_nodes > 3, "vs-8105: too many nodes are needed");
0462 
0463         /* get size of current item */
0464         current_item_size = vi->vi_item_len;
0465 
0466         /*
0467          * do not take in calculation head part (from_bytes)
0468          * of from-th item
0469          */
0470         current_item_size -=
0471             op_part_size(vi, 0 /*from start */ , start_bytes);
0472 
0473         /* do not take in calculation tail part of last item */
0474         current_item_size -=
0475             op_part_size(vi, 1 /*from end */ , skip_from_end);
0476 
0477         /* if item fits into current node entirely */
0478         if (total_node_size + current_item_size <= max_node_size) {
0479             snum012[needed_nodes - 1]++;
0480             total_node_size += current_item_size;
0481             start_bytes = 0;
0482             continue;
0483         }
0484 
0485         /*
0486          * virtual item length is longer than the max size of an item in
0487          * a node. That is impossible for a direct item
0488          */
0489         if (current_item_size > max_node_size) {
0490             RFALSE(is_direct_le_ih(vi->vi_ih),
0491                    "vs-8110: "
0492                    "direct item length is %d. It can not be longer than %d",
0493                    current_item_size, max_node_size);
0494             /* we will try to split it */
0495             flow = 1;
0496         }
0497 
0498         /* as we do not split items, take new node and continue */
0499         if (!flow) {
0500             needed_nodes++;
0501             i--;
0502             total_node_size = 0;
0503             continue;
0504         }
0505 
0506         /*
0507          * calculate number of item units which fit into node being
0508          * filled
0509          */
0510         {
0511             int free_space;
0512 
0513             free_space = max_node_size - total_node_size - IH_SIZE;
0514             units =
0515                 op_check_left(vi, free_space, start_bytes,
0516                       skip_from_end);
0517             /*
0518              * nothing fits into current node, take new
0519              * node and continue
0520              */
0521             if (units == -1) {
0522                 needed_nodes++, i--, total_node_size = 0;
0523                 continue;
0524             }
0525         }
0526 
0527         /* something fits into the current node */
0528         start_bytes += units;
0529         snum012[needed_nodes - 1 + 3] = units;
0530 
0531         if (needed_nodes > 2)
0532             reiserfs_warning(tb->tb_sb, "vs-8111",
0533                      "split_item_position is out of range");
0534         snum012[needed_nodes - 1]++;
0535         split_item_positions[needed_nodes - 1] = i;
0536         needed_nodes++;
0537         /* continue from the same item with start_bytes != -1 */
0538         start_item = i;
0539         i--;
0540         total_node_size = 0;
0541     }
0542 
0543     /*
0544      * snum012[4] (if it is not -1) contains the number of units that
0545      * went to S1new, and snum012[3] the number that stayed in S0. Those
0546      * slots are supposed to hold s2bytes and s1bytes, so recalculate
0547      */
0548     if (snum012[4] > 0) {
0549         int split_item_num;
0550         int bytes_to_r, bytes_to_l;
0551         int bytes_to_S1new;
0552 
0553         split_item_num = split_item_positions[1];
0554         bytes_to_l =
0555             ((from == split_item_num
0556               && from_bytes != -1) ? from_bytes : 0);
0557         bytes_to_r =
0558             ((end_item == split_item_num
0559               && end_bytes != -1) ? end_bytes : 0);
0560         bytes_to_S1new =
0561             ((split_item_positions[0] ==
0562               split_item_positions[1]) ? snum012[3] : 0);
0563 
0564         /* s2bytes */
0565         snum012[4] =
0566             op_unit_num(&vn->vn_vi[split_item_num]) - snum012[4] -
0567             bytes_to_r - bytes_to_l - bytes_to_S1new;
0568 
0569         if (vn->vn_vi[split_item_num].vi_index != TYPE_DIRENTRY &&
0570             vn->vn_vi[split_item_num].vi_index != TYPE_INDIRECT)
0571             reiserfs_warning(tb->tb_sb, "vs-8115",
0572                      "not directory or indirect item");
0573     }
0574 
0575     /* now we know S2bytes, calculate S1bytes */
0576     if (snum012[3] > 0) {
0577         int split_item_num;
0578         int bytes_to_r, bytes_to_l;
0579         int bytes_to_S2new;
0580 
0581         split_item_num = split_item_positions[0];
0582         bytes_to_l =
0583             ((from == split_item_num
0584               && from_bytes != -1) ? from_bytes : 0);
0585         bytes_to_r =
0586             ((end_item == split_item_num
0587               && end_bytes != -1) ? end_bytes : 0);
0588         bytes_to_S2new =
0589             ((split_item_positions[0] == split_item_positions[1]
0590               && snum012[4] != -1) ? snum012[4] : 0);
0591 
0592         /* s1bytes */
0593         snum012[3] =
0594             op_unit_num(&vn->vn_vi[split_item_num]) - snum012[3] -
0595             bytes_to_r - bytes_to_l - bytes_to_S2new;
0596     }
0597 
0598     return needed_nodes;
0599 }
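/*
 * Layout of the snum012 slice filled by one call: snum012[0..2] hold
 * the item counts for S[0], S1new and S2new; snum012[3] (s1bytes) and
 * snum012[4] (s2bytes) hold the number of units of a split boundary
 * item that flow to the first and second new node, or stay -1 when no
 * item is split there.  The return value is the number of nodes needed.
 */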
0600 
0601 
0602 /*
0603  * Set parameters for balancing.
0604  * Writes the results of the balancing analysis into structure tb,
0605  * where they will later be used by the functions that actually do the balancing.
0606  * Parameters:
0607  *  tb  tree_balance structure;
0608  *  h   current level of the node;
0609  *  lnum    number of items from S[h] that must be shifted to L[h];
0610  *  rnum    number of items from S[h] that must be shifted to R[h];
0611  *  blk_num number of blocks that S[h] will be split into;
0612  *  s012    number of items that fall into the split nodes.
0613  *  lbytes  number of bytes which flow to the left neighbor from the
0614  *              item that is not shifted entirely
0615  *  rbytes  number of bytes which flow to the right neighbor from the
0616  *              item that is not shifted entirely
0617  *  s1bytes number of bytes which flow to the first  new node when
0618  *              S[0] splits (this number is contained in s012 array)
0619  */
0620 
0621 static void set_parameters(struct tree_balance *tb, int h, int lnum,
0622                int rnum, int blk_num, short *s012, int lb, int rb)
0623 {
0624 
0625     tb->lnum[h] = lnum;
0626     tb->rnum[h] = rnum;
0627     tb->blknum[h] = blk_num;
0628 
0629     /* only for leaf level */
0630     if (h == 0) {
0631         if (s012 != NULL) {
0632             tb->s0num = *s012++;
0633             tb->snum[0] = *s012++;
0634             tb->snum[1] = *s012++;
0635             tb->sbytes[0] = *s012++;
0636             tb->sbytes[1] = *s012;
0637         }
0638         tb->lbytes = lb;
0639         tb->rbytes = rb;
0640     }
0641     PROC_INFO_ADD(tb->tb_sb, lnum[h], lnum);
0642     PROC_INFO_ADD(tb->tb_sb, rnum[h], rnum);
0643 
0644     PROC_INFO_ADD(tb->tb_sb, lbytes[h], lb);
0645     PROC_INFO_ADD(tb->tb_sb, rbytes[h], rb);
0646 }
0647 
0648 /*
0649  * check if node disappears if we shift tb->lnum[0] items to left
0650  * neighbor and tb->rnum[0] to the right one.
0651  */
0652 static int is_leaf_removable(struct tree_balance *tb)
0653 {
0654     struct virtual_node *vn = tb->tb_vn;
0655     int to_left, to_right;
0656     int size;
0657     int remain_items;
0658 
0659     /*
0660      * number of items that will be shifted to left (right) neighbor
0661      * entirely
0662      */
0663     to_left = tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0);
0664     to_right = tb->rnum[0] - ((tb->rbytes != -1) ? 1 : 0);
0665     remain_items = vn->vn_nr_item;
0666 
0667     /* how many items remain in S[0] after shiftings to neighbors */
0668     remain_items -= (to_left + to_right);
0669 
0670     /* all content of node can be shifted to neighbors */
0671     if (remain_items < 1) {
0672         set_parameters(tb, 0, to_left, vn->vn_nr_item - to_left, 0,
0673                    NULL, -1, -1);
0674         return 1;
0675     }
0676 
0677     /* S[0] is not removable */
0678     if (remain_items > 1 || tb->lbytes == -1 || tb->rbytes == -1)
0679         return 0;
0680 
0681     /* check whether we can divide 1 remaining item between neighbors */
0682 
0683     /* get size of remaining item (in item units) */
0684     size = op_unit_num(&vn->vn_vi[to_left]);
0685 
0686     if (tb->lbytes + tb->rbytes >= size) {
0687         set_parameters(tb, 0, to_left + 1, to_right + 1, 0, NULL,
0688                    tb->lbytes, -1);
0689         return 1;
0690     }
0691 
0692     return 0;
0693 }
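/*
 * Example: with 5 virtual items, lnum[0] == 3 with lbytes != -1 and
 * rnum[0] == 3 with rbytes != -1, two items go left entirely and two
 * go right entirely; the single remaining item lets S[0] disappear as
 * well if lbytes + rbytes cover all of its units.
 */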
0694 
0695 /* check whether L, S, R can be joined in one node */
0696 static int are_leaves_removable(struct tree_balance *tb, int lfree, int rfree)
0697 {
0698     struct virtual_node *vn = tb->tb_vn;
0699     int ih_size;
0700     struct buffer_head *S0;
0701 
0702     S0 = PATH_H_PBUFFER(tb->tb_path, 0);
0703 
0704     ih_size = 0;
0705     if (vn->vn_nr_item) {
0706         if (vn->vn_vi[0].vi_type & VI_TYPE_LEFT_MERGEABLE)
0707             ih_size += IH_SIZE;
0708 
0709         if (vn->vn_vi[vn->vn_nr_item - 1].
0710             vi_type & VI_TYPE_RIGHT_MERGEABLE)
0711             ih_size += IH_SIZE;
0712     } else {
0713         /* there was only one item and it will be deleted */
0714         struct item_head *ih;
0715 
0716         RFALSE(B_NR_ITEMS(S0) != 1,
0717                "vs-8125: item number must be 1: it is %d",
0718                B_NR_ITEMS(S0));
0719 
0720         ih = item_head(S0, 0);
0721         if (tb->CFR[0]
0722             && !comp_short_le_keys(&ih->ih_key,
0723                        internal_key(tb->CFR[0],
0724                               tb->rkey[0])))
0725             /*
0726              * The directory must be in a correct state here:
0727              * somewhere to the left its first directory item must
0728              * exist. The item being deleted cannot be that first
0729              * one, because its right neighbor is an item of the
0730              * same directory (and the first item is always the
0731              * last one to be deleted). So the neighbors of the
0732              * deleted item can be merged, and we can save
0733              * ih_size
0734              */
0735             if (is_direntry_le_ih(ih)) {
0736                 ih_size = IH_SIZE;
0737 
0738                 /*
0739                  * we might check that left neighbor exists
0740                  * and is of the same directory
0741                  */
0742                 RFALSE(le_ih_k_offset(ih) == DOT_OFFSET,
0743                        "vs-8130: first directory item can not be removed until directory is not empty");
0744             }
0745 
0746     }
0747 
0748     if (MAX_CHILD_SIZE(S0) + vn->vn_size <= rfree + lfree + ih_size) {
0749         set_parameters(tb, 0, -1, -1, -1, NULL, -1, -1);
0750         PROC_INFO_INC(tb->tb_sb, leaves_removable);
0751         return 1;
0752     }
0753     return 0;
0754 
0755 }
0756 
0757 /* when we do not split the item, lnum and rnum are numbers of entire items */
0758 #define SET_PAR_SHIFT_LEFT \
0759 if (h)\
0760 {\
0761    int to_l;\
0762    \
0763    to_l = (MAX_NR_KEY(Sh)+1 - lpar + vn->vn_nr_item + 1) / 2 -\
0764           (MAX_NR_KEY(Sh) + 1 - lpar);\
0765           \
0766           set_parameters (tb, h, to_l, 0, lnver, NULL, -1, -1);\
0767 }\
0768 else \
0769 {\
0770    if (lset==LEFT_SHIFT_FLOW)\
0771      set_parameters (tb, h, lpar, 0, lnver, snum012+lset,\
0772              tb->lbytes, -1);\
0773    else\
0774      set_parameters (tb, h, lpar - (tb->lbytes!=-1), 0, lnver, snum012+lset,\
0775              -1, -1);\
0776 }
0777 
0778 #define SET_PAR_SHIFT_RIGHT \
0779 if (h)\
0780 {\
0781    int to_r;\
0782    \
0783    to_r = (MAX_NR_KEY(Sh)+1 - rpar + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - rpar);\
0784    \
0785    set_parameters (tb, h, 0, to_r, rnver, NULL, -1, -1);\
0786 }\
0787 else \
0788 {\
0789    if (rset==RIGHT_SHIFT_FLOW)\
0790      set_parameters (tb, h, 0, rpar, rnver, snum012+rset,\
0791           -1, tb->rbytes);\
0792    else\
0793      set_parameters (tb, h, 0, rpar - (tb->rbytes!=-1), rnver, snum012+rset,\
0794           -1, -1);\
0795 }
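/*
 * Both macros record the shift decision chosen by the caller: at
 * internal levels (h > 0) the number of items to move is computed, as
 * in the similarly commented formula in ip_check_balance(), so that
 * S[h] and the neighbor end up with nearly equal item counts; at the
 * leaf level the precomputed lpar/rpar values are stored, and the
 * partially shifted item is counted only when the corresponding
 * *_SHIFT_FLOW packing was chosen.
 */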
0796 
0797 static void free_buffers_in_tb(struct tree_balance *tb)
0798 {
0799     int i;
0800 
0801     pathrelse(tb->tb_path);
0802 
0803     for (i = 0; i < MAX_HEIGHT; i++) {
0804         brelse(tb->L[i]);
0805         brelse(tb->R[i]);
0806         brelse(tb->FL[i]);
0807         brelse(tb->FR[i]);
0808         brelse(tb->CFL[i]);
0809         brelse(tb->CFR[i]);
0810 
0811         tb->L[i] = NULL;
0812         tb->R[i] = NULL;
0813         tb->FL[i] = NULL;
0814         tb->FR[i] = NULL;
0815         tb->CFL[i] = NULL;
0816         tb->CFR[i] = NULL;
0817     }
0818 }
0819 
0820 /*
0821  * Get new buffers for storing new nodes that are created while balancing.
0822  * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked;
0823  *          CARRY_ON - schedule didn't occur while the function worked;
0824  *          NO_DISK_SPACE - no disk space.
0825  */
0826 /* The function is NOT SCHEDULE-SAFE! */
0827 static int get_empty_nodes(struct tree_balance *tb, int h)
0828 {
0829     struct buffer_head *new_bh, *Sh = PATH_H_PBUFFER(tb->tb_path, h);
0830     b_blocknr_t *blocknr, blocknrs[MAX_AMOUNT_NEEDED] = { 0, };
0831     int counter, number_of_freeblk;
0832     int  amount_needed; /* number of needed empty blocks */
0833     int  retval = CARRY_ON;
0834     struct super_block *sb = tb->tb_sb;
0835 
0836     /*
0837      * number_of_freeblk is the number of empty blocks which have been
0838      * acquired for use by the balancing algorithm minus the number of
0839      * empty blocks used in the previous levels of the analysis.
0840      * number_of_freeblk (= tb->cur_blknum) can be non-zero if a schedule
0841      * occurs after empty blocks are acquired and the balancing analysis
0842      * is then restarted. amount_needed is the number needed by this
0843      * level (h) of the balancing analysis.
0844      *
0845      * Note that for systems with many processes writing, it would be
0846      * more layout optimal to calculate the total number needed by all
0847      * levels and then to run reiserfs_new_blocks to get all of them at
0848      * once.
0849      */
0850 
0851     /*
0852      * Initiate number_of_freeblk to the amount acquired prior to the
0853      * restart of the analysis or 0 if not restarted, then subtract the
0854      * amount needed by all of the levels of the tree below h.
0855      */
0856     /* blknum includes S[h], so we subtract 1 in this calculation */
0857     for (counter = 0, number_of_freeblk = tb->cur_blknum;
0858          counter < h; counter++)
0859         number_of_freeblk -=
0860             (tb->blknum[counter]) ? (tb->blknum[counter] -
0861                            1) : 0;
0862 
0863     /* Allocate missing empty blocks. */
0864     /* if Sh == 0  then we are getting a new root */
0865     amount_needed = (Sh) ? (tb->blknum[h] - 1) : 1;
0866     /*
0867      * Amount_needed = the amount that we need more than the
0868      * amount that we have.
0869      */
0870     if (amount_needed > number_of_freeblk)
0871         amount_needed -= number_of_freeblk;
0872     else    /* If we have enough already then there is nothing to do. */
0873         return CARRY_ON;
0874 
0875     /*
0876      * No need to check quota - it is not allocated for blocks used
0877      * for formatted nodes
0878      */
0879     if (reiserfs_new_form_blocknrs(tb, blocknrs,
0880                        amount_needed) == NO_DISK_SPACE)
0881         return NO_DISK_SPACE;
0882 
0883     /* for each blocknumber we just got, get a buffer and stick it on FEB */
0884     for (blocknr = blocknrs, counter = 0;
0885          counter < amount_needed; blocknr++, counter++) {
0886 
0887         RFALSE(!*blocknr,
0888                "PAP-8135: reiserfs_new_blocknrs failed when got new blocks");
0889 
0890         new_bh = sb_getblk(sb, *blocknr);
0891         RFALSE(buffer_dirty(new_bh) ||
0892                buffer_journaled(new_bh) ||
0893                buffer_journal_dirty(new_bh),
0894                "PAP-8140: journaled or dirty buffer %b for the new block",
0895                new_bh);
0896 
0897         /* Put empty buffers into the array. */
0898         RFALSE(tb->FEB[tb->cur_blknum],
0899                "PAP-8141: busy slot for new buffer");
0900 
0901         set_buffer_journal_new(new_bh);
0902         tb->FEB[tb->cur_blknum++] = new_bh;
0903     }
0904 
0905     if (retval == CARRY_ON && FILESYSTEM_CHANGED_TB(tb))
0906         retval = REPEAT_SEARCH;
0907 
0908     return retval;
0909 }
0910 
0911 /*
0912  * Get free space of the left neighbor, which is stored in the parent
0913  * node of the left neighbor.
0914  */
0915 static int get_lfree(struct tree_balance *tb, int h)
0916 {
0917     struct buffer_head *l, *f;
0918     int order;
0919 
0920     if ((f = PATH_H_PPARENT(tb->tb_path, h)) == NULL ||
0921         (l = tb->FL[h]) == NULL)
0922         return 0;
0923 
0924     if (f == l)
0925         order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) - 1;
0926     else {
0927         order = B_NR_ITEMS(l);
0928         f = l;
0929     }
0930 
0931     return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order)));
0932 }
0933 
0934 /*
0935  * Get free space of the right neighbor,
0936  * which is stored in the parent node of the right neighbor.
0937  */
0938 static int get_rfree(struct tree_balance *tb, int h)
0939 {
0940     struct buffer_head *r, *f;
0941     int order;
0942 
0943     if ((f = PATH_H_PPARENT(tb->tb_path, h)) == NULL ||
0944         (r = tb->FR[h]) == NULL)
0945         return 0;
0946 
0947     if (f == r)
0948         order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) + 1;
0949     else {
0950         order = 0;
0951         f = r;
0952     }
0953 
0954     return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order)));
0955 
0956 }
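/*
 * Note that both helpers read the neighbor's free space from the
 * dc_size field recorded in the parent, so the neighbor's own buffer
 * does not need to be read for this part of the analysis.
 */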
0957 
0958 /* Check whether left neighbor is in memory. */
0959 static int is_left_neighbor_in_cache(struct tree_balance *tb, int h)
0960 {
0961     struct buffer_head *father, *left;
0962     struct super_block *sb = tb->tb_sb;
0963     b_blocknr_t left_neighbor_blocknr;
0964     int left_neighbor_position;
0965 
0966     /* Father of the left neighbor does not exist. */
0967     if (!tb->FL[h])
0968         return 0;
0969 
0970     /* Calculate father of the node to be balanced. */
0971     father = PATH_H_PBUFFER(tb->tb_path, h + 1);
0972 
0973     RFALSE(!father ||
0974            !B_IS_IN_TREE(father) ||
0975            !B_IS_IN_TREE(tb->FL[h]) ||
0976            !buffer_uptodate(father) ||
0977            !buffer_uptodate(tb->FL[h]),
0978            "vs-8165: F[h] (%b) or FL[h] (%b) is invalid",
0979            father, tb->FL[h]);
0980 
0981     /*
0982      * Get position of the pointer to the left neighbor
0983      * into the left father.
0984      */
0985     left_neighbor_position = (father == tb->FL[h]) ?
0986         tb->lkey[h] : B_NR_ITEMS(tb->FL[h]);
0987     /* Get left neighbor block number. */
0988     left_neighbor_blocknr =
0989         B_N_CHILD_NUM(tb->FL[h], left_neighbor_position);
0990     /* Look for the left neighbor in the cache. */
0991     if ((left = sb_find_get_block(sb, left_neighbor_blocknr))) {
0992 
0993         RFALSE(buffer_uptodate(left) && !B_IS_IN_TREE(left),
0994                "vs-8170: left neighbor (%b %z) is not in the tree",
0995                left, left);
0996         put_bh(left);
0997         return 1;
0998     }
0999 
1000     return 0;
1001 }
1002 
1003 #define LEFT_PARENTS  'l'
1004 #define RIGHT_PARENTS 'r'
1005 
1006 static void decrement_key(struct cpu_key *key)
1007 {
1008     /* call item specific function for this key */
1009     item_ops[cpu_key_k_type(key)]->decrement_key(key);
1010 }
1011 
1012 /*
1013  * Calculate far left/right parent of the left/right neighbor of the
1014  * current node, that is calculate the left/right (FL[h]/FR[h]) neighbor
1015  * of the parent F[h].
1016  * Calculate left/right common parent of the current node and L[h]/R[h].
1017  * Calculate left/right delimiting key position.
1018  * Returns: PATH_INCORRECT    - path in the tree is not correct
1019  *      SCHEDULE_OCCURRED - schedule occurred while the function worked
1020  *          CARRY_ON          - schedule didn't occur while the function
1021  *                  worked
1022  */
1023 static int get_far_parent(struct tree_balance *tb,
1024               int h,
1025               struct buffer_head **pfather,
1026               struct buffer_head **pcom_father, char c_lr_par)
1027 {
1028     struct buffer_head *parent;
1029     INITIALIZE_PATH(s_path_to_neighbor_father);
1030     struct treepath *path = tb->tb_path;
1031     struct cpu_key s_lr_father_key;
1032     int counter,
1033         position = INT_MAX,
1034         first_last_position = 0,
1035         path_offset = PATH_H_PATH_OFFSET(path, h);
1036 
1037     /*
1038      * Starting from F[h] go upwards in the tree, and look for the common
1039      * ancestor of F[h], and its neighbor l/r, that should be obtained.
1040      */
1041 
1042     counter = path_offset;
1043 
1044     RFALSE(counter < FIRST_PATH_ELEMENT_OFFSET,
1045            "PAP-8180: invalid path length");
1046 
1047     for (; counter > FIRST_PATH_ELEMENT_OFFSET; counter--) {
1048         /*
1049          * Check whether parent of the current buffer in the path
1050          * is really parent in the tree.
1051          */
1052         if (!B_IS_IN_TREE
1053             (parent = PATH_OFFSET_PBUFFER(path, counter - 1)))
1054             return REPEAT_SEARCH;
1055 
1056         /* Check whether position in the parent is correct. */
1057         if ((position =
1058              PATH_OFFSET_POSITION(path,
1059                       counter - 1)) >
1060             B_NR_ITEMS(parent))
1061             return REPEAT_SEARCH;
1062 
1063         /*
1064          * Check whether parent at the path really points
1065          * to the child.
1066          */
1067         if (B_N_CHILD_NUM(parent, position) !=
1068             PATH_OFFSET_PBUFFER(path, counter)->b_blocknr)
1069             return REPEAT_SEARCH;
1070 
1071         /*
1072          * Return delimiting key if position in the parent is not
1073          * equal to first/last one.
1074          */
1075         if (c_lr_par == RIGHT_PARENTS)
1076             first_last_position = B_NR_ITEMS(parent);
1077         if (position != first_last_position) {
1078             *pcom_father = parent;
1079             get_bh(*pcom_father);
1080             /*(*pcom_father = parent)->b_count++; */
1081             break;
1082         }
1083     }
1084 
1085     /* if we are in the root of the tree, then there is no common father */
1086     if (counter == FIRST_PATH_ELEMENT_OFFSET) {
1087         /*
1088          * Check whether first buffer in the path is the
1089          * root of the tree.
1090          */
1091         if (PATH_OFFSET_PBUFFER
1092             (tb->tb_path,
1093              FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
1094             SB_ROOT_BLOCK(tb->tb_sb)) {
1095             *pfather = *pcom_father = NULL;
1096             return CARRY_ON;
1097         }
1098         return REPEAT_SEARCH;
1099     }
1100 
1101     RFALSE(B_LEVEL(*pcom_father) <= DISK_LEAF_NODE_LEVEL,
1102            "PAP-8185: (%b %z) level too small",
1103            *pcom_father, *pcom_father);
1104 
1105     /* Check whether the common parent is locked. */
1106 
1107     if (buffer_locked(*pcom_father)) {
1108 
1109         /* Release the write lock while the buffer is busy */
1110         int depth = reiserfs_write_unlock_nested(tb->tb_sb);
1111         __wait_on_buffer(*pcom_father);
1112         reiserfs_write_lock_nested(tb->tb_sb, depth);
1113         if (FILESYSTEM_CHANGED_TB(tb)) {
1114             brelse(*pcom_father);
1115             return REPEAT_SEARCH;
1116         }
1117     }
1118 
1119     /*
1120      * So, we got common parent of the current node and its
1121      * left/right neighbor.  Now we are getting the parent of the
1122      * left/right neighbor.
1123      */
1124 
1125     /* Form key to get parent of the left/right neighbor. */
1126     le_key2cpu_key(&s_lr_father_key,
1127                internal_key(*pcom_father,
1128                       (c_lr_par ==
1129                        LEFT_PARENTS) ? (tb->lkey[h - 1] =
1130                             position -
1131                             1) : (tb->rkey[h -
1132                                        1] =
1133                                   position)));
1134 
1135     if (c_lr_par == LEFT_PARENTS)
1136         decrement_key(&s_lr_father_key);
1137 
1138     if (search_by_key
1139         (tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father,
1140          h + 1) == IO_ERROR)
1141         /* path is released */
1142         return IO_ERROR;
1143 
1144     if (FILESYSTEM_CHANGED_TB(tb)) {
1145         pathrelse(&s_path_to_neighbor_father);
1146         brelse(*pcom_father);
1147         return REPEAT_SEARCH;
1148     }
1149 
1150     *pfather = PATH_PLAST_BUFFER(&s_path_to_neighbor_father);
1151 
1152     RFALSE(B_LEVEL(*pfather) != h + 1,
1153            "PAP-8190: (%b %z) level too small", *pfather, *pfather);
1154     RFALSE(s_path_to_neighbor_father.path_length <
1155            FIRST_PATH_ELEMENT_OFFSET, "PAP-8192: path length is too small");
1156 
1157     s_path_to_neighbor_father.path_length--;
1158     pathrelse(&s_path_to_neighbor_father);
1159     return CARRY_ON;
1160 }
1161 
1162 /*
1163  * Get parents of neighbors of node in the path(S[path_offset]) and
1164  * common parents of S[path_offset] and L[path_offset]/R[path_offset]:
1165  * F[path_offset], FL[path_offset], FR[path_offset], CFL[path_offset],
1166  * CFR[path_offset].
1167  * Calculate numbers of left and right delimiting keys position:
1168  * lkey[path_offset], rkey[path_offset].
1169  * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked
1170  *          CARRY_ON - schedule didn't occur while the function worked
1171  */
1172 static int get_parents(struct tree_balance *tb, int h)
1173 {
1174     struct treepath *path = tb->tb_path;
1175     int position,
1176         ret,
1177         path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h);
1178     struct buffer_head *curf, *curcf;
1179 
1180     /* Current node is the root of the tree or will be root of the tree */
1181     if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
1182         /*
1183          * The root can not have parents.
1184          * Release nodes which previously were obtained as
1185          * parents of the current node neighbors.
1186          */
1187         brelse(tb->FL[h]);
1188         brelse(tb->CFL[h]);
1189         brelse(tb->FR[h]);
1190         brelse(tb->CFR[h]);
1191         tb->FL[h]  = NULL;
1192         tb->CFL[h] = NULL;
1193         tb->FR[h]  = NULL;
1194         tb->CFR[h] = NULL;
1195         return CARRY_ON;
1196     }
1197 
1198     /* Get parent FL[path_offset] of L[path_offset]. */
1199     position = PATH_OFFSET_POSITION(path, path_offset - 1);
1200     if (position) {
1201         /* Current node is not the first child of its parent. */
1202         curf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
1203         curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
1204         get_bh(curf);
1205         get_bh(curf);
1206         tb->lkey[h] = position - 1;
1207     } else {
1208         /*
1209          * Calculate current parent of L[path_offset], which is the
1210          * left neighbor of the current node.  Calculate current
1211          * common parent of L[path_offset] and the current node.
1212          * Note that CFL[path_offset] not equal FL[path_offset] and
1213          * CFL[path_offset] not equal F[path_offset].
1214          * Calculate lkey[path_offset].
1215          */
1216         if ((ret = get_far_parent(tb, h + 1, &curf,
1217                           &curcf,
1218                           LEFT_PARENTS)) != CARRY_ON)
1219             return ret;
1220     }
1221 
1222     brelse(tb->FL[h]);
1223     tb->FL[h] = curf;   /* New initialization of FL[h]. */
1224     brelse(tb->CFL[h]);
1225     tb->CFL[h] = curcf; /* New initialization of CFL[h]. */
1226 
1227     RFALSE((curf && !B_IS_IN_TREE(curf)) ||
1228            (curcf && !B_IS_IN_TREE(curcf)),
1229            "PAP-8195: FL (%b) or CFL (%b) is invalid", curf, curcf);
1230 
1231     /* Get parent FR[h] of R[h]. */
1232 
1233     /* Current node is the last child of F[h]. FR[h] != F[h]. */
1234     if (position == B_NR_ITEMS(PATH_H_PBUFFER(path, h + 1))) {
1235         /*
1236          * Calculate current parent of R[h], which is the right
1237          * neighbor of F[h].  Calculate current common parent of
1238          * R[h] and current node. Note that CFR[h] not equal
1239          * FR[path_offset] and CFR[h] not equal F[h].
1240          */
1241         if ((ret =
1242              get_far_parent(tb, h + 1, &curf, &curcf,
1243                     RIGHT_PARENTS)) != CARRY_ON)
1244             return ret;
1245     } else {
1246         /* Current node is not the last child of its parent F[h]. */
1247         curf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
1248         curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
1249         get_bh(curf);
1250         get_bh(curf);
1251         tb->rkey[h] = position;
1252     }
1253 
1254     brelse(tb->FR[h]);
1255     /* New initialization of FR[path_offset]. */
1256     tb->FR[h] = curf;
1257 
1258     brelse(tb->CFR[h]);
1259     /* New initialization of CFR[path_offset]. */
1260     tb->CFR[h] = curcf;
1261 
1262     RFALSE((curf && !B_IS_IN_TREE(curf)) ||
1263            (curcf && !B_IS_IN_TREE(curcf)),
1264            "PAP-8205: FR (%b) or CFR (%b) is invalid", curf, curcf);
1265 
1266     return CARRY_ON;
1267 }
1268 
1269 /*
1270  * it is possible to remove the node as a result of shifting to the
1271  * neighbors even when we insert or paste an item.
1272  */
1273 static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree,
1274                       struct tree_balance *tb, int h)
1275 {
1276     struct buffer_head *Sh = PATH_H_PBUFFER(tb->tb_path, h);
1277     int levbytes = tb->insert_size[h];
1278     struct item_head *ih;
1279     struct reiserfs_key *r_key = NULL;
1280 
1281     ih = item_head(Sh, 0);
1282     if (tb->CFR[h])
1283         r_key = internal_key(tb->CFR[h], tb->rkey[h]);
1284 
1285     if (lfree + rfree + sfree < MAX_CHILD_SIZE(Sh) + levbytes
1286         /* shifting may merge items which might save space */
1287         -
1288         ((!h
1289           && op_is_left_mergeable(&ih->ih_key, Sh->b_size)) ? IH_SIZE : 0)
1290         -
1291         ((!h && r_key
1292           && op_is_left_mergeable(r_key, Sh->b_size)) ? IH_SIZE : 0)
1293         + ((h) ? KEY_SIZE : 0)) {
1294         /* node can not be removed */
1295         if (sfree >= levbytes) {
1296             /* new item fits into node S[h] without any shifting */
1297             if (!h)
1298                 tb->s0num =
1299                     B_NR_ITEMS(Sh) +
1300                     ((mode == M_INSERT) ? 1 : 0);
1301             set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1302             return NO_BALANCING_NEEDED;
1303         }
1304     }
1305     PROC_INFO_INC(tb->tb_sb, can_node_be_removed[h]);
1306     return !NO_BALANCING_NEEDED;
1307 }
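/*
 * In other words, NO_BALANCING_NEEDED is returned only when S[h]
 * cannot be emptied into L[h] and R[h] (allowing for the item-header
 * savings of possible merges at the leaf level) and the new data fits
 * into S[h] as it is; in every other case callers such as
 * ip_check_balance() continue with the full balance analysis.
 */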
1308 
1309 /*
1310  * Check whether current node S[h] is balanced when increasing its size by
1311  * Inserting or Pasting.
1312  * Calculate parameters for balancing for current level h.
1313  * Parameters:
1314  *  tb  tree_balance structure;
1315  *  h   current level of the node;
1316  *  inum    item number in S[h];
1317  *  mode    i - insert, p - paste;
1318  * Returns: 1 - schedule occurred;
1319  *          0 - balancing for higher levels needed;
1320  *         -1 - no balancing for higher levels needed;
1321  *         -2 - no disk space.
1322  */
1323 /* ip means Inserting or Pasting */
1324 static int ip_check_balance(struct tree_balance *tb, int h)
1325 {
1326     struct virtual_node *vn = tb->tb_vn;
1327     /*
1328      * Number of bytes that must be inserted into (value is negative
1329      * if bytes are deleted) buffer which contains node being balanced.
1330      * The mnemonic is that the attempted change in node space used
1331      * level is levbytes bytes.
1332      */
1333     int levbytes;
1334     int ret;
1335 
1336     int lfree, sfree, rfree /* free space in L, S and R */ ;
1337 
1338     /*
1339      * nver is short for number of vertices, and lnver is the number if
1340      * we shift to the left, rnver is the number if we shift to the
1341      * right, and lrnver is the number if we shift in both directions.
1342      * The goal is to minimize first the number of vertices, and second,
1343      * the number of vertices whose contents are changed by shifting,
1344      * and third the number of uncached vertices whose contents are
1345      * changed by shifting and must be read from disk.
1346      */
1347     int nver, lnver, rnver, lrnver;
1348 
1349     /*
1350      * used at leaf level only, S0 = S[0] is the node being balanced,
1351      * sInum [ I = 0,1,2 ] is the number of items that will
1352      * remain in node SI after balancing.  S1 and S2 are new
1353      * nodes that might be created.
1354      */
1355 
1356     /*
1357      * we perform 8 calls to get_num_ver().  For each call we
1358      * calculate five parameters, where the 4th parameter is s1bytes
1359      * and the 5th is s2bytes
1360      *
1361      * s0num, s1num, s2num for 8 cases
1362      * 0,1 - do not shift and do not shift but bottle
1363      * 2   - shift only whole item to left
1364      * 3   - shift to left and bottle as much as possible
1365      * 4,5 - shift to right (whole items and as much as possible)
1366      * 6,7 - shift to both directions (whole items and as much as possible)
1367      */
1368     short snum012[40] = { 0, };
1369 
1370     /* Sh is the node whose balance is currently being checked */
1371     struct buffer_head *Sh;
1372 
1373     Sh = PATH_H_PBUFFER(tb->tb_path, h);
1374     levbytes = tb->insert_size[h];
1375 
1376     /* Calculate balance parameters for creating new root. */
1377     if (!Sh) {
1378         if (!h)
1379             reiserfs_panic(tb->tb_sb, "vs-8210",
1380                        "S[0] can not be 0");
1381         switch (ret = get_empty_nodes(tb, h)) {
1382         /* no balancing for higher levels needed */
1383         case CARRY_ON:
1384             set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1385             return NO_BALANCING_NEEDED;
1386 
1387         case NO_DISK_SPACE:
1388         case REPEAT_SEARCH:
1389             return ret;
1390         default:
1391             reiserfs_panic(tb->tb_sb, "vs-8215", "incorrect "
1392                        "return value of get_empty_nodes");
1393         }
1394     }
1395 
1396     /* get parents of S[h] neighbors. */
1397     ret = get_parents(tb, h);
1398     if (ret != CARRY_ON)
1399         return ret;
1400 
1401     sfree = B_FREE_SPACE(Sh);
1402 
1403     /* get free space of neighbors */
1404     rfree = get_rfree(tb, h);
1405     lfree = get_lfree(tb, h);
1406 
1407     /* and new item fits into node S[h] without any shifting */
1408     if (can_node_be_removed(vn->vn_mode, lfree, sfree, rfree, tb, h) ==
1409         NO_BALANCING_NEEDED)
1410         return NO_BALANCING_NEEDED;
1411 
1412     create_virtual_node(tb, h);
1413 
1414     /*
1415      * determine maximal number of items we can shift to the left
1416      * neighbor (in tb structure) and the maximal number of bytes
1417      * that can flow to the left neighbor from the left most liquid
1418      * item that cannot be shifted from S[0] entirely (returned value)
1419      */
1420     check_left(tb, h, lfree);
1421 
1422     /*
1423      * determine maximal number of items we can shift to the right
1424      * neighbor (in tb structure) and the maximal number of bytes
1425      * that can flow to the right neighbor from the right most liquid
1426      * item that cannot be shifted from S[0] entirely (returned value)
1427      */
1428     check_right(tb, h, rfree);
1429 
1430     /*
1431      * all contents of internal node S[h] can be moved into its
1432      * neighbors, S[h] will be removed after balancing
1433      */
1434     if (h && (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)) {
1435         int to_r;
1436 
1437         /*
1438          * Since we are working on internal nodes, and our internal
1439          * nodes have fixed size entries, then we can balance by the
1440          * number of items rather than the space they consume.  In this
1441          * routine we set the left node equal to the right node,
1442          * allowing a difference of less than or equal to 1 child
1443          * pointer.
1444          */
1445         to_r =
1446             ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] +
1447              vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 -
1448                         tb->rnum[h]);
1449         set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL,
1450                    -1, -1);
1451         return CARRY_ON;
1452     }
1453 
1454     /*
1455      * this checks the balance condition, i.e. that no two neighboring
1456      * nodes can fit in one node
1457      */
1458     RFALSE(h &&
1459            (tb->lnum[h] >= vn->vn_nr_item + 1 ||
1460         tb->rnum[h] >= vn->vn_nr_item + 1),
1461            "vs-8220: tree is not balanced on internal level");
1462     RFALSE(!h && ((tb->lnum[h] >= vn->vn_nr_item && (tb->lbytes == -1)) ||
1463               (tb->rnum[h] >= vn->vn_nr_item && (tb->rbytes == -1))),
1464            "vs-8225: tree is not balanced on leaf level");
1465 
1466     /*
1467      * all contents of S[0] can be moved into its neighbors
1468      * S[0] will be removed after balancing.
1469      */
1470     if (!h && is_leaf_removable(tb))
1471         return CARRY_ON;
1472 
1473     /*
1474      * why do we perform this check here rather than earlier?
1475      * Answer: we can win 1 node in some cases above. Moreover, we
1476      * already checked this above, when we checked that S[0] is not removable
1477      * in principle
1478      */
1479 
1480      /* new item fits into node S[h] without any shifting */
1481     if (sfree >= levbytes) {
1482         if (!h)
1483             tb->s0num = vn->vn_nr_item;
1484         set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1485         return NO_BALANCING_NEEDED;
1486     }
1487 
1488     {
1489         int lpar, rpar, nset, lset, rset, lrset;
1490         /* regular overflowing of the node */
1491 
1492         /*
1493          * get_num_ver works in 2 modes (FLOW & NO_FLOW)
1494          * lpar, rpar - number of items we can shift to left/right
1495          *              neighbor (including splitting item)
1496          * nset, lset, rset, lrset - shows, whether flowing items
1497          *                           give better packing
1498          */
1499 #define FLOW 1
1500 #define NO_FLOW 0       /* do not do any splitting */
1501 
1502         /* we choose one of the following */
1503 #define NOTHING_SHIFT_NO_FLOW   0
1504 #define NOTHING_SHIFT_FLOW  5
1505 #define LEFT_SHIFT_NO_FLOW  10
1506 #define LEFT_SHIFT_FLOW     15
1507 #define RIGHT_SHIFT_NO_FLOW 20
1508 #define RIGHT_SHIFT_FLOW    25
1509 #define LR_SHIFT_NO_FLOW    30
1510 #define LR_SHIFT_FLOW       35
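        /*
         * Each constant above is an offset into snum012[]: the eight
         * get_num_ver() calls below each fill a block of five shorts
         * (three item counts plus s1bytes and s2bytes), so the blocks
         * start at 0, 5, 10, ..., 35 and snum012 is declared as
         * short snum012[40].
         */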
1511 
1512         lpar = tb->lnum[h];
1513         rpar = tb->rnum[h];
1514 
1515         /*
1516          * calculate number of blocks S[h] must be split into when
1517          * nothing is shifted to the neighbors, as well as number of
1518          * items in each part of the split node (s012 numbers),
1519          * and number of bytes (s1bytes) of the shared drop which
1520          * flow to S1 if any
1521          */
1522         nset = NOTHING_SHIFT_NO_FLOW;
1523         nver = get_num_ver(vn->vn_mode, tb, h,
1524                    0, -1, h ? vn->vn_nr_item : 0, -1,
1525                    snum012, NO_FLOW);
1526 
1527         if (!h) {
1528             int nver1;
1529 
1530             /*
1531              * note, that in this case we try to bottle
1532              * between S[0] and S1 (S1 - the first new node)
1533              */
1534             nver1 = get_num_ver(vn->vn_mode, tb, h,
1535                         0, -1, 0, -1,
1536                         snum012 + NOTHING_SHIFT_FLOW, FLOW);
1537             if (nver > nver1)
1538                 nset = NOTHING_SHIFT_FLOW, nver = nver1;
1539         }
1540 
1541         /*
1542          * calculate number of blocks S[h] must be split into when
1543          * l_shift_num first items and l_shift_bytes of the
1544          * rightmost liquid item to be shifted are shifted to the left
1545          * neighbor, as well as the number of items in each part of the
1546          * split node (s012 numbers), and the number of bytes
1547          * (s1bytes) of the shared drop which flow to S1, if any
1548          */
1549         lset = LEFT_SHIFT_NO_FLOW;
1550         lnver = get_num_ver(vn->vn_mode, tb, h,
1551                     lpar - ((h || tb->lbytes == -1) ? 0 : 1),
1552                     -1, h ? vn->vn_nr_item : 0, -1,
1553                     snum012 + LEFT_SHIFT_NO_FLOW, NO_FLOW);
1554         if (!h) {
1555             int lnver1;
1556 
1557             lnver1 = get_num_ver(vn->vn_mode, tb, h,
1558                          lpar -
1559                          ((tb->lbytes != -1) ? 1 : 0),
1560                          tb->lbytes, 0, -1,
1561                          snum012 + LEFT_SHIFT_FLOW, FLOW);
1562             if (lnver > lnver1)
1563                 lset = LEFT_SHIFT_FLOW, lnver = lnver1;
1564         }
1565 
1566         /*
1567          * calculate number of blocks S[h] must be split into when
1568          * r_shift_num first items and r_shift_bytes of the leftmost
1569          * liquid item to be shifted are shifted to the right neighbor,
1570          * as well as the number of items in each part of the split
1571          * node (s012 numbers), and the number of bytes (s1bytes) of the
1572          * shared drop which flow to S1, if any
1573          */
1574         rset = RIGHT_SHIFT_NO_FLOW;
1575         rnver = get_num_ver(vn->vn_mode, tb, h,
1576                     0, -1,
1577                     h ? (vn->vn_nr_item - rpar)
1578                       : (rpar -
1579                          ((tb->rbytes != -1)
1580                           ? 1 : 0)),
1581                     -1,
1582                     snum012 + RIGHT_SHIFT_NO_FLOW, NO_FLOW);
1583         if (!h) {
1584             int rnver1;
1585 
1586             rnver1 = get_num_ver(vn->vn_mode, tb, h,
1587                          0, -1,
1588                          (rpar -
1589                           ((tb->rbytes != -1) ? 1 : 0)),
1590                          tb->rbytes,
1591                          snum012 + RIGHT_SHIFT_FLOW, FLOW);
1592 
1593             if (rnver > rnver1)
1594                 rset = RIGHT_SHIFT_FLOW, rnver = rnver1;
1595         }
1596 
1597         /*
1598          * calculate number of blocks S[h] must be split into when
1599          * items are shifted in both directions, as well as number
1600          * of items in each part of the split node (s012 numbers),
1601          * and number of bytes (s1bytes) of the shared drop which
1602          * flow to S1 if any
1603          */
1604         lrset = LR_SHIFT_NO_FLOW;
1605         lrnver = get_num_ver(vn->vn_mode, tb, h,
1606                      lpar - ((h || tb->lbytes == -1) ? 0 : 1),
1607                      -1,
1608                      h ? (vn->vn_nr_item - rpar)
1609                        : (rpar -
1610                           ((tb->rbytes != -1)
1611                            ? 1 : 0)),
1612                      -1,
1613                      snum012 + LR_SHIFT_NO_FLOW, NO_FLOW);
1614         if (!h) {
1615             int lrnver1;
1616 
1617             lrnver1 = get_num_ver(vn->vn_mode, tb, h,
1618                           lpar -
1619                           ((tb->lbytes != -1) ? 1 : 0),
1620                           tb->lbytes,
1621                           (rpar -
1622                            ((tb->rbytes != -1) ? 1 : 0)),
1623                           tb->rbytes,
1624                           snum012 + LR_SHIFT_FLOW, FLOW);
1625             if (lrnver > lrnver1)
1626                 lrset = LR_SHIFT_FLOW, lrnver = lrnver1;
1627         }
1628 
1629         /*
1630          * Our general shifting strategy is:
1631          * 1) minimize the number of new nodes;
1632          * 2) minimize the number of neighbors involved in shifting;
1633          * 3) minimize the number of disk reads.
1634          */
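        /*
         * The decisions below follow that order: shifting to both
         * neighbors is chosen only when it needs strictly fewer new
         * nodes than either single direction, no shifting is done at
         * all when it would not reduce the node count, otherwise the
         * cheaper single direction wins, and a left/right tie is
         * broken in favor of the neighbor already in cache to avoid
         * an extra disk read.
         */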
1635 
1636         /* we can win TWO or ONE nodes by shifting in both directions */
1637         if (lrnver < lnver && lrnver < rnver) {
1638             RFALSE(h &&
1639                    (tb->lnum[h] != 1 ||
1640                 tb->rnum[h] != 1 ||
1641                 lrnver != 1 || rnver != 2 || lnver != 2
1642                 || h != 1), "vs-8230: bad h");
1643             if (lrset == LR_SHIFT_FLOW)
1644                 set_parameters(tb, h, tb->lnum[h], tb->rnum[h],
1645                            lrnver, snum012 + lrset,
1646                            tb->lbytes, tb->rbytes);
1647             else
1648                 set_parameters(tb, h,
1649                            tb->lnum[h] -
1650                            ((tb->lbytes == -1) ? 0 : 1),
1651                            tb->rnum[h] -
1652                            ((tb->rbytes == -1) ? 0 : 1),
1653                            lrnver, snum012 + lrset, -1, -1);
1654 
1655             return CARRY_ON;
1656         }
1657 
1658         /*
1659          * if shifting doesn't lead to better packing
1660          * then don't shift
1661          */
1662         if (nver == lrnver) {
1663             set_parameters(tb, h, 0, 0, nver, snum012 + nset, -1,
1664                        -1);
1665             return CARRY_ON;
1666         }
1667 
1668         /*
1669          * now we know that for better packing shifting in only one
1670          * direction either to the left or to the right is required
1671          */
1672 
1673         /*
1674          * if shifting to the left is better than
1675          * shifting to the right
1676          */
1677         if (lnver < rnver) {
1678             SET_PAR_SHIFT_LEFT;
1679             return CARRY_ON;
1680         }
1681 
1682         /*
1683          * if shifting to the right is better than
1684          * shifting to the left
1685          */
1686         if (lnver > rnver) {
1687             SET_PAR_SHIFT_RIGHT;
1688             return CARRY_ON;
1689         }
1690 
1691         /*
1692          * now shifting in either direction gives the same number
1693          * of nodes and we can make use of the cached neighbors
1694          */
1695         if (is_left_neighbor_in_cache(tb, h)) {
1696             SET_PAR_SHIFT_LEFT;
1697             return CARRY_ON;
1698         }
1699 
1700         /*
1701          * shift to the right regardless of whether the
1702          * right neighbor is in cache or not
1703          */
1704         SET_PAR_SHIFT_RIGHT;
1705         return CARRY_ON;
1706     }
1707 }
1708 
1709 /*
1710  * Check whether current node S[h] is balanced when Decreasing its size by
1711  * Deleting or Cutting for INTERNAL node of S+tree.
1712  * Calculate parameters for balancing for current level h.
1713  * Parameters:
1714  *  tb  tree_balance structure;
1715  *  h   current level of the node;
1716  *  inum    item number in S[h];
1717  *  mode    d - delete, c - cut;
1718  * Returns: 1 - schedule occurred;
1719  *          0 - balancing for higher levels needed;
1720  *         -1 - no balancing for higher levels needed;
1721  *         -2 - no disk space.
1722  *
1723  * Note: Items of internal nodes have fixed size, so the balance condition for
1724  * the internal part of S+tree is as for the B-trees.
1725  */
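/*
 * Roughly: if S[h] still holds at least MIN_NR_KEY(Sh) items we balance
 * only when the whole node can be merged away into its neighbors;
 * otherwise S[h] is under-filled and we either merge it into one
 * neighbor, spread it over both, or borrow enough items from a
 * neighbor to refill it.
 */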
1726 static int dc_check_balance_internal(struct tree_balance *tb, int h)
1727 {
1728     struct virtual_node *vn = tb->tb_vn;
1729 
1730     /*
1731      * Sh is the node whose balance is currently being checked,
1732      * and Fh is its father.
1733      */
1734     struct buffer_head *Sh, *Fh;
1735     int ret;
1736     int lfree, rfree /* free space in L and R */ ;
1737 
1738     Sh = PATH_H_PBUFFER(tb->tb_path, h);
1739     Fh = PATH_H_PPARENT(tb->tb_path, h);
1740 
1741     /*
1742      * using tb->insert_size[h], which is negative in this case,
1743      * create_virtual_node calculates:
1744      * new_nr_item = number of items node would have if operation is
1745      * performed without balancing (new_nr_item);
1746      */
1747     create_virtual_node(tb, h);
1748 
1749     if (!Fh) {      /* S[h] is the root. */
1750         /* no balancing for higher levels needed */
1751         if (vn->vn_nr_item > 0) {
1752             set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1753             return NO_BALANCING_NEEDED;
1754         }
1755         /*
1756          * new_nr_item == 0.
1757          * Current root will be deleted resulting in
1758          * decrementing the tree height.
1759          */
1760         set_parameters(tb, h, 0, 0, 0, NULL, -1, -1);
1761         return CARRY_ON;
1762     }
1763 
1764     if ((ret = get_parents(tb, h)) != CARRY_ON)
1765         return ret;
1766 
1767     /* get free space of neighbors */
1768     rfree = get_rfree(tb, h);
1769     lfree = get_lfree(tb, h);
1770 
1771     /* determine maximal number of items we can fit into neighbors */
1772     check_left(tb, h, lfree);
1773     check_right(tb, h, rfree);
1774 
1775     /*
1776      * Balance condition for the internal node is valid.
1777      * In this case we balance only if it leads to better packing.
1778      */
1779     if (vn->vn_nr_item >= MIN_NR_KEY(Sh)) {
1780         /*
1781          * Here we join S[h] with one of its neighbors,
1782          * which is impossible with greater values of new_nr_item.
1783          */
1784         if (vn->vn_nr_item == MIN_NR_KEY(Sh)) {
1785             /* All contents of S[h] can be moved to L[h]. */
1786             if (tb->lnum[h] >= vn->vn_nr_item + 1) {
1787                 int n;
1788                 int order_L;
1789 
1790                 order_L =
1791                     ((n = PATH_H_B_ITEM_ORDER(tb->tb_path,
1792                                               h)) == 0)
1793                         ? B_NR_ITEMS(tb->FL[h])
1794                         : n - 1;
1795                 n = dc_size(B_N_CHILD(tb->FL[h], order_L)) /
1796                     (DC_SIZE + KEY_SIZE);
1797                 set_parameters(tb, h, -n - 1, 0, 0, NULL, -1,
1798                            -1);
1799                 return CARRY_ON;
1800             }
1801 
1802             /* All contents of S[h] can be moved to R[h]. */
1803             if (tb->rnum[h] >= vn->vn_nr_item + 1) {
1804                 int n;
1805                 int order_R;
1806 
1807                 order_R =
1808                     ((n = PATH_H_B_ITEM_ORDER(tb->tb_path,
1809                                               h)) ==
1810                      B_NR_ITEMS(Fh))
1811                         ? 0 : n + 1;
1812                 n = dc_size(B_N_CHILD(tb->FR[h], order_R)) /
1813                     (DC_SIZE + KEY_SIZE);
1814                 set_parameters(tb, h, 0, -n - 1, 0, NULL, -1,
1815                            -1);
1816                 return CARRY_ON;
1817             }
1818         }
1819 
1820         /*
1821          * All contents of S[h] can be moved to the neighbors
1822          * (L[h] & R[h]).
1823          */
1824         if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) {
1825             int to_r;
1826 
1827             to_r =
1828                 ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] -
1829                  tb->rnum[h] + vn->vn_nr_item + 1) / 2 -
1830                 (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]);
1831             set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r,
1832                        0, NULL, -1, -1);
1833             return CARRY_ON;
1834         }
1835 
1836         /* Balancing does not lead to better packing. */
1837         set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1838         return NO_BALANCING_NEEDED;
1839     }
1840 
1841     /*
1842      * Current node contains an insufficient number of items.
1843      * Balancing is required.
1844      */
1845     /* Check whether we can merge S[h] with left neighbor. */
1846     if (tb->lnum[h] >= vn->vn_nr_item + 1)
1847         if (is_left_neighbor_in_cache(tb, h)
1848             || tb->rnum[h] < vn->vn_nr_item + 1 || !tb->FR[h]) {
1849             int n;
1850             int order_L;
1851 
1852             order_L =
1853                 ((n = PATH_H_B_ITEM_ORDER(tb->tb_path,
1854                                           h)) == 0)
1855                     ? B_NR_ITEMS(tb->FL[h])
1856                     : n - 1;
1857             n = dc_size(B_N_CHILD(tb->FL[h], order_L)) / (DC_SIZE +
1858                                       KEY_SIZE);
1859             set_parameters(tb, h, -n - 1, 0, 0, NULL, -1, -1);
1860             return CARRY_ON;
1861         }
1862 
1863     /* Check whether we can merge S[h] with right neighbor. */
1864     if (tb->rnum[h] >= vn->vn_nr_item + 1) {
1865         int n;
1866         int order_R;
1867 
1868         order_R =
1869             ((n = PATH_H_B_ITEM_ORDER(tb->tb_path,
1870                                       h)) == B_NR_ITEMS(Fh))
1871                 ? 0 : (n + 1);
1872         n = dc_size(B_N_CHILD(tb->FR[h], order_R)) / (DC_SIZE +
1873                                   KEY_SIZE);
1874         set_parameters(tb, h, 0, -n - 1, 0, NULL, -1, -1);
1875         return CARRY_ON;
1876     }
1877 
1878     /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
1879     if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) {
1880         int to_r;
1881 
1882         to_r =
1883             ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] +
1884              vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 -
1885                         tb->rnum[h]);
1886         set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL,
1887                    -1, -1);
1888         return CARRY_ON;
1889     }
1890 
1891     /* For internal nodes try to borrow item from a neighbor */
1892     RFALSE(!tb->FL[h] && !tb->FR[h], "vs-8235: trying to borrow for root");
1893 
1894     /* Borrow one or two items from the cached neighbor */
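    /*
     * In both branches below the borrow amount is (roughly) half of
     * the combined pointer count of the donor and S[h], minus the
     * vn_nr_item + 1 pointers S[h] already holds, so donor and S[h]
     * end up nearly equal; the donor is assumed to hold about
     * MAX_NR_KEY(Sh) + 1 - lnum[h] (resp. rnum[h]) child pointers.
     */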
1895     if (is_left_neighbor_in_cache(tb, h) || !tb->FR[h]) {
1896         int from_l;
1897 
1898         from_l =
1899             (MAX_NR_KEY(Sh) + 1 - tb->lnum[h] + vn->vn_nr_item +
1900              1) / 2 - (vn->vn_nr_item + 1);
1901         set_parameters(tb, h, -from_l, 0, 1, NULL, -1, -1);
1902         return CARRY_ON;
1903     }
1904 
1905     set_parameters(tb, h, 0,
1906                -((MAX_NR_KEY(Sh) + 1 - tb->rnum[h] + vn->vn_nr_item +
1907               1) / 2 - (vn->vn_nr_item + 1)), 1, NULL, -1, -1);
1908     return CARRY_ON;
1909 }
1910 
1911 /*
1912  * Check whether current node S[h] is balanced when Decreasing its size by
1913  * Deleting or Truncating for LEAF node of S+tree.
1914  * Calculate parameters for balancing for current level h.
1915  * Parameters:
1916  *  tb  tree_balance structure;
1917  *  h   current level of the node;
1918  *  inum    item number in S[h];
1919  *  mode    d - delete, c - cut;
1920  * Returns: 1 - schedule occurred;
1921  *          0 - balancing for higher levels needed;
1922  *         -1 - no balancing for higher levels needed;
1923  *         -2 - no disk space.
1924  */
1925 static int dc_check_balance_leaf(struct tree_balance *tb, int h)
1926 {
1927     struct virtual_node *vn = tb->tb_vn;
1928 
1929     /*
1930      * Number of bytes that must be deleted from (the value is
1931      * negative if bytes are deleted) the buffer which contains the
1932      * node being balanced.  The mnemonic is that the attempted
1933      * change in node space used at this level is levbytes bytes.
1934      */
1935     int levbytes;
1936 
1937     /* the maximal item size */
1938     int maxsize, ret;
1939 
1940     /*
1941      * S0 is the node whose balance is currently being checked,
1942      * and F0 is its father.
1943      */
1944     struct buffer_head *S0, *F0;
1945     int lfree, rfree /* free space in L and R */ ;
1946 
1947     S0 = PATH_H_PBUFFER(tb->tb_path, 0);
1948     F0 = PATH_H_PPARENT(tb->tb_path, 0);
1949 
1950     levbytes = tb->insert_size[h];
1951 
1952     maxsize = MAX_CHILD_SIZE(S0);   /* maximal possible size of an item */
1953 
1954     if (!F0) {      /* S[0] is the root now. */
1955 
1956         RFALSE(-levbytes >= maxsize - B_FREE_SPACE(S0),
1957                "vs-8240: attempt to create empty buffer tree");
1958 
1959         set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1960         return NO_BALANCING_NEEDED;
1961     }
1962 
1963     if ((ret = get_parents(tb, h)) != CARRY_ON)
1964         return ret;
1965 
1966     /* get free space of neighbors */
1967     rfree = get_rfree(tb, h);
1968     lfree = get_lfree(tb, h);
1969 
1970     create_virtual_node(tb, h);
1971 
1972     /* if 3 leaves can be merged into one, set parameters and return */
1973     if (are_leaves_removable(tb, lfree, rfree))
1974         return CARRY_ON;
1975 
1976     /*
1977      * determine maximal number of items we can shift to the left/right
1978      * neighbor and the maximal number of bytes that can flow to the
1979      * left/right neighbor from the left/right most liquid item that
1980      * cannot be shifted from S[0] entirely
1981      */
1982     check_left(tb, h, lfree);
1983     check_right(tb, h, rfree);
1984 
1985     /* check whether we can merge S with left neighbor. */
1986     if (tb->lnum[0] >= vn->vn_nr_item && tb->lbytes == -1)
1987         if (is_left_neighbor_in_cache(tb, h) || ((tb->rnum[0] - ((tb->rbytes == -1) ? 0 : 1)) < vn->vn_nr_item) ||  /* S can not be merged with R */
1988             !tb->FR[h]) {
1989 
1990             RFALSE(!tb->FL[h],
1991                    "vs-8245: dc_check_balance_leaf: FL[h] must exist");
1992 
1993             /* set parameter to merge S[0] with its left neighbor */
1994             set_parameters(tb, h, -1, 0, 0, NULL, -1, -1);
1995             return CARRY_ON;
1996         }
1997 
1998     /* check whether we can merge S[0] with right neighbor. */
1999     if (tb->rnum[0] >= vn->vn_nr_item && tb->rbytes == -1) {
2000         set_parameters(tb, h, 0, -1, 0, NULL, -1, -1);
2001         return CARRY_ON;
2002     }
2003 
2004     /*
2005      * All contents of S[0] can be moved to the neighbors (L[0] & R[0]).
2006      * Set parameters and return
2007      */
2008     if (is_leaf_removable(tb))
2009         return CARRY_ON;
2010 
2011     /* Balancing is not required. */
2012     tb->s0num = vn->vn_nr_item;
2013     set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
2014     return NO_BALANCING_NEEDED;
2015 }
2016 
2017 /*
2018  * Check whether current node S[h] is balanced when Decreasing its size by
2019  * Deleting or Cutting.
2020  * Calculate parameters for balancing for current level h.
2021  * Parameters:
2022  *  tb  tree_balance structure;
2023  *  h   current level of the node;
2024  *  inum    item number in S[h];
2025  *  mode    d - delete, c - cut.
2026  * Returns: 1 - schedule occurred;
2027  *          0 - balancing for higher levels needed;
2028  *         -1 - no balancing for higher levels needed;
2029  *         -2 - no disk space.
2030  */
2031 static int dc_check_balance(struct tree_balance *tb, int h)
2032 {
2033     RFALSE(!(PATH_H_PBUFFER(tb->tb_path, h)),
2034            "vs-8250: S is not initialized");
2035 
2036     if (h)
2037         return dc_check_balance_internal(tb, h);
2038     else
2039         return dc_check_balance_leaf(tb, h);
2040 }
2041 
2042 /*
2043  * Check whether current node S[h] is balanced.
2044  * Calculate parameters for balancing for current level h.
2045  * Parameters:
2046  *
2047  *  tb  tree_balance structure:
2048  *
2049  *              tb is a large structure that must be read about in the header
2050  *      file at the same time as this procedure if the reader is
2051  *      to successfully understand this procedure
2052  *
2053  *  h   current level of the node;
2054  *  inum    item number in S[h];
2055  *  mode    i - insert, p - paste, d - delete, c - cut.
2056  * Returns: 1 - schedule occurred;
2057  *          0 - balancing for higher levels needed;
2058  *         -1 - no balancing for higher levels needed;
2059  *         -2 - no disk space.
2060  */
2061 static int check_balance(int mode,
2062              struct tree_balance *tb,
2063              int h,
2064              int inum,
2065              int pos_in_item,
2066              struct item_head *ins_ih, const void *data)
2067 {
2068     struct virtual_node *vn;
2069 
2070     vn = tb->tb_vn = (struct virtual_node *)(tb->vn_buf);
2071     vn->vn_free_ptr = (char *)(tb->tb_vn + 1);
2072     vn->vn_mode = mode;
2073     vn->vn_affected_item_num = inum;
2074     vn->vn_pos_in_item = pos_in_item;
2075     vn->vn_ins_ih = ins_ih;
2076     vn->vn_data = data;
2077 
2078     RFALSE(mode == M_INSERT && !vn->vn_ins_ih,
2079            "vs-8255: ins_ih can not be 0 in insert mode");
2080 
2081     /* Calculate balance parameters when size of node is increasing. */
2082     if (tb->insert_size[h] > 0)
2083         return ip_check_balance(tb, h);
2084 
2085     /* Calculate balance parameters when  size of node is decreasing. */
2086     return dc_check_balance(tb, h);
2087 }
2088 
2089 /* Check whether the parent in the path is really the parent of the current node. */
2090 static int get_direct_parent(struct tree_balance *tb, int h)
2091 {
2092     struct buffer_head *bh;
2093     struct treepath *path = tb->tb_path;
2094     int position,
2095         path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h);
2096 
2097     /* We are in the root or in the new root. */
2098     if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
2099 
2100         RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET - 1,
2101                "PAP-8260: invalid offset in the path");
2102 
2103         if (PATH_OFFSET_PBUFFER(path, FIRST_PATH_ELEMENT_OFFSET)->
2104             b_blocknr == SB_ROOT_BLOCK(tb->tb_sb)) {
2105             /* Root is not changed. */
2106             PATH_OFFSET_PBUFFER(path, path_offset - 1) = NULL;
2107             PATH_OFFSET_POSITION(path, path_offset - 1) = 0;
2108             return CARRY_ON;
2109         }
2110         /* Root is changed and we must recalculate the path. */
2111         return REPEAT_SEARCH;
2112     }
2113 
2114     /* Parent in the path is not in the tree. */
2115     if (!B_IS_IN_TREE
2116         (bh = PATH_OFFSET_PBUFFER(path, path_offset - 1)))
2117         return REPEAT_SEARCH;
2118 
2119     if ((position =
2120          PATH_OFFSET_POSITION(path,
2121                   path_offset - 1)) > B_NR_ITEMS(bh))
2122         return REPEAT_SEARCH;
2123 
2124     /* Parent in the path is not parent of the current node in the tree. */
2125     if (B_N_CHILD_NUM(bh, position) !=
2126         PATH_OFFSET_PBUFFER(path, path_offset)->b_blocknr)
2127         return REPEAT_SEARCH;
2128 
2129     if (buffer_locked(bh)) {
2130         int depth = reiserfs_write_unlock_nested(tb->tb_sb);
2131         __wait_on_buffer(bh);
2132         reiserfs_write_lock_nested(tb->tb_sb, depth);
2133         if (FILESYSTEM_CHANGED_TB(tb))
2134             return REPEAT_SEARCH;
2135     }
2136 
2137     /*
2138      * Parent in the path is unlocked and really parent
2139      * of the current node.
2140      */
2141     return CARRY_ON;
2142 }
2143 
2144 /*
2145  * Using lnum[h] and rnum[h] we should determine which neighbors
2146  * of S[h] we need in order to balance S[h], and get them if
2147  * necessary.
2148  * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked;
2149  *          CARRY_ON - schedule didn't occur while the function worked;
2150  */
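/*
 * The neighbor is read through its parent: when the node on the path
 * (bh) is FL[h]/FR[h] itself, the left neighbor is that parent's child
 * at position lkey[h] and the right neighbor its child at rkey[h] + 1;
 * otherwise the left neighbor is the last child of FL[h] and the right
 * neighbor the first child of FR[h].
 */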
2151 static int get_neighbors(struct tree_balance *tb, int h)
2152 {
2153     int child_position,
2154         path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h + 1);
2155     unsigned long son_number;
2156     struct super_block *sb = tb->tb_sb;
2157     struct buffer_head *bh;
2158     int depth;
2159 
2160     PROC_INFO_INC(sb, get_neighbors[h]);
2161 
2162     if (tb->lnum[h]) {
2163         /* We need left neighbor to balance S[h]. */
2164         PROC_INFO_INC(sb, need_l_neighbor[h]);
2165         bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset);
2166 
2167         RFALSE(bh == tb->FL[h] &&
2168                !PATH_OFFSET_POSITION(tb->tb_path, path_offset),
2169                "PAP-8270: invalid position in the parent");
2170 
2171         child_position =
2172             (bh == tb->FL[h])
2173                 ? tb->lkey[h]
2174                 : B_NR_ITEMS(tb->FL[h]);
2175         son_number = B_N_CHILD_NUM(tb->FL[h], child_position);
2176         depth = reiserfs_write_unlock_nested(tb->tb_sb);
2177         bh = sb_bread(sb, son_number);
2178         reiserfs_write_lock_nested(tb->tb_sb, depth);
2179         if (!bh)
2180             return IO_ERROR;
2181         if (FILESYSTEM_CHANGED_TB(tb)) {
2182             brelse(bh);
2183             PROC_INFO_INC(sb, get_neighbors_restart[h]);
2184             return REPEAT_SEARCH;
2185         }
2186 
2187         RFALSE(!B_IS_IN_TREE(tb->FL[h]) ||
2188                child_position > B_NR_ITEMS(tb->FL[h]) ||
2189                B_N_CHILD_NUM(tb->FL[h], child_position) !=
2190                bh->b_blocknr, "PAP-8275: invalid parent");
2191         RFALSE(!B_IS_IN_TREE(bh), "PAP-8280: invalid child");
2192         RFALSE(!h &&
2193                B_FREE_SPACE(bh) !=
2194                MAX_CHILD_SIZE(bh) -
2195                dc_size(B_N_CHILD(tb->FL[0], child_position)),
2196                "PAP-8290: invalid child size of left neighbor");
2197 
2198         brelse(tb->L[h]);
2199         tb->L[h] = bh;
2200     }
2201 
2202     /* We need right neighbor to balance S[path_offset]. */
2203     if (tb->rnum[h]) {
2204         PROC_INFO_INC(sb, need_r_neighbor[h]);
2205         bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset);
2206 
2207         RFALSE(bh == tb->FR[h] &&
2208                PATH_OFFSET_POSITION(tb->tb_path,
2209                         path_offset) >=
2210                B_NR_ITEMS(bh),
2211                "PAP-8295: invalid position in the parent");
2212 
2213         child_position =
2214             (bh == tb->FR[h]) ? tb->rkey[h] + 1 : 0;
2215         son_number = B_N_CHILD_NUM(tb->FR[h], child_position);
2216         depth = reiserfs_write_unlock_nested(tb->tb_sb);
2217         bh = sb_bread(sb, son_number);
2218         reiserfs_write_lock_nested(tb->tb_sb, depth);
2219         if (!bh)
2220             return IO_ERROR;
2221         if (FILESYSTEM_CHANGED_TB(tb)) {
2222             brelse(bh);
2223             PROC_INFO_INC(sb, get_neighbors_restart[h]);
2224             return REPEAT_SEARCH;
2225         }
2226         brelse(tb->R[h]);
2227         tb->R[h] = bh;
2228 
2229         RFALSE(!h
2230                && B_FREE_SPACE(bh) !=
2231                MAX_CHILD_SIZE(bh) -
2232                dc_size(B_N_CHILD(tb->FR[0], child_position)),
2233                "PAP-8300: invalid child size of right neighbor (%d != %d - %d)",
2234                B_FREE_SPACE(bh), MAX_CHILD_SIZE(bh),
2235                dc_size(B_N_CHILD(tb->FR[0], child_position)));
2236 
2237     }
2238     return CARRY_ON;
2239 }
2240 
2241 static int get_virtual_node_size(struct super_block *sb, struct buffer_head *bh)
2242 {
2243     int max_num_of_items;
2244     int max_num_of_entries;
2245     unsigned long blocksize = sb->s_blocksize;
2246 
2247 #define MIN_NAME_LEN 1
2248 
2249     max_num_of_items = (blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN);
2250     max_num_of_entries = (blocksize - BLKH_SIZE - IH_SIZE) /
2251         (DEH_SIZE + MIN_NAME_LEN);
2252 
2253     return sizeof(struct virtual_node) +
2254         max(max_num_of_items * sizeof(struct virtual_item),
2255         sizeof(struct virtual_item) + sizeof(struct direntry_uarea) +
2256         (max_num_of_entries - 1) * sizeof(__u16));
2257 }
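/*
 * In other words, the buffer sized above must cover the larger of two
 * worst cases: one struct virtual_item for every item a block of this
 * size could hold, or a single huge directory item whose virtual_item
 * carries a direntry_uarea with one __u16 entry size per possible
 * directory entry.
 */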
2258 
2259 /*
2260  * maybe we should fail the balancing we are going to perform when
2261  * kmalloc fails several times.  For now it will loop until kmalloc
2262  * gets the required memory
2263  */
2264 static int get_mem_for_virtual_node(struct tree_balance *tb)
2265 {
2266     int check_fs = 0;
2267     int size;
2268     char *buf;
2269 
2270     size = get_virtual_node_size(tb->tb_sb, PATH_PLAST_BUFFER(tb->tb_path));
2271 
2272     /* we have to allocate more memory for virtual node */
2273     if (size > tb->vn_buf_size) {
2274         if (tb->vn_buf) {
2275             /* free memory allocated before */
2276             kfree(tb->vn_buf);
2277             /* this is not needed if kfree is atomic */
2278             check_fs = 1;
2279         }
2280 
2281         /* virtual node now requires more memory */
2282         tb->vn_buf_size = size;
2283 
2284         /* get memory for virtual item */
2285         buf = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN);
2286         if (!buf) {
2287             /*
2288              * getting memory with GFP_KERNEL priority may involve
2289              * balancing now (due to indirect_to_direct conversion
2290              * on dcache shrinking). So, release path and collected
2291              * resources here
2292              */
2293             free_buffers_in_tb(tb);
2294             buf = kmalloc(size, GFP_NOFS);
2295             if (!buf) {
2296                 tb->vn_buf_size = 0;
2297             }
2298             tb->vn_buf = buf;
2299             schedule();
2300             return REPEAT_SEARCH;
2301         }
2302 
2303         tb->vn_buf = buf;
2304     }
2305 
2306     if (check_fs && FILESYSTEM_CHANGED_TB(tb))
2307         return REPEAT_SEARCH;
2308 
2309     return CARRY_ON;
2310 }
2311 
2312 #ifdef CONFIG_REISERFS_CHECK
2313 static void tb_buffer_sanity_check(struct super_block *sb,
2314                    struct buffer_head *bh,
2315                    const char *descr, int level)
2316 {
2317     if (bh) {
2318         if (atomic_read(&(bh->b_count)) <= 0)
2319 
2320             reiserfs_panic(sb, "jmacd-1", "negative or zero "
2321                        "reference counter for buffer %s[%d] "
2322                        "(%b)", descr, level, bh);
2323 
2324         if (!buffer_uptodate(bh))
2325             reiserfs_panic(sb, "jmacd-2", "buffer is not up "
2326                        "to date %s[%d] (%b)",
2327                        descr, level, bh);
2328 
2329         if (!B_IS_IN_TREE(bh))
2330             reiserfs_panic(sb, "jmacd-3", "buffer is not "
2331                        "in tree %s[%d] (%b)",
2332                        descr, level, bh);
2333 
2334         if (bh->b_bdev != sb->s_bdev)
2335             reiserfs_panic(sb, "jmacd-4", "buffer has wrong "
2336                        "device %s[%d] (%b)",
2337                        descr, level, bh);
2338 
2339         if (bh->b_size != sb->s_blocksize)
2340             reiserfs_panic(sb, "jmacd-5", "buffer has wrong "
2341                        "blocksize %s[%d] (%b)",
2342                        descr, level, bh);
2343 
2344         if (bh->b_blocknr > SB_BLOCK_COUNT(sb))
2345             reiserfs_panic(sb, "jmacd-6", "buffer block "
2346                        "number too high %s[%d] (%b)",
2347                        descr, level, bh);
2348     }
2349 }
2350 #else
2351 static void tb_buffer_sanity_check(struct super_block *sb,
2352                    struct buffer_head *bh,
2353                    const char *descr, int level)
2354 {;
2355 }
2356 #endif
2357 
2358 static int clear_all_dirty_bits(struct super_block *s, struct buffer_head *bh)
2359 {
2360     return reiserfs_prepare_for_journal(s, bh, 0);
2361 }
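/*
 * The zero 'wait' argument keeps the call above from blocking; judging
 * by how wait_tb_buffers_until_unlocked() uses the return value, zero
 * means the buffer is still locked and the caller must wait on it
 * before retrying.
 */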
2362 
2363 static int wait_tb_buffers_until_unlocked(struct tree_balance *tb)
2364 {
2365     struct buffer_head *locked;
2366 #ifdef CONFIG_REISERFS_CHECK
2367     int repeat_counter = 0;
2368 #endif
2369     int i;
2370 
2371     do {
2372 
2373         locked = NULL;
2374 
2375         for (i = tb->tb_path->path_length;
2376              !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i--) {
2377             if (PATH_OFFSET_PBUFFER(tb->tb_path, i)) {
2378                 /*
2379                  * if I understand correctly, we can only
2380                  * be sure the last buffer in the path is
2381                  * in the tree --clm
2382                  */
2383 #ifdef CONFIG_REISERFS_CHECK
2384                 if (PATH_PLAST_BUFFER(tb->tb_path) ==
2385                     PATH_OFFSET_PBUFFER(tb->tb_path, i))
2386                     tb_buffer_sanity_check(tb->tb_sb,
2387                                    PATH_OFFSET_PBUFFER(tb->tb_path,
2388                                                        i),
2389                                    "S",
2390                                    tb->tb_path->path_length - i);
2392 #endif
2393                 if (!clear_all_dirty_bits(tb->tb_sb,
2394                               PATH_OFFSET_PBUFFER(tb->tb_path,
2395                                                   i))) {
2397                     locked =
2398                         PATH_OFFSET_PBUFFER(tb->tb_path,
2399                                 i);
2400                 }
2401             }
2402         }
2403 
2404         for (i = 0; !locked && i < MAX_HEIGHT && tb->insert_size[i];
2405              i++) {
2406 
2407             if (tb->lnum[i]) {
2408 
2409                 if (tb->L[i]) {
2410                     tb_buffer_sanity_check(tb->tb_sb,
2411                                    tb->L[i],
2412                                    "L", i);
2413                     if (!clear_all_dirty_bits
2414                         (tb->tb_sb, tb->L[i]))
2415                         locked = tb->L[i];
2416                 }
2417 
2418                 if (!locked && tb->FL[i]) {
2419                     tb_buffer_sanity_check(tb->tb_sb,
2420                                    tb->FL[i],
2421                                    "FL", i);
2422                     if (!clear_all_dirty_bits
2423                         (tb->tb_sb, tb->FL[i]))
2424                         locked = tb->FL[i];
2425                 }
2426 
2427                 if (!locked && tb->CFL[i]) {
2428                     tb_buffer_sanity_check(tb->tb_sb,
2429                                    tb->CFL[i],
2430                                    "CFL", i);
2431                     if (!clear_all_dirty_bits
2432                         (tb->tb_sb, tb->CFL[i]))
2433                         locked = tb->CFL[i];
2434                 }
2435 
2436             }
2437 
2438             if (!locked && (tb->rnum[i])) {
2439 
2440                 if (tb->R[i]) {
2441                     tb_buffer_sanity_check(tb->tb_sb,
2442                                    tb->R[i],
2443                                    "R", i);
2444                     if (!clear_all_dirty_bits
2445                         (tb->tb_sb, tb->R[i]))
2446                         locked = tb->R[i];
2447                 }
2448 
2449                 if (!locked && tb->FR[i]) {
2450                     tb_buffer_sanity_check(tb->tb_sb,
2451                                    tb->FR[i],
2452                                    "FR", i);
2453                     if (!clear_all_dirty_bits
2454                         (tb->tb_sb, tb->FR[i]))
2455                         locked = tb->FR[i];
2456                 }
2457 
2458                 if (!locked && tb->CFR[i]) {
2459                     tb_buffer_sanity_check(tb->tb_sb,
2460                                    tb->CFR[i],
2461                                    "CFR", i);
2462                     if (!clear_all_dirty_bits
2463                         (tb->tb_sb, tb->CFR[i]))
2464                         locked = tb->CFR[i];
2465                 }
2466             }
2467         }
2468 
2469         /*
2470          * as far as I can tell, this is not required.  The FEB list
2471          * seems to be full of newly allocated nodes, which will
2472          * never be locked, dirty, or anything else.
2473          * To be safe, I'm putting the checks and waits in.
2474          * For the moment, they are needed to keep the code in
2475          * journal.c from complaining about the buffer.
2476          * That code is inside CONFIG_REISERFS_CHECK as well.  --clm
2477          */
2478         for (i = 0; !locked && i < MAX_FEB_SIZE; i++) {
2479             if (tb->FEB[i]) {
2480                 if (!clear_all_dirty_bits
2481                     (tb->tb_sb, tb->FEB[i]))
2482                     locked = tb->FEB[i];
2483             }
2484         }
2485 
2486         if (locked) {
2487             int depth;
2488 #ifdef CONFIG_REISERFS_CHECK
2489             repeat_counter++;
2490             if ((repeat_counter % 10000) == 0) {
2491                 reiserfs_warning(tb->tb_sb, "reiserfs-8200",
2492                          "too many iterations waiting "
2493                          "for buffer to unlock "
2494                          "(%b)", locked);
2495 
2496                 /* Don't loop forever.  Try to recover from possible error. */
2497 
2498                 return (FILESYSTEM_CHANGED_TB(tb)) ?
2499                     REPEAT_SEARCH : CARRY_ON;
2500             }
2501 #endif
2502             depth = reiserfs_write_unlock_nested(tb->tb_sb);
2503             __wait_on_buffer(locked);
2504             reiserfs_write_lock_nested(tb->tb_sb, depth);
2505             if (FILESYSTEM_CHANGED_TB(tb))
2506                 return REPEAT_SEARCH;
2507         }
2508 
2509     } while (locked);
2510 
2511     return CARRY_ON;
2512 }
2513 
2514 /*
2515  * Prepare for balancing, that is
2516  *  get all necessary parents, and neighbors;
2517  *  analyze what and where should be moved;
2518  *  get sufficient number of new nodes;
2519  * Balancing will start only after all resources have been collected at once.
2520  *
2521  * When ported to SMP kernels, only at the last moment after all needed nodes
2522  * are collected in cache, will the resources be locked using the usual
2523  * textbook ordered lock acquisition algorithms.  Note that ensuring that
2524  * this code neither write locks what it does not need to write lock nor locks
2525  * out of order will be a pain in the butt that could have been avoided.
2526  * Grumble grumble. -Hans
2527  *
2528  * fix is meant in the sense of render unchanging
2529  *
2530  * Latency might be improved by first gathering a list of what buffers
2531  * are needed and then getting as many of them in parallel as possible? -Hans
2532  *
2533  * Parameters:
2534  *  op_mode i - insert, d - delete, c - cut (truncate), p - paste (append)
2535  *  tb  tree_balance structure;
2536  *  inum    item number in S[h];
2537  *      pos_in_item - comment this if you can
2538  *      ins_ih  item head of item being inserted
2539  *  data    inserted item or data to be pasted
2540  * Returns: 1 - schedule occurred while the function worked;
2541  *          0 - schedule didn't occur while the function worked;
2542  *             -1 - if no_disk_space
2543  */
2544 
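/*
 * The work proceeds level by level: for each level h the loop below
 * finds the parent (get_direct_parent), decides what must be shifted
 * or split (check_balance), reads in the needed neighbors
 * (get_neighbors) and reserves new nodes (get_empty_nodes), then
 * records in insert_size[h + 1] how much room the parent level must
 * in turn provide, until some level needs no balancing.
 */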
2545 int fix_nodes(int op_mode, struct tree_balance *tb,
2546           struct item_head *ins_ih, const void *data)
2547 {
2548     int ret, h, item_num = PATH_LAST_POSITION(tb->tb_path);
2549     int pos_in_item;
2550 
2551     /*
2552      * we set wait_tb_buffers_run when we have to restore any dirty
2553      * bits cleared during wait_tb_buffers_until_unlocked()
2554      */
2555     int wait_tb_buffers_run = 0;
2556     struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
2557 
2558     ++REISERFS_SB(tb->tb_sb)->s_fix_nodes;
2559 
2560     pos_in_item = tb->tb_path->pos_in_item;
2561 
2562     tb->fs_gen = get_generation(tb->tb_sb);
2563 
2564     /*
2565      * we prepare and log the super here so it will already be in the
2566      * transaction when do_balance needs to change it.
2567      * This way do_balance won't have to schedule when trying to prepare
2568      * the super for logging
2569      */
2570     reiserfs_prepare_for_journal(tb->tb_sb,
2571                      SB_BUFFER_WITH_SB(tb->tb_sb), 1);
2572     journal_mark_dirty(tb->transaction_handle,
2573                SB_BUFFER_WITH_SB(tb->tb_sb));
2574     if (FILESYSTEM_CHANGED_TB(tb))
2575         return REPEAT_SEARCH;
2576 
2577     /* this is possible in indirect_to_direct conversion */
2578     if (buffer_locked(tbS0)) {
2579         int depth = reiserfs_write_unlock_nested(tb->tb_sb);
2580         __wait_on_buffer(tbS0);
2581         reiserfs_write_lock_nested(tb->tb_sb, depth);
2582         if (FILESYSTEM_CHANGED_TB(tb))
2583             return REPEAT_SEARCH;
2584     }
2585 #ifdef CONFIG_REISERFS_CHECK
2586     if (REISERFS_SB(tb->tb_sb)->cur_tb) {
2587         print_cur_tb("fix_nodes");
2588         reiserfs_panic(tb->tb_sb, "PAP-8305",
2589                    "there is pending do_balance");
2590     }
2591 
2592     if (!buffer_uptodate(tbS0) || !B_IS_IN_TREE(tbS0))
2593         reiserfs_panic(tb->tb_sb, "PAP-8320", "S[0] (%b %z) is "
2594                    "not uptodate at the beginning of fix_nodes "
2595                    "or not in tree (mode %c)",
2596                    tbS0, tbS0, op_mode);
2597 
2598     /* Check parameters. */
2599     switch (op_mode) {
2600     case M_INSERT:
2601         if (item_num <= 0 || item_num > B_NR_ITEMS(tbS0))
2602             reiserfs_panic(tb->tb_sb, "PAP-8330", "Incorrect "
2603                        "item number %d (in S0 - %d) in case "
2604                        "of insert", item_num,
2605                        B_NR_ITEMS(tbS0));
2606         break;
2607     case M_PASTE:
2608     case M_DELETE:
2609     case M_CUT:
2610         if (item_num < 0 || item_num >= B_NR_ITEMS(tbS0)) {
2611             print_block(tbS0, 0, -1, -1);
2612             reiserfs_panic(tb->tb_sb, "PAP-8335", "Incorrect "
2613                        "item number(%d); mode = %c "
2614                        "insert_size = %d",
2615                        item_num, op_mode,
2616                        tb->insert_size[0]);
2617         }
2618         break;
2619     default:
2620         reiserfs_panic(tb->tb_sb, "PAP-8340", "Incorrect mode "
2621                    "of operation");
2622     }
2623 #endif
2624 
2625     if (get_mem_for_virtual_node(tb) == REPEAT_SEARCH)
2626         /* FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat */
2627         return REPEAT_SEARCH;
2628 
2629     /* Starting from the leaf level; for all levels h of the tree. */
2630     for (h = 0; h < MAX_HEIGHT && tb->insert_size[h]; h++) {
2631         ret = get_direct_parent(tb, h);
2632         if (ret != CARRY_ON)
2633             goto repeat;
2634 
2635         ret = check_balance(op_mode, tb, h, item_num,
2636                     pos_in_item, ins_ih, data);
2637         if (ret != CARRY_ON) {
2638             if (ret == NO_BALANCING_NEEDED) {
2639                 /* No balancing for higher levels needed. */
2640                 ret = get_neighbors(tb, h);
2641                 if (ret != CARRY_ON)
2642                     goto repeat;
2643                 if (h != MAX_HEIGHT - 1)
2644                     tb->insert_size[h + 1] = 0;
2645                 /*
2646                  * ok, analysis and resource gathering
2647                  * are complete
2648                  */
2649                 break;
2650             }
2651             goto repeat;
2652         }
2653 
2654         ret = get_neighbors(tb, h);
2655         if (ret != CARRY_ON)
2656             goto repeat;
2657 
2658         /*
2659          * No disk space, or schedule occurred and analysis may be
2660          * invalid and needs to be redone.
2661          */
2662         ret = get_empty_nodes(tb, h);
2663         if (ret != CARRY_ON)
2664             goto repeat;
2665 
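        /*
         * In the branches below insert_size[h + 1] is the room the
         * parent level must provide for what happened here: every
         * extra node produced at this level (blknum[h] - 1 of them)
         * costs one key plus one child pointer (KEY_SIZE + DC_SIZE)
         * in the parent, and splitting the root costs one more
         * DC_SIZE for the pointer the brand new root needs to its
         * leftmost child.
         */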
2666         /*
2667          * We have a positive insert size but no nodes exist on this
2668          * level; this means that we are creating a new root.
2669          */
2670         if (!PATH_H_PBUFFER(tb->tb_path, h)) {
2671 
2672             RFALSE(tb->blknum[h] != 1,
2673                    "PAP-8350: creating new empty root");
2674 
2675             if (h < MAX_HEIGHT - 1)
2676                 tb->insert_size[h + 1] = 0;
2677         } else if (!PATH_H_PBUFFER(tb->tb_path, h + 1)) {
2678             /*
2679              * The tree needs to be grown, so this node S[h]
2680              * which is the root node is split into two nodes,
2681              * and a new node (S[h+1]) will be created to
2682              * become the root node.
2683              */
2684             if (tb->blknum[h] > 1) {
2685 
2686                 RFALSE(h == MAX_HEIGHT - 1,
2687                        "PAP-8355: attempt to create too high of a tree");
2688 
2689                 tb->insert_size[h + 1] =
2690                     (DC_SIZE +
2691                      KEY_SIZE) * (tb->blknum[h] - 1) +
2692                     DC_SIZE;
2693             } else if (h < MAX_HEIGHT - 1)
2694                 tb->insert_size[h + 1] = 0;
2695         } else
2696             tb->insert_size[h + 1] =
2697                 (DC_SIZE + KEY_SIZE) * (tb->blknum[h] - 1);
2698     }
2699 
2700     ret = wait_tb_buffers_until_unlocked(tb);
2701     if (ret == CARRY_ON) {
2702         if (FILESYSTEM_CHANGED_TB(tb)) {
2703             wait_tb_buffers_run = 1;
2704             ret = REPEAT_SEARCH;
2705             goto repeat;
2706         } else {
2707             return CARRY_ON;
2708         }
2709     } else {
2710         wait_tb_buffers_run = 1;
2711         goto repeat;
2712     }
2713 
2714 repeat:
2715     /*
2716      * fix_nodes was unable to perform its calculation because the
2717      * filesystem got changed under us, we ran out of free disk space,
2718      * or an i/o failure occurred.  If the first is the case, the
2719      * search will be repeated.  For now, free all resources acquired
2720      * so far except for the newly allocated nodes
2721      */
2722     {
2723         int i;
2724 
2725         /* Release path buffers. */
2726         if (wait_tb_buffers_run) {
2727             pathrelse_and_restore(tb->tb_sb, tb->tb_path);
2728         } else {
2729             pathrelse(tb->tb_path);
2730         }
2731         /* brelse all resources collected for balancing */
2732         for (i = 0; i < MAX_HEIGHT; i++) {
2733             if (wait_tb_buffers_run) {
2734                 reiserfs_restore_prepared_buffer(tb->tb_sb,
2735                                  tb->L[i]);
2736                 reiserfs_restore_prepared_buffer(tb->tb_sb,
2737                                  tb->R[i]);
2738                 reiserfs_restore_prepared_buffer(tb->tb_sb,
2739                                  tb->FL[i]);
2740                 reiserfs_restore_prepared_buffer(tb->tb_sb,
2741                                  tb->FR[i]);
2742                 reiserfs_restore_prepared_buffer(tb->tb_sb,
2743                                  tb->
2744                                  CFL[i]);
2745                 reiserfs_restore_prepared_buffer(tb->tb_sb,
2746                                  tb->
2747                                  CFR[i]);
2748             }
2749 
2750             brelse(tb->L[i]);
2751             brelse(tb->R[i]);
2752             brelse(tb->FL[i]);
2753             brelse(tb->FR[i]);
2754             brelse(tb->CFL[i]);
2755             brelse(tb->CFR[i]);
2756 
2757             tb->L[i] = NULL;
2758             tb->R[i] = NULL;
2759             tb->FL[i] = NULL;
2760             tb->FR[i] = NULL;
2761             tb->CFL[i] = NULL;
2762             tb->CFR[i] = NULL;
2763         }
2764 
2765         if (wait_tb_buffers_run) {
2766             for (i = 0; i < MAX_FEB_SIZE; i++) {
2767                 if (tb->FEB[i])
2768                     reiserfs_restore_prepared_buffer
2769                         (tb->tb_sb, tb->FEB[i]);
2770             }
2771         }
2772         return ret;
2773     }
2774 
2775 }
2776 
2777 void unfix_nodes(struct tree_balance *tb)
2778 {
2779     int i;
2780 
2781     /* Release path buffers. */
2782     pathrelse_and_restore(tb->tb_sb, tb->tb_path);
2783 
2784     /* brelse all resources collected for balancing */
2785     for (i = 0; i < MAX_HEIGHT; i++) {
2786         reiserfs_restore_prepared_buffer(tb->tb_sb, tb->L[i]);
2787         reiserfs_restore_prepared_buffer(tb->tb_sb, tb->R[i]);
2788         reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FL[i]);
2789         reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FR[i]);
2790         reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFL[i]);
2791         reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFR[i]);
2792 
2793         brelse(tb->L[i]);
2794         brelse(tb->R[i]);
2795         brelse(tb->FL[i]);
2796         brelse(tb->FR[i]);
2797         brelse(tb->CFL[i]);
2798         brelse(tb->CFR[i]);
2799     }
2800 
2801     /* deal with list of allocated (used and unused) nodes */
2802     for (i = 0; i < MAX_FEB_SIZE; i++) {
2803         if (tb->FEB[i]) {
2804             b_blocknr_t blocknr = tb->FEB[i]->b_blocknr;
2805             /*
2806              * de-allocate the block, which was not used by
2807              * balancing, and forget about the buffer for it
2808              */
2809             brelse(tb->FEB[i]);
2810             reiserfs_free_block(tb->transaction_handle, NULL,
2811                         blocknr, 0);
2812         }
2813         if (tb->used[i]) {
2814             /* release buffers used as new nodes, including a new root */
2815             brelse(tb->used[i]);
2816         }
2817     }
2818 
2819     kfree(tb->vn_buf);
2820 
2821 }