Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0-or-later
0002 /*
0003  * dir.c
0004  *
0005  * Creates, reads, walks and deletes directory-nodes
0006  *
0007  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
0008  *
0009  *  Portions of this code from linux/fs/ext3/dir.c
0010  *
0011  *  Copyright (C) 1992, 1993, 1994, 1995
0012  *  Remy Card (card@masi.ibp.fr)
0013  *  Laboratoire MASI - Institut Blaise pascal
0014  *  Universite Pierre et Marie Curie (Paris VI)
0015  *
0016  *   from
0017  *
0018  *   linux/fs/minix/dir.c
0019  *
0020  *   Copyright (C) 1991, 1992 Linus Torvalds
0021  */
0022 
0023 #include <linux/fs.h>
0024 #include <linux/types.h>
0025 #include <linux/slab.h>
0026 #include <linux/highmem.h>
0027 #include <linux/quotaops.h>
0028 #include <linux/sort.h>
0029 #include <linux/iversion.h>
0030 
0031 #include <cluster/masklog.h>
0032 
0033 #include "ocfs2.h"
0034 
0035 #include "alloc.h"
0036 #include "blockcheck.h"
0037 #include "dir.h"
0038 #include "dlmglue.h"
0039 #include "extent_map.h"
0040 #include "file.h"
0041 #include "inode.h"
0042 #include "journal.h"
0043 #include "namei.h"
0044 #include "suballoc.h"
0045 #include "super.h"
0046 #include "sysfile.h"
0047 #include "uptodate.h"
0048 #include "ocfs2_trace.h"
0049 
0050 #include "buffer_head_io.h"
0051 
/*
 * Read-ahead sizing for the block-by-block directory search.
 * NAMEI_RA_SIZE is the number of buffer heads held in the bh_use[]
 * read-ahead array of ocfs2_find_entry_el().
 */
#define NAMEI_RA_CHUNKS  2
#define NAMEI_RA_BLOCKS  4
#define NAMEI_RA_SIZE        (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
0055 
/*
 * Forward declarations for functions that are referenced before their
 * definitions. ocfs2_dir_indexed() is defined further down in this file.
 * NOTE(review): ocfs2_do_extend_dir() is presumably defined later in this
 * file as well - confirm.
 */
static int ocfs2_do_extend_dir(struct super_block *sb,
                   handle_t *handle,
                   struct inode *dir,
                   struct buffer_head *parent_fe_bh,
                   struct ocfs2_alloc_context *data_ac,
                   struct ocfs2_alloc_context *meta_ac,
                   struct buffer_head **new_bh);
static int ocfs2_dir_indexed(struct inode *inode);
0064 
0065 /*
0066  * These are distinct checks because future versions of the file system will
0067  * want to have a trailing dirent structure independent of indexing.
0068  */
0069 static int ocfs2_supports_dir_trailer(struct inode *dir)
0070 {
0071     struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
0072 
0073     if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
0074         return 0;
0075 
0076     return ocfs2_meta_ecc(osb) || ocfs2_dir_indexed(dir);
0077 }
0078 
0079 /*
0080  * "new' here refers to the point at which we're creating a new
0081  * directory via "mkdir()", but also when we're expanding an inline
0082  * directory. In either case, we don't yet have the indexing bit set
0083  * on the directory, so the standard checks will fail in when metaecc
0084  * is turned off. Only directory-initialization type functions should
0085  * use this then. Everything else wants ocfs2_supports_dir_trailer()
0086  */
0087 static int ocfs2_new_dir_wants_trailer(struct inode *dir)
0088 {
0089     struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
0090 
0091     return ocfs2_meta_ecc(osb) ||
0092         ocfs2_supports_indexed_dirs(osb);
0093 }
0094 
0095 static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb)
0096 {
0097     return sb->s_blocksize - sizeof(struct ocfs2_dir_block_trailer);
0098 }
0099 
/*
 * Return a pointer to the directory block trailer inside the data of
 * buffer head _bh, located at ocfs2_dir_trailer_blk_off(_sb) bytes into
 * _bh->b_data.
 */
#define ocfs2_trailer_from_bh(_bh, _sb) ((struct ocfs2_dir_block_trailer *) ((_bh)->b_data + ocfs2_dir_trailer_blk_off((_sb))))
0101 
0102 /* XXX ocfs2_block_dqtrailer() is similar but not quite - can we make
0103  * them more consistent? */
0104 struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
0105                                 void *data)
0106 {
0107     char *p = data;
0108 
0109     p += blocksize - sizeof(struct ocfs2_dir_block_trailer);
0110     return (struct ocfs2_dir_block_trailer *)p;
0111 }
0112 
0113 /*
0114  * XXX: This is executed once on every dirent. We should consider optimizing
0115  * it.
0116  */
0117 static int ocfs2_skip_dir_trailer(struct inode *dir,
0118                   struct ocfs2_dir_entry *de,
0119                   unsigned long offset,
0120                   unsigned long blklen)
0121 {
0122     unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer);
0123 
0124     if (!ocfs2_supports_dir_trailer(dir))
0125         return 0;
0126 
0127     if (offset != toff)
0128         return 0;
0129 
0130     return 1;
0131 }
0132 
0133 static void ocfs2_init_dir_trailer(struct inode *inode,
0134                    struct buffer_head *bh, u16 rec_len)
0135 {
0136     struct ocfs2_dir_block_trailer *trailer;
0137 
0138     trailer = ocfs2_trailer_from_bh(bh, inode->i_sb);
0139     strcpy(trailer->db_signature, OCFS2_DIR_TRAILER_SIGNATURE);
0140     trailer->db_compat_rec_len =
0141             cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer));
0142     trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
0143     trailer->db_blkno = cpu_to_le64(bh->b_blocknr);
0144     trailer->db_free_rec_len = cpu_to_le16(rec_len);
0145 }
0146 /*
0147  * Link an unindexed block with a dir trailer structure into the index free
0148  * list. This function will modify dirdata_bh, but assumes you've already
0149  * passed it to the journal.
0150  */
0151 static int ocfs2_dx_dir_link_trailer(struct inode *dir, handle_t *handle,
0152                      struct buffer_head *dx_root_bh,
0153                      struct buffer_head *dirdata_bh)
0154 {
0155     int ret;
0156     struct ocfs2_dx_root_block *dx_root;
0157     struct ocfs2_dir_block_trailer *trailer;
0158 
0159     ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
0160                       OCFS2_JOURNAL_ACCESS_WRITE);
0161     if (ret) {
0162         mlog_errno(ret);
0163         goto out;
0164     }
0165     trailer = ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
0166     dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
0167 
0168     trailer->db_free_next = dx_root->dr_free_blk;
0169     dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr);
0170 
0171     ocfs2_journal_dirty(handle, dx_root_bh);
0172 
0173 out:
0174     return ret;
0175 }
0176 
0177 static int ocfs2_free_list_at_root(struct ocfs2_dir_lookup_result *res)
0178 {
0179     return res->dl_prev_leaf_bh == NULL;
0180 }
0181 
0182 void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res)
0183 {
0184     brelse(res->dl_dx_root_bh);
0185     brelse(res->dl_leaf_bh);
0186     brelse(res->dl_dx_leaf_bh);
0187     brelse(res->dl_prev_leaf_bh);
0188 }
0189 
0190 static int ocfs2_dir_indexed(struct inode *inode)
0191 {
0192     if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INDEXED_DIR_FL)
0193         return 1;
0194     return 0;
0195 }
0196 
0197 static inline int ocfs2_dx_root_inline(struct ocfs2_dx_root_block *dx_root)
0198 {
0199     return dx_root->dr_flags & OCFS2_DX_FLAG_INLINE;
0200 }
0201 
/*
 * Hashing code adapted from ext3
 */
#define DELTA 0x9E3779B9

/*
 * Mix four 32-bit input words into the running hash state.
 *
 * @buf: hash state; only buf[0] and buf[1] are read and updated
 * @in:  four 32-bit words of input (in[0]..in[3])
 *
 * Runs 16 rounds over a copy of buf[0]/buf[1] and then adds the result
 * back into buf[0] and buf[1].
 */
static void TEA_transform(__u32 buf[4], __u32 const in[])
{
    const __u32 k0 = in[0], k1 = in[1], k2 = in[2], k3 = in[3];
    __u32 x = buf[0];
    __u32 y = buf[1];
    __u32 sum = 0;
    int round;

    for (round = 0; round < 16; round++) {
        sum += DELTA;
        x += ((y << 4) + k0) ^ (y + sum) ^ ((y >> 5) + k1);
        y += ((x << 4) + k2) ^ (x + sum) ^ ((x >> 5) + k3);
    }

    buf[0] += x;
    buf[1] += y;
}
0223 
/*
 * Pack up to num * 4 bytes of @msg into @num 32-bit words for hashing.
 *
 * @msg: input bytes
 * @len: number of bytes available in @msg (clamped to num * 4)
 * @buf: output array of @num words
 * @num: number of words to produce
 *
 * Bytes are shifted into each word most-significant first. A padding
 * word derived from the original (unclamped) @len seeds every word, so a
 * partially filled word keeps padding in its upper bytes, and any words
 * left over after the input is consumed are set to the padding value.
 */
static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
{
    __u32 fill, word;
    int pos;

    /* Padding is built from len before it is clamped below. */
    fill = (__u32)len | ((__u32)len << 8);
    fill |= fill << 16;

    if (len > num * 4)
        len = num * 4;

    word = fill;
    for (pos = 0; pos < len; pos++) {
        if ((pos & 3) == 0)
            word = fill;

        word = msg[pos] + (word << 8);

        if ((pos & 3) == 3) {
            /* Four bytes gathered: emit the word. */
            *buf++ = word;
            word = fill;
            num--;
        }
    }

    /* Emit the partially filled (or untouched) word, if room remains. */
    num--;
    if (num >= 0)
        *buf++ = word;

    /* Every remaining output word is pure padding. */
    for (num--; num >= 0; num--)
        *buf++ = fill;
}
0250 
0251 static void ocfs2_dx_dir_name_hash(struct inode *dir, const char *name, int len,
0252                    struct ocfs2_dx_hinfo *hinfo)
0253 {
0254     struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
0255     const char  *p;
0256     __u32       in[8], buf[4];
0257 
0258     /*
0259      * XXX: Is this really necessary, if the index is never looked
0260      * at by readdir? Is a hash value of '0' a bad idea?
0261      */
0262     if ((len == 1 && !strncmp(".", name, 1)) ||
0263         (len == 2 && !strncmp("..", name, 2))) {
0264         buf[0] = buf[1] = 0;
0265         goto out;
0266     }
0267 
0268 #ifdef OCFS2_DEBUG_DX_DIRS
0269     /*
0270      * This makes it very easy to debug indexing problems. We
0271      * should never allow this to be selected without hand editing
0272      * this file though.
0273      */
0274     buf[0] = buf[1] = len;
0275     goto out;
0276 #endif
0277 
0278     memcpy(buf, osb->osb_dx_seed, sizeof(buf));
0279 
0280     p = name;
0281     while (len > 0) {
0282         str2hashbuf(p, len, in, 4);
0283         TEA_transform(buf, in);
0284         len -= 16;
0285         p += 16;
0286     }
0287 
0288 out:
0289     hinfo->major_hash = buf[0];
0290     hinfo->minor_hash = buf[1];
0291 }
0292 
0293 /*
0294  * bh passed here can be an inode block or a dir data block, depending
0295  * on the inode inline data flag.
0296  */
0297 static int ocfs2_check_dir_entry(struct inode * dir,
0298                  struct ocfs2_dir_entry * de,
0299                  struct buffer_head * bh,
0300                  unsigned long offset)
0301 {
0302     const char *error_msg = NULL;
0303     const int rlen = le16_to_cpu(de->rec_len);
0304 
0305     if (unlikely(rlen < OCFS2_DIR_REC_LEN(1)))
0306         error_msg = "rec_len is smaller than minimal";
0307     else if (unlikely(rlen % 4 != 0))
0308         error_msg = "rec_len % 4 != 0";
0309     else if (unlikely(rlen < OCFS2_DIR_REC_LEN(de->name_len)))
0310         error_msg = "rec_len is too small for name_len";
0311     else if (unlikely(
0312          ((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize))
0313         error_msg = "directory entry across blocks";
0314 
0315     if (unlikely(error_msg != NULL))
0316         mlog(ML_ERROR, "bad entry in directory #%llu: %s - "
0317              "offset=%lu, inode=%llu, rec_len=%d, name_len=%d\n",
0318              (unsigned long long)OCFS2_I(dir)->ip_blkno, error_msg,
0319              offset, (unsigned long long)le64_to_cpu(de->inode), rlen,
0320              de->name_len);
0321 
0322     return error_msg == NULL ? 1 : 0;
0323 }
0324 
0325 static inline int ocfs2_match(int len,
0326                   const char * const name,
0327                   struct ocfs2_dir_entry *de)
0328 {
0329     if (len != de->name_len)
0330         return 0;
0331     if (!de->inode)
0332         return 0;
0333     return !memcmp(name, de->name, len);
0334 }
0335 
0336 /*
0337  * Returns 0 if not found, -1 on failure, and 1 on success
0338  */
0339 static inline int ocfs2_search_dirblock(struct buffer_head *bh,
0340                     struct inode *dir,
0341                     const char *name, int namelen,
0342                     unsigned long offset,
0343                     char *first_de,
0344                     unsigned int bytes,
0345                     struct ocfs2_dir_entry **res_dir)
0346 {
0347     struct ocfs2_dir_entry *de;
0348     char *dlimit, *de_buf;
0349     int de_len;
0350     int ret = 0;
0351 
0352     de_buf = first_de;
0353     dlimit = de_buf + bytes;
0354 
0355     while (de_buf < dlimit) {
0356         /* this code is executed quadratically often */
0357         /* do minimal checking `by hand' */
0358 
0359         de = (struct ocfs2_dir_entry *) de_buf;
0360 
0361         if (de_buf + namelen <= dlimit &&
0362             ocfs2_match(namelen, name, de)) {
0363             /* found a match - just to be sure, do a full check */
0364             if (!ocfs2_check_dir_entry(dir, de, bh, offset)) {
0365                 ret = -1;
0366                 goto bail;
0367             }
0368             *res_dir = de;
0369             ret = 1;
0370             goto bail;
0371         }
0372 
0373         /* prevent looping on a bad block */
0374         de_len = le16_to_cpu(de->rec_len);
0375         if (de_len <= 0) {
0376             ret = -1;
0377             goto bail;
0378         }
0379 
0380         de_buf += de_len;
0381         offset += de_len;
0382     }
0383 
0384 bail:
0385     trace_ocfs2_search_dirblock(ret);
0386     return ret;
0387 }
0388 
0389 static struct buffer_head *ocfs2_find_entry_id(const char *name,
0390                            int namelen,
0391                            struct inode *dir,
0392                            struct ocfs2_dir_entry **res_dir)
0393 {
0394     int ret, found;
0395     struct buffer_head *di_bh = NULL;
0396     struct ocfs2_dinode *di;
0397     struct ocfs2_inline_data *data;
0398 
0399     ret = ocfs2_read_inode_block(dir, &di_bh);
0400     if (ret) {
0401         mlog_errno(ret);
0402         goto out;
0403     }
0404 
0405     di = (struct ocfs2_dinode *)di_bh->b_data;
0406     data = &di->id2.i_data;
0407 
0408     found = ocfs2_search_dirblock(di_bh, dir, name, namelen, 0,
0409                       data->id_data, i_size_read(dir), res_dir);
0410     if (found == 1)
0411         return di_bh;
0412 
0413     brelse(di_bh);
0414 out:
0415     return NULL;
0416 }
0417 
0418 static int ocfs2_validate_dir_block(struct super_block *sb,
0419                     struct buffer_head *bh)
0420 {
0421     int rc;
0422     struct ocfs2_dir_block_trailer *trailer =
0423         ocfs2_trailer_from_bh(bh, sb);
0424 
0425 
0426     /*
0427      * We don't validate dirents here, that's handled
0428      * in-place when the code walks them.
0429      */
0430     trace_ocfs2_validate_dir_block((unsigned long long)bh->b_blocknr);
0431 
0432     BUG_ON(!buffer_uptodate(bh));
0433 
0434     /*
0435      * If the ecc fails, we return the error but otherwise
0436      * leave the filesystem running.  We know any error is
0437      * local to this block.
0438      *
0439      * Note that we are safe to call this even if the directory
0440      * doesn't have a trailer.  Filesystems without metaecc will do
0441      * nothing, and filesystems with it will have one.
0442      */
0443     rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &trailer->db_check);
0444     if (rc)
0445         mlog(ML_ERROR, "Checksum failed for dinode %llu\n",
0446              (unsigned long long)bh->b_blocknr);
0447 
0448     return rc;
0449 }
0450 
0451 /*
0452  * Validate a directory trailer.
0453  *
0454  * We check the trailer here rather than in ocfs2_validate_dir_block()
0455  * because that function doesn't have the inode to test.
0456  */
0457 static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh)
0458 {
0459     int rc = 0;
0460     struct ocfs2_dir_block_trailer *trailer;
0461 
0462     trailer = ocfs2_trailer_from_bh(bh, dir->i_sb);
0463     if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
0464         rc = ocfs2_error(dir->i_sb,
0465                  "Invalid dirblock #%llu: signature = %.*s\n",
0466                  (unsigned long long)bh->b_blocknr, 7,
0467                  trailer->db_signature);
0468         goto out;
0469     }
0470     if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) {
0471         rc = ocfs2_error(dir->i_sb,
0472                  "Directory block #%llu has an invalid db_blkno of %llu\n",
0473                  (unsigned long long)bh->b_blocknr,
0474                  (unsigned long long)le64_to_cpu(trailer->db_blkno));
0475         goto out;
0476     }
0477     if (le64_to_cpu(trailer->db_parent_dinode) !=
0478         OCFS2_I(dir)->ip_blkno) {
0479         rc = ocfs2_error(dir->i_sb,
0480                  "Directory block #%llu on dinode #%llu has an invalid parent_dinode of %llu\n",
0481                  (unsigned long long)bh->b_blocknr,
0482                  (unsigned long long)OCFS2_I(dir)->ip_blkno,
0483                  (unsigned long long)le64_to_cpu(trailer->db_blkno));
0484         goto out;
0485     }
0486 out:
0487     return rc;
0488 }
0489 
0490 /*
0491  * This function forces all errors to -EIO for consistency with its
0492  * predecessor, ocfs2_bread().  We haven't audited what returning the
0493  * real error codes would do to callers.  We log the real codes with
0494  * mlog_errno() before we squash them.
0495  */
0496 static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
0497                 struct buffer_head **bh, int flags)
0498 {
0499     int rc = 0;
0500     struct buffer_head *tmp = *bh;
0501 
0502     rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags,
0503                     ocfs2_validate_dir_block);
0504     if (rc) {
0505         mlog_errno(rc);
0506         goto out;
0507     }
0508 
0509     if (!(flags & OCFS2_BH_READAHEAD) &&
0510         ocfs2_supports_dir_trailer(inode)) {
0511         rc = ocfs2_check_dir_trailer(inode, tmp);
0512         if (rc) {
0513             if (!*bh)
0514                 brelse(tmp);
0515             mlog_errno(rc);
0516             goto out;
0517         }
0518     }
0519 
0520     /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
0521     if (!*bh)
0522         *bh = tmp;
0523 
0524 out:
0525     return rc ? -EIO : 0;
0526 }
0527 
0528 /*
0529  * Read the block at 'phys' which belongs to this directory
0530  * inode. This function does no virtual->physical block translation -
0531  * what's passed in is assumed to be a valid directory block.
0532  */
0533 static int ocfs2_read_dir_block_direct(struct inode *dir, u64 phys,
0534                        struct buffer_head **bh)
0535 {
0536     int ret;
0537     struct buffer_head *tmp = *bh;
0538 
0539     ret = ocfs2_read_block(INODE_CACHE(dir), phys, &tmp,
0540                    ocfs2_validate_dir_block);
0541     if (ret) {
0542         mlog_errno(ret);
0543         goto out;
0544     }
0545 
0546     if (ocfs2_supports_dir_trailer(dir)) {
0547         ret = ocfs2_check_dir_trailer(dir, tmp);
0548         if (ret) {
0549             if (!*bh)
0550                 brelse(tmp);
0551             mlog_errno(ret);
0552             goto out;
0553         }
0554     }
0555 
0556     if (!ret && !*bh)
0557         *bh = tmp;
0558 out:
0559     return ret;
0560 }
0561 
0562 static int ocfs2_validate_dx_root(struct super_block *sb,
0563                   struct buffer_head *bh)
0564 {
0565     int ret;
0566     struct ocfs2_dx_root_block *dx_root;
0567 
0568     BUG_ON(!buffer_uptodate(bh));
0569 
0570     dx_root = (struct ocfs2_dx_root_block *) bh->b_data;
0571 
0572     ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_root->dr_check);
0573     if (ret) {
0574         mlog(ML_ERROR,
0575              "Checksum failed for dir index root block %llu\n",
0576              (unsigned long long)bh->b_blocknr);
0577         return ret;
0578     }
0579 
0580     if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) {
0581         ret = ocfs2_error(sb,
0582                   "Dir Index Root # %llu has bad signature %.*s\n",
0583                   (unsigned long long)le64_to_cpu(dx_root->dr_blkno),
0584                   7, dx_root->dr_signature);
0585     }
0586 
0587     return ret;
0588 }
0589 
0590 static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di,
0591                   struct buffer_head **dx_root_bh)
0592 {
0593     int ret;
0594     u64 blkno = le64_to_cpu(di->i_dx_root);
0595     struct buffer_head *tmp = *dx_root_bh;
0596 
0597     ret = ocfs2_read_block(INODE_CACHE(dir), blkno, &tmp,
0598                    ocfs2_validate_dx_root);
0599 
0600     /* If ocfs2_read_block() got us a new bh, pass it up. */
0601     if (!ret && !*dx_root_bh)
0602         *dx_root_bh = tmp;
0603 
0604     return ret;
0605 }
0606 
0607 static int ocfs2_validate_dx_leaf(struct super_block *sb,
0608                   struct buffer_head *bh)
0609 {
0610     int ret;
0611     struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)bh->b_data;
0612 
0613     BUG_ON(!buffer_uptodate(bh));
0614 
0615     ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_leaf->dl_check);
0616     if (ret) {
0617         mlog(ML_ERROR,
0618              "Checksum failed for dir index leaf block %llu\n",
0619              (unsigned long long)bh->b_blocknr);
0620         return ret;
0621     }
0622 
0623     if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) {
0624         ret = ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s\n",
0625                   7, dx_leaf->dl_signature);
0626     }
0627 
0628     return ret;
0629 }
0630 
0631 static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno,
0632                   struct buffer_head **dx_leaf_bh)
0633 {
0634     int ret;
0635     struct buffer_head *tmp = *dx_leaf_bh;
0636 
0637     ret = ocfs2_read_block(INODE_CACHE(dir), blkno, &tmp,
0638                    ocfs2_validate_dx_leaf);
0639 
0640     /* If ocfs2_read_block() got us a new bh, pass it up. */
0641     if (!ret && !*dx_leaf_bh)
0642         *dx_leaf_bh = tmp;
0643 
0644     return ret;
0645 }
0646 
0647 /*
0648  * Read a series of dx_leaf blocks. This expects all buffer_head
0649  * pointers to be NULL on function entry.
0650  */
0651 static int ocfs2_read_dx_leaves(struct inode *dir, u64 start, int num,
0652                 struct buffer_head **dx_leaf_bhs)
0653 {
0654     int ret;
0655 
0656     ret = ocfs2_read_blocks(INODE_CACHE(dir), start, num, dx_leaf_bhs, 0,
0657                 ocfs2_validate_dx_leaf);
0658     if (ret)
0659         mlog_errno(ret);
0660 
0661     return ret;
0662 }
0663 
/*
 * Search a directory for @name one block at a time.
 *
 * The scan begins at the block remembered in ip_dir_start_lookup (or at
 * block 0 if that is past the end), wraps around at the end of the
 * directory and stops once it is back at the starting block. Blocks are
 * requested in read-ahead batches of up to NAMEI_RA_SIZE before being
 * read for real and searched.
 *
 * On a hit, *res_dir points at the entry, the block number is remembered
 * in ip_dir_start_lookup, and the buffer head containing the entry is
 * returned with its reference held for the caller. Returns NULL if the
 * name is not found or a bad directory entry stops the search.
 */
static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
                           struct inode *dir,
                           struct ocfs2_dir_entry **res_dir)
{
    struct super_block *sb;
    struct buffer_head *bh_use[NAMEI_RA_SIZE];
    struct buffer_head *bh, *ret = NULL;
    unsigned long start, block, b;
    int ra_max = 0;     /* Number of bh's in the readahead
                   buffer, bh_use[] */
    int ra_ptr = 0;     /* Current index into readahead
                   buffer */
    int num = 0;
    int nblocks, i;

    sb = dir->i_sb;

    /* Directory size in whole blocks. */
    nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
    start = OCFS2_I(dir)->ip_dir_start_lookup;
    if (start >= nblocks)
        start = 0;
    block = start;

restart:
    do {
        /*
         * We deal with the read-ahead logic here.
         */
        if (ra_ptr >= ra_max) {
            /* Refill the readahead buffer */
            ra_ptr = 0;
            b = block;
            for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
                /*
                 * Terminate if we reach the end of the
                 * directory and must wrap, or if our
                 * search has finished at this block.
                 */
                if (b >= nblocks || (num && block == start)) {
                    bh_use[ra_max] = NULL;
                    break;
                }
                num++;

                /* Errors are ignored here; a failed slot stays NULL. */
                bh = NULL;
                ocfs2_read_dir_block(dir, b++, &bh,
                               OCFS2_BH_READAHEAD);
                bh_use[ra_max] = bh;
            }
        }
        /* A NULL slot means nothing was read ahead for this block. */
        if ((bh = bh_use[ra_ptr++]) == NULL)
            goto next;
        if (ocfs2_read_dir_block(dir, block, &bh, 0)) {
            /* read error, skip block & hope for the best.
             * ocfs2_read_dir_block() has released the bh.
             * NOTE(review): the trailer-check path visible in
             * ocfs2_read_dir_block() only drops buffers it obtained
             * itself (*bh == NULL on entry) - confirm that a
             * caller-supplied bh really is released on failure. */
            mlog(ML_ERROR, "reading directory %llu, "
                    "offset %lu\n",
                    (unsigned long long)OCFS2_I(dir)->ip_blkno,
                    block);
            goto next;
        }
        i = ocfs2_search_dirblock(bh, dir, name, namelen,
                      block << sb->s_blocksize_bits,
                      bh->b_data, sb->s_blocksize,
                      res_dir);
        if (i == 1) {
            /* Found: remember where, and hand the bh to the caller. */
            OCFS2_I(dir)->ip_dir_start_lookup = block;
            ret = bh;
            goto cleanup_and_exit;
        } else {
            brelse(bh);
            /* A bad entry (negative result) aborts the whole search. */
            if (i < 0)
                goto cleanup_and_exit;
        }
    next:
        if (++block >= nblocks)
            block = 0;
    } while (block != start);

    /*
     * If the directory has grown while we were searching, then
     * search the last part of the directory before giving up.
     */
    block = nblocks;
    nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
    if (block < nblocks) {
        start = 0;
        goto restart;
    }

cleanup_and_exit:
    /* Clean up the read-ahead blocks */
    for (; ra_ptr < ra_max; ra_ptr++)
        brelse(bh_use[ra_ptr]);

    trace_ocfs2_find_entry_el(ret);
    return ret;
}
0762 
0763 static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
0764                    struct ocfs2_extent_list *el,
0765                    u32 major_hash,
0766                    u32 *ret_cpos,
0767                    u64 *ret_phys_blkno,
0768                    unsigned int *ret_clen)
0769 {
0770     int ret = 0, i, found;
0771     struct buffer_head *eb_bh = NULL;
0772     struct ocfs2_extent_block *eb;
0773     struct ocfs2_extent_rec *rec = NULL;
0774 
0775     if (el->l_tree_depth) {
0776         ret = ocfs2_find_leaf(INODE_CACHE(inode), el, major_hash,
0777                       &eb_bh);
0778         if (ret) {
0779             mlog_errno(ret);
0780             goto out;
0781         }
0782 
0783         eb = (struct ocfs2_extent_block *) eb_bh->b_data;
0784         el = &eb->h_list;
0785 
0786         if (el->l_tree_depth) {
0787             ret = ocfs2_error(inode->i_sb,
0788                       "Inode %lu has non zero tree depth in btree tree block %llu\n",
0789                       inode->i_ino,
0790                       (unsigned long long)eb_bh->b_blocknr);
0791             goto out;
0792         }
0793     }
0794 
0795     found = 0;
0796     for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
0797         rec = &el->l_recs[i];
0798 
0799         if (le32_to_cpu(rec->e_cpos) <= major_hash) {
0800             found = 1;
0801             break;
0802         }
0803     }
0804 
0805     if (!found) {
0806         ret = ocfs2_error(inode->i_sb,
0807                   "Inode %lu has bad extent record (%u, %u, 0) in btree\n",
0808                   inode->i_ino,
0809                   le32_to_cpu(rec->e_cpos),
0810                   ocfs2_rec_clusters(el, rec));
0811         goto out;
0812     }
0813 
0814     if (ret_phys_blkno)
0815         *ret_phys_blkno = le64_to_cpu(rec->e_blkno);
0816     if (ret_cpos)
0817         *ret_cpos = le32_to_cpu(rec->e_cpos);
0818     if (ret_clen)
0819         *ret_clen = le16_to_cpu(rec->e_leaf_clusters);
0820 
0821 out:
0822     brelse(eb_bh);
0823     return ret;
0824 }
0825 
0826 /*
0827  * Returns the block index, from the start of the cluster which this
0828  * hash belongs too.
0829  */
0830 static inline unsigned int __ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb,
0831                            u32 minor_hash)
0832 {
0833     return minor_hash & osb->osb_dx_mask;
0834 }
0835 
0836 static inline unsigned int ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb,
0837                       struct ocfs2_dx_hinfo *hinfo)
0838 {
0839     return __ocfs2_dx_dir_hash_idx(osb, hinfo->minor_hash);
0840 }
0841 
0842 static int ocfs2_dx_dir_lookup(struct inode *inode,
0843                    struct ocfs2_extent_list *el,
0844                    struct ocfs2_dx_hinfo *hinfo,
0845                    u32 *ret_cpos,
0846                    u64 *ret_phys_blkno)
0847 {
0848     int ret = 0;
0849     unsigned int cend, clen;
0850     u32 cpos;
0851     u64 blkno;
0852     u32 name_hash = hinfo->major_hash;
0853 
0854     ret = ocfs2_dx_dir_lookup_rec(inode, el, name_hash, &cpos, &blkno,
0855                       &clen);
0856     if (ret) {
0857         mlog_errno(ret);
0858         goto out;
0859     }
0860 
0861     cend = cpos + clen;
0862     if (name_hash >= cend) {
0863         /* We want the last cluster */
0864         blkno += ocfs2_clusters_to_blocks(inode->i_sb, clen - 1);
0865         cpos += clen - 1;
0866     } else {
0867         blkno += ocfs2_clusters_to_blocks(inode->i_sb,
0868                           name_hash - cpos);
0869         cpos = name_hash;
0870     }
0871 
0872     /*
0873      * We now have the cluster which should hold our entry. To
0874      * find the exact block from the start of the cluster to
0875      * search, we take the lower bits of the hash.
0876      */
0877     blkno += ocfs2_dx_dir_hash_idx(OCFS2_SB(inode->i_sb), hinfo);
0878 
0879     if (ret_phys_blkno)
0880         *ret_phys_blkno = blkno;
0881     if (ret_cpos)
0882         *ret_cpos = cpos;
0883 
0884 out:
0885 
0886     return ret;
0887 }
0888 
/*
 * Look up @name in an indexed directory.
 *
 * @name, @namelen: name to find
 * @dir:            directory inode
 * @dx_root:        the directory's index root block
 * @res:            lookup result; dl_hinfo is always filled with the
 *                  name's hash, the remaining fields only on success
 *
 * The index entries to scan come either from the root block itself (when
 * the root is flagged inline) or from the index leaf block selected by the
 * name hash. For every index entry whose major and minor hash both match,
 * the unindexed directory block it points at is read and searched for the
 * name.
 *
 * Returns 0 on success, with dl_leaf_bh/dl_entry describing the dirent
 * and dl_dx_leaf_bh/dl_dx_entry describing the index entry; the buffer
 * references are kept for the caller. Returns -ENOENT if no entry
 * matches, -EIO if a bad directory entry is encountered, or the error
 * from a failed lookup/read; in all failure cases the buffers obtained
 * here are released.
 */
static int ocfs2_dx_dir_search(const char *name, int namelen,
                   struct inode *dir,
                   struct ocfs2_dx_root_block *dx_root,
                   struct ocfs2_dir_lookup_result *res)
{
    int ret, i, found;
    u64 phys;
    struct buffer_head *dx_leaf_bh = NULL;
    struct ocfs2_dx_leaf *dx_leaf;
    struct ocfs2_dx_entry *dx_entry = NULL;
    struct buffer_head *dir_ent_bh = NULL;
    struct ocfs2_dir_entry *dir_ent = NULL;
    struct ocfs2_dx_hinfo *hinfo = &res->dl_hinfo;
    struct ocfs2_extent_list *dr_el;
    struct ocfs2_dx_entry_list *entry_list;

    ocfs2_dx_dir_name_hash(dir, name, namelen, &res->dl_hinfo);

    /* Inline root: the entries live in the root block, no leaf to read. */
    if (ocfs2_dx_root_inline(dx_root)) {
        entry_list = &dx_root->dr_entries;
        goto search;
    }

    dr_el = &dx_root->dr_list;

    ret = ocfs2_dx_dir_lookup(dir, dr_el, hinfo, NULL, &phys);
    if (ret) {
        mlog_errno(ret);
        goto out;
    }

    trace_ocfs2_dx_dir_search((unsigned long long)OCFS2_I(dir)->ip_blkno,
                  namelen, name, hinfo->major_hash,
                  hinfo->minor_hash, (unsigned long long)phys);

    ret = ocfs2_read_dx_leaf(dir, phys, &dx_leaf_bh);
    if (ret) {
        mlog_errno(ret);
        goto out;
    }

    dx_leaf = (struct ocfs2_dx_leaf *) dx_leaf_bh->b_data;

    trace_ocfs2_dx_dir_search_leaf_info(
            le16_to_cpu(dx_leaf->dl_list.de_num_used),
            le16_to_cpu(dx_leaf->dl_list.de_count));

    entry_list = &dx_leaf->dl_list;

search:
    /*
     * Empty leaf is legal, so no need to check for that.
     */
    found = 0;
    for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) {
        dx_entry = &entry_list->de_entries[i];

        /* Both halves of the hash must match before touching the block. */
        if (hinfo->major_hash != le32_to_cpu(dx_entry->dx_major_hash)
            || hinfo->minor_hash != le32_to_cpu(dx_entry->dx_minor_hash))
            continue;

        /*
         * Search unindexed leaf block now. We're not
         * guaranteed to find anything.
         */
        ret = ocfs2_read_dir_block_direct(dir,
                      le64_to_cpu(dx_entry->dx_dirent_blk),
                      &dir_ent_bh);
        if (ret) {
            mlog_errno(ret);
            goto out;
        }

        /*
         * XXX: We should check the unindexed block here,
         * before using it.
         */

        found = ocfs2_search_dirblock(dir_ent_bh, dir, name, namelen,
                          0, dir_ent_bh->b_data,
                          dir->i_sb->s_blocksize, &dir_ent);
        if (found == 1)
            break;

        if (found == -1) {
            /* This means we found a bad directory entry. */
            ret = -EIO;
            mlog_errno(ret);
            goto out;
        }

        /* Hash matched but the name did not: drop this block, keep going. */
        brelse(dir_ent_bh);
        dir_ent_bh = NULL;
    }

    if (found <= 0) {
        ret = -ENOENT;
        goto out;
    }

    /* Success: ownership of both buffers moves into the lookup result. */
    res->dl_leaf_bh = dir_ent_bh;
    res->dl_entry = dir_ent;
    res->dl_dx_leaf_bh = dx_leaf_bh;
    res->dl_dx_entry = dx_entry;

    ret = 0;
out:
    /* On any failure, release whatever was read here. */
    if (ret) {
        brelse(dx_leaf_bh);
        brelse(dir_ent_bh);
    }
    return ret;
}
1002 
1003 static int ocfs2_find_entry_dx(const char *name, int namelen,
1004                    struct inode *dir,
1005                    struct ocfs2_dir_lookup_result *lookup)
1006 {
1007     int ret;
1008     struct buffer_head *di_bh = NULL;
1009     struct ocfs2_dinode *di;
1010     struct buffer_head *dx_root_bh = NULL;
1011     struct ocfs2_dx_root_block *dx_root;
1012 
1013     ret = ocfs2_read_inode_block(dir, &di_bh);
1014     if (ret) {
1015         mlog_errno(ret);
1016         goto out;
1017     }
1018 
1019     di = (struct ocfs2_dinode *)di_bh->b_data;
1020 
1021     ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
1022     if (ret) {
1023         mlog_errno(ret);
1024         goto out;
1025     }
1026     dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
1027 
1028     ret = ocfs2_dx_dir_search(name, namelen, dir, dx_root, lookup);
1029     if (ret) {
1030         if (ret != -ENOENT)
1031             mlog_errno(ret);
1032         goto out;
1033     }
1034 
1035     lookup->dl_dx_root_bh = dx_root_bh;
1036     dx_root_bh = NULL;
1037 out:
1038     brelse(di_bh);
1039     brelse(dx_root_bh);
1040     return ret;
1041 }
1042 
1043 /*
1044  * Try to find an entry of the provided name within 'dir'.
1045  *
1046  * If nothing was found, -ENOENT is returned. Otherwise, zero is
1047  * returned and the struct 'res' will contain information useful to
1048  * other directory manipulation functions.
1049  *
1050  * Caller can NOT assume anything about the contents of the
1051  * buffer_heads - they are passed back only so that it can be passed
1052  * into any one of the manipulation functions (add entry, delete
1053  * entry, etc). As an example, bh in the extent directory case is a
1054  * data block, in the inline-data case it actually points to an inode,
1055  * in the indexed directory case, multiple buffers are involved.
1056  */
1057 int ocfs2_find_entry(const char *name, int namelen,
1058              struct inode *dir, struct ocfs2_dir_lookup_result *lookup)
1059 {
1060     struct buffer_head *bh;
1061     struct ocfs2_dir_entry *res_dir = NULL;
1062 
1063     if (ocfs2_dir_indexed(dir))
1064         return ocfs2_find_entry_dx(name, namelen, dir, lookup);
1065 
1066     /*
1067      * The unindexed dir code only uses part of the lookup
1068      * structure, so there's no reason to push it down further
1069      * than this.
1070      */
1071     if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
1072         bh = ocfs2_find_entry_id(name, namelen, dir, &res_dir);
1073     else
1074         bh = ocfs2_find_entry_el(name, namelen, dir, &res_dir);
1075 
1076     if (bh == NULL)
1077         return -ENOENT;
1078 
1079     lookup->dl_leaf_bh = bh;
1080     lookup->dl_entry = res_dir;
1081     return 0;
1082 }
1083 
1084 /*
1085  * Update inode number and type of a previously found directory entry.
1086  */
1087 int ocfs2_update_entry(struct inode *dir, handle_t *handle,
1088                struct ocfs2_dir_lookup_result *res,
1089                struct inode *new_entry_inode)
1090 {
1091     int ret;
1092     ocfs2_journal_access_func access = ocfs2_journal_access_db;
1093     struct ocfs2_dir_entry *de = res->dl_entry;
1094     struct buffer_head *de_bh = res->dl_leaf_bh;
1095 
1096     /*
1097      * The same code works fine for both inline-data and extent
1098      * based directories, so no need to split this up.  The only
1099      * difference is the journal_access function.
1100      */
1101 
1102     if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
1103         access = ocfs2_journal_access_di;
1104 
1105     ret = access(handle, INODE_CACHE(dir), de_bh,
1106              OCFS2_JOURNAL_ACCESS_WRITE);
1107     if (ret) {
1108         mlog_errno(ret);
1109         goto out;
1110     }
1111 
1112     de->inode = cpu_to_le64(OCFS2_I(new_entry_inode)->ip_blkno);
1113     ocfs2_set_de_type(de, new_entry_inode->i_mode);
1114 
1115     ocfs2_journal_dirty(handle, de_bh);
1116 
1117 out:
1118     return ret;
1119 }
1120 
1121 /*
1122  * __ocfs2_delete_entry deletes a directory entry by merging it with the
1123  * previous entry
1124  */
1125 static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
1126                 struct ocfs2_dir_entry *de_del,
1127                 struct buffer_head *bh, char *first_de,
1128                 unsigned int bytes)
1129 {
1130     struct ocfs2_dir_entry *de, *pde;
1131     int i, status = -ENOENT;
1132     ocfs2_journal_access_func access = ocfs2_journal_access_db;
1133 
1134     if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
1135         access = ocfs2_journal_access_di;
1136 
1137     i = 0;
1138     pde = NULL;
1139     de = (struct ocfs2_dir_entry *) first_de;
1140     while (i < bytes) {
1141         if (!ocfs2_check_dir_entry(dir, de, bh, i)) {
1142             status = -EIO;
1143             mlog_errno(status);
1144             goto bail;
1145         }
1146         if (de == de_del)  {
1147             status = access(handle, INODE_CACHE(dir), bh,
1148                     OCFS2_JOURNAL_ACCESS_WRITE);
1149             if (status < 0) {
1150                 status = -EIO;
1151                 mlog_errno(status);
1152                 goto bail;
1153             }
1154             if (pde)
1155                 le16_add_cpu(&pde->rec_len,
1156                         le16_to_cpu(de->rec_len));
1157             de->inode = 0;
1158             inode_inc_iversion(dir);
1159             ocfs2_journal_dirty(handle, bh);
1160             goto bail;
1161         }
1162         i += le16_to_cpu(de->rec_len);
1163         pde = de;
1164         de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len));
1165     }
1166 bail:
1167     return status;
1168 }
1169 
1170 static unsigned int ocfs2_figure_dirent_hole(struct ocfs2_dir_entry *de)
1171 {
1172     unsigned int hole;
1173 
1174     if (le64_to_cpu(de->inode) == 0)
1175         hole = le16_to_cpu(de->rec_len);
1176     else
1177         hole = le16_to_cpu(de->rec_len) -
1178             OCFS2_DIR_REC_LEN(de->name_len);
1179 
1180     return hole;
1181 }
1182 
1183 static int ocfs2_find_max_rec_len(struct super_block *sb,
1184                   struct buffer_head *dirblock_bh)
1185 {
1186     int size, this_hole, largest_hole = 0;
1187     char *trailer, *de_buf, *limit, *start = dirblock_bh->b_data;
1188     struct ocfs2_dir_entry *de;
1189 
1190     trailer = (char *)ocfs2_trailer_from_bh(dirblock_bh, sb);
1191     size = ocfs2_dir_trailer_blk_off(sb);
1192     limit = start + size;
1193     de_buf = start;
1194     de = (struct ocfs2_dir_entry *)de_buf;
1195     do {
1196         if (de_buf != trailer) {
1197             this_hole = ocfs2_figure_dirent_hole(de);
1198             if (this_hole > largest_hole)
1199                 largest_hole = this_hole;
1200         }
1201 
1202         de_buf += le16_to_cpu(de->rec_len);
1203         de = (struct ocfs2_dir_entry *)de_buf;
1204     } while (de_buf < limit);
1205 
1206     if (largest_hole >= OCFS2_DIR_MIN_REC_LEN)
1207         return largest_hole;
1208     return 0;
1209 }
1210 
1211 static void ocfs2_dx_list_remove_entry(struct ocfs2_dx_entry_list *entry_list,
1212                        int index)
1213 {
1214     int num_used = le16_to_cpu(entry_list->de_num_used);
1215 
1216     if (num_used == 1 || index == (num_used - 1))
1217         goto clear;
1218 
1219     memmove(&entry_list->de_entries[index],
1220         &entry_list->de_entries[index + 1],
1221         (num_used - index - 1)*sizeof(struct ocfs2_dx_entry));
1222 clear:
1223     num_used--;
1224     memset(&entry_list->de_entries[num_used], 0,
1225            sizeof(struct ocfs2_dx_entry));
1226     entry_list->de_num_used = cpu_to_le16(num_used);
1227 }
1228 
1229 static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir,
1230                  struct ocfs2_dir_lookup_result *lookup)
1231 {
1232     int ret, index, max_rec_len, add_to_free_list = 0;
1233     struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
1234     struct buffer_head *leaf_bh = lookup->dl_leaf_bh;
1235     struct ocfs2_dx_leaf *dx_leaf;
1236     struct ocfs2_dx_entry *dx_entry = lookup->dl_dx_entry;
1237     struct ocfs2_dir_block_trailer *trailer;
1238     struct ocfs2_dx_root_block *dx_root;
1239     struct ocfs2_dx_entry_list *entry_list;
1240 
1241     /*
1242      * This function gets a bit messy because we might have to
1243      * modify the root block, regardless of whether the indexed
1244      * entries are stored inline.
1245      */
1246 
1247     /*
1248      * *Only* set 'entry_list' here, based on where we're looking
1249      * for the indexed entries. Later, we might still want to
1250      * journal both blocks, based on free list state.
1251      */
1252     dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
1253     if (ocfs2_dx_root_inline(dx_root)) {
1254         entry_list = &dx_root->dr_entries;
1255     } else {
1256         dx_leaf = (struct ocfs2_dx_leaf *) lookup->dl_dx_leaf_bh->b_data;
1257         entry_list = &dx_leaf->dl_list;
1258     }
1259 
1260     /* Neither of these are a disk corruption - that should have
1261      * been caught by lookup, before we got here. */
1262     BUG_ON(le16_to_cpu(entry_list->de_count) <= 0);
1263     BUG_ON(le16_to_cpu(entry_list->de_num_used) <= 0);
1264 
1265     index = (char *)dx_entry - (char *)entry_list->de_entries;
1266     index /= sizeof(*dx_entry);
1267 
1268     if (index >= le16_to_cpu(entry_list->de_num_used)) {
1269         mlog(ML_ERROR, "Dir %llu: Bad dx_entry ptr idx %d, (%p, %p)\n",
1270              (unsigned long long)OCFS2_I(dir)->ip_blkno, index,
1271              entry_list, dx_entry);
1272         return -EIO;
1273     }
1274 
1275     /*
1276      * We know that removal of this dirent will leave enough room
1277      * for a new one, so add this block to the free list if it
1278      * isn't already there.
1279      */
1280     trailer = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb);
1281     if (trailer->db_free_rec_len == 0)
1282         add_to_free_list = 1;
1283 
1284     /*
1285      * Add the block holding our index into the journal before
1286      * removing the unindexed entry. If we get an error return
1287      * from __ocfs2_delete_entry(), then it hasn't removed the
1288      * entry yet. Likewise, successful return means we *must*
1289      * remove the indexed entry.
1290      *
1291      * We're also careful to journal the root tree block here as
1292      * the entry count needs to be updated. Also, we might be
1293      * adding to the start of the free list.
1294      */
1295     ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
1296                       OCFS2_JOURNAL_ACCESS_WRITE);
1297     if (ret) {
1298         mlog_errno(ret);
1299         goto out;
1300     }
1301 
1302     if (!ocfs2_dx_root_inline(dx_root)) {
1303         ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
1304                           lookup->dl_dx_leaf_bh,
1305                           OCFS2_JOURNAL_ACCESS_WRITE);
1306         if (ret) {
1307             mlog_errno(ret);
1308             goto out;
1309         }
1310     }
1311 
1312     trace_ocfs2_delete_entry_dx((unsigned long long)OCFS2_I(dir)->ip_blkno,
1313                     index);
1314 
1315     ret = __ocfs2_delete_entry(handle, dir, lookup->dl_entry,
1316                    leaf_bh, leaf_bh->b_data, leaf_bh->b_size);
1317     if (ret) {
1318         mlog_errno(ret);
1319         goto out;
1320     }
1321 
1322     max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, leaf_bh);
1323     trailer->db_free_rec_len = cpu_to_le16(max_rec_len);
1324     if (add_to_free_list) {
1325         trailer->db_free_next = dx_root->dr_free_blk;
1326         dx_root->dr_free_blk = cpu_to_le64(leaf_bh->b_blocknr);
1327         ocfs2_journal_dirty(handle, dx_root_bh);
1328     }
1329 
1330     /* leaf_bh was journal_accessed for us in __ocfs2_delete_entry */
1331     ocfs2_journal_dirty(handle, leaf_bh);
1332 
1333     le32_add_cpu(&dx_root->dr_num_entries, -1);
1334     ocfs2_journal_dirty(handle, dx_root_bh);
1335 
1336     ocfs2_dx_list_remove_entry(entry_list, index);
1337 
1338     if (!ocfs2_dx_root_inline(dx_root))
1339         ocfs2_journal_dirty(handle, lookup->dl_dx_leaf_bh);
1340 
1341 out:
1342     return ret;
1343 }
1344 
1345 static inline int ocfs2_delete_entry_id(handle_t *handle,
1346                     struct inode *dir,
1347                     struct ocfs2_dir_entry *de_del,
1348                     struct buffer_head *bh)
1349 {
1350     int ret;
1351     struct buffer_head *di_bh = NULL;
1352     struct ocfs2_dinode *di;
1353     struct ocfs2_inline_data *data;
1354 
1355     ret = ocfs2_read_inode_block(dir, &di_bh);
1356     if (ret) {
1357         mlog_errno(ret);
1358         goto out;
1359     }
1360 
1361     di = (struct ocfs2_dinode *)di_bh->b_data;
1362     data = &di->id2.i_data;
1363 
1364     ret = __ocfs2_delete_entry(handle, dir, de_del, bh, data->id_data,
1365                    i_size_read(dir));
1366 
1367     brelse(di_bh);
1368 out:
1369     return ret;
1370 }
1371 
1372 static inline int ocfs2_delete_entry_el(handle_t *handle,
1373                     struct inode *dir,
1374                     struct ocfs2_dir_entry *de_del,
1375                     struct buffer_head *bh)
1376 {
1377     return __ocfs2_delete_entry(handle, dir, de_del, bh, bh->b_data,
1378                     bh->b_size);
1379 }
1380 
1381 /*
1382  * Delete a directory entry. Hide the details of directory
1383  * implementation from the caller.
1384  */
1385 int ocfs2_delete_entry(handle_t *handle,
1386                struct inode *dir,
1387                struct ocfs2_dir_lookup_result *res)
1388 {
1389     if (ocfs2_dir_indexed(dir))
1390         return ocfs2_delete_entry_dx(handle, dir, res);
1391 
1392     if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
1393         return ocfs2_delete_entry_id(handle, dir, res->dl_entry,
1394                          res->dl_leaf_bh);
1395 
1396     return ocfs2_delete_entry_el(handle, dir, res->dl_entry,
1397                      res->dl_leaf_bh);
1398 }
1399 
1400 /*
1401  * Check whether 'de' has enough room to hold an entry of
1402  * 'new_rec_len' bytes.
1403  */
1404 static inline int ocfs2_dirent_would_fit(struct ocfs2_dir_entry *de,
1405                      unsigned int new_rec_len)
1406 {
1407     unsigned int de_really_used;
1408 
1409     /* Check whether this is an empty record with enough space */
1410     if (le64_to_cpu(de->inode) == 0 &&
1411         le16_to_cpu(de->rec_len) >= new_rec_len)
1412         return 1;
1413 
1414     /*
1415      * Record might have free space at the end which we can
1416      * use.
1417      */
1418     de_really_used = OCFS2_DIR_REC_LEN(de->name_len);
1419     if (le16_to_cpu(de->rec_len) >= (de_really_used + new_rec_len))
1420         return 1;
1421 
1422     return 0;
1423 }
1424 
1425 static void ocfs2_dx_dir_leaf_insert_tail(struct ocfs2_dx_leaf *dx_leaf,
1426                       struct ocfs2_dx_entry *dx_new_entry)
1427 {
1428     int i;
1429 
1430     i = le16_to_cpu(dx_leaf->dl_list.de_num_used);
1431     dx_leaf->dl_list.de_entries[i] = *dx_new_entry;
1432 
1433     le16_add_cpu(&dx_leaf->dl_list.de_num_used, 1);
1434 }
1435 
1436 static void ocfs2_dx_entry_list_insert(struct ocfs2_dx_entry_list *entry_list,
1437                        struct ocfs2_dx_hinfo *hinfo,
1438                        u64 dirent_blk)
1439 {
1440     int i;
1441     struct ocfs2_dx_entry *dx_entry;
1442 
1443     i = le16_to_cpu(entry_list->de_num_used);
1444     dx_entry = &entry_list->de_entries[i];
1445 
1446     memset(dx_entry, 0, sizeof(*dx_entry));
1447     dx_entry->dx_major_hash = cpu_to_le32(hinfo->major_hash);
1448     dx_entry->dx_minor_hash = cpu_to_le32(hinfo->minor_hash);
1449     dx_entry->dx_dirent_blk = cpu_to_le64(dirent_blk);
1450 
1451     le16_add_cpu(&entry_list->de_num_used, 1);
1452 }
1453 
1454 static int __ocfs2_dx_dir_leaf_insert(struct inode *dir, handle_t *handle,
1455                       struct ocfs2_dx_hinfo *hinfo,
1456                       u64 dirent_blk,
1457                       struct buffer_head *dx_leaf_bh)
1458 {
1459     int ret;
1460     struct ocfs2_dx_leaf *dx_leaf;
1461 
1462     ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh,
1463                       OCFS2_JOURNAL_ACCESS_WRITE);
1464     if (ret) {
1465         mlog_errno(ret);
1466         goto out;
1467     }
1468 
1469     dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
1470     ocfs2_dx_entry_list_insert(&dx_leaf->dl_list, hinfo, dirent_blk);
1471     ocfs2_journal_dirty(handle, dx_leaf_bh);
1472 
1473 out:
1474     return ret;
1475 }
1476 
1477 static void ocfs2_dx_inline_root_insert(struct inode *dir, handle_t *handle,
1478                     struct ocfs2_dx_hinfo *hinfo,
1479                     u64 dirent_blk,
1480                     struct ocfs2_dx_root_block *dx_root)
1481 {
1482     ocfs2_dx_entry_list_insert(&dx_root->dr_entries, hinfo, dirent_blk);
1483 }
1484 
1485 static int ocfs2_dx_dir_insert(struct inode *dir, handle_t *handle,
1486                    struct ocfs2_dir_lookup_result *lookup)
1487 {
1488     int ret = 0;
1489     struct ocfs2_dx_root_block *dx_root;
1490     struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
1491 
1492     ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
1493                       OCFS2_JOURNAL_ACCESS_WRITE);
1494     if (ret) {
1495         mlog_errno(ret);
1496         goto out;
1497     }
1498 
1499     dx_root = (struct ocfs2_dx_root_block *)lookup->dl_dx_root_bh->b_data;
1500     if (ocfs2_dx_root_inline(dx_root)) {
1501         ocfs2_dx_inline_root_insert(dir, handle,
1502                         &lookup->dl_hinfo,
1503                         lookup->dl_leaf_bh->b_blocknr,
1504                         dx_root);
1505     } else {
1506         ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &lookup->dl_hinfo,
1507                          lookup->dl_leaf_bh->b_blocknr,
1508                          lookup->dl_dx_leaf_bh);
1509         if (ret)
1510             goto out;
1511     }
1512 
1513     le32_add_cpu(&dx_root->dr_num_entries, 1);
1514     ocfs2_journal_dirty(handle, dx_root_bh);
1515 
1516 out:
1517     return ret;
1518 }
1519 
1520 static void ocfs2_remove_block_from_free_list(struct inode *dir,
1521                        handle_t *handle,
1522                        struct ocfs2_dir_lookup_result *lookup)
1523 {
1524     struct ocfs2_dir_block_trailer *trailer, *prev;
1525     struct ocfs2_dx_root_block *dx_root;
1526     struct buffer_head *bh;
1527 
1528     trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb);
1529 
1530     if (ocfs2_free_list_at_root(lookup)) {
1531         bh = lookup->dl_dx_root_bh;
1532         dx_root = (struct ocfs2_dx_root_block *)bh->b_data;
1533         dx_root->dr_free_blk = trailer->db_free_next;
1534     } else {
1535         bh = lookup->dl_prev_leaf_bh;
1536         prev = ocfs2_trailer_from_bh(bh, dir->i_sb);
1537         prev->db_free_next = trailer->db_free_next;
1538     }
1539 
1540     trailer->db_free_rec_len = cpu_to_le16(0);
1541     trailer->db_free_next = cpu_to_le64(0);
1542 
1543     ocfs2_journal_dirty(handle, bh);
1544     ocfs2_journal_dirty(handle, lookup->dl_leaf_bh);
1545 }
1546 
1547 /*
1548  * This expects that a journal write has been reserved on
1549  * lookup->dl_prev_leaf_bh or lookup->dl_dx_root_bh
1550  */
1551 static void ocfs2_recalc_free_list(struct inode *dir, handle_t *handle,
1552                    struct ocfs2_dir_lookup_result *lookup)
1553 {
1554     int max_rec_len;
1555     struct ocfs2_dir_block_trailer *trailer;
1556 
1557     /* Walk dl_leaf_bh to figure out what the new free rec_len is. */
1558     max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, lookup->dl_leaf_bh);
1559     if (max_rec_len) {
1560         /*
1561          * There's still room in this block, so no need to remove it
1562          * from the free list. In this case, we just want to update
1563          * the rec len accounting.
1564          */
1565         trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb);
1566         trailer->db_free_rec_len = cpu_to_le16(max_rec_len);
1567         ocfs2_journal_dirty(handle, lookup->dl_leaf_bh);
1568     } else {
1569         ocfs2_remove_block_from_free_list(dir, handle, lookup);
1570     }
1571 }
1572 
1573 /* we don't always have a dentry for what we want to add, so people
1574  * like orphan dir can call this instead.
1575  *
1576  * The lookup context must have been filled from
1577  * ocfs2_prepare_dir_for_insert.
1578  */
1579 int __ocfs2_add_entry(handle_t *handle,
1580               struct inode *dir,
1581               const char *name, int namelen,
1582               struct inode *inode, u64 blkno,
1583               struct buffer_head *parent_fe_bh,
1584               struct ocfs2_dir_lookup_result *lookup)
1585 {
1586     unsigned long offset;
1587     unsigned short rec_len;
1588     struct ocfs2_dir_entry *de, *de1;
1589     struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_fe_bh->b_data;
1590     struct super_block *sb = dir->i_sb;
1591     int retval;
1592     unsigned int size = sb->s_blocksize;
1593     struct buffer_head *insert_bh = lookup->dl_leaf_bh;
1594     char *data_start = insert_bh->b_data;
1595 
1596     if (!namelen)
1597         return -EINVAL;
1598 
1599     if (ocfs2_dir_indexed(dir)) {
1600         struct buffer_head *bh;
1601 
1602         /*
1603          * An indexed dir may require that we update the free space
1604          * list. Reserve a write to the previous node in the list so
1605          * that we don't fail later.
1606          *
1607          * XXX: This can be either a dx_root_block, or an unindexed
1608          * directory tree leaf block.
1609          */
1610         if (ocfs2_free_list_at_root(lookup)) {
1611             bh = lookup->dl_dx_root_bh;
1612             retval = ocfs2_journal_access_dr(handle,
1613                          INODE_CACHE(dir), bh,
1614                          OCFS2_JOURNAL_ACCESS_WRITE);
1615         } else {
1616             bh = lookup->dl_prev_leaf_bh;
1617             retval = ocfs2_journal_access_db(handle,
1618                          INODE_CACHE(dir), bh,
1619                          OCFS2_JOURNAL_ACCESS_WRITE);
1620         }
1621         if (retval) {
1622             mlog_errno(retval);
1623             return retval;
1624         }
1625     } else if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1626         data_start = di->id2.i_data.id_data;
1627         size = i_size_read(dir);
1628 
1629         BUG_ON(insert_bh != parent_fe_bh);
1630     }
1631 
1632     rec_len = OCFS2_DIR_REC_LEN(namelen);
1633     offset = 0;
1634     de = (struct ocfs2_dir_entry *) data_start;
1635     while (1) {
1636         BUG_ON((char *)de >= (size + data_start));
1637 
1638         /* These checks should've already been passed by the
1639          * prepare function, but I guess we can leave them
1640          * here anyway. */
1641         if (!ocfs2_check_dir_entry(dir, de, insert_bh, offset)) {
1642             retval = -ENOENT;
1643             goto bail;
1644         }
1645         if (ocfs2_match(namelen, name, de)) {
1646             retval = -EEXIST;
1647             goto bail;
1648         }
1649 
1650         /* We're guaranteed that we should have space, so we
1651          * can't possibly have hit the trailer...right? */
1652         mlog_bug_on_msg(ocfs2_skip_dir_trailer(dir, de, offset, size),
1653                 "Hit dir trailer trying to insert %.*s "
1654                     "(namelen %d) into directory %llu.  "
1655                 "offset is %lu, trailer offset is %d\n",
1656                 namelen, name, namelen,
1657                 (unsigned long long)parent_fe_bh->b_blocknr,
1658                 offset, ocfs2_dir_trailer_blk_off(dir->i_sb));
1659 
1660         if (ocfs2_dirent_would_fit(de, rec_len)) {
1661             dir->i_mtime = dir->i_ctime = current_time(dir);
1662             retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
1663             if (retval < 0) {
1664                 mlog_errno(retval);
1665                 goto bail;
1666             }
1667 
1668             if (insert_bh == parent_fe_bh)
1669                 retval = ocfs2_journal_access_di(handle,
1670                                  INODE_CACHE(dir),
1671                                  insert_bh,
1672                                  OCFS2_JOURNAL_ACCESS_WRITE);
1673             else {
1674                 retval = ocfs2_journal_access_db(handle,
1675                                  INODE_CACHE(dir),
1676                                  insert_bh,
1677                           OCFS2_JOURNAL_ACCESS_WRITE);
1678 
1679                 if (!retval && ocfs2_dir_indexed(dir))
1680                     retval = ocfs2_dx_dir_insert(dir,
1681                                 handle,
1682                                 lookup);
1683             }
1684 
1685             if (retval) {
1686                 mlog_errno(retval);
1687                 goto bail;
1688             }
1689 
1690             /* By now the buffer is marked for journaling */
1691             offset += le16_to_cpu(de->rec_len);
1692             if (le64_to_cpu(de->inode)) {
1693                 de1 = (struct ocfs2_dir_entry *)((char *) de +
1694                     OCFS2_DIR_REC_LEN(de->name_len));
1695                 de1->rec_len =
1696                     cpu_to_le16(le16_to_cpu(de->rec_len) -
1697                     OCFS2_DIR_REC_LEN(de->name_len));
1698                 de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
1699                 de = de1;
1700             }
1701             de->file_type = FT_UNKNOWN;
1702             if (blkno) {
1703                 de->inode = cpu_to_le64(blkno);
1704                 ocfs2_set_de_type(de, inode->i_mode);
1705             } else
1706                 de->inode = 0;
1707             de->name_len = namelen;
1708             memcpy(de->name, name, namelen);
1709 
1710             if (ocfs2_dir_indexed(dir))
1711                 ocfs2_recalc_free_list(dir, handle, lookup);
1712 
1713             inode_inc_iversion(dir);
1714             ocfs2_journal_dirty(handle, insert_bh);
1715             retval = 0;
1716             goto bail;
1717         }
1718 
1719         offset += le16_to_cpu(de->rec_len);
1720         de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len));
1721     }
1722 
1723     /* when you think about it, the assert above should prevent us
1724      * from ever getting here. */
1725     retval = -ENOSPC;
1726 bail:
1727     if (retval)
1728         mlog_errno(retval);
1729 
1730     return retval;
1731 }
1732 
1733 static int ocfs2_dir_foreach_blk_id(struct inode *inode,
1734                     u64 *f_version,
1735                     struct dir_context *ctx)
1736 {
1737     int ret, i;
1738     unsigned long offset = ctx->pos;
1739     struct buffer_head *di_bh = NULL;
1740     struct ocfs2_dinode *di;
1741     struct ocfs2_inline_data *data;
1742     struct ocfs2_dir_entry *de;
1743 
1744     ret = ocfs2_read_inode_block(inode, &di_bh);
1745     if (ret) {
1746         mlog(ML_ERROR, "Unable to read inode block for dir %llu\n",
1747              (unsigned long long)OCFS2_I(inode)->ip_blkno);
1748         goto out;
1749     }
1750 
1751     di = (struct ocfs2_dinode *)di_bh->b_data;
1752     data = &di->id2.i_data;
1753 
1754     while (ctx->pos < i_size_read(inode)) {
1755         /* If the dir block has changed since the last call to
1756          * readdir(2), then we might be pointing to an invalid
1757          * dirent right now.  Scan from the start of the block
1758          * to make sure. */
1759         if (!inode_eq_iversion(inode, *f_version)) {
1760             for (i = 0; i < i_size_read(inode) && i < offset; ) {
1761                 de = (struct ocfs2_dir_entry *)
1762                     (data->id_data + i);
1763                 /* It's too expensive to do a full
1764                  * dirent test each time round this
1765                  * loop, but we do have to test at
1766                  * least that it is non-zero.  A
1767                  * failure will be detected in the
1768                  * dirent test below. */
1769                 if (le16_to_cpu(de->rec_len) <
1770                     OCFS2_DIR_REC_LEN(1))
1771                     break;
1772                 i += le16_to_cpu(de->rec_len);
1773             }
1774             ctx->pos = offset = i;
1775             *f_version = inode_query_iversion(inode);
1776         }
1777 
1778         de = (struct ocfs2_dir_entry *) (data->id_data + ctx->pos);
1779         if (!ocfs2_check_dir_entry(inode, de, di_bh, ctx->pos)) {
1780             /* On error, skip the f_pos to the end. */
1781             ctx->pos = i_size_read(inode);
1782             break;
1783         }
1784         offset += le16_to_cpu(de->rec_len);
1785         if (le64_to_cpu(de->inode)) {
1786             if (!dir_emit(ctx, de->name, de->name_len,
1787                       le64_to_cpu(de->inode),
1788                       fs_ftype_to_dtype(de->file_type)))
1789                 goto out;
1790         }
1791         ctx->pos += le16_to_cpu(de->rec_len);
1792     }
1793 out:
1794     brelse(di_bh);
1795     return 0;
1796 }
1797 
1798 /*
1799  * NOTE: This function can be called against unindexed directories,
1800  * and indexed ones.
1801  */
1802 static int ocfs2_dir_foreach_blk_el(struct inode *inode,
1803                     u64 *f_version,
1804                     struct dir_context *ctx,
1805                     bool persist)
1806 {
1807     unsigned long offset, blk, last_ra_blk = 0;
1808     int i;
1809     struct buffer_head * bh, * tmp;
1810     struct ocfs2_dir_entry * de;
1811     struct super_block * sb = inode->i_sb;
1812     unsigned int ra_sectors = 16;
1813     int stored = 0;
1814 
1815     bh = NULL;
1816 
1817     offset = ctx->pos & (sb->s_blocksize - 1);
1818 
1819     while (ctx->pos < i_size_read(inode)) {
1820         blk = ctx->pos >> sb->s_blocksize_bits;
1821         if (ocfs2_read_dir_block(inode, blk, &bh, 0)) {
1822             /* Skip the corrupt dirblock and keep trying */
1823             ctx->pos += sb->s_blocksize - offset;
1824             continue;
1825         }
1826 
1827         /* The idea here is to begin with 8k read-ahead and to stay
1828          * 4k ahead of our current position.
1829          *
1830          * TODO: Use the pagecache for this. We just need to
1831          * make sure it's cluster-safe... */
1832         if (!last_ra_blk
1833             || (((last_ra_blk - blk) << 9) <= (ra_sectors / 2))) {
1834             for (i = ra_sectors >> (sb->s_blocksize_bits - 9);
1835                  i > 0; i--) {
1836                 tmp = NULL;
1837                 if (!ocfs2_read_dir_block(inode, ++blk, &tmp,
1838                               OCFS2_BH_READAHEAD))
1839                     brelse(tmp);
1840             }
1841             last_ra_blk = blk;
1842             ra_sectors = 8;
1843         }
1844 
1845         /* If the dir block has changed since the last call to
1846          * readdir(2), then we might be pointing to an invalid
1847          * dirent right now.  Scan from the start of the block
1848          * to make sure. */
1849         if (!inode_eq_iversion(inode, *f_version)) {
1850             for (i = 0; i < sb->s_blocksize && i < offset; ) {
1851                 de = (struct ocfs2_dir_entry *) (bh->b_data + i);
1852                 /* It's too expensive to do a full
1853                  * dirent test each time round this
1854                  * loop, but we do have to test at
1855                  * least that it is non-zero.  A
1856                  * failure will be detected in the
1857                  * dirent test below. */
1858                 if (le16_to_cpu(de->rec_len) <
1859                     OCFS2_DIR_REC_LEN(1))
1860                     break;
1861                 i += le16_to_cpu(de->rec_len);
1862             }
1863             offset = i;
1864             ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
1865                 | offset;
1866             *f_version = inode_query_iversion(inode);
1867         }
1868 
1869         while (ctx->pos < i_size_read(inode)
1870                && offset < sb->s_blocksize) {
1871             de = (struct ocfs2_dir_entry *) (bh->b_data + offset);
1872             if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
1873                 /* On error, skip the f_pos to the
1874                    next block. */
1875                 ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1;
1876                 break;
1877             }
1878             if (le64_to_cpu(de->inode)) {
1879                 if (!dir_emit(ctx, de->name,
1880                         de->name_len,
1881                         le64_to_cpu(de->inode),
1882                     fs_ftype_to_dtype(de->file_type))) {
1883                     brelse(bh);
1884                     return 0;
1885                 }
1886                 stored++;
1887             }
1888             offset += le16_to_cpu(de->rec_len);
1889             ctx->pos += le16_to_cpu(de->rec_len);
1890         }
1891         offset = 0;
1892         brelse(bh);
1893         bh = NULL;
1894         if (!persist && stored)
1895             break;
1896     }
1897     return 0;
1898 }
1899 
1900 static int ocfs2_dir_foreach_blk(struct inode *inode, u64 *f_version,
1901                  struct dir_context *ctx,
1902                  bool persist)
1903 {
1904     if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
1905         return ocfs2_dir_foreach_blk_id(inode, f_version, ctx);
1906     return ocfs2_dir_foreach_blk_el(inode, f_version, ctx, persist);
1907 }
1908 
1909 /*
1910  * This is intended to be called from inside other kernel functions,
1911  * so we fake some arguments.
1912  */
1913 int ocfs2_dir_foreach(struct inode *inode, struct dir_context *ctx)
1914 {
1915     u64 version = inode_query_iversion(inode);
1916     ocfs2_dir_foreach_blk(inode, &version, ctx, true);
1917     return 0;
1918 }
1919 
1920 /*
1921  * ocfs2_readdir()
1922  *
1923  */
1924 int ocfs2_readdir(struct file *file, struct dir_context *ctx)
1925 {
1926     int error = 0;
1927     struct inode *inode = file_inode(file);
1928     int lock_level = 0;
1929 
1930     trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno);
1931 
1932     error = ocfs2_inode_lock_atime(inode, file->f_path.mnt, &lock_level, 1);
1933     if (lock_level && error >= 0) {
1934         /* We release EX lock which used to update atime
1935          * and get PR lock again to reduce contention
1936          * on commonly accessed directories. */
1937         ocfs2_inode_unlock(inode, 1);
1938         lock_level = 0;
1939         error = ocfs2_inode_lock(inode, NULL, 0);
1940     }
1941     if (error < 0) {
1942         if (error != -ENOENT)
1943             mlog_errno(error);
1944         /* we haven't got any yet, so propagate the error. */
1945         goto bail_nolock;
1946     }
1947 
1948     error = ocfs2_dir_foreach_blk(inode, &file->f_version, ctx, false);
1949 
1950     ocfs2_inode_unlock(inode, lock_level);
1951     if (error)
1952         mlog_errno(error);
1953 
1954 bail_nolock:
1955 
1956     return error;
1957 }
1958 
1959 /*
1960  * NOTE: this should always be called with parent dir i_rwsem taken.
1961  */
1962 int ocfs2_find_files_on_disk(const char *name,
1963                  int namelen,
1964                  u64 *blkno,
1965                  struct inode *inode,
1966                  struct ocfs2_dir_lookup_result *lookup)
1967 {
1968     int status = -ENOENT;
1969 
1970     trace_ocfs2_find_files_on_disk(namelen, name, blkno,
1971                 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1972 
1973     status = ocfs2_find_entry(name, namelen, inode, lookup);
1974     if (status)
1975         goto leave;
1976 
1977     *blkno = le64_to_cpu(lookup->dl_entry->inode);
1978 
1979     status = 0;
1980 leave:
1981 
1982     return status;
1983 }
1984 
1985 /*
1986  * Convenience function for callers which just want the block number
1987  * mapped to a name and don't require the full dirent info, etc.
1988  */
1989 int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
1990                    int namelen, u64 *blkno)
1991 {
1992     int ret;
1993     struct ocfs2_dir_lookup_result lookup = { NULL, };
1994 
1995     ret = ocfs2_find_files_on_disk(name, namelen, blkno, dir, &lookup);
1996     ocfs2_free_dir_lookup_result(&lookup);
1997 
1998     return ret;
1999 }
2000 
2001 /* Check for a name within a directory.
2002  *
2003  * Return 0 if the name does not exist
2004  * Return -EEXIST if the directory contains the name
2005  *
2006  * Callers should have i_rwsem + a cluster lock on dir
2007  */
2008 int ocfs2_check_dir_for_entry(struct inode *dir,
2009                   const char *name,
2010                   int namelen)
2011 {
2012     int ret = 0;
2013     struct ocfs2_dir_lookup_result lookup = { NULL, };
2014 
2015     trace_ocfs2_check_dir_for_entry(
2016         (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name);
2017 
2018     if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0) {
2019         ret = -EEXIST;
2020         mlog_errno(ret);
2021     }
2022 
2023     ocfs2_free_dir_lookup_result(&lookup);
2024 
2025     return ret;
2026 }
2027 
2028 struct ocfs2_empty_dir_priv {
2029     struct dir_context ctx;
2030     unsigned seen_dot;
2031     unsigned seen_dot_dot;
2032     unsigned seen_other;
2033     unsigned dx_dir;
2034 };
2035 static int ocfs2_empty_dir_filldir(struct dir_context *ctx, const char *name,
2036                    int name_len, loff_t pos, u64 ino,
2037                    unsigned type)
2038 {
2039     struct ocfs2_empty_dir_priv *p =
2040         container_of(ctx, struct ocfs2_empty_dir_priv, ctx);
2041 
2042     /*
2043      * Check the positions of "." and ".." records to be sure
2044      * they're in the correct place.
2045      *
2046      * Indexed directories don't need to proceed past the first
2047      * two entries, so we end the scan after seeing '..'. Despite
2048      * that, we allow the scan to proceed In the event that we
2049      * have a corrupted indexed directory (no dot or dot dot
2050      * entries). This allows us to double check for existing
2051      * entries which might not have been found in the index.
2052      */
2053     if (name_len == 1 && !strncmp(".", name, 1) && pos == 0) {
2054         p->seen_dot = 1;
2055         return 0;
2056     }
2057 
2058     if (name_len == 2 && !strncmp("..", name, 2) &&
2059         pos == OCFS2_DIR_REC_LEN(1)) {
2060         p->seen_dot_dot = 1;
2061 
2062         if (p->dx_dir && p->seen_dot)
2063             return 1;
2064 
2065         return 0;
2066     }
2067 
2068     p->seen_other = 1;
2069     return 1;
2070 }
2071 
2072 static int ocfs2_empty_dir_dx(struct inode *inode,
2073                   struct ocfs2_empty_dir_priv *priv)
2074 {
2075     int ret;
2076     struct buffer_head *di_bh = NULL;
2077     struct buffer_head *dx_root_bh = NULL;
2078     struct ocfs2_dinode *di;
2079     struct ocfs2_dx_root_block *dx_root;
2080 
2081     priv->dx_dir = 1;
2082 
2083     ret = ocfs2_read_inode_block(inode, &di_bh);
2084     if (ret) {
2085         mlog_errno(ret);
2086         goto out;
2087     }
2088     di = (struct ocfs2_dinode *)di_bh->b_data;
2089 
2090     ret = ocfs2_read_dx_root(inode, di, &dx_root_bh);
2091     if (ret) {
2092         mlog_errno(ret);
2093         goto out;
2094     }
2095     dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
2096 
2097     if (le32_to_cpu(dx_root->dr_num_entries) != 2)
2098         priv->seen_other = 1;
2099 
2100 out:
2101     brelse(di_bh);
2102     brelse(dx_root_bh);
2103     return ret;
2104 }
2105 
2106 /*
2107  * routine to check that the specified directory is empty (for rmdir)
2108  *
2109  * Returns 1 if dir is empty, zero otherwise.
2110  *
2111  * XXX: This is a performance problem for unindexed directories.
2112  */
2113 int ocfs2_empty_dir(struct inode *inode)
2114 {
2115     int ret;
2116     struct ocfs2_empty_dir_priv priv = {
2117         .ctx.actor = ocfs2_empty_dir_filldir,
2118     };
2119 
2120     if (ocfs2_dir_indexed(inode)) {
2121         ret = ocfs2_empty_dir_dx(inode, &priv);
2122         if (ret)
2123             mlog_errno(ret);
2124         /*
2125          * We still run ocfs2_dir_foreach to get the checks
2126          * for "." and "..".
2127          */
2128     }
2129 
2130     ret = ocfs2_dir_foreach(inode, &priv.ctx);
2131     if (ret)
2132         mlog_errno(ret);
2133 
2134     if (!priv.seen_dot || !priv.seen_dot_dot) {
2135         mlog(ML_ERROR, "bad directory (dir #%llu) - no `.' or `..'\n",
2136              (unsigned long long)OCFS2_I(inode)->ip_blkno);
2137         /*
2138          * XXX: Is it really safe to allow an unlink to continue?
2139          */
2140         return 1;
2141     }
2142 
2143     return !priv.seen_other;
2144 }
2145 
2146 /*
2147  * Fills "." and ".." dirents in a new directory block. Returns dirent for
2148  * "..", which might be used during creation of a directory with a trailing
2149  * header. It is otherwise safe to ignore the return code.
2150  */
2151 static struct ocfs2_dir_entry *ocfs2_fill_initial_dirents(struct inode *inode,
2152                               struct inode *parent,
2153                               char *start,
2154                               unsigned int size)
2155 {
2156     struct ocfs2_dir_entry *de = (struct ocfs2_dir_entry *)start;
2157 
2158     de->inode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
2159     de->name_len = 1;
2160     de->rec_len =
2161         cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
2162     strcpy(de->name, ".");
2163     ocfs2_set_de_type(de, S_IFDIR);
2164 
2165     de = (struct ocfs2_dir_entry *) ((char *)de + le16_to_cpu(de->rec_len));
2166     de->inode = cpu_to_le64(OCFS2_I(parent)->ip_blkno);
2167     de->rec_len = cpu_to_le16(size - OCFS2_DIR_REC_LEN(1));
2168     de->name_len = 2;
2169     strcpy(de->name, "..");
2170     ocfs2_set_de_type(de, S_IFDIR);
2171 
2172     return de;
2173 }
2174 
2175 /*
2176  * This works together with code in ocfs2_mknod_locked() which sets
2177  * the inline-data flag and initializes the inline-data section.
2178  */
2179 static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
2180                  handle_t *handle,
2181                  struct inode *parent,
2182                  struct inode *inode,
2183                  struct buffer_head *di_bh)
2184 {
2185     int ret;
2186     struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2187     struct ocfs2_inline_data *data = &di->id2.i_data;
2188     unsigned int size = le16_to_cpu(data->id_count);
2189 
2190     ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
2191                       OCFS2_JOURNAL_ACCESS_WRITE);
2192     if (ret) {
2193         mlog_errno(ret);
2194         goto out;
2195     }
2196 
2197     ocfs2_fill_initial_dirents(inode, parent, data->id_data, size);
2198     ocfs2_journal_dirty(handle, di_bh);
2199 
2200     i_size_write(inode, size);
2201     set_nlink(inode, 2);
2202     inode->i_blocks = ocfs2_inode_sector_count(inode);
2203 
2204     ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
2205     if (ret < 0)
2206         mlog_errno(ret);
2207 
2208 out:
2209     return ret;
2210 }
2211 
2212 static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
2213                  handle_t *handle,
2214                  struct inode *parent,
2215                  struct inode *inode,
2216                  struct buffer_head *fe_bh,
2217                  struct ocfs2_alloc_context *data_ac,
2218                  struct buffer_head **ret_new_bh)
2219 {
2220     int status;
2221     unsigned int size = osb->sb->s_blocksize;
2222     struct buffer_head *new_bh = NULL;
2223     struct ocfs2_dir_entry *de;
2224 
2225     if (ocfs2_new_dir_wants_trailer(inode))
2226         size = ocfs2_dir_trailer_blk_off(parent->i_sb);
2227 
2228     status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
2229                      data_ac, NULL, &new_bh);
2230     if (status < 0) {
2231         mlog_errno(status);
2232         goto bail;
2233     }
2234 
2235     ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh);
2236 
2237     status = ocfs2_journal_access_db(handle, INODE_CACHE(inode), new_bh,
2238                      OCFS2_JOURNAL_ACCESS_CREATE);
2239     if (status < 0) {
2240         mlog_errno(status);
2241         goto bail;
2242     }
2243     memset(new_bh->b_data, 0, osb->sb->s_blocksize);
2244 
2245     de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size);
2246     if (ocfs2_new_dir_wants_trailer(inode)) {
2247         int size = le16_to_cpu(de->rec_len);
2248 
2249         /*
2250          * Figure out the size of the hole left over after
2251          * insertion of '.' and '..'. The trailer wants this
2252          * information.
2253          */
2254         size -= OCFS2_DIR_REC_LEN(2);
2255         size -= sizeof(struct ocfs2_dir_block_trailer);
2256 
2257         ocfs2_init_dir_trailer(inode, new_bh, size);
2258     }
2259 
2260     ocfs2_journal_dirty(handle, new_bh);
2261 
2262     i_size_write(inode, inode->i_sb->s_blocksize);
2263     set_nlink(inode, 2);
2264     inode->i_blocks = ocfs2_inode_sector_count(inode);
2265     status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
2266     if (status < 0) {
2267         mlog_errno(status);
2268         goto bail;
2269     }
2270 
2271     status = 0;
2272     if (ret_new_bh) {
2273         *ret_new_bh = new_bh;
2274         new_bh = NULL;
2275     }
2276 bail:
2277     brelse(new_bh);
2278 
2279     return status;
2280 }
2281 
2282 static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2283                      handle_t *handle, struct inode *dir,
2284                      struct buffer_head *di_bh,
2285                      struct buffer_head *dirdata_bh,
2286                      struct ocfs2_alloc_context *meta_ac,
2287                      int dx_inline, u32 num_entries,
2288                      struct buffer_head **ret_dx_root_bh)
2289 {
2290     int ret;
2291     struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
2292     u16 dr_suballoc_bit;
2293     u64 suballoc_loc, dr_blkno;
2294     unsigned int num_bits;
2295     struct buffer_head *dx_root_bh = NULL;
2296     struct ocfs2_dx_root_block *dx_root;
2297     struct ocfs2_dir_block_trailer *trailer =
2298         ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
2299 
2300     ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
2301                    &dr_suballoc_bit, &num_bits, &dr_blkno);
2302     if (ret) {
2303         mlog_errno(ret);
2304         goto out;
2305     }
2306 
2307     trace_ocfs2_dx_dir_attach_index(
2308                 (unsigned long long)OCFS2_I(dir)->ip_blkno,
2309                 (unsigned long long)dr_blkno);
2310 
2311     dx_root_bh = sb_getblk(osb->sb, dr_blkno);
2312     if (dx_root_bh == NULL) {
2313         ret = -ENOMEM;
2314         goto out;
2315     }
2316     ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dx_root_bh);
2317 
2318     ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
2319                       OCFS2_JOURNAL_ACCESS_CREATE);
2320     if (ret < 0) {
2321         mlog_errno(ret);
2322         goto out;
2323     }
2324 
2325     dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
2326     memset(dx_root, 0, osb->sb->s_blocksize);
2327     strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE);
2328     dx_root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
2329     dx_root->dr_suballoc_loc = cpu_to_le64(suballoc_loc);
2330     dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit);
2331     dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation);
2332     dx_root->dr_blkno = cpu_to_le64(dr_blkno);
2333     dx_root->dr_dir_blkno = cpu_to_le64(OCFS2_I(dir)->ip_blkno);
2334     dx_root->dr_num_entries = cpu_to_le32(num_entries);
2335     if (le16_to_cpu(trailer->db_free_rec_len))
2336         dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr);
2337     else
2338         dx_root->dr_free_blk = cpu_to_le64(0);
2339 
2340     if (dx_inline) {
2341         dx_root->dr_flags |= OCFS2_DX_FLAG_INLINE;
2342         dx_root->dr_entries.de_count =
2343             cpu_to_le16(ocfs2_dx_entries_per_root(osb->sb));
2344     } else {
2345         dx_root->dr_list.l_count =
2346             cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
2347     }
2348     ocfs2_journal_dirty(handle, dx_root_bh);
2349 
2350     ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
2351                       OCFS2_JOURNAL_ACCESS_CREATE);
2352     if (ret) {
2353         mlog_errno(ret);
2354         goto out;
2355     }
2356 
2357     di->i_dx_root = cpu_to_le64(dr_blkno);
2358 
2359     spin_lock(&OCFS2_I(dir)->ip_lock);
2360     OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;
2361     di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
2362     spin_unlock(&OCFS2_I(dir)->ip_lock);
2363 
2364     ocfs2_journal_dirty(handle, di_bh);
2365 
2366     *ret_dx_root_bh = dx_root_bh;
2367     dx_root_bh = NULL;
2368 
2369 out:
2370     brelse(dx_root_bh);
2371     return ret;
2372 }
2373 
2374 static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb,
2375                        handle_t *handle, struct inode *dir,
2376                        struct buffer_head **dx_leaves,
2377                        int num_dx_leaves, u64 start_blk)
2378 {
2379     int ret, i;
2380     struct ocfs2_dx_leaf *dx_leaf;
2381     struct buffer_head *bh;
2382 
2383     for (i = 0; i < num_dx_leaves; i++) {
2384         bh = sb_getblk(osb->sb, start_blk + i);
2385         if (bh == NULL) {
2386             ret = -ENOMEM;
2387             goto out;
2388         }
2389         dx_leaves[i] = bh;
2390 
2391         ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), bh);
2392 
2393         ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), bh,
2394                           OCFS2_JOURNAL_ACCESS_CREATE);
2395         if (ret < 0) {
2396             mlog_errno(ret);
2397             goto out;
2398         }
2399 
2400         dx_leaf = (struct ocfs2_dx_leaf *) bh->b_data;
2401 
2402         memset(dx_leaf, 0, osb->sb->s_blocksize);
2403         strcpy(dx_leaf->dl_signature, OCFS2_DX_LEAF_SIGNATURE);
2404         dx_leaf->dl_fs_generation = cpu_to_le32(osb->fs_generation);
2405         dx_leaf->dl_blkno = cpu_to_le64(bh->b_blocknr);
2406         dx_leaf->dl_list.de_count =
2407             cpu_to_le16(ocfs2_dx_entries_per_leaf(osb->sb));
2408 
2409         trace_ocfs2_dx_dir_format_cluster(
2410                 (unsigned long long)OCFS2_I(dir)->ip_blkno,
2411                 (unsigned long long)bh->b_blocknr,
2412                 le16_to_cpu(dx_leaf->dl_list.de_count));
2413 
2414         ocfs2_journal_dirty(handle, bh);
2415     }
2416 
2417     ret = 0;
2418 out:
2419     return ret;
2420 }
2421 
2422 /*
2423  * Allocates and formats a new cluster for use in an indexed dir
2424  * leaf. This version will not do the extent insert, so that it can be
2425  * used by operations which need careful ordering.
2426  */
2427 static int __ocfs2_dx_dir_new_cluster(struct inode *dir,
2428                       u32 cpos, handle_t *handle,
2429                       struct ocfs2_alloc_context *data_ac,
2430                       struct buffer_head **dx_leaves,
2431                       int num_dx_leaves, u64 *ret_phys_blkno)
2432 {
2433     int ret;
2434     u32 phys, num;
2435     u64 phys_blkno;
2436     struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
2437 
2438     /*
2439      * XXX: For create, this should claim cluster for the index
2440      * *before* the unindexed insert so that we have a better
2441      * chance of contiguousness as the directory grows in number
2442      * of entries.
2443      */
2444     ret = __ocfs2_claim_clusters(handle, data_ac, 1, 1, &phys, &num);
2445     if (ret) {
2446         mlog_errno(ret);
2447         goto out;
2448     }
2449 
2450     /*
2451      * Format the new cluster first. That way, we're inserting
2452      * valid data.
2453      */
2454     phys_blkno = ocfs2_clusters_to_blocks(osb->sb, phys);
2455     ret = ocfs2_dx_dir_format_cluster(osb, handle, dir, dx_leaves,
2456                       num_dx_leaves, phys_blkno);
2457     if (ret) {
2458         mlog_errno(ret);
2459         goto out;
2460     }
2461 
2462     *ret_phys_blkno = phys_blkno;
2463 out:
2464     return ret;
2465 }
2466 
2467 static int ocfs2_dx_dir_new_cluster(struct inode *dir,
2468                     struct ocfs2_extent_tree *et,
2469                     u32 cpos, handle_t *handle,
2470                     struct ocfs2_alloc_context *data_ac,
2471                     struct ocfs2_alloc_context *meta_ac,
2472                     struct buffer_head **dx_leaves,
2473                     int num_dx_leaves)
2474 {
2475     int ret;
2476     u64 phys_blkno;
2477 
2478     ret = __ocfs2_dx_dir_new_cluster(dir, cpos, handle, data_ac, dx_leaves,
2479                      num_dx_leaves, &phys_blkno);
2480     if (ret) {
2481         mlog_errno(ret);
2482         goto out;
2483     }
2484 
2485     ret = ocfs2_insert_extent(handle, et, cpos, phys_blkno, 1, 0,
2486                   meta_ac);
2487     if (ret)
2488         mlog_errno(ret);
2489 out:
2490     return ret;
2491 }
2492 
2493 static struct buffer_head **ocfs2_dx_dir_kmalloc_leaves(struct super_block *sb,
2494                             int *ret_num_leaves)
2495 {
2496     int num_dx_leaves = ocfs2_clusters_to_blocks(sb, 1);
2497     struct buffer_head **dx_leaves;
2498 
2499     dx_leaves = kcalloc(num_dx_leaves, sizeof(struct buffer_head *),
2500                 GFP_NOFS);
2501     if (dx_leaves && ret_num_leaves)
2502         *ret_num_leaves = num_dx_leaves;
2503 
2504     return dx_leaves;
2505 }
2506 
2507 static int ocfs2_fill_new_dir_dx(struct ocfs2_super *osb,
2508                  handle_t *handle,
2509                  struct inode *parent,
2510                  struct inode *inode,
2511                  struct buffer_head *di_bh,
2512                  struct ocfs2_alloc_context *data_ac,
2513                  struct ocfs2_alloc_context *meta_ac)
2514 {
2515     int ret;
2516     struct buffer_head *leaf_bh = NULL;
2517     struct buffer_head *dx_root_bh = NULL;
2518     struct ocfs2_dx_hinfo hinfo;
2519     struct ocfs2_dx_root_block *dx_root;
2520     struct ocfs2_dx_entry_list *entry_list;
2521 
2522     /*
2523      * Our strategy is to create the directory as though it were
2524      * unindexed, then add the index block. This works with very
2525      * little complication since the state of a new directory is a
2526      * very well known quantity.
2527      *
2528      * Essentially, we have two dirents ("." and ".."), in the 1st
2529      * block which need indexing. These are easily inserted into
2530      * the index block.
2531      */
2532 
2533     ret = ocfs2_fill_new_dir_el(osb, handle, parent, inode, di_bh,
2534                     data_ac, &leaf_bh);
2535     if (ret) {
2536         mlog_errno(ret);
2537         goto out;
2538     }
2539 
2540     ret = ocfs2_dx_dir_attach_index(osb, handle, inode, di_bh, leaf_bh,
2541                     meta_ac, 1, 2, &dx_root_bh);
2542     if (ret) {
2543         mlog_errno(ret);
2544         goto out;
2545     }
2546     dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
2547     entry_list = &dx_root->dr_entries;
2548 
2549     /* Buffer has been journaled for us by ocfs2_dx_dir_attach_index */
2550     ocfs2_dx_dir_name_hash(inode, ".", 1, &hinfo);
2551     ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr);
2552 
2553     ocfs2_dx_dir_name_hash(inode, "..", 2, &hinfo);
2554     ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr);
2555 
2556 out:
2557     brelse(dx_root_bh);
2558     brelse(leaf_bh);
2559     return ret;
2560 }
2561 
2562 int ocfs2_fill_new_dir(struct ocfs2_super *osb,
2563                handle_t *handle,
2564                struct inode *parent,
2565                struct inode *inode,
2566                struct buffer_head *fe_bh,
2567                struct ocfs2_alloc_context *data_ac,
2568                struct ocfs2_alloc_context *meta_ac)
2569 
2570 {
2571     BUG_ON(!ocfs2_supports_inline_data(osb) && data_ac == NULL);
2572 
2573     if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
2574         return ocfs2_fill_new_dir_id(osb, handle, parent, inode, fe_bh);
2575 
2576     if (ocfs2_supports_indexed_dirs(osb))
2577         return ocfs2_fill_new_dir_dx(osb, handle, parent, inode, fe_bh,
2578                          data_ac, meta_ac);
2579 
2580     return ocfs2_fill_new_dir_el(osb, handle, parent, inode, fe_bh,
2581                      data_ac, NULL);
2582 }
2583 
2584 static int ocfs2_dx_dir_index_block(struct inode *dir,
2585                     handle_t *handle,
2586                     struct buffer_head **dx_leaves,
2587                     int num_dx_leaves,
2588                     u32 *num_dx_entries,
2589                     struct buffer_head *dirent_bh)
2590 {
2591     int ret = 0, namelen, i;
2592     char *de_buf, *limit;
2593     struct ocfs2_dir_entry *de;
2594     struct buffer_head *dx_leaf_bh;
2595     struct ocfs2_dx_hinfo hinfo;
2596     u64 dirent_blk = dirent_bh->b_blocknr;
2597 
2598     de_buf = dirent_bh->b_data;
2599     limit = de_buf + dir->i_sb->s_blocksize;
2600 
2601     while (de_buf < limit) {
2602         de = (struct ocfs2_dir_entry *)de_buf;
2603 
2604         namelen = de->name_len;
2605         if (!namelen || !de->inode)
2606             goto inc;
2607 
2608         ocfs2_dx_dir_name_hash(dir, de->name, namelen, &hinfo);
2609 
2610         i = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb), &hinfo);
2611         dx_leaf_bh = dx_leaves[i];
2612 
2613         ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &hinfo,
2614                          dirent_blk, dx_leaf_bh);
2615         if (ret) {
2616             mlog_errno(ret);
2617             goto out;
2618         }
2619 
2620         *num_dx_entries = *num_dx_entries + 1;
2621 
2622 inc:
2623         de_buf += le16_to_cpu(de->rec_len);
2624     }
2625 
2626 out:
2627     return ret;
2628 }
2629 
2630 /*
2631  * XXX: This expects dx_root_bh to already be part of the transaction.
2632  */
2633 static void ocfs2_dx_dir_index_root_block(struct inode *dir,
2634                      struct buffer_head *dx_root_bh,
2635                      struct buffer_head *dirent_bh)
2636 {
2637     char *de_buf, *limit;
2638     struct ocfs2_dx_root_block *dx_root;
2639     struct ocfs2_dir_entry *de;
2640     struct ocfs2_dx_hinfo hinfo;
2641     u64 dirent_blk = dirent_bh->b_blocknr;
2642 
2643     dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
2644 
2645     de_buf = dirent_bh->b_data;
2646     limit = de_buf + dir->i_sb->s_blocksize;
2647 
2648     while (de_buf < limit) {
2649         de = (struct ocfs2_dir_entry *)de_buf;
2650 
2651         if (!de->name_len || !de->inode)
2652             goto inc;
2653 
2654         ocfs2_dx_dir_name_hash(dir, de->name, de->name_len, &hinfo);
2655 
2656         trace_ocfs2_dx_dir_index_root_block(
2657                 (unsigned long long)dir->i_ino,
2658                 hinfo.major_hash, hinfo.minor_hash,
2659                 de->name_len, de->name,
2660                 le16_to_cpu(dx_root->dr_entries.de_num_used));
2661 
2662         ocfs2_dx_entry_list_insert(&dx_root->dr_entries, &hinfo,
2663                        dirent_blk);
2664 
2665         le32_add_cpu(&dx_root->dr_num_entries, 1);
2666 inc:
2667         de_buf += le16_to_cpu(de->rec_len);
2668     }
2669 }
2670 
2671 /*
2672  * Count the number of inline directory entries in di_bh and compare
2673  * them against the number of entries we can hold in an inline dx root
2674  * block.
2675  */
2676 static int ocfs2_new_dx_should_be_inline(struct inode *dir,
2677                      struct buffer_head *di_bh)
2678 {
2679     int dirent_count = 0;
2680     char *de_buf, *limit;
2681     struct ocfs2_dir_entry *de;
2682     struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2683 
2684     de_buf = di->id2.i_data.id_data;
2685     limit = de_buf + i_size_read(dir);
2686 
2687     while (de_buf < limit) {
2688         de = (struct ocfs2_dir_entry *)de_buf;
2689 
2690         if (de->name_len && de->inode)
2691             dirent_count++;
2692 
2693         de_buf += le16_to_cpu(de->rec_len);
2694     }
2695 
2696     /* We are careful to leave room for one extra record. */
2697     return dirent_count < ocfs2_dx_entries_per_root(dir->i_sb);
2698 }
2699 
2700 /*
2701  * Expand rec_len of the rightmost dirent in a directory block so that it
2702  * contains the end of our valid space for dirents. We do this during
2703  * expansion from an inline directory to one with extents. The first dir block
2704  * in that case is taken from the inline data portion of the inode block.
2705  *
2706  * This will also return the largest amount of contiguous space for a dirent
2707  * in the block. That value is *not* necessarily the last dirent, even after
2708  * expansion. The directory indexing code wants this value for free space
2709  * accounting. We do this here since we're already walking the entire dir
2710  * block.
2711  *
2712  * We add the dir trailer if this filesystem wants it.
2713  */
2714 static unsigned int ocfs2_expand_last_dirent(char *start, unsigned int old_size,
2715                          struct inode *dir)
2716 {
2717     struct super_block *sb = dir->i_sb;
2718     struct ocfs2_dir_entry *de;
2719     struct ocfs2_dir_entry *prev_de;
2720     char *de_buf, *limit;
2721     unsigned int new_size = sb->s_blocksize;
2722     unsigned int bytes, this_hole;
2723     unsigned int largest_hole = 0;
2724 
2725     if (ocfs2_new_dir_wants_trailer(dir))
2726         new_size = ocfs2_dir_trailer_blk_off(sb);
2727 
2728     bytes = new_size - old_size;
2729 
2730     limit = start + old_size;
2731     de_buf = start;
2732     de = (struct ocfs2_dir_entry *)de_buf;
2733     do {
2734         this_hole = ocfs2_figure_dirent_hole(de);
2735         if (this_hole > largest_hole)
2736             largest_hole = this_hole;
2737 
2738         prev_de = de;
2739         de_buf += le16_to_cpu(de->rec_len);
2740         de = (struct ocfs2_dir_entry *)de_buf;
2741     } while (de_buf < limit);
2742 
2743     le16_add_cpu(&prev_de->rec_len, bytes);
2744 
2745     /* We need to double check this after modification of the final
2746      * dirent. */
2747     this_hole = ocfs2_figure_dirent_hole(prev_de);
2748     if (this_hole > largest_hole)
2749         largest_hole = this_hole;
2750 
2751     if (largest_hole >= OCFS2_DIR_MIN_REC_LEN)
2752         return largest_hole;
2753     return 0;
2754 }
2755 
2756 /*
2757  * We allocate enough clusters to fulfill "blocks_wanted", but set
2758  * i_size to exactly one block. ocfs2_extend_dir() will handle the
2759  * rest automatically for us.
2760  *
2761  * *first_block_bh is a pointer to the 1st data block allocated to the
2762  *  directory.
      *
      * In order, this function:
      *  - takes ip_alloc_sem for writing (dropped again at 'out');
      *  - when indexed directories are supported, adds the index credits,
      *    reserves one metadata block for the dx_root and, unless the
      *    index is to live inline in the root, one extra cluster plus the
      *    dx_leaves[] buffer array;
      *  - reserves the data clusters, starts the transaction and charges
      *    quota for alloc + dx_alloc clusters;
      *  - copies the inline dirents into the first block of the newly
      *    claimed cluster, zeroes the rest of that block, stretches the
      *    last dirent and, if wanted, writes the dir trailer;
      *  - clears OCFS2_INLINE_DATA_FL, installs a new extent list, sets
      *    i_size to one block and inserts the new extent;
      *  - attaches the index (if supported) and, when the first claim
      *    returned fewer clusters than 'alloc', claims another cluster as
      *    a separate extent.
      *
      * On success 0 is returned, the caller owns the buffer reference
      * stored in *first_block_bh and, for indexed directories,
      * lookup->dl_dx_root_bh (plus lookup->dl_dx_leaf_bh when the index is
      * not inline) is filled in. Otherwise the error of the failing step is
      * returned.
2763  */
2764 static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
2765                    unsigned int blocks_wanted,
2766                    struct ocfs2_dir_lookup_result *lookup,
2767                    struct buffer_head **first_block_bh)
2768 {
2769     u32 alloc, dx_alloc, bit_off, len, num_dx_entries = 0;
2770     struct super_block *sb = dir->i_sb;
2771     int ret, i, num_dx_leaves = 0, dx_inline = 0,
2772         credits = ocfs2_inline_to_extents_credits(sb);
2773     u64 dx_insert_blkno, blkno,
2774         bytes = blocks_wanted << sb->s_blocksize_bits;
2775     struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
2776     struct ocfs2_inode_info *oi = OCFS2_I(dir);
2777     struct ocfs2_alloc_context *data_ac = NULL;
2778     struct ocfs2_alloc_context *meta_ac = NULL;
2779     struct buffer_head *dirdata_bh = NULL;
2780     struct buffer_head *dx_root_bh = NULL;
2781     struct buffer_head **dx_leaves = NULL;
2782     struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2783     handle_t *handle;
2784     struct ocfs2_extent_tree et;
2785     struct ocfs2_extent_tree dx_et;
2786     int did_quota = 0, bytes_allocated = 0;
2787 
2788     ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir), di_bh);
2789 
2790     alloc = ocfs2_clusters_for_bytes(sb, bytes);
2791     dx_alloc = 0;
2792 
         /* Held for writing until the 'out' label at the end. */
2793     down_write(&oi->ip_alloc_sem);
2794 
2795     if (ocfs2_supports_indexed_dirs(osb)) {
2796         credits += ocfs2_add_dir_index_credits(sb);
2797 
2798         dx_inline = ocfs2_new_dx_should_be_inline(dir, di_bh);
2799         if (!dx_inline) {
2800             /* Add one more cluster for an index leaf */
2801             dx_alloc++;
2802             dx_leaves = ocfs2_dx_dir_kmalloc_leaves(sb,
2803                                 &num_dx_leaves);
2804             if (!dx_leaves) {
2805                 ret = -ENOMEM;
2806                 mlog_errno(ret);
2807                 goto out;
2808             }
2809         }
2810 
2811         /* This gets us the dx_root */
2812         ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
2813         if (ret) {
2814             mlog_errno(ret);
2815             goto out;
2816         }
2817     }
2818 
2819     /*
2820      * We should never need more than 2 clusters for the unindexed
2821      * tree - maximum dirent size is far less than one block. In
2822      * fact, the only time we'd need more than one cluster is if
2823      * blocksize == clustersize and the dirent won't fit in the
2824      * extra space that the expansion to a single block gives. As
2825      * of today, that only happens on 4k/4k file systems.
2826      */
2827     BUG_ON(alloc > 2);
2828 
2829     ret = ocfs2_reserve_clusters(osb, alloc + dx_alloc, &data_ac);
2830     if (ret) {
2831         mlog_errno(ret);
2832         goto out;
2833     }
2834 
2835     /*
2836      * Prepare for worst case allocation scenario of two separate
2837      * extents in the unindexed tree.
2838      */
2839     if (alloc == 2)
2840         credits += OCFS2_SUBALLOC_ALLOC;
2841 
2842     handle = ocfs2_start_trans(osb, credits);
2843     if (IS_ERR(handle)) {
2844         ret = PTR_ERR(handle);
2845         mlog_errno(ret);
2846         goto out;
2847     }
2848 
         /*
          * From here on every failure must leave through 'out_commit' so
          * that the transaction is committed and the quota is adjusted.
          */
2849     ret = dquot_alloc_space_nodirty(dir,
2850         ocfs2_clusters_to_bytes(osb->sb, alloc + dx_alloc));
2851     if (ret)
2852         goto out_commit;
2853     did_quota = 1;
2854 
2855     if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
2856         /*
2857          * Allocate our index cluster first, to maximize the
2858          * possibility that unindexed leaves grow
2859          * contiguously.
2860          */
2861         ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac,
2862                          dx_leaves, num_dx_leaves,
2863                          &dx_insert_blkno);
2864         if (ret) {
2865             mlog_errno(ret);
2866             goto out_commit;
2867         }
2868         bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
2869     }
2870 
2871     /*
2872      * Try to claim as many clusters as the bitmap can give though
2873      * if we only get one now, that's enough to continue. The rest
2874      * will be claimed after the conversion to extents.
2875      */
2876     if (ocfs2_dir_resv_allowed(osb))
2877         data_ac->ac_resv = &oi->ip_la_data_resv;
2878     ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, &len);
2879     if (ret) {
2880         mlog_errno(ret);
2881         goto out_commit;
2882     }
2883     bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
2884 
2885     /*
2886      * Operations are carefully ordered so that we set up the new
2887      * data block first. The conversion from inline data to
2888      * extents follows.
2889      */
2890     blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
2891     dirdata_bh = sb_getblk(sb, blkno);
2892     if (!dirdata_bh) {
2893         ret = -ENOMEM;
2894         mlog_errno(ret);
2895         goto out_commit;
2896     }
2897 
2898     ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dirdata_bh);
2899 
2900     ret = ocfs2_journal_access_db(handle, INODE_CACHE(dir), dirdata_bh,
2901                       OCFS2_JOURNAL_ACCESS_CREATE);
2902     if (ret) {
2903         mlog_errno(ret);
2904         goto out_commit;
2905     }
2906 
         /*
          * Move the inline dirents (i_size bytes) into the new block and
          * zero everything behind them. 'i' receives the largest hole
          * reported by ocfs2_expand_last_dirent() and is handed on to the
          * trailer initialisation below.
          */
2907     memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir));
2908     memset(dirdata_bh->b_data + i_size_read(dir), 0,
2909            sb->s_blocksize - i_size_read(dir));
2910     i = ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), dir);
2911     if (ocfs2_new_dir_wants_trailer(dir)) {
2912         /*
2913          * Prepare the dir trailer up front. It will otherwise look
2914          * like a valid dirent. Even if inserting the index fails
2915          * (unlikely), then all we'll have done is given first dir
2916          * block a small amount of fragmentation.
2917          */
2918         ocfs2_init_dir_trailer(dir, dirdata_bh, i);
2919     }
2920 
2921     ocfs2_update_inode_fsync_trans(handle, dir, 1);
2922     ocfs2_journal_dirty(handle, dirdata_bh);
2923 
2924     if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
2925         /*
2926          * Dx dirs with an external cluster need to do this up
2927          * front. Inline dx root's get handled later, after
2928          * we've allocated our root block. We get passed back
2929          * a total number of items so that dr_num_entries can
2930          * be correctly set once the dx_root has been
2931          * allocated.
2932          */
2933         ret = ocfs2_dx_dir_index_block(dir, handle, dx_leaves,
2934                            num_dx_leaves, &num_dx_entries,
2935                            dirdata_bh);
2936         if (ret) {
2937             mlog_errno(ret);
2938             goto out_commit;
2939         }
2940     }
2941 
2942     /*
2943      * Set extent, i_size, etc on the directory. After this, the
2944      * inode should contain the same exact dirents as before and
2945      * be fully accessible from system calls.
2946      *
2947      * We let the later dirent insert modify c/mtime - to the user
2948      * the data hasn't changed.
          *
          * NOTE(review): the code below nevertheless sets i_mtime and
          * i_ctime to current_time(dir) - confirm which of the two is
          * intended.
2949      */
2950     ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
2951                       OCFS2_JOURNAL_ACCESS_CREATE);
2952     if (ret) {
2953         mlog_errno(ret);
2954         goto out_commit;
2955     }
2956 
         /* Keep the in-memory and on-disk feature flags in step. */
2957     spin_lock(&oi->ip_lock);
2958     oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
2959     di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
2960     spin_unlock(&oi->ip_lock);
2961 
2962     ocfs2_dinode_new_extent_list(dir, di);
2963 
2964     i_size_write(dir, sb->s_blocksize);
2965     dir->i_mtime = dir->i_ctime = current_time(dir);
2966 
2967     di->i_size = cpu_to_le64(sb->s_blocksize);
2968     di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec);
2969     di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec);
2970     ocfs2_update_inode_fsync_trans(handle, dir, 1);
2971 
2972     /*
2973      * This should never fail as our extent list is empty and all
2974      * related blocks have been journaled already.
2975      */
2976     ret = ocfs2_insert_extent(handle, &et, 0, blkno, len,
2977                   0, NULL);
2978     if (ret) {
2979         mlog_errno(ret);
2980         goto out_commit;
2981     }
2982 
2983     /*
2984      * Set i_blocks after the extent insert for the most up to
2985      * date ip_clusters value.
2986      */
2987     dir->i_blocks = ocfs2_inode_sector_count(dir);
2988 
2989     ocfs2_journal_dirty(handle, di_bh);
2990 
2991     if (ocfs2_supports_indexed_dirs(osb)) {
2992         ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh,
2993                         dirdata_bh, meta_ac, dx_inline,
2994                         num_dx_entries, &dx_root_bh);
2995         if (ret) {
2996             mlog_errno(ret);
2997             goto out_commit;
2998         }
2999 
3000         if (dx_inline) {
3001             ocfs2_dx_dir_index_root_block(dir, dx_root_bh,
3002                               dirdata_bh);
3003         } else {
3004             ocfs2_init_dx_root_extent_tree(&dx_et,
3005                                INODE_CACHE(dir),
3006                                dx_root_bh);
3007             ret = ocfs2_insert_extent(handle, &dx_et, 0,
3008                           dx_insert_blkno, 1, 0, NULL);
                 /*
                  * NOTE(review): a failure here is only logged. Execution
                  * carries on, and 'ret' is overwritten below if a second
                  * cluster has to be claimed - confirm this is intended.
                  */
3009             if (ret)
3010                 mlog_errno(ret);
3011         }
3012     }
3013 
3014     /*
3015      * We asked for two clusters, but only got one in the 1st
3016      * pass. Claim the 2nd cluster as a separate extent.
3017      */
3018     if (alloc > len) {
3019         ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
3020                        &len);
3021         if (ret) {
3022             mlog_errno(ret);
3023             goto out_commit;
3024         }
3025         blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
3026 
3027         ret = ocfs2_insert_extent(handle, &et, 1,
3028                       blkno, len, 0, NULL);
3029         if (ret) {
3030             mlog_errno(ret);
3031             goto out_commit;
3032         }
3033         bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
3034     }
3035 
         /*
          * Hand our reference on the first data block to the caller;
          * clearing dirdata_bh keeps the brelse() at 'out' from dropping it.
          */
3036     *first_block_bh = dirdata_bh;
3037     dirdata_bh = NULL;
3038     if (ocfs2_supports_indexed_dirs(osb)) {
3039         unsigned int off;
3040 
3041         if (!dx_inline) {
3042             /*
3043              * We need to return the correct block within the
3044              * cluster which should hold our entry.
3045              */
3046             off = ocfs2_dx_dir_hash_idx(osb,
3047                             &lookup->dl_hinfo);
                 /*
                  * Extra reference for the lookup result: every entry of
                  * dx_leaves[] is released at 'out'.
                  */
3048             get_bh(dx_leaves[off]);
3049             lookup->dl_dx_leaf_bh = dx_leaves[off];
3050         }
             /* The dx_root reference moves to the lookup result as well. */
3051         lookup->dl_dx_root_bh = dx_root_bh;
3052         dx_root_bh = NULL;
3053     }
3054 
3055 out_commit:
         /*
          * NOTE(review): quota was charged for alloc + dx_alloc clusters,
          * but on failure only bytes_allocated (the clusters obtained so
          * far) is given back - confirm this is the intended accounting.
          */
3056     if (ret < 0 && did_quota)
3057         dquot_free_space_nodirty(dir, bytes_allocated);
3058 
3059     ocfs2_commit_trans(osb, handle);
3060 
3061 out:
3062     up_write(&oi->ip_alloc_sem);
3063     if (data_ac)
3064         ocfs2_free_alloc_context(data_ac);
3065     if (meta_ac)
3066         ocfs2_free_alloc_context(meta_ac);
3067 
3068     if (dx_leaves) {
3069         for (i = 0; i < num_dx_leaves; i++)
3070             brelse(dx_leaves[i]);
3071         kfree(dx_leaves);
3072     }
3073 
3074     brelse(dirdata_bh);
3075     brelse(dx_root_bh);
3076 
3077     return ret;
3078 }
3079 
3080 /* returns a bh of the 1st new block in the allocation. */
3081 static int ocfs2_do_extend_dir(struct super_block *sb,
3082                    handle_t *handle,
3083                    struct inode *dir,
3084                    struct buffer_head *parent_fe_bh,
3085                    struct ocfs2_alloc_context *data_ac,
3086                    struct ocfs2_alloc_context *meta_ac,
3087                    struct buffer_head **new_bh)
3088 {
3089     int status;
3090     int extend, did_quota = 0;
3091     u64 p_blkno, v_blkno;
3092 
3093     spin_lock(&OCFS2_I(dir)->ip_lock);
3094     extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters));
3095     spin_unlock(&OCFS2_I(dir)->ip_lock);
3096 
3097     if (extend) {
3098         u32 offset = OCFS2_I(dir)->ip_clusters;
3099 
3100         status = dquot_alloc_space_nodirty(dir,
3101                     ocfs2_clusters_to_bytes(sb, 1));
3102         if (status)
3103             goto bail;
3104         did_quota = 1;
3105 
3106         status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset,
3107                           1, 0, parent_fe_bh, handle,
3108                           data_ac, meta_ac, NULL);
3109         BUG_ON(status == -EAGAIN);
3110         if (status < 0) {
3111             mlog_errno(status);
3112             goto bail;
3113         }
3114     }
3115 
3116     v_blkno = ocfs2_blocks_for_bytes(sb, i_size_read(dir));
3117     status = ocfs2_extent_map_get_blocks(dir, v_blkno, &p_blkno, NULL, NULL);
3118     if (status < 0) {
3119         mlog_errno(status);
3120         goto bail;
3121     }
3122 
3123     *new_bh = sb_getblk(sb, p_blkno);
3124     if (!*new_bh) {
3125         status = -ENOMEM;
3126         mlog_errno(status);
3127         goto bail;
3128     }
3129     status = 0;
3130 bail:
3131     if (did_quota && status < 0)
3132         dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1));
3133     return status;
3134 }
3135 
3136 /*
3137  * Assumes you already have a cluster lock on the directory.
3138  *
3139  * 'blocks_wanted' is only used if we have an inline directory which
3140  * is to be turned into an extent based one. The size of the dirent to
3141  * insert might be larger than the space gained by growing to just one
3142  * block, so we may have to grow the inode by two blocks in that case.
3143  *
3144  * If the directory is already indexed, dx_root_bh must be provided.
      * It is read from lookup->dl_dx_root_bh.
      *
      * For an inline directory the conversion is done first by
      * ocfs2_expand_inline_dir(); with blocks_wanted == 1 the first block it
      * produced is returned and nothing more is added.
      *
      * In every other case one freshly formatted block is appended: it is
      * zeroed and given a single unused dirent (inode == 0) whose rec_len
      * spans the whole block, or stops at the trailer offset when dir
      * trailers are supported. In the latter case the trailer is initialised
      * and, for indexed directories, linked through
      * ocfs2_dx_dir_link_trailer(). i_size then grows by one block.
      *
      * Returns 0 on success with *new_de_bh holding its own reference to the
      * block the new dirent should go into. On error *new_de_bh is not
      * written.
3145  */
3146 static int ocfs2_extend_dir(struct ocfs2_super *osb,
3147                 struct inode *dir,
3148                 struct buffer_head *parent_fe_bh,
3149                 unsigned int blocks_wanted,
3150                 struct ocfs2_dir_lookup_result *lookup,
3151                 struct buffer_head **new_de_bh)
3152 {
3153     int status = 0;
3154     int credits, num_free_extents, drop_alloc_sem = 0;
3155     loff_t dir_i_size;
3156     struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
3157     struct ocfs2_extent_list *el = &fe->id2.i_list;
3158     struct ocfs2_alloc_context *data_ac = NULL;
3159     struct ocfs2_alloc_context *meta_ac = NULL;
3160     handle_t *handle = NULL;
3161     struct buffer_head *new_bh = NULL;
3162     struct ocfs2_dir_entry * de;
3163     struct super_block *sb = osb->sb;
3164     struct ocfs2_extent_tree et;
3165     struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
3166 
3167     if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
3168         /*
3169          * This would be a code error as an inline directory should
3170          * never have an index root.
3171          */
3172         BUG_ON(dx_root_bh);
3173 
3174         status = ocfs2_expand_inline_dir(dir, parent_fe_bh,
3175                          blocks_wanted, lookup,
3176                          &new_bh);
3177         if (status) {
3178             mlog_errno(status);
3179             goto bail;
3180         }
3181 
3182         /* Expansion from inline to an indexed directory will
3183          * have given us this. */
3184         dx_root_bh = lookup->dl_dx_root_bh;
3185 
3186         if (blocks_wanted == 1) {
3187             /*
3188              * If the new dirent will fit inside the space
3189              * created by pushing out to one block, then
3190              * we can complete the operation
3191              * here. Otherwise we have to expand i_size
3192              * and format the 2nd block below.
3193              */
3194             BUG_ON(new_bh == NULL);
3195             goto bail_bh;
3196         }
3197 
3198         /*
3199          * Get rid of 'new_bh' - we want to format the 2nd
3200          * data block and return that instead.
3201          */
3202         brelse(new_bh);
3203         new_bh = NULL;
3204 
             /*
              * No allocation contexts are reserved on this path; data_ac
              * and meta_ac are still NULL when ocfs2_do_extend_dir() is
              * called below.
              */
3205         down_write(&OCFS2_I(dir)->ip_alloc_sem);
3206         drop_alloc_sem = 1;
3207         dir_i_size = i_size_read(dir);
3208         credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
3209         goto do_extend;
3210     }
3211 
3212     down_write(&OCFS2_I(dir)->ip_alloc_sem);
3213     drop_alloc_sem = 1;
3214     dir_i_size = i_size_read(dir);
3215     trace_ocfs2_extend_dir((unsigned long long)OCFS2_I(dir)->ip_blkno,
3216                    dir_i_size);
3217 
3218     /* dir->i_size is always block aligned. */
3219     spin_lock(&OCFS2_I(dir)->ip_lock);
3220     if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) {
             /*
              * Every allocated cluster is in use, so a new cluster is
              * needed: reserve it, plus metadata when the extent list has
              * no free records.
              */
3221         spin_unlock(&OCFS2_I(dir)->ip_lock);
3222         ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir),
3223                           parent_fe_bh);
3224         num_free_extents = ocfs2_num_free_extents(&et);
3225         if (num_free_extents < 0) {
3226             status = num_free_extents;
3227             mlog_errno(status);
3228             goto bail;
3229         }
3230 
3231         if (!num_free_extents) {
3232             status = ocfs2_reserve_new_metadata(osb, el, &meta_ac);
3233             if (status < 0) {
3234                 if (status != -ENOSPC)
3235                     mlog_errno(status);
3236                 goto bail;
3237             }
3238         }
3239 
3240         status = ocfs2_reserve_clusters(osb, 1, &data_ac);
3241         if (status < 0) {
3242             if (status != -ENOSPC)
3243                 mlog_errno(status);
3244             goto bail;
3245         }
3246 
3247         if (ocfs2_dir_resv_allowed(osb))
3248             data_ac->ac_resv = &OCFS2_I(dir)->ip_la_data_resv;
3249 
3250         credits = ocfs2_calc_extend_credits(sb, el);
3251     } else {
3252         spin_unlock(&OCFS2_I(dir)->ip_lock);
3253         credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
3254     }
3255 
     /*
      * Both paths arrive here with ip_alloc_sem held for writing and with
      * dir_i_size and credits initialised.
      */
3256 do_extend:
3257     if (ocfs2_dir_indexed(dir))
3258         credits++; /* For attaching the new dirent block to the
3259                 * dx_root */
3260 
3261     handle = ocfs2_start_trans(osb, credits);
3262     if (IS_ERR(handle)) {
3263         status = PTR_ERR(handle);
             /* Reset so that 'bail' does not try to commit an error pointer. */
3264         handle = NULL;
3265         mlog_errno(status);
3266         goto bail;
3267     }
3268 
3269     status = ocfs2_do_extend_dir(osb->sb, handle, dir, parent_fe_bh,
3270                      data_ac, meta_ac, &new_bh);
3271     if (status < 0) {
3272         mlog_errno(status);
3273         goto bail;
3274     }
3275 
3276     ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), new_bh);
3277 
3278     status = ocfs2_journal_access_db(handle, INODE_CACHE(dir), new_bh,
3279                      OCFS2_JOURNAL_ACCESS_CREATE);
3280     if (status < 0) {
3281         mlog_errno(status);
3282         goto bail;
3283     }
3284     memset(new_bh->b_data, 0, sb->s_blocksize);
3285 
         /* Format the block as a single unused dirent. */
3286     de = (struct ocfs2_dir_entry *) new_bh->b_data;
3287     de->inode = 0;
3288     if (ocfs2_supports_dir_trailer(dir)) {
3289         de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb));
3290 
3291         ocfs2_init_dir_trailer(dir, new_bh, le16_to_cpu(de->rec_len));
3292 
3293         if (ocfs2_dir_indexed(dir)) {
3294             status = ocfs2_dx_dir_link_trailer(dir, handle,
3295                                dx_root_bh, new_bh);
3296             if (status) {
3297                 mlog_errno(status);
3298                 goto bail;
3299             }
3300         }
3301     } else {
3302         de->rec_len = cpu_to_le16(sb->s_blocksize);
3303     }
3304     ocfs2_update_inode_fsync_trans(handle, dir, 1);
3305     ocfs2_journal_dirty(handle, new_bh);
3306 
3307     dir_i_size += dir->i_sb->s_blocksize;
3308     i_size_write(dir, dir_i_size);
3309     dir->i_blocks = ocfs2_inode_sector_count(dir);
3310     status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
3311     if (status < 0) {
3312         mlog_errno(status);
3313         goto bail;
3314     }
3315 
3316 bail_bh:
         /*
          * The caller gets its own reference; the local one is dropped by
          * the brelse() at the end of the function.
          */
3317     *new_de_bh = new_bh;
3318     get_bh(*new_de_bh);
3319 bail:
3320     if (handle)
3321         ocfs2_commit_trans(osb, handle);
3322     if (drop_alloc_sem)
3323         up_write(&OCFS2_I(dir)->ip_alloc_sem);
3324 
3325     if (data_ac)
3326         ocfs2_free_alloc_context(data_ac);
3327     if (meta_ac)
3328         ocfs2_free_alloc_context(meta_ac);
3329 
3330     brelse(new_bh);
3331 
3332     return status;
3333 }
3334 
3335 static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
3336                    const char *name, int namelen,
3337                    struct buffer_head **ret_de_bh,
3338                    unsigned int *blocks_wanted)
3339 {
3340     int ret;
3341     struct super_block *sb = dir->i_sb;
3342     struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3343     struct ocfs2_dir_entry *de, *last_de = NULL;
3344     char *de_buf, *limit;
3345     unsigned long offset = 0;
3346     unsigned int rec_len, new_rec_len, free_space;
3347 
3348     /*
3349      * This calculates how many free bytes we'd have in block zero, should
3350      * this function force expansion to an extent tree.
3351      */
3352     if (ocfs2_new_dir_wants_trailer(dir))
3353         free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir);
3354     else
3355         free_space = dir->i_sb->s_blocksize - i_size_read(dir);
3356 
3357     de_buf = di->id2.i_data.id_data;
3358     limit = de_buf + i_size_read(dir);
3359     rec_len = OCFS2_DIR_REC_LEN(namelen);
3360 
3361     while (de_buf < limit) {
3362         de = (struct ocfs2_dir_entry *)de_buf;
3363 
3364         if (!ocfs2_check_dir_entry(dir, de, di_bh, offset)) {
3365             ret = -ENOENT;
3366             goto out;
3367         }
3368         if (ocfs2_match(namelen, name, de)) {
3369             ret = -EEXIST;
3370             goto out;
3371         }
3372         /*
3373          * No need to check for a trailing dirent record here as
3374          * they're not used for inline dirs.
3375          */
3376 
3377         if (ocfs2_dirent_would_fit(de, rec_len)) {
3378             /* Ok, we found a spot. Return this bh and let
3379              * the caller actually fill it in. */
3380             *ret_de_bh = di_bh;
3381             get_bh(*ret_de_bh);
3382             ret = 0;
3383             goto out;
3384         }
3385 
3386         last_de = de;
3387         de_buf += le16_to_cpu(de->rec_len);
3388         offset += le16_to_cpu(de->rec_len);
3389     }
3390 
3391     /*
3392      * We're going to require expansion of the directory - figure
3393      * out how many blocks we'll need so that a place for the
3394      * dirent can be found.
3395      */
3396     *blocks_wanted = 1;
3397     new_rec_len = le16_to_cpu(last_de->rec_len) + free_space;
3398     if (new_rec_len < (rec_len + OCFS2_DIR_REC_LEN(last_de->name_len)))
3399         *blocks_wanted = 2;
3400 
3401     ret = -ENOSPC;
3402 out:
3403     return ret;
3404 }
3405 
3406 static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
3407                    int namelen, struct buffer_head **ret_de_bh)
3408 {
3409     unsigned long offset;
3410     struct buffer_head *bh = NULL;
3411     unsigned short rec_len;
3412     struct ocfs2_dir_entry *de;
3413     struct super_block *sb = dir->i_sb;
3414     int status;
3415     int blocksize = dir->i_sb->s_blocksize;
3416 
3417     status = ocfs2_read_dir_block(dir, 0, &bh, 0);
3418     if (status)
3419         goto bail;
3420 
3421     rec_len = OCFS2_DIR_REC_LEN(namelen);
3422     offset = 0;
3423     de = (struct ocfs2_dir_entry *) bh->b_data;
3424     while (1) {
3425         if ((char *)de >= sb->s_blocksize + bh->b_data) {
3426             brelse(bh);
3427             bh = NULL;
3428 
3429             if (i_size_read(dir) <= offset) {
3430                 /*
3431                  * Caller will have to expand this
3432                  * directory.
3433                  */
3434                 status = -ENOSPC;
3435                 goto bail;
3436             }
3437             status = ocfs2_read_dir_block(dir,
3438                          offset >> sb->s_blocksize_bits,
3439                          &bh, 0);
3440             if (status)
3441                 goto bail;
3442 
3443             /* move to next block */
3444             de = (struct ocfs2_dir_entry *) bh->b_data;
3445         }
3446         if (!ocfs2_check_dir_entry(dir, de, bh, offset)) {
3447             status = -ENOENT;
3448             goto bail;
3449         }
3450         if (ocfs2_match(namelen, name, de)) {
3451             status = -EEXIST;
3452             goto bail;
3453         }
3454 
3455         if (ocfs2_skip_dir_trailer(dir, de, offset % blocksize,
3456                        blocksize))
3457             goto next;
3458 
3459         if (ocfs2_dirent_would_fit(de, rec_len)) {
3460             /* Ok, we found a spot. Return this bh and let
3461              * the caller actually fill it in. */
3462             *ret_de_bh = bh;
3463             get_bh(*ret_de_bh);
3464             status = 0;
3465             goto bail;
3466         }
3467 next:
3468         offset += le16_to_cpu(de->rec_len);
3469         de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len));
3470     }
3471 
3472 bail:
3473     brelse(bh);
3474     if (status)
3475         mlog_errno(status);
3476 
3477     return status;
3478 }
3479 
3480 static int dx_leaf_sort_cmp(const void *a, const void *b)
3481 {
3482     const struct ocfs2_dx_entry *entry1 = a;
3483     const struct ocfs2_dx_entry *entry2 = b;
3484     u32 major_hash1 = le32_to_cpu(entry1->dx_major_hash);
3485     u32 major_hash2 = le32_to_cpu(entry2->dx_major_hash);
3486     u32 minor_hash1 = le32_to_cpu(entry1->dx_minor_hash);
3487     u32 minor_hash2 = le32_to_cpu(entry2->dx_minor_hash);
3488 
3489     if (major_hash1 > major_hash2)
3490         return 1;
3491     if (major_hash1 < major_hash2)
3492         return -1;
3493 
3494     /*
3495      * It is not strictly necessary to sort by minor
3496      */
3497     if (minor_hash1 > minor_hash2)
3498         return 1;
3499     if (minor_hash1 < minor_hash2)
3500         return -1;
3501     return 0;
3502 }
3503 
3504 static void dx_leaf_sort_swap(void *a, void *b, int size)
3505 {
3506     struct ocfs2_dx_entry *entry1 = a;
3507     struct ocfs2_dx_entry *entry2 = b;
3508 
3509     BUG_ON(size != sizeof(*entry1));
3510 
3511     swap(*entry1, *entry2);
3512 }
3513 
3514 static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf)
3515 {
3516     struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
3517     int i, num = le16_to_cpu(dl_list->de_num_used);
3518 
3519     for (i = 0; i < (num - 1); i++) {
3520         if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) !=
3521             le32_to_cpu(dl_list->de_entries[i + 1].dx_major_hash))
3522             return 0;
3523     }
3524 
3525     return 1;
3526 }
3527 
3528 /*
3529  * Find the optimal value to split this leaf on. This expects the leaf
3530  * entries to be in sorted order.
3531  *
3532  * leaf_cpos is the cpos of the leaf we're splitting. insert_hash is
3533  * the hash we want to insert.
3534  *
3535  * This function is only concerned with the major hash - that which
3536  * determines which cluster an item belongs to.
3537  */
3538 static int ocfs2_dx_dir_find_leaf_split(struct ocfs2_dx_leaf *dx_leaf,
3539                     u32 leaf_cpos, u32 insert_hash,
3540                     u32 *split_hash)
3541 {
3542     struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
3543     int i, num_used = le16_to_cpu(dl_list->de_num_used);
3544     int allsame;
3545 
3546     /*
3547      * There's a couple rare, but nasty corner cases we have to
3548      * check for here. All of them involve a leaf where all value
3549      * have the same hash, which is what we look for first.
3550      *
3551      * Most of the time, all of the above is false, and we simply
3552      * pick the median value for a split.
3553      */
3554     allsame = ocfs2_dx_leaf_same_major(dx_leaf);
3555     if (allsame) {
3556         u32 val = le32_to_cpu(dl_list->de_entries[0].dx_major_hash);
3557 
3558         if (val == insert_hash) {
3559             /*
3560              * No matter where we would choose to split,
3561              * the new entry would want to occupy the same
3562              * block as these. Since there's no space left
3563              * in their existing block, we know there
3564              * won't be space after the split.
3565              */
3566             return -ENOSPC;
3567         }
3568 
3569         if (val == leaf_cpos) {
3570             /*
3571              * Because val is the same as leaf_cpos (which
3572              * is the smallest value this leaf can have),
3573              * yet is not equal to insert_hash, then we
3574              * know that insert_hash *must* be larger than
3575              * val (and leaf_cpos). At least cpos+1 in value.
3576              *
3577              * We also know then, that there cannot be an
3578              * adjacent extent (otherwise we'd be looking
3579              * at it). Choosing this value gives us a
3580              * chance to get some contiguousness.
3581              */
3582             *split_hash = leaf_cpos + 1;
3583             return 0;
3584         }
3585 
3586         if (val > insert_hash) {
3587             /*
3588              * val can not be the same as insert hash, and
3589              * also must be larger than leaf_cpos. Also,
3590              * we know that there can't be a leaf between
3591              * cpos and val, otherwise the entries with
3592              * hash 'val' would be there.
3593              */
3594             *split_hash = val;
3595             return 0;
3596         }
3597 
3598         *split_hash = insert_hash;
3599         return 0;
3600     }
3601 
3602     /*
3603      * Since the records are sorted and the checks above
3604      * guaranteed that not all records in this block are the same,
3605      * we simple travel forward, from the median, and pick the 1st
3606      * record whose value is larger than leaf_cpos.
3607      */
3608     for (i = (num_used / 2); i < num_used; i++)
3609         if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) >
3610             leaf_cpos)
3611             break;
3612 
3613     BUG_ON(i == num_used); /* Should be impossible */
3614     *split_hash = le32_to_cpu(dl_list->de_entries[i].dx_major_hash);
3615     return 0;
3616 }
3617 
3618 /*
3619  * Transfer all entries in orig_dx_leaves whose major hash is equal to or
3620  * larger than split_hash into new_dx_leaves. We use a temporary
3621  * buffer (tmp_dx_leaf) to make the changes to the original leaf blocks.
3622  *
3623  * Since the block offset inside a leaf (cluster) is a constant mask
3624  * of minor_hash, we can optimize - an item at block offset X within
3625  * the original cluster, will be at offset X within the new cluster.
3626  */
3627 static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash,
3628                        handle_t *handle,
3629                        struct ocfs2_dx_leaf *tmp_dx_leaf,
3630                        struct buffer_head **orig_dx_leaves,
3631                        struct buffer_head **new_dx_leaves,
3632                        int num_dx_leaves)
3633 {
3634     int i, j, num_used;
3635     u32 major_hash;
3636     struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf;
3637     struct ocfs2_dx_entry_list *orig_list, *tmp_list;
3638     struct ocfs2_dx_entry *dx_entry;
3639 
3640     tmp_list = &tmp_dx_leaf->dl_list;
3641 
3642     for (i = 0; i < num_dx_leaves; i++) {
3643         orig_dx_leaf = (struct ocfs2_dx_leaf *) orig_dx_leaves[i]->b_data;
3644         orig_list = &orig_dx_leaf->dl_list;
3645         new_dx_leaf = (struct ocfs2_dx_leaf *) new_dx_leaves[i]->b_data;
3646 
3647         num_used = le16_to_cpu(orig_list->de_num_used);
3648 
3649         memcpy(tmp_dx_leaf, orig_dx_leaf, dir->i_sb->s_blocksize);
3650         tmp_list->de_num_used = cpu_to_le16(0);
3651         memset(&tmp_list->de_entries, 0, sizeof(*dx_entry)*num_used);
3652 
3653         for (j = 0; j < num_used; j++) {
3654             dx_entry = &orig_list->de_entries[j];
3655             major_hash = le32_to_cpu(dx_entry->dx_major_hash);
3656             if (major_hash >= split_hash)
3657                 ocfs2_dx_dir_leaf_insert_tail(new_dx_leaf,
3658                                   dx_entry);
3659             else
3660                 ocfs2_dx_dir_leaf_insert_tail(tmp_dx_leaf,
3661                                   dx_entry);
3662         }
3663         memcpy(orig_dx_leaf, tmp_dx_leaf, dir->i_sb->s_blocksize);
3664 
3665         ocfs2_journal_dirty(handle, orig_dx_leaves[i]);
3666         ocfs2_journal_dirty(handle, new_dx_leaves[i]);
3667     }
3668 }
3669 
3670 static int ocfs2_dx_dir_rebalance_credits(struct ocfs2_super *osb,
3671                       struct ocfs2_dx_root_block *dx_root)
3672 {
3673     int credits = ocfs2_clusters_to_blocks(osb->sb, 3);
3674 
3675     credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list);
3676     credits += ocfs2_quota_trans_credits(osb->sb);
3677     return credits;
3678 }
3679 
3680 /*
3681  * Find the median value in dx_leaf_bh and allocate a new leaf to move
3682  * half our entries into.
3683  */
3684 static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3685                   struct buffer_head *dx_root_bh,
3686                   struct buffer_head *dx_leaf_bh,
3687                   struct ocfs2_dx_hinfo *hinfo, u32 leaf_cpos,
3688                   u64 leaf_blkno)
3689 {
3690     struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
3691     int credits, ret, i, num_used, did_quota = 0;
3692     u32 cpos, split_hash, insert_hash = hinfo->major_hash;
3693     u64 orig_leaves_start;
3694     int num_dx_leaves;
3695     struct buffer_head **orig_dx_leaves = NULL;
3696     struct buffer_head **new_dx_leaves = NULL;
3697     struct ocfs2_alloc_context *data_ac = NULL, *meta_ac = NULL;
3698     struct ocfs2_extent_tree et;
3699     handle_t *handle = NULL;
3700     struct ocfs2_dx_root_block *dx_root;
3701     struct ocfs2_dx_leaf *tmp_dx_leaf = NULL;
3702 
3703     trace_ocfs2_dx_dir_rebalance((unsigned long long)OCFS2_I(dir)->ip_blkno,
3704                      (unsigned long long)leaf_blkno,
3705                      insert_hash);
3706 
3707     ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh);
3708 
3709     dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
3710     /*
3711      * XXX: This is a rather large limit. We should use a more
3712      * realistic value.
3713      */
3714     if (le32_to_cpu(dx_root->dr_clusters) == UINT_MAX)
3715         return -ENOSPC;
3716 
3717     num_used = le16_to_cpu(dx_leaf->dl_list.de_num_used);
3718     if (num_used < le16_to_cpu(dx_leaf->dl_list.de_count)) {
3719         mlog(ML_ERROR, "DX Dir: %llu, Asked to rebalance empty leaf: "
3720              "%llu, %d\n", (unsigned long long)OCFS2_I(dir)->ip_blkno,
3721              (unsigned long long)leaf_blkno, num_used);
3722         ret = -EIO;
3723         goto out;
3724     }
3725 
3726     orig_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves);
3727     if (!orig_dx_leaves) {
3728         ret = -ENOMEM;
3729         mlog_errno(ret);
3730         goto out;
3731     }
3732 
3733     new_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, NULL);
3734     if (!new_dx_leaves) {
3735         ret = -ENOMEM;
3736         mlog_errno(ret);
3737         goto out;
3738     }
3739 
3740     ret = ocfs2_lock_allocators(dir, &et, 1, 0, &data_ac, &meta_ac);
3741     if (ret) {
3742         if (ret != -ENOSPC)
3743             mlog_errno(ret);
3744         goto out;
3745     }
3746 
3747     credits = ocfs2_dx_dir_rebalance_credits(osb, dx_root);
3748     handle = ocfs2_start_trans(osb, credits);
3749     if (IS_ERR(handle)) {
3750         ret = PTR_ERR(handle);
3751         handle = NULL;
3752         mlog_errno(ret);
3753         goto out;
3754     }
3755 
3756     ret = dquot_alloc_space_nodirty(dir,
3757                        ocfs2_clusters_to_bytes(dir->i_sb, 1));
3758     if (ret)
3759         goto out_commit;
3760     did_quota = 1;
3761 
3762     ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh,
3763                       OCFS2_JOURNAL_ACCESS_WRITE);
3764     if (ret) {
3765         mlog_errno(ret);
3766         goto out_commit;
3767     }
3768 
3769     /*
3770      * This block is changing anyway, so we can sort it in place.
3771      */
3772     sort(dx_leaf->dl_list.de_entries, num_used,
3773          sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp,
3774          dx_leaf_sort_swap);
3775 
3776     ocfs2_journal_dirty(handle, dx_leaf_bh);
3777 
3778     ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash,
3779                        &split_hash);
3780     if (ret) {
3781         mlog_errno(ret);
3782         goto  out_commit;
3783     }
3784 
3785     trace_ocfs2_dx_dir_rebalance_split(leaf_cpos, split_hash, insert_hash);
3786 
3787     /*
3788      * We have to carefully order operations here. There are items
3789      * which want to be in the new cluster before insert, but in
3790      * order to put those items in the new cluster, we alter the
3791      * old cluster. A failure to insert gets nasty.
3792      *
3793      * So, start by reserving writes to the old
3794      * cluster. ocfs2_dx_dir_new_cluster will reserve writes on
3795      * the new cluster for us, before inserting it. The insert
3796      * won't happen if there's an error before that. Once the
3797      * insert is done then, we can transfer from one leaf into the
3798      * other without fear of hitting any error.
3799      */
3800 
3801     /*
3802      * The leaf transfer wants some scratch space so that we don't
3803      * wind up doing a bunch of expensive memmove().
3804      */
3805     tmp_dx_leaf = kmalloc(osb->sb->s_blocksize, GFP_NOFS);
3806     if (!tmp_dx_leaf) {
3807         ret = -ENOMEM;
3808         mlog_errno(ret);
3809         goto out_commit;
3810     }
3811 
3812     orig_leaves_start = ocfs2_block_to_cluster_start(dir->i_sb, leaf_blkno);
3813     ret = ocfs2_read_dx_leaves(dir, orig_leaves_start, num_dx_leaves,
3814                    orig_dx_leaves);
3815     if (ret) {
3816         mlog_errno(ret);
3817         goto out_commit;
3818     }
3819 
3820     cpos = split_hash;
3821     ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle,
3822                        data_ac, meta_ac, new_dx_leaves,
3823                        num_dx_leaves);
3824     if (ret) {
3825         mlog_errno(ret);
3826         goto out_commit;
3827     }
3828 
3829     for (i = 0; i < num_dx_leaves; i++) {
3830         ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
3831                           orig_dx_leaves[i],
3832                           OCFS2_JOURNAL_ACCESS_WRITE);
3833         if (ret) {
3834             mlog_errno(ret);
3835             goto out_commit;
3836         }
3837 
3838         ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
3839                           new_dx_leaves[i],
3840                           OCFS2_JOURNAL_ACCESS_WRITE);
3841         if (ret) {
3842             mlog_errno(ret);
3843             goto out_commit;
3844         }
3845     }
3846 
3847     ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf,
3848                    orig_dx_leaves, new_dx_leaves, num_dx_leaves);
3849 
3850 out_commit:
3851     if (ret < 0 && did_quota)
3852         dquot_free_space_nodirty(dir,
3853                 ocfs2_clusters_to_bytes(dir->i_sb, 1));
3854 
3855     ocfs2_update_inode_fsync_trans(handle, dir, 1);
3856     ocfs2_commit_trans(osb, handle);
3857 
3858 out:
3859     if (orig_dx_leaves || new_dx_leaves) {
3860         for (i = 0; i < num_dx_leaves; i++) {
3861             if (orig_dx_leaves)
3862                 brelse(orig_dx_leaves[i]);
3863             if (new_dx_leaves)
3864                 brelse(new_dx_leaves[i]);
3865         }
3866         kfree(orig_dx_leaves);
3867         kfree(new_dx_leaves);
3868     }
3869 
3870     if (meta_ac)
3871         ocfs2_free_alloc_context(meta_ac);
3872     if (data_ac)
3873         ocfs2_free_alloc_context(data_ac);
3874 
3875     kfree(tmp_dx_leaf);
3876     return ret;
3877 }
3878 
3879 static int ocfs2_find_dir_space_dx(struct ocfs2_super *osb, struct inode *dir,
3880                    struct buffer_head *di_bh,
3881                    struct buffer_head *dx_root_bh,
3882                    const char *name, int namelen,
3883                    struct ocfs2_dir_lookup_result *lookup)
3884 {
3885     int ret, rebalanced = 0;
3886     struct ocfs2_dx_root_block *dx_root;
3887     struct buffer_head *dx_leaf_bh = NULL;
3888     struct ocfs2_dx_leaf *dx_leaf;
3889     u64 blkno;
3890     u32 leaf_cpos;
3891 
3892     dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
3893 
3894 restart_search:
3895     ret = ocfs2_dx_dir_lookup(dir, &dx_root->dr_list, &lookup->dl_hinfo,
3896                   &leaf_cpos, &blkno);
3897     if (ret) {
3898         mlog_errno(ret);
3899         goto out;
3900     }
3901 
3902     ret = ocfs2_read_dx_leaf(dir, blkno, &dx_leaf_bh);
3903     if (ret) {
3904         mlog_errno(ret);
3905         goto out;
3906     }
3907 
3908     dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
3909 
3910     if (le16_to_cpu(dx_leaf->dl_list.de_num_used) >=
3911         le16_to_cpu(dx_leaf->dl_list.de_count)) {
3912         if (rebalanced) {
3913             /*
3914              * Rebalancing should have provided us with
3915              * space in an appropriate leaf.
3916              *
3917              * XXX: Is this an abnormal condition then?
3918              * Should we print a message here?
3919              */
3920             ret = -ENOSPC;
3921             goto out;
3922         }
3923 
3924         ret = ocfs2_dx_dir_rebalance(osb, dir, dx_root_bh, dx_leaf_bh,
3925                          &lookup->dl_hinfo, leaf_cpos,
3926                          blkno);
3927         if (ret) {
3928             if (ret != -ENOSPC)
3929                 mlog_errno(ret);
3930             goto out;
3931         }
3932 
3933         /*
3934          * Restart the lookup. The rebalance might have
3935          * changed which block our item fits into. Mark our
3936          * progress, so we only execute this once.
3937          */
3938         brelse(dx_leaf_bh);
3939         dx_leaf_bh = NULL;
3940         rebalanced = 1;
3941         goto restart_search;
3942     }
3943 
3944     lookup->dl_dx_leaf_bh = dx_leaf_bh;
3945     dx_leaf_bh = NULL;
3946 
3947 out:
3948     brelse(dx_leaf_bh);
3949     return ret;
3950 }
3951 
3952 static int ocfs2_search_dx_free_list(struct inode *dir,
3953                      struct buffer_head *dx_root_bh,
3954                      int namelen,
3955                      struct ocfs2_dir_lookup_result *lookup)
3956 {
3957     int ret = -ENOSPC;
3958     struct buffer_head *leaf_bh = NULL, *prev_leaf_bh = NULL;
3959     struct ocfs2_dir_block_trailer *db;
3960     u64 next_block;
3961     int rec_len = OCFS2_DIR_REC_LEN(namelen);
3962     struct ocfs2_dx_root_block *dx_root;
3963 
3964     dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
3965     next_block = le64_to_cpu(dx_root->dr_free_blk);
3966 
3967     while (next_block) {
3968         brelse(prev_leaf_bh);
3969         prev_leaf_bh = leaf_bh;
3970         leaf_bh = NULL;
3971 
3972         ret = ocfs2_read_dir_block_direct(dir, next_block, &leaf_bh);
3973         if (ret) {
3974             mlog_errno(ret);
3975             goto out;
3976         }
3977 
3978         db = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb);
3979         if (rec_len <= le16_to_cpu(db->db_free_rec_len)) {
3980             lookup->dl_leaf_bh = leaf_bh;
3981             lookup->dl_prev_leaf_bh = prev_leaf_bh;
3982             leaf_bh = NULL;
3983             prev_leaf_bh = NULL;
3984             break;
3985         }
3986 
3987         next_block = le64_to_cpu(db->db_free_next);
3988     }
3989 
3990     if (!next_block)
3991         ret = -ENOSPC;
3992 
3993 out:
3994 
3995     brelse(leaf_bh);
3996     brelse(prev_leaf_bh);
3997     return ret;
3998 }
3999 
4000 static int ocfs2_expand_inline_dx_root(struct inode *dir,
4001                        struct buffer_head *dx_root_bh)
4002 {
4003     int ret, num_dx_leaves, i, j, did_quota = 0;
4004     struct buffer_head **dx_leaves = NULL;
4005     struct ocfs2_extent_tree et;
4006     u64 insert_blkno;
4007     struct ocfs2_alloc_context *data_ac = NULL;
4008     struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
4009     handle_t *handle = NULL;
4010     struct ocfs2_dx_root_block *dx_root;
4011     struct ocfs2_dx_entry_list *entry_list;
4012     struct ocfs2_dx_entry *dx_entry;
4013     struct ocfs2_dx_leaf *target_leaf;
4014 
4015     ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
4016     if (ret) {
4017         mlog_errno(ret);
4018         goto out;
4019     }
4020 
4021     dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves);
4022     if (!dx_leaves) {
4023         ret = -ENOMEM;
4024         mlog_errno(ret);
4025         goto out;
4026     }
4027 
4028     handle = ocfs2_start_trans(osb, ocfs2_calc_dxi_expand_credits(osb->sb));
4029     if (IS_ERR(handle)) {
4030         ret = PTR_ERR(handle);
4031         mlog_errno(ret);
4032         goto out;
4033     }
4034 
4035     ret = dquot_alloc_space_nodirty(dir,
4036                        ocfs2_clusters_to_bytes(osb->sb, 1));
4037     if (ret)
4038         goto out_commit;
4039     did_quota = 1;
4040 
4041     /*
4042      * We do this up front, before the allocation, so that a
4043      * failure to add the dx_root_bh to the journal won't result
4044      * us losing clusters.
4045      */
4046     ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
4047                       OCFS2_JOURNAL_ACCESS_WRITE);
4048     if (ret) {
4049         mlog_errno(ret);
4050         goto out_commit;
4051     }
4052 
4053     ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac, dx_leaves,
4054                      num_dx_leaves, &insert_blkno);
4055     if (ret) {
4056         mlog_errno(ret);
4057         goto out_commit;
4058     }
4059 
4060     /*
4061      * Transfer the entries from our dx_root into the appropriate
4062      * block
4063      */
4064     dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
4065     entry_list = &dx_root->dr_entries;
4066 
4067     for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) {
4068         dx_entry = &entry_list->de_entries[i];
4069 
4070         j = __ocfs2_dx_dir_hash_idx(osb,
4071                         le32_to_cpu(dx_entry->dx_minor_hash));
4072         target_leaf = (struct ocfs2_dx_leaf *)dx_leaves[j]->b_data;
4073 
4074         ocfs2_dx_dir_leaf_insert_tail(target_leaf, dx_entry);
4075 
4076         /* Each leaf has been passed to the journal already
4077          * via __ocfs2_dx_dir_new_cluster() */
4078     }
4079 
4080     dx_root->dr_flags &= ~OCFS2_DX_FLAG_INLINE;
4081     memset(&dx_root->dr_list, 0, osb->sb->s_blocksize -
4082            offsetof(struct ocfs2_dx_root_block, dr_list));
4083     dx_root->dr_list.l_count =
4084         cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
4085 
4086     /* This should never fail considering we start with an empty
4087      * dx_root. */
4088     ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh);
4089     ret = ocfs2_insert_extent(handle, &et, 0, insert_blkno, 1, 0, NULL);
4090     if (ret)
4091         mlog_errno(ret);
4092     did_quota = 0;
4093 
4094     ocfs2_update_inode_fsync_trans(handle, dir, 1);
4095     ocfs2_journal_dirty(handle, dx_root_bh);
4096 
4097 out_commit:
4098     if (ret < 0 && did_quota)
4099         dquot_free_space_nodirty(dir,
4100                       ocfs2_clusters_to_bytes(dir->i_sb, 1));
4101 
4102     ocfs2_commit_trans(osb, handle);
4103 
4104 out:
4105     if (data_ac)
4106         ocfs2_free_alloc_context(data_ac);
4107 
4108     if (dx_leaves) {
4109         for (i = 0; i < num_dx_leaves; i++)
4110             brelse(dx_leaves[i]);
4111         kfree(dx_leaves);
4112     }
4113     return ret;
4114 }
4115 
4116 static int ocfs2_inline_dx_has_space(struct buffer_head *dx_root_bh)
4117 {
4118     struct ocfs2_dx_root_block *dx_root;
4119     struct ocfs2_dx_entry_list *entry_list;
4120 
4121     dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
4122     entry_list = &dx_root->dr_entries;
4123 
4124     if (le16_to_cpu(entry_list->de_num_used) >=
4125         le16_to_cpu(entry_list->de_count))
4126         return -ENOSPC;
4127 
4128     return 0;
4129 }
4130 
4131 static int ocfs2_prepare_dx_dir_for_insert(struct inode *dir,
4132                        struct buffer_head *di_bh,
4133                        const char *name,
4134                        int namelen,
4135                        struct ocfs2_dir_lookup_result *lookup)
4136 {
4137     int ret, free_dx_root = 1;
4138     struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
4139     struct buffer_head *dx_root_bh = NULL;
4140     struct buffer_head *leaf_bh = NULL;
4141     struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4142     struct ocfs2_dx_root_block *dx_root;
4143 
4144     ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
4145     if (ret) {
4146         mlog_errno(ret);
4147         goto out;
4148     }
4149 
4150     dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
4151     if (le32_to_cpu(dx_root->dr_num_entries) == OCFS2_DX_ENTRIES_MAX) {
4152         ret = -ENOSPC;
4153         mlog_errno(ret);
4154         goto out;
4155     }
4156 
4157     if (ocfs2_dx_root_inline(dx_root)) {
4158         ret = ocfs2_inline_dx_has_space(dx_root_bh);
4159 
4160         if (ret == 0)
4161             goto search_el;
4162 
4163         /*
4164          * We ran out of room in the root block. Expand it to
4165          * an extent, then allow ocfs2_find_dir_space_dx to do
4166          * the rest.
4167          */
4168         ret = ocfs2_expand_inline_dx_root(dir, dx_root_bh);
4169         if (ret) {
4170             mlog_errno(ret);
4171             goto out;
4172         }
4173     }
4174 
4175     /*
4176      * Insert preparation for an indexed directory is split into two
4177      * steps. The call to find_dir_space_dx reserves room in the index for
4178      * an additional item. If we run out of space there, it's a real error
4179      * we can't continue on.
4180      */
4181     ret = ocfs2_find_dir_space_dx(osb, dir, di_bh, dx_root_bh, name,
4182                       namelen, lookup);
4183     if (ret) {
4184         mlog_errno(ret);
4185         goto out;
4186     }
4187 
4188 search_el:
4189     /*
4190      * Next, we need to find space in the unindexed tree. This call
4191      * searches using the free space linked list. If the unindexed tree
4192      * lacks sufficient space, we'll expand it below. The expansion code
4193      * is smart enough to add any new blocks to the free space list.
4194      */
4195     ret = ocfs2_search_dx_free_list(dir, dx_root_bh, namelen, lookup);
4196     if (ret && ret != -ENOSPC) {
4197         mlog_errno(ret);
4198         goto out;
4199     }
4200 
4201     /* Do this up here - ocfs2_extend_dir might need the dx_root */
4202     lookup->dl_dx_root_bh = dx_root_bh;
4203     free_dx_root = 0;
4204 
4205     if (ret == -ENOSPC) {
4206         ret = ocfs2_extend_dir(osb, dir, di_bh, 1, lookup, &leaf_bh);
4207 
4208         if (ret) {
4209             mlog_errno(ret);
4210             goto out;
4211         }
4212 
4213         /*
4214          * We make the assumption here that new leaf blocks are added
4215          * to the front of our free list.
4216          */
4217         lookup->dl_prev_leaf_bh = NULL;
4218         lookup->dl_leaf_bh = leaf_bh;
4219     }
4220 
4221 out:
4222     if (free_dx_root)
4223         brelse(dx_root_bh);
4224     return ret;
4225 }
4226 
4227 /*
4228  * Get a directory ready for insert. Any directory allocation required
4229  * happens here. Success returns zero, and enough context in the dir
4230  * lookup result that ocfs2_add_entry() will be able complete the task
4231  * with minimal performance impact.
4232  */
4233 int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
4234                  struct inode *dir,
4235                  struct buffer_head *parent_fe_bh,
4236                  const char *name,
4237                  int namelen,
4238                  struct ocfs2_dir_lookup_result *lookup)
4239 {
4240     int ret;
4241     unsigned int blocks_wanted = 1;
4242     struct buffer_head *bh = NULL;
4243 
4244     trace_ocfs2_prepare_dir_for_insert(
4245         (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen);
4246 
4247     if (!namelen) {
4248         ret = -EINVAL;
4249         mlog_errno(ret);
4250         goto out;
4251     }
4252 
4253     /*
4254      * Do this up front to reduce confusion.
4255      *
4256      * The directory might start inline, then be turned into an
4257      * indexed one, in which case we'd need to hash deep inside
4258      * ocfs2_find_dir_space_id(). Since
4259      * ocfs2_prepare_dx_dir_for_insert() also needs this hash
4260      * done, there seems no point in spreading out the calls. We
4261      * can optimize away the case where the file system doesn't
4262      * support indexing.
4263      */
4264     if (ocfs2_supports_indexed_dirs(osb))
4265         ocfs2_dx_dir_name_hash(dir, name, namelen, &lookup->dl_hinfo);
4266 
4267     if (ocfs2_dir_indexed(dir)) {
4268         ret = ocfs2_prepare_dx_dir_for_insert(dir, parent_fe_bh,
4269                               name, namelen, lookup);
4270         if (ret)
4271             mlog_errno(ret);
4272         goto out;
4273     }
4274 
4275     if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
4276         ret = ocfs2_find_dir_space_id(dir, parent_fe_bh, name,
4277                           namelen, &bh, &blocks_wanted);
4278     } else
4279         ret = ocfs2_find_dir_space_el(dir, name, namelen, &bh);
4280 
4281     if (ret && ret != -ENOSPC) {
4282         mlog_errno(ret);
4283         goto out;
4284     }
4285 
4286     if (ret == -ENOSPC) {
4287         /*
4288          * We have to expand the directory to add this name.
4289          */
4290         BUG_ON(bh);
4291 
4292         ret = ocfs2_extend_dir(osb, dir, parent_fe_bh, blocks_wanted,
4293                        lookup, &bh);
4294         if (ret) {
4295             if (ret != -ENOSPC)
4296                 mlog_errno(ret);
4297             goto out;
4298         }
4299 
4300         BUG_ON(!bh);
4301     }
4302 
4303     lookup->dl_leaf_bh = bh;
4304     bh = NULL;
4305 out:
4306     brelse(bh);
4307     return ret;
4308 }
4309 
4310 static int ocfs2_dx_dir_remove_index(struct inode *dir,
4311                      struct buffer_head *di_bh,
4312                      struct buffer_head *dx_root_bh)
4313 {
4314     int ret;
4315     struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
4316     struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4317     struct ocfs2_dx_root_block *dx_root;
4318     struct inode *dx_alloc_inode = NULL;
4319     struct buffer_head *dx_alloc_bh = NULL;
4320     handle_t *handle;
4321     u64 blk;
4322     u16 bit;
4323     u64 bg_blkno;
4324 
4325     dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
4326 
4327     dx_alloc_inode = ocfs2_get_system_file_inode(osb,
4328                     EXTENT_ALLOC_SYSTEM_INODE,
4329                     le16_to_cpu(dx_root->dr_suballoc_slot));
4330     if (!dx_alloc_inode) {
4331         ret = -ENOMEM;
4332         mlog_errno(ret);
4333         goto out;
4334     }
4335     inode_lock(dx_alloc_inode);
4336 
4337     ret = ocfs2_inode_lock(dx_alloc_inode, &dx_alloc_bh, 1);
4338     if (ret) {
4339         mlog_errno(ret);
4340         goto out_mutex;
4341     }
4342 
4343     handle = ocfs2_start_trans(osb, OCFS2_DX_ROOT_REMOVE_CREDITS);
4344     if (IS_ERR(handle)) {
4345         ret = PTR_ERR(handle);
4346         mlog_errno(ret);
4347         goto out_unlock;
4348     }
4349 
4350     ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
4351                       OCFS2_JOURNAL_ACCESS_WRITE);
4352     if (ret) {
4353         mlog_errno(ret);
4354         goto out_commit;
4355     }
4356 
4357     spin_lock(&OCFS2_I(dir)->ip_lock);
4358     OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL;
4359     di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
4360     spin_unlock(&OCFS2_I(dir)->ip_lock);
4361     di->i_dx_root = cpu_to_le64(0ULL);
4362     ocfs2_update_inode_fsync_trans(handle, dir, 1);
4363 
4364     ocfs2_journal_dirty(handle, di_bh);
4365 
4366     blk = le64_to_cpu(dx_root->dr_blkno);
4367     bit = le16_to_cpu(dx_root->dr_suballoc_bit);
4368     if (dx_root->dr_suballoc_loc)
4369         bg_blkno = le64_to_cpu(dx_root->dr_suballoc_loc);
4370     else
4371         bg_blkno = ocfs2_which_suballoc_group(blk, bit);
4372     ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh,
4373                        bit, bg_blkno, 1);
4374     if (ret)
4375         mlog_errno(ret);
4376 
4377 out_commit:
4378     ocfs2_commit_trans(osb, handle);
4379 
4380 out_unlock:
4381     ocfs2_inode_unlock(dx_alloc_inode, 1);
4382 
4383 out_mutex:
4384     inode_unlock(dx_alloc_inode);
4385     brelse(dx_alloc_bh);
4386 out:
4387     iput(dx_alloc_inode);
4388     return ret;
4389 }
4390 
4391 int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
4392 {
4393     int ret;
4394     unsigned int clen;
4395     u32 major_hash = UINT_MAX, p_cpos, cpos;
4396     u64 blkno;
4397     struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
4398     struct buffer_head *dx_root_bh = NULL;
4399     struct ocfs2_dx_root_block *dx_root;
4400     struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4401     struct ocfs2_cached_dealloc_ctxt dealloc;
4402     struct ocfs2_extent_tree et;
4403 
4404     ocfs2_init_dealloc_ctxt(&dealloc);
4405 
4406     if (!ocfs2_dir_indexed(dir))
4407         return 0;
4408 
4409     ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
4410     if (ret) {
4411         mlog_errno(ret);
4412         goto out;
4413     }
4414     dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
4415 
4416     if (ocfs2_dx_root_inline(dx_root))
4417         goto remove_index;
4418 
4419     ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh);
4420 
4421     /* XXX: What if dr_clusters is too large? */
4422     while (le32_to_cpu(dx_root->dr_clusters)) {
4423         ret = ocfs2_dx_dir_lookup_rec(dir, &dx_root->dr_list,
4424                           major_hash, &cpos, &blkno, &clen);
4425         if (ret) {
4426             mlog_errno(ret);
4427             goto out;
4428         }
4429 
4430         p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno);
4431 
4432         ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, 0,
4433                            &dealloc, 0, false);
4434         if (ret) {
4435             mlog_errno(ret);
4436             goto out;
4437         }
4438 
4439         if (cpos == 0)
4440             break;
4441 
4442         major_hash = cpos - 1;
4443     }
4444 
4445 remove_index:
4446     ret = ocfs2_dx_dir_remove_index(dir, di_bh, dx_root_bh);
4447     if (ret) {
4448         mlog_errno(ret);
4449         goto out;
4450     }
4451 
4452     ocfs2_remove_from_cache(INODE_CACHE(dir), dx_root_bh);
4453 out:
4454     ocfs2_schedule_truncate_log_flush(osb, 1);
4455     ocfs2_run_deallocs(osb, &dealloc);
4456 
4457     brelse(dx_root_bh);
4458     return ret;
4459 }