// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
 * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
 */

#include <linux/spinlock.h>
#include <linux/completion.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/gfs2_ondisk.h>
#include <linux/crc32.h>
#include <linux/iomap.h>
#include <linux/ktime.h>

#include "gfs2.h"
#include "incore.h"
#include "bmap.h"
#include "glock.h"
#include "inode.h"
#include "meta_io.h"
#include "quota.h"
#include "rgrp.h"
#include "log.h"
#include "super.h"
#include "trans.h"
#include "dir.h"
#include "util.h"
#include "aops.h"
#include "trace_gfs2.h"

/* This doesn't need to be very large: a 4k block holds at most 512
 * 64-bit pointers, so __u16 is plenty for an index. Keeping it small
 * saves stack space.
 */
struct metapath {
    struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT];
    __u16 mp_list[GFS2_MAX_META_HEIGHT];
    int mp_fheight; /* find_metapath height */
    int mp_aheight; /* actual height (lookup height) */
};

static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length);

/**
 * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
 * @ip: the inode
 * @dibh: the dinode buffer
 * @block: the block number that was allocated
 * @page: The locked page to copy the stuffed data into
 *
 * Returns: errno
 */

static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
                   u64 block, struct page *page)
{
    struct inode *inode = &ip->i_inode;

    if (!PageUptodate(page)) {
        void *kaddr = kmap(page);
        u64 dsize = i_size_read(inode);

        if (dsize > gfs2_max_stuffed_size(ip))
            dsize = gfs2_max_stuffed_size(ip);

        memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
        memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
        kunmap(page);

        SetPageUptodate(page);
    }

    if (gfs2_is_jdata(ip)) {
        struct buffer_head *bh;

        if (!page_has_buffers(page))
            create_empty_buffers(page, BIT(inode->i_blkbits),
                         BIT(BH_Uptodate));

        bh = page_buffers(page);
        if (!buffer_mapped(bh))
            map_bh(bh, inode->i_sb, block);

        set_buffer_uptodate(bh);
        gfs2_trans_add_data(ip->i_gl, bh);
    } else {
        set_page_dirty(page);
        gfs2_ordered_add_inode(ip);
    }

    return 0;
}
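
/*
 * Note the two journaling modes in the branches above: for jdata inodes
 * the new data buffer itself is added to the current transaction via
 * gfs2_trans_add_data(), while in ordered mode the page is only dirtied
 * and the inode is placed on the ordered-write list, so its data gets
 * written out before the journal commits the metadata that refers to it.
 */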

static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct page *page)
{
    struct buffer_head *bh, *dibh;
    struct gfs2_dinode *di;
    u64 block = 0;
    int isdir = gfs2_is_dir(ip);
    int error;

    error = gfs2_meta_inode_buffer(ip, &dibh);
    if (error)
        return error;

    if (i_size_read(&ip->i_inode)) {
        /* Get a free block, fill it with the stuffed data,
           and write it out to disk */

        unsigned int n = 1;
        error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
        if (error)
            goto out_brelse;
        if (isdir) {
            gfs2_trans_remove_revoke(GFS2_SB(&ip->i_inode), block, 1);
            error = gfs2_dir_get_new_buffer(ip, block, &bh);
            if (error)
                goto out_brelse;
            gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header),
                          dibh, sizeof(struct gfs2_dinode));
            brelse(bh);
        } else {
            error = gfs2_unstuffer_page(ip, dibh, block, page);
            if (error)
                goto out_brelse;
        }
    }

    /*  Set up the pointer to the new block  */

    gfs2_trans_add_meta(ip->i_gl, dibh);
    di = (struct gfs2_dinode *)dibh->b_data;
    gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));

    if (i_size_read(&ip->i_inode)) {
        *(__be64 *)(di + 1) = cpu_to_be64(block);
        gfs2_add_inode_blocks(&ip->i_inode, 1);
        di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
    }

    ip->i_height = 1;
    di->di_height = cpu_to_be16(1);

out_brelse:
    brelse(dibh);
    return error;
}

/**
 * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
 * @ip: The GFS2 inode to unstuff
 *
 * This routine unstuffs a dinode and returns it to a "normal" state such
 * that the height can be grown in the traditional way.
 *
 * Returns: errno
 */

int gfs2_unstuff_dinode(struct gfs2_inode *ip)
{
    struct inode *inode = &ip->i_inode;
    struct page *page;
    int error;

    down_write(&ip->i_rw_mutex);
    page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
    error = -ENOMEM;
    if (!page)
        goto out;
    error = __gfs2_unstuff_inode(ip, page);
    unlock_page(page);
    put_page(page);
out:
    up_write(&ip->i_rw_mutex);
    return error;
}

/**
 * find_metapath - Find path through the metadata tree
 * @sdp: The superblock
 * @block: The disk block to look up
 * @mp: The metapath to return the result in
 * @height: The pre-calculated height of the metadata tree
 *
 *   This routine returns a struct metapath that defines a path
 *   through the metadata of inode "ip" to get to block "block".
 *
 *   Example:
 *   Given:  "ip" is a height 3 file, "offset" is 101342453, and this is a
 *   filesystem with a blocksize of 4096.
 *
 *   find_metapath() would return a struct metapath set to:
 *   mp_fheight = 3, mp_list[0] = 0, mp_list[1] = 48, and mp_list[2] = 165.
 *
 *   That means that in order to get to the block containing the byte at
 *   offset 101342453, we would load the indirect block pointed to by pointer
 *   0 in the dinode.  We would then load the indirect block pointed to by
 *   pointer 48 in that indirect block.  We would then load the data block
 *   pointed to by pointer 165 in that indirect block.
 *
 *   (The example uses a round 512 pointers per indirect block; the real
 *   counts, sd_diptrs and sd_inptrs, are slightly smaller because each
 *   block begins with a header.)
 *
 *             ----------------------------------------
 *             | Dinode |                             |
 *             |        |                            4|
 *             |        |0 1 2 3 4 5                 9|
 *             |        |                            6|
 *             ----------------------------------------
 *                       |
 *                       |
 *                       V
 *             ----------------------------------------
 *             | Indirect Block                       |
 *             |                                     5|
 *             |            4 4 4 4 4 5 5            1|
 *             |0           5 6 7 8 9 0 1            2|
 *             ----------------------------------------
 *                                |
 *                                |
 *                                V
 *             ----------------------------------------
 *             | Indirect Block                       |
 *             |                         1 1 1 1 1   5|
 *             |                         6 6 6 6 6   1|
 *             |0                        3 4 5 6 7   2|
 *             ----------------------------------------
 *                                           |
 *                                           |
 *                                           V
 *             ----------------------------------------
 *             | Data block containing offset         |
 *             |            101342453                 |
 *             |                                      |
 *             |                                      |
 *             ----------------------------------------
 *
 */

static void find_metapath(const struct gfs2_sbd *sdp, u64 block,
              struct metapath *mp, unsigned int height)
{
    unsigned int i;

    mp->mp_fheight = height;
    for (i = height; i--;)
        mp->mp_list[i] = do_div(block, sdp->sd_inptrs);
}
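
/*
 * Worked example (using the same 512-pointer simplification as the
 * comment above): byte offset 101342453 with 4k blocks is logical block
 * 101342453 >> 12 = 24741, and the loop peels off one index per height:
 *
 *   i = 2: mp_list[2] = 24741 % 512 = 165, block becomes 48
 *   i = 1: mp_list[1] =    48 % 512 =  48, block becomes 0
 *   i = 0: mp_list[0] =     0 % 512 =   0
 */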

static inline unsigned int metapath_branch_start(const struct metapath *mp)
{
    if (mp->mp_list[0] == 0)
        return 2;
    return 1;
}

/**
 * metaptr1 - Return the first possible metadata pointer in a metapath buffer
 * @height: The metadata height (0 = dinode)
 * @mp: The metapath
 */
static inline __be64 *metaptr1(unsigned int height, const struct metapath *mp)
{
    struct buffer_head *bh = mp->mp_bh[height];
    if (height == 0)
        return ((__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)));
    return ((__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header)));
}

/**
 * metapointer - Return pointer to start of metadata in a buffer
 * @height: The metadata height (0 = dinode)
 * @mp: The metapath
 *
 * Return a pointer to the block number of the next height of the metadata
 * tree given a buffer containing the pointer to the current height of the
 * metadata tree.
 */

static inline __be64 *metapointer(unsigned int height, const struct metapath *mp)
{
    __be64 *p = metaptr1(height, mp);
    return p + mp->mp_list[height];
}

static inline const __be64 *metaend(unsigned int height, const struct metapath *mp)
{
    const struct buffer_head *bh = mp->mp_bh[height];
    return (const __be64 *)(bh->b_data + bh->b_size);
}

static void clone_metapath(struct metapath *clone, struct metapath *mp)
{
    unsigned int hgt;

    *clone = *mp;
    for (hgt = 0; hgt < mp->mp_aheight; hgt++)
        get_bh(clone->mp_bh[hgt]);
}

static void gfs2_metapath_ra(struct gfs2_glock *gl, __be64 *start, __be64 *end)
{
    const __be64 *t;

    for (t = start; t < end; t++) {
        struct buffer_head *rabh;

        if (!*t)
            continue;

        rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE);
        if (trylock_buffer(rabh)) {
            if (!buffer_uptodate(rabh)) {
                rabh->b_end_io = end_buffer_read_sync;
                submit_bh(REQ_OP_READ | REQ_RAHEAD | REQ_META |
                      REQ_PRIO, rabh);
                continue;
            }
            unlock_buffer(rabh);
        }
        brelse(rabh);
    }
}

static int __fillup_metapath(struct gfs2_inode *ip, struct metapath *mp,
                 unsigned int x, unsigned int h)
{
    for (; x < h; x++) {
        __be64 *ptr = metapointer(x, mp);
        u64 dblock = be64_to_cpu(*ptr);
        int ret;

        if (!dblock)
            break;
        ret = gfs2_meta_buffer(ip, GFS2_METATYPE_IN, dblock, &mp->mp_bh[x + 1]);
        if (ret)
            return ret;
    }
    mp->mp_aheight = x + 1;
    return 0;
}

/**
 * lookup_metapath - Walk the metadata tree to a specific point
 * @ip: The inode
 * @mp: The metapath
 *
 * Assumes that the inode's buffer has already been looked up and
 * hooked onto mp->mp_bh[0] and that the metapath has been initialised
 * by find_metapath().
 *
 * If this function encounters part of the tree which has not been
 * allocated, it returns the current height of the tree at the point
 * at which it found the unallocated block. Blocks which are found are
 * added to the mp->mp_bh[] list.
 *
 * Returns: error
 */

static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
{
    return __fillup_metapath(ip, mp, 0, ip->i_height - 1);
}

/**
 * fillup_metapath - fill up buffers for the metadata path to a specific height
 * @ip: The inode
 * @mp: The metapath
 * @h: The height to which it should be mapped
 *
 * Similar to lookup_metapath, but does lookups for a range of heights
 *
 * Returns: error or the number of buffers filled
 */

static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h)
{
    unsigned int x = 0;
    int ret;

    if (h) {
        /* find the first buffer we need to look up. */
        for (x = h - 1; x > 0; x--) {
            if (mp->mp_bh[x])
                break;
        }
    }
    ret = __fillup_metapath(ip, mp, x, h);
    if (ret)
        return ret;
    return mp->mp_aheight - x - 1;
}

static sector_t metapath_to_block(struct gfs2_sbd *sdp, struct metapath *mp)
{
    sector_t factor = 1, block = 0;
    int hgt;

    for (hgt = mp->mp_fheight - 1; hgt >= 0; hgt--) {
        if (hgt < mp->mp_aheight)
            block += mp->mp_list[hgt] * factor;
        factor *= sdp->sd_inptrs;
    }
    return block;
}
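
/*
 * metapath_to_block() is the inverse of find_metapath(): for the heights
 * that have actually been walked (hgt < mp_aheight) it reassembles
 *
 *   block = sum over hgt of mp_list[hgt] * sd_inptrs^(mp_fheight - 1 - hgt)
 *
 * e.g. the {0, 48, 165} path from the find_metapath() example maps back
 * to logical block 48 * 512 + 165 = 24741 under the same 512-pointer
 * simplification.
 */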

static void release_metapath(struct metapath *mp)
{
    int i;

    for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) {
        if (mp->mp_bh[i] == NULL)
            break;
        brelse(mp->mp_bh[i]);
        mp->mp_bh[i] = NULL;
    }
}

/**
 * gfs2_extent_length - Returns length of an extent of blocks
 * @bh: The metadata block
 * @ptr: Current position in @bh
 * @limit: Max extent length to return (note: unused by the current code)
 * @eob: Set to 1 if we hit "end of block"
 *
 * Returns: The length of the extent (minimum of one block)
 */

static inline unsigned int gfs2_extent_length(struct buffer_head *bh, __be64 *ptr, size_t limit, int *eob)
{
    const __be64 *end = (__be64 *)(bh->b_data + bh->b_size);
    const __be64 *first = ptr;
    u64 d = be64_to_cpu(*ptr);

    *eob = 0;
    do {
        ptr++;
        if (ptr >= end)
            break;
        d++;
    } while(be64_to_cpu(*ptr) == d);
    if (ptr >= end)
        *eob = 1;
    return ptr - first;
}
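
/*
 * Example: if the pointers at @ptr encode disk blocks 1000, 1001, 1002,
 * 500, ..., the loop above stops at the first non-consecutive block and
 * returns 3, leaving *eob clear because the run ended before the end of
 * the buffer.
 */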

enum walker_status { WALK_STOP, WALK_FOLLOW, WALK_CONTINUE };

/*
 * gfs2_metadata_walker - walk an indirect block
 * @mp: Metapath to indirect block
 * @ptrs: Number of pointers to look at
 *
 * When returning WALK_FOLLOW, the walker must update @mp to point at the right
 * indirect block to follow.
 */
typedef enum walker_status (*gfs2_metadata_walker)(struct metapath *mp,
                           unsigned int ptrs);

/*
 * gfs2_walk_metadata - walk a tree of indirect blocks
 * @inode: The inode
 * @mp: Starting point of walk
 * @max_len: Maximum number of blocks to walk
 * @walker: Called during the walk
 *
 * Returns 1 if the walk was stopped by @walker, 0 if we went past @max_len or
 * past the end of metadata, and a negative error code otherwise.
 */

static int gfs2_walk_metadata(struct inode *inode, struct metapath *mp,
        u64 max_len, gfs2_metadata_walker walker)
{
    struct gfs2_inode *ip = GFS2_I(inode);
    struct gfs2_sbd *sdp = GFS2_SB(inode);
    u64 factor = 1;
    unsigned int hgt;
    int ret;

    /*
     * The walk starts in the lowest allocated indirect block, which may be
     * before the position indicated by @mp.  Adjust @max_len accordingly
     * to avoid a short walk.
     */
    for (hgt = mp->mp_fheight - 1; hgt >= mp->mp_aheight; hgt--) {
        max_len += mp->mp_list[hgt] * factor;
        mp->mp_list[hgt] = 0;
        factor *= sdp->sd_inptrs;
    }

    for (;;) {
        u16 start = mp->mp_list[hgt];
        enum walker_status status;
        unsigned int ptrs;
        u64 len;

        /* Walk indirect block. */
        ptrs = (hgt >= 1 ? sdp->sd_inptrs : sdp->sd_diptrs) - start;
        len = ptrs * factor;
        if (len > max_len)
            ptrs = DIV_ROUND_UP_ULL(max_len, factor);
        status = walker(mp, ptrs);
        switch (status) {
        case WALK_STOP:
            return 1;
        case WALK_FOLLOW:
            BUG_ON(mp->mp_aheight == mp->mp_fheight);
            ptrs = mp->mp_list[hgt] - start;
            len = ptrs * factor;
            break;
        case WALK_CONTINUE:
            break;
        }
        if (len >= max_len)
            break;
        max_len -= len;
        if (status == WALK_FOLLOW)
            goto fill_up_metapath;

lower_metapath:
        /* Decrease height of metapath. */
        brelse(mp->mp_bh[hgt]);
        mp->mp_bh[hgt] = NULL;
        mp->mp_list[hgt] = 0;
        if (!hgt)
            break;
        hgt--;
        factor *= sdp->sd_inptrs;

        /* Advance in metadata tree. */
        (mp->mp_list[hgt])++;
        if (hgt) {
            if (mp->mp_list[hgt] >= sdp->sd_inptrs)
                goto lower_metapath;
        } else {
            if (mp->mp_list[hgt] >= sdp->sd_diptrs)
                break;
        }

fill_up_metapath:
        /* Increase height of metapath. */
        ret = fillup_metapath(ip, mp, ip->i_height - 1);
        if (ret < 0)
            return ret;
        hgt += ret;
        for (; ret; ret--)
            do_div(factor, sdp->sd_inptrs);
        mp->mp_aheight = hgt + 1;
    }
    return 0;
}

static enum walker_status gfs2_hole_walker(struct metapath *mp,
                       unsigned int ptrs)
{
    const __be64 *start, *ptr, *end;
    unsigned int hgt;

    hgt = mp->mp_aheight - 1;
    start = metapointer(hgt, mp);
    end = start + ptrs;

    for (ptr = start; ptr < end; ptr++) {
        if (*ptr) {
            mp->mp_list[hgt] += ptr - start;
            if (mp->mp_aheight == mp->mp_fheight)
                return WALK_STOP;
            return WALK_FOLLOW;
        }
    }
    return WALK_CONTINUE;
}

/**
 * gfs2_hole_size - figure out the size of a hole
 * @inode: The inode
 * @lblock: The logical starting block number
 * @len: How far to look (in blocks)
 * @mp: The metapath at lblock
 * @iomap: The iomap to store the hole size in
 *
 * This function modifies @mp.
 *
 * Returns: errno on error
 */
static int gfs2_hole_size(struct inode *inode, sector_t lblock, u64 len,
              struct metapath *mp, struct iomap *iomap)
{
    struct metapath clone;
    u64 hole_size;
    int ret;

    clone_metapath(&clone, mp);
    ret = gfs2_walk_metadata(inode, &clone, len, gfs2_hole_walker);
    if (ret < 0)
        goto out;

    if (ret == 1)
        hole_size = metapath_to_block(GFS2_SB(inode), &clone) - lblock;
    else
        hole_size = len;
    iomap->length = hole_size << inode->i_blkbits;
    ret = 0;

out:
    release_metapath(&clone);
    return ret;
}

static inline void gfs2_indirect_init(struct metapath *mp,
                      struct gfs2_glock *gl, unsigned int i,
                      unsigned offset, u64 bn)
{
    __be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data +
               ((i > 1) ? sizeof(struct gfs2_meta_header) :
                 sizeof(struct gfs2_dinode)));
    BUG_ON(i < 1);
    BUG_ON(mp->mp_bh[i] != NULL);
    mp->mp_bh[i] = gfs2_meta_new(gl, bn);
    gfs2_trans_add_meta(gl, mp->mp_bh[i]);
    gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
    gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header));
    ptr += offset;
    *ptr = cpu_to_be64(bn);
}

enum alloc_state {
    ALLOC_DATA = 0,
    ALLOC_GROW_DEPTH = 1,
    ALLOC_GROW_HEIGHT = 2,
    /* ALLOC_UNSTUFF = 3,   TBD and rather complicated */
};
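
/*
 * The allocation state machine below only ever moves in one direction:
 * ALLOC_GROW_HEIGHT -> ALLOC_GROW_DEPTH -> ALLOC_DATA (see the
 * fallthrough cases in __gfs2_iomap_alloc), and the allocation loop keeps
 * requesting extents until the data blocks have been mapped.
 */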

/**
 * __gfs2_iomap_alloc - Build a metadata tree of the requested height
 * @inode: The GFS2 inode
 * @iomap: The iomap structure
 * @mp: The metapath, with proper height information calculated
 *
 * In this routine we may have to alloc:
 *   i) Indirect blocks to grow the metadata tree height
 *  ii) Indirect blocks to fill in lower part of the metadata tree
 * iii) Data blocks
 *
 * This function is called after __gfs2_iomap_get, which works out the
 * total number of blocks which we need via gfs2_alloc_size.
 *
 * We then do the actual allocation, asking for an extent at a time (if
 * enough contiguous free blocks are available, there will only be one
 * allocation request per call) and use the state machine to initialise
 * the blocks in order.
 *
 * Right now, this function will allocate at most one indirect block
 * worth of data -- with a default block size of 4K, that's slightly
 * less than 2M.  If this limitation is ever removed to allow huge
 * allocations, we would probably still want to limit the iomap size we
 * return to avoid stalling other tasks during huge writes; the next
 * iomap iteration would then find the blocks already allocated.
 *
 * Returns: errno on error
 */

static int __gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
                  struct metapath *mp)
{
    struct gfs2_inode *ip = GFS2_I(inode);
    struct gfs2_sbd *sdp = GFS2_SB(inode);
    struct buffer_head *dibh = mp->mp_bh[0];
    u64 bn;
    unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
    size_t dblks = iomap->length >> inode->i_blkbits;
    const unsigned end_of_metadata = mp->mp_fheight - 1;
    int ret;
    enum alloc_state state;
    __be64 *ptr;
    __be64 zero_bn = 0;

    BUG_ON(mp->mp_aheight < 1);
    BUG_ON(dibh == NULL);
    BUG_ON(dblks < 1);

    gfs2_trans_add_meta(ip->i_gl, dibh);

    down_write(&ip->i_rw_mutex);

    if (mp->mp_fheight == mp->mp_aheight) {
        /* Bottom indirect block exists */
        state = ALLOC_DATA;
    } else {
        /* Need to allocate indirect blocks */
        if (mp->mp_fheight == ip->i_height) {
            /* Writing into existing tree, extend tree down */
            iblks = mp->mp_fheight - mp->mp_aheight;
            state = ALLOC_GROW_DEPTH;
        } else {
            /* Building up tree height */
            state = ALLOC_GROW_HEIGHT;
            iblks = mp->mp_fheight - ip->i_height;
            branch_start = metapath_branch_start(mp);
            iblks += (mp->mp_fheight - branch_start);
        }
    }

    /* start of the second part of the function (state machine) */

    blks = dblks + iblks;
    i = mp->mp_aheight;
    do {
        n = blks - alloced;
        ret = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
        if (ret)
            goto out;
        alloced += n;
        if (state != ALLOC_DATA || gfs2_is_jdata(ip))
            gfs2_trans_remove_revoke(sdp, bn, n);
        switch (state) {
        /* Growing height of tree */
        case ALLOC_GROW_HEIGHT:
            if (i == 1) {
                ptr = (__be64 *)(dibh->b_data +
                         sizeof(struct gfs2_dinode));
                zero_bn = *ptr;
            }
            for (; i - 1 < mp->mp_fheight - ip->i_height && n > 0;
                 i++, n--)
                gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++);
            if (i - 1 == mp->mp_fheight - ip->i_height) {
                i--;
                gfs2_buffer_copy_tail(mp->mp_bh[i],
                        sizeof(struct gfs2_meta_header),
                        dibh, sizeof(struct gfs2_dinode));
                gfs2_buffer_clear_tail(dibh,
                        sizeof(struct gfs2_dinode) +
                        sizeof(__be64));
                ptr = (__be64 *)(mp->mp_bh[i]->b_data +
                    sizeof(struct gfs2_meta_header));
                *ptr = zero_bn;
                state = ALLOC_GROW_DEPTH;
                for(i = branch_start; i < mp->mp_fheight; i++) {
                    if (mp->mp_bh[i] == NULL)
                        break;
                    brelse(mp->mp_bh[i]);
                    mp->mp_bh[i] = NULL;
                }
                i = branch_start;
            }
            if (n == 0)
                break;
            fallthrough;    /* To branching from existing tree */
        case ALLOC_GROW_DEPTH:
            if (i > 1 && i < mp->mp_fheight)
                gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]);
            for (; i < mp->mp_fheight && n > 0; i++, n--)
                gfs2_indirect_init(mp, ip->i_gl, i,
                           mp->mp_list[i-1], bn++);
            if (i == mp->mp_fheight)
                state = ALLOC_DATA;
            if (n == 0)
                break;
            fallthrough;    /* To tree complete, adding data blocks */
        case ALLOC_DATA:
            BUG_ON(n > dblks);
            BUG_ON(mp->mp_bh[end_of_metadata] == NULL);
            gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]);
            dblks = n;
            ptr = metapointer(end_of_metadata, mp);
            iomap->addr = bn << inode->i_blkbits;
            iomap->flags |= IOMAP_F_MERGED | IOMAP_F_NEW;
            while (n-- > 0)
                *ptr++ = cpu_to_be64(bn++);
            break;
        }
    } while (iomap->addr == IOMAP_NULL_ADDR);

    iomap->type = IOMAP_MAPPED;
    iomap->length = (u64)dblks << inode->i_blkbits;
    ip->i_height = mp->mp_fheight;
    gfs2_add_inode_blocks(&ip->i_inode, alloced);
    gfs2_dinode_out(ip, dibh->b_data);
out:
    up_write(&ip->i_rw_mutex);
    return ret;
}

#define IOMAP_F_GFS2_BOUNDARY IOMAP_F_PRIVATE

/**
 * gfs2_alloc_size - Compute the maximum allocation size
 * @inode: The inode
 * @mp: The metapath
 * @size: Requested size in blocks
 *
 * Compute the maximum size of the next allocation at @mp.
 *
 * Returns: size in blocks
 */
static u64 gfs2_alloc_size(struct inode *inode, struct metapath *mp, u64 size)
{
    struct gfs2_inode *ip = GFS2_I(inode);
    struct gfs2_sbd *sdp = GFS2_SB(inode);
    const __be64 *first, *ptr, *end;

    /*
     * For writes to stuffed files, this function is called twice via
     * __gfs2_iomap_get, before and after unstuffing. The size we return the
     * first time needs to be large enough to get the reservation and
     * allocation sizes right.  The size we return the second time must
     * be exact or else __gfs2_iomap_alloc won't do the right thing.
     */

    if (gfs2_is_stuffed(ip) || mp->mp_fheight != mp->mp_aheight) {
        unsigned int maxsize = mp->mp_fheight > 1 ?
            sdp->sd_inptrs : sdp->sd_diptrs;
        maxsize -= mp->mp_list[mp->mp_fheight - 1];
        if (size > maxsize)
            size = maxsize;
        return size;
    }

    first = metapointer(ip->i_height - 1, mp);
    end = metaend(ip->i_height - 1, mp);
    if (end - first > size)
        end = first + size;
    for (ptr = first; ptr < end; ptr++) {
        if (*ptr)
            break;
    }
    return ptr - first;
}
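
/*
 * Example for the fully-allocated-metapath case: if the pointers in the
 * bottom indirect block at @mp are 0, 0, 0, X starting at the current
 * position, the loop above counts the three leading null pointers and the
 * next allocation is sized to at most three blocks.
 */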

/**
 * __gfs2_iomap_get - Map blocks from an inode to disk blocks
 * @inode: The inode
 * @pos: Starting position in bytes
 * @length: Length to map, in bytes
 * @flags: iomap flags
 * @iomap: The iomap structure
 * @mp: The metapath
 *
 * Returns: errno
 */
static int __gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
                unsigned flags, struct iomap *iomap,
                struct metapath *mp)
{
    struct gfs2_inode *ip = GFS2_I(inode);
    struct gfs2_sbd *sdp = GFS2_SB(inode);
    loff_t size = i_size_read(inode);
    __be64 *ptr;
    sector_t lblock;
    sector_t lblock_stop;
    int ret;
    int eob;
    u64 len;
    struct buffer_head *dibh = NULL, *bh;
    u8 height;

    if (!length)
        return -EINVAL;

    down_read(&ip->i_rw_mutex);

    ret = gfs2_meta_inode_buffer(ip, &dibh);
    if (ret)
        goto unlock;
    mp->mp_bh[0] = dibh;

    if (gfs2_is_stuffed(ip)) {
        if (flags & IOMAP_WRITE) {
            loff_t max_size = gfs2_max_stuffed_size(ip);

            if (pos + length > max_size)
                goto unstuff;
            iomap->length = max_size;
        } else {
            if (pos >= size) {
                if (flags & IOMAP_REPORT) {
                    ret = -ENOENT;
                    goto unlock;
                } else {
                    iomap->offset = pos;
                    iomap->length = length;
                    goto hole_found;
                }
            }
            iomap->length = size;
        }
        iomap->addr = (ip->i_no_addr << inode->i_blkbits) +
                  sizeof(struct gfs2_dinode);
        iomap->type = IOMAP_INLINE;
        iomap->inline_data = dibh->b_data + sizeof(struct gfs2_dinode);
        goto out;
    }

unstuff:
    lblock = pos >> inode->i_blkbits;
    iomap->offset = lblock << inode->i_blkbits;
    lblock_stop = (pos + length - 1) >> inode->i_blkbits;
    len = lblock_stop - lblock + 1;
    iomap->length = len << inode->i_blkbits;

    height = ip->i_height;
    while ((lblock + 1) * sdp->sd_sb.sb_bsize > sdp->sd_heightsize[height])
        height++;
    find_metapath(sdp, lblock, mp, height);
    if (height > ip->i_height || gfs2_is_stuffed(ip))
        goto do_alloc;

    ret = lookup_metapath(ip, mp);
    if (ret)
        goto unlock;

    if (mp->mp_aheight != ip->i_height)
        goto do_alloc;

    ptr = metapointer(ip->i_height - 1, mp);
    if (*ptr == 0)
        goto do_alloc;

    bh = mp->mp_bh[ip->i_height - 1];
    len = gfs2_extent_length(bh, ptr, len, &eob);

    iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits;
    iomap->length = len << inode->i_blkbits;
    iomap->type = IOMAP_MAPPED;
    iomap->flags |= IOMAP_F_MERGED;
    if (eob)
        iomap->flags |= IOMAP_F_GFS2_BOUNDARY;

out:
    iomap->bdev = inode->i_sb->s_bdev;
unlock:
    up_read(&ip->i_rw_mutex);
    return ret;

do_alloc:
    if (flags & IOMAP_REPORT) {
        if (pos >= size)
            ret = -ENOENT;
        else if (height == ip->i_height)
            ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
        else
            iomap->length = size - iomap->offset;
    } else if (flags & IOMAP_WRITE) {
        u64 alloc_size;

        if (flags & IOMAP_DIRECT)
            goto out;  /* (see gfs2_file_direct_write) */

        len = gfs2_alloc_size(inode, mp, len);
        alloc_size = len << inode->i_blkbits;
        if (alloc_size < iomap->length)
            iomap->length = alloc_size;
    } else {
        if (pos < size && height == ip->i_height)
            ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
    }
hole_found:
    iomap->addr = IOMAP_NULL_ADDR;
    iomap->type = IOMAP_HOLE;
    goto out;
}

static int gfs2_iomap_page_prepare(struct inode *inode, loff_t pos,
                   unsigned len)
{
    unsigned int blockmask = i_blocksize(inode) - 1;
    struct gfs2_sbd *sdp = GFS2_SB(inode);
    unsigned int blocks;

    blocks = ((pos & blockmask) + len + blockmask) >> inode->i_blkbits;
    return gfs2_trans_begin(sdp, RES_DINODE + blocks, 0);
}

static void gfs2_iomap_page_done(struct inode *inode, loff_t pos,
                 unsigned copied, struct page *page)
{
    struct gfs2_trans *tr = current->journal_info;
    struct gfs2_inode *ip = GFS2_I(inode);
    struct gfs2_sbd *sdp = GFS2_SB(inode);

    if (page && !gfs2_is_stuffed(ip))
        gfs2_page_add_databufs(ip, page, offset_in_page(pos), copied);

    if (tr->tr_num_buf_new)
        __mark_inode_dirty(inode, I_DIRTY_DATASYNC);

    gfs2_trans_end(sdp);
}

static const struct iomap_page_ops gfs2_iomap_page_ops = {
    .page_prepare = gfs2_iomap_page_prepare,
    .page_done = gfs2_iomap_page_done,
};
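
/*
 * These hooks bracket each page of a buffered write in its own
 * transaction: page_prepare reserves journal space (RES_DINODE plus one
 * block per filesystem block the write touches) before the page is
 * modified, and page_done journals the data buffers for jdata inodes,
 * marks the inode dirty if metadata changed, and ends the transaction.
 */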

static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
                  loff_t length, unsigned flags,
                  struct iomap *iomap,
                  struct metapath *mp)
{
    struct gfs2_inode *ip = GFS2_I(inode);
    struct gfs2_sbd *sdp = GFS2_SB(inode);
    bool unstuff;
    int ret;

    unstuff = gfs2_is_stuffed(ip) &&
          pos + length > gfs2_max_stuffed_size(ip);

    if (unstuff || iomap->type == IOMAP_HOLE) {
        unsigned int data_blocks, ind_blocks;
        struct gfs2_alloc_parms ap = {};
        unsigned int rblocks;
        struct gfs2_trans *tr;

        gfs2_write_calc_reserv(ip, iomap->length, &data_blocks,
                       &ind_blocks);
        ap.target = data_blocks + ind_blocks;
        ret = gfs2_quota_lock_check(ip, &ap);
        if (ret)
            return ret;

        ret = gfs2_inplace_reserve(ip, &ap);
        if (ret)
            goto out_qunlock;

        rblocks = RES_DINODE + ind_blocks;
        if (gfs2_is_jdata(ip))
            rblocks += data_blocks;
        if (ind_blocks || data_blocks)
            rblocks += RES_STATFS + RES_QUOTA;
        if (inode == sdp->sd_rindex)
            rblocks += 2 * RES_STATFS;
        rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks);

        ret = gfs2_trans_begin(sdp, rblocks,
                       iomap->length >> inode->i_blkbits);
        if (ret)
            goto out_trans_fail;

        if (unstuff) {
            ret = gfs2_unstuff_dinode(ip);
            if (ret)
                goto out_trans_end;
            release_metapath(mp);
            ret = __gfs2_iomap_get(inode, iomap->offset,
                           iomap->length, flags, iomap, mp);
            if (ret)
                goto out_trans_end;
        }

        if (iomap->type == IOMAP_HOLE) {
            ret = __gfs2_iomap_alloc(inode, iomap, mp);
            if (ret) {
                gfs2_trans_end(sdp);
                gfs2_inplace_release(ip);
                punch_hole(ip, iomap->offset, iomap->length);
                goto out_qunlock;
            }
        }

        tr = current->journal_info;
        if (tr->tr_num_buf_new)
            __mark_inode_dirty(inode, I_DIRTY_DATASYNC);

        gfs2_trans_end(sdp);
    }

    if (gfs2_is_stuffed(ip) || gfs2_is_jdata(ip))
        iomap->page_ops = &gfs2_iomap_page_ops;
    return 0;

out_trans_end:
    gfs2_trans_end(sdp);
out_trans_fail:
    gfs2_inplace_release(ip);
out_qunlock:
    gfs2_quota_unlock(ip);
    return ret;
}

static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
                unsigned flags, struct iomap *iomap,
                struct iomap *srcmap)
{
    struct gfs2_inode *ip = GFS2_I(inode);
    struct metapath mp = { .mp_aheight = 1, };
    int ret;

    if (gfs2_is_jdata(ip))
        iomap->flags |= IOMAP_F_BUFFER_HEAD;

    trace_gfs2_iomap_start(ip, pos, length, flags);
    ret = __gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
    if (ret)
        goto out_unlock;

    switch(flags & (IOMAP_WRITE | IOMAP_ZERO)) {
    case IOMAP_WRITE:
        if (flags & IOMAP_DIRECT) {
            /*
             * Silently fall back to buffered I/O for stuffed files
             * or if we've got a hole (see gfs2_file_direct_write).
             */
            if (iomap->type != IOMAP_MAPPED)
                ret = -ENOTBLK;
            goto out_unlock;
        }
        break;
    case IOMAP_ZERO:
        if (iomap->type == IOMAP_HOLE)
            goto out_unlock;
        break;
    default:
        goto out_unlock;
    }

    ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap, &mp);

out_unlock:
    release_metapath(&mp);
    trace_gfs2_iomap_end(ip, iomap, ret);
    return ret;
}

static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
              ssize_t written, unsigned flags, struct iomap *iomap)
{
    struct gfs2_inode *ip = GFS2_I(inode);
    struct gfs2_sbd *sdp = GFS2_SB(inode);

    switch (flags & (IOMAP_WRITE | IOMAP_ZERO)) {
    case IOMAP_WRITE:
        if (flags & IOMAP_DIRECT)
            return 0;
        break;
    case IOMAP_ZERO:
        if (iomap->type == IOMAP_HOLE)
            return 0;
        break;
    default:
        return 0;
    }

    if (!gfs2_is_stuffed(ip))
        gfs2_ordered_add_inode(ip);

    if (inode == sdp->sd_rindex)
        adjust_fs_space(inode);

    gfs2_inplace_release(ip);

    if (ip->i_qadata && ip->i_qadata->qa_qd_num)
        gfs2_quota_unlock(ip);

    if (length != written && (iomap->flags & IOMAP_F_NEW)) {
        /* Deallocate blocks that were just allocated. */
        loff_t hstart = round_up(pos + written, i_blocksize(inode));
        loff_t hend = iomap->offset + iomap->length;

        if (hstart < hend) {
            truncate_pagecache_range(inode, hstart, hend - 1);
            punch_hole(ip, hstart, hend - hstart);
        }
    }

    if (unlikely(!written))
        return 0;

    if (iomap->flags & IOMAP_F_SIZE_CHANGED)
        mark_inode_dirty(inode);
    set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
    return 0;
}

const struct iomap_ops gfs2_iomap_ops = {
    .iomap_begin = gfs2_iomap_begin,
    .iomap_end = gfs2_iomap_end,
};

/**
 * gfs2_block_map - Map one or more blocks of an inode to a disk block
 * @inode: The inode
 * @lblock: The logical block number
 * @bh_map: The bh to be mapped
 * @create: True if it's ok to alloc blocks to satisfy the request
 *
 * The size of the requested mapping is defined in bh_map->b_size.
 *
 * Clears buffer_mapped(bh_map) and leaves bh_map->b_size unchanged
 * when @lblock is not mapped.  Sets buffer_mapped(bh_map) and
 * bh_map->b_size to indicate the size of the mapping when @lblock and
 * successive blocks are mapped, up to the requested size.
 *
 * Sets buffer_boundary() if a read of metadata will be required
 * before the next block can be mapped. Sets buffer_new() if new
 * blocks were allocated.
 *
 * Returns: errno
 */

int gfs2_block_map(struct inode *inode, sector_t lblock,
           struct buffer_head *bh_map, int create)
{
    struct gfs2_inode *ip = GFS2_I(inode);
    loff_t pos = (loff_t)lblock << inode->i_blkbits;
    loff_t length = bh_map->b_size;
    struct iomap iomap = { };
    int ret;

    clear_buffer_mapped(bh_map);
    clear_buffer_new(bh_map);
    clear_buffer_boundary(bh_map);
    trace_gfs2_bmap(ip, bh_map, lblock, create, 1);

    if (!create)
        ret = gfs2_iomap_get(inode, pos, length, &iomap);
    else
        ret = gfs2_iomap_alloc(inode, pos, length, &iomap);
    if (ret)
        goto out;

    if (iomap.length > bh_map->b_size) {
        iomap.length = bh_map->b_size;
        iomap.flags &= ~IOMAP_F_GFS2_BOUNDARY;
    }
    if (iomap.addr != IOMAP_NULL_ADDR)
        map_bh(bh_map, inode->i_sb, iomap.addr >> inode->i_blkbits);
    bh_map->b_size = iomap.length;
    if (iomap.flags & IOMAP_F_GFS2_BOUNDARY)
        set_buffer_boundary(bh_map);
    if (iomap.flags & IOMAP_F_NEW)
        set_buffer_new(bh_map);

out:
    trace_gfs2_bmap(ip, bh_map, lblock, create, ret);
    return ret;
}
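
/*
 * Illustration (hypothetical caller, not from this file): mapping a
 * single block through the classic buffer_head interface without
 * allocating:
 *
 *	struct buffer_head bh = { .b_size = i_blocksize(inode) };
 *	int err = gfs2_block_map(inode, lblock, &bh, 0);
 *
 * On success with buffer_mapped(&bh) set, bh.b_blocknr holds the
 * physical block number and bh.b_size the length of the mapping.
 */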

int gfs2_get_extent(struct inode *inode, u64 lblock, u64 *dblock,
            unsigned int *extlen)
{
    unsigned int blkbits = inode->i_blkbits;
    struct iomap iomap = { };
    unsigned int len;
    int ret;

    ret = gfs2_iomap_get(inode, lblock << blkbits, *extlen << blkbits,
                 &iomap);
    if (ret)
        return ret;
    if (iomap.type != IOMAP_MAPPED)
        return -EIO;
    *dblock = iomap.addr >> blkbits;
    len = iomap.length >> blkbits;
    if (len < *extlen)
        *extlen = len;
    return 0;
}

int gfs2_alloc_extent(struct inode *inode, u64 lblock, u64 *dblock,
              unsigned int *extlen, bool *new)
{
    unsigned int blkbits = inode->i_blkbits;
    struct iomap iomap = { };
    unsigned int len;
    int ret;

    ret = gfs2_iomap_alloc(inode, lblock << blkbits, *extlen << blkbits,
                   &iomap);
    if (ret)
        return ret;
    if (iomap.type != IOMAP_MAPPED)
        return -EIO;
    *dblock = iomap.addr >> blkbits;
    len = iomap.length >> blkbits;
    if (len < *extlen)
        *extlen = len;
    *new = iomap.flags & IOMAP_F_NEW;
    return 0;
}

/*
 * NOTE: Never call gfs2_block_zero_range with an open transaction because it
 * uses iomap write to perform its actions, which begin their own transactions
 * (iomap_begin, page_prepare, etc.)
 */
static int gfs2_block_zero_range(struct inode *inode, loff_t from,
                 unsigned int length)
{
    BUG_ON(current->journal_info);
    return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops);
}
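
/*
 * trunc_start() below follows this rule: it zeroes any partial block
 * first, and only then opens its own truncate transaction (the BUG_ON
 * above enforces the ordering).
 */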

#define GFS2_JTRUNC_REVOKES 8192

/**
 * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files
 * @inode: The inode being truncated
 * @oldsize: The original (larger) size
 * @newsize: The new smaller size
 *
 * With jdata files, we have to journal a revoke for each block which is
 * truncated. As a result, we need to split this into separate transactions
 * if the number of pages being truncated gets too large.
 */

static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize)
{
    struct gfs2_sbd *sdp = GFS2_SB(inode);
    u64 max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
    u64 chunk;
    int error;

    while (oldsize != newsize) {
        struct gfs2_trans *tr;
        unsigned int offs;

        chunk = oldsize - newsize;
        if (chunk > max_chunk)
            chunk = max_chunk;

        offs = oldsize & ~PAGE_MASK;
        if (offs && chunk > PAGE_SIZE)
            chunk = offs + ((chunk - offs) & PAGE_MASK);

        truncate_pagecache(inode, oldsize - chunk);
        oldsize -= chunk;

        tr = current->journal_info;
        if (!test_bit(TR_TOUCHED, &tr->tr_flags))
            continue;

        gfs2_trans_end(sdp);
        error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
        if (error)
            return error;
    }

    return 0;
}
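
/*
 * Chunk-size arithmetic: with a 4k block size, max_chunk is
 * 8192 * 4096 bytes = 32 MiB, so truncating a jdata file by, say,
 * 100 MiB proceeds in about four chunks of at most 32 MiB of page
 * cache each, with a fresh transaction per chunk.
 */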

static int trunc_start(struct inode *inode, u64 newsize)
{
    struct gfs2_inode *ip = GFS2_I(inode);
    struct gfs2_sbd *sdp = GFS2_SB(inode);
    struct buffer_head *dibh = NULL;
    int journaled = gfs2_is_jdata(ip);
    u64 oldsize = inode->i_size;
    int error;

    if (!gfs2_is_stuffed(ip)) {
        unsigned int blocksize = i_blocksize(inode);
        unsigned int offs = newsize & (blocksize - 1);
        if (offs) {
            error = gfs2_block_zero_range(inode, newsize,
                              blocksize - offs);
            if (error)
                return error;
        }
    }
    if (journaled)
        error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES);
    else
        error = gfs2_trans_begin(sdp, RES_DINODE, 0);
    if (error)
        return error;

    error = gfs2_meta_inode_buffer(ip, &dibh);
    if (error)
        goto out;

    gfs2_trans_add_meta(ip->i_gl, dibh);

    if (gfs2_is_stuffed(ip))
        gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
    else
        ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;

    i_size_write(inode, newsize);
    ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
    gfs2_dinode_out(ip, dibh->b_data);

    if (journaled)
        error = gfs2_journaled_truncate(inode, oldsize, newsize);
    else
        truncate_pagecache(inode, newsize);

out:
    brelse(dibh);
    if (current->journal_info)
        gfs2_trans_end(sdp);
    return error;
}

int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
           struct iomap *iomap)
{
    struct metapath mp = { .mp_aheight = 1, };
    int ret;

    ret = __gfs2_iomap_get(inode, pos, length, 0, iomap, &mp);
    release_metapath(&mp);
    return ret;
}

int gfs2_iomap_alloc(struct inode *inode, loff_t pos, loff_t length,
             struct iomap *iomap)
{
    struct metapath mp = { .mp_aheight = 1, };
    int ret;

    ret = __gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, iomap, &mp);
    if (!ret && iomap->type == IOMAP_HOLE)
        ret = __gfs2_iomap_alloc(inode, iomap, &mp);
    release_metapath(&mp);
    return ret;
}

/**
 * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein
 * @ip: inode
 * @rd_gh: holder of resource group glock
 * @bh: buffer head to sweep
 * @start: starting point in bh
 * @end: end point in bh
 * @meta: true if bh points to metadata (rather than data)
 * @btotal: place to keep count of total blocks freed
 *
 * We sweep a metadata buffer (provided by the metapath) for blocks we need to
 * free, and free them all. However, we do it one rgrp at a time. If this
 * block has references to multiple rgrps, we break it into individual
 * transactions. This allows other processes to use the rgrps while we're
 * focused on a single one, for better concurrency / performance.
 * At every transaction boundary, we rewrite the inode into the journal.
 * That way the bitmaps are kept consistent with the inode and we can recover
 * if we're interrupted by power outages.
 *
 * Returns: 0, or return code if an error occurred.
 *          *btotal has the total number of blocks freed
 */
static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh,
                  struct buffer_head *bh, __be64 *start, __be64 *end,
                  bool meta, u32 *btotal)
{
    struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
    struct gfs2_rgrpd *rgd;
    struct gfs2_trans *tr;
    __be64 *p;
    int blks_outside_rgrp;
    u64 bn, bstart, isize_blks;
    s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */
    int ret = 0;
    bool buf_in_tr = false; /* buffer was added to transaction */

more_rgrps:
    rgd = NULL;
    if (gfs2_holder_initialized(rd_gh)) {
        rgd = gfs2_glock2rgrp(rd_gh->gh_gl);
        gfs2_assert_withdraw(sdp,
                 gfs2_glock_is_locked_by_me(rd_gh->gh_gl));
    }
    blks_outside_rgrp = 0;
    bstart = 0;
    blen = 0;

    for (p = start; p < end; p++) {
        if (!*p)
            continue;
        bn = be64_to_cpu(*p);

        if (rgd) {
            if (!rgrp_contains_block(rgd, bn)) {
                blks_outside_rgrp++;
                continue;
            }
        } else {
            rgd = gfs2_blk2rgrpd(sdp, bn, true);
            if (unlikely(!rgd)) {
                ret = -EIO;
                goto out;
            }
            ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
                         LM_FLAG_NODE_SCOPE, rd_gh);
            if (ret)
                goto out;

            /* Must be done with the rgrp glock held: */
            if (gfs2_rs_active(&ip->i_res) &&
                rgd == ip->i_res.rs_rgd)
                gfs2_rs_deltree(&ip->i_res);
        }

        /* The size of our transactions will be unknown until we
           actually process all the metadata blocks that relate to
           the rgrp. So we estimate. We know it can't be more than
           the dinode's i_blocks and we don't want to exceed the
           journal flush threshold, sd_log_thresh2. */
        if (current->journal_info == NULL) {
            unsigned int jblocks_rqsted, revokes;

            jblocks_rqsted = rgd->rd_length + RES_DINODE +
                RES_INDIRECT;
            isize_blks = gfs2_get_inode_blocks(&ip->i_inode);
            if (isize_blks > atomic_read(&sdp->sd_log_thresh2))
                jblocks_rqsted +=
                    atomic_read(&sdp->sd_log_thresh2);
            else
                jblocks_rqsted += isize_blks;
            revokes = jblocks_rqsted;
            if (meta)
                revokes += end - start;
            else if (ip->i_depth)
                revokes += sdp->sd_inptrs;
            ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
            if (ret)
                goto out_unlock;
            down_write(&ip->i_rw_mutex);
        }
        /* check if we will exceed the transaction blocks requested */
        tr = current->journal_info;
        if (tr->tr_num_buf_new + RES_STATFS +
            RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
            /* We set blks_outside_rgrp to ensure the loop will
               be repeated for the same rgrp, but with a new
               transaction. */
            blks_outside_rgrp++;
            /* This next part is tricky. If the buffer was added
               to the transaction, we've already set some block
               pointers to 0, so we better follow through and free
               them, or we will introduce corruption (so break).
               This may be impossible, or at least rare, but I
               decided to cover the case regardless.

               If the buffer was not added to the transaction
               (this call), doing so would exceed our transaction
               size, so we need to end the transaction and start a
               new one (so goto). */

            if (buf_in_tr)
                break;
            goto out_unlock;
        }

        gfs2_trans_add_meta(ip->i_gl, bh);
        buf_in_tr = true;
        *p = 0;
        if (bstart + blen == bn) {
            blen++;
            continue;
        }
        if (bstart) {
            __gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta);
            (*btotal) += blen;
            gfs2_add_inode_blocks(&ip->i_inode, -blen);
        }
        bstart = bn;
        blen = 1;
    }
    if (bstart) {
        __gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta);
        (*btotal) += blen;
        gfs2_add_inode_blocks(&ip->i_inode, -blen);
    }
out_unlock:
    if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks
                        outside the rgrp we just processed,
                        do it all over again. */
        if (current->journal_info) {
            struct buffer_head *dibh;

            ret = gfs2_meta_inode_buffer(ip, &dibh);
            if (ret)
                goto out;

            /* Every transaction boundary, we rewrite the dinode
               to keep its di_blocks current in case of failure. */
            ip->i_inode.i_mtime = ip->i_inode.i_ctime =
                current_time(&ip->i_inode);
            gfs2_trans_add_meta(ip->i_gl, dibh);
            gfs2_dinode_out(ip, dibh->b_data);
            brelse(dibh);
            up_write(&ip->i_rw_mutex);
            gfs2_trans_end(sdp);
            buf_in_tr = false;
        }
        gfs2_glock_dq_uninit(rd_gh);
        cond_resched();
        goto more_rgrps;
    }
out:
    return ret;
}
1591 
1592 static bool mp_eq_to_hgt(struct metapath *mp, __u16 *list, unsigned int h)
1593 {
1594     if (memcmp(mp->mp_list, list, h * sizeof(mp->mp_list[0])))
1595         return false;
1596     return true;
1597 }
1598 
1599 /**
1600  * find_nonnull_ptr - find a non-null pointer given a metapath and height
1601  * @sdp: The superblock
1602  * @mp: starting metapath
1603  * @h: desired height to search
1604  * @end_list: See punch_hole().
1605  * @end_aligned: See punch_hole().
1606  *
1607  * Assumes the metapath is valid (with buffers) out to height h.
1608  * Returns: true if a non-null pointer was found in the metapath buffer
1609  *          false if all remaining pointers are NULL in the buffer
1610  */
1611 static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp,
1612                  unsigned int h,
1613                  __u16 *end_list, unsigned int end_aligned)
1614 {
1615     struct buffer_head *bh = mp->mp_bh[h];
1616     __be64 *first, *ptr, *end;
1617 
1618     first = metaptr1(h, mp);
1619     ptr = first + mp->mp_list[h];
1620     end = (__be64 *)(bh->b_data + bh->b_size);
1621     if (end_list && mp_eq_to_hgt(mp, end_list, h)) {
1622         bool keep_end = h < end_aligned;
1623         end = first + end_list[h] + keep_end;
1624     }
1625 
1626     while (ptr < end) {
1627         if (*ptr) { /* if we have a non-null pointer */
1628             mp->mp_list[h] = ptr - first;
1629             h++;
1630             if (h < GFS2_MAX_META_HEIGHT)
1631                 mp->mp_list[h] = 0;
1632             return true;
1633         }
1634         ptr++;
1635     }
1636     return false;
1637 }
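/*
 * [Editor's note] For example, if mp_list[h] == 3 and the pointers in
 * slots 3 and 4 of this buffer are zero while slot 5 is non-zero,
 * find_nonnull_ptr() sets mp_list[h] = 5, resets mp_list[h + 1] to 0 so
 * that the search at the next height starts at the beginning of the new
 * subtree, and returns true.
 */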
1638 
1639 enum dealloc_states {
1640     DEALLOC_MP_FULL = 0,    /* Strip a metapath with all buffers read in */
1641     DEALLOC_MP_LOWER = 1,   /* lower the metapath strip height */
1642     DEALLOC_FILL_MP = 2,  /* Fill in the metapath to the given height. */
1643     DEALLOC_DONE = 3,       /* process complete */
1644 };
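/*
 * [Editor's note] The transitions between these states, as driven by the
 * switch statement in punch_hole() below:
 *
 *   DEALLOC_FILL_MP  -> DEALLOC_MP_FULL   buffers read in up to strip_h
 *   DEALLOC_FILL_MP  -> DEALLOC_MP_LOWER  no non-null pointer at this height
 *   DEALLOC_MP_FULL  -> DEALLOC_MP_LOWER  buffer swept, metadata remains
 *   DEALLOC_MP_FULL  -> DEALLOC_DONE      error, or the dinode was swept
 *   DEALLOC_MP_LOWER -> DEALLOC_FILL_MP   non-null pointer found, or the
 *                                         strip height was lowered
 */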
1645 
1646 static inline void
1647 metapointer_range(struct metapath *mp, int height,
1648           __u16 *start_list, unsigned int start_aligned,
1649           __u16 *end_list, unsigned int end_aligned,
1650           __be64 **start, __be64 **end)
1651 {
1652     struct buffer_head *bh = mp->mp_bh[height];
1653     __be64 *first;
1654 
1655     first = metaptr1(height, mp);
1656     *start = first;
1657     if (mp_eq_to_hgt(mp, start_list, height)) {
1658         bool keep_start = height < start_aligned;
1659         *start = first + start_list[height] + keep_start;
1660     }
1661     *end = (__be64 *)(bh->b_data + bh->b_size);
1662     if (end_list && mp_eq_to_hgt(mp, end_list, height)) {
1663         bool keep_end = height < end_aligned;
1664         *end = first + end_list[height] + keep_end;
1665     }
1666 }
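/*
 * [Editor's note, one reading of the keep_start/keep_end logic] When the
 * current position is not aligned at this height (height < start_aligned
 * or height < end_aligned), the pointer at the start_list/end_list slot
 * leads to a subtree that still contains blocks outside the hole, so the
 * returned range skips past it and the pointer survives the sweep.  At
 * heights where the position falls on a metadata boundary, the boundary
 * pointer itself is included in the range and may be deallocated.
 */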
1667 
1668 static inline bool walk_done(struct gfs2_sbd *sdp,
1669                  struct metapath *mp, int height,
1670                  __u16 *end_list, unsigned int end_aligned)
1671 {
1672     __u16 end;
1673 
1674     if (end_list) {
1675         bool keep_end = height < end_aligned;
1676         if (!mp_eq_to_hgt(mp, end_list, height))
1677             return false;
1678         end = end_list[height] + keep_end;
1679     } else
1680         end = (height > 0) ? sdp->sd_inptrs : sdp->sd_diptrs;
1681     return mp->mp_list[height] >= end;
1682 }
1683 
1684 /**
1685  * punch_hole - deallocate blocks in a file
1686  * @ip: inode to truncate
1687  * @offset: the start of the hole
1688  * @length: the size of the hole (or 0 for truncate)
1689  *
1690  * Punch a hole into a file or truncate a file at a given position.  This
1691  * function operates in whole blocks (@offset and @length are rounded
1692  * accordingly); partially filled blocks must be cleared otherwise.
1693  *
1694  * This function works from the bottom up, and from the right to the left. In
1695  * other words, it strips off the highest layer (data) before stripping any of
1696  * the metadata. Doing it this way is best in case the operation is interrupted
1697  * by power failure, etc.  The dinode is rewritten in every transaction to
1698  * guarantee integrity.
1699  */
1700 static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length)
1701 {
1702     struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1703     u64 maxsize = sdp->sd_heightsize[ip->i_height];
1704     struct metapath mp = {};
1705     struct buffer_head *dibh, *bh;
1706     struct gfs2_holder rd_gh;
1707     unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift;
1708     u64 lblock = (offset + (1 << bsize_shift) - 1) >> bsize_shift;
1709     __u16 start_list[GFS2_MAX_META_HEIGHT];
1710     __u16 __end_list[GFS2_MAX_META_HEIGHT], *end_list = NULL;
1711     unsigned int start_aligned, end_aligned;
1712     unsigned int strip_h = ip->i_height - 1;
1713     u32 btotal = 0;
1714     int ret, state;
1715     int mp_h; /* metapath buffers are read in to this height */
1716     u64 prev_bnr = 0;
1717     __be64 *start, *end;
1718 
1719     if (offset >= maxsize) {
1720         /*
1721          * The starting point lies beyond the allocated metadata;
1722          * there are no blocks to deallocate.
1723          */
1724         return 0;
1725     }
1726 
1727     /*
1728      * The start position of the hole is defined by lblock, start_list, and
1729      * start_aligned.  The end position of the hole is defined by lend,
1730      * end_list, and end_aligned.
1731      *
1732      * start_aligned and end_aligned define down to which height the start
1733      * and end positions are aligned to the metadata tree (i.e., the
1734      * position is a multiple of the metadata granularity at the height
1735      * above).  This determines at which heights additional meta pointers
1736      * need to be preserved for the remaining data.
1737      */
1738 
1739     if (length) {
1740         u64 end_offset = offset + length;
1741         u64 lend;
1742 
1743         /*
1744          * Clip the end at the maximum file size for the given height:
1745          * that's how far the metadata goes; files bigger than that
1746          * will have additional layers of indirection.
1747          */
1748         if (end_offset > maxsize)
1749             end_offset = maxsize;
1750         lend = end_offset >> bsize_shift;
1751 
1752         if (lblock >= lend)
1753             return 0;
1754 
1755         find_metapath(sdp, lend, &mp, ip->i_height);
1756         end_list = __end_list;
1757         memcpy(end_list, mp.mp_list, sizeof(mp.mp_list));
1758 
1759         for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) {
1760             if (end_list[mp_h])
1761                 break;
1762         }
1763         end_aligned = mp_h;
1764     }
1765 
1766     find_metapath(sdp, lblock, &mp, ip->i_height);
1767     memcpy(start_list, mp.mp_list, sizeof(start_list));
1768 
1769     for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) {
1770         if (start_list[mp_h])
1771             break;
1772     }
1773     start_aligned = mp_h;
1774 
1775     ret = gfs2_meta_inode_buffer(ip, &dibh);
1776     if (ret)
1777         return ret;
1778 
1779     mp.mp_bh[0] = dibh;
1780     ret = lookup_metapath(ip, &mp);
1781     if (ret)
1782         goto out_metapath;
1783 
1784     /* issue read-ahead on metadata */
1785     for (mp_h = 0; mp_h < mp.mp_aheight - 1; mp_h++) {
1786         metapointer_range(&mp, mp_h, start_list, start_aligned,
1787                   end_list, end_aligned, &start, &end);
1788         gfs2_metapath_ra(ip->i_gl, start, end);
1789     }
1790 
1791     if (mp.mp_aheight == ip->i_height)
1792         state = DEALLOC_MP_FULL; /* We have a complete metapath */
1793     else
1794         state = DEALLOC_FILL_MP; /* deal with partial metapath */
1795 
1796     ret = gfs2_rindex_update(sdp);
1797     if (ret)
1798         goto out_metapath;
1799 
1800     ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
1801     if (ret)
1802         goto out_metapath;
1803     gfs2_holder_mark_uninitialized(&rd_gh);
1804 
1805     mp_h = strip_h;
1806 
1807     while (state != DEALLOC_DONE) {
1808         switch (state) {
1809         /* Truncate a full metapath at the given strip height.
1810          * Note that strip_h == mp_h in order to be in this state. */
1811         case DEALLOC_MP_FULL:
1812             bh = mp.mp_bh[mp_h];
1813             gfs2_assert_withdraw(sdp, bh);
1814             if (gfs2_assert_withdraw(sdp,
1815                          prev_bnr != bh->b_blocknr)) {
1816                 fs_emerg(sdp, "inode %llu, block:%llu, i_h:%u, "
1817                      "s_h:%u, mp_h:%u\n",
1818                        (unsigned long long)ip->i_no_addr,
1819                        prev_bnr, ip->i_height, strip_h, mp_h);
1820             }
1821             prev_bnr = bh->b_blocknr;
1822 
1823             if (gfs2_metatype_check(sdp, bh,
1824                         (mp_h ? GFS2_METATYPE_IN :
1825                             GFS2_METATYPE_DI))) {
1826                 ret = -EIO;
1827                 goto out;
1828             }
1829 
1830             /*
1831              * Below, passing end_aligned as 0 gives us the
1832              * metapointer range excluding the end point: the end
1833              * point is the first metapath we must not deallocate!
1834              */
1835 
1836             metapointer_range(&mp, mp_h, start_list, start_aligned,
1837                       end_list, 0 /* end_aligned */,
1838                       &start, &end);
1839             ret = sweep_bh_for_rgrps(ip, &rd_gh, mp.mp_bh[mp_h],
1840                          start, end,
1841                          mp_h != ip->i_height - 1,
1842                          &btotal);
1843 
1844             /* If we hit an error or have just swept the dinode
1845                buffer, exit. */
1846             if (ret || !mp_h) {
1847                 state = DEALLOC_DONE;
1848                 break;
1849             }
1850             state = DEALLOC_MP_LOWER;
1851             break;
1852 
1853         /* lower the metapath strip height */
1854         case DEALLOC_MP_LOWER:
1855             /* We're done with the current buffer, so release it,
1856                unless it's the dinode buffer. Then back up to the
1857                previous pointer. */
1858             if (mp_h) {
1859                 brelse(mp.mp_bh[mp_h]);
1860                 mp.mp_bh[mp_h] = NULL;
1861             }
1862             /* If we can't get any lower in height, we've stripped
1863                off all we can. Next step is to back up and start
1864                stripping the previous level of metadata. */
1865             if (mp_h == 0) {
1866                 strip_h--;
1867                 memcpy(mp.mp_list, start_list, sizeof(start_list));
1868                 mp_h = strip_h;
1869                 state = DEALLOC_FILL_MP;
1870                 break;
1871             }
1872             mp.mp_list[mp_h] = 0;
1873             mp_h--; /* search one metadata height down */
1874             mp.mp_list[mp_h]++;
1875             if (walk_done(sdp, &mp, mp_h, end_list, end_aligned))
1876                 break;
1877             /* Here we've found a part of the metapath that is not
1878              * allocated. We need to search at that height for the
1879              * next non-null pointer. */
1880             if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned)) {
1881                 state = DEALLOC_FILL_MP;
1882                 mp_h++;
1883             }
1884             /* No more non-null pointers at this height. Back up
1885                to the previous height and try again. */
1886             break; /* loop around in the same state */
1887 
1888         /* Fill the metapath with buffers to the given height. */
1889         case DEALLOC_FILL_MP:
1890             /* Fill the buffers out to the current height. */
1891             ret = fillup_metapath(ip, &mp, mp_h);
1892             if (ret < 0)
1893                 goto out;
1894 
1895             /* On the first pass, issue read-ahead on metadata. */
1896             if (mp.mp_aheight > 1 && strip_h == ip->i_height - 1) {
1897                 unsigned int height = mp.mp_aheight - 1;
1898 
1899                 /* No read-ahead for data blocks. */
1900                 if (mp.mp_aheight - 1 == strip_h)
1901                     height--;
1902 
1903                 for (; height >= mp.mp_aheight - ret; height--) {
1904                     metapointer_range(&mp, height,
1905                               start_list, start_aligned,
1906                               end_list, end_aligned,
1907                               &start, &end);
1908                     gfs2_metapath_ra(ip->i_gl, start, end);
1909                 }
1910             }
1911 
1912             /* If buffers were found for the entire strip height */
1913             if (mp.mp_aheight - 1 == strip_h) {
1914                 state = DEALLOC_MP_FULL;
1915                 break;
1916             }
1917             if (mp.mp_aheight < ip->i_height) /* We have a partial height */
1918                 mp_h = mp.mp_aheight - 1;
1919 
1920             /* If we find a non-null block pointer, crawl a bit
1921                higher up in the metapath and try again, otherwise
1922                we need to look lower for a new starting point. */
1923             if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned))
1924                 mp_h++;
1925             else
1926                 state = DEALLOC_MP_LOWER;
1927             break;
1928         }
1929     }
1930 
1931     if (btotal) {
1932         if (current->journal_info == NULL) {
1933             ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS +
1934                            RES_QUOTA, 0);
1935             if (ret)
1936                 goto out;
1937             down_write(&ip->i_rw_mutex);
1938         }
1939         gfs2_statfs_change(sdp, 0, +btotal, 0);
1940         gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
1941                   ip->i_inode.i_gid);
1942         ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1943         gfs2_trans_add_meta(ip->i_gl, dibh);
1944         gfs2_dinode_out(ip, dibh->b_data);
1945         up_write(&ip->i_rw_mutex);
1946         gfs2_trans_end(sdp);
1947     }
1948 
1949 out:
1950     if (gfs2_holder_initialized(&rd_gh))
1951         gfs2_glock_dq_uninit(&rd_gh);
1952     if (current->journal_info) {
1953         up_write(&ip->i_rw_mutex);
1954         gfs2_trans_end(sdp);
1955         cond_resched();
1956     }
1957     gfs2_quota_unhold(ip);
1958 out_metapath:
1959     release_metapath(&mp);
1960     return ret;
1961 }
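/*
 * [Editor's sketch, not part of the original file] punch_hole() rounds the
 * byte range inward to whole blocks: the first block is rounded up, the end
 * block is rounded down, and nothing is deallocated when no whole block lies
 * inside the range.  A standalone userspace illustration, assuming 4 KiB
 * blocks (bsize_shift = 12):
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned int bsize_shift = 12;		/* 4096-byte blocks */
	uint64_t offset = 5000, length = 10000;
	uint64_t end_offset = offset + length;

	/* same rounding as punch_hole() above */
	uint64_t lblock = (offset + (1ULL << bsize_shift) - 1) >> bsize_shift;
	uint64_t lend = end_offset >> bsize_shift;

	if (lblock >= lend)
		printf("no whole block inside the range\n");
	else	/* prints "deallocate blocks 2..2" for these values */
		printf("deallocate blocks %llu..%llu\n",
		       (unsigned long long)lblock,
		       (unsigned long long)(lend - 1));
	return 0;
}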
1962 
1963 static int trunc_end(struct gfs2_inode *ip)
1964 {
1965     struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1966     struct buffer_head *dibh;
1967     int error;
1968 
1969     error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1970     if (error)
1971         return error;
1972 
1973     down_write(&ip->i_rw_mutex);
1974 
1975     error = gfs2_meta_inode_buffer(ip, &dibh);
1976     if (error)
1977         goto out;
1978 
1979     if (!i_size_read(&ip->i_inode)) {
1980         ip->i_height = 0;
1981         ip->i_goal = ip->i_no_addr;
1982         gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
1983         gfs2_ordered_del_inode(ip);
1984     }
1985     ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1986     ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
1987 
1988     gfs2_trans_add_meta(ip->i_gl, dibh);
1989     gfs2_dinode_out(ip, dibh->b_data);
1990     brelse(dibh);
1991 
1992 out:
1993     up_write(&ip->i_rw_mutex);
1994     gfs2_trans_end(sdp);
1995     return error;
1996 }
1997 
1998 /**
1999  * do_shrink - make a file smaller
2000  * @inode: the inode
2001  * @newsize: the size to make the file
2002  *
2003  * Called with an exclusive lock on @inode. @newsize must
2004  * be equal to or smaller than the current inode size.
2005  *
2006  * Returns: errno
2007  */
2008 
2009 static int do_shrink(struct inode *inode, u64 newsize)
2010 {
2011     struct gfs2_inode *ip = GFS2_I(inode);
2012     int error;
2013 
2014     error = trunc_start(inode, newsize);
2015     if (error < 0)
2016         return error;
2017     if (gfs2_is_stuffed(ip))
2018         return 0;
2019 
2020     error = punch_hole(ip, newsize, 0);
2021     if (error == 0)
2022         error = trunc_end(ip);
2023 
2024     return error;
2025 }
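/*
 * [Editor's note] The shrink path is thus: trunc_start() handles the size
 * change and any partial last block, punch_hole(ip, newsize, 0) deallocates
 * everything from the new size onward (length 0 means "to the end of the
 * file"), and trunc_end() rewrites the dinode and clears
 * GFS2_DIF_TRUNC_IN_PROG.
 */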
2026 
2027 void gfs2_trim_blocks(struct inode *inode)
2028 {
2029     int ret;
2030 
2031     ret = do_shrink(inode, inode->i_size);
2032     WARN_ON(ret != 0);
2033 }
2034 
2035 /**
2036  * do_grow - Touch and update inode size
2037  * @inode: The inode
2038  * @size: The new size
2039  *
2040  * This function updates the timestamps on the inode and
2041  * may also increase the size of the inode. This function
2042  * must not be called with @size any smaller than the current
2043  * inode size.
2044  *
2045  * Although it is not strictly required to unstuff files here,
2046  * earlier versions of GFS2 have a bug in the stuffed file reading
2047  * code which will result in a buffer overrun if the size is larger
2048  * than the max stuffed file size. In order to prevent this from
2049  * occurring, such files are unstuffed, but in other cases we can
2050  * just update the inode size directly.
2051  *
2052  * Returns: 0 on success, or -ve on error
2053  */
2054 
2055 static int do_grow(struct inode *inode, u64 size)
2056 {
2057     struct gfs2_inode *ip = GFS2_I(inode);
2058     struct gfs2_sbd *sdp = GFS2_SB(inode);
2059     struct gfs2_alloc_parms ap = { .target = 1, };
2060     struct buffer_head *dibh;
2061     int error;
2062     int unstuff = 0;
2063 
2064     if (gfs2_is_stuffed(ip) && size > gfs2_max_stuffed_size(ip)) {
2065         error = gfs2_quota_lock_check(ip, &ap);
2066         if (error)
2067             return error;
2068 
2069         error = gfs2_inplace_reserve(ip, &ap);
2070         if (error)
2071             goto do_grow_qunlock;
2072         unstuff = 1;
2073     }
2074 
2075     error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT +
2076                  (unstuff &&
2077                   gfs2_is_jdata(ip) ? RES_JDATA : 0) +
2078                  (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ?
2079                   0 : RES_QUOTA), 0);
2080     if (error)
2081         goto do_grow_release;
2082 
2083     if (unstuff) {
2084         error = gfs2_unstuff_dinode(ip);
2085         if (error)
2086             goto do_end_trans;
2087     }
2088 
2089     error = gfs2_meta_inode_buffer(ip, &dibh);
2090     if (error)
2091         goto do_end_trans;
2092 
2093     truncate_setsize(inode, size);
2094     ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
2095     gfs2_trans_add_meta(ip->i_gl, dibh);
2096     gfs2_dinode_out(ip, dibh->b_data);
2097     brelse(dibh);
2098 
2099 do_end_trans:
2100     gfs2_trans_end(sdp);
2101 do_grow_release:
2102     if (unstuff) {
2103         gfs2_inplace_release(ip);
2104 do_grow_qunlock:
2105         gfs2_quota_unlock(ip);
2106     }
2107     return error;
2108 }
2109 
2110 /**
2111  * gfs2_setattr_size - make a file a given size
2112  * @inode: the inode
2113  * @newsize: the size to make the file
2114  *
2115  * The file size can grow, shrink, or stay the same size. This
2116  * is called holding i_rwsem and an exclusive glock on the inode
2117  * in question.
2118  *
2119  * Returns: errno
2120  */
2121 
2122 int gfs2_setattr_size(struct inode *inode, u64 newsize)
2123 {
2124     struct gfs2_inode *ip = GFS2_I(inode);
2125     int ret;
2126 
2127     BUG_ON(!S_ISREG(inode->i_mode));
2128 
2129     ret = inode_newsize_ok(inode, newsize);
2130     if (ret)
2131         return ret;
2132 
2133     inode_dio_wait(inode);
2134 
2135     ret = gfs2_qa_get(ip);
2136     if (ret)
2137         goto out;
2138 
2139     if (newsize >= inode->i_size) {
2140         ret = do_grow(inode, newsize);
2141         goto out;
2142     }
2143 
2144     ret = do_shrink(inode, newsize);
2145 out:
2146     gfs2_rs_delete(ip);
2147     gfs2_qa_put(ip);
2148     return ret;
2149 }
2150 
2151 int gfs2_truncatei_resume(struct gfs2_inode *ip)
2152 {
2153     int error;
2154     error = punch_hole(ip, i_size_read(&ip->i_inode), 0);
2155     if (!error)
2156         error = trunc_end(ip);
2157     return error;
2158 }
2159 
2160 int gfs2_file_dealloc(struct gfs2_inode *ip)
2161 {
2162     return punch_hole(ip, 0, 0);
2163 }
2164 
2165 /**
2166  * gfs2_free_journal_extents - Free cached journal bmap info
2167  * @jd: The journal
2168  *
2169  */
2170 
2171 void gfs2_free_journal_extents(struct gfs2_jdesc *jd)
2172 {
2173     struct gfs2_journal_extent *jext;
2174 
2175     while (!list_empty(&jd->extent_list)) {
2176         jext = list_first_entry(&jd->extent_list, struct gfs2_journal_extent, list);
2177         list_del(&jext->list);
2178         kfree(jext);
2179     }
2180 }
2181 
2182 /**
2183  * gfs2_add_jextent - Add or merge a new extent to extent cache
2184  * @jd: The journal descriptor
2185  * @lblock: The logical block at start of new extent
2186  * @dblock: The physical block at start of new extent
2187  * @blocks: Size of extent in fs blocks
2188  *
2189  * Returns: 0 on success or -ENOMEM
2190  */
2191 
2192 static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 blocks)
2193 {
2194     struct gfs2_journal_extent *jext;
2195 
2196     if (!list_empty(&jd->extent_list)) {
2197         jext = list_last_entry(&jd->extent_list, struct gfs2_journal_extent, list);
2198         if ((jext->dblock + jext->blocks) == dblock) {
2199             jext->blocks += blocks;
2200             return 0;
2201         }
2202     }
2203 
2204     jext = kzalloc(sizeof(struct gfs2_journal_extent), GFP_NOFS);
2205     if (jext == NULL)
2206         return -ENOMEM;
2207     jext->dblock = dblock;
2208     jext->lblock = lblock;
2209     jext->blocks = blocks;
2210     list_add_tail(&jext->list, &jd->extent_list);
2211     jd->nr_extents++;
2212     return 0;
2213 }
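/*
 * [Editor's note] Example: if the last cached extent maps dblocks
 * 1000..1099 (dblock = 1000, blocks = 100) and the next mapping starts at
 * dblock 1100, the existing extent simply grows to cover the new blocks;
 * any other starting dblock appends a fresh extent to the tail of the list.
 */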
2214 
2215 /**
2216  * gfs2_map_journal_extents - Cache journal bmap info
2217  * @sdp: The super block
2218  * @jd: The journal to map
2219  *
2220  * Create a reusable "extent" mapping from all logical
2221  * blocks to all physical blocks for the given journal.  This will save
2222  * us time when writing journal blocks.  Most journals will have only one
2223  * extent that maps all their logical blocks.  That's because mkfs.gfs2
2224  * arranges the journal blocks sequentially to maximize performance.
2225  * So the extent would map the first block for the entire file length.
2226  * However, gfs2_jadd can happen while file activity is happening, so
2227  * those journals may not be sequential.  Less likely is the case where
2228  * the users created their own journals by mounting the metafs and
2229  * laying it out.  But it's still possible.  These journals might have
2230  * several extents.
2231  *
2232  * Returns: 0 on success, or error on failure
2233  */
2234 
2235 int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd)
2236 {
2237     u64 lblock = 0;
2238     u64 lblock_stop;
2239     struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
2240     struct buffer_head bh;
2241     unsigned int shift = sdp->sd_sb.sb_bsize_shift;
2242     u64 size;
2243     int rc;
2244     ktime_t start, end;
2245 
2246     start = ktime_get();
2247     lblock_stop = i_size_read(jd->jd_inode) >> shift;
2248     size = (lblock_stop - lblock) << shift;
2249     jd->nr_extents = 0;
2250     WARN_ON(!list_empty(&jd->extent_list));
2251 
2252     do {
2253         bh.b_state = 0;
2254         bh.b_blocknr = 0;
2255         bh.b_size = size;
2256         rc = gfs2_block_map(jd->jd_inode, lblock, &bh, 0);
2257         if (rc || !buffer_mapped(&bh))
2258             goto fail;
2259         rc = gfs2_add_jextent(jd, lblock, bh.b_blocknr, bh.b_size >> shift);
2260         if (rc)
2261             goto fail;
2262         size -= bh.b_size;
2263         lblock += (bh.b_size >> ip->i_inode.i_blkbits);
2264     } while (size > 0);
2265 
2266     end = ktime_get();
2267     fs_info(sdp, "journal %d mapped with %u extents in %lldms\n", jd->jd_jid,
2268         jd->nr_extents, ktime_ms_delta(end, start));
2269     return 0;
2270 
2271 fail:
2272     fs_warn(sdp, "error %d mapping journal %u at offset %llu (extent %u)\n",
2273         rc, jd->jd_jid,
2274         (unsigned long long)(i_size_read(jd->jd_inode) - size),
2275         jd->nr_extents);
2276     fs_warn(sdp, "bmap=%d lblock=%llu block=%llu, state=0x%08lx, size=%llu\n",
2277         rc, (unsigned long long)lblock, (unsigned long long)bh.b_blocknr,
2278         bh.b_state, (unsigned long long)bh.b_size);
2279     gfs2_free_journal_extents(jd);
2280     return rc;
2281 }
2282 
2283 /**
2284  * gfs2_write_alloc_required - figure out if a write will require an allocation
2285  * @ip: the file being written to
2286  * @offset: the offset to write to
2287  * @len: the number of bytes being written
2288  *
2289  * Returns: 1 if an alloc is required, 0 otherwise
2290  */
2291 
2292 int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
2293                   unsigned int len)
2294 {
2295     struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2296     struct buffer_head bh;
2297     unsigned int shift;
2298     u64 lblock, lblock_stop, size;
2299     u64 end_of_file;
2300 
2301     if (!len)
2302         return 0;
2303 
2304     if (gfs2_is_stuffed(ip)) {
2305         if (offset + len > gfs2_max_stuffed_size(ip))
2306             return 1;
2307         return 0;
2308     }
2309 
2310     shift = sdp->sd_sb.sb_bsize_shift;
2311     BUG_ON(gfs2_is_dir(ip));
2312     end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
2313     lblock = offset >> shift;
2314     lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
2315     if (lblock_stop > end_of_file && ip != GFS2_I(sdp->sd_rindex))
2316         return 1;
2317 
2318     size = (lblock_stop - lblock) << shift;
2319     do {
2320         bh.b_state = 0;
2321         bh.b_size = size;
2322         gfs2_block_map(&ip->i_inode, lblock, &bh, 0);
2323         if (!buffer_mapped(&bh))
2324             return 1;
2325         size -= bh.b_size;
2326         lblock += (bh.b_size >> ip->i_inode.i_blkbits);
2327     } while (size > 0);
2328 
2329     return 0;
2330 }
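/*
 * [Editor's example, assuming 4 KiB blocks] A write of len = 8192 at
 * offset = 100 spans lblock = 100 >> 12 = 0 through
 * lblock_stop = (100 + 8192 + 4095) >> 12 = 3, so blocks 0..2 are checked
 * with gfs2_block_map(); the first unmapped block means an allocation is
 * required.
 */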
2331 
2332 static int stuffed_zero_range(struct inode *inode, loff_t offset, loff_t length)
2333 {
2334     struct gfs2_inode *ip = GFS2_I(inode);
2335     struct buffer_head *dibh;
2336     int error;
2337 
2338     if (offset >= inode->i_size)
2339         return 0;
2340     if (offset + length > inode->i_size)
2341         length = inode->i_size - offset;
2342 
2343     error = gfs2_meta_inode_buffer(ip, &dibh);
2344     if (error)
2345         return error;
2346     gfs2_trans_add_meta(ip->i_gl, dibh);
2347     memset(dibh->b_data + sizeof(struct gfs2_dinode) + offset, 0,
2348            length);
2349     brelse(dibh);
2350     return 0;
2351 }
2352 
2353 static int gfs2_journaled_truncate_range(struct inode *inode, loff_t offset,
2354                      loff_t length)
2355 {
2356     struct gfs2_sbd *sdp = GFS2_SB(inode);
2357     loff_t max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
2358     int error;
2359 
2360     while (length) {
2361         struct gfs2_trans *tr;
2362         loff_t chunk;
2363         unsigned int offs;
2364 
2365         chunk = length;
2366         if (chunk > max_chunk)
2367             chunk = max_chunk;
2368 
2369         offs = offset & ~PAGE_MASK;
2370         if (offs && chunk > PAGE_SIZE)
2371             chunk = offs + ((chunk - offs) & PAGE_MASK);
2372 
2373         truncate_pagecache_range(inode, offset, chunk);
2374         offset += chunk;
2375         length -= chunk;
2376 
2377         tr = current->journal_info;
2378         if (!test_bit(TR_TOUCHED, &tr->tr_flags))
2379             continue;
2380 
2381         gfs2_trans_end(sdp);
2382         error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
2383         if (error)
2384             return error;
2385     }
2386     return 0;
2387 }
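/*
 * [Editor's note] For journaled data the page cache is truncated in chunks
 * of at most GFS2_JTRUNC_REVOKES * blocksize bytes, ending the current
 * transaction and starting a fresh one between chunks so that the revoke
 * reservation made by gfs2_trans_begin() is never exceeded.
 */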
2388 
2389 int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length)
2390 {
2391     struct inode *inode = file_inode(file);
2392     struct gfs2_inode *ip = GFS2_I(inode);
2393     struct gfs2_sbd *sdp = GFS2_SB(inode);
2394     unsigned int blocksize = i_blocksize(inode);
2395     loff_t start, end;
2396     int error;
2397 
2398     if (!gfs2_is_stuffed(ip)) {
2399         unsigned int start_off, end_len;
2400 
2401         start_off = offset & (blocksize - 1);
2402         end_len = (offset + length) & (blocksize - 1);
2403         if (start_off) {
2404             unsigned int len = length;
2405             if (length > blocksize - start_off)
2406                 len = blocksize - start_off;
2407             error = gfs2_block_zero_range(inode, offset, len);
2408             if (error)
2409                 goto out;
2410             if (start_off + length < blocksize)
2411                 end_len = 0;
2412         }
2413         if (end_len) {
2414             error = gfs2_block_zero_range(inode,
2415                 offset + length - end_len, end_len);
2416             if (error)
2417                 goto out;
2418         }
2419     }
2420 
2421     start = round_down(offset, blocksize);
2422     end = round_up(offset + length, blocksize) - 1;
2423     error = filemap_write_and_wait_range(inode->i_mapping, start, end);
2424     if (error)
2425         return error;
2426 
2427     if (gfs2_is_jdata(ip))
2428         error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_JDATA,
2429                      GFS2_JTRUNC_REVOKES);
2430     else
2431         error = gfs2_trans_begin(sdp, RES_DINODE, 0);
2432     if (error)
2433         return error;
2434 
2435     if (gfs2_is_stuffed(ip)) {
2436         error = stuffed_zero_range(inode, offset, length);
2437         if (error)
2438             goto out;
2439     }
2440 
2441     if (gfs2_is_jdata(ip)) {
2442         BUG_ON(!current->journal_info);
2443         gfs2_journaled_truncate_range(inode, offset, length);
2444     } else
2445         truncate_pagecache_range(inode, offset, offset + length - 1);
2446 
2447     file_update_time(file);
2448     mark_inode_dirty(inode);
2449 
2450     if (current->journal_info)
2451         gfs2_trans_end(sdp);
2452 
2453     if (!gfs2_is_stuffed(ip))
2454         error = punch_hole(ip, offset, length);
2455 
2456 out:
2457     if (current->journal_info)
2458         gfs2_trans_end(sdp);
2459     return error;
2460 }
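/*
 * [Editor's example, assuming 4 KiB blocks] Punching offset = 1000,
 * length = 10000 on an unstuffed file: start_off = 1000, so bytes
 * 1000..4095 are zeroed in place (len = 3096); end_len = 11000 & 4095 =
 * 2808, so bytes 8192..10999 are zeroed as well; the single fully covered
 * block in between (block 1) is then deallocated by punch_hole().
 */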
2461 
2462 static int gfs2_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode,
2463         loff_t offset)
2464 {
2465     int ret;
2466 
2467     if (WARN_ON_ONCE(gfs2_is_stuffed(GFS2_I(inode))))
2468         return -EIO;
2469 
2470     if (offset >= wpc->iomap.offset &&
2471         offset < wpc->iomap.offset + wpc->iomap.length)
2472         return 0;
2473 
2474     memset(&wpc->iomap, 0, sizeof(wpc->iomap));
2475     ret = gfs2_iomap_get(inode, offset, INT_MAX, &wpc->iomap);
2476     return ret;
2477 }
2478 
2479 const struct iomap_writeback_ops gfs2_writeback_ops = {
2480     .map_blocks     = gfs2_map_blocks,
2481 };