0001 /*
0002  * hugetlbpage-backed filesystem.  Based on ramfs.
0003  *
0004  * Nadia Yvette Chambers, 2002
0005  *
0006  * Copyright (C) 2002 Linus Torvalds.
0007  * License: GPL
0008  */
0009 
0010 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
0011 
0012 #include <linux/thread_info.h>
0013 #include <asm/current.h>
0014 #include <linux/falloc.h>
0015 #include <linux/fs.h>
0016 #include <linux/mount.h>
0017 #include <linux/file.h>
0018 #include <linux/kernel.h>
0019 #include <linux/writeback.h>
0020 #include <linux/pagemap.h>
0021 #include <linux/highmem.h>
0022 #include <linux/init.h>
0023 #include <linux/string.h>
0024 #include <linux/capability.h>
0025 #include <linux/ctype.h>
0026 #include <linux/backing-dev.h>
0027 #include <linux/hugetlb.h>
0028 #include <linux/pagevec.h>
0029 #include <linux/fs_parser.h>
0030 #include <linux/mman.h>
0031 #include <linux/slab.h>
0032 #include <linux/dnotify.h>
0033 #include <linux/statfs.h>
0034 #include <linux/security.h>
0035 #include <linux/magic.h>
0036 #include <linux/migrate.h>
0037 #include <linux/uio.h>
0038 
0039 #include <linux/uaccess.h>
0040 #include <linux/sched/mm.h>
0041 
0042 static const struct address_space_operations hugetlbfs_aops;
0043 const struct file_operations hugetlbfs_file_operations;
0044 static const struct inode_operations hugetlbfs_dir_inode_operations;
0045 static const struct inode_operations hugetlbfs_inode_operations;
0046 
0047 enum hugetlbfs_size_type { NO_SIZE, SIZE_STD, SIZE_PERCENT };
0048 
0049 struct hugetlbfs_fs_context {
0050     struct hstate       *hstate;
0051     unsigned long long  max_size_opt;
0052     unsigned long long  min_size_opt;
0053     long            max_hpages;
0054     long            nr_inodes;
0055     long            min_hpages;
0056     enum hugetlbfs_size_type max_val_type;
0057     enum hugetlbfs_size_type min_val_type;
0058     kuid_t          uid;
0059     kgid_t          gid;
0060     umode_t         mode;
0061 };
0062 
0063 int sysctl_hugetlb_shm_group;
0064 
0065 enum hugetlb_param {
0066     Opt_gid,
0067     Opt_min_size,
0068     Opt_mode,
0069     Opt_nr_inodes,
0070     Opt_pagesize,
0071     Opt_size,
0072     Opt_uid,
0073 };
0074 
0075 static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
0076     fsparam_u32   ("gid",       Opt_gid),
0077     fsparam_string("min_size",  Opt_min_size),
0078     fsparam_u32oct("mode",      Opt_mode),
0079     fsparam_string("nr_inodes", Opt_nr_inodes),
0080     fsparam_string("pagesize",  Opt_pagesize),
0081     fsparam_string("size",      Opt_size),
0082     fsparam_u32   ("uid",       Opt_uid),
0083     {}
0084 };
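
/*
 * Illustrative sketch (editorial note, not part of the original file): the
 * parameters above are normally supplied in the mount data string.  A
 * minimal userspace example, assuming a 2MB hstate is configured and the
 * mount point /mnt/huge already exists, might look like this:
 */
#if 0
#include <sys/mount.h>
#include <stdio.h>

int main(void)
{
    /* pagesize/size/min_size/mode are parsed by hugetlbfs_parse_param() */
    if (mount("none", "/mnt/huge", "hugetlbfs", 0,
              "pagesize=2M,size=1G,min_size=512M,mode=1770"))
        perror("mount");
    return 0;
}
#endif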
0085 
0086 #ifdef CONFIG_NUMA
0087 static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
0088                     struct inode *inode, pgoff_t index)
0089 {
0090     vma->vm_policy = mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy,
0091                             index);
0092 }
0093 
0094 static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
0095 {
0096     mpol_cond_put(vma->vm_policy);
0097 }
0098 #else
0099 static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
0100                     struct inode *inode, pgoff_t index)
0101 {
0102 }
0103 
0104 static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
0105 {
0106 }
0107 #endif
0108 
0109 /*
0110  * Mask used when checking the page offset value passed in via system
0111  * calls.  This value will be converted to a loff_t which is signed.
0112  * Therefore, we want to check the upper PAGE_SHIFT + 1 bits of the
0113  * value.  The extra bit (- 1 in the shift value) is to take the sign
0114  * bit into account.
0115  */
0116 #define PGOFF_LOFFT_MAX \
0117     (((1UL << (PAGE_SHIFT + 1)) - 1) <<  (BITS_PER_LONG - (PAGE_SHIFT + 1)))
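
/*
 * Worked example (editorial illustration): on a typical 64-bit
 * configuration with 4KB base pages, PAGE_SHIFT == 12 and
 * BITS_PER_LONG == 64, so PGOFF_LOFFT_MAX is 0x1fff << 51, i.e. the top
 * 13 bits of vm_pgoff.  The byte offset is vm_pgoff << 12 and must fit
 * in the 63 value bits of a signed loff_t, so vm_pgoff must fit in 51
 * bits; any of the top 13 bits being set means the offset would overflow.
 */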
0118 
0119 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
0120 {
0121     struct inode *inode = file_inode(file);
0122     struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
0123     loff_t len, vma_len;
0124     int ret;
0125     struct hstate *h = hstate_file(file);
0126 
0127     /*
0128      * vma address alignment (but not the pgoff alignment) has
0129      * already been checked by prepare_hugepage_range.  If you add
0130      * any error returns here, do so after setting VM_HUGETLB, so
0131      * that the is_vm_hugetlb_page() tests below unmap_region() go
0132      * the right way when do_mmap() unwinds (this may be important
0133      * on powerpc and ia64).
0134      */
0135     vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
0136     vma->vm_ops = &hugetlb_vm_ops;
0137 
0138     ret = seal_check_future_write(info->seals, vma);
0139     if (ret)
0140         return ret;
0141 
0142     /*
0143      * page based offset in vm_pgoff could be sufficiently large to
0144      * overflow a loff_t when converted to byte offset.  This can
0145      * only happen on architectures where sizeof(loff_t) ==
0146      * sizeof(unsigned long).  So, only check in those instances.
0147      */
0148     if (sizeof(unsigned long) == sizeof(loff_t)) {
0149         if (vma->vm_pgoff & PGOFF_LOFFT_MAX)
0150             return -EINVAL;
0151     }
0152 
0153     /* must be huge page aligned */
0154     if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
0155         return -EINVAL;
0156 
0157     vma_len = (loff_t)(vma->vm_end - vma->vm_start);
0158     len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
0159     /* check for overflow */
0160     if (len < vma_len)
0161         return -EINVAL;
0162 
0163     inode_lock(inode);
0164     file_accessed(file);
0165 
0166     ret = -ENOMEM;
0167     if (!hugetlb_reserve_pages(inode,
0168                 vma->vm_pgoff >> huge_page_order(h),
0169                 len >> huge_page_shift(h), vma,
0170                 vma->vm_flags))
0171         goto out;
0172 
0173     ret = 0;
0174     if (vma->vm_flags & VM_WRITE && inode->i_size < len)
0175         i_size_write(inode, len);
0176 out:
0177     inode_unlock(inode);
0178 
0179     return ret;
0180 }
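
/*
 * Illustrative sketch (editorial note, not part of the original file): a
 * minimal userspace mapping of a hugetlbfs file, assuming the filesystem
 * is mounted at /dev/hugepages with 2MB pages.  Both the file offset and
 * the mapping length must be huge page aligned, as enforced above.
 */
#if 0
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>

#define HPAGE_SIZE (2UL * 1024 * 1024)

int main(void)
{
    int fd = open("/dev/hugepages/example", O_CREAT | O_RDWR, 0600);
    char *p;

    if (fd < 0)
        return 1;
    /* offset 0 and the length are both multiples of the huge page size */
    p = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (p == MAP_FAILED)
        return 1;
    memset(p, 0, HPAGE_SIZE);   /* fault in one huge page */
    munmap(p, HPAGE_SIZE);
    close(fd);
    return 0;
}
#endif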
0181 
0182 /*
0183  * Called under mmap_write_lock(mm).
0184  */
0185 
0186 static unsigned long
0187 hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr,
0188         unsigned long len, unsigned long pgoff, unsigned long flags)
0189 {
0190     struct hstate *h = hstate_file(file);
0191     struct vm_unmapped_area_info info;
0192 
0193     info.flags = 0;
0194     info.length = len;
0195     info.low_limit = current->mm->mmap_base;
0196     info.high_limit = arch_get_mmap_end(addr, len, flags);
0197     info.align_mask = PAGE_MASK & ~huge_page_mask(h);
0198     info.align_offset = 0;
0199     return vm_unmapped_area(&info);
0200 }
0201 
0202 static unsigned long
0203 hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr,
0204         unsigned long len, unsigned long pgoff, unsigned long flags)
0205 {
0206     struct hstate *h = hstate_file(file);
0207     struct vm_unmapped_area_info info;
0208 
0209     info.flags = VM_UNMAPPED_AREA_TOPDOWN;
0210     info.length = len;
0211     info.low_limit = max(PAGE_SIZE, mmap_min_addr);
0212     info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
0213     info.align_mask = PAGE_MASK & ~huge_page_mask(h);
0214     info.align_offset = 0;
0215     addr = vm_unmapped_area(&info);
0216 
0217     /*
0218      * A failed mmap() very likely causes application failure,
0219      * so fall back to the bottom-up function here. This scenario
0220      * can happen with large stack limits and large mmap()
0221      * allocations.
0222      */
0223     if (unlikely(offset_in_page(addr))) {
0224         VM_BUG_ON(addr != -ENOMEM);
0225         info.flags = 0;
0226         info.low_limit = current->mm->mmap_base;
0227         info.high_limit = arch_get_mmap_end(addr, len, flags);
0228         addr = vm_unmapped_area(&info);
0229     }
0230 
0231     return addr;
0232 }
0233 
0234 unsigned long
0235 generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
0236                   unsigned long len, unsigned long pgoff,
0237                   unsigned long flags)
0238 {
0239     struct mm_struct *mm = current->mm;
0240     struct vm_area_struct *vma;
0241     struct hstate *h = hstate_file(file);
0242     const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);
0243 
0244     if (len & ~huge_page_mask(h))
0245         return -EINVAL;
0246     if (len > TASK_SIZE)
0247         return -ENOMEM;
0248 
0249     if (flags & MAP_FIXED) {
0250         if (prepare_hugepage_range(file, addr, len))
0251             return -EINVAL;
0252         return addr;
0253     }
0254 
0255     if (addr) {
0256         addr = ALIGN(addr, huge_page_size(h));
0257         vma = find_vma(mm, addr);
0258         if (mmap_end - len >= addr &&
0259             (!vma || addr + len <= vm_start_gap(vma)))
0260             return addr;
0261     }
0262 
0263     /*
0264      * Use mm->get_unmapped_area value as a hint to use topdown routine.
0265      * If architectures have special needs, they should define their own
0266      * version of hugetlb_get_unmapped_area.
0267      */
0268     if (mm->get_unmapped_area == arch_get_unmapped_area_topdown)
0269         return hugetlb_get_unmapped_area_topdown(file, addr, len,
0270                 pgoff, flags);
0271     return hugetlb_get_unmapped_area_bottomup(file, addr, len,
0272             pgoff, flags);
0273 }
0274 
0275 #ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
0276 static unsigned long
0277 hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
0278               unsigned long len, unsigned long pgoff,
0279               unsigned long flags)
0280 {
0281     return generic_hugetlb_get_unmapped_area(file, addr, len, pgoff, flags);
0282 }
0283 #endif
0284 
0285 /*
0286  * Support for read() - Find the page attached to f_mapping and copy out the
0287  * data. This provides functionality similar to filemap_read().
0288  */
0289 static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
0290 {
0291     struct file *file = iocb->ki_filp;
0292     struct hstate *h = hstate_file(file);
0293     struct address_space *mapping = file->f_mapping;
0294     struct inode *inode = mapping->host;
0295     unsigned long index = iocb->ki_pos >> huge_page_shift(h);
0296     unsigned long offset = iocb->ki_pos & ~huge_page_mask(h);
0297     unsigned long end_index;
0298     loff_t isize;
0299     ssize_t retval = 0;
0300 
0301     while (iov_iter_count(to)) {
0302         struct page *page;
0303         size_t nr, copied;
0304 
0305         /* nr is the maximum number of bytes to copy from this page */
0306         nr = huge_page_size(h);
0307         isize = i_size_read(inode);
0308         if (!isize)
0309             break;
0310         end_index = (isize - 1) >> huge_page_shift(h);
0311         if (index > end_index)
0312             break;
0313         if (index == end_index) {
0314             nr = ((isize - 1) & ~huge_page_mask(h)) + 1;
0315             if (nr <= offset)
0316                 break;
0317         }
0318         nr = nr - offset;
0319 
0320         /* Find the page */
0321         page = find_lock_page(mapping, index);
0322         if (unlikely(page == NULL)) {
0323             /*
0324              * We have a HOLE, zero out the user-buffer for the
0325              * length of the hole or request.
0326              */
0327             copied = iov_iter_zero(nr, to);
0328         } else {
0329             unlock_page(page);
0330 
0331             /*
0332              * We have the page, copy it to user space buffer.
0333              */
0334             copied = copy_page_to_iter(page, offset, nr, to);
0335             put_page(page);
0336         }
0337         offset += copied;
0338         retval += copied;
0339         if (copied != nr && iov_iter_count(to)) {
0340             if (!retval)
0341                 retval = -EFAULT;
0342             break;
0343         }
0344         index += offset >> huge_page_shift(h);
0345         offset &= ~huge_page_mask(h);
0346     }
0347     iocb->ki_pos = ((loff_t)index << huge_page_shift(h)) + offset;
0348     return retval;
0349 }
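
/*
 * Illustrative sketch (editorial note): reading a hugetlbfs file from
 * userspace with the loop above; ranges with no page in the cache (holes)
 * read back as zeroes.  Assumes a file already exists under a hugetlbfs
 * mount such as /dev/hugepages.
 */
#if 0
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
    char buf[4096];
    int fd = open("/dev/hugepages/example", O_RDONLY);

    if (fd < 0)
        return 1;
    /* copies data for populated pages, zero-fills holes */
    if (pread(fd, buf, sizeof(buf), 0) < 0)
        return 1;
    close(fd);
    return 0;
}
#endif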
0350 
0351 static int hugetlbfs_write_begin(struct file *file,
0352             struct address_space *mapping,
0353             loff_t pos, unsigned len,
0354             struct page **pagep, void **fsdata)
0355 {
0356     return -EINVAL;
0357 }
0358 
0359 static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
0360             loff_t pos, unsigned len, unsigned copied,
0361             struct page *page, void *fsdata)
0362 {
0363     BUG();
0364     return -EINVAL;
0365 }
0366 
0367 static void remove_huge_page(struct page *page)
0368 {
0369     ClearPageDirty(page);
0370     ClearPageUptodate(page);
0371     delete_from_page_cache(page);
0372 }
0373 
0374 static void
0375 hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
0376               zap_flags_t zap_flags)
0377 {
0378     struct vm_area_struct *vma;
0379 
0380     /*
0381      * end == 0 indicates that the entire range after start should be
0382      * unmapped.  Note, end is exclusive, whereas the interval tree takes
0383      * an inclusive "last".
0384      */
0385     vma_interval_tree_foreach(vma, root, start, end ? end - 1 : ULONG_MAX) {
0386         unsigned long v_offset;
0387         unsigned long v_end;
0388 
0389         /*
0390          * Can the expression below overflow on 32-bit arches?
0391          * No, because the interval tree returns us only those vmas
0392          * which overlap the truncated area starting at pgoff,
0393          * and no vma on a 32-bit arch can span beyond the 4GB boundary.
0394          */
0395         if (vma->vm_pgoff < start)
0396             v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
0397         else
0398             v_offset = 0;
0399 
0400         if (!end)
0401             v_end = vma->vm_end;
0402         else {
0403             v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT)
0404                             + vma->vm_start;
0405             if (v_end > vma->vm_end)
0406                 v_end = vma->vm_end;
0407         }
0408 
0409         unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end,
0410                      NULL, zap_flags);
0411     }
0412 }
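
/*
 * Worked example (editorial illustration, 4KB base pages and 2MB huge
 * pages): truncating at a 4MB offset (start == 1024) while a vma with
 * vm_pgoff == 0 maps 8MB of the file gives v_offset = 1024 << PAGE_SHIFT
 * = 4MB and v_end = vm_end, so only [vm_start + 4MB, vm_end) is unmapped.
 */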
0413 
0414 /*
0415  * remove_inode_hugepages handles two distinct cases: truncation and hole
0416  * punch.  There are subtle differences in operation for each case.
0417  *
0418  * truncation is indicated by end of range being LLONG_MAX
0419  *  In this case, we first scan the range and release found pages.
0420  *  After releasing pages, hugetlb_unreserve_pages cleans up region/reserve
0421  *  maps and global counts.  Page faults cannot race with truncation
0422  *  in this routine.  hugetlb_no_page() holds i_mmap_rwsem and prevents
0423  *  page faults in the truncated range by checking i_size.  i_size is
0424  *  modified while holding i_mmap_rwsem.
0425  * hole punch is indicated if end is not LLONG_MAX
0426  *  In the hole punch case we scan the range and release found pages.
0427  *  Only when releasing a page is the associated region/reserve map
0428  *  deleted.  The region/reserve map for ranges without associated
0429  *  pages is not modified.  Page faults can race with hole punch.
0430  *  This is indicated if we find a mapped page.
0431  * Note: If the passed end of range value is beyond the end of file, but
0432  *  not LLONG_MAX, this routine still performs a hole punch operation.
0433  */
0434 static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
0435                    loff_t lend)
0436 {
0437     struct hstate *h = hstate_inode(inode);
0438     struct address_space *mapping = &inode->i_data;
0439     const pgoff_t start = lstart >> huge_page_shift(h);
0440     const pgoff_t end = lend >> huge_page_shift(h);
0441     struct folio_batch fbatch;
0442     pgoff_t next, index;
0443     int i, freed = 0;
0444     bool truncate_op = (lend == LLONG_MAX);
0445 
0446     folio_batch_init(&fbatch);
0447     next = start;
0448     while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) {
0449         for (i = 0; i < folio_batch_count(&fbatch); ++i) {
0450             struct folio *folio = fbatch.folios[i];
0451             u32 hash = 0;
0452 
0453             index = folio->index;
0454             if (!truncate_op) {
0455                 /*
0456                  * Only need to hold the fault mutex in the
0457                  * hole punch case.  This prevents races with
0458                  * page faults.  Races are not possible in the
0459                  * case of truncation.
0460                  */
0461                 hash = hugetlb_fault_mutex_hash(mapping, index);
0462                 mutex_lock(&hugetlb_fault_mutex_table[hash]);
0463             }
0464 
0465             /*
0466              * If folio is mapped, it was faulted in after being
0467              * unmapped in caller.  Unmap (again) now after taking
0468              * the fault mutex.  The mutex will prevent faults
0469              * until we finish removing the folio.
0470              *
0471              * This race can only happen in the hole punch case.
0472              * Getting here in a truncate operation is a bug.
0473              */
0474             if (unlikely(folio_mapped(folio))) {
0475                 BUG_ON(truncate_op);
0476 
0477                 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
0478                 i_mmap_lock_write(mapping);
0479                 mutex_lock(&hugetlb_fault_mutex_table[hash]);
0480                 hugetlb_vmdelete_list(&mapping->i_mmap,
0481                     index * pages_per_huge_page(h),
0482                     (index + 1) * pages_per_huge_page(h),
0483                     ZAP_FLAG_DROP_MARKER);
0484                 i_mmap_unlock_write(mapping);
0485             }
0486 
0487             folio_lock(folio);
0488             /*
0489              * We must free the huge page and remove from page
0490              * cache (remove_huge_page) BEFORE removing the
0491              * region/reserve map (hugetlb_unreserve_pages).  In
0492              * rare out of memory conditions, removal of the
0493              * region/reserve map could fail. Correspondingly,
0494              * the subpool and global reserve usage counts may need
0495              * to be adjusted.
0496              */
0497             VM_BUG_ON(HPageRestoreReserve(&folio->page));
0498             remove_huge_page(&folio->page);
0499             freed++;
0500             if (!truncate_op) {
0501                 if (unlikely(hugetlb_unreserve_pages(inode,
0502                             index, index + 1, 1)))
0503                     hugetlb_fix_reserve_counts(inode);
0504             }
0505 
0506             folio_unlock(folio);
0507             if (!truncate_op)
0508                 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
0509         }
0510         folio_batch_release(&fbatch);
0511         cond_resched();
0512     }
0513 
0514     if (truncate_op)
0515         (void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed);
0516 }
0517 
0518 static void hugetlbfs_evict_inode(struct inode *inode)
0519 {
0520     struct resv_map *resv_map;
0521 
0522     remove_inode_hugepages(inode, 0, LLONG_MAX);
0523 
0524     /*
0525      * Get the resv_map from the address space embedded in the inode.
0526      * This is the address space which points to any resv_map allocated
0527      * at inode creation time.  If this is a device special inode,
0528      * i_mapping may not point to the original address space.
0529      */
0530     resv_map = (struct resv_map *)(&inode->i_data)->private_data;
0531     /* Only regular and link inodes have associated reserve maps */
0532     if (resv_map)
0533         resv_map_release(&resv_map->refs);
0534     clear_inode(inode);
0535 }
0536 
0537 static void hugetlb_vmtruncate(struct inode *inode, loff_t offset)
0538 {
0539     pgoff_t pgoff;
0540     struct address_space *mapping = inode->i_mapping;
0541     struct hstate *h = hstate_inode(inode);
0542 
0543     BUG_ON(offset & ~huge_page_mask(h));
0544     pgoff = offset >> PAGE_SHIFT;
0545 
0546     i_mmap_lock_write(mapping);
0547     i_size_write(inode, offset);
0548     if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
0549         hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0,
0550                       ZAP_FLAG_DROP_MARKER);
0551     i_mmap_unlock_write(mapping);
0552     remove_inode_hugepages(inode, offset, LLONG_MAX);
0553 }
0554 
0555 static void hugetlbfs_zero_partial_page(struct hstate *h,
0556                     struct address_space *mapping,
0557                     loff_t start,
0558                     loff_t end)
0559 {
0560     pgoff_t idx = start >> huge_page_shift(h);
0561     struct folio *folio;
0562 
0563     folio = filemap_lock_folio(mapping, idx);
0564     if (!folio)
0565         return;
0566 
0567     start = start & ~huge_page_mask(h);
0568     end = end & ~huge_page_mask(h);
0569     if (!end)
0570         end = huge_page_size(h);
0571 
0572     folio_zero_segment(folio, (size_t)start, (size_t)end);
0573 
0574     folio_unlock(folio);
0575     folio_put(folio);
0576 }
0577 
0578 static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
0579 {
0580     struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
0581     struct address_space *mapping = inode->i_mapping;
0582     struct hstate *h = hstate_inode(inode);
0583     loff_t hpage_size = huge_page_size(h);
0584     loff_t hole_start, hole_end;
0585 
0586     /*
0587      * hole_start and hole_end indicate the full pages within the hole.
0588      */
0589     hole_start = round_up(offset, hpage_size);
0590     hole_end = round_down(offset + len, hpage_size);
0591 
0592     inode_lock(inode);
0593 
0594     /* protected by i_rwsem */
0595     if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
0596         inode_unlock(inode);
0597         return -EPERM;
0598     }
0599 
0600     i_mmap_lock_write(mapping);
0601 
0602     /* If range starts before first full page, zero partial page. */
0603     if (offset < hole_start)
0604         hugetlbfs_zero_partial_page(h, mapping,
0605                 offset, min(offset + len, hole_start));
0606 
0607     /* Unmap users of full pages in the hole. */
0608     if (hole_end > hole_start) {
0609         if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
0610             hugetlb_vmdelete_list(&mapping->i_mmap,
0611                           hole_start >> PAGE_SHIFT,
0612                           hole_end >> PAGE_SHIFT, 0);
0613     }
0614 
0615     /* If range extends beyond last full page, zero partial page. */
0616     if ((offset + len) > hole_end && (offset + len) > hole_start)
0617         hugetlbfs_zero_partial_page(h, mapping,
0618                 hole_end, offset + len);
0619 
0620     i_mmap_unlock_write(mapping);
0621 
0622     /* Remove full pages from the file. */
0623     if (hole_end > hole_start)
0624         remove_inode_hugepages(inode, hole_start, hole_end);
0625 
0626     inode_unlock(inode);
0627 
0628     return 0;
0629 }
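
/*
 * Worked example (editorial illustration, assuming 2MB huge pages): a
 * punch-hole request with offset = 1MB and len = 6MB gives
 * hole_start = 2MB and hole_end = 6MB.  The partial ranges [1MB, 2MB)
 * and [6MB, 7MB) are zeroed in place, while the full pages in [2MB, 6MB)
 * are unmapped and removed from the file.
 */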
0630 
0631 static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
0632                 loff_t len)
0633 {
0634     struct inode *inode = file_inode(file);
0635     struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
0636     struct address_space *mapping = inode->i_mapping;
0637     struct hstate *h = hstate_inode(inode);
0638     struct vm_area_struct pseudo_vma;
0639     struct mm_struct *mm = current->mm;
0640     loff_t hpage_size = huge_page_size(h);
0641     unsigned long hpage_shift = huge_page_shift(h);
0642     pgoff_t start, index, end;
0643     int error;
0644     u32 hash;
0645 
0646     if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
0647         return -EOPNOTSUPP;
0648 
0649     if (mode & FALLOC_FL_PUNCH_HOLE)
0650         return hugetlbfs_punch_hole(inode, offset, len);
0651 
0652     /*
0653      * Default preallocate case.
0654      * For this range, start is rounded down and end is rounded up
0655      * as well as being converted to page offsets.
0656      */
0657     start = offset >> hpage_shift;
0658     end = (offset + len + hpage_size - 1) >> hpage_shift;
0659 
0660     inode_lock(inode);
0661 
0662     /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
0663     error = inode_newsize_ok(inode, offset + len);
0664     if (error)
0665         goto out;
0666 
0667     if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
0668         error = -EPERM;
0669         goto out;
0670     }
0671 
0672     /*
0673      * Initialize a pseudo vma as this is required by the huge page
0674      * allocation routines.  If NUMA is configured, use page index
0675      * as input to create an allocation policy.
0676      */
0677     vma_init(&pseudo_vma, mm);
0678     pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
0679     pseudo_vma.vm_file = file;
0680 
0681     for (index = start; index < end; index++) {
0682         /*
0683          * This is supposed to be the vaddr where the page is being
0684          * faulted in, but we have no vaddr here.
0685          */
0686         struct page *page;
0687         unsigned long addr;
0688 
0689         cond_resched();
0690 
0691         /*
0692          * fallocate(2) manpage permits EINTR; we may have been
0693          * interrupted because we are using up too much memory.
0694          */
0695         if (signal_pending(current)) {
0696             error = -EINTR;
0697             break;
0698         }
0699 
0700         /* Set numa allocation policy based on index */
0701         hugetlb_set_vma_policy(&pseudo_vma, inode, index);
0702 
0703         /* addr is the offset within the file (zero based) */
0704         addr = index * hpage_size;
0705 
0706         /*
0707          * fault mutex taken here, protects against fault path
0708          * and hole punch.  inode_lock previously taken protects
0709          * against truncation.
0710          */
0711         hash = hugetlb_fault_mutex_hash(mapping, index);
0712         mutex_lock(&hugetlb_fault_mutex_table[hash]);
0713 
0714         /* See if already present in mapping to avoid alloc/free */
0715         page = find_get_page(mapping, index);
0716         if (page) {
0717             put_page(page);
0718             mutex_unlock(&hugetlb_fault_mutex_table[hash]);
0719             hugetlb_drop_vma_policy(&pseudo_vma);
0720             continue;
0721         }
0722 
0723         /*
0724          * Allocate page without setting the avoid_reserve argument.
0725          * There certainly are no reserves associated with the
0726          * pseudo_vma.  However, there could be shared mappings with
0727          * reserves for the file at the inode level.  If we fallocate
0728          * pages in these areas, we need to consume the reserves
0729          * to keep reservation accounting consistent.
0730          */
0731         page = alloc_huge_page(&pseudo_vma, addr, 0);
0732         hugetlb_drop_vma_policy(&pseudo_vma);
0733         if (IS_ERR(page)) {
0734             mutex_unlock(&hugetlb_fault_mutex_table[hash]);
0735             error = PTR_ERR(page);
0736             goto out;
0737         }
0738         clear_huge_page(page, addr, pages_per_huge_page(h));
0739         __SetPageUptodate(page);
0740         error = huge_add_to_page_cache(page, mapping, index);
0741         if (unlikely(error)) {
0742             restore_reserve_on_error(h, &pseudo_vma, addr, page);
0743             put_page(page);
0744             mutex_unlock(&hugetlb_fault_mutex_table[hash]);
0745             goto out;
0746         }
0747 
0748         mutex_unlock(&hugetlb_fault_mutex_table[hash]);
0749 
0750         SetHPageMigratable(page);
0751         /*
0752          * unlock_page() because the page was locked by huge_add_to_page_cache();
0753          * put_page() to drop the reference taken by alloc_huge_page().
0754          */
0755         unlock_page(page);
0756         put_page(page);
0757     }
0758 
0759     if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
0760         i_size_write(inode, offset + len);
0761     inode->i_ctime = current_time(inode);
0762 out:
0763     inode_unlock(inode);
0764     return error;
0765 }
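
/*
 * Illustrative sketch (editorial note, not part of the original file):
 * preallocating huge pages and punching a hole from userspace, assuming a
 * hugetlbfs file descriptor and 2MB huge pages.  Note that PUNCH_HOLE
 * must be combined with KEEP_SIZE, as fallocate(2) requires.
 */
#if 0
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>

static int prealloc_and_punch(int fd)
{
    const off_t hpage = 2UL * 1024 * 1024;

    /* preallocate four huge pages; extends i_size to 8MB */
    if (fallocate(fd, 0, 0, 4 * hpage))
        return -1;
    /* punch out the second huge page; i_size is preserved */
    return fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                     hpage, hpage);
}
#endif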
0766 
0767 static int hugetlbfs_setattr(struct user_namespace *mnt_userns,
0768                  struct dentry *dentry, struct iattr *attr)
0769 {
0770     struct inode *inode = d_inode(dentry);
0771     struct hstate *h = hstate_inode(inode);
0772     int error;
0773     unsigned int ia_valid = attr->ia_valid;
0774     struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
0775 
0776     error = setattr_prepare(&init_user_ns, dentry, attr);
0777     if (error)
0778         return error;
0779 
0780     if (ia_valid & ATTR_SIZE) {
0781         loff_t oldsize = inode->i_size;
0782         loff_t newsize = attr->ia_size;
0783 
0784         if (newsize & ~huge_page_mask(h))
0785             return -EINVAL;
0786         /* protected by i_rwsem */
0787         if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
0788             (newsize > oldsize && (info->seals & F_SEAL_GROW)))
0789             return -EPERM;
0790         hugetlb_vmtruncate(inode, newsize);
0791     }
0792 
0793     setattr_copy(&init_user_ns, inode, attr);
0794     mark_inode_dirty(inode);
0795     return 0;
0796 }
0797 
0798 static struct inode *hugetlbfs_get_root(struct super_block *sb,
0799                     struct hugetlbfs_fs_context *ctx)
0800 {
0801     struct inode *inode;
0802 
0803     inode = new_inode(sb);
0804     if (inode) {
0805         inode->i_ino = get_next_ino();
0806         inode->i_mode = S_IFDIR | ctx->mode;
0807         inode->i_uid = ctx->uid;
0808         inode->i_gid = ctx->gid;
0809         inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
0810         inode->i_op = &hugetlbfs_dir_inode_operations;
0811         inode->i_fop = &simple_dir_operations;
0812         /* directory inodes start off with i_nlink == 2 (for "." entry) */
0813         inc_nlink(inode);
0814         lockdep_annotate_inode_mutex_key(inode);
0815     }
0816     return inode;
0817 }
0818 
0819 /*
0820  * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never
0821  * be taken from reclaim -- unlike regular filesystems. This needs an
0822  * annotation because huge_pmd_share() does an allocation under hugetlb's
0823  * i_mmap_rwsem.
0824  */
0825 static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;
0826 
0827 static struct inode *hugetlbfs_get_inode(struct super_block *sb,
0828                     struct inode *dir,
0829                     umode_t mode, dev_t dev)
0830 {
0831     struct inode *inode;
0832     struct resv_map *resv_map = NULL;
0833 
0834     /*
0835      * Reserve maps are only needed for inodes that can have associated
0836      * page allocations.
0837      */
0838     if (S_ISREG(mode) || S_ISLNK(mode)) {
0839         resv_map = resv_map_alloc();
0840         if (!resv_map)
0841             return NULL;
0842     }
0843 
0844     inode = new_inode(sb);
0845     if (inode) {
0846         struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
0847 
0848         inode->i_ino = get_next_ino();
0849         inode_init_owner(&init_user_ns, inode, dir, mode);
0850         lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
0851                 &hugetlbfs_i_mmap_rwsem_key);
0852         inode->i_mapping->a_ops = &hugetlbfs_aops;
0853         inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
0854         inode->i_mapping->private_data = resv_map;
0855         info->seals = F_SEAL_SEAL;
0856         switch (mode & S_IFMT) {
0857         default:
0858             init_special_inode(inode, mode, dev);
0859             break;
0860         case S_IFREG:
0861             inode->i_op = &hugetlbfs_inode_operations;
0862             inode->i_fop = &hugetlbfs_file_operations;
0863             break;
0864         case S_IFDIR:
0865             inode->i_op = &hugetlbfs_dir_inode_operations;
0866             inode->i_fop = &simple_dir_operations;
0867 
0868             /* directory inodes start off with i_nlink == 2 (for "." entry) */
0869             inc_nlink(inode);
0870             break;
0871         case S_IFLNK:
0872             inode->i_op = &page_symlink_inode_operations;
0873             inode_nohighmem(inode);
0874             break;
0875         }
0876         lockdep_annotate_inode_mutex_key(inode);
0877     } else {
0878         if (resv_map)
0879             kref_put(&resv_map->refs, resv_map_release);
0880     }
0881 
0882     return inode;
0883 }
0884 
0885 /*
0886  * File creation. Allocate an inode, and we're done.
0887  */
0888 static int do_hugetlbfs_mknod(struct inode *dir,
0889             struct dentry *dentry,
0890             umode_t mode,
0891             dev_t dev,
0892             bool tmpfile)
0893 {
0894     struct inode *inode;
0895     int error = -ENOSPC;
0896 
0897     inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
0898     if (inode) {
0899         dir->i_ctime = dir->i_mtime = current_time(dir);
0900         if (tmpfile) {
0901             d_tmpfile(dentry, inode);
0902         } else {
0903             d_instantiate(dentry, inode);
0904             dget(dentry);/* Extra count - pin the dentry in core */
0905         }
0906         error = 0;
0907     }
0908     return error;
0909 }
0910 
0911 static int hugetlbfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
0912                struct dentry *dentry, umode_t mode, dev_t dev)
0913 {
0914     return do_hugetlbfs_mknod(dir, dentry, mode, dev, false);
0915 }
0916 
0917 static int hugetlbfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
0918                struct dentry *dentry, umode_t mode)
0919 {
0920     int retval = hugetlbfs_mknod(&init_user_ns, dir, dentry,
0921                      mode | S_IFDIR, 0);
0922     if (!retval)
0923         inc_nlink(dir);
0924     return retval;
0925 }
0926 
0927 static int hugetlbfs_create(struct user_namespace *mnt_userns,
0928                 struct inode *dir, struct dentry *dentry,
0929                 umode_t mode, bool excl)
0930 {
0931     return hugetlbfs_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0);
0932 }
0933 
0934 static int hugetlbfs_tmpfile(struct user_namespace *mnt_userns,
0935                  struct inode *dir, struct dentry *dentry,
0936                  umode_t mode)
0937 {
0938     return do_hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0, true);
0939 }
0940 
0941 static int hugetlbfs_symlink(struct user_namespace *mnt_userns,
0942                  struct inode *dir, struct dentry *dentry,
0943                  const char *symname)
0944 {
0945     struct inode *inode;
0946     int error = -ENOSPC;
0947 
0948     inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
0949     if (inode) {
0950         int l = strlen(symname)+1;
0951         error = page_symlink(inode, symname, l);
0952         if (!error) {
0953             d_instantiate(dentry, inode);
0954             dget(dentry);
0955         } else
0956             iput(inode);
0957     }
0958     dir->i_ctime = dir->i_mtime = current_time(dir);
0959 
0960     return error;
0961 }
0962 
0963 #ifdef CONFIG_MIGRATION
0964 static int hugetlbfs_migrate_folio(struct address_space *mapping,
0965                 struct folio *dst, struct folio *src,
0966                 enum migrate_mode mode)
0967 {
0968     int rc;
0969 
0970     rc = migrate_huge_page_move_mapping(mapping, dst, src);
0971     if (rc != MIGRATEPAGE_SUCCESS)
0972         return rc;
0973 
0974     if (hugetlb_page_subpool(&src->page)) {
0975         hugetlb_set_page_subpool(&dst->page,
0976                     hugetlb_page_subpool(&src->page));
0977         hugetlb_set_page_subpool(&src->page, NULL);
0978     }
0979 
0980     if (mode != MIGRATE_SYNC_NO_COPY)
0981         folio_migrate_copy(dst, src);
0982     else
0983         folio_migrate_flags(dst, src);
0984 
0985     return MIGRATEPAGE_SUCCESS;
0986 }
0987 #else
0988 #define hugetlbfs_migrate_folio NULL
0989 #endif
0990 
0991 static int hugetlbfs_error_remove_page(struct address_space *mapping,
0992                 struct page *page)
0993 {
0994     struct inode *inode = mapping->host;
0995     pgoff_t index = page->index;
0996 
0997     remove_huge_page(page);
0998     if (unlikely(hugetlb_unreserve_pages(inode, index, index + 1, 1)))
0999         hugetlb_fix_reserve_counts(inode);
1000 
1001     return 0;
1002 }
1003 
1004 /*
1005  * Display the mount options in /proc/mounts.
1006  */
1007 static int hugetlbfs_show_options(struct seq_file *m, struct dentry *root)
1008 {
1009     struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(root->d_sb);
1010     struct hugepage_subpool *spool = sbinfo->spool;
1011     unsigned long hpage_size = huge_page_size(sbinfo->hstate);
1012     unsigned hpage_shift = huge_page_shift(sbinfo->hstate);
1013     char mod;
1014 
1015     if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
1016         seq_printf(m, ",uid=%u",
1017                from_kuid_munged(&init_user_ns, sbinfo->uid));
1018     if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
1019         seq_printf(m, ",gid=%u",
1020                from_kgid_munged(&init_user_ns, sbinfo->gid));
1021     if (sbinfo->mode != 0755)
1022         seq_printf(m, ",mode=%o", sbinfo->mode);
1023     if (sbinfo->max_inodes != -1)
1024         seq_printf(m, ",nr_inodes=%lu", sbinfo->max_inodes);
1025 
1026     hpage_size /= 1024;
1027     mod = 'K';
1028     if (hpage_size >= 1024) {
1029         hpage_size /= 1024;
1030         mod = 'M';
1031     }
1032     seq_printf(m, ",pagesize=%lu%c", hpage_size, mod);
1033     if (spool) {
1034         if (spool->max_hpages != -1)
1035             seq_printf(m, ",size=%llu",
1036                    (unsigned long long)spool->max_hpages << hpage_shift);
1037         if (spool->min_hpages != -1)
1038             seq_printf(m, ",min_size=%llu",
1039                    (unsigned long long)spool->min_hpages << hpage_shift);
1040     }
1041     return 0;
1042 }
1043 
1044 static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1045 {
1046     struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
1047     struct hstate *h = hstate_inode(d_inode(dentry));
1048 
1049     buf->f_type = HUGETLBFS_MAGIC;
1050     buf->f_bsize = huge_page_size(h);
1051     if (sbinfo) {
1052         spin_lock(&sbinfo->stat_lock);
1053         /* If no limits set, just report 0 or -1 for max/free/used
1054          * blocks, like simple_statfs() */
1055         if (sbinfo->spool) {
1056             long free_pages;
1057 
1058             spin_lock_irq(&sbinfo->spool->lock);
1059             buf->f_blocks = sbinfo->spool->max_hpages;
1060             free_pages = sbinfo->spool->max_hpages
1061                 - sbinfo->spool->used_hpages;
1062             buf->f_bavail = buf->f_bfree = free_pages;
1063             spin_unlock_irq(&sbinfo->spool->lock);
1064             buf->f_files = sbinfo->max_inodes;
1065             buf->f_ffree = sbinfo->free_inodes;
1066         }
1067         spin_unlock(&sbinfo->stat_lock);
1068     }
1069     buf->f_namelen = NAME_MAX;
1070     return 0;
1071 }
1072 
1073 static void hugetlbfs_put_super(struct super_block *sb)
1074 {
1075     struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb);
1076 
1077     if (sbi) {
1078         sb->s_fs_info = NULL;
1079 
1080         if (sbi->spool)
1081             hugepage_put_subpool(sbi->spool);
1082 
1083         kfree(sbi);
1084     }
1085 }
1086 
1087 static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo)
1088 {
1089     if (sbinfo->free_inodes >= 0) {
1090         spin_lock(&sbinfo->stat_lock);
1091         if (unlikely(!sbinfo->free_inodes)) {
1092             spin_unlock(&sbinfo->stat_lock);
1093             return 0;
1094         }
1095         sbinfo->free_inodes--;
1096         spin_unlock(&sbinfo->stat_lock);
1097     }
1098 
1099     return 1;
1100 }
1101 
1102 static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo)
1103 {
1104     if (sbinfo->free_inodes >= 0) {
1105         spin_lock(&sbinfo->stat_lock);
1106         sbinfo->free_inodes++;
1107         spin_unlock(&sbinfo->stat_lock);
1108     }
1109 }
1110 
1111 
1112 static struct kmem_cache *hugetlbfs_inode_cachep;
1113 
1114 static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
1115 {
1116     struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
1117     struct hugetlbfs_inode_info *p;
1118 
1119     if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo)))
1120         return NULL;
1121     p = alloc_inode_sb(sb, hugetlbfs_inode_cachep, GFP_KERNEL);
1122     if (unlikely(!p)) {
1123         hugetlbfs_inc_free_inodes(sbinfo);
1124         return NULL;
1125     }
1126 
1127     /*
1128      * Any time after allocation, hugetlbfs_destroy_inode can be called
1129      * for the inode.  mpol_free_shared_policy is unconditionally called
1130      * as part of hugetlbfs_destroy_inode.  So, initialize policy here
1131      * in case of a quick call to destroy.
1132      *
1133      * Note that the policy is initialized even if we are creating a
1134      * private inode.  This simplifies hugetlbfs_destroy_inode.
1135      */
1136     mpol_shared_policy_init(&p->policy, NULL);
1137 
1138     return &p->vfs_inode;
1139 }
1140 
1141 static void hugetlbfs_free_inode(struct inode *inode)
1142 {
1143     kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
1144 }
1145 
1146 static void hugetlbfs_destroy_inode(struct inode *inode)
1147 {
1148     hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
1149     mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
1150 }
1151 
1152 static const struct address_space_operations hugetlbfs_aops = {
1153     .write_begin    = hugetlbfs_write_begin,
1154     .write_end  = hugetlbfs_write_end,
1155     .dirty_folio    = noop_dirty_folio,
1156     .migrate_folio  = hugetlbfs_migrate_folio,
1157     .error_remove_page  = hugetlbfs_error_remove_page,
1158 };
1159 
1160 
1161 static void init_once(void *foo)
1162 {
1163     struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;
1164 
1165     inode_init_once(&ei->vfs_inode);
1166 }
1167 
1168 const struct file_operations hugetlbfs_file_operations = {
1169     .read_iter      = hugetlbfs_read_iter,
1170     .mmap           = hugetlbfs_file_mmap,
1171     .fsync          = noop_fsync,
1172     .get_unmapped_area  = hugetlb_get_unmapped_area,
1173     .llseek         = default_llseek,
1174     .fallocate      = hugetlbfs_fallocate,
1175 };
1176 
1177 static const struct inode_operations hugetlbfs_dir_inode_operations = {
1178     .create     = hugetlbfs_create,
1179     .lookup     = simple_lookup,
1180     .link       = simple_link,
1181     .unlink     = simple_unlink,
1182     .symlink    = hugetlbfs_symlink,
1183     .mkdir      = hugetlbfs_mkdir,
1184     .rmdir      = simple_rmdir,
1185     .mknod      = hugetlbfs_mknod,
1186     .rename     = simple_rename,
1187     .setattr    = hugetlbfs_setattr,
1188     .tmpfile    = hugetlbfs_tmpfile,
1189 };
1190 
1191 static const struct inode_operations hugetlbfs_inode_operations = {
1192     .setattr    = hugetlbfs_setattr,
1193 };
1194 
1195 static const struct super_operations hugetlbfs_ops = {
1196     .alloc_inode    = hugetlbfs_alloc_inode,
1197     .free_inode     = hugetlbfs_free_inode,
1198     .destroy_inode  = hugetlbfs_destroy_inode,
1199     .evict_inode    = hugetlbfs_evict_inode,
1200     .statfs     = hugetlbfs_statfs,
1201     .put_super  = hugetlbfs_put_super,
1202     .show_options   = hugetlbfs_show_options,
1203 };
1204 
1205 /*
1206  * Convert size option passed from command line to number of huge pages
1207  * in the pool specified by hstate.  Size option could be in bytes
1208  * (val_type == SIZE_STD) or percentage of the pool (val_type == SIZE_PERCENT).
1209  */
1210 static long
1211 hugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt,
1212              enum hugetlbfs_size_type val_type)
1213 {
1214     if (val_type == NO_SIZE)
1215         return -1;
1216 
1217     if (val_type == SIZE_PERCENT) {
1218         size_opt <<= huge_page_shift(h);
1219         size_opt *= h->max_huge_pages;
1220         do_div(size_opt, 100);
1221     }
1222 
1223     size_opt >>= huge_page_shift(h);
1224     return size_opt;
1225 }
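
/*
 * Worked example (editorial illustration): with 2MB huge pages and
 * max_huge_pages = 1024, "size=50%" arrives here as size_opt = 50 with
 * val_type == SIZE_PERCENT, becomes (50 << 21) * 1024 / 100 bytes, and
 * finally 512 huge pages.  "size=1G" arrives as size_opt = 1073741824
 * with SIZE_STD and likewise becomes 512 huge pages.
 */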
1226 
1227 /*
1228  * Parse one mount parameter.
1229  */
1230 static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
1231 {
1232     struct hugetlbfs_fs_context *ctx = fc->fs_private;
1233     struct fs_parse_result result;
1234     char *rest;
1235     unsigned long ps;
1236     int opt;
1237 
1238     opt = fs_parse(fc, hugetlb_fs_parameters, param, &result);
1239     if (opt < 0)
1240         return opt;
1241 
1242     switch (opt) {
1243     case Opt_uid:
1244         ctx->uid = make_kuid(current_user_ns(), result.uint_32);
1245         if (!uid_valid(ctx->uid))
1246             goto bad_val;
1247         return 0;
1248 
1249     case Opt_gid:
1250         ctx->gid = make_kgid(current_user_ns(), result.uint_32);
1251         if (!gid_valid(ctx->gid))
1252             goto bad_val;
1253         return 0;
1254 
1255     case Opt_mode:
1256         ctx->mode = result.uint_32 & 01777U;
1257         return 0;
1258 
1259     case Opt_size:
1260         /* memparse() will accept a K/M/G without a digit */
1261         if (!isdigit(param->string[0]))
1262             goto bad_val;
1263         ctx->max_size_opt = memparse(param->string, &rest);
1264         ctx->max_val_type = SIZE_STD;
1265         if (*rest == '%')
1266             ctx->max_val_type = SIZE_PERCENT;
1267         return 0;
1268 
1269     case Opt_nr_inodes:
1270         /* memparse() will accept a K/M/G without a digit */
1271         if (!isdigit(param->string[0]))
1272             goto bad_val;
1273         ctx->nr_inodes = memparse(param->string, &rest);
1274         return 0;
1275 
1276     case Opt_pagesize:
1277         ps = memparse(param->string, &rest);
1278         ctx->hstate = size_to_hstate(ps);
1279         if (!ctx->hstate) {
1280             pr_err("Unsupported page size %lu MB\n", ps / SZ_1M);
1281             return -EINVAL;
1282         }
1283         return 0;
1284 
1285     case Opt_min_size:
1286         /* memparse() will accept a K/M/G without a digit */
1287         if (!isdigit(param->string[0]))
1288             goto bad_val;
1289         ctx->min_size_opt = memparse(param->string, &rest);
1290         ctx->min_val_type = SIZE_STD;
1291         if (*rest == '%')
1292             ctx->min_val_type = SIZE_PERCENT;
1293         return 0;
1294 
1295     default:
1296         return -EINVAL;
1297     }
1298 
1299 bad_val:
1300     return invalfc(fc, "Bad value '%s' for mount option '%s'\n",
1301               param->string, param->key);
1302 }
1303 
1304 /*
1305  * Validate the parsed options.
1306  */
1307 static int hugetlbfs_validate(struct fs_context *fc)
1308 {
1309     struct hugetlbfs_fs_context *ctx = fc->fs_private;
1310 
1311     /*
1312      * Use huge page pool size (in hstate) to convert the size
1313      * options to number of huge pages.  If NO_SIZE, -1 is returned.
1314      */
1315     ctx->max_hpages = hugetlbfs_size_to_hpages(ctx->hstate,
1316                            ctx->max_size_opt,
1317                            ctx->max_val_type);
1318     ctx->min_hpages = hugetlbfs_size_to_hpages(ctx->hstate,
1319                            ctx->min_size_opt,
1320                            ctx->min_val_type);
1321 
1322     /*
1323      * If max_size was specified, then min_size must not be larger
1324      */
1325     if (ctx->max_val_type > NO_SIZE &&
1326         ctx->min_hpages > ctx->max_hpages) {
1327         pr_err("Minimum size can not be greater than maximum size\n");
1328         return -EINVAL;
1329     }
1330 
1331     return 0;
1332 }
1333 
1334 static int
1335 hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc)
1336 {
1337     struct hugetlbfs_fs_context *ctx = fc->fs_private;
1338     struct hugetlbfs_sb_info *sbinfo;
1339 
1340     sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL);
1341     if (!sbinfo)
1342         return -ENOMEM;
1343     sb->s_fs_info = sbinfo;
1344     spin_lock_init(&sbinfo->stat_lock);
1345     sbinfo->hstate      = ctx->hstate;
1346     sbinfo->max_inodes  = ctx->nr_inodes;
1347     sbinfo->free_inodes = ctx->nr_inodes;
1348     sbinfo->spool       = NULL;
1349     sbinfo->uid     = ctx->uid;
1350     sbinfo->gid     = ctx->gid;
1351     sbinfo->mode        = ctx->mode;
1352 
1353     /*
1354      * Allocate and initialize subpool if maximum or minimum size is
1355      * specified.  Any needed reservations (for minimum size) are taken
1356      * when the subpool is created.
1357      */
1358     if (ctx->max_hpages != -1 || ctx->min_hpages != -1) {
1359         sbinfo->spool = hugepage_new_subpool(ctx->hstate,
1360                              ctx->max_hpages,
1361                              ctx->min_hpages);
1362         if (!sbinfo->spool)
1363             goto out_free;
1364     }
1365     sb->s_maxbytes = MAX_LFS_FILESIZE;
1366     sb->s_blocksize = huge_page_size(ctx->hstate);
1367     sb->s_blocksize_bits = huge_page_shift(ctx->hstate);
1368     sb->s_magic = HUGETLBFS_MAGIC;
1369     sb->s_op = &hugetlbfs_ops;
1370     sb->s_time_gran = 1;
1371 
1372     /*
1373      * Due to the special and limited functionality of hugetlbfs, it does
1374      * not work well as a stacking filesystem.
1375      */
1376     sb->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
1377     sb->s_root = d_make_root(hugetlbfs_get_root(sb, ctx));
1378     if (!sb->s_root)
1379         goto out_free;
1380     return 0;
1381 out_free:
1382     kfree(sbinfo->spool);
1383     kfree(sbinfo);
1384     return -ENOMEM;
1385 }
1386 
1387 static int hugetlbfs_get_tree(struct fs_context *fc)
1388 {
1389     int err = hugetlbfs_validate(fc);
1390     if (err)
1391         return err;
1392     return get_tree_nodev(fc, hugetlbfs_fill_super);
1393 }
1394 
1395 static void hugetlbfs_fs_context_free(struct fs_context *fc)
1396 {
1397     kfree(fc->fs_private);
1398 }
1399 
1400 static const struct fs_context_operations hugetlbfs_fs_context_ops = {
1401     .free       = hugetlbfs_fs_context_free,
1402     .parse_param    = hugetlbfs_parse_param,
1403     .get_tree   = hugetlbfs_get_tree,
1404 };
1405 
1406 static int hugetlbfs_init_fs_context(struct fs_context *fc)
1407 {
1408     struct hugetlbfs_fs_context *ctx;
1409 
1410     ctx = kzalloc(sizeof(struct hugetlbfs_fs_context), GFP_KERNEL);
1411     if (!ctx)
1412         return -ENOMEM;
1413 
1414     ctx->max_hpages = -1; /* No limit on size by default */
1415     ctx->nr_inodes  = -1; /* No limit on number of inodes by default */
1416     ctx->uid    = current_fsuid();
1417     ctx->gid    = current_fsgid();
1418     ctx->mode   = 0755;
1419     ctx->hstate = &default_hstate;
1420     ctx->min_hpages = -1; /* No default minimum size */
1421     ctx->max_val_type = NO_SIZE;
1422     ctx->min_val_type = NO_SIZE;
1423     fc->fs_private = ctx;
1424     fc->ops = &hugetlbfs_fs_context_ops;
1425     return 0;
1426 }
1427 
1428 static struct file_system_type hugetlbfs_fs_type = {
1429     .name           = "hugetlbfs",
1430     .init_fs_context    = hugetlbfs_init_fs_context,
1431     .parameters     = hugetlb_fs_parameters,
1432     .kill_sb        = kill_litter_super,
1433 };
1434 
1435 static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
1436 
1437 static int can_do_hugetlb_shm(void)
1438 {
1439     kgid_t shm_group;
1440     shm_group = make_kgid(&init_user_ns, sysctl_hugetlb_shm_group);
1441     return capable(CAP_IPC_LOCK) || in_group_p(shm_group);
1442 }
1443 
1444 static int get_hstate_idx(int page_size_log)
1445 {
1446     struct hstate *h = hstate_sizelog(page_size_log);
1447 
1448     if (!h)
1449         return -1;
1450     return hstate_index(h);
1451 }
1452 
1453 /*
1454  * Note that size should be aligned to the proper hugepage size by the caller;
1455  * otherwise hugetlb_reserve_pages reserves one fewer hugepage than intended.
1456  */
1457 struct file *hugetlb_file_setup(const char *name, size_t size,
1458                 vm_flags_t acctflag, int creat_flags,
1459                 int page_size_log)
1460 {
1461     struct inode *inode;
1462     struct vfsmount *mnt;
1463     int hstate_idx;
1464     struct file *file;
1465 
1466     hstate_idx = get_hstate_idx(page_size_log);
1467     if (hstate_idx < 0)
1468         return ERR_PTR(-ENODEV);
1469 
1470     mnt = hugetlbfs_vfsmount[hstate_idx];
1471     if (!mnt)
1472         return ERR_PTR(-ENOENT);
1473 
1474     if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
1475         struct ucounts *ucounts = current_ucounts();
1476 
1477         if (user_shm_lock(size, ucounts)) {
1478             pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is obsolete\n",
1479                 current->comm, current->pid);
1480             user_shm_unlock(size, ucounts);
1481         }
1482         return ERR_PTR(-EPERM);
1483     }
1484 
1485     file = ERR_PTR(-ENOSPC);
1486     inode = hugetlbfs_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0);
1487     if (!inode)
1488         goto out;
1489     if (creat_flags == HUGETLB_SHMFS_INODE)
1490         inode->i_flags |= S_PRIVATE;
1491 
1492     inode->i_size = size;
1493     clear_nlink(inode);
1494 
1495     if (!hugetlb_reserve_pages(inode, 0,
1496             size >> huge_page_shift(hstate_inode(inode)), NULL,
1497             acctflag))
1498         file = ERR_PTR(-ENOMEM);
1499     else
1500         file = alloc_file_pseudo(inode, mnt, name, O_RDWR,
1501                     &hugetlbfs_file_operations);
1502     if (!IS_ERR(file))
1503         return file;
1504 
1505     iput(inode);
1506 out:
1507     return file;
1508 }
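
/*
 * Illustrative sketch (editorial note, not part of the original file):
 * one userspace path that ends up in hugetlb_file_setup() is an anonymous
 * MAP_HUGETLB mapping; SHM_HUGETLB segments take a similar route.
 * Assumes enough huge pages are available in the pool.
 */
#if 0
#define _GNU_SOURCE
#include <sys/mman.h>
#include <string.h>

#define LEN (2UL * 1024 * 1024) /* one 2MB huge page */

int main(void)
{
    void *p = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

    if (p == MAP_FAILED)
        return 1;
    memset(p, 0, LEN);
    munmap(p, LEN);
    return 0;
}
#endif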
1509 
1510 static struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h)
1511 {
1512     struct fs_context *fc;
1513     struct vfsmount *mnt;
1514 
1515     fc = fs_context_for_mount(&hugetlbfs_fs_type, SB_KERNMOUNT);
1516     if (IS_ERR(fc)) {
1517         mnt = ERR_CAST(fc);
1518     } else {
1519         struct hugetlbfs_fs_context *ctx = fc->fs_private;
1520         ctx->hstate = h;
1521         mnt = fc_mount(fc);
1522         put_fs_context(fc);
1523     }
1524     if (IS_ERR(mnt))
1525         pr_err("Cannot mount internal hugetlbfs for page size %luK",
1526                huge_page_size(h) / SZ_1K);
1527     return mnt;
1528 }
1529 
1530 static int __init init_hugetlbfs_fs(void)
1531 {
1532     struct vfsmount *mnt;
1533     struct hstate *h;
1534     int error;
1535     int i;
1536 
1537     if (!hugepages_supported()) {
1538         pr_info("disabling because there are no supported hugepage sizes\n");
1539         return -ENOTSUPP;
1540     }
1541 
1542     error = -ENOMEM;
1543     hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
1544                     sizeof(struct hugetlbfs_inode_info),
1545                     0, SLAB_ACCOUNT, init_once);
1546     if (hugetlbfs_inode_cachep == NULL)
1547         goto out;
1548 
1549     error = register_filesystem(&hugetlbfs_fs_type);
1550     if (error)
1551         goto out_free;
1552 
1553     /* default hstate mount is required */
1554     mnt = mount_one_hugetlbfs(&default_hstate);
1555     if (IS_ERR(mnt)) {
1556         error = PTR_ERR(mnt);
1557         goto out_unreg;
1558     }
1559     hugetlbfs_vfsmount[default_hstate_idx] = mnt;
1560 
1561     /* other hstates are optional */
1562     i = 0;
1563     for_each_hstate(h) {
1564         if (i == default_hstate_idx) {
1565             i++;
1566             continue;
1567         }
1568 
1569         mnt = mount_one_hugetlbfs(h);
1570         if (IS_ERR(mnt))
1571             hugetlbfs_vfsmount[i] = NULL;
1572         else
1573             hugetlbfs_vfsmount[i] = mnt;
1574         i++;
1575     }
1576 
1577     return 0;
1578 
1579  out_unreg:
1580     (void)unregister_filesystem(&hugetlbfs_fs_type);
1581  out_free:
1582     kmem_cache_destroy(hugetlbfs_inode_cachep);
1583  out:
1584     return error;
1585 }
1586 fs_initcall(init_hugetlbfs_fs)