Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  * Ram backed block device driver.
0004  *
0005  * Copyright (C) 2007 Nick Piggin
0006  * Copyright (C) 2007 Novell Inc.
0007  *
0008  * Parts derived from drivers/block/rd.c, and drivers/block/loop.c, copyright
0009  * of their respective owners.
0010  */
0011 
0012 #include <linux/init.h>
0013 #include <linux/initrd.h>
0014 #include <linux/module.h>
0015 #include <linux/moduleparam.h>
0016 #include <linux/major.h>
0017 #include <linux/blkdev.h>
0018 #include <linux/bio.h>
0019 #include <linux/highmem.h>
0020 #include <linux/mutex.h>
0021 #include <linux/pagemap.h>
0022 #include <linux/radix-tree.h>
0023 #include <linux/fs.h>
0024 #include <linux/slab.h>
0025 #include <linux/backing-dev.h>
0026 #include <linux/debugfs.h>
0027 
0028 #include <linux/uaccess.h>
0029 
/*
 * Each block ramdisk device has a radix_tree brd_pages of pages that stores
 * the pages containing the block device's contents. A brd page's ->index is
 * its offset in PAGE_SIZE units. This is similar to, but in no way connected
 * with, the kernel's pagecache or buffer cache (which sit above our block
 * device).
 */
struct brd_device {
	int			brd_number;	/* device index; disk is named "ram<N>" */
	struct gendisk		*brd_disk;	/* gendisk registered for this device */
	struct list_head	brd_list;	/* linkage on the global brd_devices list */

	/*
	 * Backing store of pages and lock to protect it. This is the contents
	 * of the block device.
	 */
	spinlock_t		brd_lock;	/* serializes inserts into brd_pages */
	struct radix_tree_root	brd_pages;	/* page index -> struct page * */
	u64			brd_nr_pages;	/* pages allocated; exposed via debugfs */
};
0050 
0051 /*
0052  * Look up and return a brd's page for a given sector.
0053  */
0054 static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
0055 {
0056     pgoff_t idx;
0057     struct page *page;
0058 
0059     /*
0060      * The page lifetime is protected by the fact that we have opened the
0061      * device node -- brd pages will never be deleted under us, so we
0062      * don't need any further locking or refcounting.
0063      *
0064      * This is strictly true for the radix-tree nodes as well (ie. we
0065      * don't actually need the rcu_read_lock()), however that is not a
0066      * documented feature of the radix-tree API so it is better to be
0067      * safe here (we don't have total exclusion from radix tree updates
0068      * here, only deletes).
0069      */
0070     rcu_read_lock();
0071     idx = sector >> PAGE_SECTORS_SHIFT; /* sector to page index */
0072     page = radix_tree_lookup(&brd->brd_pages, idx);
0073     rcu_read_unlock();
0074 
0075     BUG_ON(page && page->index != idx);
0076 
0077     return page;
0078 }
0079 
/*
 * Look up and return a brd's page for a given sector.
 * If one does not exist, allocate an empty page, and insert that. Then
 * return it.
 *
 * Returns NULL only when page or radix-tree-node allocation fails.
 * May sleep (GFP_NOIO allocations).
 */
static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
{
	pgoff_t idx;
	struct page *page;
	gfp_t gfp_flags;

	/* Fast path: the backing page may already exist. */
	page = brd_lookup_page(brd, sector);
	if (page)
		return page;

	/*
	 * Must use NOIO because we don't want to recurse back into the
	 * block or filesystem layers from page reclaim.
	 */
	gfp_flags = GFP_NOIO | __GFP_ZERO | __GFP_HIGHMEM;
	page = alloc_page(gfp_flags);
	if (!page)
		return NULL;

	/* Preload radix-tree nodes so the insert under the spinlock cannot sleep. */
	if (radix_tree_preload(GFP_NOIO)) {
		__free_page(page);
		return NULL;
	}

	spin_lock(&brd->brd_lock);
	idx = sector >> PAGE_SECTORS_SHIFT;
	page->index = idx;
	if (radix_tree_insert(&brd->brd_pages, idx, page)) {
		/*
		 * Insert failed: a concurrent writer beat us to this index.
		 * Drop our page and use the one already in the tree.
		 */
		__free_page(page);
		page = radix_tree_lookup(&brd->brd_pages, idx);
		BUG_ON(!page);
		BUG_ON(page->index != idx);
	} else {
		brd->brd_nr_pages++;
	}
	spin_unlock(&brd->brd_lock);

	radix_tree_preload_end();

	return page;
}
0126 
/*
 * Free all backing store pages and radix tree. This must only be called when
 * there are no other users of the device.
 */
#define FREE_BATCH 16
static void brd_free_pages(struct brd_device *brd)
{
	unsigned long pos = 0;	/* next radix-tree index to scan from */
	struct page *pages[FREE_BATCH];
	int nr_pages;

	do {
		int i;

		/* Collect up to FREE_BATCH pages at indices >= pos. */
		nr_pages = radix_tree_gang_lookup(&brd->brd_pages,
				(void **)pages, pos, FREE_BATCH);

		for (i = 0; i < nr_pages; i++) {
			void *ret;

			/* Gang lookup returns pages in ascending index order. */
			BUG_ON(pages[i]->index < pos);
			pos = pages[i]->index;
			ret = radix_tree_delete(&brd->brd_pages, pos);
			BUG_ON(!ret || ret != pages[i]);
			__free_page(pages[i]);
		}

		/* Resume the scan just past the last index freed. */
		pos++;

		/*
		 * It takes 3.4 seconds to remove 80GiB ramdisk.
		 * So, we need cond_resched to avoid stalling the CPU.
		 */
		cond_resched();

		/*
		 * This assumes radix_tree_gang_lookup always returns as
		 * many pages as possible. If the radix-tree code changes,
		 * so will this have to.
		 */
	} while (nr_pages == FREE_BATCH);
}
0169 
0170 /*
0171  * copy_to_brd_setup must be called before copy_to_brd. It may sleep.
0172  */
0173 static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n)
0174 {
0175     unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
0176     size_t copy;
0177 
0178     copy = min_t(size_t, n, PAGE_SIZE - offset);
0179     if (!brd_insert_page(brd, sector))
0180         return -ENOSPC;
0181     if (copy < n) {
0182         sector += copy >> SECTOR_SHIFT;
0183         if (!brd_insert_page(brd, sector))
0184             return -ENOSPC;
0185     }
0186     return 0;
0187 }
0188 
0189 /*
0190  * Copy n bytes from src to the brd starting at sector. Does not sleep.
0191  */
0192 static void copy_to_brd(struct brd_device *brd, const void *src,
0193             sector_t sector, size_t n)
0194 {
0195     struct page *page;
0196     void *dst;
0197     unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
0198     size_t copy;
0199 
0200     copy = min_t(size_t, n, PAGE_SIZE - offset);
0201     page = brd_lookup_page(brd, sector);
0202     BUG_ON(!page);
0203 
0204     dst = kmap_atomic(page);
0205     memcpy(dst + offset, src, copy);
0206     kunmap_atomic(dst);
0207 
0208     if (copy < n) {
0209         src += copy;
0210         sector += copy >> SECTOR_SHIFT;
0211         copy = n - copy;
0212         page = brd_lookup_page(brd, sector);
0213         BUG_ON(!page);
0214 
0215         dst = kmap_atomic(page);
0216         memcpy(dst, src, copy);
0217         kunmap_atomic(dst);
0218     }
0219 }
0220 
0221 /*
0222  * Copy n bytes to dst from the brd starting at sector. Does not sleep.
0223  */
0224 static void copy_from_brd(void *dst, struct brd_device *brd,
0225             sector_t sector, size_t n)
0226 {
0227     struct page *page;
0228     void *src;
0229     unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
0230     size_t copy;
0231 
0232     copy = min_t(size_t, n, PAGE_SIZE - offset);
0233     page = brd_lookup_page(brd, sector);
0234     if (page) {
0235         src = kmap_atomic(page);
0236         memcpy(dst, src + offset, copy);
0237         kunmap_atomic(src);
0238     } else
0239         memset(dst, 0, copy);
0240 
0241     if (copy < n) {
0242         dst += copy;
0243         sector += copy >> SECTOR_SHIFT;
0244         copy = n - copy;
0245         page = brd_lookup_page(brd, sector);
0246         if (page) {
0247             src = kmap_atomic(page);
0248             memcpy(dst, src, copy);
0249             kunmap_atomic(src);
0250         } else
0251             memset(dst, 0, copy);
0252     }
0253 }
0254 
/*
 * Process a single bvec of a bio: copy 'len' bytes at offset 'off' within
 * 'page' to/from the brd backing store at 'sector'.
 *
 * Returns 0 on success, or -ENOSPC if backing pages could not be allocated
 * for a write (from copy_to_brd_setup()).
 */
static int brd_do_bvec(struct brd_device *brd, struct page *page,
			unsigned int len, unsigned int off, enum req_op op,
			sector_t sector)
{
	void *mem;
	int err = 0;

	if (op_is_write(op)) {
		/*
		 * Allocate backing pages up front (may sleep) so the atomic
		 * copy below cannot fail.
		 */
		err = copy_to_brd_setup(brd, sector, len);
		if (err)
			goto out;
	}

	mem = kmap_atomic(page);
	if (!op_is_write(op)) {
		copy_from_brd(mem + off, brd, sector, len);
		/* Make the freshly-read data visible to userspace mappings. */
		flush_dcache_page(page);
	} else {
		/* Flush before reading the caller's data through the kmap. */
		flush_dcache_page(page);
		copy_to_brd(brd, mem + off, sector, len);
	}
	kunmap_atomic(mem);

out:
	return err;
}
0284 
0285 static void brd_submit_bio(struct bio *bio)
0286 {
0287     struct brd_device *brd = bio->bi_bdev->bd_disk->private_data;
0288     sector_t sector = bio->bi_iter.bi_sector;
0289     struct bio_vec bvec;
0290     struct bvec_iter iter;
0291 
0292     bio_for_each_segment(bvec, bio, iter) {
0293         unsigned int len = bvec.bv_len;
0294         int err;
0295 
0296         /* Don't support un-aligned buffer */
0297         WARN_ON_ONCE((bvec.bv_offset & (SECTOR_SIZE - 1)) ||
0298                 (len & (SECTOR_SIZE - 1)));
0299 
0300         err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset,
0301                   bio_op(bio), sector);
0302         if (err) {
0303             bio_io_error(bio);
0304             return;
0305         }
0306         sector += len >> SECTOR_SHIFT;
0307     }
0308 
0309     bio_endio(bio);
0310 }
0311 
0312 static int brd_rw_page(struct block_device *bdev, sector_t sector,
0313                struct page *page, enum req_op op)
0314 {
0315     struct brd_device *brd = bdev->bd_disk->private_data;
0316     int err;
0317 
0318     if (PageTransHuge(page))
0319         return -ENOTSUPP;
0320     err = brd_do_bvec(brd, page, PAGE_SIZE, 0, op, sector);
0321     page_endio(page, op_is_write(op), err);
0322     return err;
0323 }
0324 
/* Block device operations: bio submission plus the page-based rw_page hook. */
static const struct block_device_operations brd_fops = {
	.owner =		THIS_MODULE,
	.submit_bio =		brd_submit_bio,
	.rw_page =		brd_rw_page,
};
0330 
/*
 * And now the modules code and kernel interface.
 */

/* Number of ram disks to create up front at module load. */
static int rd_nr = CONFIG_BLK_DEV_RAM_COUNT;
module_param(rd_nr, int, 0444);
MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices");

/* NOTE(review): not static — presumably referenced outside this file; confirm before changing linkage. */
unsigned long rd_size = CONFIG_BLK_DEV_RAM_SIZE;
module_param(rd_size, ulong, 0444);
MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");

/* Minor-number stride between devices; sanitized by brd_check_and_reset_par(). */
static int max_part = 1;
module_param(max_part, int, 0444);
MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices");

MODULE_LICENSE("GPL");
MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
MODULE_ALIAS("rd");
0349 
#ifndef MODULE
/* Legacy boot options - nonmodular */
static int __init ramdisk_size(char *str)
{
	/* "ramdisk_size=<kbytes>" on the kernel command line overrides rd_size. */
	rd_size = simple_strtol(str, NULL, 0);
	return 1;	/* option consumed */
}
__setup("ramdisk_size=", ramdisk_size);
#endif
0359 
/*
 * The device scheme is derived from loop.c. Keep them in synch where possible
 * (should share code eventually).
 */
static LIST_HEAD(brd_devices);		/* all allocated brd_device instances */
static struct dentry *brd_debugfs_dir;	/* "ramdisk_pages" dir; one u64 file per disk */
0366 
0367 static int brd_alloc(int i)
0368 {
0369     struct brd_device *brd;
0370     struct gendisk *disk;
0371     char buf[DISK_NAME_LEN];
0372     int err = -ENOMEM;
0373 
0374     list_for_each_entry(brd, &brd_devices, brd_list)
0375         if (brd->brd_number == i)
0376             return -EEXIST;
0377     brd = kzalloc(sizeof(*brd), GFP_KERNEL);
0378     if (!brd)
0379         return -ENOMEM;
0380     brd->brd_number     = i;
0381     list_add_tail(&brd->brd_list, &brd_devices);
0382 
0383     spin_lock_init(&brd->brd_lock);
0384     INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC);
0385 
0386     snprintf(buf, DISK_NAME_LEN, "ram%d", i);
0387     if (!IS_ERR_OR_NULL(brd_debugfs_dir))
0388         debugfs_create_u64(buf, 0444, brd_debugfs_dir,
0389                 &brd->brd_nr_pages);
0390 
0391     disk = brd->brd_disk = blk_alloc_disk(NUMA_NO_NODE);
0392     if (!disk)
0393         goto out_free_dev;
0394 
0395     disk->major     = RAMDISK_MAJOR;
0396     disk->first_minor   = i * max_part;
0397     disk->minors        = max_part;
0398     disk->fops      = &brd_fops;
0399     disk->private_data  = brd;
0400     strlcpy(disk->disk_name, buf, DISK_NAME_LEN);
0401     set_capacity(disk, rd_size * 2);
0402     
0403     /*
0404      * This is so fdisk will align partitions on 4k, because of
0405      * direct_access API needing 4k alignment, returning a PFN
0406      * (This is only a problem on very small devices <= 4M,
0407      *  otherwise fdisk will align on 1M. Regardless this call
0408      *  is harmless)
0409      */
0410     blk_queue_physical_block_size(disk->queue, PAGE_SIZE);
0411 
0412     /* Tell the block layer that this is not a rotational device */
0413     blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
0414     blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
0415     err = add_disk(disk);
0416     if (err)
0417         goto out_cleanup_disk;
0418 
0419     return 0;
0420 
0421 out_cleanup_disk:
0422     put_disk(disk);
0423 out_free_dev:
0424     list_del(&brd->brd_list);
0425     kfree(brd);
0426     return err;
0427 }
0428 
/*
 * Block-layer probe callback: instantiate the brd device backing this
 * dev_t on first access. brd_alloc()'s return value is intentionally
 * dropped — probe has no way to report errors, and if no disk was
 * created the open simply finds nothing.
 */
static void brd_probe(dev_t dev)
{
	brd_alloc(MINOR(dev) / max_part);
}
0433 
/*
 * Tear down every brd device: unregister the disks, free the backing
 * pages, and release the brd_device structures. Also removes the debugfs
 * directory. Called on module unload and on failed module init.
 */
static void brd_cleanup(void)
{
	struct brd_device *brd, *next;

	debugfs_remove_recursive(brd_debugfs_dir);

	list_for_each_entry_safe(brd, next, &brd_devices, brd_list) {
		del_gendisk(brd->brd_disk);
		put_disk(brd->brd_disk);
		brd_free_pages(brd);
		list_del(&brd->brd_list);
		kfree(brd);
	}
}
0448 
/*
 * Validate and, if necessary, adjust the max_part module parameter before
 * it is used to derive minor numbers.
 */
static inline void brd_check_and_reset_par(void)
{
	if (unlikely(!max_part))
		max_part = 1;

	/*
	 * make sure 'max_part' divides (1U << MINORBITS) exactly,
	 * otherwise, it is possible to get same dev_t when adding partitions.
	 */
	if ((1U << MINORBITS) % max_part != 0)
		max_part = 1UL << fls(max_part);	/* round up to a power of two */

	if (max_part > DISK_MAX_PARTS) {
		pr_info("brd: max_part can't be larger than %d, reset max_part = %d.\n",
			DISK_MAX_PARTS, DISK_MAX_PARTS);
		max_part = DISK_MAX_PARTS;
	}
}
0467 
/*
 * Module init: create rd_nr ram disks up front and register the RAMDISK
 * major so further devices can be instantiated on demand via brd_probe().
 */
static int __init brd_init(void)
{
	int err, i;

	/* Sanitize max_part before any minor numbers are derived from it. */
	brd_check_and_reset_par();

	/* Parent directory for the per-device page counters. */
	brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL);

	for (i = 0; i < rd_nr; i++) {
		err = brd_alloc(i);
		if (err)
			goto out_free;
	}

	/*
	 * brd module now has a feature to instantiate underlying device
	 * structure on-demand, provided that there is an access dev node.
	 *
	 * (1) if rd_nr is specified, create that many upfront. else
	 *     it defaults to CONFIG_BLK_DEV_RAM_COUNT
	 * (2) User can further extend brd devices by create dev node themselves
	 *     and have kernel automatically instantiate actual device
	 *     on-demand. Example:
	 *	mknod /path/devnod_name b 1 X	# 1 is the rd major
	 *	fdisk -l /path/devnod_name
	 *  If (X / max_part) was not already created it will be created
	 *  dynamically.
	 */

	if (__register_blkdev(RAMDISK_MAJOR, "ramdisk", brd_probe)) {
		err = -EIO;
		goto out_free;
	}

	pr_info("brd: module loaded\n");
	return 0;

out_free:
	/* Unwind any disks created before the failure. */
	brd_cleanup();

	pr_info("brd: module NOT loaded !!!\n");
	return err;
}
0511 
0512 static void __exit brd_exit(void)
0513 {
0514 
0515     unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
0516     brd_cleanup();
0517 
0518     pr_info("brd: module unloaded\n");
0519 }
0520 
0521 module_init(brd_init);
0522 module_exit(brd_exit);
0523