// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/swapfile.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 */

0009 #include <linux/blkdev.h>
0010 #include <linux/mm.h>
0011 #include <linux/sched/mm.h>
0012 #include <linux/sched/task.h>
0013 #include <linux/hugetlb.h>
0014 #include <linux/mman.h>
0015 #include <linux/slab.h>
0016 #include <linux/kernel_stat.h>
0017 #include <linux/swap.h>
0018 #include <linux/vmalloc.h>
0019 #include <linux/pagemap.h>
0020 #include <linux/namei.h>
0021 #include <linux/shmem_fs.h>
0022 #include <linux/blk-cgroup.h>
0023 #include <linux/random.h>
0024 #include <linux/writeback.h>
0025 #include <linux/proc_fs.h>
0026 #include <linux/seq_file.h>
0027 #include <linux/init.h>
0028 #include <linux/ksm.h>
0029 #include <linux/rmap.h>
0030 #include <linux/security.h>
0031 #include <linux/backing-dev.h>
0032 #include <linux/mutex.h>
0033 #include <linux/capability.h>
0034 #include <linux/syscalls.h>
0035 #include <linux/memcontrol.h>
0036 #include <linux/poll.h>
0037 #include <linux/oom.h>
0038 #include <linux/frontswap.h>
0039 #include <linux/swapfile.h>
0040 #include <linux/export.h>
0041 #include <linux/swap_slots.h>
0042 #include <linux/sort.h>
0043 #include <linux/completion.h>
0044
0045 #include <asm/tlbflush.h>
0046 #include <linux/swapops.h>
0047 #include <linux/swap_cgroup.h>
0048 #include "swap.h"
0049
0050 static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
0051 unsigned char);
0052 static void free_swap_count_continuations(struct swap_info_struct *);
0053
0054 static DEFINE_SPINLOCK(swap_lock);
0055 static unsigned int nr_swapfiles;
0056 atomic_long_t nr_swap_pages;
/*
 * Some modules use swappable objects and may try to swap them out under
 * memory pressure (via the shrinker). Before doing so, they may wish to
 * check to see if any swap space is available.
 */
0062 EXPORT_SYMBOL_GPL(nr_swap_pages);
/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
0064 long total_swap_pages;
0065 static int least_priority = -1;
0066
0067 static const char Bad_file[] = "Bad swap file entry ";
0068 static const char Unused_file[] = "Unused swap file entry ";
0069 static const char Bad_offset[] = "Bad swap offset entry ";
0070 static const char Unused_offset[] = "Unused swap offset entry ";
0071
/*
 * all active swap_info_structs
 * protected with swap_lock, and ordered by priority.
 */
0076 static PLIST_HEAD(swap_active_head);
0077
/*
 * all available (active, not full) swap_info_structs
 * protected with swap_avail_lock, ordered by priority.
 * This is used by get_swap_pages() instead of swap_active_head
 * because swap_active_head includes all swap_info_structs,
 * but get_swap_pages() doesn't need to look at full ones.
 * This uses its own lock instead of swap_lock because when a
 * swap_info_struct changes between not-full/full, it needs to
 * add/remove itself to/from this list, but the swap_info_struct->lock
 * is held and the locking dependency between that lock and swap_lock
 * is that swap_lock is taken before swap_info_struct->lock.
 */
0090 static struct plist_head *swap_avail_heads;
0091 static DEFINE_SPINLOCK(swap_avail_lock);
0092
0093 struct swap_info_struct *swap_info[MAX_SWAPFILES];
0094
0095 static DEFINE_MUTEX(swapon_mutex);
0096
0097 static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
0098
0099 static atomic_t proc_poll_event = ATOMIC_INIT(0);
0100
0101 atomic_t nr_rotate_swap = ATOMIC_INIT(0);
0102
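/*
 * Look up a swap device by type.  Out-of-range types return NULL; the
 * READ_ONCE() pairs with the release store that publishes a new
 * swap_info[] entry at swapon time.
 */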
0103 static struct swap_info_struct *swap_type_to_swap_info(int type)
0104 {
0105 if (type >= MAX_SWAPFILES)
0106 return NULL;
0107
0108 return READ_ONCE(swap_info[type]);
0109 }
0110
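/* Usage count of a swap entry, with the SWAP_HAS_CACHE flag masked off. */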
0111 static inline unsigned char swap_count(unsigned char ent)
0112 {
0113 return ent & ~SWAP_HAS_CACHE;
0114 }
0115
/* Reclaim the swap entry anyway if possible */
0117 #define TTRS_ANYWAY 0x1
/*
 * Reclaim the swap entry if there are no more mappings of the
 * corresponding page
 */
0122 #define TTRS_UNMAPPED 0x2
/* Reclaim the swap entry if swap is getting full */
0124 #define TTRS_FULL 0x4
0125
/* returns 1 if swap entry is freed */
0127 static int __try_to_reclaim_swap(struct swap_info_struct *si,
0128 unsigned long offset, unsigned long flags)
0129 {
0130 swp_entry_t entry = swp_entry(si->type, offset);
0131 struct page *page;
0132 int ret = 0;
0133
0134 page = find_get_page(swap_address_space(entry), offset);
0135 if (!page)
0136 return 0;
0137
/*
 * This function is called from scan_swap_map_slots() and from vmscan.c
 * while reclaiming pages, so a page lock may already be held; use
 * trylock to avoid deadlock.  This is a special case: in usual
 * operations, use try_to_free_swap() with an explicit lock_page()
 * instead.
 */
0144 if (trylock_page(page)) {
0145 if ((flags & TTRS_ANYWAY) ||
0146 ((flags & TTRS_UNMAPPED) && !page_mapped(page)) ||
0147 ((flags & TTRS_FULL) && mem_cgroup_swap_full(page)))
0148 ret = try_to_free_swap(page);
0149 unlock_page(page);
0150 }
0151 put_page(page);
0152 return ret;
0153 }
0154
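/* Walk the rb-tree of swap extents in page order: first extent, then next. */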
0155 static inline struct swap_extent *first_se(struct swap_info_struct *sis)
0156 {
0157 struct rb_node *rb = rb_first(&sis->swap_extent_root);
0158 return rb_entry(rb, struct swap_extent, rb_node);
0159 }
0160
0161 static inline struct swap_extent *next_se(struct swap_extent *se)
0162 {
0163 struct rb_node *rb = rb_next(&se->rb_node);
0164 return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL;
0165 }
0166
/*
 * swapon tell device that all the old swap contents can be discarded,
 * to allow the swap device to optimize its wear-levelling.
 */
0171 static int discard_swap(struct swap_info_struct *si)
0172 {
0173 struct swap_extent *se;
0174 sector_t start_block;
0175 sector_t nr_blocks;
0176 int err = 0;
0177
/* Do not discard the swap header page! */
0179 se = first_se(si);
0180 start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
0181 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
0182 if (nr_blocks) {
0183 err = blkdev_issue_discard(si->bdev, start_block,
0184 nr_blocks, GFP_KERNEL);
0185 if (err)
0186 return err;
0187 cond_resched();
0188 }
0189
0190 for (se = next_se(se); se; se = next_se(se)) {
0191 start_block = se->start_block << (PAGE_SHIFT - 9);
0192 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
0193
0194 err = blkdev_issue_discard(si->bdev, start_block,
0195 nr_blocks, GFP_KERNEL);
0196 if (err)
0197 break;
0198
0199 cond_resched();
0200 }
0201 return err;
0202 }
0203
0204 static struct swap_extent *
0205 offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
0206 {
0207 struct swap_extent *se;
0208 struct rb_node *rb;
0209
0210 rb = sis->swap_extent_root.rb_node;
0211 while (rb) {
0212 se = rb_entry(rb, struct swap_extent, rb_node);
0213 if (offset < se->start_page)
0214 rb = rb->rb_left;
0215 else if (offset >= se->start_page + se->nr_pages)
0216 rb = rb->rb_right;
0217 else
0218 return se;
0219 }
/* It *must* be present */
0221 BUG();
0222 }
0223
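/*
 * Translate a swap cache page to the 512-byte sector on its backing device,
 * via the swap extent that covers the page's swap offset.
 */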
0224 sector_t swap_page_sector(struct page *page)
0225 {
0226 struct swap_info_struct *sis = page_swap_info(page);
0227 struct swap_extent *se;
0228 sector_t sector;
0229 pgoff_t offset;
0230
0231 offset = __page_file_index(page);
0232 se = offset_to_swap_extent(sis, offset);
0233 sector = se->start_block + (offset - se->start_page);
0234 return sector << (PAGE_SHIFT - 9);
0235 }
0236
/*
 * swap allocation tell device that a cluster of swap can now be discarded,
 * to allow the swap device to optimize its wear-levelling.
 */
0241 static void discard_swap_cluster(struct swap_info_struct *si,
0242 pgoff_t start_page, pgoff_t nr_pages)
0243 {
0244 struct swap_extent *se = offset_to_swap_extent(si, start_page);
0245
0246 while (nr_pages) {
0247 pgoff_t offset = start_page - se->start_page;
0248 sector_t start_block = se->start_block + offset;
0249 sector_t nr_blocks = se->nr_pages - offset;
0250
0251 if (nr_blocks > nr_pages)
0252 nr_blocks = nr_pages;
0253 start_page += nr_blocks;
0254 nr_pages -= nr_blocks;
0255
0256 start_block <<= PAGE_SHIFT - 9;
0257 nr_blocks <<= PAGE_SHIFT - 9;
0258 if (blkdev_issue_discard(si->bdev, start_block,
0259 nr_blocks, GFP_NOIO))
0260 break;
0261
0262 se = next_se(se);
0263 }
0264 }
0265
0266 #ifdef CONFIG_THP_SWAP
0267 #define SWAPFILE_CLUSTER HPAGE_PMD_NR
0268
0269 #define swap_entry_size(size) (size)
0270 #else
0271 #define SWAPFILE_CLUSTER 256
0272
/*
 * Define swap_entry_size() as constant to let compiler to optimize
 * out some code if !CONFIG_THP_SWAP
 */
0277 #define swap_entry_size(size) 1
0278 #endif
0279 #define LATENCY_LIMIT 256
0280
0281 static inline void cluster_set_flag(struct swap_cluster_info *info,
0282 unsigned int flag)
0283 {
0284 info->flags = flag;
0285 }
0286
0287 static inline unsigned int cluster_count(struct swap_cluster_info *info)
0288 {
0289 return info->data;
0290 }
0291
0292 static inline void cluster_set_count(struct swap_cluster_info *info,
0293 unsigned int c)
0294 {
0295 info->data = c;
0296 }
0297
0298 static inline void cluster_set_count_flag(struct swap_cluster_info *info,
0299 unsigned int c, unsigned int f)
0300 {
0301 info->flags = f;
0302 info->data = c;
0303 }
0304
0305 static inline unsigned int cluster_next(struct swap_cluster_info *info)
0306 {
0307 return info->data;
0308 }
0309
0310 static inline void cluster_set_next(struct swap_cluster_info *info,
0311 unsigned int n)
0312 {
0313 info->data = n;
0314 }
0315
0316 static inline void cluster_set_next_flag(struct swap_cluster_info *info,
0317 unsigned int n, unsigned int f)
0318 {
0319 info->flags = f;
0320 info->data = n;
0321 }
0322
0323 static inline bool cluster_is_free(struct swap_cluster_info *info)
0324 {
0325 return info->flags & CLUSTER_FLAG_FREE;
0326 }
0327
0328 static inline bool cluster_is_null(struct swap_cluster_info *info)
0329 {
0330 return info->flags & CLUSTER_FLAG_NEXT_NULL;
0331 }
0332
0333 static inline void cluster_set_null(struct swap_cluster_info *info)
0334 {
0335 info->flags = CLUSTER_FLAG_NEXT_NULL;
0336 info->data = 0;
0337 }
0338
0339 static inline bool cluster_is_huge(struct swap_cluster_info *info)
0340 {
0341 if (IS_ENABLED(CONFIG_THP_SWAP))
0342 return info->flags & CLUSTER_FLAG_HUGE;
0343 return false;
0344 }
0345
0346 static inline void cluster_clear_huge(struct swap_cluster_info *info)
0347 {
0348 info->flags &= ~CLUSTER_FLAG_HUGE;
0349 }
0350
0351 static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
0352 unsigned long offset)
0353 {
0354 struct swap_cluster_info *ci;
0355
0356 ci = si->cluster_info;
0357 if (ci) {
0358 ci += offset / SWAPFILE_CLUSTER;
0359 spin_lock(&ci->lock);
0360 }
0361 return ci;
0362 }
0363
0364 static inline void unlock_cluster(struct swap_cluster_info *ci)
0365 {
0366 if (ci)
0367 spin_unlock(&ci->lock);
0368 }
0369
/*
 * Determine the locking method in use for this device.  Return
 * swap_cluster_info if SSD-style cluster-based locking is in place.
 */
0374 static inline struct swap_cluster_info *lock_cluster_or_swap_info(
0375 struct swap_info_struct *si, unsigned long offset)
0376 {
0377 struct swap_cluster_info *ci;
0378
/* Try to use fine-grained SSD-style locking if available: */
0380 ci = lock_cluster(si, offset);
/* Otherwise, fall back to traditional, coarse locking: */
0382 if (!ci)
0383 spin_lock(&si->lock);
0384
0385 return ci;
0386 }
0387
0388 static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
0389 struct swap_cluster_info *ci)
0390 {
0391 if (ci)
0392 unlock_cluster(ci);
0393 else
0394 spin_unlock(&si->lock);
0395 }
0396
0397 static inline bool cluster_list_empty(struct swap_cluster_list *list)
0398 {
0399 return cluster_is_null(&list->head);
0400 }
0401
0402 static inline unsigned int cluster_list_first(struct swap_cluster_list *list)
0403 {
0404 return cluster_next(&list->head);
0405 }
0406
0407 static void cluster_list_init(struct swap_cluster_list *list)
0408 {
0409 cluster_set_null(&list->head);
0410 cluster_set_null(&list->tail);
0411 }
0412
0413 static void cluster_list_add_tail(struct swap_cluster_list *list,
0414 struct swap_cluster_info *ci,
0415 unsigned int idx)
0416 {
0417 if (cluster_list_empty(list)) {
0418 cluster_set_next_flag(&list->head, idx, 0);
0419 cluster_set_next_flag(&list->tail, idx, 0);
0420 } else {
0421 struct swap_cluster_info *ci_tail;
0422 unsigned int tail = cluster_next(&list->tail);
0423
/*
 * Nested cluster lock, but both cluster locks are
 * only acquired when we held swap_info_struct->lock
 */
0428 ci_tail = ci + tail;
0429 spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
0430 cluster_set_next(ci_tail, idx);
0431 spin_unlock(&ci_tail->lock);
0432 cluster_set_next_flag(&list->tail, idx, 0);
0433 }
0434 }
0435
0436 static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
0437 struct swap_cluster_info *ci)
0438 {
0439 unsigned int idx;
0440
0441 idx = cluster_next(&list->head);
0442 if (cluster_next(&list->tail) == idx) {
0443 cluster_set_null(&list->head);
0444 cluster_set_null(&list->tail);
0445 } else
0446 cluster_set_next_flag(&list->head,
0447 cluster_next(&ci[idx]), 0);
0448
0449 return idx;
0450 }
0451
/* Add a cluster to discard list and schedule it to do discard */
0453 static void swap_cluster_schedule_discard(struct swap_info_struct *si,
0454 unsigned int idx)
0455 {
/*
 * If scan_swap_map_slots() can't find a free cluster, it will check
 * si->swap_map directly.  To make sure the discarding cluster isn't
 * taken by scan_swap_map_slots(), mark the swap entries bad (occupied).
 * The entries are cleared again after the discard is done.
 */
0462 memset(si->swap_map + idx * SWAPFILE_CLUSTER,
0463 SWAP_MAP_BAD, SWAPFILE_CLUSTER);
0464
0465 cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);
0466
0467 schedule_work(&si->discard_work);
0468 }
0469
0470 static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
0471 {
0472 struct swap_cluster_info *ci = si->cluster_info;
0473
0474 cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE);
0475 cluster_list_add_tail(&si->free_clusters, ci, idx);
0476 }
0477
/*
 * Doing discard actually. After a cluster discard is finished, the cluster
 * will be added to free cluster list. caller should hold si->lock.
 */
0482 static void swap_do_scheduled_discard(struct swap_info_struct *si)
0483 {
0484 struct swap_cluster_info *info, *ci;
0485 unsigned int idx;
0486
0487 info = si->cluster_info;
0488
0489 while (!cluster_list_empty(&si->discard_clusters)) {
0490 idx = cluster_list_del_first(&si->discard_clusters, info);
0491 spin_unlock(&si->lock);
0492
0493 discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
0494 SWAPFILE_CLUSTER);
0495
0496 spin_lock(&si->lock);
0497 ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
0498 __free_cluster(si, idx);
0499 memset(si->swap_map + idx * SWAPFILE_CLUSTER,
0500 0, SWAPFILE_CLUSTER);
0501 unlock_cluster(ci);
0502 }
0503 }
0504
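/* Work callback: issue the discards queued by swap_cluster_schedule_discard(). */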
0505 static void swap_discard_work(struct work_struct *work)
0506 {
0507 struct swap_info_struct *si;
0508
0509 si = container_of(work, struct swap_info_struct, discard_work);
0510
0511 spin_lock(&si->lock);
0512 swap_do_scheduled_discard(si);
0513 spin_unlock(&si->lock);
0514 }
0515
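/* percpu_ref release callback: the last get_swap_device() reference is gone. */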
0516 static void swap_users_ref_free(struct percpu_ref *ref)
0517 {
0518 struct swap_info_struct *si;
0519
0520 si = container_of(ref, struct swap_info_struct, users);
0521 complete(&si->comp);
0522 }
0523
0524 static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
0525 {
0526 struct swap_cluster_info *ci = si->cluster_info;
0527
0528 VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx);
0529 cluster_list_del_first(&si->free_clusters, ci);
0530 cluster_set_count_flag(ci + idx, 0, 0);
0531 }
0532
0533 static void free_cluster(struct swap_info_struct *si, unsigned long idx)
0534 {
0535 struct swap_cluster_info *ci = si->cluster_info + idx;
0536
0537 VM_BUG_ON(cluster_count(ci) != 0);
/*
 * If the swap is discardable, prepare discard the cluster
 * instead of free it immediately. The cluster will be freed
 * after discard.
 */
0543 if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
0544 (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
0545 swap_cluster_schedule_discard(si, idx);
0546 return;
0547 }
0548
0549 __free_cluster(si, idx);
0550 }
0551
/*
 * The cluster corresponding to page_nr will be used. The cluster will be
 * removed from free cluster list and its usage counter will be increased.
 */
0556 static void inc_cluster_info_page(struct swap_info_struct *p,
0557 struct swap_cluster_info *cluster_info, unsigned long page_nr)
0558 {
0559 unsigned long idx = page_nr / SWAPFILE_CLUSTER;
0560
0561 if (!cluster_info)
0562 return;
0563 if (cluster_is_free(&cluster_info[idx]))
0564 alloc_cluster(p, idx);
0565
0566 VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);
0567 cluster_set_count(&cluster_info[idx],
0568 cluster_count(&cluster_info[idx]) + 1);
0569 }
0570
/*
 * The cluster corresponding to page_nr decreases one usage. If the usage
 * counter becomes 0, which means no page in the cluster is in use, we can
 * optionally discard the cluster and add it to free cluster list.
 */
0576 static void dec_cluster_info_page(struct swap_info_struct *p,
0577 struct swap_cluster_info *cluster_info, unsigned long page_nr)
0578 {
0579 unsigned long idx = page_nr / SWAPFILE_CLUSTER;
0580
0581 if (!cluster_info)
0582 return;
0583
0584 VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
0585 cluster_set_count(&cluster_info[idx],
0586 cluster_count(&cluster_info[idx]) - 1);
0587
0588 if (cluster_count(&cluster_info[idx]) == 0)
0589 free_cluster(p, idx);
0590 }
0591
/*
 * It's possible scan_swap_map_slots() uses a free cluster in the middle of
 * free cluster list. Avoiding such abuse to avoid list corruption.
 */
0596 static bool
0597 scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
0598 unsigned long offset)
0599 {
0600 struct percpu_cluster *percpu_cluster;
0601 bool conflict;
0602
0603 offset /= SWAPFILE_CLUSTER;
0604 conflict = !cluster_list_empty(&si->free_clusters) &&
0605 offset != cluster_list_first(&si->free_clusters) &&
0606 cluster_is_free(&si->cluster_info[offset]);
0607
0608 if (!conflict)
0609 return false;
0610
0611 percpu_cluster = this_cpu_ptr(si->percpu_cluster);
0612 cluster_set_null(&percpu_cluster->index);
0613 return true;
0614 }
0615
/*
 * Try to get a swap entry from current cpu's swap entry pool (a cluster).
 * This might involve allocating a new cluster for current CPU too.
 */
0620 static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
0621 unsigned long *offset, unsigned long *scan_base)
0622 {
0623 struct percpu_cluster *cluster;
0624 struct swap_cluster_info *ci;
0625 unsigned long tmp, max;
0626
0627 new_cluster:
0628 cluster = this_cpu_ptr(si->percpu_cluster);
0629 if (cluster_is_null(&cluster->index)) {
0630 if (!cluster_list_empty(&si->free_clusters)) {
0631 cluster->index = si->free_clusters.head;
0632 cluster->next = cluster_next(&cluster->index) *
0633 SWAPFILE_CLUSTER;
0634 } else if (!cluster_list_empty(&si->discard_clusters)) {
/*
 * we don't have free cluster but have some clusters in
 * discarding, do discard now and reclaim them, then
 * reinsert base_addr of free clusters into the cluster list.
 */
0640 swap_do_scheduled_discard(si);
0641 *scan_base = this_cpu_read(*si->cluster_next_cpu);
0642 *offset = *scan_base;
0643 goto new_cluster;
0644 } else
0645 return false;
0646 }
0647
/*
 * Other CPUs can use our cluster if they can't find a free cluster,
 * check if there is still free entry in the cluster
 */
0652 tmp = cluster->next;
0653 max = min_t(unsigned long, si->max,
0654 (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER);
0655 if (tmp < max) {
0656 ci = lock_cluster(si, tmp);
0657 while (tmp < max) {
0658 if (!si->swap_map[tmp])
0659 break;
0660 tmp++;
0661 }
0662 unlock_cluster(ci);
0663 }
0664 if (tmp >= max) {
0665 cluster_set_null(&cluster->index);
0666 goto new_cluster;
0667 }
0668 cluster->next = tmp + 1;
0669 *offset = tmp;
0670 *scan_base = tmp;
0671 return true;
0672 }
0673
0674 static void __del_from_avail_list(struct swap_info_struct *p)
0675 {
0676 int nid;
0677
0678 for_each_node(nid)
0679 plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
0680 }
0681
0682 static void del_from_avail_list(struct swap_info_struct *p)
0683 {
0684 spin_lock(&swap_avail_lock);
0685 __del_from_avail_list(p);
0686 spin_unlock(&swap_avail_lock);
0687 }
0688
0689 static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
0690 unsigned int nr_entries)
0691 {
0692 unsigned int end = offset + nr_entries - 1;
0693
0694 if (offset == si->lowest_bit)
0695 si->lowest_bit += nr_entries;
0696 if (end == si->highest_bit)
0697 WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries);
0698 WRITE_ONCE(si->inuse_pages, si->inuse_pages + nr_entries);
0699 if (si->inuse_pages == si->pages) {
0700 si->lowest_bit = si->max;
0701 si->highest_bit = 0;
0702 del_from_avail_list(si);
0703 }
0704 }
0705
0706 static void add_to_avail_list(struct swap_info_struct *p)
0707 {
0708 int nid;
0709
0710 spin_lock(&swap_avail_lock);
0711 for_each_node(nid) {
0712 WARN_ON(!plist_node_empty(&p->avail_lists[nid]));
0713 plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]);
0714 }
0715 spin_unlock(&swap_avail_lock);
0716 }
0717
0718 static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
0719 unsigned int nr_entries)
0720 {
0721 unsigned long begin = offset;
0722 unsigned long end = offset + nr_entries - 1;
0723 void (*swap_slot_free_notify)(struct block_device *, unsigned long);
0724
0725 if (offset < si->lowest_bit)
0726 si->lowest_bit = offset;
0727 if (end > si->highest_bit) {
0728 bool was_full = !si->highest_bit;
0729
0730 WRITE_ONCE(si->highest_bit, end);
0731 if (was_full && (si->flags & SWP_WRITEOK))
0732 add_to_avail_list(si);
0733 }
0734 atomic_long_add(nr_entries, &nr_swap_pages);
0735 WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries);
0736 if (si->flags & SWP_BLKDEV)
0737 swap_slot_free_notify =
0738 si->bdev->bd_disk->fops->swap_slot_free_notify;
0739 else
0740 swap_slot_free_notify = NULL;
0741 while (offset <= end) {
0742 arch_swap_invalidate_page(si->type, offset);
0743 frontswap_invalidate_page(si->type, offset);
0744 if (swap_slot_free_notify)
0745 swap_slot_free_notify(si->bdev, offset);
0746 offset++;
0747 }
0748 clear_shadow_from_swap_cache(si->type, begin, end);
0749 }
0750
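/*
 * Record where the next scan should start.  On rotational devices this is a
 * single cluster_next value; on SSDs each CPU keeps its own, and is moved to
 * a random aligned trunk when it crosses a swap address space boundary.
 */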
0751 static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
0752 {
0753 unsigned long prev;
0754
0755 if (!(si->flags & SWP_SOLIDSTATE)) {
0756 si->cluster_next = next;
0757 return;
0758 }
0759
0760 prev = this_cpu_read(*si->cluster_next_cpu);
/*
 * Cross the swap address space size aligned trunk, choose
 * another trunk randomly to avoid lock contention on swap
 * cluster allocation between CPUs.
 */
0766 if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) !=
0767 (next >> SWAP_ADDRESS_SPACE_SHIFT)) {
/* No free swap slots available */
0769 if (si->highest_bit <= si->lowest_bit)
0770 return;
0771 next = si->lowest_bit +
0772 prandom_u32_max(si->highest_bit - si->lowest_bit + 1);
0773 next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES);
0774 next = max_t(unsigned int, next, si->lowest_bit);
0775 }
0776 this_cpu_write(*si->cluster_next_cpu, next);
0777 }
0778
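/*
 * Return true, with si->lock re-acquired, if the slot at @offset is free or
 * holds only a reclaimable stale swap-cache entry; false otherwise.
 */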
0779 static bool swap_offset_available_and_locked(struct swap_info_struct *si,
0780 unsigned long offset)
0781 {
0782 if (data_race(!si->swap_map[offset])) {
0783 spin_lock(&si->lock);
0784 return true;
0785 }
0786
0787 if (vm_swap_full() && READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
0788 spin_lock(&si->lock);
0789 return true;
0790 }
0791
0792 return false;
0793 }
0794
0795 static int scan_swap_map_slots(struct swap_info_struct *si,
0796 unsigned char usage, int nr,
0797 swp_entry_t slots[])
0798 {
0799 struct swap_cluster_info *ci;
0800 unsigned long offset;
0801 unsigned long scan_base;
0802 unsigned long last_in_cluster = 0;
0803 int latency_ration = LATENCY_LIMIT;
0804 int n_ret = 0;
0805 bool scanned_many = false;
0806
/*
 * We try to cluster swap pages by allocating them sequentially
 * in swap.  Once we've allocated SWAPFILE_CLUSTER pages this
 * way, however, we resort to first-free allocation, starting
 * a new cluster.  This prevents us from scattering swap pages
 * all over the entire swap partition, so that we reduce
 * overall disk seek times between swap pages.  -- sct
 * But we do now try to find an empty cluster.  -Andrea
 * And we let swap pages go all over an SSD partition.  Hugh
 */

0818 si->flags += SWP_SCANNING;
/*
 * Use percpu scan base for SSD to reduce lock contention on
 * cluster and swap cache.  For HDD, sequential access is more
 * important.
 */
0824 if (si->flags & SWP_SOLIDSTATE)
0825 scan_base = this_cpu_read(*si->cluster_next_cpu);
0826 else
0827 scan_base = si->cluster_next;
0828 offset = scan_base;
0829
/* SSD algorithm */
0831 if (si->cluster_info) {
0832 if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
0833 goto scan;
0834 } else if (unlikely(!si->cluster_nr--)) {
0835 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
0836 si->cluster_nr = SWAPFILE_CLUSTER - 1;
0837 goto checks;
0838 }
0839
0840 spin_unlock(&si->lock);
0841
/*
 * If seek is expensive, start searching for new cluster from
 * start of partition, to minimize the span of allocated swap.
 * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info
 * case, just handled by scan_swap_map_try_ssd_cluster() above.
 */
0848 scan_base = offset = si->lowest_bit;
0849 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
0850
/* Locate the first empty (unaligned) cluster */
0852 for (; last_in_cluster <= si->highest_bit; offset++) {
0853 if (si->swap_map[offset])
0854 last_in_cluster = offset + SWAPFILE_CLUSTER;
0855 else if (offset == last_in_cluster) {
0856 spin_lock(&si->lock);
0857 offset -= SWAPFILE_CLUSTER - 1;
0858 si->cluster_next = offset;
0859 si->cluster_nr = SWAPFILE_CLUSTER - 1;
0860 goto checks;
0861 }
0862 if (unlikely(--latency_ration < 0)) {
0863 cond_resched();
0864 latency_ration = LATENCY_LIMIT;
0865 }
0866 }
0867
0868 offset = scan_base;
0869 spin_lock(&si->lock);
0870 si->cluster_nr = SWAPFILE_CLUSTER - 1;
0871 }
0872
0873 checks:
0874 if (si->cluster_info) {
0875 while (scan_swap_map_ssd_cluster_conflict(si, offset)) {
/* take a break if we already got some slots */
0877 if (n_ret)
0878 goto done;
0879 if (!scan_swap_map_try_ssd_cluster(si, &offset,
0880 &scan_base))
0881 goto scan;
0882 }
0883 }
0884 if (!(si->flags & SWP_WRITEOK))
0885 goto no_page;
0886 if (!si->highest_bit)
0887 goto no_page;
0888 if (offset > si->highest_bit)
0889 scan_base = offset = si->lowest_bit;
0890
0891 ci = lock_cluster(si, offset);
0892
0893 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
0894 int swap_was_freed;
0895 unlock_cluster(ci);
0896 spin_unlock(&si->lock);
0897 swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
0898 spin_lock(&si->lock);
0899
0900 if (swap_was_freed)
0901 goto checks;
0902 goto scan;
0903 }
0904
0905 if (si->swap_map[offset]) {
0906 unlock_cluster(ci);
0907 if (!n_ret)
0908 goto scan;
0909 else
0910 goto done;
0911 }
0912 WRITE_ONCE(si->swap_map[offset], usage);
0913 inc_cluster_info_page(si, si->cluster_info, offset);
0914 unlock_cluster(ci);
0915
0916 swap_range_alloc(si, offset, 1);
0917 slots[n_ret++] = swp_entry(si->type, offset);
0918
/* got enough slots or reach max slots? */
0920 if ((n_ret == nr) || (offset >= si->highest_bit))
0921 goto done;
0922
/* search for next available slot */

/* time to take a break? */
0926 if (unlikely(--latency_ration < 0)) {
0927 if (n_ret)
0928 goto done;
0929 spin_unlock(&si->lock);
0930 cond_resched();
0931 spin_lock(&si->lock);
0932 latency_ration = LATENCY_LIMIT;
0933 }
0934
/* try to get more slots in cluster */
0936 if (si->cluster_info) {
0937 if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
0938 goto checks;
0939 } else if (si->cluster_nr && !si->swap_map[++offset]) {
/* non-ssd case, still more slots in cluster? */
0941 --si->cluster_nr;
0942 goto checks;
0943 }
0944
/*
 * Even if there's no free clusters available (fragmented),
 * try to scan a little more quickly with lock held unless we
 * have scanned too many slots already.
 */
0950 if (!scanned_many) {
0951 unsigned long scan_limit;
0952
0953 if (offset < scan_base)
0954 scan_limit = scan_base;
0955 else
0956 scan_limit = si->highest_bit;
0957 for (; offset <= scan_limit && --latency_ration > 0;
0958 offset++) {
0959 if (!si->swap_map[offset])
0960 goto checks;
0961 }
0962 }
0963
0964 done:
0965 set_cluster_next(si, offset + 1);
0966 si->flags -= SWP_SCANNING;
0967 return n_ret;
0968
0969 scan:
0970 spin_unlock(&si->lock);
0971 while (++offset <= READ_ONCE(si->highest_bit)) {
0972 if (swap_offset_available_and_locked(si, offset))
0973 goto checks;
0974 if (unlikely(--latency_ration < 0)) {
0975 cond_resched();
0976 latency_ration = LATENCY_LIMIT;
0977 scanned_many = true;
0978 }
0979 }
0980 offset = si->lowest_bit;
0981 while (offset < scan_base) {
0982 if (swap_offset_available_and_locked(si, offset))
0983 goto checks;
0984 if (unlikely(--latency_ration < 0)) {
0985 cond_resched();
0986 latency_ration = LATENCY_LIMIT;
0987 scanned_many = true;
0988 }
0989 offset++;
0990 }
0991 spin_lock(&si->lock);
0992
0993 no_page:
0994 si->flags -= SWP_SCANNING;
0995 return n_ret;
0996 }
0997
0998 static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
0999 {
1000 unsigned long idx;
1001 struct swap_cluster_info *ci;
1002 unsigned long offset;
1003
/*
 * Should not even be attempting cluster allocations when huge
 * page swap is disabled.  Warn and fail the allocation.
 */
1008 if (!IS_ENABLED(CONFIG_THP_SWAP)) {
1009 VM_WARN_ON_ONCE(1);
1010 return 0;
1011 }
1012
1013 if (cluster_list_empty(&si->free_clusters))
1014 return 0;
1015
1016 idx = cluster_list_first(&si->free_clusters);
1017 offset = idx * SWAPFILE_CLUSTER;
1018 ci = lock_cluster(si, offset);
1019 alloc_cluster(si, idx);
1020 cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE);
1021
1022 memset(si->swap_map + offset, SWAP_HAS_CACHE, SWAPFILE_CLUSTER);
1023 unlock_cluster(ci);
1024 swap_range_alloc(si, offset, SWAPFILE_CLUSTER);
1025 *slot = swp_entry(si->type, offset);
1026
1027 return 1;
1028 }
1029
1030 static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
1031 {
1032 unsigned long offset = idx * SWAPFILE_CLUSTER;
1033 struct swap_cluster_info *ci;
1034
1035 ci = lock_cluster(si, offset);
1036 memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
1037 cluster_set_count_flag(ci, 0, 0);
1038 free_cluster(si, idx);
1039 unlock_cluster(ci);
1040 swap_range_free(si, offset, SWAPFILE_CLUSTER);
1041 }
1042
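/*
 * Allocate up to @n_goal swap slots (or a single whole cluster when
 * @entry_size is SWAPFILE_CLUSTER), trying devices in per-node priority order.
 */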
1043 int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
1044 {
1045 unsigned long size = swap_entry_size(entry_size);
1046 struct swap_info_struct *si, *next;
1047 long avail_pgs;
1048 int n_ret = 0;
1049 int node;
1050
/* Only single cluster request supported */
1052 WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);
1053
1054 spin_lock(&swap_avail_lock);
1055
1056 avail_pgs = atomic_long_read(&nr_swap_pages) / size;
1057 if (avail_pgs <= 0) {
1058 spin_unlock(&swap_avail_lock);
1059 goto noswap;
1060 }
1061
1062 n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs);
1063
1064 atomic_long_sub(n_goal * size, &nr_swap_pages);
1065
1066 start_over:
1067 node = numa_node_id();
1068 plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
/* requeue si to after same-priority siblings */
1070 plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
1071 spin_unlock(&swap_avail_lock);
1072 spin_lock(&si->lock);
1073 if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
1074 spin_lock(&swap_avail_lock);
1075 if (plist_node_empty(&si->avail_lists[node])) {
1076 spin_unlock(&si->lock);
1077 goto nextsi;
1078 }
1079 WARN(!si->highest_bit,
1080 "swap_info %d in list but !highest_bit\n",
1081 si->type);
1082 WARN(!(si->flags & SWP_WRITEOK),
1083 "swap_info %d in list but !SWP_WRITEOK\n",
1084 si->type);
1085 __del_from_avail_list(si);
1086 spin_unlock(&si->lock);
1087 goto nextsi;
1088 }
1089 if (size == SWAPFILE_CLUSTER) {
1090 if (si->flags & SWP_BLKDEV)
1091 n_ret = swap_alloc_cluster(si, swp_entries);
1092 } else
1093 n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
1094 n_goal, swp_entries);
1095 spin_unlock(&si->lock);
1096 if (n_ret || size == SWAPFILE_CLUSTER)
1097 goto check_out;
1098 pr_debug("scan_swap_map of si %d failed to find offset\n",
1099 si->type);
1100
1101 spin_lock(&swap_avail_lock);
1102 nextsi:
/*
 * if we got here, it's likely that si was almost full before,
 * and since scan_swap_map_slots() can drop the si->lock,
 * multiple callers probably all tried to get a page from the
 * same si and it filled up before we could get one; or, the si
 * filled up between us dropping swap_avail_lock and taking
 * si->lock. Since we dropped the swap_avail_lock, the
 * swap_avail_head list may have been modified; so if next is
 * still in the swap_avail_head list then try it, otherwise
 * start over if we have not gotten any slots.
 */
1114 if (plist_node_empty(&next->avail_lists[node]))
1115 goto start_over;
1116 }
1117
1118 spin_unlock(&swap_avail_lock);
1119
1120 check_out:
1121 if (n_ret < n_goal)
1122 atomic_long_add((long)(n_goal - n_ret) * size,
1123 &nr_swap_pages);
1124 noswap:
1125 return n_ret;
1126 }
1127
1128 static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
1129 {
1130 struct swap_info_struct *p;
1131 unsigned long offset;
1132
1133 if (!entry.val)
1134 goto out;
1135 p = swp_swap_info(entry);
1136 if (!p)
1137 goto bad_nofile;
1138 if (data_race(!(p->flags & SWP_USED)))
1139 goto bad_device;
1140 offset = swp_offset(entry);
1141 if (offset >= p->max)
1142 goto bad_offset;
1143 if (data_race(!p->swap_map[swp_offset(entry)]))
1144 goto bad_free;
1145 return p;
1146
1147 bad_free:
1148 pr_err("%s: %s%08lx\n", __func__, Unused_offset, entry.val);
1149 goto out;
1150 bad_offset:
1151 pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val);
1152 goto out;
1153 bad_device:
1154 pr_err("%s: %s%08lx\n", __func__, Unused_file, entry.val);
1155 goto out;
1156 bad_nofile:
1157 pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
1158 out:
1159 return NULL;
1160 }
1161
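/*
 * Like _swap_info_get(), but hand over the lock from the previously used
 * device @q so that a sorted batch of entries only locks each device once.
 */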
1162 static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
1163 struct swap_info_struct *q)
1164 {
1165 struct swap_info_struct *p;
1166
1167 p = _swap_info_get(entry);
1168
1169 if (p != q) {
1170 if (q != NULL)
1171 spin_unlock(&q->lock);
1172 if (p != NULL)
1173 spin_lock(&p->lock);
1174 }
1175 return p;
1176 }
1177
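/*
 * Drop one @usage reference from the swap_map slot at @offset with the
 * cluster or device lock held.  Returns the remaining usage; a slot whose
 * count reaches zero is left as SWAP_HAS_CACHE until free_swap_slot() runs.
 */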
1178 static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
1179 unsigned long offset,
1180 unsigned char usage)
1181 {
1182 unsigned char count;
1183 unsigned char has_cache;
1184
1185 count = p->swap_map[offset];
1186
1187 has_cache = count & SWAP_HAS_CACHE;
1188 count &= ~SWAP_HAS_CACHE;
1189
1190 if (usage == SWAP_HAS_CACHE) {
1191 VM_BUG_ON(!has_cache);
1192 has_cache = 0;
1193 } else if (count == SWAP_MAP_SHMEM) {
1194
1195
1196
1197
1198 count = 0;
1199 } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
1200 if (count == COUNT_CONTINUED) {
1201 if (swap_count_continued(p, offset, count))
1202 count = SWAP_MAP_MAX | COUNT_CONTINUED;
1203 else
1204 count = SWAP_MAP_MAX;
1205 } else
1206 count--;
1207 }
1208
1209 usage = count | has_cache;
1210 if (usage)
1211 WRITE_ONCE(p->swap_map[offset], usage);
1212 else
1213 WRITE_ONCE(p->swap_map[offset], SWAP_HAS_CACHE);
1214
1215 return usage;
1216 }
1217
/*
 * Check whether swap entry is valid in the swap device.  If so,
 * return pointer to swap_info_struct, and keep the swap entry valid
 * via preventing the swap device from being swapoff, until
 * put_swap_device() is called.  Otherwise return NULL.
 *
 * Notice that swapoff or swapoff+swapon can still happen before the
 * percpu_ref_tryget_live() in get_swap_device() or after the
 * percpu_ref_put() in put_swap_device() if there isn't any other way
 * to prevent swapoff, such as page lock, page table lock, etc.  The
 * caller must be prepared for that.  For example, the following
 * situation is possible.
 *
 *   CPU1				CPU2
 *   do_swap_page()
 *     ...				swapoff+swapon
 *     __read_swap_cache_async()
 *       swapcache_prepare()
 *         __swap_duplicate()
 *           // check swap_map
 *         // verify PTE not changed
 *
 * In __swap_duplicate(), the swap_map need to be checked before
 * changing partly because the specified swap entry may be for another
 * swap device which has been swapoff.  And in do_swap_page(), after
 * the page is read from the swap device, the PTE is verified not
 * changed with the page table locked to check whether the swap device
 * has been swapoff or swapoff+swapon.
 */
1247 struct swap_info_struct *get_swap_device(swp_entry_t entry)
1248 {
1249 struct swap_info_struct *si;
1250 unsigned long offset;
1251
1252 if (!entry.val)
1253 goto out;
1254 si = swp_swap_info(entry);
1255 if (!si)
1256 goto bad_nofile;
1257 if (!percpu_ref_tryget_live(&si->users))
1258 goto out;
/*
 * Guarantee the si->users are checked before accessing other
 * fields of swap_info_struct.
 *
 * Paired with the spin_unlock() after setup_swap_info() in
 * enable_swap_info().
 */
1266 smp_rmb();
1267 offset = swp_offset(entry);
1268 if (offset >= si->max)
1269 goto put_out;
1270
1271 return si;
1272 bad_nofile:
1273 pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
1274 out:
1275 return NULL;
1276 put_out:
1277 pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val);
1278 percpu_ref_put(&si->users);
1279 return NULL;
1280 }
1281
1282 static unsigned char __swap_entry_free(struct swap_info_struct *p,
1283 swp_entry_t entry)
1284 {
1285 struct swap_cluster_info *ci;
1286 unsigned long offset = swp_offset(entry);
1287 unsigned char usage;
1288
1289 ci = lock_cluster_or_swap_info(p, offset);
1290 usage = __swap_entry_free_locked(p, offset, 1);
1291 unlock_cluster_or_swap_info(p, ci);
1292 if (!usage)
1293 free_swap_slot(entry);
1294
1295 return usage;
1296 }
1297
1298 static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
1299 {
1300 struct swap_cluster_info *ci;
1301 unsigned long offset = swp_offset(entry);
1302 unsigned char count;
1303
1304 ci = lock_cluster(p, offset);
1305 count = p->swap_map[offset];
1306 VM_BUG_ON(count != SWAP_HAS_CACHE);
1307 p->swap_map[offset] = 0;
1308 dec_cluster_info_page(p, p->cluster_info, offset);
1309 unlock_cluster(ci);
1310
1311 mem_cgroup_uncharge_swap(entry, 1);
1312 swap_range_free(p, offset, 1);
1313 }
1314
/*
 * Caller has made sure that the swap device corresponding to entry
 * is still around or has not been recycled.
 */
1319 void swap_free(swp_entry_t entry)
1320 {
1321 struct swap_info_struct *p;
1322
1323 p = _swap_info_get(entry);
1324 if (p)
1325 __swap_entry_free(p, entry);
1326 }
1327
/*
 * Called after dropping swapcache to decrease refcnt to swap entries.
 */
1331 void put_swap_page(struct page *page, swp_entry_t entry)
1332 {
1333 unsigned long offset = swp_offset(entry);
1334 unsigned long idx = offset / SWAPFILE_CLUSTER;
1335 struct swap_cluster_info *ci;
1336 struct swap_info_struct *si;
1337 unsigned char *map;
1338 unsigned int i, free_entries = 0;
1339 unsigned char val;
1340 int size = swap_entry_size(thp_nr_pages(page));
1341
1342 si = _swap_info_get(entry);
1343 if (!si)
1344 return;
1345
1346 ci = lock_cluster_or_swap_info(si, offset);
1347 if (size == SWAPFILE_CLUSTER) {
1348 VM_BUG_ON(!cluster_is_huge(ci));
1349 map = si->swap_map + offset;
1350 for (i = 0; i < SWAPFILE_CLUSTER; i++) {
1351 val = map[i];
1352 VM_BUG_ON(!(val & SWAP_HAS_CACHE));
1353 if (val == SWAP_HAS_CACHE)
1354 free_entries++;
1355 }
1356 cluster_clear_huge(ci);
1357 if (free_entries == SWAPFILE_CLUSTER) {
1358 unlock_cluster_or_swap_info(si, ci);
1359 spin_lock(&si->lock);
1360 mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
1361 swap_free_cluster(si, idx);
1362 spin_unlock(&si->lock);
1363 return;
1364 }
1365 }
1366 for (i = 0; i < size; i++, entry.val++) {
1367 if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) {
1368 unlock_cluster_or_swap_info(si, ci);
1369 free_swap_slot(entry);
1370 if (i == size - 1)
1371 return;
1372 lock_cluster_or_swap_info(si, offset);
1373 }
1374 }
1375 unlock_cluster_or_swap_info(si, ci);
1376 }
1377
1378 #ifdef CONFIG_THP_SWAP
1379 int split_swap_cluster(swp_entry_t entry)
1380 {
1381 struct swap_info_struct *si;
1382 struct swap_cluster_info *ci;
1383 unsigned long offset = swp_offset(entry);
1384
1385 si = _swap_info_get(entry);
1386 if (!si)
1387 return -EBUSY;
1388 ci = lock_cluster(si, offset);
1389 cluster_clear_huge(ci);
1390 unlock_cluster(ci);
1391 return 0;
1392 }
1393 #endif
1394
1395 static int swp_entry_cmp(const void *ent1, const void *ent2)
1396 {
1397 const swp_entry_t *e1 = ent1, *e2 = ent2;
1398
1399 return (int)swp_type(*e1) - (int)swp_type(*e2);
1400 }
1401
1402 void swapcache_free_entries(swp_entry_t *entries, int n)
1403 {
1404 struct swap_info_struct *p, *prev;
1405 int i;
1406
1407 if (n <= 0)
1408 return;
1409
1410 prev = NULL;
1411 p = NULL;
1412
/*
 * Sort swap entries by swap device, so each lock is only taken once.
 * nr_swapfiles isn't absolutely correct, but the overhead of sort() is
 * so low that it isn't necessary to optimize further.
 */
1418 if (nr_swapfiles > 1)
1419 sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL);
1420 for (i = 0; i < n; ++i) {
1421 p = swap_info_get_cont(entries[i], prev);
1422 if (p)
1423 swap_entry_free(p, entries[i]);
1424 prev = p;
1425 }
1426 if (p)
1427 spin_unlock(&p->lock);
1428 }
1429
/*
 * How many references to page are currently swapped out?
 * This does not give an exact answer when swap count is continued,
 * but does include the high COUNT_CONTINUED flag to allow for that.
 */
1435 static int page_swapcount(struct page *page)
1436 {
1437 int count = 0;
1438 struct swap_info_struct *p;
1439 struct swap_cluster_info *ci;
1440 swp_entry_t entry;
1441 unsigned long offset;
1442
1443 entry.val = page_private(page);
1444 p = _swap_info_get(entry);
1445 if (p) {
1446 offset = swp_offset(entry);
1447 ci = lock_cluster_or_swap_info(p, offset);
1448 count = swap_count(p->swap_map[offset]);
1449 unlock_cluster_or_swap_info(p, ci);
1450 }
1451 return count;
1452 }
1453
1454 int __swap_count(swp_entry_t entry)
1455 {
1456 struct swap_info_struct *si;
1457 pgoff_t offset = swp_offset(entry);
1458 int count = 0;
1459
1460 si = get_swap_device(entry);
1461 if (si) {
1462 count = swap_count(si->swap_map[offset]);
1463 put_swap_device(si);
1464 }
1465 return count;
1466 }
1467
1468 static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
1469 {
1470 int count = 0;
1471 pgoff_t offset = swp_offset(entry);
1472 struct swap_cluster_info *ci;
1473
1474 ci = lock_cluster_or_swap_info(si, offset);
1475 count = swap_count(si->swap_map[offset]);
1476 unlock_cluster_or_swap_info(si, ci);
1477 return count;
1478 }
1479
/*
 * How many references to @entry are currently swapped out?
 * This does not give an exact answer when swap count is continued,
 * but does include the high COUNT_CONTINUED flag to allow for that.
 */
1485 int __swp_swapcount(swp_entry_t entry)
1486 {
1487 int count = 0;
1488 struct swap_info_struct *si;
1489
1490 si = get_swap_device(entry);
1491 if (si) {
1492 count = swap_swapcount(si, entry);
1493 put_swap_device(si);
1494 }
1495 return count;
1496 }
1497
/*
 * How many references to @entry are currently swapped out?
 * This considers COUNT_CONTINUED so it returns exact answer.
 */
1502 int swp_swapcount(swp_entry_t entry)
1503 {
1504 int count, tmp_count, n;
1505 struct swap_info_struct *p;
1506 struct swap_cluster_info *ci;
1507 struct page *page;
1508 pgoff_t offset;
1509 unsigned char *map;
1510
1511 p = _swap_info_get(entry);
1512 if (!p)
1513 return 0;
1514
1515 offset = swp_offset(entry);
1516
1517 ci = lock_cluster_or_swap_info(p, offset);
1518
1519 count = swap_count(p->swap_map[offset]);
1520 if (!(count & COUNT_CONTINUED))
1521 goto out;
1522
1523 count &= ~COUNT_CONTINUED;
1524 n = SWAP_MAP_MAX + 1;
1525
1526 page = vmalloc_to_page(p->swap_map + offset);
1527 offset &= ~PAGE_MASK;
1528 VM_BUG_ON(page_private(page) != SWP_CONTINUED);
1529
1530 do {
1531 page = list_next_entry(page, lru);
1532 map = kmap_atomic(page);
1533 tmp_count = map[offset];
1534 kunmap_atomic(map);
1535
1536 count += (tmp_count & ~COUNT_CONTINUED) * n;
1537 n *= (SWAP_CONT_MAX + 1);
1538 } while (tmp_count & COUNT_CONTINUED);
1539 out:
1540 unlock_cluster_or_swap_info(p, ci);
1541 return count;
1542 }
1543
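/*
 * Check whether any swap entry backing a THP-sized cluster (or the single
 * entry, for a non-huge cluster) still has a swap count, i.e. is in use.
 */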
1544 static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
1545 swp_entry_t entry)
1546 {
1547 struct swap_cluster_info *ci;
1548 unsigned char *map = si->swap_map;
1549 unsigned long roffset = swp_offset(entry);
1550 unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER);
1551 int i;
1552 bool ret = false;
1553
1554 ci = lock_cluster_or_swap_info(si, offset);
1555 if (!ci || !cluster_is_huge(ci)) {
1556 if (swap_count(map[roffset]))
1557 ret = true;
1558 goto unlock_out;
1559 }
1560 for (i = 0; i < SWAPFILE_CLUSTER; i++) {
1561 if (swap_count(map[offset + i])) {
1562 ret = true;
1563 break;
1564 }
1565 }
1566 unlock_out:
1567 unlock_cluster_or_swap_info(si, ci);
1568 return ret;
1569 }
1570
1571 static bool folio_swapped(struct folio *folio)
1572 {
1573 swp_entry_t entry;
1574 struct swap_info_struct *si;
1575
1576 if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio)))
1577 return page_swapcount(&folio->page) != 0;
1578
1579 entry = folio_swap_entry(folio);
1580 si = _swap_info_get(entry);
1581 if (si)
1582 return swap_page_trans_huge_swapped(si, entry);
1583 return false;
1584 }
1585
/*
 * If swap is getting full, or if there are no more mappings of this page,
 * then try_to_free_swap is called to free its swap space.
 */
1590 int try_to_free_swap(struct page *page)
1591 {
1592 struct folio *folio = page_folio(page);
1593 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
1594
1595 if (!folio_test_swapcache(folio))
1596 return 0;
1597 if (folio_test_writeback(folio))
1598 return 0;
1599 if (folio_swapped(folio))
1600 return 0;
1601
/*
 * Once hibernation has begun to create its image of memory,
 * there's a danger that one of the calls to try_to_free_swap()
 * - most probably a call from __try_to_reclaim_swap() while
 * hibernation is allocating its own swap pages for the image,
 * but conceivably even a call from memory reclaim - will free
 * the swap from a page which has already been recorded in the
 * image as a clean swapcache page, and then reuse its swap for
 * another page of the image.  On waking from hibernation, the
 * original page might be freed under memory pressure, then
 * later read back in from swap, now with the wrong data.
 *
 * Hibernation suspends storage while it is writing the image
 * to disk so check that here.
 */
1617 if (pm_suspended_storage())
1618 return 0;
1619
1620 delete_from_swap_cache(folio);
1621 folio_set_dirty(folio);
1622 return 1;
1623 }
1624
/*
 * Free the swap entry like above, but also try to
 * free the page cache entry if it is the last user.
 */
1629 int free_swap_and_cache(swp_entry_t entry)
1630 {
1631 struct swap_info_struct *p;
1632 unsigned char count;
1633
1634 if (non_swap_entry(entry))
1635 return 1;
1636
1637 p = _swap_info_get(entry);
1638 if (p) {
1639 count = __swap_entry_free(p, entry);
1640 if (count == SWAP_HAS_CACHE &&
1641 !swap_page_trans_huge_swapped(p, entry))
1642 __try_to_reclaim_swap(p, swp_offset(entry),
1643 TTRS_UNMAPPED | TTRS_FULL);
1644 }
1645 return p != NULL;
1646 }
1647
1648 #ifdef CONFIG_HIBERNATION
1649
1650 swp_entry_t get_swap_page_of_type(int type)
1651 {
1652 struct swap_info_struct *si = swap_type_to_swap_info(type);
1653 swp_entry_t entry = {0};
1654
1655 if (!si)
1656 goto fail;
1657
/* This is called for allocating swap entry, not cache */
1659 spin_lock(&si->lock);
1660 if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry))
1661 atomic_long_dec(&nr_swap_pages);
1662 spin_unlock(&si->lock);
1663 fail:
1664 return entry;
1665 }
1666
/*
 * Find the swap type that corresponds to given device (if any).
 *
 * @offset - number of the PAGE_SIZE-sized block of the device, starting
 * from 0, in which the swap header is expected to be located.
 *
 * This is needed for the suspend to disk (aka swsusp).
 */
1675 int swap_type_of(dev_t device, sector_t offset)
1676 {
1677 int type;
1678
1679 if (!device)
1680 return -1;
1681
1682 spin_lock(&swap_lock);
1683 for (type = 0; type < nr_swapfiles; type++) {
1684 struct swap_info_struct *sis = swap_info[type];
1685
1686 if (!(sis->flags & SWP_WRITEOK))
1687 continue;
1688
1689 if (device == sis->bdev->bd_dev) {
1690 struct swap_extent *se = first_se(sis);
1691
1692 if (se->start_block == offset) {
1693 spin_unlock(&swap_lock);
1694 return type;
1695 }
1696 }
1697 }
1698 spin_unlock(&swap_lock);
1699 return -ENODEV;
1700 }
1701
1702 int find_first_swap(dev_t *device)
1703 {
1704 int type;
1705
1706 spin_lock(&swap_lock);
1707 for (type = 0; type < nr_swapfiles; type++) {
1708 struct swap_info_struct *sis = swap_info[type];
1709
1710 if (!(sis->flags & SWP_WRITEOK))
1711 continue;
1712 *device = sis->bdev->bd_dev;
1713 spin_unlock(&swap_lock);
1714 return type;
1715 }
1716 spin_unlock(&swap_lock);
1717 return -ENODEV;
1718 }
1719
/*
 * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
 * corresponding to given index in swap_info (swap type).
 */
1724 sector_t swapdev_block(int type, pgoff_t offset)
1725 {
1726 struct swap_info_struct *si = swap_type_to_swap_info(type);
1727 struct swap_extent *se;
1728
1729 if (!si || !(si->flags & SWP_WRITEOK))
1730 return 0;
1731 se = offset_to_swap_extent(si, offset);
1732 return se->start_block + (offset - se->start_page);
1733 }
1734
/*
 * Return either the total number of swap pages of given type, or the number
 * of free pages of that type (depending on @free)
 *
 * This is needed for software suspend
 */
1741 unsigned int count_swap_pages(int type, int free)
1742 {
1743 unsigned int n = 0;
1744
1745 spin_lock(&swap_lock);
1746 if ((unsigned int)type < nr_swapfiles) {
1747 struct swap_info_struct *sis = swap_info[type];
1748
1749 spin_lock(&sis->lock);
1750 if (sis->flags & SWP_WRITEOK) {
1751 n = sis->pages;
1752 if (free)
1753 n -= sis->inuse_pages;
1754 }
1755 spin_unlock(&sis->lock);
1756 }
1757 spin_unlock(&swap_lock);
1758 return n;
1759 }
1760 #endif
1761
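/* Compare a PTE against a swap PTE with the per-PTE software flags cleared. */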
1762 static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
1763 {
1764 return pte_same(pte_swp_clear_flags(pte), swp_pte);
1765 }
1766
/*
 * No need to decide whether this PTE shares the swap entry with others,
 * just let do_wp_page work it out if a write is requested later - to
 * force COW, vm_page_prot omits write permission from any private vma.
 */
1772 static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
1773 unsigned long addr, swp_entry_t entry, struct page *page)
1774 {
1775 struct page *swapcache;
1776 spinlock_t *ptl;
1777 pte_t *pte, new_pte;
1778 int ret = 1;
1779
1780 swapcache = page;
1781 page = ksm_might_need_to_copy(page, vma, addr);
1782 if (unlikely(!page))
1783 return -ENOMEM;
1784
1785 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
1786 if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
1787 ret = 0;
1788 goto out;
1789 }
1790
1791 if (unlikely(!PageUptodate(page))) {
1792 pte_t pteval;
1793
1794 dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
1795 pteval = swp_entry_to_pte(make_swapin_error_entry(page));
1796 set_pte_at(vma->vm_mm, addr, pte, pteval);
1797 swap_free(entry);
1798 ret = 0;
1799 goto out;
1800 }
1801
/* See do_swap_page() */
1803 BUG_ON(!PageAnon(page) && PageMappedToDisk(page));
1804 BUG_ON(PageAnon(page) && PageAnonExclusive(page));
1805
1806 dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
1807 inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
1808 get_page(page);
1809 if (page == swapcache) {
1810 rmap_t rmap_flags = RMAP_NONE;
1811
/*
 * See do_swap_page(): PageWriteback() would be problematic.
 * However, we do a wait_on_page_writeback() just before this
 * call and have the page locked.
 */
1817 VM_BUG_ON_PAGE(PageWriteback(page), page);
1818 if (pte_swp_exclusive(*pte))
1819 rmap_flags |= RMAP_EXCLUSIVE;
1820
1821 page_add_anon_rmap(page, vma, addr, rmap_flags);
1822 } else {
1823 page_add_new_anon_rmap(page, vma, addr);
1824 lru_cache_add_inactive_or_unevictable(page, vma);
1825 }
1826 new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot));
1827 if (pte_swp_soft_dirty(*pte))
1828 new_pte = pte_mksoft_dirty(new_pte);
1829 if (pte_swp_uffd_wp(*pte))
1830 new_pte = pte_mkuffd_wp(new_pte);
1831 set_pte_at(vma->vm_mm, addr, pte, new_pte);
1832 swap_free(entry);
1833 out:
1834 pte_unmap_unlock(pte, ptl);
1835 if (page != swapcache) {
1836 unlock_page(page);
1837 put_page(page);
1838 }
1839 return ret;
1840 }
1841
1842 static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
1843 unsigned long addr, unsigned long end,
1844 unsigned int type)
1845 {
1846 struct page *page;
1847 swp_entry_t entry;
1848 pte_t *pte;
1849 struct swap_info_struct *si;
1850 unsigned long offset;
1851 int ret = 0;
1852 volatile unsigned char *swap_map;
1853
1854 si = swap_info[type];
1855 pte = pte_offset_map(pmd, addr);
1856 do {
1857 if (!is_swap_pte(*pte))
1858 continue;
1859
1860 entry = pte_to_swp_entry(*pte);
1861 if (swp_type(entry) != type)
1862 continue;
1863
1864 offset = swp_offset(entry);
1865 pte_unmap(pte);
1866 swap_map = &si->swap_map[offset];
1867 page = lookup_swap_cache(entry, vma, addr);
1868 if (!page) {
1869 struct vm_fault vmf = {
1870 .vma = vma,
1871 .address = addr,
1872 .real_address = addr,
1873 .pmd = pmd,
1874 };
1875
1876 page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
1877 &vmf);
1878 }
1879 if (!page) {
1880 if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD)
1881 goto try_next;
1882 return -ENOMEM;
1883 }
1884
1885 lock_page(page);
1886 wait_on_page_writeback(page);
1887 ret = unuse_pte(vma, pmd, addr, entry, page);
1888 if (ret < 0) {
1889 unlock_page(page);
1890 put_page(page);
1891 goto out;
1892 }
1893
1894 try_to_free_swap(page);
1895 unlock_page(page);
1896 put_page(page);
1897 try_next:
1898 pte = pte_offset_map(pmd, addr);
1899 } while (pte++, addr += PAGE_SIZE, addr != end);
1900 pte_unmap(pte - 1);
1901
1902 ret = 0;
1903 out:
1904 return ret;
1905 }
1906
1907 static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
1908 unsigned long addr, unsigned long end,
1909 unsigned int type)
1910 {
1911 pmd_t *pmd;
1912 unsigned long next;
1913 int ret;
1914
1915 pmd = pmd_offset(pud, addr);
1916 do {
1917 cond_resched();
1918 next = pmd_addr_end(addr, end);
1919 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1920 continue;
1921 ret = unuse_pte_range(vma, pmd, addr, next, type);
1922 if (ret)
1923 return ret;
1924 } while (pmd++, addr = next, addr != end);
1925 return 0;
1926 }
1927
1928 static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
1929 unsigned long addr, unsigned long end,
1930 unsigned int type)
1931 {
1932 pud_t *pud;
1933 unsigned long next;
1934 int ret;
1935
1936 pud = pud_offset(p4d, addr);
1937 do {
1938 next = pud_addr_end(addr, end);
1939 if (pud_none_or_clear_bad(pud))
1940 continue;
1941 ret = unuse_pmd_range(vma, pud, addr, next, type);
1942 if (ret)
1943 return ret;
1944 } while (pud++, addr = next, addr != end);
1945 return 0;
1946 }
1947
1948 static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
1949 unsigned long addr, unsigned long end,
1950 unsigned int type)
1951 {
1952 p4d_t *p4d;
1953 unsigned long next;
1954 int ret;
1955
1956 p4d = p4d_offset(pgd, addr);
1957 do {
1958 next = p4d_addr_end(addr, end);
1959 if (p4d_none_or_clear_bad(p4d))
1960 continue;
1961 ret = unuse_pud_range(vma, p4d, addr, next, type);
1962 if (ret)
1963 return ret;
1964 } while (p4d++, addr = next, addr != end);
1965 return 0;
1966 }
1967
1968 static int unuse_vma(struct vm_area_struct *vma, unsigned int type)
1969 {
1970 pgd_t *pgd;
1971 unsigned long addr, end, next;
1972 int ret;
1973
1974 addr = vma->vm_start;
1975 end = vma->vm_end;
1976
1977 pgd = pgd_offset(vma->vm_mm, addr);
1978 do {
1979 next = pgd_addr_end(addr, end);
1980 if (pgd_none_or_clear_bad(pgd))
1981 continue;
1982 ret = unuse_p4d_range(vma, pgd, addr, next, type);
1983 if (ret)
1984 return ret;
1985 } while (pgd++, addr = next, addr != end);
1986 return 0;
1987 }
1988
1989 static int unuse_mm(struct mm_struct *mm, unsigned int type)
1990 {
1991 struct vm_area_struct *vma;
1992 int ret = 0;
1993
1994 mmap_read_lock(mm);
1995 for (vma = mm->mmap; vma; vma = vma->vm_next) {
1996 if (vma->anon_vma) {
1997 ret = unuse_vma(vma, type);
1998 if (ret)
1999 break;
2000 }
2001 cond_resched();
2002 }
2003 mmap_read_unlock(mm);
2004 return ret;
2005 }
2006
/*
 * Scan swap_map from current position to next entry still in use.
 * Return 0 if there are no inuse entries after prev till end of
 * the map.
 */
2012 static unsigned int find_next_to_unuse(struct swap_info_struct *si,
2013 unsigned int prev)
2014 {
2015 unsigned int i;
2016 unsigned char count;
2017
/*
 * No need for swap_lock here: we're just looking
 * for whether an entry is in use, not modifying it; false
 * hits are okay, and sys_swapoff() has already prevented new
 * allocations from this area (while holding swap_lock).
 */
2024 for (i = prev + 1; i < si->max; i++) {
2025 count = READ_ONCE(si->swap_map[i]);
2026 if (count && swap_count(count) != SWAP_MAP_BAD)
2027 break;
2028 if ((i % LATENCY_LIMIT) == 0)
2029 cond_resched();
2030 }
2031
2032 if (i == si->max)
2033 i = 0;
2034
2035 return i;
2036 }
2037
2038 static int try_to_unuse(unsigned int type)
2039 {
2040 struct mm_struct *prev_mm;
2041 struct mm_struct *mm;
2042 struct list_head *p;
2043 int retval = 0;
2044 struct swap_info_struct *si = swap_info[type];
2045 struct page *page;
2046 swp_entry_t entry;
2047 unsigned int i;
2048
2049 if (!READ_ONCE(si->inuse_pages))
2050 return 0;
2051
2052 retry:
2053 retval = shmem_unuse(type);
2054 if (retval)
2055 return retval;
2056
2057 prev_mm = &init_mm;
2058 mmget(prev_mm);
2059
2060 spin_lock(&mmlist_lock);
2061 p = &init_mm.mmlist;
2062 while (READ_ONCE(si->inuse_pages) &&
2063 !signal_pending(current) &&
2064 (p = p->next) != &init_mm.mmlist) {
2065
2066 mm = list_entry(p, struct mm_struct, mmlist);
2067 if (!mmget_not_zero(mm))
2068 continue;
2069 spin_unlock(&mmlist_lock);
2070 mmput(prev_mm);
2071 prev_mm = mm;
2072 retval = unuse_mm(mm, type);
2073 if (retval) {
2074 mmput(prev_mm);
2075 return retval;
2076 }
2077
2078
2079
2080
2081
2082 cond_resched();
2083 spin_lock(&mmlist_lock);
2084 }
2085 spin_unlock(&mmlist_lock);
2086
2087 mmput(prev_mm);
2088
2089 i = 0;
2090 while (READ_ONCE(si->inuse_pages) &&
2091 !signal_pending(current) &&
2092 (i = find_next_to_unuse(si, i)) != 0) {
2093
2094 entry = swp_entry(type, i);
2095 page = find_get_page(swap_address_space(entry), i);
2096 if (!page)
2097 continue;
2098
/*
 * It is conceivable that a racing task removed this page from
 * swap cache just before we acquired the page lock. The page
 * might even be back in swap cache on another swap area. But
 * that is okay, try_to_free_swap() only removes stale pages.
 */
2105 lock_page(page);
2106 wait_on_page_writeback(page);
2107 try_to_free_swap(page);
2108 unlock_page(page);
2109 put_page(page);
2110 }
2111
/*
 * Lets check again to see if there are still swap entries in the map.
 * If yes, we would need to do retry the unuse logic again.
 * Under global memory pressure, swap entries can be reinserted back
 * into process space after the mmlist loop above passes over them.
 *
 * Limit the number of retries? No: when mmget_not_zero()
 * above fails, that mm is likely to be freeing swap from
 * exit_mmap(), which proceeds at its own independent pace;
 * and even shmem_writepage() could have been preempted after
 * get_swap_page(), temporarily hiding that swap.  It's easy
 * to flush out try_to_unuse() by keeping a retry loop here.
 */
2125 if (READ_ONCE(si->inuse_pages)) {
2126 if (!signal_pending(current))
2127 goto retry;
2128 return -EINTR;
2129 }
2130
2131 return 0;
2132 }
2133
/*
 * After a successful try_to_unuse, if no swap is now in use, we know
 * we can empty the mmlist.  swap_lock must be held on entry and exit.
 * Note that mmlist_lock nests inside swap_lock, and an mm must be
 * added to the mmlist just after page_duplicate - before would be racy.
 */
2140 static void drain_mmlist(void)
2141 {
2142 struct list_head *p, *next;
2143 unsigned int type;
2144
2145 for (type = 0; type < nr_swapfiles; type++)
2146 if (swap_info[type]->inuse_pages)
2147 return;
2148 spin_lock(&mmlist_lock);
2149 list_for_each_safe(p, next, &init_mm.mmlist)
2150 list_del_init(p);
2151 spin_unlock(&mmlist_lock);
2152 }
2153
/*
 * Free all of a swapdev's extent information
 */
2157 static void destroy_swap_extents(struct swap_info_struct *sis)
2158 {
2159 while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) {
2160 struct rb_node *rb = sis->swap_extent_root.rb_node;
2161 struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node);
2162
2163 rb_erase(rb, &sis->swap_extent_root);
2164 kfree(se);
2165 }
2166
2167 if (sis->flags & SWP_ACTIVATED) {
2168 struct file *swap_file = sis->swap_file;
2169 struct address_space *mapping = swap_file->f_mapping;
2170
2171 sis->flags &= ~SWP_ACTIVATED;
2172 if (mapping->a_ops->swap_deactivate)
2173 mapping->a_ops->swap_deactivate(swap_file);
2174 }
2175 }
2176
/*
 * Add a block range (and the corresponding page range) into this swapdev's
 * extent tree.
 *
 * This function rather assumes that it is called in ascending page order.
 */
2183 int
2184 add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
2185 unsigned long nr_pages, sector_t start_block)
2186 {
2187 struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL;
2188 struct swap_extent *se;
2189 struct swap_extent *new_se;
2190
/*
 * place the new node at the right most since the
 * function is called in ascending page order.
 */
2195 while (*link) {
2196 parent = *link;
2197 link = &parent->rb_right;
2198 }
2199
2200 if (parent) {
2201 se = rb_entry(parent, struct swap_extent, rb_node);
2202 BUG_ON(se->start_page + se->nr_pages != start_page);
2203 if (se->start_block + se->nr_pages == start_block) {
/* Merge it */
2205 se->nr_pages += nr_pages;
2206 return 0;
2207 }
2208 }
2209
/* No merge, insert a new extent. */
2211 new_se = kmalloc(sizeof(*se), GFP_KERNEL);
2212 if (new_se == NULL)
2213 return -ENOMEM;
2214 new_se->start_page = start_page;
2215 new_se->nr_pages = nr_pages;
2216 new_se->start_block = start_block;
2217
2218 rb_link_node(&new_se->rb_node, parent, link);
2219 rb_insert_color(&new_se->rb_node, &sis->swap_extent_root);
2220 return 1;
2221 }
2222 EXPORT_SYMBOL_GPL(add_swap_extent);
2223
/*
 * A `swap extent' is a simple thing which maps a contiguous range of pages
 * onto a contiguous range of disk blocks.  A rbtree of swap extents is
 * built at swapon time and is then used at swap_writepage/swap_readpage
 * time for locating where on disk a page belongs.
 *
 * If the swapfile is an S_ISBLK block device, a single extent is installed.
 * This is done so that the main operating code can be unchanged and work
 * with a swapfile of any size.
 *
 * However if the swapfile is an S_ISREG file, then swap_activate() is used
 * (or, failing that, generic_swapfile_activate()) to walk the file and
 * install an extent for each contiguous run of blocks.  Swapfiles are not
 * supposed to be very fragmented, so the extent tree stays a sensible size.
 *
 * The amount of disk space which a single swap extent represents varies.
 * Typically it is in the 1-4 megabyte range.  So we can have hundreds of
 * extents in the rbtree.
 */
2251 static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
2252 {
2253 struct file *swap_file = sis->swap_file;
2254 struct address_space *mapping = swap_file->f_mapping;
2255 struct inode *inode = mapping->host;
2256 int ret;
2257
2258 if (S_ISBLK(inode->i_mode)) {
2259 ret = add_swap_extent(sis, 0, sis->max, 0);
2260 *span = sis->pages;
2261 return ret;
2262 }
2263
2264 if (mapping->a_ops->swap_activate) {
2265 ret = mapping->a_ops->swap_activate(sis, swap_file, span);
2266 if (ret < 0)
2267 return ret;
2268 sis->flags |= SWP_ACTIVATED;
2269 if ((sis->flags & SWP_FS_OPS) &&
2270 sio_pool_init() != 0) {
2271 destroy_swap_extents(sis);
2272 return -ENOMEM;
2273 }
2274 return ret;
2275 }
2276
2277 return generic_swapfile_activate(sis, swap_file, span);
2278 }
2279
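/* NUMA node of the block device backing this swap area, for avail-list priorities. */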
2280 static int swap_node(struct swap_info_struct *p)
2281 {
2282 struct block_device *bdev;
2283
2284 if (p->bdev)
2285 bdev = p->bdev;
2286 else
2287 bdev = p->swap_file->f_inode->i_sb->s_bdev;
2288
2289 return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
2290 }
2291
2292 static void setup_swap_info(struct swap_info_struct *p, int prio,
2293 unsigned char *swap_map,
2294 struct swap_cluster_info *cluster_info)
2295 {
2296 int i;
2297
2298 if (prio >= 0)
2299 p->prio = prio;
2300 else
2301 p->prio = --least_priority;
/*
 * the plist prio is negated because plist ordering is
 * low-to-high, while swap ordering is high-to-low
 */
2306 p->list.prio = -p->prio;
2307 for_each_node(i) {
2308 if (p->prio >= 0)
2309 p->avail_lists[i].prio = -p->prio;
2310 else {
2311 if (swap_node(p) == i)
2312 p->avail_lists[i].prio = 1;
2313 else
2314 p->avail_lists[i].prio = -p->prio;
2315 }
2316 }
2317 p->swap_map = swap_map;
2318 p->cluster_info = cluster_info;
2319 }
2320
2321 static void _enable_swap_info(struct swap_info_struct *p)
2322 {
2323 p->flags |= SWP_WRITEOK;
2324 atomic_long_add(p->pages, &nr_swap_pages);
2325 total_swap_pages += p->pages;
2326
2327 assert_spin_locked(&swap_lock);
2328
/*
 * both lists are plists, and thus priority ordered.
 * swap_active_head needs to be priority ordered for swapoff(),
 * which on removal of any swap_info_struct with an auto-assigned
 * (i.e. negative) priority increments the auto-assigned priority
 * of any lower-priority swap_info_structs.
 * swap_avail_head needs to be priority ordered for get_swap_pages(),
 * which allocates swap pages from the highest available priority
 * swap_info_struct.
 */
2338 plist_add(&p->list, &swap_active_head);
2339 add_to_avail_list(p);
2340 }
2341
2342 static void enable_swap_info(struct swap_info_struct *p, int prio,
2343 unsigned char *swap_map,
2344 struct swap_cluster_info *cluster_info,
2345 unsigned long *frontswap_map)
2346 {
2347 if (IS_ENABLED(CONFIG_FRONTSWAP))
2348 frontswap_init(p->type, frontswap_map);
2349 spin_lock(&swap_lock);
2350 spin_lock(&p->lock);
2351 setup_swap_info(p, prio, swap_map, cluster_info);
2352 spin_unlock(&p->lock);
2353 spin_unlock(&swap_lock);
/*
 * Finished initializing swap device, now it's safe to reference it.
 */
2357 percpu_ref_resurrect(&p->users);
2358 spin_lock(&swap_lock);
2359 spin_lock(&p->lock);
2360 _enable_swap_info(p);
2361 spin_unlock(&p->lock);
2362 spin_unlock(&swap_lock);
2363 }
2364
2365 static void reinsert_swap_info(struct swap_info_struct *p)
2366 {
2367 spin_lock(&swap_lock);
2368 spin_lock(&p->lock);
2369 setup_swap_info(p, p->prio, p->swap_map, p->cluster_info);
2370 _enable_swap_info(p);
2371 spin_unlock(&p->lock);
2372 spin_unlock(&swap_lock);
2373 }
2374
2375 bool has_usable_swap(void)
2376 {
2377 bool ret = true;
2378
2379 spin_lock(&swap_lock);
2380 if (plist_head_empty(&swap_active_head))
2381 ret = false;
2382 spin_unlock(&swap_lock);
2383 return ret;
2384 }
2385
2386 SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
2387 {
2388 struct swap_info_struct *p = NULL;
2389 unsigned char *swap_map;
2390 struct swap_cluster_info *cluster_info;
2391 unsigned long *frontswap_map;
2392 struct file *swap_file, *victim;
2393 struct address_space *mapping;
2394 struct inode *inode;
2395 struct filename *pathname;
2396 int err, found = 0;
2397 unsigned int old_block_size;
2398
2399 if (!capable(CAP_SYS_ADMIN))
2400 return -EPERM;
2401
2402 BUG_ON(!current->mm);
2403
2404 pathname = getname(specialfile);
2405 if (IS_ERR(pathname))
2406 return PTR_ERR(pathname);
2407
2408 victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
2409 err = PTR_ERR(victim);
2410 if (IS_ERR(victim))
2411 goto out;
2412
2413 mapping = victim->f_mapping;
2414 spin_lock(&swap_lock);
2415 plist_for_each_entry(p, &swap_active_head, list) {
2416 if (p->flags & SWP_WRITEOK) {
2417 if (p->swap_file->f_mapping == mapping) {
2418 found = 1;
2419 break;
2420 }
2421 }
2422 }
2423 if (!found) {
2424 err = -EINVAL;
2425 spin_unlock(&swap_lock);
2426 goto out_dput;
2427 }
2428 if (!security_vm_enough_memory_mm(current->mm, p->pages))
2429 vm_unacct_memory(p->pages);
2430 else {
2431 err = -ENOMEM;
2432 spin_unlock(&swap_lock);
2433 goto out_dput;
2434 }
2435 del_from_avail_list(p);
2436 spin_lock(&p->lock);
2437 if (p->prio < 0) {
2438 struct swap_info_struct *si = p;
2439 int nid;
2440
2441 plist_for_each_entry_continue(si, &swap_active_head, list) {
2442 si->prio++;
2443 si->list.prio--;
2444 for_each_node(nid) {
2445 if (si->avail_lists[nid].prio != 1)
2446 si->avail_lists[nid].prio--;
2447 }
2448 }
2449 least_priority++;
2450 }
2451 plist_del(&p->list, &swap_active_head);
2452 atomic_long_sub(p->pages, &nr_swap_pages);
2453 total_swap_pages -= p->pages;
2454 p->flags &= ~SWP_WRITEOK;
2455 spin_unlock(&p->lock);
2456 spin_unlock(&swap_lock);
2457
2458 disable_swap_slots_cache_lock();
2459
2460 set_current_oom_origin();
2461 err = try_to_unuse(p->type);
2462 clear_current_oom_origin();
2463
2464 if (err) {
2465 /* re-insert swap space back into swap_list */
2466 reinsert_swap_info(p);
2467 reenable_swap_slots_cache_unlock();
2468 goto out_dput;
2469 }
2470
2471 reenable_swap_slots_cache_unlock();
2472
2473 /*
2474 * Wait for swap operations protected by get/put_swap_device()
2475 * to complete.
2476 *
2477 * We need synchronize_rcu() here to protect the accessing to
2478 * the swap cache data structure.
2479 */
2480 percpu_ref_kill(&p->users);
2481 synchronize_rcu();
2482 wait_for_completion(&p->comp);
2483
2484 flush_work(&p->discard_work);
2485
2486 destroy_swap_extents(p);
2487 if (p->flags & SWP_CONTINUED)
2488 free_swap_count_continuations(p);
2489
2490 if (!p->bdev || !bdev_nonrot(p->bdev))
2491 atomic_dec(&nr_rotate_swap);
2492
2493 mutex_lock(&swapon_mutex);
2494 spin_lock(&swap_lock);
2495 spin_lock(&p->lock);
2496 drain_mmlist();
2497
2498 /* wait for anyone still in scan_swap_map_slots */
2499 p->highest_bit = 0;	/* cuts scans short */
2500 while (p->flags >= SWP_SCANNING) {
2501 spin_unlock(&p->lock);
2502 spin_unlock(&swap_lock);
2503 schedule_timeout_uninterruptible(1);
2504 spin_lock(&swap_lock);
2505 spin_lock(&p->lock);
2506 }
2507
2508 swap_file = p->swap_file;
2509 old_block_size = p->old_block_size;
2510 p->swap_file = NULL;
2511 p->max = 0;
2512 swap_map = p->swap_map;
2513 p->swap_map = NULL;
2514 cluster_info = p->cluster_info;
2515 p->cluster_info = NULL;
2516 frontswap_map = frontswap_map_get(p);
2517 spin_unlock(&p->lock);
2518 spin_unlock(&swap_lock);
2519 arch_swap_invalidate_area(p->type);
2520 frontswap_invalidate_area(p->type);
2521 frontswap_map_set(p, NULL);
2522 mutex_unlock(&swapon_mutex);
2523 free_percpu(p->percpu_cluster);
2524 p->percpu_cluster = NULL;
2525 free_percpu(p->cluster_next_cpu);
2526 p->cluster_next_cpu = NULL;
2527 vfree(swap_map);
2528 kvfree(cluster_info);
2529 kvfree(frontswap_map);
2530
2531 swap_cgroup_swapoff(p->type);
2532 exit_swap_address_space(p->type);
2533
2534 inode = mapping->host;
2535 if (S_ISBLK(inode->i_mode)) {
2536 struct block_device *bdev = I_BDEV(inode);
2537
2538 set_blocksize(bdev, old_block_size);
2539 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2540 }
2541
2542 inode_lock(inode);
2543 inode->i_flags &= ~S_SWAPFILE;
2544 inode_unlock(inode);
2545 filp_close(swap_file, NULL);
2546
2547 /*
2548 * Clear the SWP_USED flag after all resources are freed so that swapon
2549 * can reuse this swap_info in alloc_swap_info() safely.  It is ok to
2550 * not hold p->lock after we cleared its SWP_WRITEOK.
2551 */
2552 spin_lock(&swap_lock);
2553 p->flags = 0;
2554 spin_unlock(&swap_lock);
2555
2556 err = 0;
2557 atomic_inc(&proc_poll_event);
2558 wake_up_interruptible(&proc_poll_wait);
2559
2560 out_dput:
2561 filp_close(victim, NULL);
2562 out:
2563 putname(pathname);
2564 return err;
2565 }
2566
2567 #ifdef CONFIG_PROC_FS
2568 static __poll_t swaps_poll(struct file *file, poll_table *wait)
2569 {
2570 struct seq_file *seq = file->private_data;
2571
2572 poll_wait(file, &proc_poll_wait, wait);
2573
2574 if (seq->poll_event != atomic_read(&proc_poll_event)) {
2575 seq->poll_event = atomic_read(&proc_poll_event);
2576 return EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI;
2577 }
2578
2579 return EPOLLIN | EPOLLRDNORM;
2580 }
2581
2582 /* /proc/swaps seq_file iterator */
2583 static void *swap_start(struct seq_file *swap, loff_t *pos)
2584 {
2585 struct swap_info_struct *si;
2586 int type;
2587 loff_t l = *pos;
2588
2589 mutex_lock(&swapon_mutex);
2590
2591 if (!l)
2592 return SEQ_START_TOKEN;
2593
2594 for (type = 0; (si = swap_type_to_swap_info(type)); type++) {
2595 if (!(si->flags & SWP_USED) || !si->swap_map)
2596 continue;
2597 if (!--l)
2598 return si;
2599 }
2600
2601 return NULL;
2602 }
2603
2604 static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
2605 {
2606 struct swap_info_struct *si = v;
2607 int type;
2608
2609 if (v == SEQ_START_TOKEN)
2610 type = 0;
2611 else
2612 type = si->type + 1;
2613
2614 ++(*pos);
2615 for (; (si = swap_type_to_swap_info(type)); type++) {
2616 if (!(si->flags & SWP_USED) || !si->swap_map)
2617 continue;
2618 return si;
2619 }
2620
2621 return NULL;
2622 }
2623
2624 static void swap_stop(struct seq_file *swap, void *v)
2625 {
2626 mutex_unlock(&swapon_mutex);
2627 }
2628
2629 static int swap_show(struct seq_file *swap, void *v)
2630 {
2631 struct swap_info_struct *si = v;
2632 struct file *file;
2633 int len;
2634 unsigned long bytes, inuse;
2635
2636 if (si == SEQ_START_TOKEN) {
2637 seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
2638 return 0;
2639 }
2640
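/* sizes are reported in KiB, as /proc/swaps always has */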
2641 bytes = si->pages << (PAGE_SHIFT - 10);
2642 inuse = READ_ONCE(si->inuse_pages) << (PAGE_SHIFT - 10);
2643
2644 file = si->swap_file;
2645 len = seq_file_path(swap, file, " \t\n\\");
2646 seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n",
2647 len < 40 ? 40 - len : 1, " ",
2648 S_ISBLK(file_inode(file)->i_mode) ?
2649 "partition" : "file\t",
2650 bytes, bytes < 10000000 ? "\t" : "",
2651 inuse, inuse < 10000000 ? "\t" : "",
2652 si->prio);
2653 return 0;
2654 }
2655
2656 static const struct seq_operations swaps_op = {
2657 .start = swap_start,
2658 .next = swap_next,
2659 .stop = swap_stop,
2660 .show = swap_show
2661 };
2662
2663 static int swaps_open(struct inode *inode, struct file *file)
2664 {
2665 struct seq_file *seq;
2666 int ret;
2667
2668 ret = seq_open(file, &swaps_op);
2669 if (ret)
2670 return ret;
2671
2672 seq = file->private_data;
2673 seq->poll_event = atomic_read(&proc_poll_event);
2674 return 0;
2675 }
2676
2677 static const struct proc_ops swaps_proc_ops = {
2678 .proc_flags = PROC_ENTRY_PERMANENT,
2679 .proc_open = swaps_open,
2680 .proc_read = seq_read,
2681 .proc_lseek = seq_lseek,
2682 .proc_release = seq_release,
2683 .proc_poll = swaps_poll,
2684 };
2685
2686 static int __init procswaps_init(void)
2687 {
2688 proc_create("swaps", 0, NULL, &swaps_proc_ops);
2689 return 0;
2690 }
2691 __initcall(procswaps_init);
2692 #endif
2693
2694 #ifdef MAX_SWAPFILES_CHECK
2695 static int __init max_swapfiles_check(void)
2696 {
2697 MAX_SWAPFILES_CHECK();
2698 return 0;
2699 }
2700 late_initcall(max_swapfiles_check);
2701 #endif
2702
2703 static struct swap_info_struct *alloc_swap_info(void)
2704 {
2705 struct swap_info_struct *p;
2706 struct swap_info_struct *defer = NULL;
2707 unsigned int type;
2708 int i;
2709
2710 p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL);
2711 if (!p)
2712 return ERR_PTR(-ENOMEM);
2713
2714 if (percpu_ref_init(&p->users, swap_users_ref_free,
2715 PERCPU_REF_INIT_DEAD, GFP_KERNEL)) {
2716 kvfree(p);
2717 return ERR_PTR(-ENOMEM);
2718 }
2719
2720 spin_lock(&swap_lock);
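/* find the first unused (or never-used) slot in swap_info[] */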
2721 for (type = 0; type < nr_swapfiles; type++) {
2722 if (!(swap_info[type]->flags & SWP_USED))
2723 break;
2724 }
2725 if (type >= MAX_SWAPFILES) {
2726 spin_unlock(&swap_lock);
2727 percpu_ref_exit(&p->users);
2728 kvfree(p);
2729 return ERR_PTR(-EPERM);
2730 }
2731 if (type >= nr_swapfiles) {
2732 p->type = type;
2733 /*
2734 * Publish the swap_info_struct only after it has been fully
2735 * initialised; readers look it up via swap_type_to_swap_info().
2736 */
2737 smp_store_release(&swap_info[type], p);
2738 nr_swapfiles++;
2739 } else {
2740 defer = p;
2741 p = swap_info[type];
2742 /*
2743 * Do not memset this entry: a racing procfs swap_next()
2744 * would be relying on p->type to remain valid.
2745 */
2746 }
2747 p->swap_extent_root = RB_ROOT;
2748 plist_node_init(&p->list, 0);
2749 for_each_node(i)
2750 plist_node_init(&p->avail_lists[i], 0);
2751 p->flags = SWP_USED;
2752 spin_unlock(&swap_lock);
2753 if (defer) {
2754 percpu_ref_exit(&defer->users);
2755 kvfree(defer);
2756 }
2757 spin_lock_init(&p->lock);
2758 spin_lock_init(&p->cont_lock);
2759 init_completion(&p->comp);
2760
2761 return p;
2762 }
2763
2764 static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
2765 {
2766 int error;
2767
2768 if (S_ISBLK(inode->i_mode)) {
2769 p->bdev = blkdev_get_by_dev(inode->i_rdev,
2770 FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
2771 if (IS_ERR(p->bdev)) {
2772 error = PTR_ERR(p->bdev);
2773 p->bdev = NULL;
2774 return error;
2775 }
2776 p->old_block_size = block_size(p->bdev);
2777 error = set_blocksize(p->bdev, PAGE_SIZE);
2778 if (error < 0)
2779 return error;
2780 /*
2781 * Zoned block devices contain zones that have a sequential
2782 * write only restriction.  Hence zoned block devices are not
2783 * suitable for swapping.  Disallow them here.
2784 */
2785 if (bdev_is_zoned(p->bdev))
2786 return -EINVAL;
2787 p->flags |= SWP_BLKDEV;
2788 } else if (S_ISREG(inode->i_mode)) {
2789 p->bdev = inode->i_sb->s_bdev;
2790 }
2791
2792 return 0;
2793 }
2794
2795 /*
2796 * Find out how many pages are allowed for a single swap device. There
2797 * are two limiting factors:
2798 * 1) the number of bits for the swap offset in the swp_entry_t type, and
2799 * 2) the number of bits in the swap pte, as defined by the different
2800 *    architectures.
2801 *
2802 * In order to find the largest possible bit mask, a swap entry with
2803 * swap type 0 and swap offset ~0UL is created, encoded to a swap pte,
2804 * decoded to a swp_entry_t again, and finally the swap offset is
2805 * extracted.
2806 *
2807 * This will mask all the bits from the initial ~0UL mask that can't
2808 * be encoded in either the swp_entry_t or the architecture definition
2809 * of a swap pte.
2810 */
2812 unsigned long generic_max_swapfile_size(void)
2813 {
2814 return swp_offset(pte_to_swp_entry(
2815 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
2816 }
2817
2818 /* Can be overridden by an architecture for additional checks. */
2819 __weak unsigned long max_swapfile_size(void)
2820 {
2821 return generic_max_swapfile_size();
2822 }
2823
2824 static unsigned long read_swap_header(struct swap_info_struct *p,
2825 union swap_header *swap_header,
2826 struct inode *inode)
2827 {
2828 int i;
2829 unsigned long maxpages;
2830 unsigned long swapfilepages;
2831 unsigned long last_page;
2832
2833 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
2834 pr_err("Unable to find swap-space signature\n");
2835 return 0;
2836 }
2837
2838 /* Handle a swap header written with the opposite endianness. */
2839 if (swab32(swap_header->info.version) == 1) {
2840 swab32s(&swap_header->info.version);
2841 swab32s(&swap_header->info.last_page);
2842 swab32s(&swap_header->info.nr_badpages);
2843 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
2844 return 0;
2845 for (i = 0; i < swap_header->info.nr_badpages; i++)
2846 swab32s(&swap_header->info.badpages[i]);
2847 }
2848
2849 if (swap_header->info.version != 1) {
2850 pr_warn("Unable to handle swap header version %d\n",
2851 swap_header->info.version);
2852 return 0;
2853 }
2854
2855 p->lowest_bit = 1;
2856 p->cluster_next = 1;
2857 p->cluster_nr = 0;
2858
2859 maxpages = max_swapfile_size();
2860 last_page = swap_header->info.last_page;
2861 if (!last_page) {
2862 pr_warn("Empty swap-file\n");
2863 return 0;
2864 }
2865 if (last_page > maxpages) {
2866 pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
2867 maxpages << (PAGE_SHIFT - 10),
2868 last_page << (PAGE_SHIFT - 10));
2869 }
2870 if (maxpages > last_page) {
2871 maxpages = last_page + 1;
2872 /* p->max is an unsigned int: don't overflow it */
2873 if ((unsigned int)maxpages == 0)
2874 maxpages = UINT_MAX;
2875 }
2876 p->highest_bit = maxpages - 1;
2877
2878 if (!maxpages)
2879 return 0;
2880 swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
2881 if (swapfilepages && maxpages > swapfilepages) {
2882 pr_warn("Swap area shorter than signature indicates\n");
2883 return 0;
2884 }
2885 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
2886 return 0;
2887 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
2888 return 0;
2889
2890 return maxpages;
2891 }
2892
2893 #define SWAP_CLUSTER_INFO_COLS \
2894 DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))
2895 #define SWAP_CLUSTER_SPACE_COLS \
2896 DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER)
2897 #define SWAP_CLUSTER_COLS \
2898 max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)
2899
2900 static int setup_swap_map_and_extents(struct swap_info_struct *p,
2901 union swap_header *swap_header,
2902 unsigned char *swap_map,
2903 struct swap_cluster_info *cluster_info,
2904 unsigned long maxpages,
2905 sector_t *span)
2906 {
2907 unsigned int j, k;
2908 unsigned int nr_good_pages;
2909 int nr_extents;
2910 unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
2911 unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS;
2912 unsigned long i, idx;
2913
2914 nr_good_pages = maxpages - 1;	/* omit header page */
2915
2916 cluster_list_init(&p->free_clusters);
2917 cluster_list_init(&p->discard_clusters);
2918
2919 for (i = 0; i < swap_header->info.nr_badpages; i++) {
2920 unsigned int page_nr = swap_header->info.badpages[i];
2921 if (page_nr == 0 || page_nr > swap_header->info.last_page)
2922 return -EINVAL;
2923 if (page_nr < maxpages) {
2924 swap_map[page_nr] = SWAP_MAP_BAD;
2925 nr_good_pages--;
2926 /*
2927 * Haven't marked the cluster free yet, no list
2928 * operation involved
2929 */
2930 inc_cluster_info_page(p, cluster_info, page_nr);
2931 }
2932 }
2933
2934 /* Haven't marked the cluster free yet, no list operation involved */
2935 for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
2936 inc_cluster_info_page(p, cluster_info, i);
2937
2938 if (nr_good_pages) {
2939 swap_map[0] = SWAP_MAP_BAD;
2940 /*
2941 * Haven't marked the cluster free yet, no list
2942 * operation involved
2943 */
2944 inc_cluster_info_page(p, cluster_info, 0);
2945 p->max = maxpages;
2946 p->pages = nr_good_pages;
2947 nr_extents = setup_swap_extents(p, span);
2948 if (nr_extents < 0)
2949 return nr_extents;
2950 nr_good_pages = p->pages;
2951 }
2952 if (!nr_good_pages) {
2953 pr_warn("Empty swap-file\n");
2954 return -EINVAL;
2955 }
2956
2957 if (!cluster_info)
2958 return nr_extents;
2959
2960 /*
2961 * Add free clusters to the free list column by column, to reduce
2962 * false cache line sharing between cluster_info entries and between
2963 * neighbouring clusters of the same swap address space.
2964 */
2965 for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
2966 j = (k + col) % SWAP_CLUSTER_COLS;
2967 for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
2968 idx = i * SWAP_CLUSTER_COLS + j;
2969 if (idx >= nr_clusters)
2970 continue;
2971 if (cluster_count(&cluster_info[idx]))
2972 continue;
2973 cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
2974 cluster_list_add_tail(&p->free_clusters, cluster_info,
2975 idx);
2976 }
2977 }
2978 return nr_extents;
2979 }
2980
2981 SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2982 {
2983 struct swap_info_struct *p;
2984 struct filename *name;
2985 struct file *swap_file = NULL;
2986 struct address_space *mapping;
2987 struct dentry *dentry;
2988 int prio;
2989 int error;
2990 union swap_header *swap_header;
2991 int nr_extents;
2992 sector_t span;
2993 unsigned long maxpages;
2994 unsigned char *swap_map = NULL;
2995 struct swap_cluster_info *cluster_info = NULL;
2996 unsigned long *frontswap_map = NULL;
2997 struct page *page = NULL;
2998 struct inode *inode = NULL;
2999 bool inced_nr_rotate_swap = false;
3000
3001 if (swap_flags & ~SWAP_FLAGS_VALID)
3002 return -EINVAL;
3003
3004 if (!capable(CAP_SYS_ADMIN))
3005 return -EPERM;
3006
3007 if (!swap_avail_heads)
3008 return -ENOMEM;
3009
3010 p = alloc_swap_info();
3011 if (IS_ERR(p))
3012 return PTR_ERR(p);
3013
3014 INIT_WORK(&p->discard_work, swap_discard_work);
3015
3016 name = getname(specialfile);
3017 if (IS_ERR(name)) {
3018 error = PTR_ERR(name);
3019 name = NULL;
3020 goto bad_swap;
3021 }
3022 swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0);
3023 if (IS_ERR(swap_file)) {
3024 error = PTR_ERR(swap_file);
3025 swap_file = NULL;
3026 goto bad_swap;
3027 }
3028
3029 p->swap_file = swap_file;
3030 mapping = swap_file->f_mapping;
3031 dentry = swap_file->f_path.dentry;
3032 inode = mapping->host;
3033
3034 error = claim_swapfile(p, inode);
3035 if (unlikely(error))
3036 goto bad_swap;
3037
3038 inode_lock(inode);
3039 if (d_unlinked(dentry) || cant_mount(dentry)) {
3040 error = -ENOENT;
3041 goto bad_swap_unlock_inode;
3042 }
3043 if (IS_SWAPFILE(inode)) {
3044 error = -EBUSY;
3045 goto bad_swap_unlock_inode;
3046 }
3047
3048 /*
3049 * Read the swap header.
3050 */
3051 if (!mapping->a_ops->read_folio) {
3052 error = -EINVAL;
3053 goto bad_swap_unlock_inode;
3054 }
3055 page = read_mapping_page(mapping, 0, swap_file);
3056 if (IS_ERR(page)) {
3057 error = PTR_ERR(page);
3058 goto bad_swap_unlock_inode;
3059 }
3060 swap_header = kmap(page);
3061
3062 maxpages = read_swap_header(p, swap_header, inode);
3063 if (unlikely(!maxpages)) {
3064 error = -EINVAL;
3065 goto bad_swap_unlock_inode;
3066 }
3067
3068 /* OK, set up the swap map and apply the bad block list */
3069 swap_map = vzalloc(maxpages);
3070 if (!swap_map) {
3071 error = -ENOMEM;
3072 goto bad_swap_unlock_inode;
3073 }
3074
3075 if (p->bdev && bdev_stable_writes(p->bdev))
3076 p->flags |= SWP_STABLE_WRITES;
3077
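/* rw_page-capable devices can service swap I/O synchronously */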
3078 if (p->bdev && p->bdev->bd_disk->fops->rw_page)
3079 p->flags |= SWP_SYNCHRONOUS_IO;
3080
3081 if (p->bdev && bdev_nonrot(p->bdev)) {
3082 int cpu;
3083 unsigned long ci, nr_cluster;
3084
3085 p->flags |= SWP_SOLIDSTATE;
3086 p->cluster_next_cpu = alloc_percpu(unsigned int);
3087 if (!p->cluster_next_cpu) {
3088 error = -ENOMEM;
3089 goto bad_swap_unlock_inode;
3090 }
3091 /*
3092 * Select a random position to start with, to help wear
3093 * leveling on SSDs.
3094 */
3095 for_each_possible_cpu(cpu) {
3096 per_cpu(*p->cluster_next_cpu, cpu) =
3097 1 + prandom_u32_max(p->highest_bit);
3098 }
3099 nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
3100
3101 cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info),
3102 GFP_KERNEL);
3103 if (!cluster_info) {
3104 error = -ENOMEM;
3105 goto bad_swap_unlock_inode;
3106 }
3107
3108 for (ci = 0; ci < nr_cluster; ci++)
3109 spin_lock_init(&((cluster_info + ci)->lock));
3110
3111 p->percpu_cluster = alloc_percpu(struct percpu_cluster);
3112 if (!p->percpu_cluster) {
3113 error = -ENOMEM;
3114 goto bad_swap_unlock_inode;
3115 }
3116 for_each_possible_cpu(cpu) {
3117 struct percpu_cluster *cluster;
3118 cluster = per_cpu_ptr(p->percpu_cluster, cpu);
3119 cluster_set_null(&cluster->index);
3120 }
3121 } else {
3122 atomic_inc(&nr_rotate_swap);
3123 inced_nr_rotate_swap = true;
3124 }
3125
3126 error = swap_cgroup_swapon(p->type, maxpages);
3127 if (error)
3128 goto bad_swap_unlock_inode;
3129
3130 nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
3131 cluster_info, maxpages, &span);
3132 if (unlikely(nr_extents < 0)) {
3133 error = nr_extents;
3134 goto bad_swap_unlock_inode;
3135 }
3136
3137 if (IS_ENABLED(CONFIG_FRONTSWAP))
3138 frontswap_map = kvcalloc(BITS_TO_LONGS(maxpages),
3139 sizeof(long),
3140 GFP_KERNEL);
3141
3142 if ((swap_flags & SWAP_FLAG_DISCARD) &&
3143 p->bdev && bdev_max_discard_sectors(p->bdev)) {
3144 /*
3145 * When discard is enabled for swap with no particular
3146 * policy flagged, we set all swap discard flags here in
3147 * order to sustain backward compatibility with older
3148 * swapon(8) releases.
3149 */
3150 p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
3151 SWP_PAGE_DISCARD);
3152
3153 /*
3154 * By flagging sys_swapon, a sysadmin can tell us to
3155 * either do single-time area discards only, or to just
3156 * perform discards for released swap page-clusters.
3157 * Now it's time to adjust the p->flags accordingly.
3158 */
3159 if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
3160 p->flags &= ~SWP_PAGE_DISCARD;
3161 else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
3162 p->flags &= ~SWP_AREA_DISCARD;
3163
3164 /* issue a swapon-time discard if it's still required */
3165 if (p->flags & SWP_AREA_DISCARD) {
3166 int err = discard_swap(p);
3167 if (unlikely(err))
3168 pr_err("swapon: discard_swap(%p): %d\n",
3169 p, err);
3170 }
3171 }
3172
3173 error = init_swap_address_space(p->type, maxpages);
3174 if (error)
3175 goto bad_swap_unlock_inode;
3176
3177 /*
3178 * Flush any pending IO and dirty mappings before we start using this
3179 * swap device.
3180 */
3181 inode->i_flags |= S_SWAPFILE;
3182 error = inode_drain_writes(inode);
3183 if (error) {
3184 inode->i_flags &= ~S_SWAPFILE;
3185 goto free_swap_address_space;
3186 }
3187
3188 mutex_lock(&swapon_mutex);
3189 prio = -1;
3190 if (swap_flags & SWAP_FLAG_PREFER)
3191 prio =
3192 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
3193 enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);
3194
3195 pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
3196 p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
3197 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
3198 (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
3199 (p->flags & SWP_DISCARDABLE) ? "D" : "",
3200 (p->flags & SWP_AREA_DISCARD) ? "s" : "",
3201 (p->flags & SWP_PAGE_DISCARD) ? "c" : "",
3202 (frontswap_map) ? "FS" : "");
3203
3204 mutex_unlock(&swapon_mutex);
3205 atomic_inc(&proc_poll_event);
3206 wake_up_interruptible(&proc_poll_wait);
3207
3208 error = 0;
3209 goto out;
3210 free_swap_address_space:
3211 exit_swap_address_space(p->type);
3212 bad_swap_unlock_inode:
3213 inode_unlock(inode);
3214 bad_swap:
3215 free_percpu(p->percpu_cluster);
3216 p->percpu_cluster = NULL;
3217 free_percpu(p->cluster_next_cpu);
3218 p->cluster_next_cpu = NULL;
3219 if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
3220 set_blocksize(p->bdev, p->old_block_size);
3221 blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3222 }
3223 inode = NULL;
3224 destroy_swap_extents(p);
3225 swap_cgroup_swapoff(p->type);
3226 spin_lock(&swap_lock);
3227 p->swap_file = NULL;
3228 p->flags = 0;
3229 spin_unlock(&swap_lock);
3230 vfree(swap_map);
3231 kvfree(cluster_info);
3232 kvfree(frontswap_map);
3233 if (inced_nr_rotate_swap)
3234 atomic_dec(&nr_rotate_swap);
3235 if (swap_file)
3236 filp_close(swap_file, NULL);
3237 out:
3238 if (page && !IS_ERR(page)) {
3239 kunmap(page);
3240 put_page(page);
3241 }
3242 if (name)
3243 putname(name);
3244 if (inode)
3245 inode_unlock(inode);
3246 if (!error)
3247 enable_swap_slots_cache();
3248 return error;
3249 }
3250
3251 void si_swapinfo(struct sysinfo *val)
3252 {
3253 unsigned int type;
3254 unsigned long nr_to_be_unused = 0;
3255
3256 spin_lock(&swap_lock);
3257 for (type = 0; type < nr_swapfiles; type++) {
3258 struct swap_info_struct *si = swap_info[type];
3259
3260 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
3261 nr_to_be_unused += READ_ONCE(si->inuse_pages);
3262 }
3263 val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
3264 val->totalswap = total_swap_pages + nr_to_be_unused;
3265 spin_unlock(&swap_lock);
3266 }
3267
3268 /*
3269 * Verify that a swap entry is valid and increment its swap map count.
3270 *
3271 * Returns error code in following case.
3272 * - success -> 0
3273 * - swp_entry is invalid -> EINVAL
3274 * - swp_entry is migration entry -> EINVAL
3275 * - swap-cache reference is requested but there is already one. -> EEXIST
3276 * - swap-cache reference is requested but the entry is not used. -> ENOENT
3277 * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
3278 */
3279 static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
3280 {
3281 struct swap_info_struct *p;
3282 struct swap_cluster_info *ci;
3283 unsigned long offset;
3284 unsigned char count;
3285 unsigned char has_cache;
3286 int err;
3287
3288 p = get_swap_device(entry);
3289 if (!p)
3290 return -EINVAL;
3291
3292 offset = swp_offset(entry);
3293 ci = lock_cluster_or_swap_info(p, offset);
3294
3295 count = p->swap_map[offset];
3296
3297 /*
3298 * swapin_readahead() doesn't check if a swap entry is valid, so the
3299 * swap entry could be SWAP_MAP_BAD. Check here with lock held.
3300 */
3301 if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
3302 err = -ENOENT;
3303 goto unlock_out;
3304 }
3305
3306 has_cache = count & SWAP_HAS_CACHE;
3307 count &= ~SWAP_HAS_CACHE;
3308 err = 0;
3309
3310 if (usage == SWAP_HAS_CACHE) {
3311
3312 /* set SWAP_HAS_CACHE if there is no cache and entry is used */
3313 if (!has_cache && count)
3314 has_cache = SWAP_HAS_CACHE;
3315 else if (has_cache)
3316 err = -EEXIST;
3317 else
3318 err = -ENOENT;
3319
3320 } else if (count || has_cache) {
3321
3322 if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
3323 count += usage;
3324 else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
3325 err = -EINVAL;
3326 else if (swap_count_continued(p, offset, count))
3327 count = COUNT_CONTINUED;
3328 else
3329 err = -ENOMEM;
3330 } else
3331 err = -ENOENT;	/* unused swap entry */
3332
3333 WRITE_ONCE(p->swap_map[offset], count | has_cache);
3334
3335 unlock_out:
3336 unlock_cluster_or_swap_info(p, ci);
3337 put_swap_device(p);
3338 return err;
3339 }
3340
3341 /*
3342 * Help swapoff by noting that swap entry belongs to shmem/tmpfs
3343 * (in which case its reference count is never incremented).
3344 */
3345 void swap_shmem_alloc(swp_entry_t entry)
3346 {
3347 __swap_duplicate(entry, SWAP_MAP_SHMEM);
3348 }
3349
3350 /*
3351 * Increase reference count of swap entry by 1.
3352 * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
3353 * but could not be atomically allocated.  Returns 0, just as if it succeeded,
3354 * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
3355 * might occur if a page table entry has got corrupted.
3356 */
3357 int swap_duplicate(swp_entry_t entry)
3358 {
3359 int err = 0;
3360
3361 while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
3362 err = add_swap_count_continuation(entry, GFP_ATOMIC);
3363 return err;
3364 }
3365
3366 /*
3367 * @entry: swap entry for which we allocate swap cache.
3368 *
3369 * Called when allocating swap cache for an existing swap entry.
3370 * This can return error codes. Returns 0 at success.
3371 * -EEXIST means there is a swap cache.
3372 * Note: return code is different from swap_duplicate().
3373 */
3374 int swapcache_prepare(swp_entry_t entry)
3375 {
3376 return __swap_duplicate(entry, SWAP_HAS_CACHE);
3377 }
3378
3379 struct swap_info_struct *swp_swap_info(swp_entry_t entry)
3380 {
3381 return swap_type_to_swap_info(swp_type(entry));
3382 }
3383
3384 struct swap_info_struct *page_swap_info(struct page *page)
3385 {
3386 swp_entry_t entry = { .val = page_private(page) };
3387 return swp_swap_info(entry);
3388 }
3389
3390 /*
3391 * out-of-line methods to avoid include hell.
3392 */
3393 struct address_space *swapcache_mapping(struct folio *folio)
3394 {
3395 return page_swap_info(&folio->page)->swap_file->f_mapping;
3396 }
3397 EXPORT_SYMBOL_GPL(swapcache_mapping);
3398
3399 pgoff_t __page_file_index(struct page *page)
3400 {
3401 swp_entry_t swap = { .val = page_private(page) };
3402 return swp_offset(swap);
3403 }
3404 EXPORT_SYMBOL_GPL(__page_file_index);
3405
3406 /*
3407 * add_swap_count_continuation - called when a swap count is duplicated
3408 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
3409 * page of the original vmalloc'ed swap_map, to hold the continuation count
3410 * (for that entry and for its neighbouring PAGE_SIZE swap entries).  Called
3411 * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
3412 *
3413 * These continuation pages are seldom referenced: the common paths all work
3414 * on the original swap_map, only referring to a continuation page when the
3415 * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
3416 *
3417 * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
3418 * page table locks, which at worst may be hopelessly unsuitable for
3419 * migration.
3420 */
3421 int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
3422 {
3423 struct swap_info_struct *si;
3424 struct swap_cluster_info *ci;
3425 struct page *head;
3426 struct page *page;
3427 struct page *list_page;
3428 pgoff_t offset;
3429 unsigned char count;
3430 int ret = 0;
3431
3432 /*
3433 * When debugging, it's easier to use __GFP_ZERO here; but it's better
3434 * for latency not to zero a page while GFP_ATOMIC and holding locks.
3435 */
3436 page = alloc_page(gfp_mask | __GFP_HIGHMEM);
3437
3438 si = get_swap_device(entry);
3439 if (!si) {
3440 /*
3441 * An acceptable race has occurred since the failing
3442 * __swap_duplicate(): the swap device may be swapoff
3443 */
3444 goto outer;
3445 }
3446 spin_lock(&si->lock);
3447
3448 offset = swp_offset(entry);
3449
3450 ci = lock_cluster(si, offset);
3451
3452 count = swap_count(si->swap_map[offset]);
3453
3454 if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
3455 /*
3456 * The higher the swap count, the more likely it is that tasks
3457 * will race to add swap count continuation: we need to avoid
3458 * over-provisioning.
3459 */
3460 goto out;
3461 }
3462
3463 if (!page) {
3464 ret = -ENOMEM;
3465 goto out;
3466 }
3467
3468 /*
3469 * We are fortunate that although vmalloc_to_page uses pte_offset_map,
3470 * no architecture is using highmem pages for kernel page tables: so it
3471 * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps.
3472 */
3473 head = vmalloc_to_page(si->swap_map + offset);
3474 offset &= ~PAGE_MASK;
3475
3476 spin_lock(&si->cont_lock);
3477
3478 /*
3479 * page_private(head) is nonzero once a continuation list has been set up.
3480 */
3481 if (!page_private(head)) {
3482 BUG_ON(count & COUNT_CONTINUED);
3483 INIT_LIST_HEAD(&head->lru);
3484 set_page_private(head, SWP_CONTINUED);
3485 si->flags |= SWP_CONTINUED;
3486 }
3487
3488 list_for_each_entry(list_page, &head->lru, lru) {
3489 unsigned char *map;
3490
3491 /*
3492 * If the previous map said no continuation, but we've found
3493 * a continuation page, free our allocation and use this one.
3494 */
3495 if (!(count & COUNT_CONTINUED))
3496 goto out_unlock_cont;
3497
3498 map = kmap_atomic(list_page) + offset;
3499 count = *map;
3500 kunmap_atomic(map);
3501
3502 /*
3503 * If this continuation count now has some space in it,
3504 * free our allocation and use this one.
3505 */
3506 if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
3507 goto out_unlock_cont;
3508 }
3509
3510 list_add_tail(&page->lru, &head->lru);
3511 page = NULL;
3512 out_unlock_cont:
3513 spin_unlock(&si->cont_lock);
3514 out:
3515 unlock_cluster(ci);
3516 spin_unlock(&si->lock);
3517 put_swap_device(si);
3518 outer:
3519 if (page)
3520 __free_page(page);
3521 return ret;
3522 }
3523
3524 /*
3525 * swap_count_continued - when the original swap_map count is incremented
3526 * from SWAP_MAP_MAX, check if there is already a continuation page to carry
3527 * into, carry if so, or else fail until a new continuation page is allocated;
3528 * when the original swap_map count is decremented from 0 with continuation,
3529 * borrow from the continuation and report whether it still holds more.
3530 * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster
3531 * lock.
3532 */
3533 static bool swap_count_continued(struct swap_info_struct *si,
3534 pgoff_t offset, unsigned char count)
3535 {
3536 struct page *head;
3537 struct page *page;
3538 unsigned char *map;
3539 bool ret;
3540
3541 head = vmalloc_to_page(si->swap_map + offset);
3542 if (page_private(head) != SWP_CONTINUED) {
3543 BUG_ON(count & COUNT_CONTINUED);
3544 return false;
3545 }
3546
3547 spin_lock(&si->cont_lock);
3548 offset &= ~PAGE_MASK;
3549 page = list_next_entry(head, lru);
3550 map = kmap_atomic(page) + offset;
3551
3552 if (count == SWAP_MAP_MAX)	/* initial increment from swap_map */
3553 goto init_map;	/* jump over SWAP_CONT_MAX checks */
3554
3555 if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
3556 /*
3557 * Think of how you add 1 to 999
3558 */
3559 while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
3560 kunmap_atomic(map);
3561 page = list_next_entry(page, lru);
3562 BUG_ON(page == head);
3563 map = kmap_atomic(page) + offset;
3564 }
3565 if (*map == SWAP_CONT_MAX) {
3566 kunmap_atomic(map);
3567 page = list_next_entry(page, lru);
3568 if (page == head) {
3569 ret = false;
3570 goto out;
3571 }
3572 map = kmap_atomic(page) + offset;
3573 init_map: *map = 0;	/* we didn't zero the page */
3574 }
3575 *map += 1;
3576 kunmap_atomic(map);
3577 while ((page = list_prev_entry(page, lru)) != head) {
3578 map = kmap_atomic(page) + offset;
3579 *map = COUNT_CONTINUED;
3580 kunmap_atomic(map);
3581 }
3582 ret = true;
3583
3584 } else {	/* decrementing */
3585 /*
3586 * Think of how you subtract 1 from 1000
3587 */
3588 BUG_ON(count != COUNT_CONTINUED);
3589 while (*map == COUNT_CONTINUED) {
3590 kunmap_atomic(map);
3591 page = list_next_entry(page, lru);
3592 BUG_ON(page == head);
3593 map = kmap_atomic(page) + offset;
3594 }
3595 BUG_ON(*map == 0);
3596 *map -= 1;
3597 if (*map == 0)
3598 count = 0;
3599 kunmap_atomic(map);
3600 while ((page = list_prev_entry(page, lru)) != head) {
3601 map = kmap_atomic(page) + offset;
3602 *map = SWAP_CONT_MAX | count;
3603 count = COUNT_CONTINUED;
3604 kunmap_atomic(map);
3605 }
3606 ret = count == COUNT_CONTINUED;
3607 }
3608 out:
3609 spin_unlock(&si->cont_lock);
3610 return ret;
3611 }
3612
3613 /*
3614 * free_swap_count_continuations - swapoff free all the continuation pages
3615 * appended to the swap_map, after swap_map is quiesced, before vfree'ing it.
3616 */
3617 static void free_swap_count_continuations(struct swap_info_struct *si)
3618 {
3619 pgoff_t offset;
3620
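/* each page of the vmalloc'ed swap_map covers PAGE_SIZE one-byte entries */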
3621 for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
3622 struct page *head;
3623 head = vmalloc_to_page(si->swap_map + offset);
3624 if (page_private(head)) {
3625 struct page *page, *next;
3626
3627 list_for_each_entry_safe(page, next, &head->lru, lru) {
3628 list_del(&page->lru);
3629 __free_page(page);
3630 }
3631 }
3632 }
3633 }
3634
3635 #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
3636 void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
3637 {
3638 struct swap_info_struct *si, *next;
3639 int nid = page_to_nid(page);
3640
3641 if (!(gfp_mask & __GFP_IO))
3642 return;
3643
3644 if (!blk_cgroup_congested())
3645 return;
3646
3647 /*
3648 * We've already scheduled a throttle, avoid taking the global swap
3649 * lock.
3650 */
3651 if (current->throttle_queue)
3652 return;
3653
3654 spin_lock(&swap_avail_lock);
3655 plist_for_each_entry_safe(si, next, &swap_avail_heads[nid],
3656 avail_lists[nid]) {
3657 if (si->bdev) {
3658 blkcg_schedule_throttle(bdev_get_queue(si->bdev), true);
3659 break;
3660 }
3661 }
3662 spin_unlock(&swap_avail_lock);
3663 }
3664 #endif
3665
3666 static int __init swapfile_init(void)
3667 {
3668 int nid;
3669
3670 swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head),
3671 GFP_KERNEL);
3672 if (!swap_avail_heads) {
3673 pr_emerg("Not enough memory for swap heads, swap is disabled\n");
3674 return -ENOMEM;
3675 }
3676
3677 for_each_node(nid)
3678 plist_head_init(&swap_avail_heads[nid]);
3679
3680 return 0;
3681 }
3682 subsys_initcall(swapfile_init);