
0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  * Copyright (C) 2018 HUAWEI, Inc.
0004  *             https://www.huawei.com/
0005  * Copyright (C) 2022 Alibaba Cloud
0006  */
0007 #include "zdata.h"
0008 #include "compress.h"
0009 #include <linux/prefetch.h>
0010 
0011 #include <trace/events/erofs.h>
0012 
0013 /*
0014  * since pclustersize is variable with the big pcluster feature, introduce
0015  * slab pools for pclusters of different sizes.
0016  */
0017 struct z_erofs_pcluster_slab {
0018     struct kmem_cache *slab;
0019     unsigned int maxpages;
0020     char name[48];
0021 };
0022 
0023 #define _PCLP(n) { .maxpages = n }
0024 
0025 static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = {
0026     _PCLP(1), _PCLP(4), _PCLP(16), _PCLP(64), _PCLP(128),
0027     _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES)
0028 };
0029 
0030 struct z_erofs_bvec_iter {
0031     struct page *bvpage;
0032     struct z_erofs_bvset *bvset;
0033     unsigned int nr, cur;
0034 };
0035 
0036 static struct page *z_erofs_bvec_iter_end(struct z_erofs_bvec_iter *iter)
0037 {
0038     if (iter->bvpage)
0039         kunmap_local(iter->bvset);
0040     return iter->bvpage;
0041 }
0042 
0043 static struct page *z_erofs_bvset_flip(struct z_erofs_bvec_iter *iter)
0044 {
0045     unsigned long base = (unsigned long)((struct z_erofs_bvset *)0)->bvec;
0046     /* have to access nextpage in advance, otherwise it will be unmapped */
0047     struct page *nextpage = iter->bvset->nextpage;
0048     struct page *oldpage;
0049 
0050     DBG_BUGON(!nextpage);
0051     oldpage = z_erofs_bvec_iter_end(iter);
0052     iter->bvpage = nextpage;
0053     iter->bvset = kmap_local_page(nextpage);
0054     iter->nr = (PAGE_SIZE - base) / sizeof(struct z_erofs_bvec);
0055     iter->cur = 0;
0056     return oldpage;
0057 }
0058 
0059 static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter,
0060                     struct z_erofs_bvset_inline *bvset,
0061                     unsigned int bootstrap_nr,
0062                     unsigned int cur)
0063 {
0064     *iter = (struct z_erofs_bvec_iter) {
0065         .nr = bootstrap_nr,
0066         .bvset = (struct z_erofs_bvset *)bvset,
0067     };
0068 
0069     while (cur > iter->nr) {
0070         cur -= iter->nr;
0071         z_erofs_bvset_flip(iter);
0072     }
0073     iter->cur = cur;
0074 }
0075 
0076 static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter,
0077                 struct z_erofs_bvec *bvec,
0078                 struct page **candidate_bvpage)
0079 {
0080     if (iter->cur == iter->nr) {
0081         if (!*candidate_bvpage)
0082             return -EAGAIN;
0083 
0084         DBG_BUGON(iter->bvset->nextpage);
0085         iter->bvset->nextpage = *candidate_bvpage;
0086         z_erofs_bvset_flip(iter);
0087 
0088         iter->bvset->nextpage = NULL;
0089         *candidate_bvpage = NULL;
0090     }
0091     iter->bvset->bvec[iter->cur++] = *bvec;
0092     return 0;
0093 }
0094 
0095 static void z_erofs_bvec_dequeue(struct z_erofs_bvec_iter *iter,
0096                  struct z_erofs_bvec *bvec,
0097                  struct page **old_bvpage)
0098 {
0099     if (iter->cur == iter->nr)
0100         *old_bvpage = z_erofs_bvset_flip(iter);
0101     else
0102         *old_bvpage = NULL;
0103     *bvec = iter->bvset->bvec[iter->cur++];
0104 }
0105 
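The three helpers above implement a paged vector: bvec entries fill page-sized arrays, and each array's trailing nextpage link chains to the next page of entries, which is why iter->nr is recomputed from the space left after the header. Below is a minimal userspace sketch of the same layout; PAGE_SIZE, the struct names, and the malloc-based paging are simplified stand-ins for the kernel's kmap-based version:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096

struct bvec { void *page; int offset, end; };

/* one page worth of entries, chained to the next page like bvset */
struct bvset {
    struct bvset *nextpage;     /* stand-in for the kernel's page link */
    struct bvec bvec[];
};

/* entries per page after the header, i.e. iter->nr above */
#define BVSET_NR ((PAGE_SIZE - offsetof(struct bvset, bvec)) / sizeof(struct bvec))

int main(void)
{
    struct bvset *head = calloc(1, PAGE_SIZE), *cur = head;
    unsigned int i, n = 2 * BVSET_NR + 3, slot = 0, pages = 1;

    for (i = 0; i < n; i++) {
        if (slot == BVSET_NR) {     /* "flip" to a fresh page */
            cur->nextpage = calloc(1, PAGE_SIZE);
            cur = cur->nextpage;
            slot = 0;
            pages++;
        }
        cur->bvec[slot++].end = (int)i;
    }
    printf("%zu entries per page, %u pages used for %u entries\n",
           BVSET_NR, pages, n);
    return 0;   /* sketch only: pages are leaked on exit */
}
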
0106 static void z_erofs_destroy_pcluster_pool(void)
0107 {
0108     int i;
0109 
0110     for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
0111         if (!pcluster_pool[i].slab)
0112             continue;
0113         kmem_cache_destroy(pcluster_pool[i].slab);
0114         pcluster_pool[i].slab = NULL;
0115     }
0116 }
0117 
0118 static int z_erofs_create_pcluster_pool(void)
0119 {
0120     struct z_erofs_pcluster_slab *pcs;
0121     struct z_erofs_pcluster *a;
0122     unsigned int size;
0123 
0124     for (pcs = pcluster_pool;
0125          pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) {
0126         size = struct_size(a, compressed_bvecs, pcs->maxpages);
0127 
0128         sprintf(pcs->name, "erofs_pcluster-%u", pcs->maxpages);
0129         pcs->slab = kmem_cache_create(pcs->name, size, 0,
0130                           SLAB_RECLAIM_ACCOUNT, NULL);
0131         if (pcs->slab)
0132             continue;
0133 
0134         z_erofs_destroy_pcluster_pool();
0135         return -ENOMEM;
0136     }
0137     return 0;
0138 }
0139 
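struct_size(a, compressed_bvecs, pcs->maxpages) above sizes an object whose flexible compressed_bvecs[] array holds maxpages entries, with overflow checks in the kernel helper. A hedged sketch of the plain arithmetic it reduces to, using a cut-down stand-in for struct z_erofs_pcluster:

#include <stddef.h>
#include <stdio.h>

struct bvec { void *page; int offset, end; };

/* simplified stand-in for struct z_erofs_pcluster */
struct pcluster {
    unsigned int pclusterpages;
    struct bvec compressed_bvecs[];     /* flexible array member */
};

int main(void)
{
    unsigned int maxpages[] = { 1, 4, 16, 64, 128 };
    unsigned int i;

    for (i = 0; i < sizeof(maxpages) / sizeof(maxpages[0]); i++) {
        /* what struct_size() boils down to (minus overflow checks) */
        size_t size = offsetof(struct pcluster, compressed_bvecs) +
                  (size_t)maxpages[i] * sizeof(struct bvec);
        printf("maxpages=%3u -> object size %zu bytes\n",
               maxpages[i], size);
    }
    return 0;
}
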
0140 static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages)
0141 {
0142     int i;
0143 
0144     for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
0145         struct z_erofs_pcluster_slab *pcs = pcluster_pool + i;
0146         struct z_erofs_pcluster *pcl;
0147 
0148         if (nrpages > pcs->maxpages)
0149             continue;
0150 
0151         pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS);
0152         if (!pcl)
0153             return ERR_PTR(-ENOMEM);
0154         pcl->pclusterpages = nrpages;
0155         return pcl;
0156     }
0157     return ERR_PTR(-EINVAL);
0158 }
0159 
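Since pcluster_pool[] is sorted by maxpages, the loop above is a first-fit search: a request is served by the smallest slab that covers it, and anything larger than the biggest bucket fails. A tiny sketch of that rounding behavior, with the bucket sizes copied from the table above (the trailing Z_EROFS_PCLUSTER_MAX_PAGES bucket is omitted since its value isn't shown here):

#include <stdio.h>

static const unsigned int buckets[] = { 1, 4, 16, 64, 128 };

/* return the bucket an nrpages request is served from, or -1 */
static int pick_bucket(unsigned int nrpages)
{
    unsigned int i;

    for (i = 0; i < sizeof(buckets) / sizeof(buckets[0]); i++)
        if (nrpages <= buckets[i])
            return (int)buckets[i];
    return -1;  /* like ERR_PTR(-EINVAL) above */
}

int main(void)
{
    unsigned int tests[] = { 1, 2, 5, 64, 65, 1000 };
    unsigned int i;

    for (i = 0; i < sizeof(tests) / sizeof(tests[0]); i++)
        printf("nrpages=%4u -> bucket %d\n", tests[i],
               pick_bucket(tests[i]));
    return 0;
}
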
0160 static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl)
0161 {
0162     unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
0163     int i;
0164 
0165     for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
0166         struct z_erofs_pcluster_slab *pcs = pcluster_pool + i;
0167 
0168         if (pclusterpages > pcs->maxpages)
0169             continue;
0170 
0171         kmem_cache_free(pcs->slab, pcl);
0172         return;
0173     }
0174     DBG_BUGON(1);
0175 }
0176 
0177 /* how to allocate cached pages for a pcluster */
0178 enum z_erofs_cache_alloctype {
0179     DONTALLOC,  /* don't allocate any cached pages */
0180     /*
0181      * try to use cached I/O if page allocation succeeds, or fall back
0182      * to in-place I/O instead to avoid any direct reclaim.
0183      */
0184     TRYALLOC,
0185 };
0186 
0187 /*
0188  * tagged pointer with 1-bit tag for all compressed pages
0189  * tag 1 - the page was just found with an extra page reference
0190  */
0191 typedef tagptr1_t compressed_page_t;
0192 
0193 #define tag_compressed_page_justfound(page) \
0194     tagptr_fold(compressed_page_t, page, 1)
0195 
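compressed_page_t packs the 1-bit "justfound" tag into the low bit of a page pointer, which is safe because page structures are word-aligned. A userspace sketch of the fold/unfold round trip; the helper names mimic the kernel's tagptr API but the implementation here is an illustrative stand-in:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef uintptr_t tagptr1_t;    /* 1 low bit available for a tag */

static tagptr1_t tagptr_fold(void *ptr, unsigned int tag)
{
    assert(((uintptr_t)ptr & 1) == 0);  /* pointer must be aligned */
    return (uintptr_t)ptr | (tag & 1);
}

static void *tagptr_unfold_ptr(tagptr1_t t)
{
    return (void *)(t & ~(uintptr_t)1);
}

static unsigned int tagptr_unfold_tags(tagptr1_t t)
{
    return t & 1;
}

int main(void)
{
    int dummy;  /* stands in for a struct page */
    tagptr1_t t = tagptr_fold(&dummy, 1);   /* "justfound" */

    printf("ptr ok: %d, tag: %u\n",
           tagptr_unfold_ptr(t) == (void *)&dummy, tagptr_unfold_tags(t));
    return 0;
}
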
0196 static struct workqueue_struct *z_erofs_workqueue __read_mostly;
0197 
0198 void z_erofs_exit_zip_subsystem(void)
0199 {
0200     destroy_workqueue(z_erofs_workqueue);
0201     z_erofs_destroy_pcluster_pool();
0202 }
0203 
0204 static inline int z_erofs_init_workqueue(void)
0205 {
0206     const unsigned int onlinecpus = num_possible_cpus();
0207 
0208     /*
0209      * no need to spawn too many threads; limiting the thread count
0210      * minimizes scheduling overhead. Perhaps per-CPU threads would be better?
0211      */
0212     z_erofs_workqueue = alloc_workqueue("erofs_unzipd",
0213                         WQ_UNBOUND | WQ_HIGHPRI,
0214                         onlinecpus + onlinecpus / 4);
0215     return z_erofs_workqueue ? 0 : -ENOMEM;
0216 }
0217 
0218 int __init z_erofs_init_zip_subsystem(void)
0219 {
0220     int err = z_erofs_create_pcluster_pool();
0221 
0222     if (err)
0223         return err;
0224     err = z_erofs_init_workqueue();
0225     if (err)
0226         z_erofs_destroy_pcluster_pool();
0227     return err;
0228 }
0229 
0230 enum z_erofs_pclustermode {
0231     Z_EROFS_PCLUSTER_INFLIGHT,
0232     /*
0233      * The current pcluster was the tail of an existing chain, and all
0234      * previously processed chained pclusters have been decided to
0235      * be hooked up to it.
0236      * A new chain will be created for the remaining pclusters which are
0237      * not processed yet, so different from Z_EROFS_PCLUSTER_FOLLOWED,
0238      * the next pcluster cannot reuse the whole page safely for inplace I/O
0239      * in the following scenario:
0240      *  ________________________________________________________________
0241      * |      tail (partial) page     |       head (partial) page       |
0242      * |   (belongs to the next pcl)  |   (belongs to the current pcl)  |
0243      * |_______PCLUSTER_FOLLOWED______|________PCLUSTER_HOOKED__________|
0244      */
0245     Z_EROFS_PCLUSTER_HOOKED,
0246     /*
0247      * a weak form of Z_EROFS_PCLUSTER_FOLLOWED; the difference is that it
0248      * could be dispatched into the bypass queue later due to up-to-date
0249      * managed pages. None of the related online pages can be reused for
0250      * inplace I/O (or bvpage) since it can be decoded without I/O.
0251      */
0252     Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE,
0253     /*
0254      * The current collection has been linked into the owned chain, and
0255      * could also be linked with further collections, which means that
0256      * if the page being processed is the tail page of the collection,
0257      * the current collection can safely use the whole page (since
0258      * the previous collection is under control) for in-place I/O, as
0259      * illustrated below:
0260      *  ________________________________________________________________
0261      * |  tail (partial) page |          head (partial) page           |
0262      * |  (of the current cl) |      (of the previous collection)      |
0263      * | PCLUSTER_FOLLOWED or |                                        |
0264      * |_____PCLUSTER_HOOKED__|___________PCLUSTER_FOLLOWED____________|
0265      *
0266      * [  (*) the above page can be used as inplace I/O.               ]
0267      */
0268     Z_EROFS_PCLUSTER_FOLLOWED,
0269 };
0270 
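Note that the enum is deliberately ordered from weakest to strongest ownership so that later code can gate behavior with plain comparisons. A standalone restatement of the two predicates used further down in this file (the local enum mirrors the ordering above):

#include <stdio.h>

/* values mirror the ordering of the enum above, weakest first */
enum mode { INFLIGHT, HOOKED, FOLLOWED_NOINPLACE, FOLLOWED };

/* same predicate shape as the "tight" check in z_erofs_do_read_page() */
static int partial_page_reusable(enum mode m)
{
    return m >= HOOKED && m != FOLLOWED_NOINPLACE;
}

/* only a FOLLOWED pcluster owns its whole tail page */
static int whole_page_reusable(enum mode m)
{
    return m >= FOLLOWED;
}

int main(void)
{
    enum mode m;

    for (m = INFLIGHT; m <= FOLLOWED; m++)
        printf("mode %d: partial=%d whole=%d\n",
               m, partial_page_reusable(m), whole_page_reusable(m));
    return 0;
}
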
0271 struct z_erofs_decompress_frontend {
0272     struct inode *const inode;
0273     struct erofs_map_blocks map;
0274     struct z_erofs_bvec_iter biter;
0275 
0276     struct page *candidate_bvpage;
0277     struct z_erofs_pcluster *pcl, *tailpcl;
0278     z_erofs_next_pcluster_t owned_head;
0279     enum z_erofs_pclustermode mode;
0280 
0281     bool readahead;
0282     /* used for applying cache strategy on the fly */
0283     bool backmost;
0284     erofs_off_t headoffset;
0285 
0286     /* an index used to pick up inplace I/O pages */
0287     unsigned int icur;
0288 };
0289 
0290 #define DECOMPRESS_FRONTEND_INIT(__i) { \
0291     .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \
0292     .mode = Z_EROFS_PCLUSTER_FOLLOWED, .backmost = true }
0293 
0294 static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe,
0295                    enum z_erofs_cache_alloctype type,
0296                    struct page **pagepool)
0297 {
0298     struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode));
0299     struct z_erofs_pcluster *pcl = fe->pcl;
0300     bool standalone = true;
0301     /*
0302      * optimistic allocation without direct reclaim, since inplace I/O
0303      * can be used under low memory instead.
0304      */
0305     gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) |
0306             __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
0307     unsigned int i;
0308 
0309     if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
0310         return;
0311 
0312     for (i = 0; i < pcl->pclusterpages; ++i) {
0313         struct page *page;
0314         compressed_page_t t;
0315         struct page *newpage = NULL;
0316 
0317         /* the compressed page was loaded before */
0318         if (READ_ONCE(pcl->compressed_bvecs[i].page))
0319             continue;
0320 
0321         page = find_get_page(mc, pcl->obj.index + i);
0322 
0323         if (page) {
0324             t = tag_compressed_page_justfound(page);
0325         } else {
0326             /* I/O is needed, it's not possible to decompress directly */
0327             standalone = false;
0328             switch (type) {
0329             case TRYALLOC:
0330                 newpage = erofs_allocpage(pagepool, gfp);
0331                 if (!newpage)
0332                     continue;
0333                 set_page_private(newpage,
0334                          Z_EROFS_PREALLOCATED_PAGE);
0335                 t = tag_compressed_page_justfound(newpage);
0336                 break;
0337             default:        /* DONTALLOC */
0338                 continue;
0339             }
0340         }
0341 
0342         if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL,
0343                      tagptr_cast_ptr(t)))
0344             continue;
0345 
0346         if (page)
0347             put_page(page);
0348         else if (newpage)
0349             erofs_pagepool_add(pagepool, newpage);
0350     }
0351 
0352     /*
0353      * don't do inplace I/O if all compressed pages are available in
0354      * managed cache since it can be moved to the bypass queue instead.
0355      */
0356     if (standalone)
0357         fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
0358 }
0359 
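Each compressed page slot is claimed with a relaxed cmpxchg from NULL, so concurrent readers racing on the same pcluster install a page exactly once and the loser releases its own copy. A userspace sketch of that claim-or-release pattern using C11 atomics; the cache lookup and gfp details are elided:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

/* one compressed page slot, NULL until somebody claims it */
static _Atomic(void *) slot;

/* returns 1 if we installed 'page', 0 if someone beat us to it */
static int claim_slot(void *page)
{
    void *expected = NULL;

    return atomic_compare_exchange_strong_explicit(&slot, &expected, page,
                                                   memory_order_relaxed,
                                                   memory_order_relaxed);
}

int main(void)
{
    void *a = malloc(16), *b = malloc(16);

    printf("first claim:  %d\n", claim_slot(a));    /* wins */
    printf("second claim: %d\n", claim_slot(b));    /* loses */
    free(b);    /* loser releases, like put_page()/erofs_pagepool_add() */
    free(a);
    return 0;
}
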
0360 /* called by erofs_shrinker to get rid of all compressed_pages */
0361 int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
0362                        struct erofs_workgroup *grp)
0363 {
0364     struct z_erofs_pcluster *const pcl =
0365         container_of(grp, struct z_erofs_pcluster, obj);
0366     int i;
0367 
0368     DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
0369     /*
0370      * the workgroup refcount is now frozen at 1, therefore there is
0371      * no need to worry about concurrent decompression users.
0372      */
0373     for (i = 0; i < pcl->pclusterpages; ++i) {
0374         struct page *page = pcl->compressed_bvecs[i].page;
0375 
0376         if (!page)
0377             continue;
0378 
0379         /* block other users from reclaiming or migrating the page */
0380         if (!trylock_page(page))
0381             return -EBUSY;
0382 
0383         if (!erofs_page_is_managed(sbi, page))
0384             continue;
0385 
0386         /* barrier is implied in the following 'unlock_page' */
0387         WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
0388         detach_page_private(page);
0389         unlock_page(page);
0390     }
0391     return 0;
0392 }
0393 
0394 int erofs_try_to_free_cached_page(struct page *page)
0395 {
0396     struct z_erofs_pcluster *const pcl = (void *)page_private(page);
0397     int ret, i;
0398 
0399     if (!erofs_workgroup_try_to_freeze(&pcl->obj, 1))
0400         return 0;
0401 
0402     ret = 0;
0403     DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
0404     for (i = 0; i < pcl->pclusterpages; ++i) {
0405         if (pcl->compressed_bvecs[i].page == page) {
0406             WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
0407             ret = 1;
0408             break;
0409         }
0410     }
0411     erofs_workgroup_unfreeze(&pcl->obj, 1);
0412     if (ret)
0413         detach_page_private(page);
0414     return ret;
0415 }
0416 
0417 static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
0418                    struct z_erofs_bvec *bvec)
0419 {
0420     struct z_erofs_pcluster *const pcl = fe->pcl;
0421 
0422     while (fe->icur > 0) {
0423         if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page,
0424                  NULL, bvec->page)) {
0425             pcl->compressed_bvecs[fe->icur] = *bvec;
0426             return true;
0427         }
0428     }
0429     return false;
0430 }
0431 
0432 /* callers must hold the pcluster lock */
0433 static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
0434                    struct z_erofs_bvec *bvec, bool exclusive)
0435 {
0436     int ret;
0437 
0438     if (exclusive) {
0439         /* give priority to in-place I/O by using file pages first */
0440         if (z_erofs_try_inplace_io(fe, bvec))
0441             return 0;
0442         /* otherwise, check if it can be used as a bvpage */
0443         if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
0444             !fe->candidate_bvpage)
0445             fe->candidate_bvpage = bvec->page;
0446     }
0447     ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage);
0448     fe->pcl->vcnt += (ret >= 0);
0449     return ret;
0450 }
0451 
0452 static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f)
0453 {
0454     struct z_erofs_pcluster *pcl = f->pcl;
0455     z_erofs_next_pcluster_t *owned_head = &f->owned_head;
0456 
0457     /* type 1, nil pcluster (this pcluster doesn't belong to any chain.) */
0458     if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL,
0459             *owned_head) == Z_EROFS_PCLUSTER_NIL) {
0460         *owned_head = &pcl->next;
0461         /* so we can attach this pcluster to our submission chain. */
0462         f->mode = Z_EROFS_PCLUSTER_FOLLOWED;
0463         return;
0464     }
0465 
0466     /*
0467      * type 2, link to the end of an existing open chain, be careful
0468      * that its submission is controlled by the original attached chain.
0469      */
0470     if (*owned_head != &pcl->next && pcl != f->tailpcl &&
0471         cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
0472             *owned_head) == Z_EROFS_PCLUSTER_TAIL) {
0473         *owned_head = Z_EROFS_PCLUSTER_TAIL;
0474         f->mode = Z_EROFS_PCLUSTER_HOOKED;
0475         f->tailpcl = NULL;
0476         return;
0477     }
0478     /* type 3, it belongs to a chain, but it isn't the end of the chain */
0479     f->mode = Z_EROFS_PCLUSTER_INFLIGHT;
0480 }
0481 
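Claiming is lock-free on pcl->next: one cmpxchg either appends a free (NIL) pcluster to our chain, or hooks an open chain's TAIL onto it; if both fail, the pcluster stays in-flight in someone else's chain. A simplified single-threaded sketch of the three outcomes, with stand-in sentinels:

#include <stdatomic.h>
#include <stdio.h>

#define NIL  ((void *)0)
#define TAIL ((void *)0x1)  /* stand-in, like Z_EROFS_PCLUSTER_TAIL */

struct pcl { _Atomic(void *) next; };

static const char *try_claim(struct pcl *p, void *owned_head)
{
    void *exp;

    exp = NIL;  /* type 1: a free pcluster joins our chain */
    if (atomic_compare_exchange_strong(&p->next, &exp, owned_head))
        return "followed";

    exp = TAIL; /* type 2: hook onto the end of an open chain */
    if (atomic_compare_exchange_strong(&p->next, &exp, owned_head))
        return "hooked";

    return "inflight";  /* type 3: mid-chain, leave it alone */
}

int main(void)
{
    struct pcl a = { NIL }, b = { TAIL }, c = { (void *)0x42 };
    void *head = TAIL;  /* our (still empty) submission chain */

    printf("a: %s\n", try_claim(&a, head));
    printf("b: %s\n", try_claim(&b, head));
    printf("c: %s\n", try_claim(&c, head));
    return 0;
}
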
0482 static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
0483 {
0484     struct erofs_map_blocks *map = &fe->map;
0485     bool ztailpacking = map->m_flags & EROFS_MAP_META;
0486     struct z_erofs_pcluster *pcl;
0487     struct erofs_workgroup *grp;
0488     int err;
0489 
0490     if (!(map->m_flags & EROFS_MAP_ENCODED)) {
0491         DBG_BUGON(1);
0492         return -EFSCORRUPTED;
0493     }
0494 
0495     /* no available pcluster, let's allocate one */
0496     pcl = z_erofs_alloc_pcluster(ztailpacking ? 1 :
0497                      map->m_plen >> PAGE_SHIFT);
0498     if (IS_ERR(pcl))
0499         return PTR_ERR(pcl);
0500 
0501     atomic_set(&pcl->obj.refcount, 1);
0502     pcl->algorithmformat = map->m_algorithmformat;
0503     pcl->length = 0;
0504     pcl->partial = true;
0505 
0506     /* new pclusters should be claimed as type 1, primary and followed */
0507     pcl->next = fe->owned_head;
0508     pcl->pageofs_out = map->m_la & ~PAGE_MASK;
0509     fe->mode = Z_EROFS_PCLUSTER_FOLLOWED;
0510 
0511     /*
0512      * lock all primary followed works before they become visible to
0513      * others; mutex_trylock *never* fails for a new pcluster.
0514      */
0515     mutex_init(&pcl->lock);
0516     DBG_BUGON(!mutex_trylock(&pcl->lock));
0517 
0518     if (ztailpacking) {
0519         pcl->obj.index = 0; /* which indicates ztailpacking */
0520         pcl->pageofs_in = erofs_blkoff(map->m_pa);
0521         pcl->tailpacking_size = map->m_plen;
0522     } else {
0523         pcl->obj.index = map->m_pa >> PAGE_SHIFT;
0524 
0525         grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj);
0526         if (IS_ERR(grp)) {
0527             err = PTR_ERR(grp);
0528             goto err_out;
0529         }
0530 
0531         if (grp != &pcl->obj) {
0532             fe->pcl = container_of(grp,
0533                     struct z_erofs_pcluster, obj);
0534             err = -EEXIST;
0535             goto err_out;
0536         }
0537     }
0538     /* used to check tail merging loop due to corrupted images */
0539     if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
0540         fe->tailpcl = pcl;
0541     fe->owned_head = &pcl->next;
0542     fe->pcl = pcl;
0543     return 0;
0544 
0545 err_out:
0546     mutex_unlock(&pcl->lock);
0547     z_erofs_free_pcluster(pcl);
0548     return err;
0549 }
0550 
0551 static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe)
0552 {
0553     struct erofs_map_blocks *map = &fe->map;
0554     struct erofs_workgroup *grp = NULL;
0555     int ret;
0556 
0557     DBG_BUGON(fe->pcl);
0558 
0559     /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */
0560     DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL);
0561     DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
0562 
0563     if (!(map->m_flags & EROFS_MAP_META)) {
0564         grp = erofs_find_workgroup(fe->inode->i_sb,
0565                        map->m_pa >> PAGE_SHIFT);
0566     } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) {
0567         DBG_BUGON(1);
0568         return -EFSCORRUPTED;
0569     }
0570 
0571     if (grp) {
0572         fe->pcl = container_of(grp, struct z_erofs_pcluster, obj);
0573         ret = -EEXIST;
0574     } else {
0575         ret = z_erofs_register_pcluster(fe);
0576     }
0577 
0578     if (ret == -EEXIST) {
0579         mutex_lock(&fe->pcl->lock);
0580         /* used to check tail merging loop due to corrupted images */
0581         if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
0582             fe->tailpcl = fe->pcl;
0583 
0584         z_erofs_try_to_claim_pcluster(fe);
0585     } else if (ret) {
0586         return ret;
0587     }
0588     z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset,
0589                 Z_EROFS_INLINE_BVECS, fe->pcl->vcnt);
0590     /* since file-backed online pages are traversed in reverse order */
0591     fe->icur = z_erofs_pclusterpages(fe->pcl);
0592     return 0;
0593 }
0594 
0595 /*
0596  * keep in mind that referenced pclusters are freed
0597  * only after an RCU grace period.
0598  */
0599 static void z_erofs_rcu_callback(struct rcu_head *head)
0600 {
0601     z_erofs_free_pcluster(container_of(head,
0602             struct z_erofs_pcluster, rcu));
0603 }
0604 
0605 void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
0606 {
0607     struct z_erofs_pcluster *const pcl =
0608         container_of(grp, struct z_erofs_pcluster, obj);
0609 
0610     call_rcu(&pcl->rcu, z_erofs_rcu_callback);
0611 }
0612 
0613 static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe)
0614 {
0615     struct z_erofs_pcluster *pcl = fe->pcl;
0616 
0617     if (!pcl)
0618         return false;
0619 
0620     z_erofs_bvec_iter_end(&fe->biter);
0621     mutex_unlock(&pcl->lock);
0622 
0623     if (fe->candidate_bvpage) {
0624         DBG_BUGON(z_erofs_is_shortlived_page(fe->candidate_bvpage));
0625         fe->candidate_bvpage = NULL;
0626     }
0627 
0628     /*
0629      * once all pending pages are added, don't hold the pcluster's
0630      * reference any longer if it isn't hosted by ourselves.
0631      */
0632     if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE)
0633         erofs_workgroup_put(&pcl->obj);
0634 
0635     fe->pcl = NULL;
0636     return true;
0637 }
0638 
0639 static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe,
0640                        unsigned int cachestrategy,
0641                        erofs_off_t la)
0642 {
0643     if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED)
0644         return false;
0645 
0646     if (fe->backmost)
0647         return true;
0648 
0649     return cachestrategy >= EROFS_ZIP_CACHE_READAROUND &&
0650         la < fe->headoffset;
0651 }
0652 
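The strategy check above reduces to a small decision table: caching is off when disabled, the backmost extent is always cached, and "readaround" additionally caches extents that sit before the request head. A standalone restatement, assuming the EROFS_ZIP_CACHE_* constants are ordered disabled < readahead < readaround as in the kernel:

#include <stdbool.h>
#include <stdio.h>

/* assumed ordering, mirroring the EROFS_ZIP_CACHE_* constants */
enum strategy { DISABLED, READAHEAD, READAROUND };

static bool should_cache(enum strategy s, bool backmost,
                         unsigned long long la, unsigned long long headoffset)
{
    if (s <= DISABLED)
        return false;
    if (backmost)   /* the extent at the request head is always cached */
        return true;
    /* readaround additionally caches extents behind the head */
    return s >= READAROUND && la < headoffset;
}

int main(void)
{
    printf("readahead, non-backmost, la<head:  %d\n",
           should_cache(READAHEAD, false, 0, 4096));
    printf("readaround, non-backmost, la<head: %d\n",
           should_cache(READAROUND, false, 0, 4096));
    return 0;
}
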
0653 static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
0654                 struct page *page, struct page **pagepool)
0655 {
0656     struct inode *const inode = fe->inode;
0657     struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
0658     struct erofs_map_blocks *const map = &fe->map;
0659     const loff_t offset = page_offset(page);
0660     bool tight = true, exclusive;
0661 
0662     enum z_erofs_cache_alloctype cache_strategy;
0663     unsigned int cur, end, split;
0664     int err = 0;
0665 
0666     /* register locked file pages as online pages in pack */
0667     z_erofs_onlinepage_init(page);
0668 
0669     split = 0;
0670     end = PAGE_SIZE;
0671 repeat:
0672     cur = end - 1;
0673 
0674     if (offset + cur < map->m_la ||
0675         offset + cur >= map->m_la + map->m_llen) {
0676         erofs_dbg("out-of-range map @ pos %llu", offset + cur);
0677 
0678         if (z_erofs_collector_end(fe))
0679             fe->backmost = false;
0680         map->m_la = offset + cur;
0681         map->m_llen = 0;
0682         err = z_erofs_map_blocks_iter(inode, map, 0);
0683         if (err)
0684             goto out;
0685     } else {
0686         if (fe->pcl)
0687             goto hitted;
0688         /* didn't get a valid pcluster previously (very rare) */
0689     }
0690 
0691     if (!(map->m_flags & EROFS_MAP_MAPPED))
0692         goto hitted;
0693 
0694     err = z_erofs_collector_begin(fe);
0695     if (err)
0696         goto out;
0697 
0698     if (z_erofs_is_inline_pcluster(fe->pcl)) {
0699         void *mp;
0700 
0701         mp = erofs_read_metabuf(&fe->map.buf, inode->i_sb,
0702                     erofs_blknr(map->m_pa), EROFS_NO_KMAP);
0703         if (IS_ERR(mp)) {
0704             err = PTR_ERR(mp);
0705             erofs_err(inode->i_sb,
0706                   "failed to get inline page, err %d", err);
0707             goto out;
0708         }
0709         get_page(fe->map.buf.page);
0710         WRITE_ONCE(fe->pcl->compressed_bvecs[0].page,
0711                fe->map.buf.page);
0712         fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
0713     } else {
0714         /* bind cache first when cached decompression is preferred */
0715         if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy,
0716                            map->m_la))
0717             cache_strategy = TRYALLOC;
0718         else
0719             cache_strategy = DONTALLOC;
0720 
0721         z_erofs_bind_cache(fe, cache_strategy, pagepool);
0722     }
0723 hitted:
0724     /*
0725      * Ensure the current partial page belongs to this submit chain rather
0726      * than other concurrent submit chains or the noio(bypass) chain, since
0727      * those chains are handled asynchronously and thus the page cannot be
0728      * used for inplace I/O or bvpage (which should be processed in strict order).
0729      */
0730     tight &= (fe->mode >= Z_EROFS_PCLUSTER_HOOKED &&
0731           fe->mode != Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);
0732 
0733     cur = end - min_t(unsigned int, offset + end - map->m_la, end);
0734     if (!(map->m_flags & EROFS_MAP_MAPPED)) {
0735         zero_user_segment(page, cur, end);
0736         goto next_part;
0737     }
0738 
0739     exclusive = (!cur && (!split || tight));
0740     if (cur)
0741         tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED);
0742 
0743 retry:
0744     err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) {
0745                     .page = page,
0746                     .offset = offset - map->m_la,
0747                     .end = end,
0748                   }), exclusive);
0749     /* should allocate an additional short-lived page for bvset */
0750     if (err == -EAGAIN && !fe->candidate_bvpage) {
0751         fe->candidate_bvpage = alloc_page(GFP_NOFS | __GFP_NOFAIL);
0752         set_page_private(fe->candidate_bvpage,
0753                  Z_EROFS_SHORTLIVED_PAGE);
0754         goto retry;
0755     }
0756 
0757     if (err) {
0758         DBG_BUGON(err == -EAGAIN && fe->candidate_bvpage);
0759         goto out;
0760     }
0761 
0762     z_erofs_onlinepage_split(page);
0763     /* bump up the number of split parts of a page */
0764     ++split;
0765     if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK))
0766         fe->pcl->multibases = true;
0767 
0768     if ((map->m_flags & EROFS_MAP_FULL_MAPPED) &&
0769         fe->pcl->length == map->m_llen)
0770         fe->pcl->partial = false;
0771     if (fe->pcl->length < offset + end - map->m_la) {
0772         fe->pcl->length = offset + end - map->m_la;
0773         fe->pcl->pageofs_out = map->m_la & ~PAGE_MASK;
0774     }
0775 next_part:
0776     /* shorten the remaining extent to update progress */
0777     map->m_llen = offset + cur - map->m_la;
0778     map->m_flags &= ~EROFS_MAP_FULL_MAPPED;
0779 
0780     end = cur;
0781     if (end > 0)
0782         goto repeat;
0783 
0784 out:
0785     if (err)
0786         z_erofs_page_mark_eio(page);
0787     z_erofs_onlinepage_endio(page);
0788 
0789     erofs_dbg("%s, finish page: %pK split: %u map->m_llen %llu",
0790           __func__, page, split, map->m_llen);
0791     return err;
0792 }
0793 
0794 static bool z_erofs_get_sync_decompress_policy(struct erofs_sb_info *sbi,
0795                        unsigned int readahead_pages)
0796 {
0797     /* auto: enable for read_folio, disable for readahead */
0798     if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) &&
0799         !readahead_pages)
0800         return true;
0801 
0802     if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_FORCE_ON) &&
0803         (readahead_pages <= sbi->opt.max_sync_decompress_pages))
0804         return true;
0805 
0806     return false;
0807 }
0808 
0809 static bool z_erofs_page_is_invalidated(struct page *page)
0810 {
0811     return !page->mapping && !z_erofs_is_shortlived_page(page);
0812 }
0813 
0814 struct z_erofs_decompress_backend {
0815     struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES];
0816     struct super_block *sb;
0817     struct z_erofs_pcluster *pcl;
0818 
0819     /* pages with the longest decompressed length for deduplication */
0820     struct page **decompressed_pages;
0821     /* pages to keep the compressed data */
0822     struct page **compressed_pages;
0823 
0824     struct list_head decompressed_secondary_bvecs;
0825     struct page **pagepool;
0826     unsigned int onstack_used, nr_pages;
0827 };
0828 
0829 struct z_erofs_bvec_item {
0830     struct z_erofs_bvec bvec;
0831     struct list_head list;
0832 };
0833 
0834 static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be,
0835                      struct z_erofs_bvec *bvec)
0836 {
0837     struct z_erofs_bvec_item *item;
0838 
0839     if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK)) {
0840         unsigned int pgnr;
0841         struct page *oldpage;
0842 
0843         pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT;
0844         DBG_BUGON(pgnr >= be->nr_pages);
0845         oldpage = be->decompressed_pages[pgnr];
0846         be->decompressed_pages[pgnr] = bvec->page;
0847 
0848         if (!oldpage)
0849             return;
0850     }
0851 
0852     /* (cold path) one pcluster is requested multiple times */
0853     item = kmalloc(sizeof(*item), GFP_KERNEL | __GFP_NOFAIL);
0854     item->bvec = *bvec;
0855     list_add(&item->list, &be->decompressed_secondary_bvecs);
0856 }
0857 
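Page-aligned bvecs compete for the primary decompressed_pages[] slot of their page number; when a slot is already taken (the pcluster was requested multiple times, e.g. due to deduplication), the bvec is parked on a secondary list and satisfied by memcpy after decompression. A small sketch of that routing for aligned bvecs only, using a plain singly linked list instead of the kernel's list_head:

#include <stdio.h>
#include <stdlib.h>

#define NR_SLOTS 4

struct bvec_item {
    void *page;
    int pgnr;
    struct bvec_item *next;
};

static void *primary[NR_SLOTS];     /* like be->decompressed_pages[] */
static struct bvec_item *secondary; /* copies to fill in afterwards */

static void route_bvec(int pgnr, void *page)
{
    void *oldpage = primary[pgnr];

    primary[pgnr] = page;
    if (!oldpage)           /* the common, uncontended case */
        return;

    /* cold path: the pcluster is requested multiple times; park the
     * bvec and satisfy it with a memcpy after decompression */
    struct bvec_item *it = malloc(sizeof(*it));

    it->page = page;
    it->pgnr = pgnr;
    it->next = secondary;
    secondary = it;
}

int main(void)
{
    int x, y;

    route_bvec(1, &x);
    route_bvec(1, &y);  /* duplicate request for page slot 1 */
    printf("primary is newest: %d, secondary queued: %d\n",
           primary[1] == (void *)&y, secondary && secondary->page == &y);
    free(secondary);
    return 0;
}
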
0858 static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be,
0859                       int err)
0860 {
0861     unsigned int off0 = be->pcl->pageofs_out;
0862     struct list_head *p, *n;
0863 
0864     list_for_each_safe(p, n, &be->decompressed_secondary_bvecs) {
0865         struct z_erofs_bvec_item *bvi;
0866         unsigned int end, cur;
0867         void *dst, *src;
0868 
0869         bvi = container_of(p, struct z_erofs_bvec_item, list);
0870         cur = bvi->bvec.offset < 0 ? -bvi->bvec.offset : 0;
0871         end = min_t(unsigned int, be->pcl->length - bvi->bvec.offset,
0872                 bvi->bvec.end);
0873         dst = kmap_local_page(bvi->bvec.page);
0874         while (cur < end) {
0875             unsigned int pgnr, scur, len;
0876 
0877             pgnr = (bvi->bvec.offset + cur + off0) >> PAGE_SHIFT;
0878             DBG_BUGON(pgnr >= be->nr_pages);
0879 
0880             scur = bvi->bvec.offset + cur -
0881                     ((pgnr << PAGE_SHIFT) - off0);
0882             len = min_t(unsigned int, end - cur, PAGE_SIZE - scur);
0883             if (!be->decompressed_pages[pgnr]) {
0884                 err = -EFSCORRUPTED;
0885                 cur += len;
0886                 continue;
0887             }
0888             src = kmap_local_page(be->decompressed_pages[pgnr]);
0889             memcpy(dst + cur, src + scur, len);
0890             kunmap_local(src);
0891             cur += len;
0892         }
0893         kunmap_local(dst);
0894         if (err)
0895             z_erofs_page_mark_eio(bvi->bvec.page);
0896         z_erofs_onlinepage_endio(bvi->bvec.page);
0897         list_del(p);
0898         kfree(bvi);
0899     }
0900 }
0901 
0902 static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be)
0903 {
0904     struct z_erofs_pcluster *pcl = be->pcl;
0905     struct z_erofs_bvec_iter biter;
0906     struct page *old_bvpage;
0907     int i;
0908 
0909     z_erofs_bvec_iter_begin(&biter, &pcl->bvset, Z_EROFS_INLINE_BVECS, 0);
0910     for (i = 0; i < pcl->vcnt; ++i) {
0911         struct z_erofs_bvec bvec;
0912 
0913         z_erofs_bvec_dequeue(&biter, &bvec, &old_bvpage);
0914 
0915         if (old_bvpage)
0916             z_erofs_put_shortlivedpage(be->pagepool, old_bvpage);
0917 
0918         DBG_BUGON(z_erofs_page_is_invalidated(bvec.page));
0919         z_erofs_do_decompressed_bvec(be, &bvec);
0920     }
0921 
0922     old_bvpage = z_erofs_bvec_iter_end(&biter);
0923     if (old_bvpage)
0924         z_erofs_put_shortlivedpage(be->pagepool, old_bvpage);
0925 }
0926 
0927 static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be,
0928                   bool *overlapped)
0929 {
0930     struct z_erofs_pcluster *pcl = be->pcl;
0931     unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
0932     int i, err = 0;
0933 
0934     *overlapped = false;
0935     for (i = 0; i < pclusterpages; ++i) {
0936         struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i];
0937         struct page *page = bvec->page;
0938 
0939         /* compressed pages ought to be present before decompressing */
0940         if (!page) {
0941             DBG_BUGON(1);
0942             continue;
0943         }
0944         be->compressed_pages[i] = page;
0945 
0946         if (z_erofs_is_inline_pcluster(pcl)) {
0947             if (!PageUptodate(page))
0948                 err = -EIO;
0949             continue;
0950         }
0951 
0952         DBG_BUGON(z_erofs_page_is_invalidated(page));
0953         if (!z_erofs_is_shortlived_page(page)) {
0954             if (erofs_page_is_managed(EROFS_SB(be->sb), page)) {
0955                 if (!PageUptodate(page))
0956                     err = -EIO;
0957                 continue;
0958             }
0959             z_erofs_do_decompressed_bvec(be, bvec);
0960             *overlapped = true;
0961         }
0962     }
0963 
0964     if (err)
0965         return err;
0966     return 0;
0967 }
0968 
0969 static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
0970                        int err)
0971 {
0972     struct erofs_sb_info *const sbi = EROFS_SB(be->sb);
0973     struct z_erofs_pcluster *pcl = be->pcl;
0974     unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
0975     unsigned int i, inputsize;
0976     int err2;
0977     struct page *page;
0978     bool overlapped;
0979 
0980     mutex_lock(&pcl->lock);
0981     be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT;
0982 
0983     /* allocate (de)compressed page arrays if they cannot be kept on stack */
0984     be->decompressed_pages = NULL;
0985     be->compressed_pages = NULL;
0986     be->onstack_used = 0;
0987     if (be->nr_pages <= Z_EROFS_ONSTACK_PAGES) {
0988         be->decompressed_pages = be->onstack_pages;
0989         be->onstack_used = be->nr_pages;
0990         memset(be->decompressed_pages, 0,
0991                sizeof(struct page *) * be->nr_pages);
0992     }
0993 
0994     if (pclusterpages + be->onstack_used <= Z_EROFS_ONSTACK_PAGES)
0995         be->compressed_pages = be->onstack_pages + be->onstack_used;
0996 
0997     if (!be->decompressed_pages)
0998         be->decompressed_pages =
0999             kvcalloc(be->nr_pages, sizeof(struct page *),
1000                  GFP_KERNEL | __GFP_NOFAIL);
1001     if (!be->compressed_pages)
1002         be->compressed_pages =
1003             kvcalloc(pclusterpages, sizeof(struct page *),
1004                  GFP_KERNEL | __GFP_NOFAIL);
1005 
1006     z_erofs_parse_out_bvecs(be);
1007     err2 = z_erofs_parse_in_bvecs(be, &overlapped);
1008     if (err2)
1009         err = err2;
1010     if (err)
1011         goto out;
1012 
1013     if (z_erofs_is_inline_pcluster(pcl))
1014         inputsize = pcl->tailpacking_size;
1015     else
1016         inputsize = pclusterpages * PAGE_SIZE;
1017 
1018     err = z_erofs_decompress(&(struct z_erofs_decompress_req) {
1019                     .sb = be->sb,
1020                     .in = be->compressed_pages,
1021                     .out = be->decompressed_pages,
1022                     .pageofs_in = pcl->pageofs_in,
1023                     .pageofs_out = pcl->pageofs_out,
1024                     .inputsize = inputsize,
1025                     .outputsize = pcl->length,
1026                     .alg = pcl->algorithmformat,
1027                     .inplace_io = overlapped,
1028                     .partial_decoding = pcl->partial,
1029                     .fillgaps = pcl->multibases,
1030                  }, be->pagepool);
1031 
1032 out:
1033     /* must handle all compressed pages before actual file pages */
1034     if (z_erofs_is_inline_pcluster(pcl)) {
1035         page = pcl->compressed_bvecs[0].page;
1036         WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL);
1037         put_page(page);
1038     } else {
1039         for (i = 0; i < pclusterpages; ++i) {
1040             page = pcl->compressed_bvecs[i].page;
1041 
1042             if (erofs_page_is_managed(sbi, page))
1043                 continue;
1044 
1045             /* recycle all individual short-lived pages */
1046             (void)z_erofs_put_shortlivedpage(be->pagepool, page);
1047             WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
1048         }
1049     }
1050     if (be->compressed_pages < be->onstack_pages ||
1051         be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES)
1052         kvfree(be->compressed_pages);
1053     z_erofs_fill_other_copies(be, err);
1054 
1055     for (i = 0; i < be->nr_pages; ++i) {
1056         page = be->decompressed_pages[i];
1057         if (!page)
1058             continue;
1059 
1060         DBG_BUGON(z_erofs_page_is_invalidated(page));
1061 
1062         /* recycle all individual short-lived pages */
1063         if (z_erofs_put_shortlivedpage(be->pagepool, page))
1064             continue;
1065         if (err)
1066             z_erofs_page_mark_eio(page);
1067         z_erofs_onlinepage_endio(page);
1068     }
1069 
1070     if (be->decompressed_pages != be->onstack_pages)
1071         kvfree(be->decompressed_pages);
1072 
1073     pcl->length = 0;
1074     pcl->partial = true;
1075     pcl->multibases = false;
1076     pcl->bvset.nextpage = NULL;
1077     pcl->vcnt = 0;
1078 
1079     /* pcluster lock MUST be taken before the following line */
1080     WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL);
1081     mutex_unlock(&pcl->lock);
1082     return err;
1083 }
1084 
1085 static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
1086                      struct page **pagepool)
1087 {
1088     struct z_erofs_decompress_backend be = {
1089         .sb = io->sb,
1090         .pagepool = pagepool,
1091         .decompressed_secondary_bvecs =
1092             LIST_HEAD_INIT(be.decompressed_secondary_bvecs),
1093     };
1094     z_erofs_next_pcluster_t owned = io->head;
1095 
1096     while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) {
1097         /* impossible that 'owned' equals Z_EROFS_PCLUSTER_TAIL */
1098         DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL);
1099         /* impossible that 'owned' equals Z_EROFS_PCLUSTER_NIL */
1100         DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL);
1101 
1102         be.pcl = container_of(owned, struct z_erofs_pcluster, next);
1103         owned = READ_ONCE(be.pcl->next);
1104 
1105         z_erofs_decompress_pcluster(&be, io->eio ? -EIO : 0);
1106         erofs_workgroup_put(&be.pcl->obj);
1107     }
1108 }
1109 
1110 static void z_erofs_decompressqueue_work(struct work_struct *work)
1111 {
1112     struct z_erofs_decompressqueue *bgq =
1113         container_of(work, struct z_erofs_decompressqueue, u.work);
1114     struct page *pagepool = NULL;
1115 
1116     DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
1117     z_erofs_decompress_queue(bgq, &pagepool);
1118 
1119     erofs_release_pages(&pagepool);
1120     kvfree(bgq);
1121 }
1122 
1123 static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
1124                        bool sync, int bios)
1125 {
1126     struct erofs_sb_info *const sbi = EROFS_SB(io->sb);
1127 
1128     /* wake up the caller thread for sync decompression */
1129     if (sync) {
1130         if (!atomic_add_return(bios, &io->pending_bios))
1131             complete(&io->u.done);
1132         return;
1133     }
1134 
1135     if (atomic_add_return(bios, &io->pending_bios))
1136         return;
1137     /* Use workqueue and sync decompression for atomic contexts only */
1138     if (in_atomic() || irqs_disabled()) {
1139         queue_work(z_erofs_workqueue, &io->u.work);
1140         /* enable sync decompression for readahead */
1141         if (sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO)
1142             sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON;
1143         return;
1144     }
1145     z_erofs_decompressqueue_work(&io->u.work);
1146 }
1147 
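Completion is tracked by the single pending_bios counter: the submitter adds nr_bios once, every bio completion subtracts one, and whichever call brings the counter to zero kicks off decompression. A userspace sketch of this "last one out starts the work" pattern; note it behaves correctly even when completions race ahead of the submitter's final add:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int pending_bios;

/* called by the submitter (delta = nr_bios) and by each bio
 * completion (delta = -1); the call that reaches zero wins */
static void kickoff(int delta)
{
    /* same semantics as the kernel's atomic_add_return() */
    if (atomic_fetch_add(&pending_bios, delta) + delta == 0)
        printf("counter hit zero: run decompression now\n");
}

int main(void)
{
    kickoff(-1);    /* a bio may even complete before... */
    kickoff(-1);
    kickoff(3);     /* ...the submitter accounts all 3 bios */
    kickoff(-1);    /* the last completion triggers the work */
    return 0;
}
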
1148 static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
1149                            unsigned int nr,
1150                            struct page **pagepool,
1151                            struct address_space *mc)
1152 {
1153     const pgoff_t index = pcl->obj.index;
1154     gfp_t gfp = mapping_gfp_mask(mc);
1155     bool tocache = false;
1156 
1157     struct address_space *mapping;
1158     struct page *oldpage, *page;
1159 
1160     compressed_page_t t;
1161     int justfound;
1162 
1163 repeat:
1164     page = READ_ONCE(pcl->compressed_bvecs[nr].page);
1165     oldpage = page;
1166 
1167     if (!page)
1168         goto out_allocpage;
1169 
1170     /* process the target tagged pointer */
1171     t = tagptr_init(compressed_page_t, page);
1172     justfound = tagptr_unfold_tags(t);
1173     page = tagptr_unfold_ptr(t);
1174 
1175     /*
1176      * a preallocated cached page, which was used to avoid direct reclaim;
1177      * otherwise, the inplace I/O path would have been taken instead.
1178      */
1179     if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
1180         WRITE_ONCE(pcl->compressed_bvecs[nr].page, page);
1181         set_page_private(page, 0);
1182         tocache = true;
1183         goto out_tocache;
1184     }
1185     mapping = READ_ONCE(page->mapping);
1186 
1187     /*
1188      * file-backed online pages in the pcluster are all steadily locked,
1189      * therefore it is impossible for `mapping' to be NULL.
1190      */
1191     if (mapping && mapping != mc)
1192         /* ought to be unmanaged pages */
1193         goto out;
1194 
1195     /* directly return for shortlived page as well */
1196     if (z_erofs_is_shortlived_page(page))
1197         goto out;
1198 
1199     lock_page(page);
1200 
1201     /* only true if page reclaim goes wrong, should never happen */
1202     DBG_BUGON(justfound && PagePrivate(page));
1203 
1204     /* the page is still in the managed cache */
1205     if (page->mapping == mc) {
1206         WRITE_ONCE(pcl->compressed_bvecs[nr].page, page);
1207 
1208         if (!PagePrivate(page)) {
1209             /*
1210              * under the current restriction, the page
1211              * cannot be !PagePrivate(page) if it is
1212              * already in compressed_bvecs[].
1213              */
1214             DBG_BUGON(!justfound);
1215 
1216             justfound = 0;
1217             set_page_private(page, (unsigned long)pcl);
1218             SetPagePrivate(page);
1219         }
1220 
1221         /* no need to submit io if it is already up-to-date */
1222         if (PageUptodate(page)) {
1223             unlock_page(page);
1224             page = NULL;
1225         }
1226         goto out;
1227     }
1228 
1229     /*
1230      * the managed page has been truncated; it's unsafe to
1231      * reuse it, so let's allocate a new page for the managed cache.
1232      */
1233     DBG_BUGON(page->mapping);
1234     DBG_BUGON(!justfound);
1235 
1236     tocache = true;
1237     unlock_page(page);
1238     put_page(page);
1239 out_allocpage:
1240     page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL);
1241     if (oldpage != cmpxchg(&pcl->compressed_bvecs[nr].page,
1242                    oldpage, page)) {
1243         erofs_pagepool_add(pagepool, page);
1244         cond_resched();
1245         goto repeat;
1246     }
1247 out_tocache:
1248     if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) {
1249         /* turn the page into a short-lived one on failure (1 ref) */
1250         set_page_private(page, Z_EROFS_SHORTLIVED_PAGE);
1251         goto out;
1252     }
1253     attach_page_private(page, pcl);
1254     /* drop a refcount added by allocpage (then we have 2 refs here) */
1255     put_page(page);
1256 
1257 out:    /* the only exit (for tracing and debugging) */
1258     return page;
1259 }
1260 
1261 static struct z_erofs_decompressqueue *
1262 jobqueue_init(struct super_block *sb,
1263           struct z_erofs_decompressqueue *fgq, bool *fg)
1264 {
1265     struct z_erofs_decompressqueue *q;
1266 
1267     if (fg && !*fg) {
1268         q = kvzalloc(sizeof(*q), GFP_KERNEL | __GFP_NOWARN);
1269         if (!q) {
1270             *fg = true;
1271             goto fg_out;
1272         }
1273         INIT_WORK(&q->u.work, z_erofs_decompressqueue_work);
1274     } else {
1275 fg_out:
1276         q = fgq;
1277         init_completion(&fgq->u.done);
1278         atomic_set(&fgq->pending_bios, 0);
1279         q->eio = false;
1280     }
1281     q->sb = sb;
1282     q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
1283     return q;
1284 }
1285 
1286 /* define decompression jobqueue types */
1287 enum {
1288     JQ_BYPASS,
1289     JQ_SUBMIT,
1290     NR_JOBQUEUES,
1291 };
1292 
1293 static void *jobqueueset_init(struct super_block *sb,
1294                   struct z_erofs_decompressqueue *q[],
1295                   struct z_erofs_decompressqueue *fgq, bool *fg)
1296 {
1297     /*
1298      * if managed cache is enabled, a bypass jobqueue is needed:
1299      * pclusters in this queue don't need to be read from the device.
1300      */
1301     q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL);
1302     q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, fg);
1303 
1304     return tagptr_cast_ptr(tagptr_fold(tagptr1_t, q[JQ_SUBMIT], *fg));
1305 }
1306 
1307 static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
1308                     z_erofs_next_pcluster_t qtail[],
1309                     z_erofs_next_pcluster_t owned_head)
1310 {
1311     z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT];
1312     z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS];
1313 
1314     DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
1315     if (owned_head == Z_EROFS_PCLUSTER_TAIL)
1316         owned_head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
1317 
1318     WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL_CLOSED);
1319 
1320     WRITE_ONCE(*submit_qtail, owned_head);
1321     WRITE_ONCE(*bypass_qtail, &pcl->next);
1322 
1323     qtail[JQ_BYPASS] = &pcl->next;
1324 }
1325 
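The qtail[] entries are pointers to the next-link that still needs filling, so appending to either chain is two stores: write the pcluster through the current tail, then advance the tail to the pcluster's own next field. A hedged pointer-only sketch of that technique, leaving out the cmpxchg chain closing above:

#include <stdio.h>

struct pcl { struct pcl *next; const char *name; };

int main(void)
{
    struct pcl a = { .name = "A" }, b = { .name = "B" };
    struct pcl *submit_head = NULL, *bypass_head = NULL;
    /* tails are "pointers to the next-link to fill", as in qtail[] */
    struct pcl **submit_qtail = &submit_head;
    struct pcl **bypass_qtail = &bypass_head;

    /* A needs I/O: append it to the submit chain */
    *submit_qtail = &a;
    submit_qtail = &a.next;

    /* B is fully cached: splice it onto the bypass chain instead */
    *bypass_qtail = &b;
    bypass_qtail = &b.next;

    *submit_qtail = *bypass_qtail = NULL;   /* terminate both chains */
    printf("submit: %s, bypass: %s\n",
           submit_head ? submit_head->name : "-",
           bypass_head ? bypass_head->name : "-");
    return 0;
}
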
1326 static void z_erofs_decompressqueue_endio(struct bio *bio)
1327 {
1328     tagptr1_t t = tagptr_init(tagptr1_t, bio->bi_private);
1329     struct z_erofs_decompressqueue *q = tagptr_unfold_ptr(t);
1330     blk_status_t err = bio->bi_status;
1331     struct bio_vec *bvec;
1332     struct bvec_iter_all iter_all;
1333 
1334     bio_for_each_segment_all(bvec, bio, iter_all) {
1335         struct page *page = bvec->bv_page;
1336 
1337         DBG_BUGON(PageUptodate(page));
1338         DBG_BUGON(z_erofs_page_is_invalidated(page));
1339 
1340         if (erofs_page_is_managed(EROFS_SB(q->sb), page)) {
1341             if (!err)
1342                 SetPageUptodate(page);
1343             unlock_page(page);
1344         }
1345     }
1346     if (err)
1347         q->eio = true;
1348     z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1);
1349     bio_put(bio);
1350 }
1351 
1352 static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
1353                  struct page **pagepool,
1354                  struct z_erofs_decompressqueue *fgq,
1355                  bool *force_fg)
1356 {
1357     struct super_block *sb = f->inode->i_sb;
1358     struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb));
1359     z_erofs_next_pcluster_t qtail[NR_JOBQUEUES];
1360     struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
1361     void *bi_private;
1362     z_erofs_next_pcluster_t owned_head = f->owned_head;
1363     /* bio is NULL initially, so no need to initialize last_{index,bdev} */
1364     pgoff_t last_index;
1365     struct block_device *last_bdev;
1366     unsigned int nr_bios = 0;
1367     struct bio *bio = NULL;
1368 
1369     bi_private = jobqueueset_init(sb, q, fgq, force_fg);
1370     qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head;
1371     qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head;
1372 
1373     /* by default, all need io submission */
1374     q[JQ_SUBMIT]->head = owned_head;
1375 
1376     do {
1377         struct erofs_map_dev mdev;
1378         struct z_erofs_pcluster *pcl;
1379         pgoff_t cur, end;
1380         unsigned int i = 0;
1381         bool bypass = true;
1382 
1383         /* 'owned_head' can never equal either of the following */
1384         DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
1385         DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL);
1386 
1387         pcl = container_of(owned_head, struct z_erofs_pcluster, next);
1388 
1389         /* close the main owned chain at first */
1390         owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
1391                      Z_EROFS_PCLUSTER_TAIL_CLOSED);
1392         if (z_erofs_is_inline_pcluster(pcl)) {
1393             move_to_bypass_jobqueue(pcl, qtail, owned_head);
1394             continue;
1395         }
1396 
1397         /* no device id here, thus it will always succeed */
1398         mdev = (struct erofs_map_dev) {
1399             .m_pa = blknr_to_addr(pcl->obj.index),
1400         };
1401         (void)erofs_map_dev(sb, &mdev);
1402 
1403         cur = erofs_blknr(mdev.m_pa);
1404         end = cur + pcl->pclusterpages;
1405 
1406         do {
1407             struct page *page;
1408 
1409             page = pickup_page_for_submission(pcl, i++, pagepool,
1410                               mc);
1411             if (!page)
1412                 continue;
1413 
1414             if (bio && (cur != last_index + 1 ||
1415                     last_bdev != mdev.m_bdev)) {
1416 submit_bio_retry:
1417                 submit_bio(bio);
1418                 bio = NULL;
1419             }
1420 
1421             if (!bio) {
1422                 bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS,
1423                         REQ_OP_READ, GFP_NOIO);
1424                 bio->bi_end_io = z_erofs_decompressqueue_endio;
1425 
1426                 last_bdev = mdev.m_bdev;
1427                 bio->bi_iter.bi_sector = (sector_t)cur <<
1428                     LOG_SECTORS_PER_BLOCK;
1429                 bio->bi_private = bi_private;
1430                 if (f->readahead)
1431                     bio->bi_opf |= REQ_RAHEAD;
1432                 ++nr_bios;
1433             }
1434 
1435             if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE)
1436                 goto submit_bio_retry;
1437 
1438             last_index = cur;
1439             bypass = false;
1440         } while (++cur < end);
1441 
1442         if (!bypass)
1443             qtail[JQ_SUBMIT] = &pcl->next;
1444         else
1445             move_to_bypass_jobqueue(pcl, qtail, owned_head);
1446     } while (owned_head != Z_EROFS_PCLUSTER_TAIL);
1447 
1448     if (bio)
1449         submit_bio(bio);
1450 
1451     /*
1452      * although background decompression is preferred, nothing is pending
1453      * for submission; don't issue a workqueue, drop the queue directly instead.
1454      */
1455     if (!*force_fg && !nr_bios) {
1456         kvfree(q[JQ_SUBMIT]);
1457         return;
1458     }
1459     z_erofs_decompress_kickoff(q[JQ_SUBMIT], *force_fg, nr_bios);
1460 }
1461 
1462 static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f,
1463                  struct page **pagepool, bool force_fg)
1464 {
1465     struct z_erofs_decompressqueue io[NR_JOBQUEUES];
1466 
1467     if (f->owned_head == Z_EROFS_PCLUSTER_TAIL)
1468         return;
1469     z_erofs_submit_queue(f, pagepool, io, &force_fg);
1470 
1471     /* handle bypass queue (no i/o pclusters) immediately */
1472     z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool);
1473 
1474     if (!force_fg)
1475         return;
1476 
1477     /* wait until all bios are completed */
1478     wait_for_completion_io(&io[JQ_SUBMIT].u.done);
1479 
1480     /* handle synchronous decompress queue in the caller context */
1481     z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool);
1482 }
1483 
1484 /*
1485  * Since partial uptodate is still unimplemented, we have to use
1486  * approximate readmore strategies as a start.
1487  */
1488 static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
1489                       struct readahead_control *rac,
1490                       erofs_off_t end,
1491                       struct page **pagepool,
1492                       bool backmost)
1493 {
1494     struct inode *inode = f->inode;
1495     struct erofs_map_blocks *map = &f->map;
1496     erofs_off_t cur;
1497     int err;
1498 
1499     if (backmost) {
1500         map->m_la = end;
1501         err = z_erofs_map_blocks_iter(inode, map,
1502                           EROFS_GET_BLOCKS_READMORE);
1503         if (err)
1504             return;
1505 
1506         /* expand ra for the trailing edge if readahead */
1507         if (rac) {
1508             loff_t newstart = readahead_pos(rac);
1509 
1510             cur = round_up(map->m_la + map->m_llen, PAGE_SIZE);
1511             readahead_expand(rac, newstart, cur - newstart);
1512             return;
1513         }
1514         end = round_up(end, PAGE_SIZE);
1515     } else {
1516         end = round_up(map->m_la, PAGE_SIZE);
1517 
1518         if (!map->m_llen)
1519             return;
1520     }
1521 
1522     cur = map->m_la + map->m_llen - 1;
1523     while (cur >= end) {
1524         pgoff_t index = cur >> PAGE_SHIFT;
1525         struct page *page;
1526 
1527         page = erofs_grab_cache_page_nowait(inode->i_mapping, index);
1528         if (page) {
1529             if (PageUptodate(page)) {
1530                 unlock_page(page);
1531             } else {
1532                 err = z_erofs_do_read_page(f, page, pagepool);
1533                 if (err)
1534                     erofs_err(inode->i_sb,
1535                           "readmore error at page %lu @ nid %llu",
1536                           index, EROFS_I(inode)->nid);
1537             }
1538             put_page(page);
1539         }
1540 
1541         if (cur < PAGE_SIZE)
1542             break;
1543         cur = (index << PAGE_SHIFT) - 1;
1544     }
1545 }
1546 
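For the trailing edge, readmore grows the window to the page-aligned end of the logical extent that the last requested byte falls into; the leading edge then walks backwards page by page. A small arithmetic sketch of the trailing-edge expansion with illustrative byte values:

#include <stdio.h>

#define PAGE_SIZE 4096ULL
#define ROUND_UP(x, a) (((x) + (a) - 1) / (a) * (a))

int main(void)
{
    /* illustrative values: the extent the last requested byte maps to */
    unsigned long long m_la = 20480, m_llen = 50000;
    unsigned long long ra_start = 16384;    /* readahead_pos(rac) */

    /* extend readahead up to the page-aligned end of that extent */
    unsigned long long new_end = ROUND_UP(m_la + m_llen, PAGE_SIZE);

    printf("readahead window becomes [%llu, %llu)\n", ra_start, new_end);
    return 0;
}
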
1547 static int z_erofs_read_folio(struct file *file, struct folio *folio)
1548 {
1549     struct page *page = &folio->page;
1550     struct inode *const inode = page->mapping->host;
1551     struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
1552     struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
1553     struct page *pagepool = NULL;
1554     int err;
1555 
1556     trace_erofs_readpage(page, false);
1557     f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT;
1558 
1559     z_erofs_pcluster_readmore(&f, NULL, f.headoffset + PAGE_SIZE - 1,
1560                   &pagepool, true);
1561     err = z_erofs_do_read_page(&f, page, &pagepool);
1562     z_erofs_pcluster_readmore(&f, NULL, 0, &pagepool, false);
1563 
1564     (void)z_erofs_collector_end(&f);
1565 
1566     /* if some compressed clusters are ready, submit them anyway */
1567     z_erofs_runqueue(&f, &pagepool,
1568              z_erofs_get_sync_decompress_policy(sbi, 0));
1569 
1570     if (err)
1571         erofs_err(inode->i_sb, "failed to read, err [%d]", err);
1572 
1573     erofs_put_metabuf(&f.map.buf);
1574     erofs_release_pages(&pagepool);
1575     return err;
1576 }
1577 
1578 static void z_erofs_readahead(struct readahead_control *rac)
1579 {
1580     struct inode *const inode = rac->mapping->host;
1581     struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
1582     struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
1583     struct page *pagepool = NULL, *head = NULL, *page;
1584     unsigned int nr_pages;
1585 
1586     f.readahead = true;
1587     f.headoffset = readahead_pos(rac);
1588 
1589     z_erofs_pcluster_readmore(&f, rac, f.headoffset +
1590                   readahead_length(rac) - 1, &pagepool, true);
1591     nr_pages = readahead_count(rac);
1592     trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false);
1593 
1594     while ((page = readahead_page(rac))) {
1595         set_page_private(page, (unsigned long)head);
1596         head = page;
1597     }
1598 
1599     while (head) {
1600         struct page *page = head;
1601         int err;
1602 
1603         /* traversal in reverse order */
1604         head = (void *)page_private(page);
1605 
1606         err = z_erofs_do_read_page(&f, page, &pagepool);
1607         if (err)
1608             erofs_err(inode->i_sb,
1609                   "readahead error at page %lu @ nid %llu",
1610                   page->index, EROFS_I(inode)->nid);
1611         put_page(page);
1612     }
1613     z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false);
1614     (void)z_erofs_collector_end(&f);
1615 
1616     z_erofs_runqueue(&f, &pagepool,
1617              z_erofs_get_sync_decompress_policy(sbi, nr_pages));
1618     erofs_put_metabuf(&f.map.buf);
1619     erofs_release_pages(&pagepool);
1620 }
1621 
1622 const struct address_space_operations z_erofs_aops = {
1623     .read_folio = z_erofs_read_folio,
1624     .readahead = z_erofs_readahead,
1625 };