/*
 * Bad block management
 *
 * - Heavily based on MD badblocks code from Neil Brown
 *
 * Copyright (c) 2015, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/badblocks.h>
#include <linux/seqlock.h>
#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/stddef.h>
#include <linux/types.h>
#include <linux/slab.h>

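/*
 * Layout of one bad-block table entry (an illustration added for clarity;
 * the BB_* macros come from <linux/badblocks.h>):
 *
 *  bit  63     'acknowledged' flag
 *  bits 62..9  start of the bad range (54-bit sector/block offset)
 *  bits  8..0  length - 1, so lengths 1..512 (BB_MAX_LEN) fit
 *
 * For example, BB_MAKE(1024, 8, 1) packs an acknowledged 8-sector range
 * starting at sector 1024, and BB_OFFSET(), BB_LEN() and BB_ACK() unpack
 * the three fields again.
 */
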
/**
 * badblocks_check() - check a given range for bad sectors
 * @bb:     the badblocks structure that holds all badblock information
 * @s:      sector (start) at which to check for badblocks
 * @sectors:    number of sectors to check for badblocks
 * @first_bad:  pointer to store location of the first badblock
 * @bad_sectors: pointer to store the length (in sectors) of the bad range
 *      starting at @first_bad
 *
 * We can record which blocks on each device are 'bad' and so just
 * fail those blocks, or that stripe, rather than the whole device.
 * Entries in the bad-block table are 64bits wide.  This comprises:
 * Length of bad-range, in sectors: 0-511 for lengths 1-512
 * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
 *  A 'shift' can be set so that larger blocks are tracked and
 *  consequently larger devices can be covered.
 * 'Acknowledged' flag - 1 bit - the most significant bit.
 *
 * Locking of the bad-block table uses a seqlock so badblocks_check
 * might need to retry if it is very unlucky.
 * We will sometimes want to check for bad blocks in a bi_end_io function,
 * so we use the write_seqlock_irq variant.
 *
 * When looking for a bad block we specify a range and want to
 * know if any block in the range is bad.  So we binary-search
 * to the last range that starts at-or-before the given endpoint
 * (or "before the sector after the target range"),
 * then see if it ends after the given start.
 *
 * Return:
 *  0: there are no known bad blocks in the range
 *  1: there are known bad blocks which are all acknowledged
 * -1: there are bad blocks which have not yet been acknowledged in metadata.
 * plus the start/length of the first bad section we overlap.
 */
int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
            sector_t *first_bad, int *bad_sectors)
{
    int hi;
    int lo;
    u64 *p = bb->page;
    int rv;
    sector_t target = s + sectors;
    unsigned seq;

    if (bb->shift > 0) {
        /* round the start down, and the end up */
        s >>= bb->shift;
        target += (1<<bb->shift) - 1;
        target >>= bb->shift;
        sectors = target - s;
    }
    /* 'target' is now the first block after the bad range */

retry:
    seq = read_seqbegin(&bb->lock);
    lo = 0;
    rv = 0;
    hi = bb->count;

    /* Binary search between lo and hi for 'target'
     * i.e. for the last range that starts before 'target'
     */
    /* INVARIANT: ranges before 'lo' and at-or-after 'hi'
     * are known not to be the last range before target.
     * VARIANT: hi-lo is the number of possible
     * ranges, and decreases until it reaches 1
     */
    while (hi - lo > 1) {
        int mid = (lo + hi) / 2;
        sector_t a = BB_OFFSET(p[mid]);

        if (a < target)
            /* This could still be the one, earlier ranges
             * could not.
             */
            lo = mid;
        else
            /* This and later ranges are definitely out. */
            hi = mid;
    }
    /* 'lo' might be the last that started before target, but 'hi' isn't */
    if (hi > lo) {
        /* need to check all ranges that end after 's' to see if
         * any are unacknowledged.
         */
        while (lo >= 0 &&
               BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
            if (BB_OFFSET(p[lo]) < target) {
                /* starts before the end, and finishes after
                 * the start, so they must overlap
                 */
                if (rv != -1 && BB_ACK(p[lo]))
                    rv = 1;
                else
                    rv = -1;
                *first_bad = BB_OFFSET(p[lo]);
                *bad_sectors = BB_LEN(p[lo]);
            }
            lo--;
        }
    }

    if (read_seqretry(&bb->lock, seq))
        goto retry;

    return rv;
}
EXPORT_SYMBOL_GPL(badblocks_check);
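
/*
 * Usage sketch for badblocks_check() (hypothetical caller, added for
 * illustration; not part of the original file):
 *
 *  sector_t first_bad;
 *  int bad_sectors;
 *
 *  switch (badblocks_check(bb, s, nr, &first_bad, &bad_sectors)) {
 *  case 0:     // no known bad blocks in the range
 *      break;
 *  case 1:     // bad blocks present, but all acknowledged
 *  case -1:    // at least one unacknowledged bad block
 *      // first_bad/bad_sectors describe the first overlapping
 *      // range, in units of (1 << bb->shift) sectors
 *      break;
 *  }
 */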

static void badblocks_update_acked(struct badblocks *bb)
{
    u64 *p = bb->page;
    int i;
    bool unacked = false;

    if (!bb->unacked_exist)
        return;

    for (i = 0; i < bb->count ; i++) {
        if (!BB_ACK(p[i])) {
            unacked = true;
            break;
        }
    }

    if (!unacked)
        bb->unacked_exist = 0;
}

/**
 * badblocks_set() - Add a range of bad blocks to the table.
 * @bb:     the badblocks structure that holds all badblock information
 * @s:      first sector to mark as bad
 * @sectors:    number of sectors to mark as bad
 * @acknowledged: whether to mark the bad sectors as acknowledged
 *
 * This might extend the table, or might contract it if two adjacent ranges
 * can be merged. We binary-search to find the 'insertion' point, then
 * decide how best to handle it.
 *
 * Return:
 *  0: success
 *  1: failed to set badblocks (out of space)
 */
int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
            int acknowledged)
{
    u64 *p;
    int lo, hi;
    int rv = 0;
    unsigned long flags;

    if (bb->shift < 0)
        /* badblocks are disabled */
        return 0;

    if (bb->shift) {
        /* round the start down, and the end up */
        sector_t next = s + sectors;

        s >>= bb->shift;
        next += (1<<bb->shift) - 1;
        next >>= bb->shift;
        sectors = next - s;
    }

    write_seqlock_irqsave(&bb->lock, flags);

    p = bb->page;
    lo = 0;
    hi = bb->count;
    /* Find the last range that starts at-or-before 's' */
    while (hi - lo > 1) {
        int mid = (lo + hi) / 2;
        sector_t a = BB_OFFSET(p[mid]);

        if (a <= s)
            lo = mid;
        else
            hi = mid;
    }
    if (hi > lo && BB_OFFSET(p[lo]) > s)
        hi = lo;

    if (hi > lo) {
        /* we found a range that might merge with the start
         * of our new range
         */
        sector_t a = BB_OFFSET(p[lo]);
        sector_t e = a + BB_LEN(p[lo]);
        int ack = BB_ACK(p[lo]);

        if (e >= s) {
            /* Yes, we can merge with a previous range */
            if (s == a && s + sectors >= e)
                /* new range covers old */
                ack = acknowledged;
            else
                ack = ack && acknowledged;

            if (e < s + sectors)
                e = s + sectors;
            if (e - a <= BB_MAX_LEN) {
                p[lo] = BB_MAKE(a, e-a, ack);
                s = e;
            } else {
                /* does not all fit in one range,
                 * make p[lo] maximal
                 */
                if (BB_LEN(p[lo]) != BB_MAX_LEN)
                    p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
                s = a + BB_MAX_LEN;
            }
            sectors = e - s;
        }
    }
    if (sectors && hi < bb->count) {
        /* 'hi' points to the first range that starts after 's'.
         * Maybe we can merge with the start of that range
         */
        sector_t a = BB_OFFSET(p[hi]);
        sector_t e = a + BB_LEN(p[hi]);
        int ack = BB_ACK(p[hi]);

        if (a <= s + sectors) {
            /* merging is possible */
            if (e <= s + sectors) {
                /* full overlap */
                e = s + sectors;
                ack = acknowledged;
            } else
                ack = ack && acknowledged;

            a = s;
            if (e - a <= BB_MAX_LEN) {
                p[hi] = BB_MAKE(a, e-a, ack);
                s = e;
            } else {
                p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
                s = a + BB_MAX_LEN;
            }
            sectors = e - s;
            lo = hi;
            hi++;
        }
    }
    if (sectors == 0 && hi < bb->count) {
        /* we might be able to combine lo and hi */
        /* Note: 's' is at the end of 'lo' */
        sector_t a = BB_OFFSET(p[hi]);
        int lolen = BB_LEN(p[lo]);
        int hilen = BB_LEN(p[hi]);
        int newlen = lolen + hilen - (s - a);

        if (s >= a && newlen < BB_MAX_LEN) {
            /* yes, we can combine them */
            int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);

            p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
            memmove(p + hi, p + hi + 1,
                (bb->count - hi - 1) * 8);
            bb->count--;
        }
    }
    while (sectors) {
        /* didn't merge it all.
         * Need to add a range just before 'hi'
         */
        if (bb->count >= MAX_BADBLOCKS) {
            /* No room for more */
            rv = 1;
            break;
        } else {
            int this_sectors = sectors;

            memmove(p + hi + 1, p + hi,
                (bb->count - hi) * 8);
            bb->count++;

            if (this_sectors > BB_MAX_LEN)
                this_sectors = BB_MAX_LEN;
            p[hi] = BB_MAKE(s, this_sectors, acknowledged);
            sectors -= this_sectors;
            s += this_sectors;
        }
    }

    bb->changed = 1;
    if (!acknowledged)
        bb->unacked_exist = 1;
    else
        badblocks_update_acked(bb);
    write_sequnlock_irqrestore(&bb->lock, flags);

    return rv;
}
EXPORT_SYMBOL_GPL(badblocks_set);
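
/*
 * Behaviour sketch for badblocks_set() (illustrative values, added for
 * clarity; not part of the original file): with an empty table,
 * badblocks_set(bb, 100, 20, 0) records one entry {offset 100, len 20,
 * unacked}.  A later badblocks_set(bb, 110, 30, 1) overlaps it, and the
 * two merge into a single entry {offset 100, len 40, unacked}, because a
 * merged range is acknowledged only if both inputs were.
 */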

/**
 * badblocks_clear() - Remove a range of bad blocks from the table.
 * @bb:     the badblocks structure that holds all badblock information
 * @s:      first sector to clear
 * @sectors:    number of sectors to clear
 *
 * This may involve extending the table if we split a region.
 * If a split would overflow the table, the clear request is
 * abandoned and -ENOSPC is returned.
 *
 * Return:
 *  0: success
 *  -ENOSPC: failed to clear badblocks (no room to split a range)
 */
int badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
{
    u64 *p;
    int lo, hi;
    sector_t target = s + sectors;
    int rv = 0;

    if (bb->shift > 0) {
        /* When clearing we round the start up and the end down.
         * This should not matter as the shift should align with
         * the block size and no rounding should ever be needed.
         * However it is better to think a block is bad when it
         * isn't than to think a block is not bad when it is.
         */
        s += (1<<bb->shift) - 1;
        s >>= bb->shift;
        target >>= bb->shift;
        sectors = target - s;
    }

    write_seqlock_irq(&bb->lock);

    p = bb->page;
    lo = 0;
    hi = bb->count;
    /* Find the last range that starts before 'target' */
    while (hi - lo > 1) {
        int mid = (lo + hi) / 2;
        sector_t a = BB_OFFSET(p[mid]);

        if (a < target)
            lo = mid;
        else
            hi = mid;
    }
    if (hi > lo) {
        /* p[lo] is the last range that could overlap the
         * current range.  Earlier ranges could also overlap,
         * but only this one can overlap the end of the range.
         */
        if ((BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) &&
            (BB_OFFSET(p[lo]) < target)) {
            /* Partial overlap, leave the tail of this range */
            int ack = BB_ACK(p[lo]);
            sector_t a = BB_OFFSET(p[lo]);
            sector_t end = a + BB_LEN(p[lo]);

            if (a < s) {
                /* we need to split this range */
                if (bb->count >= MAX_BADBLOCKS) {
                    rv = -ENOSPC;
                    goto out;
                }
                memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
                bb->count++;
                p[lo] = BB_MAKE(a, s-a, ack);
                lo++;
            }
            p[lo] = BB_MAKE(target, end - target, ack);
            /* there is no longer an overlap */
            hi = lo;
            lo--;
        }
        while (lo >= 0 &&
               (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) &&
               (BB_OFFSET(p[lo]) < target)) {
            /* This range does overlap */
            if (BB_OFFSET(p[lo]) < s) {
                /* Keep the early parts of this range. */
                int ack = BB_ACK(p[lo]);
                sector_t start = BB_OFFSET(p[lo]);

                p[lo] = BB_MAKE(start, s - start, ack);
                /* now p[lo] doesn't overlap, so.. */
                break;
            }
            lo--;
        }
        /* 'lo' is strictly before, 'hi' is strictly after,
         * anything between needs to be discarded
         */
        if (hi - lo > 1) {
            memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
            bb->count -= (hi - lo - 1);
        }
    }

    badblocks_update_acked(bb);
    bb->changed = 1;
out:
    write_sequnlock_irq(&bb->lock);
    return rv;
}
EXPORT_SYMBOL_GPL(badblocks_clear);
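
/*
 * Behaviour sketch for badblocks_clear() (illustrative values, added for
 * clarity; not part of the original file): clearing the middle of a
 * recorded range splits it.  With one entry {offset 100, len 40},
 * badblocks_clear(bb, 110, 10) leaves two entries, {100, 10} and
 * {120, 20}.  If the table is already full, the split is refused and
 * -ENOSPC is returned.
 */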

/**
 * ack_all_badblocks() - Acknowledge all bad blocks in a list.
 * @bb:     the badblocks structure that holds all badblock information
 *
 * This only succeeds if ->changed is clear.  It is used by
 * in-kernel metadata updates.
 */
void ack_all_badblocks(struct badblocks *bb)
{
    if (bb->page == NULL || bb->changed)
        /* no point even trying */
        return;
    write_seqlock_irq(&bb->lock);

    if (bb->changed == 0 && bb->unacked_exist) {
        u64 *p = bb->page;
        int i;

        for (i = 0; i < bb->count ; i++) {
            if (!BB_ACK(p[i])) {
                sector_t start = BB_OFFSET(p[i]);
                int len = BB_LEN(p[i]);

                p[i] = BB_MAKE(start, len, 1);
            }
        }
        bb->unacked_exist = 0;
    }
    write_sequnlock_irq(&bb->lock);
}
EXPORT_SYMBOL_GPL(ack_all_badblocks);
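
/*
 * Typical sequence (a sketch assuming an md-style metadata writer; not
 * part of the original file): record new bad blocks as unacknowledged,
 * persist the table to on-disk metadata, clear ->changed once the write
 * completes, then acknowledge everything:
 *
 *  badblocks_set(bb, s, n, 0);     // unacked until persisted
 *  // ... write bb->page contents to stable metadata ...
 *  bb->changed = 0;
 *  ack_all_badblocks(bb);
 */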

/**
 * badblocks_show() - sysfs access to bad-blocks list
 * @bb:     the badblocks structure that holds all badblock information
 * @page:   buffer to fill with the formatted badblocks list
 * @unack:  whether to show only unacknowledged badblocks
 *
 * Return:
 *  Length of returned data
 */
ssize_t badblocks_show(struct badblocks *bb, char *page, int unack)
{
    size_t len;
    int i;
    u64 *p = bb->page;
    unsigned seq;

    if (bb->shift < 0)
        return 0;

retry:
    seq = read_seqbegin(&bb->lock);

    len = 0;
    i = 0;

    while (len < PAGE_SIZE && i < bb->count) {
        sector_t s = BB_OFFSET(p[i]);
        unsigned int length = BB_LEN(p[i]);
        int ack = BB_ACK(p[i]);

        i++;

        if (unack && ack)
            continue;

        len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n",
                (unsigned long long)s << bb->shift,
                length << bb->shift);
    }
    if (unack && len == 0)
        bb->unacked_exist = 0;

    if (read_seqretry(&bb->lock, seq))
        goto retry;

    return len;
}
EXPORT_SYMBOL_GPL(badblocks_show);
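
/*
 * Output format sketch (added for clarity; not part of the original
 * file): one line per range, "<start> <length>", with the stored values
 * scaled back up by bb->shift, e.g.:
 *
 *  1024 8
 *  2048 16
 */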

/**
 * badblocks_store() - sysfs access to bad-blocks list
 * @bb:     the badblocks structure that holds all badblock information
 * @page:   buffer received from sysfs
 * @len:    length of data received from sysfs
 * @unack:  whether the newly set badblocks should remain unacknowledged
 *
 * Return:
 *  Length of the buffer processed or -ve error.
 */
ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len,
            int unack)
{
    unsigned long long sector;
    int length;
    char newline;

    switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
    case 3:
        if (newline != '\n')
            return -EINVAL;
        /* fall through */
    case 2:
        if (length <= 0)
            return -EINVAL;
        break;
    default:
        return -EINVAL;
    }

    if (badblocks_set(bb, sector, length, !unack))
        return -ENOSPC;
    else
        return len;
}
EXPORT_SYMBOL_GPL(badblocks_store);
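
/*
 * Input format sketch (added for clarity; not part of the original
 * file): "<start> <length>", optionally newline-terminated.  Writing
 * "1024 8\n" marks sectors 1024..1031 bad, acknowledged when @unack
 * is zero.
 */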

static int __badblocks_init(struct device *dev, struct badblocks *bb,
        int enable)
{
    bb->dev = dev;
    bb->count = 0;
    if (enable)
        bb->shift = 0;
    else
        bb->shift = -1;
    if (dev)
        bb->page = devm_kzalloc(dev, PAGE_SIZE, GFP_KERNEL);
    else
        bb->page = kzalloc(PAGE_SIZE, GFP_KERNEL);
    if (!bb->page) {
        bb->shift = -1;
        return -ENOMEM;
    }
    seqlock_init(&bb->lock);

    return 0;
}

/**
 * badblocks_init() - initialize the badblocks structure
 * @bb:     the badblocks structure that holds all badblock information
 * @enable: whether to enable badblocks accounting
 *
 * Return:
 *  0: success
 *  -ve errno: on error
 */
int badblocks_init(struct badblocks *bb, int enable)
{
    return __badblocks_init(NULL, bb, enable);
}
EXPORT_SYMBOL_GPL(badblocks_init);

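/*
 * Device-managed variant of badblocks_init() (descriptive comment added
 * for clarity; the original file documents this function only by name):
 * the table page is allocated with devm_kzalloc() and is freed
 * automatically when @dev is released.  Accounting is always enabled.
 */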
int devm_init_badblocks(struct device *dev, struct badblocks *bb)
{
    if (!bb)
        return -EINVAL;
    return __badblocks_init(dev, bb, 1);
}
EXPORT_SYMBOL_GPL(devm_init_badblocks);

/**
 * badblocks_exit() - free the badblocks structure
 * @bb:     the badblocks structure that holds all badblock information
 */
void badblocks_exit(struct badblocks *bb)
{
    if (!bb)
        return;
    if (bb->dev)
        devm_kfree(bb->dev, bb->page);
    else
        kfree(bb->page);
    bb->page = NULL;
}
EXPORT_SYMBOL_GPL(badblocks_exit);
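
/*
 * Lifecycle sketch (hypothetical caller, added for illustration; not
 * part of the original file):
 *
 *  struct badblocks bb;
 *
 *  if (badblocks_init(&bb, 1))         // 1 = enable accounting
 *      return -ENOMEM;
 *  badblocks_set(&bb, 512, 8, 1);      // sectors 512..519 bad, acked
 *  // ... use badblocks_check()/badblocks_show() ...
 *  badblocks_exit(&bb);
 */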