block/drbd/drbd_vli.h

0001 /* SPDX-License-Identifier: GPL-2.0-or-later */
0002 /*
0003 -*- linux-c -*-
0004    drbd_receiver.c
0005    This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
0006
0007    Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
0008    Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
0009    Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
0010
0011  */
0012
0013 #ifndef _DRBD_VLI_H
0014 #define _DRBD_VLI_H
0015
0016 /*
0017  * At a granularity of 4KiB storage represented per bit,
0018  * and stroage sizes of several TiB,
0019  * and possibly small-bandwidth replication,
0020  * the bitmap transfer time can take much too long,
0021  * if transmitted in plain text.
0022  *
0023  * We try to reduce the transferred bitmap information
0024  * by encoding runlengths of bit polarity.
0025  *
0026  * We never actually need to encode a "zero" (runlengths are positive).
0027  * But then we have to store the value of the first bit.
0028  * The first bit of information thus shall encode if the first runlength
0029  * gives the number of set or unset bits.
0030  *
0031  * We assume that large areas are either completely set or unset,
0032  * which gives good compression with any runlength method,
0033  * even when encoding the runlength as fixed size 32bit/64bit integers.
0034  *
0035  * Still, there may be areas where the polarity flips every few bits,
0036  * and encoding the runlength sequence of those areas with fix size
0037  * integers would be much worse than plaintext.
0038  *
0039  * We want to encode small runlength values with minimum code length,
0040  * while still being able to encode a Huge run of all zeros.
0041  *
0042  * Thus we need a Variable Length Integer encoding, VLI.
0043  *
0044  * For some cases, we produce more code bits than plaintext input.
0045  * We need to send incompressible chunks as plaintext, skip over them
0046  * and then see if the next chunk compresses better.
0047  *
0048  * We don't care too much about "excellent" compression ratio for large
0049  * runlengths (all set/all clear): whether we achieve a factor of 100
0050  * or 1000 is not that much of an issue.
0051  * We do not want to waste too much on short runlengths in the "noisy"
0052  * parts of the bitmap, though.
0053  *
0054  * There are endless variants of VLI, we experimented with:
0055  *  * simple byte-based
0056  *  * various bit based with different code word length.
0057  *
0058  * To avoid yet an other configuration parameter (choice of bitmap compression
0059  * algorithm) which was difficult to explain and tune, we just chose the one
0060  * variant that turned out best in all test cases.
0061  * Based on real world usage patterns, with device sizes ranging from a few GiB
0062  * to several TiB, file server/mailserver/webserver/mysql/postgress,
0063  * mostly idle to really busy, the all time winner (though sometimes only
0064  * marginally better) is:
0065  */
0066
0067 /*
0068  * encoding is "visualised" as
0069  * __little endian__ bitstream, least significant bit first (left most)
0070  *
0071  * this particular encoding is chosen so that the prefix code
0072  * starts as unary encoding the level, then modified so that
0073  * 10 levels can be described in 8bit, with minimal overhead
0074  * for the smaller levels.
0075  *
0076  * Number of data bits follow fibonacci sequence, with the exception of the
0077  * last level (+1 data bit, so it makes 64bit total).  The only worse code when
0078  * encoding bit polarity runlength is 1 plain bits => 2 code bits.
0079 prefix    data bits                                    max val  Nº data bits
0080 0 x                                                         0x2            1
0081 10 x                                                        0x4            1
0082 110 xx                                                      0x8            2
0083 1110 xxx                                                   0x10            3
0084 11110 xxx xx                                               0x30            5
0085 111110 xx xxxxxx                                          0x130            8
0086 11111100  xxxxxxxx xxxxx                                 0x2130           13
0087 11111110  xxxxxxxx xxxxxxxx xxxxx                      0x202130           21
0088 11111101  xxxxxxxx xxxxxxxx xxxxxxxx  xxxxxxxx xx   0x400202130           34
0089 11111111  xxxxxxxx xxxxxxxx xxxxxxxx  xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56
0090  * maximum encodable value: 0x100000400202130 == 2**56 + some */
0091
0092 /* compression "table":
0093  transmitted   x                                0.29
0094  as plaintext x                                  ........................
0095              x                                   ........................
0096             x                                    ........................
0097            x    0.59                         0.21........................
0098           x      ........................................................
0099          x       .. c ...................................................
0100         x    0.44.. o ...................................................
0101        x .......... d ...................................................
0102       x  .......... e ...................................................
0103      X.............   ...................................................
0104     x.............. b ...................................................
0105 2.0x............... i ...................................................
0106  #X................ t ...................................................
0107  #................. s ...........................  plain bits  ..........
0108 -+-----------------------------------------------------------------------
0109  1             16              32                              64
0110 */
0111
0112 /* LEVEL: (total bits, prefix bits, prefix value),
0113  * sorted ascending by number of total bits.
0114  * The rest of the code table is calculated at compiletime from this. */
0115
0116 /* fibonacci data 1, 1, ... */
0117 #define VLI_L_1_1() do { \
0118     LEVEL( 2, 1, 0x00); \
0119     LEVEL( 3, 2, 0x01); \
0120     LEVEL( 5, 3, 0x03); \
0121     LEVEL( 7, 4, 0x07); \
0122     LEVEL(10, 5, 0x0f); \
0123     LEVEL(14, 6, 0x1f); \
0124     LEVEL(21, 8, 0x3f); \
0125     LEVEL(29, 8, 0x7f); \
0126     LEVEL(42, 8, 0xbf); \
0127     LEVEL(64, 8, 0xff); \
0128     } while (0)
0129
0130 /* finds a suitable level to decode the least significant part of in.
0131  * returns number of bits consumed.
0132  *
0133  * BUG() for bad input, as that would mean a buggy code table. */
0134 static inline int vli_decode_bits(u64 *out, const u64 in)
0135 {
0136     u64 adj = 1;
0137
0138 #define LEVEL(t,b,v)                    \
0139     do {                        \
0140         if ((in & ((1 << b) -1)) == v) {    \
0141             *out = ((in & ((~0ULL) >> (64-t))) >> b) + adj; \
0142             return t;           \
0143         }                   \
0144         adj += 1ULL << (t - b);         \
0145     } while (0)
0146
0147     VLI_L_1_1();
0148
0149     /* NOT REACHED, if VLI_LEVELS code table is defined properly */
0150     BUG();
0151 #undef LEVEL
0152 }
0153
0154 /* return number of code bits needed,
0155  * or negative error number */
0156 static inline int __vli_encode_bits(u64 *out, const u64 in)
0157 {
0158     u64 max = 0;
0159     u64 adj = 1;
0160
0161     if (in == 0)
0162         return -EINVAL;
0163
0164 #define LEVEL(t,b,v) do {       \
0165         max += 1ULL << (t - b); \
0166         if (in <= max) {    \
0167             if (out)    \
0168                 *out = ((in - adj) << b) | v;   \
0169             return t;   \
0170         }           \
0171         adj = max + 1;      \
0172     } while (0)
0173
0174     VLI_L_1_1();
0175
0176     return -EOVERFLOW;
0177 #undef LEVEL
0178 }
0179
0180 #undef VLI_L_1_1
0181
0182 /* code from here down is independend of actually used bit code */
0183
0184 /*
0185  * Code length is determined by some unique (e.g. unary) prefix.
0186  * This encodes arbitrary bit length, not whole bytes: we have a bit-stream,
0187  * not a byte stream.
0188  */
0189
0190 /* for the bitstream, we need a cursor */
0191 struct bitstream_cursor {
0192     /* the current byte */
0193     u8 *b;
0194     /* the current bit within *b, nomalized: 0..7 */
0195     unsigned int bit;
0196 };
0197
0198 /* initialize cursor to point to first bit of stream */
0199 static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s)
0200 {
0201     cur->b = s;
0202     cur->bit = 0;
0203 }
0204
0205 /* advance cursor by that many bits; maximum expected input value: 64,
0206  * but depending on VLI implementation, it may be more. */
0207 static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits)
0208 {
0209     bits += cur->bit;
0210     cur->b = cur->b + (bits >> 3);
0211     cur->bit = bits & 7;
0212 }
0213
0214 /* the bitstream itself knows its length */
0215 struct bitstream {
0216     struct bitstream_cursor cur;
0217     unsigned char *buf;
0218     size_t buf_len;     /* in bytes */
0219
0220     /* for input stream:
0221      * number of trailing 0 bits for padding
0222      * total number of valid bits in stream: buf_len * 8 - pad_bits */
0223     unsigned int pad_bits;
0224 };
0225
0226 static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits)
0227 {
0228     bs->buf = s;
0229     bs->buf_len = len;
0230     bs->pad_bits = pad_bits;
0231     bitstream_cursor_reset(&bs->cur, bs->buf);
0232 }
0233
0234 static inline void bitstream_rewind(struct bitstream *bs)
0235 {
0236     bitstream_cursor_reset(&bs->cur, bs->buf);
0237     memset(bs->buf, 0, bs->buf_len);
0238 }
0239
0240 /* Put (at most 64) least significant bits of val into bitstream, and advance cursor.
0241  * Ignores "pad_bits".
0242  * Returns zero if bits == 0 (nothing to do).
0243  * Returns number of bits used if successful.
0244  *
0245  * If there is not enough room left in bitstream,
0246  * leaves bitstream unchanged and returns -ENOBUFS.
0247  */
0248 static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits)
0249 {
0250     unsigned char *b = bs->cur.b;
0251     unsigned int tmp;
0252
0253     if (bits == 0)
0254         return 0;
0255
0256     if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len)
0257         return -ENOBUFS;
0258
0259     /* paranoia: strip off hi bits; they should not be set anyways. */
0260     if (bits < 64)
0261         val &= ~0ULL >> (64 - bits);
0262
0263     *b++ |= (val & 0xff) << bs->cur.bit;
0264
0265     for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8)
0266         *b++ |= (val >> tmp) & 0xff;
0267
0268     bitstream_cursor_advance(&bs->cur, bits);
0269     return bits;
0270 }
0271
0272 /* Fetch (at most 64) bits from bitstream into *out, and advance cursor.
0273  *
0274  * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged.
0275  *
0276  * If there are less than the requested number of valid bits left in the
0277  * bitstream, still fetches all available bits.
0278  *
0279  * Returns number of actually fetched bits.
0280  */
0281 static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits)
0282 {
0283     u64 val;
0284     unsigned int n;
0285
0286     if (bits > 64)
0287         return -EINVAL;
0288
0289     if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len)
0290         bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3)
0291             - bs->cur.bit - bs->pad_bits;
0292
0293     if (bits == 0) {
0294         *out = 0;
0295         return 0;
0296     }
0297
0298     /* get the high bits */
0299     val = 0;
0300     n = (bs->cur.bit + bits + 7) >> 3;
0301     /* n may be at most 9, if cur.bit + bits > 64 */
0302     /* which means this copies at most 8 byte */
0303     if (n) {
0304         memcpy(&val, bs->cur.b+1, n - 1);
0305         val = le64_to_cpu(val) << (8 - bs->cur.bit);
0306     }
0307
0308     /* we still need the low bits */
0309     val |= bs->cur.b[0] >> bs->cur.bit;
0310
0311     /* and mask out bits we don't want */
0312     val &= ~0ULL >> (64 - bits);
0313
0314     bitstream_cursor_advance(&bs->cur, bits);
0315     *out = val;
0316
0317     return bits;
0318 }
0319
0320 /* encodes @in as vli into @bs;
0321
0322  * return values
0323  *  > 0: number of bits successfully stored in bitstream
0324  * -ENOBUFS @bs is full
0325  * -EINVAL input zero (invalid)
0326  * -EOVERFLOW input too large for this vli code (invalid)
0327  */
0328 static inline int vli_encode_bits(struct bitstream *bs, u64 in)
0329 {
0330     u64 code = code;
0331     int bits = __vli_encode_bits(&code, in);
0332
0333     if (bits <= 0)
0334         return bits;
0335
0336     return bitstream_put_bits(bs, code, bits);
0337 }
0338
0339 #endif