/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
   -*- linux-c -*-
   drbd_vli.h

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

 */

#ifndef _DRBD_VLI_H
#define _DRBD_VLI_H

/*
 * At a granularity of 4KiB storage represented per bit,
 * and storage sizes of several TiB,
 * and possibly small-bandwidth replication,
 * the bitmap transfer time can take much too long,
 * if transmitted in plain text.
 *
 * We try to reduce the transferred bitmap information
 * by encoding runlengths of bit polarity.
 *
 * We never actually need to encode a "zero" (runlengths are positive).
 * But then we have to store the value of the first bit.
 * The first bit of information thus shall encode whether the first runlength
 * gives the number of set or unset bits.
 *
 * We assume that large areas are either completely set or unset,
 * which gives good compression with any runlength method,
 * even when encoding the runlength as fixed size 32bit/64bit integers.
 *
 * Still, there may be areas where the polarity flips every few bits,
 * and encoding the runlength sequence of those areas with fixed size
 * integers would be much worse than plaintext.
 *
 * We want to encode small runlength values with minimum code length,
 * while still being able to encode a huge run of all zeros.
 *
 * Thus we need a Variable Length Integer encoding, VLI.
 *
 * For some cases, we produce more code bits than plaintext input.
 * We need to send incompressible chunks as plaintext, skip over them
 * and then see if the next chunk compresses better.
 *
 * We don't care too much about "excellent" compression ratio for large
 * runlengths (all set/all clear): whether we achieve a factor of 100
 * or 1000 is not that much of an issue.
 * We do not want to waste too much on short runlengths in the "noisy"
 * parts of the bitmap, though.
 *
 * There are endless variants of VLI; we experimented with:
 *  * simple byte-based codes
 *  * various bit-based codes with different code word lengths.
 *
 * To avoid yet another configuration parameter (choice of bitmap compression
 * algorithm) which was difficult to explain and tune, we just chose the one
 * variant that turned out best in all test cases.
 * Based on real world usage patterns, with device sizes ranging from a few GiB
 * to several TiB, file server/mailserver/webserver/mysql/postgres,
 * mostly idle to really busy, the all-time winner (though sometimes only
 * marginally better) is:
 */

/*
 * encoding is "visualised" as a
 * __little endian__ bitstream, least significant bit first (leftmost)
 *
 * this particular encoding is chosen so that the prefix code
 * starts as a unary encoding of the level, then modified so that
 * 10 levels can be described in 8 bit, with minimal overhead
 * for the smaller levels.
 *
 * The number of data bits follows the Fibonacci sequence, with the exception
 * of the last level (+1 data bit, so it makes 64 bit total).  The only code
 * worse than plaintext when encoding bit polarity runlengths is
 * 1 plain bit => 2 code bits.
 prefix    data bits                                                      max val  # data bits
 0         x                                                                  0x2            1
 10        x                                                                  0x4            1
 110       xx                                                                 0x8            2
 1110      xxx                                                               0x10            3
 11110     xxx xx                                                            0x30            5
 111110    xx xxxxxx                                                        0x130            8
 11111100  xxxxxxxx xxxxx                                                  0x2130           13
 11111110  xxxxxxxx xxxxxxxx xxxxx                                       0x202130           21
 11111101  xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xx                     0x400202130           34
 11111111  xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx                   56
 * maximum encodable value: 0x100000400202130 == 2**56 + some */
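/*
 * For illustration, a few concrete code words from the table above
 * (bits written least significant first, prefix then data):
 *   value 1:   first level, prefix "0", data "0"             ->  2 code bits
 *   value 5:   third level (levels one and two cover 1..4),
 *              prefix "110", data "00"                        ->  5 code bits
 *   value 305: smallest value of the 13-data-bit level,
 *              prefix "11111100", 13 zero data bits           -> 21 code bits
 */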
/* compression "table" (an ASCII chart in the original): ratio of transmitted
 * code bits to plain input bits over run lengths 1..64.  A run length of 1 is
 * the worst case at 2.0x the plaintext; the annotated ratios drop towards
 * 0.2 .. 0.6 for longer runs. */

/* LEVEL: (total bits, prefix bits, prefix value),
 * sorted ascending by number of total bits.
 * The rest of the code table is calculated at compiletime from this. */

/* fibonacci data 1, 1, ... */
#define VLI_L_1_1() do { \
        LEVEL( 2, 1, 0x00); \
        LEVEL( 3, 2, 0x01); \
        LEVEL( 5, 3, 0x03); \
        LEVEL( 7, 4, 0x07); \
        LEVEL(10, 5, 0x0f); \
        LEVEL(14, 6, 0x1f); \
        LEVEL(21, 8, 0x3f); \
        LEVEL(29, 8, 0x7f); \
        LEVEL(42, 8, 0xbf); \
        LEVEL(64, 8, 0xff); \
        } while (0)

/* finds a suitable level to decode the least significant part of in.
 * returns number of bits consumed.
 *
 * BUG() for bad input, as that would mean a buggy code table. */
static inline int vli_decode_bits(u64 *out, const u64 in)
{
        u64 adj = 1;

#define LEVEL(t,b,v)                                            \
        do {                                                    \
                if ((in & ((1 << b) - 1)) == v) {               \
                        *out = ((in & ((~0ULL) >> (64 - t))) >> b) + adj; \
                        return t;                               \
                }                                               \
                adj += 1ULL << (t - b);                         \
        } while (0)

        VLI_L_1_1();

        /* NOT REACHED, if VLI_LEVELS code table is defined properly */
        BUG();
#undef LEVEL
}

/* return number of code bits needed,
 * or negative error number */
static inline int __vli_encode_bits(u64 *out, const u64 in)
{
        u64 max = 0;
        u64 adj = 1;

        if (in == 0)
                return -EINVAL;

#define LEVEL(t,b,v) do {                                       \
                max += 1ULL << (t - b);                         \
                if (in <= max) {                                \
                        if (out)                                \
                                *out = ((in - adj) << b) | v;   \
                        return t;                               \
                }                                               \
                adj = max + 1;                                  \
        } while (0)

        VLI_L_1_1();

        return -EOVERFLOW;
#undef LEVEL
}

#undef VLI_L_1_1
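/* Illustrative sketch, not used anywhere: encode one run length with the
 * helpers above and decode the resulting code word again.  The function name
 * and the value are made up for this example. */
static inline void vli_example_roundtrip(void)
{
        u64 code = 0, val = 0;
        int bits;

        /* 305 is the smallest value of the 21 bit level (prefix value 0x3f) */
        bits = __vli_encode_bits(&code, 305);
        if (bits > 0)
                /* decoding the low 'bits' bits of 'code' yields 305 and 21 again */
                bits = vli_decode_bits(&val, code);
}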
/* code from here down is independent of the actually used bit code */

/*
 * Code length is determined by some unique (e.g. unary) prefix.
 * This encodes arbitrary bit lengths, not whole bytes: we have a bit-stream,
 * not a byte stream.
 */

/* for the bitstream, we need a cursor */
struct bitstream_cursor {
        /* the current byte */
        u8 *b;
        /* the current bit within *b, normalized: 0..7 */
        unsigned int bit;
};

/* initialize cursor to point to first bit of stream */
static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s)
{
        cur->b = s;
        cur->bit = 0;
}

/* advance cursor by that many bits; maximum expected input value: 64,
 * but depending on VLI implementation, it may be more. */
static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits)
{
        bits += cur->bit;
        cur->b = cur->b + (bits >> 3);
        cur->bit = bits & 7;
}

/* the bitstream itself knows its length */
struct bitstream {
        struct bitstream_cursor cur;
        unsigned char *buf;
        size_t buf_len;         /* in bytes */

        /* for input stream:
         * number of trailing 0 bits for padding
         * total number of valid bits in stream: buf_len * 8 - pad_bits */
        unsigned int pad_bits;
};

static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits)
{
        bs->buf = s;
        bs->buf_len = len;
        bs->pad_bits = pad_bits;
        bitstream_cursor_reset(&bs->cur, bs->buf);
}

static inline void bitstream_rewind(struct bitstream *bs)
{
        bitstream_cursor_reset(&bs->cur, bs->buf);
        memset(bs->buf, 0, bs->buf_len);
}

/* Put (at most 64) least significant bits of val into the bitstream, and advance the cursor.
 * Ignores "pad_bits".
 * Returns zero if bits == 0 (nothing to do).
 * Returns the number of bits used if successful.
 *
 * If there is not enough room left in the bitstream,
 * leaves the bitstream unchanged and returns -ENOBUFS.
 */
static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits)
{
        unsigned char *b = bs->cur.b;
        unsigned int tmp;

        if (bits == 0)
                return 0;

        if ((bs->cur.b + ((bs->cur.bit + bits - 1) >> 3)) - bs->buf >= bs->buf_len)
                return -ENOBUFS;

        /* paranoia: strip off hi bits; they should not be set anyway. */
        if (bits < 64)
                val &= ~0ULL >> (64 - bits);

        *b++ |= (val & 0xff) << bs->cur.bit;

        for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8)
                *b++ |= (val >> tmp) & 0xff;

        bitstream_cursor_advance(&bs->cur, bits);
        return bits;
}
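/* Illustrative sketch, not used anywhere: pack two small bit fields into a
 * freshly zeroed buffer.  The values are made up; they only show how the
 * cursor advances within a byte. */
static inline void bitstream_example_pack(void)
{
        unsigned char buf[8];
        struct bitstream bs;

        bitstream_init(&bs, buf, sizeof(buf), 0);
        bitstream_rewind(&bs);                  /* put_bits ORs into the buffer, so zero it first */

        bitstream_put_bits(&bs, 0x3, 5);        /* buf[0] == 0x03, cursor now at bit 5 */
        bitstream_put_bits(&bs, 0x1, 2);        /* buf[0] == 0x23, cursor now at bit 7 */
}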
/* Fetch (at most 64) bits from the bitstream into *out, and advance the cursor.
 *
 * If more than 64 bits are requested, returns -EINVAL and leaves *out unchanged.
 *
 * If there are fewer than the requested number of valid bits left in the
 * bitstream, it still fetches all available bits.
 *
 * Returns the number of actually fetched bits.
 */
static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits)
{
        u64 val;
        unsigned int n;

        if (bits > 64)
                return -EINVAL;

        if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits - 1) >> 3) - bs->buf >= bs->buf_len)
                bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3)
                        - bs->cur.bit - bs->pad_bits;

        if (bits == 0) {
                *out = 0;
                return 0;
        }

        /* get the high bits */
        val = 0;
        n = (bs->cur.bit + bits + 7) >> 3;
        /* n may be at most 9, if cur.bit + bits > 64 */
        /* which means this copies at most 8 bytes */
        if (n) {
                memcpy(&val, bs->cur.b + 1, n - 1);
                val = le64_to_cpu(val) << (8 - bs->cur.bit);
        }

        /* we still need the low bits */
        val |= bs->cur.b[0] >> bs->cur.bit;

        /* and mask out bits we don't want */
        val &= ~0ULL >> (64 - bits);

        bitstream_cursor_advance(&bs->cur, bits);
        *out = val;

        return bits;
}

/* encodes @in as vli into @bs;
 *
 * return values
 *  > 0: number of bits successfully stored in bitstream
 * -ENOBUFS	@bs is full
 * -EINVAL	input zero (invalid)
 * -EOVERFLOW	input too large for this vli code (invalid)
 */
static inline int vli_encode_bits(struct bitstream *bs, u64 in)
{
        u64 code;
        int bits = __vli_encode_bits(&code, in);

        if (bits <= 0)
                return bits;

        return bitstream_put_bits(bs, code, bits);
}

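/* Illustrative sketch, not used anywhere: compress a short, made-up sequence
 * of run lengths into a small buffer.  On -ENOBUFS the caller would send the
 * chunk as plaintext instead, as described at the top of this file. */
static inline int vli_example_encode_runs(void)
{
        static const u64 runs[] = { 1024, 3, 2, 7, 65536 };
        unsigned char buf[16];
        struct bitstream bs;
        unsigned int i;
        int bits;

        bitstream_init(&bs, buf, sizeof(buf), 0);
        bitstream_rewind(&bs);  /* zero the buffer, put_bits ORs bits in */

        for (i = 0; i < sizeof(runs) / sizeof(runs[0]); i++) {
                bits = vli_encode_bits(&bs, runs[i]);
                if (bits < 0)
                        return bits;
        }
        return 0;
}

#endif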