Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0-or-later
0002 
0003 /* P9 gunzip sample code for demonstrating the P9 NX hardware
0004  * interface.  Not intended for production use or for performance or
0005  * compression ratio measurements.  Note also that /dev/crypto/gzip,
0006  * VAS and skiboot support are required.
0007  *
0008  * Copyright 2020 IBM Corp.
0009  *
0010  * Author: Bulent Abali <abali@us.ibm.com>
0011  *
0012  * https://github.com/libnxz/power-gzip for zlib api and other utils
0013  * Definitions of acronyms used here.  See
0014  * P9 NX Gzip Accelerator User's Manual for details:
0015  * https://github.com/libnxz/power-gzip/blob/develop/doc/power_nx_gzip_um.pdf
0016  *
0017  * adler/crc: 32 bit checksums appended to stream tail
0018  * ce:       completion extension
0019  * cpb:      coprocessor parameter block (metadata)
0020  * crb:      coprocessor request block (command)
0021  * csb:      coprocessor status block (status)
0022  * dht:      dynamic huffman table
0023  * dde:      data descriptor element (address, length)
0024  * ddl:      list of ddes
0025  * dh/fh:    dynamic and fixed huffman types
0026  * fc:       coprocessor function code
0027  * histlen:  history/dictionary length
0028  * history:  sliding window of up to 32KB of data
0029  * lzcount:  Deflate LZ symbol counts
0030  * rembytecnt: remaining byte count
0031  * sfbt:     source final block type; last block's type during decomp
0032  * spbc:     source processed byte count
0033  * subc:     source unprocessed bit count
0034  * tebc:     target ending bit count; valid bits in the last byte
0035  * tpbc:     target processed byte count
0036  * vas:      virtual accelerator switch; the user mode interface
0037  */
0038 
0039 #define _ISOC11_SOURCE  // For aligned_alloc()
0040 #define _DEFAULT_SOURCE // For endian.h
0041 
0042 #include <stdio.h>
0043 #include <stdlib.h>
0044 #include <string.h>
0045 #include <unistd.h>
0046 #include <stdint.h>
0047 #include <sys/types.h>
0048 #include <sys/stat.h>
0049 #include <sys/time.h>
0050 #include <sys/fcntl.h>
0051 #include <sys/mman.h>
0052 #include <endian.h>
0053 #include <bits/endian.h>
0054 #include <sys/ioctl.h>
0055 #include <assert.h>
0056 #include <errno.h>
0057 #include <signal.h>
0058 #include "nxu.h"
0059 #include "nx.h"
0060 #include "crb.h"
0061 
/* Debug level and log stream globals (presumably consumed by the NXPRT
 * debug macro from the nx headers -- confirm against nxu.h).
 */
int nx_dbg;
FILE *nx_gzip_log;

/* Smaller and larger of two values.  Arguments may be evaluated twice. */
#define NX_MIN(X, Y) (((X) >= (Y)) ? (Y) : (X))
#define NX_MAX(X, Y) (((X) <= (Y)) ? (Y) : (X))

#define GETINPC(X) fgetc(X)
#define FNAME_MAX 1024

/*
 * Circular fifo bookkeeping.  A fifo is described by the offset of its
 * data head (cur), the number of bytes held (used) and its capacity
 * (len).  When cur + used exceeds len, the used region wraps around to
 * offset 0; the "first" part is the one starting at the higher offset
 * and the "last" part is the one starting near offset 0.
 */

/* Non-zero when the used region runs past the end of the buffer */
#define fifo_is_wrapped(cur, used, len)        (((cur)+(used)) > (len))

/* Total used and total free byte counts */
#define fifo_used_bytes(used) (used)
#define fifo_free_bytes(used, len) ((len)-(used))

/* Free byte counts in the first and last parts */
#define fifo_free_first_bytes(cur, used, len)  \
	(fifo_is_wrapped(cur, used, len) ? 0 : (len)-((cur)+(used)))
#define fifo_free_last_bytes(cur, used, len)   \
	(fifo_is_wrapped(cur, used, len) ? (len)-(used) : (cur))

/* Used byte counts in the first and last parts */
#define fifo_used_first_bytes(cur, used, len)  \
	(fifo_is_wrapped(cur, used, len) ? (len)-(cur) : (used))
#define fifo_used_last_bytes(cur, used, len)   \
	(fifo_is_wrapped(cur, used, len) ? ((used)+(cur))-(len) : 0)

/* Start offsets of the first and last free parts.  The last free part
 * begins right after the wrapped-around used bytes.
 */
#define fifo_free_first_offset(cur, used)      ((cur)+(used))
#define fifo_free_last_offset(cur, used, len)  \
	fifo_used_last_bytes(cur, used, len)

/* Start offsets of the first and last used parts */
#define fifo_used_first_offset(cur)            (cur)
#define fifo_used_last_offset(cur)             (0)

/* Buffer geometry, in bytes */
const int fifo_in_len = 1 << 24;	/* 16 MiB input fifo */
const int fifo_out_len = 1 << 24;	/* 16 MiB output fifo */
const int page_sz = 1 << 16;		/* 64 KiB */
const int line_sz = 1 << 7;		/* 128 bytes */
const int window_max = 1 << 15;		/* 32 KiB */
0097 
0098 /*
0099  * Adds an (address, len) pair to the list of ddes (ddl) and updates
0100  * the base dde.  ddl[0] is the only dde in a direct dde which
0101  * contains a single (addr,len) pair.  For more pairs, ddl[0] becomes
0102  * the indirect (base) dde that points to a list of direct ddes.
0103  * See Section 6.4 of the NX-gzip user manual for DDE description.
0104  * Addr=NULL, len=0 clears the ddl[0].  Returns the total number of
0105  * bytes in ddl.  Caller is responsible for allocting the array of
0106  * nx_dde_t *ddl.  If N addresses are required in the scatter-gather
0107  * list, the ddl array must have N+1 entries minimum.
0108  */
0109 static inline uint32_t nx_append_dde(struct nx_dde_t *ddl, void *addr,
0110                     uint32_t len)
0111 {
0112     uint32_t ddecnt;
0113     uint32_t bytes;
0114 
0115     if (addr == NULL && len == 0) {
0116         clearp_dde(ddl);
0117         return 0;
0118     }
0119 
0120     NXPRT(fprintf(stderr, "%d: %s addr %p len %x\n", __LINE__, addr,
0121             __func__, len));
0122 
0123     /* Number of ddes in the dde list ; == 0 when it is a direct dde */
0124     ddecnt = getpnn(ddl, dde_count);
0125     bytes = getp32(ddl, ddebc);
0126 
0127     if (ddecnt == 0 && bytes == 0) {
0128         /* First dde is unused; make it a direct dde */
0129         bytes = len;
0130         putp32(ddl, ddebc, bytes);
0131         putp64(ddl, ddead, (uint64_t) addr);
0132     } else if (ddecnt == 0) {
0133         /* Converting direct to indirect dde
0134          * ddl[0] becomes head dde of ddl
0135          * copy direct to indirect first.
0136          */
0137         ddl[1] = ddl[0];
0138 
0139         /* Add the new dde next */
0140         clear_dde(ddl[2]);
0141         put32(ddl[2], ddebc, len);
0142         put64(ddl[2], ddead, (uint64_t) addr);
0143 
0144         /* Ddl head points to 2 direct ddes */
0145         ddecnt = 2;
0146         putpnn(ddl, dde_count, ddecnt);
0147         bytes = bytes + len;
0148         putp32(ddl, ddebc, bytes);
0149         /* Pointer to the first direct dde */
0150         putp64(ddl, ddead, (uint64_t) &ddl[1]);
0151     } else {
0152         /* Append a dde to an existing indirect ddl */
0153         ++ddecnt;
0154         clear_dde(ddl[ddecnt]);
0155         put64(ddl[ddecnt], ddead, (uint64_t) addr);
0156         put32(ddl[ddecnt], ddebc, len);
0157 
0158         putpnn(ddl, dde_count, ddecnt);
0159         bytes = bytes + len;
0160         putp32(ddl, ddebc, bytes); /* byte sum of all dde */
0161     }
0162     return bytes;
0163 }
0164 
0165 /*
0166  * Touch specified number of pages represented in number bytes
0167  * beginning from the first buffer in a dde list.
0168  * Do not touch the pages past buf_sz-th byte's page.
0169  *
0170  * Set buf_sz = 0 to touch all pages described by the ddep.
0171  */
0172 static int nx_touch_pages_dde(struct nx_dde_t *ddep, long buf_sz, long page_sz,
0173                 int wr)
0174 {
0175     uint32_t indirect_count;
0176     uint32_t buf_len;
0177     long total;
0178     uint64_t buf_addr;
0179     struct nx_dde_t *dde_list;
0180     int i;
0181 
0182     assert(!!ddep);
0183 
0184     indirect_count = getpnn(ddep, dde_count);
0185 
0186     NXPRT(fprintf(stderr, "%s dde_count %d request len ", __func__,
0187             indirect_count));
0188     NXPRT(fprintf(stderr, "0x%lx\n", buf_sz));
0189 
0190     if (indirect_count == 0) {
0191         /* Direct dde */
0192         buf_len = getp32(ddep, ddebc);
0193         buf_addr = getp64(ddep, ddead);
0194 
0195         NXPRT(fprintf(stderr, "touch direct ddebc 0x%x ddead %p\n",
0196                 buf_len, (void *)buf_addr));
0197 
0198         if (buf_sz == 0)
0199             nxu_touch_pages((void *)buf_addr, buf_len, page_sz, wr);
0200         else
0201             nxu_touch_pages((void *)buf_addr, NX_MIN(buf_len,
0202                     buf_sz), page_sz, wr);
0203 
0204         return ERR_NX_OK;
0205     }
0206 
0207     /* Indirect dde */
0208     if (indirect_count > MAX_DDE_COUNT)
0209         return ERR_NX_EXCESSIVE_DDE;
0210 
0211     /* First address of the list */
0212     dde_list = (struct nx_dde_t *) getp64(ddep, ddead);
0213 
0214     if (buf_sz == 0)
0215         buf_sz = getp32(ddep, ddebc);
0216 
0217     total = 0;
0218     for (i = 0; i < indirect_count; i++) {
0219         buf_len = get32(dde_list[i], ddebc);
0220         buf_addr = get64(dde_list[i], ddead);
0221         total += buf_len;
0222 
0223         NXPRT(fprintf(stderr, "touch loop len 0x%x ddead %p total ",
0224                 buf_len, (void *)buf_addr));
0225         NXPRT(fprintf(stderr, "0x%lx\n", total));
0226 
0227         /* Touching fewer pages than encoded in the ddebc */
0228         if (total > buf_sz) {
0229             buf_len = NX_MIN(buf_len, total - buf_sz);
0230             nxu_touch_pages((void *)buf_addr, buf_len, page_sz, wr);
0231             NXPRT(fprintf(stderr, "touch loop break len 0x%x ",
0232                       buf_len));
0233             NXPRT(fprintf(stderr, "ddead %p\n", (void *)buf_addr));
0234             break;
0235         }
0236         nxu_touch_pages((void *)buf_addr, buf_len, page_sz, wr);
0237     }
0238     return ERR_NX_OK;
0239 }
0240 
0241 /*
0242  * Src and dst buffers are supplied in scatter gather lists.
0243  * NX function code and other parameters supplied in cmdp.
0244  */
0245 static int nx_submit_job(struct nx_dde_t *src, struct nx_dde_t *dst,
0246              struct nx_gzip_crb_cpb_t *cmdp, void *handle)
0247 {
0248     uint64_t csbaddr;
0249 
0250     memset((void *)&cmdp->crb.csb, 0, sizeof(cmdp->crb.csb));
0251 
0252     cmdp->crb.source_dde = *src;
0253     cmdp->crb.target_dde = *dst;
0254 
0255     /* Status, output byte count in tpbc */
0256     csbaddr = ((uint64_t) &cmdp->crb.csb) & csb_address_mask;
0257     put64(cmdp->crb, csb_address, csbaddr);
0258 
0259     /* NX reports input bytes in spbc; cleared */
0260     cmdp->cpb.out_spbc_comp_wrap = 0;
0261     cmdp->cpb.out_spbc_comp_with_count = 0;
0262     cmdp->cpb.out_spbc_decomp = 0;
0263 
0264     /* Clear output */
0265     put32(cmdp->cpb, out_crc, INIT_CRC);
0266     put32(cmdp->cpb, out_adler, INIT_ADLER);
0267 
0268     /* Submit the crb, the job descriptor, to the accelerator. */
0269     return nxu_submit_job(cmdp, handle);
0270 }
0271 
0272 int decompress_file(int argc, char **argv, void *devhandle)
0273 {
0274     FILE *inpf = NULL;
0275     FILE *outf = NULL;
0276 
0277     int c, expect, i, cc, rc = 0;
0278     char gzfname[FNAME_MAX];
0279 
0280     /* Queuing, file ops, byte counting */
0281     char *fifo_in, *fifo_out;
0282     int used_in, cur_in, used_out, cur_out, read_sz, n;
0283     int first_free, last_free, first_used, last_used;
0284     int first_offset, last_offset;
0285     int write_sz, free_space, source_sz;
0286     int source_sz_estimate, target_sz_estimate;
0287     uint64_t last_comp_ratio = 0; /* 1000 max */
0288     uint64_t total_out = 0;
0289     int is_final, is_eof;
0290 
0291     /* nx hardware */
0292     int sfbt, subc, spbc, tpbc, nx_ce, fc, resuming = 0;
0293     int history_len = 0;
0294     struct nx_gzip_crb_cpb_t cmd, *cmdp;
0295     struct nx_dde_t *ddl_in;
0296     struct nx_dde_t dde_in[6] __aligned(128);
0297     struct nx_dde_t *ddl_out;
0298     struct nx_dde_t dde_out[6] __aligned(128);
0299     int pgfault_retries;
0300 
0301     /* when using mmap'ed files */
0302     off_t input_file_offset;
0303 
0304     if (argc > 2) {
0305         fprintf(stderr, "usage: %s <fname> or stdin\n", argv[0]);
0306         fprintf(stderr, "    writes to stdout or <fname>.nx.gunzip\n");
0307         return -1;
0308     }
0309 
0310     if (argc == 1) {
0311         inpf = stdin;
0312         outf = stdout;
0313     } else if (argc == 2) {
0314         char w[1024];
0315         char *wp;
0316 
0317         inpf = fopen(argv[1], "r");
0318         if (inpf == NULL) {
0319             perror(argv[1]);
0320             return -1;
0321         }
0322 
0323         /* Make a new file name to write to.  Ignoring '.gz' */
0324         wp = (NULL != (wp = strrchr(argv[1], '/'))) ? (wp+1) : argv[1];
0325         strcpy(w, wp);
0326         strcat(w, ".nx.gunzip");
0327 
0328         outf = fopen(w, "w");
0329         if (outf == NULL) {
0330             perror(w);
0331             return -1;
0332         }
0333     }
0334 
0335     /* Decode the gzip header */
0336     c = GETINPC(inpf); expect = 0x1f; /* ID1 */
0337     if (c != expect)
0338         goto err1;
0339 
0340     c = GETINPC(inpf); expect = 0x8b; /* ID2 */
0341     if (c != expect)
0342         goto err1;
0343 
0344     c = GETINPC(inpf); expect = 0x08; /* CM */
0345     if (c != expect)
0346         goto err1;
0347 
0348     int flg = GETINPC(inpf); /* FLG */
0349 
0350     if (flg & 0xE0 || flg & 0x4 || flg == EOF)
0351         goto err2;
0352 
0353     fprintf(stderr, "gzHeader FLG %x\n", flg);
0354 
0355     /* Read 6 bytes; ignoring the MTIME, XFL, OS fields in this
0356      * sample code.
0357      */
0358     for (i = 0; i < 6; i++) {
0359         char tmp[10];
0360 
0361         tmp[i] = GETINPC(inpf);
0362         if (tmp[i] == EOF)
0363             goto err3;
0364         fprintf(stderr, "%02x ", tmp[i]);
0365         if (i == 5)
0366             fprintf(stderr, "\n");
0367     }
0368     fprintf(stderr, "gzHeader MTIME, XFL, OS ignored\n");
0369 
0370     /* FNAME */
0371     if (flg & 0x8) {
0372         int k = 0;
0373 
0374         do {
0375             c = GETINPC(inpf);
0376             if (c == EOF || k >= FNAME_MAX)
0377                 goto err3;
0378             gzfname[k++] = c;
0379         } while (c);
0380         fprintf(stderr, "gzHeader FNAME: %s\n", gzfname);
0381     }
0382 
0383     /* FHCRC */
0384     if (flg & 0x2) {
0385         c = GETINPC(inpf);
0386         if (c == EOF)
0387             goto err3;
0388         c = GETINPC(inpf);
0389         if (c == EOF)
0390             goto err3;
0391         fprintf(stderr, "gzHeader FHCRC: ignored\n");
0392     }
0393 
0394     used_in = cur_in = used_out = cur_out = 0;
0395     is_final = is_eof = 0;
0396 
0397     /* Allocate one page larger to prevent page faults due to NX
0398      * overfetching.
0399      * Either do this (char*)(uintptr_t)aligned_alloc or use
0400      * -std=c11 flag to make the int-to-pointer warning go away.
0401      */
0402     assert((fifo_in  = (char *)(uintptr_t)aligned_alloc(line_sz,
0403                    fifo_in_len + page_sz)) != NULL);
0404     assert((fifo_out = (char *)(uintptr_t)aligned_alloc(line_sz,
0405                    fifo_out_len + page_sz + line_sz)) != NULL);
0406     /* Leave unused space due to history rounding rules */
0407     fifo_out = fifo_out + line_sz;
0408     nxu_touch_pages(fifo_out, fifo_out_len, page_sz, 1);
0409 
0410     ddl_in  = &dde_in[0];
0411     ddl_out = &dde_out[0];
0412     cmdp = &cmd;
0413     memset(&cmdp->crb, 0, sizeof(cmdp->crb));
0414 
0415 read_state:
0416 
0417     /* Read from .gz file */
0418 
0419     NXPRT(fprintf(stderr, "read_state:\n"));
0420 
0421     if (is_eof != 0)
0422         goto write_state;
0423 
0424     /* We read in to fifo_in in two steps: first: read in to from
0425      * cur_in to the end of the buffer.  last: if free space wrapped
0426      * around, read from fifo_in offset 0 to offset cur_in.
0427      */
0428 
0429     /* Reset fifo head to reduce unnecessary wrap arounds */
0430     cur_in = (used_in == 0) ? 0 : cur_in;
0431 
0432     /* Free space total is reduced by a gap */
0433     free_space = NX_MAX(0, fifo_free_bytes(used_in, fifo_in_len)
0434                 - line_sz);
0435 
0436     /* Free space may wrap around as first and last */
0437     first_free = fifo_free_first_bytes(cur_in, used_in, fifo_in_len);
0438     last_free  = fifo_free_last_bytes(cur_in, used_in, fifo_in_len);
0439 
0440     /* Start offsets of the free memory */
0441     first_offset = fifo_free_first_offset(cur_in, used_in);
0442     last_offset  = fifo_free_last_offset(cur_in, used_in, fifo_in_len);
0443 
0444     /* Reduce read_sz because of the line_sz gap */
0445     read_sz = NX_MIN(free_space, first_free);
0446     n = 0;
0447     if (read_sz > 0) {
0448         /* Read in to offset cur_in + used_in */
0449         n = fread(fifo_in + first_offset, 1, read_sz, inpf);
0450         used_in = used_in + n;
0451         free_space = free_space - n;
0452         assert(n <= read_sz);
0453         if (n != read_sz) {
0454             /* Either EOF or error; exit the read loop */
0455             is_eof = 1;
0456             goto write_state;
0457         }
0458     }
0459 
0460     /* If free space wrapped around */
0461     if (last_free > 0) {
0462         /* Reduce read_sz because of the line_sz gap */
0463         read_sz = NX_MIN(free_space, last_free);
0464         n = 0;
0465         if (read_sz > 0) {
0466             n = fread(fifo_in + last_offset, 1, read_sz, inpf);
0467             used_in = used_in + n;       /* Increase used space */
0468             free_space = free_space - n; /* Decrease free space */
0469             assert(n <= read_sz);
0470             if (n != read_sz) {
0471                 /* Either EOF or error; exit the read loop */
0472                 is_eof = 1;
0473                 goto write_state;
0474             }
0475         }
0476     }
0477 
0478     /* At this point we have used_in bytes in fifo_in with the
0479      * data head starting at cur_in and possibly wrapping around.
0480      */
0481 
0482 write_state:
0483 
0484     /* Write decompressed data to output file */
0485 
0486     NXPRT(fprintf(stderr, "write_state:\n"));
0487 
0488     if (used_out == 0)
0489         goto decomp_state;
0490 
0491     /* If fifo_out has data waiting, write it out to the file to
0492      * make free target space for the accelerator used bytes in
0493      * the first and last parts of fifo_out.
0494      */
0495 
0496     first_used = fifo_used_first_bytes(cur_out, used_out, fifo_out_len);
0497     last_used  = fifo_used_last_bytes(cur_out, used_out, fifo_out_len);
0498 
0499     write_sz = first_used;
0500 
0501     n = 0;
0502     if (write_sz > 0) {
0503         n = fwrite(fifo_out + cur_out, 1, write_sz, outf);
0504         used_out = used_out - n;
0505         /* Move head of the fifo */
0506         cur_out = (cur_out + n) % fifo_out_len;
0507         assert(n <= write_sz);
0508         if (n != write_sz) {
0509             fprintf(stderr, "error: write\n");
0510             rc = -1;
0511             goto err5;
0512         }
0513     }
0514 
0515     if (last_used > 0) { /* If more data available in the last part */
0516         write_sz = last_used; /* Keep it here for later */
0517         n = 0;
0518         if (write_sz > 0) {
0519             n = fwrite(fifo_out, 1, write_sz, outf);
0520             used_out = used_out - n;
0521             cur_out = (cur_out + n) % fifo_out_len;
0522             assert(n <= write_sz);
0523             if (n != write_sz) {
0524                 fprintf(stderr, "error: write\n");
0525                 rc = -1;
0526                 goto err5;
0527             }
0528         }
0529     }
0530 
0531 decomp_state:
0532 
0533     /* NX decompresses input data */
0534 
0535     NXPRT(fprintf(stderr, "decomp_state:\n"));
0536 
0537     if (is_final)
0538         goto finish_state;
0539 
0540     /* Address/len lists */
0541     clearp_dde(ddl_in);
0542     clearp_dde(ddl_out);
0543 
0544     /* FC, CRC, HistLen, Table 6-6 */
0545     if (resuming) {
0546         /* Resuming a partially decompressed input.
0547          * The key to resume is supplying the 32KB
0548          * dictionary (history) to NX, which is basically
0549          * the last 32KB of output produced.
0550          */
0551         fc = GZIP_FC_DECOMPRESS_RESUME;
0552 
0553         cmdp->cpb.in_crc   = cmdp->cpb.out_crc;
0554         cmdp->cpb.in_adler = cmdp->cpb.out_adler;
0555 
0556         /* Round up the history size to quadword.  Section 2.10 */
0557         history_len = (history_len + 15) / 16;
0558         putnn(cmdp->cpb, in_histlen, history_len);
0559         history_len = history_len * 16; /* bytes */
0560 
0561         if (history_len > 0) {
0562             /* Chain in the history buffer to the DDE list */
0563             if (cur_out >= history_len) {
0564                 nx_append_dde(ddl_in, fifo_out
0565                           + (cur_out - history_len),
0566                           history_len);
0567             } else {
0568                 nx_append_dde(ddl_in, fifo_out
0569                           + ((fifo_out_len + cur_out)
0570                           - history_len),
0571                           history_len - cur_out);
0572                 /* Up to 32KB history wraps around fifo_out */
0573                 nx_append_dde(ddl_in, fifo_out, cur_out);
0574             }
0575 
0576         }
0577     } else {
0578         /* First decompress job */
0579         fc = GZIP_FC_DECOMPRESS;
0580 
0581         history_len = 0;
0582         /* Writing 0 clears out subc as well */
0583         cmdp->cpb.in_histlen = 0;
0584         total_out = 0;
0585 
0586         put32(cmdp->cpb, in_crc, INIT_CRC);
0587         put32(cmdp->cpb, in_adler, INIT_ADLER);
0588         put32(cmdp->cpb, out_crc, INIT_CRC);
0589         put32(cmdp->cpb, out_adler, INIT_ADLER);
0590 
0591         /* Assuming 10% compression ratio initially; use the
0592          * most recently measured compression ratio as a
0593          * heuristic to estimate the input and output
0594          * sizes.  If we give too much input, the target buffer
0595          * overflows and NX cycles are wasted, and then we
0596          * must retry with smaller input size.  1000 is 100%.
0597          */
0598         last_comp_ratio = 100UL;
0599     }
0600     cmdp->crb.gzip_fc = 0;
0601     putnn(cmdp->crb, gzip_fc, fc);
0602 
0603     /*
0604      * NX source buffers
0605      */
0606     first_used = fifo_used_first_bytes(cur_in, used_in, fifo_in_len);
0607     last_used = fifo_used_last_bytes(cur_in, used_in, fifo_in_len);
0608 
0609     if (first_used > 0)
0610         nx_append_dde(ddl_in, fifo_in + cur_in, first_used);
0611 
0612     if (last_used > 0)
0613         nx_append_dde(ddl_in, fifo_in, last_used);
0614 
0615     /*
0616      * NX target buffers
0617      */
0618     first_free = fifo_free_first_bytes(cur_out, used_out, fifo_out_len);
0619     last_free = fifo_free_last_bytes(cur_out, used_out, fifo_out_len);
0620 
0621     /* Reduce output free space amount not to overwrite the history */
0622     int target_max = NX_MAX(0, fifo_free_bytes(used_out, fifo_out_len)
0623                 - (1<<16));
0624 
0625     NXPRT(fprintf(stderr, "target_max %d (0x%x)\n", target_max,
0626               target_max));
0627 
0628     first_free = NX_MIN(target_max, first_free);
0629     if (first_free > 0) {
0630         first_offset = fifo_free_first_offset(cur_out, used_out);
0631         nx_append_dde(ddl_out, fifo_out + first_offset, first_free);
0632     }
0633 
0634     if (last_free > 0) {
0635         last_free = NX_MIN(target_max - first_free, last_free);
0636         if (last_free > 0) {
0637             last_offset = fifo_free_last_offset(cur_out, used_out,
0638                                 fifo_out_len);
0639             nx_append_dde(ddl_out, fifo_out + last_offset,
0640                       last_free);
0641         }
0642     }
0643 
0644     /* Target buffer size is used to limit the source data size
0645      * based on previous measurements of compression ratio.
0646      */
0647 
0648     /* source_sz includes history */
0649     source_sz = getp32(ddl_in, ddebc);
0650     assert(source_sz > history_len);
0651     source_sz = source_sz - history_len;
0652 
0653     /* Estimating how much source is needed to 3/4 fill a
0654      * target_max size target buffer.  If we overshoot, then NX
0655      * must repeat the job with smaller input and we waste
0656      * bandwidth.  If we undershoot then we use more NX calls than
0657      * necessary.
0658      */
0659 
0660     source_sz_estimate = ((uint64_t)target_max * last_comp_ratio * 3UL)
0661                 / 4000;
0662 
0663     if (source_sz_estimate < source_sz) {
0664         /* Target might be small, therefore limiting the
0665          * source data.
0666          */
0667         source_sz = source_sz_estimate;
0668         target_sz_estimate = target_max;
0669     } else {
0670         /* Source file might be small, therefore limiting target
0671          * touch pages to a smaller value to save processor cycles.
0672          */
0673         target_sz_estimate = ((uint64_t)source_sz * 1000UL)
0674                     / (last_comp_ratio + 1);
0675         target_sz_estimate = NX_MIN(2 * target_sz_estimate,
0676                         target_max);
0677     }
0678 
0679     source_sz = source_sz + history_len;
0680 
0681     /* Some NX condition codes require submitting the NX job again.
0682      * Kernel doesn't handle NX page faults. Expects user code to
0683      * touch pages.
0684      */
0685     pgfault_retries = NX_MAX_FAULTS;
0686 
0687 restart_nx:
0688 
0689     putp32(ddl_in, ddebc, source_sz);
0690 
0691     /* Fault in pages */
0692     nxu_touch_pages(cmdp, sizeof(struct nx_gzip_crb_cpb_t), page_sz, 1);
0693     nx_touch_pages_dde(ddl_in, 0, page_sz, 0);
0694     nx_touch_pages_dde(ddl_out, target_sz_estimate, page_sz, 1);
0695 
0696     /* Send job to NX */
0697     cc = nx_submit_job(ddl_in, ddl_out, cmdp, devhandle);
0698 
0699     switch (cc) {
0700 
0701     case ERR_NX_AT_FAULT:
0702 
0703         /* We touched the pages ahead of time.  In the most common case
0704          * we shouldn't be here.  But may be some pages were paged out.
0705          * Kernel should have placed the faulting address to fsaddr.
0706          */
0707         NXPRT(fprintf(stderr, "ERR_NX_AT_FAULT %p\n",
0708                   (void *)cmdp->crb.csb.fsaddr));
0709 
0710         if (pgfault_retries == NX_MAX_FAULTS) {
0711             /* Try once with exact number of pages */
0712             --pgfault_retries;
0713             goto restart_nx;
0714         } else if (pgfault_retries > 0) {
0715             /* If still faulting try fewer input pages
0716              * assuming memory outage
0717              */
0718             if (source_sz > page_sz)
0719                 source_sz = NX_MAX(source_sz / 2, page_sz);
0720             --pgfault_retries;
0721             goto restart_nx;
0722         } else {
0723             fprintf(stderr, "cannot make progress; too many ");
0724             fprintf(stderr, "page fault retries cc= %d\n", cc);
0725             rc = -1;
0726             goto err5;
0727         }
0728 
0729     case ERR_NX_DATA_LENGTH:
0730 
0731         NXPRT(fprintf(stderr, "ERR_NX_DATA_LENGTH; "));
0732         NXPRT(fprintf(stderr, "stream may have trailing data\n"));
0733 
0734         /* Not an error in the most common case; it just says
0735          * there is trailing data that we must examine.
0736          *
0737          * CC=3 CE(1)=0 CE(0)=1 indicates partial completion
0738          * Fig.6-7 and Table 6-8.
0739          */
0740         nx_ce = get_csb_ce_ms3b(cmdp->crb.csb);
0741 
0742         if (!csb_ce_termination(nx_ce) &&
0743             csb_ce_partial_completion(nx_ce)) {
0744             /* Check CPB for more information
0745              * spbc and tpbc are valid
0746              */
0747             sfbt = getnn(cmdp->cpb, out_sfbt); /* Table 6-4 */
0748             subc = getnn(cmdp->cpb, out_subc); /* Table 6-4 */
0749             spbc = get32(cmdp->cpb, out_spbc_decomp);
0750             tpbc = get32(cmdp->crb.csb, tpbc);
0751             assert(target_max >= tpbc);
0752 
0753             goto ok_cc3; /* not an error */
0754         } else {
0755             /* History length error when CE(1)=1 CE(0)=0. */
0756             rc = -1;
0757             fprintf(stderr, "history length error cc= %d\n", cc);
0758             goto err5;
0759         }
0760 
0761     case ERR_NX_TARGET_SPACE:
0762 
0763         /* Target buffer not large enough; retry smaller input
0764          * data; give at least 1 byte.  SPBC/TPBC are not valid.
0765          */
0766         assert(source_sz > history_len);
0767         source_sz = ((source_sz - history_len + 2) / 2) + history_len;
0768         NXPRT(fprintf(stderr, "ERR_NX_TARGET_SPACE; retry with "));
0769         NXPRT(fprintf(stderr, "smaller input data src %d hist %d\n",
0770                   source_sz, history_len));
0771         goto restart_nx;
0772 
0773     case ERR_NX_OK:
0774 
0775         /* This should not happen for gzip formatted data;
0776          * we need trailing crc and isize
0777          */
0778         fprintf(stderr, "ERR_NX_OK\n");
0779         spbc = get32(cmdp->cpb, out_spbc_decomp);
0780         tpbc = get32(cmdp->crb.csb, tpbc);
0781         assert(target_max >= tpbc);
0782         assert(spbc >= history_len);
0783         source_sz = spbc - history_len;
0784         goto offsets_state;
0785 
0786     default:
0787         fprintf(stderr, "error: cc= %d\n", cc);
0788         rc = -1;
0789         goto err5;
0790     }
0791 
0792 ok_cc3:
0793 
0794     NXPRT(fprintf(stderr, "cc3: sfbt: %x\n", sfbt));
0795 
0796     assert(spbc > history_len);
0797     source_sz = spbc - history_len;
0798 
0799     /* Table 6-4: Source Final Block Type (SFBT) describes the
0800      * last processed deflate block and clues the software how to
0801      * resume the next job.  SUBC indicates how many input bits NX
0802      * consumed but did not process.  SPBC indicates how many
0803      * bytes of source were given to the accelerator including
0804      * history bytes.
0805      */
0806 
0807     switch (sfbt) {
0808         int dhtlen;
0809 
0810     case 0x0: /* Deflate final EOB received */
0811 
0812         /* Calculating the checksum start position. */
0813 
0814         source_sz = source_sz - subc / 8;
0815         is_final = 1;
0816         break;
0817 
0818         /* Resume decompression cases are below. Basically
0819          * indicates where NX has suspended and how to resume
0820          * the input stream.
0821          */
0822 
0823     case 0x8: /* Within a literal block; use rembytecount */
0824     case 0x9: /* Within a literal block; use rembytecount; bfinal=1 */
0825 
0826         /* Supply the partially processed source byte again */
0827         source_sz = source_sz - ((subc + 7) / 8);
0828 
0829         /* SUBC LS 3bits: number of bits in the first source byte need
0830          * to be processed.
0831          * 000 means all 8 bits;  Table 6-3
0832          * Clear subc, histlen, sfbt, rembytecnt, dhtlen
0833          */
0834         cmdp->cpb.in_subc = 0;
0835         cmdp->cpb.in_sfbt = 0;
0836         putnn(cmdp->cpb, in_subc, subc % 8);
0837         putnn(cmdp->cpb, in_sfbt, sfbt);
0838         putnn(cmdp->cpb, in_rembytecnt, getnn(cmdp->cpb,
0839                               out_rembytecnt));
0840         break;
0841 
0842     case 0xA: /* Within a FH block; */
0843     case 0xB: /* Within a FH block; bfinal=1 */
0844 
0845         source_sz = source_sz - ((subc + 7) / 8);
0846 
0847         /* Clear subc, histlen, sfbt, rembytecnt, dhtlen */
0848         cmdp->cpb.in_subc = 0;
0849         cmdp->cpb.in_sfbt = 0;
0850         putnn(cmdp->cpb, in_subc, subc % 8);
0851         putnn(cmdp->cpb, in_sfbt, sfbt);
0852         break;
0853 
0854     case 0xC: /* Within a DH block; */
0855     case 0xD: /* Within a DH block; bfinal=1 */
0856 
0857         source_sz = source_sz - ((subc + 7) / 8);
0858 
0859         /* Clear subc, histlen, sfbt, rembytecnt, dhtlen */
0860         cmdp->cpb.in_subc = 0;
0861         cmdp->cpb.in_sfbt = 0;
0862         putnn(cmdp->cpb, in_subc, subc % 8);
0863         putnn(cmdp->cpb, in_sfbt, sfbt);
0864 
0865         dhtlen = getnn(cmdp->cpb, out_dhtlen);
0866         putnn(cmdp->cpb, in_dhtlen, dhtlen);
0867         assert(dhtlen >= 42);
0868 
0869         /* Round up to a qword */
0870         dhtlen = (dhtlen + 127) / 128;
0871 
0872         while (dhtlen > 0) { /* Copy dht from cpb.out to cpb.in */
0873             --dhtlen;
0874             cmdp->cpb.in_dht[dhtlen] = cmdp->cpb.out_dht[dhtlen];
0875         }
0876         break;
0877 
0878     case 0xE: /* Within a block header; bfinal=0; */
0879              /* Also given if source data exactly ends (SUBC=0) with
0880               * EOB code with BFINAL=0.  Means the next byte will
0881               * contain a block header.
0882               */
0883     case 0xF: /* within a block header with BFINAL=1. */
0884 
0885         source_sz = source_sz - ((subc + 7) / 8);
0886 
0887         /* Clear subc, histlen, sfbt, rembytecnt, dhtlen */
0888         cmdp->cpb.in_subc = 0;
0889         cmdp->cpb.in_sfbt = 0;
0890         putnn(cmdp->cpb, in_subc, subc % 8);
0891         putnn(cmdp->cpb, in_sfbt, sfbt);
0892 
0893         /* Engine did not process any data */
0894         if (is_eof && (source_sz == 0))
0895             is_final = 1;
0896     }
0897 
0898 offsets_state:
0899 
0900     /* Adjust the source and target buffer offsets and lengths  */
0901 
0902     NXPRT(fprintf(stderr, "offsets_state:\n"));
0903 
0904     /* Delete input data from fifo_in */
0905     used_in = used_in - source_sz;
0906     cur_in = (cur_in + source_sz) % fifo_in_len;
0907     input_file_offset = input_file_offset + source_sz;
0908 
0909     /* Add output data to fifo_out */
0910     used_out = used_out + tpbc;
0911 
0912     assert(used_out <= fifo_out_len);
0913 
0914     total_out = total_out + tpbc;
0915 
0916     /* Deflate history is 32KB max.  No need to supply more
0917      * than 32KB on a resume.
0918      */
0919     history_len = (total_out > window_max) ? window_max : total_out;
0920 
0921     /* To estimate expected expansion in the next NX job; 500 means 50%.
0922      * Deflate best case is around 1 to 1000.
0923      */
0924     last_comp_ratio = (1000UL * ((uint64_t)source_sz + 1))
0925               / ((uint64_t)tpbc + 1);
0926     last_comp_ratio = NX_MAX(NX_MIN(1000UL, last_comp_ratio), 1);
0927     NXPRT(fprintf(stderr, "comp_ratio %ld source_sz %d spbc %d tpbc %d\n",
0928               last_comp_ratio, source_sz, spbc, tpbc));
0929 
0930     resuming = 1;
0931 
0932 finish_state:
0933 
0934     NXPRT(fprintf(stderr, "finish_state:\n"));
0935 
0936     if (is_final) {
0937         if (used_out)
0938             goto write_state; /* More data to write out */
0939         else if (used_in < 8) {
0940             /* Need at least 8 more bytes containing gzip crc
0941              * and isize.
0942              */
0943             rc = -1;
0944             goto err4;
0945         } else {
0946             /* Compare checksums and exit */
0947             int i;
0948             unsigned char tail[8];
0949             uint32_t cksum, isize;
0950 
0951             for (i = 0; i < 8; i++)
0952                 tail[i] = fifo_in[(cur_in + i) % fifo_in_len];
0953             fprintf(stderr, "computed checksum %08x isize %08x\n",
0954                 cmdp->cpb.out_crc, (uint32_t) (total_out
0955                 % (1ULL<<32)));
0956             cksum = ((uint32_t) tail[0] | (uint32_t) tail[1]<<8
0957                  | (uint32_t) tail[2]<<16
0958                  | (uint32_t) tail[3]<<24);
0959             isize = ((uint32_t) tail[4] | (uint32_t) tail[5]<<8
0960                  | (uint32_t) tail[6]<<16
0961                  | (uint32_t) tail[7]<<24);
0962             fprintf(stderr, "stored   checksum %08x isize %08x\n",
0963                 cksum, isize);
0964 
0965             if (cksum == cmdp->cpb.out_crc && isize == (uint32_t)
0966                 (total_out % (1ULL<<32))) {
0967                 rc = 0; goto ok1;
0968             } else {
0969                 rc = -1; goto err4;
0970             }
0971         }
0972     } else
0973         goto read_state;
0974 
0975     return -1;
0976 
0977 err1:
0978     fprintf(stderr, "error: not a gzip file, expect %x, read %x\n",
0979         expect, c);
0980     return -1;
0981 
0982 err2:
0983     fprintf(stderr, "error: the FLG byte is wrong or not being handled\n");
0984     return -1;
0985 
0986 err3:
0987     fprintf(stderr, "error: gzip header\n");
0988     return -1;
0989 
0990 err4:
0991     fprintf(stderr, "error: checksum missing or mismatch\n");
0992 
0993 err5:
0994 ok1:
0995     fprintf(stderr, "decomp is complete: fclose\n");
0996     fclose(outf);
0997 
0998     return rc;
0999 }
1000 
1001 
1002 int main(int argc, char **argv)
1003 {
1004     int rc;
1005     struct sigaction act;
1006     void *handle;
1007 
1008     nx_dbg = 0;
1009     nx_gzip_log = NULL;
1010     act.sa_handler = 0;
1011     act.sa_sigaction = nxu_sigsegv_handler;
1012     act.sa_flags = SA_SIGINFO;
1013     act.sa_restorer = 0;
1014     sigemptyset(&act.sa_mask);
1015     sigaction(SIGSEGV, &act, NULL);
1016 
1017     handle = nx_function_begin(NX_FUNC_COMP_GZIP, 0);
1018     if (!handle) {
1019         fprintf(stderr, "Unable to init NX, errno %d\n", errno);
1020         exit(-1);
1021     }
1022 
1023     rc = decompress_file(argc, argv, handle);
1024 
1025     nx_function_end(handle);
1026 
1027     return rc;
1028 }