/*
 * Copyright (c) 2011-2013, 2018-2022 Genome Research Ltd.
 * Author(s): James Bonfield
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *    1. Redistributions of source code must retain the above copyright notice,
 *       this list of conditions and the following disclaimer.
 *
 *    2. Redistributions in binary form must reproduce the above
 *       copyright notice, this list of conditions and the following
 *       disclaimer in the documentation and/or other materials provided
 *       with the distribution.
 *
 *    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
 *       Institute nor the names of its contributors may be used to endorse
 *       or promote products derived from this software without specific
 *       prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
 * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
 * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

// We use generic maps to turn 0-M into 0-N where N <= M
// before adding these into the context.  These are used
// for positions, running-diffs and quality values.
//
// This can be used as a simple divisor, eg pos/24 to get
// 2 bits of positional data for each quarter along a 100bp
// read, or it can be tailored for specific such as noting
// the first 5 cycles are poor, then we have stability and
// a gradual drop off in the last 20 or so.  Perhaps we then
// map pos 0-4=0, 5-79=1, 80-89=2, 90-99=3.
//
// We don't need to specify how many bits of data we are
// using (2 in the above example), as that is just implicit
// in the values in the map.  Specify not to use a map simply
// disables that context type (our map is essentially 0-M -> 0).

// Example of command line usage:
//
// f=~/scratch/data/q4
// cc -Wall -DTEST_MAIN -O3 -g fqzcomp_qual2.c -lm
// ./a.out $f > /tmp/_ && ./a.out -d < /tmp/_ > /tmp/__ && cmp /tmp/__ $f

#include "config.h"

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <math.h>
#include <limits.h>
#include <ctype.h>
#include <math.h>
#include <inttypes.h>
#include <sys/types.h>

#include "fqzcomp_qual.h"
#include "varint.h"
#include "utils.h"

#define CTX_BITS 16
#define CTX_SIZE (1<<CTX_BITS)

#ifndef MIN
#  define MIN(a,b) ((a)<(b)?(a):(b))
#  define MAX(a,b) ((a)>(b)?(a):(b))
#endif

#define QMAX 256
#define QBITS 12
#define QSIZE (1<<QBITS)

#define NSYM 2
#include "c_simple_model.h"

#undef NSYM
#define NSYM QMAX
//#include "c_escape_model.h"
#include "c_simple_model.h"
//#include "c_cdf_model.h"
//#include "c_cdf16_model.h"

// An array of 0,0,0, 1,1,1,1, 3, 5,5
// is turned into a run-length of 3x0, 4x1, 0x2, 1x4, 0x4, 2x5,
// which then becomes 3 4 0 1 0 2.
//
// NB: size > 255 therefore means we need to repeatedly read to find
// the actual run length.
// Alternatively we could bit-encode instead of byte encode, eg BETA.
static int store_array(unsigned char *out, unsigned int *array, int size) {
    unsigned char tmp[2048];

    int i, j, k;
    for (i = j = k = 0; i < size; j++) {
        int run_len = i;
        while (i < size && array[i] == j)
            i++;
        run_len = i-run_len;

        int r;
        do {
            r = MIN(255, run_len);
            tmp[k++] = r;
            run_len -= r;
        } while (r == 255);
    }
    while (i < size)
        tmp[k++] = 0, j++;

    // RLE on out.
    //    1 2 3 3 3 3 3 4 4    5
    // => 1 2 3 3 +3... 4 4 +0 5
    int last = -1;
    for (i = j = 0; j < k; i++) {
        out[i] = tmp[j++];
        if (out[i] == last) {
            int n = j;
            while (j < k && tmp[j] == last)
                j++;
            out[++i] = j-n;
        } else {
            last = out[i];
        }
    }
    k = i;

//    fprintf(stderr, "Store_array %d => %d {", size, k);
//    for (i = 0; i < k; i++)
//      fprintf(stderr, "%d,", out[i]);
//    fprintf(stderr, "}\n");
    return k;
}

static int read_array(unsigned char *in, size_t in_size, unsigned int *array, int size) {
    unsigned char R[1024];
    int i, j, z, last = -1, nb = 0;

    size = MIN(1024, size);

    // Remove level one of run-len encoding
    for (i = j = z = 0; z < size && i < in_size; i++) {
        int run = in[i];
        R[j++] = run;
        z += run;
        if (run == last) {
            if (i+1 >= in_size)
                return -1;
            int copy = in[++i];
            z += run * copy;
            while (copy-- && z <= size && j < 1024)
                R[j++] = run;
        }
        if (j >= 1024)
            return -1;
        last = run;
    }
    nb = i;

    // Now expand inner level of run-length encoding
    int R_max = j;
    for (i = j = z = 0; j < size; i++) {
        int run_len = 0;
        int run_part;
        if (z >= R_max)
            return -1;
        do {
            run_part = R[z++];
            run_len += run_part;
        } while (run_part == 255 && z < R_max);
        if (run_part == 255)
            return -1;

        while (run_len && j < size)
            run_len--, array[j++] = i;
    }

    return nb;
}

// FIXME: how to auto-tune these rather than trial and error?
// r2 = READ2
// qa = qual avg (0, 2, 4)
static int strat_opts[][12] = {
//   qb  qs pb ps db ds ql sl pl  dl  r2 qa
    {10, 5, 4,-1, 2, 1, 0, 14, 10, 14, 0,-1}, // basic options (level < 7)
    {8,  5, 7, 0, 0, 0, 0, 14, 8,  14, 1,-1}, // e.g. HiSeq 2000
    {12, 6, 2, 0, 2, 3, 0, 9,  12, 14, 0, 0}, // e.g. MiSeq
    {12, 6, 0, 0, 0, 0, 0, 12, 0,  0,  0, 0}, // e.g. IonTorrent; adaptive O1
    {0,  0, 0, 0, 0, 0, 0, 0,  0,  0,  0, 0}, // custom
};
static int nstrats = sizeof(strat_opts) / sizeof(*strat_opts);

#ifdef HAVE_BUILTIN_PREFETCH
static inline void mm_prefetch(void *x) {
    __builtin_prefetch(x);
}
#else
static inline void mm_prefetch(void *x) {
    // Fetch and discard is quite close to a genuine prefetch
    *(volatile int *)x;
}
#endif

typedef struct {
    unsigned int qctx;  // quality sub-context
    unsigned int p;     // pos (bytes remaining)
    unsigned int delta; // delta running total
    unsigned int prevq; // previous quality
    unsigned int s;     // selector
    unsigned int qtot, qlen;
    unsigned int first_len;
    unsigned int last_len;
    ssize_t rec;
    unsigned int ctx;
} fqz_state;

static void dump_table(unsigned int *tab, int size, char *name) {
    int i, last = -99, run = 0;
    fprintf(stderr, "\t%s\t{", name);
    for (i = 0; i < size; i++) {
        if (tab[i] == last) {
            run++;
        } else if (run == 1 && tab[i] == last+1) {
            int first = last;
            do {
                last = tab[i];
                i++;
            } while (i < size && tab[i] == last+1);
            i--;

            // Want 0,1,2,3,3,3 as 0..2 3x3, not 0..3 3x2
            if (tab[i] == tab[i+1])
                i--;
            if (tab[i] != first)
                fprintf(stderr, "..%d", tab[i]);
            run = 1;
            last = -99;
        } else {
            if (run > 1)
                fprintf(stderr, " x %d%s%d", run, i?", ":"", tab[i]);
            else
                fprintf(stderr, "%s%d", i?", ":"", tab[i]);
            run = 1;
            last = tab[i];
        }
    }
    if (run > 1)
        fprintf(stderr, " x %d", run);
    fprintf(stderr, "}\n");
}

static void dump_map(unsigned int *map, int size, char *name) {
    int i, c = 0;
    fprintf(stderr, "\t%s\t{", name);
    for (i = 0; i < size; i++)
        if (map[i] != INT_MAX)
            fprintf(stderr, "%s%d=%d", c++?", ":"", i, map[i]);
    fprintf(stderr, "}\n");
}

#pragma GCC diagnostic ignored "-Wunused-function"
static void dump_params(fqz_gparams *gp) {
    fprintf(stderr, "Global params = {\n");
    fprintf(stderr, "\tvers\t%d\n", gp->vers);
    fprintf(stderr, "\tgflags\t0x%02x\n", gp->gflags);
    fprintf(stderr, "\tnparam\t%d\n", gp->nparam);
    fprintf(stderr, "\tmax_sel\t%d\n", gp->max_sel);
    fprintf(stderr, "\tmax_sym\t%d\n", gp->max_sym);
    if (gp->gflags & GFLAG_HAVE_STAB)
        dump_table(gp->stab, 256, "stab");
    fprintf(stderr, "}\n");

    int i;
    for (i = 0; i < gp->nparam; i++) {
        fqz_param *pm = &gp->p[i];
        fprintf(stderr, "\nParam[%d] = {\n", i);
        fprintf(stderr, "\tcontext\t0x%04x\n", pm->context);
        fprintf(stderr, "\tpflags\t0x%02x\n",  pm->pflags);
        fprintf(stderr, "\tmax_sym\t%d\n",  pm->max_sym);
        fprintf(stderr, "\tqbits\t%d\n",   pm->qbits);
        fprintf(stderr, "\tqshift\t%d\n",  pm->qshift);
        fprintf(stderr, "\tqloc\t%d\n",    pm->qloc);
        fprintf(stderr, "\tsloc\t%d\n",    pm->sloc);
        fprintf(stderr, "\tploc\t%d\n",    pm->ploc);
        fprintf(stderr, "\tdloc\t%d\n",    pm->dloc);

        if (pm->pflags & PFLAG_HAVE_QMAP)
            dump_map(pm->qmap, 256, "qmap");

        if (pm->pflags & PFLAG_HAVE_QTAB)
            dump_table(pm->qtab, 256, "qtab");
        if (pm->pflags & PFLAG_HAVE_PTAB)
            dump_table(pm->ptab, 1024, "ptab");
        if (pm->pflags & PFLAG_HAVE_DTAB)
            dump_table(pm->dtab, 256, "dtab");
        fprintf(stderr, "}\n");
    }
}

typedef struct {
    SIMPLE_MODEL(QMAX,_) *qual;
    SIMPLE_MODEL(256,_)   len[4];
    SIMPLE_MODEL(2,_)     revcomp;
    SIMPLE_MODEL(256,_)   sel;
    SIMPLE_MODEL(2,_)     dup;
} fqz_model;

static int fqz_create_models(fqz_model *m, fqz_gparams *gp) {
    int i;

    if (!(m->qual = htscodecs_tls_alloc(sizeof(*m->qual) * CTX_SIZE)))
        return -1;

    for (i = 0; i < CTX_SIZE; i++)
        SIMPLE_MODEL(QMAX,_init)(&m->qual[i], gp->max_sym+1);

    for (i = 0; i < 4; i++)
        SIMPLE_MODEL(256,_init)(&m->len[i],256);

    SIMPLE_MODEL(2,_init)(&m->revcomp,2);
    SIMPLE_MODEL(2,_init)(&m->dup,2);
    if (gp->max_sel > 0)
        SIMPLE_MODEL(256,_init)(&m->sel, gp->max_sel+1);

    return 0;
}

static void fqz_destroy_models(fqz_model *m) {
    htscodecs_tls_free(m->qual);
}

static inline unsigned int fqz_update_ctx(fqz_param *pm, fqz_state *state, int q) {
    unsigned int last = 0; // pm->context
    state->qctx = (state->qctx << pm->qshift) + pm->qtab[q];
    last += (state->qctx & pm->qmask) << pm->qloc;

    // The final shifts have been factored into the tables already.
    last += pm->ptab[MIN(1023, state->p)];      // << pm->ploc
    last += pm->dtab[MIN(255,  state->delta)];  // << pm->dloc
    last += state->s << pm->sloc;

    // On the fly average is slow work.
    // However it can be slightly better than using a selector bit
    // as it's something we can compute on the fly and thus doesn't
    // consume output bits for storing the selector itself.
    //
    // Q4 (novaseq.bam)
    // qtot+=q*q -DQ1=8.84 -DQ2=8.51 -DQ3=7.70; 7203598 (-0.7%)
    // qtot+=q   -DQ1=2.96 -DQ2=2.85 -DQ3=2.69; 7207315
    // vs old delta;                            7255614 (default params)
    // vs 2 bit selector (no delta)             7203006 (-x 0x8261000e80)
    // vs 2 bit selector (no delta)             7199153 (-x 0x7270000e70) -0.8%
    // vs 2 bit selector (no delta)             7219668 (-x 0xa243000ea0)
    //{
    //  double qa = state->qtot / (state->qlen+.01);
    //  //fprintf(stderr, "%f\n", qa);
    //  int x = 0;
    //  if (qa>=Q1) x=3;
    //  else if (qa>=Q2) x=2;
    //  else if (qa>=Q3) x=1;
    //  else x=0;
    //  last += x << pm->dloc; // tmp reuse of delta pos
    //  state->qtot += q*q;
    //  state->qlen++;
    //}

    // Only update delta after 1st base.
    state->delta += (state->prevq != q);
    state->prevq = q;

    state->p--;

    return last & (CTX_SIZE-1);
}

// Build quality stats for qhist and set nsym, do_dedup and do_sel params.
// One_param is -1 to gather stats on all data, or >= 0 to gather data
// on one specific selector parameter.  Used only in TEST_MAIN via
// fqz_manual_parameters at the moment.
void fqz_qual_stats(fqz_slice *s,
                    unsigned char *in, size_t in_size,
                    fqz_param *pm,
                    uint32_t qhist[256],
                    int one_param) {
#define NP 32
    uint32_t qhistb[NP][256] = {{0}};  // both
    uint32_t qhist1[NP][256] = {{0}};  // READ1 only
    uint32_t qhist2[NP][256] = {{0}};  // READ2 only
    uint64_t t1[NP] = {0};             // Count for READ1
    uint64_t t2[NP] = {0};             // COUNT for READ2
    uint32_t avg[2560] = {0};          // Avg qual *and later* avg-to-selector map.

    int dir = 0;
    int last_len = 0;
    int do_dedup = 0;
    size_t rec;
    size_t i, j;
    int num_rec = 0;

    // See what info we've been given.
    // Do we have READ1 / READ2?
    // Do we have selector hidden in the top bits of flag?
    int max_sel = 0;
    int has_r2 = 0;
    for (rec = 0; rec < s->num_records; rec++) {
        if (one_param >= 0 && (s->flags[rec] >> 16) != one_param)
            continue;
        num_rec++;
        if (max_sel < (s->flags[rec] >> 16))
            max_sel = (s->flags[rec] >> 16);
        if (s->flags[rec] & FQZ_FREAD2)
            has_r2 = 1;
    }

    // Dedup detection and histogram stats gathering
    int *avg_qual = calloc((s->num_records+1), sizeof(int));
    if (!avg_qual)
        return;

    rec = i = j = 0;
    while (i < in_size) {
        if (one_param >= 0 && (s->flags[rec] >> 16) != one_param) {
            avg_qual[rec] = 0;
            i += s->len[rec++];
            continue;
        }
        if (rec < s->num_records) {
            j = s->len[rec];
            dir = s->flags[rec] & FQZ_FREAD2 ? 1 : 0;
            if (i > 0 && j == last_len
                && !memcmp(in+i-last_len, in+i, j))
                do_dedup++; // cache which records are dup?
        } else {
            j = in_size - i;
            dir = 0;
        }
        last_len = j;

        uint32_t (*qh)[256] = dir ? qhist2 : qhist1;
        uint64_t *th        = dir ? t2     : t1;

        uint32_t tot = 0;
        for (; i < in_size && j > 0; i++, j--) {
            tot += in[i];
            qhist[in[i]]++;
            qhistb[j & (NP-1)][in[i]]++;
            qh[j & (NP-1)][in[i]]++;
            th[j & (NP-1)]++;
        }
        tot = last_len ? (tot*10.0)/last_len+.5 : 0;

        avg_qual[rec] = tot;
        avg[MIN(2559, tot)]++;

        rec++;
    }
    pm->do_dedup = ((rec+1)/(do_dedup+1) < 500);

    last_len = 0;

    // Unique symbol count
    for (i = pm->max_sym = pm->nsym = 0; i < 256; i++) {
        if (qhist[i])
            pm->max_sym = i, pm->nsym++;
    }


    // Auto tune: does average quality helps us?
    if (pm->do_qa != 0) {
        // Histogram of average qual in avg[]
        // NB: we convert avg[] from count to selector index

        // Few symbols means high compression which means
        // selector bits become more significant fraction.
        // Reduce selector bits by skewing the distribution
        // to not be even binning.
        double qf0 = pm->nsym > 8 ? 0.2 : 0.05;
        double qf1 = pm->nsym > 8 ? 0.5 : 0.22;
        double qf2 = pm->nsym > 8 ? 0.8 : 0.60;

        int total = 0;
        i = 0;
        while (i < 2560) {
            total += avg[i];
            if (total > qf0 * num_rec) {
                //fprintf(stderr, "Q1=%d\n", (int)i);
                break;
            }
            avg[i++] = 0;
        }
        while (i < 2560) {
            total += avg[i];
            if (total > qf1 * num_rec) {
                //fprintf(stderr, "Q2=%d\n", (int)i);
                break;
            }
            avg[i++] = 1;
        }
        while (i < 2560) {
            total += avg[i];
            if (total > qf2 * num_rec) {
                //fprintf(stderr, "Q3=%d\n", (int)i);
                break;
            }
            avg[i++] = 2;
        }
        while (i < 2560)
            avg[i++] = 3;

        // Compute simple entropy of merged signal vs split signal.
        i = 0;
        rec = 0;

        int qbin4[4][NP][256] = {{{0}}};
        int qbin2[2][NP][256] = {{{0}}};
        int qbin1   [NP][256] = {{0}};
        int qcnt4[4][NP] = {{0}};
        int qcnt2[4][NP] = {{0}};
        int qcnt1   [NP] = {0};
        while (i < in_size) {
            if (one_param >= 0 && (s->flags[rec] >> 16) != one_param) {
                i += s->len[rec++];
                continue;
            }
            if ((rec & 7) && rec < s->num_records) {
                // subsample for speed
                i += s->len[rec++];
                continue;
            }
            if (rec < s->num_records)
                j = s->len[rec];
            else
                j = in_size - i;
            last_len = j;

            uint32_t tot = avg_qual[rec];
            int qb4 = avg[MIN(2559, tot)];
            int qb2 = qb4/2;

            for (; i < in_size && j > 0; i++, j--) {
                int x = j & (NP-1);
                qbin4[qb4][x][in[i]]++;  qcnt4[qb4][x]++;
                qbin2[qb2][x][in[i]]++;  qcnt2[qb2][x]++;
                qbin1     [x][in[i]]++;  qcnt1     [x]++;
            }
            rec++;
        }

        double e1 = 0, e2 = 0, e4 = 0;
        for (j = 0; j < NP; j++) {
            for (i = 0; i < 256; i++) {
                if (qbin1   [j][i]) e1 += qbin1   [j][i] * fast_log(qbin1   [j][i] / (double)qcnt1   [j]);
                if (qbin2[0][j][i]) e2 += qbin2[0][j][i] * fast_log(qbin2[0][j][i] / (double)qcnt2[0][j]);
                if (qbin2[1][j][i]) e2 += qbin2[1][j][i] * fast_log(qbin2[1][j][i] / (double)qcnt2[1][j]);
                if (qbin4[0][j][i]) e4 += qbin4[0][j][i] * fast_log(qbin4[0][j][i] / (double)qcnt4[0][j]);
                if (qbin4[1][j][i]) e4 += qbin4[1][j][i] * fast_log(qbin4[1][j][i] / (double)qcnt4[1][j]);
                if (qbin4[2][j][i]) e4 += qbin4[2][j][i] * fast_log(qbin4[2][j][i] / (double)qcnt4[2][j]);
                if (qbin4[3][j][i]) e4 += qbin4[3][j][i] * fast_log(qbin4[3][j][i] / (double)qcnt4[3][j]);
            }
        }
        e1 /= -log(2)/8;
        e2 /= -log(2)/8;
        e4 /= -log(2)/8;
        //fprintf(stderr, "E1=%f E2=%f E4=%f %f\n", e1, e2+s->num_records/8, e4+s->num_records/4, (e4+s->num_records/4)/(e2+s->num_records/8));

        // Note by using the selector we're robbing bits from elsewhere in
        // the context, which may reduce compression better.
        // We don't know how much by, so this is basically a guess!
        // For now we just say need 5% saving here.
        double qm = pm->do_qa > 0 ? 1 : 0.98;
        if ((pm->do_qa == -1 || pm->do_qa >= 4) &&
            e4 + s->num_records/4 < e2*qm + s->num_records/8 &&
            e4 + s->num_records/4 < e1*qm) {
            //fprintf(stderr, "do q4\n");
            for (i = 0; i < s->num_records; i++) {
                //fprintf(stderr, "%d -> %d -> %d, %d\n", (int)i, avg_qual[i], avg[MIN(2559, avg_qual[i])], s->flags[i]>>16);
                s->flags[i] |= avg[MIN(2559, avg_qual[i])] <<16;
            }
            pm->do_sel = 1;
            max_sel = 3;
        } else if ((pm->do_qa == -1 || pm->do_qa >= 2) && e2 + s->num_records/8 < e1*qm) {
            //fprintf(stderr, "do q2\n");
            for (i = 0; i < s->num_records; i++)
                s->flags[i] |= (avg[MIN(2559, avg_qual[i])]>>1) <<16;
            pm->do_sel = 1;
            max_sel = 1;
        }

        if (pm->do_qa == -1) {
            // assume qual, pos, delta in that order.
            if (pm->pbits > 0 && pm->dbits > 0) {
                // 1 from pos/delta
                pm->sloc = pm->dloc-1;
                pm->pbits--;
                pm->dbits--;
                pm->dloc++;
            } else if (pm->dbits >= 2) {
                // 2 from delta
                pm->sloc = pm->dloc;
                pm->dbits -= 2;
                pm->dloc += 2;
            } else if (pm->qbits >= 2) {
                pm->qbits -= 2;
                pm->ploc -= 2;
                pm->sloc = 16-2 - pm->do_r2;
                if (pm->qbits == 6 && pm->qshift == 5)
                    pm->qbits--;
            }
            pm->do_qa = 4;
        }
    }

    // Auto tune: does splitting up READ1 and READ2 help us?
    if (has_r2 || pm->do_r2) { // FIXME: && but debug for now
        double e1 = 0, e2 = 0; // entropy sum

        for (j = 0; j < NP; j++) {
            if (!t1[j] || !t2[j]) continue;
            for (i = 0; i < 256; i++) {
                if (!qhistb[j][i]) continue;
                e1 -= (qhistb[j][i])*log(qhistb[j][i] / (double)(t1[j]+t2[j]));
                if (qhist1[j][i])
                    e2 -= qhist1[j][i] * log(qhist1[j][i] / (double)t1[j]);
                if (qhist2[j][i])
                    e2 -= qhist2[j][i] * log(qhist2[j][i] / (double)t2[j]);
            }
        }
        e1 /= log(2)*8; // bytes
        e2 /= log(2)*8;

        //fprintf(stderr, "read1/2 entropy merge %f split %f\n", e1, e2);

        // Note by using the selector we're robbing bits from elsewhere in
        // the context, which may reduce compression better.
        // We don't know how much by, so this is basically a guess!
        // For now we just say need 5% saving here.
        double qm = pm->do_r2 > 0 ? 1 : 0.95;
        if (e2 + (8+s->num_records/8) < e1*qm) {
            for (rec = 0; rec < s->num_records; rec++) {
                if (one_param >= 0 && (s->flags[rec] >> 16) != one_param)
                    continue;
                int sel = s->flags[rec] >> 16;
                s->flags[rec] =  (s->flags[rec] & 0xffff)
                    | ((s->flags[rec] & FQZ_FREAD2)
                       ? ((sel*2)+1) << 16
                       : ((sel*2)+0) << 16);
                if (max_sel < (s->flags[rec]>>16))
                    max_sel = (s->flags[rec]>>16);
            }
        }
    }

    // We provided explicit selector data or auto-tuned it
    if (max_sel > 0) {
        pm->do_sel = 1;
        pm->max_sel = max_sel;
    }

    free(avg_qual);
}

static inline
int fqz_store_parameters1(fqz_param *pm, unsigned char *comp) {
    int comp_idx = 0, i, j;

    // Starting context
    comp[comp_idx++] = pm->context;
    comp[comp_idx++] = pm->context >> 8;

    comp[comp_idx++] = pm->pflags;
    comp[comp_idx++] = pm->max_sym;

    comp[comp_idx++] = (pm->qbits<<4)|pm->qshift;
    comp[comp_idx++] = (pm->qloc<<4)|pm->sloc;
    comp[comp_idx++] = (pm->ploc<<4)|pm->dloc;

    if (pm->store_qmap) {
        for (i = j = 0; i < 256; i++)
            if (pm->qmap[i] != INT_MAX)
                comp[comp_idx++] = i;
    }

    if (pm->qbits && pm->use_qtab)
        // custom qtab
        comp_idx += store_array(comp+comp_idx, pm->qtab, 256);

    if (pm->pbits && pm->use_ptab)
        // custom ptab
        comp_idx += store_array(comp+comp_idx, pm->ptab, 1024);

    if (pm->dbits && pm->use_dtab)
        // custom dtab
        comp_idx += store_array(comp+comp_idx, pm->dtab, 256);

    return comp_idx;
}

static 
int fqz_store_parameters(fqz_gparams *gp, unsigned char *comp) {
    int comp_idx = 0;
    comp[comp_idx++] = gp->vers; // Format number

    comp[comp_idx++] = gp->gflags;

    if (gp->gflags & GFLAG_MULTI_PARAM)
        comp[comp_idx++] = gp->nparam;

    if (gp->gflags & GFLAG_HAVE_STAB) {
        comp[comp_idx++] = gp->max_sel;
        comp_idx += store_array(comp+comp_idx, gp->stab, 256);
    }

    int i;
    for (i = 0; i < gp->nparam; i++)
        comp_idx += fqz_store_parameters1(&gp->p[i], comp+comp_idx);

    //fprintf(stderr, "Encoded %d bytes of param\n", comp_idx);
    return comp_idx;
}

// Choose a set of parameters based on quality statistics and
// some predefined options (selected via "strat").
static inline
int fqz_pick_parameters(fqz_gparams *gp,
                        int vers,
                        int strat,
                        fqz_slice *s,
                        unsigned char *in,
                        size_t in_size) {
    //approx sqrt(delta), must be sequential
    int dsqr[] = {
        0, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5,
        5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
        6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
    };
    uint32_t qhist[256] = {0};

    if (strat >= nstrats) strat = nstrats-1;

    // Start with 1 set of parameters.
    // FIXME: add support for multiple params later.
    memset(gp, 0, sizeof(*gp));
    gp->vers = FQZ_VERS;

    if (!(gp->p = calloc(1, sizeof(fqz_param))))
        return -1;
    gp->nparam = 1;
    gp->max_sel = 0;

    if (vers == 3) // V3.0 doesn't store qual in original orientation
        gp->gflags |= GFLAG_DO_REV;

    fqz_param *pm = gp->p;

    // Programmed strategies, which we then amend based on our
    // statistical analysis of the quality stream.
    pm->qbits  = strat_opts[strat][0];
    pm->qshift = strat_opts[strat][1];
    pm->pbits  = strat_opts[strat][2];
    pm->pshift = strat_opts[strat][3];
    pm->dbits  = strat_opts[strat][4];
    pm->dshift = strat_opts[strat][5];
    pm->qloc   = strat_opts[strat][6];
    pm->sloc   = strat_opts[strat][7];
    pm->ploc   = strat_opts[strat][8];
    pm->dloc   = strat_opts[strat][9];

    // Params for controlling behaviour here.
    pm->do_r2 = strat_opts[strat][10];
    pm->do_qa = strat_opts[strat][11];

    // Validity check input lengths and buffer size
    size_t tlen = 0, i;
    for (i = 0; i < s->num_records; i++) {
        if (tlen + s->len[i] > in_size)
            // Oversized buffer
            s->len[i] = in_size - tlen;
        tlen += s->len[i];
    }
    if (s->num_records > 0 && tlen < in_size)
        // Undersized buffer
        s->len[s->num_records-1] += in_size - tlen;

    // Quality metrics, for all recs
    fqz_qual_stats(s, in, in_size, pm, qhist, -1);

    pm->store_qmap = (pm->nsym <= 8 && pm->nsym*2 < pm->max_sym);

    // Check for fixed length.
    uint32_t first_len = s->len[0];
    for (i = 1; i < s->num_records; i++) {
        if (s->len[i] != first_len)
            break;
    }
    pm->fixed_len = (i == s->num_records);
    pm->use_qtab = 0; // unused by current encoder

    if (strat >= nstrats-1)
        goto manually_set; // used in TEST_MAIN for debugging

    if (pm->pshift < 0)
        pm->pshift = MAX(0, log((double)s->len[0]/(1<<pm->pbits))/log(2)+.5);

    if (pm->nsym <= 4) {
        // NovaSeq
        pm->qshift = 2; // qmax 64, although we can store up to 256 if needed
        if (in_size < 5000000) {
            pm->pbits =2;
            pm->pshift=5;
        }
    } else if (pm->nsym <= 8) {
        // HiSeqX
        pm->qbits =MIN(pm->qbits,9);
        pm->qshift=3;
        if (in_size < 5000000)
            pm->qbits =6;
    }

    if (in_size < 300000) {
        pm->qbits=pm->qshift;
        pm->dbits=2;
    }

 manually_set:
//    fprintf(stderr, "-x 0x%x%x%x%x%x%x%x%x%x%x%x%x\n",
//          pm->qbits, pm->qshift,
//          pm->pbits, pm->pshift,
//          pm->dbits, pm->dshift,
//          pm->qloc, pm->sloc, pm->ploc, pm->dloc,
//          pm->do_r2, pm->do_qa);

    for (i = 0; i < sizeof(dsqr)/sizeof(*dsqr); i++)
        if (dsqr[i] > (1<<pm->dbits)-1)
            dsqr[i] = (1<<pm->dbits)-1;

    if (pm->store_qmap) {
        int j;
        for (i = j = 0; i < 256; i++)
            if (qhist[i])
                pm->qmap[i] = j++;
            else
                pm->qmap[i] = INT_MAX;
        pm->max_sym = pm->nsym;
    } else {
        pm->nsym = 255;
        for (i = 0; i < 256; i++)
            pm->qmap[i] = i;
    }
    if (gp->max_sym < pm->max_sym)
        gp->max_sym = pm->max_sym;

    // Produce ptab from pshift.
    if (pm->qbits) {
        for (i = 0; i < 256; i++) {
            pm->qtab[i] = i; // 1:1

            // Alternative mappings:
            //qtab[i] = i > 30 ? MIN(max_sym,i)-15 : i/2;  // eg for 9827 BAM
        }

    }
    pm->qmask = (1<<pm->qbits)-1;

    if (pm->pbits) {
        for (i = 0; i < 1024; i++)
            pm->ptab[i] = MIN((1<<pm->pbits)-1, i>>pm->pshift);

        // Alternatively via analysis of quality distributions we
        // may select a bunch of positions that are special and
        // have a non-uniform ptab[].
        // Manual experimentation on a NovaSeq run saved 2.8% here.
    }

    if (pm->dbits) {
        for (i = 0; i < 256; i++)
            pm->dtab[i] = dsqr[MIN(sizeof(dsqr)/sizeof(*dsqr)-1, i>>pm->dshift)];
    }

    pm->use_ptab = (pm->pbits > 0);
    pm->use_dtab = (pm->dbits > 0);

    pm->pflags =
        (pm->use_qtab   ?PFLAG_HAVE_QTAB :0)|
        (pm->use_dtab   ?PFLAG_HAVE_DTAB :0)|
        (pm->use_ptab   ?PFLAG_HAVE_PTAB :0)|
        (pm->do_sel     ?PFLAG_DO_SEL    :0)|
        (pm->fixed_len  ?PFLAG_DO_LEN    :0)|
        (pm->do_dedup   ?PFLAG_DO_DEDUP  :0)|
        (pm->store_qmap ?PFLAG_HAVE_QMAP :0);

    gp->max_sel = 0;
    if (pm->do_sel) {
        // 2 selectors values, but 1 parameter block.
        // We'll use the sloc instead to encode the selector bits into
        // the context.
        gp->max_sel = 1; // indicator to check recs
        gp->gflags |= GFLAG_HAVE_STAB;
        // NB: stab is already all zero
    }

    if (gp->max_sel) {
        int max = 0;
        for (i = 0; i < s->num_records; i++) {
            if (max < (s->flags[i] >> 16))
                max = (s->flags[i] >> 16);
        }
        gp->max_sel = max;
    }

    return 0;
}

static void fqz_free_parameters(fqz_gparams *gp) {
    if (gp && gp->p) free(gp->p);
}

static int compress_new_read(fqz_slice *s,
                             fqz_state *state,
                             fqz_gparams *gp,
                             fqz_param *pm,
                             fqz_model *model,
                             RangeCoder *rc,
                             unsigned char *in,
                             size_t *in_i, // in[in_i],
                             unsigned int *last) {
    ssize_t rec = state->rec;
    size_t i = *in_i;
    if (pm->do_sel || (gp->gflags & GFLAG_MULTI_PARAM)) {
        state->s = rec < s->num_records
            ? s->flags[rec] >> 16 // reuse spare bits
            : 0;
        SIMPLE_MODEL(256,_encodeSymbol)(&model->sel, rc, state->s);
    } else {
        state->s = 0;
    }
    int x = (gp->gflags & GFLAG_HAVE_STAB) ? gp->stab[state->s] : state->s;
    pm = &gp->p[x];

    int len = s->len[rec];
    if (!pm->fixed_len || state->first_len) {
        SIMPLE_MODEL(256,_encodeSymbol)(&model->len[0], rc, (len>> 0) & 0xff);
        SIMPLE_MODEL(256,_encodeSymbol)(&model->len[1], rc, (len>> 8) & 0xff);
        SIMPLE_MODEL(256,_encodeSymbol)(&model->len[2], rc, (len>>16) & 0xff);
        SIMPLE_MODEL(256,_encodeSymbol)(&model->len[3], rc, (len>>24) & 0xff);
        state->first_len = 0;
    }

    if (gp->gflags & GFLAG_DO_REV) {
        // no need to reverse complement for V4.0 as the core format
        // already has this feature.
        if (s->flags[rec] & FQZ_FREVERSE)
            SIMPLE_MODEL(2,_encodeSymbol)(&model->revcomp, rc, 1);
        else
            SIMPLE_MODEL(2,_encodeSymbol)(&model->revcomp, rc, 0);
    }

    state->rec++;

    state->qtot = 0;
    state->qlen = 0;

    state->p = len;
    state->delta = 0;
    state->qctx = 0;
    state->prevq = 0;

    *last = pm->context;

    if (pm->do_dedup) {
        // Possible dup of previous read?
        if (i && len == state->last_len &&
            !memcmp(in+i-state->last_len, in+i, len)) {
            SIMPLE_MODEL(2,_encodeSymbol)(&model->dup, rc, 1);
            i += len-1;
            state->p = 0;
            *in_i = i;
            return 1; // is a dup
        } else {
            SIMPLE_MODEL(2,_encodeSymbol)(&model->dup, rc, 0);
        }

        state->last_len = len;
    }

    *in_i = i;

    return 0; // not dup
}

static
unsigned char *compress_block_fqz2f(int vers,
                                    int strat,
                                    fqz_slice *s,
                                    unsigned char *in,
                                    size_t in_size,
                                    size_t *out_size,
                                    fqz_gparams *gp) {
    fqz_gparams local_gp;
    int free_params = 0;

    unsigned int last = 0;
    size_t i, j;
    ssize_t rec = 0;

    int comp_idx = 0;
    RangeCoder rc;

    unsigned char *comp = (unsigned char *)malloc(in_size*1.1+100000);
    unsigned char *compe = comp + (size_t)(in_size*1.1+100000);
    if (!comp)
        return NULL;

    // Pick and store params
    if (!gp) {
        gp = &local_gp;
        if (fqz_pick_parameters(gp, vers, strat, s, in, in_size) < 0)
            return NULL;
        free_params = 1;
    }

    //dump_params(gp);
    comp_idx = var_put_u32(comp, compe, in_size);
    comp_idx += fqz_store_parameters(gp, comp+comp_idx);

    fqz_param *pm;

    // Optimise tables to remove shifts in loop (NB: cannot do this in next vers)
    for (j = 0; j < gp->nparam; j++) {
        pm = &gp->p[j];

        for (i = 0; i < 1024; i++)
            pm->ptab[i] <<= pm->ploc;

        for (i = 0; i < 256; i++)
            pm->dtab[i] <<= pm->dloc;
    }

    // Create models and initialise range coder
    fqz_model model;
    if (fqz_create_models(&model, gp) < 0)
        return NULL;

    RC_SetOutput(&rc, (char *)comp+comp_idx);
    RC_StartEncode(&rc);

    // For CRAM3.1, reverse upfront if needed
    pm = &gp->p[0];
    if (gp->gflags & GFLAG_DO_REV) {
        i = rec = j = 0;
        while (i < in_size) {
            int len = rec < s->num_records-1
                ? s->len[rec] : in_size - i;

            if (s->flags[rec] & FQZ_FREVERSE) {
                // Reverse complement sequence - note: modifies buffer
                int I,J;
                unsigned char *cp = in+i;
                for (I = 0, J = len-1; I < J; I++, J--) {
                    unsigned char c;
                    c = cp[I];
                    cp[I] = cp[J];
                    cp[J] = c;
                }
            }

            i += len;
            rec++;
        }
        rec = 0;
    }

    fqz_state state = {0};
    pm = &gp->p[0];
    state.p = 0;
    state.first_len = 1;
    state.last_len = 0;
    state.rec = rec;

    for (i = 0; i < in_size; i++) {
        if (state.p == 0) {
            if (compress_new_read(s, &state, gp, pm, &model, &rc,
                                  in, &i, /*&rec,*/ &last))
                continue;
        }

#if 0
        //                        fqz_qual_stats imp.
        // q40 6.876  6.852       5.96
        // q4  6.566              5.07
        // _Q  1.383              1.11
        unsigned char q = in[i];
        unsigned char qm = pm->qmap[q];

        SIMPLE_MODEL(QMAX,_encodeSymbol)(&model.qual[last], &rc, qm);
        last = fqz_update_ctx(pm, &state, qm);
#else
        //     gcc    clang            gcc+fqz_qual_stats imp.
        // q40 5.033  5.026     -27%   4.137 -38%
        // q4  5.595            -15%   4.011 -36%
        // _Q  1.225            -11%   0.956
        int j = -1;

        while (state.p >= 4 && i+j+4 < in_size) {
            int l1 = last, l2, l3, l4;
            // Model has symbols sorted by frequency, so most common are at
            // start.  So while model is approx 1Kb, the first cache line is
            // a big win.
            mm_prefetch(&model.qual[l1]);
            unsigned char qm1 = pm->qmap[in[i + ++j]];
            last = fqz_update_ctx(pm, &state, qm1); l2 = last;

            mm_prefetch(&model.qual[l2]);
            unsigned char qm2 = pm->qmap[in[i + ++j]];
            last = fqz_update_ctx(pm, &state, qm2); l3 = last;

            mm_prefetch(&model.qual[l3]);
            unsigned char qm3 = pm->qmap[in[i + ++j]];
            last = fqz_update_ctx(pm, &state, qm3); l4 = last;

            mm_prefetch(&model.qual[l4]);
            unsigned char qm4 = pm->qmap[in[i + ++j]];
            last = fqz_update_ctx(pm, &state, qm4);

            SIMPLE_MODEL(QMAX,_encodeSymbol)(&model.qual[l1], &rc, qm1);
            SIMPLE_MODEL(QMAX,_encodeSymbol)(&model.qual[l2], &rc, qm2);
            SIMPLE_MODEL(QMAX,_encodeSymbol)(&model.qual[l3], &rc, qm3);
            SIMPLE_MODEL(QMAX,_encodeSymbol)(&model.qual[l4], &rc, qm4);
        }

        while (state.p > 0) {
            int l2 = last;
            mm_prefetch(&model.qual[last]);
            unsigned char qm = pm->qmap[in[i + ++j]];
            last = fqz_update_ctx(pm, &state, qm);
            SIMPLE_MODEL(QMAX,_encodeSymbol)(&model.qual[l2], &rc, qm);
        }
        i += j;
#endif
    }

    RC_FinishEncode(&rc);

    // For CRAM3.1, undo our earlier reversal step
    rec = state.rec;
    if (gp->gflags & GFLAG_DO_REV) {
        i = rec = j = 0;
        while (i < in_size) {
            int len = rec < s->num_records-1
                ? s->len[rec]
                : in_size - i;

            if (s->flags[rec] & FQZ_FREVERSE) {
                // Reverse complement sequence - note: modifies buffer
                int I,J;
                unsigned char *cp = in+i;
                for (I = 0, J = len-1; I < J; I++, J--) {
                    unsigned char c;
                    c = cp[I];
                    cp[I] = cp[J];
                    cp[J] = c;
                }
            }

            i += len;
            rec++;
        }
    }

    // Clear selector abuse of flags
    for (rec = 0; rec < s->num_records; rec++)
        s->flags[rec] &= 0xffff;

    *out_size = comp_idx + RC_OutSize(&rc);
    //fprintf(stderr, "%d -> %d\n", (int)in_size, (int)*out_size);

    fqz_destroy_models(&model);
    if (free_params)
        fqz_free_parameters(gp);

    return comp;
}

// Read fqz paramaters.
//
// FIXME: pass in and check in_size.
//
// Returns number of bytes read on success,
//         -1 on failure.
static inline
int fqz_read_parameters1(fqz_param *pm, unsigned char *in, size_t in_size) {
    int in_idx = 0;
    size_t i;

    if (in_size < 7)
        return -1;

    // Starting context
    pm->context = in[in_idx] + (in[in_idx+1]<<8);
    in_idx += 2;

    // Bit flags
    pm->pflags     = in[in_idx++];
    pm->use_qtab   = pm->pflags & PFLAG_HAVE_QTAB;
    pm->use_dtab   = pm->pflags & PFLAG_HAVE_DTAB;
    pm->use_ptab   = pm->pflags & PFLAG_HAVE_PTAB;
    pm->do_sel     = pm->pflags & PFLAG_DO_SEL;
    pm->fixed_len  = pm->pflags & PFLAG_DO_LEN;
    pm->do_dedup   = pm->pflags & PFLAG_DO_DEDUP;
    pm->store_qmap = pm->pflags & PFLAG_HAVE_QMAP;
    pm->max_sym    = in[in_idx++];

    // Sub-context sizes and locations
    pm->qbits      = in[in_idx]>>4;
    pm->qmask      = (1<<pm->qbits)-1;
    pm->qshift     = in[in_idx++]&15;
    pm->qloc       = in[in_idx]>>4;
    pm->sloc       = in[in_idx++]&15;
    pm->ploc       = in[in_idx]>>4;
    pm->dloc       = in[in_idx++]&15;

    // Maps and tables
    if (pm->store_qmap) {
        for (i = 0; i < 256; i++) pm->qmap[i] = INT_MAX; // so dump_map works
        if (in_idx + pm->max_sym > in_size)
            return -1;
        for (i = 0; i < pm->max_sym; i++)
            pm->qmap[i] = in[in_idx++];
    } else {
        for (i = 0; i < 256; i++)
            pm->qmap[i] = i;
    }

    if (pm->qbits) {
        if (pm->use_qtab) {
            int used = read_array(in+in_idx, in_size-in_idx, pm->qtab, 256);
            if (used < 0)
                return -1;
            in_idx += used;
        } else {
            for (i = 0; i < 256; i++)
                pm->qtab[i] = i;
        }
    }

    if (pm->use_ptab) {
        int used = read_array(in+in_idx, in_size-in_idx, pm->ptab, 1024);
        if (used < 0)
            return -1;
        in_idx += used;
    } else {
        for (i = 0; i < 1024; i++)
            pm->ptab[i] = 0;
    }

    if (pm->use_dtab) {
        int used = read_array(in+in_idx, in_size-in_idx, pm->dtab, 256);
        if (used < 0)
            return -1;
        in_idx += used;
    } else {
        for (i = 0; i < 256; i++)
            pm->dtab[i] = 0;
    }

    return in_idx;
}

static
int fqz_read_parameters(fqz_gparams *gp, unsigned char *in, size_t in_size) {
    int in_idx = 0;
    int i;

    if (in_size < 10)
        return -1;

    // Format version
    gp->vers = in[in_idx++];
    if (gp->vers != FQZ_VERS)
        return -1;

    // Global glags
    gp->gflags = in[in_idx++];

    // Number of param blocks and param selector details
    gp->nparam = (gp->gflags & GFLAG_MULTI_PARAM) ? in[in_idx++] : 1;
    if (gp->nparam <= 0)
        return -1;
    gp->max_sel = gp->nparam > 1 ? gp->nparam : 0;

    if (gp->gflags & GFLAG_HAVE_STAB) {
        gp->max_sel = in[in_idx++];
        int used = read_array(in+in_idx, in_size-in_idx, gp->stab, 256);
        if (used < 0)
            goto err;
        in_idx += used;
    } else {
        for (i = 0; i < gp->nparam; i++)
            gp->stab[i] = i;
        for (; i < 256; i++)
            gp->stab[i] = gp->nparam-1;
    }

    // Load the individual parameter locks
    if (!(gp->p = malloc(gp->nparam * sizeof(*gp->p))))
        return -1;

    gp->max_sym = 0;
    for (i = 0; i < gp->nparam; i++) {
        int e = fqz_read_parameters1(&gp->p[i], in + in_idx, in_size-in_idx);
        if (e < 0)
            goto err;
        if (gp->p[i].do_sel && gp->max_sel == 0)
            goto err; // Inconsistent
        in_idx += e;

        if (gp->max_sym < gp->p[i].max_sym)
            gp->max_sym = gp->p[i].max_sym;
    }

    //fprintf(stderr, "Decoded %d bytes of param\n", in_idx);
    return in_idx;

 err:
    fqz_free_parameters(gp);
    gp->nparam = 0;
    return -1;
}

// Handles the state.p==0 section of uncompress_block_fqz2f
static int decompress_new_read(fqz_slice *s,
                               fqz_state *state,
                               fqz_gparams *gp,
                               fqz_param *pm,
                               fqz_model *model,
                               RangeCoder *rc,
                               unsigned char *in, ssize_t *in_i, // in[in_i],
                               unsigned char *uncomp, size_t *out_size,
                               int *rev, char *rev_a, int *len_a,
                               int *lengths, int nlengths) {
    size_t i = *in_i;
    ssize_t rec = state->rec;

    if (pm->do_sel) {
        state->s = SIMPLE_MODEL(256,_decodeSymbol)(&model->sel, rc);
    } else {
        state->s = 0;
    }

    int x = (gp->gflags & GFLAG_HAVE_STAB)
        ? gp->stab[MIN(255, state->s)]
        : state->s;
    if (x >= gp->nparam)
        return -1;
    pm = &gp->p[x];

    unsigned int len = state->last_len;
    if (!pm->fixed_len || state->first_len) {
        len  = SIMPLE_MODEL(256,_decodeSymbol)(&model->len[0], rc);
        len |= SIMPLE_MODEL(256,_decodeSymbol)(&model->len[1], rc)<<8;
        len |= SIMPLE_MODEL(256,_decodeSymbol)(&model->len[2], rc)<<16;
        len |= ((unsigned)SIMPLE_MODEL(256,_decodeSymbol)(&model->len[3], rc))<<24;
        state->first_len = 0;
        state->last_len = len;
    }
    if (len > *out_size-i || len <= 0)
        return -1;

    if (lengths && rec < nlengths)
        lengths[rec] = len;

    if (gp->gflags & GFLAG_DO_REV) {
        *rev = SIMPLE_MODEL(2,_decodeSymbol)(&model->revcomp, rc);
        rev_a[rec] = *rev;
        len_a[rec] = len;
    }

    if (pm->do_dedup) {
        if (SIMPLE_MODEL(2,_decodeSymbol)(&model->dup, rc)) {
            // Dup of last line
            if (len > i)
                return -1;
            memcpy(uncomp+i, uncomp+i-len, len);
            i += len;
            state->p = 0;
            state->rec++;
            *in_i = i;
            return 1; // dup => continue
        }
    }

    state->rec++;
    state->p = len;
    state->delta = 0;
    state->prevq = 0;
    state->qctx = 0;
    state->ctx = pm->context;

    *in_i = i;

    return 0;
}


static
unsigned char *uncompress_block_fqz2f(fqz_slice *s,
                                      unsigned char *in,
                                      size_t in_size,
                                      size_t *out_size,
                                      int *lengths,
                                      int nlengths) {
    fqz_gparams gp;
    fqz_param *pm;
    char *rev_a = NULL;
    int *len_a = NULL;
    memset(&gp, 0, sizeof(gp));

    uint32_t len;
    ssize_t i, rec = 0, in_idx;
    in_idx = var_get_u32(in, in+in_size, &len);
    *out_size = len;

#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
    if (len > 100000)
        return NULL;
#endif

    unsigned char *uncomp = NULL;
    RangeCoder rc;
    unsigned int last = 0;

    // Decode parameter blocks
    if ((i = fqz_read_parameters(&gp, in+in_idx, in_size-in_idx)) < 0)
        return NULL;
    //dump_params(&gp);
    in_idx += i;

    // Optimisations to remove shifts from main loop
    for (i = 0; i < gp.nparam; i++) {
        int j;
        pm = &gp.p[i];
        for (j = 0; j < 1024; j++)
            pm->ptab[j] <<= pm->ploc;
        for (j = 0; j < 256; j++)
            pm->dtab[j] <<= pm->dloc;
    }

    // Initialise models and entropy coder
    fqz_model model;
    if (fqz_create_models(&model, &gp) < 0)
        return NULL;

    RC_SetInput(&rc, (char *)in+in_idx, (char *)in+in_size);
    RC_StartDecode(&rc);


    // Allocate buffers
    uncomp = (unsigned char *)malloc(*out_size);
    if (!uncomp)
        goto err;

    int nrec = 1000;
    rev_a = malloc(nrec);
    len_a = malloc(nrec * sizeof(int));
    if (!rev_a || !len_a)
        goto err;

    // Main decode loop
    fqz_state state;
    state.delta = 0;
    state.prevq = 0;
    state.qctx = 0;
    state.p = 0;
    state.s = 0;
    state.first_len = 1;
    state.last_len = 0;
    state.rec = 0;
    state.ctx = last;

    int rev = 0;
    int x = 0;
    pm = &gp.p[x];
    for (i = 0; i < len; ) {
        if (state.rec >= nrec) {
            nrec *= 2;
            rev_a = realloc(rev_a, nrec);
            len_a = realloc(len_a, nrec*sizeof(int));
            if (!rev_a || !len_a)
                goto err;
        }

        if (state.p == 0) {
            int r = decompress_new_read(s, &state, &gp, pm, &model, &rc,
                                        in, &i, uncomp, out_size,
                                        &rev, rev_a, len_a,
                                        lengths, nlengths);
            if (r < 0)
                goto err;
            if (r > 0)
                continue;
            last = state.ctx;
        }

        // Decode and update context
        do {
            unsigned char Q = SIMPLE_MODEL(QMAX,_decodeSymbol)
                (&model.qual[last], &rc);

            last = fqz_update_ctx(pm, &state, Q);
            uncomp[i++] = pm->qmap[Q];
        } while (state.p != 0 && i < len);
    }

    rec = state.rec;
    if (rec >= nrec) {
        nrec *= 2;
        rev_a = realloc(rev_a, nrec);
        len_a = realloc(len_a, nrec*sizeof(int));
        if (!rev_a || !len_a)
            goto err;
    }
    rev_a[rec] = rev;
    len_a[rec] = len;

    if (gp.gflags & GFLAG_DO_REV) {
        for (i = rec = 0; i < len && rec < nrec; i += len_a[rec++]) {
            if (!rev_a[rec])
                continue;

            int I, J;
            unsigned char *cp = uncomp+i;
            for (I = 0, J = len_a[rec]-1; I < J; I++, J--) {
                unsigned char c;
                c = cp[I];
                cp[I] = cp[J];
                cp[J] = c;
            }
        }
    }

    RC_FinishDecode(&rc);
    fqz_destroy_models(&model);
    free(rev_a);
    free(len_a);
    fqz_free_parameters(&gp);

#ifdef TEST_MAIN
    s->num_records = rec;
#endif

    return uncomp;

 err:
    fqz_destroy_models(&model);
    free(rev_a);
    free(len_a);
    fqz_free_parameters(&gp);
    free(uncomp);

    return NULL;
}

char *fqz_compress(int vers, fqz_slice *s, char *in, size_t uncomp_size,
                   size_t *comp_size, int strat, fqz_gparams *gp) {
    if (uncomp_size > INT_MAX) {
        *comp_size = 0;
        return NULL;
    }

    return (char *)compress_block_fqz2f(vers, strat, s, (unsigned char *)in,
                                        uncomp_size, comp_size, gp);
}

char *fqz_decompress(char *in, size_t comp_size, size_t *uncomp_size,
                     int *lengths, int nlengths) {
    return (char *)uncompress_block_fqz2f(NULL, (unsigned char *)in,
                                          comp_size, uncomp_size, lengths, nlengths);
}
