/*
 *  R : A Computer Language for Statistical Data Analysis
 *  Copyright (C) 2016-2023 The R Core Team
 *
 *  Based on code donated from the data.table package
 *  (C) 2006-2015 Matt Dowle and Arun Srinivasan.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, a copy is available at
 *  https://www.R-project.org/Licenses/
 */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <Defn.h>
#include <Internal.h>

/* It would be better to find a way to avoid abusing TRUELENGTH, but
   in the meantime replace TRUELENGTH/SET_TRUELENGTH with
   TRLEN/SET_TRLEN that cast to int to avoid warnings. */
#define TRLEN(x) ((int) TRUELENGTH(x))
#define SET_TRLEN(x, v) SET_TRUELENGTH(x, ((int) (v)))

// gs = groupsizes e.g.23, 12, 87, 2, 1, 34,...
static int *gs[2] = { NULL };
//two vectors flip flopped:flip and 1 - flip
static int flip = 0;
//allocated stack size
static int gsalloc[2] = { 0 };
static int gsngrp[2] = { 0 };
//max grpn so far
static int gsmax[2] = { 0 };
//max size of stack, set by do_radixsort to nrows
static int gsmaxalloc = 0;
//switched off for last arg unless retGrp==TRUE
static Rboolean stackgrps = TRUE;
// TRUE for setkey, FALSE for by=
static Rboolean sortStr = TRUE;
// used by do_radixsort and [i|d|c]sort to reorder order.
// not needed if narg==1
static int *newo = NULL;
// =1, 0, -1 for TRUE, NA, FALSE respectively.
// Value rewritten inside do_radixsort().
static int nalast = -1;
// =1, -1 for ascending and descending order respectively
static int order = 1;

//replaced n < 200 with n < N_SMALL.Easier to change later
#define N_SMALL 200
// range limit for counting sort. Should be less than INT_MAX
// (see setRange for details)
#define N_RANGE 100000

static SEXP *saveds = NULL;
static R_len_t *savedtl = NULL, nalloc = 0, nsaved = 0;

static void savetl_init(void)
{
    if (nsaved || nalloc || saveds || savedtl)
	error("Internal error: savetl_init checks failed (%d %d %p %p).",
	      nsaved, nalloc, (void *)saveds, (void *)savedtl);
    nsaved = 0;
    nalloc = 100;
    saveds = (SEXP *) malloc(nalloc * sizeof(SEXP));
    if (saveds == NULL)
	error("Could not allocate saveds in savetl_init");
    savedtl = (R_len_t *) malloc(nalloc * sizeof(R_len_t));
    if (savedtl == NULL) {
	free(saveds);
	error("Could not allocate saveds in savetl_init");
    }
}

static void savetl_end(void)
{
    // Can get called if nothing has been saved yet (nsaved == 0), or
    // even if _init() has not been called yet (pointers NULL). Such as
    // to clear up before error. Also, it might be that nothing needed
    // to be saved anyway.
    for (int i = 0; i < nsaved; i++)
	SET_TRLEN(saveds[i], savedtl[i]);
    free(saveds);  // does nothing on NULL input
    free(savedtl);
    nsaved = nalloc = 0;
    saveds = NULL;
    savedtl = NULL;
}


static void savetl(SEXP s)
{
    if (nsaved >= nalloc) {
	nalloc *= 2;
	char *tmp;
	tmp = (char *) realloc(saveds, nalloc * sizeof(SEXP));
	if (tmp == NULL) {
	    savetl_end();
	    error("Could not realloc saveds in savetl");
	}
	saveds = (SEXP *) tmp;
	tmp = (char *) realloc(savedtl, nalloc * sizeof(R_len_t));
	if (tmp == NULL) {
	    savetl_end();
	    error("Could not realloc savedtl in savetl");
	}
	savedtl = (R_len_t *) tmp;
    }
    saveds[nsaved] = s;
    savedtl[nsaved] = TRLEN(s);
    nsaved++;
}

// http://gcc.gnu.org/onlinedocs/cpp/Swallowing-the-Semicolon.html#Swallowing-the-Semicolon
#define Error(...) do {savetl_end(); error(__VA_ARGS__);} while(0)
#undef warning
// since it can be turned to error via warn = 2
#define warning(...) Do not use warning in this file
/* use malloc/realloc (not Calloc/Realloc) so we can trap errors
   and call savetl_end() before the error(). */

static void growstack(uint64_t newlen)
{
    // no link to icount range restriction,
    // just 100,000 seems a good minimum at 0.4MB
    if (newlen == 0) newlen = 100000;
    if (newlen > gsmaxalloc) newlen = gsmaxalloc;
    gs[flip] = realloc(gs[flip], newlen * sizeof(int));
    if (gs[flip] == NULL)
	Error("Failed to realloc working memory stack to %d*4bytes (flip=%d)",
	      (int)newlen /* no bigger than gsmaxalloc */, flip);
    gsalloc[flip] = (int)newlen;
}

static void push(int x)
{
    if (!stackgrps || x == 0)
	return;
    if (gsalloc[flip] == gsngrp[flip])
	growstack((uint64_t)(gsngrp[flip]) * 2);
    gs[flip][gsngrp[flip]++] = x;
    if (x > gsmax[flip])
	gsmax[flip] = x;
}

static void mpush(int x, int n)
{
    if (!stackgrps || x == 0)
	return;
    if (gsalloc[flip] < gsngrp[flip] + n)
	growstack(((uint64_t)(gsngrp[flip]) + n) * 2);
    for (int i = 0; i < n; i++)
	gs[flip][gsngrp[flip]++] = x;
    if (x > gsmax[flip])
	gsmax[flip] = x;
}

static void flipflop(void)
{
    flip = 1 - flip;
    gsngrp[flip] = 0;
    gsmax[flip] = 0;
    if (gsalloc[flip] < gsalloc[1 - flip])
	growstack((uint64_t)(gsalloc[1 - flip]) * 2);
}

static void gsfree(void)
{
    free(gs[0]);
    free(gs[1]);
    gs[0] = NULL;
    gs[1] = NULL;
    flip = 0;
    gsalloc[0] = gsalloc[1] = 0;
    gsngrp[0] = gsngrp[1] = 0;
    gsmax[0] = gsmax[1] = 0;
    gsmaxalloc = 0;
}

#ifdef TIMING_ON
// many calls to clock() can be expensive,
// hence compiled out rather than switch(verbose)
#include <time.h>
#define NBLOCK 20
static clock_t tblock[NBLOCK], tstart;
static int nblock[NBLOCK];
#define TBEG() tstart = clock();
#define TEND(i) tblock[i] += clock()-tstart; nblock[i]++; tstart = clock();
#else
#define TBEG()
#define TEND(i)
#endif

static int range, xmin; // used by both icount and do_radixsort
static void setRange(int *x, int n)
{
    xmin = NA_INTEGER;
    int xmax = NA_INTEGER;
    double overflow;

    int i = 0;
    while(i < n && x[i] == NA_INTEGER) i++;
    if (i < n) xmax = xmin = x[i];
    for (; i < n; i++) {
	int tmp = x[i];
	if (tmp == NA_INTEGER)
	    continue;
	if (tmp > xmax)
	    xmax = tmp;
	else if (tmp < xmin)
	    xmin = tmp;
    }
    // all NAs, nothing to do
    if (xmin == NA_INTEGER) {
	range = NA_INTEGER;
	return;
    }
    // ex: x=c(-2147483647L, NA_integer_, 1L) results in overflowing int range.
    overflow = (double) xmax - (double) xmin + 1;
    // detect and force iradix here, since icount is out of the picture
    if (overflow > INT_MAX) {
	range = INT_MAX;
	return;
    }

    range = xmax - xmin + 1;

    return;
}

// x*order results in integer overflow when -1*NA,
// so careful to avoid that here :
static inline int icheck(int x)
{
    // if nalast == 1, NAs must go last.
    return ((nalast != 1) ? ((x != NA_INTEGER) ? x*order : x) :
	    ((x != NA_INTEGER) ? (x*order) - 1 : INT_MAX));
}


static void icount(int *x, int *o, int n)
/* Counting sort:
   1. Places the ordering into o directly, overwriting whatever was there
   2. Doesn't change x
   3. Pushes group sizes onto stack
*/
{
    int napos = range; // NA's always counted in last bin
    // static is IMPORTANT, counting sort is called repetitively.
    static unsigned int counts[N_RANGE + 1] = { 0 };
    /* counts are set back to 0 at the end efficiently. 1e5 = 0.4MB i.e
       tiny. We'll only use the front part of it, as large as range. So it's
       just reserving space, not using it. Have defined N_RANGE to be 100000.*/
    if (range > N_RANGE)
	Error("Internal error: range = %d; isorted cannot handle range > %d",
	      range, N_RANGE);
    for (int i = 0; i < n; i++) {
	// For nalast=NA case, we won't remove/skip NAs, rather set 'o' indices
	// to 0. subset will skip them. We can't know how many NAs to skip
	// beforehand - i.e. while allocating "ans" vector
	if (x[i] == NA_INTEGER)
	    counts[napos]++;
	else
	    counts[x[i] - xmin]++;
    }
    
    int tmp = 0;
    if (nalast != 1 && counts[napos]) {
        push(counts[napos]);
        tmp += counts[napos];
    }
    int w = (order==1) ? 0 : range-1;
    for (int i = 0; i < range; i++) 
        /* no point in adding tmp < n && i <= range, since range includes max, 
           need to go to max, unlike 256 loops elsewhere in radixsort.c */
    {
	if (counts[w]) {
	    // cumulate but not through 0's.
	    // Helps resetting zeros when n < range, below.
	    push(counts[w]);
	    counts[w] = (tmp += counts[w]);
	}
        w += order; // order is +1 or -1
    }
    if (nalast == 1 && counts[napos]) {
        push(counts[napos]);
        counts[napos] = (tmp += counts[napos]);
    }
    for (int i = n - 1; i >= 0; i--) {
	// This way na.last=TRUE/FALSE cases will have just a
	// single if-check overhead.
	o[--counts[(x[i] == NA_INTEGER) ? napos :
		   x[i] - xmin]] = (int) (i + 1);
    }
    // nalast = 1, -1 are both taken care already.
    if (nalast == 0)
	// nalast = 0 is dealt with separately as it just sets o to 0
	for (int i = 0; i < n; i++)
	    o[i] = (x[o[i] - 1] == NA_INTEGER) ? 0 : o[i];
    // at those indices where x is NA. x[o[i]-1] because x is not modifed here.

    /* counts were cumulated above so leaves non zero.
       Faster to clear up now ready for next time. */
    if (n < range) {
	/* Many zeros in counts already. Loop through n instead,
	   doesn't matter if we set to 0 several times on any repeats */
	counts[napos] = 0;
	for (int i = 0; i < n; i++) {
	    if (x[i] != NA_INTEGER)
		counts[x[i] - xmin] = 0;
	}
    } else
	memset(counts, 0, (range + 1) * sizeof(int));
    return;
}

static void iinsert(int *x, int *o, int n)
/*  orders both x and o by reference in-place. Fast for small vectors,
    low overhead.  don't be tempted to binsearch backwards here, have
    to shift anyway; many memmove would have overhead and do the same
    thing. */
/*  when nalast == 0, iinsert will be called only from within iradix,
    where o[.] = 0 for x[.]=NA is already taken care of */
{
    for (int i = 1; i < n; i++) {
	int xtmp = x[i];
	if (xtmp < x[i - 1]) {
	    int j = i - 1;
	    int otmp = o[i];
	    while (j >= 0 && xtmp < x[j]) {
		x[j + 1] = x[j];
		o[j + 1] = o[j];
		j--;
	    }
	    x[j + 1] = xtmp;
	    o[j + 1] = otmp;
	}
    }
    int tt = 0;
    for (int i = 1; i < n; i++)
	if (x[i] == x[i - 1])
	    tt++;
	else {
	    push(tt + 1);
	    tt = 0;
	}
    push(tt + 1);
}

/*
  iradix is a counting sort performed forwards from MSB to LSB, with
  some tricks and short circuits building on Terdiman and Herf.
  http://codercorner.com/RadixSortRevisited.htm
  http://stereopsis.com/radix.html

  ~ Note they are LSD, but we do MSD here which is more complicated,
    for efficiency.
  ~ NAs need no special treatment as NA is the most negative integer
    in R (checked in init.c once,  for efficiency) so NA naturally sort
    to the front.
  ~ Using 4-pass 1-byte radix for the following reasons :

  * 11-bit (Herf) reduces to 3-passes (3*11=33) yes, and LSD need
    random access to o vector in each pass 1:n so reduction in passes is
    good, but Terdiman's idea to skip a radix if all values are equal
    occurs less the wider the radix. A narrower radix benefits more from that.
    * That's detected here using a single 'if', an improvement on
      Terdiman's exposition of a single loop to find if any count==n
    * The pass through counts bites when radix is wider,
      because we repetitively call this iradix from fastorder forwards.
  *  Herf's parallel histogramming is neat. In 4-pass 1-byte it needs
     4*256 storage, that's  tiny, and can be static. 4*256 << 3*2048.
     4-pass 1-byte is simpler and tighter code than 3-pass 11-bit,
     giving modern optimizers and modern CPUs a better chance.
     We may get lucky anyway, if one or two of the 4-passes are skipped.

  Recall: there are no comparisons at all in counting and radix,
  there is wide random access in each LSD radix pass, though.
*/

// 4 are used for iradix, 8 for dradix and i64radix
static unsigned int radixcounts[8][257] = { {0} };

static int skip[8];
/* global because iradix and iradix_r interact and are called repetitively.
   counts are set back to 0 after each use, to benefit from skipped radix. */
static void *radix_xsub = NULL;
static size_t radix_xsuballoc = 0;

static int *otmp = NULL, otmp_alloc = 0;
static void alloc_otmp(int n)
{
    if (otmp_alloc >= n)
	return;
    otmp = (int *) realloc(otmp, n * sizeof(int));
    if (otmp == NULL)
	Error("Failed to allocate working memory for otmp. Requested %d * %d bytes",
	      n, (int)sizeof(int));
    otmp_alloc = n;
}

// TO DO: save xtmp if possible, see allocs in do_radixsort
static void *xtmp = NULL;
static int xtmp_alloc = 0;
// TO DO: currently always the largest type (double) but
//        could be int if that's all that's needed
static void alloc_xtmp(int n)
{
    if (xtmp_alloc >= n)
	return;
    xtmp = (double *) realloc(xtmp, n * sizeof(double));
    if (xtmp == NULL)
	Error("Failed to allocate working memory for xtmp. Requested %d * %d bytes",
	      n, (int)sizeof(double));
    xtmp_alloc = n;
}

static void iradix_r(int *xsub, int *osub, int n, int radix);

static void iradix(int *x, int *o, int n)
/* As icount :
   Places the ordering into o directly, overwriting whatever was there
   Doesn't change x
   Pushes group sizes onto stack */
{
    int nextradix, itmp, thisgrpn, maxgrpn;
    unsigned int thisx = 0, shift, *thiscounts;

    for (int i = 0; i < n;i++) {
	/* parallel histogramming pass; i.e. count occurrences of
	   0:255 in each byte.  Sequential so almost negligible. */
	// relies on overflow behaviour. And shouldn't -INT_MIN be up in iradix?
	thisx = (unsigned int) (icheck(x[i])) - INT_MIN;
	// unrolled since inside n-loop
	radixcounts[0][thisx & 0xFF]++;
	radixcounts[1][thisx >> 8 & 0xFF]++;
	radixcounts[2][thisx >> 16 & 0xFF]++;
	radixcounts[3][thisx >> 24 & 0xFF]++;
    }
    for (int radix = 0; radix < 4; radix++) {
	/* any(count == n) => all radix must have been that value =>
	   last x (still thisx) was that value */
	int i = thisx >> (radix*8) & 0xFF;
	skip[radix] = radixcounts[radix][i] == n;
	// clear it now, the other counts must be 0 already
	if (skip[radix])
	    radixcounts[radix][i] = 0;
    }

    int radix = 3;  // MSD
    while (radix >= 0 && skip[radix]) radix--;
    if (radix == -1) { // All radix are skipped; one number repeated n times.
	if (nalast == 0 && x[0] == NA_INTEGER)
	    // all values are identical. return 0 if nalast=0 & all NA
	    // because of 'return', have to take care of it here.
	    for (int i = 0; i < n; i++)
		o[i] = 0;
	else
	    for (int i = 0; i < n; i++)
		o[i] = (i + 1);
	push(n);
	return;
    }
    for (int i = radix - 1; i >= 0; i--) {
	if (!skip[i])
	    memset(radixcounts[i], 0, 257 * sizeof(unsigned int));
	/* clear the counts as we only needed the parallel pass for skip[]
	   and we're going to use radixcounts again below. Can't use parallel
	   lower counts in MSD radix, unlike LSD. */
    }
    thiscounts = radixcounts[radix];
    shift = radix * 8;

    itmp = thiscounts[0];
    maxgrpn = itmp;
    for (int i = 1; itmp < n && i < 256; i++) {
	thisgrpn = thiscounts[i];
	if (thisgrpn) {
	    // don't cummulate through 0s, important below.
	    if (thisgrpn > maxgrpn)
		maxgrpn = thisgrpn;
	    thiscounts[i] = (itmp += thisgrpn);
	}
    }
    for (int i = n - 1; i >= 0; i--) {
	thisx = ((unsigned int) (icheck(x[i])) - INT_MIN) >> shift & 0xFF;
	o[--thiscounts[thisx]] = i + 1;
    }

    if (radix_xsuballoc < maxgrpn) {
        // The largest group according to the first non-skipped radix,
        // so could be big (if radix is needed on first arg)
        // TO DO: could include extra bits to divide the first radix
        // up more. Often the MSD has groups in just 0-4 out of 256.
        // free'd at the end of do_radixsort once we're done calling iradix
        // repetitively
        radix_xsub = (int *) realloc(radix_xsub, maxgrpn * sizeof(double));
        if (!radix_xsub)
            Error("Failed to realloc working memory %d*8bytes (xsub in iradix), radix=%d",
                  maxgrpn, radix);
        radix_xsuballoc = maxgrpn;
    }

    // TO DO: can we leave this to do_radixsort and remove these calls??
    alloc_otmp(maxgrpn);
    // TO DO: doesn't need to be sizeof(double) always, see inside
    alloc_xtmp(maxgrpn);

    nextradix = radix - 1;
    while (nextradix >= 0 && skip[nextradix]) nextradix--;
    if (thiscounts[0] != 0)
	Error("Internal error. thiscounts[0]=%d but should have been decremented to 0. dradix=%d",
	      thiscounts[0], radix);
    thiscounts[256] = n;
    itmp = 0;
    for (int i = 1; itmp < n && i <= 256; i++) {
        if (thiscounts[i] == 0) continue;
        // undo cumulate; i.e. diff
        thisgrpn = thiscounts[i] - itmp;
        if (thisgrpn == 1 || nextradix == -1) {
            push(thisgrpn);
        } else {
            for (int j = 0; j < thisgrpn; j++)
                // this is why this xsub here can't be the same memory as
                // xsub in do_radixsort.
                ((int *)radix_xsub)[j] = icheck(x[o[itmp+j]-1]);
            // changes xsub and o by reference recursively.
            iradix_r(radix_xsub, o+itmp, thisgrpn, nextradix);
        }
        itmp = thiscounts[i];
        thiscounts[i] = 0;
    }
    if (nalast == 0) // nalast = 1, -1 are both taken care already.
	// nalast = 0 is dealt with separately as it just sets o to 0
	for (int i = 0; i < n; i++)
	    o[i] = (x[o[i] - 1] == NA_INTEGER) ? 0 : o[i];
    // at those indices where x is NA. x[o[i]-1] because x is not
    // modified by reference unlike iinsert or iradix_r
}

static void iradix_r(int *xsub, int *osub, int n, int radix)
// xsub is a recursive offset into xsub working memory above in
// iradix, reordered by reference.  osub is a an offset into the main
// answer o, reordered by reference.  radix iterates 3,2,1,0
{
    int j, itmp, thisx, thisgrpn, nextradix, shift;
    unsigned int *thiscounts;

    // N_SMALL=200 is guess based on limited testing. Needs
    // calibrate().  Was 50 based on sum(1:50)=1275 worst -vs- 256
    // cummulate + 256 memset + allowance since reverse order is
    // unlikely.  when nalast==0, iinsert will be called only from
    // within iradix.
    if (n < N_SMALL) {
	iinsert(xsub, osub, n);
	return;
    }

    shift = radix * 8;
    thiscounts = radixcounts[radix];

    for (int i = 0; i < n; i++) {
	thisx = (unsigned int) xsub[i] - INT_MIN; // sequential in xsub
	thiscounts[thisx >> shift & 0xFF]++;
    }
    itmp = thiscounts[0];
    for (int i = 1; itmp < n && i < 256; i++)
	// don't cummulate through 0s, important below
	if (thiscounts[i])
	    thiscounts[i] = (itmp += thiscounts[i]);
    for (int i = n - 1; i >= 0; i--) {
	thisx = ((unsigned int) xsub[i] - INT_MIN) >> shift & 0xFF;
	j = --thiscounts[thisx];
	otmp[j] = osub[i];
	((int *) xtmp)[j] = xsub[i];
    }
    memcpy(osub, otmp, n * sizeof(int));
    memcpy(xsub, xtmp, n * sizeof(int));

    nextradix = radix - 1;
    while (nextradix >= 0 && skip[nextradix]) nextradix--;
    /* TO DO: If nextradix == -1 AND no further args from do_radixsort AND
       !retGrp, we're done. We have o. Remember to memset thiscounts
       before returning. */

    if (thiscounts[0] != 0)
	Error("Logical error. thiscounts[0]=%d but should have been decremented to 0. radix=%d",
	      thiscounts[0], radix);
    thiscounts[256] = n;
    itmp = 0;
    for (int i = 1; itmp < n && i <= 256; i++) {
	if (thiscounts[i] == 0)
	    continue;
	thisgrpn = thiscounts[i] - itmp;        // undo cummulate; i.e. diff
	if (thisgrpn == 1 || nextradix == -1) {
	    push(thisgrpn);
	} else {
	    iradix_r(xsub+itmp, osub+itmp, thisgrpn, nextradix);
	}
	itmp = thiscounts[i];
	thiscounts[i] = 0;
    }
}

// dradix from Arun's fastradixdouble.c
// + changed to MSD and hooked into do_radixsort framework here.
// + replaced tolerance with rounding s.f.

static unsigned long long dmask1;
static unsigned long long dmask2;

static void setNumericRounding(int dround)
{
    dmask1 = dround ? 1 << (8 * dround - 1) : 0;
    dmask2 = 0xffffffffffffffff << dround * 8;
}

static union {
    double d;
    unsigned long long ull;
} u;

static
unsigned long long dtwiddle(void *p, int i, int order)
{
    u.d = order * ((double *)p)[i]; // take care of 'order' at the beginning
    if (R_FINITE(u.d)) {
	u.ull = (u.d != 0.0) ? u.ull + ((u.ull & dmask1) << 1) : 0;
    } else if (ISNAN(u.d)) {
	u.ull = 0;
	return (nalast == 1 ? ~u.ull : u.ull);
    }
    unsigned long long mask = (u.ull & 0x8000000000000000) ?
	// always flip sign bit and if negative (sign bit was set)
	// flip other bits too
	0xffffffffffffffff : 0x8000000000000000;
    return ((u.ull ^ mask) & dmask2);
}

static Rboolean dnan(void *p, int i)
{
    u.d = ((double *) p)[i];
    return (ISNAN(u.d));
}

static unsigned long long (*twiddle) (void *, int, int);
static Rboolean(*is_nan) (void *, int);
// the size of the arg type (4 or 8). Just 8 currently until iradix is
// merged in.
static size_t colSize = 8;

static void dradix_r(unsigned char *xsub, int *osub, int n, int radix);

#ifdef WORDS_BIGENDIAN
#define RADIX_BYTE colSize - radix - 1
#else
#define RADIX_BYTE radix
#endif

static void dradix(unsigned char *x, int *o, int n)
{
    int radix, nextradix, itmp, thisgrpn, maxgrpn;
    unsigned int *thiscounts;
    unsigned long long thisx = 0;
    // see comments in iradix for structure.  This follows the same.
    // TO DO: merge iradix in here (almost ready)
    for (int i = 0; i < n; i++) {
	thisx = twiddle(x, i, order);
	for (radix = 0; radix < colSize; radix++)
	    // if dround == 2 then radix 0 and 1 will be all 0 here and skipped.
	    /* on little endian, 0 is the least significant bits (the right)
	       and 7 is the most including sign (the left); i.e. reversed. */
	    radixcounts[radix][((unsigned char *)&thisx)[RADIX_BYTE]]++;
    }
    for (radix = 0; radix < colSize; radix++) {
	// thisx is the last x after loop above
	int i = ((unsigned char *) &thisx)[RADIX_BYTE];
	skip[radix] = radixcounts[radix][i] == n;
	// clear it now, the other counts must be 0 already
	if (skip[radix])
	    radixcounts[radix][i] = 0;
    }
    radix = (int) colSize - 1;  // MSD
    while (radix >= 0 && skip[radix]) radix--;
    if (radix == -1) {
	// All radix are skipped; i.e. one number repeated n times.
	if (nalast == 0 && is_nan(x, 0))
	    // all values are identical. return 0 if nalast=0 & all NA
	    // because of 'return', have to take care of it here.
	    for (int i = 0; i < n; i++)
		o[i] = 0;
	else
	    for (int i = 0; i < n; i++)
		o[i] = (i + 1);
	push(n);
	return;
    }
    for (int i = radix - 1; i >= 0; i--) {
	// clear the lower radix counts, we only did them to know
	// skip. will be reused within each group
	if (!skip[i])
	    memset(radixcounts[i], 0, 257 * sizeof(unsigned int));
    }
    thiscounts = radixcounts[radix];
    itmp = thiscounts[0];
    maxgrpn = itmp;
    for (int i = 1; itmp < n && i < 256; i++) {
	thisgrpn = thiscounts[i];
	if (thisgrpn) {  // don't cummulate through 0s, important below
	    if (thisgrpn > maxgrpn)
		maxgrpn = thisgrpn;
	    thiscounts[i] = (itmp += thisgrpn);
	}
    }
    for (int i = n - 1; i >= 0; i--) {
	thisx = twiddle(x, i, order);
	o[ --thiscounts[((unsigned char *)&thisx)[RADIX_BYTE]] ] = i + 1;
    }

    if (radix_xsuballoc < maxgrpn) {
        // TO DO: centralize this alloc
        // The largest group according to the first non-skipped radix,
        // so could be big (if radix is needed on first arg) TO DO:
        // could include extra bits to divide the first radix up
        // more. Often the MSD has groups in just 0-4 out of 256.
        // free'd at the end of do_radixsort once we're done calling iradix
        // repetitively
        radix_xsub = (double *) realloc(radix_xsub, maxgrpn * sizeof(double));
        if (!radix_xsub)
            Error("Failed to realloc working memory %d*8bytes (xsub in dradix), radix=%d",
                  maxgrpn, radix);
        radix_xsuballoc = maxgrpn;
    }

    alloc_otmp(maxgrpn);   // TO DO: leave to do_radixsort and remove these?
    alloc_xtmp(maxgrpn);

    nextradix = radix - 1;
    while (nextradix >= 0 && skip[nextradix])
	nextradix--;
    if (thiscounts[0] != 0)
	Error("Logical error. thiscounts[0]=%d but should have been decremented to 0. dradix=%d",
	      thiscounts[0], radix);
    thiscounts[256] = n;
    itmp = 0;
    for (int i = 1; itmp < n && i <= 256; i++) {
        if (thiscounts[i] == 0)
            continue;
        thisgrpn = thiscounts[i] - itmp;  // undo cummulate; i.e. diff
        if (thisgrpn == 1 || nextradix == -1) {
            push(thisgrpn);
        } else {
            if (colSize == 4) { // ready for merging in iradix ...
                error("Not yet used, still using iradix instead");
                for (int j = 0; j < thisgrpn; j++)
                    ((int *)radix_xsub)[j] = (int)twiddle(x, o[itmp+j]-1, order);
                // this is why this xsub here can't be the same memory
                // as xsub in do_radixsort
            } else 
		for (int j = 0; j < thisgrpn; j++)
		    ((unsigned long long *)radix_xsub)[j] =
			twiddle(x, o[itmp+j]-1, order);
	    // changes xsub and o by reference recursively.
	    dradix_r(radix_xsub, o+itmp, thisgrpn, nextradix);
	}
	itmp = thiscounts[i];
	thiscounts[i] = 0;
    }
    if (nalast == 0) // nalast = 1, -1 are both taken care already.
	for (int i = 0; i < n; i++)
	    o[i] = is_nan(x, o[i] - 1) ? 0 : o[i];
    // nalast = 0 is dealt with separately as it just sets o to 0
    // at those indices where x is NA. x[o[i]-1] because x is not
    // modified by reference unlike iinsert or iradix_r

}

static void dinsert(unsigned long long *x, int *o, int n)
// orders both x and o by reference in-place. Fast for small vectors,
// low overhead.  don't be tempted to binsearch backwards here, have
// to shift anyway; many memmove would have overhead and do the same
// thing 'dinsert' will not be called when nalast = 0 and o[0] = -1.
{
    int otmp, tt;
    unsigned long long xtmp;
    for (int i = 1; i < n; i++) {
	xtmp = x[i];
	if (xtmp < x[i - 1]) {
	    int j = i - 1;
	    otmp = o[i];
	    while (j >= 0 && xtmp < x[j]) {
		x[j + 1] = x[j];
		o[j + 1] = o[j];
		j--;
	    }
	    x[j + 1] = xtmp;
	    o[j + 1] = otmp;
	}
    }
    tt = 0;
    for (int i = 1; i < n; i++)
	if (x[i] == x[i - 1])
	    tt++;
	else {
	    push(tt + 1);
	    tt = 0;
	}
    push(tt + 1);
}

static void dradix_r(unsigned char *xsub, int *osub, int n, int radix)
/* xsub is a recursive offset into xsub working memory above in
   dradix, reordered by reference.  osub is a an offset into the main
   answer o, reordered by reference.  dradix iterates
   7,6,5,4,3,2,1,0 */
{
    int itmp, thisgrpn, nextradix;
    unsigned int *thiscounts;
    unsigned char *p;
    if (n < 200) {
	/* 200 is guess based on limited testing. Needs calibrate(). Was 50
	   based on sum(1:50)=1275 worst -vs- 256 cummulate + 256 memset +
	   allowance since reverse order is unlikely */
	// order=1 here because it's already taken care of in iradix
	dinsert((void *)xsub, osub, n);

	return;
    }
    thiscounts = radixcounts[radix];
    p = xsub + RADIX_BYTE;
    for (int i = 0; i < n; i++) {
	thiscounts[*p]++;
	p += colSize;
    }
    itmp = thiscounts[0];
    for (int i = 1; itmp < n && i < 256; i++)
	// don't cummulate through 0s, important below
	if (thiscounts[i])
	    thiscounts[i] = (itmp += thiscounts[i]);
    p = xsub + (n - 1) * colSize;
    if (colSize == 4) {
	error("Not yet used, still using iradix instead");
	for (int i = n - 1; i >= 0; i--) {
	    int j = --thiscounts[*(p + RADIX_BYTE)];
	    otmp[j] = osub[i];
	    ((int *) xtmp)[j] = *(int *) p;
	    p -= colSize;
	}
    } else {
	for (int i = n - 1; i >= 0; i--) {
	    int j = --thiscounts[*(p + RADIX_BYTE)];
	    otmp[j] = osub[i];
	    ((unsigned long long *) xtmp)[j] = *(unsigned long long *) p;
	    p -= colSize;
	}
    }
    memcpy(osub, otmp, n * sizeof(int));
    memcpy(xsub, xtmp, n * colSize);

    nextradix = radix - 1;
    while (nextradix >= 0 && skip[nextradix])
	nextradix--;
    // TO DO: If nextradix==-1 and no further args from do_radixsort,
    // we're done. We have o. Remember to memset thiscounts before
    // returning.

    if (thiscounts[0] != 0)
	Error("Logical error. thiscounts[0]=%d but should have been decremented to 0. radix=%d",
	      thiscounts[0], radix);
    thiscounts[256] = n;
    itmp = 0;
    for (int i = 1; itmp < n && i <= 256; i++) {
	if (thiscounts[i] == 0)
	    continue;
	thisgrpn = thiscounts[i] - itmp;        // undo cummulate; i.e. diff
	if (thisgrpn == 1 || nextradix == -1)
	    push(thisgrpn);
	else
	    dradix_r(xsub + itmp * colSize, osub + itmp, thisgrpn,
		     nextradix);
	itmp = thiscounts[i];
	thiscounts[i] = 0;
    }
}

// TO DO?: dcount. Find step size, then range = (max-min)/step and
// proceed as icount. Many fixed precision floats (such as prices) may
// be suitable. Fixed precision such as 1.10, 1.15, 1.20, 1.25, 1.30
// ... do use all bits so dradix skipping may not help.

static int *cradix_counts = NULL;
static int cradix_counts_alloc = 0;
static int maxlen = 1;
static SEXP *cradix_xtmp = NULL;
static int cradix_xtmp_alloc = 0;

// same as StrCmp but also takes into account 'decreasing' and 'na.last' args.
static int StrCmp2(SEXP x, SEXP y)
{
    // same cached pointer (including NA_STRING == NA_STRING)
    if (x == y) return 0;
    // if x=NA, nalast=1 ? then x > y else x < y (Note: nalast == 0 is
    // already taken care of in 'csorted', won't be 0 here)
    if (x == NA_STRING) return nalast;
    if (y == NA_STRING) return -nalast;     // if y=NA, nalast=1 ? then y > x
    return order*strcmp(CHAR(x), CHAR(y));  // same as explanation in StrCmp
}

static int StrCmp(SEXP x, SEXP y)            // also used by bmerge and chmatch
{
    // same cached pointer (including NA_STRING == NA_STRING)
    if (x == y) return 0;
    if (x == NA_STRING) return -1;    // x < y
    if (y == NA_STRING) return 1;     // x > y
    // assumes strings are in same encoding
    return strcmp(CHAR(x), CHAR(y));
}

#define CHAR_ENCODING(x) (IS_ASCII(x) ? CE_UTF8 : getCharCE(x))

static void checkEncodings(SEXP x)
{
    cetype_t ce;

    int i;
    for (i = 0; i < length(x) && STRING_ELT(x, i) == NA_STRING; i++);

    if (i < length(x)) {
        ce = CHAR_ENCODING(STRING_ELT(x, i));
        if (ce == CE_NATIVE) {
            error(_("Character encoding must be UTF-8, Latin-1 or bytes"));
        }
    }

    /* Disabled for now -- doubles the time (for already sorted vectors): why?
    for (int i = 1; i < length(x); i++) {
        if (ce != CHAR_ENCODING(STRING_ELT(x, i))) {
            error(_("Mixed character encodings are not supported"));
        }
    }
    */
}

static void cradix_r(SEXP * xsub, int n, int radix)
// xsub is a unique set of CHARSXP, to be ordered by reference

// First time, radix == 0, and xsub == x. Then recursively moves SEXP together
// for L1 cache efficiency.

// Quite different to iradix because
//   1) x is known to be unique so fits in cache
//      (wide random access not an issue)
//   2) they're variable length character strings
//   3) no need to maintain o.  Just simply reorder x. No grps or push.

// Fortunately, UTF sorts in the same order if treated as ASCII, so we
// can simplify by doing it by bytes.

// TO DO: confirm a forwards (MSD) radix for efficiency, although more
// complicated.

// This part has nothing to do with truelength. The
// truelength stuff is to do with finding the unique strings.  We may
// be able to improve CHARSXP derefencing by submitting patch to R to
// make R's string cache contiguous but would likely be difficult. If
// we strxfrm, then it'll then be contiguous and compact then anyway.
{
    int itmp, *thiscounts, thisgrpn=0, thisx=0;
    SEXP stmp;

    // TO DO?: chmatch to existing sorted vector, then grow it.
    // TO DO?: if (n<N_SMALL = 200) insert sort, then loop through groups via ==
    if (n <= 1) return;
    if (n == 2) {
	if (StrCmp(xsub[1], xsub[0]) < 0) {
	    stmp = xsub[0];
	    xsub[0] = xsub[1];
	    xsub[1] = stmp;
	}
	return;
    }
    // TO DO: if (n < 50) cinsert (continuing from radix offset into
    // CHAR) or using StrCmp. But 256 is narrow, so quick and not too
    // much an issue.

    thiscounts = cradix_counts + radix * 256;
    for (int i = 0; i < n; i++) {
	thisx = xsub[i] == NA_STRING ?
	    0 : (radix < LENGTH(xsub[i]) ?
		 (unsigned char) (CHAR(xsub[i])[radix]) : 1);
	thiscounts[ thisx ]++;   // 0 for NA,  1 for ""
    }
    // this also catches when subx has shorter strings than the rest,
    // thiscounts[0] == n and we'll recurse very quickly through to the
    // overall maxlen with no 256 overhead each time
    if (thiscounts[thisx] == n && radix < maxlen - 1) {
	cradix_r(xsub, n, radix + 1);
	thiscounts[thisx] = 0;  // the rest must be 0 already, save the memset
	return;
    }
    itmp = thiscounts[0];
    for (int i = 1; i < 256; i++)
	// don't cummulate through 0s, important below
	if (thiscounts[i])
	    thiscounts[i] = (itmp += thiscounts[i]);
    for (int i = n - 1; i >= 0; i--) {
	thisx = xsub[i] == NA_STRING ?
	    0 : (radix < LENGTH(xsub[i]) ?
		 (unsigned char) (CHAR(xsub[i])[radix]) : 1);
	int j = --thiscounts[thisx];
	cradix_xtmp[j] = xsub[i];
    }
    memcpy(xsub, cradix_xtmp, n * sizeof(SEXP));
    if (radix == maxlen - 1) {
	memset(thiscounts, 0, 256 * sizeof(int));
	return;
    }
    if (thiscounts[0] != 0)
	Error("Logical error. counts[0]=%d in cradix but should have been decremented to 0. radix=%d",
	      thiscounts[0], radix);
    itmp = 0;
    for (int i = 1; i < 256; i++) {
	if (thiscounts[i] == 0)
	    continue;
	thisgrpn = thiscounts[i] - itmp;        // undo cummulate; i.e. diff
	cradix_r(xsub + itmp, thisgrpn, radix + 1);
	itmp = thiscounts[i];
	// set to 0 now since we're here, saves memset
	// afterwards. Important to clear! Also more portable for
	// machines where 0 isn't all bits 0 (?!)
	thiscounts[i] = 0;
    }
    if (itmp < n - 1)
	cradix_r(xsub + itmp, n - itmp, radix + 1);     // final group
}

static SEXP *ustr = NULL;
static int ustr_alloc = 0, ustr_n = 0;

static void cgroup(SEXP * x, int *o, int n)
// As icount :
//   Places the ordering into o directly, overwriting whatever was there
//   Doesn't change x
//   Pushes group sizes onto stack

// Only run when sortStr == FALSE. Basically a counting sort, in first
// appearance order, directly.  Since it doesn't sort the strings, the
// name is cgroup.  there is no _pre for this.  ustr created and
// cleared each time.
{
    // savetl_init() is called once at the start of do_radixsort
    if (ustr_n != 0)
	Error
	    ("Internal error. ustr isn't empty when starting cgroup: ustr_n=%d, ustr_alloc=%d",
	     ustr_n, ustr_alloc);
    for (int i = 0; i < n; i++) {
	SEXP s = x[i];
	if (TRLEN(s) < 0) {        // this case first as it's the most frequent
	    SET_TRLEN(s, TRLEN(s) - 1);
	    // use negative counts so as to detect R's own (positive)
	    // usage of tl on CHARSXP
	    continue;
	}
	if (TRLEN(s) > 0) {
	    // Save any of R's own usage of tl (assumed positive, so
	    // we can both count and save in one scan), to restore
	    // afterwards. From R 2.14.0, tl is initialized to 0,
	    // prior to that it was random so this step saved too much.
	    savetl(s);
	    SET_TRLEN(s, 0);
	}
	if (ustr_alloc <= ustr_n) {
	    // 10000 = 78k of 8byte pointers. Small initial guess,
	    // negligible time to alloc.
	    ustr_alloc = (ustr_alloc == 0) ? 10000 : ustr_alloc*2;
	    if (ustr_alloc > n)
		ustr_alloc = n;
	    ustr = realloc(ustr, ustr_alloc * sizeof(SEXP));
	    if (ustr == NULL)
		Error("Unable to realloc %d * %d bytes in cgroup", ustr_alloc,
		      (int)sizeof(SEXP));
	}
	SET_TRLEN(s, -1);
	ustr[ustr_n++] = s;
    }
    // TO DO: the same string in different encodings will be
    // considered different here. Sweep through ustr and merge counts
    // where equal (sort needed therefore, unfortunately?, only if
    // there are any marked encodings present)
    int cumsum = 0;
    for (int i = 0; i < ustr_n; i++) {      // 0.000
	push(-TRLEN(ustr[i]));
	SET_TRLEN(ustr[i], cumsum += -TRLEN(ustr[i]));
    }
    int *target = (o[0] != -1) ? newo : o;
    for (int i = n - 1; i >= 0; i--) {
	SEXP s = x[i];           // 0.400 (page fetches on string cache)
	int k = TRLEN(s) - 1;
	SET_TRLEN(s, k);
	target[k] = i + 1;      // 0.800 (random access to o)
    }
    // The cummulate meant counts are left non zero, so reset for next
    // time (0.00s).
    for (int i = 0; i < ustr_n; i++)
	SET_TRLEN(ustr[i], 0);
    ustr_n = 0;
}

static int *csort_otmp = NULL, csort_otmp_alloc = 0;
static void alloc_csort_otmp(int n)
{
    if (csort_otmp_alloc >= n)
	return;
    csort_otmp = (int *) realloc(csort_otmp, n * sizeof(int));
    if (csort_otmp == NULL)
	Error
	    ("Failed to allocate working memory for csort_otmp. Requested %d * %d bytes",
	     n, (int)sizeof(int));
    csort_otmp_alloc = n;
}

static void csort(SEXP * x, int *o, int n)
/*
   As icount :
   Places the ordering into o directly, overwriting whatever was there
   Doesn't change x
   Pushes group sizes onto stack
   Requires csort_pre() to have created and sorted ustr already
*/
{
    /* can't use otmp, since iradix might be called here and that uses
       otmp (and xtmp).  alloc_csort_otmp(n) is called from do_radixsort for
       either n=nrow if 1st arg, or n=maxgrpn if onwards args */
    for (int i = 0; i < n; i++)
	csort_otmp[i] = (x[i] == NA_STRING) ? NA_INTEGER : -TRLEN(x[i]);
    if (nalast == 0 && n == 2) {
        // special case for nalast == 0. n == 1 is handled inside
        // do_radixsort. at least 1 will be NA here else use o from caller
        // directly (not 1st arg)
        if (o[0] == -1)
            for (int i = 0; i < n; i++)
                o[i] = i + 1;
        for (int i = 0;  i < n; i++)
            if (csort_otmp[i] == NA_INTEGER)
                o[i] = 0;
        push(1); push(1);
        return; 
    }
    if (n < N_SMALL && nalast != 0) { // TO DO: calibrate() N_SMALL=200
        if (o[0] == -1)
            for (int i = 0; i < n; i++)
                o[i] = i + 1;
        // else use o from caller directly (not 1st arg)
        for (int i = 0; i < n; i++)
            csort_otmp[i] = icheck(csort_otmp[i]);
        iinsert(csort_otmp, o, n);
    } else {
	setRange(csort_otmp, n);
	if (range == NA_INTEGER)
	    Error("Internal error. csort's otmp contains all-NA");
	int *target = (o[0] != -1) ? newo : o;
	if (range <= N_RANGE)
	    // TO DO: calibrate(). radix was faster (9.2s
	    // "range<=10000" instead of 11.6s "range<=N_RANGE &&
	    // range<n") for run(7) where range=N_RANGE n=10000000
	    icount(csort_otmp, target, n);
	else
	    iradix(csort_otmp, target, n);
    }
    // all i* push onto stack. Using their counts may be faster here
    // than thrashing SEXP fetches over several passes as cgroup does
    // (but cgroup needs that to keep original order, and cgroup saves
    // the sort in csort_pre).
}

static void csort_pre(SEXP * x, int n)
// Finds ustr and sorts it.  Runs once for each arg (if
// sortStr == TRUE), then ustr is used by csort within each group ustr
// is grown on each character arg, to save sorting the same strings
// again if several args contain the same strings
{
    SEXP s;
    int old_un, new_un;
    // savetl_init() is called once at the start of do_radixsort
    old_un = ustr_n;
    for (int i = 0; i < n; i++) {
	s = x[i];
	// this case first as it's the most frequent. Already in ustr,
	// this negative is its ordering.
	if (TRLEN(s) < 0)
	    continue;
	// Save any of R's own usage of tl (assumed positive, so we
	// can both count and save in one scan), to restore
	// afterwards. From R 2.14.0, tl is initialized to 0, prior to
	// that it was random so this step saved too much.
	if (TRLEN(s) > 0) {
	    savetl(s);
	    SET_TRLEN(s, 0);
	}
	if (ustr_alloc <= ustr_n) {
	    // 10000 = 78k of 8byte pointers. Small initial guess,
	    // negligible time to alloc.
	    ustr_alloc = (ustr_alloc == 0) ? 10000 : ustr_alloc*2;
	    if (ustr_alloc > old_un+n)
		ustr_alloc = old_un + n;
	    ustr = realloc(ustr, ustr_alloc * sizeof(SEXP));
	    if (ustr == NULL)
		Error("Failed to realloc ustr. Requested %d * %d bytes",
		      ustr_alloc, (int)sizeof(SEXP));
	}
	SET_TRLEN(s, -1);  // this -1 will become its ordering later below
	ustr[ustr_n++] = s;
	// length on CHARSXP is the nchar of char * (excluding \0),
	// and treats marked encodings as if ascii.
	if (s != NA_STRING && LENGTH(s) > maxlen)
	    maxlen = LENGTH(s);
    }
    new_un = ustr_n;
    if (new_un == old_un)
	return;
    // No new strings observed, seen them all before in previous
    // arg. ustr already sufficient.  If we ever make ustr
    // permanently held by data.table, we'll just need to make the
    // final loop to set -i-1 before returning here.  sort ustr.

    // TODO: just sort new ones and merge them in.  These allocs are
    // here, to save them being in the recursive cradix_r()
    if (cradix_counts_alloc < maxlen) {
	cradix_counts_alloc = maxlen + 10;   // +10 to save too many reallocs
	cradix_counts = (int *)realloc(cradix_counts,
				       cradix_counts_alloc * 256 * sizeof(int));
	if (!cradix_counts)
	    Error("Failed to alloc cradix_counts");
	memset(cradix_counts, 0, cradix_counts_alloc * 256 * sizeof(int));
    }
    if (cradix_xtmp_alloc < ustr_n) {
        cradix_xtmp = (SEXP *) realloc(cradix_xtmp,  ustr_n * sizeof(SEXP));
        // TO DO: Reuse the one we have in do_radixsort.
        // Does it need to be n length?
        if (!cradix_xtmp)
            Error("Failed to alloc cradix_tmp");
        cradix_xtmp_alloc = ustr_n;
    }
    // sorts ustr in-place by reference save ordering in the
    // CHARSXP. negative so as to distinguish with R's own usage.
    cradix_r(ustr, ustr_n, 0);
    for (int i = 0; i < ustr_n; i++)
	SET_TRLEN(ustr[i], -i - 1);
}

// functions to test vectors for sortedness: isorted, dsorted and csorted

// base:is.unsorted returns NA in the presence of any NA, but we need
// to consider na.last, and we also return -1 if x is sorted in
// _strictly_ reverse order; a common case we optimize.  If a vector
// is in decreasing order *with ties*, then an in-place reverse (no
// sort) would result in instability of ties, so we are strict. We
// also save grouping information during the check; that information
// is required when sorting by multiple arguments.

// TO DO: test in big steps first to return faster if unsortedness is
// at the end (a common case of rbind'ing data to end) These are all
// sequential access to x, so very quick and cache efficient.

// order = 1 is ascending and order=-1 is descending; also takes care
// of na.last argument with check through 'icheck' Relies on
// NA_INTEGER == INT_MIN, checked in init.c
static int isorted(int *x, int n)
{
    int i = 1, j = 0;
    // when nalast = NA,
    // all NAs ? return special value to replace all o's values with '0'
    // any NAs ? return 0 = unsorted and leave it
    //   to sort routines to replace o's with 0's
    // no NAs ? continue to check rest of isorted - the same routine as usual
    if (nalast == 0) {
	for (int k = 0; k < n; k++)
	    if (x[k] != NA_INTEGER)
		j++;
	if (j == 0) {
	    push(n);
	    return (-2);
	}
	if (j != n)
	    return (0);
    }
    if (n <= 1) {
	push(n);
	return (1);
    }
    if (icheck(x[1]) < icheck(x[0])) {
	i = 2;
	while (i < n && icheck(x[i]) < icheck(x[i - 1]))
	    i++;
	// strictly opposite to expected 'order', no ties;
	if (i == n) {
	    mpush(1, n);
	    return (-1);
	}
	// e.g. no more than one NA at the beginning/end (for order=-1/1)
	else return (0);
    }
    int old = gsngrp[flip];
    int tt = 1;
    for (int i = 1; i < n; i++) {
	if (icheck(x[i]) < icheck(x[i - 1])) {
	    gsngrp[flip] = old;
	    return (0);
	}
	if (x[i] == x[i - 1])
	    tt++;
	else {
	    push(tt); tt = 1;
	}
    }
    push(tt);
    // same as 'order', NAs at the beginning for order=1, at end for
    // order=-1, possibly with ties
    return(1);
}

// order=1 is ascending and -1 is descending
// also accounts for nalast=0 (=NA), =1 (TRUE), -1 (FALSE) (in twiddle)
static int dsorted(double *x, int n)
{
    int i = 1, j = 0;
    unsigned long long prev, this;
    if (nalast == 0) {
	// when nalast = NA,
	// all NAs ? return special value to replace all o's values with '0'
	// any NAs ? return 0 = unsorted and leave it to sort routines to
	//           replace o's with 0's
	// no NAs  ? continue to check the rest of isorted -
	//           the same routine as usual
	for (int k = 0; k < n; k++)
	    if (!is_nan(x, k))
		j++;
	if (j == 0) {
	    push(n);
	    return (-2);
	}
	if (j != n)
	    return (0);
    }
    if (n <= 1) {
	push(n);
	return (1);
    }
    prev = twiddle(x, 0, order);
    this = twiddle(x, 1, order);
    if (this < prev) {
	i = 2;
	prev = this;
	while (i < n && (this = twiddle(x, i, order)) < prev) {
	    i++;
	    prev = this;
	}
	if (i == n) {
	    mpush(1, n);
	    return (-1);
	}
	// strictly opposite of expected 'order', no ties; e.g. no
	// more than one NA at the beginning/end (for order=-1/1)

	// TO DO: improve to be stable for ties in reverse
	else return(0);
    }
    int old = gsngrp[flip];
    int tt = 1;
    for (int i = 1; i < n; i++) {
	// TO DO: once we get past -Inf, NA and NaN at the bottom, and
	//        +Inf at the top, the middle only need be twiddled
	//        for tolerance (worth it?)
	this = twiddle(x, i, order);
	if (this < prev) {
	    gsngrp[flip] = old;
	    return (0);
	}
	if (this == prev)
	    tt++;
	else {
	    push(tt);
	    tt = 1;
	}
	prev = this;
    }
    push(tt);
    // exactly as expected in 'order' (1=increasing, -1=decreasing),
    // possibly with ties
    return (1);
}

// order=1 is ascending and -1 is descending
// also accounts for nalast=0 (=NA), =1 (TRUE), -1 (FALSE)
static int csorted(SEXP *x, int n)
{
    int i = 1, j = 0, tmp;
    if (nalast == 0) {
	// when nalast = NA,
	// all NAs ? return special value to replace all o's values with '0'
	// any NAs ? return 0 = unsorted and leave it to sort routines
	//           to replace o's with 0's
	// no NAs  ? continue to check the rest of isorted -
	//           the same routine as usual
	for (int k = 0; k < n; k++)
	    if (x[k] != NA_STRING)
		j++;
	if (j == 0) {
	    push(n);
	    return (-2);
	}
	if (j != n)
	    return (0);
    }
    if (n <= 1) {
	push(n);
	return (1);
    }
    if (StrCmp2(x[1], x[0]) < 0) {
	i = 2;
	while (i < n && StrCmp2(x[i], x[i - 1]) < 0)
	    i++;
	if (i == n) {
	    mpush(1, n);
	    return (-1);
	}
	// strictly opposite of expected 'order', no ties;
	// e.g. no more than one NA at the beginning/end (for order=-1/1)
	else
	    return (0);
    }
    int old = gsngrp[flip];
    int tt = 1;
    for (int i = 1; i < n; i++) {
	tmp = StrCmp2(x[i], x[i - 1]);
	if (tmp < 0) {
	    gsngrp[flip] = old;
	    return (0);
	}
	if (tmp == 0)
	    tt++;
	else {
	    push(tt);
	    tt = 1;
	}
    }
    push(tt);
    // exactly as expected in 'order', possibly with ties
    return (1);
}

static void isort(int *x, int *o, int n)
{
    if (n <= 2) {
	// nalast = 0 and n == 2 (check bottom of this file for explanation)
	if (nalast == 0 && n == 2) {
	    if (o[0] == -1) {
		o[0] = 1;
		o[1] = 2;
	    }
	    for (int i = 0; i < n; i++)
		if (x[i] == NA_INTEGER)
		    o[i] = 0;
	    push(1); push(1);
	    return;
	} else Error("Internal error: isort received n=%d. isorted should have dealt with this (e.g. as a reverse sorted vector) already",n);
    }
    if (n < N_SMALL && o[0] != -1 && nalast != 0) {
        // see comment above in iradix_r on N_SMALL=200.
        /* if not o[0] then can't just populate with 1:n here, since x
           is changed by ref too (so would need to be copied). */
        /* pushes inside too. Changes x and o by reference, so not
           suitable in first arg when o hasn't been populated yet
           and x is an actual argument (hence check on o[0]). */
        if (order != 1 || nalast != -1)
            // so that default case, i.e., order=1, nalast=FALSE will
            // not be affected (ex: `setkey`)
            for (int i = 0; i < n; i++)
                x[i] = icheck(x[i]);
        iinsert(x, o, n);
    } else {
        /* Tighter range (e.g. copes better with a few abnormally large
           values in some groups), but also, when setRange was once at
           arg level that caused an extra scan of (long) x
           first. 10,000 calls to setRange takes just 0.04s
           i.e. negligible. */
        setRange(x, n);
        if (range == NA_INTEGER)
            Error("Internal error: isort passed all-NA. isorted should have caught this before this point");
        int *target = (o[0] != -1) ? newo : o;
        // was range < 10000 for subgroups, but 1e5 for the first
        // arg, tried to generalise here.  1e4 rather than 1e5 here
        // because iterated was (thisgrpn < 200 || range > 20000) then
        // radix a short vector with large range can bite icount when
        // iterated (BLOCK 4 and 6)
        if (range <= N_RANGE && range <= n) {
            icount(x, target, n);
        } else {
            iradix(x, target, n);
        }
    }
}

static void dsort(double *x, int *o, int n)
{
    if (n <= 2) {
	if (nalast == 0 && n == 2) {
	    // don't have to twiddle here.. at least one will be NA
	    // and 'n' WILL BE 2.
	    if (o[0] == -1) {
		o[0] = 1;
		o[1] = 2;
	    }
	    for (int i = 0; i < n; i++)
		if (is_nan(x, i))
		    o[i] = 0;
	    push(1); push(1);
	    return;
	}
	Error("Internal error: dsort received n=%d. dsorted should have dealt with this (e.g. as a reverse sorted vector) already",n);
    }
    if (n < N_SMALL && o[0] != -1 && nalast != 0) {
	// see comment above in iradix_r re N_SMALL=200,  and isort for o[0]
	for (int i = 0; i < n; i++)
	    ((unsigned long long *)x)[i] = twiddle(x, i, order);
	// have to twiddle here anyways, can't speed up default case
	// like in isort
	dinsert((unsigned long long *)x, o, n);
    } else {
	dradix((unsigned char *) x, (o[0] != -1) ? newo : o, n);
    }
}

attribute_hidden SEXP do_radixsort(SEXP call, SEXP op, SEXP args, SEXP rho)
{
    int n = -1, narg = 0, ngrp, tmp, *osub, thisgrpn;
    R_xlen_t nl = n;
    Rboolean isSorted = TRUE, retGrp;
    void *xd;
    int *o = NULL;

    /* ML: FIXME: Here are just two of the dangerous assumptions here */
    if (sizeof(int) != 4) {
        error("radix sort assumes sizeof(int) == 4");
    }
    if (sizeof(double) != 8) {
        error("radix sort assumes sizeof(double) == 8");
    }
    
    nalast = (asLogical(CAR(args)) == NA_LOGICAL) ? 0 :
	(asLogical(CAR(args)) == TRUE) ? 1 : -1; // 1=TRUE, -1=FALSE, 0=NA
    args = CDR(args);
    SEXP decreasing = CAR(args);
    args = CDR(args);

    /* If TRUE, return starts of runs of identical values + max group size. */
    retGrp = asLogical(CAR(args));
    args = CDR(args);

    /* If FALSE, get order of strings in appearance order. Essentially
       abuses the CHARSXP table to group strings without hashing
       them. Only makes sense when retGrp=TRUE.
    */
    sortStr = asLogical(CAR(args));
    args = CDR(args);

    /* When grouping, we round off doubles to account for imprecision */
    setNumericRounding(retGrp ? 2 : 0);

    if (args == R_NilValue)
	return R_NilValue;
    if (isVector(CAR(args)))
	nl = XLENGTH(CAR(args));
    for (SEXP ap = args; ap != R_NilValue; ap = CDR(ap), narg++) {
	if (!isVector(CAR(ap)))
	    error(_("argument %d is not a vector"), narg + 1);
        //Rprintf("%d, %d\n", XLENGTH(CAR(ap)), nl);
	if (XLENGTH(CAR(ap)) != nl)
	    error(_("argument lengths differ"));
    }

    if (narg != length(decreasing))
	error(_("length(decreasing) must match the number of order arguments"));
    for (int i = 0; i < narg; i++) {
	if (LOGICAL(decreasing)[i] == NA_LOGICAL)
	    error(_("'decreasing' elements must be TRUE or FALSE"));
    }
    order = asLogical(decreasing) ? -1 : 1;

    SEXP x = CAR(args);
    args = CDR(args);

    // (ML) FIXME: need to support long vectors
    if (nl > INT_MAX) {
	error(_("long vectors not supported"));
    }
    n = (int) nl;

    // upper limit for stack size (all size 1 groups). We'll detect
    // and avoid that limit, but if just one non-1 group (say 2), that
    // can't be avoided.
    gsmaxalloc = n;

    // once for the result, needs to be length n.

    // TO DO: save allocation if NULL is returned (isSorted = =TRUE) so
    // [i|c|d]sort know they can populate o directly with no working
    // memory needed to reorder existing order had to repace this from
    // '0' to '-1' because 'nalast = 0' replace 'o[.]' with 0 values.

    SEXP ans = PROTECT(allocVector(INTSXP, n));
    o = INTEGER(ans);
    if (n > 0)
	o[0] = -1;
    xd = DATAPTR(x);

    stackgrps = narg > 1 || retGrp;

    if (TYPEOF(x) == STRSXP) {
        checkEncodings(x);
    }
    
    savetl_init();   // from now on use Error not error.

    switch (TYPEOF(x)) {
    case INTSXP:
    case LGLSXP:
	tmp = isorted(xd, n);
	break;
    case REALSXP :
	twiddle = &dtwiddle;
	is_nan  = &dnan;
	tmp = dsorted(xd, n);
	break;
    case STRSXP :
	tmp = csorted(xd, n);
	break;
    default :
        Error("First arg is type '%s', not yet supported",
              R_typeToChar(x));
    }
    if (tmp) {
	// -1 or 1. NEW: or -2 in case of nalast == 0 and all NAs
	if (tmp == 1) {
	    // same as expected in 'order' (1 = increasing, -1 = decreasing)
	    isSorted = TRUE;
	    for (int i = 0; i < n; i++)
		o[i] = i + 1;
	} else if (tmp == -1) {
	    // -1 (or -n for result of strcmp), strictly opposite to
	    // -expected 'order'
	    isSorted = FALSE;
	    for (int i = 0; i < n; i++)
		o[i] = n - i;
	} else if (nalast == 0 && tmp == -2) {
	    // happens only when nalast=NA/0. Means all NAs, replace
	    // with 0's therefore!
	    isSorted = FALSE;
	    for (int i = 0; i < n; i++)
		o[i] = 0;
	}
    } else {
	isSorted = FALSE;
	switch (TYPEOF(x)) {
	case INTSXP:
	case LGLSXP:
	    isort(xd, o, n);
	    break;
	case REALSXP :
	    dsort(xd, o, n);
	    break;
	case STRSXP :
	    if (sortStr) {
		csort_pre(xd, n);
		alloc_csort_otmp(n);
		csort(xd, o, n);
	    } else
		cgroup(xd, o, n);
	    break;
	default:
	    Error
		("Internal error: previous default should have caught unsupported type");
	}
    }
    
    int maxgrpn = gsmax[flip];   // biggest group in the first arg
    void *xsub = NULL;           // local
// This was not valid C23, and clang 15 warns it was not valid C99 either.
//    int (*f) ();  // called with fn pointer, int
//    void (*g) (); // called with fn pointer, int *, int
    int fgtype;
    
    if (narg > 1 && gsngrp[flip] < n) {
        // double is the largest type, 8
        xsub = (void *) malloc(maxgrpn * sizeof(double));
        if (xsub == NULL)
            Error("Couldn't allocate xsub in do_radixsort, requested %d * %d bytes.",
                  maxgrpn, (int)sizeof(double));
        // global variable, used by isort, dsort, sort and cgroup
        newo = (int *) malloc(maxgrpn * sizeof(int));
        if (newo == NULL)
            Error("Couldn't allocate newo in do_radixsort, requested %d * %d bytes.",
                  maxgrpn, (int)sizeof(int));
    }

    for (int col = 2; col <= narg; col++) {
	x = CAR(args);
	args = CDR(args);
	xd = DATAPTR(x);
	ngrp = gsngrp[flip];
	if (ngrp == n && nalast != 0)
	    break;
	flipflop();
	stackgrps = col != narg || retGrp;
	order = LOGICAL(decreasing)[col - 1] ? -1 : 1;
	switch (TYPEOF(x)) {
	case INTSXP:
	case LGLSXP:
//	    f = &isorted;
//	    g = &isort;
	    fgtype = 1;
	    break;
	case REALSXP:
	    twiddle = &dtwiddle;
	    is_nan = &dnan;
	    fgtype = 2;
//	    f = &dsorted;
//	    g = &dsort;
	    break;
	case STRSXP:
	    fgtype = 3;
//	    f = &csorted;
	    if (sortStr) {
		csort_pre(xd, n);
		alloc_csort_otmp(gsmax[1 - flip]);
//		g = &csort;
	    }
	    // no increasing/decreasing order required if sortStr = FALSE,
	    // just a dummy argument
	    else {
		fgtype = 4;
//		g = &cgroup;
	    }
	    break;
	default:
	    Error("Arg %d is type '%s', not yet supported",
		  col, R_typeToChar(x));
	}
	int i = 0;
	for (int grp = 0; grp < ngrp; grp++) {
	    thisgrpn = gs[1 - flip][grp];
	    if (thisgrpn == 1) {
		if (nalast == 0) {
		    // this edge case had to be taken care of
		    // here.. (see the bottom of this file for
		    // more explanation)
		    switch (TYPEOF(x)) {
		    case INTSXP:
			if (INTEGER(x)[o[i] - 1] == NA_INTEGER) {
			    isSorted = FALSE;
			    o[i] = 0;
			}
			break;
		    case LGLSXP:
			if (LOGICAL(x)[o[i] - 1] == NA_LOGICAL) {
			    isSorted = FALSE;
			    o[i] = 0;
			}
			break;
		    case REALSXP:
			if (ISNAN(REAL(x)[o[i] - 1])) {
			    isSorted = FALSE;
			    o[i] = 0;
			}
			break;
		    case STRSXP:
			if (STRING_ELT(x, o[i] - 1) == NA_STRING) {
			    isSorted = FALSE;
			    o[i] = 0;
                        } break;
                    default :
                        Error("Internal error: previous default should have caught unsupported type");
                    }
                }
                i++;
                push(1);
                continue;
            }
            osub = o+i;
            // ** TO DO **: if isSorted, we can just point xsub
            //        into x directly. If (*f)() returns 0,
            //        though, will have to copy x at that point
            //        When doing this, xsub could be allocated at
            //        that point for the first time.
            if (TYPEOF(x) == STRSXP)
                for (int j = 0; j < thisgrpn; j++)
                    ((SEXP *) xsub)[j] = ((SEXP *) xd)[o[i++] - 1];
            else if (TYPEOF(x) == REALSXP)
                for (int j = 0; j < thisgrpn; j++)
                    ((double *) xsub)[j] = ((double *) xd)[o[i++] - 1];
            else
                for (int j = 0; j < thisgrpn; j++)
                    ((int *) xsub)[j] = ((int *) xd)[o[i++] - 1];
                
            // continue; // BASELINE short circuit timing
            // point. Up to here is the cost of creating xsub.
            // [i|d|c]sorted(); very low cost, sequential
//            tmp = (*f)(xsub, thisgrpn);
	    switch(fgtype) {
	    case 1:
		tmp = isorted(xsub, thisgrpn);
		break;
	    case 2:
		tmp = dsorted(xsub, thisgrpn);
		break;
	    case 3:
	    case 4:
		tmp = csorted(xsub, thisgrpn);
	    }
            if (tmp) {
                // *sorted will have already push()'d the groups
                if (tmp == -1) {
		    isSorted = FALSE;
		    for (int k = 0; k < thisgrpn / 2; k++) {
			// reverse the order in-place using no
			// function call or working memory
			// isorted only returns -1 for
			// _strictly_ decreasing order,
			// otherwise ties wouldn't be stable
			tmp = osub[k];
			osub[k] = osub[thisgrpn - 1 - k];
			osub[thisgrpn - 1 - k] = tmp;
		    }
		} else if (nalast == 0 && tmp == -2) {
		    // all NAs, replace osub[.] with 0s.
		    isSorted = FALSE;
		    for (int k = 0; k < thisgrpn; k++) osub[k] = 0;
		}
		continue;
	    }
	    isSorted = FALSE;
	    // nalast=NA will result in newo[0] = 0. So had to change to -1.
	    newo[0] = -1;
	    // may update osub directly, or if not will put the
	    // result in global newo
//	    (*g)(xsub, osub, thisgrpn);
	    switch(fgtype) {
	    case 1: isort(xsub, osub, thisgrpn); break;
	    case 2: dsort(xsub, osub, thisgrpn); break;
	    case 3: csort(xsub, osub, thisgrpn); break;
	    case 4: cgroup(xsub, osub, thisgrpn); break;
	    }
	    if (newo[0] != -1) {
		if (nalast != 0)
		    for (int j = 0; j < thisgrpn; j++)
			// reuse xsub to reorder osub
			((int *) xsub)[j] = osub[newo[j] - 1];
		else
		    for (int j = 0; j < thisgrpn; j++)
			// final nalast case to handle!
			((int *) xsub)[j] = (newo[j] == 0) ? 0 :
			    osub[newo[j] - 1];
		memcpy(osub, xsub, thisgrpn * sizeof(int));
	    }
	}
    }

    if (!sortStr && ustr_n != 0)
        Error("Internal error: at the end of do_radixsort sortStr == FALSE but ustr_n !=0 [%d]",
              ustr_n);
    for(int i = 0; i < ustr_n; i++)
        SET_TRLEN(ustr[i], 0);
    maxlen = 1;  // reset global. Minimum needed to count "" and NA
    ustr_n = 0;
    savetl_end();
    free(ustr);
    ustr = NULL;
    ustr_alloc = 0;

    if (retGrp) {
        int maxgrpn = NA_INTEGER;
        ngrp = gsngrp[flip];
        SEXP s_ends = install("ends");
        setAttrib(ans, s_ends, x = allocVector(INTSXP, ngrp));
        if (ngrp > 0) {
            INTEGER(x)[0] = gs[flip][0];
            for (int i = 1; i < ngrp; i++)
                INTEGER(x)[i] = INTEGER(x)[i - 1] + gs[flip][i];
            maxgrpn = gsmax[flip];
        }
        SEXP s_maxgrpn = install("maxgrpn");
        setAttrib(ans, s_maxgrpn, ScalarInteger(maxgrpn));
        SEXP nms;
        PROTECT(nms = allocVector(STRSXP, 2));
        SET_STRING_ELT(nms, 0, mkChar("grouping"));
        SET_STRING_ELT(nms, 1, mkChar("integer"));
        setAttrib(ans, R_ClassSymbol, nms);
        UNPROTECT(1);
    }

    Rboolean dropZeros = !retGrp && !isSorted && nalast == 0;
    if (dropZeros) {
        int zeros = 0;
        for (int i = 0; i < n; i++) {
            if (o[i] == 0)
                zeros++;
        }
        if (zeros > 0) {
            PROTECT(ans = allocVector(INTSXP, n - zeros));
            int *o2 = INTEGER(ans);
            for (int i = 0, i2 = 0; i < n; i++) {
                if (o[i] > 0)
                    o2[i2++] = o[i];
            }
            UNPROTECT(1);
        }
    }
    
    gsfree();
    free(radix_xsub);          radix_xsub=NULL;    radix_xsuballoc=0;
    free(xsub); free(newo);    xsub=newo=NULL;
    free(xtmp);                xtmp=NULL;          xtmp_alloc=0;
    free(otmp);                otmp=NULL;          otmp_alloc=0;
    free(csort_otmp);          csort_otmp=NULL;    csort_otmp_alloc=0;

    free(cradix_counts);       cradix_counts=NULL; cradix_counts_alloc=0;
    free(cradix_xtmp);         cradix_xtmp=NULL;   cradix_xtmp_alloc=0;
    // TO DO: use xtmp already got

    UNPROTECT(1);
    return ans;
}
