/*  vcfview.c -- VCF/BCF conversion, view, subset and filter VCF/BCF files.

    Copyright (C) 2013-2023 Genome Research Ltd.

    Author: Shane McCarthy <sm15@sanger.ac.uk>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.  */

#include <stdio.h>
#include <strings.h>
#include <unistd.h>
#include <getopt.h>
#include <ctype.h>
#include <string.h>
#include <errno.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <math.h>
#include <inttypes.h>
#include <htslib/vcf.h>
#include <htslib/synced_bcf_reader.h>
#include <htslib/vcfutils.h>
#include "bcftools.h"
#include "filter.h"
#include "htslib/khash_str2int.h"

#define FLT_INCLUDE 1
#define FLT_EXCLUDE 2

#define ALLELE_NONREF 1
#define ALLELE_MINOR 2
#define ALLELE_ALT1 3
#define ALLELE_MAJOR 4
#define ALLELE_NONMAJOR 5

#define GT_NEED_HOM 1
#define GT_NEED_HET 2
#define GT_NO_HOM   3
#define GT_NO_HET   4
#define GT_NEED_MISSING 5
#define GT_NO_MISSING 6

typedef struct _args_t
{
    filter_t *filter;
    char *filter_str;
    int filter_logic;   // one of FLT_INCLUDE/FLT_EXCLUDE (-i or -e)

    bcf_srs_t *files;
    bcf_hdr_t *hdr, *hnull, *hsub; // original header, sites-only header, subset header
    char **argv, *format, *sample_names, *subset_fname, *targets_list, *regions_list;
    int regions_overlap, targets_overlap;
    int argc, clevel, n_threads, output_type, print_header, update_info, header_only, n_samples, *imap, calc_ac;
    int trim_alts, sites_only, known, novel, min_alleles, max_alleles, private_vars, uncalled, phased;
    int min_ac, min_ac_type, max_ac, max_ac_type, min_af_type, max_af_type, gt_type;
    int *ac, mac;
    float min_af, max_af;
    char *fn_ref, *fn_out, **samples;
    int sample_is_file, force_samples;
    char *include_types, *exclude_types;
    int include, exclude;
    int record_cmd_line;
    char *index_fn;
    int write_index;
    htsFile *out;
}
args_t;

static void init_data(args_t *args)
{
    int i;
    args->hdr = args->files->readers[0].header;

    if (args->calc_ac && args->update_info)
    {
        if (bcf_hdr_append(args->hdr,"##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele count in genotypes\">") < 0)
            error_errno("[%s] Failed to add \"AC\" INFO header", __func__);
        if (bcf_hdr_append(args->hdr,"##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">") < 0)
            error_errno("[%s] Failed to add \"AN\" INFO header", __func__);
    }
    if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_view");
    else if (bcf_hdr_sync(args->hdr) < 0)
        error_errno("[%s] Failed to update header", __func__);

    // setup sample data
    if (args->sample_names)
    {
        void *hdr_samples = khash_str2int_init();
        for (i=0; i<bcf_hdr_nsamples(args->hdr); i++)
            khash_str2int_inc(hdr_samples, bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,i));

        void *exclude = (args->sample_names[0]=='^') ? khash_str2int_init() : NULL;
        int nsmpl;
        char **smpl = NULL;
        args->samples = NULL; args->n_samples = 0;
        smpl = hts_readlist(exclude ? &args->sample_names[1] : args->sample_names, args->sample_is_file, &nsmpl);
        if ( !smpl )
        {
            error("Could not read the list: \"%s\"\n", exclude ? &args->sample_names[1] : args->sample_names);
        }

        if ( exclude )
        {
            for (i=0; i<nsmpl; i++) {
                if (!khash_str2int_has_key(hdr_samples,smpl[i])) {
                    if (args->force_samples) {
                        fprintf(stderr, "Warn: exclude called for sample that does not exist in header: \"%s\"... skipping\n", smpl[i]);
                    } else {
                        error("Error: exclude called for sample that does not exist in header: \"%s\". Use \"--force-samples\" to ignore this error.\n", smpl[i]);
                    }
                }
                khash_str2int_inc(exclude, smpl[i]);
            }

            for (i=0; i<bcf_hdr_nsamples(args->hdr); i++)
            {
                if ( exclude && khash_str2int_has_key(exclude,bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,i))  ) continue;
                args->samples = (char**) realloc(args->samples, (args->n_samples+1)*sizeof(const char*));
                args->samples[args->n_samples++] = strdup(bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,i));
            }
            khash_str2int_destroy(exclude);
        }
        else
        {
            for (i=0; i<nsmpl; i++) {
                if (!khash_str2int_has_key(hdr_samples,smpl[i])) {
                    if (args->force_samples) {
                        fprintf(stderr, "Warn: subset called for sample that does not exist in header: \"%s\"... skipping\n", smpl[i]);
                        continue;
                    } else {
                        error("Error: subset called for sample that does not exist in header: \"%s\". Use \"--force-samples\" to ignore this error.\n", smpl[i]);
                    }
                }
                args->samples = (char**) realloc(args->samples, (args->n_samples+1)*sizeof(const char*));
                args->samples[args->n_samples++] = strdup(smpl[i]);
            }
        }
        for (i=0; i<nsmpl; i++) free(smpl[i]);
        free(smpl);
        khash_str2int_destroy(hdr_samples);
        if (args->n_samples == 0) {
            fprintf(stderr, "Warn: subsetting has removed all samples\n");
            args->sites_only = 1;
        }
    }

    if (args->n_samples)
        args->imap = (int*)malloc(args->n_samples * sizeof(int));

    // determine variant types to include/exclude
    if (args->include_types || args->exclude_types) {
        if (args->include_types && args->exclude_types) {
            fprintf(stderr, "Error: only supply one of --include-types, --exclude-types options\n");
            exit(1);
        }
        char **type_list = 0;
        int m = 0, n = 0;
        const char *q, *p;
        for (q = p = args->include_types ? args->include_types : args->exclude_types;; ++p) {
            if (*p == ',' || *p == 0) {
                if (m == n) {
                    m = m? m<<1 : 16;
                    type_list = (char**)realloc(type_list, m * sizeof(char*));
                }
                type_list[n] = (char*)calloc(p - q + 1, 1);
                strncpy(type_list[n++], q, p - q);
                q = p + 1;
                if (*p == 0) break;
            }
        }
        type_list = (char**)realloc(type_list, n * sizeof(char*));

        if (args->include_types) {
            args->include = 0;
            for (i = 0; i < n; ++i) {
                if (strcmp(type_list[i], "snps") == 0) args->include |= VCF_SNP<<1;
                else if (strcmp(type_list[i], "indels") == 0) args->include |= VCF_INDEL<<1;
                else if (strcmp(type_list[i], "mnps") == 0) args->include |= VCF_MNP<<1;
                else if (strcmp(type_list[i], "other") == 0) args->include |= VCF_OTHER<<1;
                else if (strcmp(type_list[i], "ref") == 0) args->include |= VCF_OTHER<<1;
                else if (strcmp(type_list[i], "bnd") == 0) args->include |= VCF_BND<<1;
                else {
                    fprintf(stderr, "[E::%s] unknown type\n", type_list[i]);
                    fprintf(stderr, "Accepted types are snps, indels, mnps, other\n");
                    exit(1);
                }
            }
        }
        if (args->exclude_types) {
            args->exclude = 0;
            for (i = 0; i < n; ++i) {
                if (strcmp(type_list[i], "snps") == 0) args->exclude |= VCF_SNP<<1;
                else if (strcmp(type_list[i], "indels") == 0) args->exclude |= VCF_INDEL<<1;
                else if (strcmp(type_list[i], "mnps") == 0) args->exclude |= VCF_MNP<<1;
                else if (strcmp(type_list[i], "other") == 0) args->exclude |= VCF_OTHER<<1;
                else if (strcmp(type_list[i], "ref") == 0) args->exclude |= VCF_OTHER<<1;
                else if (strcmp(type_list[i], "bnd") == 0) args->exclude |= VCF_BND<<1;
                else {
                    fprintf(stderr, "[E::%s] unknown type\n", type_list[i]);
                    fprintf(stderr, "Accepted types are snps, indels, mnps, other\n");
                    exit(1);
                }
            }
        }
        for (i = 0; i < n; ++i)
            free(type_list[i]);
        free(type_list);
    }

    char wmode[8];
    set_wmode(wmode,args->output_type,args->fn_out,args->clevel);
    args->out = hts_open(args->fn_out ? args->fn_out : "-", wmode);
    if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno));
    if ( args->n_threads > 0)
        hts_set_opt(args->out, HTS_OPT_THREAD_POOL, args->files->p);

    // headers: hdr=full header, hsub=subset header, hnull=sites only header
    if (args->sites_only){
        args->hnull = bcf_hdr_subset(args->hdr, 0, 0, 0);
        bcf_hdr_remove(args->hnull, BCF_HL_FMT, NULL);
    }
    if (args->n_samples > 0)
    {
        args->hsub = bcf_hdr_subset(args->hdr, args->n_samples, args->samples, args->imap);
        if ( !args->hsub ) error("Error occurred while subsetting samples\n");
        if ( args->n_samples != bcf_hdr_nsamples(args->hsub) )
        {
            int i;
            for (i=0; i<args->n_samples; i++)
                if ( args->imap[i]<0 ) error("Error: No such sample: \"%s\"\n", args->samples[i]);
        }
    }

    if ( args->filter_str )
        args->filter = filter_init(args->hdr, args->filter_str);
}

static void destroy_data(args_t *args)
{
    int i;
    if ( args->imap ) {
        for (i = 0; i < args->n_samples; ++i)
            free(args->samples[i]);
        free(args->samples);
        free(args->imap);
    }
    if (args->hnull) bcf_hdr_destroy(args->hnull);
    if (args->hsub) bcf_hdr_destroy(args->hsub);
    if ( args->filter )
        filter_destroy(args->filter);
    free(args->ac);
}

// true if all samples are phased.
// haploid genotypes are considered phased
// ./. => not phased, .|. => phased
int bcf_all_phased(const bcf_hdr_t *header, bcf1_t *line)
{
    bcf_unpack(line, BCF_UN_FMT);
    bcf_fmt_t *fmt_ptr = bcf_get_fmt(header, line, "GT");
    int all_phased = 1;
    if ( fmt_ptr )
    {
        int i, isample;
        for (isample=0; isample<line->n_sample; isample++)
        {
            int sample_phased = 0;
            #define BRANCH_INT(type_t,vector_end) { \
                type_t *p = (type_t*) (fmt_ptr->p + isample*fmt_ptr->size); \
                for (i=0; i<fmt_ptr->n; i++) \
                { \
                    if (fmt_ptr->n == 1 || (p[i] == vector_end && i == 1)) { sample_phased = 1; break; } /* haploid phased by definition */ \
                    if ( p[i] == vector_end ) { break; }; /* smaller ploidy */ \
                    if ( bcf_gt_is_missing(p[i]) ) continue; /* missing allele */ \
                    if ((p[i])&1) { \
                        sample_phased = 1; \
                        break; \
                    } \
                } \
            }
            switch (fmt_ptr->type) {
                case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_vector_end); break;
                case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break;
                case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break;
                default: fprintf(stderr, "[E::%s] todo: fmt_type %d\n", __func__, fmt_ptr->type); exit(1); break;
            }
            #undef BRANCH_INT
            if (!sample_phased) {
                all_phased = 0;
                break;
            }
        }
    }
    return all_phased;
}

int subset_vcf(args_t *args, bcf1_t *line)
{
    if ( args->min_alleles && line->n_allele < args->min_alleles ) return 0; // min alleles
    if ( args->max_alleles && line->n_allele > args->max_alleles ) return 0; // max alleles
    if (args->novel || args->known)
    {
        if ( args->novel && (line->d.id[0]!='.' || line->d.id[1]!=0) ) return 0; // skip sites which are known, ID != '.'
        if ( args->known && line->d.id[0]=='.' && line->d.id[1]==0 ) return 0;  // skip sites which are novel, ID == '.'
    }

    if (args->include || args->exclude)
    {
        int line_type = bcf_get_variant_types(line);
        if ( args->include && !((line_type<<1) & args->include) ) return 0; // include only given variant types
        if ( args->exclude &&   (line_type<<1) & args->exclude  ) return 0; // exclude given variant types
    }

    if ( args->filter )
    {
        int ret = filter_test(args->filter, line, NULL);
        if ( args->filter_logic==FLT_INCLUDE ) { if ( !ret ) return 0; }
        else if ( ret ) return 0;
    }

    hts_expand(int, line->n_allele, args->mac, args->ac);
    int i, an = 0, non_ref_ac = 0;
    if (args->calc_ac) {
        bcf_calc_ac(args->hdr, line, args->ac, BCF_UN_INFO|BCF_UN_FMT); // get original AC and AN values from INFO field if available, otherwise calculate
        for (i=1; i<line->n_allele; i++)
            non_ref_ac += args->ac[i];
        for (i=0; i<line->n_allele; i++)
            an += args->ac[i];
    }

    int update_ac = args->calc_ac;
    if (args->n_samples)
    {
        int non_ref_ac_sub = 0, *ac_sub = (int*) calloc(line->n_allele,sizeof(int));
        bcf_subset(args->hdr, line, args->n_samples, args->imap);
        if ( args->calc_ac && !bcf_get_fmt(args->hdr,line,"GT") ) update_ac = 0;
        if ( update_ac )
        {
            bcf_calc_ac(args->hsub, line, ac_sub, BCF_UN_FMT); // recalculate AC and AN
            an = 0;
            for (i=0; i<line->n_allele; i++) {
                args->ac[i] = ac_sub[i];
                an += ac_sub[i];
            }
            for (i=1; i<line->n_allele; i++)
                non_ref_ac_sub += ac_sub[i];
            if (args->private_vars) {
                if (args->private_vars == FLT_INCLUDE && !(non_ref_ac_sub > 0 && non_ref_ac == non_ref_ac_sub)) { free(ac_sub); return 0; } // select private sites
                if (args->private_vars == FLT_EXCLUDE && non_ref_ac_sub > 0 && non_ref_ac == non_ref_ac_sub) { free(ac_sub); return 0; } // exclude private sites
            }
            non_ref_ac = non_ref_ac_sub;
        }
        free(ac_sub);
    }

    bcf_fmt_t *gt_fmt;
    if ( args->gt_type && (gt_fmt=bcf_get_fmt(args->hdr,line,"GT")) )
    {
        int nhet = 0, nhom = 0, nmiss = 0;
        for (i=0; i<bcf_hdr_nsamples(args->hdr); i++)
        {
            int type = bcf_gt_type(gt_fmt,i,NULL,NULL);
            if ( type==GT_HET_RA || type==GT_HET_AA )
            {
                if ( args->gt_type==GT_NO_HET ) return 0;
                nhet = 1;
            }
            else if ( type==GT_UNKN )
            {
                if ( args->gt_type==GT_NO_MISSING ) return 0;
                nmiss = 1;
            }
            else
            {
                if ( args->gt_type==GT_NO_HOM ) return 0;
                nhom = 1;
            }
        }
        if ( args->gt_type==GT_NEED_HOM && !nhom ) return 0;
        else if ( args->gt_type==GT_NEED_HET && !nhet ) return 0;
        else if ( args->gt_type==GT_NEED_MISSING && !nmiss ) return 0;
    }

    int minor_ac = 0;
    int major_ac = 0;
    if ( args->calc_ac )
    {
        minor_ac = args->ac[0];
        major_ac = args->ac[0];
        for (i=1; i<line->n_allele; i++){
            if (args->ac[i] < minor_ac) { minor_ac = args->ac[i]; }
            if (args->ac[i] > major_ac) { major_ac = args->ac[i]; }
        }
    }

    if (args->min_ac!=-1)
    {
        if (args->min_ac_type == ALLELE_NONREF && args->min_ac>non_ref_ac) return 0; // min AC
        else if (args->min_ac_type == ALLELE_MINOR && args->min_ac>minor_ac) return 0; // min minor AC
        else if (args->min_ac_type == ALLELE_ALT1 && args->min_ac>args->ac[1]) return 0; // min 1st alternate AC
        else if (args->min_ac_type == ALLELE_MAJOR && args->min_ac > major_ac) return 0; // min major AC
        else if (args->min_ac_type == ALLELE_NONMAJOR && args->min_ac > an-major_ac) return 0; // min non-major AC
    }
    if (args->max_ac!=-1)
    {
        if (args->max_ac_type == ALLELE_NONREF && args->max_ac<non_ref_ac) return 0; // max AC
        else if (args->max_ac_type == ALLELE_MINOR && args->max_ac<minor_ac) return 0; // max minor AC
        else if (args->max_ac_type == ALLELE_ALT1 && args->max_ac<args->ac[1]) return 0; // max 1st alternate AC
        else if (args->max_ac_type == ALLELE_MAJOR && args->max_ac < major_ac) return 0; // max major AC
        else if (args->max_ac_type == ALLELE_NONMAJOR && args->max_ac < an-major_ac) return 0; // max non-major AC
    }
    if (args->min_af!=-1)
    {
        if (an == 0) return 0; // freq not defined, skip site
        if (args->min_af_type == ALLELE_NONREF && args->min_af>non_ref_ac/(double)an) return 0; // min AF
        else if (args->min_af_type == ALLELE_MINOR && args->min_af>minor_ac/(double)an) return 0; // min minor AF
        else if (args->min_af_type == ALLELE_ALT1 && args->min_af>args->ac[1]/(double)an) return 0; // min 1st alternate AF
        else if (args->min_af_type == ALLELE_MAJOR && args->min_af > major_ac/(double)an) return 0; // min major AF
        else if (args->min_af_type == ALLELE_NONMAJOR && args->min_af > (an-major_ac)/(double)an) return 0; // min non-major AF
    }
    if (args->max_af!=-1)
    {
        if (an == 0) return 0; // freq not defined, skip site
        if (args->max_af_type == ALLELE_NONREF && args->max_af<non_ref_ac/(double)an) return 0; // max AF
        else if (args->max_af_type == ALLELE_MINOR && args->max_af<minor_ac/(double)an) return 0; // max minor AF
        else if (args->max_af_type == ALLELE_ALT1 && args->max_af<args->ac[1]/(double)an) return 0; // max 1st alternate AF
        else if (args->max_af_type == ALLELE_MAJOR && args->max_af < major_ac/(double)an) return 0; // max major AF
        else if (args->max_af_type == ALLELE_NONMAJOR && args->max_af < (an-major_ac)/(double)an) return 0; // max non-major AF
    }
    if (args->uncalled) {
        if (args->uncalled == FLT_INCLUDE && an > 0) return 0; // select uncalled
        if (args->uncalled == FLT_EXCLUDE && an == 0) return 0; // skip if uncalled
    }
    if (update_ac && args->update_info) {
        bcf_update_info_int32(args->hdr, line, "AC", &args->ac[1], line->n_allele-1);
        bcf_update_info_int32(args->hdr, line, "AN", &an, 1);
    }
    if (args->trim_alts)
    {
        int ret = bcf_trim_alleles(args->hsub ? args->hsub : args->hdr, line);
        if ( ret<0 ) error("Error: Could not trim alleles at %s:%"PRId64"\n", bcf_seqname(args->hsub ? args->hsub : args->hdr, line), (int64_t) line->pos+1);
    }
    if (args->phased) {
        int phased = bcf_all_phased(args->hdr, line);
        if (args->phased == FLT_INCLUDE && !phased) { return 0; } // skip unphased
        if (args->phased == FLT_EXCLUDE && phased) { return 0; } // skip phased
    }
    if (args->sites_only) bcf_subset(args->hsub ? args->hsub : args->hdr, line, 0, 0);
    return 1;
}

void set_allele_type (int *atype, char *atype_string)
{
    *atype = ALLELE_NONREF;
    if (strcmp(atype_string, "minor") == 0) {
        *atype = ALLELE_MINOR;
    }
    else if (strcmp(atype_string, "alt1") == 0) {
        *atype = ALLELE_ALT1;
    }
    else if (strcmp(atype_string, "nref") == 0) {
        *atype = ALLELE_NONREF;
    }
    else if (strcmp(atype_string, "major") == 0) {
        *atype = ALLELE_MAJOR;
    }
    else if (strcmp(atype_string, "nonmajor") == 0) {
        *atype = ALLELE_NONMAJOR;
    }
    else {
        error("Error: allele type not recognised. Expected one of nref|alt1|minor|major|nonmajor, got \"%s\".\n", atype_string);
    }
}

static void usage(args_t *args)
{
    fprintf(stderr, "\n");
    fprintf(stderr, "About:   VCF/BCF conversion, view, subset and filter VCF/BCF files.\n");
    fprintf(stderr, "Usage:   bcftools view [options] <in.vcf.gz> [region1 [...]]\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "Output options:\n");
    fprintf(stderr, "    -G, --drop-genotypes              Drop individual genotype information (after subsetting if -s option set)\n");
    fprintf(stderr, "    -h, --header-only                 Print only the header in VCF output (equivalent to bcftools head)\n");
    fprintf(stderr, "    -H, --no-header                   Suppress the header in VCF output\n");
    fprintf(stderr, "        --with-header                 Print both header and records in VCF output [default]\n");
    fprintf(stderr, "    -l, --compression-level [0-9]     Compression level: 0 uncompressed, 1 best speed, 9 best compression [%d]\n", args->clevel);
    fprintf(stderr, "        --no-version                  Do not append version and command line to the header\n");
    fprintf(stderr, "    -o, --output FILE                 Output file name [stdout]\n");
    fprintf(stderr, "    -O, --output-type u|b|v|z[0-9]    u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
    fprintf(stderr, "    -r, --regions REGION              Restrict to comma-separated list of regions\n");
    fprintf(stderr, "    -R, --regions-file FILE           Restrict to regions listed in FILE\n");
    fprintf(stderr, "        --regions-overlap 0|1|2       Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
    fprintf(stderr, "    -t, --targets [^]REGION           Similar to -r but streams rather than index-jumps. Exclude regions with \"^\" prefix\n");
    fprintf(stderr, "    -T, --targets-file [^]FILE        Similar to -R but streams rather than index-jumps. Exclude regions with \"^\" prefix\n");
    fprintf(stderr, "        --targets-overlap 0|1|2       Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
    fprintf(stderr, "        --threads INT                 Use multithreading with INT worker threads [0]\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "Subset options:\n");
    fprintf(stderr, "    -a, --trim-alt-alleles            Trim ALT alleles not seen in the genotype fields (or their subset with -s/-S)\n");
    fprintf(stderr, "    -I, --no-update                   Do not (re)calculate INFO fields for the subset (currently INFO/AC and INFO/AN)\n");
    fprintf(stderr, "    -s, --samples [^]LIST             Comma separated list of samples to include (or exclude with \"^\" prefix). Be careful\n");
    fprintf(stderr, "                                        when combining filtering with sample subsetting as filtering comes (usually) first.\n");
    fprintf(stderr, "                                        If unsure, split sample subsetting and filtering in two commands, using -Ou when piping.\n");
    fprintf(stderr, "    -S, --samples-file [^]FILE        File of samples to include (or exclude with \"^\" prefix)\n");
    fprintf(stderr, "        --force-samples               Only warn about unknown subset samples\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "Filter options:\n");
    fprintf(stderr, "    -c/C, --min-ac/--max-ac INT[:TYPE]     Minimum/maximum count for non-reference (nref), 1st alternate (alt1), least frequent\n");
    fprintf(stderr, "                                               (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n");
    fprintf(stderr, "    -f,   --apply-filters LIST             Require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
    fprintf(stderr, "    -g,   --genotype [^]hom|het|miss       Require one or more hom/het/missing genotype or, if prefixed with \"^\", exclude such sites\n");
    fprintf(stderr, "    -i/e, --include/--exclude EXPR         Select/exclude sites for which the expression is true (see man page for details)\n");
    fprintf(stderr, "    -k/n, --known/--novel                  Select known/novel sites only (ID is not/is '.')\n");
    fprintf(stderr, "    -m/M, --min-alleles/--max-alleles INT  Minimum/maximum number of alleles listed in REF and ALT (e.g. -m2 -M2 for biallelic sites)\n");
    fprintf(stderr, "    -p/P, --phased/--exclude-phased        Select/exclude sites where all samples are phased\n");
    fprintf(stderr, "    -q/Q, --min-af/--max-af FLOAT[:TYPE]   Minimum/maximum frequency for non-reference (nref), 1st alternate (alt1), least frequent\n");
    fprintf(stderr, "                                               (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n");
    fprintf(stderr, "    -u/U, --uncalled/--exclude-uncalled    Select/exclude sites without a called genotype\n");
    fprintf(stderr, "    -v/V, --types/--exclude-types LIST     Select/exclude comma-separated list of variant types: snps,indels,mnps,ref,bnd,other [null]\n");
    fprintf(stderr, "    -x/X, --private/--exclude-private      Select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n");
    fprintf(stderr, "          --write-index                    Automatically index the output files [off]\n");
    fprintf(stderr, "\n");
    exit(1);
}

int main_vcfview(int argc, char *argv[])
{
    int c;
    args_t *args  = (args_t*) calloc(1,sizeof(args_t));
    args->argc    = argc; args->argv = argv;
    args->files   = bcf_sr_init();
    args->clevel  = -1;
    args->print_header = 1;
    args->update_info = 1;
    args->output_type = FT_VCF;
    args->n_threads = 0;
    args->record_cmd_line = 1;
    args->write_index = 0;
    args->min_ac = args->max_ac = args->min_af = args->max_af = -1;
    args->regions_overlap = 1;
    args->targets_overlap = 0;
    int targets_is_file = 0, regions_is_file = 0;

    static struct option loptions[] =
    {
        {"genotype",required_argument,NULL,'g'},
        {"compression-level",required_argument,NULL,'l'},
        {"threads",required_argument,NULL,9},
        {"header-only",no_argument,NULL,'h'},
        {"no-header",no_argument,NULL,'H'},
        {"with-header",no_argument,NULL,4},
        {"exclude",required_argument,NULL,'e'},
        {"include",required_argument,NULL,'i'},
        {"trim-alt-alleles",no_argument,NULL,'a'},
        {"no-update",no_argument,NULL,'I'},
        {"drop-genotypes",no_argument,NULL,'G'},
        {"private",no_argument,NULL,'x'},
        {"exclude-private",no_argument,NULL,'X'},
        {"uncalled",no_argument,NULL,'u'},
        {"exclude-uncalled",no_argument,NULL,'U'},
        {"apply-filters",required_argument,NULL,'f'},
        {"known",no_argument,NULL,'k'},
        {"novel",no_argument,NULL,'n'},
        {"min-alleles",required_argument,NULL,'m'},
        {"max-alleles",required_argument,NULL,'M'},
        {"samples",required_argument,NULL,'s'},
        {"samples-file",required_argument,NULL,'S'},
        {"force-samples",no_argument,NULL,1},
        {"output-type",required_argument,NULL,'O'},
        {"output-file",required_argument,NULL,'o'},
        {"output",required_argument,NULL,'o'},
        {"types",required_argument,NULL,'v'},
        {"exclude-types",required_argument,NULL,'V'},
        {"targets",required_argument,NULL,'t'},
        {"targets-file",required_argument,NULL,'T'},
        {"targets-overlap",required_argument,NULL,2},
        {"regions",required_argument,NULL,'r'},
        {"regions-file",required_argument,NULL,'R'},
        {"regions-overlap",required_argument,NULL,3},
        {"min-ac",required_argument,NULL,'c'},
        {"max-ac",required_argument,NULL,'C'},
        {"min-af",required_argument,NULL,'q'},
        {"max-af",required_argument,NULL,'Q'},
        {"phased",no_argument,NULL,'p'},
        {"exclude-phased",no_argument,NULL,'P'},
        {"no-version",no_argument,NULL,8},
        {"write-index",no_argument,NULL,10},
        {NULL,0,NULL,0}
    };
    char *tmp;
    while ((c = getopt_long(argc, argv, "l:t:T:r:R:o:O:s:S:Gf:knv:V:m:M:auUhHc:C:Ii:e:xXpPq:Q:g:",loptions,NULL)) >= 0)
    {
        char allele_type[9] = "nref";
        switch (c)
        {
            case 'O':
                switch (optarg[0]) {
                    case 'b': args->output_type = FT_BCF_GZ; break;
                    case 'u': args->output_type = FT_BCF; break;
                    case 'z': args->output_type = FT_VCF_GZ; break;
                    case 'v': args->output_type = FT_VCF; break;
                    default:
                    {
                        args->clevel = strtol(optarg,&tmp,10);
                        if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
                    }
                };
                if ( optarg[1] )
                {
                    args->clevel = strtol(optarg+1,&tmp,10);
                    if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
                }
                break;
            case 'l':
                args->clevel = strtol(optarg,&tmp,10);
                if ( *tmp ) error("Could not parse argument: --compression-level %s\n", optarg);
                args->output_type |= FT_GZ;
                break;
            case 'o': args->fn_out = optarg; break;
            case 'H': args->print_header = 0; break;
            case 'h': args->header_only = 1; break;
            case  4 : args->print_header = 1; args->header_only = 0; break;

            case 't': args->targets_list = optarg; break;
            case 'T': args->targets_list = optarg; targets_is_file = 1; break;
            case 'r': args->regions_list = optarg; break;
            case 'R': args->regions_list = optarg; regions_is_file = 1; break;

            case 's': args->sample_names = optarg; break;
            case 'S': args->sample_names = optarg; args->sample_is_file = 1; break;
            case  1 : args->force_samples = 1; break;
            case 'a': args->trim_alts = 1; args->calc_ac = 1; break;
            case 'I': args->update_info = 0; break;
            case 'G': args->sites_only = 1; break;

            case 'f': args->files->apply_filters = optarg; break;
            case 'k': args->known = 1; break;
            case 'n': args->novel = 1; break;
            case 'm':
                args->min_alleles = strtol(optarg,&tmp,10);
                if ( *tmp ) error("Could not parse argument: --min-alleles %s\n", optarg);
                break;
            case 'M':
                args->max_alleles = strtol(optarg,&tmp,10);
                if ( *tmp ) error("Could not parse argument: --max-alleles %s\n", optarg);
                break;
            case 'v': args->include_types = optarg; break;
            case 'V': args->exclude_types = optarg; break;
            case 'e':
                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
                args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
            case 'i':
                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
                args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
            case 'c':
            {
                args->min_ac_type = ALLELE_NONREF;
                if ( sscanf(optarg,"%d:%8s",&args->min_ac, allele_type)!=2 && sscanf(optarg,"%d",&args->min_ac)!=1 )
                    error("Error: Could not parse --min-ac %s\n", optarg);
                set_allele_type(&args->min_ac_type, allele_type);
                args->calc_ac = 1;
                break;
            }
            case 'C':
            {
                args->max_ac_type = ALLELE_NONREF;
                if ( sscanf(optarg,"%d:%8s",&args->max_ac, allele_type)!=2 && sscanf(optarg,"%d",&args->max_ac)!=1 )
                    error("Error: Could not parse --max-ac %s\n", optarg);
                set_allele_type(&args->max_ac_type, allele_type);
                args->calc_ac = 1;
                break;
            }
            case 'q':
            {
                args->min_af_type = ALLELE_NONREF;
                if ( sscanf(optarg,"%f:%8s",&args->min_af, allele_type)!=2 && sscanf(optarg,"%f",&args->min_af)!=1 )
                    error("Error: Could not parse --min-af %s\n", optarg);
                set_allele_type(&args->min_af_type, allele_type);
                args->calc_ac = 1;
                break;
            }
            case 'Q':
            {
                args->max_af_type = ALLELE_NONREF;
                if ( sscanf(optarg,"%f:%8s",&args->max_af, allele_type)!=2 && sscanf(optarg,"%f",&args->max_af)!=1 )
                    error("Error: Could not parse --max-af %s\n", optarg);
                set_allele_type(&args->max_af_type, allele_type);
                args->calc_ac = 1;
                break;
            }

            case 'x': args->private_vars |= FLT_INCLUDE; args->calc_ac = 1; break;
            case 'X': args->private_vars |= FLT_EXCLUDE; args->calc_ac = 1; break;
            case 'u': args->uncalled |= FLT_INCLUDE; args->calc_ac = 1; break;
            case 'U': args->uncalled |= FLT_EXCLUDE; args->calc_ac = 1; break;
            case 'p': args->phased |= FLT_INCLUDE; break; // phased
            case 'P': args->phased |= FLT_EXCLUDE; break; // exclude-phased
            case 'g':
            {
                if ( !strcasecmp(optarg,"hom") ) args->gt_type = GT_NEED_HOM;
                else if ( !strcasecmp(optarg,"het") ) args->gt_type = GT_NEED_HET;
                else if ( !strcasecmp(optarg,"miss") ) args->gt_type = GT_NEED_MISSING;
                else if ( !strcasecmp(optarg,"^hom") ) args->gt_type = GT_NO_HOM;
                else if ( !strcasecmp(optarg,"^het") ) args->gt_type = GT_NO_HET;
                else if ( !strcasecmp(optarg,"^miss") ) args->gt_type = GT_NO_MISSING;
                else error("The argument to -g not recognised. Expected one of hom/het/miss/^hom/^het/^miss, got \"%s\".\n", optarg);
                break;
            }
            case  2 :
                args->targets_overlap = parse_overlap_option(optarg);
                if ( args->targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg);
                break;
            case  3 :
                args->regions_overlap = parse_overlap_option(optarg);
                if ( args->regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg);
                break;
            case  9 : args->n_threads = strtol(optarg, 0, 0); break;
            case  8 : args->record_cmd_line = 0; break;
            case 10 : args->write_index = 1; break;
            case '?': usage(args); break;
            default: error("Unknown argument: %s\n", optarg);
        }
    }

    if ( args->filter_logic == (FLT_EXCLUDE|FLT_INCLUDE) ) error("Only one of -i or -e can be given.\n");
    if ( args->private_vars > FLT_EXCLUDE ) error("Only one of -x or -X can be given.\n");
    if ( args->uncalled > FLT_EXCLUDE ) error("Only one of -u or -U can be given.\n");
    if ( args->phased > FLT_EXCLUDE ) error("Only one of -p or -P can be given.\n");

    if ( args->sample_names && args->update_info) args->calc_ac = 1;

    char *fname = NULL;
    if ( optind>=argc )
    {
        if ( !isatty(fileno((FILE *)stdin)) ) fname = "-";  // reading from stdin
        else usage(args);
    }
    else fname = argv[optind];

    // read in the regions from the command line
    if ( args->regions_list )
    {
        bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,args->regions_overlap);
        if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
            error("Failed to read the regions: %s\n", args->regions_list);
    }
    else if ( optind+1 < argc )
    {
        int i;
        kstring_t tmp = {0,0,0};
        kputs(argv[optind+1],&tmp);
        for (i=optind+2; i<argc; i++) { kputc(',',&tmp); kputs(argv[i],&tmp); }
        if ( bcf_sr_set_regions(args->files, tmp.s, 0)<0 )
            error("Failed to read the regions: %s\n", tmp.s);
        free(tmp.s);
    }
    if ( args->targets_list )
    {
        bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,args->targets_overlap);
        if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 )
            error("Failed to read the targets: %s\n", args->targets_list);
    }

    if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n");
    if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum));

    init_data(args);
    bcf_hdr_t *out_hdr = args->hnull ? args->hnull : (args->hsub ? args->hsub : args->hdr);
    if (args->print_header)
    {
        if ( bcf_hdr_write(args->out, out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->fn_out);
    }
    else if ( args->output_type & FT_BCF )
        error("BCF output requires header, cannot proceed with -H\n");

    if ( args->write_index && init_index(args->out,out_hdr,args->fn_out,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->fn_out);

    int ret = 0;
    if (!args->header_only)
    {
        while ( bcf_sr_next_line(args->files) )
        {
            bcf1_t *line = args->files->readers[0].buffer[0];
            if ( line->errcode && out_hdr!=args->hdr ) error("Undefined tags in the header, cannot proceed in the sample subset mode.\n");
            if ( subset_vcf(args, line) && bcf_write1(args->out, out_hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->fn_out);
        }
        ret = args->files->errnum;
        if ( ret ) fprintf(stderr,"Error: %s\n", bcf_sr_strerror(args->files->errnum));
    }

    if (args->write_index)
    {
        if (bcf_idx_save(args->out) < 0)
        {
            if ( hts_close(args->out)!=0 ) error("Error: close failed %s\n", args->fn_out?args->fn_out:"stdout");
            error("Error: cannot write to index %s\n", args->index_fn);
        }
        free(args->index_fn);
    }

    if ( hts_close(args->out)!=0 ) error("Error: close failed %s\n", args->fn_out?args->fn_out:"stdout");
    destroy_data(args);
    bcf_sr_destroy(args->files);
    free(args);
    return ret;
}
