/*
	nrdb - quasi-nonredundant database generator

	Copyright (C) 1997 by Warren R. Gish.  All Rights Reserved.


	Usage:  nrdb [options] file1 [file2 [file3 [file4] ...]]

	Usage:  nrdb -p [options] file1 id1 [file2 id2 [file3 id3 [file4 id4] ...]]

	where each file# argument is the name of an input file in FASTA
	format; and each id# (identifier) argument is an arbitrary
	(possibly zero-length) character string to be prepended to each
	sequence name read from the corresponding input file.  By
	default, the nonredundant database is sent to standard output,
	but only after all input files have been read in the order specified.
	Statistics about the number of redundant sequence records and
	residues are reported to standard error at the end of a
	successful execution.

	See the call to the getopt() function and its associated switch-case
	statement below for a brief description of the available
	command line options.

	Example:
	
        nrdb -o outfile pir swissprot genpept gpupdate


	Updated:  May 1997 with compression for ambiguous DNA sequences
	Author:  W. Gish, NCBI, February 1991
*/
#include <ncbi.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <unistd.h>
#include <gish.h>
#include <gishlib.h>
#define EXTERN
#include "nrdb.h"


/* Number of input files named on the command line (computed in main). */
static int		nfiles;
/* One entry per input file, in command line order (filled in by main). */
NRFilePtr	*files;
/*
	-c#:  input files whose position on the command line is below this
	value have their names appended to an existing matching sequence.
*/
static int		maxcatfiles = INT_MAX;

/* NOTE(review): ss and sn are not referenced in this file; presumably
	they are used by other translation units -- confirm before removing. */
SeqStr	ss;
SeqName	sn;
/* -l#:  sequences shorter than this many residues are counted and skipped. */
size_t		minlen = 1;
/* -p:  when nonzero, each file argument is followed by an id prefix. */
int	prefix_flag = 0;

void	usage(void);


main(argc, argv)
	int		argc;
	char	**argv;
{
	struct rlimit	rl;
	int		i, j, c;
	SeqStrPtr	new, old;
	SeqNamePtr	snp;

	module = str_dup(misc_basename(argv[0], NULL));

	while ((c = getopt(argc, argv, "pa:o:h:l:c:C:d:n:iL")) != -1) {
		switch (c) {
			case 'p':
				prefix_flag = 1;
				break;
			case 'a': /* append output to the specified file */
				ofp = fopen(optarg, "a+");
				if (ofp == NULL)
					fatal(1, "Unable to open specified output file");
				break;
			case 'o': /* write output to the specified file (overwriting) */
				ofp = fopen(optarg, "w");
				if (ofp == NULL)
					fatal(1, "Unable to open specified output file");
				break;
			case 'h':
				fprintf(stderr, "\nThe -h option is no longer valid and will be ignored for now.\n");
				break;
			case 'l': /* minimum required sequence length */
				if (sscanf(optarg, "%u", &minlen) != 1)
					fatal(1, "Invalid -l# minimum length");
				break;
			case 'c':
				/* no. of input files for which definitions will be
					concatenated to any existing definition(s) */
				if (sscanf(optarg, "%d", &maxcatfiles) != 1 || maxcatfiles < 0)
					fatal(1, "Invalid -c# max. number of concatenating files");
				break;
			case 'n': /* max. no. of sequence names reported per sequence */
				if (sscanf(optarg, "%d", &desc_max) != 1 || desc_max < 0)
					fatal(1, "invalid -n#:  max. number of sequences names to report per sequence:  %s", optarg);
				break;
			case 'i': /* skip comparison of first database against itself */
				skip1 = TRUE;
				break;
			case 'C':
				if (chdir(optarg) == -1)
					fatal(1, "Couldn't change to directory:  %s", optarg);
				break;
			case 'L': /* lock data in memory */
#ifdef DATLOCK
				if (plock(DATLOCK) == 0)
					plocked = 1;
				else
					break;
#ifdef RLIMIT_RSS
				if (getrlimit(RLIMIT_RSS, &rl) != 0)
					break;
				rl.rlim_cur *= 0.67;
				setrlimit(RLIMIT_RSS, &rl);
#endif /* RLIMIT_RSS */
#else /* !DATLOCK */
				fprintf(stderr, "\nWARNING:  the -L option is ignored on this computing platform.\n");
#endif /* !DATLOCK */
				break;
			case 'd': /* delimiter between multiple descriptions */
				if (strlen(optarg) == 1 && !isdigit(*optarg)) {
					delim = *optarg;
					break;
				}
				if (strncasecmp(optarg, "0x", 2) == 0) {
					if (sscanf(optarg+2, "%lx", &i) != 1)
						fatal(1, "Hex conversion error on delimiter %s", optarg);
				}
				else
					if (sscanf(optarg, "%d", &i) != 1)
						fatal(1, "Decimal conversion error on delimiter %s", optarg);
				if (i >= (1<<CHAR_BIT) || i < 0)
					fatal(1, "delimiter must be in the range 0-%u", 1<<CHAR_BIT);
				delim = i;
				break;
			case '?':
				fprintf(stderr, "What's this?  -->  \"%s\"\n", optarg);
			default:
				usage();
				break;
		}
	}

	if (optind == argc)
		usage();

	if (prefix_flag && (argc - optind)%2 != 0)
		fatal(1, "a filename was specified with no matching identifier");

	initalpha();

	if (ofp == NULL) {
		ofp = stdout;
		if (ofp == NULL)
			fatal(1, "could not open output file");
	}
#ifdef _IOFBF
	setvbuf(ofp, NULL, _IOFBF, 256*KBYTE);
#endif

	nfiles = (argc - optind) / (1 + prefix_flag);
	files = (NRFilePtr *)ckalloc0(sizeof(NRFilePtr)*nfiles);
	SeqStr_InitBase();

	for (i=optind, j = 0; i<argc; i += 1 + prefix_flag, ++j) {
		curfp = NRFile_Open(argv[i], (prefix_flag ? argv[i+1] : ""));
		if (curfp == NULL)
			fatal(1, "Could not open sequence file:  %s", argv[i]);
		files[(i-optind)/(1+prefix_flag)] = curfp;

		while ((new = NRFile_Read(curfp)) != NULL) {
			if (new->seqlen == 0) {
				curfp->numnull++;
				continue;
			}
			if (new->seqlen < minlen) {
				curfp->lencnt++;
				curfp->lenres += new->seqlen;
				continue;
			}
			if ((i != optind || !skip1) &&
					(old = SeqStr_AlreadyFound(new)) != NULL) {
				curfp->dupres += old->seqlen;
				curfp->nummatches++;
				if (j < maxcatfiles || old->name1->nrfp->filenum == curfp->filenum)
					SeqStr_AppendName(old, new->name1);
				continue;
			}
			else {
				SeqStr_Append(new);
				continue;
			}
		}
		NRFile_Close(curfp);
	}

	Report(listhead);

	fflush(stdout);

	fprintf(stderr, "\n\nProgressive Statistics:\n\n");
	fprintf(stderr, "%11s", "");
	fprintf(stderr, "--------- Records ---------  ");
	fprintf(stderr, "-------------- Residues -----------\n");
	fprintf(stderr, "Database   ");
	fprintf(stderr, "   Read  Duplicate  Written     ");
	fprintf(stderr, "    Read   Duplicate     Written\n");
	for (i=optind; i<argc; i += (1 + prefix_flag)) {
		curfp = files[(i-optind)/(1 + prefix_flag)];
		numseqs += curfp->numseqs;
		nummatches += curfp->nummatches;
		totres += curfp->totres;
		dupres += curfp->dupres;
		lencnt += curfp->lencnt;
		lenres += curfp->lenres;
		numnull += curfp->numnull;
		fprintf(stderr, "%-10s %7s  %7s    %7s  ",
			misc_basename(curfp->filename, NULL),
			Ultostr(curfp->numseqs,1),
			Ultostr(curfp->nummatches,1),
			Ultostr(curfp->numseqs - curfp->nummatches - curfp->lencnt - curfp->numnull,1)
			);
		fprintf(stderr, "%11s %11s %11s\n",
			Ultostr(curfp->totres,1),
			Ultostr(curfp->dupres,1),
			Ultostr(curfp->totres - curfp->dupres - curfp->lenres,1)
			);
	}

	fprintf(stderr, "\n%-10s %7s  %7s    %7s  ",
		"Totals:",
		Ultostr(numseqs,1),
		Ultostr(nummatches,1),
		Ultostr(numseqs - nummatches - numnull - lencnt,1)
		);
	fprintf(stderr, "%11s %11s %11s\n",
		Ultostr(totres,1),
		Ultostr(dupres,1),
		Ultostr(totres - dupres - lenres,1)
		);

	if (numnull != 0)
		fprintf(stderr,
			"\nTotal no. of zero-length sequences encountered:  %d\n", numnull);
	fprintf(stderr, "\nNo. of base word hits:  %s (%s total)\n",
			Ultostr(numhits,1), Ultostr(numtothits,1));
	fprintf(stderr, "No. of 32-bit hash hits:  %s\n", Ultostr(numhashhits,1));
	fprintf(stderr, "Total memory allocated:  %0.3lf MB\n", (double)totalloced() / (KBYTE*KBYTE));
	fprintf(stderr, "Longest comment line read:  %s\n", Ultostr(maxnamelen,1));
	fprintf(stderr, "Longest comment line written:  %s\n", Ultostr(maxwnamelen,1));
	fprintf(stderr, "Longest sequence read:  %s\n", Ultostr(maxseqlen,1));
	if (minlen > 1)
	fprintf(stderr, "Sequences less than %d residues in length:  %s\n",
					minlen, Ultostr(lencnt,1));
	exit(0);
}


/*
	usage - write the program banner, the two accepted command line
	forms and the list of options to standard error, then terminate
	the program with exit status 1.
*/
void
usage()
{
	/* one entry per option line, in the order they are displayed */
	static const char	*optlines[] = {
		"\t-o filename\t-- name of file in which to save output\n",
		"\t-a filename\t-- name of file to which output should be appended\n",
		"\t-l#\t\t-- min. required sequence length\n",
		"\t-i\t\t-- do not check first file for duplicates\n",
		"\t-c#\t\t-- no. of input files for concatenating descriptions\n",
		"\t-C directory\t-- change directory\n",
#ifdef DATLOCK
		"\t-L\t\t-- lock data pages in memory (super-user only)\n",
#endif
		"\t-n#\t\t-- max. no. of seq. descriptions to report per sequence\n",
		"\t-d#\t\t-- delimiter between consecutive descriptions\n",
		"\t-p\t\t-- use id prefixes on command line\n"
	};
	size_t	k;

	fputs("nrdb 2.0.1 -- quasi-nonredundant database generator\n\n", stderr);
	fputs("Copyright (C) 1997,1998 by Warren R. Gish.  All Rights Reserved.\n", stderr);

	/* the program name shown is the one recorded in module */
	fprintf(stderr,
		"\nUsage:\n\n    %s [options] file1 [file2 [file3 ...]]\n",
		module);
	fprintf(stderr,
		"\nUsage:\n\n    %s -p [options] file1 id1 [file2 id2 [file3 id3 ...]]\n",
		module);

	fputs("\nwhere options are:\n", stderr);
	for (k = 0; k < sizeof optlines / sizeof optlines[0]; ++k)
		fputs(optlines[k], stderr);

	exit(1);
}

