/**************************************************************************
*                                                                         *
*                             COPYRIGHT NOTICE                            *
*                                                                         *
* This software/database is categorized as "United States Government      *
* Work" under the terms of the United States Copyright Act.  It was       *
* produced as part of the author's official duties as a Government        *
* employee and thus can not be copyrighted.  This software/database is    *
* freely available to the public for use without a copyright notice.      *
* Restrictions can not be placed on its present or future use.            *
*                                                                         *
* Although all reasonable efforts have been taken to ensure the accuracy  *
* and reliability of the software and data, the National Library of       *
* Medicine (NLM) and the U.S. Government do not and can not warrant the   *
* performance or results that may be obtained by using this software,     *
* data, or derivative works thereof.  The NLM and the U.S. Government     *
* disclaim any and all warranties, expressed or implied, as to the        *
* performance, merchantability or fitness for any particular purpose or   *
* use.                                                                    *
*                                                                         *
* In any work or product derived from this material, proper attribution   *
* of the author(s) as the source of the software or data would be         *
* appreciated.                                                            *
*                                                                         *
**************************************************************************/
/*
SETDB - Produce a protein sequence database for searching with the
programs BLASTP, BLASTX, and BLAST3.

Usage:

 	setdb [-t title] dbname

where dbname names a file containing protein sequences in FASTA format.
Each sequence is preceded by a single header line that begins with the
character '>' and ends with a newline character.  The optional title will
be displayed by the BLAST programs when the database is searched.

Typical sequence entries in the FASTA-format input file to the SETDB program
look like this:

>CCHU (PIR) Cytochrome c - Human
GDVEKGKKIFIMKCSQCHTVEKGGKHKTGPNLHGLFGRKTGQAPGYSYTAANKNKGIIWG
EDTLMEYLENPKKYIPGTKMIFVGIKKKEERADLIAYLKKATNE
>CCCZ (PIR) Cytochrome c - Chimpanzee (tentative sequence)
GDVEKGKKIFIMKCSQCHTVEKGGKHKTGPNLHGLFGRKTGQAPGYSYTAANKNKGIIWG
EDTLMEYLENPKKYIPGTKMIFVGIKKKEERADLIAYLKKATNE
>CCMQR (PIR) Cytochrome c - Rhesus macaque (tentative sequence)
GDVEKGKKIFIMKCSQCHTVEKGGKHKTGPNLHGLFGRKTGQAPGYSYTAANKNKGITWG
EDTLMEYLENPKKYIPGTKMIFVGIKKKEERADLIAYLKKATNE


SETDB partitions the database into three files.  For example, if the
database is named AABANK, then the three created files are named
AABANK.ahd (containing header lines), AABANK.bsq (containing
binary-encoded sequences), and AABANK.atb (containing indices).
The input FASTA-format file is not required by BLASTP, BLASTX, or BLAST3.
(A fourth type of output file, .seq, is used by GBLASTA.  Using conditional
compilation, SETDB does not ordinarily produce a .seq file.)

The database can then be searched using the command:

    blastp aabank queryseq

*/
#define I_DONT_PLAN_TO_USE_GBLASTA

#include <ncbi.h>
#include <gish.h>
#include <gishlib.h>

#define EXTERN
#include "blastapp.h"
#include "aabet.h"	/* the alphabet */

/*
entry_max is the current capacity, in entries, of the header_beg and seq_beg
arrays.  main() raises it by ENTRY_INCR whenever the entry count reaches it,
and allocates entry_max+1 elements so that one extra trailing offset fits.
*/
size_t	entry_max;
/* amount to increment entry_max each time */
#define ENTRY_INCR (256*1024)	

off_t	*header_beg, /* file offsets (from ftell) to the header lines in the header file */
		*seq_beg; /* file offsets (from ftell) to start of sequence data in the binary sequence file */

long
		dbtype = DB_TYPE_PRO,	/* database type indicator; first value written to the index file */
		dbformat = AAFORMAT;	/* database format (version) indicator; second value written to the index file */

unsigned long
		entry,		/* actual number of database entries */
		mxlen;		/* maximum length of a database sequence (terminating null byte not counted) */

char	*fname;		/* Basename used for the names of database files; taken from the command line */
char	*dbtitle = "";	/* Visible string name of the database; replaced by the -t option argument */
/* Scratch buffer in which main() builds each output file name with sprintf() */
char	filename[FILENAME_MAX+1];
/*
Current input line.  buf and bufmax are both passed by address to vfgets();
bufmax is presumably the allocated size of buf -- confirm against vfgets().
*/
char	*buf;
size_t	bufmax;
/* Forward declaration; long_write() is defined at the end of this file */
int		long_write PROTO((long, FILE *));
/* Program name, used in the usage message and as the perror() prefix */
const char	*module = "setdb";

/*
noblock_flag is assigned in main()'s 'b' option case and makes lockfile() use
SYS_WRITELOCK instead of SYS_WRITEWLOCK.  nolock_flag is assigned in main()'s
'l' option case and makes lockfile() skip locking altogether.
*/
Boolean	noblock_flag, nolock_flag;
/* Set to 1 by alarmfunc(); when nonzero, main() prints "Continuing" once the locks are held */
char	alarm_rang;

/*
alarmfunc - callback handed to AlarmEvery() by main() while the database
files are being locked.

Prints a notice on the first call and a shorter reminder on every later
call, then records in alarm_rang that it has fired.  Always returns 0.
*/
int	alarmfunc()
{
	const char	*notice;

	/* alarm_rang is still zero only on the very first invocation */
	if (alarm_rang)
		notice = "Still waiting...\n";
	else
		notice = "Waiting to acquire a file lock -- this database appears to be in use.\n";

	fputs(notice, stdout);
	alarm_rang = 1;
	return 0;
}

main(argc,argv)
	int		argc;
	char	**argv;
{
	FILE
		*infp,	/* input file */
		*hdrfile,	/* header file */
#ifndef I_DONT_PLAN_TO_USE_GBLASTA
		*sfile,	/* sequence file */
#endif
		*bfile,	/* binary-encoded sequence file */
		*tfile;	/* tab (index) file */
	AlarmBlkPtr	abp;
	long	len;
	long	i;
	int		period = 60;
	int		c, eol;
	char	*eof;
	int		fd;
	register char	*cp, *cp2, ch, xch;
	long	totlen=0;
	long	seqcnt=0;

	while ((c = getopt(argc, argv, "t:p:l")) != -1)
		switch (c) {
		case 't':
			dbtitle = optarg;
			break;
		case 'p': /* period of waiting messages */
			period = MAX(atoi(optarg), 0);
			break;
		case 'b':
			noblock_flag = TRUE;
			break;
		case 'l':
			nolock_flag = TRUE;
			break;
		case '?':
			usage();
		}

	if (argc - optind != 1)
		usage();

	fname = argv[optind];

	infp = ckopen(fname, "r", 1);

	sprintf(filename, "%s.atb", fname);
	fd = ckdopen(filename, O_WRONLY|O_CREAT);
	tfile = fdopen(fd, "w");

	sprintf(filename, "%s.ahd", fname);
	fd = ckdopen(filename, O_WRONLY|O_CREAT);
	hdrfile = fdopen(fd, "w");

	sprintf(filename, "%s%s", fname, AA_SEARCHSEQ_EXT);
	fd = ckdopen(filename, O_WRONLY|O_CREAT);
	bfile = fdopen(fd, "w");

	/*
	The order of locking files is important, to avoid the potential
	for deadlock with the BLAST programs, which must lock files in
	the same order (at least for the first file) as is done here.
	The order of file closing is then important, because that's when
	the locks established here are released.  The closing order should
	be the opposite order of locking.
	*/
	AlarmEvery(&abp, period, alarmfunc, NULL);
	lockfile(tfile);
	lockfile(hdrfile);
	lockfile(bfile);
	AlarmClr(abp);
	if (alarm_rang)
		printf("Continuing\n");

#ifndef I_DONT_PLAN_TO_USE_GBLASTA
	sprintf(filename, "%s.seq", fname);
	sfile = ckopen(filename, "w", 1);
	SYS_WRITEWLOCK(fileno(sfile), 0, SEEK_SET, 0);
#endif

	/* Skip over any initial comment lines */
	do {
		eof = vfgets(&buf, &bufmax, &buf, infp);
	} while (buf[0] != '>' && eof != NULL);

	/* Add a null prefix (sentinel) byte to each database file */
#ifndef I_DONT_PLAN_TO_USE_GBLASTA
	putc_unlocked(NULLB, sfile);
#endif
	putc_unlocked(NULLB, bfile);

	for (entry = 0; eof != NULL; entry++) {
		if (entry >= entry_max) {
			entry_max += ENTRY_INCR;
			header_beg = (off_t *)ckrealloc(header_beg, sizeof(*header_beg)*(entry_max+1));
			seq_beg = (off_t *)ckrealloc(seq_beg, sizeof(*seq_beg)*(entry_max+1));

		}
		header_beg[entry] = ftell(hdrfile);
		/* Write the header line without a trailing newline character */
		if (fwrite(buf, strlen(buf)-1, 1, hdrfile) != 1)
			bfatal(ERR_UNDEF, "Write error on file %s.ahd", fname);
		len = 0;
		seq_beg[entry] = ftell(bfile);
		while ((eof = vfgets(&buf, &bufmax, &buf, infp)) != NULL) {
			if (*buf == '>')
				break;
			/* extract ASCII alphabetic characters and encode them in binary */
			cp = cp2 = buf;
			while ((ch = *cp++) != NULLB) {
				if ((xch = aa_atob[ch]) < AAID_MAX+1) {
#ifndef I_DONT_PLAN_TO_USE_GBLASTA
					putc_unlocked(ch, sfile);
#endif
					*cp2++ = xch;
				}
			}
			eol = cp2 - buf;
			if (fwrite(buf, 1, eol, bfile) != eol) {
				perror(module);
				exit(2);
			}

			/* tally the byte count */
			len += eol;
		}

		++seqcnt;
		totlen += len;

		/* Add a null terminator (sentinel) byte to each sequence */
#ifndef I_DONT_PLAN_TO_USE_GBLASTA
		putc_unlocked(NULLB, sfile);
#endif
		putc_unlocked(NULLB, bfile);

		if (len > mxlen) /* mxlen doesn't take null terminator into account */
			mxlen = len;
	}
	seq_beg[entry] = ftell(bfile);
	header_beg[entry] = ftell(hdrfile);


	long_write(dbtype, tfile);
	long_write(dbformat, tfile);

	/* Save the database title */
	len = strlen(dbtitle) + 1;
	long_write(len, tfile);
	fwrite(dbtitle, len, 1, tfile);
	/* pad dbtitle to a multiple of 4 bytes with nulls */
	if (len%4 != 0)
		for (c = 0; c < 4 - len%4; ++c)
			putc_unlocked('\0', tfile);

	long_write(entry, tfile);
	long_write(totlen, tfile);
	long_write(mxlen, tfile);

	for (i=0; i<entry+1; ++i)
		long_write((long)seq_beg[i], tfile);
	for (i=0; i<entry+1; ++i)
		long_write((long)header_beg[i], tfile);

#ifndef I_DONT_PLAN_TO_USE_GBLASTA
	fclose(sfile);
#endif
	fclose(bfile);
	fclose(hdrfile);
	fclose(tfile);

	printf("%s ==> %s sequences totalling %s letters\n",
			fname, Ltostr(seqcnt,1), Ltostr(totlen,1));
	printf("Maximum sequence length %s\n", Ltostr(mxlen,1));
	exit(0);
}

/*
lockfile - lock an open output stream's file and truncate it to zero length.

fp is the stream whose underlying descriptor is locked and truncated.

Locking is skipped entirely when nolock_flag is set.  Otherwise
SYS_WRITELOCK is used when noblock_flag is set and SYS_WRITEWLOCK when it
is not (presumably the non-waiting and waiting variants respectively --
confirm against the macro definitions).  The truncation happens whether or
not a lock was requested; bfatal() is called if it fails.

Always returns 0.
*/
int
lockfile(fp)
	FILE	*fp;
{
	int	descriptor = fileno(fp);

	if (!nolock_flag) {
		if (noblock_flag) {
			SYS_WRITELOCK(descriptor, 0, SEEK_SET, 0);
		} else {
			SYS_WRITEWLOCK(descriptor, 0, SEEK_SET, 0);
		}
	}

	/* truncate only after the lock attempt, so the old contents stay put until then */
	if (ftruncate(descriptor, 0) == -1)
		bfatal(ERR_FOPEN, "ftruncate failed");

	return 0;
}


/*
usage - print the program's purpose, command syntax and option summary to
stderr, then terminate the process with exit status 1.  Does not return.
*/
void
usage()
{
	static const char	*option_help[] = {
		"    -t title   descriptive title for the database\n",
		"    -l         do not lock database files during processing\n",
		"    -b         lock database files but do not block if already locked\n",
		"    -p time    periodicity (in seconds) to warn of file lock contention\n"
	};
	int	k;

	fputs("Purpose:  produce a protein database for BLAST from a file in FASTA format.\n\n",
		stderr);
	/* the program name comes from the file-scope module string */
	fprintf(stderr, "Usage:\n\n    %s [options] profilename\n\n", module);

	fputs("Options include:\n", stderr);
	for (k = 0; k < (int)(sizeof option_help / sizeof option_help[0]); ++k)
		fputs(option_help[k], stderr);

	exit(1);
}

/*
long_write - write one long value to a stream in the database byte order.

i   value to write; the local copy is converted with LONG_BIGENDIAN()
fp  output stream

Only the last BO_LONG_SIZE bytes of the converted value are written, so any
leading bytes of a long wider than BO_LONG_SIZE are skipped.

Returns the fwrite() item count: 1 on success, 0 on failure.
*/
int
long_write(i, fp)
	long	i;
	FILE	*fp;
{
	CharPtr	tail;

	LONG_BIGENDIAN(i);

	/* step past the bytes that do not belong to the BO_LONG_SIZE-byte field */
	tail = (CharPtr) &i;
	tail += sizeof(long) - BO_LONG_SIZE;

	return fwrite(tail, BO_LONG_SIZE, 1, fp);
}

