/* ===========================================================================
 
                            PUBLIC DOMAIN NOTICE
 
  Unless otherwise indicated, this work was written by Warren R. Gish.
  Portions of the work were developed by the author while a United States
  Government employee under the terms of the United States Copyright Act and
  thus cannot be copyrighted.  Modifications to the prior work, as well as
  entirely new portions of this software, were developed with subsequent
  support from the Washington University School of Medicine, St. Louis, MO, and
  are hereby placed in the public domain.  This software is freely available
  to the public for use without restriction.

  Although all reasonable efforts have been taken to ensure the accuracy and
  reliability of the software and data, the author does not and cannot warrant
  the performance or results that may be obtained by using this software or
  data.  The author disclaims all warranties, express or implied, including
  warranties of performance, merchantability or fitness for any particular
  purpose.

  Please cite the author in any work or product based on this material.

===========================================================================*/
/*
Last modified:  9/18/95

PRESSDB - Produce a nucleotide sequence database for searching with the
programs BLASTN and TBLASTN.

Usage:

    pressdb [-t title] [-c clean-bound] [-o fname] [-a fname] [-s seed]
            [-p time] [-b] [-l] database

where "database" names a file in FASTA format containing DNA sequences,
separated by header lines beginning with the character '>'.  A typical
sequence entry is:

>BOVHBPP3I  Bovine beta-globin psi-3 pseudogene, 5' end.
GGAGAATAAAGTTTCTGAGTCTAGACACACTGGATCAGCCAATCACAGATGAAGGGCACT
GAGGAACAGGAGTGCATCTTACATTCCCCCAAACCAATGAACTTGTATTATGCCCTGGGC
TAATCTGCTCTCAGCAGAGAGGGCAGGGGGCTGGGTGGGGCTCACAAGCAAGACCAGGGC
CCCTACTGCTTACACTTGCTTCTAACACAACTTGCAACTGCACAAACACACATCATGGTG
CATCTGACTCTTGAGGGGAAGGCTACTTGTCACT

It is >>>ABSOLUTELY REQUIRED<<< that all lines of sequence data in the FASTA
file read by PRESSDB, except header lines and the last line of each sequence
entry, have the SAME length.  Furthermore, the last line of the database must
end with a newline ('\n') character, and there can be no spaces or tabs in the
sequence.  These may sound like dumb requirements, and you may note that SETDB
has no such requirements, but these conditions permit the programs BLASTN,
TBLASTN and TBLASTX to directly access any needed stretch of sequence from the
FASTA file, in order to obtain any ambiguity codes present in the sequence.
The FASTA file is only referenced if a match is found in a sequence known to
have one or more ambiguity codes in it (assuming the FASTA file is accessible
to the search program).

If two consecutive lines in the file begin with '>', PRESSDB assumes that there
is supposed to be an intervening sequence with zero length.  In other words, a
sequence can have one and only one header line associated with it.

PRESSDB produces three output files that assist database searching.  If the
database is named DNABANK, for example, then the three output files are named
DNABANK.csq, DNABANK.nhd and DNABANK.ntb.  The .csq file is a 4-to-1 byte
compressed version of the sequences in the FASTA-format file; the .nhd file
contains the header lines/descriptions; and the .ntb file contains indices for
the .csq, .nhd, and input FASTA files.

After formatting with PRESSDB, the database can then be searched using the
command:

       blastn DNABANK query.seq

The original input FASTA-format file, which may be voluminous, was at one time
absolutely required by the BLASTN and TBLASTN programs, but is not strictly
necessary to be maintained in current versions.  If a database sequence is
known to contain one or more nucleotide ambiguity codes, maintenance of the
original FASTA-format file will enable BLASTN and TBLASTN to assess whether
ambiguity codes are present in segments where matching was observed; if the
FASTA file is unavailable and an ambiguous sequence is hit against, BLASTN and
TBLASTN will merely issue a warning.  If none of the sequences in the FASTA
file contain ambiguity codes, then BLASTN and TBLASTN will never attempt to
refer to the FASTA file, rendering this file completely superfluous for the
purposes of BLAST searching.

The optional clean-bound argument to pressdb, which must be a positive integer,
characterizes a (possibly empty) set of 8-mers that are to be "cleaned" from a
query sequence prior to searching the database.  Any octomer that occurs at
least clean-bound times in the database (and at coordinate positions within the
sequences that are divisible by four) will be marked as "uninformative" and
neglected in word searches by BLASTN.  If clean-bound is not specified, no
octomers will be cleaned.

	Some clean-bound statistics gathered on GenBank Release 64.0

	Clean-bound		# of 8-mers cleaned
	===========		===================
	 100			34,578
	 200			 8,246
	 300			 1,876
	 400			   631
	 500			   275
	 600			   153
	 700			    96

NOTE: use of the clean-bound is no longer recommended, as it can produce
unexpected database search results, particularly when the region(s) of
similarity between the query and database sequences are short.  Moreover,
BLASTN version 1.4 does not support the clean-bound.
*/

#include <ncbi.h>
#include <signal.h>
#include <gishlib.h>

#define EXTERN
#include "blastapp.h"

/* Program name, used in the usage message */
const char	*module = "pressdb";

#ifdef WORDSIZE_MIN
#undef WORDSIZE_MIN
#endif
#define WORDSIZE_MIN	8		/* Smallest hash word size */

#define BUCKETS		65536		/* 4**8 */

size_t	entry_max; /* limit on allocated memory */
/* amount to increment entry_max each time it must be increased */
#define ENTRY_INCR	(512*1024)

/* type to pack values 0..255 in one byte; unsigned char for SUN */
#define PACK_TYPE unsigned char

/* set when an empty sequence line is seen inside the current entry */
char	hadzerolen;

/* input line buffer and its current / previously observed capacity */
char	*buf;
size_t	bufmax, obufmax;

/* fname: input FASTA file; filename: scratch for "<dbname><extension>" */
char	*fname, filename[FILENAME_MAX+1];
char	*dbname;	/* database name (-o/-a argument, else fname) */
long	dbsize;		/* result of sys_filesize(dbname) */
char	*dbtitle;	/* -t argument, or the title read back by get_table() */
Boolean	append_mode = FALSE;
Boolean	echo_mode = FALSE; /* echo FASTA-format sequences to output file? */

/* packed output buffer; resized in main() whenever bufmax changes */
PACK_TYPE *cbuf;

FILE	*infile, *outfile, *cfile, *tfile, *hdrfile;

/* set by random_nuc(), recorded and cleared per entry by put_tail() */
Boolean	had_ambiguity = FALSE;
/* number of invalid letters seen in the current entry */
long	bad_one;

void	put_tail PROTO((BDBFILE PNTR, char *,int));
int	random_nuc PROTO((BDBFILE PNTR bp, int c));

/*
 * Forward declarations for functions that are called before they are
 * defined further down in this file.  Without them the calls rely on
 * implicit declarations, and usage() would then conflict with its later
 * definition as a void function.
 */
void	usage();
int	lockfile();
int	truncfile();
int	get_table();
char	*compress();

#if defined(SIGINT) || defined(SIGHUP) || defined(SIGTERM)
void	sighandler();
#endif

/* clean_bound: -c argument; occurs[]: tally per packed 16-bit word */
int	clean_bound = 0,
	occurs[BUCKETS];

#define mask	(BUCKETS-1)
/* alphabet and letter-mapping tables, set up at the top of main() */
BLAST_AlphabetPtr	ntap;
BLAST_DegenMapPtr	dmp;
BLAST_DegenListPtr	dlp;
Boolean	PNTR maptst;
BLAST_LetterPtr	map;

long	seed;		/* -s argument */
Boolean	s_flag;		/* TRUE when -s was given */
Boolean	noblock_flag, nolock_flag;	/* -b and -l, respectively */
unsigned long	count_orig;	/* entry count before this run adds any */

int	long_write PROTO((long, FILE *));

/* nonzero once alarmfunc() has been invoked at least once */
char	alarm_rang;

/*
 * alarmfunc - callback handed to AlarmEvery() in main() while the program
 * waits for the database lock.  The first invocation explains why the
 * program is waiting; later invocations print a shorter reminder.
 * Always returns 0.
 */
int	alarmfunc()
{
	if (alarm_rang)
		printf("Still waiting...\n");
	else
		printf("Waiting to acquire a file lock -- this database appears to be in use.\n");

	alarm_rang = 1;
	return 0;
}

int
main(argc,argv)
	int		argc;
	char	**argv;
{
	long	i, len;
	int		c, shortlen=0, lastlen = 0, theline = 0, seqline, fileline=0;
	int		fd;
	BDBFILE	bdb;
	BDBFILE	PNTR bp;
	AlarmBlkPtr	abp;
	int		period = 60;
	char	*eof, *b, *compress();
	unsigned long	seqlen = 0;
	size_t	otsize, tsize;

	bp = &bdb;
	MemSet((VoidPtr)bp, 0, sizeof(bdb));

	InitBLAST();

	ntap = blast_oldblastna;
	maptst = ntap->inmap->maptst;
	map = ntap->inmap->map;
	dmp = BlastDegenMapFind(ntap);
	dlp = dmp->degen;

	while ((c = getopt(argc, argv, "o:a:t:c:s:p:bl")) != -1)
		switch (c) {
		case 'o': /* output file or database name */
			echo_mode = TRUE;
			dbname = optarg;
			break;
		case 'a':
			append_mode = TRUE;
			echo_mode = TRUE;
			dbname = optarg;
			break;
		case 't':
			dbtitle = optarg;
			break;
		case 'c':
			clean_bound = atoi(optarg);
			if (clean_bound < 1)
				fatal(ERR_INVAL, "the clean limit must be a positive integer");
			break;
		case 's': /* random number generator seed */
			if (sscanf(optarg, "%ld", &seed) != 1)
				fatal(ERR_INVAL, "invalid random number generator seed:  %s", optarg);
			s_flag = TRUE;
			break;
		case 'p': /* period of waiting messages */
			period = MAX(atoi(optarg), 0);
			break;
		case 'b':
			noblock_flag = TRUE;
			break;
		case 'l':
			nolock_flag = TRUE;
			break;
		case '?':
			usage();
		}

	if (argc - optind != 1) /* Need one more argument on the command line */
		usage();

	fname = argv[optind];
	if (dbname == NULL)
		dbname = fname;
	if (strcmp(dbname, "-") == 0)
		fatal(ERR_DBASE, "Sorry, the database name \"%s\" is not permitted to be used.", dbname);
	if (strcmp(dbname, fname) == 0) {
		echo_mode = FALSE;
		if (append_mode)
			fatal(ERR_DBASE, "Sorry, a database can not be appended to itself.");
	}

#ifdef SIGINT
	if (sys_signal(SIGINT, sighandler) == SIG_IGN)
		sys_signal(SIGINT, SIG_IGN);
#endif
#ifdef SIGHUP
	if (sys_signal(SIGHUP, sighandler) == SIG_IGN)
		sys_signal(SIGHUP, SIG_IGN);
#endif
#ifdef SIGTERM
	sys_signal(SIGTERM, sighandler);
#endif

/*#define ckopen(a,b,c) fopen(a,b)*/
	infile = outfile = ckopen(fname, "r", 1);

	/* CAUTION:  a call to ftruncate() is invoked below by truncfile() */
	sprintf(filename, "%s%s", dbname, NT_TABLE_EXT);
	tfile = ckopen(filename, "r+", 0);
	if (tfile == NULL) {
		if (append_mode && sys_filesize(dbname) > 0)
			fatal(ERR_DBASE, "illegal attempt to append to a database, when the FASTA file is nonempty and the corresponding BLAST database files are yet to be created.");
		append_mode = FALSE;
		tfile = ckopen(filename, "w+", 1);
	}

	sprintf(filename, "%s%s", dbname, NT_HEADER_EXT);
	if (append_mode)
		hdrfile = ckopen(filename, "r+", 1);
	else
		hdrfile = ckopen(filename, "w+", 1);

	sprintf(filename, "%s%s", dbname, NT_SEARCHSEQ_EXT);
	if (append_mode)
		cfile = ckopen(filename, "r+", 1);
	else
		cfile = ckopen(filename, "w", 1);

	if (!nolock_flag && !noblock_flag)
		AlarmEvery(&abp, period, alarmfunc, NULL);
	/* Only lock the .ntb file -- this locks the whole database */
	lockfile(tfile);
	dbsize = sys_filesize(dbname);
	if (append_mode && get_table(tfile, bp) != 0)
		fatal(ERR_DBASE, "Error reading file %s%s", dbname, NT_TABLE_EXT);
	if (dbsize < 0 && append_mode)
		echo_mode = FALSE;
	if (append_mode && dbsize >= 0 && bp->seq_beg[bp->count] != dbsize)
		fatal(ERR_DBASE, "The size of the existing FASTA file does not jibe with the contents of the BLAST database file %s%s", dbname, NT_TABLE_EXT);
	count_orig = bp->count;

	bp->type = DB_TYPE_NUC;
	bp->format = NTFORMAT;
	if (echo_mode) {
		outfile = ckopen(dbname, (append_mode ? "a" : "w"), 0);
		if (outfile == NULL)
			echo_mode = FALSE;
	}

	truncfile(tfile);
	truncfile(hdrfile);
	fseek(hdrfile, 0, SEEK_END);
	truncfile(cfile);
	fseek(cfile, 0, SEEK_END);
	if (!append_mode)
		fputc(NT_MAGIC_BYTE, cfile); /* MAGIC prefix byte */
	AlarmClr(abp);
	if (alarm_rang)
		printf("Continuing\n");


	if (s_flag)
		Nlm_RandomSeed(seed);

	/* Skip over any initial comment lines */
	do {
		++fileline;
		eof = vfgets(&buf, &bufmax, &buf, infile);
		if (bufmax != obufmax) {
			obufmax = bufmax;
			cbuf = (PACK_TYPE *)mem_realloc(cbuf, bufmax/(CHAR_BIT/NBPN) + 1);
		}
	} while (buf[0] != '>' && eof != NULL);

	b = buf;
	for (entry_max = bp->count; eof != NULL; ++bp->count) {
		if (bp->count >= entry_max) {
			otsize = ((entry_max+CHAR_BIT-1)/CHAR_BIT);
			entry_max += ENTRY_INCR;
			tsize = ((entry_max+CHAR_BIT-1)/CHAR_BIT);
			bp->header_beg = (unsigned long PNTR)mem_realloc((CharPtr)bp->header_beg, sizeof(*bp->header_beg)*(entry_max+1));
			bp->cseq_beg = (unsigned long PNTR)mem_realloc((CharPtr)bp->cseq_beg, sizeof(*bp->cseq_beg) * (entry_max+1));

			bp->seq_beg = (unsigned long PNTR)mem_realloc((CharPtr)bp->seq_beg, sizeof(*bp->seq_beg) * (entry_max+1));

			bp->ambiguity = (UcharPtr)mem_realloc((CharPtr)bp->ambiguity, tsize+1);
			Nlm_MemSet((CharPtr)bp->ambiguity + otsize, 0, tsize - otsize + 1);
		}
		bp->header_beg[bp->count] = ftell(hdrfile);
		/* Write the header line without the newline character */
		if (fwrite(b, (i = strlen(b))-1, 1, hdrfile) != 1)
			fatal(ERR_DBASE, "Error writing file %s%s", dbname, NT_HEADER_EXT);
		if (echo_mode && fwrite(b, i, 1, outfile) != 1)
			fatal(ERR_DBASE, "Error appending to file %s", dbname);
		b = buf;
		hadzerolen = FALSE;
		bp->cseq_beg[bp->count] = ftell(cfile) * (CHAR_BIT/NBPN);
		bp->seq_beg[bp->count] = ftell(outfile);
		seqline = 0;
		while (++fileline && (eof = vfgets(&buf, &bufmax, &b, infile)) != NULL) {
			if (bufmax != obufmax) {
				obufmax = bufmax;
				cbuf = (PACK_TYPE *)mem_realloc(cbuf, bufmax/(CHAR_BIT/NBPN) + 1);
			}
			if (*b == '>') {
				put_tail(bp, b, bp->count);
				seqlen = 0;
				break;
			}
			++seqline;
			len = strlen(b);
			if (echo_mode && fwrite(b, len, 1, outfile) != 1)
				fatal(ERR_DBASE, "Error appending to file %s", dbname);
			b[--len] = NULLB; /* Remove the terminal newline character */
			if (bp->line_len != 0) {
				if (len == 0) {
					hadzerolen = TRUE;
				}
				else
				if ((seqline == 1 && len > bp->line_len) ||
						hadzerolen ||
						(seqline > 1 && lastlen != bp->line_len))
					fatal(ERR_DBASE, "database sequence lines must have equal length; see line %d",
							fileline);
			}
			else
				if (seqline > 1) {
					if (shortlen > lastlen)
						fatal(ERR_DBASE, "database sequence lines must have equal length; see line %d",
							theline);
					bp->line_len = lastlen;
				}
				else
					if (shortlen < len) {
						theline = fileline;
						shortlen = len;
					}
			lastlen = len;
			bp->totdblen += len;
			seqlen += len;
			if (seqlen > bp->maxlen)
				bp->maxlen = seqlen;
			b = compress(bp, b+len);
		}
	}

	if (bp->line_len == 0)
		bp->line_len = bp->maxlen;

	bp->seq_beg[bp->count] = ftell(outfile);
	bp->header_beg[bp->count] = ftell(hdrfile);
	if (bp->count != count_orig)
		put_tail(bp, b, bp->count-1);
	bp->cseq_beg[bp->count] = ftell(cfile) * (CHAR_BIT/NBPN);

	bp->c_len = ftell(cfile);
	if (clean_bound > 0)
		for (i = bp->clean_count = 0; i < BUCKETS; ++i)
			if (occurs[i] > clean_bound)
				++bp->clean_count;
		
	long_write(bp->type, tfile);
	long_write(bp->format, tfile);

	if (dbtitle == NULL)
		dbtitle = "";
	len = strlen(dbtitle) + 1;
	long_write(len, tfile);
	fwrite(dbtitle, len, 1, tfile);
	/* pad dbtitle to a multiple of 4 bytes with nulls */
	if (len%4 != 0)
		for (c = 0; c < 4 - len%4; ++c)
			putc_unlocked('\0', tfile);

	long_write(bp->line_len, tfile);
	long_write(bp->count, tfile);
	long_write(bp->maxlen, tfile);
	long_write(bp->totdblen, tfile);
	long_write(bp->c_len, tfile);
	long_write(bp->clean_count, tfile);

	if (bp->clean_count != 0) {
		for (i = 0; i < BUCKETS; ++i)
			if (occurs[i] > clean_bound)
				long_write(i, tfile);
	}
	for (i = 0; i <= bp->count; ++i)
		long_write((long)bp->cseq_beg[i], tfile);
	for (i = 0; i <= bp->count; ++i)
		long_write((long)bp->seq_beg[i], tfile);
	for (i = 0; i <= bp->count; ++i)
		long_write((long)bp->header_beg[i], tfile);
	fwrite((CharPtr)bp->ambiguity, sizeof(bp->ambiguity[0]), bp->count/CHAR_BIT+1, tfile);

	(void) fclose(cfile);
	if (hdrfile != NULL)
		(void) fclose(hdrfile);
	if (infile != NULL)
		(void) fclose(infile);
	(void) fclose(tfile); /* this releases the lock on the database */

	if (clean_bound > 0)
		printf("%lu 8mers cleaned from database\n", bp->clean_count);
	printf("%s entries (%s nucleotides) packed to %s bytes\n",
			Ltostr(bp->count,1), Ltostr(bp->totdblen,1), Ltostr(bp->c_len,1));

	exit (0);
}

/* compress - pack buf into cbuf, leaving < 4 characters in buf */

/* randomly map unknown letters into A, C, G, and T */
#define PRESS_NUC(n) \
((maptst[n] && map[n] < 4) ? map[n] : random_nuc(bp, n))

char *
compress(bp, end)
	BDBFILE	PNTR bp;
	register char	*end;
{
	register char	*p;
	register PACK_TYPE	*c, *d;
	register long	bucket;
	long	len;

	for (c = cbuf, p = buf; p+4 <= end; ++c, p += 4)
		*c = (PRESS_NUC(p[0])<<6) | (PRESS_NUC(p[1])<<4)
				| (PRESS_NUC(p[2])<<2) | PRESS_NUC(p[3]);

	bucket = *cbuf;
	for (d = cbuf+1; d < c; ++d) {
		bucket = ((bucket<<8) | *d) & mask;
		++occurs[bucket];
	}
	len = strlen(p);
	Nlm_MemCpy((CharPtr)buf, (CharPtr)p, len+1);
	fwrite((char *)cbuf, sizeof(cbuf[0]), c - cbuf, cfile);
	return buf+len;
}

/*
 * put_tail - finish the packed representation of one database entry.
 *
 *	bp	database descriptor
 *	b	one past the last leftover (not yet packed) letter in buf
 *	seq_nbr	zero-based index of the entry being finished
 *
 * Packs the 1 to 3 leftover letters, if any, into the high-order bits of
 * one final byte and ORs their count into cseq_beg[seq_nbr].  Then records
 * in the ambiguity bit vector whether random_nuc() was invoked for this
 * entry, and appends NSENTINELS magic bytes to cfile.
 *
 * If invalid letters were counted for the entry, a summary is printed to
 * stderr and the program exits with status 2.
 */
void
put_tail(bp, b, seq_nbr)
	BDBFILE	PNTR bp;
	char	*b;
	int		seq_nbr;
{
	PACK_TYPE	c;
	register int	many;
	register int	i;

	if ((many = b - buf) > 0) {
		c = PRESS_NUC(buf[0])<<6;
		if (many > 1)
			c |= PRESS_NUC(buf[1])<<4;
		if (many > 2)
			c |= PRESS_NUC(buf[2])<<2;
		fwrite((char *)&c, sizeof(c), 1, cfile);

		/* the number of letters in the partial byte rides in the low bits */
		bp->cseq_beg[seq_nbr] |= many;
	}

	/* Save whether an ambiguity letter was encountered in current sequence */
	bp->ambiguity[seq_nbr/CHAR_BIT] |= had_ambiguity << (seq_nbr%CHAR_BIT);
	had_ambiguity = FALSE;


	/* Append magic bytes to act as a sentinel */
	for (i=0; i<NSENTINELS; ++i)
		fputc(NT_MAGIC_BYTE, cfile);

	if (bad_one > 0) {
		/*
		 * Report the one-based number of the entry just finished.  This
		 * uses seq_nbr rather than bp->count, because main() has already
		 * advanced bp->count when it flushes the last entry of the file.
		 * bad_one is a long, hence %ld.
		 */
		fprintf(stderr, "sequence no. %d contains %ld invalid nucleic acid code(s).\n",
			seq_nbr+1, bad_one);
		bad_one = 0;
		exit(2);
	}
}

/*
	random_nuc()

	input:  an ASCII nucleotide code
	output: a randomly chosen binary nucleotide from the list of matchers
*/
#define RANNUC(c) \
(dlp[(BLAST_Letter)(c)].list[(Nlm_RandomNum()>>8) % dlp[(BLAST_Letter)c].cnt])

int
random_nuc(bp, c)
	BDBFILE	PNTR bp;
	int		c;
{
	had_ambiguity = TRUE;
	if (!maptst[c]) {
		if (bad_one == 0) {
			char	qname[50];
			CharPtr	cp;

			fseek(hdrfile, bp->header_beg[bp->count], SEEK_SET);
			fread(qname, sizeof qname, 1, hdrfile);
			fseek(hdrfile, 0, SEEK_END);
			qname[sizeof(qname)-1] = NULLB;
			cp = strchr(qname, ' ');
			if (cp != NULL)
				*cp = NULLB;
			fprintf(stderr, "\nsequence #%lu\n\t%s\n", bp->count+1, qname);
		}
		if (bad_one < 10) {
			if (isalnum(c) || ispunct(c))
				fprintf(stderr, "\tinvalid letter:  \"%c\"\n", c);
			else
				fprintf(stderr, "\tinvalid letter, numeric value %d\n", c);
		}
		if (bad_one == 10)
			fprintf(stderr, "\tadditional invalid letter(s) are not displayed.\n");
		++bad_one;
		return RANNUC(map['N']);
	}
	return RANNUC(map[c]);
}


#if defined(SIGINT) || defined(SIGHUP) || defined(SIGTERM)
/*
 * discard_output - if fp is open, close it and remove the database file
 * whose name is dbname followed by the extension ext.
 */
static void
discard_output(fp, ext)
	FILE	*fp;
	char	*ext;
{
	char	path[FILENAME_MAX+1];

	if (fp == NULL)
		return;
	(void) fclose(fp);
	sprintf(path, "%s%s", dbname, ext);
	unlink(path);
}

/*
 * sighandler - installed by main() for SIGINT, SIGHUP and SIGTERM.
 * Closes and deletes whichever output files have been opened (packed
 * sequences, headers, table -- in that order) and exits with status 1.
 * If no input file name has been established yet, it just exits.
 */
void
sighandler()
{
	if (fname != NULL) {
		discard_output(cfile, NT_SEARCHSEQ_EXT);
		discard_output(hdrfile, NT_HEADER_EXT);
		discard_output(tfile, NT_TABLE_EXT);
	}
	exit(1);
}
#endif

/*
 * lockfile - take a write lock on the file behind fp, unless locking was
 * disabled with -l (nolock_flag).  With -b (noblock_flag) SYS_WRITELOCK is
 * used, otherwise SYS_WRITEWLOCK; per the usage text, -b means "do not
 * block if already locked".  Always returns 0.
 */
int
lockfile(fp)
	FILE	*fp;
{
	int	fd = fileno(fp);

	if (nolock_flag)
		return 0;

	if (noblock_flag) {
		SYS_WRITELOCK(fd, 0, SEEK_SET, 0);
	}
	else {
		SYS_WRITEWLOCK(fd, 0, SEEK_SET, 0);
	}
	return 0;
}

/*
 * truncfile - truncate the file behind fp to zero length, except in append
 * mode, where the file is left untouched.  A failing ftruncate() is
 * reported through fatal().  Always returns 0.
 */
int
truncfile(fp)
	FILE	*fp;
{
	if (append_mode)
		return 0;

	if (ftruncate(fileno(fp), 0) == -1)
		fatal(ERR_FOPEN, "ftruncate failed");
	return 0;
}

/*
 * usage - print the program banner, purpose, synopsis and option summary
 * to stderr, then exit with status 1.
 */
void
usage()
{
	/* fixed text of the option summary, one output line per element */
	static const char	*option_help[] = {
		"Options include:\n",
		"    -t title   descriptive title for the database, to appear in BLAST output\n",
		"    -o fname   create database named fname, truncating it if it already exists\n",
		"    -a fname   append to database named fname, creating it if necessary\n",
		"    -c cleanlimit   8-mer clean limit (not used by BLASTN 1.4)\n",
		"    -s seed    seed for the random number generator\n",
		"    -l         do not lock database files during processing\n",
		"    -b         lock database files but do not block if already locked\n",
		"    -p time    periodicity (in seconds) to warn of file lock contention\n"
	};
	size_t	k;

	fprintf(stderr, "\n%s-%s\n\n", module, LOCALITY);
	fprintf(stderr,
		"Purpose:  produce a nt. sequence database for BLAST from a file in FASTA format\n\n");
	fprintf(stderr,
		"Usage:\n\n    %s [options] ntdbname\n\n", module);

	for (k = 0; k < sizeof(option_help)/sizeof(option_help[0]); ++k)
		fputs(option_help[k], stderr);

	exit(1);
}

/*
 * long_write - write the value i to fp as a BO_LONG_SIZE-byte integer.
 *
 * LONG_BIGENDIAN is applied to the value first (presumably converting it to
 * big-endian byte order -- confirm against its definition), after which the
 * last BO_LONG_SIZE bytes of its in-memory representation are written.
 *
 * Returns the result of fwrite(), i.e. 1 on success.
 */
int
long_write(i, fp)
	long	i;
	FILE	*fp;
{
	CharPtr	lowbytes;

	LONG_BIGENDIAN(i);
	/* skip any leading bytes when a long is wider than BO_LONG_SIZE */
	lowbytes = (CharPtr) &i + (sizeof(i) - BO_LONG_SIZE);
	return fwrite(lowbytes, BO_LONG_SIZE, 1, fp);
}

/*
 * long_read - read a BO_LONG_SIZE-byte integer from fp into *ip; the
 * counterpart of long_write().
 *
 * The bytes are read into the tail of a zeroed long, LONG_BIGENDIAN is
 * applied, and the result is stored through ip whether or not the read
 * succeeded.
 *
 * Returns the result of fread(), i.e. 1 on success.
 */
int
long_read(ip, fp)
	long	PNTR ip;
	FILE	*fp;
{
	long	value = 0;
	CharPtr	dest = (CharPtr) &value + (sizeof(value) - BO_LONG_SIZE);
	int	nread;

	nread = fread(dest, BO_LONG_SIZE, 1, fp);
	LONG_BIGENDIAN(value);
	*ip = value;
	return nread;
}

/*
 * get_longs - allocate an array of n longs with mem_malloc() and fill it
 * with n consecutive values read from fp by long_read().
 *
 * The results of the individual reads are not checked.  Returns the newly
 * allocated array, which the caller owns.
 */
long PNTR
get_longs(n, fp)
	size_t	n;
	FILE	*fp;
{
	long	PNTR vec;
	size_t	k;

	vec = mem_malloc(sizeof(*vec) * n);
	for (k = 0; k < n; ++k)
		long_read(&vec[k], fp);
	return vec;
}

int
get_table(fp, bp)
	FILE	*fp;
	BDBFILE	PNTR bp;
{
	size_t	many, i;
	CharPtr	title;
	long	titlelen;

	rewind(fp);

	MemSet((VoidPtr)bp, 0, sizeof(*bp));
	if (long_read(&bp->type, fp) != 1)
		return 1;

	if (bp->type != DB_TYPE_NUC)
		fatal(ERR_DBASE, "\"%s\" is not a proper nucleotide sequence database.", dbname);
	long_read(&bp->format, fp);
	if (bp->format != NTFORMAT)
		fatal(ERR_DBASE, "Database format is an incompatible version for appending.");
	bp->restype = BLAST_ALPHATYPE_NUCLEIC_ACID;

	long_read(&titlelen, fp);
	title = Nlm_Calloc(1, titlelen);
	fread(title, 1, titlelen, fp);
	title[titlelen-1] = NULLB;
	if (dbtitle == NULL)
		dbtitle = title;
	else
		mem_free(title);
	i = titlelen%4;
	if (i != 0) {
		i = 4 - i;
		fseek(fp, i, SEEK_CUR);
	}

	long_read(&bp->line_len, fp);
	long_read(&bp->count, fp);
	long_read(&bp->maxlen, fp);
	long_read(&bp->totdblen, fp);
	long_read(&bp->c_len, fp);
	long_read(&bp->clean_count, fp); /* sic */
	if (bp->clean_count > 0)
		fseek(fp, bp->clean_count * BO_LONG_SIZE, SEEK_CUR);
	bp->clean_count = 0; /* clear any previous counts--this is bogus anyway */
	clean_bound = 0;

	many = bp->count + 1;
	bp->cseq_beg = (unsigned long PNTR)get_longs(many, fp);
	bp->seq_beg = (unsigned long PNTR)get_longs(many, fp);
	bp->header_beg = (unsigned long PNTR)get_longs(many, fp);

	i = bp->count / CHAR_BIT + 1;
	bp->ambiguity = mem_malloc(sizeof(bp->ambiguity[0]) * i);
	if (fread((CharPtr)bp->ambiguity, i, 1, fp) != 1)
		fatal(ERR_DBASE, "Error reading ambiguities.");

	rewind(fp);
	return 0;
}

