/*
	gb2fasta.c

	Convert a list of GenBank-format sequence files into FASTA format
	(for subsequent processing by PRESSDB and use by BLASTN).

	Usage:  gb2fasta gbpri.seq gbrod.seq gbmam.seq ... > genbank.total

	Suggested alternative usage:

	gb2fasta -l GenPept genpept.seq > genpept

	Tested successfully on GenBank Release 66.0

	8/14/95 WRG - now parses NCBI gi identifiers from the flat file
	7/22/90 WRG
*/
#include <ncbi.h>
#include <gishlib.h>

/*
 * Forward declarations.  main() calls the record parsers before their
 * definitions appear further down, so they are declared here rather than
 * left to implicit declaration (which C99 and later no longer permit).
 */
void	usage();
int	get_locus();
int	get_ncbi_nid();
int	get_ncbi_gi();
int	get_definition();

FILE	*fp;	/* input file currently being read */
#define BUFFER_SIZE	256
char	buf[BUFFER_SIZE];	/* line buffer shared by main() and the parsers */

char	*label = "gb";	/* database tag for the FASTA header; replaced by -l */
char	*module = "gb2fasta";	/* program name printed by usage() */

/* Fields gathered for the sequence being parsed; main() clears them per sequence. */
char	locus[128];
char	accession[128];
char	definition[4096];
unsigned long	ncbigi;

/*
 * main -- convert each GenBank flat file named on the command line to FASTA,
 * writing the result to stdout.
 *
 * Options:
 *	-l label	database tag for the header line (default "gb")
 *
 * Each sequence is written as
 *	>[gi|<gi>|]<label>|<accession>|<locus>[ <definition>]
 * followed by its residues, upper-cased, one output line per line read.
 * A file that openfile() cannot open is skipped.
 *
 * Exits with status 0.  usage() ends the program with status 1 when no file
 * is named or an option is not recognized.
 */
int
main(int ac, char **av)
{
	int	i;
	char	*cp, ch;

	/* A lone "-" argument skips getopt and is handed to openfile() as a name. */
	if (ac == 2 && strcmp(av[1], "-") == 0)
		optind = ac-1;
	else
		while ((i = getopt(ac, av, "l:")) != -1)
			switch (i) {
			case 'l':
				label = optarg;
				continue;
			default:
				usage();
			}

	if (ac == optind)
		usage();

	for (i = optind; i < ac; ++i) {
		fp = openfile(av[i], "rb");
		if (fp == NULL)
			goto NextFile;

		for (;/* Each Nt. Sequence */;) {
			locus[0] = accession[0] = definition[0] = NULLB;
			ncbigi = 0;
			for (;/* Each Record */;) {
				if (fgets(buf, BUFFER_SIZE, fp) == NULL)
					goto NextFile;
Switch:
				/*
				 * get_definition() and get_ncbi_gi() read ahead, so after
				 * they return buf already holds the next line to dispatch;
				 * those cases jump back here instead of reading again.
				 */
				switch (buf[0]) {
				case 'L':
					if (strncmp(buf, "LOCUS", 5))
						continue;
					get_locus(buf);
					continue;
				case 'D':
					if (strncmp(buf, "DEFINITION", 10))
						continue;
					/* Nonzero means the read-ahead hit end of file: buf is stale. */
					if (get_definition(buf) != 0)
						goto NextFile;
					goto Switch;
				case 'C':
					if (strncmp(buf, "COMMENT", 7))
						continue;
					/*
					 * Nonzero means end of file.  Re-dispatching the stale
					 * COMMENT line here would never terminate.
					 */
					if (get_ncbi_gi(buf) != 0)
						goto NextFile;
					goto Switch;
				case 'N':
					if (strncmp(buf, "NID", 3))
						continue;
					get_ncbi_nid(buf);
					continue;
				case 'A':
					if (strncmp(buf, "ACCESSION", 9))
						continue;
					/* Only the first ACCESSION line of a sequence is used. */
					if (accession[0] != NULLB)
						continue;
					if (strtok(buf, " \t\n\r") != NULL) {
						cp = strtok(NULL, " \t\n\r");
						if (cp != NULL) {
							/* buf can hold a longer token than accession[]: clip it. */
							if (strlen(cp) >= sizeof accession)
								cp[sizeof accession - 1] = NULLB;
							strcpy(accession, cp);
						}
					}
					continue;
				case 'O':
					if (strncmp(buf, "ORIGIN", 6) == 0)
						goto GetSeq;
					continue;
				default:
					continue;
				}
			}

GetSeq:
			/* Header line. */
			putchar('>');
			if (ncbigi != 0)
				fprintf(stdout, "gi|%lu|", ncbigi);
			if (label[0] != NULLB)
				fputs(label, stdout);
			else
				fputs("???", stdout);
			putchar('|');
			if (accession[0] != NULLB)
				fputs(accession, stdout);
			putchar('|');
			if (locus[0] != NULLB)
				fputs(locus, stdout);

			if (definition[0]) {
				putchar(' ');
				fputs(definition, stdout);
			}
			putchar('\n');

			/* Sequence lines run until a line starting with '/' or end of file. */
			while (fgets(buf, BUFFER_SIZE, fp) != NULL && buf[0] != '/') {
				cp = buf;
				while ((ch = *cp++) != NULLB) {
					/* <ctype.h> needs an unsigned char value; plain char may be negative. */
					if (isalpha((unsigned char)ch)) {
						if (islower((unsigned char)ch))
							ch = toupper((unsigned char)ch);
						putchar(ch);
					}
				}
				putchar('\n');
			}

		} /* for (;;) */


NextFile:
		if (fp != NULL)
			fclose(fp);
		continue;
	}
	exit(0);
}


/*
 * get_locus -- store the entry name from a LOCUS line in the global locus[].
 *
 * buf:	a line beginning with "LOCUS".  The name is the first run of
 *	non-whitespace characters after that keyword; a name too long for
 *	locus[] is truncated to fit.
 *
 * Returns 0.  If locus[] is already set when another LOCUS line arrives,
 * prints the pending name on stderr and exits with status 1.
 */
int
get_locus(CharPtr buf)
{
	CharPtr	cp;
	int	len;

	if (locus[0] != NULLB) {
		fprintf(stderr, "\nLOCUS without a sequence\n");
		fprintf(stderr, "LOCUS %s\n", locus);
		exit(1);
	}

	/* Skip the keyword and the blanks that follow it. */
	cp = buf+5;
	while (*cp != NULLB && isspace((unsigned char)*cp))
		++cp;

	/* Copy the name, always leaving room for the terminator. */
	for (len = 0; *cp != NULLB && !isspace((unsigned char)*cp); ++cp) {
		if (len < (int)sizeof locus - 1)
			locus[len++] = *cp;
	}
	locus[len] = NULLB;
	return 0;
}

/*
 * get_ncbi_nid -- read an NCBI gi number from an NID line into ncbigi.
 *
 * buf:	a line beginning with "NID".  Lines shorter than 14 characters are
 *	ignored.  The first non-blank character after the keyword must be
 *	'g'; the unsigned decimal number that follows is stored in ncbigi.
 *	If no number can be read, ncbigi keeps its previous value.
 *
 * Always returns 0.
 */
int
get_ncbi_nid(CharPtr buf)
{
	CharPtr	cp;

	if (strlen(buf) < 14)
		return 0;

	cp = buf + 3;
	/* <ctype.h> needs an unsigned char value; plain char may be negative. */
	while (*cp != '\0' && isspace((unsigned char)*cp))
		++cp;

	if (*cp != 'g') /* not an NCBI "gi" identifier */
		return 0;
	sscanf(cp+1, "%lu", &ncbigi);
	return 0;
}

/*
 * get_ncbi_gi -- look through a COMMENT block for "NCBI gi: <number>" and
 * store the number in ncbigi.
 *
 * buf:	on entry, the COMMENT line.  The function reads further lines from
 *	the global fp into buf (BUFFER_SIZE bytes), so buf must be at least
 *	that large.
 *
 * The tag is recognized at column 13 of the COMMENT line itself, or at
 * column 13 of a continuation line that directly follows a blank line.
 *
 * Returns 0 with buf holding the next line the caller still has to handle.
 * Returns 1 when end of file (or a read error) stopped the scan; buf is then
 * set to the empty string so a caller that re-examines it finds nothing.
 */
int
get_ncbi_gi(CharPtr buf)
{
	int	emptyline = 0;

	if (strlen(buf) > 22 && buf[12] == 'N' && strncmp(buf+12, "NCBI gi: ", 9) == 0) {
		sscanf(buf+21, "%lu", &ncbigi);
		/* Read ahead one line for the caller; do not leave stale text on EOF. */
		if (fgets(buf, BUFFER_SIZE, fp) == NULL) {
			buf[0] = NULLB;
			return 1;
		}
		return 0;
	}

	for (;;) {
		/*
		 * Clear column 13 before every read.  A line shorter than 13
		 * bytes leaves buf[12] untouched, and a leftover '\n' or 'N'
		 * from an earlier line would otherwise be mistaken for this one.
		 */
		buf[12] = NULLB;
		if (fgets(buf, BUFFER_SIZE, fp) == NULL)
			break;

		/*
		 * Blank comment line: either empty, or indented with the newline
		 * in column 13.  The indent test keeps a 12-character keyword
		 * line from being taken for a blank.
		 */
		if (buf[0] == '\n' || (buf[0] == ' ' && buf[12] == '\n')) {
			emptyline = 1;
			continue;
		}
		if (emptyline && buf[12] == 'N' && strncmp(buf+12, "NCBI gi: ", 9) == 0) {
			sscanf(buf+21, "%lu", &ncbigi);
			return 0;
		}
		emptyline = 0;
		/* A line not starting with a blank ends the COMMENT block. */
		if (buf[0] != ' ')
			return 0;
	}

	buf[0] = NULLB;
	return 1;
}

/*
 * get_definition -- collect a DEFINITION record, including its continuation
 * lines, into the global definition[].
 *
 * buf:	on entry, the DEFINITION line.  Continuation lines (those starting
 *	with a blank) are read from the global fp into buf, BUFFER_SIZE bytes
 *	at a time, so buf must be at least that large.
 *
 * Continuation text is appended after a single space.  Text that does not
 * fit in definition[] is dropped.  A definition consisting only of "." is
 * replaced by "No DEFINITION available".
 *
 * Returns 0 with buf holding the first line after the record, which the
 * caller still has to handle.  Returns 1 when end of file (or a read error)
 * stopped the scan; buf is then set to the empty string so a caller that
 * re-examines it finds nothing.
 *
 * If definition[] is already set, prints a diagnostic (with the current
 * locus, if any) on stderr and exits with status 2.
 */
int
get_definition(CharPtr buf)
{
	CharPtr	cp;
	int	ch, len, len2;
	int	room = (int)sizeof definition - 1;	/* most text definition[] can hold */
	int	at_eof = 0;

	if (definition[0] != NULLB) {
		fprintf(stderr, "\nDuplicate DEFINITION records for a single sequence\n");
		if (locus[0])
			fprintf(stderr, "LOCUS %s\n", locus);
		exit(2);
	}

	/* Skip the keyword and the blanks that follow it. */
	cp = buf+10;
	while ((ch = *cp) != NULLB && isspace((unsigned char)ch))
		++cp;
	definition[len = 0] = NULLB;
	if (ch != NULLB) {
		/* Drop only a real line terminator, not whatever character is last. */
		len = strlen(cp);
		while (len > 0 && (cp[len-1] == '\n' || cp[len-1] == '\r'))
			--len;
		if (len > room)
			len = room;
		Nlm_MemCpy(definition, cp, len);
		definition[len] = NULLB;
	}

	for (;;) {
		if (fgets(buf, BUFFER_SIZE, fp) == NULL) {
			/* Do not leave the previous line in buf for the caller. */
			buf[0] = NULLB;
			at_eof = 1;
			break;
		}
		if (buf[0] != ' ')
			break;

		cp = buf+1;
		while ((ch = *cp) != NULLB && isspace((unsigned char)ch))
			++cp;
		if (ch == NULLB)
			continue;

		len2 = strlen(cp);
		while (len2 > 0 && (cp[len2-1] == '\n' || cp[len2-1] == '\r'))
			--len2;

		/* Append " text", clipped to the space that is left. */
		if (len < room) {
			definition[len++] = ' ';
			if (len2 > room - len)
				len2 = room - len;
			Nlm_MemCpy(definition + len, cp, len2);
			len += len2;
			definition[len] = NULLB;
		}
	}

	if (len == 1 && definition[0] == '.')
		strcpy(definition, "No DEFINITION available");
	return at_eof;
}


/*
 * usage -- print a one-line description and the command synopsis on stderr,
 * then end the program with exit status 1.
 */
void
usage()
{
	fputs("Purpose:  convert a GenBank-format file into FASTA format.\n", stderr);
	fprintf(stderr, "Usage:  %s [-l label] dbfile\n", module);
	exit(1);
}

