/*
** C program to read in sequences in Fasta format
** test them for internal repeats
** and print them out with the internal repeats flagged
**
** David J. States and Jean Michel Claverie
** May 12, 1992 , REV 3.0 October 28, 1993
**
** Please cite: Jean Michel Claverie & David J. States (1993)
** Computers and Chemistry 17: 191-201.
** 
*/
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <memory.h>
#include <math.h>
#include <malloc.h>

#include "datadefs.h"

typedef struct {
        char    *title;
        char    *seq;
        int     len;
        int     max;
        } Sequence;

#define TMPSIZE 32000
#define STRINGSIZE 32000

int ascend=1;
int descend=1;

int verbose=0;

int ncut=4;
int mcut=1;

double pcut=0.01;
int scut=0;

char subchar='X';
int  repeats=0;
int *iseq,*hit;

int* Allocate(nbytes)
int nbytes;
{
        int* ptr;

        ptr = (int*)malloc(nbytes);
        if (ptr==NULL) {
		perror("Allocate memory failure");
		exit(1);
	}

        return(ptr);
}

void Free(ptr)
int *ptr;
{
        if (ptr != NULL) free(ptr);
}

char Translate(ch)
char ch;
{
        if (isupper(ch)) ch = tolower(ch);

        return(ch);
}

int AlphaToN2(ch,alphabet)
int ch;
char *alphabet;
{
        int i;
        char *p;

        if (isupper(alphabet[0]) && islower(ch)) ch = toupper(ch);
        if (islower(alphabet[0]) && isupper(ch)) ch = tolower(ch);
        p = strchr(alphabet,ch);

        if (p != NULL) i = (p - alphabet);
        else i = 22;

        return(i);
}

void ReadSequence(F,s)
FILE *F;
Sequence *s;
{       
        char ch;
        int l,ich;

        s->len = l = 0;

        if (s->title == NULL) {
                s->title = (char*)Allocate(STRINGSIZE*sizeof(char));
        }

        fgets(s->title,STRINGSIZE,F);
        if (s->title[strlen(s->title)-1] == '\n') 
              s->title[strlen(s->title)-1] = '\0';

        if (feof(F)) return;

	if (s->seq == NULL) {
                s->seq = (char*)Allocate(TMPSIZE*sizeof(int));
                s->max = TMPSIZE;
                iseq = (int*)Allocate(TMPSIZE*sizeof(int));
                hit =  (int*)Allocate(TMPSIZE*sizeof(int));
        }

        for (ich=getc(F); (ich!=EOF && ich!=(int)'>'); ich=getc(F) )
        {       
                ch = (char)ich;
                if (isprint(ch) && (ch != ' ')) s->seq[l++]=Translate(ch);

                if (l>=s->max) {
                        s->seq = (char*)realloc(s->seq,
                                        (s->max+TMPSIZE)*sizeof(char));
                        s->max += TMPSIZE;
                        iseq = (int*)realloc(iseq,(s->max+TMPSIZE)*sizeof(int));
                        hit =  (int*)realloc(hit, (s->max+TMPSIZE)*sizeof(int));
                }
        }

        s->seq[l] = '\0';
        s->len = l;

        if (ich==(int)'>') ungetc(ich,F);
}

/*
** Calculate the expected information content, in nats,
** for an alignment described by the scoring matrix m
** given the sequence composition f
*/
double ExpectedInformation(m,lambda,f)
SCOREMATRIX m;
double lambda,f[20];
{
	int i,j;
	double sum,tot,fij,eij;

	sum = tot = 0.0;

	for (i=0; i<20; i++)
	for (j=0; j<20; j++) {

		fij = f[i]*f[j];
		tot += fij;

		eij = m[i][j]*fij*exp(lambda*m[i][j]);
		sum += eij;
	}

	return(lambda*sum/tot);
}

/*
** Subroutine to actually process the sequence for repeats
** and write out the output
*/
void ProcessSeq(s,m,lambda,K,H)
Sequence *s;
SCOREMATRIX m;
double lambda,K,H;
{
	int i,j,k,off,sum,beg,end,top,noff;
	int topcut,fallcut;
	double s0;

	if (s->len == 0) return;

	for (i=0; i<s->len; i++) iseq[i] = AlphaToN2(s->seq[i],alphabet);
	for (i=0; i<s->len; i++) hit[i]=0;

	noff = s->len-1;
	if (ncut>0) noff=ncut;
/*
** Determine the score cutoff so that pcut will be the fraction
** of random sequence eliminated assuming lambda, K, and H are
** characteristic of the database as a whole
*/
	if (scut!=0) {
		topcut = scut;
	} else {
		s0 = - log( pcut*H / (noff*K) ) / lambda;
		if (s0>0) topcut = floor(s0 + log(s0)/lambda + 0.5);
		else topcut = 0;
	}
	fallcut = (int)log(K/0.001)/lambda;

        if (verbose) {
                printf("# Score cutoff = %d, Search from offsets %d to %d\n",
                        topcut,mcut,noff);
  
                if (ascend && descend) printf("# both members of each repeat flagged\n");
                else if (ascend) printf("# last member of each repeat flagged\n");
                else if (descend) printf("# first member of each repeat flagged\n");
  
                printf("# lambda = %.3f, K = %.3f, H = %.3f\n",
                        lambda,K,H);
        }

	for (off=mcut; off<=noff; off++) {

		sum=top=0;
		beg=off;
		end=0;

		for (i=off; i<s->len; i++) {
			sum += m[iseq[i]][iseq[i-off]];
			if (sum>top) {
				top=sum;
				end=i;
			}
			if (top>=topcut && top-sum>fallcut) {
				for (k=beg; k<=end; k++) {
					if (ascend) hit[k] = 1;
					if (descend) hit[k-off] = 1;
				}
				sum=top=0;
				beg=end=i+1;
			} else if (top-sum>fallcut) {
				sum=top=0;
				beg=end=i+1;
			}
			if (sum<0) {
				beg=end=i+1;
				sum=top=0;
			}
		}
		if (top>=topcut) {
			for (k=beg; k<=end; k++) {
				if (ascend) hit[k] = 1;
				if (descend) hit[k-off] = 1;
			}
		}
	}

	for (i=0; i<s->len; i++) {
		s->seq[i] = toupper(s->seq[i]);
		if (hit[i] ^ repeats) {
			if (subchar==0) {
				s->seq[i] = tolower(s->seq[i]);
			} else {
				s->seq[i] = subchar;
			}
		}
	}

	printf("%s\n",s->title);
	for (i=0; i<s->len; i+=60) {
		for (j=i; j<i+60 && j<s->len; j++) 
			putchar((int)s->seq[j]);
		putchar((int)'\n');
	}

}

main (argc,argv)
int argc;
char *argv[];
{
	int i,ich;
	int *m;
	char ch,*filename;
	double lambda,K,H;

	Sequence s;
	FILE *F;

	K = 0.2;
	m = (int*)pam120;
	lambda = lambda120;
	s.title=s.seq=NULL;
	filename = NULL;

	for (i=1; i<argc; i++) {
		if (strcmp(argv[i],"-p")==0) { pcut = atof(argv[++i]); continue; }
		if (strcmp(argv[i],"-n")==0) { ncut = atoi(argv[++i]); continue; }
		if (strcmp(argv[i],"-m")==0) { mcut = atoi(argv[++i]); continue; }
		if (strcmp(argv[i],"-s")==0) { scut = atoi(argv[++i]); continue; }

		if (strcmp(argv[i],"-60")==0)  { m = (int*)pam60;  lambda = lambda60;  continue; }
		if (strcmp(argv[i],"-250")==0) { m = (int*)pam250; lambda = lambda250; continue; }

		if (strcmp(argv[i],"-a")==0) { ascend = 1; descend = 0;continue; }
		if (strcmp(argv[i],"-d")==0) { ascend = 0; descend = 1;continue; }
		if (strcmp(argv[i],"-b")==0) { subchar = ' '; continue; }
		if (strcmp(argv[i],"-.")==0) { subchar = '.'; continue; }
		if (strcmp(argv[i],"-x")==0) { subchar = 'X'; continue; }
		if (strcmp(argv[i],"-o")==0) { subchar = '\000'; continue; }
		if (strcmp(argv[i],"-r")==0) { repeats = 1; continue; }
		if (strcmp(argv[i],"-v")==0) { verbose = 1; continue; }

		if (filename==NULL) { filename = argv[i]; }
		else {
			printf("Multiple filenames on command line, only one allowed\n");
			exit(1);
		}
	}

	if (filename == NULL) {
		printf("Usage: xnu fasta-sequences-file\n");
		printf("              [-60] [-120] [-250]            PAM matrix to use\n");
		printf("              [-p probability-cut]           default 0.01\n");
		printf("              [-s score-cut]\n");
		printf("              [-n max-search-offset]         default 4, 0->all diagonals\n");
		printf("              [-m min-search-offset]         default 1\n");
		printf("              [-a] [-d]                      ascending or descending\n");
		printf("              [-.] [-x] [-o]                 output options\n");
		printf("              [-r]                           print repeats instead of unique\n");
		printf("              [-v]                           verbose output\n");
		exit(0);
	}

	if (strcmp(filename,"-")==0) F = stdin;
	else F = fopen(filename,"r");
	if (F==NULL) {
		perror("Unable to open sequence file");
		exit(1);
	}

	H = ExpectedInformation(m,lambda,fdayhoff);

	s.len=0;
        for (ich=getc(F); ich!=EOF; ich=getc(F))
        {
                ch = (char)ich;
                if (ch=='>') {
                        ungetc(ich,F);

			ProcessSeq(&s,m,lambda,K,H);

                        ReadSequence(F,&s);
                }
        }

	ProcessSeq(&s,m,lambda,K,H);
exit(0);          /* to return 0 */
}
