 
#include <stdio.h>
#include <string.h>
/****************************************************************************/
/*  xblast.c   2.0     Jean-Michel Claverie            Dec 92               */
/*  module to read a BLAST output and to generate a filtered query          */
/*  This version includes the -rd option				    */
/*  Default option:  " X " the matching part				    */
/*           -r:  keep the matching part, "X" the rest		            */
/*           -d:  build a db of each individual contiguous matching segment */
/*    each defline is then annotated with the [positions]		    */
/****************************************************************************/

#define MAXLINE 200
#define DEFLIMIT 500
#define SEQLIMIT 500000
#define MASK 'x'

main (argc, argv)
int argc;
char *argv[];
{
int fgt_line(), gt_1stword(), gt_fasta();
FILE *FIN, *QRY;
char line[MAXLINE], *pline, word[MAXLINE], X= MASK;
char defline[DEFLIMIT],defline2[DEFLIMIT],seq[SEQLIMIT], newseq[SEQLIMIT], *s, *ns;
int c, i, pos1, pos2, size, OptR=0 , OptD=0, Opt=0, match;

/***********************  test arguments and usages *************************/
if(argc<3 || argc >5)
  {fprintf(stderr,"\n USAGE:   %s  [-r]  blast.output query    <X_char>\n",argv[0]);
   fprintf(stderr,"\n   pipe | %s  [-r]       +       query    <X_char>\n",argv[0]);
   fprintf(stderr,"          %s  [-d]  blast.output query    <X_char>\n",argv[0]);
   fprintf(stderr,"   pipe | %s  [-d]       +       query    <X_char>\n",argv[0]);
   exit(0);}
/************************   fopen and tests  ********************************/
if (!strcmp(argv[1],"-r"))   OptR=1;             /* reverse option set */
if (!strcmp(argv[1],"-d"))   OptD=1;   /* reverse and divide options set */

Opt = OptR + OptD;		       /* Option offset */
if (*argv[1+Opt]=='+') FIN=stdin;
else if (NULL==(FIN=fopen(argv[1+Opt],"r")))
         { fprintf(stderr," ERROR: opening  %s  ?\n",argv[1+OptR]); exit(0);}

if (NULL==(QRY=fopen(argv[2+Opt],"r")))
         { fprintf(stderr," ERROR: opening  %s  ?\n",argv[2+Opt]); exit(0);}
if (argc==4+Opt) X= (*argv[3+Opt]);
/**************************** get fasta sequence in *************************/
if (2>=(size=gt_1stfasta(QRY, defline, seq)))
   { fprintf(stderr," ERROR: query  %s  ?\n",argv[2+Opt]); exit(0);}

/****************************************************************************/

while (1)		/*  skipping until Sequences */
	{
        c=fgt_line(FIN,MAXLINE,line);
	if (c==EOF)
	   { fprintf(stderr," ERROR: bad BLAST output %s ?\n",argv[1]);
	     exit(0);
	   }
        gt_1stword (line,word);
	if(!strcmp("Sequences",word)) break; /* found Sequences */
        }
	
if (!OptR && !OptD)     /*   default: X matching segments  */
   {   
   while (1)		/* Looking for Query line */
	{
        c=fgt_line(FIN,MAXLINE,line);
	if (c==EOF)  break;
        c=gt_1stword (line,word);
	if(!strcmp("Query:",word))   /* found Query line */
	  {
  	  pline=line+c;
          c=gt_1stword (pline,word); sscanf(word,"%d",&pos1);
	  pline += c;
	  c=gt_1stword (pline,word); /* skip sequence */
	  pline += c;
          c=gt_1stword (pline,word); sscanf(word,"%d",&pos2);

	   if (pos1> pos2)
		{ c= pos1; pos1= pos2; pos2= c;}

	   if (pos1 < 1 || pos2 > size)
          {fprintf(stderr," ERROR: query/ output inconsistency ?\n"); exit(0);}
 
          for (c=pos1-1; c<pos2; c++) seq[c]= X;

          }
        }
	printf("%s \n", defline);

	for (c=0, i=59; c<size ;  c++, i--)
	    { putchar(seq[c]);
	      if (!i) { putchar('\n'); i=60; }
	    }
	putchar('\n');
   }

if (OptR)        /*   keep matching segments, do not write empty seq   */
   { 

   for (s=seq, ns=newseq; *s != '\0'; s++, ns++) { *ns=*s, *s=X; }
   *ns = '\0';
   match=0;			/*  No match yet  */
  
   while (1)		/* Looking for Query line */
	{
        c=fgt_line(FIN,MAXLINE,line);
	if (c==EOF)  break;
        c=gt_1stword (line,word);
	if(!strcmp("Query:",word))   /* found Query line */
	  {
  	  pline=line+c;
          c=gt_1stword (pline,word); sscanf(word,"%d",&pos1);
	  pline += c;
	  c=gt_1stword (pline,word); /* skip sequence */
	  pline += c;
          c=gt_1stword (pline,word); sscanf(word,"%d",&pos2);

	   if (pos1> pos2)
		{ c= pos1; pos1= pos2; pos2= c;}

	   if (pos1 < 1 || pos2 > size)
          {fprintf(stderr," ERROR: query/ output inconsistency ?\n"); exit(0);}
 
        for (c=pos1-1; c<pos2; c++) seq[c]= newseq[c];
        match=1;     /* we now had one match, seq will be written out */
          }
        }
	
	if (match)
           {
	   printf("%s \n", defline);
	   for (c=0, i=59; c<size ;  c++, i--)
	       { putchar(seq[c]);
	         if (!i) { putchar('\n'); i=60; }
	       }
	       putchar('\n');
           }
   }

if (OptD)
   {
   /*  prepare the defline for this option */
   c= gt_1stword (defline+1,word);
   *defline = '>';
   strcpy(defline2, defline+c+1);  /* keep the rest of the line */
   strcpy(defline+1,word);

   for (s=seq, ns=newseq; *s != '\0'; s++, ns++) { *ns=*s, *s=X; }
   *ns = '\0';
   match=0;			/*  No match yet  */
  
   while (1)		/* Looking for Query line */
	{
        c=fgt_line(FIN,MAXLINE,line);
	if (c==EOF)  break;
        c=gt_1stword (line,word);
	if(!strcmp("Query:",word))   /* found Query line */
	  {
  	  pline=line+c;
          c=gt_1stword (pline,word); sscanf(word,"%d",&pos1);
	  pline += c;
	  c=gt_1stword (pline,word); /* skip sequence */
	  pline += c;
          c=gt_1stword (pline,word); sscanf(word,"%d",&pos2);

	   if (pos1> pos2)
		{ c= pos1; pos1= pos2; pos2= c;}

	   if (pos1 < 1 || pos2 > size)
          {fprintf(stderr," ERROR: query/ output inconsistency ?\n"); exit(0);}
 
        for (c=pos1-1; c<pos2; c++) seq[c]= newseq[c];
        match=1;     /* we now had one match, seq will be written out */
          }
        }
	
    if (match)
       {
       pos2=1; s=seq;
       while (pos2 <= size)
           {
           for (pos1=pos2; pos1<=size; pos1++,s++) if (*s!= X)  break;
           pos2 = pos1;
           if (pos2 <= size)    /* not the end of current seq */
              {
              for (   ; pos2 <=size; pos2++, s++) if ( *s == X ) break;
         
              printf("%s_%d_%d %s\n",defline,pos1,pos2-1,defline2);

	      for (c=pos1-1, i=59; c<pos2-1 ;  c++, i--)
	          { putchar(seq[c]);
	            if (!i) { putchar('\n'); i=60; }
	          }
	      putchar('\n');
              }
           } 
       }
   }        /*  end option -d   */

exit(0);
}

/* ---------------------------------------------------------------------*/
/* fgt_line(Fin, max,line)		JMC jan 1991       SGI		*/
/* #include <stdio.h> 							*/
/* function to input a line from an opened steam file into a char array,*/
/* with checking on the max length < max				*/
/* file "records" must be delimited by '\n'				*/
/* string is filled out until \n is encountered, or max-1 chars read	*/
/* or EOF occurs							*/
/* arguments:								*/
/*	Fin: FILE pointer to input stream				*/
/* 	max: array dimension [], max length including the final \0	*/
/*     line: string or char *						*/
/* return: the number of usable chars in the string (\0 not included)	*/
/*	 : or max if capacity was exceeded, line contains   		*/
/*	          max-1 usable characters.				*/
/*	 : 0 if NEW LINE was encountered first, line starts with '\0'	*/
/*	 :-1 if EOF was encountered, line starts with '\0'		*/
/* -------------------------------------------------------------------- */

fgt_line (Fin,max,line)
int max;
char line[];
FILE *Fin;

{
int c, i;

for (i=0; (c=fgetc(Fin))!= EOF && c!='\n' && i<(max-1);i++) line[i]=c;

line[i]='\0';          	/* replace NEW LINE or last by '\0', don't count */

if (c==EOF) return(-1); 			/* EOF encountered */
	
if (i==(max-1) && c!='\n') { while (fgetc(Fin)!='\n');     /* clean buffer */
			   return(max); }         	/* flag overflow   */
	return (i);
}  

/* sgt_line do the same things from a string */


/* -------------------------------------------------------------------	*/
/* int gt_1stword()         Jean-Michel Claverie      feb 1991    SGI	*/
/* module to read the first word of a closed string			*/
/* ARGUMENTS 								*/
/* IN:   line, a closed string						*/
/* OUT:  word, a closed string,  WITH sufficient space allocation	*/
/* RETURN: next: the relative position of the first characters NOT   	*/
/*         belonging to the first word in the string			*/
/*         thus, no more word in the string returns   0 (false)		*/
/* Note : the next call can then use : line + next, as a pointer to	*/
/*        initiate the search for the next 1st word in the string	*/
/* --------------------------------------------------------------------	*/
		
int gt_1stword (line,word)
char *line, *word;
{
char *p;
for ( p=line; isspace(*p) ; p++);  /* skeep leading blanks */
if (*p == '\0') return (0) ;		/* no word in line */

while (!isspace(*p) && *p!='\0'){ *word =(*p) ; word++; p++;}
*word='\0';
return (p-line);
}

/**********************************************************************	*/
/* int gt_1stfasta ()    jean-Michel Claverie       feb 92  SGI		*/
/* module to load the 1st defline and sequence from a fasta format file	*/
/* (DEFLIMIT-1)  chars of the first line are stored			*/
/* Definition line  must start with a " > "				*/
/* Sequence size input limited at SEQLIMIT-1				*/ 
/* in:									*/
/* *Fin : "r" opened file (fasta db)					*/
/* char *defline, char *seq : char arrays for defline, sequence	        */
/* out:									*/  
/* defline as a closed string, 						*/
/* seq as a closed string, no blank, lower or upper			*/
/* return: sequence size or EOF (-1) if no more entry    		*/
/**********************************************************************	*/

int gt_1stfasta(Fin, defline, seq)

FILE *Fin;
char *defline, *seq;
{
int i, c, size;
char *in;    
/* loading all definition line up to deflimit */ 

 if (EOF==(c=fgetc(Fin))) return c;              /* end of fasta file */
 if (c!='>') 
    { fprintf(stderr,"\n ERROR: missing '>' in def line\n");
      exit(0);
    } 

       *defline='>';
       for (i=DEFLIMIT-2, in=defline+1 ; i; i--, in++)
    { c=fgetc(Fin);
      if (c==EOF)
         {fprintf(stderr,"\n ERROR: unexpected EOF in def line\n");
           exit(0);
         }
      *in = c;
      if (*in=='\n') break;   /* end of def line reached  */
    }

if (!i)		/*  DEFLIMIT was reached, go find end of line  */
   {
   while (EOF !=(c=fgetc(Fin)))  if (c=='\n') break;
      if (c==EOF)
      {fprintf(stderr,"\n ERROR: unexpected EOF in def line %s\n", defline);
       exit(0);
      }
   }

 *in= '\0';      /* defline closure  */

/* loading sequence up to EOF, next '>'	or SEQLIMIT	*/

i=SEQLIMIT-1;
size=0;
while (EOF != (c=fgetc(Fin)) && c !='>')
      {
      if (!isspace (c)) { *seq =c; seq++; i--; size++; }
      if (!i)
         { fprintf (stderr,"\n ERROR: seq overflow %s %d \n", defline, size);
	   exit(0);
         }
      }

*seq='\0'; ungetc(c,Fin); 
return size; 

}

