771 lines
18 KiB
C
771 lines
18 KiB
C
|
/*
|
||
|
Title: seqIOEdit
|
||
|
|
||
|
File: seqIOEdit.c
|
||
|
Purpose: IO of the editted portion of plain or edited sequences
|
||
|
|
||
|
Last update: Monday 24 February 1992
|
||
|
|
||
|
Change log :-
|
||
|
15.01.91 SD New include file (opp.h)
|
||
|
04.12.91 lfw added sample lanes up to 40 instead of 24
|
||
|
04.12.91 changed the way the left cutoff is found...now I allow
|
||
|
look for first occurrence of the left cutting sequence;
|
||
|
looking first for an exact match, then a match with
|
||
|
one mismatch, then with two...if nothing is found at
|
||
|
that point I assume the left cutoff is not there
|
||
|
24.02.92 SD Fixed bug in findRightCutoff() when checking for overlap with
|
||
|
leftCutoff. There was some confusion over what the value of
|
||
|
rightCutoff actually means.
|
||
|
18.11.92 lfw changed the names of the temporary files to .abc* instead
|
||
|
of abc*, also used the remove() command rather than system() to get
|
||
|
rid of the files and added another remove call so they will be sure to
|
||
|
be removed
|
||
|
|
||
|
|
||
|
*/
|
||
|
|
||
|
|
||
|
/* ---- Imports ---- */
|
||
|
#include <stdlib.h>
|
||
|
#include "seqIOEdit.h"
|
||
|
|
||
|
#include "seq.h"/* IMPORT: Seq, BasesAndTraces, NULLSeq,
|
||
|
newSeq, freeSeq */
|
||
|
#include "opp.h" /* IMPORT: oppInitialise */
|
||
|
#include "dialogues.h"
|
||
|
|
||
|
|
||
|
extern int findPercntAmbig(char *theSeq, int num_bases);
|
||
|
|
||
|
/* ---- Internal Functions ---- */
|
||
|
|
||
|
int findPercntAmbig(char *theSeq, int num_bases);
|
||
|
|
||
|
static void text_to_output(char *vec,int stp,int endp,int dvice,char *outfile)
|
||
|
|
||
|
/*
|
||
|
* text_to_output(vec,stp,endp,dvice,outfile)
|
||
|
* input: char **vec,*outfile; int stp, endp, dvice;
|
||
|
* this program output a specified portion of a genbank file
|
||
|
* (from vec[stp] to vec[endp]) to the screen (default), a file (dvice
|
||
|
* =1), or lpr (dvice =2). Outfile is the input filename if you wish to
|
||
|
* output the information to a specified file. stp and endp are integers,
|
||
|
* not pointers.
|
||
|
*/
|
||
|
|
||
|
{
|
||
|
int i;
|
||
|
FILE *fopen(),*fp,*where;
|
||
|
|
||
|
|
||
|
if (dvice == 1) {
|
||
|
if ((fp = fopen(outfile,"a"))==NULL) {
|
||
|
printf ("\nERROR: can't open file %s\n",outfile);
|
||
|
return;
|
||
|
}
|
||
|
else where = fp;
|
||
|
}
|
||
|
else if (dvice == 2) {
|
||
|
if ((fp = fopen("junkfile.","a"))==NULL) {
|
||
|
printf ("\nERROR: can't open file junkfile. to output to the lpr\n");
|
||
|
return;
|
||
|
}
|
||
|
else where = fp;
|
||
|
}
|
||
|
else {
|
||
|
/* default : */
|
||
|
where = stdout;
|
||
|
}
|
||
|
|
||
|
for (i = stp; i < endp; i++)
|
||
|
putc(vec[i],where);
|
||
|
|
||
|
if ((dvice == 1) || (dvice == 2)) fclose(fp);
|
||
|
|
||
|
}
|
||
|
|
||
|
int checkForExistingEdFile(char *fn)
|
||
|
/*
|
||
|
* check to see if there is an existing .seq.n file;
|
||
|
* return the largest n or 0 if no files existed,
|
||
|
* return a -1 if it was a problem with opening files
|
||
|
*/
|
||
|
|
||
|
{
|
||
|
char vec[500];
|
||
|
int last_ed_num;
|
||
|
FILE *fp;
|
||
|
int i;
|
||
|
|
||
|
/* make sure there are no files with the names I'm about to use*/
|
||
|
remove (".abcxyztmpsh.");
|
||
|
remove (".abcxyztmpout.");
|
||
|
|
||
|
|
||
|
/* write a little shell to see the last n in your_filename.n
|
||
|
in the current directory */
|
||
|
if ((fp=fopen(".abcxyztmpsh.", "w")) == NULL) return(-1);
|
||
|
fclose(fp);
|
||
|
sprintf(vec,"for i in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 \n");
|
||
|
i = strlen(vec);
|
||
|
sprintf(vec+i," do\n oldname=%s.$i\n if test -f $oldname\n then echo $i\n",fn);
|
||
|
i = strlen(vec);
|
||
|
sprintf(vec+i," fi\ndone\n");
|
||
|
|
||
|
text_to_output(vec,0,strlen(vec),1,".abcxyztmpsh.");
|
||
|
|
||
|
/* execute the shell and have it output the last number it found into
|
||
|
a file called abcxyztmpout. */
|
||
|
system("sh .abcxyztmpsh. | tail -1 > .abcxyztmpout.");
|
||
|
|
||
|
/* read that number from that file */
|
||
|
|
||
|
if ((fp=fopen(".abcxyztmpout.", "r")) == NULL) return(-1);
|
||
|
vec[0]=getc(fp);
|
||
|
fclose(fp);
|
||
|
/* if that number was EOF return(0), nothing found */
|
||
|
if (vec[0]==EOF) {
|
||
|
remove (".abcxyztmpsh.");
|
||
|
remove (".abcxyztmpout.");
|
||
|
|
||
|
return(0);
|
||
|
}
|
||
|
|
||
|
vec[1]='\0';
|
||
|
|
||
|
sscanf(vec,"%d",&last_ed_num);
|
||
|
remove (".abcxyztmpsh.");
|
||
|
remove (".abcxyztmpout.");
|
||
|
|
||
|
return(last_ed_num);
|
||
|
}
|
||
|
|
||
|
Boolean isDotSeq(char *fn)
|
||
|
/* make sure there is a .seq on the end of fn, puts one on
|
||
|
if there is not */
|
||
|
{
|
||
|
int i;
|
||
|
|
||
|
i = strlen(fn)-1;
|
||
|
if (fn[i]!='q' || fn[i-1]!='e' || fn[i-2]!='s' || fn[i-3]!='.')
|
||
|
return(False);
|
||
|
else return(True);
|
||
|
}
|
||
|
|
||
|
void stripDotSeq(char *fn)
|
||
|
/* there is a .seq on the end of fn, strip it off */
|
||
|
|
||
|
{ int i;
|
||
|
|
||
|
i = strlen(fn)-1;
|
||
|
|
||
|
if (fn[i]=='q' && fn[i-1]=='e' && fn[i-2]=='s' && fn[i-3]=='.')
|
||
|
fn[i-3]='\0';
|
||
|
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
void stripDotNum(char *fn)
|
||
|
/* if there is a .num on the end of fn, strip it off */
|
||
|
|
||
|
{ int i;
|
||
|
|
||
|
i = strlen(fn)-1;
|
||
|
|
||
|
if (isdigit(fn[i]) && fn[i-1]=='.')
|
||
|
fn[i-1]='\0';
|
||
|
else if (isdigit(fn[i]) && isdigit(fn[i-1]) && fn[i-2]=='.')
|
||
|
fn[i-2]='\0';
|
||
|
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
int isDotNum(char *fn)
|
||
|
/*
|
||
|
* checks if there is a .1 or .2 or .m on the inputfilename.
|
||
|
* If there is, then returns that num. If not returns -1.
|
||
|
*/
|
||
|
{
|
||
|
int i,j;
|
||
|
int dotnum;
|
||
|
char *atemp;
|
||
|
|
||
|
atemp = (char *)calloc(20,sizeof(char));
|
||
|
|
||
|
i = strlen(fn)-1;
|
||
|
j = 0;
|
||
|
|
||
|
if (isdigit(fn[i]) && (fn[i-1]=='.')) {
|
||
|
atemp[0] = fn[i];
|
||
|
atemp[1] = fn[i+1];
|
||
|
sscanf(atemp,"%d",&dotnum);
|
||
|
free(atemp);
|
||
|
return(dotnum);
|
||
|
}
|
||
|
else if (isdigit(fn[i]) && isdigit(fn[i-1]) && fn[i-2]=='.') {
|
||
|
atemp[0] = fn[i-1];
|
||
|
atemp[1] = fn[i];
|
||
|
atemp[2] = '\0';
|
||
|
sscanf(atemp,"%d",&dotnum);
|
||
|
free(atemp);
|
||
|
return(dotnum);
|
||
|
}
|
||
|
|
||
|
free(atemp);
|
||
|
return(-1);
|
||
|
}
|
||
|
|
||
|
static void get_compl_seq(char *ac_seq,char *aseq,int stp,int endp, int seq_len, int rev)
|
||
|
|
||
|
/*
|
||
|
* uses aseq to find the sequence that
|
||
|
* would appear on the opposite strand and places
|
||
|
* that sequence in ac_seq; stp is the starting point on
|
||
|
* aseq and endp is the ending point on aseq;
|
||
|
* seq_len is the length of the input sequence,
|
||
|
* if rev==1 reverses as well as complements, if rev==0
|
||
|
* only complements
|
||
|
*/
|
||
|
{
|
||
|
int i;
|
||
|
|
||
|
oppInitialize();
|
||
|
|
||
|
if (rev == 1) {
|
||
|
for (i = stp; i <= endp; i++)
|
||
|
ac_seq[seq_len - i] = opp[aseq[i]];
|
||
|
}
|
||
|
else {
|
||
|
for (i = stp; i <= endp; i++)
|
||
|
ac_seq[i] = opp[aseq[i]];
|
||
|
ac_seq[i-1] ='\0';
|
||
|
}
|
||
|
|
||
|
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
|
||
|
/* ---- Externals ---- */
|
||
|
|
||
|
Boolean writeEdSeq(Seq seq, char *fn)
|
||
|
|
||
|
{
|
||
|
FILE *fp;
|
||
|
int i,j;
|
||
|
char ed_fn[200];
|
||
|
int last_ed_num; /* largest n of fn.seq.n in
|
||
|
current directory */
|
||
|
|
||
|
oppInitialize();
|
||
|
|
||
|
/* make sure the filename does not have a .seq on the end */
|
||
|
stripDotSeq(fn);
|
||
|
|
||
|
/* get the n to put on inputfilename.n for the edited
|
||
|
file to be kept */
|
||
|
last_ed_num = checkForExistingEdFile(fn);
|
||
|
|
||
|
if (last_ed_num == -1) return(False);
|
||
|
else sprintf(ed_fn,"%s.%d",fn,last_ed_num+1);
|
||
|
/* that line takes care of 0 too, because
|
||
|
it names the file fn.seq.1 */
|
||
|
|
||
|
/* Open for writing, text */
|
||
|
if ((fp=fopen(ed_fn, "w")) == NULL) return(False);
|
||
|
|
||
|
|
||
|
/* write information in the following format:
|
||
|
NedBases*edits array*edBase array*edBasePos array*
|
||
|
each division ends in an * and within
|
||
|
each division entries are separated by spaces */
|
||
|
|
||
|
if (seq->bottom) {
|
||
|
fprintf(fp," %6d*%3d*%6d*%6d*",seq->NorigBases,seq->NedBases,seq->rightCutoff,seq->leftCutoff);
|
||
|
}
|
||
|
else {
|
||
|
fprintf(fp," %6d*%3d*%6d*%6d*",seq->NorigBases,seq->NedBases,seq->leftCutoff,seq->rightCutoff);
|
||
|
}
|
||
|
|
||
|
/* print out edits array */
|
||
|
|
||
|
if (seq->bottom) {
|
||
|
for (i=seq->NorigBases+MaxEdits-1;
|
||
|
i > -1;
|
||
|
i--)
|
||
|
{
|
||
|
j = 0;
|
||
|
if (i==seq->NorigBases+MaxEdits-1)
|
||
|
/* this part is a fudge to stick in 0 0 */
|
||
|
fprintf(fp,"%6d %6d ",j,j);
|
||
|
|
||
|
|
||
|
if (((seq->edits[i]!=0) && ((seq->NedBases -1- i)>=0)) || i==0){
|
||
|
if (seq->edits[i] <0)
|
||
|
fprintf(fp,"%6d %6d ",seq->NedBases -1- i,seq->edits[i]);
|
||
|
else
|
||
|
fprintf(fp,"%6d %6d ",seq->NedBases -1 -i,seq->NorigBases - 1-(seq->edits[i]));
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
else {
|
||
|
for (i=0;
|
||
|
i<seq->NorigBases+MaxEdits;
|
||
|
i++) {
|
||
|
|
||
|
if ((seq->edits[i]!=0) || (i==0))
|
||
|
fprintf(fp,"%6d %6d ",i,seq->edits[i]);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
fprintf(fp,"%6d %6d ",NULLPoint,NULLPoint);
|
||
|
|
||
|
fprintf(fp,"*");
|
||
|
|
||
|
/* print out non-NULL entries in edBase array */
|
||
|
for (i=1;
|
||
|
i<MaxEdits;
|
||
|
i++)
|
||
|
{
|
||
|
if (seq->edBase[i] == NULL) {
|
||
|
fprintf(fp,"* ");
|
||
|
break;
|
||
|
}
|
||
|
else {
|
||
|
if (seq->bottom)
|
||
|
fprintf(fp,"%c ",opp[seq->edBase[i]]);
|
||
|
else
|
||
|
fprintf(fp,"%c ",seq->edBase[i]);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
fprintf(fp,"*");
|
||
|
|
||
|
/* print out non-NULL entries in edBasePos array */
|
||
|
|
||
|
for (i=1;
|
||
|
i<MaxEdits;
|
||
|
i++)
|
||
|
{
|
||
|
|
||
|
if (seq->edBasePos[i] == NULLPoint) {
|
||
|
fprintf(fp,"%6d ",NULLPoint); /* -1 is NULLPoint*/
|
||
|
break;
|
||
|
}
|
||
|
else {
|
||
|
int fudge;
|
||
|
/*
|
||
|
* when you're plotting the strand in the reverse
|
||
|
* order, you must move the starting position over by
|
||
|
* the width of one character. Because positions in the
|
||
|
* other file, already take into account the character
|
||
|
* width. Therefore, fudge = ~character width + basePos
|
||
|
* of the first base
|
||
|
*/
|
||
|
|
||
|
fudge = seq->basePos[0] + 6;
|
||
|
|
||
|
if (seq->bottom) {
|
||
|
/* fprintf(fp,"%6d ",seq->basePos[seq->NorigBases-1]-seq->edBasePos[i]+fudge); */
|
||
|
fprintf(fp,"%6d ",seq->NPoints-seq->edBasePos[i]);
|
||
|
}
|
||
|
else
|
||
|
fprintf(fp,"%6d ",seq->edBasePos[i]);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
fprintf(fp,"*");
|
||
|
(void) fclose(fp);
|
||
|
return(True);
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
Boolean readEdSeq(Seq seq, char *fn, int dotnum)
|
||
|
/*
|
||
|
* reads in the most recent fn.seq.n file. This
|
||
|
* file should be of the form:
|
||
|
* basePos;edBasePos;edBase
|
||
|
* returns True if it read in the sequence from
|
||
|
* an editted file, and False if there was no
|
||
|
* editted file or if there was a problem
|
||
|
*
|
||
|
* dotnum == -1 if the user did not specify a
|
||
|
* version number of the sequence to read in,
|
||
|
* if they want version m read in.
|
||
|
*/
|
||
|
|
||
|
{
|
||
|
int last_ed_num;
|
||
|
char ed_fn[200];
|
||
|
FILE *fp;
|
||
|
|
||
|
oppInitialize();
|
||
|
|
||
|
/* make sure the filename does not have a .seq on the end */
|
||
|
stripDotSeq(fn);
|
||
|
|
||
|
if (dotnum == -1) {
|
||
|
/* get the n to put on fn.seq.n */
|
||
|
last_ed_num = checkForExistingEdFile(fn);
|
||
|
if (last_ed_num == -1) return(False);
|
||
|
else if (last_ed_num == 0) return(False);
|
||
|
else sprintf(ed_fn,"%s.%d",fn,last_ed_num);
|
||
|
}
|
||
|
else
|
||
|
sprintf(ed_fn,"%s.%d",fn,dotnum);
|
||
|
|
||
|
|
||
|
/* Open for reading, text */
|
||
|
if ((fp=fopen(ed_fn, "r")) == NULL) return(False);
|
||
|
fclose(fp);
|
||
|
|
||
|
if (processEdSeqFile(seq,ed_fn))
|
||
|
return(True);
|
||
|
else return(False);
|
||
|
}
|
||
|
|
||
|
Boolean processEdSeqFile(Seq seq, char *fn)
|
||
|
|
||
|
/*
|
||
|
* processes a char vector containing the Editted
|
||
|
* sequence in the following format:
|
||
|
*
|
||
|
* NedBases*NorigBases*leftCutoff*rightCutoff*negative
|
||
|
* components of edits array in the form (position,
|
||
|
* negative number)*non NULL components edBase array*
|
||
|
* non NULL components edBasePos array*
|
||
|
*
|
||
|
* each division ends in an H. The end of the
|
||
|
* edits array is signaled by a -1 -1 entry. Within
|
||
|
* each division entries are separated by spaces
|
||
|
*
|
||
|
* sticks the information it finds into the seq array
|
||
|
*
|
||
|
* returns false if there was a problem with the file format
|
||
|
*/
|
||
|
{
|
||
|
FILE *fp;
|
||
|
int nbases;
|
||
|
char achar;
|
||
|
int i,j,k;
|
||
|
char ed_fn[200];
|
||
|
|
||
|
strcpy(ed_fn,fn);
|
||
|
|
||
|
if ((fp=fopen(ed_fn, "r")) == NULL) return(False);
|
||
|
|
||
|
fscanf(fp," %6d",&nbases);
|
||
|
achar = getc(fp);
|
||
|
if (achar != '*') {
|
||
|
printf("ERROR: Input editted sequence was of wrong format \n(No asterisk was found after the number of bases)\n");
|
||
|
fclose(fp);
|
||
|
return(False);
|
||
|
}
|
||
|
|
||
|
|
||
|
fscanf(fp,"%3d",&i);
|
||
|
achar = getc(fp);
|
||
|
if (achar != '*') {
|
||
|
printf("ERROR: Input editted sequence was of wrong format \n(No asterisk was found after the number of edited bases)\n");
|
||
|
fclose(fp);
|
||
|
return(False);
|
||
|
}
|
||
|
|
||
|
seq->NedBases = i;
|
||
|
if (nbases != seq->NorigBases) {
|
||
|
printf("ERROR: Input editted sequence was of wrong format\n (Number of editted bases has changed)\n");
|
||
|
fclose(fp);
|
||
|
return(False);
|
||
|
}
|
||
|
|
||
|
|
||
|
fscanf(fp,"%6d",&j);
|
||
|
achar = getc(fp);
|
||
|
if (achar != '*') {
|
||
|
printf("ERROR: Input editted sequence was of wrong format \n(No asterisk was found after the left cutoff)\n");
|
||
|
fclose(fp);
|
||
|
return(False);
|
||
|
}
|
||
|
|
||
|
fscanf(fp,"%6d",&k);
|
||
|
achar = getc(fp);
|
||
|
if (achar != '*') {
|
||
|
printf("ERROR: Input editted sequence was of wrong format \n(No asterisk was found after the right cutoff)\n");
|
||
|
fclose(fp);
|
||
|
return(False);
|
||
|
}
|
||
|
|
||
|
if (seq->bottom) {
|
||
|
seq->leftCutoff = k;
|
||
|
seq->rightCutoff = j;
|
||
|
}
|
||
|
else {
|
||
|
seq->leftCutoff = j;
|
||
|
seq->rightCutoff = k;
|
||
|
}
|
||
|
|
||
|
/* read in the seq->edits array */
|
||
|
|
||
|
for (i=1;
|
||
|
i<seq->NorigBases+MaxEdits;
|
||
|
i++)
|
||
|
{
|
||
|
fscanf(fp,"%6d %6d ",&j,&k);
|
||
|
if (j==NULLPoint) break;
|
||
|
else {
|
||
|
if (seq->bottom) {
|
||
|
if (k<0) /* then it is a reference to the edBase array */
|
||
|
seq->edits[seq->NedBases-1-j]=k;
|
||
|
else /*it is a base number which needs to be converted
|
||
|
to the base number on the opposite strand */
|
||
|
seq->edits[seq->NedBases-1-j]=seq->NorigBases-1-k;
|
||
|
}
|
||
|
else
|
||
|
seq->edits[j]=k;
|
||
|
}
|
||
|
|
||
|
}
|
||
|
achar = getc(fp);
|
||
|
if (achar != '*') {
|
||
|
printf("ERROR: Input editted sequence was of wrong format \n(Error in the edits array)\n");
|
||
|
fclose(fp);
|
||
|
return(False);
|
||
|
}
|
||
|
|
||
|
/* read in the seq->edBase array */
|
||
|
for (i=1;
|
||
|
i<MaxEdits;
|
||
|
i++)
|
||
|
{
|
||
|
fscanf(fp,"%c ",&achar);
|
||
|
if (achar == '*') break;
|
||
|
else {
|
||
|
if (seq->bottom)
|
||
|
seq->edBase[i]=opp[achar];
|
||
|
else
|
||
|
seq->edBase[i]=achar;
|
||
|
}
|
||
|
}
|
||
|
achar = getc(fp);
|
||
|
if (achar != '*') {
|
||
|
printf("Input editted sequence was of wrong format\n(Extraneous information after edBases and before the *\n");
|
||
|
fclose(fp);
|
||
|
return(False);
|
||
|
}
|
||
|
|
||
|
|
||
|
/* read in the seq->edBasePos array */
|
||
|
|
||
|
for (i=1;
|
||
|
i<MaxEdits;
|
||
|
i++)
|
||
|
{
|
||
|
fscanf(fp,"%6d ",&k);
|
||
|
if (k==NULLPoint) break;
|
||
|
else {
|
||
|
int fudge;
|
||
|
|
||
|
/*
|
||
|
* when you're plotting the strand in the reverse
|
||
|
* order, you must move the starting position over by
|
||
|
* the width of one character. Because positions in the
|
||
|
* ABI File, already take into account the character
|
||
|
* width. Therefore, fudge = ~character width + offset of
|
||
|
* the first peaks position
|
||
|
*/
|
||
|
|
||
|
fudge = seq->basePos[0] + 6;
|
||
|
|
||
|
if (seq->bottom) {
|
||
|
/* seq->edBasePos[i]=seq->basePos[seq->NorigBases-1]-k+fudge;*/
|
||
|
seq->edBasePos[i]=seq->NPoints - k;
|
||
|
}
|
||
|
else
|
||
|
seq->edBasePos[i]=k;
|
||
|
}
|
||
|
|
||
|
}
|
||
|
achar = getc(fp);
|
||
|
if (achar != '*') {
|
||
|
printf("ERROR: Input editted sequence was of wrong format\n(Error in editted Base Position array)\n");
|
||
|
fclose(fp);
|
||
|
return(False);
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* Don't set the seq to Dirty, otherwise the user won't know
|
||
|
* if they have or have not edited their input edited sequence
|
||
|
*/
|
||
|
fclose(fp);
|
||
|
return(True);
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
int findLeftCutoff(Seq seq, char *enzInString)
|
||
|
/*
|
||
|
* looks for left cutoff, if it doesn't find a "enzInString", then
|
||
|
* it looks from enzInString less it's last character, etc
|
||
|
*/
|
||
|
|
||
|
{
|
||
|
int maxStartPos=100; /* if the enzyme site wasn't found before this
|
||
|
baseNum, then that's probably not the cloning
|
||
|
site */
|
||
|
int i,j,found;
|
||
|
/* int jj,kk; */
|
||
|
int indices[100];
|
||
|
int num_matches;
|
||
|
char *theSeq;
|
||
|
int num_bases;
|
||
|
char enzString[100];
|
||
|
/* char enztemp[100]; */
|
||
|
|
||
|
found = 0;
|
||
|
|
||
|
if (seq->bottom)
|
||
|
get_compl_seq(enzString,enzInString,0,strlen(enzInString),strlen(enzInString),0);
|
||
|
else
|
||
|
strcpy(enzString,enzInString);
|
||
|
|
||
|
|
||
|
num_bases = getNBases(seq,EdBases);
|
||
|
theSeq = (char *)calloc(num_bases,sizeof(char));
|
||
|
|
||
|
j = 0;
|
||
|
if (seq->bottom) {
|
||
|
for (i = num_bases-1; i >= 0; i--){
|
||
|
theSeq[i] = getBase(seq, EdBases, j);
|
||
|
j++;
|
||
|
}
|
||
|
}
|
||
|
else {
|
||
|
for (i = 0; i < num_bases; i++)
|
||
|
theSeq[i] = getBase(seq, EdBases, i);
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
/* first look for a complete match (one mismatch allowed), then
|
||
|
gradually take nucleotides off the 5 prime end and continue
|
||
|
looking for a match */
|
||
|
/*
|
||
|
for (i = 0; i < 3; i++) {
|
||
|
jj=0;
|
||
|
for (kk=i; enzString[kk]; kk++) enztemp[jj++]=enzString[kk];
|
||
|
enztemp[jj]='\0';
|
||
|
num_matches=string_match(enztemp,strlen(enztemp),theSeq,num_bases,2,indices);
|
||
|
if (num_matches > 0)
|
||
|
if (indices[0] < maxStartPos) {
|
||
|
found = 1;
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
*/
|
||
|
|
||
|
/* look for first occurrence of enzString;
|
||
|
just look a match with at most i mismatches, starting
|
||
|
with 0 mismatches down to two*/
|
||
|
for (i=0; i<3; i++) {
|
||
|
num_matches=string_match(enzString,strlen(enzString),theSeq,num_bases,i,indices);
|
||
|
if (num_matches > 0)
|
||
|
if (indices[0] < maxStartPos) {
|
||
|
found = 1;
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
free(theSeq);
|
||
|
|
||
|
if (found) return(indices[0] + strlen(enzString));
|
||
|
else return(0);
|
||
|
|
||
|
}
|
||
|
|
||
|
|
||
|
int findRightCutoff(Seq seq)
|
||
|
{
|
||
|
/* give the %age cutoff a default but let it be user
|
||
|
specifiable on the command line ? */
|
||
|
/*
|
||
|
* ways to look for ends of sequence
|
||
|
* 1. runs of nucleotides or dinucleotides
|
||
|
* -- but rick says there are lots of runs of
|
||
|
* A's and T's in what they're sequencing
|
||
|
* 2. percentage of N's
|
||
|
* 3. automatically drop down to baseNum 600 to even start
|
||
|
* looking for a cutoff
|
||
|
*/
|
||
|
|
||
|
int num_bases;
|
||
|
char *theSeq;
|
||
|
int i,j;
|
||
|
int rightCutoff;
|
||
|
|
||
|
num_bases = getNBases(seq,EdBases);
|
||
|
theSeq = (char *)calloc(num_bases,sizeof(char));
|
||
|
|
||
|
j = 0;
|
||
|
if (seq->bottom) {
|
||
|
for (i = num_bases-1; i >= 0; i--){
|
||
|
theSeq[i] = getBase(seq, EdBases, j);
|
||
|
j++;
|
||
|
}
|
||
|
}
|
||
|
else {
|
||
|
for (i = 0; i < num_bases; i++)
|
||
|
theSeq[i] = getBase(seq, EdBases, i);
|
||
|
}
|
||
|
|
||
|
rightCutoff = findPercntAmbig(theSeq,num_bases);
|
||
|
|
||
|
/* added so that the left and right cutoffs do not overlap */
|
||
|
if (rightCutoff > num_bases - seq->leftCutoff) rightCutoff=num_bases - seq->leftCutoff;
|
||
|
|
||
|
free(theSeq);
|
||
|
return(rightCutoff);
|
||
|
}
|
||
|
|
||
|
|
||
|
int findPercntAmbig(char *theSeq, int num_bases)
|
||
|
|
||
|
{
|
||
|
int i,j;
|
||
|
int isN[256];
|
||
|
int totalN = 0;
|
||
|
int numN;
|
||
|
int nucWindow;
|
||
|
|
||
|
numN = 2;
|
||
|
nucWindow = 5;
|
||
|
|
||
|
for (i = 0; i <= 256; i++) isN[i]=0;
|
||
|
isN['n']=1;
|
||
|
isN['N']=1;
|
||
|
isN['-']=1;
|
||
|
|
||
|
|
||
|
|
||
|
/*
|
||
|
* start at base num 200 and look for numN Ns within
|
||
|
* a window of nucWindow nucleotides, once you find that
|
||
|
* second N send back the indices of that second
|
||
|
* N as the cutoff line
|
||
|
*/
|
||
|
|
||
|
for (i = 200; i < num_bases; i++) {
|
||
|
totalN = 0;
|
||
|
for (j = 0; j < nucWindow; j++) {
|
||
|
if (isN[theSeq[i+j]]) totalN++;
|
||
|
if (totalN == numN) return(num_bases - (i+j));
|
||
|
}
|
||
|
}
|
||
|
return(0);
|
||
|
|
||
|
}
|
||
|
|
||
|
|