1314 lines
32 KiB
C
1314 lines
32 KiB
C
/*
|
|
Title: seqIOEdit
|
|
|
|
File: seqIOEdit.c
|
|
Purpose: IO of the editted portion of plain or edited sequences
|
|
|
|
Last update: Monday 24 February 1992
|
|
|
|
Change log :-
|
|
15.01.91 SD New include file (opp.h)
|
|
04.12.91 lfw added sample lanes up to 40 instead of 24
|
|
04.12.91 changed the way the left cutoff is found...now I allow
|
|
look for first occurrence of the left cutting sequence;
|
|
looking first for an exact match, then a match with
|
|
one mismatch, then with two...if nothing is found at
|
|
that point I assume the left cutoff is not there
|
|
24.02.92 SD Fixed bug in findRightCutoff() when checking for overlap with
|
|
leftCutoff. There was some confusion over what the value of
|
|
rightCutoff actually means.
|
|
18.11.92 lfw changed the names of the temporary files to .abc* instead
|
|
of abc*, also used the remove() command rather than system() to get
|
|
rid of the files and added another remove call so they will be sure to
|
|
be removed
|
|
|
|
|
|
*/
|
|
|
|
|
|
/* ---- Imports ---- */
|
|
#include <stdlib.h>
|
|
#include "seqIOEdit.h"
|
|
|
|
#include "seq.h"/* IMPORT: Seq, BasesAndTraces, NULLSeq,
|
|
newSeq, freeSeq */
|
|
#include "opp.h" /* IMPORT: oppInitialise */
|
|
#include "dialogues.h"
|
|
#include "match.h"
|
|
|
|
|
|
|
|
|
|
#ifdef QUAL_CODE
|
|
/* global definitions quality cutoffs for this file */
|
|
#define SIDEBAND_CUTOFF 0.8
|
|
#define NONCALLED_OVER_CALLED_CUTOFF 0.25
|
|
#define OVERALL_TRACE_QUAL_CUTOFF 0.27
|
|
#define STEP_SIZE 8
|
|
#define LAST_ALLOWED_BASE 400
|
|
|
|
|
|
|
|
/* definition of internal functions found below */
|
|
int findRightQualCutoff(Seq seq, int num_bases);
|
|
void findLeftQualCutoff(Seq seq, int *start_point);
|
|
void SeqQual_sideband(Seq seq1);
|
|
void SeqQual_nonCalledOverCalled(Seq seq1);
|
|
|
|
|
|
#endif /*QUAL_CODE*/
|
|
|
|
|
|
|
|
|
|
|
|
/* ---- Internal Functions ---- */
|
|
|
|
int findPercntAmbig(char *theSeq, int num_bases);
|
|
|
|
static void text_to_output(char *vec,int stp,int endp,int dvice,char *outfile)
|
|
|
|
/*
|
|
* text_to_output(vec,stp,endp,dvice,outfile)
|
|
* input: char **vec,*outfile; int stp, endp, dvice;
|
|
* this program output a specified portion of a genbank file
|
|
* (from vec[stp] to vec[endp]) to the screen (default), a file (dvice
|
|
* =1), or lpr (dvice =2). Outfile is the input filename if you wish to
|
|
* output the information to a specified file. stp and endp are integers,
|
|
* not pointers.
|
|
*/
|
|
|
|
{
|
|
int i;
|
|
FILE *fopen(),*fp,*where;
|
|
|
|
|
|
if (dvice == 1) {
|
|
if ((fp = fopen(outfile,"a"))==NULL) {
|
|
printf ("\nERROR: can't open file %s\n",outfile);
|
|
return;
|
|
}
|
|
else where = fp;
|
|
}
|
|
else if (dvice == 2) {
|
|
if ((fp = fopen("junkfile.","a"))==NULL) {
|
|
printf ("\nERROR: can't open file junkfile. to output to the lpr\n");
|
|
return;
|
|
}
|
|
else where = fp;
|
|
}
|
|
else {
|
|
/* default : */
|
|
where = stdout;
|
|
}
|
|
|
|
for (i = stp; i < endp; i++)
|
|
putc(vec[i],where);
|
|
|
|
if ((dvice == 1) || (dvice == 2)) fclose(fp);
|
|
|
|
}
|
|
|
|
int checkForExistingEdFile(char *fn)
|
|
/*
|
|
* check to see if there is an existing .seq.n file;
|
|
* return the largest n or 0 if no files existed,
|
|
* return a -1 if it was a problem with opening files
|
|
*/
|
|
|
|
{
|
|
char vec[500];
|
|
int last_ed_num;
|
|
FILE *fp;
|
|
int i;
|
|
|
|
/* make sure there are no files with the names I'm about to use*/
|
|
remove (".abcxyztmpsh.");
|
|
remove (".abcxyztmpout.");
|
|
|
|
|
|
/* write a little shell to see the last n in your_filename.n
|
|
in the current directory */
|
|
if ((fp=fopen(".abcxyztmpsh.", "w")) == NULL) return(-1);
|
|
fclose(fp);
|
|
sprintf(vec,"for i in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 \n");
|
|
i = strlen(vec);
|
|
sprintf(vec+i," do\n oldname=%s.$i\n if test -f $oldname\n then echo $i\n",fn);
|
|
i = strlen(vec);
|
|
sprintf(vec+i," fi\ndone\n");
|
|
|
|
text_to_output(vec,0,strlen(vec),1,".abcxyztmpsh.");
|
|
|
|
/* execute the shell and have it output the last number it found into
|
|
a file called .abcxyztmpout. */
|
|
system("sh .abcxyztmpsh. | tail -1 > .abcxyztmpout.");
|
|
|
|
/* read that number from that file */
|
|
|
|
if ((fp=fopen(".abcxyztmpout.", "r")) == NULL) return(-1);
|
|
vec[0]=getc(fp);
|
|
fclose(fp);
|
|
/* if that number was EOF return(0), nothing found */
|
|
if (vec[0]==EOF) {
|
|
remove (".abcxyztmpsh.");
|
|
remove (".abcxyztmpout.");
|
|
|
|
return(0);
|
|
}
|
|
|
|
vec[1]='\0';
|
|
|
|
sscanf(vec,"%d",&last_ed_num);
|
|
remove (".abcxyztmpsh.");
|
|
remove (".abcxyztmpout.");
|
|
|
|
return(last_ed_num);
|
|
}
|
|
|
|
Boolean isDotSeq(char *fn)
|
|
/* make sure there is a .seq on the end of fn, puts one on
|
|
if there is not */
|
|
{
|
|
int i;
|
|
|
|
i = strlen(fn)-1;
|
|
if (fn[i]!='q' || fn[i-1]!='e' || fn[i-2]!='s' || fn[i-3]!='.')
|
|
return(False);
|
|
else return(True);
|
|
}
|
|
|
|
void stripDotSeq(char *fn)
|
|
/* there is a .seq on the end of fn, strip it off */
|
|
|
|
{ int i;
|
|
|
|
i = strlen(fn)-1;
|
|
|
|
if (fn[i]=='q' && fn[i-1]=='e' && fn[i-2]=='s' && fn[i-3]=='.')
|
|
fn[i-3]='\0';
|
|
|
|
return;
|
|
}
|
|
|
|
void stripDotNum(char *fn)
|
|
/* if there is a .num on the end of fn, strip it off */
|
|
|
|
{ int i;
|
|
|
|
i = strlen(fn)-1;
|
|
|
|
if (isdigit(fn[i]) && fn[i-1]=='.')
|
|
fn[i-1]='\0';
|
|
else if (isdigit(fn[i]) && isdigit(fn[i-1]) && fn[i-2]=='.')
|
|
fn[i-2]='\0';
|
|
|
|
return;
|
|
}
|
|
|
|
int isDotNum(char *fn)
|
|
/*
|
|
* checks if there is a .1 or .2 or .m on the inputfilename.
|
|
* If there is, then returns that num. If not returns -1.
|
|
*/
|
|
{
|
|
int i,j;
|
|
int dotnum;
|
|
char *atemp;
|
|
|
|
atemp = (char *)calloc(20,sizeof(char));
|
|
|
|
i = strlen(fn)-1;
|
|
j = 0;
|
|
|
|
if (isdigit(fn[i]) && (fn[i-1]=='.')) {
|
|
atemp[0] = fn[i];
|
|
atemp[1] = fn[i+1];
|
|
sscanf(atemp,"%d",&dotnum);
|
|
free(atemp);
|
|
return(dotnum);
|
|
}
|
|
else if (isdigit(fn[i]) && isdigit(fn[i-1]) && fn[i-2]=='.') {
|
|
atemp[0] = fn[i-1];
|
|
atemp[1] = fn[i];
|
|
atemp[2] = '\0';
|
|
sscanf(atemp,"%d",&dotnum);
|
|
free(atemp);
|
|
return(dotnum);
|
|
}
|
|
|
|
free(atemp);
|
|
return(-1);
|
|
}
|
|
|
|
static void get_compl_seq(char *ac_seq,char *aseq,int stp,int endp, int seq_len, int rev)
|
|
|
|
/*
|
|
* uses aseq to find the sequence that
|
|
* would appear on the opposite strand and places
|
|
* that sequence in ac_seq; stp is the starting point on
|
|
* aseq and endp is the ending point on aseq;
|
|
* seq_len is the length of the input sequence,
|
|
* if rev==1 reverses as well as complements, if rev==0
|
|
* only complements
|
|
*/
|
|
{
|
|
int i;
|
|
|
|
oppInitialize();
|
|
|
|
if (rev == 1) {
|
|
for (i = stp; i <= endp; i++)
|
|
ac_seq[seq_len - i] = opp[aseq[i]];
|
|
}
|
|
else {
|
|
for (i = stp; i <= endp; i++)
|
|
ac_seq[i] = opp[aseq[i]];
|
|
ac_seq[i-1] ='\0';
|
|
}
|
|
|
|
|
|
return;
|
|
}
|
|
|
|
|
|
/* ---- Externals ---- */
|
|
|
|
Boolean writeEdSeq(Seq seq, char *fn)
|
|
|
|
{
|
|
FILE *fp;
|
|
int i,j;
|
|
char ed_fn[200];
|
|
int last_ed_num; /* largest n of fn.seq.n in
|
|
current directory */
|
|
|
|
oppInitialize();
|
|
|
|
/* make sure the filename does not have a .seq on the end */
|
|
stripDotSeq(fn);
|
|
|
|
/* get the n to put on inputfilename.n for the edited
|
|
file to be kept */
|
|
last_ed_num = checkForExistingEdFile(fn);
|
|
|
|
if (last_ed_num == -1) return(False);
|
|
else sprintf(ed_fn,"%s.%d",fn,last_ed_num+1);
|
|
/* that line takes care of 0 too, because
|
|
it names the file fn.seq.1 */
|
|
|
|
/* Open for writing, text */
|
|
if ((fp=fopen(ed_fn, "w")) == NULL) return(False);
|
|
|
|
|
|
/* write information in the following format:
|
|
NedBases*edits array*edBase array*edBasePos array*
|
|
each division ends in an * and within
|
|
each division entries are separated by spaces */
|
|
|
|
if (seq->bottom) {
|
|
fprintf(fp," %6d*%3d*%6d*%6d*",seq->NorigBases,seq->NedBases,seq->rightCutoff,seq->leftCutoff);
|
|
}
|
|
else {
|
|
fprintf(fp," %6d*%3d*%6d*%6d*",seq->NorigBases,seq->NedBases,seq->leftCutoff,seq->rightCutoff);
|
|
}
|
|
|
|
/* print out edits array */
|
|
|
|
if (seq->bottom) {
|
|
for (i=seq->NorigBases+MaxEdits-1;
|
|
i > -1;
|
|
i--)
|
|
{
|
|
j = 0;
|
|
if (i==seq->NorigBases+MaxEdits-1)
|
|
/* this part is a fudge to stick in 0 0 */
|
|
fprintf(fp,"%6d %6d ",j,j);
|
|
|
|
|
|
if (((seq->edits[i]!=0) && ((seq->NedBases -1- i)>=0)) || i==0){
|
|
if (seq->edits[i] <0)
|
|
fprintf(fp,"%6d %6d ",seq->NedBases -1- i,seq->edits[i]);
|
|
else
|
|
fprintf(fp,"%6d %6d ",seq->NedBases -1 -i,seq->NorigBases - 1-(seq->edits[i]));
|
|
}
|
|
}
|
|
}
|
|
else {
|
|
for (i=0;
|
|
i<seq->NorigBases+MaxEdits;
|
|
i++) {
|
|
|
|
if ((seq->edits[i]!=0) || (i==0))
|
|
fprintf(fp,"%6d %6d ",i,seq->edits[i]);
|
|
}
|
|
}
|
|
|
|
fprintf(fp,"%6d %6d ",NULLPoint,NULLPoint);
|
|
|
|
fprintf(fp,"*");
|
|
|
|
/* print out non-NULL entries in edBase array */
|
|
for (i=1;
|
|
i<MaxEdits;
|
|
i++)
|
|
{
|
|
if (seq->edBase[i] == NULL) {
|
|
fprintf(fp,"* ");
|
|
break;
|
|
}
|
|
else {
|
|
if (seq->bottom)
|
|
fprintf(fp,"%c ",opp[seq->edBase[i]]);
|
|
else
|
|
fprintf(fp,"%c ",seq->edBase[i]);
|
|
}
|
|
}
|
|
|
|
fprintf(fp,"*");
|
|
|
|
/* print out non-NULL entries in edBasePos array */
|
|
|
|
for (i=1;
|
|
i<MaxEdits;
|
|
i++)
|
|
{
|
|
|
|
if (seq->edBasePos[i] == NULLPoint) {
|
|
fprintf(fp,"%6d ",NULLPoint); /* -1 is NULLPoint*/
|
|
break;
|
|
}
|
|
else {
|
|
int fudge;
|
|
/*
|
|
* when you're plotting the strand in the reverse
|
|
* order, you must move the starting position over by
|
|
* the width of one character. Because positions in the
|
|
* other file, already take into account the character
|
|
* width. Therefore, fudge = ~character width + basePos
|
|
* of the first base
|
|
*/
|
|
|
|
fudge = seq->basePos[0] + 6;
|
|
|
|
if (seq->bottom) {
|
|
/* fprintf(fp,"%6d ",seq->basePos[seq->NorigBases-1]-seq->edBasePos[i]+fudge); */
|
|
fprintf(fp,"%6d ",seq->NPoints-seq->edBasePos[i]);
|
|
}
|
|
else
|
|
fprintf(fp,"%6d ",seq->edBasePos[i]);
|
|
}
|
|
}
|
|
|
|
fprintf(fp,"*");
|
|
(void) fclose(fp);
|
|
return(True);
|
|
}
|
|
|
|
|
|
|
|
Boolean readEdSeq(Seq seq, char *fn, int dotnum)
|
|
/*
|
|
* reads in the most recent fn.seq.n file. This
|
|
* file should be of the form:
|
|
* basePos;edBasePos;edBase
|
|
* returns True if it read in the sequence from
|
|
* an editted file, and False if there was no
|
|
* editted file or if there was a problem
|
|
*
|
|
* dotnum == -1 if the user did not specify a
|
|
* version number of the sequence to read in,
|
|
* if they want version m read in.
|
|
*/
|
|
|
|
{
|
|
int last_ed_num;
|
|
char ed_fn[200];
|
|
FILE *fp;
|
|
|
|
oppInitialize();
|
|
|
|
/* make sure the filename does not have a .seq on the end */
|
|
stripDotSeq(fn);
|
|
|
|
if (dotnum == -1) {
|
|
/* get the n to put on fn.seq.n */
|
|
last_ed_num = checkForExistingEdFile(fn);
|
|
if (last_ed_num == -1) return(False);
|
|
else if (last_ed_num == 0) return(False);
|
|
else sprintf(ed_fn,"%s.%d",fn,last_ed_num);
|
|
}
|
|
else
|
|
sprintf(ed_fn,"%s.%d",fn,dotnum);
|
|
|
|
|
|
/* Open for reading, text */
|
|
if ((fp=fopen(ed_fn, "r")) == NULL) return(False);
|
|
fclose(fp);
|
|
|
|
if (processEdSeqFile(seq,ed_fn))
|
|
return(True);
|
|
else return(False);
|
|
}
|
|
|
|
Boolean processEdSeqFile(Seq seq, char *fn)
|
|
|
|
/*
|
|
* processes a char vector containing the Editted
|
|
* sequence in the following format:
|
|
*
|
|
* NedBases*NorigBases*leftCutoff*rightCutoff*negative
|
|
* components of edits array in the form (position,
|
|
* negative number)*non NULL components edBase array*
|
|
* non NULL components edBasePos array*
|
|
*
|
|
* each division ends in an H. The end of the
|
|
* edits array is signaled by a -1 -1 entry. Within
|
|
* each division entries are separated by spaces
|
|
*
|
|
* sticks the information it finds into the seq array
|
|
*
|
|
* returns false if there was a problem with the file format
|
|
*/
|
|
{
|
|
FILE *fp;
|
|
int nbases;
|
|
char achar;
|
|
int i,j,k;
|
|
char ed_fn[200];
|
|
|
|
strcpy(ed_fn,fn);
|
|
|
|
if ((fp=fopen(ed_fn, "r")) == NULL) return(False);
|
|
|
|
fscanf(fp," %6d",&nbases);
|
|
achar = getc(fp);
|
|
if (achar != '*') {
|
|
printf("ERROR: Input editted sequence was of wrong format \n(No asterisk was found after the number of bases)\n");
|
|
fclose(fp);
|
|
return(False);
|
|
}
|
|
|
|
|
|
fscanf(fp,"%3d",&i);
|
|
achar = getc(fp);
|
|
if (achar != '*') {
|
|
printf("ERROR: Input editted sequence was of wrong format \n(No asterisk was found after the number of edited bases)\n");
|
|
fclose(fp);
|
|
return(False);
|
|
}
|
|
|
|
seq->NedBases = i;
|
|
if (nbases != seq->NorigBases) {
|
|
printf("ERROR: Input editted sequence was of wrong format\n (Number of editted bases has changed)\n");
|
|
fclose(fp);
|
|
return(False);
|
|
}
|
|
|
|
|
|
fscanf(fp,"%6d",&j);
|
|
achar = getc(fp);
|
|
if (achar != '*') {
|
|
printf("ERROR: Input editted sequence was of wrong format \n(No asterisk was found after the left cutoff)\n");
|
|
fclose(fp);
|
|
return(False);
|
|
}
|
|
|
|
fscanf(fp,"%6d",&k);
|
|
achar = getc(fp);
|
|
if (achar != '*') {
|
|
printf("ERROR: Input editted sequence was of wrong format \n(No asterisk was found after the right cutoff)\n");
|
|
fclose(fp);
|
|
return(False);
|
|
}
|
|
|
|
if (seq->bottom) {
|
|
seq->leftCutoff = k;
|
|
seq->rightCutoff = j;
|
|
}
|
|
else {
|
|
seq->leftCutoff = j;
|
|
seq->rightCutoff = k;
|
|
}
|
|
|
|
/* read in the seq->edits array */
|
|
|
|
for (i=1;
|
|
i<seq->NorigBases+MaxEdits;
|
|
i++)
|
|
{
|
|
fscanf(fp,"%6d %6d ",&j,&k);
|
|
if (j==NULLPoint) break;
|
|
else {
|
|
if (seq->bottom) {
|
|
if (k<0) /* then it is a reference to the edBase array */
|
|
seq->edits[seq->NedBases-1-j]=k;
|
|
else /*it is a base number which needs to be converted
|
|
to the base number on the opposite strand */
|
|
seq->edits[seq->NedBases-1-j]=seq->NorigBases-1-k;
|
|
}
|
|
else
|
|
seq->edits[j]=k;
|
|
}
|
|
|
|
}
|
|
achar = getc(fp);
|
|
if (achar != '*') {
|
|
printf("ERROR: Input editted sequence was of wrong format \n(Error in the edits array)\n");
|
|
fclose(fp);
|
|
return(False);
|
|
}
|
|
|
|
/* read in the seq->edBase array */
|
|
for (i=1;
|
|
i<MaxEdits;
|
|
i++)
|
|
{
|
|
fscanf(fp,"%c ",&achar);
|
|
if (achar == '*') break;
|
|
else {
|
|
if (seq->bottom)
|
|
seq->edBase[i]=opp[achar];
|
|
else
|
|
seq->edBase[i]=achar;
|
|
}
|
|
}
|
|
achar = getc(fp);
|
|
if (achar != '*') {
|
|
printf("Input editted sequence was of wrong format\n(Extraneous information after edBases and before the *\n");
|
|
fclose(fp);
|
|
return(False);
|
|
}
|
|
|
|
|
|
/* read in the seq->edBasePos array */
|
|
|
|
for (i=1;
|
|
i<MaxEdits;
|
|
i++)
|
|
{
|
|
fscanf(fp,"%6d ",&k);
|
|
if (k==NULLPoint) break;
|
|
else {
|
|
int fudge;
|
|
|
|
/*
|
|
* when you're plotting the strand in the reverse
|
|
* order, you must move the starting position over by
|
|
* the width of one character. Because positions in the
|
|
* ABI File, already take into account the character
|
|
* width. Therefore, fudge = ~character width + offset of
|
|
* the first peaks position
|
|
*/
|
|
|
|
fudge = seq->basePos[0] + 6;
|
|
|
|
if (seq->bottom) {
|
|
/* seq->edBasePos[i]=seq->basePos[seq->NorigBases-1]-k+fudge;*/
|
|
seq->edBasePos[i]=seq->NPoints - k;
|
|
}
|
|
else
|
|
seq->edBasePos[i]=k;
|
|
}
|
|
|
|
}
|
|
achar = getc(fp);
|
|
if (achar != '*') {
|
|
printf("ERROR: Input editted sequence was of wrong format\n(Error in editted Base Position array)\n");
|
|
fclose(fp);
|
|
return(False);
|
|
}
|
|
|
|
/*
|
|
* Don't set the seq to Dirty, otherwise the user won't know
|
|
* if they have or have not edited their input edited sequence
|
|
*/
|
|
fclose(fp);
|
|
return(True);
|
|
}
|
|
|
|
|
|
|
|
|
|
int findLeftCutoff(Seq seq, char *enzInString)
|
|
/*
|
|
* looks for left cutoff, if it doesn't find a "enzInString", then
|
|
* it looks from enzInString less it's last character, etc
|
|
*/
|
|
|
|
{
|
|
int maxStartPos=100; /* if the enzyme site wasn't found before this
|
|
baseNum, then that's probably not the cloning
|
|
site */
|
|
int i,j,found;
|
|
/* int jj,kk; */
|
|
int indices[100];
|
|
int num_matches;
|
|
char *theSeq;
|
|
int num_bases;
|
|
char enzString[100];
|
|
/* char enztemp[100]; */
|
|
#ifdef QUAL_CODE
|
|
int cut_point;
|
|
#endif /*QUAL_CODE*/
|
|
|
|
found = 0;
|
|
|
|
if (seq->bottom)
|
|
get_compl_seq(enzString,enzInString,0,strlen(enzInString),strlen(enzInString),0);
|
|
else
|
|
strcpy(enzString,enzInString);
|
|
|
|
|
|
num_bases = getNBases(seq,EdBases);
|
|
theSeq = (char *)calloc(num_bases,sizeof(char));
|
|
|
|
j = 0;
|
|
if (seq->bottom) {
|
|
for (i = num_bases-1; i >= 0; i--){
|
|
theSeq[i] = getBase(seq, EdBases, j);
|
|
j++;
|
|
}
|
|
}
|
|
else {
|
|
for (i = 0; i < num_bases; i++)
|
|
theSeq[i] = getBase(seq, EdBases, i);
|
|
}
|
|
|
|
|
|
|
|
/* look for first occurrence of enzString;
|
|
just look a match with at most i mismatches, starting
|
|
with 0 mismatches down to two*/
|
|
for (i=0; i<3; i++) {
|
|
num_matches=string_match(enzString,strlen(enzString),theSeq,num_bases,i,indices);
|
|
if (num_matches > 0)
|
|
if (indices[0] < maxStartPos) {
|
|
found = 1;
|
|
break;
|
|
}
|
|
}
|
|
|
|
free(theSeq);
|
|
|
|
if (found && indices[0])
|
|
return(indices[0] + strlen(enzString));
|
|
#ifdef QUAL_CODE
|
|
else {
|
|
cut_point=0;
|
|
/*
|
|
* make sure there are not a bunch of Ns from the ABI
|
|
* primer problem at the start of this sequence...move the
|
|
* left cutoff past all of those Ns
|
|
*/
|
|
for (i=0; i<seq->NorigBases; i++)
|
|
if (seq->base[i]!='N' && seq->base[i]!='-') {cut_point=i; break;}
|
|
}
|
|
|
|
findLeftQualCutoff(seq,&cut_point);
|
|
|
|
return(cut_point);
|
|
#else /*QUAL_CODE*/
|
|
else return(0);
|
|
#endif /*QUAL_CODE*/
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#ifdef QUAL_CODE
|
|
|
|
|
|
int findRightCutoff(Seq seq)
|
|
{
|
|
int num_bases;
|
|
int rightCutoff;
|
|
|
|
num_bases = getNBases(seq,EdBases);
|
|
|
|
rightCutoff = findRightQualCutoff(seq,num_bases);
|
|
|
|
/* added so that the left and right cutoffs do not overlap */
|
|
if (rightCutoff > num_bases - seq->leftCutoff) rightCutoff=num_bases - seq->leftCutoff;
|
|
|
|
return(rightCutoff);
|
|
}
|
|
#else /*QUAL_CODE*/
|
|
int findRightCutoff(Seq seq)
|
|
{
|
|
/* give the %age cutoff a default but let it be user
|
|
specifiable on the command line ? */
|
|
/*
|
|
* ways to look for ends of sequence
|
|
* 1. runs of nucleotides or dinucleotides
|
|
* -- but rick says there are lots of runs of
|
|
* A's and T's in what they're sequencing
|
|
* 2. percentage of N's
|
|
* 3. automatically drop down to baseNum 600 to even start
|
|
* looking for a cutoff
|
|
*/
|
|
|
|
int num_bases;
|
|
char *theSeq;
|
|
int i,j;
|
|
int rightCutoff;
|
|
|
|
num_bases = getNBases(seq,EdBases);
|
|
theSeq = (char *)calloc(num_bases,sizeof(char));
|
|
|
|
j = 0;
|
|
if (seq->bottom) {
|
|
for (i = num_bases-1; i >= 0; i--){
|
|
theSeq[i] = getBase(seq, EdBases, j);
|
|
j++;
|
|
}
|
|
}
|
|
else {
|
|
for (i = 0; i < num_bases; i++)
|
|
theSeq[i] = getBase(seq, EdBases, i);
|
|
}
|
|
|
|
rightCutoff = findPercntAmbig(theSeq,num_bases);
|
|
|
|
/* added so that the left and right cutoffs do not overlap */
|
|
if (rightCutoff > num_bases - seq->leftCutoff) rightCutoff=num_bases - seq->leftCutoff;
|
|
|
|
free(theSeq);
|
|
return(rightCutoff);
|
|
}
|
|
#endif /*QUAL_CODE*/
|
|
|
|
|
|
int findPercntAmbig(char *theSeq, int num_bases)
|
|
|
|
{
|
|
int i,j;
|
|
int isN[256];
|
|
int totalN = 0;
|
|
int numN;
|
|
int nucWindow;
|
|
|
|
numN = 2;
|
|
nucWindow = 5;
|
|
|
|
for (i = 0; i <= 256; i++) isN[i]=0;
|
|
isN['n']=1;
|
|
isN['N']=1;
|
|
isN['-']=1;
|
|
|
|
|
|
|
|
/*
|
|
* start at base num 200 and look for numN Ns within
|
|
* a window of nucWindow nucleotides, once you find that
|
|
* second N send back the indices of that second
|
|
* N as the cutoff line
|
|
*/
|
|
|
|
for (i = 200; i < num_bases; i++) {
|
|
totalN = 0;
|
|
for (j = 0; j < nucWindow; j++) {
|
|
if (isN[theSeq[i+j]]) totalN++;
|
|
if (totalN == numN) return(num_bases - (i+j));
|
|
}
|
|
}
|
|
return(0);
|
|
|
|
}
|
|
|
|
|
|
#ifdef QUAL_CODE
|
|
|
|
/*
|
|
Title: seqQual
|
|
|
|
File: seqQual.c
|
|
Purpose: Sequence Quality calculation module
|
|
Last update: May 1992
|
|
|
|
Change log:
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
|
|
static int one_half_forwards(Seq seq, int base)
|
|
/*
|
|
* Returns the position half way between base and the following base.
|
|
*/
|
|
{
|
|
int pos;
|
|
|
|
|
|
if ((base+1) < seq->NorigBases) {
|
|
pos = (seq->basePos[base]+seq->basePos[base+1]) / 2;
|
|
} else {
|
|
/*
|
|
* Last base is a special case. We should guestimate.
|
|
*
|
|
* guess 1: pos = bp[N] + (bp[N] - bp[N-1])/2
|
|
*
|
|
* if pos > NPoints
|
|
* guess 2: pos = NPoints-1
|
|
*
|
|
*/
|
|
pos = seq->basePos[base] +
|
|
(seq->basePos[base] - seq->basePos[base-1])/2;
|
|
if (pos >= seq->NPoints) pos = seq->NPoints-1;
|
|
}
|
|
|
|
return pos;
|
|
|
|
}
|
|
|
|
|
|
static int one_half_backwards(Seq seq, int base)
|
|
/*
|
|
* Returns the position half way between base and the precedingbase.
|
|
*/
|
|
{
|
|
int pos;
|
|
|
|
|
|
if (base > 0) {
|
|
pos = (seq->basePos[base]+seq->basePos[base-1]) / 2;
|
|
} else {
|
|
/*
|
|
* Last base is a special case. We should guestimate.
|
|
*
|
|
* guess 1: pos = bp[N] - (bp[N+1] - bp[N])/2
|
|
*
|
|
* if pos < 0
|
|
* guess 2: pos = 0
|
|
*
|
|
*/
|
|
pos = seq->basePos[base] -
|
|
(seq->basePos[base+1] - seq->basePos[base])/2;
|
|
if (pos < 0) pos = 0;
|
|
}
|
|
|
|
return pos;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
* 1) when they ask to find
|
|
* the left cutoff go ahead and just
|
|
* calculate max_non_called over called measure to
|
|
* see whether or not to throw the trace away entirely
|
|
*
|
|
* 1.5)find the left cutoff first and use that for starting your
|
|
* hunt for the right cutoff.
|
|
*
|
|
* 2) go out and calculated the side band ratio and find the cutoffs
|
|
*
|
|
* 3) last calculate any quality measures base by base which you
|
|
* actually may want for writing out to the sequence file or whatever
|
|
*/
|
|
|
|
|
|
int overallTraceQual(Seq seq)
|
|
/*
|
|
* returns a one if the overall trace quality was good enough to
|
|
* warrant keeping the trace....0 if the trace should be thrown away
|
|
*
|
|
* Seq is the trace structure
|
|
* for step size...you are looking at values for the quality
|
|
* measure each STEP_SIZE-th base
|
|
*/
|
|
|
|
{ int j;
|
|
int num_good=0; /* count of number of consecutive bases having
|
|
a quality index value better than the cutoff */
|
|
int num_consecutive_good=20; /* 24 being about 200 bases of
|
|
good trace since values are
|
|
read every 8 bases */
|
|
int num_problems; /* count of number of bases in this run having
|
|
a quality index above the cutoff */
|
|
int num_problems_allowed=4; /* number of values above the
|
|
cutoff allowed in the span
|
|
of num_consecutive_good */
|
|
float cutoff=OVERALL_TRACE_QUAL_CUTOFF; /* cutoff for the value of max_non_called over called */
|
|
int last_problem;
|
|
|
|
SeqQual_nonCalledOverCalled(seq);
|
|
|
|
for (j=0; j< seq->NorigBases; j+=STEP_SIZE) {
|
|
/* printf("overall seq->qual[%d] is %4.3f\n",j,seq->qualIndex[j]);*/
|
|
if (seq->qualIndex[j]<cutoff) num_good++;
|
|
else {
|
|
num_problems++;
|
|
if (num_problems==1) last_problem=j;
|
|
if (num_problems>num_problems_allowed) {num_good=0; num_problems=0; j=last_problem+1;}
|
|
}
|
|
/*
|
|
* make sure you hit num_consecutive_good in a row and that you are not
|
|
* out past LAST_ALLOWED_BASE when you do hit it
|
|
*/
|
|
if (num_good==num_consecutive_good) return(1);
|
|
}
|
|
|
|
return(0);
|
|
}
|
|
|
|
int findRightQualCutoff(Seq seq, int num_bases)
|
|
{ int i,j;
|
|
/*int num_good=0;*/ /* count of number of consecutive bases having
|
|
a quality index value better than the cutoff */
|
|
int num_problems; /* count of number of bases in this run having
|
|
a quality index above the cutoff */
|
|
int num_allowed_problems=4; /* number of values above the
|
|
cutoff allowed before setting
|
|
the right cutoff */
|
|
float cutoff=SIDEBAND_CUTOFF;
|
|
/* cutoff for the value of this side-band-ratio */
|
|
int rightCutoff;
|
|
int first_problem_base;
|
|
|
|
|
|
/* go calculate the quality measure */
|
|
SeqQual_sideband(seq);
|
|
|
|
|
|
/* step through all of the bases in STEP_SIZE increments */
|
|
|
|
for (j=seq->leftCutoff; j<seq->NorigBases; j+=STEP_SIZE) {
|
|
/* printf("seq->qualIndex[%d] is %5.3f\n",j,seq->qualIndex[j]);*/
|
|
|
|
/* if the quality index exceeds the cutoff ... count it as a problem area*/
|
|
if (seq->qualIndex[j]>cutoff) {
|
|
num_problems++;
|
|
if (num_problems==1) first_problem_base=j;
|
|
}
|
|
else num_problems=0;
|
|
|
|
|
|
|
|
/* printf("num_problems is %d\n",num_problems);*/
|
|
|
|
/*
|
|
* if we have reached the num_allowed_problems over
|
|
* consecutive bases...then go ahead and assign
|
|
* the right cutoff to the point where the start
|
|
* of the problem bases was found
|
|
*/
|
|
if (num_problems==num_allowed_problems ) {
|
|
rightCutoff=j-num_problems*STEP_SIZE;
|
|
}
|
|
}
|
|
|
|
if (num_problems<num_allowed_problems) {
|
|
rightCutoff=LAST_ALLOWED_BASE;
|
|
if (rightCutoff>num_bases) rightCutoff=num_bases;
|
|
}
|
|
|
|
/* printf("rightcutoff is %d\n",rightCutoff);*/
|
|
|
|
/*
|
|
* now go check the other quality measure from this cutoff
|
|
* base backwards....checking that we are not exceeding
|
|
* that rule...or could just take the more conservative
|
|
* of the two estimates....except then we're sometimes
|
|
* actually finding the left cutoff
|
|
*/
|
|
|
|
SeqQual_nonCalledOverCalled(seq);
|
|
|
|
for (i=rightCutoff; i>seq->leftCutoff; i-=8) {
|
|
/* printf("other seq->qualIndex[%d] is %5.3f\n",i,seq->qualIndex[i]);*/
|
|
/* If two consecutive regions are good using the noncalled over called
|
|
cutoff, then go ahead and set the right cutoff there */
|
|
if (seq->qualIndex[i]<NONCALLED_OVER_CALLED_CUTOFF && seq->qualIndex[i-STEP_SIZE]<NONCALLED_OVER_CALLED_CUTOFF) break;
|
|
}
|
|
rightCutoff=i;
|
|
|
|
|
|
|
|
/* ABSOLUTE CUTOFF IS LAST_ALLOWED_BASE */
|
|
if (rightCutoff>LAST_ALLOWED_BASE) rightCutoff=LAST_ALLOWED_BASE;
|
|
|
|
|
|
/* printf("rightcutoff is %d\n",rightCutoff); */
|
|
|
|
/*
|
|
* remember that the right cutoff in the trace structure is not
|
|
* the base position of the right cutoff...rather it is num_bases
|
|
* minus that position
|
|
*/
|
|
|
|
rightCutoff=num_bases-rightCutoff;
|
|
|
|
return(rightCutoff);
|
|
}
|
|
|
|
void findLeftQualCutoff(Seq seq,int *start_point)
|
|
{ int j;
|
|
/*
|
|
* go calculate the quality measure ... this will take care
|
|
* of calculating it both for the the left and the right
|
|
*/
|
|
|
|
/* had already called this for the overall trace quality */
|
|
/* SeqQual_nonCalledOverCalled(seq);*/
|
|
|
|
/*
|
|
* start looking at start_point+STEP_SIZE because you really do not
|
|
* want to look at what comes before the cutoff. You only care
|
|
* about what is after the cutoff...so in essence you are making
|
|
* your window centered on the left foot of the rectangle window rather
|
|
* than the center
|
|
*/
|
|
|
|
for (j=*start_point+STEP_SIZE; j< seq->NorigBases; j+=STEP_SIZE) {
|
|
/* printf("seq->qualIndex[%d] is %4.3f\n",j,seq->qualIndex[j]);*/
|
|
if (seq->qualIndex[j]<NONCALLED_OVER_CALLED_CUTOFF) {
|
|
*start_point=j-STEP_SIZE;
|
|
return;
|
|
}
|
|
}
|
|
|
|
*start_point=j;
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
/*
|
|
* MODULE SeqQual17 - SeqQual_sideband
|
|
*
|
|
* for the called base, take the ratio of the value of that
|
|
* trace 1/2 of the way between this base and the next base
|
|
* over the value of the trace at its peak...
|
|
* compare that to the ratio of the value of that trace 1/2 or
|
|
* the way between this base and the previous base
|
|
* over the value of the trace at its peak...take the worst ratio
|
|
* and average that over the 8 bases before and after this base
|
|
*
|
|
*/
|
|
|
|
void SeqQual_sideband(Seq seq1)
|
|
{
|
|
int i;
|
|
int pos;
|
|
int one_half_for;
|
|
int one_half_back;
|
|
int half_window_size=8; /* 8 */
|
|
int start_sum,end_sum;
|
|
int j;
|
|
/* int one_half_pos; */
|
|
float forward,backward;
|
|
|
|
for (i=0; i<seq1->NorigBases; i++)
|
|
seq1->qualIndex[i] = 0.0;
|
|
seq1->qualType=17;
|
|
|
|
|
|
for (j=0; j< seq1->NorigBases; j++) {
|
|
|
|
end_sum=j+half_window_size;
|
|
start_sum=j-half_window_size;
|
|
|
|
|
|
if (end_sum>=seq1->NorigBases) end_sum=(seq1->NorigBases)-2;
|
|
if (start_sum<0) start_sum=1;
|
|
|
|
for (i=start_sum; i<=end_sum; i++) {
|
|
|
|
/*
|
|
one_half_for=(int)((seq1->basePos)[i]+((seq1->basePos)[i+1] - (seq1->basePos[i]))/2);
|
|
one_half_back=(int)((seq1->basePos)[i]-((seq1->basePos)[i] - (seq1->basePos[i-1]))/2);
|
|
|
|
*/
|
|
one_half_for = one_half_forwards(seq1,i);
|
|
one_half_back = one_half_backwards(seq1,i);
|
|
|
|
pos = (seq1->basePos)[i];
|
|
|
|
switch ((seq1->base)[i]) {
|
|
case 'A':
|
|
case 'a':
|
|
forward=(float)seq1->traceA[one_half_for]/(float)seq1->traceA[pos];
|
|
backward=(float)seq1->traceA[one_half_back]/(float)seq1->traceA[pos];
|
|
if (forward>backward)
|
|
seq1->qualIndex[j] += forward;
|
|
else
|
|
seq1->qualIndex[j] += backward;
|
|
break;
|
|
case 'C':
|
|
case 'c':
|
|
forward=(float)seq1->traceC[one_half_for]/(float)seq1->traceC[pos];
|
|
backward=(float)seq1->traceC[one_half_back]/(float)seq1->traceC[pos];
|
|
if (forward>backward)
|
|
seq1->qualIndex[j] += forward;
|
|
else
|
|
seq1->qualIndex[j] += backward;
|
|
|
|
break;
|
|
case 'G':
|
|
case 'g':
|
|
forward=(float)seq1->traceG[one_half_for]/(float)seq1->traceG[pos];
|
|
backward=(float)seq1->traceG[one_half_back]/(float)seq1->traceG[pos];
|
|
if (forward>backward)
|
|
seq1->qualIndex[j] += forward;
|
|
else
|
|
seq1->qualIndex[j] += backward;
|
|
|
|
break;
|
|
case 'T':
|
|
case 't':
|
|
forward=(float)seq1->traceT[one_half_for]/(float)seq1->traceT[pos];
|
|
backward=(float)seq1->traceT[one_half_back]/(float)seq1->traceT[pos];
|
|
if (forward>backward)
|
|
seq1->qualIndex[j] += forward;
|
|
else
|
|
seq1->qualIndex[j] += backward;
|
|
|
|
break;
|
|
default:
|
|
(seq1->qualIndex)[j] += 1.0;
|
|
break;
|
|
}
|
|
}
|
|
/* seq1->qualIndex[j] = seq1->qualIndex[j]/(half_window_size*2+1);*/
|
|
seq1->qualIndex[j] = seq1->qualIndex[j]/(end_sum-start_sum+1);
|
|
}
|
|
|
|
seq1->qualType = 17; /* identify quality index as type 16 */
|
|
}
|
|
|
|
|
|
/*
|
|
* MODULE SeqQual15 SeqQual_nonCalledOverCalled
|
|
*
|
|
*
|
|
* center the window at base N, look to either side of base N,
|
|
* by window_size bases/2
|
|
*
|
|
* area of the called divided by the area of the max non-called
|
|
* for range from base N-window_size/2 to N+window_size/2
|
|
* between this base and the next base
|
|
*
|
|
* So find the area for each base....divide area of the called
|
|
* by other max area
|
|
*/
|
|
|
|
float max_area ( TRACE *tx, TRACE *ty, TRACE *tz , int stp, int endp);
|
|
float get_area( TRACE *trace, int startp, int endp);
|
|
|
|
void SeqQual_nonCalledOverCalled(Seq seq1)
|
|
{
|
|
int i,j;
|
|
int pos,start_pos,end_pos;
|
|
int half_window_size=8; /* look at three bases on either side */
|
|
float max_area();
|
|
float get_area();
|
|
int end_sum,start_sum;
|
|
|
|
for (i=0; i<seq1->NorigBases; i++)
|
|
seq1->qualIndex[i] = 0.0;
|
|
seq1->qualType=15;
|
|
|
|
for (j=0; j< seq1->NorigBases; j++) {
|
|
|
|
end_sum=j+half_window_size;
|
|
start_sum=j-half_window_size;
|
|
|
|
if (end_sum>=seq1->NorigBases) end_sum=(seq1->NorigBases)-1;
|
|
if (start_sum<0) start_sum=0;
|
|
|
|
for (i=start_sum; i<=end_sum; i++) {
|
|
pos = (seq1->basePos)[i];
|
|
|
|
/*
|
|
start_pos=(seq1->basePos)[i]-((seq1->basePos)[i] - (seq1->basePos[i-1]))/2;
|
|
end_pos=(seq1->basePos)[i]+((seq1->basePos)[i+1] - (seq1->basePos[i]))/2;
|
|
*/
|
|
start_pos = one_half_backwards(seq1,i);
|
|
end_pos = one_half_forwards(seq1,i);
|
|
|
|
|
|
switch ((seq1->base)[i]) {
|
|
case 'A':
|
|
case 'a':
|
|
seq1->qualIndex[j] +=
|
|
(max_area (seq1->traceC, seq1->traceG, seq1->traceT,start_pos,end_pos)/
|
|
get_area(seq1->traceA,start_pos,end_pos));
|
|
break;
|
|
case 'C':
|
|
case 'c':
|
|
seq1->qualIndex[j] +=
|
|
(max_area(seq1->traceA, seq1->traceG, seq1->traceT,start_pos,end_pos) /
|
|
get_area(seq1->traceC,start_pos,end_pos));
|
|
break;
|
|
case 'G':
|
|
case 'g':
|
|
seq1->qualIndex[j] +=
|
|
(max_area(seq1->traceC, seq1->traceA,seq1->traceT,start_pos,end_pos) /
|
|
get_area(seq1->traceG,start_pos,end_pos));
|
|
break;
|
|
case 'T':
|
|
case 't':
|
|
seq1->qualIndex[j] +=
|
|
(max_area(seq1->traceC, seq1->traceG, seq1->traceA,start_pos,end_pos) /
|
|
get_area(seq1->traceT,start_pos,end_pos));
|
|
break;
|
|
default:
|
|
(seq1->qualIndex)[j] += 1.0;
|
|
break;
|
|
}
|
|
}
|
|
/* seq1->qualIndex[j] = seq1->qualIndex[j]/(half_window_size*2+1);*/
|
|
seq1->qualIndex[j] = seq1->qualIndex[j]/(end_sum-start_sum+1);
|
|
}
|
|
|
|
seq1->qualType = 15; /* identify quality index as type 15 */
|
|
}
|
|
|
|
float get_area(TRACE*trace, int startp, int endp)
|
|
{ int i;
|
|
float sum=0;
|
|
|
|
for (i=startp; i<endp; i++)
|
|
sum += trace[i];
|
|
|
|
return(sum);
|
|
}
|
|
|
|
|
|
float max_area ( TRACE *tx, TRACE *ty, TRACE *tz , int stp, int endp)
|
|
{
|
|
float x,y,z,max;
|
|
x = get_area(tx,stp,endp);
|
|
y = get_area(ty,stp,endp);
|
|
z = get_area(tz,stp,endp);
|
|
|
|
if (x > y) {
|
|
if (z > x) {
|
|
max = z;
|
|
} else {
|
|
max = x;
|
|
}
|
|
} else {
|
|
if (z > y) {
|
|
max = z;
|
|
} else {
|
|
max = y;
|
|
}
|
|
}
|
|
|
|
return max;
|
|
}
|
|
|
|
#endif /*QUAL_CODE*/
|