gde_linux/HGL_SRC/HGLfuncs.c
2022-03-07 20:43:05 +00:00

3165 lines
71 KiB
C
Executable file

/****************************************************************
*
* This is a set of functions defined for the genome
* project.
*
****************************************************************/
/* Pull in the project-wide definitions, guarded by _GLOBAL_DEFS_H. */
#ifndef _GLOBAL_DEFS_H
#define _GLOBAL_DEFS_H
#include "global_defs.h"
#endif
/* Size of the line buffer used by ReadGDEtoHGL(). */
#define MAXLINELEN 256
/*
 * Maps a 4-bit base code (the array index) to its lowercase character.
 * Index 1 = 'a', 2 = 'c', 4 = 'g', 8 = 't'; index 0 is the gap '-'; every
 * other index holds the IUPAC ambiguity letter for the bitwise OR of those
 * single-base codes (e.g. 3 = a|c = 'm', 15 = 'n').
 */
static char Default_DNA_Trans[16] = {
'-', 'a','c','m','g','r','s','v','t','w','y','h','k','d','b','n' };
/***********
*
* WriteRecord() outputs one record at a time in HGL format.
* Only the fields in the fields_array will be output. All the
* fields will be output if fields_array is NULL.
*
* fp : pointer to the output file.
* tSeq: pointer to the record.
* fields_array: contains the field ids of the selected fields.
* array_size: number of selected fields.
*
* Returns: 1 if any field is printed;
* 0 if no field is printed;
* -1 if anything is wrong.
*
**********/
int
WriteRecord(fp, tSeq, fields_array, array_size)
FILE *fp;
const Sequence *tSeq;
int *fields_array;
int array_size;
{
int i, save_str_size, tt;
int all_fields = FALSE;
int first_field = TRUE;
char temp_str[256];
char *save_str;
char *ptr;
save_str = (char *)Calloc(256, 1);
save_str_size = 256;
/* When all the fields are selected. */
if(fields_array == NULL)
{
all_fields = TRUE;
fields_array = (int *)Calloc(NUM_OF_FIELDS, sizeof(int));
for(i=0; i<NUM_OF_FIELDS; i++)
{
fields_array[i] = i;
}
array_size = NUM_OF_FIELDS;
}
for (i = 0; i < array_size; i++)
{
save_str[0]='\0';
if (fields_array[i] == e_creation_date &&
tSeq->creation_date[0] != 0 )
{
sprintf(save_str,"\n%s\t%d/%d/%d ",
at[fields_array[i]],
tSeq->creation_date[1],
tSeq->creation_date[2],
tSeq->creation_date[0]);
if(tSeq->creation_date[3]>=0)
{
if(tSeq->creation_date[4] < 0)
tSeq->creation_date[4] = 0;
if(tSeq->creation_date[5] < 0)
tSeq->creation_date[5] = 0;
sprintf(save_str, "%s%d:%d:%d",
save_str,
tSeq->creation_date[3],
tSeq->creation_date[4],
tSeq->creation_date[5]);
}
}
else if (fields_array[i] == e_probing_date &&
tSeq->probing_date[0] != 0 )
{
sprintf(save_str,"\n%s\t%d/%d/%d ",
at[fields_array[i]],
tSeq->probing_date[1],
tSeq->probing_date[2],
tSeq->probing_date[0]);
if(tSeq->probing_date[3]>=0)
{
if(tSeq->probing_date[4] < 0)
tSeq->probing_date[4] = 0;
if(tSeq->probing_date[5] < 0)
tSeq->probing_date[5] = 0;
sprintf(save_str, "%s%d:%d:%d",
save_str,
tSeq->probing_date[3],
tSeq->probing_date[4],
tSeq->probing_date[5]);
}
}
else if (fields_array[i] == e_autorad_date &&
tSeq->autorad_date[0] != 0 )
{
sprintf(save_str,"\n%s\t%d/%d/%d ",
at[fields_array[i]],
tSeq->autorad_date[1],
tSeq->autorad_date[2],
tSeq->autorad_date[0]);
if(tSeq->autorad_date[3]>=0)
{
if(tSeq->autorad_date[4] < 0)
tSeq->autorad_date[4] = 0;
if(tSeq->autorad_date[5] < 0)
tSeq->autorad_date[5] = 0;
sprintf(save_str, "%s%d:%d:%d",
save_str,
tSeq->autorad_date[3],
tSeq->autorad_date[4],
tSeq->autorad_date[5]);
}
}
else if ( fields_array[i] == e_c_elem &&
tSeq->c_elem != NULL )
{
ptr = tSeq->c_elem;
sprintf(save_str,"\n%s\t\"",at[fields_array[i]]);
while ( ptr < tSeq->c_elem + tSeq->seqlen )
{
if ( ptr != tSeq->c_elem )
strcat(save_str,"\n");
strncpy(temp_str, ptr, MIN(60, tSeq->c_elem +tSeq->seqlen-ptr));
temp_str[MIN(60, tSeq->c_elem+tSeq->seqlen - ptr)] = '\0';
/* Gurantee strlen(temp_str) chars for the string,
* one for \n, one for ", and one for \0.
*/
while(save_str_size - strlen(save_str) < strlen(temp_str)+3)
{
save_str_size *= 2;
save_str = (char *)Realloc(save_str,save_str_size);
}
strcat(save_str, temp_str);
ptr += 60;
}
strcat(save_str,"\"");
}
else if ( fields_array[i] == e_comments &&
tSeq->commentslen != 0)
{
while(save_str_size < 20+tSeq->commentslen)
{
save_str_size *= 2;
save_str = (char *)Realloc(save_str,save_str_size);
}
strcat(save_str,"\n");
strcat(save_str,at[fields_array[i]]);
strcat(save_str,"\t\"\n");
/* put a \0 at the end of comments. */
while(tSeq->commentslen + 1 > tSeq->commentsmaxlen)
{
tSeq->commentsmaxlen *= 2;
tSeq->comments = (char *)
Realloc(tSeq->comments,
tSeq->commentsmaxlen);
}
tSeq->comments[tSeq->commentslen] = '\0';
/* clean up the leading empty lines.*/
tt = 0;
while(tSeq->comments[tt] == '\n' || tSeq->comments[tt] == ' ')
tt++;
tSeq->commentslen -= tt;
strcat(save_str,tSeq->comments+tt);
strcat(save_str,"\"");
}
else if (fields_array[i] == e_laneset && tSeq->laneset != -1)
sprintf(save_str,"\n%s\t\t%d",
at[fields_array[i]],tSeq->laneset);
else if (fields_array[i] == e_strandedness && tSeq->strandedness != 0)
sprintf(save_str,"\n%s\t%d",
at[fields_array[i]],tSeq->strandedness);
else if (fields_array[i] == e_direction && tSeq->direction != 0)
sprintf(save_str,"\n%s\t%d",
at[fields_array[i]],tSeq->direction);
else if (fields_array[i] == e_orig_strand && tSeq->orig_strand != 0)
sprintf(save_str,"\n%s\t%d",
at[fields_array[i]],tSeq->orig_strand);
else if (fields_array[i] == e_orig_direction && tSeq->orig_direction != 0)
sprintf(save_str,"\n%s\t%d",
at[fields_array[i]],tSeq->orig_direction);
else if (fields_array[i] == e_offset)
sprintf(save_str,"\n%s\t\t%d",
at[fields_array[i]],tSeq->offset);
else if (fields_array[i] == e_group_number && tSeq->group_number != 0)
sprintf(save_str,"\n%s\t%d",
at[fields_array[i]],tSeq->group_number);
else if (fields_array[i] == e_group_ID)
sprintf(save_str,"\n%s\t%d",
at[fields_array[i]],tSeq->group_ID);
else if (fields_array[i] == e_type && tSeq->type[0] != '\0' )
sprintf(save_str,"\n%s\t\t\"%s\"",
at[fields_array[i]],tSeq->type);
else if (fields_array[i] == e_barcode && tSeq->barcode[0] != '\0' )
sprintf(save_str,"\n%s\t\t\"%s\"",
at[fields_array[i]],tSeq->barcode);
else if (fields_array[i] == e_name && tSeq->name[0] != '\0' )
sprintf(save_str,"\n%s\t\t\"%s\"",
at[fields_array[i]],tSeq->name);
else if (fields_array[i] == e_status && tSeq->status[0] != '\0' )
sprintf(save_str,"\n%s\t\t\"%s\"",
at[fields_array[i]],tSeq->status);
else if (fields_array[i] == e_walk && tSeq->walk[0] != '\0' )
sprintf(save_str,"\n%s\t\t\"%s\"",
at[fields_array[i]],tSeq->walk);
else if (fields_array[i] == e_sequence_ID &&
tSeq->sequence_ID[0] != '\0' )
sprintf(save_str,"\n%s\t\"%s\"",
at[fields_array[i]],tSeq->sequence_ID);
else if (fields_array[i] == e_creator && tSeq->creator[0] != '\0')
sprintf(save_str,"\n%s\t\t\"%s\"",
at[fields_array[i]],tSeq->creator);
else if (fields_array[i]==e_film && tSeq->film[0]!='\0')
sprintf(save_str,"\n%s\t\t\"%s\"",
at[fields_array[i]],tSeq->film);
else if (fields_array[i] == e_membrane && tSeq->membrane[0] != '\0')
sprintf(save_str,"\n%s\t\"%s\"",
at[fields_array[i]],tSeq->membrane);
else if (fields_array[i] == e_source_ID && tSeq->source_ID[0] != '\0')
sprintf(save_str,"\n%s\t\"%s\"",
at[fields_array[i]],tSeq->source_ID);
else if (fields_array[i] == e_contig && tSeq->contig[0] != '\0')
sprintf(save_str,"\n%s\t\t\"%s\"",
at[fields_array[i]],tSeq->contig);
else if (fields_array[i] == e_baggage && tSeq->baglen != 0)
{
if(save_str_size < tSeq->baglen+2)
{
save_str_size = tSeq->baglen+2;
save_str = (char *)Realloc(save_str,save_str_size);
}
save_str[0] = '\n';
save_str[1] = '\0';
/* put a \0 at the end of baggage. */
strncat(save_str, tSeq->baggage, tSeq->baglen);
while(save_str[tSeq->baglen-1] == '\n')
{
tSeq->baglen--;
}
save_str[tSeq->baglen] = '\0';
}
if(save_str[0] != '\0')
{
if (first_field == TRUE)
{
first_field = FALSE;
fprintf(fp,"{");
}
fprintf(fp,"%s",save_str);
}
}
if (first_field == FALSE)
{
fprintf(fp,"\n}\n");
}
if(all_fields == TRUE && fields_array != NULL)
{
Cfree(fields_array);
fields_array = NULL;
}
if(save_str != NULL)
{
Cfree(save_str);
save_str = NULL;
}
if (first_field == TRUE)
return 0;
else
return 1;
}
/*********
*
* ReadRecord() reads one record from fp into tSeq. fp remains at
* the finishing position so that next time when ReadRecord() is
* called, it reads the next record.
*
* The caller must ALLOCATE the Sequence structure itself before
* calling. tSeq is re-initialised with InitRecord(), so any buffers
* it still points to are not freed here.
*
* A record starts on a line ending in '{' and ends on a line ending
* in '}'. Lines outside a record are skipped. Lines whose tag is not
* recognised are appended verbatim to tSeq->baggage.
*
* ReadRecord() returns:
* TRUE if no error;
* FALSE if anything is wrong
* -1 if end-of-file is reached
*
**********/

/* Index of the last char that is not a blank, tab, comma or newline
 * (-1 when the line holds nothing else). */
static int
LastSignificantChar(const char *line)
{
    int end = (int)strlen(line) - 1;

    while (end >= 0 && (line[end] == ' ' ||
                        line[end] == '\t' ||
                        line[end] == ',' ||
                        line[end] == '\n'))
        end--;
    return end;
}

/* Stores at most 31 characters of src in a fixed-size string field. */
static void
StoreStringField(char *dst, const char *src, int len)
{
    if (len > 31)
        len = 31;
    strncpy(dst, src, len);
    dst[len] = '\0';
}

int
ReadRecord(fp, tSeq)
FILE *fp;
Sequence *tSeq;
{
    char field_name[20], line[256], orig_line[256];
    int temp_str_size, start, end, l, taglen, max_len = 255;
    char *fgets_ret, *temp_str, *fgets_ret1;
    int start_rec = FALSE;
    int need_to_read = TRUE;
    char started = 'F';
    void InitRecord();

    temp_str = (char *)Calloc(256, 1);
    temp_str_size = 256;
    InitRecord(tSeq);
    if (tSeq->c_elem == NULL)
    {
        tSeq->c_elem = (char *)Calloc(256, 1);
        tSeq->seqmaxlen = 256;
    }
    tSeq->c_elem[0] = '\0';

    /* read file line-by-line. */
    while (need_to_read == TRUE &&
           ((fgets_ret = fgets(line, max_len, fp)) != NULL ||
            start_rec == TRUE))
    {
        /* End of file inside an open record: `line` is stale, so stop
         * here instead of re-processing it (a stale blank line used to
         * loop forever). */
        if (fgets_ret == NULL)
        {
            Cfree(temp_str);
            return -1;
        }
        strcpy(orig_line, line);
        end = LastSignificantChar(line);
        /* ignore empty lines. */
        if (end == -1)
            continue;
        if (line[end] == '{')
            started = 'T';
        /* to ignore the lines between a } and a {. */
        while (started == 'F' && fgets_ret != NULL)
        {
            fgets_ret = fgets(line, max_len, fp);
            strcpy(orig_line, line);
            end = LastSignificantChar(line);
            /* ignore empty lines. */
            if (end == -1)
                continue;
            if (line[end] == '{')
                started = 'T';
        }
        if (fgets_ret == NULL)
        {
            Cfree(temp_str);
            return -1;
        }

        if ((line[end] == '}') && (end == 0))
        {
            /* A lone '}' closes the record. */
            start_rec = FALSE;
            need_to_read = FALSE;
        }
        else if (line[end] == '{' && end <= 10)
        {
            start_rec = TRUE;
        }
        else
        {
            /* A field line; a trailing '}' also closes the record. */
            if (line[end] == '}')
            {
                need_to_read = FALSE;
                start_rec = FALSE;
            }
            /* locate the tag. */
            start = 0;
            while (line[start] == ' ' ||
                   line[start] == '\t' ||
                   line[start] == '\n' ||
                   line[start] == '{')
                start++;
            /* Scan from start itself so an exhausted line cannot step
             * past its terminator. */
            end = start;
            while (line[end] != ' ' &&
                   line[end] != '\t' &&
                   line[end] != '\n' &&
                   line[end] != '\0')
                end++;
            /* Over-long tags are clipped to the buffer; a clipped tag can
             * match none of the known names and ends up in the baggage. */
            taglen = end - start;
            if (taglen > (int)sizeof(field_name) - 1)
                taglen = (int)sizeof(field_name) - 1;
            strncpy(field_name, line + start, taglen);
            field_name[taglen] = '\0';

            /* process the field value. */
            /*
             * creation_date, probing_date, or autorad_date
             *
             * NOTE(review): strToDate() reports failure with FALSE, so the
             * "== -1" tests below never fire and invalid dates are accepted
             * as parsed. Left unchanged so that records which load today
             * keep loading.
             */
            if (strcmp(field_name, "creation-date") == 0)
            {
                while (line[end] != '\0' &&
                       !isdigit((unsigned char)line[end]))
                    end++;
                if (line[end] != '\0' &&
                    strToDate(line + end, tSeq->creation_date) == -1)
                {
                    Cfree(temp_str);
                    return FALSE;
                }
            }
            else if (strcmp(field_name, "probing-date") == 0)
            {
                while (line[end] != '\0' &&
                       !isdigit((unsigned char)line[end]))
                    end++;
                if (line[end] != '\0' &&
                    strToDate(line + end, tSeq->probing_date) == -1)
                {
                    Cfree(temp_str);
                    return FALSE;
                }
            }
            else if (strcmp(field_name, "autorad-date") == 0)
            {
                while (line[end] != '\0' &&
                       !isdigit((unsigned char)line[end]))
                    end++;
                if (line[end] != '\0' &&
                    strToDate(line + end, tSeq->autorad_date) == -1)
                {
                    Cfree(temp_str);
                    return FALSE;
                }
            }
            /*
             * sequence or comments.
             */
            else if (strcmp(field_name, "sequence") == 0 ||
                     strcmp(field_name, "comments") == 0)
            {
                temp_str[0] = '\0';
                /* locate the first ". */
                while (line[end] != '\0' && line[end] != '"')
                    end++;
                if (line[end] == '\0')
                {
                    fprintf(stderr,
                            "ReadRecord(): missing opening quote.\n");
                    Cfree(temp_str);
                    return FALSE;
                }
                end++;
                start = end;
                end = strlen(line);
                /* ---"\n\0. */
                if (line[end - 2] == '"')
                    end -= 2;
                else if (line[end - 1] == '\n' &&
                         strcmp(field_name, "sequence") == 0)
                    end--;
                while (temp_str_size < end - start + 1)
                {
                    temp_str_size *= 2;
                    temp_str = (char *)Realloc(temp_str, temp_str_size);
                }
                if (end - start > 0)
                    strncat(temp_str, line + start, end - start);
                /* Read the following lines of the seq. or comments, if any.
                   end-start<0 is the case that " is the only char this line.*/
                if (line[strlen(line) - 2] != '"' || end - start < 0)
                {
                    while ((fgets_ret1 = fgets(line, max_len, fp)) != NULL)
                    {
                        /* IGNORE empty lines. 5/4/92 */
                        int empty_line = 0;
                        while (line[empty_line] == ' ')
                            empty_line++;
                        if (line[empty_line] == '\n')
                            continue;
                        l = strlen(line) - 1;
                        /* l > 0 keeps line[l-1] inside the buffer for a
                         * one-character last line. */
                        if (l > 0 && line[l - 1] == '"')
                            end = l - 1;
                        else
                            end = l;
                        /* Comments keep their line breaks. */
                        if (line[end] == '\n' &&
                            strcmp(field_name, "comments") == 0)
                            end++;
                        /* Guarantee 'end' chars for the string, one for ",
                         * and one for \0.
                         */
                        while (temp_str_size - strlen(temp_str) < end + 3)
                        {
                            temp_str_size *= 2;
                            temp_str = (char *)Realloc(temp_str, temp_str_size);
                        }
                        strncat(temp_str, line, end);
                        if (l > 0 && line[l - 1] == '"')
                            break;
                    }
                    if (fgets_ret1 == NULL && need_to_read == TRUE)
                    {
                        fprintf(stderr, "ReadRecord(): incomplete record.\n");
                        Cfree(temp_str);
                        return FALSE;
                    }
                }
                l = strlen(temp_str);
                if (strcmp(field_name, "comments") == 0)
                {
                    if (tSeq->commentsmaxlen == 0)
                    {
                        tSeq->comments = (char *)Calloc(l + 1, 1);
                        tSeq->commentsmaxlen = l + 1;
                    }
                    else
                    {
                        while (tSeq->commentslen + l + 1 > tSeq->commentsmaxlen)
                        {
                            tSeq->commentsmaxlen *= 2;
                            tSeq->comments = (char *)
                                Realloc(tSeq->comments, tSeq->commentsmaxlen);
                        }
                    }
                    tSeq->comments[tSeq->commentslen] = '\0';
                    strcat(tSeq->comments, temp_str);
                    tSeq->commentslen += l;
                }
                else /* it is the sequence. */
                {
                    if (tSeq->seqmaxlen == 0)
                    {
                        tSeq->c_elem = (char *)Calloc(l + 1, 1);
                    }
                    else if (l + 1 > tSeq->seqmaxlen)
                    {
                        tSeq->c_elem = (char *)Realloc(tSeq->c_elem, l + 1);
                    }
                    tSeq->seqmaxlen = l + 1;
                    tSeq->seqlen = l;
                    strcpy(tSeq->c_elem, temp_str);
                }
            }
            /*
             * Integer or String.
             */
            else
            {
                /* locate the value: a string or an integer. */
                while (line[end] == ' ' || line[end] == '\t')
                    end++;
                if (line[end] == '"')
                {
                    /* It is a string. */
                    end++;
                    start = end;
                    while (line[end] != '\0' && line[end] != '"')
                        end++;
                    /* Terminate at the closing quote; end-start then
                     * covers the text plus this terminator. */
                    line[end++] = '\0';
                    /* Keep temp_str current so a quoted integer is not
                     * parsed from a previous line's value. */
                    strcpy(temp_str, line + start);
                }
                else
                {
                    /* It is an integer. */
                    start = end;
                    while (line[end] != ' ' &&
                           line[end] != '\t' &&
                           line[end] != '\n' &&
                           line[end] != '\0')
                        end++;
                    strncpy(temp_str, line + start, end - start + 1); /*4/26 add 1*/
                    temp_str[end - start] = '\0';
                }
                /* assign to an integer field. */
                if (strcmp(field_name, "laneset") == 0)
                    tSeq->laneset = atoi(temp_str);
                else if (strcmp(field_name, "strandedness") == 0)
                    tSeq->strandedness = atoi(temp_str);
                else if (strcmp(field_name, "direction") == 0)
                    tSeq->direction = atoi(temp_str);
                else if (strcmp(field_name, "orig_strand") == 0)
                    tSeq->orig_strand = atoi(temp_str);
                else if (strcmp(field_name, "orig_direction") == 0)
                    tSeq->orig_direction = atoi(temp_str);
                else if (strcmp(field_name, "offset") == 0)
                    tSeq->offset = atoi(temp_str);
                else if (strcmp(field_name, "group-number") == 0)
                    tSeq->group_number = atoi(temp_str);
                else if (strcmp(field_name, "group-ID") == 0)
                    tSeq->group_ID = atoi(temp_str);
                /* assign to a string field. */
                else if (strcmp(field_name, "type") == 0)
                    StoreStringField(tSeq->type, line + start, end - start);
                else if (strcmp(field_name, "barcode") == 0)
                    StoreStringField(tSeq->barcode, line + start, end - start);
                else if (strcmp(field_name, "name") == 0)
                    StoreStringField(tSeq->name, line + start, end - start);
                else if (strcmp(field_name, "status") == 0)
                    StoreStringField(tSeq->status, line + start, end - start);
                else if (strcmp(field_name, "walk") == 0)
                    StoreStringField(tSeq->walk, line + start, end - start);
                else if (strcmp(field_name, "sequence-ID") == 0)
                    StoreStringField(tSeq->sequence_ID, line + start, end - start);
                else if (strcmp(field_name, "creator") == 0)
                    StoreStringField(tSeq->creator, line + start, end - start);
                else if (strcmp(field_name, "film") == 0)
                    StoreStringField(tSeq->film, line + start, end - start);
                else if (strcmp(field_name, "membrane") == 0)
                    StoreStringField(tSeq->membrane, line + start, end - start);
                else if (strcmp(field_name, "source-ID") == 0)
                    StoreStringField(tSeq->source_ID, line + start, end - start);
                else if (strcmp(field_name, "contig") == 0)
                    StoreStringField(tSeq->contig, line + start, end - start);
                else
                {
                    /* Unknown tag: keep the untouched line as baggage. */
                    if (tSeq->bagmaxlen == 0)
                    {
                        tSeq->bagmaxlen = 4 * strlen(orig_line);
                        tSeq->baggage = (char *)Calloc(tSeq->bagmaxlen, 1);
                    }
                    else
                    {
                        while (tSeq->bagmaxlen < tSeq->baglen + 2 + strlen(orig_line))
                        {
                            tSeq->bagmaxlen *= 2;
                            tSeq->baggage = (char *)
                                Realloc(tSeq->baggage, tSeq->bagmaxlen);
                        }
                    }
                    if (tSeq->baglen == 0)
                        tSeq->baggage[0] = '\0';
                    strcat(tSeq->baggage, orig_line);
                    tSeq->baglen += strlen(orig_line);
                }
            }
        }
    }

    if (temp_str != NULL)
    {
        Cfree(temp_str);
        temp_str = NULL;
    }
    if (start_rec == FALSE && fgets_ret == NULL)
    {
        /* end of file, did not get a record. */
        return -1;
    }
    else
        return TRUE;
}
/*********
*
* InitRecord() puts a record into its default, empty state.
*
* Note: no memory is allocated, and nothing is freed either: the
* c_elem, comments and baggage pointers are simply set to NULL.
*
**********/
void
InitRecord(tSeq)
Sequence *tSeq;
{
    int k;

    /* Variable-length members start out unallocated. */
    tSeq->c_elem = NULL;
    tSeq->seqlen = 0;
    tSeq->seqmaxlen = 0;
    tSeq->comments = NULL;
    tSeq->commentslen = 0;
    tSeq->commentsmaxlen = 0;
    tSeq->baggage = NULL;
    tSeq->baglen = 0;
    tSeq->bagmaxlen = 0;

    /* String fields: only type and walk have non-empty defaults. */
    strcpy(tSeq->type, "DNA");
    strcpy(tSeq->walk, "FALSE");
    tSeq->barcode[0] = '\0';
    tSeq->name[0] = '\0';
    tSeq->status[0] = '\0';
    tSeq->sequence_ID[0] = '\0';
    tSeq->creator[0] = '\0';
    tSeq->film[0] = '\0';
    tSeq->membrane[0] = '\0';
    tSeq->source_ID[0] = '\0';
    tSeq->contig[0] = '\0';

    /* All six components of every date are zeroed. */
    k = 0;
    while (k < 6)
    {
        tSeq->creation_date[k] = 0;
        tSeq->probing_date[k] = 0;
        tSeq->autorad_date[k] = 0;
        k++;
    }

    /* Numeric fields. */
    tSeq->laneset = -1;
    tSeq->direction = 1;        /* (1/-1/0), default: 5 to 3. */
    tSeq->strandedness = 1;     /* (1/2/0), default: primary. */
    tSeq->orig_direction = 0;   /* (0 unknown, -1:3'->5', 1:5'->3') */
    tSeq->orig_strand = 0;      /* (0 unknown, 1:primary, 2:secondary) */
    tSeq->offset = 0;
    tSeq->group_number = 0;
    tSeq->group_ID = 0;
}
void
CopyRecord(to, from)
Sequence *from, *to;
{
int i;
InitRecord(to);
strcpy(to->type, from->type);
strcpy(to->barcode, from->barcode);
strcpy(to->name, from->name);
strcpy(to->status,from->status);
strcpy(to->walk,from->walk);
strcpy(to->sequence_ID, from->sequence_ID);
if(from->c_elem != NULL)
{
to->seqlen = from->seqlen;
to->seqmaxlen = from->seqmaxlen;
to->c_elem = (char *)Calloc(to->seqmaxlen, 1);
strncpy(to->c_elem, from->c_elem, to->seqlen);
to->c_elem[to->seqlen] = '\0';
}
for (i = 0; i<6; i++)
{
to->creation_date[i] = from->creation_date[i];
to->probing_date[i] = from->probing_date[i];
to->autorad_date[i] = from->autorad_date[i];
}
strcpy(to->creator, from->creator);
strcpy(to->film, from->film);
strcpy(to->membrane, from->membrane);
strcpy(to->source_ID, from->source_ID);
strcpy(to->contig, from->contig);
to->laneset = from->laneset;
to->strandedness = from->strandedness;
to->orig_direction = from->orig_direction;
to->orig_strand = from->orig_strand;
to->direction = from->direction;
to->offset = from->offset;
if(from->comments != NULL)
{
to->commentsmaxlen = from->commentsmaxlen;
to->commentslen = from->commentslen;
to->comments = (char *)Calloc(to->commentsmaxlen, 1);
strncpy(to->comments, from->comments, to->commentslen);
to->comments[to->commentslen] = '\0';
}
if(from->baggage != NULL)
{
to->baglen = from->baglen;
to->bagmaxlen = from->bagmaxlen;
to->baggage = (char *)Calloc(to->bagmaxlen, 1);
strncpy(to->baggage, from->baggage, to->baglen);
to->baggage[to->baglen] = '\0';
}
to->group_number = from->group_number;
to->group_ID = from->group_ID;
}
/*********
*
* CleanRecord() resets the contents of a record to the defaults while
* keeping its buffers: c_elem, comments and baggage are emptied in
* place (when present) and their capacities are left alone.
*
**********/
void
CleanRecord(tSeq)
Sequence *tSeq;
{
    int k;

    /* Empty the variable-length buffers without releasing them. */
    if (tSeq->c_elem != NULL)
        tSeq->c_elem[0] = '\0';
    tSeq->seqlen = 0;
    if (tSeq->comments != NULL)
        tSeq->comments[0] = '\0';
    tSeq->commentslen = 0;
    if (tSeq->baggage != NULL)
        tSeq->baggage[0] = '\0';
    tSeq->baglen = 0;

    /* String fields: only type and walk have non-empty defaults. */
    strcpy(tSeq->type, "DNA");
    strcpy(tSeq->walk, "FALSE");
    tSeq->name[0] = '\0';
    tSeq->barcode[0] = '\0';
    tSeq->status[0] = '\0';
    tSeq->sequence_ID[0] = '\0';
    tSeq->creator[0] = '\0';
    tSeq->film[0] = '\0';
    tSeq->membrane[0] = '\0';
    tSeq->source_ID[0] = '\0';
    tSeq->contig[0] = '\0';

    /* All six components of every date are zeroed. */
    k = 0;
    while (k < 6)
    {
        tSeq->creation_date[k] = 0;
        tSeq->probing_date[k] = 0;
        tSeq->autorad_date[k] = 0;
        k++;
    }

    /* Numeric fields. */
    tSeq->laneset = -1;
    tSeq->strandedness = 1;     /* (1/2/0), default. primary. */
    tSeq->direction = 1;        /* (1/-1/0),default. 5 to 3. */
    tSeq->orig_direction = 0;
    tSeq->orig_strand = 0;
    tSeq->offset = 0;
    tSeq->group_number = 0;
    tSeq->group_ID = 0;
}
/*********
*
* FreeRecord() releases a record and the buffers it owns, then sets
* the caller's pointer to NULL.
*
* tSeq: address of the pointer to the record. Passing NULL, or the
* address of a NULL pointer, is a no-op.
*
**********/
void
FreeRecord(tSeq)
Sequence **tSeq;
{
    if (tSeq == NULL || *tSeq == NULL)
        return;

    /* Release the members first; the structure must not be touched
     * once it has itself been freed. */
    if ((*tSeq)->c_elem != NULL)
        Cfree((*tSeq)->c_elem);
    if ((*tSeq)->comments != NULL)
        Cfree((*tSeq)->comments);
    if ((*tSeq)->baggage != NULL)
        Cfree((*tSeq)->baggage);
    (*tSeq)->c_elem = NULL;
    (*tSeq)->comments = NULL;
    (*tSeq)->baggage = NULL;

    Cfree((*tSeq));
    (*tSeq) = NULL;
}
/* Fallbacks so this block also builds where the project header does not
 * supply the boolean macros. */
#ifndef TRUE
#define TRUE 1
#endif
#ifndef FALSE
#define FALSE 0
#endif

/* Days per month, indexed [is_leap_year][month 1..12]; slot 0 is unused. */
static int max_day[2][13] = {
{ 0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31},
{ 0, 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31} };
/***********
*
* strToDate() locates up to six integers and translates them
* into a date.
*
* String should have the format of "mm/dd/yy hh:mn:sc xm",
* with any single non-digit character as the delimiter. The time
* part is optional; missing time components are stored as -1.
* "xm" is an optional lowercase "am"/"pm" following the seconds.
*
* Order in the date array is (0->5): (yy mm dd hh mn sc).
* Years are stored relative to 1900: four-digit years (>= 1900) are
* reduced by 1900, anything smaller is taken as already relative.
*
* The parsed values are stored in date[] before they are validated,
* so date[] may hold out-of-range values when FALSE is returned.
*
* Returns FALSE if anything is wrong, TRUE otherwise.
*
**********/
int
strToDate(str, date)
const char *str;
int date[];
{
    int leap, nread, full_year;
    int mm = 0, dd = 0, yy = 0, hh = -1, mn = -1, sc = -1;
    char meridiem[3];           /* two characters from %2s plus '\0' */
    char longstr[256 + 16];     /* clipped input plus the sentinel */

    if (str == NULL || date == NULL)
        return FALSE;

    /* The " -1/-1/-1 " sentinel supplies -1 for any missing time field. */
    meridiem[0] = '\0';
    strncpy(longstr, str, 255);
    longstr[255] = '\0';
    strcat(longstr, " -1/-1/-1 ");
    nread = sscanf(longstr, "%d%*c%d%*c%d%*c%d%*c%d%*c%d%2s",
                   &mm, &dd, &yy, &hh, &mn, &sc, meridiem);
    if (nread < 3)
    {
        fprintf(stderr, "invalid date %s\n", str);
        return FALSE;
    }

    /* Normalise the year. Only real four-digit years are shifted, so a
     * stored value such as 100 (the year 2000) survives a round trip. */
    if (yy >= 1900)
        yy -= 1900;

    /* Normalise the hour to the 24-hour clock. */
    if (strncmp(meridiem, "pm", 2) == 0)
    {
        if (hh >= 0 && hh < 12)
            hh += 12;
    }
    else if (strncmp(meridiem, "am", 2) == 0 && hh == 12)
        hh = 0;

    date[0] = yy;
    date[1] = mm;
    date[2] = dd;
    date[3] = hh;
    date[4] = mn;
    date[5] = sc;

    /* verify month. */
    if (date[1] > 12 || date[1] < 1)
    {
        fprintf(stderr, "invalid month %s\n", str);
        return FALSE;
    }
    /* verify day; the leap-year rule applies to the calendar year. */
    full_year = date[0] + 1900;
    if ((full_year % 4 == 0 && full_year % 100 != 0) ||
        full_year % 400 == 0)
        leap = 1;
    else
        leap = 0;
    if (date[2] > max_day[leap][date[1]] ||
        date[2] < 1)
    {
        fprintf(stderr, "invalid day %s\n", str);
        return FALSE;
    }
    /* verify time (-1 means "not given"). */
    if (date[3] < -1 || date[3] > 23 ||
        date[4] < -1 || date[4] > 59 ||
        date[5] < -1 || date[5] > 59)
    {
        fprintf(stderr, "invalid time %s\n", str);
        return FALSE;
    }
    return TRUE;
}
/**********
*
* Default_IUPAC_Trans() translates an ASCII IUPAC code into its
* position in Default_DNA_Trans, returned as a (char) integer.
*
* The lookup is done on (base | 32), which lowercases letters.
* 'u' is given code 8 and 'p' code 5; neither is in the table.
*
* Returns -1, after reporting on stderr, for any other character
* that is not in the table.
*
**********/
char
Default_IUPAC_Trans(base)
char base;
{
    char lowered = base | 32;
    int code = 0;

    /* Two letters handled outside the table. */
    switch (lowered)
    {
    case 'u':
        return (char) 8;
    case 'p':
        return (char) 5;
    default:
        break;
    }

    while (code < 16)
    {
        if (Default_DNA_Trans[code] == lowered)
            return (char) code;
        code++;
    }

    fprintf(stderr, "Character %c is not IUPAC coded.\n", base);
    return -1;
}
char *uniqueID();
/***********
*
* MakeConsensus() takes an array of aligned sequences and an
* initialized 'Sequence' consensus. It modifies the consensus.
*
* Every selected sequence is passed through SeqNormal(). For each
* column between the smallest offset and the largest offset+seqlen,
* the codes of all sequences covering it are OR-ed together and the
* matching IUPAC letter is inserted with InsertElems(). A column
* nobody covers becomes '-'. A character Default_IUPAC_Trans() does
* not recognise makes its column 'n'.
*
* The letter is upper case only when the column resolves to a single
* base (code 1, 2, 4 or 8), holds no '-', and the first character
* seen in it is upper case.
*
* The memory that 'consensus' has allocated will be reused, and
* consensus->seqmaxlen will be modified if necessary.
*
* group: group number to use (if zero, use all groups).
*
* Returns TRUE if successful, FALSE otherwise (no input sequences,
* or InsertElems() failed).
*
**********/
int
MakeConsensus(aligned, numOfAligned, consensus, group)
Sequence aligned[]; /* input. */
int numOfAligned; /* input. */
Sequence *consensus; /* input and output. */
int group; /* Group number (if zero, use all groups) */
{
    char occurence;
    int i, j, index;
    int max_cons = INT_MIN;
    int min_offset = INT_MAX;
    char temp_str[2];
    unsigned char case_bit;

    /* aligned[0] is consulted below, so there must be one. */
    if (numOfAligned <= 0)
        return FALSE;

    /*
     * Search for the minimum offset.
     */
    for (i = 0; i < numOfAligned; i++)
    {
        if (group == 0 || aligned[i].group_number == group)
        {
            SeqNormal(&aligned[i]);
            min_offset = MIN(min_offset, aligned[i].offset);
            max_cons = MAX(max_cons, aligned[i].offset + aligned[i].seqlen);
        }
    }
    /*
     * Decide consensus base by base.
     */
    CleanRecord(consensus);
    consensus->offset = min_offset;
    /* NOTE(review): these copies assume consensus->name can hold the
     * contig plus "." or "cons." plus the name - confirm field sizes. */
    if (aligned[0].contig[0] != '\0')
    {
        strcpy(consensus->name, aligned[0].contig);
        strcat(consensus->name, ".");
    }
    else if (strncmp(aligned[0].name, "cons.", 5) != 0)
    {
        strcpy(consensus->name, "cons.");
        strcat(consensus->name, aligned[0].name);
    }
    strcpy(consensus->sequence_ID, uniqueID());
    strcpy(consensus->contig, aligned[0].contig);
    temp_str[1] = '\0';
    for (j = min_offset; j < max_cons; j++)
    {
        occurence = 00;
        case_bit = 0;
        for (i = 0; i < numOfAligned; i++)
        {
            if (group != 0 && aligned[i].group_number != group)
                continue;
            if (j < aligned[i].offset ||
                j >= aligned[i].offset + aligned[i].seqlen)
                continue;
            index = j - aligned[i].offset;
            if (aligned[i].c_elem[index] == '-')
                case_bit = 32;
            else if (case_bit == 0)
                case_bit |= (aligned[i].c_elem[index] & 32);
            /* An unrecognised character comes back as -1; masking
             * turns it into 15 ('n') and keeps the code usable as a
             * table index. */
            occurence = occurence |
                (Default_IUPAC_Trans(aligned[i].c_elem[index]) & 0x0F);
            if (occurence != 1 && occurence != 2 &&
                occurence != 4 && occurence != 8)
                case_bit = 32;
        }
        temp_str[0] = Default_DNA_Trans[occurence & 0x0F];
        if (case_bit == 0)
            temp_str[0] = toupper((unsigned char)temp_str[0]);
        if (InsertElems(consensus, j, temp_str) == FALSE)
            return FALSE;
    }
    return TRUE;
}
/***********
*
* MakeScore() takes an array of aligned sequences and writes one
* score character per alignment column into 'consensus'. Memory for
* (Sequence *consensus) should be allocated before it is passed to
* this function.
*
* Every selected sequence is passed through SeqNormal(). For each
* column, a/c/g/t (case-insensitive) are counted separately and
* 'n'/'-' together. With max_occ the largest single-base count and
* tot the number of sequences covering the column, the score is
*   'F'             if some 'n'/'-' are present and they are the only
*                   entries that differ from the majority base;
*   max_occ*14/tot  otherwise, as one hex digit '0'..'E'.
* A column no selected sequence covers scores '0'.
*
* group: group number to use (if zero, use all groups).
*
* Returns TRUE if successful, FALSE otherwise (no input sequences,
* or InsertElems() failed).
*
**********/
int
MakeScore(aligned, numOfAligned, consensus, group)
Sequence aligned[]; /* input. */
int numOfAligned; /* input. */
Sequence *consensus; /* input and output. */
int group;
{
    int i, j, index, score;
    int max_cons = INT_MIN;
    int min_offset = INT_MAX;
    int As, Cs, Ts, Gs, Ns, tot_in_grp;
    char temp_str[2], base;
    int max_occ;
    static char map[17] = "0123456789ABCDEF";

    /* aligned[0] is consulted below, so there must be one. */
    if (numOfAligned <= 0)
        return FALSE;

    /*
     * Search for the minimum offset.
     */
    for (i = 0; i < numOfAligned; i++)
    {
        if (group == 0 || aligned[i].group_number == group)
        {
            SeqNormal(&aligned[i]);
            min_offset = MIN(min_offset, aligned[i].offset);
            max_cons = MAX(max_cons, aligned[i].offset + aligned[i].seqlen);
        }
    }
    /*
     * Score the alignment column by column.
     */
    CleanRecord(consensus);
    consensus->offset = min_offset;
    /* NOTE(review): these copies assume consensus->name can hold the
     * contig plus "." or "cons." plus the name - confirm field sizes. */
    if (aligned[0].contig[0] != '\0')
    {
        strcpy(consensus->name, aligned[0].contig);
        strcat(consensus->name, ".");
    }
    else if (strncmp(aligned[0].name, "cons.", 5) != 0)
    {
        strcpy(consensus->name, "cons.");
        strcat(consensus->name, aligned[0].name);
    }
    strcpy(consensus->sequence_ID, uniqueID());
    strcpy(consensus->contig, aligned[0].contig);
    temp_str[1] = '\0';
    for (j = min_offset; j < max_cons; j++)
    {
        As = Cs = Ts = Gs = Ns = 0;
        tot_in_grp = 0;
        for (i = 0; i < numOfAligned; i++)
        {
            if (group != 0 && aligned[i].group_number != group)
                continue;
            if (j < aligned[i].offset ||
                j >= aligned[i].offset + aligned[i].seqlen)
                continue;
            tot_in_grp++;
            index = j - aligned[i].offset;
            base = (aligned[i].c_elem[index] | 32);
            if (base == 'a')
                As++;
            else if (base == 'c')
                Cs++;
            else if (base == 'g')
                Gs++;
            else if (base == 't')
                Ts++;
            else if (base == 'n' || base == '-')
                Ns++;
        }
        max_occ = MAX(As, MAX(Cs, MAX(Gs, Ts)));
        /* score = [0,E], F: all mismatches are either 'n' or '-' */
        if (tot_in_grp == 0)
            score = 0;          /* uncovered column: avoid dividing by 0 */
        else if (Ns != 0 && max_occ + Ns == tot_in_grp)
            score = 15;
        else
            score = max_occ * 14 / tot_in_grp;
        temp_str[0] = map[score];
        if (InsertElems(consensus, j, temp_str) == FALSE)
            return FALSE;
    }
    return TRUE;
}
/***********
*
* MakePhyloMask() takes an array of aligned sequence, and generates
* a mask that has a '0' for all columns except the columns which contain
* a, c, g, t and u only.
*
* Returns TRUE if successful, FALSE otherwise.
*
**********/
int
MakePhyloMask(aligned, numOfAligned, consensus, group, acgtu)
Sequence aligned[]; /* input. */
int numOfAligned; /* input. */
Sequence *consensus; /* input and output. */
int acgtu[];
int group;
{
int i, j, cnt, max_cons = INT_MIN, min_offset = INT_MAX;
/*
* Search for the minimum offset.
*/
for (i=0; i<numOfAligned; i++)
{
if(group == 0 || aligned[i].group_number == group)
{
SeqNormal(&aligned[i]);
min_offset = MIN(min_offset, aligned[i].offset);
max_cons = MAX(max_cons, aligned[i].offset+aligned[i].seqlen);
}
}
/*
* Decide consensus base by base.
*/
CleanRecord(consensus);
consensus->offset = min_offset;
strcpy(consensus->name, "mask");
strcpy(consensus->type, "MASK");
strcpy(consensus->sequence_ID, uniqueID());
strcpy(consensus->contig, aligned[0].contig);
consensus->seqlen = max_cons - min_offset;
if(consensus->seqmaxlen == 0)
{
consensus->c_elem = (char *)Calloc(max_cons - min_offset+5, 1);
consensus->seqmaxlen = max_cons - min_offset + 5;
}
else if(consensus->seqmaxlen < max_cons - min_offset)
{
consensus->seqmaxlen = max_cons - min_offset + 5;
consensus->c_elem = (char *)Realloc(consensus->c_elem,
max_cons - min_offset + 5);
}
cnt = 0;
for(j=min_offset; j<max_cons; j++)
{
consensus->c_elem[j-min_offset] = '1';
for(i=0; i<numOfAligned; i++)
{
if(group == 0 || aligned[i].group_number == group)
{
if (j < aligned[i].offset ||
j >= aligned[i].offset+aligned[i].seqlen ||
acgtu[aligned[i].c_elem[j-aligned[i].offset]] == 0)
{
consensus->c_elem[j-min_offset] = '0';
cnt++;
break;
}
}
}
}
fprintf(stderr, "\nNumber of 1s in mask: %d\n", max_cons-min_offset-cnt);
fprintf(stderr, "Number of 0s in mask: %d\n\n", cnt);
return TRUE;
}
/***********
*
* MajorityCons() takes an array of aligned sequences and generates
* a MAJORITY consensus.
* Note, memory for (Sequence *consensus) should be
* allocated before it is passed to this function.
*
* Every selected sequence is passed through SeqNormal(). For each
* column, the most frequent code wins if its share of the sequences
* covering the column (in percent, integer division) is at least
* major_perc; otherwise the IUPAC letter for the OR of all codes in
* the column is used. A column nobody covers becomes '-'. A
* character Default_IUPAC_Trans() does not recognise counts as 'n'.
*
* group: group number to use (if zero, use all groups).
* major_perc: majority threshold in percent.
*
* Returns TRUE if successful, FALSE otherwise (no input sequences,
* or InsertElems() failed).
*
**********/
int
MajorityCons(aligned, numOfAligned, consensus, group, major_perc)
Sequence aligned[]; /* input. */
int numOfAligned; /* input. */
Sequence *consensus; /* input and output. */
int group, major_perc;
{
    int i, j, index, ii, base, max, code;
    int max_cons = INT_MIN;
    int min_offset = INT_MAX;
    char temp_str[2], occurence;
    int cnts[16];       /* per-code tally for the current column */
    int tot_in_grp;
    unsigned char case_bit;

    /* aligned[0] is consulted below, so there must be one. */
    if (numOfAligned <= 0)
        return FALSE;

    /*
     * Search for the minimum offset.
     */
    for (i = 0; i < numOfAligned; i++)
    {
        if (group == 0 || aligned[i].group_number == group)
        {
            SeqNormal(&aligned[i]);
            min_offset = MIN(min_offset, aligned[i].offset);
            max_cons = MAX(max_cons, aligned[i].offset + aligned[i].seqlen);
        }
    }
    /*
     * Decide consensus base by base.
     */
    CleanRecord(consensus);
    consensus->offset = min_offset;
    /* NOTE(review): these copies assume consensus->name can hold the
     * contig plus "." or "cons." plus the name - confirm field sizes. */
    if (aligned[0].contig[0] != '\0')
    {
        strcpy(consensus->name, aligned[0].contig);
        strcat(consensus->name, ".");
    }
    else if (strncmp(aligned[0].name, "cons.", 5) != 0)
    {
        strcpy(consensus->name, "cons.");
        strcat(consensus->name, aligned[0].name);
    }
    strcpy(consensus->sequence_ID, uniqueID());
    strcpy(consensus->contig, aligned[0].contig);
    temp_str[1] = '\0';
    for (j = min_offset; j < max_cons; j++)
    {
        case_bit = 0;
        occurence = 00;
        tot_in_grp = 0;
        for (ii = 0; ii < 16; ii++)
            cnts[ii] = 0;
        for (i = 0; i < numOfAligned; i++)
        {
            if (group != 0 && aligned[i].group_number != group)
                continue;
            if (j < aligned[i].offset ||
                j >= aligned[i].offset + aligned[i].seqlen)
                continue;
            tot_in_grp++;
            index = j - aligned[i].offset;
            if (aligned[i].c_elem[index] == '-')
                case_bit = 32;
            else if (case_bit == 0)
                case_bit |= (aligned[i].c_elem[index] & 32);
            /* An unrecognised character comes back as -1; masking
             * turns it into 15 ('n') and keeps the code usable as an
             * index into cnts[]. */
            code = Default_IUPAC_Trans(aligned[i].c_elem[index]) & 0x0F;
            occurence |= (char)code;
            cnts[code]++;
            if (case_bit == 0 &&
                occurence != 1 && occurence != 2 &&
                occurence != 4 && occurence != 8)
                case_bit = 32;
        }
        /* Most frequent code; the lowest code wins a tie. */
        max = 0;
        base = 0;
        for (ii = 0; ii < 16; ii++)
        {
            if (cnts[ii] > max)
            {
                max = cnts[ii];
                base = ii;
            }
        }
        if (tot_in_grp > 0 && max * 100 / tot_in_grp >= major_perc)
        {
            /* follow the majority rule. */
            temp_str[0] = Default_DNA_Trans[base];
        }
        else
        {
            /* use IUPAC code ('-' for an uncovered column). */
            temp_str[0] = Default_DNA_Trans[occurence & 0x0F];
        }
        if (case_bit == 0)
            temp_str[0] = toupper((unsigned char)temp_str[0]);
        if (InsertElems(consensus, j, temp_str) == FALSE)
            return FALSE;
    }
    return TRUE;
}
/***********
*
* ReadGDEtoHGL() reads a GDE formatted file into a newly allocated
* array of HGL records, returned through *tSeq_arr.
*
* A line starting with '#' opens a new sequence; the rest of the line
* is its sequence_ID. A trailing '<' or '>' on that line is removed
* and sets the direction. All other non-empty lines are appended to
* the current sequence. Leading '-' characters seen before any
* sequence data has been stored are counted into the offset instead
* of being stored. Trailing blanks, tabs and newlines are dropped
* from every line.
*
* Return -1 if sequence data appears before the first '#' line,
* number_of_sequence otherwise.
*
***********/
int
ReadGDEtoHGL(fp, tSeq_arr)
FILE *fp;
Sequence **tSeq_arr;
{
    char line[MAXLINELEN];
    int ptr, num_seq, max_num_seq = 20;
    int needed, new_max;
    char *newline;
    Sequence *cur;

    (*tSeq_arr) = (Sequence *)Calloc(max_num_seq, sizeof(Sequence));
    num_seq = -1;
    while (fgets(line, MAXLINELEN - 2, fp) != NULL) /* spaces for \n\0 */
    {
        /* ptr points to the last char. */
        ptr = strlen(line) - 1;
        /* clear up the tail. */
        while (ptr >= 0 && (line[ptr] == '\n' ||
                            line[ptr] == ' ' ||
                            line[ptr] == '\t'))
            ptr--;
        line[ptr + 1] = '\0';

        if (ptr < 0)
        {
            /* it is an empty line. */
            continue;
        }

        if (line[0] == '#')
        {
            /* Header line: start a new record, growing the array. */
            if (++num_seq == max_num_seq)
            {
                max_num_seq *= 2;
                (*tSeq_arr) = (Sequence *)Realloc((*tSeq_arr),
                        max_num_seq * sizeof(Sequence));
            }
            cur = &(*tSeq_arr)[num_seq];
            InitRecord(cur);
            /* NOTE(review): elsewhere in this file direction is
             * documented as 1/-1/0, yet 3to5 is stored as 2 here -
             * confirm before changing. */
            if (line[ptr] == '<')
            {
                cur->direction = 2; /* 3to5 */
                line[ptr] = '\0';
            }
            else if (line[ptr] == '>')
            {
                cur->direction = 1; /* 5to3 */
                line[ptr] = '\0';
            }
            /* NOTE(review): assumes the ID fits in sequence_ID -
             * confirm the field size. */
            strcpy(cur->sequence_ID, line + 1);
            continue;
        }

        /* Sequence data needs a record to go into. */
        if (num_seq < 0)
        {
            fprintf(stderr,
                    "ReadGDEtoHGL(): sequence data before first header.\n");
            return -1;
        }
        cur = &(*tSeq_arr)[num_seq];

        ptr = 0;
        if (cur->seqlen == 0)
        {
            /* determine the offset. */
            while (line[ptr] == '-')
                ptr++;
            cur->offset += ptr;
        }
        if (line[ptr] != '\0')
        {
            newline = line + ptr;
            needed = cur->seqlen + (int)strlen(newline) + 1;
            if (cur->seqmaxlen == 0)
            {
                /* First chunk: start at 200 bytes, more if the line
                 * needs it. */
                new_max = 200;
                while (new_max < needed)
                    new_max *= 2;
                cur->c_elem = (char *)Calloc(new_max, 1);
                cur->c_elem[0] = '\0';
                cur->seqmaxlen = new_max;
            }
            else if (cur->seqmaxlen < needed)
            {
                new_max = cur->seqmaxlen;
                while (new_max < needed)
                    new_max *= 2;
                cur->c_elem = (char *)Realloc(cur->c_elem, new_max);
                cur->seqmaxlen = new_max;
            }
            strcat(cur->c_elem, newline);
            cur->seqlen = strlen(cur->c_elem);
        }
    }
    return (num_seq + 1);
}
/********
*
*   InsertElems() inserts the null terminated string c into seq so that
*   c starts at position pos.  Positions are relative to the master
*   consensus, i.e. the element at index k of seq->c_elem is at position
*   seq->offset + k.  Existing elements at or after pos are always moved
*   to the right.
*
*   - empty seq          : seq becomes a copy of c with offset pos.
*   - pos left of seq    : c is placed in front, the gap up to the old
*                          start is filled with '-', offset becomes pos.
*   - pos right of seq   : the gap after the old end is filled with '-'
*                          and c is appended.
*   - pos inside seq     : the tail of seq is shifted right by strlen(c).
*
*   The '-' trimming that earlier versions did on c is disabled; c is
*   stored exactly as given.
*
*   Returns TRUE if successful, FALSE otherwise.
*
********/
int
InsertElems(seq,pos,c)
Sequence *seq;     /* Sequence */
int pos;           /* Position (in respect to the master consensus)
                    * to insert BEFORE
                    * always move string to the right. */
char c[];          /* Null terminated array of elements to insert */
{
    int j, len, gap, at;

    len = strlen(c);

    if(seq->seqlen == 0)
    {
        if(seq->seqmaxlen == 0)
        {
            seq->c_elem = (char *)Calloc(len+1, 1);
            seq->seqmaxlen = len + 1;
        }
        else if(len+1 >= seq->seqmaxlen)
        {
            seq->c_elem = (char *)Realloc(seq->c_elem, len+1);
            seq->seqmaxlen = len+1;
        }
        strcpy(seq->c_elem, c);
        seq->seqlen = len;
        seq->offset = pos;
        return TRUE;
    }

    if(seq->seqlen > seq->seqmaxlen)
    {
        fprintf(stderr,
            "InsertElems(): seqlen>seqmaxlen.  Something is wrong.\n");
        return FALSE;
    }

    /* to make sure there is a space for '\0'. */
    while(seq->seqlen+1 >= seq->seqmaxlen)
    {
        seq->seqmaxlen *= 2;
        seq->c_elem = (char *)Realloc(seq->c_elem, seq->seqmaxlen);
    }
    seq->c_elem[seq->seqlen] = '\0';

    if(pos < seq->offset)        /* insert to the left of the seq. */
    {
        gap = seq->offset - pos;

        /* ">=": the highest index written is seqlen+len+gap ('\0'). */
        if(seq->seqlen + len + gap >= seq->seqmaxlen)
        {
            seq->seqmaxlen = seq->seqlen + len + gap + 256;
            seq->c_elem = (char *)Realloc(seq->c_elem, seq->seqmaxlen);
        }

        /* copy the old string including the last '\0'. */
        for(j=seq->seqlen; j>=0; j--)
            seq->c_elem[j+len+gap] = seq->c_elem[j];

        /* insert dashes. */
        for(j=len; j<len+gap; j++)
            seq->c_elem[j] = '-';

        /* copy the inserted string. */
        for(j=0; j<len; j++)
            seq->c_elem[j] = c[j];

        seq->seqlen = strlen(seq->c_elem);
        seq->offset = pos;
    }
    else if(pos - seq->offset >= seq->seqlen)  /* insert to the right. */
    {
        at = pos - seq->offset;     /* index where c starts. */

        /* ">=": the '\0' is written at index at+len. */
        if(at + len >= seq->seqmaxlen)
        {
            seq->seqmaxlen = at + len + 256;
            seq->c_elem = (char *)Realloc(seq->c_elem, seq->seqmaxlen);
        }

        /* insert dashes. */
        for(j=seq->seqlen; j<at; j++)
            seq->c_elem[j] = '-';

        /* copy the inserted string. */
        for(j=0; j<len; j++)
            seq->c_elem[at + j] = c[j];
        seq->c_elem[at+len] = '\0';

        seq->seqlen = strlen(seq->c_elem);
    }
    else                          /* insert into the seq. */
    {
        at = pos - seq->offset;

        /* ">=": the '\0' moves to index seqlen+len. */
        if(seq->seqlen + len >= seq->seqmaxlen)
        {
            seq->seqmaxlen = seq->seqlen + len + 256;
            seq->c_elem = (char *)Realloc(seq->c_elem, seq->seqmaxlen);
        }

        /* move the bottom part of the older string including the
         * last '\0'. */
        for(j=seq->seqlen; j>=at; j--)
            seq->c_elem[j+len] = seq->c_elem[j];

        /* copy the inserted string. */
        for(j=0; j<len; j++)
            seq->c_elem[at + j] = c[j];

        seq->seqlen = strlen(seq->c_elem);
    }
    return TRUE;
}
/******************************************************************
*
*   int GetArgs(argArray, numArgs, argc, argv)
*   Args *argArray;
*   int numArgs;
*   int argc;
*   char **argv;
*
*   Fills argArray from the command line.  Every argv[i] (i >= 1) must
*   have the form -<tag><value>; the tag is matched case-insensitively
*   against argArray[].tag and <value> is copied to that entry's
*   strvalue.  A value enclosed in single quotes has the quotes removed.
*
*   If at least one entry is required (optional == 'F' and no default
*   in strvalue) and no argument is given at all, the usage is printed.
*
*   Return TRUE if successful, FALSE otherwise.
*
******************************************************************/
#define MAX_ARGS	50  /* maximum args this can process */
int
GetArgs(argArray, numArgs, argc, argv)
Args *argArray;
int numArgs;
int argc;
char **argv;
{
    int i, j, len;
    Args *curarg;
    int noArgOK = TRUE;

    if ((argArray == NULL) || (numArgs == 0) || (numArgs > MAX_ARGS))
    {
        fprintf(stderr, "GetArgs: Invalid number of args.\n");
        return FALSE;
    }

    /*
     * Test if all are either 'default' or 'optional'.
     */
    curarg = argArray;
    for (i=0; i<numArgs; i++, curarg++)
    {
        if(curarg->strvalue[0] == '\0' && curarg->optional == 'F')
        {
            noArgOK = FALSE;
            break;
        }
    }

    /*
     * show usage if some arg is required but no arg is
     * supplied on command line.
     */
    if(noArgOK == FALSE && argc == 1)
    {
        fprintf(stderr, "\n%s arguments:\n\n", argv[0]);
        curarg = argArray;
        for (i = 0; i < numArgs; i++, curarg++)
        {
            fprintf(stderr, " -%c %s ", curarg->tag, curarg->prompt);
            if (curarg->optional == 'T')
                fprintf(stderr, " [Optional]");
            fprintf(stderr, "\n");
            if (curarg->strvalue[0] != '\0')
                fprintf(stderr, " default = %s\n", curarg->strvalue);
        }
        fprintf(stderr, "\n");
        return FALSE;
    }

    /*
     * Process
     */
    for (i = 1; i < argc; i++)
    {
        if (argv[i][0] != '-')
        {
            fprintf(stderr, "Arguments must start with -");
            return FALSE;
        }

        /* A lone "-" has no tag; argv[i]+2 would lie behind its '\0'. */
        if (argv[i][1] == '\0')
        {
            fprintf(stderr, "Invalid argument tag in %s\n", argv[i]);
            return FALSE;
        }

        /* check the tag ("|32" folds ASCII letters to lower case). */
        curarg = argArray;
        for (j = 0; j < numArgs; j++, curarg++)
        {
            if ((argv[i][1]|32) == (curarg->tag|32))
                break;
        }
        if (j == numArgs)
        {
            fprintf(stderr, "Invalid argument tag in %s\n", argv[i]);
            return FALSE;
        }

        strcpy(curarg->strvalue, argv[i]+2);

        /* Strip one pair of enclosing single quotes, in place.  At
         * least two characters are needed for a pair. */
        len = strlen(curarg->strvalue);
        if(len >= 2
           && curarg->strvalue[0] == '\''
           && curarg->strvalue[len-1] == '\'')
        {
            for (j = 1; j < len-1; j++)
                curarg->strvalue[j-1] = curarg->strvalue[j];
            curarg->strvalue[len-2] = '\0';
        }
    }
    return TRUE;
}
/*********
*
*   GetCond interprets the -c argument, the condition.  The expected
*   form is <field><op><value> where <op> is one of
*   <  >  <=  >=  ==  !=  and <value> may be enclosed in double quotes.
*
*   The condition will be set to NULL if no condition is specified
*   (arg is "null"): if you pass '&p' as the address of a str_cond
*   pointer, p will be NULL afterwards.
*
*   On success *cond points to a Calloc-ed str_cond whose field, symbol
*   and (Calloc-ed) value are filled in.  On failure a message is
*   printed, the str_cond is released and *cond is set to NULL.
*
*   Note: when the value is double-quoted, the closing quote in arg is
*   overwritten with '\0', i.e. arg is modified.
*
*   NOTE(review): the field is selected by the first at[] entry that is
*   a prefix of arg, while the operator is looked for after the leading
*   run of letters and '-'.  Field names containing other characters
*   therefore cannot be used in a condition.
*
*   Return TRUE if successful, FALSE otherwise.
*
*********/
int
GetCond(arg, cond)
char *arg;
str_cond **cond;
{
    int start, end, i, found, sym_len;
    int sym_ok;

    if ( strcmp(arg, "null")==0)
    {
        (*cond) = NULL;
        return TRUE;
    }

    (*cond) = (str_cond *)Calloc(1, sizeof(str_cond));

    /* find the end of the field name. */
    end = 0;
    while (('a'<= arg[end] && arg[end]<='z') ||
           ('A'<= arg[end] && arg[end]<='Z') ||
           arg[end] == '-' )
        end++;

    found = FALSE;
    for (i=0; i<NUM_OF_FIELDS; i++)
    {
        if (strncmp(arg, at[i], strlen(at[i]))==0 )
        {
            (*cond)->field = i;      /* condition on field at[i]. */
            found = TRUE;
            break;
        }
    }
    if (found == FALSE)
    {
        /* "%.*s" prints the name without a fixed-size scratch buffer. */
        fprintf(stderr, "Field %.*s not found.\n", end, arg);
        Cfree(*cond);
        (*cond) = NULL;
        return FALSE;
    }

    /*
     * The operator: the character right after the field name plus any
     * following run of '=', '!', '>', '<'.  Nothing is consumed when the
     * argument ends right after the field name.
     */
    start = end;
    if (arg[end] != '\0')
    {
        end++;
        while (arg[end] == '=' ||
               arg[end] == '!' ||
               arg[end] == '>' ||
               arg[end] == '<' )
            end++;
    }
    sym_len = end - start;

    /* Check the length BEFORE copying so that symbol cannot overflow. */
    sym_ok = FALSE;
    if (sym_len == 1)
    {
        sym_ok = (arg[start] == '>' || arg[start] == '<');
    }
    else if (sym_len == 2)
    {
        sym_ok = (strncmp(arg+start, "!=", 2) == 0 ||
                  strncmp(arg+start, "==", 2) == 0 ||
                  strncmp(arg+start, ">=", 2) == 0 ||
                  strncmp(arg+start, "<=", 2) == 0);
    }
    if (!sym_ok)
    {
        fprintf(stderr, "Invalid condition.\n");
        Cfree(*cond);
        (*cond) = NULL;
        return FALSE;
    }
    strncpy((*cond)->symbol, arg+start, sym_len);
    (*cond)->symbol[sym_len] = '\0';

    /* The value: everything after the operator, without the quotes. */
    if(arg[end] == '"' && arg[strlen(arg) - 1] == '"')
    {
        end++;
        arg[strlen(arg) - 1] = '\0';
    }
    (*cond)->value = (char *)Calloc(strlen(arg) - end + 2, 1);
    strcpy((*cond)->value, arg+end);

    return TRUE;
}
/*********
*
*   GetFields interprets the -f argument, the fields list.
*
*   arg            : either "all" or a comma separated list of field
*                    names as found in at[].
*   selected_fields: receives the at[] index of every listed field, in
*                    the order given.  For "all" only
*                    selected_fields[0] is set, to -1.
*
*   A list entry must be exactly equal to a field name.
*
*   Returns number of selected fields (NUM_OF_FIELDS for "all"),
*   0 if anything is wrong.
*
*********/
int
GetFields(arg, selected_fields)
char *arg;
int selected_fields[];
{
    int start, end, i, found, list_done, i_selected, tok_len;

    if ( strcmp(arg, "all") == 0 )
    {
        selected_fields[0] = -1;
        return NUM_OF_FIELDS;
    }

    start = end = 0;
    list_done = FALSE;
    i_selected = 0;
    while ( list_done == FALSE )
    {
        /* the current entry is arg[start .. end-1]. */
        while (arg[end] != '\0' && arg[end] != ',')
        {
            end++ ;
        }
        if (arg[end] == '\0')
        {
            list_done = TRUE;
        }
        tok_len = end - start;

        found = FALSE;
        for (i=0; i<NUM_OF_FIELDS; i++)
        {
            /* Compare the whole entry, not just a prefix of it, so
             * that e.g. "namefoo" does not select "name". */
            if ((int)strlen(at[i]) == tok_len &&
                strncmp(arg+start, at[i], tok_len) == 0)
            {
                selected_fields[i_selected++] = i;
                found = TRUE;
                break;
            }
        }
        if (found == FALSE)
        {
            /* "%.*s" prints the entry without a fixed-size buffer. */
            fprintf(stderr, "Field %.*s not found.\n",
                    tok_len, arg+start);
            return 0;
        }

        /* step over the ','. */
        end++;
        start = end;
    }
    return i_selected;
}
/* The 16 unambiguous DNA base pairs ... */
static char *pairs[] = {"aa","ac","ag","at",
                        "ca","cc","cg","ct",
                        "ga","gc","gg","gt",
                        "ta","tc","tg","tt" };
/* ... and the stacking temperature of each, in the same order. */
static int stemp[16] = {55, 98, 58, 57,
                        55, 86, 73, 58,
                        87, 136, 86, 98,
                        37, 87, 55, 55 };
/*******
*
*   MST() returns Mean Stacking Temperature for the given sequence:
*   the average of stemp[] over all pairs of neighbouring bases.
*
*   '-' elements are skipped (the bases on both sides of a gap count as
*   neighbours), case is ignored and 'u' is treated as 't'.  A pair
*   that contains any other character is ignored.
*
*   Returns -1 if the sequence holds no unambiguous pair at all.
*
*******/
float
MST(c_elem)
const char *c_elem;
{
    int i, j;
    int tot_stemp = 0, non_amb_pairs = 0;
    int have_prev = 0;      /* pair[0] holds a base already? */
    char pair[2];           /* previous and current base. */
    char base;

    for(i = 0; c_elem[i] != '\0'; i++)
    {
        if(c_elem[i] == '-')
            continue;

        base = c_elem[i]|32;
        if(base == 'u')
            base = 't';

        if(have_prev)
        {
            pair[1] = base;
            j = 0;
            while(j<16 && strncmp(pair, pairs[j], 2) != 0)
            {
                j++;
            }
            /* ignore the pairing of an ambiguous base. */
            if(j!=16)
            {
                tot_stemp += stemp[j];
                non_amb_pairs++;
            }
        }
        pair[0] = base;
        have_prev = 1;
    }

    /* nothing to average: report it instead of dividing 0 by 0. */
    if(non_amb_pairs == 0)
        return -1;

    return ((float)tot_stemp/(float)non_amb_pairs);
}
/********
*
*   SubStr() copies into ss the part of string that begins at index
*   'start' and is at most 'length' chars long, terminates ss with '\0'
*   and returns TRUE.  ss must have room for length+1 chars.
*
*   If start does not lie inside string, a message is printed, ss is
*   set to be empty and FALSE is returned.
*
********/
int
SubStr(string, start, length, ss)
const char *string;
int start, length;
char *ss;
{
    const char *from;
    char *to;
    int copied;

    if(start < 0 || strlen(string) <= (size_t)start)
    {
        fprintf(stderr, "SubStr(): starting point is beyond the boundary.\n");
        *ss = '\0';
        return FALSE;
    }

    from = string + start;
    to = ss;
    copied = 0;
    while(copied < length && *from != '\0')
    {
        *to++ = *from++;
        copied++;
    }
    *to = '\0';

    return TRUE;
}
/*******
*
*   FindPattern() counts how often pattern occurs in string.
*   Occurrences may overlap; the comparison is case sensitive.
*
*   Returns the number of appearances, 0 for a NULL or empty string.
*
*******/
int
FindPattern(string, pattern)
const char *string;
const char *pattern;
{
    const char *cursor, *last_start;
    int str_len, pat_len;
    int hits = 0;

    if(string == NULL)
        return 0;
    str_len = strlen(string);
    if(str_len == 0)
        return 0;

    pat_len = strlen(pattern);
    if(pat_len > str_len)
        return 0;

    /* last position at which the whole pattern still fits. */
    last_start = string + (str_len - pat_len);
    for(cursor = string; cursor <= last_start; cursor++)
    {
        if(strncmp(cursor, pattern, pat_len) == 0)
            hits++;
    }
    return hits;
}
/*******
*
*   FindPattern2(), same as FindPattern(), but counts only those
*   appearances that do not overlap the pattern-sized window that
*   starts at orig_loc, i.e. whose start is at least strlen(pattern)
*   away from orig_loc.
*
*******/
int
FindPattern2(string, pattern, orig_loc)
const char *string;
const char *pattern;
int orig_loc;
{
    int pos, distance, str_len, pat_len;
    int hits = 0;

    if(string == NULL)
        return 0;
    if((str_len = strlen(string)) == 0)
        return 0;

    pat_len = strlen(pattern);
    pos = 0;
    while(pos <= str_len - pat_len)
    {
        distance = abs(pos - orig_loc);
        if(distance >= pat_len)
        {
            if(strncmp(string + pos, pattern, pat_len) == 0)
                hits++;
        }
        pos++;
    }
    return hits;
}
/*******
*
*   FindPatternNC() counts how often pattern occurs in string,
*   CASE INSENSITIVE: two characters are taken as equal when they are
*   equal after setting bit 32 (which folds ASCII upper case letters
*   to lower case).  Occurrences may overlap.
*
*   Returns the number of appearances, 0 for a NULL or empty string.
*
*******/
int
FindPatternNC(string, pattern)
const char *string;
const char *pattern;
{
    int begin, k, str_len, pat_len;
    int hits = 0;

    if(string == NULL)
        return 0;
    if((str_len = strlen(string)) == 0)
        return 0;

    pat_len = strlen(pattern);
    for(begin = 0; begin + pat_len <= str_len; begin++)
    {
        for(k = 0; k < pat_len; k++)
        {
            if((string[begin+k]|32) != (pattern[k]|32))
                break;
        }
        if(k == pat_len)
            hits++;
    }
    return hits;
}
/*******
*
* Complementary() CHANGES the given DNA/RNA string to its complementary,
* and returns TRUE. Returns FALSE if anything is wrong and keep the
* given string unchanged.
*
*******/
int
Complementary(sequence, type)
char *sequence;
char type;
{
int i, l;
char *temp_str;
l = strlen(sequence);
temp_str = (char *)Calloc(l+1, sizeof(char));
if( type == 'D' || type == 'd')
type = 0;
else if(type == 'R' || type == 'r')
type = 1;
else
{
fprintf(stderr,
"Complementary(): type unknown. Type is D/d/R/r\n");
return (int) NULL;
}
for(i=0; i<l; i++)
{
switch(sequence[i])
{
case 'A':
temp_str[i] = (type == 0) ? 'T' : 'U';
break;
case 'a':
temp_str[i] = (type == 0) ? 't' : 'u';
break;
case 'C':
temp_str[i] = 'G';
break;
case 'c':
temp_str[i] = 'g';
break;
case 'G':
temp_str[i] = 'C';
break;
case 'g':
temp_str[i] = 'c';
break;
case 'T':
case 'U':
temp_str[i] = 'A';
break;
case 't':
case 'u':
temp_str[i] = 'a';
break;
}
}
temp_str[i] = '\0';
strcpy(sequence, temp_str);
if(temp_str != NULL)
{
Cfree(temp_str);
temp_str = NULL;
}
return TRUE;
}
/********
*
*   KnownSeq() returns an integer which is the index of the first
*   occurrence of an ambiguous base in the seq, i.e. of the first
*   character that is none of a, c, g, t, u in either case.
*   -1 if no ambiguous base in the seq.
*
********/
int KnownSeq(seq)
char *seq;
{
    int i;
    char c;

    /* Stop at the terminator; no strlen() call per iteration. */
    for(i=0; seq[i] != '\0'; i++)
    {
        c = seq[i]|32;      /* fold ASCII letters to lower case. */
        if(c != 'a' && c != 't' && c != 'g' && c != 'c' && c != 'u')
            return i;
    }
    return -1;
}
/********
*
*   Reverse() reverses the given string in place and returns TRUE.
*   (NOTE: Reverse() actually changes the string).
*   Strings shorter than two characters are left as they are.
*
********/
int Reverse(seq)
char *seq;
{
    char *head, *tail;
    char held;
    int len;

    len = strlen(seq);
    if(len >= 2)
    {
        /* swap the ends and walk towards the middle. */
        head = seq;
        tail = seq + len - 1;
        while(head < tail)
        {
            held = *head;
            *head = *tail;
            *tail = held;
            head++;
            tail--;
        }
    }
    return TRUE;
}
/********
*
*   GoodOligos() returns a pointer to a NULL terminated array of
*   subsequences of c_elem (in lower case) that do not contain
*   secondary structure, nor self complementary structure.
*   Returns NULL if anything is wrong or if no oligo was found.
*
*   check_len        : length of the pieces used in the secondary
*                      structure check.
*   min_len, max_len : range of oligo lengths to try.
*   l_bnd, r_bnd     : range of start positions; they are relative to
*                      c_elem, so they should be in [0,strlen(c_elem)].
*
*   A candidate is rejected, with a message on stderr, when
*     - it holds an ambiguous base (see KnownSeq()),
*     - the reverse complement of one of its 15-base pieces occurs in
*       it (it may hybridize with itself),
*     - the reverse complement of one of its check_len-base pieces
*       occurs elsewhere in it (see FindPattern2()).
*
*   NOTE(review): the start position is advanced inside the loop over
*   the lengths.  After a candidate of length l at start i is accepted,
*   the next candidate tried is length l+1 at start i+1.
*
*   Note: this program Calloc-s memory for the returned array and for
*   every string in it.  The caller program is responsible of Freeing
*   the memory when not needed.
*
********/
char **
GoodOligos(c_elem, check_len, min_len, max_len, l_bnd, r_bnd)
char *c_elem;
int check_len, min_len, max_len, l_bnd, r_bnd;
{
    int i, l, seq_len, max_num_probe, seq_cnt = 0;
    char **seq_set;
    char *seq, *subseq, *scd_str, *PossibleOligo;
    int BadOligo, PO_len, PO_index;

    /* constant(s): */
    /* length of the pieces used in the self-hybridization check. */
    int no_repeat_len = 15;

    /* With min_len > max_len no length is ever tried and the start
     * position below would never advance. */
    if(max_len < 0 || min_len > max_len)
    {
        fprintf(stderr, "GoodOligos(): invalid length range.\n");
        return NULL;
    }

    seq_len = strlen(c_elem);

    /* A lower case copy of the c_elem. */
    seq = (char *)Calloc(seq_len+1, sizeof(char));

    /* String used to check the PossibleOligo. */
    PossibleOligo = (char *)Calloc(max_len+1, sizeof(char));
    subseq = (char *)Calloc(max_len+1, sizeof(char));
    scd_str= (char *)Calloc(max_len+1, sizeof(char));

    /* The output. A set of possibly good oligos. */
    max_num_probe = 20;
    seq_set = (char **)Calloc(max_num_probe, sizeof(char *));

    for(i=0; i<seq_len; i++)
    {
        seq[i] = c_elem[i]|32;
    }

    i = MAX(l_bnd, 0);
    while(i <= MIN(r_bnd, seq_len - min_len))
    {
        BadOligo = FALSE;
        for(l = min_len;
            BadOligo == FALSE && l <= seq_len - i && l <= max_len;
            l++)
        {
            int uk;

            SubStr(seq, i, l, PossibleOligo);

            /* Any unknown base?  Restart right behind it. */
            if((uk = KnownSeq(PossibleOligo)) != -1)
            {
                fprintf(stderr, "%s has ambiguous base(s)\n", PossibleOligo);
                i += uk+1;
                BadOligo = TRUE;
            }

            PO_len = strlen(PossibleOligo);

            /*
             * To ensure that the probe is not going to hybridize
             * with itself:
             */
            for(PO_index = 0;
                BadOligo==FALSE && PO_index<=PO_len-no_repeat_len;
                PO_index++)
            {
                SubStr(PossibleOligo, PO_index, no_repeat_len, subseq);
                strcpy(scd_str, subseq);
                Complementary(scd_str, 'd');
                Reverse(scd_str);
                if(FindPattern(PossibleOligo, scd_str) > 0)
                {
                    fprintf(stderr,
                        "%s may hybridize with itself: %s vs. %s.\n",
                        PossibleOligo, subseq, scd_str);
                    i++;
                    BadOligo = TRUE;
                }
            }

            /* Secondary structure check on check_len-base pieces. */
            for(PO_index = 0;
                BadOligo == FALSE && PO_index <= PO_len-2*check_len;
                PO_index++)
            {
                SubStr(PossibleOligo, PO_index, check_len, subseq);
                Complementary(subseq, 'd');
                strcpy(scd_str, subseq);
                Reverse(scd_str);
                if(FindPattern2(PossibleOligo,scd_str,PO_index)>0)
                {
                    fprintf(stderr, "%s has 2nd struct %s\n",
                        PossibleOligo, scd_str);
                    i += PO_index+1;
                    BadOligo = TRUE;
                }
            }

            if(BadOligo == FALSE)
            {
                seq_set[seq_cnt] = (char *)
                    Calloc(strlen(PossibleOligo)+1, sizeof(char));
                strcpy(seq_set[seq_cnt], PossibleOligo);
                /* always keep one free slot for the closing NULL. */
                if(++seq_cnt == max_num_probe)
                {
                    max_num_probe *= 2;
                    seq_set = (char **)
                        Realloc(seq_set, max_num_probe*sizeof(char *));
                }
                i++;
            }
        }  /* end of l. */
    }  /* end of i. */

    /* The work buffers are not part of the result. */
    Cfree(seq);
    Cfree(PossibleOligo);
    Cfree(subseq);
    Cfree(scd_str);

    if(seq_cnt == 0)
    {
        Cfree(seq_set);
        return NULL;
    }
    seq_set[seq_cnt] = NULL;
    return seq_set;
}
/* uniqueID() builds an ID of the form <hostname>:<count>:<time>, where
 * <count> is the number of earlier calls in this process and <time> is
 * the value of time().  The ID is at most 31 chars long; if needed the
 * host name is shortened from the left.
 *
 * ALWAYS COPY the result from uniqueID() to a char[32],
 * (strlen(hostname)+1+10).  The result lives in the global vname and
 * is overwritten by the next call.
 *
 * Exits the program if the host name cannot be obtained.
 */
char vname[32];
char *uniqueID()
{
    char hname[32], tstr[64];
    time_t now;
    static int cnt = 0;
    int ll;

    if(gethostname(hname, sizeof(hname)) == -1)
    {
        fprintf(stderr, "UniqueID(): Failed to get host name.\n");
        exit(1);
    }
    /* a truncated host name need not be terminated. */
    hname[sizeof(hname)-1] = '\0';

    now = time((time_t *)NULL);

    /* tstr is large enough for any int and any long. */
    sprintf(tstr, ":%d:%ld", cnt, (long)now);
    if((ll = strlen(tstr)) > 31)
    {
        strncpy(vname, tstr, 31);
        vname[31] = '\0';
    }
    else
    {
        /* keep only as much of the tail of the host name as fits. */
        ll = strlen(hname)-(31-ll);
        if(ll < 0)
            ll = 0;
        sprintf(vname, "%s%s", hname+ll, tstr);
    }
    cnt++;
    return(vname);
}
/* return the percentage of GCcontents: the share of 'g' and 'c'
 * (either case) among all characters of seq, rounded down.
 * Returns 0 for an empty seq.
 */
int GCcontent(seq)
char *seq;
{
    int l, gc=0, j;

    l = strlen(seq);
    if(l == 0)
        return 0;       /* nothing to count; do not divide by zero. */

    for (j=0; j<l; j++)
    {
        if((seq[j]|32) == 'g' || (seq[j]|32) == 'c')
        {
            gc++;
        }
    }
    return ((int) (gc*100/l));
}
/******
*
*   HGLtoIQ() outputs a HGL format record to an ASCII file with
*   the Input-Queue format, the format for the synthesizer:
*   one line holding the comments and the sequence of the record,
*   separated by a blank.  A missing (NULL) comments or sequence is
*   written as an empty string.
*
*   fname is created or overwritten.  The program exits if it cannot
*   be opened.
*
******/
void HGLtoIQ(fname, tSeq)
const char *fname;
Sequence *tSeq;
{
    FILE *fp;

    if((fp = fopen(fname, "w")) == NULL)
    {
        fprintf(stderr, "Can't open IQ file: %s\n", fname);
        exit(1);
    }

    fprintf(fp, "%s %s\n",
            tSeq->comments != NULL ? tSeq->comments : "",
            tSeq->c_elem   != NULL ? tSeq->c_elem   : "");

    /* close it, so that the line is flushed and the handle released. */
    if(fclose(fp) != 0)
    {
        fprintf(stderr, "HGLtoIQ(): error while writing %s\n", fname);
    }
}
/*
 * Find2() returns the index of the leftmost occurrence of key in
 * string, and -1 if not found (also for an empty key and for a NULL
 * or empty string).
 * Note in this program, T==U, and case insensitive.
 *
 * Side effect: unless string is NULL or empty, key is converted in
 * place to lower case with every 'u' replaced by 't'.  string itself
 * is not changed.
 */
int
Find2(string,key)
char *key,*string;
{
    int i, j, len1, len2;
    char t;

    if(string == NULL || string[0] == '\0')
        return -1;

    len2 = strlen(string);

    /* normalize the key ("|32" folds ASCII letters to lower case). */
    len1 = strlen(key);
    for(i = 0; i<len1; i++)
    {
        key[i] |= 32;
        if(key[i] == 'u')
            key[i] = 't';
    }
    if(len1 == 0)
        return -1;

    /* The characters of string are normalized while comparing, so no
     * copy of string is needed. */
    for(j = 0; j + len1 <= len2; j++)
    {
        for(i = 0; i < len1; i++)
        {
            t = string[i+j]|32;
            if(t == 'u')
                t = 't';
            if(key[i] != t)
                break;
        }
        if(i == len1)
            return j;
    }
    return -1;
}
/* ReadGDE() reads one record from a GDE file into seq.
 *
 * A line starting with "sequence-ID" sets seq->sequence_ID to the
 * second word of that line.  A line starting with '#' sets seq->name
 * (at most 31 chars, cut at the first blank) and starts the sequence:
 * ALL remaining lines of the file, without their newline, are
 * concatenated into a freshly Calloc-ed seq->c_elem.
 *
 * return -1 if end-of-file (this is also the normal return after a
 *           record has been read).
 *        FALSE if anything is wrong (the sequence is empty).
 */
int
ReadGDE(fp, seq)
FILE *fp;
Sequence *seq;
{
    char temp_line[1000];
    int ii, l1;

    while(fgets(temp_line, 1000, fp) != NULL )
    {
        if(strncmp(temp_line, "sequence-ID", 11) == 0)
        {
            /* "%*s" skips the keyword without storing it. */
            sscanf(temp_line,"%*s%s",seq->sequence_ID);
        }
        else if(temp_line[0] == '#')
        {
            strncpy(seq->name, temp_line+1, 31);
            seq->name[31] = '\0';
            ii = 0;
            while(seq->name[ii] != '\0' &&
                  seq->name[ii] != ' '  &&
                  seq->name[ii] != '\n')
                ii++;
            seq->name[ii] = '\0';

            seq->seqmaxlen = 256;
            seq->c_elem=(char *)Calloc(seq->seqmaxlen,1);
            seq->seqlen = 0;

            while(fgets(temp_line, 1000, fp) != NULL)
            {
                l1 = strlen(temp_line);
                if(l1 > 0 && temp_line[l1 - 1] == '\n')
                {
                    l1--;
                    temp_line[l1] = '\0';
                }

                while(seq->seqmaxlen < seq->seqlen + l1 + 1)
                {
                    seq->seqmaxlen *= 2;
                    seq->c_elem = (char *)
                        Realloc(seq->c_elem, seq->seqmaxlen);
                }

                /* seqlen is the current end of c_elem: append there
                 * instead of searching for the end again. */
                strcpy(seq->c_elem + seq->seqlen, temp_line);
                seq->seqlen += l1;
            }

            if(seq->seqlen == 0)
            {
                fprintf(stderr, "\n%s\n","Sequence is empty.");
                return FALSE;
            }
        }
    }
    return -1;
}
/*
 * heapify() sifts element 'elem' of the heap down until neither of
 * its children compares greater (CompKey() > 0) than it.
 *
 * The heap is kept in *order, an array of indices into seq_set;
 * only *order is rearranged.  heap_size is the index of the LAST
 * element that belongs to the heap.  seq_size is not used here.
 */
void heapify(seq_set, seq_size, heap_size, elem, Pkey, Skey, order)
int seq_size, elem, heap_size, **order;
char Pkey[], Skey[];
Sequence *seq_set;
{
    int left, right, top, held;

    for(;;)
    {
        left = 2*elem+1;
        right = 2*elem+2;

        /* pick the greatest of elem and its (existing) children. */
        top = elem;
        if(left <= heap_size &&
           CompKey(seq_set[(*order)[left]], seq_set[(*order)[elem]],
                   Pkey, Skey) > 0)
            top = left;
        if(right <= heap_size &&
           CompKey(seq_set[(*order)[right]], seq_set[(*order)[top]],
                   Pkey, Skey) > 0)
            top = right;

        if(top == elem)
            return;

        held = (*order)[elem];
        (*order)[elem] = (*order)[top];
        (*order)[top] = held;

        /* go on with the child that got the old value. */
        elem = top;
    }
}
/*
 * heapsort() sorts the index array *order (seq_size entries, each an
 * index into seq_set) so that the records it refers to are in
 * ascending order according to CompKey() with the primary key Pkey
 * and the secondary key Skey.  seq_set itself is not changed.
 *
 * Always returns 0.
 */
int
heapsort(seq_set, seq_size, Pkey, Skey, order)
int seq_size, **order;
char Pkey[], Skey[];
Sequence *seq_set;
{
    int ii, temp, heap_size;

    /* build the heap; heap_size is the index of its last element. */
    heap_size = seq_size-1;
    for(ii = (seq_size-1)/2; ii>=0; ii--)
    {
        heapify(seq_set, seq_size, heap_size, ii,Pkey,Skey,order);
    }

    /* move the greatest element behind the heap, shrink, repair. */
    for(ii = seq_size-1; ii>0; ii--)
    {
        temp = (*order)[0];
        (*order)[0] = (*order)[ii];
        (*order)[ii] = temp;
        heap_size--;
        heapify(seq_set, seq_size, heap_size, 0, Pkey,Skey,order);
    }
    return 0;
}
/*
* Return >0, ==0, <0.
*/
int CompKey(seq1, seq2, Pkey, Skey)
Sequence seq1, seq2;
char Pkey[], Skey[];
{
int ii, jj, Pret;
char b1[32], b2[32];
if(strcmp(Pkey, "type") == 0)
{
Pret = strcmp(seq1.type, seq2.type);
if(Pret != 0 || Skey[0] == '\0') return Pret;
}
else if(strcmp(Pkey, "name") == 0)
{
Pret = strcmp(seq1.name, seq2.name);
if(Pret != 0 || Skey[0] == '\0') return Pret;
}
else if(strcmp(Pkey, "sequence-ID") == 0)
{
Pret = strcmp(seq1.sequence_ID, seq2.sequence_ID);
if(Pret != 0 || Skey[0] == '\0') return Pret;
}
else if(strcmp(Pkey, "creator") == 0)
{
Pret = strcmp(seq1.creator, seq2.creator);
if(Pret != 0 || Skey[0] == '\0') return Pret;
}
else if(strcmp(Pkey, "offset") == 0)
{
Pret = seq1.offset - seq2.offset;
if(Pret != 0 || Skey[0] == '\0') return Pret;
}
else if(strcmp(Pkey, "group-ID") == 0)
{
Pret = seq1.group_ID - seq2.group_ID;
if(Pret != 0 || Skey[0] == '\0') return Pret;
}
else if(strcmp(Pkey, "barcode") == 0)
{
if(seq1.barcode[0] == 'P')
strcpy(b1, seq1.barcode+2);
else
strcpy(b1, seq1.barcode);
if(seq2.barcode[0] == 'P')
strcpy(b2, seq2.barcode+2);
else
strcpy(b2, seq2.barcode);
Pret = strcmp(b1, b2);
if(Pret != 0 || Skey[0] == '\0') return Pret;
}
else if(strcmp(Pkey, "seqlen") == 0)
{
Pret = seq1.seqlen - seq2.seqlen;
if(Pret != 0 || Skey[0] == '\0') return Pret;
}
else if(strcmp(Pkey, "creation-date") == 0)
{
seq1.creation_date[0] %= 100;
seq2.creation_date[0] %= 100;
Pret = seq1.creation_date[0]*10000
+ seq1.creation_date[1]*100
+ seq1.creation_date[2]
- seq2.creation_date[0]*10000
- seq2.creation_date[1]*100
- seq2.creation_date[2];
if(Pret == 0)
{
Pret = seq1.creation_date[3]*10000
+ seq1.creation_date[4]*100
+ seq1.creation_date[5]
- seq2.creation_date[3]*10000
- seq2.creation_date[4]*100
- seq2.creation_date[5];
}
if(Pret != 0 || Skey[0] == '\0') return Pret;
}
else if(strcmp(Pkey, "probing-date") == 0)
{
seq1.probing_date[0] %= 100;
seq2.probing_date[0] %= 100;
Pret = seq1.probing_date[0]*10000
+ seq1.probing_date[1]*100
+ seq1.probing_date[2]
- seq2.probing_date[0]*10000
- seq2.probing_date[1]*100
- seq2.probing_date[2];
if(Pret == 0)
{
Pret = seq1.probing_date[3]*10000
+ seq1.probing_date[4]*100
+ seq1.probing_date[5]
- seq2.probing_date[3]*10000
- seq2.probing_date[4]*100
- seq2.probing_date[5];
}
if(Pret != 0 || Skey[0] == '\0') return Pret;
}
else if(strcmp(Pkey, "autorad_date") == 0)
{
seq1.autorad_date[0] %= 100;
seq2.autorad_date[0] %= 100;
Pret = seq1.autorad_date[0]*10000
+ seq1.autorad_date[1]*100
+ seq1.autorad_date[2]
- seq2.autorad_date[0]*10000
- seq2.autorad_date[1]*100
- seq2.autorad_date[2];
if(Pret == 0)
{
Pret = seq1.autorad_date[3]*10000
+ seq1.autorad_date[4]*100
+ seq1.autorad_date[5]
- seq2.autorad_date[3]*10000
- seq2.autorad_date[4]*100
- seq2.autorad_date[5];
}
if(Pret != 0 || Skey[0] == '\0') return Pret;
}
else if(strcmp(Pkey, "film") == 0)
{
Pret = strcmp(seq1.film, seq2.film);
if(Pret != 0 || Skey[0] == '\0') return Pret;
}
else if(strcmp(Pkey, "membrane") == 0)
{
Pret = strcmp(seq1.membrane, seq2.membrane);
if(Pret != 0 || Skey[0] == '\0') return Pret;
}
else if(strcmp(Pkey, "contig") == 0)
{
Pret = strcmp(seq1.contig, seq2.contig);
if(Pret != 0 || Skey[0] == '\0') return Pret;
}
else
{
fprintf(stderr,"CompKey(): Invalid primary key %s.\n",Pkey);
exit(1);
}
if(strcmp(Skey, "type") == 0)
{
return (strcmp(seq1.type, seq2.type));
}
else if(strcmp(Skey, "name") == 0)
{
return (strcmp(seq1.name, seq2.name));
}
else if(strcmp(Skey, "sequence-ID") == 0)
{
return (strcmp(seq1.sequence_ID, seq2.sequence_ID));
}
else if(strcmp(Skey, "creator") == 0)
{
return (strcmp(seq1.creator, seq2.creator));
}
else if(strcmp(Skey, "offset") == 0)
{
return (seq1.offset - seq2.offset);
}
else if(strcmp(Skey, "group-ID") == 0)
{
return (seq1.group_ID - seq2.group_ID);
}
else if(strcmp(Skey, "barcode") == 0)
{
if(seq1.barcode[0] == 'P')
strcpy(b1, seq1.barcode+2);
else
strcpy(b1, seq1.barcode);
if(seq2.barcode[0] == 'P')
strcpy(b2, seq2.barcode+2);
else
strcpy(b2, seq2.barcode);
return (strcmp(b1, b2));
}
else if(strcmp(Skey, "seqlen") == 0)
{
return(seq1.seqlen - seq2.seqlen);
}
else if(strcmp(Skey, "creation-date") == 0)
{
seq1.creation_date[0] %= 100;
seq2.creation_date[0] %= 100;
Pret = seq1.creation_date[0]*10000
+ seq1.creation_date[1]*100
+ seq1.creation_date[2]
- seq2.creation_date[0]*10000
- seq2.creation_date[1]*100
- seq2.creation_date[2];
if(Pret != 0)
return Pret;
return(seq1.creation_date[3]*10000
+ seq1.creation_date[4]*100
+ seq1.creation_date[5]
- seq2.creation_date[3]*10000
- seq2.creation_date[4]*100
- seq2.creation_date[5]);
}
else if(strcmp(Skey, "probing-date") == 0)
{
seq1.probing_date[0] %= 100;
seq2.probing_date[0] %= 100;
Pret = seq1.probing_date[0]*10000
+ seq1.probing_date[1]*100
+ seq1.probing_date[2]
- seq2.probing_date[0]*10000
- seq2.probing_date[1]*100
- seq2.probing_date[2];
if(Pret != 0)
return Pret;
return(seq1.probing_date[3]*10000
+ seq1.probing_date[4]*100
+ seq1.probing_date[5]
- seq2.probing_date[3]*10000
- seq2.probing_date[4]*100
- seq2.probing_date[5]);
}
else if(strcmp(Skey, "autorad_date") == 0)
{
seq1.autorad_date[0] %= 100;
seq2.autorad_date[0] %= 100;
Pret = seq1.autorad_date[0]*10000
+ seq1.autorad_date[1]*100
+ seq1.autorad_date[2]
- seq2.autorad_date[0]*10000
- seq2.autorad_date[1]*100
- seq2.autorad_date[2];
if(Pret != 0)
return Pret;
return(seq1.autorad_date[3]*10000
+ seq1.autorad_date[4]*100
+ seq1.autorad_date[5]
- seq2.autorad_date[3]*10000
- seq2.autorad_date[4]*100
- seq2.autorad_date[5]);
}
else if(strcmp(Skey, "film") == 0)
{
return(strcmp(seq1.film, seq2.film));
}
else if(strcmp(Skey, "membrane") == 0)
{
return(strcmp(seq1.membrane, seq2.membrane));
}
else if(strcmp(Skey, "contig") == 0)
{
return(strcmp(seq1.contig, seq2.contig));
}
else
{
fprintf(stderr, "CompKey(): Invalid secondary key %s.\n",Skey);
exit(1);
}
}
/*
 * Lock() takes the lock on file fname by renaming it to
 * "<fname>.locked".  While the file cannot be renamed (e.g. because
 * somebody else holds the lock and it is not there), Lock() retries
 * once per second, up to 30 times.
 *
 * rename() is used instead of a shell "mv" so that file names with
 * blanks or shell meta characters work, and so that testing for the
 * file and taking it is one single step.
 *
 * Returns TRUE if the lock was taken, FALSE otherwise.
 */
int Lock(fname)
char *fname;
{
    char locked[1024];
    int tries = 0;

    /* sizeof(".locked") counts the '\0' as well. */
    if(strlen(fname) + sizeof(".locked") > sizeof(locked))
    {
        fprintf(stderr, "Lock(): file name too long: %s\n", fname);
        return FALSE;
    }
    sprintf(locked, "%s.locked", fname);

    while(rename(fname, locked) != 0)
    {
        sleep(1);
        if(++tries == 30)
        {
            fprintf(stderr, "File %s not available, Try later.\n\n", fname);
            return FALSE;
        }
    }
    return TRUE;
}
/*
 * Unlock() releases the lock on file fname by renaming
 * "<fname>.locked" back to fname.  A message is printed on stderr if
 * that is not possible.
 */
void Unlock(fname)
char *fname;
{
    char locked[1024];

    /* sizeof(".locked") counts the '\0' as well. */
    if(strlen(fname) + sizeof(".locked") > sizeof(locked))
    {
        fprintf(stderr, "Unlock(): file name too long: %s\n", fname);
        return;
    }
    sprintf(locked, "%s.locked", fname);

    if(rename(locked, fname) != 0)
    {
        fprintf(stderr, "Unlock(): cannot rename %s to %s.\n",
                locked, fname);
    }
}
AppendComments(seq, str)
Sequence *seq;
char *str;
{
int ii, jj, kk;
kk = strlen(str);
if(seq->commentsmaxlen == 0)
{
seq->comments = (char *)Calloc(kk+1, 1);
seq->commentsmaxlen = kk+1;
seq->commentslen = 0;
}
else if(seq->commentslen+kk+1>seq->commentsmaxlen)
{
seq->commentsmaxlen += 2*(kk+1);
seq->comments = (char *)
Realloc(seq->comments, seq->commentsmaxlen);
}
seq->comments[seq->commentslen] = '\0';
seq->comments[seq->commentslen] = '\0';
strcat(seq->comments, str);
seq->commentslen = strlen(seq->comments);
}