2022-03-08 04:43:05 +08:00
|
|
|
|
|
|
|
/****************************************************************
|
|
|
|
*
|
2023-04-09 02:34:34 +08:00
|
|
|
* This is a set of functions defined for the genome
|
2022-03-08 04:43:05 +08:00
|
|
|
* project.
|
|
|
|
*
|
|
|
|
****************************************************************/
|
|
|
|
|
|
|
|
#ifndef _GLOBAL_DEFS_H
|
|
|
|
#define _GLOBAL_DEFS_H
|
2023-04-09 02:34:34 +08:00
|
|
|
#include <ctype.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
|
2022-03-08 04:43:05 +08:00
|
|
|
#include "global_defs.h"
|
|
|
|
#endif
|
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
#define MAXLINELEN 256
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
/*
 * IUPAC nucleotide symbols indexed by their 4-bit code: entry i is the
 * lower-case symbol that Default_IUPAC_Trans() maps back to code i.
 * The layout is consistent with a bit mask of a=1, c=2, g=4, t=8
 * (e.g. index 3 'm' = a|c, index 15 'n' = any base); index 0 is the
 * gap character '-'.  The table is only ever read, hence const.
 */
static const char Default_DNA_Trans[16] = {
    '-', 'a', 'c', 'm', 'g', 'r', 's', 'v',
    't', 'w', 'y', 'h', 'k', 'd', 'b', 'n'};
|
2022-03-08 04:43:05 +08:00
|
|
|
|
|
|
|
/***********
|
|
|
|
*
|
|
|
|
* WriteRecord() outputs one record at a time in HGL format.
|
2023-04-09 02:34:34 +08:00
|
|
|
* Only the fields in the fields_array will be output. All the
|
2022-03-08 04:43:05 +08:00
|
|
|
* fields will be output if fields_array is NULL.
|
|
|
|
*
|
|
|
|
* fp : pointer to the output file.
|
|
|
|
* tSeq: pointer to the record.
|
|
|
|
* fields_array: contains the field ids of the selected fields.
|
|
|
|
* array_size: number of selected fields.
|
|
|
|
*
|
|
|
|
* Returns: 1 if any field is printed;
|
|
|
|
* 0 if no field is printed;
|
|
|
|
* -1 if anything is wrong.
|
|
|
|
*
|
|
|
|
**********/
|
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
int WriteRecord(fp, tSeq, fields_array, array_size)
|
2022-03-08 04:43:05 +08:00
|
|
|
FILE *fp;
|
|
|
|
const Sequence *tSeq;
|
|
|
|
int *fields_array;
|
|
|
|
int array_size;
|
|
|
|
{
|
2023-04-09 02:34:34 +08:00
|
|
|
int i, save_str_size, tt;
|
|
|
|
int all_fields = FALSE;
|
|
|
|
int first_field = TRUE;
|
|
|
|
char temp_str[256];
|
|
|
|
char *save_str;
|
|
|
|
char *ptr;
|
|
|
|
|
|
|
|
save_str = (char *)Calloc(256, 1);
|
|
|
|
save_str_size = 256;
|
|
|
|
|
|
|
|
/* When all the fields are selected. */
|
|
|
|
if (fields_array == NULL) {
|
|
|
|
all_fields = TRUE;
|
|
|
|
fields_array = (int *)Calloc(NUM_OF_FIELDS, sizeof(int));
|
|
|
|
|
|
|
|
for (i = 0; i < NUM_OF_FIELDS; i++) {
|
|
|
|
fields_array[i] = i;
|
|
|
|
}
|
|
|
|
array_size = NUM_OF_FIELDS;
|
2022-03-08 04:43:05 +08:00
|
|
|
}
|
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
for (i = 0; i < array_size; i++) {
|
|
|
|
save_str[0] = '\0';
|
|
|
|
|
|
|
|
if (fields_array[i] == e_creation_date &&
|
|
|
|
tSeq->creation_date[0] != 0) {
|
|
|
|
sprintf(save_str, "\n%s\t%d/%d/%d ",
|
|
|
|
at[fields_array[i]], tSeq->creation_date[1],
|
|
|
|
tSeq->creation_date[2], tSeq->creation_date[0]);
|
|
|
|
|
|
|
|
if (tSeq->creation_date[3] >= 0) {
|
|
|
|
if (tSeq->creation_date[4] < 0)
|
|
|
|
tSeq->creation_date[4] = 0;
|
|
|
|
if (tSeq->creation_date[5] < 0)
|
|
|
|
tSeq->creation_date[5] = 0;
|
|
|
|
sprintf(save_str, "%s%d:%d:%d", save_str,
|
|
|
|
tSeq->creation_date[3],
|
|
|
|
tSeq->creation_date[4],
|
|
|
|
tSeq->creation_date[5]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else if (fields_array[i] == e_probing_date &&
|
|
|
|
tSeq->probing_date[0] != 0) {
|
|
|
|
sprintf(save_str, "\n%s\t%d/%d/%d ",
|
|
|
|
at[fields_array[i]], tSeq->probing_date[1],
|
|
|
|
tSeq->probing_date[2], tSeq->probing_date[0]);
|
|
|
|
|
|
|
|
if (tSeq->probing_date[3] >= 0) {
|
|
|
|
if (tSeq->probing_date[4] < 0)
|
|
|
|
tSeq->probing_date[4] = 0;
|
|
|
|
if (tSeq->probing_date[5] < 0)
|
|
|
|
tSeq->probing_date[5] = 0;
|
|
|
|
sprintf(save_str, "%s%d:%d:%d", save_str,
|
|
|
|
tSeq->probing_date[3],
|
|
|
|
tSeq->probing_date[4],
|
|
|
|
tSeq->probing_date[5]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else if (fields_array[i] == e_autorad_date &&
|
|
|
|
tSeq->autorad_date[0] != 0) {
|
|
|
|
sprintf(save_str, "\n%s\t%d/%d/%d ",
|
|
|
|
at[fields_array[i]], tSeq->autorad_date[1],
|
|
|
|
tSeq->autorad_date[2], tSeq->autorad_date[0]);
|
|
|
|
|
|
|
|
if (tSeq->autorad_date[3] >= 0) {
|
|
|
|
if (tSeq->autorad_date[4] < 0)
|
|
|
|
tSeq->autorad_date[4] = 0;
|
|
|
|
if (tSeq->autorad_date[5] < 0)
|
|
|
|
tSeq->autorad_date[5] = 0;
|
|
|
|
sprintf(save_str, "%s%d:%d:%d", save_str,
|
|
|
|
tSeq->autorad_date[3],
|
|
|
|
tSeq->autorad_date[4],
|
|
|
|
tSeq->autorad_date[5]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else if (fields_array[i] == e_c_elem && tSeq->c_elem != NULL) {
|
|
|
|
ptr = tSeq->c_elem;
|
|
|
|
sprintf(save_str, "\n%s\t\"", at[fields_array[i]]);
|
|
|
|
while (ptr < tSeq->c_elem + tSeq->seqlen) {
|
|
|
|
if (ptr != tSeq->c_elem) strcat(save_str, "\n");
|
|
|
|
strncpy(
|
|
|
|
temp_str, ptr,
|
|
|
|
MIN(60, tSeq->c_elem + tSeq->seqlen - ptr));
|
|
|
|
temp_str[MIN(60, tSeq->c_elem + tSeq->seqlen -
|
|
|
|
ptr)] = '\0';
|
|
|
|
|
|
|
|
/* Gurantee strlen(temp_str) chars for the
|
|
|
|
* string, one for \n, one for ", and one for
|
|
|
|
* \0.
|
|
|
|
*/
|
|
|
|
while (save_str_size - strlen(save_str) <
|
|
|
|
strlen(temp_str) + 3) {
|
|
|
|
save_str_size *= 2;
|
|
|
|
save_str = (char *)Realloc(
|
|
|
|
save_str, save_str_size);
|
|
|
|
}
|
|
|
|
strcat(save_str, temp_str);
|
|
|
|
ptr += 60;
|
|
|
|
}
|
|
|
|
strcat(save_str, "\"");
|
|
|
|
}
|
|
|
|
else if (fields_array[i] == e_comments &&
|
|
|
|
tSeq->commentslen != 0) {
|
|
|
|
while (save_str_size < 20 + tSeq->commentslen) {
|
|
|
|
save_str_size *= 2;
|
|
|
|
save_str =
|
|
|
|
(char *)Realloc(save_str, save_str_size);
|
|
|
|
}
|
|
|
|
|
|
|
|
strcat(save_str, "\n");
|
|
|
|
strcat(save_str, at[fields_array[i]]);
|
|
|
|
strcat(save_str, "\t\"\n");
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
/* put a \0 at the end of comments. */
|
|
|
|
while (tSeq->commentslen + 1 > tSeq->commentsmaxlen) {
|
|
|
|
tSeq->commentsmaxlen *= 2;
|
|
|
|
tSeq->comments = (char *)Realloc(
|
|
|
|
tSeq->comments, tSeq->commentsmaxlen);
|
|
|
|
}
|
|
|
|
tSeq->comments[tSeq->commentslen] = '\0';
|
|
|
|
|
|
|
|
/* clean up the leading empty lines.*/
|
|
|
|
tt = 0;
|
|
|
|
while (tSeq->comments[tt] == '\n' ||
|
|
|
|
tSeq->comments[tt] == ' ')
|
|
|
|
tt++;
|
|
|
|
tSeq->commentslen -= tt;
|
|
|
|
strcat(save_str, tSeq->comments + tt);
|
|
|
|
strcat(save_str, "\"");
|
|
|
|
}
|
|
|
|
else if (fields_array[i] == e_laneset && tSeq->laneset != -1)
|
|
|
|
sprintf(save_str, "\n%s\t\t%d", at[fields_array[i]],
|
|
|
|
tSeq->laneset);
|
|
|
|
else if (fields_array[i] == e_strandedness &&
|
|
|
|
tSeq->strandedness != 0)
|
|
|
|
sprintf(save_str, "\n%s\t%d", at[fields_array[i]],
|
|
|
|
tSeq->strandedness);
|
|
|
|
else if (fields_array[i] == e_direction && tSeq->direction != 0)
|
|
|
|
sprintf(save_str, "\n%s\t%d", at[fields_array[i]],
|
|
|
|
tSeq->direction);
|
|
|
|
else if (fields_array[i] == e_orig_strand &&
|
|
|
|
tSeq->orig_strand != 0)
|
|
|
|
sprintf(save_str, "\n%s\t%d", at[fields_array[i]],
|
|
|
|
tSeq->orig_strand);
|
|
|
|
else if (fields_array[i] == e_orig_direction &&
|
|
|
|
tSeq->orig_direction != 0)
|
|
|
|
sprintf(save_str, "\n%s\t%d", at[fields_array[i]],
|
|
|
|
tSeq->orig_direction);
|
|
|
|
else if (fields_array[i] == e_offset)
|
|
|
|
sprintf(save_str, "\n%s\t\t%d", at[fields_array[i]],
|
|
|
|
tSeq->offset);
|
|
|
|
else if (fields_array[i] == e_group_number &&
|
|
|
|
tSeq->group_number != 0)
|
|
|
|
sprintf(save_str, "\n%s\t%d", at[fields_array[i]],
|
|
|
|
tSeq->group_number);
|
|
|
|
else if (fields_array[i] == e_group_ID)
|
|
|
|
sprintf(save_str, "\n%s\t%d", at[fields_array[i]],
|
|
|
|
tSeq->group_ID);
|
|
|
|
else if (fields_array[i] == e_type && tSeq->type[0] != '\0')
|
|
|
|
sprintf(save_str, "\n%s\t\t\"%s\"", at[fields_array[i]],
|
|
|
|
tSeq->type);
|
|
|
|
else if (fields_array[i] == e_barcode &&
|
|
|
|
tSeq->barcode[0] != '\0')
|
|
|
|
sprintf(save_str, "\n%s\t\t\"%s\"", at[fields_array[i]],
|
|
|
|
tSeq->barcode);
|
|
|
|
else if (fields_array[i] == e_name && tSeq->name[0] != '\0')
|
|
|
|
sprintf(save_str, "\n%s\t\t\"%s\"", at[fields_array[i]],
|
|
|
|
tSeq->name);
|
|
|
|
else if (fields_array[i] == e_status && tSeq->status[0] != '\0')
|
|
|
|
sprintf(save_str, "\n%s\t\t\"%s\"", at[fields_array[i]],
|
|
|
|
tSeq->status);
|
|
|
|
else if (fields_array[i] == e_walk && tSeq->walk[0] != '\0')
|
|
|
|
sprintf(save_str, "\n%s\t\t\"%s\"", at[fields_array[i]],
|
|
|
|
tSeq->walk);
|
|
|
|
else if (fields_array[i] == e_sequence_ID &&
|
|
|
|
tSeq->sequence_ID[0] != '\0')
|
|
|
|
sprintf(save_str, "\n%s\t\"%s\"", at[fields_array[i]],
|
|
|
|
tSeq->sequence_ID);
|
|
|
|
else if (fields_array[i] == e_creator &&
|
|
|
|
tSeq->creator[0] != '\0')
|
|
|
|
sprintf(save_str, "\n%s\t\t\"%s\"", at[fields_array[i]],
|
|
|
|
tSeq->creator);
|
|
|
|
else if (fields_array[i] == e_film && tSeq->film[0] != '\0')
|
|
|
|
sprintf(save_str, "\n%s\t\t\"%s\"", at[fields_array[i]],
|
|
|
|
tSeq->film);
|
|
|
|
else if (fields_array[i] == e_membrane &&
|
|
|
|
tSeq->membrane[0] != '\0')
|
|
|
|
sprintf(save_str, "\n%s\t\"%s\"", at[fields_array[i]],
|
|
|
|
tSeq->membrane);
|
|
|
|
else if (fields_array[i] == e_source_ID &&
|
|
|
|
tSeq->source_ID[0] != '\0')
|
|
|
|
sprintf(save_str, "\n%s\t\"%s\"", at[fields_array[i]],
|
|
|
|
tSeq->source_ID);
|
|
|
|
else if (fields_array[i] == e_contig && tSeq->contig[0] != '\0')
|
|
|
|
sprintf(save_str, "\n%s\t\t\"%s\"", at[fields_array[i]],
|
|
|
|
tSeq->contig);
|
|
|
|
else if (fields_array[i] == e_baggage && tSeq->baglen != 0) {
|
|
|
|
if (save_str_size < tSeq->baglen + 2) {
|
|
|
|
save_str_size = tSeq->baglen + 2;
|
|
|
|
save_str =
|
|
|
|
(char *)Realloc(save_str, save_str_size);
|
|
|
|
}
|
|
|
|
|
|
|
|
save_str[0] = '\n';
|
|
|
|
save_str[1] = '\0';
|
|
|
|
|
|
|
|
/* put a \0 at the end of baggage. */
|
|
|
|
strncat(save_str, tSeq->baggage, tSeq->baglen);
|
|
|
|
while (save_str[tSeq->baglen - 1] == '\n') {
|
|
|
|
tSeq->baglen--;
|
|
|
|
}
|
|
|
|
save_str[tSeq->baglen] = '\0';
|
|
|
|
}
|
|
|
|
if (save_str[0] != '\0') {
|
|
|
|
if (first_field == TRUE) {
|
|
|
|
first_field = FALSE;
|
|
|
|
fprintf(fp, "{");
|
|
|
|
}
|
|
|
|
fprintf(fp, "%s", save_str);
|
2022-03-08 04:43:05 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
if (first_field == FALSE) {
|
|
|
|
fprintf(fp, "\n}\n");
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
if (all_fields == TRUE && fields_array != NULL) {
|
|
|
|
Cfree(fields_array);
|
|
|
|
fields_array = NULL;
|
|
|
|
}
|
|
|
|
if (save_str != NULL) {
|
|
|
|
Cfree(save_str);
|
|
|
|
save_str = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (first_field == TRUE)
|
|
|
|
return 0;
|
|
|
|
else
|
|
|
|
return 1;
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
|
|
|
/*********
|
|
|
|
*
|
|
|
|
* ReadRecord() reads one record from fp into tSeq. fp remains at
|
2023-04-09 02:34:34 +08:00
|
|
|
* the finishing position so that next time when ReadRecord() is
|
2022-03-08 04:43:05 +08:00
|
|
|
* called, it reads the next record.
|
|
|
|
*
|
|
|
|
* The caller program should LOCATE MEMORY for the tSeq before calling.
|
|
|
|
*
|
|
|
|
* ReadRecord() returns:
|
2023-04-09 02:34:34 +08:00
|
|
|
* TRUE if no error;
|
2022-03-08 04:43:05 +08:00
|
|
|
* FALSE if anything is wrong
|
|
|
|
* -1 if end-of-file is reached
|
|
|
|
*
|
|
|
|
**********/
|
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
int ReadRecord(fp, tSeq)
|
2022-03-08 04:43:05 +08:00
|
|
|
FILE *fp;
|
|
|
|
Sequence *tSeq;
|
|
|
|
{
|
2023-04-09 02:34:34 +08:00
|
|
|
char field_name[20], line[256], orig_line[256];
|
|
|
|
int temp_str_size, start, end, l, max_len = 255;
|
|
|
|
char *fgets_ret, *temp_str, *fgets_ret1;
|
|
|
|
int start_rec = FALSE;
|
|
|
|
int need_to_read = TRUE;
|
|
|
|
char started = 'F';
|
|
|
|
void InitRecord();
|
|
|
|
void FreeRecord();
|
|
|
|
|
|
|
|
temp_str = (char *)Calloc(256, 1);
|
|
|
|
temp_str_size = 256;
|
|
|
|
|
|
|
|
InitRecord(tSeq);
|
|
|
|
|
|
|
|
if (tSeq->c_elem == NULL) {
|
|
|
|
tSeq->c_elem = (char *)Calloc(256, 1);
|
|
|
|
tSeq->seqmaxlen = 256;
|
2022-03-08 04:43:05 +08:00
|
|
|
}
|
2023-04-09 02:34:34 +08:00
|
|
|
tSeq->c_elem[0] = '\0';
|
|
|
|
|
|
|
|
/* read file line-by-line. */
|
|
|
|
while (need_to_read == TRUE &&
|
|
|
|
((fgets_ret = fgets(line, max_len, fp)) != NULL ||
|
|
|
|
start_rec == TRUE)) {
|
|
|
|
strcpy(orig_line, line);
|
|
|
|
end = strlen(line) - 1;
|
|
|
|
while (end >= 0 && (line[end] == ' ' || line[end] == '\t' ||
|
|
|
|
line[end] == ',' || line[end] == '\n'))
|
|
|
|
end--;
|
|
|
|
|
|
|
|
/* ignore empty lines. */
|
|
|
|
if (end == -1) continue;
|
|
|
|
|
|
|
|
if (line[end] == '{') started = 'T';
|
|
|
|
|
|
|
|
/* to ignore the lines between a } and a {. */
|
|
|
|
while (started == 'F' && fgets_ret != NULL) {
|
|
|
|
fgets_ret = fgets(line, max_len, fp);
|
|
|
|
strcpy(orig_line, line);
|
|
|
|
end = strlen(line) - 1;
|
|
|
|
while (end >= 0 &&
|
|
|
|
(line[end] == ' ' || line[end] == '\t' ||
|
|
|
|
line[end] == ',' || line[end] == '\n'))
|
|
|
|
end--;
|
|
|
|
|
|
|
|
/* ignore empty lines. */
|
|
|
|
if (end == -1) continue;
|
|
|
|
|
|
|
|
if (line[end] == '{') started = 'T';
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
if (fgets_ret == NULL) return -1;
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
if (end < 0) {
|
2022-03-08 04:43:05 +08:00
|
|
|
}
|
2023-04-09 02:34:34 +08:00
|
|
|
else if ((line[end] == '}') && (end == 0)) {
|
|
|
|
start_rec = FALSE;
|
|
|
|
need_to_read = FALSE;
|
2022-03-08 04:43:05 +08:00
|
|
|
}
|
2023-04-09 02:34:34 +08:00
|
|
|
else if (line[end] == '{' && end <= 10) {
|
|
|
|
start_rec = TRUE;
|
2022-03-08 04:43:05 +08:00
|
|
|
}
|
2023-04-09 02:34:34 +08:00
|
|
|
else {
|
|
|
|
if (line[end] == '}') {
|
|
|
|
need_to_read = FALSE;
|
|
|
|
start_rec = FALSE;
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
/* locate the tag. */
|
|
|
|
start = 0;
|
|
|
|
while (line[start] == ' ' || line[start] == '\t' ||
|
|
|
|
line[start] == '\n' || line[start] == '{')
|
|
|
|
start++;
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
end = start + 1;
|
|
|
|
while (line[end] != ' ' && line[end] != '\t' &&
|
|
|
|
line[end] != '\n' && line[end] != '\0')
|
|
|
|
end++;
|
|
|
|
strncpy(field_name, line + start, end - start);
|
|
|
|
field_name[end - start] = '\0';
|
|
|
|
|
|
|
|
/* process the field value. */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* creation_date, probing_date, or autorad_date
|
2022-03-08 04:43:05 +08:00
|
|
|
*/
|
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
if (strcmp(field_name, "creation-date") == 0) {
|
|
|
|
while (!isdigit(line[end])) end++;
|
|
|
|
if (strToDate(line + end,
|
|
|
|
tSeq->creation_date) == -1) {
|
|
|
|
return FALSE;
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
}
|
2023-04-09 02:34:34 +08:00
|
|
|
else if (strcmp(field_name, "probing-date") == 0) {
|
|
|
|
while (line[end] != '\0' && !isdigit(line[end]))
|
|
|
|
end++;
|
|
|
|
|
|
|
|
if (line[end] != '\0' &&
|
|
|
|
strToDate(line + end, tSeq->probing_date) ==
|
|
|
|
-1) {
|
|
|
|
return FALSE;
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
}
|
2023-04-09 02:34:34 +08:00
|
|
|
else if (strcmp(field_name, "autorad-date") == 0) {
|
|
|
|
while (line[end] != '\0' && !isdigit(line[end]))
|
|
|
|
end++;
|
|
|
|
if (line[end] != '\0' &&
|
|
|
|
strToDate(line + end, tSeq->autorad_date) ==
|
|
|
|
-1) {
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-03-08 04:43:05 +08:00
|
|
|
/*
|
2023-04-09 02:34:34 +08:00
|
|
|
* sequence or comments.
|
|
|
|
*/
|
|
|
|
|
|
|
|
else if (strcmp(field_name, "sequence") == 0 ||
|
|
|
|
strcmp(field_name, "comments") == 0) {
|
|
|
|
temp_str[0] = '\0';
|
|
|
|
|
|
|
|
/* locate the first ". */
|
|
|
|
while (line[end++] != '"')
|
|
|
|
;
|
|
|
|
start = end;
|
|
|
|
end = strlen(line);
|
|
|
|
|
|
|
|
/* ---"\n\0. */
|
|
|
|
if (line[end - 2] == '"')
|
|
|
|
end -= 2;
|
|
|
|
else if (line[end - 1] == '\n' &&
|
|
|
|
strcmp(field_name, "sequence") == 0)
|
|
|
|
end--;
|
|
|
|
|
|
|
|
while (temp_str_size < end - start + 1) {
|
|
|
|
temp_str_size *= 2;
|
|
|
|
temp_str = (char *)Realloc(
|
|
|
|
temp_str, temp_str_size);
|
|
|
|
}
|
|
|
|
if (end - start > 0)
|
|
|
|
strncat(temp_str, line + start,
|
|
|
|
end - start);
|
|
|
|
|
|
|
|
/* Read the second line of the seq. or comments,
|
|
|
|
if any. end-start<0 is the case that " is the
|
|
|
|
only char this line.*/
|
|
|
|
if (line[strlen(line) - 2] != '"' ||
|
|
|
|
end - start < 0) {
|
|
|
|
while ((fgets_ret1 =
|
|
|
|
fgets(line, max_len, fp)) !=
|
|
|
|
NULL) {
|
|
|
|
/* IGNORE empty lines. 5/4/92 */
|
|
|
|
int empty_line = 0;
|
|
|
|
while (line[empty_line] == ' ')
|
|
|
|
empty_line++;
|
|
|
|
if (line[empty_line] == '\n') {
|
|
|
|
continue;
|
|
|
|
/* strncat(temp_str,
|
|
|
|
* line, end); 5/4/92 */
|
|
|
|
}
|
|
|
|
|
|
|
|
l = strlen(line) - 1;
|
|
|
|
if (line[l - 1] == '"')
|
|
|
|
end = l - 1;
|
|
|
|
else
|
|
|
|
end = l;
|
|
|
|
|
|
|
|
if (line[end] == '\n' &&
|
|
|
|
strcmp(field_name,
|
|
|
|
"comments") == 0)
|
|
|
|
end++;
|
|
|
|
|
|
|
|
/* Gurantee 'end' chars for the
|
|
|
|
* string, one for ", and one
|
|
|
|
* for \0.
|
|
|
|
*/
|
|
|
|
while (temp_str_size -
|
|
|
|
strlen(temp_str) <
|
|
|
|
end + 3) {
|
|
|
|
temp_str_size *= 2;
|
|
|
|
temp_str =
|
|
|
|
(char *)Realloc(
|
|
|
|
temp_str,
|
|
|
|
temp_str_size);
|
|
|
|
}
|
|
|
|
strncat(temp_str, line, end);
|
|
|
|
|
|
|
|
if (line[l - 1] == '"') break;
|
|
|
|
}
|
|
|
|
if (fgets_ret1 == NULL &&
|
|
|
|
need_to_read == TRUE) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"ReadRecord(): "
|
|
|
|
"incomplete record.\n");
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
l = strlen(temp_str);
|
|
|
|
if (strcmp(field_name, "comments") == 0) {
|
|
|
|
if (tSeq->commentsmaxlen == 0) {
|
|
|
|
tSeq->comments =
|
|
|
|
(char *)Calloc(l + 1, 1);
|
|
|
|
tSeq->commentsmaxlen = l + 1;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
while (tSeq->commentslen + l +
|
|
|
|
1 >
|
|
|
|
tSeq->commentsmaxlen) {
|
|
|
|
tSeq->commentsmaxlen *=
|
|
|
|
2;
|
|
|
|
tSeq->comments =
|
|
|
|
(char *)Realloc(
|
|
|
|
tSeq->comments,
|
|
|
|
tSeq->commentsmaxlen);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
tSeq->comments[tSeq->commentslen] =
|
|
|
|
'\0';
|
|
|
|
strcat(tSeq->comments, temp_str);
|
|
|
|
tSeq->commentslen += l;
|
|
|
|
}
|
|
|
|
else /* it is the sequence. */
|
|
|
|
{
|
|
|
|
if (tSeq->seqmaxlen == 0) {
|
|
|
|
tSeq->c_elem =
|
|
|
|
(char *)Calloc(l + 1, 1);
|
|
|
|
}
|
|
|
|
else if (l + 1 > tSeq->seqmaxlen) {
|
|
|
|
tSeq->c_elem = (char *)Realloc(
|
|
|
|
tSeq->c_elem, l + 1);
|
|
|
|
}
|
|
|
|
tSeq->seqmaxlen = l + 1;
|
|
|
|
tSeq->seqlen = l;
|
|
|
|
strcpy(tSeq->c_elem, temp_str);
|
|
|
|
}
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
/*
|
|
|
|
* Integer or String.
|
|
|
|
*/
|
|
|
|
|
|
|
|
else {
|
|
|
|
/* locate the value: a string or an integer. */
|
|
|
|
|
|
|
|
while (line[end] == ' ' || line[end] == '\t')
|
|
|
|
end++;
|
|
|
|
if (line[end] == '"') {
|
|
|
|
/* It is a string. */
|
|
|
|
end++;
|
|
|
|
start = end;
|
|
|
|
while (line[end] != '\0' &&
|
|
|
|
line[end] != '"')
|
|
|
|
end++;
|
|
|
|
/*
|
|
|
|
* strncat will not put a \0 at the end
|
|
|
|
* of a string if the copying string is
|
|
|
|
* longer than n.
|
|
|
|
*/
|
|
|
|
line[end++] = '\0';
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
/* It is an integer. */
|
|
|
|
start = end;
|
|
|
|
while (line[end] != ' ' &&
|
|
|
|
line[end] != '\t' &&
|
|
|
|
line[end] != '\n' &&
|
|
|
|
line[end] != '\0')
|
|
|
|
end++;
|
|
|
|
strncpy(temp_str, line + start,
|
|
|
|
end - start + 1); /*4/26 add 1*/
|
|
|
|
temp_str[end - start] = '\0';
|
|
|
|
}
|
|
|
|
|
|
|
|
/* assign to an integer field. */
|
|
|
|
if (strcmp(field_name, "laneset") == 0)
|
|
|
|
tSeq->laneset = atoi(temp_str);
|
|
|
|
else if (strcmp(field_name, "strandedness") ==
|
|
|
|
0)
|
|
|
|
tSeq->strandedness = atoi(temp_str);
|
|
|
|
else if (strcmp(field_name, "direction") == 0)
|
|
|
|
tSeq->direction = atoi(temp_str);
|
|
|
|
else if (strcmp(field_name, "orig_strand") == 0)
|
|
|
|
tSeq->orig_strand = atoi(temp_str);
|
|
|
|
else if (strcmp(field_name, "orig_direction") ==
|
|
|
|
0)
|
|
|
|
tSeq->orig_direction = atoi(temp_str);
|
|
|
|
else if (strcmp(field_name, "offset") == 0)
|
|
|
|
tSeq->offset = atoi(temp_str);
|
|
|
|
else if (strcmp(field_name, "group-number") ==
|
|
|
|
0)
|
|
|
|
tSeq->group_number = atoi(temp_str);
|
|
|
|
else if (strcmp(field_name, "group-ID") == 0)
|
|
|
|
tSeq->group_ID = atoi(temp_str);
|
|
|
|
|
|
|
|
/* assign to a string field. */
|
|
|
|
else if (strcmp(field_name, "type") == 0) {
|
|
|
|
if (end - start > 31) end = start + 31;
|
|
|
|
strncpy(tSeq->type, line + start,
|
|
|
|
end - start);
|
|
|
|
tSeq->type[end - start] = '\0';
|
|
|
|
}
|
|
|
|
else if (strcmp(field_name, "barcode") == 0) {
|
|
|
|
if (end - start > 31) end = start + 31;
|
|
|
|
strncpy(tSeq->barcode, line + start,
|
|
|
|
end - start);
|
|
|
|
tSeq->barcode[end - start] = '\0';
|
|
|
|
}
|
|
|
|
else if (strcmp(field_name, "name") == 0) {
|
|
|
|
if (end - start > 31) end = start + 31;
|
|
|
|
strncpy(tSeq->name, line + start,
|
|
|
|
end - start);
|
|
|
|
tSeq->name[end - start] = '\0';
|
|
|
|
}
|
|
|
|
else if (strcmp(field_name, "status") == 0) {
|
|
|
|
if (end - start > 31) end = start + 31;
|
|
|
|
strncpy(tSeq->status, line + start,
|
|
|
|
end - start);
|
|
|
|
tSeq->status[end - start] = '\0';
|
|
|
|
}
|
|
|
|
else if (strcmp(field_name, "walk") == 0) {
|
|
|
|
if (end - start > 31) end = start + 31;
|
|
|
|
strncpy(tSeq->walk, line + start,
|
|
|
|
end - start);
|
|
|
|
tSeq->walk[end - start] = '\0';
|
|
|
|
}
|
|
|
|
else if (strcmp(field_name, "sequence-ID") ==
|
|
|
|
0) {
|
|
|
|
if (end - start > 31) end = start + 31;
|
|
|
|
strncpy(tSeq->sequence_ID, line + start,
|
|
|
|
end - start);
|
|
|
|
tSeq->sequence_ID[end - start] = '\0';
|
|
|
|
}
|
|
|
|
else if (strcmp(field_name, "creator") == 0) {
|
|
|
|
if (end - start > 31) end = start + 31;
|
|
|
|
strncpy(tSeq->creator, line + start,
|
|
|
|
end - start);
|
|
|
|
tSeq->creator[end - start] = '\0';
|
|
|
|
}
|
|
|
|
else if (strcmp(field_name, "film") == 0) {
|
|
|
|
if (end - start > 31) end = start + 31;
|
|
|
|
strncpy(tSeq->film, line + start,
|
|
|
|
end - start);
|
|
|
|
tSeq->film[end - start] = '\0';
|
|
|
|
}
|
|
|
|
else if (strcmp(field_name, "membrane") == 0) {
|
|
|
|
if (end - start > 31) end = start + 31;
|
|
|
|
strncpy(tSeq->membrane, line + start,
|
|
|
|
end - start);
|
|
|
|
tSeq->membrane[end - start] = '\0';
|
|
|
|
}
|
|
|
|
else if (strcmp(field_name, "source-ID") == 0) {
|
|
|
|
if (end - start > 31) end = start + 31;
|
|
|
|
strncpy(tSeq->source_ID, line + start,
|
|
|
|
end - start);
|
|
|
|
tSeq->source_ID[end - start] = '\0';
|
|
|
|
}
|
|
|
|
else if (strcmp(field_name, "contig") == 0) {
|
|
|
|
if (end - start > 31) end = start + 31;
|
|
|
|
strncpy(tSeq->contig, line + start,
|
|
|
|
end - start);
|
|
|
|
tSeq->contig[end - start] = '\0';
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
if (tSeq->bagmaxlen == 0) {
|
|
|
|
tSeq->bagmaxlen =
|
|
|
|
4 * strlen(orig_line);
|
|
|
|
tSeq->baggage = (char *)Calloc(
|
|
|
|
tSeq->bagmaxlen, 1);
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
while (tSeq->bagmaxlen <
|
|
|
|
tSeq->baglen + 2 +
|
|
|
|
strlen(orig_line)) {
|
|
|
|
tSeq->bagmaxlen *= 2;
|
|
|
|
tSeq->baggage =
|
|
|
|
(char *)Realloc(
|
|
|
|
tSeq->baggage,
|
|
|
|
tSeq->bagmaxlen);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (tSeq->baglen == 0) {
|
|
|
|
/*
|
|
|
|
tSeq->baggage[0] = '\n';
|
|
|
|
tSeq->baggage[1] = '\0';
|
|
|
|
tSeq->baglen = 1;
|
|
|
|
*/
|
|
|
|
tSeq->baggage[0] = '\0';
|
|
|
|
}
|
|
|
|
|
|
|
|
/* strcat(tSeq->baggage, "\n");*/
|
|
|
|
strcat(tSeq->baggage, orig_line);
|
|
|
|
tSeq->baglen += strlen(orig_line);
|
|
|
|
}
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
if (temp_str != NULL) {
|
|
|
|
Cfree(temp_str);
|
|
|
|
temp_str = NULL;
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
if (start_rec == FALSE && fgets_ret == NULL) {
|
|
|
|
/* end of file, did not get a record. */
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
return TRUE;
|
2022-03-08 04:43:05 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*********
|
|
|
|
*
|
|
|
|
* Initialize a record.
|
|
|
|
*
|
|
|
|
* Note: no memory allocation is performed.
|
|
|
|
*
|
|
|
|
**********/
|
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
void InitRecord(tSeq) Sequence *tSeq;
|
2022-03-08 04:43:05 +08:00
|
|
|
{
|
2023-04-09 02:34:34 +08:00
|
|
|
int i;
|
|
|
|
|
|
|
|
strcpy(tSeq->type, "DNA");
|
|
|
|
tSeq->barcode[0] = '\0';
|
|
|
|
tSeq->name[0] = '\0';
|
|
|
|
tSeq->status[0] = '\0';
|
|
|
|
strcpy(tSeq->walk, "FALSE");
|
|
|
|
tSeq->sequence_ID[0] = '\0';
|
|
|
|
|
|
|
|
tSeq->c_elem = NULL;
|
|
|
|
tSeq->seqlen = 0;
|
|
|
|
tSeq->seqmaxlen = 0;
|
|
|
|
|
|
|
|
for (i = 0; i < 6; i++) {
|
|
|
|
tSeq->creation_date[i] = 0;
|
|
|
|
tSeq->probing_date[i] = 0;
|
|
|
|
tSeq->autorad_date[i] = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
tSeq->creator[0] = '\0';
|
|
|
|
tSeq->film[0] = '\0';
|
|
|
|
tSeq->membrane[0] = '\0';
|
|
|
|
tSeq->source_ID[0] = '\0';
|
|
|
|
tSeq->contig[0] = '\0';
|
|
|
|
tSeq->laneset = -1;
|
|
|
|
tSeq->direction = 1; /* (1/-1/0),default: 5 to 3. */
|
|
|
|
tSeq->strandedness = 1; /* (1/2/0), default: primary.*/
|
|
|
|
tSeq->orig_direction = 0; /* (0 unknown, -1:3'->5', 1:5'->3') */
|
|
|
|
tSeq->orig_strand = 0; /* (0 unknown, 1:primary, 2:secondary) */
|
|
|
|
tSeq->offset = 0;
|
|
|
|
|
|
|
|
tSeq->comments = NULL;
|
|
|
|
tSeq->commentslen = 0;
|
|
|
|
tSeq->commentsmaxlen = 0;
|
|
|
|
|
|
|
|
tSeq->baggage = NULL;
|
|
|
|
tSeq->baglen = 0;
|
|
|
|
tSeq->bagmaxlen = 0;
|
|
|
|
tSeq->group_number = 0;
|
|
|
|
tSeq->group_ID = 0;
|
2022-03-08 04:43:05 +08:00
|
|
|
}
|
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
void CopyRecord(to, from) Sequence *from, *to;
|
|
|
|
{
|
|
|
|
int i;
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
InitRecord(to);
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
strcpy(to->type, from->type);
|
|
|
|
|
|
|
|
strcpy(to->barcode, from->barcode);
|
|
|
|
strcpy(to->name, from->name);
|
|
|
|
strcpy(to->status, from->status);
|
|
|
|
strcpy(to->walk, from->walk);
|
|
|
|
strcpy(to->sequence_ID, from->sequence_ID);
|
|
|
|
|
|
|
|
if (from->c_elem != NULL) {
|
|
|
|
to->seqlen = from->seqlen;
|
|
|
|
to->seqmaxlen = from->seqmaxlen;
|
|
|
|
to->c_elem = (char *)Calloc(to->seqmaxlen, 1);
|
|
|
|
strncpy(to->c_elem, from->c_elem, to->seqlen);
|
|
|
|
to->c_elem[to->seqlen] = '\0';
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
for (i = 0; i < 6; i++) {
|
|
|
|
to->creation_date[i] = from->creation_date[i];
|
|
|
|
to->probing_date[i] = from->probing_date[i];
|
|
|
|
to->autorad_date[i] = from->autorad_date[i];
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
strcpy(to->creator, from->creator);
|
|
|
|
strcpy(to->film, from->film);
|
|
|
|
strcpy(to->membrane, from->membrane);
|
|
|
|
strcpy(to->source_ID, from->source_ID);
|
|
|
|
strcpy(to->contig, from->contig);
|
|
|
|
to->laneset = from->laneset;
|
|
|
|
to->strandedness = from->strandedness;
|
|
|
|
to->orig_direction = from->orig_direction;
|
|
|
|
to->orig_strand = from->orig_strand;
|
|
|
|
to->direction = from->direction;
|
|
|
|
to->offset = from->offset;
|
|
|
|
|
|
|
|
if (from->comments != NULL) {
|
|
|
|
to->commentsmaxlen = from->commentsmaxlen;
|
|
|
|
to->commentslen = from->commentslen;
|
|
|
|
to->comments = (char *)Calloc(to->commentsmaxlen, 1);
|
|
|
|
strncpy(to->comments, from->comments, to->commentslen);
|
|
|
|
to->comments[to->commentslen] = '\0';
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
if (from->baggage != NULL) {
|
|
|
|
to->baglen = from->baglen;
|
|
|
|
to->bagmaxlen = from->bagmaxlen;
|
|
|
|
to->baggage = (char *)Calloc(to->bagmaxlen, 1);
|
|
|
|
strncpy(to->baggage, from->baggage, to->baglen);
|
|
|
|
to->baggage[to->baglen] = '\0';
|
|
|
|
}
|
|
|
|
|
|
|
|
to->group_number = from->group_number;
|
|
|
|
to->group_ID = from->group_ID;
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
|
|
|
/*********
|
|
|
|
*
|
|
|
|
* Clean the contents of a record without changing the memory size.
|
|
|
|
*
|
|
|
|
**********/
|
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
void CleanRecord(tSeq) Sequence *tSeq;
|
2022-03-08 04:43:05 +08:00
|
|
|
{
|
2023-04-09 02:34:34 +08:00
|
|
|
int i;
|
|
|
|
|
|
|
|
strcpy(tSeq->type, "DNA");
|
|
|
|
tSeq->name[0] = '\0';
|
|
|
|
tSeq->barcode[0] = '\0';
|
|
|
|
tSeq->status[0] = '\0';
|
|
|
|
strcpy(tSeq->walk, "FALSE");
|
|
|
|
tSeq->sequence_ID[0] = '\0';
|
|
|
|
|
|
|
|
if (tSeq->c_elem != NULL) tSeq->c_elem[0] = '\0';
|
|
|
|
tSeq->seqlen = 0;
|
|
|
|
|
|
|
|
for (i = 0; i < 6; i++) {
|
|
|
|
tSeq->creation_date[i] = 0;
|
|
|
|
tSeq->probing_date[i] = 0;
|
|
|
|
tSeq->autorad_date[i] = 0;
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
tSeq->creator[0] = '\0';
|
|
|
|
tSeq->film[0] = '\0';
|
|
|
|
tSeq->membrane[0] = '\0';
|
|
|
|
tSeq->source_ID[0] = '\0';
|
|
|
|
tSeq->contig[0] = '\0';
|
|
|
|
tSeq->laneset = -1;
|
|
|
|
tSeq->strandedness = 1; /* (1/2/0), default. primary. */
|
|
|
|
tSeq->direction = 1; /* (1/-1/0),default. 5 to 3. */
|
|
|
|
tSeq->orig_direction = 0;
|
|
|
|
tSeq->orig_strand = 0;
|
|
|
|
tSeq->offset = 0;
|
|
|
|
|
|
|
|
if (tSeq->comments != NULL) tSeq->comments[0] = '\0';
|
|
|
|
tSeq->commentslen = 0;
|
|
|
|
|
|
|
|
if (tSeq->baggage != NULL) tSeq->baggage[0] = '\0';
|
|
|
|
tSeq->baglen = 0;
|
|
|
|
tSeq->group_number = 0;
|
|
|
|
tSeq->group_ID = 0;
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
|
|
|
/*********
|
|
|
|
*
|
|
|
|
* Free memory for a record.
|
|
|
|
*
|
|
|
|
**********/
|
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
void FreeRecord(tSeq) Sequence **tSeq;
|
2022-03-08 04:43:05 +08:00
|
|
|
{
|
2023-04-09 02:34:34 +08:00
|
|
|
Cfree((*tSeq)->c_elem);
|
|
|
|
Cfree((*tSeq)->comments);
|
|
|
|
Cfree((*tSeq)->baggage);
|
|
|
|
Cfree((*tSeq));
|
|
|
|
(*tSeq)->c_elem = NULL;
|
|
|
|
(*tSeq)->comments = NULL;
|
|
|
|
(*tSeq)->baggage = NULL;
|
|
|
|
(*tSeq) = NULL;
|
2022-03-08 04:43:05 +08:00
|
|
|
}
|
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
/*
 * Number of days per month, indexed as max_day[leap][month] with
 * month 1..12 (slot 0 is unused); row 1 is for leap years.
 */
static const int max_day[2][13] = {
    {0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31},
    {0, 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}};
|
2022-03-08 04:43:05 +08:00
|
|
|
|
|
|
|
/***********
|
|
|
|
*
|
2023-04-09 02:34:34 +08:00
|
|
|
* strToDate() locates first six integers and translates them
|
2022-03-08 04:43:05 +08:00
|
|
|
* into a date.
|
|
|
|
*
|
2023-04-09 02:34:34 +08:00
|
|
|
* String should have the format of "mm/dd/yy hh/mn/sc xm",
|
2022-03-08 04:43:05 +08:00
|
|
|
* with anything except digit as the delimiters.
|
|
|
|
*
|
|
|
|
* Order in the date array is (0->5): (yy mm dd hh mn sc).
|
|
|
|
*
|
|
|
|
* Returns FALSE if anything is wrong, TRUE otherwise.
|
|
|
|
*
|
|
|
|
**********/
|
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
int strToDate(str, date) const char *str;
|
2022-03-08 04:43:05 +08:00
|
|
|
int date[];
|
|
|
|
{
|
2023-04-09 02:34:34 +08:00
|
|
|
int leap;
|
|
|
|
char temp_str[2];
|
|
|
|
char longstr[256];
|
|
|
|
|
|
|
|
/* locate 6 integers. */
|
|
|
|
|
|
|
|
strcpy(longstr, str);
|
|
|
|
strcat(longstr, " -1/-1/-1 ");
|
|
|
|
sscanf(longstr, "%d%*c%d%*c%d%*c%d%*c%d%*c%d%2s", &date[1], &date[2],
|
|
|
|
&date[0], &date[3], &date[4], &date[5], temp_str);
|
|
|
|
|
|
|
|
/* verify year. */
|
|
|
|
if (date[0] >= 100) date[0] -= 1900;
|
|
|
|
|
|
|
|
/* verify month. */
|
|
|
|
if (date[1] > 12 || date[1] < 1) {
|
|
|
|
fprintf(stderr, "invalid month %s\n", str);
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* verify day. */
|
|
|
|
if ((date[0] % 4 == 0 && date[0] % 100 != 0) || date[0] % 400 == 0)
|
|
|
|
leap = 1;
|
|
|
|
else
|
|
|
|
leap = 0;
|
|
|
|
|
|
|
|
if (date[2] > max_day[leap][date[1]] || date[2] < 1) {
|
|
|
|
fprintf(stderr, "invalid day %s\n", str);
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* verify time. */
|
|
|
|
if (strncmp(temp_str, "pm", 2) == 0) date[3] += 12;
|
|
|
|
if (date[3] < -1 || date[3] > 23 || date[4] < -1 || date[4] > 59 ||
|
|
|
|
date[5] < -1 || date[5] > 59) {
|
|
|
|
fprintf(stderr, "invalid time %s\n", str);
|
|
|
|
return FALSE;
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
return TRUE;
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
|
|
|
/**********
|
2023-04-09 02:34:34 +08:00
|
|
|
*
|
2022-03-08 04:43:05 +08:00
|
|
|
* Default_IUPAC_Trans() translates an ASCII IUPAC code into
|
|
|
|
* an (char) integer.
|
|
|
|
*
|
|
|
|
**********/
|
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
char Default_IUPAC_Trans(base)
|
2022-03-08 04:43:05 +08:00
|
|
|
char base;
|
|
|
|
{
|
2023-04-09 02:34:34 +08:00
|
|
|
int i;
|
|
|
|
char c;
|
|
|
|
c = base | 32;
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
if (c == 'u') return (char)8;
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
if (c == 'p') return (char)5;
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
for (i = 0; i < 16; i++) {
|
|
|
|
if (c == Default_DNA_Trans[i]) {
|
|
|
|
return ((char)i);
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
}
|
2023-04-09 02:34:34 +08:00
|
|
|
fprintf(stderr, "Character %c is not IUPAC coded.\n", base);
|
|
|
|
return -1;
|
2022-03-08 04:43:05 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Forward declaration (old-style, no parameter list).  The consensus and
 * mask builders below copy the string it returns into
 * consensus->sequence_ID. */
char *uniqueID();
|
|
|
|
|
|
|
|
/***********
|
2023-04-09 02:34:34 +08:00
|
|
|
*
|
2022-03-08 04:43:05 +08:00
|
|
|
* MakeConsensus() takes an array of aligned sequence and an
|
|
|
|
* initialized 'Sequence' consensus. It modifies the consensus.
|
|
|
|
*
|
2023-04-09 02:34:34 +08:00
|
|
|
* The memory that 'consensus' has located will be reused, and
|
2022-03-08 04:43:05 +08:00
|
|
|
* consensus->seqmaxlen will be modified if necessary.
|
|
|
|
*
|
|
|
|
* Returns TRUE if successful, FALSE otherwise.
|
|
|
|
*
|
|
|
|
**********/
|
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
int MakeConsensus(aligned, numOfAligned, consensus, group)
|
|
|
|
Sequence aligned[]; /* input. */
|
|
|
|
int numOfAligned; /* input. */
|
|
|
|
Sequence *consensus; /* input and output. */
|
|
|
|
int group; /* Group number (if zero, use all groups) */
|
2022-03-08 04:43:05 +08:00
|
|
|
{
|
2023-04-09 02:34:34 +08:00
|
|
|
char occurence;
|
|
|
|
int i, j, index;
|
|
|
|
int max_cons = INT_MIN;
|
|
|
|
int min_offset = INT_MAX;
|
|
|
|
char temp_str[2];
|
|
|
|
unsigned char case_bit;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Search for the minimun offset.
|
|
|
|
*/
|
|
|
|
|
|
|
|
for (i = 0; i < numOfAligned; i++) {
|
|
|
|
if (group == 0 || aligned[i].group_number == group) {
|
|
|
|
SeqNormal(&aligned[i]);
|
|
|
|
min_offset = MIN(min_offset, aligned[i].offset);
|
|
|
|
max_cons = MAX(max_cons,
|
|
|
|
aligned[i].offset + aligned[i].seqlen);
|
2022-03-08 04:43:05 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2023-04-09 02:34:34 +08:00
|
|
|
* Decide consensus base by base.
|
|
|
|
*/
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
CleanRecord(consensus);
|
|
|
|
consensus->offset = min_offset;
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
if (aligned[0].contig[0] != '\0') {
|
|
|
|
strcpy(consensus->name, aligned[0].contig);
|
|
|
|
strcat(consensus->name, ".");
|
|
|
|
}
|
|
|
|
else if (strncmp(aligned[0].name, "cons.", 5) != 0) {
|
|
|
|
strcpy(consensus->name, "cons.");
|
|
|
|
strcat(consensus->name, aligned[0].name);
|
|
|
|
}
|
|
|
|
strcpy(consensus->sequence_ID, uniqueID());
|
|
|
|
strcpy(consensus->contig, aligned[0].contig);
|
|
|
|
|
|
|
|
for (j = min_offset; j < max_cons; j++) {
|
|
|
|
occurence = 00;
|
|
|
|
case_bit = 0;
|
|
|
|
for (i = 0; i < numOfAligned; i++) {
|
|
|
|
if (group == 0 || aligned[i].group_number == group) {
|
|
|
|
if (j >= aligned[i].offset &&
|
|
|
|
j < aligned[i].offset + aligned[i].seqlen) {
|
|
|
|
index = j - aligned[i].offset;
|
|
|
|
|
|
|
|
if (aligned[i].c_elem[index] == '-')
|
|
|
|
case_bit = 32;
|
|
|
|
else if (case_bit == 0)
|
|
|
|
case_bit |=
|
|
|
|
(aligned[i].c_elem[index] &
|
|
|
|
32);
|
|
|
|
|
|
|
|
occurence =
|
|
|
|
occurence |
|
|
|
|
Default_IUPAC_Trans(
|
|
|
|
aligned[i].c_elem[index]);
|
|
|
|
|
|
|
|
if (occurence != 1 && occurence != 2 &&
|
|
|
|
occurence != 4 && occurence != 8)
|
|
|
|
case_bit = 32;
|
|
|
|
/*
|
|
|
|
printf("%1c", aligned[i].c_elem[index]);
|
|
|
|
*/
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
else
|
|
|
|
printf(" ");
|
|
|
|
*/
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
sprintf(temp_str, "%1c", Default_DNA_Trans[(int)occurence]);
|
|
|
|
if (case_bit == 0) temp_str[0] = toupper(temp_str[0]);
|
|
|
|
|
|
|
|
if (InsertElems(consensus, j, temp_str) == FALSE) return FALSE;
|
|
|
|
/*
|
|
|
|
printf(" cons[%d]=%1c\n", j - min_offset,
|
|
|
|
consensus->c_elem[j - min_offset]);
|
|
|
|
*/
|
|
|
|
}
|
|
|
|
return TRUE;
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
|
|
|
/***********
|
2023-04-09 02:34:34 +08:00
|
|
|
*
|
2022-03-08 04:43:05 +08:00
|
|
|
* MakeScore() takes an array of aligned sequence, and generates
|
2023-04-09 02:34:34 +08:00
|
|
|
* a consensus. Note, memory for (Sequence* consensus) should be
|
2022-03-08 04:43:05 +08:00
|
|
|
* located before it is passed to this function.
|
2023-04-09 02:34:34 +08:00
|
|
|
*
|
2022-03-08 04:43:05 +08:00
|
|
|
* Returns TRUE if successful, FALSE otherwise.
|
|
|
|
*
|
|
|
|
**********/
|
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
int MakeScore(aligned, numOfAligned, consensus, group)
|
|
|
|
Sequence aligned[]; /* input. */
|
|
|
|
int numOfAligned; /* input. */
|
|
|
|
Sequence *consensus; /* input and output. */
|
2022-03-08 04:43:05 +08:00
|
|
|
int group;
|
|
|
|
{
|
2023-04-09 02:34:34 +08:00
|
|
|
int i, j, index, score;
|
|
|
|
int max_cons = INT_MIN;
|
|
|
|
int min_offset = INT_MAX;
|
|
|
|
int As, Cs, Ts, Gs, Ns, tot_in_grp;
|
|
|
|
char temp_str[2], occurence, base;
|
|
|
|
int max_occ;
|
|
|
|
|
|
|
|
static char map[17] = "0123456789ABCDEF";
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Search for the minimum offset.
|
|
|
|
*/
|
|
|
|
|
|
|
|
for (i = 0; i < numOfAligned; i++) {
|
|
|
|
if (group == 0 || aligned[i].group_number == group) {
|
|
|
|
SeqNormal(&aligned[i]);
|
|
|
|
min_offset = MIN(min_offset, aligned[i].offset);
|
|
|
|
max_cons = MAX(max_cons,
|
|
|
|
aligned[i].offset + aligned[i].seqlen);
|
2022-03-08 04:43:05 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
/*
|
|
|
|
* Decide consensus base by base.
|
|
|
|
*/
|
|
|
|
CleanRecord(consensus);
|
|
|
|
consensus->offset = min_offset;
|
|
|
|
|
|
|
|
if (aligned[0].contig[0] != '\0') {
|
|
|
|
strcpy(consensus->name, aligned[0].contig);
|
|
|
|
strcat(consensus->name, ".");
|
|
|
|
}
|
|
|
|
else if (strncmp(aligned[0].name, "cons.", 5) != 0) {
|
|
|
|
strcpy(consensus->name, "cons.");
|
|
|
|
strcat(consensus->name, aligned[0].name);
|
|
|
|
}
|
|
|
|
strcpy(consensus->sequence_ID, uniqueID());
|
|
|
|
strcpy(consensus->contig, aligned[0].contig);
|
|
|
|
|
|
|
|
for (j = min_offset; j < max_cons; j++) {
|
|
|
|
As = Cs = Ts = Gs = Ns = 0;
|
|
|
|
tot_in_grp = 0;
|
|
|
|
occurence = 00;
|
|
|
|
|
|
|
|
for (i = 0; i < numOfAligned; i++) {
|
|
|
|
if (group == 0 || aligned[i].group_number == group) {
|
|
|
|
if (j >= aligned[i].offset &&
|
|
|
|
j < aligned[i].offset + aligned[i].seqlen) {
|
|
|
|
tot_in_grp++;
|
|
|
|
index = j - aligned[i].offset;
|
|
|
|
|
|
|
|
/*
|
|
|
|
occurence =
|
|
|
|
Default_IUPAC_Trans(aligned[i].c_elem[index]);
|
|
|
|
if((occurence & 01) == 01)
|
|
|
|
As++;
|
|
|
|
if((occurence & 02) == 02)
|
|
|
|
Cs++;
|
|
|
|
if((occurence & 04) == 04)
|
|
|
|
Gs++;
|
|
|
|
if((occurence & 010) == 010)
|
|
|
|
Ts++;
|
|
|
|
*/
|
|
|
|
|
|
|
|
base = (aligned[i].c_elem[index] | 32);
|
|
|
|
|
|
|
|
if (base == 'a')
|
|
|
|
As++;
|
|
|
|
else if (base == 'c')
|
|
|
|
Cs++;
|
|
|
|
else if (base == 'g')
|
|
|
|
Gs++;
|
|
|
|
else if (base == 't')
|
|
|
|
Ts++;
|
|
|
|
else if (base == 'n' || base == '-')
|
|
|
|
Ns++;
|
|
|
|
/*
|
|
|
|
printf("%1c",
|
|
|
|
aligned[i].c_elem[index]);
|
|
|
|
*/
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
else
|
|
|
|
printf(" ");
|
|
|
|
*/
|
|
|
|
}
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
max_occ = MAX(As, MAX(Cs, MAX(Gs, Ts)));
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
/* socre = [0,E], F:all mismatches are either 'n' or '-' */
|
|
|
|
if (Ns != 0 && max_occ + Ns == tot_in_grp)
|
|
|
|
score = 15;
|
|
|
|
else
|
|
|
|
score = max_occ * 14 / tot_in_grp;
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
/*
|
|
|
|
if( score > 0xF )
|
|
|
|
{
|
|
|
|
if (InsertElems(consensus, j, "F") == FALSE)
|
|
|
|
{
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
*/
|
|
|
|
|
|
|
|
sprintf(temp_str, "%1c", map[score]);
|
|
|
|
if (InsertElems(consensus, j, temp_str) == FALSE) {
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
printf(" %2d-%2d-%2d-%2d %2d cons[%d]=%1c\n",
|
|
|
|
Ts, Gs, Cs, As, score, j,
|
|
|
|
consensus->c_elem[j]);
|
|
|
|
*/
|
2022-03-08 04:43:05 +08:00
|
|
|
}
|
2023-04-09 02:34:34 +08:00
|
|
|
return TRUE;
|
2022-03-08 04:43:05 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/***********
|
2023-04-09 02:34:34 +08:00
|
|
|
*
|
2022-03-08 04:43:05 +08:00
|
|
|
* MakePhyloMask() takes an array of aligned sequence, and generates
|
|
|
|
* a mask that has a '0' for all columns except the columns which contain
|
|
|
|
* a, c, g, t and u only.
|
2023-04-09 02:34:34 +08:00
|
|
|
*
|
2022-03-08 04:43:05 +08:00
|
|
|
* Returns TRUE if successful, FALSE otherwise.
|
|
|
|
*
|
|
|
|
**********/
|
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
int MakePhyloMask(aligned, numOfAligned, consensus, group, acgtu)
|
|
|
|
Sequence aligned[]; /* input. */
|
|
|
|
int numOfAligned; /* input. */
|
|
|
|
Sequence *consensus; /* input and output. */
|
2022-03-08 04:43:05 +08:00
|
|
|
int acgtu[];
|
|
|
|
int group;
|
|
|
|
{
|
2023-04-09 02:34:34 +08:00
|
|
|
int i, j, cnt, max_cons = INT_MIN, min_offset = INT_MAX;
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
/*
|
|
|
|
* Search for the minimum offset.
|
|
|
|
*/
|
|
|
|
|
|
|
|
for (i = 0; i < numOfAligned; i++) {
|
|
|
|
if (group == 0 || aligned[i].group_number == group) {
|
|
|
|
SeqNormal(&aligned[i]);
|
|
|
|
min_offset = MIN(min_offset, aligned[i].offset);
|
|
|
|
max_cons = MAX(max_cons,
|
|
|
|
aligned[i].offset + aligned[i].seqlen);
|
|
|
|
}
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
/*
|
|
|
|
* Decide consensus base by base.
|
|
|
|
*/
|
|
|
|
CleanRecord(consensus);
|
|
|
|
consensus->offset = min_offset;
|
|
|
|
strcpy(consensus->name, "mask");
|
|
|
|
strcpy(consensus->type, "MASK");
|
|
|
|
strcpy(consensus->sequence_ID, uniqueID());
|
|
|
|
strcpy(consensus->contig, aligned[0].contig);
|
|
|
|
|
|
|
|
consensus->seqlen = max_cons - min_offset;
|
|
|
|
if (consensus->seqmaxlen == 0) {
|
|
|
|
consensus->c_elem =
|
|
|
|
(char *)Calloc(max_cons - min_offset + 5, 1);
|
|
|
|
consensus->seqmaxlen = max_cons - min_offset + 5;
|
|
|
|
}
|
|
|
|
else if (consensus->seqmaxlen < max_cons - min_offset) {
|
|
|
|
consensus->seqmaxlen = max_cons - min_offset + 5;
|
|
|
|
consensus->c_elem = (char *)Realloc(consensus->c_elem,
|
|
|
|
max_cons - min_offset + 5);
|
|
|
|
}
|
|
|
|
|
|
|
|
cnt = 0;
|
|
|
|
for (j = min_offset; j < max_cons; j++) {
|
|
|
|
consensus->c_elem[j - min_offset] = '1';
|
|
|
|
for (i = 0; i < numOfAligned; i++) {
|
|
|
|
if (group == 0 || aligned[i].group_number == group) {
|
|
|
|
if (j < aligned[i].offset ||
|
|
|
|
j >=
|
|
|
|
aligned[i].offset + aligned[i].seqlen ||
|
|
|
|
acgtu[aligned[i]
|
|
|
|
.c_elem[j - aligned[i].offset]] ==
|
|
|
|
0) {
|
|
|
|
consensus->c_elem[j - min_offset] = '0';
|
|
|
|
cnt++;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
}
|
|
|
|
}
|
2023-04-09 02:34:34 +08:00
|
|
|
fprintf(stderr, "\nNumber of 1s in mask: %d\n",
|
|
|
|
max_cons - min_offset - cnt);
|
|
|
|
fprintf(stderr, "Number of 0s in mask: %d\n\n", cnt);
|
|
|
|
return TRUE;
|
2022-03-08 04:43:05 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/***********
|
2023-04-09 02:34:34 +08:00
|
|
|
*
|
2022-03-08 04:43:05 +08:00
|
|
|
* MajorityCons() takes an array of aligned sequence, and generates
|
2023-04-09 02:34:34 +08:00
|
|
|
* a MAJORITY consensus.
|
|
|
|
* Note, memory for (Sequence* consensus) should be
|
2022-03-08 04:43:05 +08:00
|
|
|
* located before it is passed to this function.
|
2023-04-09 02:34:34 +08:00
|
|
|
*
|
2022-03-08 04:43:05 +08:00
|
|
|
* Returns TRUE if successful, FALSE otherwise.
|
|
|
|
*
|
|
|
|
**********/
|
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
int MajorityCons(aligned, numOfAligned, consensus, group, major_perc)
|
|
|
|
Sequence aligned[]; /* input. */
|
|
|
|
int numOfAligned; /* input. */
|
|
|
|
Sequence *consensus; /* input and output. */
|
2022-03-08 04:43:05 +08:00
|
|
|
int group, major_perc;
|
|
|
|
{
|
2023-04-09 02:34:34 +08:00
|
|
|
int i, j, index, score, ii, base, max;
|
|
|
|
int max_cons = INT_MIN;
|
|
|
|
int min_offset = INT_MAX;
|
|
|
|
char temp_str[2], occurence;
|
|
|
|
int *cnts, tot_in_grp;
|
|
|
|
unsigned char case_bit;
|
|
|
|
|
|
|
|
cnts = (int *)Calloc(16, sizeof(int));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Search for the minimum offset.
|
|
|
|
*/
|
|
|
|
|
|
|
|
for (i = 0; i < numOfAligned; i++) {
|
|
|
|
if (group == 0 || aligned[i].group_number == group) {
|
|
|
|
SeqNormal(&aligned[i]);
|
|
|
|
min_offset = MIN(min_offset, aligned[i].offset);
|
|
|
|
max_cons = MAX(max_cons,
|
|
|
|
aligned[i].offset + aligned[i].seqlen);
|
2022-03-08 04:43:05 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
/*
|
|
|
|
* Decide consensus base by base.
|
|
|
|
*/
|
|
|
|
|
|
|
|
CleanRecord(consensus);
|
|
|
|
consensus->offset = min_offset;
|
|
|
|
|
|
|
|
if (aligned[0].contig[0] != '\0') {
|
|
|
|
strcpy(consensus->name, aligned[0].contig);
|
|
|
|
strcat(consensus->name, ".");
|
2022-03-08 04:43:05 +08:00
|
|
|
}
|
2023-04-09 02:34:34 +08:00
|
|
|
else if (strncmp(aligned[0].name, "cons.", 5) != 0) {
|
|
|
|
strcpy(consensus->name, "cons.");
|
|
|
|
strcat(consensus->name, aligned[0].name);
|
2022-03-08 04:43:05 +08:00
|
|
|
}
|
2023-04-09 02:34:34 +08:00
|
|
|
strcpy(consensus->sequence_ID, uniqueID());
|
|
|
|
strcpy(consensus->contig, aligned[0].contig);
|
|
|
|
|
|
|
|
for (j = min_offset; j < max_cons; j++) {
|
|
|
|
case_bit = 0;
|
|
|
|
occurence = 00;
|
|
|
|
tot_in_grp = 0;
|
|
|
|
for (ii = 0; ii < 16; ii++) cnts[ii] = 0;
|
|
|
|
|
|
|
|
for (i = 0; i < numOfAligned; i++) {
|
|
|
|
if (group == 0 || aligned[i].group_number == group) {
|
|
|
|
if (j >= aligned[i].offset &&
|
|
|
|
j < aligned[i].offset + aligned[i].seqlen) {
|
|
|
|
tot_in_grp++;
|
|
|
|
index = j - aligned[i].offset;
|
|
|
|
|
|
|
|
if (aligned[i].c_elem[index] == '-')
|
|
|
|
case_bit = 32;
|
|
|
|
else if (case_bit == 0)
|
|
|
|
case_bit |=
|
|
|
|
(aligned[i].c_elem[index] &
|
|
|
|
32);
|
|
|
|
|
|
|
|
occurence |= Default_IUPAC_Trans(
|
|
|
|
aligned[i].c_elem[index]);
|
|
|
|
cnts[(int)Default_IUPAC_Trans(
|
|
|
|
aligned[i].c_elem[index])]++;
|
|
|
|
|
|
|
|
if (case_bit == 0 && occurence != 1 &&
|
|
|
|
occurence != 2 && occurence != 4 &&
|
|
|
|
occurence != 8)
|
|
|
|
case_bit = 32;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
max = 0;
|
|
|
|
for (ii = 0; ii < 16; ii++) {
|
|
|
|
if (cnts[ii] > max) {
|
|
|
|
max = cnts[ii];
|
|
|
|
base = ii;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (max * 100 / tot_in_grp >= major_perc) {
|
|
|
|
/* follow the majority rule. */
|
|
|
|
sprintf(temp_str, "%1c", Default_DNA_Trans[base]);
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
/* use IUPAC code. */
|
|
|
|
sprintf(temp_str, "%1c",
|
|
|
|
Default_DNA_Trans[(int)occurence]);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (case_bit == 0) temp_str[0] = toupper(temp_str[0]);
|
|
|
|
|
|
|
|
if (InsertElems(consensus, j, temp_str) == FALSE) {
|
|
|
|
return FALSE;
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
}
|
2023-04-09 02:34:34 +08:00
|
|
|
return TRUE;
|
2022-03-08 04:43:05 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/***********
|
|
|
|
*
|
|
|
|
* ReadGDEtoHGL() reads a GDE formated file into an array of HGL structure.
|
|
|
|
*
|
|
|
|
* Return -1 if anything is wrong, number_of_sequence otherwise.
|
|
|
|
*
|
|
|
|
***********/
|
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
int ReadGDEtoHGL(fp, tSeq_arr)
|
2022-03-08 04:43:05 +08:00
|
|
|
FILE *fp;
|
|
|
|
Sequence **tSeq_arr;
|
|
|
|
{
|
2023-04-09 02:34:34 +08:00
|
|
|
char line[MAXLINELEN];
|
|
|
|
int ptr, num_seq, max_num_seq = 20;
|
|
|
|
int seq_len = 200;
|
|
|
|
char *newline;
|
|
|
|
|
|
|
|
(*tSeq_arr) = (Sequence *)Calloc(max_num_seq, sizeof(Sequence));
|
|
|
|
num_seq = -1;
|
|
|
|
while (fgets(line, MAXLINELEN - 2, fp) != NULL) /* spaces for \n\0 */
|
|
|
|
{
|
|
|
|
/* ptr points to the last char. */
|
|
|
|
ptr = strlen(line) - 1;
|
|
|
|
|
|
|
|
/* clear up the tail. */
|
|
|
|
while (ptr >= 0 && (line[ptr] == '\n' || line[ptr] == ' ' ||
|
|
|
|
line[ptr] == '\t'))
|
|
|
|
ptr--;
|
|
|
|
line[ptr + 1] = '\0';
|
|
|
|
|
|
|
|
if (ptr <= 0) {
|
|
|
|
/* it is an empty line. */
|
2022-03-08 04:43:05 +08:00
|
|
|
}
|
2023-04-09 02:34:34 +08:00
|
|
|
else if (line[0] == '#') {
|
|
|
|
if (++num_seq == max_num_seq) {
|
|
|
|
max_num_seq *= 2;
|
|
|
|
/* printf("max_num_seq = %d\n", max_num_seq); */
|
|
|
|
(*tSeq_arr) = (Sequence *)Realloc(
|
|
|
|
(*tSeq_arr),
|
|
|
|
max_num_seq * sizeof(Sequence));
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
InitRecord((*tSeq_arr)[num_seq]);
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
if (line[ptr] == '<') {
|
|
|
|
(*tSeq_arr)[num_seq].direction = 2; /* 3to5 */
|
|
|
|
line[ptr] = '\0';
|
|
|
|
}
|
|
|
|
else if (line[ptr] == '>') {
|
|
|
|
(*tSeq_arr)[num_seq].direction = 1; /* 5to3 */
|
|
|
|
line[ptr] = '\0';
|
|
|
|
}
|
|
|
|
strcpy((*tSeq_arr)[num_seq].sequence_ID, line + 1);
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
ptr = 0;
|
|
|
|
if ((*tSeq_arr)[num_seq].seqlen == 0) {
|
|
|
|
/* determine the offset. */
|
|
|
|
while (line[ptr] != '\0' && line[ptr] == '-') {
|
|
|
|
ptr++;
|
|
|
|
}
|
|
|
|
(*tSeq_arr)[num_seq].offset += ptr;
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
if (line[ptr] != '\0') {
|
|
|
|
newline = line + ptr;
|
|
|
|
|
|
|
|
if ((*tSeq_arr)[num_seq].seqmaxlen == 0) {
|
|
|
|
(*tSeq_arr)[num_seq].c_elem =
|
|
|
|
(char *)Calloc(seq_len, 1);
|
|
|
|
(*tSeq_arr)[num_seq].c_elem[0] = '\0';
|
|
|
|
(*tSeq_arr)[num_seq].seqmaxlen =
|
|
|
|
seq_len;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
while ((*tSeq_arr)[num_seq].seqlen +
|
|
|
|
strlen(newline) + 1 >
|
|
|
|
(*tSeq_arr)[num_seq].seqmaxlen) {
|
|
|
|
seq_len *= 2;
|
|
|
|
(*tSeq_arr)[num_seq]
|
|
|
|
.c_elem = (char *)Realloc(
|
|
|
|
(*tSeq_arr)[num_seq].c_elem,
|
|
|
|
seq_len);
|
|
|
|
(*tSeq_arr)[num_seq].seqmaxlen =
|
|
|
|
seq_len;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
strcat((*tSeq_arr)[num_seq].c_elem, newline);
|
|
|
|
(*tSeq_arr)[num_seq].seqlen =
|
|
|
|
strlen((*tSeq_arr)[num_seq].c_elem);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return (num_seq + 1);
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
|
|
|
/********
|
|
|
|
*
|
|
|
|
* InsertElems returns TRUE if successful, FALSE otherwise.
|
|
|
|
*
|
|
|
|
********/
|
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
int InsertElems(seq, pos, c)
|
|
|
|
Sequence *seq; /* Sequence */
|
|
|
|
int pos; /* Position (in respect to the master consensus)
|
|
|
|
* to insert BEFORE
|
|
|
|
* always move string to the right. */
|
|
|
|
char c[]; /*Null terminated array of elements to insert */
|
2022-03-08 04:43:05 +08:00
|
|
|
{
|
2023-04-09 02:34:34 +08:00
|
|
|
int dashes, j, len;
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
len = strlen(c);
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
if (seq->seqlen == 0) {
|
|
|
|
/* get rid of '-'s at right. */
|
|
|
|
/*
|
|
|
|
dashes = len-1;
|
|
|
|
while(dashes >= 0 && c[dashes] == '-')
|
|
|
|
dashes--;
|
|
|
|
if(dashes < 0)
|
|
|
|
{
|
|
|
|
seq->offset = pos;
|
|
|
|
return TRUE;
|
|
|
|
}
|
|
|
|
c[dashes+1] = '\0';
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* clear out '-'s at left. */
|
|
|
|
dashes = 0;
|
|
|
|
/*
|
|
|
|
while(c[dashes] == '-')
|
|
|
|
dashes++;
|
|
|
|
|
|
|
|
c += dashes;
|
|
|
|
len = strlen(c);
|
|
|
|
pos += dashes;
|
|
|
|
*/
|
|
|
|
|
|
|
|
if (seq->seqmaxlen == 0) {
|
|
|
|
seq->c_elem = (char *)Calloc(len + 1, 1);
|
|
|
|
seq->seqmaxlen = len + 1;
|
|
|
|
}
|
|
|
|
else if (len + 1 >= seq->seqmaxlen) {
|
|
|
|
seq->c_elem = (char *)Realloc(seq->c_elem, len + 1);
|
|
|
|
seq->seqmaxlen = len + 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
strcpy(seq->c_elem, c);
|
|
|
|
seq->seqlen = len;
|
|
|
|
seq->offset = pos;
|
|
|
|
return TRUE;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* to make sure there is a space for '\0'. */
|
|
|
|
if (seq->seqlen > seq->seqmaxlen) {
|
|
|
|
fprintf(
|
|
|
|
stderr,
|
|
|
|
"InsertElems(): seqlen>seqmaxlen. Something is wrong.\n");
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
while (seq->seqlen + 1 >= seq->seqmaxlen) {
|
|
|
|
seq->seqmaxlen *= 2;
|
|
|
|
seq->c_elem =
|
|
|
|
(char *)Realloc(seq->c_elem, seq->seqmaxlen);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
seq->c_elem[seq->seqlen] = '\0';
|
|
|
|
|
|
|
|
if (pos < seq->offset) /* insert to the left of the seq. */
|
2022-03-08 04:43:05 +08:00
|
|
|
{
|
2023-04-09 02:34:34 +08:00
|
|
|
/* ignore the dashes at the left. */
|
|
|
|
dashes = 0;
|
|
|
|
/*
|
|
|
|
while(dashes < len && c[dashes] == '-')
|
|
|
|
dashes++;
|
|
|
|
if(c[dashes] == '\0')
|
|
|
|
{
|
|
|
|
seq->offset += len;
|
|
|
|
return TRUE;
|
|
|
|
}
|
|
|
|
c += dashes;
|
|
|
|
len -= dashes;
|
|
|
|
*/
|
|
|
|
|
|
|
|
if (seq->seqlen + len + seq->offset - pos > seq->seqmaxlen) {
|
|
|
|
seq->seqmaxlen =
|
|
|
|
seq->seqlen + len + seq->offset - pos + 256;
|
|
|
|
seq->c_elem =
|
|
|
|
(char *)Realloc(seq->c_elem, seq->seqmaxlen);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* copy the old string including the last '\0'. */
|
|
|
|
for (j = seq->seqlen; j >= 0; j--)
|
|
|
|
seq->c_elem[j + len + seq->offset - pos] =
|
|
|
|
seq->c_elem[j];
|
|
|
|
|
|
|
|
/* insert dashes. */
|
|
|
|
for (j = len; j < len + seq->offset - pos; j++)
|
|
|
|
seq->c_elem[j] = '-';
|
|
|
|
|
|
|
|
/* copy the inserted string. */
|
|
|
|
for (j = 0; j < len; j++) seq->c_elem[j] = c[j];
|
|
|
|
|
|
|
|
/* detector. */
|
|
|
|
if (c[j] != '\0')
|
|
|
|
fprintf(stderr, "InsertElems: Problem.....\n");
|
|
|
|
|
|
|
|
seq->seqlen = strlen(seq->c_elem);
|
|
|
|
|
|
|
|
/* seq->offset = pos; commented on 6-3-91 */
|
|
|
|
seq->offset = pos + dashes;
|
|
|
|
if (dashes > 0)
|
|
|
|
printf("\nInsertElems(): dashes is not zero.\n\n");
|
2022-03-08 04:43:05 +08:00
|
|
|
}
|
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
else if (pos - seq->offset >= seq->seqlen) /* insert to the right. */
|
|
|
|
{
|
|
|
|
/* ignore the dashes at the right. */
|
|
|
|
/*
|
|
|
|
dashes = len -1;
|
|
|
|
while(dashes >= 0 && c[dashes] == '-')
|
|
|
|
dashes--;
|
|
|
|
if(dashes < 0)
|
|
|
|
return TRUE;
|
|
|
|
len = dashes+1;
|
|
|
|
c[len] = '\0';
|
|
|
|
*/
|
|
|
|
|
|
|
|
if (pos - seq->offset + len > seq->seqmaxlen) {
|
|
|
|
seq->seqmaxlen = pos - seq->offset + len + 256;
|
|
|
|
seq->c_elem =
|
|
|
|
(char *)Realloc(seq->c_elem, seq->seqmaxlen);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* insert dashes. */
|
|
|
|
for (j = seq->seqlen; j < pos - seq->offset; j++)
|
|
|
|
seq->c_elem[j] = '-';
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
/* copy the inserted string. */
|
|
|
|
for (j = 0; j < len; j++)
|
|
|
|
seq->c_elem[pos - seq->offset + j] = c[j];
|
|
|
|
seq->c_elem[pos - seq->offset + len] = '\0';
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
/* detector. */
|
|
|
|
if (c[j] != '\0')
|
|
|
|
fprintf(stderr, "InsertElems: Problem too .....\n");
|
|
|
|
|
|
|
|
seq->seqlen = strlen(seq->c_elem);
|
2022-03-08 04:43:05 +08:00
|
|
|
}
|
2023-04-09 02:34:34 +08:00
|
|
|
else /* insert into the seq. */
|
2022-03-08 04:43:05 +08:00
|
|
|
{
|
2023-04-09 02:34:34 +08:00
|
|
|
if (seq->seqlen + len > seq->seqmaxlen) {
|
|
|
|
seq->seqmaxlen = seq->seqlen + len + 256;
|
|
|
|
seq->c_elem =
|
|
|
|
(char *)Realloc(seq->c_elem, seq->seqmaxlen);
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
/* move the bottom part of the older string including the last
|
|
|
|
* '\0'. */
|
|
|
|
for (j = seq->seqlen; j >= pos - seq->offset; j--)
|
|
|
|
seq->c_elem[j + len] = seq->c_elem[j];
|
|
|
|
|
|
|
|
/* copy the inserted string. */
|
|
|
|
for (j = 0; j < len; j++)
|
|
|
|
seq->c_elem[pos - seq->offset + j] = c[j];
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
/* detector. */
|
|
|
|
if (c[j] != '\0')
|
|
|
|
fprintf(stderr,
|
|
|
|
"InsertElems: Problem too too .....\n");
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
seq->seqlen = strlen(seq->c_elem);
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
return TRUE;
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
|
|
|
/******************************************************************
|
|
|
|
*
|
|
|
|
* int GetArgs(argArray, numArgs)
|
|
|
|
* Arg *argArray;
|
|
|
|
* int numArgs;
|
|
|
|
*
|
|
|
|
* Return TRUE if successful, FALSE otherwise.
|
|
|
|
*
|
|
|
|
******************************************************************/
|
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
#define MAX_ARGS 50 /* maximum args this can process */
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
int GetArgs(argArray, numArgs, argc, argv)
|
2022-03-08 04:43:05 +08:00
|
|
|
Args *argArray;
|
2023-04-09 02:34:34 +08:00
|
|
|
int numArgs;
|
2022-03-08 04:43:05 +08:00
|
|
|
int argc;
|
|
|
|
char **argv;
|
|
|
|
{
|
2023-04-09 02:34:34 +08:00
|
|
|
int i, j;
|
|
|
|
Args *curarg;
|
|
|
|
int noArgOK = TRUE;
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
if ((argArray == NULL) || (numArgs == 0) || (numArgs > MAX_ARGS)) {
|
|
|
|
fprintf(stderr, "GetArgs: Invalid number of args.\n");
|
|
|
|
return FALSE;
|
2022-03-08 04:43:05 +08:00
|
|
|
}
|
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
/*
|
|
|
|
* Test if all are either 'default' or 'optional'.
|
|
|
|
*/
|
2022-03-08 04:43:05 +08:00
|
|
|
curarg = argArray;
|
2023-04-09 02:34:34 +08:00
|
|
|
for (i = 0; i < numArgs; i++, curarg++) {
|
|
|
|
if (curarg->strvalue[0] == '\0' && curarg->optional == 'F') {
|
|
|
|
noArgOK = FALSE;
|
|
|
|
break;
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
}
|
2023-04-09 02:34:34 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* show usage if some arg is required but no arg is
|
|
|
|
* supllied on command line.
|
|
|
|
*/
|
|
|
|
if (noArgOK == FALSE && argc == 1) {
|
|
|
|
fprintf(stderr, "\n%s arguments:\n\n", argv[0]);
|
|
|
|
curarg = argArray;
|
|
|
|
|
|
|
|
for (i = 0; i < numArgs; i++, curarg++) {
|
|
|
|
fprintf(stderr, " -%c %s ", curarg->tag,
|
|
|
|
curarg->prompt);
|
|
|
|
if (curarg->optional == 'T')
|
|
|
|
fprintf(stderr, " [Optional]");
|
|
|
|
fprintf(stderr, "\n");
|
|
|
|
if (curarg->strvalue[0] != '\0')
|
|
|
|
fprintf(stderr, " default = %s\n",
|
|
|
|
curarg->strvalue);
|
|
|
|
}
|
|
|
|
fprintf(stderr, "\n");
|
|
|
|
return FALSE;
|
2022-03-08 04:43:05 +08:00
|
|
|
}
|
2023-04-09 02:34:34 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Process
|
|
|
|
*/
|
|
|
|
for (i = 1; i < argc; i++) {
|
|
|
|
if (argv[i][0] != '-') {
|
|
|
|
fprintf(stderr, "Arguments must start with -");
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* check the tag. */
|
|
|
|
curarg = argArray;
|
|
|
|
for (j = 0; j < numArgs; j++, curarg++) {
|
|
|
|
if ((argv[i][1] | 32) == (curarg->tag | 32)) break;
|
|
|
|
}
|
|
|
|
if (j == numArgs) {
|
|
|
|
fprintf(stderr, "Invalid argument tag in %s\n",
|
|
|
|
argv[i]);
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
|
|
|
|
strcpy(curarg->strvalue, argv[i] + 2);
|
|
|
|
if (curarg->strvalue[0] == '\'' &&
|
|
|
|
curarg->strvalue[strlen(curarg->strvalue) - 1] == '\'') {
|
|
|
|
char ttmm[256];
|
|
|
|
strcpy(ttmm, curarg->strvalue + 1);
|
|
|
|
ttmm[strlen(ttmm) - 1] = '\0';
|
|
|
|
strcpy(curarg->strvalue, ttmm);
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
}
|
2023-04-09 02:34:34 +08:00
|
|
|
return TRUE;
|
2022-03-08 04:43:05 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*********
|
|
|
|
*
|
|
|
|
* GetCond interprets the -c argument, the condition.
|
|
|
|
*
|
|
|
|
* The condition will be set to NULL if no condition is specified,
|
|
|
|
* that is, if you pass '&p' as the address of a cond* structure,
|
|
|
|
* p will be set to NULL if no condition [(p == NULL) = TRUE].
|
|
|
|
*
|
|
|
|
* Return TRUE if successful, FALSE otherwise.
|
|
|
|
*
|
|
|
|
*********/
|
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
int GetCond(arg, cond)
|
2022-03-08 04:43:05 +08:00
|
|
|
char *arg;
|
|
|
|
str_cond **cond;
|
|
|
|
{
|
2023-04-09 02:34:34 +08:00
|
|
|
int start, end, i, found;
|
|
|
|
char message_buf[1000];
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
if (strcmp(arg, "null") == 0) {
|
|
|
|
(*cond) = NULL;
|
|
|
|
return TRUE;
|
2022-03-08 04:43:05 +08:00
|
|
|
}
|
2023-04-09 02:34:34 +08:00
|
|
|
else {
|
|
|
|
(*cond) = (str_cond *)Calloc(1, sizeof(str_cond));
|
|
|
|
|
|
|
|
start = end = 0;
|
|
|
|
|
|
|
|
/* find the field name. */
|
|
|
|
while (('a' <= arg[end] && arg[end] <= 'z') ||
|
|
|
|
('A' <= arg[end] && arg[end] <= 'Z') || arg[end] == '-')
|
|
|
|
end++;
|
|
|
|
|
|
|
|
found = FALSE;
|
|
|
|
for (i = 0; i < NUM_OF_FIELDS && found == FALSE; i++) {
|
|
|
|
if (strncmp(arg, at[i], strlen(at[i])) == 0) {
|
|
|
|
(*cond)->field =
|
|
|
|
i; /* condition on field &at[i]. */
|
|
|
|
found = TRUE;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (found == FALSE) {
|
|
|
|
strncpy(message_buf, arg, end - start);
|
|
|
|
message_buf[end - start] = '\0';
|
|
|
|
fprintf(stderr, "Field %s not found.\n", message_buf);
|
|
|
|
return FALSE;
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
start = end;
|
|
|
|
end++;
|
|
|
|
while (arg[end] == '=' || arg[end] == '!' || arg[end] == '>' ||
|
|
|
|
arg[end] == '<')
|
|
|
|
end++;
|
|
|
|
strncpy((*cond)->symbol, arg + start, end - start);
|
|
|
|
(*cond)->symbol[end - start] = '\0';
|
|
|
|
if (strlen((*cond)->symbol) > 2 ||
|
|
|
|
strlen((*cond)->symbol) < 1 ||
|
|
|
|
(strlen((*cond)->symbol) == 1 &&
|
|
|
|
*((*cond)->symbol) != '>' && *((*cond)->symbol) != '<') ||
|
|
|
|
(strlen((*cond)->symbol) == 2 &&
|
|
|
|
(strncmp((*cond)->symbol, "!=", 2) != 0) &&
|
|
|
|
(strncmp((*cond)->symbol, "==", 2) != 0) &&
|
|
|
|
(strncmp((*cond)->symbol, ">=", 2) != 0) &&
|
|
|
|
(strncmp((*cond)->symbol, "<=", 2) != 0))) {
|
|
|
|
fprintf(stderr, "Invalid condition.\n");
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (arg[end] == '"' && arg[strlen(arg) - 1] == '"') {
|
|
|
|
end++;
|
|
|
|
arg[strlen(arg) - 1] = '\0';
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
(*cond)->value = (char *)Calloc(strlen(arg) - end + 2, 1);
|
|
|
|
strcpy((*cond)->value, arg + end);
|
|
|
|
}
|
|
|
|
return TRUE;
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
|
|
|
/*********
|
|
|
|
*
|
|
|
|
* GetFields interprets the -f arguments, the fields list.
|
|
|
|
*
|
2023-04-09 02:34:34 +08:00
|
|
|
* Returns number of selected fields, 0 if anything is wrong.
|
2022-03-08 04:43:05 +08:00
|
|
|
*
|
|
|
|
*********/
|
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
int GetFields(arg, selected_fields)
|
2022-03-08 04:43:05 +08:00
|
|
|
char *arg;
|
|
|
|
int selected_fields[];
|
|
|
|
{
|
2023-04-09 02:34:34 +08:00
|
|
|
int start, end, i, found, list_done, i_selected;
|
|
|
|
char message_buf[1000];
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
if (strcmp(arg, "all") == 0) {
|
|
|
|
selected_fields[0] = -1;
|
|
|
|
return NUM_OF_FIELDS;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
start = end = 0;
|
|
|
|
list_done = FALSE;
|
|
|
|
i_selected = 0;
|
|
|
|
|
|
|
|
while (list_done == FALSE) {
|
|
|
|
while (arg[end] != '\0' && arg[end] != ',') {
|
|
|
|
end++;
|
|
|
|
}
|
|
|
|
if (arg[end] == '\0') {
|
|
|
|
list_done = TRUE;
|
|
|
|
}
|
|
|
|
found = FALSE;
|
|
|
|
for (i = 0; i < NUM_OF_FIELDS && found == FALSE; i++) {
|
|
|
|
if (strncmp(arg + start, at[i],
|
|
|
|
strlen(at[i])) == 0) {
|
|
|
|
selected_fields[i_selected++] = i;
|
|
|
|
found = TRUE;
|
|
|
|
start = end + 1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (found == FALSE) {
|
|
|
|
strncpy(message_buf, (arg + start),
|
|
|
|
end - start);
|
|
|
|
message_buf[end - start] = '\0';
|
|
|
|
fprintf(stderr, "Field %s not found.\n",
|
|
|
|
message_buf);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
end++;
|
|
|
|
}
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
return i_selected;
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
/* The 16 unambiguous dinucleotides; pairs[j] is scored by stemp[j]. */
static char *pairs[] = {"aa", "ac", "ag", "at", "ca", "cc", "cg", "ct",
                        "ga", "gc", "gg", "gt", "ta", "tc", "tg", "tt"};

/* Stacking temperature value for the dinucleotide at the same index
 * in pairs[]. */
static int stemp[16] = {55, 98, 58,  57, 55, 86, 73, 58,
                        87, 136, 86, 98, 37, 87, 55, 55};

/*******
 *
 *   MST() returns Mean Stacking Temperature for the given sequence,
 *   returns -1 if anything is wrong.
 *
 *   Dashes are skipped, bit 0x20 is set on every other character
 *   (lower-casing ASCII letters) and 'u' is treated as 't'.  Each pair
 *   of neighbouring bases that appears in pairs[] contributes its
 *   stemp[] value; pairs involving any other character are ignored.
 *
 *   Returns the average over the counted pairs, or -1 when no pair
 *   could be counted (empty sequence, single base, all ambiguous).
 *
 *******/
float MST(c_elem) const char *c_elem;
{
    int i, j;
    int tot_stemp = 0, non_amb_pairs = 0;
    int have_prev = 0; /* set once pair[0] holds a base */
    char pair[2];      /* previous and current base, not NUL-terminated */

    for (i = 0; c_elem[i] != '\0'; i++) {
        char base;

        if (c_elem[i] == '-') {
            continue;
        }
        base = c_elem[i] | 32;
        if (base == 'u') base = 't';

        if (have_prev) {
            pair[1] = base;
            j = 0;
            while (j < 16 && strncmp(pair, pairs[j], 2) != 0) {
                j++;
            }
            /* ignore the pairing of an ambiguous base. */
            if (j != 16) {
                tot_stemp += stemp[j];
                non_amb_pairs++;
            }
        }
        pair[0] = base;
        have_prev = 1;
    }

    /* Nothing to average: report failure instead of dividing by zero. */
    if (non_amb_pairs == 0) {
        return -1;
    }
    return ((float)tot_stemp / (float)non_amb_pairs);
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
|
|
|
/********
|
|
|
|
*
|
|
|
|
* SubStr() fill ss with a substring of at most 'length' chars and returns
|
|
|
|
* TRUE. If anything is wrong, it sets ss to be empty and returns FALSE.
|
|
|
|
*
|
|
|
|
********/
|
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
int SubStr(string, start, length, ss) const char *string;
|
2022-03-08 04:43:05 +08:00
|
|
|
int start, length;
|
|
|
|
char *ss;
|
|
|
|
{
|
2023-04-09 02:34:34 +08:00
|
|
|
int i;
|
|
|
|
|
|
|
|
if (strlen(string) <= start) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"SubStr(): starting point is beyond the boundary.\n");
|
|
|
|
ss[0] = '\0';
|
|
|
|
return FALSE;
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
for (i = start; string[i] != '\0' && i < start + length; i++) {
|
|
|
|
ss[i - start] = string[i];
|
|
|
|
}
|
|
|
|
ss[i - start] = '\0';
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
return TRUE;
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
|
|
|
/*******
 *
 *   FindPattern() searches string for pattern (case sensitive).
 *   Returns the number of appearances; overlapping appearances are
 *   each counted.
 *
 *   Returns 0 when string is NULL or empty, when pattern is NULL, or
 *   when pattern is longer than string.  An empty pattern matches at
 *   every position, including the end of string.
 *
 *******/
int FindPattern(string, pattern) const char *string;
const char *pattern;
{
    int i, sl, pl, num_app = 0;

    if (string == NULL || pattern == NULL) return 0;
    if ((sl = strlen(string)) == 0) return 0;

    pl = strlen(pattern);

    /* i is the candidate start; the last one that fits is sl - pl. */
    for (i = 0; i <= sl - pl; i++) {
        if (strncmp(string + i, pattern, pl) == 0) num_app++;
    }

    return num_app;
}
|
|
|
|
|
|
|
|
/*******
 *
 *   FindPattern2(), same as FindPattern(), but only counts the
 *   appearances that do not overlap the window of strlen(pattern)
 *   chars starting at orig_loc: a match at index i is counted only
 *   when |i - orig_loc| >= strlen(pattern).
 *
 *   Returns 0 when string is NULL or empty, or when pattern is NULL.
 *
 *******/
int FindPattern2(string, pattern, orig_loc) const char *string;
const char *pattern;
int orig_loc;
{
    int i, sl, pl, num_app = 0;

    if (string == NULL || pattern == NULL) return 0;
    if ((sl = strlen(string)) == 0) return 0;

    pl = strlen(pattern);

    for (i = 0; i <= sl - pl; i++) {
        /* Skip starts closer than one pattern length to orig_loc. */
        if (abs(i - orig_loc) >= pl &&
            strncmp(string + i, pattern, pl) == 0)
            num_app++;
    }

    return num_app;
}
|
|
|
|
|
|
|
|
/*******
 *
 *   FindPatternNC() searches string for pattern, CASE INSENSITIVE.
 *   Returns the number of appearances (overlapping ones included).
 *
 *   Characters are compared after setting bit 0x20 on both sides,
 *   which folds ASCII upper case onto lower case.
 *
 *   Returns 0 when string is NULL or empty, or when pattern is NULL.
 *
 *******/
int FindPatternNC(string, pattern) const char *string;
const char *pattern;
{
    int i, j, sl, pl, num_app = 0;

    if (string == NULL || pattern == NULL) return 0;
    if ((sl = strlen(string)) == 0) return 0;

    pl = strlen(pattern);

    for (i = 0; i <= sl - pl; i++) {
        /* j counts how many leading pattern chars match at offset i. */
        j = 0;
        while (j < pl && (string[i + j] | 32) == (pattern[j] | 32)) j++;

        if (j == pl) num_app++;
    }

    return num_app;
}
|
|
|
|
|
|
|
|
/*******
|
|
|
|
*
|
|
|
|
* Complementary() CHANGES the given DNA/RNA string to its complementary,
|
|
|
|
* and returns TRUE. Returns FALSE if anything is wrong and keep the
|
|
|
|
* given string unchanged.
|
|
|
|
*
|
|
|
|
*******/
|
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
int Complementary(sequence, type)
|
2022-03-08 04:43:05 +08:00
|
|
|
char *sequence;
|
|
|
|
char type;
|
|
|
|
{
|
2023-04-09 02:34:34 +08:00
|
|
|
int i, l;
|
|
|
|
char *temp_str;
|
|
|
|
|
|
|
|
l = strlen(sequence);
|
|
|
|
temp_str = (char *)Calloc(l + 1, sizeof(char));
|
|
|
|
if (type == 'D' || type == 'd')
|
|
|
|
type = 0;
|
|
|
|
else if (type == 'R' || type == 'r')
|
|
|
|
type = 1;
|
|
|
|
else {
|
|
|
|
fprintf(stderr,
|
|
|
|
"Complementary(): type unknown. Type is D/d/R/r\n");
|
|
|
|
return (int)NULL;
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
for (i = 0; i < l; i++) {
|
|
|
|
switch (sequence[i]) {
|
|
|
|
case 'A':
|
|
|
|
temp_str[i] = (type == 0) ? 'T' : 'U';
|
|
|
|
break;
|
|
|
|
case 'a':
|
|
|
|
temp_str[i] = (type == 0) ? 't' : 'u';
|
|
|
|
break;
|
|
|
|
case 'C':
|
|
|
|
temp_str[i] = 'G';
|
|
|
|
break;
|
|
|
|
case 'c':
|
|
|
|
temp_str[i] = 'g';
|
|
|
|
break;
|
|
|
|
case 'G':
|
|
|
|
temp_str[i] = 'C';
|
|
|
|
break;
|
|
|
|
case 'g':
|
|
|
|
temp_str[i] = 'c';
|
|
|
|
break;
|
|
|
|
case 'T':
|
|
|
|
case 'U':
|
|
|
|
temp_str[i] = 'A';
|
|
|
|
break;
|
|
|
|
case 't':
|
|
|
|
case 'u':
|
|
|
|
temp_str[i] = 'a';
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
temp_str[i] = '\0';
|
|
|
|
strcpy(sequence, temp_str);
|
|
|
|
if (temp_str != NULL) {
|
|
|
|
Cfree(temp_str);
|
|
|
|
temp_str = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
return TRUE;
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
|
|
|
/********
 *
 *   KnownSeq() returns an integer which is the index of the first
 *   occurrence of an ambiguous base in the seq.  -1 if no ambiguous
 *   base in the seq.
 *
 *   Every character other than a/c/g/t/u (either case) counts as
 *   ambiguous, which includes gap characters such as '-'.
 *
 ********/
int KnownSeq(seq)
char *seq;
{
    int i;
    char c;

    /* Walk to the terminator instead of re-measuring the string on
     * every iteration. */
    for (i = 0; seq[i] != '\0'; i++) {
        c = seq[i] | 32;
        if (c != 'a' && c != 't' && c != 'g' && c != 'c' && c != 'u')
            return i;
    }
    return -1;
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
|
|
|
/********
|
|
|
|
*
|
|
|
|
* Reverse() reverses the given string and returns TRUE.
|
2023-04-09 02:34:34 +08:00
|
|
|
* (NOTE: Reverse() actually changes the string).
|
2022-03-08 04:43:05 +08:00
|
|
|
* If anything goes wrong, leave seq unchanged.
|
2023-04-09 02:34:34 +08:00
|
|
|
*
|
2022-03-08 04:43:05 +08:00
|
|
|
*
|
|
|
|
********/
|
|
|
|
|
|
|
|
int Reverse(seq)
|
|
|
|
char *seq;
|
|
|
|
{
|
2023-04-09 02:34:34 +08:00
|
|
|
int i, l;
|
|
|
|
char c;
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
l = strlen(seq);
|
|
|
|
|
|
|
|
if (l < 2) {
|
|
|
|
return TRUE;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (i = 0; i < l / 2; i++) {
|
|
|
|
c = seq[i];
|
|
|
|
seq[i] = seq[l - i - 1];
|
|
|
|
seq[l - i - 1] = c;
|
|
|
|
}
|
|
|
|
return TRUE;
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
|
|
|
/********
|
|
|
|
*
|
|
|
|
* GoodOligos() returns a pointer to an array of subsequences that
|
|
|
|
* do not contant secondary structure, nor self complementary structure.
|
|
|
|
* Returns NULL if anything is wrong.
|
|
|
|
*
|
|
|
|
* l_bnd and r_bnd are regards to the head of the probe.
|
|
|
|
*
|
|
|
|
* Note: this program Calloc-s memory for the returned pointer.
|
|
|
|
* The caller program is responsible of Freeing the memory when
|
|
|
|
* not needed.
|
|
|
|
*
|
|
|
|
********/
|
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
char **GoodOligos(c_elem, check_len, min_len, max_len, l_bnd, r_bnd)
|
2022-03-08 04:43:05 +08:00
|
|
|
char *c_elem;
|
|
|
|
int check_len, min_len, max_len, l_bnd, r_bnd;
|
2023-04-09 02:34:34 +08:00
|
|
|
/* l_bnd and r_bnd are relative to c_elem, so they should be in
|
2022-03-08 04:43:05 +08:00
|
|
|
[0,strlen(c_elem)] */
|
|
|
|
{
|
2023-04-09 02:34:34 +08:00
|
|
|
int i, l, seq_len, max_num_probe, seq_cnt = 0;
|
|
|
|
char **seq_set;
|
|
|
|
char *seq, *subseq, *scd_str, *PossibleOligo;
|
|
|
|
int BadOligo, PO_len, PO_index, PO_l;
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
/* constant(s): */
|
|
|
|
/* to check if there is a substr of length 'no_repeat_len' appears
|
|
|
|
* more than once in the PossibleOligo. */
|
|
|
|
int no_repeat_len = 15;
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
seq_len = strlen(c_elem);
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
/* A lower case copy of the c_elem. */
|
|
|
|
seq = (char *)Calloc(seq_len + 1, sizeof(char));
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
/* String used to check the PossibleOligo. */
|
|
|
|
PossibleOligo = (char *)Calloc(max_len + 1, sizeof(char));
|
|
|
|
subseq = (char *)Calloc(max_len + 1, sizeof(char));
|
|
|
|
scd_str = (char *)Calloc(max_len + 1, sizeof(char));
|
|
|
|
|
|
|
|
/* The output. A set of possibly good oligos. */
|
|
|
|
max_num_probe = 20;
|
|
|
|
seq_set = (char **)Calloc(max_num_probe, sizeof(char *));
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
for (i = 0; i < seq_len; i++) {
|
|
|
|
seq[i] = c_elem[i] | 32;
|
|
|
|
}
|
|
|
|
|
|
|
|
i = MAX(l_bnd, 0);
|
|
|
|
while (i <= MIN(r_bnd, seq_len - min_len)) {
|
|
|
|
BadOligo = FALSE;
|
|
|
|
for (l = min_len;
|
|
|
|
BadOligo == FALSE && l <= seq_len - i && l <= max_len;
|
|
|
|
l++) {
|
|
|
|
int uk;
|
|
|
|
|
|
|
|
SubStr(seq, i, l, PossibleOligo);
|
|
|
|
|
|
|
|
/* Any unknow base?
|
|
|
|
*/
|
|
|
|
|
|
|
|
if ((uk = KnownSeq(PossibleOligo)) != -1) {
|
|
|
|
fprintf(stderr, "%s has ambiguous base(s)\n",
|
|
|
|
PossibleOligo);
|
|
|
|
i += uk + 1;
|
|
|
|
BadOligo = TRUE;
|
|
|
|
}
|
|
|
|
|
|
|
|
PO_len = strlen(PossibleOligo);
|
|
|
|
|
|
|
|
/* check if there is a substr of len(no_repeat_len)
|
|
|
|
* repeat itself in the PossibleOligo.
|
|
|
|
DOESN'T MATTER! IT COULD MESS UP AT MOST SEVERAL
|
|
|
|
BASES READ INTO THE PROBE. CUT_SITE IS WHAT REALLY
|
|
|
|
MATTERS.
|
|
|
|
|
|
|
|
for(PO_index = 0;
|
|
|
|
BadOligo==FALSE && PO_index<=PO_len-no_repeat_len;
|
|
|
|
PO_index++)
|
|
|
|
{
|
|
|
|
SubStr(PossibleOligo,PO_index,no_repeat_len,subseq);
|
|
|
|
if(FindPattern(PossibleOligo, subseq) > 1)
|
|
|
|
{
|
|
|
|
fprintf(stderr,
|
|
|
|
"%s has 15 repatitive base(s) %s\n",
|
|
|
|
PossibleOligo, subseq);
|
|
|
|
i++;
|
|
|
|
BadOligo = TRUE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* To ensure that the probe is not going to hybridize
|
|
|
|
* with itself:
|
|
|
|
*/
|
|
|
|
for (PO_index = 0; BadOligo == FALSE &&
|
|
|
|
PO_index <= PO_len - no_repeat_len;
|
|
|
|
PO_index++) {
|
|
|
|
SubStr(PossibleOligo, PO_index, no_repeat_len,
|
|
|
|
subseq);
|
|
|
|
strcpy(scd_str, subseq);
|
|
|
|
Complementary(scd_str, 'd');
|
|
|
|
Reverse(scd_str);
|
|
|
|
|
|
|
|
if (FindPattern(PossibleOligo, scd_str) > 0) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"%s may hybridize with itself: "
|
|
|
|
"%s vs. %s.\n",
|
|
|
|
PossibleOligo, subseq, scd_str);
|
|
|
|
i++;
|
|
|
|
BadOligo = TRUE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
for (PO_index = 0; BadOligo == FALSE &&
|
|
|
|
PO_index <= PO_len - 2 * check_len;
|
|
|
|
PO_index++) {
|
|
|
|
SubStr(PossibleOligo, PO_index, check_len,
|
|
|
|
subseq);
|
|
|
|
Complementary(subseq, 'd');
|
|
|
|
strcpy(scd_str, subseq);
|
|
|
|
Reverse(scd_str);
|
|
|
|
|
|
|
|
/*
|
|
|
|
if(FindPattern2(PossibleOligo,subseq,PO_index)>0)
|
|
|
|
{
|
|
|
|
fprintf(stderr, "%s has self-compl %s\n",
|
|
|
|
PossibleOligo, subseq);
|
|
|
|
i += PO_index+1;
|
|
|
|
BadOligo = TRUE;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
*/
|
|
|
|
|
|
|
|
if (FindPattern2(PossibleOligo, scd_str,
|
|
|
|
PO_index) > 0) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"%s has 2nd struct %s\n",
|
|
|
|
PossibleOligo, scd_str);
|
|
|
|
i += PO_index + 1;
|
|
|
|
BadOligo = TRUE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (BadOligo == FALSE) {
|
|
|
|
seq_set[seq_cnt] = (char *)Calloc(
|
|
|
|
strlen(PossibleOligo) + 1, sizeof(char));
|
|
|
|
strcpy(seq_set[seq_cnt], PossibleOligo);
|
|
|
|
|
|
|
|
if (++seq_cnt == max_num_probe) {
|
|
|
|
max_num_probe *= 2;
|
|
|
|
seq_set = (char **)Realloc(
|
|
|
|
seq_set,
|
|
|
|
max_num_probe * sizeof(char *));
|
|
|
|
}
|
|
|
|
i++;
|
|
|
|
}
|
|
|
|
} /* end of l. */
|
|
|
|
} /* end of i. */
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
seq_set[seq_cnt] = NULL;
|
|
|
|
|
|
|
|
if (seq_cnt == 0) return NULL;
|
|
|
|
|
|
|
|
return seq_set;
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
|
|
|
/* ALWAYS COPY the result from uniqueID() to a char[32].
 * The returned pointer refers to the global buffer vname, which is
 * overwritten by the next call.
 *
 * The ID is "<hostname>:<call counter>:<time()>", at most 31 chars;
 * when it would be longer, leading characters of the host name are
 * dropped.  Exits the program if the host name cannot be obtained.
 */
char vname[32];
char *uniqueID()
{
    char hname[32], tstr[32];
    static int cnt = 0; /* number of IDs handed out so far */
    int ll;

    if (gethostname(hname, 32) == -1) {
        fprintf(stderr, "UniqueID(): Failed to get host name.\n");
        exit(1);
    }
    /* A truncated host name may come back without a terminator. */
    hname[sizeof(hname) - 1] = '\0';

    /* Bounded formatting: tstr never holds more than 31 chars. */
    snprintf(tstr, sizeof(tstr), ":%d:%ld", cnt, (long)time(NULL));

    /* ll = how many leading host name chars must go so that
     * host + suffix fits in 31 chars. */
    ll = strlen(hname) - (31 - (int)strlen(tstr));
    if (ll < 0) ll = 0;
    snprintf(vname, sizeof(vname), "%s%s", hname + ll, tstr);

    cnt++;
    return (vname);
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
|
|
|
/* Return the percentage of G and C characters (either case) in seq,
 * as an integer rounded towards zero.  An empty sequence yields 0. */
int GCcontent(seq)
char *seq;
{
    int l, gc = 0, j;

    l = strlen(seq);

    /* Avoid dividing by zero below. */
    if (l == 0) return 0;

    for (j = 0; j < l; j++) {
        if ((seq[j] | 32) == 'g' || (seq[j] | 32) == 'c') {
            gc++;
        }
    }
    return ((int)(gc * 100 / l));
}
|
|
|
|
|
|
|
|
/******
|
|
|
|
*
|
|
|
|
* HGLtoIQ() outputs a HGL format record to an ASCII file with
|
|
|
|
* the Input-Queue format, the format for the synthesizer.
|
|
|
|
*
|
|
|
|
******/
|
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
void HGLtoIQ(fname, tSeq) const char *fname;
|
2022-03-08 04:43:05 +08:00
|
|
|
Sequence *tSeq;
|
|
|
|
{
|
2023-04-09 02:34:34 +08:00
|
|
|
FILE *fp;
|
|
|
|
|
|
|
|
if ((fp = fopen(fname, "w")) == NULL) {
|
|
|
|
fprintf(stderr, "Can't open IQ file: %s\n", fname);
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
fprintf(fp, "%s %s\n", tSeq->comments, tSeq->c_elem);
|
2022-03-08 04:43:05 +08:00
|
|
|
}
|
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
/*
 * Like find, but returns the index of the leftmost
 * occurrence, and -1 if not found.
 * Note in this program, T==U, and case insensitive: bit 0x20 is set
 * on every character of both strings and 'u' is then read as 't'
 * before comparing.
 *
 * Returns -1 when string is NULL or empty, or when key is NULL or
 * empty.  Neither string nor key is modified.
 */
int Find2(string, key) char *key, *string;
{
    int pos, k, len1, len2;
    char a, b;

    if (string == NULL || string[0] == '\0') return -1;
    if (key == NULL) return -1;

    len2 = strlen(string);
    len1 = strlen(key);
    if (len1 == 0) return -1;

    /* pos is the candidate start; the last one that fits is
     * len2 - len1. */
    for (pos = 0; pos + len1 <= len2; pos++) {
        for (k = 0; k < len1; k++) {
            a = key[k] | 32;
            if (a == 'u') a = 't';
            b = string[pos + k] | 32;
            if (b == 'u') b = 't';
            if (a != b) break;
        }
        if (k == len1) return pos;
    }
    return -1;
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
|
|
|
/* return -1 if end-of-file.
|
|
|
|
FALSE if anything is wrong.
|
|
|
|
*/
|
2023-04-09 02:34:34 +08:00
|
|
|
int ReadGDE(fp, seq)
|
2022-03-08 04:43:05 +08:00
|
|
|
FILE *fp;
|
|
|
|
Sequence *seq;
|
|
|
|
{
|
2023-04-09 02:34:34 +08:00
|
|
|
char temp_line[1000], waste[64];
|
|
|
|
int ii, l1;
|
|
|
|
|
|
|
|
while (fgets(temp_line, 1000, fp) != NULL) {
|
|
|
|
if (strncmp(temp_line, "sequence-ID", 11) == 0) {
|
|
|
|
sscanf(temp_line, "%s%s", waste, seq->sequence_ID);
|
|
|
|
}
|
|
|
|
else if (temp_line[0] == '#') {
|
|
|
|
strncpy(seq->name, temp_line + 1, 31);
|
|
|
|
seq->name[31] = '\0';
|
|
|
|
ii = 0;
|
|
|
|
while (ii < strlen(seq->name) && seq->name[ii] != ' ' &&
|
|
|
|
seq->name[ii] != '\n')
|
|
|
|
ii++;
|
|
|
|
seq->name[ii] = '\0';
|
|
|
|
|
|
|
|
seq->seqmaxlen = 256;
|
|
|
|
seq->c_elem = (char *)Calloc(seq->seqmaxlen, 1);
|
|
|
|
seq->seqlen = 0;
|
|
|
|
while (fgets(temp_line, 1000, fp) != NULL) {
|
|
|
|
l1 = strlen(temp_line);
|
|
|
|
|
|
|
|
if (temp_line[l1 - 1] == '\n') {
|
|
|
|
l1--;
|
|
|
|
temp_line[l1] = '\0';
|
|
|
|
}
|
|
|
|
|
|
|
|
while (seq->seqmaxlen <
|
|
|
|
seq->seqlen + strlen(temp_line) + 1) {
|
|
|
|
seq->seqmaxlen *= 2;
|
|
|
|
seq->c_elem = (char *)Realloc(
|
|
|
|
seq->c_elem, seq->seqmaxlen);
|
|
|
|
}
|
|
|
|
|
|
|
|
strcat(seq->c_elem, temp_line);
|
|
|
|
seq->seqlen += strlen(temp_line);
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
if (seq->seqlen == 0) {
|
|
|
|
fprintf(stderr, "\n%s\n", "Sequence is empty.");
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return -1;
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
void heapify(seq_set, seq_size, heap_size, elem, Pkey, Skey, order) int seq_size
|
|
|
|
,
|
|
|
|
elem, heap_size, **order;
|
2022-03-08 04:43:05 +08:00
|
|
|
char Pkey[], Skey[];
|
|
|
|
Sequence *seq_set;
|
|
|
|
{
|
2023-04-09 02:34:34 +08:00
|
|
|
int l, r, temp, largest;
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
l = 2 * elem + 1;
|
|
|
|
r = 2 * elem + 2;
|
|
|
|
|
|
|
|
if (l <= heap_size && CompKey(seq_set[(*order)[l]],
|
|
|
|
seq_set[(*order)[elem]], Pkey, Skey) > 0)
|
|
|
|
largest = l;
|
|
|
|
else
|
|
|
|
largest = elem;
|
|
|
|
|
|
|
|
if (r <= heap_size &&
|
|
|
|
CompKey(seq_set[(*order)[r]], seq_set[(*order)[largest]], Pkey,
|
|
|
|
Skey) > 0)
|
|
|
|
largest = r;
|
|
|
|
|
|
|
|
if (largest != elem) {
|
|
|
|
temp = (*order)[elem];
|
|
|
|
(*order)[elem] = (*order)[largest];
|
|
|
|
(*order)[largest] = temp;
|
|
|
|
heapify(seq_set, seq_size, heap_size, largest, Pkey, Skey,
|
|
|
|
order);
|
|
|
|
}
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
heapsort(seq_set, seq_size, Pkey, Skey, order) int seq_size, **order;
|
|
|
|
char Pkey[], Skey[];
|
2022-03-08 04:43:05 +08:00
|
|
|
Sequence *seq_set;
|
|
|
|
{
|
2023-04-09 02:34:34 +08:00
|
|
|
int ii, temp, heap_size;
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
/*
|
|
|
|
* build_heap(seq_set, seq_size, &heap_size, order);
|
|
|
|
*/
|
|
|
|
heap_size = seq_size - 1;
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
for (ii = (seq_size - 1) / 2; ii >= 0; ii--) /* (L-1)/2-1?? */
|
|
|
|
{
|
|
|
|
heapify(seq_set, seq_size, heap_size, ii, Pkey, Skey, order);
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
for (ii = seq_size - 1; ii > 0; ii--) {
|
|
|
|
temp = (*order)[0];
|
|
|
|
(*order)[0] = (*order)[ii];
|
|
|
|
(*order)[ii] = temp;
|
|
|
|
heap_size--;
|
|
|
|
heapify(seq_set, seq_size, heap_size, 0, Pkey, Skey, order);
|
|
|
|
}
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
|
|
|
/*
|
2023-04-09 02:34:34 +08:00
|
|
|
* Return >0, ==0, <0.
|
2022-03-08 04:43:05 +08:00
|
|
|
*/
|
|
|
|
|
|
|
|
int CompKey(seq1, seq2, Pkey, Skey)
|
2023-04-09 02:34:34 +08:00
|
|
|
Sequence seq1, seq2;
|
|
|
|
char Pkey[], Skey[];
|
2022-03-08 04:43:05 +08:00
|
|
|
{
|
2023-04-09 02:34:34 +08:00
|
|
|
int ii, jj, Pret;
|
|
|
|
char b1[32], b2[32];
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
if (strcmp(Pkey, "type") == 0) {
|
|
|
|
Pret = strcmp(seq1.type, seq2.type);
|
|
|
|
if (Pret != 0 || Skey[0] == '\0') return Pret;
|
|
|
|
}
|
|
|
|
else if (strcmp(Pkey, "name") == 0) {
|
|
|
|
Pret = strcmp(seq1.name, seq2.name);
|
|
|
|
if (Pret != 0 || Skey[0] == '\0') return Pret;
|
|
|
|
}
|
|
|
|
else if (strcmp(Pkey, "sequence-ID") == 0) {
|
|
|
|
Pret = strcmp(seq1.sequence_ID, seq2.sequence_ID);
|
|
|
|
if (Pret != 0 || Skey[0] == '\0') return Pret;
|
|
|
|
}
|
|
|
|
else if (strcmp(Pkey, "creator") == 0) {
|
|
|
|
Pret = strcmp(seq1.creator, seq2.creator);
|
|
|
|
if (Pret != 0 || Skey[0] == '\0') return Pret;
|
|
|
|
}
|
|
|
|
else if (strcmp(Pkey, "offset") == 0) {
|
|
|
|
Pret = seq1.offset - seq2.offset;
|
|
|
|
if (Pret != 0 || Skey[0] == '\0') return Pret;
|
|
|
|
}
|
|
|
|
else if (strcmp(Pkey, "group-ID") == 0) {
|
|
|
|
Pret = seq1.group_ID - seq2.group_ID;
|
|
|
|
if (Pret != 0 || Skey[0] == '\0') return Pret;
|
|
|
|
}
|
|
|
|
else if (strcmp(Pkey, "barcode") == 0) {
|
|
|
|
if (seq1.barcode[0] == 'P')
|
|
|
|
strcpy(b1, seq1.barcode + 2);
|
|
|
|
else
|
|
|
|
strcpy(b1, seq1.barcode);
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
if (seq2.barcode[0] == 'P')
|
|
|
|
strcpy(b2, seq2.barcode + 2);
|
|
|
|
else
|
|
|
|
strcpy(b2, seq2.barcode);
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
Pret = strcmp(b1, b2);
|
|
|
|
if (Pret != 0 || Skey[0] == '\0') return Pret;
|
|
|
|
}
|
|
|
|
else if (strcmp(Pkey, "seqlen") == 0) {
|
|
|
|
Pret = seq1.seqlen - seq2.seqlen;
|
|
|
|
if (Pret != 0 || Skey[0] == '\0') return Pret;
|
|
|
|
}
|
|
|
|
else if (strcmp(Pkey, "creation-date") == 0) {
|
|
|
|
seq1.creation_date[0] %= 100;
|
|
|
|
seq2.creation_date[0] %= 100;
|
|
|
|
Pret = seq1.creation_date[0] * 10000 +
|
|
|
|
seq1.creation_date[1] * 100 + seq1.creation_date[2] -
|
|
|
|
seq2.creation_date[0] * 10000 -
|
|
|
|
seq2.creation_date[1] * 100 - seq2.creation_date[2];
|
|
|
|
if (Pret == 0) {
|
|
|
|
Pret = seq1.creation_date[3] * 10000 +
|
|
|
|
seq1.creation_date[4] * 100 +
|
|
|
|
seq1.creation_date[5] -
|
|
|
|
seq2.creation_date[3] * 10000 -
|
|
|
|
seq2.creation_date[4] * 100 -
|
|
|
|
seq2.creation_date[5];
|
|
|
|
}
|
|
|
|
if (Pret != 0 || Skey[0] == '\0') return Pret;
|
|
|
|
}
|
|
|
|
else if (strcmp(Pkey, "probing-date") == 0) {
|
|
|
|
seq1.probing_date[0] %= 100;
|
|
|
|
seq2.probing_date[0] %= 100;
|
|
|
|
Pret = seq1.probing_date[0] * 10000 +
|
|
|
|
seq1.probing_date[1] * 100 + seq1.probing_date[2] -
|
|
|
|
seq2.probing_date[0] * 10000 -
|
|
|
|
seq2.probing_date[1] * 100 - seq2.probing_date[2];
|
|
|
|
if (Pret == 0) {
|
|
|
|
Pret =
|
|
|
|
seq1.probing_date[3] * 10000 +
|
|
|
|
seq1.probing_date[4] * 100 + seq1.probing_date[5] -
|
|
|
|
seq2.probing_date[3] * 10000 -
|
|
|
|
seq2.probing_date[4] * 100 - seq2.probing_date[5];
|
|
|
|
}
|
|
|
|
if (Pret != 0 || Skey[0] == '\0') return Pret;
|
|
|
|
}
|
|
|
|
else if (strcmp(Pkey, "autorad_date") == 0) {
|
|
|
|
seq1.autorad_date[0] %= 100;
|
|
|
|
seq2.autorad_date[0] %= 100;
|
|
|
|
Pret = seq1.autorad_date[0] * 10000 +
|
|
|
|
seq1.autorad_date[1] * 100 + seq1.autorad_date[2] -
|
|
|
|
seq2.autorad_date[0] * 10000 -
|
|
|
|
seq2.autorad_date[1] * 100 - seq2.autorad_date[2];
|
|
|
|
if (Pret == 0) {
|
|
|
|
Pret =
|
|
|
|
seq1.autorad_date[3] * 10000 +
|
|
|
|
seq1.autorad_date[4] * 100 + seq1.autorad_date[5] -
|
|
|
|
seq2.autorad_date[3] * 10000 -
|
|
|
|
seq2.autorad_date[4] * 100 - seq2.autorad_date[5];
|
|
|
|
}
|
|
|
|
if (Pret != 0 || Skey[0] == '\0') return Pret;
|
|
|
|
}
|
|
|
|
else if (strcmp(Pkey, "film") == 0) {
|
|
|
|
Pret = strcmp(seq1.film, seq2.film);
|
|
|
|
if (Pret != 0 || Skey[0] == '\0') return Pret;
|
|
|
|
}
|
|
|
|
else if (strcmp(Pkey, "membrane") == 0) {
|
|
|
|
Pret = strcmp(seq1.membrane, seq2.membrane);
|
|
|
|
if (Pret != 0 || Skey[0] == '\0') return Pret;
|
|
|
|
}
|
|
|
|
else if (strcmp(Pkey, "contig") == 0) {
|
|
|
|
Pret = strcmp(seq1.contig, seq2.contig);
|
|
|
|
if (Pret != 0 || Skey[0] == '\0') return Pret;
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
else {
|
|
|
|
fprintf(stderr, "CompKey(): Invalid primary key %s.\n", Pkey);
|
|
|
|
exit(1);
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
if (strcmp(Skey, "type") == 0) {
|
|
|
|
return (strcmp(seq1.type, seq2.type));
|
|
|
|
}
|
|
|
|
else if (strcmp(Skey, "name") == 0) {
|
|
|
|
return (strcmp(seq1.name, seq2.name));
|
|
|
|
}
|
|
|
|
else if (strcmp(Skey, "sequence-ID") == 0) {
|
|
|
|
return (strcmp(seq1.sequence_ID, seq2.sequence_ID));
|
|
|
|
}
|
|
|
|
else if (strcmp(Skey, "creator") == 0) {
|
|
|
|
return (strcmp(seq1.creator, seq2.creator));
|
|
|
|
}
|
|
|
|
else if (strcmp(Skey, "offset") == 0) {
|
|
|
|
return (seq1.offset - seq2.offset);
|
|
|
|
}
|
|
|
|
else if (strcmp(Skey, "group-ID") == 0) {
|
|
|
|
return (seq1.group_ID - seq2.group_ID);
|
|
|
|
}
|
|
|
|
else if (strcmp(Skey, "barcode") == 0) {
|
|
|
|
if (seq1.barcode[0] == 'P')
|
|
|
|
strcpy(b1, seq1.barcode + 2);
|
|
|
|
else
|
|
|
|
strcpy(b1, seq1.barcode);
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
if (seq2.barcode[0] == 'P')
|
|
|
|
strcpy(b2, seq2.barcode + 2);
|
|
|
|
else
|
|
|
|
strcpy(b2, seq2.barcode);
|
|
|
|
|
|
|
|
return (strcmp(b1, b2));
|
|
|
|
}
|
|
|
|
else if (strcmp(Skey, "seqlen") == 0) {
|
|
|
|
return (seq1.seqlen - seq2.seqlen);
|
|
|
|
}
|
|
|
|
else if (strcmp(Skey, "creation-date") == 0) {
|
|
|
|
seq1.creation_date[0] %= 100;
|
|
|
|
seq2.creation_date[0] %= 100;
|
|
|
|
Pret = seq1.creation_date[0] * 10000 +
|
|
|
|
seq1.creation_date[1] * 100 + seq1.creation_date[2] -
|
|
|
|
seq2.creation_date[0] * 10000 -
|
|
|
|
seq2.creation_date[1] * 100 - seq2.creation_date[2];
|
|
|
|
if (Pret != 0) return Pret;
|
|
|
|
|
|
|
|
return (seq1.creation_date[3] * 10000 +
|
|
|
|
seq1.creation_date[4] * 100 + seq1.creation_date[5] -
|
|
|
|
seq2.creation_date[3] * 10000 -
|
|
|
|
seq2.creation_date[4] * 100 - seq2.creation_date[5]);
|
|
|
|
}
|
|
|
|
else if (strcmp(Skey, "probing-date") == 0) {
|
|
|
|
seq1.probing_date[0] %= 100;
|
|
|
|
seq2.probing_date[0] %= 100;
|
|
|
|
Pret = seq1.probing_date[0] * 10000 +
|
|
|
|
seq1.probing_date[1] * 100 + seq1.probing_date[2] -
|
|
|
|
seq2.probing_date[0] * 10000 -
|
|
|
|
seq2.probing_date[1] * 100 - seq2.probing_date[2];
|
|
|
|
if (Pret != 0) return Pret;
|
|
|
|
|
|
|
|
return (seq1.probing_date[3] * 10000 +
|
|
|
|
seq1.probing_date[4] * 100 + seq1.probing_date[5] -
|
|
|
|
seq2.probing_date[3] * 10000 -
|
|
|
|
seq2.probing_date[4] * 100 - seq2.probing_date[5]);
|
|
|
|
}
|
|
|
|
else if (strcmp(Skey, "autorad_date") == 0) {
|
|
|
|
seq1.autorad_date[0] %= 100;
|
|
|
|
seq2.autorad_date[0] %= 100;
|
|
|
|
Pret = seq1.autorad_date[0] * 10000 +
|
|
|
|
seq1.autorad_date[1] * 100 + seq1.autorad_date[2] -
|
|
|
|
seq2.autorad_date[0] * 10000 -
|
|
|
|
seq2.autorad_date[1] * 100 - seq2.autorad_date[2];
|
|
|
|
if (Pret != 0) return Pret;
|
|
|
|
|
|
|
|
return (seq1.autorad_date[3] * 10000 +
|
|
|
|
seq1.autorad_date[4] * 100 + seq1.autorad_date[5] -
|
|
|
|
seq2.autorad_date[3] * 10000 -
|
|
|
|
seq2.autorad_date[4] * 100 - seq2.autorad_date[5]);
|
|
|
|
}
|
|
|
|
else if (strcmp(Skey, "film") == 0) {
|
|
|
|
return (strcmp(seq1.film, seq2.film));
|
|
|
|
}
|
|
|
|
else if (strcmp(Skey, "membrane") == 0) {
|
|
|
|
return (strcmp(seq1.membrane, seq2.membrane));
|
|
|
|
}
|
|
|
|
else if (strcmp(Skey, "contig") == 0) {
|
|
|
|
return (strcmp(seq1.contig, seq2.contig));
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
fprintf(stderr, "CompKey(): Invalid secondary key %s.\n", Skey);
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
int Lock(fname)
|
2022-03-08 04:43:05 +08:00
|
|
|
char *fname;
|
|
|
|
{
|
2023-04-09 02:34:34 +08:00
|
|
|
char buffer[1024];
|
|
|
|
FILE *fp;
|
|
|
|
int wait = 0;
|
|
|
|
|
|
|
|
while ((fp = fopen(fname, "r")) == NULL) {
|
|
|
|
sleep(1);
|
|
|
|
if (++wait == 30) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"File %s not available, Try later.\n\n",
|
|
|
|
fname);
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
fclose(fp);
|
|
|
|
sprintf(buffer, "mv %s %s.locked", fname, fname);
|
|
|
|
system(buffer);
|
|
|
|
return TRUE;
|
2022-03-08 04:43:05 +08:00
|
|
|
}
|
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
/*
 * Unlock() undoes Lock(): renames "<fname>.locked" back to fname.
 * fname is used literally; it is not passed through a shell.
 * A failure is reported on stderr.
 */
void Unlock(fname) char *fname;
{
    char locked[1024];
    int n;

    n = snprintf(locked, sizeof(locked), "%s.locked", fname);
    if (n < 0 || (size_t)n >= sizeof(locked)) {
        fprintf(stderr, "Unlock(): file name too long: %s\n", fname);
        return;
    }

    if (rename(locked, fname) != 0) {
        perror("Unlock()");
    }
}
|
2022-03-08 04:43:05 +08:00
|
|
|
|
2023-04-09 02:34:34 +08:00
|
|
|
AppendComments(seq, str) Sequence *seq;
|
2022-03-08 04:43:05 +08:00
|
|
|
char *str;
|
|
|
|
{
|
2023-04-09 02:34:34 +08:00
|
|
|
int ii, jj, kk;
|
|
|
|
|
|
|
|
kk = strlen(str);
|
|
|
|
|
|
|
|
if (seq->commentsmaxlen == 0) {
|
|
|
|
seq->comments = (char *)Calloc(kk + 1, 1);
|
|
|
|
seq->commentsmaxlen = kk + 1;
|
|
|
|
seq->commentslen = 0;
|
|
|
|
}
|
|
|
|
else if (seq->commentslen + kk + 1 > seq->commentsmaxlen) {
|
|
|
|
seq->commentsmaxlen += 2 * (kk + 1);
|
|
|
|
seq->comments =
|
|
|
|
(char *)Realloc(seq->comments, seq->commentsmaxlen);
|
|
|
|
}
|
|
|
|
seq->comments[seq->commentslen] = '\0';
|
|
|
|
seq->comments[seq->commentslen] = '\0';
|
|
|
|
strcat(seq->comments, str);
|
|
|
|
seq->commentslen = strlen(seq->comments);
|
2022-03-08 04:43:05 +08:00
|
|
|
}
|