/**************************************************************** * * This is a set of functions defined for the genome * project. * ****************************************************************/ #ifndef _GLOBAL_DEFS_H #define _GLOBAL_DEFS_H #include "global_defs.h" #endif #define MAXLINELEN 256 static char Default_DNA_Trans[16] = { '-', 'a','c','m','g','r','s','v','t','w','y','h','k','d','b','n' }; /*********** * * WriteRecord() outputs one record at a time in HGL format. * Only the fields in the fields_array will be output. All the * fields will be output if fields_array is NULL. * * fp : pointer to the output file. * tSeq: pointer to the record. * fields_array: contains the field ids of the selected fields. * array_size: number of selected fields. * * Returns: 1 if any field is printed; * 0 if no field is printed; * -1 if anything is wrong. * **********/ int WriteRecord(fp, tSeq, fields_array, array_size) FILE *fp; const Sequence *tSeq; int *fields_array; int array_size; { int i, save_str_size, tt; int all_fields = FALSE; int first_field = TRUE; char temp_str[256]; char *save_str; char *ptr; save_str = (char *)Calloc(256, 1); save_str_size = 256; /* When all the fields are selected. 
*/ if(fields_array == NULL) { all_fields = TRUE; fields_array = (int *)Calloc(NUM_OF_FIELDS, sizeof(int)); for(i=0; icreation_date[0] != 0 ) { sprintf(save_str,"\n%s\t%d/%d/%d ", at[fields_array[i]], tSeq->creation_date[1], tSeq->creation_date[2], tSeq->creation_date[0]); if(tSeq->creation_date[3]>=0) { if(tSeq->creation_date[4] < 0) tSeq->creation_date[4] = 0; if(tSeq->creation_date[5] < 0) tSeq->creation_date[5] = 0; sprintf(save_str, "%s%d:%d:%d", save_str, tSeq->creation_date[3], tSeq->creation_date[4], tSeq->creation_date[5]); } } else if (fields_array[i] == e_probing_date && tSeq->probing_date[0] != 0 ) { sprintf(save_str,"\n%s\t%d/%d/%d ", at[fields_array[i]], tSeq->probing_date[1], tSeq->probing_date[2], tSeq->probing_date[0]); if(tSeq->probing_date[3]>=0) { if(tSeq->probing_date[4] < 0) tSeq->probing_date[4] = 0; if(tSeq->probing_date[5] < 0) tSeq->probing_date[5] = 0; sprintf(save_str, "%s%d:%d:%d", save_str, tSeq->probing_date[3], tSeq->probing_date[4], tSeq->probing_date[5]); } } else if (fields_array[i] == e_autorad_date && tSeq->autorad_date[0] != 0 ) { sprintf(save_str,"\n%s\t%d/%d/%d ", at[fields_array[i]], tSeq->autorad_date[1], tSeq->autorad_date[2], tSeq->autorad_date[0]); if(tSeq->autorad_date[3]>=0) { if(tSeq->autorad_date[4] < 0) tSeq->autorad_date[4] = 0; if(tSeq->autorad_date[5] < 0) tSeq->autorad_date[5] = 0; sprintf(save_str, "%s%d:%d:%d", save_str, tSeq->autorad_date[3], tSeq->autorad_date[4], tSeq->autorad_date[5]); } } else if ( fields_array[i] == e_c_elem && tSeq->c_elem != NULL ) { ptr = tSeq->c_elem; sprintf(save_str,"\n%s\t\"",at[fields_array[i]]); while ( ptr < tSeq->c_elem + tSeq->seqlen ) { if ( ptr != tSeq->c_elem ) strcat(save_str,"\n"); strncpy(temp_str, ptr, MIN(60, tSeq->c_elem +tSeq->seqlen-ptr)); temp_str[MIN(60, tSeq->c_elem+tSeq->seqlen - ptr)] = '\0'; /* Gurantee strlen(temp_str) chars for the string, * one for \n, one for ", and one for \0. 
*/ while(save_str_size - strlen(save_str) < strlen(temp_str)+3) { save_str_size *= 2; save_str = (char *)Realloc(save_str,save_str_size); } strcat(save_str, temp_str); ptr += 60; } strcat(save_str,"\""); } else if ( fields_array[i] == e_comments && tSeq->commentslen != 0) { while(save_str_size < 20+tSeq->commentslen) { save_str_size *= 2; save_str = (char *)Realloc(save_str,save_str_size); } strcat(save_str,"\n"); strcat(save_str,at[fields_array[i]]); strcat(save_str,"\t\"\n"); /* put a \0 at the end of comments. */ while(tSeq->commentslen + 1 > tSeq->commentsmaxlen) { tSeq->commentsmaxlen *= 2; tSeq->comments = (char *) Realloc(tSeq->comments, tSeq->commentsmaxlen); } tSeq->comments[tSeq->commentslen] = '\0'; /* clean up the leading empty lines.*/ tt = 0; while(tSeq->comments[tt] == '\n' || tSeq->comments[tt] == ' ') tt++; tSeq->commentslen -= tt; strcat(save_str,tSeq->comments+tt); strcat(save_str,"\""); } else if (fields_array[i] == e_laneset && tSeq->laneset != -1) sprintf(save_str,"\n%s\t\t%d", at[fields_array[i]],tSeq->laneset); else if (fields_array[i] == e_strandedness && tSeq->strandedness != 0) sprintf(save_str,"\n%s\t%d", at[fields_array[i]],tSeq->strandedness); else if (fields_array[i] == e_direction && tSeq->direction != 0) sprintf(save_str,"\n%s\t%d", at[fields_array[i]],tSeq->direction); else if (fields_array[i] == e_orig_strand && tSeq->orig_strand != 0) sprintf(save_str,"\n%s\t%d", at[fields_array[i]],tSeq->orig_strand); else if (fields_array[i] == e_orig_direction && tSeq->orig_direction != 0) sprintf(save_str,"\n%s\t%d", at[fields_array[i]],tSeq->orig_direction); else if (fields_array[i] == e_offset) sprintf(save_str,"\n%s\t\t%d", at[fields_array[i]],tSeq->offset); else if (fields_array[i] == e_group_number && tSeq->group_number != 0) sprintf(save_str,"\n%s\t%d", at[fields_array[i]],tSeq->group_number); else if (fields_array[i] == e_group_ID) sprintf(save_str,"\n%s\t%d", at[fields_array[i]],tSeq->group_ID); else if (fields_array[i] == e_type && 
tSeq->type[0] != '\0' ) sprintf(save_str,"\n%s\t\t\"%s\"", at[fields_array[i]],tSeq->type); else if (fields_array[i] == e_barcode && tSeq->barcode[0] != '\0' ) sprintf(save_str,"\n%s\t\t\"%s\"", at[fields_array[i]],tSeq->barcode); else if (fields_array[i] == e_name && tSeq->name[0] != '\0' ) sprintf(save_str,"\n%s\t\t\"%s\"", at[fields_array[i]],tSeq->name); else if (fields_array[i] == e_status && tSeq->status[0] != '\0' ) sprintf(save_str,"\n%s\t\t\"%s\"", at[fields_array[i]],tSeq->status); else if (fields_array[i] == e_walk && tSeq->walk[0] != '\0' ) sprintf(save_str,"\n%s\t\t\"%s\"", at[fields_array[i]],tSeq->walk); else if (fields_array[i] == e_sequence_ID && tSeq->sequence_ID[0] != '\0' ) sprintf(save_str,"\n%s\t\"%s\"", at[fields_array[i]],tSeq->sequence_ID); else if (fields_array[i] == e_creator && tSeq->creator[0] != '\0') sprintf(save_str,"\n%s\t\t\"%s\"", at[fields_array[i]],tSeq->creator); else if (fields_array[i]==e_film && tSeq->film[0]!='\0') sprintf(save_str,"\n%s\t\t\"%s\"", at[fields_array[i]],tSeq->film); else if (fields_array[i] == e_membrane && tSeq->membrane[0] != '\0') sprintf(save_str,"\n%s\t\"%s\"", at[fields_array[i]],tSeq->membrane); else if (fields_array[i] == e_source_ID && tSeq->source_ID[0] != '\0') sprintf(save_str,"\n%s\t\"%s\"", at[fields_array[i]],tSeq->source_ID); else if (fields_array[i] == e_contig && tSeq->contig[0] != '\0') sprintf(save_str,"\n%s\t\t\"%s\"", at[fields_array[i]],tSeq->contig); else if (fields_array[i] == e_baggage && tSeq->baglen != 0) { if(save_str_size < tSeq->baglen+2) { save_str_size = tSeq->baglen+2; save_str = (char *)Realloc(save_str,save_str_size); } save_str[0] = '\n'; save_str[1] = '\0'; /* put a \0 at the end of baggage. 
*/ strncat(save_str, tSeq->baggage, tSeq->baglen); while(save_str[tSeq->baglen-1] == '\n') { tSeq->baglen--; } save_str[tSeq->baglen] = '\0'; } if(save_str[0] != '\0') { if (first_field == TRUE) { first_field = FALSE; fprintf(fp,"{"); } fprintf(fp,"%s",save_str); } } if (first_field == FALSE) { fprintf(fp,"\n}\n"); } if(all_fields == TRUE && fields_array != NULL) { Cfree(fields_array); fields_array = NULL; } if(save_str != NULL) { Cfree(save_str); save_str = NULL; } if (first_field == TRUE) return 0; else return 1; } /********* * * ReadRecord() reads one record from fp into tSeq. fp remains at * the finishing position so that next time when ReadRecord() is * called, it reads the next record. * * The caller program should LOCATE MEMORY for the tSeq before calling. * * ReadRecord() returns: * TRUE if no error; * FALSE if anything is wrong * -1 if end-of-file is reached * **********/ int ReadRecord(fp, tSeq) FILE *fp; Sequence *tSeq; { char field_name[20], line[256], orig_line[256]; int temp_str_size, start, end, l, max_len = 255; char *fgets_ret, *temp_str, *fgets_ret1; int start_rec = FALSE; int need_to_read = TRUE; char started = 'F'; void InitRecord(); void FreeRecord(); temp_str = (char *)Calloc(256, 1); temp_str_size = 256; InitRecord(tSeq); if(tSeq->c_elem == NULL) { tSeq->c_elem = (char *)Calloc(256, 1); tSeq->seqmaxlen = 256; } tSeq->c_elem[0] = '\0'; /* read file line-by-line. */ while (need_to_read == TRUE && ((fgets_ret = fgets(line, max_len, fp)) != NULL || start_rec == TRUE)) { strcpy(orig_line, line); end = strlen(line) -1; while(end>=0 && (line[end] == ' ' || line[end] == '\t' || line[end] == ',' || line[end] == '\n') ) end--; /* ignore empty lines. */ if(end == -1) continue; if(line[end] == '{') started = 'T'; /* to ignore the lines between a } and a {. 
*/ while(started == 'F' && fgets_ret != NULL) { fgets_ret = fgets(line, max_len, fp); strcpy(orig_line, line); end = strlen(line) -1; while(end>=0 && (line[end] == ' ' || line[end] == '\t' || line[end] == ',' || line[end] == '\n') ) end--; /* ignore empty lines. */ if(end == -1) continue; if(line[end] == '{') started = 'T'; } if(fgets_ret == NULL) return -1; if (end < 0) { } else if ((line[end] == '}') && (end==0)) { start_rec = FALSE; need_to_read = FALSE; } else if (line[end] == '{' && end <= 10) { start_rec = TRUE; } else { if (line[end]=='}') { need_to_read = FALSE; start_rec = FALSE; } /* locate the tag. */ start = 0; while(line[start] == ' ' || line[start] == '\t'|| line[start] == '\n'|| line[start] == '{' ) start++; end = start +1; while(line[end] != ' ' && line[end] != '\t' && line[end] != '\n' && line[end] != '\0') end++; strncpy(field_name, line+start, end-start); field_name[end-start] = '\0'; /* process the field value. */ /* * creation_date, probing_date, or autorad_date */ if ( strcmp(field_name,"creation-date") == 0) { while(!isdigit(line[end])) end++; if(strToDate(line + end, tSeq->creation_date) == -1) { return FALSE; } } else if (strcmp(field_name,"probing-date") == 0) { while(line[end] != '\0' && !isdigit(line[end])) end++; if(line[end] != '\0' && strToDate(line + end, tSeq->probing_date) == -1) { return FALSE; } } else if ( strcmp(field_name,"autorad-date") == 0) { while(line[end] != '\0' && !isdigit(line[end])) end++; if(line[end] != '\0' && strToDate(line + end, tSeq->autorad_date) == -1) { return FALSE; } } /* * sequence or comments. */ else if (strcmp(field_name,"sequence") == 0 || strcmp(field_name,"comments") == 0 ) { temp_str[0] = '\0'; /* locate the first ". */ while(line[end++] != '"'); start = end; end = strlen(line); /* ---"\n\0. 
*/ if(line[end-2] == '"') end -= 2; else if(line[end-1] == '\n' && strcmp(field_name,"sequence") == 0) end--; while(temp_str_size < end-start+1 ) { temp_str_size *= 2; temp_str = (char *)Realloc(temp_str, temp_str_size); } if(end - start > 0) strncat(temp_str, line+start, end-start); /* Read the second line of the seq. or comments, if any. end-start<0 is the case that " is the only char this line.*/ if (line[strlen(line)-2] != '"' || end-start<0) { while((fgets_ret1 = fgets(line, max_len, fp)) != NULL) { /* IGNORE empty lines. 5/4/92 */ int empty_line = 0; while(line[empty_line] == ' ') empty_line++; if(line[empty_line] == '\n') { continue; /* strncat(temp_str, line, end); 5/4/92 */ } l = strlen(line) -1; if(line[l-1] == '"') end = l-1; else end = l; if(line[end] == '\n' && strcmp(field_name,"comments") == 0) end++; /* Gurantee 'end' chars for the string, one for ", * and one for \0. */ while(temp_str_size - strlen(temp_str) < end+3 ) { temp_str_size *= 2; temp_str=(char *)Realloc(temp_str,temp_str_size); } strncat(temp_str, line, end); if(line[l-1] == '"') break; } if(fgets_ret1 == NULL && need_to_read == TRUE) { fprintf(stderr, "ReadRecord(): incomplete record.\n"); return FALSE; } } l = strlen(temp_str); if(strcmp(field_name,"comments") == 0 ) { if(tSeq->commentsmaxlen == 0) { tSeq->comments = (char *)Calloc(l+1, 1); tSeq->commentsmaxlen = l+1; } else { while(tSeq->commentslen+l+1>tSeq->commentsmaxlen) { tSeq->commentsmaxlen *= 2; tSeq->comments = (char *) Realloc(tSeq->comments, tSeq->commentsmaxlen); } } tSeq->comments[tSeq->commentslen] = '\0'; strcat(tSeq->comments, temp_str); tSeq->commentslen += l; } else /* it is the sequence. */ { if(tSeq->seqmaxlen == 0) { tSeq->c_elem = (char *)Calloc(l+1, 1); } else if(l+1>tSeq->seqmaxlen) { tSeq->c_elem = (char *)Realloc(tSeq->c_elem, l+1); } tSeq->seqmaxlen = l+1; tSeq->seqlen = l; strcpy(tSeq->c_elem, temp_str); } } /* * Integer or String. */ else { /* locate the value: a string or an integer. 
*/ while(line[end] == ' ' || line[end] == '\t') end++; if (line[end] == '"') { /* It is a string. */ end++; start = end; while(line[end] != '\0' && line[end] != '"') end++; /* * strncat will not put a \0 at the end of a string * if the copying string is longer than n. */ line[end++] = '\0'; } else { /* It is an integer. */ start = end; while(line[end] != ' ' && line[end] != '\t' && line[end] != '\n' && line[end] != '\0') end++; strncpy(temp_str, line+start, end-start+1); /*4/26 add 1*/ temp_str[end-start] = '\0'; } /* assign to an integer field. */ if (strcmp(field_name,"laneset") == 0 ) tSeq->laneset = atoi(temp_str); else if (strcmp(field_name,"strandedness") == 0 ) tSeq->strandedness = atoi(temp_str); else if (strcmp(field_name,"direction") == 0) tSeq->direction = atoi(temp_str); else if (strcmp(field_name,"orig_strand") == 0 ) tSeq->orig_strand = atoi(temp_str); else if (strcmp(field_name,"orig_direction") == 0 ) tSeq->orig_direction = atoi(temp_str); else if (strcmp(field_name,"offset") == 0 ) tSeq->offset = atoi(temp_str); else if (strcmp(field_name,"group-number") == 0 ) tSeq->group_number = atoi(temp_str); else if (strcmp(field_name,"group-ID") == 0 ) tSeq->group_ID = atoi(temp_str); /* assign to a string field. 
*/ else if (strcmp(field_name,"type") == 0 ) { if(end - start > 31) end = start + 31; strncpy(tSeq->type, line+start, end-start); tSeq->type[end-start] = '\0'; } else if (strcmp(field_name,"barcode") == 0 ) { if(end - start > 31) end = start + 31; strncpy(tSeq->barcode, line+start, end-start); tSeq->barcode[end-start] = '\0'; } else if (strcmp(field_name,"name") == 0 ) { if(end - start > 31) end = start + 31; strncpy(tSeq->name, line+start, end-start); tSeq->name[end-start] = '\0'; } else if (strcmp(field_name,"status") == 0 ) { if(end - start > 31) end = start + 31; strncpy(tSeq->status, line+start, end-start); tSeq->status[end-start] = '\0'; } else if (strcmp(field_name,"walk") == 0 ) { if(end - start > 31) end = start + 31; strncpy(tSeq->walk, line+start, end-start); tSeq->walk[end-start] = '\0'; } else if (strcmp(field_name,"sequence-ID") == 0 ) { if(end - start > 31) end = start + 31; strncpy(tSeq->sequence_ID, line+start, end-start); tSeq->sequence_ID[end-start] = '\0'; } else if (strcmp(field_name,"creator") == 0 ) { if(end - start > 31) end = start + 31; strncpy(tSeq->creator, line+start, end-start); tSeq->creator[end-start] = '\0'; } else if (strcmp(field_name,"film") == 0 ) { if(end - start > 31) end = start + 31; strncpy(tSeq->film, line+start, end-start); tSeq->film[end-start] = '\0'; } else if (strcmp(field_name,"membrane") == 0 ) { if(end - start > 31) end = start + 31; strncpy(tSeq->membrane, line+start, end-start); tSeq->membrane[end-start] = '\0'; } else if (strcmp(field_name,"source-ID") == 0 ) { if(end - start > 31) end = start + 31; strncpy(tSeq->source_ID, line+start, end-start); tSeq->source_ID[end-start] = '\0'; } else if (strcmp(field_name,"contig") == 0 ) { if(end - start > 31) end = start + 31; strncpy(tSeq->contig, line+start, end-start); tSeq->contig[end-start] = '\0'; } else { if(tSeq->bagmaxlen == 0) { tSeq->bagmaxlen = 4*strlen(orig_line); tSeq->baggage = (char *)Calloc(tSeq->bagmaxlen, 1); } else { 
while(tSeq->bagmaxlenbaglen+2+strlen(orig_line)) { tSeq->bagmaxlen *= 2; tSeq->baggage = (char *) Realloc(tSeq->baggage, tSeq->bagmaxlen); } } if(tSeq->baglen == 0) { /* tSeq->baggage[0] = '\n'; tSeq->baggage[1] = '\0'; tSeq->baglen = 1; */ tSeq->baggage[0] = '\0'; } /* strcat(tSeq->baggage, "\n");*/ strcat(tSeq->baggage, orig_line); tSeq->baglen += strlen(orig_line); } } } } if(temp_str != NULL) { Cfree(temp_str); temp_str = NULL; } if ( start_rec == FALSE && fgets_ret == NULL) { /* end of file, did not get a record. */ return -1; } else return TRUE; } /********* * * Initialize a record. * * Note: no memory allocation is performed. * **********/ void InitRecord(tSeq) Sequence *tSeq; { int i; strcpy(tSeq->type, "DNA"); tSeq->barcode[0] = '\0'; tSeq->name[0] = '\0'; tSeq->status[0] = '\0'; strcpy(tSeq->walk, "FALSE"); tSeq->sequence_ID[0] = '\0'; tSeq->c_elem = NULL; tSeq->seqlen = 0; tSeq->seqmaxlen = 0; for (i = 0; i<6; i++) { tSeq->creation_date[i] = 0; tSeq->probing_date[i] = 0; tSeq->autorad_date[i] = 0; } tSeq->creator[0] = '\0'; tSeq->film[0] = '\0'; tSeq->membrane[0] = '\0'; tSeq->source_ID[0] = '\0'; tSeq->contig[0] = '\0'; tSeq->laneset = -1; tSeq->direction = 1; /* (1/-1/0),default: 5 to 3. 
*/ tSeq->strandedness = 1; /* (1/2/0), default: primary.*/ tSeq->orig_direction= 0; /* (0 unknown, -1:3'->5', 1:5'->3') */ tSeq->orig_strand = 0; /* (0 unknown, 1:primary, 2:secondary) */ tSeq->offset = 0; tSeq->comments = NULL; tSeq->commentslen = 0; tSeq->commentsmaxlen = 0; tSeq->baggage = NULL; tSeq->baglen = 0; tSeq->bagmaxlen = 0; tSeq->group_number = 0; tSeq->group_ID = 0; } void CopyRecord(to, from) Sequence *from, *to; { int i; InitRecord(to); strcpy(to->type, from->type); strcpy(to->barcode, from->barcode); strcpy(to->name, from->name); strcpy(to->status,from->status); strcpy(to->walk,from->walk); strcpy(to->sequence_ID, from->sequence_ID); if(from->c_elem != NULL) { to->seqlen = from->seqlen; to->seqmaxlen = from->seqmaxlen; to->c_elem = (char *)Calloc(to->seqmaxlen, 1); strncpy(to->c_elem, from->c_elem, to->seqlen); to->c_elem[to->seqlen] = '\0'; } for (i = 0; i<6; i++) { to->creation_date[i] = from->creation_date[i]; to->probing_date[i] = from->probing_date[i]; to->autorad_date[i] = from->autorad_date[i]; } strcpy(to->creator, from->creator); strcpy(to->film, from->film); strcpy(to->membrane, from->membrane); strcpy(to->source_ID, from->source_ID); strcpy(to->contig, from->contig); to->laneset = from->laneset; to->strandedness = from->strandedness; to->orig_direction = from->orig_direction; to->orig_strand = from->orig_strand; to->direction = from->direction; to->offset = from->offset; if(from->comments != NULL) { to->commentsmaxlen = from->commentsmaxlen; to->commentslen = from->commentslen; to->comments = (char *)Calloc(to->commentsmaxlen, 1); strncpy(to->comments, from->comments, to->commentslen); to->comments[to->commentslen] = '\0'; } if(from->baggage != NULL) { to->baglen = from->baglen; to->bagmaxlen = from->bagmaxlen; to->baggage = (char *)Calloc(to->bagmaxlen, 1); strncpy(to->baggage, from->baggage, to->baglen); to->baggage[to->baglen] = '\0'; } to->group_number = from->group_number; to->group_ID = from->group_ID; } /********* * * Clean the 
contents of a record without changing the memory size. * **********/ void CleanRecord(tSeq) Sequence *tSeq; { int i; strcpy(tSeq->type, "DNA"); tSeq->name[0] = '\0'; tSeq->barcode[0] = '\0'; tSeq->status[0] = '\0'; strcpy(tSeq->walk, "FALSE"); tSeq->sequence_ID[0] = '\0'; if(tSeq->c_elem != NULL) tSeq->c_elem[0] = '\0'; tSeq->seqlen = 0; for (i = 0; i<6; i++) { tSeq->creation_date[i] = 0; tSeq->probing_date[i] = 0; tSeq->autorad_date[i] = 0; } tSeq->creator[0] = '\0'; tSeq->film[0] = '\0'; tSeq->membrane[0] = '\0'; tSeq->source_ID[0] = '\0'; tSeq->contig[0] = '\0'; tSeq->laneset = -1; tSeq->strandedness = 1; /* (1/2/0), default. primary. */ tSeq->direction = 1; /* (1/-1/0),default. 5 to 3. */ tSeq->orig_direction= 0; tSeq->orig_strand = 0; tSeq->offset = 0; if(tSeq->comments != NULL) tSeq->comments[0] = '\0'; tSeq->commentslen = 0; if(tSeq->baggage != NULL) tSeq->baggage[0] = '\0'; tSeq->baglen = 0; tSeq->group_number = 0; tSeq->group_ID = 0; } /********* * * Free memory for a record. * **********/ void FreeRecord(tSeq) Sequence **tSeq; { Cfree((*tSeq)->c_elem); Cfree((*tSeq)->comments); Cfree((*tSeq)->baggage); Cfree((*tSeq)); (*tSeq)->c_elem = NULL; (*tSeq)->comments = NULL; (*tSeq)->baggage = NULL; (*tSeq) = NULL; } static max_day[2][13] = { { 0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}, { 0, 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31} }; /*********** * * strToDate() locates first six integers and translates them * into a date. * * String should have the format of "mm/dd/yy hh/mn/sc xm", * with anything except digit as the delimiters. * * Order in the date array is (0->5): (yy mm dd hh mn sc). * * Returns FALSE if anything is wrong, TRUE otherwise. * **********/ int strToDate(str, date) const char *str; int date[]; { int leap; char temp_str[2]; char longstr[256]; /* locate 6 integers. 
*/ strcpy(longstr, str); strcat(longstr, " -1/-1/-1 "); sscanf(longstr, "%d%*c%d%*c%d%*c%d%*c%d%*c%d%2s", &date[1],&date[2],&date[0],&date[3], &date[4],&date[5],temp_str); /* verify year. */ if(date[0] >= 100) date[0] -= 1900; /* verify month. */ if(date[1] > 12 || date[1] < 1) { fprintf(stderr,"invalid month %s\n", str); return FALSE; } /* verify day. */ if ((date[0] % 4 == 0 && date[0] % 100 != 0) || date[0] % 400 == 0) leap = 1; else leap = 0; if(date[2] > max_day[leap][date[1]] || date[2] < 1) { fprintf(stderr,"invalid day %s\n", str); return FALSE; } /* verify time. */ if (strncmp(temp_str,"pm",2)==0) date[3] += 12; if (date[3]<-1 || date[3]>23 || date[4]<-1 || date[4]>59 || date[5]<-1 || date[5]>59 ) { fprintf(stderr,"invalid time %s\n", str); return FALSE; } return TRUE; } /********** * * Default_IUPAC_Trans() translates an ASCII IUPAC code into * an (char) integer. * **********/ char Default_IUPAC_Trans(base) char base; { int i; char c; c = base | 32; if(c == 'u') return (char ) 8; if(c == 'p') return (char) 5; for(i=0; i<16; i++) { if(c == Default_DNA_Trans[i]) { return ( (char) i); } } fprintf(stderr, "Character %c is not IUPAC coded.\n", base); return -1; } char *uniqueID(); /*********** * * MakeConsensus() takes an array of aligned sequence and an * initialized 'Sequence' consensus. It modifies the consensus. * * The memory that 'consensus' has located will be reused, and * consensus->seqmaxlen will be modified if necessary. * * Returns TRUE if successful, FALSE otherwise. * **********/ int MakeConsensus(aligned, numOfAligned, consensus, group) Sequence aligned[]; /* input. */ int numOfAligned; /* input. */ Sequence *consensus; /* input and output. */ int group; /* Group number (if zero, use all groups) */ { char occurence; int i, j, index; int max_cons = INT_MIN; int min_offset = INT_MAX; char temp_str[2]; unsigned char case_bit; /* * Search for the minimun offset. 
*/ for (i=0; ioffset = min_offset; if(aligned[0].contig[0] != '\0') { strcpy(consensus->name, aligned[0].contig); strcat(consensus->name, "."); } else if(strncmp(aligned[0].name, "cons.", 5) != 0) { strcpy(consensus->name, "cons."); strcat(consensus->name, aligned[0].name); } strcpy(consensus->sequence_ID, uniqueID()); strcpy(consensus->contig, aligned[0].contig); for(j=min_offset; j= aligned[i].offset && j < aligned[i].offset+aligned[i].seqlen) { index = j-aligned[i].offset; if(aligned[i].c_elem[index] == '-') case_bit = 32; else if(case_bit == 0) case_bit |= (aligned[i].c_elem[index] & 32); occurence = occurence | Default_IUPAC_Trans(aligned[i].c_elem[index]); if(occurence != 1 && occurence != 2 && occurence != 4 && occurence != 8) case_bit = 32; /* printf("%1c", aligned[i].c_elem[index]); */ } /* else printf(" "); */ } } sprintf(temp_str, "%1c", Default_DNA_Trans[(int) occurence]); if(case_bit == 0) temp_str[0] = toupper(temp_str[0]); if(InsertElems(consensus, j, temp_str)== FALSE) return FALSE; /* printf(" cons[%d]=%1c\n", j - min_offset, consensus->c_elem[j - min_offset]); */ } return TRUE; } /*********** * * MakeScore() takes an array of aligned sequence, and generates * a consensus. Note, memory for (Sequence* consensus) should be * located before it is passed to this function. * * Returns TRUE if successful, FALSE otherwise. * **********/ int MakeScore(aligned, numOfAligned, consensus, group) Sequence aligned[]; /* input. */ int numOfAligned; /* input. */ Sequence *consensus; /* input and output. */ int group; { int i, j, index, score; int max_cons = INT_MIN; int min_offset = INT_MAX; int As, Cs, Ts, Gs, Ns, tot_in_grp; char temp_str[2], occurence, base; int max_occ; static char map[17] = "0123456789ABCDEF"; /* * Search for the minimum offset. 
*/ for (i=0; ioffset = min_offset; if(aligned[0].contig[0] != '\0') { strcpy(consensus->name, aligned[0].contig); strcat(consensus->name, "."); } else if(strncmp(aligned[0].name, "cons.", 5) != 0) { strcpy(consensus->name, "cons."); strcat(consensus->name, aligned[0].name); } strcpy(consensus->sequence_ID, uniqueID()); strcpy(consensus->contig, aligned[0].contig); for(j=min_offset; j= aligned[i].offset && j < aligned[i].offset+aligned[i].seqlen) { tot_in_grp++; index = j-aligned[i].offset; /* occurence = Default_IUPAC_Trans(aligned[i].c_elem[index]); if((occurence & 01) == 01) As++; if((occurence & 02) == 02) Cs++; if((occurence & 04) == 04) Gs++; if((occurence & 010) == 010) Ts++; */ base = (aligned[i].c_elem[index]|32); if(base == 'a') As++; else if(base == 'c') Cs++; else if(base == 'g') Gs++; else if(base == 't') Ts++; else if(base == 'n' || base == '-') Ns++; /* printf("%1c", aligned[i].c_elem[index]); */ } /* else printf(" "); */ } } max_occ = MAX(As, MAX(Cs, MAX(Gs,Ts))); /* socre = [0,E], F:all mismatches are either 'n' or '-' */ if(Ns != 0 && max_occ+Ns == tot_in_grp) score = 15; else score = max_occ*14/tot_in_grp; /* if( score > 0xF ) { if (InsertElems(consensus, j, "F") == FALSE) { return FALSE; } } else { */ sprintf(temp_str,"%1c", map[score]); if(InsertElems(consensus, j, temp_str) == FALSE) { return FALSE; } /* printf(" %2d-%2d-%2d-%2d %2d cons[%d]=%1c\n", Ts, Gs, Cs, As, score, j, consensus->c_elem[j]); */ } return TRUE; } /*********** * * MakePhyloMask() takes an array of aligned sequence, and generates * a mask that has a '0' for all columns except the columns which contain * a, c, g, t and u only. * * Returns TRUE if successful, FALSE otherwise. * **********/ int MakePhyloMask(aligned, numOfAligned, consensus, group, acgtu) Sequence aligned[]; /* input. */ int numOfAligned; /* input. */ Sequence *consensus; /* input and output. */ int acgtu[]; int group; { int i, j, cnt, max_cons = INT_MIN, min_offset = INT_MAX; /* * Search for the minimum offset. 
*/ for (i=0; ioffset = min_offset; strcpy(consensus->name, "mask"); strcpy(consensus->type, "MASK"); strcpy(consensus->sequence_ID, uniqueID()); strcpy(consensus->contig, aligned[0].contig); consensus->seqlen = max_cons - min_offset; if(consensus->seqmaxlen == 0) { consensus->c_elem = (char *)Calloc(max_cons - min_offset+5, 1); consensus->seqmaxlen = max_cons - min_offset + 5; } else if(consensus->seqmaxlen < max_cons - min_offset) { consensus->seqmaxlen = max_cons - min_offset + 5; consensus->c_elem = (char *)Realloc(consensus->c_elem, max_cons - min_offset + 5); } cnt = 0; for(j=min_offset; jc_elem[j-min_offset] = '1'; for(i=0; i= aligned[i].offset+aligned[i].seqlen || acgtu[aligned[i].c_elem[j-aligned[i].offset]] == 0) { consensus->c_elem[j-min_offset] = '0'; cnt++; break; } } } } fprintf(stderr, "\nNumber of 1s in mask: %d\n", max_cons-min_offset-cnt); fprintf(stderr, "Number of 0s in mask: %d\n\n", cnt); return TRUE; } /*********** * * MajorityCons() takes an array of aligned sequence, and generates * a MAJORITY consensus. * Note, memory for (Sequence* consensus) should be * located before it is passed to this function. * * Returns TRUE if successful, FALSE otherwise. * **********/ int MajorityCons(aligned, numOfAligned, consensus, group, major_perc) Sequence aligned[]; /* input. */ int numOfAligned; /* input. */ Sequence *consensus; /* input and output. */ int group, major_perc; { int i, j, index, score, ii, base, max; int max_cons = INT_MIN; int min_offset = INT_MAX; char temp_str[2], occurence; int *cnts, tot_in_grp; unsigned char case_bit; cnts = (int *)Calloc(16, sizeof(int)); /* * Search for the minimum offset. 
*/ for (i=0; ioffset = min_offset; if(aligned[0].contig[0] != '\0') { strcpy(consensus->name, aligned[0].contig); strcat(consensus->name, "."); } else if(strncmp(aligned[0].name, "cons.", 5) != 0) { strcpy(consensus->name, "cons."); strcat(consensus->name, aligned[0].name); } strcpy(consensus->sequence_ID, uniqueID()); strcpy(consensus->contig, aligned[0].contig); for(j=min_offset; j= aligned[i].offset && j < aligned[i].offset+aligned[i].seqlen) { tot_in_grp++; index = j-aligned[i].offset; if(aligned[i].c_elem[index] == '-') case_bit = 32; else if(case_bit == 0) case_bit |= (aligned[i].c_elem[index] & 32); occurence |= Default_IUPAC_Trans(aligned[i].c_elem[index]); cnts[(int)Default_IUPAC_Trans(aligned[i].c_elem[index])]++; if(case_bit == 0 && occurence != 1 && occurence != 2 && occurence != 4 && occurence != 8) case_bit = 32; } } } max = 0; for(ii = 0; ii < 16; ii++) { if(cnts[ii] > max) { max = cnts[ii]; base = ii; } } if(max*100/tot_in_grp >= major_perc) { /* follow the majority rule. */ sprintf(temp_str,"%1c", Default_DNA_Trans[base]); } else { /* use IUPAC code. */ sprintf(temp_str,"%1c", Default_DNA_Trans[(int) occurence]); } if(case_bit == 0) temp_str[0] = toupper(temp_str[0]); if(InsertElems(consensus, j, temp_str) == FALSE) { return FALSE; } } return TRUE; } /*********** * * ReadGDEtoHGL() reads a GDE formated file into an array of HGL structure. * * Return -1 if anything is wrong, number_of_sequence otherwise. * ***********/ int ReadGDEtoHGL(fp, tSeq_arr) FILE *fp; Sequence **tSeq_arr; { char line[MAXLINELEN]; int ptr, num_seq, max_num_seq = 20; int seq_len = 200; char *newline; (*tSeq_arr) = (Sequence *)Calloc(max_num_seq, sizeof(Sequence)); num_seq = -1; while(fgets(line, MAXLINELEN-2, fp) != NULL) /* spaces for \n\0 */ { /* ptr points to the last char. */ ptr = strlen(line)-1; /* clear up the tail. */ while(ptr>=0 && (line[ptr] == '\n' || line[ptr] == ' ' || line[ptr] == '\t')) ptr--; line[ptr+1] = '\0'; if(ptr <= 0) { /* it is an empty line. 
*/ } else if(line[0] == '#') { if(++num_seq == max_num_seq) { max_num_seq *= 2; /* printf("max_num_seq = %d\n", max_num_seq); */ (*tSeq_arr) = (Sequence *)Realloc((*tSeq_arr), max_num_seq*sizeof(Sequence)); } InitRecord((*tSeq_arr)[num_seq]); if (line[ptr] == '<') { (*tSeq_arr)[num_seq].direction = 2; /* 3to5 */ line[ptr] = '\0'; } else if (line[ptr] == '>') { (*tSeq_arr)[num_seq].direction = 1; /* 5to3 */ line[ptr] = '\0'; } strcpy((*tSeq_arr)[num_seq].sequence_ID, line+1); } else { ptr = 0; if((*tSeq_arr)[num_seq].seqlen == 0) { /* determine the offset. */ while(line[ptr] != '\0' && line[ptr] == '-') { ptr++; } (*tSeq_arr)[num_seq].offset += ptr; } if(line[ptr] != '\0') { newline = line + ptr; if((*tSeq_arr)[num_seq].seqmaxlen == 0) { (*tSeq_arr)[num_seq].c_elem = (char *)Calloc(seq_len, 1); (*tSeq_arr)[num_seq].c_elem[0] = '\0'; (*tSeq_arr)[num_seq].seqmaxlen = seq_len; } else { while((*tSeq_arr)[num_seq].seqlen + strlen(newline) + 1 > (*tSeq_arr)[num_seq].seqmaxlen) { seq_len *= 2; (*tSeq_arr)[num_seq].c_elem = (char *) Realloc((*tSeq_arr)[num_seq].c_elem, seq_len); (*tSeq_arr)[num_seq].seqmaxlen = seq_len; } } strcat((*tSeq_arr)[num_seq].c_elem, newline); (*tSeq_arr)[num_seq].seqlen = strlen((*tSeq_arr)[num_seq].c_elem); } } } return (num_seq + 1); } /******** * * InsertElems returns TRUE if successful, FALSE otherwise. * ********/ int InsertElems(seq,pos,c) Sequence *seq; /* Sequence */ int pos; /* Position (in respect to the master consensus) * to insert BEFORE * always move string to the right. */ char c[]; /*Null terminated array of elements to insert */ { int dashes, j,len; len = strlen(c); if(seq->seqlen == 0) { /* get rid of '-'s at right. */ /* dashes = len-1; while(dashes >= 0 && c[dashes] == '-') dashes--; if(dashes < 0) { seq->offset = pos; return TRUE; } c[dashes+1] = '\0'; */ /* clear out '-'s at left. 
*/ dashes = 0; /* while(c[dashes] == '-') dashes++; c += dashes; len = strlen(c); pos += dashes; */ if(seq->seqmaxlen == 0) { seq->c_elem = (char *)Calloc(len+1, 1); seq->seqmaxlen = len + 1; } else if(len+1 >= seq->seqmaxlen) { seq->c_elem = (char *)Realloc(seq->c_elem, len+1); seq->seqmaxlen = len+1; } strcpy(seq->c_elem, c); seq->seqlen = len; seq->offset = pos; return TRUE; } /* to make sure there is a space for '\0'. */ if(seq->seqlen > seq->seqmaxlen) { fprintf(stderr, "InsertElems(): seqlen>seqmaxlen. Something is wrong.\n"); return FALSE; } else { while(seq->seqlen+1 >= seq->seqmaxlen) { seq->seqmaxlen *= 2; seq->c_elem = (char *)Realloc(seq->c_elem, seq->seqmaxlen); } } seq->c_elem[seq->seqlen] = '\0'; if(pos < seq->offset) /* insert to the left of the seq. */ { /* ignore the dashes at the left. */ dashes = 0; /* while(dashes < len && c[dashes] == '-') dashes++; if(c[dashes] == '\0') { seq->offset += len; return TRUE; } c += dashes; len -= dashes; */ if(seq->seqlen + len + seq->offset - pos > seq->seqmaxlen) { seq->seqmaxlen = seq->seqlen+len+seq->offset-pos+256; seq->c_elem = (char *)Realloc(seq->c_elem, seq->seqmaxlen); } /* copy the old string including the last '\0'. */ for(j=seq->seqlen; j>=0; j--) seq->c_elem[j+len+seq->offset-pos] = seq->c_elem[j]; /* insert dashes. */ for(j=len; joffset-pos; j++) seq->c_elem[j] = '-'; /* copy the inserted string. */ for(j=0; jc_elem[j] = c[j]; /* detector. */ if(c[j] != '\0') fprintf(stderr, "InsertElems: Problem.....\n"); seq->seqlen = strlen(seq->c_elem); /* seq->offset = pos; commented on 6-3-91 */ seq->offset = pos + dashes; if(dashes > 0) printf("\nInsertElems(): dashes is not zero.\n\n"); } else if(pos - seq->offset >= seq->seqlen) /* insert to the right. */ { /* ignore the dashes at the right. 
*/ /* dashes = len -1; while(dashes >= 0 && c[dashes] == '-') dashes--; if(dashes < 0) return TRUE; len = dashes+1; c[len] = '\0'; */ if(pos - seq->offset + len > seq->seqmaxlen) { seq->seqmaxlen = pos - seq->offset + len + 256; seq->c_elem = (char *)Realloc(seq->c_elem, seq->seqmaxlen); } /* insert dashes. */ for(j=seq->seqlen; joffset; j++) seq->c_elem[j] = '-'; /* copy the inserted string. */ for(j=0; jc_elem[pos - seq->offset + j] = c[j]; seq->c_elem[pos-seq->offset+len] = '\0'; /* detector. */ if(c[j] != '\0') fprintf(stderr, "InsertElems: Problem too .....\n"); seq->seqlen = strlen(seq->c_elem); } else /* insert into the seq. */ { if(seq->seqlen + len > seq->seqmaxlen) { seq->seqmaxlen = seq->seqlen + len + 256; seq->c_elem = (char *)Realloc(seq->c_elem, seq->seqmaxlen); } /* move the bottom part of the older string including the last '\0'. */ for(j=seq->seqlen; j>=pos-seq->offset; j--) seq->c_elem[j+len] = seq->c_elem[j]; /* copy the inserted string. */ for(j=0; jc_elem[pos - seq->offset + j] = c[j]; /* detector. */ if(c[j] != '\0') fprintf(stderr, "InsertElems: Problem too too .....\n"); seq->seqlen = strlen(seq->c_elem); } return TRUE; } /****************************************************************** * * int GetArgs(argArray, numArgs) * Arg *argArray; * int numArgs; * * Return TRUE if successful, FALSE otherwise. * ******************************************************************/ #define MAX_ARGS 50 /* maximum args this can process */ int GetArgs(argArray, numArgs, argc, argv) Args *argArray; int numArgs; int argc; char **argv; { int i, j; Args *curarg; int noArgOK = TRUE; if ((argArray == NULL) || (numArgs == 0) || (numArgs > MAX_ARGS)) { fprintf(stderr, "GetArgs: Invalid number of args.\n"); return FALSE; } /* * Test if all are either 'default' or 'optional'. */ curarg = argArray; for (i=0; istrvalue[0] == '\0' && curarg->optional == 'F') { noArgOK = FALSE; break; } } /* * show usage if some arg is required but no arg is * supllied on command line. 
*/ if(noArgOK == FALSE && argc == 1) { fprintf(stderr, "\n%s arguments:\n\n", argv[0]); curarg = argArray; for (i = 0; i < numArgs; i++, curarg++) { fprintf(stderr, " -%c %s ", curarg->tag, curarg->prompt); if (curarg->optional == 'T') fprintf(stderr, " [Optional]"); fprintf(stderr, "\n"); if (curarg->strvalue[0] != '\0') fprintf(stderr, " default = %s\n", curarg->strvalue); } fprintf(stderr, "\n"); return FALSE; } /* * Process */ for (i = 1; i < argc; i++) { if (argv[i][0] != '-') { fprintf(stderr, "Arguments must start with -"); return FALSE; } /* check the tag. */ curarg = argArray; for (j = 0; j < numArgs; j++, curarg++) { if ((argv[i][1]|32) == (curarg->tag|32)) break; } if (j == numArgs) { fprintf(stderr, "Invalid argument tag in %s\n", argv[i]); return FALSE; } strcpy(curarg->strvalue, argv[i]+2); if(curarg->strvalue[0] == '\'' && curarg->strvalue[strlen(curarg->strvalue)-1] == '\'') { char ttmm[256]; strcpy(ttmm, curarg->strvalue+1); ttmm[strlen(ttmm)-1] = '\0'; strcpy(curarg->strvalue, ttmm); } } return TRUE; } /********* * * GetCond interprets the -c argument, the condition. * * The condition will be set to NULL if no condition is specified, * that is, if you pass '&p' as the address of a cond* structure, * p will be set to NULL if no condition [(p == NULL) = TRUE]. * * Return TRUE if successful, FALSE otherwise. * *********/ int GetCond(arg, cond) char *arg; str_cond **cond; { int start, end, i, found; char message_buf[1000]; if ( strcmp(arg, "null")==0) { (*cond) = NULL; return TRUE; } else { (*cond) = (str_cond *)Calloc(1, sizeof(str_cond)); start = end = 0; /* find the field name. */ while (('a'<= arg[end] && arg[end]<='z') || ('A'<= arg[end] && arg[end]<='Z') || arg[end] == '-' ) end++; found = FALSE; for (i=0; ifield = i; /* condition on field &at[i]. 
*/ found = TRUE; break; } } if (found == FALSE) { strncpy(message_buf, arg, end-start); message_buf[end-start] = '\0'; fprintf(stderr, "Field %s not found.\n", message_buf); return FALSE; } start = end; end++; while (arg[end] == '=' || arg[end] == '!' || arg[end] == '>' || arg[end] == '<' ) end++; strncpy((*cond)->symbol, arg+start, end-start); (*cond)->symbol[end-start] = '\0'; if (strlen((*cond)->symbol)>2 || strlen((*cond)->symbol)<1 || (strlen((*cond)->symbol)==1 && *((*cond)->symbol) !='>' && *((*cond)->symbol) != '<') || (strlen((*cond)->symbol)==2 && (strncmp((*cond)->symbol,"!=",2)!= 0 ) && (strncmp((*cond)->symbol,"==",2)!= 0 ) && (strncmp((*cond)->symbol,">=",2)!= 0 ) && (strncmp((*cond)->symbol,"<=",2)!= 0 ) ) ) { fprintf(stderr, "Invalid condition.\n"); return FALSE; } if(arg[end] == '"' && arg[strlen(arg) - 1] == '"') { end++; arg[strlen(arg) - 1] = '\0'; } (*cond)->value = (char *)Calloc(strlen(arg) - end + 2, 1); strcpy((*cond)->value, arg+end); } return TRUE; } /********* * * GetFields interprets the -f arguments, the fields list. * * Returns number of selected fields, 0 if anything is wrong. * *********/ int GetFields(arg, selected_fields) char *arg; int selected_fields[]; { int start, end, i, found, list_done, i_selected; char message_buf[1000]; if ( strcmp(arg, "all") == 0 ) { selected_fields[0] = -1; return NUM_OF_FIELDS; } else { start = end = 0; list_done = FALSE; i_selected = 0; while ( list_done == FALSE ) { while (arg[end] != '\0' && arg[end] != ',') { end++ ; } if (arg[end] == '\0') { list_done = TRUE; } found = FALSE; for (i=0; i= pl && strncmp(string+i, pattern, pl) == 0) num_app++; } return num_app; } /******* * * FindPatternNC() searches string for pattern , CASE INSENSITIVE. * Returns the number of appearences. 
* *******/ int FindPatternNC(string, pattern) const char *string; const char *pattern; { int i, j, sl, pl, num_app = 0; if(string == NULL || (sl = strlen(string)) == 0) return 0; pl = strlen(pattern); for(i = 0; i <= sl-pl; i++) { j = 0; while(j < pl && (string[i+j]|32) == (pattern[j]|32)) j++; if(j == pl) num_app++; } return num_app; } /******* * * Complementary() CHANGES the given DNA/RNA string to its complementary, * and returns TRUE. Returns FALSE if anything is wrong and keep the * given string unchanged. * *******/ int Complementary(sequence, type) char *sequence; char type; { int i, l; char *temp_str; l = strlen(sequence); temp_str = (char *)Calloc(l+1, sizeof(char)); if( type == 'D' || type == 'd') type = 0; else if(type == 'R' || type == 'r') type = 1; else { fprintf(stderr, "Complementary(): type unknown. Type is D/d/R/r\n"); return (int) NULL; } for(i=0; i 1) { fprintf(stderr, "%s has 15 repatitive base(s) %s\n", PossibleOligo, subseq); i++; BadOligo = TRUE; } } */ /* * To ensure that the probe is not going to hybridize * with itself: */ for(PO_index = 0; BadOligo==FALSE && PO_index<=PO_len-no_repeat_len; PO_index++) { SubStr(PossibleOligo, PO_index, no_repeat_len, subseq); strcpy(scd_str, subseq); Complementary(scd_str, 'd'); Reverse(scd_str); if(FindPattern(PossibleOligo, scd_str) > 0) { fprintf(stderr, "%s may hybridize with itself: %s vs. 
%s.\n", PossibleOligo, subseq, scd_str); i++; BadOligo = TRUE; } } for(PO_index = 0; BadOligo == FALSE && PO_index <= PO_len-2*check_len; PO_index++) { SubStr(PossibleOligo, PO_index, check_len, subseq); Complementary(subseq, 'd'); strcpy(scd_str, subseq); Reverse(scd_str); /* if(FindPattern2(PossibleOligo,subseq,PO_index)>0) { fprintf(stderr, "%s has self-compl %s\n", PossibleOligo, subseq); i += PO_index+1; BadOligo = TRUE; } else */ if(FindPattern2(PossibleOligo,scd_str,PO_index)>0) { fprintf(stderr, "%s has 2nd struct %s\n", PossibleOligo, scd_str); i += PO_index+1; BadOligo = TRUE; } } if(BadOligo == FALSE) { seq_set[seq_cnt] = (char *) Calloc(strlen(PossibleOligo)+1, sizeof(char)); strcpy(seq_set[seq_cnt], PossibleOligo); if(++seq_cnt == max_num_probe) { max_num_probe *= 2; seq_set = (char **) Realloc(seq_set, max_num_probe*sizeof(char *)); } i++; } } /* end of l. */ } /* end of i. */ seq_set[seq_cnt] = NULL; if(seq_cnt == 0) return NULL; return seq_set; }

/*
 * uniqueID() builds an identifier of at most 31 characters of the form
 *
 *      <tail of host name>:<call counter>:<time() value>
 *
 * and returns it in the file-scope buffer vname.  The host name is
 * shortened from the left so that the whole identifier fits.
 *
 * ALWAYS COPY the result to a char[32] of your own: the same buffer is
 * overwritten by the next call.
 *
 * Exits the program if the host name cannot be obtained.
 */
char vname[32];

char *uniqueID()
{
    static int cnt = 0;     /* number of identifiers handed out so far */
    char hname[32];
    char tstr[64];          /* ":<int>:<long>" fits even with 64-bit long */
    time_t now;
    int tlen, skip;

    if(gethostname(hname, sizeof(hname)) == -1)
    {
        fprintf(stderr, "UniqueID(): Failed to get host name.\n");
        exit(1);
    }
    /* a truncated host name is not guaranteed to be terminated */
    hname[sizeof(hname) - 1] = '\0';

    time(&now);
    sprintf(tstr, ":%d:%ld", cnt, (long)now);
    tlen = strlen(tstr);

    if(tlen > 31)
    {
        /* counter and time alone fill the buffer: drop the host name */
        strncpy(vname, tstr, 31);
        vname[31] = '\0';
    }
    else
    {
        /* keep only as many trailing host-name characters as still fit */
        skip = (int)strlen(hname) - (31 - tlen);
        if(skip < 0)
            skip = 0;
        sprintf(vname, "%s%s", hname + skip, tstr);
    }

    cnt++;
    return(vname);
}

/* return the percentage of GCcontents. */ int GCcontent(seq) char *seq; { int l, gc=0, j; l = strlen(seq); for (j=0; jcomments, tSeq->c_elem); } Find2(string,key) char *key,*string; /* * Like find, but returns the index of the leftmost * occurence, and -1 if not found. * Note in this program, T==U, and case insensitive. 
*/ { int i,j,len1,len2,dif,flag = FALSE; char *target; if(string == NULL || string[0] == '\0') return -1; len2 = strlen(string); target = (char *) Calloc(len2+1, 1); for(i = 0; i0) for(j=0;jsequence_ID); } else if(temp_line[0] == '#') { strncpy(seq->name, temp_line+1, 31); seq->name[31] = '\0'; ii = 0; while(ii < strlen(seq->name) && seq->name[ii] != ' ' && seq->name[ii] != '\n') ii++; seq->name[ii] = '\0'; seq->seqmaxlen = 256; seq->c_elem=(char *)Calloc(seq->seqmaxlen,1); seq->seqlen = 0; while(fgets(temp_line, 1000, fp) != NULL) { l1 = strlen(temp_line); if(temp_line[l1 - 1] == '\n') { l1--; temp_line[l1] = '\0'; } while(seq->seqmaxlen < seq->seqlen + strlen(temp_line) + 1) { seq->seqmaxlen *= 2; seq->c_elem = (char *) Realloc(seq->c_elem, seq->seqmaxlen); } strcat(seq->c_elem, temp_line); seq->seqlen += strlen(temp_line); } if(seq->seqlen == 0) { fprintf(stderr, "\n%s\n","Sequence is empty."); return FALSE; } } } return -1; }

/*
 * heapify() sifts the entry at position `elem` of the index array
 * *order down until neither child compares greater (per CompKey)
 * than it.
 *
 * seq_set   : records being ordered; they are read, never moved.
 * seq_size  : number of records (not used here).
 * heap_size : index of the LAST position that belongs to the heap,
 *             not an element count (heapsort() starts at seq_size-1).
 * elem      : position in *order to start from.
 * Pkey,Skey : primary / secondary key names handed to CompKey().
 * order     : address of the index array; entries are swapped in place.
 */
void heapify(seq_set, seq_size, heap_size, elem, Pkey, Skey, order)
int seq_size, elem, heap_size, **order;
char Pkey[], Skey[];
Sequence *seq_set;
{
    int *slot = *order;
    int left, right, top, held;

    for(;;)
    {
        left  = 2*elem + 1;
        right = 2*elem + 2;
        top   = elem;

        if(left <= heap_size &&
           CompKey(seq_set[slot[left]], seq_set[slot[elem]], Pkey, Skey) > 0)
            top = left;
        if(right <= heap_size &&
           CompKey(seq_set[slot[right]], seq_set[slot[top]], Pkey, Skey) > 0)
            top = right;

        if(top == elem)
            return;

        held = slot[elem];
        slot[elem] = slot[top];
        slot[top] = held;
        elem = top;             /* continue from the child just swapped */
    }
}


/*
 * heapsort() rearranges the index array *order so that
 * seq_set[(*order)[0]], seq_set[(*order)[1]], ... are in ascending
 * order according to CompKey(…, Pkey, Skey).  seq_set is not modified.
 *
 * Returns TRUE.
 */
int heapsort(seq_set, seq_size, Pkey, Skey, order)
int seq_size, **order;
char Pkey[], Skey[];
Sequence *seq_set;
{
    int pos, held, last;

    /* build the heap; starting on a leaf is harmless, heapify() then
     * finds no child and returns at once. */
    last = seq_size - 1;
    for(pos = (seq_size-1)/2; pos >= 0; pos--)
        heapify(seq_set, seq_size, last, pos, Pkey, Skey, order);

    /* move the current maximum behind the shrinking heap */
    for(pos = seq_size-1; pos > 0; pos--)
    {
        held = (*order)[0];
        (*order)[0] = (*order)[pos];
        (*order)[pos] = held;

        last--;
        heapify(seq_set, seq_size, last, 0, Pkey, Skey, order);
    }
    return TRUE;
}


/*
 * Date fields are compared on slots 0..2 first (slot 0 reduced modulo
 * 100) and, when those are equal, on slots 3..5.
 * NOTE(review): the modulo makes a slot-0 value of 2000 sort before
 * 1999; presumably kept so that two- and four-digit years mix - confirm.
 */
#define COMPKEY_DAY(d)   (((d)[0] % 100)*10000 + (d)[1]*100 + (d)[2])
#define COMPKEY_TIME(d)  ((d)[3]*10000 + (d)[4]*100 + (d)[5])
#define COMPKEY_DATE(d1, d2) \
    (COMPKEY_DAY(d1) != COMPKEY_DAY(d2) \
        ? COMPKEY_DAY(d1) - COMPKEY_DAY(d2) \
        : COMPKEY_TIME(d1) - COMPKEY_TIME(d2))

/*
 * Part of a barcode that takes part in comparisons: a leading 'P' and
 * the character after it are skipped.  A barcode consisting of "P"
 * alone is used as is, so nothing past its terminator is read.
 */
static const char *CompKeyBarcode(code)
const char *code;
{
    if(code[0] == 'P' && code[1] != '\0')
        return code + 2;
    return code;
}

/*
 * Compares two records on the single field named by `key`.
 * *known is set to FALSE (and 0 is returned) when the key name is not
 * recognised, TRUE otherwise.
 */
static int CompKeyOne(a, b, key, known)
const Sequence *a, *b;
const char *key;
int *known;
{
    *known = TRUE;

    if(strcmp(key, "type") == 0)
        return strcmp(a->type, b->type);
    if(strcmp(key, "name") == 0)
        return strcmp(a->name, b->name);
    if(strcmp(key, "sequence-ID") == 0)
        return strcmp(a->sequence_ID, b->sequence_ID);
    if(strcmp(key, "creator") == 0)
        return strcmp(a->creator, b->creator);
    if(strcmp(key, "offset") == 0)
        return a->offset - b->offset;
    if(strcmp(key, "group-ID") == 0)
        return a->group_ID - b->group_ID;
    if(strcmp(key, "barcode") == 0)
        return strcmp(CompKeyBarcode(a->barcode), CompKeyBarcode(b->barcode));
    if(strcmp(key, "seqlen") == 0)
        return a->seqlen - b->seqlen;
    if(strcmp(key, "creation-date") == 0)
        return COMPKEY_DATE(a->creation_date, b->creation_date);
    if(strcmp(key, "probing-date") == 0)
        return COMPKEY_DATE(a->probing_date, b->probing_date);
    /* "autorad_date" is the historical spelling; the hyphenated form is
     * accepted as well so that it matches the other date keys. */
    if(strcmp(key, "autorad_date") == 0 || strcmp(key, "autorad-date") == 0)
        return COMPKEY_DATE(a->autorad_date, b->autorad_date);
    if(strcmp(key, "film") == 0)
        return strcmp(a->film, b->film);
    if(strcmp(key, "membrane") == 0)
        return strcmp(a->membrane, b->membrane);
    if(strcmp(key, "contig") == 0)
        return strcmp(a->contig, b->contig);

    *known = FALSE;
    return 0;
}

#undef COMPKEY_DATE
#undef COMPKEY_TIME
#undef COMPKEY_DAY

/*
 * CompKey() compares two records, first on the field named Pkey and,
 * if that does not decide and Skey is not empty, on the field named
 * Skey.
 *
 * Recognised keys: type, name, sequence-ID, creator, offset, group-ID,
 * barcode, seqlen, creation-date, probing-date, autorad_date
 * (also autorad-date), film, membrane, contig.
 *
 * Returns >0, ==0, <0 as seq1 sorts after, equal to, or before seq2.
 * An unknown primary key always terminates the program.  An unknown
 * secondary key terminates it only when it is actually consulted.
 */
int CompKey(seq1, seq2, Pkey, Skey)
Sequence seq1, seq2;
char Pkey[], Skey[];
{
    int known, diff;

    diff = CompKeyOne(&seq1, &seq2, Pkey, &known);
    if(!known)
    {
        fprintf(stderr,"CompKey(): Invalid primary key %s.\n",Pkey);
        exit(1);
    }
    if(diff != 0 || Skey[0] == '\0')
        return diff;

    diff = CompKeyOne(&seq1, &seq2, Skey, &known);
    if(!known)
    {
        fprintf(stderr, "CompKey(): Invalid secondary key %s.\n",Skey);
        exit(1);
    }
    return diff;
}


/*
 * Lock() waits up to 30 seconds (polling once a second) for `fname`
 * to become readable, then renames it to "<fname>.locked".
 *
 * Returns TRUE when the file has been renamed; FALSE if the name is
 * missing or too long, the file did not become available in time, or
 * the rename failed.
 *
 * The rename is done with rename() rather than a shell command, so
 * names containing blanks or shell metacharacters are handled as is.
 */
int Lock(fname)
char *fname;
{
    char locked[1024];
    FILE *fp;
    int waited = 0;

    if(fname == NULL || strlen(fname) + sizeof(".locked") > sizeof(locked))
    {
        fprintf(stderr, "Lock(): file name missing or too long.\n");
        return FALSE;
    }

    while((fp = fopen(fname, "r")) == NULL)
    {
        sleep(1);
        if(++waited == 30)
        {
            fprintf(stderr, "File %s not available, Try later.\n\n", fname);
            return FALSE;
        }
    }
    fclose(fp);

    sprintf(locked, "%s.locked", fname);
    if(rename(fname, locked) != 0)
    {
        /* e.g. another process renamed the file between the check and here */
        fprintf(stderr, "Lock(): cannot rename %s to %s.\n", fname, locked);
        return FALSE;
    }
    return TRUE;
}


/*
 * Unlock() undoes Lock(): renames "<fname>.locked" back to `fname`.
 * A failure is reported on stderr.
 */
void Unlock(fname)
char *fname;
{
    char locked[1024];

    if(fname == NULL || strlen(fname) + sizeof(".locked") > sizeof(locked))
    {
        fprintf(stderr, "Unlock(): file name missing or too long.\n");
        return;
    }

    sprintf(locked, "%s.locked", fname);
    if(rename(locked, fname) != 0)
        fprintf(stderr, "Unlock(): cannot rename %s to %s.\n", locked, fname);
}


/*
 * AppendComments() appends the string `str` to seq->comments, growing
 * the buffer as needed and keeping commentslen / commentsmaxlen up to
 * date.  A record whose commentsmaxlen is 0 gets a fresh buffer.
 *
 * Returns TRUE on success, FALSE if seq or str is NULL.
 */
int AppendComments(seq, str)
Sequence *seq;
char *str;
{
    int add_len;

    if(seq == NULL || str == NULL)
        return FALSE;

    add_len = strlen(str);

    if(seq->commentsmaxlen == 0)
    {
        seq->comments = (char *)Calloc(add_len+1, 1);
        seq->commentsmaxlen = add_len+1;
        seq->commentslen = 0;
    }
    else if(seq->commentslen + add_len + 1 > seq->commentsmaxlen)
    {
        seq->commentsmaxlen += 2*(add_len+1);
        seq->comments = (char *)
            Realloc(seq->comments, seq->commentsmaxlen);
    }

    /* make sure the existing text is terminated before appending */
    seq->comments[seq->commentslen] = '\0';
    strcat(seq->comments, str);
    seq->commentslen = strlen(seq->comments);
    return TRUE;
}