/* * Check xdap database for errors */ #include #include /* IMPORT : tolower */ #include "misc.h" #include "seq.h" typedef char StringBuffer[200]; /* * Database io */ #ifdef BAP_VERSION /* * BAP version */ #include "bapIO.h" BapIO io; #define FILE_NAME_LENGTH BAP_FILE_NAME_LENGTH #define open_for_read bap_open_for_read #define close_files bap_close_files #define read_rl bap_read_rl #define read_sq bap_read_sq #define read_ar bap_read_ar #define read_tg bap_read_tg #define read_comment bap_read_comment #define rl_file_rec bap_rl_file_rec #define ar_file_rec bap_ar_file_rec #define tg_file_rec bap_tg_file_rec #else /* * DAP version */ #include "dapIO.h" DapIO io; #define FILE_NAME_LENGTH DAP_FILE_NAME_LENGTH #define open_for_read dap_open_for_read #define close_files dap_close_files #define read_rl dap_read_rl #define read_sq dap_read_sq #define read_ar dap_read_ar #define read_tg dap_read_tg #define read_comment dap_read_comment #define rl_file_rec dap_rl_file_rec #define ar_file_rec dap_ar_file_rec #define tg_file_rec dap_tg_file_rec #endif /* * Prototypes */ extern void *malloc(size_t size); extern void *calloc(size_t nobj, size_t size); extern char *getenv(char *s); /* * Global variables - to this file at least */ static int consensusCutoff; static int alignmentCutoff = 0; /* scores range from -2000 to +2000 */ static int bridge=2; /* was -200 */ static StringBuffer rawData; static StringBuffer contigName; static FILE *log_fp; #include "upam.gbl" #include "uascii.gbl" #include "llin.h" initpam2() /* * Initialise alignment routine */ { int i, j, k; pam = npam; nsq = naa; k=0; for (i=0; i= cut * scores[idm-1] ) return charsu(i); return '-'; } static void consensus_region(int contig, int start, int end, char *cseq, int cutoff) /* * Determine the consensus of a region in a contig * Adequate space must be allocated in cseq. * It used a memory intensive algorithm! (The score maxtrix is 7xlength of region!) */ { Scores *scores; rl_file_rec cline; rl_file_rec gline; char *gseq; int gel; int region_len; int i; region_len = end - start + 1; scores = (Scores *)calloc(1,region_len * sizeof(Scores)); read_rl(&io,contig,&cline); gel = cline.clines.left_end; gseq = malloc(io.max_gel_length+1); /* * find left-most gel in region */ read_rl(&io,gel,&gline); while (gel && gline.lines.rel_pos + abs(gline.lines.length) <= start) { if (gel = gline.lines.right_nbr) read_rl(&io,gel,&gline); } while (gel && gline.lines.rel_pos <= end) { int gstart, gend, goffset; char *s; read_sq(&io,gel,gseq); gseq[abs(gline.lines.length)] = '\0'; gstart = max(start, gline.lines.rel_pos) - gline.lines.rel_pos + 1; gend = min(end, gline.lines.rel_pos + abs(gline.lines.length) - 1) - gline.lines.rel_pos + 1; goffset = gline.lines.rel_pos - start - 1; for (s = &gseq[gstart-1];gstart<=gend;gstart++, s++) { int j,score; j = indexs(*s,&score); scores[gstart+goffset][j] += score; scores[gstart+goffset][6] += score; } if (gel = gline.lines.right_nbr) read_rl(&io,gel,&gline); } for (i=0;i0) { for(i=1-MM,score=0;i=0) score -= isUncertain(rawSeq[i]); if (i+MM-1= NN) break; } rcut = min(rcut,max(i,0)); } /* * Remove 3' crap from map */ for (i=rcut;i0) map[k] = ~map[k]; } } } static int pos_in_contig(int pos, int rel_pos, int length, int complemented) { if (complemented) return rel_pos + length - pos; else return rel_pos + pos - 1; } static int bases_equal(char a, char b) /* * Return true if bases a and b are equal. * This ignores the case of the bases */ { /* * Beware of non ANSI implementations of tolower() * that only work when argument isupper(). */ if (isupper(a)) a = tolower(a); if (isupper(b)) b = tolower(b); return (a == b); } static void region_covered(int start, int end, int *coverage) /* * Mark as covered, the region from start to end in the map */ { int i; for (i=start; i<=end+(1-bridge); i++) { /* * Setting coverage at position x means bases between x and x+(1-bridge) (inclusive) are covered */ coverage[i-1]++; } } static void compare_seq_update_coverage(int gel, char *seq, int length, char *rawSeq, int rawLength, int rel_pos, int complemented, int *coverage, int *map ) /* * Report the differences, insertions and deletions */ { int i; int mstart,mend; int gstart,gend; int cstart,cend; i = 0; while (i < rawLength) { /* * Skip over regions: * (a) not mapped because the quality of the trace was deemed too poor (*map < 0) * or (b) where deletions are indicated (*map == 0) * or (c) where bases disagree ( *rawSeq != seq[*map] ) */ while (i < rawLength && (map[i] <= 0 || ! bases_equal(rawSeq[i],seq[map[i]-1]) ) ) i++; /* * We have hit start of good coverage * Determine regions where there are no insertions or changes, * then mark them as covered */ if (i < rawLength) { int mstart = i; i++; while (i < rawLength && map[i] > 0) { if (map[i] != map[i-1]+1) { /* insertion detected */ break; } if ( ! bases_equal(rawSeq[i],seq[map[i]-1]) ) { /* change detected */ break; } i++; } mend = i-1; gstart = map[mstart]; gend = map[mend]; if (complemented) { cstart = pos_in_contig(gend,rel_pos,length,complemented); cend = pos_in_contig(gstart,rel_pos,length,complemented); } else { cstart = pos_in_contig(gstart,rel_pos,length,complemented); cend = pos_in_contig(gend,rel_pos,length,complemented); } region_covered(cstart,cend,coverage); i++; } } } static int make_alignment_map(int gel, char *seq, int length, char *rawSeq, int rawLength, int *map ) /* * Produce an alignment map which maps base positions in the raw sequence * to positions in the gel reading */ { int totalLength; int *res; /* used by alignment program */ int score; totalLength = length+rawLength; res = (int *) malloc(totalLength * sizeof(int)); score = DIFF(rawSeq-1,seq-1, rawLength,length, pam2, -gdelval, -ggapval, res); if (score > alignmentCutoff) { fprintf(log_fp,";Alignment for gel %d failed to meet cutoff score (%d/%d)\n", gel,score,alignmentCutoff); return 0; } /* * Use alignment results to produce an array * mapping trace to consensus positions */ /*************************************************************************/ { int mindex; int mvalue; int i, j, op; int *S, M, N; S = res; M = rawLength; N = length; i = j = op = 0; mindex = 0; mvalue = 1; while (i < M || j < N) { if (op == 0 && *S == 0) { map[mindex++] = mvalue; mvalue++; op = *S++; i++; j++; } else { if (op == 0) op = *S++; if (op > 0) { mvalue++; op--; j++; } else { map[mindex++] = 0; op++; i++; } } } } /*************************************************************************/ /*********** { (void) DISPLAY(rawSeq-1, seq-1, rawLength, length, res); printf("\n\n"); } ************/ free(res); return 1; } static void compare_sequences(/* This gel reading info */ int gel, char *seq, int length, int rel_pos, int complemented, /* This trace info */ Seq trace, int rawLCut, int rawLength, char *rawSeq, /* This cosmid info*/ int *coverage ) /* * Perform an alignment between the consensus sequence and the raw data. * Check only high quality bits and update coverage */ { int *map; map = (int *)malloc(rawLength*sizeof(int)); if (make_alignment_map(gel,seq,length,rawSeq,rawLength,map)) { select_good_raw_bits(trace,rawLCut,rawLength,rawSeq,map); compare_seq_update_coverage(gel,seq,length,rawSeq,rawLength,rel_pos, complemented,coverage,map); } free(map); } static void process_gel_reading(int gel, int length, int complemented, int rel_pos, char *cseq, int contig, int *coverage ) /* * Perform the quality check on a specific gel reading */ { ar_file_rec ar_rec; tg_file_rec tg_rec; char *rd; int rd_length; int rd_cut; int rd_ulen; char rd_type[5]; char rd_file[19]; /* * Read Name */ read_ar(&io,gel,&ar_rec); fprintf (log_fp,";%d %s\n",complemented?-gel:gel,ar_rec.lines.name); /* * Read Raw data file name */ read_tg(&io,gel,&tg_rec); if (tg_rec.lines.comment) { char *fullFileName; rd = (char *) read_comment(&io, tg_rec.lines.comment); sscanf(rd,"%6d%6d%6d%*s",&rd_length, &rd_cut, &rd_ulen); f2cstr(&rd[22],18,rd_file,18); f2cstr(&rd[18],4,rd_type,4); if ( (fullFileName = findfile(rd_file,rawData)) == NULL) { /* Trace file specified but not found */ fprintf(log_fp,";Trace file %s specified but not found for gel reading %d %s\n", rd_file,gel,ar_rec.lines.name); } else { Seq trace; trace = (Seq) getSeq(fullFileName,rd_type); if (trace == NULLSeq) { /* Trace file specified but not found */ fprintf(log_fp,";Error reading Trace file %s for gel reading %d %s\n", fullFileName,gel,ar_rec.lines.name); } else { char *seq; char *rawSeq; rawSeq = (char *)getSequence(trace); seq = (char *)malloc(length+1); strncpy(seq, &cseq[rel_pos-1],length); seq[length] = '\0'; if (complemented) { /* * Complement consensus sequence */ complement_zseq(seq); } compare_sequences(gel,seq,length,rel_pos,complemented, trace,rd_cut,rd_ulen,&rawSeq[rd_cut], coverage); freeSeq(trace); free(rawSeq); free(seq); } } free(rd); } else /* Skip */ fprintf(log_fp,";No raw data tag for gel reading %d %s\n",gel,ar_rec.lines.name); } static void print_coverage_gaps(int *coverage, int length) /* * Print out a list of places where there is not coverage */ { int starti,endi; int i; printf("Problem areas:\n"); fprintf(log_fp,"Problem areas:\n"); for (i=0;iio.num_gels); } int find_contig(int readNumber) /* * */ { int i; rl_file_rec cline; int contigNum; for (i=0; i