diff --git a/SUPPORT/CAP2.c b/SUPPORT/CAP2.c new file mode 100644 index 0000000..7c9733c --- /dev/null +++ b/SUPPORT/CAP2.c @@ -0,0 +1,2223 @@ +/* CONTIG ASSEMBLY PROGRAM (CAP) + + copyright (c) 1991 Xiaoqiu Huang + The distribution of the program is granted provided no charge + is made and the copyright notice is included. + + Proper attribution of the author as the source of the software + would be appreciated: + "A Contig Assembly Program Based on Sensitive Detection of + Fragment Overlaps" (submitted to Genomics, 1991) + Xiaoqiu Huang + Department of Computer Science + Michigan Technological University + Houghton, MI 49931 + E-mail: huang@cs.mtu.edu + + The CAP program uses a dynamic programming algorithm to compute + the maximal-scoring overlapping alignment between two fragments. + Fragments in random orientations are assembled into contigs by a + greedy approach in order of the overlap scores. CAP is efficient + in computer memory: a large number of arbitrarily long fragments + can be assembled. The time requirement is acceptable; for example, + CAP took 4 hours to assemble 1015 fragments of a total of 252 kb + nucleotides on a Sun SPARCstation SLC. The program is written in C + and runs on Sun workstations. + + Below is a description of the parameters in the #define section of CAP. + Two specially chosen sets of substitution scores and indel penalties + are used by the dynamic programming algorithm: heavy set for regions + of low sequencing error rates and light set for fragment ends of high + sequencing error rates. (Use integers only.) + Heavy set: Light set: + + MATCH = 2 MATCH = 2 + MISMAT = -6 LTMISM = -3 + EXTEND = 4 LTEXTEN = 2 + + In the initial assembly, any overlap must be of length at least OVERLEN, + and any overlap/containment must be of identity percentage at least + PERCENT. After the initial assembly, the program attempts to join + contigs together using weak overlaps. Two contigs are merged if the + score of the overlapping alignment is at least CUTOFF. The value for + CUTOFF is chosen according to the value for MATCH. + + DELTA is a parameter in necessary conditions for overlap/containment. + Those conditions are used to quickly reject pairs of fragments that + could not possibly have an overlap/containment relationship. + The dynamic programming algorithm is only applied to pairs of fragments + that pass the screening. A large value for DELTA means stringent + conditions, where the value for DELTA is a real number at least 8.0. + + POS5 and POS3 are fragment positions such that the 5' end between base 1 + and base POS5, and the 3' end after base POS3 are of high sequencing + error rates, say more than 5%. For mismatches and indels occurring in + the two ends, light penalties are used. + + A file of input fragments looks like: +>G019uabh +ATACATCATAACACTACTTCCTACCCATAAGCTCCTTTTAACTTGTTAAA +GTCTTGCTTGAATTAAAGACTTGTTTAAACACAAAAATTTAGAGTTTTAC +TCAACAAAAGTGATTGATTGATTGATTGATTGATTGATGGTTTACAGTAG +GACTTCATTCTAGTCATTATAGCTGCTGGCAGTATAACTGGCCAGCCTTT +AATACATTGCTGCTTAGAGTCAAAGCATGTACTTAGAGTTGGTATGATTT +ATCTTTTTGGTCTTCTATAGCCTCCTTCCCCATCCCCATCAGTCTTAATC +AGTCTTGTTACGTTATGACTAATCTTTGGGGATTGTGCAGAATGTTATTT +TAGATAAGCAAAACGAGCAAAATGGGGAGTTACTTATATTTCTTTAAAGC +>G028uaah +CATAAGCTCCTTTTAACTTGTTAAAGTCTTGCTTGAATTAAAGACTTGTT +TAAACACAAAATTTAGACTTTTACTCAACAAAAGTGATTGATTGATTGAT +TGATTGATTGATGGTTTACAGTAGGACTTCATTCTAGTCATTATAGCTGC +TGGCAGTATAACTGGCCAGCCTTTAATACATTGCTGCTTAGAGTCAAAGC +ATGTACTTAGAGTTGGTATGATTTATCTTTTTGGTCTTCTATAGCCTCCT +TCCCCATCCCATCAGTCT +>G022uabh +TATTTTAGAGACCCAAGTTTTTGACCTTTTCCATGTTTACATCAATCCTG +TAGGTGATTGGGCAGCCATTTAAGTATTATTATAGACATTTTCACTATCC +CATTAAAACCCTTTATGCCCATACATCATAACACTACTTCCTACCCATAA +GCTCCTTTTAACTTGTTAAAGTCTTGCTTGAATTAAAGACTTGTTTAAAC +ACAAAATTTAGACTTTTACTCAACAAAAGTGATTGATTGATTGATTGATT +GATTGAT +>G023uabh +AATAAATACCAAAAAAATAGTATATCTACATAGAATTTCACATAAAATAA +ACTGTTTTCTATGTGAAAATTAACCTAAAAATATGCTTTGCTTATGTTTA +AGATGTCATGCTTTTTATCAGTTGAGGAGTTCAGCTTAATAATCCTCTAC +GATCTTAAACAAATAGGAAAAAAACTAAAAGTAGAAAATGGAAATAAAAT +GTCAAAGCATTTCTACCACTCAGAATTGATCTTATAACATGAAATGCTTT +TTAAAAGAAAATATTAAAGTTAAACTCCCCTATTTTGCTCGTTTTTGCTT +ATCTAAAATACATTCTGCACAATCCCCAAAGATTGATCATACGTTAC +>G006uaah +ACATAAAATAAACTGTTTTCTATGTGAAAATTAACCTANNATATGCTTTG +CTTATGTTTAAGATGTCATGCTTTTTATCAGTTGAGGAGTTCAGCTTAAT +AATCCTCTAAGATCTTAAACAAATAGGAAAAAAACTAAAAGTAGAAAATG +GAAATAAAATGTCAAAGCATTTCTACCACTCAGAATTGATCTTATAACAT +GAAATGCTTTTTAAAAGAAAATATTAAAGTTAAACTCCCC + A string after ">" is the name of the following fragment. + Only the five upper-case letters A, C, G, T and N are allowed + to appear in fragment data. No other characters are allowed. + A common mistake is the use of lower case letters in a fragment. + + To run the program, type a command of form + + cap file_of_fragments + + The output goes to the terminal screen. So redirection of the + output into a file is necessary. The output consists of three parts: + overview of contigs at fragment level, detailed display of contigs + at nucleotide level, and consensus sequences. + '+' = direct orientation; '-' = reverse complement + The output of CAP on the sample input data looks like: + +#Contig 1 + +#G022uabh+(0) +TATTTTAGAGACCCAAGTTTTTGACCTTTTCCATGTTTACATCAATCCTGTAGGTGATTG +GGCAGCCATTTAAGTATTATTATAGACATTTTCACTATCCCATTAAAACCCTTTATGCCC +ATACATCATAACACTACTTCCTACCCATAAGCTCCTTTTAACTTGTTAAAGTCTTGCTTG +AATTAAAGACTTGTTTAAACACAAAA-TTTAGACTTTTACTCAACAAAAGTGATTGATTG +ATTGATTGATTGATTGAT +#G028uaah+(145) +CATAAGCTCCTTTTAACTTGTTAAAGTCTTGCTTGAATTAAAGACTTGTTTAAACACAAA +A-TTTAGACTTTTACTCAACAAAAGTGATTGATTGATTGATTGATTGATTGATGGTTTAC +AGTAGGACTTCATTCTAGTCATTATAGCTGCTGGCAGTATAACTGGCCAGCCTTTAATAC +ATTGCTGCTTAGAGTCAAAGCATGTACTTAGAGTTGGTATGATTTATCTTTTTGGTCTTC +TATAGCCTCCTTCCCCATCCC-ATCAGTCT +#G019uabh+(120) +ATACATCATAACACTACTTCCTACCCATAAGCTCCTTTTAACTTGTTAAAGTCTTGCTTG +AATTAAAGACTTGTTTAAACACAAAAATTTAGAGTTTTACTCAACAAAAGTGATTGATTG +ATTGATTGATTGATTGATGGTTTACAGTAGGACTTCATTCTAGTCATTATAGCTGCTGGC +AGTATAACTGGCCAGCCTTTAATACATTGCTGCTTAGAGTCAAAGCATGTACTTAGAGTT +GGTATGATTTATCTTTTTGGTCTTCTATAGCCTCCTTCCCCATCCCCATCAGTCTTAATC +AGTCTTGTTACGTTATGACT-AATCTTTGGGGATTGTGCAGAATGTTATTTTAGATAAGC +AAAA-CGAGCAAAAT-GGGGAGTT-A-CTT-A-TATTT-CTTT-AAA--GC +#G023uabh-(426) +GTAACGT-ATGA-TCAATCTTTGGGGATTGTGCAGAATGT-ATTTTAGATAAGCAAAAAC +GAGCAAAATAGGGGAGTTTAACTTTAATATTTTCTTTTAAAAAGCATTTCATGTTATAAG +ATCAATTCTGAGTGGTAGAAATGCTTTGACATTTTATTTCCATTTTCTACTTTTAGTTTT +TTTCCTATTTGTTTAAGATCGTAGAGGATTATTAAGCTGAACTCCTCAACTGATAAAAAG +CATGACATCTTAAACATAAGCAAAGCATATTTTTAGGTTAATTTTCACATAGAAAACAGT +TTATTTTATGTGAAATTCTATGTAGATATACTATTTTTTTGGTATTTATT +#G006uaah-(496) +GGGGAGTTTAACTTTAATATTTTCTTTTAAAAAGCATTTCATGTTATAAGATCAATTCTG +AGTGGTAGAAATGCTTTGACATTTTATTTCCATTTTCTACTTTTAGTTTTTTTCCTATTT +GTTTAAGATCTTAGAGGATTATTAAGCTGAACTCCTCAACTGATAAAAAGCATGACATCT +TAAACATAAGCAAAGCATATNNT-AGGTTAATTTTCACATAGAAAACAGTTTATTTTATG +T + + + +Slight modifications by S. Smith on Mon Feb 17 10:18:34 EST 1992. +These changes allow for command line arguements for several +of the hard coded parameters, as well as a slight modification to +the output routine to support GDE format. Changes are commented +as: + +Mod by S.S. + +*/ + +#include +#include +#include + +int OVERLEN; /* Minimum length of any overlap */ +float PERCENT; /* Minimum identity percentage of any overlap */ +#define CUTOFF 50 /* cutoff score for overlap or containment */ +#define DELTA 9.0 /* Jump increment in check for overlap */ +#define MATCH 2 /* score of a match */ +#define MISMAT -6 /* score of a mismatch */ +#define LTMISM -3 /* light score of a mismatch */ +#define OPEN 0 /* gap open penalty */ +#define EXTEND 4 /* gap extension penalty */ +#define LTEXTEN 2 /* light gap extension penalty */ +#define POS5 30 /* Sequencing errors often occur before base POS5 */ +#define POS3 475 /* Sequencing errors often occur after base POS3 */ +#define LINELEN 60 /* length of one printed line */ +#define NAMELEN 20 /* length of printed fragment name */ +#define TUPLELEN 4 /* Maximum length of words for lookup table */ +#define SEQLEN 2000 /* initial size of an array for an output fragment */ + +static int over_len; +static float per_cent; +typedef struct OVERLAP /* of 5' and 3' segments */ +{ + int number; /* index of 3' segment */ + int host; /* index of 5' segment */ + int ind; /* used in reassembly */ + int stari; /* start position of 5' suffix */ + int endi; /* end position of 5' suffix */ + int starj; /* start position of 3' prefix */ + int endj; /* end position of 3' prefix */ + short orienti; /* orientation of 5' segment: 0= rev. */ + short orientj; /* orientation of 3' segment: 0= rev. */ + int score; /* score of overlap alignment */ + int length; /* length of alignment */ + int match; /* number of matches in alignment */ + short kind; /* 0 = containment; 1 = overlap */ + int *script; /* script for constructing alignment */ + struct OVERLAP *next; +} over, *overptr; +struct SEG { + char *name; /* name string */ + int len; /* length of segment name */ + char *seq; /* segment sequence */ + char *rev; /* reverse complement */ + int length; /* length of sequence */ + short kind; /* 0 = contain; 1 = overlap */ + int *lookup; /* lookup table */ + int *pos; /* location list */ + overptr list; /* list of overlapping edges */ +} *segment; /* array of segment records */ +int seg_num; /* The number of segments */ +overptr *edge; /* set of overlapping edges */ +int edge_num; /* The number of overlaps */ +struct CONS /* 1 = itself; 0 = reverse complement */ +{ + short isfive[2]; /* is 5' end free? */ + short isthree[2]; /* is 3' end free? */ + short orient[2]; /* orientation of 3' segment */ + int group; /* contig serial number */ + int next[2]; /* pointer to 3' adjacent segment */ + int other; /* the other end of the contig */ + int child; /* for the containment tree */ + int brother; + int father; + overptr node[2]; /* pointers to overlapping edges */ +} *contigs; /* list of contigs */ +struct TTREE /* multiple alignment tree */ +{ + short head; /* head = 1 means the head of a contig */ + short orient; /* orientation */ + int begin; /* start position of previous segment */ + int *script; /* alignment script */ + int size; /* length of script */ + int next; /* list of overlap segments */ + int child; /* list of child segments */ + int brother; /* list of sibling segments */ +} *mtree; +int vert[128]; /* converted digits for 'A','C','G','T' */ +int vertc[128]; /* for reverse complement */ +int tuple; /* tuple = TUPLELEN - 1 */ +int base[TUPLELEN]; /* locations of a lookup table */ +int power[TUPLELEN]; /* powers of 4 */ +typedef struct OUT { + char *line; /* one print line */ + char *a; /* pointer to slot in line */ + char c; /* current char */ + char *seq; /* pointer to sequence */ + int length; /* length of segment */ + int id; /* index of segment */ + int *s; /* pointer to script vector */ + int size; /* size of script vector */ + int op; /* current operation */ + char name[NAMELEN + 2]; /* name of segment */ + short done; /* indicates if segment is done */ + int loc; /* position of next segment symbol */ + char kind; /* type of next symbol of segment */ + char up; /* type of upper symbol of operation */ + char dw; /* type of lower symbol of operation */ + int offset; /* relative to consensus sequence */ + int linesize; /* size of array line */ + struct OUT *child; /* pointer to child subtree */ + struct OUT *brother; /* pointer to brother node */ + struct OUT *link; /* for operation linked list */ + struct OUT *father; /* pointer to father node */ +} row, *rowptr; /* node for segment */ +rowptr *work; /* a set of working segments */ +rowptr head, tail; /* first and last nodes of op list */ +struct VX { + int id; /* Segment index */ + short kind; /* overlap or containment */ + overptr list; /* list of overlapping edges */ +} *piece; /* array of segment records */ +char *allconsen, *allconpt; /* Storing consensus sequences */ +char *ckalloc(size_t size); +main(argc, argv) int argc; +char *argv[]; +{ + int M; /* Sequence length */ + int V[128][128], Q, R; /* Weights */ + int V1[128][128], R1; /* Light weights */ + int total; /* Total of segment lengths */ + int number; /* The number of segments */ + char *sequence; /* Storing all segments */ + char *reverse; /* Storing all reverse segments */ + int symbol, prev; /* The next character */ + FILE *Ap, *ckopen(); /* For the sequence file */ + char *ckalloc(size_t amount); /* space-allocating function */ + register int i, j, k; /* index variables */ + /* Mod by S.S. */ + int jj; + short heading; /* 1: heading; 0: sequence */ + + /* +* Mod by S.S. +* + if ( argc != 2 ) + fatalf("The proper form of command is: \n%s file_of_fragments", + argv[0]); +*/ + if (argc != 4) + fatalf("usage: %s file_of_fragments MIN_OVERLAP PERCENT_MATCH", + argv[0]); + sscanf(argv[2], "%d", &OVERLEN); + sscanf(argv[3], "%d", &jj); + PERCENT = (float)jj / 100.0; + if (PERCENT < 0.25) PERCENT = 0.25; + if (PERCENT > 1.0) PERCENT = 1.0; + if (OVERLEN < 1) OVERLEN = 1; + if (OVERLEN > 100) OVERLEN = 100; + + /* determine number of segments and total lengths */ + j = 0; + + Ap = ckopen(argv[1], "r"); + prev = '\n'; + for (total = 3, number = 0; (symbol = getc(Ap)) != EOF; total++) { + if (symbol == '>' && prev == '\n') number++; + prev = symbol; + } + (void)fclose(Ap); + + total += number * 20; + /* allocate space for segments */ + sequence = (char *)ckalloc(total * sizeof(char)); + reverse = (char *)ckalloc(total * sizeof(char)); + allconpt = allconsen = (char *)ckalloc(total * sizeof(char)); + segment = (struct SEG *)ckalloc(number * sizeof(struct SEG)); + + /* read the segments into sequence */ + M = 0; + Ap = ckopen(argv[1], "r"); + number = -1; + heading = 0; + prev = '\n'; + for (i = 0, k = total; (symbol = getc(Ap)) != EOF;) { + if (symbol != '\n') { + sequence[++i] = symbol; + switch (symbol) { + case 'A': + reverse[--k] = 'T'; + break; + case 'a': + reverse[--k] = 't'; + break; + case 'T': + reverse[--k] = 'A'; + break; + case 't': + reverse[--k] = 'a'; + break; + case 'C': + reverse[--k] = 'G'; + break; + case 'c': + reverse[--k] = 'g'; + break; + case 'G': + reverse[--k] = 'C'; + break; + case 'g': + reverse[--k] = 'c'; + break; + default: + reverse[--k] = symbol; + break; + } + } + if (symbol == '>' && prev == '\n') { + heading = 1; + if (number >= 0) { + segment[number].length = i - j - 1; + segment[number].rev = &(reverse[k]); + if (i - j - 1 > M) M = i - j - 1; + } + number++; + j = i; + segment[number].name = &(sequence[i]); + segment[number].kind = 1; + segment[number].list = 0; + } + if (heading && symbol == '\n') { + heading = 0; + segment[number].len = i - j; + segment[number].seq = &(sequence[i]); + j = i; + } + + prev = symbol; + } + segment[number].length = i - j; + reverse[--k] = '>'; + segment[number].rev = &(reverse[k]); + if (i - j > M) M = i - j; + seg_num = ++number; + (void)fclose(Ap); + + Q = OPEN; + R = EXTEND; + R1 = LTEXTEN; + /* set match and mismatch weights */ + for (i = 0; i < 128; i++) + for (j = 0; j < 128; j++) + if ((i | 32) == (j | 32)) + V[i][j] = V1[i][j] = MATCH; + else { + V[i][j] = MISMAT; + V1[i][j] = LTMISM; + } + for (i = 0; i < 128; i++) V['N'][i] = V[i]['N'] = MISMAT + 1; + V1['N']['N'] = MISMAT + 1; + + over_len = OVERLEN; + per_cent = PERCENT; + edge_num = 0; + INIT(M); + MAKE(); + PAIR(V, V1, Q, R, R1); + ASSEM(); + REPAIR(); + FORM_TREE(); + /* GRAPH(); */ + SHOW(); +} + +static int (*v)[128]; /* substitution scores */ +static int q, r; /* gap penalties */ +static int qr; /* qr = q + r */ +static int (*v1)[128]; /* light substitution scores */ +static int r1; /* light extension penalty */ +static int qr1; /* qr1 = q + r1 */ + +static int SCORE; +static int STARI; +static int STARJ; +static int ENDI; +static int ENDJ; + +static int *CC, *DD; /* saving matrix scores */ +static int *RR, *SS; /* saving start-points */ +static int *S; /* saving operations for diff */ + +/* The following definitions are for function diff() */ + +int diff(); +static int zero = 0; /* int type zero */ + +#define gap(k) ((k) <= 0 ? 0 : q + r * (k)) /* k-symbol indel score */ + +static int *sapp; /* Current script append ptr */ +static int last; /* Last script op appended */ + +static int no_mat; /* number of matches */ + +static int no_mis; /* number of mismatches */ + +static int al_len; /* length of alignment */ +/* Append "Delete k" op */ +#define DEL(k) \ + { \ + al_len += k; \ + if (last < 0) \ + last = sapp[-1] -= (k); \ + else \ + last = *sapp++ = -(k); \ + } +/* Append "Insert k" op */ +#define INS(k) \ + { \ + al_len += k; \ + if (last < 0) { \ + sapp[-1] = (k); \ + *sapp++ = last; \ + } \ + else \ + last = *sapp++ = (k); \ + } + +/* Append "Replace" op */ +#define REP \ + { \ + last = *sapp++ = 0; \ + al_len += 1; \ + } + +INIT(M) int M; +{ + register int j; /* row and column indices */ + char *ckalloc(); /* space-allocating function */ + + /* allocate space for all vectors */ + j = (M + 1) * sizeof(int); + CC = (int *)ckalloc(j); + DD = (int *)ckalloc(j); + RR = (int *)ckalloc(j); + SS = (int *)ckalloc(j); + S = (int *)ckalloc(2 * j); +} + +/* Make a lookup table for words of lengths up to TUPLELEN in each sequence. + The value of a word is used as an index to the table. */ +MAKE() +{ + int hash[TUPLELEN]; /* values of words of lengths up to TUPLELEN */ + int *table; /* pointer to a lookup table */ + int *loc; /* pointer to a table of sequence locations */ + int size; /* size of a lookup table */ + int limit, digit, step; /* temporary variables */ + register int i, j, k, p, q; /* index varibles */ + char *ckalloc(); /* space-allocating function */ + char *A; /* pointer to a sequence */ + int M; /* length of a sequence */ + + tuple = TUPLELEN - 1; + for (i = 0; i < 128; i++) vert[i] = 4; + vert['A'] = vert['a'] = 0; + vert['C'] = vert['c'] = 1; + vert['G'] = vert['g'] = 2; + vert['T'] = vert['t'] = 3; + vertc['A'] = vertc['a'] = 3; + vertc['C'] = vertc['c'] = 2; + vertc['G'] = vertc['g'] = 1; + vertc['T'] = vertc['t'] = 0; + for (i = 0, j = 1, size = 0; i <= tuple; i++, j *= 4) { + base[i] = size; + power[i] = j; + size = (size + 1) * 4; + } + for (j = 0; j <= tuple; j++) hash[j] = 0; + /* make a lookup table for each sequence */ + for (i = 0; i < seg_num; i++) { + A = segment[i].seq; + M = segment[i].length; + table = segment[i].lookup = (int *)ckalloc(size * sizeof(int)); + loc = segment[i].pos = (int *)ckalloc((M + 1) * sizeof(int)); + for (j = 0; j < size; j++) table[j] = 0; + for (k = 0, j = 1; j <= M; j++) + if ((digit = vert[A[j]]) != 4) { + for (p = tuple; p > 0; p--) + hash[p] = 4 * (hash[p - 1] + 1) + digit; + hash[0] = digit; + step = j - k; + limit = tuple <= step ? tuple : step; + for (p = 0; p < limit; p++) + if (!table[(q = hash[p])]) table[q] = 1; + if (step > tuple) { + loc[(p = j - tuple)] = + table[(q = hash[tuple])]; + table[q] = p; + } + } + else + k = j; + } +} + +/* Perform pair-wise comparisons of sequences. The sequences not + satisfying the necessary condition for overlap are rejected quickly. + Those that do satisfy the condition are verified with a dynamic + programming algorithm to see if overlaps exist. */ +PAIR(V, V1, Q, R, R1) int V[][128], V1[][128], Q, R, R1; +{ + int endi, endj, stari, starj; /* endpoint and startpoint */ + + short orienti, orientj; /* orientation of segments */ + short iscon; /* containment condition */ + int score; /* the max score */ + int count, limit; /* temporary variables */ + + register int i, j, d; /* row and column indices */ + char *ckalloc(); /* space-allocating function */ + int rl, cl; + char *A, *B; + int M, N; + overptr node1; + int total; /* total number of pairs */ + int hit; /* number of pairs satisfying cond. */ + int CHECK(); + + v = V; + q = Q; + r = R; + qr = q + r; + v1 = V1; + r1 = R1; + qr1 = q + r1; + total = hit = 0; + limit = 2 * (seg_num - 1); + for (orienti = 0, d = 0; d < limit; d++) { + i = d / 2; + orienti = 1 - orienti; + A = orienti ? segment[i].seq : segment[i].rev; + M = segment[i].length; + for (j = i + 1; j < seg_num; j++) { + B = segment[j].seq; + orientj = 1; + N = segment[j].length; + total += 1; + if (CHECK(orienti, i, j)) { + hit += 1; + SCORE = 0; + big_pass(A, B, M, N, orienti, orientj); + if (SCORE) { + score = SCORE; + stari = ++STARI; + starj = ++STARJ; + endi = ENDI; + endj = ENDJ; + rl = endi - stari + 1; + cl = endj - starj + 1; + sapp = S; + last = 0; + al_len = no_mat = no_mis = 0; + (void)diff(&A[stari] - 1, &B[starj] - 1, + rl, cl, q, q); + iscon = stari == 1 && endi == M || + starj == 1 && endj == N; + if (no_mat >= al_len * per_cent && + (al_len >= over_len || iscon)) { + node1 = (overptr)ckalloc( + (int)sizeof(over)); + if (iscon) + node1->kind = + 0; /* containment */ + else { + node1->kind = 1; + edge_num++; + } /* overlap */ + if (endi == M && + (endj != N || + starj != 1)) /*i is 5'*/ + { + node1->number = j; + node1->host = i; + node1->stari = stari; + node1->endi = endi; + node1->orienti = + orienti; + node1->starj = starj; + node1->endj = endj; + node1->orientj = + orientj; + } + else /* j is 5' */ + { + node1->number = i; + node1->host = j; + node1->stari = starj; + node1->endi = endj; + node1->orienti = + orientj; + node1->starj = stari; + node1->endj = endi; + node1->orientj = + orienti; + } + node1->score = score; + node1->length = al_len; + node1->match = no_mat; + count = + node1->number == i ? j : i; + node1->next = + segment[count].list; + segment[count].list = node1; + if (!node1->kind) + segment[count].kind = 0; + } + } + } + } + } +} + +/* Return 1 if two sequences satisfy the necessary condition for overlap, + and 0 otherwise. Parameters first and second are the indices of segments, + and parameter orient indicates the orientation of segment first. */ +int CHECK(orient, first, second) +short orient; +int first, second; +{ + int limit, bound; /* maximum number of jumps */ + int small; /* smaller of limit and bound */ + float delta; /* cutoff factor */ + float cut; /* cutoff score */ + register int i; /* index variable */ + int t, q; /* temporary variables */ + int JUMP(); + int RJUMP(); + int JUMPC(); + int RJUMPC(); + + delta = DELTA; + if (orient) + limit = JUMP(CC, first, second, 1); + else + limit = JUMPC(CC, first, second); + bound = RJUMP(DD, second, first, orient); + small = limit <= bound ? limit : bound; + cut = 0; + for (i = 1; i <= small; i++) { + if ((t = DD[i] - 1) >= over_len && t >= cut && + (q = CC[i] - 1) >= over_len && q >= cut) + return (1); + cut += delta; + } + if (DD[bound] >= delta * (bound - 1) || + CC[limit] >= delta * (limit - 1)) + return (1); + limit = JUMP(CC, second, first, orient); + if (orient) + bound = RJUMP(DD, first, second, 1); + else + bound = RJUMPC(DD, first, second); + small = limit <= bound ? limit : bound; + cut = 0; + for (i = 1; i <= small; i++) { + if ((t = DD[i] - 1) >= over_len && t >= cut && + (q = CC[i] - 1) >= over_len && q >= cut) + return (1); + cut += delta; + } + return (0); +} + +/* Compute a vector of lengths of jumps */ +int JUMP(H, one, two, orient) +int H[], one, two; +short orient; +{ + char *A, *B; /* pointers to two sequences */ + int M, N; /* lengths of two sequences */ + int *table; /* pointer to a lookup table */ + int *loc; /* pointer to a location table */ + int value; /* value of a word */ + int maxd; /* maximum length of an identical diagonal */ + int d; /* length of current identical diagonal */ + int s; /* length of jumps */ + int k; /* number of jumps */ + + register int i, j, p; /* index variables */ + + A = segment[one].seq; + M = segment[one].length; + table = segment[one].lookup; + loc = segment[one].pos; + B = orient ? segment[two].seq : segment[two].rev; + N = segment[two].length; + for (s = 1, k = 1; s <= N; k++) { + maxd = 0; + for (value = -1, d = 0, j = s; d <= tuple && j <= N; j++, d++) { + if (vert[B[j]] == 4) break; + value = 4 * (value + 1) + vert[B[j]]; + if (!table[value]) break; + } + if (d > tuple) { + for (p = table[value]; p; p = loc[p]) { + d = tuple + 1; + for (i = p + d, j = s + d; i <= M && j <= N; + i++, j++, d++) + if (A[i] != B[j] && vert[A[i]] != 4 && + vert[B[j]] != 4) + break; + if (maxd < d) maxd = d; + if (j > N) break; + } + } + else + maxd = d; + s += maxd + 1; + H[k] = s; + } + return (k - 1); +} + +/* Compute a vector of lengths of jumps for reverse complement of one */ +int JUMPC(H, one, two) +int H[], one, two; +{ + char *A, *B; /* pointers to two sequences */ + int M, N; /* lengths of two sequences */ + int *table; /* pointer to a lookup table */ + int *loc; /* pointer to a location table */ + int value; /* value of a word */ + int maxd; /* maximum length of an identical diagonal */ + int d; /* length of current identical diagonal */ + int s; /* length of jumps */ + int k; /* number of jumps */ + + register int i, j, p; /* index variables */ + + A = segment[one].rev; + M = segment[one].length; + table = segment[one].lookup; + loc = segment[one].pos; + B = segment[two].seq; + N = segment[two].length; + for (s = 1, k = 1; s <= N; k++) { + maxd = 0; + for (value = 0, d = 0, j = s; d <= tuple && j <= N; j++, d++) { + if (vert[B[j]] == 4) break; + value += vertc[B[j]] * power[d]; + if (!table[value + base[d]]) break; + } + if (d > tuple) { + for (p = table[value + base[tuple]]; p; p = loc[p]) { + d = tuple + 1; + for (i = M + 2 - p, j = s + d; i <= M && j <= N; + i++, j++, d++) + if (A[i] != B[j] && vert[A[i]] != 4 && + vert[B[j]] != 4) + break; + if (maxd < d) maxd = d; + if (j > N) break; + } + } + else + maxd = d; + s += maxd + 1; + H[k] = s; + } + return (k - 1); +} + +/* Compute a vector of lengths of reverse jumps */ +int RJUMP(H, one, two, orient) +int H[], one, two; +short orient; +{ + char *A, *B; /* pointers to two sequences */ + int N; /* length of a sequence */ + int *table; /* pointer to a lookup table */ + int *loc; /* pointer to a location table */ + int value; /* value of a word */ + int maxd; /* maximum length of an identical diagonal */ + int d; /* length of current identical diagonal */ + int s; /* length of jumps */ + int k; /* number of jumps */ + + register int i, j, p; /* index variables */ + + A = segment[one].seq; + table = segment[one].lookup; + loc = segment[one].pos; + B = orient ? segment[two].seq : segment[two].rev; + N = segment[two].length; + for (s = 1, k = 1; s <= N; k++) { + maxd = 0; + for (value = 0, d = 0, j = N + 1 - s; d <= tuple && j >= 1; + j--, d++) { + if (vert[B[j]] == 4) break; + value += vert[B[j]] * power[d]; + if (!table[value + base[d]]) break; + } + if (d > tuple) { + for (p = table[value + base[tuple]]; p; p = loc[p]) { + d = tuple + 1; + for (i = p - 1, j = N + 1 - s - d; + i >= 1 && j >= 1; i--, j--, d++) + if (A[i] != B[j] && vert[A[i]] != 4 && + vert[B[j]] != 4) + break; + if (maxd < d) maxd = d; + if (j < 1) break; + } + } + else + maxd = d; + s += maxd + 1; + H[k] = s; + } + return (k - 1); +} + +/* Compute a vector of lengths of reverse jumps for reverse complement */ +int RJUMPC(H, one, two) +int H[], one, two; +{ + char *A, *B; /* pointers to two sequences */ + int M, N; /* lengths of two sequences */ + int *table; /* pointer to a lookup table */ + int *loc; /* pointer to a location table */ + int value; /* value of a word */ + int maxd; /* maximum length of an identical diagonal */ + int d; /* length of current identical diagonal */ + int s; /* length of jumps */ + int k; /* number of jumps */ + + register int i, j, p; /* index variables */ + + A = segment[one].rev; + M = segment[one].length; + table = segment[one].lookup; + loc = segment[one].pos; + B = segment[two].seq; + N = segment[two].length; + for (s = 1, k = 1; s <= N; k++) { + maxd = 0; + for (value = -1, d = 0, j = N + 1 - s; d <= tuple && j >= 1; + j--, d++) { + if (vert[B[j]] == 4) break; + value = 4 * (value + 1) + vertc[B[j]]; + if (!table[value]) break; + } + if (d > tuple) { + for (p = table[value]; p; p = loc[p]) { + d = tuple + 1; + i = M - p - tuple; + for (j = N - s - tuple; i >= 1 && j >= 1; + i--, j--, d++) + if (A[i] != B[j] && vert[A[i]] != 4 && + vert[B[j]] != 4) + break; + if (maxd < d) maxd = d; + if (j < 1) break; + } + } + else + maxd = d; + s += maxd + 1; + H[k] = s; + } + return (k - 1); +} + +/* Construct contigs */ +ASSEM() +{ + char *ckalloc(); /* space-allocating function */ + register int i, j, k; /* index variables */ + overptr node1, x, y; /* temporary pointer */ + int five, three; /* indices of 5' and 3' segments */ + short orienti; /* orientation of 5' segment */ + short orientj; /* orientation of 3' segment */ + short sorted; /* boolean variable */ + + contigs = (struct CONS *)ckalloc(seg_num * sizeof(struct CONS)); + for (i = 0; i < seg_num; i++) { + contigs[i].isfive[0] = contigs[i].isthree[0] = 1; + contigs[i].isfive[1] = contigs[i].isthree[1] = 1; + contigs[i].other = i; + contigs[i].group = contigs[i].child = -1; + contigs[i].brother = contigs[i].father = -1; + } + for (i = 0; i < seg_num; i++) + if (!segment[i].kind) + for (;;) { + for (y = segment[i].list; y->kind; y = y->next) + ; + for (x = y->next; x != 0; x = x->next) + if (!x->kind && x->score > y->score) + y = x; + for (j = y->number; + (k = contigs[j].father) != -1; j = k) + ; + if (j != i) { + contigs[i].father = j = y->number; + contigs[i].brother = contigs[j].child; + contigs[j].child = i; + contigs[i].node[1] = y; + break; + } + else { + if (segment[i].list->number == + y->number) + segment[i].list = y->next; + else { + for (x = segment[i].list; + x->next->number != + y->number;) + x = x->next; + x->next = y->next; + } + for (x = segment[i].list; + x != 0 && x->kind; x = x->next) + ; + if (x == 0) { + segment[i].kind = 1; + break; + } + } + } + edge = (overptr *)ckalloc(edge_num * sizeof(overptr)); + for (j = 0, i = 0; i < seg_num; i++) + if (segment[i].kind) + for (node1 = segment[i].list; node1 != 0; + node1 = node1->next) + if (segment[node1->number].kind) + edge[j++] = node1; + edge_num = j; + for (i = edge_num - 1; i > 0; i--) { + sorted = 1; + for (j = 0; j < i; j++) + if (edge[j]->score < edge[j + 1]->score) { + node1 = edge[j]; + edge[j] = edge[j + 1]; + edge[j + 1] = node1; + sorted = 0; + } + if (sorted) break; + } + for (k = 0; k < edge_num; k++) { + five = edge[k]->host; + three = edge[k]->number; + orienti = edge[k]->orienti; + orientj = edge[k]->orientj; + if (contigs[five].isthree[orienti] && + contigs[three].isfive[orientj] && + contigs[five].other != three) { + contigs[five].isthree[orienti] = 0; + contigs[three].isfive[orientj] = 0; + contigs[five].next[orienti] = three; + contigs[five].orient[orienti] = orientj; + contigs[five].node[orienti] = edge[k]; + contigs[three].isthree[(j = 1 - orientj)] = 0; + contigs[five].isfive[(i = 1 - orienti)] = 0; + contigs[three].next[j] = five; + contigs[three].orient[j] = i; + contigs[three].node[j] = edge[k]; + i = contigs[three].other; + j = contigs[five].other; + contigs[i].other = j; + contigs[j].other = i; + } + } +} + +REPAIR() +{ + int endi, endj, stari, starj; /* endpoint and startpoint */ + + short orienti, orientj; /* orientation of segments */ + short isconi, isconj; /* containment condition */ + int score; /* the max score */ + int i, j, f, d, e; /* row and column indices */ + char *ckalloc(); /* space-allocating function */ + char *A, *B; + int M, N; + overptr node1; + int piece_num; /* The number of pieces */ + int count, limit; + int number; + int hit; + + piece = (struct VX *)ckalloc(seg_num * sizeof(struct VX)); + for (j = 0, i = 0; i < seg_num; i++) + if (segment[i].kind && + (contigs[i].isfive[1] || contigs[i].isfive[0])) + piece[j++].id = i; + piece_num = j; + for (i = 0; i < piece_num; i++) { + piece[i].kind = 1; + piece[i].list = 0; + } + limit = 2 * (piece_num - 1); + hit = number = 0; + for (orienti = 0, d = 0; d < limit; d++) { + i = piece[(e = d / 2)].id; + orienti = 1 - orienti; + A = orienti ? segment[i].seq : segment[i].rev; + M = segment[i].length; + for (f = e + 1; f < piece_num; f++) { + j = piece[f].id; + B = segment[j].seq; + orientj = 1; + N = segment[j].length; + SCORE = 0; + hit++; + big_pass(A, B, M, N, orienti, orientj); + if (SCORE > CUTOFF) { + score = SCORE; + stari = ++STARI; + starj = ++STARJ; + endi = ENDI; + endj = ENDJ; + isconi = stari == 1 && endi == M; + isconj = starj == 1 && endj == N; + node1 = (overptr)ckalloc((int)sizeof(over)); + if (isconi || isconj) + node1->kind = 0; /* containment */ + else { + node1->kind = 1; + number++; + } /* overlap */ + if (endi == M && !isconj) /*i is 5'*/ + { + node1->number = j; + node1->host = i; + node1->ind = f; + node1->stari = stari; + node1->endi = endi; + node1->orienti = orienti; + node1->starj = starj; + node1->endj = endj; + node1->orientj = orientj; + } + else /* j is 5' */ + { + node1->number = i; + node1->host = j; + node1->ind = e; + node1->stari = starj; + node1->endi = endj; + node1->orienti = orientj; + node1->starj = stari; + node1->endj = endi; + node1->orientj = orienti; + } + node1->score = score; + count = node1->number == i ? f : e; + node1->next = piece[count].list; + piece[count].list = node1; + if (!node1->kind) piece[count].kind = 0; + } + } + } + REASSEM(piece_num, number); +} + +/* Construct contigs */ +REASSEM(piece_num, number) int piece_num, number; +{ + char *ckalloc(); /* space-allocating function */ + int i, j, k, d; /* index variables */ + overptr node1, x, y; /* temporary pointer */ + int five, three; /* indices of 5' and 3' segments */ + short orienti; /* orientation of 5' segment */ + short orientj; /* orientation of 3' segment */ + short sorted; /* boolean variable */ + + for (d = 0; d < piece_num; d++) + if (!piece[d].kind) + for (i = piece[d].id;;) { + for (y = piece[d].list; y->kind; y = y->next) + ; + for (x = y->next; x != 0; x = x->next) + if (!x->kind && x->score > y->score) + y = x; + for (j = y->number; + (k = contigs[j].father) != -1; j = k) + ; + if (j != i && + RECONCILE(y, &piece_num, &number)) { + contigs[i].father = j = y->number; + contigs[i].brother = contigs[j].child; + contigs[j].child = i; + contigs[i].node[1] = y; + segment[i].kind = 0; + break; + } + else { + if (piece[d].list->number == y->number) + piece[d].list = y->next; + else { + for (x = piece[d].list; + x->next->number != + y->number;) + x = x->next; + x->next = y->next; + } + for (x = piece[d].list; + x != 0 && x->kind; x = x->next) + ; + if (x == 0) { + piece[d].kind = 1; + break; + } + } + } + if (number > edge_num) + edge = (overptr *)ckalloc(number * sizeof(overptr)); + for (j = 0, d = 0; d < piece_num; d++) + if (piece[d].kind) + for (node1 = piece[d].list; node1 != 0; + node1 = node1->next) + if (piece[node1->ind].kind) edge[j++] = node1; + edge_num = j; + for (i = edge_num - 1; i > 0; i--) { + sorted = 1; + for (j = 0; j < i; j++) + if (edge[j]->score < edge[j + 1]->score) { + node1 = edge[j]; + edge[j] = edge[j + 1]; + edge[j + 1] = node1; + sorted = 0; + } + if (sorted) break; + } + for (k = 0; k < edge_num; k++) { + five = edge[k]->host; + three = edge[k]->number; + orienti = edge[k]->orienti; + orientj = edge[k]->orientj; + if (contigs[five].isthree[orienti] && + contigs[three].isfive[orientj] && + contigs[five].other != three) { + contigs[five].isthree[orienti] = 0; + contigs[three].isfive[orientj] = 0; + contigs[five].next[orienti] = three; + contigs[five].orient[orienti] = orientj; + contigs[five].node[orienti] = edge[k]; + contigs[three].isthree[(j = 1 - orientj)] = 0; + contigs[five].isfive[(i = 1 - orienti)] = 0; + contigs[three].next[j] = five; + contigs[three].orient[j] = i; + contigs[three].node[j] = edge[k]; + i = contigs[three].other; + j = contigs[five].other; + contigs[i].other = j; + contigs[j].other = i; + } + } +} + +RECONCILE(y, pp, nn) overptr y; +int *pp, *nn; +{ + short orienti, orientj; /* orientation of segments */ + short orientk, orientd; /* orientation of segments */ + int i, j, k, d, f; /* row and column indices */ + char *ckalloc(); /* space-allocating function */ + char *A, *B; + int M, N; + overptr node1; + + k = y->host; + d = y->number; + orientk = y->orienti; + orientd = y->orientj; + if (!contigs[k].isthree[orientk]) { + if (!piece[y->ind].kind) return (0); + if (contigs[d].isthree[orientd]) { + orienti = orientd; + i = d; + orientj = contigs[k].orient[orientk]; + j = contigs[k].next[orientk]; + } + else + return (0); + } + else if (!contigs[k].isfive[orientk]) { + if (!piece[y->ind].kind) return (0); + if (contigs[d].isfive[orientd]) { + orienti = contigs[k].orient[1 - orientk]; + orienti = 1 - orienti; + i = contigs[k].next[1 - orientk]; + orientj = orientd; + j = d; + } + else + return (0); + } + else + return (0); + A = orienti ? segment[i].seq : segment[i].rev; + M = segment[i].length; + B = orientj ? segment[j].seq : segment[j].rev; + N = segment[j].length; + SCORE = 0; + big_pass(A, B, M, N, orienti, orientj); + if (SCORE > CUTOFF && ENDI - STARI > over_len && ENDI == M && + STARJ == 0) { + node1 = (overptr)ckalloc((int)sizeof(over)); + node1->kind = 1; + node1->host = i; + node1->number = j; + node1->stari = ++STARI; + node1->endi = ENDI; + node1->orienti = orienti; + node1->starj = ++STARJ; + node1->endj = ENDJ; + node1->orientj = orientj; + node1->score = SCORE; + piece[*pp].kind = 1; + if (i == d) { + node1->ind = *pp; + node1->next = piece[y->ind].list; + piece[y->ind].list = node1; + piece[*pp].id = j; + piece[*pp].list = 0; + } + else { + node1->ind = y->ind; + piece[*pp].list = node1; + node1->next = 0; + piece[*pp].id = i; + } + (*nn)++; + (*pp)++; + f = contigs[k].other; + if (!contigs[k].isthree[orientk]) { + contigs[j].isfive[orientj] = 1; + contigs[j].isthree[1 - orientj] = 1; + contigs[k].isthree[orientk] = 1; + contigs[k].isfive[1 - orientk] = 1; + contigs[f].other = j; + contigs[j].other = f; + } + else { + contigs[i].isthree[orienti] = 1; + contigs[i].isfive[1 - orienti] = 1; + contigs[k].isfive[orientk] = 1; + contigs[k].isthree[1 - orientk] = 1; + contigs[f].other = i; + contigs[i].other = f; + } + contigs[k].other = k; + return (1); + } + return (0); +} + +/* Construct a tree of overlapping-containment segments */ +FORM_TREE() +{ + register int i, j, k; /* index variables */ + char *ckalloc(); /* space-allocating function */ + overptr node1; /* temporary pointer */ + short orient; /* orientation of segment */ + int group; /* serial number of contigs */ + char *A, *B; /* pointers to segment sequences */ + int stari, endi, starj, endj; /* positions where alignment begins */ + int M, N; /* lengths of segment sequences */ + int count; /* temporary variables */ + + mtree = (struct TTREE *)ckalloc(seg_num * sizeof(struct TTREE)); + for (i = 0; i < seg_num; i++) { + mtree[i].head = 0; + mtree[i].next = mtree[i].child = mtree[i].brother = -1; + } + for (group = 0, i = 0; i < seg_num; i++) + if (segment[i].kind && contigs[i].group < 0 && + (contigs[i].isfive[1] || contigs[i].isfive[0])) { + orient = contigs[i].isfive[1] ? 1 : 0; + mtree[i].head = 1; + for (j = i;;) { + contigs[j].group = group; + mtree[j].orient = orient; + SORT(j, orient); + if (contigs[j].isthree[orient]) + break; + else { + k = contigs[j].next[orient]; + node1 = contigs[j].node[orient]; + if (j == node1->host) { + stari = node1->stari; + endi = node1->endi; + starj = node1->starj; + endj = node1->endj; + A = node1->orienti + ? segment[j].seq + : segment[j].rev; + B = node1->orientj + ? segment[k].seq + : segment[k].rev; + } + else { + M = segment[j].length; + stari = M + 1 - node1->endj; + endi = M + 1 - node1->starj; + N = segment[k].length; + starj = N + 1 - node1->endi; + endj = N + 1 - node1->stari; + A = node1->orientj + ? segment[j].rev + : segment[j].seq; + B = node1->orienti + ? segment[k].rev + : segment[k].seq; + } + M = endi - stari + 1; + N = endj - starj + 1; + sapp = S; + last = 0; + al_len = no_mat = no_mis = 0; + (void)diff(&A[stari] - 1, &B[starj] - 1, + M, N, q, q); + count = + ((N = sapp - S) + 1) * sizeof(int); + mtree[k].script = (int *)ckalloc(count); + for (M = 0; M < N; M++) + mtree[k].script[M] = S[M]; + mtree[k].size = N; + mtree[k].begin = stari; + mtree[j].next = k; + orient = contigs[j].orient[orient]; + j = k; + } + } + group++; + } +} + +/* Sort the children of each node by the `begin' field */ +SORT(seg, ort) int seg; +short ort; +{ + register int i, j, k; /* index variables */ + char *ckalloc(); /* space-allocating function */ + overptr node1; /* temporary pointer */ + short orient; /* orientation of segment */ + char *A, *B; /* pointers to segment sequences */ + int stari, endi, starj, endj; /* positions where alignment begins */ + int M, N; /* lengths of segment sequences */ + int count; /* temporary variables */ + + for (j = contigs[seg].child; j != -1; j = contigs[j].brother) { + node1 = contigs[j].node[1]; + if (ort == node1->orientj) { + stari = node1->starj; + endi = node1->endj; + starj = node1->stari; + endj = node1->endi; + A = node1->orientj ? segment[seg].seq + : segment[seg].rev; + B = node1->orienti ? segment[j].seq : segment[j].rev; + orient = node1->orienti; + } + else { + M = segment[seg].length; + stari = M + 1 - node1->endj; + endi = M + 1 - node1->starj; + N = segment[j].length; + starj = N + 1 - node1->endi; + endj = N + 1 - node1->stari; + A = node1->orientj ? segment[seg].rev + : segment[seg].seq; + B = node1->orienti ? segment[j].rev : segment[j].seq; + orient = 1 - node1->orienti; + } + M = endi - stari + 1; + N = endj - starj + 1; + sapp = S; + last = 0; + al_len = no_mat = no_mis = 0; + (void)diff(&A[stari] - 1, &B[starj] - 1, M, N, q, q); + count = ((M = sapp - S) + 1) * sizeof(int); + mtree[j].script = (int *)ckalloc(count); + for (k = 0; k < M; k++) mtree[j].script[k] = S[k]; + mtree[j].size = M; + mtree[j].begin = stari; + mtree[j].orient = orient; + if (mtree[seg].child == -1) + mtree[seg].child = j; + else { + i = mtree[seg].child; + if (mtree[i].begin >= stari) { + mtree[j].brother = i; + mtree[seg].child = j; + } + else { + M = mtree[i].brother; + for (; M != -1; i = M, M = mtree[M].brother) + if (mtree[M].begin >= stari) break; + mtree[j].brother = M; + mtree[i].brother = j; + } + } + SORT(j, orient); + } +} + +/* Display the alignments of segments */ +SHOW() +{ + register int i, j, k; /* index variables */ + char *ckalloc(); /* space-allocating function */ + int n; /* number of working segments */ + int limit; /* number of slots in work */ + int col; /* number of output columns prepared */ + short done; /* tells if current group is done */ + rowptr root; /* pointer to root of op tree */ + int sym[6]; /* occurrance counts for six chars */ + char c; /* temp variable */ + rowptr t, w, yy; /* temp pointer */ + int x; /* temp variables */ + int group; /* Contigs number */ + char conlit[20], *a; /* String form of contig number */ + char *spt; /* pointer to the start of consensus */ + + work = (rowptr *)ckalloc(seg_num * sizeof(rowptr)); + group = 0; + yy = 0; + for (j = 0; j < 6; j++) sym[j] = 0; + n = limit = col = 0; + for (i = 0; i < seg_num; i++) + if (mtree[i].head) { + (void)sprintf(conlit, ">Contig %d\n", group); + for (a = conlit; *a;) *allconpt++ = *a++; + /* Mod by S.S. + (void) printf("\n#Contig %d\n\n", group++); +*/ + group++; + done = 0; + ENTER(&limit, &n, i, col, yy); + root = work[0]; + spt = allconpt; + while (!done) { + for (j = 0; j < n; + j++) /* get segments into work */ + { + t = work[j]; + k = t->id; + if ((x = mtree[k].next) != -1 && + mtree[x].begin == t->loc) { + ENTER(&limit, &n, x, col, t); + mtree[k].next = -1; + } + for (x = mtree[k].child; x != -1; + x = mtree[x].brother) + if (mtree[x].begin == t->loc) { + ENTER(&limit, &n, x, + col, t); + mtree[k].child = + mtree[x].brother; + } + else + break; + } + COLUMN(root); /* determine next column */ + root->c = root->kind; + for (t = head; t != 0; t = t->link) + t->c = t->kind; + for (j = 0; j < n; j++) { + t = work[j]; + if (t->done) + *t->a++ = ' '; + else { + if (t->c == 'L') { + if (t->loc == 1) + t->offset = + allconpt - + spt; + c = *t->a++ = + t->seq[t->loc++]; + } + else if (t->loc > 1) + c = *t->a++ = '-'; + else + c = *t->a++ = ' '; + if (c != ' ') + if (c == '-') + sym[5] += 1; + else + sym[vert[c]] += + 1; + t->c = ' '; + } + } + /* determine consensus char */ + k = sym[0] + sym[1] + sym[2] + sym[3] + sym[4]; + if (k < sym[5]) + *allconpt++ = '-'; + else if (sym[0] == sym[1] && sym[1] == sym[2] && + sym[2] == sym[3]) + *allconpt++ = 'N'; + else { + k = sym[0]; + c = 'A'; + if (k < sym[1]) { + k = sym[1]; + c = 'C'; + } + if (k < sym[2]) { + k = sym[2]; + c = 'G'; + } + if (k < sym[3]) c = 'T'; + *allconpt++ = c; + } + for (j = 0; j < 6; j++) sym[j] = 0; + for (t = head; t != 0; t = t->link) { + NEXTOP(t); + if (t->done) /* delete it from op tree + */ + { + w = t->father; + if (w->child->id == t->id) + w->child = t->brother; + else { + w = w->child; + for (; w->brother->id != + t->id; + w = w->brother) + ; + w->brother = t->brother; + } + } + } + if (root->loc > + root->length) /* check root node */ + { + root->done = 1; + if ((w = root->child) != 0) { + w->father = 0; + root = w; + } + else + done = 1; + } + col++; + if (col == LINELEN || done) /* output */ + { + col = 0; + for (j = 0; j < n; j++) { + t = work[j]; + if (t->done) + /* + Mod by S.S. + { (void) printf("#"); + for ( a = t->name; *a; a++ ) + (void) printf("%c", *a); +*/ + { + int jj; + (void)printf( + "{\nname "); + for (jj = 0; + jj < + strlen(t->name) - + 1; + jj++) + (void)printf( + "%c", + t->name + [jj]); + printf( + "\nstrandedness" + " %c\n", + t->name[strlen( + t->name)] == '+' + ? '1' + : '2'); + + printf( + "offset " + "%d\ntype " + "DNA\ngroup-" + "ID %d\nsequence \"\n", + t->offset, group); + for (k = 0, a = t->line; + a != t->a; a++) + if (*a != ' ') { + k++; + (void)printf( + "%" + "c", + *a); + if (k == + LINELEN) { + (void)printf( + "\n"); + k = 0; + } + } + /* + if ( k ) + */ + (void)printf("\"\n}\n"); + } + if (t->linesize - + (t->a - t->line) < + LINELEN + 3) + ALOC_SEQ(t); + } + if (!done) { + for (k = j = n - 1; j >= 0; j--) + if (work[j]->done) { + t = work[j]; + for (x = j; + x < k; x++) + work[x] = work + [x + + 1]; + work[k--] = t; + } + n = k + 1; + } + else + n = 0; + } + } + } +} + +/* allocate more space for output fragment */ +ALOC_SEQ(t) rowptr t; +{ + char *start, *end, *p; + t->linesize *= 2; + start = t->line; + end = t->a; + t->line = ckalloc(t->linesize * sizeof(char)); + for (t->a = t->line, p = start; p != end;) *t->a++ = *p++; + free(start); +} + +/* enter a segment into working set */ +ENTER(b, d, id, pos, par) int *b, *d, id, pos; +rowptr par; +{ + int i; + char *ckalloc(); /* space-allocating function */ + rowptr t; + + if (*b <= *d) { + work[*b] = (rowptr)ckalloc((int)sizeof(row)); + work[*b]->line = (char *)ckalloc(SEQLEN * sizeof(char)); + work[*b]->linesize = SEQLEN; + *b += 1; + } + t = work[*d]; + *d += 1; + t->a = t->line; + for (i = 0; i < pos; i++) *t->a++ = ' '; + t->c = ' '; + t->seq = mtree[id].orient ? segment[id].seq : segment[id].rev; + t->length = segment[id].length; + t->id = id; + if (par != 0) { + t->s = mtree[id].script; + t->size = mtree[id].size; + } + t->op = 0; + for (i = 1; i <= segment[id].len && i <= NAMELEN; i++) + t->name[i - 1] = segment[id].name[i]; + if (mtree[id].orient) + t->name[i - 1] = '+'; + else + t->name[i - 1] = '-'; + t->name[i] = '\0'; + t->done = 0; + t->loc = 1; + t->child = 0; + t->father = par; + if (par != 0) { + t->brother = par->child; + par->child = t; + NEXTOP(t); + } +} + +/* get the next operation */ +NEXTOP(t) rowptr t; +{ + if (t->size || t->op) + if (t->op == 0 && *t->s == 0) { + t->op = *t->s++; + t->size--; + t->up = 'L'; + t->dw = 'L'; + } + else { + if (t->op == 0) { + t->op = *t->s++; + t->size--; + } + if (t->op > 0) { + t->up = '-'; + t->dw = 'L'; + t->op--; + } + else { + t->up = 'L'; + t->dw = '-'; + t->op++; + } + } + else if (t->loc > t->length) + t->done = 1; +} + +COLUMN(x) rowptr x; +{ + rowptr y; + rowptr start, end; /* first and last nodes for subtree */ + + if (x->child == 0) { + head = tail = 0; + x->kind = 'L'; + } + else { + start = end = 0; + x->kind = 'L'; + for (y = x->child; y != 0; y = y->brother) { + COLUMN(y); + if (x->kind == y->up) + if (y->kind == y->dw) { + if (head == 0) { + y->link = 0; + head = tail = y; + } + else { + y->link = head; + head = y; + } + if (end == 0) + start = head; + else + end->link = head; + end = tail; + } + else if (y->kind == '-') { + start = head; + end = tail; + x->kind = '-'; + } + else { + y->link = 0; + y->kind = '-'; + if (end == 0) + start = end = y; + else { + end->link = y; + end = y; + } + } + else if (y->kind == y->dw) + if (x->kind == '-') + ; + else { + if (head == 0) { + y->link = 0; + head = tail = y; + } + else { + y->link = head; + head = y; + } + start = head; + end = tail; + x->kind = '-'; + } + else if (x->kind == '-') + if (y->kind == '-') { + if (end == 0) { + start = head; + end = tail; + } + else if (head == 0) + /* code folded from here */ + ; + /* unfolding */ + else { + /* code folded from here */ + end->link = head; + end = tail; + /* unfolding */ + } + } + else + ; + else { + start = head; + end = tail; + x->kind = '-'; + } + } + head = start; + tail = end; + } +} + +/* Display a summary of contigs */ +GRAPH() +{ + int i, j, k; /* index variables */ + int group; /* serial number of contigs */ + char name[NAMELEN + 2]; /* name of segment */ + char *t; /* temp var */ + int length; /* length of name */ + + (void)printf("\nOVERLAPS CONTAINMENTS\n\n"); + group = 1; + for (i = 0; i < seg_num; i++) + if (mtree[i].head) { + (void)printf( + "******************* Contig %d " + "********************\n", + group++); + for (j = i; j != -1; j = mtree[j].next) { + length = segment[j].len; + t = segment[j].name + 1; + for (k = 0; k < length && k < NAMELEN; k++) + name[k] = *t++; + if (mtree[j].orient) + name[k] = '+'; + else + name[k] = '-'; + name[k + 1] = '\0'; + (void)printf("%s\n", name); + CONTAIN(mtree[j].child, name); + } + } +} + +CONTAIN(id, f) int id; +char *f; +{ + int k; /* index variable */ + char name[NAMELEN + 2]; /* name of segment */ + char *t; /* temp var */ + int length; /* length of name */ + + if (id != -1) { + length = segment[id].len; + t = segment[id].name + 1; + for (k = 0; k < length && k < NAMELEN; k++) name[k] = *t++; + if (mtree[id].orient) + name[k] = '+'; + else + name[k] = '-'; + name[k + 1] = '\0'; + (void)printf(" %s is in %s\n", name, f); + CONTAIN(mtree[id].child, name); + CONTAIN(mtree[id].brother, f); + } +} + +big_pass(A, B, M, N, orienti, orientj) char A[], B[]; +int M, N; +short orienti, orientj; +{ + register int i, j; /* row and column indices */ + register int c; /* best score at current point */ + register int f; /* best score ending with insertion */ + register int d; /* best score ending with deletion */ + register int p; /* best score at (i-1, j-1) */ + register int ci; /* end-point associated with c */ + + register int di; /* end-point associated with d */ + register int fi; /* end-point associated with f */ + register int pi; /* end-point associated with p */ + int *va; /* pointer to v(A[i], B[j]) */ + int x1, + x2; /* regions of A before x1 or after x2 are lightly penalized */ + int y1, + y2; /* regions of B before y1 or after y2 are lightly penalized */ + short heavy; /* 1 = heavy penalty */ + int ex, gx; /* current gap penalty scores */ + + /* determine x1, x2, y1, y2 */ + if (POS5 >= POS3) + fatal( + "The value for POS5 must be less than the value for POS3"); + if (orienti) { + x1 = POS5 >= M ? 1 : POS5; + x2 = POS3 >= M ? M : POS3; + } + else { + x1 = POS3 >= M ? 1 : M - POS3 + 1; + x2 = POS5 >= M ? M : M - POS5 + 1; + } + if (orientj) { + y1 = POS5 >= N ? 1 : POS5; + y2 = POS3 >= N ? N : POS3; + } + else { + y1 = POS3 >= N ? 1 : N - POS3 + 1; + y2 = POS5 >= N ? N : N - POS5 + 1; + } + if (x1 + 1 <= x2) x1++; + if (y1 + 1 <= y2) y1++; + heavy = 0; + + /* Compute the matrix. + CC : the scores of the current row + RR : the starting point that leads to score CC + DD : the scores of the current row, ending with deletion + SS : the starting point that leads to score DD */ + /* Initialize the 0 th row */ + for (j = 1; j <= N; j++) { + CC[j] = 0; + DD[j] = -(q); + RR[j] = SS[j] = -j; + } + for (i = 1; i <= M; i++) { + if (i == x1) heavy = 1 - heavy; + if (i == x2) heavy = 1 - heavy; + ex = r1; + gx = qr1; + va = v1[A[i]]; + c = 0; /* Initialize column 0 */ + f = -(q); + ci = fi = i; + p = 0; + pi = i - 1; + for (j = 1; j <= N; j++) { + if (j == y1) { + if (heavy) { + ex = r; + gx = qr; + /* +S.S. + va = v[A[i]]; +*/ + va = v[A[i]]; + } + } + if (j == y2) { + if (heavy) { + ex = r1; + gx = qr1; + va = v1[A[i]]; + } + } + if ((f = f - ex) < (c = c - gx)) { + f = c; + fi = ci; + } + di = SS[j]; + if ((d = DD[j] - ex) < (c = CC[j] - gx)) { + d = c; + di = RR[j]; + } + c = p + va[B[j]]; /* diagonal */ + ci = pi; + if (c < d) { + c = d; + ci = di; + } + if (c < f) { + c = f; + ci = fi; + } + p = CC[j]; + CC[j] = c; + pi = RR[j]; + RR[j] = ci; + DD[j] = d; + SS[j] = di; + if ((j == N || i == M) && c > SCORE) { + SCORE = c; + ENDI = i; + ENDJ = j; + STARI = ci; + } + } + } + if (SCORE) + if (STARI < 0) { + STARJ = -STARI; + STARI = 0; + } + else + STARJ = 0; +} + +/* diff(A,B,M,N,tb,te) returns the score of an optimum conversion between + A[1..M] and B[1..N] that begins(ends) with a delete if tb(te) is zero + and appends such a conversion to the current script. */ + +int diff(A, B, M, N, tb, te) +char *A, *B; +int M, N; +int tb, te; + +{ + int midi, midj, type; /* Midpoint, type, and cost */ + int midc; + + { + register int i, j; + register int c, e, d, s; + int t, *va; + char *ckalloc(); + + /* Boundary cases: M <= 1 or N == 0 */ + + if (N <= 0) { + if (M > 0) DEL(M) + return -gap(M); + } + if (M <= 1) { + if (M <= 0) { + INS(N); + return -gap(N); + } + if (tb > te) tb = te; + midc = -(tb + r + gap(N)); + midj = 0; + va = v[A[1]]; + for (j = 1; j <= N; j++) { + c = va[B[j]] - (gap(j - 1) + gap(N - j)); + if (c > midc) { + midc = c; + midj = j; + } + } + if (midj == 0) { + INS(N) DEL(1) + } + else { + if (midj > 1) INS(midj - 1) + REP if ((A[1] | 32) == (B[midj] | 32)) no_mat += + 1; + else no_mis += 1; + if (midj < N) INS(N - midj) + } + return midc; + } + + /* Divide: Find optimum midpoint (midi,midj) of cost midc */ + + midi = M / 2; /* Forward phase: */ + CC[0] = 0; /* Compute C(M/2,k) & D(M/2,k) for all k */ + t = -q; + for (j = 1; j <= N; j++) { + CC[j] = t = t - r; + DD[j] = t - q; + } + t = -tb; + for (i = 1; i <= midi; i++) { + s = CC[0]; + CC[0] = c = t = t - r; + e = t - q; + va = v[A[i]]; + for (j = 1; j <= N; j++) { + if ((c = c - qr) > (e = e - r)) e = c; + if ((c = CC[j] - qr) > (d = DD[j] - r)) d = c; + c = s + va[B[j]]; + if (c < d) c = d; + if (c < e) c = e; + s = CC[j]; + CC[j] = c; + DD[j] = d; + } + } + DD[0] = CC[0]; + + RR[N] = 0; /* Reverse phase: */ + t = -q; /* Compute R(M/2,k) & S(M/2,k) for all k */ + for (j = N - 1; j >= 0; j--) { + RR[j] = t = t - r; + SS[j] = t - q; + } + t = -te; + for (i = M - 1; i >= midi; i--) { + s = RR[N]; + RR[N] = c = t = t - r; + e = t - q; + va = v[A[i + 1]]; + for (j = N - 1; j >= 0; j--) { + if ((c = c - qr) > (e = e - r)) e = c; + if ((c = RR[j] - qr) > (d = SS[j] - r)) d = c; + c = s + va[B[j + 1]]; + if (c < d) c = d; + if (c < e) c = e; + s = RR[j]; + RR[j] = c; + SS[j] = d; + } + } + SS[N] = RR[N]; + + midc = CC[0] + RR[0]; /* Find optimal midpoint */ + midj = 0; + type = 1; + for (j = 0; j <= N; j++) + if ((c = CC[j] + RR[j]) >= midc) + if (c > midc || + CC[j] != DD[j] && RR[j] == SS[j]) { + midc = c; + midj = j; + } + for (j = N; j >= 0; j--) + if ((c = DD[j] + SS[j] + q) > midc) { + midc = c; + midj = j; + type = 2; + } + } + + /* Conquer: recursively around midpoint */ + + if (type == 1) { + (void)diff(A, B, midi, midj, tb, q); + (void)diff(A + midi, B + midj, M - midi, N - midj, q, te); + } + else { + (void)diff(A, B, midi - 1, midj, tb, zero); + DEL(2); + (void)diff(A + midi + 1, B + midj, M - midi - 1, N - midj, zero, + te); + } + return midc; +} + +/* lib.c - library of C procedures. */ + +/* fatal - print message and die */ +fatal(msg) char *msg; +{ + (void)fprintf(stderr, "%s\n", msg); + exit(1); +} + +/* fatalf - format message, print it, and die */ +fatalf(msg, val) char *msg, *val; +{ + (void)fprintf(stderr, msg, val); + (void)putc('\n', stderr); + exit(1); +} + +/* ckopen - open file; check for success */ +FILE *ckopen(name, mode) +char *name, *mode; +{ + FILE *fopen(), *fp; + + if ((fp = fopen(name, mode)) == NULL) fatalf("Cannot open %s.", name); + return (fp); +} + +/* ckalloc - allocate space; check for success */ +char *ckalloc(amount) +size_t amount; +{ + void *malloc(), *p; + + if ((p = malloc((unsigned)amount)) == NULL) fatal("Ran out of memory."); + return (p); +} + diff --git a/SUPPORT/Flatio.c b/SUPPORT/Flatio.c new file mode 100644 index 0000000..9c028de --- /dev/null +++ b/SUPPORT/Flatio.c @@ -0,0 +1,149 @@ +#include +#include +#include +#include +#define TRUE 1 +#define FALSE 0 +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +struct data_format { + int length; + char *nuc; + int offset; + char name[64]; + char type; +}; + +char *Realloc(char *block, int size); +char *Calloc(int count, int size); +int ErrorOut(int code, char *string); +int Errorout(char *string); +int ReadFlat(FILE *file, struct data_format align[], int maxseqs); +int WriteData(FILE *file, struct data_format data[], int count); + +int ReadFlat(FILE *file, struct data_format align[], int maxseqs) +{ + int j, len = 0, count = -1, offset; + unsigned maxlen = 1024; + char cinline[1025]; + extern char *Calloc(), *Realloc(); + + if (file == NULL) Errorout("Cannot open data file"); + + for (; fgets(cinline, 1024, file) != NULL;) { + cinline[strlen(cinline) - 1] = '\0'; + switch (cinline[0]) { + case '>': + case '#': + case '%': + case '"': + case '@': + offset = 0; + for (j = 0; j < strlen(cinline); j++) { + if (cinline[j] == '(') { + sscanf( + (char *)(cinline + j + 1), + "%d", &offset); + cinline[j] = '\0'; + } + } + + if (count != -1) { + align[count].length = len; + align[count].nuc[len] = '\0'; + maxlen = len; + } + + count++; + if (count > maxseqs) + Errorout( + "Sorry, alignment is too large"); + + align[count].nuc = Calloc(maxlen, sizeof(char)); + align[count].type = cinline[0]; + align[count].offset = offset; + if (align[count].nuc == NULL) + Errorout("Calloc problem"); + + sscanf((char *)(cinline + 1), "%s", + align[count].name); + len = 0; + break; + default: + if (len + strlen(cinline) > maxlen) { + maxlen = (maxlen + strlen(cinline)) * 2; + align[count].nuc = + Realloc(align[count].nuc, maxlen); + } + for (j = 0; j < strlen(cinline); j++) + align[count].nuc[j + len] = cinline[j]; + len += strlen(cinline); + break; + } + } + if (count == -1) exit(1); + + align[count].length = len; + align[count].nuc[len] = '\0'; + return (++count); +} + +int Errorout(char *string) +{ + fprintf(stderr, "%s\n", string); + exit(1); +} + +int WriteData(FILE *file, struct data_format data[], int count) +{ + int i, j; + for (j = 0; j < count; j++) { + if (data[j].offset) + fprintf(file, "\n%c%s(%d)", data[j].type, data[j].name, + data[j].offset); + else + fprintf(file, "\n%c%s", data[j].type, data[j].name); + + for (i = 0; i < data[j].length; i++) { + if (i % 60 == 0) fputc('\n', file); + fputc(data[j].nuc[i], file); + } + } + return 0; +} + +int ErrorOut(int code, char *string) +{ + if (code == 0) { + fprintf(stderr, "Error:%s\n", string); + exit(1); + } + return 0; +} + +char *Calloc(int count, int size) +{ + char *temp; + + temp = (char *)calloc(count, size); + if (temp == NULL) { + fprintf(stdout, "Error in Calloc\n"); + exit(-1); + } + else + return (temp); +} + +char *Realloc(char *block, int size) +{ + char *temp; + temp = (char *)realloc(block, size); + if (temp == NULL) { + fprintf(stdout, "Error in Calloc\n"); + exit(-1); + } + else + return (temp); +} + diff --git a/SUPPORT/Makefile b/SUPPORT/Makefile new file mode 100644 index 0000000..0ee1dd2 --- /dev/null +++ b/SUPPORT/Makefile @@ -0,0 +1,21 @@ +CC = cc +FLAGS = -lm + +all:CAP2 Restriction count findall varpos lsadt sho_helix Zuk_to_gen + +CAP2: CAP2.c + $(CC) CAP2.c -O -o CAP2 +Restriction: Restriction.c + $(CC) Restriction.c -O -o Restriction +Zuk_to_gen: Zuk_to_gen.c + $(CC) Zuk_to_gen.c -O -o Zuk_to_gen +count: count.c + $(CC) count.c -O -o count $(FLAGS) +findall: findall.c + $(CC) findall.c -O -o findall +lsadt: lsadt.c + $(CC) lsadt.c -O -o lsadt $(FLAGS) +sho_helix: sho_helix.c + $(CC) sho_helix.c -O -o sho_helix +varpos: varpos.c + $(CC) varpos.c -O -o varpos diff --git a/SUPPORT/PrintContig.c b/SUPPORT/PrintContig.c new file mode 100644 index 0000000..77ebc33 --- /dev/null +++ b/SUPPORT/PrintContig.c @@ -0,0 +1,70 @@ +#include "Flatio.c" +#define WIDTH 50 + +main() +{ + struct data_format data[10000]; + int i,j,k,numseqs,maxlen = 0,minlen=999999999; + int lines_printed; + int len[1000]; + char a,b; + + numseqs = ReadFlat(stdin,data,10000); + if(numseqs == 0) + exit(1); + + for(k=0;k j+WIDTH) || + (data[i].offset+data[i].length=data[i].offset)) + putchar(data[i].nuc[k-data[i].offset]); + else putchar(' '); + } + } + } + if(lines_printed) + { + printf("\n |---------|---------|---------|---------|---------\n"); + printf(" %6d %6d %6d %6d %6d\n\n",j+1,j+11,j+21,j+31,j+41); + } + } + putchar('\n'); + exit(0); +} + + +int indx(pos,seq) +int pos; +struct data_format *seq; +{ + int j,count=0; + if(pos < seq->offset) + return (0); + if(pos>seq->offset+seq->length) + pos = seq->offset+seq->length; + pos -= seq->offset; + for(j=0;jnuc[j] != '-') + if(seq->nuc[j] != '~') + count++; + return (count); +} diff --git a/SUPPORT/Restriction.c b/SUPPORT/Restriction.c new file mode 100644 index 0000000..98f088a --- /dev/null +++ b/SUPPORT/Restriction.c @@ -0,0 +1,130 @@ +/* +* Copyright 1991 Steven Smith at the Harvard Genome Lab. +* All rights reserved. +*/ +#include "Flatio.c" + + +main(ac,av) +int ac; +char **av; +{ + struct data_format data[10000]; + FILE *file; + int i,j,k,color,numseqs,numenzymes,nextpos,len; + char enzymes[80][80],dummy[80]; + if(ac<3) + { + fprintf(stderr,"Usage: %s enzyme_file seq_file\n",av[0]); + exit(-1); + } + file = fopen(av[2],"r"); + if(file == NULL) + exit(-1); + + numseqs = ReadFlat(file,data,10000); + + file = fopen(av[1],"r"); + if(file == NULL) + exit(-1); + + for(numenzymes = 0; + fscanf(file,"%s %s",enzymes[numenzymes],dummy)>0; + numenzymes++); + + for(i=0;i1) +*/ + printf("name:%s\n",data[i].name); + printf("length:%d\n",strlen(data[i].nuc)); + if(numseqs>1) + printf("nodash:\n"); + printf("start:\n"); + for(j=0;j0) + for(flag = FALSE,j=offset;j +#include + +typedef struct Sequence { + int len; + char name[80]; + char type[8]; + char *nuc; +} Sequence; + +main() +{ + char a[5000], b[5000], cinline[132]; + int pos1, pos2, pos3, i, j, k, FLAG; + Sequence pair[2]; + + for (j = 0; j < 5000; j++) b[j] = '-'; + FLAG = (int)gets(cinline); + for (j = 0; FLAG; j++) { + FLAG = (int)gets(cinline); + sscanf(cinline, "%d", &pos1); + if ((sscanf(cinline, "%*6c %c %d %d %d", &(a[j]), &k, &pos2, + &pos3) == 4) && + (FLAG)) { + if (pos3 != 0) { + if (pos1 < pos3) { + b[pos1 - 1] = '['; + b[pos3 - 1] = ']'; + } + else { + b[pos3 - 1] = '['; + b[pos1 - 1] = ']'; + } + } + } + else { + pair[0].len = j; + strcpy(pair[0].name, "HELIX"); + strcpy(pair[0].type, "TEXT"); + + pair[1].len = j; + /* + sscanf(cinline,"%*24c + %s",pair[1].name); + */ + strcpy(pair[1].name, "Sequence"); + strcpy(pair[1].type, "RNA"); + + pair[0].nuc = b; + pair[1].nuc = a; + + WriteGen(pair, stdout, 2); + for (j = 0; j < 5000; j++) b[j] = '-'; + j = -1; + } + } + exit(0); +} + +WriteGen(seq, file, numseq) Sequence *seq; +FILE *file; +int numseq; +{ + register i, j; + char temp[14]; + + for (j = 0; j < numseq; j++) fprintf(file, "%-.12s\n", seq[j].name); + + fprintf(file, "ZZZZZZZZZZ\n"); + + for (j = 0; j < numseq; j++) { + strcpy(temp, seq[j].name); + for (i = strlen(temp); i < 13; i++) temp[i] = ' '; + temp[i] = '\0'; + fprintf(file, "LOCUS %-.12s %s %d BP\n", temp, + seq[j].type, seq[j].len); + + fprintf(file, "ORIGIN"); + for (i = 0; i < seq[j].len; i++) { + if (i % 60 == 0) fprintf(file, "\n%9d", i + 1); + if (i % 10 == 0) fprintf(file, " "); + fprintf(file, "%c", seq[j].nuc[i]); + } + fprintf(file, "\n//\n"); + } + return; +} + diff --git a/SUPPORT/count.c b/SUPPORT/count.c new file mode 100644 index 0000000..625b7c2 --- /dev/null +++ b/SUPPORT/count.c @@ -0,0 +1,393 @@ +/* + * Copyright 1991 Steven Smith at the Harvard Genome Lab. + * All rights reserved. + */ +#include + +#include "Flatio.c" + +#define FALSE 0 +#define TRUE 1 + +#define JUKES 0 +#define OLSEN 1 +#define NONE 2 + +#define Min(a, b) (a) < (b) ? (a) : (b) + +int width, start, jump, usecase, sim, correction; +int tbl, numseq, num, denom, special; +char argtyp[255], argval[255]; + +float acwt = 1.0, agwt = 1.0, auwt = 1.0, ucwt = 1.0, ugwt = 1.0, gcwt = 1.0; + +float dist[200][200]; + +struct data_format data[10000]; +float parta[200], partc[200], partg[200], partu[200], setdist(); + +main(ac, av) int ac; +char **av; +{ + int i, j, k; + extern int ReadFlat(); + FILE *file; + + width = 1; + jump = 1; + if (ac == 1) { + fprintf(stderr, + "usage: %s [-sim] [-case] [-c=] ", + av[0]); + fprintf(stderr, "[-t] alignment_flat_file\n"); + exit(1); + } + for (j = 1; j < ac - 1; j++) { + getarg(av, j, argtyp, argval); + if (strcmp(argtyp, "-s=") == 0) { + j++; + sscanf(argval, "%d", &start); + start--; + } + else if (strcmp(argtyp, "-m=") == 0) { + j++; + sscanf(argval, "%d", &width); + } + else if (strcmp(argtyp, "-j=") == 0) { + j++; + sscanf(argval, "%d", &jump); + } + else if (strcmp(argtyp, "-case") == 0) + usecase = TRUE; + + else if (strcmp(argtyp, "-sim") == 0) + sim = TRUE; + + else if (strcmp(argtyp, "-c=") == 0) { + if (strcmp(argval, "olsen") == 0) + correction = OLSEN; + + else if (strcmp(argval, "none") == 0) + correction = NONE; + + else if (strcmp(argval, "jukes") == 0) + correction = JUKES; + + else + fprintf(stderr, "Correction type %s %s\n", + argval, "unknown, using JUKES"); + } + else if (strcmp("-t", argtyp) == 0) + tbl = TRUE; + + else if (strcmp("-ac=", argtyp) == 0 || + strcmp("-ca=", argtyp) == 0) { + j++; + sscanf(argval, "%f", &acwt); + special = TRUE; + } + else if (strcmp("-au=", argtyp) == 0 || + strcmp("-ua=", argtyp) == 0) { + j++; + sscanf(argval, "%f", &auwt); + special = TRUE; + } + else if (strcmp("-ag=", argtyp) == 0 || + strcmp("-ga=", argtyp) == 0) { + j++; + sscanf(argval, "%f", &agwt); + special = TRUE; + } + else if (strcmp("-uc=", argtyp) == 0 || + strcmp("-cu=", argtyp) == 0) { + j++; + sscanf(argval, "%f", &ucwt); + special = TRUE; + } + else if (strcmp("-ug=", argtyp) == 0 || + strcmp("-gu=", argtyp) == 0) { + j++; + sscanf(argval, "%f", &ugwt); + special = TRUE; + } + else if (strcmp("-gc=", argtyp) == 0 || + strcmp("-cg=", argtyp) == 0) { + j++; + sscanf(argval, "%f", &gcwt); + special = TRUE; + } + else if (strcmp("-transition=", argtyp) == 0) { + j++; + sscanf(argval, "%f", &ucwt); + agwt = ucwt; + special = TRUE; + } + else if (strcmp("-transversion=", argtyp) == 0) { + j++; + sscanf(argval, "%f", &gcwt); + ugwt = gcwt; + acwt = gcwt; + auwt = gcwt; + special = TRUE; + } + } + + file = fopen(av[ac - 1], "r"); + if ((file == NULL) || (ac == 1)) { + fprintf(stderr, "Error opening input file %s\n", av[ac - 1]); + exit(1); + } + + numseq = ReadFlat(file, data, 10000); + + fclose(file); + SetPart(); + + for (j = 0; j < numseq - 1; j++) + for (k = j + 1; k < numseq; k++) { + Compare(j, k, &num, &denom); + dist[j][k] = setdist(num, denom, j, k); + } + + Report(); + exit(0); +} + +Compare(a, b, num, denom) int a, b, *num, *denom; +{ + int mn, i, j, casefix, match, blank; + float fnum = 0.0; + struct data_format *da, *db; + char ac, bc; + + casefix = (usecase) ? 0 : 32; + *num = 0; + *denom = 0; + + da = &data[a]; + db = &data[b]; + mn = Min(da->length, db->length); + + for (j = 0; j < mn; j += jump) { + match = TRUE; + blank = TRUE; + for (i = 0; i < width; i++) { + ac = da->nuc[j + i] | casefix; + bc = db->nuc[j + i] | casefix; + if (ac == 't') ac = 'u'; + if (ac == 'T') ac = 'U'; + if (bc == 't') bc = 'u'; + if (bc == 'T') bc = 'U'; + + if ((ac == '-') || (ac | 32) == 'n' || (ac == ' ') || + (bc == '-') || (bc | 32) == 'n' || (bc == ' ')) + ; + + else { + blank = FALSE; + if (ac != bc) { + match = FALSE; + switch (ac) { + case 'a': + if (bc == 'c') + fnum += acwt; + else if (bc == 'g') + fnum += agwt; + else if (bc == 'u') + fnum += auwt; + break; + + case 'c': + if (bc == 'a') + fnum += acwt; + else if (bc == 'g') + fnum += gcwt; + else if (bc == 'u') + fnum += ucwt; + break; + + case 'g': + if (bc == 'a') + fnum += agwt; + else if (bc == 'c') + fnum += gcwt; + else if (bc == 'u') + fnum += ugwt; + break; + + case 'u': + if (bc == 'a') + fnum += auwt; + else if (bc == 'c') + fnum += ucwt; + else if (bc == 'g') + fnum += ugwt; + break; + + case 't': + if (bc == 'a') + fnum += auwt; + else if (bc == 'c') + fnum += ucwt; + else if (bc == 'g') + fnum += ugwt; + break; + + default: + break; + }; + } + } + + if ((blank == FALSE) && match) { + (*num)++; + (*denom)++; + } + else if (!blank) + (*denom)++; + } + } + if (special) (*num) = *denom - (int)fnum; + return 0; +} + +float setdist(num, denom, a, b) +int num, denom, a, b; +{ + float cor; + switch (correction) { + case OLSEN: + cor = parta[a] * parta[b] + partc[a] * partc[b] + + partg[a] * partg[b] + partu[a] * partu[b]; + break; + + case JUKES: + cor = 0.25; + break; + + case NONE: + cor = 0.0; + break; + + default: + cor = 0.0; + break; + }; + + if (correction == NONE) + return (1.0 - (float)num / (float)denom); + else + return (-(1.0 - cor) * log(1.0 / (1.0 - cor) * + ((float)num / (float)denom - cor))); +} + +getarg(av, ndx, atype, aval) char **av, atype[], aval[]; +int ndx; +{ + int i, j; + char c; + for (j = 0; (c = av[ndx][j]) != ' ' && c != '=' && c != '\0'; j++) + atype[j] = c; + if (c == '=') { + atype[j++] = c; + atype[j] = '\0'; + } + else { + atype[j] = '\0'; + j++; + } + + if (c == '=') { + for (i = 0; (c = av[ndx][j]) != '\0' && c != ' '; i++, j++) + aval[i] = c; + aval[i] = '\0'; + } + return 0; +} + +SetPart() +{ + int a, c, g, u, tot, i, j; + char nuc; + + for (j = 0; j < numseq; j++) { + a = 0; + c = 0; + g = 0; + u = 0; + tot = 0; + + for (i = 0; i < data[j].length; i++) { + nuc = data[j].nuc[i] | 32; + switch (nuc) { + case 'a': + a++; + tot++; + break; + + case 'c': + c++; + tot++; + break; + + case 'g': + g++; + tot++; + break; + + case 'u': + u++; + tot++; + break; + + case 't': + u++; + tot++; + break; + }; + } + parta[j] = (float)a / (float)tot; + partc[j] = (float)c / (float)tot; + partg[j] = (float)g / (float)tot; + partu[j] = (float)u / (float)tot; + } + return 0; +} + +Report() +{ + int i, ii, jj, j, k; + + if (tbl) printf("#\n#-\n#-\n#-\n#-\n"); + for (jj = 0, j = 0; j < numseq; j++) { + if (tbl) printf("%2d: %-.15s|", jj + 1, data[j].name); + + for (i = 0; i < j; i++) { + if (sim) + printf("%6.1f", 100 - dist[i][j] * 100.0); + else + printf("%6.1f", dist[i][j] * 100.0); + } + printf("\n"); + jj++; + } + return 0; +} + +int find(b, a) +char *a, *b; +{ + int flag, lenb, lena; + register i, j; + + flag = 0; + lenb = strlen(b); + lena = strlen(a); + for (i = 0; ((i < lena) && flag == 0); i++) { + for (j = 0; (j < lenb) && (a[i + j] == b[j]); j++) + ; + flag = ((j == lenb) ? 1 : 0); + } + return flag; +} + diff --git a/SUPPORT/findall.c b/SUPPORT/findall.c new file mode 100644 index 0000000..7440667 --- /dev/null +++ b/SUPPORT/findall.c @@ -0,0 +1,114 @@ +/* + * Copyright 1991 Steven Smith at the Harvard Genome Lab. + * All rights reserved. + */ +#include "Flatio.c" + +int main(ac, av) +int ac; +char **av; +{ + struct data_format data[10000]; + int Match = 2, Mismatch = 8; + int i, j, k, l, numseqs, mis, Case = 32; + int slen, pcnt, pos; + int UT = FALSE; + char c; + if (ac < 3) { + fprintf(stderr, + "usage: %s search_string %%mismatch [-case] [-match " + "color] [-mismatch color]\n", + av[0]); + fprintf(stderr, " [-u=t]\n"); + exit(0); + } + for (j = 3; j < ac; j++) { + if (strcmp("-case", av[j]) == 0) Case = 0; + if (strcmp("-match", av[j]) == 0) + sscanf(av[j + 1], "%d", &Match); + if (strcmp("-u=t", av[j]) == 0) UT = TRUE; + if (strcmp("-mismatch", av[j]) == 0) + sscanf(av[j + 1], "%d", &Mismatch); + } + numseqs = ReadFlat(stdin, data, 10000); + + slen = strlen(av[1]); + sscanf(av[2], "%d", &pcnt); + pcnt *= slen; + pcnt /= 100; + + if (UT) + for (j = 0; j <= strlen(av[1]); j++) { + if (av[1][j] == 't') av[1][j] = 'u'; + if (av[1][j] == 'T') av[1][j] = 'U'; + } + + for (i = 0; i < numseqs; i++) { + if (UT) + for (j = 0; data[i].nuc[j] != '\0'; j++) { + if (data[i].nuc[j] == 't') + data[i].nuc[j] = 'u'; + else if (data[i].nuc[j] == 'T') + data[i].nuc[j] = 'U'; + } + printf("name:%s\n", data[i].name); + printf("length:%d\n", strlen(data[i].nuc)); + printf("start:\n"); + for (j = 0; j < data[i].length; j++) { + mis = 0; + for (k = 0, pos = j; k < slen && pos < data[i].length; + k++, pos++) { + c = data[i].nuc[pos]; + for (; (c == ' ' || c == '-' || c == '~') && + pos < data[i].length;) + c = data[i].nuc[++pos]; + c |= Case; + + if (data[i].type == '#') { + if (CompIUP(c, (av[1][k] | Case)) == + FALSE) + mis++; + } + else { + if (c != (av[1][k] | Case)) mis++; + } + } + if (k == slen && mis <= pcnt) { + for (k = j; k < pos; k++) printf("%d\n", Match); + j = pos - 1; + } + else + printf("%d\n", Mismatch); + } + } + exit(0); +} + +int CompIUP(a, b) +char a, b; +{ + static int tmatr[16] = {'-', 'a', 'c', 'm', 'g', 'r', 's', 'v', + 't', 'w', 'y', 'h', 'k', 'd', 'b', 'n'}; + + static int matr[128] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x01, + 0xe, 0x02, 0x0d, 0, 0, 0x04, 0x0b, 0, 0, 0x0c, 0, + 0x03, 0x0f, 0, 0, 0, 0x05, 0x06, 0x08, 0x08, 0x07, 0x09, + 0x00, 0xa, 0, 0, 0, 0, 0, 0, 0, 0x01, 0x0e, + 0x02, 0x0d, 0, 0, 0x04, 0x0b, 0, 0, 0x0c, 0, 0x03, + 0x0f, 0, 0, 0, 0x05, 0x06, 0x08, 0x08, 0x07, 0x09, 0x00, + 0x0a, 0, 0, 0, 0, 0x00, 0}; + + int testa, testb; + + if (a & 32 != b & 32) return (FALSE); + + testa = matr[(int)a]; + testb = matr[(int)b]; + return (testa & testb); +} diff --git a/SUPPORT/lsadt.c b/SUPPORT/lsadt.c new file mode 100644 index 0000000..56122b7 --- /dev/null +++ b/SUPPORT/lsadt.c @@ -0,0 +1,1305 @@ +/* + PROGRAM LSADT +Fortran->C conversion by Mike Maciukenas, CPGA, Microbiology at University + of Illinois. +C----------------------------------------------------------------------- +C +C LEAST SQUARES ALGORITHM FOR FITTING ADDITIVE TREES TO +C PROXIMITY DATA +C +C GEERT DE SOETE -- VERSION 1.01 - FEB. 1983 +C VERSION 1.02 - JUNE 1983 +C VERSION 1.03 - JULY 1983 +C +C REFERENCE: DE SOETE, G. A LEAST SQUARES ALGORITHM FOR FITTING +C ADDITIVE TREES TO PROXIMITY DATA. PSYCHOMETRIKA, 1983, 48, +C 621-626. +C DE SOETE, G. ADDITIVE TREE REPRESENTATIONS OF INCOMPLETE +C DISSIMILARITY DATA. QUALITY AND QUANTITY, 1984, 18, +C 387-393. +C REMARKS +C ------- +C 2) UNIFORMLY DISTRIBUTED RANDOM NUMBERS ARE GENERATED BY A +C PROCEDURE DUE TO SCHRAGE. CF. +C SCHRAGE, L. A MORE PORTABLE FORTRAN RANDOM NUMBER GENERATOR. +C ACM TRANS. ON MATH. SOFTW., 1979, 5, 132-138. +C 3) SUBROUTINES VA14AD AND VA14AC (translated into minfungra) ARE +C ADAPTED FROM THE HARWELL SUBROUTINE LIBRARY (1979 EDITION). +C 4) ALTHOUGH THIS PROGRAM HAS BEEN CAREFULLY TESTED, THE +C AUTHOR DISCLAIMS ANY RESPONSABILITY FOR POSSIBLE +C ERRORS. +C +C----------------------------------------------------------------------- +*/ +#include +#include +#include +#include +#include +#include +#include +#include + +#define BUFLEN 1024 +#define MAXLEAVES 256 + +static int m, n, dissim, pr, start, save, seed, nempty; +static double ps1, ps2, f, empty, tol, c; +static char fname[1000]; +static char *names[MAXLEAVES]; +static double *delta[MAXLEAVES]; +static double **d; +static double **g; +static double **dold; +static FILE *reportf; +static int report; +extern int errno; +double nfac; + +extern double strtod(); + +double dabs(a) +double a; +{ + return ((a < 0.0) ? -a : a); +} + +double sqr(a) +double a; +{ + return (a * a); +} + +double max(a, b) +double a; +double b; +{ + return ((a > b) ? a : b); +} + +int imin(a, b) +int a; +int b; +{ + return ((a < b) ? a : b); +} + +double min(a, b) +double a; +double b; +{ + return ((a < b) ? a : b); +} + +float runif() +{ +#define A 16807 +#define P 2147483647 +#define B15 32768 +#define B16 65536 + + int xhi, xalo, leftlo, fhi, k; + float t1, t2; + + xhi = seed / B16; + xalo = (seed - (xhi * B16)) * A; + leftlo = xalo / B16; + fhi = xhi * A + leftlo; + k = fhi / B15; + seed = (((xalo - leftlo * B16) - P) + (fhi - k * B15) * B16) + k; + if (seed < 0) seed += P; + t1 = seed; + t2 = t1 * 4.656612875e-10; + return (t2); /* seed * (1/(2**31-1))*/ +} + +float rnorm() +{ + static float t1, t2; + static float n1, n2; + static int second = 0; + + if (second) { + second = 0; + n1 = sin(t2); + n2 = t1 * n1; + n1 = t1 * sin(t2); + return (n2); + } + else { + t1 = runif(); + t1 = sqrt(-2.0 * log(t1)); + n2 = 6.283185308; + t2 = runif() * n2; + n1 = cos(t2); + n2 = t1 * n1; + n1 = t1 * cos(t2); + second = 1; + return (n2); + } +} + +double gd(i, j) +int i, j; +{ + if (j < i) + return (d[i][j]); + else if (j > i) + return (d[j][i]); + else + show_error("gd: i=j -- programmer screwed up!"); +} + +char *repeatch(string, ch, num) +char *string; +int ch; +int num; +{ + for (string[num--] = '\0'; num >= 0; string[num--] = ch) + ; + return (string); +} + +int getachar() +/* skips comments! */ +{ + static int oldchar = '\0'; + int ch; + int more = 1; + + while (more) { + ch = getchar(); + if (oldchar == '\n' && ch == '#') { + while (ch != '\n' && ch != EOF) ch = getchar(); + oldchar = ch; + } + else if (oldchar == '\n' && isspace(ch)) + ; + else + more = 0; + } + oldchar = ch; + return (ch); +} + +int skip_space() +{ + int ch; + + while (isspace(ch = getachar())) + ; + return (ch); +} + +int getaword(string, len) +/* 0 if failed, 1 if data was read, -1 if data read to end of file */ +char *string; +int len; +{ + int i; + int ch; + ch = skip_space(); + if (ch == EOF) return (0); + for (i = 0; !isspace(ch) && ch != EOF; i++) { + if (i < len - 1) { + string[i] = ch; + } + ch = getachar(); + } + string[i < len ? i : len - 1] = '\0'; + if (ch == EOF) + return (-1); + else + return (1); +} + +int readtobar(string, len) +/* 0 if failed, 1 if data was read */ +char *string; +int len; +{ + int i; + int ch; + + ch = skip_space(); + if (ch == EOF) return (0); + for (i = 0; ch != EOF && ch != '|'; i++) { + if (ch == '\n' || ch == '\r' || ch == '\t') + i--; + else { + if (i < len - 1) string[i] = ch; + } + ch = getachar(); + } + len = i < len ? i : len - 1; + for (i = len - 1; i >= 0 && isspace(string[i]); i--) + ; + string[i + 1] = '\0'; + if (ch == EOF) + return (-1); + else + return (1); +} + +int readtobarorcolon(string, len) +/* 0 if failed, 1 if data was read */ +char *string; +int len; +{ + int i; + int ch; + + ch = skip_space(); + if (ch == EOF) return (0); + for (i = 0; ch != EOF && ch != '|' && ch != ':'; i++) { + if (ch == '\n' || ch == '\r' || ch == '\t') + i--; + else { + if (i < len - 1) string[i] = ch; + } + ch = getachar(); + } + len = i < len ? i : len - 1; + for (i = len - 1; i >= 0 && isspace(string[i]); i--) + ; + string[i + 1] = '\0'; + if (ch == EOF) + return (-1); + else + return (1); +} + +char *getmem(nelem, elsize) +unsigned nelem, elsize; +{ + char *temp; + + temp = (char *)calloc(nelem + 1, elsize); + if (temp == NULL) + show_error("Couldn't allocate memory."); + else + return (temp); +} + +int get_parms(argc, argv) +int argc; +char **argv; +{ + int i; + int cur_arg; + + /* codes for current argument: + ** 0 = no current argument + ** 1 = pr + ** 2 = start + ** 3 = seed + ** 4 = ps1 + ** 5 = ps2 + ** 6 = empty + ** 7 = filename */ + + dissim = 0; + pr = 0; + start = 2; + save = 0; + seed = 12345; + ps1 = 0.0001; + ps2 = 0.0001; + empty = 0.0; + n = 0; + cur_arg = 0; + for (i = 1; i < argc; i++) switch (cur_arg) { + case 0: + if (strcmp(argv[i], "-d") == 0) + dissim = 0; + else if (strcmp(argv[i], "-s") == 0) + dissim = 1; + else if (strcmp(argv[i], "-print") == 0) + cur_arg = 1; + else if (strcmp(argv[i], "-init") == 0) + cur_arg = 2; + else if (strcmp(argv[i], "-save") == 0) + save = 1; + else if (strcmp(argv[i], "-seed") == 0) + cur_arg = 3; + else if (strcmp(argv[i], "-ps1") == 0) + cur_arg = 4; + else if (strcmp(argv[i], "-ps2") == 0) + cur_arg = 5; + else if (strcmp(argv[i], "-empty") == 0) + cur_arg = 6; + else if (strcmp(argv[i], "-f") == 0) + cur_arg = 7; + else if (strcmp(argv[i], "-help") == 0) + show_help(); + else { + printf("\nlsadt: bad option\n"); + show_help(); + } + break; + case 1: + pr = atoi(argv[i]); + cur_arg = 0; + break; + case 2: + start = atoi(argv[i]); + cur_arg = 0; + break; + case 3: + seed = atoi(argv[i]); + cur_arg = 0; + break; + case 4: + ps1 = atof(argv[i]); + cur_arg = 0; + break; + case 5: + ps2 = atof(argv[i]); + cur_arg = 0; + break; + case 6: + empty = atof(argv[i]); + cur_arg = 0; + break; + case 7: + strcpy(fname, argv[i]); + cur_arg = 0; + break; + default: + printf("\nlsadt: bad option\n"); + show_help(); + break; + } + + /* check validity of parameters */ + if (ps1 < 0.0) ps1 = 0.0001; + if (ps2 < 0.0) ps2 = 0.0001; + if (dissim < 0) dissim = 0; + if (start < 1 || start > 3) start = 3; + if (save != 1) save = 0; + if (seed < 0) seed = 12345; + + /*printf("dissim=%d\n", dissim);*/ + /*printf("pr=%d\n", pr);*/ + /*printf("start=%d\n", start);*/ + /*printf("save=%d\n", save);*/ + /*printf("seed=%d\n", seed);*/ + /*printf("ps1=%f\n", ps1);*/ + /*printf("ps2=%f\n", ps2);*/ + /*printf("empty=%f\n", empty);*/ +} + +int get_data() +{ + int i, j, more; + char buf[BUFLEN]; + char *ptr; + char ch; + int result; + double temp, nfactor, datmin, datmax; + + nempty = n = 0; + more = 1; + ptr = &ch; + while (more) { + result = readtobarorcolon(buf, BUFLEN); + if (result == 0 || result == -1) + more = 0; + else { + n++; + names[n] = getmem(BUFLEN, 1); + result = readtobar(buf, BUFLEN); + if (result != 1) + show_error( + "get_data: bad name syntax, or missing " + "'|'"); + strcpy(names[n], buf); + if (n > 1) + delta[n] = (double *)getmem(n, sizeof(double)); + else + delta[n] = NULL; + for (j = 1; j < n; j++) { + if (!getaword(buf, BUFLEN)) + show_error( + "get_data: missing data values.\n"); + delta[n][j] = strtod(buf, &ptr); + if (ptr == buf) + show_error( + "get_data: invalid data value.\n"); + if (delta[n][j] == empty) nempty++; + } + } + } + if (n < 4) show_error("Must be at least 4 nodes."); + m = n * (n - 1) / 2; + /* make all positive */ + datmin = MAXDOUBLE; + for (i = 2; i <= n; i++) + for (j = 1; j < i; j++) + if (delta[i][j] < datmin && delta[i][j] != empty) + datmin = delta[i][j]; + if (datmin < 0.0) + for (i = 2; i <= n; i++) + for (j = 1; j < i; j++) + if (delta[i][j] != empty) delta[i][j] -= datmin; + /* convert dissimilarities into similarities if -s option specified */ + if (dissim) { + datmax = MINDOUBLE; + for (i = 2; i <= n; i++) + for (j = 1; j < i; j++) + if (delta[i][j] > datmax && + delta[i][j] != empty) + datmax = delta[i][j]; + datmax += 1.0; + for (i = 2; i <= n; i++) + for (j = 1; j < i; j++) + if (delta[i][j] != empty) + delta[i][j] = datmax - delta[i][j]; + } + + /* make sum of squared deviations from delta to average(delta) = 1 */ + temp = 0.0; + for (i = 2; i <= n; i++) + for (j = 1; j < i; j++) + if (delta[i][j] != empty) temp += delta[i][j]; + temp = temp / (double)(m - nempty); + /* now temp = average of all non-empty cells */ + nfactor = 0.0; + for (i = 2; i <= n; i++) + for (j = 1; j < i; j++) + if (delta[i][j] != empty) + nfactor += sqr(delta[i][j] - temp); + nfactor = 1.0 / sqrt(nfactor); + nfac = nfactor; + /* now nfactor is the normalization factor */ + if (report) + fprintf(reportf, "Normalization factor: %15.10f\n", nfactor); + for (i = 2; i <= n; i++) + for (j = 1; j < i; j++) + if (delta[i][j] != empty) delta[i][j] *= nfactor; + /* now all is normalized */ +} + +int initd() +{ + int i, j, k; + double t1, t2; + int emp, nemp; + double dmax; + double efac, estd; + + efac = 0.333333333333333; + if (start == 1) { + for (i = 2; i <= n; i++) + for (j = 1; j < i; j++) d[i][j] = runif(); + return; + } + if (nempty == 0 && start == 2) { + estd = sqrt(efac / m); + for (i = 2; i <= n; i++) + for (j = 1; j < i; j++) + d[i][j] = delta[i][j] + rnorm() * estd; + return; + } + for (i = 2; i <= n; i++) + for (j = 1; j < i; j++) d[i][j] = delta[i][j]; + if (start == 3 && nempty == 0) return; + emp = nempty; + nemp = 0; + while (nemp < emp) + for (i = 2; i <= n; i++) + for (j = 1; j < i; j++) + if (d[i][j] == empty) { + dmax = MAXDOUBLE; + for (k = 1; k <= n; k++) { + t1 = gd(i, k); + t2 = gd(j, k); + if (t1 != empty && t2 != empty) + dmax = min(dmax, + max(t1, t2)); + } + if (dmax != MAXDOUBLE) + d[i][j] = dmax; + else + nemp++; + } + if (nemp != 0) { + t1 = 0.0; + for (i = 2; i <= n; i++) + for (j = 1; j < i; j++) + if (d[i][j] != empty) t1 += d[i][j]; + t1 /= m - nemp; + for (i = 2; i <= n; i++) + for (j = 1; j < i; j++) + if (d[i][j] == empty) d[i][j] = t1; + } + if (start != 3) { + estd = sqrt(efac / (m - nempty)); + for (i = 2; i <= n; i++) + for (j = 1; j < i; j++) d[i][j] += rnorm() * estd; + } + return; +} + +static int nm0, nm1, nm2, nm3; +static double fitl, fitp, r; + +fungra() +{ + double fac; + int i, j, k, l; + double wijkl; + + for (i = 2; i <= n; i++) + for (j = 1; j < i; j++) g[i][j] = 0.0; + fitl = 0.0; + fac = 2.0; + for (i = 2; i <= n; i++) + for (j = 1; j < i; j++) + if (delta[i][j] != empty) { + fitl += sqr(d[i][j] - delta[i][j]); + g[i][j] += fac * (d[i][j] - delta[i][j]); + } + fitp = 0.0; + fac = 2.0 * r; + for (i = 1; i <= nm3; i++) + for (j = i + 1; j <= nm2; j++) + for (k = j + 1; k <= nm1; k++) + for (l = k + 1; l <= nm0; l++) + if ((d[j][i] + d[l][k] >= + d[l][i] + d[k][j]) && + (d[k][i] + d[l][j] >= + d[l][i] + d[k][j])) { + wijkl = d[j][i] + d[l][k] - + d[k][i] - d[l][j]; + fitp += sqr(wijkl); + g[j][i] += fac * wijkl; + g[l][k] += fac * wijkl; + g[k][i] -= fac * wijkl; + g[l][j] -= fac * wijkl; + } + else if ((d[j][i] + d[l][k] >= + d[k][i] + d[l][j]) && + (d[l][i] + d[k][j] >= + d[k][i] + d[l][j])) { + wijkl = d[j][i] + d[l][k] - + d[l][i] - d[k][j]; + fitp += sqr(wijkl); + g[j][i] += fac * wijkl; + g[l][k] += fac * wijkl; + g[k][j] -= fac * wijkl; + g[l][i] -= fac * wijkl; + } + else if ((d[k][i] + d[l][j] >= + d[j][i] + d[l][k]) && + (d[l][i] + d[k][j] >= + d[j][i] + d[l][k])) { + wijkl = d[k][i] + d[l][j] - + d[l][i] - d[k][j]; + fitp += sqr(wijkl); + g[k][i] += fac * wijkl; + g[l][j] += fac * wijkl; + g[l][i] -= fac * wijkl; + g[k][j] -= fac * wijkl; + } + f = fitl + r * fitp; +} + +static double **dr, **dgr, **d1, **gs, **xx, **gg; +static int iterc, prc; +print_iter(maxfnc, f) int maxfnc; +double f; +{ + int i, j; + + if (pr == 0) { + iterc++; + } + else if (prc < abs(pr)) { + prc++; + iterc++; + } + else { + printf("Iteration %6d", iterc); + printf(": function values %6d", maxfnc); + printf(" f = %24.16e\n", f); + if (pr < 0) { + printf(" d[] looks like this:\n"); + for (i = 2; i <= n; i++) { + printf(" "); + for (j = 1; j < i; j++) + printf("%24.16e ", d[i][j]); + printf("\n "); + } + printf(" g[] looks like this:\n"); + for (i = 2; i <= n; i++) { + printf(" "); + for (j = 1; j < i; j++) + printf("%24.16e ", g[i][j]); + printf("\n "); + } + } + prc = 1; + iterc++; + } +} + +minfungra(dfn, acc, maxfn) double dfn, acc; +int maxfn; +{ + double beta, c, clt, dginit, dgstep, dtest, finit; + double fmin, gamden, gamma, ggstar, gm, gmin, gnew; + double gsinit, gsumsq, xbound, xmin, xnew, xold; + int itcrs, flag, retry, maxfnk, maxfnc; + int i, j; + + flag = 0; + retry = -2; + iterc = 0; + maxfnc = 1; + prc = abs(pr); + goto L190; + +L10: + xnew = dabs(dfn + dfn) / gsumsq; + dgstep = -gsumsq; + itcrs = m + 1; +L30: + if (maxfnk < maxfnc) { + f = fmin; + for (i = 2; i <= n; i++) + for (j = 1; j < i; j++) { + d[i][j] = xx[i][j]; + g[i][j] = gg[i][j]; + } + } + if (flag > 0) { + prc = 0; + print_iter(maxfnc, f); + return; + } + if (itcrs > m) { + for (i = 2; i <= n; i++) + for (j = 1; j < i; j++) d1[i][j] = -g[i][j]; + } + print_iter(maxfnc, f); + dginit = 0.0; + for (i = 2; i <= n; i++) + for (j = 1; j < i; j++) { + gs[i][j] = g[i][j]; + dginit += d1[i][j] * g[i][j]; + } + if (dginit >= 0.0) { + retry = -retry; + if (imin(retry, maxfnk) < 2) { + printf( + "minfungra: \ + gradient wrong or acc too small\n"); + flag = 2; + } + else + itcrs = m + 1; + goto L30; + } + xmin = 0.0; + fmin = f; + finit = f; + gsinit = gsumsq; + gmin = dginit; + gm = dginit; + xbound = -1.0; + xnew = xnew * min(1.0, dgstep / dginit); + dgstep = dginit; +L170: + c = xnew - xmin; + dtest = 0.0; + for (i = 1; i <= n; i++) + for (j = 1; j < i; j++) { + d[i][j] = xx[i][j] + c * d1[i][j]; + dtest += dabs(d[i][j] - xx[i][j]); + } + if (dtest <= 0.0) { + clt = .7; + goto L300; + } + if (max(xbound, xmin - c) < 0.0) { + clt = 0.1; + } + maxfnc++; +L190: + fungra(); + + if (maxfnc > 1) { + for (gnew = 0.0, i = 1; i <= n; i++) + for (j = 1; j < i; j++) gnew += d1[i][j] * g[i][j]; + } + if (maxfnc <= 1 || (maxfnc > 1 && f < fmin) || + (maxfnc > 1 && f == fmin && dabs(gnew) <= dabs(gmin))) { + maxfnk = maxfnc; + gsumsq = 0.0; + for (i = 1; i <= n; i++) + for (j = 1; j < i; j++) gsumsq += sqr(g[i][j]); + if (gsumsq <= acc) { + prc = 0; + print_iter(maxfnc, f); + return; + } + } + if (maxfnc == maxfn) { + printf( + "minfungra: \ + maxfn function values have been calculated\n"); + flag = 1; + goto L30; + } + if (maxfnk < maxfnc) goto L300; + for (i = 1; i <= n; i++) + for (j = 1; j < i; j++) { + xx[i][j] = d[i][j]; + gg[i][j] = g[i][j]; + } + if (maxfnc <= 1) goto L10; + gm = gnew; + if (gm <= dginit) goto L310; + ggstar = 0.0; + for (i = 1; i <= n; i++) + for (j = 1; j < i; j++) ggstar += gs[i][j] * g[i][j]; + beta = (gsumsq - ggstar) / (gm - dginit); +L300: + if (dabs(gm) > clt * dabs(dginit) || + (dabs(gm) <= clt * dabs(dginit) && + dabs(gm * beta) >= clt * gsumsq)) { + L310: + clt += 0.3; + if (clt > 0.8) { + retry = -retry; + if (imin(retry, maxfnk) < 2) { + printf( + "minfungra: \ + gradient wrong or acc too small\n"); + flag = 2; + } + else + itcrs = m + 1; + goto L30; + } + xold = xnew; + xnew = .5 * (xmin + xold); + if (maxfnk >= maxfnc && gmin * gnew > 0.0) { + xnew = 10.0 * xold; + if (xbound >= 0.0) { + xnew = 0.5 * (xold + xbound); + } + } + c = gnew - + (3.0 * gnew + gmin - 4.0 * (f - fmin) / (xold - xmin)) * + (xold - xnew) / (xold - xmin); + if (maxfnk >= maxfnc) { + if (gmin * gnew <= 0.0) { + xbound = xmin; + } + xmin = xold; + fmin = f; + gmin = gnew; + } + else + xbound = xold; + if (c * gmin < 0.0) + xnew = (xmin * c - xnew * gmin) / (c - gmin); + goto L170; + } + if (min(f, fmin) < finit || + (min(f, fmin) >= finit && gsumsq < gsinit)) { + if (itcrs < m && dabs(ggstar) < .2 * gsumsq) { + gamma = 0.0; + c = 0.0; + for (i = 1; i <= n; i++) + for (j = 1; j < i; j++) { + gamma += gg[i][j] * dgr[i][j]; + c += gg[i][j] * dr[i][j]; + } + gamma /= gamden; + if (dabs(beta * gm + gamma * c) < .2 * gsumsq) { + for (i = 1; i <= n; i++) + for (j = 1; j < i; j++) + d1[i][j] = -gg[i][j] + + beta * d1[i][j] + + gamma * dr[i][j]; + itcrs++; + } + else { + itcrs = 2; + gamden = gm - dginit; + for (i = 1; i <= n; i++) + for (j = 1; j < i; j++) { + dr[i][j] = d1[i][j]; + dgr[i][j] = gg[i][j] - gs[i][j]; + d1[i][j] = + -gg[i][j] + beta * d1[i][j]; + } + } + } + else { + itcrs = 2; + gamden = gm - dginit; + for (i = 1; i <= n; i++) + for (j = 1; j < i; j++) { + dr[i][j] = d1[i][j]; + dgr[i][j] = gg[i][j] - gs[i][j]; + d1[i][j] = -gg[i][j] + beta * d1[i][j]; + } + } + goto L30; + } + else { + retry = -retry; + if (imin(retry, maxfnk) < 2) { + printf( + "minfungra: \ + gradient wrong or acc too small\n"); + flag = 2; + } + else + itcrs = m + 1; + goto L30; + } +} + +get_dist() +{ + static double cons = 1.0; + static int maxfn = 500; + int i, j, iter; + double dif; + + dr = (double **)getmem(n, sizeof(delta[1])); + dgr = (double **)getmem(n, sizeof(delta[1])); + d1 = (double **)getmem(n, sizeof(delta[1])); + gs = (double **)getmem(n, sizeof(delta[1])); + xx = (double **)getmem(n, sizeof(delta[1])); + gg = (double **)getmem(n, sizeof(delta[1])); + dr[1] = NULL; + dgr[1] = NULL; + d1[1] = NULL; + gs[1] = NULL; + xx[1] = NULL; + gg[1] = NULL; + for (i = 2; i <= n; i++) { + dr[i] = (double *)getmem(i - 1, sizeof(double)); + dgr[i] = (double *)getmem(i - 1, sizeof(double)); + d1[i] = (double *)getmem(i - 1, sizeof(double)); + gs[i] = (double *)getmem(i - 1, sizeof(double)); + xx[i] = (double *)getmem(i - 1, sizeof(double)); + gg[i] = (double *)getmem(i - 1, sizeof(double)); + } + nm0 = n; + nm1 = n - 1; + nm2 = n - 2; + nm3 = n - 3; + if (start == 3) { + r = 0.001; + fungra(); + } + else { + r = 1.0; + fungra(); + r = cons * fitl / fitp; + f = (1.0 + cons) * fitl; + } + iter = 1; + do { + for (i = 1; i <= n; i++) + for (j = 1; j < i; j++) dold[i][j] = d[i][j]; + minfungra(0.5 * f, ps1, maxfn); + dif = 0.0; + for (i = 1; i <= n; i++) + for (j = 1; j < i; j++) + dif += sqr(d[i][j] - dold[i][j]); + dif = sqrt(dif); + if (dif > ps2) { + iter++; + r *= 10.0; + } + } while (dif > ps2); + fungra(); + for (i = 2; i <= n; i++) { + free(dr[i]); + free(dgr[i]); + free(d1[i]); + free(gs[i]); + free(xx[i]); + free(gg[i]); + } + free(dr); + free(dgr); + free(d1); + free(gs); + free(xx); + free(gg); +} + +double gttol() +{ + double result; + int i, j, k, l; + + result = 0.0; + nm0 = n; + nm1 = n - 1; + nm2 = n - 2; + nm3 = n - 3; + for (i = 1; i <= nm3; i++) + for (j = i + 1; j <= nm2; j++) + for (k = j + 1; k <= nm1; k++) + for (l = k + 1; l <= nm0; l++) + if ((d[j][i] + d[l][k] >= + d[l][i] + d[k][j]) && + (d[k][i] + d[l][j] >= + d[l][i] + d[k][j])) + result = max( + result, + dabs(d[j][i] + d[l][k] - + d[k][i] - d[l][j])); + else if ((d[j][i] + d[l][k] >= + d[k][i] + d[l][j]) && + (d[l][i] + d[k][j] >= + d[k][i] + d[l][j])) + result = max( + result, + dabs(d[j][i] + d[l][k] - + d[l][i] - d[k][j])); + else if ((d[k][i] + d[l][j] >= + d[j][i] + d[l][k]) && + (d[l][i] + d[k][j] >= + d[j][i] + d[l][k])) + result = max( + result, + dabs(d[k][i] + d[l][j] - + d[l][i] - d[k][j])); + return (result); +} + +gtcord() +{ + double sumx, sumy, ssqx, ssqy, scp, fn; + int i, j; + + sumx = sumy = ssqx = ssqy = scp = 0.0; + for (i = 1; i <= n; i++) + for (j = 1; j < i; j++) + if (delta[i][j] != empty) { + sumx += delta[i][j]; + sumy += d[i][j]; + ssqx += sqr(delta[i][j]); + ssqy += sqr(d[i][j]); + scp += delta[i][j] * d[i][j]; + } + fn = (double)(m - nempty); + if ((fn * ssqy - sumx * sumy) <= 0.0) + show_error("gtcord: path length distances have zero variance"); + else + return ( + (fn * scp - sumx * sumy) / + sqrt((fn * ssqx - sqr(sumx)) * (fn * ssqy - sqr(sumy)))); +} + +goodfit() +{ + double r, rsq; + + r = gtcord(); + rsq = r * r * 100.0; + tol = gttol(); +} + +additive_const() +{ + int i, j, k; + + c = 0.0; + for (i = 1; i <= n; i++) + for (j = 1; j < i; j++) + for (k = 1; k <= n; k++) + if (k != i && k != j) + c = max(c, + gd(i, j) - gd(i, k) - gd(j, k)); + if (report) fprintf(reportf, "Additive Constant: %15.10f\n", c); + if (c != 0.0) + for (i = 1; i <= n; i++) + for (j = 1; j < i; j++) d[i][j] += c; +} + +static int *mergei, *mergej; +static int ninv; +get_tree() +{ +#define false 0 +#define true 1 + int i, j, k, l, im, jm, km, lm, count; + int maxnode, maxcnt, nnode, nact; + double arci, arcj, arcim, arcjm; + char *act; + + maxnode = 2 * n - 2; + mergei = (int *)getmem(maxnode, sizeof(int)); + mergej = (int *)getmem(maxnode, sizeof(int)); + act = (char *)getmem(maxnode, sizeof(char)); + for (i = 1; i <= maxnode; i++) act[i] = false; + nact = n; + ninv = 0; + nnode = n; + while (nact > 4) { + maxcnt = 0; + for (i = 1; i <= nnode; i++) + if (!act[i]) + for (j = 1; j < i; j++) + if (!act[j]) { + count = 0; + arci = 0.0; + arcj = 0.0; + for (k = 2; k <= nnode; k++) + if (!act[k] && k != i && + k != j) + for (l = 1; + l < k; l++) + if (!act[l] && + l != + i && + l != + j) + if (gd(i, + j) + gd(k, + l) <= + gd(i, + k) + + gd(j, + l) && + gd(i, + j) + gd(k, + l) <= + gd(i, + l) + + gd(j, + k)) { + count++; + arci += + (gd(i, + j) + + gd(i, + k) - + gd(j, + k)) / + 2.0; + arcj += + (gd(i, + j) + + gd(j, + l) - + gd(i, + l)) / + 2.0; + } + if (count > maxcnt) { + maxcnt = count; + arcim = max( + 0.0, arci / count); + arcjm = max( + 0.0, arcj / count); + im = i; + jm = j; + } + } + + nnode++; + if (nnode + 2 > maxnode) + show_error("get_tree: number of nodes exceeds 2N-2"); + ninv++; + mergei[ninv] = im; + mergej[ninv] = jm; + act[im] = true; + act[jm] = true; + d[nnode] = (double *)getmem(nnode - 1, sizeof(double)); + d[nnode][im] = arcim; + d[nnode][jm] = arcjm; + for (i = 1; i <= nnode - 1; i++) + if (!act[i]) d[nnode][i] = max(0.0, gd(im, i) - arcim); + nact--; + } + + for (i = 1; act[i]; i++) + if (i > nnode) + show_error( + "get_tree: can't find last two invisible nodes"); + im = i; + for (i = im + 1; act[i]; i++) + if (i > nnode) + show_error( + "get_tree: can't find last two invisible nodes"); + jm = i; + for (i = jm + 1; act[i]; i++) + if (i > nnode) + show_error( + "get_tree: can't find last two invisible nodes"); + km = i; + for (i = km + 1; act[i]; i++) + if (i > nnode) + show_error( + "get_tree: can't find last two invisible nodes"); + lm = i; + if (gd(im, jm) + gd(km, lm) <= gd(im, km) + gd(jm, lm) + tol && + gd(im, jm) + gd(km, lm) <= gd(im, lm) + gd(jm, km) + tol) { + i = im; + j = jm; + k = km; + l = lm; + } + else if (gd(im, lm) + gd(jm, km) <= gd(im, km) + gd(jm, lm) + tol && + gd(im, lm) + gd(jm, km) <= gd(im, jm) + gd(km, lm) + tol) { + i = im; + j = lm; + k = km; + l = jm; + } + else if (gd(im, km) + gd(jm, lm) <= gd(im, jm) + gd(km, lm) + tol && + gd(im, km) + gd(jm, lm) <= gd(im, lm) + gd(jm, km) + tol) { + i = im; + j = km; + k = lm; + l = jm; + } + + nnode++; + ninv++; + mergei[ninv] = i; + mergej[ninv] = j; + d[nnode] = (double *)getmem(nnode - 1, sizeof(double)); + d[nnode][i] = max(0.0, (gd(i, j) + gd(i, k) - gd(j, k)) / 2.0); + d[nnode][j] = max(0.0, (gd(i, j) + gd(j, l) - gd(i, l)) / 2.0); + nnode++; + ninv++; + mergei[ninv] = k; + mergej[ninv] = l; + d[nnode] = (double *)getmem(nnode - 1, sizeof(double)); + d[nnode][k] = max(0.0, (gd(k, l) + gd(i, k) - gd(i, l)) / 2.0); + d[nnode][l] = max(0.0, (gd(k, l) + gd(j, l) - gd(j, k)) / 2.0); + d[nnode][nnode - 1] = + max(0.0, (gd(i, k) + gd(j, l) - gd(i, j) - gd(k, l)) / 2.0); +} + +print_node(node, dist, indent) int node; +double dist; +int indent; +{ + static char buf[BUFLEN]; + + if (node <= n) + printf("%s%s:%6.4f", repeatch(buf, '\t', indent), names[node], + dist / nfac); + else { + printf("%s(\n", repeatch(buf, '\t', indent)); + print_node(mergei[node - n], gd(node, mergei[node - n]), + indent + 1); + printf(",\n"); + print_node(mergej[node - n], gd(node, mergej[node - n]), + indent + 1); + printf("\n%s):%6.4f", repeatch(buf, '\t', indent), dist / nfac); + } +} + +show_tree() +{ + int i, j, current; + int ij[2]; + + current = 0; + for (i = 1; current < 2; i++) { + for (j = 1; (mergei[j] != i && mergej[j] != i) && j <= ninv; + j++) + ; + if (j > ninv) ij[current++] = i; + } + printf("(\n"); + print_node(ij[0], gd(ij[0], ij[1]) / 2.0, 1); + printf(",\n"); + print_node(ij[1], gd(ij[0], ij[1]) / 2.0, 1); + printf("\n);\n"); +} + +show_help() +{ + printf("\nlsadt--options:\n"); + printf(" -f file - write report to 'file'\n"); + printf(" -d - treat data as dissimilarities (default)\n"); + printf(" -s - treat data as similarities\n"); + printf(" -print 0 - don't print out iteration history (default)\n"); + printf( + " -print n>0 - print out iteration history every n iterations\n"); + printf( + " -print n<0 - print out iteration history every n iterations\n"); + printf( + " (including current distance estimates & " + "gradients)\n"); + printf(" -init n - initial parameter estimates (default = 3)\n"); + printf(" n=1 - uniformly distributed random numbers\n"); + printf(" n=2 - error-perturbed data\n"); + printf(" n=3 - original distance data from input matrix\n"); + printf( + " -save - save final parameter estimates (default is don't " + "save\n"); + printf( + " -seed n - seed for random number generator (default = " + "12345)\n"); + printf( + " -ps1 n - convergence criterion for inner iterations (default " + "= 0.0001)\n"); + printf( + " -ps2 n - convergence criterion for major iterations (default " + "= 0.0001)\n"); + printf(" -empty n - missing data indicator (default = 0.0)\n"); + printf(" -help - show this help text\n"); + exit(0); +} + +show_error(message) char *message; +{ + printf("\n>>>>ERROR:\n>>>>%s\n", message); + exit(0); +} + +main(argc, argv) int argc; +char **argv; +{ + int i; + + strcpy(fname, ""); + get_parms(argc, argv); + if (strcmp(fname, "")) { + report = 1; + reportf = fopen(fname, "w"); + if (reportf == NULL) { + perror("lsadt"); + exit(0); + } + } + else + report = 0; + get_data(); + d = (double **)getmem(n, sizeof(delta[1])); + g = (double **)getmem(n, sizeof(delta[1])); + dold = (double **)getmem(n, sizeof(delta[1])); + d[1] = NULL; + g[1] = NULL; + dold[1] = NULL; + for (i = 2; i <= n; i++) { + d[i] = (double *)getmem(i - 1, sizeof(double)); + g[i] = (double *)getmem(i - 1, sizeof(double)); + dold[i] = (double *)getmem(i - 1, sizeof(double)); + } + initd(); + get_dist(); + goodfit(); + additive_const(); + get_tree(); + show_tree(); + if (report) close(reportf); +} diff --git a/SUPPORT/sho_helix.c b/SUPPORT/sho_helix.c new file mode 100644 index 0000000..84b2af6 --- /dev/null +++ b/SUPPORT/sho_helix.c @@ -0,0 +1,87 @@ +#include "Flatio.c" +#define BLACK 8 +#define RED 3 +#define BLUE 6 +#define YELLOW 1 +#define AQUA 4 +main() +{ + struct data_format data[10000]; + int i,j,k,numseqs,mask = -1; + int pair[20000],stack[20000],spt = 0; + char ch; + + numseqs = ReadFlat(stdin,data,10000); + if(numseqs == 0) + exit(1); + for(j=0;j(b)?(a):(b)) + +/* +* Varpos.c- An extremely simple program for showing which positions +* are varying in an alignment. Use this as a model for other +* external functions. +* +* Read in a flat file alignment, pass back an alignment color +* mask. +* +* Copyright 1991/1992 Steven Smith, Harvard Genome lab. +* +*/ + +main(ac,av) +int ac; +char **av; +{ + struct data_format data[10000]; + int i,j,k,numseqs,rev = FALSE; + int maxlen = -99999, + score = 0, + minoffset = 99999; + char ch; + if(ac>2) + { + fprintf(stderr,"Usage %s [-rev]gde_color_mask\n", av[0]); + exit(1); + } + if(ac == 2) + if(strcmp(av[1],"-rev") == 0) + rev = TRUE; + + numseqs = ReadFlat(stdin,data,10000); + + if(numseqs == 0) + exit(1); + + for(j=0;j maxlen) + maxlen = data[j].length+data[j].offset; + if(data[j].offset < minoffset) + minoffset = data[j].offset; + } + + printf("length:%d\n",maxlen); + printf("offset:%d\n",minoffset); + printf("start:\n"); + for(j=0;j j) + { + if(j>data[k].offset) + ch=data[k].nuc[j-data[k].offset] | 32; + else + ch = '-'; + + if(ch=='a')a++; + if(ch=='c')c++; + if(ch=='g')g++; + if(ch=='u')u++; + if(ch=='t')u++; + } + + score=MAX(a,c); + score=MAX(score,g); + score=MAX(score,u); + if(a+c+g+u) + { + if(rev) + score=(score*6/(a+c+g+u)+8); + else + score=((8-score*6/(a+c+g+u))+8); + } + else + score=8; + printf("%d\n",score); + } + exit(0); +}