2223 lines
58 KiB
C
2223 lines
58 KiB
C
/* CONTIG ASSEMBLY PROGRAM (CAP)
|
|
|
|
copyright (c) 1991 Xiaoqiu Huang
|
|
The distribution of the program is granted provided no charge
|
|
is made and the copyright notice is included.
|
|
|
|
Proper attribution of the author as the source of the software
|
|
would be appreciated:
|
|
"A Contig Assembly Program Based on Sensitive Detection of
|
|
Fragment Overlaps" (submitted to Genomics, 1991)
|
|
Xiaoqiu Huang
|
|
Department of Computer Science
|
|
Michigan Technological University
|
|
Houghton, MI 49931
|
|
E-mail: huang@cs.mtu.edu
|
|
|
|
The CAP program uses a dynamic programming algorithm to compute
|
|
the maximal-scoring overlapping alignment between two fragments.
|
|
Fragments in random orientations are assembled into contigs by a
|
|
greedy approach in order of the overlap scores. CAP is efficient
|
|
in computer memory: a large number of arbitrarily long fragments
|
|
can be assembled. The time requirement is acceptable; for example,
|
|
CAP took 4 hours to assemble 1015 fragments of a total of 252 kb
|
|
nucleotides on a Sun SPARCstation SLC. The program is written in C
|
|
and runs on Sun workstations.
|
|
|
|
Below is a description of the parameters in the #define section of CAP.
|
|
Two specially chosen sets of substitution scores and indel penalties
|
|
are used by the dynamic programming algorithm: heavy set for regions
|
|
of low sequencing error rates and light set for fragment ends of high
|
|
sequencing error rates. (Use integers only.)
|
|
Heavy set: Light set:
|
|
|
|
MATCH = 2 MATCH = 2
|
|
MISMAT = -6 LTMISM = -3
|
|
EXTEND = 4 LTEXTEN = 2
|
|
|
|
In the initial assembly, any overlap must be of length at least OVERLEN,
|
|
and any overlap/containment must be of identity percentage at least
|
|
PERCENT. After the initial assembly, the program attempts to join
|
|
contigs together using weak overlaps. Two contigs are merged if the
|
|
score of the overlapping alignment is at least CUTOFF. The value for
|
|
CUTOFF is chosen according to the value for MATCH.
|
|
|
|
DELTA is a parameter in necessary conditions for overlap/containment.
|
|
Those conditions are used to quickly reject pairs of fragments that
|
|
could not possibly have an overlap/containment relationship.
|
|
The dynamic programming algorithm is only applied to pairs of fragments
|
|
that pass the screening. A large value for DELTA means stringent
|
|
conditions, where the value for DELTA is a real number at least 8.0.
|
|
|
|
POS5 and POS3 are fragment positions such that the 5' end between base 1
|
|
and base POS5, and the 3' end after base POS3 are of high sequencing
|
|
error rates, say more than 5%. For mismatches and indels occurring in
|
|
the two ends, light penalties are used.
|
|
|
|
A file of input fragments looks like:
|
|
>G019uabh
|
|
ATACATCATAACACTACTTCCTACCCATAAGCTCCTTTTAACTTGTTAAA
|
|
GTCTTGCTTGAATTAAAGACTTGTTTAAACACAAAAATTTAGAGTTTTAC
|
|
TCAACAAAAGTGATTGATTGATTGATTGATTGATTGATGGTTTACAGTAG
|
|
GACTTCATTCTAGTCATTATAGCTGCTGGCAGTATAACTGGCCAGCCTTT
|
|
AATACATTGCTGCTTAGAGTCAAAGCATGTACTTAGAGTTGGTATGATTT
|
|
ATCTTTTTGGTCTTCTATAGCCTCCTTCCCCATCCCCATCAGTCTTAATC
|
|
AGTCTTGTTACGTTATGACTAATCTTTGGGGATTGTGCAGAATGTTATTT
|
|
TAGATAAGCAAAACGAGCAAAATGGGGAGTTACTTATATTTCTTTAAAGC
|
|
>G028uaah
|
|
CATAAGCTCCTTTTAACTTGTTAAAGTCTTGCTTGAATTAAAGACTTGTT
|
|
TAAACACAAAATTTAGACTTTTACTCAACAAAAGTGATTGATTGATTGAT
|
|
TGATTGATTGATGGTTTACAGTAGGACTTCATTCTAGTCATTATAGCTGC
|
|
TGGCAGTATAACTGGCCAGCCTTTAATACATTGCTGCTTAGAGTCAAAGC
|
|
ATGTACTTAGAGTTGGTATGATTTATCTTTTTGGTCTTCTATAGCCTCCT
|
|
TCCCCATCCCATCAGTCT
|
|
>G022uabh
|
|
TATTTTAGAGACCCAAGTTTTTGACCTTTTCCATGTTTACATCAATCCTG
|
|
TAGGTGATTGGGCAGCCATTTAAGTATTATTATAGACATTTTCACTATCC
|
|
CATTAAAACCCTTTATGCCCATACATCATAACACTACTTCCTACCCATAA
|
|
GCTCCTTTTAACTTGTTAAAGTCTTGCTTGAATTAAAGACTTGTTTAAAC
|
|
ACAAAATTTAGACTTTTACTCAACAAAAGTGATTGATTGATTGATTGATT
|
|
GATTGAT
|
|
>G023uabh
|
|
AATAAATACCAAAAAAATAGTATATCTACATAGAATTTCACATAAAATAA
|
|
ACTGTTTTCTATGTGAAAATTAACCTAAAAATATGCTTTGCTTATGTTTA
|
|
AGATGTCATGCTTTTTATCAGTTGAGGAGTTCAGCTTAATAATCCTCTAC
|
|
GATCTTAAACAAATAGGAAAAAAACTAAAAGTAGAAAATGGAAATAAAAT
|
|
GTCAAAGCATTTCTACCACTCAGAATTGATCTTATAACATGAAATGCTTT
|
|
TTAAAAGAAAATATTAAAGTTAAACTCCCCTATTTTGCTCGTTTTTGCTT
|
|
ATCTAAAATACATTCTGCACAATCCCCAAAGATTGATCATACGTTAC
|
|
>G006uaah
|
|
ACATAAAATAAACTGTTTTCTATGTGAAAATTAACCTANNATATGCTTTG
|
|
CTTATGTTTAAGATGTCATGCTTTTTATCAGTTGAGGAGTTCAGCTTAAT
|
|
AATCCTCTAAGATCTTAAACAAATAGGAAAAAAACTAAAAGTAGAAAATG
|
|
GAAATAAAATGTCAAAGCATTTCTACCACTCAGAATTGATCTTATAACAT
|
|
GAAATGCTTTTTAAAAGAAAATATTAAAGTTAAACTCCCC
|
|
A string after ">" is the name of the following fragment.
|
|
Only the five upper-case letters A, C, G, T and N are allowed
|
|
to appear in fragment data. No other characters are allowed.
|
|
A common mistake is the use of lower case letters in a fragment.
|
|
|
|
To run the program, type a command of form
|
|
|
|
cap file_of_fragments
|
|
|
|
The output goes to the terminal screen. So redirection of the
|
|
output into a file is necessary. The output consists of three parts:
|
|
overview of contigs at fragment level, detailed display of contigs
|
|
at nucleotide level, and consensus sequences.
|
|
'+' = direct orientation; '-' = reverse complement
|
|
The output of CAP on the sample input data looks like:
|
|
|
|
#Contig 1
|
|
|
|
#G022uabh+(0)
|
|
TATTTTAGAGACCCAAGTTTTTGACCTTTTCCATGTTTACATCAATCCTGTAGGTGATTG
|
|
GGCAGCCATTTAAGTATTATTATAGACATTTTCACTATCCCATTAAAACCCTTTATGCCC
|
|
ATACATCATAACACTACTTCCTACCCATAAGCTCCTTTTAACTTGTTAAAGTCTTGCTTG
|
|
AATTAAAGACTTGTTTAAACACAAAA-TTTAGACTTTTACTCAACAAAAGTGATTGATTG
|
|
ATTGATTGATTGATTGAT
|
|
#G028uaah+(145)
|
|
CATAAGCTCCTTTTAACTTGTTAAAGTCTTGCTTGAATTAAAGACTTGTTTAAACACAAA
|
|
A-TTTAGACTTTTACTCAACAAAAGTGATTGATTGATTGATTGATTGATTGATGGTTTAC
|
|
AGTAGGACTTCATTCTAGTCATTATAGCTGCTGGCAGTATAACTGGCCAGCCTTTAATAC
|
|
ATTGCTGCTTAGAGTCAAAGCATGTACTTAGAGTTGGTATGATTTATCTTTTTGGTCTTC
|
|
TATAGCCTCCTTCCCCATCCC-ATCAGTCT
|
|
#G019uabh+(120)
|
|
ATACATCATAACACTACTTCCTACCCATAAGCTCCTTTTAACTTGTTAAAGTCTTGCTTG
|
|
AATTAAAGACTTGTTTAAACACAAAAATTTAGAGTTTTACTCAACAAAAGTGATTGATTG
|
|
ATTGATTGATTGATTGATGGTTTACAGTAGGACTTCATTCTAGTCATTATAGCTGCTGGC
|
|
AGTATAACTGGCCAGCCTTTAATACATTGCTGCTTAGAGTCAAAGCATGTACTTAGAGTT
|
|
GGTATGATTTATCTTTTTGGTCTTCTATAGCCTCCTTCCCCATCCCCATCAGTCTTAATC
|
|
AGTCTTGTTACGTTATGACT-AATCTTTGGGGATTGTGCAGAATGTTATTTTAGATAAGC
|
|
AAAA-CGAGCAAAAT-GGGGAGTT-A-CTT-A-TATTT-CTTT-AAA--GC
|
|
#G023uabh-(426)
|
|
GTAACGT-ATGA-TCAATCTTTGGGGATTGTGCAGAATGT-ATTTTAGATAAGCAAAAAC
|
|
GAGCAAAATAGGGGAGTTTAACTTTAATATTTTCTTTTAAAAAGCATTTCATGTTATAAG
|
|
ATCAATTCTGAGTGGTAGAAATGCTTTGACATTTTATTTCCATTTTCTACTTTTAGTTTT
|
|
TTTCCTATTTGTTTAAGATCGTAGAGGATTATTAAGCTGAACTCCTCAACTGATAAAAAG
|
|
CATGACATCTTAAACATAAGCAAAGCATATTTTTAGGTTAATTTTCACATAGAAAACAGT
|
|
TTATTTTATGTGAAATTCTATGTAGATATACTATTTTTTTGGTATTTATT
|
|
#G006uaah-(496)
|
|
GGGGAGTTTAACTTTAATATTTTCTTTTAAAAAGCATTTCATGTTATAAGATCAATTCTG
|
|
AGTGGTAGAAATGCTTTGACATTTTATTTCCATTTTCTACTTTTAGTTTTTTTCCTATTT
|
|
GTTTAAGATCTTAGAGGATTATTAAGCTGAACTCCTCAACTGATAAAAAGCATGACATCT
|
|
TAAACATAAGCAAAGCATATNNT-AGGTTAATTTTCACATAGAAAACAGTTTATTTTATG
|
|
T
|
|
|
|
|
|
|
|
Slight modifications by S. Smith on Mon Feb 17 10:18:34 EST 1992.
|
|
These changes allow for command line arguements for several
|
|
of the hard coded parameters, as well as a slight modification to
|
|
the output routine to support GDE format. Changes are commented
|
|
as:
|
|
|
|
Mod by S.S.
|
|
|
|
*/
|
|
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
int OVERLEN; /* Minimum length of any overlap */
|
|
float PERCENT; /* Minimum identity percentage of any overlap */
|
|
#define CUTOFF 50 /* cutoff score for overlap or containment */
|
|
#define DELTA 9.0 /* Jump increment in check for overlap */
|
|
#define MATCH 2 /* score of a match */
|
|
#define MISMAT -6 /* score of a mismatch */
|
|
#define LTMISM -3 /* light score of a mismatch */
|
|
#define OPEN 0 /* gap open penalty */
|
|
#define EXTEND 4 /* gap extension penalty */
|
|
#define LTEXTEN 2 /* light gap extension penalty */
|
|
#define POS5 30 /* Sequencing errors often occur before base POS5 */
|
|
#define POS3 475 /* Sequencing errors often occur after base POS3 */
|
|
#define LINELEN 60 /* length of one printed line */
|
|
#define NAMELEN 20 /* length of printed fragment name */
|
|
#define TUPLELEN 4 /* Maximum length of words for lookup table */
|
|
#define SEQLEN 2000 /* initial size of an array for an output fragment */
|
|
|
|
static int over_len;
|
|
static float per_cent;
|
|
typedef struct OVERLAP /* of 5' and 3' segments */
|
|
{
|
|
int number; /* index of 3' segment */
|
|
int host; /* index of 5' segment */
|
|
int ind; /* used in reassembly */
|
|
int stari; /* start position of 5' suffix */
|
|
int endi; /* end position of 5' suffix */
|
|
int starj; /* start position of 3' prefix */
|
|
int endj; /* end position of 3' prefix */
|
|
short orienti; /* orientation of 5' segment: 0= rev. */
|
|
short orientj; /* orientation of 3' segment: 0= rev. */
|
|
int score; /* score of overlap alignment */
|
|
int length; /* length of alignment */
|
|
int match; /* number of matches in alignment */
|
|
short kind; /* 0 = containment; 1 = overlap */
|
|
int *script; /* script for constructing alignment */
|
|
struct OVERLAP *next;
|
|
} over, *overptr;
|
|
struct SEG {
|
|
char *name; /* name string */
|
|
int len; /* length of segment name */
|
|
char *seq; /* segment sequence */
|
|
char *rev; /* reverse complement */
|
|
int length; /* length of sequence */
|
|
short kind; /* 0 = contain; 1 = overlap */
|
|
int *lookup; /* lookup table */
|
|
int *pos; /* location list */
|
|
overptr list; /* list of overlapping edges */
|
|
} *segment; /* array of segment records */
|
|
int seg_num; /* The number of segments */
|
|
overptr *edge; /* set of overlapping edges */
|
|
int edge_num; /* The number of overlaps */
|
|
struct CONS /* 1 = itself; 0 = reverse complement */
|
|
{
|
|
short isfive[2]; /* is 5' end free? */
|
|
short isthree[2]; /* is 3' end free? */
|
|
short orient[2]; /* orientation of 3' segment */
|
|
int group; /* contig serial number */
|
|
int next[2]; /* pointer to 3' adjacent segment */
|
|
int other; /* the other end of the contig */
|
|
int child; /* for the containment tree */
|
|
int brother;
|
|
int father;
|
|
overptr node[2]; /* pointers to overlapping edges */
|
|
} *contigs; /* list of contigs */
|
|
struct TTREE /* multiple alignment tree */
|
|
{
|
|
short head; /* head = 1 means the head of a contig */
|
|
short orient; /* orientation */
|
|
int begin; /* start position of previous segment */
|
|
int *script; /* alignment script */
|
|
int size; /* length of script */
|
|
int next; /* list of overlap segments */
|
|
int child; /* list of child segments */
|
|
int brother; /* list of sibling segments */
|
|
} *mtree;
|
|
int vert[128]; /* converted digits for 'A','C','G','T' */
|
|
int vertc[128]; /* for reverse complement */
|
|
int tuple; /* tuple = TUPLELEN - 1 */
|
|
int base[TUPLELEN]; /* locations of a lookup table */
|
|
int power[TUPLELEN]; /* powers of 4 */
|
|
typedef struct OUT {
|
|
char *line; /* one print line */
|
|
char *a; /* pointer to slot in line */
|
|
char c; /* current char */
|
|
char *seq; /* pointer to sequence */
|
|
int length; /* length of segment */
|
|
int id; /* index of segment */
|
|
int *s; /* pointer to script vector */
|
|
int size; /* size of script vector */
|
|
int op; /* current operation */
|
|
char name[NAMELEN + 2]; /* name of segment */
|
|
short done; /* indicates if segment is done */
|
|
int loc; /* position of next segment symbol */
|
|
char kind; /* type of next symbol of segment */
|
|
char up; /* type of upper symbol of operation */
|
|
char dw; /* type of lower symbol of operation */
|
|
int offset; /* relative to consensus sequence */
|
|
int linesize; /* size of array line */
|
|
struct OUT *child; /* pointer to child subtree */
|
|
struct OUT *brother; /* pointer to brother node */
|
|
struct OUT *link; /* for operation linked list */
|
|
struct OUT *father; /* pointer to father node */
|
|
} row, *rowptr; /* node for segment */
|
|
rowptr *work; /* a set of working segments */
|
|
rowptr head, tail; /* first and last nodes of op list */
|
|
struct VX {
|
|
int id; /* Segment index */
|
|
short kind; /* overlap or containment */
|
|
overptr list; /* list of overlapping edges */
|
|
} *piece; /* array of segment records */
|
|
char *allconsen, *allconpt; /* Storing consensus sequences */
|
|
char *ckalloc(size_t size);
|
|
main(argc, argv) int argc;
|
|
char *argv[];
|
|
{
|
|
int M; /* Sequence length */
|
|
int V[128][128], Q, R; /* Weights */
|
|
int V1[128][128], R1; /* Light weights */
|
|
int total; /* Total of segment lengths */
|
|
int number; /* The number of segments */
|
|
char *sequence; /* Storing all segments */
|
|
char *reverse; /* Storing all reverse segments */
|
|
int symbol, prev; /* The next character */
|
|
FILE *Ap, *ckopen(); /* For the sequence file */
|
|
char *ckalloc(size_t amount); /* space-allocating function */
|
|
register int i, j, k; /* index variables */
|
|
/* Mod by S.S. */
|
|
int jj;
|
|
short heading; /* 1: heading; 0: sequence */
|
|
|
|
/*
|
|
* Mod by S.S.
|
|
*
|
|
if ( argc != 2 )
|
|
fatalf("The proper form of command is: \n%s file_of_fragments",
|
|
argv[0]);
|
|
*/
|
|
if (argc != 4)
|
|
fatalf("usage: %s file_of_fragments MIN_OVERLAP PERCENT_MATCH",
|
|
argv[0]);
|
|
sscanf(argv[2], "%d", &OVERLEN);
|
|
sscanf(argv[3], "%d", &jj);
|
|
PERCENT = (float)jj / 100.0;
|
|
if (PERCENT < 0.25) PERCENT = 0.25;
|
|
if (PERCENT > 1.0) PERCENT = 1.0;
|
|
if (OVERLEN < 1) OVERLEN = 1;
|
|
if (OVERLEN > 100) OVERLEN = 100;
|
|
|
|
/* determine number of segments and total lengths */
|
|
j = 0;
|
|
|
|
Ap = ckopen(argv[1], "r");
|
|
prev = '\n';
|
|
for (total = 3, number = 0; (symbol = getc(Ap)) != EOF; total++) {
|
|
if (symbol == '>' && prev == '\n') number++;
|
|
prev = symbol;
|
|
}
|
|
(void)fclose(Ap);
|
|
|
|
total += number * 20;
|
|
/* allocate space for segments */
|
|
sequence = (char *)ckalloc(total * sizeof(char));
|
|
reverse = (char *)ckalloc(total * sizeof(char));
|
|
allconpt = allconsen = (char *)ckalloc(total * sizeof(char));
|
|
segment = (struct SEG *)ckalloc(number * sizeof(struct SEG));
|
|
|
|
/* read the segments into sequence */
|
|
M = 0;
|
|
Ap = ckopen(argv[1], "r");
|
|
number = -1;
|
|
heading = 0;
|
|
prev = '\n';
|
|
for (i = 0, k = total; (symbol = getc(Ap)) != EOF;) {
|
|
if (symbol != '\n') {
|
|
sequence[++i] = symbol;
|
|
switch (symbol) {
|
|
case 'A':
|
|
reverse[--k] = 'T';
|
|
break;
|
|
case 'a':
|
|
reverse[--k] = 't';
|
|
break;
|
|
case 'T':
|
|
reverse[--k] = 'A';
|
|
break;
|
|
case 't':
|
|
reverse[--k] = 'a';
|
|
break;
|
|
case 'C':
|
|
reverse[--k] = 'G';
|
|
break;
|
|
case 'c':
|
|
reverse[--k] = 'g';
|
|
break;
|
|
case 'G':
|
|
reverse[--k] = 'C';
|
|
break;
|
|
case 'g':
|
|
reverse[--k] = 'c';
|
|
break;
|
|
default:
|
|
reverse[--k] = symbol;
|
|
break;
|
|
}
|
|
}
|
|
if (symbol == '>' && prev == '\n') {
|
|
heading = 1;
|
|
if (number >= 0) {
|
|
segment[number].length = i - j - 1;
|
|
segment[number].rev = &(reverse[k]);
|
|
if (i - j - 1 > M) M = i - j - 1;
|
|
}
|
|
number++;
|
|
j = i;
|
|
segment[number].name = &(sequence[i]);
|
|
segment[number].kind = 1;
|
|
segment[number].list = 0;
|
|
}
|
|
if (heading && symbol == '\n') {
|
|
heading = 0;
|
|
segment[number].len = i - j;
|
|
segment[number].seq = &(sequence[i]);
|
|
j = i;
|
|
}
|
|
|
|
prev = symbol;
|
|
}
|
|
segment[number].length = i - j;
|
|
reverse[--k] = '>';
|
|
segment[number].rev = &(reverse[k]);
|
|
if (i - j > M) M = i - j;
|
|
seg_num = ++number;
|
|
(void)fclose(Ap);
|
|
|
|
Q = OPEN;
|
|
R = EXTEND;
|
|
R1 = LTEXTEN;
|
|
/* set match and mismatch weights */
|
|
for (i = 0; i < 128; i++)
|
|
for (j = 0; j < 128; j++)
|
|
if ((i | 32) == (j | 32))
|
|
V[i][j] = V1[i][j] = MATCH;
|
|
else {
|
|
V[i][j] = MISMAT;
|
|
V1[i][j] = LTMISM;
|
|
}
|
|
for (i = 0; i < 128; i++) V['N'][i] = V[i]['N'] = MISMAT + 1;
|
|
V1['N']['N'] = MISMAT + 1;
|
|
|
|
over_len = OVERLEN;
|
|
per_cent = PERCENT;
|
|
edge_num = 0;
|
|
INIT(M);
|
|
MAKE();
|
|
PAIR(V, V1, Q, R, R1);
|
|
ASSEM();
|
|
REPAIR();
|
|
FORM_TREE();
|
|
/* GRAPH(); */
|
|
SHOW();
|
|
}
|
|
|
|
static int (*v)[128]; /* substitution scores */
|
|
static int q, r; /* gap penalties */
|
|
static int qr; /* qr = q + r */
|
|
static int (*v1)[128]; /* light substitution scores */
|
|
static int r1; /* light extension penalty */
|
|
static int qr1; /* qr1 = q + r1 */
|
|
|
|
static int SCORE;
|
|
static int STARI;
|
|
static int STARJ;
|
|
static int ENDI;
|
|
static int ENDJ;
|
|
|
|
static int *CC, *DD; /* saving matrix scores */
|
|
static int *RR, *SS; /* saving start-points */
|
|
static int *S; /* saving operations for diff */
|
|
|
|
/* The following definitions are for function diff() */
|
|
|
|
int diff();
|
|
static int zero = 0; /* int type zero */
|
|
|
|
#define gap(k) ((k) <= 0 ? 0 : q + r * (k)) /* k-symbol indel score */
|
|
|
|
static int *sapp; /* Current script append ptr */
|
|
static int last; /* Last script op appended */
|
|
|
|
static int no_mat; /* number of matches */
|
|
|
|
static int no_mis; /* number of mismatches */
|
|
|
|
static int al_len; /* length of alignment */
|
|
/* Append "Delete k" op */
|
|
#define DEL(k) \
|
|
{ \
|
|
al_len += k; \
|
|
if (last < 0) \
|
|
last = sapp[-1] -= (k); \
|
|
else \
|
|
last = *sapp++ = -(k); \
|
|
}
|
|
/* Append "Insert k" op */
|
|
#define INS(k) \
|
|
{ \
|
|
al_len += k; \
|
|
if (last < 0) { \
|
|
sapp[-1] = (k); \
|
|
*sapp++ = last; \
|
|
} \
|
|
else \
|
|
last = *sapp++ = (k); \
|
|
}
|
|
|
|
/* Append "Replace" op */
|
|
#define REP \
|
|
{ \
|
|
last = *sapp++ = 0; \
|
|
al_len += 1; \
|
|
}
|
|
|
|
INIT(M) int M;
|
|
{
|
|
register int j; /* row and column indices */
|
|
char *ckalloc(); /* space-allocating function */
|
|
|
|
/* allocate space for all vectors */
|
|
j = (M + 1) * sizeof(int);
|
|
CC = (int *)ckalloc(j);
|
|
DD = (int *)ckalloc(j);
|
|
RR = (int *)ckalloc(j);
|
|
SS = (int *)ckalloc(j);
|
|
S = (int *)ckalloc(2 * j);
|
|
}
|
|
|
|
/* Make a lookup table for words of lengths up to TUPLELEN in each sequence.
|
|
The value of a word is used as an index to the table. */
|
|
MAKE()
|
|
{
|
|
int hash[TUPLELEN]; /* values of words of lengths up to TUPLELEN */
|
|
int *table; /* pointer to a lookup table */
|
|
int *loc; /* pointer to a table of sequence locations */
|
|
int size; /* size of a lookup table */
|
|
int limit, digit, step; /* temporary variables */
|
|
register int i, j, k, p, q; /* index varibles */
|
|
char *ckalloc(); /* space-allocating function */
|
|
char *A; /* pointer to a sequence */
|
|
int M; /* length of a sequence */
|
|
|
|
tuple = TUPLELEN - 1;
|
|
for (i = 0; i < 128; i++) vert[i] = 4;
|
|
vert['A'] = vert['a'] = 0;
|
|
vert['C'] = vert['c'] = 1;
|
|
vert['G'] = vert['g'] = 2;
|
|
vert['T'] = vert['t'] = 3;
|
|
vertc['A'] = vertc['a'] = 3;
|
|
vertc['C'] = vertc['c'] = 2;
|
|
vertc['G'] = vertc['g'] = 1;
|
|
vertc['T'] = vertc['t'] = 0;
|
|
for (i = 0, j = 1, size = 0; i <= tuple; i++, j *= 4) {
|
|
base[i] = size;
|
|
power[i] = j;
|
|
size = (size + 1) * 4;
|
|
}
|
|
for (j = 0; j <= tuple; j++) hash[j] = 0;
|
|
/* make a lookup table for each sequence */
|
|
for (i = 0; i < seg_num; i++) {
|
|
A = segment[i].seq;
|
|
M = segment[i].length;
|
|
table = segment[i].lookup = (int *)ckalloc(size * sizeof(int));
|
|
loc = segment[i].pos = (int *)ckalloc((M + 1) * sizeof(int));
|
|
for (j = 0; j < size; j++) table[j] = 0;
|
|
for (k = 0, j = 1; j <= M; j++)
|
|
if ((digit = vert[A[j]]) != 4) {
|
|
for (p = tuple; p > 0; p--)
|
|
hash[p] = 4 * (hash[p - 1] + 1) + digit;
|
|
hash[0] = digit;
|
|
step = j - k;
|
|
limit = tuple <= step ? tuple : step;
|
|
for (p = 0; p < limit; p++)
|
|
if (!table[(q = hash[p])]) table[q] = 1;
|
|
if (step > tuple) {
|
|
loc[(p = j - tuple)] =
|
|
table[(q = hash[tuple])];
|
|
table[q] = p;
|
|
}
|
|
}
|
|
else
|
|
k = j;
|
|
}
|
|
}
|
|
|
|
/* Perform pair-wise comparisons of sequences. The sequences not
|
|
satisfying the necessary condition for overlap are rejected quickly.
|
|
Those that do satisfy the condition are verified with a dynamic
|
|
programming algorithm to see if overlaps exist. */
|
|
PAIR(V, V1, Q, R, R1) int V[][128], V1[][128], Q, R, R1;
|
|
{
|
|
int endi, endj, stari, starj; /* endpoint and startpoint */
|
|
|
|
short orienti, orientj; /* orientation of segments */
|
|
short iscon; /* containment condition */
|
|
int score; /* the max score */
|
|
int count, limit; /* temporary variables */
|
|
|
|
register int i, j, d; /* row and column indices */
|
|
char *ckalloc(); /* space-allocating function */
|
|
int rl, cl;
|
|
char *A, *B;
|
|
int M, N;
|
|
overptr node1;
|
|
int total; /* total number of pairs */
|
|
int hit; /* number of pairs satisfying cond. */
|
|
int CHECK();
|
|
|
|
v = V;
|
|
q = Q;
|
|
r = R;
|
|
qr = q + r;
|
|
v1 = V1;
|
|
r1 = R1;
|
|
qr1 = q + r1;
|
|
total = hit = 0;
|
|
limit = 2 * (seg_num - 1);
|
|
for (orienti = 0, d = 0; d < limit; d++) {
|
|
i = d / 2;
|
|
orienti = 1 - orienti;
|
|
A = orienti ? segment[i].seq : segment[i].rev;
|
|
M = segment[i].length;
|
|
for (j = i + 1; j < seg_num; j++) {
|
|
B = segment[j].seq;
|
|
orientj = 1;
|
|
N = segment[j].length;
|
|
total += 1;
|
|
if (CHECK(orienti, i, j)) {
|
|
hit += 1;
|
|
SCORE = 0;
|
|
big_pass(A, B, M, N, orienti, orientj);
|
|
if (SCORE) {
|
|
score = SCORE;
|
|
stari = ++STARI;
|
|
starj = ++STARJ;
|
|
endi = ENDI;
|
|
endj = ENDJ;
|
|
rl = endi - stari + 1;
|
|
cl = endj - starj + 1;
|
|
sapp = S;
|
|
last = 0;
|
|
al_len = no_mat = no_mis = 0;
|
|
(void)diff(&A[stari] - 1, &B[starj] - 1,
|
|
rl, cl, q, q);
|
|
iscon = stari == 1 && endi == M ||
|
|
starj == 1 && endj == N;
|
|
if (no_mat >= al_len * per_cent &&
|
|
(al_len >= over_len || iscon)) {
|
|
node1 = (overptr)ckalloc(
|
|
(int)sizeof(over));
|
|
if (iscon)
|
|
node1->kind =
|
|
0; /* containment */
|
|
else {
|
|
node1->kind = 1;
|
|
edge_num++;
|
|
} /* overlap */
|
|
if (endi == M &&
|
|
(endj != N ||
|
|
starj != 1)) /*i is 5'*/
|
|
{
|
|
node1->number = j;
|
|
node1->host = i;
|
|
node1->stari = stari;
|
|
node1->endi = endi;
|
|
node1->orienti =
|
|
orienti;
|
|
node1->starj = starj;
|
|
node1->endj = endj;
|
|
node1->orientj =
|
|
orientj;
|
|
}
|
|
else /* j is 5' */
|
|
{
|
|
node1->number = i;
|
|
node1->host = j;
|
|
node1->stari = starj;
|
|
node1->endi = endj;
|
|
node1->orienti =
|
|
orientj;
|
|
node1->starj = stari;
|
|
node1->endj = endi;
|
|
node1->orientj =
|
|
orienti;
|
|
}
|
|
node1->score = score;
|
|
node1->length = al_len;
|
|
node1->match = no_mat;
|
|
count =
|
|
node1->number == i ? j : i;
|
|
node1->next =
|
|
segment[count].list;
|
|
segment[count].list = node1;
|
|
if (!node1->kind)
|
|
segment[count].kind = 0;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Return 1 if two sequences satisfy the necessary condition for overlap,
|
|
and 0 otherwise. Parameters first and second are the indices of segments,
|
|
and parameter orient indicates the orientation of segment first. */
|
|
int CHECK(orient, first, second)
|
|
short orient;
|
|
int first, second;
|
|
{
|
|
int limit, bound; /* maximum number of jumps */
|
|
int small; /* smaller of limit and bound */
|
|
float delta; /* cutoff factor */
|
|
float cut; /* cutoff score */
|
|
register int i; /* index variable */
|
|
int t, q; /* temporary variables */
|
|
int JUMP();
|
|
int RJUMP();
|
|
int JUMPC();
|
|
int RJUMPC();
|
|
|
|
delta = DELTA;
|
|
if (orient)
|
|
limit = JUMP(CC, first, second, 1);
|
|
else
|
|
limit = JUMPC(CC, first, second);
|
|
bound = RJUMP(DD, second, first, orient);
|
|
small = limit <= bound ? limit : bound;
|
|
cut = 0;
|
|
for (i = 1; i <= small; i++) {
|
|
if ((t = DD[i] - 1) >= over_len && t >= cut &&
|
|
(q = CC[i] - 1) >= over_len && q >= cut)
|
|
return (1);
|
|
cut += delta;
|
|
}
|
|
if (DD[bound] >= delta * (bound - 1) ||
|
|
CC[limit] >= delta * (limit - 1))
|
|
return (1);
|
|
limit = JUMP(CC, second, first, orient);
|
|
if (orient)
|
|
bound = RJUMP(DD, first, second, 1);
|
|
else
|
|
bound = RJUMPC(DD, first, second);
|
|
small = limit <= bound ? limit : bound;
|
|
cut = 0;
|
|
for (i = 1; i <= small; i++) {
|
|
if ((t = DD[i] - 1) >= over_len && t >= cut &&
|
|
(q = CC[i] - 1) >= over_len && q >= cut)
|
|
return (1);
|
|
cut += delta;
|
|
}
|
|
return (0);
|
|
}
|
|
|
|
/* Compute a vector of lengths of jumps */
|
|
int JUMP(H, one, two, orient)
|
|
int H[], one, two;
|
|
short orient;
|
|
{
|
|
char *A, *B; /* pointers to two sequences */
|
|
int M, N; /* lengths of two sequences */
|
|
int *table; /* pointer to a lookup table */
|
|
int *loc; /* pointer to a location table */
|
|
int value; /* value of a word */
|
|
int maxd; /* maximum length of an identical diagonal */
|
|
int d; /* length of current identical diagonal */
|
|
int s; /* length of jumps */
|
|
int k; /* number of jumps */
|
|
|
|
register int i, j, p; /* index variables */
|
|
|
|
A = segment[one].seq;
|
|
M = segment[one].length;
|
|
table = segment[one].lookup;
|
|
loc = segment[one].pos;
|
|
B = orient ? segment[two].seq : segment[two].rev;
|
|
N = segment[two].length;
|
|
for (s = 1, k = 1; s <= N; k++) {
|
|
maxd = 0;
|
|
for (value = -1, d = 0, j = s; d <= tuple && j <= N; j++, d++) {
|
|
if (vert[B[j]] == 4) break;
|
|
value = 4 * (value + 1) + vert[B[j]];
|
|
if (!table[value]) break;
|
|
}
|
|
if (d > tuple) {
|
|
for (p = table[value]; p; p = loc[p]) {
|
|
d = tuple + 1;
|
|
for (i = p + d, j = s + d; i <= M && j <= N;
|
|
i++, j++, d++)
|
|
if (A[i] != B[j] && vert[A[i]] != 4 &&
|
|
vert[B[j]] != 4)
|
|
break;
|
|
if (maxd < d) maxd = d;
|
|
if (j > N) break;
|
|
}
|
|
}
|
|
else
|
|
maxd = d;
|
|
s += maxd + 1;
|
|
H[k] = s;
|
|
}
|
|
return (k - 1);
|
|
}
|
|
|
|
/* Compute a vector of lengths of jumps for reverse complement of one */
|
|
int JUMPC(H, one, two)
|
|
int H[], one, two;
|
|
{
|
|
char *A, *B; /* pointers to two sequences */
|
|
int M, N; /* lengths of two sequences */
|
|
int *table; /* pointer to a lookup table */
|
|
int *loc; /* pointer to a location table */
|
|
int value; /* value of a word */
|
|
int maxd; /* maximum length of an identical diagonal */
|
|
int d; /* length of current identical diagonal */
|
|
int s; /* length of jumps */
|
|
int k; /* number of jumps */
|
|
|
|
register int i, j, p; /* index variables */
|
|
|
|
A = segment[one].rev;
|
|
M = segment[one].length;
|
|
table = segment[one].lookup;
|
|
loc = segment[one].pos;
|
|
B = segment[two].seq;
|
|
N = segment[two].length;
|
|
for (s = 1, k = 1; s <= N; k++) {
|
|
maxd = 0;
|
|
for (value = 0, d = 0, j = s; d <= tuple && j <= N; j++, d++) {
|
|
if (vert[B[j]] == 4) break;
|
|
value += vertc[B[j]] * power[d];
|
|
if (!table[value + base[d]]) break;
|
|
}
|
|
if (d > tuple) {
|
|
for (p = table[value + base[tuple]]; p; p = loc[p]) {
|
|
d = tuple + 1;
|
|
for (i = M + 2 - p, j = s + d; i <= M && j <= N;
|
|
i++, j++, d++)
|
|
if (A[i] != B[j] && vert[A[i]] != 4 &&
|
|
vert[B[j]] != 4)
|
|
break;
|
|
if (maxd < d) maxd = d;
|
|
if (j > N) break;
|
|
}
|
|
}
|
|
else
|
|
maxd = d;
|
|
s += maxd + 1;
|
|
H[k] = s;
|
|
}
|
|
return (k - 1);
|
|
}
|
|
|
|
/* Compute a vector of lengths of reverse jumps */
|
|
int RJUMP(H, one, two, orient)
|
|
int H[], one, two;
|
|
short orient;
|
|
{
|
|
char *A, *B; /* pointers to two sequences */
|
|
int N; /* length of a sequence */
|
|
int *table; /* pointer to a lookup table */
|
|
int *loc; /* pointer to a location table */
|
|
int value; /* value of a word */
|
|
int maxd; /* maximum length of an identical diagonal */
|
|
int d; /* length of current identical diagonal */
|
|
int s; /* length of jumps */
|
|
int k; /* number of jumps */
|
|
|
|
register int i, j, p; /* index variables */
|
|
|
|
A = segment[one].seq;
|
|
table = segment[one].lookup;
|
|
loc = segment[one].pos;
|
|
B = orient ? segment[two].seq : segment[two].rev;
|
|
N = segment[two].length;
|
|
for (s = 1, k = 1; s <= N; k++) {
|
|
maxd = 0;
|
|
for (value = 0, d = 0, j = N + 1 - s; d <= tuple && j >= 1;
|
|
j--, d++) {
|
|
if (vert[B[j]] == 4) break;
|
|
value += vert[B[j]] * power[d];
|
|
if (!table[value + base[d]]) break;
|
|
}
|
|
if (d > tuple) {
|
|
for (p = table[value + base[tuple]]; p; p = loc[p]) {
|
|
d = tuple + 1;
|
|
for (i = p - 1, j = N + 1 - s - d;
|
|
i >= 1 && j >= 1; i--, j--, d++)
|
|
if (A[i] != B[j] && vert[A[i]] != 4 &&
|
|
vert[B[j]] != 4)
|
|
break;
|
|
if (maxd < d) maxd = d;
|
|
if (j < 1) break;
|
|
}
|
|
}
|
|
else
|
|
maxd = d;
|
|
s += maxd + 1;
|
|
H[k] = s;
|
|
}
|
|
return (k - 1);
|
|
}
|
|
|
|
/* Compute a vector of lengths of reverse jumps for reverse complement */
|
|
int RJUMPC(H, one, two)
|
|
int H[], one, two;
|
|
{
|
|
char *A, *B; /* pointers to two sequences */
|
|
int M, N; /* lengths of two sequences */
|
|
int *table; /* pointer to a lookup table */
|
|
int *loc; /* pointer to a location table */
|
|
int value; /* value of a word */
|
|
int maxd; /* maximum length of an identical diagonal */
|
|
int d; /* length of current identical diagonal */
|
|
int s; /* length of jumps */
|
|
int k; /* number of jumps */
|
|
|
|
register int i, j, p; /* index variables */
|
|
|
|
A = segment[one].rev;
|
|
M = segment[one].length;
|
|
table = segment[one].lookup;
|
|
loc = segment[one].pos;
|
|
B = segment[two].seq;
|
|
N = segment[two].length;
|
|
for (s = 1, k = 1; s <= N; k++) {
|
|
maxd = 0;
|
|
for (value = -1, d = 0, j = N + 1 - s; d <= tuple && j >= 1;
|
|
j--, d++) {
|
|
if (vert[B[j]] == 4) break;
|
|
value = 4 * (value + 1) + vertc[B[j]];
|
|
if (!table[value]) break;
|
|
}
|
|
if (d > tuple) {
|
|
for (p = table[value]; p; p = loc[p]) {
|
|
d = tuple + 1;
|
|
i = M - p - tuple;
|
|
for (j = N - s - tuple; i >= 1 && j >= 1;
|
|
i--, j--, d++)
|
|
if (A[i] != B[j] && vert[A[i]] != 4 &&
|
|
vert[B[j]] != 4)
|
|
break;
|
|
if (maxd < d) maxd = d;
|
|
if (j < 1) break;
|
|
}
|
|
}
|
|
else
|
|
maxd = d;
|
|
s += maxd + 1;
|
|
H[k] = s;
|
|
}
|
|
return (k - 1);
|
|
}
|
|
|
|
/* Construct contigs */
|
|
ASSEM()
|
|
{
|
|
char *ckalloc(); /* space-allocating function */
|
|
register int i, j, k; /* index variables */
|
|
overptr node1, x, y; /* temporary pointer */
|
|
int five, three; /* indices of 5' and 3' segments */
|
|
short orienti; /* orientation of 5' segment */
|
|
short orientj; /* orientation of 3' segment */
|
|
short sorted; /* boolean variable */
|
|
|
|
contigs = (struct CONS *)ckalloc(seg_num * sizeof(struct CONS));
|
|
for (i = 0; i < seg_num; i++) {
|
|
contigs[i].isfive[0] = contigs[i].isthree[0] = 1;
|
|
contigs[i].isfive[1] = contigs[i].isthree[1] = 1;
|
|
contigs[i].other = i;
|
|
contigs[i].group = contigs[i].child = -1;
|
|
contigs[i].brother = contigs[i].father = -1;
|
|
}
|
|
for (i = 0; i < seg_num; i++)
|
|
if (!segment[i].kind)
|
|
for (;;) {
|
|
for (y = segment[i].list; y->kind; y = y->next)
|
|
;
|
|
for (x = y->next; x != 0; x = x->next)
|
|
if (!x->kind && x->score > y->score)
|
|
y = x;
|
|
for (j = y->number;
|
|
(k = contigs[j].father) != -1; j = k)
|
|
;
|
|
if (j != i) {
|
|
contigs[i].father = j = y->number;
|
|
contigs[i].brother = contigs[j].child;
|
|
contigs[j].child = i;
|
|
contigs[i].node[1] = y;
|
|
break;
|
|
}
|
|
else {
|
|
if (segment[i].list->number ==
|
|
y->number)
|
|
segment[i].list = y->next;
|
|
else {
|
|
for (x = segment[i].list;
|
|
x->next->number !=
|
|
y->number;)
|
|
x = x->next;
|
|
x->next = y->next;
|
|
}
|
|
for (x = segment[i].list;
|
|
x != 0 && x->kind; x = x->next)
|
|
;
|
|
if (x == 0) {
|
|
segment[i].kind = 1;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
edge = (overptr *)ckalloc(edge_num * sizeof(overptr));
|
|
for (j = 0, i = 0; i < seg_num; i++)
|
|
if (segment[i].kind)
|
|
for (node1 = segment[i].list; node1 != 0;
|
|
node1 = node1->next)
|
|
if (segment[node1->number].kind)
|
|
edge[j++] = node1;
|
|
edge_num = j;
|
|
for (i = edge_num - 1; i > 0; i--) {
|
|
sorted = 1;
|
|
for (j = 0; j < i; j++)
|
|
if (edge[j]->score < edge[j + 1]->score) {
|
|
node1 = edge[j];
|
|
edge[j] = edge[j + 1];
|
|
edge[j + 1] = node1;
|
|
sorted = 0;
|
|
}
|
|
if (sorted) break;
|
|
}
|
|
for (k = 0; k < edge_num; k++) {
|
|
five = edge[k]->host;
|
|
three = edge[k]->number;
|
|
orienti = edge[k]->orienti;
|
|
orientj = edge[k]->orientj;
|
|
if (contigs[five].isthree[orienti] &&
|
|
contigs[three].isfive[orientj] &&
|
|
contigs[five].other != three) {
|
|
contigs[five].isthree[orienti] = 0;
|
|
contigs[three].isfive[orientj] = 0;
|
|
contigs[five].next[orienti] = three;
|
|
contigs[five].orient[orienti] = orientj;
|
|
contigs[five].node[orienti] = edge[k];
|
|
contigs[three].isthree[(j = 1 - orientj)] = 0;
|
|
contigs[five].isfive[(i = 1 - orienti)] = 0;
|
|
contigs[three].next[j] = five;
|
|
contigs[three].orient[j] = i;
|
|
contigs[three].node[j] = edge[k];
|
|
i = contigs[three].other;
|
|
j = contigs[five].other;
|
|
contigs[i].other = j;
|
|
contigs[j].other = i;
|
|
}
|
|
}
|
|
}
|
|
|
|
REPAIR()
|
|
{
|
|
int endi, endj, stari, starj; /* endpoint and startpoint */
|
|
|
|
short orienti, orientj; /* orientation of segments */
|
|
short isconi, isconj; /* containment condition */
|
|
int score; /* the max score */
|
|
int i, j, f, d, e; /* row and column indices */
|
|
char *ckalloc(); /* space-allocating function */
|
|
char *A, *B;
|
|
int M, N;
|
|
overptr node1;
|
|
int piece_num; /* The number of pieces */
|
|
int count, limit;
|
|
int number;
|
|
int hit;
|
|
|
|
piece = (struct VX *)ckalloc(seg_num * sizeof(struct VX));
|
|
for (j = 0, i = 0; i < seg_num; i++)
|
|
if (segment[i].kind &&
|
|
(contigs[i].isfive[1] || contigs[i].isfive[0]))
|
|
piece[j++].id = i;
|
|
piece_num = j;
|
|
for (i = 0; i < piece_num; i++) {
|
|
piece[i].kind = 1;
|
|
piece[i].list = 0;
|
|
}
|
|
limit = 2 * (piece_num - 1);
|
|
hit = number = 0;
|
|
for (orienti = 0, d = 0; d < limit; d++) {
|
|
i = piece[(e = d / 2)].id;
|
|
orienti = 1 - orienti;
|
|
A = orienti ? segment[i].seq : segment[i].rev;
|
|
M = segment[i].length;
|
|
for (f = e + 1; f < piece_num; f++) {
|
|
j = piece[f].id;
|
|
B = segment[j].seq;
|
|
orientj = 1;
|
|
N = segment[j].length;
|
|
SCORE = 0;
|
|
hit++;
|
|
big_pass(A, B, M, N, orienti, orientj);
|
|
if (SCORE > CUTOFF) {
|
|
score = SCORE;
|
|
stari = ++STARI;
|
|
starj = ++STARJ;
|
|
endi = ENDI;
|
|
endj = ENDJ;
|
|
isconi = stari == 1 && endi == M;
|
|
isconj = starj == 1 && endj == N;
|
|
node1 = (overptr)ckalloc((int)sizeof(over));
|
|
if (isconi || isconj)
|
|
node1->kind = 0; /* containment */
|
|
else {
|
|
node1->kind = 1;
|
|
number++;
|
|
} /* overlap */
|
|
if (endi == M && !isconj) /*i is 5'*/
|
|
{
|
|
node1->number = j;
|
|
node1->host = i;
|
|
node1->ind = f;
|
|
node1->stari = stari;
|
|
node1->endi = endi;
|
|
node1->orienti = orienti;
|
|
node1->starj = starj;
|
|
node1->endj = endj;
|
|
node1->orientj = orientj;
|
|
}
|
|
else /* j is 5' */
|
|
{
|
|
node1->number = i;
|
|
node1->host = j;
|
|
node1->ind = e;
|
|
node1->stari = starj;
|
|
node1->endi = endj;
|
|
node1->orienti = orientj;
|
|
node1->starj = stari;
|
|
node1->endj = endi;
|
|
node1->orientj = orienti;
|
|
}
|
|
node1->score = score;
|
|
count = node1->number == i ? f : e;
|
|
node1->next = piece[count].list;
|
|
piece[count].list = node1;
|
|
if (!node1->kind) piece[count].kind = 0;
|
|
}
|
|
}
|
|
}
|
|
REASSEM(piece_num, number);
|
|
}
|
|
|
|
/* Construct contigs */
|
|
REASSEM(piece_num, number) int piece_num, number;
|
|
{
|
|
char *ckalloc(); /* space-allocating function */
|
|
int i, j, k, d; /* index variables */
|
|
overptr node1, x, y; /* temporary pointer */
|
|
int five, three; /* indices of 5' and 3' segments */
|
|
short orienti; /* orientation of 5' segment */
|
|
short orientj; /* orientation of 3' segment */
|
|
short sorted; /* boolean variable */
|
|
|
|
for (d = 0; d < piece_num; d++)
|
|
if (!piece[d].kind)
|
|
for (i = piece[d].id;;) {
|
|
for (y = piece[d].list; y->kind; y = y->next)
|
|
;
|
|
for (x = y->next; x != 0; x = x->next)
|
|
if (!x->kind && x->score > y->score)
|
|
y = x;
|
|
for (j = y->number;
|
|
(k = contigs[j].father) != -1; j = k)
|
|
;
|
|
if (j != i &&
|
|
RECONCILE(y, &piece_num, &number)) {
|
|
contigs[i].father = j = y->number;
|
|
contigs[i].brother = contigs[j].child;
|
|
contigs[j].child = i;
|
|
contigs[i].node[1] = y;
|
|
segment[i].kind = 0;
|
|
break;
|
|
}
|
|
else {
|
|
if (piece[d].list->number == y->number)
|
|
piece[d].list = y->next;
|
|
else {
|
|
for (x = piece[d].list;
|
|
x->next->number !=
|
|
y->number;)
|
|
x = x->next;
|
|
x->next = y->next;
|
|
}
|
|
for (x = piece[d].list;
|
|
x != 0 && x->kind; x = x->next)
|
|
;
|
|
if (x == 0) {
|
|
piece[d].kind = 1;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
if (number > edge_num)
|
|
edge = (overptr *)ckalloc(number * sizeof(overptr));
|
|
for (j = 0, d = 0; d < piece_num; d++)
|
|
if (piece[d].kind)
|
|
for (node1 = piece[d].list; node1 != 0;
|
|
node1 = node1->next)
|
|
if (piece[node1->ind].kind) edge[j++] = node1;
|
|
edge_num = j;
|
|
for (i = edge_num - 1; i > 0; i--) {
|
|
sorted = 1;
|
|
for (j = 0; j < i; j++)
|
|
if (edge[j]->score < edge[j + 1]->score) {
|
|
node1 = edge[j];
|
|
edge[j] = edge[j + 1];
|
|
edge[j + 1] = node1;
|
|
sorted = 0;
|
|
}
|
|
if (sorted) break;
|
|
}
|
|
for (k = 0; k < edge_num; k++) {
|
|
five = edge[k]->host;
|
|
three = edge[k]->number;
|
|
orienti = edge[k]->orienti;
|
|
orientj = edge[k]->orientj;
|
|
if (contigs[five].isthree[orienti] &&
|
|
contigs[three].isfive[orientj] &&
|
|
contigs[five].other != three) {
|
|
contigs[five].isthree[orienti] = 0;
|
|
contigs[three].isfive[orientj] = 0;
|
|
contigs[five].next[orienti] = three;
|
|
contigs[five].orient[orienti] = orientj;
|
|
contigs[five].node[orienti] = edge[k];
|
|
contigs[three].isthree[(j = 1 - orientj)] = 0;
|
|
contigs[five].isfive[(i = 1 - orienti)] = 0;
|
|
contigs[three].next[j] = five;
|
|
contigs[three].orient[j] = i;
|
|
contigs[three].node[j] = edge[k];
|
|
i = contigs[three].other;
|
|
j = contigs[five].other;
|
|
contigs[i].other = j;
|
|
contigs[j].other = i;
|
|
}
|
|
}
|
|
}
|
|
|
|
RECONCILE(y, pp, nn) overptr y;
|
|
int *pp, *nn;
|
|
{
|
|
short orienti, orientj; /* orientation of segments */
|
|
short orientk, orientd; /* orientation of segments */
|
|
int i, j, k, d, f; /* row and column indices */
|
|
char *ckalloc(); /* space-allocating function */
|
|
char *A, *B;
|
|
int M, N;
|
|
overptr node1;
|
|
|
|
k = y->host;
|
|
d = y->number;
|
|
orientk = y->orienti;
|
|
orientd = y->orientj;
|
|
if (!contigs[k].isthree[orientk]) {
|
|
if (!piece[y->ind].kind) return (0);
|
|
if (contigs[d].isthree[orientd]) {
|
|
orienti = orientd;
|
|
i = d;
|
|
orientj = contigs[k].orient[orientk];
|
|
j = contigs[k].next[orientk];
|
|
}
|
|
else
|
|
return (0);
|
|
}
|
|
else if (!contigs[k].isfive[orientk]) {
|
|
if (!piece[y->ind].kind) return (0);
|
|
if (contigs[d].isfive[orientd]) {
|
|
orienti = contigs[k].orient[1 - orientk];
|
|
orienti = 1 - orienti;
|
|
i = contigs[k].next[1 - orientk];
|
|
orientj = orientd;
|
|
j = d;
|
|
}
|
|
else
|
|
return (0);
|
|
}
|
|
else
|
|
return (0);
|
|
A = orienti ? segment[i].seq : segment[i].rev;
|
|
M = segment[i].length;
|
|
B = orientj ? segment[j].seq : segment[j].rev;
|
|
N = segment[j].length;
|
|
SCORE = 0;
|
|
big_pass(A, B, M, N, orienti, orientj);
|
|
if (SCORE > CUTOFF && ENDI - STARI > over_len && ENDI == M &&
|
|
STARJ == 0) {
|
|
node1 = (overptr)ckalloc((int)sizeof(over));
|
|
node1->kind = 1;
|
|
node1->host = i;
|
|
node1->number = j;
|
|
node1->stari = ++STARI;
|
|
node1->endi = ENDI;
|
|
node1->orienti = orienti;
|
|
node1->starj = ++STARJ;
|
|
node1->endj = ENDJ;
|
|
node1->orientj = orientj;
|
|
node1->score = SCORE;
|
|
piece[*pp].kind = 1;
|
|
if (i == d) {
|
|
node1->ind = *pp;
|
|
node1->next = piece[y->ind].list;
|
|
piece[y->ind].list = node1;
|
|
piece[*pp].id = j;
|
|
piece[*pp].list = 0;
|
|
}
|
|
else {
|
|
node1->ind = y->ind;
|
|
piece[*pp].list = node1;
|
|
node1->next = 0;
|
|
piece[*pp].id = i;
|
|
}
|
|
(*nn)++;
|
|
(*pp)++;
|
|
f = contigs[k].other;
|
|
if (!contigs[k].isthree[orientk]) {
|
|
contigs[j].isfive[orientj] = 1;
|
|
contigs[j].isthree[1 - orientj] = 1;
|
|
contigs[k].isthree[orientk] = 1;
|
|
contigs[k].isfive[1 - orientk] = 1;
|
|
contigs[f].other = j;
|
|
contigs[j].other = f;
|
|
}
|
|
else {
|
|
contigs[i].isthree[orienti] = 1;
|
|
contigs[i].isfive[1 - orienti] = 1;
|
|
contigs[k].isfive[orientk] = 1;
|
|
contigs[k].isthree[1 - orientk] = 1;
|
|
contigs[f].other = i;
|
|
contigs[i].other = f;
|
|
}
|
|
contigs[k].other = k;
|
|
return (1);
|
|
}
|
|
return (0);
|
|
}
|
|
|
|
/* Construct a tree of overlapping-containment segments */
|
|
FORM_TREE()
|
|
{
|
|
register int i, j, k; /* index variables */
|
|
char *ckalloc(); /* space-allocating function */
|
|
overptr node1; /* temporary pointer */
|
|
short orient; /* orientation of segment */
|
|
int group; /* serial number of contigs */
|
|
char *A, *B; /* pointers to segment sequences */
|
|
int stari, endi, starj, endj; /* positions where alignment begins */
|
|
int M, N; /* lengths of segment sequences */
|
|
int count; /* temporary variables */
|
|
|
|
mtree = (struct TTREE *)ckalloc(seg_num * sizeof(struct TTREE));
|
|
for (i = 0; i < seg_num; i++) {
|
|
mtree[i].head = 0;
|
|
mtree[i].next = mtree[i].child = mtree[i].brother = -1;
|
|
}
|
|
for (group = 0, i = 0; i < seg_num; i++)
|
|
if (segment[i].kind && contigs[i].group < 0 &&
|
|
(contigs[i].isfive[1] || contigs[i].isfive[0])) {
|
|
orient = contigs[i].isfive[1] ? 1 : 0;
|
|
mtree[i].head = 1;
|
|
for (j = i;;) {
|
|
contigs[j].group = group;
|
|
mtree[j].orient = orient;
|
|
SORT(j, orient);
|
|
if (contigs[j].isthree[orient])
|
|
break;
|
|
else {
|
|
k = contigs[j].next[orient];
|
|
node1 = contigs[j].node[orient];
|
|
if (j == node1->host) {
|
|
stari = node1->stari;
|
|
endi = node1->endi;
|
|
starj = node1->starj;
|
|
endj = node1->endj;
|
|
A = node1->orienti
|
|
? segment[j].seq
|
|
: segment[j].rev;
|
|
B = node1->orientj
|
|
? segment[k].seq
|
|
: segment[k].rev;
|
|
}
|
|
else {
|
|
M = segment[j].length;
|
|
stari = M + 1 - node1->endj;
|
|
endi = M + 1 - node1->starj;
|
|
N = segment[k].length;
|
|
starj = N + 1 - node1->endi;
|
|
endj = N + 1 - node1->stari;
|
|
A = node1->orientj
|
|
? segment[j].rev
|
|
: segment[j].seq;
|
|
B = node1->orienti
|
|
? segment[k].rev
|
|
: segment[k].seq;
|
|
}
|
|
M = endi - stari + 1;
|
|
N = endj - starj + 1;
|
|
sapp = S;
|
|
last = 0;
|
|
al_len = no_mat = no_mis = 0;
|
|
(void)diff(&A[stari] - 1, &B[starj] - 1,
|
|
M, N, q, q);
|
|
count =
|
|
((N = sapp - S) + 1) * sizeof(int);
|
|
mtree[k].script = (int *)ckalloc(count);
|
|
for (M = 0; M < N; M++)
|
|
mtree[k].script[M] = S[M];
|
|
mtree[k].size = N;
|
|
mtree[k].begin = stari;
|
|
mtree[j].next = k;
|
|
orient = contigs[j].orient[orient];
|
|
j = k;
|
|
}
|
|
}
|
|
group++;
|
|
}
|
|
}
|
|
|
|
/* Sort the children of each node by the `begin' field */
|
|
SORT(seg, ort) int seg;
|
|
short ort;
|
|
{
|
|
register int i, j, k; /* index variables */
|
|
char *ckalloc(); /* space-allocating function */
|
|
overptr node1; /* temporary pointer */
|
|
short orient; /* orientation of segment */
|
|
char *A, *B; /* pointers to segment sequences */
|
|
int stari, endi, starj, endj; /* positions where alignment begins */
|
|
int M, N; /* lengths of segment sequences */
|
|
int count; /* temporary variables */
|
|
|
|
for (j = contigs[seg].child; j != -1; j = contigs[j].brother) {
|
|
node1 = contigs[j].node[1];
|
|
if (ort == node1->orientj) {
|
|
stari = node1->starj;
|
|
endi = node1->endj;
|
|
starj = node1->stari;
|
|
endj = node1->endi;
|
|
A = node1->orientj ? segment[seg].seq
|
|
: segment[seg].rev;
|
|
B = node1->orienti ? segment[j].seq : segment[j].rev;
|
|
orient = node1->orienti;
|
|
}
|
|
else {
|
|
M = segment[seg].length;
|
|
stari = M + 1 - node1->endj;
|
|
endi = M + 1 - node1->starj;
|
|
N = segment[j].length;
|
|
starj = N + 1 - node1->endi;
|
|
endj = N + 1 - node1->stari;
|
|
A = node1->orientj ? segment[seg].rev
|
|
: segment[seg].seq;
|
|
B = node1->orienti ? segment[j].rev : segment[j].seq;
|
|
orient = 1 - node1->orienti;
|
|
}
|
|
M = endi - stari + 1;
|
|
N = endj - starj + 1;
|
|
sapp = S;
|
|
last = 0;
|
|
al_len = no_mat = no_mis = 0;
|
|
(void)diff(&A[stari] - 1, &B[starj] - 1, M, N, q, q);
|
|
count = ((M = sapp - S) + 1) * sizeof(int);
|
|
mtree[j].script = (int *)ckalloc(count);
|
|
for (k = 0; k < M; k++) mtree[j].script[k] = S[k];
|
|
mtree[j].size = M;
|
|
mtree[j].begin = stari;
|
|
mtree[j].orient = orient;
|
|
if (mtree[seg].child == -1)
|
|
mtree[seg].child = j;
|
|
else {
|
|
i = mtree[seg].child;
|
|
if (mtree[i].begin >= stari) {
|
|
mtree[j].brother = i;
|
|
mtree[seg].child = j;
|
|
}
|
|
else {
|
|
M = mtree[i].brother;
|
|
for (; M != -1; i = M, M = mtree[M].brother)
|
|
if (mtree[M].begin >= stari) break;
|
|
mtree[j].brother = M;
|
|
mtree[i].brother = j;
|
|
}
|
|
}
|
|
SORT(j, orient);
|
|
}
|
|
}
|
|
|
|
/* Display the alignments of segments */
|
|
SHOW()
|
|
{
|
|
register int i, j, k; /* index variables */
|
|
char *ckalloc(); /* space-allocating function */
|
|
int n; /* number of working segments */
|
|
int limit; /* number of slots in work */
|
|
int col; /* number of output columns prepared */
|
|
short done; /* tells if current group is done */
|
|
rowptr root; /* pointer to root of op tree */
|
|
int sym[6]; /* occurrance counts for six chars */
|
|
char c; /* temp variable */
|
|
rowptr t, w, yy; /* temp pointer */
|
|
int x; /* temp variables */
|
|
int group; /* Contigs number */
|
|
char conlit[20], *a; /* String form of contig number */
|
|
char *spt; /* pointer to the start of consensus */
|
|
|
|
work = (rowptr *)ckalloc(seg_num * sizeof(rowptr));
|
|
group = 0;
|
|
yy = 0;
|
|
for (j = 0; j < 6; j++) sym[j] = 0;
|
|
n = limit = col = 0;
|
|
for (i = 0; i < seg_num; i++)
|
|
if (mtree[i].head) {
|
|
(void)sprintf(conlit, ">Contig %d\n", group);
|
|
for (a = conlit; *a;) *allconpt++ = *a++;
|
|
/* Mod by S.S.
|
|
(void) printf("\n#Contig %d\n\n", group++);
|
|
*/
|
|
group++;
|
|
done = 0;
|
|
ENTER(&limit, &n, i, col, yy);
|
|
root = work[0];
|
|
spt = allconpt;
|
|
while (!done) {
|
|
for (j = 0; j < n;
|
|
j++) /* get segments into work */
|
|
{
|
|
t = work[j];
|
|
k = t->id;
|
|
if ((x = mtree[k].next) != -1 &&
|
|
mtree[x].begin == t->loc) {
|
|
ENTER(&limit, &n, x, col, t);
|
|
mtree[k].next = -1;
|
|
}
|
|
for (x = mtree[k].child; x != -1;
|
|
x = mtree[x].brother)
|
|
if (mtree[x].begin == t->loc) {
|
|
ENTER(&limit, &n, x,
|
|
col, t);
|
|
mtree[k].child =
|
|
mtree[x].brother;
|
|
}
|
|
else
|
|
break;
|
|
}
|
|
COLUMN(root); /* determine next column */
|
|
root->c = root->kind;
|
|
for (t = head; t != 0; t = t->link)
|
|
t->c = t->kind;
|
|
for (j = 0; j < n; j++) {
|
|
t = work[j];
|
|
if (t->done)
|
|
*t->a++ = ' ';
|
|
else {
|
|
if (t->c == 'L') {
|
|
if (t->loc == 1)
|
|
t->offset =
|
|
allconpt -
|
|
spt;
|
|
c = *t->a++ =
|
|
t->seq[t->loc++];
|
|
}
|
|
else if (t->loc > 1)
|
|
c = *t->a++ = '-';
|
|
else
|
|
c = *t->a++ = ' ';
|
|
if (c != ' ')
|
|
if (c == '-')
|
|
sym[5] += 1;
|
|
else
|
|
sym[vert[c]] +=
|
|
1;
|
|
t->c = ' ';
|
|
}
|
|
}
|
|
/* determine consensus char */
|
|
k = sym[0] + sym[1] + sym[2] + sym[3] + sym[4];
|
|
if (k < sym[5])
|
|
*allconpt++ = '-';
|
|
else if (sym[0] == sym[1] && sym[1] == sym[2] &&
|
|
sym[2] == sym[3])
|
|
*allconpt++ = 'N';
|
|
else {
|
|
k = sym[0];
|
|
c = 'A';
|
|
if (k < sym[1]) {
|
|
k = sym[1];
|
|
c = 'C';
|
|
}
|
|
if (k < sym[2]) {
|
|
k = sym[2];
|
|
c = 'G';
|
|
}
|
|
if (k < sym[3]) c = 'T';
|
|
*allconpt++ = c;
|
|
}
|
|
for (j = 0; j < 6; j++) sym[j] = 0;
|
|
for (t = head; t != 0; t = t->link) {
|
|
NEXTOP(t);
|
|
if (t->done) /* delete it from op tree
|
|
*/
|
|
{
|
|
w = t->father;
|
|
if (w->child->id == t->id)
|
|
w->child = t->brother;
|
|
else {
|
|
w = w->child;
|
|
for (; w->brother->id !=
|
|
t->id;
|
|
w = w->brother)
|
|
;
|
|
w->brother = t->brother;
|
|
}
|
|
}
|
|
}
|
|
if (root->loc >
|
|
root->length) /* check root node */
|
|
{
|
|
root->done = 1;
|
|
if ((w = root->child) != 0) {
|
|
w->father = 0;
|
|
root = w;
|
|
}
|
|
else
|
|
done = 1;
|
|
}
|
|
col++;
|
|
if (col == LINELEN || done) /* output */
|
|
{
|
|
col = 0;
|
|
for (j = 0; j < n; j++) {
|
|
t = work[j];
|
|
if (t->done)
|
|
/*
|
|
Mod by S.S.
|
|
{ (void) printf("#");
|
|
for ( a = t->name; *a; a++ )
|
|
(void) printf("%c", *a);
|
|
*/
|
|
{
|
|
int jj;
|
|
(void)printf(
|
|
"{\nname ");
|
|
for (jj = 0;
|
|
jj <
|
|
strlen(t->name) -
|
|
1;
|
|
jj++)
|
|
(void)printf(
|
|
"%c",
|
|
t->name
|
|
[jj]);
|
|
printf(
|
|
"\nstrandedness"
|
|
" %c\n",
|
|
t->name[strlen(
|
|
t->name)] == '+'
|
|
? '1'
|
|
: '2');
|
|
|
|
printf(
|
|
"offset "
|
|
"%d\ntype "
|
|
"DNA\ngroup-"
|
|
"ID %d\nsequence \"\n",
|
|
t->offset, group);
|
|
for (k = 0, a = t->line;
|
|
a != t->a; a++)
|
|
if (*a != ' ') {
|
|
k++;
|
|
(void)printf(
|
|
"%"
|
|
"c",
|
|
*a);
|
|
if (k ==
|
|
LINELEN) {
|
|
(void)printf(
|
|
"\n");
|
|
k = 0;
|
|
}
|
|
}
|
|
/*
|
|
if ( k )
|
|
*/
|
|
(void)printf("\"\n}\n");
|
|
}
|
|
if (t->linesize -
|
|
(t->a - t->line) <
|
|
LINELEN + 3)
|
|
ALOC_SEQ(t);
|
|
}
|
|
if (!done) {
|
|
for (k = j = n - 1; j >= 0; j--)
|
|
if (work[j]->done) {
|
|
t = work[j];
|
|
for (x = j;
|
|
x < k; x++)
|
|
work[x] = work
|
|
[x +
|
|
1];
|
|
work[k--] = t;
|
|
}
|
|
n = k + 1;
|
|
}
|
|
else
|
|
n = 0;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/* allocate more space for output fragment */
|
|
ALOC_SEQ(t) rowptr t;
|
|
{
|
|
char *start, *end, *p;
|
|
t->linesize *= 2;
|
|
start = t->line;
|
|
end = t->a;
|
|
t->line = ckalloc(t->linesize * sizeof(char));
|
|
for (t->a = t->line, p = start; p != end;) *t->a++ = *p++;
|
|
free(start);
|
|
}
|
|
|
|
/* enter a segment into working set */
|
|
ENTER(b, d, id, pos, par) int *b, *d, id, pos;
|
|
rowptr par;
|
|
{
|
|
int i;
|
|
char *ckalloc(); /* space-allocating function */
|
|
rowptr t;
|
|
|
|
if (*b <= *d) {
|
|
work[*b] = (rowptr)ckalloc((int)sizeof(row));
|
|
work[*b]->line = (char *)ckalloc(SEQLEN * sizeof(char));
|
|
work[*b]->linesize = SEQLEN;
|
|
*b += 1;
|
|
}
|
|
t = work[*d];
|
|
*d += 1;
|
|
t->a = t->line;
|
|
for (i = 0; i < pos; i++) *t->a++ = ' ';
|
|
t->c = ' ';
|
|
t->seq = mtree[id].orient ? segment[id].seq : segment[id].rev;
|
|
t->length = segment[id].length;
|
|
t->id = id;
|
|
if (par != 0) {
|
|
t->s = mtree[id].script;
|
|
t->size = mtree[id].size;
|
|
}
|
|
t->op = 0;
|
|
for (i = 1; i <= segment[id].len && i <= NAMELEN; i++)
|
|
t->name[i - 1] = segment[id].name[i];
|
|
if (mtree[id].orient)
|
|
t->name[i - 1] = '+';
|
|
else
|
|
t->name[i - 1] = '-';
|
|
t->name[i] = '\0';
|
|
t->done = 0;
|
|
t->loc = 1;
|
|
t->child = 0;
|
|
t->father = par;
|
|
if (par != 0) {
|
|
t->brother = par->child;
|
|
par->child = t;
|
|
NEXTOP(t);
|
|
}
|
|
}
|
|
|
|
/* get the next operation */
|
|
NEXTOP(t) rowptr t;
|
|
{
|
|
if (t->size || t->op)
|
|
if (t->op == 0 && *t->s == 0) {
|
|
t->op = *t->s++;
|
|
t->size--;
|
|
t->up = 'L';
|
|
t->dw = 'L';
|
|
}
|
|
else {
|
|
if (t->op == 0) {
|
|
t->op = *t->s++;
|
|
t->size--;
|
|
}
|
|
if (t->op > 0) {
|
|
t->up = '-';
|
|
t->dw = 'L';
|
|
t->op--;
|
|
}
|
|
else {
|
|
t->up = 'L';
|
|
t->dw = '-';
|
|
t->op++;
|
|
}
|
|
}
|
|
else if (t->loc > t->length)
|
|
t->done = 1;
|
|
}
|
|
|
|
COLUMN(x) rowptr x;
|
|
{
|
|
rowptr y;
|
|
rowptr start, end; /* first and last nodes for subtree */
|
|
|
|
if (x->child == 0) {
|
|
head = tail = 0;
|
|
x->kind = 'L';
|
|
}
|
|
else {
|
|
start = end = 0;
|
|
x->kind = 'L';
|
|
for (y = x->child; y != 0; y = y->brother) {
|
|
COLUMN(y);
|
|
if (x->kind == y->up)
|
|
if (y->kind == y->dw) {
|
|
if (head == 0) {
|
|
y->link = 0;
|
|
head = tail = y;
|
|
}
|
|
else {
|
|
y->link = head;
|
|
head = y;
|
|
}
|
|
if (end == 0)
|
|
start = head;
|
|
else
|
|
end->link = head;
|
|
end = tail;
|
|
}
|
|
else if (y->kind == '-') {
|
|
start = head;
|
|
end = tail;
|
|
x->kind = '-';
|
|
}
|
|
else {
|
|
y->link = 0;
|
|
y->kind = '-';
|
|
if (end == 0)
|
|
start = end = y;
|
|
else {
|
|
end->link = y;
|
|
end = y;
|
|
}
|
|
}
|
|
else if (y->kind == y->dw)
|
|
if (x->kind == '-')
|
|
;
|
|
else {
|
|
if (head == 0) {
|
|
y->link = 0;
|
|
head = tail = y;
|
|
}
|
|
else {
|
|
y->link = head;
|
|
head = y;
|
|
}
|
|
start = head;
|
|
end = tail;
|
|
x->kind = '-';
|
|
}
|
|
else if (x->kind == '-')
|
|
if (y->kind == '-') {
|
|
if (end == 0) {
|
|
start = head;
|
|
end = tail;
|
|
}
|
|
else if (head == 0)
|
|
/* code folded from here */
|
|
;
|
|
/* unfolding */
|
|
else {
|
|
/* code folded from here */
|
|
end->link = head;
|
|
end = tail;
|
|
/* unfolding */
|
|
}
|
|
}
|
|
else
|
|
;
|
|
else {
|
|
start = head;
|
|
end = tail;
|
|
x->kind = '-';
|
|
}
|
|
}
|
|
head = start;
|
|
tail = end;
|
|
}
|
|
}
|
|
|
|
/* Display a summary of contigs */
|
|
GRAPH()
|
|
{
|
|
int i, j, k; /* index variables */
|
|
int group; /* serial number of contigs */
|
|
char name[NAMELEN + 2]; /* name of segment */
|
|
char *t; /* temp var */
|
|
int length; /* length of name */
|
|
|
|
(void)printf("\nOVERLAPS CONTAINMENTS\n\n");
|
|
group = 1;
|
|
for (i = 0; i < seg_num; i++)
|
|
if (mtree[i].head) {
|
|
(void)printf(
|
|
"******************* Contig %d "
|
|
"********************\n",
|
|
group++);
|
|
for (j = i; j != -1; j = mtree[j].next) {
|
|
length = segment[j].len;
|
|
t = segment[j].name + 1;
|
|
for (k = 0; k < length && k < NAMELEN; k++)
|
|
name[k] = *t++;
|
|
if (mtree[j].orient)
|
|
name[k] = '+';
|
|
else
|
|
name[k] = '-';
|
|
name[k + 1] = '\0';
|
|
(void)printf("%s\n", name);
|
|
CONTAIN(mtree[j].child, name);
|
|
}
|
|
}
|
|
}
|
|
|
|
CONTAIN(id, f) int id;
|
|
char *f;
|
|
{
|
|
int k; /* index variable */
|
|
char name[NAMELEN + 2]; /* name of segment */
|
|
char *t; /* temp var */
|
|
int length; /* length of name */
|
|
|
|
if (id != -1) {
|
|
length = segment[id].len;
|
|
t = segment[id].name + 1;
|
|
for (k = 0; k < length && k < NAMELEN; k++) name[k] = *t++;
|
|
if (mtree[id].orient)
|
|
name[k] = '+';
|
|
else
|
|
name[k] = '-';
|
|
name[k + 1] = '\0';
|
|
(void)printf(" %s is in %s\n", name, f);
|
|
CONTAIN(mtree[id].child, name);
|
|
CONTAIN(mtree[id].brother, f);
|
|
}
|
|
}
|
|
|
|
big_pass(A, B, M, N, orienti, orientj) char A[], B[];
|
|
int M, N;
|
|
short orienti, orientj;
|
|
{
|
|
register int i, j; /* row and column indices */
|
|
register int c; /* best score at current point */
|
|
register int f; /* best score ending with insertion */
|
|
register int d; /* best score ending with deletion */
|
|
register int p; /* best score at (i-1, j-1) */
|
|
register int ci; /* end-point associated with c */
|
|
|
|
register int di; /* end-point associated with d */
|
|
register int fi; /* end-point associated with f */
|
|
register int pi; /* end-point associated with p */
|
|
int *va; /* pointer to v(A[i], B[j]) */
|
|
int x1,
|
|
x2; /* regions of A before x1 or after x2 are lightly penalized */
|
|
int y1,
|
|
y2; /* regions of B before y1 or after y2 are lightly penalized */
|
|
short heavy; /* 1 = heavy penalty */
|
|
int ex, gx; /* current gap penalty scores */
|
|
|
|
/* determine x1, x2, y1, y2 */
|
|
if (POS5 >= POS3)
|
|
fatal(
|
|
"The value for POS5 must be less than the value for POS3");
|
|
if (orienti) {
|
|
x1 = POS5 >= M ? 1 : POS5;
|
|
x2 = POS3 >= M ? M : POS3;
|
|
}
|
|
else {
|
|
x1 = POS3 >= M ? 1 : M - POS3 + 1;
|
|
x2 = POS5 >= M ? M : M - POS5 + 1;
|
|
}
|
|
if (orientj) {
|
|
y1 = POS5 >= N ? 1 : POS5;
|
|
y2 = POS3 >= N ? N : POS3;
|
|
}
|
|
else {
|
|
y1 = POS3 >= N ? 1 : N - POS3 + 1;
|
|
y2 = POS5 >= N ? N : N - POS5 + 1;
|
|
}
|
|
if (x1 + 1 <= x2) x1++;
|
|
if (y1 + 1 <= y2) y1++;
|
|
heavy = 0;
|
|
|
|
/* Compute the matrix.
|
|
CC : the scores of the current row
|
|
RR : the starting point that leads to score CC
|
|
DD : the scores of the current row, ending with deletion
|
|
SS : the starting point that leads to score DD */
|
|
/* Initialize the 0 th row */
|
|
for (j = 1; j <= N; j++) {
|
|
CC[j] = 0;
|
|
DD[j] = -(q);
|
|
RR[j] = SS[j] = -j;
|
|
}
|
|
for (i = 1; i <= M; i++) {
|
|
if (i == x1) heavy = 1 - heavy;
|
|
if (i == x2) heavy = 1 - heavy;
|
|
ex = r1;
|
|
gx = qr1;
|
|
va = v1[A[i]];
|
|
c = 0; /* Initialize column 0 */
|
|
f = -(q);
|
|
ci = fi = i;
|
|
p = 0;
|
|
pi = i - 1;
|
|
for (j = 1; j <= N; j++) {
|
|
if (j == y1) {
|
|
if (heavy) {
|
|
ex = r;
|
|
gx = qr;
|
|
/*
|
|
S.S.
|
|
va = v[A[i]];
|
|
*/
|
|
va = v[A[i]];
|
|
}
|
|
}
|
|
if (j == y2) {
|
|
if (heavy) {
|
|
ex = r1;
|
|
gx = qr1;
|
|
va = v1[A[i]];
|
|
}
|
|
}
|
|
if ((f = f - ex) < (c = c - gx)) {
|
|
f = c;
|
|
fi = ci;
|
|
}
|
|
di = SS[j];
|
|
if ((d = DD[j] - ex) < (c = CC[j] - gx)) {
|
|
d = c;
|
|
di = RR[j];
|
|
}
|
|
c = p + va[B[j]]; /* diagonal */
|
|
ci = pi;
|
|
if (c < d) {
|
|
c = d;
|
|
ci = di;
|
|
}
|
|
if (c < f) {
|
|
c = f;
|
|
ci = fi;
|
|
}
|
|
p = CC[j];
|
|
CC[j] = c;
|
|
pi = RR[j];
|
|
RR[j] = ci;
|
|
DD[j] = d;
|
|
SS[j] = di;
|
|
if ((j == N || i == M) && c > SCORE) {
|
|
SCORE = c;
|
|
ENDI = i;
|
|
ENDJ = j;
|
|
STARI = ci;
|
|
}
|
|
}
|
|
}
|
|
if (SCORE)
|
|
if (STARI < 0) {
|
|
STARJ = -STARI;
|
|
STARI = 0;
|
|
}
|
|
else
|
|
STARJ = 0;
|
|
}
|
|
|
|
/* diff(A,B,M,N,tb,te) returns the score of an optimum conversion between
|
|
A[1..M] and B[1..N] that begins(ends) with a delete if tb(te) is zero
|
|
and appends such a conversion to the current script. */
|
|
|
|
int diff(A, B, M, N, tb, te)
|
|
char *A, *B;
|
|
int M, N;
|
|
int tb, te;
|
|
|
|
{
|
|
int midi, midj, type; /* Midpoint, type, and cost */
|
|
int midc;
|
|
|
|
{
|
|
register int i, j;
|
|
register int c, e, d, s;
|
|
int t, *va;
|
|
char *ckalloc();
|
|
|
|
/* Boundary cases: M <= 1 or N == 0 */
|
|
|
|
if (N <= 0) {
|
|
if (M > 0) DEL(M)
|
|
return -gap(M);
|
|
}
|
|
if (M <= 1) {
|
|
if (M <= 0) {
|
|
INS(N);
|
|
return -gap(N);
|
|
}
|
|
if (tb > te) tb = te;
|
|
midc = -(tb + r + gap(N));
|
|
midj = 0;
|
|
va = v[A[1]];
|
|
for (j = 1; j <= N; j++) {
|
|
c = va[B[j]] - (gap(j - 1) + gap(N - j));
|
|
if (c > midc) {
|
|
midc = c;
|
|
midj = j;
|
|
}
|
|
}
|
|
if (midj == 0) {
|
|
INS(N) DEL(1)
|
|
}
|
|
else {
|
|
if (midj > 1) INS(midj - 1)
|
|
REP if ((A[1] | 32) == (B[midj] | 32)) no_mat +=
|
|
1;
|
|
else no_mis += 1;
|
|
if (midj < N) INS(N - midj)
|
|
}
|
|
return midc;
|
|
}
|
|
|
|
/* Divide: Find optimum midpoint (midi,midj) of cost midc */
|
|
|
|
midi = M / 2; /* Forward phase: */
|
|
CC[0] = 0; /* Compute C(M/2,k) & D(M/2,k) for all k */
|
|
t = -q;
|
|
for (j = 1; j <= N; j++) {
|
|
CC[j] = t = t - r;
|
|
DD[j] = t - q;
|
|
}
|
|
t = -tb;
|
|
for (i = 1; i <= midi; i++) {
|
|
s = CC[0];
|
|
CC[0] = c = t = t - r;
|
|
e = t - q;
|
|
va = v[A[i]];
|
|
for (j = 1; j <= N; j++) {
|
|
if ((c = c - qr) > (e = e - r)) e = c;
|
|
if ((c = CC[j] - qr) > (d = DD[j] - r)) d = c;
|
|
c = s + va[B[j]];
|
|
if (c < d) c = d;
|
|
if (c < e) c = e;
|
|
s = CC[j];
|
|
CC[j] = c;
|
|
DD[j] = d;
|
|
}
|
|
}
|
|
DD[0] = CC[0];
|
|
|
|
RR[N] = 0; /* Reverse phase: */
|
|
t = -q; /* Compute R(M/2,k) & S(M/2,k) for all k */
|
|
for (j = N - 1; j >= 0; j--) {
|
|
RR[j] = t = t - r;
|
|
SS[j] = t - q;
|
|
}
|
|
t = -te;
|
|
for (i = M - 1; i >= midi; i--) {
|
|
s = RR[N];
|
|
RR[N] = c = t = t - r;
|
|
e = t - q;
|
|
va = v[A[i + 1]];
|
|
for (j = N - 1; j >= 0; j--) {
|
|
if ((c = c - qr) > (e = e - r)) e = c;
|
|
if ((c = RR[j] - qr) > (d = SS[j] - r)) d = c;
|
|
c = s + va[B[j + 1]];
|
|
if (c < d) c = d;
|
|
if (c < e) c = e;
|
|
s = RR[j];
|
|
RR[j] = c;
|
|
SS[j] = d;
|
|
}
|
|
}
|
|
SS[N] = RR[N];
|
|
|
|
midc = CC[0] + RR[0]; /* Find optimal midpoint */
|
|
midj = 0;
|
|
type = 1;
|
|
for (j = 0; j <= N; j++)
|
|
if ((c = CC[j] + RR[j]) >= midc)
|
|
if (c > midc ||
|
|
CC[j] != DD[j] && RR[j] == SS[j]) {
|
|
midc = c;
|
|
midj = j;
|
|
}
|
|
for (j = N; j >= 0; j--)
|
|
if ((c = DD[j] + SS[j] + q) > midc) {
|
|
midc = c;
|
|
midj = j;
|
|
type = 2;
|
|
}
|
|
}
|
|
|
|
/* Conquer: recursively around midpoint */
|
|
|
|
if (type == 1) {
|
|
(void)diff(A, B, midi, midj, tb, q);
|
|
(void)diff(A + midi, B + midj, M - midi, N - midj, q, te);
|
|
}
|
|
else {
|
|
(void)diff(A, B, midi - 1, midj, tb, zero);
|
|
DEL(2);
|
|
(void)diff(A + midi + 1, B + midj, M - midi - 1, N - midj, zero,
|
|
te);
|
|
}
|
|
return midc;
|
|
}
|
|
|
|
/* lib.c - library of C procedures. */
|
|
|
|
/* fatal - print message and die */
|
|
fatal(msg) char *msg;
|
|
{
|
|
(void)fprintf(stderr, "%s\n", msg);
|
|
exit(1);
|
|
}
|
|
|
|
/* fatalf - format message, print it, and die */
|
|
fatalf(msg, val) char *msg, *val;
|
|
{
|
|
(void)fprintf(stderr, msg, val);
|
|
(void)putc('\n', stderr);
|
|
exit(1);
|
|
}
|
|
|
|
/* ckopen - open file; check for success */
|
|
FILE *ckopen(name, mode)
|
|
char *name, *mode;
|
|
{
|
|
FILE *fopen(), *fp;
|
|
|
|
if ((fp = fopen(name, mode)) == NULL) fatalf("Cannot open %s.", name);
|
|
return (fp);
|
|
}
|
|
|
|
/* ckalloc - allocate space; check for success */
|
|
char *ckalloc(amount)
|
|
size_t amount;
|
|
{
|
|
void *malloc(), *p;
|
|
|
|
if ((p = malloc((unsigned)amount)) == NULL) fatal("Ran out of memory.");
|
|
return (p);
|
|
}
|
|
|