gde_linux/CORE/Genbank.c

504 lines
12 KiB
C
Executable File

#include <malloc.h>
#include <stdio.h>
#include <sys/time.h>
#include <time.h>
#include <xview/panel.h>
#include <xview/xview.h>
#include "defines.h"
#include "menudefs.h"
/*
Copyright (c) 1989-1990, University of Illinois board of trustees. All
rights reserved. Written by Steven Smith at the Center for Prokaryote Genome
Analysis. Design and implementation guidance by Dr. Gary Olsen and Dr.
Carl Woese.
Copyright (c) 1990,1991,1992 Steven Smith at the Harvard Genome Laboratory.
all rights reserved.
Copyright (c) 1993, Steven Smith, all rights reserved.
*/
ReadGen(filename, dataset, type) char *filename;
NA_Alignment *dataset;
int type;
{
register int done = FALSE, len = 0, j = 0;
int count, IS_REALLY_AA = FALSE;
char Inline[GBUFSIZ], c;
char *buffer, *gencomments = NULL, fields[8][GBUFSIZ];
int buflen = 0, genclen = 0, curelem = 0, n = 0, flag = 0;
int start_col = -1;
NA_Sequence *this_elem;
FILE *file;
extern int Default_DNA_Trans[], Default_RNA_Trans[];
extern int Default_NA_RTrans[];
extern int Default_PROColor_LKUP[], Default_NAColor_LKUP[];
ErrorOut("No such file", file = fopen(filename, "r"));
for (; fgets(Inline, GBUFSIZ, file) != 0;) {
if (Inline[strlen(Inline) - 1] == '\n')
Inline[strlen(Inline) - 1] = '\0';
if (Find(Inline, "LOCUS")) {
curelem = dataset->numelements++;
if (curelem == 0) {
dataset->element = (NA_Sequence *)Calloc(
5, sizeof(NA_Sequence));
dataset->maxnumelements = 5;
}
else if (curelem == dataset->maxnumelements) {
(dataset->maxnumelements) *= 2;
dataset->element = (NA_Sequence *)Realloc(
dataset->element, dataset->maxnumelements *
sizeof(NA_Sequence));
}
this_elem = &(dataset->element[curelem]);
n = sscanf(Inline, "%s %s %s %s %s %s %s %s", fields[0],
fields[1], fields[2], fields[3], fields[4],
fields[5], fields[6], fields[7]);
if (IS_REALLY_AA) {
InitNASeq(this_elem, PROTEIN);
}
else if (Find(Inline, "DNA")) {
InitNASeq(this_elem, DNA);
}
else if (Find(Inline, "RNA")) {
InitNASeq(this_elem, RNA);
}
else if (Find(Inline, "MASK")) {
InitNASeq(this_elem, MASK);
}
else if (Find(Inline, "TEXT")) {
InitNASeq(this_elem, TEXT);
}
else if (Find(Inline, "PROT")) {
InitNASeq(this_elem, PROTEIN);
}
else
InitNASeq(this_elem, DNA);
strncpy(this_elem->short_name, fields[1], 31);
AsciiTime(&(this_elem->t_stamp.origin), fields[n - 1]);
this_elem->attr = DEFAULT_X_ATTR;
if (Find(Inline, "Circular"))
this_elem->attr |= IS_CIRCULAR;
gencomments = NULL;
genclen = 0;
}
else if (Find(Inline, "DEFINITION"))
strncpy(this_elem->description, &(Inline[12]), 79);
else if (Find(Inline, "AUTHOR"))
strncpy(this_elem->authority, &(Inline[12]), 79);
else if (Find(Inline, " ORGANISM"))
strncpy(this_elem->seq_name, &(Inline[12]), 79);
else if (Find(Inline, "ACCESSION"))
strncpy(this_elem->id, &(Inline[12]), 79);
else if (Find(Inline, "ORIGIN")) {
done = FALSE;
len = 0;
for (; done == FALSE &&
fgets(Inline, GBUFSIZ, file) != 0;) {
if (Inline[0] != '/') {
if (buflen == 0) {
buflen = GBUFSIZ;
buffer = Calloc(sizeof(char),
buflen);
}
else if (len + strlen(Inline) >=
buflen) {
buflen += GBUFSIZ;
buffer = Realloc(
buffer,
sizeof(char) * buflen);
for (j = buflen - GBUFSIZ;
j < buflen; j++)
buffer[j] = '\0';
}
/*
* Search for the fist column of
*data
*(whitespace-number-whitespace)data
*/
if (start_col == -1) {
for (start_col = 0;
Inline[start_col] == ' ' ||
Inline[start_col] == '\t';
start_col++)
;
for (start_col++;
strchr(
"1234567890",
Inline[start_col]) !=
NULL;
start_col++)
;
for (start_col++;
Inline[start_col] == ' ' ||
Inline[start_col] == '\t';
start_col++)
;
}
for (j = start_col;
(c = Inline[j]) != '\0'; j++) {
if ((c != '\n') &&
((j - start_col + 1) % 11 !=
0))
buffer[len++] = c;
}
}
else {
AppendNA(buffer, len,
&(dataset->element[curelem]));
for (j = 0; j < len; j++)
buffer[j] = '\0';
len = 0;
done = TRUE;
dataset->element[curelem].comments =
gencomments;
dataset->element[curelem].comments_len =
genclen - 1;
dataset->element[curelem]
.comments_maxlen = genclen;
gencomments = NULL;
genclen = 0;
}
}
/*
* Test if sequence should be converted by
*the translation table If it looks like a protein...
*/
if (dataset->element[curelem].rmatrix &&
IS_REALLY_AA == FALSE) {
IS_REALLY_AA = CheckType(
dataset->element[curelem].sequence,
dataset->element[curelem].seqlen);
if (IS_REALLY_AA == FALSE)
Ascii2NA(
dataset->element[curelem].sequence,
dataset->element[curelem].seqlen,
dataset->element[curelem].rmatrix);
else
/*
* Force the sequence to be AA
*/
{
dataset->element[curelem].elementtype =
PROTEIN;
dataset->element[curelem].rmatrix =
NULL;
dataset->element[curelem].tmatrix =
NULL;
dataset->element[curelem].col_lut =
Default_PROColor_LKUP;
}
}
}
else if (Find(Inline, "ZZZZZ")) {
Cfree(gencomments);
genclen = 0;
}
else {
if (gencomments == NULL) {
gencomments = String(Inline);
genclen = strlen(gencomments) + 1;
}
else {
genclen += strlen(Inline) + 1;
gencomments = Realloc(gencomments,
genclen * sizeof(char));
strncat(gencomments, Inline, GBUFSIZ);
strncat(gencomments, "\n", GBUFSIZ);
}
}
}
Cfree(buffer);
fclose(file);
for (j = 0; j < dataset->numelements; j++)
dataset->maxlen =
MAX(dataset->maxlen, dataset->element[j].seqlen +
dataset->element[j].offset);
return;
}
typedef struct mya {
int yy;
int mm;
int dd;
int hr;
int mn;
int sc;
} sA;
AsciiTime(sA *a, char *asciitime)
{
int j;
char temp[GBUFSIZ];
extern char month[12][6];
a->dd = 0;
a->yy = 0;
a->mm = 0;
sscanf(asciitime, "%d%5c%d", &(a->dd), temp, &(a->yy));
temp[5] = '\0';
for (j = 0; j < 12; j++)
if (strcmp(temp, month[j]) == 0) a->mm = j + 1;
if (a->dd < 0 || a->dd > 31 || a->yy < 0 || a->mm > 11) SetTime(a);
return;
}
WriteGen(aln, filename, method, maskable) NA_Alignment *aln;
char *filename;
int method, maskable;
{
int i, j, k, mask = -1;
FILE *file;
NA_Sequence *this_elem;
extern char month[12][6];
char c;
if (aln == NULL) return;
if (aln->na_ddata == NULL) return;
file = fopen(filename, "w");
if (file == NULL) {
Warning("Cannot open file for output");
return (1);
}
if (maskable && method != SELECT_REGION)
for (j = 0; j < aln->numelements; j++)
if (aln->element[j].elementtype == MASK &&
aln->element[j].selected)
mask = j;
for (j = 0; j < aln->numelements; j++) {
if ((aln->element[j].selected && j != mask &&
method != SELECT_REGION) ||
(aln->element[j].subselected && method == SELECT_REGION) ||
(method == ALL)) {
this_elem = &(aln->element[j]);
fprintf(
file,
"LOCUS %10s%8d bp %4s %10s %2d%5s%4d\n",
this_elem->short_name,
this_elem->seqlen + this_elem->offset,
(this_elem->elementtype == DNA) ? "DNA"
: (this_elem->elementtype == RNA) ? "RNA"
: (this_elem->elementtype == MASK) ? "MASK"
: (this_elem->elementtype == PROTEIN) ? "PROT"
: "TEXT",
this_elem->attr & IS_CIRCULAR ? "Circular" : "",
this_elem->t_stamp.origin.dd,
month[this_elem->t_stamp.origin.mm - 1],
this_elem->t_stamp.origin.yy > 1900
? this_elem->t_stamp.origin.yy
: this_elem->t_stamp.origin.yy + 1900);
if (this_elem->description[0])
fprintf(file, "DEFINITION %s\n",
this_elem->description);
if (this_elem->seq_name[0])
fprintf(file, " ORGANISM %s\n",
this_elem->seq_name);
if (this_elem->id[0])
fprintf(file, " ACCESSION %s\n",
this_elem->id);
if (this_elem->authority[0])
fprintf(file, " AUTHORS %s\n",
this_elem->authority);
if (this_elem->comments)
fprintf(file, "%s\n", this_elem->comments);
fprintf(file, "ORIGIN");
if (this_elem->tmatrix) {
if (mask == -1) {
for (i = 0, k = 0;
k < this_elem->seqlen +
this_elem->offset;
k++) {
if (method == SELECT_REGION) {
if (aln->selection_mask
[k] == '1') {
if (i % 60 == 0)
fprintf(
file,
"\n"
"%9"
"d",
i + 1);
if (i % 10 == 0)
fprintf(
file,
" ");
fprintf(
file, "%c",
this_elem->tmatrix
[getelem(
this_elem,
k)]);
i++;
}
}
else {
if (i % 60 == 0)
fprintf(file,
"\n%9d",
i + 1);
if (i % 10 == 0)
fprintf(file,
" ");
fprintf(
file, "%c",
this_elem->tmatrix
[getelem(
this_elem,
k)]);
i++;
}
}
}
else {
for (k = 0; k < this_elem->seqlen +
this_elem->offset;
k++) {
c = (char)getelem(
&(aln->element[mask]), k);
if (c != '0' && c != '-') {
if (k % 60 == 0)
fprintf(file,
"\n%9d",
k + 1);
if (k % 10 == 0)
fprintf(file,
" ");
fprintf(
file, "%c",
this_elem->tmatrix
[getelem(
this_elem,
k)]);
}
}
}
}
else {
if (mask == -1) {
for (i = 0, k = 0;
k < this_elem->seqlen +
this_elem->offset;
k++) {
if (method == SELECT_REGION) {
if (aln->selection_mask
[k] == '1') {
if (i % 60 == 0)
fprintf(
file,
"\n"
"%9"
"d",
i + 1);
if (i % 10 == 0)
fprintf(
file,
" ");
fprintf(
file, "%c",
getelem(
this_elem,
k));
i++;
}
}
else {
if (i % 60 == 0)
fprintf(file,
"\n%9d",
i + 1);
if (i % 10 == 0)
fprintf(file,
" ");
fprintf(
file, "%c",
getelem(this_elem,
k));
i++;
}
}
}
else {
for (k = 0; k < this_elem->seqlen +
this_elem->offset;
k++) {
c = (char)getelem(
&(aln->element[mask]), k);
if (c != '0' && c != '-') {
if (k % 60 == 0)
fprintf(file,
"\n%9d",
k + 1);
if (k % 10 == 0)
fprintf(file,
" ");
fprintf(
file, "%c",
getelem(this_elem,
k));
}
}
}
}
fprintf(file, "\n//\n");
}
}
fclose(file);
return;
}
SetTime(sA *a)
{
struct tm *tim, *localtime();
long clock;
clock = time(0);
tim = localtime(&clock);
a->yy = tim->tm_year;
a->mm = tim->tm_mon + 1;
a->dd = tim->tm_mday;
a->hr = tim->tm_hour;
a->mn = tim->tm_min;
a->sc = tim->tm_sec;
return;
}
/*
* CheckType: Check base composition to see if the sequence
* appears to be an amino acid sequence. If it is, pass back
* TRUE, else FALSE.
*/
CheckType(seq, len) char *seq;
int len;
{
int j, count1 = 0, count2 = 0;
for (j = 0; j < len; j++)
if (((seq[j] | 32) < 'z') && ((seq[j] | 32) > 'a')) {
count1++;
if (index("ACGTUNacgtun", seq[j]) == NULL) count2++;
}
return ((count2 > count1 / 4) ? TRUE : FALSE);
}