gde_linux/CORE/Genbank.c

504 lines
12 KiB
C
Raw Normal View History

2022-03-08 04:43:05 +08:00
#include <malloc.h>
#include <stdio.h>
#include <string.h>
#include <sys/time.h>
#include <time.h>
#include <xview/panel.h>
#include <xview/xview.h>
#include "defines.h"
#include "menudefs.h"
2022-03-08 04:43:05 +08:00
/*
Copyright (c) 1989-1990, University of Illinois board of trustees. All
rights reserved. Written by Steven Smith at the Center for Prokaryote Genome
Analysis. Design and implementation guidance by Dr. Gary Olsen and Dr.
Carl Woese.
Copyright (c) 1990,1991,1992 Steven Smith at the Harvard Genome Laboratory.
all rights reserved.
Copyright (c) 1993, Steven Smith, all rights reserved.
*/
2023-04-09 02:17:32 +08:00
ReadGen(filename, dataset, type) char *filename;
2022-03-08 04:43:05 +08:00
NA_Alignment *dataset;
int type;
{
2023-04-09 02:17:32 +08:00
register int done = FALSE, len = 0, j = 0;
int count, IS_REALLY_AA = FALSE;
char Inline[GBUFSIZ], c;
char *buffer, *gencomments = NULL, fields[8][GBUFSIZ];
int buflen = 0, genclen = 0, curelem = 0, n = 0, flag = 0;
2022-03-08 04:43:05 +08:00
int start_col = -1;
NA_Sequence *this_elem;
FILE *file;
extern int Default_DNA_Trans[], Default_RNA_Trans[];
extern int Default_NA_RTrans[];
2023-04-09 02:17:32 +08:00
extern int Default_PROColor_LKUP[], Default_NAColor_LKUP[];
2022-03-08 04:43:05 +08:00
2023-04-09 02:17:32 +08:00
ErrorOut("No such file", file = fopen(filename, "r"));
2022-03-08 04:43:05 +08:00
2023-04-09 02:17:32 +08:00
for (; fgets(Inline, GBUFSIZ, file) != 0;) {
if (Inline[strlen(Inline) - 1] == '\n')
Inline[strlen(Inline) - 1] = '\0';
if (Find(Inline, "LOCUS")) {
2022-03-08 04:43:05 +08:00
curelem = dataset->numelements++;
2023-04-09 02:17:32 +08:00
if (curelem == 0) {
dataset->element = (NA_Sequence *)Calloc(
5, sizeof(NA_Sequence));
2022-03-08 04:43:05 +08:00
dataset->maxnumelements = 5;
}
2023-04-09 02:17:32 +08:00
else if (curelem == dataset->maxnumelements) {
2022-03-08 04:43:05 +08:00
(dataset->maxnumelements) *= 2;
2023-04-09 02:17:32 +08:00
dataset->element = (NA_Sequence *)Realloc(
dataset->element, dataset->maxnumelements *
sizeof(NA_Sequence));
2022-03-08 04:43:05 +08:00
}
this_elem = &(dataset->element[curelem]);
2023-04-09 02:17:32 +08:00
n = sscanf(Inline, "%s %s %s %s %s %s %s %s", fields[0],
fields[1], fields[2], fields[3], fields[4],
fields[5], fields[6], fields[7]);
if (IS_REALLY_AA) {
InitNASeq(this_elem, PROTEIN);
2022-03-08 04:43:05 +08:00
}
2023-04-09 02:17:32 +08:00
else if (Find(Inline, "DNA")) {
InitNASeq(this_elem, DNA);
2022-03-08 04:43:05 +08:00
}
2023-04-09 02:17:32 +08:00
else if (Find(Inline, "RNA")) {
InitNASeq(this_elem, RNA);
2022-03-08 04:43:05 +08:00
}
2023-04-09 02:17:32 +08:00
else if (Find(Inline, "MASK")) {
InitNASeq(this_elem, MASK);
2022-03-08 04:43:05 +08:00
}
2023-04-09 02:17:32 +08:00
else if (Find(Inline, "TEXT")) {
InitNASeq(this_elem, TEXT);
2022-03-08 04:43:05 +08:00
}
2023-04-09 02:17:32 +08:00
else if (Find(Inline, "PROT")) {
InitNASeq(this_elem, PROTEIN);
2022-03-08 04:43:05 +08:00
}
else
2023-04-09 02:17:32 +08:00
InitNASeq(this_elem, DNA);
2022-03-08 04:43:05 +08:00
2023-04-09 02:17:32 +08:00
strncpy(this_elem->short_name, fields[1], 31);
AsciiTime(&(this_elem->t_stamp.origin), fields[n - 1]);
2022-03-08 04:43:05 +08:00
this_elem->attr = DEFAULT_X_ATTR;
2023-04-09 02:17:32 +08:00
if (Find(Inline, "Circular"))
2022-03-08 04:43:05 +08:00
this_elem->attr |= IS_CIRCULAR;
gencomments = NULL;
genclen = 0;
}
2023-04-09 02:17:32 +08:00
else if (Find(Inline, "DEFINITION"))
strncpy(this_elem->description, &(Inline[12]), 79);
2022-03-08 04:43:05 +08:00
2023-04-09 02:17:32 +08:00
else if (Find(Inline, "AUTHOR"))
strncpy(this_elem->authority, &(Inline[12]), 79);
2022-03-08 04:43:05 +08:00
2023-04-09 02:17:32 +08:00
else if (Find(Inline, " ORGANISM"))
strncpy(this_elem->seq_name, &(Inline[12]), 79);
2022-03-08 04:43:05 +08:00
2023-04-09 02:17:32 +08:00
else if (Find(Inline, "ACCESSION"))
strncpy(this_elem->id, &(Inline[12]), 79);
2022-03-08 04:43:05 +08:00
2023-04-09 02:17:32 +08:00
else if (Find(Inline, "ORIGIN")) {
2022-03-08 04:43:05 +08:00
done = FALSE;
len = 0;
2023-04-09 02:17:32 +08:00
for (; done == FALSE &&
fgets(Inline, GBUFSIZ, file) != 0;) {
if (Inline[0] != '/') {
if (buflen == 0) {
2022-03-08 04:43:05 +08:00
buflen = GBUFSIZ;
2023-04-09 02:17:32 +08:00
buffer = Calloc(sizeof(char),
buflen);
2022-03-08 04:43:05 +08:00
}
2023-04-09 02:17:32 +08:00
else if (len + strlen(Inline) >=
buflen) {
2022-03-08 04:43:05 +08:00
buflen += GBUFSIZ;
2023-04-09 02:17:32 +08:00
buffer = Realloc(
buffer,
sizeof(char) * buflen);
for (j = buflen - GBUFSIZ;
j < buflen; j++)
2022-03-08 04:43:05 +08:00
buffer[j] = '\0';
}
2023-04-09 02:17:32 +08:00
/*
* Search for the fist column of
*data
*(whitespace-number-whitespace)data
*/
if (start_col == -1) {
for (start_col = 0;
Inline[start_col] == ' ' ||
Inline[start_col] == '\t';
start_col++)
;
for (start_col++;
strchr(
"1234567890",
Inline[start_col]) !=
NULL;
start_col++)
;
for (start_col++;
Inline[start_col] == ' ' ||
Inline[start_col] == '\t';
start_col++)
;
2022-03-08 04:43:05 +08:00
}
2023-04-09 02:17:32 +08:00
for (j = start_col;
(c = Inline[j]) != '\0'; j++) {
if ((c != '\n') &&
((j - start_col + 1) % 11 !=
0))
2022-03-08 04:43:05 +08:00
buffer[len++] = c;
}
}
2023-04-09 02:17:32 +08:00
else {
AppendNA(buffer, len,
&(dataset->element[curelem]));
for (j = 0; j < len; j++)
2022-03-08 04:43:05 +08:00
buffer[j] = '\0';
len = 0;
done = TRUE;
2023-04-09 02:17:32 +08:00
dataset->element[curelem].comments =
gencomments;
dataset->element[curelem].comments_len =
2022-03-08 04:43:05 +08:00
genclen - 1;
2023-04-09 02:17:32 +08:00
dataset->element[curelem]
.comments_maxlen = genclen;
2022-03-08 04:43:05 +08:00
gencomments = NULL;
genclen = 0;
}
}
2023-04-09 02:17:32 +08:00
/*
* Test if sequence should be converted by
*the translation table If it looks like a protein...
*/
if (dataset->element[curelem].rmatrix &&
IS_REALLY_AA == FALSE) {
IS_REALLY_AA = CheckType(
dataset->element[curelem].sequence,
dataset->element[curelem].seqlen);
if (IS_REALLY_AA == FALSE)
Ascii2NA(
dataset->element[curelem].sequence,
2022-03-08 04:43:05 +08:00
dataset->element[curelem].seqlen,
dataset->element[curelem].rmatrix);
else
2023-04-09 02:17:32 +08:00
/*
* Force the sequence to be AA
*/
2022-03-08 04:43:05 +08:00
{
2023-04-09 02:17:32 +08:00
dataset->element[curelem].elementtype =
PROTEIN;
dataset->element[curelem].rmatrix =
NULL;
dataset->element[curelem].tmatrix =
NULL;
2022-03-08 04:43:05 +08:00
dataset->element[curelem].col_lut =
Default_PROColor_LKUP;
}
}
}
2023-04-09 02:17:32 +08:00
else if (Find(Inline, "ZZZZZ")) {
2022-03-08 04:43:05 +08:00
Cfree(gencomments);
genclen = 0;
}
2023-04-09 02:17:32 +08:00
else {
if (gencomments == NULL) {
2022-03-08 04:43:05 +08:00
gencomments = String(Inline);
2023-04-09 02:17:32 +08:00
genclen = strlen(gencomments) + 1;
2022-03-08 04:43:05 +08:00
}
2023-04-09 02:17:32 +08:00
else {
genclen += strlen(Inline) + 1;
gencomments = Realloc(gencomments,
genclen * sizeof(char));
strncat(gencomments, Inline, GBUFSIZ);
strncat(gencomments, "\n", GBUFSIZ);
2022-03-08 04:43:05 +08:00
}
}
}
Cfree(buffer);
fclose(file);
2023-04-09 02:17:32 +08:00
for (j = 0; j < dataset->numelements; j++)
dataset->maxlen =
MAX(dataset->maxlen, dataset->element[j].seqlen +
dataset->element[j].offset);
2022-03-08 04:43:05 +08:00
return;
}
/*
 * Calendar timestamp split into integer fields.  SetTime() fills it from
 * a struct tm (yy = tm_year, mm = tm_mon + 1) and AsciiTime() fills the
 * date part from a text date.
 */
typedef struct mya {
	int yy, mm, dd; /* year, month, day of month */
	int hr, mn, sc; /* hour, minute, second */
} sA;
2023-04-09 02:17:32 +08:00
/*
 * AsciiTime: parse a text date into *a.
 *
 * The text is read as <integer><five characters><integer>: day, month
 * token, year.  The month token is matched in full against the entries
 * of the global month[] table; entry j yields month number j + 1.
 *
 *   a         - timestamp to fill; dd, mm and yy are always overwritten.
 *   asciitime - date text; NULL is treated as unparsable.
 *
 * If the day is outside 0..31, the year is negative, or no month entry
 * matched, *a is replaced with the current time via SetTime().  The
 * previous test (mm > 11) discarded valid December dates and accepted an
 * unmatched month (mm == 0).
 *
 * Always returns 0 (the original returned no defined value).
 */
int AsciiTime(sA *a, char *asciitime)
{
	int j;
	char temp[6] = {0}; /* 5-character month token + terminator */
	extern char month[12][6];

	a->dd = 0;
	a->yy = 0;
	a->mm = 0;

	if (asciitime != NULL)
		sscanf(asciitime, "%d%5c%d", &(a->dd), temp, &(a->yy));
	temp[5] = '\0';

	for (j = 0; j < 12; j++)
		if (strcmp(temp, month[j]) == 0)
			a->mm = j + 1;

	if (a->dd < 0 || a->dd > 31 || a->yy < 0 || a->mm < 1 || a->mm > 12)
		SetTime(a);
	return 0;
}
2023-04-09 02:17:32 +08:00
WriteGen(aln, filename, method, maskable) NA_Alignment *aln;
2022-03-08 04:43:05 +08:00
char *filename;
2023-04-09 02:17:32 +08:00
int method, maskable;
2022-03-08 04:43:05 +08:00
{
2023-04-09 02:17:32 +08:00
int i, j, k, mask = -1;
2022-03-08 04:43:05 +08:00
FILE *file;
NA_Sequence *this_elem;
extern char month[12][6];
char c;
2023-04-09 02:17:32 +08:00
if (aln == NULL) return;
if (aln->na_ddata == NULL) return;
file = fopen(filename, "w");
if (file == NULL) {
2022-03-08 04:43:05 +08:00
Warning("Cannot open file for output");
2023-04-09 02:17:32 +08:00
return (1);
2022-03-08 04:43:05 +08:00
}
2023-04-09 02:17:32 +08:00
if (maskable && method != SELECT_REGION)
for (j = 0; j < aln->numelements; j++)
if (aln->element[j].elementtype == MASK &&
2022-03-08 04:43:05 +08:00
aln->element[j].selected)
mask = j;
2023-04-09 02:17:32 +08:00
for (j = 0; j < aln->numelements; j++) {
if ((aln->element[j].selected && j != mask &&
method != SELECT_REGION) ||
(aln->element[j].subselected && method == SELECT_REGION) ||
(method == ALL)) {
2022-03-08 04:43:05 +08:00
this_elem = &(aln->element[j]);
2023-04-09 02:17:32 +08:00
fprintf(
file,
2022-03-08 04:43:05 +08:00
"LOCUS %10s%8d bp %4s %10s %2d%5s%4d\n",
2023-04-09 02:17:32 +08:00
this_elem->short_name,
this_elem->seqlen + this_elem->offset,
(this_elem->elementtype == DNA) ? "DNA"
: (this_elem->elementtype == RNA) ? "RNA"
: (this_elem->elementtype == MASK) ? "MASK"
: (this_elem->elementtype == PROTEIN) ? "PROT"
: "TEXT",
this_elem->attr & IS_CIRCULAR ? "Circular" : "",
2022-03-08 04:43:05 +08:00
this_elem->t_stamp.origin.dd,
2023-04-09 02:17:32 +08:00
month[this_elem->t_stamp.origin.mm - 1],
this_elem->t_stamp.origin.yy > 1900
? this_elem->t_stamp.origin.yy
: this_elem->t_stamp.origin.yy + 1900);
if (this_elem->description[0])
fprintf(file, "DEFINITION %s\n",
this_elem->description);
if (this_elem->seq_name[0])
fprintf(file, " ORGANISM %s\n",
this_elem->seq_name);
if (this_elem->id[0])
fprintf(file, " ACCESSION %s\n",
this_elem->id);
if (this_elem->authority[0])
fprintf(file, " AUTHORS %s\n",
this_elem->authority);
if (this_elem->comments)
fprintf(file, "%s\n", this_elem->comments);
fprintf(file, "ORIGIN");
if (this_elem->tmatrix) {
if (mask == -1) {
for (i = 0, k = 0;
k < this_elem->seqlen +
this_elem->offset;
k++) {
if (method == SELECT_REGION) {
if (aln->selection_mask
[k] == '1') {
if (i % 60 == 0)
fprintf(
file,
"\n"
"%9"
"d",
i + 1);
if (i % 10 == 0)
fprintf(
file,
" ");
fprintf(
file, "%c",
this_elem->tmatrix
[getelem(
this_elem,
k)]);
2022-03-08 04:43:05 +08:00
i++;
}
}
2023-04-09 02:17:32 +08:00
else {
if (i % 60 == 0)
fprintf(file,
"\n%9d",
i + 1);
if (i % 10 == 0)
fprintf(file,
" ");
fprintf(
file, "%c",
this_elem->tmatrix
[getelem(
this_elem,
k)]);
2022-03-08 04:43:05 +08:00
i++;
}
}
}
2023-04-09 02:17:32 +08:00
else {
for (k = 0; k < this_elem->seqlen +
this_elem->offset;
k++) {
c = (char)getelem(
&(aln->element[mask]), k);
if (c != '0' && c != '-') {
if (k % 60 == 0)
fprintf(file,
"\n%9d",
k + 1);
if (k % 10 == 0)
fprintf(file,
" ");
fprintf(
file, "%c",
this_elem->tmatrix
[getelem(
this_elem,
k)]);
2022-03-08 04:43:05 +08:00
}
}
}
}
2023-04-09 02:17:32 +08:00
else {
if (mask == -1) {
for (i = 0, k = 0;
k < this_elem->seqlen +
this_elem->offset;
k++) {
if (method == SELECT_REGION) {
if (aln->selection_mask
[k] == '1') {
if (i % 60 == 0)
fprintf(
file,
"\n"
"%9"
"d",
i + 1);
if (i % 10 == 0)
fprintf(
file,
" ");
fprintf(
file, "%c",
getelem(
this_elem,
k));
2022-03-08 04:43:05 +08:00
i++;
}
}
2023-04-09 02:17:32 +08:00
else {
if (i % 60 == 0)
fprintf(file,
"\n%9d",
i + 1);
if (i % 10 == 0)
fprintf(file,
" ");
fprintf(
file, "%c",
getelem(this_elem,
k));
2022-03-08 04:43:05 +08:00
i++;
}
}
}
2023-04-09 02:17:32 +08:00
else {
for (k = 0; k < this_elem->seqlen +
this_elem->offset;
k++) {
c = (char)getelem(
&(aln->element[mask]), k);
if (c != '0' && c != '-') {
if (k % 60 == 0)
fprintf(file,
"\n%9d",
k + 1);
if (k % 10 == 0)
fprintf(file,
" ");
fprintf(
file, "%c",
getelem(this_elem,
k));
2022-03-08 04:43:05 +08:00
}
}
}
}
2023-04-09 02:17:32 +08:00
fprintf(file, "\n//\n");
2022-03-08 04:43:05 +08:00
}
}
fclose(file);
return;
}
/*
 * SetTime: fill *a with the current local date and time.
 *
 * yy receives tm_year as-is (years since 1900) and mm is 1-based
 * (tm_mon + 1); dd, hr, mn and sc are copied directly.
 *
 * The clock value is held in a time_t, the type localtime() requires
 * (the original passed the address of a long).  If localtime() fails,
 * *a is left unchanged.
 *
 * Always returns 0 (the original returned no defined value).
 */
int SetTime(sA *a)
{
	time_t now = time(NULL);
	struct tm *tim = localtime(&now);

	if (tim == NULL)
		return 0;

	a->yy = tim->tm_year;
	a->mm = tim->tm_mon + 1;
	a->dd = tim->tm_mday;
	a->hr = tim->tm_hour;
	a->mn = tim->tm_min;
	a->sc = tim->tm_sec;
	return 0;
}
/*
 * CheckType: check letter composition to see if the sequence appears to
 * be an amino acid sequence.
 *
 *   seq - sequence characters; only the first `len` are examined, so no
 *         terminator is required.
 *   len - number of characters to examine.
 *
 * Every ASCII letter A-Z / a-z is counted; letters other than
 * A, C, G, T, U and N (either case) count as non-nucleotide.  Returns 1
 * when the non-nucleotide letters exceed a quarter of all letters
 * (integer division), otherwise 0.
 *
 * The original bounds (> 'a', < 'z') left A and Z out of the letter
 * total, which inflated the non-nucleotide share for A-rich sequences.
 *
 * NOTE(review): returns the comparison result directly; assumes the
 * project's TRUE / FALSE are 1 / 0 - confirm against defines.h.
 */
int CheckType(char *seq, int len)
{
	int j, letters = 0, non_nucleotide = 0;

	for (j = 0; j < len; j++) {
		int lower = seq[j] | 32; /* fold ASCII upper case to lower */

		if (lower >= 'a' && lower <= 'z') {
			letters++;
			if (strchr("ACGTUNacgtun", seq[j]) == NULL)
				non_nucleotide++;
		}
	}
	return non_nucleotide > letters / 4;
}