324 lines
8 KiB
C
324 lines
8 KiB
C
/* ureadasn.c
|
|
-- parse, mangle and otherwise rewrite ASN1 file/entries for readseq reading
|
|
-- from NCBI toolkit (ncbi.nlm.nih.gov:/toolkit)
|
|
*/
|
|
|
|
#ifdef NCBI
|
|
|
|
#include <ctype.h>
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
|
|
/* NCBI toolkit :include: must be on lib path */
|
|
#include <ncbi.h>
|
|
#include <seqport.h>
|
|
|
|
#define UREADASN
|
|
#include "ureadseq.h"
|
|
|
|
#pragma segment ureadasn
|
|
|
|
/* this stuff is hacked up from tofasta.c of ncbitools */
|
|
#define kBaseAny 0
|
|
#define kBaseNucleic 1
|
|
#define kBaseAmino 2
|
|
|
|
typedef struct tofasta {
|
|
Boolean idonly;
|
|
short *seqnum;
|
|
short whichSeq;
|
|
char **seq, **seqid;
|
|
long *seqlen;
|
|
} FastaDat, PNTR FastaPtr;
|
|
|
|
void BioseqRawToRaw(BioseqPtr bsp, Boolean idonly, short whichSeq,
|
|
short *seqnum, char **seq, char **seqid, long *seqlen)
|
|
{
|
|
SeqPortPtr spp;
|
|
SeqIdPtr bestid;
|
|
Uint1 repr, code, residue;
|
|
CharPtr tmp, title;
|
|
long outlen, outmax;
|
|
char localid[256], *sp;
|
|
|
|
/* !!! this may be called several times for a single sequence
|
|
because SeqEntryExplore looks for parts and joins them...
|
|
assume seq, seqid, seqlen may contain data (or NULL)
|
|
*/
|
|
if (bsp == NULL) return;
|
|
repr = Bioseq_repr(bsp);
|
|
if (!(repr == Seq_repr_raw || repr == Seq_repr_const)) return;
|
|
|
|
(*seqnum)++;
|
|
if (!(whichSeq == *seqnum || whichSeq == 0)) return;
|
|
|
|
bestid = SeqIdFindBest(bsp->id, (Uint1)0);
|
|
title = BioseqGetTitle(bsp);
|
|
if (idonly) {
|
|
sprintf(localid, " %d) ", *seqnum);
|
|
tmp = localid + strlen(localid) - 1;
|
|
}
|
|
else {
|
|
strcpy(localid, " ");
|
|
tmp = localid;
|
|
}
|
|
tmp = SeqIdPrint(bestid, tmp, PRINTID_FASTA_SHORT);
|
|
tmp = StringMove(tmp, " ");
|
|
StringNCpy(tmp, title, 200);
|
|
/* fprintf(stderr,"BioseqRawToRaw: localid='%s'\n",localid); */
|
|
|
|
/* < seqid is fixed storage */
|
|
/* strcpy( *seqid, localid); */
|
|
/* < seqid is variable sized */
|
|
outmax = strlen(localid) + 3;
|
|
if (*seqid == NULL) {
|
|
*seqid = (char *)malloc(outmax);
|
|
if (*seqid == NULL) return;
|
|
strcpy(*seqid, localid);
|
|
}
|
|
else {
|
|
outmax += strlen(*seqid) + 2;
|
|
*seqid = (char *)realloc(*seqid, outmax);
|
|
if (*seqid == NULL) return;
|
|
if (!idonly) strcat(*seqid, "; ");
|
|
strcat(*seqid, localid);
|
|
}
|
|
|
|
if (idonly) {
|
|
strcat(*seqid, "\n");
|
|
return;
|
|
}
|
|
|
|
if (ISA_na(bsp->mol))
|
|
code = Seq_code_iupacna;
|
|
else
|
|
code = Seq_code_iupacaa;
|
|
spp = SeqPortNew(bsp, 0, -1, 0, code);
|
|
SeqPortSeek(spp, 0, SEEK_SET);
|
|
|
|
sp = *seq;
|
|
if (sp == NULL) { /* this is always true now !? */
|
|
outlen = 0;
|
|
outmax = 500;
|
|
sp = (char *)malloc(outmax);
|
|
}
|
|
else {
|
|
outlen = strlen(sp);
|
|
outmax = outlen + 500;
|
|
sp = (char *)realloc(sp, outmax);
|
|
}
|
|
if (sp == NULL) return;
|
|
|
|
while ((residue = SeqPortGetResidue(spp)) != SEQPORT_EOF) {
|
|
if (outlen >= outmax) {
|
|
outmax = outlen + 500;
|
|
sp = (char *)realloc(sp, outmax);
|
|
if (sp == NULL) return;
|
|
}
|
|
sp[outlen++] = residue;
|
|
}
|
|
sp = (char *)realloc(sp, outlen + 1);
|
|
if (sp != NULL) sp[outlen] = '\0';
|
|
*seq = sp;
|
|
*seqlen = outlen;
|
|
SeqPortFree(spp);
|
|
return;
|
|
}
|
|
|
|
static void SeqEntryRawseq(SeqEntryPtr sep, Pointer data, Int4 index,
|
|
Int2 indent)
|
|
{
|
|
FastaPtr tfa;
|
|
BioseqPtr bsp;
|
|
|
|
if (!IS_Bioseq(sep)) return;
|
|
bsp = (BioseqPtr)sep->data.ptrvalue;
|
|
tfa = (FastaPtr)data;
|
|
BioseqRawToRaw(bsp, tfa->idonly, tfa->whichSeq, tfa->seqnum, tfa->seq,
|
|
tfa->seqid, tfa->seqlen);
|
|
}
|
|
|
|
void SeqEntryToRaw(SeqEntryPtr sep, Boolean idonly, short whichSeq,
|
|
short *seqnum, char **seq, char **seqid, long *seqlen)
|
|
{
|
|
FastaDat tfa;
|
|
|
|
if (sep == NULL) return;
|
|
tfa.idonly = idonly;
|
|
tfa.seqnum = seqnum;
|
|
tfa.whichSeq = whichSeq;
|
|
tfa.seq = seq;
|
|
tfa.seqid = seqid;
|
|
tfa.seqlen = seqlen;
|
|
SeqEntryExplore(sep, (Pointer)&tfa, SeqEntryRawseq);
|
|
}
|
|
|
|
char *listASNSeqs(
|
|
const char *filename, const long skiplines,
|
|
const short format, /* note: this is kASNseqentry or kASNseqset */
|
|
short *nseq, short *error)
|
|
{
|
|
AsnIoPtr aip = NULL;
|
|
SeqEntryPtr the_set;
|
|
AsnTypePtr atp, atp2;
|
|
AsnModulePtr amp;
|
|
Boolean inIsBinary =
|
|
FALSE; /* damn, why can't asn routines test this? */
|
|
char *seq = NULL;
|
|
char *seqid = NULL, stemp[256];
|
|
long seqlen;
|
|
int i, count;
|
|
|
|
*nseq = 0;
|
|
*error = 0;
|
|
|
|
/* asn dictionary setups */
|
|
/*fprintf(stderr,"listASNSeqs: SeqEntryLoad\n");*/
|
|
if (!SeqEntryLoad())
|
|
goto errxit; /* sequence alphabets (and sequence parse trees)
|
|
*/
|
|
amp = AsnAllModPtr(); /* get pointer to all loaded ASN.1 modules */
|
|
if (amp == NULL) goto errxit;
|
|
atp = AsnFind("Bioseq-set"); /* get the initial type pointers */
|
|
if (atp == NULL) goto errxit;
|
|
atp2 = AsnFind("Bioseq-set.seq-set.E");
|
|
if (atp2 == NULL) goto errxit;
|
|
|
|
/*fprintf(stderr,"listASNSeqs: AsnIoOpen\n");*/
|
|
/* open the ASN.1 input file in the right mode */
|
|
/* !!!! THIS FAILS when filename has MAC PATH (& other paths?)
|
|
* (:folder:filename) */
|
|
if ((aip = AsnIoOpen(filename, inIsBinary ? "rb" : "r")) == NULL)
|
|
goto errxit;
|
|
for (i = 0; i < skiplines; i++)
|
|
fgets(stemp, 255,
|
|
aip->fp); /* this may mess up asn routines... */
|
|
|
|
if (!ErrSetLog("stderr"))
|
|
goto errxit;
|
|
else
|
|
ErrSetOpts(ERR_CONTINUE,
|
|
ERR_LOG_ON); /*?? log errors instead of die */
|
|
|
|
if (format == kASNseqentry) { /* read one Seq-entry */
|
|
/*fprintf(stderr,"listASNSeqs: SeqEntryAsnRead\n");*/
|
|
the_set = SeqEntryAsnRead(aip, NULL);
|
|
SeqEntryToRaw(the_set, true, 0, nseq, &seq, &seqid, &seqlen);
|
|
if (seq) free(seq);
|
|
seq = NULL;
|
|
SeqEntryFree(the_set);
|
|
}
|
|
else { /* read Seq-entry's from a Bioseq-set */
|
|
count = 0;
|
|
/*fprintf(stderr,"listASNSeqs: AsnReadId\n");*/
|
|
while ((atp = AsnReadId(aip, amp, atp)) != NULL) {
|
|
if (atp == atp2) { /* top level Seq-entry */
|
|
the_set = SeqEntryAsnRead(aip, atp);
|
|
SeqEntryToRaw(the_set, true, 0, nseq, &seq,
|
|
&seqid, &seqlen);
|
|
SeqEntryFree(the_set);
|
|
if (seq) free(seq);
|
|
seq = NULL;
|
|
}
|
|
else
|
|
AsnReadVal(aip, atp, NULL);
|
|
count++;
|
|
}
|
|
}
|
|
|
|
AsnIoClose(aip);
|
|
*error = 0;
|
|
return seqid;
|
|
|
|
errxit:
|
|
AsnIoClose(aip);
|
|
if (seqid) free(seqid);
|
|
*error = eASNerr;
|
|
return NULL;
|
|
}
|
|
|
|
char *readASNSeq(
|
|
const short whichEntry, const char *filename, const long skiplines,
|
|
const short format, /* note: this is kASNseqentry or kASNseqset */
|
|
long *seqlen, short *nseq, short *error, char **seqid)
|
|
{
|
|
AsnIoPtr aip = NULL;
|
|
SeqEntryPtr the_set;
|
|
AsnTypePtr atp, atp2;
|
|
AsnModulePtr amp;
|
|
Boolean inIsBinary =
|
|
FALSE; /* damn, why can't asn routines test this? */
|
|
char *seq, stemp[200];
|
|
int i, count;
|
|
|
|
*seqlen = 0;
|
|
*nseq = 0;
|
|
*error = 0;
|
|
seq = NULL;
|
|
|
|
/*fprintf(stderr,"readASNseq: SeqEntryLoad\n");*/
|
|
/* asn dictionary setups */
|
|
if (!SeqEntryLoad())
|
|
goto errxit; /* sequence alphabets (and sequence parse trees)
|
|
*/
|
|
amp = AsnAllModPtr(); /* get pointer to all loaded ASN.1 modules */
|
|
if (amp == NULL) goto errxit;
|
|
atp = AsnFind("Bioseq-set"); /* get the initial type pointers */
|
|
if (atp == NULL) goto errxit;
|
|
atp2 = AsnFind("Bioseq-set.seq-set.E");
|
|
if (atp2 == NULL) goto errxit;
|
|
|
|
/* open the ASN.1 input file in the right mode */
|
|
/*fprintf(stderr,"readASNseq: AsnIoOpen(%s)\n", filename);*/
|
|
if ((aip = AsnIoOpen(filename, inIsBinary ? "rb" : "r")) == NULL)
|
|
goto errxit;
|
|
for (i = 0; i < skiplines; i++)
|
|
fgets(stemp, 255,
|
|
aip->fp); /* this may mess up asn routines... */
|
|
|
|
if (!ErrSetLog("stderr"))
|
|
goto errxit;
|
|
else
|
|
ErrSetOpts(ERR_CONTINUE,
|
|
ERR_LOG_ON); /*?? log errors instead of die */
|
|
|
|
seq = NULL;
|
|
if (format == kASNseqentry) { /* read one Seq-entry */
|
|
/*fprintf(stderr,"readASNseq: SeqEntryAsnRead\n");*/
|
|
the_set = SeqEntryAsnRead(aip, NULL);
|
|
SeqEntryToRaw(the_set, false, whichEntry, nseq, &seq, seqid,
|
|
seqlen);
|
|
SeqEntryFree(the_set);
|
|
goto goodexit;
|
|
}
|
|
|
|
else { /* read Seq-entry's from a Bioseq-set */
|
|
count = 0;
|
|
/*fprintf(stderr,"readASNseq: AsnReadId\n");*/
|
|
while ((atp = AsnReadId(aip, amp, atp)) != NULL) {
|
|
if (atp == atp2) { /* top level Seq-entry */
|
|
the_set = SeqEntryAsnRead(aip, atp);
|
|
SeqEntryToRaw(the_set, false, whichEntry, nseq,
|
|
&seq, seqid, seqlen);
|
|
SeqEntryFree(the_set);
|
|
if (*nseq >= whichEntry) goto goodexit;
|
|
}
|
|
else
|
|
AsnReadVal(aip, atp, NULL);
|
|
count++;
|
|
}
|
|
}
|
|
|
|
goodexit:
|
|
AsnIoClose(aip);
|
|
*error = 0;
|
|
return seq;
|
|
|
|
errxit:
|
|
AsnIoClose(aip);
|
|
*error = eASNerr;
|
|
if (seq) free(seq);
|
|
return NULL;
|
|
}
|
|
|
|
#endif /*NCBI*/
|