staden-lg/src/ted/seqIOABI.c

428 lines
12 KiB
C

/*
Title: seqIOABI
File: seqIOABI.c
Purpose: IO of ABI sequences
Last update: Tue Nov 10 1992
Change log:
27/11/90 SD writeSeqABI() outputs header to sequence file:
format: ;{noOfBases}{leftCutOff}{basesWritten}{type}{tra
cefile}
eg: ; 867 45 383ABI a09b7.s1RES
28.11.90 SD put undesirables under STLOUIS compilation flag
11.12.90 SD new static function tail to find file name in path name
02.01.91 SD Merged with St.L version
15.01.91 SD New include added (opp.h)
30.07.91 SD Those ole FWO_ field blues
17.09.91 LFW changed STLOUIS compilation flag to SAVE_EDITS
and AUTO_CLIP
25.10.91 SD Machine independant I/O...removed BIGENDIAN flag
10.11.92 SD FWO_ and S/N% interpretation. Comments for information
window.
*/
/*
This module should be regarded as part of `seq' since it is
privy to the internal structure of `Seq'.
At the moment, the `maxTraceVal' of the sequence is hardwired
as 1200, but I suspect it is encoded somewhere in the ABI file
structure, or if not, we should calculate it as we read the
traces in.
The ABI results file is controlled by an index found towards
the end --- this is pointed to by a longword found at `IndexPO'.
The index consists of a number of entries, each of which is
four character label followed by 6 long words. The first of these
long words holds a simple count (starting at 1) for those cases
where there are multiple entries with the same label. Entries should
be found by label (and count), rather than their index position,
because entries can be ommited or new ones added. This happens when
ABI changes the version of their software and also depending
on whether the data was analysed or unalaysed. We do, however,
make assumptions about the relative order of entries.
Ideally we would have a separate module which provides a number
of functions to extract the data we are interested in, keeping
the ABI format well wrapped up and out of harms way.
Note that we are relying on the endian-ness of the machine being
appropriate so we can just read long words in as integers. This
should be recoded to deal with running on different endians.
All references to the seq->bottom, the bottom strand of the
sequence, were added by lfw
*/
/* ---- Imports ---- */
#include "seq.h" /* IMPORT: Seq, BasesAndTraces, NULLSeq,
newSeq, freeSeq */
#include "seqIOABI.h"
#include "seqIOEdit.h" /* IMPORT: writeEdSeq,
readEdSEq */
#include "fpoint.h" /* IMPORT: int_to_float */
#include <ctype.h>
#include <stdio.h> /* IMPORT: fopen, fclose, fseek, ftell, fgetc,
EOF */
#include <stdlib.h>
#include "mach-io.h" /* IMPORT: machine independant io library */
#include "dialogues.h"
/* ---- Constants ---- */
#define BasesPerLine 50 /* For output formatting */
/*
The index is located towards the end of the ABI trace file.
It's location is given by a longword at a fixed place.
*/
#define IndexPO ((off_t)26)
#define IndexEntryLength 28
/*
Here are some labels we will be looking for, four chars packed
into a long word.
*/
#define DataEntryLabel ((int_4) ((((('D'<<8)+'A')<<8)+'T')<<8)+'A')
#define BaseEntryLabel ((int_4) ((((('P'<<8)+'B')<<8)+'A')<<8)+'S')
#define BasePosEntryLabel ((int_4) ((((('P'<<8)+'L')<<8)+'O')<<8)+'C')
#define SpacingEntryLabel ((int_4) ((((('S'<<8)+'P')<<8)+'A')<<8)+'C')
#define SignalEntryLabel ((int_4) ((((('S'<<8)+'/')<<8)+'N')<<8)+'%')
#define FWO_Label ((int_4) ((((('F'<<8)+'W')<<8)+'O')<<8)+'_')
#define MCHNLabel ((int_4) ((((('M'<<8)+'C')<<8)+'H')<<8)+'N')
#define PDMFLabel ((int_4) ((((('P'<<8)+'D')<<8)+'M')<<8)+'F')
#define SMPLLabel ((int_4) ((((('S'<<8)+'M')<<8)+'P')<<8)+'L')
#define PPOSLabel ((int_4) ((((('P'<<8)+'P')<<8)+'O')<<8)+'S')
/* ---- Internal functions ---- */
static Boolean getIndexEntryLW(FILE *fp, off_t indexO,
uint_4 label, uint_4 count, int lw,
uint_4 *val)
/*
From the ABI results file connected to `fp' whose index starts
at byte offset `indexO', return in `val' the `lw'th long word
from the `count'th entry labelled `label'.
The result indicates success.
*/
{ off_t entryNum=-1;
int i;
uint_4 entryLabel, entryLw1;
do
{ entryNum++;
if (fseek(fp, indexO+(entryNum*IndexEntryLength), 0) != 0) return(-1);
if (!be_read_int_4(fp, &entryLabel)) return(False);
if (!be_read_int_4(fp, &entryLw1)) return(False);
} while (!(entryLabel == label && entryLw1 == count));
for(i=2; i<=lw; i++)
if (!be_read_int_4(fp, val)) return(-1);
return(True);
}
/* ---- Exports ---- */
Seq readSeqABI(char *fn)
/*
Read the ABI format sequence with name `fn' into `seq'.
All printing characters (as defined by ANSII C `isprint')
are accepted, but `N's are translated to `-'s. If bottom
is True, we read the sequence in in reverse and complemented
A NULL result indicates failure.
In this module all references to the bottom strand were
inserted by lfw.
*/
{ Seq seq = NULLSeq;
uint_4 fwo_; /* base -> lane mapping */
FILE *fp;
uint_4 numPoints, numBases;
uint_4 signalO;
uint_4 indexO; /* File offset where the index is */
uint_4 baseO; /* File offset where the bases are stored */
uint_4 basePosO; /* File offset where the base positions are stored */
uint_4 dataCO; /* File offset where the C trace is stored */
uint_4 dataAO; /* File offset where the A trace is stored */
uint_4 dataGO; /* File offset where the G trace is stored */
uint_4 dataTO; /* File offset where the T trace is stored */
uint_4 MCHN_O; /* File offset where the machine name is stored */
uint_4 PDMF_O; /* File offset where the dye primer guff is stored */
/*
uint_4 SMPL_O; /. File offset where the sample name is stored ./
*/
int i;
if ((fp = fopen(fn, "rb")) == NULL) return(NULLSeq);
/* Get the index offset */
if ((fseek(fp, IndexPO, 0) != 0) || (!be_read_int_4(fp, &indexO)))
{ fclose(fp);
return(NULLSeq);
}
/* Get the number of points */
if (!getIndexEntryLW(fp,(off_t)indexO,DataEntryLabel,9,3,&numPoints))
{ fclose(fp);
return(NULLSeq);
}
/* Get the number of bases */
if (!getIndexEntryLW(fp,(off_t)indexO,BaseEntryLabel,1,3,&numBases))
{ fclose(fp);
return(NULLSeq);
}
/* Allocate the sequence */
if ((seq = newSeq(BasesAndTraces, numPoints, numBases)) == NULLSeq)
{ fclose(fp);
return(NULLSeq);
}
seq->mode = BasesAndTraces;
seq->format = ABIFormat;
seq->dirty = False;
seq->maxTraceVal = 1200;
seq->NPoints = numPoints;
seq->NorigBases = numBases;
seq->NedBases = numBases;
seq->bottom = False;
/*************************************************************
*
*************************************************************/
/*
** The order of the DATA fields is determined by the field FWO_
** Juggle around with data pointers to get it right
*/
{
uint_4 *dataxO[4];
dataxO[0] = &dataCO;
dataxO[1] = &dataAO;
dataxO[2] = &dataGO;
dataxO[3] = &dataTO;
/* Get the Freak World Out (FWO?) field ... */
if (!getIndexEntryLW(fp,(off_t)indexO,FWO_Label,1,5,&fwo_))
{ fclose(fp);
return(NULLSeq);
}
#define baseIndex(B) ((B)=='C'?0:(B)=='A'?1:(B)=='G'?2:3)
/*Get the positions of the four traces */
if (! ( getIndexEntryLW(fp,(off_t)indexO,DataEntryLabel,9 ,5,dataxO[baseIndex((char)(fwo_>>24&255))]) &&
getIndexEntryLW(fp,(off_t)indexO,DataEntryLabel,10,5,dataxO[baseIndex((char)(fwo_>>16&255))]) &&
getIndexEntryLW(fp,(off_t)indexO,DataEntryLabel,11,5,dataxO[baseIndex((char)(fwo_>>8&255))]) &&
getIndexEntryLW(fp,(off_t)indexO,DataEntryLabel,12,5,dataxO[baseIndex((char)(fwo_&255))])
)
)
{ fclose(fp);
freeSeq(seq);
return(NULLSeq);
}
}
/* Read in the C trace */
if (fseek(fp, (off_t)dataCO, 0) != 0) {fclose(fp);freeSeq(seq);return(NULLSeq);}
for (i=0;i<(seq->NPoints);i++)
{ if (!be_read_int_2(fp, &(seq->traceC[i])))
{fclose(fp);freeSeq(seq);return(NULLSeq);}
}
/* Read in the A trace */
if (fseek(fp, (off_t)dataAO, 0) != 0) {fclose(fp);freeSeq(seq);return(NULLSeq);}
for (i=0;i<(seq->NPoints);i++)
{ if (!be_read_int_2(fp, &seq->traceA[i]))
{fclose(fp);freeSeq(seq);return(NULLSeq);}
}
/* Read in the G trace */
if (fseek(fp, (off_t)dataGO, 0) != 0) {fclose(fp);freeSeq(seq);return(NULLSeq);}
for (i=0;i<(seq->NPoints);i++)
{ if (!be_read_int_2(fp, &seq->traceG[i]))
{fclose(fp);freeSeq(seq);return(NULLSeq);}
}
/* Read in the T trace */
if (fseek(fp, (off_t)dataTO, 0) != 0) {fclose(fp);freeSeq(seq);return(NULLSeq);}
for (i=0;i<(seq->NPoints);i++)
{ if (!be_read_int_2(fp, &seq->traceT[i]))
{fclose(fp);freeSeq(seq);return(NULLSeq);}
}
/* Read in the bases */
if (! (getIndexEntryLW(fp,(off_t)indexO,BaseEntryLabel,1,5,&baseO) &&
(fseek(fp, (off_t)baseO, 0) == 0)
)
)
{ fclose(fp);
freeSeq(seq);
return(NULLSeq);
}
for (i=0;i<(seq->NorigBases);i++)
{ int ch;
if ((ch = fgetc(fp)) == EOF)
{ fclose(fp);
freeSeq(seq);
return(NULLSeq);
}
seq->base[i] = (ch == 'N') ? '-' : ch;
}
/* Read in the base positions */
if (!(getIndexEntryLW(fp,(off_t)indexO,BasePosEntryLabel,1,5,&basePosO) &&
(fseek(fp, (off_t)basePosO, 0) == 0)
)
)
{ fclose(fp);
freeSeq(seq);
return(NULLSeq);
}
for (i=0;i<(seq->NorigBases);i++)
{ if (!be_read_int_2(fp, (uint_2 *)&seq->basePos[i]))
{ fclose(fp); freeSeq(seq); return(NULLSeq); }
}
/*************************************************************
* Gather useful information
*************************************************************/
{
char comment[1024];
char line[128];
int4 spacing;
int4 ppos;
*comment = '\0';
/* Get Signal Strength Offset */
if (getIndexEntryLW(fp,(off_t)indexO,SignalEntryLabel,1,5,&signalO)) {
int2 C,A,G,T;
int2 *base[4];
base[0] = &C;
base[1] = &A;
base[2] = &G;
base[3] = &T;
if (fseek(fp, (off_t)signalO, 0) >= 0 &&
be_read_int_2(fp, (uint_2 *)base[baseIndex((char)(fwo_>>24&255))]) &&
be_read_int_2(fp, (uint_2 *)base[baseIndex((char)(fwo_>>16&255))]) &&
be_read_int_2(fp, (uint_2 *)base[baseIndex((char)(fwo_>>8&255))]) &&
be_read_int_2(fp, (uint_2 *)base[baseIndex((char)(fwo_&255))])) {
sprintf(line,"avg_signal_strength = C:%d A:%d G:%d T:%d\n",C,A,G,T);
strcat(comment,line);
}
}
/* Get the spacing.. it's a float but don't worry yet */
if (getIndexEntryLW(fp,(off_t)indexO,SpacingEntryLabel,1,5,(uint_4*)&spacing)) {
sprintf(line,"avg_spacing = %6.2f\n",int_to_float(spacing));
strcat(comment,line);
}
/* Get primer position */
if (getIndexEntryLW(fp,(off_t)indexO,PPOSLabel,1,5,(uint_4 *)&ppos)) {
sprintf(line,"primer_position = %d\n",(ppos>>16)); /* ppos stores in MBShort of pointer */
strcat(comment,line);
}
/* Get Machine Name Offset */
if (getIndexEntryLW(fp,(off_t)indexO,MCHNLabel,1,5,&MCHN_O)) {
if (fseek(fp, (off_t)MCHN_O, 0) >= 0) {
unsigned char l;
char buffer[256];
/* first byte is a length */
fread(&l,sizeof(char),1,fp);
fread(buffer,l,1,fp);
sprintf(line,"machine_name = %.*s\n",l,buffer);
strcat(comment,line);
}
}
/* Get Dye Primer Offset */
if (getIndexEntryLW(fp,(off_t)indexO,PDMFLabel,1,5,&PDMF_O)) {
if (fseek(fp, (off_t)PDMF_O, 0) >= 0) {
unsigned char l;
char buffer[256];
/* first byte is a length */
fread(&l,sizeof(char),1,fp);
fread(buffer,l,1,fp);
sprintf(line,"dye_primer = %.*s\n",l,buffer);
strcat(comment,line);
}
}
/* Get Sample Name Offset */
/*
* The code here needs to be a bit more clever
* When sample name length is less than (or equal?) four, the
* string is packed into the SMPL_O field
if (getIndexEntryLW(fp,(off_t)indexO,SMPLLabel,1,5,&SMPL_O)) {
if (fseek(fp, (off_t)SMPL_O, 0) >= 0) {
unsigned char l;
char buffer[256];
/. first byte is a length ./
fread(&l,sizeof(char),1,fp);
fread(buffer,l,1,fp);
sprintf(line,"sample_name = %.*s\n",l,buffer);
strcat(comment,line);
}
}
*/
/* dumplicate string and set info */
{
char *s = (char *)malloc(strlen(comment)+1);
strcpy(s,comment);
seq->info = s;
}
}
/* Finished with the file */
fclose(fp);
return(seq);
}