This commit is contained in:
qaqland 2022-01-13 20:12:32 +08:00
parent 1ee11d7cb1
commit e53203768f
9 changed files with 231 additions and 180 deletions

15
a.nex Normal file
View file

@ -0,0 +1,15 @@
#NEXUS
BEGIN DATA;
DIMENSIONS NTAX=4 NCHAR=6;
FORMAT DATATYPE=DNA GAP=- MISSING=?;
MATRIX
'>Bradybaena circulus circulus' -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------aactatatg-gtatcatatatagctatttctgctcaatg-taatataaatagccgcagtactctgactgtgctaaggtagcataatcatttggcttataattgaagtctagtatgaaagaagatatgggagttaactgtttcctaaacgtttacttaatttacttagggggtgaaaatacccccacaaacataatagacgagaagacccttgaaatttttagtata---attttaaatcgtgctttttgttggggcgacaaggtagcatagtaaacctactaagtggttttattagaacaaaattgtatgaataattaaattactcaagggataacagcataatattttaaagtttgtgacctcgatgttggactaggacaatatagtttaaaagactattatttttgctctgttcg---------------------tatatattatttggtgtttggtgtgggatagttggtacaggtttaagattattgattcgaatagagttaggtagttctggtgttatagcagaagagcatttctacaatgttattgtaacagctcatgcttttgtaataattttttttatagttatgccaatcatgattggaggttttggaaattgaatagtaccgttgttgattggggcacccgatatgagctttccacgtataaacaatataaggttttgattgttacccccttcttttcttctattaattagaagtagtctagtagaaggcggtgcagggaccggttgaacagtgtatcctccacttagctcacttgtaggtcataggagagctgccgtagacttagcaatcttttctcttcatttggctgggatatcatcaattttaggtgcaatcaattttattacaactatttttaatatacgagccccaggaataactatggaacgtgttagactgtttgtttgatctattttagtgacagtgtttcttttattact-----------------------------------------------------------------------------
'>Bradybaena phaeogramma phaeogramma' -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------aactatatg-gtatcatatatagctatttctgctcaatg-taatataaatagccgcagtactctgactgtgctaaggtagcataatcatttggcttataattgaagtctagtatgaaagaagatatgggagttaactgtttcctaaacgtttacttaatttacttagggggtgaaaatacccccacaaacataatagacgagaagacccttgaaatttttagtata---attttaaatcgtgctttttgttggggcgacaaggtagcatagtaaacctactaagtggttttattagaacaaaattgtatgaataattaaattactcaagggataacagcataatattttaaagtttgtgacctcgatgttggactaggacaatatagtttaaaagactattatttttgctctgttcg---------------------tatatattatttggtgtttggtgtgggatagttggtacaggtttaagattattgattcgaatagagttaggtagttctggtgttatagcagaagagcatttctacaatgttattgtaacagctcatgcttttgtaataattttttttatagttatgccaatcatgattggaggttttggaaattgaatagtaccgttgttgattggggcacccgatatgagctttccacgtataaacaatataaggttttgattgttacccccttcttttcttctattaattagaagtagtctagtagaaggcggtgcagggaccggttgaacagtgtatcctccacttagctcacttgtaggtcataggagagctgccgtagacttagcaatcttttctcttcatttggctgggatatcatcaattttaggtgcaatcaattttattacaactatttttaatatacgagccccaggaataactatggaacgtgttagactgtttgtttgatctattttagtgacagtgtttcttttattact-----------------------------------------------------------------------------
'>Bradybaena similaris' -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------aactatatg-gtatcatatatagctatttctgctcaatg-taatataaatagccgcagtactctgactgtgctaaggtagcataatcatttggcttataattgaagtctagtatgaaagaagatatgggagttaactgtttcctaaacgtttacttaatttacttagggggtgaaaatacccccacaaacataatagacgagaagacccttgaaatttttagtata---attttaaatcgtgctttttgttggggcgacaaggtagcatagtaaacctactaagtggttttattagaacaaaattgtatgaataattaaattactcaagggataacagcataatattttaaagtttgtgacctcgatgttggactaggacaatatagtttaaaagactattatttttgctctgttcg---------------------???
'>Bradybaena virgo virgo' ???tatatattatttggtgtttggtgtgggatagttggtacaggtttaagattattgattcgaatagagttaggtagttctggtgttatagcagaagagcatttctacaatgttattgtaacagctcatgcttttgtaataattttttttatagttatgccaatcatgattggaggttttggaaattgaatagtaccgttgttgattggggcacccgatatgagctttccacgtataaacaatataaggttttgattgttacccccttcttttcttctattaattagaagtagtctagtagaaggcggtgcagggaccggttgaacagtgtatcctccacttagctcacttgtaggtcataggagagctgccgtagacttagcaatcttttctcttcatttggctgggatatcatcaattttaggtgcaatcaattttattacaactatttttaatatacgagccccaggaataactatggaacgtgttagactgtttgtttgatctattttagtgacagtgtttcttttattact-----------------------------------------------------------------------------
;
END;
BEGIN SETS;
CHARSET ./data/16S.fas = 1-3;
CHARSET ./data/CO1.fas = 4-6;
END;

38
count.go Normal file
View file

@ -0,0 +1,38 @@
package main
import "fmt"
// charset describes one CHARSET entry of the generated NEXUS file: a label
// plus the inclusive range of alignment columns that belong to it.
type charset struct {
	Name     string // label printed after the CHARSET keyword
	From, To int    // first and last column of the range, both inclusive
}
// fas_sum walks the input files listed in file_input and collects the basic
// data of each one by running it through fas_parser. The results are returned
// in the same order as the inputs; a progress line is printed after each file.
func fas_sum() []dna {
	parsed := []dna{}
	for idx := 0; idx < len(file_input); idx++ {
		path := file_input[idx]
		parsed = append(parsed, fas_parser(path))
		// Progress numbering shown to the user starts at 1.
		fmt.Println("[ working A ]", idx+1, path)
	}
	return parsed
}
// fas_count combines the per-file statistics into one charset per input file.
// The ranges are laid end to end: the first one starts at column 1 and every
// following one starts right after the previous range ends. The width of each
// range is the count reported for that file by fas_sum.
func fas_count() []charset {
	parsed := fas_sum()
	ranges := []charset{}

	start := 1 // first column of the range currently being built
	for _, file := range parsed {
		end := start + file.count - 1
		fmt.Println("[ working B ]", file.name, start, end)
		ranges = append(ranges, charset{Name: file.name, From: start, To: end})
		start = end + 1
	}
	return ranges
}

20
flag.go Normal file
View file

@ -0,0 +1,20 @@
package main
import (
"flag"
"fmt"
)
// Command-line settings shared by the whole program; both are assigned in
// dna_flag.
var (
// file_output is the path of the NEXUS file to write (value of the -o flag).
file_output string
// file_input holds the remaining non-flag arguments, used as input file names.
file_input []string
)
// dna_flag parses the process command line into the package-level settings:
// the -o flag (default "a.nex") goes to file_output and every remaining
// positional argument goes to file_input. It then echoes both to stdout.
func dna_flag() {
	cmd := flag.CommandLine // the default flag set, bound to os.Args
	cmd.StringVar(&file_output, "o", "a.nex", "files name wait to out")
	flag.Parse()

	// Whatever is left once the flags are consumed is the list of inputs.
	file_input = cmd.Args()

	fmt.Println("==============")
	fmt.Println("[input file:]", file_input)
	fmt.Println("[output file:]", file_output)
}

33
gocomb.go Normal file
View file

@ -0,0 +1,33 @@
package main
import (
"strings"
)
// tmpl_data is the value handed to the NEXUS template when it is executed.
type tmpl_data struct {
	Ntax, Nchar int               // taxon count and last alignment column, as returned by dna_mix
	Matrix      map[string]string // taxon name -> its per-file sequences joined into one string
	Charset     []charset         // one column range per input file
}
// main wires the pipeline together: read the command line, parse the input
// files, derive the per-file column ranges, merge the sequences per taxon and
// render the result through the template.
//
// NOTE(review): fas_count calls fas_sum itself, so the inputs are parsed once
// here and once more inside fas_count.
func main() {
	dna_flag()

	parsed := fas_sum()
	sets := fas_count()
	columns, ntax, nchar := dna_mix(parsed, sets)

	// Collapse each taxon's per-file pieces into a single matrix row.
	matrix := make(map[string]string, ntax)
	for taxon, pieces := range columns {
		matrix[taxon] = strings.Join(pieces, "")
	}

	// Prepare the data that is sent to the template.
	do_impl(tmpl_data{Ntax: ntax, Nchar: nchar, Matrix: matrix, Charset: sets})
}

32
mix.go Normal file
View file

@ -0,0 +1,32 @@
package main
import "strings"
// dna_mix merges the per-file sequence tables into one table keyed by taxon.
//
// sum_nex holds one parsed file per entry and sum_charset the matching column
// range for each file, in the same order. The returned map has one row per
// distinct taxon name found in any file; row[k] is that taxon's sequence from
// file k, or a run of v.count '?' characters when file k does not contain the
// taxon. The two ints are the number of taxa and the last column of the final
// charset.
//
// An empty sum_charset yields an empty table and zero counts instead of
// indexing past the start of the slice.
func dna_mix(sum_nex []dna, sum_charset []charset) (map[string][]string, int, int) {
	sum_dna := make(map[string][]string)
	if len(sum_charset) == 0 {
		return sum_dna, 0, 0
	}
	nchar := sum_charset[len(sum_charset)-1].To

	// First pass: create one row per distinct taxon, one slot per charset.
	for _, v := range sum_nex {
		for name := range v.min_dna {
			if _, has := sum_dna[name]; !has {
				sum_dna[name] = make([]string, len(sum_charset))
			}
		}
	}

	// Second pass: fill column k of every row from file k. Each taxon must
	// receive its own sequence, looked up by name in the file's table.
	for k, v := range sum_nex {
		missing := strings.Repeat("?", v.count)
		for name, row := range sum_dna {
			if seq, ok := v.min_dna[name]; ok {
				row[k] = seq
			} else {
				row[k] = missing
			}
		}
	}

	return sum_dna, len(sum_dna), nchar
}

View file

@ -1,6 +1,4 @@
package fas_parser
const Nex_tmpl = `#NEXUS
#NEXUS
BEGIN DATA;
DIMENSIONS NTAX={{ .Ntax }} NCHAR={{ .Nchar }};
FORMAT DATATYPE=DNA GAP=- MISSING=?;
@ -14,7 +12,3 @@ BEGIN SETS;
CHARSET {{ $i.Name }} = {{ $i.From }}-{{ $i.To }};
{{- end }}
END;
`
// 最后那个 $i 好像有问题
// {{/* $k| printf "%-40s" */}}

135
parser.go
View file

@ -1,119 +1,52 @@
package main
import (
"flag"
"fmt"
fas_parser "gocomb/src"
"os"
"strings"
"text/template"
"io/ioutil"
)
type dna struct {
name string
min_dna map[string]string
count int
min_dna map[string]string
}
type charset struct {
Name string
From int
To int
}
// 读取fas文件
func fas_parser(file_name string) dna {
type tmpl_data struct {
Ntax int
Nchar int
Matrix map[string]string
Charset []charset
}
func main() {
// 读取命令行,这里一定要是指针
file_export := flag.String("o", "a.nex", "files name wait to out")
flag.Parse()
file_names := flag.Args() // []string{"foo", "bar"}
fmt.Println("[ export here ]", *file_export)
// 遍历文件得到基本数据
sum_nex := make([]dna, 0, 5)
for k, v := range file_names {
i, j := fas_parser.Fas_parser(v)
new_nex := dna{v, i, j}
sum_nex = append(sum_nex, new_nex)
fmt.Println("[ working A ]", k+1, v)
}
// 整合若干文件的统计
sum_charset := []charset{}
for k, v := range sum_nex {
n := v.name
f := 1
if k != 0 {
f = sum_charset[k-1].To + 1
}
t := f + v.count - 1
fmt.Println("[ working B ]", n, f, t)
new_charset := charset{n, f, t}
sum_charset = append(sum_charset, new_charset)
}
// fmt.Println(sum_charset)
// dna 的整合
ntax := 0
nchar := sum_charset[len(sum_charset)-1].To
sum_dna := make(map[string][]string)
for _, v := range sum_nex {
for k1 := range v.min_dna {
_, has := sum_dna[k1]
if !has {
sum_dna[k1] = make([]string, len(sum_charset))
ntax ++
}
}
}
for k, v := range sum_nex {
for _, v1 := range v.min_dna {
for k2 := range sum_dna {
if _, ok := v.min_dna[k2]; ok {
sum_dna[k2][k] = v1
} else {
sum_dna[k2][k] = strings.Repeat("?", v.count)
}
}
}
}
// fmt.Println(sum_dna)
matrix := make(map[string]string, ntax)
for k := range sum_dna {
matrix[k] = strings.Join(sum_dna[k], "")
}
// 准备发射到模板的数据
last_data := tmpl_data{ntax, nchar, matrix, sum_charset}
// fmt.Println(last_data)
// 读取模板
nex_tmpl, err := template.New("nex").Parse(fas_parser.Nex_tmpl)
// 读取文件
f, err := ioutil.ReadFile("./" + file_name)
if err != nil {
fmt.Println("[ tmpl err ]", err)
return
fmt.Println(err)
return dna{"", 0, nil}
}
// 覆盖创建要写入的 nex 文件
new_file, err := os.OpenFile(*file_export, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0666)
if err != nil {
fmt.Println("[ create or open file error ]", err)
return
}
defer new_file.Close()
count := 0
i := 0 // acgt行计数
j := -1 // 标题行计数
seq := make(map[string]string)
section := ""
// 写入 nex 模板
err = nex_tmpl.Execute(new_file, last_data)
if err != nil {
fmt.Println("[ err at tmpl exec ]", err)
return
for k, v := range f {
switch v {
case '>':
j = k
count++
case '\n':
if j != -1 {
section = string(f[j:k])
i = k + 1
j = -1
continue
}
seq[section] = seq[section] + string(f[i:k])
i = k + 1
}
}
// for k1, v1 := range seq {
// fmt.Println(k1)
// fmt.Println(v1)
// }
// fmt.Println(count)
return dna{file_name, count, seq}
}

View file

@ -1,53 +0,0 @@
package fas_parser
import(
"io/ioutil"
"fmt"
)
// Fas_parser reads "./"+file_name and splits it, byte by byte, into named
// sections of sequence text.
//
// A line is treated as sequence text while it consists only of the bytes
// 'a', 'c', 'g', 't' and '-'. Any other byte marks the line as a name line;
// the section name is the text from just after that first other byte up to
// the end of the line. Sequence lines are appended to the most recent name.
//
// It returns the name -> sequence map together with count, the summed length
// of the sequence lines seen while the map held a single section. On a read
// error the error is printed and (nil, 0) is returned.
//
// NOTE(review): text is only flushed when '\n' is reached, so a final line
// without a trailing newline is not stored. Offset 0 doubles as the "unset"
// marker for both i and j, so a sequence starting at byte 0 is not tracked.
func Fas_parser(file_name string) (map[string]string, int) {
f, err := ioutil.ReadFile("./" + file_name)
if err != nil {
fmt.Println(err)
return nil, 0
}
// fmt.Println(f)
count := 0
i := 0 // start offset of the sequence line being read; 0 means none is open
j := 0 // offset just past the first non-sequence byte of the current name line; 0 means none
seq := make(map[string]string)
section := ""
// fmt.Println('a', 'c', 'g', 't', '-', '\n', '\r')
for k, v := range f {
switch v {
case 'a', 'c', 'g', 't', '-':
// Inside a name line these bytes are part of the name, not sequence data.
if j != 0 {
continue
}
// First sequence byte of the line: remember where the line starts.
if i == 0 {
i = k
}
case '\n':
// End of a sequence line: append it to the current section.
if i != 0 {
seq[section] = seq[section] + string(f[i:k])
// Only lines seen while the map holds one section add to count.
if len(seq) < 2 && j == 0 {
count += k - i
}
i = 0
continue
}
// End of a name line: it becomes the current section.
section = string(f[j:k])
j = 0
default:
// Any other byte starts (or continues) a name line.
if j == 0 {
j = k + 1
}
}
}
// for k1, v1 := range seq {
// fmt.Println(k1)
// fmt.Println(v1)
// }
// fmt.Println(count)
return seq, count
}

39
tmpl.go Normal file
View file

@ -0,0 +1,39 @@
package main
import (
"fmt"
"io/ioutil"
"os"
"text/template"
)
// do_impl renders last_data through the template stored in "nex.tmpl" and
// writes the result to file_output, creating the file or truncating an
// existing one. Every failure is reported on stdout and ends the function
// early; nothing is returned to the caller.
func do_impl(last_data tmpl_data) {
	raw, readErr := ioutil.ReadFile("nex.tmpl")
	if readErr != nil {
		fmt.Println(readErr)
		return
	}

	// Parse the template text.
	tmpl, parseErr := template.New("nex").Parse(string(raw))
	if parseErr != nil {
		fmt.Println("[ tmpl err ]", parseErr)
		return
	}

	// Create the nex file to write, overwriting any existing content. This is
	// done only after the template parsed, so a bad template leaves an
	// existing output file untouched.
	out, openErr := os.OpenFile(file_output, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0666)
	if openErr != nil {
		fmt.Println("[ create or open file error ]", openErr)
		return
	}
	defer out.Close()

	// Write the rendered template into the file.
	if execErr := tmpl.Execute(out, last_data); execErr != nil {
		fmt.Println("[ err at tmpl exec ]", execErr)
	}
}