Merge pull request from MalacoLab/rebuild

Rebuild
This commit is contained in:
kuoi 2022-01-15 10:58:45 +00:00 committed by GitHub
commit f09afd591b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 249 additions and 181 deletions

2
.gitignore vendored Normal file
View file

@ -0,0 +1,2 @@
a.nex
test.go

16
a.nex Normal file
View file

@ -0,0 +1,16 @@
#NEXUS
BEGIN DATA;
DIMENSIONS NTAX=4 NCHAR=2031;
FORMAT DATATYPE=DNA GAP=- MISSING=?;
MATRIX
'Bradybaena circulus circulus' -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------aactatatg-gtatcatatatagctatttctgctcaatg-taatataaatagccgcagtactctgactgtgctaaggtagcataatcatttggcttataattgaagtctagtatgaaagaagatatgggagttaactgtttcctaaacgtttacttaatttacttagggggtgaaaatacccccacaaacataatagacgagaagacccttgaaatttttagtata---attttaaatcgtgctttttgttggggcgacaaggtagcatagtaaacctactaagtggttttattagaacaaaattgtatgaataattaaattactcaagggataacagcataatattttaaagtttgtgacctcgatgttggactaggacaatatagtttaaaagactattatttttgctctgttcg---------------------tatatattatttggtgtttggtgtgggatagttggtacaggtttaagattattgattcgaatagagttaggtagttctggtgttatagcagaagagcatttctacaatgttattgtaacagctcatgcttttgtaataattttttttatagttatgccaatcatgattggaggttttggaaattgaatagtaccgttgttgattggggcacccgatatgagctttccacgtataaacaatataaggttttgattgttacccccttcttttcttctattaattagaagtagtctagtagaaggcggtgcagggaccggttgaacagtgtatcctccacttagctcacttgtaggtcataggagagctgccgtagacttagcaatcttttctcttcatttggctgggatatcatcaattttaggtgcaatcaattttattacaactatttttaatatacgagccccaggaataactatggaacgtgttagactgtttgtttgatctattttagtgacagtgtttcttttattact-----------------------------------------------------------------------------tatatattatttggtgtttggtgtgggatagttggtacaggtttaagattattgattcgaatagagttaggtagttctggtgttatagcagaagagcatttctacaatgttattgtaacagctcatgcttttgtaataattttttttatagttatgccaatcatgattggaggttttggaaattgaatagtaccgttgttgattggggcacccgatatgagctttccacgtataaacaatataaggttttgattgttacccccttcttttcttctattaattagaagtagtctagtagaaggcggtgcagggaccggttgaacagtgtatcctccacttagctcacttgtaggtcataggagagctgccgtagacttagcaatcttttctcttcatttggctgggatatcatcaattttaggtgcaatcaattttattacaactatttttaatatacgagccccaggaataactatggaacgtgttagactgtttgtttgatctattttagtgacagtgtttcttttattact---------------
--------------------------------------------------------------
'Bradybaena phaeogramma phaeogramma' -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------aactatatg-gtatcatatatagctatttctgctcaatg-taatataaatagccgcagtactctgactgtgctaaggtagcataatcatttggcttataattgaagtctagtatgaaagaagatatgggagttaactgtttcctaaacgtttacttaatttacttagggggtgaaaatacccccacaaacataatagacgagaagacccttgaaatttttagtata---attttaaatcgtgctttttgttggggcgacaaggtagcatagtaaacctactaagtggttttattagaacaaaattgtatgaataattaaattactcaagggataacagcataatattttaaagtttgtgacctcgatgttggactaggacaatatagtttaaaagactattatttttgctctgttcg---------------------tatatattatttggtgtttggtgtgggatagttggtacaggtttaagattattgattcgaatagagttaggtagttctggtgttatagcagaagagcatttctacaatgttattgtaacagctcatgcttttgtaataattttttttatagttatgccaatcatgattggaggttttggaaattgaatagtaccgttgttgattggggcacccgatatgagctttccacgtataaacaatataaggttttgattgttacccccttcttttcttctattaattagaagtagtctagtagaaggcggtgcagggaccggttgaacagtgtatcctccacttagctcacttgtaggtcataggagagctgccgtagacttagcaatcttttctcttcatttggctgggatatcatcaattttaggtgcaatcaattttattacaactatttttaatatacgagccccaggaataactatggaacgtgttagactgtttgtttgatctattttagtgacagtgtttcttttattact-----------------------------------------------------------------------------tatatattatttggtgtttggtgtgggatagttggtacaggtttaagattattgattcgaatagagttaggtagttctggtgttatagcagaagagcatttctacaatgttattgtaacagctcatgcttttgtaataattttttttatagttatgccaatcatgattggaggttttggaaattgaatagtaccgttgttgattggggcacccgatatgagctttccacgtataaacaatataaggttttgattgttacccccttcttttcttctattaattagaagtagtctagtagaaggcggtgcagggaccggttgaacagtgtatcctccacttagctcacttgtaggtcataggagagctgccgtagacttagcaatcttttctcttcatttggctgggatatcatcaattttaggtgcaatcaattttattacaactatttttaatatacgagccccaggaataactatggaacgtgttagactgtttgtttgatctattttagtgacagtgtttcttttattact---------
--------------------------------------------------------------------
'Bradybaena similaris' -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------aactatatg-gtatcatatatagctatttctgctcaatg-taatataaatagccgcagtactctgactgtgctaaggtagcataatcatttggcttataattgaagtctagtatgaaagaagatatgggagttaactgtttcctaaacgtttacttaatttacttagggggtgaaaatacccccacaaacataatagacgagaagacccttgaaatttttagtata---attttaaatcgtgctttttgttggggcgacaaggtagcatagtaaacctactaagtggttttattagaacaaaattgtatgaataattaaattactcaagggataacagcataatattttaaagtttgtgacctcgatgttggactaggacaatatagtttaaaagactattatttttgctctgttcg
'Bradybaena virgo virgotatatattatttggtgtttggtgtgggatagttggtacaggtttaagattattgattcgaatagagttaggtagttctggtgttatagcagaagagcatttctacaatgttattgtaacagctcatgcttttgtaataattttttttatagttatgccaatcatgattggaggttttggaaattgaatagtaccgttgttgattggggcacccgatatgagctttccacgtataaacaatataaggttttgattgttacccccttcttttcttctattaattagaagtagtctagtagaaggcggtgcagggaccggttgaacagtgtatcctccacttagctcacttgtaggtcataggagagctgccgtagacttagcaatcttttctcttcatttggctgggatatcatcaattttaggtgcaatcaattttattacaactatttttaatatacgagccccaggaataactatggaacgtgttagactgtttgtttgatctattttagtgacagtgtttcttttattact-----------------------------------------------------------------------------tatatattatttggtgtttggtgtgggatagttggtacaggtttaagattattgattcgaatagagttaggtagttctggtgttatagcagaagagcatttctacaatgttattgtaacagctcatgcttttgtaataattttttttatagttatgccaatcatgattggaggttttggaaattgaatagtaccgttgttgattggggcacccgatatgagctttccacgtataaacaatataaggttttgattgttacccccttcttttcttctattaattagaagtagtctagtagaaggcggtgcagggaccggttgaacagtgtatcctccacttagctcacttgtaggtcataggagagctgccgtagacttagcaatcttttctcttcatttggctgggatatcatcaattttaggtgcaatcaattttattacaactatttttaatatacgagccccaggaataactatggaacgtgttagactgtttgtttgatctattttagtgacagtgtttcttttattact-----------------------------------------------------------------------------
;
END;
BEGIN SETS;
CHARSET ./data/16S.fas = 1-811;
CHARSET ./data/CO1.fas = 812-1421;
CHARSET ./data/CO1.mega.fas = 1422-2031;
END;

49
count.go Normal file
View file

@ -0,0 +1,49 @@
package main
import (
"fmt"
"regexp"
)
// charset is one named column range of the combined alignment. The fields
// are exported because the output template reads them as .Name, .From and .To.
type charset struct {
	// Name is the label written after the CHARSET keyword.
	Name string
	// From and To are the first and last column of the range, written as
	// "From-To"; fas_count starts numbering at 1 and treats To as inclusive.
	From, To int
}
// fas_sum parses every path in file_input with fas_parser and returns the
// results in the same order as the inputs. A progress line carrying the
// 1-based position and the path is printed after each file is parsed.
func fas_sum() []dna {
	parsed := make([]dna, 0, len(file_input))
	for idx := 0; idx < len(file_input); idx++ {
		path := file_input[idx]
		parsed = append(parsed, fas_parser(path))
		fmt.Println("[ working A ]", idx+1, path)
	}
	return parsed
}
// fas_count lays the parsed files end to end and returns one charset per
// file, in input order. The first range starts at column 1 and each later
// range starts one past the previous To; a file's range spans its count
// columns. Each range is printed as it is computed, and the full list is
// printed once at the end.
func fas_count(sum_nex []dna) []charset {
	sets := []charset{}
	start := 1
	for _, file := range sum_nex {
		label := fas_name(file.name)
		end := start + file.count - 1
		fmt.Println("[ working B ]", label, start, end)
		sets = append(sets, charset{Name: label, From: start, To: end})
		// The next file begins right after this one.
		start = end + 1
	}
	fmt.Println(sets)
	return sets
}
// fas_name_re captures the run of word characters that directly precedes a
// ".<word>" suffix. It is compiled once here instead of on every call.
var fas_name_re = regexp.MustCompile(`(\w+)\.\w+`)

// fas_name derives a short label from an input file name by taking the word
// in front of the first ".<word>" suffix found in it: "16S.fas" gives "16S"
// and "./data/CO1.fas" gives "CO1". Only the leftmost match is used, so
// "./data/CO1.mega.fas" also gives "CO1".
//
// When the name contains no such suffix (for example "noext" or the empty
// string) it is returned unchanged; indexing the nil match result would
// otherwise panic.
func fas_name(old_name string) string {
	match := fas_name_re.FindStringSubmatch(old_name)
	if match == nil {
		return old_name
	}
	// match[0] is the whole match, match[1] the single capture group.
	return match[1]
}

20
flag.go Normal file
View file

@ -0,0 +1,20 @@
package main
import (
"flag"
"fmt"
)
// file_output is the path of the file to write; dna_flag fills it from the
// -o command-line flag.
var file_output string

// file_input holds the input file paths; dna_flag fills it with the
// positional arguments left over after flag parsing.
var file_input []string
// dna_flag reads the command line into the package-level settings: the -o
// flag (default "a.nex") goes to file_output and every remaining positional
// argument goes to file_input. It then prints a separator line followed by
// the chosen inputs and output.
func dna_flag() {
	const default_output = "a.nex"
	flag.StringVar(&file_output, "o", default_output, "files name wait to out")
	flag.Parse()

	// Whatever is left after the flags is the list of input files.
	file_input = flag.Args()

	fmt.Printf("==============\n[input file:] %v\n[output file:] %v\n", file_input, file_output)
}

33
gocomb.go Normal file
View file

@ -0,0 +1,33 @@
package main
import (
"strings"
)
// tmpl_data is the value handed to the output template, which reads its
// fields as .Ntax, .Nchar, .Matrix and .Charset.
type tmpl_data struct {
	// Ntax and Nchar fill the NTAX= and NCHAR= entries of the DIMENSIONS line.
	Ntax, Nchar int
	// Matrix maps a taxon name to its full concatenated sequence; the
	// template emits one quoted name plus sequence per entry.
	Matrix map[string]string
	// Charset lists the column range of each input file, one CHARSET line each.
	Charset []charset
}
// main runs the whole pipeline: read the command line, parse every input
// file, compute each file's column range, merge the sequences per taxon,
// and hand the result to the template writer.
func main() {
	dna_flag()

	parsed := fas_sum()
	sets := fas_count(parsed)
	columns, ntax, nchar := fas_mix(parsed, sets)

	// Glue each taxon's per-file pieces into one continuous sequence.
	matrix := make(map[string]string, ntax)
	for taxon, pieces := range columns {
		matrix[taxon] = strings.Join(pieces, "")
	}

	// Data passed to the template.
	do_impl(tmpl_data{Ntax: ntax, Nchar: nchar, Matrix: matrix, Charset: sets})
}

33
mix.go Normal file
View file

@ -0,0 +1,33 @@
package main
import "strings"
// fas_mix merges the per-file sequences into one table keyed by taxon name.
//
// sum_nex holds one parsed input file per element and sum_charset the column
// range computed for each of those files. The returned map has one entry per
// taxon seen in any file; entry[i] is that taxon's own sequence from
// sum_nex[i], or a run of sum_nex[i].count '?' characters when the taxon is
// missing from that file. The two ints are the number of distinct taxa and
// the total column count (the To of the last charset, or 0 when there are no
// charsets).
func fas_mix(sum_nex []dna, sum_charset []charset) (map[string][]string, int, int) {
	// With no charsets there is no "last" element to read; report zero
	// columns instead of panicking on the index.
	nchar := 0
	if len(sum_charset) > 0 {
		nchar = sum_charset[len(sum_charset)-1].To
	}

	// First pass: one row per distinct taxon, one slot per input file.
	sum_dna := make(map[string][]string)
	for _, file := range sum_nex {
		for taxon := range file.min {
			if _, seen := sum_dna[taxon]; !seen {
				sum_dna[taxon] = make([]string, len(sum_nex))
			}
		}
	}

	// Second pass: fill every slot. Each taxon must receive its OWN sequence
	// looked up by name; copying whichever value a range over file.min
	// happens to yield would give every taxon the same, randomly chosen
	// sequence.
	for idx, file := range sum_nex {
		for taxon, row := range sum_dna {
			if seq, ok := file.min[taxon]; ok {
				row[idx] = seq
			} else {
				row[idx] = strings.Repeat("?", file.count)
			}
		}
	}

	return sum_dna, len(sum_dna), nchar
}

View file

@ -1,20 +1,15 @@
package fas_parser #NEXUS
const Nex_tmpl = `#NEXUS
BEGIN DATA; BEGIN DATA;
DIMENSIONS NTAX={{ .Ntax }} NCHAR={{ .Nchar }}; DIMENSIONS NTAX={{ .Ntax }} NCHAR={{ .Nchar }};
FORMAT DATATYPE=DNA GAP=- MISSING=?; FORMAT DATATYPE=DNA GAP=- MISSING=?;
MATRIX MATRIX
{{- range $k, $v := .Matrix }} {{- range $k, $v := .Matrix }}
'{{ $k }}' {{ $v }}{{ end }} '{{ $k }}' {{ $v }}
{{- end }}
; ;
END; END;
BEGIN SETS; BEGIN SETS;
{{- range $_, $i := .Charset }} {{- range $_, $i := .Charset }}
CHARSET {{ $i.Name }} = {{ $i.From }}-{{ $i.To }}; CHARSET {{ $i.Name }} = {{ $i.From }}-{{ $i.To }};
{{- end }} {{- end }}
END; END;
`
// 最后那个 $i 好像有问题
// {{/* $k| printf "%-40s" */}}

172
parser.go
View file

@ -1,119 +1,53 @@
package main package main
import ( import (
"flag" "fmt"
"fmt" "io/ioutil"
fas_parser "gocomb/src" "strings"
"os" )
"strings"
"text/template" type dna struct {
) name string
count int
type dna struct { min map[string]string
name string }
min_dna map[string]string
count int // 读取fas文件
} func fas_parser(file_name string) dna {
type charset struct { // 读取文件
Name string f, err := ioutil.ReadFile("./" + file_name)
From int if err != nil {
To int fmt.Println(err)
} return dna{"", 0, nil}
}
type tmpl_data struct {
Ntax int count := 0
Nchar int i := 0 // acgt行计数
Matrix map[string]string j := -1 // 标题行计数
Charset []charset seq := make(map[string]string)
} indid := ""
func main() { for k, v := range f {
switch v {
// 读取命令行,这里一定要是指针 case '>':
file_export := flag.String("o", "a.nex", "files name wait to out") j = k + 1
flag.Parse() case '\n':
file_names := flag.Args() // []string{"foo", "bar"} if j != -1 {
fmt.Println("[ export here ]", *file_export) indid = string(f[j:k])
i = k + 1
// 遍历文件得到基本数据 j = -1
sum_nex := make([]dna, 0, 5) continue
for k, v := range file_names { }
i, j := fas_parser.Fas_parser(v) seq[indid] = seq[indid] + strings.ToLower(string(f[i:k]))
new_nex := dna{v, i, j} i = k + 1
sum_nex = append(sum_nex, new_nex) }
fmt.Println("[ working A ]", k+1, v) }
} count = len(seq[indid])
// for k1, v1 := range seq {
// 整合若干文件的统计 // fmt.Println(k1)
sum_charset := []charset{} // fmt.Println(v1)
for k, v := range sum_nex { // }
n := v.name // fmt.Println(count)
f := 1 return dna{file_name, count, seq}
if k != 0 { }
f = sum_charset[k-1].To + 1
}
t := f + v.count - 1
fmt.Println("[ working B ]", n, f, t)
new_charset := charset{n, f, t}
sum_charset = append(sum_charset, new_charset)
}
// fmt.Println(sum_charset)
// dna 的整合
ntax := 0
nchar := sum_charset[len(sum_charset)-1].To
sum_dna := make(map[string][]string)
for _, v := range sum_nex {
for k1 := range v.min_dna {
_, has := sum_dna[k1]
if !has {
sum_dna[k1] = make([]string, len(sum_charset))
ntax ++
}
}
}
for k, v := range sum_nex {
for _, v1 := range v.min_dna {
for k2 := range sum_dna {
if _, ok := v.min_dna[k2]; ok {
sum_dna[k2][k] = v1
} else {
sum_dna[k2][k] = strings.Repeat("?", v.count)
}
}
}
}
// fmt.Println(sum_dna)
matrix := make(map[string]string, ntax)
for k := range sum_dna {
matrix[k] = strings.Join(sum_dna[k], "")
}
// 准备发射到模板的数据
last_data := tmpl_data{ntax, nchar, matrix, sum_charset}
// fmt.Println(last_data)
// 读取模板
nex_tmpl, err := template.New("nex").Parse(fas_parser.Nex_tmpl)
if err != nil {
fmt.Println("[ tmpl err ]", err)
return
}
// 覆盖创建要写入的 nex 文件
new_file, err := os.OpenFile(*file_export, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0666)
if err != nil {
fmt.Println("[ create or open file error ]", err)
return
}
defer new_file.Close()
// 写入 nex 模板
err = nex_tmpl.Execute(new_file, last_data)
if err != nil {
fmt.Println("[ err at tmpl exec ]", err)
return
}
}

View file

@ -1,53 +0,0 @@
package fas_parser
import(
"io/ioutil"
"fmt"
)
// Fas_parser reads "./"+file_name and splits it, byte by byte, into named
// sections of sequence text.
//
// It returns a map from section name to the concatenation of that section's
// sequence runs, plus a count that accumulates run lengths only while the map
// holds fewer than two sections. If the file cannot be read the error is
// printed and (nil, 0) is returned.
//
// Only the lowercase bytes a, c, g, t and '-' are treated as sequence data;
// every other byte except '\n' (uppercase letters and '\r' included) takes
// the default branch. A run is stored only when a '\n' is reached, so text
// after the final newline is not added to the result.
// NOTE(review): presumably the input is FASTA with '>' header lines; the code
// itself never checks for '>' specifically - confirm against real inputs.
func Fas_parser(file_name string) (map[string]string, int) {
f, err := ioutil.ReadFile("./" + file_name)
if err != nil {
fmt.Println(err)
return nil, 0
}
// fmt.Println(f)
count := 0
i := 0 // index where the current sequence run started; 0 means no run is open
j := 0 // index just past the first non-sequence byte of the current line; 0 means none seen
seq := make(map[string]string)
section := ""
// fmt.Println('a', 'c', 'g', 't', '-', '\n', '\r')
for k, v := range f {
switch v {
case 'a', 'c', 'g', 't', '-':
// While a non-sequence line is in progress these bytes belong to its text.
if j != 0 {
continue
}
// Otherwise remember where the run began (only for its first byte).
if i == 0 {
i = k
}
case '\n':
// End of a line that carried a sequence run: append it to the current section.
if i != 0 {
seq[section] = seq[section] + string(f[i:k])
// Length is tallied only while at most one section exists and no
// non-sequence byte was seen since the last reset of j.
if len(seq) < 2 && j == 0 {
count += k - i
}
i = 0
continue
}
// End of a line without a run: the text from j up to the newline becomes
// the name of the section that following runs are stored under.
section = string(f[j:k])
j = 0
default:
// First non-sequence byte since j was reset: the name text starts right after it.
if j == 0 {
j = k + 1
}
}
}
// for k1, v1 := range seq {
// fmt.Println(k1)
// fmt.Println(v1)
// }
// fmt.Println(count)
return seq, count
}

39
tmpl.go Normal file
View file

@ -0,0 +1,39 @@
package main
import (
"fmt"
"io/ioutil"
"os"
"text/template"
)
// do_impl renders last_data through the template stored in "nex.tmpl" and
// writes the result to file_output, creating the file or truncating an
// existing one. Every failure (reading the template, parsing it, opening the
// output, executing the template) is printed and ends the function early;
// nothing is returned to the caller.
func do_impl(last_data tmpl_data) {
	src, readErr := ioutil.ReadFile("nex.tmpl")
	if readErr != nil {
		fmt.Println(readErr)
		return
	}

	// Parse the template text.
	tmpl, parseErr := template.New("nex").Parse(string(src))
	if parseErr != nil {
		fmt.Println("[ tmpl err ]", parseErr)
		return
	}

	// Create the output file, overwriting any previous content.
	out, openErr := os.OpenFile(file_output, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0666)
	if openErr != nil {
		fmt.Println("[ create or open file error ]", openErr)
		return
	}
	defer out.Close()

	// Render straight into the file.
	if execErr := tmpl.Execute(out, last_data); execErr != nil {
		fmt.Println("[ err at tmpl exec ]", execErr)
	}
}