From e53203768f5fff8fbe98347693c108e823b373c2 Mon Sep 17 00:00:00 2001 From: qaqland <62464571+qaqland@users.noreply.github.com> Date: Thu, 13 Jan 2022 20:12:32 +0800 Subject: [PATCH] failed --- a.nex | 15 ++++ count.go | 38 ++++++++ flag.go | 20 +++++ gocomb.go | 33 +++++++ mix.go | 32 +++++++ src/nex_tmpl.go => nex.tmpl | 10 +-- parser.go | 171 +++++++++++------------------------- src/fas_parser.go | 53 ----------- tmpl.go | 39 ++++++++ 9 files changed, 231 insertions(+), 180 deletions(-) create mode 100644 a.nex create mode 100644 count.go create mode 100644 flag.go create mode 100644 gocomb.go create mode 100644 mix.go rename src/nex_tmpl.go => nex.tmpl (70%) delete mode 100644 src/fas_parser.go create mode 100644 tmpl.go diff --git a/a.nex b/a.nex new file mode 100644 index 0000000..1419f14 --- /dev/null +++ b/a.nex @@ -0,0 +1,15 @@ +#NEXUS +BEGIN DATA; + DIMENSIONS NTAX=4 NCHAR=6; + FORMAT DATATYPE=DNA GAP=- MISSING=?; +MATRIX +'>Bradybaena circulus circulus' -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------aactatatg-gtatcatatatagctatttctgctcaatg-taatataaatagccgcagtactctgactgtgctaaggtagcataatcatttggcttataattgaagtctagtatgaaagaagatatgggagttaactgtttcctaaacgtttacttaatttacttagggggtgaaaatacccccacaaacataatagacgagaagacccttgaaatttttagtata---attttaaatcgtgctttttgttggggcgacaaggtagcatagtaaacctactaagtggttttattagaacaaaattgtatgaataattaaattactcaagggataacagcataatattttaaagtttgtgacctcgatgttggactaggacaatatagtttaaaagactattatttttgctctgttcg---------------------tatatattatttggtgtttggtgtgggatagttggtacaggtttaagattattgattcgaatagagttaggtagttctggtgttatagcagaagagcatttctacaatgttattgtaacagctcatgcttttgtaataattttttttatagttatgccaatcatgattggaggttttggaaattgaatagtaccgttgttgattggggcacccgatatgagctttccacgtataaacaatataaggttttgattgttacccccttcttttcttctattaattagaagtagtctagtagaaggcggtgcagggaccggttgaacagtgtatcctccacttagctcacttgtaggtcataggagagctgccgtagacttagcaatcttttctcttcatttggctgggatatcatcaattttaggtgcaatcaattttattacaactatttttaatatacgagccccaggaataactatggaacgtgttagactgtttgtttgatctattttagtgacagtgtttcttttattact----------------------------------------------------------------------------- +'>Bradybaena phaeogramma phaeogramma' -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------aactatatg-gtatcatatatagctatttctgctcaatg-taatataaatagccgcagtactctgactgtgctaaggtagcataatcatttggcttataattgaagtctagtatgaaagaagatatgggagttaactgtttcctaaacgtttacttaatttacttagggggtgaaaatacccccacaaacataatagacgagaagacccttgaaatttttagtata---attttaaatcgtgctttttgttggggcgacaaggtagcatagtaaacctactaagtggttttattagaacaaaattgtatgaataattaaattactcaagggataacagcataatattttaaagtttgtgacctcgatgttggactaggacaatatagtttaaaagactattatttttgctctgttcg---------------------tatatattatttggtgtttggtgtgggatagttggtacaggtttaagattattgattcgaatagagttaggtagttctggtgttatagcagaagagcatttctacaatgttattgtaacagctcatgcttttgtaataattttttttatagttatgccaatcatgattggaggttttggaaattgaatagtaccgttgttgattggggcacccgatatgagctttccacgtataaacaatataaggttttgattgttacccccttcttttcttctattaattagaagtagtctagtagaaggcggtgcagggaccggttgaacagtgtatcctccacttagctcacttgtaggtcataggagagctgccgtagacttagcaatcttttctcttcatttggctgggatatcatcaattttaggtgcaatcaattttattacaactatttttaatatacgagccccaggaataactatggaacgtgttagactgtttgtttgatctattttagtgacagtgtttcttttattact----------------------------------------------------------------------------- +'>Bradybaena similaris' -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------aactatatg-gtatcatatatagctatttctgctcaatg-taatataaatagccgcagtactctgactgtgctaaggtagcataatcatttggcttataattgaagtctagtatgaaagaagatatgggagttaactgtttcctaaacgtttacttaatttacttagggggtgaaaatacccccacaaacataatagacgagaagacccttgaaatttttagtata---attttaaatcgtgctttttgttggggcgacaaggtagcatagtaaacctactaagtggttttattagaacaaaattgtatgaataattaaattactcaagggataacagcataatattttaaagtttgtgacctcgatgttggactaggacaatatagtttaaaagactattatttttgctctgttcg---------------------??? +'>Bradybaena virgo virgo' ???tatatattatttggtgtttggtgtgggatagttggtacaggtttaagattattgattcgaatagagttaggtagttctggtgttatagcagaagagcatttctacaatgttattgtaacagctcatgcttttgtaataattttttttatagttatgccaatcatgattggaggttttggaaattgaatagtaccgttgttgattggggcacccgatatgagctttccacgtataaacaatataaggttttgattgttacccccttcttttcttctattaattagaagtagtctagtagaaggcggtgcagggaccggttgaacagtgtatcctccacttagctcacttgtaggtcataggagagctgccgtagacttagcaatcttttctcttcatttggctgggatatcatcaattttaggtgcaatcaattttattacaactatttttaatatacgagccccaggaataactatggaacgtgttagactgtttgtttgatctattttagtgacagtgtttcttttattact----------------------------------------------------------------------------- +; +END; +BEGIN SETS; + CHARSET ./data/16S.fas = 1-3; + CHARSET ./data/CO1.fas = 4-6; +END; \ No newline at end of file diff --git a/count.go b/count.go new file mode 100644 index 0000000..739e041 --- /dev/null +++ b/count.go @@ -0,0 +1,38 @@ +package main + +import "fmt" + +type charset struct { + Name string + From int + To int +} + +// 遍历文件得到基本数据 +func fas_sum() []dna { + sum := []dna{} + for i, f := range file_input { + sum = append(sum, fas_parser(f)) + fmt.Println("[ working A ]", i+1, f) + } + return sum +} + +// 整合若干文件的统计 +func fas_count() []charset { + fas_charset := []charset{} + sum_nex := fas_sum() + for k, v := range sum_nex { + n := v.name + f := 1 + if k != 0 { + f = fas_charset[k-1].To + 1 + } + t := f + v.count - 1 + fmt.Println("[ working B ]", n, f, t) + new_charset := charset{n, f, t} + fas_charset = append(fas_charset, new_charset) + } + // fmt.Println(sum_charset) + return fas_charset +} diff --git a/flag.go b/flag.go new file mode 100644 index 0000000..22dff76 --- /dev/null +++ b/flag.go @@ -0,0 +1,20 @@ +package main + +import ( + "flag" + "fmt" +) + +var ( + file_output string + file_input []string +) + +func dna_flag() { + flag.StringVar(&file_output, "o", "a.nex", "files name wait to out") + flag.Parse() + file_input = flag.Args() // []string{"foo", "bar"} + fmt.Println("==============") + fmt.Println("[input file:]", file_input) + fmt.Println("[output file:]", file_output) +} diff --git a/gocomb.go b/gocomb.go new file mode 100644 index 0000000..d7c7b52 --- /dev/null +++ b/gocomb.go @@ -0,0 +1,33 @@ +package main + +import ( + "strings" +) + +type tmpl_data struct { + Ntax int + Nchar int + Matrix map[string]string + Charset []charset +} + +func main() { + + dna_flag() + + sum_nex := fas_sum() + + sum_charset := fas_count() + + sum_dna, ntax, nchar := dna_mix(sum_nex, sum_charset) + + matrix := make(map[string]string, ntax) + for k := range sum_dna { + matrix[k] = strings.Join(sum_dna[k], "") + } + + // 准备发射到模板的数据 + last_data := tmpl_data{ntax, nchar, matrix, sum_charset} + // fmt.Println(last_data) + do_impl(last_data) +} diff --git a/mix.go b/mix.go new file mode 100644 index 0000000..303e8cd --- /dev/null +++ b/mix.go @@ -0,0 +1,32 @@ +package main + +import "strings" + +func dna_mix(sum_nex []dna, sum_charset []charset) (map[string][]string, int, int) { + // dna 的整合 + ntax := 0 + nchar := sum_charset[len(sum_charset)-1].To + sum_dna := make(map[string][]string) + for _, v := range sum_nex { + for k1 := range v.min_dna { + _, has := sum_dna[k1] + if !has { + sum_dna[k1] = make([]string, len(sum_charset)) + ntax++ + } + } + } + for k, v := range sum_nex { + for _, v1 := range v.min_dna { + for k2 := range sum_dna { + if _, ok := v.min_dna[k2]; ok { + sum_dna[k2][k] = v1 + } else { + sum_dna[k2][k] = strings.Repeat("?", v.count) + } + } + } + } + // fmt.Println(sum_dna) + return sum_dna, ntax, nchar +} diff --git a/src/nex_tmpl.go b/nex.tmpl similarity index 70% rename from src/nex_tmpl.go rename to nex.tmpl index 2891241..799bfea 100644 --- a/src/nex_tmpl.go +++ b/nex.tmpl @@ -1,6 +1,4 @@ -package fas_parser - -const Nex_tmpl = `#NEXUS +#NEXUS BEGIN DATA; DIMENSIONS NTAX={{ .Ntax }} NCHAR={{ .Nchar }}; FORMAT DATATYPE=DNA GAP=- MISSING=?; @@ -13,8 +11,4 @@ BEGIN SETS; {{- range $_, $i := .Charset }} CHARSET {{ $i.Name }} = {{ $i.From }}-{{ $i.To }}; {{- end }} -END; -` - -// 最后那个 $i 好像有问题 -// {{/* $k| printf "%-40s" */}} +END; \ No newline at end of file diff --git a/parser.go b/parser.go index a3f40e0..143d6e0 100644 --- a/parser.go +++ b/parser.go @@ -1,119 +1,52 @@ -package main - -import ( - "flag" - "fmt" - fas_parser "gocomb/src" - "os" - "strings" - "text/template" -) - -type dna struct { - name string - min_dna map[string]string - count int -} - -type charset struct { - Name string - From int - To int -} - -type tmpl_data struct { - Ntax int - Nchar int - Matrix map[string]string - Charset []charset -} - -func main() { - - // 读取命令行,这里一定要是指针 - file_export := flag.String("o", "a.nex", "files name wait to out") - flag.Parse() - file_names := flag.Args() // []string{"foo", "bar"} - fmt.Println("[ export here ]", *file_export) - - // 遍历文件得到基本数据 - sum_nex := make([]dna, 0, 5) - for k, v := range file_names { - i, j := fas_parser.Fas_parser(v) - new_nex := dna{v, i, j} - sum_nex = append(sum_nex, new_nex) - fmt.Println("[ working A ]", k+1, v) - } - - // 整合若干文件的统计 - sum_charset := []charset{} - for k, v := range sum_nex { - n := v.name - f := 1 - if k != 0 { - f = sum_charset[k-1].To + 1 - } - t := f + v.count - 1 - fmt.Println("[ working B ]", n, f, t) - new_charset := charset{n, f, t} - sum_charset = append(sum_charset, new_charset) - } - // fmt.Println(sum_charset) - - // dna 的整合 - ntax := 0 - nchar := sum_charset[len(sum_charset)-1].To - sum_dna := make(map[string][]string) - for _, v := range sum_nex { - for k1 := range v.min_dna { - _, has := sum_dna[k1] - if !has { - sum_dna[k1] = make([]string, len(sum_charset)) - ntax ++ - } - } - } - for k, v := range sum_nex { - for _, v1 := range v.min_dna { - for k2 := range sum_dna { - if _, ok := v.min_dna[k2]; ok { - sum_dna[k2][k] = v1 - } else { - sum_dna[k2][k] = strings.Repeat("?", v.count) - } - } - } - } - // fmt.Println(sum_dna) - - matrix := make(map[string]string, ntax) - for k := range sum_dna { - matrix[k] = strings.Join(sum_dna[k], "") - } - - // 准备发射到模板的数据 - last_data := tmpl_data{ntax, nchar, matrix, sum_charset} - // fmt.Println(last_data) - - // 读取模板 - nex_tmpl, err := template.New("nex").Parse(fas_parser.Nex_tmpl) - if err != nil { - fmt.Println("[ tmpl err ]", err) - return - } - - // 覆盖创建要写入的 nex 文件 - new_file, err := os.OpenFile(*file_export, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0666) - if err != nil { - fmt.Println("[ create or open file error ]", err) - return - } - defer new_file.Close() - - // 写入 nex 模板 - err = nex_tmpl.Execute(new_file, last_data) - if err != nil { - fmt.Println("[ err at tmpl exec ]", err) - return - } -} +package main + +import ( + "fmt" + "io/ioutil" +) + +type dna struct { + name string + count int + min_dna map[string]string +} + +// 读取fas文件 +func fas_parser(file_name string) dna { + + // 读取文件 + f, err := ioutil.ReadFile("./" + file_name) + if err != nil { + fmt.Println(err) + return dna{"", 0, nil} + } + + count := 0 + i := 0 // acgt行计数 + j := -1 // 标题行计数 + seq := make(map[string]string) + section := "" + + for k, v := range f { + switch v { + case '>': + j = k + count++ + case '\n': + if j != -1 { + section = string(f[j:k]) + i = k + 1 + j = -1 + continue + } + seq[section] = seq[section] + string(f[i:k]) + i = k + 1 + } + } + // for k1, v1 := range seq { + // fmt.Println(k1) + // fmt.Println(v1) + // } + // fmt.Println(count) + return dna{file_name, count, seq} +} diff --git a/src/fas_parser.go b/src/fas_parser.go deleted file mode 100644 index fb055f5..0000000 --- a/src/fas_parser.go +++ /dev/null @@ -1,53 +0,0 @@ -package fas_parser - -import( - "io/ioutil" - "fmt" -) - -func Fas_parser(file_name string) (map[string]string, int) { - f, err := ioutil.ReadFile("./" + file_name) - if err != nil { - fmt.Println(err) - return nil, 0 - } - // fmt.Println(f) - count := 0 - i := 0 // DNA行计数 - j := 0 // 非序列行计数 - seq := make(map[string]string) - section := "" - // fmt.Println('a', 'c', 'g', 't', '-', '\n', '\r') - for k, v := range f { - switch v { - case 'a', 'c', 'g', 't', '-': - if j != 0 { - continue - } - if i == 0 { - i = k - } - case '\n': - if i != 0 { - seq[section] = seq[section] + string(f[i:k]) - if len(seq) < 2 && j == 0 { - count += k - i - } - i = 0 - continue - } - section = string(f[j:k]) - j = 0 - default: - if j == 0 { - j = k + 1 - } - } - } - // for k1, v1 := range seq { - // fmt.Println(k1) - // fmt.Println(v1) - // } - // fmt.Println(count) - return seq, count -} diff --git a/tmpl.go b/tmpl.go new file mode 100644 index 0000000..ef86bc2 --- /dev/null +++ b/tmpl.go @@ -0,0 +1,39 @@ +package main + +import ( + "fmt" + "io/ioutil" + "os" + "text/template" +) + +func do_impl(last_data tmpl_data) { + + f, err := ioutil.ReadFile("nex.tmpl") + if err != nil { + fmt.Println(err) + return + } + + // 读取模板 + nex_tmpl, err := template.New("nex").Parse(string(f)) + if err != nil { + fmt.Println("[ tmpl err ]", err) + return + } + + // 覆盖创建要写入的 nex 文件 + new_file, err := os.OpenFile(file_output, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0666) + if err != nil { + fmt.Println("[ create or open file error ]", err) + return + } + defer new_file.Close() + + // 写入 nex 模板 + err = nex_tmpl.Execute(new_file, last_data) + if err != nil { + fmt.Println("[ err at tmpl exec ]", err) + return + } +}