From e53203768f5fff8fbe98347693c108e823b373c2 Mon Sep 17 00:00:00 2001 From: qaqland <62464571+qaqland@users.noreply.github.com> Date: Thu, 13 Jan 2022 20:12:32 +0800 Subject: [PATCH 01/11] failed --- a.nex | 15 ++++ count.go | 38 ++++++++ flag.go | 20 +++++ gocomb.go | 33 +++++++ mix.go | 32 +++++++ src/nex_tmpl.go => nex.tmpl | 10 +-- parser.go | 171 +++++++++++------------------------- src/fas_parser.go | 53 ----------- tmpl.go | 39 ++++++++ 9 files changed, 231 insertions(+), 180 deletions(-) create mode 100644 a.nex create mode 100644 count.go create mode 100644 flag.go create mode 100644 gocomb.go create mode 100644 mix.go rename src/nex_tmpl.go => nex.tmpl (70%) delete mode 100644 src/fas_parser.go create mode 100644 tmpl.go diff --git a/a.nex b/a.nex new file mode 100644 index 0000000..1419f14 --- /dev/null +++ b/a.nex @@ -0,0 +1,15 @@ +#NEXUS +BEGIN DATA; + DIMENSIONS NTAX=4 NCHAR=6; + FORMAT DATATYPE=DNA GAP=- MISSING=?; +MATRIX +'>Bradybaena circulus circulus' -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------aactatatg-gtatcatatatagctatttctgctcaatg-taatataaatagccgcagtactctgactgtgctaaggtagcataatcatttggcttataattgaagtctagtatgaaagaagatatgggagttaactgtttcctaaacgtttacttaatttacttagggggtgaaaatacccccacaaacataatagacgagaagacccttgaaatttttagtata---attttaaatcgtgctttttgttggggcgacaaggtagcatagtaaacctactaagtggttttattagaacaaaattgtatgaataattaaattactcaagggataacagcataatattttaaagtttgtgacctcgatgttggactaggacaatatagtttaaaagactattatttttgctctgttcg---------------------tatatattatttggtgtttggtgtgggatagttggtacaggtttaagattattgattcgaatagagttaggtagttctggtgttatagcagaagagcatttctacaatgttattgtaacagctcatgcttttgtaataattttttttatagttatgccaatcatgattggaggttttggaaattgaatagtaccgttgttgattggggcacccgatatgagctttccacgtataaacaatataaggttttgattgttacccccttcttttcttctattaattagaagtagtctagtagaaggcggtgcagggaccggttgaacagtgtatcctccacttagctcacttgtaggtcataggagagctgccgtagacttagcaatcttttctcttcatttggctgggatatcatcaattttaggtgcaatcaattttattacaactatttttaatatacgagccccaggaataactatggaacgtgttagactgtttgtttgatctattttagtgacagtgtttcttttattact----------------------------------------------------------------------------- +'>Bradybaena phaeogramma phaeogramma' -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------aactatatg-gtatcatatatagctatttctgctcaatg-taatataaatagccgcagtactctgactgtgctaaggtagcataatcatttggcttataattgaagtctagtatgaaagaagatatgggagttaactgtttcctaaacgtttacttaatttacttagggggtgaaaatacccccacaaacataatagacgagaagacccttgaaatttttagtata---attttaaatcgtgctttttgttggggcgacaaggtagcatagtaaacctactaagtggttttattagaacaaaattgtatgaataattaaattactcaagggataacagcataatattttaaagtttgtgacctcgatgttggactaggacaatatagtttaaaagactattatttttgctctgttcg---------------------tatatattatttggtgtttggtgtgggatagttggtacaggtttaagattattgattcgaatagagttaggtagttctggtgttatagcagaagagcatttctacaatgttattgtaacagctcatgcttttgtaataattttttttatagttatgccaatcatgattggaggttttggaaattgaatagtaccgttgttgattggggcacccgatatgagctttccacgtataaacaatataaggttttgattgttacccccttcttttcttctattaattagaagtagtctagtagaaggcggtgcagggaccggttgaacagtgtatcctccacttagctcacttgtaggtcataggagagctgccgtagacttagcaatcttttctcttcatttggctgggatatcatcaattttaggtgcaatcaattttattacaactatttttaatatacgagccccaggaataactatggaacgtgttagactgtttgtttgatctattttagtgacagtgtttcttttattact----------------------------------------------------------------------------- +'>Bradybaena similaris' -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------aactatatg-gtatcatatatagctatttctgctcaatg-taatataaatagccgcagtactctgactgtgctaaggtagcataatcatttggcttataattgaagtctagtatgaaagaagatatgggagttaactgtttcctaaacgtttacttaatttacttagggggtgaaaatacccccacaaacataatagacgagaagacccttgaaatttttagtata---attttaaatcgtgctttttgttggggcgacaaggtagcatagtaaacctactaagtggttttattagaacaaaattgtatgaataattaaattactcaagggataacagcataatattttaaagtttgtgacctcgatgttggactaggacaatatagtttaaaagactattatttttgctctgttcg---------------------??? +'>Bradybaena virgo virgo' ???tatatattatttggtgtttggtgtgggatagttggtacaggtttaagattattgattcgaatagagttaggtagttctggtgttatagcagaagagcatttctacaatgttattgtaacagctcatgcttttgtaataattttttttatagttatgccaatcatgattggaggttttggaaattgaatagtaccgttgttgattggggcacccgatatgagctttccacgtataaacaatataaggttttgattgttacccccttcttttcttctattaattagaagtagtctagtagaaggcggtgcagggaccggttgaacagtgtatcctccacttagctcacttgtaggtcataggagagctgccgtagacttagcaatcttttctcttcatttggctgggatatcatcaattttaggtgcaatcaattttattacaactatttttaatatacgagccccaggaataactatggaacgtgttagactgtttgtttgatctattttagtgacagtgtttcttttattact----------------------------------------------------------------------------- +; +END; +BEGIN SETS; + CHARSET ./data/16S.fas = 1-3; + CHARSET ./data/CO1.fas = 4-6; +END; \ No newline at end of file diff --git a/count.go b/count.go new file mode 100644 index 0000000..739e041 --- /dev/null +++ b/count.go @@ -0,0 +1,38 @@ +package main + +import "fmt" + +type charset struct { + Name string + From int + To int +} + +// 遍历文件得到基本数据 +func fas_sum() []dna { + sum := []dna{} + for i, f := range file_input { + sum = append(sum, fas_parser(f)) + fmt.Println("[ working A ]", i+1, f) + } + return sum +} + +// 整合若干文件的统计 +func fas_count() []charset { + fas_charset := []charset{} + sum_nex := fas_sum() + for k, v := range sum_nex { + n := v.name + f := 1 + if k != 0 { + f = fas_charset[k-1].To + 1 + } + t := f + v.count - 1 + fmt.Println("[ working B ]", n, f, t) + new_charset := charset{n, f, t} + fas_charset = append(fas_charset, new_charset) + } + // fmt.Println(sum_charset) + return fas_charset +} diff --git a/flag.go b/flag.go new file mode 100644 index 0000000..22dff76 --- /dev/null +++ b/flag.go @@ -0,0 +1,20 @@ +package main + +import ( + "flag" + "fmt" +) + +var ( + file_output string + file_input []string +) + +func dna_flag() { + flag.StringVar(&file_output, "o", "a.nex", "files name wait to out") + flag.Parse() + file_input = flag.Args() // []string{"foo", "bar"} + fmt.Println("==============") + fmt.Println("[input file:]", file_input) + fmt.Println("[output file:]", file_output) +} diff --git a/gocomb.go b/gocomb.go new file mode 100644 index 0000000..d7c7b52 --- /dev/null +++ b/gocomb.go @@ -0,0 +1,33 @@ +package main + +import ( + "strings" +) + +type tmpl_data struct { + Ntax int + Nchar int + Matrix map[string]string + Charset []charset +} + +func main() { + + dna_flag() + + sum_nex := fas_sum() + + sum_charset := fas_count() + + sum_dna, ntax, nchar := dna_mix(sum_nex, sum_charset) + + matrix := make(map[string]string, ntax) + for k := range sum_dna { + matrix[k] = strings.Join(sum_dna[k], "") + } + + // 准备发射到模板的数据 + last_data := tmpl_data{ntax, nchar, matrix, sum_charset} + // fmt.Println(last_data) + do_impl(last_data) +} diff --git a/mix.go b/mix.go new file mode 100644 index 0000000..303e8cd --- /dev/null +++ b/mix.go @@ -0,0 +1,32 @@ +package main + +import "strings" + +func dna_mix(sum_nex []dna, sum_charset []charset) (map[string][]string, int, int) { + // dna 的整合 + ntax := 0 + nchar := sum_charset[len(sum_charset)-1].To + sum_dna := make(map[string][]string) + for _, v := range sum_nex { + for k1 := range v.min_dna { + _, has := sum_dna[k1] + if !has { + sum_dna[k1] = make([]string, len(sum_charset)) + ntax++ + } + } + } + for k, v := range sum_nex { + for _, v1 := range v.min_dna { + for k2 := range sum_dna { + if _, ok := v.min_dna[k2]; ok { + sum_dna[k2][k] = v1 + } else { + sum_dna[k2][k] = strings.Repeat("?", v.count) + } + } + } + } + // fmt.Println(sum_dna) + return sum_dna, ntax, nchar +} diff --git a/src/nex_tmpl.go b/nex.tmpl similarity index 70% rename from src/nex_tmpl.go rename to nex.tmpl index 2891241..799bfea 100644 --- a/src/nex_tmpl.go +++ b/nex.tmpl @@ -1,6 +1,4 @@ -package fas_parser - -const Nex_tmpl = `#NEXUS +#NEXUS BEGIN DATA; DIMENSIONS NTAX={{ .Ntax }} NCHAR={{ .Nchar }}; FORMAT DATATYPE=DNA GAP=- MISSING=?; @@ -13,8 +11,4 @@ BEGIN SETS; {{- range $_, $i := .Charset }} CHARSET {{ $i.Name }} = {{ $i.From }}-{{ $i.To }}; {{- end }} -END; -` - -// 最后那个 $i 好像有问题 -// {{/* $k| printf "%-40s" */}} +END; \ No newline at end of file diff --git a/parser.go b/parser.go index a3f40e0..143d6e0 100644 --- a/parser.go +++ b/parser.go @@ -1,119 +1,52 @@ -package main - -import ( - "flag" - "fmt" - fas_parser "gocomb/src" - "os" - "strings" - "text/template" -) - -type dna struct { - name string - min_dna map[string]string - count int -} - -type charset struct { - Name string - From int - To int -} - -type tmpl_data struct { - Ntax int - Nchar int - Matrix map[string]string - Charset []charset -} - -func main() { - - // 读取命令行,这里一定要是指针 - file_export := flag.String("o", "a.nex", "files name wait to out") - flag.Parse() - file_names := flag.Args() // []string{"foo", "bar"} - fmt.Println("[ export here ]", *file_export) - - // 遍历文件得到基本数据 - sum_nex := make([]dna, 0, 5) - for k, v := range file_names { - i, j := fas_parser.Fas_parser(v) - new_nex := dna{v, i, j} - sum_nex = append(sum_nex, new_nex) - fmt.Println("[ working A ]", k+1, v) - } - - // 整合若干文件的统计 - sum_charset := []charset{} - for k, v := range sum_nex { - n := v.name - f := 1 - if k != 0 { - f = sum_charset[k-1].To + 1 - } - t := f + v.count - 1 - fmt.Println("[ working B ]", n, f, t) - new_charset := charset{n, f, t} - sum_charset = append(sum_charset, new_charset) - } - // fmt.Println(sum_charset) - - // dna 的整合 - ntax := 0 - nchar := sum_charset[len(sum_charset)-1].To - sum_dna := make(map[string][]string) - for _, v := range sum_nex { - for k1 := range v.min_dna { - _, has := sum_dna[k1] - if !has { - sum_dna[k1] = make([]string, len(sum_charset)) - ntax ++ - } - } - } - for k, v := range sum_nex { - for _, v1 := range v.min_dna { - for k2 := range sum_dna { - if _, ok := v.min_dna[k2]; ok { - sum_dna[k2][k] = v1 - } else { - sum_dna[k2][k] = strings.Repeat("?", v.count) - } - } - } - } - // fmt.Println(sum_dna) - - matrix := make(map[string]string, ntax) - for k := range sum_dna { - matrix[k] = strings.Join(sum_dna[k], "") - } - - // 准备发射到模板的数据 - last_data := tmpl_data{ntax, nchar, matrix, sum_charset} - // fmt.Println(last_data) - - // 读取模板 - nex_tmpl, err := template.New("nex").Parse(fas_parser.Nex_tmpl) - if err != nil { - fmt.Println("[ tmpl err ]", err) - return - } - - // 覆盖创建要写入的 nex 文件 - new_file, err := os.OpenFile(*file_export, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0666) - if err != nil { - fmt.Println("[ create or open file error ]", err) - return - } - defer new_file.Close() - - // 写入 nex 模板 - err = nex_tmpl.Execute(new_file, last_data) - if err != nil { - fmt.Println("[ err at tmpl exec ]", err) - return - } -} +package main + +import ( + "fmt" + "io/ioutil" +) + +type dna struct { + name string + count int + min_dna map[string]string +} + +// 读取fas文件 +func fas_parser(file_name string) dna { + + // 读取文件 + f, err := ioutil.ReadFile("./" + file_name) + if err != nil { + fmt.Println(err) + return dna{"", 0, nil} + } + + count := 0 + i := 0 // acgt行计数 + j := -1 // 标题行计数 + seq := make(map[string]string) + section := "" + + for k, v := range f { + switch v { + case '>': + j = k + count++ + case '\n': + if j != -1 { + section = string(f[j:k]) + i = k + 1 + j = -1 + continue + } + seq[section] = seq[section] + string(f[i:k]) + i = k + 1 + } + } + // for k1, v1 := range seq { + // fmt.Println(k1) + // fmt.Println(v1) + // } + // fmt.Println(count) + return dna{file_name, count, seq} +} diff --git a/src/fas_parser.go b/src/fas_parser.go deleted file mode 100644 index fb055f5..0000000 --- a/src/fas_parser.go +++ /dev/null @@ -1,53 +0,0 @@ -package fas_parser - -import( - "io/ioutil" - "fmt" -) - -func Fas_parser(file_name string) (map[string]string, int) { - f, err := ioutil.ReadFile("./" + file_name) - if err != nil { - fmt.Println(err) - return nil, 0 - } - // fmt.Println(f) - count := 0 - i := 0 // DNA行计数 - j := 0 // 非序列行计数 - seq := make(map[string]string) - section := "" - // fmt.Println('a', 'c', 'g', 't', '-', '\n', '\r') - for k, v := range f { - switch v { - case 'a', 'c', 'g', 't', '-': - if j != 0 { - continue - } - if i == 0 { - i = k - } - case '\n': - if i != 0 { - seq[section] = seq[section] + string(f[i:k]) - if len(seq) < 2 && j == 0 { - count += k - i - } - i = 0 - continue - } - section = string(f[j:k]) - j = 0 - default: - if j == 0 { - j = k + 1 - } - } - } - // for k1, v1 := range seq { - // fmt.Println(k1) - // fmt.Println(v1) - // } - // fmt.Println(count) - return seq, count -} diff --git a/tmpl.go b/tmpl.go new file mode 100644 index 0000000..ef86bc2 --- /dev/null +++ b/tmpl.go @@ -0,0 +1,39 @@ +package main + +import ( + "fmt" + "io/ioutil" + "os" + "text/template" +) + +func do_impl(last_data tmpl_data) { + + f, err := ioutil.ReadFile("nex.tmpl") + if err != nil { + fmt.Println(err) + return + } + + // 读取模板 + nex_tmpl, err := template.New("nex").Parse(string(f)) + if err != nil { + fmt.Println("[ tmpl err ]", err) + return + } + + // 覆盖创建要写入的 nex 文件 + new_file, err := os.OpenFile(file_output, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0666) + if err != nil { + fmt.Println("[ create or open file error ]", err) + return + } + defer new_file.Close() + + // 写入 nex 模板 + err = nex_tmpl.Execute(new_file, last_data) + if err != nil { + fmt.Println("[ err at tmpl exec ]", err) + return + } +} From c0c60ce5a7796cddfca9e4a179965bb1bebbb505 Mon Sep 17 00:00:00 2001 From: qaqland <62464571+qaqland@users.noreply.github.com> Date: Fri, 14 Jan 2022 17:20:42 +0800 Subject: [PATCH 02/11] =?UTF-8?q?=E6=9A=82=E6=97=B6=E4=BF=AE=E5=A5=BD?= =?UTF-8?q?=E4=BA=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- a.nex | 15 ++++++++------- count.go | 5 ++--- gocomb.go | 4 ++-- mix.go | 9 +++++---- nex.tmpl | 3 ++- parser.go | 17 +++++++++-------- 6 files changed, 28 insertions(+), 25 deletions(-) diff --git a/a.nex b/a.nex index 1419f14..ad020cc 100644 --- a/a.nex +++ b/a.nex @@ -1,15 +1,16 @@ #NEXUS BEGIN DATA; - DIMENSIONS NTAX=4 NCHAR=6; + DIMENSIONS NTAX=4 NCHAR=2031; FORMAT DATATYPE=DNA GAP=- MISSING=?; MATRIX -'>Bradybaena circulus circulus' -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------aactatatg-gtatcatatatagctatttctgctcaatg-taatataaatagccgcagtactctgactgtgctaaggtagcataatcatttggcttataattgaagtctagtatgaaagaagatatgggagttaactgtttcctaaacgtttacttaatttacttagggggtgaaaatacccccacaaacataatagacgagaagacccttgaaatttttagtata---attttaaatcgtgctttttgttggggcgacaaggtagcatagtaaacctactaagtggttttattagaacaaaattgtatgaataattaaattactcaagggataacagcataatattttaaagtttgtgacctcgatgttggactaggacaatatagtttaaaagactattatttttgctctgttcg---------------------tatatattatttggtgtttggtgtgggatagttggtacaggtttaagattattgattcgaatagagttaggtagttctggtgttatagcagaagagcatttctacaatgttattgtaacagctcatgcttttgtaataattttttttatagttatgccaatcatgattggaggttttggaaattgaatagtaccgttgttgattggggcacccgatatgagctttccacgtataaacaatataaggttttgattgttacccccttcttttcttctattaattagaagtagtctagtagaaggcggtgcagggaccggttgaacagtgtatcctccacttagctcacttgtaggtcataggagagctgccgtagacttagcaatcttttctcttcatttggctgggatatcatcaattttaggtgcaatcaattttattacaactatttttaatatacgagccccaggaataactatggaacgtgttagactgtttgtttgatctattttagtgacagtgtttcttttattact----------------------------------------------------------------------------- -'>Bradybaena phaeogramma phaeogramma' -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------aactatatg-gtatcatatatagctatttctgctcaatg-taatataaatagccgcagtactctgactgtgctaaggtagcataatcatttggcttataattgaagtctagtatgaaagaagatatgggagttaactgtttcctaaacgtttacttaatttacttagggggtgaaaatacccccacaaacataatagacgagaagacccttgaaatttttagtata---attttaaatcgtgctttttgttggggcgacaaggtagcatagtaaacctactaagtggttttattagaacaaaattgtatgaataattaaattactcaagggataacagcataatattttaaagtttgtgacctcgatgttggactaggacaatatagtttaaaagactattatttttgctctgttcg---------------------tatatattatttggtgtttggtgtgggatagttggtacaggtttaagattattgattcgaatagagttaggtagttctggtgttatagcagaagagcatttctacaatgttattgtaacagctcatgcttttgtaataattttttttatagttatgccaatcatgattggaggttttggaaattgaatagtaccgttgttgattggggcacccgatatgagctttccacgtataaacaatataaggttttgattgttacccccttcttttcttctattaattagaagtagtctagtagaaggcggtgcagggaccggttgaacagtgtatcctccacttagctcacttgtaggtcataggagagctgccgtagacttagcaatcttttctcttcatttggctgggatatcatcaattttaggtgcaatcaattttattacaactatttttaatatacgagccccaggaataactatggaacgtgttagactgtttgtttgatctattttagtgacagtgtttcttttattact----------------------------------------------------------------------------- -'>Bradybaena similaris' -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------aactatatg-gtatcatatatagctatttctgctcaatg-taatataaatagccgcagtactctgactgtgctaaggtagcataatcatttggcttataattgaagtctagtatgaaagaagatatgggagttaactgtttcctaaacgtttacttaatttacttagggggtgaaaatacccccacaaacataatagacgagaagacccttgaaatttttagtata---attttaaatcgtgctttttgttggggcgacaaggtagcatagtaaacctactaagtggttttattagaacaaaattgtatgaataattaaattactcaagggataacagcataatattttaaagtttgtgacctcgatgttggactaggacaatatagtttaaaagactattatttttgctctgttcg---------------------??? -'>Bradybaena virgo virgo' ???tatatattatttggtgtttggtgtgggatagttggtacaggtttaagattattgattcgaatagagttaggtagttctggtgttatagcagaagagcatttctacaatgttattgtaacagctcatgcttttgtaataattttttttatagttatgccaatcatgattggaggttttggaaattgaatagtaccgttgttgattggggcacccgatatgagctttccacgtataaacaatataaggttttgattgttacccccttcttttcttctattaattagaagtagtctagtagaaggcggtgcagggaccggttgaacagtgtatcctccacttagctcacttgtaggtcataggagagctgccgtagacttagcaatcttttctcttcatttggctgggatatcatcaattttaggtgcaatcaattttattacaactatttttaatatacgagccccaggaataactatggaacgtgttagactgtttgtttgatctattttagtgacagtgtttcttttattact----------------------------------------------------------------------------- +'Bradybaena circulus circulus' -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------aactatatg-gtatcatatatagctatttctgctcaatg-taatataaatagccgcagtactctgactgtgctaaggtagcataatcatttggcttataattgaagtctagtatgaaagaagatatgggagttaactgtttcctaaacgtttacttaatttacttagggggtgaaaatacccccacaaacataatagacgagaagacccttgaaatttttagtata---attttaaatcgtgctttttgttggggcgacaaggtagcatagtaaacctactaagtggttttattagaacaaaattgtatgaataattaaattactcaagggataacagcataatattttaaagtttgtgacctcgatgttggactaggacaatatagtttaaaagactattatttttgctctgttcg---------------------tatatattatttggtgtttggtgtgggatagttggtacaggtttaagattattgattcgaatagagttaggtagttctggtgttatagcagaagagcatttctacaatgttattgtaacagctcatgcttttgtaataattttttttatagttatgccaatcatgattggaggttttggaaattgaatagtaccgttgttgattggggcacccgatatgagctttccacgtataaacaatataaggttttgattgttacccccttcttttcttctattaattagaagtagtctagtagaaggcggtgcagggaccggttgaacagtgtatcctccacttagctcacttgtaggtcataggagagctgccgtagacttagcaatcttttctcttcatttggctgggatatcatcaattttaggtgcaatcaattttattacaactatttttaatatacgagccccaggaataactatggaacgtgttagactgtttgtttgatctattttagtgacagtgtttcttttattact-----------------------------------------------------------------------------tatatattatttggtgtttggtgtgggatagttggtacaggtttaagattattgattcgaatagagttaggtagttctggtgttatagcagaagagcatttctacaatgttattgtaacagctcatgcttttgtaataattttttttatagttatgccaatcatgattggaggttttggaaattgaatagtaccgttgttgattggggcacccgatatgagctttccacgtataaacaatataaggttttgattgttacccccttcttttcttctattaattagaagtagtctagtagaaggcggtgcagggaccggttgaacagtgtatcctccacttagctcacttgtaggtcataggagagctgccgtagacttagcaatcttttctcttcatttggctgggatatcatcaattttaggtgcaatcaattttattacaactatttttaatatacgagccccaggaataactatggaacgtgttagactgtttgtttgatctattttagtgacagtgtttcttttattact----------------------------------------------------------------------------- +'Bradybaena phaeogramma phaeogramma' -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------aactatatg-gtatcatatatagctatttctgctcaatg-taatataaatagccgcagtactctgactgtgctaaggtagcataatcatttggcttataattgaagtctagtatgaaagaagatatgggagttaactgtttcctaaacgtttacttaatttacttagggggtgaaaatacccccacaaacataatagacgagaagacccttgaaatttttagtata---attttaaatcgtgctttttgttggggcgacaaggtagcatagtaaacctactaagtggttttattagaacaaaattgtatgaataattaaattactcaagggataacagcataatattttaaagtttgtgacctcgatgttggactaggacaatatagtttaaaagactattatttttgctctgttcg---------------------tatatattatttggtgtttggtgtgggatagttggtacaggtttaagattattgattcgaatagagttaggtagttctggtgttatagcagaagagcatttctacaatgttattgtaacagctcatgcttttgtaataattttttttatagttatgccaatcatgattggaggttttggaaattgaatagtaccgttgttgattggggcacccgatatgagctttccacgtataaacaatataaggttttgattgttacccccttcttttcttctattaattagaagtagtctagtagaaggcggtgcagggaccggttgaacagtgtatcctccacttagctcacttgtaggtcataggagagctgccgtagacttagcaatcttttctcttcatttggctgggatatcatcaattttaggtgcaatcaattttattacaactatttttaatatacgagccccaggaataactatggaacgtgttagactgtttgtttgatctattttagtgacagtgtttcttttattact-----------------------------------------------------------------------------tatatattatttggtgtttggtgtgggatagttggtacaggtttaagattattgattcgaatagagttaggtagttctggtgttatagcagaagagcatttctacaatgttattgtaacagctcatgcttttgtaataattttttttatagttatgccaatcatgattggaggttttggaaattgaatagtaccgttgttgattggggcacccgatatgagctttccacgtataaacaatataaggttttgattgttacccccttcttttcttctattaattagaagtagtctagtagaaggcggtgcagggaccggttgaacagtgtatcctccacttagctcacttgtaggtcataggagagctgccgtagacttagcaatcttttctcttcatttggctgggatatcatcaattttaggtgcaatcaattttattacaactatttttaatatacgagccccaggaataactatggaacgtgttagactgtttgtttgatctattttagtgacagtgtttcttttattact----------------------------------------------------------------------------- +'Bradybaena similaris' -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------aactatatg-gtatcatatatagctatttctgctcaatg-taatataaatagccgcagtactctgactgtgctaaggtagcataatcatttggcttataattgaagtctagtatgaaagaagatatgggagttaactgtttcctaaacgtttacttaatttacttagggggtgaaaatacccccacaaacataatagacgagaagacccttgaaatttttagtata---attttaaatcgtgctttttgttggggcgacaaggtagcatagtaaacctactaagtggttttattagaacaaaattgtatgaataattaaattactcaagggataacagcataatattttaaagtttgtgacctcgatgttggactaggacaatatagtttaaaagactattatttttgctctgttcg---------------------???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? +'Bradybaena virgo virgo' ???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????tatatattatttggtgtttggtgtgggatagttggtacaggtttaagattattgattcgaatagagttaggtagttctggtgttatagcagaagagcatttctacaatgttattgtaacagctcatgcttttgtaataattttttttatagttatgccaatcatgattggaggttttggaaattgaatagtaccgttgttgattggggcacccgatatgagctttccacgtataaacaatataaggttttgattgttacccccttcttttcttctattaattagaagtagtctagtagaaggcggtgcagggaccggttgaacagtgtatcctccacttagctcacttgtaggtcataggagagctgccgtagacttagcaatcttttctcttcatttggctgggatatcatcaattttaggtgcaatcaattttattacaactatttttaatatacgagccccaggaataactatggaacgtgttagactgtttgtttgatctattttagtgacagtgtttcttttattact-----------------------------------------------------------------------------tatatattatttggtgtttggtgtgggatagttggtacaggtttaagattattgattcgaatagagttaggtagttctggtgttatagcagaagagcatttctacaatgttattgtaacagctcatgcttttgtaataattttttttatagttatgccaatcatgattggaggttttggaaattgaatagtaccgttgttgattggggcacccgatatgagctttccacgtataaacaatataaggttttgattgttacccccttcttttcttctattaattagaagtagtctagtagaaggcggtgcagggaccggttgaacagtgtatcctccacttagctcacttgtaggtcataggagagctgccgtagacttagcaatcttttctcttcatttggctgggatatcatcaattttaggtgcaatcaattttattacaactatttttaatatacgagccccaggaataactatggaacgtgttagactgtttgtttgatctattttagtgacagtgtttcttttattact----------------------------------------------------------------------------- ; END; BEGIN SETS; - CHARSET ./data/16S.fas = 1-3; - CHARSET ./data/CO1.fas = 4-6; + CHARSET ./data/16S.fas = 1-811; + CHARSET ./data/CO1.fas = 812-1421; + CHARSET ./data/CO1.mega.fas = 1422-2031; END; \ No newline at end of file diff --git a/count.go b/count.go index 739e041..a1504c8 100644 --- a/count.go +++ b/count.go @@ -19,9 +19,8 @@ func fas_sum() []dna { } // 整合若干文件的统计 -func fas_count() []charset { +func fas_count(sum_nex []dna) []charset { fas_charset := []charset{} - sum_nex := fas_sum() for k, v := range sum_nex { n := v.name f := 1 @@ -33,6 +32,6 @@ func fas_count() []charset { new_charset := charset{n, f, t} fas_charset = append(fas_charset, new_charset) } - // fmt.Println(sum_charset) + fmt.Println(fas_charset) return fas_charset } diff --git a/gocomb.go b/gocomb.go index d7c7b52..c2ce004 100644 --- a/gocomb.go +++ b/gocomb.go @@ -17,9 +17,9 @@ func main() { sum_nex := fas_sum() - sum_charset := fas_count() + sum_charset := fas_count(sum_nex) - sum_dna, ntax, nchar := dna_mix(sum_nex, sum_charset) + sum_dna, ntax, nchar := fas_mix(sum_nex, sum_charset) matrix := make(map[string]string, ntax) for k := range sum_dna { diff --git a/mix.go b/mix.go index 303e8cd..bae2274 100644 --- a/mix.go +++ b/mix.go @@ -2,13 +2,13 @@ package main import "strings" -func dna_mix(sum_nex []dna, sum_charset []charset) (map[string][]string, int, int) { +func fas_mix(sum_nex []dna, sum_charset []charset) (map[string][]string, int, int) { // dna 的整合 ntax := 0 nchar := sum_charset[len(sum_charset)-1].To sum_dna := make(map[string][]string) for _, v := range sum_nex { - for k1 := range v.min_dna { + for k1 := range v.min { _, has := sum_dna[k1] if !has { sum_dna[k1] = make([]string, len(sum_charset)) @@ -17,12 +17,13 @@ func dna_mix(sum_nex []dna, sum_charset []charset) (map[string][]string, int, in } } for k, v := range sum_nex { - for _, v1 := range v.min_dna { + for _, v1 := range v.min { for k2 := range sum_dna { - if _, ok := v.min_dna[k2]; ok { + if _, ok := v.min[k2]; ok { sum_dna[k2][k] = v1 } else { sum_dna[k2][k] = strings.Repeat("?", v.count) + // 之前就没写错吗 } } } diff --git a/nex.tmpl b/nex.tmpl index 799bfea..807700c 100644 --- a/nex.tmpl +++ b/nex.tmpl @@ -4,7 +4,8 @@ BEGIN DATA; FORMAT DATATYPE=DNA GAP=- MISSING=?; MATRIX {{- range $k, $v := .Matrix }} -'{{ $k }}' {{ $v }}{{ end }} +'{{ $k }}' {{ $v }} +{{- end }} ; END; BEGIN SETS; diff --git a/parser.go b/parser.go index 143d6e0..0232b4f 100644 --- a/parser.go +++ b/parser.go @@ -3,12 +3,13 @@ package main import ( "fmt" "io/ioutil" + "strings" ) type dna struct { - name string - count int - min_dna map[string]string + name string + count int + min map[string]string } // 读取fas文件 @@ -25,24 +26,24 @@ func fas_parser(file_name string) dna { i := 0 // acgt行计数 j := -1 // 标题行计数 seq := make(map[string]string) - section := "" + indid := "" for k, v := range f { switch v { case '>': - j = k - count++ + j = k + 1 case '\n': if j != -1 { - section = string(f[j:k]) + indid = string(f[j:k]) i = k + 1 j = -1 continue } - seq[section] = seq[section] + string(f[i:k]) + seq[indid] = seq[indid] + strings.ToLower(string(f[i:k])) i = k + 1 } } + count = len(seq[indid]) // for k1, v1 := range seq { // fmt.Println(k1) // fmt.Println(v1) From 179cf24fba7be9eb02f7553ccb982412226e6780 Mon Sep 17 00:00:00 2001 From: qaqland <62464571+qaqland@users.noreply.github.com> Date: Sat, 15 Jan 2022 17:01:37 +0800 Subject: [PATCH 03/11] =?UTF-8?q?=E9=9C=80=E8=A6=81=E4=B8=80=E4=B8=AA?= =?UTF-8?q?=E6=AD=A3=E5=88=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 2 ++ count.go | 12 ++++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..38a7bcb --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +a.nex +test.go \ No newline at end of file diff --git a/count.go b/count.go index a1504c8..2237248 100644 --- a/count.go +++ b/count.go @@ -1,6 +1,8 @@ package main -import "fmt" +import ( + "fmt" +) type charset struct { Name string @@ -22,7 +24,7 @@ func fas_sum() []dna { func fas_count(sum_nex []dna) []charset { fas_charset := []charset{} for k, v := range sum_nex { - n := v.name + n := fas_name(v.name) f := 1 if k != 0 { f = fas_charset[k-1].To + 1 @@ -35,3 +37,9 @@ func fas_count(sum_nex []dna) []charset { fmt.Println(fas_charset) return fas_charset } + +func fas_name(old_name string) string { + // 需要 最后一个/开始,一个.结束 中间的部分 + // 正则表达式 + return old_name +} From 2c26c9e50a59b0cfec181fb0d545e6511590f97c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9C=88=E5=AD=A3=E8=8A=B1=E4=B8=AD=E7=9A=84=E6=98=9F?= =?UTF-8?q?=E8=BE=B0?= Date: Sat, 15 Jan 2022 10:35:44 +0000 Subject: [PATCH 04/11] try to test --- count.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/count.go b/count.go index 2237248..d0d318f 100644 --- a/count.go +++ b/count.go @@ -39,7 +39,11 @@ func fas_count(sum_nex []dna) []charset { } func fas_name(old_name string) string { - // 需要 最后一个/开始,一个.结束 中间的部分 - // 正则表达式 + //needed to import string + str := + compileRegex := regexp.MustCompile("(\w+).\w+") + matchArr := compileRegex.FindStringSubmatch(str) + //needed to use the string get from the old string + fmt.Println("output content:", matchArr[len(matchArr)-1]) return old_name } From c098cabced9518553a6f39ee404b331e083a9315 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9C=88=E5=AD=A3=E8=8A=B1=E4=B8=AD=E7=9A=84=E6=98=9F?= =?UTF-8?q?=E8=BE=B0?= Date: Sat, 15 Jan 2022 10:37:57 +0000 Subject: [PATCH 05/11] Update count.go --- count.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/count.go b/count.go index d0d318f..fe8905f 100644 --- a/count.go +++ b/count.go @@ -40,10 +40,9 @@ func fas_count(sum_nex []dna) []charset { func fas_name(old_name string) string { //needed to import string - str := compileRegex := regexp.MustCompile("(\w+).\w+") - matchArr := compileRegex.FindStringSubmatch(str) + matchArr := compileRegex.FindStringSubmatch(old_name) //needed to use the string get from the old string - fmt.Println("output content:", matchArr[len(matchArr)-1]) + oldname := matchArr[len(matchArr)-1] return old_name } From 5675e8d78b85d68ce993498fd19bacf82db427c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9C=88=E5=AD=A3=E8=8A=B1=E4=B8=AD=E7=9A=84=E6=98=9F?= =?UTF-8?q?=E8=BE=B0?= Date: Sat, 15 Jan 2022 10:39:54 +0000 Subject: [PATCH 06/11] Update count.go --- count.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/count.go b/count.go index fe8905f..be52d9d 100644 --- a/count.go +++ b/count.go @@ -43,6 +43,6 @@ func fas_name(old_name string) string { compileRegex := regexp.MustCompile("(\w+).\w+") matchArr := compileRegex.FindStringSubmatch(old_name) //needed to use the string get from the old string - oldname := matchArr[len(matchArr)-1] + old_name := matchArr[len(matchArr)-1] return old_name } From 3449750a7340dbfeee7a610ac1bb12bce9524e6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9C=88=E5=AD=A3=E8=8A=B1=E4=B8=AD=E7=9A=84=E6=98=9F?= =?UTF-8?q?=E8=BE=B0?= Date: Sat, 15 Jan 2022 10:41:32 +0000 Subject: [PATCH 07/11] Update count.go --- count.go | 1 + 1 file changed, 1 insertion(+) diff --git a/count.go b/count.go index be52d9d..3aa8a43 100644 --- a/count.go +++ b/count.go @@ -2,6 +2,7 @@ package main import ( "fmt" + "regexp" ) type charset struct { From 65196699c37c62ee46d31705ef4651ce79621b43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9C=88=E5=AD=A3=E8=8A=B1=E4=B8=AD=E7=9A=84=E6=98=9F?= =?UTF-8?q?=E8=BE=B0?= Date: Sat, 15 Jan 2022 10:48:01 +0000 Subject: [PATCH 08/11] Update count.go --- count.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/count.go b/count.go index 3aa8a43..42c344e 100644 --- a/count.go +++ b/count.go @@ -41,9 +41,9 @@ func fas_count(sum_nex []dna) []charset { func fas_name(old_name string) string { //needed to import string - compileRegex := regexp.MustCompile("(\w+).\w+") + compileRegex := regexp.MustCompile(`(w+.`) matchArr := compileRegex.FindStringSubmatch(old_name) //needed to use the string get from the old string - old_name := matchArr[len(matchArr)-1] - return old_name + new_name := matchArr[len(matchArr)-1] + return new_name } From e599b8486d8cc778a3f685b3c19f9077443bba25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9C=88=E5=AD=A3=E8=8A=B1=E4=B8=AD=E7=9A=84=E6=98=9F?= =?UTF-8?q?=E8=BE=B0?= Date: Sat, 15 Jan 2022 10:48:16 +0000 Subject: [PATCH 09/11] Update count.go --- count.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/count.go b/count.go index 42c344e..9a6e071 100644 --- a/count.go +++ b/count.go @@ -41,7 +41,7 @@ func fas_count(sum_nex []dna) []charset { func fas_name(old_name string) string { //needed to import string - compileRegex := regexp.MustCompile(`(w+.`) + compileRegex := regexp.MustCompile(`\w+.`) matchArr := compileRegex.FindStringSubmatch(old_name) //needed to use the string get from the old string new_name := matchArr[len(matchArr)-1] From 3c32f0d5e3bc56af09becf347f6f0ae9a41dd51a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9C=88=E5=AD=A3=E8=8A=B1=E4=B8=AD=E7=9A=84=E6=98=9F?= =?UTF-8?q?=E8=BE=B0?= Date: Sat, 15 Jan 2022 10:49:16 +0000 Subject: [PATCH 10/11] Update count.go --- count.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/count.go b/count.go index 9a6e071..2deddd8 100644 --- a/count.go +++ b/count.go @@ -41,7 +41,7 @@ func fas_count(sum_nex []dna) []charset { func fas_name(old_name string) string { //needed to import string - compileRegex := regexp.MustCompile(`\w+.`) + compileRegex := regexp.MustCompile(`\w+\.`) matchArr := compileRegex.FindStringSubmatch(old_name) //needed to use the string get from the old string new_name := matchArr[len(matchArr)-1] From f791976b844572f97a8e04509d4ca21befd66e09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9C=88=E5=AD=A3=E8=8A=B1=E4=B8=AD=E7=9A=84=E6=98=9F?= =?UTF-8?q?=E8=BE=B0?= Date: Sat, 15 Jan 2022 10:56:29 +0000 Subject: [PATCH 11/11] Update count.go --- count.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/count.go b/count.go index 2deddd8..aa6ece4 100644 --- a/count.go +++ b/count.go @@ -41,7 +41,7 @@ func fas_count(sum_nex []dna) []charset { func fas_name(old_name string) string { //needed to import string - compileRegex := regexp.MustCompile(`\w+\.`) + compileRegex := regexp.MustCompile(`(\w+)\.\w+`) matchArr := compileRegex.FindStringSubmatch(old_name) //needed to use the string get from the old string new_name := matchArr[len(matchArr)-1]