diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..38a7bcb --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +a.nex +test.go \ No newline at end of file diff --git a/a.nex b/a.nex new file mode 100644 index 0000000..ad020cc --- /dev/null +++ b/a.nex @@ -0,0 +1,16 @@ +#NEXUS +BEGIN DATA; + DIMENSIONS NTAX=4 NCHAR=2031; + FORMAT DATATYPE=DNA GAP=- MISSING=?; +MATRIX +'Bradybaena circulus circulus' -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------aactatatg-gtatcatatatagctatttctgctcaatg-taatataaatagccgcagtactctgactgtgctaaggtagcataatcatttggcttataattgaagtctagtatgaaagaagatatgggagttaactgtttcctaaacgtttacttaatttacttagggggtgaaaatacccccacaaacataatagacgagaagacccttgaaatttttagtata---attttaaatcgtgctttttgttggggcgacaaggtagcatagtaaacctactaagtggttttattagaacaaaattgtatgaataattaaattactcaagggataacagcataatattttaaagtttgtgacctcgatgttggactaggacaatatagtttaaaagactattatttttgctctgttcg---------------------tatatattatttggtgtttggtgtgggatagttggtacaggtttaagattattgattcgaatagagttaggtagttctggtgttatagcagaagagcatttctacaatgttattgtaacagctcatgcttttgtaataattttttttatagttatgccaatcatgattggaggttttggaaattgaatagtaccgttgttgattggggcacccgatatgagctttccacgtataaacaatataaggttttgattgttacccccttcttttcttctattaattagaagtagtctagtagaaggcggtgcagggaccggttgaacagtgtatcctccacttagctcacttgtaggtcataggagagctgccgtagacttagcaatcttttctcttcatttggctgggatatcatcaattttaggtgcaatcaattttattacaactatttttaatatacgagccccaggaataactatggaacgtgttagactgtttgtttgatctattttagtgacagtgtttcttttattact-----------------------------------------------------------------------------tatatattatttggtgtttggtgtgggatagttggtacaggtttaagattattgattcgaatagagttaggtagttctggtgttatagcagaagagcatttctacaatgttattgtaacagctcatgcttttgtaataattttttttatagttatgccaatcatgattggaggttttggaaattgaatagtaccgttgttgattggggcacccgatatgagctttccacgtataaacaatataaggttttgattgttacccccttcttttcttctattaattagaagtagtctagtagaaggcggtgcagggaccggttgaacagtgtatcctccacttagctcacttgtaggtcataggagagctgccgtagacttagcaatcttttctcttcatttggctgggatatcatcaattttaggtgcaatcaattttattacaactatttttaatatacgagccccaggaataactatggaacgtgttagactgtttgtttgatctattttagtgacagtgtttcttttattact----------------------------------------------------------------------------- +'Bradybaena phaeogramma phaeogramma' -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------aactatatg-gtatcatatatagctatttctgctcaatg-taatataaatagccgcagtactctgactgtgctaaggtagcataatcatttggcttataattgaagtctagtatgaaagaagatatgggagttaactgtttcctaaacgtttacttaatttacttagggggtgaaaatacccccacaaacataatagacgagaagacccttgaaatttttagtata---attttaaatcgtgctttttgttggggcgacaaggtagcatagtaaacctactaagtggttttattagaacaaaattgtatgaataattaaattactcaagggataacagcataatattttaaagtttgtgacctcgatgttggactaggacaatatagtttaaaagactattatttttgctctgttcg---------------------tatatattatttggtgtttggtgtgggatagttggtacaggtttaagattattgattcgaatagagttaggtagttctggtgttatagcagaagagcatttctacaatgttattgtaacagctcatgcttttgtaataattttttttatagttatgccaatcatgattggaggttttggaaattgaatagtaccgttgttgattggggcacccgatatgagctttccacgtataaacaatataaggttttgattgttacccccttcttttcttctattaattagaagtagtctagtagaaggcggtgcagggaccggttgaacagtgtatcctccacttagctcacttgtaggtcataggagagctgccgtagacttagcaatcttttctcttcatttggctgggatatcatcaattttaggtgcaatcaattttattacaactatttttaatatacgagccccaggaataactatggaacgtgttagactgtttgtttgatctattttagtgacagtgtttcttttattact-----------------------------------------------------------------------------tatatattatttggtgtttggtgtgggatagttggtacaggtttaagattattgattcgaatagagttaggtagttctggtgttatagcagaagagcatttctacaatgttattgtaacagctcatgcttttgtaataattttttttatagttatgccaatcatgattggaggttttggaaattgaatagtaccgttgttgattggggcacccgatatgagctttccacgtataaacaatataaggttttgattgttacccccttcttttcttctattaattagaagtagtctagtagaaggcggtgcagggaccggttgaacagtgtatcctccacttagctcacttgtaggtcataggagagctgccgtagacttagcaatcttttctcttcatttggctgggatatcatcaattttaggtgcaatcaattttattacaactatttttaatatacgagccccaggaataactatggaacgtgttagactgtttgtttgatctattttagtgacagtgtttcttttattact----------------------------------------------------------------------------- +'Bradybaena similaris' -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------aactatatg-gtatcatatatagctatttctgctcaatg-taatataaatagccgcagtactctgactgtgctaaggtagcataatcatttggcttataattgaagtctagtatgaaagaagatatgggagttaactgtttcctaaacgtttacttaatttacttagggggtgaaaatacccccacaaacataatagacgagaagacccttgaaatttttagtata---attttaaatcgtgctttttgttggggcgacaaggtagcatagtaaacctactaagtggttttattagaacaaaattgtatgaataattaaattactcaagggataacagcataatattttaaagtttgtgacctcgatgttggactaggacaatatagtttaaaagactattatttttgctctgttcg---------------------???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? +'Bradybaena virgo virgo' ???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????tatatattatttggtgtttggtgtgggatagttggtacaggtttaagattattgattcgaatagagttaggtagttctggtgttatagcagaagagcatttctacaatgttattgtaacagctcatgcttttgtaataattttttttatagttatgccaatcatgattggaggttttggaaattgaatagtaccgttgttgattggggcacccgatatgagctttccacgtataaacaatataaggttttgattgttacccccttcttttcttctattaattagaagtagtctagtagaaggcggtgcagggaccggttgaacagtgtatcctccacttagctcacttgtaggtcataggagagctgccgtagacttagcaatcttttctcttcatttggctgggatatcatcaattttaggtgcaatcaattttattacaactatttttaatatacgagccccaggaataactatggaacgtgttagactgtttgtttgatctattttagtgacagtgtttcttttattact-----------------------------------------------------------------------------tatatattatttggtgtttggtgtgggatagttggtacaggtttaagattattgattcgaatagagttaggtagttctggtgttatagcagaagagcatttctacaatgttattgtaacagctcatgcttttgtaataattttttttatagttatgccaatcatgattggaggttttggaaattgaatagtaccgttgttgattggggcacccgatatgagctttccacgtataaacaatataaggttttgattgttacccccttcttttcttctattaattagaagtagtctagtagaaggcggtgcagggaccggttgaacagtgtatcctccacttagctcacttgtaggtcataggagagctgccgtagacttagcaatcttttctcttcatttggctgggatatcatcaattttaggtgcaatcaattttattacaactatttttaatatacgagccccaggaataactatggaacgtgttagactgtttgtttgatctattttagtgacagtgtttcttttattact----------------------------------------------------------------------------- +; +END; +BEGIN SETS; + CHARSET ./data/16S.fas = 1-811; + CHARSET ./data/CO1.fas = 812-1421; + CHARSET ./data/CO1.mega.fas = 1422-2031; +END; \ No newline at end of file diff --git a/count.go b/count.go new file mode 100644 index 0000000..aa6ece4 --- /dev/null +++ b/count.go @@ -0,0 +1,49 @@ +package main + +import ( + "fmt" + "regexp" +) + +type charset struct { + Name string + From int + To int +} + +// 遍历文件得到基本数据 +func fas_sum() []dna { + sum := []dna{} + for i, f := range file_input { + sum = append(sum, fas_parser(f)) + fmt.Println("[ working A ]", i+1, f) + } + return sum +} + +// 整合若干文件的统计 +func fas_count(sum_nex []dna) []charset { + fas_charset := []charset{} + for k, v := range sum_nex { + n := fas_name(v.name) + f := 1 + if k != 0 { + f = fas_charset[k-1].To + 1 + } + t := f + v.count - 1 + fmt.Println("[ working B ]", n, f, t) + new_charset := charset{n, f, t} + fas_charset = append(fas_charset, new_charset) + } + fmt.Println(fas_charset) + return fas_charset +} + +func fas_name(old_name string) string { + //needed to import string + compileRegex := regexp.MustCompile(`(\w+)\.\w+`) + matchArr := compileRegex.FindStringSubmatch(old_name) + //needed to use the string get from the old string + new_name := matchArr[len(matchArr)-1] + return new_name +} diff --git a/flag.go b/flag.go new file mode 100644 index 0000000..22dff76 --- /dev/null +++ b/flag.go @@ -0,0 +1,20 @@ +package main + +import ( + "flag" + "fmt" +) + +var ( + file_output string + file_input []string +) + +func dna_flag() { + flag.StringVar(&file_output, "o", "a.nex", "files name wait to out") + flag.Parse() + file_input = flag.Args() // []string{"foo", "bar"} + fmt.Println("==============") + fmt.Println("[input file:]", file_input) + fmt.Println("[output file:]", file_output) +} diff --git a/gocomb.go b/gocomb.go new file mode 100644 index 0000000..c2ce004 --- /dev/null +++ b/gocomb.go @@ -0,0 +1,33 @@ +package main + +import ( + "strings" +) + +type tmpl_data struct { + Ntax int + Nchar int + Matrix map[string]string + Charset []charset +} + +func main() { + + dna_flag() + + sum_nex := fas_sum() + + sum_charset := fas_count(sum_nex) + + sum_dna, ntax, nchar := fas_mix(sum_nex, sum_charset) + + matrix := make(map[string]string, ntax) + for k := range sum_dna { + matrix[k] = strings.Join(sum_dna[k], "") + } + + // 准备发射到模板的数据 + last_data := tmpl_data{ntax, nchar, matrix, sum_charset} + // fmt.Println(last_data) + do_impl(last_data) +} diff --git a/mix.go b/mix.go new file mode 100644 index 0000000..bae2274 --- /dev/null +++ b/mix.go @@ -0,0 +1,33 @@ +package main + +import "strings" + +func fas_mix(sum_nex []dna, sum_charset []charset) (map[string][]string, int, int) { + // dna 的整合 + ntax := 0 + nchar := sum_charset[len(sum_charset)-1].To + sum_dna := make(map[string][]string) + for _, v := range sum_nex { + for k1 := range v.min { + _, has := sum_dna[k1] + if !has { + sum_dna[k1] = make([]string, len(sum_charset)) + ntax++ + } + } + } + for k, v := range sum_nex { + for _, v1 := range v.min { + for k2 := range sum_dna { + if _, ok := v.min[k2]; ok { + sum_dna[k2][k] = v1 + } else { + sum_dna[k2][k] = strings.Repeat("?", v.count) + // 之前就没写错吗 + } + } + } + } + // fmt.Println(sum_dna) + return sum_dna, ntax, nchar +} diff --git a/src/nex_tmpl.go b/nex.tmpl similarity index 62% rename from src/nex_tmpl.go rename to nex.tmpl index 2891241..807700c 100644 --- a/src/nex_tmpl.go +++ b/nex.tmpl @@ -1,20 +1,15 @@ -package fas_parser - -const Nex_tmpl = `#NEXUS +#NEXUS BEGIN DATA; DIMENSIONS NTAX={{ .Ntax }} NCHAR={{ .Nchar }}; FORMAT DATATYPE=DNA GAP=- MISSING=?; MATRIX {{- range $k, $v := .Matrix }} -'{{ $k }}' {{ $v }}{{ end }} +'{{ $k }}' {{ $v }} +{{- end }} ; END; BEGIN SETS; {{- range $_, $i := .Charset }} CHARSET {{ $i.Name }} = {{ $i.From }}-{{ $i.To }}; {{- end }} -END; -` - -// 最后那个 $i 好像有问题 -// {{/* $k| printf "%-40s" */}} +END; \ No newline at end of file diff --git a/parser.go b/parser.go index a3f40e0..0232b4f 100644 --- a/parser.go +++ b/parser.go @@ -1,119 +1,53 @@ -package main - -import ( - "flag" - "fmt" - fas_parser "gocomb/src" - "os" - "strings" - "text/template" -) - -type dna struct { - name string - min_dna map[string]string - count int -} - -type charset struct { - Name string - From int - To int -} - -type tmpl_data struct { - Ntax int - Nchar int - Matrix map[string]string - Charset []charset -} - -func main() { - - // 读取命令行,这里一定要是指针 - file_export := flag.String("o", "a.nex", "files name wait to out") - flag.Parse() - file_names := flag.Args() // []string{"foo", "bar"} - fmt.Println("[ export here ]", *file_export) - - // 遍历文件得到基本数据 - sum_nex := make([]dna, 0, 5) - for k, v := range file_names { - i, j := fas_parser.Fas_parser(v) - new_nex := dna{v, i, j} - sum_nex = append(sum_nex, new_nex) - fmt.Println("[ working A ]", k+1, v) - } - - // 整合若干文件的统计 - sum_charset := []charset{} - for k, v := range sum_nex { - n := v.name - f := 1 - if k != 0 { - f = sum_charset[k-1].To + 1 - } - t := f + v.count - 1 - fmt.Println("[ working B ]", n, f, t) - new_charset := charset{n, f, t} - sum_charset = append(sum_charset, new_charset) - } - // fmt.Println(sum_charset) - - // dna 的整合 - ntax := 0 - nchar := sum_charset[len(sum_charset)-1].To - sum_dna := make(map[string][]string) - for _, v := range sum_nex { - for k1 := range v.min_dna { - _, has := sum_dna[k1] - if !has { - sum_dna[k1] = make([]string, len(sum_charset)) - ntax ++ - } - } - } - for k, v := range sum_nex { - for _, v1 := range v.min_dna { - for k2 := range sum_dna { - if _, ok := v.min_dna[k2]; ok { - sum_dna[k2][k] = v1 - } else { - sum_dna[k2][k] = strings.Repeat("?", v.count) - } - } - } - } - // fmt.Println(sum_dna) - - matrix := make(map[string]string, ntax) - for k := range sum_dna { - matrix[k] = strings.Join(sum_dna[k], "") - } - - // 准备发射到模板的数据 - last_data := tmpl_data{ntax, nchar, matrix, sum_charset} - // fmt.Println(last_data) - - // 读取模板 - nex_tmpl, err := template.New("nex").Parse(fas_parser.Nex_tmpl) - if err != nil { - fmt.Println("[ tmpl err ]", err) - return - } - - // 覆盖创建要写入的 nex 文件 - new_file, err := os.OpenFile(*file_export, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0666) - if err != nil { - fmt.Println("[ create or open file error ]", err) - return - } - defer new_file.Close() - - // 写入 nex 模板 - err = nex_tmpl.Execute(new_file, last_data) - if err != nil { - fmt.Println("[ err at tmpl exec ]", err) - return - } -} +package main + +import ( + "fmt" + "io/ioutil" + "strings" +) + +type dna struct { + name string + count int + min map[string]string +} + +// 读取fas文件 +func fas_parser(file_name string) dna { + + // 读取文件 + f, err := ioutil.ReadFile("./" + file_name) + if err != nil { + fmt.Println(err) + return dna{"", 0, nil} + } + + count := 0 + i := 0 // acgt行计数 + j := -1 // 标题行计数 + seq := make(map[string]string) + indid := "" + + for k, v := range f { + switch v { + case '>': + j = k + 1 + case '\n': + if j != -1 { + indid = string(f[j:k]) + i = k + 1 + j = -1 + continue + } + seq[indid] = seq[indid] + strings.ToLower(string(f[i:k])) + i = k + 1 + } + } + count = len(seq[indid]) + // for k1, v1 := range seq { + // fmt.Println(k1) + // fmt.Println(v1) + // } + // fmt.Println(count) + return dna{file_name, count, seq} +} diff --git a/src/fas_parser.go b/src/fas_parser.go deleted file mode 100644 index fb055f5..0000000 --- a/src/fas_parser.go +++ /dev/null @@ -1,53 +0,0 @@ -package fas_parser - -import( - "io/ioutil" - "fmt" -) - -func Fas_parser(file_name string) (map[string]string, int) { - f, err := ioutil.ReadFile("./" + file_name) - if err != nil { - fmt.Println(err) - return nil, 0 - } - // fmt.Println(f) - count := 0 - i := 0 // DNA行计数 - j := 0 // 非序列行计数 - seq := make(map[string]string) - section := "" - // fmt.Println('a', 'c', 'g', 't', '-', '\n', '\r') - for k, v := range f { - switch v { - case 'a', 'c', 'g', 't', '-': - if j != 0 { - continue - } - if i == 0 { - i = k - } - case '\n': - if i != 0 { - seq[section] = seq[section] + string(f[i:k]) - if len(seq) < 2 && j == 0 { - count += k - i - } - i = 0 - continue - } - section = string(f[j:k]) - j = 0 - default: - if j == 0 { - j = k + 1 - } - } - } - // for k1, v1 := range seq { - // fmt.Println(k1) - // fmt.Println(v1) - // } - // fmt.Println(count) - return seq, count -} diff --git a/tmpl.go b/tmpl.go new file mode 100644 index 0000000..ef86bc2 --- /dev/null +++ b/tmpl.go @@ -0,0 +1,39 @@ +package main + +import ( + "fmt" + "io/ioutil" + "os" + "text/template" +) + +func do_impl(last_data tmpl_data) { + + f, err := ioutil.ReadFile("nex.tmpl") + if err != nil { + fmt.Println(err) + return + } + + // 读取模板 + nex_tmpl, err := template.New("nex").Parse(string(f)) + if err != nil { + fmt.Println("[ tmpl err ]", err) + return + } + + // 覆盖创建要写入的 nex 文件 + new_file, err := os.OpenFile(file_output, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0666) + if err != nil { + fmt.Println("[ create or open file error ]", err) + return + } + defer new_file.Close() + + // 写入 nex 模板 + err = nex_tmpl.Execute(new_file, last_data) + if err != nil { + fmt.Println("[ err at tmpl exec ]", err) + return + } +}