diff --git a/RGBEPP.d b/RGBEPP.d index 45b907b..7a2445d 100644 --- a/RGBEPP.d +++ b/RGBEPP.d @@ -25,13 +25,15 @@ void show_help(string pkgver) { -m\t--memory\tmemory settings (optional, default 16 GB) -r\t--reference\treference genome path -t\t--threads\tthreads setting (optional, default 8 threads) + --codon\t\tOnly use the codon region (optional) --fastp\t\tFastp path (optional) --spades\t\tSpades python path (optional) --diamond\t\tDiamond python path (optional) - --sortdiamond\t\tSortDiamond python path (optional) + --sortdiamond\tSortDiamond python path (optional) --bowtie2\t\tBowtie2 path (optional) --samtools\t\tSamtools path (optional) --bcftools\t\tBcftools path (optional) + --exonerate\t\tExonerate path (optional) --macse\t\tMacse jarfile path (optional) --delstop\t\tDelstop path (optional) --trimal\t\tTrimal path (optional) @@ -73,6 +75,15 @@ void createDir(string path) { } } +void moveDir (string oldPath, string newPath) { + try { + rename(oldPath, newPath); + writeln("Directory renamed successfully."); + } catch (Exception e) { + writeln("Error renaming directory: ", e.msg); + } +} + void executeCommand(string[] cmd) { auto process = spawnProcess(cmd); @@ -81,6 +92,27 @@ void executeCommand(string[] cmd) { } } +void executeCommandToFile(string[] cmd, string outputFile) { + // Create a pipe for the command's output + auto pipe = pipe(); + + // Spawn the process + auto pid = spawnProcess(cmd, stdin, pipe.writeEnd); + + // Close the write end of the pipe to signal EOF + pipe.writeEnd.close(); + + // Read the output from the pipe + auto output = cast(string) pipe.readEnd.byChunk(4096).joiner.array; + + // Wait for the process to finish + wait(pid); + + // Write the output to the specified file + std.file.write(outputFile, cast(ubyte[])output); +} + + void executeCommandPipe(string[][] cmds) { Pid[] pids; @@ -128,7 +160,7 @@ string getBaseName(string ARG_R){ string[] getRef(string ARG_R, string DirMap){ string baseNameRef = getBaseName(ARG_R); - string ARG_R_index = DirMap ~ "/index/" ~ baseNameRef; // bt2_index_base + string ARG_R_index = buildPath(DirMap, "index", baseNameRef); // bt2_index_base string ARG_R_refer = ARG_R_index ~ ".fasta"; //reference_in fasta file string[] Refs = [ARG_R_index, ARG_R_refer]; return Refs; @@ -168,12 +200,12 @@ void processQcTrim(string[] ARG_L, int ARG_T, string DirRaw, string DirQcTrim, s writeln("QcTrimming::Start"); foreach (string file; ARG_L) { string baseName = getBaseName(file); - string inputFileR1 = DirRaw ~ "/" ~ baseName ~ "_R1.fastq.gz"; - string inputFileR2 = DirRaw ~ "/" ~ baseName ~ "_R2.fastq.gz"; - string outputFileR1 = DirQcTrim ~ "/" ~ baseName ~ "_R1.fastq.gz"; - string outputFileR2 = DirQcTrim ~ "/" ~ baseName ~ "_R2.fastq.gz"; - string jsonFile = DirQcTrim ~ "/" ~ baseName ~ ".json"; - string htmlFile = DirQcTrim ~ "/" ~ baseName ~ ".html"; + string inputFileR1 = buildPath(DirRaw, baseName ~ "_R1.fastq.gz"); + string inputFileR2 = buildPath(DirRaw, baseName ~ "_R2.fastq.gz"); + string outputFileR1 = buildPath(DirQcTrim, baseName ~ "_R1.fastq.gz"); + string outputFileR2 = buildPath(DirQcTrim, baseName ~ "_R2.fastq.gz"); + string jsonFile = buildPath(DirQcTrim, baseName ~ ".json"); + string htmlFile = buildPath(DirQcTrim, baseName ~ ".html"); // Perform quality control and trimming using external program `fastp` string[] cmdQcTrim = [PathFastp, "-i", inputFileR1, "-I", inputFileR2, @@ -190,10 +222,10 @@ void processAssembly(string[] ARG_L, int ARG_M, int ARG_T, string DirQcTrim, str createDir(DirAssembly); foreach (string file; ARG_L) { string baseName = getBaseName(file); - string DirAss = DirAssembly ~ "/" ~ baseName; + string DirAss = buildPath(DirAssembly, baseName); createDir(DirAss); - string inputFileR1 = DirQcTrim ~ "/" ~ baseName ~ "_R1.fastq.gz"; - string inputFileR2 = DirQcTrim ~ "/" ~ baseName ~ "_R2.fastq.gz"; + string inputFileR1 = buildPath(DirQcTrim, baseName ~ "_R1.fastq.gz"); + string inputFileR2 = buildPath(DirQcTrim, baseName ~ "_R2.fastq.gz"); string[] cmdAssembly = [PathSpades, "--pe1-1", inputFileR1, "--pe1-2", inputFileR2, "-t", ARG_T.to!string, "-m", ARG_M.to!string, "--careful", "--phred-offset", "33", "-o", DirAss]; executeCommand(cmdAssembly); } @@ -202,18 +234,18 @@ void processAssembly(string[] ARG_L, int ARG_M, int ARG_T, string DirQcTrim, str void processAssemMv(string[] ARG_L,string DirAssembly){ // Prepare - string DirAssemblySca = DirAssembly ~ "/" ~ "scaffolds"; - string DirAssemblyCont = DirAssembly ~ "/" ~ "contigs"; + string DirAssemblySca = buildPath(DirAssembly, "scaffolds"); + string DirAssemblyCont = buildPath(DirAssembly, "contigs"); writeln("Assembly_Move::Start"); createDir(DirAssemblySca); createDir(DirAssemblyCont); foreach (string file; ARG_L ){ string baseName = getBaseName(file); - string DirAssemblyInd = DirAssembly ~ "/" ~ baseName; - string inputSca = DirAssemblyInd ~ "/" ~ "scaffolds.fasta"; - string inputCont = DirAssemblyInd ~ "/" ~ "contigs.fasta"; - string outputSca = DirAssemblySca ~ "/" ~ baseName ~ ".fasta"; - string outputCont = DirAssemblyCont ~ "/" ~ baseName ~ ".fasta"; + string DirAssemblyInd = buildPath(DirAssembly, baseName); + string inputSca = buildPath(DirAssemblyInd, "scaffolds.fasta"); + string inputCont = buildPath(DirAssemblyInd, "contigs.fasta"); + string outputSca = buildPath(DirAssemblySca, baseName ~ ".fasta"); + string outputCont = buildPath(DirAssemblyCont, baseName ~ ".fasta"); if (!exists(inputSca)) { writeln("File not found: ", inputSca); continue; @@ -234,28 +266,28 @@ void processMappingDenovo(string[] ARG_L, string ARG_R, int ARG_T, string DirQcT // Prepare directory writeln("Mapping::Start"); createDir(DirMap); - createDir(DirMap ~ "/index"); - string DirAssemblySca = DirAssembly ~ "/" ~ "scaffolds"; - string DirAssemblyFas = DirAssembly ~ "/" ~ "fasta"; + createDir(buildPath(DirMap, "index")); + string DirAssemblySca = buildPath(DirAssembly, "scaffolds"); + string DirAssemblyFas = buildPath(DirAssembly, "fasta"); createDir(DirAssemblyFas); string ARG_R_Base = getBaseName(ARG_R); - string ARG_R_Ref = DirAssemblyFas ~ "/" ~ ARG_R_Base ~ ".fasta"; + string ARG_R_Ref = buildPath(DirAssemblyFas, ARG_R_Base ~ ".fasta"); copy(ARG_R, ARG_R_Ref); string [] cmdDmMakeDB = [ PathDiamond, "makedb", "--db", "Reference", "--in", ARG_R_Ref]; executeCommand(cmdDmMakeDB); - string ReferDmnd = DirAssemblyFas ~ "/" ~ "Reference.dmnd"; + string ReferDmnd = buildPath(DirAssemblyFas, "Reference.dmnd"); string PathBowtie2_build = PathBowtie2 ~ "-build"; foreach (string file; ARG_L) { string baseName = getBaseName(file); - string inputM8 = DirAssemblySca ~ "/" ~ baseName ~ ".m8"; - string inputFasta = DirAssemblySca ~ "/" ~ baseName ~ ".fasta"; - string outputSort = DirAssemblyFas ~ "/" ~ baseName ~ ".fasta"; - string outputIndex = DirAssemblyFas ~ "/" ~ baseName; - string inputFileR1 = DirQcTrim ~ "/" ~ baseName ~ "_R1.fastq.gz"; - string inputFileR2 = DirQcTrim ~ "/" ~ baseName ~ "_R2.fastq.gz"; - string outputBam = DirMap ~ "/" ~ baseName ~ ".bam"; + string inputM8 = buildPath(DirAssemblySca, baseName ~ ".m8"); + string inputFasta = buildPath(DirAssemblySca, baseName ~ ".fasta"); + string outputSort = buildPath(DirAssemblyFas, baseName ~ ".fasta"); + string outputIndex = buildPath(DirAssemblyFas, baseName); + string inputFileR1 = buildPath(DirQcTrim, baseName ~ "_R1.fastq.gz"); + string inputFileR2 = buildPath(DirQcTrim, baseName ~ "_R2.fastq.gz"); + string outputBam = buildPath(DirMap, baseName ~ ".bam"); string[] cmdDiamond = [PathDiamond, "blastx", "-d", "Reference.dmnd", "-q", inputFasta, "-o", inputM8, "--outfmt", "6", "qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore", "qlen", "slen", "gaps", "ppos", "qframe", "qseq"]; string[] cmdSortDiamond = [PathSortDiamond, inputM8, outputSort]; @@ -278,8 +310,8 @@ void processPostMap(string[] ARG_L, int ARG_T, string DirMap, string DirBam, str foreach (string file; ARG_L) { string baseName = getBaseName(file); - string inputBam = DirMap ~ "/" ~ baseName ~ ".bam"; - string outputBam = DirBam ~ "/" ~ baseName ~ ".bam"; + string inputBam = buildPath(DirMap, baseName ~ ".bam"); + string outputBam = buildPath(DirBam, baseName ~ ".bam"); // Convert SAM to BAM, sort and remove duplicates using Samtools string[] cmdFixmate = [PathSamtools, "fixmate", "-@", ARG_T.to!string, "-m", inputBam, "-"]; @@ -297,14 +329,14 @@ void processPostMap(string[] ARG_L, int ARG_T, string DirMap, string DirBam, str void processVarCallDenovo(string[] ARG_L, int ARG_T, string DirAssembly, string DirMap, string DirBam, string DirVcf, string PathBcftools) { writeln("VarCalling::Start"); - string DirAssemblyFas = DirAssembly ~ "/" ~ "fasta"; + string DirAssemblyFas = buildPath(DirAssembly, "fasta"); createDir(DirVcf); foreach (string file; parallel(ARG_L, 1)) { string baseName = getBaseName(file); - string inputBam = DirBam ~ "/" ~ baseName ~ ".bam"; - string outputVcf = DirVcf ~ "/" ~ baseName ~ ".vcf.gz"; - string referFasta = DirAssemblyFas ~ "/" ~ baseName ~ ".fasta"; + string inputBam = buildPath(DirBam, baseName ~ ".bam"); + string outputVcf = buildPath(DirVcf, baseName ~ ".vcf.gz"); + string referFasta = buildPath(DirAssemblyFas, baseName ~ ".fasta"); // Variant calling using bcftools string[] cmdPileup = [PathBcftools, "mpileup", "-Oz", "--threads", ARG_T.to!string, "-f", referFasta, inputBam]; string[] cmdVarCall = [PathBcftools, "call", "-mv", "-Oz", "--threads", ARG_T.to!string]; @@ -321,17 +353,17 @@ void processVarCallDenovo(string[] ARG_L, int ARG_T, string DirAssembly, string void processConDenovo(string[] ARG_G, string[] ARG_L, int ARG_T, string DirAssembly, string DirVcf, string DirConsensus, string PathBcftools) { createDir(DirConsensus); - string DirConTaxa = DirConsensus ~ "/" ~ "taxa"; - string DirAssemblyFas = DirAssembly ~ "/" ~ "fasta"; + string DirConTaxa = buildPath(DirConsensus, "taxa"); + string DirAssemblyFas = buildPath(DirAssembly, "fasta"); createDir(DirConTaxa); writeln("Consensus::Start"); // Extract fasta from vcf file foreach (string file; ARG_L) { string baseName = getBaseName(file); - string inputVcf = DirVcf ~ "/" ~ baseName ~ ".vcf.gz"; - string outputFasta = DirConTaxa ~ "/" ~ baseName ~ ".fasta"; - string referFasta = DirAssemblyFas ~ "/" ~ baseName ~ ".fasta"; + string inputVcf = buildPath(DirVcf, baseName ~ ".vcf.gz"); + string outputFasta = buildPath(DirConTaxa, baseName ~ ".fasta"); + string referFasta = buildPath(DirAssemblyFas, baseName ~ ".fasta"); // index vcf.gz string[] cmdIndexVcf = [PathBcftools, "index", inputVcf]; executeCommand(cmdIndexVcf); @@ -347,8 +379,8 @@ void processConDenovo(string[] ARG_G, string[] ARG_L, int ARG_T, string DirAssem void processCombFasta(string[] ARG_G, string[] ARG_L, string DirConsensus) { - string DirConTaxa = DirConsensus ~ "/" ~ "taxa"; - string DirConGene = DirConsensus ~ "/" ~ "gene"; + string DirConTaxa = buildPath(DirConsensus, "taxa"); + string DirConGene = buildPath(DirConsensus, "gene"); createDir(DirConGene); // create a dictory @@ -357,7 +389,7 @@ void processCombFasta(string[] ARG_G, string[] ARG_L, string DirConsensus) { writeln("ConvertFasta::Start"); // read first foreach (file; ARG_L) { - string inputFile = DirConTaxa ~ "/" ~ file ~ ".fasta"; + string inputFile = buildPath(DirConTaxa, file ~ ".fasta"); if (!exists(inputFile)) { writeln("File not found: ", inputFile); continue; @@ -385,7 +417,7 @@ void processCombFasta(string[] ARG_G, string[] ARG_L, string DirConsensus) { } // write different files foreach (gene; ARG_G) { - string outputFile = DirConGene ~ "/" ~ gene ~ ".fasta"; + string outputFile = buildPath(DirConGene, gene ~ ".fasta"); File output = File(outputFile, "w"); if (gene in geneSequences) { output.write(geneSequences[gene]); @@ -394,20 +426,99 @@ void processCombFasta(string[] ARG_G, string[] ARG_L, string DirConsensus) { writeln("ConvertFasta::End"); } +void splitFasta(const string inputFasta) { + File infile; + try { + infile = File(inputFasta, "r"); + } catch (FileException e) { + stderr.writeln("Error: Unable to open input file ", inputFasta); + return; + } + + string line; + string seqName; + File outfile; + bool in_sequence = false; + + foreach (lineContent; infile.byLine()) { + if (lineContent.empty) continue; + + if (lineContent[0] == '>') { + // New sequence header + if (in_sequence) { // if found new sequence, close + outfile.close(); // previous output file + } + seqName = cast(string)lineContent[1 .. $]; // Remove '>' + string outputFile = seqName ~ ".fasta"; // suitable to many os + outfile = File(outputFile, "w"); + outfile.writeln(">", getBaseName(inputFasta)); + // will enter sequence + in_sequence = true; + } else if (in_sequence) { + // Inside sequence content + outfile.writeln(lineContent); + } + } + + if (in_sequence) { + outfile.close(); + } + + if (infile.eof) { + writeln("Sequences have been split into individual files."); + } else { + stderr.writeln("Error occurred while reading file."); + } + + infile.close(); +} + +void processCodon(string[] ARG_G, string ARG_R, string DirConsensus, string PathExonerate){ + + string DirConGene = buildPath(DirConsensus , "gene"); + + string ARG_R_Base = getBaseName(ARG_R); + string ARG_R_Ref = buildPath(DirConsensus, ARG_R_Base ~ ".fasta"); + copy(ARG_R, ARG_R_Ref); + splitFasta(ARG_R_Ref); + + moveDir(DirConGene, DirConGene ~ "_bak"); + + writeln("GetCodon::Start"); + + foreach (gene; ARG_G) { + string inputFile = buildPath(DirConGene ~ "_bak", gene ~ ".fasta"); + string outputFile = buildPath(DirConGene, gene ~ ".fasta"); + string referFile = buildPath(DirConsensus, gene ~ ".fasta"); + if (!exists(inputFile)) { + writeln("File not found: ", inputFile); + continue; + } else { + string[] cmdExonerate = [PathExonerate, inputFile, referFile, "--showalignment", "no", "--showvulgar", "no", "--showtargetgff", "no", "--ryo", "\">%qi\n%qcs\n\"", "--verbose", "0"]; + executeCommandToFile(cmdExonerate, outputFile); + } + std.file.remove(referFile); + } + + rmdirRecurse(DirConGene ~ "_bak"); + + writeln("GetCodon::End"); +} + void processAlign(string[] ARG_G, string DirConsensus, string DirAlign, string PathMacse){ - string DirConGene = DirConsensus ~ "/" ~ "gene"; - string DirAlignAA = DirAlign ~ "/" ~ "AA"; - string DirAlignNT = DirAlign ~ "/" ~ "NT"; + string DirConGene = buildPath(DirConsensus, "gene"); + string DirAlignAA = buildPath(DirAlign, "AA"); + string DirAlignNT = buildPath(DirAlign, "NT"); writeln("Align::Start"); createDir(DirAlign); createDir(DirAlignAA); createDir(DirAlignNT); foreach (gene; parallel(ARG_G, 1)) { - string inputFasta = DirConGene ~ "/" ~ gene ~ ".fasta"; - string outAA = DirAlignAA ~ "/" ~ gene ~ ".fasta"; - string outNT = DirAlignNT ~ "/" ~ gene ~ ".fasta"; + string inputFasta = buildPath(DirConGene, gene ~ ".fasta"); + string outAA = buildPath(DirAlignAA, gene ~ ".fasta"); + string outNT = buildPath(DirAlignNT, gene ~ ".fasta"); string[] cmdAlign = ["java", "-jar", PathMacse, "-prog", "alignSequences", "-seq" , inputFasta, "-out_AA", outAA, "-out_NT", outNT ]; executeCommand(cmdAlign); } @@ -418,20 +529,20 @@ void processAlign(string[] ARG_G, string DirConsensus, string DirAlign, string P void processTrimming(string[] ARG_G, string DirAlign, string DirTrim, string PathDelstop, string PathTrimal){ writeln("Trimming::Start"); - string DirAA = DirAlign ~ "/" ~ "AA"; - string DirNT = DirAlign ~ "/" ~ "NT"; - string DirAA_out = DirAlign ~ "/" ~ "AA_out"; - string DirNT_out = DirAlign ~ "/" ~ "NT_out"; + string DirAA = buildPath(DirAlign, "AA"); + string DirNT = buildPath(DirAlign, "NT"); + string DirAA_out = buildPath(DirAlign, "AA_out"); + string DirNT_out = buildPath(DirAlign, "NT_out"); createDir(DirAA_out); createDir(DirNT_out); // copy file firstly foreach (gene; parallel(ARG_G,1)){ - string inputFastaAA = DirAA ~ "/" ~ gene ~ ".fasta"; - string outputFastaAA = DirAA_out ~ "/" ~ gene ~ ".fasta"; - string inputFastaNT = DirNT ~ "/" ~ gene ~ ".fasta"; - string outputFastaNT = DirNT_out ~ "/" ~ gene ~ ".fasta"; + string inputFastaAA = buildPath(DirAA, gene ~ ".fasta"); + string outputFastaAA = buildPath(DirAA_out, gene ~ ".fasta"); + string inputFastaNT = buildPath(DirNT, gene ~ ".fasta"); + string outputFastaNT = buildPath(DirNT_out, gene ~ ".fasta"); copy(inputFastaNT, outputFastaNT); copy(inputFastaAA, outputFastaAA); @@ -440,13 +551,13 @@ void processTrimming(string[] ARG_G, string DirAlign, string DirTrim, string Pat executeCommand(cmdDelStop); } - string DirTrimNT = DirTrim ~ "/" ~ "NT"; + string DirTrimNT = buildPath(DirTrim, "NT"); createDir(DirTrim); createDir(DirTrimNT); foreach (gene; parallel(ARG_G,1)){ - string inputFastaAA = DirAA_out ~ "/" ~ gene ~ ".fasta"; - string inputBackTransNT = DirNT_out ~ "/" ~ gene ~ ".fasta"; - string outputFastaNT = DirTrimNT ~ "/" ~ gene ~ ".fasta"; + string inputFastaAA = buildPath(DirAA_out, gene ~ ".fasta"); + string inputBackTransNT = buildPath(DirNT_out, gene ~ ".fasta"); + string outputFastaNT = buildPath(DirTrimNT, gene ~ ".fasta"); if (exists(inputFastaAA) && exists(inputBackTransNT)) { string[] cmdTrim = [PathTrimal, "-in", inputFastaAA, "-backtrans", inputBackTransNT, "-out", outputFastaNT, "-automated1"]; executeCommand(cmdTrim); @@ -462,16 +573,16 @@ void main(string[] args) { string pkgver = "0.0.3"; string DirHome = std.file.getcwd(); - string DirRaw = DirHome ~ "/00_raw"; - string DirQcTrim = DirHome ~ "/01_fastp"; - string DirMap = DirHome ~ "/02_bowtie2"; - string DirAssembly = DirHome ~ "/03_spades"; - string DirBam = DirHome ~ "/04_bam"; - string DirVcf = DirHome ~ "/05_vcf"; - string DirConsensus = DirHome ~ "/06_consen"; - string DirConsensus1 = DirHome ~ "/07_consen1"; - string DirAlign = DirHome ~ "/08_macse"; - string DirTrim = DirHome ~ "/09_trimal"; + string DirRaw = buildPath(DirHome, "00_raw"); + string DirQcTrim = buildPath(DirHome, "01_fastp"); + string DirMap = buildPath(DirHome, "02_bowtie2"); + string DirAssembly = buildPath(DirHome, "03_spades"); + string DirBam = buildPath(DirHome, "04_bam"); + string DirVcf = buildPath(DirHome, "05_vcf"); + string DirConsensus = buildPath(DirHome, "06_consen"); + string DirConsensus1 = buildPath(DirHome, "07_consen1"); + string DirAlign = buildPath(DirHome, "08_macse"); + string DirTrim = buildPath(DirHome, "09_trimal"); string PathFastp = "/usr/bin/fastp"; string PathSpades = "/usr/bin/spades.py"; @@ -480,6 +591,7 @@ void main(string[] args) { string PathBowtie2 = "/usr/bin/bowtie2"; string PathSamtools = "/usr/bin/samtools"; string PathBcftools = "/usr/bin/bcftools"; + string PathExonerate = "/usr/bin/exonerate"; string PathMacse = "/usr/share/java/macse.jar"; string PathDelstop = "/usr/bin/delstop"; string PathTrimal = "/usr/bin/trimal"; @@ -491,7 +603,8 @@ void main(string[] args) { string ARG_C; string ARG_F; string ARG_R; - + bool enableCodon = false; + if (args.length > 1){ foreach (int i; 0 .. cast(int)args.length) { switch (args[i]) { @@ -522,6 +635,9 @@ void main(string[] args) { i++; ARG_T = args[i].to!int; break; + case "--codon": + enableCodon = true; + break; case "--fastp": i++; PathFastp = args[i]; @@ -550,6 +666,10 @@ void main(string[] args) { i++; PathBcftools = args[i]; break; + case "--exonerate": + i++; + PathExonerate = args[i]; + break; case "--macse": i++; PathMacse = args[i]; @@ -586,6 +706,7 @@ void main(string[] args) { PathBowtie2 = getValueFromConfig(ARG_C, "bowtie2"); PathSamtools = getValueFromConfig(ARG_C, "samtools"); PathBcftools = getValueFromConfig(ARG_C, "bcftools"); + PathExonerate = getValueFromConfig(ARG_C, "exonerate"); PathMacse = getValueFromConfig(ARG_C, "macse"); PathDelstop = getValueFromConfig(ARG_C, "delstop"); PathTrimal = getValueFromConfig(ARG_C, "trimal"); @@ -644,6 +765,14 @@ void main(string[] args) { } } + if (ARG_F == "all" && enableCodon) { + if(testFiles([PathExonerate]) && testStringArray(ARG_G) && testString(ARG_R)){ + processCodon(ARG_G, ARG_R, DirConsensus, PathExonerate); //ARG_G + } else { + throw new Exception("please confirm paramenters are correct"); + } + } + if (ARG_F == "all" || ARG_F == "align") { if(testFiles([PathMacse]) && testJava && testStringArray(ARG_G)){ processAlign(ARG_G, DirConsensus, DirAlign, PathMacse); //ARG_G diff --git a/RGBEPP.sh b/RGBEPP.sh deleted file mode 100755 index a2eae20..0000000 --- a/RGBEPP.sh +++ /dev/null @@ -1,371 +0,0 @@ -#!/bin/bash - -### Environment Setting - -pkgver=0.0.2 - -DirHome=$(pwd) -DirRaw=$DirHome/00_raw -DirQcTrim=$DirHome/01_fastp -DirAssembly=$DirHome/02_spades -DirFasta=$DirHome/03_assemblied -DirMap=$DirHome/04_diamond -DirPre=$DirHome/05_pre -DirSplit=$DirHome/06_split -DirMerge=$DirHome/07_merge -DirAlign=$DirHome/08_macse - -PathSplitfsata=/usr/bin/splitfasta-cpp -PathMacse=/usr/share/java/macse.jar -PathSortdiamond=/usr/bin/sortdiamond - -ARG_C='scaffolds' -ARG_M=16 -ARG_T=8 - -### Get some arrays - -show_help(){ -# echo -e "\t\t\t\t\t\t\t\033[0;31mR\033[0m\033[0;92mG\033[0m\033[0;94mB\033[0m \033[0;33mE\033[0m\033[0;94mP\033[0m\033[0;33mP\033[0m\n\t\t\t\t\tReference Genome based Exon Phylogeny Pipeline\n \ - echo -e "\t\t\t\t\t\t\t\033[0;47;31mR\033[0m\033[0;47;92mG\033[0m\033[0;47;94mB\033[0m\033[0;47m \033[0m\033[0;47;33mE\033[0m\033[0;47;94mP\033[0m\033[0;47;33mP\033[0m\n\t\t\t\t\tReference Genome based Exon Phylogeny Pipeline\n \ - Version: $pkgver\n \ - License: GPL-2.0-only\n \ - Author: Guoyi Zhang\n \ - -c\t--contigs\tcontings type: scaffolds or contigs\n \ - -g\t--genes\t\tgene file path\n \ - -f\t--functions\tfunctions type (optional): all clean \n \ - \t \t\tassembly fasta map pre split merge align\n \ - -h\t--help\t\tshow this information\n \ - -l\t--list\t\tlist file path\n \ - -m\t--memory\tmemory settings (optional, default 16 GB)\n \ - -r\t--reference\treference genome path\n \ - -t\t--threads\tthreads setting (optional, default 8 threads)\n \ - \t--macse\t\tMacse jarfile path\n \ - \t--sortdiamond\tsortdiamond file path\n \ - \t--splitfasta\tsplitfasta file path\n \ - for example: bash $0 -c scaffolds -f all -l list -g genes \ \n \ - \t -r reference.aa.fas \n" -} - -if [ $# -eq 0 ]; then - show_help - exit 1 -else - -ARGS=$(getopt -o c:,f:,g:,h,l:,m:,r:,t: --long contigs:,genes:,functions:,help,list:,memory:,reference:,threads:,macse:,sortdiamond:,splitfasta: -n 'RGBEPP.sh' -- "$@") -if [ $? != 0 ]; then - echo "Failed to parse options." >&2 - exit 1 -fi -eval set -- "$ARGS" - -while true; do - case "$1" in - -c|--contigs) - case "$2" in - "") shift 2 ;; - *) ARG_C=$2; shift 2 ;; - esac ;; - -g|--genes) - case "$2" in - "") shift 2 ;; - *) ARG_G=$2; shift 2 ;; - esac ;; - -f|--functions) - case "$2" in - "") shift 2 ;; - *) ARG_F=$2; shift 2 ;; - esac ;; - -h|--help) - show_help - shift ;; - -l|--list) - case "$2" in - "") shift 2 ;; - *) ARG_L=$2; shift 2 ;; - esac ;; - -m|--memory) - case "$2" in - "") shift 2 ;; - *) ARG_M=$2; shift 2 ;; - esac ;; - -r|--reference) - case "$2" in - "") shift 2 ;; - *) ARG_R=$2; shift 2 ;; - esac ;; - -t|--threads) - case "$2" in - "") shift 2 ;; - *) ARG_T=$2; shift 2 ;; - esac ;; - --macse) - case "$2" in - "") shift 2 ;; - *) PathMacse=$2; shift 2 ;; - esac ;; - --sortdiamond) - case "$2" in - "") shift 2 ;; - *) PathSortdiamond=$2; shift 2 ;; - esac ;; - --splitfasta) - case "$2" in - "") shift 2 ;; - *) PathSplitfsata=$2; shift 2 ;; - esac ;; - --) - shift; break ;; - *) echo "Unknown option: $1" - exit 1 - ;; - esac -done - -fi - -### Get and check some arguments - -check_var() { - local var_name="$1" - local var_value="${!var_name}" # get value - - if [ -z "$var_value" ]; then - echo "Error: $var_name is not set or is empty" - exit 1 - else - echo "$var_name is set to: $var_value" - - case "$var_name" in - "ARG_G") - readarray -t genes < "$var_value" - length_gn=${#genes[@]} - ;; - "ARG_L") - readarray -t full_names < "$var_value" - length_fn=${#full_names[@]} - ;; - "ARG_F") - check_command "cp" - check_command "cd" - check_command "mv" - check_command "find" - check_command "mkdir" - ;; - esac - - fi -} - -check_path(){ - local path_name="$1" - local path_value="${!path_name}" # get value - - # expand ~ - path_value=$(eval echo "$path_value") - - if [ -e "$path_value" ]; then - echo "$path_name exists at: $path_value" - else - echo "Error: $path_name does not exist at: $path_value" - exit 1 - fi -} - -check_command() { - local cmd_name="$1" - - if command -v "$cmd_name" >/dev/null 2>&1; then - echo "$cmd_name command exists." - else - echo "Error: $cmd_name command not found." - exit 1 - fi -} - - -### Quality control && Trimming - -if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "clean" ]; then - - ## Prepare - mkdir -p $DirQcTrim - - check_var "ARG_L" - check_command "fastp" - - readarray -t full_names < "$ARG_L" - length_fn=${#full_names[@]} - - readarray -t genes < "$ARG_G" - length_gn=${#genes[@]} - - ## Quality control and trimming using fastp - for (( i=0; i<$length_fn; i++ )); do - fastp -i $DirRaw/${full_names[$i]}_R1.fastq.gz -I $DirRaw/${full_names[$i]}_R2.fastq.gz -j $DirQcTrim/${full_names[$i]}.json -h $DirQcTrim/${full_names[$i]}.html -o $DirQcTrim/${full_names[$i]}_R1.fastq.gz -O $DirQcTrim/${full_names[$i]}_R2.fastq.gz -w $ARG_T - done - -fi - -### De novo assembly - -if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "assembly" ]; then - - ## Prepare - mkdir -p $DirAssembly - - check_var "ARG_L" - check_command "spades.py" - - ## De novo assembly using spades - for (( i=0; i<$length_fn; i++ )); do - mkdir -p $DirAssembly/${full_names[$i]} - spades.py --pe1-1 $DirQcTrim/${full_names[$i]}_R1.fastq.gz --pe1-2 $DirQcTrim/${full_names[$i]}_R2.fastq.gz -t $ARG_T -m $ARG_M --careful --phred-offset 33 -o $DirAssembly/${full_names[$i]} - # -k 96,107,117,127 \ - done - -fi - -### Moving scaffords or Contigs out - -if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "fasta" ]; then - - ## Check if the contigs type is specified - check_var "ARG_C" - check_var "ARG_L" - - ## Prepare - mkdir -p $DirFasta - - ## Move the assemblied fasta file to the folder - if [ "$ARG_C" = "scaffolds" ] || [ "$ARG_C" = "contigs" ] ; then - for (( i=0; i<$length_fn; i++ )); do - cp $DirAssembly/${full_names[$i]}/$ARG_C.fasta $DirFasta/$ARG_C/${full_names[$i]}.fasta - done - fi - -fi - -### Mapping - -if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "map" ]; then - - ## Check if the reference or contigs type is specified - check_var "ARG_C" - check_var "ARG_R" - check_command "diamond" - - ## Prepare - mkdir -p $DirMap - - ## Index reference database - cd $DirFasta/$ARG_C - diamond makedb --db Reference --in $ARG_R - cd - - - ## Blastx for mapping DNA sequences to protein reference sequence - cd $DirFasta/$ARG_C - for (( i=0; i<$length_fn; i++ )); do - diamond blastx -d Reference.dmnd -q ${full_names[$i]}.fasta -o ${full_names[$i]}.m8 \ - --outfmt 6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen gaps ppos qframe qseq - # subject: reference; query: align-aimed - #1: qseqid: Query Seq-id - #2: sseqid: Subject Seq - id - #3: pident: Percentage of identical matches - #4: length: Alignment length - #5: mismatch: Number of mismatches - #6: gapopen: Number of gap openings - #7: qstart: Start of alignment in query - #8: qend: End of alignment in query - #9: sstart: Start of alignment in subject - #10: send: End of alignment in subject - #11: evalue: Expect value - #12: bitscore: Bit score - #13: qlen: Query sequence length - #14: slen: Subject sequence length - #15: gaps: Total number of gaps - #16: ppos: Percentage of positive - scoring matches - #17: qframe: Query frame (frames in blast?) - #18: qseq: Aligned part of query sequence - - done - cd - - - mv $DirFasta/$ARG_C/*.m8 $DirMap - -fi - -if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "pre" ]; then - - mkdir -p $DirPre - - check_var "ARG_L" - check_path "PathSortdiamond" - - # extract fasta file from diamond balst style output - for (( i=0; i<$length_fn; i++ )); do - $PathSortdiamond $DirMap/${full_names[$i]}.m8 $DirPre/${full_names[$i]}.fasta - done -fi - - -if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "split" ]; then - mkdir -p $DirSplit - cd $DirPre - - check_var "ARG_L" - check_path "PathSplitfasta" - - # split fasta into different folders based on sequence names - for (( i=0; i<$length_fn; i++ )); do - $PathSplitfsata ${full_names[$i]}.fasta - done - - # mv to destdir - find . -mindepth 1 -maxdepth 1 -type d -exec mv {} ../$DirSplit \; - cd - -fi - -if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "merge" ]; then - - ## Check if the genes is specified - check_var "ARG_G" - - mkdir -p $DirMerge - cd $DirSplit - - # merge different taxa sequences in same gene to one fasta - for (( i=0; i<$length_gn; i++ )) - do - cd ${genes[$i]} - cat * > ../${genes[$i]}.fasta - cd .. - done - - # mv to destdir - mv *.fasta ../$DirMerge - - cd - - -fi - -if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "align" ]; then - - ## Check if the genes is specified - check_var "ARG_G" - check_command "java" - check_command "parallel" - check_path "PathMacse" - - # current_thread=0 - mkdir -p $DirAlign - mkdir -p $DirAlign/AA && mkdir -p $DirAlign/NT - cd $DirMerge - - # align the sequence based on codon - parallel -j $ARG_T java -jar $PathMacse -prog alignSequences -seq {}.fasta -out_AA ../$DirAlign/AA/{}.fasta -out_NT ../$DirAlign/NT/{}.fasta ::: "${genes[@]}" - - cd - - -fi - diff --git a/config.example b/config.example index f8188f3..5003056 100644 --- a/config.example +++ b/config.example @@ -5,6 +5,7 @@ sortdiamond = /usr/bin/sortdiamond bowtie2 = /usr/bin/bowtie2 samtools = /usr/bin/samtools bcftools = /usr/bin/bcftools +exonerate = /usr/bin/exonerate macse = /usr/share/java/macse.jar delstop = /usr/bin/delstop trimal = /usr/bin/trimal