#!/bin/bash ### Environment Setting pkgver=0.0.2 DirHome=$(pwd) DirRaw=$DirHome/00_raw DirQcTrim=$DirHome/01_fastp DirAssembly=$DirHome/02_spades DirFasta=$DirHome/03_assemblied DirMap=$DirHome/04_diamond DirPre=$DirHome/05_pre DirSplit=$DirHome/06_split DirMerge=$DirHome/07_merge DirAlign=$DirHome/08_macse PathSplitfsata=/usr/bin/splitfasta-cpp PathMacse=/usr/share/java/macse.jar PathSortdiamond=/usr/bin/sortdiamond ARG_C='scaffolds' ARG_M=16 ARG_T=8 ### Get some arrays show_help(){ # echo -e "\t\t\t\t\t\t\t\033[0;31mR\033[0m\033[0;92mG\033[0m\033[0;94mB\033[0m \033[0;33mE\033[0m\033[0;94mP\033[0m\033[0;33mP\033[0m\n\t\t\t\t\tReference Genome based Exon Phylogeny Pipeline\n \ echo -e "\t\t\t\t\t\t\t\033[0;47;31mR\033[0m\033[0;47;92mG\033[0m\033[0;47;94mB\033[0m\033[0;47m \033[0m\033[0;47;33mE\033[0m\033[0;47;94mP\033[0m\033[0;47;33mP\033[0m\n\t\t\t\t\tReference Genome based Exon Phylogeny Pipeline\n \ Version: $pkgver\n \ License: GPL-2.0-only\n \ Author: Guoyi Zhang\n \ -c\t--contigs\tcontings type: scaffolds or contigs\n \ -g\t--genes\t\tgene file path\n \ -f\t--functions\tfunctions type (optional): all clean \n \ \t \t\tassembly fasta map pre split merge align\n \ -h\t--help\t\tshow this information\n \ -l\t--list\t\tlist file path\n \ -m\t--memory\tmemory settings (optional, default 16 GB)\n \ -r\t--reference\treference genome path\n \ -t\t--threads\tthreads setting (optional, default 8 threads)\n \ \t--macse\t\tMacse jarfile path\n \ \t--sortdiamond\tsortdiamond file path\n \ \t--splitfasta\tsplitfasta file path\n \ for example: bash $0 -c scaffolds -f all -l list -g genes \ \n \ \t -r reference.aa.fas \n" } if [ $# -eq 0 ]; then show_help exit 1 else ARGS=$(getopt -o c:,f:,g:,h,l:,m:,r:,t: --long contigs:,genes:,functions:,help,list:,memory:,reference:,threads:,macse:,sortdiamond:,splitfasta: -n 'RGBEPP.sh' -- "$@") if [ $? != 0 ]; then echo "Failed to parse options." >&2 exit 1 fi eval set -- "$ARGS" while true; do case "$1" in -c|--contigs) case "$2" in "") shift 2 ;; *) ARG_C=$2; shift 2 ;; esac ;; -g|--genes) case "$2" in "") shift 2 ;; *) ARG_G=$2; shift 2 ;; esac ;; -f|--functions) case "$2" in "") shift 2 ;; *) ARG_F=$2; shift 2 ;; esac ;; -h|--help) show_help shift ;; -l|--list) case "$2" in "") shift 2 ;; *) ARG_L=$2; shift 2 ;; esac ;; -m|--memory) case "$2" in "") shift 2 ;; *) ARG_M=$2; shift 2 ;; esac ;; -r|--reference) case "$2" in "") shift 2 ;; *) ARG_R=$2; shift 2 ;; esac ;; -t|--threads) case "$2" in "") shift 2 ;; *) ARG_T=$2; shift 2 ;; esac ;; --macse) case "$2" in "") shift 2 ;; *) PathMacse=$2; shift 2 ;; esac ;; --sortdiamond) case "$2" in "") shift 2 ;; *) PathSortdiamond=$2; shift 2 ;; esac ;; --splitfasta) case "$2" in "") shift 2 ;; *) PathSplitfsata=$2; shift 2 ;; esac ;; --) shift; break ;; *) echo "Unknown option: $1" exit 1 ;; esac done fi ### Get and check some arguments check_var() { local var_name="$1" local var_value="${!var_name}" # get value if [ -z "$var_value" ]; then echo "Error: $var_name is not set or is empty" exit 1 else echo "$var_name is set to: $var_value" case "$var_name" in "ARG_G") readarray -t genes < "$var_value" length_gn=${#genes[@]} ;; "ARG_L") readarray -t full_names < "$var_value" length_fn=${#full_names[@]} ;; "ARG_F") check_command "cp" check_command "cd" check_command "mv" check_command "find" check_command "mkdir" ;; esac fi } check_path(){ local path_name="$1" local path_value="${!path_name}" # get value # expand ~ path_value=$(eval echo "$path_value") if [ -e "$path_value" ]; then echo "$path_name exists at: $path_value" else echo "Error: $path_name does not exist at: $path_value" exit 1 fi } check_command() { local cmd_name="$1" if command -v "$cmd_name" >/dev/null 2>&1; then echo "$cmd_name command exists." else echo "Error: $cmd_name command not found." exit 1 fi } ### Quality control && Trimming if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "clean" ]; then ## Prepare mkdir -p $DirQcTrim check_var "ARG_L" check_command "fastp" readarray -t full_names < "$ARG_L" length_fn=${#full_names[@]} readarray -t genes < "$ARG_G" length_gn=${#genes[@]} ## Quality control and trimming using fastp for (( i=0; i<$length_fn; i++ )); do fastp -i $DirRaw/${full_names[$i]}_R1.fastq.gz -I $DirRaw/${full_names[$i]}_R2.fastq.gz -j $DirQcTrim/${full_names[$i]}.json -h $DirQcTrim/${full_names[$i]}.html -o $DirQcTrim/${full_names[$i]}_R1.fastq.gz -O $DirQcTrim/${full_names[$i]}_R2.fastq.gz -w $ARG_T done fi ### De novo assembly if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "assembly" ]; then ## Prepare mkdir -p $DirAssembly check_var "ARG_L" check_command "spades.py" ## De novo assembly using spades for (( i=0; i<$length_fn; i++ )); do mkdir -p $DirAssembly/${full_names[$i]} spades.py --pe1-1 $DirQcTrim/${full_names[$i]}_R1.fastq.gz --pe1-2 $DirQcTrim/${full_names[$i]}_R2.fastq.gz -t $ARG_T -m $ARG_M --careful --phred-offset 33 -o $DirAssembly/${full_names[$i]} # -k 96,107,117,127 \ done fi ### Moving scaffords or Contigs out if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "fasta" ]; then ## Check if the contigs type is specified check_var "ARG_C" check_var "ARG_L" ## Prepare mkdir -p $DirFasta ## Move the assemblied fasta file to the folder if [ "$ARG_C" = "scaffolds" ] || [ "$ARG_C" = "contigs" ] ; then for (( i=0; i<$length_fn; i++ )); do cp $DirAssembly/${full_names[$i]}/$ARG_C.fasta $DirFasta/$ARG_C/${full_names[$i]}.fasta done fi fi ### Mapping if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "map" ]; then ## Check if the reference or contigs type is specified check_var "ARG_C" check_var "ARG_R" check_command "diamond" ## Prepare mkdir -p $DirMap ## Index reference database cd $DirFasta/$ARG_C diamond makedb --db Reference --in $ARG_R cd - ## Blastx for mapping DNA sequences to protein reference sequence cd $DirFasta/$ARG_C for (( i=0; i<$length_fn; i++ )); do diamond blastx -d Reference.dmnd -q ${full_names[$i]}.fasta -o ${full_names[$i]}.m8 \ --outfmt 6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen gaps ppos qframe qseq # subject: reference; query: align-aimed #1: qseqid: Query Seq-id #2: sseqid: Subject Seq - id #3: pident: Percentage of identical matches #4: length: Alignment length #5: mismatch: Number of mismatches #6: gapopen: Number of gap openings #7: qstart: Start of alignment in query #8: qend: End of alignment in query #9: sstart: Start of alignment in subject #10: send: End of alignment in subject #11: evalue: Expect value #12: bitscore: Bit score #13: qlen: Query sequence length #14: slen: Subject sequence length #15: gaps: Total number of gaps #16: ppos: Percentage of positive - scoring matches #17: qframe: Query frame (frames in blast?) #18: qseq: Aligned part of query sequence done cd - mv $DirFasta/$ARG_C/*.m8 $DirMap fi if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "pre" ]; then mkdir -p $DirPre check_var "ARG_L" check_path "PathSortdiamond" # extract fasta file from diamond balst style output for (( i=0; i<$length_fn; i++ )); do $PathSortdiamond $DirMap/${full_names[$i]}.m8 $DirPre/${full_names[$i]}.fasta done fi if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "split" ]; then mkdir -p $DirSplit cd $DirPre check_var "ARG_L" check_path "PathSplitfasta" # split fasta into different folders based on sequence names for (( i=0; i<$length_fn; i++ )); do $PathSplitfsata ${full_names[$i]}.fasta done # mv to destdir find . -mindepth 1 -maxdepth 1 -type d -exec mv {} ../$DirSplit \; cd - fi if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "merge" ]; then ## Check if the genes is specified check_var "ARG_G" mkdir -p $DirMerge cd $DirSplit # merge different taxa sequences in same gene to one fasta for (( i=0; i<$length_gn; i++ )) do cd ${genes[$i]} cat * > ../${genes[$i]}.fasta cd .. done # mv to destdir mv *.fasta ../$DirMerge cd - fi if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "align" ]; then ## Check if the genes is specified check_var "ARG_G" check_command "java" check_command "parallel" check_path "PathMacse" # current_thread=0 mkdir -p $DirAlign mkdir -p $DirAlign/AA && mkdir -p $DirAlign/NT cd $DirMerge # align the sequence based on codon parallel -j $ARG_T java -jar $PathMacse -prog alignSequences -seq {}.fasta -out_AA ../$DirAlign/AA/{}.fasta -out_NT ../$DirAlign/NT/{}.fasta ::: "${genes[@]}" cd - fi