#!/bin/bash

### Environment Setting

# Pipeline version printed by the help text.
pkgver=0.0.1

# Working directories, one per pipeline stage.
DirRaw=00_raw
DirQcTrim=01_fastp
DirAssembly=02_spades
DirFasta=03_assemblied
DirMap=04_diamond
DirPre=05_pre
DirSplit=06_split
DirMerge=07_merge
DirAlign=08_macse

# Default locations of the external helper programs.
# NOTE: "PathSplitfsata" is misspelled, but the spelling is kept because the
# rest of the script refers to the variable by exactly this name.
PathSplitfsata=~/Downloads/PhD/wes/splitfasta-cpp
PathMacse=/usr/share/java/macse.jar
PathSortdiamond=/home/guoyi/Downloads/PhD/wes/sortdiamond

HELP=false

# Option defaults: contig type, memory (GB) and thread count.
ARG_C='scaffolds'
#ARG_F='all'
ARG_M=16
ARG_T=8

# Options without a default start out empty, so that exported variables of the
# same name in the caller's environment cannot silently select a stage or file.
ARG_F=''
ARG_G=''
ARG_L=''
ARG_R=''
### Help

# Print the banner, the version/license/author lines and the option summary
# to stdout.
# Globals:   pkgver (read)
# Arguments: none ($0 is shown in the usage example)
show_help() {
  # Banner "RGB EPP": every letter gets its own foreground colour on a white
  # background (ANSI SGR escape sequences).
  printf '\t\t\t\t\t\t\t\033[0;47;31mR\033[0m\033[0;47;92mG\033[0m\033[0;47;94mB\033[0m\033[0;47m \033[0m\033[0;47;33mE\033[0m\033[0;47;94mP\033[0m\033[0;47;33mP\033[0m\n'
  printf '\t\t\t\t\tReference Genome based Exon Phylogeny Pipeline\n'
  # pkgver and $0 are passed as %s arguments so that any backslash they contain
  # is printed literally instead of being interpreted as an escape sequence.
  printf ' Version: %s\n' "$pkgver"
  printf ' License: GPL-2.0-only\n'
  printf ' Author: Guoyi Zhang\n'
  printf ' -c\t--contigs\tcontings type: scaffolds or contigs\n'
  printf ' -g\t--genes\t\tgene file path\n'
  printf ' -f\t--functions\tfunctions type (optional): all clean \n'
  printf ' \t \t\tassembly fasta map pre split merge align\n'
  printf ' -h\t--help\t\tshow this information\n'
  printf ' -l\t--list\t\tlist file path\n'
  printf ' -m\t--memory\tmemory settings (optional, default 16 GB)\n'
  printf ' -r\t--reference\treference genome path\n'
  printf ' -t\t--threads\tthreads setting (optional, default 8 threads)\n'
  printf ' \t--macse\t\tMacse jarfile path\n'
  printf ' \t--sortdiamond\tsortdiamond file path\n'
  printf ' \t--splitfasta\tsplitfasta file path\n'
  # The example ends its first line with a literal backslash (line continuation).
  printf ' for example: bash %s -c scaffolds -f all -l list -g genes \\ \n' "$0"
  printf ' \t -r Reference.exons.aa.fas \n\n'
}
### Parse command-line options

# Without any argument there is nothing to do: show the help and fail.
if [ $# -eq 0 ]; then
  show_help
  exit 1
fi

# Let getopt normalise short and long options; it prints its own diagnostics
# under the name RGBEPP.sh.
if ! ARGS=$(getopt -o c:f:g:hl:m:r:t: \
  --long contigs:,genes:,functions:,help,list:,memory:,reference:,threads:,macse:,sortdiamond:,splitfasta: \
  -n 'RGBEPP.sh' -- "$@"); then
  echo "Failed to parse options." >&2
  exit 1
fi
# getopt returns a shell-quoted string; eval restores the original word boundaries.
eval set -- "$ARGS"

while true; do
  # Name of the variable that receives the value of the current option.
  opt_var=''
  case "$1" in
    -c | --contigs) opt_var=ARG_C ;;
    -g | --genes) opt_var=ARG_G ;;
    -f | --functions) opt_var=ARG_F ;;
    -l | --list) opt_var=ARG_L ;;
    -m | --memory) opt_var=ARG_M ;;
    -r | --reference) opt_var=ARG_R ;;
    -t | --threads) opt_var=ARG_T ;;
    --macse) opt_var=PathMacse ;;
    --sortdiamond) opt_var=PathSortdiamond ;;
    --splitfasta) opt_var=PathSplitfsata ;;
    -h | --help)
      # Asking for help must not start any pipeline stage.
      show_help
      exit 0
      ;;
    --)
      shift
      break
      ;;
    *)
      echo "Unknown option: $1" >&2
      exit 1
      ;;
  esac

  # An empty value keeps the default that is already stored in the variable.
  if [ -n "$2" ]; then
    printf -v "$opt_var" '%s' "$2"
  fi
  shift 2
done
unset opt_var
### Get and check some arguments

# Verify that the global variable whose NAME is given is set and non-empty,
# report its value, and load whatever depends on it:
#   ARG_G  -> lines of the gene file go into the array `genes`,
#             their count into `length_gn`
#   ARG_L  -> lines of the list file go into the array `full_names`,
#             their count into `length_fn`
#   ARG_F  -> the basic utilities used by the stages are checked with
#             check_command
# Arguments: $1 - name of the variable to check (not its value)
# Outputs:   confirmation on stdout, errors on stderr
# Exits:     1 if the variable is empty or its list/gene file cannot be read
check_var() {
  local var_name="$1"
  local var_value="${!var_name}" # value of the variable named by $1

  if [ -z "$var_value" ]; then
    echo "Error: $var_name is not set or is empty" >&2
    exit 1
  fi
  echo "$var_name is set to: $var_value"

  # A missing file must stop the script here; otherwise the arrays stay empty
  # and every later loop silently does nothing.
  case "$var_name" in
    "ARG_G" | "ARG_L")
      if [ -d "$var_value" ] || [ ! -r "$var_value" ]; then
        echo "Error: $var_name file cannot be read: $var_value" >&2
        exit 1
      fi
      ;;
  esac

  case "$var_name" in
    "ARG_G")
      readarray -t genes < "$var_value"
      length_gn=${#genes[@]}
      ;;
    "ARG_L")
      readarray -t full_names < "$var_value"
      length_fn=${#full_names[@]}
      ;;
    "ARG_F")
      check_command "cp"
      check_command "cd"
      check_command "mv"
      check_command "find"
      check_command "mkdir"
      ;;
  esac
}
# Verify that the path stored in the global variable whose NAME is given exists.
# A leading "~" or "~/" in the stored value is replaced by $HOME for the check.
# Arguments: $1 - name of the variable holding the path (not the path itself)
# Outputs:   confirmation on stdout, error on stderr
# Exits:     1 if nothing exists at the path
check_path() {
  local path_name="$1"
  local path_value="${!path_name}" # value of the variable named by $1

  # Expand a leading tilde by hand. Running the value through `eval` would also
  # execute any command substitution embedded in it and would collapse
  # whitespace inside the path.
  case "$path_value" in
    "~") path_value=$HOME ;;
    "~/"*) path_value=$HOME/${path_value#"~/"} ;;
  esac

  if [ -e "$path_value" ]; then
    echo "$path_name exists at: $path_value"
  else
    echo "Error: $path_name does not exist at: $path_value" >&2
    exit 1
  fi
}
# Verify that a command with the given name can be found by the shell.
# Arguments: $1 - command name
# Outputs:   confirmation on stdout, error on stderr
# Exits:     1 if the command is not found
check_command() {
  local cmd_name="$1"

  if ! command -v "$cmd_name" >/dev/null 2>&1; then
    echo "Error: $cmd_name command not found." >&2
    exit 1
  fi
  echo "$cmd_name command exists."
}
### Quality control && Trimming

# Run fastp on the paired reads of every sample named in the list file (-l).
# Reads <sample>_R1/_R2.fastq.gz from $DirRaw and writes the processed pair
# plus a .json and an .html report per sample to $DirQcTrim.
# Globals: ARG_L, ARG_T, DirRaw, DirQcTrim (read); full_names (set by check_var)
run_qc_trim() {
  # check_var loads the sample names into full_names. The gene file is not
  # needed by this stage, so it is not read here.
  check_var "ARG_L"
  check_command "fastp"

  ## Prepare
  mkdir -p "$DirQcTrim"

  ## Quality control and trimming using fastp
  local sample
  for sample in "${full_names[@]}"; do
    fastp -i "$DirRaw/${sample}_R1.fastq.gz" -I "$DirRaw/${sample}_R2.fastq.gz" \
      -j "$DirQcTrim/${sample}.json" -h "$DirQcTrim/${sample}.html" \
      -o "$DirQcTrim/${sample}_R1.fastq.gz" -O "$DirQcTrim/${sample}_R2.fastq.gz" \
      -w "$ARG_T"
  done
}

if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "clean" ]; then
  run_qc_trim
fi
### De novo assembly

# Assemble every sample named in the list file (-l) with spades.py, using the
# trimmed read pairs in $DirQcTrim. Each sample gets its own output directory
# below $DirAssembly.
# Globals: ARG_L, ARG_T (threads), ARG_M (memory), DirQcTrim, DirAssembly (read);
#          full_names (set by check_var)
run_assembly() {
  check_var "ARG_L"
  check_command "spades.py"

  ## Prepare
  mkdir -p "$DirAssembly"

  ## De novo assembly using spades
  local sample
  for sample in "${full_names[@]}"; do
    mkdir -p "$DirAssembly/$sample"
    spades.py --pe1-1 "$DirQcTrim/${sample}_R1.fastq.gz" --pe1-2 "$DirQcTrim/${sample}_R2.fastq.gz" \
      -t "$ARG_T" -m "$ARG_M" --careful --phred-offset 33 -o "$DirAssembly/$sample"
    # optional explicit k-mer sizes (disabled): -k 96,107,117,127
  done
}

if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "assembly" ]; then
  run_assembly
fi
### Moving scaffolds or contigs out

# Copy the assembled <ARG_C>.fasta of every sample from its assembly directory
# to $DirFasta/<ARG_C>/<sample>.fasta.
# Globals: ARG_C, ARG_L, DirAssembly, DirFasta (read); full_names (set by check_var)
# Exits:   1 if ARG_C is neither "scaffolds" nor "contigs"
collect_fasta() {
  ## Check if the contigs type is specified
  check_var "ARG_C"
  check_var "ARG_L"

  # Reject an unknown type instead of silently copying nothing.
  case "$ARG_C" in
    scaffolds | contigs) ;;
    *)
      echo "Error: ARG_C must be scaffolds or contigs, got: $ARG_C" >&2
      exit 1
      ;;
  esac

  ## Prepare
  # The copies go into a per-type subdirectory, which has to exist first.
  mkdir -p "$DirFasta/$ARG_C"

  ## Move the assembled fasta file to the folder
  local sample
  for sample in "${full_names[@]}"; do
    cp "$DirAssembly/$sample/$ARG_C.fasta" "$DirFasta/$ARG_C/$sample.fasta"
  done
}

if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "fasta" ]; then
  collect_fasta
fi
### Mapping

# Build a DIAMOND database from the reference (-r) and run `diamond blastx` for
# every sample's fasta in $DirFasta/<ARG_C>; the tabular results are written to
# $DirMap/<sample>.m8. The database (Reference.dmnd) is kept in
# $DirFasta/<ARG_C>.
# All paths are passed explicitly and the working directory is never changed,
# so a relative reference path keeps working.
# Globals: ARG_C, ARG_L, ARG_R, DirFasta, DirMap (read); full_names (set by check_var)
run_mapping() {
  ## Check if the reference or contigs type is specified
  check_var "ARG_C"
  check_var "ARG_L" # loads full_names; needed when this stage runs on its own
  check_var "ARG_R"
  check_command "diamond"

  local fasta_dir="$DirFasta/$ARG_C"
  if [ ! -d "$fasta_dir" ]; then
    echo "Error: fasta directory does not exist: $fasta_dir" >&2
    exit 1
  fi

  ## Prepare
  mkdir -p "$DirMap"

  ## Index reference database
  if ! diamond makedb --db "$fasta_dir/Reference" --in "$ARG_R"; then
    echo "Error: diamond makedb failed for: $ARG_R" >&2
    exit 1
  fi

  ## Blastx for mapping DNA sequences to protein reference sequence
  # Output columns (subject: reference; query: sequence to be aligned):
  #  1: qseqid:   Query Seq-id
  #  2: sseqid:   Subject Seq-id
  #  3: pident:   Percentage of identical matches
  #  4: length:   Alignment length
  #  5: mismatch: Number of mismatches
  #  6: gapopen:  Number of gap openings
  #  7: qstart:   Start of alignment in query
  #  8: qend:     End of alignment in query
  #  9: sstart:   Start of alignment in subject
  # 10: send:     End of alignment in subject
  # 11: evalue:   Expect value
  # 12: bitscore: Bit score
  # 13: qlen:     Query sequence length
  # 14: slen:     Subject sequence length
  # 15: gaps:     Total number of gaps
  # 16: ppos:     Percentage of positive-scoring matches
  # 17: qframe:   Query frame (frames in ECPP.sh)
  # 18: qseq:     Aligned part of query sequence
  local sample
  for sample in "${full_names[@]}"; do
    diamond blastx -d "$fasta_dir/Reference.dmnd" -q "$fasta_dir/$sample.fasta" -o "$DirMap/$sample.m8" \
      --outfmt 6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen gaps ppos qframe qseq
  done
}

if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "map" ]; then
  run_mapping
fi
### Pre-processing of the mapping results

# Run the sortdiamond helper on every sample: it is called with the sample's
# $DirMap/<sample>.m8 as first argument and $DirPre/<sample>.fasta as second.
# Globals: ARG_L, PathSortdiamond, DirMap, DirPre (read); full_names (set by check_var)
run_pre() {
  check_var "ARG_L"
  check_path "PathSortdiamond"

  mkdir -p "$DirPre"

  local sample
  for sample in "${full_names[@]}"; do
    "$PathSortdiamond" "$DirMap/$sample.m8" "$DirPre/$sample.fasta"
  done
}

if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "pre" ]; then
  run_pre
fi
### Splitting

# Run the splitfasta helper on every <sample>.fasta inside $DirPre, then move
# every directory that now exists directly below $DirPre into $DirSplit.
# Globals: ARG_L, PathSplitfsata, DirPre, DirSplit (read); full_names (set by check_var)
run_split() {
  # All checks happen before the working directory changes, so that a relative
  # list-file path is still resolved from where the script was started.
  check_var "ARG_L"
  # The variable really is spelled "PathSplitfsata" throughout the script.
  check_path "PathSplitfsata"

  mkdir -p "$DirSplit"

  # Resolve everything that is needed after the `cd` to absolute paths.
  local start_dir=$PWD
  local split_tool=$PathSplitfsata
  case "$split_tool" in
    /*) ;;
    "~/"*) split_tool=$HOME/${split_tool#"~/"} ;;
    */*) split_tool=$start_dir/$split_tool ;;
  esac
  local split_dest=$DirSplit
  case "$split_dest" in
    /*) ;;
    *) split_dest=$start_dir/$split_dest ;;
  esac

  if ! cd "$DirPre"; then
    echo "Error: cannot enter directory: $DirPre" >&2
    exit 1
  fi

  local sample
  for sample in "${full_names[@]}"; do
    "$split_tool" "$sample.fasta"
  done
  find . -mindepth 1 -maxdepth 1 -type d -exec mv {} "$split_dest" \;

  cd "$start_dir" || exit 1
}

if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "split" ]; then
  run_split
fi
### Merging

# For every gene named in the gene file (-g), concatenate all files inside
# $DirSplit/<gene> into $DirMerge/<gene>.fasta.
# The working directory is never changed: a missing gene directory is reported
# and skipped instead of making the following steps run in the wrong place.
# Globals: ARG_G, DirSplit, DirMerge (read); genes (set by check_var)
run_merge() {
  ## Check if the genes is specified
  check_var "ARG_G"

  mkdir -p "$DirMerge"

  local gene
  for gene in "${genes[@]}"; do
    if [ ! -d "$DirSplit/$gene" ]; then
      echo "Warning: no directory for gene $gene in $DirSplit, skipped" >&2
      continue
    fi
    cat "$DirSplit/$gene"/* > "$DirMerge/$gene.fasta"
  done
}

if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "merge" ]; then
  run_merge
fi
### Alignment

# Run MACSE (`-prog alignSequences`) through GNU parallel, one job per gene
# named in the gene file (-g), with at most $ARG_T jobs at a time.
# Input is $DirMerge/<gene>.fasta; the outputs requested via -out_AA / -out_NT
# go to $DirAlign/AA/<gene>.fasta and $DirAlign/NT/<gene>.fasta.
# All paths are given relative to the start directory and the working
# directory is never changed, so a relative --macse path keeps working.
# Globals: ARG_G, ARG_T, PathMacse, DirMerge, DirAlign (read); genes (set by check_var)
run_align() {
  ## Check if the genes is specified
  check_var "ARG_G"
  check_command "java"
  check_command "parallel"
  check_path "PathMacse"

  mkdir -p "$DirAlign/AA" "$DirAlign/NT"

  # parallel substitutes each gene name for {}.
  parallel -j "$ARG_T" java -jar "$PathMacse" -prog alignSequences \
    -seq "$DirMerge/{}.fasta" \
    -out_AA "$DirAlign/AA/{}.fasta" -out_NT "$DirAlign/NT/{}.fasta" \
    ::: "${genes[@]}"
}

if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "align" ]; then
  run_align
fi