RGBEPP/RGBEPP.sh

263 lines
7.1 KiB
Bash

#!/bin/bash
### Environment Setting
pkgver=0.0.1
DirRaw=00_raw
DirQcTrim=01_fastp
DirAssembly=02_spades
DirFasta=03_contig
DirMap=04_map
DirPre=05_pre
DirSplit=06_split
DirMerge=07_merge
DirAlign=08_align
PathSplitfsata=~/Downloads/PhD/wes/splitfasta-cpp
PathMacse=/usr/share/java/macse.jar
PathSortdiamond=/home/guoyi/Downloads/PhD/wes/sortdiamond
HELP=false
### Get some arrays
ARGS=$(getopt -o c:,f:,g:,h,l:,m:,r:,t: --long contigs:,genes:,functions:,help,list:,memory:,reference:,threads: -n 'RGBEPP.sh' -- "$@")
if [ $? != 0 ]; then
echo "Failed to parse options." >&2
exit 1
fi
eval set -- "$ARGS"
while true; do
case "$1" in
-c|--contigs)
case "$2" in
"") ARG_C='scaffolds'; shift 2 ;;
*) ARG_C=$2; shift 2 ;;
esac ;;
-g|--genes)
case "$2" in
"") shift 2 ;;
*) ARG_G=$2; shift 2 ;;
esac ;;
-f|--functions)
case "$2" in
"") ARG_F='all'; shift 2 ;;
*) ARG_F=$2; shift 2 ;;
esac ;;
-h|--help)
echo -e "\t\t\t\t\t\t\tRGB EPP\n\t\t\t\t\tReference Genome based Exon Phylogeny Pipeline\n \
Version: $pkgver\n \
License: GPL-3.0-only\n \
Author: Guoyi Zhang\n \
-c\t--contigs\tcontings type: scaffolds or contigs\n \
-g\t--genes\t\tgene file path\n \
-f\t--functions\tfunctions type (optional): all clean \n \
\t \t\tassembly fasta map pre split merge align\n \
-h\t--help\t\thelp: show this information\n \
-l\t--list\t\tlist file path\n \
-m\t--memory\tmemory settings (optional, default 16 GB)\n \
-r\t--reference\treference genome path\n \
-t\t--threads\tthreads setting (optional, default 8 threads)\n \
for example: bash $0 -c scaffolds -f all -l list -g genes -r Reference.exons.aa.fas \n"
HELP=true
shift ;;
-l|--list)
case "$2" in
"") shift 2 ;;
*) ARG_L=$2; shift 2 ;;
esac ;;
-m|--memory)
case "$2" in
"") ARG_M=16; shift 2 ;;
*) ARG_M=$2; shift 2 ;;
esac ;;
-r|--reference)
case "$2" in
"") shift 2 ;;
*) ARG_R=$2; shift 2 ;;
esac ;;
-t|--threads)
case "$2" in
"") ARG_T=8; shift 2 ;;
*) ARG_T=$2; shift 2 ;;
esac ;;
--) shift; break ;;
*) echo "Internal error!"; exit 1 ;;
esac
done
### Get and check some arguments
if [ "$HELP" = false ]; then
if [ -z "$ARG_L" ]; then
echo "List argument can't be empty"
exit 1
fi
readarray -t full_names < "$ARG_L"
readarray -t genes < "$ARG_G"
length_fn=${#full_names[@]}
length_gn=${#genes[@]}
fi
### Quality control && Trimming
if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "clean" ]; then
## Prepare
mkdir -p $DirQcTrim
## Quality control and trimming using fastp
for (( i=0; i<$length_fn; i++ )); do
fastp -i $DirRaw/${full_names[$i]}_R1.fastq.gz -I $DirRaw/${full_names[$i]}_R2.fastq.gz -j $DirQcTrim/${full_names[$i]}.json -h $DirQcTrim/${full_names[$i]}.html -o $DirQcTrim/${full_names[$i]}_R1.fastq.gz -O $DirQcTrim/${full_names[$i]}_R2.fastq.gz -w $ARG_T
done
fi
### De novo assembly
if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "assembly" ]; then
## Prepare
mkdir -p $DirAssembly
## De novo assembly using spades
for (( i=0; i<$length_fn; i++ )); do
mkdir -p $DirAssembly/${full_names[$i]}
spades.py --pe1-1 $DirQcTrim/${full_names[$i]}_R1.fastq.gz --pe1-2 $DirQcTrim/${full_names[$i]}_R2.fastq.gz -t $ARG_T -m $ARG_M --careful --phred-offset 33 -o $DirAssembly/${full_names[$i]}
# -k 96,107,117,127 \
done
fi
### Moving scaffords or Contigs out
if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "fasta" ]; then
## Check if the contigs type is specified
if [ -z "$ARG_C" ] ; then
echo "Argument of contig type missing."
exit 1
fi
## Prepare
mkdir -p $DirFasta
## Move the assemblied fasta file to the folder
if [ "$ARG_C" = "scaffolds" ] || [ "$ARG_C" = "contigs" ] ; then
for (( i=0; i<$length_fn; i++ )); do
cp $DirAssembly/${full_names[$i]}/$ARG_C.fasta $DirFasta/$ARG_C/${full_names[$i]}.fasta
done
fi
fi
### Mapping
if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "map" ]; then
## Check if the reference or contigs type is specified
if [ -z "$ARG_R" ] || [ -z "$ARG_C" ] ; then
echo "Argument of reference or contig type missing."
exit 1
fi
## Prepare
mkdir -p $DirMap
## Index reference database
cd $DirFasta/$ARG_C
diamond makedb --db Reference --in $ARG_R
cd -
## Blastx for mapping DNA sequences to protein reference sequence
cd $DirFasta/$ARG_C
for (( i=0; i<$length_fn; i++ )); do
diamond blastx -d Reference.dmnd -q ${full_names[$i]}.fasta -o ${full_names[$i]}.m8 \
--outfmt 6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen gaps ppos qframe qseq
# subject: reference; query: align-aimed
#1: qseqid: Query Seq-id
#2: sseqid: Subject Seq - id
#3: pident: Percentage of identical matches
#4: length: Alignment length
#5: mismatch: Number of mismatches
#6: gapopen: Number of gap openings
#7: qstart: Start of alignment in query
#8: qend: End of alignment in query
#9: sstart: Start of alignment in subject
#10: send: End of alignment in subject
#11: evalue: Expect value
#12: bitscore: Bit score
#13: qlen: Query sequence length 比对序列长度
#14: slen: Subject sequence length
#15: gaps: Total number of gaps
#16: ppos: Percentage of positive - scoring matches
#17: qframe: Query frame (frames in ECPP.sh)
#18: qseq: Aligned part of query sequence
done
cd -
mv $DirFasta/$ARG_C/*.m8 $DirMap
fi
if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "pre" ]; then
mkdir -p $DirPre
for (( i=0; i<$length_fn; i++ )); do
$PathSortdiamond $DirMap/${full_names[$i]}.m8 $DirPre/${full_names[$i]}.fasta
done
fi
if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "split" ]; then
mkdir -p $DirSplit
cd $DirPre
for (( i=0; i<$length_fn; i++ )); do
$PathSplitfsata ${full_names[$i]}.fasta
done
find . -mindepth 1 -maxdepth 1 -type d -exec mv {} ../$DirSplit \;
cd -
fi
if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "merge" ]; then
## Check if the genes is specified
if [ -z "$ARG_G" ] ; then
echo "Argument of genes list missing."
exit 1
fi
mkdir -p $DirMerge
cd $DirSplit
for (( i=0; i<$length_gn; i++ ))
do
cd ${genes[$i]}
cat * > ../${genes[$i]}.fasta
cd ..
done
mv *.fasta ../$DirMerge
cd -
fi
if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "align" ]; then
## Check if the genes is specified
if [ -z "$ARG_G" ] ; then
echo "Argument of genes list missing."
exit 1
fi
mkdir -p $DirAlign
mkdir -p $DirAlign/AA && mkdir -p $DirAlign/NT
cd $DirMerge
for (( i=0; i<$length_gn; i++ ))
do
java -jar $PathMacse -prog alignSequences -seq ${genes[$i]}.fasta -out_AA ../$DirAlign/AA/${genes[$i]}.fasta -out_NT ../$DirAlign/NT/${genes[$i]}.fasta
done
cd -
fi