polish: add args, polish all process
This commit is contained in:
parent
d639d38659
commit
74c2dde71e
1 changed files with 221 additions and 23 deletions
244
batch.sh
244
batch.sh
|
@ -1,44 +1,242 @@
|
|||
#!/bin/bash
|
||||
|
||||
### Environment Setting
|
||||
|
||||
pkgver=0.0.1
|
||||
DirRaw=00_raw
|
||||
DirQcTrim=01_fastp
|
||||
DirAssembly=02_spades
|
||||
DirFasta=03_contig
|
||||
DirMap=04_map
|
||||
DirPre=05_pre
|
||||
DirSplit=06_split
|
||||
DirMerge=07_merge
|
||||
DirAlign=08_align
|
||||
|
||||
PathSplitfsata=~/Downloads/PhD/wes/splitfasta-cpp
|
||||
PathMacse=/usr/share/java/macse.jar
|
||||
PathSortdiamond=/home/guoyi/Downloads/PhD/wes/sortdiamond
|
||||
|
||||
HELP=false
|
||||
|
||||
### Get some arrays
|
||||
|
||||
cd $DirRaw
|
||||
ARGS=$(getopt -o c:,f:,h,l:,m:,r:,t: --long contig:,functions:,help,list:,memory:,reference:,threads: -n 'batch.sh' -- "$@")
|
||||
if [ $? != 0 ]; then
|
||||
echo "Failed to parse options." >&2
|
||||
exit 1
|
||||
fi
|
||||
eval set -- "$ARGS"
|
||||
|
||||
readarray -t full_names < <(ls | awk -F '_' '{print $1 "_" $2 "_" $3 "_" $4}' | uniq)
|
||||
readarray -t species_names < <(ls | awk -F '_' '{print $2 "_" $3}' | uniq)
|
||||
readarray -t output_names < <(ls | awk -F '_' '{print $2 "_" $3 "_" $4}' | uniq)
|
||||
while true; do
|
||||
case "$1" in
|
||||
-c|--contig)
|
||||
case "$2" in
|
||||
"") ARG_C='scaffolds'; shift 2 ;;
|
||||
*) ARG_C=$2; shift 2 ;;
|
||||
esac ;;
|
||||
-f|--functions)
|
||||
case "$2" in
|
||||
"") ARG_F='all'; shift 2 ;;
|
||||
*) ARG_F=$2; shift 2 ;;
|
||||
esac ;;
|
||||
-h|--help)
|
||||
echo -e "\t\t\t\t\tExon Phylogeny Pipeline\n \
|
||||
Version: $pkgver\n \
|
||||
License: GPL-3.0-only\n \
|
||||
Author: Guoyi Zhang\n \
|
||||
-c\t--contig\tcontings type: scaffolds or contigs\n \
|
||||
-f\t--functions\tfunctions type (optional): all clean assembly fasta map pre\n \
|
||||
-h\t--help\thelp: show this information\n \
|
||||
-l\t--list\tlist file path\n \
|
||||
-m\t--memory\tmemory settings (optional, default 16 GB)\n \
|
||||
-r\t--reference\treference genome path\n \
|
||||
-t\t--threads\tthreads setting (optional, default 8 threads)\n \
|
||||
for example: bash $0 -c scaffolds -f all -l list -r Reference.exons.aa.fas \n"
|
||||
HELP=true
|
||||
shift ;;
|
||||
-l|--list)
|
||||
case "$2" in
|
||||
"") shift 2 ;;
|
||||
*) ARG_L=$2; shift 2 ;;
|
||||
esac ;;
|
||||
-m|--memory)
|
||||
case "$2" in
|
||||
"") ARG_M=16; shift 2 ;;
|
||||
*) ARG_M=$2; shift 2 ;;
|
||||
esac ;;
|
||||
-r|--reference)
|
||||
case "$2" in
|
||||
"") shift 2 ;;
|
||||
*) ARG_R=$2; shift 2 ;;
|
||||
esac ;;
|
||||
-t|--threads)
|
||||
case "$2" in
|
||||
"") ARG_T=8; shift 2 ;;
|
||||
*) ARG_T=$2; shift 2 ;;
|
||||
esac ;;
|
||||
--) shift; break ;;
|
||||
*) echo "Internal error!"; exit 1 ;;
|
||||
esac
|
||||
done
|
||||
|
||||
cd ..
|
||||
### Get and check some arguments
|
||||
|
||||
length_fn=${#full_names[@]}
|
||||
length_sn=${#species_names[@]}
|
||||
length_on=${#output_names[@]}
|
||||
if [ "$HELP" = false ]; then
|
||||
if [ -z "$ARG_L" ]; then
|
||||
echo "List argument can't be empty"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
### Check the arrays
|
||||
|
||||
if [ $length_fn -ne $length_sn ] || [ $length_fn -ne $length_on ] || [ $length_sn -ne $length_on ]
|
||||
then
|
||||
echo "Please check the amount number of arrays"
|
||||
exit 0
|
||||
readarray -t full_names < "$ARG_L"
|
||||
length_fn=${#full_names[@]}
|
||||
fi
|
||||
|
||||
### Quality control && Trimming
|
||||
|
||||
mkdir -p $DirQcTrim
|
||||
if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "clean" ]; then
|
||||
|
||||
for (( i=0; i<$length_fn; i++ )); do
|
||||
fastp -i $DirRaw/${full_names[$i]}_R1.fastq.gz -I $DirRaw/${full_names[$i]}_R2.fastq.gz -j ${species_names[$i]}.json -h ${species_names[$i]}.html -o $DirQcTrim/${output_names[$i]}_R1.fastq.gz -O $DirQcTrim/${output_names[$i]}_R2.fastq.gz -w 4
|
||||
done
|
||||
## Prepare
|
||||
mkdir -p $DirQcTrim
|
||||
|
||||
## Quality control and trimming using fastp
|
||||
for (( i=0; i<$length_fn; i++ )); do
|
||||
fastp -i $DirRaw/${full_names[$i]}_R1.fastq.gz -I $DirRaw/${full_names[$i]}_R2.fastq.gz -j $DirQcTrim/${full_names[$i]}.json -h $DirQcTrim/${full_names[$i]}.html -o $DirQcTrim/${full_names[$i]}_R1.fastq.gz -O $DirQcTrim/${full_names[$i]}_R2.fastq.gz -w $ARG_T
|
||||
done
|
||||
|
||||
fi
|
||||
|
||||
### De novo assembly
|
||||
|
||||
mkdir -p $DirAssembly
|
||||
if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "assembly" ]; then
|
||||
|
||||
## Prepare
|
||||
mkdir -p $DirAssembly
|
||||
|
||||
## De novo assembly using spades
|
||||
for (( i=0; i<$length_fn; i++ )); do
|
||||
mkdir -p $DirAssembly/${full_names[$i]}
|
||||
spades.py --pe1-1 $DirQcTrim/${full_names[$i]}_R1.fastq.gz --pe1-2 $DirQcTrim/${full_names[$i]}_R2.fastq.gz -t $ARG_T -m $ARG_M --careful --phred-offset 33 -o $DirAssembly/${full_names[$i]}
|
||||
# -k 96,107,117,127 \
|
||||
done
|
||||
|
||||
fi
|
||||
|
||||
### Moving scaffords or Contigs out
|
||||
|
||||
if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "fasta" ]; then
|
||||
|
||||
## Check if the contigs type is specified
|
||||
if [ -z "$ARG_C" ] ; then
|
||||
echo "Argument of contig type missing."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
## Prepare
|
||||
mkdir -p $DirFasta
|
||||
|
||||
## Move the assemblied fasta file to the folder
|
||||
if [ "$ARG_C" = "scaffolds" ] || [ "$ARG_C" = "contigs" ] ; then
|
||||
for (( i=0; i<$length_fn; i++ )); do
|
||||
cp $DirAssembly/${full_names[$i]}/$ARG_C.fasta $DirFasta/$ARG_C/${full_names[$i]}.fasta
|
||||
done
|
||||
fi
|
||||
|
||||
fi
|
||||
|
||||
### Mapping
|
||||
|
||||
if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "map" ]; then
|
||||
|
||||
## Check if the reference or contigs type is specified
|
||||
if [ -z "$ARG_R" ] || [ -z "$ARG_C" ] ; then
|
||||
echo "Argument of reference or contig type missing."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
## Prepare
|
||||
mkdir -p $DirMap
|
||||
|
||||
## Index reference database
|
||||
cd $DirFasta/$ARG_C
|
||||
diamond makedb --db Reference --in $ARG_R
|
||||
cd -
|
||||
|
||||
## Blastx for mapping DNA sequences to protein reference sequence
|
||||
cd $DirFasta/$ARG_C
|
||||
for (( i=0; i<$length_fn; i++ )); do
|
||||
diamond blastx -d Reference.dmnd -q ${full_names[$i]}.fasta -o ${full_names[$i]}.m8 \
|
||||
--outfmt 6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen gaps ppos qframe qseq
|
||||
# subject: reference; query: align-aimed
|
||||
#1: qseqid: Query Seq-id
|
||||
#2: sseqid: Subject Seq - id
|
||||
#3: pident: Percentage of identical matches
|
||||
#4: length: Alignment length
|
||||
#5: mismatch: Number of mismatches
|
||||
#6: gapopen: Number of gap openings
|
||||
#7: qstart: Start of alignment in query
|
||||
#8: qend: End of alignment in query
|
||||
#9: sstart: Start of alignment in subject
|
||||
#10: send: End of alignment in subject
|
||||
#11: evalue: Expect value
|
||||
#12: bitscore: Bit score
|
||||
#13: qlen: Query sequence length 比对序列长度
|
||||
#14: slen: Subject sequence length
|
||||
#15: gaps: Total number of gaps
|
||||
#16: ppos: Percentage of positive - scoring matches
|
||||
#17: qframe: Query frame (frames in ECPP.sh)
|
||||
#18: qseq: Aligned part of query sequence
|
||||
|
||||
done
|
||||
cd -
|
||||
|
||||
mv $DirFasta/$ARG_C/*.m8 $DirMap
|
||||
|
||||
fi
|
||||
|
||||
if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "pre" ]; then
|
||||
mkdir -p $DirPre
|
||||
for (( i=0; i<$length_fn; i++ )); do
|
||||
$PathSortdiamond $DirMap/${full_names[$i]}.m8 $DirPre/${full_names[$i]}.fasta
|
||||
done
|
||||
fi
|
||||
|
||||
|
||||
if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "split" ]; then
|
||||
mkdir -p $DirSplit
|
||||
cd $DirPre
|
||||
for (( i=0; i<$length_fn; i++ )); do
|
||||
$PathSplitfsata ${full_names[$i]}.fasta
|
||||
done
|
||||
find . -mindepth 1 -maxdepth 1 -type d -exec mv {} ../$DirSplit \;
|
||||
cd -
|
||||
fi
|
||||
|
||||
if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "merge" ]; then
|
||||
|
||||
mkdir -p $DirMerge
|
||||
cd $DirSplit
|
||||
for genes in $(ls)
|
||||
do
|
||||
cd $genes
|
||||
cat * > ../$genes.fasta
|
||||
cd ..
|
||||
done
|
||||
mv *.fasta ../$DirMerge
|
||||
cd -
|
||||
|
||||
fi
|
||||
|
||||
if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "align" ]; then
|
||||
|
||||
mkdir -p $DirAlign
|
||||
mkdir -p $DirAlign/AA && mkdir -p $DirAlign/NT
|
||||
cd $DirMerge
|
||||
for genes in $(ls | sed "s@.fasta@@g")
|
||||
do
|
||||
java -jar $PathMacse -prog alignSequences -seq ${genes}.fasta -out_AA ../$DirAlign/AA/$genes.fasta -out_NT ../$DirAlign/NT/$genes.fasta
|
||||
done
|
||||
cd -
|
||||
|
||||
fi
|
||||
|
||||
for (( i=0; i<$length_fn; i++ )); do
|
||||
mkdir -p $DirAssembly/${species_names[$i]}
|
||||
spades.py --pe1-1 $DirQcTrim/${output_names[$i]}_R1.fastq.gz --pe1-2 $DirQcTrim/${output_names[$i]}_R2.fastq.gz -t 8 -k 97,107,117,127 -m 14 --careful --phred-offset 33 -o $DirAssembly/${species_names[$i]}
|
||||
done
|
||||
|
|
Loading…
Reference in a new issue