polish: add args, polish all process

2024-07-03 10:41:39 +10:00 · 2024-07-03 10:41:39 +10:00 · 74c2dde71e
commit 74c2dde71e
parent d639d38659
1 changed files with 221 additions and 23 deletions
--- a/batch.sh
+++ b/batch.sh
@@ -1,44 +1,242 @@
 #!/bin/bash
 ### Environment Setting
 # Pipeline version (shown in the help text)
 pkgver='0.0.1'
 # Help flag; switched to true when -h/--help is given
 HELP='false'
 # Working directories, one per pipeline step (numeric prefix = step order)
 DirRaw='00_raw'
 DirQcTrim='01_fastp'
 DirAssembly='02_spades'
 DirFasta='03_contig'
 DirMap='04_map'
 DirPre='05_pre'
 DirSplit='06_split'
 DirMerge='07_merge'
 DirAlign='08_align'
 # Locations of the external helper programs
 PathMacse='/usr/share/java/macse.jar'
 PathSortdiamond='/home/guoyi/Downloads/PhD/wes/sortdiamond'
 PathSplitfsata=~/Downloads/PhD/wes/splitfasta-cpp  # unquoted so that ~ is expanded
 ### Get some arrays
 # Reading this hunk: the '-' lines are the removed code that derived the sample
 # arrays from a directory listing; the '+' lines (and, apparently, the unmarked
 # lines between them) are the getopt-based command-line parsing.
 #
 # getopt normalizes the command line into ARGS; a parse failure is reported on
 # stderr and ends the script with status 1. `eval set -- "$ARGS"` then replaces
 # the positional parameters with the normalized list, which the loop consumes:
 #   -c/--contig     -> ARG_C      -f/--functions -> ARG_F
 #   -l/--list       -> ARG_L      -m/--memory    -> ARG_M
 #   -r/--reference  -> ARG_R      -t/--threads   -> ARG_T
 #   -h/--help       prints the usage text and sets HELP=true
 # NOTE(review): the fallback values ('scaffolds', 'all', 16, 8) are assigned
 # only when the option is present with an empty value; nothing in this loop
 # assigns them when the option is left out entirely.
 # NOTE(review): the commas in the -o string look like separators, but short
 # option strings are normally written without them — confirm against getopt(1).
-cd $DirRaw
+ARGS=$(getopt -o c:,f:,h,l:,m:,r:,t: --long contig:,functions:,help,list:,memory:,reference:,threads: -n 'batch.sh' -- "$@")
 if [ $? != 0 ]; then
    echo "Failed to parse options." >&2
    exit 1
 fi
 eval set -- "$ARGS"
-readarray -t full_names < <(ls | awk -F '_' '{print $1 "_" $2 "_" $3 "_" $4}' | uniq)
+while true; do
-readarray -t species_names < <(ls | awk -F '_' '{print $2 "_" $3}' | uniq)
+    case "$1" in
-readarray -t output_names < <(ls | awk -F '_' '{print $2 "_" $3 "_" $4}' | uniq)
+        -c|--contig)
            case "$2" in
                "") ARG_C='scaffolds'; shift 2 ;;
                *) ARG_C=$2; shift 2 ;;
            esac ;;
        -f|--functions)
            case "$2" in
                "") ARG_F='all'; shift 2 ;;
                *) ARG_F=$2; shift 2 ;;
            esac ;;
        -h|--help)
            echo -e "\t\t\t\t\tExon Phylogeny Pipeline\n \
 		       Version: $pkgver\n \
 		       License: GPL-3.0-only\n \
 		       Author: Guoyi Zhang\n \
                      -c\t--contig\tcontings type: scaffolds or contigs\n \
                      -f\t--functions\tfunctions type (optional): all clean assembly fasta map pre\n \
                      -h\t--help\thelp: show this information\n \
                      -l\t--list\tlist file path\n \
                      -m\t--memory\tmemory settings (optional, default 16 GB)\n \
                      -r\t--reference\treference genome path\n \
                      -t\t--threads\tthreads setting (optional, default 8 threads)\n \
                      for example: bash $0 -c scaffolds -f all -l list -r Reference.exons.aa.fas \n"
            HELP=true
            shift ;;
        -l|--list)
            case "$2" in
                "") shift 2 ;;
                *) ARG_L=$2; shift 2 ;;
            esac ;;
        -m|--memory)
            case "$2" in
                "") ARG_M=16; shift 2 ;;
                *) ARG_M=$2; shift 2 ;;
            esac ;;
        -r|--reference)
            case "$2" in
                "") shift 2 ;;
                *) ARG_R=$2; shift 2 ;;
            esac ;;
        -t|--threads)
            case "$2" in
                "") ARG_T=8; shift 2 ;;
                *) ARG_T=$2; shift 2 ;;
            esac ;;
        # '--' ends the option list: drop it and leave the loop
        --) shift; break ;;
        # anything else is unexpected after getopt's normalization
        *) echo "Internal error!"; exit 1 ;;
    esac
 done
 # Unless help was requested, the list argument (-l/--list) is mandatory: the
 # script prints a message and exits with status 1 when ARG_L is empty. The
 # list file is then read into full_names, one array element per line, and
 # length_fn holds the number of entries.
 # NOTE(review): no `fi` closing `if [ "$HELP" = false ]` is visible in this
 # section — confirm where that block ends in the real file.
-cd ..
+### Get and check some arguments
-length_fn=${#full_names[@]}
+if [ "$HELP" = false ]; then
-length_sn=${#species_names[@]}
+    if [ -z "$ARG_L" ]; then
-length_on=${#output_names[@]}
+        echo "List argument can't be empty"
        exit 1
    fi
-### Check the arrays
+    readarray -t full_names < "$ARG_L"
-
+    length_fn=${#full_names[@]}
 # NOTE(review): length_sn and length_on are assigned only on removed ('-')
 # lines above, so if this check is still live it compares against unset
 # variables; it also exits with status 0 on a mismatch — confirm whether it is
 # a leftover of the old array-based version.
 if [ $length_fn -ne $length_sn ] || [ $length_fn -ne $length_on ] || [ $length_sn -ne $length_on ]
 then
  echo "Please check the amount number of arrays"
  exit 0
 fi
 ### Quality control && Trimming
 # With -f all or -f clean: creates $DirQcTrim and, for every sample name in
 # full_names, runs fastp on the read pair $DirRaw/<name>_R1.fastq.gz and
 # $DirRaw/<name>_R2.fastq.gz. The output reads and the JSON/HTML reports are
 # all written into $DirQcTrim under the same sample name; $ARG_T is passed as
 # fastp's -w value.
 # The '-' lines are the removed version, which named its outputs from
 # species_names/output_names and used a fixed -w 4.
-mkdir -p $DirQcTrim
+if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "clean" ]; then
-for (( i=0; i<$length_fn; i++ )); do
+	## Prepare
-	fastp -i $DirRaw/${full_names[$i]}_R1.fastq.gz -I $DirRaw/${full_names[$i]}_R2.fastq.gz -j ${species_names[$i]}.json -h ${species_names[$i]}.html -o $DirQcTrim/${output_names[$i]}_R1.fastq.gz -O $DirQcTrim/${output_names[$i]}_R2.fastq.gz -w 4
+	mkdir -p $DirQcTrim
-done
+
 	## Quality control and trimming using fastp
 	for (( i=0; i<$length_fn; i++ )); do
 		fastp -i $DirRaw/${full_names[$i]}_R1.fastq.gz -I $DirRaw/${full_names[$i]}_R2.fastq.gz -j $DirQcTrim/${full_names[$i]}.json -h $DirQcTrim/${full_names[$i]}.html -o $DirQcTrim/${full_names[$i]}_R1.fastq.gz -O $DirQcTrim/${full_names[$i]}_R2.fastq.gz -w $ARG_T
 	done
 fi
 ### De novo assembly
 # With -f all or -f assembly: for every sample name in full_names, creates
 # $DirAssembly/<name> and runs spades.py on the trimmed read pair from
 # $DirQcTrim, writing into that directory. $ARG_T and $ARG_M are passed as the
 # -t and -m values.
-mkdir -p $DirAssembly
+if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "assembly" ]; then
 	## Prepare
 	mkdir -p $DirAssembly
 	## De novo assembly using spades
 	for (( i=0; i<$length_fn; i++ )); do
 		mkdir -p $DirAssembly/${full_names[$i]} 
 		spades.py --pe1-1 $DirQcTrim/${full_names[$i]}_R1.fastq.gz --pe1-2 $DirQcTrim/${full_names[$i]}_R2.fastq.gz -t $ARG_T -m $ARG_M --careful --phred-offset 33 -o $DirAssembly/${full_names[$i]}
 			# The -k list below is commented out, so no -k option is passed.
 			# -k 96,107,117,127 \
 	done
 fi
 ### Copying scaffolds or contigs out
 # With -f all or -f fasta: for every sample name in full_names, copies
 # $DirAssembly/<name>/<type>.fasta to $DirFasta/<type>/<name>.fasta, where
 # <type> is the -c/--contig value ("scaffolds" or "contigs").
 if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "fasta" ]; then
 	## Check if the contigs type is specified
 	if [ -z "$ARG_C" ] ; then
 		echo "Argument of contig type missing."
 		exit 1
 	fi
 	## Reject unknown types instead of silently copying nothing
 	if [ "$ARG_C" != "scaffolds" ] && [ "$ARG_C" != "contigs" ]; then
 		echo "Contig type must be 'scaffolds' or 'contigs', got '$ARG_C'." >&2
 		exit 1
 	fi
 	## Prepare: the copy target is the per-type subdirectory, so create it too
 	mkdir -p "$DirFasta/$ARG_C"
 	## Copy the assembled fasta file of every sample into the folder
 	for sample in "${full_names[@]}"; do
 		cp "$DirAssembly/$sample/$ARG_C.fasta" "$DirFasta/$ARG_C/$sample.fasta"
 	done
 fi
 ### Mapping
 # With -f all or -f map: builds a DIAMOND database named "Reference" from the
 # reference given with -r inside $DirFasta/<type>, then runs diamond blastx
 # for every sample name in full_names against that database. The tabular
 # result of each sample ends up in $DirMap/<name>.m8.
 if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "map" ]; then
 	## Check if the reference or contigs type is specified
 	if [ -z "$ARG_R" ] || [ -z "$ARG_C" ] ; then
 		echo "Argument of reference or contig type missing."
 		exit 1
 	fi
 	## Prepare
 	mkdir -p "$DirMap"
 	# Everything is addressed by path instead of cd-ing into the fasta
 	# directory: a failed cd can no longer make diamond run in the wrong place,
 	# and a relative -r path is resolved from the directory the script was
 	# started in. A reference that only exists inside the fasta directory
 	# (where the cd-based lookup used to find it) is still accepted.
 	map_dir=$DirFasta/$ARG_C
 	map_ref=$ARG_R
 	if [ ! -e "$map_ref" ] && [ -e "$map_dir/$ARG_R" ]; then
 		map_ref=$map_dir/$ARG_R
 	fi
 	## Index reference database
 	if ! diamond makedb --db "$map_dir/Reference" --in "$map_ref"; then
 		echo "diamond makedb failed for reference '$map_ref'." >&2
 		exit 1
 	fi
 	## Blastx for mapping DNA sequences to protein reference sequence
 	# subject: reference; query: align-aimed
 	# Columns of the tabular output (--outfmt 6), in order:
 	#1:	qseqid: Query Seq-id
 	#2:	sseqid: Subject Seq - id
 	#3:	pident: Percentage of identical matches
 	#4:	length: Alignment length
 	#5:	mismatch: Number of mismatches
 	#6:	gapopen: Number of gap openings
 	#7:	qstart: Start of alignment in query
 	#8:	qend: End of alignment in query
 	#9:	sstart: Start of alignment in subject
 	#10:	send: End of alignment in subject
 	#11:	evalue: Expect value
 	#12:	bitscore: Bit score
 	#13:	qlen: Query sequence length (length of the sequence being aligned)
 	#14:	slen: Subject sequence length
 	#15:	gaps: Total number of gaps
 	#16:	ppos: Percentage of positive - scoring matches
 	#17:	qframe: Query frame (frames in ECPP.sh)
 	#18:	qseq: Aligned part of query sequence
 	for sample in "${full_names[@]}"; do
 		diamond blastx -d "$map_dir/Reference.dmnd" -q "$map_dir/$sample.fasta" -o "$DirMap/$sample.m8" \
 			--outfmt 6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen gaps ppos qframe qseq
 	done
 fi
 ### Pre step (sortdiamond)
 # With -f all or -f pre: for every sample name in full_names, runs the
 # sortdiamond program with $DirMap/<name>.m8 as first argument and
 # $DirPre/<name>.fasta as second argument.
 if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "pre" ]; then
 	mkdir -p "$DirPre"
 	# Expansions are quoted so paths with spaces or glob characters stay intact
 	for sample in "${full_names[@]}"; do
 		"$PathSortdiamond" "$DirMap/$sample.m8" "$DirPre/$sample.fasta"
 	done
 fi
 ### Split step (splitfasta)
 # With -f all or -f split: runs the splitfasta program on <name>.fasta for
 # every sample name in full_names from inside $DirPre, then moves every
 # directory found directly under $DirPre into $DirSplit.
 # NOTE(review): presumably splitfasta creates those directories — confirm.
 if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "split" ]; then
 	mkdir -p "$DirSplit"
 	# Stop if the directory cannot be entered: the find below moves every
 	# directory of the current directory, which must not be the start directory.
 	cd "$DirPre" || { echo "Cannot enter directory '$DirPre'." >&2; exit 1; }
 	for sample in "${full_names[@]}"; do
 		"$PathSplitfsata" "$sample.fasta"
 	done
 	find . -mindepth 1 -maxdepth 1 -type d -exec mv {} "../$DirSplit" \;
 	cd - || exit 1
 fi
 ### Merge step
 # With -f all or -f merge: for every directory directly under $DirSplit,
 # concatenates all files inside it into $DirMerge/<directory name>.fasta.
 if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "merge" ]; then
 	mkdir -p "$DirMerge"
 	# Glob over the directories instead of parsing ls and cd-ing into each one:
 	# a failed cd followed by `cd ..` would walk out of the working tree.
 	for gene_dir in "$DirSplit"/*/; do
 		[ -d "$gene_dir" ] || continue   # pattern matched nothing
 		gene=${gene_dir%/}
 		gene=${gene##*/}
 		cat "$gene_dir"* > "$DirMerge/$gene.fasta"
 	done
 fi
 ### Align step (MACSE)
 # With -f all or -f align: for every <gene>.fasta in $DirMerge, runs the MACSE
 # jar with -prog alignSequences and writes the two outputs to
 # $DirAlign/AA/<gene>.fasta (-out_AA) and $DirAlign/NT/<gene>.fasta (-out_NT).
 if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "align" ]; then
 	mkdir -p "$DirAlign/AA" "$DirAlign/NT"
 	cd "$DirMerge" || { echo "Cannot enter directory '$DirMerge'." >&2; exit 1; }
 	# Strip only the trailing ".fasta": the former `ls | sed "s@.fasta@@g"`
 	# removed every "<any char>fasta" occurrence inside a name.
 	for gene_file in *.fasta; do
 		[ -e "$gene_file" ] || continue   # pattern matched nothing
 		gene=${gene_file%.fasta}
 		java -jar "$PathMacse" -prog alignSequences -seq "$gene_file" -out_AA "../$DirAlign/AA/$gene.fasta" -out_NT "../$DirAlign/NT/$gene.fasta"
 	done
 	cd - || exit 1
 fi
 # Runs spades.py once per index below length_fn, reading the read pair named by
 # output_names from $DirQcTrim and writing to $DirAssembly/<species_names entry>,
 # with fixed -t 8, -m 14 and an explicit -k list.
 # NOTE(review): species_names and output_names are assigned only on removed
 # ('-') lines of this diff, so this loop looks like a leftover of the
 # pre-refactor assembly step — confirm it is not part of the new script.
 for (( i=0; i<$length_fn; i++ )); do
 	mkdir -p $DirAssembly/${species_names[$i]} 
 	spades.py --pe1-1 $DirQcTrim/${output_names[$i]}_R1.fastq.gz --pe1-2 $DirQcTrim/${output_names[$i]}_R2.fastq.gz -t 8 -k 97,107,117,127 -m 14 --careful --phred-offset 33 -o $DirAssembly/${species_names[$i]}
 done