polish: add args, polish all process

2024-07-03 10:41:39 +10:00 · 2024-07-03 10:41:39 +10:00 · 74c2dde71e
commit 74c2dde71e
parent d639d38659
1 changed files with 221 additions and 23 deletions
--- a/batch.sh
+++ b/batch.sh
@ -1,44 +1,242 @@
+#!/bin/bash
+
 ### Environment Setting

+# Pipeline version, shown in the --help output.
+pkgver=0.0.1
+
+# Working directories, one per pipeline stage. They are used as relative
+# paths, so the script has to be started from the project root.
 DirRaw=00_raw
 DirQcTrim=01_fastp
 DirAssembly=02_spades
+DirFasta=03_contig
+DirMap=04_map
+DirPre=05_pre
+DirSplit=06_split
+DirMerge=07_merge
+DirAlign=08_align
+
+# Locations of the external helpers. The defaults are the original hard-coded
+# paths; each one can be overridden by exporting a variable of the same name,
+# so the script also works on machines with a different layout.
+PathSplitfsata=${PathSplitfsata:-$HOME/Downloads/PhD/wes/splitfasta-cpp}
+PathMacse=${PathMacse:-/usr/share/java/macse.jar}
+PathSortdiamond=${PathSortdiamond:-/home/guoyi/Downloads/PhD/wes/sortdiamond}
+
+# Checked before the sample list is loaded; "false" means a normal run.
+HELP=false

 ### Get some arrays

-cd $DirRaw
+# Defaults for the options the help text documents as optional. getopt only
+# yields an empty "$2" when the user passes an explicit empty string, so the
+# per-option fallbacks below never fire for an option that is simply omitted.
+ARG_F=all
+ARG_M=16
+ARG_T=8
+
+# Normalise the command line with getopt so long options are supported.
+if ! ARGS=$(getopt -o c:,f:,h,l:,m:,r:,t: --long contig:,functions:,help,list:,memory:,reference:,threads: -n 'batch.sh' -- "$@"); then
+    echo "Failed to parse options." >&2
+    exit 1
+fi
+eval set -- "$ARGS"

-readarray -t full_names < <(ls | awk -F '_' '{print $1 "_" $2 "_" $3 "_" $4}' | uniq)
-readarray -t species_names < <(ls | awk -F '_' '{print $2 "_" $3}' | uniq)
-readarray -t output_names < <(ls | awk -F '_' '{print $2 "_" $3 "_" $4}' | uniq)
+# Consume the normalised options; every value option falls back to its
+# default when it is given an empty string.
+while true; do
+    case "$1" in
+        -c|--contig)
+            ARG_C=${2:-scaffolds}; shift 2 ;;
+        -f|--functions)
+            ARG_F=${2:-all}; shift 2 ;;
+        -h|--help)
+            echo -e "\t\t\t\t\tExon Phylogeny Pipeline\n \
+		       Version: $pkgver\n \
+		       License: GPL-3.0-only\n \
+		       Author: Guoyi Zhang\n \
+                      -c\t--contig\tcontig type: scaffolds or contigs\n \
+                      -f\t--functions\tfunctions type (optional): all clean assembly fasta map pre split merge align\n \
+                      -h\t--help\thelp: show this information\n \
+                      -l\t--list\tlist file path\n \
+                      -m\t--memory\tmemory settings (optional, default 16 GB)\n \
+                      -r\t--reference\treference genome path\n \
+                      -t\t--threads\tthreads setting (optional, default 8 threads)\n \
+                      for example: bash $0 -c scaffolds -f all -l list -r Reference.exons.aa.fas \n"
+            HELP=true
+            # Asking for help must never start the pipeline.
+            exit 0 ;;
+        -l|--list)
+            # An empty value leaves ARG_L untouched so the later check reports it.
+            if [ -n "$2" ]; then ARG_L=$2; fi
+            shift 2 ;;
+        -m|--memory)
+            ARG_M=${2:-16}; shift 2 ;;
+        -r|--reference)
+            # An empty value leaves ARG_R untouched so the mapping stage reports it.
+            if [ -n "$2" ]; then ARG_R=$2; fi
+            shift 2 ;;
+        -t|--threads)
+            ARG_T=${2:-8}; shift 2 ;;
+        --) shift; break ;;
+        *) echo "Internal error!" >&2; exit 1 ;;
+    esac
+done

-cd ..
+### Get and check some arguments

-length_fn=${#full_names[@]}
-length_sn=${#species_names[@]}
-length_on=${#output_names[@]}
+# Load the sample names (one per line of the list file) unless only the help
+# text was requested.
+if [ "$HELP" = false ]; then
+    if [ -z "$ARG_L" ]; then
+        echo "List argument can't be empty" >&2
+        exit 1
+    fi
+
+    # Stop on a missing or unreadable list: otherwise readarray fails, the
+    # array stays empty and the script carries on with no samples.
+    if [ ! -r "$ARG_L" ]; then
+        echo "List file '$ARG_L' does not exist or is not readable" >&2
+        exit 1
+    fi

-### Check the arrays
-
-if [ $length_fn -ne $length_sn ] || [ $length_fn -ne $length_on ] || [ $length_sn -ne $length_on ]
-then
-  echo "Please check the amount number of arrays"
-  exit 0
+    # Drop blank lines so a trailing empty line does not become an empty
+    # sample name.
+    readarray -t full_names < <(grep -v '^[[:space:]]*$' -- "$ARG_L")
+    length_fn=${#full_names[@]}
 fi

 ### Quality control && Trimming

-mkdir -p $DirQcTrim
+# Runs when the selected function is "all" or "clean".
+if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "clean" ]; then

-for (( i=0; i<$length_fn; i++ )); do
-	fastp -i $DirRaw/${full_names[$i]}_R1.fastq.gz -I $DirRaw/${full_names[$i]}_R2.fastq.gz -j ${species_names[$i]}.json -h ${species_names[$i]}.html -o $DirQcTrim/${output_names[$i]}_R1.fastq.gz -O $DirQcTrim/${output_names[$i]}_R2.fastq.gz -w 4
-done
+	## Prepare
+	mkdir -p "$DirQcTrim"
+
+	## Quality control and trimming using fastp.
+	## Reads <sample>_R1/_R2.fastq.gz from $DirRaw and writes the trimmed pair
+	## plus the JSON/HTML reports into $DirQcTrim.
+	## The thread count falls back to 8 so "-w" is never left without a value.
+	for sample in "${full_names[@]}"; do
+		fastp -i "$DirRaw/${sample}_R1.fastq.gz" -I "$DirRaw/${sample}_R2.fastq.gz" -j "$DirQcTrim/${sample}.json" -h "$DirQcTrim/${sample}.html" -o "$DirQcTrim/${sample}_R1.fastq.gz" -O "$DirQcTrim/${sample}_R2.fastq.gz" -w "${ARG_T:-8}"
+	done
+
+fi

 ### De novo assembly

-mkdir -p $DirAssembly
+# Runs when the selected function is "all" or "assembly".
+if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "assembly" ]; then
+
+	## Prepare
+	mkdir -p "$DirAssembly"
+
+	## De novo assembly using spades, one output folder per sample.
+	## Threads and memory fall back to 8 and 16 so "-t" / "-m" are never left
+	## without a value.
+	for sample in "${full_names[@]}"; do
+		mkdir -p "$DirAssembly/$sample"
+		spades.py --pe1-1 "$DirQcTrim/${sample}_R1.fastq.gz" --pe1-2 "$DirQcTrim/${sample}_R2.fastq.gz" -t "${ARG_T:-8}" -m "${ARG_M:-16}" --careful --phred-offset 33 -o "$DirAssembly/$sample"
+			# -k 96,107,117,127 \
+	done
+
+fi
+
+### Moving scaffolds or contigs out
+
+# Runs when the selected function is "all" or "fasta".
+if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "fasta" ]; then
+
+	## Check that the contig type is specified and is one of the two
+	## files this stage copies
+	if [ -z "$ARG_C" ] ; then
+		echo "Argument of contig type missing." >&2
+		exit 1
+	fi
+	if [ "$ARG_C" != "scaffolds" ] && [ "$ARG_C" != "contigs" ] ; then
+		echo "Contig type must be 'scaffolds' or 'contigs'." >&2
+		exit 1
+	fi
+
+	## Prepare: the copies go into a per-type subfolder, which has to exist
+	## before cp can write into it
+	mkdir -p "$DirFasta/$ARG_C"
+
+	## Copy each sample's assembled fasta file into that folder,
+	## renamed after the sample
+	for sample in "${full_names[@]}"; do
+		cp "$DirAssembly/$sample/$ARG_C.fasta" "$DirFasta/$ARG_C/$sample.fasta"
+	done
+
+fi
+
+### Mapping
+
+# Runs when the selected function is "all" or "map".
+if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "map" ]; then
+
+	## Check if the reference or contigs type is specified
+	if [ -z "$ARG_R" ] || [ -z "$ARG_C" ] ; then
+		echo "Argument of reference or contig type missing." >&2
+		exit 1
+	fi
+
+	## Prepare
+	mkdir -p "$DirMap"
+
+	## Index reference database.
+	## All paths are given from the invocation directory instead of cd-ing
+	## into the contig folder, so a relative reference path still resolves.
+	diamond makedb --db "$DirFasta/$ARG_C/Reference" --in "$ARG_R"
+
+	## Blastx for mapping DNA sequences to protein reference sequence.
+	## The tables are written directly into $DirMap.
+	## subject: reference; query: sequence to be aligned
+	## Output columns:
+	##  1:	qseqid: Query Seq-id
+	##  2:	sseqid: Subject Seq-id
+	##  3:	pident: Percentage of identical matches
+	##  4:	length: Alignment length
+	##  5:	mismatch: Number of mismatches
+	##  6:	gapopen: Number of gap openings
+	##  7:	qstart: Start of alignment in query
+	##  8:	qend: End of alignment in query
+	##  9:	sstart: Start of alignment in subject
+	## 10:	send: End of alignment in subject
+	## 11:	evalue: Expect value
+	## 12:	bitscore: Bit score
+	## 13:	qlen: Query sequence length (length of the aligned sequence)
+	## 14:	slen: Subject sequence length
+	## 15:	gaps: Total number of gaps
+	## 16:	ppos: Percentage of positive-scoring matches
+	## 17:	qframe: Query frame (frames in ECPP.sh)
+	## 18:	qseq: Aligned part of query sequence
+	for sample in "${full_names[@]}"; do
+		diamond blastx -d "$DirFasta/$ARG_C/Reference.dmnd" -q "$DirFasta/$ARG_C/$sample.fasta" -o "$DirMap/$sample.m8" \
+			--outfmt 6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen gaps ppos qframe qseq
+	done
+
+fi
+
+# Runs when the selected function is "all" or "pre": calls the sortdiamond
+# helper once per sample with the sample's .m8 table and a target fasta path
+# in $DirPre (what the helper does with them is not visible in this script).
+if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "pre" ]; then
+	mkdir -p "$DirPre"
+	for sample in "${full_names[@]}"; do
+		"$PathSortdiamond" "$DirMap/$sample.m8" "$DirPre/$sample.fasta"
+	done
+fi
+
+
+# Runs when the selected function is "all" or "split": calls the splitfasta
+# helper on every sample fasta inside $DirPre, then moves every directory
+# found there into $DirSplit.
+if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "split" ]; then
+	mkdir -p "$DirSplit"
+	# Abort if the folder is missing: the find/mv below would otherwise move
+	# directories out of whatever directory we happen to be in.
+	cd "$DirPre" || { echo "Cannot enter $DirPre" >&2; exit 1; }
+	for sample in "${full_names[@]}"; do
+		"$PathSplitfsata" "$sample.fasta"
+	done
+	find . -mindepth 1 -maxdepth 1 -type d -exec mv {} "../$DirSplit" \;
+	cd - || exit 1
+fi
+
+# Runs when the selected function is "all" or "merge": concatenates the files
+# of every sub-directory of $DirSplit into $DirMerge/<sub-directory>.fasta.
+if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "merge" ]; then
+
+	mkdir -p "$DirMerge"
+	# Glob for directories instead of parsing ls, and never change directory:
+	# a stray regular file in $DirSplit can then no longer derail the loop.
+	for gene_dir in "$DirSplit"/*/; do
+		# An unmatched glob stays literal; skip it.
+		[ -d "$gene_dir" ] || continue
+		gene=$(basename "$gene_dir")
+		# $gene_dir ends in "/", so this expands to every file inside it.
+		cat "$gene_dir"* > "$DirMerge/$gene.fasta"
+	done
+
+fi
+
+# Runs when the selected function is "all" or "align": aligns every merged
+# fasta with MACSE, writing the amino-acid and nucleotide alignments into
+# $DirAlign/AA and $DirAlign/NT.
+if [ "$ARG_F" = "all" ] || [ "$ARG_F" = "align" ]; then
+
+	mkdir -p "$DirAlign/AA" "$DirAlign/NT"
+	# Glob for *.fasta and strip the suffix with basename; the former
+	# sed "s@.fasta@@g" treated "." as a wildcard and was not anchored, so it
+	# could eat parts of a gene name.
+	for merged in "$DirMerge"/*.fasta; do
+		# An unmatched glob stays literal; skip it.
+		[ -e "$merged" ] || continue
+		gene=$(basename "$merged" .fasta)
+		java -jar "$PathMacse" -prog alignSequences -seq "$merged" -out_AA "$DirAlign/AA/$gene.fasta" -out_NT "$DirAlign/NT/$gene.fasta"
+	done
+
+fi

-for (( i=0; i<$length_fn; i++ )); do
-	mkdir -p $DirAssembly/${species_names[$i]} 
-	spades.py --pe1-1 $DirQcTrim/${output_names[$i]}_R1.fastq.gz --pe1-2 $DirQcTrim/${output_names[$i]}_R2.fastq.gz -t 8 -k 97,107,117,127 -m 14 --careful --phred-offset 33 -o $DirAssembly/${species_names[$i]}
-done