#!/usr/bin/env bash
#
# n2fz -- Convert an NCBI-BLAST database into one compressed FASTA format file
#
# The time stamp on the compressed FASTA output file is taken from the
# -prot-metadata.json or -nucl-metadata.json file of the NCBI-BLAST database.
# The default directory location for the compressed FASTA file is
# established by the FASTADIR variable below. Alternatively the -d
# option can be used to specify another directory.
#
# The ability to obtain substantial speed improvement through the use
# of parallel processing likely requires the input NCBI-BLAST database
# be stored on a fast solid-state drive or a RAID array.
#
# The buffering feature of the GNU parallel -k option maintains the order
# of sequences in the NCBI-BLAST database in the FASTA output.
# *** The failure mode of parallel is undefined if it runs out of memory
# *** while buffering multiple outputs from blastdbcmd due to too
# *** many threads being requested. As the incremental speed-up from
# *** running more than about 4 simultaneous instances of blastdbcmd
# *** is marginal, there's no need to go crazy with parallelizing blastdbcmd.
# *** Most of the speed-up may be due to the use of pigz and zstd.
#
# 3rd party add-ons used here:
#    NCBI tools (blastdbcmd), GNU parallel, pigz, zstd, pushover
#
# zstd is much faster and compresses better than gzip/pigz
# and is therefore recommended over gzip if your workflow
# is compatible with zstd.
#
# pushover is optionally used to notify of job start and end.
#
# Author: Warren Gish
# Date: 2024-02-16
# Revised: 2025-10-10
#
# Permission is granted to copy, modify and redistribute this script
# as long as acknowledgement of the original author is maintained.
#
# A pipeline reports failure when any one of its stages fails.
set -o pipefail

# Where the compressed FASTA file is written unless -d or -o says otherwise.
# A non-empty FASTADIR inherited from the environment wins over the default.
FASTADIR="${FASTADIR:-~/db/fasta}"

# Upper limit on the number of threads handed to pigz/zstd
CCORES=64

# Suffixes that, appended to a database path, name its metadata file
# (nucleotide and protein flavours share the same extension).
MEXT=".json"
NMETA="-nucl-metadata${MEXT}"
PMETA="-prot-metadata${MEXT}"

# This script's own name with any leading directory stripped; used in messages
ME=${BASH_SOURCE[0]}
ME=${ME##*/}

# print_usage -- write the help text to stderr.
# Reads the globals ME, FASTADIR and DEFMETHOD at call time, so the defaults
# shown are whatever those variables hold when the function is invoked.
# The blank line before the terminator is part of the output.
print_usage() {
	cat >&2 <<EOF
n2fz -- Dump an NCBI-BLAST database to a compressed FASTA file
        using parallel processing to expedite the process

Usage:  $ME [options] DBNAME

Options:
  -d DIR     Save compressed FASTA file in directory DIR (default $FASTADIR)
  -h         Display this help message
  -j<j>      Attempt to run <j> simultaneous blastdbcmd jobs
  -c<c>      Attempt compression using <c> threads in pigz/zstd
  -f         Force over-write of an existing database
  -m<m>      Use compression method <m> (gzip or zstd, default $DEFMETHOD)
  -n NAME    Use NAME for base name of compressed output file (default DBNAME)
  -o OUTFILE Full filename of output file is OUTFILE, including any desired extension
  -t<type>   Dump the <type> of database (nucl, prot or guess)
  -v         Enable verbose mode
  -p         Issue start/end notifications via Pushover (if installed)
  -Q         Report the realpath of the specified database and exit

EOF
}

# Directory holding this script: everything before the last "/" of the path
# it was invoked by, or "." when that path contains no "/" at all.
N2DIR=${BASH_SOURCE[0]%/*}
if [[ $N2DIR == "${BASH_SOURCE[0]}" ]]; then
	N2DIR=.
fi

# The helper library must sit next to this script; source it or give up.
if [[ -r $N2DIR/n2lib ]]; then
	. "$N2DIR/n2lib"
else
	>&2 echo "$ME: required library 'n2lib' not found in $N2DIR"
	>&2 echo "$ME: download it from https://blast.advbiocomp.com/ncbihelper and install it in $N2DIR"
	exit 1
fi

MYDIR="$N2DIR"

# COWORKER is the sister script that does the real work:
#   * its -Q option locates the database, so that logic is not repeated here;
#   * it is then run again to dump the database as FASTA, which this script
#     compresses.
COWORKER="n2f"

# Default compression method: zstd when it is installed, gzip otherwise
DEFMETHOD="gzip"
if command -v zstd &>/dev/null; then
	DEFMETHOD=zstd
fi
CMETHOD=$DEFMETHOD

VERBOSE=""
PUSHOVER=":" # No pushover notifications by default

# Resolve COWORKER to something runnable: a PATH hit is preferred, then an
# executable copy in this script's own directory.
COWLOC=$(command -v "$COWORKER" 2>/dev/null)
if [[ -n $COWLOC ]]; then
	COWORKER="$COWLOC"
elif [[ -x "${MYDIR}/${COWORKER}" ]]; then
	COWORKER="${MYDIR}/${COWORKER}"
else
	>&2 echo "$ME: Could not find executable helper script: $COWORKER"
	exit 1
fi

# Parse the command line.
#
# Variables set here (all start out unset):
#   BCORES   "-j<n>" passed through to COWORKER        (-j)
#   DBTYPE   "-t<type>" passed through to COWORKER     (-t)
#   FORCE    non-empty to overwrite an up-to-date file (-f)
#   BASENAME base name for the output file             (-n)
#   OUTFILE  complete output filename                  (-o, "-" means stdout)
#   QUERY    non-empty to only report the database path (-Q)
# FASTADIR, CCORES, CMETHOD, PUSHOVER and VERBOSE override their defaults.
# -n and -o are mutually exclusive; whichever comes second triggers the error.
unset BCORES DBTYPE FORCE OUTFILE BASENAME QUERY
while getopts ":d:fhc:j:m:n:o:pQt:v" opt; do
	case $opt in
		d )
		FASTADIR="$OPTARG"
		;;
		f )
		FORCE=1
		;;
		h )
		print_usage
		exit 0
		;;
		j )
		BCORES="-j$OPTARG"
		;;
		c )
		CCORES="$OPTARG"
		;;
		m )
		CMETHOD="$OPTARG"
		;;
		n )
		BASENAME="$OPTARG"
		if [[ -n $OUTFILE ]]; then
			print_usage
			>&2 echo "Error: -o and -n options can not both be used"
			exit 1
		fi
		;;
		o )
		OUTFILE="$OPTARG"
		if [[ -n $BASENAME ]]; then
			print_usage
			>&2 echo "Error: -n and -o options can not both be used"
			exit 1
		fi
		[[ $OUTFILE = "-" ]] && OUTFILE="/dev/stdout"
		;;
		p )
		# Fall back to the no-op ":" when pushover is not installed
		PUSHOVER=$(command -v pushover 2>/dev/null || echo ":")
		;;
		Q )
		# Documented in the usage text; handled once DBNAME is known
		QUERY=1
		;;
		t )
		DBTYPE="-t$OPTARG"
		;;
		v )
		VERBOSE="-v"
		;;
		\? )
		print_usage
		>&2 echo "$ME: invalid option: -$OPTARG"
		exit 1
		;;
		: )
		print_usage
		>&2 echo "$ME: the -$OPTARG option requires an argument"
		exit 1
		;;
	esac
done
shift $((OPTIND - 1))

# Exactly one positional argument (the database name) must remain
if (( $# != 1 )); then
	print_usage
	>&2 echo "Specify a database name to dump to a compressed FASTA file."
	exit 1
fi

# -Q: report where the database lives and stop. The lookup is delegated to
# COWORKER's own -Q option, which prints the path on stdout; its exit status
# becomes ours. This runs before any output-directory checks because nothing
# is written in this mode.
if [[ -n $QUERY ]]; then
	"$COWORKER" -Q "$1"
	exit $?
fi

# Normalise the output directory and make it visible to child processes.
# NOTE(review): expand_path is not defined in this file -- presumably it
# comes from n2lib; confirm what it does with a leading "~".
FASTADIR=$(expand_path "$FASTADIR")
export FASTADIR
[[ -d ${FASTADIR} ]] || {
	>&2 echo "$ME: directory for storing compressed FASTA file does not exist: $FASTADIR"
	exit 1
}

# Translate the requested method (any prefix, all-lower or all-upper case)
# into a compression utility family (CUTIL) and an output extension (EXT).
# A pigz request is recorded as "gzip" here.
case "$CMETHOD" in
	z | zs | zst | zstd | Z | ZS | ZST | ZSTD)
		CUTIL=zstd
		EXT="zst"
		;;
	pz | pzs | pzst | pzstd | PZ | PZS | PZST | PZSTD)
		CUTIL=pzstd
		EXT="zst"
		;;
	g | gz | gzi | gzip | G | GZ | GZI | GZIP | p | pi | pig | pigz | P | PI | PIG | PIGZ)
		CUTIL=gzip
		EXT="gz"
		;;
	*)
		print_usage
		>&2 echo "$ME: unrecognized compression method: $CMETHOD"
		exit 1
		;;
esac
DB="$1"

# Ask the sister script where the database really lives; it prints the path
# on stdout. Any failure there ends this script as well.
if ! DBASE="$("${COWORKER}" -Q "$DB")"; then
	exit 1
fi

# Pick the file whose timestamp will be copied onto the output file: the
# protein metadata file when present, otherwise the nucleotide one.
# dbt records which of the two was found (p or n).
if [[ -f ${DBASE}${PMETA} ]]; then
	dbt=p
	TSFILE="${DBASE}${PMETA}"
elif [[ -f ${DBASE}${NMETA} ]]; then
	dbt=n
	TSFILE="${DBASE}${NMETA}"
else
	# Neither metadata file exists next to the database
	DBPATH="${DBASE%/*}"
	>&2 echo "$ME: $MEXT file for \"${DB}\" database not found in $DBPATH"
	exit 1
fi

# Unless -o supplied a complete filename, build one in FASTADIR from the -n
# base name (or the database name when -n was not given) plus ".fa.<EXT>".
if [[ -z $OUTFILE ]]; then
	OUTFILE="${FASTADIR}/${BASENAME:-$DB}.fa.${EXT}"
fi

# Refuse to redo work: if a regular output file already exists and carries
# exactly the same timestamp as TSFILE (neither is newer than the other),
# stop unless -f was given. Outputs under /dev/ (e.g. /dev/stdout) are exempt.
if [[ $OUTFILE != "/dev/"* && -e $OUTFILE ]]; then
	if [[ ! $FORCE && ! $TSFILE -nt $OUTFILE && ! $OUTFILE -nt $TSFILE ]]; then
		>&2 echo "$ME: output file with latest timestamp already exists"
		exit 1
	fi
fi


# Limit the compression thread count to between 1 and the core count.
# NOTE(review): n2lib_determine_cores, min and max are not defined in this
# file -- presumably they come from n2lib, with n2lib_determine_cores setting
# PCORES and min/max taking variable *names*; confirm against n2lib.
n2lib_determine_cores
CCORES=$(min PCORES CCORES)
CCORES=$(max CCORES 1)

if [[ -n $VERBOSE ]]; then
	if [[ -n $BCORES ]]; then
		>&2 echo "$ME: simultaneous dumper jobs: ${BCORES#-j}"
	else
		>&2 echo "$ME: simultaneous dumper jobs: determined by $COWORKER"
	fi
	>&2 echo "$ME: compression threads: $CCORES"
fi

# Turn the method family into an actual program (CUTIL) and turn CCORES from
# a number into the thread option that program understands.
if [[ "$CUTIL" = "gzip" ]]; then
	if command -v pigz &>/dev/null ; then
# pigz will utilize all cores _and_ hyperthreads unless limited
		CUTIL="pigz"
		CCORES="-p${CCORES}"
	else
		# Plain gzip is single-threaded here: no thread option is passed
		>&2 echo "$ME: compression utility \"pigz\" not found... using \"gzip\" instead."
		CUTIL="gzip"
		CCORES=""
	fi
else
	# Every non-gzip method, including a "pzstd" request, is run with zstd
	CUTIL="zstd"
	CCORES="-T${CCORES}"
fi

# Full path of the chosen compressor; without it nothing can be written
COMPRESSOR=$(command -v "$CUTIL" 2>/dev/null)
if [[ -z $COMPRESSOR ]]; then
	>&2 echo "$ME: compression utility \"$CUTIL\" not found."
	exit 1
fi


# interrupted -- handler for HUP, INT, TERM and QUIT.
# Reports that OUTFILE is incomplete, sends TERM to this shell's background
# jobs, issues the "interrupted" notification and exits with status 1.
# Globals read: ME, OUTFILE, PUSHOVER, HOSTNAME, DB.
interrupted()
{
	# Ignore further TERM signals while cleaning up
	trap '' TERM
	>&2 echo "$ME: killed. Incomplete output file: $OUTFILE"
	# Only kill child jobs, not ourselves
	jobs -p | xargs -r kill -TERM &> /dev/null
	"$PUSHOVER" "$HOSTNAME $ME $DB interrupted"
	exit 1;
}

trap "interrupted" HUP INT TERM QUIT


# Start notification (a no-op when PUSHOVER is ":"). Each value is quoted so
# a database name containing spaces or glob characters is passed unchanged.
"$PUSHOVER" "$HOSTNAME" "$ME" "$DB" start

# Dump the database with COWORKER and compress its stdout into OUTFILE.
# With pipefail set, rc is non-zero when either stage fails.
# Each optional argument uses ${VAR:+"$VAR"} so that it disappears entirely
# when empty and is passed as exactly one word otherwise.
set -o pipefail
"$COWORKER" ${DBTYPE:+"$DBTYPE"} ${BCORES:+"$BCORES"} ${VERBOSE:+"$VERBOSE"} "$DBASE" \
	| "${COMPRESSOR}" ${CCORES:+"$CCORES"} > "$OUTFILE"
rc=$?

# From here on TERM is ignored
trap '' TERM

# Set timestamp on the output file using the timestamp of TSFILE
# (only after a successful run, and never for outputs under /dev/)
if [[ $rc -eq 0 && $OUTFILE != "/dev/"* ]]; then
	touch -r "$TSFILE" "$OUTFILE"
fi

"$PUSHOVER" "$HOSTNAME" "$ME" "$DB" exit "$rc"

# Report the pipeline's result to the caller; without this the script's exit
# status would be that of the notification command above.
exit "$rc"