#!/bin/bash
#
# n2f -- Dump an NCBI-BLAST database to stdout in FASTA format
#
# Parallel processing is employed to expedite the conversion.
# The ability to obtain substantial speed improvement through the use
# of parallel processing may require the NCBI-BLAST input database be stored
# on fast solid-state drives or a RAID array. If conventional hard disk drives
# are used instead, this script may actually run far slower than the naive
# single-threaded method.
#
# The NCBI-BLAST database must have been unpacked (untarred) from its *.tar.gz files
# for this script to succeed. This script looks in standard directories for the
# specified NCBI-BLAST database. If you would prefer not to unpack the entire
# database, use the pn2f script that can dump NCBI-BLAST databases still packaged
# in *.tar.gz files.
#
# The buffering feature of GNU parallel's -k option maintains the order
# of sequences in the NCBI-BLAST database in the FASTA output.
# NOTE:	 the failure mode of GNU parallel is undefined if it runs out of memory.
# Hence, this script determines the instantaneous free memory available
# and estimates the maximum number of threads for which blastdbcmd output
# can be buffered.
#
# If only a single job is to be run at one time, then blastdbcmd is invoked
# directly in the normal fashion.
#
# 3rd party add-ons used here:
#	 NCBI tools (for blastdbcmd), GNU parallel
#
# Optionally:
#    ncbidump
#
# pushover is optionally used to notify of job start and end.
#
# Author: Warren Gish
# Date: 2024-02-16
# Revised: 2024-12-03
# Revised: 2026-05-29
# Revised: 2026-06-07
#
# Permission is granted to copy, modify and redistribute this script
# as long as acknowledgement of the original author is maintained.
#
set -o pipefail

# Name this script was invoked as, with any leading path stripped; it
# prefixes every diagnostic message.
ME=${BASH_SOURCE[0]##*/}

# Directory containing this script: everything up to the last '/', or '.'
# when the script path has no directory component at all.
case ${BASH_SOURCE[0]} in
	*/*) N2DIR=${BASH_SOURCE[0]%/*} ;;
	*)   N2DIR=. ;;
esac

# The helper library n2lib must be readable in that same directory;
# without it nothing else in this script can work.
if [[ ! -r $N2DIR/n2lib ]]; then
	{
		echo "$ME: required library 'n2lib' not found in $N2DIR"
		echo "$ME: download it from https://blast.advbiocomp.com/ncbihelper and install it in $N2DIR"
	} >&2
	exit 1
fi
source "$N2DIR/n2lib"

# Default max. no. of dumper jobs to run simultaneously
BCORES=12

# Write the usage summary (synopsis followed by the option list) to stderr.
# Nothing is written to stdout.
print_usage() {
	cat >&2 <<'EOF'
n2f -- Dump an NCBI-BLAST database to stdout in FASTA format
	   using parallel processing to expedite the process

Usage:	n2f [options] DBNAME

Options:
  -h		 Display this help message
  -j<j>		 Attempt to run <j> simultaneous dumper jobs
  -v		 Enable verbose mode (print commands to stderr as executed)
  -p		 Issue start and end notifications via Pushover (if installed)
  -t<type>	 Dump the <type> of database ("nucl", "prot" or "guess")
  -Q		 Query mode: report the realpath of DBNAME and exit

EOF
}

# Scratch file: handed to n2lib_discover_db and later read back one dumper
# entry per line.  The name is generated by mktemp so that it cannot be
# guessed in advance by another user of a shared temporary directory (a
# PID-based name can be pre-planted as a file or symlink).  -u only produces
# a name without creating the file: no EXIT trap has been installed yet at
# this point, so a file created here would be left behind by an early exit.
# If mktemp is unavailable or fails, fall back to the PID-based name.
TMPFILE=$(mktemp -u "${TMPDIR:-/var/tmp}/${ME}.XXXXXX" 2>/dev/null)
if [[ -z $TMPFILE ]]; then
	TMPFILE=${TMPDIR:-/var/tmp}/${ME}.$$
fi
# TMPDIR may or may not end in '/', so the join above can yield "//";
# purely for cosmetics, replace any instance of // with /
TMPFILE="${TMPFILE//\/\///}"
/bin/rm -f "$TMPFILE"

# Passed to GNU parallel when verbose mode is on ("-t"); empty otherwise
VERBOSE=""
j_set="" # whether -j was explicitly specified
PUSHOVER=":" # No pushover notifications by default
# query_mode:  output the realpath of the requested NCBI-BLAST database and quit
query_mode=""

# dbtype_opt stays unset unless -t is given; dbtype_hint is the matching
# one-letter hint handed to n2lib_discover_db
unset dbtype_opt
dbtype_hint=""

# Parse the command line.  The leading ':' in the option string selects
# getopts' silent error reporting: an unknown option arrives as '?' and a
# missing option argument as ':', with the offending letter in OPTARG.
while getopts ":hj:pt:vQ" opt; do
	case $opt in
		h )
		print_usage
		exit 0
		;;
		j )
		# BCORES is later compared with -gt inside [[ ]], where operands are
		# evaluated as arithmetic expressions; accept nothing but a plain
		# decimal integer so arbitrary text can never reach that evaluation.
		if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then
			>&2 echo "$ME: the -j option requires a non-negative integer: $OPTARG"
			exit 1
		fi
		# Force base 10 so leading zeros (e.g. 08) are not read as octal
		BCORES=$((10#$OPTARG))
		j_set=1
		;;
		v )
		VERBOSE="-t"
		;;
		p )
		# Use pushover only if it is installed; otherwise keep the no-op ":"
		PUSHOVER=$(command -v pushover 2>/dev/null || echo ":")
		;;
		Q )
		query_mode=1
		;;
		t )
		# Any prefix of the type name is accepted, plus a few synonyms
		case "$OPTARG" in
			g|gu|gue|gues|guess)
			dbtype_opt=guess
			dbtype_hint=""
			;;
			p|pr|pro|prot|prote|protei|protein|aa|pe|pep|pept|pepti|peptid|peptide)
			dbtype_opt=prot
			dbtype_hint=p
			;;
			n|nu|nuc|nucl|nucle|nucleo|nucleot|nucleoti|nucleotid|nucleotide|nt)
			dbtype_opt=nucl
			dbtype_hint=n
			;;
			* )
			>&2 echo "$ME: Invalid type of database: $OPTARG"
			exit 1
			;;
		esac
		;;
		\? )
		print_usage
		>&2 echo "$ME: invalid option: -$OPTARG"
		exit 1
		;;
		: )
		print_usage
		>&2 echo "$ME: the -$OPTARG option requires an argument"
		exit 1
		;;
	esac
done
# Drop the parsed options; exactly one positional argument (the database) must remain
shift $((OPTIND - 1))
if [[ $# != 1 ]]; then
	print_usage
	>&2 echo "$ME: Specify one NCBI-BLAST database to dump into FASTA format."
	exit 1
fi


# Signal handler: report the interruption on stderr, forward TERM to any
# background jobs of this shell, send a Pushover notice, and exit with
# status 1.
interrupted() {
	# Ignore any further TERM while shutting down
	trap '' TERM
	printf '%s\n' "$ME: killed" >&2
	# Only kill child jobs, not ourselves
	jobs -p | xargs -r kill -TERM &> /dev/null
	"$PUSHOVER" "$HOSTNAME $ME $DB interrupted"
	exit 1
}

# EXIT handler: remove the scratch file named by TMPFILE.
# The expansion is quoted (and preceded by --) so that a TMPDIR containing
# whitespace, glob characters or a leading '-' can neither leave the scratch
# file behind nor cause unrelated paths to be removed.
cleanup()
{
	# Nothing to remove if TMPFILE was never assigned
	[[ -n ${TMPFILE:-} ]] || return 0
	/bin/rm -f -- "${TMPFILE}"
}

# Route hang-up, interrupt, terminate and quit signals to the interrupted handler
trap interrupted HUP INT TERM QUIT

# Run cleanup whenever the script exits, whatever the reason
trap cleanup EXIT


# Locate the NCBI configuration file (.ncbirc) and record its path in the
# global NCBIRC.  NCBIRC is set to the empty string when no readable file
# is found.
#
# Candidates, in order (the first readable one wins):
#   1. ./.ncbirc, then $HOME/.ncbirc -- both skipped when
#      NCBI_DONT_USE_LOCAL_CONFIG is non-empty
#   2. $NCBI/.ncbirc                 -- only when NCBI is non-empty
#   3. $SYSTEMROOT/.ncbirc when SYSTEMROOT is non-empty (a nod to Windows,
#      which isn't really supported); otherwise /etc/.ncbirc (/etc is
#      immutable on modern versions of macOS, but look anyway)
#
# Globals:  NCBIRC (written); NCBI_DONT_USE_LOCAL_CONFIG, PWD, HOME, NCBI,
#           SYSTEMROOT (read)
# Returns:  always 0
function find_ncbirc() {
	local -a candidates=()
	local candidate

	if [[ -z ${NCBI_DONT_USE_LOCAL_CONFIG:-} ]]; then
		candidates+=("${PWD}/.ncbirc" "${HOME}/.ncbirc")
	fi
	if [[ -n ${NCBI:-} ]]; then
		# The path is built from NCBI (the directory), not from NCBIRC
		candidates+=("${NCBI}/.ncbirc")
	fi
	if [[ -n ${SYSTEMROOT:-} ]]; then
		candidates+=("${SYSTEMROOT}/.ncbirc")
	else
		candidates+=("/etc/.ncbirc")
	fi

	# Start from "not found" so a stale value never survives a failed search
	NCBIRC=""
	for candidate in "${candidates[@]}"; do
		if [[ -r $candidate ]]; then
			NCBIRC=$candidate
			return 0
		fi
	done
	return 0
}

# Print the value of the first BLASTDB assignment found inside the [BLAST]
# section of an .ncbirc-style file; print nothing if there is none.
# Whitespace around the key, the '=' and the value is stripped.
#
# Arguments: $1 - configuration file to read (default: $HOME/.ncbirc)
# Outputs:   the BLASTDB value on stdout
get_blastdb_from_ncbirc() {
	local cfg="${1:-$HOME/.ncbirc}"

	awk '
		# Any section header: note whether it is exactly [BLAST]
		/^[[:space:]]*\[/ {
			inside = ($0 ~ /^[[:space:]]*\[BLAST\][[:space:]]*$/) ? 1 : 0
			next
		}

		# First BLASTDB assignment within [BLAST]: emit its value and stop
		inside && /^[[:space:]]*BLASTDB[[:space:]]*=/ {
			value = $0
			sub(/^[[:space:]]*BLASTDB[[:space:]]*=[[:space:]]*/, "", value)
			sub(/[[:space:]]+$/, "", value)
			print value
			exit
		}
	' "$cfg"
}

# Build the database search path in the global BLASTDB (colon-separated):
#   - "." always comes first, followed by any value BLASTDB already had;
#   - unless NCBI_DONT_USE_LOCAL_CONFIG is set, the BLASTDB entry from the
#     [BLAST] section of the .ncbirc located by find_ncbirc is appended.
#
# Globals:  BLASTDB (read and written), NCBIRC (written by find_ncbirc),
#           NCBI_DONT_USE_LOCAL_CONFIG (read)
# Returns:  always 0
function get_blastdb() {
	# Kept local so the helper value does not leak into the caller's scope
	local bdb

# get database search path from the blastdbcmd itself
#	BLASTDB="`blastdbcmd -show_blastdb_search_path`"
	if [ -z "${BLASTDB:-}" ]; then
		BLASTDB="."
	else
		BLASTDB=".:${BLASTDB}"
	fi
	# NOTE(review): this tests whether the variable is set at all (even to
	# the empty string), while find_ncbirc tests for a non-empty value, and
	# returning here also skips find_ncbirc's $NCBI and /etc lookups --
	# confirm that this is the intended behaviour.
	if [ -n "${NCBI_DONT_USE_LOCAL_CONFIG+x}" ]; then
		return 0
	fi

	find_ncbirc
	if [ -z "${NCBIRC:-}" ]; then
		return 0
	fi
	bdb=$(get_blastdb_from_ncbirc "$NCBIRC")
	if [ -n "$bdb" ]; then
		BLASTDB="${BLASTDB}:${bdb}"
	fi
	return 0
}

# The single remaining positional argument names the database to dump.
DB="$1"
DBASE=$DB
case $DBASE in
	*/*)
		# user specified an explicit path to the database: search only
		# that directory, for the bare database name
		directories=("${DBASE%/*}")
		DBASE="${DBASE##*/}"
		;;
	*)
		get_blastdb
		# Split BLASTDB into an array of directories
		IFS=':' read -r -a directories <<< "$BLASTDB"
		;;
esac

# Take the first directory for which n2lib_discover_db reports success
found_dir=""
for dir in "${directories[@]}"; do
	dir=$(expand_path "$dir")
	n2lib_discover_db "$dir" "$DBASE" "$TMPFILE" "$dbtype_hint" || continue
	found_dir="$dir"
	break
done

[[ -n $found_dir ]] || {
	>&2 echo "$ME: Database not found: $DBASE"
	exit 1
}


# Query mode: output the full path to the database and halt
[[ -z $query_mode ]] || {
	echo "$found_dir/$DBASE"
	exit 0
}


# Choose the dumper program.  DUMPER and USING_NCBIDUMP are not assigned
# anywhere in this script -- presumably n2lib_select_dumper sets them.
n2lib_select_dumper

# Options passed to the dumper on every invocation.  DUMPER_OPTS is a plain
# string that is deliberately expanded unquoted below so that it splits into
# separate words.
DUMPER_OPTS="-entry all -ctrl_a"
if [[ ${dbtype_opt+x} ]]; then
	DUMPER_OPTS="${DUMPER_OPTS} -dbtype ${dbtype_opt}"
fi

if [[ $USING_NCBIDUMP -eq 1 ]]; then
	# ncbidump reads a multi-volume database natively, so it is invoked once
	# with the user's database specification rather than iterating over the
	# individual components. A -j setting is forwarded to ncbidump's own -j
	# option, and verbose mode enables ncbidump's -info reporting.
	if [[ -n $j_set ]]; then
		DUMPER_OPTS="${DUMPER_OPTS} -j${BCORES}"
	fi
	if [[ -n $VERBOSE ]]; then
		DUMPER_OPTS="${DUMPER_OPTS} -info"
	fi
	DUMPER_OPTS="${DUMPER_OPTS} -db"

	# Run from the directory in which the database was found
	cd "$found_dir" || {
		>&2 echo "$ME: cannot change into database directory: ${found_dir}"
		exit 1
	}

	# Arguments are quoted so a database name containing whitespace or glob
	# characters reaches the notifier as a single, unmodified word
	"$PUSHOVER" "$HOSTNAME" "$ME" "$1" start

	if [[ -n $VERBOSE ]]; then
		>&2 echo "${DUMPER}" ${DUMPER_OPTS} "$DBASE"
	fi
	"${DUMPER}" ${DUMPER_OPTS} "$DBASE"
	rc=$?

	# Report the dumper's exit status and exit with it
	"$PUSHOVER" "$HOSTNAME" "$ME" "$DBASE" exit "$rc"
	exit "$rc"
fi

# From here on the dumper is run once per line of TMPFILE.
# -db must be the last option: the entry read from TMPFILE is appended as
# its argument.  DUMPER_OPTS is deliberately expanded unquoted below so that
# it splits into separate words.
DUMPER_OPTS="${DUMPER_OPTS} -db"

# Cap the job count.  PCORES and MCORES are not assigned in this script --
# presumably n2lib_determine_cores sets them; min is given variable names.
n2lib_determine_cores
BCORES=$(min PCORES BCORES)
BCORES=$(min BCORES MCORES)

if [[ -n $VERBOSE ]]; then
	>&2 echo "$ME: simultaneous dumper jobs: $BCORES"
fi

# Run from the directory in which the database was found
cd "$found_dir" || {
	>&2 echo "$ME: cannot change into database directory: ${found_dir}"
	exit 1
}

# Number of entries (lines) listed in the scratch file
lines=$(awk 'END{print NR}' "$TMPFILE")

# Arguments are quoted so a database name containing whitespace or glob
# characters reaches the notifier as a single, unmodified word
"$PUSHOVER" "$HOSTNAME" "$ME" "$1" start

if [[ $lines -gt 1 && $BCORES -gt 1 ]]; then
	# Several entries and several jobs allowed: fan out with GNU parallel.
	# -k keeps the output in input order; --halt now,fail=1 stops everything
	# as soon as one job fails.
	# NOTE(review): tty tests standard input, not standard output -- confirm
	# that this is the intended condition for line-buffered output.
	if tty &>/dev/null; then
		FGOPTS="--lb" # this causes output to appear sooner (but slows overall execution)
	else
		FGOPTS=""
	fi
	parallel ${VERBOSE} -k --jobs ${BCORES} $FGOPTS --halt now,fail=1 "${DUMPER}" ${DUMPER_OPTS} {} :::: "${TMPFILE}"
	rc=$?
else
	# Single entry or single job: invoke the dumper directly, one entry at
	# a time, stopping at the first failure
	rc=0
	while IFS= read -r entry; do
		if [[ -n ${VERBOSE} ]]; then
			>&2 echo "${DUMPER}" ${DUMPER_OPTS} "$entry"
		fi
		"${DUMPER}" ${DUMPER_OPTS} "$entry"
		rc=$?
		[[ $rc -ne 0 ]] && break
	done < "$TMPFILE"
fi

# Report the final status and exit with it
"$PUSHOVER" "$HOSTNAME" "$ME" "$DBASE" exit "$rc"

exit "$rc"
