#!/bin/bash
# Working copy of pn2fz, modified to call n2f
#
# pn2f -- Dump a "packaged" NCBI-BLAST database to stdout in FASTA format
#
# Convert a "packaged" NCBI-BLAST database (stored in *.tar.gz files--NOT unpacked)
# to FASTA. The NCBI-BLAST database is transiently unpacked and deleted after FASTA
# conversion to minimize storage demands. Only database components required
# for dumping to FASTA are transiently unpacked--and the ncbidump utility has fewer
# needs than does blastdbcmd.
#
# The actual FASTA dumping is delegated to the companion n2f script (which in turn
# selects ncbidump or blastdbcmd and shares the n2lib support script). pn2f handles
# only the transient unpacking and the cross-volume parallelism; n2f is invoked once
# per volume, both for a single-volume database and for each volume run in parallel.
#
# The ability to obtain substantial speed improvement through the use
# of parallel processing requires the NCBI-BLAST input database be stored
# on fast solid-state drives.
#
# The buffering feature of GNU parallel's -k option maintains the order
# of sequences.
#
# 3rd party add-ons used here (besides GNU parallel and the companion n2f
# script), if ncbidump is not installed:
#    NCBI tools (blastdbcmd)
#
# pushover is optionally used to notify of job start and end.
#
# Author: Warren Gish
# Date: 2024-02-17
# Revised: 2026-05-29
#
# Permission is granted to copy, modify and redistribute this script
# as long as acknowledgement of the original author is maintained.
#

# Default ceiling on the number of dump jobs run at the same time
BCORES=8

# Where the packed (*.tar.gz) NCBI-BLAST databases live
TARDIR=~/mirror/ncbi/blast/db

# File name of this script with any leading directory removed; used as the
# prefix of diagnostic messages and in the temporary directory name
ME="${BASH_SOURCE[0]##*/}"

# Suffixes that, appended to a database name, give the name of its protein
# or nucleotide metadata file
MEXT='.json'
PMETA="-prot-metadata$MEXT"
NMETA="-nucl-metadata$MEXT"

# Print the help text to stderr (nothing is written to stdout, which is
# reserved for the FASTA dump). Takes no arguments.
# The notification switch is documented as -p, the letter the option parser
# actually recognizes.
print_usage() {
	>&2 echo "
pn2f -- Dump a packaged NCBI-BLAST *.tar.gz database to stdout in FASTA format

Tar files are transiently unpacked in a temporary directory and
removed after extraction, to conserve disk storage. One tar file
is present in unpacked form per running blastdbcmd job (-j option).
More simultaneous jobs may yield faster execution but will require
more temporary storage. Use of SSD storage is highly recommended.

Usage:  pn2f [options] DBNAME

Options:
  -h      Display this help message
  -j<j>   Attempt to run <j> simultaneous blastdbcmd jobs
  -v      Enable verbose mode (print commands to stderr as executed)
  -p      Issue start and end notifications via Pushover utility (if installed)
"
}

# Find the directory this script was started from and load the n2lib support
# library out of it. When the script path has no directory component, the
# current directory is used.
case ${BASH_SOURCE[0]} in
	*/*)
		N2DIR=${BASH_SOURCE[0]%/*}
		;;
	*)
		N2DIR=.
		;;
esac
if [[ -r $N2DIR/n2lib ]]; then
	. "$N2DIR/n2lib"
else
	echo "$ME: required library 'n2lib' not found in $N2DIR" >&2
	echo "$ME: download it from https://blast.advbiocomp.com/ncbihelper and install it in $N2DIR" >&2
	exit 1
fi


# Nothing can be done without the directory of packed databases: exit 2 if
# it is absent.
[[ -d ${TARDIR} ]] || {
	echo "$ME: directory does not exist: ${TARDIR}" >&2
	exit 2
}

# Name of this run's private working directory: <tmp root>/<script>.<pid>.
# $TMPDIR may or may not carry a trailing "/", so any trailing slash is
# stripped and exactly one separator is inserted; without this a value such
# as TMPDIR=/tmp would yield "/tmp<script>.<pid>" outside the temporary area.
# /var/tmp is used when TMPDIR is unset or empty.
TMPD=${TMPDIR:-/var/tmp}
TMPD=${TMPD%/}/${ME}.$$
# purely for cosmetics, replace any instance of // with /
TMPD="${TMPD//\/\///}"
# List of volume identifiers to process, one per line
TMPFILE=${TMPD}/filelist
PUSHOVER=":" # No pushover notifications by default

VERBOSE=""	# controls pn2f's own verbose messages
N2F_V=""	# verbose option forwarded to n2f (-v) when requested
# Parse the command line. The leading ":" in the option string puts getopts
# in silent mode, so unknown options and missing arguments are reported by
# the "\?" and ":" arms below. The notification switch is accepted as both
# -p and -P, because the help text has advertised the upper-case spelling.
while getopts ":hj:vpP" opt; do
	case $opt in
		h )
		print_usage
		exit 0
		;;
		j )
		# The job count is later used in arithmetic comparisons and handed
		# to parallel --jobs, so insist on a plain whole number here.
		if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then
			print_usage
			>&2 echo "$ME: the -j option requires a whole number: $OPTARG"
			exit 1
		fi
		# Force base 10 so a value such as 08 is not taken as octal
		BCORES=$((10#$OPTARG))
		;;
		v )
		VERBOSE=1
		N2F_V="-v"
		;;
		p | P )
		# Use the pushover utility when installed; otherwise keep the no-op ":"
		PUSHOVER=$(command -v pushover 2>/dev/null || echo ":")
		;;
		\? )
		print_usage
		>&2 echo "$ME: invalid option: -$OPTARG"
		exit 1
		;;
		: )
		print_usage
		>&2 echo "$ME: the -$OPTARG option requires an argument"
		exit 1
		;;
	esac
done
shift $((OPTIND - 1))
# Exactly one positional argument, the database name, must remain
if [[ $# -ne 1 ]]; then
	print_usage
	>&2 echo "$ME: specify one packed NCBI-BLAST database to dump to stdout in FASTA format."
	exit 1
fi
DB=$1

# The FASTA dumping itself is carried out by n2f, which is expected to sit in
# the same directory as this script (next to n2lib). That directory is turned
# into an absolute path immediately, while the original working directory is
# still current, so that a relative N2DIR keeps working after the later cd
# into the temporary directory.
if ! N2FDIR=$(cd -- "$N2DIR" 2>/dev/null && pwd); then
	echo "$ME: cannot resolve the directory of $ME: $N2DIR" >&2
	exit 2
fi
N2F="${N2FDIR}/n2f"
[[ -x $N2F ]] || {
	echo "$ME: required helper script not found or not executable: $N2F" >&2
	exit 2
}

# Pick the dump utility through n2lib, which prefers ncbidump and falls back
# to blastdbcmd; n2lib_select_dumper is the authority for DUMPER and
# USING_NCBIDUMP. With ncbidump the *.[np]db (LMDB) index files are not needed
# and are intentionally never extracted.
n2lib_select_dumper

# Signal handler for HUP/INT/TERM/QUIT: report the kill, terminate any
# background jobs of this shell, send the "interrupted" notification and
# exit with status 1.
interrupt() {
	# Ignore TERM from here on so the handler is not re-entered
	trap '' TERM
	echo "$ME: killed" >&2
	# Signal only the child jobs, never this shell itself
	jobs -p | xargs -r kill -TERM > /dev/null 2>&1
	"$PUSHOVER" "$HOSTNAME $ME $DB interrupted"
	exit 1
}

# Remove the temporary working directory and everything in it.
cleanup() {
	# Leave the directory first (plain cd goes to $HOME) so the shell is not
	# sitting inside the tree it is about to delete
	cd
	/bin/rm -r -f "$TMPD"
}

# Route the usual termination signals through interrupt ...
trap 'interrupt' HUP INT TERM QUIT

# ... and run cleanup whenever the script exits.
trap 'cleanup' EXIT


# Create the private temporary working directory $TMPD and make it the
# current directory.
# Globals:  TMPD (read), ME (read), VERBOSE (read)
# Exits:    3 if the directory cannot be created or entered
createtmpdir() {
	# mkdir itself must succeed. Merely finding a directory afterwards is not
	# enough: one that already existed under this name could hold leftover
	# files (such as an old volume list), so it is refused rather than reused.
	# Mode 700 keeps the unpacked data private to the invoking user.
	if ! mkdir -m 700 -- "$TMPD" >&2 || [[ ! -d ${TMPD} ]]; then
		>&2 echo "$ME: failed creation of temporary working directory: ${TMPD}"
		exit 3
	fi
	if [[ -n "$VERBOSE" ]]; then
		>&2 echo "Temporary working directory: $TMPD"
	fi
	cd "$TMPD" || {
		>&2 echo "$ME: cannot change into temporary working directory: ${TMPD}"
		exit 3
	}
}


# Member patterns to pull out of each archive. The *.[np]db (LMDB) index is
# listed first, but only when ncbidump is not the dumper in use.
if [[ $USING_NCBIDUMP -eq 0 ]]; then
	DBPATS=('*.[np]db' '*.[np]in' '*.[np]hr' '*.[np]sq')
else
	DBPATS=('*.[np]in' '*.[np]hr' '*.[np]sq')
fi

# Translate the patterns into options for the installed tar, identified from
# the first line of its --version output: GNU tar takes --wildcards followed
# by the patterns, bsdtar/libarchive takes one --include= per pattern.
case "$(tar --version 2>/dev/null | head -n 1)" in
	*"GNU tar"*)
		WILDCARDS=(--wildcards)
		WILDCARDS+=("${DBPATS[@]}")
		;;
	*"bsdtar"* | *"libarchive"*)
		WILDCARDS=()
		for pat in "${DBPATS[@]}"; do
			WILDCARDS+=("--include=$pat")
		done
		;;
	*)
		echo "$ME: unknown tar:  $(tar --version 2>&1 | head -n 1)" >&2
		exit 1
		;;
esac


# Clamp the requested job count. n2lib_determine_cores and min come from
# n2lib and are not visible here; presumably the former sets PCORES and
# MCORES, and min takes variable NAMES and prints the smaller value --
# TODO confirm against n2lib.
n2lib_determine_cores
BCORES=$(min PCORES BCORES)
BCORES=$(min BCORES MCORES)

# In verbose mode, report the job count that will actually be used
if [[ -n $VERBOSE ]]; then
	>&2 echo "$ME: simultaneous jobs: $BCORES"
fi


# Work out whether the database is protein or nucleotide from which metadata
# file exists in $TARDIR, trying protein first. Only the one-letter $dbt
# ("p" or "n") is needed here, to name the component files to unpack; n2f
# works out the full dump type ("prot"/"nucl") itself from the unpacked db.
TSFILE="$TARDIR/$DB$PMETA"
dbt="p"
if [[ ! -f $TSFILE ]]; then
	TSFILE="$TARDIR/$DB$NMETA"
	dbt="n"
	if [[ ! -f $TSFILE ]]; then
		echo "$ME: $MEXT file for \"$DB\" database not found in $TARDIR" >&2
		exit 1
	fi
fi

SINGLEFILE=""
found=""
# Single-archive database: ${DB}.tar.gz holds everything. Unpack the needed
# components, dump them with one n2f invocation, notify, and exit with n2f's
# status. The multi-volume code that follows is then never reached.
if [[ -f "${TARDIR}/${DB}.tar.gz" ]]; then
	createtmpdir
	FIRSTFILE="${TARDIR}/${DB}.tar.gz"
	if ! tar -xf "${FIRSTFILE}" "${WILDCARDS[@]}"; then
		>&2 echo "$ME: error unpacking $FIRSTFILE"
		exit 4
	fi
	# Hand the transiently-unpacked, single-volume database to n2f. The path
	# prefix tells n2f which directory to search; it discovers the components
	# (and the dump type) on its own. $N2F_V is left unquoted on purpose so
	# that an empty value contributes no argument.
	"$N2F" $N2F_V "${TMPD}/${DB}"
	rc=$?
	# One quoted message string, the same form the interrupt and final
	# notifications use, so the text is not word-split or glob-expanded
	"$PUSHOVER" "$HOSTNAME $ME $DB, exit $rc"
	exit $rc
fi

# Multi-volume database: require that a first, all-zeros volume
# (${DB}.0.tar.gz, ${DB}.00.tar.gz, ...) exists directly in $TARDIR.
# find exits 0 whether or not anything matched, so the decision is taken on
# its output, not its status. The pattern is written as "00*" (a zero followed
# by any further zeros) rather than with an interval expression so that it
# means the same thing in the default regex dialects of GNU and BSD find, and
# it is anchored with ".*/" instead of embedding $TARDIR in the regex.
first_volume=$(find "${TARDIR}" -maxdepth 1 -type f \
	-regex ".*/${DB}\.00*\.tar\.gz" -print -quit 2>/dev/null)
if [[ -z $first_volume ]]; then
	>&2 echo "$ME: no matching database: $DB"
	exit 1
fi

createtmpdir
# Write the sorted list of volume identifiers (the part of each archive name
# between "${DB}." and ".tar.gz"), one per line, to $TMPFILE. The search is
# limited to $TARDIR itself because archive paths are later rebuilt as
# ${TARDIR}/${DB}.<volume>.tar.gz, and the file is written fresh, not appended.
while IFS= read -r -d '' x; do
	y="${x##*/}"		# Remove directory path
	y="${y%.tar.gz}"	# Remove .tar.gz suffix
	y="${y#"${DB}".}"	# Remove prefix
	printf '%s\n' "${y}"
done < <(find "${TARDIR}" -maxdepth 1 -type f -name "${DB}.*.tar.gz" -print0 2>/dev/null | sort -zn) > "${TMPFILE}"

# The first listed volume is the archive inspected for database-level files
FIRSTFILE=$(head -n 1 "${TMPFILE}")
if [[ -z $FIRSTFILE ]]; then
	>&2 echo "$ME: no matching database: $DB"
	exit 1
fi
FIRSTFILE="${TARDIR}/${DB}.${FIRSTFILE}.tar.gz"
# When ncbidump is not in use, unpack the first archive up front with the full
# pattern set, which includes the database-level .[np]db index. The test is
# numeric (-eq), matching the other USING_NCBIDUMP checks in this script.
if [[ $USING_NCBIDUMP -eq 0 ]]; then
	if [[ -n "$VERBOSE" ]]; then
		>&2 echo "Preparing to dump by extracting the .${dbt}db file..."
	fi
	if ! tar -xf "$FIRSTFILE" "${WILDCARDS[@]}"; then
		>&2 echo "$ME: error unpacking $FIRSTFILE"
		exit 4
	fi
fi


# Start notification, passed as a single quoted message string
"$PUSHOVER" "$HOSTNAME $ME $DB start"

# Determine the (possibly differently-named) base name of the unpacked database.
# Prefer the extracted .[np]db index; if it was not extracted (ncbidump), read
# the un-numbered .[np]db member name from the first tar without extracting it.
if [[ $USING_NCBIDUMP -eq 0 ]]; then
	# tar already extracted the index; the glob matches by extension, so a
	# differently-named DB is picked up here automatically. Only the first
	# match is kept, so BASE is always a single name.
	BASE=$(compgen -G "${TMPD}/*.${dbt}db" | head -n 1)
	if [[ -n $BASE ]]; then
		BASE=$(basename "$BASE" ".${dbt}db")
	fi
else
	# ncbidump: nothing was extracted yet, so there's no file to glob. Peek the
	# member name out of the tar without extracting -- but only when the
	# archive is small enough that listing it is cheap; otherwise trust $DB.
	BASE=$DB
	if (( $(wc -c < "$FIRSTFILE") < 800000000 )); then
		# "|| true": finding no such member is not an error; BASE stays $DB
		member=$(tar tf "$FIRSTFILE" | grep -m1 "\.${dbt}db\$") || true
		[[ -n $member ]] && BASE=$(basename "$member" ".${dbt}db")
	fi
fi

if [[ -z $BASE ]]; then
	>&2 echo "$ME: unable to determine the database base name from $FIRSTFILE"
	exit 1
fi

# $DB is quoted so the comparison is a literal string test, not a glob match
if [[ $BASE != "$DB" ]]; then
# Small no. of NCBI databases are inconsistent in how they're packaged vs. named
	>&2 echo "NOTE: the unpacked NCBI database is named differently (\"${BASE}\")."
fi


# Extra option for GNU parallel: --lb when tty reports that standard input is
# a terminal, which makes output appear sooner at the cost of slower overall
# execution; empty otherwise.
# NOTE(review): tty tests standard input, not standard output -- confirm that
# is the intended trigger.
FGOPT=""
tty &>/dev/null && FGOPT="--lb"

# Process one database volume: unpack only its .[np]hr/.[np]in/.[np]sq
# components from the volume's archive into the current directory, run n2f on
# the volume, then delete those components again.
# Arguments: $1 - volume identifier (the part of the archive name between
#                 "${DB}." and ".tar.gz")
# Globals:   TARDIR, DB, BASE, dbt, TMPD, N2F, N2F_V (all read)
# Returns:   status of the first step that fails (later steps are skipped, so
#            the components stay on disk); otherwise the status of the rm.
# Component names are relative, so the caller must already be in $TMPD. The
# database-level (un-numbered) .[np]db index that blastdbcmd needs is not
# handled here; it is extracted once, up front, from the first archive.
n2f_dump_volume() {
	local volume_id="$1"
	local stem="${BASE}.${volume_id}"
	local -a parts=("${stem}.${dbt}hr" "${stem}.${dbt}in" "${stem}.${dbt}sq")

	tar -xf "${TARDIR}/${DB}.${volume_id}.tar.gz" "${parts[@]}" || return
	# $N2F_V is deliberately unquoted: empty means "no extra argument"
	"$N2F" -j1 $N2F_V "${TMPD}/${stem}" || return
	/bin/rm -f "${parts[@]}"
}

# The function and the scalars it (and the n2f child it spawns) reference are
# exported so parallel's jobs inherit them. n2f re-selects the dumper from the
# inherited $DUMPER, so it runs the same utility pn2f extracted components for.
export -f n2f_dump_volume
export DB DUMPER TARDIR BASE dbt TMPD N2F N2F_V

if [[ $USING_NCBIDUMP -eq 1 && $BCORES -eq 1 ]]; then
	# ncbidump with a single job: bypass GNU parallel and process the volumes
	# directly, one after another, stopping at the first failure. The volume
	# list is read on file descriptor 3 rather than stdin, so that nothing run
	# inside the loop can consume the remaining lines by reading stdin.
	rc=0
	while IFS= read -r vol <&3; do
		n2f_dump_volume "$vol" || { rc=$?; break; }
	done 3< "${TMPFILE}"
else
	# One job per line of the volume list; -k keeps output in input order and
	# --halt now,fail=1 aborts everything as soon as one job fails. $FGOPT is
	# unquoted on purpose: it is either empty or a single option.
	parallel --halt now,fail=1 $FGOPT -k --jobs "${BCORES}" \
		n2f_dump_volume :::: "${TMPFILE}"
	rc=$?
fi

# Completion notification, then exit with the dump's status
"$PUSHOVER" "$HOSTNAME $ME $DB, exit $rc"

exit $rc
