#!/bin/bash
# Working copy of pn2fz, being modified to call the pn2f helper script
#
# pn2fz -- Convert a "packaged" NCBI-BLAST database into a single, compressed FASTA file
#
# Convert a "packaged" NCBI-BLAST database (stored in *.tar.gz files--NOT unpacked)
# to a monolithic compressed FASTA file. The NCBI-BLAST database is transiently
# unpacked (and deleted after FASTA conversion) to minimize storage
# requirements. Only database components required for dumping to FASTA are transiently
# unpacked.
#
# The ability to obtain substantial speed improvement through the use
# of parallel processing requires the NCBI-BLAST input database be stored
# on fast solid-state drives.
#
# The buffering feature of GNU parallel's -k option maintains the order
# of sequences.
#
# 3rd party add-ons used here:
#    NCBI tools (blastdbcmd), GNU parallel, zstd, pigz
#
# pushover is optionally used to notify of job start and end.
#
# zstd is much faster and compresses better than gzip/pigz
# and is therefore recommended over gzip if your workflow
# is compatible with zstd.
#
# Author: Warren Gish
# Date: 2024-02-17
# Revised: 2025-10-10
# Revised: 2026-05-29
#
# Permission is granted to copy, modify and redistribute this script
# as long as acknowledgement of the original author is maintained.
#
#
# Name of this script (path stripped), used as a prefix in diagnostics
ME="${BASH_SOURCE[0]##*/}"

# Suffixes of the per-database metadata files; appended to the database name
MEXT=".json"
NMETA="-nucl-metadata${MEXT}"
PMETA="-prot-metadata${MEXT}"

# Upper limit on the number of threads handed to pigz/zstd when compressing
CCORES=64

# Location of the packed (*.tar.gz) NCBI-BLAST databases
TARDIR=~/mirror/ncbi/blast/db

# Where the compressed FASTA output is written; an existing FASTADIR in the
# environment takes precedence over the built-in default
FASTADIR="${FASTADIR:-~/db/fasta}"

# print_usage -- write the help text to stderr.
# Reads the globals FASTADIR and DEFMETHOD to show the current defaults.
# The text is framed by one blank line before and one after.
print_usage() {
	cat >&2 <<USAGE

pn2fz -- Dump a packaged NCBI-BLAST *.tar.gz db into a compressed FASTA file

Tar files are transiently unpacked in a temporary directory and
removed after extraction, to conserve disk storage. One tar file
is present in unpacked form per running blastdbcmd job (-j option).
More simultaneous jobs may yield faster execution but will require
more temporary storage. Use of SSD storage is highly recommended.

Usage:  pn2fz [options] DBNAME

Options:
  -d DIR      Save compressed FASTA file in directory DIR (default $FASTADIR)
  -h          Display this help message
  -j<j>       Attempt to run <j> simultaneous blastdbcmd jobs
  -c<c>       Attempt to use <c> CPUS when compressing
  -f          Force over-write of an existing output file
  -m<m>       Use compression method <m> (gzip or zstd, default $DEFMETHOD)
  -n NAME     Use NAME for base name of compressed output file (default DBNAME)
  -o OUTFILE  Full filename of output file is OUTFILE, including any desired extension
  -v          Enable verbose mode
  -p          Issue start and end notifications via Pushover (if installed)

Supported compression methods: zstd, gzip

USAGE
}

# Directory this script was invoked from; when the script path contains no
# slash the suffix strip leaves it unchanged, in which case "." is used.
N2DIR=${BASH_SOURCE[0]%/*}
if [[ $N2DIR == "${BASH_SOURCE[0]}" ]]; then
	N2DIR=.
fi

# The n2lib library must be readable next to this script
[[ -r $N2DIR/n2lib ]] || {
	>&2 echo "$ME: required library 'n2lib' not found in $N2DIR"
	>&2 echo "$ME: download it from https://blast.advbiocomp.com/ncbihelper and install it in $N2DIR"
	exit 1
}

. "$N2DIR/n2lib"

# Resolve the helper script: whatever "command -v" finds wins, otherwise
# fall back to an executable copy in this script's own directory.
MYDIR="$N2DIR"
COWORKER="pn2f"
if COWLOC=$(command -v "${COWORKER}" 2>/dev/null); then
	COWORKER="$COWLOC"
elif [[ -x "${MYDIR}/${COWORKER}" ]]; then
	COWORKER="${MYDIR}/${COWORKER}"
else
	>&2 echo "$ME: could not find helper script: ${COWORKER}"
	exit 1
fi

# The directory of packaged databases must already exist
[[ -d ${TARDIR} ]] || {
	>&2 echo "$ME: directory containing packaged NCBI-BLAST databases does not exist: ${TARDIR}"
	exit 2
}

# Default compression method: zstd when it is installed, gzip otherwise
if command -v zstd &>/dev/null; then
	DEFMETHOD=zstd
else
	DEFMETHOD="gzip"
fi
CMETHOD=$DEFMETHOD

# ":" is the shell no-op, so notifications are silently skipped unless -p is given
PUSHOVER=":"

# Parse the command-line options (see print_usage for their meaning).
# BASENAME is cleared together with the other option variables so that a
# value inherited from the environment can neither rename the output file
# nor cause a spurious "-n and -o" conflict.
unset BCORES FORCE VERBOSE OUTFILE BASENAME
while getopts ":c:d:fhj:m:n:o:vp" opt; do
	case $opt in
		d )
		FASTADIR="$OPTARG"
		;;
		c )
		CCORES="$OPTARG"
		;;
		h )
		print_usage
		exit 0
		;;
		f )
		FORCE=1
		;;
		j )
		# Stored in ready-to-pass form for the helper script
		BCORES="-j$OPTARG"
		;;
		m )
		CMETHOD="$OPTARG"
		;;
		n )
		BASENAME="$OPTARG"
		# -n and -o are mutually exclusive
		if [[ -n $OUTFILE ]]; then
			print_usage
			>&2 echo "$ME: the -o and -n options can not both be used"
			exit 1
		fi
		;;
		o )
		OUTFILE="$OPTARG"
		if [[ -n $BASENAME ]]; then
			print_usage
			>&2 echo "$ME: the -n and -o options can not both be used"
			exit 1
		fi
		# "-" selects standard output
		[[ $OUTFILE = "-" ]] && OUTFILE="/dev/stdout"
		;;
		v )
		VERBOSE="-v"
		;;
		p )
		# Fall back to the no-op ":" when pushover is not installed
		PUSHOVER=$(command -v pushover 2>/dev/null || echo ":")
		;;
		\? )
		print_usage
		>&2 echo "$ME: invalid option: -$OPTARG"
		exit 1
		;;
		: )
		print_usage
		>&2 echo "$ME: the -$OPTARG option requires an argument"
		exit 1
		;;
	esac
done
shift $((OPTIND -1))
# Exactly one positional argument (the database name) must remain
if [ $# != 1 ]; then
	print_usage
	>&2 echo "Specify one packed NCBI-BLAST database to dump to a compressed FASTA file"
	exit 1
fi
DB="$1"

# Normalize the output directory with expand_path (not defined in this file;
# presumably provided by n2lib), then require it to exist and be writable.
FASTADIR="$(expand_path "$FASTADIR")"
[[ -d ${FASTADIR} ]] || {
	>&2 echo "$ME: directory for storing compressed FASTA file does not exist: ${FASTADIR}"
	exit 2
}
[[ -w ${FASTADIR} ]] || {
	>&2 echo "$ME: directory for storing compressed FASTA file is not writable: ${FASTADIR}"
	exit 2
}

# Map the requested method (any prefix, all-lowercase or all-uppercase) to a
# compression utility family and an output file extension. "pigz" is accepted
# as a synonym for gzip.
case "$CMETHOD" in
	z|zs|zst|zstd|Z|ZS|ZST|ZSTD)
	EXT="zst"
	CUTIL=zstd
	;;
	g|gz|gzi|gzip|G|GZ|GZI|GZIP|p|pi|pig|pigz|P|PI|PIG|PIGZ)
	EXT="gz"
	CUTIL=gzip
	;;
	*)
	print_usage
	>&2 echo "Unrecognized compression method: $CMETHOD"
	exit 1
	;;
esac

# Pick the metadata file whose timestamp will be copied onto the output file:
# the protein metadata file is preferred, then the nucleotide one.
TSFILE=
for meta_suffix in "$PMETA" "$NMETA"; do
	if [[ -f "$TARDIR/$DB$meta_suffix" ]]; then
		TSFILE="$TARDIR/$DB$meta_suffix"
		break
	fi
done
if [[ -z $TSFILE ]]; then
	>&2 echo "$ME: $MEXT file for \"$DB\" database not found in $TARDIR"
	exit 1
fi

# Without -o, build the output name from -n NAME (or the database name)
[[ -n $OUTFILE ]] || OUTFILE="${FASTADIR}/${BASENAME:-$DB}.fa.${EXT}"

# Refuse to redo the work when an existing regular output file carries the
# same timestamp as the metadata file (neither is newer), unless -f was given.
if [[ $OUTFILE != "/dev/"* && -e $OUTFILE && -z $FORCE ]] \
		&& ! [[ "$TSFILE" -nt "$OUTFILE" || "$OUTFILE" -nt "$TSFILE" ]]; then
	>&2 echo "$ME: output file with latest timestamp already exists"
	exit 1
fi


# Cap the compression thread count. n2lib_determine_cores and min are not
# defined in this file (presumably they come from n2lib); min is passed
# variable names, and PCORES is assumed to be set by n2lib_determine_cores
# -- TODO confirm against n2lib.
n2lib_determine_cores
CCORES=$(min PCORES CCORES)

if [[ -n $VERBOSE ]]; then
	if [[ -n $BCORES ]]; then
		>&2 echo "$ME: simultaneous jobs: ${BCORES#-j}"
	else
		>&2 echo "$ME: simultaneous jobs: determined by $COWORKER"
	fi
	>&2 echo "$ME: compression threads: $CCORES"
fi

# Pick the concrete compressor and turn CCORES from a thread count into the
# matching command-line option (or an empty string when none applies).
if [[ $CUTIL == "gzip" ]]; then
	if command -v pigz &>/dev/null; then
		# pigz will utilize all cores _and_ hyperthreads unless limited
		CUTIL="pigz"
		CCORES="-p${CCORES}"
	else
		>&2 echo "$ME: parallel compression utility \"pigz\" not found... using \"gzip\" instead."
		# No thread-count option is passed to plain gzip
		CCORES=""
	fi
else
	CUTIL="zstd"
	CCORES="-T${CCORES}"
fi

# Resolve the full path of the chosen utility; fail if it is not installed
COMPRESSOR=$(command -v "$CUTIL" 2>/dev/null)
if [[ -z "$COMPRESSOR" ]]; then
	>&2 echo "$ME: compression utility \"$CUTIL\" not found."
	exit 1
fi


# interrupt -- signal handler: report the incomplete output file, terminate
# any child jobs, send an "interrupted" notification and exit with status 1.
# Reads the globals ME, OUTFILE, PUSHOVER, HOSTNAME and DB.
interrupt() {
	# Ignore further TERM signals while shutting down
	trap '' TERM
	echo "$ME: killed. Incomplete output file: ${OUTFILE}" >&2
	# Signal only the jobs reported by "jobs -p", never this shell itself;
	# -r keeps xargs from running kill when there are none
	jobs -p | xargs -r kill -TERM >/dev/null 2>&1
	"$PUSHOVER" "$HOSTNAME $ME $DB interrupted"
	exit 1
}

trap "interrupt" HUP INT TERM QUIT


# Make sure at least one packaged volume exists for the database: either
# DB.tar.gz or a first volume whose number consists only of zeros
# (DB.0.tar.gz, DB.00.tar.gz, ...). The test is done with a shell glob rather
# than find, because find exits 0 even when nothing matches, so its exit
# status cannot tell "found" from "not found".
have_tarball=
if [[ -f "$TARDIR/$DB.tar.gz" ]]; then
	have_tarball=1
else
	for candidate in "$TARDIR/$DB".0*.tar.gz; do
		# Isolate the volume number between "DB." and ".tar.gz"
		volume=${candidate#"$TARDIR/$DB."}
		volume=${volume%.tar.gz}
		# An unmatched glob stays literal and fails the -f test
		if [[ -f $candidate && $volume != *[!0]* ]]; then
			have_tarball=1
			break
		fi
	done
fi
if [[ -z $have_tarball ]]; then
	>&2 echo "$ME: no matching *.tar.gz file(s) for database: $DB"
	exit 1
fi


# Each word is quoted separately: the notifier receives the same number of
# arguments as before, but none of them is subject to splitting or globbing.
"$PUSHOVER" "$HOSTNAME" "$ME" "$DB" start

# With pipefail, a failure of the helper script is not masked by a successful
# compressor. Optional arguments are dropped entirely when they are empty.
set -o pipefail
"$COWORKER" ${BCORES:+"$BCORES"} ${VERBOSE:+"$VERBOSE"} "$DB" | "${COMPRESSOR}" ${CCORES:+"$CCORES"} > "$OUTFILE"
rc=$?

# Ignore TERM while finishing up
trap '' TERM

if [[ $rc -eq 0 ]]; then
	# Give a regular output file the timestamp of the metadata file, so that
	# a later run can recognize it as up to date
	if [[ $OUTFILE != "/dev/"* ]]; then
		touch -r "$TSFILE" "$OUTFILE"
	fi
else
	>&2 echo "$ME: conversion failed (exit status $rc). Incomplete output file: ${OUTFILE}"
fi

"$PUSHOVER" "$HOSTNAME" "$ME" "${OUTFILE}," exit "$rc"

# Report the pipeline's status to the caller, not the notifier's
exit "$rc"

