#!/usr/bin/env bash
#
# n2lib -- shared functions for n2x and n2f
#
# Source this file; do not execute directly:
# N2DIR=${BASH_SOURCE[0]%/*}
# [[ $N2DIR == "${BASH_SOURCE[0]}" ]] && N2DIR=.
# . "$N2DIR/n2lib"
#
#
# Authors:  Warren Gish with Claude Code 2.1.143, Opus 4.7 xhigh
# Date:  2026-05-16
#
# Permission is granted to copy, modify and redistribute this code
# as long as acknowledgement of the original authors is maintained.
#

# Refuse to run as a standalone program. When this file is executed rather
# than sourced, BASH_SOURCE[0] equals $0; in that case report the misuse on
# stderr and exit with status 1.
if [[ ${BASH_SOURCE[0]} == "$0" ]]; then
	>&2 echo "n2lib: this script is intended to be sourced by others, not executed directly"
	exit 1
fi

# Print the smaller of two integers on stdout.
# Usage: min <a> <b>
# Both arguments are evaluated by shell arithmetic, so a variable name may
# be passed in place of a literal number.
function min() {
	printf '%s\n' "$(( $1 <= $2 ? $1 : $2 ))"
}

# Print the larger of two integers on stdout.
# Usage: max <a> <b>
# Both arguments are evaluated by shell arithmetic, so a variable name may
# be passed in place of a literal number.
function max() {
	printf '%s\n' "$(( $1 >= $2 ? $1 : $2 ))"
}

# Expand a leading tilde-prefix in a path without using eval.
# Usage: expand_path <path>
# Recognised prefixes, only when they form the whole first path component:
#   ~   -> $HOME
#   ~+  -> $PWD
#   ~-  -> $OLDPWD
# Any path that does not start with a tilde is passed through unchanged.
# Outputs: the expanded path on stdout.
# Exits 1 with a message on stderr when the result still starts with a
# tilde (e.g. ~user, ~+foo, or ~- while OLDPWD is unset). When called inside
# $(...), that exit only ends the subshell, so the caller must check the
# status of the command substitution.
function expand_path() {
	local p="$1"
	# Substring expansion is used rather than ${p/#pattern/replacement} so
	# neither the prefix nor the directory is ever treated as a pattern or as
	# replacement syntax.
	case "$p" in
		"~"|"~/"*)
			if [[ -n ${HOME+set} ]]; then
				p=$HOME${p:1}
			fi
			;;
		"~+"|"~+/"*)
			if [[ -n ${PWD+set} ]]; then
				p=$PWD${p:2}
			fi
			;;
		"~-"|"~-/"*)
			if [[ -n ${OLDPWD+set} ]]; then
				p=$OLDPWD${p:2}
			fi
			;;
		*) ;;
	esac
	# Anything still starting with a tilde was not (or could not be) expanded.
	if [[ $p == "~"* ]]; then
		>&2 echo "${ME:-n2lib}: tilde directory could not be resolved: $p"
		exit 1
	fi
	printf '%s\n' "$p"
}

# Select the dumper utility: prefer ncbidump, fall back to blastdbcmd.
# Inputs (optional, with defaults): DUMPER (ncbidump), BLASTDBCMD
# (blastdbcmd). A leading ~, ~+ or ~- in either value is expanded with
# expand_path.
# On success: sets and exports DUMPER to the resolved utility and
# USING_NCBIDUMP to 1 (ncbidump) or 0 (blastdbcmd).
# Exits 1 if a tilde path cannot be resolved or if neither utility is
# available.
function n2lib_select_dumper() {
	DUMPER="${DUMPER:-ncbidump}"
	BLASTDBCMD="${BLASTDBCMD:-blastdbcmd}"
	# expand_path reports an unresolvable tilde by exiting, but inside $(...)
	# that only ends the subshell; propagate the failure explicitly.
	DUMPER=$(expand_path "$DUMPER") || exit 1
	BLASTDBCMD=$(expand_path "$BLASTDBCMD") || exit 1
	local fell_back=0
	if ! command -v "$DUMPER" > /dev/null 2>&1; then
		if ! command -v "$BLASTDBCMD" > /dev/null 2>&1; then
			echo "${ME:-n2lib}: Neither $BLASTDBCMD nor $DUMPER were found." >&2
			exit 1
		fi
		DUMPER="$BLASTDBCMD"
		fell_back=1
	fi
	# USING_NCBIDUMP reflects the identity of the resolved dumper to use.
	# ncbidump does not require .pdb (LMDB) files.
	# The dumper is blastdbcmd either because we fell back to BLASTDBCMD
	# (whatever that executable is called) or because DUMPER itself names it.
	if [[ $fell_back -eq 1 || ${DUMPER##*/} == blastdbcmd ]]; then
		USING_NCBIDUMP=0
	else
		USING_NCBIDUMP=1
	fi
	export DUMPER USING_NCBIDUMP
}

# Discover database components in <dir> for <db>.
# Usage: n2lib_discover_db <dir> <db> <tmpfile> [<hint>]
#   <hint>: "p", "n", or "" (auto-detect); any other value is fatal.
# Probes, in order: alias files (<db>.pal / <db>.nal), single-volume
# sequence files (<db>.psq / <db>.nsq), then multi-volume sequence files
# (<db>.*.psq / <db>.*.nsq). The first stage that finds anything decides
# the database type.
# On success: sets and exports dbtype ("p" or "n") and json; sets DATESRC to
# the alias or single-volume file that identified the type (it stays empty
# when only multi-volume files were found); writes bare component names, one
# per line, to <tmpfile>; returns 0.
# On not-found-in-this-dir: leaves <tmpfile> empty and returns 1.
# Exits 1 if nucleotide and protein databases are both found at the same
# probing stage.
function n2lib_discover_db() {
	local dir="$1"
	local db="$2"
	local tmpfile="$3"
	local hint="${4:-}"
	local check_p=1
	local check_n=1
	case "$hint" in
		p) check_n=0 ;;
		n) check_p=0 ;;
		"") ;;
		*) echo "${ME:-n2lib}: invalid dbtype hint: $hint" >&2; exit 1 ;;
	esac
	local conflict="${ME:-n2lib}: both nucleotide and protein databases exist in $dir"

	: > "$tmpfile"
	dbtype=""
	json=""
	DATESRC=""

	# Stage 1 looks for alias files (.pal/.nal), stage 2 for single-volume
	# sequence files (.psq/.nsq); stage 2 only runs if stage 1 found nothing.
	local ext
	for ext in al sq; do
		if [[ -n $dbtype ]]; then
			break
		fi
		if [[ $check_p -eq 1 && -e ${dir}/${db}.p${ext} ]]; then
			dbtype=p
			DATESRC=${dir}/${db}.p${ext}
		fi
		if [[ $check_n -eq 1 && -e ${dir}/${db}.n${ext} ]]; then
			if [[ -n $dbtype ]]; then
				echo "$conflict" >&2
				exit 1
			fi
			dbtype=n
			DATESRC=${dir}/${db}.n${ext}
		fi
	done

	if [[ -z $dbtype ]]; then
		# No alias or single .sq file; look for multi-file components.
		local pcount=0
		local ncount=0
		if [[ $check_p -eq 1 ]]; then
			pcount=$(find "$dir" -maxdepth 1 -type f -name "${db}.*.psq" 2>/dev/null | wc -l)
		fi
		if [[ $check_n -eq 1 ]]; then
			ncount=$(find "$dir" -maxdepth 1 -type f -name "${db}.*.nsq" 2>/dev/null | wc -l)
		fi
		if [[ $pcount -gt 0 && $ncount -gt 0 ]]; then
			echo "$conflict" >&2
			exit 1
		elif [[ $pcount -gt 0 ]]; then
			dbtype=p
		elif [[ $ncount -gt 0 ]]; then
			dbtype=n
		else
			return 1
		fi
	fi

	# The metadata file name depends only on the database type.
	case $dbtype in
		p) json="$db-prot-metadata.json" ;;
		n) json="$db-nucl-metadata.json" ;;
	esac

	# List components: the database itself when it is a single volume,
	# otherwise every <db>.*.<type>sq volume in sorted order. No shell
	# globbing happens here (find does the matching), so shell options such
	# as nullglob are deliberately left untouched.
	local suffix=".${dbtype}sq"
	if [[ -f ${dir}/${db}${suffix} ]]; then
		printf '%s\n' "$db" > "$tmpfile"
	else
		local x base
		while IFS= read -r x; do
			base=${x##*/}
			case $base in
				*"$suffix") printf '%s\n' "${base%"$suffix"}" >> "$tmpfile" ;;
				*) echo "Unexpected file: $x" >&2; exit 1 ;;
			esac
		done < <(find "$dir" -maxdepth 1 -type f -name "${db}.*${suffix}" -print | sort)
	fi
	export dbtype json
	return 0
}

# Detect physical CPU cores, classifying them into Super, Performance and
# Efficiency tiers (Apple Silicon perflevels and Intel hybrid CPUs). This
# integrates the logic of the former external cpus/wcores/scores/ecores
# helpers so n2lib is self-contained.
# Usage: n2lib_count_cores [<cpuinfo>]
#   <cpuinfo>: Linux only; file to parse in /proc/cpuinfo format
#              (default: /proc/cpuinfo).
# On success: sets PHYSICAL_CPUS (sockets), CORES, LOGICAL_PROCESSORS
# (hyperthreads), SUPER_CORES, PERF_CORES and EFF_CORES, plus the derived
# WCORES (Super + Performance worker cores), SCORES (Super) and ECORES
# (Efficiency); returns 0.
# Returns 1 on an unsupported OS, leaving all counts at 0.
function n2lib_count_cores() {
	local cpuinfo="${1:-/proc/cpuinfo}"
	local cores physical_cpus
	SUPER_CORES=0
	PERF_CORES=0
	EFF_CORES=0
	CORES=0
	PHYSICAL_CPUS=0
	LOGICAL_PROCESSORS=0
	WCORES=0
	SCORES=0
	ECORES=0
	if [[ "$OSTYPE" == linux* ]]; then
		local topo smt_cores solo_cores cores_per_cpu cpu_model
		# grep -c prints 0 but exits 1 when nothing matches; keep the count.
		LOGICAL_PROCESSORS=$(grep -c '^processor' "$cpuinfo" 2>/dev/null || :)
		LOGICAL_PROCESSORS=${LOGICAL_PROCESSORS:-0}
		# One pass over cpuinfo. "core id" values are only unique within a
		# package, so cores are keyed on (physical id, core id). For each
		# core the number of logical processors sharing it is counted, which
		# separates multi-threaded cores from single-threaded ones.
		topo=$(awk '
			/^physical id/ { pkg = $4; pkgs[pkg] = 1 }
			/^core id/     { threads[pkg, $4]++ }
			/^$/           { pkg = "" }
			END {
				for (k in pkgs) np++
				for (k in threads) { if (threads[k] > 1) smt++; else solo++ }
				print np + 0, smt + 0, solo + 0
			}' "$cpuinfo" 2>/dev/null)
		read -r physical_cpus smt_cores solo_cores <<< "${topo:-0 0 0}"
		if [[ $physical_cpus -eq 0 ]]; then
			physical_cpus=1
		fi
		PHYSICAL_CPUS=$physical_cpus
		cores=$((smt_cores + solo_cores))
		if [[ $cores -eq 0 ]]; then
			# No per-core topology in cpuinfo: use "cpu cores" per package
			# when present, else assume one core per logical processor.
			cores_per_cpu=$(awk '/^cpu cores/ {print $4; exit}' "$cpuinfo" 2>/dev/null)
			if [[ -n $cores_per_cpu ]]; then
				cores=$((cores_per_cpu * physical_cpus))
			elif [[ $LOGICAL_PROCESSORS -gt 0 ]]; then
				cores=$LOGICAL_PROCESSORS
			else
				cores=$physical_cpus
			fi
		fi
		CORES=$cores
		# Distinguish performance/efficiency cores on Intel hybrid CPUs
		# (e.g. Alder Lake); otherwise treat all cores as performance.
		# Cores running more than one thread are taken as Performance and
		# single-threaded cores as Efficiency; if the two kinds are not both
		# present (e.g. SMT disabled) everything counts as Performance.
		# NOTE(review): the model string usually carries the marketing name
		# rather than the codename, so this match may rarely fire -- confirm
		# on hybrid hardware.
		cpu_model=$(awk -F': *' '/^model name/ {print $2; exit}' "$cpuinfo" 2>/dev/null)
		if [[ $smt_cores -gt 0 && $solo_cores -gt 0 ]] &&
			grep -Eqi 'Alder Lake|Raptor Lake|Lakefield' <<< "$cpu_model"; then
			PERF_CORES=$smt_cores
			EFF_CORES=$solo_cores
		else
			PERF_CORES=$cores
		fi
	elif [[ "$OSTYPE" == "darwin"* ]]; then
		local nperflevels i level_name level_cpus
		LOGICAL_PROCESSORS=$(sysctl -n hw.logicalcpu)
		cores=$(sysctl -n hw.physicalcpu)
		CORES=$cores
		physical_cpus=$(sysctl -n hw.packages 2>/dev/null)
		if [[ -z $physical_cpus ]]; then
			physical_cpus=$(system_profiler SPHardwareDataType | awk '/Number of Processors/{print $NF}')
			if [[ -z $physical_cpus ]]; then
				physical_cpus=1
			fi
		fi
		PHYSICAL_CPUS=$physical_cpus
		# Classify Apple Silicon perf levels by their reported names; do not
		# assume perflevel0 is always Performance or Super.
		nperflevels=$(sysctl -n hw.nperflevels 2>/dev/null || echo 0)
		i=0
		while [[ $i -lt $nperflevels ]]; do
			level_name=$(sysctl -n "hw.perflevel${i}.name" 2>/dev/null || true)
			level_cpus=$(sysctl -n "hw.perflevel${i}.physicalcpu" 2>/dev/null || echo 0)
			case "$level_name" in
				Super)       SUPER_CORES=$level_cpus ;;
				Performance) PERF_CORES=$level_cpus ;;
				Efficiency)  EFF_CORES=$level_cpus ;;
			esac
			i=$((i + 1))
		done
		# Fallback for older/non-Apple-Silicon macOS where perflevel* keys
		# are absent.
		if [[ $SUPER_CORES -eq 0 && $PERF_CORES -eq 0 && $EFF_CORES -eq 0 ]]; then
			PERF_CORES=$cores
		fi
	else
		return 1
	fi
	WCORES=$((SUPER_CORES + PERF_CORES))
	SCORES=$SUPER_CORES
	ECORES=$EFF_CORES
	return 0
}

# Determine the number of cores to use based on physical cores and free
# memory (3.5 GB per worker estimate).
# Sets PCORES (worker cores, at least 1) and MCORES (workers that fit in
# free memory, at least 1) and, via n2lib_count_cores, the *CORES globals.
# Free memory is MemFree + Cached on Linux and free + inactive pages on
# macOS, both in kB. On any other OS memory is not probed, so MCORES is 1.
# If n2lib_count_cores cannot count cores, GNU parallel is asked instead
# when it is installed.
function n2lib_determine_cores() {
	local counted=1
	n2lib_count_cores || counted=0
	PCORES=$WCORES
	local mem_free=0
	case "$OSTYPE" in
		linux*)
			local meminfo free_kb cached_kb
			# Read once so both values come from the same snapshot.
			meminfo=$(cat /proc/meminfo)
			free_kb=$(awk '/^MemFree:/ {print $2; exit}' <<< "$meminfo")
			cached_kb=$(awk '/^Cached:/ {print $2; exit}' <<< "$meminfo")
			mem_free=$(( ${free_kb:-0} + ${cached_kb:-0} ))
			;;
		darwin*)
			local vm_stat_output page_size free_pages inactive_pages
			vm_stat_output=$(vm_stat)
			page_size=$(awk '/page size of/ {print $8}' <<< "$vm_stat_output")
			# vm_stat prints counts with a trailing period; strip it.
			free_pages=$(awk '/Pages free/ {print $3}' <<< "$vm_stat_output" | tr -d '.')
			inactive_pages=$(awk '/Pages inactive/ {print $3}' <<< "$vm_stat_output" | tr -d '.')
			mem_free=$(( (${free_pages:-0} + ${inactive_pages:-0}) * ${page_size:-0} / 1024 ))
			;;
		*)
			mem_free=1
			;;
	esac
	if [[ $counted -eq 0 ]] && command -v parallel > /dev/null 2>&1; then
		PCORES=$(parallel --number-of-cores 2>/dev/null)
	fi
	# Clamp both results to a minimum of one worker; an empty value
	# evaluates to 0 in arithmetic and is therefore clamped as well.
	PCORES=$(( PCORES < 1 ? 1 : PCORES ))
	MCORES=$(( mem_free / 3500000 ))
	MCORES=$(( MCORES < 1 ? 1 : MCORES ))
}
