#!/usr/bin/env bash
# pmd5 -- Use parallelization to calculate MD5 checksums on chunks of a huge file
# and finally report a single MD5 checksum of the list of those MD5 checksums.
# The -c option can be used to verify the checksum of checksums in a file
# generated by this script.
#
# Author: Warren Gish
# Date: 2023-07-20
#
# Permission is granted to copy, modify and redistribute this script
# as long as acknowledgement of the original author is maintained.
#
# Script identity and defaults.
# ME is the name the script was invoked as, with any leading directory and
# the final ".ext" suffix removed.
ME="${0##*/}"
ME="${ME%.*}"
# ME no longer contains a directory part, so it already is its own basename.
# A plain assignment avoids forking basename and cannot word-split a script
# name that contains whitespace.
PROG="$ME"
check_mode=0	# set to 1 by -c
VERBOSE=""	# set to "-t" by -v
rc=0		# exit status accumulator
BLOCK="2G-1"	# value handed to parallel's --block option

# print_usage -- write the help text to stderr and terminate with status 1.
# Reads the globals PROG (program name shown in the text) and BLOCK (block
# size shown in the text).  Never returns to the caller.
print_usage() {
	local -a help_lines=(
		"$PROG -- Compute an MD5 checksum (of MD5 checksums!) of a specified file"
		""
		"For huge files, component MD5 checksums are computed in parallel"
		"on consecutive blocks of ${BLOCK} (2 billion minus 1) bytes"
		""
		"Usage: $PROG [options] filename"
		" -c  Check the MD5 checksum-of-checksums saved in the specified file"
		" -j# Attempt to compute the specified number of checksums simultaneously"
		" -v  Verbose mode"
	)
	# One array element per output line.
	printf '%s\n' "${help_lines[@]}" >&2
	exit 1
}


# Parse options
# The leading ':' in the option string puts getopts in silent mode, so an
# unknown option arrives as '?' and a missing option argument as ':', with
# the offending option letter in OPTARG.
while getopts ":cj:v" flag; do
	case "${flag}" in
	c) check_mode=1 ;;
	j) PCORES=$OPTARG ;;
	v) VERBOSE="-t" ;;
	:)
		echo "$ME: the -$OPTARG option requires an argument" >&2
		exit 1
		;;
	\?)
		echo "invalid option: -$OPTARG" >&2
		exit 1
		;;
	esac
done
# Drop the options that were consumed; exactly one operand must remain.
shift "$((OPTIND - 1))"
(( $# == 1 )) || print_usage
file_to_check=$1

# Check mode operation
#
# verify_checksum_file LISTFILE
#   Re-computes the checksum of every file named in LISTFILE and compares it
#   with the recorded value.  LISTFILE holds one "checksum filename" pair per
#   line; lines starting with '#' and blank lines are ignored.
#   Globals read: PCORES and VERBOSE (forwarded to the child as -j / -v).
#   Output:  "<filename>: OK" or "<filename>: FAILED" on stdout, per entry.
#   Returns: 0 when every entry matched, 1 when any entry failed or LISTFILE
#            does not exist.
verify_checksum_file() {
	local list_file=$1
	local expected_checksum filename actual_checksum
	local status=0
	local -a child_opts=()

	if [[ ! -f "$list_file" ]]; then
		>&2 echo "Checksum file $list_file does not exist."
		return 1
	fi

	# Options given together with -c would otherwise be silently dropped.
	if [[ -n "${PCORES:-}" ]]; then
		child_opts+=(-j "$PCORES")
	fi
	if [[ -n "${VERBOSE:-}" ]]; then
		child_opts+=(-v)
	fi

	while IFS=' ' read -r expected_checksum filename; do
		# A blank line would compare "" with "" and report a bogus OK.
		[[ -n "$expected_checksum" ]] || continue
		# Re-run this very script through the running interpreter and "$0":
		# the extension-stripped basename is not guaranteed to be an
		# executable on PATH.  stdin is detached so the child cannot consume
		# the remaining list entries; '--' protects names that start with '-'.
		actual_checksum=$("$BASH" "$0" "${child_opts[@]}" -- "$filename" </dev/null \
			| awk '!/^#/ { print $1 }')
		if [[ "$actual_checksum" == "$expected_checksum" ]]; then
			echo "$filename: OK"
		else
			echo "$filename: FAILED"
			status=1
		fi
	done < <(grep -v '^#' -- "$list_file")
	return "$status"
}

if [[ "${check_mode:-0}" -eq 1 ]]; then
	verify_checksum_file "$file_to_check"
	exit $?
fi

# Original script operation
# Computes one MD5 per consecutive ${BLOCK}-sized block of the file (in
# parallel), then prints the MD5 of that ordered list of MD5s followed by the
# file name.  Exits non-zero when the file is unusable or the pipeline fails.
target=$1
if [[ ! -f "$target" ]]; then
	>&2 echo "File does not exist: $target"
	exit 1
fi
if [[ ! -r "$target" ]]; then
	>&2 echo "File not readable: $target"
	exit 1
fi

if ! command -v parallel >/dev/null 2>&1; then
	>&2 echo "GNU parallel is required but was not found in PATH"
	exit 1
fi

# Checksum command, kept as an array so the option stays a separate word.
if [[ "$OSTYPE" == "darwin"* ]]; then
	md5_cmd=(md5 -r)
else
	md5_cmd=(md5sum)
fi

if [[ -z "$PCORES" ]]; then
# Determine no. of physical cores.
# MYOS is honoured when the caller exports it; otherwise ask uname, because
# nothing in this script sets MYOS.
	os_name=${MYOS:-$(uname -s)}
	case "$os_name" in
	Linux)
# Does not consider multi-socket systems
		PCORES=$(LC_ALL=C lscpu 2>/dev/null \
			| awk -F: '/^Core.*per socket:/ { gsub(/[^0-9]/, "", $2); print $2; exit }')
		;;
	Darwin)
		PCORES=$(sysctl -n machdep.cpu.core_count 2>/dev/null)
		;;
	esac
	# Fall back when the platform probe is unavailable or printed nothing usable.
	positive_int='^[1-9][0-9]*$'
	if [[ ! "$PCORES" =~ $positive_int ]]; then
# parallel includes hyperthreads in this count
		PCORES=$(parallel --number-of-cores)
	fi
fi

echo "# pmd5 output -- this is a parallelized, double MD5 checksum."
echo "# This checksum is not a simple MD5 checksum.  It is an MD5 checksum"
echo "# of multiple MD5 checksums of consecutive blocks of ${BLOCK} bytes"
# pipefail is enabled only inside the command substitution's subshell, so a
# failure in parallel or the checksum tool is not masked by the trailing sed.
# -k keeps the per-block checksums in file order; each sed drops everything
# after the first space, leaving only the digest.
output=$(
	set -o pipefail
	parallel ${VERBOSE:+"$VERBOSE"} --block="${BLOCK}" --pipe-part --recend '' -k \
		-j "${PCORES}" -a "$target" "${md5_cmd[@]}" \
		| sed -e 's/ .*$//' \
		| "${md5_cmd[@]}" \
		| sed -e 's/ .*$//'
)
rc=$?
if [[ $rc -ne 0 || -z "$output" ]]; then
	# Do not print a checksum line that does not describe the file.
	>&2 echo "Failed to compute checksum of $target"
	if [[ $rc -eq 0 ]]; then
		rc=1
	fi
	exit "$rc"
fi
printf '%s %s\n' "$output" "$target"
exit 0

