#! /bin/sh

# Copyright (c) Christopher M. R. Lowth - chris@lowth.com
# Written by Christopher M. R. Lowth
#
# Permission is granted to anyone to use this software for any purpose on
# any computer system, and to alter it and redistribute it freely, subject
# to the following restrictions:
#
# 1. The author is not responsible for the consequences of use of this
#    software, no matter how awful, even if they arise from flaws in it.
#
# 2. The origin of this software must not be misrepresented, either by
#    explicit claim or by omission.  Since few users ever read sources,
#    credits must appear in the documentation.
#
# 3. Altered versions must be plainly marked as such, and must not be
#    misrepresented as being the original software.  Since few users
#    ever read sources, credits must appear in the documentation.
#
# 4. This notice may not be removed or altered.


# part_filter.
#
# This script is the "magic" behind the protector suite. It is called by
# the protector program as a unix filter. It reads a mail message "part"
# on standard input, and writes the same part (possibly modified) on its
# standard output. In addition, it is passed a number of important data
# items in environment variables.
#
# Environment variables we get given ...
#
# $HEADERS			The full set of headers provided for the
#				attachment, exactly as given in the original
#				mail.
#
# $H_*				All the header fields are also passed as
#				individual values beginning with $H_,
#				letters in the key name mapped to uppercase
#				and non alpha-numeric characters in the key
#				name mapped to "_". Also: all tabs, newlines
#				and carriage returns in the value are mapped
#				to spaces. So:
#					Content-type: application/msword
#				maps to:
#					$H_CONTENT_TYPE=application/msword
#				Note that sub-fields remain in place.
#
# $CONTENT_TYPE			Equivalent to the $H_CONTENT_TYPE, but mapped
#				to lower case, and with subfields removed.
#
# $CONTENT_TRANSFER_ENCODING	Equivalent to $H_CONTENT_TRANSFER_ENCODING but
#				mapped to lower case (subfields are retained)
#
# $CONTENT_FILE_NAME		The value of the "name=" field from the
#				Content-type header.
#
# $LIB_DIR			The full path name of the directory into which
#				the suite's files are installed.
#


# Version string; quoted in the warning banner written by replace_with.
VER='1.00.7'

# Private scratch directory. For safety the name MUST include "$$" (the
# process id of this shell).
TMP="/tmp/protector.$$"

# Working files, all kept inside the scratch directory.
ORIGINAL="${TMP}/original"	# the part exactly as it arrived on stdin
DECODED="${TMP}/decoded"	# the part after transfer-decoding / unpacking
ZIPPED="${TMP}/zipped"		# the packed form while it is being unpacked

# A single newline character, used when appending header lines.
LF='
'

# Duplicate stdout onto descriptor 9, so that functions which are themselves
# redirected (replace_with is reached from such places) can still write to
# the real standard output.
exec 9>&1

##################################################################################

main()
{
	## Filter one mail part: read it on stdin and write either the part
	## itself or a warning message on stdout.
	##
	## Reads:  $HEADERS, $CONTENT_TYPE, $CONTENT_TRANSFER_ENCODING,
	##         $CONTENT_TYPE__NAME, $CONTENT_DISPOSITION__FILENAME,
	##         $LIB_DIR, plus the file-level $TMP, $ORIGINAL, $DECODED, $LF.
	## Leaves: via "return 0" for pre-allowed text parts, otherwise via
	##         "exit" (here or inside one of the handlers it calls).

	## Create the private scratch directory. mkdir is used without -p, so
	## it fails - and the part is refused - if the name already exists.
	mkdir $TMP || replace_with internal_error
	trap 'rm -rf $TMP' 0

	## Refuse attachments whose advertised file names look dangerous.
	## NOTE(review): the comment at the top of this file documents
	## $CONTENT_FILE_NAME; confirm that the caller really exports the two
	## variable names used here.
	check_filename "$CONTENT_TYPE__NAME"
	check_filename "$CONTENT_DISPOSITION__FILENAME"

	## If important things are not specified - use default values.

	[ "$CONTENT_TYPE"              = "" ] && CONTENT_TYPE=text/plain

	if [ "$CONTENT_TRANSFER_ENCODING" = "" ]; then
		case "$CONTENT_TYPE" in
			image/*)	CONTENT_TRANSFER_ENCODING=base64 ;;
			*)		CONTENT_TRANSFER_ENCODING=7bit   ;;
		esac
	fi


	## Sort out the confusion about whether RTF files are application/rtf
	## or text/rtf. We like the former convention - it seems to be the
	## more widely used one. This is also reflected in the "mime.magic"
	## file used by protector.

	[ "$CONTENT_TYPE" = text/rtf ] && CONTENT_TYPE=application/rtf


	## Remember what we started with, for future reference

	ORIGINAL_CONTENT_TYPE="$CONTENT_TYPE"


	## Pre-allow basic text types - these are normally moderately safe,
	## although I guess someone will prove me wrong in this one day.
	##
	## printf is used instead of echo for $HEADERS: the headers are
	## arbitrary mail text, and some echo implementations interpret
	## backslash sequences, which would corrupt them.

	case "$CONTENT_TYPE" in
		message/disposition-notification	|\
		message/delivery-status			|\
		text/*					)
			printf '%s\n' "$HEADERS"; cat; return 0;;
	esac


	## Decode the attachment and get its file type by inspecting its
	## "magic" numbers - etc.

	cat >$ORIGINAL
	decode <$ORIGINAL >$DECODED
	CONTENT_TYPE="`classify $DECODED`"


	## Verify that the file type we have determined, and the file type
	## we have been told by the sender agree with each other. If they
	## do not - the chances are that someone or something is trying to
	## lie to us :: Get upset if so. Some combinations of mis-match
	## are benign, and we allow through - all others are suspect.
	## Each pattern below reads "declared-type:discovered-type".

	case "$ORIGINAL_CONTENT_TYPE:$CONTENT_TYPE" in

		application/octet-stream:*			|\
		application/*msword:text/plain			|\
		application/*msword:application/rtf		|\
		application/*msword:application/*excel		|\
		application/pgp-signature:text/plain		|\
		application/x-troff:application/x-tar		|\
		message/rfc822:text/plain			|\
		application/zip:application/x-zip		|\
		application/x-zip-compressed:application/x-zip	|\
		message/rfc822:text/x-mail			)
			;;

		*)
			if [ "$CONTENT_TYPE" != "$ORIGINAL_CONTENT_TYPE" ]; then
				replace_with wrong_content_type
			fi
			;;

	esac


	## If the attachment is a zipped file, unzip it and re-classify.  Note
	## that we cannot handle multi-file archives at this point in the game.
	## So archive formats that can contain multiple files must "grok" if
	## more than one file is held in the archive. A later version may
	## try to get round this limitation. Live with it for now.
	##
	## This code is a loop - which allows us to handle zips inside
	## zips, compressed tar balls, etc. Think of it as "recursive"
	## uncompressing. unzip_using re-classifies the unpacked data and
	## stores the result in $CONTENT_TYPE, which is what ends the loop.
	##
	## Each time round the loop, we add the new content-type to the
	## remembered headers as an X-Discovered-Type field. This is simply
	## to aid us in debugging situations where things do not work (!)

	while : ; do
		HEADERS="${HEADERS}X-Discovered-Type: $CONTENT_TYPE$LF"

		case "$CONTENT_TYPE" in

			application/gzip |\
			application/x-gzip)	unzip_using -r "gunzip -c" ;;

			application/bzip2 |\
			application/x-bzip2)	unzip_using -r "bzip2 -d -c" ;;

			application/compress |\
			application/x-compress)	unzip_using -r "uncompress" ;;

			application/pack |\
			application/x-pack)	replace_with cant_unzip ;;

			application/zip |\
			application/x-zip-compressed |\
			application/x-zip)	unzip_using -f do_unzip ;;

			application/tar |\
			application/x-tar)	unzip_using -f do_tar ;;

			*) break ;;

		esac
	done


	## Now verify the safety of the attachment. Some of these can be
	## pre-determined as safe just from their types (which we should now
	## be confident about). Others need something along the lines of
	## a crude virus check first.
	##
	## The check_* helpers are run through eval: whatever they print is
	## executed as shell code in this script.
	## NOTE(review): presumably they print "allow_unchanged" or a
	## "replace_with <code>" command - confirm against the helpers.

	case "$CONTENT_TYPE" in

		message/rfc822				)
			printf '%s\n' "$HEADERS"
			$LIB_DIR/bin/protector $DECODED ;;

		application/rtf				)
			eval `$LIB_DIR/bin/check_rtf $DECODED` ;;

		application/msword 			|\
		application/x-msword			)
			eval `$LIB_DIR/bin/check_msword $DECODED` ;;

		application/vnd.ms-excel		)
			eval `$LIB_DIR/bin/check_excel $DECODED` ;;

		application/ms-tnef			)
			eval `$LIB_DIR/bin/check_mstnef $DECODED` ;;

		application/x-svr4-package		)
			allow_unchanged ;;

		text/*					|\
		image/*					|\
		audio/*					|\
		video/* 				)
			allow_unchanged ;;

		*					)
			replace_with disallowed_type ;;

	esac

	exit 0
}

##############################################################################
# Dangerous file name checking.
#
# A recent trick is to give files in MS mail attachments two extensions, the
# last being hidden by the mail client, but being the significant one in terms
# of how the file is handled when "clicked". Calling a file "Picture.gif.exe"
# makes it appear to be a .gif file when it is in fact a .exe - here we try to
# trap common hybrids of this type.
#
# $1 is the file name to inspect (it may be empty). A name is refused, by
# calling "replace_with dangerous_name", when it contains a slash, a
# backslash, a colon or more than one dot, or when it ends in ".vb" followed
# by exactly one more character (upper or lower case).

check_filename()
{
	case "$1" in
		*/*)		replace_with dangerous_name ;;	# slash
		*\\*)		replace_with dangerous_name ;;	# backslash
		*:*)		replace_with dangerous_name ;;	# colon
		*.*.*)		replace_with dangerous_name ;;	# two or more dots
		*.[vV][bB]?)	replace_with dangerous_name ;;	# .vbs, .vbe, ...
	esac
}

###############################################################################
# Decode the attachment into its native raw form. Several encoding styles
# are handled here - it should be easy enough to add others if required.
#
# This function is a filter: it decodes its standard input according to
# $CONTENT_TRANSFER_ENCODING and writes the result to its standard output.
# An encoding it does not know is refused with "bad_encoding"; a decoder that
# ends with a non-zero status is refused with "cant_decode". The decoder's
# exit status is left in $stat.

decode()
{
	case "$CONTENT_TRANSFER_ENCODING" in

		7bit | 8bit | binary)
			# Nothing to undo - copy the data straight through.
			cat
			stat=$? ;;

		base64)
			mmencode -u
			stat=$? ;;

		quoted-printable)
			$LIB_DIR/bin/qpdecode
			stat=$? ;;

		x-uuencode)
			call_uudecode
			stat=$? ;;

		*)
			replace_with bad_encoding ;;
	esac

	if [ $stat = 0 ]; then
		:
	else
		replace_with cant_decode
	fi
}

# Filter wrapper round uudecode: decode stdin into a scratch file inside
# $TMP, then copy that file to stdout.
#
# Returns 2 when uudecode fails, otherwise the exit status of the copy to
# stdout. The scratch file is removed on every path, so a failed or partial
# decode leaves nothing behind, and a failed copy is no longer hidden behind
# the exit status of "rm -f".

call_uudecode()
{
	if uudecode -o "$TMP/uudecoded"; then
		cat "$TMP/uudecoded"
		uu_stat=$?
	else
		uu_stat=2
	fi

	rm -f "$TMP/uudecoded"
	return $uu_stat
}

###############################################################################

do_unzip()
{
	# Unpack a ZIP archive that holds exactly one file.
	#   $1  the archive (it is renamed to "$1.zip" first)
	#   $2  where the extracted file is to be put
	# Any problem ends the script through replace_with.

	mv $1 $1.zip

	##-- Ask unzip what the archive contains.
	if ! unzip -l $1.zip >$TMP/ziplst1 2>&1; then
		replace_with bad_zip_file
	fi

	##-- Keep one line per archive member: only lines of the listing that
	##-- carry a size, a date and a time are passed on.
	if ! awk '/[0-9]+ +[0-9-]+ +[0-9:]+ +/ { print $4 }' \
			<$TMP/ziplst1 >$TMP/ziplst2; then
		replace_with internal_error
	fi

	##-- We only allow ZIP files with a SINGLE file inside. The count is
	##-- deliberately left unquoted so padding printed by wc is dropped.
	if ! [ $(wc -l < $TMP/ziplst2) = 1 ]; then
		replace_with bad_zip_file
	fi

	##-- Extract into a directory of our own, discarding stored paths (-j).
	if ! mkdir -p $TMP/unzipped; then
		replace_with internal_error
	fi
	if ! unzip -jq -d $TMP/unzipped $1.zip; then
		replace_with bad_zip_file
	fi
	if ! mv $TMP/unzipped/* $2; then
		replace_with internal_error
	fi
}

do_tar()
{
	# Unpack a tar archive that holds exactly one file.
	#   $1  the archive
	#   $2  where the extracted file is to be put
	# Any problem ends the script through replace_with bad_tar_file.

	##-- Ask tar what the archive contains.
	if ! tar tf $1 >$TMP/tarlst 2>&1; then
		replace_with bad_tar_file
	fi

	##-- We only allow tar files with a SINGLE file inside. The count is
	##-- deliberately left unquoted so padding printed by wc is dropped.
	if ! [ $(wc -l < $TMP/tarlst) = 1 ]; then
		replace_with bad_tar_file
	fi

	##-- Leading slashes in file names are a bad idea!
	if ! [ $(grep -c '^/' $TMP/tarlst) = 0 ]; then
		replace_with bad_tar_file
	fi

	##-- Extract to stdout (O), so nothing is created under the stored name.
	if ! tar xOf $1 > $2; then
		replace_with bad_tar_file
	fi
}

unzip_using()
{
	# Unpack $DECODED with the given tool, then re-classify the result.
	#   $1  -r : run $2 as a filter, stdin = packed data, stdout = result
	#       -f : run $2 with two file names, the packed input and the output
	#   $2  the command (or shell function) that does the unpacking
	# The packed data is first moved aside to $ZIPPED; the unpacked result
	# lands in $DECODED and its type is stored in $CONTENT_TYPE. A failing
	# tool ends the script through replace_with cant_unzip.

	mv $DECODED $ZIPPED

	case "$1" in
		-r)
			eval $2 <$ZIPPED >$DECODED || replace_with cant_unzip ;;
		-f)
			eval $2 $ZIPPED $DECODED || replace_with cant_unzip ;;
	esac

	CONTENT_TYPE="$(classify $DECODED)"
}

###############################################################################
# Allow the attachment without making any changes to it whatsoever: write
# $HEADERS followed by the saved original part ($ORIGINAL) to stdout, then
# end the script with status 0.
#
# printf is used rather than echo because the headers are arbitrary mail
# text: some echo implementations interpret backslash sequences, which would
# alter them.

allow_unchanged()
{
	printf '%s\n' "$HEADERS"
	cat <$ORIGINAL
	exit 0
}

###############################################################################
# Replace the attachment with the specified message text, presumably because
# we do not like the attachment for some reason or other. Hopefully the
# message will tell the recipient a little more about our reasons for the
# refusal.
#
#   $1  the warning code. If $LIB_DIR/messages/$1.txt exists it supplies the
#       message text, otherwise the code itself is shown, followed by
#       $LIB_DIR/messages/general.txt.
#
# Writes a text/plain part on the real stdout (descriptor 9), hands a copy of
# the refused part to save_reject, and ends the script with status 111.
#
# All output goes through printf rather than echo: $HEADERS is arbitrary mail
# text, and some echo implementations interpret backslash sequences, which
# would corrupt both the warning and the saved copy.

replace_with()
{
	# We may have been called from a function whose stdout is redirected;
	# go back to the stdout the script started with.
	exec 1>&9

	# NOTE(review): "save_reject -n" presumably prints the file name under
	# which the refused part will be kept - confirm against save_reject.
	f=`$LIB_DIR/bin/save_reject -n`

	printf '%s\n' "Content-Type: text/plain; charset=US-ASCII"
	printf '%s\n' "Content-Transfer-Encoding: 7bit"
	printf '%s\n' "Content-Description: Warning Message - $1"
	printf '\n'

	printf '%s\n' "--- Warning message from your e-mail virus checker (protector $VER) ---"
	printf '\n'
	if [ -f $LIB_DIR/messages/$1.txt ]; then
		cat $LIB_DIR/messages/$1.txt
	else
		printf '%s\n' "WARNING CODE: $1"
		cat $LIB_DIR/messages/general.txt
	fi

	printf '\n'

	# Show the recipient the original headers, indented, with blank
	# lines dropped.
	(
		printf '%s\n' "$HEADERS"
		printf '%s\n' "X-Copy-Of-Original: `basename "$f"`"
	)  | awk '!/^$/ { print "    " $0 }'

	printf '\n'

	# Keep a copy of the refused part. If the part has not been saved to
	# $ORIGINAL yet it is still waiting on stdin.
	(
		printf '%s\n' "X-Warning-Code: $1"
		printf '%s\n' "$HEADERS"
		if [ -f $ORIGINAL ]; then
			cat $ORIGINAL
		else
			cat
		fi
	) | $LIB_DIR/bin/save_reject -w $f

	exit 111
}

###############################################################################

classify_ms_office()
{
	# Run the suite's classify_msoffice helper on file $1 and print only
	# the first whitespace-separated word of each line it writes.
	$LIB_DIR/bin/classify_msoffice $1 | awk '{print $1}'
}

###############################################################################

classify()
{
	# Print the MIME type of file $1, as judged from its contents by the
	# suite's m_file helper and the suite's own mime.magic file.

	t=$($LIB_DIR/bin/m_file -ib -m $LIB_DIR/etc/mime.magic $1)

	# Keep only the leading run of characters that may appear in a type
	# name; this drops anything such as "; charset=..." after it.
	t=$(expr "$t" : '\([a-zA-Z0-9+/.-]*\)')

	case "$t" in
		data)
			echo application/x-data ;;

		# MS Office documents get a closer look from a second helper.
		application/x-ms-office)
			classify_ms_office $1 ;;
		application/vnd.ms-excel)
			classify_ms_office $1 ;;
		application/msword)
			classify_ms_office $1 ;;

		# Anything shaped like "type/subtype" is reported as found.
		[a-zA-Z0-9+-]*/[a-zA-Z0-9+.-]*)
			echo "$t" ;;

		*)
			echo unknown/unknown ;;
	esac
}

###############################################################################

# Entry point: every function is defined above, so the filter can start now.
main
