#!/bin/sh
# vim: ts=4
# dwww-convert -- convert docs to HTML if necessary
#
# Some types of files (e.g. UNIX man pages) are typically not handled by most
# webbrowser setups; we convert these to HTML. Other types (e.g. PDF files
# or PNG images) are best handled by or via the user's webbrowser.
#
# Simple usage: $0 <type> <location>
# 	<type> is document type: text, man, html, ps, and so on
#	<location> is full pathname to original document
#
# When DWWW_USE_CACHE is set to "yes", the converted output is stored in
# (and served from) a cache via dwww-cache.
#
# Part of the Debian dwww package.  Written by Lars Wirzenius.
# Modified by Robert Luberda
# "@(#)dwww:$Id: dwww-convert,v 1.40 2003/07/27 16:17:32 robert Exp $"

#
# Setup defaults.
#
# Converter command used for each kind of document.  Every entry defaults to
# the matching builtin_* shell function.
# NOTE(review): presumably these can be overridden by the configuration read
# during initialization -- confirm against functions.sh.
DWWW_DIR2HTML="builtin_dir2html"
DWWW_MAN2HTML="builtin_man2html"
DWWW_INFO2HTML="builtin_info2html"
DWWW_TEXT2HTML="builtin_text2html"
DWWW_HTML2HTML="builtin_html2html"
DWWW_CSS2CSS="builtin_css2css"

################################################################
#
# Initialization
#

# Timestamp in the C locale; it is substituted into the generated pages.
DATE=$(LC_ALL=C date)

# Load the shared dwww shell functions and run their initialization.
# Give up if either step fails.
if . /usr/share/dwww/functions.sh && dwww_initialize; then
	:
else
	exit 1
fi


################################################################
#
# Local dwww-convert functions
#

#
# Print error message and exit the program
# usage: ErrorMsg status title message
ErrorMsg() {
	# $1 - value for the CGI "Status:" header (e.g. "404 File not found")
	# $2 - page title, also used as the heading
	# $3 - message body; inserted as-is, so it may contain HTML markup and
	#      callers must escape any untrusted text themselves
	# Prints a complete CGI error response and terminates the script with
	# exit status 1.
	status="$1"
	title="$2"
	message="$3"

	# printf '%s\n' prints every argument on its own line without
	# interpreting backslashes in the arguments.
	printf '%s\n' \
		"Status: $status" \
		"Content-type: text/html; charset=iso-8859-1" \
		"" \
		"<HTML>" \
		"<HEAD>" \
		" <TITLE>$title</TITLE>" \
		"</HEAD>" \
		"<BODY>" \
		" <H1 align=\"center\">$title</H1>" \
		"$message" \
		"</BODY>" \
		"</HTML>"
	exit 1
}

#
# Print a Last-Modified header for the file given as $1
#
PrintLastMod() {
	# $1 - path of the file whose modification time is reported.
	# Prints "Last-Modified: <date>" when the date could be determined and
	# nothing otherwise.
	#
	# HTTP dates must end in the literal "GMT"; "date -u" with %Z prints
	# "UTC" instead, so the zone name is written out explicitly.
	lastmod=$(LC_ALL=C date -ur "$1" +"%a, %d %b %Y %H:%M:%S GMT")
	if [ -n "$lastmod" ]; then
		echo "Last-Modified: $lastmod"
	fi
}

#
# Are we allowed to show this file?
#
# Note: getting this check wrong compromises security.
#
badfile() {
	# $1 - path to check
	# $2 - "symlink" to have the $DWWW_DOCPATH entries normalized with
	#      "realpath -s" (symlinks not resolved) instead of plain realpath
	# Returns 1 when $1 lies inside one of the directories listed in
	# $DWWW_DOCPATH, and 0 (bad) otherwise, including for an empty path.
	candidate="$1"
	if [ -z "$candidate" ]; then
		return 0
	fi

	realpath_opt=""
	if [ "$2" = "symlink" ]; then
		realpath_opt="-s"
	fi

	# The colon-separated list is deliberately expanded unquoted so that
	# each entry becomes a separate realpath argument.
	doc_dirs=$(echo $DWWW_DOCPATH | tr : ' ')
	for doc_root in $(realpath $realpath_opt $doc_dirs 2>/dev/null)
	do
		[ -d "$doc_root" ] || continue
		# The trailing slash makes the directory itself match as well.
		case "$candidate/" in
			"$doc_root"/*) return 1 ;;
		esac
	done
	return 0
}

#
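# Checks whether original_file is an acceptable symlink: it must be located
# in a directory listed in $DWWW_DOCPATH and must point to a file below one
# of the directories listed in $DWWW_ALLOWEDLINKPATH.  Returns 1 if both
# conditions hold and 0 (bad) otherwise.
#
# Please note that orig_f must not contain /../ (e.g. it can be the output
# of `realpath -s`).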
#
bad_symlink() {
	# $1 - path as requested, with symlinks not resolved
	# $2 - fully resolved path of the same file
	# Returns 0 (bad) unless $1 passes badfile in "symlink" mode and $2
	# lies inside a directory listed in $DWWW_ALLOWEDLINKPATH.
	orig_f="$1"
	real_f="$2"

	if badfile "$orig_f" "symlink"; then
		return 0
	fi

	# Expanded unquoted on purpose: one realpath argument per list entry.
	allowed_dirs=$(echo "$DWWW_ALLOWEDLINKPATH" | tr : ' ')
	for allowed_root in $(realpath $allowed_dirs 2>/dev/null)
	do
		[ -d "$allowed_root" ] || continue
		case "$real_f/" in
			"$allowed_root"/*) return 1 ;;
		esac
	done

	return 0
}


#######################################################################
#
# Builtin converters
#


#
# Create a directory listing in HTML.
#
builtin_dir2html() {
	# $1 - directory to list
	# Writes an HTML page to stdout: the files directly inside the directory
	# and then, if there are any, its subdirectories.  Both lists are piped
	# through table_it.  Uses $dwww_libdir, $DWWW_USEFILEURL, $DATE and
	# $dwww_version.
	dir="$1"

	# Page header: the site template when readable, otherwise a minimal
	# built-in header.
	if [ -r "$dwww_libdir/dwww-convert.dir.start" ] ; then
		sed "s|%TITLE%|Files in $dir|g" \
			"$dwww_libdir/dwww-convert.dir.start"
	else
		echo "<html><head><title>Files in $dir</title></head><body>"
		echo "<h1>Files in $dir</h1>"
	fi

	# j_d is the directory as it appears in generated URLs.  It is passed
	# through urlencode only when it contains a character outside
	# [a-zA-Z0-9/_.-].  The argument is quoted so that a name containing
	# whitespace reaches urlencode as a single word.
	case "$dir" in
		*[!a-zA-Z0-9/_.-]*)
			j_d=$(urlencode "$dir")
			;;
		*)
			j_d="$dir"
			;;
	esac

	# Files.  "IFS= read -r" keeps backslashes and leading/trailing blanks
	# in file names intact.
	find "$dir" -type f -follow -maxdepth 1 -printf "%f\n" 2>/dev/null | sort |
	while IFS= read -r i
	do
		case "$i" in
			*[!a-zA-Z0-9/_.-]*)
				j="$j_d/$(urlencode "$i")"
				;;
			*)
				j="$j_d/$i"
				;;
		esac

		# Everything goes through the dwww CGI, except that *.htm* files
		# are linked as file:// URLs when DWWW_USEFILEURL is set.
		href="/cgi-bin/dwww?type=file&amp;location=$j"
		case "$i" in
			*.htm*)
				if [ -n "$DWWW_USEFILEURL" ] ; then
					href="file://localhost$j"
				fi
				;;
		esac
		printf '%s\n' "<a href=\"$href\">$i</a>"
	done | table_it

	# Subdirectories
	if  find "$dir/." -type d -follow -maxdepth 1 ! -name . ! -name .. 2>/dev/null| grep . > /dev/null
	then
		echo "<p><h2>Subdirectories:</h2>"
		find "$dir/." -type d  -follow -maxdepth 1 ! -name . ! -name .. -printf "%f\n" 2>/dev/null | sort |
		while IFS= read -r i
		do
			case "$i" in
				*[!a-zA-Z0-9/_.-]*)
					j="$j_d/$(urlencode "$i")"
					;;
				*)
					j="$j_d/$i"
					;;
			esac

			# With DWWW_USEFILEURL set, link straight to the first index
			# file found in the subdirectory; otherwise (or when there is
			# none) link to the dwww CGI.
			href=""
			if [ -n "$DWWW_USEFILEURL" ] ; then
				for idx in index.html index.html.gz index.htm index.htm.gz
				do
					if [ -f "$dir/$i/$idx" ] ; then
						href="file://localhost$j/$idx"
						break
					fi
				done
			fi
			if [ -z "$href" ] ; then
				href="/cgi-bin/dwww?type=file&amp;location=$j"
			fi
			printf '%s\n' "<a href=\"$href\">$i</a>"
		done | table_it
	fi

	# Page footer.
	if [ -r "$dwww_libdir/dwww-convert.dir.end" ] ; then
		sed "s|%DATE%|$DATE|g;s|%VERSION%|$dwww_version|g" \
			"$dwww_libdir/dwww-convert.dir.end"
	else
		echo "<hr>Created automatically: $DATE</body></html>"
	fi
}


#
# Convert a manual page source code file to HTML.
#
builtin_man2html() {
	# $1 - path of the manual page source file
	# $2 - optional page title; the path in $1 is used when it is missing
	# Writes the formatted page as HTML to stdout.  Uses $dwww_libdir,
	# $DATE and $dwww_version.  Leaves the working directory at /.

	if [ -r "$dwww_libdir/dwww-convert.start" ] ; then
		sed "s|%TITLE%|${2:-$1}|g" \
			"$dwww_libdir/dwww-convert.start"
	else
		# Same title as the template branch: $2, or $1 when $2 is empty.
		echo "<html><head><title>${2:-$1}</title></head><body>"
	fi

	# man is run from the parent of the directory holding the page.
	# NOTE(review): presumably so that relative ".so" includes resolve --
	# confirm.  A failing cd is not treated as fatal.
	dir=$(dirname "$1")
	cd "$dir/.."
	man -P/bin/cat -l "$1" | dwww-txt2html --man
	cd /

	if [ -r "$dwww_libdir/dwww-convert.end" ] ; then
		sed "s|%DATE%|$DATE|g;s|%VERSION%|$dwww_version|g" \
			"$dwww_libdir/dwww-convert.end"
	else
		echo "</body></html>"
	fi
}


#
# Convert plain text to HTML.  This is really trivial, and buggy.
# The input is the file named by $1, read through $decompress.
#
builtin_text2html() {
	# $1 - path of the text file; it is also used as the page title
	# Writes header, converted text and footer to stdout.  Uses
	# $decompress, $dwww_libdir, $DATE and $dwww_version.
	start_tpl="$dwww_libdir/dwww-convert.start"
	end_tpl="$dwww_libdir/dwww-convert.end"

	# Header: built-in fallback unless the template is readable.
	if [ ! -r "$start_tpl" ] ; then
		echo "<html><head><title>$1</title></head><body>"
	else
		sed "s|%TITLE%|$1|g" "$start_tpl"
	fi

	# Body.
	$decompress "$1" | dwww-txt2html

	# Footer.
	if [ ! -r "$end_tpl" ] ; then
		echo "</body></html>"
	else
		sed "s|%DATE%|$DATE|g;s|%VERSION%|$dwww_version|g" "$end_tpl"
	fi
}


#
# Convert info file to HTML using info2www
#
builtin_info2html() {
	# $1 - info document, handed unchanged to the info2www CGI program,
	#      whose output is the result.
	info_doc="$1"
	/usr/lib/cgi-bin/info2www "$info_doc"
}


#
# Convert links in HTML and CSS documents, so that the automatic
# decompression and conversion features can be used.
#
# Usage: convert_html_anchors file "html"/"css"
# 
convert_html_anchors() {

# The procedure below was written by Daniel Martin <martin@snowplow.org>
# See bug #151637 for more info, how it works...
#
# $1 - path of the document (read through $decompress)
# $2 - "css" to rewrite url(...) references; any other value rewrites the
#      link attributes of HTML tags
# The rewritten document is written to stdout.
#
# The file name and type reach perl through the environment rather than
# being pasted into the perl source, so a quote character in the file name
# cannot break or alter the script.
#
# The perl script is a single-quoted shell string: an apostrophe inside it
# must be written in the quote-backslash-quote-quote form used below.

	"$decompress" "$1" |
	DWWW_CONVERT_FILE="$1" DWWW_CONVERT_TYPE="$2" perl -0777ne '

$file = $ENV{"DWWW_CONVERT_FILE"};
$type = $ENV{"DWWW_CONVERT_TYPE"};

($directory = $file) =~ s/^[^\/]*(\/.*\/)[^\/]*$/$1/;

if ($type eq "css") {
	$cgi = "/cgi-bin/dwww?type=file&location=";
} else {
	$cgi = "/cgi-bin/dwww?type=file&amp;location=";
}

sub make_cgi_ref {
  my($target) = shift;
  # technically, according to rfc 2396, s. 3.4, we should also escape /,
  # but for our uses it does not matter and it significantly uglifies our URLs
  $target =~ s|[^\w_.!~/\-]|sprintf("%%%02X",unpack("C",$&))|ge;
  return ($cgi . $target);
}

sub mangle_anchor {
  my($anchortext) = shift;
  # We do not deal with anything that starts with /cgi-bin/
  if ($anchortext =~ m|^/cgi-bin/|) {return $anchortext;}
  # We do not deal with anything that includes a protocol or query
  if ($anchortext =~ /[:?]/) {return $anchortext;}
  # first undo html escaping
  $anchortext =~ s/^\s*//;
  $anchortext =~ s/\&lt;/</g;
  $anchortext =~ s/\&gt;/>/g;
  $anchortext =~ s/\&(amp|(#x?)([a-fA-F\d]+));/($1 eq "amp")?"&":pack("C",($2 eq "#x")?hex($3):$3)/ge;
  # Now it is safe to pull of the partial fragment
  my ($partial) = "";
  if ($anchortext =~ s/(#.*)//) {$partial=$1;}
  # now undo URI escaping
  $anchortext =~ s/%([a-fA-F\d]{2})/pack("C",hex($1))/ge;
  # re-html-encode potentially unsafe characters in partial.
  # shouldnt happen, but...
  # (a numeric character reference needs the "#" after the "&")
  if ($partial =~ s/^#//) {
    $partial =~ s/[<>&'\''"]/"&#".unpack("C",$&).";"/ge;
    $partial = "#" . $partial;
  }
  # now...
  if (!$anchortext) {return $partial;}
  if ($anchortext =~ m[^/]) {
    return (make_cgi_ref($anchortext) . $partial);
  } else {
    return (make_cgi_ref($directory . $anchortext) . $partial);
  }
}

sub handle_simple_tag {
  # handle the substitution
  my($tag,$attr) = @_;
  # quoted
  s/(<\s*$tag[^>]*\s+$attr\s*=\s*)(["'\''])([^>\2]*?)(\2[^>]*>)/$1.$2.mangle_anchor($3).$4/iges;
  s/(<\s*$tag[^>]*\s+$attr\s*=\s*)([^\s"'\''][^>\s]*)(.*?>)/$1.mangle_anchor($2).$3/iegs;
}

sub transform_meta_tag {
  my($metatext) = shift;
  if ($metatext =~ m/equiv\s*=\s*["'\'']?refresh/is) {
    $metatext =~ s/(URL\s*=\s*)([^>:\s"]*)/$1.mangle_anchor($2)/ies;
  }
  return($metatext);
}

if ($type eq "css") {

  s/(url\s*\(\s*)(["'\'']?)([^)\2]+)\2(\s*\))/$1.$2.mangle_anchor($3).$2.$4/ges;

} else {

  handle_simple_tag(qw(a href));
  handle_simple_tag(qw(link href));
  
  handle_simple_tag(qw(img src));
  handle_simple_tag(qw(frame src));
  
  handle_simple_tag(qw(body background));
  
  # Meta tags need their own sub since the matching is a little different
  s/<META\s+[^>]*>/transform_meta_tag($&)/iges;
  
  handle_simple_tag(qw(applet archive));

}
  
  print $_;

        '
}

#
# Convert links in an HTML document
#

builtin_html2html() {
	# $1 - path of the HTML document
	# Emits the document with rewritten links, followed by a trailing HTML
	# comment naming the dwww version and date, then ends the script with
	# exit status 0.
	convert_html_anchors "$1" "html"
	printf '%s\n' "<!-- Generated by dwww $dwww_version on $DATE -->"
	exit 0
}

#
# Convert links inside CSS url() function
#

builtin_css2css() {
	# $1 - path of the style sheet
	# Emits the style sheet with rewritten url() references, followed by a
	# trailing CSS comment naming the dwww version and date, then ends the
	# script with exit status 0.
	convert_html_anchors "$1" "css"
	printf '%s\n' "/* Generated by dwww $dwww_version on $DATE */"
	exit 0
}

################################################################
#
# Main program
#


# Both arguments are mandatory: $1 is the document type, $2 its location.
if [ -z "$1" ] || [ -z "$2" ]
then
	echo "Error: invalid arguments" 1>&2
	echo "Usage: $0 <type> <location>" 1>&2
	exit 1
fi

# With DWWW_DEBUG set, log the request and trace every command to stderr.
if [ -n "$DWWW_DEBUG" ] ; then
	echo "--- dwww-convert $1 $2 ---" 1>&2
	set -x
fi

type="$1"
file="$2"
orig_file="$file"
# anchor=$(echo $file | sed -e "s/^*\(#.*$\)/$1/")
# Drop everything from the first "#" on (the fragment).  Parameter expansion
# is used instead of an unquoted "echo $file | sed" so that whitespace and
# glob characters in the location survive unchanged.
file="${file%%#*}"

# Reconcile the requested type with what the location really is.
if [ ! -d "$file" ] ; then
	# Not a directory: a "dir" request is downgraded to a "file" request.
	if [ "$type" = dir ] ; then
		type="file"
	fi
elif [ "$type" != "dir" ] ; then
	# A directory requested under some other type: list it, unless it
	# holds an HTML index (optionally .en and/or compressed), in which
	# case that index is served instead.
	type="dir"

	for lang in "" ".en"; do
		for comp in "" ".gz" ".bz2"; do
			for suff in ".html" ".htm"; do
				index_file="$file/index$suff$lang$comp"
				[ -f "$index_file" ] || continue
				file="$index_file"
				type="html"
				break 3
			done
		done
	done
fi

# Info documents can only be converted when the info2www CGI program is
# installed and executable; otherwise answer with an error page.
if [ "$type" = "info" ] && [ ! -x /usr/lib/cgi-bin/info2www ] ; then
    ErrorMsg "500 dwww error" \
             "Cannot convert info files" \
             "dwww could not find the <B>info2www</B> program, which is required to convert the info files"
fi



# Resolve the location to an existing file where possible.
#
# "runman" requests arrive as <name>/<section> and are looked up with man.
if [ "$type" = "runman" ]; then

	type="man"
	name="${orig_file%%/*}"
	section="${orig_file##*/}"
	# "--" keeps a name that starts with "-" from being parsed as an option.
	file=$(man --location -e "$section" -- "$name" | sed 's/ .*//;1q')

	if [ -z "$file" ] || [ ! -f "$file" ] ; then
		# The name comes from the request, so it is HTML-escaped before
		# being placed into the error page.
		man_ref=$(printf '%s' "$name($section)" |
			sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g' -e 's/"/\&quot;/g')
		ErrorMsg "404 Man page not found" \
		         "Man page not found" \
		         "dwww could not find the man page $man_ref"
	fi


# Check to see if html file exists
elif { [ "$type" = "html" ] || [ "$type" = "text/html" ]; } && [ ! -e "$file" ]; then
	# A link may have referred to a .html file
	# when only a .html.gz file exists.  So check
	# to see if alternate file exists, and use
	# that one if it does
	#
	# Strip everything from the first ".htm" on.  Parameter expansion keeps
	# whitespace and glob characters in the path intact.
	basefile="${file%%.htm*}"

	for lang in "" ".en"; do
		for comp in "" ".gz" ".bz2"; do
			for suff in ".html" ".htm"; do
				if [ -f  "$basefile$suff$lang$comp" ] ; then
					file="$basefile$suff$lang$comp"
					type="html"
					break 3
				fi
			done
		done
	done

# Check for compressed/translated file
elif [ ! -e "$file" ]; then
	for lang in "" ".en"; do
		for comp in "" ".gz" ".bz2"; do
			# "$comp" != "$lang" skips the combination where both are
			# empty, i.e. the name already known not to exist.
			if [ "$comp" != "$lang" ] && [ -f  "$file$lang$comp" ] ; then
				file="$file$lang$comp"
				break 2
			fi
		done
	done
fi

# Access control.  The requested location is echoed back in the error pages,
# so it is HTML-escaped first: it comes straight from the request.
safe_orig_file=$(printf '%s' "$orig_file" |
	sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g' -e 's/"/\&quot;/g')

if [ -e "$file" ] ; then

	# real_file: all symlinks resolved; file: normalized, symlinks kept.
	real_file=$(realpath "$file")
	file=$(realpath -s "$file")

	if badfile "$real_file"  ; then
		# The resolved file is outside the document path; it is still
		# served if the request path is an acceptable symlink.
		# NOTE(review): -s tests for "exists and is not empty"; a symlink
		# test may have been intended here -- confirm before changing.
		if [ ! -s "$file" ] || bad_symlink "$file" "$real_file" ; then
			ErrorMsg "403 Access denied" \
			         "Access denied" \
			         "dwww will not allow you to read the file $safe_orig_file"
		fi
	fi

else

	real_file=""
	file=$(realpath -s "$file")

	# Report "not found" only for locations inside the document path;
	# everything else is answered with "access denied".
	if badfile "$file" "symlink" ; then
		ErrorMsg "403 Access denied" \
		         "Access denied" \
		         "dwww will not allow you to read the file $safe_orig_file"
	else
		ErrorMsg "404 File not found" \
		         "File not found" \
		         "dwww could not find the file $safe_orig_file"
	fi

fi


# For anything but directories, let file(1) tell how the document is
# compressed.  This selects the command used to read it ($decompress) and
# the file name with the compression suffix removed ($base_name).
if [ "$type" = "dir" ] || [ ! -e "$real_file" ]; then
	base_name=$(basename "$real_file")
else
	magic=$(file -Lb "$real_file")
	case "$magic" in
		gzip*|GNUzip*)
			decompress="zcat"
			base_name=$(basename "$real_file" .gz)
			;;
		bzip2*)
			decompress="bzcat"
			base_name=$(basename "$real_file" .bz2)
			;;
		*)
			decompress="cat"
			base_name=$(basename "$real_file")
			;;
	esac
fi



# For generic "file" requests, work out the document type from the file
# name extension, falling back to the "file" command.

if [ "$type" = "file" ] && [ -e "$real_file" ] ; then

	# Lower-cased name without compression suffix, and the part after its
	# last dot.  A name without any dot has no extension.
	noslash=$(printf '%s\n' "$base_name" | sed -e "s/[^\/]*\///g" | tr A-Z a-z )
	extension=$(printf '%s\n' "$noslash" | sed -e "s/[^\.]*\.//g" )

	if [ "$noslash" = "$extension" ] ; then
		extension=""
	fi

	# then guess the file type depending on the extension
	# if the extension is of size 0 or more than 4
	# text is assumed.

	case "$extension" in
		txt|text)
			type=text/plain
			;;
		htm|html)
			type=text/html
			;;
		css)
			type=text/css
			;;
		[1-9]|[1-9][a-z]*)
			if [ "${file#*/man/}" != "${file}" ] ; then
				# if file is located in */man/* directory, assume it's a man page
				type=runman
			else
				# Otherwise apply the same length rule as below: more
				# than 4 characters means text/plain, shorter extensions
				# are left for the "file" command.
				case "$extension" in
					?????*)
						type=text/plain
						;;
				esac
			fi
			;;
		""|?????*)
			# extension is empty or its size is > 4
			type=text/plain
			;;
		*)
			;;
	esac

	# Still undecided: ask the "file" command for the MIME type of the
	# uncompressed content and keep only the leading type/subtype part.
	if [ "$type" = "file" ] ; then
		tmptype=$("$decompress" "$real_file" | file -bi -)
		type="${tmptype%%[!a-zA-Z/-]*}"
	fi
fi



# Choose the converter for the document type.  Converted output is served
# as text/html unless stated otherwise; a type without a converter is
# passed through under its own MIME type.
mime_type="text/html" #default

case "$type" in
	html|text/html)
		converter="$DWWW_HTML2HTML"
		type=html
		;;
	man|runman)
		converter="$DWWW_MAN2HTML"
		type=man
		;;
	dir)
		converter="$DWWW_DIR2HTML"
		;;
	info)
		converter="$DWWW_INFO2HTML"
		;;
	text/plain)
		converter="$DWWW_TEXT2HTML"
		;;
	text/css)
		converter="$DWWW_CSS2CSS"
		mime_type="$type"
		;;
	*)
		converter=""
		mime_type="$type"
		;;
esac

# No charset is announced for HTML documents.  Everything else is declared
# as ISO-8859-1, except changelog.Debian, which is declared as UTF-8.
if [ "$type" = "html" ] ; then
	mime_charset=""
elif [ "$base_name" = "changelog.Debian" ] ; then
	mime_charset="UTF-8"
else
	mime_charset="ISO-8859-1"
fi


# Types without a converter are sent as they are (decompressed) under their
# own MIME type, and the script ends here.
if [ -z "$converter" ]; then
	echo "Content-type: $mime_type"
	PrintLastMod "$real_file"
	echo "Content-Disposition: inline; filename=\"$base_name\""
	echo ""
	"$decompress" "$real_file"
	exit 0
fi

# With DWWW_USEFILEURL set, HTML documents are delivered unconverted
# (only decompressed), and the script ends here.

if [ "$type" = "html" ] && [ -n "$DWWW_USEFILEURL" ]; then
	content_type="Content-type: text/html"
	if [ -n "$mime_charset" ] ; then
		content_type="$content_type; charset=$mime_charset"
	fi
	echo "$content_type"
	echo ""
	"$decompress" "$real_file"
	exit 0
fi



# Response headers for converted output.
content_type="Content-type: $mime_type"
if [ -n "$mime_charset" ] ; then
	content_type="$content_type; charset=$mime_charset"
fi
echo "$content_type"
PrintLastMod "$real_file"

# A document that was converted to HTML from some other format is offered
# under its original name with ".html" appended.
disposition_name="$base_name"
if [ "$mime_type" = 'text/html' ] && [ "$type" != "html" ]; then
	disposition_name="$base_name.html"
fi
echo "Content-Disposition: inline; filename=\"$disposition_name\""
echo ""

# Arguments for the converter: the built-in man converter also receives the
# original request, which it uses as the page title.
if [ "$converter" = builtin_man2html ] ; then
	set -- "$file" "$orig_file"
else
	set -- "$file"
fi

case "$DWWW_USE_CACHE" in
	[Yy][Ee][Ss]*)
		# Store the file in the cache unless it is already stored
		if ! dwww-cache --lookup "$type" "$real_file" ; then
			"$converter" "$@" | dwww-cache --store "$type" "$real_file"
		fi
		;;
	*)
		"$converter" "$@"
		;;
esac

exit 0
