/*
    BFilter - a smart ad-filtering web proxy
    Copyright (C) 2002-2005  Joseph Artsimovich <joseph_a@mail.ru>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

#include "pch.h"

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include "HtmlEscaper.h"
#include "StringUtils.h"
#include "BString.h"
#include <algorithm>
#include <string>
#include <cstring>

// Table of the named character references understood by unescape():
// amp, gt, lt, quot and the ISO 8859-1 (Latin-1) entities nbsp .. yuml.
//
// Each record is written as one string: the first byte (an octal escape)
// is the character the entity stands for, the remaining bytes are the
// entity name without the leading '&' and the trailing ';'.
//
// The records MUST stay sorted by entity name in plain byte order (so upper
// case names come before lower case ones): unescape() looks names up with
// std::lower_bound using EntityRecordComparator, which compares everything
// after the first byte of a record.
HtmlEscaper::EntityRecord const HtmlEscaper::m_entityTable[] = {
	"\306" "AElig",
	"\301" "Aacute",
	"\302" "Acirc",
	"\300" "Agrave",
	"\305" "Aring",
	"\303" "Atilde",
	"\304" "Auml",
	"\307" "Ccedil",
	"\320" "ETH",
	"\311" "Eacute",
	"\312" "Ecirc",
	"\310" "Egrave",
	"\313" "Euml",
	"\315" "Iacute",
	"\316" "Icirc",
	"\314" "Igrave",
	"\317" "Iuml",
	"\321" "Ntilde",
	"\323" "Oacute",
	"\324" "Ocirc",
	"\322" "Ograve",
	"\330" "Oslash",
	"\325" "Otilde",
	"\326" "Ouml",
	"\336" "THORN",
	"\332" "Uacute",
	"\333" "Ucirc",
	"\331" "Ugrave",
	"\334" "Uuml",
	"\335" "Yacute",
	"\341" "aacute",
	"\342" "acirc",
	"\264" "acute",
	"\346" "aelig",
	"\340" "agrave",
	"\046" "amp",
	"\345" "aring",
	"\343" "atilde",
	"\344" "auml",
	"\246" "brvbar",
	"\347" "ccedil",
	"\270" "cedil",
	"\242" "cent",
	"\251" "copy",
	"\244" "curren",
	"\260" "deg",
	"\367" "divide",
	"\351" "eacute",
	"\352" "ecirc",
	"\350" "egrave",
	"\360" "eth",
	"\353" "euml",
	"\275" "frac12",
	"\274" "frac14",
	"\276" "frac34",
	"\076" "gt",
	"\355" "iacute",
	"\356" "icirc",
	"\241" "iexcl",
	"\354" "igrave",
	"\277" "iquest",
	"\357" "iuml",
	"\253" "laquo",
	"\074" "lt",
	"\257" "macr",
	"\265" "micro",
	"\267" "middot",
	"\240" "nbsp",
	"\254" "not",
	"\361" "ntilde",
	"\363" "oacute",
	"\364" "ocirc",
	"\362" "ograve",
	"\252" "ordf",
	"\272" "ordm",
	"\370" "oslash",
	"\365" "otilde",
	"\366" "ouml",
	"\266" "para",
	"\261" "plusmn",
	"\243" "pound",
	"\042" "quot",
	"\273" "raquo",
	"\256" "reg",
	"\247" "sect",
	"\255" "shy",
	"\271" "sup1",
	"\262" "sup2",
	"\263" "sup3",
	"\337" "szlig",
	"\376" "thorn",
	"\327" "times",
	"\372" "uacute",
	"\373" "ucirc",
	"\371" "ugrave",
	"\250" "uml",
	"\374" "uuml",
	"\375" "yacute",
	"\245" "yen",
	"\377" "yuml" // y with diaeresis (255): last of the Latin-1 entities
};

// Compares entity records by name only.
//
// Both arguments point to the first byte of a record. That first byte (the
// character the entity stands for) is skipped; the remaining
// sizeof(EntityRecord)-1 bytes are compared with memcmp, so both buffers must
// be full-size records.
struct HtmlEscaper::EntityRecordComparator
{
	// Three-way comparison: negative, zero or positive when the name in
	// lhs sorts before, equal to, or after the name in rhs.
	static int compare(char const* lhs, char const* rhs) {
		return std::memcmp(lhs+1, rhs+1, sizeof(EntityRecord)-1);
	}

	// Strict weak ordering ("less than") for use with std::lower_bound.
	// Declared const: the functor has no state, so it must also be
	// callable through a const object or reference.
	bool operator()(char const* lhs, char const* rhs) const {
		return compare(lhs, rhs) < 0;
	}
};


// Returns a copy of str in which the characters <, >, & and " are replaced
// by &lt;, &gt;, &amp; and &quot; respectively. Every other byte is copied
// unchanged. When str contains none of those characters, str itself is
// returned without building a new string.
std::string
HtmlEscaper::escape(std::string const& str)
{
	static char const special[] = "<>&\"";

	std::string::size_type hit = str.find_first_of(special);
	if (hit == std::string::npos) {
		// Nothing needs escaping.
		return str;
	}

	std::string escaped;
	escaped.reserve(str.length());

	// Everything before index `copied` has already been written to `escaped`.
	std::string::size_type copied = 0;
	do {
		// Plain text between the previous special character and this one.
		escaped.append(str, copied, hit - copied);
		switch (str[hit]) {
			case '<':
				escaped.append("&lt;");
				break;
			case '>':
				escaped.append("&gt;");
				break;
			case '&':
				escaped.append("&amp;");
				break;
			case '"':
				escaped.append("&quot;");
				break;
		}
		copied = hit + 1;
		hit = str.find_first_of(special, copied);
	} while (hit != std::string::npos);

	// Tail after the last special character (possibly empty).
	escaped.append(str, copied, std::string::npos);
	return escaped;
}

// Decodes HTML character references in str and returns the result.
//
// Two forms are recognized, both of which must be terminated by ';':
//   - named references listed in m_entityTable, e.g. "&amp;" or "&nbsp;";
//   - decimal numeric references with a value below 256, e.g. "&#169;".
// Each recognized reference is replaced by a single byte. Anything else
// that starts with '&' is not a reference: the '&' is copied literally and
// scanning resumes with the very next character, so a real reference that
// follows closely (as in "&a&lt;") is still decoded.
//
// When str contains no '&' at all, str itself is returned.
std::string
HtmlEscaper::unescape(std::string const& str)
{
	std::string::size_type const len = str.length();
	std::string::size_type pos = 0;
	std::string res;
	while (pos < len) {
		std::string::size_type const amp_pos = str.find('&', pos);
		if (amp_pos == std::string::npos) {
			if (pos == 0) {
				// No ampersand anywhere: nothing to decode.
				return str;
			}
			res.append(str, pos, std::string::npos);
			break;
		}
		if (pos == 0) {
			// First ampersand found: the result is never longer than the input.
			res.reserve(len);
		}
		// Plain text preceding the ampersand.
		res.append(str, pos, amp_pos - pos);

		// The terminating ';' has to appear early enough for '&' plus the
		// reference body to fit into an EntityRecord with a trailing zero.
		std::string::size_type const limit = std::min(amp_pos+sizeof(EntityRecord), len);
		std::string::size_type sc_pos = amp_pos + 1;
		while (sc_pos < limit && str[sc_pos] != ';') {
			++sc_pos;
		}

		bool decoded = false;
		if (sc_pos < limit) {
			// str[sc_pos] is the terminating ';'.
			if (str[amp_pos+1] == '#') {
				// Numeric reference. Require at least one character between
				// "&#" and ';' so that "&#;" is not treated as a number.
				if (sc_pos > amp_pos + 2) {
					// NOTE(review): parseUnsigned is assumed to move eptr to the
					// place where parsing stopped; eptr left at the ';' then
					// means the whole body was consumed -- confirm in StringUtils.
					const char* eptr = str.data()+sc_pos;
					unsigned int const ch = StringUtils::parseUnsigned<unsigned int>(str.data()+amp_pos+2, eptr);
					if (eptr == str.data()+sc_pos && ch < 256) {
						res.append(1, static_cast<unsigned char>(ch));
						decoded = true;
					}
				}
			} else {
				// Named reference. Build a zero-padded record whose first byte
				// is the (ignored) '&' followed by the name, then binary-search
				// the sorted table for a record with the same name.
				EntityRecord search = { 0 };
				std::memcpy(search, str.data()+amp_pos, sc_pos-amp_pos);
				const EntityRecord* end = &m_entityTable[sizeof(m_entityTable)/sizeof(*m_entityTable)];
				const EntityRecord* it = std::lower_bound(&m_entityTable[0],
					end, search, EntityRecordComparator());
				if (it != end && EntityRecordComparator::compare(search, *it) == 0) {
					// The first byte of a record is the decoded character.
					res.append(1, **it);
					decoded = true;
				}
			}
		}

		if (decoded) {
			// Continue after the ';'.
			pos = sc_pos + 1;
		} else {
			// Not a reference: keep the '&' and re-examine what follows it,
			// so no character (in particular no later '&') is skipped.
			res.append(1, '&');
			pos = amp_pos + 1;
		}
	}
	return res;
}

// BString overload of escape(): converts str to a std::string, escapes it
// with the std::string overload and wraps the result in a new BString.
BString
HtmlEscaper::escape(BString const& str)
{
	std::string const& plain = str.toStdString();
	return BString(escape(plain));
}

// BString overload of unescape(): converts str to a std::string, decodes it
// with the std::string overload and wraps the result in a new BString.
BString
HtmlEscaper::unescape(BString const& str)
{
	std::string const& encoded = str.toStdString();
	return BString(unescape(encoded));
}
