/*
    BFilter - a smart ad-filtering web proxy
    Copyright (C) 2002-2006  Joseph Artsimovich <joseph_a@mail.ru>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

#include "pch.h"

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include "TextContentValidator.h"
#include "RequestTag.h"
#include "HttpRequestMetadata.h"
#include "HttpResponseMetadata.h"
#include "HttpHeadersCollection.h"
#include "HttpHeader.h"

using namespace std;

// Maximum number of leading body bytes that are sampled before deciding
// whether the response really is text.  File-local and never modified,
// hence const.
static const unsigned SIZE_TO_CHECK = 1024;

// Creates a validator bound to the given response filter chain (the chain
// is forwarded to ResponseFilterBase) with all sampling counters at zero.
TextContentValidator::TextContentValidator(ResponseFilterChain& chain)
:	ResponseFilterBase(chain),
	m_bytesChecked(0), // number of body bytes examined so far
	m_numNulls(0), // how many of them were '\0'
	m_numSpaces(0), // spaces seen, not counting a space that directly follows another one
	m_numASCII(0) // bytes with the high bit clear (this includes '\0')
{
}

// No explicit cleanup is performed here.
TextContentValidator::~TextContentValidator()
{
}

// Holds on to the response metadata instead of passing it downstream right
// away.  processBodyData() outputs it later, once enough of the body has
// been sampled (possibly after removing the Content-Type header).
void
TextContentValidator::processMetadata(auto_ptr<HttpResponseMetadata> metadata)
{
	// Content-Type is checked in FilterTryList::tryTextContentValidator
	m_ptrMetadata = metadata;
}

/*
Two things may indicate that content is plain text:
1. Spaces are common. (ordinary prose)
2. The bytes are mostly ASCII. (e.g. source code)
Additionally, the presence of null characters indicates binary content.
*/

// Samples the beginning of the response body to verify that it looks
// like text.
//
// While the metadata is still being held, incoming data is scanned (up to
// SIZE_TO_CHECK bytes in total) and moved into m_bufferedData.  As soon as
// SIZE_TO_CHECK bytes have been examined, or eof is reached, the verdict
// is made: if isText() fails, the Content-Type header is removed and the
// request is flagged as RESPONSE_MODIFIED.  The metadata and everything
// buffered so far are then output.
//
// When no metadata is held, data is passed straight to outputBodyData().
void
TextContentValidator::processBodyData(SplittableBuffer& data, bool eof)
{
	if (!m_ptrMetadata.get()) {
		// Nothing left to decide; act as a plain pass-through.
		// NOTE(review): presumably m_ptrMetadata becomes empty when it is
		// handed to outputMetadata() below (auto_ptr-style transfer) --
		// confirm against the class declaration.
		outputBodyData(data, eof);
		return;
	}

	// The previous byte is tracked per call only, so that a run of
	// spaces within this chunk is counted once.
	char prev = '\0';
	SplittableBuffer::ByteIterator pos(data.begin());
	while (!pos.isAtRightBorder() && m_bytesChecked < SIZE_TO_CHECK) {
		const char cur = *pos;
		const bool high_bit_clear = (static_cast<signed char>(cur) >= 0);
		if (high_bit_clear) {
			++m_numASCII;
			if (cur == '\0') {
				++m_numNulls;
			} else if (cur == ' ' && prev != ' ') {
				++m_numSpaces;
			}
		}
		prev = cur;
		++pos;
		++m_bytesChecked;
	}

	// Keep the data back until the verdict is known.
	m_bufferedData.appendDestructive(data);

	const bool verdict_ready = (eof || m_bytesChecked >= SIZE_TO_CHECK);
	if (!verdict_ready) {
		return;
	}

	if (!isText()) {
		// Not text after all: drop the declared content type.
		m_ptrMetadata->headers().removeHeader(BString("Content-Type"));
		getRequestTag()->flags().set(RequestTag::RESPONSE_MODIFIED);
	}

	outputMetadata(m_ptrMetadata);
	outputBodyData(m_bufferedData, eof);
}

// Decides, from the statistics gathered so far, whether the sampled bytes
// look like text.
//
// Returns false if any null byte was seen.  Otherwise returns true when
// fewer than 50 bytes were checked, when counted spaces exceed 10% of the
// checked bytes, or when ASCII bytes exceed 90% of them; false in all
// remaining cases.
bool
TextContentValidator::isText() const
{
	// A single null byte is enough to call the content binary.
	if (m_numNulls != 0) {
		return false;
	}

	// Too small a sample for the ratios below to be meaningful;
	// give the content the benefit of the doubt.
	if (m_bytesChecked < 50) {
		return true;
	}

	const bool spaces_are_common = (m_numSpaces > m_bytesChecked * 0.1);
	const bool mostly_ascii = (m_numASCII > m_bytesChecked * 0.9);
	return spaces_are_common || mostly_ascii;
}
