/* iksemel (XML parser for Jabber)
** Copyright (C) 2000-2003 Gurer Ozen <madcat@e-kolay.net>
** This code is free software; you can redistribute it and/or
** modify it under the terms of GNU Lesser General Public License.
*/

#include "common.h"
#include "iksemel.h"

enum cons_e {
	C_CDATA = 0,
	C_TAG_START,
	C_TAG,
	C_TAG_END,
	C_ATTRIBUTE,
	C_ATTRIBUTE_1,
	C_ATTRIBUTE_2,
	C_VALUE,
	C_VALUE_APOS,
	C_VALUE_QUOT,
	C_WHITESPACE,
	C_ENTITY,
	C_COMMENT,
	C_COMMENT_1,
	C_COMMENT_2,
	C_COMMENT_3,
	C_MARKUP,
	C_MARKUP_1,
	C_SECT,
	C_SECT_CDATA,
	C_SECT_CDATA_1,
	C_SECT_CDATA_2,
	C_SECT_CDATA_3,
	C_SECT_CDATA_4,
	C_SECT_CDATA_C,
	C_SECT_CDATA_E,
	C_SECT_CDATA_E2,
	C_PI
};

/* If you add a variable here, don't forget to update iks_parser_reset. */
struct iksparser_struct {
	/* stack the parser was allocated from (iks_sax_extend), NULL otherwise */
	ikstack *s;
	/* opaque pointer handed to every hook */
	void *user_data;
	iksTagHook *tagHook;
	iksCDataHook *cdataHook;
	/* called from iks_parser_reset and iks_parser_delete */
	iksDeleteHook *deleteHook;
	/* parser context */
	/* private copy of the input; sax_core modifies it in place */
	char *buffer;
	/* allocated size of buffer */
	size_t bufmax;
	/* number of bytes of buffer currently in use */
	unsigned int pos;

	/* current state, and the state to resume after C_WHITESPACE */
	enum cons_e context;
	enum cons_e oldcontext;

	/* non-zero while inside a tag: iks_parse then preserves the buffer
	** contents starting at offset keepstart across calls */
	int keep;
	unsigned int keepstart;

	enum ikstagtype tagtype;

	/* capacity of atts, counted in name/value pairs */
	unsigned int attmax;
	/* index of the next free slot in atts (advances by 2 per attribute) */
	unsigned int attcur;
	/* non-zero while atts holds pointers into buffer for the current tag,
	** which iks_parse must adjust when the buffer moves */
	int attflag;
	/* alternating attribute name / value pointers into buffer */
	char **atts;
	/* only cleared by iks_parser_reset in this file; not otherwise used here */
	int valflag;

	/* entity name collected so far in C_ENTITY, and its length */
	unsigned int entpos;
	char entity[8];

	/* counters reported by iks_nr_bytes and iks_nr_lines */
	unsigned long nr_bytes;
	unsigned long nr_lines;

	/* expected length of the multi-byte UTF-8 sequence being read (0 when
	** none is in progress) and the number of its bytes seen so far */
	int uni_max;
	int uni_len;
};

/* Creates a zero-initialised SAX parser with iks_malloc and stores the
** user pointer and the tag / character-data hooks in it.
** Returns NULL when the allocation fails.
*/
iksparser *
iks_sax_new (void *user_data, iksTagHook *tagHook, iksCDataHook *cdataHook)
{
	iksparser *parser = iks_malloc (sizeof (iksparser));

	if (parser) {
		memset (parser, 0, sizeof (iksparser));
		parser->user_data = user_data;
		parser->tagHook = tagHook;
		parser->cdataHook = cdataHook;
	}
	return parser;
}

/* Creates a zero-initialised SAX parser inside the stack s and stores the
** stack, the user pointer and the three hooks in it.
** Returns NULL when iks_stack_alloc fails.
*/
iksparser *
iks_sax_extend (ikstack *s, void *user_data, iksTagHook *tagHook, iksCDataHook *cdataHook, iksDeleteHook *deleteHook)
{
	iksparser *parser = iks_stack_alloc (s, sizeof (iksparser));

	if (parser) {
		memset (parser, 0, sizeof (iksparser));
		parser->s = s;
		parser->user_data = user_data;
		parser->tagHook = tagHook;
		parser->cdataHook = cdataHook;
		parser->deleteHook = deleteHook;
	}
	return parser;
}

/* Returns the user pointer that was given when the parser was created. */
void *
iks_user_data (iksparser *prs)
{
	void *data = prs->user_data;

	return data;
}

/* Returns the number of bytes consumed by the parser since it was
** created or last reset.
*/
unsigned long
iks_nr_bytes (iksparser *prs)
{
	unsigned long count = prs->nr_bytes;

	return count;
}

/* Returns the number of '\n' characters consumed by the parser since it
** was created or last reset.
*/
unsigned long
iks_nr_lines (iksparser *prs)
{
	unsigned long count = prs->nr_lines;

	return count;
}

/* XML whitespace tests. The expansions are fully parenthesised so that they
** can be negated or combined with other operators safely. The argument is
** evaluated several times, so it must not have side effects.
*/
#define IS_WHITESPACE(x) (' ' == (x) || '\t' == (x) || '\r' == (x) || '\n' == (x))
#define NOT_WHITESPACE(x) (' ' != (x) && '\t' != (x) && '\r' != (x) && '\n' != (x))

/* Runs the SAX state machine over len bytes starting at buf.
**
** buf must point inside prs->buffer: tag names and attribute strings are
** handed to the hooks as pointers into that buffer, and the bytes that
** delimit them are overwritten with '\0' in place.
**
** Every byte is first validated as UTF-8 (0x00, 0xFE and 0xFF are rejected,
** continuation bytes are counted against the lead byte); bytes belonging to
** a multi-byte sequence never reach the markup state machine.
**
** Returns IKS_OK when the whole chunk was consumed, IKS_BADXML on malformed
** input, IKS_NOMEM when the attribute array cannot be allocated, or the
** first non-IKS_OK value returned by a hook.
*/
static enum ikserror
sax_core (iksparser *prs, char *buf, int len)
{
	enum ikserror err;
	int pos = 0, old = 0, re;
	unsigned char c;

	while (pos < len) {
		/* re != 0 asks for the same byte to be fed to the new state */
		re = 0;
		c = buf[pos];
		if (0 == c || 0xFE == c || 0xFF == c) return IKS_BADXML;
		if (prs->uni_max) {
			/* inside a multi-byte sequence: only 10xxxxxx is allowed */
			if ((c & 0xC0) != 0x80) return IKS_BADXML;
			prs->uni_len++;
			if (prs->uni_len == prs->uni_max) prs->uni_max = 0;
			goto cont;
		} else {
			if (c & 0x80) {
				/* lead byte: derive the sequence length from its high bits */
				unsigned char mask;
				if ((c & 0x60) == 0x40) {
					prs->uni_max = 2;
					mask = 0x1F;
				} else if ((c & 0x70) == 0x60) {
					prs->uni_max = 3;
					mask = 0x0F;
				} else if ((c & 0x78) == 0x70) {
					prs->uni_max = 4;
					mask = 0x07;
				} else if ((c & 0x7C) == 0x78) {
					prs->uni_max = 5;
					mask = 0x03;
				} else if ((c & 0x7E) == 0x7C) {
					prs->uni_max = 6;
					mask = 0x01;
				} else {
					return IKS_BADXML;
				}
				/* a lead byte without payload bits is rejected */
				if ((c & mask) == 0) return IKS_BADXML;
				prs->uni_len = 1;
				goto cont;
			}
		}

		switch (prs->context) {
			case C_CDATA:
				if ('&' == c) {
					buf[pos] = '\0';
					if (old < pos && prs->cdataHook) {
						err = prs->cdataHook (prs->user_data, &buf[old], pos - old);
						if (IKS_OK != err) return err;
					}
					prs->context = C_ENTITY;
					prs->entpos = 0;
					break;
				}
				if ('<' == c) {
					if (old < pos && prs->cdataHook) {
						err = prs->cdataHook (prs->user_data, &buf[old], pos - old);
						if (IKS_OK != err) return err;
					}
					/* remember where the tag starts so that iks_parse keeps it */
					prs->keepstart = buf - prs->buffer + pos;
					prs->keep = 1;
					prs->context = C_TAG_START;
				}
				break;

			case C_TAG_START:
				prs->context = C_TAG;
				if ('/' == c) {
					prs->tagtype = IKS_CLOSE;
					/* skip the '/' so that keepstart + 1 is the tag name */
					prs->keepstart++;
					break;
				}
				if ('?' == c) {
					prs->context = C_PI;
					break;
				}
				if ('!' == c) {
					prs->context = C_MARKUP;
					break;
				}
				prs->tagtype = IKS_OPEN;
				break;

			case C_TAG:
				if (IS_WHITESPACE(c)) {
					buf[pos] = '\0';
					if (IKS_CLOSE == prs->tagtype)
						prs->oldcontext = C_TAG_END;
					else
						prs->oldcontext = C_ATTRIBUTE;
					prs->context = C_WHITESPACE;
					break;
				}
				if ('/' == c) {
					if (IKS_CLOSE == prs->tagtype) return IKS_BADXML;
					buf[pos] = '\0';
					prs->tagtype = IKS_SINGLE;
					prs->context = C_TAG_END;
					break;
				}
				if ('>' == c) {
					prs->context = C_TAG_END;
					re = 1;
				}
				break;

			case C_TAG_END:
				if (c != '>') return IKS_BADXML;
				buf[pos] = '\0';
				if (prs->tagHook) {
					char **tmp;
					if (prs->attcur == 0) tmp = NULL; else tmp = prs->atts;
					err = prs->tagHook (prs->user_data, prs->buffer + prs->keepstart + 1, tmp, prs->tagtype);
					if (IKS_OK != err) return err;
				}
				prs->attcur = 0;
				prs->attflag = 0;
				prs->keep = 0;
				prs->context = C_CDATA;
				old = pos + 1;
				break;

			case C_ATTRIBUTE:
				if ('/' == c) {
					prs->tagtype = IKS_SINGLE;
					prs->context = C_TAG_END;
					break;
				}
				if ('>' == c) {
					prs->context = C_TAG_END;
					re = 1;
					break;
				}
				/* The array always has one slot more than 2 * attmax, so the
				** NULL terminator written in C_ATTRIBUTE_2 fits even when
				** every name/value slot is in use. */
				if (!prs->atts) {
					prs->atts = iks_malloc (sizeof(char *) * (2 * 12 + 1));
					if (!prs->atts) return IKS_NOMEM;
					memset (prs->atts, 0, sizeof(char *) * (2 * 12 + 1));
					prs->attmax = 12;
					prs->attcur = 0;
				} else if (prs->attcur >= (prs->attmax * 2)) {
					unsigned int newmax = prs->attmax + 12;
					void *tmp;

					tmp = iks_malloc (sizeof(char *) * (2 * newmax + 1));
					if (!tmp) return IKS_NOMEM;
					memset (tmp, 0, sizeof(char *) * (2 * newmax + 1));
					memcpy (tmp, prs->atts, sizeof(char *) * prs->attcur);
					iks_free (prs->atts);
					prs->atts = tmp;
					/* commit the new capacity only once the array exists */
					prs->attmax = newmax;
				}
				prs->attflag = 1;
				prs->atts[prs->attcur] = buf + pos;
				prs->context = C_ATTRIBUTE_1;
				break;

			case C_ATTRIBUTE_1:
				if ('=' == c) {
					buf[pos] = '\0';
					prs->context = C_VALUE;
				}
				break;

			case C_ATTRIBUTE_2:
				if ('/' == c) {
					prs->tagtype = IKS_SINGLE;
					prs->atts[prs->attcur] = NULL;
					prs->context = C_TAG_END;
					break;
				}
				if ('>' == c) {
					prs->atts[prs->attcur] = NULL;
					prs->context = C_TAG_END;
					re = 1;
					break;
				}
				prs->context = C_ATTRIBUTE;
				re = 1;
				break;

			case C_VALUE:
				/* the value starts right after the opening quote */
				prs->atts[prs->attcur + 1] = buf + pos + 1;
				if ('\'' == c) {
					prs->context = C_VALUE_APOS;
					break;
				}
				if ('"' == c) {
					prs->context = C_VALUE_QUOT;
					break;
				}
				return IKS_BADXML;

			case C_VALUE_APOS:
				if ('\'' == c) {
					buf[pos] = '\0';
					prs->oldcontext = C_ATTRIBUTE_2;
					prs->context = C_WHITESPACE;
					prs->attcur += 2;
				}
				break;

			case C_VALUE_QUOT:
				if ('"' == c) {
					buf[pos] = '\0';
					prs->oldcontext = C_ATTRIBUTE_2;
					prs->context = C_WHITESPACE;
					prs->attcur += 2;
				}
				break;

			case C_WHITESPACE:
				if (NOT_WHITESPACE(c)) {
					prs->context = prs->oldcontext;
					re = 1;
				}
				break;

			case C_ENTITY:
				if (';' == c) {
					/* unknown entities are replaced by '?' */
					char t = '?';
					prs->entity[prs->entpos] = '\0';
					if (strcmp(prs->entity, "amp") == 0)
						t = '&';
					else if (strcmp(prs->entity, "quot") == 0)
						t = '"';
					else if (strcmp(prs->entity, "apos") == 0)
						t = '\'';
					else if (strcmp(prs->entity, "lt") == 0)
						t = '<';
					else if (strcmp(prs->entity, "gt") == 0)
						t = '>';
					/* the ';' becomes the decoded character and starts the next run */
					buf[pos] = t;
					old = pos;
					prs->context = C_CDATA;
				} else {
					prs->entity[prs->entpos++] = buf[pos];
					/* keep room for the terminating '\0' in entity[8] */
					if (prs->entpos > 7) return IKS_BADXML;
				}
				break;

			case C_COMMENT:
				if ('-' != c) return IKS_BADXML;
				prs->context = C_COMMENT_1;
				break;

			case C_COMMENT_1:
				if ('-' == c) prs->context = C_COMMENT_2;
				break;

			case C_COMMENT_2:
				if ('-' == c)
					prs->context = C_COMMENT_3;
				else
					prs->context = C_COMMENT_1;
				break;

			case C_COMMENT_3:
				if ('>' != c) return IKS_BADXML;
				prs->context = C_CDATA;
				old = pos + 1;
				break;

			case C_MARKUP:
				prs->keep = 0;
				if ('[' == c) {
					prs->context = C_SECT;
					break;
				}
				if ('-' == c) {
					prs->context = C_COMMENT;
					break;
				}
				prs->context = C_MARKUP_1;
				/* fall through: this character may already be the closing '>' */

			case C_MARKUP_1:
				if ('>' == c) {
					old = pos + 1;
					prs->context = C_CDATA;
				}
				break;

			case C_SECT:
				if ('C' == c) {
					prs->context = C_SECT_CDATA;
					break;
				}
				return IKS_BADXML;

			case C_SECT_CDATA:
				if ('D' != c) return IKS_BADXML;
				prs->context = C_SECT_CDATA_1;
				break;

			case C_SECT_CDATA_1:
				if ('A' != c) return IKS_BADXML;
				prs->context = C_SECT_CDATA_2;
				break;

			case C_SECT_CDATA_2:
				if ('T' != c) return IKS_BADXML;
				prs->context = C_SECT_CDATA_3;
				break;

			case C_SECT_CDATA_3:
				if ('A' != c) return IKS_BADXML;
				prs->context = C_SECT_CDATA_4;
				break;

			case C_SECT_CDATA_4:
				if ('[' != c) return IKS_BADXML;
				old = pos + 1;
				prs->context = C_SECT_CDATA_C;
				break;

			case C_SECT_CDATA_C:
				if (']' == c) {
					prs->context = C_SECT_CDATA_E;
					if (prs->cdataHook && old < pos) {
						err = prs->cdataHook (prs->user_data, &buf[old], pos - old);
						if (IKS_OK != err) return err;
					}
				}
				break;

			case C_SECT_CDATA_E:
				if (']' == c) {
					prs->context = C_SECT_CDATA_E2;
				} else {
					/* the pending ']' was ordinary data after all */
					if (prs->cdataHook) {
						err = prs->cdataHook (prs->user_data, "]", 1);
						if (IKS_OK != err) return err;
					}
					old = pos;
					prs->context = C_SECT_CDATA_C;
				}
				break;

			case C_SECT_CDATA_E2:
				if ('>' == c) {
					old = pos + 1;
					prs->context = C_CDATA;
				} else if (']' == c) {
					/* "]]]": the oldest ']' is data, while the last two can
					** still turn out to be the start of "]]>" */
					if (prs->cdataHook) {
						err = prs->cdataHook (prs->user_data, "]", 1);
						if (IKS_OK != err) return err;
					}
				} else {
					/* both pending ']' were ordinary data */
					if (prs->cdataHook) {
						err = prs->cdataHook (prs->user_data, "]]", 2);
						if (IKS_OK != err) return err;
					}
					old = pos;
					prs->context = C_SECT_CDATA_C;
				}
				break;

			case C_PI:
				old = pos + 1;
				prs->keep = 0;
				if ('>' == c) prs->context = C_CDATA;
				break;
		}
cont:
		if (0 == re) {
			pos++;
			prs->nr_bytes++;
			if ('\n' == c) prs->nr_lines++;
		}
	}

	/* flush character data that runs up to the end of this chunk */
	err = IKS_OK;
	if (prs->cdataHook && (prs->context == C_CDATA || prs->context == C_SECT_CDATA_C) && old < pos)
		err = prs->cdataHook (prs->user_data, &buf[old], pos - old);
	return err;
}

/* Feeds a chunk of XML to the parser.
**
** data is copied into the parser's private buffer and run through
** sax_core(). When the previous chunk ended inside a tag (prs->keep), the
** already-seen part of that tag is preserved in front of the new data and
** the attribute pointers collected so far are moved along with it.
**
** prs:    parser to feed
** data:   input bytes; NULL is accepted and does nothing
** len:    number of bytes in data; 0 means data is a '\0'-terminated string
** finish: not used by this function
**
** Returns IKS_OK, IKS_NOMEM when the buffer cannot be allocated (the parser
** state is then left untouched), or whatever sax_core() returns.
*/
int
iks_parse (iksparser *prs, const char *data, size_t len, int finish)
{
	char *buf;

	(void) finish;
	if (!data) return IKS_OK;
	if (len == 0) len = strlen (data);

	if (prs->buffer) {
		char *old_base = prs->buffer;
		char *new_base = prs->buffer;
		size_t shift = 0;	/* bytes dropped from the front of the buffer */
		size_t size = 0;	/* kept bytes that precede the new data */
		size_t total;

		if (prs->keep) {
			shift = prs->keepstart;
			size = prs->pos - prs->keepstart;
		}

		total = size + len;
		if (total < len) return IKS_NOMEM;	/* size_t overflow */

		/* Allocate before touching any state, so that a failure leaves
		** the buffer, the offsets and the attribute pointers consistent. */
		if (total > prs->bufmax) {
			new_base = iks_malloc (total);
			if (!new_base) return IKS_NOMEM;
			if (size) memcpy (new_base, old_base + shift, size);
		} else if (size && shift) {
			memmove (old_base, old_base + shift, size);
		}
		memcpy (new_base + size, data, len);

		if (prs->attflag != 0) {
			unsigned int i;

			/* Rebase through the offset inside the old buffer; this must
			** happen before the old buffer is released. */
			for (i = 0; i < prs->attmax * 2; i++) {
				if (prs->atts[i])
					prs->atts[i] = new_base + ((prs->atts[i] - shift) - old_base);
			}
		}

		if (new_base != old_base) {
			iks_free (old_base);
			prs->buffer = new_base;
			prs->bufmax = total;
		}
		if (prs->keep) prs->keepstart = 0;
		prs->pos = size;
		buf = new_base + size;
	} else {
		size_t need = len;

		if (need < 256) need = 256;
		buf = iks_malloc (need);
		if (!buf) return IKS_NOMEM;
		memcpy (buf, data, len);
		prs->buffer = buf;
		prs->bufmax = need;
	}
	prs->pos += len;
	return sax_core (prs, buf, len);
}

/* Returns the parser to its initial state so that a new document can be
** parsed. The delete hook, if any, is called first. The input buffer and
** the attribute array stay allocated, and the hooks and user pointer are
** left as they are.
*/
void
iks_parser_reset (iksparser *prs)
{
	if (prs->deleteHook) {
		prs->deleteHook (prs->user_data);
	}

	/* buffer bookkeeping */
	prs->pos = 0;
	prs->keepstart = 0;
	prs->keep = 0;

	/* state machine */
	prs->oldcontext = 0;
	prs->context = 0;
	prs->tagtype = 0;

	/* attribute and entity collection */
	prs->valflag = 0;
	prs->attflag = 0;
	prs->attcur = 0;
	prs->entpos = 0;

	/* statistics */
	prs->nr_lines = 0;
	prs->nr_bytes = 0;

	/* UTF-8 validation */
	prs->uni_len = 0;
	prs->uni_max = 0;
}

/* Destroys the parser: calls the delete hook, if any, and frees the input
** buffer and the attribute array. A parser that has a stack (see
** iks_sax_extend) is released by deleting that stack; otherwise the parser
** structure itself is freed.
*/
void
iks_parser_delete (iksparser *prs)
{
	iksDeleteHook *hook = prs->deleteHook;

	if (hook) {
		hook (prs->user_data);
	}
	if (prs->buffer) {
		iks_free (prs->buffer);
	}
	if (prs->atts) {
		iks_free (prs->atts);
	}
	if (prs->s) {
		iks_stack_delete (prs->s);
		return;
	}
	iks_free (prs);
}
