/*
 * KHeiseReg
 *
 * A utility to search for articles within the Heise register.
 *
 * Copyright (C) 2002 Oliver Gantz <Oliver.Gantz@epost.de>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include <stdlib.h>
#include <qcstring.h>

#include "regfile.h"
#include "global.h"



/*
 * Case-insensitive comparison of two C strings encoded in the IBM code page
 * used by the register file, looking at no more than `len` bytes.
 *
 * Both bytes are folded through a lower-casing table before they are
 * compared, so ASCII letters as well as the accented capitals of the code
 * page (e.g. 0x8e/0x84, 0x99/0x94, 0x9a/0x81) compare equal to their
 * lower-case forms. Bytes 0x14 and 0x15 fold to a blank (0x20).
 *
 * Returns 0 when the strings are equal within `len` bytes or up to a common
 * terminator, otherwise the difference of the first pair of folded bytes
 * that differ. A null pointer sorts before any non-null string; two null
 * pointers compare equal.
 */
static int ibm_strnicmp(const char *str1, const char *str2, unsigned int len)
{
	static const unsigned char ibm_tolower_tab[256] = {
		0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
		0x10,0x11,0x12,0x13,0x20,0x20,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f,
		0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,
		0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f,
		0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
		0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x5b,0x5c,0x5d,0x5e,0x5f,
		0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
		0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x7b,0x7c,0x7d,0x7e,0x7f,
		0x87,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x84,0x86,
		0x82,0x91,0x91,0x93,0x94,0x95,0x96,0x97,0x98,0x94,0x81,0x9b,0x9c,0x9d,0x9e,0x9f,
		0xa0,0xa1,0xa2,0xa3,0xa4,0xa4,0xa6,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,
		0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,
		0xc0,0xc1,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xcb,0xcc,0xcd,0xce,0xcf,
		0xd0,0xd1,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xdb,0xdc,0xdd,0xde,0xdf,
		0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef,
		0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xff
	};

	if (!str1 || !str2)
		return str1 ? 1 : (str2 ? -1 : 0);

	// Work on unsigned bytes so values >= 0x80 index the table correctly.
	const unsigned char *s1 = reinterpret_cast<const unsigned char *>(str1);
	const unsigned char *s2 = reinterpret_cast<const unsigned char *>(str2);

	for ( ; len > 0; --len, ++s1, ++s2) {
		const unsigned char c1 = ibm_tolower_tab[*s1];
		const unsigned char c2 = ibm_tolower_tab[*s2];

		if (c1 != c2)
			return static_cast<int>(c1) - static_cast<int>(c2);

		// Both strings ended at the same position: they are equal.
		if (c1 == 0)
			break;
	}

	return 0;
}


static int ibm_contains(const QCString &s, const char *str, bool cs)
{
	int count = 0;
	char *d = s.data();

	if (!d)
		return 0;

	uint len = qstrlen(str);

	while (*d) {
		if (cs) {
			if (qstrncmp(d, str, len) == 0)
				count++;
		} else {
			if (ibm_strnicmp(d, str, len) == 0)
				count++;
		}
		d++;
	}

	return count;
}


/*
 * Converts a zero-terminated string in the register file's IBM code page
 * into a QString.
 *
 * Every input byte is looked up in a 256-entry table of Unicode code points.
 * Bytes the table has no dedicated character for decode to a blank (U+0020);
 * byte 0xd5 decodes to the euro sign (U+20AC).
 */
static QString fromIBM(const char *ibm)
{
	static const ushort ibm_tab[256] = {
		0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
		0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
		0x0010, 0x0011, 0x0012, 0x0013, 0x00b6, 0x00a7, 0x0016, 0x0017,
		0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f,
		0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
		0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
		0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
		0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
		0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
		0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
		0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
		0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
		0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
		0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
		0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
		0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f,
		0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7,
		0x00ea, 0x00eb, 0x00e8, 0x00ef, 0x00ee, 0x00ec, 0x00c4, 0x00c5,
		0x00c9, 0x00e6, 0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9,
		0x00ff, 0x00d6, 0x00dc, 0x00a2, 0x00a3, 0x00a5, 0x0020, 0x0020,
		0x00e1, 0x00ed, 0x00f3, 0x00fa, 0x00f1, 0x00d1, 0x00aa, 0x00ba,
		0x00bf, 0x0020, 0x00ac, 0x00bd, 0x00bc, 0x00a1, 0x00ab, 0x00bb,
		0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020,
		0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020,
		0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020,
		0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020,
		0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x20ac, 0x0020, 0x0020,
		0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020,
		0x0020, 0x00df, 0x0020, 0x0020, 0x0020, 0x0020, 0x00b5, 0x0020,
		0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020,
		0x0020, 0x00b1, 0x0020, 0x0020, 0x0020, 0x0020, 0x00f7, 0x0020,
		0x00b0, 0x0020, 0x00b7, 0x0020, 0x0020, 0x00b2, 0x0020, 0x0020
	};

	const uint count = qstrlen(ibm);

	// Pre-size the result so each position can be assigned directly.
	QString result;
	result.fill(' ', count);

	for (uint pos = 0; pos != count; ++pos) {
		const unsigned char byte = (unsigned char)ibm[pos];
		result[pos] = QChar(ibm_tab[byte]);
	}

	return result;
}


/*
 * Converts a QString into the IBM code page used by the register file.
 *
 * Code points below U+0080 are copied unchanged. Other code points are
 * looked up in a table that is the inverse of the one used by fromIBM();
 * code points without a table entry are encoded as a blank (byte 32).
 * The result has exactly one byte per input character.
 */
static QCString toIBM(const QString &text)
{
	struct IbmMapEntry {
		ushort unicode;
		uchar ibm;
	};
	// Sorted by code point; terminated by the { 0, ' ' } sentinel, whose
	// byte value doubles as the replacement for unmapped characters.
	static const IbmMapEntry unicode_tab[] = {
		{ 0x00a1, 173 }, /* inverted exclamation mark */
		{ 0x00a2, 155 }, /* cent sign */
		{ 0x00a3, 156 }, /* pound sign */
		{ 0x00a5, 157 }, /* yen sign */
		{ 0x00a7,  21 }, /* section sign */
		{ 0x00aa, 166 }, /* feminine ordinal indicator */
		{ 0x00ab, 174 }, /* left-pointing double angle quotation mark */
		{ 0x00ac, 170 }, /* not sign */
		{ 0x00b0, 248 }, /* degree sign */
		{ 0x00b1, 241 }, /* plus-minus sign */
		{ 0x00b2, 253 }, /* superscript two */
		{ 0x00b5, 230 }, /* micro sign */
		{ 0x00b6,  20 }, /* pilcrow sign */
		{ 0x00b7, 250 }, /* middle dot */
		{ 0x00ba, 167 }, /* masculine ordinal indicator */
		{ 0x00bb, 175 }, /* right-pointing double angle quotation mark */
		{ 0x00bc, 172 }, /* '1/4' */
		{ 0x00bd, 171 }, /* '1/2' */
		{ 0x00bf, 168 }, /* inverted question mark */
		{ 0x00c4, 142 }, /* capital A with diaeresis */
		{ 0x00c5, 143 }, /* capital A with ring above */
		{ 0x00c6, 146 }, /* capital AE */
		{ 0x00c7, 128 }, /* capital C with cedilla */
		{ 0x00c9, 144 }, /* capital E with acute */
		{ 0x00d1, 165 }, /* capital N with tilde */
		{ 0x00d6, 153 }, /* capital O with diaeresis */
		{ 0x00dc, 154 }, /* capital U with diaeresis */
		{ 0x00df, 225 }, /* sharp s */
		{ 0x00e0, 133 }, /* a with grave */
		{ 0x00e1, 160 }, /* a with acute */
		{ 0x00e2, 131 }, /* a with circumflex */
		{ 0x00e4, 132 }, /* a with diaeresis */
		{ 0x00e5, 134 }, /* a with ring above */
		{ 0x00e6, 145 }, /* ae */
		{ 0x00e7, 135 }, /* c with cedilla */
		{ 0x00e8, 138 }, /* e with grave */
		{ 0x00e9, 130 }, /* e with acute */
		{ 0x00ea, 136 }, /* e with circumflex */
		{ 0x00eb, 137 }, /* e with diaeresis */
		{ 0x00ec, 141 }, /* i with grave */
		{ 0x00ed, 161 }, /* i with acute */
		{ 0x00ee, 140 }, /* i with circumflex */
		{ 0x00ef, 139 }, /* i with diaeresis */
		{ 0x00f1, 164 }, /* n with tilde */
		{ 0x00f2, 149 }, /* o with grave */
		{ 0x00f3, 162 }, /* o with acute */
		{ 0x00f4, 147 }, /* o with circumflex */
		{ 0x00f6, 148 }, /* o with diaeresis */
		{ 0x00f7, 246 }, /* division sign */
		{ 0x00f9, 151 }, /* u with grave */
		{ 0x00fa, 163 }, /* u with acute */
		{ 0x00fb, 150 }, /* u with circumflex */
		{ 0x00fc, 129 }, /* u with diaeresis */
		{ 0x00ff, 152 }, /* y with diaeresis */
		{ 0x20ac, 213 }, /* euro sign */
		{ 0x0000,  32 }  /* sentinel / fallback: blank */
	};

	const uint len = text.length();
	QCString s;
	s.fill(' ', len);

	for (uint i = 0; i < len; i++) {
		const ushort u = text[i].unicode();

		if (u < 0x0080) {
			s[i] = (char)u;
			continue;
		}

		// Linear search; stops on the sentinel when nothing matches.
		const IbmMapEntry *entry = unicode_tab;
		while (entry->unicode && entry->unicode != u)
			++entry;
		s[i] = (char)entry->ibm;
	}

	return s;
}



/*
 * Constructs a keyword search list. The body performs no explicit
 * initialisation; keywords are installed later via setKeyWords().
 */
RegSearchList::RegSearchList()
{
}


void RegSearchList::setKeyWords(const QString &text)
{
	uint l, s, e;
	bool inc, exc;
	QCString input;

	m_include.clear();
	m_require.clear();
	m_exclude.clear();

	input = toIBM(text);
	l = input.length();

	s = 0;
	while (s < l) {
		inc = exc = false;
		while (input.at(s) == ' ')
			s++;
		if (s == l)
			break;
		if (input.at(s) == '+') {
			s++;
			inc = true;
		} else if (input.at(s) == '-') {
			s++;
			exc = true;
		}
		e = s;
		if (input.at(s) == '"') {
			s++;
			e++;
			while (e < l && input.at(e) != '"')
				e++;
		} else {
			while (e < l && input.at(e) != ' ')
				e++;
		}

		if (inc)
			m_require.append(input.mid(s, e - s));
		else if (exc)
			m_exclude.append(input.mid(s, e - s));
		else
			m_include.append(input.mid(s, e - s));

		s = e + 1;
	}
}


bool RegSearchList::matches(const QCString &text, bool cs) const
{
	RegCStrList::ConstIterator it;

	for (it = m_exclude.begin(); it != m_exclude.end(); ++it)
		if (ibm_contains(text, *it, cs))
			return false;

	for (it = m_require.begin(); it != m_require.end(); ++it)
		if (!ibm_contains(text, *it, cs))
			return false;

	if (m_include.isEmpty())
		return true;

	for (it = m_include.begin(); it != m_include.end(); ++it)
		if (ibm_contains(text, *it, cs))
			return true;

	return false;
}


bool RegSearchList::matchesFuzzy(const QCString &text, bool cs, int threshold) const
{
	RegCStrList::ConstIterator it;

	for (it = m_exclude.begin(); it != m_exclude.end(); ++it)
		if (stringContainsFuzzy(text, *it, cs, threshold))
			return false;

	for (it = m_require.begin(); it != m_require.end(); ++it)
		if (!stringContainsFuzzy(text, *it, cs, threshold))
			return false;

	if (m_include.isEmpty())
		return true;

	for (it = m_include.begin(); it != m_include.end(); ++it)
		if (stringContainsFuzzy(text, *it, cs, threshold))
			return true;

	return false;
}


bool RegSearchList::stringContainsFuzzy(const QCString &str, const QCString &substr, bool cs, int threshold) const
{
	char ngram[4];
	int ngramcount = substr.length() - 2;
	int i, count = 0;

	ngram[3] = '\0';

	for (i = 0; i < ngramcount; i++) {
		qstrncpy(ngram, &substr[i], 4);
		
		if (ibm_contains(str, ngram, cs))
			count++;
	}

	return (100 * count / ngramcount) >= threshold;
}




/*
 * Constructs a number search list. The body performs no explicit
 * initialisation; numbers are installed later via setNumbers().
 */
RegSearchNumList::RegSearchNumList()
{
}


void RegSearchNumList::setNumbers(const QString &text, bool year)
{
	int l, s, e, num;
	QCString input(text.latin1());
	bool ok;

	m_include.clear();

	input = input.simplifyWhiteSpace();
	l = input.length();

	s = 0;
	while (s < l) {
		if ((e = input.find(' ', s)) == -1)
			e = l;
		num = input.mid(s, e - s).toInt(&ok);
		if (ok) {
			if (year && num < 100)
				num += (num < 81) ? 2000 : 1900;
			m_include.append(num);
		}
		s = e + 1;
	}
}


bool RegSearchNumList::matches(int num) const
{
	RegNumList::ConstIterator it;

	if (m_include.isEmpty())
		return true;

	for (it = m_include.begin(); it != m_include.end(); ++it)
		if (num == *it)
			return true;

	return false;
}




/*
 * Constructs a register entry. The body is empty, so the line buffers are
 * not explicitly initialised here; RegFile::readEntry() fills them through
 * line().
 */
RegEntry::RegEntry()
{
}


/*
 * Returns line `num` of the entry as a QString: a trailing LF and a CR in
 * front of it are stripped, then the text is converted from the IBM code
 * page.
 *
 * `num` is used as an index without a range check; callers must pass a
 * valid line number.
 */
QString RegEntry::lineStr(int num) const
{
	char buff[REG_LINE_SIZE];

	// Bounded copy: never write past the local buffer, even if the stored
	// line is not properly terminated.
	buff[0] = '\0';
	qstrncpy(buff, m_lines[num], REG_LINE_SIZE);
	uint l = qstrlen(buff);

	if ((l) && buff[l-1] == '\x0a')
		l--;
	if ((l) && buff[l-1] == '\x0d')
		l--;
	buff[l] = 0;

	return fromIBM(buff);
}


/*
 * Returns the page number, i.e. the page line parsed with atoi().
 */
int RegEntry::page() const
{
	const char *pageField = m_lines[REG_LINE_PAGE];

	return atoi(pageField);
}


/*
 * Returns the edition number, i.e. the edition line parsed with atoi().
 */
int RegEntry::edition() const
{
	const char *editionField = m_lines[REG_LINE_EDITION];

	return atoi(editionField);
}


/*
 * Returns the magazine the entry belongs to, derived from the first
 * character of the magazine/year line: 'c' gives REG_MAGAZINE_CT, 'i' gives
 * REG_MAGAZINE_IX, anything else REG_MAGAZINE_NONE.
 */
unsigned char RegEntry::magazine() const
{
	unsigned char mag = REG_MAGAZINE_NONE;
	const char id = m_lines[REG_LINE_MAGYEAR][0];

	if (id == 'c')
		mag = REG_MAGAZINE_CT;
	else if (id == 'i')
		mag = REG_MAGAZINE_IX;

	return mag;
}


/*
 * Returns the four-digit year of the entry. The number following the
 * magazine id on the magazine/year line is read with atoi(); values below
 * 81 are placed in the 2000s, all others in the 1900s.
 */
int RegEntry::year() const
{
	// Skip the magazine id in column 0.
	const int shortYear = atoi(&m_lines[REG_LINE_MAGYEAR][1]);

	if (shortYear < 81)
		return shortYear + 2000;

	return shortYear + 1900;
}


int RegEntry::verify() const
{
	int i, len;

	// - Do all lines end with CR LF?
	for (i = 0; i < REG_LINE_COUNT; i++) {
		len = qstrlen(m_lines[i]);
		if ((len < 2) || (m_lines[i][len-2] != 0x0d) || (m_lines[i][len-1] != 0x0a))
			return i + 1;
	}

	// - Does line 5 (page number) only contain blanks and digits?
	if (qstrlen(m_lines[4]) != 5)
		return REG_LINE_PAGE + 1;
	for (i = 0; i < 3; i++)
		if ((m_lines[REG_LINE_PAGE][i] != ' ') && ((m_lines[REG_LINE_PAGE][i] < '0') || (m_lines[REG_LINE_PAGE][i] > '9')))
			return REG_LINE_PAGE + 1;

	// - Does line 6 (edition) only contain blanks and digits?
	if (qstrlen(m_lines[REG_LINE_EDITION]) != 4)
		return REG_LINE_EDITION + 1;
	for (i = 0; i < 2; i++)
		if ((m_lines[REG_LINE_EDITION][i] != ' ') && ((m_lines[REG_LINE_EDITION][i] < '0') || (m_lines[REG_LINE_EDITION][i] > '9')))
			return REG_LINE_EDITION + 1;

	// - Does line 7 (magazine/year) only contain the magazine's id and digits?
	if (qstrlen(m_lines[REG_LINE_MAGYEAR]) != 5)
		return REG_LINE_MAGYEAR + 1;
	if ((m_lines[REG_LINE_MAGYEAR][0] != 'c') && (m_lines[REG_LINE_MAGYEAR][0] != 'i') && (m_lines[REG_LINE_MAGYEAR][0] != 'g'))
		return REG_LINE_MAGYEAR + 1;
	if ((m_lines[REG_LINE_MAGYEAR][1] < '0') || (m_lines[REG_LINE_MAGYEAR][1] > '9') || (m_lines[REG_LINE_MAGYEAR][2] < '0') || (m_lines[REG_LINE_MAGYEAR][2] > '9'))
		return REG_LINE_MAGYEAR + 1;

	return 0;
}



/*
 * Constructs a search mask with no magazine selected, case-insensitive
 * exact matching and a fuzzy threshold of 100.
 */
RegMask::RegMask()
{
	// Matching mode: exact, case-insensitive.
	m_fuzzy = false;
	m_cs = false;
	m_threshold = 100;

	// No magazine selected yet.
	m_magazines = REG_MAGAZINE_NONE;
}


/*
 * Sets the keywords that matches() looks for in an entry's bywords, title
 * and subtitle; `text` is parsed by RegSearchList::setKeyWords().
 */
void RegMask::setBywords(const QString &text)
{
	m_bywords.setKeyWords(text);
}


/*
 * Sets the keywords that matches() looks for in an entry's author field;
 * `text` is parsed by RegSearchList::setKeyWords().
 */
void RegMask::setAuthors(const QString &text)
{
	m_authors.setKeyWords(text);
}


/*
 * Sets the keywords that matches() looks for in an entry's editor field;
 * `text` is parsed by RegSearchList::setKeyWords().
 */
void RegMask::setEditors(const QString &text)
{
	m_editors.setKeyWords(text);
}


/*
 * Sets the edition numbers an entry may have; `text` is parsed by
 * RegSearchNumList::setNumbers() without year expansion.
 */
void RegMask::setEditions(const QString &text)
{
	m_editions.setNumbers(text, false);
}


/*
 * Sets the years an entry may have; `text` is parsed by
 * RegSearchNumList::setNumbers() with two-digit year expansion enabled.
 */
void RegMask::setYears(const QString &text)
{
	m_years.setNumbers(text, true);
}


/*
 * Returns true when `entry` passes every filter of the mask:
 *  - its magazine is among the selected magazines,
 *  - its edition and year are accepted by the number lists,
 *  - the bywords keywords match the bywords, the title or the subtitle,
 *  - the author and editor keywords match the respective fields.
 * Keyword tests are fuzzy or exact depending on m_fuzzy.
 */
bool RegMask::matches(const RegEntry &entry) const
{
	// Numeric filters first.
	if ((m_magazines & entry.magazine()) == 0)
		return false;
	if (!m_editions.matches(entry.edition()))
		return false;
	if (!m_years.matches(entry.year()))
		return false;

	if (m_fuzzy) {
		const bool textHit = m_bywords.matchesFuzzy(entry.bywords(), m_cs, m_threshold)
			|| m_bywords.matchesFuzzy(entry.title(), m_cs, m_threshold)
			|| m_bywords.matchesFuzzy(entry.subTitle(), m_cs, m_threshold);

		return textHit
			&& m_authors.matchesFuzzy(entry.author(), m_cs, m_threshold)
			&& m_editors.matchesFuzzy(entry.editor(), m_cs, m_threshold);
	}

	const bool textHit = m_bywords.matches(entry.bywords(), m_cs)
		|| m_bywords.matches(entry.title(), m_cs)
		|| m_bywords.matches(entry.subTitle(), m_cs);

	return textHit
		&& m_authors.matches(entry.author(), m_cs)
		&& m_editors.matches(entry.editor(), m_cs);
}



/*
 * Constructs a register file object without a file name and resets the
 * statistics: article counts and last editions are 0, first editions hold
 * the placeholder 999999 and the error position is 0.
 */
RegFile::RegFile(): QFile()
{
	// 999999 marks "no edition seen yet" for the minimum search.
	ct_first_ed = 999999;
	ix_first_ed = 999999;

	ct_last_ed = 0;
	ix_last_ed = 0;

	ct_articles = 0;
	ix_articles = 0;

	error_pos = 0;
}


/*
 * Constructs a register file object for the file `name` and resets the
 * statistics: article counts and last editions are 0, first editions hold
 * the placeholder 999999 and the error position is 0.
 */
RegFile::RegFile(const QString &name): QFile(name)
{
	// 999999 marks "no edition seen yet" for the minimum search.
	ct_first_ed = 999999;
	ix_first_ed = 999999;

	ct_last_ed = 0;
	ix_last_ed = 0;

	ct_articles = 0;
	ix_articles = 0;

	error_pos = 0;
}


/*
 * Destructor. The body is empty; no cleanup is performed here.
 */
RegFile::~RegFile()
{
}


/*
 * Reads the REG_LINE_COUNT lines of one entry from the current file
 * position into `entry`.
 *
 * Returns true on success. When readLine() reports -1, the 1-based number
 * of the failing line within the entry is added to error_pos and false is
 * returned.
 */
bool RegFile::readEntry(RegEntry *entry)
{
	int lineNo = 0;

	while (lineNo < REG_LINE_COUNT) {
		const bool failed = (readLine(entry->line(lineNo), REG_LINE_SIZE) == -1);

		// From here on lineNo is the 1-based number of the line just read.
		++lineNo;

		if (failed) {
			error_pos += lineNo;
			return false;
		}
	}

	return true;
}


bool RegFile::scanEntries()
{
	RegEntry entry;
	int edition, line;

	ct_first_ed = 999999;
	ct_last_ed = 0;
	ct_articles = 0;
	ct_editions.clear();
	ix_first_ed = 999999;
	ix_last_ed = 0;
	ix_articles = 0;
	ix_editions.clear();
	error_pos = 0;

	at(0);

	while (!atEnd()) {
		if (!readEntry(&entry))
			return false;
		line = entry.verify();
		if (line) {
			error_pos += line;
			return false;
		}
		error_pos += REG_LINE_COUNT;

		edition = entry.year() * 100 + entry.edition();
		if (entry.magazine() == REG_MAGAZINE_CT) {
			if (ct_first_ed > edition)
				ct_first_ed = edition;
			if (ct_last_ed < edition)
				ct_last_ed = edition;
			ct_articles++;
			if (!ct_editions.contains(edition))
				ct_editions.append(edition);
		} else if (entry.magazine() == REG_MAGAZINE_IX) {
			if (ix_first_ed > edition)
				ix_first_ed = edition;
			if (ix_last_ed < edition)
				ix_last_ed = edition;
			ix_articles++;
			if (!ix_editions.contains(edition))
				ix_editions.append(edition);
		}
	}

	at(0);
	error_pos = 0;

	return true;
}


/*
 * Returns the first edition recorded for magazine `mag`, or 0 when the
 * magazine is unknown or its value is still the placeholder 999999.
 */
int RegFile::firstEdition(unsigned char mag) const
{
	int first;

	if (mag == REG_MAGAZINE_CT)
		first = ct_first_ed;
	else if (mag == REG_MAGAZINE_IX)
		first = ix_first_ed;
	else
		return 0;

	// 999999 is the "nothing seen" placeholder.
	if (first == 999999)
		return 0;

	return first;
}


bool RegFile::containsEditions(unsigned char mag, const RegNumList &editions) const
{
	RegNumList::ConstIterator it;

	if (mag == REG_MAGAZINE_CT) {
		for (it = ct_editions.begin(); it != ct_editions.end(); ++it)
			if (editions.contains(*it))
				return true;
	}

	if (mag == REG_MAGAZINE_IX) {
		for (it = ix_editions.begin(); it != ix_editions.end(); ++it)
			if (editions.contains(*it))
				return true;
	}

	return false;
}


/*
 * Returns the number of distinct editions recorded for magazine `mag`,
 * or 0 for an unknown magazine.
 */
int RegFile::editionCount(unsigned char mag) const
{
	int total = 0;

	if (mag == REG_MAGAZINE_CT)
		total = ct_editions.size();
	else if (mag == REG_MAGAZINE_IX)
		total = ix_editions.size();

	return total;
}


/*
 * Returns the last edition recorded for magazine `mag`, or 0 for an
 * unknown magazine.
 */
int RegFile::lastEdition(unsigned char mag) const
{
	int last = 0;

	if (mag == REG_MAGAZINE_CT)
		last = ct_last_ed;
	else if (mag == REG_MAGAZINE_IX)
		last = ix_last_ed;

	return last;
}


/*
 * Returns the number of articles counted for magazine `mag`, or 0 for an
 * unknown magazine.
 */
int RegFile::articles(unsigned char mag) const
{
	int total = 0;

	if (mag == REG_MAGAZINE_CT)
		total = ct_articles;
	else if (mag == REG_MAGAZINE_IX)
		total = ix_articles;

	return total;
}


/*
 * Returns a copy of the list of distinct editions recorded for magazine
 * `mag`; the list is empty for an unknown magazine.
 */
RegNumList RegFile::editions(unsigned char mag) const
{
	const bool isCt = (mag == REG_MAGAZINE_CT);
	const bool isIx = (mag == REG_MAGAZINE_IX);

	if (!isCt && !isIx)
		return RegNumList();

	if (isCt)
		return ct_editions;

	return ix_editions;
}
