#|  Logiweb, a system for electronic distribution of mathematics
    Copyright (C) 2004-2010 Klaus Grue

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

    Contact: Klaus Grue, DIKU, Universitetsparken 1, DK2100 Copenhagen,
    Denmark, grue@diku.dk, http://logiweb.eu/, http://www.diku.dk/~grue/

    Logiweb is a system for distribution of mathematical definitions,
    lemmas, and proofs. For more on Logiweb, consult http://logiweb.eu/.
|#

#|
=============================================
The Logiweb Compiler
=============================================
Front end
=============================================
|#

(in-package "COMMON-LISP-USER")

#|
=============================================
Supported source file encodings
=============================================
Internally, Logiweb uses 'Logiweb-UTF-8' encoding.

Logiweb-UTF-8 is identical to UTF-8 except:
(1) Code 10 is used as line separator.
(2) Codes 0-9 and 11-31 are illegal.

When used on operating systems which use other line separators than code 10 (e.g. a crlf = code 13 followed by code 10), the frontend of the Logiweb compiler must convert the newline sequence of the underlying operating system to code 10 before processing. That can be achieved by setting options for the Logiweb compiler properly, e.g. in the site configuration file (cf. the 'filter' option in 'man lgc').

The Logiweb compiler supports the following source file encodings:

#BINARY. Binary data encoded BASE64 using the following characters:
  ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_
#BS (Binary string). The same as BINARY, but the data are prefixed by
the escape sequence ""/ and suffixed by " such that they become a
binary string.

In addition, lgc supports the following source file encodings mentioned at
http://clisp.cons.org/impnotes.html.

#UCS-2 / UNICODE-16 / UNICODE-16-BIG-ENDIAN, the 16-bit basic multilingual plane of the UNICODE character set. Every character is represented as two bytes.
#UNICODE-16-LITTLE-ENDIAN
#UCS-4 / UNICODE-32 / UNICODE-32-BIG-ENDIAN, the 21-bit UNICODE character set. Every character is represented as four bytes. This encoding is used by CLISP internally.
#UNICODE-32-LITTLE-ENDIAN
#UTF-8, the 21-bit UNICODE character set. Every character is represented as one to four bytes. ASCII characters represent themselves and need one byte per character. Most Latin/Greek/Cyrillic/Hebrew characters need two bytes per character. Most other characters need three bytes per character, and the rarely used remaining characters need four bytes per character. This is therefore, in general, the most space-efficient encoding of all of Unicode.
#UTF-16, the 21-bit UNICODE character set. Every character in the 16-bit basic multilingual plane is represented as two bytes, and the rarely used remaining characters need four bytes per character. This character set is only available on platforms with GNU libc or GNU libiconv.
#UTF-7, the 21-bit UNICODE character set. This is a stateful 7-bit encoding. Not all ASCII characters represent themselves. This character set is only available on platforms with GNU libc or GNU libiconv.
#JAVA, the 21-bit UNICODE character set. ASCII characters represent themselves and need one byte per character. All other characters of the basic multilingual plane are represented by \unnnn sequences (nnnn a hexadecimal number) and need 6 bytes per character. The remaining characters are represented by \uxxxx\uyyyy and need 12 bytes per character. While this encoding is very comfortable for editing Unicode files using only ASCII-aware tools and editors, it cannot faithfully represent all UNICODE text. Only text which does not contain \u (backslash followed by lowercase Latin u) can be faithfully represented by this encoding.
#ASCII, the well-known US-centric 7-bit character set (American Standard Code for Information Interchange - ASCII).
#ISO-8859-1, an extension of the ASCII character set, suitable for the Afrikaans, Albanian, Basque, Breton, Catalan, Cornish, Danish, Dutch, English, Faeroese, Finnish, French, Frisian, Galician, German, Greenlandic, Icelandic, Irish, Italian, Latin, Luxemburgish, Norwegian, Portuguese, Raeto-Romanic, Scottish, Spanish, and Swedish languages.
#ISO-8859-2, an extension of the ASCII character set, suitable for the Croatian, Czech, German, Hungarian, Polish, Slovak, Slovenian, and Sorbian languages.
#ISO-8859-3, an extension of the ASCII character set, suitable for the Esperanto and Maltese languages.
#ISO-8859-4, an extension of the ASCII character set, suitable for the Estonian, Latvian, Lithuanian and Sami (Lappish) languages.
#ISO-8859-5, an extension of the ASCII character set, suitable for the Bulgarian, Byelorussian, Macedonian, Russian, Serbian, and Ukrainian languages.
#ISO-8859-6, suitable for the Arabic language.
#ISO-8859-7, an extension of the ASCII character set, suitable for the Greek language.
#ISO-8859-8, an extension of the ASCII character set, suitable for the Hebrew language (without punctuation).
#ISO-8859-9, an extension of the ASCII character set, suitable for the Turkish language.
#ISO-8859-10, an extension of the ASCII character set, suitable for the Estonian, Icelandic, Inuit (Greenlandic), Latvian, Lithuanian, and Sami (Lappish) languages.
#ISO-8859-13, an extension of the ASCII character set, suitable for the Estonian, Latvian, Lithuanian, Polish and Sami (Lappish) languages.
#ISO-8859-14, an extension of the ASCII character set, suitable for the Irish Gaelic, Manx Gaelic, Scottish Gaelic, and Welsh languages.
#ISO-8859-15, an extension of the ASCII character set, suitable for the ISO-8859-1 languages, with improvements for French, Finnish and the Euro.
#ISO-8859-16, an extension of the ASCII character set, suitable for the Rumanian language.
#KOI8-R, an extension of the ASCII character set, suitable for the Russian language (very popular, especially on the internet).
#KOI8-U, an extension of the ASCII character set, suitable for the Ukrainian language (very popular, especially on the internet).
#KOI8-RU, an extension of the ASCII character set, suitable for the Russian language. This character set is only available on platforms with GNU libiconv.
#JIS_X0201, a character set for the Japanese language.
#MAC-ARABIC, a platform specific extension of the ASCII character set.
#MAC-CENTRAL-EUROPE, a platform specific extension of the ASCII character set.
#MAC-CROATIAN, a platform specific extension of the ASCII character set.
#MAC-CYRILLIC, a platform specific extension of the ASCII character set.
#MAC-DINGBAT, a platform specific character set.
#MAC-GREEK, a platform specific extension of the ASCII character set.
#MAC-HEBREW, a platform specific extension of the ASCII character set.
#MAC-ICELAND, a platform specific extension of the ASCII character set.
#MAC-ROMAN / MACINTOSH, a platform specific extension of the ASCII character set.
#MAC-ROMANIA, a platform specific extension of the ASCII character set.
#MAC-SYMBOL, a platform specific character set.
#MAC-THAI, a platform specific extension of the ASCII character set.
#MAC-TURKISH, a platform specific extension of the ASCII character set.
#MAC-UKRAINE, a platform specific extension of the ASCII character set.
#CP437, a DOS oldie, a platform specific extension of the ASCII character set.
#CP437-IBM, an IBM variant of CP437.
#CP737, a DOS oldie, a platform specific extension of the ASCII character set, meant to be suitable for the Greek language.
#CP775, a DOS oldie, a platform specific extension of the ASCII character set, meant to be suitable for some Baltic languages.
#CP850, a DOS oldie, a platform specific extension of the ASCII character set.
#CP852, a DOS oldie, a platform specific extension of the ASCII character set.
#CP852-IBM, an IBM variant of CP852.
#CP855, a DOS oldie, a platform specific extension of the ASCII character set, meant to be suitable for the Russian language.
#CP857, a DOS oldie, a platform specific extension of the ASCII character set, meant to be suitable for the Turkish language.
#CP860, a DOS oldie, a platform specific extension of the ASCII character set, meant to be suitable for the Portuguese language.
#CP860-IBM, an IBM variant of CP860.
#CP861, a DOS oldie, a platform specific extension of the ASCII character set, meant to be suitable for the Icelandic language.
#CP861-IBM, an IBM variant of CP861.
#CP862, a DOS oldie, a platform specific extension of the ASCII character set, meant to be suitable for the Hebrew language.
#CP862-IBM, an IBM variant of CP862.
#CP863, a DOS oldie, a platform specific extension of the ASCII character set.
#CP863-IBM, an IBM variant of CP863.
#CP864, a DOS oldie, meant to be suitable for the Arabic language.
#CP864-IBM, an IBM variant of CP864.
#CP865, a DOS oldie, a platform specific extension of the ASCII character set, meant to be suitable for some Nordic languages.
#CP865-IBM, an IBM variant of CP865.
#CP866, a DOS oldie, a platform specific extension of the ASCII character set, meant to be suitable for the Russian language.
#CP869, a DOS oldie, a platform specific extension of the ASCII character set, meant to be suitable for the Greek language.
#CP869-IBM, an IBM variant of CP869.
#CP874, a DOS oldie, a platform specific extension of the ASCII character set, meant to be suitable for the Thai language.
#CP874-IBM, an IBM variant of CP874.
#WINDOWS-1250 / CP1250, a platform specific extension of the ASCII character set, heavily incompatible with ISO-8859-2.
#WINDOWS-1251 / CP1251, a platform specific extension of the ASCII character set, heavily incompatible with ISO-8859-5, meant to be suitable for the Russian language.
#WINDOWS-1252 / CP1252, a platform specific extension of the ISO-8859-1 character set.
#WINDOWS-1253 / CP1253, a platform specific extension of the ASCII character set, gratuitously incompatible with ISO-8859-7, meant to be suitable for the Greek language.
#WINDOWS-1254 / CP1254, a platform specific extension of the ISO-8859-9 character set.
#WINDOWS-1255 / CP1255, a platform specific extension of the ASCII character set, gratuitously incompatible with ISO-8859-8, suitable for the Hebrew language. This character set is only available on platforms with GNU libc or GNU libiconv.
#WINDOWS-1256 / CP1256, a platform specific extension of the ASCII character set, meant to be suitable for the Arabic language.
#WINDOWS-1257 / CP1257, a platform specific extension of the ASCII character set.
#WINDOWS-1258 / CP1258, a platform specific extension of the ASCII character set, meant to be suitable for the Vietnamese language. This character set is only available on platforms with GNU libc or GNU libiconv.
#HP-ROMAN8, a platform specific extension of the ASCII character set.
#NEXTSTEP, a platform specific extension of the ASCII character set.
#EUC-JP, a multibyte character set for the Japanese language. This character set is only available on platforms with GNU libc or GNU libiconv.
#SHIFT-JIS, a multibyte character set for the Japanese language. This character set is only available on platforms with GNU libc or GNU libiconv.
#CP932, a Microsoft variant of SHIFT-JIS. This character set is only available on platforms with GNU libc or GNU libiconv.
#ISO-2022-JP, a stateful 7-bit multibyte character set for the Japanese language. This character set is only available on platforms with GNU libc or GNU libiconv.
#ISO-2022-JP-2, a stateful 7-bit multibyte character set for the Japanese language. This character set is only available on platforms with GNU libc 2.3 or newer or GNU libiconv.
#ISO-2022-JP-1, a stateful 7-bit multibyte character set for the Japanese language. This character set is only available on platforms with GNU libiconv.
#EUC-CN, a multibyte character set for simplified Chinese. This character set is only available on platforms with GNU libc or GNU libiconv.
#HZ, a stateful 7-bit multibyte character set for simplified Chinese. This character set is only available on platforms with GNU libiconv.
#GBK, a multibyte character set for Chinese. This character set is only available on platforms with GNU libc or GNU libiconv.
#CP936, a Microsoft variant of GBK. This character set is only available on platforms with GNU libc or GNU libiconv.
#GB18030, a multibyte character set for Chinese. This character set is only available on platforms with GNU libc or GNU libiconv.
#EUC-TW, a multibyte character set for traditional Chinese. This character set is only available on platforms with GNU libc or GNU libiconv.
#BIG5, a multibyte character set for traditional Chinese. This character set is only available on platforms with GNU libc or GNU libiconv.
#CP950, a Microsoft variant of BIG5. This character set is only available on platforms with GNU libc or GNU libiconv.
#BIG5-HKSCS, a multibyte character set for traditional Chinese. This character set is only available on platforms with GNU libc or GNU libiconv.
#ISO-2022-CN, a stateful 7-bit multibyte character set for Chinese. This character set is only available on platforms with GNU libc or GNU libiconv.
#ISO-2022-CN-EXT, a stateful 7-bit multibyte character set for Chinese. This character set is only available on platforms with GNU libc or GNU libiconv.
#EUC-KR, a multibyte character set for Korean. This character set is only available on platforms with GNU libc or GNU libiconv.
#CP949, a Microsoft variant of EUC-KR. This character set is only available on platforms with GNU libc or GNU libiconv.
#ISO-2022-KR, a stateful 7-bit multibyte character set for Korean. This character set is only available on platforms with GNU libc or GNU libiconv.
#JOHAB, a multibyte character set for Korean used mostly on DOS. This character set is only available on platforms with GNU libc or GNU libiconv.
#ARMSCII-8, an extension of the ASCII character set, suitable for the Armenian language. This character set is only available on platforms with GNU libc or GNU libiconv.
#GEORGIAN-ACADEMY, an extension of the ASCII character set, suitable for the Georgian language. This character set is only available on platforms with GNU libc or GNU libiconv.
#GEORGIAN-PS, an extension of the ASCII character set, suitable for the Georgian language. This character set is only available on platforms with GNU libc or GNU libiconv.
#TIS-620, an extension of the ASCII character set, suitable for the Thai language. This character set is only available on platforms with GNU libc or GNU libiconv.
#MULELAO-1, an extension of the ASCII character set, suitable for the Laotian language. This character set is only available on platforms with GNU libiconv.
#CP1133, an extension of the ASCII character set, suitable for the Laotian language. This character set is only available on platforms with GNU libc or GNU libiconv.
#VISCII, an extension of the ASCII character set, suitable for the Vietnamese language. This character set is only available on platforms with GNU libc or GNU libiconv.
#TCVN, an extension of the ASCII character set, suitable for the Vietnamese language. This character set is only available on platforms with GNU libc or GNU libiconv.

In addition to the aliases defined above (e.g. UNICODE-16 for UCS-2), lgc also defines the following aliases:

UTF-8 / UTF8
ISO-8859-1 / 8859-1 / LATIN1 / WEST
ISO-8859-2 / 8859-2 / LATIN2 / EAST
ISO-8859-3 / 8859-3 / LATIN3 / SOUTH
ISO-8859-4 / 8859-4 / LATIN4 / NORTH
ISO-8859-5 / 8859-5 / CYRILLIC
ISO-8859-6 / 8859-6 / ARABIC
ISO-8859-7 / 8859-7 / GREEK
ISO-8859-8 / 8859-8 / HEBREW
ISO-8859-9 / 8859-9 / LATIN5 / TURKISH
ISO-8859-10 / 8859-10 / LATIN6 / NORDIC
ISO-8859-11 / 8859-11 / THAI
ISO-8859-13 / 8859-13 / LATIN7 / BALTIC
ISO-8859-14 / 8859-14 / LATIN8 / CELTIC
ISO-8859-15 / 8859-15 / LATIN9 / LATIN0
ISO-8859-16 / 8859-16 / LATIN10 / SOUTHEAST

=============================================
Encoding name normalization
=============================================
filter-assoc has form ((name11 name12 ...) ...) where name12 is an alias for name11. In general, the first name in each list is the canonical name for the encoding.

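(normalize-name name filter-assoc) returns the canonical name of the encoding denoted by the given name. Names are compared case-insensitively, so e.g. "latin1" normalizes to "ISO-8859-1". If the name occurs nowhere in filter-assoc, an "Unknown filter" error is reported.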
|#

; Table of the encoding names accepted by the filter option.
; Each entry is a list of strings: the first string is the canonical
; name, any further strings are aliases for it.
; normalize-name searches the entries in order and compares names with
; equalp, so lookups are case-insensitive.
(defc filter-assoc
 '(("BINARY")
   ("BS")
   ("UCS-2" "UNICODE-16" "UNICODE-16-BIG-ENDIAN")
   ("UNICODE-16-LITTLE-ENDIAN")
   ("UCS-4" "UNICODE-32" "UNICODE-32-BIG-ENDIAN")
   ("UNICODE-32-LITTLE-ENDIAN")
   ("UTF-8" "UTF8")
   ("UTF-16")
   ("UTF-7")
   ("JAVA")
   ("ASCII")
   ("ISO-8859-1" "8859-1" "LATIN1" "WEST")
   ("ISO-8859-2" "8859-2" "LATIN2" "EAST")
   ("ISO-8859-3" "8859-3" "LATIN3" "SOUTH")
   ("ISO-8859-4" "8859-4" "LATIN4" "NORTH")
   ("ISO-8859-5" "8859-5" "CYRILLIC")
   ("ISO-8859-6" "8859-6" "ARABIC")
   ("ISO-8859-7" "8859-7" "GREEK")
   ("ISO-8859-8" "8859-8" "HEBREW")
   ("ISO-8859-9" "8859-9" "LATIN5" "TURKISH")
   ("ISO-8859-10" "8859-10" "LATIN6" "NORDIC")
   ("ISO-8859-11" "8859-11" "THAI")
   ("ISO-8859-13" "8859-13" "LATIN7" "BALTIC")
   ("ISO-8859-14" "8859-14" "LATIN8" "CELTIC")
   ("ISO-8859-15" "8859-15" "LATIN9" "LATIN0")
   ("ISO-8859-16" "8859-16" "LATIN10" "SOUTHEAST")
   ("KOI8-R")
   ("KOI8-U")
   ("KOI8-RU")
   ("JIS_X0201")
   ("MAC-ARABIC")
   ("MAC-CENTRAL-EUROPE")
   ("MAC-CROATIAN")
   ("MAC-CYRILLIC")
   ("MAC-DINGBAT")
   ("MAC-GREEK")
   ("MAC-HEBREW")
   ("MAC-ICELAND")
   ("MAC-ROMAN" "MACINTOSH")
   ("MAC-ROMANIA")
   ("MAC-SYMBOL")
   ("MAC-THAI")
   ("MAC-TURKISH")
   ("MAC-UKRAINE")
   ("CP437")
   ("CP437-IBM")
   ("CP737")
   ("CP775")
   ("CP850")
   ("CP852")
   ("CP852-IBM")
   ("CP855")
   ("CP857")
   ("CP860")
   ("CP860-IBM")
   ("CP861")
   ("CP861-IBM")
   ("CP862")
   ("CP862-IBM")
   ("CP863")
   ("CP863-IBM")
   ("CP864")
   ("CP864-IBM")
   ("CP865")
   ("CP865-IBM")
   ("CP866")
   ("CP869")
   ("CP869-IBM")
   ("CP874")
   ("CP874-IBM")
   ("WINDOWS-1250" "CP1250")
   ("WINDOWS-1251" "CP1251")
   ("WINDOWS-1252" "CP1252")
   ("WINDOWS-1253" "CP1253")
   ("WINDOWS-1254" "CP1254")
   ("WINDOWS-1255" "CP1255")
   ("WINDOWS-1256" "CP1256")
   ("WINDOWS-1257" "CP1257")
   ("WINDOWS-1258" "CP1258")
   ("HP-ROMAN8")
   ("NEXTSTEP")
   ("EUC-JP")
   ("SHIFT-JIS")
   ("CP932")
   ("ISO-2022-JP")
   ("ISO-2022-JP-2")
   ("ISO-2022-JP-1")
   ("EUC-CN")
   ("HZ")
   ("GBK")
   ("CP936")
   ("GB18030")
   ("EUC-TW")
   ("BIG5")
   ("CP950")
   ("BIG5-HKSCS")
   ("ISO-2022-CN")
   ("ISO-2022-CN-EXT")
   ("EUC-KR")
   ("CP949")
   ("ISO-2022-KR")
   ("JOHAB")
   ("ARMSCII-8")
   ("GEORGIAN-ACADEMY")
   ("GEORGIAN-PS")
   ("TIS-620")
   ("MULELAO-1")
   ("CP1133")
   ("VISCII")
   ("TCVN")))

; (normalize-name name assoc) walks the entries of assoc from the front.
; The first entry that contains name (compared with equalp, hence
; case-insensitively) determines the result: the first element of that
; entry. When assoc is exhausted, an "Unknown filter" error is reported.
(deff normalize-name (name assoc)
 (cond
  ((atom assoc)
   (c-error "Unknown filter: ~a" name))
  ((member name (car assoc) :test 'equalp)
   (car (car assoc)))
  (t
   (normalize-name name (cdr assoc)))))

#|
=============================================
Bytewise translation
=============================================
(table-translate vector table) translates the given vector byte by byte according to the given table. The table must have 256 entries. table-translate raises an exception for bytes that translate to 0 and removes bytes that translate to 1. The output from table-translate is supposed to be in Logiweb-UTF-8.

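(filter-table parm*) constructs a table which translates all bytes to themselves except that bytes below 32 are translated to zero. Then filter-table modifies the table according to the given parm* which is supposed to be a list of strings like ("newline" "10" "space" "9" "ignore" "13") which indicates how individual bytes should be translated. The list consists of keyword/byte pairs: in the example, byte 10 is translated to a newline (code 10), byte 9 is translated to a space (code 32), and byte 13 is removed from the input.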
|#

; (filter-table parm*) builds a fresh 256 entry translation table.
; Initially, bytes 0..31 map to 0 (the "illegal" marker) and every other
; byte maps to itself. The keyword/value pairs in parm* are then applied
; on top of that by filter-table1, whose result is returned.
(deff filter-table (parm*)
 (:let table (make-vector 256))
 (dotimes (code 256)
  (setf (aref table code) (if (< code 32) 0 code)))
 (filter-table1 parm* table))

; (filter-table1 parm* table) consumes parm* two elements at a time.
; The first element of each pair is a keyword (see key2byte), the second
; a decimal byte value (see value2byte). For each pair, the table entry
; of the given byte is overwritten with the code of the keyword.
; Returns the (destructively updated) table once parm* is exhausted.
(deff filter-table1 (parm* table)
 (:when (atom parm*) table)
 (:let (kw . tail) parm*)
 ; A keyword without a following value is an error.
 (:when (atom tail) (c-error "Filter keywords and values must occur pairwise"))
 (:let (number . remaining) tail)
 ; The keyword is validated before the number, as in the pair order.
 (:let target (key2byte kw))
 (:let source (value2byte number))
 (setf (aref table source) target)
 (filter-table1 remaining table))

; (value2byte value) converts the string value to a byte.
; The string must be a decimal integer in the range 0..255 with nothing
; after the number; otherwise an error is reported through c-error.
(deff value2byte (value)
 ; parse-integer returns the parsed number (or nil) and the index at
 ; which parsing stopped.
 (multiple-value-bind (number end) (parse-integer value :junk-allowed t)
  (cond
   ((not (numberp number))
    (c-error "Filter parameter ~s is not a number" value))
   ((not (equalp end (length value)))
    (c-error "Filter parameter ~s is a number followed by garbage" value))
   ((not (<= 0 number 255))
    (c-error "Filter parameter ~s not in 0..255" value))
   (t number))))

; (key2byte key) maps a filter keyword to the code stored in the
; translation table for it:
;   "newline" -> 10
;   "space"   -> 32
;   "ignore"  -> 1  (the marker which makes table-translate1 drop a byte)
; Keywords are compared with equalp, i.e. case-insensitively.
; Any other keyword is reported as an error which names the keyword.
(deff key2byte (key)
 (:when (equalp key "newline") 10)
 (:when (equalp key "space") 32)
 (:when (equalp key "ignore") 1)
 ; The ~a directive needs the offending keyword as its argument.
 (c-error "Unknown filter keyword: ~a" key))

; (table-translate vector table) translates vector byte by byte through
; table. An output vector of the same length as the input is allocated
; and handed to table-translate1, which processes the input from the
; last index down to 0 and produces the result.
(deff table-translate (vector table)
 (:let top (- (length vector) 1))
 (:let output (make-vector (+ top 1)))
 (table-translate1 vector top output top table))

; (table-translate1 vector1 n1 vector2 n2 table) is the worker of
; table-translate.
;   vector1/n1  input vector and index of the next input byte to process
;   vector2/n2  output vector and index of the next free output slot
; Both indices run downwards. For each input byte the table entry decides:
;   0     -> report an illegal character
;   1     -> drop the byte (only n1 moves)
;   other -> store the entry at n2 (both indices move)
; When the input is exhausted, the result is (subvector vector2 (+ n2 1)).
; NOTE(review): subvector presumably returns the part of vector2 from that
; index to the end, i.e. the slots actually written - confirm.
(deff table-translate1 (vector1 n1 vector2 n2 table)
 (:when (< n1 0) (subvector vector2 (+ n2 1)))
 (:let code (aref vector1 n1))
 (:let mapped (aref table code))
 (:let next (- n1 1))
 (cond
  ((equalp mapped 0)
   (c-error "Illegal character in input. Character code: ~d" code))
  ((equalp mapped 1)
   (table-translate1 vector1 next vector2 n2 table))
  (t
   (setf (aref vector2 n2) mapped)
   (table-translate1 vector1 next vector2 (- n2 1) table))))

; Tests of table-translate and filter-table.
; NOTE(review): etst presumably checks that its two forms evaluate to
; equal values, and xtst that its form raises an exception - confirm
; against their definitions.

; Without filter parameters, bytes 65..67 translate to themselves.
(etst (ct2vector "ABC")
 (table-translate (ct2vector "ABC") (filter-table nil)))
; Byte 31 is below 32, so its table entry is 0 and translation must fail.
(xtst (table-translate (ct2vector '(65 31 67)) (filter-table nil)))
; Byte 65 becomes newline (10), byte 66 becomes space (32), byte 67 is dropped.
(etst (ct2vector '(10 32 10 32))
 (table-translate (ct2vector '(65 66 67 65 66 67))
  (filter-table '("newline" "65" "space" "66" "ignore" "67"))))

#|
=============================================
BINARY and BS filters
=============================================
The binary filter encodes a binary input file base 64.
The binary-string filter also puts ""/ in front of the encoded input and " followed by a space after it.
|#

; (filter-binary vector) implements the BINARY filter: the input vector
; is passed through vector2card* and the result through card*2safe*.
; NOTE(review): card*2safe* presumably produces the base 64 text using
; the alphabet A-Z a-z 0-9 - _ described for BINARY - confirm.
(deff filter-binary (vector)
 (:let card* (vector2card* vector))
 (card*2safe* card*))

; (filter-bs vector) implements the BS (binary string) filter.
; Returns a three element list: the opening escape sequence ""/, the
; output of filter-binary, and a closing " followed by one space.
(deff filter-bs (vector)
 (:let payload (filter-binary vector))
 (list "\"\"/" payload "\" "))

; Tests of the BINARY and BS filters.
; NOTE(review): etst presumably checks that its two forms evaluate to
; equal values - confirm against its definition.

; The four bytes 0 0 0 1 are expected to encode as "AAAABA".
(etst
 (ct2vector "AAAABA")
 (ct2vector (filter-binary (ct2vector (list 0 0 0 1)))))

; The BS filter wraps the same encoding in ""/ ... " plus a trailing space.
(etst
 (ct2vector "\"\"/AAAABA\" ")
 (ct2vector (filter-bs (ct2vector (list 0 0 0 1)))))

#|
=============================================
Clisp filters
=============================================
(filter-clisp vector name table) applies the clisp filter with the given name to the given vector, then expresses the result in Logiweb-UTF-8 using the clisp UTF-8 filter and table-translate.
|#

; (name2encoding name) constructs an encoding object for the character
; set with the given name, using unix line terminators.
; Both error actions are set to the Null character. filter-clisp relies
; on that: it searches the decoded string for Null to detect bad input.
(deff name2encoding (name)
 (:let fallback #\Null)
 (make-encoding
  :charset name
  :input-error-action fallback
  :output-error-action fallback
  :line-terminator :unix))

; Encoding object for UTF-8. filter-clisp uses it to convert the decoded
; string back to bytes before the bytewise table translation.
(defc utf8-encoding (name2encoding "UTF-8"))

; (filter-error string position) reports an invalid character found at
; the given position of string on standard output and then calls (raise).
; If at least two characters precede the error, up to 100 of them are
; printed as context.
(deff filter-error (string position)
 (:let from (max 0 (- position 100)))
 (:let shown (- position from))
 (format t "Invalid character at position ~d~%" position)
 (when (>= shown 2)
  (format t "Last ~d characters before error:~%" shown)
  (format t "~s~%" (subseq string from position)))
 (raise))

; (filter-clisp vector name table) decodes the bytes in vector using the
; encoding with the given name, re-encodes the resulting string as UTF-8
; and finally runs the bytes through table-translate with the given table.
; A Null character in the decoded string is treated as an invalid input
; character and reported through filter-error.
(deff filter-clisp (vector name table)
 (:let text (convert-string-from-bytes vector (name2encoding name)))
 (:let bad (position #\Null text))
 (:when bad (filter-error text bad))
 (table-translate (convert-string-to-bytes text utf8-encoding) table))

; For experimenting with character encodings
; (reverse-filter name ct ...) goes the opposite way of a filter: the
; character codes obtained from ct via ct2card* are turned into a string,
; the string is encoded with the encoding of the given name, and each
; resulting byte is printed with reverse-filter-print.
; Returns the encoded byte vector.
(deff reverse-filter (name &rest ct)
 (:let text (map 'string #'code-char (ct2card* ct)))
 (:let bytes (convert-string-to-bytes text (name2encoding name)))
 (loop for byte across bytes do (reverse-filter-print byte))
 (terpri)
 bytes)

; (reverse-filter-print card) prints one byte in readable form.
; Codes 32..127 and codes from 160 upwards are printed as the character
; with that code. Tab, newline, return and escape get symbolic names.
; Everything else is printed in hexadecimal inside angle brackets, with a
; leading zero for codes below 16.
(deff reverse-filter-print (card)
 (cond
  ((<= 32 card 127) (format t "~a" (code-char card)))
  ((<= 160 card) (format t "~a" (code-char card)))
  ((= 09 card) (format t "<tab>"))
  ((= 10 card) (format t "<newline>"))
  ((= 13 card) (format t "<return>"))
  ((= 27 card) (format t "<esc>"))
  ((< card 16) (format t "<0~x>" card))
  (t (format t "<~x>" card))))

#|
=============================================
Filtering
=============================================
|#

; Filter specification which apply-filter passes to default together
; with the option it was given.
; NOTE(review): presumably this is the fallback used when no filter
; option is supplied, and string2list splits the string at its leading
; separator character, giving ("UTF-8" "newline" "10" "space" "9"
; "ignore" "13") - confirm against default and string2list.
(defc *default-filter*
 (string2list ",UTF-8,newline,10,space,9,ignore,13"))

; (apply-filter option vector) applies the filter described by option to
; the byte vector. The filter specification is a list whose first element
; names the encoding and whose remaining elements are keyword/value pairs
; for filter-table. The encoding name is normalized first; BINARY and BS
; are handled by their own filters and ignore the remaining elements,
; all other encodings go through filter-clisp.
(deff apply-filter (option vector)
 (:let (charset . setting*) (default *default-filter* option))
 (:let canonical (normalize-name charset filter-assoc))
 (cond
  ((equalp canonical "BINARY") (filter-binary vector))
  ((equalp canonical "BS") (filter-bs vector))
  (t (filter-clisp vector canonical (filter-table setting*)))))

; Tests of apply-filter.
; NOTE(review): etst presumably checks that its two forms evaluate to
; equal values, and xtst that its form raises an exception - confirm
; against their definitions.

; Byte 5 becomes newline, byte 6 becomes space, byte 7 is dropped;
; bytes 32 and 65 pass through unchanged.
(etst
 (vec 10 32 32 65)
 (apply-filter (string2list ",UTF-8,newline,5,space,6,ignore,7")
  (vec 5 6 7 32 65)))
; The input ends in byte 253; the filter is expected to fail on it.
(xtst
 (apply-filter (string2list ",UTF-8")
  (vec 65 66 67 253)))
; Byte 8 is below 32 and has no translation, so the filter must fail.
(xtst
 (apply-filter (string2list ",UTF-8,newline,5,space,6,ignore,7")
  (vec 5 6 8 32 65)))
; The alias 8859-1 selects ISO-8859-1; the bytes E6 F8 E5 come out as
; two-byte UTF-8 sequences.
(etst
 (vec #x61 #x62 #x63 #xC3 #xA6 #xC3 #xB8 #xC3 #xA5)
 (apply-filter (string2list ",8859-1")
  (vec #x61 #x62 #x63 #xE6 #xF8 #xE5)))
; Lower case alias latin1; in addition byte 98 (#x62) is mapped to space.
(etst
 (vec #x61 #x20 #x63 #xC3 #xA6 #xC3 #xB8 #xC3 #xA5)
 (apply-filter (string2list ",latin1,space,98")
  (vec #x61 #x62 #x63 #xE6 #xF8 #xE5)))








