;;; unicode.el --- for UNICODE special features

;; Copyright (C) 1997 Miyashita Hisashi

;; Keywords: mule, multilingual, 
;;           character set, coding-system, ISO10646, Unicode

;; This file is part of MULE-UCS

;; MULE-UCS is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 2, or (at your option)
;; any later version.

;; MULE-UCS is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with this program; see the file COPYING.  If not, write to the
;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
;; Boston, MA 02111-1307, USA.

;; Comment:
;;  This module supports special unicode characters, and non-character values.
;;  For example, byte order mark, separators, layout and format 
;;  control characters and replacement characters.

(require 'mucs)
(require 'mucs-ccl)
(require 'trans-util)
(require 'tae)

(require 'un-data)

(defconst utf8-encode-buffer-magnification 2)
(defconst utf8-decode-buffer-magnification 2)
(defconst utf16-encode-buffer-magnification 2)
(defconst utf16-decode-buffer-magnification 2)

(defvar mucs-unicode-default-decode-replacement ??)
(defvar mucs-unicode-default-encode-replacement ?\xfffd)
(defvar mucs-unicode-charset-translation-max-gap t)

(defvar mucs-ccl-unicode-translation-table-number 1)

(defvar mucs-unicode-default-alist-symbol 'unicode)

(defun mucs-unicode-default-encoding (x)
  (cons (char-codepoint (car x)) (cdr x)))

(defun utf16-ccl-surrogate-pair-p (reg)
  `((,reg & ?\xf800) == ?\xd800))

;;
;; Dealing with line separator problem.
;;

(defun mucs-ccl-convert-lf-2-crlf (cr-output)
  `((if (r0 == ,unicode-lf)
	,(append cr-output))))

(defvar lf-vs-cr-alist
  (list (cons ?\xa unicode-cr)))

(put 'ascii
     'lf-vs-cr
     'lf-vs-cr-alist)

(defvar lf-vs-unicode-line-separator-alist
  (list (cons ?\xa unicode-line-separator)))

(put 'ascii
     'lf-vs-unicode-line-separator
     'lf-vs-unicode-line-separator-alist)

;;
;; WRITE SIGNATURE, CHECK AND READM SIGNATURE
;; READ, WRITE
;;

(defvar ucs4-be-ccl-encode
  mucs-ccl-write-ex-be-4-octet)

(defvar ucs4-le-ccl-encode
  mucs-ccl-write-ex-le-4-octet)

;UTF 8 ------------------------------------------------

(defvar utf8-ccl-encode
  `((if (r0 < ?\x80)
	((write r0))
      (if (r0 < ?\x800)
	  ((write ((r0 >> 6) | ?\xc0))
	   (write (r0 & ?\x3f)))
	(if (r0 < ?\x10000)
	    ((write ((r0 >> 12) | ?\xe0))
	     (write (((r0 >> 6) & ?\x3f) | ?\x80))
	     (write (r0 & ?\x3f)))
	  (if (r0 < ?\x200000)
	      ((write ((r0 >> 12) | ?\xf0))
	       (write (((r0 >> 6) & ?\x3f) | ?\x80))
	       (write (r0 & ?\x3f)))
	    (if (r0 < ?\x4000000)
		((write ((r0 >> 18) | ?\xf8))
		 (write (((r0 >> 12) & ?\x3f) | ?\x80))
		 (write (((r0 >> 6) & ?\x3f) | ?\x80))
		 (write (r0 & ?\x3f)))
	      ((write ((r0 >> 24) | ?\xf8))
	       (write (((r0 >> 18) & ?\x3f) | ?\x80))
	       (write (((r0 >> 12) & ?\x3f) | ?\x80))
	       (write (((r0 >> 6) & ?\x3f) | ?\x80))
	       (write (r0 & ?\x3f))))))))))

(defvar utf8-ccl-decode
  `((read-if (r0 >= ?\x80)
	((if (r0 < ?\xe0)
	     ((read r4)
	      (r4 &= ?\x3f)
	      (r0 = (((r0 & ?\x1f) << 6) | r4)))
	   (if (r0 < ?\xf0)
	       ((read r4 r6)
		(r4 = ((r4  & ?\x3f) << 6))
		(r6 &= ?\x3f)
		(r0 = ((((r0 & ?\xf) << 12) | r4) | r6)))
	     (if (r0 < ?\xf8)
		 ((read r1 r4 r6)
		  (r1 = ((r1  & ?\x3f) << 12))
		  (r4 = ((r4  & ?\x3f) << 6))
		  (r6 &= ?\x3f)
		  (r0 = (((((r0 & ?\x7) << 18) | r1) | r4) | r6)))
	       (if (r0 < ?\xfc)
;;;; MUCS can't read any numbers lager than 24bit
		   ((read r0 r1 r4 r6)
		    (r1 = ((r1  & ?\x3f) << 12))
		    (r4 = ((r4  & ?\x3f) << 6))
		    (r6 &= ?\x3f)
		    (r0 = (((((r0 & ?\x3f) << 18) | r1) | r4) | r6)))
		 (r0 = 0)))))))))

(defun mucs-ccl-write-utf8-signature ()
  '((write ?\xef) (write ?\xbb) (write ?\xbf)))

(defun mucs-ccl-utf8-check-signature-read ()
  (append
   utf8-ccl-decode
   `((if (r0 == ,unicode-signature)
	 ,utf8-ccl-decode))))

(defun mucs-ccl-read-utf8 ()
  utf8-ccl-decode)

(defun mucs-ccl-write-utf8 ()
  utf8-ccl-encode)

(defun mucs-ccl-write-utf8-dos ()
  (append
   (mucs-ccl-convert-lf-2-crlf '((write ?\xd)))
   utf8-ccl-encode))

;UTF 16 -----------------------------------------------
;;;;
;;;; If register (r5 & ?\x1) is 1, current mode is little endian.
;;;;

(defun mucs-ccl-utf16-little-endian-p ()
  '(r5 & ?\x1))

(defun mucs-ccl-set-utf16-endian (littlep)
  (if littlep
      '((r5 |= ?\x1))
    `((r5 &= ,(logxor (mucs-number-mask) ?\x1)))))

(defvar utf16-ccl-decode
  `((if ,(mucs-ccl-utf16-little-endian-p)
	,mucs-ccl-read-ex-le-2-octet
      ,mucs-ccl-read-ex-be-2-octet)
    (if ,(utf16-ccl-surrogate-pair-p 'r0)
	((if ,(mucs-ccl-utf16-little-endian-p)
	     ((read r6 r4))
	   ((read r4 r6)))
	 (r0 = (((r0 & ?\x3ff) + ?\x40) << 10))
	 (r6 &= ?\x3f)
	 (r4 = ((r4 & ?\x3) << 6) | r6)
	 (r0 |=  r4)))))

(defun mucs-ccl-utf16-check-signature-read ()
  (append mucs-ccl-read-ex-le-2-octet
	  `((if (r0 == ,unicode-signature)
		,(append (mucs-ccl-set-utf16-endian t)
			 mucs-ccl-read-ex-le-2-octet)
	      (if (r0 == ,unicode-reverse-signature)
		  ,(append (mucs-ccl-set-utf16-endian nil)
			   mucs-ccl-read-ex-be-2-octet)))
	    (if ,(utf16-ccl-surrogate-pair-p 'r0)
		((if ,(mucs-ccl-utf16-little-endian-p)
		     ((read r6 r4))
		   ((read r4 r6)))
		 (r0 = (((r0 & ?\x3ff) + ?\x40) << 10))
		 (r6 &= ?\x3f)
		 (r4 = ((r4 & ?\x3) << 6) | r6)
		 (r0 |=  r4))))))

(defun mucs-ccl-read-utf16 ()
  utf16-ccl-decode)

;UTF 16 Little Endian----------------------------------

(defvar utf16-le-ccl-encode
  `((if (r0 < ?\xffff)
	,mucs-ccl-write-ex-le-2-octet
      ((r4 = (((r0 >> 16) - 1) & ?\xf))
       (r6 = ((r0 >> 10) & ?\x3f))
       (write ((r4 & ?\x3) | r6))
       (write ((r4 >> 2) | ?\xd8))
       (write (r0 & ?\7f))
       (write (((r0 >> 8) & ?\x3) | ?\xdc))))))

(defvar utf16-le-ccl-decode
  (append mucs-ccl-read-ex-le-2-octet
    `((if ,(utf16-ccl-surrogate-pair-p 'r0)
	  ((read r6 r4)
	   (r0 = (((r0 & ?\x3ff) + ?\x40) << 10))
	   (r6 &= ?\x3f)
	   (r4 = ((r4 & ?\x3) << 6) | r6)
	   (r0 |=  r4))))))

(defun mucs-ccl-write-utf16-le-signature ()
  '((write ?\xff) (write ?\xfe)))

(defun mucs-ccl-read-utf16-le ()
  utf16-le-ccl-decode)

(defun mucs-ccl-write-utf16-le ()
  utf16-le-ccl-encode)

(defun mucs-ccl-write-utf16-le-dos ()
  (append
   (mucs-ccl-convert-lf-2-crlf '((write "\xd\x0")))
   utf16-le-ccl-encode))

;UTF 16 Big Endian-------------------------------------

(defvar utf16-be-ccl-decode
  (append mucs-ccl-read-ex-be-2-octet
    `((if ,(utf16-ccl-surrogate-pair-p 'r0)
	  ((read r4 r6)
	   (r0 = (((r0 & ?\x3ff) + ?\x40) << 10))
	   (r6 &= ?\x3f)
	   (r4 = ((r4 & ?\x3) << 6) | r6)
	   (r0 |=  r4))))))

(defvar utf16-be-ccl-encode
  `((if (r0 < ?\xffff)
	,mucs-ccl-write-ex-be-2-octet
      ((r4 = (((r0 >> 16) - 1) & ?\xf))
       (r6 = ((r0 >> 10) & ?\x3f))
       (write ((r4 >> 2) | ?\xd8))
       (write ((r4 & ?\x3) | r6))
       (write (((r0 >> 8) & ?\x3) | ?\xdc))
       (write (r0 & ?\7f))))))

(defun mucs-ccl-write-utf16-be-signature ()
  '((write ?\xfe) (write ?\xff)))

(defun mucs-ccl-read-utf16-be ()
  utf16-be-ccl-decode)

(defun mucs-ccl-write-utf16-be ()
  utf16-be-ccl-encode)

(defun mucs-ccl-write-utf16-be-dos ()
  (append
   (mucs-ccl-convert-lf-2-crlf '((write "\x0\xd")))
   utf16-be-ccl-encode))

;------------------------------------------------------

(defvar unicode-charset-library-alist
  '((ascii . uascii)
    (latin-iso8859-1 . uiso8859-1)
    (latin-iso8859-2 . uiso8859-2)
    (latin-iso8859-3 . uiso8859-3)
    (latin-iso8859-4 . uiso8859-4)
    (cyrillic-iso8859-5 . uiso8859-5)
    (arabic-iso8859-6 . uiso8859-6)
    (greek-iso8859-7 . uiso8859-7)
    (hebrew-iso8859-8 . uiso8859-8)
    (latin-iso8859-9 . uiso8859-9)
    (latin-jisx0201 . ujisx0201)
    (katakana-jisx0201 . ujisx0201)
    (japanese-jisx0208 . ujisx0208)
    (japanese-jisx0212 . ujisx0212)
    (chinese-gb2312 . ugb2312)
    (chinese-cns11643-1 . ucns11643)
    (chinese-cns11643-2 . ucns11643)
    (chinese-cns11643-3 . ucns11643)
    (korean-ksc5601 . uksc5601)))

(defun require-unicode-charset-data (charset)
  (let ((package (cdr (assq charset unicode-charset-library-alist))))
    (or (featurep package)
	(load (concat mucs-data-path (symbol-name package)) t)
	(require package))))

(defun mucs-charset-unicode-table-set-symbol (charset def)
  (intern (format "%s-%s-table-set" 
		  (symbol-name charset)
		  (symbol-name def))))

(defun mucs-ccl-unicode-get-translation-table-symbol ()
  (let ((symname (format
		 "mucs-unicode-translation-table-%d" 
		 mucs-ccl-unicode-translation-table-number))
	sym)
    (setq mucs-ccl-unicode-translation-table-number
	  (1+ mucs-ccl-unicode-translation-table-number))
    (if (and (setq sym (intern-soft symname))
	     (get sym 'ccl-translation-table))
	(mucs-ccl-unicode-get-translation-table-symbol)
      (intern symname))))

(defun mucs-ccl-translate-encode-simple-unicode
  (charset-key-list &optional notfound-to alist-symbol)
  (if (null alist-symbol)
      (setq alist-symbol
	    mucs-unicode-default-alist-symbol))
  (if (null notfound-to) 
      (setq 
       notfound-to 
       mucs-unicode-default-encode-replacement))
  (let (table-set-alist
	sym
	confr
	func-table
	func-table-sym
	code-tables
	tables
	ccl-pgm
	pgm)
    (setq table-set-alist
	  (mapcar
	   (lambda (cs)
	     (let ((alist-symbol alist-symbol))
	       (if (and (consp cs)
			(charsetp (car cs))
			(get (car cs) (cdr cs)))
		   (setq alist-symbol (cdr cs)
			 cs (car cs))
		 (require-unicode-charset-data cs))

	       (setq sym 
		     (mucs-charset-unicode-table-set-symbol 
		      cs alist-symbol))
	       (if (null (get-table-set-alist sym))
		   (put-table-set-alist sym (get cs alist-symbol)))
	       (put-table-set-encode-method
		sym 'mucs-unicode-default-encoding)
	       (cons (charset-id cs) sym)))
	   charset-key-list)
	  confr (tae-generate-func-to-func-map
		 table-set-alist nil 'number)

	  func-table-sym (mucs-ccl-unicode-get-translation-table-symbol)
	  func-table (car (make-translation-tables (car confr) nil nil t))
	  code-tables (cdr confr)
	  tables (progn
		   (register-ccl-translation-table func-table-sym func-table)
		   (cons func-table-sym code-tables))

	  pgm (generate-table-registration-program tables)

	  ccl-pgm `((translate-single-map r4 r1 ,func-table-sym)
		    (if (r4 == -1) 
			((r0 = ,notfound-to))
		      ( ,(append '(iterate-multiple-map r1 r0) code-tables)
		       (if (r0 == -1) ((r0 = ,notfound-to)))))
		    )
	  mucs-ccl-represent-2-form t)

    (list 'mucs-preparation pgm ccl-pgm)))

(defun mucs-ccl-translate-decode-simple-unicode
  (charset-key-list &optional notfound-to alist-symbol)
  (if (null alist-symbol)
      (setq alist-symbol
	    mucs-unicode-default-alist-symbol))
  (if (null notfound-to) 
      (setq 
       notfound-to 
       mucs-unicode-default-decode-replacement))
  (let (table-set-list
	tables
	ccl-pgm
	pgm)
    (setq table-set-list
	  (mapcar
	   (lambda (cs)
	     (let ((alist-symbol alist-symbol)
		   sym)
	       (if (and (consp cs)
			(charsetp (car cs))
			(get (car cs) (cdr cs)))
		   (setq alist-symbol (cdr cs)
			 cs (car cs))
		 (require-unicode-charset-data cs))
	       (setq sym
		     (mucs-charset-unicode-table-set-symbol
		      cs alist-symbol))
	       (if (null (get-table-set-alist sym))
		   (put-table-set-alist sym (get cs alist-symbol)))
	       sym))
	   charset-key-list)
	  tables (tae-generate-union-func-map
		  table-set-list t)

	  pgm (generate-table-registration-program tables)

	  ccl-pgm `((r1 = 0)
		    ( ,(append '(iterate-multiple-map r1 r0) tables)
		    (if (r1 == -1) ((r0 = ,notfound-to)))))
	  mucs-ccl-represent-2-form nil)

    (list 'mucs-preparation pgm ccl-pgm)))

(provide 'unicode)

