;;; unicode.el --- for UNICODE special features

;; Copyright (C) 1997-2000 Miyashita Hisashi

;; Keywords: mule, multilingual, 
;;           character set, coding-system, ISO10646, Unicode

;; This file is part of Mule-UCS

;; Mule-UCS is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 2, or (at your option)
;; any later version.

;; Mule-UCS is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with this program; see the file COPYING.  If not, write to the
;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
;; Boston, MA 02111-1307, USA.

;; Comment:
;;  This module supports unicode translations.

(require 'mucs)
(require 'trans-util)
(require 'tae)

(require 'un-data)

(defconst utf-7-encode-buffer-magnification 3)
(defconst utf-7-decode-buffer-magnification 1)
(defconst utf-8-encode-buffer-magnification 2)
(defconst utf-8-decode-buffer-magnification 2)
(defconst utf-16-encode-buffer-magnification 2)
(defconst utf-16-decode-buffer-magnification 2)

(defvar mucs-unicode-default-decode-replacement ??)
(defvar mucs-unicode-default-encode-replacement ?\xFFFD)

(defun utf-16-ccl-surrogate-pair-p (reg)
  `((,reg & ?\xf800) == ?\xd800))

;;;
;;; UCS generic type definition.
;;;
(mucs-define-type
 'ucs-generic
 'identity
 'identity)

;;;
;;; UCS replacement or ignore translation rule.
;;;

(defvar unicode-not-found-to-replace-or-invalid-assoc
  `(assoc (char-1 . ucs-generic)
	  ( ,@(mapcar
	       (lambda (x)
		 (cons 'invalid x))
	       unicode-ignore-characters)
	      (all . ,mucs-unicode-default-encode-replacement)
	      (,mucs-unicode-default-decode-replacement . all)))
  "Translate any values to replacement character or invalid code.
If you want to deal with untranslated character, use this translation rule.")

(defvar unicode-not-found-to-invalid-assoc
  '(assoc (char-1 . ucs-generic)
	  ((all . invalid)
	   (invalid . all)))
  "Translate any values to invalid code.
If you want to deal with untranslated character, use this translation rule.")

;;
;; Dealing with line separator problem.
;;

(defun convert-unicode-lf-2-crlf (cr-output)
  `((if (r0 == ,unicode-lf)
	,(append cr-output))))

(defvar lf-vs-cr-assoc
  `(assoc (char-1 . ucs-generic)
	  ((?\xa . ,unicode-cr))))

(defvar lf-vs-unicode-line-separator-assoc
  `(assoc (char-1 . ucs-generic)
	  ((?\xa . ,unicode-line-separator))))

;;
;; WRITE SIGNATURE, CHECK AND READ SIGNATURE
;; READ, WRITE
;;

;UTF 8 ------------------------------------------------

(defvar utf-8-ccl-encode
  `((if (r0 < ?\x80)
	((write r0))
      (if (r0 < ?\x800)
	  ((write ((r0 >> 6) | ?\xc0))
	   (write ((r0 & ?\x3f) | ?\x80)))
	(if (r0 < ?\x10000)
	    ((write ((r0 >> 12) | ?\xe0))
	     (write (((r0 >> 6) & ?\x3f) | ?\x80))
	     (write ((r0 & ?\x3f) | ?\x80)))
	  (if (r0 < ?\x200000)
	      ((write ((r0 >> 18) | ?\xf0))
	       (write (((r0 >> 12) & ?\3f) | ?\x80))
	       (write (((r0 >> 6) & ?\x3f) | ?\x80))
	       (write ((r0 & ?\x3f) | ?\x80)))
	    (if (r0 < ?\x4000000)
		((write ((r0 >> 24) | ?\xf8))
		 (write (((r0 >> 18) & ?\x3f) | ?\x80))
		 (write (((r0 >> 12) & ?\x3f) | ?\x80))
		 (write (((r0 >> 6) & ?\x3f) | ?\x80))
		 (write ((r0 & ?\x3f) | ?\x80)))
	      ((write ((r0 >> 30) | ?\xfc))
	       (write (((r0 >> 24) & ?\x3f) | ?\x80))
	       (write (((r0 >> 18) & ?\x3f) | ?\x80))
	       (write (((r0 >> 12) & ?\x3f) | ?\x80))
	       (write (((r0 >> 6) & ?\x3f) | ?\x80))
	       (write ((r0 & ?\x3f) | ?\x80))))))))))

(defvar utf-8-ccl-decode
  `((read-if (r0 >= ?\x80)
	((if (r0 < ?\xe0)
	     ((read r4)
	      (r4 &= ?\x3f)
	      (r0 = (((r0 & ?\x1f) << 6) | r4)))
	   (if (r0 < ?\xf0)
	       ((read r4 r6)
		(r4 = ((r4  & ?\x3f) << 6))
		(r6 &= ?\x3f)
		(r0 = ((((r0 & ?\xf) << 12) | r4) | r6)))
	     (if (r0 < ?\xf8)
		 ((read r1 r4 r6)
		  (r1 = ((r1  & ?\x3f) << 12))
		  (r4 = ((r4  & ?\x3f) << 6))
		  (r6 &= ?\x3f)
		  (r0 = (((((r0 & ?\x7) << 18) | r1) | r4) | r6)))
	       (if (r0 < ?\xfc)
;;;; MUCS can't read any numbers lager than 24bit
		   ((read r0 r1 r4 r6)
		    (r1 = ((r1  & ?\x3f) << 12))
		    (r4 = ((r4  & ?\x3f) << 6))
		    (r6 &= ?\x3f)
		    (r0 = (((((r0 & ?\x3f) << 18) | r1) | r4) | r6)))
		 (r0 = 0)))))))))

(mucs-type-register-serialization
 'ucs-generic
 'utf-8
 utf-8-ccl-encode
 utf-8-ccl-decode)

(mucs-type-register-serialization
 'ucs-generic
 'utf-8-dos
 (append
  (convert-unicode-lf-2-crlf '((write ?\xd)))
  utf-8-ccl-encode)
 'none)

(defun mucs-ccl-write-utf-8-signature ()
  '((write "\xEF\xBB\xBF")))

(defun mucs-ccl-utf-8-check-signature-read ()
  (append
   utf-8-ccl-decode
   `((if (r0 == ,unicode-signature)
	 ,utf-8-ccl-decode))))

;UTF 16 -----------------------------------------------

(mucs-ccl-internal-state-reserve 'utf-16-little-endian-p 1)

(defvar utf-16-ccl-decode
  `((if ,(mucs-ccl-check-internal-state 'utf-16-little-endian-p)
	,mucs-ccl-read-ex-le-2-octet
      ,mucs-ccl-read-ex-be-2-octet)
    (if ,(utf-16-ccl-surrogate-pair-p 'r0)
	((if ,(mucs-ccl-check-internal-state 'utf-16-little-endian-p)
	     ((read r6 r4))
	   ((read r4 r6)))
	 (r0 = (((r0 & ?\x3ff) + ?\x40) << 10))
	 (r6 &= ?\x3f)
	 (r4 = ((r4 & ?\x3) << 6) | r6)
	 (r0 |=  r4)))))

(defun mucs-ccl-utf-16-check-signature-read ()
  (append mucs-ccl-read-ex-le-2-octet
	  `((if (r0 == ,unicode-signature)
		,(append (mucs-ccl-set-internal-state
			  'utf-16-little-endian-p t)
			 mucs-ccl-read-ex-le-2-octet)
	      (if (r0 == ,unicode-reverse-signature)
		  ,(append (mucs-ccl-set-internal-state
			    'utf-16-little-endian-p nil)
			   mucs-ccl-read-ex-be-2-octet)))
	    (if ,(utf-16-ccl-surrogate-pair-p 'r0)
		((if ,(mucs-ccl-check-internal-state 'utf-16-little-endian-p)
		     ((read r6 r4))
		   ((read r4 r6)))
		 (r0 = (((r0 & ?\x3ff) + ?\x40) << 10))
		 (r6 &= ?\x3f)
		 (r4 = ((r4 & ?\x3) << 6) | r6)
		 (r0 |=  r4))))))

(defun mucs-ccl-read-utf-16 ()
  utf-16-ccl-decode)

;UTF 16 Little Endian----------------------------------

(defvar utf-16-le-ccl-encode
  `((if (r0 < ?\xFFFF)
	,mucs-ccl-write-ex-le-2-octet
      ((r4 = (((r0 >> 16) - 1) & ?\xF))
       (r6 = ((r0 >> 10) & ?\x3F))
       (write ((r4 & ?\x3) | r6))
       (write ((r4 >> 2) | ?\xd8))
       (write (r0 & ?\xFF))
       (write (((r0 >> 8) & ?\x3) | ?\xDC))))))

(defvar utf-16-le-ccl-decode
  (append mucs-ccl-read-ex-le-2-octet
    `((if ,(utf-16-ccl-surrogate-pair-p 'r0)
	  ((read r6 r4)
	   (r4 = ((r4 & ?\x3) << 8) | r6)
	   (r0 = ((((r0 & ?\x3FF) + ?\x40) << 10) | r4)))))))

(mucs-type-register-serialization
 'ucs-generic
 'utf-16-le
 utf-16-le-ccl-encode
 utf-16-le-ccl-decode)

(mucs-type-register-serialization
 'ucs-generic
 'utf-16-le-dos
 (append
  (convert-unicode-lf-2-crlf '((write "\x0D\x00")))
  utf-16-le-ccl-encode)
 'none)

(defun mucs-ccl-write-utf-16-le-signature ()
  '((write "\xFF\xFE")))

;UTF 16 Big Endian-------------------------------------

(defvar utf-16-be-ccl-decode
  (append mucs-ccl-read-ex-be-2-octet
    `((if ,(utf-16-ccl-surrogate-pair-p 'r0)
	  ((read r4 r6)
	   (r4 = ((r4 & ?\x3) << 8) | r6)
	   (r0 = ((((r0 & ?\x3FF) + ?\x40) << 10) | r4)))))))

(defvar utf-16-be-ccl-encode
  `((if (r0 < ?\xffff)
	,mucs-ccl-write-ex-be-2-octet
      ((r4 = (((r0 >> 16) - 1) & ?\xF))
       (r6 = ((r0 >> 10) & ?\x3F))
       (write ((r4 >> 2) | ?\xd8))
       (write ((r4 & ?\x3) | r6))
       (write (((r0 >> 8) & ?\x3) | ?\xDC))
       (write (r0 & ?\xFF))))))

(mucs-type-register-serialization
 'ucs-generic
 'utf-16-be
 utf-16-be-ccl-encode
 utf-16-be-ccl-decode)

(mucs-type-register-serialization
 'ucs-generic
 'utf-16-be-dos
 (append
  (convert-unicode-lf-2-crlf '((write "\x00\x0D")))
  utf-16-be-ccl-encode)
 'none)

(defun mucs-ccl-write-utf-16-be-signature ()
  '((write ?\xfe) (write ?\xff)))

;UTF 7 ------------------------------------------------

(mucs-ccl-internal-state-reserve 'utf-7-shifted-p 2)

(defconst utf-7-direct-characters-assoc
  `(assoc
    (ucs-generic . ucs-generic)
    ,(mapcar
      (lambda (x)
	(cons x x))
      '(?+ ?  ?\t ?\r ?\n ;;;; Notice that `+' is included.
	   ?A ?B ?C ?D ?E ?F ?G ?H ?I ?J
	   ?K ?L ?M ?N ?O ?P ?Q ?R ?S ?T
	   ?U ?V ?W ?X ?Y ?Z
	   ?a ?b ?c ?d ?e ?f ?g ?h ?i ?j
	   ?k ?l ?m ?n ?o ?p ?q ?r ?s ?t
	   ?u ?v ?w ?x ?y ?z
	   ?0 ?1 ?2 ?3 ?4 ?5 ?6 ?7 ?8 ?9
	   ?' ?\( ?\)  ?, ?- ?. ?/ ?: ??))))

(defconst utf-7-optional-direct-characters-assoc
  `(assoc
    (ucs-generic . ucs-generic)
    ,(mapcar
      (lambda (x)
	(cons x x))
      '(?! ?\" ?# ?$ ?% ?& ?* ?; ?< ?=
	   ?> ?@ ?\[ ?\] ?^ ?_ ?` ?\{ ?| ?\}))))

(defconst utf-7-shifted-character-assoc
  '(assoc (ucs-generic . ucs-generic)
	  ((all . invalid))))

(defconst ccl-b64-encode-table
  '[?A ?B ?C ?D ?E ?F ?G ?H ?I ?J ;; 0-9
    ?K ?L ?M ?N ?O ?P ?Q ?R ?S ?T ;; 10-19
    ?U ?V ?W ?X ?Y ?Z             ;; 20-25
    ?a ?b ?c ?d ?e ?f ?g ?h ?i ?j ;; 26-35
    ?k ?l ?m ?n ?o ?p ?q ?r ?s ?t ;; 36-45
    ?u ?v ?w ?x ?y ?z             ;; 46-51
    ?0 ?1 ?2 ?3 ?4 ?5 ?6 ?7 ?8 ?9 ;; 52-61
    ?+ ?/])                       ;; 62-63

(defconst ccl-b64-decode-table
  '[ -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1
     -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1
     -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  62  -1  -1  -1  63
     52  53  54  55  56  57  58  59  60  61  -1  -1  -1  -2  -1  -1
     -1   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14
     15  16  17  18  19  20  21  22  23  24  25  -1  -1  -1  -1  -1
     -1  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40
     41  42  43  44  45  46  47  48  49  50  51  -1  -1  -1  -1  -1
     -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1
     -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1
     -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1
     -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1
     -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1
     -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1
     -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1
     -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1])

(tae-declare-translation
 'utf-7-direct-character-p-translation
 `(| ,utf-7-direct-characters-assoc
     ,utf-7-shifted-character-assoc))

(tae-declare-translation
 'utf-7-direct-or-optional-character-p-translation
 `(| ,utf-7-direct-characters-assoc
     ,utf-7-optional-direct-characters-assoc
     ,utf-7-shifted-character-assoc))

(defun ccl-utf-7-base64-encode (valc valp &optional init)
  (if (or (eq valc 'r0)
	  (eq valp 'r0))
      (error "r0 is reserved for caliculation."))
  (let (result)
    (if init
	(setq result
	      `(,@(mucs-ccl-set-internal-state 'utf-7-shifted-p t)
		  (,valp = 0))))
    (setq result
	  (append
	   result
	   `((if (,valc > ?\xFFFF)
		 ((r0 = (((,valc >> 10) - ?\x40) & ?\x3FF) | ?\xD800)
		  (,valc = ((,valc & ?\x3FF) | ?\xDC00)))
	       ((r0 = ,valc)
		(,valc = 0)))
	     (loop
	      (if (,valp == 0) ;; 0bit
		  ((,valp = (r0 >> 10))
		   (write ,valp ,ccl-b64-encode-table)
		   (,valp = ((r0 >> 4) & ?\x3F))
		   (write ,valp ,ccl-b64-encode-table)
		   (,valp = ((r0 & ?\xF) | ?\x30)))
		((if (,valp >= ?\x30) ;; 4bit
		     ((,valp = ((,valp & ?\xF) << 2))
		      (,valp |= (r0 >> 14))
		      (write ,valp ,ccl-b64-encode-table)
		      (,valp = ((r0 >> 8) & ?\x3F))
		      (write ,valp ,ccl-b64-encode-table)
		      (,valp = ((r0 >> 2) & ?\x3F))
		      (write ,valp ,ccl-b64-encode-table)
		      (,valp = ((r0 & ?\x3) | ?\x20)))
		   ((if (,valp >= ?\x20) ;; 2bit
			((,valp = ((,valp & ?\x3) << 4))
			 (,valp |= (r0 >> 12))
			 (write ,valp ,ccl-b64-encode-table)
			 (,valp = ((r0 >> 6) & ?\x3F))
			 (write ,valp ,ccl-b64-encode-table)
			 (,valp = (r0 & ?\x3F))
			 (write ,valp ,ccl-b64-encode-table)
			 (,valp = 0)))))))
	      (if (,valc != 0)
		  ((r0 = ,valc)
		   (,valc = 0)
		   (repeat)))))))))

(defun ccl-utf-7-base64-encode-flush (valp)
  `(,@(mucs-ccl-set-internal-state 'utf-7-shifted-p nil)
    (if (,valp >= ?\x30) ;; 4bit
	((,valp = ((,valp << 2) & ?\x3F))
	 (write ,valp ,ccl-b64-encode-table))
      ((if (,valp >= ?\x20) ;; 2bit
	   ((,valp = ((,valp << 4) & ?\x3F))
	    (write ,valp ,ccl-b64-encode-table)))))
    (write ?-)
    (,valp = 0)))

(defun mucs-ccl-utf-7-encode-eof ()
  `((if ,(mucs-ccl-check-internal-state 'utf-7-shifted-p)
	,(ccl-utf-7-base64-encode-flush 'r6))))

(defun mucs-ccl-utf-7-encode (direct-character-translation dos)
  (mucs-ccl-bind-program
   `(,(mucs-ccl-if-invalid-repeat)
     ((r4 = r0))
     ,(tae-compile direct-character-translation nil)
     ,(mucs-ccl-if-invalid
       ;;; shifted state
       `((if ,(mucs-ccl-check-internal-state 'utf-7-shifted-p)
	     ,(ccl-utf-7-base64-encode 'r4 'r6)
	   ((write ?+)
	    ,@(ccl-utf-7-base64-encode 'r4 'r6 t))))
       ;;; Non-shifted state
       `((if ,(mucs-ccl-check-internal-state 'utf-7-shifted-p)
	     ,(ccl-utf-7-base64-encode-flush 'r6))
	 ,@(if dos
	       '((if (r0 == ?\n)
		     ((write "\x0D\x0A"))
		   ((write r0))))
	     '((write r0)))
	 (if (r0 == ?+)
	     ((write ?-))))))))

(defconst ccl-utf-7-decode
  `((r4 = 0)
    (loop
     (if ,(mucs-ccl-check-internal-state 'utf-7-shifted-p)
	 ((read r0)
	  (r1 = r0 ,ccl-b64-decode-table)
	  (if (r1 >= 0)
	      ((r6 = ((r6 << 6) | r1))
	       (read r0)
	       (r1 = r0 ,ccl-b64-decode-table)
	       (if (r1 >= 0)
		   ((r6 = ((r6 << 6) | r1))
		    (if (r6 >= ?\x30000)
			((r0 = (r6 & ?\xFFFF))
			 (r6 = 0))
		      ((read r0)
		       (r1 = r0 ,ccl-b64-decode-table)
		       (if (r1 >= 0)
			   ((r6 = ((r6 << 6) | r1))
			    (if (r6 >= ?\x800000)
				;; r6 = 1000 XXXX XXXX XXXX XXXX XXXX
				((r0 = ((r6 >> 4) & ?\xFFFF))
				 (r6 = ((r6 & ?\xF) | ?\x30)))
			      ;; r6 = XXXX XXXX XXXX XXXX XX
			      ((r0 = ((r6 >> 2) & ?\xFFFF))
			       (r6 = ((r6 & ?\x3) | ?\x20)))))
			 ,(mucs-ccl-set-internal-state
			   'utf-7-shifted-p nil)))))
		 ,(mucs-ccl-set-internal-state
		   'utf-7-shifted-p nil)))
	    (,@(mucs-ccl-set-internal-state
		'utf-7-shifted-p nil)
	     (if (r0 == ?-)
		 ((repeat))))))
       ((read-if (r0 == ?+)
		 ((read-if (r0 == ?-)
			   ((r0 = ?+))
			((r6 = r0 ,ccl-b64-decode-table)
			 (if (r6 >= 0)
			     ((read r0)
			      (r1 = r0 ,ccl-b64-decode-table)
			      (if (r1 >= 0)
				  ((r6 = ((r6 << 6) | r1))
				   (read r0)
				   (r1 = r0 ,ccl-b64-decode-table)
				   (if (r1 >= 0)
				       ((r6 = ((r6 << 6) | r1))
					(r0 = (r6 >> 2))
					(r6 = ((r6 & ?\x3) | ?\x20))
					,@(mucs-ccl-set-internal-state
					   'utf-7-shifted-p t)))))))))))))
     (if ,(utf-16-ccl-surrogate-pair-p 'r0)
	 ((r4 = r0)
	  (repeat)))
     (if (r4 != 0)
	 ((r4 &= ?\x3FF)
	  (r0 = (((r0 & ?\x3FF) + ?\x40) << 10) | r4))))))

(mucs-type-register-serialization
 'ucs-generic
 'utf-7
 'none
 ccl-utf-7-decode)

;
; Assoc translation rule loader
;

(defvar unicode-charset-library-alist
  '((ascii . uascii)
    (latin-iso8859-1 . uiso8859-1)
    (latin-iso8859-2 . uiso8859-2)
    (latin-iso8859-3 . uiso8859-3)
    (latin-iso8859-4 . uiso8859-4)
    (cyrillic-iso8859-5 . uiso8859-5)
    (arabic-iso8859-6 . uiso8859-6)
    (greek-iso8859-7 . uiso8859-7)
    (hebrew-iso8859-8 . uiso8859-8)
    (latin-iso8859-9 . uiso8859-9)
    (latin-jisx0201 . ujisx0201)
    (katakana-jisx0201 . ujisx0201)
    (japanese-jisx0208 . ujisx0208)
    (japanese-jisx0212 . ujisx0212)
    (chinese-gb2312 . ugb2312)
    (chinese-cns11643-1 . u-cns-1)
    (chinese-cns11643-2 . u-cns-2)
    (chinese-cns11643-3 . u-cns-3)
    (chinese-cns11643-4 . u-cns-4)
    (chinese-cns11643-5 . u-cns-5)
    (chinese-cns11643-6 . u-cns-6)
    (chinese-cns11643-7 . u-cns-7)
    (korean-ksc5601 . uksc5601)
    (ipa . uipa)))

(defun require-unicode-charset-data (charset)
  (let ((package (cdr (assq charset unicode-charset-library-alist))))
    (or (featurep package)
	(load (expand-file-name (symbol-name package)
				mucs-data-path)
	      t)
	(require package))))

;
; Unicode or its transformation format support function.
;

(defun ucs-to-char (codepoint)
  (mucs-convert 'ucs-codepoint-to-emacs-char-conversion
		codepoint))

(defun char-to-ucs (char)
  (logior (lsh (charset-id (char-charset char)) 16)
	  (char-codepoint char))
  (mucs-convert 'emacs-char-to-ucs-codepoint-conversion
		char))

(defun insert-ucs-character (codepoint)
  "Insert character which is converted from give UCS codepoint."
  (interactive "nUCS codepoint:")
  (insert (or (ucs-to-char codepoint)
	      (error "Invalid or cannot translate:U+%X"
		     codepoint))))

(provide 'unicode)
;;; unicode ends here.


