;;; -*- mode: lisp; coding: utf-8 -*-
;;; 
;;; Copyright (c) 2007 Masayuki Onjo <onjo@lispuser.net>
;;; 
;;; Redistribution and use in source and binary forms, with or without
;;; modification, are permitted provided that the following conditions
;;; are met:
;;; 
;;;   * Redistributions of source code must retain the above copyright
;;;     notice, this list of conditions and the following disclaimer.
;;; 
;;;   * Redistributions in binary form must reproduce the above
;;;     copyright notice, this list of conditions and the following
;;;     disclaimer in the documentation and/or other materials
;;;     provided with the distribution.
;;; 
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
;;; ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

;;;
;;; Gauche's guessing character encoding
;;;
;;;   Copyright (c) 2000-2007 Shiro Kawai  <shiro@acm.org>
;;;

;;; Package definition: GUESS uses COMMON-LISP and exports a single
;;; entry point, CES-GUESS-FROM-VECTOR.
(defpackage #:guess
  (:use #:common-lisp)
  (:export #:ces-guess-from-vector))

(in-package #:guess)

(eval-when (:compile-toplevel :load-toplevel :execute)

  (defclass <dfa> ()
    ((name
      :accessor name-of
      :initarg :name
      :documentation "Name given to the automaton in its DEFINE-DFA form.")
     (states
      :accessor states-of
      :initarg :states
      :documentation "List of <state> objects, as built by RESOLVE-STATES.")
     #+nil (instances :allocation :class :initform nil))
    (:documentation
     "A named automaton: a name plus the list of its resolved states."))

  (defclass <state> ()
    ((name
      :accessor name-of
      :initarg :name
      :documentation "Symbol naming this state inside its automaton.")
     (index
      :accessor index-of
      :initarg :index
      :documentation "Position of this state in the automaton's state list.")
     (arcs
      :accessor arcs-of
      :initarg :arcs
      :initform nil
      :documentation "List of outgoing <arc> objects; empty until filled in."))
    (:documentation
     "One state of an automaton together with its outgoing arcs."))

  (defclass <arc> ()
    ((from-state
      :accessor from-state-of
      :initarg :from-state
      :documentation "State this arc leaves.")
     (to-state
      :accessor to-state-of
      :initarg :to-state
      :documentation "State this arc enters.")
     (ranges
      :accessor ranges-of
      :initarg :ranges
      :documentation "Input bytes accepted by this arc, as written in the DFA definition.")
     (index
      :accessor index-of
      :initarg :index
      :documentation "Number identifying this arc among all arcs of the automaton.")
     (score
      :accessor score-of
      :initarg :score
      :documentation "Weight attached to taking this arc."))
    (:documentation
     "A transition between two states, labelled with byte ranges and a score."))

  (defun resolve-states (state-defs)
    "Turn the raw state definitions of a DFA into a list of <state> objects.

STATE-DEFS is a list of (STATE-NAME ARC...) entries where each ARC is
(RANGES TARGET-STATE-NAME SCORE).  One <state> is created per entry, in
order, with INDEX counting from 0.  Each state's ARCS slot is then filled
with <arc> objects whose TO-STATE is the <state> named by the arc's target
and whose INDEX is numbered consecutively across the whole automaton
\(first state's arcs first), starting from 0.

Signals an error when an arc names a state that is not defined in
STATE-DEFS, instead of silently storing NIL as the target.

Returns the list of <state> objects."
    (let ((states (loop for def in state-defs
                        for index from 0
                        collect (make-instance '<state>
                                               :name (car def)
                                               :index index)))
          ;; Arc indices are global to the automaton, not per state.
          (next-arc-index 0))
      (flet ((lookup (target from)
               ;; All states exist before any arc is built, so forward
               ;; references between states resolve here.
               (or (find target states :key #'name-of :test #'eq)
                   (error "DFA state ~S has an arc to undefined state ~S"
                          (name-of from) target))))
        (loop for state in states
              for def in state-defs
              do (setf (arcs-of state)
                       (loop for arc-def in (cdr def)
                             collect (make-instance
                                      '<arc>
                                      :from-state state
                                      :to-state (lookup (cadr arc-def) state)
                                      :ranges (car arc-def)
                                      :index (prog1 next-arc-index
                                               (incf next-arc-index))
                                      :score (caddr arc-def))))))
      states))

;;;;;; DFA

  (defmacro define-dfa (name &body states)
    "Define the lookup tables for the automaton NAME.

STATES is a list of (STATE-NAME ARC...) entries, each ARC being
\(RANGES TARGET-STATE SCORE).  A range is either a single byte or a
\(LOW HIGH) pair, both ends inclusive; a byte may be written as an integer
or as a character (its CHAR-CODE is used).

The expansion defines two constants whose names are interned in the
package current at macroexpansion time:

  +NAME-ST+  simple-vector with one entry per state; each entry is a
             256-element simple-vector mapping an input byte to the index
             of the arc to take, or -1 when no arc accepts that byte.
  +NAME-AR+  simple-vector indexed by arc index; each entry is a cons
             (TARGET-STATE-INDEX . SCORE).  SCORE is always stored as a
             DOUBLE-FLOAT, whatever float format the definition was read
             in, because the scanner multiplies these values under a
             DOUBLE-FLOAT declaration.

Nothing is done when +NAME-ST+ is already bound, so that the DEFCONSTANT
forms are not re-evaluated with a fresh (non-EQL) vector."
    (let ((name-st (intern (string-upcase (format nil "+~A-ST+" name))))
          (name-ar (intern (string-upcase (format nil "+~A-AR+" name)))))
      `(unless (boundp ',name-st)
         (let ((dfa (make-instance '<dfa>
                                   :name ',name
                                   :states (resolve-states ',states))))
           (defconstant ,name-st
             (coerce
              (loop for state in (states-of dfa)
                    collect (let ((vec (make-array 256 :initial-element -1)))
                              (flet ((b2i (byte)
                                       (if (characterp byte) (char-code byte) byte)))
                                (dolist (br (arcs-of state))
                                  (dolist (range (ranges-of br))
                                    (if (consp range)
                                        ;; (LOW HIGH) is inclusive; FILL's
                                        ;; :END is exclusive, hence the +1.
                                        (fill vec (index-of br)
                                              :start (b2i (car range))
                                              :end   (+ (b2i (cadr range)) 1))
                                        (setf (aref vec (b2i range))
                                              (index-of br)))))
                                vec)))
              'simple-vector))
           (defconstant ,name-ar
             (coerce
              (loop for state in (states-of dfa)
                    append (loop for arc in (arcs-of state)
                                 collect (cons (index-of (to-state-of arc))
                                               ;; Normalise to DOUBLE-FLOAT:
                                               ;; literals such as 0.8 are
                                               ;; single-floats by default.
                                               (float (score-of arc) 1.0d0))))
              'simple-vector))))))

;;;;; state data from Gauche's guess.scm

;;;
;;; EUC-JP
;;;

  ;; Automaton for EUC-JP.  Each state lists its arcs as
  ;; (RANGES TARGET-STATE SCORE); a range is a single byte or an inclusive
  ;; (LOW HIGH) pair.  A byte with no arc in the current state leaves the
  ;; table entry at -1.
  (define-dfa eucj
    ;; first byte
    (init
     (((#x00 #x7f)) init         1.0)   ; ASCII range
     ((#x8e)        jis0201_kana 0.8)   ; JISX 0201 kana
     ((#x8f)        jis0213_2    0.95)  ; JISX 0213 plane 2
     (((#xa1 #xfe)) jis0213_1    1.0)   ; JISX 0213 plane 1
     )
    ;; jis x 0201 kana
    (jis0201_kana
     (((#xa1 #xdf)) init         1.0)
     )
    ;; jis x 0208 and jis x 0213 plane 1
    (jis0213_1
     (((#xa1 #xfe)) init         1.0))
    ;; jis x 0213 plane 2
    ;; NOTE(review): after #x8f this state consumes one byte and returns to
    ;; INIT, as in the table this was ported from -- confirm whether a
    ;; second trailing byte should be required.
    (jis0213_2
     (((#xa1 #xfe)) init         1.0))
    )

;;;
;;; Shift_JIS
;;;

  ;; Automaton for Shift_JIS.  Each state lists its arcs as
  ;; (RANGES TARGET-STATE SCORE); a range is a single byte or an inclusive
  ;; (LOW HIGH) pair.
  ;;
  ;; The double-byte lead range starts at #xe0, not #xe1 as in the table
  ;; this was ported from: #xe0 is a valid Shift_JIS lead byte, and
  ;; omitting it made any text using it kill this automaton.
  (define-dfa sjis
    ;; first byte
    (init
     (((#x00 #x7f)) init         1.0)        ;ascii
     (((#x81 #x9f) (#xe0 #xef)) jis0213 1.0) ;jisx0213 plane 1
     (((#xa1 #xdf)) init         0.8)        ;jisx0201 kana
     (((#xf0 #xfc)) jis0213      0.95)       ;jisx0213 plane 2
     (((#xfd #xff)) init         0.8))       ;vendor extension
    ;; second byte of a double-byte character
    (jis0213
     (((#x40 #x7e) (#x80 #xfc)) init 1.0))
    )

;;;
;;; UTF-8
;;;

  ;; Automaton for UTF-8.  Each state lists its arcs as
  ;; (RANGES TARGET-STATE SCORE); a range is an inclusive (LOW HIGH) pair.
  ;; INIT dispatches on the lead byte to a state named after the number of
  ;; continuation bytes still expected; every Nbyte_more state accepts one
  ;; byte in #x80-#xbf and moves to the state expecting one byte fewer.
  ;; Bytes #x80-#xc1 and #xfe-#xff have no arc from INIT.
  ;; NOTE(review): lead bytes #xf8-#xfd (5- and 6-byte forms) are accepted
  ;; here although RFC 3629 limits UTF-8 to 4 bytes -- confirm this is
  ;; intended.
  (define-dfa utf8
    (init
     (((#x00 #x7f)) init         1.0)
     (((#xc2 #xdf)) 1byte_more   1.0)
     (((#xe0 #xef)) 2byte_more   1.0)
     (((#xf0 #xf7)) 3byte_more   1.0)
     (((#xf8 #xfb)) 4byte_more   1.0)
     (((#xfc #xfd)) 5byte_more   1.0))
    (1byte_more
     (((#x80 #xbf)) init         1.0))
    (2byte_more
     (((#x80 #xbf)) 1byte_more   1.0))
    (3byte_more
     (((#x80 #xbf)) 2byte_more   1.0))
    (4byte_more
     (((#x80 #xbf)) 3byte_more   1.0))
    (5byte_more
     (((#x80 #xbf)) 4byte_more   1.0))
    )

;;;
;;; JIS (ISO2022JP)
;;;

  ;; NB: for now, we just check the sequence of <ESC> $ or <ESC> '('.
  ;;
  ;; The form below is QUOTED: it evaluates to a list that is discarded, so
  ;; DEFINE-DFA is never expanded for it and no JIS tables are defined.  It
  ;; is kept as a record of the full ISO-2022-JP automaton.  Bytes written
  ;; as characters (#\( , #\$ ...) stand for their character codes.
  '(define-dfa jis
    (init
     ((#x1b)        esc          1.0)
     (((#x00 #x1a)  (#x1c #x1f)) init 1.0) ;C0
     (((#x20 #x7f)) init         1.0)      ;ASCII
     (((#xa1 #xdf)) init         0.7)      ;JIS8bit kana
     )
    ;; just after ESC: decide which kind of escape sequence follows
    (esc
     ((#x0d #x0a)   init         0.9)	;cancel
     ((#\( )        esc-paren    1.0)
     ((#\$ )        esc-$        1.0)
     ((#\& )        esc-&        1.0)
     )
    ;; after ESC (
    (esc-paren
     ((#\B #\J #\H) init         1.0)
     ((#\I)         jis0201kana  0.8)
     )
    ;; after ESC $
    (esc-$
     ((#\@ #\B)     kanji        1.0)
     ((#\( )        esc-$-paren  1.0)
     )
    ;; after ESC $ (
    (esc-$-paren
     ((#\D #\O #\P) kanji        1.0))
    ;; after ESC &
    (esc-&
     ((#\@ )        init         1.0))
    (jis0201kana
     ((#x1b)        esc          1.0)
     (((#x20 #x5f)) jis0201kana  1.0))
    ;; first byte of a two-byte character, or ESC to leave the mode
    (kanji
     ((#x1b)        esc          1.0)
     (((#x21 #x7e)) kanji-2      1.0))
    ;; second byte of a two-byte character
    (kanji-2
     (((#x21 #x7e)) kanji        1.0))
    )

) 

(defun ces-guess-from-vector (vector scheme)
  "Guess the character encoding of the bytes in VECTOR.

SCHEME selects the family of encodings to try.  :*JP and :JP both hand
VECTOR to GUESS-JP and return its result; any other value signals an
error."
  (if (member scheme '(:*JP :JP))
      (guess-jp vector)
      (error "scheme parameter: supported :*JP only")))

(defun guess-jp (buffer &aux (len (length buffer)))
  "Guess which Japanese encoding the bytes in BUFFER are written in.

BUFFER is a vector of integers; each element is used directly as an index
into 256-entry tables, so it is assumed to hold octets (0-255).  The
function is compiled with (SAFETY 0), so that assumption is not checked.

The EUC-JP, Shift_JIS and UTF-8 automata are run in parallel over BUFFER:

  * An ESC byte immediately followed by `$' or `(' makes the function
    return the ISO-2022-JP designator at once.
  * When only one automaton is still alive, its encoding is returned
    without reading the rest of BUFFER.
  * When every automaton has died, NIL is returned.
  * Otherwise, after the last byte, the surviving automaton with the
    highest score wins.  UTF-8 wins a tie with EUC-JP; Shift_JIS must be
    strictly better than the current best.

The returned designator depends on the Lisp implementation (see the
result macros below): a CHARSET: symbol on CLISP, a keyword elsewhere.
Where a result macro has no branch for the running implementation its
body is empty and the designator is NIL."
  (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
           (type fixnum len))
  (macrolet (;; A running automaton is the simple-vector
             ;; #(state-tables arc-table current-state score).
             (dfa-init (dfa-st dfa-ar)
               `(vector ,dfa-st ,dfa-ar 0 1.0d0))
             (score (dfa)  `(svref ,dfa 3))
             (state (dfa)  `(svref ,dfa 2))
             (arcs (dfa)   `(svref ,dfa 1))
             (states (dfa) `(svref ,dfa 0))
             ;; A negative current state marks a dead automaton.
             (dfa-alive (dfa) `(>= (the fixnum (state ,dfa)) 0))
             ;; Feed byte CH to DFA: follow the matching arc and fold its
             ;; score into the running score, or kill DFA when no arc
             ;; accepts CH.
             (dfa-next (dfa ch)
               `(when (dfa-alive ,dfa)
                  (let ((temp (svref (svref (states ,dfa) (state ,dfa)) ,ch)))
                    (declare (type fixnum temp))
                    (if (< temp 0)
                        (setf (state ,dfa) -1)
                        (let ((arc (svref (arcs ,dfa) temp)))
                          (setf (state ,dfa) (the fixnum (car arc))
                                ;; Arc scores may have been read as
                                ;; single-floats; convert rather than
                                ;; assert DOUBLE-FLOAT under SAFETY 0.
                                (score ,dfa) (* (the double-float (score ,dfa))
                                                (float (the real (cdr arc))
                                                       1.0d0))))))))
             ;; utility: return VALUE at once when DFA is the only automaton
             ;; left alive, otherwise advance DFA by CH.
             (process-dfa (dfa ch value &rest others)
               `(when (dfa-alive ,dfa)
                  (when (and ,@(mapcar (lambda (dfa) `(not (dfa-alive ,dfa))) others))
                    (return-from guess-body ,value))
                  (dfa-next ,dfa ,ch)))
             ;; result designators, chosen per implementation at read time
             (iso-2022-jp () #+clisp 'charset:iso-2022-jp
                             #+allegro :jis
                             #+lispworks :jis)
             (euc-jp ()      #+clisp 'charset:euc-jp
                             #-clisp :euc-jp)
             (shiftjis ()    #+clisp 'charset:shift-jis
                             #+sbcl :sjis
                             #+(or lispworks allegro) :shiftjis)
             (utf-8 ()       #+clisp 'charset:utf-8
                             #-clisp :utf-8))
    (block guess-body
      (let* ((eucj (dfa-init +eucj-st+ +eucj-ar+))
             (sjis (dfa-init +sjis-st+ +sjis-ar+))
             (utf8 (dfa-init +utf8-st+ +utf8-ar+))
             (top  nil))
        (declare (dynamic-extent eucj sjis utf8 top))
        (loop for c of-type fixnum across buffer
              for i of-type fixnum from 0 do
              ;; ISO-2022-JP shortcut.  Only look ahead when a next byte
              ;; exists: I is always below LEN here, so the guard has to
              ;; test I+1, otherwise an ESC in the last position would
              ;; read past the end of BUFFER.
              (when (and (= c #x1b)
                         (< (the fixnum (1+ i)) len))
                (let ((next (aref buffer (the fixnum (1+ i)))))
                  (declare (type fixnum next))
                  (when (or (= next #x24)   ; $
                            (= next #x28))  ; (
                    (return-from guess-body (iso-2022-jp)))))
              (process-dfa eucj c (euc-jp)    sjis utf8)
              (process-dfa sjis c (shiftjis)  eucj utf8)
              (process-dfa utf8 c (utf-8)     sjis eucj)
              ;; Nothing left to distinguish: stop scanning.
              (when (and (not (dfa-alive eucj))
                         (not (dfa-alive sjis))
                         (not (dfa-alive utf8)))
                (return nil)))
        ;; pick highest score
        (when (dfa-alive eucj)
          (setf top eucj))
        (when (dfa-alive utf8)
          (if top
              (when (<= (the double-float (score top))
                        (the double-float (score utf8)))
                (setf top utf8))
              (setf top utf8)))
        (when (dfa-alive sjis)
          (if top
              (when (< (the double-float (score top))
                       (the double-float (score sjis)))
                (setf top sjis))
              (setf top sjis)))
        (cond ((eq top eucj) (euc-jp))
              ((eq top utf8) (utf-8))
              ((eq top sjis) (shiftjis))
              (t             nil))))))
