M.Hiroi's Home Page

Common Lisp Programming

お気楽 Common Lisp プログラミング入門 : 自作ライブラリ編

サンプルプログラム

[ Common Lisp | library ]

有限文脈モデル

ハフマン符号 (2) で説明した「無記憶情報源モデル」はもっとも簡単な情報源モデルです。このモデルは、記号を生成するとき以前に生成した記号との間に関係がないため「無記憶」と呼ばれますが、このモデルを一般化して状態 (記憶) を持つモデルを考えることができます。

参考文献 [1] によると、記憶があるモデルを「有限状態確率モデル」とか「マルコフ情報源モデル」と呼ぶそうです。一般には、入力データから適切な情報源モデルを作成するのは大変難しいので、次のような単純なモデルを考えることにします。

これを「有限文脈モデル」といい、直前に出現した記号列の長さを「次数 (order)」といいます。有限文脈モデルは 1 次 (order-1) がいちばん簡単です。直前に出力した記号を覚えておいて、それに従って出現頻度表を切り替えるという単純な方法で実現できます。つまり、各記号ごとに出現頻度表を用意しておいて、直前に出力した記号が a であれば、a の出現頻度表を使って符号化を行うわけです。

したがって、記号が 256 種類あれば、出現頻度表も 256 個必要になります。order-2 であれば、ab や cd のあとに現れる記号の出現頻度表が必要になるので、個数は 256 * 256 = 65536 になります。このように、次数が大きくなるほど必要となるメモリ量が爆発的に増えるので、単純な方法では低次の有限文脈モデルしか実現できないのが欠点です。

有限文脈モデルの詳しい説明は、以下に示す拙作のページをお読みくださいませ。

●参考文献

  1. 植松友彦, 『文書データ圧縮アルゴリズム入門』, CQ出版社, 1994

●適応型レンジコーダの有限文脈モデル

;;;
;;; rca1.lisp : 適応型レンジコーダ (order-1)
;;;
;;; Copyright (C) 2023 Makoto Hiroi
;;;
;;; Released under the MIT license
;;; https://opensource.org/license/mit/
;;;
(require :arc)
(use-package '(:arc :bitree :rangecoder))

;;; ファイルサイズの書き込み
(defun write-file-size (out size)
  (write-byte (logand (ash size -24) #xff) out)
  (write-byte (logand (ash size -16) #xff) out)
  (write-byte (logand (ash size -8) #xff) out)
  (write-byte (logand size #xff) out))

;;; ファイルサイズの読み込み
(defun read-file-size (in)
  (+ (ash (read-byte in) 24)
     (ash (read-byte in) 16)
     (ash (read-byte in) 8)
     (read-byte in)))

;;; 出現頻度表の取得
(defun get-freq (freq c)
  (unless (aref freq c)
    (setf (aref freq c) (bitree-initialize 256)))
  (aref freq c))

;;; ファイルの符号化
(defun encode-file (in-file out-file)
  (call-with-byte-output-file
   out-file
   (lambda (out)
     (call-with-byte-input-file
      in-file
      (lambda (in)
        (let ((size (file-length in))
              (freq (make-array 256 :initial-element nil))
              (c0 0))
          (write-file-size out size)
          (when (plusp size)
            (call-with-range-encoder
             out
             (lambda (rc)
               (dotimes (x size)
                 (let ((c (read-byte in)))
                   (arc-encode rc (get-freq freq c0) c :limit #x4000 :inc 16)
                   (setf c0 c))))))))))))

;;; ファイルの復号
(defun decode-file (in-file out-file)
  (call-with-byte-input-file
   in-file
   (lambda (in)
     (let ((size (read-file-size in))
           (freq (make-array 256 :initial-element nil))
           (c0 0))
       (call-with-byte-output-file
        out-file
        (lambda (out)
          (when (plusp size)
            (call-with-range-decoder
             in
             (lambda (rc)
               (dotimes (x size)
                 (let ((c (arc-decode rc (get-freq freq c0) :limit #x4000 :inc 16)))
                   (write-byte c out)
                   (setf c0 c))))))))))))

;;; 簡単なテスト
(defun test ()
  (let ((files '("alice29.txt" "asyoulik.txt" "cp.html" "fields.c" "grammar.lsp"
                 "kennedy.xls" "lcet10.txt" "plrabn12.txt" "ptt5" "sum" "xargs.1")))
    (format t "----- encode -----~%")
    (time
     (dolist (file files)
       (encode-file (format nil "./canterbury/~a" file)
                    (format nil "~a.en" file))))
    ;;
    (format t "----- decode -----~%")
    (time
     (dolist (file files)
       (decode-file (format nil "~a.en" file)
                    (format nil "~a.de" file))))))
;;;
;;; rca2.lisp : 適応型レンジコーダ (order-2)
;;;
;;; Copyright (C) 2023 Makoto Hiroi
;;;
;;; Released under the MIT license
;;; https://opensource.org/license/mit/
;;;
(require :arc)
(use-package '(:arc :bitree :rangecoder))

;;; ファイルサイズの書き込み
(defun write-file-size (out size)
  (write-byte (logand (ash size -24) #xff) out)
  (write-byte (logand (ash size -16) #xff) out)
  (write-byte (logand (ash size -8) #xff) out)
  (write-byte (logand size #xff) out))

;;; ファイルサイズの読み込み
(defun read-file-size (in)
  (+ (ash (read-byte in) 24)
     (ash (read-byte in) 16)
     (ash (read-byte in) 8)
     (read-byte in)))

;;; 出現頻度表の取得
(defun get-freq (freq c0 c1)
  (unless (aref freq c0 c1)
    (setf (aref freq c0 c1) (bitree-initialize 256)))
  (aref freq c0 c1))

;;; ファイルの符号化
(defun encode-file (in-file out-file)
  (call-with-byte-output-file
   out-file
   (lambda (out)
     (call-with-byte-input-file
      in-file
      (lambda (in)
        (let ((size (file-length in))
              (freq (make-array '(256 256) :initial-element nil))
              (c0 0)
              (c1 0))
          (write-file-size out size)
          (when (plusp size)
            (call-with-range-encoder
             out
             (lambda (rc)
               (dotimes (x size)
                 (let ((c (read-byte in)))
                   (arc-encode rc (get-freq freq c0 c1) c :limit #x4000 :inc 64)
                   (setf c0 c1
                         c1 c))))))))))))

;;; ファイルの復号
(defun decode-file (in-file out-file)
  (call-with-byte-input-file
   in-file
   (lambda (in)
     (let ((size (read-file-size in))
           (freq (make-array '(256 256) :initial-element nil))
           (c0 0)
           (c1 0))
       (call-with-byte-output-file
        out-file
        (lambda (out)
          (when (plusp size)
            (call-with-range-decoder
             in
             (lambda (rc)
               (dotimes (x size)
                 (let ((c (arc-decode rc (get-freq freq c0 c1) :limit #x4000 :inc 64)))
                   (write-byte c out)
                   (setf c0 c1
                         c1 c))))))))))))

;;; 簡単なテスト
(defun test ()
  (let ((files '("alice29.txt" "asyoulik.txt" "cp.html" "fields.c" "grammar.lsp"
                 "kennedy.xls" "lcet10.txt" "plrabn12.txt" "ptt5" "sum" "xargs.1")))
    (format t "----- encode -----~%")
    (time
     (dolist (file files)
       (encode-file (format nil "./canterbury/~a" file)
                    (format nil "~a.en" file))))
    ;;
    (format t "----- decode -----~%")
    (time
     (dolist (file files)
       (decode-file (format nil "~a.en" file)
                    (format nil "~a.de" file))))))

●バイナリレンジコーダの有限文脈モデル

;;;
;;; rcb1.lisp : バイナリレンジコーダ (order-1)
;;;
;;; Copyright (C) 2023 Makoto Hiroi
;;;
(require :brc)
(use-package '(:brc :rangecoder))

;;; ファイルサイズの書き込み
(defun write-file-size (out size)
  (write-byte (logand (ash size -24) #xff) out)
  (write-byte (logand (ash size -16) #xff) out)
  (write-byte (logand (ash size -8) #xff) out)
  (write-byte (logand size #xff) out))

;;; ファイルサイズの読み込み
(defun read-file-size (in)
  (+ (ash (read-byte in) 24)
     (ash (read-byte in) 16)
     (ash (read-byte in) 8)
     (read-byte in)))

;;; 出現頻度表の取得
(defun get-freq (freq c)
  (unless (aref freq c)
    (setf (aref freq c) (initialize-binary-model 256)))
  (aref freq c))

;;; ファイルの符号化
(defun encode-file (in-file out-file)
  (call-with-byte-output-file
   out-file
   (lambda (out)
     (call-with-byte-input-file
      in-file
      (lambda (in)
        (let ((size (file-length in))
              (freq (make-array 256 :initial-element nil))
              (c0 0))
          (write-file-size out size)
          (when (plusp size)
            (call-with-range-encoder
             out
             (lambda (rc)
               (dotimes (x size)
                 (let ((c (read-byte in)))
                   (bm-encode rc (get-freq freq c0) c :limit #x200 :inc 4)
                   (setf c0 c))))))))))))

;;; ファイルの復号
(defun decode-file (in-file out-file)
  (call-with-byte-input-file
   in-file
   (lambda (in)
     (let ((size (read-file-size in))
           (freq (make-array 256 :initial-element nil))
           (c0 0))
       (call-with-byte-output-file
        out-file
        (lambda (out)
          (when (plusp size)
            (call-with-range-decoder
             in
             (lambda (rc)
               (dotimes (x size)
                 (let ((c (bm-decode rc (get-freq freq c0) :limit #x200 :inc 4)))
                   (write-byte c out)
                   (setf c0 c))))))))))))

;;; 簡単なテスト
(defun test ()
  (let ((files '("alice29.txt" "asyoulik.txt" "cp.html" "fields.c" "grammar.lsp"
                 "kennedy.xls" "lcet10.txt" "plrabn12.txt" "ptt5" "sum" "xargs.1")))
    (format t "----- encode -----~%")
    (time
     (dolist (file files)
       (encode-file (format nil "./canterbury/~a" file)
                    (format nil "~a.en" file))))
    ;;
    (format t "----- decode -----~%")
    (time
     (dolist (file files)
       (decode-file (format nil "~a.en" file)
                    (format nil "~a.de" file))))))
;;;
;;; rcb2.lisp : バイナリレンジコーダ (order-2)
;;;
;;; Copyright (C) 2023 Makoto Hiroi
;;;
;;; Released under the MIT license
;;; https://opensource.org/license/mit/
;;;
(require :brc)
(use-package '(:brc :rangecoder))

;;; ファイルサイズの書き込み
(defun write-file-size (out size)
  (write-byte (logand (ash size -24) #xff) out)
  (write-byte (logand (ash size -16) #xff) out)
  (write-byte (logand (ash size -8) #xff) out)
  (write-byte (logand size #xff) out))

;;; ファイルサイズの読み込み
(defun read-file-size (in)
  (+ (ash (read-byte in) 24)
     (ash (read-byte in) 16)
     (ash (read-byte in) 8)
     (read-byte in)))

;;; 出現頻度表の取得
(defun get-freq (freq c0 c1)
  (unless (aref freq c0 c1)
    (setf (aref freq c0 c1) (initialize-binary-model 256)))
  (aref freq c0 c1))

;;; ファイルの符号化
(defun encode-file (in-file out-file)
  (call-with-byte-output-file
   out-file
   (lambda (out)
     (call-with-byte-input-file
      in-file
      (lambda (in)
        (let ((size (file-length in))
              (freq (make-array '(256 256) :initial-element nil))
              (c0 0)
              (c1 0))
          (write-file-size out size)
          (when (plusp size)
            (call-with-range-encoder
             out
             (lambda (rc)
               (dotimes (x size)
                 (let ((c (read-byte in)))
                   (bm-encode rc (get-freq freq c0 c1) c :limit #x200 :inc 4)
                   (setf c0 c1
                         c1 c))))))))))))

;;; ファイルの復号
(defun decode-file (in-file out-file)
  (call-with-byte-input-file
   in-file
   (lambda (in)
     (let ((size (read-file-size in))
           (freq (make-array '(256 256) :initial-element nil))
           (c0 0)
           (c1 0))
       (call-with-byte-output-file
        out-file
        (lambda (out)
          (when (plusp size)
            (call-with-range-decoder
             in
             (lambda (rc)
               (dotimes (x size)
                 (let ((c (bm-decode rc (get-freq freq c0 c1) :limit #x200 :inc 4)))
                   (write-byte c out)
                   (setf c0 c1
                         c1 c))))))))))))

;;; 簡単なテスト
(defun test ()
  (let ((files '("alice29.txt" "asyoulik.txt" "cp.html" "fields.c" "grammar.lsp"
                 "kennedy.xls" "lcet10.txt" "plrabn12.txt" "ptt5" "sum" "xargs.1")))
    (format t "----- encode -----~%")
    (time
     (dolist (file files)
       (encode-file (format nil "./canterbury/~a" file)
                    (format nil "~a.en" file))))
    ;;
    (format t "----- decode -----~%")
    (time
     (dolist (file files)
       (decode-file (format nil "~a.en" file)
                    (format nil "~a.de" file))))))

●LZRC 符号の有限文脈モデル

;;;
;;; lzrc1.py : LZSS + Binary Range Coder (order-1)
;;;
;;; Copyright (C) 2023 Makoto Hiroi
;;;
;;; Released under the MIT license
;;; https://opensource.org/license/mit/
;;;
(require :brc)
(use-package '(:brc :rangecoder))

;;; 定数
(defconstant min-len 6)
(defconstant max-len 256)
(defconstant pos-bits 16)                   ; 13 (8k), 15 (32k), 16 (64k)
(defconstant slide-size (ash 1 pos-bits))
(defconstant slide-size2 (* slide-size 2))
(defconstant slide-limit (+ slide-size2 max-len))

;;; ファイルサイズの書き込み
(defun write-file-size (out size)
  (write-byte (logand (ash size -24) #xff) out)
  (write-byte (logand (ash size -16) #xff) out)
  (write-byte (logand (ash size -8) #xff) out)
  (write-byte (logand size #xff) out))

;;; ファイルサイズの読み込み
(defun read-file-size (in)
  (+ (ash (read-byte in) 24)
     (ash (read-byte in) 16)
     (ash (read-byte in) 8)
     (read-byte in)))

;;; スライド窓
(defstruct slide-window stream buff ht data-size)

(defun initialize-slide-window (s)
  (let* ((buff (make-array slide-limit :element-type 'unsigned-byte))
         (data-size (read-sequence buff s)))
    (make-slide-window
     :stream s
     :buff buff
     :ht (make-hash-table)
     :data-size data-size)))

;;; ハッシュ値を求める
(defun hash-value (sw rp)
  (do ((buff (slide-window-buff sw))
       (value 0)
       (i 0 (1+ i)))
      ((>= i min-len) value)
      (setf value
            (+ (* value 256) (aref buff (+ rp i))))))

;;; データ入力
(defun read-data (sw start size)
  (read-sequence
   (slide-window-buff sw)
   (slide-window-stream sw)
   :start start
   :end (+ start size)))

;;; データ移動
(defun move-data (sw to from size)
  (let ((buff (slide-window-buff sw)))
    (dotimes (n size)
      (setf (aref buff (+ to n)) (aref buff (+ from n))))))

;;; スライド窓の更新
(defun update-slide (sw rp)
  (cond
   ((< (slide-window-data-size sw) slide-limit)
    ;; EOF
    rp)
   (t
    ;; buffer update
    (move-data sw 0 slide-size (+ slide-size max-len))
    (setf (slide-window-data-size sw)
          (read-data sw (+ slide-size max-len) slide-size))
    ;; hash update
    (maphash
     (lambda (k v)
       (if (< (car v) slide-size)
           (remhash k (slide-window-ht sw))
         (do ((xs v (cdr xs)))
             ((null xs))
             (decf (car xs) slide-size)
             (when (and (consp (cdr xs))
                        (< (cadr xs) slide-size))
               (setf (cdr xs) nil)))))
     (slide-window-ht sw))
    (- rp slide-size))))

;;; データの挿入
(defun insert-data (sw rp)
  (let ((value (hash-value sw rp))
        (ht (slide-window-ht sw)))
    (if (gethash value ht)
        (push rp (gethash value ht))
      (setf (gethash value ht) (list rp)))))

;;; 最長一致列の探索
(defun longest-match (buff s1 s2 &key (start 0) (limit max-len))
  (do ((i start (1+ i)))
      ((>= i limit) i)
      (unless (= (aref buff (+ s1 i)) (aref buff (+ s2 i)))
        (return i))))

(defun search-data (sw rp)
  (let ((buff (slide-window-buff sw))
        (value (hash-value sw rp))
        (low-limit (- rp slide-size))
        (match-len 0)
        (match-pos 0))
    (do ((xs (gethash value (slide-window-ht sw)) (cdr xs)))
        ((or (null xs)
             (< (car xs) low-limit)))
        (when (and (< (+ rp (max min-len match-len)) (slide-window-data-size sw))
                   (= (aref buff (+ rp match-len))
                      (aref buff (+ (car xs) match-len)))
          (let ((n (longest-match buff rp (car xs) :start 3)))
            (when (< match-len n)
              (setf match-len n
                    match-pos (car xs))
              (when (= n max-len)
                (return)))))))
    ;; データの終端をチェック
    (when (>= match-len (- (slide-window-data-size sw) rp))
      (setf match-len (- (slide-window-data-size sw) rp)))
    (values match-len match-pos)))

;;; 出現頻度表の取得
(defun get-freq-flag (freq f0)
  (unless (aref freq f0)
    (setf (aref freq f0) (initialize-alpha-model 2)))
  (aref freq f0))

(defun get-freq-code (freq c0)
  (unless (aref freq c0)
    (setf (aref freq c0) (initialize-binary-model 256)))
  (aref freq c0))

;;; LZRC 符号 : 符号化
(defun encode (rc fin)
  (do ((freq-flag (make-array 2 :initial-element nil))
       (freq-code (make-array 256 :initial-element nil))
       (freq-len  (initialize-gamma-model (1+ (- max-len min-len))))
       (freq-pos  (initialize-gamma-model slide-size))
       (sw (initialize-slide-window fin))
       (rp 0)
       (c0 0)
       (f0 0))
      ((>= rp (slide-window-data-size sw)))
      (multiple-value-bind
       (match-len match-pos)
       (search-data sw rp)
       (cond
        ((< match-len min-len)
         (setf match-len 1)
         (alpha-encode rc (get-freq-flag freq-flag f0) 0 :limit #x200 :inc 4)
         (setf f0 0)
         (bm-encode rc (get-freq-code freq-code c0) (aref (slide-window-buff sw) rp) :limit #x200 :inc 4))
        (t
         (alpha-encode rc (get-freq-flag freq-flag f0) 1 :limit #x200 :inc 4)
         (setf f0 1)
         (gamma-encode rc freq-len (- match-len min-len) :limit #x200 :inc 4)
         (gamma-encode rc freq-pos (- rp match-pos 1) :limit #x200 :inc 4)))
       ;;
       (do ((n match-len (1- n)))
           ((zerop n))
           (setf c0 (aref (slide-window-buff sw) rp))
           (insert-data sw rp)
           (incf rp)
           (when (>= rp slide-size2)
             (setf rp (update-slide sw rp)))))))

;;; ファイルの符号化
(defun encode-file (in-file out-file)
  (call-with-byte-output-file
   out-file
   (lambda (fout)
     (call-with-byte-input-file
      in-file
      (lambda (fin)
        (let ((size (file-length fin)))
          (write-file-size fout size)
          (when (plusp size)
            (call-with-range-encoder
             fout
             (lambda (rc)
               (encode rc fin))))))))))

;;; LZSS 符号 : 復号
(defun decode (rc fout size)
  (do ((freq-flag (make-array 2 :initial-element nil))
       (freq-code (make-array 256 :initial-element nil))
       (freq-len  (initialize-gamma-model (1+ (- max-len min-len))))
       (freq-pos  (initialize-gamma-model slide-size))
       (buff (make-array (ash 1 pos-bits) :element-type 'unsigned-byte))
       (rp 0)
       (c0 0)
       (f0 0))
      ((zerop size))
      (cond
       ((= (alpha-decode rc (get-freq-flag freq-flag f0) :limit #x200 :inc 4) 1)
        (setf f0 1)
        (let ((num (+ (gamma-decode rc freq-len :limit #x200 :inc 4) min-len))
              (pos (- rp (1+ (gamma-decode rc freq-pos :limit #x200 :inc 4)))))
          (when (minusp pos)
            (incf pos (length buff)))
          (do ((num num (1- num)))
              ((zerop num))
              (let ((c (aref buff pos)))
                (write-byte c fout)
                (setf (aref buff rp) c
                      c0 c)
                (incf pos)
                (incf rp)
                (when (>= pos (length buff))
                  (setf pos 0))
                (when (>= rp (length buff))
                  (setf rp 0))))
          (decf size num)))
       (t
        (setf f0 0)
        (let ((c (bm-decode rc (get-freq-code freq-code c0) :limit #x200 :inc 4)))
          (write-byte c fout)
          (setf (aref buff rp) c
                c0 c)
          (incf rp)
          (when (>= rp (length buff))
            (setf rp 0)))
        (decf size)))))

;;; ファイルの復号
(defun decode-file (in-file out-file)
  (call-with-byte-input-file
   in-file
   (lambda (fin)
     (let ((size (read-file-size fin)))
       (call-with-byte-output-file
        out-file
        (lambda (fout)
          (when (plusp size)
            (call-with-range-decoder
             fin
             (lambda (rc)
               (decode rc fout size))))))))))

;;; 簡単なテスト
(defun test ()
  (let ((files '("alice29.txt" "asyoulik.txt" "cp.html" "fields.c" "grammar.lsp"
                 "kennedy.xls" "lcet10.txt" "plrabn12.txt" "ptt5" "sum" "xargs.1")))
    (format t "----- encode -----~%")
    (time
     (dolist (file files)
       (encode-file (format nil "./canterbury/~a" file)
                    (format nil "~a.en" file))))
    ;;
    (format t "----- decode -----~%")
    (time
     (dolist (file files)
       (decode-file (format nil "~a.en" file)
                    (format nil "~a.de" file))))))
;;;
;;; lzrc2.py : LZSS + Binary Range Coder (order-2)
;;;
;;; Copyright (C) 2023 Makoto Hiroi
;;;
;;; Released under the MIT license
;;; https://opensource.org/license/mit/
;;;
(require :brc)
(use-package '(:brc :rangecoder))

;;; 定数
(defconstant min-len 9)
(defconstant max-len 256)
(defconstant pos-bits 16)                   ; 13 (8k), 15 (32k), 16 (64k)
(defconstant slide-size (ash 1 pos-bits))
(defconstant slide-size2 (* slide-size 2))
(defconstant slide-limit (+ slide-size2 max-len))

;;; ファイルサイズの書き込み
(defun write-file-size (out size)
  (write-byte (logand (ash size -24) #xff) out)
  (write-byte (logand (ash size -16) #xff) out)
  (write-byte (logand (ash size -8) #xff) out)
  (write-byte (logand size #xff) out))

;;; ファイルサイズの読み込み
(defun read-file-size (in)
  (+ (ash (read-byte in) 24)
     (ash (read-byte in) 16)
     (ash (read-byte in) 8)
     (read-byte in)))

;;; スライド窓
(defstruct slide-window stream buff ht data-size)

(defun initialize-slide-window (s)
  (let* ((buff (make-array slide-limit :element-type 'unsigned-byte))
         (data-size (read-sequence buff s)))
    (make-slide-window
     :stream s
     :buff buff
     :ht (make-hash-table)
     :data-size data-size)))

;;; ハッシュ値を求める
(defun hash-value (sw rp)
  (do ((buff (slide-window-buff sw))
       (value 0)
       (i 0 (1+ i)))
      ((>= i min-len) value)
      (setf value
            (+ (* value 256) (aref buff (+ rp i))))))

;;; データ入力
(defun read-data (sw start size)
  (read-sequence
   (slide-window-buff sw)
   (slide-window-stream sw)
   :start start
   :end (+ start size)))

;;; データ移動
(defun move-data (sw to from size)
  (let ((buff (slide-window-buff sw)))
    (dotimes (n size)
      (setf (aref buff (+ to n)) (aref buff (+ from n))))))

;;; スライド窓の更新
(defun update-slide (sw rp)
  (cond
   ((< (slide-window-data-size sw) slide-limit)
    ;; EOF
    rp)
   (t
    ;; buffer update
    (move-data sw 0 slide-size (+ slide-size max-len))
    (setf (slide-window-data-size sw)
          (read-data sw (+ slide-size max-len) slide-size))
    ;; hash update
    (maphash
     (lambda (k v)
       (if (< (car v) slide-size)
           (remhash k (slide-window-ht sw))
         (do ((xs v (cdr xs)))
             ((null xs))
             (decf (car xs) slide-size)
             (when (and (consp (cdr xs))
                        (< (cadr xs) slide-size))
               (setf (cdr xs) nil)))))
     (slide-window-ht sw))
    (- rp slide-size))))

;;; データの挿入
(defun insert-data (sw rp)
  (let ((value (hash-value sw rp))
        (ht (slide-window-ht sw)))
    (if (gethash value ht)
        (push rp (gethash value ht))
      (setf (gethash value ht) (list rp)))))

;;; 最長一致列の探索
(defun longest-match (buff s1 s2 &key (start 0) (limit max-len))
  (do ((i start (1+ i)))
      ((>= i limit) i)
      (unless (= (aref buff (+ s1 i)) (aref buff (+ s2 i)))
        (return i))))

(defun search-data (sw rp)
  (let ((buff (slide-window-buff sw))
        (value (hash-value sw rp))
        (low-limit (- rp slide-size))
        (match-len 0)
        (match-pos 0))
    (do ((xs (gethash value (slide-window-ht sw)) (cdr xs)))
        ((or (null xs)
             (< (car xs) low-limit)))
        (when (and (< (+ rp (max min-len match-len)) (slide-window-data-size sw))
                   (= (aref buff (+ rp match-len))
                      (aref buff (+ (car xs) match-len)))
          (let ((n (longest-match buff rp (car xs) :start 3)))
            (when (< match-len n)
              (setf match-len n
                    match-pos (car xs))
              (when (= n max-len)
                (return)))))))
    ;; データの終端をチェック
    (when (>= match-len (- (slide-window-data-size sw) rp))
      (setf match-len (- (slide-window-data-size sw) rp)))
    (values match-len match-pos)))

;;; 出現頻度表の取得
(defun get-freq-flag (freq f0 f1)
  (unless (aref freq f0 f1)
    (setf (aref freq f0 f1) (initialize-alpha-model 2)))
  (aref freq f0 f1))

(defun get-freq-code (freq c0 c1)
  (unless (aref freq c0 c1)
    (setf (aref freq c0 c1) (initialize-binary-model 256)))
  (aref freq c0 c1))

;;; LZRC 符号 : 符号化
(defun encode (rc fin)
  (do ((freq-flag (make-array '(2 2) :initial-element nil))
       (freq-code (make-array '(256 256) :initial-element nil))
       (freq-len  (initialize-gamma-model (1+ (- max-len min-len))))
       (freq-pos  (initialize-gamma-model slide-size))
       (sw (initialize-slide-window fin))
       (rp 0)
       (c0 0) (c1 0)
       (f0 0) (f1 0))
      ((>= rp (slide-window-data-size sw)))
      (multiple-value-bind
       (match-len match-pos)
       (search-data sw rp)
       (cond
        ((< match-len min-len)
         (setf match-len 1)
         (alpha-encode rc (get-freq-flag freq-flag f0 f1) 0 :limit #x200 :inc 4)
         (setf f0 f1 f1 0)
         (bm-encode rc (get-freq-code freq-code c0 c1) (aref (slide-window-buff sw) rp) :limit #x200 :inc 4))
        (t
         (alpha-encode rc (get-freq-flag freq-flag f0 f1) 1 :limit #x200 :inc 4)
         (setf f0 f1 f1 1)
         (gamma-encode rc freq-len (- match-len min-len) :limit #x200 :inc 4)
         (gamma-encode rc freq-pos (- rp match-pos 1) :limit #x200 :inc 4)))
       ;;
       (do ((n match-len (1- n)))
           ((zerop n))
           (setf c0 c1
                 c1 (aref (slide-window-buff sw) rp))
           (insert-data sw rp)
           (incf rp)
           (when (>= rp slide-size2)
             (setf rp (update-slide sw rp)))))))

;;; ファイルの符号化
(defun encode-file (in-file out-file)
  (call-with-byte-output-file
   out-file
   (lambda (fout)
     (call-with-byte-input-file
      in-file
      (lambda (fin)
        (let ((size (file-length fin)))
          (write-file-size fout size)
          (when (plusp size)
            (call-with-range-encoder
             fout
             (lambda (rc)
               (encode rc fin))))))))))

;;; LZSS 符号 : 復号
(defun decode (rc fout size)
  (do ((freq-flag (make-array '(2 2) :initial-element nil))
       (freq-code (make-array '(256 256) :initial-element nil))
       (freq-len  (initialize-gamma-model (1+ (- max-len min-len))))
       (freq-pos  (initialize-gamma-model slide-size))
       (buff (make-array (ash 1 pos-bits) :element-type 'unsigned-byte))
       (rp 0)
       (c0 0) (c1 0)
       (f0 0) (f1 0))
      ((zerop size))
      (cond
       ((= (alpha-decode rc (get-freq-flag freq-flag f0 f1) :limit #x200 :inc 4) 1)
        (setf f0 f1 f1 1)
        (let ((num (+ (gamma-decode rc freq-len :limit #x200 :inc 4) min-len))
              (pos (- rp (1+ (gamma-decode rc freq-pos :limit #x200 :inc 4)))))
          (when (minusp pos)
            (incf pos (length buff)))
          (do ((num num (1- num)))
              ((zerop num))
              (let ((c (aref buff pos)))
                (write-byte c fout)
                (setf (aref buff rp) c
                      c0 c1
                      c1 c)
                (incf pos)
                (incf rp)
                (when (>= pos (length buff))
                  (setf pos 0))
                (when (>= rp (length buff))
                  (setf rp 0))))
          (decf size num)))
       (t
        (setf f0 f1 f1 0)
        (let ((c (bm-decode rc (get-freq-code freq-code c0 c1) :limit #x200 :inc 4)))
          (write-byte c fout)
          (setf (aref buff rp) c
                c0 c1
                c1 c)
          (incf rp)
          (when (>= rp (length buff))
            (setf rp 0)))
        (decf size)))))

;;; ファイルの復号
(defun decode-file (in-file out-file)
  (call-with-byte-input-file
   in-file
   (lambda (fin)
     (let ((size (read-file-size fin)))
       (call-with-byte-output-file
        out-file
        (lambda (fout)
          (when (plusp size)
            (call-with-range-decoder
             fin
             (lambda (rc)
               (decode rc fout size))))))))))

;;; 簡単なテスト
(defun test ()
  (let ((files '("alice29.txt" "asyoulik.txt" "cp.html" "fields.c" "grammar.lsp"
                 "kennedy.xls" "lcet10.txt" "plrabn12.txt" "ptt5" "sum" "xargs.1")))
    (format t "----- encode -----~%")
    (time
     (dolist (file files)
       (encode-file (format nil "./canterbury/~a" file)
                    (format nil "~a.en" file))))
    ;;
    (format t "----- decode -----~%")
    (time
     (dolist (file files)
       (decode-file (format nil "~a.en" file)
                    (format nil "~a.de" file))))))

Copyright (C) 2023 Makoto Hiroi
All rights reserved.

[ Common Lisp | library ]