LZ 符号は Jacob Ziv 氏と Abraham Lempel 氏によって開発された「適応型辞書法」によるデータ圧縮アルゴリズムです。LZ 符号には多数のバリエーションが存在しますが、「LZ77 符号」と「LZ78 符号」の 2 つに大別されます。LZ77 符号は 1977 年に、LZ78 符号は 1978 年に発表されました。
両者の符号は互いに関係があるものの、辞書の作成方法はまったく異なっているので、混同しないように注意してください。LZ77 符号は「スライド辞書法」、 LZ78 符号は「動的辞書法」と呼ばれています。LZ78 符号の詳しい説明は以下に示す拙作のページをお読みください。
LZ77 符号と同様に、LZ78 符号にも多数のバリエーションが存在します。その中で広く用いられている符号が 1984 年 T. Welch 氏によって開発された「LZW 符号」です。LZW 符号ではスライド窓の使用をやめて、これまでに出現した記号列を辞書に登録することで大域的な辞書を作成します。
なお、今回のプログラムでは辞書に「トライ (trie)」を使っていますが、ハッシュ表や TST (Ternary Search Tree) を使うと、実行時間はもう少し速くなります。
;;; ;;; lzw.lisp : LZW 符号 ;;; ;;; Copyright (c) 2023 Makoto Hiroi ;;; ;;; Released under the MIT license ;;; https://opensource.org/license/mit/ ;;; (require :bitio) (require :utils) (use-package '(:bitio :utils)) ;;; 定数 (defconstant dic-bits 13) (defconstant dic-size (ash 1 dic-bits)) ;;; 節 (defstruct node sym code (bros nil) (child nil)) ;;; 子を探す (defun search-child (nd c) (do ((xs (node-child nd) (node-bros xs))) ((null xs)) (when (= (node-sym xs) c) (return xs)))) ;;; 子を挿入する (defun insert-child (nd c num) (setf (node-child nd) (make-node :sym c :code num :bros (node-child nd)))) ;;; LZW 符号の符号化 (defun encode (fin fout) (let ((buff (vector-tabulate (lambda (x) (make-node :sym x :code x)) 256)) (num 256) (p nil) (q nil)) (setf p (aref buff (read-byte fin nil))) (loop (let ((c (read-byte fin nil))) (unless c (putbits fout dic-bits (node-code p)) (return)) (setf q (search-child p c)) (cond ((null q) (putbits fout dic-bits (node-code p)) (when (< num dic-size) (insert-child p c num) (incf num)) (setf p (aref buff c))) (t (setf p q))))))) ;;; ファイルの符号化 (defun encode-file (in-file out-file) (call-with-bit-output-file out-file (lambda (fout) (call-with-byte-input-file in-file (lambda (fin) (let ((size (file-length fin))) (putbits fout 32 size) (when (plusp size) (encode fin fout)))))))) ;;; 記号列の出力 (defun output (buff n fout) (cond ((null (cdr (aref buff n))) (write-byte n fout) (values n 1)) (t (multiple-value-bind (m i) (output buff (cdr (aref buff n)) fout) (write-byte (car (aref buff n)) fout) (values m (1+ i)))))) ;;; LZW 符号の復号 (defun decode (fin fout size) (let ((buff (make-array dic-size :initial-element nil)) (num 256) (p nil) (q nil) (c nil) (i nil)) ;; 初期化 (dotimes (x 256) (setf (aref buff x) (cons x nil))) ;; 最初の記号を読み込む (setf p (getbits fin dic-bits)) (multiple-value-setq (c i) (output buff p fout)) (decf size i) (do () ((zerop size)) (setf q (getbits fin dic-bits)) (cond ((< q num) (multiple-value-setq (c i) (output buff q fout)) (when (< num dic-size) (setf (aref buff num) (cons c p)) (incf num))) (t (setf (aref buff num) (cons c p)) (incf num) (multiple-value-setq (c i) (output buff q fout)))) (setf p q) (decf size i)))) ;;; ファイルの復号 (defun decode-file (in-file out-file) (call-with-bit-input-file in-file (lambda (fin) (let ((size (getbits fin 32))) (call-with-byte-output-file out-file (lambda (fout) (when (plusp size) (decode fin fout size)))))))) ;;; 簡単なテスト (defun test () (let ((files '("alice29.txt" "asyoulik.txt" "cp.html" "fields.c" "grammar.lsp" "kennedy.xls" "lcet10.txt" "plrabn12.txt" "ptt5" "sum" "xargs.1"))) (format t "----- encode -----~%") (time (dolist (file files) (encode-file (format nil "./canterbury/~a" file) (format nil "~a.en" file)))) ;; (format t "----- decode -----~%") (time (dolist (file files) (decode-file (format nil "~a.en" file) (format nil "~a.de" file))))))
* (load "lzw.lisp") T * (test) ----- encode ----- Evaluation took: 0.469 seconds of real time 0.466224 seconds of total run time (0.406800 user, 0.059424 system) 99.36% CPU 1,119,039,336 processor cycles 3,551,184 bytes consed ----- decode ----- Evaluation took: 0.349 seconds of real time 0.347480 seconds of total run time (0.337439 user, 0.010041 system) 99.43% CPU 833,957,086 processor cycles 1,917,088 bytes consed NIL
表 : LZW 符号の評価結果 LZW (辞書サイズ) LZSS ファイル名 サイズ 8192 32768 65536 8192 ------------------------------------------------------------ alice29.txt 152,089 68,448 65,841 70,152 68,332 asyoulik.txt 125,179 59,085 58,831 62,752 61,789 cp.html 24,603 12,150 14,018 14,952 10,278 fields.c 11,150 5,760 6,646 7,088 3,859 grammar.lsp 3,721 2,294 2,646 2,822 1,594 kennedy.xls 1,029,744 339,542 342,728 351,610 291,968 lcet10.txt 426,754 194,996 173,209 171,612 184,684 plrabn12.txt 481,861 220,850 205,236 204,868 247,780 ptt5 513,216 66,101 65,856 70,120 107,289 sum 38,240 30,163 23,493 25,058 17,500 xargs.1 4,227 2,916 3,364 3,588 2,198 ------------------------------------------------------------ 合計 2,810,784 1,002,305 961,868 984,622 997,271 表 : 実行時間 (秒) (辞書サイズ 8192, スライド窓 8192) : LZSS : LZW -------+------+------ 符号化 : 1.66 : 0.47 復号 : 0.30 : 0.35 実行環境 : SBCL 2.1.11, Ubunts 22.04 (WSL2), Intel Core i5-6200U 2.30GHz
;;; ;;; lzw1.lisp : LZW 符号 + CBT 符号 ;;; ;;; Copyright (c) 2023 Makoto Hiroi ;;; ;;; Released under the MIT license ;;; https://opensource.org/license/mit/ ;;; (require :bitio) (require :utils) (use-package '(:bitio :utils)) ;;; 定数 (defconstant dic-bits 13) (defconstant dic-size (ash 1 dic-bits)) ;;; 節 (defstruct node sym code (bros nil) (child nil)) ;;; 子を探す (defun search-child (nd c) (do ((xs (node-child nd) (node-bros xs))) ((null xs)) (when (= (node-sym xs) c) (return xs)))) ;;; 子を挿入する (defun insert-child (nd c num) (setf (node-child nd) (make-node :sym c :code num :bros (node-child nd)))) ;;; 符号語長 (defvar *code-count* nil) (defvar *dic-bits* nil) ;;; 初期化 (defun init-dic-bits () (setf *dic-bits* 9 *code-count* 256)) ;;; 更新 (defun update-dic-bits () (incf *code-count*) (when (> *code-count* (1- (ash 1 *dic-bits*))) (incf *dic-bits*))) ;;; 辞書番号の符号化 (defun encode1 (fout n) (cond ((< *dic-bits* dic-bits) (cbt-encode fout n *code-count* *dic-bits*) (update-dic-bits)) (t (putbits fout dic-bits n)))) ;;; 辞書番号の復号 (defun decode1 (fin) (if (< *dic-bits* dic-bits) (prog1 (cbt-decode fin *code-count* *dic-bits*) (update-dic-bits)) (getbits fin dic-bits))) ;;; LZW 符号の符号化 (defun encode (fin fout) (let ((buff (vector-tabulate (lambda (x) (make-node :sym x :code x)) 256)) (num 256) (p nil) (q nil)) (init-dic-bits) (setf p (aref buff (read-byte fin nil))) (loop (let ((c (read-byte fin nil))) (unless c (encode1 fout (node-code p)) (return)) (setf q (search-child p c)) (cond ((null q) (encode1 fout (node-code p)) (when (< num dic-size) (insert-child p c num) (incf num)) (setf p (aref buff c))) (t (setf p q))))))) ;;; ファイルの符号化 (defun encode-file (in-file out-file) (call-with-bit-output-file out-file (lambda (fout) (call-with-byte-input-file in-file (lambda (fin) (let ((size (file-length fin))) (putbits fout 32 size) (when (plusp size) (encode fin fout)))))))) ;;; 記号列の出力 (defun output (buff n fout) (cond ((null (cdr (aref buff n))) (write-byte n fout) (values n 1)) (t (multiple-value-bind (m i) (output buff (cdr (aref buff n)) fout) (write-byte (car (aref buff n)) fout) (values m (1+ i)))))) ;;; LZW 符号の復号 (defun decode (fin fout size) (let ((buff (make-array dic-size :initial-element nil)) (num 256) (p nil) (q nil) (c nil) (i nil)) ;; 初期化 (dotimes (x 256) (setf (aref buff x) (cons x nil))) (init-dic-bits) ;; 最初の記号を読み込む (setf p (decode1 fin)) (multiple-value-setq (c i) (output buff p fout)) (decf size i) (do () ((zerop size)) (setf q (decode1 fin)) (cond ((< q num) (multiple-value-setq (c i) (output buff q fout)) (when (< num dic-size) (setf (aref buff num) (cons c p)) (incf num))) (t (setf (aref buff num) (cons c p)) (incf num) (multiple-value-setq (c i) (output buff q fout)))) (setf p q) (decf size i)))) ;;; ファイルの復号 (defun decode-file (in-file out-file) (call-with-bit-input-file in-file (lambda (fin) (let ((size (getbits fin 32))) (call-with-byte-output-file out-file (lambda (fout) (when (plusp size) (decode fin fout size)))))))) ;;; 簡単なテスト (defun test () (let ((files '("alice29.txt" "asyoulik.txt" "cp.html" "fields.c" "grammar.lsp" "kennedy.xls" "lcet10.txt" "plrabn12.txt" "ptt5" "sum" "xargs.1"))) (format t "----- encode -----~%") (time (dolist (file files) (encode-file (format nil "./canterbury/~a" file) (format nil "~a.en" file)))) ;; (format t "----- decode -----~%") (time (dolist (file files) (decode-file (format nil "~a.en" file) (format nil "~a.de" file))))))
* (load "lzw1.lisp") T * (test) ----- encode ----- Evaluation took: 0.470 seconds of real time 0.473621 seconds of total run time (0.432921 user, 0.040700 system) 100.85% CPU 1,136,790,763 processor cycles 3,549,760 bytes consed ----- decode ----- Evaluation took: 0.369 seconds of real time 0.364025 seconds of total run time (0.364025 user, 0.000000 system) 98.64% CPU 873,646,248 processor cycles 1,916,960 bytes consed NIL
表 : LZW + CBT 符号の評価結果 辞書サイズ ファイル名 サイズ 8192 32768 65536 --------------------------------------------------- alice29.txt 152,089 67,415 61,123 60,513 asyoulik.txt 125,179 58,056 54,117 53,206 cp.html 24,603 11,111 10,877 10,877 fields.c 11,150 4,810 4,810 4,810 grammar.lsp 3,721 1,747 1,747 1,747 kennedy.xls 1,029,744 338,558 338,206 341,659 lcet10.txt 426,754 193,968 168,556 162,070 plrabn12.txt 481,861 219,812 200,493 195,131 ptt5 513,216 65,084 61,101 60,333 sum 38,240 29,123 19,473 19,473 xargs.1 4,227 2,249 2,249 2,249 --------------------------------------------------- 合計 2,810,784 991,933 922,752 912,068 表 : 実行時間 (秒) (辞書サイズ 8192, スライド窓 8192) : LZSS : LZW : LZW1 -------+------+------+------ 符号化 : 1.66 : 0.47 : 0.47 復号 : 0.30 : 0.35 : 0.37 実行環境 : SBCL 2.1.11, Ubunts 22.04 (WSL2), Intel Core i5-6200U 2.30GHz
;;; ;;; lzwrc.lisp : LZW 符号 + RangeCoder ;;; ;;; Copyright (c) 2023 Makoto Hiroi ;;; ;;; Released under the MIT license ;;; https://opensource.org/license/mit/ ;;; (require :brc) (require :utils) (use-package '(:brc :rangecoder :utils)) ;;; 定数 (defconstant dic-bits 13) (defconstant dic-size (ash 1 dic-bits)) ;;; ファイルサイズの書き込み (defun write-file-size (out size) (write-byte (logand (ash size -24) #xff) out) (write-byte (logand (ash size -16) #xff) out) (write-byte (logand (ash size -8) #xff) out) (write-byte (logand size #xff) out)) ;;; ファイルサイズの読み込み (defun read-file-size (in) (+ (ash (read-byte in) 24) (ash (read-byte in) 16) (ash (read-byte in) 8) (read-byte in))) ;;; 節 (defstruct node sym code (bros nil) (child nil)) ;;; 子を探す (defun search-child (nd c) (do ((xs (node-child nd) (node-bros xs))) ((null xs)) (when (= (node-sym xs) c) (return xs)))) ;;; 子を挿入する (defun insert-child (nd c num) (setf (node-child nd) (make-node :sym c :code num :bros (node-child nd)))) ;;; LZW 符号の符号化 (defun encode (rc fin) (let ((gm (initialize-gamma-model dic-size)) (buff (vector-tabulate (lambda (x) (make-node :sym x :code x)) 256)) (num 256) (p nil) (q nil)) (setf p (aref buff (read-byte fin nil))) (loop (let ((c (read-byte fin nil))) (unless c (gamma-encode rc gm (node-code p) :limit #x200 :inc 4) (return)) (setf q (search-child p c)) (cond ((null q) (gamma-encode rc gm (node-code p) :limit #x200 :inc 4) (when (< num dic-size) (insert-child p c num) (incf num)) (setf p (aref buff c))) (t (setf p q))))))) ;;; ファイルの符号化 (defun encode-file (in-file out-file) (call-with-byte-output-file out-file (lambda (fout) (call-with-byte-input-file in-file (lambda (fin) (let ((size (file-length fin))) (write-file-size fout size) (when (plusp size) (call-with-range-encoder fout (lambda (rc) (encode rc fin)))))))))) ;;; 記号列の出力 (defun output (buff n fout) (cond ((null (cdr (aref buff n))) (write-byte n fout) (values n 1)) (t (multiple-value-bind (m i) (output buff (cdr (aref buff n)) fout) (write-byte (car (aref buff n)) fout) (values m (1+ i)))))) ;;; LZW 符号の復号 (defun decode (rc fout size) (let ((gm (initialize-gamma-model dic-size)) (buff (make-array dic-size :initial-element nil)) (num 256) (p nil) (q nil) (c nil) (i nil)) ;; 初期化 (dotimes (x 256) (setf (aref buff x) (cons x nil))) ;; 最初の記号を読み込む (setf p (gamma-decode rc gm :limit #x200 :inc 4)) (multiple-value-setq (c i) (output buff p fout)) (decf size i) (do () ((zerop size)) (setf q (gamma-decode rc gm :limit #x200 :inc 4)) (cond ((< q num) (multiple-value-setq (c i) (output buff q fout)) (when (< num dic-size) (setf (aref buff num) (cons c p)) (incf num))) (t (setf (aref buff num) (cons c p)) (incf num) (multiple-value-setq (c i) (output buff q fout)))) (setf p q) (decf size i)))) ;;; ファイルの復号 (defun decode-file (in-file out-file) (call-with-byte-input-file in-file (lambda (fin) (let ((size (read-file-size fin))) (call-with-byte-output-file out-file (lambda (fout) (when (plusp size) (call-with-range-decoder fin (lambda (rc) (decode rc fout size)))))))))) ;;; 簡単なテスト (defun test () (let ((files '("alice29.txt" "asyoulik.txt" "cp.html" "fields.c" "grammar.lsp" "kennedy.xls" "lcet10.txt" "plrabn12.txt" "ptt5" "sum" "xargs.1"))) (format t "----- encode -----~%") (time (dolist (file files) (encode-file (format nil "./canterbury/~a" file) (format nil "~a.en" file)))) ;; (format t "----- decode -----~%") (time (dolist (file files) (decode-file (format nil "~a.en" file) (format nil "~a.de" file))))))
* (load "lzwrc.lisp") T * (test) ----- encode ----- Evaluation took: 1.380 seconds of real time 1.387658 seconds of total run time (1.358178 user, 0.029480 system) 100.58% CPU 3,330,504,821 processor cycles 3,599,312 bytes consed ----- decode ----- Evaluation took: 1.429 seconds of real time 1.432493 seconds of total run time (1.392416 user, 0.040077 system) 100.21% CPU 3,438,338,153 processor cycles 1,965,872 bytes consed NIL
表 : LZW 符号 + BinaryRangeCoderの評価結果 辞書サイズ ファイル名 サイズ 8192 32768 65536 --------------------------------------------------- alice29.txt 152,089 66,695 60,553 60,522 asyoulik.txt 125,179 57,482 53,555 53,555 cp.html 24,603 10,843 10,843 10,843 fields.c 11,150 4,730 4,730 4,730 grammar.lsp 3,721 1,707 1,707 1,707 kennedy.xls 1,029,744 219,014 200,148 197,994 lcet10.txt 426,754 191,396 166,521 160,011 plrabn12.txt 481,861 218,528 200,084 194,439 ptt5 513,216 62,453 58,931 58,952 sum 38,240 21,492 17,802 17,802 xargs.1 4,227 2,226 2,226 2,226 --------------------------------------------------- 合計 2,810,784 856,566 777,100 762,781 表 : 実行時間 (秒) (辞書サイズ 8192, スライド窓 8192) : LZSS : LZW : LZW1 : LZWRC -------+------+------+------+------- 符号化 : 1.66 : 0.47 : 0.47 : 1.38 復号 : 0.30 : 0.35 : 0.37 : 1.43 実行環境 : SBCL 2.1.11, Ubunts 22.04 (WSL2), Intel Core i5-6200U 2.30GHz
LZT 符号は 1987 年に Tischer 氏によって開発されました。おもな改良点は、辞書が満杯になった場合、長い間使われていない (最長時間未使用 : Least Recently Used) 語を取り除くことで辞書の空きスペースを作るところです。このような操作を「LRU スキーム」と呼びます。
LZT 符号は LRU スキームを行うため符号化と復号に時間がかかることになりますが、少ないメモリでも高い圧縮率を期待することができます。また、データの局所的な偏在も辞書に反映することが可能になります。
;;; ;;; lzt.lisp : LZT 符号 ;;; ;;; Copyright (c) 2023 Makoto Hiroi ;;; ;;; Released under the MIT license ;;; https://opensource.org/license/mit/ ;;; (require :bitio) (require :utils) (use-package '(:bitio :utils)) ;;; 定数 (defconstant dic-bits 13) (defconstant dic-size (ash 1 dic-bits)) ;;; 節 (defstruct node sym code (bros nil) (child nil) (parent nil)) ;;; 辞書 (defvar *dic-table* nil) (defvar *dic-num* nil) ;;; キュー (defvar *next* nil) (defvar *prev* nil) ;;; 初期化 (defun init-dic () (setf *dic-table* (make-array dic-size :initial-element nil)) (dotimes (x 256) (setf (aref *dic-table* x) (make-node :sym x :code x))) (setf *dic-num* 256) (setf *next* (make-array dic-size :initial-element 0)) (setf *prev* (make-array dic-size :initial-element 0))) ;;; 節を求める (defun get-node (num) (aref *dic-table* num)) ;;; 節をキューから削除 ;;; p <--> n <--> q ;;; => p <--> q (defun delete-node (nd) (let* ((n (node-code nd)) (p (aref *prev* n)) (q (aref *next* n))) (setf (aref *next* p) q (aref *prev* q) p))) ;;; 後ろへ追加 ;;; p <--> H (0) <--> ;;; => p <--> n <--> H (0) <--> (defun push-back (nd) (let ((n (node-code nd)) (p (aref *prev* 0))) (setf (aref *prev* 0) n (aref *next* p) n (aref *next* n) 0 (aref *prev* n) p))) ;;; 節を後ろへ移動 (defun move-node (nd) (delete-node nd) (push-back nd)) ;;; 葉を探す (defun search-leaf () (do ((n (aref *next* 0) (aref *next* n))) ((zerop n)) (unless (node-child (get-node n)) (return n)))) ;;; 子を探す (defun search-child (nd c) (do ((xs (node-child nd) (node-bros xs))) ((null xs)) (when (= (node-sym xs) c) (return xs)))) ;;; 子を削除する (defun delete-child (nd) (let ((p (node-parent nd))) (if (eql (node-child p) nd) (setf (node-child p) (node-bros nd)) (do ((xs (node-child p) (node-bros xs))) ((null xs)) (when (eql (node-bros xs) nd) (setf (node-bros xs) (node-bros nd)) (return)))))) ;;; 子を挿入する (defun insert-child (nd c) (cond ((< *dic-num* dic-size) (let ((new-nd (make-node :sym c :code *dic-num* :bros (node-child nd) :parent nd))) (push-back new-nd) (setf (node-child nd) new-nd (aref *dic-table* *dic-num*) new-nd) (incf *dic-num*))) (t ;; 葉を探す (let ((leaf (get-node (search-leaf)))) (delete-child leaf) (setf (node-sym leaf) c (node-bros leaf) (node-child nd) (node-child leaf) nil (node-parent leaf) nd (node-child nd) leaf) (move-node leaf))))) ;;; 辞書番号のチェック (defun check-code (n) (if (= *dic-num* dic-size) (/= (search-leaf) n) (< n *dic-num*))) ;;; LZT 符号の符号化 (defun encode (fin fout) (init-dic) (let ((p nil) (q nil)) (setf p (get-node (read-byte fin nil))) (loop (let ((c (read-byte fin nil))) (unless c (putbits fout dic-bits (node-code p)) (return)) (setf q (search-child p c)) (cond ((null q) (putbits fout dic-bits (node-code p)) (insert-child p c) (setf p (get-node c))) (t (move-node q) (setf p q))))))) ;;; ファイルの符号化 (defun encode-file (in-file out-file) (call-with-bit-output-file out-file (lambda (fout) (call-with-byte-input-file in-file (lambda (fin) (let ((size (file-length fin))) (putbits fout 32 size) (when (plusp size) (encode fin fout)))))))) ;;; 記号列の出力 (defun output (nd fout) (cond ((null (node-parent nd)) (write-byte (node-sym nd) fout) (values (node-sym nd) 1)) (t (multiple-value-bind (m i) (output (node-parent nd) fout) (write-byte (node-sym nd) fout) (move-node nd) (values m (1+ i)))))) ;;; LZT 符号の復号 (defun decode (fin fout size) (init-dic) (let ((p nil) (q nil) (c nil) (i nil)) ;; 最初の記号を読み込む (setf p (getbits fin dic-bits)) (multiple-value-setq (c i) (output (get-node p) fout)) (decf size i) (do () ((zerop size)) (setf q (getbits fin dic-bits)) (cond ((check-code q) (multiple-value-setq (c i) (output (get-node q) fout)) (insert-child (get-node p) c)) (t (insert-child (get-node p) c) (multiple-value-setq (c i) (output (get-node q) fout)))) (setf p q) (decf size i)))) ;;; ファイルの復号 (defun decode-file (in-file out-file) (call-with-bit-input-file in-file (lambda (fin) (let ((size (getbits fin 32))) (call-with-byte-output-file out-file (lambda (fout) (when (plusp size) (decode fin fout size)))))))) ;;; 簡単なテスト (defun test () (let ((files '("alice29.txt" "asyoulik.txt" "cp.html" "fields.c" "grammar.lsp" "kennedy.xls" "lcet10.txt" "plrabn12.txt" "ptt5" "sum" "xargs.1"))) (format t "----- encode -----~%") (time (dolist (file files) (encode-file (format nil "./canterbury/~a" file) (format nil "~a.en" file)))) ;; (format t "----- decode -----~%") (time (dolist (file files) (decode-file (format nil "~a.en" file) (format nil "~a.de" file))))))
* (load "lzt.lisp") T * (test) ----- encode ----- Evaluation took: 0.820 seconds of real time 0.746334 seconds of total run time (0.746334 user, 0.000000 system) 90.98% CPU 2,007,996,851 processor cycles 5,698,576 bytes consed ----- decode ----- Evaluation took: 0.590 seconds of real time 0.580272 seconds of total run time (0.580272 user, 0.000000 system) 98.31% CPU 1,392,703,076 processor cycles 5,682,160 bytes consed NIL
表 : LZT 符号の評価結果 LZW LZT (辞書サイズ) ファイル名 サイズ 8192 8192 32768 65536 ------------------------------------------------------------ alice29.txt 152,089 68,448 65,225 65,815 70,152 asyoulik.txt 125,179 59,085 57,965 58,831 62,752 cp.html 24,603 12,150 12,150 14,018 14,952 fields.c 11,150 5,760 5,760 6,646 7,088 grammar.lsp 3,721 2,294 2,294 2,646 2,822 kennedy.xls 1,029,744 339,542 275,226 307,799 326,882 lcet10.txt 426,754 194,996 181,460 167,931 169,536 plrabn12.txt 481,861 220,850 216,263 203,564 204,380 ptt5 513,216 66,101 62,424 65,819 70,120 sum 38,240 30,163 20,403 23,493 25,058 xargs.1 4,227 2,916 2,916 3,364 3,588 ------------------------------------------------------------ 合計 2,810,784 1,002,305 902,086 919,926 957,330 表 : 実行時間 (秒) (辞書サイズ 8192, スライド窓 8192) : LZSS : LZW : LZT -------+------+------+----- 符号化 : 1.66 : 0.47 : 0.82 復号 : 0.30 : 0.35 : 0.59 実行環境 : SBCL 2.1.11, Ubunts 22.04 (WSL2), Intel Core i5-6200U 2.30GHz
;;; ;;; lzt1.lisp : LZT 符号 + CBT 符号 ;;; ;;; Copyright (c) 2023 Makoto Hiroi ;;; ;;; Released under the MIT license ;;; https://opensource.org/license/mit/ ;;; (require :bitio) (require :utils) (use-package '(:bitio :utils)) ;;; 定数 (defconstant dic-bits 13) (defconstant dic-size (ash 1 dic-bits)) ;;; 節 (defstruct node sym code (bros nil) (child nil) (parent nil)) ;;; 辞書 (defvar *dic-table* nil) (defvar *dic-num* nil) ;;; キュー (defvar *next* nil) (defvar *prev* nil) ;;; 初期化 (defun init-dic () (setf *dic-table* (make-array dic-size :initial-element nil)) (dotimes (x 256) (setf (aref *dic-table* x) (make-node :sym x :code x))) (setf *dic-num* 256) (setf *next* (make-array dic-size :initial-element 0)) (setf *prev* (make-array dic-size :initial-element 0))) ;;; 節を求める (defun get-node (num) (aref *dic-table* num)) ;;; 節をキューから削除 ;;; p <--> n <--> q ;;; => p <--> q (defun delete-node (nd) (let* ((n (node-code nd)) (p (aref *prev* n)) (q (aref *next* n))) (setf (aref *next* p) q (aref *prev* q) p))) ;;; 後ろへ追加 ;;; p <--> H (0) <--> ;;; => p <--> n <--> H (0) <--> (defun push-back (nd) (let ((n (node-code nd)) (p (aref *prev* 0))) (setf (aref *prev* 0) n (aref *next* p) n (aref *next* n) 0 (aref *prev* n) p))) ;;; 節を後ろへ移動 (defun move-node (nd) (delete-node nd) (push-back nd)) ;;; 葉を探す (defun search-leaf () (do ((n (aref *next* 0) (aref *next* n))) ((zerop n)) (unless (node-child (get-node n)) (return n)))) ;;; 子を探す (defun search-child (nd c) (do ((xs (node-child nd) (node-bros xs))) ((null xs)) (when (= (node-sym xs) c) (return xs)))) ;;; 子を削除する (defun delete-child (nd) (let ((p (node-parent nd))) (if (eql (node-child p) nd) (setf (node-child p) (node-bros nd)) (do ((xs (node-child p) (node-bros xs))) ((null xs)) (when (eql (node-bros xs) nd) (setf (node-bros xs) (node-bros nd)) (return)))))) ;;; 子を挿入する (defun insert-child (nd c) (cond ((< *dic-num* dic-size) (let ((new-nd (make-node :sym c :code *dic-num* :bros (node-child nd) :parent nd))) (push-back new-nd) (setf (node-child nd) new-nd (aref *dic-table* *dic-num*) new-nd) (incf *dic-num*))) (t ;; 葉を探す (let ((leaf (get-node (search-leaf)))) (delete-child leaf) (setf (node-sym leaf) c (node-bros leaf) (node-child nd) (node-child leaf) nil (node-parent leaf) nd (node-child nd) leaf) (move-node leaf))))) ;;; 辞書番号のチェック (defun check-code (n) (if (= *dic-num* dic-size) (/= (search-leaf) n) (< n *dic-num*))) ;;; 符号語長 (defvar *code-count* nil) (defvar *dic-bits* nil) ;;; 初期化 (defun init-dic-bits () (setf *dic-bits* 9 *code-count* 256)) ;;; 更新 (defun update-dic-bits () (incf *code-count*) (when (> *code-count* (1- (ash 1 *dic-bits*))) (incf *dic-bits*))) ;;; 辞書番号の符号化 (defun encode1 (fout n) (cond ((< *dic-bits* dic-bits) (cbt-encode fout n *code-count* *dic-bits*) (update-dic-bits)) (t (putbits fout dic-bits n)))) ;;; 辞書番号の復号 (defun decode1 (fin) (if (< *dic-bits* dic-bits) (prog1 (cbt-decode fin *code-count* *dic-bits*) (update-dic-bits)) (getbits fin dic-bits))) ;;; LZT 符号の符号化 (defun encode (fin fout) (init-dic) (init-dic-bits) (let ((p nil) (q nil)) (setf p (get-node (read-byte fin nil))) (loop (let ((c (read-byte fin nil))) (unless c (encode1 fout (node-code p)) (return)) (setf q (search-child p c)) (cond ((null q) (encode1 fout (node-code p)) (insert-child p c) (setf p (get-node c))) (t (move-node q) (setf p q))))))) ;;; ファイルの符号化 (defun encode-file (in-file out-file) (call-with-bit-output-file out-file (lambda (fout) (call-with-byte-input-file in-file (lambda (fin) (let ((size (file-length fin))) (putbits fout 32 size) (when (plusp size) (encode fin fout)))))))) ;;; 記号列の出力 (defun output (nd fout) (cond ((null (node-parent nd)) (write-byte (node-sym nd) fout) (values (node-sym nd) 1)) (t (multiple-value-bind (m i) (output (node-parent nd) fout) (write-byte (node-sym nd) fout) (move-node nd) (values m (1+ i)))))) ;;; LZT 符号の復号 (defun decode (fin fout size) (init-dic) (init-dic-bits) (let ((p nil) (q nil) (c nil) (i nil)) ;; 最初の記号を読み込む (setf p (decode1 fin)) (multiple-value-setq (c i) (output (get-node p) fout)) (decf size i) (do () ((zerop size)) (setf q (decode1 fin)) (cond ((check-code q) (multiple-value-setq (c i) (output (get-node q) fout)) (insert-child (get-node p) c)) (t (insert-child (get-node p) c) (multiple-value-setq (c i) (output (get-node q) fout)))) (setf p q) (decf size i)))) ;;; ファイルの復号 (defun decode-file (in-file out-file) (call-with-bit-input-file in-file (lambda (fin) (let ((size (getbits fin 32))) (call-with-byte-output-file out-file (lambda (fout) (when (plusp size) (decode fin fout size)))))))) ;;; 簡単なテスト (defun test () (let ((files '("alice29.txt" "asyoulik.txt" "cp.html" "fields.c" "grammar.lsp" "kennedy.xls" "lcet10.txt" "plrabn12.txt" "ptt5" "sum" "xargs.1"))) (format t "----- encode -----~%") (time (dolist (file files) (encode-file (format nil "./canterbury/~a" file) (format nil "~a.en" file)))) ;; (format t "----- decode -----~%") (time (dolist (file files) (decode-file (format nil "~a.en" file) (format nil "~a.de" file))))))
* (load "lzt1.lisp") T * (test) ----- encode ----- Evaluation took: 0.700 seconds of real time 0.715342 seconds of total run time (0.712893 user, 0.002449 system) 102.14% CPU 1,716,929,989 processor cycles 5,682,240 bytes consed ----- decode ----- Evaluation took: 0.590 seconds of real time 0.585889 seconds of total run time (0.585798 user, 0.000091 system) 99.32% CPU 1,406,113,127 processor cycles 5,666,016 bytes consed NIL
表 : LZT + CBT 符号の評価結果 LZT LZT + CBT (辞書サイズ) ファイル名 サイズ 8192 8192 32768 65536 ----------------------------------------------------------- alice29.txt 152,089 65,225 64,193 61,097 60,513 asyoulik.txt 125,179 57,965 56,937 54,117 53,206 cp.html 24,603 12,150 11,111 10,877 10,877 fields.c 11,150 5,760 4,810 4,810 4,810 grammar.lsp 3,721 2,294 1,747 1,747 1,747 kennedy.xls 1,029,744 275,226 274,242 303,277 316,931 lcet10.txt 426,754 181,460 180,431 163,278 159,994 plrabn12.txt 481,861 216,263 215,225 198,821 194,643 ptt5 513,216 62,424 61,407 61,064 60,333 sum 38,240 20,403 19,364 19,473 19,473 xargs.1 4,227 2,916 2,249 2,249 2,249 ------------------------------------------------------------ 合計 2,810,784 902,086 891,716 880,810 884,776 表 : 実行時間 (秒) (辞書サイズ 8192, スライド窓 8192) : LZSS : LZW : LZT : LZT1 -------+------+------+------+------ 符号化 : 1.66 : 0.47 : 0.82 : 0.70 復号 : 0.30 : 0.35 : 0.59 : 0.59 実行環境 : SBCL 2.1.11, Ubunts 22.04 (WSL2), Intel Core i5-6200U 2.30GHz
;;; ;;; lztrc.lisp : LZT 符号 + BinaryRangeCoder ;;; ;;; Copyright (c) 2023 Makoto Hiroi ;;; ;;; Released under the MIT license ;;; https://opensource.org/license/mit/ ;;; (require :brc) (use-package '(:brc :rangecoder)) ;;; 定数 (defconstant dic-bits 13) (defconstant dic-size (ash 1 dic-bits)) ;;; ファイルサイズの書き込み (defun write-file-size (out size) (write-byte (logand (ash size -24) #xff) out) (write-byte (logand (ash size -16) #xff) out) (write-byte (logand (ash size -8) #xff) out) (write-byte (logand size #xff) out)) ;;; ファイルサイズの読み込み (defun read-file-size (in) (+ (ash (read-byte in) 24) (ash (read-byte in) 16) (ash (read-byte in) 8) (read-byte in))) ;;; 節 (defstruct node sym code (bros nil) (child nil) (parent nil)) ;;; 辞書 (defvar *dic-table* nil) (defvar *dic-num* nil) ;;; キュー (defvar *next* nil) (defvar *prev* nil) ;;; 初期化 (defun init-dic () (setf *dic-table* (make-array dic-size :initial-element nil)) (dotimes (x 256) (setf (aref *dic-table* x) (make-node :sym x :code x))) (setf *dic-num* 256) (setf *next* (make-array dic-size :initial-element 0)) (setf *prev* (make-array dic-size :initial-element 0))) ;;; 節を求める (defun get-node (num) (aref *dic-table* num)) ;;; 節をキューから削除 ;;; p <--> n <--> q ;;; => p <--> q (defun delete-node (nd) (let* ((n (node-code nd)) (p (aref *prev* n)) (q (aref *next* n))) (setf (aref *next* p) q (aref *prev* q) p))) ;;; 後ろへ追加 ;;; p <--> H (0) <--> ;;; => p <--> n <--> H (0) <--> (defun push-back (nd) (let ((n (node-code nd)) (p (aref *prev* 0))) (setf (aref *prev* 0) n (aref *next* p) n (aref *next* n) 0 (aref *prev* n) p))) ;;; 節を後ろへ移動 (defun move-node (nd) (delete-node nd) (push-back nd)) ;;; 葉を探す (defun search-leaf () (do ((n (aref *next* 0) (aref *next* n))) ((zerop n)) (unless (node-child (get-node n)) (return n)))) ;;; 子を探す (defun search-child (nd c) (do ((xs (node-child nd) (node-bros xs))) ((null xs)) (when (= (node-sym xs) c) (return xs)))) ;;; 子を削除する (defun delete-child (nd) (let ((p (node-parent nd))) (if (eql (node-child p) nd) (setf (node-child p) (node-bros nd)) (do ((xs (node-child p) (node-bros xs))) ((null xs)) (when (eql (node-bros xs) nd) (setf (node-bros xs) (node-bros nd)) (return)))))) ;;; 子を挿入する (defun insert-child (nd c) (cond ((< *dic-num* dic-size) (let ((new-nd (make-node :sym c :code *dic-num* :bros (node-child nd) :parent nd))) (push-back new-nd) (setf (node-child nd) new-nd (aref *dic-table* *dic-num*) new-nd) (incf *dic-num*))) (t ;; 葉を探す (let ((leaf (get-node (search-leaf)))) (delete-child leaf) (setf (node-sym leaf) c (node-bros leaf) (node-child nd) (node-child leaf) nil (node-parent leaf) nd (node-child nd) leaf) (move-node leaf))))) ;;; 辞書番号のチェック (defun check-code (n) (if (= *dic-num* dic-size) (/= (search-leaf) n) (< n *dic-num*))) ;;; LZT 符号の符号化 (defun encode (rc fin) (init-dic) (let ((gm (initialize-gamma-model dic-size)) (p nil) (q nil)) (setf p (get-node (read-byte fin nil))) (loop (let ((c (read-byte fin nil))) (unless c (gamma-encode rc gm (node-code p) :limit #x200 :inc 4) (return)) (setf q (search-child p c)) (cond ((null q) (gamma-encode rc gm (node-code p) :limit #x200 :inc 4) (insert-child p c) (setf p (get-node c))) (t (move-node q) (setf p q))))))) ;;; ファイルの符号化 (defun encode-file (in-file out-file) (call-with-byte-output-file out-file (lambda (fout) (call-with-byte-input-file in-file (lambda (fin) (let ((size (file-length fin))) (write-file-size fout size) (when (plusp size) (call-with-range-encoder fout (lambda (rc) (encode rc fin)))))))))) ;;; 記号列の出力 (defun output (nd fout) (cond ((null (node-parent nd)) (write-byte (node-sym nd) fout) (values (node-sym nd) 1)) (t (multiple-value-bind (m i) (output (node-parent nd) fout) (write-byte (node-sym nd) fout) (move-node nd) (values m (1+ i)))))) ;;; LZT 符号の復号 (defun decode (rc fout size) (init-dic) (let ((gm (initialize-gamma-model dic-size)) (p nil) (q nil) (c nil) (i nil)) ;; 最初の記号を読み込む (setf p (gamma-decode rc gm :limit #x200 :inc 4)) (multiple-value-setq (c i) (output (get-node p) fout)) (decf size i) (do () ((zerop size)) (setf q (gamma-decode rc gm :limit #x200 :inc 4)) (cond ((check-code q) (multiple-value-setq (c i) (output (get-node q) fout)) (insert-child (get-node p) c)) (t (insert-child (get-node p) c) (multiple-value-setq (c i) (output (get-node q) fout)))) (setf p q) (decf size i)))) ;;; ファイルの復号 (defun decode-file (in-file out-file) (call-with-byte-input-file in-file (lambda (fin) (let ((size (read-file-size fin))) (call-with-byte-output-file out-file (lambda (fout) (when (plusp size) (call-with-range-decoder fin (lambda (rc) (decode rc fout size)))))))))) ;;; 簡単なテスト (defun test () (let ((files '("alice29.txt" "asyoulik.txt" "cp.html" "fields.c" "grammar.lsp" "kennedy.xls" "lcet10.txt" "plrabn12.txt" "ptt5" "sum" "xargs.1"))) (format t "----- encode -----~%") (time (dolist (file files) (encode-file (format nil "./canterbury/~a" file) (format nil "~a.en" file)))) ;; (format t "----- decode -----~%") (time (dolist (file files) (decode-file (format nil "~a.en" file) (format nil "~a.de" file))))))
* (load "lztrc.lisp") T * (test) ----- encode ----- Evaluation took: 1.640 seconds of real time 1.653928 seconds of total run time (1.633722 user, 0.020206 system) 100.85% CPU 3,969,466,154 processor cycles 5,747,808 bytes consed ----- decode ----- Evaluation took: 1.630 seconds of real time 1.623808 seconds of total run time (1.614050 user, 0.009758 system) 99.63% CPU 3,897,169,190 processor cycles 5,731,408 bytes consed NIL
表 : LZT + BinaryRangeCoder の評価結果 LZT1 LZT + BRC (辞書サイズ) ファイル名 サイズ 8192 8192 32768 65536 ----------------------------------------------------------- alice29.txt 152,089 64,193 63,416 60,509 60,522 asyoulik.txt 125,179 56,937 56,327 53,555 53,555 cp.html 24,603 11,111 10,843 10,843 10,843 fields.c 11,150 4,810 4,730 4,730 4,730 grammar.lsp 3,721 1,747 1,707 1,707 1,707 kennedy.xls 1,029,744 274,242 206,468 196,491 196,312 lcet10.txt 426,754 180,431 178,916 162,120 158,575 plrabn12.txt 481,861 215,225 213,765 197,763 193,611 ptt5 513,216 61,407 59,661 58,953 58,952 sum 38,240 19,364 17,815 17,802 17,802 xargs.1 4,227 2,249 2,226 2,226 2,226 ------------------------------------------------------------ 合計 2,810,784 891,716 815,874 766,699 758,835 表 : 実行時間 (秒) (辞書サイズ 8192, スライド窓 8192) : LZSS : LZW : LZT : LZT1 : LZTRC -------+------+------+------+------+------- 符号化 : 1.66 : 0.47 : 0.82 : 0.70 : 1.64 復号 : 0.30 : 0.35 : 0.59 : 0.59 : 1.63 実行環境 : SBCL 2.1.11, Ubunts 22.04 (WSL2), Intel Core i5-6200U 2.30GHz