axiom-developer
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Axiom-developer] No progress on notangle


From: Waldek Hebisch
Subject: Re: [Axiom-developer] No progress on notangle
Date: Sun, 18 Feb 2007 21:19:49 +0100 (CET)

Below is a newer version of CL notangle, this time handling recursive
references.  There are still significant differences compared to
real notangle (lack of @ escapes, different end of line handling, ...).
I also fixed performace problem in scan-for-chunks (previous version
was using 'equal' and sbcl emited actual call to 'equal', current
version uses 'eql' which generates inline code).  Another performance
problem was using arrays of bytes as hash keys (completly killed
performance for pamphlets with many small chunks, or chunks
split into many small pieces).  I solved this problem converting
arrays to strings before hashing.

This version seem to be significantly faster than notangle: it is able
to process simple 8 Mb file (containing few big chunks) in 0.14s while
notangle needs 2s.  On 8Mb file containing 1000 chunks, each consisting
of 100 pieces 10 lines each, my version needs 0.24s, while notangle
uses 2.85s. 

BTW. Timing was done on 2GHz Athlon 64.  Notangle came from Axiom build,
Lisp is sbcl 1.0.  I tried also clisp, but it is quite slow (need 8.73
senconds on my second test file) -- no wonder since I use a low level
code.  This version (like the previous one) does not work with gcl
(ANSI 2.6.5 interpreter claims that 'read-sequence' is undefined,
non-ANSI interpreter chokes on 'defclass').

(defconstant start-tag-code-1 (char-code #\<))
(defconstant start-tag-code-2 (char-code #\<))
(defconstant end-tag-code-1 (char-code #\>))
(defconstant end-tag-code-2 (char-code #\>))
(defconstant end-tag-code-3 (char-code #\=))
(defconstant start-marker-code-1 (char-code #\<))
(defconstant start-marker-code-2 (char-code #\<))
(defconstant end-marker-code-1 (char-code #\>))
(defconstant end-marker-code-2 (char-code #\>))
(defconstant chunk-end-code (char-code #\@))
(defconstant newline-code 10)
(defconstant space-code (char-code #\ ))

(defun read-file (f)
    (let* ((b-len (file-length f))
           (buff (make-array (list (+ b-len 1))
                             :element-type '(unsigned-byte  8))))
        (read-sequence buff f)
        (setf (aref buff b-len) newline-code)
        buff))

(defun read-named-file (name)
    (with-open-file (f name :element-type '(unsigned-byte  8))
         (read-file f)))

(defclass chunk ()
  ((chunk-name
    :initarg :chunk-name
    :initform (error "Must supply a chunk name.")
    :reader chunk-name
    :documentation "Name of chunk.")
   (chunk-contents
    :initarg :chunk-contents
    :initform '()
    :accessor chunk-contents
    :documentation "Text of chunk - may include references to other chunks")))

(defparameter *chunk-hash-table* (make-hash-table :test 'equal))

(defun add-to-chunk-contents (name content)
    (declare (optimize (speed 3)))
    (let ((chunk (gethash name *chunk-hash-table*)))
         (if (not chunk)
             (setf chunk (setf (gethash name *chunk-hash-table*)
                               (make-instance 'chunk :chunk-name name))))
             (push content (chunk-contents chunk))))

(defun view-all-chunks ()
  (maphash #'(lambda (k v) (format t "~a => ~S~&" k (chunk-contents v))) 
*chunk-hash-table*))

(defun print-chunk-name (name line-number)
    (format t "Chunk start in line ~A: <<" line-number)
    (dotimes (i (array-dimension name 0))
        (format t "~A" (code-char (aref name i))))
    (format t ">>~&")) 

(defun array-to-string (ar)
    (declare (optimize (speed 3))
             (type (simple-array (unsigned-byte 8) (*)) ar))
    (let* ((len (array-dimension ar 0))
           (str (make-string len)))
          (dotimes (i len)
               (setf (aref str i) (code-char (aref ar i))))
          str))

(defun scan-for-chunks (buff)
    (declare (optimize (speed 3)))
    (prog ((pos -1)
           (end-buff (- (array-dimension buff 0) 1))
           (start-pos)
           (code)
           (chunk-name)
           (content nil)
           (chunk-start-pos)
           (chunk-end-pos)
           (line-number 0))
         (declare (type (signed-byte 32) pos end-buff line-number)
                  (type (simple-array (unsigned-byte  8) (*)) buff))
                  
       normal-start
         (incf pos)
         (incf line-number)
         (if (>= pos end-buff)
             (return-from scan-for-chunks))
         (setf code (aref buff pos))
         (if (eql code start-tag-code-1)
             (go chunk-start-tag-1))
         (if (eql code newline-code)
             (go normal-start))
         (go normal)
       normal
         (incf pos)
         (setf code (aref buff pos))
         (if (eql code newline-code)
             (go normal-start))
         (go normal)
       chunk-start-tag-1
         (incf pos)
         (setf code (aref buff pos))
         (if (eql code start-tag-code-2)
             (progn
                  (setf start-pos (+ pos 1))
                  ;;; (format t "start-pos: ~A, char: ~A~&" start-pos
                  ;;;                                  (aref buff start-pos))
                  (go in-chunk-start-tag)))
         (if (eql code newline-code)
             (go normal-start))
         (go normal)
       in-chunk-start-tag
         (incf pos)
         (setf code (aref buff pos))
         (if (eql code end-tag-code-1)
             (go in-chunk-start-tag-3))
         (if (eql code newline-code)
             (go normal-start))
         (go in-chunk-start-tag)
       in-chunk-start-tag-3
         (incf pos)
         (setf code (aref buff pos))
         (if (eql code end-tag-code-2)
             (go in-chunk-start-tag-4))
         (if (eql code newline-code)
             (go normal-start))
         (go normal)
       in-chunk-start-tag-4
         (incf pos)
         (setf code (aref buff pos))
         (if (eql code end-tag-code-3)
             (progn 
                  (setf chunk-name (subseq buff start-pos (- pos 2)))
                  (go in-chunk-start-tag-trailing)))
         (if (eql code newline-code)
             (go normal-start))
         (go normal)
       in-chunk-start-tag-trailing
         (incf pos)
         (setf code (aref buff pos))
         (if (eql code newline-code)
             (progn
                 ;;; (print-chunk-name chunk-name line-number)
                 (setf chunk-start-pos (+ pos 1))
                 (go in-chunk)))
         (if (eql code space-code)
             (go in-chunk-start-tag-trailing))
         (go normal)
       chunk-start
         (setf chunk-end-pos pos)
         (incf pos)
         (if (>= pos end-buff)
             (break "unexpected end of file"))
         (setf code (aref buff pos))
         (incf line-number)
         (if (eql code newline-code)
             (go chunk-start))
         (if (eql code start-marker-code-1)
             (go start-marker-1))
         (if (eql code chunk-end-code)
             (progn
                 (push (list chunk-start-pos chunk-end-pos) content)
                 (setf content (reverse content))
                 ;;; (format t "Adding ~S to ~S~&" content chunk-name) 
                 (add-to-chunk-contents 
                     (array-to-string chunk-name)
                     content)
                 ;;; (view-all-chunks)
                 (setf content nil)
                 (go chunk-end-trailing)))
         (go in-chunk)
       chunk-end-trailing
         (incf pos)
         (setf code (aref buff pos))
         (if (eql code newline-code)
            (go normal-start))
         (if (eql code space-code)
            (go chunk-end-trailing))
         (format t "pos: ~A, code: ~A~&" pos code)
         (break "garbage after end of chunk marker")
       in-chunk
         (incf pos)
         (setf code (aref buff pos))
         (if (eql code newline-code)
             (go chunk-start))
         (if (eql code start-marker-code-1)
             (go start-marker-1))
         (go in-chunk)
       start-marker-1
         (incf pos)
         (setf code (aref buff pos))
         (if (eql code start-marker-code-2)
             (progn
                 (setf start-pos (+ pos 1))
                 (go start-marker-2)))
         (if (eql code newline-code)
             (go chunk-start))
         (go in-chunk)
       start-marker-2
         (incf pos)
         (setf code (aref buff pos))
         (if (eql code end-marker-code-1)
             (go start-marker-3))
         (if (eql code newline-code)
             (go chunk-start))
         (go start-marker-2)
       start-marker-3
         (incf pos)
         (setf code (aref buff pos))
         (if (eql code end-marker-code-2)
              (progn
                   (push (list chunk-start-pos (- start-pos 2)) content)
                   (push (array-to-string
                             (subseq buff start-pos (- pos 1)))
                          content)
                   (setf chunk-start-pos (+ pos 1))
                   (go in-chunk)))
         (if (eql code newline-code)
             (go chunk-start))
         (go start-marker-2)))

(defun write-chunk (buff name out-file)
    (let ((chunk (gethash name *chunk-hash-table*)))
        (dolist (segs (reverse (chunk-contents chunk)))
            (dolist (seg segs)
                (if (consp seg)
                    (write-sequence buff out-file :start (nth 0 seg)
                                                :end (nth 1 seg))
                    (write-chunk buff seg out-file))))))

(defun untangle (in-file out-name)
    (clrhash *chunk-hash-table*)
    (let ((buff (read-named-file in-file))
          (chunk)
          (name (make-array (list 1)
                                         :element-type '(unsigned-byte  8)
                                         :initial-element (char-code #\*))))
          (scan-for-chunks buff)
          ;;; (setf name (array-to-string name))
          (setf name "*")
          (setf chunk (gethash name *chunk-hash-table*))
          (if (not chunk)
              (break "No main chunk"))
          (with-open-file (out-file out-name :direction :output
                                       :if-exists :supersede
                                       :element-type '(unsigned-byte  8))
              (write-chunk buff name out-file))))
         

-- 
                              Waldek Hebisch
address@hidden 




reply via email to

[Prev in Thread] Current Thread [Next in Thread]