head	1.2;
access;
symbols
	perseant-exfatfs-base-20250801:1.2
	perseant-exfatfs-base-20240630:1.2
	cjep_sun2x:1.2.0.14
	cjep_sun2x-base:1.2
	cjep_staticlib_x-base1:1.2
	cjep_staticlib_x:1.2.0.12
	cjep_staticlib_x-base:1.2
	phil-wifi-20200421:1.2
	phil-wifi-20200411:1.2
	phil-wifi-20200406:1.2
	pgoyette-compat-merge-20190127:1.2
	pgoyette-compat-20190127:1.2
	pgoyette-compat-20190118:1.2
	pgoyette-compat-1226:1.2
	pgoyette-compat-1126:1.2
	pgoyette-compat-1020:1.2
	pgoyette-compat-0930:1.2
	pgoyette-compat-0906:1.2
	netbsd-7-2-RELEASE:1.1.1.1
	pgoyette-compat-0728:1.2
	pgoyette-compat-0625:1.2
	pgoyette-compat-0521:1.2
	pgoyette-compat-0502:1.2
	pgoyette-compat-0422:1.2
	pgoyette-compat-0415:1.2
	pgoyette-compat-0407:1.2
	pgoyette-compat-0330:1.2
	pgoyette-compat-0322:1.2
	pgoyette-compat-0315:1.2
	netbsd-7-1-2-RELEASE:1.1.1.1
	pgoyette-compat:1.2.0.10
	pgoyette-compat-base:1.2
	netbsd-7-1-1-RELEASE:1.1.1.1
	perseant-stdc-iso10646:1.2.0.8
	perseant-stdc-iso10646-base:1.2
	prg-localcount2-base3:1.2
	prg-localcount2-base2:1.2
	prg-localcount2-base1:1.2
	prg-localcount2:1.2.0.6
	prg-localcount2-base:1.2
	pgoyette-localcount-20170426:1.2
	bouyer-socketcan-base1:1.2
	pgoyette-localcount-20170320:1.2
	netbsd-7-1:1.1.1.1.0.68
	netbsd-7-1-RELEASE:1.1.1.1
	netbsd-7-1-RC2:1.1.1.1
	netbsd-7-nhusb-base-20170116:1.1.1.1
	bouyer-socketcan:1.2.0.4
	bouyer-socketcan-base:1.2
	pgoyette-localcount-20170107:1.2
	netbsd-7-1-RC1:1.1.1.1
	pgoyette-localcount-20161104:1.2
	netbsd-7-0-2-RELEASE:1.1.1.1
	localcount-20160914:1.2
	netbsd-7-nhusb:1.1.1.1.0.66
	netbsd-7-nhusb-base:1.1.1.1
	pgoyette-localcount-20160806:1.2
	pgoyette-localcount-20160726:1.2
	pgoyette-localcount:1.2.0.2
	pgoyette-localcount-base:1.2
	netbsd-7-0-1-RELEASE:1.1.1.1
	netbsd-7-0:1.1.1.1.0.64
	netbsd-7-0-RELEASE:1.1.1.1
	netbsd-7-0-RC3:1.1.1.1
	netbsd-7-0-RC2:1.1.1.1
	netbsd-7-0-RC1:1.1.1.1
	netbsd-5-2-3-RELEASE:1.1.1.1
	netbsd-5-1-5-RELEASE:1.1.1.1
	netbsd-6-0-6-RELEASE:1.1.1.1
	netbsd-6-1-5-RELEASE:1.1.1.1
	netbsd-7:1.1.1.1.0.62
	netbsd-7-base:1.1.1.1
	yamt-pagecache-base9:1.1.1.1
	yamt-pagecache-tag8:1.1.1.1
	netbsd-6-1-4-RELEASE:1.1.1.1
	netbsd-6-0-5-RELEASE:1.1.1.1
	tls-earlyentropy:1.1.1.1.0.60
	tls-earlyentropy-base:1.1.1.1
	riastradh-xf86-video-intel-2-7-1-pre-2-21-15:1.1.1.1
	riastradh-drm2-base3:1.1.1.1
	netbsd-6-1-3-RELEASE:1.1.1.1
	netbsd-6-0-4-RELEASE:1.1.1.1
	netbsd-5-2-2-RELEASE:1.1.1.1
	netbsd-5-1-4-RELEASE:1.1.1.1
	netbsd-6-1-2-RELEASE:1.1.1.1
	netbsd-6-0-3-RELEASE:1.1.1.1
	netbsd-5-2-1-RELEASE:1.1.1.1
	netbsd-5-1-3-RELEASE:1.1.1.1
	netbsd-6-1-1-RELEASE:1.1.1.1
	riastradh-drm2-base2:1.1.1.1
	riastradh-drm2-base1:1.1.1.1
	riastradh-drm2:1.1.1.1.0.54
	riastradh-drm2-base:1.1.1.1
	netbsd-6-1:1.1.1.1.0.58
	netbsd-6-0-2-RELEASE:1.1.1.1
	netbsd-6-1-RELEASE:1.1.1.1
	netbsd-6-1-RC4:1.1.1.1
	netbsd-6-1-RC3:1.1.1.1
	agc-symver:1.1.1.1.0.56
	agc-symver-base:1.1.1.1
	netbsd-6-1-RC2:1.1.1.1
	netbsd-6-1-RC1:1.1.1.1
	yamt-pagecache-base8:1.1.1.1
	netbsd-5-2:1.1.1.1.0.52
	netbsd-6-0-1-RELEASE:1.1.1.1
	yamt-pagecache-base7:1.1.1.1
	netbsd-5-2-RELEASE:1.1.1.1
	netbsd-5-2-RC1:1.1.1.1
	matt-nb6-plus-nbase:1.1.1.1
	yamt-pagecache-base6:1.1.1.1
	netbsd-6-0:1.1.1.1.0.50
	netbsd-6-0-RELEASE:1.1.1.1
	netbsd-6-0-RC2:1.1.1.1
	tls-maxphys:1.1.1.1.0.48
	tls-maxphys-base:1.1.1.1
	matt-nb6-plus:1.1.1.1.0.46
	matt-nb6-plus-base:1.1.1.1
	netbsd-6-0-RC1:1.1.1.1
	yamt-pagecache-base5:1.1.1.1
	yamt-pagecache-base4:1.1.1.1
	netbsd-6:1.1.1.1.0.44
	netbsd-6-base:1.1.1.1
	netbsd-5-1-2-RELEASE:1.1.1.1
	netbsd-5-1-1-RELEASE:1.1.1.1
	yamt-pagecache-base3:1.1.1.1
	yamt-pagecache-base2:1.1.1.1
	yamt-pagecache:1.1.1.1.0.42
	yamt-pagecache-base:1.1.1.1
	cherry-xenmp:1.1.1.1.0.40
	cherry-xenmp-base:1.1.1.1
	bouyer-quota2-nbase:1.1.1.1
	bouyer-quota2:1.1.1.1.0.38
	bouyer-quota2-base:1.1.1.1
	matt-mips64-premerge-20101231:1.1.1.1
	matt-nb5-mips64-premerge-20101231:1.1.1.1
	matt-nb5-pq3:1.1.1.1.0.36
	matt-nb5-pq3-base:1.1.1.1
	netbsd-5-1:1.1.1.1.0.34
	netbsd-5-1-RELEASE:1.1.1.1
	netbsd-5-1-RC4:1.1.1.1
	matt-nb5-mips64-k15:1.1.1.1
	netbsd-5-1-RC3:1.1.1.1
	netbsd-5-1-RC2:1.1.1.1
	netbsd-5-1-RC1:1.1.1.1
	netbsd-5-0-2-RELEASE:1.1.1.1
	matt-nb5-mips64-premerge-20091211:1.1.1.1
	matt-premerge-20091211:1.1.1.1
	matt-nb5-mips64-u2-k2-k4-k7-k8-k9:1.1.1.1
	matt-nb4-mips64-k7-u2a-k9b:1.1.1.1
	matt-nb5-mips64-u1-k1-k5:1.1.1.1
	matt-nb5-mips64:1.1.1.1.0.32
	netbsd-5-0-1-RELEASE:1.1.1.1
	jym-xensuspend-nbase:1.1.1.1
	netbsd-5-0:1.1.1.1.0.30
	netbsd-5-0-RELEASE:1.1.1.1
	netbsd-5-0-RC4:1.1.1.1
	netbsd-5-0-RC3:1.1.1.1
	netbsd-5-0-RC2:1.1.1.1
	jym-xensuspend:1.1.1.1.0.28
	jym-xensuspend-base:1.1.1.1
	netbsd-5-0-RC1:1.1.1.1
	netbsd-5:1.1.1.1.0.26
	netbsd-5-base:1.1.1.1
	matt-mips64-base2:1.1.1.1
	matt-mips64:1.1.1.1.0.24
	mjf-devfs2:1.1.1.1.0.22
	mjf-devfs2-base:1.1.1.1
	netbsd-4-0-1-RELEASE:1.1.1.1
	wrstuden-revivesa-base-3:1.1.1.1
	wrstuden-revivesa-base-2:1.1.1.1
	wrstuden-fixsa-newbase:1.1.1.1
	wrstuden-revivesa-base-1:1.1.1.1
	yamt-pf42-base4:1.1.1.1
	yamt-pf42-base3:1.1.1.1
	hpcarm-cleanup-nbase:1.1.1.1
	yamt-pf42-baseX:1.1.1.1
	yamt-pf42-base2:1.1.1.1
	wrstuden-revivesa:1.1.1.1.0.20
	wrstuden-revivesa-base:1.1.1.1
	yamt-pf42:1.1.1.1.0.18
	yamt-pf42-base:1.1.1.1
	keiichi-mipv6:1.1.1.1.0.16
	keiichi-mipv6-base:1.1.1.1
	matt-armv6-nbase:1.1.1.1
	matt-armv6-prevmlocking:1.1.1.1
	wrstuden-fixsa-base-1:1.1.1.1
	netbsd-4-0:1.1.1.1.0.14
	netbsd-4-0-RELEASE:1.1.1.1
	cube-autoconf:1.1.1.1.0.12
	cube-autoconf-base:1.1.1.1
	netbsd-4-0-RC5:1.1.1.1
	netbsd-4-0-RC4:1.1.1.1
	netbsd-4-0-RC3:1.1.1.1
	netbsd-4-0-RC2:1.1.1.1
	netbsd-4-0-RC1:1.1.1.1
	matt-armv6:1.1.1.1.0.10
	matt-armv6-base:1.1.1.1
	matt-mips64-base:1.1.1.1
	hpcarm-cleanup:1.1.1.1.0.8
	hpcarm-cleanup-base:1.1.1.1
	wrstuden-fixsa:1.1.1.1.0.6
	wrstuden-fixsa-base:1.1.1.1
	abandoned-netbsd-4-base:1.1.1.1
	abandoned-netbsd-4:1.1.1.1.0.2
	netbsd-4:1.1.1.1.0.4
	netbsd-4-base:1.1.1.1
	v0_14_4:1.1.1.1
	GNU:1.1.1;
locks; strict;
comment	@# @;


1.2
date	2016.01.13.01.59.37;	author christos;	state dead;
branches;
next	1.1;
commitid	U9rtUbw2y5ZheIQy;

1.1
date	2005.04.29.15.08.15;	author christos;	state Exp;
branches
	1.1.1.1;
next	;

1.1.1.1
date	2005.04.29.15.08.15;	author christos;	state Exp;
branches;
next	;


desc
@@


1.2
log
@gettext has moved.
@
text
@#!/usr/local/bin/clisp -C

;;; Creation of CLISP's uni_names.h from the UnicodeData.txt table.
;;; Bruno Haible 2000-12-28

(defparameter add-comments nil)

(defstruct unicode-char
  (code nil :type integer)
  (name nil :type string)
  word-indices
  word-indices-index
)

(defstruct word-list
  (hashed nil :type hash-table)
  (sorted nil :type list)
  size                          ; number of characters total
  length                        ; number of words
)

(defun main (inputfile outputfile)
  (declare (type string inputfile outputfile))
  #+UNICODE (setq *default-file-encoding* charset:utf-8)
  (let ((all-chars '()))
    ;; Read all characters and names from the input file.
    (with-open-file (istream inputfile :direction :input)
      (loop
        (let ((line (read-line istream nil nil)))
          (unless line (return))
          (let* ((i1 (position #\; line))
                 (i2 (position #\; line :start (1+ i1)))
                 (code-string (subseq line 0 i1))
                 (code (parse-integer code-string :radix 16))
                 (name-string (subseq line (1+ i1) i2)))
            ; Ignore characters whose name starts with "<".
            (unless (eql (char name-string 0) #\<)
              ; Also ignore Hangul syllables; they are treated specially.
              (unless (<= #xAC00 code #xD7A3)
                ; Also ignore CJK compatibility ideographs; they are treated
                ; specially as well.
                (unless (or (<= #xF900 code #xFA2D) (<= #xFA30 code #xFA6A)
                            (<= #x2F800 code #x2FA1D))
                  ; Transform the code so that it fits in 16 bits. In
                  ; Unicode 3.1 the following ranges are used.
                  ;   0x00000..0x033FF  >>12=  0x00..0x03  -> 0x0..0x3
                  ;   0x0A000..0x0A4FF  >>12=  0x0A        -> 0x4
                  ;   0x0F900..0x0FFFF  >>12=  0x0F        -> 0x5
                  ;   0x10300..0x104FF  >>12=  0x10        -> 0x6
                  ;   0x1D000..0x1D7DD  >>12=  0x1D        -> 0x7
                  ;   0x2F800..0x2FAFF  >>12=  0x2F        -> 0x8
                  ;   0xE0000..0xE00FF  >>12=  0xE0        -> 0x9
                  (flet ((transform (x)
                           (dpb
                             (case (ash x -12)
                               ((#x00 #x01 #x02 #x03) (ash x -12))
                               (#x0A 4)
                               (#x0F 5)
                               (#x10 6)
                               (#x1D 7)
                               (#x2F 8)
                               (#xE0 9)
                               (t (error "Update the transform function for 0x~5,'0X" x))
                             )
                             (byte 8 12)
                             x
                        )) )
                    (push (make-unicode-char :code (transform code)
                                             :name name-string)
                          all-chars
            ) ) ) ) )
    ) ) ) )
    (setq all-chars (nreverse all-chars))
    ;; Split into words.
    (let ((words-by-length (make-array 0 :adjustable t)))
      (dolist (name (list* "HANGUL SYLLABLE" "CJK COMPATIBILITY" (mapcar #'unicode-char-name all-chars)))
        (let ((i1 0))
          (loop
            (when (>= i1 (length name)) (return))
            (let ((i2 (or (position #\Space name :start i1) (length name))))
              (let* ((word (subseq name i1 i2))
                     (len (length word)))
                (when (>= len (length words-by-length))
                  (adjust-array words-by-length (1+ len))
                )
                (unless (aref words-by-length len)
                  (setf (aref words-by-length len)
                        (make-word-list
                          :hashed (make-hash-table :test #'equal)
                          :sorted '()
                ) )     )
                (let ((word-list (aref words-by-length len)))
                  (unless (gethash word (word-list-hashed word-list))
                    (setf (gethash word (word-list-hashed word-list)) t)
                    (push word (word-list-sorted word-list))
                ) )
              )
              (setq i1 (1+ i2))
      ) ) ) )
      ;; Sort the word lists.
      (dotimes (len (length words-by-length))
        (unless (aref words-by-length len)
          (setf (aref words-by-length len)
                (make-word-list
                  :hashed (make-hash-table :test #'equal)
                  :sorted '()
        ) )     )
        (let ((word-list (aref words-by-length len)))
          (setf (word-list-sorted word-list)
                (sort (word-list-sorted word-list) #'string<)
          )
          (setf (word-list-size word-list)
                (reduce #'+ (mapcar #'length (word-list-sorted word-list)))
          )
          (setf (word-list-length word-list)
                (length (word-list-sorted word-list))
      ) ) )
      ;; Output the tables.
      (with-open-file (ostream outputfile :direction :output
                       #+UNICODE :external-format #+UNICODE charset:ascii)
        (format ostream "/*~%")
        (format ostream " * ~A~%" (file-namestring outputfile))
        (format ostream " *~%")
        (format ostream " * Unicode character name table.~%")
        (format ostream " * Generated automatically by the gen-uninames utility.~%")
        (format ostream " */~%")
        (format ostream "~%")
        (format ostream "static const char unicode_name_words[~D] = {~%"
                        (let ((sum 0))
                          (dotimes (len (length words-by-length))
                            (let ((word-list (aref words-by-length len)))
                              (incf sum (word-list-size word-list))
                          ) )
                          sum
        )               )
        (dotimes (len (length words-by-length))
          (let ((word-list (aref words-by-length len)))
            (dolist (word (word-list-sorted word-list))
              (format ostream " ~{ '~C',~}~%" (coerce word 'list))
        ) ) )
        (format ostream "};~%")
        (format ostream "#define UNICODE_CHARNAME_NUM_WORDS ~D~%"
                        (let ((sum 0))
                          (dotimes (len (length words-by-length))
                            (let ((word-list (aref words-by-length len)))
                              (incf sum (word-list-length word-list))
                          ) )
                          sum
        )               )
        #| ; Redundant data
        (format ostream "static const uint16_t unicode_name_word_offsets[~D] = {~%"
                        (let ((sum 0))
                          (dotimes (len (length words-by-length))
                            (let ((word-list (aref words-by-length len)))
                              (incf sum (word-list-length word-list))
                          ) )
                          sum
        )               )
        (dotimes (len (length words-by-length))
          (let ((word-list (aref words-by-length len)))
            (when (word-list-sorted word-list)
              (format ostream " ")
              (do ((l (word-list-sorted word-list) (cdr l))
                   (offset 0 (+ offset (length (car l)))))
                  ((endp l))
                (format ostream "~<~% ~0,79:; ~D,~>" offset)
              )
              (format ostream "~%")
        ) ) )
        (format ostream "};~%")
        |#
        (format ostream "static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[~D] = {~%"
                        (1+ (length words-by-length))
        )
        (let ((extra-offset 0)
              (ind-offset 0))
          (dotimes (len (length words-by-length))
            (let ((word-list (aref words-by-length len)))
              (format ostream "  { ~D, ~D },~%" extra-offset ind-offset)
              (incf extra-offset (word-list-size word-list))
              (incf ind-offset (word-list-length word-list))
          ) )
          (format ostream "  { ~D, ~D }~%" extra-offset ind-offset)
        )
        (format ostream "};~%")
        (let ((ind-offset 0))
          (dotimes (len (length words-by-length))
            (let ((word-list (aref words-by-length len)))
              (dolist (word (word-list-sorted word-list))
                (setf (gethash word (word-list-hashed word-list)) ind-offset)
                (incf ind-offset)
        ) ) ) )
        (dolist (word '("HANGUL" "SYLLABLE" "CJK" "COMPATIBILITY"))
          (format ostream "#define UNICODE_CHARNAME_WORD_~A ~D~%" word
                          (gethash word (word-list-hashed (aref words-by-length (length word))))
        ) )
        ;; Compute the word-indices for every unicode-char.
        (dolist (uc all-chars)
          (let ((name (unicode-char-name uc))
                (indices '()))
            (let ((i1 0))
              (loop
                (when (>= i1 (length name)) (return))
                (let ((i2 (or (position #\Space name :start i1) (length name))))
                  (let* ((word (subseq name i1 i2))
                         (len (length word)))
                    (push (gethash word (word-list-hashed (aref words-by-length len)))
                          indices
                    )
                  )
                  (setq i1 (1+ i2))
            ) ) )
            (setf (unicode-char-word-indices uc)
                  (coerce (nreverse indices) 'vector)
            )
        ) )
        ;; Sort the list of unicode-chars by word-indices.
        (setq all-chars
              (sort all-chars
                    (lambda (vec1 vec2)
                      (let ((len1 (length vec1))
                            (len2 (length vec2)))
                        (do ((i 0 (1+ i)))
                            (nil)
                          (if (< i len2)
                            (if (< i len1)
                              (cond ((< (aref vec1 i) (aref vec2 i)) (return t))
                                    ((> (aref vec1 i) (aref vec2 i)) (return nil))
                              )
                              (return t)
                            )
                            (return nil)
                    ) ) ) )
                    :key #'unicode-char-word-indices
        )     )
        ;; Output the word-indices.
        (format ostream "static const uint16_t unicode_names[~D] = {~%"
                        (reduce #'+ (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars))
        )
        (let ((i 0))
          (dolist (uc all-chars)
            (format ostream " ~{ ~D,~}"
                            (maplist (lambda (r) (+ (* 2 (car r)) (if (cdr r) 1 0)))
                                     (coerce (unicode-char-word-indices uc) 'list)
                            )
            )
            (when add-comments
              (format ostream "~40T/* ~A */" (unicode-char-name uc))
            )
            (format ostream "~%")
            (setf (unicode-char-word-indices-index uc) i)
            (incf i (length (unicode-char-word-indices uc)))
        ) )
        (format ostream "};~%")
        (format ostream "static const struct { uint16_t code; uint16_t name; } unicode_name_to_code[~D] = {~%"
                        (length all-chars)
        )
        (dolist (uc all-chars)
          (format ostream "  { 0x~4,'0X, ~D },"
                          (unicode-char-code uc)
                          (unicode-char-word-indices-index uc)
          )
          (when add-comments
            (format ostream "~21T/* ~A */" (unicode-char-name uc))
          )
          (format ostream "~%")
        )
        (format ostream "};~%")
        (format ostream "static const struct { uint16_t code; uint16_t name; } unicode_code_to_name[~D] = {~%"
                        (length all-chars)
        )
        (dolist (uc (sort (copy-list all-chars) #'< :key #'unicode-char-code))
          (format ostream "  { 0x~4,'0X, ~D },"
                          (unicode-char-code uc)
                          (unicode-char-word-indices-index uc)
          )
          (when add-comments
            (format ostream "~21T/* ~A */" (unicode-char-name uc))
          )
          (format ostream "~%")
        )
        (format ostream "};~%")
        (format ostream "#define UNICODE_CHARNAME_MAX_LENGTH ~D~%"
                        (reduce #'max (mapcar (lambda (uc) (length (unicode-char-name uc))) all-chars))
        )
        (format ostream "#define UNICODE_CHARNAME_MAX_WORDS ~D~%"
                        (reduce #'max (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars))
        )
      )
) ) )

(main (first *args*) (second *args*))
@


1.1
log
@Initial revision
@
text
@@


1.1.1.1
log
@ftp ftp.gnu.org
@
text
@@
