code.delx.au - gnu-emacs/blob - lisp/international/characters.el

   1 ;;; characters.el --- set syntax and category for multibyte characters
   2
   3 ;; Copyright (C) 1997, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007
   4 ;;   Free Software Foundation, Inc.
   5 ;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   6 ;;   2005, 2006, 2007
   7 ;;   National Institute of Advanced Industrial Science and Technology (AIST)
   8 ;;   Registration Number H14PRO021
   9 ;; Copyright (C) 2003
  10 ;;   National Institute of Advanced Industrial Science and Technology (AIST)
  11 ;;   Registration Number H13PRO009
  12
  13 ;; Keywords: multibyte character, character set, syntax, category
  14
  15 ;; This file is part of GNU Emacs.
  16
  17 ;; GNU Emacs is free software; you can redistribute it and/or modify
  18 ;; it under the terms of the GNU General Public License as published by
  19 ;; the Free Software Foundation; either version 3, or (at your option)
  20 ;; any later version.
  21
  22 ;; GNU Emacs is distributed in the hope that it will be useful,
  23 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  24 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  25 ;; GNU General Public License for more details.
  26
  27 ;; You should have received a copy of the GNU General Public License
  28 ;; along with GNU Emacs; see the file COPYING.  If not, write to the
  29 ;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  30 ;; Boston, MA 02110-1301, USA.
  31
  32 ;;; Commentary:
  33
  34 ;;; Code:
  35
  36 ;;; Predefined categories.
  37
  38 ;; For each character set.
  39
  40 (define-category ?a "ASCII graphic characters 32-126 (ISO646 IRV:1983[4/0])")
  41 (define-category ?l "Latin")
  42 (define-category ?t "Thai")
  43 (define-category ?g "Greek")
  44 (define-category ?b "Arabic")
  45 (define-category ?w "Hebrew")
  46 (define-category ?y "Cyrillic")
  47 (define-category ?k "Japanese katakana")
  48 (define-category ?r "Japanese roman")
  49 (define-category ?c "Chinese")
  50 (define-category ?j "Japanese")
  51 (define-category ?h "Korean")
  52 (define-category ?e "Ethiopic (Ge'ez)")
  53 (define-category ?v "Vietnamese")
  54 (define-category ?i "Indian")
  55 (define-category ?o "Lao")
  56 (define-category ?q "Tibetan")
  57
  58 ;; For each group (row) of 2-byte character sets.
  59
  60 (define-category ?A "Alpha-numeric characters of 2-byte character sets")
  61 (define-category ?C "Chinese (Han) characters of 2-byte character sets")
  62 (define-category ?G "Greek characters of 2-byte character sets")
  63 (define-category ?H "Japanese Hiragana characters of 2-byte character sets")
  64 (define-category ?K "Japanese Katakana characters of 2-byte character sets")
  65 (define-category ?N "Korean Hangul characters of 2-byte character sets")
  66 (define-category ?Y "Cyrillic characters of 2-byte character sets")
  67 (define-category ?I "Indian Glyphs")
  68
  69 ;; For phonetic classifications.
  70
  71 (define-category ?0 "consonant")
  72 (define-category ?1 "base (independent) vowel")
  73 (define-category ?2 "upper diacritical mark (including upper vowel)")
  74 (define-category ?3 "lower diacritical mark (including lower vowel)")
  75 (define-category ?4 "combining tone mark")
  76 (define-category ?5 "symbol")
  77 (define-category ?6 "digit")
  78 (define-category ?7 "vowel-modifying diacritical mark")
  79 (define-category ?8 "vowel-signs")
  80 (define-category ?9 "semivowel lower")
  81
  82 ;; For filling.
  83 (define-category ?| "While filling, we can break a line at this character.")
  84
  85 ;; For indentation calculation.
  86 (define-category ?\s
  87   "This character counts as a space for indentation purposes.")
  88
  89 ;; Keep the following for `kinsoku' processing.  See comments in
  90 ;; kinsoku.el.
  91 (define-category ?> "A character which can't be placed at beginning of line.")
  92 (define-category ?< "A character which can't be placed at end of line.")
  93
  94 ;; Combining
  95 (define-category ?^ "Combining diacritic or mark")
  96 \f
  97 ;;; Setting syntax and category.
  98
  99 ;; ASCII
 100
 101 ;; All ASCII characters have the category `a' (ASCII) and `l' (Latin).
 102 (modify-category-entry '(32 . 127) ?a)
 103 (modify-category-entry '(32 . 127) ?l)
 104
 105 ;; Deal with the CJK charsets first.  Since the syntax of blocks is
 106 ;; defined per charset, and the charsets may contain e.g. Latin
 107 ;; characters, we end up with the wrong syntax definitions if we're
 108 ;; not careful.
 109
 110 ;; Chinese characters (Unicode)
 111 (modify-category-entry '(#x2E80 . #x312F) ?|)
 112 (modify-category-entry '(#x3190 . #x33FF) ?|)
 113 (modify-category-entry '(#x3400 . #x9FAF) ?C)
 114 (modify-category-entry '(#x3400 . #x9FAF) ?c)
 115 (modify-category-entry '(#x3400 . #x9FAF) ?|)
 116 (modify-category-entry '(#xF900 . #xFAFF) ?C)
 117 (modify-category-entry '(#xF900 . #xFAFF) ?c)
 118 (modify-category-entry '(#xF900 . #xFAFF) ?|)
 119 (modify-category-entry '(#x20000 . #x2AFFF) ?|)
 120 (modify-category-entry '(#x2F800 . #x2FFFF) ?|)
 121
 122
 123 ;; Chinese character set (GB2312)
 124
 125 (map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_" #x2121 #x217E)
 126 (map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_" #x2221 #x227E)
 127 (map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_" #x2921 #x297E)
 128
 129 (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?c)
 130 (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?A #x2330 #x2339)
 131 (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?A #x2341 #x235A)
 132 (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?A #x2361 #x237A)
 133 (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?H #x2421 #x247E)
 134 (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?K #x2521 #x257E)
 135 (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?G #x2621 #x267E)
 136 (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?Y #x2721 #x277E)
 137 (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?C #x3021 #x7E7E)
 138
 139 ;; Chinese character set (BIG5)
 140
 141 (map-charset-chars #'modify-category-entry 'big5 ?c)
 142 (map-charset-chars #'modify-category-entry 'big5 ?C #xA259 #xA25F)
 143 (map-charset-chars #'modify-category-entry 'big5 ?C #xA440 #xC67E)
 144 (map-charset-chars #'modify-category-entry 'big5 ?C #xC940 #xF9DF)
 145
 146 ;; Chinese character set (CNS11643)
 147
 148 (dolist (c '(chinese-cns11643-1 chinese-cns11643-2 chinese-cns11643-3
 149              chinese-cns11643-4 chinese-cns11643-5 chinese-cns11643-6
 150              chinese-cns11643-7))
 151   (map-charset-chars #'modify-category-entry c ?c)
 152   (if (eq c 'chinese-cns11643-1)
 153       (map-charset-chars #'modify-category-entry c ?C #x4421 #x7E7E)
 154     (map-charset-chars #'modify-category-entry c ?C)))
 155
 156 ;; Japanese character set (JISX0201, JISX0208, JISX0212, JISX0213)
 157
 158 (map-charset-chars #'modify-category-entry 'katakana-jisx0201 ?k)
 159
 160 (map-charset-chars #'modify-category-entry 'latin-jisx0201 ?r)
 161
 162 (dolist (l '(katakana-jisx0201 japanese-jisx0208 japanese-jisx0212
 163                                japanese-jisx0213-1 japanese-jisx0213-2))
 164   (map-charset-chars #'modify-category-entry l ?j))
 165
 166 ;; Unicode equivalents of JISX0201-kana
 167 (let ((range '(#xff61 . #xff9f)))
 168   (modify-category-entry range  ?k)
 169   (modify-category-entry range ?j)
 170   (modify-category-entry range ?\|))
 171
 172 ;; Katakana block
 173 (let ((range '(#x30a0 . #x30ff)))
 174   ;; ?K is double width, ?k isn't specified
 175   (modify-category-entry range ?K)
 176   (modify-category-entry range ?\|))
 177
 178 ;; Hiragana block
 179 (let ((range '(#x3040 . #x309d)))
 180   ;; ?H is actually defined to be double width
 181   ;;(modify-category-entry range ?H)
 182   (modify-category-entry range ?\|)
 183   )
 184
 185 ;; JISX0208
 186 (map-charset-chars #'modify-syntax-entry 'japanese-jisx0208 "_" #x2121 #x227E)
 187 (map-charset-chars #'modify-syntax-entry 'japanese-jisx0208 "_" #x2821 #x287E)
 188 (let ((chars '(?ー ?゛ ?゜ ?ヽ ?ヾ ?ゝ ?ゞ ?〃 ?仝 ?々 ?〆 ?〇)))
 189   (dolist (elt chars)
 190     (modify-syntax-entry (car chars) "w")))
 191
 192 (map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?A #x2321 #x237E)
 193 (map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?H #x2421 #x247E)
 194 (map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?K #x2521 #x257E)
 195 (map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?G #x2621 #x267E)
 196 (map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?Y #x2721 #x277E)
 197 (map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?C #x3021 #x7E7E)
 198 (modify-category-entry ?ー ?K)
 199 (let ((chars '(?゛ ?゜)))
 200   (while chars
 201     (modify-category-entry (car chars) ?K)
 202     (modify-category-entry (car chars) ?H)
 203     (setq chars (cdr chars))))
 204 (let ((chars '(?ヽ ?ヾ ?ゝ ?ゞ ?〃 ?仝 ?々 ?〆 ?〇)))
 205   (while chars
 206     (modify-category-entry (car chars) ?C)
 207     (setq chars (cdr chars))))
 208
 209 ;; JISX0212
 210
 211 (map-charset-chars #'modify-syntax-entry 'japanese-jisx0212 "_" #x2121 #x237E)
 212
 213 ;; JISX0201-Kana
 214
 215 (let ((chars '(?｡ ?､ ?･)))
 216   (while chars
 217     (modify-syntax-entry (car chars) ".")
 218     (setq chars (cdr chars))))
 219
 220 (modify-syntax-entry ?\｢ "(｣")
 221 (modify-syntax-entry ?\｣ "(｢")
 222
 223 ;; Korean character set (KSC5601)
 224
 225 (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?h)
 226
 227 (map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2121 #x227E)
 228 (map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2621 #x277E)
 229 (map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2830 #x287E)
 230 (map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2930 #x297E)
 231 (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?A #x2330 #x2339)
 232 (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?A #x2341 #x235A)
 233 (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?A #x2361 #x237A)
 234 (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?G #x2521 #x257E)
 235 (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?H #x2A21 #x2A7E)
 236 (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?K #x2B21 #x2B7E)
 237 (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?Y #x2C21 #x2C7E)
 238
 239 ;; These are in more than one charset.
 240 (let ((parens (concat "〈〉《》「」『』【】〔〕〖〗〘〙〚〛"
 241                       "︵︶︷︸︹︺︻︼︽︾︿﹀﹁﹂﹃﹄"
 242                       "（）［］｛｝"))
 243       open close)
 244   (dotimes (i (/ (length parens) 2))
 245     (setq open (aref parens (* i 2))
 246           close (aref parens (1+ (* i 2))))
 247     (modify-syntax-entry open (format "(%c" close))
 248     (modify-syntax-entry close (format ")%c" open))))
 249
 250 ;; Arabic character set
 251
 252 (let ((charsets '(arabic-iso8859-6
 253                   arabic-digit
 254                   arabic-1-column
 255                   arabic-2-column)))
 256   (while charsets
 257     (map-charset-chars #'modify-category-entry (car charsets) ?b)
 258     (setq charsets (cdr charsets))))
 259 (modify-category-entry '(#x600 . #x6ff) ?b)
 260 (modify-category-entry '(#xfb50 . #xfdff) ?b)
 261 (modify-category-entry '(#xfe70 . #xfefe) ?b)
 262
 263 ;; Cyrillic character set (ISO-8859-5)
 264
 265 (modify-syntax-entry ?№ ".")
 266
 267 ;; Ethiopic character set
 268
 269 (modify-category-entry '(#x1200 . #x1399) ?e)
 270 (modify-category-entry '(#x2d80 . #x2dde) ?e)
 271 (let ((chars '(?፡ ?። ?፣ ?፤ ?፥ ?፦ ?፧ ?፨ ?���� ?���� ?���� ?���� ?���� ?����)))
 272   (while chars
 273     (modify-syntax-entry (car chars) ".")
 274     (setq chars (cdr chars))))
 275 (map-charset-chars #'modify-category-entry 'ethiopic ?e)
 276
 277 ;; Hebrew character set (ISO-8859-8)
 278
 279 (modify-syntax-entry #x5be ".") ; MAQAF
 280 (modify-syntax-entry #x5c0 ".") ; PASEQ
 281 (modify-syntax-entry #x5c3 ".") ; SOF PASUQ
 282 (modify-syntax-entry #x5f3 ".") ; GERESH
 283 (modify-syntax-entry #x5f4 ".") ; GERSHAYIM
 284
 285 ;; Indian character set (IS 13194 and other Emacs original Indian charsets)
 286
 287 (modify-category-entry '(#x901 . #x970) ?i)
 288 (map-charset-chars #'modify-category-entry 'indian-is13194 ?i)
 289 (map-charset-chars #'modify-category-entry 'indian-2-column ?i)
 290
 291 ;; Lao character set
 292
 293 (modify-category-entry '(#xe80 . #xeff) ?o)
 294 (map-charset-chars #'modify-category-entry 'lao ?o)
 295
 296 (let ((deflist  '(("ກ-ຮ"    "w"     ?0) ; consonant
 297                   ("ະາຳຽເ-ໄ"        "w"     ?1) ; vowel base
 298                   ("ັິ-ືົໍ"   "w"     ?2) ; vowel upper
 299                   ("ຸູ"     "w"     ?3) ; vowel lower
 300                   ("່-໋"    "w"     ?4) ; tone mark
 301                   ("ຼຽ"     "w"     ?9) ; semivowel lower
 302                   ("໐-໙"    "w"     ?6) ; digit
 303                   ("ຯໆ"     "_"     ?5) ; symbol
 304                   ))
 305       elm chars len syntax category to ch i)
 306   (while deflist
 307     (setq elm (car deflist))
 308     (setq chars (car elm)
 309           len (length chars)
 310           syntax (nth 1 elm)
 311           category (nth 2 elm)
 312           i 0)
 313     (while (< i len)
 314       (if (= (aref chars i) ?-)
 315           (setq i (1+ i)
 316                 to (aref chars i))
 317         (setq ch (aref chars i)
 318               to ch))
 319       (while (<= ch to)
 320         (unless (string-equal syntax "w")
 321           (modify-syntax-entry ch syntax))
 322         (modify-category-entry ch category)
 323         (setq ch (1+ ch)))
 324       (setq i (1+ i)))
 325     (setq deflist (cdr deflist))))
 326
 327 ;; Thai character set (TIS620)
 328
 329 (modify-category-entry '(#xe00 . #xe7f) ?t)
 330 (map-charset-chars #'modify-category-entry 'thai-tis620 ?t)
 331
 332 (let ((deflist  '(;; chars      syntax  category
 333                   ("ก-รลว-ฮ"  "w"     ?0) ; consonant
 334                   ("ฤฦะาำเ-ๅ"     "w"     ?1) ; vowel base
 335                   ("ัิ-ื็๎"   "w"     ?2) ; vowel upper
 336                   ("ุ-ฺ"    "w"     ?3) ; vowel lower
 337                   ("่-ํ"    "w"     ?4) ; tone mark
 338                   ("๐-๙"    "w"     ?6) ; digit
 339                   ("ฯๆ฿๏๚๛" "_"     ?5) ; symbol
 340                   ))
 341       elm chars len syntax category to ch i)
 342   (while deflist
 343     (setq elm (car deflist))
 344     (setq chars (car elm)
 345           len (length chars)
 346           syntax (nth 1 elm)
 347           category (nth 2 elm)
 348           i 0)
 349     (while (< i len)
 350       (if (= (aref chars i) ?-)
 351           (setq i (1+ i)
 352                 to (aref chars i))
 353         (setq ch (aref chars i)
 354               to ch))
 355       (while (<= ch to)
 356         (unless (string-equal syntax "w")
 357           (modify-syntax-entry ch syntax))
 358         (modify-category-entry ch category)
 359         (setq ch (1+ ch)))
 360       (setq i (1+ i)))
 361     (setq deflist (cdr deflist))))
 362
 363 ;; Tibetan character set
 364
 365 (modify-category-entry '(#xf00 . #xfff) ?q)
 366 (map-charset-chars #'modify-category-entry 'tibetan ?q)
 367 (map-charset-chars #'modify-category-entry 'tibetan-1-column ?q)
 368
 369 (let ((deflist  '(;; chars             syntax category
 370                   ("ཀ-ཀྵཪ"         "w"     ?0) ; consonant
 371                   ("ྐ-ྐྵྺྻྼ��������"       "w"     ?0) ;
 372                   ("����-����"              "w"     ?0) ;
 373                   ("����-����"              "w"     ?0) ;
 374                   ("ིེཻོཽྀ"       "w"       ?2) ; upper vowel
 375                   ("ཾྂྃ྆྇ྈྉྊྋ" "w"    ?2) ; upper modifier
 376                   ("༙����྄ཱུ༵༷"       "w"   ?3) ; lowel vowel/modifier
 377                   ("཰"                "w" ?3)             ; invisible vowel a
 378                   ("༠-༩༪-༳"             "w"     ?6) ; digit
 379                   ("་།-༒༔ཿ"        "."     ?|) ; line-break char
 380                   ("་།༏༐༑༔ཿ"            "."     ?|) ;
 381                   ("༈་།-༒༔ཿ༽༴"  "."     ?>) ; prohibition
 382                   ("་།༏༐༑༔ཿ"            "."     ?>) ;
 383                   ("ༀ-༊༼࿁࿂྅"      "."     ?<) ; prohibition
 384                   ("༓༕-༘༚-༟༶༸-༻༾༿྾྿-࿏" "." ?q) ; others
 385                   ))
 386       elm chars len syntax category to ch i)
 387   (while deflist
 388     (setq elm (car deflist))
 389     (setq chars (car elm)
 390           len (length chars)
 391           syntax (nth 1 elm)
 392           category (nth 2 elm)
 393           i 0)
 394     (while (< i len)
 395       (if (= (aref chars i) ?-)
 396           (setq i (1+ i)
 397                 to (aref chars i))
 398         (setq ch (aref chars i)
 399               to ch))
 400       (while (<= ch to)
 401         (unless (string-equal syntax "w")
 402           (modify-syntax-entry ch syntax))
 403         (modify-category-entry ch category)
 404         (setq ch (1+ ch)))
 405       (setq i (1+ i)))
 406     (setq deflist (cdr deflist))))
 407
 408 ;; Vietnamese character set
 409
 410 ;; To make a word with Latin characters
 411 (map-charset-chars #'modify-category-entry 'vietnamese-viscii-lower ?l)
 412 (map-charset-chars #'modify-category-entry 'vietnamese-viscii-lower ?v)
 413
 414 (map-charset-chars #'modify-category-entry 'vietnamese-viscii-upper ?l)
 415 (map-charset-chars #'modify-category-entry 'vietnamese-viscii-upper ?v)
 416
 417 (let ((tbl (standard-case-table))
 418       (i 32))
 419   (while (< i 128)
 420     (let* ((char (decode-char 'vietnamese-viscii-upper i))
 421            (charl (decode-char 'vietnamese-viscii-lower i))
 422            (uc (encode-char char 'ucs))
 423            (lc (encode-char charl 'ucs)))
 424       (set-case-syntax-pair char (decode-char 'vietnamese-viscii-lower i)
 425                             tbl)
 426       (if uc (modify-category-entry uc ?v))
 427       (if lc (modify-category-entry lc ?v)))
 428     (setq i (1+ i))))
 429
 430 ;; Tai Viet
 431 (let ((deflist '(;; chars       syntax  category
 432                  ((?ꪀ.  ?ꪯ) "w"     ?0) ; cosonant
 433                  ("ꪱꪵꪶ"           "w"     ?1) ; vowel base
 434                  ((?ꪹ . ?ꪽ) "w"     ?1) ; vowel base
 435                  ("ꪰꪲꪳꪷꪸꪾ"  "w"     ?2) ; vowel upper
 436                  ("ꪴ"         "w"     ?3) ; vowel lower
 437                  ("ꫀꫂ"              "w"     ?1) ; non-combining tone-mark
 438                  ("꪿꫁"              "w"     ?4) ; combining tone-mark
 439                  ((?ꫛ . ?꫟) "_"     ?5) ; symbol
 440                  )))
 441   (dolist (elm deflist)
 442     (let ((chars (car elm))
 443           (syntax (nth 1 elm))
 444           (category (nth 2 elm)))
 445       (if (consp chars)
 446           (progn
 447             (modify-syntax-entry chars syntax)
 448             (modify-category-entry chars category))
 449         (mapc #'(lambda (x)
 450                   (modify-syntax-entry x syntax)
 451                   (modify-category-entry x category))
 452               chars)))))
 453
 454 ;; Latin
 455
 456 (modify-category-entry '(#x80 . #x024F) ?l)
 457
 458 (let ((tbl (standard-case-table)) c)
 459
 460   ;; Latin-1
 461
 462   ;; Fixme: Some of the non-word syntaxes here perhaps should be
 463   ;; reviewed.  (Note that the following all implicitly have word
 464   ;; syntax: ¢£¤¥¨ª¯²³´¶¸¹º.)  There should be a well-defined way of
 465   ;; relating Unicode categories to Emacs syntax codes.
 466
 467   ;; NBSP isn't semantically interchangeable with other whitespace chars,
 468   ;; so it's more like punctation.
 469   (set-case-syntax ?  "." tbl)
 470   (set-case-syntax ?¡ "." tbl)
 471   (set-case-syntax ?¦ "_" tbl)
 472   (set-case-syntax ?§ "." tbl)
 473   (set-case-syntax ?© "_" tbl)
 474   (set-case-syntax-delims 171 187 tbl)  ; « »
 475   (set-case-syntax ?¬ "_" tbl)
 476   (set-case-syntax ? "_" tbl)
 477   (set-case-syntax ?® "_" tbl)
 478   (set-case-syntax ?° "_" tbl)
 479   (set-case-syntax ?± "_" tbl)
 480   (set-case-syntax ?µ "_" tbl)
 481   (set-case-syntax ?· "_" tbl)
 482   (set-case-syntax ?¼ "_" tbl)
 483   (set-case-syntax ?½ "_" tbl)
 484   (set-case-syntax ?¾ "_" tbl)
 485   (set-case-syntax ?¿ "." tbl)
 486   (let ((c 192))
 487     (while (<= c 222)
 488       (set-case-syntax-pair c (+ c 32) tbl)
 489       (setq c (1+ c))))
 490   (set-case-syntax ?× "_" tbl)
 491   (set-case-syntax ?ß "w" tbl)
 492   (set-case-syntax ?÷ "_" tbl)
 493   ;; See below for ÿ.
 494
 495   ;; Latin Extended-A, Latin Extended-B
 496   (setq c #x0100)
 497   (while (<= c #x02B8)
 498     (modify-category-entry c ?l)
 499     (setq c (1+ c)))
 500
 501   (let ((pair-ranges '((#x0100 . #x012F)
 502                        (#x0132 . #x0137)
 503                        (#x0139 . #x0148)
 504                        (#x014a . #x0177)
 505                        (#x0179 . #x017E)
 506                        (#x0182 . #x0185)
 507                        (#x0187 . #x018C)
 508                        (#x0191 . #x0192)
 509                        (#x0198 . #x0199)
 510                        (#x01A0 . #x01A5)
 511                        (#x01A7 . #x01A8)
 512                        (#x01AC . #x01AD)
 513                        (#x01AF . #x01B0)
 514                        (#x01B3 . #x01B6)
 515                        (#x01BC . #x01BD)
 516                        (#x01CD . #x01DC)
 517                        (#x01DE . #x01EF)
 518                        (#x01F4 . #x01F5)
 519                        (#x01F8 . #x021F)
 520                        (#x0222 . #x0233)
 521                        (#x023B . #x023C)
 522                        (#x0241 . #x0242)
 523                        (#x0246 . #x024F))))
 524     (dolist (elt pair-ranges)
 525       (let ((from (car elt)) (to (cdr elt)))
 526         (while (< from to)
 527           (set-case-syntax-pair from (1+ from) tbl)
 528           (setq from (+ from 2))))))
 529
 530   ;; In some languages, such as Turkish, U+0049 LATIN CAPITAL LETTER I
 531   ;; and U+0131 LATIN SMALL LETTER DOTLESS I make a case pair, and so
 532   ;; do U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE and U+0069 LATIN
 533   ;; SMALL LETTER I.
 534
 535   ;; We used to set up half of those correspondence unconditionally,
 536   ;; but that makes searches slow.  So now we don't set up either half
 537   ;; of these correspondences by default.
 538
 539   ;; (set-downcase-syntax  ?İ ?i tbl)
 540   ;; (set-upcase-syntax    ?I ?ı tbl)
 541
 542   (set-case-syntax-pair ?Ǆ ?ǆ tbl)
 543   (set-case-syntax-pair ?ǅ ?ǆ tbl)
 544   (set-case-syntax-pair ?Ǉ ?ǉ tbl)
 545   (set-case-syntax-pair ?ǈ ?ǉ tbl)
 546   (set-case-syntax-pair ?Ǌ ?ǌ tbl)
 547   (set-case-syntax-pair ?ǋ ?ǌ tbl)
 548
 549   ;; 01F0; F; 006A 030C; # LATIN SMALL LETTER J WITH CARON
 550   (set-case-syntax-pair ?Ǳ ?ǳ tbl)
 551   (set-case-syntax-pair ?ǲ ?ǳ tbl)
 552   (set-case-syntax-pair ?Ƕ ?ƕ tbl)
 553   (set-case-syntax-pair ?Ƿ ?ƿ tbl)
 554
 555   ;; Latin Extended Additional
 556   (modify-category-entry '(#x1e00 . #x1ef9) ?l)
 557   (setq c #x1e00)
 558   (while (<= c #x1ef9)
 559     (and (zerop (% c 2))
 560          (or (<= c #x1e94) (>= c #x1ea0))
 561          (set-case-syntax-pair c (1+ c) tbl))
 562     (setq c (1+ c)))
 563
 564   ;; Greek
 565   (modify-category-entry '(#x0370 . #x03ff) ?g)
 566   (setq c #x0370)
 567   (while (<= c #x03ff)
 568     (if (or (and (>= c #x0391) (<= c #x03a1))
 569             (and (>= c #x03a3) (<= c #x03ab)))
 570         (set-case-syntax-pair c (+ c 32) tbl))
 571     (and (>= c #x03da)
 572          (<= c #x03ee)
 573          (zerop (% c 2))
 574          (set-case-syntax-pair c (1+ c) tbl))
 575     (setq c (1+ c)))
 576   (set-case-syntax-pair ?Ά ?ά tbl)
 577   (set-case-syntax-pair ?Έ ?έ tbl)
 578   (set-case-syntax-pair ?Ή ?ή tbl)
 579   (set-case-syntax-pair ?Ί ?ί tbl)
 580   (set-case-syntax-pair ?Ό ?ό tbl)
 581   (set-case-syntax-pair ?Ύ ?ύ tbl)
 582   (set-case-syntax-pair ?Ώ ?ώ tbl)
 583
 584   ;; Armenian
 585   (setq c #x531)
 586   (while (<= c #x556)
 587     (set-case-syntax-pair c (+ c #x30) tbl)
 588     (setq c (1+ c)))
 589
 590   ;; Greek Extended
 591   (modify-category-entry '(#x1f00 . #x1fff) ?g)
 592   (setq c #x1f00)
 593   (while (<= c #x1fff)
 594     (and (<= (logand c #x000f) 7)
 595          (<= c #x1fa7)
 596          (not (memq c '(#x1f50 #x1f52 #x1f54 #x1f56)))
 597          (/= (logand c #x00f0) 7)
 598          (set-case-syntax-pair (+ c 8) c tbl))
 599     (setq c (1+ c)))
 600   (set-case-syntax-pair ?Ᾰ ?ᾰ tbl)
 601   (set-case-syntax-pair ?Ᾱ ?ᾱ tbl)
 602   (set-case-syntax-pair ?Ὰ ?ὰ tbl)
 603   (set-case-syntax-pair ?Ά ?ά tbl)
 604   (set-case-syntax-pair ?ᾼ ?ᾳ tbl)
 605   (set-case-syntax-pair ?Ὲ ?ὲ tbl)
 606   (set-case-syntax-pair ?Έ ?έ tbl)
 607   (set-case-syntax-pair ?Ὴ ?ὴ tbl)
 608   (set-case-syntax-pair ?Ή ?ή tbl)
 609   (set-case-syntax-pair ?ῌ ?ῃ tbl)
 610   (set-case-syntax-pair ?Ῐ ?ῐ tbl)
 611   (set-case-syntax-pair ?Ῑ ?ῑ tbl)
 612   (set-case-syntax-pair ?Ὶ ?ὶ tbl)
 613   (set-case-syntax-pair ?Ί ?ί tbl)
 614   (set-case-syntax-pair ?Ῠ ?ῠ tbl)
 615   (set-case-syntax-pair ?Ῡ ?ῡ tbl)
 616   (set-case-syntax-pair ?Ὺ ?ὺ tbl)
 617   (set-case-syntax-pair ?Ύ ?ύ tbl)
 618   (set-case-syntax-pair ?Ῥ ?ῥ tbl)
 619   (set-case-syntax-pair ?Ὸ ?ὸ tbl)
 620   (set-case-syntax-pair ?Ό ?ό tbl)
 621   (set-case-syntax-pair ?Ὼ ?ὼ tbl)
 622   (set-case-syntax-pair ?Ώ ?ώ tbl)
 623   (set-case-syntax-pair ?ῼ ?ῳ tbl)
 624
 625   ;; cyrillic
 626   (modify-category-entry '(#x0400 . #x04FF) ?y)
 627   (setq c #x0400)
 628   (while (<= c #x04ff)
 629     (and (>= c #x0400)
 630          (<= c #x040f)
 631          (set-case-syntax-pair c (+ c 80) tbl))
 632     (and (>= c #x0410)
 633          (<= c #x042f)
 634          (set-case-syntax-pair c (+ c 32) tbl))
 635     (and (zerop (% c 2))
 636          (or (and (>= c #x0460) (<= c #x0480))
 637              (and (>= c #x048c) (<= c #x04be))
 638              (and (>= c #x04d0) (<= c #x04f4)))
 639          (set-case-syntax-pair c (1+ c) tbl))
 640     (setq c (1+ c)))
 641   (set-case-syntax-pair ?Ӂ ?ӂ tbl)
 642   (set-case-syntax-pair ?Ӄ ?ӄ tbl)
 643   (set-case-syntax-pair ?Ӈ ?ӈ tbl)
 644   (set-case-syntax-pair ?Ӌ ?ӌ tbl)
 645   (set-case-syntax-pair ?Ӹ ?ӹ tbl)
 646
 647   ;; general punctuation
 648   (setq c #x2000)
 649   (while (<= c #x200b)
 650     (set-case-syntax c " " tbl)
 651     (setq c (1+ c)))
 652   (while (<= c #x200F)
 653     (set-case-syntax c "." tbl)
 654     (setq c (1+ c)))
 655   ;; Fixme: These aren't all right:
 656   (setq c #x2010)
 657   (while (<= c #x2016)
 658     (set-case-syntax c "_" tbl)
 659     (setq c (1+ c)))
 660   ;; Punctuation syntax for quotation marks (like `)
 661   (while (<= c #x201f)
 662     (set-case-syntax  c "." tbl)
 663     (setq c (1+ c)))
 664   ;; Fixme: These aren't all right:
 665   (while (<= c #x2027)
 666     (set-case-syntax c "_" tbl)
 667     (setq c (1+ c)))
 668   (while (<= c #x206F)
 669     (set-case-syntax c "." tbl)
 670     (setq c (1+ c)))
 671
 672   ;; Roman numerals
 673   (setq c #x2160)
 674   (while (<= c #x216f)
 675     (set-case-syntax-pair c (+ c #x10) tbl)
 676     (setq c (1+ c)))
 677
 678   ;; Fixme: The following blocks might be better as symbol rather than
 679   ;; punctuation.
 680   ;; Arrows
 681   (setq c #x2190)
 682   (while (<= c #x21FF)
 683     (set-case-syntax c "." tbl)
 684     (setq c (1+ c)))
 685   ;; Mathematical Operators
 686   (while (<= c #x22FF)
 687     (set-case-syntax c "." tbl)
 688     (setq c (1+ c)))
 689   ;; Miscellaneous Technical
 690   (while (<= c #x23FF)
 691     (set-case-syntax c "." tbl)
 692     (setq c (1+ c)))
 693   ;; Control Pictures
 694   (while (<= c #x243F)
 695     (set-case-syntax c "_" tbl)
 696     (setq c (1+ c)))
 697
 698   ;; Circled Latin
 699   (setq c #x24b6)
 700   (while (<= c #x24cf)
 701     (set-case-syntax-pair c (+ c 26) tbl)
 702     (modify-category-entry c ?l)
 703     (modify-category-entry (+ c 26) ?l)
 704     (setq c (1+ c)))
 705
 706   ;; Fullwidth Latin
 707   (setq c #xff21)
 708   (while (<= c #xff3a)
 709     (set-case-syntax-pair c (+ c #x20) tbl)
 710     (modify-category-entry c ?l)
 711     (modify-category-entry (+ c #x20) ?l)
 712     (setq c (1+ c)))
 713
 714   ;; Combining diacritics
 715   (modify-category-entry '(#x300 . #x362) ?^)
 716   ;; Combining marks
 717   (modify-category-entry '(#x20d0 . #x20e3) ?^)
 718
 719   ;; Fixme: syntax for symbols &c
 720   )
 721
 722 (let ((pairs
 723        '("⁅⁆"                               ; U+2045 U+2046
 724          "⁽⁾"                               ; U+207D U+207E
 725          "₍₎"                               ; U+208D U+208E
 726          "〈〉"                               ; U+2329 U+232A
 727          "⎴⎵"                               ; U+23B4 U+23B5
 728          "❨❩"                               ; U+2768 U+2769
 729          "❪❫"                               ; U+276A U+276B
 730          "❬❭"                               ; U+276C U+276D
 731          "❰❱"                               ; U+2770 U+2771
 732          "❲❳"                               ; U+2772 U+2773
 733          "❴❵"                               ; U+2774 U+2775
 734          "⟦⟧"                               ; U+27E6 U+27E7
 735          "⟨⟩"                               ; U+27E8 U+27E9
 736          "⟪⟫"                               ; U+27EA U+27EB
 737          "⦃⦄"                               ; U+2983 U+2984
 738          "⦅⦆"                               ; U+2985 U+2986
 739          "⦇⦈"                               ; U+2987 U+2988
 740          "⦉⦊"                               ; U+2989 U+298A
 741          "⦋⦌"                               ; U+298B U+298C
 742          "⦍⦎"                               ; U+298D U+298E
 743          "⦏⦐"                               ; U+298F U+2990
 744          "⦑⦒"                               ; U+2991 U+2992
 745          "⦓⦔"                               ; U+2993 U+2994
 746          "⦕⦖"                               ; U+2995 U+2996
 747          "⦗⦘"                               ; U+2997 U+2998
 748          "⧼⧽"                               ; U+29FC U+29FD
 749          "〈〉"                               ; U+3008 U+3009
 750          "《》"                               ; U+300A U+300B
 751          "「」"                               ; U+300C U+300D
 752          "『』"                               ; U+300E U+300F
 753          "【】"                               ; U+3010 U+3011
 754          "〔〕"                               ; U+3014 U+3015
 755          "〖〗"                               ; U+3016 U+3017
 756          "〘〙"                               ; U+3018 U+3019
 757          "〚〛"                               ; U+301A U+301B
 758          "﴾﴿"                               ; U+FD3E U+FD3F
 759          "︵︶"                               ; U+FE35 U+FE36
 760          "︷︸"                               ; U+FE37 U+FE38
 761          "︹︺"                               ; U+FE39 U+FE3A
 762          "︻︼"                               ; U+FE3B U+FE3C
 763          "︽︾"                               ; U+FE3D U+FE3E
 764          "︿﹀"                               ; U+FE3F U+FE40
 765          "﹁﹂"                               ; U+FE41 U+FE42
 766          "﹃﹄"                               ; U+FE43 U+FE44
 767          "﹙﹚"                               ; U+FE59 U+FE5A
 768          "﹛﹜"                               ; U+FE5B U+FE5C
 769          "﹝﹞"                               ; U+FE5D U+FE5E
 770          "（）"                               ; U+FF08 U+FF09
 771          "［］"                               ; U+FF3B U+FF3D
 772          "｛｝"                               ; U+FF5B U+FF5D
 773          "｟｠"                               ; U+FF5F U+FF60
 774          "｢｣"                               ; U+FF62 U+FF63
 775          )))
 776   (dolist (elt pairs)
 777     (modify-syntax-entry (aref elt 0) (string ?\( (aref elt 1)))
 778     (modify-syntax-entry (aref elt 1) (string ?\) (aref elt 0)))))
 779
 780 \f
;; For each character set, record the coding system best suited to
;; encoding it in the charset's `preferred-coding-system' property.
 783
 784 ;; Fixme: should this be junked?
 785 (let ((l '((latin-iso8859-1     . iso-latin-1)
 786            (latin-iso8859-2     . iso-latin-2)
 787            (latin-iso8859-3     . iso-latin-3)
 788            (latin-iso8859-4     . iso-latin-4)
 789            (thai-tis620         . thai-tis620)
 790            (greek-iso8859-7     . greek-iso-8bit)
 791            (arabic-iso8859-6    . iso-2022-7bit)
 792            (hebrew-iso8859-8    . hebrew-iso-8bit)
 793            (katakana-jisx0201   . japanese-shift-jis)
 794            (latin-jisx0201      . japanese-shift-jis)
 795            (cyrillic-iso8859-5  . cyrillic-iso-8bit)
 796            (latin-iso8859-9     . iso-latin-5)
 797            (japanese-jisx0208-1978 . iso-2022-jp)
 798            (chinese-gb2312      . chinese-iso-8bit)
 799            (chinese-gbk         . chinese-gbk)
 800            (gb18030-2-byte      . chinese-gb18030)
 801            (gb18030-4-byte-bmp  . chinese-gb18030)
 802            (gb18030-4-byte-smp  . chinese-gb18030)
 803            (gb18030-4-byte-ext-1 . chinese-gb18030)
 804            (gb18030-4-byte-ext-2 . chinese-gb18030)
 805            (japanese-jisx0208   . iso-2022-jp)
 806            (korean-ksc5601      . iso-2022-kr)
 807            (japanese-jisx0212   . iso-2022-jp)
 808            (chinese-big5-1      . chinese-big5)
 809            (chinese-big5-2      . chinese-big5)
 810            (chinese-sisheng     . iso-2022-7bit)
 811            (ipa                 . iso-2022-7bit)
 812            (vietnamese-viscii-lower . vietnamese-viscii)
 813            (vietnamese-viscii-upper . vietnamese-viscii)
 814            (arabic-digit        . iso-2022-7bit)
 815            (arabic-1-column     . iso-2022-7bit)
 816            (lao                 . lao)
 817            (arabic-2-column     . iso-2022-7bit)
 818            (indian-is13194      . devanagari)
 819            (indian-glyph        . devanagari)
 820            (tibetan-1-column    . tibetan)
 821            (ethiopic            . iso-2022-7bit)
 822            (chinese-cns11643-1  . iso-2022-cn)
 823            (chinese-cns11643-2  . iso-2022-cn)
 824            (chinese-cns11643-3  . iso-2022-cn)
 825            (chinese-cns11643-4  . iso-2022-cn)
 826            (chinese-cns11643-5  . iso-2022-cn)
 827            (chinese-cns11643-6  . iso-2022-cn)
 828            (chinese-cns11643-7  . iso-2022-cn)
 829            (indian-2-column     . devanagari)
 830            (tibetan             . tibetan)
 831            (latin-iso8859-14    . iso-latin-8)
 832            (latin-iso8859-15    . iso-latin-9))))
 833   (while l
 834     (put-charset-property (car (car l)) 'preferred-coding-system (cdr (car l)))
 835     (setq l (cdr l))))
 836
 837 \f
 838 ;; Setup auto-fill-chars for charsets that should invoke auto-filling.
 839 ;; SPACE and NEWLINE are already set.
 840
 841 (set-char-table-range auto-fill-chars '(#x3041 . #x30FF) t)
 842 (set-char-table-range auto-fill-chars '(#x3400 . #x4DB5) t)
 843 (set-char-table-range auto-fill-chars '(#x4e00 . #x9fbb) t)
 844 (set-char-table-range auto-fill-chars '(#xF900 . #xFAFF) t)
 845 (set-char-table-range auto-fill-chars '(#xFF00 . #xFF9F) t)
 846 (set-char-table-range auto-fill-chars '(#x20000 . #x2FFFF) t)
 847
 848 \f
 849 ;;; Setting char-width-table.  The default is 1.
 850
 851 ;; 0: non-spacing, enclosing combining, formatting, Hangul Jamo medial
 852 ;;    and final characters.
 853 (let ((l '((#x00AD . #x00AD)
 854            (#x0300 . #x036F)
 855            (#x0483 . #x0489)
 856            (#x0591 . #x05BD)
 857            (#x05BF . #x05BF)
 858            (#x05C1 . #x05C2)
 859            (#x05C4 . #x05C5)
 860            (#x05C7 . #x05C7)
 861            (#x0600 . #x0603)
 862            (#x0610 . #x0615)
 863            (#x064B . #x065E)
 864            (#x0670 . #x0670)
 865            (#x06D6 . #x06E4)
 866            (#x06E7 . #x06E8)
 867            (#x06EA . #x06ED)
 868            (#x070F . #x070F)
 869            (#x0711 . #x0711)
 870            (#x0730 . #x074A)
 871            (#x07A6 . #x07B0)
 872            (#x07EB . #x07F3)
 873            (#x0901 . #x0902)
 874            (#x093C . #x093C)
 875            (#x0941 . #x0948)
 876            (#x094D . #x094D)
 877            (#x0951 . #x0954)
 878            (#x0962 . #x0963)
 879            (#x0981 . #x0981)
 880            (#x09BC . #x09BC)
 881            (#x09C1 . #x09C4)
 882            (#x09CD . #x09CD)
 883            (#x09E2 . #x09E3)
 884            (#x0A01 . #x0A02)
 885            (#x0A3C . #x0A3C)
 886            (#x0A41 . #x0A4D)
 887            (#x0A70 . #x0A71)
 888            (#x0A81 . #x0A82)
 889            (#x0ABC . #x0ABC)
 890            (#x0AC1 . #x0AC8)
 891            (#x0ACD . #x0ACD)
 892            (#x0AE2 . #x0AE3)
 893            (#x0B01 . #x0B01)
 894            (#x0B3C . #x0B3C)
 895            (#x0B3F . #x0B3F)
 896            (#x0B41 . #x0B43)
 897            (#x0B4D . #x0B56)
 898            (#x0B82 . #x0B82)
 899            (#x0BC0 . #x0BC0)
 900            (#x0BCD . #x0BCD)
 901            (#x0C3E . #x0C40)
 902            (#x0C46 . #x0C56)
 903            (#x0CBC . #x0CBC)
 904            (#x0CBF . #x0CBF)
 905            (#x0CC6 . #x0CC6)
 906            (#x0CCC . #x0CCD)
 907            (#x0CE2 . #x0CE3)
 908            (#x0D41 . #x0D43)
 909            (#x0D4D . #x0D4D)
 910            (#x0DCA . #x0DCA)
 911            (#x0DD2 . #x0DD6)
 912            (#x0E31 . #x0E31)
 913            (#x0E34 . #x0E3A)
 914            (#x0E47 . #x0E4E)
 915            (#x0EB1 . #x0EB1)
 916            (#x0EB4 . #x0EBC)
 917            (#x0EC8 . #x0ECD)
 918            (#x0F18 . #x0F19)
 919            (#x0F35 . #x0F35)
 920            (#x0F37 . #x0F37)
 921            (#x0F39 . #x0F39)
 922            (#x0F71 . #x0F7E)
 923            (#x0F80 . #x0F84)
 924            (#x0F86 . #x0F87)
 925            (#x0F90 . #x0FBC)
 926            (#x0FC6 . #x0FC6)
 927            (#x102D . #x1030)
 928            (#x1032 . #x1037)
 929            (#x1039 . #x1039)
 930            (#x1058 . #x1059)
 931            (#x1160 . #x11FF)
 932            (#x135F . #x135F)
 933            (#x1712 . #x1714)
 934            (#x1732 . #x1734)
 935            (#x1752 . #x1753)
 936            (#x1772 . #x1773)
 937            (#x17B4 . #x17B5)
 938            (#x17B7 . #x17BD)
 939            (#x17C6 . #x17C6)
 940            (#x17C9 . #x17D3)
 941            (#x17DD . #x17DD)
 942            (#x180B . #x180D)
 943            (#x18A9 . #x18A9)
 944            (#x1920 . #x1922)
 945            (#x1927 . #x1928)
 946            (#x1932 . #x1932)
 947            (#x1939 . #x193B)
 948            (#x1A17 . #x1A18)
 949            (#x1B00 . #x1B03)
 950            (#x1B34 . #x1B34)
 951            (#x1B36 . #x1B3A)
 952            (#x1B3C . #x1B3C)
 953            (#x1B42 . #x1B42)
 954            (#x1B6B . #x1B73)
 955            (#x1DC0 . #x1DFF)
 956            (#x200B . #x200F)
 957            (#x202A . #x202E)
 958            (#x2060 . #x206F)
 959            (#x20D0 . #x20EF)
 960            (#x302A . #x302F)
 961            (#x3099 . #x309A)
 962            (#xA806 . #xA806)
 963            (#xA80B . #xA80B)
 964            (#xA825 . #xA826)
 965            (#xFB1E . #xFB1E)
 966            (#xFE00 . #xFE0F)
 967            (#xFE20 . #xFE23)
 968            (#xFEFF . #xFEFF)
 969            (#xFFF9 . #xFFFB)
 970            (#x10A01 . #x10A0F)
 971            (#x10A38 . #x10A3F)
 972            (#x1D167 . #x1D169)
 973            (#x1D173 . #x1D182)
 974            (#x1D185 . #x1D18B)
 975            (#x1D1AA . #x1D1AD)
 976            (#x1D242 . #x1D244)
 977            (#xE0001 . #xE01EF))))
 978   (dolist (elt l)
 979     (set-char-table-range char-width-table elt 0)))
 980
 981 ;; 2: East Asian Wide and Full-width characters.
 982 (let ((l '((#x1100 . #x115F)
 983            (#x2329 . #x232A)
 984            (#x2E80 . #x303E)
 985            (#x3040 . #xA4CF)
 986            (#xAC00 . #xD7A3)
 987            (#xF900 . #xFAFF)
 988            (#xFE30 . #xFE6F)
 989            (#xFF01 . #xFF60)
 990            (#xFFE0 . #xFFE6)
 991            (#x20000 . #x2FFFF)
 992            (#x30000 . #x3FFFF))))
 993   (dolist (elt l)
 994     (set-char-table-range char-width-table elt 2)))
 995
 996 ;; Other double width
 997 ;;(map-charset-chars
 998 ;; (lambda (range ignore) (set-char-table-range char-width-table range 2))
 999 ;; 'ethiopic)
1000 ;; (map-charset-chars
1001 ;;  (lambda (range ignore) (set-char-table-range char-width-table range 2))
1002 ;; 'tibetan)
;; Every character range reported by `map-charset-chars' for these
;; charsets gets a display width of 2.
(dolist (two-column-charset '(indian-2-column arabic-2-column))
  (map-charset-chars
   (lambda (chars _arg)
     (set-char-table-range char-width-table chars 2))
   two-column-charset))
1009
;; Run `optimize-char-table' on the standard case, category and syntax
;; tables, in that order.
(dolist (table (list (standard-case-table)
                     (standard-category-table)
                     (standard-syntax-table)))
  (optimize-char-table table))
1013
1014 ;; The Unicode blocks actually extend past some of these ranges with
1015 ;; undefined codepoints.
;; Each entry is (FROM TO SCRIPT): characters FROM..TO are assigned
;; SCRIPT in `char-script-table'.  The distinct script names, in order
;; of first appearance, are stored in extra slot 0 of the table.
(let (scripts)
  (dolist (spec '((#x0000 #x007F latin)
                  (#x00A0 #x036F latin)
                  (#x0370 #x03E1 greek)
                  (#x03E2 #x03EF coptic)
                  (#x03F0 #x03F3 greek)
                  (#x0400 #x04FF cyrillic)
                  (#x0530 #x058F armenian)
                  (#x0590 #x05FF hebrew)
                  (#x0600 #x06FF arabic)
                  (#x0700 #x074F syriac)
                  (#x07C0 #x07FA nko)
                  (#x0780 #x07BF thaana)
                  (#x0900 #x097F devanagari)
                  (#x0980 #x09FF bengali)
                  (#x0A00 #x0A7F gurmukhi)
                  (#x0A80 #x0AFF gujarati)
                  (#x0B00 #x0B7F oriya)
                  (#x0B80 #x0BFF tamil)
                  (#x0C00 #x0C7F telugu)
                  (#x0C80 #x0CFF kannada)
                  (#x0D00 #x0D7F malayalam)
                  (#x0D80 #x0DFF sinhala)
                  (#x0E00 #x0E5F thai)
                  (#x0E80 #x0EDF lao)
                  (#x0F00 #x0FFF tibetan)
                  (#x1000 #x105F myanmar)
                  (#x10A0 #x10FF georgian)
                  (#x1100 #x11FF hangul)
                  (#x1200 #x139F ethiopic)
                  (#x13A0 #x13FF cherokee)
                  (#x1400 #x167F canadian-aboriginal)
                  (#x1680 #x169F ogham)
                  (#x16A0 #x16FF runic)
                  (#x1780 #x17FF khmer)
                  (#x1800 #x18AF mongolian)
                  (#x1E00 #x1EFF latin)
                  (#x1F00 #x1FFF greek)
                  (#x2000 #x27FF symbol)
                  (#x2800 #x28FF braille)
                  (#x2D80 #x2DDF ethiopic)
                  (#x2E80 #x2FDF han)
                  (#x2FF0 #x2FFF ideographic-description)
                  (#x3000 #x303F cjk-misc)
                  (#x3040 #x30FF kana)
                  (#x3100 #x312F bopomofo)
                  (#x3130 #x318F hangul)
                  (#x3190 #x319F kanbun)
                  (#x31A0 #x31BF bopomofo)
                  (#x3400 #x9FAF han)
                  (#xA000 #xA4CF yi)
                  (#xAA80 #xAADF tai-viet)
                  (#xAC00 #xD7AF hangul)
                  (#xF900 #xFAFF han)
                  (#xFB1D #xFB4F hebrew)
                  (#xFB50 #xFDFF arabic)
                  (#xFE70 #xFEFC arabic)
                  (#xFF00 #xFF5F cjk-misc)
                  (#xFF61 #xFF9F kana)
                  (#xFFE0 #xFFE6 cjk-misc)
                  (#x1D000 #x1D0FF byzantine-musical-symbol)
                  (#x1D100 #x1D1FF musical-symbol)
                  (#x1D400 #x1D7FF mathematical)
                  (#x20000 #x2AFFF han)
                  (#x2F800 #x2FFFF han)))
    (let ((from (car spec))
          (to (nth 1 spec))
          (script (nth 2 spec)))
      (set-char-table-range char-script-table (cons from to) script)
      ;; Collect each script name once; the list is built in reverse.
      (unless (memq script scripts)
        (setq scripts (cons script scripts)))))
  (set-char-table-extra-slot char-script-table 0 (nreverse scripts)))
1088
;; Assign the `tibetan' script to every character range that
;; `map-charset-chars' reports for the `tibetan' charset.
(map-charset-chars
 (lambda (chars _arg)
   (set-char-table-range char-script-table chars 'tibetan))
 'tibetan)
1093
1094 \f
1095 ;;; Setting word boundary.
1096
(defun next-word-boundary-han (pos limit)
  "Return a word boundary position found by scanning from POS toward LIMIT.
If POS is not greater than LIMIT, scan forward: starting at POS, skip a
run of characters of category `C' and then a run of characters of
category `H', and return the position reached.  If neither run is
present at POS, return POS unchanged.
Otherwise scan backward: move POS back while it is still greater than
LIMIT and the character before it has the script `han' in
`char-script-table', and return the position reached."
  (if (<= pos limit)
      (save-excursion
        (goto-char pos)
        ;; Only move when the match succeeded.  After a failed
        ;; `looking-at' the match data is left over from some earlier
        ;; search, so `match-end' would yield an unrelated position.
        (if (looking-at "\\cC+")
            (goto-char (match-end 0)))
        (if (looking-at "\\cH+")
            (goto-char (match-end 0)))
        (point))
    (while (and (> pos limit)
                (eq (aref char-script-table (char-after (1- pos))) 'han))
      (setq pos (1- pos)))
    pos))
1110
(defun next-word-boundary-kana (pos limit)
  "Return a word boundary position found by scanning from POS toward LIMIT.
If POS is not greater than LIMIT, scan forward: starting at POS, skip in
turn a run of category `K' characters, a run of category `H' characters
and a run of category `k' characters (each run may be absent), and
return the position reached.
Otherwise scan backward.  If the character at POS has category `K' or
`k', move back over preceding characters of either category.  If not,
move back over preceding category `H' characters; then, if the
character that stopped that scan has category `C', `K' or `A', step
over it and over any preceding characters of that same category.  The
backward loops never move POS below LIMIT."
  (if (<= pos limit)
      (save-excursion
        (goto-char pos)
        ;; Try the three runs in a fixed order; each one is optional.
        (dolist (run-regexp '("\\cK+" "\\cH+" "\\ck+"))
          (when (looking-at run-regexp)
            (goto-char (match-end 0))))
        (point))
    (let ((cats (char-category-set (char-after pos)))
          run-category)
      (cond
       ((or (aref cats ?K) (aref cats ?k))
        ;; Katakana-type start: back up over the whole K/k run.
        (while (and (> pos limit)
                    (progn
                      (setq cats (char-category-set (char-after (1- pos))))
                      (or (aref cats ?K) (aref cats ?k))))
          (setq pos (1- pos))))
       (t
        ;; Back up over the H run; CATS ends up describing the last
        ;; character examined before POS.
        (while (and (> pos limit)
                    (progn
                      (setq cats (char-category-set (char-after (1- pos))))
                      (aref cats ?H)))
          (setq pos (1- pos)))
        (setq run-category (cond ((aref cats ?C) ?C)
                                 ((aref cats ?K) ?K)
                                 ((aref cats ?A) ?A)))
        (when run-category
          ;; Absorb that character and the rest of its run.
          (setq pos (1- pos))
          (while (and (> pos limit)
                      (aref (char-category-set (char-after (1- pos)))
                            run-category))
            (setq pos (1- pos))))))
      pos)))
1144
;; For every entry of `char-script-table' whose script is `han' or
;; `kana', install the matching boundary function in
;; `find-word-boundary-function-table'.  Other scripts are left alone.
(map-char-table
 (lambda (chars script)
   (let ((boundary-function
          (cdr (assq script '((han . next-word-boundary-han)
                              (kana . next-word-boundary-kana))))))
     (when boundary-function
       (set-char-table-range find-word-boundary-function-table
                             chars boundary-function))))
 char-script-table)
1154
;; Category pairs installed as `word-combining-categories'.
(setq word-combining-categories
      (list (cons ?l ?l)
            (cons ?C ?C)
            (cons ?C ?H)
            (cons ?C ?K)))
1160
;; Category pairs installed as `word-separating-categories'
;; (2-byte character sets).
(setq word-separating-categories
      '(;; Alpha numeric, then Katakana / Chinese
        (?A . ?K) (?A . ?C)
        ;; Hiragana, then Alpha numeric / Katakana / Chinese
        (?H . ?A) (?H . ?K) (?H . ?C)
        ;; Katakana, then Alpha numeric / Chinese
        (?K . ?A) (?K . ?C)
        ;; Chinese, then Alpha numeric / Katakana
        (?C . ?A) (?C . ?K)))
1172
1173 ;; Local Variables:
1174 ;; coding: utf-8
1175 ;; End:
1176
1177 ;; arch-tag: 85889c35-9f4d-4912-9bf5-82de31b0d42d
1178 ;;; characters.el ends here