]> code.delx.au - gnu-emacs/blob - lisp/international/characters.el
Tweak previous change.
[gnu-emacs] / lisp / international / characters.el
1 ;;; characters.el --- set syntax and category for multibyte characters
2
3 ;; Copyright (C) 1997, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
4 ;; Free Software Foundation, Inc.
5 ;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
6 ;; 2005, 2006, 2007, 2008, 2009
7 ;; National Institute of Advanced Industrial Science and Technology (AIST)
8 ;; Registration Number H14PRO021
9 ;; Copyright (C) 2003
10 ;; National Institute of Advanced Industrial Science and Technology (AIST)
11 ;; Registration Number H13PRO009
12
13 ;; Keywords: multibyte character, character set, syntax, category
14
15 ;; This file is part of GNU Emacs.
16
17 ;; GNU Emacs is free software: you can redistribute it and/or modify
18 ;; it under the terms of the GNU General Public License as published by
19 ;; the Free Software Foundation, either version 3 of the License, or
20 ;; (at your option) any later version.
21
22 ;; GNU Emacs is distributed in the hope that it will be useful,
23 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
24 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25 ;; GNU General Public License for more details.
26
27 ;; You should have received a copy of the GNU General Public License
28 ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
29
30 ;;; Commentary:
31
32 ;;; Code:
33
34 ;;; Predefined categories.
35
36 ;; For each character set.
37
;; Categories naming the script or character set of a character.
;; Each entry is (MNEMONIC DOCSTRING), defined in the order listed.
(dolist (spec '((?a "ASCII
ASCII graphic characters 32-126 (ISO646 IRV:1983[4/0])")
                (?l "Latin")
                (?t "Thai")
                (?g "Greek")
                (?b "Arabic")
                (?w "Hebrew")
                (?y "Cyrillic")
                (?k "Katakana
Japanese katakana")
                (?r "Roman
Japanese roman")
                (?c "Chinese")
                (?j "Japanese")
                (?h "Korean")
                (?e "Ethiopic
Ethiopic (Ge'ez)")
                (?v "Viet
Vietnamese")
                (?i "Indian")
                (?o "Lao")
                (?q "Tibetan")))
  (define-category (car spec) (nth 1 spec)))
60
61 ;; For each group (row) of 2-byte character sets.
62
;; Categories for the groups (rows) of 2-byte character sets, plus
;; the Indian glyph category.  Entries are (MNEMONIC DOCSTRING).
(dolist (spec '((?A "2-byte alnum
Alpha-numeric characters of 2-byte character sets")
                (?C "2-byte han
Chinese (Han) characters of 2-byte character sets")
                (?G "2-byte Greek
Greek characters of 2-byte character sets")
                (?H "2-byte Hiragana
Japanese Hiragana characters of 2-byte character sets")
                (?K "2-byte Katakana
Japanese Katakana characters of 2-byte character sets")
                (?N "2-byte Korean
Korean Hangul characters of 2-byte character sets")
                (?Y "2-byte Cyrillic
Cyrillic characters of 2-byte character sets")
                (?I "Indian Glyphs")))
  (define-category (car spec) (nth 1 spec)))
78
79 ;; For phonetic classifications.
80
;; Digit mnemonics carry the phonetic classification used by the
;; Lao, Thai, Tibetan and Tai Viet tables later in this file.
(dolist (spec '((?0 "consonant")
                (?1 "base vowel
base (independent) vowel")
                (?2 "upper diacritic
upper diacritical mark (including upper vowel)")
                (?3 "lower diacritic
lower diacritical mark (including lower vowel)")
                (?4 "combining tone
combining tone mark")
                (?5 "symbol")
                (?6 "digit")
                (?7 "vowel diacritic
vowel-modifying diacritical mark")
                (?8 "vowel-signs")
                (?9 "semivowel lower")))
  (define-category (car spec) (nth 1 spec)))
96
97 ;; For filling.
(dolist (spec '(;; Line breaking while filling.
                (?| "line breakable
While filling, we can break a line at this character.")
                ;; For indentation calculation.
                (?\s "space for indent
This character counts as a space for indentation purposes.")
                ;; Keep the following for `kinsoku' processing.  See
                ;; comments in kinsoku.el.
                (?> "Not at bol
A character which can't be placed at beginning of line.")
                (?< "Not at eol
A character which can't be placed at end of line.")
                ;; Combining
                (?^ "Combining
Combining diacritic or mark")))
  (define-category (car spec) (nth 1 spec)))
116 \f
117 ;;; Setting syntax and category.
118
119 ;; ASCII
120
121 ;; All ASCII characters have the category `a' (ASCII) and `l' (Latin).
;; Codes 32..127 receive both the `a' (ASCII) and `l' (Latin) category.
(dolist (category '(?a ?l))
  (modify-category-entry '(32 . 127) category))
124
125 ;; Deal with the CJK charsets first. Since the syntax of blocks is
126 ;; defined per charset, and the charsets may contain e.g. Latin
127 ;; characters, we end up with the wrong syntax definitions if we're
128 ;; not careful.
129
130 ;; Chinese characters (Unicode)
;; Each entry is (RANGE CATEGORY...); every listed category is added
;; to every character of RANGE.
(dolist (entry '(((#x2E80 . #x312F) ?|)
                 ((#x3190 . #x33FF) ?|)
                 ((#x3400 . #x9FAF) ?C ?c ?|)
                 ((#xF900 . #xFAFF) ?C ?c ?|)
                 ((#x20000 . #x2AFFF) ?| ?C)
                 ((#x2F800 . #x2FFFF) ?| ?C)))
  (dolist (category (cdr entry))
    (modify-category-entry (car entry) category)))
143
144
145 ;; Chinese character set (GB2312)
146
;; Code ranges of GB2312 that get symbol syntax.
(dolist (range '((#x2121 . #x217E)
                 (#x2221 . #x227E)
                 (#x2921 . #x297E)))
  (map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_"
                     (car range) (cdr range)))

;; Every GB2312 character is `c'; sub-ranges additionally get one of
;; the 2-byte group categories.  Entries are (CATEGORY FROM TO).
(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?c)
(dolist (spec '((?A #x2330 #x2339)
                (?A #x2341 #x235A)
                (?A #x2361 #x237A)
                (?H #x2421 #x247E)
                (?K #x2521 #x257E)
                (?G #x2621 #x267E)
                (?Y #x2721 #x277E)
                (?C #x3021 #x7E7E)))
  (map-charset-chars #'modify-category-entry 'chinese-gb2312
                     (car spec) (nth 1 spec) (nth 2 spec)))
160
161 ;; Chinese character set (BIG5)
162
;; All of Big5 is `c'; the listed code ranges are additionally `C'.
(map-charset-chars #'modify-category-entry 'big5 ?c)
(dolist (range '((#xA259 . #xA25F)
                 (#xA440 . #xC67E)
                 (#xC940 . #xF9DF)))
  (map-charset-chars #'modify-category-entry 'big5 ?C
                     (car range) (cdr range)))
167
168 ;; Chinese character set (CNS11643)
169
;; Every plane is `c'.  `C' covers the whole plane, except for plane 1
;; where it is restricted to codes #x4421..#x7E7E.
(dolist (plane '(chinese-cns11643-1 chinese-cns11643-2 chinese-cns11643-3
                 chinese-cns11643-4 chinese-cns11643-5 chinese-cns11643-6
                 chinese-cns11643-7))
  (map-charset-chars #'modify-category-entry plane ?c)
  (if (not (eq plane 'chinese-cns11643-1))
      (map-charset-chars #'modify-category-entry plane ?C)
    (map-charset-chars #'modify-category-entry plane ?C #x4421 #x7E7E)))
177
178 ;; Japanese character set (JISX0201, JISX0208, JISX0212, JISX0213)
179
;; JISX0201 halves: katakana is `k', the Roman half is `r'.
(map-charset-chars #'modify-category-entry 'katakana-jisx0201 ?k)
(map-charset-chars #'modify-category-entry 'latin-jisx0201 ?r)

;; All of these charsets are `j' (Japanese).
(mapc (lambda (charset)
        (map-charset-chars #'modify-category-entry charset ?j))
      '(katakana-jisx0201 japanese-jisx0208 japanese-jisx0212
        japanese-jisx0213-1 japanese-jisx0213-2))
187
188 ;; Unicode equivalents of JISX0201-kana
;; Unicode equivalents of JISX0201-kana
(dolist (category '(?k ?j ?\|))
  (modify-category-entry '(#xff61 . #xff9f) category))

;; Katakana block
;; ?K is double width, ?k isn't specified
(dolist (category '(?K ?\|))
  (modify-category-entry '(#x30a0 . #x30ff) category))

;; Hiragana block
;; ?H is actually defined to be double width, so it is not applied:
;;(modify-category-entry '(#x3040 . #x309d) ?H)
(modify-category-entry '(#x3040 . #x309d) ?\|)
206
207 ;; JISX0208
;; JISX0208 codes #x2121..#x227E and #x2821..#x287E get symbol syntax.
(map-charset-chars #'modify-syntax-entry 'japanese-jisx0208 "_" #x2121 #x227E)
(map-charset-chars #'modify-syntax-entry 'japanese-jisx0208 "_" #x2821 #x287E)

;; These characters are given word syntax after the range settings
;; above.  The loop must use the loop variable: the previous code
;; passed (car chars) on every iteration, so only the first character
;; of the list was ever changed.
(dolist (elt '(?ー ?゛ ?゜ ?ヽ ?ヾ ?ゝ ?ゞ ?〃 ?仝 ?々 ?〆 ?〇))
  (modify-syntax-entry elt "w"))
213
;; 2-byte group categories for JISX0208; entries are (CATEGORY FROM TO).
(dolist (spec '((?A #x2321 #x237E)
                (?H #x2421 #x247E)
                (?K #x2521 #x257E)
                (?G #x2621 #x267E)
                (?Y #x2721 #x277E)
                (?C #x3021 #x7E7E)))
  (map-charset-chars #'modify-category-entry 'japanese-jisx0208
                     (car spec) (nth 1 spec) (nth 2 spec)))

;; Individual characters that sit outside the ranges above.
(modify-category-entry ?ー ?K)
(dolist (mark '(?゛ ?゜))
  ;; The sound marks belong to both kana categories.
  (modify-category-entry mark ?K)
  (modify-category-entry mark ?H))
(dolist (ch '(?ヽ ?ヾ ?ゝ ?ゞ ?〃 ?仝 ?々 ?〆 ?〇))
  (modify-category-entry ch ?C))
230
231 ;; JISX0212
232
;; JISX0212 codes #x2121..#x237E get symbol syntax.
(map-charset-chars #'modify-syntax-entry 'japanese-jisx0212 "_" #x2121 #x237E)

;; JISX0201-Kana

;; NOTE(review): the heading suggests the halfwidth forms
;; (U+FF61..U+FF65); the literals below may have been width-folded
;; in transit -- confirm against the upstream file.
(dolist (punct '(?。 ?、 ?・))
  (modify-syntax-entry punct "."))

;; Corner brackets form a matched pair: the opener names its closer
;; and the closer must use the close-paren class ")".  It was
;; previously declared with "(", i.e. as a second opener.
(modify-syntax-entry ?\「 "(」")
(modify-syntax-entry ?\」 ")「")
244
245 ;; Korean character set (KSC5601)
246
;; Every KSC5601 character is `h' (Korean).
(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?h)

;; Code ranges that get symbol syntax.
(dolist (range '((#x2121 . #x227E)
                 (#x2621 . #x277E)
                 (#x2830 . #x287E)
                 (#x2930 . #x297E)))
  (map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_"
                     (car range) (cdr range)))

;; 2-byte group categories; entries are (CATEGORY FROM TO).
(dolist (spec '((?A #x2330 #x2339)
                (?A #x2341 #x235A)
                (?A #x2361 #x237A)
                (?G #x2521 #x257E)
                (?H #x2A21 #x2A7E)
                (?K #x2B21 #x2B7E)
                (?Y #x2C21 #x2C7E)))
  (map-charset-chars #'modify-category-entry 'korean-ksc5601
                     (car spec) (nth 1 spec) (nth 2 spec)))
260
261 ;; These are in more than one charset.
;; PARENS is a flat string of opener/closer pairs.  Each opener gets
;; open-paren syntax naming its closer, and vice versa.
;;
;; The last segment is spelled with \u escapes for the FULLWIDTH
;; forms U+FF08/U+FF09, U+FF3B/U+FF3D and U+FF5B/U+FF5D.  It had been
;; reduced to the ASCII characters ()[]{}, which are not part of the
;; CJK charsets this block is about and already have paren syntax.
(let ((parens (concat "〈〉《》「」『』【】〔〕〖〗〘〙〚〛"
                      "︵︶︷︸︹︺︻︼︽︾︿﹀﹁﹂﹃﹄"
                      "\uFF08\uFF09\uFF3B\uFF3D\uFF5B\uFF5D")))
  ;; Integer division ignores a trailing unpaired character.
  (dotimes (i (/ (length parens) 2))
    (let ((open (aref parens (* i 2)))
          (close (aref parens (1+ (* i 2)))))
      (modify-syntax-entry open (format "(%c" close))
      (modify-syntax-entry close (format ")%c" open)))))
271
272 ;; Arabic character set
273
;; Characters of the Arabic charsets are `b' ...
(dolist (charset '(arabic-iso8859-6
                   arabic-digit
                   arabic-1-column
                   arabic-2-column))
  (map-charset-chars #'modify-category-entry charset ?b))
;; ... and so are these code point ranges.
(dolist (range '((#x600 . #x6ff)
                 (#xfb50 . #xfdff)
                 (#xfe70 . #xfefe)))
  (modify-category-entry range ?b))
284
285 ;; Cyrillic character set (ISO-8859-5)
286
;; The numero sign is punctuation.
(modify-syntax-entry ?№ ".")

;; Ethiopic character set

(dolist (range '((#x1200 . #x1399)
                 (#x2d80 . #x2dde)))
  (modify-category-entry range ?e))
;; These Ethiopic marks get punctuation syntax.
(dolist (mark '(?፡ ?። ?፣ ?፤ ?፥ ?፦ ?፧ ?፨))
  (modify-syntax-entry mark "."))
(map-charset-chars #'modify-category-entry 'ethiopic ?e)

;; Hebrew character set (ISO-8859-8)

(dolist (mark '(#x5be                   ; MAQAF
                #x5c0                   ; PASEQ
                #x5c3                   ; SOF PASUQ
                #x5f3                   ; GERESH
                #x5f4))                 ; GERSHAYIM
  (modify-syntax-entry mark "."))
306
307 ;; Indian character set (IS 13194 and other Emacs original Indian charsets)
308
;; U+0901..U+0970 and both Indian charsets are `i' (Indian).
(modify-category-entry '(#x901 . #x970) ?i)
(dolist (charset '(indian-is13194 indian-2-column))
  (map-charset-chars #'modify-category-entry charset ?i))
312
313 ;; Lao character set
314
(modify-category-entry '(#xe80 . #xeff) ?o)
(map-charset-chars #'modify-category-entry 'lao ?o)

;; Each DEFLIST entry is (CHARS SYNTAX CATEGORY).  CHARS lists single
;; characters; "X-Y" denotes the inclusive range from X to Y.  Entries
;; whose SYNTAX is "w" only receive the category.
(let ((deflist '(("ກ-ຮ" "w" ?0) ; consonant
                 ("ະາຳຽເ-ໄ" "w" ?1) ; vowel base
                 ("ັິ-ືົໍ" "w" ?2) ; vowel upper
                 ("ຸູ" "w" ?3) ; vowel lower
                 ("່-໋" "w" ?4) ; tone mark
                 ("ຼຽ" "w" ?9) ; semivowel lower
                 ("໐-໙" "w" ?6) ; digit
                 ("ຯໆ" "_" ?5) ; symbol
                 ))
      ch)
  (dolist (elm deflist)
    (let* ((chars (car elm))
           (syntax (nth 1 elm))
           (category (nth 2 elm))
           (word-p (string-equal syntax "w"))
           (len (length chars))
           (i 0)
           to)
      (while (< i len)
        (cond ((= (aref chars i) ?-)
               ;; Range continuation: the start character was handled
               ;; on the previous step, leaving CH just past it.
               (setq i (1+ i))
               (setq to (aref chars i)))
              (t
               (setq ch (aref chars i))
               (setq to ch)))
        (while (<= ch to)
          (or word-p (modify-syntax-entry ch syntax))
          (modify-category-entry ch category)
          (setq ch (1+ ch)))
        (setq i (1+ i))))))
348
349 ;; Thai character set (TIS620)
350
(modify-category-entry '(#xe00 . #xe7f) ?t)
(map-charset-chars #'modify-category-entry 'thai-tis620 ?t)

;; Each DEFLIST entry is (CHARS SYNTAX CATEGORY).  CHARS lists single
;; characters; "X-Y" denotes the inclusive range from X to Y.  Entries
;; whose SYNTAX is "w" only receive the category.
(let ((deflist '(;; chars syntax category
                 ("ก-รลว-ฮ" "w" ?0) ; consonant
                 ("ฤฦะาำเ-ๅ" "w" ?1) ; vowel base
                 ("ัิ-ื็๎" "w" ?2) ; vowel upper
                 ("ุ-ฺ" "w" ?3) ; vowel lower
                 ("่-ํ" "w" ?4) ; tone mark
                 ("๐-๙" "w" ?6) ; digit
                 ("ฯๆ฿๏๚๛" "_" ?5) ; symbol
                 ))
      ch)
  (dolist (elm deflist)
    (let* ((chars (car elm))
           (syntax (nth 1 elm))
           (category (nth 2 elm))
           (word-p (string-equal syntax "w"))
           (len (length chars))
           (i 0)
           to)
      (while (< i len)
        (cond ((= (aref chars i) ?-)
               ;; Range continuation: the start character was handled
               ;; on the previous step, leaving CH just past it.
               (setq i (1+ i))
               (setq to (aref chars i)))
              (t
               (setq ch (aref chars i))
               (setq to ch)))
        (while (<= ch to)
          (or word-p (modify-syntax-entry ch syntax))
          (modify-category-entry ch category)
          (setq ch (1+ ch)))
        (setq i (1+ i))))))
384
385 ;; Tibetan character set
386
(modify-category-entry '(#xf00 . #xfff) ?q)
(map-charset-chars #'modify-category-entry 'tibetan ?q)
(map-charset-chars #'modify-category-entry 'tibetan-1-column ?q)

;; Each DEFLIST entry is (CHARS SYNTAX CATEGORY).  CHARS lists single
;; characters; "X-Y" denotes the inclusive range from X to Y.  Entries
;; whose SYNTAX is "w" only receive the category.
(let ((deflist '(;; chars syntax category
                 ("ཀ-ཀྵཪ" "w" ?0) ; consonant
                 ("ྐ-ྐྵྺྻྼ" "w" ?0) ;
                 ("ིེཻོཽྀ" "w" ?2) ; upper vowel
                 ("ཾྂྃ྆྇ྈྉྊྋ" "w" ?2) ; upper modifier
                 ("྄ཱུ༙༵༷" "w" ?3) ; lower vowel/modifier
                 ("཰" "w" ?3) ; invisible vowel a
                 ("༠-༩༪-༳" "w" ?6) ; digit
                 ("་།-༒༔ཿ" "." ?|) ; line-break char
                 ("་།༏༐༑༔ཿ" "." ?|) ;
                 ("༈་།-༒༔ཿ༽༴" "." ?>) ; prohibition
                 ("་།༏༐༑༔ཿ" "." ?>) ;
                 ("ༀ-༊༼࿁࿂྅" "." ?<) ; prohibition
                 ("༓༕-༘༚-༟༶༸-༻༾༿྾྿-࿏" "." ?q) ; others
                 ))
      ch)
  (dolist (elm deflist)
    (let* ((chars (car elm))
           (syntax (nth 1 elm))
           (category (nth 2 elm))
           (word-p (string-equal syntax "w"))
           (len (length chars))
           (i 0)
           to)
      (while (< i len)
        (cond ((= (aref chars i) ?-)
               ;; Range continuation: the start character was handled
               ;; on the previous step, leaving CH just past it.
               (setq i (1+ i))
               (setq to (aref chars i)))
              (t
               (setq ch (aref chars i))
               (setq to ch)))
        (while (<= ch to)
          (or word-p (modify-syntax-entry ch syntax))
          (modify-category-entry ch category)
          (setq ch (1+ ch)))
        (setq i (1+ i))))))
427
428 ;; Vietnamese character set
429
430 ;; To make a word with Latin characters
;; Both VISCII halves are `l' as well as `v', so that Vietnamese
;; letters make a word together with Latin characters.
(dolist (charset '(vietnamese-viscii-lower vietnamese-viscii-upper))
  (map-charset-chars #'modify-category-entry charset ?l)
  (map-charset-chars #'modify-category-entry charset ?v))

;; For codes 32..127, pair the character at that code in the upper
;; charset with the one in the lower charset as a case pair, and mark
;; the `ucs' encodings of both (when there is one) as `v'.
(let ((tbl (standard-case-table)))
  (dotimes (offset 96)
    (let* ((code (+ 32 offset))
           (upper (decode-char 'vietnamese-viscii-upper code))
           (lower (decode-char 'vietnamese-viscii-lower code))
           (upper-ucs (encode-char upper 'ucs))
           (lower-ucs (encode-char lower 'ucs)))
      (set-case-syntax-pair upper
                            (decode-char 'vietnamese-viscii-lower code)
                            tbl)
      (when upper-ucs
        (modify-category-entry upper-ucs ?v))
      (when lower-ucs
        (modify-category-entry lower-ucs ?v)))))
449
450 ;; Tai Viet
;; Each DEFLIST entry is (CHARS SYNTAX CATEGORY), where CHARS is
;; either a (FROM . TO) character range or a string of individual
;; characters.  Syntax and category are applied to all of them.
(let ((deflist '(;; chars syntax category
                 ((?ꪀ. ?ꪯ) "w" ?0) ; consonant
                 ("ꪱꪵꪶ" "w" ?1) ; vowel base
                 ((?ꪹ . ?ꪽ) "w" ?1) ; vowel base
                 ("ꪰꪲꪳꪷꪸꪾ" "w" ?2) ; vowel upper
                 ("ꪴ" "w" ?3) ; vowel lower
                 ("ꫀꫂ" "w" ?1) ; non-combining tone-mark
                 ("꪿꫁" "w" ?4) ; combining tone-mark
                 ((?ꫛ . ?꫟) "_" ?5) ; symbol
                 )))
  (dolist (elm deflist)
    (let ((chars (car elm))
          (syntax (nth 1 elm))
          (category (nth 2 elm)))
      (if (stringp chars)
          ;; A string: visit its characters one by one.
          (dotimes (k (length chars))
            (modify-syntax-entry (aref chars k) syntax)
            (modify-category-entry (aref chars k) category))
        ;; A range: both functions accept the cons directly.
        (modify-syntax-entry chars syntax)
        (modify-category-entry chars category)))))
473
474 ;; Latin
475
;; Everything from U+0080 through U+024F is `l' (Latin).
(modify-category-entry '(#x80 . #x024F) ?l)
477
478 (let ((tbl (standard-case-table)) c)
479
480 ;; Latin-1
481
482 ;; Fixme: Some of the non-word syntaxes here perhaps should be
483 ;; reviewed. (Note that the following all implicitly have word
484 ;; syntax: ¢£¤¥¨ª¯²³´¶¸¹º.) There should be a well-defined way of
485 ;; relating Unicode categories to Emacs syntax codes.
486
;; NBSP isn't semantically interchangeable with other whitespace chars,
;; so it's more like punctuation.
;; Entries are (CODE . SYNTAX); code points are used so that the
;; invisible characters (NBSP, soft hyphen) stay unambiguous.
(dolist (entry '((#xA0 . ".")           ; NO-BREAK SPACE
                 (#xA1 . ".")           ; ¡
                 (#xA6 . "_")           ; ¦
                 (#xA7 . ".")           ; §
                 (#xA9 . "_")           ; ©
                 (#xAC . "_")           ; ¬
                 (#xAD . "_")           ; SOFT HYPHEN
                 (#xAE . "_")           ; ®
                 (#xB0 . "_")           ; °
                 (#xB1 . "_")           ; ±
                 (#xB5 . "_")           ; µ
                 (#xB7 . "_")           ; ·
                 (#xBC . "_")           ; ¼
                 (#xBD . "_")           ; ½
                 (#xBE . "_")           ; ¾
                 (#xBF . ".")))         ; ¿
  (set-case-syntax (car entry) (cdr entry) tbl))
(set-case-syntax-delims 171 187 tbl)    ; « »
;; Codes 192..222 pair with the code 32 above them.  This also pairs
;; 215 with 247, which is why × and ÷ are reset right after the loop.
(dotimes (k 31)
  (set-case-syntax-pair (+ 192 k) (+ 224 k) tbl))
(set-case-syntax #xD7 "_" tbl)          ; ×
(set-case-syntax #xDF "w" tbl)          ; ß
(set-case-syntax #xF7 "_" tbl)          ; ÷
513 ;; See below for ÿ.
514
515 ;; Latin Extended-A, Latin Extended-B
;; U+0100..U+02B8 are `l' (Latin).
(modify-category-entry '(#x0100 . #x02B8) ?l)

;; Within each (FROM . TO) range the characters alternate
;; uppercase, lowercase: FROM pairs with FROM+1, FROM+2 with FROM+3,
;; and so on for as long as the uppercase member is below TO.
(dolist (range '((#x0100 . #x012F)
                 (#x0132 . #x0137)
                 (#x0139 . #x0148)
                 (#x014a . #x0177)
                 (#x0179 . #x017E)
                 (#x0182 . #x0185)
                 (#x0187 . #x018C)
                 (#x0191 . #x0192)
                 (#x0198 . #x0199)
                 (#x01A0 . #x01A5)
                 (#x01A7 . #x01A8)
                 (#x01AC . #x01AD)
                 (#x01AF . #x01B0)
                 (#x01B3 . #x01B6)
                 (#x01BC . #x01BD)
                 (#x01CD . #x01DC)
                 (#x01DE . #x01EF)
                 (#x01F4 . #x01F5)
                 (#x01F8 . #x021F)
                 (#x0222 . #x0233)
                 (#x023B . #x023C)
                 (#x0241 . #x0242)
                 (#x0246 . #x024F)))
  (let ((upper (car range))
        (limit (cdr range)))
    (while (< upper limit)
      (set-case-syntax-pair upper (1+ upper) tbl)
      (setq upper (+ upper 2)))))
549
550 ;; In some languages, such as Turkish, U+0049 LATIN CAPITAL LETTER I
551 ;; and U+0131 LATIN SMALL LETTER DOTLESS I make a case pair, and so
552 ;; do U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE and U+0069 LATIN
553 ;; SMALL LETTER I.
554
555 ;; We used to set up half of those correspondence unconditionally,
556 ;; but that makes searches slow. So now we don't set up either half
557 ;; of these correspondences by default.
558
559 ;; (set-downcase-syntax ?İ ?i tbl)
560 ;; (set-upcase-syntax ?I ?ı tbl)
561
;; Case pairs for the Latin digraph letters and for hwair and wynn.
;; Each entry is (UPPER . LOWER).  Code points are used instead of
;; character literals: the digraphs have compatibility decompositions
;; into two letters, and a literal that has been decomposed that way
;; is no longer valid `?X' read syntax.
;; The titlecase form follows the uppercase form of each digraph, so
;; the lowercase letter ends up upcasing to the titlecase form, as
;; with the original statement order.
(dolist (pair '((#x01C4 . #x01C6)  ; LETTER DZ WITH CARON, uppercase
                (#x01C5 . #x01C6)  ; LETTER DZ WITH CARON, titlecase
                (#x01C7 . #x01C9)  ; LETTER LJ, uppercase
                (#x01C8 . #x01C9)  ; LETTER LJ, titlecase
                (#x01CA . #x01CC)  ; LETTER NJ, uppercase
                (#x01CB . #x01CC)  ; LETTER NJ, titlecase
                ;; 01F0; F; 006A 030C; # LATIN SMALL LETTER J WITH CARON
                (#x01F1 . #x01F3)  ; LETTER DZ, uppercase
                (#x01F2 . #x01F3)  ; LETTER DZ, titlecase
                (#x01F6 . #x0195)  ; HWAIR
                (#x01F7 . #x01BF))) ; WYNN
  (set-case-syntax-pair (car pair) (cdr pair) tbl))
574
575 ;; Latin Extended Additional
(modify-category-entry '(#x1e00 . #x1ef9) ?l)
;; Even code points pair with their successor, in two runs:
;; U+1E00..U+1E94 and U+1EA0..U+1EF8.  U+1E96..U+1E9F are skipped.
(dolist (span '((#x1e00 . #x1e94)
                (#x1ea0 . #x1ef8)))
  (let ((upper (car span)))
    (while (<= upper (cdr span))
      (set-case-syntax-pair upper (1+ upper) tbl)
      (setq upper (+ upper 2)))))
583
584 ;; Greek
(modify-category-entry '(#x0370 . #x03ff) ?g)
;; U+0391..U+03A1 and U+03A3..U+03AB pair with the code 32 above.
(dolist (span '((#x0391 . #x03a1)
                (#x03a3 . #x03ab)))
  (let ((upper (car span)))
    (while (<= upper (cdr span))
      (set-case-syntax-pair upper (+ upper 32) tbl)
      (setq upper (1+ upper)))))
;; Even code points in U+03DA..U+03EE pair with their successor.
(let ((upper #x03da))
  (while (<= upper #x03ee)
    (set-case-syntax-pair upper (1+ upper) tbl)
    (setq upper (+ upper 2))))
;; Accented capitals, listed individually as (UPPER . LOWER).
(dolist (pair '((?Ά . ?ά)
                (?Έ . ?έ)
                (?Ή . ?ή)
                (?Ί . ?ί)
                (?Ό . ?ό)
                (?Ύ . ?ύ)
                (?Ώ . ?ώ)))
  (set-case-syntax-pair (car pair) (cdr pair) tbl))
603
604 ;; Armenian
;; U+0531..U+0556 each pair with the code #x30 above.
(dotimes (k (1+ (- #x556 #x531)))
  (set-case-syntax-pair (+ #x531 k) (+ #x531 #x30 k) tbl))
609
610 ;; Greek Extended
(modify-category-entry '(#x1f00 . #x1fff) ?g)
;; Up to U+1FA7, a code whose low nibble is 0..7 is lowercase and
;; its uppercase form is 8 above it.  Two exclusions apply:
;;  - U+1F50, 1F52, 1F54, 1F56 (listed explicitly);
;;  - the whole U+1F70..U+1F77 column, whose +8 neighbours are
;;    themselves lowercase.  The test compares the high nibble with
;;    #x70; the previous comparison against 7 could never match a
;;    value masked with #x00f0, so that column was wrongly paired.
(setq c #x1f00)
(while (<= c #x1fff)
  (and (<= (logand c #x000f) 7)
       (<= c #x1fa7)
       (not (memq c '(#x1f50 #x1f52 #x1f54 #x1f56)))
       (/= (logand c #x00f0) #x70)
       (set-case-syntax-pair (+ c 8) c tbl))
  (setq c (1+ c)))
;; The remaining pairs are irregular and listed as (UPPER . LOWER).
;; Code points are used because several of these characters have
;; canonical equivalents in the basic Greek block, so a normalized
;; literal would silently name a different character.
(dolist (pair '((#x1FB8 . #x1FB0)   ; alpha with vrachy
                (#x1FB9 . #x1FB1)   ; alpha with macron
                (#x1FBA . #x1F70)   ; alpha with varia
                (#x1FBB . #x1F71)   ; alpha with oxia
                (#x1FBC . #x1FB3)   ; alpha with (pros/ypo)gegrammeni
                (#x1FC8 . #x1F72)   ; epsilon with varia
                (#x1FC9 . #x1F73)   ; epsilon with oxia
                (#x1FCA . #x1F74)   ; eta with varia
                (#x1FCB . #x1F75)   ; eta with oxia
                (#x1FCC . #x1FC3)   ; eta with (pros/ypo)gegrammeni
                (#x1FD8 . #x1FD0)   ; iota with vrachy
                (#x1FD9 . #x1FD1)   ; iota with macron
                (#x1FDA . #x1F76)   ; iota with varia
                (#x1FDB . #x1F77)   ; iota with oxia
                (#x1FE8 . #x1FE0)   ; upsilon with vrachy
                (#x1FE9 . #x1FE1)   ; upsilon with macron
                (#x1FEA . #x1F7A)   ; upsilon with varia
                (#x1FEB . #x1F7B)   ; upsilon with oxia
                (#x1FEC . #x1FE5)   ; rho with dasia
                (#x1FF8 . #x1F78)   ; omicron with varia
                (#x1FF9 . #x1F79)   ; omicron with oxia
                (#x1FFA . #x1F7C)   ; omega with varia
                (#x1FFB . #x1F7D)   ; omega with oxia
                (#x1FFC . #x1FF3))) ; omega with (pros/ypo)gegrammeni
  (set-case-syntax-pair (car pair) (cdr pair) tbl))
644
645 ;; cyrillic
(modify-category-entry '(#x0400 . #x04FF) ?y)
;; Each run is (FROM TO OFFSET STEP): starting at FROM and advancing
;; by STEP up to TO, the character pairs with the one OFFSET above.
;; The STEP 2 runs are the ones where upper and lower case alternate.
(dolist (run '((#x0400 #x040f 80 1)
               (#x0410 #x042f 32 1)
               (#x0460 #x0480 1 2)
               (#x048c #x04be 1 2)
               (#x04d0 #x04f4 1 2)))
  (let ((upper (car run))
        (last (nth 1 run))
        (offset (nth 2 run))
        (step (nth 3 run)))
    (while (<= upper last)
      (set-case-syntax-pair upper (+ upper offset) tbl)
      (setq upper (+ upper step)))))
;; Individual pairs outside those runs, as (UPPER . LOWER).
(dolist (pair '((?Ӂ . ?ӂ)
                (?Ӄ . ?ӄ)
                (?Ӈ . ?ӈ)
                (?Ӌ . ?ӌ)
                (?Ӹ . ?ӹ)))
  (set-case-syntax-pair (car pair) (cdr pair) tbl))
666
667 ;; general punctuation
;; Walk U+2000..U+206F once.  Each run is (LAST . SYNTAX): the
;; characters from the current position through LAST get SYNTAX, and
;; the next run continues right after LAST.
(let ((code #x2000))
  (dolist (run '((#x200b . " ")
                 (#x200F . ".")
                 ;; Fixme: These aren't all right:
                 (#x2016 . "_")
                 ;; Punctuation syntax for quotation marks (like `)
                 (#x201f . ".")
                 ;; Fixme: These aren't all right:
                 (#x2027 . "_")
                 (#x206F . ".")))
    (while (<= code (car run))
      (set-case-syntax code (cdr run) tbl)
      (setq code (1+ code)))))
691
692 ;; Roman numerals
;; U+2160..U+216F each pair with the code #x10 above.
(dotimes (k 16)
  (set-case-syntax-pair (+ #x2160 k) (+ #x2170 k) tbl))
697
698 ;; Fixme: The following blocks might be better as symbol rather than
699 ;; punctuation.
700 ;; Arrows
;; Walk U+2190..U+243F once.  Each run is (LAST . SYNTAX) and
;; continues where the previous one stopped.
(let ((code #x2190))
  (dolist (run '((#x21FF . ".")         ; Arrows
                 (#x22FF . ".")         ; Mathematical Operators
                 (#x23FF . ".")         ; Miscellaneous Technical
                 (#x243F . "_")))       ; Control Pictures
    (while (<= code (car run))
      (set-case-syntax code (cdr run) tbl)
      (setq code (1+ code)))))
717
718 ;; Circled Latin
;; Two 26-letter alphabets, each given as (FIRST-UPPER . OFFSET):
;; Circled Latin starts at U+24B6 with lowercase 26 above, and
;; Fullwidth Latin starts at U+FF21 with lowercase #x20 above.
;; Each pair becomes a case pair and both members are `l' (Latin).
(dolist (alphabet '((#x24b6 . 26)
                    (#xff21 . #x20)))
  (dotimes (k 26)
    (let* ((upper (+ (car alphabet) k))
           (lower (+ upper (cdr alphabet))))
      (set-case-syntax-pair upper lower tbl)
      (modify-category-entry upper ?l)
      (modify-category-entry lower ?l))))
733
734 ;; Combining diacritics
;; Ranges that get the `^' (Combining) category.
(dolist (range '((#x300 . #x362)        ; Combining diacritics
                 (#x20d0 . #x20e3)))    ; Combining marks
  (modify-category-entry range ?^))
738
739 ;; Fixme: syntax for symbols &c
740 )
741
;; Matched bracket pairs, as (OPEN . CLOSE) code points.  OPEN gets
;; open-paren syntax naming CLOSE as its match, and CLOSE gets
;; close-paren syntax naming OPEN.
;;
;; The table is keyed by code point.  It used to hold two-character
;; string literals annotated with their code points, and several
;; literals no longer agreed with the annotation (e.g. the ASCII
;; string "()" labelled U+FF08 U+FF09).  The annotated code points
;; are the ones used here.
(let ((pairs
       '((#x2045 . #x2046)
         (#x207D . #x207E)
         (#x208D . #x208E)
         (#x2329 . #x232A)
         (#x23B4 . #x23B5)
         (#x2768 . #x2769)
         (#x276A . #x276B)
         (#x276C . #x276D)
         (#x2770 . #x2771)
         (#x2772 . #x2773)
         (#x2774 . #x2775)
         (#x27E6 . #x27E7)
         (#x27E8 . #x27E9)
         (#x27EA . #x27EB)
         (#x2983 . #x2984)
         (#x2985 . #x2986)
         (#x2987 . #x2988)
         (#x2989 . #x298A)
         (#x298B . #x298C)
         (#x298D . #x298E)
         (#x298F . #x2990)
         (#x2991 . #x2992)
         (#x2993 . #x2994)
         (#x2995 . #x2996)
         (#x2997 . #x2998)
         (#x29FC . #x29FD)
         (#x3008 . #x3009)
         (#x300A . #x300B)
         (#x300C . #x300D)
         (#x300E . #x300F)
         (#x3010 . #x3011)
         (#x3014 . #x3015)
         (#x3016 . #x3017)
         (#x3018 . #x3019)
         (#x301A . #x301B)
         (#xFD3E . #xFD3F)
         (#xFE35 . #xFE36)
         (#xFE37 . #xFE38)
         (#xFE39 . #xFE3A)
         (#xFE3B . #xFE3C)
         (#xFE3D . #xFE3E)
         (#xFE3F . #xFE40)
         (#xFE41 . #xFE42)
         (#xFE43 . #xFE44)
         (#xFE59 . #xFE5A)
         (#xFE5B . #xFE5C)
         (#xFE5D . #xFE5E)
         (#xFF08 . #xFF09)
         (#xFF3B . #xFF3D)
         (#xFF5B . #xFF5D)
         (#xFF5F . #xFF60)
         (#xFF62 . #xFF63))))
  (dolist (pair pairs)
    (modify-syntax-entry (car pair) (string ?\( (cdr pair)))
    (modify-syntax-entry (cdr pair) (string ?\) (car pair)))))
799
800 \f
801 ;; For each character set, put the information of the most proper
802 ;; coding system to encode it by `preferred-coding-system' property.
803
804 ;; Fixme: should this be junked?
;; Each entry is (CHARSET . CODING-SYSTEM); the coding system is
;; stored as the charset's `preferred-coding-system' property.
(dolist (entry '((latin-iso8859-1 . iso-latin-1)
                 (latin-iso8859-2 . iso-latin-2)
                 (latin-iso8859-3 . iso-latin-3)
                 (latin-iso8859-4 . iso-latin-4)
                 (thai-tis620 . thai-tis620)
                 (greek-iso8859-7 . greek-iso-8bit)
                 (arabic-iso8859-6 . iso-2022-7bit)
                 (hebrew-iso8859-8 . hebrew-iso-8bit)
                 (katakana-jisx0201 . japanese-shift-jis)
                 (latin-jisx0201 . japanese-shift-jis)
                 (cyrillic-iso8859-5 . cyrillic-iso-8bit)
                 (latin-iso8859-9 . iso-latin-5)
                 (japanese-jisx0208-1978 . iso-2022-jp)
                 (chinese-gb2312 . chinese-iso-8bit)
                 (chinese-gbk . chinese-gbk)
                 (gb18030-2-byte . chinese-gb18030)
                 (gb18030-4-byte-bmp . chinese-gb18030)
                 (gb18030-4-byte-smp . chinese-gb18030)
                 (gb18030-4-byte-ext-1 . chinese-gb18030)
                 (gb18030-4-byte-ext-2 . chinese-gb18030)
                 (japanese-jisx0208 . iso-2022-jp)
                 (korean-ksc5601 . iso-2022-kr)
                 (japanese-jisx0212 . iso-2022-jp)
                 (chinese-big5-1 . chinese-big5)
                 (chinese-big5-2 . chinese-big5)
                 (chinese-sisheng . iso-2022-7bit)
                 (ipa . iso-2022-7bit)
                 (vietnamese-viscii-lower . vietnamese-viscii)
                 (vietnamese-viscii-upper . vietnamese-viscii)
                 (arabic-digit . iso-2022-7bit)
                 (arabic-1-column . iso-2022-7bit)
                 (lao . lao)
                 (arabic-2-column . iso-2022-7bit)
                 (indian-is13194 . devanagari)
                 (indian-glyph . devanagari)
                 (tibetan-1-column . tibetan)
                 (ethiopic . iso-2022-7bit)
                 (chinese-cns11643-1 . iso-2022-cn)
                 (chinese-cns11643-2 . iso-2022-cn)
                 (chinese-cns11643-3 . iso-2022-cn)
                 (chinese-cns11643-4 . iso-2022-cn)
                 (chinese-cns11643-5 . iso-2022-cn)
                 (chinese-cns11643-6 . iso-2022-cn)
                 (chinese-cns11643-7 . iso-2022-cn)
                 (indian-2-column . devanagari)
                 (tibetan . tibetan)
                 (latin-iso8859-14 . iso-latin-8)
                 (latin-iso8859-15 . iso-latin-9)))
  (put-charset-property (car entry) 'preferred-coding-system (cdr entry)))
856
857 \f
858 ;; Setup auto-fill-chars for charsets that should invoke auto-filling.
859 ;; SPACE and NEWLINE are already set.
860
;; Mark every character of these ranges in `auto-fill-chars'.
(dolist (range '((#x3041 . #x30FF)
                 (#x3400 . #x4DB5)
                 (#x4e00 . #x9fbb)
                 (#xF900 . #xFAFF)
                 (#xFF00 . #xFF9F)
                 (#x20000 . #x2FFFF)))
  (set-char-table-range auto-fill-chars range t))
867
868 \f
869 ;;; Setting char-width-table. The default is 1.
870
871 ;; 0: non-spacing, enclosing combining, formatting, Hangul Jamo medial
872 ;; and final characters.
;; Every (FROM . TO) range below is given width 0 in `char-width-table'.
(mapc (lambda (range)
        (set-char-table-range char-width-table range 0))
      '((#x0300 . #x036F)
        (#x0483 . #x0489)
        (#x0591 . #x05BD)
        (#x05BF . #x05BF)
        (#x05C1 . #x05C2)
        (#x05C4 . #x05C5)
        (#x05C7 . #x05C7)
        (#x0600 . #x0603)
        (#x0610 . #x0615)
        (#x064B . #x065E)
        (#x0670 . #x0670)
        (#x06D6 . #x06E4)
        (#x06E7 . #x06E8)
        (#x06EA . #x06ED)
        (#x070F . #x070F)
        (#x0711 . #x0711)
        (#x0730 . #x074A)
        (#x07A6 . #x07B0)
        (#x07EB . #x07F3)
        (#x0901 . #x0902)
        (#x093C . #x093C)
        (#x0941 . #x0948)
        (#x094D . #x094D)
        (#x0951 . #x0954)
        (#x0962 . #x0963)
        (#x0981 . #x0981)
        (#x09BC . #x09BC)
        (#x09C1 . #x09C4)
        (#x09CD . #x09CD)
        (#x09E2 . #x09E3)
        (#x0A01 . #x0A02)
        (#x0A3C . #x0A3C)
        (#x0A41 . #x0A4D)
        (#x0A70 . #x0A71)
        (#x0A81 . #x0A82)
        (#x0ABC . #x0ABC)
        (#x0AC1 . #x0AC8)
        (#x0ACD . #x0ACD)
        (#x0AE2 . #x0AE3)
        (#x0B01 . #x0B01)
        (#x0B3C . #x0B3C)
        (#x0B3F . #x0B3F)
        (#x0B41 . #x0B43)
        (#x0B4D . #x0B56)
        (#x0B82 . #x0B82)
        (#x0BC0 . #x0BC0)
        (#x0BCD . #x0BCD)
        (#x0C3E . #x0C40)
        (#x0C46 . #x0C56)
        (#x0CBC . #x0CBC)
        (#x0CBF . #x0CBF)
        (#x0CC6 . #x0CC6)
        (#x0CCC . #x0CCD)
        (#x0CE2 . #x0CE3)
        (#x0D41 . #x0D43)
        (#x0D4D . #x0D4D)
        (#x0DCA . #x0DCA)
        (#x0DD2 . #x0DD6)
        (#x0E31 . #x0E31)
        (#x0E34 . #x0E3A)
        (#x0E47 . #x0E4E)
        (#x0EB1 . #x0EB1)
        (#x0EB4 . #x0EBC)
        (#x0EC8 . #x0ECD)
        (#x0F18 . #x0F19)
        (#x0F35 . #x0F35)
        (#x0F37 . #x0F37)
        (#x0F39 . #x0F39)
        (#x0F71 . #x0F7E)
        (#x0F80 . #x0F84)
        (#x0F86 . #x0F87)
        (#x0F90 . #x0FBC)
        (#x0FC6 . #x0FC6)
        (#x102D . #x1030)
        (#x1032 . #x1037)
        (#x1039 . #x1039)
        (#x1058 . #x1059)
        (#x1160 . #x11FF)
        (#x135F . #x135F)
        (#x1712 . #x1714)
        (#x1732 . #x1734)
        (#x1752 . #x1753)
        (#x1772 . #x1773)
        (#x17B4 . #x17B5)
        (#x17B7 . #x17BD)
        (#x17C6 . #x17C6)
        (#x17C9 . #x17D3)
        (#x17DD . #x17DD)
        (#x180B . #x180D)
        (#x18A9 . #x18A9)
        (#x1920 . #x1922)
        (#x1927 . #x1928)
        (#x1932 . #x1932)
        (#x1939 . #x193B)
        (#x1A17 . #x1A18)
        (#x1B00 . #x1B03)
        (#x1B34 . #x1B34)
        (#x1B36 . #x1B3A)
        (#x1B3C . #x1B3C)
        (#x1B42 . #x1B42)
        (#x1B6B . #x1B73)
        (#x1DC0 . #x1DFF)
        (#x200B . #x200F)
        (#x202A . #x202E)
        (#x2060 . #x206F)
        (#x20D0 . #x20EF)
        (#x302A . #x302F)
        (#x3099 . #x309A)
        (#xA806 . #xA806)
        (#xA80B . #xA80B)
        (#xA825 . #xA826)
        (#xFB1E . #xFB1E)
        (#xFE00 . #xFE0F)
        (#xFE20 . #xFE23)
        (#xFEFF . #xFEFF)
        (#xFFF9 . #xFFFB)
        (#x10A01 . #x10A0F)
        (#x10A38 . #x10A3F)
        (#x1D167 . #x1D169)
        (#x1D173 . #x1D182)
        (#x1D185 . #x1D18B)
        (#x1D1AA . #x1D1AD)
        (#x1D242 . #x1D244)
        (#xE0001 . #xE01EF)))
999
;; Width 2: East Asian Wide and Full-width characters.
;; Each entry is an inclusive (FROM . TO) code point range.
(mapc (lambda (wide-range)
        (set-char-table-range char-width-table wide-range 2))
      '((#x1100 . #x115F)
        (#x2329 . #x232A)
        (#x2E80 . #x303E)
        (#x3040 . #xA4CF)
        (#xAC00 . #xD7A3)
        (#xF900 . #xFAFF)
        (#xFE30 . #xFE6F)
        (#xFF01 . #xFF60)
        (#xFFE0 . #xFFE6)
        (#x20000 . #x2FFFF)
        (#x30000 . #x3FFFF)))
1014
;; Other double-width characters.
;; Equivalent calls for the `ethiopic' and `tibetan' charsets are left
;; commented out below:
;;(map-charset-chars
;; (lambda (range ignore) (set-char-table-range char-width-table range 2))
;; 'ethiopic)
;; (map-charset-chars
;; (lambda (range ignore) (set-char-table-range char-width-table range 2))
;; 'tibetan)
;; Give width 2 to every character of each charset listed here.
(dolist (charset '(indian-2-column arabic-2-column))
  (map-charset-chars
   (lambda (range _ignore)
     (set-char-table-range char-width-table range 2))
   charset))
1028
(defvar cjk-char-width-table
  (let ((cjk-widths (make-char-table nil)))
    ;; Mark every character of the listed CJK charsets as two columns
    ;; wide.
    (mapc (lambda (charset)
            (map-charset-chars
             (lambda (range _arg)
               (set-char-table-range cjk-widths range 2))
             charset))
          '(big5 chinese-gb2312 chinese-cns11643-1
            japanese-jisx0208 korean-ksc5601))
    (optimize-char-table cjk-widths)
    ;; Anything not set above is inherited from the default width table.
    (set-char-table-parent cjk-widths char-width-table)
    cjk-widths)
  "Character width table used in CJK language environment.")
1040
(defun use-cjk-char-width-table ()
  "Make `cjk-char-width-table' the current `char-width-table'.
Internal use only.  This sets up the character widths appropriate
for a CJK language environment."
  (setq char-width-table cjk-char-width-table))
1045
(defun use-default-char-width-table ()
  "Make the parent of `cjk-char-width-table' the current `char-width-table'.
Internal use only.  This sets up the character widths appropriate
for a non-CJK language environment."
  (let ((default-widths (char-table-parent cjk-char-width-table)))
    (setq char-width-table default-widths)))
1050
;; Optimize the standard case, category and syntax tables, in that
;; order.
(dolist (table-getter '(standard-case-table
                        standard-category-table
                        standard-syntax-table))
  (optimize-char-table (funcall table-getter)))
1054
1055 \f
1056 ;; Setting char-script-table.
1057
;; Each entry below is (FROM TO SCRIPT): every code point in the
;; inclusive range FROM..TO is assigned the script symbol SCRIPT in
;; `char-script-table'.  The Unicode blocks actually extend past some
;; of these ranges with undefined codepoints.
;;
;; While filling the table, collect each distinct script symbol in
;; order of first appearance; that list is stored in extra slot 0 of
;; `char-script-table'.
(let ((script-list nil))
  (dolist
      (elt
       '((#x0000 #x007F latin)
         (#x00A0 #x024F latin)
         (#x0250 #x02AF phonetic)
         (#x02B0 #x036F latin)
         (#x0370 #x03E1 greek)
         (#x03E2 #x03EF coptic)
         (#x03F0 #x03F3 greek)
         (#x0400 #x04FF cyrillic)
         (#x0530 #x058F armenian)
         (#x0590 #x05FF hebrew)
         (#x0600 #x06FF arabic)
         (#x0700 #x074F syriac)
         (#x07C0 #x07FA nko)
         (#x0780 #x07BF thaana)
         (#x0900 #x097F devanagari)
         (#x0980 #x09FF bengali)
         (#x0A00 #x0A7F gurmukhi)
         (#x0A80 #x0AFF gujarati)
         (#x0B00 #x0B7F oriya)
         (#x0B80 #x0BFF tamil)
         (#x0C00 #x0C7F telugu)
         (#x0C80 #x0CFF kannada)
         (#x0D00 #x0D7F malayalam)
         (#x0D80 #x0DFF sinhala)
         (#x0E00 #x0E5F thai)
         (#x0E80 #x0EDF lao)
         (#x0F00 #x0FFF tibetan)
         (#x1000 #x105F myanmar)
         (#x10A0 #x10FF georgian)
         (#x1100 #x11FF hangul)
         (#x1200 #x139F ethiopic)
         (#x13A0 #x13FF cherokee)
         (#x1400 #x167F canadian-aboriginal)
         (#x1680 #x169F ogham)
         (#x16A0 #x16FF runic)
         (#x1780 #x17FF khmer)
         (#x1800 #x18AF mongolian)
         (#x1D00 #x1DFF phonetic)
         (#x1E00 #x1EFF latin)
         (#x1F00 #x1FFF greek)
         (#x2000 #x27FF symbol)
         (#x2800 #x28FF braille)
         (#x2D80 #x2DDF ethiopic)
         (#x2E80 #x2FDF han)
         (#x2FF0 #x2FFF ideographic-description)
         (#x3000 #x303F cjk-misc)
         (#x3040 #x30FF kana)
         (#x3100 #x312F bopomofo)
         (#x3130 #x318F hangul)
         (#x3190 #x319F kanbun)
         (#x31A0 #x31BF bopomofo)
         (#x3400 #x9FAF han)
         (#xA000 #xA4CF yi)
         (#xAA00 #xAA5F cham)
         (#xAA80 #xAADF tai-viet)
         (#xAC00 #xD7AF hangul)
         (#xF900 #xFAFF han)
         (#xFB1D #xFB4F hebrew)
         (#xFB50 #xFDFF arabic)
         (#xFE70 #xFEFC arabic)
         (#xFF00 #xFF5F cjk-misc)
         (#xFF61 #xFF9F kana)
         (#xFFE0 #xFFE6 cjk-misc)
         (#x10000 #x100FF linear-b)
         (#x10100 #x1013F aegean-number)
         (#x10140 #x1018A ancient-greek-number)
         (#x10190 #x1019B ancient-symbol)
         (#x101D0 #x101FF phaistos-disc)
         (#x10280 #x1029F lycian)
         (#x102A0 #x102DF carian)
         ;; U+10300..U+1032F is the Old Italic block; the symbol was
         ;; previously misspelled `olt-italic'.
         (#x10300 #x1032F old-italic)
         (#x10380 #x1039F ugaritic)
         (#x103A0 #x103DF old-persian)
         (#x10400 #x1044F deseret)
         (#x10450 #x1047F shavian)
         (#x10480 #x104AF osmanya)
         (#x10800 #x1083F cypriot-syllabary)
         (#x10900 #x1091F phoenician)
         (#x10920 #x1093F lydian)
         (#x10A00 #x10A5F kharoshthi)
         (#x12000 #x123FF cuneiform)
         (#x12400 #x1247F cuneiform-numbers-and-punctuation)
         (#x1D000 #x1D0FF byzantine-musical-symbol)
         (#x1D100 #x1D1FF musical-symbol)
         (#x1D200 #x1D24F ancient-greek-musical-notation)
         (#x1D300 #x1D35F tai-xuan-jing-symbol)
         (#x1D360 #x1D37F counting-rod-numeral)
         (#x1D400 #x1D7FF mathematical)
         (#x1F000 #x1F02F mahjong-tile)
         (#x1F030 #x1F09F domino-tile)
         (#x20000 #x2AFFF han)
         (#x2F800 #x2FFFF han)))
    (let ((from (car elt))
          (to (nth 1 elt))
          (script (nth 2 elt)))
      (set-char-table-range char-script-table (cons from to) script)
      ;; Record each script only once, newest first; the list is
      ;; reversed below to restore first-appearance order.
      (unless (memq script script-list)
        (setq script-list (cons script script-list)))))
  (set-char-table-extra-slot char-script-table 0 (nreverse script-list)))
1160
;; Tag every character of the `tibetan' charset with the script
;; symbol `tibetan'.
(map-charset-chars
 (lambda (char-range _unused)
   (set-char-table-range char-script-table char-range 'tibetan))
 'tibetan)
1165
1166 \f
1167 ;;; Setting unicode-category-table.
1168
;; This macro is to build unicode-category-table at compile time so
;; that C code can access the table efficiently.
(defmacro build-unicode-category-table ()
  "Build and return a char-table of Unicode general categories.
The table is computed while the macro is expanded, and the macro
expands to the finished table itself.

Code points below #xD800, in #xF900..#x2FFFF and in #xE0000..#xE01FF
get the value of their `general-category' character property.  The
ranges #xE000..#xF8FF, #xF0000..#xFFFFD and #x100000..#x10FFFD are
set to `Co'.  All other code points are left nil."
  (let ((table (make-char-table 'unicode-category-table nil)))
    (dotimes (code #x110000)
      ;; The lower bounds are inclusive: U+F900 is an assigned CJK
      ;; compatibility ideograph and must not be skipped.
      (when (or (< code #xD800)
                (and (>= code #xF900) (< code #x30000))
                (and (>= code #xE0000) (< code #xE0200)))
        (aset table code (get-char-code-property code 'general-category))))
    (set-char-table-range table '(#xE000 . #xF8FF) 'Co)
    (set-char-table-range table '(#xF0000 . #xFFFFD) 'Co)
    (set-char-table-range table '(#x100000 . #x10FFFD) 'Co)
    (optimize-char-table table 'eq)
    table))

(setq unicode-category-table (build-unicode-category-table))
1185
1186 \f
1187 ;;; Setting word boundary.
1188
;; Each element is a pair (CAT1 . CAT2) of category characters, or
;; nil in place of one of them.
(setq word-combining-categories
      (list (cons nil ?^)
            (cons ?^ nil)
            (cons ?C ?H)
            (cons ?C ?K)))
1194
;; Applies to the 2-byte character sets.
(setq word-separating-categories
      (list (cons ?H ?K)))              ; Hiragana - Katakana
1198
1199 ;; Local Variables:
1200 ;; coding: utf-8
1201 ;; End:
1202
1203 ;; arch-tag: 85889c35-9f4d-4912-9bf5-82de31b0d42d
1204 ;;; characters.el ends here