]> code.delx.au - gnu-emacs/blob - lisp/international/characters.el
*** empty log message ***
[gnu-emacs] / lisp / international / characters.el
1 ;;; characters.el --- set syntax and category for multibyte characters
2
3 ;; Copyright (C) 1997, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007
4 ;; Free Software Foundation, Inc.
5 ;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
6 ;; 2005, 2006, 2007
7 ;; National Institute of Advanced Industrial Science and Technology (AIST)
8 ;; Registration Number H14PRO021
9 ;; Copyright (C) 2003
10 ;; National Institute of Advanced Industrial Science and Technology (AIST)
11 ;; Registration Number H13PRO009
12
13 ;; Keywords: multibyte character, character set, syntax, category
14
15 ;; This file is part of GNU Emacs.
16
17 ;; GNU Emacs is free software; you can redistribute it and/or modify
18 ;; it under the terms of the GNU General Public License as published by
19 ;; the Free Software Foundation; either version 3, or (at your option)
20 ;; any later version.
21
22 ;; GNU Emacs is distributed in the hope that it will be useful,
23 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
24 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25 ;; GNU General Public License for more details.
26
27 ;; You should have received a copy of the GNU General Public License
28 ;; along with GNU Emacs; see the file COPYING. If not, write to the
29 ;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
30 ;; Boston, MA 02110-1301, USA.
31
32 ;;; Commentary:
33
34 ;;; Code:
35
36 ;;; Predefined categories.
37
38 ;; For each character set.
39
;; Register every predefined category from one table of
;; (MNEMONIC . DOCSTRING) pairs.  The table keeps the original grouping
;; and order, so the categories are defined in the same sequence.
(dolist (category-spec
         '(;; For each character set.
           (?a . "ASCII graphic characters 32-126 (ISO646 IRV:1983[4/0])")
           (?l . "Latin")
           (?t . "Thai")
           (?g . "Greek")
           (?b . "Arabic")
           (?w . "Hebrew")
           (?y . "Cyrillic")
           (?k . "Japanese katakana")
           (?r . "Japanese roman")
           (?c . "Chinese")
           (?j . "Japanese")
           (?h . "Korean")
           (?e . "Ethiopic (Ge'ez)")
           (?v . "Vietnamese")
           (?i . "Indian")
           (?o . "Lao")
           (?q . "Tibetan")

           ;; For each group (row) of 2-byte character sets.
           (?A . "Alpha-numeric characters of 2-byte character sets")
           (?C . "Chinese (Han) characters of 2-byte character sets")
           (?G . "Greek characters of 2-byte character sets")
           (?H . "Japanese Hiragana characters of 2-byte character sets")
           (?K . "Japanese Katakana characters of 2-byte character sets")
           (?N . "Korean Hangul characters of 2-byte character sets")
           (?Y . "Cyrillic characters of 2-byte character sets")
           (?I . "Indian Glyphs")

           ;; For phonetic classifications.
           (?0 . "consonant")
           (?1 . "base (independent) vowel")
           (?2 . "upper diacritical mark (including upper vowel)")
           (?3 . "lower diacritical mark (including lower vowel)")
           (?4 . "combining tone mark")
           (?5 . "symbol")
           (?6 . "digit")
           (?7 . "vowel-modifying diacritical mark")
           (?8 . "vowel-signs")
           (?9 . "semivowel lower")

           ;; For filling.
           (?| . "While filling, we can break a line at this character.")

           ;; For indentation calculation.
           (?\s . "This character counts as a space for indentation purposes.")

           ;; Keep the following for `kinsoku' processing.  See comments
           ;; in kinsoku.el.
           (?> . "A character which can't be placed at beginning of line.")
           (?< . "A character which can't be placed at end of line.")

           ;; Combining
           (?^ . "Combining diacritic or mark")))
  (define-category (car category-spec) (cdr category-spec)))
96 \f
97 ;;; Setting syntax and category.
98
99 ;; ASCII
100
101 ;; All ASCII characters have the category `a' (ASCII) and `l' (Latin).
;; Each entry is (RANGE CATEGORY...): every listed category is added to
;; every character in RANGE.
(dolist (entry
         '(;; ASCII: both `a' (ASCII) and `l' (Latin).
           ((32 . 127) ?a ?l)

           ;; Deal with the CJK charsets first.  Since the syntax of
           ;; blocks is defined per charset, and the charsets may contain
           ;; e.g. Latin characters, we end up with the wrong syntax
           ;; definitions if we're not careful.

           ;; Chinese characters (Unicode)
           ((#x2E80 . #x312F) ?|)
           ((#x3190 . #x33FF) ?|)
           ((#x3400 . #x9FAF) ?C ?c ?|)
           ((#xF900 . #xFAFF) ?C ?c ?|)
           ((#x20000 . #x2AFFF) ?|)
           ((#x2F800 . #x2FFFF) ?|)))
  (let ((range (car entry)))
    (dolist (category (cdr entry))
      (modify-category-entry range category))))
121
122
123 ;; Chinese character set (GB2312)
124
;; Symbol syntax for three code ranges of GB2312.
(dolist (codes '((#x2121 . #x217E) (#x2221 . #x227E) (#x2921 . #x297E)))
  (map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_"
                     (car codes) (cdr codes)))

;; The whole charset is `c'; individual code ranges get a more specific
;; category on top of that.
(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?c)
(dolist (spec '((?A #x2330 #x2339)
                (?A #x2341 #x235A)
                (?A #x2361 #x237A)
                (?H #x2421 #x247E)
                (?K #x2521 #x257E)
                (?G #x2621 #x267E)
                (?Y #x2721 #x277E)
                (?C #x3021 #x7E7E)))
  (map-charset-chars #'modify-category-entry 'chinese-gb2312
                     (nth 0 spec) (nth 1 spec) (nth 2 spec)))

;; Chinese character set (BIG5)

(map-charset-chars #'modify-category-entry 'big5 ?c)
(dolist (codes '((#xA259 . #xA25F) (#xA440 . #xC67E) (#xC940 . #xF9DF)))
  (map-charset-chars #'modify-category-entry 'big5 ?C
                     (car codes) (cdr codes)))

;; Chinese character set (CNS11643)

;; Every plane is `c'.  Plane 1 gets `C' only from code #x4421 up; the
;; other planes get `C' for all of their characters.
(dolist (plane '(chinese-cns11643-1 chinese-cns11643-2 chinese-cns11643-3
                 chinese-cns11643-4 chinese-cns11643-5 chinese-cns11643-6
                 chinese-cns11643-7))
  (map-charset-chars #'modify-category-entry plane ?c)
  (apply #'map-charset-chars #'modify-category-entry plane ?C
         (if (eq plane 'chinese-cns11643-1)
             '(#x4421 #x7E7E))))
155
156 ;; Japanese character set (JISX0201, JISX0208, JISX0212, JISX0213)
157
(map-charset-chars #'modify-category-entry 'katakana-jisx0201 ?k)

(map-charset-chars #'modify-category-entry 'latin-jisx0201 ?r)

;; All of these charsets are `j' (Japanese).
(mapc (lambda (charset)
        (map-charset-chars #'modify-category-entry charset ?j))
      '(katakana-jisx0201 japanese-jisx0208 japanese-jisx0212
        japanese-jisx0213-1 japanese-jisx0213-2))

;; Each entry is (RANGE CATEGORY...).
(dolist (entry
         '(;; Unicode equivalents of JISX0201-kana
           ((#xff61 . #xff9f) ?k ?j ?\|)
           ;; Katakana block
           ;; ?K is double width, ?k isn't specified
           ((#x30a0 . #x30ff) ?K ?\|)
           ;; Hiragana block
           ;; ?H is actually defined to be double width, so it is
           ;; deliberately not set here; only the line-break category is.
           ((#x3040 . #x309d) ?\|)))
  (let ((range (car entry)))
    (dolist (category (cdr entry))
      (modify-category-entry range category))))
184
185 ;; JISX0208
;; Give symbol syntax to the JISX0208 code ranges #x2121-#x227E and
;; #x2821-#x287E.
(map-charset-chars #'modify-syntax-entry 'japanese-jisx0208 "_" #x2121 #x227E)
(map-charset-chars #'modify-syntax-entry 'japanese-jisx0208 "_" #x2821 #x287E)
;; Then give word syntax to each of the characters listed below.  The
;; loop must use the loop variable: passing `(car chars)' here would
;; change only the first character, once per iteration.
(dolist (elt '(?ー ?゛ ?゜ ?ヽ ?ヾ ?ゝ ?ゞ ?〃 ?仝 ?々 ?〆 ?〇))
  (modify-syntax-entry elt "w"))
191
;; Per-range categories for JISX0208, as (CATEGORY FROM-CODE TO-CODE).
(dolist (spec '((?A #x2321 #x237E)
                (?H #x2421 #x247E)
                (?K #x2521 #x257E)
                (?G #x2621 #x267E)
                (?Y #x2721 #x277E)
                (?C #x3021 #x7E7E)))
  (map-charset-chars #'modify-category-entry 'japanese-jisx0208
                     (nth 0 spec) (nth 1 spec) (nth 2 spec)))
;; Individual characters that also belong to one of those categories.
(modify-category-entry ?ー ?K)
(dolist (mark '(?゛ ?゜))
  (modify-category-entry mark ?K)
  (modify-category-entry mark ?H))
(dolist (ch '(?ヽ ?ヾ ?ゝ ?ゞ ?〃 ?仝 ?々 ?〆 ?〇))
  (modify-category-entry ch ?C))
208
209 ;; JISX0212
210
;; Symbol syntax for the JISX0212 code range #x2121-#x237E.
(map-charset-chars #'modify-syntax-entry 'japanese-jisx0212 "_" #x2121 #x237E)

;; JISX0201-Kana

;; Punctuation syntax for the full stop, comma and middle dot.
(dolist (ch '(?。 ?、 ?・))
  (modify-syntax-entry ch "."))

;; The corner brackets are a matched pair.  The opening bracket gets
;; open-parenthesis syntax and the closing bracket gets
;; close-parenthesis syntax (class `)'), each naming the other as its
;; partner.  Giving the closing bracket class `(' would make both
;; characters openers.
(modify-syntax-entry ?\「 "(」")
(modify-syntax-entry ?\」 ")「")
222
223 ;; Korean character set (KSC5601)
224
;; The whole charset is `h' (Korean).
(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?h)

;; Code ranges that get symbol syntax.
(dolist (codes '((#x2121 . #x227E)
                 (#x2621 . #x277E)
                 (#x2830 . #x287E)
                 (#x2930 . #x297E)))
  (map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_"
                     (car codes) (cdr codes)))
;; Per-range categories, as (CATEGORY FROM-CODE TO-CODE).
(dolist (spec '((?A #x2330 #x2339)
                (?A #x2341 #x235A)
                (?A #x2361 #x237A)
                (?G #x2521 #x257E)
                (?H #x2A21 #x2A7E)
                (?K #x2B21 #x2B7E)
                (?Y #x2C21 #x2C7E)))
  (map-charset-chars #'modify-category-entry 'korean-ksc5601
                     (nth 0 spec) (nth 1 spec) (nth 2 spec)))
238
239 ;; These are in more than one charset.
;; PARENS is a flat string of opener/closer pairs.  Walk it two
;; characters at a time and make each pair match each other.
(let* ((parens (concat "〈〉《》「」『』【】〔〕〖〗〘〙〚〛"
                       "︵︶︷︸︹︺︻︼︽︾︿﹀﹁﹂﹃﹄"
                       "()[]{}"))
       ;; Only complete pairs are processed.
       (limit (* 2 (/ (length parens) 2)))
       (idx 0))
  (while (< idx limit)
    (let ((opener (aref parens idx))
          (closer (aref parens (1+ idx))))
      (modify-syntax-entry opener (string ?\( closer))
      (modify-syntax-entry closer (string ?\) opener)))
    (setq idx (+ idx 2))))
249
250 ;; Arabic character set
251
;; Every character of the Arabic charsets, and of the three Unicode
;; ranges below, is `b' (Arabic).
(dolist (charset '(arabic-iso8859-6
                   arabic-digit
                   arabic-1-column
                   arabic-2-column))
  (map-charset-chars #'modify-category-entry charset ?b))
(dolist (range '((#x600 . #x6ff)
                 (#xfb50 . #xfdff)
                 (#xfe70 . #xfefe)))
  (modify-category-entry range ?b))

;; Cyrillic character set (ISO-8859-5)

(modify-syntax-entry ?№ ".")
266
267 ;; Ethiopic character set
268
;; Both Ethiopic ranges are category `e'.
(modify-category-entry '(#x1200 . #x1399) ?e)
(modify-category-entry '(#x2d80 . #x2dde) ?e)
;; Punctuation syntax for the Ethiopic punctuation marks.
;; NOTE(review): this list used to continue with six more characters
;; that are no longer present in this copy of the file.  What was left
;; were bare `?' tokens, which read as the space character and, for the
;; last one, as `)', unbalancing the form.  They have been dropped;
;; restore the real characters from a pristine copy if they are needed.
(dolist (ch '(?፡ ?። ?፣ ?፤ ?፥ ?፦ ?፧ ?፨))
  (modify-syntax-entry ch "."))
(map-charset-chars #'modify-category-entry 'ethiopic ?e)
276
277 ;; Hebrew character set (ISO-8859-8)
278
;; Hebrew marks that get punctuation syntax.
(dolist (ch '(#x5be                     ; MAQAF
              #x5c0                     ; PASEQ
              #x5c3                     ; SOF PASUQ
              #x5f3                     ; GERESH
              #x5f4))                   ; GERSHAYIM
  (modify-syntax-entry ch "."))

;; Indian character set (IS 13194 and other Emacs original Indian charsets)

(modify-category-entry '(#x901 . #x970) ?i)
(dolist (charset '(indian-is13194 indian-2-column))
  (map-charset-chars #'modify-category-entry charset ?i))

;; Lao character set

(modify-category-entry '(#xe80 . #xeff) ?o)
(map-charset-chars #'modify-category-entry 'lao ?o)
295
;; Each entry is (CHARS SYNTAX CATEGORY).  CHARS lists single characters
;; and `X-Y' ranges.  Every character named gets CATEGORY, and also
;; SYNTAX unless SYNTAX is "w".
(let (ch to)
  (dolist (elm '(("ກ-ຮ" "w" ?0)         ; consonant
                 ("ະາຳຽເ-ໄ" "w" ?1)      ; vowel base
                 ("ັິ-ືົໍ" "w" ?2)        ; vowel upper
                 ("ຸູ" "w" ?3)           ; vowel lower
                 ("່-໋" "w" ?4)          ; tone mark
                 ("ຼຽ" "w" ?9)           ; semivowel lower
                 ("໐-໙" "w" ?6)         ; digit
                 ("ຯໆ" "_" ?5)))         ; symbol
    (let* ((chars (nth 0 elm))
           (syntax (nth 1 elm))
           (category (nth 2 elm))
           (len (length chars))
           (i 0))
      (while (< i len)
        (if (= (aref chars i) ?-)
            ;; A hyphen extends the previous character into a range
            ;; ending at the character after the hyphen.  CH already
            ;; points one past the previous character.
            (setq i (1+ i)
                  to (aref chars i))
          (setq ch (aref chars i)
                to ch))
        (while (<= ch to)
          (unless (string-equal syntax "w")
            (modify-syntax-entry ch syntax))
          (modify-category-entry ch category)
          (setq ch (1+ ch)))
        (setq i (1+ i))))))
326
327 ;; Thai character set (TIS620)
328
;; The Thai block and the TIS620 charset are category `t'.
(modify-category-entry '(#xe00 . #xe7f) ?t)
(map-charset-chars #'modify-category-entry 'thai-tis620 ?t)

;; Each entry is (CHARS SYNTAX CATEGORY).  CHARS lists single characters
;; and `X-Y' ranges.  Every character named gets CATEGORY, and also
;; SYNTAX unless SYNTAX is "w".
(let (ch to)
  (dolist (elm '(("ก-รลว-ฮ" "w" ?0)     ; consonant
                 ("ฤฦะาำเ-ๅ" "w" ?1)     ; vowel base
                 ("ัิ-ื็๎" "w" ?2)        ; vowel upper
                 ("ุ-ฺ" "w" ?3)          ; vowel lower
                 ("่-ํ" "w" ?4)          ; tone mark
                 ("๐-๙" "w" ?6)         ; digit
                 ("ฯๆ฿๏๚๛" "_" ?5)))     ; symbol
    (let* ((chars (nth 0 elm))
           (syntax (nth 1 elm))
           (category (nth 2 elm))
           (len (length chars))
           (i 0))
      (while (< i len)
        (if (= (aref chars i) ?-)
            ;; A hyphen extends the previous character into a range
            ;; ending at the character after the hyphen.
            (setq i (1+ i)
                  to (aref chars i))
          (setq ch (aref chars i)
                to ch))
        (while (<= ch to)
          (unless (string-equal syntax "w")
            (modify-syntax-entry ch syntax))
          (modify-category-entry ch category)
          (setq ch (1+ ch)))
        (setq i (1+ i))))))
362
363 ;; Tibetan character set
364
;; The Tibetan block and both Tibetan charsets are category `q'.
(modify-category-entry '(#xf00 . #xfff) ?q)
(map-charset-chars #'modify-category-entry 'tibetan ?q)
(map-charset-chars #'modify-category-entry 'tibetan-1-column ?q)

;; Each entry is (CHARS SYNTAX CATEGORY).  CHARS lists single characters
;; and `X-Y' ranges.  Every character named gets CATEGORY, and also
;; SYNTAX unless SYNTAX is "w".
;;
;; NOTE(review): two further consonant entries used to follow the second
;; one below.  In this copy of the file their range endpoints are
;; missing, leaving the bare string "-".  The range parser would then
;; read past the end of the string, so those entries have been dropped.
;; Restore them from a pristine copy if they are needed.
(let (ch to)
  (dolist (elm '(;; chars syntax category
                 ("ཀ-ཀྵཪ" "w" ?0)        ; consonant
                 ("ྐ-ྐྵྺྻྼ" "w" ?0)       ;
                 ("ིེཻོཽྀ" "w" ?2)         ; upper vowel
                 ("ཾྂྃ྆྇ྈྉྊྋ" "w" ?2)      ; upper modifier
                 ("྄ཱུ༙༵༷" "w" ?3)         ; lower vowel/modifier
                 ("཰" "w" ?3)           ; invisible vowel a
                 ("༠-༩༪-༳" "w" ?6)      ; digit
                 ("་།-༒༔ཿ" "." ?|)       ; line-break char
                 ("་།༏༐༑༔ཿ" "." ?|)      ;
                 ("༈་།-༒༔ཿ༽༴" "." ?>)    ; prohibition
                 ("་།༏༐༑༔ཿ" "." ?>)      ;
                 ("ༀ-༊༼࿁࿂྅" "." ?<)      ; prohibition
                 ("༓༕-༘༚-༟༶༸-༻༾༿྾྿-࿏" "." ?q))) ; others
    (let* ((chars (nth 0 elm))
           (syntax (nth 1 elm))
           (category (nth 2 elm))
           (len (length chars))
           (i 0))
      (while (< i len)
        (if (= (aref chars i) ?-)
            ;; A hyphen extends the previous character into a range
            ;; ending at the character after the hyphen.
            (setq i (1+ i)
                  to (aref chars i))
          (setq ch (aref chars i)
                to ch))
        (while (<= ch to)
          (unless (string-equal syntax "w")
            (modify-syntax-entry ch syntax))
          (modify-category-entry ch category)
          (setq ch (1+ ch)))
        (setq i (1+ i))))))
407
408 ;; Vietnamese character set
409
410 ;; To make a word with Latin characters
;; Both VISCII halves are `l' (Latin) as well as `v' (Vietnamese).
(dolist (charset '(vietnamese-viscii-lower vietnamese-viscii-upper))
  (dolist (category '(?l ?v))
    (map-charset-chars #'modify-category-entry charset category)))

;; For each code 32..127, pair the character of the upper charset with
;; the character of the lower charset at the same code, and mark the
;; value `encode-char' gives for each in `ucs' (when non-nil) as `v'.
(let ((tbl (standard-case-table)))
  (dotimes (offset 96)
    (let* ((code (+ 32 offset))
           (upper (decode-char 'vietnamese-viscii-upper code))
           (lower (decode-char 'vietnamese-viscii-lower code))
           (upper-ucs (encode-char upper 'ucs))
           (lower-ucs (encode-char lower 'ucs)))
      (set-case-syntax-pair upper lower tbl)
      (when upper-ucs
        (modify-category-entry upper-ucs ?v))
      (when lower-ucs
        (modify-category-entry lower-ucs ?v)))))
429
430 ;; Tai Viet
;; Each entry is (CHARS SYNTAX CATEGORY), where CHARS is either a
;; (FROM . TO) range or a string of individual characters.
(dolist (elm '(;; chars syntax category
               ((?ꪀ . ?ꪯ) "w" ?0)       ; consonant
               ("ꪱꪵꪶ" "w" ?1)          ; vowel base
               ((?ꪹ . ?ꪽ) "w" ?1)       ; vowel base
               ("ꪰꪲꪳꪷꪸꪾ" "w" ?2)       ; vowel upper
               ("ꪴ" "w" ?3)            ; vowel lower
               ("ꫀꫂ" "w" ?1)           ; non-combining tone-mark
               ("꪿꫁" "w" ?4)           ; combining tone-mark
               ((?ꫛ . ?꫟) "_" ?5)))     ; symbol
  (let ((chars (nth 0 elm))
        (syntax (nth 1 elm))
        (category (nth 2 elm)))
    ;; A range is handled as one target; a string is split into one
    ;; target per character.
    (dolist (target (if (consp chars)
                        (list chars)
                      (append chars nil)))
      (modify-syntax-entry target syntax)
      (modify-category-entry target category))))
453
454 ;; Latin
455
;; Everything from #x80 through #x024F is category `l' (Latin).
(let ((latin-range (cons #x80 #x024F)))
  (modify-category-entry latin-range ?l))
457
458 (let ((tbl (standard-case-table)) c)
459
460 ;; Latin-1
461
462 ;; Fixme: Some of the non-word syntaxes here perhaps should be
463 ;; reviewed. (Note that the following all implicitly have word
464 ;; syntax: ¢£¤¥¨ª¯²³´¶¸¹º.) There should be a well-defined way of
465 ;; relating Unicode categories to Emacs syntax codes.
466
;; NBSP isn't semantically interchangeable with other whitespace chars,
;; so it's more like punctuation.
;;
;; Characters are given by code point, like the ranges further down, so
;; that invisible ones (no-break space, soft hyphen) cannot be mistaken
;; for, or silently turned into, an ordinary space.
(dolist (spec '((#xA0 . ".")            ; NO-BREAK SPACE
                (#xA1 . ".")            ; INVERTED EXCLAMATION MARK
                (#xA6 . "_")            ; BROKEN BAR
                (#xA7 . ".")            ; SECTION SIGN
                (#xA9 . "_")))          ; COPYRIGHT SIGN
  (set-case-syntax (car spec) (cdr spec) tbl))
(set-case-syntax-delims 171 187 tbl)    ; left/right double angle quotes
(dolist (spec '((#xAC . "_")            ; NOT SIGN
                (#xAD . "_")            ; SOFT HYPHEN
                (#xAE . "_")            ; REGISTERED SIGN
                (#xB0 . "_")            ; DEGREE SIGN
                (#xB1 . "_")            ; PLUS-MINUS SIGN
                (#xB5 . "_")            ; MICRO SIGN
                (#xB7 . "_")            ; MIDDLE DOT
                (#xBC . "_")            ; VULGAR FRACTION ONE QUARTER
                (#xBD . "_")            ; VULGAR FRACTION ONE HALF
                (#xBE . "_")            ; VULGAR FRACTION THREE QUARTERS
                (#xBF . ".")))          ; INVERTED QUESTION MARK
  (set-case-syntax (car spec) (cdr spec) tbl))
;; Pair every code 192..222 with the code 32 above it.  This sweep also
;; pairs 215 with 247; both are reset to plain symbols right after it.
(let ((c 192))
  (while (<= c 222)
    (set-case-syntax-pair c (+ c 32) tbl)
    (setq c (1+ c))))
(set-case-syntax #xD7 "_" tbl)          ; MULTIPLICATION SIGN
(set-case-syntax #xDF "w" tbl)          ; LATIN SMALL LETTER SHARP S
(set-case-syntax #xF7 "_" tbl)          ; DIVISION SIGN
493 ;; See below for ÿ.
494
495 ;; Latin Extended-A, Latin Extended-B
;; Everything from U+0100 through U+02B8 is category `l'.
(modify-category-entry '(#x0100 . #x02B8) ?l)

;; In each (FIRST . LAST) range below the letters alternate between
;; upper and lower case, so pair FIRST with FIRST+1, FIRST+2 with
;; FIRST+3, and so on.
(dolist (range '((#x0100 . #x012F)
                 (#x0132 . #x0137)
                 (#x0139 . #x0148)
                 (#x014a . #x0177)
                 (#x0179 . #x017E)
                 (#x0182 . #x0185)
                 (#x0187 . #x018C)
                 (#x0191 . #x0192)
                 (#x0198 . #x0199)
                 (#x01A0 . #x01A5)
                 (#x01A7 . #x01A8)
                 (#x01AC . #x01AD)
                 (#x01AF . #x01B0)
                 (#x01B3 . #x01B6)
                 (#x01BC . #x01BD)
                 (#x01CD . #x01DC)
                 (#x01DE . #x01EF)
                 (#x01F4 . #x01F5)
                 (#x01F8 . #x021F)
                 (#x0222 . #x0233)
                 (#x023B . #x023C)
                 (#x0241 . #x0242)
                 (#x0246 . #x024F)))
  (let ((upper (car range))
        (last (cdr range)))
    (while (< upper last)
      (set-case-syntax-pair upper (1+ upper) tbl)
      (setq upper (+ upper 2)))))
529
530 ;; In some languages, such as Turkish, U+0049 LATIN CAPITAL LETTER I
531 ;; and U+0131 LATIN SMALL LETTER DOTLESS I make a case pair, and so
532 ;; do U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE and U+0069 LATIN
533 ;; SMALL LETTER I.
534
535 ;; We used to set up half of those correspondence unconditionally,
536 ;; but that makes searches slow. So now we don't set up either half
537 ;; of these correspondences by default.
538
539 ;; (set-downcase-syntax ?İ ?i tbl)
540 ;; (set-upcase-syntax ?I ?ı tbl)
541
;; Digraph letters.  Each has an all-capitals form, a titlecase form and
;; a small form; both capital forms are paired with the small form.
;; They are written as code points because every one of them is a
;; single character: spelling one out as two letters (e.g. `D' followed
;; by `Z WITH CARON') is not a valid character literal.
(set-case-syntax-pair #x01C4 #x01C6 tbl) ; LATIN CAPITAL LETTER DZ WITH CARON
(set-case-syntax-pair #x01C5 #x01C6 tbl) ; ... CAPITAL D WITH SMALL Z WITH CARON
(set-case-syntax-pair #x01C7 #x01C9 tbl) ; LATIN CAPITAL LETTER LJ
(set-case-syntax-pair #x01C8 #x01C9 tbl) ; ... CAPITAL L WITH SMALL J
(set-case-syntax-pair #x01CA #x01CC tbl) ; LATIN CAPITAL LETTER NJ
(set-case-syntax-pair #x01CB #x01CC tbl) ; ... CAPITAL N WITH SMALL J

;; 01F0; F; 006A 030C; # LATIN SMALL LETTER J WITH CARON
(set-case-syntax-pair #x01F1 #x01F3 tbl) ; LATIN CAPITAL LETTER DZ
(set-case-syntax-pair #x01F2 #x01F3 tbl) ; ... CAPITAL D WITH SMALL Z
(set-case-syntax-pair #x01F6 #x0195 tbl) ; CAPITAL HWAIR / SMALL HV
(set-case-syntax-pair #x01F7 #x01BF tbl) ; CAPITAL WYNN / LETTER WYNN
554
555 ;; Latin Extended Additional
(modify-category-entry '(#x1e00 . #x1ef9) ?l)
;; Pair each even code point with the odd one after it, over the two
;; stretches U+1E00..U+1E95 and U+1EA0..U+1EF9.  Nothing between
;; U+1E96 and U+1E9F is paired.
(dolist (span '((#x1e00 . #x1e94) (#x1ea0 . #x1ef8)))
  (setq c (car span))
  (while (<= c (cdr span))
    (set-case-syntax-pair c (1+ c) tbl)
    (setq c (+ c 2))))
563
564 ;; Greek
(modify-category-entry '(#x0370 . #x03ff) ?g)
;; U+0391..U+03A1 and U+03A3..U+03AB are paired with the code point 32
;; above them.
(dolist (span '((#x0391 . #x03a1) (#x03a3 . #x03ab)))
  (setq c (car span))
  (while (<= c (cdr span))
    (set-case-syntax-pair c (+ c 32) tbl)
    (setq c (1+ c))))
;; From U+03DA through U+03EF each even code point is paired with the
;; odd one after it.
(setq c #x03da)
(while (<= c #x03ee)
  (set-case-syntax-pair c (1+ c) tbl)
  (setq c (+ c 2)))
;; Accented capitals paired individually with their small letters.
(dolist (pair '((?Ά . ?ά)
                (?Έ . ?έ)
                (?Ή . ?ή)
                (?Ί . ?ί)
                (?Ό . ?ό)
                (?Ύ . ?ύ)
                (?Ώ . ?ώ)))
  (set-case-syntax-pair (car pair) (cdr pair) tbl))

;; Armenian
;; U+0531..U+0556 are paired with the code point #x30 above them.
(dotimes (offset (1+ (- #x556 #x531)))
  (set-case-syntax-pair (+ #x531 offset) (+ #x531 #x30 offset) tbl))
589
590 ;; Greek Extended
(modify-category-entry '(#x1f00 . #x1fff) ?g)
;; Up to U+1FA7, a code point C whose low nibble is 0..7 is the small
;; form of C + 8.  Two groups are left out of this sweep:
;;  - U+1F50, U+1F52, U+1F54 and U+1F56;
;;  - the whole column U+1F70..U+1F7F, whose capitals are listed
;;    individually below.  The column is selected by comparing the high
;;    nibble of the low byte against #x70.  Comparing it against 7 can
;;    never match, and would pair U+1F70..U+1F77 with U+1F78..U+1F7F.
(setq c #x1f00)
(while (<= c #x1fa7)
  (and (<= (logand c #x000f) 7)
       (not (memq c '(#x1f50 #x1f52 #x1f54 #x1f56)))
       (/= (logand c #x00f0) #x70)
       (set-case-syntax-pair (+ c 8) c tbl))
  (setq c (1+ c)))
;; Individually listed (CAPITAL . SMALL) pairs.  They are written as
;; code points because the letters with oxia are canonically equivalent
;; to the tonos letters of the basic Greek block, so a literal character
;; may be silently replaced by its equivalent when the file is
;; normalized.
(dolist (pair '((#x1FB8 . #x1FB0)       ; ALPHA WITH VRACHY
                (#x1FB9 . #x1FB1)       ; ALPHA WITH MACRON
                (#x1FBA . #x1F70)       ; ALPHA WITH VARIA
                (#x1FBB . #x1F71)       ; ALPHA WITH OXIA
                (#x1FBC . #x1FB3)       ; ALPHA WITH PROSGEGRAMMENI
                (#x1FC8 . #x1F72)       ; EPSILON WITH VARIA
                (#x1FC9 . #x1F73)       ; EPSILON WITH OXIA
                (#x1FCA . #x1F74)       ; ETA WITH VARIA
                (#x1FCB . #x1F75)       ; ETA WITH OXIA
                (#x1FCC . #x1FC3)       ; ETA WITH PROSGEGRAMMENI
                (#x1FD8 . #x1FD0)       ; IOTA WITH VRACHY
                (#x1FD9 . #x1FD1)       ; IOTA WITH MACRON
                (#x1FDA . #x1F76)       ; IOTA WITH VARIA
                (#x1FDB . #x1F77)       ; IOTA WITH OXIA
                (#x1FE8 . #x1FE0)       ; UPSILON WITH VRACHY
                (#x1FE9 . #x1FE1)       ; UPSILON WITH MACRON
                (#x1FEA . #x1F7A)       ; UPSILON WITH VARIA
                (#x1FEB . #x1F7B)       ; UPSILON WITH OXIA
                (#x1FEC . #x1FE5)       ; RHO WITH DASIA
                (#x1FF8 . #x1F78)       ; OMICRON WITH VARIA
                (#x1FF9 . #x1F79)       ; OMICRON WITH OXIA
                (#x1FFA . #x1F7C)       ; OMEGA WITH VARIA
                (#x1FFB . #x1F7D)       ; OMEGA WITH OXIA
                (#x1FFC . #x1FF3)))     ; OMEGA WITH PROSGEGRAMMENI
  (set-case-syntax-pair (car pair) (cdr pair) tbl))
624
625 ;; cyrillic
(modify-category-entry '(#x0400 . #x04FF) ?y)
;; (FIRST LAST DELTA): each code point in FIRST..LAST is paired with
;; the code point DELTA above it.
(dolist (span '((#x0400 #x040f 80)
                (#x0410 #x042f 32)))
  (let ((upper (nth 0 span))
        (last (nth 1 span))
        (delta (nth 2 span)))
    (while (<= upper last)
      (set-case-syntax-pair upper (+ upper delta) tbl)
      (setq upper (1+ upper)))))
;; In these ranges each even code point is paired with the odd one
;; after it.
(dolist (span '((#x0460 . #x0480)
                (#x048c . #x04be)
                (#x04d0 . #x04f4)))
  (let ((upper (car span)))
    (while (<= upper (cdr span))
      (set-case-syntax-pair upper (1+ upper) tbl)
      (setq upper (+ upper 2)))))
;; Individually listed pairs.
(dolist (pair '((?Ӂ . ?ӂ)
                (?Ӄ . ?ӄ)
                (?Ӈ . ?ӈ)
                (?Ӌ . ?ӌ)
                (?Ӹ . ?ӹ)))
  (set-case-syntax-pair (car pair) (cdr pair) tbl))
646
647 ;; general punctuation
;; (FIRST LAST SYNTAX): consecutive runs covering U+2000..U+206F, except
;; that nothing is set for codes between the second and third runs.
(dolist (run '((#x2000 #x200b " ")
               (#x200c #x200F ".")
               ;; Fixme: These aren't all right:
               (#x2010 #x2016 "_")
               ;; Punctuation syntax for quotation marks (like `)
               (#x2017 #x201f ".")
               ;; Fixme: These aren't all right:
               (#x2020 #x2027 "_")
               (#x2028 #x206F ".")))
  (let ((ch (nth 0 run))
        (last (nth 1 run))
        (syntax (nth 2 run)))
    (while (<= ch last)
      (set-case-syntax ch syntax tbl)
      (setq ch (1+ ch)))))
671
672 ;; Roman numerals
;; The sixteen code points from U+2160 are paired with the ones #x10
;; above them.
(dotimes (offset 16)
  (set-case-syntax-pair (+ #x2160 offset) (+ #x2170 offset) tbl))

;; Fixme: The following blocks might be better as symbol rather than
;; punctuation.
(dolist (span '((#x2190 #x21FF ".")     ; Arrows
                (#x2200 #x22FF ".")     ; Mathematical Operators
                (#x2300 #x23FF ".")     ; Miscellaneous Technical
                (#x2400 #x243F "_")))   ; Control Pictures
  (let ((ch (nth 0 span))
        (last (nth 1 span))
        (syntax (nth 2 span)))
    (while (<= ch last)
      (set-case-syntax ch syntax tbl)
      (setq ch (1+ ch)))))

;; (FIRST LAST DELTA): each code point in FIRST..LAST is paired with the
;; one DELTA above it, and both are marked as Latin.
(dolist (span '((#x24b6 #x24cf 26)      ; Circled Latin
                (#xff21 #xff3a #x20)))  ; Fullwidth Latin
  (let ((upper (nth 0 span))
        (last (nth 1 span))
        (delta (nth 2 span)))
    (while (<= upper last)
      (set-case-syntax-pair upper (+ upper delta) tbl)
      (modify-category-entry upper ?l)
      (modify-category-entry (+ upper delta) ?l)
      (setq upper (1+ upper)))))

;; Combining diacritics
(modify-category-entry '(#x300 . #x362) ?^)
;; Combining marks
(modify-category-entry '(#x20d0 . #x20e3) ?^)
718
719 ;; Fixme: syntax for symbols &c
720 )
721
;; Matching bracket pairs, as (OPENER . CLOSER) code points.  The opener
;; gets open-parenthesis syntax and the closer close-parenthesis syntax,
;; each naming the other as its partner.
;;
;; The pairs are given as code points rather than literal characters.
;; Several of them (U+2329/U+232A and the fullwidth and halfwidth forms
;; at the end) are equivalent to other characters under Unicode
;; normalization or width folding, so a literal can silently turn into
;; a different character, such as an ASCII parenthesis.
(dolist (pair '((#x2045 . #x2046)
                (#x207D . #x207E)
                (#x208D . #x208E)
                (#x2329 . #x232A)
                (#x23B4 . #x23B5)
                (#x2768 . #x2769)
                (#x276A . #x276B)
                (#x276C . #x276D)
                (#x2770 . #x2771)
                (#x2772 . #x2773)
                (#x2774 . #x2775)
                (#x27E6 . #x27E7)
                (#x27E8 . #x27E9)
                (#x27EA . #x27EB)
                (#x2983 . #x2984)
                (#x2985 . #x2986)
                (#x2987 . #x2988)
                (#x2989 . #x298A)
                (#x298B . #x298C)
                (#x298D . #x298E)
                (#x298F . #x2990)
                (#x2991 . #x2992)
                (#x2993 . #x2994)
                (#x2995 . #x2996)
                (#x2997 . #x2998)
                (#x29FC . #x29FD)
                (#x3008 . #x3009)
                (#x300A . #x300B)
                (#x300C . #x300D)
                (#x300E . #x300F)
                (#x3010 . #x3011)
                (#x3014 . #x3015)
                (#x3016 . #x3017)
                (#x3018 . #x3019)
                (#x301A . #x301B)
                (#xFD3E . #xFD3F)
                (#xFE35 . #xFE36)
                (#xFE37 . #xFE38)
                (#xFE39 . #xFE3A)
                (#xFE3B . #xFE3C)
                (#xFE3D . #xFE3E)
                (#xFE3F . #xFE40)
                (#xFE41 . #xFE42)
                (#xFE43 . #xFE44)
                (#xFE59 . #xFE5A)
                (#xFE5B . #xFE5C)
                (#xFE5D . #xFE5E)
                (#xFF08 . #xFF09)
                (#xFF3B . #xFF3D)
                (#xFF5B . #xFF5D)
                (#xFF5F . #xFF60)
                (#xFF62 . #xFF63)))
  (let ((opener (car pair))
        (closer (cdr pair)))
    (modify-syntax-entry opener (string ?\( closer))
    (modify-syntax-entry closer (string ?\) opener))))
779
780 \f
781 ;; For each character set, put the information of the most proper
782 ;; coding system to encode it by `preferred-coding-system' property.
783
784 ;; Fixme: should this be junked?
;; (CHARSET . CODING-SYSTEM): record CODING-SYSTEM as the
;; `preferred-coding-system' property of CHARSET.
(dolist (entry '((latin-iso8859-1 . iso-latin-1)
                 (latin-iso8859-2 . iso-latin-2)
                 (latin-iso8859-3 . iso-latin-3)
                 (latin-iso8859-4 . iso-latin-4)
                 (thai-tis620 . thai-tis620)
                 (greek-iso8859-7 . greek-iso-8bit)
                 (arabic-iso8859-6 . iso-2022-7bit)
                 (hebrew-iso8859-8 . hebrew-iso-8bit)
                 (katakana-jisx0201 . japanese-shift-jis)
                 (latin-jisx0201 . japanese-shift-jis)
                 (cyrillic-iso8859-5 . cyrillic-iso-8bit)
                 (latin-iso8859-9 . iso-latin-5)
                 (japanese-jisx0208-1978 . iso-2022-jp)
                 (chinese-gb2312 . chinese-iso-8bit)
                 (chinese-gbk . chinese-gbk)
                 (gb18030-2-byte . chinese-gb18030)
                 (gb18030-4-byte-bmp . chinese-gb18030)
                 (gb18030-4-byte-smp . chinese-gb18030)
                 (gb18030-4-byte-ext-1 . chinese-gb18030)
                 (gb18030-4-byte-ext-2 . chinese-gb18030)
                 (japanese-jisx0208 . iso-2022-jp)
                 (korean-ksc5601 . iso-2022-kr)
                 (japanese-jisx0212 . iso-2022-jp)
                 (chinese-big5-1 . chinese-big5)
                 (chinese-big5-2 . chinese-big5)
                 (chinese-sisheng . iso-2022-7bit)
                 (ipa . iso-2022-7bit)
                 (vietnamese-viscii-lower . vietnamese-viscii)
                 (vietnamese-viscii-upper . vietnamese-viscii)
                 (arabic-digit . iso-2022-7bit)
                 (arabic-1-column . iso-2022-7bit)
                 (lao . lao)
                 (arabic-2-column . iso-2022-7bit)
                 (indian-is13194 . devanagari)
                 (indian-glyph . devanagari)
                 (tibetan-1-column . tibetan)
                 (ethiopic . iso-2022-7bit)
                 (chinese-cns11643-1 . iso-2022-cn)
                 (chinese-cns11643-2 . iso-2022-cn)
                 (chinese-cns11643-3 . iso-2022-cn)
                 (chinese-cns11643-4 . iso-2022-cn)
                 (chinese-cns11643-5 . iso-2022-cn)
                 (chinese-cns11643-6 . iso-2022-cn)
                 (chinese-cns11643-7 . iso-2022-cn)
                 (indian-2-column . devanagari)
                 (tibetan . tibetan)
                 (latin-iso8859-14 . iso-latin-8)
                 (latin-iso8859-15 . iso-latin-9)))
  (put-charset-property (car entry) 'preferred-coding-system (cdr entry)))
836
837 \f
838 ;; Setup auto-fill-chars for charsets that should invoke auto-filling.
839 ;; SPACE and NEWLINE are already set.
840
;; Mark every character in these ranges as one that triggers
;; auto-filling.
(dolist (range '((#x3041 . #x30FF)
                 (#x3400 . #x4DB5)
                 (#x4e00 . #x9fbb)
                 (#xF900 . #xFAFF)
                 (#xFF00 . #xFF9F)
                 (#x20000 . #x2FFFF)))
  (set-char-table-range auto-fill-chars range t))
847
848 \f
849 ;;; Setting char-width-table. The default is 1.
850
851 ;; 0: non-spacing, enclosing combining, formatting, Hangul Jamo medial
852 ;; and final characters.
;; Every (FROM . TO) range below gets display width 0.  The ranges are
;; in ascending order, several to a line.
(mapc
 (lambda (range)
   (set-char-table-range char-width-table range 0))
 '((#x00AD . #x00AD) (#x0300 . #x036F) (#x0483 . #x0489) (#x0591 . #x05BD)
   (#x05BF . #x05BF) (#x05C1 . #x05C2) (#x05C4 . #x05C5) (#x05C7 . #x05C7)
   (#x0600 . #x0603) (#x0610 . #x0615) (#x064B . #x065E) (#x0670 . #x0670)
   (#x06D6 . #x06E4) (#x06E7 . #x06E8) (#x06EA . #x06ED) (#x070F . #x070F)
   (#x0711 . #x0711) (#x0730 . #x074A) (#x07A6 . #x07B0) (#x07EB . #x07F3)
   (#x0901 . #x0902) (#x093C . #x093C) (#x0941 . #x0948) (#x094D . #x094D)
   (#x0951 . #x0954) (#x0962 . #x0963) (#x0981 . #x0981) (#x09BC . #x09BC)
   (#x09C1 . #x09C4) (#x09CD . #x09CD) (#x09E2 . #x09E3) (#x0A01 . #x0A02)
   (#x0A3C . #x0A3C) (#x0A41 . #x0A4D) (#x0A70 . #x0A71) (#x0A81 . #x0A82)
   (#x0ABC . #x0ABC) (#x0AC1 . #x0AC8) (#x0ACD . #x0ACD) (#x0AE2 . #x0AE3)
   (#x0B01 . #x0B01) (#x0B3C . #x0B3C) (#x0B3F . #x0B3F) (#x0B41 . #x0B43)
   (#x0B4D . #x0B56) (#x0B82 . #x0B82) (#x0BC0 . #x0BC0) (#x0BCD . #x0BCD)
   (#x0C3E . #x0C40) (#x0C46 . #x0C56) (#x0CBC . #x0CBC) (#x0CBF . #x0CBF)
   (#x0CC6 . #x0CC6) (#x0CCC . #x0CCD) (#x0CE2 . #x0CE3) (#x0D41 . #x0D43)
   (#x0D4D . #x0D4D) (#x0DCA . #x0DCA) (#x0DD2 . #x0DD6) (#x0E31 . #x0E31)
   (#x0E34 . #x0E3A) (#x0E47 . #x0E4E) (#x0EB1 . #x0EB1) (#x0EB4 . #x0EBC)
   (#x0EC8 . #x0ECD) (#x0F18 . #x0F19) (#x0F35 . #x0F35) (#x0F37 . #x0F37)
   (#x0F39 . #x0F39) (#x0F71 . #x0F7E) (#x0F80 . #x0F84) (#x0F86 . #x0F87)
   (#x0F90 . #x0FBC) (#x0FC6 . #x0FC6) (#x102D . #x1030) (#x1032 . #x1037)
   (#x1039 . #x1039) (#x1058 . #x1059) (#x1160 . #x11FF) (#x135F . #x135F)
   (#x1712 . #x1714) (#x1732 . #x1734) (#x1752 . #x1753) (#x1772 . #x1773)
   (#x17B4 . #x17B5) (#x17B7 . #x17BD) (#x17C6 . #x17C6) (#x17C9 . #x17D3)
   (#x17DD . #x17DD) (#x180B . #x180D) (#x18A9 . #x18A9) (#x1920 . #x1922)
   (#x1927 . #x1928) (#x1932 . #x1932) (#x1939 . #x193B) (#x1A17 . #x1A18)
   (#x1B00 . #x1B03) (#x1B34 . #x1B34) (#x1B36 . #x1B3A) (#x1B3C . #x1B3C)
   (#x1B42 . #x1B42) (#x1B6B . #x1B73) (#x1DC0 . #x1DFF) (#x200B . #x200F)
   (#x202A . #x202E) (#x2060 . #x206F) (#x20D0 . #x20EF) (#x302A . #x302F)
   (#x3099 . #x309A) (#xA806 . #xA806) (#xA80B . #xA80B) (#xA825 . #xA826)
   (#xFB1E . #xFB1E) (#xFE00 . #xFE0F) (#xFE20 . #xFE23) (#xFEFF . #xFEFF)
   (#xFFF9 . #xFFFB) (#x10A01 . #x10A0F) (#x10A38 . #x10A3F)
   (#x1D167 . #x1D169) (#x1D173 . #x1D182) (#x1D185 . #x1D18B)
   (#x1D1AA . #x1D1AD) (#x1D242 . #x1D244) (#xE0001 . #xE01EF)))
980
981 ;; 2: East Asian Wide and Full-width characters.
;; Every (FROM . TO) range below gets display width 2.
(mapc
 (lambda (range)
   (set-char-table-range char-width-table range 2))
 '((#x1100 . #x115F)
   (#x2329 . #x232A)
   (#x2E80 . #x303E)
   (#x3040 . #xA4CF)
   (#xAC00 . #xD7A3)
   (#xF900 . #xFAFF)
   (#xFE30 . #xFE6F)
   (#xFF01 . #xFF60)
   (#xFFE0 . #xFFE6)
   (#x20000 . #x2FFFF)
   (#x30000 . #x3FFFF)))
995
;; Other double width characters, selected by charset rather than by
;; code point range.  The ethiopic and tibetan mappings below are
;; currently disabled and kept only as comments.
;;(map-charset-chars
;; (lambda (range ignore) (set-char-table-range char-width-table range 2))
;; 'ethiopic)
;; (map-charset-chars
;;  (lambda (range ignore) (set-char-table-range char-width-table range 2))
;;  'tibetan)
(dolist (two-column-charset '(indian-2-column arabic-2-column))
  (map-charset-chars
   (lambda (range _arg)
     ;; RANGE is whatever `map-charset-chars' passes as its first
     ;; argument; the second argument is not used here.
     (set-char-table-range char-width-table range 2))
   two-column-charset))
1009
;; Run `optimize-char-table' on the standard case, category and syntax
;; tables, in that order.
(mapc #'optimize-char-table
      (list (standard-case-table)
            (standard-category-table)
            (standard-syntax-table)))
1013
;; Fill `char-script-table' from a list of (FROM TO SCRIPT) entries and
;; store the distinct SCRIPT symbols, in order of first appearance, in
;; extra slot 0 of that table.
;; The Unicode blocks actually extend past some of these ranges with
;; undefined codepoints.
(let (seen-scripts)
  (dolist (entry '((#x0000 #x007F latin)
                   (#x00A0 #x036F latin)
                   (#x0370 #x03E1 greek)
                   (#x03E2 #x03EF coptic)
                   (#x03F0 #x03F3 greek)
                   (#x0400 #x04FF cyrillic)
                   (#x0530 #x058F armenian)
                   (#x0590 #x05FF hebrew)
                   (#x0600 #x06FF arabic)
                   (#x0700 #x074F syriac)
                   (#x07C0 #x07FA nko)
                   (#x0780 #x07BF thaana)
                   (#x0900 #x097F devanagari)
                   (#x0980 #x09FF bengali)
                   (#x0A00 #x0A7F gurmukhi)
                   (#x0A80 #x0AFF gujarati)
                   (#x0B00 #x0B7F oriya)
                   (#x0B80 #x0BFF tamil)
                   (#x0C00 #x0C7F telugu)
                   (#x0C80 #x0CFF kannada)
                   (#x0D00 #x0D7F malayalam)
                   (#x0D80 #x0DFF sinhala)
                   (#x0E00 #x0E5F thai)
                   (#x0E80 #x0EDF lao)
                   (#x0F00 #x0FFF tibetan)
                   (#x1000 #x105F myanmar)
                   (#x10A0 #x10FF georgian)
                   (#x1100 #x11FF hangul)
                   (#x1200 #x139F ethiopic)
                   (#x13A0 #x13FF cherokee)
                   (#x1400 #x167F canadian-aboriginal)
                   (#x1680 #x169F ogham)
                   (#x16A0 #x16FF runic)
                   (#x1780 #x17FF khmer)
                   (#x1800 #x18AF mongolian)
                   (#x1E00 #x1EFF latin)
                   (#x1F00 #x1FFF greek)
                   (#x2000 #x27FF symbol)
                   (#x2800 #x28FF braille)
                   (#x2D80 #x2DDF ethiopic)
                   (#x2E80 #x2FDF han)
                   (#x2FF0 #x2FFF ideographic-description)
                   (#x3000 #x303F cjk-misc)
                   (#x3040 #x30FF kana)
                   (#x3100 #x312F bopomofo)
                   (#x3130 #x318F hangul)
                   (#x3190 #x319F kanbun)
                   (#x31A0 #x31BF bopomofo)
                   (#x3400 #x9FAF han)
                   (#xA000 #xA4CF yi)
                   (#xAA80 #xAADF tai-viet)
                   (#xAC00 #xD7AF hangul)
                   (#xF900 #xFAFF han)
                   (#xFB1D #xFB4F hebrew)
                   (#xFB50 #xFDFF arabic)
                   (#xFE70 #xFEFC arabic)
                   (#xFF00 #xFF5F cjk-misc)
                   (#xFF61 #xFF9F kana)
                   (#xFFE0 #xFFE6 cjk-misc)
                   (#x1D000 #x1D0FF byzantine-musical-symbol)
                   (#x1D100 #x1D1FF musical-symbol)
                   (#x1D400 #x1D7FF mathematical)
                   (#x20000 #x2AFFF han)
                   (#x2F800 #x2FFFF han)))
    (let ((range (cons (car entry) (cadr entry)))
          (script (nth 2 entry)))
      (set-char-table-range char-script-table range script)
      ;; Collect each script once; the list is built in reverse and
      ;; flipped below so the first-seen script ends up first.
      (unless (memq script seen-scripts)
        (push script seen-scripts))))
  (set-char-table-extra-slot char-script-table 0 (nreverse seen-scripts)))
1088
;; Give every character range reported for the `tibetan' charset the
;; `tibetan' script as well.
(map-charset-chars
 (lambda (range _arg)
   (set-char-table-range char-script-table range 'tibetan))
 'tibetan)
1093
1094 \f
1095 ;;; Setting word boundary.
1096
(defun next-word-boundary-han (pos limit)
  "Return a word boundary position for the han word at POS.
If POS is not greater than LIMIT, look forward from POS: skip a run
of characters of category `C' (Chinese), then a directly following
run of characters of category `H' (hiragana), and return the
position reached.  LIMIT is only used to choose the direction in
this case.
Otherwise look backward: step POS back while it is greater than
LIMIT and the character before it has the script `han' in
`char-script-table', and return the resulting position."
  (if (<= pos limit)
      (save-excursion
        (goto-char pos)
        ;; Move only when the match succeeds.  After a failed
        ;; `looking-at', `match-end' would describe some earlier,
        ;; unrelated match (or be nil), so point must stay put.
        (if (looking-at "\\cC+")
            (goto-char (match-end 0)))
        (if (looking-at "\\cH+")
            (goto-char (match-end 0)))
        (point))
    (while (and (> pos limit)
                (eq (aref char-script-table (char-after (1- pos))) 'han))
      (setq pos (1- pos)))
    pos))
1110
(defun next-word-boundary-kana (pos limit)
  "Return a word boundary position for the kana word at POS.
If POS is not greater than LIMIT, look forward from POS, skipping in
turn an optional run of category `K' (katakana), an optional run of
category `H' (hiragana) and an optional run of category `k'
characters, and return the position reached.
Otherwise look backward from POS, never stepping while POS is not
greater than LIMIT inside the loops, and return the position found."
  (if (> pos limit)
      ;; Backward search.
      (let ((cats (char-category-set (char-after pos)))
            lead)
        (cond
         ;; POS is on a `K' or `k' character: walk back over the
         ;; preceding characters that carry either category.
         ((or (aref cats ?K) (aref cats ?k))
          (while (and (> pos limit)
                      (setq cats
                            (char-category-set (char-after (1- pos))))
                      (or (aref cats ?K) (aref cats ?k)))
            (setq pos (1- pos))))
         (t
          ;; Walk back over the preceding `H' characters.  CATS is left
          ;; holding the category set of the last character examined.
          (while (and (> pos limit)
                      (progn
                        (setq cats
                              (char-category-set (char-after (1- pos))))
                        (aref cats ?H)))
            (setq pos (1- pos)))
          ;; If that character is `C', `K' or `A' (tested in this
          ;; order), include it and the run of the same category
          ;; before it.
          (setq lead (cond ((aref cats ?C) ?C)
                           ((aref cats ?K) ?K)
                           ((aref cats ?A) ?A)))
          (if lead
              (progn
                (setq pos (1- pos))
                (while (and (> pos limit)
                            (aref (char-category-set (char-after (1- pos)))
                                  lead))
                  (setq pos (1- pos)))))))
        pos)
    ;; Forward search.
    (save-excursion
      (goto-char pos)
      (dolist (run-regexp '("\\cK+" "\\cH+" "\\ck+"))
        (if (looking-at run-regexp)
            (goto-char (match-end 0))))
      (point))))
1144
;; For every entry of `char-script-table' whose script is `han' or
;; `kana', register the matching boundary function for the same
;; characters in `find-word-boundary-function-table'.
(let ((boundary-functions '((han . next-word-boundary-han)
                            (kana . next-word-boundary-kana))))
  (map-char-table
   (lambda (char script)
     (let ((boundary-function (cdr (assq script boundary-functions))))
       (if boundary-function
           (set-char-table-range find-word-boundary-function-table
                                 char boundary-function))))
   char-script-table))
1154
;; Category pairs (FIRST . SECOND) for `word-combining-categories':
;; `l' with `l', and `C' with each of `C', `H' and `K'.
(setq word-combining-categories
      (cons (cons ?l ?l)
            (mapcar (lambda (second) (cons ?C second))
                    '(?C ?H ?K))))
1160
;; Category pairs (FIRST . SECOND) for `word-separating-categories'
;; (2-byte character sets), grouped by the first category.
(setq word-separating-categories
      '(;; Alpha numeric followed by Katakana, Chinese
        (?A . ?K) (?A . ?C)
        ;; Hiragana followed by Alpha numeric, Katakana, Chinese
        (?H . ?A) (?H . ?K) (?H . ?C)
        ;; Katakana followed by Alpha numeric, Chinese
        (?K . ?A) (?K . ?C)
        ;; Chinese followed by Alpha numeric, Katakana
        (?C . ?A) (?C . ?K)))
1172
1173 ;; Local Variables:
1174 ;; coding: utf-8
1175 ;; End:
1176
1177 ;; arch-tag: 85889c35-9f4d-4912-9bf5-82de31b0d42d
1178 ;;; characters.el ends here