]> code.delx.au - gnu-emacs/blob - lisp/international/characters.el
(recode-region): Deactivate mark at the end.
[gnu-emacs] / lisp / international / characters.el
1 ;;; characters.el --- set syntax and category for multibyte characters
2
3 ;; Copyright (C) 1997, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
4 ;; Free Software Foundation, Inc.
5 ;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
6 ;; 2005, 2006, 2007, 2008
7 ;; National Institute of Advanced Industrial Science and Technology (AIST)
8 ;; Registration Number H14PRO021
9 ;; Copyright (C) 2003
10 ;; National Institute of Advanced Industrial Science and Technology (AIST)
11 ;; Registration Number H13PRO009
12
13 ;; Keywords: multibyte character, character set, syntax, category
14
15 ;; This file is part of GNU Emacs.
16
17 ;; GNU Emacs is free software: you can redistribute it and/or modify
18 ;; it under the terms of the GNU General Public License as published by
19 ;; the Free Software Foundation, either version 3 of the License, or
20 ;; (at your option) any later version.
21
22 ;; GNU Emacs is distributed in the hope that it will be useful,
23 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
24 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25 ;; GNU General Public License for more details.
26
27 ;; You should have received a copy of the GNU General Public License
28 ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
29
30 ;;; Commentary:
31
32 ;;; Code:
33
34 ;;; Predefined categories.
35
36 ;; For each character set.
37
;; Register every predefined category.  Each entry of the table is
;; (CATEGORY . DOCSTRING); the entries are defined in the order listed.
(dolist (entry
         '(;; For each character set.
           (?a . "ASCII graphic characters 32-126 (ISO646 IRV:1983[4/0])")
           (?l . "Latin")
           (?t . "Thai")
           (?g . "Greek")
           (?b . "Arabic")
           (?w . "Hebrew")
           (?y . "Cyrillic")
           (?k . "Japanese katakana")
           (?r . "Japanese roman")
           (?c . "Chinese")
           (?j . "Japanese")
           (?h . "Korean")
           (?e . "Ethiopic (Ge'ez)")
           (?v . "Vietnamese")
           (?i . "Indian")
           (?o . "Lao")
           (?q . "Tibetan")

           ;; For each group (row) of 2-byte character sets.
           (?A . "Alpha-numeric characters of 2-byte character sets")
           (?C . "Chinese (Han) characters of 2-byte character sets")
           (?G . "Greek characters of 2-byte character sets")
           (?H . "Japanese Hiragana characters of 2-byte character sets")
           (?K . "Japanese Katakana characters of 2-byte character sets")
           (?N . "Korean Hangul characters of 2-byte character sets")
           (?Y . "Cyrillic characters of 2-byte character sets")
           (?I . "Indian Glyphs")

           ;; For phonetic classifications.
           (?0 . "consonant")
           (?1 . "base (independent) vowel")
           (?2 . "upper diacritical mark (including upper vowel)")
           (?3 . "lower diacritical mark (including lower vowel)")
           (?4 . "combining tone mark")
           (?5 . "symbol")
           (?6 . "digit")
           (?7 . "vowel-modifying diacritical mark")
           (?8 . "vowel-signs")
           (?9 . "semivowel lower")

           ;; For filling.
           (?| . "While filling, we can break a line at this character.")

           ;; For indentation calculation.
           (?\s . "This character counts as a space for indentation purposes.")

           ;; Keep the following for `kinsoku' processing.  See comments
           ;; in kinsoku.el.
           (?> . "A character which can't be placed at beginning of line.")
           (?< . "A character which can't be placed at end of line.")

           ;; Combining
           (?^ . "Combining diacritic or mark")))
  (define-category (car entry) (cdr entry)))
94 \f
95 ;;; Setting syntax and category.
96
97 ;; ASCII
98
;; Code points 32 through 127 are put in both category `a' (ASCII)
;; and category `l' (Latin).
(dolist (category '(?a ?l))
  (modify-category-entry '(32 . 127) category))
102
103 ;; Deal with the CJK charsets first. Since the syntax of blocks is
104 ;; defined per charset, and the charsets may contain e.g. Latin
105 ;; characters, we end up with the wrong syntax definitions if we're
106 ;; not careful.
107
;; Chinese characters (Unicode)
;; Each entry is (RANGE CATEGORY...): every listed category is added
;; to every character of RANGE.
(dolist (spec '(((#x2E80 . #x312F) ?|)
                ((#x3190 . #x33FF) ?|)
                ((#x3400 . #x9FAF) ?C ?c ?|)
                ((#xF900 . #xFAFF) ?C ?c ?|)
                ((#x20000 . #x2AFFF) ?|)
                ((#x2F800 . #x2FFFF) ?|)))
  (dolist (category (cdr spec))
    (modify-category-entry (car spec) category)))

119
120
;; Chinese character set (GB2312)

;; Three code ranges of the charset get symbol syntax.
(dolist (range '((#x2121 . #x217E)
                 (#x2221 . #x227E)
                 (#x2921 . #x297E)))
  (map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_"
                     (car range) (cdr range)))

;; The whole charset is category `c'; the sub-ranges listed below
;; additionally get the category that heads each entry.
(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?c)
(dolist (spec '((?A #x2330 #x2339)
                (?A #x2341 #x235A)
                (?A #x2361 #x237A)
                (?H #x2421 #x247E)
                (?K #x2521 #x257E)
                (?G #x2621 #x267E)
                (?Y #x2721 #x277E)
                (?C #x3021 #x7E7E)))
  ;; SPEC is (CATEGORY FROM TO), i.e. the trailing arguments.
  (apply #'map-charset-chars #'modify-category-entry 'chinese-gb2312 spec))
136
;; Chinese character set (BIG5)

;; Every character is category `c'; three code ranges are also `C'.
(map-charset-chars #'modify-category-entry 'big5 ?c)
(dolist (range '((#xA259 . #xA25F)
                 (#xA440 . #xC67E)
                 (#xC940 . #xF9DF)))
  (map-charset-chars #'modify-category-entry 'big5 ?C
                     (car range) (cdr range)))
143
;; Chinese character set (CNS11643)

;; Every charset in the list is category `c'.  Category `C' covers the
;; whole charset too, except for chinese-cns11643-1, where only the
;; code range #x4421..#x7E7E receives it.
(dolist (plane '(chinese-cns11643-1 chinese-cns11643-2 chinese-cns11643-3
                 chinese-cns11643-4 chinese-cns11643-5 chinese-cns11643-6
                 chinese-cns11643-7))
  (map-charset-chars #'modify-category-entry plane ?c)
  (cond ((eq plane 'chinese-cns11643-1)
         (map-charset-chars #'modify-category-entry plane ?C #x4421 #x7E7E))
        (t
         (map-charset-chars #'modify-category-entry plane ?C))))
153
;; Japanese character set (JISX0201, JISX0208, JISX0212, JISX0213)

;; Charset-wide categories: `k' for JISX0201 katakana, `r' for
;; JISX0201 roman.
(dolist (pair '((katakana-jisx0201 . ?k)
                (latin-jisx0201 . ?r)))
  (map-charset-chars #'modify-category-entry (car pair) (cdr pair)))

;; All of these charsets are category `j'.
(mapc (lambda (charset)
        (map-charset-chars #'modify-category-entry charset ?j))
      '(katakana-jisx0201 japanese-jisx0208 japanese-jisx0212
        japanese-jisx0213-1 japanese-jisx0213-2))

;; Unicode ranges, as (RANGE CATEGORY...).
(dolist (spec '(;; Unicode equivalents of JISX0201-kana
                ((#xff61 . #xff9f) ?k ?j ?|)
                ;; Katakana block
                ;; ?K is double width, ?k isn't specified
                ((#x30a0 . #x30ff) ?K ?|)
                ;; Hiragana block
                ;; ?H is actually defined to be double width, so it is
                ;; deliberately not applied to this range.
                ((#x3040 . #x309d) ?|)))
  (dolist (category (cdr spec))
    (modify-category-entry (car spec) category)))
182
;; JISX0208
;; Two code ranges of the charset get symbol syntax.
(map-charset-chars #'modify-syntax-entry 'japanese-jisx0208 "_" #x2121 #x227E)
(map-charset-chars #'modify-syntax-entry 'japanese-jisx0208 "_" #x2821 #x287E)
;; Then give word syntax to each of the characters listed here.  The
;; loop variable itself must be passed to `modify-syntax-entry';
;; passing the head of the list would only ever touch the first
;; character.
(dolist (ch '(?ー ?゛ ?゜ ?ヽ ?ヾ ?ゝ ?ゞ ?〃 ?仝 ?々 ?〆 ?〇))
  (modify-syntax-entry ch "w"))
189
;; Category for each code range of JISX0208, as (CATEGORY FROM TO).
(dolist (spec '((?A #x2321 #x237E)
                (?H #x2421 #x247E)
                (?K #x2521 #x257E)
                (?G #x2621 #x267E)
                (?Y #x2721 #x277E)
                (?C #x3021 #x7E7E)))
  (apply #'map-charset-chars #'modify-category-entry 'japanese-jisx0208 spec))
(modify-category-entry ?ー ?K)
;; These two marks are put in both `K' and `H'.
(dolist (mark '(?゛ ?゜))
  (modify-category-entry mark ?K)
  (modify-category-entry mark ?H))
;; These characters are put in category `C'.
(dolist (ch '(?ヽ ?ヾ ?ゝ ?ゞ ?〃 ?仝 ?々 ?〆 ?〇))
  (modify-category-entry ch ?C))
206
;; JISX0212

(map-charset-chars #'modify-syntax-entry 'japanese-jisx0212 "_" #x2121 #x237E)

;; JISX0201-Kana

;; These three characters get punctuation syntax.
(dolist (ch '(?。 ?、 ?・))
  (modify-syntax-entry ch "."))

;; Corner brackets: the opener uses the "(" class and names its
;; closer; the closer must use the ")" class and name its opener.
(modify-syntax-entry ?\「 "(」")
(modify-syntax-entry ?\」 ")「")
220
;; Korean character set (KSC5601)

;; The whole charset is category `h'.
(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?h)

;; Code ranges that get symbol syntax.
(dolist (range '((#x2121 . #x227E)
                 (#x2621 . #x277E)
                 (#x2830 . #x287E)
                 (#x2930 . #x297E)))
  (map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_"
                     (car range) (cdr range)))
;; Additional category per code range, as (CATEGORY FROM TO).
(dolist (spec '((?A #x2330 #x2339)
                (?A #x2341 #x235A)
                (?A #x2361 #x237A)
                (?G #x2521 #x257E)
                (?H #x2A21 #x2A7E)
                (?K #x2B21 #x2B7E)
                (?Y #x2C21 #x2C7E)))
  (apply #'map-charset-chars #'modify-category-entry 'korean-ksc5601 spec))
236
;; These are in more than one charset.
;; The string is read two characters at a time: an opener followed by
;; its closer.  Each gets paren syntax that names the other.
(let* ((parens (concat "〈〉《》「」『』【】〔〕〖〗〘〙〚〛"
                       "︵︶︷︸︹︺︻︼︽︾︿﹀﹁﹂﹃﹄"
                       "()[]{}"))
       (len (length parens))
       (idx 0))
  (while (< (1+ idx) len)
    (let ((opener (aref parens idx))
          (closer (aref parens (1+ idx))))
      (modify-syntax-entry opener (format "(%c" closer))
      (modify-syntax-entry closer (format ")%c" opener)))
    (setq idx (+ idx 2))))
247
;; Arabic character set

;; Category `b' goes to every character of these charsets...
(dolist (charset '(arabic-iso8859-6
                   arabic-digit
                   arabic-1-column
                   arabic-2-column))
  (map-charset-chars #'modify-category-entry charset ?b))
;; ...and to these code point ranges.
(dolist (range '((#x600 . #x6ff)
                 (#xfb50 . #xfdff)
                 (#xfe70 . #xfefe)))
  (modify-category-entry range ?b))
260
;; Cyrillic character set (ISO-8859-5)

(modify-syntax-entry ?№ ".")

;; Ethiopic character set

;; Category `e' for two code point ranges.
(dolist (range '((#x1200 . #x1399)
                 (#x2d80 . #x2dde)))
  (modify-category-entry range ?e))
;; These characters get punctuation syntax.
(dolist (ch '(?፡ ?። ?፣ ?፤ ?፥ ?፦ ?፧ ?፨))
  (modify-syntax-entry ch "."))
(map-charset-chars #'modify-category-entry 'ethiopic ?e)
274
;; Hebrew character set (ISO-8859-8)

;; Each of these code points gets punctuation syntax.
(dolist (code '(#x5be                   ; MAQAF
                #x5c0                   ; PASEQ
                #x5c3                   ; SOF PASUQ
                #x5f3                   ; GERESH
                #x5f4))                 ; GERSHAYIM
  (modify-syntax-entry code "."))
282
;; Indian character set (IS 13194 and other Emacs original Indian charsets)

;; Category `i' for the code point range, then for both charsets.
(modify-category-entry '(#x901 . #x970) ?i)
(dolist (charset '(indian-is13194 indian-2-column))
  (map-charset-chars #'modify-category-entry charset ?i))
288
;; Lao character set

(modify-category-entry '(#xe80 . #xeff) ?o)
(map-charset-chars #'modify-category-entry 'lao ?o)

;; Each table entry is (CHARS SYNTAX CATEGORY).  CHARS is a string of
;; single characters; a hyphen extends the run from just past the
;; preceding character up to the character that follows the hyphen.
;; Word ("w") syntax is never set explicitly; any other syntax is.
(let ((table '(("ກ-ຮ" "w" ?0)            ; consonant
               ("ະາຳຽເ-ໄ" "w" ?1)         ; vowel base
               ("ັິ-ືົໍ" "w" ?2)           ; vowel upper
               ("ຸູ" "w" ?3)              ; vowel lower
               ("່-໋" "w" ?4)             ; tone mark
               ("ຼຽ" "w" ?9)              ; semivowel lower
               ("໐-໙" "w" ?6)            ; digit
               ("ຯໆ" "_" ?5)              ; symbol
               ))
      range-end code)
  (dolist (spec table)
    (let* ((members (car spec))
           (syntax (nth 1 spec))
           (category (nth 2 spec))
           (word-p (string-equal syntax "w"))
           (count (length members))
           (pos 0))
      (while (< pos count)
        (cond ((= (aref members pos) ?-)
               ;; CODE already sits one past the previous character.
               (setq pos (1+ pos)
                     range-end (aref members pos)))
              (t
               (setq code (aref members pos)
                     range-end code)))
        (while (<= code range-end)
          (or word-p (modify-syntax-entry code syntax))
          (modify-category-entry code category)
          (setq code (1+ code)))
        (setq pos (1+ pos))))))
324
;; Thai character set (TIS620)

(modify-category-entry '(#xe00 . #xe7f) ?t)
(map-charset-chars #'modify-category-entry 'thai-tis620 ?t)

;; Each table entry is (CHARS SYNTAX CATEGORY).  CHARS is a string of
;; single characters; a hyphen extends the run from just past the
;; preceding character up to the character that follows the hyphen.
;; Word ("w") syntax is never set explicitly; any other syntax is.
(let ((table '(;; chars syntax category
               ("ก-รลว-ฮ" "w" ?0)         ; consonant
               ("ฤฦะาำเ-ๅ" "w" ?1)        ; vowel base
               ("ัิ-ื็๎" "w" ?2)           ; vowel upper
               ("ุ-ฺ" "w" ?3)             ; vowel lower
               ("่-ํ" "w" ?4)             ; tone mark
               ("๐-๙" "w" ?6)            ; digit
               ("ฯๆ฿๏๚๛" "_" ?5)          ; symbol
               ))
      range-end code)
  (dolist (spec table)
    (let* ((members (car spec))
           (syntax (nth 1 spec))
           (category (nth 2 spec))
           (word-p (string-equal syntax "w"))
           (count (length members))
           (pos 0))
      (while (< pos count)
        (cond ((= (aref members pos) ?-)
               ;; CODE already sits one past the previous character.
               (setq pos (1+ pos)
                     range-end (aref members pos)))
              (t
               (setq code (aref members pos)
                     range-end code)))
        (while (<= code range-end)
          (or word-p (modify-syntax-entry code syntax))
          (modify-category-entry code category)
          (setq code (1+ code)))
        (setq pos (1+ pos))))))
360
;; Tibetan character set

(modify-category-entry '(#xf00 . #xfff) ?q)
(map-charset-chars #'modify-category-entry 'tibetan ?q)
(map-charset-chars #'modify-category-entry 'tibetan-1-column ?q)

;; Each table entry is (CHARS SYNTAX CATEGORY).  CHARS is a string of
;; single characters; a hyphen extends the run from just past the
;; preceding character up to the character that follows the hyphen.
;; Word ("w") syntax is never set explicitly; any other syntax is.
(let ((table '(;; chars syntax category
               ("ཀ-ཀྵཪ" "w" ?0)           ; consonant
               ("ྐ-ྐྵྺྻྼ" "w" ?0)          ;
               ("ིེཻོཽྀ" "w" ?2)            ; upper vowel
               ("ཾྂྃ྆྇ྈྉྊྋ" "w" ?2)         ; upper modifier
               ("྄ཱུ༙༵༷" "w" ?3)           ; lower vowel/modifier
               ("཰" "w" ?3)               ; invisible vowel a
               ("༠-༩༪-༳" "w" ?6)          ; digit
               ("་།-༒༔ཿ" "." ?|)          ; line-break char
               ("་།༏༐༑༔ཿ" "." ?|)         ;
               ("༈་།-༒༔ཿ༽༴" "." ?>)       ; prohibition
               ("་།༏༐༑༔ཿ" "." ?>)         ;
               ("ༀ-༊༼࿁࿂྅" "." ?<)        ; prohibition
               ("༓༕-༘༚-༟༶༸-༻༾༿྾྿-࿏" "." ?q) ; others
               ))
      range-end code)
  (dolist (spec table)
    (let* ((members (car spec))
           (syntax (nth 1 spec))
           (category (nth 2 spec))
           (word-p (string-equal syntax "w"))
           (count (length members))
           (pos 0))
      (while (< pos count)
        (cond ((= (aref members pos) ?-)
               ;; CODE already sits one past the previous character.
               (setq pos (1+ pos)
                     range-end (aref members pos)))
              (t
               (setq code (aref members pos)
                     range-end code)))
        (while (<= code range-end)
          (or word-p (modify-syntax-entry code syntax))
          (modify-category-entry code category)
          (setq code (1+ code)))
        (setq pos (1+ pos))))))
403
;; Vietnamese character set

;; To make a word with Latin characters, both VISCII charsets are
;; put in category `l' as well as category `v'.
(dolist (charset '(vietnamese-viscii-lower vietnamese-viscii-upper))
  (dolist (category '(?l ?v))
    (map-charset-chars #'modify-category-entry charset category)))

;; For code points 32..127, pair the upper-charset character with the
;; lower-charset character in the standard case table.  Whenever
;; `encode-char' yields a `ucs' code for either, that code is also put
;; in category `v'.
(let ((case-table (standard-case-table)))
  (dotimes (offset 96)
    (let* ((code (+ offset 32))
           (upper (decode-char 'vietnamese-viscii-upper code))
           (lower (decode-char 'vietnamese-viscii-lower code))
           (upper-ucs (encode-char upper 'ucs))
           (lower-ucs (encode-char lower 'ucs)))
      (set-case-syntax-pair upper lower case-table)
      (when upper-ucs (modify-category-entry upper-ucs ?v))
      (when lower-ucs (modify-category-entry lower-ucs ?v)))))
425
;; Tai Viet
;; Each entry is (CHARS SYNTAX CATEGORY).  CHARS is either a cons
;; (FROM . TO), applied as a range in one call, or a string whose
;; characters are handled one at a time.
(dolist (spec '(;; chars syntax category
                ((?ꪀ . ?ꪯ) "w" ?0)       ; consonant
                ("ꪱꪵꪶ" "w" ?1)           ; vowel base
                ((?ꪹ . ?ꪽ) "w" ?1)       ; vowel base
                ("ꪰꪲꪳꪷꪸꪾ" "w" ?2)        ; vowel upper
                ("ꪴ" "w" ?3)             ; vowel lower
                ("ꫀꫂ" "w" ?1)            ; non-combining tone-mark
                ("꪿꫁" "w" ?4)            ; combining tone-mark
                ((?ꫛ . ?꫟) "_" ?5)       ; symbol
                ))
  (let ((members (car spec))
        (syntax (nth 1 spec))
        (category (nth 2 spec)))
    (if (not (consp members))
        (dotimes (k (length members))
          (let ((ch (aref members k)))
            (modify-syntax-entry ch syntax)
            (modify-category-entry ch category)))
      (modify-syntax-entry members syntax)
      (modify-category-entry members category))))
449
;; Latin

;; Code points #x80 through #x024F are all put in category `l'.
(let ((latin-range '(#x80 . #x024F)))
  (modify-category-entry latin-range ?l))
453
454 (let ((tbl (standard-case-table)) c)
455
;; Latin-1

;; Fixme: Some of the non-word syntaxes here perhaps should be
;; reviewed.  (Note that the following all implicitly have word
;; syntax: ¢£¤¥¨ª¯²³´¶¸¹º.)  There should be a well-defined way of
;; relating Unicode categories to Emacs syntax codes.

;; Characters below U+00C0 with explicit syntax, as (CHAR . SYNTAX).
;; NBSP isn't semantically interchangeable with other whitespace
;; chars, so it's more like punctuation.  NBSP and the soft hyphen
;; are written as code points because the literal characters are
;; invisible in the source.
(dolist (entry '((#xA0 . ".")           ; NO-BREAK SPACE
                 (?¡ . ".")
                 (?¦ . "_")
                 (?§ . ".")
                 (?© . "_")))
  (set-case-syntax (car entry) (cdr entry) tbl))
(set-case-syntax-delims 171 187 tbl)    ; « »
(dolist (entry '((?¬ . "_")
                 (#xAD . "_")           ; SOFT HYPHEN
                 (?® . "_")
                 (?° . "_")
                 (?± . "_")
                 (?µ . "_")
                 (?· . "_")
                 (?¼ . "_")
                 (?½ . "_")
                 (?¾ . "_")
                 (?¿ . ".")))
  (set-case-syntax (car entry) (cdr entry) tbl))
;; Codes 192..222 are each paired with the code 32 above them.  The
;; run includes × (215) and ÷ (247), which are given symbol syntax
;; again right after the loop.
(dotimes (k 31)
  (set-case-syntax-pair (+ 192 k) (+ 224 k) tbl))
(dolist (entry '((?× . "_")
                 (?ß . "w")
                 (?÷ . "_")))
  (set-case-syntax (car entry) (cdr entry) tbl))
;; See below for ÿ.
490
;; Latin Extended-A, Latin Extended-B
;; Put every code point from #x0100 through #x02B8 in category `l'.
(dotimes (k (1+ (- #x02B8 #x0100)))
  (modify-category-entry (+ #x0100 k) ?l))
496
;; Ranges in which upper- and lower-case letters alternate, starting
;; with an upper-case letter: FROM pairs with FROM+1, FROM+2 with
;; FROM+3, and so on up to TO.
;;
;; U+0187..U+018C must not be one range: U+0189 and U+018A are both
;; upper-case letters, so a single range would pair them with each
;; other.  Only U+0187/U+0188 and U+018B/U+018C are adjacent pairs.
(let ((pair-ranges '((#x0100 . #x012F)
                     (#x0132 . #x0137)
                     (#x0139 . #x0148)
                     (#x014a . #x0177)
                     (#x0179 . #x017E)
                     (#x0182 . #x0185)
                     (#x0187 . #x0188)
                     (#x018B . #x018C)
                     (#x0191 . #x0192)
                     (#x0198 . #x0199)
                     (#x01A0 . #x01A5)
                     (#x01A7 . #x01A8)
                     (#x01AC . #x01AD)
                     (#x01AF . #x01B0)
                     (#x01B3 . #x01B6)
                     (#x01BC . #x01BD)
                     (#x01CD . #x01DC)
                     (#x01DE . #x01EF)
                     (#x01F4 . #x01F5)
                     (#x01F8 . #x021F)
                     (#x0222 . #x0233)
                     (#x023B . #x023C)
                     (#x0241 . #x0242)
                     (#x0246 . #x024F))))
  (dolist (elt pair-ranges)
    (let ((from (car elt)) (to (cdr elt)))
      ;; Step by two: FROM is always the upper-case half of a pair.
      (while (< from to)
        (set-case-syntax-pair from (1+ from) tbl)
        (setq from (+ from 2))))))
525
;; In some languages, such as Turkish, U+0049 LATIN CAPITAL LETTER I
;; and U+0131 LATIN SMALL LETTER DOTLESS I make a case pair, and so
;; do U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE and U+0069 LATIN
;; SMALL LETTER I.

;; We used to set up half of those correspondences unconditionally,
;; but that makes searches slow.  So now we don't set up either half
;; of these correspondences by default.

;; (set-downcase-syntax ?İ ?i tbl)
;; (set-upcase-syntax ?I ?ı tbl)

;; Explicit case pairs, as (UPPER . LOWER), applied in list order.
(dolist (pair '((?DŽ . ?dž)
                (?Dž . ?dž)
                (?LJ . ?lj)
                (?Lj . ?lj)
                (?NJ . ?nj)
                (?Nj . ?nj)

                ;; 01F0; F; 006A 030C; # LATIN SMALL LETTER J WITH CARON
                (?DZ . ?dz)
                (?Dz . ?dz)
                (?Ƕ . ?ƕ)
                (?Ƿ . ?ƿ)))
  (set-case-syntax-pair (car pair) (cdr pair) tbl))
550
;; Latin Extended Additional
(modify-category-entry '(#x1e00 . #x1ef9) ?l)
;; Visit the even code points only; each pairs with the next code
;; point, except in the gap strictly between #x1e94 and #x1ea0.
(let ((upper #x1e00))
  (while (<= upper #x1ef9)
    (when (or (<= upper #x1e94) (>= upper #x1ea0))
      (set-case-syntax-pair upper (1+ upper) tbl))
    (setq upper (+ upper 2))))
559
;; Greek
(modify-category-entry '(#x0370 . #x03ff) ?g)
;; In these two runs each code pairs with the code 32 above it.
(dolist (range '((#x0391 . #x03a1)
                 (#x03a3 . #x03ab)))
  (let ((upper (car range)))
    (while (<= upper (cdr range))
      (set-case-syntax-pair upper (+ upper 32) tbl)
      (setq upper (1+ upper)))))
;; Even code points from #x03da through #x03ee pair with the next one.
(let ((upper #x03da))
  (while (<= upper #x03ee)
    (set-case-syntax-pair upper (1+ upper) tbl)
    (setq upper (+ upper 2))))
;; Explicit pairs, as (UPPER . LOWER).
(dolist (pair '((?Ά . ?ά)
                (?Έ . ?έ)
                (?Ή . ?ή)
                (?Ί . ?ί)
                (?Ό . ?ό)
                (?Ύ . ?ύ)
                (?Ώ . ?ώ)))
  (set-case-syntax-pair (car pair) (cdr pair) tbl))
579
;; Armenian
;; Each code from #x531 through #x556 pairs with the code #x30 above.
(dotimes (k (1+ (- #x556 #x531)))
  (set-case-syntax-pair (+ #x531 k) (+ #x531 k #x30) tbl))
585
;; Greek Extended
(modify-category-entry '(#x1f00 . #x1fff) ?g)
;; Up to #x1fa7, a code whose low nibble is 0..7 is the lower-case
;; partner of the code 8 above it.  Three groups are exempt:
;;  - #x1f70..#x1f77, whose "+8" neighbours are themselves lower-case
;;    letters.  The test masks the second nibble and must therefore
;;    compare against #x70; comparing against 7 can never match.
;;  - #x1f50 #x1f52 #x1f54 #x1f56, whose "+8" neighbours are unassigned.
;;  - #x1f16 #x1f17 #x1f46 #x1f47, which are unassigned themselves.
(setq c #x1f00)
(while (<= c #x1fff)
  (and (<= (logand c #x000f) 7)
       (<= c #x1fa7)
       (not (memq c '(#x1f16 #x1f17 #x1f46 #x1f47
                      #x1f50 #x1f52 #x1f54 #x1f56)))
       (/= (logand c #x00f0) #x70)
       (set-case-syntax-pair (+ c 8) c tbl))
  (setq c (1+ c)))
;; Explicit Greek case pairs, as (UPPER . LOWER), applied in list order.
(dolist (pair '((?Ᾰ . ?ᾰ)
                (?Ᾱ . ?ᾱ)
                (?Ὰ . ?ὰ)
                (?Ά . ?ά)
                (?ᾼ . ?ᾳ)
                (?Ὲ . ?ὲ)
                (?Έ . ?έ)
                (?Ὴ . ?ὴ)
                (?Ή . ?ή)
                (?ῌ . ?ῃ)
                (?Ῐ . ?ῐ)
                (?Ῑ . ?ῑ)
                (?Ὶ . ?ὶ)
                (?Ί . ?ί)
                (?Ῠ . ?ῠ)
                (?Ῡ . ?ῡ)
                (?Ὺ . ?ὺ)
                (?Ύ . ?ύ)
                (?Ῥ . ?ῥ)
                (?Ὸ . ?ὸ)
                (?Ό . ?ό)
                (?Ὼ . ?ὼ)
                (?Ώ . ?ώ)
                (?ῼ . ?ῳ)))
  (set-case-syntax-pair (car pair) (cdr pair) tbl))
620
;; Cyrillic
(modify-category-entry '(#x0400 . #x04FF) ?y)
;; Runs in which every code pairs with a code a fixed distance above
;; it, as (FROM TO OFFSET).
(dolist (run '((#x0400 #x040f 80)
               (#x0410 #x042f 32)))
  (let ((upper (car run)))
    (while (<= upper (nth 1 run))
      (set-case-syntax-pair upper (+ upper (nth 2 run)) tbl)
      (setq upper (1+ upper)))))
;; Runs in which each even code pairs with the code right after it.
(dolist (run '((#x0460 . #x0480)
               (#x048c . #x04be)
               (#x04d0 . #x04f4)))
  (let ((upper (car run)))
    (while (<= upper (cdr run))
      (set-case-syntax-pair upper (1+ upper) tbl)
      (setq upper (+ upper 2)))))
;; Explicit pairs, as (UPPER . LOWER).
(dolist (pair '((?Ӂ . ?ӂ)
                (?Ӄ . ?ӄ)
                (?Ӈ . ?ӈ)
                (?Ӌ . ?ӌ)
                (?Ӹ . ?ӹ)))
  (set-case-syntax-pair (car pair) (cdr pair) tbl))
642
;; General punctuation
;; Consecutive runs from #x2000 through #x206F, as (FROM TO SYNTAX).
(dolist (run '((#x2000 #x200b " ")
               (#x200c #x200F ".")
               ;; Fixme: These aren't all right:
               (#x2010 #x2016 "_")
               ;; Punctuation syntax for quotation marks (like `)
               (#x2017 #x201f ".")
               ;; Fixme: These aren't all right:
               (#x2020 #x2027 "_")
               (#x2028 #x206F ".")))
  (let ((code (car run)))
    (while (<= code (nth 1 run))
      (set-case-syntax code (nth 2 run) tbl)
      (setq code (1+ code)))))
667
;; Roman numerals
;; Each code from #x2160 through #x216f pairs with the code #x10 above.
(dotimes (k 16)
  (set-case-syntax-pair (+ #x2160 k) (+ #x2170 k) tbl))
673
;; Fixme: The following blocks might be better as symbol rather than
;; punctuation.
;; Consecutive runs from #x2190 through #x243F, as (FROM TO SYNTAX).
(dolist (run '((#x2190 #x21FF ".")      ; Arrows
               (#x2200 #x22FF ".")      ; Mathematical Operators
               (#x2300 #x23FF ".")      ; Miscellaneous Technical
               (#x2400 #x243F "_")))    ; Control Pictures
  (let ((code (car run)))
    (while (<= code (nth 1 run))
      (set-case-syntax code (nth 2 run) tbl)
      (setq code (1+ code)))))
693
;; Circled Latin and Fullwidth Latin
;; Each run is (FROM TO OFFSET): every code in FROM..TO pairs with the
;; code OFFSET above it, and both halves are put in category `l'.
(dolist (run '((#x24b6 #x24cf 26)       ; Circled Latin
               (#xff21 #xff3a #x20)))   ; Fullwidth Latin
  (let ((upper (car run))
        (offset (nth 2 run)))
    (while (<= upper (nth 1 run))
      (set-case-syntax-pair upper (+ upper offset) tbl)
      (modify-category-entry upper ?l)
      (modify-category-entry (+ upper offset) ?l)
      (setq upper (1+ upper)))))
709
;; Both ranges are put in category `^'.
(dolist (range '((#x300 . #x362)        ; Combining diacritics
                 (#x20d0 . #x20e3)))    ; Combining marks
  (modify-category-entry range ?^))
714
715 ;; Fixme: syntax for symbols &c
716 )
717
;; Every string holds an opening character followed by its closing
;; character.  The opener gets open-paren syntax naming the closer,
;; and the closer gets close-paren syntax naming the opener.
(mapc (lambda (pair)
        (let ((opener (aref pair 0))
              (closer (aref pair 1)))
          (modify-syntax-entry opener (string ?\( closer))
          (modify-syntax-entry closer (string ?\) opener))))
      '("⁅⁆"                            ; U+2045 U+2046
        "⁽⁾"                            ; U+207D U+207E
        "₍₎"                            ; U+208D U+208E
        "〈〉"                            ; U+2329 U+232A
        "⎴⎵"                            ; U+23B4 U+23B5
        "❨❩"                            ; U+2768 U+2769
        "❪❫"                            ; U+276A U+276B
        "❬❭"                            ; U+276C U+276D
        "❰❱"                            ; U+2770 U+2771
        "❲❳"                            ; U+2772 U+2773
        "❴❵"                            ; U+2774 U+2775
        "⟦⟧"                            ; U+27E6 U+27E7
        "⟨⟩"                            ; U+27E8 U+27E9
        "⟪⟫"                            ; U+27EA U+27EB
        "⦃⦄"                            ; U+2983 U+2984
        "⦅⦆"                            ; U+2985 U+2986
        "⦇⦈"                            ; U+2987 U+2988
        "⦉⦊"                            ; U+2989 U+298A
        "⦋⦌"                            ; U+298B U+298C
        "⦍⦎"                            ; U+298D U+298E
        "⦏⦐"                            ; U+298F U+2990
        "⦑⦒"                            ; U+2991 U+2992
        "⦓⦔"                            ; U+2993 U+2994
        "⦕⦖"                            ; U+2995 U+2996
        "⦗⦘"                            ; U+2997 U+2998
        "⧼⧽"                            ; U+29FC U+29FD
        "〈〉"                            ; U+3008 U+3009
        "《》"                            ; U+300A U+300B
        "「」"                            ; U+300C U+300D
        "『』"                            ; U+300E U+300F
        "【】"                            ; U+3010 U+3011
        "〔〕"                            ; U+3014 U+3015
        "〖〗"                            ; U+3016 U+3017
        "〘〙"                            ; U+3018 U+3019
        "〚〛"                            ; U+301A U+301B
        "﴾﴿"                            ; U+FD3E U+FD3F
        "︵︶"                            ; U+FE35 U+FE36
        "︷︸"                            ; U+FE37 U+FE38
        "︹︺"                            ; U+FE39 U+FE3A
        "︻︼"                            ; U+FE3B U+FE3C
        "︽︾"                            ; U+FE3D U+FE3E
        "︿﹀"                            ; U+FE3F U+FE40
        "﹁﹂"                            ; U+FE41 U+FE42
        "﹃﹄"                            ; U+FE43 U+FE44
        "﹙﹚"                            ; U+FE59 U+FE5A
        "﹛﹜"                            ; U+FE5B U+FE5C
        "﹝﹞"                            ; U+FE5D U+FE5E
        "()"                            ; U+FF08 U+FF09
        "[]"                            ; U+FF3B U+FF3D
        "{}"                            ; U+FF5B U+FF5D
        "⦅⦆"                            ; U+FF5F U+FF60
        "「」"                            ; U+FF62 U+FF63
        ))
775
776 \f
;; For each character set, record the coding system best suited to
;; encoding it in the charset's `preferred-coding-system' property.
;; Entries are (CHARSET . CODING-SYSTEM).

;; Fixme: should this be junked?
(dolist (entry '((latin-iso8859-1 . iso-latin-1)
                 (latin-iso8859-2 . iso-latin-2)
                 (latin-iso8859-3 . iso-latin-3)
                 (latin-iso8859-4 . iso-latin-4)
                 (thai-tis620 . thai-tis620)
                 (greek-iso8859-7 . greek-iso-8bit)
                 (arabic-iso8859-6 . iso-2022-7bit)
                 (hebrew-iso8859-8 . hebrew-iso-8bit)
                 (katakana-jisx0201 . japanese-shift-jis)
                 (latin-jisx0201 . japanese-shift-jis)
                 (cyrillic-iso8859-5 . cyrillic-iso-8bit)
                 (latin-iso8859-9 . iso-latin-5)
                 (japanese-jisx0208-1978 . iso-2022-jp)
                 (chinese-gb2312 . chinese-iso-8bit)
                 (chinese-gbk . chinese-gbk)
                 (gb18030-2-byte . chinese-gb18030)
                 (gb18030-4-byte-bmp . chinese-gb18030)
                 (gb18030-4-byte-smp . chinese-gb18030)
                 (gb18030-4-byte-ext-1 . chinese-gb18030)
                 (gb18030-4-byte-ext-2 . chinese-gb18030)
                 (japanese-jisx0208 . iso-2022-jp)
                 (korean-ksc5601 . iso-2022-kr)
                 (japanese-jisx0212 . iso-2022-jp)
                 (chinese-big5-1 . chinese-big5)
                 (chinese-big5-2 . chinese-big5)
                 (chinese-sisheng . iso-2022-7bit)
                 (ipa . iso-2022-7bit)
                 (vietnamese-viscii-lower . vietnamese-viscii)
                 (vietnamese-viscii-upper . vietnamese-viscii)
                 (arabic-digit . iso-2022-7bit)
                 (arabic-1-column . iso-2022-7bit)
                 (lao . lao)
                 (arabic-2-column . iso-2022-7bit)
                 (indian-is13194 . devanagari)
                 (indian-glyph . devanagari)
                 (tibetan-1-column . tibetan)
                 (ethiopic . iso-2022-7bit)
                 (chinese-cns11643-1 . iso-2022-cn)
                 (chinese-cns11643-2 . iso-2022-cn)
                 (chinese-cns11643-3 . iso-2022-cn)
                 (chinese-cns11643-4 . iso-2022-cn)
                 (chinese-cns11643-5 . iso-2022-cn)
                 (chinese-cns11643-6 . iso-2022-cn)
                 (chinese-cns11643-7 . iso-2022-cn)
                 (indian-2-column . devanagari)
                 (tibetan . tibetan)
                 (latin-iso8859-14 . iso-latin-8)
                 (latin-iso8859-15 . iso-latin-9)))
  (put-charset-property (car entry) 'preferred-coding-system (cdr entry)))
832
833 \f
;; Set up `auto-fill-chars' for the ranges that should invoke
;; auto-filling.  SPACE and NEWLINE are already set.

(dolist (range '((#x3041 . #x30FF)
                 (#x3400 . #x4DB5)
                 (#x4e00 . #x9fbb)
                 (#xF900 . #xFAFF)
                 (#xFF00 . #xFF9F)
                 (#x20000 . #x2FFFF)))
  (set-char-table-range auto-fill-chars range t))
843
844 \f
;;; Setting char-width-table.  The default is 1.

;; 0: non-spacing, enclosing combining, formatting, Hangul Jamo medial
;; and final characters.  Each listed range is set to width 0.
(dolist (range '((#x0300 . #x036F)
                 (#x0483 . #x0489)
                 (#x0591 . #x05BD)
                 (#x05BF . #x05BF)
                 (#x05C1 . #x05C2)
                 (#x05C4 . #x05C5)
                 (#x05C7 . #x05C7)
                 (#x0600 . #x0603)
                 (#x0610 . #x0615)
                 (#x064B . #x065E)
                 (#x0670 . #x0670)
                 (#x06D6 . #x06E4)
                 (#x06E7 . #x06E8)
                 (#x06EA . #x06ED)
                 (#x070F . #x070F)
                 (#x0711 . #x0711)
                 (#x0730 . #x074A)
                 (#x07A6 . #x07B0)
                 (#x07EB . #x07F3)
                 (#x0901 . #x0902)
                 (#x093C . #x093C)
                 (#x0941 . #x0948)
                 (#x094D . #x094D)
                 (#x0951 . #x0954)
                 (#x0962 . #x0963)
                 (#x0981 . #x0981)
                 (#x09BC . #x09BC)
                 (#x09C1 . #x09C4)
                 (#x09CD . #x09CD)
                 (#x09E2 . #x09E3)
                 (#x0A01 . #x0A02)
                 (#x0A3C . #x0A3C)
                 (#x0A41 . #x0A4D)
                 (#x0A70 . #x0A71)
                 (#x0A81 . #x0A82)
                 (#x0ABC . #x0ABC)
                 (#x0AC1 . #x0AC8)
                 (#x0ACD . #x0ACD)
                 (#x0AE2 . #x0AE3)
                 (#x0B01 . #x0B01)
                 (#x0B3C . #x0B3C)
                 (#x0B3F . #x0B3F)
                 (#x0B41 . #x0B43)
                 (#x0B4D . #x0B56)
                 (#x0B82 . #x0B82)
                 (#x0BC0 . #x0BC0)
                 (#x0BCD . #x0BCD)
                 (#x0C3E . #x0C40)
                 (#x0C46 . #x0C56)
                 (#x0CBC . #x0CBC)
                 (#x0CBF . #x0CBF)
                 (#x0CC6 . #x0CC6)
                 (#x0CCC . #x0CCD)
                 (#x0CE2 . #x0CE3)
                 (#x0D41 . #x0D43)
                 (#x0D4D . #x0D4D)
                 (#x0DCA . #x0DCA)
                 (#x0DD2 . #x0DD6)
                 (#x0E31 . #x0E31)
                 (#x0E34 . #x0E3A)
                 (#x0E47 . #x0E4E)
                 (#x0EB1 . #x0EB1)
                 (#x0EB4 . #x0EBC)
                 (#x0EC8 . #x0ECD)
                 (#x0F18 . #x0F19)
                 (#x0F35 . #x0F35)
                 (#x0F37 . #x0F37)
                 (#x0F39 . #x0F39)
                 (#x0F71 . #x0F7E)
                 (#x0F80 . #x0F84)
                 (#x0F86 . #x0F87)
                 (#x0F90 . #x0FBC)
                 (#x0FC6 . #x0FC6)
                 (#x102D . #x1030)
                 (#x1032 . #x1037)
                 (#x1039 . #x1039)
                 (#x1058 . #x1059)
                 (#x1160 . #x11FF)
                 (#x135F . #x135F)
                 (#x1712 . #x1714)
                 (#x1732 . #x1734)
                 (#x1752 . #x1753)
                 (#x1772 . #x1773)
                 (#x17B4 . #x17B5)
                 (#x17B7 . #x17BD)
                 (#x17C6 . #x17C6)
                 (#x17C9 . #x17D3)
                 (#x17DD . #x17DD)
                 (#x180B . #x180D)
                 (#x18A9 . #x18A9)
                 (#x1920 . #x1922)
                 (#x1927 . #x1928)
                 (#x1932 . #x1932)
                 (#x1939 . #x193B)
                 (#x1A17 . #x1A18)
                 (#x1B00 . #x1B03)
                 (#x1B34 . #x1B34)
                 (#x1B36 . #x1B3A)
                 (#x1B3C . #x1B3C)
                 (#x1B42 . #x1B42)
                 (#x1B6B . #x1B73)
                 (#x1DC0 . #x1DFF)
                 (#x200B . #x200F)
                 (#x202A . #x202E)
                 (#x2060 . #x206F)
                 (#x20D0 . #x20EF)
                 (#x302A . #x302F)
                 (#x3099 . #x309A)
                 (#xA806 . #xA806)
                 (#xA80B . #xA80B)
                 (#xA825 . #xA826)
                 (#xFB1E . #xFB1E)
                 (#xFE00 . #xFE0F)
                 (#xFE20 . #xFE23)
                 (#xFEFF . #xFEFF)
                 (#xFFF9 . #xFFFB)
                 (#x10A01 . #x10A0F)
                 (#x10A38 . #x10A3F)
                 (#x1D167 . #x1D169)
                 (#x1D173 . #x1D182)
                 (#x1D185 . #x1D18B)
                 (#x1D1AA . #x1D1AD)
                 (#x1D242 . #x1D244)
                 (#xE0001 . #xE01EF)))
  (set-char-table-range char-width-table range 0))
975
;; 2: East Asian Wide and Full-width characters.
;; Each listed range is set to width 2.
(dolist (range '((#x1100 . #x115F)
                 (#x2329 . #x232A)
                 (#x2E80 . #x303E)
                 (#x3040 . #xA4CF)
                 (#xAC00 . #xD7A3)
                 (#xF900 . #xFAFF)
                 (#xFE30 . #xFE6F)
                 (#xFF01 . #xFF60)
                 (#xFFE0 . #xFFE6)
                 (#x20000 . #x2FFFF)
                 (#x30000 . #x3FFFF)))
  (set-char-table-range char-width-table range 2))
990
;; Other double width
;; The equivalent calls for the `ethiopic' and `tibetan' charsets are
;; disabled:
;;(map-charset-chars
;; (lambda (range ignore) (set-char-table-range char-width-table range 2))
;; 'ethiopic)
;; (map-charset-chars
;; (lambda (range ignore) (set-char-table-range char-width-table range 2))
;; 'tibetan)
;; Every character range of the charsets below gets the value 2 in
;; `char-width-table'.
(dolist (two-column-charset '(indian-2-column arabic-2-column))
  (map-charset-chars
   (lambda (range _arg)
     (set-char-table-range char-width-table range 2))
   two-column-charset))
1004
(defvar cjk-char-width-table
  (let ((cjk-widths (make-char-table nil)))
    ;; Give every character of the listed CJK charsets the value 2.
    (mapc (lambda (charset)
            (map-charset-chars
             (lambda (range _arg)
               (set-char-table-range cjk-widths range 2))
             charset))
          '(big5 chinese-gb2312 chinese-cns11643-1
            japanese-jisx0208 korean-ksc5601))
    (optimize-char-table cjk-widths)
    ;; Characters not set above fall back to `char-width-table'.
    (set-char-table-parent cjk-widths char-width-table)
    cjk-widths)
  "Character width table used in CJK language environment.
Characters of the charsets big5, chinese-gb2312, chinese-cns11643-1,
japanese-jisx0208 and korean-ksc5601 map to 2; the parent of this
table is the value `char-width-table' had when this variable was
defined.")
1016
(defun use-cjk-char-width-table ()
  "Internal use only.
Make `cjk-char-width-table' the current `char-width-table', as
appropriate for a CJK language environment.
Return the newly installed table."
  (setq char-width-table cjk-char-width-table))
1021
(defun use-default-char-width-table ()
  "Internal use only.
Make the parent of `cjk-char-width-table' the current
`char-width-table', as appropriate for a non-CJK language
environment.  Return the newly installed table."
  (let ((default-widths (char-table-parent cjk-char-width-table)))
    (setq char-width-table default-widths)))
1026
;; Optimize the standard case, category and syntax tables, in that order.
(dolist (standard-table (list (standard-case-table)
                              (standard-category-table)
                              (standard-syntax-table)))
  (optimize-char-table standard-table))
1030
1031 \f
1032 ;; Setting char-script-table.
1033
1034 ;; The Unicode blocks actually extend past some of these ranges with
1035 ;; undefined codepoints.
;; Each entry is (FROM TO SCRIPT): the code points FROM..TO are tagged
;; with SCRIPT in `char-script-table'.  The distinct script symbols, in
;; order of first appearance, are stored in extra slot 0 of the table.
(let (scripts-seen)
  (dolist (entry
           '((#x0000 #x007F latin)
             (#x00A0 #x024F latin)
             (#x0250 #x02AF phonetic)
             (#x02B0 #x036F latin)
             (#x0370 #x03E1 greek)
             (#x03E2 #x03EF coptic)
             (#x03F0 #x03F3 greek)
             (#x0400 #x04FF cyrillic)
             (#x0530 #x058F armenian)
             (#x0590 #x05FF hebrew)
             (#x0600 #x06FF arabic)
             (#x0700 #x074F syriac)
             (#x07C0 #x07FA nko)
             (#x0780 #x07BF thaana)
             (#x0900 #x097F devanagari)
             (#x0980 #x09FF bengali)
             (#x0A00 #x0A7F gurmukhi)
             (#x0A80 #x0AFF gujarati)
             (#x0B00 #x0B7F oriya)
             (#x0B80 #x0BFF tamil)
             (#x0C00 #x0C7F telugu)
             (#x0C80 #x0CFF kannada)
             (#x0D00 #x0D7F malayalam)
             (#x0D80 #x0DFF sinhala)
             (#x0E00 #x0E5F thai)
             (#x0E80 #x0EDF lao)
             (#x0F00 #x0FFF tibetan)
             (#x1000 #x105F myanmar)
             (#x10A0 #x10FF georgian)
             (#x1100 #x11FF hangul)
             (#x1200 #x139F ethiopic)
             (#x13A0 #x13FF cherokee)
             (#x1400 #x167F canadian-aboriginal)
             (#x1680 #x169F ogham)
             (#x16A0 #x16FF runic)
             (#x1780 #x17FF khmer)
             (#x1800 #x18AF mongolian)
             (#x1D00 #x1DFF phonetic)
             (#x1E00 #x1EFF latin)
             (#x1F00 #x1FFF greek)
             (#x2000 #x27FF symbol)
             (#x2800 #x28FF braille)
             (#x2D80 #x2DDF ethiopic)
             (#x2E80 #x2FDF han)
             (#x2FF0 #x2FFF ideographic-description)
             (#x3000 #x303F cjk-misc)
             (#x3040 #x30FF kana)
             (#x3100 #x312F bopomofo)
             (#x3130 #x318F hangul)
             (#x3190 #x319F kanbun)
             (#x31A0 #x31BF bopomofo)
             (#x3400 #x9FAF han)
             (#xA000 #xA4CF yi)
             (#xAA00 #xAA5F cham)
             (#xAA80 #xAADF tai-viet)
             (#xAC00 #xD7AF hangul)
             (#xF900 #xFAFF han)
             (#xFB1D #xFB4F hebrew)
             (#xFB50 #xFDFF arabic)
             (#xFE70 #xFEFC arabic)
             (#xFF00 #xFF5F cjk-misc)
             (#xFF61 #xFF9F kana)
             (#xFFE0 #xFFE6 cjk-misc)
             (#x1D000 #x1D0FF byzantine-musical-symbol)
             (#x1D100 #x1D1FF musical-symbol)
             (#x1D400 #x1D7FF mathematical)
             (#x20000 #x2AFFF han)
             (#x2F800 #x2FFFF han)))
    (let ((from (car entry))
          (to (cadr entry))
          (script (nth 2 entry)))
      (set-char-table-range char-script-table (cons from to) script)
      ;; Record each script once; the list is built in reverse.
      (unless (memq script scripts-seen)
        (push script scripts-seen))))
  (set-char-table-extra-slot char-script-table 0 (nreverse scripts-seen)))
1112
;; Tag every character range of the `tibetan' charset with the
;; `tibetan' script in `char-script-table'.
(let ((tag-as-tibetan
       (lambda (range _arg)
         (set-char-table-range char-script-table range 'tibetan))))
  (map-charset-chars tag-as-tibetan 'tibetan))
1117
1118 \f
1119 ;;; Setting word boundary.
1120
(defun next-word-boundary-han (pos limit)
  "Return a word boundary position found by scanning from POS.
If POS is not greater than LIMIT, scan forward: skip a run of
characters of category C starting at POS, then a run of characters of
category H, and return the position reached.  LIMIT is not used as a
bound for this forward scan.  If neither run is present at POS, POS
itself is returned.

Otherwise scan backward: move POS back, but not before LIMIT, for as
long as the character before POS has script `han' in
`char-script-table', and return the position reached."
  (if (<= pos limit)
      (save-excursion
        (goto-char pos)
        ;; Move only when the pattern really matched: after a failed
        ;; `looking-at' the match data still describes some earlier
        ;; match, so `match-end' would yield an unrelated position.
        (if (looking-at "\\cC+")
            (goto-char (match-end 0)))
        (if (looking-at "\\cH+")
            (goto-char (match-end 0)))
        (point))
    (while (and (> pos limit)
                (eq (aref char-script-table (char-after (1- pos))) 'han))
      (setq pos (1- pos)))
    pos))
1134
(defun next-word-boundary-kana (pos limit)
  "Return a word boundary position found by scanning from POS.
If POS is not greater than LIMIT, scan forward from POS: skip, in this
order and each only if present, a run of category K characters, a run
of category H characters and a run of category k characters, and
return the position reached.

Otherwise scan backward toward LIMIT.  If the character at POS is of
category K or k, move back over preceding characters of category K or
k.  If not, move back over preceding characters of category H; then,
if the last character examined is of category C, K or A (tested in
that order), step back over it and over the preceding characters of
that same category.  Return the position reached."
  (if (> pos limit)
      ;; Backward scan.
      (let ((cats (char-category-set (char-after pos)))
            run-category)
        (cond
         ((or (aref cats ?K) (aref cats ?k))
          (while (and (> pos limit)
                      (setq cats (char-category-set (char-after (1- pos))))
                      (or (aref cats ?K) (aref cats ?k)))
            (setq pos (1- pos))))
         (t
          ;; CATS is left holding the category set of the last
          ;; character examined by this loop.
          (while (and (> pos limit)
                      (aref (setq cats
                                  (char-category-set (char-after (1- pos))))
                            ?H))
            (setq pos (1- pos)))
          (setq run-category (cond ((aref cats ?C) ?C)
                                   ((aref cats ?K) ?K)
                                   ((aref cats ?A) ?A)))
          (when run-category
            (setq pos (1- pos))
            (while (and (> pos limit)
                        (aref (char-category-set (char-after (1- pos)))
                              run-category))
              (setq pos (1- pos))))))
        pos)
    ;; Forward scan.
    (save-excursion
      (goto-char pos)
      (dolist (run-regexp '("\\cK+" "\\cH+" "\\ck+"))
        (when (looking-at run-regexp)
          (goto-char (match-end 0))))
      (point))))
1168
;; For every entry of `char-script-table' whose script is `han' or
;; `kana', register the matching boundary function in
;; `find-word-boundary-function-table'.
(map-char-table
 (lambda (chars script)
   (let ((finder (cdr (assq script
                            '((han . next-word-boundary-han)
                              (kana . next-word-boundary-kana))))))
     (when finder
       (set-char-table-range find-word-boundary-function-table
                             chars finder))))
 char-script-table)
1178
;; Category pairs installed as `word-combining-categories'.
;; NOTE(review): (?C . ?K) is also listed in
;; `word-separating-categories' -- confirm that this is intended.
(setq word-combining-categories
      '((?l . ?l)                       ; Latin - Latin
        (?C . ?C)                       ; Chinese - Chinese
        (?C . ?H)                       ; Chinese - Hiragana
        (?C . ?K)                       ; Chinese - Katakana
        ))
1184
;; Category pairs installed as `word-separating-categories'
;; (2-byte character sets).
(setq word-separating-categories
      '(;; Alpha numeric followed by ...
        (?A . ?K)                       ; ... Katakana
        (?A . ?C)                       ; ... Chinese
        ;; Hiragana followed by ...
        (?H . ?A)                       ; ... Alpha numeric
        (?H . ?K)                       ; ... Katakana
        (?H . ?C)                       ; ... Chinese
        ;; Katakana followed by ...
        (?K . ?A)                       ; ... Alpha numeric
        (?K . ?C)                       ; ... Chinese
        ;; Chinese followed by ...
        (?C . ?A)                       ; ... Alpha numeric
        (?C . ?K)                       ; ... Katakana
        ))
1196
1197 ;; Local Variables:
1198 ;; coding: utf-8
1199 ;; End:
1200
1201 ;; arch-tag: 85889c35-9f4d-4912-9bf5-82de31b0d42d
1202 ;;; characters.el ends here