]> code.delx.au - gnu-emacs/blob - lisp/international/characters.el
Revision: miles@gnu.org--gnu-2004/emacs--unicode--0--patch-26
[gnu-emacs] / lisp / international / characters.el
1 ;;; characters.el --- set syntax and category for multibyte characters
2
3 ;; Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
4 ;; Licensed to the Free Software Foundation.
5 ;; Copyright (C) 2001, 2002 Free Software Foundation, Inc.
6 ;; Copyright (C) 2003
7 ;; National Institute of Advanced Industrial Science and Technology (AIST)
8 ;; Registration Number H13PRO009
9
10 ;; Keywords: multibyte character, character set, syntax, category
11
12 ;; This file is part of GNU Emacs.
13
14 ;; GNU Emacs is free software; you can redistribute it and/or modify
15 ;; it under the terms of the GNU General Public License as published by
16 ;; the Free Software Foundation; either version 2, or (at your option)
17 ;; any later version.
18
19 ;; GNU Emacs is distributed in the hope that it will be useful,
20 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
21 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 ;; GNU General Public License for more details.
23
24 ;; You should have received a copy of the GNU General Public License
25 ;; along with GNU Emacs; see the file COPYING. If not, write to the
26 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
27 ;; Boston, MA 02111-1307, USA.
28
29 ;;; Commentary:
30
31 ;;; Code:
32
33 ;;; Predefined categories.
34
35 ;; For each character set.
36
;; Register every predefined category from one table of
;; (MNEMONIC . DOCSTRING) pairs.
(dolist (spec
         '(;; For each character set.
           (?a . "ASCII")
           (?l . "Latin")
           (?t . "Thai")
           (?g . "Greek")
           (?b . "Arabic")
           (?w . "Hebrew")
           (?y . "Cyrillic")
           (?k . "Japanese katakana")
           (?r . "Japanese roman")
           (?c . "Chinese")
           (?j . "Japanese")
           (?h . "Korean")
           (?e . "Ethiopic (Ge'ez)")
           (?v . "Vietnamese")
           (?i . "Indian")
           (?o . "Lao")
           (?q . "Tibetan")
           ;; For each group (row) of 2-byte character sets.
           (?A . "Alpha-numeric characters of 2-byte character sets")
           (?C . "Chinese (Han) characters of 2-byte character sets")
           (?G . "Greek characters of 2-byte character sets")
           (?H . "Japanese Hiragana characters of 2-byte character sets")
           (?K . "Japanese Katakana characters of 2-byte character sets")
           (?N . "Korean Hangul characters of 2-byte character sets")
           (?Y . "Cyrillic characters of 2-byte character sets")
           (?I . "Indian Glyphs")
           ;; For phonetic classifications.
           (?0 . "consonant")
           (?1 . "base (independent) vowel")
           (?2 . "upper diacritical mark (including upper vowel)")
           (?3 . "lower diacritical mark (including lower vowel)")
           (?4 . "tone mark")
           (?5 . "symbol")
           (?6 . "digit")
           (?7 . "vowel-modifying diacritical mark")
           (?8 . "vowel-signs")
           (?9 . "semivowel lower")
           ;; For filling.
           (?| . "While filling, we can break a line at this character.")
           ;; For indentation calculation.
           (?\s . "This character counts as a space for indentation purposes.")
           ;; Kept for `kinsoku' processing.  See comments in kinsoku.el.
           (?> . "A character which can't be placed at beginning of line.")
           (?< . "A character which can't be placed at end of line.")
           ;; Combining
           (?^ . "Combining diacritic or mark")))
  (define-category (car spec) (cdr spec)))
93 \f
94 ;;; Setting syntax and category.
95
96 ;; ASCII
97
98 ;; All ASCII characters have the category `a' (ASCII) and `l' (Latin).
;; Characters 32 through 127 get both `a' (ASCII) and `l' (Latin).
(dolist (cat '(?a ?l))
  (modify-category-entry '(32 . 127) cat))

;; Deal with the CJK charsets first.  Since the syntax of blocks is
;; defined per charset, and the charsets may contain e.g. Latin
;; characters, we end up with the wrong syntax definitions if we're
;; not careful.

;; Chinese characters (Unicode).  Each entry is (RANGE CATEGORY...).
(dolist (spec '(((#x2E80 . #x312F) ?|)
                ((#x3190 . #x33FF) ?|)
                ((#x3400 . #x9FAF) ?C ?c ?|)
                ((#xF900 . #xFAFF) ?C ?c ?|)
                ((#x20000 . #x2AFFF) ?|)
                ((#x2F800 . #x2FFFF) ?|)))
  (dolist (cat (cdr spec))
    (modify-category-entry (car spec) cat)))
118
119
120 ;; Chinese character set (GB2312)
121
;; Symbol syntax for the GB2312 rows given as (FROM . TO) code ranges.
(dolist (row '((#x2121 . #x217E)
               (#x2221 . #x227E)
               (#x2921 . #x297E)))
  (map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_"
                     (car row) (cdr row)))

;; The whole charset is `c'; individual rows get a finer category.
(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?c)
(dolist (spec '((?A #x2330 #x2339)
                (?A #x2341 #x235A)
                (?A #x2361 #x237A)
                (?H #x2421 #x247E)
                (?K #x2521 #x257E)
                (?G #x2621 #x267E)
                (?Y #x2721 #x277E)
                (?C #x3021 #x7E7E)))
  (map-charset-chars #'modify-category-entry 'chinese-gb2312
                     (nth 0 spec) (nth 1 spec) (nth 2 spec)))
135
136 ;; Chinese character set (BIG5)
137
(map-charset-chars #'modify-category-entry 'big5 ?c)
(dolist (range '((#xA259 . #xA25F)
                 (#xA440 . #xC67E)
                 (#xC940 . #xF9DF)))
  (map-charset-chars #'modify-category-entry 'big5 ?C
                     (car range) (cdr range)))

;; Chinese character set (CNS11643)

;; Every plane is `c'.  Plane 1 is `C' only from #x4421 upwards; the
;; other planes are `C' throughout.
(dolist (plane '(chinese-cns11643-1 chinese-cns11643-2 chinese-cns11643-3
                 chinese-cns11643-4 chinese-cns11643-5 chinese-cns11643-6
                 chinese-cns11643-7))
  (map-charset-chars #'modify-category-entry plane ?c)
  (cond ((eq plane 'chinese-cns11643-1)
         (map-charset-chars #'modify-category-entry plane ?C #x4421 #x7E7E))
        (t
         (map-charset-chars #'modify-category-entry plane ?C))))
152
153 ;; Japanese character set (JISX0201, JISX0208, JISX0212, JISX0213)
154
(map-charset-chars #'modify-category-entry 'katakana-jisx0201 ?k)
(map-charset-chars #'modify-category-entry 'latin-jisx0201 ?r)

;; All of these charsets are `j' (Japanese).
(dolist (charset '(katakana-jisx0201 japanese-jisx0208 japanese-jisx0212
                   japanese-jisx0213-1 japanese-jisx0213-2))
  (map-charset-chars #'modify-category-entry charset ?j))

;; Unicode equivalents of JISX0201-kana
(dolist (cat '(?k ?j ?\|))
  (modify-category-entry '(#xff61 . #xff9f) cat))

;; Katakana block
;; ?K is double width, ?k isn't specified
(dolist (cat '(?K ?\|))
  (modify-category-entry '(#x30a0 . #x30ff) cat))

;; Hiragana block
;; ?H is actually defined to be double width, so it is not set here:
;;(modify-category-entry '(#x3040 . #x309d) ?H)
(modify-category-entry '(#x3040 . #x309d) ?\|)
181
182 ;; JISX0208
;; Rows #x21-#x22 and #x28 of JISX0208 get symbol syntax.
(map-charset-chars #'modify-syntax-entry 'japanese-jisx0208 "_" #x2121 #x227E)
(map-charset-chars #'modify-syntax-entry 'japanese-jisx0208 "_" #x2821 #x287E)
;; These characters become word constituents again.  The loop must use
;; the loop variable: passing (car chars) would set only the first
;; character, over and over.
(dolist (elt '(?ー ?゛ ?゜ ?ヽ ?ヾ ?ゝ ?ゞ ?〃 ?仝 ?々 ?〆 ?〇))
  (modify-syntax-entry elt "w"))

;; Per-row categories, given as (CATEGORY FROM TO).
(dolist (spec '((?A #x2321 #x237E)
                (?H #x2421 #x247E)
                (?K #x2521 #x257E)
                (?G #x2621 #x267E)
                (?Y #x2721 #x277E)
                (?C #x3021 #x7E7E)))
  (map-charset-chars #'modify-category-entry 'japanese-jisx0208
                     (nth 0 spec) (nth 1 spec) (nth 2 spec)))
(modify-category-entry ?ー ?K)
;; The voicing marks belong to both kana categories.
(dolist (ch '(?゛ ?゜))
  (modify-category-entry ch ?K)
  (modify-category-entry ch ?H))
(dolist (ch '(?ヽ ?ヾ ?ゝ ?ゞ ?〃 ?仝 ?々 ?〆 ?〇))
  (modify-category-entry ch ?C))
205
206 ;; JISX0212
207
(map-charset-chars #'modify-syntax-entry 'japanese-jisx0212 "_" #x2121 #x237E)

;; JISX0201-Kana

;; Punctuation syntax for the kana full stop, comma and middle dot.
(dolist (ch '(?。 ?、 ?・))
  (modify-syntax-entry ch "."))

;; Corner brackets form a matched pair: the first opens, the second
;; closes.  The closing bracket needs the `)' class; giving it `('
;; would make both brackets openers.
(modify-syntax-entry ?\「 "(」")
(modify-syntax-entry ?\」 ")「")
219
220 ;; Korean character set (KSC5601)
221
;; The whole charset is `h' (Korean).
(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?h)

;; Symbol syntax for these (FROM . TO) code ranges.
(dolist (range '((#x2121 . #x227E)
                 (#x2621 . #x277E)
                 (#x2830 . #x287E)
                 (#x2930 . #x297E)))
  (map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_"
                     (car range) (cdr range)))

;; Finer categories, given as (CATEGORY FROM TO).
(dolist (spec '((?A #x2330 #x2339)
                (?A #x2341 #x235A)
                (?A #x2361 #x237A)
                (?G #x2521 #x257E)
                (?H #x2A21 #x2A7E)
                (?K #x2B21 #x2B7E)
                (?Y #x2C21 #x2C7E)))
  (map-charset-chars #'modify-category-entry 'korean-ksc5601
                     (nth 0 spec) (nth 1 spec) (nth 2 spec)))
235
236 ;; These are in more than one charset.
;; PARENS is read two characters at a time: an opener followed by its
;; closer.  Each gets paren syntax naming the other as its match.
(let ((parens (concat "〈〉《》「」『』【】〔〕〖〗〘〙〚〛"
                      "︵︶︷︸︹︺︻︼︽︾︿﹀﹁﹂﹃﹄"
                      "()[]{}"))
      (idx 0))
  (while (< (1+ idx) (length parens))
    (let ((open (aref parens idx))
          (close (aref parens (1+ idx))))
      (modify-syntax-entry open (format "(%c" close))
      (modify-syntax-entry close (format ")%c" open)))
    (setq idx (+ idx 2))))
246
247 ;; Arabic character set
248
;; Every character of these charsets is `b' (Arabic) ...
(dolist (charset '(arabic-iso8859-6
                   arabic-digit
                   arabic-1-column
                   arabic-2-column))
  (map-charset-chars #'modify-category-entry charset ?b))
;; ... and so are these code point ranges.
(dolist (range '((#x600 . #x6ff)
                 (#xfb50 . #xfdff)
                 (#xfe70 . #xfefe)))
  (modify-category-entry range ?b))

;; Cyrillic character set (ISO-8859-5)

(modify-syntax-entry ?№ ".")
263
264 ;; Ethiopic character set
265
(modify-category-entry '(#x1200 . #x137c) ?e)
;; The Ethiopic marks U+1361 through U+1368 get punctuation syntax.
;; NOTE(review): this copy of the file listed six further characters
;; after U+1368 whose glyphs were lost, leaving bare `?' tokens; the
;; last of them swallowed the closing paren and made the form
;; unreadable.  They are omitted here -- restore them from a pristine
;; copy of the file if they are still wanted.
(dolist (ch '(?፡ ?። ?፣ ?፤ ?፥ ?፦ ?፧ ?፨))
  (modify-syntax-entry ch "."))
(map-charset-chars #'modify-category-entry 'ethiopic ?e)
272
273 ;; Hebrew character set (ISO-8859-8)
274
;; Hebrew marks that take punctuation syntax.
(dolist (ch '(#x5be                     ; MAQAF
              #x5c0                     ; PASEQ
              #x5c3                     ; SOF PASUQ
              #x5f3                     ; GERESH
              #x5f4))                   ; GERSHAYIM
  (modify-syntax-entry ch "."))

;; Indian character set (IS 13194 and other Emacs original Indian charsets)

(modify-category-entry '(#x901 . #x970) ?i)
(dolist (charset '(indian-is13194 indian-2-column))
  (map-charset-chars #'modify-category-entry charset ?i))

;; Lao character set

(modify-category-entry '(#xe80 . #xeff) ?o)
(map-charset-chars #'modify-category-entry 'lao ?o)
291
;; Each entry is (CHARS SYNTAX CATEGORY).  CHARS lists single characters
;; and `X-Y' ranges.  Every listed character gets CATEGORY, and also
;; SYNTAX unless SYNTAX is "w".
(let (ch to)
  (dolist (def '(("ກ-ຮ" "w" ?0)         ; consonant
                 ("ະາຳຽເ-ໄ" "w" ?1)      ; vowel base
                 ("ັິ-ືົໍ" "w" ?2)        ; vowel upper
                 ("ຸູ" "w" ?3)           ; vowel lower
                 ("່-໋" "w" ?4)          ; tone mark
                 ("ຼຽ" "w" ?9)           ; semivowel lower
                 ("໐-໙" "w" ?6)         ; digit
                 ("ຯໆ" "_" ?5)           ; symbol
                 ))
    (let ((chars (nth 0 def))
          (syntax (nth 1 def))
          (category (nth 2 def))
          (pos 0))
      (while (< pos (length chars))
        (cond ((= (aref chars pos) ?-)
               ;; Range: the start was handled on the previous pass, so
               ;; CH is already one past it.  Step over the hyphen and
               ;; run up to the range end.
               (setq pos (1+ pos)
                     to (aref chars pos)))
              (t
               (setq ch (aref chars pos)
                     to ch)))
        (while (<= ch to)
          (or (string-equal syntax "w")
              (modify-syntax-entry ch syntax))
          (modify-category-entry ch category)
          (setq ch (1+ ch)))
        (setq pos (1+ pos))))))
322
323 ;; Thai character set (TIS620)
324
(modify-category-entry '(#xe00 . #xe7f) ?t)
(map-charset-chars #'modify-category-entry 'thai-tis620 ?t)

;; Each entry is (CHARS SYNTAX CATEGORY).  CHARS lists single characters
;; and `X-Y' ranges.  Every listed character gets CATEGORY, and also
;; SYNTAX unless SYNTAX is "w".
(let (ch to)
  (dolist (def '(;; chars	syntax category
                 ("ก-รลว-ฮ" "w" ?0)     ; consonant
                 ("ฤฦะาำเ-ๅ" "w" ?1)     ; vowel base
                 ("ัิ-ื็๎" "w" ?2)        ; vowel upper
                 ("ุ-ฺ" "w" ?3)          ; vowel lower
                 ("่-ํ" "w" ?4)          ; tone mark
                 ("๐-๙" "w" ?6)         ; digit
                 ("ฯๆ฿๏๚๛" "_" ?5)       ; symbol
                 ))
    (let ((chars (nth 0 def))
          (syntax (nth 1 def))
          (category (nth 2 def))
          (pos 0))
      (while (< pos (length chars))
        (cond ((= (aref chars pos) ?-)
               ;; Range: CH is already one past the range start; step
               ;; over the hyphen and run up to the range end.
               (setq pos (1+ pos)
                     to (aref chars pos)))
              (t
               (setq ch (aref chars pos)
                     to ch)))
        (while (<= ch to)
          (or (string-equal syntax "w")
              (modify-syntax-entry ch syntax))
          (modify-category-entry ch category)
          (setq ch (1+ ch)))
        (setq pos (1+ pos))))))
358
359 ;; Tibetan character set
360
(modify-category-entry '(#xf00 . #xfff) ?q)
(map-charset-chars #'modify-category-entry 'tibetan ?q)
(map-charset-chars #'modify-category-entry 'tibetan-1-column ?q)

;; Each entry is (CHARS SYNTAX CATEGORY).  CHARS lists single characters
;; and `X-Y' ranges.  Every listed character gets CATEGORY, and also
;; SYNTAX unless SYNTAX is "w".
(let ((deflist '(;; chars	syntax category
                 ("ཀ-ཀྵཪ" "w" ?0)	; consonant
                 ("ྐ-ྐྵྺྻྼ" "w" ?0)	;
                 ;; NOTE(review): the next two entries have lost their
                 ;; range endpoints in this copy of the file and are
                 ;; skipped by the loop below; restore the characters
                 ;; from a pristine copy.
                 ("-" "w" ?0)		;
                 ("-" "w" ?0)		;
                 ("ིེཻོཽྀ" "w" ?2)	; upper vowel
                 ("ཾྂྃ྆྇ྈྉྊྋ" "w" ?2)	; upper modifier
                 ("྄ཱུ༙༵༷" "w" ?3)	; lower vowel/modifier
                 ("཰" "w" ?3)		; invisible vowel a
                 ("༠-༩༪-༳" "w" ?6)	; digit
                 ("་།-༒༔ཿ" "." ?|)	; line-break char
                 ("་།༏༐༑༔ཿ" "." ?|)	;
                 ("༈་།-༒༔ཿ༽༴" "." ?>)	; prohibition
                 ("་།༏༐༑༔ཿ" "." ?>)	;
                 ("ༀ-༊༼࿁࿂྅" "." ?<)	; prohibition
                 ("༓༕-༘༚-༟༶༸-༻༾༿྾྿-࿏" "." ?q) ; others
                 ))
      elm chars len syntax category to ch i)
  (while deflist
    (setq elm (car deflist))
    (setq chars (car elm)
          len (length chars)
          syntax (nth 1 elm)
          category (nth 2 elm)
          i 0)
    (while (< i len)
      (cond ((/= (aref chars i) ?-)
             ;; A single character, or the start of a range.
             (setq ch (aref chars i)
                   to ch))
            ((and (> i 0) (< (1+ i) len))
             ;; `X-Y': X was handled on the previous pass, so CH is
             ;; already one past it.  Step over the hyphen to Y.
             (setq i (1+ i)
                   to (aref chars i)))
            (t
             ;; A hyphen with no character before or after it is not a
             ;; range.  Ignore it rather than index past the string end
             ;; or reuse CH left over from an earlier entry.
             (setq to nil)))
      (while (and to (<= ch to))
        (unless (string-equal syntax "w")
          (modify-syntax-entry ch syntax))
        (modify-category-entry ch category)
        (setq ch (1+ ch)))
      (setq i (1+ i)))
    (setq deflist (cdr deflist))))
403
404 ;; Vietnamese character set
405
406 ;; To make a word with Latin characters
;; Both VISCII halves are `l' (so they form words with Latin
;; characters) and `v' (Vietnamese).
(dolist (charset '(vietnamese-viscii-lower vietnamese-viscii-upper))
  (dolist (cat '(?l ?v))
    (map-charset-chars #'modify-category-entry charset cat)))

;; For every code 32..127, pair the upper-charset character with the
;; lower-charset character at the same code, and mark the `ucs'
;; encodings of both as `v' when such an encoding exists.
(let ((tbl (standard-case-table))
      (code 32))
  (while (< code 128)
    (let* ((upper (decode-char 'vietnamese-viscii-upper code))
           (lower (decode-char 'vietnamese-viscii-lower code))
           (upper-ucs (encode-char upper 'ucs))
           (lower-ucs (encode-char lower 'ucs)))
      (set-case-syntax-pair upper lower tbl)
      (and upper-ucs (modify-category-entry upper-ucs ?v))
      (and lower-ucs (modify-category-entry lower-ucs ?v)))
    (setq code (1+ code))))
425
426
427 ;; Latin
428
429 (modify-category-entry '(#x80 . #x024F) ?l)
430
431 (let ((tbl (standard-case-table)) c)
432
433 ;; In some languages, U+0049 LATIN CAPITAL LETTER I and U+0131 LATIN
434 ;; SMALL LETTER DOTLESS I make a case pair, and so do U+0130 LATIN
435 ;; CAPITAL LETTER I WITH DOT ABOVE and U+0069 LATIN SMALL LETTER I.
436 ;; See the Turkish language environment.
437
438 ;; Latin-1
439
440 ;; Fixme: Some of the non-word syntaxes here perhaps should be
441 ;; reviewed. (Note that the following all implicitly have word
442 ;; syntax: ¢£¤¥¨ª¯²³´¶¸¹º.) There should be a well-defined way of
443 ;; relating Unicode categories to Emacs syntax codes.
;; Latin-1 symbols and punctuation as (CODE . SYNTAX).  Code points are
;; used instead of character literals because NO-BREAK SPACE and SOFT
;; HYPHEN are invisible in the source and easily lost or replaced by a
;; plain space.
(dolist (spec '((#xA0 . " ")            ; NO-BREAK SPACE -- dubious
                (#xA1 . ".")            ; ¡
                (#xA6 . "_")            ; ¦
                (#xA7 . ".")            ; §
                (#xA9 . "_")            ; ©
                (#xAC . "_")            ; ¬
                (#xAD . "_")            ; SOFT HYPHEN
                (#xAE . "_")            ; ®
                (#xB0 . "_")            ; °
                (#xB1 . "_")            ; ±
                (#xB5 . "_")            ; µ
                (#xB7 . "_")            ; ·
                (#xBC . "_")            ; ¼
                (#xBD . "_")            ; ½
                (#xBE . "_")            ; ¾
                (#xBF . ".")))          ; ¿
  (set-case-syntax (car spec) (cdr spec) tbl))
(set-case-syntax-delims 171 187 tbl)    ; « »
;; Codes 192..222 pair with the character 32 code points above.  That
;; range includes 215 (×) and its partner 247 (÷), which are given
;; symbol syntax again immediately afterwards.
(let ((upper 192))
  (while (<= upper 222)
    (set-case-syntax-pair upper (+ upper 32) tbl)
    (setq upper (1+ upper))))
(dolist (spec '((#xD7 . "_")            ; ×
                (#xDF . "w")            ; ß
                (#xF7 . "_")))          ; ÷
  (set-case-syntax (car spec) (cdr spec) tbl))
468 ;; See below for ÿ.
469
470 ;; Latin Extended-A, Latin Extended-B
;; Most of Latin Extended-A alternates capital/small letters, so the
;; pairs can be generated from the code point parity.
(setq c #x0100)
(while (<= c #x0233)
  ;; U+0100..U+012F and U+014A..U+0177: capital on the even code.
  (and (or (<= c #x012e)
           (and (>= c #x014a) (<= c #x0177)))
       (zerop (% c 2))
       (set-case-syntax-pair c (1+ c) tbl))
  ;; U+0139..U+0148: capital on the odd code.
  (and (>= c #x013a)
       (<= c #x0148)
       (zerop (% c 2))
       (set-case-syntax-pair (1- c) c tbl))
  (setq c (1+ c)))
;; Pairs the loop above does not generate.  The IJ ligature is written
;; as code points: its literal form is easily expanded into the two
;; letters I and J, which `?' cannot read as one character.
(set-case-syntax-pair #x0132 #x0133 tbl) ; LATIN LIGATURE IJ
(set-case-syntax-pair ?Ĵ ?ĵ tbl)
(set-case-syntax-pair ?Ķ ?ķ tbl)
(set-case-syntax-pair ?Ÿ ?ÿ tbl)
(set-case-syntax-pair ?Ź ?ź tbl)
(set-case-syntax-pair ?Ż ?ż tbl)
(set-case-syntax-pair ?Ž ?ž tbl)
489
490 ;; Latin Extended-B
491 (set-case-syntax-pair ?Ɓ ?ɓ tbl)
492 (set-case-syntax-pair ?Ƃ ?ƃ tbl)
493 (set-case-syntax-pair ?Ƅ ?ƅ tbl)
494 (set-case-syntax-pair ?Ɔ ?ɔ tbl)
495 (set-case-syntax-pair ?Ƈ ?ƈ tbl)
496 (set-case-syntax-pair ?Ɖ ?ɖ tbl)
497 (set-case-syntax-pair ?Ɗ ?ɗ tbl)
498 (set-case-syntax-pair ?Ƌ ?ƌ tbl)
499 (set-case-syntax-pair ?Ǝ ?ǝ tbl)
500 (set-case-syntax-pair ?Ə ?ə tbl)
501 (set-case-syntax-pair ?Ɛ ?ɛ tbl)
502 (set-case-syntax-pair ?Ƒ ?ƒ tbl)
503 (set-case-syntax-pair ?Ɠ ?ɠ tbl)
504 (set-case-syntax-pair ?Ɣ ?ɣ tbl)
505 (set-case-syntax-pair ?Ɩ ?ɩ tbl)
506 (set-case-syntax-pair ?Ɨ ?ɨ tbl)
507 (set-case-syntax-pair ?Ƙ ?ƙ tbl)
508 (set-case-syntax-pair ?Ɯ ?ɯ tbl)
509 (set-case-syntax-pair ?Ɲ ?ɲ tbl)
510 (set-case-syntax-pair ?Ɵ ?ɵ tbl)
511 (set-case-syntax-pair ?Ơ ?ơ tbl)
512 (set-case-syntax-pair ?Ƣ ?ƣ tbl)
513 (set-case-syntax-pair ?Ƥ ?ƥ tbl)
514 (set-case-syntax-pair ?Ʀ ?ʀ tbl)
515 (set-case-syntax-pair ?Ƨ ?ƨ tbl)
516 (set-case-syntax-pair ?Ʃ ?ʃ tbl)
517 (set-case-syntax-pair ?Ƭ ?ƭ tbl)
518 (set-case-syntax-pair ?Ʈ ?ʈ tbl)
519 (set-case-syntax-pair ?Ư ?ư tbl)
520 (set-case-syntax-pair ?Ʊ ?ʊ tbl)
521 (set-case-syntax-pair ?Ʋ ?ʋ tbl)
522 (set-case-syntax-pair ?Ƴ ?ƴ tbl)
523 (set-case-syntax-pair ?Ƶ ?ƶ tbl)
524 (set-case-syntax-pair ?Ʒ ?ʒ tbl)
525 (set-case-syntax-pair ?Ƹ ?ƹ tbl)
526 (set-case-syntax-pair ?Ƽ ?ƽ tbl)
;; Digraph letters.  Each has an uppercase and a titlecase form that
;; both map to the same lowercase letter.  They are written as code
;; points because their literal forms are easily expanded into two
;; separate letters, which `?' cannot read as one character.
(set-case-syntax-pair #x01C4 #x01C6 tbl) ; DZ WITH CARON, upper -> small
(set-case-syntax-pair #x01C5 #x01C6 tbl) ; DZ WITH CARON, title -> small
(set-case-syntax-pair #x01C7 #x01C9 tbl) ; LJ, upper -> small
(set-case-syntax-pair #x01C8 #x01C9 tbl) ; LJ, title -> small
(set-case-syntax-pair #x01CA #x01CC tbl) ; NJ, upper -> small
(set-case-syntax-pair #x01CB #x01CC tbl) ; NJ, title -> small
533 (set-case-syntax-pair ?Ǎ ?ǎ tbl)
534 (set-case-syntax-pair ?Ǐ ?ǐ tbl)
535 (set-case-syntax-pair ?Ǒ ?ǒ tbl)
536 (set-case-syntax-pair ?Ǔ ?ǔ tbl)
537 (set-case-syntax-pair ?Ǖ ?ǖ tbl)
538 (set-case-syntax-pair ?Ǘ ?ǘ tbl)
539 (set-case-syntax-pair ?Ǚ ?ǚ tbl)
540 (set-case-syntax-pair ?Ǜ ?ǜ tbl)
541 (set-case-syntax-pair ?Ǟ ?ǟ tbl)
542 (set-case-syntax-pair ?Ǡ ?ǡ tbl)
543 (set-case-syntax-pair ?Ǣ ?ǣ tbl)
544 (set-case-syntax-pair ?Ǥ ?ǥ tbl)
545 (set-case-syntax-pair ?Ǧ ?ǧ tbl)
546 (set-case-syntax-pair ?Ǩ ?ǩ tbl)
547 (set-case-syntax-pair ?Ǫ ?ǫ tbl)
548 (set-case-syntax-pair ?Ǭ ?ǭ tbl)
549 (set-case-syntax-pair ?Ǯ ?ǯ tbl)
550 ;; 01F0; F; 006A 030C; # LATIN SMALL LETTER J WITH CARON
;; DZ digraph: the uppercase and titlecase forms both map to the small
;; letter.  Code points are used because the literal forms are easily
;; expanded into two letters, which `?' cannot read as one character.
(set-case-syntax-pair #x01F1 #x01F3 tbl) ; DZ, upper -> small
(set-case-syntax-pair #x01F2 #x01F3 tbl) ; DZ, title -> small
553 (set-case-syntax-pair ?Ǵ ?ǵ tbl)
554 (set-case-syntax-pair ?Ƕ ?ƕ tbl)
555 (set-case-syntax-pair ?Ƿ ?ƿ tbl)
556 (set-case-syntax-pair ?Ǹ ?ǹ tbl)
557 (set-case-syntax-pair ?Ǻ ?ǻ tbl)
558 (set-case-syntax-pair ?Ǽ ?ǽ tbl)
559 (set-case-syntax-pair ?Ǿ ?ǿ tbl)
560 (set-case-syntax-pair ?Ȁ ?ȁ tbl)
561 (set-case-syntax-pair ?Ȃ ?ȃ tbl)
562 (set-case-syntax-pair ?Ȅ ?ȅ tbl)
563 (set-case-syntax-pair ?Ȇ ?ȇ tbl)
564 (set-case-syntax-pair ?Ȉ ?ȉ tbl)
565 (set-case-syntax-pair ?Ȋ ?ȋ tbl)
566 (set-case-syntax-pair ?Ȍ ?ȍ tbl)
567 (set-case-syntax-pair ?Ȏ ?ȏ tbl)
568 (set-case-syntax-pair ?Ȑ ?ȑ tbl)
569 (set-case-syntax-pair ?Ȓ ?ȓ tbl)
570 (set-case-syntax-pair ?Ȕ ?ȕ tbl)
571 (set-case-syntax-pair ?Ȗ ?ȗ tbl)
572 (set-case-syntax-pair ?Ș ?ș tbl)
573 (set-case-syntax-pair ?Ț ?ț tbl)
574 (set-case-syntax-pair ?Ȝ ?ȝ tbl)
575 (set-case-syntax-pair ?Ȟ ?ȟ tbl)
576 (set-case-syntax-pair ?Ȣ ?ȣ tbl)
577 (set-case-syntax-pair ?Ȥ ?ȥ tbl)
578 (set-case-syntax-pair ?Ȧ ?ȧ tbl)
579 (set-case-syntax-pair ?Ȩ ?ȩ tbl)
580 (set-case-syntax-pair ?Ȫ ?ȫ tbl)
581 (set-case-syntax-pair ?Ȭ ?ȭ tbl)
582 (set-case-syntax-pair ?Ȯ ?ȯ tbl)
583 (set-case-syntax-pair ?Ȱ ?ȱ tbl)
584 (set-case-syntax-pair ?Ȳ ?ȳ tbl)
585
586 ;; Latin Extended Additional
587 (modify-category-entry '(#x1e00 . #x1ef9) ?l)
588 (setq c #x1e00)
589 (while (<= c #x1ef9)
590 (and (zerop (% c 2))
591 (or (<= c #x1e94) (>= c #x1ea0))
592 (set-case-syntax-pair c (1+ c) tbl))
593 (setq c (1+ c)))
594
595 ;; Greek
596 (modify-category-entry '(#x0370 . #x03ff) ?g)
597 (setq c #x0370)
598 (while (<= c #x03ff)
599 (if (or (and (>= c #x0391) (<= c #x03a1))
600 (and (>= c #x03a3) (<= c #x03ab)))
601 (set-case-syntax-pair c (+ c 32) tbl))
602 (and (>= c #x03da)
603 (<= c #x03ee)
604 (zerop (% c 2))
605 (set-case-syntax-pair c (1+ c) tbl))
606 (setq c (1+ c)))
607 (set-case-syntax-pair ?Ά ?ά tbl)
608 (set-case-syntax-pair ?Έ ?έ tbl)
609 (set-case-syntax-pair ?Ή ?ή tbl)
610 (set-case-syntax-pair ?Ί ?ί tbl)
611 (set-case-syntax-pair ?Ό ?ό tbl)
612 (set-case-syntax-pair ?Ύ ?ύ tbl)
613 (set-case-syntax-pair ?Ώ ?ώ tbl)
614
615 ;; Armenian
616 (setq c #x531)
617 (while (<= c #x556)
618 (set-case-syntax-pair c (+ c #x30) tbl)
619 (setq c (1+ c)))
620
621 ;; Greek Extended
(modify-category-entry '(#x1f00 . #x1fff) ?g)
;; Up to U+1FA7 the block is laid out in groups of 16: small letters
;; in the low 8 codes and their capitals 8 codes higher.
(setq c #x1f00)
(while (<= c #x1fff)
  (and (<= (logand c #x000f) 7)
       (<= c #x1fa7)
       ;; These small upsilons are excluded from the pairing.
       (not (memq c '(#x1f50 #x1f52 #x1f54 #x1f56)))
       ;; Skip U+1F70..U+1F77: the codes 8 above them are other small
       ;; letters, not capitals; their capitals are paired explicitly
       ;; below.  The mask keeps only the second hex digit, so it must
       ;; be compared with #x70 (comparing with 7 never matches).
       (/= (logand c #x00f0) #x70)
       (set-case-syntax-pair (+ c 8) c tbl))
  (setq c (1+ c)))
631 (set-case-syntax-pair ?Ᾰ ?ᾰ tbl)
632 (set-case-syntax-pair ?Ᾱ ?ᾱ tbl)
633 (set-case-syntax-pair ?Ὰ ?ὰ tbl)
634 (set-case-syntax-pair ?Ά ?ά tbl)
635 (set-case-syntax-pair ?ᾼ ?ᾳ tbl)
636 (set-case-syntax-pair ?Ὲ ?ὲ tbl)
637 (set-case-syntax-pair ?Έ ?έ tbl)
638 (set-case-syntax-pair ?Ὴ ?ὴ tbl)
639 (set-case-syntax-pair ?Ή ?ή tbl)
640 (set-case-syntax-pair ?ῌ ?ῃ tbl)
641 (set-case-syntax-pair ?Ῐ ?ῐ tbl)
642 (set-case-syntax-pair ?Ῑ ?ῑ tbl)
643 (set-case-syntax-pair ?Ὶ ?ὶ tbl)
644 (set-case-syntax-pair ?Ί ?ί tbl)
645 (set-case-syntax-pair ?Ῠ ?ῠ tbl)
646 (set-case-syntax-pair ?Ῡ ?ῡ tbl)
647 (set-case-syntax-pair ?Ὺ ?ὺ tbl)
648 (set-case-syntax-pair ?Ύ ?ύ tbl)
649 (set-case-syntax-pair ?Ῥ ?ῥ tbl)
650 (set-case-syntax-pair ?Ὸ ?ὸ tbl)
651 (set-case-syntax-pair ?Ό ?ό tbl)
652 (set-case-syntax-pair ?Ὼ ?ὼ tbl)
653 (set-case-syntax-pair ?Ώ ?ώ tbl)
654 (set-case-syntax-pair ?ῼ ?ῳ tbl)
655
656 ;; cyrillic
657 (modify-category-entry '(#x0400 . #x04FF) ?y)
658 (setq c #x0400)
659 (while (<= c #x04ff)
660 (and (>= c #x0400)
661 (<= c #x040f)
662 (set-case-syntax-pair c (+ c 80) tbl))
663 (and (>= c #x0410)
664 (<= c #x042f)
665 (set-case-syntax-pair c (+ c 32) tbl))
666 (and (zerop (% c 2))
667 (or (and (>= c #x0460) (<= c #x0480))
668 (and (>= c #x048c) (<= c #x04be))
669 (and (>= c #x04d0) (<= c #x04f4)))
670 (set-case-syntax-pair c (1+ c) tbl))
671 (setq c (1+ c)))
672 (set-case-syntax-pair ?Ӂ ?ӂ tbl)
673 (set-case-syntax-pair ?Ӄ ?ӄ tbl)
674 (set-case-syntax-pair ?Ӈ ?ӈ tbl)
675 (set-case-syntax-pair ?Ӌ ?ӌ tbl)
676 (set-case-syntax-pair ?Ӹ ?ӹ tbl)
677
678 ;; general punctuation
679 (setq c #x2000)
680 (while (<= c #x200b)
681 (set-case-syntax c " " tbl)
682 (setq c (1+ c)))
683 (while (<= c #x200F)
684 (set-case-syntax c "." tbl)
685 (setq c (1+ c)))
686 ;; Fixme: These aren't all right:
687 (setq c #x2010)
688 (while (<= c #x2016)
689 (set-case-syntax c "_" tbl)
690 (setq c (1+ c)))
691 ;; Punctuation syntax for quotation marks (like `)
692 (while (<= c #x201f)
693 (set-case-syntax c "." tbl)
694 (setq c (1+ c)))
695 ;; Fixme: These aren't all right:
696 (while (<= c #x2027)
697 (set-case-syntax c "_" tbl)
698 (setq c (1+ c)))
699 (while (<= c #x206F)
700 (set-case-syntax c "." tbl)
701 (setq c (1+ c)))
702
703 ;; Roman numerals
704 (setq c #x2160)
705 (while (<= c #x216f)
706 (set-case-syntax-pair c (+ c #x10) tbl)
707 (setq c (1+ c)))
708
709 ;; Fixme: The following blocks might be better as symbol rather than
710 ;; punctuation.
711 ;; Arrows
712 (setq c #x2190)
713 (while (<= c #x21FF)
714 (set-case-syntax c "." tbl)
715 (setq c (1+ c)))
716 ;; Mathematical Operators
717 (while (<= c #x22FF)
718 (set-case-syntax c "." tbl)
719 (setq c (1+ c)))
720 ;; Miscellaneous Technical
721 (while (<= c #x23FF)
722 (set-case-syntax c "." tbl)
723 (setq c (1+ c)))
724 ;; Control Pictures
725 (while (<= c #x243F)
726 (set-case-syntax c "_" tbl)
727 (setq c (1+ c)))
728
729 ;; Circled Latin
730 (setq c #x24b6)
731 (while (<= c #x24cf)
732 (set-case-syntax-pair c (+ c 26) tbl)
733 (modify-category-entry c ?l)
734 (modify-category-entry (+ c 26) ?l)
735 (setq c (1+ c)))
736
737 ;; Fullwidth Latin
738 (setq c #xff21)
739 (while (<= c #xff3a)
740 (set-case-syntax-pair c (+ c #x20) tbl)
741 (modify-category-entry c ?l)
742 (modify-category-entry (+ c #x20) ?l)
743 (setq c (1+ c)))
744
745 ;; Combining diacritics
746 (modify-category-entry '(#x300 . #x362) ?^)
747 ;; Combining marks
748 (modify-category-entry '(#x20d0 . #x20e3) ?^)
749
750 ;; Fixme: syntax for symbols &c
751 )
752
;; Matched bracket pairs as (OPEN . CLOSE) code points.  OPEN gets
;; open-paren syntax matching CLOSE, and CLOSE gets close-paren syntax
;; matching OPEN.  Code points are used instead of literal strings so
;; the table is unaffected by character normalization, which can
;; silently turn e.g. U+FF08 into an ASCII parenthesis or U+2329 into
;; U+3008.
(let ((pairs
       '((#x2045 . #x2046)
         (#x207D . #x207E)
         (#x208D . #x208E)
         (#x2329 . #x232A)
         (#x23B4 . #x23B5)
         (#x2768 . #x2769)
         (#x276A . #x276B)
         (#x276C . #x276D)
         (#x2770 . #x2771)
         (#x2772 . #x2773)
         (#x2774 . #x2775)
         (#x27E6 . #x27E7)
         (#x27E8 . #x27E9)
         (#x27EA . #x27EB)
         (#x2983 . #x2984)
         (#x2985 . #x2986)
         (#x2987 . #x2988)
         (#x2989 . #x298A)
         (#x298B . #x298C)
         (#x298D . #x298E)
         (#x298F . #x2990)
         (#x2991 . #x2992)
         (#x2993 . #x2994)
         (#x2995 . #x2996)
         (#x2997 . #x2998)
         (#x29FC . #x29FD)
         (#x3008 . #x3009)
         (#x300A . #x300B)
         (#x300C . #x300D)
         (#x300E . #x300F)
         (#x3010 . #x3011)
         (#x3014 . #x3015)
         (#x3016 . #x3017)
         (#x3018 . #x3019)
         (#x301A . #x301B)
         (#xFD3E . #xFD3F)
         (#xFE35 . #xFE36)
         (#xFE37 . #xFE38)
         (#xFE39 . #xFE3A)
         (#xFE3B . #xFE3C)
         (#xFE3D . #xFE3E)
         (#xFE3F . #xFE40)
         (#xFE41 . #xFE42)
         (#xFE43 . #xFE44)
         (#xFE59 . #xFE5A)
         (#xFE5B . #xFE5C)
         (#xFE5D . #xFE5E)
         (#xFF08 . #xFF09)
         (#xFF3B . #xFF3D)
         (#xFF5B . #xFF5D)
         (#xFF5F . #xFF60)
         (#xFF62 . #xFF63))))
  (dolist (elt pairs)
    (modify-syntax-entry (car elt) (string ?\( (cdr elt)))
    (modify-syntax-entry (cdr elt) (string ?\) (car elt)))))
810
811 \f
812 ;; For each character set, put the information of the most proper
813 ;; coding system to encode it by `preferred-coding-system' property.
814
815 ;; Fixme: should this be junked?
;; Each entry is (CHARSET . CODING-SYSTEM); the coding system is stored
;; as the charset's `preferred-coding-system' property.
(dolist (entry '((latin-iso8859-1 . iso-latin-1)
                 (latin-iso8859-2 . iso-latin-2)
                 (latin-iso8859-3 . iso-latin-3)
                 (latin-iso8859-4 . iso-latin-4)
                 (thai-tis620 . thai-tis620)
                 (greek-iso8859-7 . greek-iso-8bit)
                 (arabic-iso8859-6 . iso-2022-7bit)
                 (hebrew-iso8859-8 . hebrew-iso-8bit)
                 (katakana-jisx0201 . japanese-shift-jis)
                 (latin-jisx0201 . japanese-shift-jis)
                 (cyrillic-iso8859-5 . cyrillic-iso-8bit)
                 (latin-iso8859-9 . iso-latin-5)
                 (japanese-jisx0208-1978 . iso-2022-jp)
                 (chinese-gb2312 . cn-gb-2312)
                 (japanese-jisx0208 . iso-2022-jp)
                 (korean-ksc5601 . iso-2022-kr)
                 (japanese-jisx0212 . iso-2022-jp)
                 (chinese-cns11643-1 . iso-2022-cn)
                 (chinese-cns11643-2 . iso-2022-cn)
                 (chinese-big5-1 . chinese-big5)
                 (chinese-big5-2 . chinese-big5)
                 (chinese-sisheng . iso-2022-7bit)
                 (ipa . iso-2022-7bit)
                 (vietnamese-viscii-lower . vietnamese-viscii)
                 (vietnamese-viscii-upper . vietnamese-viscii)
                 (arabic-digit . iso-2022-7bit)
                 (arabic-1-column . iso-2022-7bit)
                 (lao . lao)
                 (arabic-2-column . iso-2022-7bit)
                 (indian-is13194 . devanagari)
                 (indian-glyph . devanagari)
                 (tibetan-1-column . tibetan)
                 (ethiopic . iso-2022-7bit)
                 (chinese-cns11643-3 . iso-2022-cn)
                 (chinese-cns11643-4 . iso-2022-cn)
                 (chinese-cns11643-5 . iso-2022-cn)
                 (chinese-cns11643-6 . iso-2022-cn)
                 (chinese-cns11643-7 . iso-2022-cn)
                 (indian-2-column . devanagari)
                 (tibetan . tibetan)
                 (latin-iso8859-14 . iso-latin-8)
                 (latin-iso8859-15 . iso-latin-9)))
  (put-charset-property (car entry) 'preferred-coding-system (cdr entry)))
861
862 \f
863 ;; Setup auto-fill-chars for charsets that should invoke auto-filling.
864 ;; SPACE and NEWLINE are already set. Also put `nospace-between-words'
865 ;; property on the charsets.
;; Mark each of these charsets with the `nospace-between-words' property.
(dolist (charset '(katakana-jisx0201
                   japanese-jisx0208 japanese-jisx0212
                   chinese-gb2312 chinese-big5-1 chinese-big5-2))
  ;;(aset auto-fill-chars (make-char charset) t)
  (put-charset-property charset 'nospace-between-words t))
873
874 \f
875 ;; CJK double width characters.
;; Every character in these (FROM . TO) ranges has display width 2.
(dolist (range '((#x1100 . #x11FF)
                 (#x2E80 . #x9FAF)
                 (#xAC00 . #xD7AF)
                 (#xF900 . #xFAFF)
                 (#xFE30 . #xFE4F)
                 (#xFF00 . #xFF5F)
                 (#xFFE0 . #xFFEF)
                 (#x20000 . #x2AFFF)
                 (#x2F800 . #x2FFFF)))
  (let ((from (car range))
        (to (cdr range)))
    (set-char-table-range char-width-table (cons from to) 2)))
889 ;; Fixme: Doing this affects non-CJK characters through unification,
890 ;; but presumably CJK users expect those characters to be
891 ;; double-width when using these charsets.
892 ;; (map-charset-chars
893 ;; #'(lambda (range ignore) (set-char-table-range char-width-table range 2))
894 ;; 'japanese-jisx0208)
895 ;; (map-charset-chars
896 ;; #'(lambda (range ignore) (set-char-table-range char-width-table range 2))
897 ;; 'japanese-jisx0212)
898 ;; (map-charset-chars
899 ;; #'(lambda (range ignore) (set-char-table-range char-width-table range 2))
900 ;; 'japanese-jisx0213-1)
901 ;; (map-charset-chars
902 ;; #'(lambda (range ignore) (set-char-table-range char-width-table range 2))
903 ;; 'japanese-jisx0213-2)
904 ;; (map-charset-chars
905 ;; (lambda (range ignore) (set-char-table-range char-width-table range 2))
906 ;; 'korean-ksc5601)
907
908 ;; Other double width
;; Every character of these charsets has display width 2.
(dolist (charset '(ethiopic tibetan indian-2-column arabic-2-column))
  (map-charset-chars
   (lambda (range ignore) (set-char-table-range char-width-table range 2))
   charset))

;; Compact the tables that were filled in above.
(dolist (table (list (standard-case-table)
                     char-width-table
                     (standard-category-table)
                     (standard-syntax-table)))
  (optimize-char-table table))
926
;; Assign a script symbol to ranges of code points in
;; `char-script-table'.  Each entry below is (FROM TO SCRIPT).
;; The Unicode blocks actually extend past some of these ranges with
;; undefined codepoints.
(let (scripts)
  (dolist (entry
           '((#x0000 #x007F latin)
             (#x00A0 #x036F latin)
             (#x0370 #x03E1 greek)
             (#x03E2 #x03EF coptic)
             (#x03F0 #x03F3 greek)
             (#x0400 #x04FF cyrillic)
             (#x0530 #x058F armenian)
             (#x0590 #x05FF hebrew)
             (#x0600 #x06FF arabic)
             (#x0700 #x074F syriac)
             (#x0780 #x07BF thaana)
             (#x0900 #x097F devanagari)
             (#x0980 #x09FF bengali)
             (#x0A00 #x0A7F gurmukhi)
             (#x0A80 #x0AFF gujarati)
             (#x0B00 #x0B7F oriya)
             (#x0B80 #x0BFF tamil)
             (#x0C00 #x0C7F telugu)
             (#x0C80 #x0CFF kannada)
             (#x0D00 #x0D7F malayalam)
             (#x0D80 #x0DFF sinhala)
             (#x0E00 #x0E5F thai)
             (#x0E80 #x0EDF lao)
             (#x0F00 #x0FFF tibetan)
             (#x1000 #x105F myanmar)
             (#x10A0 #x10FF georgian)
             (#x1100 #x11FF hangul)
             (#x1200 #x137F ethiopic)
             (#x13A0 #x13FF cherokee)
             (#x1400 #x167F canadian-aboriginal)
             (#x1680 #x169F ogham)
             (#x16A0 #x16FF runic)
             (#x1780 #x17FF khmer)
             (#x1800 #x18AF mongolian)
             (#x1E00 #x1EFF latin)
             (#x1F00 #x1FFF greek)
             (#x2000 #x27FF symbol)
             (#x2800 #x28FF braille)
             (#x2E80 #x2FDF han)
             (#x2FF0 #x2FFF ideographic-description)
             (#x3000 #x303F cjk-misc)
             (#x3040 #x30FF kana)
             (#x3100 #x312F bopomofo)
             (#x3130 #x318F hangul)
             (#x3190 #x319F kanbun)
             (#x31A0 #x31BF bopomofo)
             (#x3400 #x9FAF han)
             (#xA000 #xA4CF yi)
             (#xAC00 #xD7AF hangul)
             (#xF900 #xFAFF han)
             (#xFB1D #xFB4F hebrew)
             (#xFB50 #xFDFF arabic)
             (#xFE70 #xFEFC arabic)
             (#xFF00 #xFF5F cjk-misc)
             (#xFF61 #xFF9F kana)
             (#xFFE0 #xFFE6 cjk-misc)
             (#x20000 #x2AFFF han)
             (#x2F800 #x2FFFF han)))
    (let ((from (car entry))
          (to (nth 1 entry))
          (script (nth 2 entry)))
      (set-char-table-range char-script-table (cons from to) script)
      ;; Collect each script symbol once, newest first; the list is
      ;; reversed below so it ends up in order of first appearance.
      (unless (memq script scripts)
        (push script scripts))))
  ;; Extra slot 0 of `char-script-table' receives the list of scripts.
  (set-char-table-extra-slot char-script-table 0 (nreverse scripts)))
995
;; Record the script `tibetan' in `char-script-table' for every
;; character of the `tibetan' charset.
(map-charset-chars
 (lambda (range unused)
   (set-char-table-range char-script-table range 'tibetan))
 'tibetan)
1000
1001 \f
1002 ;;; Setting word boundary.
1003
(defun next-word-boundary-han (pos limit)
  "Return a word boundary position found by scanning from POS.
If POS is not greater than LIMIT, scan forward: starting at POS, skip
a run of characters of category `C' (if any), then a run of
characters of category `H' (if any), and return the position reached.
Otherwise scan backward: while POS is greater than LIMIT and the
character before POS has script `han' in `char-script-table',
decrement POS; return the resulting POS.
Point is preserved; the match data may be modified."
  (if (<= pos limit)
      (save-excursion
        (goto-char pos)
        ;; Move only when the match succeeded.  A failed `looking-at'
        ;; leaves the match data of some earlier search in place, so
        ;; using `match-end' unconditionally could jump anywhere.
        (if (looking-at "\\cC+")
            (goto-char (match-end 0)))
        (if (looking-at "\\cH+")
            (goto-char (match-end 0)))
        (point))
    (while (and (> pos limit)
                (eq (aref char-script-table (char-after (1- pos))) 'han))
      (setq pos (1- pos)))
    pos))
1017
(defun next-word-boundary-kana (pos limit)
  "Return a word boundary position found by scanning from POS.
If POS is not greater than LIMIT, scan forward: starting at POS, skip
in turn a run of category `K' characters, a run of category `H'
characters and a run of category `k' characters (each run may be
empty), and return the position reached.

Otherwise scan backward.  If the character at POS has category `K' or
`k', decrement POS while it is greater than LIMIT and the preceding
character has category `K' or `k'.  If not, decrement POS while it is
greater than LIMIT and the preceding character has category `H';
then, if the character that stopped that loop has category `C', `K'
or `A' and POS is still greater than LIMIT, step over it and over the
preceding characters of that same category.  The value returned is
never smaller than LIMIT in the backward case.
Point is preserved; the match data may be modified."
  (if (<= pos limit)
      (save-excursion
        (goto-char pos)
        ;; Each run is optional; move only past a successful match.
        (dolist (run '("\\cK+" "\\cH+" "\\ck+"))
          (if (looking-at run)
              (goto-char (match-end 0))))
        (point))
    (let ((category-set (char-category-set (char-after pos)))
          category)
      (if (or (aref category-set ?K) (aref category-set ?k))
          (while (and (> pos limit)
                      (setq category-set
                            (char-category-set (char-after (1- pos))))
                      (or (aref category-set ?K) (aref category-set ?k)))
            (setq pos (1- pos)))
        (while (and (> pos limit)
                    (aref (setq category-set
                                (char-category-set (char-after (1- pos))))
                          ?H))
          (setq pos (1- pos)))
        ;; CATEGORY-SET now describes the last character inspected by
        ;; the loop above.
        (setq category (cond ((aref category-set ?C) ?C)
                             ((aref category-set ?K) ?K)
                             ((aref category-set ?A) ?A)))
        ;; If the loop stopped because POS reached LIMIT, the last
        ;; character inspected is the one already at POS.  Stepping
        ;; back again would return a position below LIMIT, so require
        ;; that POS is still above LIMIT.
        (when (and category (> pos limit))
          (setq pos (1- pos))
          (while (and (> pos limit)
                      (aref (char-category-set (char-after (1- pos)))
                            category))
            (setq pos (1- pos)))))
      pos)))
1051
;; For every entry of `char-script-table' whose script is `han' or
;; `kana', store the matching boundary function under the same key in
;; `find-word-boundary-function-table'.  Other scripts are skipped.
(map-char-table
 (lambda (char script)
   (let ((finder (cdr (assq script
                            '((han . next-word-boundary-han)
                              (kana . next-word-boundary-kana))))))
     (if finder
         (set-char-table-range find-word-boundary-function-table
                               char finder))))
 char-script-table)
1061
;; Category pairs used for word boundaries: the combining list holds
;; the single pair (?l . ?l); the separating list holds the pairs
;; among the 2-byte character set categories A, H, K and C.
(setq word-combining-categories
      (list (cons ?l ?l))

      word-separating-categories        ; (2-byte character sets)
      (list (cons ?A ?K)                ; Alpha numeric - Katakana
            (cons ?A ?C)                ; Alpha numeric - Chinese
            (cons ?H ?A)                ; Hiragana - Alpha numeric
            (cons ?H ?K)                ; Hiragana - Katakana
            (cons ?H ?C)                ; Hiragana - Chinese
            (cons ?K ?A)                ; Katakana - Alpha numeric
            (cons ?K ?C)                ; Katakana - Chinese
            (cons ?C ?A)                ; Chinese - Alpha numeric
            (cons ?C ?K)                ; Chinese - Katakana
            ))
1076
1077 ;;; Local Variables:
1078 ;;; coding: utf-8-emacs
1079 ;;; End:
1080
1081 ;;; arch-tag: 85889c35-9f4d-4912-9bf5-82de31b0d42d
1082 ;;; characters.el ends here