]> code.delx.au - gnu-emacs/blob - lisp/language/chinese.el
Update copyright years.
[gnu-emacs] / lisp / language / chinese.el
1 ;;; chinese.el --- support for Chinese -*- coding: iso-2022-7bit; -*-
2
3 ;; Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006
4 ;; Free Software Foundation, Inc.
5 ;; Copyright (C) 1995, 1997, 1998
6 ;; National Institute of Advanced Industrial Science and Technology (AIST)
7 ;; Registration Number H14PRO021
8
9 ;; Keywords: multilingual, Chinese
10
11 ;; This file is part of GNU Emacs.
12
13 ;; GNU Emacs is free software; you can redistribute it and/or modify
14 ;; it under the terms of the GNU General Public License as published by
15 ;; the Free Software Foundation; either version 2, or (at your option)
16 ;; any later version.
17
18 ;; GNU Emacs is distributed in the hope that it will be useful,
19 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
20 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 ;; GNU General Public License for more details.
22
23 ;; You should have received a copy of the GNU General Public License
24 ;; along with GNU Emacs; see the file COPYING. If not, write to the
25 ;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
26 ;; Boston, MA 02110-1301, USA.
27
28 ;;; Commentary:
29
30 ;; For Chinese, three character sets GB2312, BIG5, and CNS11643 are
31 ;; supported.
32
33 ;;; Code:
34
35 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
36 ;;; Chinese (general)
37 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
38
;; Define `iso-2022-cn', the 7-bit ISO 2022 coding system for GB2312 plus
;; CNS 11643 planes 1 and 2.  The first quoted list is positional: four
;; charset designation slots followed by option flags (see the docstring
;; of `make-coding-system' for the exact slot order).
39 (make-coding-system
40 'iso-2022-cn 2 ?C
41 "ISO 2022 based 7bit encoding for Chinese GB and CNS (MIME:ISO-2022-CN)."
42 '(ascii ; first designation slot: ASCII
43 (nil chinese-gb2312 chinese-cns11643-1) ; second slot: GB2312 or CNS plane 1
44 (nil chinese-cns11643-2) ; third slot: CNS plane 2
45 nil ; fourth slot: unused here (iso-2022-cn-ext fills it)
46 nil ascii-eol ascii-cntl seven locking-shift single-shift nil nil nil ; 7-bit, with locking and single shifts
47 init-bol)
48 '((safe-charsets ascii chinese-gb2312 chinese-cns11643-1 chinese-cns11643-2) ; same charsets as the slots above
49 (mime-charset . iso-2022-cn)))
50
51 (define-coding-system-alias 'chinese-iso-7bit 'iso-2022-cn) ; alias used by the "Chinese-BIG5" environment below
52
;; Define `iso-2022-cn-ext': same layout as `iso-2022-cn', but the fourth
;; designation slot additionally carries CNS 11643 planes 3 to 7.
53 (make-coding-system
54 'iso-2022-cn-ext 2 ?C
55 "ISO 2022 based 7bit encoding for Chinese GB and CNS (MIME:ISO-2022-CN-EXT)."
56 '(ascii ; first designation slot: ASCII
57 (nil chinese-gb2312 chinese-cns11643-1) ; second slot: GB2312 or CNS plane 1
58 (nil chinese-cns11643-2) ; third slot: CNS plane 2
59 (nil chinese-cns11643-3 chinese-cns11643-4 chinese-cns11643-5 ; fourth slot: CNS planes 3-7
60 chinese-cns11643-6 chinese-cns11643-7)
61 nil ascii-eol ascii-cntl seven locking-shift single-shift nil nil nil ; same option flags as iso-2022-cn
62 init-bol)
63 '((safe-charsets ascii chinese-gb2312 chinese-cns11643-1 chinese-cns11643-2 ; every charset named in the slots
64 chinese-cns11643-3 chinese-cns11643-4 chinese-cns11643-5
65 chinese-cns11643-6 chinese-cns11643-7)
66 (mime-charset . iso-2022-cn-ext)))
67
68 \f
69 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
70 ;;; Chinese GB2312 (simplified)
71 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
72
;; Define `chinese-iso-8bit', the EUC-style 8-bit coding system for GB2312.
;; Only two designation slots are populated (ASCII and GB2312), and none of
;; the seven-bit or shift flags used by the iso-2022-cn definitions are set.
73 (make-coding-system
74 'chinese-iso-8bit 2 ?c
75 "ISO 2022 based EUC encoding for Chinese GB2312 (MIME:GB2312)."
76 '(ascii chinese-gb2312 nil nil ; slots: ASCII, GB2312, two unused
77 nil ascii-eol ascii-cntl nil nil nil nil) ; no `seven', no shift functions
78 '((safe-charsets ascii chinese-gb2312)
79 (mime-charset . gb2312)))
80
;; Alternative names under which `chinese-iso-8bit' can be requested.
;; The aliases are registered in the order listed.
(dolist (alias '(cn-gb-2312 euc-china euc-cn cn-gb gb2312 cp936))
  (define-coding-system-alias alias 'chinese-iso-8bit))
87
;; Define `chinese-hz'.  No flags are passed to the coding system itself;
;; the Hz/ZW handling is delegated to the two conversion functions named in
;; the property list, which are defined further down in this file.
88 (make-coding-system
89 'chinese-hz 0 ?z
90 "Hz/ZW 7-bit encoding for Chinese GB2312 (MIME:HZ-GB-2312)."
91 nil
92 '((safe-charsets ascii chinese-gb2312)
93 (mime-charset . hz-gb-2312)
94 (post-read-conversion . post-read-decode-hz) ; run after reading: calls `decode-hz-region'
95 (pre-write-conversion . pre-write-encode-hz))) ; run before writing: calls `encode-hz-region'
96
;; Alternative names for `chinese-hz', registered in the order listed.
(dolist (alias '(hz-gb-2312 hz))
  (define-coding-system-alias alias 'chinese-hz))
99
(defun post-read-decode-hz (len)
  "Decode the LEN characters following point with `decode-hz-region'.
This is the `post-read-conversion' function of the `chinese-hz' coding
system.  The buffer's modified flag is restored afterwards, and
`last-coding-system-used' is bound locally so the decoding step cannot
change its outer value.  Returns whatever `decode-hz-region' returns."
  (let* ((start (point))
         (was-modified (buffer-modified-p))
         (last-coding-system-used nil)
         (result (decode-hz-region start (+ start len))))
    ;; Decoding edits the buffer; put the modified flag back as it was.
    (set-buffer-modified-p was-modified)
    result))
107
(defun pre-write-encode-hz (from to)
  "Encode the text to be written with `encode-hz-region' in a scratch buffer.
This is the `pre-write-conversion' function of the `chinese-hz' coding
system.  FROM is either a string, which is used as the text, or a buffer
position, in which case the text is the region FROM..TO of the current
buffer.  The text is copied into a fresh \" *temp*\" buffer, which is
left current on return, and encoded there.  Always returns nil."
  (let ((source (current-buffer))
        (work (generate-new-buffer " *temp*")))
    (set-buffer work)
    (cond ((stringp from)
           (insert from))
          (t
           (insert-buffer-substring source from to)))
    ;; Bind `last-coding-system-used' so encoding cannot change the outer value.
    (let ((last-coding-system-used nil))
      (encode-hz-region 1 (point-max)))
    nil))
117
;; Register the "Chinese-GB" language environment.  The final argument
;; '("Chinese") is shared by all language environments defined in this file.
;; `coding-priority' lists chinese-iso-8bit first, ahead of chinese-big5
;; and iso-2022-cn.
118 (set-language-info-alist
119 "Chinese-GB" '((charset chinese-gb2312 chinese-sisheng)
120 (coding-system chinese-iso-8bit iso-2022-cn chinese-hz)
121 (coding-priority chinese-iso-8bit chinese-big5 iso-2022-cn)
122 (input-method . "chinese-py-punct")
123 (features china-util)
124 (sample-text . "Chinese (\e$AVPND\e(B,\e$AFUM(;0\e(B,\e$A::So\e(B) \e$ADc:C\e(B")
125 (documentation . "Support for Chinese GB2312 character set.")
126 (tutorial . "TUTORIAL.cn"))
127 '("Chinese"))
128
129 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
130 ;; Chinese BIG5 (traditional)
131 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
132
;; Define `chinese-big5'.  No flags list is passed; the property list names
;; the two Emacs charsets that together hold Big5 (chinese-big5-1 and
;; chinese-big5-2) and maps both back to the external name "BIG5" through
;; `encode-big5-char'.
133 (make-coding-system
134 'chinese-big5 3 ?B
135 "BIG5 8-bit encoding for Chinese (MIME:Big5)."
136 nil
137 '((safe-charsets ascii chinese-big5-1 chinese-big5-2)
138 (mime-charset . big5)
139 (charset-origin-alist (chinese-big5-1 "BIG5" encode-big5-char)
140 (chinese-big5-2 "BIG5" encode-big5-char))))
141
;; Alternative names for `chinese-big5', registered in the order listed.
(dolist (alias '(big5 cn-big5 cp950))
  (define-coding-system-alias alias 'chinese-big5))
145
146 ;; Big5 font requires special encoding.
;; The program turns the (charset, position code 1, position code 2) triple
;; into a linear index and then splits that index into two font bytes,
;; 157 cells per row.
147 (define-ccl-program ccl-encode-big5-font
148 `(0
149 ;; In: R0:chinese-big5-1 or chinese-big5-2
150 ;; R1:position code 1
151 ;; R2:position code 2
152 ;; Out: R1:font code point 1
153 ;; R2:font code point 2
154 ((r2 = ((((r1 - ?\x21) * 94) + r2) - ?\x21)) ; linear index = (r1 - #x21) * 94 + (r2 - #x21)
155 (if (r0 == ,(charset-id 'chinese-big5-2)) (r2 += 6280)) ; chinese-big5-2 indices are shifted up by 6280
156 (r1 = ((r2 / 157) + ?\xA1)) ; first font byte: row number, based at #xA1
157 (r2 %= 157) ; cell within the row, 0..156
158 (if (r2 < ?\x3F) (r2 += ?\x40) (r2 += ?\x62)))) ; cells 0-62 -> #x40-#x7E, cells 63-156 -> #xA1-#xFE
159 "CCL program to encode a Big5 code to code point of Big5 font.")
160
;; Put the ("big5" . ccl-encode-big5-font) pair at the head of
;; `font-ccl-encoder-alist' so the CCL program above is associated with
;; the "big5" pattern.
(push (cons "big5" ccl-encode-big5-font) font-ccl-encoder-alist)
163
;; Register the "Chinese-BIG5" language environment.  Note that the
;; `coding-system' entry uses the alias `chinese-iso-7bit' while
;; `coding-priority' names the same coding system as `iso-2022-cn'.
164 (set-language-info-alist
165 "Chinese-BIG5" '((charset chinese-big5-1 chinese-big5-2)
166 (coding-system chinese-big5 chinese-iso-7bit)
167 (coding-priority chinese-big5 iso-2022-cn chinese-iso-8bit)
168 (input-method . "chinese-py-punct-b5")
169 (features china-util)
170 (sample-text . "Cantonese (\e$(0GnM$\e(B,\e$(0N]0*Hd\e(B) \e$(0*/=(\e(B, \e$(0+$)p\e(B")
171 (documentation . "Support for Chinese Big5 character set.")
172 (tutorial . "TUTORIAL.zh"))
173 '("Chinese"))
174
175 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
176 ;; Chinese CNS11643 (traditional)
177 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
178
(defvar big5-to-cns (make-translation-table)
  "Translation table for encoding to `euc-tw'.")
;; If china-util was loaded earlier it may already have registered the
;; table under this symbol; only define it when the symbol has no
;; `translation-table' property yet.
(or (get 'big5-to-cns 'translation-table)
    (define-translation-table 'big5-to-cns big5-to-cns))
184
;; Decoder for EUC-TW.  Three input shapes are recognised: a byte below
;; #x80 (ASCII), a four-byte sequence introduced by #x8E (plane selector
;; plus two code bytes), and a two-byte sequence for CNS plane 1.  On any
;; out-of-range byte, the bytes read so far are written out unchanged.
185 (define-ccl-program ccl-decode-euc-tw
186 ;; CNS plane 1 needs either two or four bytes in EUC-TW encoding;
187 ;; CNS planes 2 to 7 always need four bytes. In internal encoding of
188 ;; Emacs, CNS planes 1 and 2 need three bytes, and planes 3 to 7 need
189 ;; four bytes. Thus a buffer magnification value of 2 (for both
190 ;; encoding and decoding) is sufficient.
191 `(2
192 ;; we don't have enough registers to hold all charset-ids
193 ((r4 = ,(charset-id 'chinese-cns11643-1)) ; r4-r6 cache the ids of planes 1-3
194 (r5 = ,(charset-id 'chinese-cns11643-2))
195 (r6 = ,(charset-id 'chinese-cns11643-3))
196 (loop
197 (read-if (r0 < #x80) ; r0 = lead byte
198 ;; ASCII
199 (write-repeat r0)
200 ;; not ASCII
201 (if (r0 == #x8E)
202 ;; single shift
203 (read-if (r1 < #xA1) ; r1 = plane selector, valid range #xA1..#xA7
204 ;; invalid byte
205 ((write r0)
206 (write-repeat r1))
207 (if (r1 > #xA7)
208 ;; invalid plane
209 ((write r0)
210 (write-repeat r1))
211 ;; OK, we have a plane
212 (read-if (r2 < #xA1) ; r2 = first code byte
213 ;; invalid first byte
214 ((write r0 r1)
215 (write-repeat r2))
216 (read-if (r3 < #xA1) ; r3 = second code byte
217 ;; invalid second byte
218 ((write r0 r1 r2)
219 (write-repeat r3))
220 ;; CNS 1-7, finally
221 ((branch (r1 - #xA1) ; index 0..6 picks the charset id of plane 1..7
222 (r1 = r4)
223 (r1 = r5)
224 (r1 = r6)
225 (r1 = ,(charset-id 'chinese-cns11643-4))
226 (r1 = ,(charset-id 'chinese-cns11643-5))
227 (r1 = ,(charset-id 'chinese-cns11643-6))
228 (r1 = ,(charset-id 'chinese-cns11643-7)))
229 (r2 = ((((r2 - #x80) << 7) + r3) - #x80)) ; code = (r2 - #x80) * 128 + (r3 - #x80)
230 (write-multibyte-character r1 r2)
231 (repeat))))))
232 ;; standard EUC
233 (if (r0 < #xA1)
234 ;; invalid first byte
235 (write-repeat r0)
236 (read-if (r1 < #xA1) ; r1 = second byte of a two-byte plane-1 character
237 ;; invalid second byte
238 ((write r0)
239 (write-repeat r1))
240 ;; CNS 1, finally
241 ((r1 = ((((r0 - #x80) << 7) + r1) - #x80)) ; code = (r0 - #x80) * 128 + (r1 - #x80)
242 (write-multibyte-character r4 r1) ; r4 holds the plane-1 charset id
243 (repeat)))))))))
244 "CCL program to decode EUC-TW encoding."
245 )
246
;; Encoder for EUC-TW.  ASCII is written through unchanged.  Big5
;; characters are first mapped through the `big5-to-cns' translation table.
;; CNS plane 1 is written as two bytes; planes 2-7 are written as #x8E,
;; a plane byte (#xA2..#xA7) and two code bytes.  Any other charset
;; produces the single byte #xFF.
247 (define-ccl-program ccl-encode-euc-tw
248 `(2
249 ;; we don't have enough registers to hold all charset-ids
250 ((r2 = ,(charset-id 'ascii)) ; r2-r6 cache the charset ids compared most often
251 (r3 = ,(charset-id 'chinese-big5-1))
252 (r4 = ,(charset-id 'chinese-big5-2))
253 (r5 = ,(charset-id 'chinese-cns11643-1))
254 (r6 = ,(charset-id 'chinese-cns11643-2))
255 (loop
256 (read-multibyte-character r0 r1) ; r0 = charset id, r1 = code within the charset
257 (if (r0 == r2)
258 (write-repeat r1)
259 (;; Big 5 encoded characters are first translated to CNS
260 (if (r0 == r3)
261 (translate-character big5-to-cns r0 r1)
262 (if (r0 == r4)
263 (translate-character big5-to-cns r0 r1)))
264 (if (r0 == r5) ; from here on r0 is replaced by the EUC-TW plane byte
265 (r0 = #xA1)
266 (if (r0 == r6)
267 (r0 = #xA2)
268 (if (r0 == ,(charset-id 'chinese-cns11643-3))
269 (r0 = #xA3)
270 (if (r0 == ,(charset-id 'chinese-cns11643-4))
271 (r0 = #xA4)
272 (if (r0 == ,(charset-id 'chinese-cns11643-5))
273 (r0 = #xA5)
274 (if (r0 == ,(charset-id 'chinese-cns11643-6))
275 (r0 = #xA6)
276 (if (r0 == ,(charset-id 'chinese-cns11643-7))
277 (r0 = #xA7)
278 ;; not CNS. We use a dummy character which
279 ;; can't occur in EUC-TW encoding to indicate
280 ;; this.
281 (write-repeat #xFF))))))))))
282 (if (r0 != #xA1) ; plane 1 uses the short two-byte form, so no prefix
283 ;; single shift and CNS plane
284 ((write #x8E)
285 (write r0)))
286 (write ((r1 >> 7) + #x80)) ; first code byte: bits above the low 7 of r1, plus #x80
287 (write ((r1 % #x80) + #x80)) ; second code byte: low 7 bits of r1, plus #x80
288 (repeat))))
289 "CCL program to encode EUC-TW encoding."
290 )
291
(defun euc-tw-pre-write-conversion (beg end)
  "One-shot pre-write function for `euc-tw' that loads `china-util'.
BEG and END are accepted but not used.  The first call requires
`china-util', so that the translation table is loaded, and then clears
the `pre-write-conversion' property of `euc-tw' so the function is not
run again.  Always returns nil."
  (ignore
   ;; Ensure translation table is loaded.
   (require 'china-util)
   ;; Don't do this again.
   (coding-system-put 'euc-tw 'pre-write-conversion nil)))
299
;; Define `euc-tw' as a CCL-based coding system: decoding and encoding are
;; done by the two CCL programs defined above.  The Big5 charsets are listed
;; as safe because the encoder translates them through `big5-to-cns'.
300 (make-coding-system
301 'euc-tw 4 ?Z
302 "ISO 2022 based EUC encoding for Chinese CNS11643.
303 Big5 encoding is accepted for input also (which is then converted to CNS)."
304 '(ccl-decode-euc-tw . ccl-encode-euc-tw) ; (DECODER . ENCODER)
305 '((safe-charsets ascii
306 chinese-big5-1
307 chinese-big5-2
308 chinese-cns11643-1
309 chinese-cns11643-2
310 chinese-cns11643-3
311 chinese-cns11643-4
312 chinese-cns11643-5
313 chinese-cns11643-6
314 chinese-cns11643-7)
315 (valid-codes (0 . 255))
316 (pre-write-conversion . euc-tw-pre-write-conversion))) ; one-shot hook that loads china-util
317
318 (define-coding-system-alias 'euc-taiwan 'euc-tw) ; alternative name for `euc-tw'
319
;; Register the "Chinese-CNS" language environment, covering the seven
;; CNS 11643 plane charsets.  `coding-priority' puts iso-2022-cn ahead of
;; euc-tw.  Unlike the GB and BIG5 environments in this file, no
;; `sample-text' or `tutorial' entry is given.
320 (set-language-info-alist
321 "Chinese-CNS" '((charset chinese-cns11643-1 chinese-cns11643-2
322 chinese-cns11643-3 chinese-cns11643-4
323 chinese-cns11643-5 chinese-cns11643-6
324 chinese-cns11643-7)
325 (coding-system iso-2022-cn euc-tw)
326 (coding-priority iso-2022-cn euc-tw chinese-big5
327 chinese-iso-8bit)
328 (features china-util)
329 (input-method . "chinese-cns-quick")
330 (documentation . "\
331 Support for Chinese CNS character sets. Note that the EUC-TW coding system
332 accepts Big5 for input also (which is then converted to CNS)."))
333 '("Chinese"))
334
;; Register the "Chinese-EUC-TW" language environment.  It lists the same
;; CNS 11643 plane charsets as "Chinese-CNS" plus the two Big5 charsets,
;; and its `coding-priority' puts euc-tw first.
335 (set-language-info-alist
336 "Chinese-EUC-TW" '((charset chinese-cns11643-1 chinese-cns11643-2
337 chinese-cns11643-3 chinese-cns11643-4
338 chinese-cns11643-5 chinese-cns11643-6
339 chinese-cns11643-7 chinese-big5-1 chinese-big5-2)
340 (coding-system euc-tw iso-2022-cn)
341 (coding-priority euc-tw chinese-big5 iso-2022-cn
342 chinese-iso-8bit)
343 (features china-util)
344 (input-method . "chinese-cns-quick")
345 (documentation . "\
346 Support for Chinese, preferring the EUC-TW character set. Note that
347 the EUC-TW coding system accepts Big5 for input also (which is then
348 converted to CNS)."))
349 '("Chinese"))
350
351 (provide 'chinese) ; announce the `chinese' feature once the whole file has been evaluated
352
353 ;;; arch-tag: b82fcf7a-84f6-4e0b-b38c-1742dac0e09f
354 ;;; chinese.el ends here