]> code.delx.au - gnu-emacs/blob - lisp/gnus/rfc2047.el
(message-posting-charset): defvar when compiling.
[gnu-emacs] / lisp / gnus / rfc2047.el
1 ;;; rfc2047.el --- functions for encoding and decoding rfc2047 messages
2 ;; Copyright (C) 1998, 1999, 2000, 2002 Free Software Foundation, Inc.
3
4 ;; Author: Lars Magne Ingebrigtsen <larsi@gnus.org>
5 ;; MORIOKA Tomohiko <morioka@jaist.ac.jp>
6 ;; This file is part of GNU Emacs.
7
8 ;; GNU Emacs is free software; you can redistribute it and/or modify
9 ;; it under the terms of the GNU General Public License as published by
10 ;; the Free Software Foundation; either version 2, or (at your option)
11 ;; any later version.
12
13 ;; GNU Emacs is distributed in the hope that it will be useful,
14 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 ;; GNU General Public License for more details.
17
18 ;; You should have received a copy of the GNU General Public License
19 ;; along with GNU Emacs; see the file COPYING. If not, write to the
20 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 ;; Boston, MA 02111-1307, USA.
22
23 ;;; Commentary:
24
25 ;; RFC 2047 is "MIME (Multipurpose Internet Mail Extensions) Part
26 ;; Three: Message Header Extensions for Non-ASCII Text".
27
28 ;;; Code:
29
30 (eval-when-compile
31 (require 'cl)
32 (defvar message-posting-charset))
33
34 (require 'qp)
35 (require 'mm-util)
36 (require 'ietf-drums)
37 (require 'mail-prsvr)
38 (require 'base64)
39 ;; Fixme: Avoid this (for gnus-point-at-...) mm dependence on gnus.
40 (require 'gnus-util)
41 (autoload 'mm-body-7-or-8 "mm-bodies")
42
43 (defvar rfc2047-header-encoding-alist
44 '(("Newsgroups" . nil)
45 ("Message-ID" . nil)
46 ("\\(Resent-\\)?\\(From\\|Cc\\|To\\|Bcc\\|Reply-To\\|Sender\\)" .
47 address-mime)
48 (t . mime))
49 "*Header/encoding method alist.
50 The list is traversed sequentially. The keys can either be
51 header regexps or t.
52
53 The values can be:
54
55 1) nil, in which case no encoding is done;
56 2) `mime', in which case the header will be encoded according to RFC2047;
57 3) `address-mime', like `mime', but takes account of the rules for address
58 fields (where quoted strings and comments must be treated separately);
59 4) a charset, in which case it will be encoded as that charset;
60 5) `default', in which case the field will be encoded as the rest
61 of the article.")
62
63 (defvar rfc2047-charset-encoding-alist
64 '((us-ascii . nil)
65 (iso-8859-1 . Q)
66 (iso-8859-2 . Q)
67 (iso-8859-3 . Q)
68 (iso-8859-4 . Q)
69 (iso-8859-5 . B)
70 (koi8-r . B)
71 (iso-8859-7 . B)
72 (iso-8859-8 . B)
73 (iso-8859-9 . Q)
74 (iso-8859-14 . Q)
75 (iso-8859-15 . Q)
76 (iso-2022-jp . B)
77 (iso-2022-kr . B)
78 (gb2312 . B)
79 (big5 . B)
80 (cn-big5 . B)
81 (cn-gb . B)
82 (cn-gb-2312 . B)
83 (euc-kr . B)
84 (iso-2022-jp-2 . B)
85 (iso-2022-int-1 . B))
86 "Alist of MIME charsets to RFC2047 encodings.
87 Valid encodings are nil, `Q' and `B'. These indicate binary (no) encoding,
88 quoted-printable and base64 respectively.")
89
90 (defvar rfc2047-encoding-function-alist
91 '((Q . rfc2047-q-encode-region)
92 (B . rfc2047-b-encode-region)
93 (nil . ignore))
94 "Alist of RFC2047 encodings to encoding functions.")
95
96 (defvar rfc2047-q-encoding-alist
97 '(("\\(Resent-\\)?\\(From\\|Cc\\|To\\|Bcc\\|Reply-To\\|Sender\\):"
98 . "-A-Za-z0-9!*+/" )
99 ;; = (\075), _ (\137), ? (\077) are used in the encoded word.
100 ;; Avoid using 8bit characters.
101 ;; Equivalent to "^\000-\007\011\013\015-\037\200-\377=_?"
102 ("." . "\010\012\014\040-\074\076\100-\136\140-\177"))
103 "Alist of header regexps and valid Q characters.")
104
105 ;;;
106 ;;; Functions for encoding RFC2047 messages
107 ;;;
108
109 (defun rfc2047-narrow-to-field ()
110 "Narrow the buffer to the header on the current line."
111 (beginning-of-line)
112 (narrow-to-region
113 (point)
114 (progn
115 (forward-line 1)
116 (if (re-search-forward "^[^ \n\t]" nil t)
117 (progn
118 (beginning-of-line)
119 (point))
120 (point-max))))
121 (goto-char (point-min)))
122
123 (defun rfc2047-encode-message-header ()
124 "Encode the message header according to `rfc2047-header-encoding-alist'.
125 Should be called narrowed to the head of the message."
126 (interactive "*")
127 (save-excursion
128 (goto-char (point-min))
129 (let (alist elem method)
130 (while (not (eobp))
131 (save-restriction
132 (rfc2047-narrow-to-field)
133 (if (not (rfc2047-encodable-p))
134 (if (and (eq (mm-body-7-or-8) '8bit)
135 (mm-multibyte-p)
136 (mm-coding-system-p
137 (car message-posting-charset)))
138 ;; 8 bit must be decoded.
139 ;; Is message-posting-charset a coding system?
140 (mm-encode-coding-region
141 (point-min) (point-max)
142 (car message-posting-charset)))
143 ;; We found something that may perhaps be encoded.
144 (setq method nil
145 alist rfc2047-header-encoding-alist)
146 (while (setq elem (pop alist))
147 (when (or (and (stringp (car elem))
148 (looking-at (car elem)))
149 (eq (car elem) t))
150 (setq alist nil
151 method (cdr elem))))
152 (goto-char (point-min))
153 (re-search-forward "^[^:]+: *" nil t)
154 (cond
155 ((eq method 'address-mime)
156 (rfc2047-encode-region (point) (point-max)))
157 ((eq method 'mime)
158 (let (rfc2047-special-chars)
159 (rfc2047-encode-region (point) (point-max))))
160 ((eq method 'default)
161 (if (and (featurep 'mule)
162 (if (boundp 'default-enable-multibyte-characters)
163 default-enable-multibyte-characters)
164 mail-parse-charset)
165 (mm-encode-coding-region (point) (point-max)
166 mail-parse-charset)))
167 ((mm-coding-system-p method)
168 (if (and (featurep 'mule)
169 (if (boundp 'default-enable-multibyte-characters)
170 default-enable-multibyte-characters))
171 (mm-encode-coding-region (point) (point-max) method)))
172 ;; Hm.
173 (t)))
174 (goto-char (point-max)))))))
175
176 ;; Fixme: This, and the require below may not be the Right Thing, but
177 ;; should be safe just before release. -- fx 2001-02-08
178 (eval-when-compile (defvar message-posting-charset))
179
180 (defun rfc2047-encodable-p ()
181 "Return non-nil if any characters in current buffer need encoding in headers.
182 The buffer may be narrowed."
183 (require 'message) ; for message-posting-charset
184 (let ((charsets
185 (mm-find-mime-charset-region (point-min) (point-max))))
186 (and charsets (not (equal charsets (list message-posting-charset))))))
187
188 ;; ietf-drums-specials-token less \ . @
189 (defconst rfc2047-special-chars (append "()<>[]:;,\"" nil)
190 "List of characters treated as special when rfc207-encoding address fields.
191 When encoding other sorts of fields, bin it to nil to avoid treating
192 RFC 2822 quoted words and comments specially.")
193
194 (defconst rfc2047-non-special-chars (concat "^" rfc2047-special-chars))
195
196 (defun rfc2047-dissect-region (b e)
197 "Dissect the region between B and E into tokens.
198 The tokens comprise sequences of atoms, quoted strings, special
199 characters and whitespace."
200 (save-restriction
201 (narrow-to-region b e)
202 (if (null rfc2047-special-chars)
203 ;; simple `mime' case -- no need to tokenize
204 (list (buffer-substring b e))
205 ;; `address-mime' case -- take care of quoted words, comments
206 (with-syntax-table ietf-drums-syntax-table
207 (let ((start (point))
208 words)
209 (goto-char (point-min))
210 (condition-case nil ; in case of unbalanced specials
211 ;; Dissect into: sequences of atoms, quoted strings,
212 ;; specials, whitespace. (Specials mustn't be encoded.)
213 (while (not (eobp))
214 (setq start (point))
215 (unless (= 0 (skip-chars-forward ietf-drums-wsp-token))
216 (push (buffer-substring start (point)) words)
217 (setq start (point)))
218 (cond
219 ((memq (char-after) rfc2047-special-chars)
220 ;; Grab string or special char.
221 (if (eq ?\" (char-after))
222 (progn
223 (forward-sexp)
224 (push (buffer-substring start (point)) words))
225 (push (string (char-after)) words)
226 (forward-char)))
227 ((not (char-after))) ; eob
228 (t ; normal token/whitespace sequence
229 (skip-chars-forward rfc2047-non-special-chars)
230 (skip-chars-backward ietf-drums-wsp-token)
231 (push (buffer-substring start (point)) words))))
232 (error (error "Invalid data for rfc2047 encoding: %s"
233 (buffer-substring b e))))
234 (nreverse words))))))
235
236 ;; Fixme: why does this cons a list of words and insert them, rather
237 ;; than encoding in place?
238 (defun rfc2047-encode-region (b e)
239 "Encode all encodable words in region B to E.
240 By default, the region is treated as containing addresses (see
241 `rfc2047-special-chars')."
242 (let ((words (rfc2047-dissect-region b e)) word)
243 (save-restriction
244 (narrow-to-region b e)
245 (delete-region (point-min) (point-max))
246 (dolist (word words)
247 ;; Quoted strings can't contain encoded words. Strip the
248 ;; quotes.
249 (if rfc2047-special-chars
250 (if (eq ?\" (aref word 0))
251 (setq word (substring word 1 -1))))
252 (if (string-match "\\`[\0-\177]*\\'" word) ; including whitespace
253 (insert word)
254 (rfc2047-fold-region (gnus-point-at-bol) (point))
255 (goto-char (point-max))
256 (if (> (- (point) (save-restriction
257 (widen)
258 (gnus-point-at-bol))) 76)
259 (insert "\n "))
260 ;; Insert blank between encoded words
261 (if (eq (char-before) ?=) (insert " "))
262 (rfc2047-encode (point)
263 (progn (insert word) (point)))))
264 (rfc2047-fold-region (point-min) (point-max)))))
265
266 (defun rfc2047-encode-string (string)
267 "Encode words in STRING.
268 By default, the string is treated as containing addresses (see
269 `rfc2047-special-chars')."
270 (with-temp-buffer
271 (insert string)
272 (rfc2047-encode-region (point-min) (point-max))
273 (buffer-string)))
274
275 (defun rfc2047-encode (b e)
276 "Encode the word(s) in the region B to E.
277 By default, the region is treated as containing addresses (see
278 `rfc2047-special-chars')."
279 (let* ((mime-charset (mm-find-mime-charset-region b e))
280 (cs (if (> (length mime-charset) 1)
281 ;; Fixme: instead of this, try to break region into
282 ;; parts that can be encoded separately.
283 (error "Can't rfc2047-encode `%s'"
284 (buffer-substring b e))
285 (setq mime-charset (car mime-charset))
286 (mm-charset-to-coding-system mime-charset)))
287 (encoding (if (assq mime-charset
288 rfc2047-charset-encoding-alist)
289 (cdr (assq mime-charset
290 rfc2047-charset-encoding-alist))
291 'B))
292 (start (concat
293 "=?" (downcase (symbol-name mime-charset)) "?"
294 (downcase (symbol-name encoding)) "?"))
295 (first t))
296 (if mime-charset
297 (save-restriction
298 (narrow-to-region b e)
299 (when (eq encoding 'B)
300 ;; break into lines before encoding
301 (goto-char (point-min))
302 (while (not (eobp))
303 (goto-char (min (point-max) (+ 15 (point))))
304 (unless (eobp)
305 (insert "\n"))))
306 (if (and (mm-multibyte-p)
307 (mm-coding-system-p cs))
308 (mm-encode-coding-region (point-min) (point-max) cs))
309 (funcall (cdr (assq encoding rfc2047-encoding-function-alist))
310 (point-min) (point-max))
311 (goto-char (point-min))
312 (while (not (eobp))
313 (unless first
314 (insert " "))
315 (setq first nil)
316 (insert start)
317 (end-of-line)
318 (insert "?=")
319 (forward-line 1))))))
320
321 (defun rfc2047-fold-region (b e)
322 "Fold long lines in region B to E."
323 (save-restriction
324 (narrow-to-region b e)
325 (goto-char (point-min))
326 (let ((break nil)
327 (qword-break nil)
328 (bol (save-restriction
329 (widen)
330 (gnus-point-at-bol))))
331 (while (not (eobp))
332 (when (and (or break qword-break) (> (- (point) bol) 76))
333 (goto-char (or break qword-break))
334 (setq break nil
335 qword-break nil)
336 (if (looking-at " \t")
337 (insert "\n")
338 (insert "\n "))
339 (setq bol (1- (point)))
340 ;; Don't break before the first non-LWSP characters.
341 (skip-chars-forward " \t")
342 (unless (eobp) (forward-char 1)))
343 (cond
344 ((eq (char-after) ?\n)
345 (forward-char 1)
346 (setq bol (point)
347 break nil
348 qword-break nil)
349 (skip-chars-forward " \t")
350 (unless (or (eobp) (eq (char-after) ?\n))
351 (forward-char 1)))
352 ((eq (char-after) ?\r)
353 (forward-char 1))
354 ((memq (char-after) '(? ?\t))
355 (skip-chars-forward " \t")
356 (setq break (1- (point))))
357 ((not break)
358 (if (not (looking-at "=\\?[^=]"))
359 (if (eq (char-after) ?=)
360 (forward-char 1)
361 (skip-chars-forward "^ \t\n\r="))
362 (setq qword-break (point))
363 (skip-chars-forward "^ \t\n\r")))
364 (t
365 (skip-chars-forward "^ \t\n\r"))))
366 (when (and (or break qword-break) (> (- (point) bol) 76))
367 (goto-char (or break qword-break))
368 (setq break nil
369 qword-break nil)
370 (if (looking-at " \t")
371 (insert "\n")
372 (insert "\n "))
373 (setq bol (1- (point)))
374 ;; Don't break before the first non-LWSP characters.
375 (skip-chars-forward " \t")
376 (unless (eobp) (forward-char 1))))))
377
378 (defun rfc2047-unfold-region (b e)
379 "Unfold lines in region B to E."
380 (save-restriction
381 (narrow-to-region b e)
382 (goto-char (point-min))
383 (let ((bol (save-restriction
384 (widen)
385 (gnus-point-at-bol)))
386 (eol (gnus-point-at-eol))
387 leading)
388 (forward-line 1)
389 (while (not (eobp))
390 (looking-at "[ \t]*")
391 (setq leading (- (match-end 0) (match-beginning 0)))
392 (if (< (- (gnus-point-at-eol) bol leading) 76)
393 (progn
394 (goto-char eol)
395 (delete-region eol (progn
396 (skip-chars-forward "[ \t\n\r]+")
397 (1- (point)))))
398 (setq bol (gnus-point-at-bol)))
399 (setq eol (gnus-point-at-eol))
400 (forward-line 1)))))
401
402 (defun rfc2047-b-encode-region (b e)
403 "Base64-encode the header contained in region B to E."
404 (save-restriction
405 (narrow-to-region (goto-char b) e)
406 (while (not (eobp))
407 (base64-encode-region (point) (progn (end-of-line) (point)) t)
408 (if (and (bolp) (eolp))
409 (delete-backward-char 1))
410 (forward-line))))
411
412 (defun rfc2047-q-encode-region (b e)
413 "Quoted-printable-encode the header in region B to E."
414 (save-excursion
415 (save-restriction
416 (narrow-to-region (goto-char b) e)
417 (let ((alist rfc2047-q-encoding-alist)
418 (bol (save-restriction
419 (widen)
420 (gnus-point-at-bol))))
421 (while alist
422 (when (looking-at (caar alist))
423 (quoted-printable-encode-region b e nil (cdar alist))
424 (subst-char-in-region (point-min) (point-max) ? ?_)
425 (setq alist nil))
426 (pop alist))
427 ;; The size of QP encapsulation is about 20, so set limit to
428 ;; 56=76-20.
429 (unless (< (- (point-max) (point-min)) 56)
430 ;; Don't break if it could fit in one line.
431 ;; Let rfc2047-encode-region break it later.
432 (goto-char (1+ (point-min)))
433 (while (and (not (bobp)) (not (eobp)))
434 (goto-char (min (point-max) (+ 56 bol)))
435 (search-backward "=" (- (point) 2) t)
436 (unless (or (bobp) (eobp))
437 (insert "\n")
438 (setq bol (point)))))))))
439
440 ;;;
441 ;;; Functions for decoding RFC2047 messages
442 ;;;
443
444 (defvar rfc2047-encoded-word-regexp
445 "=\\?\\([^][\000-\040()<>@,\;:\\\"/?.=]+\\)\\?\\(B\\|Q\\)\\?\\([!->@-~ +]+\\)\\?=")
446
447 (defun rfc2047-decode-region (start end)
448 "Decode MIME-encoded words in region between START and END."
449 (interactive "r")
450 (let ((case-fold-search t)
451 b e)
452 (save-excursion
453 (save-restriction
454 (narrow-to-region start end)
455 (goto-char (point-min))
456 ;; Remove whitespace between encoded words.
457 (while (re-search-forward
458 (concat "\\(" rfc2047-encoded-word-regexp "\\)"
459 "\\(\n?[ \t]\\)+"
460 "\\(" rfc2047-encoded-word-regexp "\\)")
461 nil t)
462 (delete-region (goto-char (match-end 1)) (match-beginning 6)))
463 ;; Decode the encoded words.
464 (setq b (goto-char (point-min)))
465 (while (re-search-forward rfc2047-encoded-word-regexp nil t)
466 (setq e (match-beginning 0))
467 (insert (rfc2047-parse-and-decode
468 (prog1
469 (match-string 0)
470 (delete-region (match-beginning 0) (match-end 0)))))
471 (when (and (mm-multibyte-p)
472 mail-parse-charset
473 (not (eq mail-parse-charset 'gnus-decoded)))
474 (mm-decode-coding-region b e mail-parse-charset))
475 (setq b (point)))
476 (when (and (mm-multibyte-p)
477 mail-parse-charset
478 (not (eq mail-parse-charset 'us-ascii))
479 (not (eq mail-parse-charset 'gnus-decoded)))
480 (mm-decode-coding-region b (point-max) mail-parse-charset))
481 (rfc2047-unfold-region (point-min) (point-max))))))
482
483 (defun rfc2047-decode-string (string)
484 "Decode the quoted-printable-encoded STRING and return the results."
485 (let ((m (mm-multibyte-p)))
486 (with-temp-buffer
487 (when m
488 (mm-enable-multibyte))
489 (insert string)
490 (inline
491 (rfc2047-decode-region (point-min) (point-max)))
492 (buffer-string))))
493
494 (defun rfc2047-parse-and-decode (word)
495 "Decode WORD and return it if it is an encoded word.
496 Return WORD if not."
497 (if (not (string-match rfc2047-encoded-word-regexp word))
498 word
499 (or
500 (condition-case nil
501 (rfc2047-decode
502 (match-string 1 word)
503 (upcase (match-string 2 word))
504 (match-string 3 word))
505 (error word))
506 word)))
507
508 (defun rfc2047-decode (charset encoding string)
509 "Decode STRING from the given MIME CHARSET in the given ENCODING.
510 Valid ENCODINGs are \"B\" and \"Q\".
511 If your Emacs implementation can't decode CHARSET, return nil."
512 (if (stringp charset)
513 (setq charset (intern (downcase charset))))
514 (if (or (not charset)
515 (eq 'gnus-all mail-parse-ignored-charsets)
516 (memq 'gnus-all mail-parse-ignored-charsets)
517 (memq charset mail-parse-ignored-charsets))
518 (setq charset mail-parse-charset))
519 (let ((cs (mm-charset-to-coding-system charset)))
520 (if (and (not cs) charset
521 (listp mail-parse-ignored-charsets)
522 (memq 'gnus-unknown mail-parse-ignored-charsets))
523 (setq cs (mm-charset-to-coding-system mail-parse-charset)))
524 (when cs
525 (when (and (eq cs 'ascii)
526 mail-parse-charset)
527 (setq cs mail-parse-charset))
528 ;; Ensure unibyte result in Emacs 20.
529 (let (default-enable-multibyte-characters)
530 (with-temp-buffer
531 (mm-decode-coding-string
532 (cond
533 ((equal "B" encoding)
534 (base64-decode-string string))
535 ((equal "Q" encoding)
536 (quoted-printable-decode-string
537 (mm-replace-chars-in-string string ?_ ? )))
538 (t (error "Invalid encoding: %s" encoding)))
539 cs))))))
540
541 (provide 'rfc2047)
542
543 ;;; rfc2047.el ends here