]> code.delx.au - gnu-emacs/blob - lisp/net/puny.el
Add a new function to say whether a string is restrictive
[gnu-emacs] / lisp / net / puny.el
1 ;;; puny.el --- translate non-ASCII domain names to ASCII
2
3 ;; Copyright (C) 2015 Free Software Foundation, Inc.
4
5 ;; Author: Lars Magne Ingebrigtsen <larsi@gnus.org>
6 ;; Keywords: mail, net
7
8 ;; This file is part of GNU Emacs.
9
10 ;; GNU Emacs is free software: you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by
12 ;; the Free Software Foundation, either version 3 of the License, or
13 ;; (at your option) any later version.
14
15 ;; GNU Emacs is distributed in the hope that it will be useful,
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;; GNU General Public License for more details.
19
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
22
23 ;;; Commentary:
24
25 ;; Written by looking at
26 ;; http://stackoverflow.com/questions/183485/can-anyone-recommend-a-good-free-javascript-for-punycode-to-unicode-conversion
27
28 ;;; Code:
29
30 (require 'seq)
31
32 (defun puny-encode-domain (domain)
33 "Encode DOMAIN according to the IDNA/punycode algorith.
34 For instance, \"fśf.org\" => \"xn--ff-2sa.org\"."
35 ;; The vast majority of domain names are not IDNA domain names, so
36 ;; add a check first to avoid doing unnecessary work.
37 (if (string-match "\\'[[:ascii:]]+\\'" domain)
38 domain
39 (mapconcat 'puny-encode-string (split-string domain "[.]") ".")))
40
41 (defun puny-encode-string (string)
42 "Encode STRING according to the IDNA/punycode algorithm.
43 This is used to encode non-ASCII domain names.
44 For instance, \"bücher\" => \"xn--bcher-kva\"."
45 (let ((ascii (seq-filter (lambda (char)
46 (< char 128))
47 string)))
48 (if (= (length ascii) (length string))
49 string
50 (concat "xn--"
51 (if (null ascii)
52 ""
53 (concat ascii "-"))
54 (puny-encode-complex (length ascii) string)))))
55
56 (defun puny-decode-domain (domain)
57 "Decode DOMAIN according to the IDNA/punycode algorith.
58 For instance, \"xn--ff-2sa.org\" => \"fśf.org\"."
59 (mapconcat 'puny-decode-string (split-string domain "[.]") "."))
60
61 (defun puny-decode-string (string)
62 "Decode an IDNA/punycode-encoded string.
63 For instance \"xn--bcher-kva\" => \"bücher\"."
64 (if (string-match "\\`xn--" string)
65 (puny-decode-string-internal (substring string 4))
66 string))
67
68 (defconst puny-initial-n 128)
69 (defconst puny-initial-bias 72)
70 (defconst puny-base 36)
71 (defconst puny-damp 700)
72 (defconst puny-tmin 1)
73 (defconst puny-tmax 26)
74 (defconst puny-skew 28)
75
76 ;; 0-25 a-z
77 ;; 26-36 0-9
78 (defun puny-encode-digit (d)
79 (if (< d 26)
80 (+ ?a d)
81 (+ ?0 (- d 26))))
82
83 (defun puny-adapt (delta num-points first-time)
84 (let ((delta (if first-time
85 (/ delta puny-damp)
86 (/ delta 2)))
87 (k 0))
88 (setq delta (+ delta (/ delta num-points)))
89 (while (> delta (/ (* (- puny-base puny-tmin)
90 puny-tmax)
91 2))
92 (setq delta (/ delta (- puny-base puny-tmin))
93 k (+ k puny-base)))
94 (+ k (/ (* (1+ (- puny-base puny-tmin)) delta)
95 (+ delta puny-skew)))))
96
97 (defun puny-encode-complex (insertion-points string)
98 (let ((n puny-initial-n)
99 (delta 0)
100 (bias puny-initial-bias)
101 (h insertion-points)
102 result m ijv q)
103 (while (< h (length string))
104 (setq ijv (cl-loop for char across string
105 when (>= char n)
106 minimize char))
107 (setq m ijv)
108 (setq delta (+ delta (* (- m n) (+ h 1)))
109 n m)
110 (cl-loop for char across string
111 when (< char n)
112 do (cl-incf delta)
113 when (= char ijv)
114 do (progn
115 (setq q delta)
116 (cl-loop with k = puny-base
117 for t1 = (cond
118 ((<= k bias)
119 puny-tmin)
120 ((>= k (+ bias puny-tmax))
121 puny-tmax)
122 (t
123 (- k bias)))
124 while (>= q t1)
125 do (push (puny-encode-digit
126 (+ t1 (mod (- q t1)
127 (- puny-base t1))))
128 result)
129 do (setq q (/ (- q t1) (- puny-base t1))
130 k (+ k puny-base)))
131 (push (puny-encode-digit q) result)
132 (setq bias (puny-adapt delta (+ h 1) (= h insertion-points))
133 delta 0
134 h (1+ h))))
135 (cl-incf delta)
136 (cl-incf n))
137 (nreverse result)))
138
139 (defun puny-decode-digit (cp)
140 (cond
141 ((<= cp ?9)
142 (+ (- cp ?0) 26))
143 ((<= cp ?Z)
144 (- cp ?A))
145 ((<= cp ?z)
146 (- cp ?a))
147 (t
148 puny-base)))
149
150 (defun puny-decode-string-internal (string)
151 (with-temp-buffer
152 (insert string)
153 (goto-char (point-max))
154 (search-backward "-" nil (point-min))
155 ;; The encoded chars are after the final dash.
156 (let ((encoded (buffer-substring (1+ (point)) (point-max)))
157 (ic 0)
158 (i 0)
159 (bias puny-initial-bias)
160 (n puny-initial-n)
161 out)
162 (delete-region (point) (point-max))
163 (while (< ic (length encoded))
164 (let ((old-i i)
165 (w 1)
166 (k puny-base)
167 digit t1)
168 (cl-loop do (progn
169 (setq digit (puny-decode-digit (aref encoded ic)))
170 (cl-incf ic)
171 (cl-incf i (* digit w))
172 (setq t1 (cond
173 ((<= k bias)
174 puny-tmin)
175 ((>= k (+ bias puny-tmax))
176 puny-tmax)
177 (t
178 (- k bias)))))
179 while (>= digit t1)
180 do (setq w (* w (- puny-base t1))
181 k (+ k puny-base)))
182 (setq out (1+ (buffer-size)))
183 (setq bias (puny-adapt (- i old-i) out (= old-i 0))))
184
185 (setq n (+ n (/ i out))
186 i (mod i out))
187 (goto-char (point-min))
188 (forward-char i)
189 (insert (format "%c" n))
190 (cl-incf i)))
191 (buffer-string)))
192
193 ;; http://www.unicode.org/reports/tr39/#Restriction_Level_Detection
194
195 (defun puny-highly-restrictive-p (string)
196 (let ((scripts
197 (seq-uniq
198 (seq-map (lambda (char)
199 (aref char-script-table char))
200 string))))
201 (or
202 ;; Every character uses the same script.
203 (= (length scripts) 1)
204 (seq-some 'identity
205 (mapcar (lambda (list)
206 (seq-every-p (lambda (script)
207 (memq script list))
208 scripts))
209 '((latin han hiragana kana)
210 (latin han bopomofo)
211 (latin han hangul)))))))
212
213 (provide 'puny)
214
215 ;;; puny.el ends here