1 ;;; puny.el --- translate non-ASCII domain names to ASCII
3 ;; Copyright (C) 2015-2016 Free Software Foundation, Inc.
5 ;; Author: Lars Magne Ingebrigtsen <larsi@gnus.org>
8 ;; This file is part of GNU Emacs.
10 ;; GNU Emacs is free software: you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by
12 ;; the Free Software Foundation, either version 3 of the License, or
13 ;; (at your option) any later version.
15 ;; GNU Emacs is distributed in the hope that it will be useful,
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;; GNU General Public License for more details.
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
25 ;; Written by looking at
26 ;; http://stackoverflow.com/questions/183485/can-anyone-recommend-a-good-free-javascript-for-punycode-to-unicode-conversion
32 (defun puny-encode-domain (domain)
33 "Encode DOMAIN according to the IDNA/punycode algorithm.
34 For instance, \"fśf.org\" => \"xn--ff-2sa.org\"."
35 ;; The vast majority of domain names are not IDNA domain names, so
36 ;; add a check first to avoid doing unnecessary work.
37 (if (string-match "\\'[[:ascii:]]+\\'" domain)
39 (mapconcat 'puny-encode-string (split-string domain "[.]") ".")))
41 (defun puny-encode-string (string)
42 "Encode STRING according to the IDNA/punycode algorithm.
43 This is used to encode non-ASCII domain names.
44 For instance, \"bücher\" => \"xn--bcher-kva\"."
45 (let ((ascii (seq-filter (lambda (char)
48 (if (= (length ascii) (length string))
54 (puny-encode-complex (length ascii) string)))))
56 (defun puny-decode-domain (domain)
57 "Decode DOMAIN according to the IDNA/punycode algorithm.
58 For instance, \"xn--ff-2sa.org\" => \"fśf.org\"."
59 (mapconcat 'puny-decode-string (split-string domain "[.]") "."))
61 (defun puny-decode-string (string)
62 "Decode an IDNA/punycode-encoded string.
63 For instance \"xn--bcher-kva\" => \"bücher\"."
64 (if (string-match "\\`xn--" string)
65 (puny-decode-string-internal (substring string 4))
68 (defconst puny-initial-n 128)
69 (defconst puny-initial-bias 72)
70 (defconst puny-base 36)
71 (defconst puny-damp 700)
72 (defconst puny-tmin 1)
73 (defconst puny-tmax 26)
74 (defconst puny-skew 28)
78 (defun puny-encode-digit (d)
83 (defun puny-adapt (delta num-points first-time)
84 (let ((delta (if first-time
88 (setq delta (+ delta (/ delta num-points)))
89 (while (> delta (/ (* (- puny-base puny-tmin)
92 (setq delta (/ delta (- puny-base puny-tmin))
94 (+ k (/ (* (1+ (- puny-base puny-tmin)) delta)
95 (+ delta puny-skew)))))
97 (defun puny-encode-complex (insertion-points string)
98 (let ((n puny-initial-n)
100 (bias puny-initial-bias)
103 (while (< h (length string))
104 (setq ijv (cl-loop for char across string
108 (setq delta (+ delta (* (- m n) (+ h 1)))
110 (cl-loop for char across string
116 (cl-loop with k = puny-base
120 ((>= k (+ bias puny-tmax))
125 do (push (puny-encode-digit
129 do (setq q (/ (- q t1) (- puny-base t1))
131 (push (puny-encode-digit q) result)
132 (setq bias (puny-adapt delta (+ h 1) (= h insertion-points))
139 (defun puny-decode-digit (cp)
150 (defun puny-decode-string-internal (string)
153 (goto-char (point-max))
154 (search-backward "-" nil (point-min))
155 ;; The encoded chars are after the final dash.
156 (let ((encoded (buffer-substring (1+ (point)) (point-max)))
159 (bias puny-initial-bias)
162 (delete-region (point) (point-max))
163 (while (< ic (length encoded))
169 (setq digit (puny-decode-digit (aref encoded ic)))
171 (cl-incf i (* digit w))
175 ((>= k (+ bias puny-tmax))
180 do (setq w (* w (- puny-base t1))
182 (setq out (1+ (buffer-size)))
183 (setq bias (puny-adapt (- i old-i) out (= old-i 0))))
185 (setq n (+ n (/ i out))
187 (goto-char (point-min))
189 (insert (format "%c" n))
193 ;; http://www.unicode.org/reports/tr39/#Restriction_Level_Detection
194 ;; http://www.unicode.org/reports/tr31/#Table_Candidate_Characters_for_Inclusion_in_Identifiers
196 (defun puny-highly-restrictive-string-p (string)
197 "Say whether STRING is \"highly restrictive\" in the Unicode IDNA sense.
198 See http://www.unicode.org/reports/tr39/#Restriction_Level_Detection
199 for details. The main idea is that if you're mixing
200 scripts (like latin and cyrillic), you may confuse the user by
206 (seq-map (lambda (char)
208 ;; These characters are always allowed
210 '(#x0027 ; APOSTROPHE
211 #x002D ; HYPHEN-MINUS
215 #x058A ; ARMENIAN HYPHEN
216 #x05F3 ; HEBREW PUNCTUATION GERESH
217 #x05F4 ; HEBREW PUNCTUATION GERSHAYIM
218 #x0F0B ; TIBETAN MARK INTERSYLLABIC TSHEG
219 #x200C ; ZERO WIDTH NON-JOINER*
220 #x200D ; ZERO WIDTH JOINER*
222 #x2019 ; RIGHT SINGLE QUOTATION MARK
223 #x2027 ; HYPHENATION POINT
224 #x30A0 ; KATAKANA-HIRAGANA DOUBLE HYPHEN
225 #x30FB)) ; KATAKANA MIDDLE DOT
227 (aref char-script-table char)))
230 ;; Every character uses the same script.
231 (= (length scripts) 1)
233 (mapcar (lambda (list)
234 (seq-every-p (lambda (script)
237 '((latin han hiragana kana)
239 (latin han hangul)))))))
241 (defun puny-highly-restrictive-domain-p (domain)
242 "Say whether DOMAIN is \"highly restrictive\" in the Unicode IDNA sense.
243 See `puny-highly-restrictive-string-p' for further details."
244 (seq-every-p 'puny-highly-restrictive-string-p (split-string domain "[.]")))
248 ;;; puny.el ends here