;; The entry point of this code is
;;
;; mail-extract-address-components: (address &optional all)
-;;
+;;
;; Given an RFC-822 ADDRESS, extract full name and canonical address.
;; Returns a list of the form (FULL-NAME CANONICAL-ADDRESS).
;; If no name can be extracted, FULL-NAME will be nil.
-;; ADDRESS may be a string or a buffer. If it is a buffer, the visible
+;; ADDRESS may be a string or a buffer. If it is a buffer, the visible
;; (narrowed) portion of the buffer will be interpreted as the address.
;; (This feature exists so that the clever caller might be able to avoid
;; consing a string.)
;; make sure you're not breaking functionality. The test cases aren't included
;; because they are over 100K.
;;
-;; If you find an address that mail-extr fails on, please send it to the
+;; If you find an address that mail-extr fails on, please send it to the
;; maintainer along with what you think the correct results should be. We do
;; not consider it a bug if mail-extr mangles a comment that does not
-;; correspond to a real human full name, although we would prefer that
+;; correspond to a real human full name, although we would prefer that
;; mail-extr would return the comment as-is.
;;
;; Features:
;; * insert documentation strings!
;; * handle X.400-gatewayed addresses according to RFC 1148.
-;;; Change Log:
-;;
+;;; Change Log:
+;;
;; Thu Feb 17 17:57:33 1994 Jamie Zawinski (jwz@lucid.com)
;;
;; * merged with jbw's latest version
;; * some more cleanup, doc, added provide
;;
;; Tue Mar 23 21:23:18 1993 Joe Wells (jbw at csd.bu.edu)
-;;
+;;
;; * Made mail-full-name-prefixes a user-customizable variable.
;; Allow passing the address as a buffer as well as a string.
;; Allow [ and ] as name characters (Finnish character set).
-;;
+;;
;; Mon Mar 22 21:20:56 1993 Joe Wells (jbw at bigbird.bu.edu)
-;;
+;;
;; * Handle "null" addresses. Handle = used for spacing in mailbox
;; name. Fix bug in handling of ROUTE-ADDR-type addresses that are
;; missing their brackets. Handle uppercase "JR". Extract full
;; names from X.400 addresses encoded in RFC-822. Fix bug in
;; handling of multiple addresses where first has trailing comment.
;; Handle more kinds of telephone extension lead-ins.
-;;
+;;
;; Mon Mar 22 20:16:57 1993 Joe Wells (jbw at bigbird.bu.edu)
-;;
+;;
;; * Handle HZ encoding for embedding GB encoded chinese characters.
-;;
+;;
;; Mon Mar 22 00:46:12 1993 Joe Wells (jbw at bigbird.bu.edu)
-;;
+;;
;; * Fixed too broad matching of ham radio call signs. Fixed bug in
;; handling an unmatched ' in a name string. Enhanced recognition
;; of when . in the mailbox name terminates the name portion.
;; introduced in switching last name order. Fixed bug in handling
;; address with ! and % but no @. Narrowed the cases in which
;; certain trailing words are discarded.
-;;
+;;
;; Sun Mar 21 21:41:06 1993 Joe Wells (jbw at bigbird.bu.edu)
-;;
+;;
;; * Fixed bugs in handling GROUP addresses. Certain words in the
;; middle of a name no longer terminate it. Handle LISTSERV list
;; names. Ignore comment field containing mailbox name.
-;;
+;;
;; Sun Mar 21 14:39:38 1993 Joe Wells (jbw at bigbird.bu.edu)
-;;
+;;
;; * Moved variant-method code back into main function. Handle
;; underscores as spaces in comments. Handle leading nickname. Add
;; flag to ignore single-word names. Other changes.
-;;
+;;
;; Mon Feb 1 22:23:31 1993 Joe Wells (jbw at bigbird.bu.edu)
-;;
+;;
;; * Added in changes by Rod Whitby and Jamie Zawinski. This
;; includes the flag mail-extr-guess-middle-initial and the fix for
;; handling multiple addresses correctly. (Whitby just changed
;; a > to a <.)
-;;
+;;
;; Mon Apr 6 23:59:09 1992 Joe Wells (jbw at bigbird.bu.edu)
-;;
+;;
;; * Cleaned up some more. Release version 1.0 to world.
-;;
+;;
;; Sun Apr 5 19:39:08 1992 Joe Wells (jbw at bigbird.bu.edu)
-;;
+;;
;; * Cleaned up full name extraction extensively.
-;;
+;;
;; Sun Feb 2 14:45:24 1992 Joe Wells (jbw at bigbird.bu.edu)
-;;
+;;
;; * Total rewrite. Integrated mail-canonicalize-address into
;; mail-extract-address-components. Now handles GROUP addresses more
;; or less correctly. Better handling of lots of different cases.
-;;
+;;
;; Fri Jun 14 19:39:50 1991
;; * Created.
:type 'boolean
:group 'mail-extr)
-(defcustom mail-extr-ignore-single-names t
+(defcustom mail-extr-ignore-single-names nil
"*Whether to ignore a name that is just a single word.
If true, then when we see an address like \"Idiot <dumb@stupid.com>\"
we will act as though we couldn't find a full name in the address."
(defconst mail-extr-leading-garbage "\\W+")
-;; (defconst mail-extr-non-name-chars
+;; (defconst mail-extr-non-name-chars
;; (purecopy (concat "^" mail-extr-all-letters ".")))
;; (defconst mail-extr-non-begin-name-chars
;; (purecopy (concat "^" mail-extr-first-letters)))
;; (defconst mail-extr-non-end-name-chars
;; (purecopy (concat "^" mail-extr-last-letters)))
-;; Matches an initial not followed by both a period and a space.
+;; Matches an initial not followed by both a period and a space.
;; (defconst mail-extr-bad-initials-pattern
-;; (purecopy
+;; (purecopy
;; (format "\\(\\([^%s]\\|\\`\\)[%s]\\)\\(\\.\\([^ ]\\)\\| \\|\\([^%s .]\\)\\|\\'\\)"
;; mail-extr-all-letters mail-extr-first-letters mail-extr-all-letters)))
;; Must not match a trailing uppercase last name or trailing initial
(defconst mail-extr-weird-acronym-pattern
(purecopy "\\([A-Z]+[-_/]\\|[A-Z][A-Z][A-Z]?\\b\\)"))
-
+
;; Matches a mixed-case or lowercase name (not an initial).
;; #### Match Latin1 lower case letters here too?
;; (defconst mail-extr-mixed-case-name-pattern
;; Matches a trailing alternative address.
;; #### Match Latin1 letters here too?
-;; #### Match _ before @ here too?
+;; #### Match _ before @ here too?
(defconst mail-extr-alternative-address-pattern
(purecopy "\\(aka *\\)?[a-zA-Z.]+[!@][a-zA-Z.]"))
;; Matches a single word name.
;; (defconst mail-extr-one-name-pattern
;; (purecopy (concat "\\`" mail-extr-normal-name-pattern "\\'")))
-
+
;; Matches normal two names with missing middle initial
;; The first name is not allowed to have a hyphen because this can cause
;; false matches where the "middle initial" is actually the first letter
;; encountered. The character '~' is an escape character. By convention, it
;; must be immediately followed ONLY by '~', '{' or '\n' (<LF>), with the
;; following special meaning.
-;;
+;;
;; o The escape sequence '~~' is interpreted as a '~'.
;; o The escape-to-GB sequence '~{' switches the mode from ASCII to GB.
;; o The escape sequence '~\n' is a line-continuation marker to be consumed
;; with no output produced.
-;;
+;;
;; In GB mode, characters are interpreted two bytes at a time as (pure) GB
;; codes until the escape-from-GB code '~}' is read. This code switches the
;; mode from GB back to ASCII. (Note that the escape-from-GB code '~}'
(defun mail-extract-address-components (address &optional all)
"Given an RFC-822 address ADDRESS, extract full name and canonical address.
Returns a list of the form (FULL-NAME CANONICAL-ADDRESS).
-If no name can be extracted, FULL-NAME will be nil.
+If no name can be extracted, FULL-NAME will be nil. Also see
+`mail-extr-ignore-single-names'.
If the optional argument ALL is non-nil, then ADDRESS can contain zero
or more recipients, separated by commas, and we return a list of
one recipients, all but the first is ignored.
ADDRESS may be a string or a buffer. If it is a buffer, the visible
- (narrowed) portion of the buffer will be interpreted as the address.
- (This feature exists so that the clever caller might be able to avoid
- consing a string.)"
+\(narrowed) portion of the buffer will be interpreted as the address.
+\(This feature exists so that the clever caller might be able to avoid
+consing a string.)"
(let ((canonicalization-buffer (get-buffer-create " *canonical address*"))
(extraction-buffer (get-buffer-create " *extract address components*"))
value-list)
(widen)
(erase-buffer)
(setq case-fold-search nil)
-
+
;; Insert extra space at beginning to allow later replacement with <
;; without having to move markers.
(insert ?\ )
(buffer-disable-undo canonicalization-buffer)
(setq case-fold-search nil))
-
+
;; Unfold multiple lines.
(goto-char (point-min))
(while (re-search-forward "\\([^\\]\\(\\\\\\\\\\)*\\)\n[ \t]" nil t)
(replace-match "\\1 " t))
-
+
;; Loop over addresses until we have as many as we want.
(while (and (or all (null value-list))
(progn (goto-char (point-min))
;; Any commas must be between < and : of ROUTE-ADDR. Nuke any
;; others.
- ;; Hell, go ahead an nuke all of the commas.
+ ;; Hell, go ahead and nuke all of the commas.
;; **** This will cause problems when we start handling commas in
;; the PHRASE part .... no it won't ... yes it will ... ?????
(mail-extr-nuke-outside-range comma-pos 1 1)
(if (bobp)
(delete-region (point) cbeg)
(just-one-space))))))
-
+
;; This was moved above.
;; Fix . used as space
;; But it belongs here because it occurs not only as
;; Loop over the words (and other junk) in the name.
(goto-char (point-min))
(while (not name-done-flag)
-
+
(when word-found-flag
;; Last time through this loop we skipped over a word.
(setq last-word-beg this-word-beg)
(setq lower-case-flag nil)
;; (setq upper-case-flag nil)
(setq begin-again-flag nil))
-
+
;; Initialize for this iteration of the loop.
(mail-extr-skip-whitespace-forward)
(if (eq word-count 0) (narrow-to-region (point) (point-max)))
(setq this-word-beg (point))
(setq drop-this-word-if-trailing-flag nil)
-
+
;; Decide what to do based on what we are looking at.
(cond
-
+
;; Delete title
((and (eq word-count 0)
(looking-at mail-extr-full-name-prefixes))
(goto-char (match-end 0))
(narrow-to-region (point) (point-max)))
-
+
;; Stop after name suffix
((and (>= word-count 2)
(looking-at mail-extr-full-name-suffix-pattern))
(upcase-word 1)))
(setq word-found-flag t)
(setq name-done-flag t))
-
+
;; Handle SCA names
((looking-at "MKA \\(.+\\)") ; "Mundanely Known As"
(goto-char (match-beginning 1))
(narrow-to-region (point) (point-max))
(setq begin-again-flag t))
-
+
;; Check for initial last name followed by comma
((and (eq ?, (following-char))
(eq word-count 1))
(setq last-name-comma-flag t)
(or (eq ?\ (following-char))
(insert ?\ )))
-
+
;; Stop before trailing comma-separated comment
;; THIS CASE MUST BE AFTER THE PRECEDING CASES.
;; *** This case is redundant???
;;((eq ?, (following-char))
;; (setq name-done-flag t))
-
+
;; Delete parenthesized/quoted comment/nickname
((memq (following-char) '(?\( ?\{ ?\[ ?\" ?\' ?\`))
(setq cbeg (point))
(delete-region cbeg cend)
(if initial
(insert initial ". ")))))
-
+
;; Handle *Stupid* VMS date stamps
((looking-at mail-extr-stupid-vms-date-stamp-pattern)
(replace-match "" t))
-
+
;; Handle Chinese characters.
((looking-at mail-extr-hz-embedded-gb-encoded-chinese-pattern)
(goto-char (match-end 0))
(setq word-found-flag t))
-
+
;; Skip initial garbage characters.
;; THIS CASE MUST BE AFTER THE PRECEDING CASES.
((and (eq word-count 0)
;; *** Skip backward over these???
;; (skip-chars-backward "& \"")
(narrow-to-region (point) (point-max)))
-
+
;; Various stopping points
((or
-
+
;; Stop before ALL CAPS acronyms, if preceded by mixed-case
;; words. Example: XT-DEM.
(and (>= word-count 2)
mixed-case-flag
(looking-at mail-extr-weird-acronym-pattern)
(not (looking-at mail-extr-roman-numeral-pattern)))
-
+
;; Stop before trailing alternative address
(looking-at mail-extr-alternative-address-pattern)
-
+
;; Stop before trailing comment not introduced by comma
;; THIS CASE MUST BE AFTER AN EARLIER CASE.
(looking-at mail-extr-trailing-comment-start-pattern)
-
+
;; Stop before telephone numbers
(and (>= word-count 1)
(looking-at mail-extr-telephone-extension-pattern)))
(setq name-done-flag t))
-
+
;; Delete ham radio call signs
((looking-at mail-extr-ham-call-sign-pattern)
(delete-region (match-beginning 0) (match-end 0)))
-
+
;; Fixup initials
((looking-at mail-extr-initial-pattern)
(or (eq (following-char) (upcase (following-char)))
(or (eq ?\ (following-char))
(insert ?\ ))
(setq word-found-flag t))
-
+
;; Handle BITNET LISTSERV list names.
((and (eq word-count 0)
(looking-at mail-extr-listserv-list-name-pattern))
(narrow-to-region (match-beginning 1) (match-end 1))
(setq word-found-flag t)
(setq name-done-flag t))
-
+
;; Handle & substitution, when & is last and is not first.
((and (> word-count 0)
(eq ?\ (preceding-char))
((looking-at mail-extr-name-pattern)
(setq name-beg (point))
(setq name-end (match-end 0))
-
+
;; Certain words will be dropped if they are at the end.
(and (>= word-count 2)
(not lower-case-flag)
;; Drop a trailing word which is terminated with a period.
(eq ?. (char-after (1- name-end))))
(setq drop-this-word-if-trailing-flag t))
-
+
;; Set the flags that indicate whether we have seen a lowercase
;; word, a mixed case word, and an uppercase word.
(if (re-search-forward "[a-z]" name-end t)
(setq lower-case-flag t))
;; (setq upper-case-flag t)
)
-
+
(goto-char name-end)
(setq word-found-flag t))
(t
(setq name-done-flag t)
))
-
+
;; Count any word that we skipped over.
(if word-found-flag
(setq word-count (1+ word-count))))
-
+
;; If the last thing in the name is 2 or more periods, or one or more
;; other sentence terminators (but not a single period) then keep them
;; and the preceding word. This is for the benefit of whole sentences
(or (and drop-last-word-if-trailing-flag
last-word-beg)
(point)))
-
+
;; Xerox's mailers SUCK!!!!!!
;; We simply refuse to believe that any last name is PARC or ADOC.
;; If it looks like that is the last name, that there is no meaningful
(goto-char name-end)
(skip-chars-forward "\t ,")
(narrow-to-region (point) (point-max)))
-
+
;; Delete leading and trailing junk characters.
;; *** This is probably completely unneeded now.
;;(goto-char (point-max))
;; (goto-char (point-min))
;; (skip-chars-forward mail-extr-non-begin-name-chars)
;; (point)))
-
+
;; Compress whitespace
(goto-char (point-min))
(while (re-search-forward "[ \t\n]+" nil t)
;; Keep in mind that the country abbreviations follow ISO-3166. There is
;; a U.S. FIPS that specifies a different set of two-letter country
;; abbreviations.
+;;
+;; Updated by the RIPE Network Coordination Centre.
+;;
+;; Source: ISO 3166 Maintenance Agency
+;; http://www.iso.org/iso/en/prods-services/iso3166ma/02iso-3166-code-lists/list-en1-semic.txt
+;; http://www.iana.org/domain-names.htm
+;; http://www.iana.org/cctld/cctld-whois.htm
+;; Latest change: Mon Jul 8 14:21:59 CEST 2002
(defconst mail-extr-all-top-level-domains
(let ((ob (make-vector 739 0)))
("bz" "Belize")
("ca" "Canada")
("cc" "Cocos (Keeling) Islands")
- ("cd" "The Democratic Republic of The Congo")
+ ("cd" "Congo" "The Democratic Republic of the %s")
("cf" "Central African Republic")
("cg" "Congo")
("ch" "Switzerland" "The Swiss Confederation")
("gp" "Guadeloupe (Fr.)")
("gq" "Equatorial Guinea")
("gr" "Greece" "The Hellenic Republic (%s)")
- ("gs" "South Georgia And The South Sandwich Islands")
+ ("gs" "South Georgia and The South Sandwich Islands")
("gt" "Guatemala")
("gu" "Guam (U.S.)")
("gw" "Guinea-Bissau")
("gy" "Guyana")
("hk" "Hong Kong")
- ("hm" "Heard Island And Mcdonald Islands")
+ ("hm" "Heard Island and Mcdonald Islands")
("hn" "Honduras")
("hr" "Croatia" "Croatia (Hrvatska)")
("ht" "Haiti")
("kr" "Korea (South)" "Republic of Korea")
("kw" "Kuwait")
("ky" "Cayman Islands")
- ("kz" "Kazakstan")
+ ("kz" "Kazakhstan")
("la" "Lao People's Democratic Republic")
("lb" "Lebanon")
("lc" "Saint Lucia")
("ml" "Mali")
("mm" "Myanmar")
("mn" "Mongolia")
- ("mo" "Macau")
+ ("mo" "Macao")
("mp" "Northern Mariana Islands")
("mq" "Martinique")
("mr" "Mauritania")
("mv" "Maldives")
("mw" "Malawi")
("mx" "Mexico" "The United Mexican States")
- ("my" "Malaysia" "%s (changed to Myanmar?)") ;???
+ ("my" "Malaysia")
("mz" "Mozambique")
("na" "Namibia")
("nc" "New Caledonia (Fr.)")
("om" "Oman")
("pa" "Panama")
("pe" "Peru")
- ("pf" "Polynesia (Fr.)")
+ ("pf" "French Polynesia")
("pg" "Papua New Guinea")
("ph" "Philippines" "The Republic of the %s")
("pk" "Pakistan")
("sn" "Senegal")
("so" "Somalia")
("sr" "Suriname")
- ("st" "Sao Tome And Principe")
+ ("st" "Sao Tome and Principe")
("su" "U.S.S.R." "The Union of Soviet Socialist Republics")
("sv" "El Salvador")
("sy" "Syrian Arab Republic")
("sz" "Swaziland")
- ("tc" "Turks And Caicos Islands")
+ ("tc" "Turks and Caicos Islands")
("td" "Chad")
("tf" "French Southern Territories")
("tg" "Togo")
("th" "Thailand" "The Kingdom of %s")
("tj" "Tajikistan")
("tk" "Tokelau")
+ ("tl" "East Timor")
("tm" "Turkmenistan")
("tn" "Tunisia")
("to" "Tonga")
("uy" "Uruguay" "The Eastern Republic of %s")
("uz" "Uzbekistan")
("va" "Holy See (Vatican City State)")
- ("vc" "St. Vincent and the Grenadines")
+ ("vc" "Saint Vincent and the Grenadines")
("ve" "Venezuela" "The Republic of %s")
("vg" "Virgin Islands, British")
("vi" "Virgin Islands, U.S.")
("za" "South Africa" "The Republic of %s")
("zm" "Zambia")
("zw" "Zimbabwe" "Republic of %s")
- ;; Special top-level domains:
- ("arpa" t "Advanced Research Projects Agency (U.S. DoD)")
- ("bitnet" t "Because It's Time NET")
+ ;; Generic Domains:
+ ("aero" t "Air Transport Industry")
+ ("biz" t "Businesses")
("com" t "Commercial")
- ("edu" t "Educational")
- ("gov" t "Government (U.S.)")
- ("int" t "International (NATO)")
- ("mil" t "Military (U.S.)")
- ("nato" t "North Atlantic Treaty Organization")
+ ("coop" t "Cooperative Associations")
+ ("info" t "Info")
+ ("museum" t "Museums")
+ ("name" t "Individuals")
("net" t "Network")
("org" t "Non-profit Organization")
- ;;("unter-dom" t "? (Ger.)")
+ ;;("pro" t "Credentialed professionals")
+ ;;("bitnet" t "Because It's Time NET")
+ ("gov" t "United States Government")
+ ("edu" t "Educational")
+ ("mil" t "United States Military")
+ ("int" t "International Treaties")
+ ;;("nato" t "North Atlantic Treaty Organization")
("uucp" t "Unix to Unix CoPy")
- ;;("fipnet" nil "?")
+ ;; Infrastructure Domains:
+ ("arpa" t "Advanced Research Projects Agency (U.S. DoD)")
))
ob))
\f
;(let ((all nil))
; (mapatoms #'(lambda (x)
-; (if (and (boundp x)
+; (if (and (boundp x)
; (string-match "^mail-extr-" (symbol-name x)))
; (setq all (cons x all)))))
; (setq all (sort all #'string-lessp))