id-or-mail.pl
1 | #!/usr/bin/perl
2 | #
3 | # This script looks for message-ids/e-mail addresses (one per line) in STDIN.
4 | # For each it tries to decide if it is a message-id or an e-mail address.
5 | #
6 | # Special thanks to Urs Janßen for his extensive help and numerous suggestions.
7 | # And thanks to all those other people who offered their knowledge.
8 | #
9 | # Another solution:
10 | # https://www.enyo.de/fw/scripts/
11 |
12 | my$ip='(?:[01]?\d\d?|2[0-4]\d|25[0-5])'; # one byte of an IP-address
13 | $ip="(?:$ip\.){3}$ip"; # complete IP-address
14 |
15 | while (<STDIN>) {
16 | print "$1 -> ".&analyze($1)."\n" if (/(<?[^\s<>@]+@[^\s<>@]+>?)/);
17 | }
18 |
19 | sub analyze {
20 | my$test=$_[0];
21 | my($l,$lp,$fqdn,$r)=$test=~/(<?)([^\s<>@]+)@([^\s<>@]+)(>?)/;
22 | $fqdn=lc($fqdn);
23 | my$len=length($lp);
24 | return "mail" unless ($l && $r); # Message-IDs MUST have angle brackets
25 | return "m-id" if ($fqdn eq "4ax.com"); # This domain is for Forte Agent's IDs only
26 | return "m-id" if ($fqdn eq "localhost"); # Crippleware
27 | return "m-id" if ($fqdn=~/^\[?$ip\]?$/); # People don't use IP-addresses as local parts
28 | return "mail" if ($len<8); # Short local parts are unlikely in Message-IDs
29 | return "m-id" if ($fqdn=~/^[^.]+$/); # Crippleware (risk: local address and few people)
30 | return "m-id" if ($lp=~/[\$%!\/=~*'{}^?|]/); # Symbols very unlikely in e-mail addresses
31 | return "m-id" if ($lp=~/[\da-z]{8,}\.fsf/); # Gnus
32 | return "m-id" if ($lp=~/^[A-Z]{28}\./); # unidentified (user-id at the end)
33 | return "m-id" if ($lp=~/^[a-z]{9,24}\.[\da-z]{7}\.pminews$/); # PMINews
34 | return "mail" if ($lp=~/^\d+\.\d+$/ && $fqdn eq "compuserve.com"); # CompuServe
35 | return "mail" if ($lp=~/^[^\d]+$/); # IDs are likely to have digits ...
36 | return "mail" if ($lp=~/^\d+$/ # ... but not only ...
37 | && $fqdn!~'aol\.com$' # ... ... except for AOL ...
38 | && $fqdn!~'virtualsight\.com$' # ... ... and virtualsight ...
39 | && $fqdn!~'zetnet\.' # ... ... and zetnet ...
40 | && $fqdn!~'(?:^|\.)fido'); # ... ... and Fido ...
41 | return "mail" if ($lp=~/^\d+[-#]\d*$/); # ... or blocked.
42 |
43 | return "m-id" if ($lp=~/^[\dA-F]{4,}\.[\dA-F]{4,}/); # various #692227
44 | return "m-id" if ($lp=~/^[\da-f]{4,}\.[\da-f]{4,}/); # unknown #501211
45 | return "m-id" if ($lp=~/^[\da-f]{9}/); # Pluto et al. #241513
46 | return "m-id" if ($lp=~/^[\da-v]{6,7}\.[\da-v]{1,7}\.\d+$/); # Hamster #59222
47 | return "m-id" if ($lp=~/^[a-z](?=[\da-z]*\d[a-z]+\d)[\da-z]{9,11}[\da-f]{2}$/); # supernews (WAG) #58783
48 | return "m-id" if ($lp=~/^slrn[\da-v.]+\.[\da-v]+\.[\da-zA-Z.+_\-]+$/);# slrn #55603
49 | return "m-id" if ($lp=~/^MPG\.[\da-f]{12,24}$/); # Gravity #43462
50 | return "m-id" if ($lp=~/^[\da-z]{6}\.[\da-z]{2,6}\.ln$/); # Leafnode #35780
51 | return "m-id" if ($lp=~/^[\da-f]{8}_\d\d?$/); # DNews #29771
52 | # Dnews actually looks different, but $-part -- if exists -- matched already:
53 | # return "m-id\n" if ($lp=~/^[\da-f]{8}(?:\$\d\d?)?_\d\d?$/); # DNews
54 | return "m-id" if ($lp=~/[-.][\dA-F]{6}\.\d{14}$/); # MT-NewsWatcher #26422
55 | return "m-id" if ($lp=~/^Pine\.[\dA-Z-]{3,4}\.\d\.\d[\dA-Za-z_-]+(?:\..+)?\.\d{12,14}\.[-\da-zA-Z]+$/);# Pine #22114
56 | return "m-id" if ($lp=~/^(?=.*[\d-])[\da-zA-Z-]{10}B($|\.)/); # XP #18862
57 | return "m-id" if ($lp=~/^\d[a-z](?=.*\d)[\da-z]{7,9}$/); # newsguy.com #17554
58 | return "m-id" if ($lp=~/^.{11}8E.{3}$/); # Turnpike #11017
59 | return "m-id" if ($lp=~/^Xns[\dA-F]{8}/); # XNews #9061
60 | return "m-id" if ($lp=~/[PUV]R[\da-zA-Z_-]{0,3}_B[DNOS][\dA-Z](?:$|\.)/); # clari.net (WAG) #8855
61 | return "m-id" if ($lp=~/^[\da-z]{34}$/); # unknown #8005
62 | return "m-id" if ($lp=~/^[\da-z]{7}\.[\da-zA-Z]{8,}N$/); # MacSoup #7939
63 | return "m-id" if ($lp=~/^pan\.[12]\d\d\d\.[01]\d\.[0-3]\d\.[0-2]\d\.[0-5]\d\.[0-5]\d\.\d+\.\d+$/); # pan #5388
64 | return "m-id" if ($lp=~/^(?:\d+Z){3}\d+Y\d+X\d+$/); # usenet.iol.it #4360
65 | return "m-id" if ($lp=~/^VA\.[\da-f]{8}\.[\da-f]{8}$/); # Virtual Access #3839
66 | return "m-id" if ($lp=~/^[\da-zA-Z#]{8}[AB]HA\.\d{2,4}$/); # Microsoft (unclear) #3640
67 | return "m-id" if ($lp=~/^(?=.*(?:\d[A-Z]+\d|[A-Z]\d+[A-Z]))[\dA-Z]{12,14}$/);# unknown (WAG) #3322
68 | return "m-id" if ($lp=~/^[A-Z][\da-zA-Z]{19}[\da-f]{7}(?:$|\.)/); # unknown #3253
69 | return "m-id" if ($lp=~/^[\da-zA-Z]{12}-pn2-[\da-zA-Z]{12}$/); # ProNews #3247
70 | return "m-id" if ($lp=~/^\d\dHW\.[\dA-F]{24}$/); # Hogwasher #3118
71 | return "m-id" if ($lp=~/^M\.\d{10}\.[ABC]\.\d{1,3}$/); # bbs.nsysu.edu.tw #2557
72 | return "m-id" if ($lp=~/^mailman\.\d{10}\.\d{3,5}\./); # Mailman #2375
73 | return "m-id" if ($lp=~/^[\da-f]{1,3}\.[\da-f]{5,8}\.[\da-f]{7,8}$/); # AOL #1892
74 | return "m-id" if ($lp=~/^ee[\da-f]{5}\.-?\d{1,2}$/); # WebX (WAG) #1493
75 | return "m-id" if ($lp=~/^memo\.\d{14}\.[\da-zA-Z]{3,6}$/); # unknown #1380
76 | return "m-id" if ($lp=~/^[\dA-Z]{13}\.\d{9,10}$/); # some anon remailers #1296
77 | return "m-id" if ($lp=~/^na\.[\da-f]{10}\.a\d\d/); # NewsAgent #1170
78 | return "m-id" if ($lp=~/^ant\d{6}.{7}$/); # ANT #1037
79 | return "m-id" if ($lp=~/^(?:cancel|[sS]upersedes)/); # guess what #1023
80 | return "m-id" if ($lp=~/^[\da-fA-F]{8}\.MD-\d\.\d\.\d\d\./); # MicroDot-II #642
81 | return "m-id" if ($lp=~/^[^a-zA-Z0-9_.+-]/); # usual address start #610
82 | return "m-id" if ($lp=~/^(?:[\da-f]+[tin]){3}[\da-f]+$/); # tin #454
83 | # tin actually looks different, but last part -- if exists -- matched already:
84 | # return "m-id\n" if ($lp=~/^(?:[\da-f]+[tin]){3}[\da-f]+(?:%[\da-z.+_\-]+)?$/);
85 | return "m-id" if ($lp=~/^[\dA-F]{8}yf$/); # Yanoff Conduit #264
86 | return "m-id" if ($lp=~/^XFMail\.\d{12,14}\./); # XFMail #225
87 | return "m-id" if ($lp=~/^Y[aA][mM]2NN\.(?:AmigaOS\.|NewsNet\.)?[\dA-F]{7,8}\.[\dA-F]{4,8}$/);# Yam2NN #142
88 |
89 | # case-independent from here on ###################################################################
90 | $lp=lc($lp);
91 | return "m-id" if ($lp=~/^[\da-z]{6}\.[\da-z]{3}$/); # CNews
92 | return "m-id" if ($lp=~/^e[\da-z]{6}-[\da-z]{6}-[\da-z]{2}$/); # exim
93 | return "m-id" if ($lp=~/^.{5}md[\da-z]{7,8}$/); # MicroDot
94 | return "m-id" if ($lp=~/^[12]\d\d\d[01]\d[0-3]\d[0-2]\d[0-5]\d(?:[0-5]\d)?\.[a-z]{1,3}\d+(?:\.[\da-z.+_\-]+)?$/); # mutt (variations)
95 | return "m-id" if ($lp=~/^[12]\d\d\d[01]\d[0-3]\d[0-2]\d[0-5]\d\.[\da-z][\dab][\da-u][\da-n][\da-z]{3}\d{5,6}$/); # sendmail
96 | return "m-id" if ($lp=~/^[12]\d\d\d[01]\d[0-3]\d[0-2]\d[0-5]\d\.[a-x]?[a-z~][a-z]\d{5}$/); # sendmail
97 | # return "m-id" if ($lp=~/^[\da-f]{8}\.[\da-f]{1,8}$/); # Netscape, matched above
98 |
99 | # no we can only guess ############################################################################
100 | return "mail" if ($len==8 && $lp=~/^[a-f]+\d+$/); # just guessing
101 | return "m-id" if ($lp=~/^[\da-f]{8}$/); # just guessing
102 | return "m-id" if ($lp=~/^(?![\d#]+$).*[\da-f]{15,}/); # just guessing
103 | return "m-id" if ($lp=~/^\d{2,}-[\da-f]{8,}-\d+$/); # just guessing (webtv.net)
104 | return "m-id" if ($lp=~/([.-])\d{4,}\1\d{3,}$/); # just guessing
105 | return "m-id" if ($lp=~/^[\da-f]{8,}[._]/); # just guessing
106 | return "m-id" if ($lp=~/^\d{7,8}\.[\da-z]{10}/); # just guessing
107 | return "m-id" if ($lp=~/^[\da-f]{11,}$/); # just guessing
108 | return "m-id" if ($lp=~/^\d{2,4}[.-]\d\d[.-]\d{2,4}.*\d\d[.-]\d\d[.-]\d\d/);# just guessing
109 | return "mmmm"; # When in doubt, say mail.
110 | }

© Boris 'pi' Piwinger,
February 18, 2002