1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.apache.james.smtpserver.urirbl;
24
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.regex.*;
30
31 public class URIScanner {
32
33
34 static private final String reserved = ";/?:@&=+$,[]\\#|";
35
36 static private final String reservedNoColon = ";/?@&=+$,[]\\#|";
37
38 static private final String mark = "-_.!~*'()";
39
40 static private final String unreserved = "A-Za-z0-9" + escape(mark)
41 + "\\x00-\\x08\\x0b\\x0c\\x0e-\\x1f";
42
43 static private final String uricSet = escape(reserved) + unreserved + "%";
44
45 static private final String uricNoColon = escape(reservedNoColon)
46 + unreserved + "%";
47
48 static private final String schemeRE = "(?-xism:(?:https?|ftp|mailto|javascript|file))";
49
50 static private final String schemelessRE = "(?-xism:(?<![.=])(?:(?i)www\\d*\\.|(?i)ftp\\.))";
51
52 static private final String uriRE = "(?-xism:\\b(?:" + schemeRE + ":["
53 + uricNoColon + "]|" + schemelessRE + ")[" + uricSet + "#]*)";
54
55
56 static private final Pattern uriPattern = Pattern.compile(uriRE);
57
58
59 static private final Pattern schemePattern = Pattern.compile("^" + schemeRE
60 + ":");
61
62
63 static private final Pattern uriCleanup = Pattern.compile("^<(.*)>$");
64
65
66 static private final Pattern uriCleanup2 = Pattern.compile("[\\]\\)>#]$");
67
68
69 static private final Pattern uriCleanup3 = Pattern
70 .compile("^(?i)mailto:([^\\/]{2})(.*)$");
71
72
73 static private final String esc = "\\\\";
74
75 static private final String period = "\\.";
76
77 static private final String space = "\\040";
78
79 static private final String open_br = "\\[";
80
81 static private final String close_br = "\\]";
82
83 static private final String nonASCII = "\\x80-\\xff";
84
85 static private final String ctrl = "\\000-\\037";
86
87 static private final String cr_list = "\\n\\015";
88
89 static private final String qtext = "[^" + esc + nonASCII + cr_list + "\"]";
90
91 static private final String dtext = "[^" + esc + nonASCII + cr_list
92 + open_br + close_br + "]";
93
94 static private final String quoted_pair = esc + "[^" + nonASCII + "]";
95
96 static private final String atom_char = "[^(" + space + ")<>@,;:\"." + esc
97 + open_br + close_br + ctrl + nonASCII + "]";
98
99 static private final String atom = "(?>" + atom_char + "+)";
100
101 static private final String quoted_str = "\"" + qtext + "*(?:"
102 + quoted_pair + qtext + "*)*\"";
103
104 static private final String word = "(?:" + atom + "|" + quoted_str + ")";
105
106 static private final String local_part = word + "(?:" + period + word
107 + ")*";
108
109 static private final String label = "[A-Za-z\\d](?:[A-Za-z\\d-]*[A-Za-z\\d])?";
110
111 static private final String domain_ref = label + "(?:" + period + label
112 + ")*";
113
114 static private final String domain_lit = open_br + "(?:" + dtext + "|"
115 + quoted_pair + ")*" + close_br;
116
117 static private final String domain = "(?:" + domain_ref + "|" + domain_lit
118 + ")";
119
120 static private final String Addr_spec_re = "(?-xism:" + local_part
121 + "\\s*\\@\\s*" + domain + ")";
122
123
124 static private final Pattern emailAddrPattern = Pattern
125 .compile(Addr_spec_re);
126
127
128 static private final String octet = "(?:[1-2][0-9][0-9])|(?:[1-9][0-9])|(?:[0-9])";
129
130
131
132 static private final String tld = "[A-Za-z0-9\\-]*";
133
134
135 static private final String tld2 = tld + "\\." + tld;
136
137
138 static private final String tld3 = tld + "\\." + tld + "\\." + tld;
139
140
141
142 static private final String tldCap = "(" + tld + "\\.(" + tld + "))$";
143
144
145
146 static private final String tld2Cap = "(" + tld + "\\.(" + tld2 + "))$";
147
148
149
150 static private final String tld3Cap = "(" + tld + "\\.(" + tld3 + "))$";
151
152
153 static private final String ipCap = "((" + octet + ")\\.(" + octet
154 + ")\\.(" + octet + ")\\.(" + octet + "))$";
155
156
157 static private final Pattern ipCapPattern = Pattern.compile(ipCap);
158
159
160
161 static private final Pattern tldCapPattern = Pattern.compile(tldCap);
162
163
164
165 static private final Pattern tld2CapPattern = Pattern.compile(tld2Cap);
166
167
168
169 static private final Pattern tld3CapPattern = Pattern.compile(tld3Cap);
170
171
172 static private boolean testing = false;
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187 static public HashSet scanContentForDomains(HashSet domains, CharSequence content) {
188 HashSet newDomains = new HashSet();
189 HashSet hosts = scanContentForHosts(content);
190 for (Iterator i = hosts.iterator(); i.hasNext();) {
191 String domain = domainFromHost((String) i.next());
192
193 if (null != domain) {
194 if (false == domains.contains(domain)) {
195 newDomains.add(domain);
196 }
197 }
198 }
199 return newDomains;
200 }
201
202
203
204
205
206
207
208
209 static protected HashSet scanContentForHosts(CharSequence content) {
210 HashSet set = new HashSet();
211
212
213 Matcher mat = uriPattern.matcher(content);
214 while (mat.find()) {
215 String found = mat.group();
216 Matcher cleanMat = uriCleanup.matcher(found);
217 if (cleanMat.find()) {
218 found = cleanMat.group(1);
219 }
220
221 cleanMat = uriCleanup2.matcher(found);
222 if (cleanMat.find()) {
223 found = cleanMat.replaceAll("");
224 }
225
226 cleanMat = uriCleanup3.matcher(found);
227 if (cleanMat.find()) {
228 found = "mailto://" + cleanMat.group(1) + cleanMat.group(2);
229 }
230
231 cleanMat = schemePattern.matcher(found);
232 if (!cleanMat.find()) {
233 if (found.matches("^(?i)www\\d*\\..*")) {
234 found = "http://" + found;
235 } else if (found.matches("^(?i)ftp\\..*")) {
236 found = "ftp://" + found;
237 }
238 }
239
240 String host = hostFromUriStr(found);
241 if (null != host) {
242 host = host.toLowerCase();
243 if (false == set.contains(host)) {
244 set.add(host);
245 }
246 }
247 }
248
249
250 mat = emailAddrPattern.matcher(content);
251 while (mat.find()) {
252 String found = mat.group();
253 debugOut("******** mailfound=\"" + found + "\"");
254 found = "mailto://" + found;
255 debugOut("*******6 mailfoundfound=\"" + found
256 + "\" after cleanup 6");
257
258 String host = hostFromUriStr(found);
259 if (null != host) {
260
261 host = host.toLowerCase();
262 if (false == set.contains(host)) {
263 set.add(host);
264 }
265 }
266 }
267 return set;
268 }
269
270
271
272
273
274
275
276
277
278
279 static protected String hostFromUriStr(String uriStr) {
280 debugOut("hostFromUriStr(\"" + uriStr + "\")");
281 String host = null;
282 URI uri;
283 try {
284 uri = new URI(uriStr);
285 host = uri.getHost();
286 } catch (URISyntaxException e) {
287 debugOut(e.getMessage());
288 }
289 return host;
290 }
291
292
293
294
295
296
297
298
299
300
301
302
303
304 static protected String domainFromHost(String host) {
305 debugOut("domainFromHost(\"" + host + "\")");
306 String domain = null;
307 Matcher mat;
308
309
310 mat = ipCapPattern.matcher(host);
311 if (mat.find()) {
312
313 domain = mat.group(5) + "." + mat.group(4) + "." + mat.group(3) + "." + mat.group(2);
314 debugOut("domain=\"" + domain + "\"");
315 return domain;
316 }
317
318
319 mat = tld3CapPattern.matcher(host);
320 if (mat.find()) {
321 String tld = mat.group(2);
322 if (TLDLookup.isThreePartTLD(tld)) {
323 domain = mat.group(1);
324 debugOut("domain=\"" + domain + ", tld=\"" + tld + "\"");
325 return domain;
326 }
327 }
328
329
330 mat = tld2CapPattern.matcher(host);
331 if (mat.find()) {
332 String tld = mat.group(2);
333 if (TLDLookup.isTwoPartTLD(tld)) {
334 domain = mat.group(1);
335 debugOut("domain=\"" + domain + ", tld=\"" + tld + "\"");
336 return domain;
337 }
338 }
339
340
341 mat = tldCapPattern.matcher(host);
342 if (mat.find()) {
343 String tld = mat.group(2);
344 domain = mat.group(1);
345 debugOut("domain=\"" + domain + ", tld=\"" + tld + "\"");
346 return domain;
347 }
348 return domain;
349 }
350
351
352
353
354 private static void debugOut(String msg) {
355 if (true == testing) {
356 System.out.println(msg);
357 }
358 }
359
360
361
362
363
364
365
366 private static String escape(String str) {
367 StringBuffer buffer = new StringBuffer();
368 for (int i = 0; i < str.length(); i++) {
369 char ch = str.charAt(i);
370 if (Character.isDigit(ch) || Character.isUpperCase(ch) || Character.isLowerCase(ch) || ch == '_') {
371 buffer.append(ch);
372 } else {
373 buffer.append("\\");
374 buffer.append(ch);
375 }
376 }
377 return buffer.toString();
378 }
379 }