URIScanner xref

View Javadoc

1   /****************************************************************
2    * Licensed to the Apache Software Foundation (ASF) under one   *
3    * or more contributor license agreements.  See the NOTICE file *
4    * distributed with this work for additional information        *
5    * regarding copyright ownership.  The ASF licenses this file   *
6    * to you under the Apache License, Version 2.0 (the            *
7    * "License"); you may not use this file except in compliance   *
8    * with the License.  You may obtain a copy of the License at   *
9    *                                                              *
10   *   http://www.apache.org/licenses/LICENSE-2.0                 *
11   *                                                              *
12   * Unless required by applicable law or agreed to in writing,   *
13   * software distributed under the License is distributed on an  *
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
15   * KIND, either express or implied.  See the License for the    *
16   * specific language governing permissions and limitations      *
17   * under the License.                                           *
18   ****************************************************************/
19  
20  
21  
22  
23  package org.apache.james.smtpserver.urirbl;
24  
25  import java.util.HashSet;
26  import java.util.Iterator;
27  import java.util.regex.*;
28  import java.net.URI;
29  import java.net.URISyntaxException;
30  
31  public class URIScanner {
32  
33      /* These regular expressions "inspired" by Spamassassin */
34      static private final String reserved = ";/?:@&=+$,[]\\#|";
35  
36      static private final String reservedNoColon = ";/?@&=+$,[]\\#|";
37  
38      static private final String mark = "-_.!~*'()";
39  
40      static private final String unreserved = "A-Za-z0-9" + escape(mark)
41          + "\\x00-\\x08\\x0b\\x0c\\x0e-\\x1f";
42  
43      static private final String uricSet = escape(reserved) + unreserved + "%";
44  
45      static private final String uricNoColon = escape(reservedNoColon)
46          + unreserved + "%";
47  
48      static private final String schemeRE = "(?-xism:(?:https?|ftp|mailto|javascript|file))";
49  
50      static private final String schemelessRE = "(?-xism:(?<![.=])(?:(?i)www\\d*\\.|(?i)ftp\\.))";
51  
52      static private final String uriRE = "(?-xism:\\b(?:" + schemeRE + ":["
53          + uricNoColon + "]|" + schemelessRE + ")[" + uricSet + "#]*)";
54  
55      /** Pre-compiled pattern that matches URIs */
56      static private final Pattern uriPattern = Pattern.compile(uriRE);
57  
58      /** Pre-compiled pattern that matches URI scheme strings */
59      static private final Pattern schemePattern = Pattern.compile("^" + schemeRE
60          + ":");
61  
62      /** Pre-compiled pattern used to cleanup a found URI string */
63      static private final Pattern uriCleanup = Pattern.compile("^<(.*)>$");
64  
65      /** Pre-compiled pattern used to cleanup a found URI string */
66      static private final Pattern uriCleanup2 = Pattern.compile("[\\]\\)>#]$");
67  
68      /** Pre-compile pattern for identifying "mailto" patterns */
69      static private final Pattern uriCleanup3 = Pattern
70          .compile("^(?i)mailto:([^\\/]{2})(.*)$");
71  
72      /* These regular expressions also "inspired" by Spamassassin */
73      static private final String esc = "\\\\";
74  
75      static private final String period = "\\.";
76  
77      static private final String space = "\\040";
78  
79      static private final String open_br = "\\[";
80  
81      static private final String close_br = "\\]";
82  
83      static private final String nonASCII = "\\x80-\\xff";
84  
85      static private final String ctrl = "\\000-\\037";
86  
87      static private final String cr_list = "\\n\\015";
88  
89      static private final String qtext = "[^" + esc + nonASCII + cr_list + "\"]";
90  
91      static private final String dtext = "[^" + esc + nonASCII + cr_list
92          + open_br + close_br + "]";
93  
94      static private final String quoted_pair = esc + "[^" + nonASCII + "]";
95  
96      static private final String atom_char = "[^(" + space + ")<>@,;:\"." + esc
97          + open_br + close_br + ctrl + nonASCII + "]";
98  
99      static private final String atom = "(?>" + atom_char + "+)";
100 
101     static private final String quoted_str = "\"" + qtext + "*(?:"
102         + quoted_pair + qtext + "*)*\"";
103 
104     static private final String word = "(?:" + atom + "|" + quoted_str + ")";
105 
106     static private final String local_part = word + "(?:" + period + word
107         + ")*";
108 
109     static private final String label = "[A-Za-z\\d](?:[A-Za-z\\d-]*[A-Za-z\\d])?";
110 
111     static private final String domain_ref = label + "(?:" + period + label
112         + ")*";
113 
114     static private final String domain_lit = open_br + "(?:" + dtext + "|"
115         + quoted_pair + ")*" + close_br;
116 
117     static private final String domain = "(?:" + domain_ref + "|" + domain_lit
118         + ")";
119 
120     static private final String Addr_spec_re = "(?-xism:" + local_part
121         + "\\s*\\@\\s*" + domain + ")";
122 
123     /** Pre-compiled pattern for matching "schemeless" mailto strings */
124     static private final Pattern emailAddrPattern = Pattern
125         .compile(Addr_spec_re);
126 
127     /** Simple reqular expression to match an octet part of an IP address */
128     static private final String octet = "(?:[1-2][0-9][0-9])|(?:[1-9][0-9])|(?:[0-9])";
129 
130     /** Simple regular expression to match a part of a domain string in the
131      TLDLookup cache. */
132     static private final String tld = "[A-Za-z0-9\\-]*";
133 
134     /** Simple regular expression that matches a two-part TLD */
135     static private final String tld2 = tld + "\\." + tld;
136 
137     /** Simple regular expression that matches a three-part TLD */
138     static private final String tld3 = tld + "\\." + tld + "\\." + tld;
139 
140     /** Regular expression that matches and captures parts of a possible 
141      one-part TLD domain string */
142     static private final String tldCap = "(" + tld + "\\.(" + tld + "))$";
143 
144     /** Regular expression that matches and captures parts of a possible 
145      two-part TLD domain string */
146     static private final String tld2Cap = "(" + tld + "\\.(" + tld2 + "))$";
147 
148     /** Regular expression that matches and captures parts of a possible 
149      three-part TLD domain string */
150     static private final String tld3Cap = "(" + tld + "\\.(" + tld3 + "))$";
151 
152     /** Regular expression that matches and captures parts of an IP address */
153     static private final String ipCap = "((" + octet + ")\\.(" + octet
154         + ")\\.(" + octet + ")\\.(" + octet + "))$";
155 
156     /** Pre-compiled pattern that matches IP addresses */
157     static private final Pattern ipCapPattern = Pattern.compile(ipCap);
158 
159     /** Pre-compiled pattern that matches domain string that is possibly
160      contained in a one-part TLD */
161     static private final Pattern tldCapPattern = Pattern.compile(tldCap);
162 
163     /** Pre-compiled pattern that matches domain string that is possibly
164      contained in a two-part TLD */
165     static private final Pattern tld2CapPattern = Pattern.compile(tld2Cap);
166 
167     /** Pre-compiled pattern that matches domain string that is possibly
168      contained in a three-part TLD */
169     static private final Pattern tld3CapPattern = Pattern.compile(tld3Cap);
170  
171     /** controls testing/debug output */
172     static private boolean testing = false;
173 
174     /**
175      * Scans a character sequence for URIs. Then add all unique domain strings 
176      * derived from those found URIs to the supplied HashSet.
177      * <p>
178      * This function calls scanContentForHosts() to grab all the host strings.
179      * Then it calls domainFromHost() on each host string found to distill them
180      * to their basic "registrar" domains. 
181      *
182      * @param domains a HashSet to be populated with all domain strings found in
183      *        the content
184      * @param content a character sequence to be scanned for URIs
185      * @return newDomains the domains which were extracted
186      */
187     static public HashSet scanContentForDomains(HashSet domains, CharSequence content) {
188         HashSet newDomains = new HashSet();
189         HashSet hosts = scanContentForHosts(content);
190         for (Iterator i = hosts.iterator(); i.hasNext();) {
191             String domain = domainFromHost((String) i.next());
192     
193             if (null != domain) {
194                 if (false == domains.contains(domain)) {
195                     newDomains.add(domain);
196                 }
197             }
198         }
199         return newDomains;
200     }
201 
202     /**
203      * Scans a character sequence for URIs. Then returns all unique host strings 
204      * derived from those found URIs in a HashSet
205      *
206      * @param content a character sequence to be scanned for URIs
207      * @return a HashSet containing host strings
208      */
209     static protected HashSet scanContentForHosts(CharSequence content) {
210         HashSet set = new HashSet();
211         
212         // look for URIs
213         Matcher mat = uriPattern.matcher(content);
214         while (mat.find()) {
215             String found = mat.group();
216             Matcher cleanMat = uriCleanup.matcher(found);
217             if (cleanMat.find()) {
218                 found = cleanMat.group(1);
219             }
220                 
221             cleanMat = uriCleanup2.matcher(found);
222             if (cleanMat.find()) {
223                found = cleanMat.replaceAll("");
224             }
225                 
226             cleanMat = uriCleanup3.matcher(found);
227             if (cleanMat.find()) {
228                 found = "mailto://" + cleanMat.group(1) + cleanMat.group(2);
229             }
230         
231             cleanMat = schemePattern.matcher(found);
232             if (!cleanMat.find()) {
233                 if (found.matches("^(?i)www\\d*\\..*")) {
234                     found = "http://" + found;
235                 } else if (found.matches("^(?i)ftp\\..*")) {
236                     found = "ftp://" + found;
237                 }
238             }
239        
240             String host = hostFromUriStr(found);
241             if (null != host) {
242                 host = host.toLowerCase();
243                 if (false == set.contains(host)) {
244                     set.add(host);
245                 }
246             }
247         }
248 
249         // look for "schemeless" email addresses, too
250         mat = emailAddrPattern.matcher(content);
251         while (mat.find()) {
252             String found = mat.group();
253             debugOut("******** mailfound=\"" + found + "\"");
254             found = "mailto://" + found;
255             debugOut("*******6 mailfoundfound=\"" + found
256                 + "\" after cleanup 6");
257           
258             String host = hostFromUriStr(found);
259             if (null != host) {
260                 
261                 host = host.toLowerCase();
262                 if (false == set.contains(host)) {
263                     set.add(host);
264                 }
265             }
266         }
267         return set;
268     }
269 
270     /**
271      * Extracts and returns the host portion of URI string.
272      *
273      * This function uses java.net.URI.
274      *
275      * @param uriStr a string containing a URI
276      * @return the host portion of the supplied URI, null if no host string
277      *         could be found
278      */
279     static protected String hostFromUriStr(String uriStr) {
280         debugOut("hostFromUriStr(\"" + uriStr + "\")");
281         String host = null;
282         URI uri;
283         try {
284             uri = new URI(uriStr);
285             host = uri.getHost();
286         } catch (URISyntaxException e) {
287             debugOut(e.getMessage());
288         }
289         return host;
290     }
291 
292     /**
293      * Extracts and returns the registrar domain portion of a host string. This
294      * funtion checks all known multi-part TLDs to make sure that registrar
295      * domain is complete. For example, if the supplied host string is
296      * "subdomain.example.co.uk", the TLD is "co.uk" and not "uk". Therefore,
297      * the correct registrar domain is not "co.uk", but "example.co.uk". If the
298      * domain string is an IP address, then the octets are returned in reverse
299      * order.
300      *
301      * @param host a string containing a host name
302      * @return the registrar domain portion of the supplied host string
303      */
304     static protected String domainFromHost(String host) {
305         debugOut("domainFromHost(\"" + host + "\")");
306         String domain = null;
307         Matcher mat;
308             
309         // IP addrs 
310         mat = ipCapPattern.matcher(host);
311         if (mat.find()) {
312             // reverse the octets now
313             domain = mat.group(5) + "." + mat.group(4) + "." + mat.group(3) + "." + mat.group(2);
314             debugOut("domain=\"" + domain + "\"");
315             return domain;
316         }
317 
318         // 3-part TLDs
319         mat = tld3CapPattern.matcher(host);
320         if (mat.find()) {
321             String tld = mat.group(2);
322             if (TLDLookup.isThreePartTLD(tld)) {
323                 domain = mat.group(1);
324                 debugOut("domain=\"" + domain + ", tld=\"" + tld + "\"");
325                 return domain;
326             }
327         }
328 
329         // 2-part TLDs
330         mat = tld2CapPattern.matcher(host);
331         if (mat.find()) {
332             String tld = mat.group(2);
333             if (TLDLookup.isTwoPartTLD(tld)) {
334                 domain = mat.group(1);
335                 debugOut("domain=\"" + domain + ", tld=\"" + tld + "\"");
336                 return domain;
337             }
338         }
339 
340         // 1-part TLDs
341         mat = tldCapPattern.matcher(host);
342         if (mat.find()) {
343             String tld = mat.group(2);
344             domain = mat.group(1);
345             debugOut("domain=\"" + domain + ", tld=\"" + tld + "\"");
346             return domain;
347         }
348         return domain;
349     }
350 
351     /**
352      * Debugging output
353      */
354     private static void debugOut(String msg) {
355         if (true == testing) {
356             System.out.println(msg);
357         }
358     }
359 
360     /**
361      * A utility function that "escapes" special characters in a string.
362      *
363      * @param str a string to be processed
364      * @return modified "escaped" string
365      */
366     private static String escape(String str) {
367         StringBuffer buffer = new StringBuffer();
368         for (int i = 0; i < str.length(); i++) {
369             char ch = str.charAt(i);
370             if (Character.isDigit(ch) || Character.isUpperCase(ch) || Character.isLowerCase(ch) || ch == '_') {
371                 buffer.append(ch);
372             } else {
373                 buffer.append("\\");
374                 buffer.append(ch);
375             }
376         }
377         return buffer.toString();
378     }
379 }