Mastering Regular Expressions
Listings from page 208
Chapter 5; page 208 (download)
// One hostname label: a lone alphanumeric, or a run of alphanumerics and
// hyphens that both starts and ends with an alphanumeric. The (?i: ... )
// wrapper makes just this piece case-insensitive.
String SubDomain = "(?i:[a-z0-9]|[a-z0-9][-a-z0-9]*[a-z0-9])";

// Accepted top-level domains. (?x-i: ... ) turns free-spacing mode on and
// case-insensitivity off for the group, so only lowercase TLDs match.
// Every fragment is glued to the next with " \n". The empty last element
// keeps the trailing " \n" after the closing parenthesis; that whitespace
// sits outside this group's own x flag and is only ignored because Url
// embeds it inside its own (?x: ... ) group.
String TopDomains = String.join(" \n",
    "(?x-i:com\\b",
    " |edu\\b",
    " |biz\\b",
    " |in(?:t|fo)\\b",
    " |mil\\b",
    " |net\\b",
    " |org\\b",
    " |[a-z][a-z]\\b", // country codes
    ")",
    "");

// A hostname: one or more dot-terminated labels followed by a top-level domain.
String Hostname = "(?:" + SubDomain + "\\.)+" + TopDomains;

// Character-class contents (not complete classes) used to build the path part.
// NOT_IN  - characters that may not appear anywhere in a path.
// NOT_END - punctuation that may appear inside a path but may not end it.
String NOT_IN = ";\"'<>()\\[\\]{}\\s\\x7F-\\xFF";
String NOT_END = "!.,?";

// ANYWHERE matches one character that is in neither list above;
// EMBEDDED matches one of the NOT_END punctuation characters.
String ANYWHERE = "[^" + NOT_IN + NOT_END + "]";
String EMBEDDED = "[" + NOT_END + "]";

// Path: a slash, then ANYWHERE characters; any run of EMBEDDED punctuation
// must be followed by at least one ANYWHERE character, so the match can
// never end on punctuation from NOT_END.
String UrlPath = "/" + ANYWHERE + "*(" + EMBEDDED + "+" + ANYWHERE + "+)*";

// The complete URL expression, written in free-spacing mode. Lines inside the
// regex that start with '#' are regex comments and are part of the string.
// As with TopDomains, fragments are joined with " \n"; the empty element
// reproduces the blank line between the port and path sections.
String Url = String.join(" \n",
    "(?x:",
    " \\b",
    " ## match the hostname part",
    " (",
    " (?: ftp | http s? ): // [-\\w]+(\\.\\w[-\\w]*)+",
    " |",
    " " + Hostname,
    " )",
    " # allow optional port",
    " (?: :\\d+ )?",
    "",
    " # rest of url is optional, and begins with /",
    " (?: " + UrlPath + ")?",
    ")");

// Compile the assembled expression text into a reusable Pattern object.
Pattern UrlRegex = Pattern.compile(Url);
// UrlRegex is now ready to be applied to raw text to find URLs.
Fetch additional Third-Edition listings and data: