return to the main page Mastering Regular Expressions
Third Edition

Listings from page 208

Chapter 5; page 208 (download)

String SubDomain  = "(?i:[a-z0-9]|[a-z0-9][-a-z0-9]*[a-z0-9])";
String TopDomains = "(?x-i:com\\b        \n" +
                    "     |edu\\b        \n" +
                    "     |biz\\b        \n" +
                    "     |in(?:t|fo)\\b \n" +
                    "     |mil\\b        \n" +
                    "     |net\\b        \n" +
                    "     |org\\b        \n" +
                    "     |[a-z][a-z]\\b \n" + // country codes
                    ")                   \n";
String Hostname = "(?:" + SubDomain + "\\.)+" + TopDomains;

String NOT_IN   = ";\"'<>()\\[\\]{}\\s\\x7F-\\xFF";
String NOT_END  = "!.,?";
String ANYWHERE = "[^" + NOT_IN + NOT_END + "]";
String EMBEDDED = "[" + NOT_END + "]";
String UrlPath  = "/"+ANYWHERE + "*("+EMBEDDED+"+"+ANYWHERE+"+)*";
String Url = 
  "(?x:                                                \n"+
  "  \\b                                               \n"+
  "  ## match the hostname part                        \n"+
  "  (                                                 \n"+
  "    (?: ftp | http s? ): // [-\\w]+(\\.\\w[-\\w]*)+ \n"+
  "   |                                                \n"+
  "    " + Hostname + "                                \n"+
  "  )                                                 \n"+
  "  # allow optional port                             \n"+
  "  (?: :\\d+ )?                                      \n"+
  "                                                    \n"+
  "  # rest of url is optional, and begins with /      \n"+
  " (?: " + UrlPath + ")?                              \n"+
  ")";

// Now convert string we've built up into a real regex object
Pattern UrlRegex = Pattern.compile(Url);
// Now ready to apply to raw text to find urls . . . 

Copyright © 2024 Jeffrey Friedl

Fetch additional Third-Edition listings and data:

Fetch listings from page(s)