String SubDomain = "(?i:[a-z0-9]|[a-z0-9][-a-z0-9]*[a-z0-9])"; String TopDomains = "(?x-i:com\\b \n" + " |edu\\b \n" + " |biz\\b \n" + " |in(?:t|fo)\\b \n" + " |mil\\b \n" + " |net\\b \n" + " |org\\b \n" + " |[a-z][a-z]\\b \n" + // country codes ") \n"; String Hostname = "(?:" + SubDomain + "\\.)+" + TopDomains; String NOT_IN = ";\"'<>()\\[\\]{}\\s\\x7F-\\xFF"; String NOT_END = "!.,?"; String ANYWHERE = "[^" + NOT_IN + NOT_END + "]"; String EMBEDDED = "[" + NOT_END + "]"; String UrlPath = "/"+ANYWHERE + "*("+EMBEDDED+"+"+ANYWHERE+"+)*"; String Url = "(?x: \n"+ " \\b \n"+ " ## match the hostname part \n"+ " ( \n"+ " (?: ftp | http s? ): // [-\\w]+(\\.\\w[-\\w]*)+ \n"+ " | \n"+ " " + Hostname + " \n"+ " ) \n"+ " # allow optional port \n"+ " (?: :\\d+ )? \n"+ " \n"+ " # rest of url is optional, and begins with / \n"+ " (?: " + UrlPath + ")? \n"+ ")"; // Now convert string we've built up into a real regex object Pattern UrlRegex = Pattern.compile(Url); // Now ready to apply to raw text to find urls . . . ----------------------------------------------------------------------------- Copyright 1997-2024 Jeffrey Friedl