Mastering Regular Expressions, Third Edition, listings from pages 1 through 484

Mastering Regular Expressions
Third Edition

Listings from pages 1 through 484
Download all listings shown below.

Chapter 1; page 14 (download)

% egrep  '^(From|Subject|Date): '  mailbox
From: elvis@tabloid.org (The King)
Subject: be seein' ya around
Date: Mon, 23 Oct 2006 11:04:13
From: The Prez <president@whitehouse.gov>
Date: Wed, 25 Oct 2006 8:36:24
Subject: now, about your vote...

Chapter 1; page 15 (download)

% egrep -i  '^(From|Subject|Date): '  mailbox

Chapter 1; page 15 (download)

SUBJECT: MAKE MONEY FAST

Chapter 1; page 22 (download)

% egrep -i '\<([a-z]+) +\1\>' files...

Chapter 1; page 25 (download)

http://hostname/path.html

Chapter 2; page 35 (download)

# Report doubled words (such as "the the") found in the files named on the
# command line. Both copies of each doubled word are highlighted, and every
# reported line is prefixed with the name of the file being read.
$/ = ".\n";   # Read input in chunks that end with a period-newline combination.
while (<>) {
  # Wrap each copy of a doubled word in the escape sequences \e[7m ... \e[m.
  # The copies may be separated by whitespace and/or <TAG>-like items, and
  # /i lets them differ in capitalization. If nothing was doubled in this
  # chunk, the substitution fails and the chunk is skipped.
  next if !s/\b([a-z]+)((?:\s|<[^>]+>)+)(\1\b)/\e[7m$1\e[m$2\e[7m$3\e[m/ig;
  s/^(?:[^\e]*\n)+//mg;   # Remove any unmarked lines.
  s/^/$ARGV: /mg;      # Ensure lines begin with filename.
  print;
}

Chapter 2; page 37 (download)

$celsius = 30;
$fahrenheit = ($celsius * 9 / 5) + 32;  # calculate Fahrenheit
print "$celsius C is $fahrenheit F.\n";# report both temperatures

Chapter 2; page 37 (download)

# Print a Celsius-to-Fahrenheit table, from 20 C up to 45 C in steps of 5.
for ($celsius = 20; $celsius <= 45; $celsius += 5) {
  $fahrenheit = $celsius * 9 / 5 + 32;   # convert this row's temperature
  print "$celsius C is $fahrenheit F.\n";
}

Chapter 2; page 38 (download)

if ($reply =~ m/^[0-9]+$/) {
    print "only digits\n";
} else {
    print "not only digits\n";
}

Chapter 2; page 39 (download)

if ($reply =~ m/^[0-9]+$/)

Chapter 2; page 39 (download)

print "Enter a temperature in Celsius:\n";
$celsius = <STDIN>; # this reads one line from the user
chomp($celsius);   # this removes the ending newline from $celsius

if ($celsius =~ m/^[0-9]+$/) {
    $fahrenheit = ($celsius * 9 / 5) + 32; # calculate Fahrenheit
    print "$celsius C is $fahrenheit F\n";
} else {
    print "Expecting a number, so I don't understand \"$celsius\".\n";
}

Chapter 2; page 40 (download)

printf "%.2f C is %.2f F\n", $celsius, $fahrenheit;

Chapter 2; page 40 (download)

if ($celsius =~ m/^[-+]?[0-9]+(\.[0-9]*)?$/)  {

Chapter 2; page 41 (download)

$celsius =~ m/^[-+]?[0-9]+[CF]$/
$celsius =~ m/^([-+]?[0-9]+)([CF])$/

Chapter 2; page 42 (download)

print "Enter a temperature (e.g., 32F, 100C):\n";
$input = <STDIN>; # This reads one line from the user.
chomp($input);    # This removes the ending newline from $input.

if ($input =~ m/^([-+]?[0-9]+)([CF])$/)
{
    # If we get in here, we had a match. $1 is the number, $2 is "C" or "F".
    $InputNum = $1;  # Save to named variables to make the ...
    $type     = $2;  # ... rest of the program easier to read.

    if ($type eq "C") {      # `eq' tests if two strings are equal
        # The input was Celsius, so calculate Fahrenheit
        $celsius = $InputNum;
        $fahrenheit = ($celsius * 9 / 5) + 32;
    } else {
        # If not "C", it must be an "F", so calculate Celsius
        $fahrenheit = $InputNum;
        $celsius = ($fahrenheit - 32) * 5 / 9;
    }
    # At this point we have both temperatures, so display the results:
    printf "%.2f C is %.2f F\n", $celsius, $fahrenheit;
} else {
    # The initial regex did not match, so issue a warning.
    print "Expecting a number followed by \"C\" or \"F\",\n";
    print "so I don't understand \"$input\".\n";
}

Chapter 2; page 43 (download)

if ($input =~ m/^([-+]?[0-9]+(\.[0-9]*)?)([CF])$/)

Chapter 2; page 44 (download)

if ($input =~ m/^([-+]?[0-9]+(?:\.[0-9]*)?)([CF])$/)

Chapter 2; page 44 (download)

if ($input =~ m/^([-+]?[0-9]+(\.[0-9]*)?) *([CF])$/)

Chapter 2; page 47 (download)

$input =~ m/^([-+]?[0-9]+(\.[0-9]*)?)\s*([CF])$/

Chapter 2; page 47 (download)

$input =~ m/^([-+]?[0-9]+(\.[0-9]*)?)\s*([CF])$/i

Chapter 2; page 48 (download)

if ($input =~ m/^([-+]?[0-9]+(\.[0-9]*)?)\s*([CF])$/i)
{
 
    $type = $3;  # save to a named variable to make rest of program more readable

    if ($type eq "C") { # `eq' tests if two strings are equal
 
    } else {

Chapter 2; page 48 (download)

if ($type eq "C" or $type eq "c") {

Chapter 2; page 48 (download)

if ($type =~ m/c/i) {

Chapter 2; page 48 (download)

print "Enter a temperature (e.g., 32F, 100C):\n";
chomp($input = <STDIN>);   # Read one line from the user and drop its newline.

# Expect an optionally signed number (with an optional fractional part),
# optional whitespace, and then a "C" or "F" in either case.
if ($input !~ m/^([-+]?[0-9]+(\.[0-9]*)?)\s*([CF])$/i) {
    # The input was not recognized, so explain what was expected.
    print "Expecting a number followed by \"C\" or \"F\",\n";
    print "so I don't understand \"$input\".\n";
} else {
    # The match succeeded: $1 is the number and $3 is the unit letter
    # ($2 is merely the fractional part of the number).
    ($InputNum, $type) = ($1, $3);

    if (lc($type) eq "c") {
        # Celsius was given, so derive Fahrenheit from it.
        $celsius = $InputNum;
        $fahrenheit = ($celsius * 9 / 5) + 32;
    } else {
        # The only other possibility is Fahrenheit, so derive Celsius.
        $fahrenheit = $InputNum;
        $celsius = ($fahrenheit - 32) * 5 / 9;
    }

    # Both temperatures are known now, so report them.
    printf "%.2f C is %.2f F\n", $celsius, $fahrenheit;
}

Chapter 2; page 50 (download)

$var =~ s/Jeff/Jeffrey/;

Chapter 2; page 50 (download)

$var =~ s/\bJeff\b/Jeffrey/;

Chapter 2; page 50 (download)

$var =~ s/\bJeff\b/Jeff/i;

Chapter 2; page 51 (download)

Dear =FIRST=,
You have been chosen to win a brand new =TRINKET=! Free!
Could you use another =TRINKET= in the =FAMILY= household?
Yes =SUCKER=, I bet you could! Just respond by.....

Chapter 2; page 51 (download)

$given = "Tom";
$family = "Cruise";
$wunderprize = "100% genuine faux diamond";

Chapter 2; page 51 (download)

$letter =~ s/=FIRST=/$given/g;
$letter =~ s/=FAMILY=/$family/g;
$letter =~ s/=SUCKER=/$given $family/g;
$letter =~ s/=TRINKET=/fabulous $wunderprize/g;

Chapter 2; page 52 (download)

$price =~ s/(\.\d\d[1-9]?)\d*/$1/

Chapter 2; page 53 (download)

From elvis Thu Feb 29 11:15 2007
Received: from elvis@localhost by tabloid.org (8.11.3) id KA8CMY
Received: from tabloid.org by gateway.net (8.12.5/2) id N8XBK
To: jfriedl@regex.info (Jeffrey Friedl)
From: elvis@tabloid.org (The King)
Date: Thu, Feb 29 2007 11:15
Message-Id: <2007022939939.KA8CMY@tabloid.org>
Subject: Be seein' ya around
Reply-To: elvis@hh.tabloid.org
X-Mailer: Madam Zelda's Psychic Orb [version 3.7 PL92]

Sorry I haven't been around lately. A few years back I checked
into that ole heartbreak hotel in the sky, ifyaknowwhatImean.
The Duke says "hi".
        Elvis

Chapter 2; page 54 (download)

To: elvis@hh.tabloid.org (The King)
From: jfriedl@regex.info (Jeffrey Friedl)
Subject: Re: Be seein' ya around

On Thu, Feb 29 2007 11:15 The King wrote:
|> Sorry I haven't been around lately. A few years back I checked
|> into that ole heartbreak hotel in the sky, ifyaknowwhatImean.
|> The Duke says "hi".
|>         Elvis

Chapter 2; page 55 (download)

while ($line = <>) {
    ... work with $line here ...
}

Chapter 2; page 55 (download)

# Process the header
while ($line = <>) {
   if ($line =~ m/^\s*$/) {
       last; # stop processing within this while loop, continue below
   }
   ... process header line here ...
}
... processing for the rest of the message follows ...

Chapter 2; page 55 (download)

if ($line =~ m/^Subject: (.*)/i) {
    $subject = $1;
}

Chapter 2; page 56 (download)

if ($line =~ m/^Date: (.*)/i) {
   $date = $1;
}
if ($line =~ m/^Reply-To: (.*)/i) {
   $reply_address = $1;
}

Chapter 2; page 56 (download)

From: elvis@tabloid.org (The King)

Chapter 2; page 57 (download)

if ($line =~ m/^From: (\S+) \(([^()]*)\)/i) {
    $reply_address = $1;
    $from_name = $2;
}

Chapter 2; page 57 (download)

# Scan the message header one line at a time, gleaning what is needed to
# build a reply: $subject, $date, $reply_address, and $from_name.
# The loop stops at (and consumes) the first blank line, which marks the end
# of the header, so a later read from <> continues with the message body.
#
# An explicit Reply-To: address takes precedence over the From: address no
# matter which of the two header lines appears first.
my $saw_reply_to = 0;   # True once a Reply-To: line has set $reply_address.

while ($line = <>)
{

    if ($line =~ m/^\s*$/ ) { # If we have an empty line...
        last; # this immediately ends the `while' loop.
    }

    if ($line =~ m/^Subject: (.*)/i) {
        $subject = $1;
    }

    if ($line =~ m/^Date: (.*)/i) {
        $date = $1;
    }

    if ($line =~ m/^Reply-To: (\S+)/i) {
        $reply_address = $1;
        $saw_reply_to = 1;   # Don't let a later From: line override this.
    }

    if ($line =~ m/^From: (\S+) \(([^()]*)\)/i) {
        # The From: address is only a fallback for a missing Reply-To:.
        $reply_address = $1 unless $saw_reply_to;
        $from_name = $2;
    }

}

Chapter 2; page 58 (download)

print "To: $reply_address ($from_name)\n";
print "From: jfriedl\@regex.info (Jeffrey Friedl)\n";
print "Subject: Re: $subject\n";
print "\n" ; # blank line to separate the header from message body.

Chapter 2; page 58 (download)

print "On $date $from_name wrote:\n";

Chapter 2; page 58 (download)

while ($line = <>) {
    print "|> $line";
}

Chapter 2; page 58 (download)

$line =~ s/^/|> /;
print $line;

Chapter 2; page 59 (download)

if (    not defined($reply_address)
     or not defined($from_name)
     or not defined($subject)
     or not defined($date) )
{
    die "couldn't glean the required information!";
}

Chapter 2; page 59 (download)

print "The US population is $pop\n";

Chapter 2; page 60 (download)

... by Jeffrey Friedl.

Chapter 2; page 60 (download)

... by Jeffrey Friedl.

Chapter 2; page 60 (download)

... by Jeffrey Friedl.

Chapter 2; page 60 (download)

... by Thomas Jefferson

Chapter 2; page 61 (download)

s/\bJeff(?=s\b)/Jeff'/g

Chapter 2; page 62 (download)

s/(?<=\bJeff)(?=s\b)/'/g

Chapter 2; page 65 (download)

$pop =~ s/(?<=\d)(?=(\d\d\d)+$)/,/g;
print "The US population is $pop\n";

Chapter 2; page 65 (download)

$text = "The population of 298444215 is growing";
   
$text =~ s/(?<=\d)(?=(\d\d\d)+$)/,/g;
print "$text\n";

Chapter 2; page 66 (download)

$text =~ s/(?<=\d)(?=(\d\d\d)+(?!\d))/,/g;

Chapter 2; page 67 (download)

$text =~ s/(\d)(?=(\d\d\d)+(?!\d))/$1,/g;

Chapter 2; page 67 (download)

$text =~ s/(\d)((\d\d\d)+\b)/$1,$2/g;

Chapter 2; page 67 (download)

while ( $text =~ s/(\d)((\d\d\d)+\b)/$1,$2/g ) {
   # Nothing to do inside the body of the while -- we merely want to reapply the regex until it fails
}

Chapter 2; page 67 (download)

undef $/;    # Enter "file-slurp" mode.
$text = <>; # Slurp up the first file given on the command line.

Chapter 2; page 68 (download)

This is a sample file.
It has three lines.
That's all

Chapter 2; page 68 (download)

This is a sample file.
It has three lines.
That's all

Chapter 2; page 68 (download)

This is a sample file.
It has three lines.
That's all

Chapter 2; page 69 (download)

$text =~ s/&/&amp;/g; # Make the basic HTML . . . 
$text =~ s/</&lt;/g;  #  . . . characters &, <, and > . . . 
$text =~ s/>/&gt;/g;  #  . . . HTML safe.

Chapter 2; page 69 (download)

$text =~ s/^$/<p>/g;

Chapter 2; page 69 (download)

$text =~ s/^$/<p>/mg;

Chapter 2; page 70 (download)

... with.


	 
Therefore ...

Chapter 2; page 70 (download)

$text =~ s/^[ \t\r]*$/<p>/mg;

Chapter 2; page 70 (download)

... with.
<p>
<p>
<p>
Therefore ...

Chapter 2; page 70 (download)

$text =~ s/^\s*$/<p>/mg;

Chapter 2; page 70 (download)

... with.
<p>
Therefore ...

Chapter 2; page 70 (download)

$text =~ s/\b(usernameregex\@hostnameregex)\b/<a href="mailto:$1">$1<\/a>/g;

Chapter 2; page 72 (download)

$text =~ s{\b(usernameregex\@hostnameregex)\b}{<a href="mailto:$1">$1</a>}gi;

Chapter 2; page 72 (download)

$text =~ s{
   \b
   # Capture the address to $1 . . . 
   (
     usernameregex
     \@
     hostnameregex
   )
   \b
}{<a href="mailto:$1">$1</a>}gix;

Chapter 2; page 73 (download)

undef $/;    # Enter "file-slurp" mode.
$text = <>; # Slurp up the first file given on the command line.

$text =~ s/&/&amp;/g;      # Make the basic HTML . . . 
$text =~ s/</&lt;/g;  #  . . . characters &, <, and > . . . 
$text =~ s/>/&gt;/g;  #  . . . HTML safe.

$text =~ s/^\s*$/<p>/mg; # Separate paragraphs.

# Turn email addresses into links . . . 
$text =~ s{
   \b
   # Capture the address to $1 . . . 
   (
     \w[-.\w]*                                    # username
     \@
     [-a-z0-9]+(\.[-a-z0-9]+)*\.(com|edu|info)  # hostname
   )
   \b
}{<a href="mailto:$1">$1</a>}gix;

print $text; # Finally, display the HTML-ized text.

Chapter 2; page 74 (download)

$text =~ s{
   \b
   # Capture the URL to $1 . . . 
   (
      http:// hostname
      (
         / path
      )?
   )
}{<a href="$1">$1</a>}gix;

Chapter 2; page 74 (download)

$text =~ s{
   \b
   # Capture the URL to $1 . . . 
   (
      http:// [-a-z0-9]+(\.[-a-z0-9]+)*\.(com|edu|info) \b # hostname
      (
         / [-a-z0-9_:\@&?=+,.!/~*'%\$]*  # optional path
      )?
   )
}{<a href="$1">$1</a>}gix;

Chapter 2; page 75 (download)

# Convert a plain-text file into simple HTML: escape the HTML-special
# characters, mark paragraph breaks with <p>, turn email addresses and
# HTTP URLs into links, and print the result.
undef $/;    # Enter "file-slurp" mode
$text = <>; # Slurp up the first file given on the command line.

# The & substitution must run first: the two substitutions after it insert
# & characters of their own (in &lt; and &gt;) that must not be escaped again.
$text =~ s/&/&amp;/g;      # Make the basic HTML . . . 
$text =~ s/</&lt;/g;  #  . . . characters &, <, and > . . . 
$text =~ s/>/&gt;/g;  #  . . . HTML safe.

# With /m, ^ and $ match at the start and end of each line within the
# slurped text, so every blank (or whitespace-only) stretch is replaced.
$text =~ s/^\s*$/<p>/mg; # Separate paragraphs.

# Turn email addresses into links . . . 
$text =~ s{
   \b
   # Capture the address to $1 . . . 
   (
     \w[-.\w]*                                    # username
     \@
     [-a-z0-9]+(\.[-a-z0-9]+)*\.(com|edu|info)  # hostname
   )
   \b
}{<a href="mailto:$1">$1</a>}gix;

# Turn HTTP URLs into links . . . 
$text =~ s{
   \b
   # Capture the URL to $1 . . . 
   (
      http:// [-a-z0-9]+(\.[-a-z0-9]+)*\.(com|edu|info) \b   # hostname
      (
         / [-a-z0-9_:\@&?=+,.!/~*'%\$]* # Optional path
           (?<![.,?!])    # Not allowed to end with [.,?!]
      )?
   )
}{<a href="$1">$1</a>}gix;

print $text; # Finally, display the HTML-ized text.

Chapter 2; page 76 (download)

$HostnameRegex = qr/[-a-z0-9]+(\.[-a-z0-9]+)*\.(com|edu|info)/i;

# Turn email addresses into links . . . 
$text =~ s{
   \b
   # Capture the address to $1 . . . 
   (
     \w[-.\w]*                          # username
     \@
     $HostnameRegex  # hostname
   )
   \b
}{<a href="mailto:$1">$1</a>}gix;

# Turn HTTP URLs into links . . . 
$text =~ s{
   \b
   # Capture the URL to $1 . . . 
   (
      http:// $HostnameRegex \b          # hostname
      (
         / [-a-z0-9_:\@&?=+,.!/~*'%\$]* # Optional path
           (?<![.,?!])    # not allowed to end with [.,?!]
      )?
   )
}{<a href="$1">$1</a>}gix;

Chapter 2; page 77 (download)

# Report doubled words (such as "the the") found in the files named on the
# command line. Both copies of each doubled word are highlighted, and every
# reported line is prefixed with the name of the file being read.
$/ = ".\n";   # Read input in chunks that end with a period-newline combination.
while (<>) {
  # Wrap each copy of a doubled word in the escape sequences \e[7m ... \e[m.
  # The copies may be separated by whitespace and/or <TAG>-like items, and
  # /i lets them differ in capitalization. If nothing was doubled in this
  # chunk, the substitution fails and the chunk is skipped.
  next if !s/\b([a-z]+)((?:\s|<[^>]+>)+)(\1\b)/\e[7m$1\e[m$2\e[7m$3\e[m/ig;
  s/^(?:[^\e]*\n)+//mg;   # Remove any unmarked lines.
  s/^/$ARGV: /mg;      # Ensure lines begin with filename.
  print;
}

Chapter 2; page 77 (download)

% perl -w FindDbl ch01.txt
ch01.txt: check for doubled words (such as thisthis thisthis), a common problem with
ch01.txt: * Find doubled words despite capitalization differences, such as with `TheThe
ch01.txt: thethe...', as well as allow differing amounts of whitespace (space, tabs,
ch01.txt: /\<(1,000,000|million|thousandthousand thousandthousand)/. But alternation can't be
ch01.txt: of this chapter. If you knew thethe thethe specific doubled word to find (such

Chapter 2; page 78 (download)

$/ = ".\n";   # Sets a special ``chunk-mode''; chunks end with a period-newline combination

while (<>) 
{
    next unless s{# (regex starts here)

          ### Need to match one word:
          \b           # Start of word . . . .
          ( [a-z]+ )  # Grab word, filling $1 (and \1).

          ### Now need to allow any number of spaces and/or <TAGS>
          (         # Save what intervenes to $2.
              (?:   # (Non-capturing parens for grouping the alternation)
                 \s       # Whitespace (includes newline, which is good).
                |         # -or-
                 <[^>]+>  # Item like <TAG>.
              )+       # Need at least one of the above, but allow more.
          )

          ### Now match the first word again:
          (\1\b)     # \b ensures not embedded. This copy saved to $3.

      #(regex ends here)
    }
    # Above is the regex. The replacement string is below, followed by the modifiers, /i, /g, and /x
    {\e[7m$1\e[m$2\e[7m$3\e[m}igx;  

    s/^(?:[^\e]*\n)+//mg;     # Remove any unmarked lines.
    s/^/$ARGV: /mg;         # Ensure lines begin with filename.
    print;
}

Chapter 2; page 81 (download)

import java.io.*;
import java.util.regex.Pattern;
import java.util.regex.Matcher;

public class TwoWord
{
 public static void main(String [] args)
 {
   Pattern regex1 = Pattern.compile(
       "\\b([a-z]+)((?:\\s|\\<[^>]+\\>)+)(\\1\\b)",
       Pattern.CASE_INSENSITIVE);
   String replace1 = "\033[7m$1\033[m$2\033[7m$3\033[m";
   Pattern regex2 = Pattern.compile("^(?:[^\\e]*\\n)+", Pattern.MULTILINE);
   Pattern regex3 = Pattern.compile("^([^\\n]+)", Pattern.MULTILINE);

   // For each command-line argument....
   for (int i = 0; i < args.length; i++)
   {
     try {
       BufferedReader in = new BufferedReader(new FileReader(args[i]));
       String text;

       // For each paragraph of each file.....
       while ((text = getPara(in)) != null)
       {
           // Apply the three substitutions
           text = regex1.matcher(text).replaceAll(replace1);
           text = regex2.matcher(text).replaceAll("");
           text = regex3.matcher(text).replaceAll(args[i] + ": $1");

           // Display results
           System.out.print(text);
       }
     } catch (IOException e) {
       System.err.println("can't read ["+args[i]+"]: " + e.getMessage());
     }
   }
 }

 // Routine to read next "paragraph" and return as a string
 static String getPara(BufferedReader in) throws java.io.IOException
 {
   StringBuffer buf = new StringBuffer();
   String line;

   while ((line = in.readLine()) != null &&
          (buf.length() == 0 || line.length() != 0))
   {
       buf.append(line + "\n");
   }
   return  buf.length() == 0 ? null : buf.toString();
 }
}

Chapter 3; page 94 (download)

if ($line =~ m/^Subject: (.*)/i) {
    $subject = $1;
}

Chapter 3; page 95 (download)

    import java.util.regex.*; // Make regex classes easily available
        
Pattern r = Pattern.compile("^Subject: (.*)", Pattern.CASE_INSENSITIVE);
Matcher m = r.matcher(line);
if (m.find()) {
    subject = m.group(1);
 }

Chapter 3; page 96 (download)

if (! Pattern.matches("\\s*", line))
{
    //  . . . line is not blank . . . 
}

Chapter 3; page 96 (download)

// String.matches() takes a single argument, the regex, and succeeds only
// when that regex matches the entire string -- so "\\s*" tests for a line
// that is empty or holds nothing but whitespace.
if (! line.matches("\\s*"))
{
    //  . . . line is not blank . . . 
}

Chapter 3; page 96 (download)

Imports System.Text.RegularExpressions   ' Make regex classes easily available
   
Dim R as Regex = New Regex("^Subject: (.*)", RegexOptions.IgnoreCase)
Dim M as Match = R.Match(line)
If M.Success
    subject = M.Groups(1).Value
End If

Chapter 3; page 97 (download)

If Not Regex.IsMatch(Line, "^\s*$") Then
   '  . . . line is not blank . . . 
End If

Chapter 3; page 97 (download)

if (preg_match('/^Subject: (.*)/i', $line, $matches))
     $Subject = $matches[1];

Chapter 3; page 97 (download)

import re;
    
R = re.compile("^Subject: (.*)", re.IGNORECASE);
M = R.search(line)
if M:
    subject = M.group(1)

Chapter 3; page 98 (download)

$text =~ s{
   \b
   # Capture the address to $1 . . . 
   (
     \w[-.\w]*                          # username
     @
     [-\w]+(\.[-\w]+)*\.(com|edu|info)  # hostname
   )
   \b
}{<a href="mailto:$1">$1</a>}gix;

Chapter 3; page 98 (download)

import java.util.regex.*; // Make regex classes easily available
     
Pattern r = Pattern.compile(
   "\\b                                                  \n"+
   "# Capture the address to $1 . . . \n"+
   "(\n"+
   "  \\w[-.\\w]*                            # username\n"+
   "  @\n"+
   "  [-\\w]+(\\.[-\\w]+)*\\.(com|edu|info)  # hostname\n"+
   ")\n"+
   "\\b\n",
   Pattern.CASE_INSENSITIVE|Pattern.COMMENTS);

Matcher m = r.matcher(text);
text = m.replaceAll("<a href=\"mailto:$1\">$1</a>");

Chapter 3; page 99 (download)

Dim R As Regex = New Regex _
("\b                                                 " & _
 "(?# Capture the address to $1 . . . )                    " & _
 "(                                                                 " & _
 "  \w[-.\w]*                         (?# username)    " & _
 "  @                                                               " & _
 "  [-\w]+(\.[-\w]+)*\.(com|edu|info) (?# hostname)    " & _
 ")                                                                 " & _
 "\b",  _
 RegexOptions.IgnoreCase Or RegexOptions.IgnorePatternWhitespace)

text = R.Replace(text, "<a href=""mailto:${1}"">${1}</a>")

Chapter 3; page 99 (download)

$text = preg_replace('{
                         \b
                         # Capture the address to $1 . . . 
                         (
                           \w[-.\w]*                          # username
                           @
                           [-\w]+(\.[-\w]+)*\.(com|edu|info)  # hostname
                         )
                         \b
                     }ix',
                     '<a href="mailto:$1">$1</a>',  # replacement string
                     $text);

Chapter 3; page 100 (download)

sub(/mizpel/, "misspell")

Chapter 3; page 100 (download)

regsub mizpel $var misspell newvar

Chapter 3; page 100 (download)

regsub -all mizpel $var misspell newvar

Chapter 3; page 101 (download)

(defun FindNextDbl ()
  "move to next doubled word, ignoring <...> tags"   (interactive)
  (re-search-forward "\\<\\([a-z]+\\)\\([\n \t]\\|<[^>]+>\\)+\\1\\>")
)

Chapter 3; page 105 (download)

$str =~ m/(\w+)/;

Chapter 3; page 105 (download)

$regex = '(\w+)';
$str =~ $regex;

Chapter 3; page 105 (download)

$regex = "(\\w+)";
$str =~ $regex;

Chapter 3; page 127 (download)

(?!\p{Cn})\p{InThai}
(?=\P{Cn})\p{InThai}
\p{InThai}(?<!\p{Cn})
\p{InThai}(?<=\P{Cn})

Chapter 3; page 131 (download)

my $need_close_anchor = 0; # True if we've seen <A>, but not its closing </A>.

while (not $html =~ m/\G\z/gc) # While we haven't worked our way to the end . . . 
{
  if ($html =~ m/\G(\w+)/gc) {
     . . . have a word or number in $1 -- can now check for profanity, for example . . . 
  } elsif ($html =~ m/\G[^<>&\w]+/gc) {
    # Other non-HTML stuff -- simply allow it.
  } elsif ($html =~ m/\G<img\s+([^>]+)>/gci) {
     . . . have an image tag -- can check that it's appropriate . . . 
              
  } elsif (not $need_close_anchor and $html =~ m/\G<A\s+([^>]+)>/gci){
     . . . have a link anchor - can validate it . . . 
              
    $need_close_anchor = 1; # Note that we now need </A>
  } elsif ($need_close_anchor and $html =~ m{\G</A>}gci){
    $need_close_anchor = 0; # Got what we needed; don't allow again
  } elsif ($html =~ m/\G&(#\d+|\w+);/gc){
    # Allow entities like &gt; and &#123;
  } else {
    # Nothing matched at this point, so it must be an error. Note the location, and grab a dozen or so
    # characters from the HTML so that we can issue an informative error message.
    my $location = pos($html); # Note where the unexpected HTML starts.
    my ($badstuff) = $html =~ m/\G(.{1,12})/s;
    die "Unexpected HTML at position $location: $badstuff\n";
  }
}

# Make sure there's no dangling <A>
if ($need_close_anchor) {
   die "Missing final </A>"
}

Chapter 3; page 137 (download)

$HostnameRegex = qr/[-a-z0-9]+(\.[-a-z0-9]+)*\.(com|edu|info)/i;

Chapter 3; page 140 (download)

( <A\s+[^>]+> \s* )?  # Match leading <A> tag, if there.
<IMG\s+[^>]+>         # Match <IMG> tag.
(?(1)\s*</A>)         # Match a closing </A>, if we'd matched an <A> before.

Chapter 4; page 147 (download)

echo =XX========================================= | egrep 'X(.+)+X'

Chapter 4; page 148 (download)

The dragging belly indicates your cat is too fat

Chapter 4; page 162 (download)

a 1234 num
a 1234 num
a 1234 num
a 1234 num

Chapter 4; page 165 (download)

The name "McDonald's" is said "makudonarudo" in Japanese

Chapter 4; page 165 (download)

The name "McDonald's" is said "makudonarudo" in Japanese

Chapter 4; page 165 (download)

...<B>Billions</B> and <B>Zillions</B> of suns...

Chapter 4; page 166 (download)

...<B>Billions</B> and <B>Zillions</B> of suns...

Chapter 4; page 166 (download)

...<B>Billions</B> and <B>Zillions</B> of suns...

Chapter 4; page 166 (download)

...<B>Billions and <B>Zillions</B> of suns...

Chapter 4; page 167 (download)

<B>            # Match the opening <B>
(# Now, only as many of the following as needed . . . 
  (?!  <B>  )  #    If not <B> . . . 
  .                          #            . . . any character is okay
)*? #
</B> #  . . . until the closing delimiter can match

Chapter 4; page 167 (download)

<B>               # Match the opening <B>
(# Now, as many of the following as possible . . . 
  (?!  </?B>  )  #    If not <B>, and not </B> . . . 
  .                          #        . . . any character is okay
)* # (now greedy)
</B> #  . . . until the closing delimiter can match.

Chapter 4; page 167 (download)

$price =~ s/(\.\d\d[1-9]?)\d*/$1/;

Chapter 4; page 168 (download)

$price =~ s/(\.\d\d[1-9]?)\d+/$1/

Chapter 4; page 178 (download)

SRC=array.c builtin.c eval.c field.c gawkmisc.c io.c main.c \
        missing.c msg.c node.c re.c version.c

Chapter 5; page 186 (download)

SRC=array.c builtin.c eval.c field.c gawkmisc.c io.c main.c \
        missing.c msg.c node.c re.c version.c

Chapter 5; page 191 (download)

$WholePath =~ m{([^/]*)$}; # Check variable $WholePath with regex.
$FileName = $1; # Note text matched

Chapter 5; page 192 (download)

# Split $WholePath at its final slash into $LeadingPath and $FileName.
# When there is no `/' in the name at all, the leading path defaults to "."
# (the current directory), so "file.txt" looks like ". / file.txt".
($LeadingPath, $FileName) =
    $WholePath =~ m!^(.*)/([^/]*)$!
        ? ($1, $2)               # Have a match -- $1 and $2 are valid
        : (".", $WholePath);     # No match, so the whole thing is the filename

Chapter 5; page 194 (download)

\([^()]*(\([^()]*\)[^()]*)*\)

Chapter 5; page 194 (download)

$regex =  '\(' . '(?:[^()]|\(' x $depth . '[^()]*' . '\))*' x $depth . '\)';

Chapter 5; page 197 (download)

Darth Symbol: "/-|-\\" or "[^-^]"

Chapter 5; page 197 (download)

Darth Symbol: "/-|-\\" or "[^-^]"

Chapter 5; page 197 (download)

"You need a 2\"x3\" photo.

Chapter 5; page 199 (download)

s/^\s+//;
s/\s+$//;

Chapter 5; page 200 (download)

s/^\s+//;
s/\s+$//;

Chapter 5; page 200 (download)

$html =~ s/<[^>]+>//g;

Chapter 5; page 200 (download)

<              # Opening "<"
  (            #    Any amount of . . . 
     "[^"]*"   #      double-quoted string,
     |         #      or . . . 
     '[^']*'   #      single-quoted string,
     |         #      or . . . 
     [^'">]    #      "other stuff"
  )*           #
>              # Closing ">"

Chapter 5; page 201 (download)

...<a href="http://www.oreilly.com">O'Reilly Media</a>...

Chapter 5; page 202 (download)

# Note: the regex in the while(...) is overly simplistic - see text for discussion
while ($Html =~ m{<a\b([^>]+)>(.*?)</a>}ig)
{
  my $Guts = $1; # Save results from the match above, to their own . . . 
  my $Link = $2; #  . . . named variables, for clarity below.

  if ($Guts =~ m{
                 \b HREF       # "href" attribute
                 \s* = \s*     # "=" may have whitespace on either side
                 (?:           # Value is . . . 
                   "([^"]*)"   #   double-quoted string,
                   |           #   or . . . 
                   '([^']*)'   #   single-quoted string,
                   |           #   or . . . 
                   ([^'">\s]+) #   "other stuff"
                 )             #
                }xi)
  {
    my $Url = $+; # Gives the highest-numbered actually-filled $1, $2, etc.
    print "$Url with link text: $Link\n";
  }
}

Chapter 5; page 203 (download)

Imports System.Text.RegularExpressions
   
' Set up the regular expressions we'll use in the loop
Dim A_Regex as Regex = New Regex(                  _
            "<a\b(?<guts>[^>]+)>(?<Link>.*?)</a>", _
            RegexOptions.IgnoreCase)

Dim GutsRegex as Regex = New Regex( _
   "\b HREF                (?#  'href' attribute             )" & _
   "\s* = \s*              (?#  '=' with optional whitespace )" & _
   "(?:                    (?#  Value is ...                 )" & _
   "  ""(?<url>[^""]*)""   (?#    double-quoted string,      )" & _
   "  |                    (?#    or ...                     )" & _
   "  '(?<url>[^']*)'      (?#    single-quoted string,      )" & _
   "  |                    (?#    or ...                     )" & _
   "  (?<url>[^'"">\s]+)   (?#    'other stuff'              )" & _
   ")                      (?#                               )",  _
   RegexOptions.IgnoreCase OR RegexOptions.IgnorePatternWhitespace)

' Now check the 'Html' Variable . . .  
Dim CheckA as Match = A_Regex.Match(Html)

' For each match within . . . 
While CheckA.Success
   ' We matched an <a> tag, so now check for the URL.
   Dim UrlCheck as Match = _
      GutsRegex.Match(CheckA.Groups("guts").Value)
   If UrlCheck.Success
      ' We've got a match, so have a URL/link pair
      Console.WriteLine("Url " & UrlCheck.Groups("url").Value & _
                        " WITH LINK " & CheckA.Groups("Link").Value)
   End If
   CheckA = CheckA.NextMatch
End While

Chapter 5; page 203 (download)

# Pick an HTTP URL apart and report its host, port, and path.
# The captures are taken by list assignment: the first is the host, the
# second (the ":port" group as a whole) is discarded, the third is the port
# digits, and the fourth is the path. A failed match yields an empty list,
# which makes the condition false.
if (my ($host, undef, $port, $path) = $url =~ m{^http://([^/:]+)(:(\d+))?(/.*)?$}i)
{
  $port ||= 80;   # No port given, so default to 80.
  $path ||= "/";  # No path given, so default to "/".
  print "Host: $host\n";
  print "Port: $port\n";
  print "Path: $path\n";
} else {
  print "Not an HTTP URL\n";
}

Chapter 5; page 205 (download)

^
 (?i)  # apply this regex in a case-insensitive manner.
 # One or more dot-separated parts . . . 
 (?: [a-z0-9]\. | [a-z0-9][-a-z0-9]*[a-z0-9]\. )+
 # Followed by the final suffix part . . . 
 (?: com|edu|gov|int|mil|net|org|biz|info|name|museum|coop|aero|[a-z][a-z] )
$

Chapter 5; page 205 (download)

^
 (?i)  # apply this regex in a case-insensitive manner.
 # Zero or more dot-separated parts . . . 
 (?: [a-z0-9]\. | [a-z0-9][-a-z0-9]{0,61}[a-z0-9]\. )*
 # Followed by the final suffix part . . . 
 (?: com|edu|gov|int|mil|net|org|biz|info|name|museum|coop|aero|[a-z][a-z] )
$

Chapter 5; page 206 (download)

...visit us at www.oreilly.com or mail to orders@oreilly.com.

Chapter 5; page 206 (download)

(?i: [a-z0-9] (?:[-a-z0-9]*[a-z0-9])? \. )+ # sub domains
# Now ending .com, etc. For these, we require lowercase
(?-i: com\b
    | edu\b
    | biz\b
    | org\b
    | gov\b
    | in(?:t|fo)\b # .int or .info
    | mil\b
    | net\b
    | name\b
    | museum\b
    | coop\b
    | aero\b
    | [a-z][a-z]\b # two-letter country codes
)

Chapter 5; page 207 (download)

\b
# Match the leading part (proto://hostname, or just hostname)
(
    # ftp://, http://, or https:// leading part
    (ftp|https?)://[-\w]+(\.\w[-\w]*)+
  |
    # or, try to find a hostname with our more specific sub-expression
    full-hostname-regex
)

# Allow an optional port number
( : \d+ )?

# The rest of the URL is optional, and begins with / . . . 
(
   / path-part
)?

Chapter 5; page 207 (download)

Read his comments at http://www.oreilly.com/ask_tim/index.html. He ...

Chapter 5; page 207 (download)

\b
# Match the leading part (proto://hostname, or just hostname)
(
    # ftp://, http://, or https:// leading part
    (ftp|https?)://[-\w]+(\.\w[-\w]*)+
  |
    # or, try to find a hostname with our more specific sub-expression
    (?i: [a-z0-9] (?:[-a-z0-9]*[a-z0-9])? \. )+ # sub domains
    # Now ending .com, etc. For these, require lowercase
    (?-i: com\b
        | edu\b
        | biz\b
        | gov\b
        | in(?:t|fo)\b # .int or .info
        | mil\b
        | net\b
        | org\b
        | [a-z][a-z]\b # two-letter country codes
    )
)

# Allow an optional port number
( : \d+ )?

# The rest of the URL is optional, and begins with / . . . 
(
     /
     # The rest are heuristics for what seems to work well
     [^.!,?;"'<>()\[\]{}\s\x7F-\xFF]*
     (?:
        [.!,?]+  [^.!,?;"'<>()\[\]{}\s\x7F-\xFF]+
     )*
)?

Chapter 5; page 208 (download)

String SubDomain  = "(?i:[a-z0-9]|[a-z0-9][-a-z0-9]*[a-z0-9])";
String TopDomains = "(?x-i:com\\b        \n" +
                    "     |edu\\b        \n" +
                    "     |biz\\b        \n" +
                    "     |in(?:t|fo)\\b \n" +
                    "     |mil\\b        \n" +
                    "     |net\\b        \n" +
                    "     |org\\b        \n" +
                    "     |[a-z][a-z]\\b \n" + // country codes
                    ")                   \n";
String Hostname = "(?:" + SubDomain + "\\.)+" + TopDomains;

String NOT_IN   = ";\"'<>()\\[\\]{}\\s\\x7F-\\xFF";
String NOT_END  = "!.,?";
String ANYWHERE = "[^" + NOT_IN + NOT_END + "]";
String EMBEDDED = "[" + NOT_END + "]";
String UrlPath  = "/"+ANYWHERE + "*("+EMBEDDED+"+"+ANYWHERE+"+)*";
String Url = 
  "(?x:                                                \n"+
  "  \\b                                               \n"+
  "  ## match the hostname part                        \n"+
  "  (                                                 \n"+
  "    (?: ftp | http s? ): // [-\\w]+(\\.\\w[-\\w]*)+ \n"+
  "   |                                                \n"+
  "    " + Hostname + "                                \n"+
  "  )                                                 \n"+
  "  # allow optional port                             \n"+
  "  (?: :\\d+ )?                                      \n"+
  "                                                    \n"+
  "  # rest of url is optional, and begins with /      \n"+
  " (?: " + UrlPath + ")?                              \n"+
  ")";

// Now convert string we've built up into a real regex object
Pattern UrlRegex = Pattern.compile(Url);
// Now ready to apply to raw text to find urls . . .

Chapter 5; page 211 (download)

@zips = m/(?:\d\d\d\d\d)*?(44\d\d\d)/g;

Chapter 5; page 212 (download)

@zips = m/\G(?:(?!44)\d\d\d\d\d)*(44\d\d\d)/g;

Chapter 5; page 212 (download)

@zips = ( ); # Ensure the array is empty

while (m/(\d\d\d\d\d)/g) {
   $zip = $1;
   if (substr($zip, 0, 2) eq "44") {
       push @zips, $zip;
   }
}

Chapter 5; page 213 (download)

Ten Thousand,10000, 2710 ,,"10,000","It's ""10 Grand"", baby",10K

Chapter 5; page 213 (download)

# Either some non-quote/non-comma text . . . 
[^",]+
# . . . or . . . 
 |
# . . . a double-quoted field (inside, paired double quotes are allowed)
" # field's opening quote
 (?: [^"] | "" )*
" # field's closing quote

Chapter 5; page 214 (download)

# Either some non-quote/non-comma text . . . 
( [^",]+ )
# . . . or . . . 
 |
# . . . a double-quoted field (inside, paired double quotes are allowed)
" # field's opening quote
 (   (?: [^"] | "" )*   )
" # field's closing quote

Chapter 5; page 214 (download)

# Step through $line one CSV field per iteration. On each match exactly one
# capture group is set: $1 holds an unquoted field, $2 holds the inside of a
# double-quoted field (with any embedded quotes still doubled).
while ($line =~ m{
           # Either some non-quote/non-comma text . . . 
           ( [^",]+ )
           # . . . or . . . 
            |
           # . . . a double-quoted field ("" allowed inside)
           " # field's opening quote
            (   (?: [^"] | "" )*   )
           " # field's closing quote
        }gx)
{
   if (defined $1) {
       $field = $1;        # unquoted field: usable as is
   } else {
       $field = $2;
       $field =~ s/""/"/g; # collapse each doubled quote to a single quote
   }
   print "[$field]"; # print the field, for debugging
   # Can work with $field now . . . 
}

Chapter 5; page 215 (download)

[Ten Thousand][10000][ 2710 ][10,000][It's "10 Grand", baby][10K]

Chapter 5; page 216 (download)

(?:^|,)
(?:
    # Either some non-quote/non-comma text....
    ( [^",]* )
  # ... or...
  |
    # ... a double-quoted field (inside, paired double quotes are allowed)
    " # field's opening quote
     (  (?: [^"] | "" )*   )
    " # field's closing quote
)

Chapter 5; page 216 (download)

(?:^|,)
(?: # Now, match either a double-quoted field (inside, paired double quotes are allowed) . . . 
        " # (double-quoted field's opening quote)
         (   (?: [^"] | "" )*   )
        " # (double-quoted field's closing quote)
  |
    #  . . . or, some non-quote/non-comma text . . . 
        ( [^",]* )
)

Chapter 5; page 217 (download)

import java.util.regex.*;

        

String regex = // Puts a doublequoted field into group(1), an unquoted field into group(2)
   "\\G(?:^|,)                                  \n"+
   "(?:                                         \n"+
   "   # Either a double-quoted field . . .        \n"+
   "   \"  # field's opening quote              \n"+
   "    (   (?: [^\"]++ | \"\" )*+   )          \n"+
   "   \"  # field's closing quote              \n"+
   " |#  . . . or . . .                               \n"+
   "       # some non-quote/non-comma text . . .  \n"+
   "   ( [^\",]* )                              \n"+
   " )                       \n";
// Create a matcher, using the regex above, with dummy text for the time being.
Matcher mMain = Pattern.compile(regex, Pattern.COMMENTS).matcher("");

// Create a matcher for "", with dummy text for the time being
Matcher mQuote = Pattern.compile("\"\"").matcher("");

        

// Above is the preparation; the code below is executed on a per-line basis
mMain.reset(line); // Use this line of CSV text in the processing below

while (mMain.find())
{
    String field;
    if (mMain.start(2) >= 0)
        field = mMain.group(2); // The field is unquoted, so we can use it as is
    else
        // The field is quoted, so we must replace paired doublequotes with one double quote
        field = mQuote.reset(mMain.group(1)).replaceAll("\"");

    // We can now work with field . . . 
    System.out.println("Field [" + field + "]");
}

Chapter 5; page 219 (download)

Imports System.Text.RegularExpressions
   
Dim FieldRegex as Regex = New Regex( _
       "(?:^|,)                                        " & _
       "(?:                                            " & _
       "   (?# Either a doublequoted field ...)        " & _
       "   ""  (?# field's opening quote )             " & _
       "    (   (?> [^""]+ | """" )*   )               " & _
       "   ""  (?# field's closing quote )             " & _
       " (?# ... or ...)                               " & _
       " |                                             " & _
       "   (?# ... some non-quote/non-comma text ...)  " & _
       "   ( [^"",]* )                                 " & _
       " )", RegexOptions.IgnorePatternWhitespace)

Dim QuotesRegex as Regex = New Regex("""""") 'A string with two double quotes
    
Dim FieldMatch as Match = FieldRegex.Match(Line)
While FieldMatch.Success
   Dim Field as String
   If FieldMatch.Groups(1).Success
     Field = QuotesRegex.Replace(FieldMatch.Groups(1).Value, """")
   Else
     Field = FieldMatch.Groups(2).Value
   End If

   Console.WriteLine("[" & Field & "]")
   ' Can now work with 'Field'....

   FieldMatch = FieldMatch.NextMatch
End While

Chapter 6; page 224 (download)

"You need a 2\"3\" photo."

Chapter 6; page 232 (download)

use Time::HiRes 'time'; # So time() gives a high-resolution value.

$StartTime = time();
"abababdedfg" =~ m/^(a|b|c|d|e|f|g)+$/;
$EndTime = time();
printf("Alternation takes %.3f seconds.\n", $EndTime - $StartTime);

$StartTime = time();
"abababdedfg" =~ m/^[a-g]+$/;
$EndTime = time();
printf("Character class takes %.3f seconds.\n", $EndTime - $StartTime);

Chapter 6; page 233 (download)

use Time::HiRes 'time'; # So time() gives a high-resolution value.
$TimesToDo = 1000;                  # Simple setup
$TestString = "abababdedfg" x 1000; # Makes a huge string

$Count = $TimesToDo;
$StartTime = time();
while ($Count-- > 0) {
      $TestString =~ m/^(a|b|c|d|e|f|g)+$/;
}
$EndTime = time();
printf("Alternation takes %.3f seconds.\n", $EndTime - $StartTime);

$Count = $TimesToDo;
$StartTime = time();
while ($Count-- > 0) {
      $TestString =~ m/^[a-g]+$/;
}
$EndTime = time();
printf("Character class takes %.3f seconds.\n", $EndTime - $StartTime);

Chapter 6; page 234 (download)

$TimesToDo = 1000000;
$TestString = "abababdedfg";

Chapter 6; page 234 (download)

$TimesToDo = 1000;

/* Prepare the test string */
$TestString = "";
for ($i = 0; $i < 1000; $i++)
    $TestString .= "abababdedfg";

/* Do the first test */
$start = gettimeofday();
for ($i = 0; $i < $TimesToDo; $i++)
     preg_match('/^(a|b|c|d|e|f|g)+$/', $TestString);
$final = gettimeofday();
$sec = ($final['sec'] + $final['usec']/1000000) -
       ($start['sec'] + $start['usec']/1000000);
printf("Alternation takes %.3f seconds\n", $sec);

/* And now the second test */
$start = gettimeofday();
for ($i = 0; $i < $TimesToDo; $i++)
     preg_match('/^[a-g]+$/', $TestString);
$final = gettimeofday();
$sec = ($final['sec'] + $final['usec']/1000000) -
       ($start['sec'] + $start['usec']/1000000);
printf("Character class takes %.3f seconds\n", $sec);

Chapter 6; page 235 (download)

if (phpversion() >= 5)
   date_default_timezone_set("GMT");

Chapter 6; page 235 (download)

import java.util.regex.*;
/**
 * Benchmark comparing an alternation of single characters,
 * ^(a|b|c|d|e|f|g)+$, against the equivalent character class, ^[a-g]+$.
 * Each regex is applied timesToDo times to the same long test string and
 * the elapsed wall-clock time for each is printed in seconds.
 */
public class JavaBenchmark {
 public static void main(String [] args)
 {
   Matcher regex1 = Pattern.compile("^(a|b|c|d|e|f|g)+$").matcher("");
   Matcher regex2 = Pattern.compile("^[a-g]+$").matcher("");
   long timesToDo = 1000;

   // Build the test string: 1,000 copies of "abababdedfg"
   StringBuilder temp = new StringBuilder();
   for (int i = 1000; i > 0; i--)
           temp.append("abababdedfg");
   String testString = temp.toString();

   // Time first one . . . 
   long count = timesToDo;
   long startTime = System.currentTimeMillis();
   while (count-- > 0) // post-decrement, so the match runs exactly timesToDo times
         regex1.reset(testString).find();
   double seconds = (System.currentTimeMillis() - startTime)/1000.0;
   System.out.println("Alternation takes " + seconds + " seconds");

   // Time second one . . . 
   count = timesToDo;
   startTime = System.currentTimeMillis();
   while (count-- > 0) // post-decrement, so the match runs exactly timesToDo times
         regex2.reset(testString).find();
   seconds = (System.currentTimeMillis() - startTime)/1000.0;
   System.out.println("Character class takes " + seconds + " seconds");
 }
}

Chapter 6; page 236 (download)

// Time first one . . . 
// The whole timing run is repeated four times, printing one result per run.
for (int i = 4; i > 0; i--)
{
    long count = timesToDo;
    long startTime = System.currentTimeMillis();
    while (count-- > 0) // post-decrement, so the match runs exactly timesToDo times
          regex1.reset(testString).find();
    double seconds = (System.currentTimeMillis() - startTime)/1000.0;
    System.out.println("Alternation takes " + seconds + " seconds");
}

Chapter 6; page 237 (download)

Option Explicit On
Option Strict On

Imports System.Text.RegularExpressions

' Benchmark comparing an alternation of single characters against the
' equivalent character class. Each regex is applied TimesToDo times to the
' same test string, and the elapsed time for each is written to the console.
Module Benchmark
Sub Main()
  Dim Regex1 as Regex = New Regex("^(a|b|c|d|e|f|g)+$")
  Dim Regex2 as Regex = New Regex("^[a-g]+$")
  Dim TimesToDo as Integer = 1000
  ' Build the test string: 1,000 copies of "abababdedfg"
  Dim TestString as String = ""
  Dim I as Integer
  For I = 1 to 1000
     TestString = TestString & "abababdedfg"
  Next

  ' Time the alternation version
  Dim StartTime as Double = Timer()
  For I = 1 to TimesToDo
     Regex1.Match(TestString)
  Next
  ' Elapsed time is the difference of two Timer() readings, rounded to 3 places
  Dim Seconds as Double = Math.Round(Timer() - StartTime, 3)
  Console.WriteLine("Alternation takes " & Seconds & " seconds")

  ' Time the character-class version
  StartTime = Timer()
  For I = 1 to TimesToDo
     Regex2.Match(TestString)
  Next
  Seconds = Math.Round(Timer() - StartTime, 3)
  Console.WriteLine("Character class takes " & Seconds & " seconds")
End Sub
End Module

Chapter 6; page 238 (download)

TimesToDo=1000
testString=""
for i in 1..1000
    testString += "abababdedfg"
end

Regex1 = Regexp::new("^(a|b|c|d|e|f|g)+$");
Regex2 = Regexp::new("^[a-g]+$");

startTime = Time.new.to_f
for i in 1..TimesToDo
    Regex1.match(testString)
end
print "Alternation takes %.3f seconds\n" % (Time.new.to_f - startTime);

startTime = Time.new.to_f
for i in 1..TimesToDo
    Regex2.match(testString)
end
print "Character class takes %.3f seconds\n" % (Time.new.to_f - startTime);

Chapter 6; page 238 (download)

import re
import time
import fpformat

Regex1 = re.compile("^(a|b|c|d|e|f|g)+$")
Regex2 = re.compile("^[a-g]+$")

TimesToDo = 1250;
TestString = ""
for i in range(800):
    TestString += "abababdedfg"

StartTime = time.time()
for i in range(TimesToDo):
   Regex1.search(TestString)
Seconds = time.time() - StartTime
print "Alternation takes " + fpformat.fix(Seconds,3) + " seconds"

StartTime = time.time()
for i in range(TimesToDo):
   Regex2.search(TestString)
Seconds = time.time() - StartTime
print "Character class takes " + fpformat.fix(Seconds,3) + " seconds"

Chapter 6; page 239 (download)

set TimesToDo 1000
set TestString ""
for {set i 1000} {$i > 0} {incr i -1} {
    append TestString "abababdedfg"
}

set Count $TimesToDo
set StartTime [clock clicks -milliseconds]
for {} {$Count > 0} {incr Count -1} {
    regexp {^(a|b|c|d|e|f|g)+$} $TestString
}
set EndTime [clock clicks -milliseconds]
set Seconds [expr ($EndTime - $StartTime)/1000.0]
puts [format "Alternation takes %.3f seconds" $Seconds]

set Count $TimesToDo
set StartTime [clock clicks -milliseconds]
for {} {$Count > 0} {incr Count -1} {
    regexp {^[a-g]+$} $TestString
}
set EndTime [clock clicks -milliseconds]
set Seconds [expr ($EndTime - $StartTime)/1000.0]
puts [format "Character class takes %.3f seconds" $Seconds]

Chapter 6; page 242 (download)

while (...) {
    if ($line =~ m/^\s*$/ ) ...
    if ($line =~ m/^Subject: (.*)/) ...
    if ($line =~ m/^Date: (.*)/) ...
    if ($line =~ m/^Reply-To: (\S+)/)...
    if ($line =~ m/^From: (\S+) \(([^()]*)\)/)...
      
}

Chapter 6; page 258 (download)

if ($data =~ m/\(0x/
    and
    $data =~ m/(?:SCALAR|ARRAY|...|HASH)\(0x[0-9a-fA-F]+\)/)
{
   # warn about bogus data...
}

Chapter 6; page 270 (download)

<B>               # Match the opening <B>
(# Now, as many of the following as possible . . . 
  (?!  </?B>  )  #    If not <B>, and not </B> . . . 
  .                          #        . . . any character is okay
)* # (now greedy)
</B> # . . . until the closing delimiter can match.

Chapter 6; page 270 (download)

<B>              # Match the opening <B>
  (?> [^<]* )    # Now match any "normal" . . . 
  (?>                   # Any amount of . . . 
     (?! </?B> ) #   if not at <B> or </B>,
     <           #   match one "special"
     [^<]*       #   and then any amount of "normal"
   )*                   #
</B>             # And finally the closing </B>

Chapter 6; page 270 (download)

^ \w+ =                 # leading field name and '='
# Now read (and capture) the value . . . 
(
   (?> [^\n\\]* )       # "normal"*
   (?> \\. [^\n\\]* )*  # ( "special" "normal"* )*
)

Chapter 6; page 271 (download)

(?:^|,)
(?: # Now, match either a double-quoted field (inside, paired double quotes are allowed) . . . 
        " # (double-quoted field's opening quote)
         (   (?: [^"] | "" )*   )
        " # (double-quoted field's closing quote)
  |
    #  . . . or, some non-quote/non-comma text . . . 
        ( [^",]* )
)

Chapter 6; page 271 (download)

# Step through $line one CSV field per iteration. \G(?:^|,) requires each
# match to begin where the previous one ended, at the start of the line or
# at a comma. On each match $1 holds the inside of a double-quoted field
# (embedded quotes still doubled) or $2 holds an unquoted field.
while ($line =~ m{
          \G(?:^|,)
          (?:
             # Either a double-quoted field (with "" for each ")...
             " # field's opening quote
              ( (?> [^"]* ) (?> "" [^"]* )*  )
             " # field's closing quote
           # ..or...
           |
             # ... some non-quote/non-comma text....
             ( [^",]* )
          )
      }gx)
{
   if (defined $2) {
       $field = $2;        # unquoted field: usable as is
   } else {
       $field = $1;
       $field =~ s/""/"/g; # collapse each doubled quote to a single quote
   }
   print "[$field]"; # print the field, for debugging
   # Can work with $field now . . . 
}

Chapter 6; page 274 (download)

years = days /x divide x//365; /x assume non-leap year x/

Chapter 6; page 276 (download)

const char *cstart = "/*", *cend = "*/";

Chapter 6; page 277 (download)

$prog =~ s{/\*[^*]*\*+(?:[^/*][^*]*\*+)*/}{}g; # remove C comments (and more!)

Chapter 6; page 277 (download)

char *CommentStart = "/*";  /* start of comment */
char *CommentEnd   = "*/";  /* end of comment */

Chapter 6; page 277 (download)

$COMMENT = qr{/\*[^*]*\*+(?:[^/*][^*]*\*+)*/}; # regex to match a comment
$DOUBLE = qr{"(?:\\.|[^\\"])*"};# regex to match double-quoted string
$text =~ s/$DOUBLE|$COMMENT//g;

Chapter 6; page 278 (download)

$COMMENT = qr{/\*[^*]*\*+(?:[^/*][^*]*\*+)*/}; # regex to match a comment
$DOUBLE = qr{"(?:\\.|[^\\"])*"};# Regex to match double-quoted string
$text =~ s/($DOUBLE)|$COMMENT/$1/g;

Chapter 6; page 278 (download)

$text =~ s/($DOUBLE)|$COMMENT/defined($1) ? $1 : ""/ge;

Chapter 6; page 278 (download)

$COMMENT = qr{/\*[^*]*\*+(?:[^/*][^*]*\*+)*/}; # regex to match a comment
$COMMENT2 = qr{//[^\n]*};  # regex to match a C++ // comment
$DOUBLE = qr{"(?:\\.|[^\\"])*"};# regex to match double-quoted string
$SINGLE = qr{'(?:\\.|[^'\\])*'};# regex to match single-quoted string

$text =~ s/($DOUBLE|$SINGLE)|$COMMENT|$COMMENT2/$1/g;

Chapter 6; page 279 (download)

$OTHER = qr{[^"'/]};  # Stuff that couldn't possibly begin one of the other alternatives
  
$text =~ s/($DOUBLE|$SINGLE|$OTHER+)|$COMMENT|$COMMENT2/$1/g;

Chapter 6; page 281 (download)

$DOUBLE = qr{"[^\\"]*(?:\\.[^\\"]*)*"};
$SINGLE = qr{'[^'\\]*(?:\\.[^'\\]*)*'};

Chapter 6; page 281 (download)

([^"'/]+|"[^\\"]*(?:\\.[^\\"]*)*"[^"'/]*|'[^'\\]*
(?:\\.[^'\\]*)*'[^"'/]*)|/\*[^*]*\*+(?:[^/*][^*]*\*+)*/|//[^\n]*

Chapter 7; page 289 (download)

$MatchField = "^Subject:"; # Normal string assignment
   
if ($text =~ $MatchField) {

Chapter 7; page 289 (download)

$text =~ $MatchField

Chapter 7; page 289 (download)

$text =~ m/$MatchField/

Chapter 7; page 290 (download)

use Config;
print "$Config{privlib}/unicore/UnicodeData.txt\n";

Chapter 7; page 291 (download)

m{
    regex  # comments
    here   # here
}x;

Chapter 7; page 292 (download)

$text =~ m/.../;
$text =~  /.../;

Chapter 7; page 294 (download)

$s = expression one;
@a = expression two;

Chapter 7; page 295 (download)

$var = ($this, &is, 0xA, 'list');

Chapter 7; page 296 (download)

{
    local($Acme::Widget::Debug) = 1; # Ensure it's turned on
    # work with Acme::Widget while debugging is on
      
}
# $Acme::Widget::Debug is now back to whatever it had been before

Chapter 7; page 297 (download)

{
    local $^W = 0; # Ensure warnings are off.
    UnrulyFunction(...);
}
# Exiting the block restores the original value of $^W.

Chapter 7; page 298 (download)

if (m/(...)/)
{
    DoSomeOtherStuff();
    print "the matched text was $1.\n";
}

Chapter 7; page 299 (download)

if ($result =~ m/ERROR=(.*)/) {
   warn "Hey, tell $Config{perladmin} about $1!\n";
}

Chapter 7; page 299 (download)

# Nested-capture demonstration. Groups are numbered by their opening
# parenthesis, so after this match:
#   $1 is "3.14159" (the whole alternation),
#   $2 is undef     (the tasty|fattening alternative did not take part),
#   $3 is "3.14159" (the number),
#   $4 is ".14159"  (the optional fractional part).
"Pi is 3.14159, roughly" =~ m/\b((tasty|fattening)|(\d+(\.\d*)?))\b/;

Chapter 7; page 301 (download)

$url =~ m{
   href \s* = \s*   # Match the "href = " part, then the value . . . 
   (?: "([^"]*)"    # a double-quoted value, or . . . 
     | '([^']*)'    # a single-quoted value, or . . . 
     | ([^'"<>]+) ) # an unquoted value.
}ix;

Chapter 7; page 302 (download)

$text = "Version 6 coming soon?";
   
$text =~ m/\d+/;

Chapter 7; page 302 (download)

1 while $line =~ s/\t/' ' x (8 - $-[0] % 8)/e;

Chapter 7; page 303 (download)

my $HostnameRegex = qr/[-a-z0-9]+(?:\.[-a-z0-9]+)*\.(?:com|edu|info)/i;

my $HttpUrl = qr{
   http:// $HostnameRegex \b  # Hostname
   (?:
        / [-a-z0-9_:\@&?=+,.!/~*'%\$]* # Optional path
           (?<![.,?!])                 # Not allowed to end with [.,?!]
   )?
}ix;

Chapter 7; page 304 (download)

if ($text =~ $HttpUrl) {
   print "There is a URL\n";
}

Chapter 7; page 304 (download)

while ($text =~ m/($HttpUrl)/g) {
   print "Found URL: $1\n";
}

Chapter 7; page 304 (download)

my $HostnameRegex = qr{
   # One or more dot-separated parts...
   (?: [a-z0-9]\. | [a-z0-9][-a-z0-9]{0,61}[a-z0-9]\. )*
   # Followed by the final suffix part...
   (?: com|edu|gov|int|mil|net|org|biz|info|...|aero|[a-z][a-z] )
}xi;

Chapter 7; page 304 (download)

my $WordRegex = qr/\b \w+ \b/; # Oops, missing the /x modifier!
   
if ($text =~ m/^($WordRegex)/x) {
    print "found word at start of text: $1\n";
}

Chapter 7; page 305 (download)

my $WordRegex = qr/\b \w+ \b/x;  # This works!
   
if ($text =~ m/^($WordRegex)/) {
    print "found word at start of text: $1\n";
}

Chapter 7; page 305 (download)

my $WordRegex = '\b \w+ \b';  # Normal string assignment
   
if ($text =~ m/^($WordRegex)/x) {
    print "found word at start of text: $1\n";
}

Chapter 7; page 305 (download)

my $WordRegex = '(?x:\b \w+ \b)'; # Normal string assignment
   
if ($text =~ m/^($WordRegex)/) {
    print "found word at start of text: $1\n";
}

Chapter 7; page 306 (download)

(?ix-sm:
   http:// (?ix-sm:
   # One or more dot-separated parts...
   (?: [a-z0-9]\. | [a-z0-9][-a-z0-9]{0,61}[a-z0-9]\. )*
   # Followed by the final suffix part...
   (?: com|edu|gov|int|mil|net|org|biz|info|...|aero|[a-z][a-z] )
) \b          # hostname
   (?:
        / [-a-z0-9_:\@&?=+,.!/~*'%\$]* # Optional path
           (?<![.,?!])                 # Not allowed to end with [.,?!]
   )?
)

Chapter 7; page 306 (download)

$text =~ m/regex/

Chapter 7; page 307 (download)

StringOperand =~ RegexOperand

Chapter 7; page 307 (download)

my $regex = qr/regex/;
  
if ($text =~ $regex) {

Chapter 7; page 307 (download)

if ($text =~ m/$regex/) {

Chapter 7; page 308 (download)

$text =~ m/.../;   # Just do it, presumably, for the side effects.
 
if ($text =~ m/.../) {
  # Do code if match is successful
 
 
$result = ( $text =~ m/.../ ); # Set $result to result of match against $text
$result =   $text =~ m/.../  ; # Same thing; =~ has higher precedence than = 
 
  $copy = $text;             # Copy $text to $copy ...
  $copy           =~ m/.../;# ... and perform match on $copy
( $copy = $text ) =~ m/.../;# Same thing in one expression

Chapter 7; page 309 (download)

$text =~ m/regex/;

Chapter 7; page 309 (download)

$text = m/regex/;

Chapter 7; page 309 (download)

$text =        m/regex/;
$text = ($_ =~ m/regex/);

Chapter 7; page 309 (download)

while (<>)
{
   if (m/.../) {
     
   } elsif (m/.../) {

Chapter 7; page 309 (download)

if ($text !~ m/.../)

if (not $text =~ m/.../)

unless ($text =~ m/.../)

Chapter 7; page 310 (download)

if ($target =~ m/.../) {
    #  . . . processing after successful match . . . 
 
} else {
    #  . . . processing after unsuccessful match . . . 
 
}

Chapter 7; page 310 (download)

my $success  =  $target =~ m/.../;
  
if ($success) {
  
}

Chapter 7; page 310 (download)

my ($year, $month, $day)  =  $date =~ m{^ (\d+) / (\d+) / (\d+) $}x;

Chapter 7; page 310 (download)

my @parts  =  $text =~ m/^(\d+)-(\d+)-(\d+)$/;

Chapter 7; page 310 (download)

my ($word)   =  $text =~ m/(\w+)/;
my $success  =  $text =~ m/(\w+)/;

Chapter 7; page 311 (download)

if ( my ($year, $month, $day) = $date =~ m{^ (\d+) / (\d+) / (\d+) $}x ) {
    # Process for when we have a match: $year and such are available
} else {
    # here if no match . . . 
}

Chapter 7; page 311 (download)

my @nums  =  $text =~ m/\d+/g;

Chapter 7; page 311 (download)

my $hex_ip = join '', map { sprintf("%02x", $_) } $ip =~ m/\d+/g;

Chapter 7; page 311 (download)

my $ip = join '.', map { hex($_) } $hex_ip =~ m/../g

Chapter 7; page 311 (download)

my @nums  =  $text =~ m/\d+(?:\.\d+)?|\.\d+/g;

Chapter 7; page 311 (download)

my @Tags  =  $Html =~ m/<(\w+)/g;

Chapter 7; page 312 (download)

alias  Jeff      jfriedl@regex.info
alias  Perlbug   perl5-porters@perl.org
alias  Prez      president@whitehouse.gov

Chapter 7; page 312 (download)

( 'Jeff', 'jfriedl@regex.info', 'Perlbug',
  'perl5-porters@perl.org', 'Prez', 'president@whitehouse.gov' )

Chapter 7; page 312 (download)

my %alias  =  $text =~ m/^alias\s+(\S+)\s+(.+)/mg;

Chapter 7; page 312 (download)

$text = "WOW! This is a SILLY test.";

$text =~ m/\b([a-z]+\b)/g;
print "The first all-lowercase word: $1\n";

$text =~ m/\b([A-Z]+\b)/g;
print "The subsequent all-uppercase word: $1\n";

Chapter 7; page 313 (download)

while ($ConfigData =~ m/^(\w+)=(.*)/mg) {
    my($key, $value) = ($1, $2);
      
}

Chapter 7; page 313 (download)

while ($text =~ m/(\d+)/) { # dangerous!
    print "found: $1\n";
}

Chapter 7; page 313 (download)

while ($text =~ m/(\d+)/g) {
    print "found: $1\n";
}

Chapter 7; page 314 (download)

my $ip = "64.156.215.240";
while ($ip =~ m/(\d+)/g) {
   printf "found '$1' ending at location %d\n", pos($ip);
}

Chapter 7; page 314 (download)

if ($logline =~ m/^.{32}(\S+)/) {
    $RequestedPage = $1;
}

Chapter 7; page 314 (download)

pos($logline) = 32; # The page starts at the 32nd character, so start the next match there . . . 
if ($logline =~ m/(\S+)/g) {
    $RequestedPage = $1;
}

Chapter 7; page 315 (download)

pos($logline) = 32; # The page starts at the 32nd character, so start the next match there . . . 
if ($logline =~ m/\G(\S+)/g) {
    $RequestedPage = $1;
}

Chapter 7; page 315 (download)

while (not $html =~ m/\G\z/gc) # While we haven't worked to the end . . . 
{
  if    ($html =~ m/\G( <[^>]+>   )/xgc) { print "TAG: $1\n"            }
  elsif ($html =~ m/\G( &\w+;     )/xgc) { print "NAMED ENTITY: $1\n"   }
  elsif ($html =~ m/\G( &\#\d+;   )/xgc) { print "NUMERIC ENTITY: $1\n" }
  elsif ($html =~ m/\G( [^<>&\n]+ )/xgc) { print "TEXT: $1\n"           }
  elsif ($html =~ m/\G  \n         /xgc) { print "NEWLINE\n"            }
  elsif ($html =~ m/\G( .         )/xgc) { print "ILLEGAL CHAR: $1\n"   }
  else {
      die "$0: oops, this shouldn't happen!";
  }
}

Chapter 7; page 316 (download)

$html =~ m/\G ( <script[^>]*>.*?</script> )/xgcsi

Chapter 7; page 318 (download)

while ("Larry Curly Moe" =~ m/\w+/g) {
   print "WHILE stooge is $&.\n";
}
print "\n";

if ("Larry Curly Moe" =~ m/\w+/g) {
   print "IF stooge is $&.\n";
}
print "\n";

foreach ("Larry Curly Moe" =~ m/\w+/g) {
   print "FOREACH stooge is $&.\n";
}

Chapter 7; page 318 (download)

$text =~ s/regex/replacement/modifiers

Chapter 7; page 319 (download)

$text =~ s{
  ...some big regex here, with lots of comments and such...
} {
  ...a Perl code snippet to be evaluated to produce the replacement text...
}ex;

Chapter 7; page 319 (download)

$text =~ s/-time-/localtime/ge;

Chapter 7; page 320 (download)

$url =~ s/([^a-zA-Z0-9])/sprintf('%%%02x', ord($1))/ge;

Chapter 7; page 320 (download)

$url =~ s/%([0-9a-f][0-9a-f])/pack("C", hex($1))/ige;

Chapter 7; page 321 (download)

$data =~ s/(\$[a-zA-Z_]\w*)/$1/eeg;

Chapter 7; page 321 (download)

@Paragraphs = split(m/\s*<p>\s*/i, $html);

Chapter 7; page 321 (download)

@Lines = split(m/^/m, $lines);

Chapter 7; page 322 (download)

split(match operand, target string, chunk-limit operand)

Chapter 7; page 322 (download)

($var1, $var2, $var3, ...) = split(...);
 
@array = split(...);
 
for my $item (split(...)) {
   
}

Chapter 7; page 323 (download)

( 'IO.SYS', '225558', '95-10-03:-a-sh:optional' )

Chapter 7; page 323 (download)

('IO.SYS', '225558', '95-10-03', '-a-sh:optional')

Chapter 7; page 323 (download)

($filename, $size, $date) = split(/:/, $text);

Chapter 7; page 324 (download)

@nums = split(m/:/, "12:34::78");

Chapter 7; page 324 (download)

("12", "34", "", "78")

Chapter 7; page 324 (download)

@nums = split(m/:/, "12:34::78:::");

Chapter 7; page 324 (download)

("12", "34", "", "78")

Chapter 7; page 324 (download)

my @NonEmpty = grep { length } split(/:/, $text);

Chapter 7; page 324 (download)

@nums = split(m/:/, ":12:34::78");

Chapter 7; page 325 (download)

("", "12", "34", "", "78")

Chapter 7; page 326 (download)

... and <B>very <FONT color=red>very</FONT> much</B> effort...

Chapter 7; page 326 (download)

( '... and ', '<B>', 'very ', '<FONT color=red>',
  'very', '</FONT>', ' much', '</B>', ' effort...' )

Chapter 7; page 326 (download)

( '... and ', 'very ', 'very', ' much', ' effort...' )

Chapter 7; page 327 (download)

"have a nice day" =~ m{
   (?{ print "Starting match.\n" })
   \b(?: the | an | a )\b
}x;

Chapter 7; page 328 (download)

my $Level0 = qr/ \(  ( [^()] )*  \) /x; # Parenthesized text
  
if ($text =~ m/\b( \w+$Level0 )/x) {
   print "found function call: $1\n";
}

Chapter 7; page 329 (download)

my $Level0 = qr/ \(  ( [^()]          )*  \) /x; # Parenthesized text
my $Level1 = qr/ \(  ( [^()]| $Level0 )*  \) /x; # One level of nesting

Chapter 7; page 329 (download)

my $Level0 = qr/ \(  ( [^()]           )*  \) /x; # Parenthesized text
my $Level1 = qr/ \(  ( [^()] | $Level0 )*  \) /x; # One level of nesting
my $Level2 = qr/ \(  ( [^()] | $Level1 )*  \) /x; # Two levels of nesting

Chapter 7; page 329 (download)

my $Level3 = qr/ \(  ( [^()] | $Level2 )*  \) /x; # Three levels of nesting
my $Level4 = qr/ \(  ( [^()] | $Level3 )*  \) /x; # Four levels of nesting
my $Level5 = qr/ \(  ( [^()] | $Level4 )*  \) /x; # Five levels of nesting

Chapter 7; page 330 (download)

my $LevelN; # This must be predeclared because it's used in its own definition.
$LevelN = qr/ \(( [^()] | (??{ $LevelN }) )* \) /x;

Chapter 7; page 330 (download)

if ($text =~ m/\b( \w+$LevelN )/x) {
   print "found function call: $1\n";
}

Chapter 7; page 330 (download)

$LevelN = qr/ (?> [^()]+ | \( (??{ $LevelN }) \)  )* /x;

Chapter 7; page 331 (download)

if ($text =~ m/\b( \w+ \( $LevelN \) )/x) {
   print "found function call: $1\n";
}
 
if (not $text =~ m/^ $LevelN $/x) {
   print "mismatched parentheses!\n";
}

Chapter 7; page 331 (download)

"abcdefgh" =~ m{
  (?{ print "starting match at [$`|$']\n" })
  (?:d|e|f)
}x;

Chapter 7; page 331 (download)

print "starting match at [$`|$']\n"

Chapter 7; page 332 (download)

(?{ print "matched at [$`<$&>$']\n" })

Chapter 7; page 332 (download)

"abcdefgh" =~ m{
  (?{  print "starting match at [$`|$']\n" })
  [def]
}x;

Chapter 7; page 332 (download)

panic: top_env

Chapter 7; page 332 (download)

"oneselfsufficient" =~ m{
    one(self)?(selfsufficient)?
   (?{ print "matched at [$`<$&>$']\n" })
}x;

Chapter 7; page 333 (download)

"123" =~ m{
   \d+
   (?{ print "matched at [$`<$&>$']\n" })
   (?!)
}x;

Chapter 7; page 334 (download)

$longest_match = undef; # We'll keep track of the longest match here

"oneselfsufficient" =~ m{
   one(self)?(selfsufficient)?
   (?{
      # Check to see if the current match ($&) is the longest so far
      if (not defined($longest_match)
          or
          length($&) > length($longest_match))
      {
          $longest_match = $&;
      }
   })
   (?!) # Force failure so we'll backtrack to find further "matches"
}x;

# Now report the accumulated result, if any
if (defined($longest_match)) {
   print "longest match=[$longest_match]\n";
} else {
   print "no match\n";
}

Chapter 7; page 334 (download)

my $RecordPossibleMatch = qr{
   (?{
      # Check to see if the current match ($&) is the longest so far
      if (not defined($longest_match)
          or
          length($&) > length($longest_match))
      {
          $longest_match = $&;
      }
   })
   (?!) # Force failure so we'll backtrack to find further "matches"
}x;

Chapter 7; page 335 (download)

$longest_match = undef; # We'll keep track of the longest match here

"800-998-9938" =~ m{  \d+  $RecordPossibleMatch  }x;

# Now report the accumulated result, if any
if (defined($longest_match)) {
   print "longest match=[$longest_match]\n";
} else {
   print "no match\n";
}

Chapter 7; page 335 (download)

my $BailIfAnyMatch = qr/(?(?{ defined $longest_match})(?!))/;

Chapter 7; page 335 (download)

"800-998-9938" =~ m{ $BailIfAnyMatch  \d+  $RecordPossibleMatch  }x;

Chapter 7; page 336 (download)

my $Count = 0;

$text =~ m{
   ^ (?> \d+ (?{ $Count++ }) \b | \w+ | \s+ )* $
}x;

Chapter 7; page 336 (download)

our $Count = 0;

$text =~ m{
 ^ (?> \d+ (?{ local($Count) = $Count + 1 }) \b | \w+ | \s+ )* $
}x;

Chapter 7; page 337 (download)

m{   (?{ print "starting\n" })  some regex...  }x;

Chapter 7; page 337 (download)

my $ShowStart = '(?{ print "starting\n" })';
    
m{ $ShowStart some regex...  }x;

Chapter 7; page 337 (download)

use re 'eval';

Chapter 7; page 338 (download)

my $Count = undef;
our $TmpCount = 0;

$text =~ m{
 ^ (?> \d+ (?{ local($TmpCount) = $TmpCount + 1 }) \b | \w+ | \s+ )* $
 (?{ $Count = $TmpCount }) # Save the "ending" $Count to a non-localized variable
}x;
if (defined $Count) {
    print "Count is $Count.\n";
} else {
    print "no match\n";
}

Chapter 7; page 338 (download)

# Report where the regex engine first attempts the test regex (\d) within
# the given text. An embedded code block at the very start of the regex
# records $-[0] the first time it runs; one of three messages is then
# printed, depending on whether that position was never recorded, was 0,
# or was some later offset.
sub CheckOptimizer
{
    my $text  = shift; # The first argument is the text to check.
    my $start = undef; # We'll note here where the regex is first applied.

    my $match = $text =~ m{
      (?{ $start = $-[0] if not defined $start}) # Save the first starting position
      \d # This is the regex being tested
    }x;

    # $start is still undef only if the embedded code never ran at all.
    if (not defined $start) {
        print "The whole match was optimized away.\n";
        if ($match) {
            # This can't possibly happen!
            print "Whoa, but it matched! How can this happen!?\n";
        }
    } elsif ($start == 0) {
        print "The match start was not optimized.\n";
    } else {
        print "The optimizer started the match at character $start.\n"
    }
}

Chapter 7; page 339 (download)

CheckOptimizer("test 123");

Chapter 7; page 339 (download)

The optimizer started the match at character 5.

Chapter 7; page 339 (download)

The whole match was optimized away.
Whoa, but it matched! How can this happen!?

Chapter 7; page 340 (download)

my $NestedGuts = qr{
  (?>
    (?:
       # Stuff not parenthesis
        [^()]+
       # An opening parenthesis
       |  \(
       # A closing parenthesis
       |  \)
    )*
  )
}x;

Chapter 7; page 340 (download)

(?{ local $OpenParens = 0 })

Chapter 7; page 340 (download)

(?{ $OpenParens++ })

Chapter 7; page 340 (download)

(?(?{ $OpenParens }) (?{ $OpenParens-- }) | (?!) )

Chapter 7; page 341 (download)

(?(?{ $OpenParens != 0 })(?!))

Chapter 7; page 341 (download)

my $NestedGuts = qr{
  (?{ local $OpenParens = 0 }) #  Counts the number of nested opens waiting to close.
  (?> # atomic-grouping for efficiency
     (?:
        # Stuff not parenthesis
          [^()]+
        #  An opening parenthesis
        |  \(   (?{ $OpenParens++ })
        #  Allow a closing parenthesis, if we're expecting any
        |  \)  (?(?{ $OpenParens != 0 }) (?{ $OpenParens-- }) | (?!) )
     )*
  )
  (?(?{ $OpenParens != 0 })(?!)) #  If there are any open parens left, don't finish
}x;

Chapter 7; page 342 (download)

# Rewrite the word-boundary escapes \< and \> in the text of a regex literal
# as lookaround constructs. Takes the literal as a string and returns the
# (possibly rewritten) string.
sub MungeRegexLiteral($)
{
   my $Text = shift; # The regex literal, as a plain string
   for ($Text) {     # Alias $_ to $Text so both substitutions act on it
       s/\\</(?<!\\w)(?=\\w)/g; # \< becomes a start-of-word test
       s/\\>/(?<=\\w)(?!\\w)/g; # \> becomes an end-of-word test
   }
   return $Text;
}

Chapter 7; page 342 (download)

package MyRegexStuff; # Best to call the package something unique
use strict;   # Good practice to always use this
use warnings; # Good practice to always use this
use overload; # Allows us to invoke Perl's overloading mechanism
# Have our regex handler installed when we're use'd . . . .
sub import { overload::constant qr => \&MungeRegexLiteral }

# Rewrites the word-boundary metacharacters \< and \> in the text of a regex
# literal into lookaround constructs.
#
# Argument: the text of the regex literal, as a string.
# Returns:  the text with \< replaced by (?<!\w)(?=\w) and \> replaced by
#           (?<=\w)(?!\w); every other backslash escape is passed through
#           unchanged.
sub MungeRegexLiteral($)
{
   my ($RegexLiteral) = @_; # Argument is a string
   my %Boundary = (
      '<' => '(?<!\w)(?=\w)', # Mimic \< as start-of-word boundary
      '>' => '(?<=\w)(?!\w)', # Mimic \> as end-of-word boundary
   );
   # Consume escapes one backslash-plus-character pair at a time, left to
   # right, so the second backslash of an escaped backslash (as in \\<) is
   # never mistaken for the start of \< or \>.
   $RegexLiteral =~ s{\\(.)}{ exists $Boundary{$1} ? $Boundary{$1} : "\\$1" }gse;
   return $RegexLiteral; # Return possibly-modified string
}

1; # Standard idiom so that a 'use' of this file returns something true

Chapter 7; page 342 (download)

use lib '.';      # Look for library files in the current directory
use MyRegexStuff; # We now have our new functionality available!
   
# The \< below is expected to be rewritten by MyRegexStuff's regex-literal
# handler before Perl compiles the pattern.
$text =~ s/\s+\</ /g; # Normalize any type of whitespace before a word to a single space

Chapter 7; page 343 (download)

# Rewrite a possessively quantified parenthesized group -- "(...)" followed by
# one of * + ? and then an extra "+" -- as the atomic group (?>...) wrapped
# around the group and its ordinary quantifier.
# NOTE(review): $LevelN is defined outside this listing; presumably it matches
# the (possibly nested) text between the parentheses -- confirm.
$RegexLiteral =~ s/(  \( $LevelN \)[*+?]  )\+/(?>$1)/gx;

Chapter 7; page 343 (download)

# The pattern matches an opening quote, then any mix of backslash-escaped
# characters and non-quote characters, then a closing quote. The possessive
# *+ stops the engine from backtracking into the body of the string.
# There is no /g, so only the first double-quoted string in $text is removed.
$text =~ s/"(\\.|[^"])*+"//; # Remove double-quoted strings

Chapter 7; page 343 (download)

# Rewrite every possessive quantifier in $RegexLiteral -- a quantified item
# followed by an extra "+" -- as an atomic group around the item and its
# ordinary quantifier; for example, x*+ becomes (?>x*).
# NOTE(review): $LevelN is defined outside this listing; presumably it matches
# the text inside a balanced pair of parentheses -- confirm.
$RegexLiteral =~ s{
 (
   # Match something that can be quantified . . . 
   (?:  \\[\\abCdDefnrsStwWX] # \n, \w, etc.
     |  \\c.                  # \cA
     |  \\x[\da-fA-F]{1,2}    # \xFF
     |  \\x\{[\da-fA-F]*\}    # \x{1234}
     |  \\[pP]\{[^{}]+\}      # \p{Letter}
     |  \[\]?[^]]+\]          # "poor man's" class
     |  \\\W                  # \*
     |  \( $LevelN \)         # (...)
     |  [^()*+?\\]            # almost anything else
    )
    #  . . . and is quantified . . . 
    (?: [*+?] | \{\d+(?:,\d*)?\} )
 )
 \+  #  . . . and has an extra '+' after the quantifier.
}{(?>$1)}gx;

Chapter 7; page 345 (download)

# $SaveUrl matches whatever $HttpUrl matches and, as a side effect of the
# match, copies the text of the most recently closed capture group ($^N)
# into $url.
# NOTE(review): $HttpUrl and $url are defined outside this listing.
my $SaveUrl = qr{
    ($HttpUrl)         # Match an HTTP URL . . . 
    (?{ $url = $^N })  #  . . . and save to $url
}x;

# Match a URL that follows "http =" or "src =", case-insensitively and with
# optional whitespace around the "=".
# NOTE(review): "http" is not an HTML attribute name; "href" may have been
# intended here -- confirm against the book's errata before changing it.
$text =~ m{
   http \s*=\s* ($SaveUrl)
 | src  \s*=\s* ($SaveUrl)
}xi;

Chapter 7; page 345 (download)

package MyRegexStuff;
use strict;
use warnings;
use overload;
# When this package is use'd, register MungeRegexLiteral as the handler that
# is handed the text of regex literals.
sub import { overload::constant('qr' => \&MungeRegexLiteral) }

# Matches a (possibly empty) stretch of regex text in which all parentheses
# are balanced. Escaped characters and #-to-end-of-line comments are consumed
# as units, and nested parentheses are handled by the pattern referring to
# itself through (??{ ... }).
my $NestedStuffRegex; # This should be predeclared, because it's used in its own definition.
$NestedStuffRegex = qr{
 (?>
   (?:  # Stuff not parens, not '#', and not an escape . . . 
        [^()\#\\]+
        # Escaped stuff . . . 
      | (?s: \\. )
        # Regex comment . . . 
      | \#.*\n
        # Matching parens, with more nested stuff inside . . . 
      |  \(  (??{ $NestedStuffRegex })   \)
   )*
 )
}x;

# Rewrites every (?<id>guts) construct in a regex string into a plain
# capturing group followed by embedded code that records the captured text
# in $^T{'id'}. The guts are converted recursively, so named groups nested
# inside other named groups are rewritten as well.
#
# Argument: the regex text, as a string.
# Returns:  the converted regex text (unchanged if it has no such construct).
sub SimpleConvert($); # This must be predeclared, as it's used recursively
sub SimpleConvert($)
{
  my $re = shift;  # Regex to mangle
  $re =~ s{
      \(\?                    #  "(?"
        <  ( (?>\w+) ) >      #     < $1 > $1 is an identifier
        ( $NestedStuffRegex ) #     $2 - possibly-nested stuff
      \)                      #  ")"
  }{
    # Save $1 before recursing: the recursive call runs its own match.
    my $id   = $1;
    my $guts = SimpleConvert($2);
    # We change
    #    (?<id>guts)
    # to
    #    (?: (guts)  # match the guts
    #        (?{
    #           local($^T{$id}) = $^N   # Save to a localized element of %^T
    #         })
    #     )
    "(?:($guts)(?{ local(\$^T{'$id'}) = \$^N }))"
  }xeog;
  return $re;  # Return mangled regex
}

# Handler for regex literals: converts named captures with SimpleConvert and,
# only if that changed the text, wraps the result in embedded code that
# starts with an empty localized %^T and, once the wrapped regex has matched,
# copies %^T into %^N.
#
# Argument: the text of the regex literal, as a string.
# Returns:  the possibly rewritten regex text.
sub MungeRegexLiteral($)
{
  my ($RegexLiteral) = @_; # Argument is a string
  # print "BEFORE: $RegexLiteral\n"; # Uncomment this for debugging
  my $new = SimpleConvert($RegexLiteral);
  if ($new ne $RegexLiteral)
  {
     my $before = q/(?{ local(%^T) = () })/; # Localize temporary hash
     my $after  = q/(?{ %^N = %^T       })/; # Copy temp to "real" hash
     $RegexLiteral = "$before(?:$new)$after";
  }
  # print "AFTER:  $RegexLiteral\n"; # Uncomment this for debugging
  return $RegexLiteral;
}

1;