# Walk through $html from start to end, accepting only a restricted subset of
# HTML: words/numbers, other plain text, character entities, <IMG ...> tags,
# and non-nested <A ...> ... </A> link anchors. Dies at the first construct
# that is not recognized, or at the end if an anchor was left open.
#
# Every match is anchored with \G to the position where the previous match
# ended. The /c modifier keeps pos($html) intact when an attempt fails, so
# the next alternative is tried at the same spot.
#
# NOTE(review): $html is expected to be set by surrounding code not shown
# here -- confirm against the caller.

my $need_close_anchor = 0; # True if we've seen <A>, but not its closing </A>.

while (not $html =~ m/\G\z/gc) # While we haven't worked our way to the end . . .
{
    if ($html =~ m/\G(\w+)/gc) {
        # . . . have a word or number in $1 -- can now check for profanity, for example . . .
    } elsif ($html =~ m/\G[^<>&\w]+/gc) {
        # Other non-HTML stuff -- simply allow it.
    } elsif ($html =~ m/\G<img\s+([^>]+)>/gci) {
        # . . . have an image tag (attributes in $1) -- can check that it's appropriate . . .
    } elsif (not $need_close_anchor and $html =~ m/\G<A\s+([^>]+)>/gci) {
        # . . . have a link anchor (attributes in $1) -- can validate it . . .
        $need_close_anchor = 1; # Note that we now need </A>
    } elsif ($need_close_anchor and $html =~ m{\G</A>}gci) {
        $need_close_anchor = 0; # Got what we needed; don't allow again
    } elsif ($html =~ m/\G&(#\d+|\w+);/gc) {
        # Allow entities like &gt; and &#123;
    } else {
        # Nothing matched at this point, so it must be an error. Note the location, and grab a dozen or so
        # characters from the HTML so that we can issue an informative error message.
        my $location = pos($html); # Note where the unexpected HTML starts.
        # /s lets '.' match newlines too, so the excerpt is taken verbatim.
        my ($badstuff) = $html =~ m/\G(.{1,12})/s;
        die "Unexpected HTML at position $location: $badstuff\n";
    }
}

# Make sure there's no dangling <A>
if ($need_close_anchor) {
    die "Missing final </A>";
}
-----------------------------------------------------------------------------
Copyright 1997-2025 Jeffrey Friedl