\n$1%mg; s%?(table|tr|td|tbody|div|center)[^>]*>\n?%%g; } open(OUT, ">$file.new") or die "Unable to write $file.new\n"; print OUT $_; close(OUT); rename("$file.new", $file); }
# cleanPage: applies a fixed sequence of substitutions to the text in $_,
# editing it in place (none of the s%%% operators below is bound to another
# variable). Takes no arguments; the last statement is a substitution, so
# the sub's return value is that substitution's result.
# NOTE(review): several patterns below look as if part of their text is
# missing (they begin with a bare "]" or consist only of ".*?") -- presumably
# markup was lost from the regex literals; confirm against a pristine copy
# of this script before relying on the descriptions here.
sub cleanPage {
# Remove whitespace runs that end at a line end. Because \s also matches
# newlines, this collapses blank lines and drops any trailing newline too.
s%\s+$%%gm;
# Remove leading spaces and tabs from every line.
s%^[ \t]+%%mg;
# As written, the pattern is only a lazy ".*?", which matches the empty
# string at every position, so this substitution changes nothing.
# NOTE(review): presumably this once deleted a delimited region -- confirm.
s%.*?%%igs;
# Remove the first (case-insensitive) occurrence of: a newline, an optional
# run of "]", then src = "<prefix>.jpg ... > and, optionally, the whitespace
# up to and including a following newline. $prefix is interpolated here and
# is not declared inside this sub, so it must come from an enclosing scope.
# The line breaks inside this pattern are literal and are part of the match.
s%
]*src *= *"?$prefix\.jpg[^>]*>(\s*
)?%%i;
# Replace each run of "]" followed by ">" with $1; the lazy capture at the
# end of the pattern always matches empty, so the run is simply deleted.
# NOTE(review): looks like it was meant to unwrap an element and keep its
# inner text -- confirm.
s%]+>(.*?)%$1%ig;
# Delete a "]"-run + ">" followed by one of the navigation labels
# Back/Next/Framed/Contents, along with an optional " | " separator after it.
s%]+>(Back|Next|Framed|Contents)( *\| *)?%%ig;
# Replace any trailing whitespace at the end of the text with one newline,
# so the text always ends in exactly one "\n".
s%\s*$%\n%;
}
sub tweakPunctuation
{
# Obfuscate certain characters in the HTML tags (restored later).
1 while s/(<[^>]*)`/$1\x0F/;
1 while s/(<[^>]*)"/$1\x0E/;
1 while s/(<[^>]*)'/$1\x0D/;
1 while s/(<[^>]*)&/$1\x0C/;
1 while s/(<[^>]*)-/$1\x0B/;
# Make regex matching easier and the final file size smaller.
s/"?/"/g;
s/
?/\x85/g;
s/?/\x91/g;
s/?/\x92/g;
s/?/\x93/g;
s/?/\x94/g;
s/?/\x97/g;
s/ ?/\xA0/g;
# We make our life easier by putting each paragraph on a single line.
s%\n+([^<])% $1%g;
# Now, transform the blah ASCII into more interesting punctuation.
s%`%\x91%g;
s%'%\x92%g;
s%\.\s*\.\s*\.%\x85%gs;
s%\x85(\s*[.!?]+["\x92\x94]+)%