#!/usr/bin/perl
#
# A script for tweaking the HTML in Baen's WebScription books so that the
# books look/work nicer in their .lit format (as created by other software).
#
# Name: baen2lit
# Author: Wayne Davison
# Version: 1.0.0
#
# Feel free to use and distribute this script however you like.
#
# Type "baen2lit -h" to see a list of options.
#
# See http://www.baen.com/ws_faq.htm for more information on a WebScription.
#
# See http://www.readerworks.com/ for software that creates the .lit books.
#
# NOTE(review): this copy of the script reached us with its line breaks
# collapsed and with markup-like text (anything resembling an HTML tag or
# entity) stripped or decoded inside the code.  The layout has been
# restored, and statements whose meaning is fixed by the surrounding code
# have been repaired.  Substitutions that lost their tag text are marked
# "NOTE(review)" below and are kept as found; restore them from a pristine
# copy of the script before relying on the output.
#

use strict;
use warnings;
use Getopt::Std;

# getopts() stores each switch in the package variable $opt_<letter>.
#   -e  write the enhanced punctuation back out as HTML entities
#   -h  show the usage message
#   -l  leave the punctuation alone (skip tweakPunctuation)
our ($opt_e, $opt_h, $opt_l) = (0, 0, 0);
my $OPTS = 'ehl';

# Only files named like the pages of a book are processed: six leading
# digits, then an underscore-separated page number, "toc" or "c_" suffix.
my $BOOK_PAGE = qr/^\d\d\d\d\d\d.*_+(\d+|toc|c_)\.htm$/;

# The part of the current file's name before the first "_" or ".".
# Set by the main loop for each file and read by cleanPage().
my $prefix;

usage() if !getopts($OPTS) || $opt_h || !@ARGV;

# Expand wildcards ourselves when any argument contains one (for shells
# that do not), then keep only the names that look like book pages.
my @files;
if (grep { /[*?]/ } @ARGV) {
    foreach my $arg (@ARGV) {
        push @files, grep { /$BOOK_PAGE/ } glob $arg;
    }
}
else {
    @files = grep { /$BOOK_PAGE/ } @ARGV;
}

foreach my $file (@files) {
    print "Processing $file:\n";
    ($prefix) = $file =~ /^([^_.]+)/;

    # Slurp the whole page into $_, which is what cleanPage() and
    # tweakPunctuation() operate on.  $/ is only undefined for the read.
    open(my $in, '<', $file) or die "Unable to read $file: $!\n";
    $_ = do { local $/; <$in> };
    close($in);
    $_ = '' unless defined $_;

    cleanPage();
    tweakPunctuation() unless $opt_l;

    if ($file =~ /_c_/) {
        # Extra rules for the "_c_" page.
        # NOTE(review): both substitutions have lost their tag text (the
        # capture group is empty and the second pattern starts mid-way
        # through a character class).  Kept as found, with the line breaks
        # that appeared inside them written as \n.
        s%^()%\n\n\n$1%mg;
        s%]*>\n?%%g;
    }

    # Write to a side file first and only replace the original once the
    # new contents are safely on disk.
    my $tmp = "$file.new";
    open(my $out, '>', $tmp) or die "Unable to write $tmp: $!\n";
    print {$out} $_ or die "Unable to write $tmp: $!\n";
    close($out) or die "Unable to write $tmp: $!\n";
    rename($tmp, $file) or die "Unable to rename $tmp to $file: $!\n";
}

# cleanPage -- strip unwanted markup from the page held in $_.
#
# Takes no arguments and returns nothing useful; it edits $_ in place and
# reads the file-level $prefix.
sub cleanPage {
    # Drop trailing whitespace and leading indentation on every line.
    s%\s+$%%gm;
    s%^[ \t]+%%mg;

    # NOTE(review): the next three patterns have lost their tag text.  As
    # found, the first only ever matches the empty string, and the third
    # has an empty pattern (which Perl treats as "reuse the last
    # successfully matched pattern").  Kept as found.
    s%.*?%%igs;
    s%]*>\s*\s*%%s;
    s%%%gs;

    # Remove the first XML declaration.
    s%<\?xml [^>]*>\n?%%;

    # NOTE(review): tag text lost from the next four patterns; kept as
    # found, with the line breaks that appeared inside the second one
    # written as \n.  The second refers to "$prefix.jpg" in a src
    # attribute, and the fourth to the Back/Next/Framed/Contents link
    # labels.
    s%]*>\n?%%g;
    s%\n\n]*src *= *"?$prefix\.jpg[^>]*>(\s*\n)?%%i;
    s%]+>(.*?)%$1%ig;
    s%]+>(Back|Next|Framed|Contents)( *\| *)?%%ig;

    # Finish the page with exactly one newline.
    s%\s*$%\n%;
}

# tweakPunctuation -- turn plain ASCII punctuation in the text of $_ into
# typographic punctuation, stored as single bytes (\x85 ellipsis,
# \x91/\x92 single quotes, \x93/\x94 double quotes, \x97 dash, \xA0
# non-breaking space).
#
# Takes no arguments and returns nothing useful; it edits $_ in place.
# When $opt_e is set, the single bytes are written back out as HTML
# entities at the end.
sub tweakPunctuation {
    # Obfuscate certain characters in the HTML tags (restored later).
    # Each loop rewrites one occurrence per pass until none are left
    # inside any tag.
    1 while s/(<[^>]*)`/$1\x0F/;
    1 while s/(<[^>]*)"/$1\x0E/;
    1 while s/(<[^>]*)'/$1\x0D/;
    1 while s/(<[^>]*)&/$1\x0C/;
    1 while s/(<[^>]*)-/$1\x0B/;

    # Make regex matching easier and the final file size smaller.
    # NOTE(review): the entity text in these patterns had been decoded
    # into the characters themselves, leaving patterns that no longer
    # match entities.  The mappings are rebuilt from those characters;
    # named and numeric spellings are both accepted because the original
    # spelling is not recoverable here.  The trailing ";" stays optional,
    # as in the patterns as found.
    s/&(?:quot|#34(?!\d));?/"/g;
    s/&(?:hellip|#(?:133|8230)(?!\d));?/\x85/g;
    s/&(?:lsquo|#(?:145|8216)(?!\d));?/\x91/g;
    s/&(?:rsquo|#(?:146|8217)(?!\d));?/\x92/g;
    s/&(?:ldquo|#(?:147|8220)(?!\d));?/\x93/g;
    s/&(?:rdquo|#(?:148|8221)(?!\d));?/\x94/g;
    s/&(?:mdash|#(?:151|8212)(?!\d));?/\x97/g;
    s/&(?:nbsp|#160(?!\d));?/\xA0/g;

    # We make our life easier by putting each paragraph on a single line.
    s%\n+([^<])% $1%g;

    # Now, transform the blah ASCII into more interesting punctuation.
    s%`%\x91%g;
    s%'%\x92%g;
    s%\.\s*\.\s*\.%\x85%gs;
    # NOTE(review): as found, this replaces each match with itself;
    # something may be missing from the replacement.
    s%\x85(\s*[.!?]+["\x92\x94]+)%\x85$1%igs;
    s%[-\x96]{2,}%\x97%g;
    s%[-\x96\x97]([."\x92\x94]+)%\x97$1%ig;

    # NOTE(review): tag text lost from the next three patterns (empty
    # capture groups, and an empty pattern that Perl treats as "reuse the
    # last successfully matched pattern").  Kept as found.
    s%()%$1%ig;
    s%()%$1%ig;
    s%%%ig;

    # Pair up straight double quotes: first complete pairs on one line,
    # then openers after whitespace, then closers.
    s%"([^"\x93\x94\n]+)"%\x93$1\x94%g;
    s%(^|[\xA0\s])((?:<[^>]*>)*)"%$1$2\x93%gm;
    s%(\x93[^\x94"\n]+)["\x93]%$1\x94%g;
    s%\s*["\x94]((?:<[^>]*>)*;?)([\s)\xA0]|$)%\x94$1$2%gm;

    # Unmangle any obfuscated HTML tags.
    1 while s/(<[^>]*)\x0F/$1`/;
    1 while s/(<[^>]*)\x0E/$1"/;
    1 while s/(<[^>]*)\x0D/$1'/;
    1 while s/(<[^>]*)\x0C/$1&/;
    1 while s/(<[^>]*)\x0B/$1-/;

    if ($opt_e) {
        # Turn enhanced single-character punctuation into HTML entities.
        # NOTE(review): the replacement text had been decoded into the
        # characters themselves; named entities are used here -- confirm
        # the spelling against a pristine copy of the script.
        s/\x85/&hellip;/g;
        s/\x91/&lsquo;/g;
        s/\x92/&rsquo;/g;
        s/\x93/&ldquo;/g;
        s/\x94/&rdquo;/g;
        s/\x97/&mdash;/g;
        s/\xA0/&nbsp;/g;
    }
}

# NOTE(review): the definition below is cut off in this view and is left
# exactly as found; the script will not compile until it is restored.
sub usage { die <