#!/usr/bin/perl # pml2html converts a BOOK.pml file (unencrypted only) into a BOOK.html file # in the local directory with the following extra files also created: # # styles.css A simple style-sheet for the converted html # content.opf An opf file compatible with OEB standard # toc.ncx A table of contents # # This program is freely redistributable. # TODO: # - Can we do something with \T="NUM%" when it comes after some text? use strict; use warnings; use Getopt::Long; my %TAG = ( _ => 'h1', _0 => 'h1', _1 => 'h2', _2 => 'h3', _3 => 'h4', _4 => 'h5', a => 'a', b => 'b', i => 'i', o => 'strike', u => 'u', sb => 'sub', sp => 'sup', ); my %tag_attr = map { $_ => '' } keys %TAG; $tag_attr{_} = ' class="x"'; &Getopt::Long::Configure('bundling'); &usage if !&GetOptions( 'language|l=s' => \( my $language = 'English' ), 'max-indent|m=i' => \( my $max_indent = 4 ), 'shy' => \( my $soft_hyphen ), 'skip|s=i' => \( my $empty_skip_cnt = 0 ), 'nounspace|U' => \( my $no_unspace_Q ), 'help|h' => \( my $help_opt ), ); &usage() if $help_opt || @ARGV != 1; $soft_hyphen = $soft_hyphen ? '­' : ''; my $pml_fn = shift; &usage("Book suffix must be .pml\n") unless $pml_fn =~ /\.pml$/; # Read the whole book into $_. open PML, '<', $pml_fn or die "Unable to open $pml_fn: $!\n"; undef $/; $_ = ; $/ = "\n"; close PML; s{\r\n|\r} {\n}g; # Turn pseudo html tags into internally-understood escapes. s{}{\\FN="$1"}g; s{}{\\FN}g; s{}{\\SD="$1"}g; s{}{\\SD}g; s{&} {&}g; s{<} {<}g; s{>} {>}g; # Turn \\ into \ while leaving all \x escapes intact. This ensures # we can properly parse \\x (as \x) and \\\x (as \\x). s{\\(.)} { $1 eq '\\' ? '\' : "\\$1" }ge; s{\\-} {$soft_hyphen}g; s/\x85|\\a133/…/g; s/\x91|\\a145/‘/g; s/\x92|\\a146/’/g; s/\x93|\\a147/“/g; s/\x94|\\a148/”/g; s/\x96|\\a150/–/g; s/\x97|\\a151/—/g; s/\xA0|\\a160/ /g; s{\\a(\d{3})} {&#$1;}g; s{\\U([0-9a-f]{4})} {&#x$1;}g; # Check the input file to see if it has a consistent space following \Q="..." my $unspace_Q = $no_unspace_Q || /\\Q="[^"]+"([^ ]|\z)/ ? '' : ' '; my $line_num = 0; my @input = split /\n/, $_; undef $_; $pml_fn =~ s{.*/}{}; (my $html_fn = $pml_fn) =~ s/\.pml$/.html/; open HTML, '>', $html_fn or die $!; print HTML < EOT my(@toc, %images); my(@current, %tag_flag, %font_flag); my @font_stack = ( 'n' ); my $block_style = ''; my $empty_count = 0; my $allow_extra_empty = 0; my %paras = ( 0 => 1 ); my $title = 'Unknown'; my $author = 'Unknown'; my $publisher = 'Unknown'; my $copyright = 'Unknown'; my $isbn = 'Unknown'; while (&get_next_line) { my $line = $_; my $chapter_id = ''; my $check_for_indent = 1; my $out = $block_style ? '' : qq|

|; my $p_is_open = !$block_style; # This indicates that the current line needs a

. my $div_is_open = 0; # This indicates that the current line needs a . See also $block_style. my $line_is_effectively_empty = 1; s/^\s+$//; while ($_ ne '') { # Handle text that does not include any escapes. if (s/^([^\\]+)//) { my $txt = $1; # Spaces that come before any other text are significant. if ($check_for_indent) { if ($txt =~ s/^([ \t]+)//) { (my $spaces = $1) =~ s/\t/ /g; my $spaces_len = length($spaces); if ($block_style) { $out .= ' ' x $spaces_len; } else { if ($out =~ /

/) { $spaces_len += $1; } $spaces_len = $max_indent if $spaces_len > $max_indent; $paras{$spaces_len} = 1; $out =~ s/

/

/; } } next if $txt eq ''; $check_for_indent = 0; } &output_empty_lines($empty_count); # Updates $empty_count. $txt = uc($txt) if $font_flag{k}; my @new = ($font_stack[-1], sort keys %tag_flag); if ("@current" ne "@new") { $out .= attr_change(\@current, \@new); # Updates @current. } $out .= $txt; $line_is_effectively_empty = 0; last if $_ eq ''; } # Now process the escape at the start of the buffer. if (s/^\\([qbBiIuox]|S[pb]|X[0-4])//) { # We turn all escapes into lower-case, which unifies the treatment # of \b & \B and \i & \I. We also transform some of the letters # so that they sort earlier in the attr list ('x' becomes '_', # 'X0' - 'X4' becomes '_0' - '_4', and 'q' becomes 'a'). (my $attr = lc($1)) =~ tr/xq/_a/; if ($tag_flag{$attr}) { delete $tag_flag{$attr}; } else { $tag_flag{$attr} = 1; if ($attr eq 'a') { s/^="([^"]*)"// or die qq|Mangled \\q="..."...\\q attr.\n|; $tag_attr{a} = qq| href="$1"|; } } } elsif (s/^\\([klns])//) { my $attr = $1; if ($font_flag{$attr} ^= 1) { push @font_stack, $attr; } elsif (pop(@font_stack) ne $attr) { die "Font changes are out of order.\n"; } } elsif (s/^\\([crt])//) { my $blk = $1; &output_empty_lines($empty_count); # Updates $empty_count. if ($block_style) { if ($block_style ne $blk) { die "Illegal nesting of \\$blk inside \\$block_style\n"; } $div_is_open++; close_tags($out, \@current, $p_is_open, $div_is_open); # Tweaks the args. $block_style = ''; $check_for_indent = 0; } else { close_tags($out, \@current, $p_is_open, $div_is_open); # Tweaks the args. $out .= qq|

|; $block_style = $blk; } } elsif (s/^\\m="([^"]*)"//) { my $img = $1; &output_empty_lines($empty_count); # Updates $empty_count. close_tags($out, \@current, $p_is_open, $div_is_open); # Tweaks the args. $images{$img} = 1; $out .= qq||; $line_is_effectively_empty = 0; } elsif (s/^\\p//) { &output_empty_lines($empty_count); # Updates $empty_count. close_tags($out, \@current, $p_is_open, $div_is_open); # Tweaks the args. $check_for_indent = 0; $out .= qq|
|; } elsif (s/^\\w="(\d+)%"//) { my $hr_width = $1; &output_empty_lines($empty_count); # Updates $empty_count. close_tags($out, \@current, $p_is_open, $div_is_open); # Tweaks the args. $out .= qq|
|; $line_is_effectively_empty = 0; } elsif (s/^\\(Fn|Sd)="([^"]*)"(.*?)\\\1//) { my($type, $id, $txt) = (lc($1), $2, $3); &output_empty_lines($empty_count); # Updates $empty_count. $out .= qq|$txt|; $line_is_effectively_empty = 0; } elsif (s/^\\(FN|SD)="([^"]*)"//) { my($type, $id) = (lc($1), $2); # Reset everything for end-of-book items. $div_is_open++ if $block_style; close_tags($out, \@current, $p_is_open, $div_is_open, 1); # Tweaks the args. &output_empty_lines($empty_count); # Updates $empty_count. $out .= qq|
|; $line_is_effectively_empty = 0; } elsif (s/^\\(FN|SD)//) { $div_is_open++ if $block_style; close_tags($out, \@current, $p_is_open, $div_is_open, 1); # Tweaks the args. $out .= '
'; } elsif (s/^\\Q="([^"]*)"$unspace_Q//o) { my $id = $1; &output_empty_lines($empty_count); # Updates $empty_count. $out .= qq||; $chapter_id = $id; # Remember ID in case \C needs it. } elsif (s/^\\C\d="([^"]*)"//) { my $ch = $1; &output_empty_lines($empty_count); # Updates $empty_count. if ($chapter_id eq '') { if (/\\Q="([^"]*)"/) { $chapter_id = $1; } else { $chapter_id = 'p2h_toc' . scalar @toc; $out .= qq||; } } push @toc, [ $ch, $chapter_id ]; } elsif (s/^\\v//) { my $info = ''; while (1) { if (s/^(.*?)\\v//) { $info .= $1; last; } $info .= $_ . "\n"; last unless &get_next_line; $line = $_; } $title = $1 if $info =~ /TITLE="([^"]+)"/; $author = $1 if $info =~ /AUTHOR="([^"]+)"/; $publisher = $1 if $info =~ /PUBLISHER="([^"]+)"/; $copyright = $1 if $info =~ /COPYRIGHT="([^"]+)"/; $isbn = $1 if $info =~ /ISBN="([^"]+)"/; $out .= ""; } elsif (s/^\\T="(\d+)%?"//) { my $width = $1; $allow_extra_empty = 1; &output_empty_lines($empty_count); # Updates $empty_count. if ($p_is_open && $out =~ /^

$/) { $p_is_open = 0; $out = qq|

|; $div_is_open++; if ($_ !~ /\S/) { $out .= $_ . "
\n"; last unless &get_next_line; $line = $_; } $line_is_effectively_empty = 0; } elsif ($block_style && $out =~ /^
$/) { $out =~ s/^
', 'styles.css' or die "Unable to write styles.css: $!\n"; print CSS < $b} keys %paras) { my $width = sprintf('%.1fem', $spaces_len / 2); print CSS ".p$spaces_len {text-indent: $width; margin-top: 0; margin-bottom: 0}\n"; } close CSS; my $navmap = ''; my $cnt = 1; foreach my $ref (@toc) { my($label, $id) = @$ref; $navmap .= < $label EOT $cnt++; } open NCX, '>', 'toc.ncx' or die "Unable to write toc.ncx: $!\n"; print NCX < $title $navmap EOT close NCX; my $images = ''; $cnt = 1; foreach (sort keys %images) { my($type) = /([^.]+)$/; $type = lc($type); $type =~ s/jpg/jpeg/; my $id = $_ eq 'cover.png' ? 'cover' : "image$cnt"; $images .= < EOT $cnt++; } open OPF, '>', 'content.opf' or die "Unable to write content.opf: $!\n"; my $cover_meta = ''; if (defined $images{'cover.png'}) { $cover_meta = < EOT } print OPF < $publisher $language $author $title $copyright $cover_meta $images EOT close OPF; exit; # Puts the next line into $_ and updates $line_num. sub get_next_line { return undef unless @input; $_ = shift @input; $line_num++; 1; } sub close_tags { my($out, $cur_ref, $p_is_open, $div_is_open, $reset_everything) = @_; if (@$cur_ref) { $out .= attr_change($cur_ref, [ ]); # Updates @$cur_ref. } if ($p_is_open) { $out .= '

' unless $out =~ s/^

]+>\s*$//; $p_is_open = 0; $allow_extra_empty = 0 unless $out eq ''; } elsif ($div_is_open) { $out .= '

' unless $out =~ s/^
]+>\s*$//; while (--$div_is_open) { $out .= '
'; } } else { $allow_extra_empty = 0; } if ($reset_everything) { $block_style = ''; %tag_flag = ( ); %font_flag = ( ); @font_stack = ( 'n' ); } # Set caller's args. $_[0] = $out; $_[2] = $p_is_open; $_[3] = $div_is_open; } sub attr_change { my($cur_ref, $new_ref) = @_; my @close = reverse @$cur_ref; my @open = @$new_ref; my $tags = ''; # Get rid of any useless close-open sequences (as nesting allows). while (@close && @open && $open[0] eq $close[-1]) { shift @open; pop @close; } if (@close) { foreach my $attr (@close) { my $tag = $TAG{$attr}; if (defined $tag) { $tags .= ''; } elsif ($attr ne 'n') { $tags .= ''; } } } if (@open) { foreach my $attr (@open) { my $tag = $TAG{$attr}; if (defined $tag) { $tags .= '<' . $tag . $tag_attr{$attr} . '>'; } elsif ($attr ne 'n') { $tags .= qq||; } } } @$cur_ref = @$new_ref; # Affects caller's arg. $tags; } sub output_empty_lines { my($cnt) = @_; return unless $cnt; my $skip = $empty_skip_cnt - $allow_extra_empty; if ($cnt <= $skip || $block_style) { print HTML "\n" x $cnt; } else { if ($skip > 0) { print HTML "\n" x $skip; $cnt -= $skip; } if ($cnt > 2) { print HTML "\n" x ($cnt - 2); $cnt = 2; } while ($cnt--) { print HTML qq|

 

\n|; } } $_[0] = 0; # Affects caller's arg. } sub usage { print STDERR @_ if @_; die <