SQLite Forum

3.31.1: docs as PDF?
Login
For what it's worth, here are two pretty straightforward Perl scripts that will amalgamate the SQLite doc structure and produce a PDF file of about 3500 pages. 

The first script contains usage hints in the comment section near the top. The second script is only necessary if you, like myself, prefer to have 7-bit clean input for LaTeX. Obviously, there is a lot of room for tweaking the scripts and the pandoc template according to your taste.

---
~~~perl
#! /usr/bin/perl
#
# File name: pre.pl
#
# Amalgamate and convert the SQLite documentation to PDF format
# (with the help of pandoc and TeX)
#
# Simple usage:
# perl pre.pl docs.html *.html c3ref/*.html session.html syntax/*.html   \
#      releaselog/*.html                                               | \
#   pandoc --from html --pdf-engine xelatex -o sqlitebook.pdf
# (Start this in the SQLite doc base directory.)
#
# perl pre.pl docs.html *.html c3ref/*.html session.html syntax/*.html   \
#      releaselog/*.html                                               | \
#   pandoc --standalone --from html --to latex --table-of-contents       \
#   --toc-depth 3 --number-sections --highlight-style tango              \
#   --variable documentclass=scrreprt --variable fontsize=10pt           \
#   --variable colorlinks=yes                                            \
#   --variable titlegraphic="images/sqlite370_banner.png"  |             \
#   perl post.pl > sqlitebook.tex
# pdflatex sqlitebook.tex    # repeat twice! no xelatex needed!
# (Start this in the SQLite doc base directory.)
#
# Preparatory steps:
# (1) Once only: get
#     https://www.fossil-scm.org/fossil/doc/trunk/www/xkcd-git.gif
#     and put it into the images/ folder. E.g.,
#       wget ... --output-document=images/xkcd-git.gif
# (2) Once only: if necessary, install pandoc, TeX, and possibly ImageMagick
# (3) Once only: if necessary, install the Perl module HTML::TreeBuilder via
#       cpan HTML::TreeBuilder
# (4) Once, but may need updates from time to time: convert all GIF images
#     in the images/ folder and its subfolders to PNG
#     (e.g., ImageMagick will do the trick:  magick foo.gif foo.png  )
#
# Bundling all processing steps into a Makefile is left as an exercise for
# the reader.
#
# TapirSoft Gisbert W. Selke 2020-04-07.
#
# This software may be used under the terms of the Artistic License, i.e.,
# under the same terms as Perl itself:
# https://dev.perl.org/licenses/artistic.html
#

use strict;
use warnings;
use HTML::TreeBuilder;

our $VERSION = '0.1.1';

# %file_seen: file names already processed (each file is merged only once).
# $master:    parse tree of the first document; the <body> content of every
#             later document is appended to its <body>.
my( %file_seen, $master );

# The arguments are run through glob() so that wildcards also work when the
# shell has not expanded them already.
for my $fn ( glob( join( ' ', @ARGV ) ) ) {
  # process all our files in order, but each filename once only

  next if $file_seen{$fn}++;
  print STDERR "$fn...\n";

  # Build the DOM tree of this file:
  open( my $fh, '<:utf8', $fn ) or die "Cannot open '$fn' for reading: $!";
  my $tree = HTML::TreeBuilder->new( );
  $tree->parse_file($fh);
  close( $fh );
  $tree->elementify( );

  # Essential steps:
  fix_references( $tree, $fn );
  fix_headers( $tree );

  # Nice to have: remove navigation elements
  $_->delete( ) for $tree->look_down( 'class', 'nosearch' );
  # remove script elements (just because we can)
  $_->delete( ) for $tree->find( 'script' );

  # Everything below works on the <body>; fail with a clear message instead
  # of a "can't call method on undefined value" if there is none.
  my $body = $tree->find('body')
    or die "No <body> element found in '$fn'\n";

  # Add a comment near the beginning indicating which file this content is
  # coming from, and an anchor so that we can link here:
  $body->unshift_content(
    HTML::Element->new( 'br' ),
    HTML::Element->new( '~comment', 'text' => 'start of input file ' . $fn ),
    HTML::Element->new( 'a', 'id' => $fn . '__' )
  );

  if ( $master ) {
    # Append the contents of the body of this document to the master element
    $master->find('body')->push_content( $body->content_list( ) );
    # The content now lives in the master tree; release what is left of this
    # tree (html/head/empty body) so that memory use does not grow with the
    # number of input files.
    $tree->delete( );
  } else {
    # Our first document serves as the master container
    $master = $tree;
  }

}

# Without any input there is no master document to print.
die "No input files. Usage: perl pre.pl docs.html *.html ... > amalgamation.html\n"
  unless $master;

print $master->as_HTML( undef, '  ', { } ); # safe, human-readable, clean tag structure


sub fix_references {
  # Make the anchors, links and image sources of one parsed document usable
  # inside the amalgamated document.
  #
  # Parameters:
  #   $tree - root element of the parsed document
  #   $fn   - file name of the document; used as prefix for anchors
  # Returns $tree (modified in place).

  my( $tree, $fn ) = @_;

  # Disambiguate anchor names first, then ids, using our file name as prefix.
  for my $anchor_attr ( qw( name id ) ) {
    fix_names( [ $tree->descendants() ], $fn, $anchor_attr );
  }

  # Point links at the disambiguated anchors.
  my @links = $tree->find('a');
  fix_hrefs( \@links, $fn, 'href' );

  # Clean up image sources.
  my @images = $tree->find('img');
  fix_images( \@images, $fn, 'src' );

  return $tree;
} ## end sub fix_references


sub fix_names {
  # Prefix the value of one attribute (anchor name or id) with the file name
  # so that it is unique within the amalgamated document.
  #
  # Parameters:
  #   $elems    - array ref of elements to inspect
  #   $fn       - file name used as prefix
  #   $attrname - name of the attribute to rewrite
  # Returns $elems; elements without the attribute are left untouched.

  my( $elems, $fn, $attrname ) = @_;
  my $prefix = $fn . '__';

  for my $node ( @{ $elems } ) {
    my $value = $node->attr($attrname);
    next unless defined $value;
    $node->attr( $attrname, $prefix . $value );
  }

  return $elems;
} ## end sub fix_names


sub fix_hrefs {
  # Change links so that they work with the disambiguated anchors of the
  # amalgamated document.
  #
  # Parameters:
  #   $elems    - array ref of elements to inspect (normally all <a> elements)
  #   $fn       - file name of the document the elements belong to
  #   $attrname - name of the link attribute (normally 'href')
  # Returns $elems; elements without the attribute are left untouched.
  #
  # Rewriting rules:
  #   scheme:...  or  //host/...   external link, unchanged
  #   #frag                        -> #<fn>__frag   ('#' alone -> #<fn>__)
  #   [../]path#frag               -> #path__frag   ('path#'   -> #path__)
  #   [../]path                    -> #path__
  # An empty value matches none of these and only triggers a warning.

  my( $elems, $fn, $attrname ) = @_;
  for my $elem ( grep { defined $_->attr($attrname) } @{ $elems } ) {
    # walk through all these nodes
    my $label = $elem->attr($attrname);
    if ( $label =~ m{^(?:[a-z][a-z0-9+.-]*:|//)}i ) {
      # Anything with a URI scheme (http:, https:, mailto:, ftp:, ...) or a
      # protocol-relative URL points outside our documents: leave unchanged.
    } elsif ( $label =~ m|^#(.*)$| ) {
      # internal link to an anchor within this document => prepend our name.
      # An empty fragment ('#') ends up at the start-of-file anchor.
      $elem->attr( $attrname, '#' . $fn . '__' . $1 );
    } elsif ( $label =~ m|^[./]*(.+)#(.*)$| ) {
      # relative link to a different file of ours,  with an anchor therein.
      # remember to flatten relative paths.
      $elem->attr( $attrname, '#' . $1 . '__' . $2 );
    } elsif ( $label =~ m|^[./]*(.+)$| ) {
      # relative link to a different file of ours,  with no anchor therein.
      # remember to flatten relative paths.
      $elem->attr( $attrname, '#' . $1 . '__' );
    } else {
      warn "file $fn: strange $attrname: '$label'";
    }
  }

  return $elems;
} ## end sub fix_hrefs


sub fix_images {
  # Clean up image references: flatten relative paths, point GIF references
  # to PNG files, and make <img> within <dl> look nicer.
  #
  # Parameters:
  #   $elems    - array ref of elements to inspect (normally all <img>)
  #   $fn       - file name of the document (not used here)
  #   $attrname - name of the source attribute (normally 'src')
  # Returns $elems; elements without the attribute are left untouched.

  my( $elems, $fn, $attrname ) = @_;

  # The one external picture we know how to handle: an xkcd comic that is
  # not ordinarily included with the docs bundle but is referenced directly
  # from the fossil site. We require this to be downloaded separately and
  # placed in the images/ folder.
  my $xkcd_url = 'https://www.fossil-scm.org/fossil/doc/trunk/www/xkcd-git.gif';

  IMAGE:
  for my $img ( @{ $elems } ) {
    my $src = $img->attr($attrname);
    next IMAGE unless defined $src;

    if ( $src !~ m|^https?://| ) {
      # One of our own files: flatten the relative path and change .gif to
      # .png. Conversion of gif files to png files must be done outside this
      # script, probably using ImageMagick ("magick picname.gif picname.png").
      ( my $local = $src ) =~ s|^[./]+||;
      $local =~ s|\.gif$|.png|i;
      $img->attr( $attrname, $local );
    } elsif ( lc($src) eq $xkcd_url ) {
      $img->attr( $attrname, 'images/xkcd-git.png' );
    }
    # any other external image is left unchanged

    # Prepend <br> to an <img> that is the first thing in a <p> which in
    # turn is the first thing in a <dd>, to produce nicer results.
    my $para = $img->parent( );
    next IMAGE unless lc( $para->tag( ) ) eq 'p';
    next IMAGE unless lc( $para->parent( )->tag( ) ) eq 'dd';
    next IMAGE if $img->left( );
    next IMAGE if $para->left( );
    $img->preinsert( HTML::Element->new( 'br' ) );
  }

  return $elems;
} ## end sub fix_images


sub fix_headers {
  # Strip explicit section numbers ("1.2.3 Title") from <h1>..<h6> headers,
  # because latex will add its own numbering.
  #
  # Parameter: $tree - root element of the parsed document.
  # Returns $tree (modified in place).

  my( $tree ) = @_;

  # Collect the headers level by level: all <h1>, then all <h2>, ...
  my @headings = map { $tree->find( "h$_" ) } 1 .. 6;

  for my $heading ( @headings ) {
    # Temporarily turn text segments into '~text' pseudo-elements so that
    # the first piece of text can be found and changed like any other node.
    $heading->objectify_text( );
    if ( my $first_text = $heading->find( '~text' ) ) {
      my $caption = $first_text->attr( 'text' );
      # Only a run of digits and dots that is followed by whitespace counts
      # as a section number.
      $first_text->attr( 'text', $caption )
        if $caption =~ s|^\s*[0-9.]+\s+||;
    }
    $heading->deobjectify_text( );
  }

  return $tree;
} ## end sub fix_headers
~~~
---

And here's the second script:

---
~~~perl
#! /usr/bin/perl
#
# File name: post.pl
#
# Convert UTF-8 characters in the output of pandoc to clean 7-bit ASCII
# characters by recoding as LaTeX macros, thus obviating the need to
# use xelatex.
#
# For usage hints see pre.pl
#
# TapirSoft Gisbert W. Selke 2020-04-07.
#
# This software may be used under the terms of the Artistic License, i.e.,
# under the same terms as Perl itself:
# https://dev.perl.org/licenses/artistic.html
#

use strict;
use warnings;

our $VERSION = '0.1.1';

# Decode the input as UTF-8 ...
binmode(STDIN,  ':encoding(UTF-8)');
# ... and encode the output the same way, so that a character without an
# entry in the table below passes through as valid UTF-8 instead of being
# written as a single Latin-1 byte (or triggering "Wide character" warnings).
binmode(STDOUT, ':encoding(UTF-8)');

# Non-ASCII character => LaTeX replacement. All replacements are plain ASCII,
# so a single pass over each line is sufficient.
my %latex_for = (
  "\x{03c0}" => q{\ensuremath{\pi}},               # pi
  "\x{2007}" => q{\hspace*{1em}},                  # fixed (figure) space
  "\x{2190}" => q{\ensuremath{\leftarrow}},        # <-
  "\x{2191}" => q{\ensuremath{\uparrow}},          # upwards arrow
  "\x{2192}" => q{\ensuremath{\rightarrow}},       # ->
  "\x{21d2}" => q{\ensuremath{\Rightarrow}},       # =>
  "\x{2260}" => q{\ensuremath{\neq}},              # !=
  "\x{2265}" => q{\ensuremath{\geq}},              # >=
  "\x{2588}" => q{\rule{1em}{1ex}},                # little black box
  "\x{25ba}" => q{\ensuremath{\rhd}},              # right-pointing triangle (might use dingbat instead)
  "\x{25bc}" => q{\ensuremath{\bigtriangledown}},  # down-pointing triangle  (might use dingbat instead)
  "\x{2714}" => q{\checkmark{}},                   # check mark (might use dingbat instead)
  "\x{00a0}" => q{~},                              # non-breaking space
  "\x{00b1}" => q{\ensuremath{\pm}},               # +/-
  "\x{00b2}" => q{\ensuremath{^2}},                # superscript 2
  "\x{00b9}" => q{\ensuremath{^1}},                # superscript 1
  "\x{00c0}" => q{\`{A}},                          # A grave
  "\x{00c2}" => q{\^{A}},                          # A circumflex
  "\x{00c3}" => q{\~{A}},                          # A tilde
  "\x{00c6}" => q{\AE{}},                          # AE ligature
  "\x{00d7}" => q{\ensuremath{\times}},            # multiplication sign
  "\x{00df}" => q{\ss{}},                          # eszett
  "\x{00e0}" => q{\`{a}},                          # a grave
  "\x{00e1}" => q{\'{a}},                          # a acute
  "\x{00e2}" => q{\^{a}},                          # a circumflex
  "\x{00e3}" => q{\~{a}},                          # a tilde
  "\x{00e4}" => q{\"{a}},                          # a umlaut
  "\x{00e6}" => q{\ae{}},                          # ae ligature
  "\x{00fe}" => q{\th{}},                          # thorn
);

# One character class matching every key of the table, written with \x{...}
# escapes so that no character needs quoting.
my $special_chars = join '', map { sprintf '\\x{%04x}', ord($_) } keys %latex_for;
my $special_re    = qr/([$special_chars])/;

sub latexify {
  # Return a copy of $text in which every character listed in %latex_for is
  # replaced by its LaTeX macro; all other characters are kept as they are.

  my( $text ) = @_;
  $text =~ s/$special_re/$latex_for{$1}/g;

  return $text;
} ## end sub latexify

while ( my $line = <> ) {
  print latexify( $line );
}
~~~
---