From: Baldur Kristinsson Date: Fri, 11 Dec 2015 00:08:00 +0000 (+0000) Subject: Harmonize ikiwiki-pandoc with recent versions of pandoc X-Git-Url: https://git.g-eek.se/?a=commitdiff_plain;h=a063db715abb38c3f3df7255a1bbe298fc4e1f31;p=ikiwiki-pandoc.git Harmonize ikiwiki-pandoc with recent versions of pandoc - Parse YAML meta blocks and correct parsing of old-style meta block (title, author, date). - Improve bibliography/citation support. - Get rid of deprecated `--html5` parameter. - Add support for Org and OPML files. - Improve math support. - Add filter support. - New options in *.setup file: pandoc_citeproc, pandoc_org, pandoc_opml, pandoc_math_custom_js, pandoc_filters. --- diff --git a/pandoc.pm b/pandoc.pm index 3b63df3..efa4715 100755 --- a/pandoc.pm +++ b/pandoc.pm @@ -8,6 +8,7 @@ use IkiWiki; use FileHandle; use IPC::Open2; use JSON; +# use Data::Dumper; sub import { my $markdown_ext = $config{pandoc_markdown_ext} || "mdwn"; @@ -43,6 +44,14 @@ sub import { hook(type => "htmlize", id => "mediawiki", call => sub { htmlize("mediawiki", @_) }); } + if ($config{pandoc_opml}) { + hook(type => "htmlize", id => "opml", + call => sub { htmlize("opml", @_) }); + } + if ($config{pandoc_org}) { + hook(type => "htmlize", id => "org", + call => sub { htmlize("org", @_) }); + } } @@ -59,38 +68,59 @@ sub getsetup () { safe => 0, rebuild => 0, }, + pandoc_citeproc => { + type => "string", + example => "/usr/local/bin/pandoc-citeproc", + description => "Path to pandoc-citeproc executable", + safe => 0, + rebuild => 0, + }, pandoc_markdown_ext => { type => "string", - example => "mdwn", - description => "File extension for Markdown files", + example => "mdwn,md,markdown", + description => "File extension(s) for Markdown files handled by Pandoc", safe => 1, rebuild => 1, }, pandoc_latex => { type => "boolean", example => 0, - description => "Enable Pandoc processing of LaTeX documents", + description => "Enable Pandoc processing of LaTeX documents (extension=tex)", safe => 0, 
rebuild => 1, }, pandoc_rst => { type => "boolean", example => 0, - description => "Enable Pandoc processing of reStructuredText documents", + description => "Enable Pandoc processing of reStructuredText documents (extension=rst)", safe => 0, rebuild => 1, }, pandoc_textile => { type => "boolean", example => 0, - description => "Enable Pandoc processing of Textile documents", + description => "Enable Pandoc processing of Textile documents (extension=textile)", safe => 0, rebuild => 1, }, pandoc_mediawiki => { type => "boolean", example => 0, - description => "Enable Pandoc processing of MediaWiki documents", + description => "Enable Pandoc processing of MediaWiki documents (extension=mediawiki)", + safe => 0, + rebuild => 1, + }, + pandoc_org => { + type => "boolean", + example => 0, + description => "Enable Pandoc processing of Emacs org-mode documents (extension=org)", + safe => 0, + rebuild => 1, + }, + pandoc_opml => { + type => "boolean", + example => 0, + description => "Enable Pandoc processing of OPML documents (extension=opml)", safe => 0, rebuild => 1, }, @@ -146,7 +176,14 @@ sub getsetup () { pandoc_math => { type => "string", example => "mathjax", - description => "Process TeX math using", + description => "How to process TeX math; e.g. 
mathjax, katex, or mathml", + safe => 0, + rebuild => 1, + }, + pandoc_math_custom_js => { + type => "string", + example => "", + description => "Link to local/custom script for math (requires appropriate pandoc_math setting)", safe => 0, rebuild => 1, }, @@ -164,6 +201,13 @@ sub getsetup () { safe => 0, rebuild => 1, }, + pandoc_filters => { + type => "string", + example => "", + description => "A comma-separated list of custom pandoc filters", + safe => 0, + rebuild => 1, + }, } @@ -171,92 +215,65 @@ sub htmlize ($@) { my $format = shift; my %params = @_; my $page = $params{page}; + my $htmlformat = 'html'; local(*PANDOC_IN, *JSON_IN, *JSON_OUT, *PANDOC_OUT); - my @args; + my @args = (); - my $command = $config{pandoc_command} || "/usr/local/bin/pandoc"; + # The default assumes pandoc is in PATH + my $command = $config{pandoc_command} || "pandoc"; if ($config{pandoc_smart}) { push @args, '--smart'; - }; + } if ($config{pandoc_obfuscate}) { push @args, '--email-obfuscation=references'; } else { push @args, '--email-obfuscation=none'; - }; + } if ($config{pandoc_html5}) { - push @args, '--html5'; - }; + $htmlformat = 'html5'; + } if ($config{pandoc_ascii}) { push @args, '--ascii'; - }; + } if ($config{pandoc_numsect}) { push @args, '--number-sections'; - }; + } if ($config{pandoc_sectdiv}) { push @args, '--section-divs'; - }; + } if ($config{pandoc_codeclasses} && ($config{pandoc_codeclasses} ne "")) { push @args, '--indented-code-classes=' . $config{pandoc_codeclasses}; - }; - - if ($config{pandoc_bibliography}) { - push @args, '--bibliography='.$config{pandoc_bibliography}; } - if ($config{pandoc_csl}) { - push @args, '--csl='.$config{pandoc_csl}; + # How to process math. Normally either mathjax or katex. 
+ my %mathconf = map {($_=>"--$_")} qw( + jsmath mathjax latexmathml asciimathml mimetex mathml katex mimetex webtex + ); + my $mathopt = $1 if $config{pandoc_math} =~ /(\w+)/; + my $custom_js = $config{pandoc_math_custom_js} || ''; + if ($mathopt && $mathconf{$mathopt}) { + push @args, $mathconf{$mathopt}; + $pagestate{$page}{meta}{"pandoc_math_$mathopt"} = 1; + $pagestate{$page}{meta}{"pandoc_math_custom_js"} = $custom_js if $custom_js; } - - for ($config{pandoc_math}) { - if (/^mathjax$/) { - push @args, '--mathjax=/dev/null'; - } - elsif (/^jsmath$/) { - push @args, '--jsmath'; - } - elsif (/^latexmathml$/) { - push @args, '--latexmathml'; - } - elsif (/^mimetex$/) { - push @args, '--mimetex'; - } - elsif (/^mathtex$/) { - push @args, '--mimetex=/cgi-bin/mathtex.cgi'; - } - elsif (/^google$/) { - push @args, '--webtex'; - } - elsif (/^mathml$/) { - push @args, '--mathml'; - } - else { } - } - # Convert to intermediate JSON format so that the title block # can be parsed out + # We must omit the 'bibliography' parameter here, otherwise the list of + # references will be doubled. 
my $to_json_pid = open2(*JSON_OUT, *PANDOC_OUT, $command, '-f', $format, '-t', 'json', @args); - error("Unable to open $command") unless $to_json_pid; - # $ENV{"LC_ALL"} = "en_US.UTF-8"; - my $to_html_pid = open2(*PANDOC_IN, *JSON_IN, $command, - '-f', 'json', - '-t', 'html', - @args); - - error("Unable to open $command") unless $to_html_pid; - # Workaround for perl bug (#376329) require Encode; my $content = Encode::encode_utf8($params{content}); @@ -269,60 +286,105 @@ sub htmlize ($@) { waitpid $to_json_pid, 0; - print JSON_IN $json_content; - close JSON_IN; - - my @html = <PANDOC_IN>; - close PANDOC_IN; - - waitpid $to_html_pid, 0; - - $content = Encode::decode_utf8(join('', @html)); - # Parse the title block out of the JSON and set the meta values - my @perl_content = @{decode_json($json_content)}; - my %header_section = %{$perl_content[0]}; - my @doc_title = @{$header_section{'docTitle'}}; - my @doc_authors = @{$header_section{'docAuthors'}} if ref $header_section{'docAuthors'} eq 'ARRAY'; - my $num_authors = scalar @doc_authors; - my @primary_author = (); - if ($num_authors gt 0) { - @primary_author = @{$doc_authors[0]}; + my @json_content = @{decode_json($json_content)}; + my $meta = {}; + if (ref $json_content[0] eq 'HASH') { + $meta = $json_content[0]->{'unMeta'}; + } + else { + warn "WARNING: Unexpected format for meta block. Incompatible version of Pandoc?\n"; } - my @doc_date = @{$header_section{'docDate'}} if ref $header_section{'docDate'} eq 'ARRAY'; + + # Get some selected meta attributes, more specifically: + # (title date bibliography csl subtitle abstract summary version + # author references [+ num_authors primary_author]) sub compile_string { - # The uncompiled string is an array of hashes containing words and - # string with the word "Space". - my (@uncompiled_string) = @_; + # Partially represents an item from the data structure in meta as a string. 
+ my @uncompiled = @_; + @uncompiled = @{$uncompiled[0]} if @uncompiled==1 && ref $uncompiled[0] eq 'ARRAY'; my $compiled_string = ''; - foreach my $word_or_space(@uncompiled_string) { - if (ref($word_or_space) eq "HASH") { - if ($word_or_space->{"Str"}) { - $compiled_string .= $word_or_space->{"Str"}; - } - } - else { - $compiled_string .= ' '; - } + foreach my $word_or_space (@uncompiled) { + next unless ref $word_or_space eq 'HASH'; + my $type = $word_or_space->{'t'}; + $compiled_string .= compile_string(@{ $word_or_space->{c} }) if $type eq 'MetaInlines'; + next unless $type eq 'Str' || $type eq 'Space' || $type eq 'MetaString'; + $compiled_string .= $type eq 'Space' ? ' ' : $word_or_space->{c}; } return $compiled_string; } - my $title = compile_string @doc_title; - my $author = compile_string @primary_author; - my $date = compile_string @doc_date; + my %scalar_meta = map { ($_=>undef) } qw( + title date bibliography csl subtitle abstract summary version); + my %list_meta = map { ($_=>[]) } qw/author references/; + foreach my $k (keys %scalar_meta) { + $scalar_meta{$k} = compile_string($meta->{$k}->{c}) if $meta->{$k}; + $pagestate{$page}{meta}{$k} = $scalar_meta{$k}; + } + foreach my $k (keys %list_meta) { + $list_meta{$k} = $meta->{$k}->{'c'} if $meta->{$k}; + $list_meta{$k} = [ map { compile_string($_) } @{$list_meta{$k}} ] if $k eq 'author'; + $pagestate{$page}{meta}{$k} = $list_meta{$k}; + } + my $num_authors = scalar @{ $list_meta{author} }; + $scalar_meta{num_authors} = $num_authors; + $pagestate{$page}{meta}{num_authors} = $num_authors; + if ($num_authors) { + $scalar_meta{primary_author} = $list_meta{author}->[0]; + $pagestate{$page}{meta}{primary_author} = $list_meta{author}->[0]; + } - if ($title) { - $pagestate{$page}{meta}{title} = $title; + # The bibliography may be set in a Meta block in the page or in the .setup file. + # If both are present, the Meta block has precedence. 
+ for my $bibl ($scalar_meta{bibliography}, $config{pandoc_bibliography}) { + if ($bibl) { + $pagestate{$page}{meta}{bibliography} = $bibl; + push @args, '--bibliography='.$bibl; + last; + } } - if ($author) { - $pagestate{$page}{meta}{author} = $author; + # Similarly for the CSL file... + for my $cslfile ($scalar_meta{csl}, $config{pandoc_csl}) { + if ($cslfile) { + $pagestate{$page}{meta}{csl} = $cslfile; + push @args, '--csl='.$cslfile; + last; + } } - if ($date) { - $pagestate{$page}{meta}{date} = $date; + + # In any case, turn on the pandoc-citeproc filter, since + # we may have the bibliography under 'references' in the meta section. + my $citeproc = $config{pandoc_citeproc} || 'pandoc-citeproc'; + push @args, "--filter=$citeproc"; + + # Other pandoc filters. Note that currently there is no way to + # configure a filter to run before pandoc-citeproc has done its work. + if ($config{pandoc_filters}) { + my @filters = split /\s*,\s*/, $config{pandoc_filters}; + s/^["']//g for @filters; # get rid of enclosing quotes + foreach my $filter (@filters) { + push @args, "--filter=$filter"; + } } + + + my $to_html_pid = open2(*PANDOC_IN, *JSON_IN, $command, + '-f', 'json', + '-t', $htmlformat, + @args); + error("Unable to open $command") unless $to_html_pid; + + print JSON_IN $json_content; + close JSON_IN; + + my @html = <PANDOC_IN>; + close PANDOC_IN; + + waitpid $to_html_pid, 0; + + $content = Encode::decode_utf8(join('', @html)); return $content; }