genorg.pl
author Pranshu Sharma <pranshu@bauherren.ovh>
Tue, 17 Dec 2024 01:49:39 +1000
changeset 8 ee9b57cbbefc
parent 7 f00ed34eca17
child 9 58b72fea234c
permissions -rwxr-xr-x
Big

#!/usr/bin/perl
# Author: Pranshu Sharma <pranshu@bauherren.ovh>

use v5.36.0;
use strict;
use warnings;

use XML::LibXML;
use File::Find;
use File::Copy qw(copy move);
use Cwd;
use experimental qw(declared_refs);
use Data::Dumper;
use POSIX;
# !! Add seperatio between title and identifier

# Fix up CLI interface
# Add option ro set cache to 0 through command line
# Print required elisp code it is in $req_config
# Add config option for $max_rss and $max_cat
# Sort cataogirs by entires
# Document link to file x from file x no work
# Variable for map_css
# Remove the desc
# Domain thing as well
# 2 articles 1 day better sorting
# cataogry linking
# reinforce slash ending for $oudir
my $cache = 1;

my $config_file = "genorg-conf";
my $dir = "blog";
my $outdir = "out/";
my $emacs = "emacs";
my $art_css = "astyle.css";
my $cat_css ="cstyle.css";
my $no_name_dir = "c";
my $cache_file = "genorg-cache/cache.xml";
my $domain = "p.bauherren.ovh";
my $max_rss = 30;
# The first page has one less article.  Think of it as a feature.
my $max_cat = 7;
my $main_title= "";
my $blog_title = "Pranshu's blog";

my $username = $ENV{LOGNAME} || $ENV{USER} || getpwuid ($<);
my $template = <<"END";
#+title: Test document
#+subtitle: This is the subtitle
#+author: $username
#+keywords: tag thing | related
#+options: html-link-use-abs-url:nil html-postamble:auto
#+options: html-preamble:t html-scripts:nil html-style:t
#+options: html5-fancy:nil tex:t
#+options: tex:mathjax
#+html_doctype: html5
#+html_container: div
#+html_content_class: content
#+html_link_home:
#+html_link_up:
#+html_mathjax:
#+html_equation_reference_format: \eqref{%s}
#+html_head:
#+html_head_extra:
#+infojs_opt:
#+creator: <a href="https://www.gnu.org/software/emacs/">Emacs</a> 31.0.50 (<a href="https://orgmode.org">Org</a> mode 9.7.11)
#+latex_header:
END

my $req_config = <<"END";
(require 'org)
(require 'ob)
(org-link-set-parameters "genorg"
			 :follow nil
			 :export
			 #'(lambda (link desc _ _)
			     (format "<a class=\"rakim\" href=\"%s\">%s</a>" desc link))
			 :store nil)
END

my $custom_code = "(list 1)";

my $navbar = <<"END" =~ s/\n\s+//gr;
<ul>
  <li> Home
  <li> Catagories
  <li> About
</ul>
END

my $elisp_code = <<"END" =~ s/\n\s+//gr;
(let ((ls '(LIST)))
  (require 'org)
  (mapc
   (lambda (b)
     (with-current-buffer b
       (when (and (not (string-match "^ " (buffer-name b)))
		  (eq major-mode 'org-mode))
	 (org-export-to-file 'html (pop ls)))))
   (buffer-list))
   (kill-emacs)
)
END


my ($h_intro, $h_c_intro, $h_cat_li, $h_chap, $h_en_cat, $h_en, $h_end)
  = split /SPLIT/ ,<<"END" =~ s/\n\s+//gr;
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>TITLE</title>
    <link rel="stylesheet" href="CSS">
  </head>
  <body>
    NAV
    <h1> BTIT </h1>
SPLIT
    <ul id="cataogries">
SPLIT 
      <li> <a href="LINK">  CATAG (NUMBER)</a>
SPLIT
    </ul>
    <div>
      <h2> All articles <a id="rss" href="rss.xml">(rss)</span> </h2> <hr>
SPLIT
   <a href="LINK">
     <strong> TITLE </strong> <span>DATE</span> <i>CATAG</i>
     <hr>
  </a>
SPLIT
   <a href="LINK">
     <strong> TITLE </strong> <span>DATE</span>
     <hr>
   </a>
SPLIT
    </div>
  </body>
</html>
END

my $help = <<"END";
genorg [option] [dir|file]
Option can be:
 -t FILE : Make a template org file at FILE
 -h      : Help
 -c FILE : sepcify a new config file instead of genorg-conf
 -r      : unconditionally remake org files

If no option is set, accept a directory to generate.

$config_file follow the syntax of:
VAR VALUE
where VAR can be:
 emacs-path : The value should be path of emacs. 'emacs' by default.
 css        : The path of the css file to use.
 in_dir     : The directory that contains the files.
 out_dir    : the output directory.
 no_name_dir: The directory for generate files.  This means in_dir/no_name_dir
              cannot exist.
END


my $rss_entry = <<"END" =~ s/\n\s+//gr;
<?xml version=\"1.0\" encoding=\"UTF-8\" ?>
<rss version=\"2.0\">
 <channel>
  <title>W3Schools Home Page</title>
  <link>https://www.w3schools.com</link>
  <description>Free web building tutorials</description>"
END
  
my %config_vars =
  (
   'emacs-path' => \$emacs,
   css => \$art_css,
   indir => \$dir,
   outdir => \$outdir,
   no_name_dir => \$no_name_dir
  );


sub read_config ($path) {
  open my $fh, '<', $path or die "Couldn't open config file\n";
  while (<$fh>) {
    next if /^\s+$/;
    if (/^>>---+/) {
      local $/;
      $custom_code = <$fh>;
      return;
    }
    chomp;
    my ($directive, $rest) = split /\s+/, $_, 2;
    if (exists($config_vars{$directive})) {
      ${$config_vars{$directive}} = $rest;
    } else {
      print "Unknown variable $directive\n";
    }
  }
  close $fh;
}

sub help_and_bye {
  print $help;
  exit;
}

sub template ($file) {
  $file // die "-t needs an argument.\n";
  $file =~ s/\.org$//;
  open my $fh, '>', $file . ".org" or die $file . " couldn't be opened\n";
  print $fh $template;
  close $fh;
}

# Reletive file from the perspect of file1, to file2
sub prel_path ($from, $to) {
  return '.' if $from eq $to;
  my \(@f1, @f2) = map [m{/[^/]+}g], ($from, $to);
  # return substr($f1[$#f1], 1) if $from =~ $to;
  while (@f1 && @f2 && $f1[0] =~ $f2[0]) {
    shift @f1;
    shift @f2;
  }
  my $p = @f1-1;
  local $" = "";
  "../" x ($p > 0 ? $p : 0) . substr("@f2", 1);
}

my @org_exps;
my @files_to_make;
my @files_to_move;

sub wanted {
  my $n_path = $outdir . prel_path ($dir,  "$File::Find::name/");
  # We need to mirror the direcotry structure
  if (-d) {
    push @files_to_make, $n_path || die "Couldn't make directory $n_path\n"
      unless -d $n_path;
    return;
  }
  if (/\.org$/) {
    push @org_exps, substr $n_path, length($outdir);
  } else {
    push @files_to_move, $File::Find::name
  }
}

my @modified_files;

# Export @org_exps
sub conv_files {
  my %changed_files;
  # Remove files that are already converted
  for my $rel_file (@org_exps) {
    # Emacs need the full file path
    my ($file, $to) = map { getcwd . "/$_" . $rel_file} ($dir, $outdir);
    s/org$/html/ for ($to, $rel_file);
    unless ($cache && -f $to && (stat($to))[9] > (stat($file))[9]) {
      $changed_files{$file} = $to;
      push @modified_files, $rel_file;
    }
  }
  %changed_files // exit;
  $elisp_code =~ s/LIST/join '', map "\"$_\" ", values %changed_files /e;
  open my $fh, '-|', ($emacs, '-Q','--eval', "(progn $custom_code)", keys %changed_files, "--eval" , "$elisp_code");
  close $fh;
  \%changed_files
}

my $cdir;

$ARGV[0] // die "No arguments provided\n";
while ($_ = shift @ARGV) {
  /^-c$/ && do { print "$req_config\n"; exit };
  /^(-h|--?help)$/ && help_and_bye;
  /^-t$/ && do { template shift @ARGV ; exit };
  /^-c$/ && do { $config_file = shift @ARGV; next };
  /^-d$/ && do { $cache = 0; next };
  $cdir = $_;
}
$cdir || die "directory not provided\n";
main();

my %catags;
my @arts;
my $cache_dom;

sub main {
    $outdir =~ s!/*$!/!;
    -d $cdir or help_and_bye;
    mkdir $outdir;
    chdir $cdir or die "Couldn't access $cdir\n";
    read_config $config_file;
    -d $dir or die "$dir doesn't exist \n";
    find (\&wanted, $dir);
    for (@files_to_move) {
	copy $_, s#$dir/?#$outdir#er;
    }
    mkdir for @files_to_make;
    values %{conv_files()};
    $cache_dom = XML::LibXML->load_xml(location => $cache_file) if $cache && -f $cache_file;
    chdir $outdir;
    @arts = sort { $b->{date} <=> $a->{date}} map { html_fixup($_) } @org_exps;
    finalise_html();
    chdir '..';
    populate_cache();
}

sub populate_cache {
  my $dom = XML::LibXML::Document->new('1.0', 'UTF-8');
  my $root = $dom->createElement('root');
  my %c_els;
  for my $art (@arts) {
    # We need the: related, file, date
    my ($art_el, $date, $title, $ca, $related, $utitle) =
      map {$dom->createElement($_)} ("art", "date", "title", "catag", "related", "utitle");
    $art_el->{path} = $art->{file};
    $ca->appendText($art->{catag});
    $title->appendText($art->{title});
    $date->appendText($art->{date}->text_easy);
    $utitle->appendText($art->{utitle});
    my @ll = @{$art->{links}};
    if (@ll) {
      my $links = $dom->createElement('links');
      for (@ll) {
	my $link = $dom->createElement("link");
	$link->{catag} = $_->[0];
	$link->appendText($_->[1]);
	$links->appendChild($link);
      }
      $art_el->appendChild($links);
    }
    my $rel_str;
    for my $thing ($art->{related}) {
      for (@{$thing}) {
	my ($catag, $artt) = @{$_};
	my $rel_str .= "\"$catag\"" . ($artt && "->\"$artt\"");
	$related->appendText("\"$catag\"" . ($artt && "->\"$artt\" "));
      }
    }
    $art_el->appendChild($_) for ($date, $ca, $related, $title, $utitle);
    $root->appendChild($art_el);
  }
  $root->appendChild($_) for values %c_els;
  $dom->setDocumentElement($root);
  open my $fh, '>' , $cache_file;
  print $fh ($dom->toString(1) =~ s/(.+)/$1\n<!-- Nicley formated :() -->/r);
  close $fh;
}

sub entry {
  my ($data, $catagory) = @_;
  my $str = defined $catagory
    ? ($h_en_cat =~ s/CATAG/$data->{catag}/er) : $h_en;
  my $prel_path = "/$no_name_dir/" . (defined $catagory ? "c" : "c/b");
  $str =~ s/DATE/$data->{date}->fmt/er
    =~ s/TITLE/$data->{utitle}/er
    =~ s|LINK|prel_path($prel_path, '/'. $data->{file})|er
  }

sub rss_en ($data) {
  my $thing = <<"END" =~ s/\n\s+//gr;
 <item>
    <title>TITLE</title>
    <link>LINK</link>
    <description>DESC</description>
  </item>
END
  $thing 
    =~ s|LINK|"$domain/" . $data->{file}|er
      =~ s/TITLE/$data->{utitle}/er;
}

sub min ($x, $y) {
  ($x, $y)[$x > $y]
}

# @_ has to be in format of [STRING, LINK]
sub list2paginaiton {
  my $page = shift;
  '<ul>' . join ("", map {
    my ($str, $file) = @{$_};
    $str = "<span class=\"psel\">$str</span>" if $file && ($file == $page);
    defined $file ?
      "<li><a href=\"$file.html\">$str</a>" : $str
    } ((["←", $page - 1]) x!! ($page - 1),
       @_,
       (["→", $page + 1]) x ($page != $_[$#_]->[1]))) . '</ul>'
     }

sub move_on_if_neccasary ($fh, $art_num, $total_articles, $path) {
  return $fh if $art_num == 0 || $art_num % $max_cat;
  my $cpage = $art_num / $max_cat;
  print $fh panigation($cpage, ceil($total_articles / $max_cat), $path);
  open $fh, '>', $path . '/' . ($cpage + 1).'.html';
  $fh;
}

sub panigation ($page, $no_pages, $dir) {
  if ($no_pages < 9) {
    list2paginaiton ($page, (map [$_, $_], (1..$no_pages)))
  } elsif (4 < $page <= ($no_pages - 4)) {
    list2paginaiton
      ($page,
       [1, 1],
       ["<li>..."],
       (map [$_, $_], (($page-2)..($page+2))),
       ["<li>..."],
       [($no_pages) x 2])  
    } elsif ($page < 6) {
      list2paginaiton $page,
	((map [$_, $_], (1..5)),
	 ["<li>..."],
	 [($no_pages) x 2])
      } else {
	list2paginaiton $page,
	  ([1,1],
	   ["<li>..."],
	   (map [$_, $_], (($no_pages - 5)..$no_pages)))
	}
}

sub get_art($cat, $id) {
  for (@arts) {
    return $_ if $_->{catag} eq $cat && $_->{title} eq $id
  }
}

sub cssbs($s,$t,$p){
  $s =~ s|CSS|prel_path("/$p","/$cat_css")|er
    =~ s/TITLE|BTIT/$t/ger
    =~ s/NAV/$navbar/er;
}

sub finalise_html {
  mkdir $no_name_dir;
  open my $fh, '>', "$no_name_dir/1.html";
  open my $mrss, '>', "$no_name_dir/rss.xml"; # The master rss file
  print $mrss $rss_entry;
  print $fh cssbs($h_intro . $h_c_intro, $blog_title, "a/c");
  my (%c_files, %r_files);	# Cataogry files and rss
  
  for my $cat (sort { $catags{$b} <=> $catags{$a} } keys(%catags)) {
    my $num = $catags{$cat};
    # ($h_intro, $h_c_intro, $h_cat_li, $h_chap, $h_en_cat, $h_en, $h_end)
    print $fh $h_cat_li =~ s/NUMBER/$num/re
      =~ s/CATAG/$cat/er
      =~ s=LINK="$cat/1.html"=er;
    mkdir "$no_name_dir/$cat";
    open $c_files{$cat}, '>', "$no_name_dir/$cat/1.html";
    open $r_files{$cat}, '>', "$no_name_dir/$cat/rss.xml";
    print {$r_files{$cat}} $rss_entry;
    print {$c_files{$cat}}
      cssbs($h_intro, "$cat  <a id=\"rss\" href=\"rss.xml\">(rss)</span> ", "a/b/c");
  }
  print $fh $h_chap;
  # :-)
  my $total_articles = () =  map {(1)x$_} values %catags;
  # Now we iterate through the articles and add their index
  my ($n_fh, $n_mrss, %n_rss) = (0,0);
  my %n_cat;
  for my $art (@arts) {
    # TODO do we need $title?
    my ($catag, $title) = @{$art}{qw(catag title)};
    my $rentry = rss_en $art;
    if (($n_rss{$catag} // 0) < $max_rss) {
      print {$r_files{$catag}} $rentry;
      ++$n_rss{$catag};
    }
    if ($n_mrss < $max_rss) {
      print $mrss $rentry;
      ++$n_mrss
    }
    # ($fh, $art_num, $total_articles, $path)
    $fh = move_on_if_neccasary $fh, $n_fh, $total_articles, $no_name_dir;
    $n_fh++;
    print $fh entry $art, 1;
    my $ncat = \$n_cat{$catag};
    ++$$ncat;
    my $cfh = \$c_files{$catag};
    $$cfh = move_on_if_neccasary $$cfh, $$ncat, $catags{$catag}, "$no_name_dir/$catag";
    print {$$cfh} entry $art;
    my $a_file = $art->{file};
    if (1 or grep /^$a_file$/, @modified_files) {
      my $doom = linkify($art);
      open my $h_file, '>', $a_file;
      print $h_file $doom->toStringHTML;
      close $h_file;
    }
  }
  print $fh panigation ((ceil $n_fh / $max_cat) x 2, $no_name_dir) if $max_cat < $total_articles;
  print $fh $h_end;
  close $fh;
  for (values %c_files) {
    print $_ $h_end;
    close $_;
  }
  print $mrss '</channel> </rss>';
  for (values %r_files) {
    print $_ '</channel> </rss>';;
    close $_;
  }
}

sub linkify ($art) {
  # if $art->{dom} is undefined, we must sadly parse again
  my @links;
  my @olinks = @{$art->{links} //[]};
  my $bom = $art->{dom} // XML::LibXML->load_html(location => $art->{file});
  for ($bom->findnodes('//a[@class="rakim"]')) {
    my $key;
    my $link_text;
    if (@olinks) {
      $key = pop @olinks;
    } else {
      ($key) = parse_keywords($_->to_literal);
      $link_text = $_->{href};
      $_->removeChild($_->firstChild());
      $_->appendText($link_text);
    }
    push @links, $key;
    my $l = get_art(@{$key})->{file} or die "Link not found in $art->{filename}\n";
    $_->{href} = prel_path("/".$art->{file}, "/$l");
  }
  $art->{links} = \@links;
  return $bom
}

# I don't want to prepopulate, as it is not known which ones are
# usless and outdated, and it would be a waste of cycles parsing
# something to know it is not needed
sub getcache ($fn) {
  $cache_dom || return 0;
  for my $d ($cache_dom->findnodes("/root/art[\@path=\"$fn\"]")) {
    # TODO related
    my %dome = ('file' => $fn);
    (@dome{'title', 'catag', 'utitle'}, my $date) =
      map
      { $d->getChildrenByTagName($_)->[0]->to_literal }
      qw[title catag  utitle date];
    for ($d->getChildrenByTagName("links")) {
      my @ll = map [$_->{catag}, $_->to_literal],
	($_->getChildrenByTagName('link'));
      $dome{links} = \@ll;
    }
    $catags{$dome{catag}}++;
    $dome{date} = PDate->new(split /-/, $date);
    return \%dome;
  }
  return 0;
}

sub html_fixup ($filename) {
  if ($cache && !grep /^$filename$/, @modified_files) {
    for (getcache($filename)) {
      return $_ if $_
    }
  }
  my $dom =
    XML::LibXML->load_html(location  => $filename);
  my %data = ('file', $filename);

  # We don't need style or table of contents heading (if exists)
  for my $xpath ('/html/head/style', '/html/body/div/div[@id="table-of-contents"]/h2') {
    $_->parentNode->removeChild($_) for $dom->findnodes($xpath) 
  }
  
  # Info we need

  # Date
  for ($dom->findnodes('/html/body/div[@id="postamble"]/p[@class="date"]')) {
    my $text = $_->to_literal;
    if ($text =~ /^Date: (\d+)-(\d+)-(\d+)/) {
      my $date = PDate->new($1, $2, $3);
      $data{date} = $date;
      my $p = $_->parentNode;
      $p->parentNode->removeChild($p);
    }
  }
  # Title
  say "HELLO";
  for ($dom->findnodes('/html/head/title')) {
    say "HERE";
    $data{utitle}  = $_->to_literal;
  }
  # keywords/catogry
  for my $node ($dom->findnodes('/html/head/meta')) {
    (my $cont = $node->getAttribute('content')) || next;
    for ($node->getAttribute('name')) {
      if (/keywords/) {
	my ($key, $rest) = parse_keywords($cont);
	@data{qw[catag title]} = @{$key};
	$catags{$data{catag}}++;
	$rest =~ s/\s+\|\s+//;
	while ($rest) {
	  (my $t,$rest) = parse_keywords($rest);
	  push @{$data{related}}, $t;
	}
      } else {
	next;
      }
      $node->parentNode->removeChild($node);
    }
  }
  $data{dom} = $dom;
  \%data;
}

sub parse_keywords ($str) {
  if ($str =~ s/\s*(?:"(.+?)"|(\w+))\s*->(?:\s*(?:"(.+?)"|(\w+)))//) {
    ([$1 // $2, $3 // $4], $str);
  } else {
    die "Keywords not arranged properly\n"
  }
}

# I wonder if sean combs has made a similar class
{
  package PDate;
  
  sub new {
    my $class = shift;
    my $self = { year => 0 + shift,
		 month => 0 + shift,
		 day => 0 + shift,
	       };
    bless $self, $class;
    return $self;
  }

  # $d1 is greater than $d2
  sub cmp  {
    my ($d1, $d2) = @_;
    for ($d1->{year} <=> $d2->{year},
	 $d1->{month} <=> $d2->{month},
	 $d1->{day} <=> $d2->{day}) {
      return $_ unless $_ == 0
    }
    0
  }
  use overload '<=>' => \&cmp;
 
  sub fmt {
    my $self = shift;
    my @months =
      qw(January Febuary March April May June July August September November October December);
    my $n = $self->{day};
    if ($n == 1) { $n = '1st' }
    elsif (($n - 2) % 10 == 0) { $n = "${n}nd" }
    elsif (($n - 3) % 10 == 0) { $n = "${n}rd" }
    else { $n = "${n}th" }
    $months[$self->{month} - 1] . " $n, " . $self->{year}
  }
    
  sub short_fmt {
    my $self = shift;
    join "-", ($self->{year}, $self->{month}, $self->{day});
  }
  sub text_easy {
    my $self = shift;
    join "-", ($self->{year}, $self->{month}, $self->{day});
  }
}