genorg.pl
author Pranshu Sharma <pranshu@bauherren.ovh>
Sun, 15 Dec 2024 20:38:17 +1000
changeset 4 c98c7c32ab46
parent 3 bf4ae6f1dbbf
child 5 24fc5a406138
permissions -rw-r--r--
Cache output, found the best soltuion Instead of repilicating html tree to be reparsed, keep all the parsed info in xml, and for links cache them as well. Forgot to do more commits

#!/usr/bin/perl
# Author: Pranshu Sharma <pranshu@bauherren.ovh>

use v5.36.0;
use strict;
use warnings;

use XML::LibXML;
use File::Find;
use File::Copy qw(copy move);
use Cwd;
use experimental qw(declared_refs);
use Data::Dumper;

# Fix up CLI interface
# Add option ro set cache to 0 through command line
# Print required elisp code it is in $req_config
# Add config option for $max_rss and $max_cat
# Sort cataogirs by entires
# Variable for map_css
# Remove the desc
# Domain thing as well
# Load custom filpe for htmlize and all that
# 2 articles 1 day better sorting
my $cache = 1;

my $config_file = "genorg-conf";
my $dir = "blog";
my $outdir = "out/";
my $emacs = "emacs";
my $css = "style.css";
my $map_css ="other.css";
my $no_name_dir = "c";
my $cache_file = "genorg-cache/cache.xml";
my $domain = "p.bauherren.ovh";
my $max_rss = 30;
# The first page has one less article.  Think of it as a feature.
my $max_cat = 10;

my $username = $ENV{LOGNAME} || $ENV{USER} || getpwuid ($<);
my $template = <<"END";
#+title: Test document
#+subtitle: This is the subtitle
#+author: $username
#+keywords: tag thing | related
#+options: html-link-use-abs-url:nil html-postamble:auto
#+options: html-preamble:t html-scripts:nil html-style:t
#+options: html5-fancy:nil tex:t
#+options: tex:mathjax
#+html_doctype: html5
#+html_container: div
#+html_content_class: content
#+html_link_home:
#+html_link_up:
#+html_mathjax:
#+html_equation_reference_format: \eqref{%s}
#+html_head:
#+html_head_extra:
#+infojs_opt:
#+creator: <a href="https://www.gnu.org/software/emacs/">Emacs</a> 31.0.50 (<a href="https://orgmode.org">Org</a> mode 9.7.11)
#+latex_header:
END

my $req_config = <<"END";
(with-eval-after-load 'org
    (org-link-set-parameters "genorg"
			 :follow nil
			 :export
			 #'(lambda (link desc _ _)
			     (format "<genorg desc=\"%s\" link=\"%s\"/>" desc link))
			 :store nil))
END

my $elisp_code = <<"END" =~ s/\n\s+//gr;
(let ((ls '(LIST)))
  (require 'org)
  (mapc
   (lambda (b)
     (with-current-buffer b
       (when (and (not (string-match "^ " (buffer-name b)))
		  (eq major-mode 'org-mode))
	 (org-export-to-file 'html (pop ls)))))
   (buffer-list))
  (kill-emacs))
END


my ($h_intro, $h_c_intro, $h_cat_li, $h_chap, $h_en_cat, $h_en, $h_end)
  = split /SPLIT/ ,<<"END" =~ s/\n\s+//gr;
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>TITLE</title>
    <link rel="stylesheet" href="style.css">
  </head>
SPLIT
  <body>
    <h1> Pranshu's Blog </h1>
    <ul id="cataogries">
SPLIT 
      <li> <a href="LINK">  CATAG (NUMBER)</a>
SPLIT
    </ul>
    <div>
      <h2> All articles </h2> <hr>
SPLIT
   <a href="LINK">
     <strong> TITLE </strong> <span>DATE</span>
     <p>DESCREPTION</p>
     <i>CATAG</i>
     <hr>
  </a>
SPLIT
   <a href="LINK">
     <strong> TITLE </strong> <span>DATE</span>
     <p>DESCREPTION</p>
     <hr>
   </a>
SPLIT
    </div>
  </body>
</html>
END

my $help = <<"END";
genorg [option] [dir|file]
Option can be:
 -t FILE : Make a template org file at FILE
 -h      : Help
 -c FILE : sepcify a new config file instead of genorg-conf
 -r      : unconditionally remake org files

If no option is set, accept a directory to generate.

$config_file follow the syntax of:
VAR VALUE
where VAR can be:
 emacs-path : The value should be path of emacs. 'emacs' by default.
 css        : The path of the css file to use.
 in_dir     : The directory that contains the files.
 out_dir    : the output directory.
 no_name_dir: The directory for generate files.  This means in_dir/no_name_dir
              cannot exist.
END


my $rss_entry = <<"END" =~ s/\n\s+//gr;
<?xml version=\"1.0\" encoding=\"UTF-8\" ?>
<rss version=\"2.0\">
 <channel>
  <title>W3Schools Home Page</title>
  <link>https://www.w3schools.com</link>
  <description>Free web building tutorials</description>"
END
  
my %config_vars =
  (
   'emacs-path' => \$emacs,
   css => \$css,
   indir => \$dir,
   outdir => \$outdir,
   no_name_dir => \$no_name_dir
  );

sub read_config ($path) {
  open my $fh, '<', $path or die "Couldn't open config file\n";
  while (<$fh>) {
    next if /^\s+$/;
    chomp;
    my ($directive, $rest) = split /\s+/, $_, 2;
    if (exists($config_vars{$directive})) {
      ${$config_vars{$directive}} = $rest;
    } else {
      print "Unknown variable $directive\n";
    }
  }
  close $fh;
}

sub help_and_bye {
  print $help;
  exit;
}

sub template ($file) {
  $file // die "-t needs an argument.\n";
  $file =~ s/\.org$//;
  open my $fh, '>', $file . ".org" or die $file . " couldn't be opened\n";
  print $fh $template;
  close $fh;
}

# Reletive file from the perspect of file1, to file2
sub prel_path ($from, $to) {
  return '.' if $from eq $to;
  my \(@f1, @f2) = map [m{/[^/]+}g], ($from, $to);
  # return substr($f1[$#f1], 1) if $from =~ $to;
  while (@f1 && @f2 && $f1[0] =~ $f2[0]) {
    shift @f1;
    shift @f2;
  }
  my $p = @f1-1;
  local $" = "";
  "../" x ($p > 0 ? $p : 0) . substr("@f2", 1);
}

my @org_exps;
my @files_to_make;

sub wanted {
  my $n_path = $outdir . prel_path ($dir,  "$File::Find::name/");
  # We need to mirror the direcotry structure
  if (-d) {
    push @files_to_make, $n_path || die "Couldn't make directory $n_path\n"
      unless -d $n_path;
    return;
  }
  if (/\.org$/) {
    push @org_exps, substr $n_path, length($outdir);
  } else {
    copy $_, $n_path;
  }
}

my @modified_files;

# Export @org_exps
sub conv_files {
  my %changed_files;
  # Remove files that are already converted
  for my $rel_file (@org_exps) {
    # Emacs need the full file path
    my ($file, $to) = map { getcwd . "/$_" . $rel_file} ($dir, $outdir);
    s/org$/html/ for ($to, $rel_file);
    unless ($cache && -f $to && (stat($to))[9] > (stat($file))[9]) {
      $changed_files{$file} = $to;
      push @modified_files, $rel_file;
    }
  }
  %changed_files // exit;
  $elisp_code =~ s/LIST/join '', map "\"$_\" ", values %changed_files /e;
  open my $fh, '-|', ($emacs, '--batch', '~/.emacs.d/init.el', keys %changed_files, "--eval" , "$elisp_code");
  close $fh;
  \%changed_files
}

my $cdir;

$ARGV[0] // die "No arguments provided\n";
while ($_ = shift @ARGV) {
  /^(-h|--?help)$/ && help_and_bye;
  /^-t$/ && do { template shift @ARGV ; exit };
  /^-c$/ && do { $config_file = shift @ARGV; next };
  /^-d$/ && do { $cache = 0; next };
  $cdir = $_;
}
$cdir || die "directory not provided\n";
main();

my %catags;
my @arts;
my $cache_dom;

sub main {
  -d $cdir or help_and_bye;
  mkdir $outdir;
  chdir $cdir or die "Couldn't access $cdir\n";
  read_config $config_file;
  -d $dir or die "$dir doesn't exist \n";
  find (\&wanted, $dir);
  mkdir for @files_to_make;
  values %{conv_files()};
  my $cache_dom = XML::LibXML->load_xml(location => $cache_file) if $cache && -f $cache_file;
  chdir $outdir;
  @arts = sort { $b->{date} <=> $a->{date}} map { html_fixup($_) } @org_exps;
  chdir '..';
  populate_cache();
  # finalise_html();
}

sub populate_cache {
  my $dom = XML::LibXML::Document->new('1.0', 'UTF-8');
  my $root = $dom->createElement('root');
  my %c_els;
  for my $art (@arts) {
    # We need the: related, file, date
    my ($art_el, $date, $title, $ca, $related) =
      map {$dom->createElement($_)} ("art", "date", "title", "catag", "related");
    $art_el->{path} = $art->{file};
    $ca->appendText($art->{catag});
    $title->appendText($art->{title});
    $date->appendText($art->{date}->text_easy);
    my $rel_str;
    for my $thing ($art->{related}) {
      for (@{$thing}) {
	my ($catag, $artt) = @{$_};
	my $rel_str .= "\"$catag\"" . ($artt && "->\"$artt\"");
	$related->appendText("\"$catag\"" . ($artt && "->\"$artt\" "));
      }
    }
    $art_el->appendChild($_) for ($date, $ca, $related);
    $root->appendChild($art_el);
  }
  $root->appendChild($_) for values %c_els;
  $dom->setDocumentElement($root);
  open my $fh, '>' ,$cache_file;
  print $fh ($dom->toString(1) =~ s/(.+)/$1\n<!-- Nicley formated :() -->/r);
  close $fh;
}

sub entry {
  my ($data, $catagory) = @_;
  my $str = defined $catagory
    ? ($h_en_cat =~ s/CATAG/$data->{catag}/er) : $h_en;
  my $prel_path = "/$no_name_dir/" . (defined $catagory ? "c" : "c/b");
  $str =~ s/DATE/$data->{date}->fmt/er
    =~ s/TITLE/$data->{title}/er
    =~ s/DESCREPTION/$data->{desc}/er
    =~ s|LINK|prel_path($prel_path, '/'. $data->{file})|er
  }

sub rss_en ($data) {
  my $thing = <<"END" =~ s/\n\s+//gr;
  <item>
    <title>TITLE</title>
    <link>LINK</link>
    <description>DESC</description>
  </item>
END
  $thing =~ s|LINK|"$domain/" . $data->{file}|er
    =~ s/TITLE/$data->{title}/er
    =~ s/DESC/$data->{desc}/er;
}

sub min ($x, $y) {
  ($x, $y)[$x > $y]
}

# @_ has to be in format of [STRING, LINK]
sub list2paginaiton {
  my $page = shift;
  '<ul>' . join ("", map {
    my ($str, $file) = @{$_};
    $str = "---$str" if $file && ($file == $page);
    defined $file ?
      "<li><a href=\"$file.html\">$str</a>" : $str
    } ((["←", $page - 1]) x!! ($page - 1),
       @_,
       (["→", $page + 1]) x ($page != $_[$#_]->[1]))) . '</ul>'
     }

sub move_on_if_neccasary ($fh, $art_num, $total_articles, $path) {
  return $fh if $art_num == 0 || $art_num % $max_cat;
  my $cpage = $art_num / $max_cat;
  print $fh panigation($cpage, int 0.6 +  $total_articles / $max_cat, $path);
  open $fh, '>', $path . '/' . ($cpage + 1).'.html';
  $fh;
}

sub panigation ($page, $no_pages, $dir) {
  say ($no_pages - 4);
  if ($no_pages < 9) {
    list2paginaiton ($page, (map [$_, $_], (1..$no_pages)))
  } elsif (4 < $page <= ($no_pages - 4)) {
    say "HERE";
    list2paginaiton
      ($page,
       [1, 1],
       ["<li>..."],
       (map [$_, $_], (($page-2)..($page+2))),
       ["<li>..."],
       [($no_pages) x 2])  
    } elsif ($page < 6) {
      list2paginaiton $page,
	((map [$_, $_], (1..5)),
	 ["<li>..."],
	 [($no_pages) x 2])
      } else {
	list2paginaiton $page,
	  ([1,1],
	   ["<li>..."],
	   (map [$_, $_], (($no_pages - 5)..$no_pages)))
	}
}

sub finalise_html {
  mkdir $no_name_dir;
  open my $fh, '>', "$no_name_dir/1.html";
  open my $mrss, '>', "$no_name_dir/rss.xml"; # The master rss file
  print $mrss $rss_entry;
  print $fh $h_intro . $h_c_intro;
  my (%c_files, %r_files);	# Cataogry files and rss
  while (my ($cat, $num) = each %catags) {
    # ($h_intro, $h_c_intro, $h_cat_li, $h_chap, $h_en_cat, $h_en, $h_end)
    print $fh $h_cat_li =~ s/NUMBER/$num/re
      =~ s/CATAG/$cat/er
      =~ s=LINK="$cat/1.html"=er;
    mkdir "$no_name_dir/$cat";
    open $c_files{$cat}, '>', "$no_name_dir/$cat/1.html";
    open $r_files{$cat}, '>', "$no_name_dir/$cat/rss.xml";
    print {$r_files{$cat}} $rss_entry;
    print {$c_files{$cat}} $h_intro;
  }
  print $fh $h_chap;
  # :-)
  my $total_articles = () =  map {(1)x$_} values %catags;
  # Now we iterate through the articles and add their index
  my ($n_fh, $n_mrss, %n_rss) = (0,0);
  my %n_cat;
  for my $art (@arts) {
    my ($catag, $title) = @{$art}{qw(catag title)};
    my $rentry = rss_en $art;
    if (($n_rss{$catag} // 0) < $max_rss) {
      print {$r_files{$catag}} $rentry;
      ++$n_rss{$catag};
    }
    if ($n_mrss < $max_rss) {
      print $mrss $rentry;
      ++$n_mrss
    }
    # ($fh, $art_num, $total_articles, $path)
    $fh = move_on_if_neccasary $fh, $n_fh, $total_articles, $no_name_dir;
    $n_fh++;
    print $fh entry $art, 1;
    my $ncat = \$n_cat{$catag};
    ++$$ncat;
    my $cfh = \$c_files{$catag};
    $$cfh = move_on_if_neccasary $$cfh, $$ncat, $catags{$catag}, "$no_name_dir/$catag";
    print {$$cfh} entry $art;
    my $a_file = $art->{file};
    # say $a_file;
    if (grep /^$a_file$/, @modified_files) {
      open my $h_file, '>', $a_file;
      # print $h_file $art->{dom}->toString;
      close $h_file;
    }
  }
  print $fh panigation (($n_fh / $max_cat) x 2, $no_name_dir) if $max_cat < $total_articles;
  print $fh $h_end;
  close $fh;
  for (values %c_files) {
    print $_ $h_end;
    close;
  }
  print $mrss '</channel> </rss>';
  for (values %r_files) {
    print $_ '</channel> </rss>';;
    close;
  }
}

# I don't want to prepopulate, as it is not known which ones are
# usless and outdated, and it would be a waste of cycles parsing
# something to know it is not needed
sub getcache {
  $cache_dom || return 0;
  return 0;
}

sub html_fixup ($filename) {
   # for (getcache($filename)) {
    # $_ && return $_;
  # }
  unless (grep /^$filename$/, @modified_files) {
    for (getcache($filename)) {
      return $_ if $_
    }
  }
  my $dom =
    XML::LibXML->load_html(location  => $filename, recover   => 1);
  my %data = ('file', $filename);

  # We don't need style or table of contents heading (if exists)
  for my $xpath ('/html/head/style', '/html/body/div/div[@id="table-of-contents"]/h2') {
    $_->parentNode->removeChild($_) for $dom->findnodes($xpath) 
  }
  
  # Info we need

  # Date
  for ($dom->findnodes('/html/body/div[@id="postamble"]/p[@class="date"]')) {
    my $text = $_->to_literal;
    if ($text =~ /^Date: (\d+)-(\d+)-(\d+)/) {
      my $date = PDate->new($1, $2, $3);
      $data{date} = $date;
      my $p = $_->parentNode;
      $p->parentNode->removeChild($p);
    }
  }
  # Title\
  for ($dom->findnodes('/html/head/title')) {
    $data{title}  = $_->to_literal;
  }
  # keywords/catogry
  for my $node ($dom->findnodes('/html/head/meta')) {
    (my $cont = $node->getAttribute('content')) || next;
    for ($node->getAttribute('name')) {
      if (/description/) {
	$data{desc} = $cont;
      } elsif (/keywords/) {
	my ($key, $rest) = parse_keywords($cont);
	@data{qw[catag title]} = @{$key};
	$catags{$data{catag}}++;
	$rest =~ s/\s+\|\s+//;
	while ($rest) {
	  (my $t,$rest) = parse_keywords($rest);
	  push @{$data{related}}, $t;
	}
      } else {
	next;
      }
      $node->parentNode->removeChild($node);
    }
  }
  $data{dom} = $dom;
  \%data;
}

sub parse_keywords ($str) {
  if ($str =~ s/\s*(?:"(.+?)"|(\w+))\s*->(?:\s*(?:"(.+?)"|(\w+)))//) {
    ([$1 // $2, $3 // $4], $str);
  } else {
    die "Keywords not arranged properly\n"
  }
}

# I wonder if sean combs has made a similar class
{
  package PDate;
  
  sub new {
    my $class = shift;
    my $self = { year => 0 + shift,
		 month => 0 + shift,
		 day => 0 + shift,
	       };
    bless $self, $class;
    return $self;
  }

  # $d1 is greater than $d2
  sub cmp  {
    my ($d1, $d2) = @_;
    for ($d1->{year} <=> $d2->{year},
	 $d1->{month} <=> $d2->{month},
	 $d1->{day} <=> $d2->{day}) {
      return $_ unless $_ == 0
    }
    0
  }
  use overload '<=>' => \&cmp;
 
  sub fmt {
    my $self = shift;
    my @months =
      qw(January Febuary March April May June July August September November October December);
    my $n = $self->{day};
    if ($n == 1) { $n = '1st' }
    elsif (($n - 2) % 10 == 0) { $n = "${n}nd" }
    elsif (($n - 3) % 10 == 0) { $n = "${n}rd" }
    else { $n = "${n}st" }
    $months[$self->{month} - 1] . " $n, " . $self->{year}
  }
    
  sub short_fmt {
    my $self = shift;
    join "-", ($self->{year}, $self->{month}, $self->{day});
  }
  sub text_easy {
    my $self = shift;
    join "-", ($self->{year}, $self->{month}, $self->{day});
  }
}