Cache output, found the best soltuion
Instead of repilicating html tree to be reparsed, keep all the parsed
info in xml, and for links cache them as well.
Forgot to do more commits
#!/usr/bin/perl
# Author: Pranshu Sharma <pranshu@bauherren.ovh>
use v5.36.0;
use strict;
use warnings;
use XML::LibXML;
use File::Find;
use File::Copy qw(copy move);
use Cwd;
use experimental qw(declared_refs);
use Data::Dumper;
# Fix up CLI interface
# Add option ro set cache to 0 through command line
# Print required elisp code it is in $req_config
# Add config option for $max_rss and $max_cat
# Sort cataogirs by entires
# Variable for map_css
# Remove the desc
# Domain thing as well
# Load custom filpe for htmlize and all that
# 2 articles 1 day better sorting
my $cache = 1;
my $config_file = "genorg-conf";
my $dir = "blog";
my $outdir = "out/";
my $emacs = "emacs";
my $css = "style.css";
my $map_css ="other.css";
my $no_name_dir = "c";
my $cache_file = "genorg-cache/cache.xml";
my $domain = "p.bauherren.ovh";
my $max_rss = 30;
# The first page has one less article. Think of it as a feature.
my $max_cat = 10;
my $username = $ENV{LOGNAME} || $ENV{USER} || getpwuid ($<);
my $template = <<"END";
#+title: Test document
#+subtitle: This is the subtitle
#+author: $username
#+keywords: tag thing | related
#+options: html-link-use-abs-url:nil html-postamble:auto
#+options: html-preamble:t html-scripts:nil html-style:t
#+options: html5-fancy:nil tex:t
#+options: tex:mathjax
#+html_doctype: html5
#+html_container: div
#+html_content_class: content
#+html_link_home:
#+html_link_up:
#+html_mathjax:
#+html_equation_reference_format: \eqref{%s}
#+html_head:
#+html_head_extra:
#+infojs_opt:
#+creator: <a href="https://www.gnu.org/software/emacs/">Emacs</a> 31.0.50 (<a href="https://orgmode.org">Org</a> mode 9.7.11)
#+latex_header:
END
my $req_config = <<"END";
(with-eval-after-load 'org
(org-link-set-parameters "genorg"
:follow nil
:export
#'(lambda (link desc _ _)
(format "<genorg desc=\"%s\" link=\"%s\"/>" desc link))
:store nil))
END
my $elisp_code = <<"END" =~ s/\n\s+//gr;
(let ((ls '(LIST)))
(require 'org)
(mapc
(lambda (b)
(with-current-buffer b
(when (and (not (string-match "^ " (buffer-name b)))
(eq major-mode 'org-mode))
(org-export-to-file 'html (pop ls)))))
(buffer-list))
(kill-emacs))
END
my ($h_intro, $h_c_intro, $h_cat_li, $h_chap, $h_en_cat, $h_en, $h_end)
= split /SPLIT/ ,<<"END" =~ s/\n\s+//gr;
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>TITLE</title>
<link rel="stylesheet" href="style.css">
</head>
SPLIT
<body>
<h1> Pranshu's Blog </h1>
<ul id="cataogries">
SPLIT
<li> <a href="LINK"> CATAG (NUMBER)</a>
SPLIT
</ul>
<div>
<h2> All articles </h2> <hr>
SPLIT
<a href="LINK">
<strong> TITLE </strong> <span>DATE</span>
<p>DESCREPTION</p>
<i>CATAG</i>
<hr>
</a>
SPLIT
<a href="LINK">
<strong> TITLE </strong> <span>DATE</span>
<p>DESCREPTION</p>
<hr>
</a>
SPLIT
</div>
</body>
</html>
END
my $help = <<"END";
genorg [option] [dir|file]
Option can be:
-t FILE : Make a template org file at FILE
-h : Help
-c FILE : sepcify a new config file instead of genorg-conf
-r : unconditionally remake org files
If no option is set, accept a directory to generate.
$config_file follow the syntax of:
VAR VALUE
where VAR can be:
emacs-path : The value should be path of emacs. 'emacs' by default.
css : The path of the css file to use.
in_dir : The directory that contains the files.
out_dir : the output directory.
no_name_dir: The directory for generate files. This means in_dir/no_name_dir
cannot exist.
END
my $rss_entry = <<"END" =~ s/\n\s+//gr;
<?xml version=\"1.0\" encoding=\"UTF-8\" ?>
<rss version=\"2.0\">
<channel>
<title>W3Schools Home Page</title>
<link>https://www.w3schools.com</link>
<description>Free web building tutorials</description>"
END
my %config_vars =
(
'emacs-path' => \$emacs,
css => \$css,
indir => \$dir,
outdir => \$outdir,
no_name_dir => \$no_name_dir
);
sub read_config ($path) {
open my $fh, '<', $path or die "Couldn't open config file\n";
while (<$fh>) {
next if /^\s+$/;
chomp;
my ($directive, $rest) = split /\s+/, $_, 2;
if (exists($config_vars{$directive})) {
${$config_vars{$directive}} = $rest;
} else {
print "Unknown variable $directive\n";
}
}
close $fh;
}
sub help_and_bye {
print $help;
exit;
}
sub template ($file) {
$file // die "-t needs an argument.\n";
$file =~ s/\.org$//;
open my $fh, '>', $file . ".org" or die $file . " couldn't be opened\n";
print $fh $template;
close $fh;
}
# Reletive file from the perspect of file1, to file2
sub prel_path ($from, $to) {
return '.' if $from eq $to;
my \(@f1, @f2) = map [m{/[^/]+}g], ($from, $to);
# return substr($f1[$#f1], 1) if $from =~ $to;
while (@f1 && @f2 && $f1[0] =~ $f2[0]) {
shift @f1;
shift @f2;
}
my $p = @f1-1;
local $" = "";
"../" x ($p > 0 ? $p : 0) . substr("@f2", 1);
}
my @org_exps;
my @files_to_make;
sub wanted {
my $n_path = $outdir . prel_path ($dir, "$File::Find::name/");
# We need to mirror the direcotry structure
if (-d) {
push @files_to_make, $n_path || die "Couldn't make directory $n_path\n"
unless -d $n_path;
return;
}
if (/\.org$/) {
push @org_exps, substr $n_path, length($outdir);
} else {
copy $_, $n_path;
}
}
my @modified_files;
# Export @org_exps
sub conv_files {
my %changed_files;
# Remove files that are already converted
for my $rel_file (@org_exps) {
# Emacs need the full file path
my ($file, $to) = map { getcwd . "/$_" . $rel_file} ($dir, $outdir);
s/org$/html/ for ($to, $rel_file);
unless ($cache && -f $to && (stat($to))[9] > (stat($file))[9]) {
$changed_files{$file} = $to;
push @modified_files, $rel_file;
}
}
%changed_files // exit;
$elisp_code =~ s/LIST/join '', map "\"$_\" ", values %changed_files /e;
open my $fh, '-|', ($emacs, '--batch', '~/.emacs.d/init.el', keys %changed_files, "--eval" , "$elisp_code");
close $fh;
\%changed_files
}
my $cdir;
$ARGV[0] // die "No arguments provided\n";
while ($_ = shift @ARGV) {
/^(-h|--?help)$/ && help_and_bye;
/^-t$/ && do { template shift @ARGV ; exit };
/^-c$/ && do { $config_file = shift @ARGV; next };
/^-d$/ && do { $cache = 0; next };
$cdir = $_;
}
$cdir || die "directory not provided\n";
main();
my %catags;
my @arts;
my $cache_dom;
sub main {
-d $cdir or help_and_bye;
mkdir $outdir;
chdir $cdir or die "Couldn't access $cdir\n";
read_config $config_file;
-d $dir or die "$dir doesn't exist \n";
find (\&wanted, $dir);
mkdir for @files_to_make;
values %{conv_files()};
my $cache_dom = XML::LibXML->load_xml(location => $cache_file) if $cache && -f $cache_file;
chdir $outdir;
@arts = sort { $b->{date} <=> $a->{date}} map { html_fixup($_) } @org_exps;
chdir '..';
populate_cache();
# finalise_html();
}
sub populate_cache {
my $dom = XML::LibXML::Document->new('1.0', 'UTF-8');
my $root = $dom->createElement('root');
my %c_els;
for my $art (@arts) {
# We need the: related, file, date
my ($art_el, $date, $title, $ca, $related) =
map {$dom->createElement($_)} ("art", "date", "title", "catag", "related");
$art_el->{path} = $art->{file};
$ca->appendText($art->{catag});
$title->appendText($art->{title});
$date->appendText($art->{date}->text_easy);
my $rel_str;
for my $thing ($art->{related}) {
for (@{$thing}) {
my ($catag, $artt) = @{$_};
my $rel_str .= "\"$catag\"" . ($artt && "->\"$artt\"");
$related->appendText("\"$catag\"" . ($artt && "->\"$artt\" "));
}
}
$art_el->appendChild($_) for ($date, $ca, $related);
$root->appendChild($art_el);
}
$root->appendChild($_) for values %c_els;
$dom->setDocumentElement($root);
open my $fh, '>' ,$cache_file;
print $fh ($dom->toString(1) =~ s/(.+)/$1\n<!-- Nicley formated :() -->/r);
close $fh;
}
sub entry {
my ($data, $catagory) = @_;
my $str = defined $catagory
? ($h_en_cat =~ s/CATAG/$data->{catag}/er) : $h_en;
my $prel_path = "/$no_name_dir/" . (defined $catagory ? "c" : "c/b");
$str =~ s/DATE/$data->{date}->fmt/er
=~ s/TITLE/$data->{title}/er
=~ s/DESCREPTION/$data->{desc}/er
=~ s|LINK|prel_path($prel_path, '/'. $data->{file})|er
}
sub rss_en ($data) {
my $thing = <<"END" =~ s/\n\s+//gr;
<item>
<title>TITLE</title>
<link>LINK</link>
<description>DESC</description>
</item>
END
$thing =~ s|LINK|"$domain/" . $data->{file}|er
=~ s/TITLE/$data->{title}/er
=~ s/DESC/$data->{desc}/er;
}
sub min ($x, $y) {
($x, $y)[$x > $y]
}
# @_ has to be in format of [STRING, LINK]
sub list2paginaiton {
my $page = shift;
'<ul>' . join ("", map {
my ($str, $file) = @{$_};
$str = "---$str" if $file && ($file == $page);
defined $file ?
"<li><a href=\"$file.html\">$str</a>" : $str
} ((["←", $page - 1]) x!! ($page - 1),
@_,
(["→", $page + 1]) x ($page != $_[$#_]->[1]))) . '</ul>'
}
sub move_on_if_neccasary ($fh, $art_num, $total_articles, $path) {
return $fh if $art_num == 0 || $art_num % $max_cat;
my $cpage = $art_num / $max_cat;
print $fh panigation($cpage, int 0.6 + $total_articles / $max_cat, $path);
open $fh, '>', $path . '/' . ($cpage + 1).'.html';
$fh;
}
sub panigation ($page, $no_pages, $dir) {
say ($no_pages - 4);
if ($no_pages < 9) {
list2paginaiton ($page, (map [$_, $_], (1..$no_pages)))
} elsif (4 < $page <= ($no_pages - 4)) {
say "HERE";
list2paginaiton
($page,
[1, 1],
["<li>..."],
(map [$_, $_], (($page-2)..($page+2))),
["<li>..."],
[($no_pages) x 2])
} elsif ($page < 6) {
list2paginaiton $page,
((map [$_, $_], (1..5)),
["<li>..."],
[($no_pages) x 2])
} else {
list2paginaiton $page,
([1,1],
["<li>..."],
(map [$_, $_], (($no_pages - 5)..$no_pages)))
}
}
sub finalise_html {
mkdir $no_name_dir;
open my $fh, '>', "$no_name_dir/1.html";
open my $mrss, '>', "$no_name_dir/rss.xml"; # The master rss file
print $mrss $rss_entry;
print $fh $h_intro . $h_c_intro;
my (%c_files, %r_files); # Cataogry files and rss
while (my ($cat, $num) = each %catags) {
# ($h_intro, $h_c_intro, $h_cat_li, $h_chap, $h_en_cat, $h_en, $h_end)
print $fh $h_cat_li =~ s/NUMBER/$num/re
=~ s/CATAG/$cat/er
=~ s=LINK="$cat/1.html"=er;
mkdir "$no_name_dir/$cat";
open $c_files{$cat}, '>', "$no_name_dir/$cat/1.html";
open $r_files{$cat}, '>', "$no_name_dir/$cat/rss.xml";
print {$r_files{$cat}} $rss_entry;
print {$c_files{$cat}} $h_intro;
}
print $fh $h_chap;
# :-)
my $total_articles = () = map {(1)x$_} values %catags;
# Now we iterate through the articles and add their index
my ($n_fh, $n_mrss, %n_rss) = (0,0);
my %n_cat;
for my $art (@arts) {
my ($catag, $title) = @{$art}{qw(catag title)};
my $rentry = rss_en $art;
if (($n_rss{$catag} // 0) < $max_rss) {
print {$r_files{$catag}} $rentry;
++$n_rss{$catag};
}
if ($n_mrss < $max_rss) {
print $mrss $rentry;
++$n_mrss
}
# ($fh, $art_num, $total_articles, $path)
$fh = move_on_if_neccasary $fh, $n_fh, $total_articles, $no_name_dir;
$n_fh++;
print $fh entry $art, 1;
my $ncat = \$n_cat{$catag};
++$$ncat;
my $cfh = \$c_files{$catag};
$$cfh = move_on_if_neccasary $$cfh, $$ncat, $catags{$catag}, "$no_name_dir/$catag";
print {$$cfh} entry $art;
my $a_file = $art->{file};
# say $a_file;
if (grep /^$a_file$/, @modified_files) {
open my $h_file, '>', $a_file;
# print $h_file $art->{dom}->toString;
close $h_file;
}
}
print $fh panigation (($n_fh / $max_cat) x 2, $no_name_dir) if $max_cat < $total_articles;
print $fh $h_end;
close $fh;
for (values %c_files) {
print $_ $h_end;
close;
}
print $mrss '</channel> </rss>';
for (values %r_files) {
print $_ '</channel> </rss>';;
close;
}
}
# I don't want to prepopulate, as it is not known which ones are
# usless and outdated, and it would be a waste of cycles parsing
# something to know it is not needed
sub getcache {
$cache_dom || return 0;
return 0;
}
sub html_fixup ($filename) {
# for (getcache($filename)) {
# $_ && return $_;
# }
unless (grep /^$filename$/, @modified_files) {
for (getcache($filename)) {
return $_ if $_
}
}
my $dom =
XML::LibXML->load_html(location => $filename, recover => 1);
my %data = ('file', $filename);
# We don't need style or table of contents heading (if exists)
for my $xpath ('/html/head/style', '/html/body/div/div[@id="table-of-contents"]/h2') {
$_->parentNode->removeChild($_) for $dom->findnodes($xpath)
}
# Info we need
# Date
for ($dom->findnodes('/html/body/div[@id="postamble"]/p[@class="date"]')) {
my $text = $_->to_literal;
if ($text =~ /^Date: (\d+)-(\d+)-(\d+)/) {
my $date = PDate->new($1, $2, $3);
$data{date} = $date;
my $p = $_->parentNode;
$p->parentNode->removeChild($p);
}
}
# Title\
for ($dom->findnodes('/html/head/title')) {
$data{title} = $_->to_literal;
}
# keywords/catogry
for my $node ($dom->findnodes('/html/head/meta')) {
(my $cont = $node->getAttribute('content')) || next;
for ($node->getAttribute('name')) {
if (/description/) {
$data{desc} = $cont;
} elsif (/keywords/) {
my ($key, $rest) = parse_keywords($cont);
@data{qw[catag title]} = @{$key};
$catags{$data{catag}}++;
$rest =~ s/\s+\|\s+//;
while ($rest) {
(my $t,$rest) = parse_keywords($rest);
push @{$data{related}}, $t;
}
} else {
next;
}
$node->parentNode->removeChild($node);
}
}
$data{dom} = $dom;
\%data;
}
sub parse_keywords ($str) {
if ($str =~ s/\s*(?:"(.+?)"|(\w+))\s*->(?:\s*(?:"(.+?)"|(\w+)))//) {
([$1 // $2, $3 // $4], $str);
} else {
die "Keywords not arranged properly\n"
}
}
# I wonder if sean combs has made a similar class
{
package PDate;
sub new {
my $class = shift;
my $self = { year => 0 + shift,
month => 0 + shift,
day => 0 + shift,
};
bless $self, $class;
return $self;
}
# $d1 is greater than $d2
sub cmp {
my ($d1, $d2) = @_;
for ($d1->{year} <=> $d2->{year},
$d1->{month} <=> $d2->{month},
$d1->{day} <=> $d2->{day}) {
return $_ unless $_ == 0
}
0
}
use overload '<=>' => \&cmp;
sub fmt {
my $self = shift;
my @months =
qw(January Febuary March April May June July August September November October December);
my $n = $self->{day};
if ($n == 1) { $n = '1st' }
elsif (($n - 2) % 10 == 0) { $n = "${n}nd" }
elsif (($n - 3) % 10 == 0) { $n = "${n}rd" }
else { $n = "${n}st" }
$months[$self->{month} - 1] . " $n, " . $self->{year}
}
sub short_fmt {
my $self = shift;
join "-", ($self->{year}, $self->{month}, $self->{day});
}
sub text_easy {
my $self = shift;
join "-", ($self->{year}, $self->{month}, $self->{day});
}
}