From 1b2f47456ab4533a4044065edecebb4c11a5e92e Mon Sep 17 00:00:00 2001 From: Case Duckworth Date: Sun, 29 May 2022 00:41:41 -0500 Subject: Stub out new version of markup language and awk script --- ht.txt | 176 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ ht3.awk | 168 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 344 insertions(+) create mode 100644 ht.txt create mode 100755 ht3.awk diff --git a/ht.txt b/ht.txt new file mode 100644 index 0000000..c409335 --- /dev/null +++ b/ht.txt @@ -0,0 +1,176 @@ +# HAT TRICK + +HAT TRICK is both a lightweight markup language inspired by gemtext and html, +and this awk program to convert the markup language to gemtext, html, and +gophermap markup. It uses a mixture of "block"-level and line-level sigils to +extend the pure line-based markup of gemtext, while removing some of the more +annoying points (to my mind) of writing pure html---i.e., repetitive tags and +other boilerplate. + +## Syntax + +### Blocks + +In HAT TRICK, block of text separated by a blank line is a type of "block." The +default block is a paragraph, or
<p> tag in html. (In gemini and gophermaps, no
+extra tags are added.) Other blocks defined by the syntax are as follows:
+
+>>>
+# HEADING 1
+## HEADING 2
+### HEADING 3
+<<<
+
+Correspond to <h1>, <h2>, and <h3> tags in html; passed through unmodified to gemtext and
+gophermaps.
+
+>>>
+- UNORDERED LIST ITEM
+1. ORDERED LIST ITEM
+<<<
+
+The first list item in a block automatically opens the necessary list tag in
+html. In gemtext, the "-" is converted to "*" (which signifies a list item);
+the hyphen is passed through in gophermaps, because I think it's better syntax
+personally.
+
+>>>
+--- SECTION BREAK
+<<<
+
+A visual indication to break sections. Corresponds to <hr> in html
+(TODO: consider html5 tags; in gemtext, it's
+wrapped in gemtext's own verbatim text markers ```; and the text is unwrapped in
+gophermaps for a cleaner look.
+
+The OUTPUTS can be any output specifier HAT TRICK accepts; see INVOCATION below
+for details. If OUTPUTS is present, the verbatim text will only be passed
+through in the output formats listed; with no OUTPUTS listed it will output to
+all formats.
+
+## "Escaping" line- and block-types
+
+Each of the types listed above is anchored at the beginning of the line.
+Therefore, a simple "escaping" mechanism is available for free: simply prepend a
+space to any line you don't want processed as a line or block and you'll be
+gravy. Astute readers will notice I did just that above, to describe the syntax
+for verbatim fencing.
+
+## Invocation
+
+An invocation of HAT TRICK will look something like this:
+
+>>>
+./ht.awk [HT_FORMATS] [HT_TAGCHARS] < INPUT
+<<<
+
+It processes text from standard input and uses two positional parameters to
+customize its usage, in addition to environment variables. In each instance,
+the parameter will override the variable, and if neither is provided, HAT TRICK
+will choose a default.
+
+### HT_FORMATS (default: "all")
+
+The format(s) to export to. Can be one or more of "html", "gemini", and
+"gopher". As a convenience, a format can be prepended with a "-" (i.e.,
+"-html"), in which case every other format will be exported to. Multiple
+formats can also be specified by separating them with a comma. The special
+keyword "all" will export to all formats (this is the default).
+
+If HAT TRICK exports to one format, it will simply print out each line
+translated into that format. However, if more than one format is given,
+HAT TRICK prints each line multiple times, prepending the name of the format to
+the output. This allows for further processing to filter outputs according to
+output type with just one pass through the input.
+
+### HT_TAGCHARS (default: 'b:**,i://,code:``')
+
+The correspondence between html tag lines and other output formats. If
+HT_FORMATS is only html, this option has no real meaning.
+
+Each correspondence is of the (exploded) form
+
+>>>
+TAG : LEFT_CHAR RIGHT_CHAR
+<<<
+
+where TAG is the html tag, LEFT_CHAR is the character on the left of the
+enclosed text, and RIGHT_CHAR is the character on the right. Rules can be
+separated by commas to pass multiple ones to HAT TRICK.
diff --git a/ht3.awk b/ht3.awk
new file mode 100755
index 0000000..996cb1a
--- /dev/null
+++ b/ht3.awk
@@ -0,0 +1,168 @@
+#!/usr/bin/awk -f
+# -*- indent-tabs-mode: t; -*-
+# HAT TRICK
+# (C) 2022 C. Duckworth
+
+### Commentary:
+
+### Code:
+BEGIN {
+	# Output is accumulated block by block; nothing is buffered yet.
+	BUFFER = ""
+	# The current block type starts out as a standard paragraph.
+	BLOCK = "p"
+	# Every output format this program knows how to produce.
+	split("html,gemini,gopher", HT_FORMATS_AVAILABLE, ",")
+	# Read the command line, then settle on the set of formats to emit.
+	process_arguments()
+	normalize_ht_formats()
+	# Fall back to the stock tag/character rules when none were supplied.
+	if (! HT_TAGCHARS[1]) {
+		split("b:**,i://,code:``", HT_TAGCHARS, ",")
+	}
+}
+
+### RAW TEXT
+/^>>>/ { # Opening fence of a verbatim (raw) block
+}
+
+/^<<</ { # Closing fence of a verbatim (raw) block
+}
+
+# NOTE(review): awk runs EVERY rule whose pattern matches a record, so the
+# stubs below overlap: /^>/ also matches ">>>" fences and /^-/ also matches
+# "---" section breaks.  Once these actions are filled in they will need
+# `next` statements or tighter patterns to stay mutually exclusive.
+
+### BLOCKS
+/^#+/ { # Headers
+}
+
+/^>/ { # Block quote
+}
+
+/^-/ { # Unordered list
+}
+
+/^[0-9]+\./ { # Ordered list (any number of digits, e.g. "10.")
+}
+
+/^---$/ { # Section break
+}
+
+### LINES
+/^=>/ { # Link
+}
+
+/^</ { # HTML tag
+}
+
+/^;/ { # Comment
+}
+
+### EVERYTHING ELSE
+{
+}
+
+### FINISH
+END {
+	# Input is exhausted: flush whatever is still sitting in the buffer.
+	buflush()
+}
+
+### FUNCTIONS
+function buflush(	closing)
+{
+	# Print the buffer and close the current block.
+	#
+	# Globals read:    BUFFER, BLOCK, HT_FORMATS, HT_FORMATS_COUNT
+	# Globals written: BUFFER (reset to the empty string)
+	if (BUFFER) {
+		ht_print(BUFFER)
+	}
+	BUFFER = ""
+	# Only html needs an explicit closing tag, and raw blocks have none.
+	if ("html" in HT_FORMATS && BLOCK != "raw") {
+		closing = "</" BLOCK ">"
+		if (HT_FORMATS_COUNT == 1) {
+			print closing
+		} else {
+			# In multi-format mode every line carries its format
+			# name; the closing tag belongs to the html stream
+			# only, so it must not go through ht_print(), which
+			# would repeat it for every selected format.
+			printf "%s\t%s\n", "html", closing
+		}
+	}
+	# Blank separator line after the block.  A bare `print` would emit
+	# $0 instead, which in END re-prints the last input record.
+	print ""
+}
+
+function bufpush(str)
+{
+	# Append STR to the global BUFFER; entries are separated by newlines.
+	if (BUFFER) {
+		BUFFER = BUFFER "\n" str
+	} else {
+		# Nothing buffered yet, so no separator is needed.
+		BUFFER = BUFFER str
+	}
+}
+
+function ht_print(str,	lines, count, i, format)
+{
+	# Print STR for the selected output format(s).
+	#
+	# With exactly one format selected, STR is printed as-is.  Otherwise
+	# each line of STR is printed once per format in HT_FORMATS, prefixed
+	# with the format name and a tab, so later tools can filter by format.
+	#
+	# Globals read: HT_FORMATS, HT_FORMATS_COUNT
+	if (HT_FORMATS_COUNT == 1) {
+		print str
+	} else {
+		# Loop on the element count returned by split() rather than on
+		# the truthiness of each element: an empty line or a line
+		# consisting of "0" inside STR would otherwise end the loop
+		# early and silently drop the rest of the block.
+		count = split(str, lines, "\n")
+		for (format in HT_FORMATS) {
+			for (i = 1; i <= count; i++) {
+				printf "%s\t%s\n", format, lines[i]
+			}
+		}
+	}
+}
+
+function html_escape(str)
+{
+	# Escape HTML entities and beginning-line spaces.
+	#
+	# Returns a copy of STR with "&", "<" and ">" replaced by their
+	# entities and a single leading space replaced by "&nbsp;".  awk
+	# passes scalars by value, so editing STR does not affect the caller.
+	#
+	# "&" must be escaped first, otherwise the ampersands introduced by
+	# the later replacements would be escaped a second time.  In a
+	# gsub() replacement a bare "&" means "the matched text", hence the
+	# "\\&" spelling for a literal ampersand.
+	gsub(/&/, "\\&amp;", str)
+	gsub(/</, "\\&lt;", str)
+	gsub(/>/, "\\&gt;", str)
+	sub(/^ /, "\\&nbsp;", str)
+	return str
+}
+
+function normalize_ht_formats(	i, name, wanted, unwanted, want_all, have_wanted)
+{
+	# Turn the raw format request into the final set of output formats.
+	#
+	# On entry HT_FORMATS is indexed 1..n and holds the requested names,
+	# or has a false first element when nothing was requested.  On exit
+	# HT_FORMATS is keyed BY FORMAT NAME (so `"html" in HT_FORMATS` and
+	# `for (format in HT_FORMATS)` work) and HT_FORMATS_COUNT holds the
+	# number of selected formats.
+	#
+	# Request syntax:
+	#   (nothing)   every available format
+	#   all         every available format, whatever else is listed
+	#   NAME        select NAME
+	#   -NAME       exclude NAME; with no positive names listed, every
+	#               other available format is selected
+	# Names that are not in HT_FORMATS_AVAILABLE are ignored.
+	if (! HT_FORMATS[1]) {
+		want_all = 1
+	} else {
+		# `for (i in array)` yields indices, so fetch the name itself.
+		for (i in HT_FORMATS) {
+			name = HT_FORMATS[i]
+			if (name == "all") {
+				want_all = 1
+			} else if (name ~ /^-/) {
+				unwanted[substr(name, 2)] = 1
+			} else {
+				wanted[name] = 1
+				have_wanted = 1
+			}
+		}
+	}
+	# Portable way to empty an array; drops the numeric request keys.
+	split("", HT_FORMATS)
+	HT_FORMATS_COUNT = 0
+	for (i in HT_FORMATS_AVAILABLE) {
+		name = HT_FORMATS_AVAILABLE[i]
+		if (! want_all) {
+			if (name in unwanted) {
+				continue
+			}
+			if (have_wanted && !(name in wanted)) {
+				continue
+			}
+		}
+		HT_FORMATS[name] = name
+		HT_FORMATS_COUNT++
+	}
+}
+
+function process_arguments(	a, arg, value)
+{
+	# Parse command-line options into HT_TAGCHARS and HT_FORMATS.
+	#
+	# Recognised options:
+	#   -c RULES, --chars=RULES     HTML tag <-> markup character
+	#                               correspondence, comma-separated
+	#   -f NAMES, --format=NAMES    output format(s), comma-separated
+	#
+	# Both arrays get a false first element up front so callers can test
+	# `! ARRAY[1]` to detect "not given on the command line".
+	#
+	# NOTE(review): awk itself parses options that precede the first
+	# operand, so `./ht3.awk -f html` is probably read by awk as a second
+	# program file; callers likely need `./ht3.awk -- -f html` or the
+	# long forms -- confirm against the awk in use.
+	HT_FORMATS[1] = 0
+	HT_TAGCHARS[1] = 0
+	for (a = 1; a < ARGC; a++) {
+		# Compare the argument TEXT, not its index.
+		arg = ARGV[a]
+		if (arg == "-c" || arg == "-f") {
+			# Blank out consumed arguments: awk skips empty ARGV
+			# elements, whereas a leftover "-c" would be opened as
+			# an input file once BEGIN finishes.
+			ARGV[a] = ""
+			if (a + 1 >= ARGC) {
+				# Option given without a value; keep defaults.
+				break
+			}
+			value = ARGV[++a]
+		} else if (arg ~ /^--chars=/ || arg ~ /^--format=/) {
+			value = arg
+			sub(/^[^=]*=/, "", value)
+		} else {
+			# Not one of ours; leave it for awk (an input file or
+			# a var=value assignment).
+			continue
+		}
+		ARGV[a] = ""
+		if (arg == "-c" || arg ~ /^--chars=/) {
+			# HT_TAGCHARS is an array
+			split(value, HT_TAGCHARS, ",")
+		} else {
+			# HT_FORMATS is an array
+			split(value, HT_FORMATS, ",")
+		}
+	}
+}
--
cgit 1.4.1-21-gabe81