diff options
author | Case Duckworth | 2022-05-29 00:41:41 -0500 |
---|---|---|
committer | Case Duckworth | 2022-05-29 00:41:41 -0500 |
commit | 1b2f47456ab4533a4044065edecebb4c11a5e92e (patch) | |
tree | 4a66301ba164932be868f61b3f767dcb5bba69a6 | |
parent | New post, asset moving, dir-locals, etc. (diff) | |
download | hat-trick-1b2f47456ab4533a4044065edecebb4c11a5e92e.tar.gz hat-trick-1b2f47456ab4533a4044065edecebb4c11a5e92e.zip |
Stub out new version of markup language and awk script
-rw-r--r-- | ht.txt | 176 | ||||
-rwxr-xr-x | ht3.awk | 168 |
2 files changed, 344 insertions, 0 deletions
diff --git a/ht.txt b/ht.txt new file mode 100644 index 0000000..c409335 --- /dev/null +++ b/ht.txt | |||
@@ -0,0 +1,176 @@ | |||
1 | # HAT TRICK | ||
2 | |||
3 | HAT TRICK is both a lightweight markup language inspired by gemtext and html, | ||
4 | and this awk program to convert the markup language to gemtext, html, and | ||
5 | gophermap markup. It uses a mixture of "block"-level and line-level sigils to | ||
6 | extend the pure line-based markup of gemtext, while removing some of the more | ||
7 | annoying points (to my mind) of writing pure html---i.e., repetitive tags and | ||
8 | other boilerplate. | ||
9 | |||
10 | ## Syntax | ||
11 | |||
12 | ### Blocks | ||
13 | |||
14 | In HAT TRICK, each block of text separated by a blank line is a type of "block." The | ||
15 | default block is a paragraph, or <p> tag in html. (In gemini and gophermaps, no | ||
16 | extra tags are added.) Other blocks defined by the syntax are as follows: | ||
17 | |||
18 | >>> | ||
19 | # HEADING 1 | ||
20 | ## HEADING 2 | ||
21 | ### HEADING 3 | ||
22 | <<< | ||
23 | |||
24 | Correspond to <hx> in html; passed through unmodified to gemtext and gophermaps. | ||
25 | |||
26 | >>> | ||
27 | > BLOCK QUOTE | ||
28 | <<< | ||
29 | |||
30 | Corresponds to <blockquote>; passed through unmodified to gemtext and | ||
31 | gophermaps. | ||
32 | |||
33 | >>> | ||
34 | - UNORDERED LIST ITEM | ||
35 | 1. ORDERED LIST ITEM | ||
36 | <<< | ||
37 | |||
38 | The first list item in a block automatically opens the necessary list tag in | ||
39 | html. In gemtext, the "-" is converted to "*" (which signifies a list item); | ||
40 | the hyphen is passed through in gophermaps, because I think it's better syntax | ||
41 | personally. | ||
42 | |||
43 | >>> | ||
44 | --- SECTION BREAK | ||
45 | <<< | ||
46 | |||
47 | A visual indication to break sections. Corresponds to <hr> in html | ||
48 | (TODO: consider html5 <section> tags --- this would take more logic.) | ||
49 | |||
50 | HAT TRICK reflows blocks, which means that only the first line of a block | ||
51 | needs to start with the sigils outlined above. However, each line of the block | ||
52 | can begin with the sigil character for easier reading. | ||
53 | |||
54 | ### Lines | ||
55 | |||
56 | Within blocks, there are certain other sigils that apply only to the line they | ||
57 | prepend. They include the following: | ||
58 | |||
59 | >>> | ||
60 | => LINK | ||
61 | <<< | ||
62 | |||
63 | Links are probably the most important element in any hypertext language---since | ||
64 | without them, it's hardly hypertext. HAT TRICK borrows its link syntax from | ||
65 | gemtext: the line starts with a "=>", the next field is the link's URL, and the | ||
66 | rest of the line is the link's display text. | ||
67 | |||
68 | >>> | ||
69 | <TAG> HTML TAG | ||
70 | <<< | ||
71 | |||
72 | Lines beginning with an html tag are passed on to html verbatim. The closing | ||
73 | tag is automatically appended to the end of the line, before any ending | ||
74 | punctuation. I've found that 99 times out of 100, I don't want formatting to | ||
75 | include the ending punctuation. | ||
76 | |||
77 | A backslash (\) at the end of the tag line will prevent the tag from being | ||
78 | ended, which is useful for tag-included punctuation as well as nesting tags. | ||
79 | However, the tag is never closed, so you'll have to close it yourself on the | ||
80 | next line. In addition, text that isn't in a tag is html-escaped, so for the | ||
81 | markup to properly apply, you'll need to write something like this: | ||
82 | |||
83 | >>> | ||
84 | <b>She sells \ | ||
85 | <i>sea shells \ | ||
86 | </i> | ||
87 | on the sea shore.\ | ||
88 | </b> | ||
89 | <<< | ||
90 | |||
91 | So while this markup is possible, it's discouraged through the awkwardness of | ||
92 | the syntax. | ||
93 | |||
94 | To translate these tags to meaningful markup in gemtext and gophermaps, a lookup | ||
95 | table is used to correspond the tags to opening and closing characters around | ||
96 | the line's text. This correspondence can be defined with the environment | ||
97 | variable HT_TAGCHARS or HAT TRICK's second positional argument (See INVOCATION, | ||
98 | below). | ||
99 | |||
100 | >>> | ||
101 | ; COMMENT | ||
102 | <<< | ||
103 | |||
104 | Comments in HAT TRICK aren't passed on to the output text---even in html, which | ||
105 | has a comment syntax. Instead, comments are passed, including the prepending | ||
106 | semicolon, to standard error for further processing. | ||
107 | |||
108 | ### Verbatim blocks | ||
109 | |||
110 | Finally, there is a special type of block for passing raw text through to the | ||
111 | next phase of processing: the verbatim block. | ||
112 | |||
113 | >>> | ||
114 | >>> [OUTPUTS] | ||
115 | VERBATIM TEXT | ||
116 | <<< | ||
117 | <<< | ||
118 | |||
119 | In html, the verbatim text is wrapped in <pre><code> tags; in gemtext, it's | ||
120 | wrapped in gemtext's own verbatim text markers ```; and the text is unwrapped in | ||
121 | gophermaps for a cleaner look. | ||
122 | |||
123 | The OUTPUTS can be any output specifier HAT TRICK accepts; see INVOCATION below | ||
124 | for details. If OUTPUTS is present, the verbatim text will only be passed | ||
125 | through in the output formats listed; with no OUTPUTS listed it will output to | ||
126 | all formats. | ||
127 | |||
128 | ## "Escaping" line- and block-types | ||
129 | |||
130 | Each of the types listed above is anchored at the beginning of the line. | ||
131 | Therefore, a simple "escaping" mechanism is available for free: simply prepend a | ||
132 | space to any line you don't want processed as a line or block and you'll be | ||
133 | gravy. Astute readers will notice I did just that above, to describe the syntax | ||
134 | for verbatim fencing. | ||
135 | |||
136 | ## Invocation | ||
137 | |||
138 | An invocation of HAT TRICK will look something like this: | ||
139 | |||
140 | >>> | ||
141 | ./ht.awk [HT_FORMATS] [HT_TAGCHARS] < INPUT | ||
142 | <<< | ||
143 | |||
144 | It processes text from standard input and uses two positional parameters to | ||
145 | customize its usage, in addition to environment variables. In each instance, | ||
146 | the parameter will override the variable, and if neither is provided, HAT TRICK | ||
147 | will choose a default. | ||
148 | |||
149 | ### HT_FORMATS (default: "html") | ||
150 | |||
151 | The format(s) to export to. Can be one or more of "html", "gemini", and | ||
152 | "gopher". As a convenience, a format can be prepended with a "-" (i.e., | ||
153 | "-html"), in which case every other format will be exported to. Multiple | ||
154 | formats can also be specified by separating them with a comma. The special | ||
155 | keyword "all" will export to all formats (this is the default). | ||
156 | |||
157 | If HAT TRICK exports to one format, it will simply print out each line | ||
158 | translated into that format. However, if more than one format is given, | ||
159 | HAT TRICK prints each line multiple times, prepending the name of the format to | ||
160 | the output. This allows for further processing to filter outputs according to | ||
161 | output type with just one pass through the input. | ||
162 | |||
163 | ### HT_TAGCHARS (default: 'b:**,i://,code:``') | ||
164 | |||
165 | The correspondence between html tag lines and other output formats. If | ||
166 | HT_FORMATS is only html, this option has no real meaning. | ||
167 | |||
168 | Each correspondence is of the (exploded) form | ||
169 | |||
170 | >>> | ||
171 | TAG : LEFT_CHAR RIGHT_CHAR | ||
172 | <<< | ||
173 | |||
174 | where TAG is the html tag, LEFT_CHAR is the character on the left of the | ||
175 | enclosed text, and RIGHT_CHAR is the character on the right. Rules can be | ||
176 | separated by commas to pass multiple ones to HAT TRICK. | ||
diff --git a/ht3.awk b/ht3.awk new file mode 100755 index 0000000..996cb1a --- /dev/null +++ b/ht3.awk | |||
@@ -0,0 +1,168 @@ | |||
1 | #!/usr/bin/awk -f | ||
2 | # -*- indent-tabs-mode: t; -*- | ||
3 | # HAT TRICK | ||
4 | # (C) 2022 C. Duckworth | ||
5 | |||
6 | ### Commentary: | ||
7 | |||
8 | ### Code: | ||
BEGIN {
	# Output state.  Text is collected in BUFFER and emitted in blocks;
	# BLOCK names the type of block being collected, starting with a
	# standard paragraph.
	BUFFER = ""
	BLOCK = "p"

	# Work out which output formats were requested.
	split("html,gemini,gopher", HT_FORMATS_AVAILABLE, ",")
	process_arguments()
	normalize_ht_formats()

	# Use the built-in tag/character table when none was supplied.
	if (! HT_TAGCHARS[1])
		split("b:**,i://,code:``", HT_TAGCHARS, ",")
}
21 | |||
### RAW TEXT
# NOTE(review): every rule in this section is still an empty stub.  awk runs
# the action of EVERY rule whose pattern matches a record, so once the
# actions are filled in, overlapping patterns (marked below) will need
# `next' or a stricter pattern to avoid handling one line twice.
/^>>>/ {		# Opening fence of a verbatim block
}

/^<<</ {		# Closing fence of a verbatim block
}

### BLOCKS
/^#+/ {			# Headers
}

/^>/ {			# Block quote (also matches the ">>>" fence)
}

/^---$/ {		# Section break; kept ahead of the "-" rule, which
			# also matches this line, so it gets the first look
}

/^-/ {			# Unordered list
}

/^[0-9]+\./ {		# Ordered list; one or more digits, so "10." counts too
}

### LINES
/^=>/ {			# Link
}

/^</ {			# HTML tag (also matches the "<<<" fence)
}

/^;/ {			# Comment
}

### EVERYTHING ELSE
{
}
58 | |||
### FINISH
END {
	# Input is exhausted: emit whatever is still sitting in the buffer.
	buflush()
}
63 | |||
64 | ### FUNCTIONS | ||
function buflush()
{
	# Emit any pending text, then reset the buffer.
	if (BUFFER)
		ht_print(BUFFER)
	BUFFER = ""

	# A closing tag is only printed when html is among the output
	# formats, and never for a raw block.
	if (BLOCK == "raw" || !("html" in HT_FORMATS))
		return
	ht_print("</" BLOCK ">")
}
77 | |||
function bufpush(str)
{
	# Append STR to the buffer.  A newline separates it from the text
	# already there; an empty buffer simply becomes STR.
	if (BUFFER) {
		BUFFER = BUFFER "\n" str
	} else {
		BUFFER = BUFFER str
	}
}
83 | |||
function ht_print(str,    lines, n, i, format)
{
	# Print STR to standard output.
	#
	# With exactly one output format (HT_FORMATS_COUNT == 1) STR is
	# printed as is.  Otherwise every line of STR is printed once per
	# key of HT_FORMATS, prefixed with that key and a tab.
	#
	# The names after STR in the parameter list are locals, not
	# arguments; callers pass STR only.
	if (HT_FORMATS_COUNT == 1) {
		print str
		return
	}

	# Walk the lines by count rather than by truthiness, so an empty
	# line or a line reading "0" does not cut the output short.
	n = split(str, lines, "\n")
	for (format in HT_FORMATS) {
		for (i = 1; i <= n; i++) {
			printf "%s\t%s\n", format, lines[i]
		}
	}
}
98 | |||
function html_escape(str)
{
	# Return STR with the HTML special characters &, < and > replaced by
	# entities, and a single leading space replaced by &nbsp;.
	#
	# "&" is handled first so the entities added afterwards are not
	# escaped a second time.  In a gsub()/sub() replacement a bare "&"
	# stands for the matched text, hence the "\\&" for a literal one.
	# STR is a scalar parameter, so the caller's value is not modified.
	gsub(/&/, "\\&amp;", str)
	gsub(/</, "\\&lt;", str)
	gsub(/>/, "\\&gt;", str)
	sub(/^ /, "\\&nbsp;", str)
	return str
}
108 | |||
function normalize_ht_formats(    i, n, name, known, requested, excluded, selected, positive, wants_all)
{
	# Turn HT_FORMATS from a list of requests into a set of format names.
	#
	# On entry HT_FORMATS holds the requested formats as array values
	# (as produced by split()), or a false value in HT_FORMATS[1] when
	# nothing was requested.  On exit it holds HT_FORMATS[name] = name
	# for every selected format, and HT_FORMATS_COUNT is the number of
	# selected formats.
	#
	# Requests: a plain name selects that format; "-name" excludes it;
	# "all" selects every format.  With no plain name given, every
	# format that was not excluded is selected.  Names missing from
	# HT_FORMATS_AVAILABLE are ignored with a warning on standard error.
	#
	# The names in the parameter list are locals; callers pass nothing.

	# Index the supported names for membership tests.
	for (i in HT_FORMATS_AVAILABLE) {
		known[HT_FORMATS_AVAILABLE[i]] = 1
	}

	# Copy the requests aside (skipping empty/false placeholders) and
	# empty HT_FORMATS so no numeric keys are left behind.
	n = 0
	for (i in HT_FORMATS) {
		if (HT_FORMATS[i]) {
			requested[++n] = HT_FORMATS[i]
		}
	}
	split("", HT_FORMATS)	# portable way to clear an array

	positive = 0
	wants_all = 0
	for (i = 1; i <= n; i++) {
		name = requested[i]
		if (name == "all") {
			wants_all = 1
		} else if (name ~ /^-/) {
			excluded[substr(name, 2)] = 1
		} else if (name in known) {
			selected[name] = 1
			positive++
		} else {
			print "ht: ignoring unknown format: " name | "cat 1>&2"
			close("cat 1>&2")
		}
	}

	HT_FORMATS_COUNT = 0
	for (name in known) {
		if (name in excluded) {
			continue
		}
		if (wants_all || ! positive || (name in selected)) {
			HT_FORMATS[name] = name
			HT_FORMATS_COUNT++
		}
	}
}
140 | |||
function process_arguments(    a, arg, value, target)
{
	# Read the command-line options out of ARGV.
	#
	#   -c LIST, --chars=LIST    tag <-> markup character correspondence;
	#                            LIST is split on commas into HT_TAGCHARS
	#   -f LIST, --format=LIST   output formats; LIST is split on commas
	#                            into HT_FORMATS
	#
	# HT_FORMATS[1] and HT_TAGCHARS[1] are first set to 0 so callers can
	# test them to see whether an option supplied a value.  Consumed
	# arguments are blanked in ARGV, because awk would otherwise try to
	# open them as input files; all other arguments are left untouched.
	#
	# NOTE(review): when the script is started through an "awk -f"
	# shebang, awk itself may consume a bare -f/-c before the script
	# sees it; passing options after "--" should avoid that.  Confirm.
	#
	# The names in the parameter list are locals; callers pass nothing.
	HT_FORMATS[1] = 0
	HT_TAGCHARS[1] = 0

	# Loop on ARGC rather than on the truth of ARGV[a], so an empty or
	# "0" argument does not end option processing early.
	for (a = 1; a < ARGC; a++) {
		arg = ARGV[a]
		if (arg == "-c" || arg ~ /^--chars=/) {
			target = "chars"
		} else if (arg == "-f" || arg ~ /^--format=/) {
			target = "format"
		} else {
			continue
		}
		ARGV[a] = ""

		if (arg ~ /^--/) {
			# Long form: the value follows the "=".
			value = arg
			sub(/^[^=]*=/, "", value)
		} else if (a + 1 < ARGC) {
			# Short form: the value is the next argument.
			value = ARGV[++a]
			ARGV[a] = ""
		} else {
			print "ht: option " arg " requires a value" | "cat 1>&2"
			close("cat 1>&2")
			break
		}

		if (target == "chars") {
			split(value, HT_TAGCHARS, ",")
		} else {
			split(value, HT_FORMATS, ",")
		}
	}
}