#!/bin/awk -f
# DOC AWK
# ======
#
# A quick-and-dirty literate-programming-style documentation generator
# inspired by [docco][].
#
# by Case Duckworth <acdw@acdw.net>
#
# Source available under the [Good Choices License][gcl].
#
# [gcl]: https://acdw.casa/gcl Good Choices License
#
# There's a lot of quick-and-dirty "literate programming tools" out there, many
# of which were inspired by, and also borrowed from, docco.  I was particularly
# interested in [shocco][], written in POSIX shell (of which I am a fan).
#
# Notably missing, however, was a converter of some kind written in AWK.  Thus,
# DOC AWK was born.
#
# This page is the result of DOC AWK working on itself.  Not bad for < 250 lines
# including commentary!  You can pick up the raw source code of doc.awk [in its
# git repository][git] to use it yourself.
#
# [docco]: https://ashkenas.com/docco/
# [shocco]: https://rtomayko.github.io/shocco/
# [git]: https://git.acdw.net/docawk
#
# Code
# ----
BEGIN {
	# Configuration is resolved up front, in `BEGIN`.  Every setting goes
	# through the `getenv` helper defined further down: the environment
	# variable wins, then an awk variable of the same name (e.g. from
	# `-v`), then the built-in default.
	#
	# `COMMENT` is the regex that recognizes a comment *line*; inline
	# comments are not detected.  The default matches optional indentation,
	# one or more `#`, and any blanks after them, which suits awk, shell,
	# and other `#`-commented languages.  Override it for anything else.
	COMMENT = getenv("DOCAWK_COMMENT", COMMENT, "^[ \t]*#+[ \t]*")
	# `DOCAWK_TEXTPROC` names the command that renders commentary.  Any
	# text processor works; the default is the `mdown.awk` script vendored
	# in this repo, taken from [d.awk](https://github.com/wernsey/d.awk).
	TEXTPROC = getenv("DOCAWK_TEXTPROC", TEXTPROC, "./mdown.awk")
	# `DOCAWK_CODEPROC` names the command that renders code sections.  The
	# bundled `htmlsafe.awk` simply escapes <, &, and >.
	CODEPROC = getenv("DOCAWK_CODEPROC", CODEPROC, "./htmlsafe.awk")
	# A header file and a footer file are enough for most documents.  The
	# defaults are the bundled header.html and footer.html, because the
	# default output type is html.
	#
	# Each of these files is actually a *template*: keys written as
	# `@@VARIABLE@@` are replaced with the matching variable.  This is
	# mostly used for title expansion.
	HEADER = getenv("DOCAWK_HEADER", HEADER, "./header.html")
	FOOTER = getenv("DOCAWK_FOOTER", FOOTER, "./footer.html")
}

# `FILENAME` has no value yet while `BEGIN` runs, so a template that wants the
# file name cannot be expanded there.  Instead the header is emitted from a
# regular rule, guarded by the state variable `begun` so that it is printed
# once rather than again for every new file.
!begun {
	# Seed the template-variable array with the document's title.
	TV["TITLE"] = get_title()
	# One header serves the whole run: when several files are passed to
	# DOC AWK their output is concatenated into a single document anyway.
	file_print(HEADER)
}

# `doc.awk` is multi-file aware: `FNR` restarts at 1 for each input file, so
# this rule runs on the first line of every file.  It records that input has
# begun and removes a shebang line if there is one, because you probably don't
# want that in the output.
#
# Any other first line is *not* consumed here.  It falls through to the normal
# classification logic below, so it has its comment delimiter stripped and
# triggers a buffer flush exactly like every other line.  (Buffering it here
# directly would leave the delimiter in the text and could emit text and code
# out of order across a file boundary.)
#
# It wouldn't be a *bad* idea to make a heuristic for determining the type of
# source file we're converting here.
FNR == 1 {
	begun = 1
	if ($0 ~ /^#!/) {
		next
	}
}

# The main logic is quite simple.  Every line is classified with the comment
# regex (`DOCAWK_COMMENT`): a match means the line is commentary and belongs to
# the text buffer; anything else is code.  Lines accumulate in the buffer for
# their kind, and whenever the kind switches the other buffer is printed and
# reset.
{
	if ($0 ~ COMMENT) {
		lt = "text"
		# Flush any pending code, then drop the comment delimiter so
		# only the prose itself is buffered.
		bufprint("code")
		sub(COMMENT, "", $0)
	} else {
		lt = "code"
		# Flush any pending commentary before collecting code.
		bufprint("text")
	}
	bufadd(lt)
}

# Of course, at the end there might be something in either buffer, so print that
# out too.  I've decided to put text last for the possibility of ending commentary.
#
# If no input record was ever read, `begun` is still unset and the header was
# never printed.  Print it here so that the footer is not emitted on its own.
END {
	if (!begun) {
		TV["TITLE"] = get_title()
		file_print(HEADER)
	}
	bufprint("code")
	bufprint("text")
	file_print(FOOTER)
}


# Functions
# ---------
#
# *bufadd*: Append STR, followed by a newline, to the buffer for TYPE.  When STR
# is omitted (or otherwise false-valued), the input record `$0` is used instead.
function bufadd(type, str)
{
	if (!str) {
		str = $0
	}
	buf[type] = buf[type] str "\n"
}

# *bufprint*: Print the buffer of TYPE and reset it.  The buffer is trimmed
# first; if nothing is left, nothing is printed.  Code is wrapped in a little
# HTML code block and piped through `CODEPROC`; text is piped through
# `TEXTPROC`.  Each pipe is closed after use, so every block gets its own run
# of the processor.
#
# I could maybe have a DOCAWK_CODE_PRE/POST and maybe even one for text too, to
# make it more extensible (to other markup languages, for example).
function bufprint(type)
{
	buf[type] = trim(buf[type])
	if (!buf[type]) {
		return
	}
	if (type == "code") {
		printf "<pre><code>"
		# The buffer holds arbitrary source code, so it must be passed
		# as an argument to "%s" and never used as the format itself:
		# any `%` in the code would be read as a conversion.
		printf("%s", buf[type]) | CODEPROC
		close(CODEPROC)
		print "</code></pre>"
	} else if (type == "text") {
		print(buf[type]) | TEXTPROC
		close(TEXTPROC)
	}
	buf[type] = ""
}

# *file_print*: Print FILE line-by-line, running each line through
# `template_expand`.  Does nothing when FILE is empty.  The `> 0` check ensures
# that the loop bails on error (-1) as well as at end of file (0).
#
# `line` is declared as an extra parameter so that it is local to the function
# instead of leaking into the global namespace; callers pass only FILE.
function file_print(file,    line)
{
	if (!file) {
		return
	}
	while ((getline line < file) > 0) {
		print template_expand(line)
	}
	close(file)
}

# *get_title*: get the title of the current script, for the expanded document.
# If `DOCAWK_TITLE` is set in the environment, or the awk variable `TITLE` is
# set, use that; otherwise fall back to the basename of `FILENAME`.
#
# `title` is declared as an extra parameter so that it is local to the function
# instead of leaking into the global namespace; callers pass no arguments.
function get_title(    title)
{
	title = getenv("DOCAWK_TITLE", TITLE)
	if (!title) {
		title = FILENAME
		# Strip everything up to and including the last slash.
		sub(/.*\//, "", title)
	}
	return title
}

# *getenv*: a convenience function for pulling values out of the environment.
# The lookup order is: the environment variable ENV if it has a true value;
# then VAR, if it is set (i.e., via `doc.awk -v var=foo`); and finally the
# default value DEF.
function getenv(env, var, def)
{
	if (ENVIRON[env]) {
		return ENVIRON[env]
	}
	if (var) {
		return var
	}
	return def
}

# *template_expand*: expand templates of the form `@@template@@` in the text.
# Currently it only does variables, and works by line.  Every occurrence on the
# line is expanded, left to right.  A key with no entry in `TV` expands to the
# empty string, and substituted values are not scanned again for templates.
#
# Due to the way awk works, template variables need to live in their own special
# array, `TV`.  I'd love it if awk had some kind of `eval` functionality, but at
# least POSIX awk doesn't.
#
# `out` and `key` are declared as extra parameters so that they are local to
# the function; callers pass only TEXT.
function template_expand(text,    out, key)
{
	out = ""
	while (match(text, /@@[^@]*@@/)) {
		# The key is the match minus its two-character delimiters.
		key = substr(text, RSTART + 2, RLENGTH - 4)
		out = out substr(text, 1, RSTART - 1) TV[key]
		# Continue after the match, so the value just inserted is never
		# re-examined and the loop always advances.
		text = substr(text, RSTART + RLENGTH)
	}
	return out text
}

# *trim*: strip leading and trailing spaces and newlines from a string and
# return the result.  Tabs are left alone.
function trim(str)
{
	# Leading run first, then the trailing run.
	sub(/^[ \n]+/, "", str)
	sub(/[ \n]+$/, "", str)
	return str
}