#!/bin/awk -f
# DOC AWK
# ======
#
# A quick-and-dirty literate-programming-style documentation generator
# inspired by [docco][].
#
# by Case Duckworth <acdw@acdw.net>
#
# Source available under the [Good Choices License][gcl].
#
# [gcl]: https://acdw.casa/gcl Good Choices License
#
# There's a lot of quick-and-dirty "literate programming tools" out there, many
# of which were inspired by, and also borrowed from, docco.  I was particularly
# interested in [shocco][], written in POSIX shell (of which I am a fan).
#
# Notably missing, however, was a converter of some kind written in AWK.  Thus,
# DOC AWK was born.
#
# This page is the result of DOC AWK working on itself.  Not bad for < 250 lines
# including commentary!  You can pick up the raw source code of doc.awk [in its
# git repository][git] to use it yourself.
#
# [docco]: https://ashkenas.com/docco/
# [shocco]: https://rtomayko.github.io/shocco/
# [git]: https://git.acdw.net/docawk
#
# Code
# ----
BEGIN {
	# Configuration lives in a handful of globals.  Each one is resolved
	# by the `getenv` helper defined further down: the environment variable
	# is consulted first, then the awk variable of the same name (as set
	# with `-v`), and finally the default given here.
	#
	# `COMMENT` is the regex that recognizes a comment *line* (as opposed
	# to an inline comment).  The default suits awk, shell, and any other
	# language that uses `#` as its comment delimiter; override it to
	# document other languages.
	COMMENT  = getenv("DOCAWK_COMMENT",  COMMENT,  "^[ \t]*#+[ \t]*")
	# `TEXTPROC` is the command that text blocks are piped through.  Any
	# text processor will do; the default is the vendored `mdown.awk`
	# script in this repo, which comes from
	# [d.awk](https://github.com/wernsey/d.awk).
	TEXTPROC = getenv("DOCAWK_TEXTPROC", TEXTPROC, "./mdown.awk")
	# `CODEPROC` is the equivalent command for code blocks.  The included
	# `htmlsafe.awk` simply escapes <, &, and >.
	CODEPROC = getenv("DOCAWK_CODEPROC", CODEPROC, "./htmlsafe.awk")
	# `HEADER` and `FOOTER` name the files printed before and after the
	# converted source.  They default to the included header.html and
	# footer.html, since the default output type is html.
	#
	# Each of these files is actually a *template*: keys written as
	# `@@VARIABLE@@` are expanded to variables.  This is mostly used for
	# title expansion.
	HEADER   = getenv("DOCAWK_HEADER",   HEADER,   "./header.html")
	FOOTER   = getenv("DOCAWK_FOOTER",   FOOTER,   "./footer.html")
}

# Because `FILENAME` is unset during `BEGIN`, template expansion that attempts
# to view the filename doesn't work.  Thus, I need a state variable to track
# whether we've started or not (so that I don't print a header with every new
# file).
!begun {
	# Seed the template array with the document's title, which can only be
	# worked out now that `FILENAME` is available.
	TV["TITLE"] = get_title()
	# Emit the header exactly once: when several files are passed to
	# DOC AWK their output is concatenated into a single document.
	file_print(HEADER)
}

# `doc.awk` is multi-file aware.  It also removes the shebang line from the
# script if it exists, because you probably don't want that in the output.
#
# It wouldn't be a *bad* idea to make a heuristic for determining the type of
# source file we're converting here.
FNR == 1 {
	# Reaching the first line of any file means the header has already
	# been printed, so make sure it is not printed again.
	begun = 1
	# Drop a shebang line outright.  Every other first line deliberately
	# falls through to the ordinary rules below, so that it is classified,
	# has its comment delimiter stripped, and flushes the opposite buffer
	# exactly like any other line.  Handling it separately here used to
	# leave the delimiter in the text and could print a new file's leading
	# commentary ahead of the previous file's trailing code.
	if ($0 ~ /^#!/) {
		next
	}
}

# The main logic is quite simple: if a given line is a comment as defined by
# `DOCAWK_COMMENT`, it's in a text block and should be treated as such;
# otherwise, it's in a code block.  Accumulate each part in a dedicated buffer,
# and on a switch-over between code and text, print the buffer and reset.
{
	if ($0 ~ COMMENT) {
		# A comment line belongs to a text block: flush any pending
		# code first, then strip the comment delimiter so that only the
		# commentary itself is buffered.
		lt = "text"
		bufprint("code")
		sub(COMMENT, "", $0)
	} else {
		# Anything else belongs to a code block: flush any pending
		# text first.
		lt = "code"
		bufprint("text")
	}
	# Either way, the (possibly stripped) record joins the buffer for its
	# line type.
	bufadd(lt)
}

# Of course, at the end there might be something in either buffer, so print that
# out too.  I've decided to put text last for the possibility of ending commentary.
END {
	# With no input records at all, the `! begun` rule never fires and the
	# header is never printed.  Print it here in that case, so that the
	# footer below is not emitted on its own.
	if (! begun) {
		TV["TITLE"] = get_title()
		file_print(HEADER)
	}
	# Flush whatever is left over: code first, then text, to allow for
	# ending commentary.
	bufprint("code")
	bufprint("text")
	file_print(FOOTER)
}


# Functions
# ---------
#
# *bufadd*: Add a STR to buffer TYPE.  STR defaults to $0, the input record.
function bufadd(type, str)
{
	# An omitted (or empty) STR means "use the input record".  Compare
	# against the empty string instead of testing truthiness, so that a STR
	# of 0 is appended as given rather than being replaced by $0.
	buf[type] = buf[type] (str == "" ? $0 : str) "\n"
}

# *bufprint*: Print a buffer of TYPE.  Automatically wrap the code blocks in a
# little HTML code block.  I could maybe have a DOCAWK_CODE_PRE/POST and maybe
# even one for text too, to make it more extensible (to other markup languages,
# for example).
function bufprint(type,    body)
{
	if (type == "code") {
		# For code, drop blank lines before the block and whitespace
		# after it, but keep the indentation of the first code line.
		body = buf[type]
		sub(/^([ \t]*\n)+/, "", body)
		sub(/[ \t\n]+$/, "", body)
	} else {
		body = trim(buf[type])
	}
	# The buffer is consumed whether or not anything gets printed.
	buf[type] = ""
	if (body == "") {
		return
	}
	if (type == "code") {
		printf "<pre><code>"
		# Pass the code as an argument to an explicit "%s" format.
		# Using it as the format string itself would make awk interpret
		# any `%` in the source being documented.
		printf "%s", body | CODEPROC
		# Close the pipe so the processor finishes before the closing
		# tags are printed.
		close(CODEPROC)
		print "</code></pre>"
	} else if (type == "text") {
		print body | TEXTPROC
		close(TEXTPROC)
	}
}

# *file_print*: Print FILE line-by-line.  The `> 0` check here ensures that it
# bails on error (-1).
function file_print(file,    line)
{
	# `line` is declared as an extra parameter so that it is local to this
	# function instead of leaking into the global namespace.
	#
	# An empty FILE means there is nothing to print.
	if (! file) {
		return
	}
	# getline returns 1 per line read, 0 at end of file and -1 on error,
	# so an unreadable file simply prints nothing.
	while ((getline line < file) > 0) {
		print template_expand(line)
	}
	close(file)
}

# *get_title*: get the title of the current script, for the expanded document.
# If variables are set, use those; otherwise try to figure out the title from
# the document's basename.
function get_title(    title)
{
	# `title` is declared as an extra parameter so that it stays local.
	#
	# Prefer an explicit title from `DOCAWK_TITLE` or the `TITLE` variable.
	title = getenv("DOCAWK_TITLE", TITLE)
	# Fall back to the basename of the current file only when no title was
	# given at all; comparing against "" keeps a title such as "0".
	if (title == "") {
		title = FILENAME
		sub(/.*\//, "", title)
	}
	return title
}

# *getenv*: a convenience function for pulling values out of the environment.
# If the environment variable ENV isn't set, test whether VAR is set (e.g., via
# `doc.awk -v var=foo`) and return it if so.  Otherwise, return the default
# value DEF.
function getenv(env, var, def)
{
	# Values from the environment and from `-v` assignments are numeric
	# strings when they look like numbers, so a plain truthiness test would
	# treat "0" as unset.  Compare against the empty string instead.  The
	# `in` test also avoids creating an ENVIRON element as a side effect of
	# the lookup.
	if ((env in ENVIRON) && ENVIRON[env] != "") {
		return ENVIRON[env]
	}
	if (var != "") {
		return var
	}
	return def
}

# *template_expand*: expand templates of the form `@@template@@` in the text.
# Currently it only does variables, and works by line.
#
# Due to the way awk works, template variables need to live in their own special
# array, `TV`.  I'd love it if awk had some kind of `eval` functionality, but at
# least POSIX awk doesn't.
function template_expand(text,    out, key)
{
	# `out` and `key` are declared as extra parameters so that they stay
	# local to this function.
	out = ""
	# Expand every `@@KEY@@` on the line, not just the first.  Each pass
	# moves the text before the placeholder plus its value into `out` and
	# continues with the remainder, so substituted values are never
	# rescanned and the loop always terminates.
	while (match(text, /@@[^@]*@@/)) {
		key = substr(text, RSTART + 2, RLENGTH - 4)
		out = out substr(text, 1, RSTART - 1) TV[key]
		text = substr(text, RSTART + RLENGTH)
	}
	# Whatever is left contains no further placeholders.
	return out text
}

# *trim*: remove whitespace from either end of a string.
function trim(str)
{
	# Strip the trailing run of spaces and newlines first, then the leading
	# one.  The two substitutions are independent of each other.  Tabs are
	# left alone.
	sub(/[ \n]*$/, "", str)
	sub(/^[ \n]*/, "", str)
	return str
}