From c62e428c168c151ac852df6f578e3349d4c831e3 Mon Sep 17 00:00:00 2001
From: Case Duckworth
Date: Sat, 30 May 2020 13:38:18 -0500
Subject: Bring URL transforming up to spec

URL transformation (formerly called "munging") is now, as far as I can
tell, fully compliant with the reference-resolution algorithm of RFC 3986
(section 5). It's also implemented in pure bash and is available as a
library at https://git.sr.ht/~acdw/shurlie.
---
 bollux | 198 +++++++++++++++++++++++++++++++++++++++--------------------------
 1 file changed, 118 insertions(+), 80 deletions(-)

diff --git a/bollux b/bollux
index a74db7d..0686c21 100755
--- a/bollux
+++ b/bollux
@@ -122,7 +122,7 @@ blastoff() { # load a url
 	URL="$1"
 
 	if $well_formed && [[ "$1" != "$BOLLUX_URL" ]]; then
-		URL="$(run munge_url "$1" "$BOLLUX_URL")"
+		URL="$(run transform_resource "$BOLLUX_URL" "$1")"
 	fi
 	[[ "$URL" != *://* ]] && URL="$BOLLUX_PROTO://$URL"
 	URL="$(trim <<<"$URL")"
@@ -134,95 +134,133 @@ blastoff() { # load a url
 		run handle_response "$URL"
 }
 
-munge_url() {
-	local -A new old u
-	eval "$(split_url new <<<"$1")"
-	for k in "${!new[@]}"; do log d "new[$k]=${new[$k]}"; done
-	eval "$(split_url old <<<"$2")"
-	for k in "${!old[@]}"; do log d "old[$k]=${old[$k]}"; done
-
-	u['scheme']="${new['scheme']:-${old['scheme']:-}}"
-	u['authority']="${new['authority']:-${old['authority']:-}}"
-	# XXX this whole path thing is wack
-	if [[ "${new['path']+isset}" ]]; then
-		log d 'new path set'
-		if [[ "${new['path']}" == /* ]]; then
-			log d 'new path == /*'
-			u['path']="${new['path']}"
-		elif [[ "${new['authority']}" == "${old['authority']}" || ! "${new['authority']+isset}" ]]; then
-			p="${old['path']:-}/${new['path']}"
-			log d "$p ( $(normalize_path <<<"$p") )"
-			u['path']="$(normalize_path <<<"$p")"
+transform_resource() { # transform_resource BASE_URL REFERENCE_URL
+	declare -A R B T # reference, base url, target
+	eval "$(parse_url B "$1")"
+	eval "$(parse_url R "$2")"
+	# A non-strict parser may ignore a scheme in the reference
+	# if it is identical to the base URI's scheme.
+	if ! "${STRICT:-true}" && [[ "${R[scheme]}" == "${B[scheme]}" ]]; then
+		unset "${R[scheme]}"
+	fi
+
+	# basically pseudo-code from spec ported to bash
+	if isdefined "R[scheme]"; then
+		T[scheme]="${R[scheme]}"
+		isdefined "R[authority]" && T[authority]="${R[authority]}"
+		isdefined R[path] &&
+			T[path]="$(remove_dot_segments "${R[path]}")"
+		isdefined "R[query]" && T[query]="${R[query]}"
+	else
+		if isdefined "R[authority]"; then
+			T[authority]="${R[authority]}"
+			isdefined "R[authority]" &&
+				T[path]="$(remove_dot_segments "${R[path]}")"
+			isdefined R[query] && T[query]="${R[query]}"
 		else
-			log d 'u path = new path'
-			u['path']="${new['path']}"
+			if isempty "R[path]"; then
+				T[path]="${B[path]}"
+				if isdefined R[query]; then
+					T[query]="${R[query]}"
+				else
+					T[query]="${B[query]}"
+				fi
+			else
+				if [[ "${R[path]}" == /* ]]; then
+					T[path]="$(remove_dot_segments "${R[path]}")"
+				else
+					T[path]="$(merge_paths "B[authority]" "${B[path]}" "${R[path]}")"
+					T[path]="$(remove_dot_segments "${T[path]}")"
+				fi
+				isdefined R[query] && T[query]="${R[query]}"
+			fi
+			T[authority]="${B[authority]}"
 		fi
-	elif [[ "${new['query']+isset}" || "${new['fragment']+isset}" ]]; then
-		log d 'u path = old path'
-		u['path']="${old['path']}"
-	else
-		u['path']="/"
+		T[scheme]="${B[scheme]}"
+	fi
+	isdefined R[fragment] && T[fragment]="${R[fragment]}"
+	# cf. 5.3 -- recomposition
+	local r=""
+	isdefined "T[scheme]" && r="$r${T[scheme]}:"
+	isdefined "T[authority]" && r="$r//${T[authority]}"
+	r="$r${T[path]}"
+	isdefined T[query] && r="$r?${T[query]}"
+	isdefined T[fragment] && r="$r#${T[fragment]}"
+	printf '%s\n' "$r"
+}
+
+merge_paths() { # 5.2.3
+	# shellcheck disable=2034
+	B_authority="$1"
+	B_path="$2"
+	R_path="$3"
+	# if R_path is empty, get rid of // in B_path
+	if [[ -z "$R_path" ]]; then
+		printf '%s\n' "${B_path//\/\//\//}"
+		return
 	fi
-	u['query']="${new['query']:-}"
-	u['fragment']="${new['fragment']:-}"
-	for k in "${!u[@]}"; do log d "u[$k]=${u[$k]}"; done
 
-	run printf '%s%s%s%s%s\n' \
-		"${u['scheme']}" "${u['authority']}" "${u['path']}" \
-		"${u['query']}" "${u['fragment']}"
+	if isdefined "B_authority" && isempty "B_path"; then
+		printf '/%s\n' "${R_path//\/\//\//}"
+	else
+		if [[ "$B_path" == */* ]]; then
+			B_path="${B_path%/*}/"
+		else
+			B_path=""
+		fi
+		printf '%s/%s\n' "${B_path%/}" "${R_path#/}"
+	fi
 }
 
-normalize_path() {
-	gawk '{
-	split($0, path, /\//)
-	for (c in path) {
-		if (path[c] == "" || path[c] == ".") {
-			continue
-		}
-		if (path[c] == "..") {
-			sub(/[^\/]+$/, "", ret)
-			continue
-		}
-		if (! ret || match(ret, /\/$/)) {
-			slash = ""
-		} else {
-			slash = "/"
-		}
-		ret = ret slash path[c]
-	}
-	print (ret ~ /^\// ? "" : "/") ret
-	}'
+remove_dot_segments() { # 5.2.4
+	local input="$1"
+	local output=
+	# ^/\.(/|$) - BASH_REMATCH[0]
+	while [[ "$input" ]]; do
+		if [[ "$input" =~ ^\.\.?/ ]]; then
+			input="${input#${BASH_REMATCH[0]}}"
+		elif [[ "$input" =~ ^/\.(/|$) ]]; then
+			input="/${input#${BASH_REMATCH[0]}}"
+		elif [[ "$input" =~ ^/\.\.(/|$) ]]; then
+			input="/${input#${BASH_REMATCH[0]}}"
+			[[ "$output" =~ /?[^/]+$ ]]
+			output="${output%${BASH_REMATCH[0]}}"
+		elif [[ "$input" == . || "$input" == .. ]]; then
+			input=
+		else
+			[[ $input =~ ^(/?[^/]*)(/?.*)$ ]] || echo NOMATCH >&2
+			output="$output${BASH_REMATCH[1]}"
+			input="${BASH_REMATCH[2]}"
+		fi
+	done
+	printf '%s\n' "${output//\/\//\//}"
 }
 
-split_url() {
-	gawk -vvar="$1" '{
-	if (match($0, /^[A-Za-z]+:/)) {
-		arr["scheme"] = substr($0, RSTART, RLENGTH)
-		$0 = substr($0, RLENGTH + 1)
-	}
-	if (match($0, /^\/\/[^\/?#]+?/) || (match($0, /^[^\/?#]+?/) && scheme)) {
-	arr["authority"] = substr($0, RSTART, RLENGTH)
-		$0 = substr($0, RLENGTH + 1)
-	}
-	if (match($0, /^\/?[^?#]+/)) {
-		arr["path"] = substr($0, RSTART, RLENGTH)
-		$0 = substr($0, RLENGTH + 1)
-	}
-	if (match($0, /^\?[^#]+/)) {
-		arr["query"] = substr($0, RSTART, RLENGTH)
-		$0 = substr($0, RLENGTH + 1)
-	}
-	if (match($0, /^#.*/)) {
-		arr["fragment"] = substr($0, RSTART, RLENGTH)
-		$0 = substr($0, RLENGTH + 1)
-	}
-	for (part in arr) {
-		sub(/[[:space:]]+$/, "", arr[part])
-		printf var "[\"%s\"]=\"%s\"\n", part, arr[part]
-	}
-	}'
+parse_url() { # eval "$(parse_url NAME STRING)" => NAME[...]
+	local name="$1"
+	local string="$2"
+	local re='^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?'
+	[[ $string =~ $re ]] || return $?
+
+	local scheme="${BASH_REMATCH[2]}"
+	local authority="${BASH_REMATCH[4]}"
+	local path="${BASH_REMATCH[5]}"
+	local query="${BASH_REMATCH[7]}"
+	local fragment="${BASH_REMATCH[9]}"
+
+	for c in scheme authority query fragment; do
+		[[ "${!c}" ]] &&
+			printf '%s[%s]=%q\n' "$name" "$c" "${!c}"
+	done
+	# the path group ([^?#]*) is not optional, so path is always set, even when empty
+	printf '%s[path]=%q\n' "$name" "$path"
 }
 
+# is a NAME defined ('set' in bash)?
+isdefined() { [[ "${!1+x}" ]]; } # isdefined NAME
+# is a NAME defined AND empty?
+isempty() { [[ ! "${!1-x}" ]]; } # isempty NAME
+
 request_url() {
 	local server="$1"
 	local port="$2"
-- 
cgit 1.4.1-21-gabe81