transform_uri.sh


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157

#!/usr/bin/env bash
# transform-url
# cf. https://tools.ietf.org/html/rfc3986#section-5 and
# cf. https://tools.ietf.org/html/rfc3986#section-5.1
# cf. also https://tools.ietf.org/html/rfc3986#appendix-B -- regex

# TEST WITH https://tools.ietf.org/html/rfc3986#section-5.4

transform_resource() { # 5.2.2
	# Resolve a URI reference against a base URI (RFC 3986, section 5.2.2)
	# and print the recomposed target URI (section 5.3).
	#
	# Arguments:
	#   $1 - base URI
	#   $2 - URI reference (may be relative, may be empty)
	# Outputs:
	#   the target URI on stdout, followed by a newline
	#
	# parse_url only assigns the components that are non-empty, so in this
	# function "defined" and "non-empty" are the same thing; every test
	# below is therefore a plain non-empty test with an explicit default.
	local -A R=() B=() T=() # reference, base url, target

	# XXX CHANGE -- the parsed components are brought in through eval, so
	# this is only as safe as the quoting done by parse_url.  Do not feed
	# untrusted URLs through here unless parse_url shell-quotes its values.
	eval "$(parse_url R "${2-}")"
	eval "$(parse_url B "${1-}")"

	# Basically following the pseudocode in the spec.
	if [[ -n "${R[scheme]-}" ]]; then
		# The reference is absolute: only normalise its path.
		T[scheme]="${R[scheme]}"
		T[authority]="${R[authority]-}"
		T[path]="$(remove_dot_segments "${R[path]-}")"
		T[query]="${R[query]-}"
	elif [[ -n "${R[authority]-}" ]]; then
		# Network-path reference: inherit the scheme only.
		T[scheme]="${B[scheme]-}"
		T[authority]="${R[authority]}"
		T[path]="$(remove_dot_segments "${R[path]-}")"
		T[query]="${R[query]-}"
	else
		T[scheme]="${B[scheme]-}"
		T[authority]="${B[authority]-}"
		if [[ -z "${R[path]-}" ]]; then
			# Empty path: keep the base path, and keep the base query
			# too unless the reference brings its own.
			T[path]="${B[path]-}"
			if [[ -n "${R[query]-}" ]]; then
				T[query]="${R[query]}"
			else
				T[query]="${B[query]-}"
			fi
		else
			if [[ "${R[path]}" == /* ]]; then
				T[path]="$(remove_dot_segments "${R[path]}")"
			else
				# "?" tells merge that the base has no authority.
				T[path]="$(merge "${B[authority]-?}" \
					"${B[path]-}" "${R[path]}")"
				T[path]="$(remove_dot_segments "${T[path]}")"
			fi
			T[query]="${R[query]-}"
		fi
	fi
	# The fragment always comes from the reference.
	T[fragment]="${R[fragment]-}"

	# 5.3 -- recomposition
	local r=""
	if [[ -n "${T[scheme]}" ]]; then
		r+="${T[scheme]}:"
	fi
	if [[ -n "${T[authority]}" ]]; then
		r+="//${T[authority]}"
	fi
	r+="${T[path]}"
	if [[ -n "${T[query]}" ]]; then
		r+="?${T[query]}"
	fi
	if [[ -n "${T[fragment]}" ]]; then
		r+="#${T[fragment]}"
	fi
	printf '%s\n' "$r"
}

merge() { # 5.2.3
	# Merge a relative-path reference with the path of the base URI.
	#
	#>If the base URI has a defined authority component and an empty
	#>path, then return a string consisting of "/" concatenated with the
	#>reference's path; otherwise,
	#>return a string consisting of the reference's path component
	#>appended to all but the last segment of the base URI's path (i.e.,
	#>excluding any characters after the right-most "/" in the base URI
	#>path, or excluding the entire base URI path if it does not contain
	#>any "/" characters).
	#
	# Arguments:
	#   $1 - base authority; "?" (or empty) means "undefined"
	#   $2 - base path
	#   $3 - reference path
	# Outputs:
	#   the merged path on stdout, followed by a newline.  The path is
	#   printed verbatim (%s): it is data, not shell code, so it must not
	#   be shell-quoted.
	local B_authority="${1-}" # if ? is here, it means undefined (see caller)
	local B_path="${2-}"
	local R_path="${3-}"

	if [[ -z "$R_path" ]]; then
		# Not part of the RFC text: with nothing to merge, hand back the
		# base path with doubled slashes collapsed.
		B_path=${B_path//\/\///}
		printf '%s\n' "$B_path"
		return
	fi

	if [[ "${B_authority:-?}" != "?" && -z "$B_path" ]]; then
		printf '/%s\n' "$R_path"
	else
		if [[ "$B_path" == */* ]]; then
			# Keep everything up to and including the right-most "/".
			B_path="${B_path%/*}/"
		else
			B_path=""
		fi
		# B_path already ends in "/" (or is empty): no extra separator.
		printf '%s%s\n' "$B_path" "$R_path"
	fi
}

# I can probably just use normalize_path already in bollux here
remove_dot_segments() { # 5.2.4
	# Remove the "." and ".." segments from a path, following the
	# input-buffer/output-buffer algorithm of RFC 3986, section 5.2.4.
	#
	# Arguments:
	#   $1 - the path to normalise
	# Outputs:
	#   the normalised path on stdout, followed by a newline
	local input="${1-}"
	local output=""
	local lead rest segment
	while [[ -n "$input" ]]; do
		case "$input" in
		../*) # A. drop a leading "../" ...
			input="${input#../}"
			;;
		./*) # A. ... or a leading "./"
			input="${input#./}"
			;;
		/./*) # B. "/./" => "/"
			input="/${input#/./}"
			;;
		/.) # B. a final "/." => "/"
			input="/"
			;;
		/../* | /..) # C. "/../" or a final "/.." => "/", and pop the output
			if [[ "$input" == /.. ]]; then
				input="/"
			else
				input="/${input#/../}"
			fi
			# Remove the last output segment and its preceding "/", if
			# any; with no "/" left the whole buffer is that segment.
			if [[ "$output" == */* ]]; then
				output="${output%/*}"
			else
				output=""
			fi
			;;
		. | ..) # D. the input is nothing but "." or ".."
			input=""
			;;
		*)
			# E. move the first path segment in the input buffer to the
			# end of the output buffer, including the initial "/"
			# character (if any) and any subsequent characters up to, but
			# not including, the next "/" character or the end of the
			# input buffer.
			if [[ "$input" == /* ]]; then
				lead="/"
				rest="${input#/}"
			else
				lead=""
				rest="$input"
			fi
			segment="${rest%%/*}"
			output+="$lead$segment"
			input="${rest#"$segment"}"
			;;
		esac
	done
	# Collapse doubled slashes, as this function has always done for its
	# callers.  XXX strictly speaking RFC 3986 treats empty segments as
	# significant, so this goes beyond the spec.
	output=${output//\/\///}
	printf '%s\n' "$output"
}

# *FINDING* URLS ... IN PURE BASH !!!
parse_url() { # eval "$(parse_url NAME STRING)" => NAME[...]
	# Split STRING into its URI components with the regex from RFC 3986,
	# appendix B, and print one shell assignment per NON-EMPTY component:
	#     NAME[scheme]=...  NAME[authority]=...  NAME[path]=...
	#     NAME[query]=...   NAME[fragment]=...
	# The output is meant to be eval'ed by the caller, with NAME being an
	# associative array the caller has already declared.
	#
	# Arguments:
	#   $1 - name of the caller's associative array (printed as-is, so it
	#        must be a plain identifier)
	#   $2 - the URL to parse
	# Returns:
	#   0 on success; the status of the regex match if it fails
	local name="${1-}"
	local string="${2-}"
	local re='^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?'
	[[ $string =~ $re ]] || return $?

	local scheme="${BASH_REMATCH[2]}"
	local authority="${BASH_REMATCH[4]}"
	local path="${BASH_REMATCH[5]}"
	local query="${BASH_REMATCH[7]}"
	local fragment="${BASH_REMATCH[9]}"

	local c
	for c in scheme authority path query fragment; do
		if [[ -n "${!c}" ]]; then
			# %q shell-quotes the value, so that whatever the URL contains
			# ($, backticks, spaces, quotes, ...) the eval in the caller
			# sees one literal word and never runs any of it.
			printf '%s[%s]=%q\n' "$name" "$c" "${!c}"
		fi
	done
	# An empty trailing component is not an error.
	return 0
}

# ease-of-life functions
isdefined() { # isdefined NAME => succeeds iff the variable called NAME is set
	# "${!1+x}" expands to "x" when the variable named by $1 is set (even to
	# the empty string) and to nothing when it is unset.
	if [[ -n "${!1+x}" ]]; then
		return 0
	fi
	return 1
}
isempty() { # isempty NAME => succeeds iff NAME is set AND is the empty string
	# "${!1-x}" falls back to "x" only when the variable named by $1 is
	# unset, so an unset variable does not count as empty.
	if [[ -z "${!1-x}" ]]; then
		return 0
	fi
	return 1
}

# Entry point: switch on command tracing (debugging aid), then resolve the
# reference given as second argument against the base given as first.
set -o xtrace
transform_resource "$@"

# NEXT ....
# NORMALIZATION !!!