diff options
author | Case Duckworth | 2020-05-30 13:38:18 -0500 |
---|---|---|
committer | Case Duckworth | 2020-05-30 13:38:18 -0500 |
commit | c62e428c168c151ac852df6f578e3349d4c831e3 (patch) | |
tree | d7e2caad96d48ac1d878ed2b4437306250dd0804 | |
parent | Backup (diff) | |
download | bollux-c62e428c168c151ac852df6f578e3349d4c831e3.tar.gz bollux-c62e428c168c151ac852df6f578e3349d4c831e3.zip |
Bring URL transforming up to spec
URL transformation (formerly called "munging") is now, as far as I can tell, fully compliant with RFC 3986 (reference resolution, section 5.2). It is also implemented in pure bash and is available as a library at https://git.sr.ht/~acdw/shurlie.
-rwxr-xr-x | bollux | 198 |
1 files changed, 118 insertions, 80 deletions
diff --git a/bollux b/bollux index a74db7d..0686c21 100755 --- a/bollux +++ b/bollux | |||
@@ -122,7 +122,7 @@ blastoff() { # load a url | |||
122 | URL="$1" | 122 | URL="$1" |
123 | 123 | ||
124 | if $well_formed && [[ "$1" != "$BOLLUX_URL" ]]; then | 124 | if $well_formed && [[ "$1" != "$BOLLUX_URL" ]]; then |
125 | URL="$(run munge_url "$1" "$BOLLUX_URL")" | 125 | URL="$(run transform_resource "$BOLLUX_URL" "$1")" |
126 | fi | 126 | fi |
127 | [[ "$URL" != *://* ]] && URL="$BOLLUX_PROTO://$URL" | 127 | [[ "$URL" != *://* ]] && URL="$BOLLUX_PROTO://$URL" |
128 | URL="$(trim <<<"$URL")" | 128 | URL="$(trim <<<"$URL")" |
@@ -134,95 +134,133 @@ blastoff() { # load a url | |||
134 | run handle_response "$URL" | 134 | run handle_response "$URL" |
135 | } | 135 | } |
136 | 136 | ||
137 | munge_url() { | 137 | transform_resource() { # transform_resource BASE_URL REFERENCE_URL |
138 | local -A new old u | 138 | declare -A R B T # reference, base url, target |
139 | eval "$(split_url new <<<"$1")" | 139 | eval "$(parse_url B "$1")" |
140 | for k in "${!new[@]}"; do log d "new[$k]=${new[$k]}"; done | 140 | eval "$(parse_url R "$2")" |
141 | eval "$(split_url old <<<"$2")" | 141 | # A non-strict parser may ignore a scheme in the reference |
142 | for k in "${!old[@]}"; do log d "old[$k]=${old[$k]}"; done | 142 | # if it is identical to the base URI's scheme. |
143 | 143 | if ! "${STRICT:-true}" && [[ "${R[scheme]}" == "${B[scheme]}" ]]; then | |
144 | u['scheme']="${new['scheme']:-${old['scheme']:-}}" | 144 | unset "${R[scheme]}" |
145 | u['authority']="${new['authority']:-${old['authority']:-}}" | 145 | fi |
146 | # XXX this whole path thing is wack | 146 | |
147 | if [[ "${new['path']+isset}" ]]; then | 147 | # basically pseudo-code from spec ported to bash |
148 | log d 'new path set' | 148 | if isdefined "R[scheme]"; then |
149 | if [[ "${new['path']}" == /* ]]; then | 149 | T[scheme]="${R[scheme]}" |
150 | log d 'new path == /*' | 150 | isdefined "R[authority]" && T[authority]="${R[authority]}" |
151 | u['path']="${new['path']}" | 151 | isdefined R[path] && |
152 | elif [[ "${new['authority']}" == "${old['authority']}" || ! "${new['authority']+isset}" ]]; then | 152 | T[path]="$(remove_dot_segments "${R[path]}")" |
153 | p="${old['path']:-}/${new['path']}" | 153 | isdefined "R[query]" && T[query]="${R[query]}" |
154 | log d "$p ( $(normalize_path <<<"$p") )" | 154 | else |
155 | u['path']="$(normalize_path <<<"$p")" | 155 | if isdefined "R[authority]"; then |
156 | T[authority]="${R[authority]}" | ||
157 | isdefined "R[authority]" && | ||
158 | T[path]="$(remove_dot_segments "${R[path]}")" | ||
159 | isdefined R[query] && T[query]="${R[query]}" | ||
156 | else | 160 | else |
157 | log d 'u path = new path' | 161 | if isempty "R[path]"; then |
158 | u['path']="${new['path']}" | 162 | T[path]="${B[path]}" |
163 | if isdefined R[query]; then | ||
164 | T[query]="${R[query]}" | ||
165 | else | ||
166 | T[query]="${B[query]}" | ||
167 | fi | ||
168 | else | ||
169 | if [[ "${R[path]}" == /* ]]; then | ||
170 | T[path]="$(remove_dot_segments "${R[path]}")" | ||
171 | else | ||
172 | T[path]="$(merge_paths "B[authority]" "${B[path]}" "${R[path]}")" | ||
173 | T[path]="$(remove_dot_segments "${T[path]}")" | ||
174 | fi | ||
175 | isdefined R[query] && T[query]="${R[query]}" | ||
176 | fi | ||
177 | T[authority]="${B[authority]}" | ||
159 | fi | 178 | fi |
160 | elif [[ "${new['query']+isset}" || "${new['fragment']+isset}" ]]; then | 179 | T[scheme]="${B[scheme]}" |
161 | log d 'u path = old path' | 180 | fi |
162 | u['path']="${old['path']}" | 181 | isdefined R[fragment] && T[fragment]="${R[fragment]}" |
163 | else | 182 | # cf. 5.3 -- recomposition |
164 | u['path']="/" | 183 | local r="" |
184 | isdefined "T[scheme]" && r="$r${T[scheme]}:" | ||
185 | isdefined "T[authority]" && r="$r//${T[authority]}" | ||
186 | r="$r${T[path]}" | ||
187 | isdefined T[query] && r="$r?${T[query]}" | ||
188 | isdefined T[fragment] && r="$r#${T[fragment]}" | ||
189 | printf '%s\n' "$r" | ||
190 | } | ||
191 | |||
192 | merge_paths() { # 5.2.3 | ||
193 | # shellcheck disable=2034 | ||
194 | B_authority="$1" | ||
195 | B_path="$2" | ||
196 | R_path="$3" | ||
197 | # if R_path is empty, get rid of // in B_path | ||
198 | if [[ -z "$R_path" ]]; then | ||
199 | printf '%s\n' "${B_path//\/\//\//}" | ||
200 | return | ||
165 | fi | 201 | fi |
166 | u['query']="${new['query']:-}" | ||
167 | u['fragment']="${new['fragment']:-}" | ||
168 | for k in "${!u[@]}"; do log d "u[$k]=${u[$k]}"; done | ||
169 | 202 | ||
170 | run printf '%s%s%s%s%s\n' \ | 203 | if isdefined "B_authority" && isempty "B_path"; then |
171 | "${u['scheme']}" "${u['authority']}" "${u['path']}" \ | 204 | printf '/%s\n' "${R_path//\/\//\//}" |
172 | "${u['query']}" "${u['fragment']}" | 205 | else |
206 | if [[ "$B_path" == */* ]]; then | ||
207 | B_path="${B_path%/*}/" | ||
208 | else | ||
209 | B_path="" | ||
210 | fi | ||
211 | printf '%s/%s\n' "${B_path%/}" "${R_path#/}" | ||
212 | fi | ||
173 | } | 213 | } |
174 | 214 | ||
175 | normalize_path() { | 215 | remove_dot_segments() { # 5.2.4 |
176 | gawk '{ | 216 | local input="$1" |
177 | split($0, path, /\//) | 217 | local output= |
178 | for (c in path) { | 218 | # ^/\.(/|$) - BASH_REMATCH[0] |
179 | if (path[c] == "" || path[c] == ".") { | 219 | while [[ "$input" ]]; do |
180 | continue | 220 | if [[ "$input" =~ ^\.\.?/ ]]; then |
181 | } | 221 | input="${input#${BASH_REMATCH[0]}}" |
182 | if (path[c] == "..") { | 222 | elif [[ "$input" =~ ^/\.(/|$) ]]; then |
183 | sub(/[^\/]+$/, "", ret) | 223 | input="/${input#${BASH_REMATCH[0]}}" |
184 | continue | 224 | elif [[ "$input" =~ ^/\.\.(/|$) ]]; then |
185 | } | 225 | input="/${input#${BASH_REMATCH[0]}}" |
186 | if (! ret || match(ret, /\/$/)) { | 226 | [[ "$output" =~ /?[^/]+$ ]] |
187 | slash = "" | 227 | output="${output%${BASH_REMATCH[0]}}" |
188 | } else { | 228 | elif [[ "$input" == . || "$input" == .. ]]; then |
189 | slash = "/" | 229 | input= |
190 | } | 230 | else |
191 | ret = ret slash path[c] | 231 | [[ $input =~ ^(/?[^/]*)(/?.*)$ ]] || echo NOMATCH >&2 |
192 | } | 232 | output="$output${BASH_REMATCH[1]}" |
193 | print (ret ~ /^\// ? "" : "/") ret | 233 | input="${BASH_REMATCH[2]}" |
194 | }' | 234 | fi |
235 | done | ||
236 | printf '%s\n' "${output//\/\//\//}" | ||
195 | } | 237 | } |
196 | 238 | ||
197 | split_url() { | 239 | parse_url() { # eval "$(split_url NAME STRING)" => NAME[...] |
198 | gawk -vvar="$1" '{ | 240 | local name="$1" |
199 | if (match($0, /^[A-Za-z]+:/)) { | 241 | local string="$2" |
200 | arr["scheme"] = substr($0, RSTART, RLENGTH) | 242 | local re='^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?' |
201 | $0 = substr($0, RLENGTH + 1) | 243 | [[ $string =~ $re ]] || return $? |
202 | } | 244 | |
203 | if (match($0, /^\/\/[^\/?#]+?/) || (match($0, /^[^\/?#]+?/) && scheme)) { | 245 | local scheme="${BASH_REMATCH[2]}" |
204 | arr["authority"] = substr($0, RSTART, RLENGTH) | 246 | local authority="${BASH_REMATCH[4]}" |
205 | $0 = substr($0, RLENGTH + 1) | 247 | local path="${BASH_REMATCH[5]}" |
206 | } | 248 | local query="${BASH_REMATCH[7]}" |
207 | if (match($0, /^\/?[^?#]+/)) { | 249 | local fragment="${BASH_REMATCH[9]}" |
208 | arr["path"] = substr($0, RSTART, RLENGTH) | 250 | |
209 | $0 = substr($0, RLENGTH + 1) | 251 | for c in scheme authority query fragment; do |
210 | } | 252 | [[ "${!c}" ]] && |
211 | if (match($0, /^\?[^#]+/)) { | 253 | printf '%s[%s]=%q\n' "$name" "$c" "${!c}" |
212 | arr["query"] = substr($0, RSTART, RLENGTH) | 254 | done |
213 | $0 = substr($0, RLENGTH + 1) | 255 | # unclear if the path is always set even if empty but it looks that way |
214 | } | 256 | printf '%s[path]=%q\n' "$name" "$path" |
215 | if (match($0, /^#.*/)) { | ||
216 | arr["fragment"] = substr($0, RSTART, RLENGTH) | ||
217 | $0 = substr($0, RLENGTH + 1) | ||
218 | } | ||
219 | for (part in arr) { | ||
220 | sub(/[[:space:]]+$/, "", arr[part]) | ||
221 | printf var "[\"%s\"]=\"%s\"\n", part, arr[part] | ||
222 | } | ||
223 | }' | ||
224 | } | 257 | } |
225 | 258 | ||
259 | # is a NAME defined ('set' in bash)? | ||
260 | isdefined() { [[ "${!1+x}" ]]; } # isdefined NAME | ||
261 | # is a NAME defined AND empty? | ||
262 | isempty() { [[ ! "${!1-x}" ]]; } # isempty NAME | ||
263 | |||
226 | request_url() { | 264 | request_url() { |
227 | local server="$1" | 265 | local server="$1" |
228 | local port="$2" | 266 | local port="$2" |