# URL reference resolution for bollux, following RFC 3986 section 5
# ("Reference Resolution").
#
# Provenance: commit c62e428c168c151ac852df6f578e3349d4c831e3
#   Author:  Case Duckworth
#   Date:    Sat, 30 May 2020 13:38:18 -0500
#   Subject: Bring URL transforming up to spec
#
#   URL transformation (formerly called "munging") is now (afaict) fully
#   compliant with the RFC spec. It's also implemented in pure bash and is
#   available as a library at https://git.sr.ht/~acdw/shurlie.
#
#   bollux | 1 file changed, 118 insertions(+), 80 deletions(-)
#
# That commit replaced the gawk-based munge_url, normalize_path and split_url
# helpers with the functions below, and changed blastoff() to resolve links
# with:
#   URL="$(run transform_resource "$BOLLUX_URL" "$1")"

#######################################
# Resolve a URI reference against a base URI (RFC 3986, 5.2.2 and 5.3).
# Globals:
#   STRICT (read) - set to "false" to ignore a reference scheme that is
#                   identical to the base scheme; anything else is strict.
# Arguments:
#   $1 - base URL
#   $2 - reference URL (absolute or relative)
# Outputs:
#   The recomposed target URL on stdout, followed by a newline.
#######################################
transform_resource() { # transform_resource BASE_URL REFERENCE_URL
  local -A R B T # reference, base url, target
  local result=""

  # parse_url prints NAME[key]=value lines whose values are %q-quoted and
  # whose NAME is the fixed literal given here, so evaluating them is safe.
  eval "$(parse_url B "$1")"
  eval "$(parse_url R "$2")"

  # A non-strict parser may ignore a scheme in the reference
  # if it is identical to the base URI's scheme.
  # The subscript is quoted so the array element itself is unset, rather
  # than a variable named after the scheme's value.
  if [[ "${STRICT:-true}" == false &&
    "${R[scheme]-}" == "${B[scheme]-}" ]]; then
    unset 'R[scheme]'
  fi

  # basically pseudo-code from spec ported to bash
  if isdefined "R[scheme]"; then
    T[scheme]="${R[scheme]}"
    if isdefined "R[authority]"; then T[authority]="${R[authority]}"; fi
    T[path]="$(remove_dot_segments "${R[path]-}")"
    if isdefined "R[query]"; then T[query]="${R[query]}"; fi
  else
    if isdefined "R[authority]"; then
      T[authority]="${R[authority]}"
      T[path]="$(remove_dot_segments "${R[path]-}")"
      if isdefined "R[query]"; then T[query]="${R[query]}"; fi
    else
      if isempty "R[path]"; then
        T[path]="${B[path]-}"
        # Inherit the base query only when the base actually has one;
        # otherwise the target would gain a spurious trailing "?".
        if isdefined "R[query]"; then
          T[query]="${R[query]}"
        elif isdefined "B[query]"; then
          T[query]="${B[query]}"
        fi
      else
        if [[ "${R[path]}" == /* ]]; then
          T[path]="$(remove_dot_segments "${R[path]}")"
        else
          # merge_paths takes the NAME of the base authority so it can
          # tell "undefined" apart from "defined but empty".
          T[path]="$(merge_paths "B[authority]" "${B[path]-}" "${R[path]}")"
          T[path]="$(remove_dot_segments "${T[path]}")"
        fi
        if isdefined "R[query]"; then T[query]="${R[query]}"; fi
      fi
      # Copy only defined components so that recomposition does not emit
      # "//" or ":" for a base that lacks an authority or a scheme.
      if isdefined "B[authority]"; then T[authority]="${B[authority]}"; fi
    fi
    if isdefined "B[scheme]"; then T[scheme]="${B[scheme]}"; fi
  fi
  if isdefined "R[fragment]"; then T[fragment]="${R[fragment]}"; fi

  # cf. 5.3 -- recomposition
  if isdefined "T[scheme]"; then result+="${T[scheme]}:"; fi
  if isdefined "T[authority]"; then result+="//${T[authority]}"; fi
  result+="${T[path]-}"
  if isdefined "T[query]"; then result+="?${T[query]}"; fi
  if isdefined "T[fragment]"; then result+="#${T[fragment]}"; fi
  printf '%s\n' "$result"
}

#######################################
# Merge a relative-path reference with the base path (RFC 3986, 5.2.3).
# Arguments:
#   $1 - NAME of the variable holding the base authority (for example
#        "B[authority]"); it is only tested for being set. A non-empty
#        argument that is not a valid variable name is treated as a
#        defined authority.
#   $2 - base path
#   $3 - reference path
# Outputs:
#   The merged path on stdout, followed by a newline.
#######################################
merge_paths() { # 5.2.3
  local authority_name="$1"
  local base_path="$2"
  local ref_path="$3"
  local name_re='^[A-Za-z_][A-Za-z0-9_]*(\[.+\])?$'
  local has_authority=false

  # Nothing to merge: hand the base path back untouched.
  if [[ -z "$ref_path" ]]; then
    printf '%s\n' "$base_path"
    return
  fi

  # Indirect expansion of an invalid name is a shell error, so only look
  # the name up when it is syntactically a variable (or array element).
  if [[ "$authority_name" =~ $name_re ]]; then
    if isdefined "$authority_name"; then has_authority=true; fi
  elif [[ -n "$authority_name" ]]; then
    has_authority=true
  fi

  if "$has_authority" && [[ -z "$base_path" ]]; then
    printf '/%s\n' "$ref_path"
  elif [[ "$base_path" == */* ]]; then
    # Everything up to and including the right-most "/" of the base path.
    printf '%s/%s\n' "${base_path%/*}" "$ref_path"
  else
    # The base path has no "/", so the whole of it is excluded.
    printf '%s\n' "$ref_path"
  fi
}

#######################################
# Remove "." and ".." segments from a path (RFC 3986, 5.2.4).
# Empty segments ("//") are kept as they are; the RFC gives them meaning.
# Arguments:
#   $1 - path
# Outputs:
#   The path without dot segments on stdout, followed by a newline.
# Returns:
#   0 on success, 1 if a segment could not be split off the input.
#######################################
remove_dot_segments() { # 5.2.4
  local input="$1"
  local output=""

  while [[ -n "$input" ]]; do
    case "$input" in
      ../*) input="${input#../}" ;; # 2A
      ./*) input="${input#./}" ;;   # 2A
      /./*) input="/${input#/./}" ;; # 2B
      /.) input="/" ;;               # 2B
      /../* | /..)                   # 2C
        if [[ "$input" == /.. ]]; then
          input="/"
        else
          input="/${input#/../}"
        fi
        # Drop the last output segment together with its preceding "/".
        if [[ "$output" == */* ]]; then
          output="${output%/*}"
        else
          output=""
        fi
        ;;
      . | ..) input="" ;; # 2D
      *)                  # 2E: move the first segment to the output
        [[ "$input" =~ ^(/?[^/]*)(.*)$ ]] || return 1
        output+="${BASH_REMATCH[1]}"
        input="${BASH_REMATCH[2]}"
        ;;
    esac
  done
  printf '%s\n' "$output"
}

#######################################
# Split a URL into its components (regex from RFC 3986, appendix B).
# Usage: eval "$(parse_url NAME STRING)" => NAME[scheme], NAME[authority],
#        NAME[path], NAME[query], NAME[fragment]
# NAME should already be declared as an associative array by the caller.
# A component is emitted whenever its delimiter is present, even if the
# component itself is empty; the path is always emitted.
# Arguments:
#   $1 - name of the array to assign into
#   $2 - URL string
# Outputs:
#   One shell assignment per defined component, values quoted with %q.
# Returns:
#   0 on success, 2 if NAME is not a valid identifier, otherwise the status
#   of the failed match.
#######################################
parse_url() { # eval "$(parse_url NAME STRING)" => NAME[...]
  local name="$1"
  local string="$2"
  local re='^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?'

  # The output is meant for eval, so refuse anything but a plain identifier.
  [[ "$name" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]] || return 2
  [[ $string =~ $re ]] || return $?

  local scheme_part="${BASH_REMATCH[1]}" scheme="${BASH_REMATCH[2]}"
  local authority_part="${BASH_REMATCH[3]}" authority="${BASH_REMATCH[4]}"
  local path="${BASH_REMATCH[5]}"
  local query_part="${BASH_REMATCH[6]}" query="${BASH_REMATCH[7]}"
  local fragment_part="${BASH_REMATCH[8]}" fragment="${BASH_REMATCH[9]}"

  # The "*_part" groups include the delimiter, so they are non-empty exactly
  # when the component is defined.
  if [[ -n "$scheme_part" ]]; then
    printf '%s[scheme]=%q\n' "$name" "$scheme"
  fi
  if [[ -n "$authority_part" ]]; then
    printf '%s[authority]=%q\n' "$name" "$authority"
  fi
  if [[ -n "$query_part" ]]; then
    printf '%s[query]=%q\n' "$name" "$query"
  fi
  if [[ -n "$fragment_part" ]]; then
    printf '%s[fragment]=%q\n' "$name" "$fragment"
  fi
  # The path is always defined, though it may be empty.
  printf '%s[path]=%q\n' "$name" "$path"
}

# is a NAME defined ('set' in bash)?
# NAME may be a plain variable or an array element such as "R[query]".
isdefined() { [[ "${!1+x}" ]]; } # isdefined NAME
# is a NAME defined AND empty?
isempty() { [[ ! "${!1-x}" ]]; } # isempty NAME