#!/usr/bin/env bash
# transform-url
# cf. https://tools.ietf.org/html/rfc3986#section-5 and
# cf. https://tools.ietf.org/html/rfc3986#section-5.1
# cf. also https://tools.ietf.org/html/rfc3986#appendix-B -- regex
#
# Provenance: commit 2e6b42e5c1e00d946691a0b40f64be1091338519
#             by Case Duckworth, Thu, 28 May 2020 08:37:40 -0500
#             "Start testing transform_uri"

# TEST WITH https://tools.ietf.org/html/rfc3986#section-5.4

# transform_resource BASE REFERENCE
#
# Resolve REFERENCE against BASE following the pseudocode of RFC 3986
# section 5.2.2 and print the recomposed target URI (section 5.3) on
# stdout, followed by a newline.
#
# The spec distinguishes an *undefined* component from a defined-but-empty
# one, so every test below uses ${ARRAY[key]+x}, which expands to "x" only
# when the key is set (even if it is set to the empty string).
transform_resource() { # 5.2.2
    declare -A R B T # reference, base url, target
    # parse_url prints %q-quoted assignments, which makes the eval safe.
    eval "$(parse_url B "$1")"
    eval "$(parse_url R "$2")"

    if [[ "${R[scheme]+x}" ]]; then
        T[scheme]="${R[scheme]}"
        if [[ "${R[authority]+x}" ]]; then
            T[authority]="${R[authority]}"
        fi
        T[path]="$(remove_dot_segments "${R[path]}")"
        if [[ "${R[query]+x}" ]]; then
            T[query]="${R[query]}"
        fi
    else
        if [[ "${R[authority]+x}" ]]; then
            T[authority]="${R[authority]}"
            T[path]="$(remove_dot_segments "${R[path]}")"
            if [[ "${R[query]+x}" ]]; then
                T[query]="${R[query]}"
            fi
        else
            if [[ -z "${R[path]}" ]]; then
                # Empty reference path: keep the base path untouched and
                # inherit the base query unless the reference has its own.
                T[path]="${B[path]}"
                if [[ "${R[query]+x}" ]]; then
                    T[query]="${R[query]}"
                elif [[ "${B[query]+x}" ]]; then
                    T[query]="${B[query]}"
                fi
            else
                if [[ "${R[path]}" == /* ]]; then
                    T[path]="$(remove_dot_segments "${R[path]}")"
                else
                    # "?" tells merge that the base authority is undefined.
                    T[path]="$(merge "${B[authority]-?}" \
                        "${B[path]}" "${R[path]}")"
                    T[path]="$(remove_dot_segments "${T[path]}")"
                fi
                if [[ "${R[query]+x}" ]]; then
                    T[query]="${R[query]}"
                fi
            fi
            if [[ "${B[authority]+x}" ]]; then
                T[authority]="${B[authority]}"
            fi
        fi
        if [[ "${B[scheme]+x}" ]]; then
            T[scheme]="${B[scheme]}"
        fi
    fi
    if [[ "${R[fragment]+x}" ]]; then
        T[fragment]="${R[fragment]}"
    fi

    # 5.3 -- recomposition
    local r=""
    if [[ "${T[scheme]+x}" ]]; then
        r+="${T[scheme]}:"
    fi
    if [[ "${T[authority]+x}" ]]; then
        r+="//${T[authority]}"
    fi
    r+="${T[path]}"
    if [[ "${T[query]+x}" ]]; then
        r+="?${T[query]}"
    fi
    if [[ "${T[fragment]+x}" ]]; then
        r+="#${T[fragment]}"
    fi
    printf '%s\n' "$r"
}

# merge B_AUTHORITY B_PATH R_PATH
#
# Print the merged path on stdout.  B_AUTHORITY must be the literal "?"
# when the base URI has no authority component (see transform_resource);
# any other value, including the empty string, counts as defined.
merge() { # 5.2.3
    #>If the base URI has a defined authority component and an empty
    #>path, then return a string consisting of "/" concatenated with the
    #>reference's path; otherwise,
    #>return a string consisting of the reference's path component
    #>appended to all but the last segment of the base URI's path (i.e.,
    #>excluding any characters after the right-most "/" in the base URI
    #>path, or excluding the entire base URI path if it does not contain
    #>any "/" characters).
    local B_authority="$1" # if ? is here, it means undefined (see caller)
    local B_path="$2"
    local R_path="$3"

    # Convenience beyond the spec text: an empty reference path yields the
    # base path verbatim, mirroring the empty-path rule of 5.2.2.
    if [[ -z "$R_path" ]]; then
        printf '%s\n' "$B_path"
        return
    fi

    # Output is a path, not shell source, so it is printed with %s; %q
    # would inject backslashes and '' into the URI.
    if [[ "$B_authority" != "?" && -z "$B_path" ]]; then
        printf '/%s\n' "$R_path"
    elif [[ "$B_path" == */* ]]; then
        # ${B_path%/*} drops the last segment; re-add the "/" it ended on.
        printf '%s/%s\n' "${B_path%/*}" "$R_path"
    else
        printf '%s\n' "$R_path"
    fi
}

# remove_dot_segments PATH
#
# Print PATH with "." and ".." segments resolved, implementing steps
# 2.A - 2.E of RFC 3986 section 5.2.4.  Empty segments ("//") are
# preserved because they are significant in a URI path.
#
# I can probably just use normalize_path already in bollux here
remove_dot_segments() { # 5.2.4
    local input="$1"
    local output=""
    local first_segment='^(/?[^/]*)(.*)$'

    while [[ -n "$input" ]]; do
        if [[ "$input" == ../* ]]; then # 2.A
            input="${input#../}"
        elif [[ "$input" == ./* ]]; then # 2.A
            input="${input#./}"
        elif [[ "$input" == /./* ]]; then # 2.B
            input="/${input#/./}"
        elif [[ "$input" == /. ]]; then # 2.B, "." is the whole segment
            input="/"
        elif [[ "$input" == /../* || "$input" == /.. ]]; then # 2.C
            if [[ "$input" == /.. ]]; then
                input="/"
            else
                input="/${input#/../}"
            fi
            # Drop the last output segment together with its leading "/";
            # if there is no "/", the whole buffer is that segment.
            if [[ "$output" == */* ]]; then
                output="${output%/*}"
            else
                output=""
            fi
        elif [[ "$input" == . || "$input" == .. ]]; then # 2.D
            input=""
        else # 2.E
            # move the first path segment in the input buffer to the end of
            # the output buffer, including the initial "/" character (if
            # any) and any subsequent characters up to, but not including,
            # the next "/" character or the end of the input buffer.
            if ! [[ "$input" =~ $first_segment ]]; then
                echo NOMATCH >&2
                break # cannot make progress; avoid spinning forever
            fi
            output+="${BASH_REMATCH[1]}"
            input="${BASH_REMATCH[2]}"
        fi
    done
    printf '%s\n' "$output"
}

# *FINDING* URLS ... IN PURE BASH !!!
#
# parse_url NAME STRING
#
# Split STRING with the regex from RFC 3986 appendix B and print one
# assignment per *defined* component, in the form NAME[component]=value,
# for scheme, authority, path, query and fragment.  The path is always
# defined (possibly empty); the other components are printed only when
# their delimiter (":", "//", "?", "#") is present in STRING.
#
# Values are quoted with printf %q so that the output can be passed to
# eval even when STRING contains shell metacharacters.  NAME must be an
# associative array declared by the caller.
parse_url() { # eval "$(parse_url NAME STRING)" => NAME[...]
    local name="$1"
    local string="$2"
    local re='^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?'
    [[ $string =~ $re ]] || return $?

    # Odd-numbered groups include the delimiter, so they are non-empty
    # exactly when the component is defined.
    if [[ -n "${BASH_REMATCH[1]}" ]]; then
        printf '%s[scheme]=%q\n' "$name" "${BASH_REMATCH[2]}"
    fi
    if [[ -n "${BASH_REMATCH[3]}" ]]; then
        printf '%s[authority]=%q\n' "$name" "${BASH_REMATCH[4]}"
    fi
    printf '%s[path]=%q\n' "$name" "${BASH_REMATCH[5]}"
    if [[ -n "${BASH_REMATCH[6]}" ]]; then
        printf '%s[query]=%q\n' "$name" "${BASH_REMATCH[7]}"
    fi
    if [[ -n "${BASH_REMATCH[8]}" ]]; then
        printf '%s[fragment]=%q\n' "$name" "${BASH_REMATCH[9]}"
    fi
}

# ease-of-life functions

# isdefined NAME => succeeds if the variable called NAME is set, whether
# or not its value is empty.
isdefined() { # isdefined NAME => tests if NAME is defined ONLY
    [[ "${!1+x}" ]]
}

# isempty NAME => succeeds only if the variable called NAME is set AND its
# value is the empty string (an unset variable expands to "x" here).
isempty() { # isempty NAME => tests if NAME is empty ONLY
    [[ ! "${!1-x}" ]]
}

set -x
transform_resource "$@"

# NEXT ....
# NORMALIZATION !!!