#!/usr/bin/env bash
# transform-url -- resolve a URI reference against a base URI (RFC 3986).
#
# Usage: transform-url BASE REFERENCE
#   Prints the resolved target URI on stdout.
#
# cf. https://tools.ietf.org/html/rfc3986#section-5 and
# cf. https://tools.ietf.org/html/rfc3986#section-5.1
# cf. also https://tools.ietf.org/html/rfc3986#appendix-B -- regex
# TEST WITH https://tools.ietf.org/html/rfc3986#section-5.4

#######################################
# Resolve a reference against a base URI (RFC 3986, 5.2.2) and recompose
# the result (5.3).  Follows the pseudocode in the spec (strict mode).
# Arguments: $1 - base URI
#            $2 - URI reference (may be relative or empty)
# Outputs:   the target URI, followed by a newline, on stdout
#######################################
transform_resource() { # 5.2.2
  local -A R B T # reference, base url, target
  # parse_url emits %q-quoted assignments, so this eval is safe for any input.
  eval "$(parse_url B "${1-}")"
  eval "$(parse_url R "${2-}")"

  # "${X[key]+x}" is non-empty iff the component is *defined* (it may still
  # be empty, e.g. the authority of "file:///etc/hosts").
  if [[ -n "${R[scheme]+x}" ]]; then
    T[scheme]="${R[scheme]}"
    if [[ -n "${R[authority]+x}" ]]; then T[authority]="${R[authority]}"; fi
    T[path]="$(remove_dot_segments "${R[path]-}")"
    if [[ -n "${R[query]+x}" ]]; then T[query]="${R[query]}"; fi
  else
    if [[ -n "${R[authority]+x}" ]]; then
      T[authority]="${R[authority]}"
      T[path]="$(remove_dot_segments "${R[path]-}")"
      if [[ -n "${R[query]+x}" ]]; then T[query]="${R[query]}"; fi
    else
      if [[ -z "${R[path]-}" ]]; then
        # Empty reference path: keep the base path untouched and fall back
        # to the base query only when the reference has none.
        T[path]="${B[path]-}"
        if [[ -n "${R[query]+x}" ]]; then
          T[query]="${R[query]}"
        elif [[ -n "${B[query]+x}" ]]; then
          T[query]="${B[query]}"
        fi
      else
        if [[ "${R[path]}" == /* ]]; then
          T[path]="$(remove_dot_segments "${R[path]}")"
        else
          # '?' cannot appear in an authority, so it marks "undefined".
          T[path]="$(merge "${B[authority]-?}" "${B[path]-}" "${R[path]}")"
          T[path]="$(remove_dot_segments "${T[path]}")"
        fi
        if [[ -n "${R[query]+x}" ]]; then T[query]="${R[query]}"; fi
      fi
      if [[ -n "${B[authority]+x}" ]]; then T[authority]="${B[authority]}"; fi
    fi
    if [[ -n "${B[scheme]+x}" ]]; then T[scheme]="${B[scheme]}"; fi
  fi
  if [[ -n "${R[fragment]+x}" ]]; then T[fragment]="${R[fragment]}"; fi

  # 5.3 -- recomposition: only defined components contribute delimiters.
  local r=""
  if [[ -n "${T[scheme]+x}" ]]; then r+="${T[scheme]}:"; fi
  if [[ -n "${T[authority]+x}" ]]; then r+="//${T[authority]}"; fi
  r+="${T[path]-}"
  if [[ -n "${T[query]+x}" ]]; then r+="?${T[query]}"; fi
  if [[ -n "${T[fragment]+x}" ]]; then r+="#${T[fragment]}"; fi
  printf '%s\n' "$r"
}

#######################################
# Merge a relative-path reference with the base path (RFC 3986, 5.2.3).
#
# If the base URI has a defined authority component and an empty path,
# return "/" + reference path; otherwise return the reference path appended
# to all but the last segment of the base path (i.e. everything up to and
# including the right-most "/", or nothing if the base path has no "/").
# Arguments: $1 - base authority, or the literal "?" if it is undefined
#            $2 - base path
#            $3 - reference path
# Outputs:   the merged path (dot segments NOT yet removed) on stdout
#######################################
merge() { # 5.2.3
  local B_authority="$1" # if ? is here, it means undefined (see caller)
  local B_path="$2"
  local R_path="$3"
  if [[ "$B_authority" != "?" && -z "$B_path" ]]; then
    printf '/%s\n' "$R_path"
  elif [[ "$B_path" == */* ]]; then
    # ${B_path%/*} drops the last segment; re-add exactly one "/".
    printf '%s/%s\n' "${B_path%/*}" "$R_path"
  else
    printf '%s\n' "$R_path"
  fi
}

#######################################
# Remove "." and ".." segments from a path (RFC 3986, 5.2.4).
# Empty segments ("//") are preserved, as the RFC requires.
# Arguments: $1 - path
# Outputs:   the normalized path on stdout
#######################################
# I can probably just use normalize_path already in bollux here
remove_dot_segments() { # 5.2.4
  local input="$1"
  local output=""
  while [[ -n "$input" ]]; do
    if [[ "$input" == ../* ]]; then # A
      input="${input#../}"
    elif [[ "$input" == ./* ]]; then # A
      input="${input#./}"
    elif [[ "$input" == /./* ]]; then # B: "/./" -> "/"
      input="/${input#/./}"
    elif [[ "$input" == /. ]]; then # B: "." must be a complete segment
      input="/"
    elif [[ "$input" == /../* || "$input" == /.. ]]; then # C
      if [[ "$input" == /.. ]]; then
        input="/"
      else
        input="/${input#/../}"
      fi
      # Drop the last output segment together with its preceding "/" (if
      # any); a slash-less output is a single segment and vanishes whole.
      if [[ "$output" == */* ]]; then
        output="${output%/*}"
      else
        output=""
      fi
    elif [[ "$input" == . || "$input" == .. ]]; then # D
      input=""
    else # E
      # move the first path segment in the input buffer to the end of
      # the output buffer, including the initial "/" character (if
      # any) and any subsequent characters up to, but not including,
      # the next "/" character or the end of the input buffer.
      [[ "$input" =~ ^(/?[^/]*)(.*)$ ]] || {
        echo NOMATCH >&2
        return 1
      }
      output+="${BASH_REMATCH[1]}"
      input="${BASH_REMATCH[2]}"
    fi
  done
  printf '%s\n' "$output"
}

#######################################
# Split a URI reference into its components (RFC 3986, appendix B).
#
#   eval "$(parse_url NAME STRING)"  =>  NAME[scheme], NAME[authority], ...
#
# NAME must already be an associative array in the caller.  Only *defined*
# components are assigned: the path is always defined; the others are defined
# iff their delimiter (":", "//", "?", "#") is present, even when the
# component itself is empty.  Values are quoted with printf %q, so the output
# is safe to eval regardless of what STRING contains.
# Arguments: $1 - name of the associative array to fill
#            $2 - URI reference to parse
# Outputs:   shell assignments, one per line, on stdout
# Returns:   2 if NAME is not a valid identifier
#######################################
parse_url() { # eval "$(parse_url NAME STRING)" => NAME[...]
  local name="$1"
  local string="$2"
  local re='^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?'
  # NAME is spliced into code that gets eval'd: insist on an identifier.
  [[ "$name" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]] || return 2
  # Re-match last so BASH_REMATCH holds the URL groups, not the name check.
  [[ "$string" =~ $re ]] || return
  if [[ -n "${BASH_REMATCH[1]}" ]]; then
    printf '%s[scheme]=%q\n' "$name" "${BASH_REMATCH[2]}"
  fi
  if [[ -n "${BASH_REMATCH[3]}" ]]; then
    printf '%s[authority]=%q\n' "$name" "${BASH_REMATCH[4]}"
  fi
  printf '%s[path]=%q\n' "$name" "${BASH_REMATCH[5]}"
  if [[ -n "${BASH_REMATCH[6]}" ]]; then
    printf '%s[query]=%q\n' "$name" "${BASH_REMATCH[7]}"
  fi
  if [[ -n "${BASH_REMATCH[8]}" ]]; then
    printf '%s[fragment]=%q\n' "$name" "${BASH_REMATCH[9]}"
  fi
  return 0
}

# ease-of-life functions

# isdefined NAME => succeeds iff the variable named NAME is set (it may be
# empty).
isdefined() { # isdefined NAME => tests if NAME is defined ONLY
  [[ "${!1+x}" ]]
}

# isempty NAME => succeeds iff the variable named NAME is set AND empty.
isempty() { # isempty NAME => tests if NAME is empty ONLY
  [[ ! "${!1-x}" ]]
}

# Debug tracing is no longer forced on; run with `bash -x` when needed.
transform_resource "$@"

# NEXT ....
# NORMALIZATION !!!