From b7499b1ed8dc7d3de9fcdff650a0b5356185b63e Mon Sep 17 00:00:00 2001 From: Larefly Date: Sat, 4 Jan 2025 17:26:51 +0000 Subject: [PATCH] migrate `parse_url`, `parse_query_string` and `build_url` to `cpp` for performance improvement (#872) --- paws.common/NEWS.md | 1 + paws.common/R/RcppExports.R | 16 + paws.common/R/custom_s3.R | 22 +- paws.common/R/url.R | 135 +-------- paws.common/src/RcppExports.cpp | 48 +++ paws.common/src/encoding.cpp | 142 ++++++--- paws.common/src/encoding.h | 42 +++ paws.common/src/url_parse.cpp | 286 ++++++++++++++++++ paws.common/tests/testthat/test_credentials.R | 8 +- paws.common/tests/testthat/test_escape.R | 5 + paws.common/tests/testthat/test_url.R | 42 ++- 11 files changed, 556 insertions(+), 191 deletions(-) create mode 100644 paws.common/src/encoding.h create mode 100644 paws.common/src/url_parse.cpp diff --git a/paws.common/NEWS.md b/paws.common/NEWS.md index 6d85b9eff..e9af157ca 100644 --- a/paws.common/NEWS.md +++ b/paws.common/NEWS.md @@ -6,6 +6,7 @@ * migrate backend `httr` to `httr2` * new `PawsStreamHandler`, allows paws to handle aws stream event (#842). Thankyou to @hadley for developing the initial solution in `httr2`. * deprecated custom handler for `s3_unmarshal_select_object_content` +* migrate `parse_url`, `parse_query_string` and `build_url` to `cpp` for performance improvement. # paws.common 0.7.7 * fix unix time expiration check diff --git a/paws.common/R/RcppExports.R b/paws.common/R/RcppExports.R index d9397031c..3990b2ef9 100644 --- a/paws.common/R/RcppExports.R +++ b/paws.common/R/RcppExports.R @@ -7,6 +7,10 @@ paws_url_encoder <- function(urls, safe = "") { .Call('_paws_common_paws_url_encoder', PACKAGE = 'paws.common', urls, safe) } +paws_url_unencoder <- function(urls) { + .Call('_paws_common_paws_url_unencoder', PACKAGE = 'paws.common', urls) +} + scan_ini_file <- function(filename) { .Call('_paws_common_scan_ini_file', PACKAGE = 'paws.common', filename) } @@ -39,6 +43,18 @@ get_region_pattern <- function(region_pattern, region) { .Call('_paws_common_get_region_pattern', PACKAGE = 'paws.common', region_pattern, region) } +parse_query_string <- function(query) { + .Call('_paws_common_parse_query_string', PACKAGE = 'paws.common', query) +} + +parse_url <- function(url) { + .Call('_paws_common_parse_url', PACKAGE = 'paws.common', url) +} + +build_url <- function(url_components) { + .Call('_paws_common_build_url', PACKAGE = 'paws.common', url_components) +} + #' @useDynLib paws.common _paws_common_char_sort #' @importFrom Rcpp evalCpp char_sort <- function(str) { diff --git a/paws.common/R/custom_s3.R b/paws.common/R/custom_s3.R index 4c03482c1..e4f8700e0 100644 --- a/paws.common/R/custom_s3.R +++ b/paws.common/R/custom_s3.R @@ -437,23 +437,19 @@ s3_get_bucket_region <- function(request, error, bucket) { set_request_url <- function(original_endpoint, new_endpoint, use_new_scheme = TRUE) { - new_endpoint_components <- paws_url_parse(new_endpoint) - original_endpoint_components <- paws_url_parse(original_endpoint) - scheme <- original_endpoint_components[["scheme"]] + new_endpoint_components <- parse_url(new_endpoint) + final_endpoint_components <- parse_url(original_endpoint) + scheme <- final_endpoint_components$scheme if (use_new_scheme) { scheme <- new_endpoint_components[["scheme"]] } - final_endpoint_components <- list( - scheme = scheme, - host = new_endpoint_components[["hostname"]] %||% "", - path = original_endpoint_components[["path"]] %||% "", - query = original_endpoint_components[["query"]] %||% "", - fragment = "", - raw_path = "", - raw_query = "" + path <- ( + if (final_endpoint_components[["path"]] == "/") "" else final_endpoint_components[["path"]] ) - final_endpoint <- build_url(final_endpoint_components) - return(final_endpoint) + final_endpoint_components[["host"]] <- new_endpoint_components$host + final_endpoint_components[["scheme"]] <- scheme + final_endpoint_components[["path"]] <- path + return(build_url(final_endpoint_components)) } ################################################################################ diff --git a/paws.common/R/url.R b/paws.common/R/url.R index 258ce2da0..ba9ab27e0 100644 --- a/paws.common/R/url.R +++ b/paws.common/R/url.R @@ -8,6 +8,7 @@ Url <- struct( scheme = "", opaque = "", user = "", + password = "", host = "", path = "", raw_path = "", @@ -16,84 +17,6 @@ Url <- struct( fragment = "" ) -# Parse a URL into a Url object. -# TODO: Finish. -parse_url <- function(url) { - p <- paws_url_parse(url) - if (is.null(p$hostname)) p$hostname <- "" - if (!is.null(p$port)) p$hostname <- paste0(p$hostname, ":", p$port) - raw_path <- p$path - if (is.null(raw_path)) { - raw_path <- "/" - } else if (substr(raw_path, 1, 1) != "/") raw_path <- paste0("/", raw_path) - path <- unescape(raw_path) - escaped_path <- escape(raw_path, "encodePath") - if (escaped_path == raw_path) raw_path <- "" - u <- Url( - scheme = p$scheme %||% "", - host = p$hostname, - path = path, - raw_path = raw_path, - raw_query = build_query_string(p$query), - ) - return(u) -} - -# Developed from httr2: -# https://github.com/r-lib/httr2/blob/main/R/url.R#L26-L67 -paws_url_parse <- function(url) { - pieces <- str_match(url, "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?") - scheme <- pieces[[2]] - authority <- pieces[[4]] - path <- pieces[[5]] - query <- pieces[[7]] - if (!is.null(query)) { - query <- parse_query_string(query) - } - fragment <- pieces[[9]] - pieces <- str_match(authority %||% "", "^(([^@]+)@)?([^:]+)?(:([^#]+))?") - userinfo <- pieces[[2]] - if (!is.null(userinfo)) { - if (grepl(":", userinfo)) { - userinfo <- parse_in_half(userinfo, ":") - } else { - userinfo <- list(userinfo, NULL) - } - } - hostname <- pieces[[3]] - port <- pieces[[5]] - return( - list( - scheme = scheme, - hostname = hostname, - port = port, - path = path, - query = query, - fragment = fragment, - username = userinfo[[1]], - password = userinfo[[2]] - ) - ) -} - -# Build a URL from a Url object. -# :///;?# -build_url <- function(url) { - if (nzchar(url$scheme) && nzchar(url[["host"]])) { - l <- paste0(url$scheme, "://", url[["host"]]) - } else { - return("") - } - prefix <- function(prefix, x) { - if (nzchar(x)) paste0(prefix, x) - } - l <- paste0( - l, if (nzchar(url[["raw_path"]])) url[["raw_path"]] else url[["path"]], - prefix("?", url[["raw_query"]]), prefix("#", url$fragment) - ) - return(l) -} - # helper function to filter out empty elements within build_query_string query_empty <- function(params) { (is.null(params) || length(params) == 0) @@ -126,19 +49,6 @@ build_query_string <- function(params) { return(paste(params[char_sort(param_names)], collapse = "&")) } -# Decode a query string into a list. -# e.g. `parse_query_string("bar=baz&foo=qux")` -> `list(bar = "baz", foo = "qux")` -parse_query_string <- function(query) { - query <- gsub("^\\?", "", query) - params <- parse_in_half(strsplit(query, "&")[[1]], "=") - if (length(params) == 0) { - return(list()) - } - out <- as.list(curl::curl_unescape(params[, 2])) - names(out) <- curl::curl_unescape(params[, 1]) - return(out) -} - # Add the key/value pairs in `params` to a query string in `query_string`, # and return a new query string. Keys in the query string that are also in # params will be overwritten with the new value from params. @@ -151,39 +61,16 @@ update_query_string <- function(query_string, params) { # Escape strings so they can be safely included in a URL. escape <- function(string, mode) { - # Ensure anything going to paws_url_encoder is a string - string <- as.character(string) - # base characters that won't be encoded - if (mode == "encodeHost" || mode == "encodeZone") { - # host and zone characters that won't be encoded - host_zone_pattern <- "][!$&'()*+,;=:<>\"" - return( - paws_url_encoder(string, host_zone_pattern) - ) - } - # path and path segment characters that won't be encoded - path_pattern <- "$&+,/;:=?@" - - if (mode == "encodePath") { - # remove character ? from pattern so that it can be encoded - rm_pattern <- "[?]" - pattern <- gsub(rm_pattern, "", path_pattern) - return(paws_url_encoder(string, pattern)) - } - if (mode == "encodePathSegment") { - # remove character /;,? from pattern so that it can be encoded - rm_pattern <- "[/;,?]" - pattern <- gsub(rm_pattern, "", path_pattern) - return(paws_url_encoder(string, pattern)) - } - if (mode == "encodeQueryComponent") { - # escape string using base_url_encode - return(paws_url_encoder(string)) - } - if (mode == "encodeFragment") { - return(paws_url_encoder(string, path_pattern)) - } - return(paws_url_encoder(string)) + safe_pattern <- switch(mode, + "encodeHost" = "][!$&'()*+,;=:<>\"", + "encodeZone" = "][!$&'()*+,;=:<>\"", + "encodeFragment" = "$&+,/;:=?@", + "encodePath" = "$&+,/;:=@", + "encodePathSegment" = "$&+:=@", + "encodeQueryComponent" = "", + "" + ) + return(paws_url_encoder(as.character(string), safe_pattern)) } # Un-escape a string. diff --git a/paws.common/src/RcppExports.cpp b/paws.common/src/RcppExports.cpp index 24ed5b0d4..5e576fc40 100644 --- a/paws.common/src/RcppExports.cpp +++ b/paws.common/src/RcppExports.cpp @@ -22,6 +22,17 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// paws_url_unencoder +CharacterVector paws_url_unencoder(CharacterVector urls); +RcppExport SEXP _paws_common_paws_url_unencoder(SEXP urlsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< CharacterVector >::type urls(urlsSEXP); + rcpp_result_gen = Rcpp::wrap(paws_url_unencoder(urls)); + return rcpp_result_gen; +END_RCPP +} // scan_ini_file std::vector scan_ini_file(const std::string& filename); RcppExport SEXP _paws_common_scan_ini_file(SEXP filenameSEXP) { @@ -90,6 +101,39 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// parse_query_string +List parse_query_string(std::string query); +RcppExport SEXP _paws_common_parse_query_string(SEXP querySEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< std::string >::type query(querySEXP); + rcpp_result_gen = Rcpp::wrap(parse_query_string(query)); + return rcpp_result_gen; +END_RCPP +} +// parse_url +Rcpp::List parse_url(const std::string& url); +RcppExport SEXP _paws_common_parse_url(SEXP urlSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const std::string& >::type url(urlSEXP); + rcpp_result_gen = Rcpp::wrap(parse_url(url)); + return rcpp_result_gen; +END_RCPP +} +// build_url +std::string build_url(const Rcpp::List& url_components); +RcppExport SEXP _paws_common_build_url(SEXP url_componentsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const Rcpp::List& >::type url_components(url_componentsSEXP); + rcpp_result_gen = Rcpp::wrap(build_url(url_components)); + return rcpp_result_gen; +END_RCPP +} // char_sort CharacterVector char_sort(CharacterVector str); RcppExport SEXP _paws_common_char_sort(SEXP strSEXP) { @@ -115,12 +159,16 @@ END_RCPP static const R_CallMethodDef CallEntries[] = { {"_paws_common_paws_url_encoder", (DL_FUNC) &_paws_common_paws_url_encoder, 2}, + {"_paws_common_paws_url_unencoder", (DL_FUNC) &_paws_common_paws_url_unencoder, 1}, {"_paws_common_scan_ini_file", (DL_FUNC) &_paws_common_scan_ini_file, 1}, {"_paws_common_process_profile_name", (DL_FUNC) &_paws_common_process_profile_name, 1}, {"_paws_common_json_convert_string", (DL_FUNC) &_paws_common_json_convert_string, 1}, {"_paws_common_check_global", (DL_FUNC) &_paws_common_check_global, 1}, {"_paws_common_endpoint_unescape", (DL_FUNC) &_paws_common_endpoint_unescape, 2}, {"_paws_common_get_region_pattern", (DL_FUNC) &_paws_common_get_region_pattern, 2}, + {"_paws_common_parse_query_string", (DL_FUNC) &_paws_common_parse_query_string, 1}, + {"_paws_common_parse_url", (DL_FUNC) &_paws_common_parse_url, 1}, + {"_paws_common_build_url", (DL_FUNC) &_paws_common_build_url, 1}, {"_paws_common_char_sort", (DL_FUNC) &_paws_common_char_sort, 1}, {"_paws_common_uuid_v4", (DL_FUNC) &_paws_common_uuid_v4, 1}, {NULL, NULL, 0} diff --git a/paws.common/src/encoding.cpp b/paws.common/src/encoding.cpp index f36f8f451..62166f6b3 100644 --- a/paws.common/src/encoding.cpp +++ b/paws.common/src/encoding.cpp @@ -1,11 +1,8 @@ // This encoder has been modified from the excellent urltools R package // https://github.com/Ironholds/urltools/blob/master/src/encoding.cpp +#include "encoding.h" #include -#include -#include -#include -#include using namespace Rcpp; @@ -13,26 +10,23 @@ using namespace Rcpp; const char hex_chars[] = "0123456789ABCDEF"; // Precomputed unreserved characters lookup table -const std::bitset<256> unreserved_chars_map = [] { +const std::bitset<256> unreserved_chars_map = [] +{ std::bitset<256> bitset; std::string unreserved_chars = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ._~-"; - for (char ch : unreserved_chars) { + for (char ch : unreserved_chars) + { bitset.set(static_cast(ch)); } return bitset; }(); -inline std::string to_hex(char x) { - std::string output; - output.push_back(hex_chars[(x & 0xF0) >> 4]); - output.push_back(hex_chars[x & 0x0F]); - return output; -} - -std::string internal_url_encode(const std::string& url, const std::string& safe) { +std::string internal_url_encode(const std::string &url, const std::string &safe) +{ // Create a bitset for additional safe characters std::bitset<256> safe_chars_map = unreserved_chars_map; - for (char ch : safe) { + for (char ch : safe) + { safe_chars_map.set(static_cast(ch)); } @@ -41,11 +35,15 @@ std::string internal_url_encode(const std::string& url, const std::string& safe) output.reserve(url.size() * 3); // Assume worst case where all characters are encoded // For each character - for (char ch : url) { + for (char ch : url) + { // If it's in the list of unreserved ones, just pass it through - if (safe_chars_map.test(static_cast(ch))) { + if (safe_chars_map.test(static_cast(ch))) + { output.push_back(ch); - } else { + } + else + { // Otherwise, append in an encoded form output.push_back('%'); output.append(to_hex(ch)); @@ -56,33 +54,95 @@ std::string internal_url_encode(const std::string& url, const std::string& safe) return output; } +std::string internal_url_unencode(const std::string &url) +{ + std::string unescaped; + unescaped.reserve(url.size()); // Reserve memory to avoid reallocations + size_t length = url.length(); + + for (size_t i = 0; i < length; ++i) + { + if (url[i] == '%' && i + 2 < length) + { + int high = from_hex(url[i + 1]); + int low = from_hex(url[i + 2]); + if (high != -1 && low != -1) + { + unescaped.push_back(static_cast((high << 4) | low)); + i += 2; + } + else + { + unescaped.push_back('%'); // Invalid percent encoding, keep as is + } + } + else + { + unescaped.push_back(url[i]); // Keep other characters as is + } + } + return unescaped; +} + // Escape characters for use in URLs. // param urls A character vector to be encoded // param safe A characters of extra that should not be encoded //' @useDynLib paws.common _paws_common_paws_url_encoder //' @importFrom Rcpp evalCpp // [[Rcpp::export]] -CharacterVector paws_url_encoder(CharacterVector urls, CharacterVector safe = "") { - // Measure size, create output object and holding objects - int input_size = urls.size(); - CharacterVector output(input_size); - std::string safe_pattern = Rcpp::as(safe); - - // For each string - for (int i = 0; i < input_size; ++i) { - // Check for user interrupts - if ((i % 10000) == 0) { - Rcpp::checkUserInterrupt(); - } - - if (urls[i] == NA_STRING) { - output[i] = NA_STRING; - } else { - std::string holding = Rcpp::as(urls[i]); - output[i] = internal_url_encode(holding, safe_pattern); - } - } - - // Return - return output; +CharacterVector paws_url_encoder(CharacterVector urls, CharacterVector safe = "") +{ + // Measure size, create output object and holding objects + int input_size = urls.size(); + CharacterVector output(input_size); + std::string safe_pattern = Rcpp::as(safe); + + // For each string + for (int i = 0; i < input_size; ++i) + { + // Check for user interrupts + if ((i % 10000) == 0) + { + Rcpp::checkUserInterrupt(); + } + + if (urls[i] == NA_STRING) + { + output[i] = NA_STRING; + } + else + { + std::string holding = Rcpp::as(urls[i]); + output[i] = internal_url_encode(holding, safe_pattern); + } + } + + // Return + return output; +} + +// Escape all special characters (i.e. everything except for a-z, A-Z, 0-9, '-', '.', '_' or '~') for use in URLs. +// [[Rcpp::export]] +CharacterVector paws_url_unencoder(CharacterVector urls) +{ + int input_size = urls.size(); + CharacterVector output(input_size); + + for (int i = 0; i < input_size; ++i) + { + if (urls[i] == NA_STRING) + { + output[i] = NA_STRING; + } + else + { + output[i] = internal_url_unencode(Rcpp::as(urls[i])); + } + if ((i % 10000) == 0) + { + Rcpp::checkUserInterrupt(); + } + } + + return output; } diff --git a/paws.common/src/encoding.h b/paws.common/src/encoding.h new file mode 100644 index 000000000..c16df0f53 --- /dev/null +++ b/paws.common/src/encoding.h @@ -0,0 +1,42 @@ +#ifndef ENCODING_H +#define ENCODING_H + +#include +#include +#include +#include + +// Precomputed hex conversion lookup table +extern const char hex_chars[]; + +// Precomputed unreserved characters lookup table +extern const std::bitset<256> unreserved_chars_map; + +// Function to convert a hex character to its decimal value +inline int from_hex(char ch) +{ + if ('0' <= ch && ch <= '9') + return ch - '0'; + if ('a' <= ch && ch <= 'f') + return ch - 'a' + 10; + if ('A' <= ch && ch <= 'F') + return ch - 'A' + 10; + return -1; // Invalid hex character +} + +// Function to convert a character to its hex representation +inline std::string to_hex(char x) +{ + std::string output; + output.push_back(hex_chars[(x & 0xF0) >> 4]); + output.push_back(hex_chars[x & 0x0F]); + return output; +} + +// Function to escape a URL-encoded string +std::string internal_url_encode(const std::string &url, const std::string &safe); + +// Function to unescape a URL-encoded string +std::string internal_url_unencode(const std::string &url); + +#endif // ENCODING_H \ No newline at end of file diff --git a/paws.common/src/url_parse.cpp b/paws.common/src/url_parse.cpp new file mode 100644 index 000000000..cedb21328 --- /dev/null +++ b/paws.common/src/url_parse.cpp @@ -0,0 +1,286 @@ +#include +#include "encoding.h" + +#include +#include +#include +#include + +using namespace Rcpp; + +// [[Rcpp::export]] +List parse_query_string(std::string query) +{ + // Remove leading '?' if present + if (!query.empty() && query[0] == '?') + { + query.erase(0, 1); + } + + // Return an empty list if the query is empty + if (query.empty()) + { + return List::create(); + } + + std::vector> result_vector; + size_t start = 0, end = 0; + std::string key, value; + + while (start < query.length()) + { + end = query.find('=', start); + if (end == std::string::npos || query[end] == '&') + { + // Handle case where there is no '=' or it's part of an empty key-value pair + key = query.substr(start, query.find('&', start) - start); + value = ""; + start = query.find('&', start); + if (start == std::string::npos) + { + start = query.length(); + } + else + { + ++start; + } + } + else + { + key = query.substr(start, end - start); + start = end + 1; + end = query.find('&', start); + if (end == std::string::npos) + { + value = query.substr(start); + start = query.length(); + } + else + { + value = query.substr(start, end - start); + start = end + 1; + } + } + result_vector.push_back({internal_url_unencode(key), internal_url_unencode(value)}); + } + + List result(result_vector.size()); + CharacterVector names(result_vector.size()); + for (size_t i = 0; i < result_vector.size(); ++i) + { + names[i] = result_vector[i].first; + result[i] = result_vector[i].second; + } + result.attr("names") = names; + + return result; +} + +// Encode a list into a query string. +// # e.g. `list(bar = "baz", foo = "qux")` -> "bar=baz&foo=qux". +std::string decode_query_string(List params) +{ + if (params.size() == 0) + { + return ""; + } + std::string query; + std::vector nv = params.names(); + for (int i = 0; i < params.size(); ++i) + { + std::string key = internal_url_encode(nv[i], ""); + std::string value = internal_url_encode(params[i], ""); + if (!query.empty()) + { + query += "&"; + } + query += key + "=" + value; + } + return query; +} + +class URL +{ +public: + std::string scheme; + std::string user; + std::string password; + std::string host; + std::string raw_path; + std::string path; + std::string raw_query; + std::string fragment; + + // Method to recreate the URL string from components + std::string toString() const + { + if (scheme.empty() && host.empty()) + { + return ""; + } + + std::ostringstream url; + if (!scheme.empty()) + { + url << scheme << "://"; + } + if (!user.empty()) + { + url << user; + if (!password.empty()) + { + url << ":" << password; + } + url << "@"; + } + url << host; + if (!raw_path.empty()) + { + url << raw_path; + } + else + { + url << path; + } + if (!raw_query.empty()) + { + url << "?" << raw_query; + } + if (!fragment.empty()) + { + url << "#" << fragment; + } + return url.str(); + } +}; + +class URLParser +{ +public: + static URL parse(const std::string &url) + { + URL result; + std::string::const_iterator it = url.begin(); + std::string::const_iterator end = url.end(); + + // Parse scheme + static const std::string scheme_delim = "://"; + auto scheme_end = std::search(it, end, scheme_delim.begin(), scheme_delim.end()); + if (scheme_end != end) + { + result.scheme = std::string(it, scheme_end); + it = scheme_end + scheme_delim.size(); // Skip "://" + } + + // Parse user and password (if present) + auto user_info_end = std::find(it, end, '@'); + if (user_info_end != end) + { + std::string user_info(it, user_info_end); + auto colon_pos = user_info.find(':'); + if (colon_pos != std::string::npos) + { + result.user = user_info.substr(0, colon_pos); + result.password = user_info.substr(colon_pos + 1); + } + else + { + result.user = user_info; + } + it = user_info_end + 1; // Skip '@' + } + + // Parse host (including port if present) + auto host_end = std::find_if(it, end, [](char ch) + { return ch == '/' || ch == '?' || ch == '#'; }); + result.host = std::string(it, host_end); + it = host_end; + + // Parse path + if (it != end && *it == '/') + { + auto path_end = std::find_if(it, end, [](char ch) + { return ch == '?' || ch == '#'; }); + result.path = std::string(it, path_end); + it = path_end; + } + + // Parse raw_query + if (it != end && *it == '?') + { + auto query_end = std::find(it, end, '#'); + result.raw_query = std::string(it + 1, query_end); // Skip '?' + it = query_end; + } + + // Parse fragment + if (it != end && *it == '#') + { + result.fragment = std::string(it + 1, end); // Skip '#' + } + + return result; + } +}; + +// [[Rcpp::export]] +Rcpp::List parse_url(const std::string &url) +{ + URL parsed_url = URLParser::parse(url); + std::string raw_path = parsed_url.path; + std::string path; + std::string escaped_path; + std::string raw_query; + + // Ensure raw_path starts with "/" + if (raw_path.empty()) + { + raw_path = "/"; + } + else if (raw_path[0] != '/') + { + raw_path = "/" + raw_path; + } + + path = internal_url_unencode(raw_path); + // escaped path: encodePath + escaped_path = internal_url_encode(raw_path, "$&+,/;:=@"); + + if (escaped_path == raw_path) + { + raw_path = ""; + } + + raw_query = decode_query_string(parse_query_string(parsed_url.raw_query)); + + Rcpp::List result; + result = Rcpp::List::create( + Rcpp::Named("scheme") = parsed_url.scheme, + Rcpp::Named("opaque") = "", + Rcpp::Named("user") = parsed_url.user, + Rcpp::Named("password") = parsed_url.password, + Rcpp::Named("host") = parsed_url.host, + Rcpp::Named("path") = path, + Rcpp::Named("raw_path") = raw_path, + Rcpp::Named("force_query") = false, + Rcpp::Named("raw_query") = raw_query, + Rcpp::Named("fragment") = parsed_url.fragment); + + result.attr("class") = "struct"; + return result; +} + +// [[Rcpp::export]] +std::string build_url(const Rcpp::List &url_components) +{ + URL url; + url.scheme = Rcpp::as(url_components["scheme"]); + url.user = Rcpp::as(url_components["user"]); + url.password = Rcpp::as(url_components["password"]); + url.host = Rcpp::as(url_components["host"]); + url.raw_path = Rcpp::as(url_components["raw_path"]); + url.path = Rcpp::as(url_components["path"]); + url.raw_query = Rcpp::as(url_components["raw_query"]); + url.fragment = Rcpp::as(url_components["fragment"]); + return url.toString(); +} diff --git a/paws.common/tests/testthat/test_credentials.R b/paws.common/tests/testthat/test_credentials.R index 14331e588..e60e8a71c 100644 --- a/paws.common/tests/testthat/test_credentials.R +++ b/paws.common/tests/testthat/test_credentials.R @@ -35,7 +35,7 @@ test_that("credentials not provided", { }) test_that("credentials expired", { - creds <- paws.common:::Creds( + creds <- Creds( access_key_id = "foo", secret_access_key = "bar", expiration = 1000 @@ -43,10 +43,10 @@ test_that("credentials expired", { expect_false(is_credentials_provided(creds)) - creds <- paws.common:::Creds( + creds <- Creds( access_key_id = "foo", secret_access_key = "bar", - expiration = Sys.time() - 5*60 + expiration = Sys.time() - 5 * 60 ) expect_false(is_credentials_provided(creds)) @@ -54,7 +54,7 @@ test_that("credentials expired", { creds <- Creds( access_key_id = "foo", secret_access_key = "bar", - expiration = Sys.time() + 30*60 + expiration = Sys.time() + 30 * 60 ) expect_true(is_credentials_provided(creds)) diff --git a/paws.common/tests/testthat/test_escape.R b/paws.common/tests/testthat/test_escape.R index dfb7d162d..185b78e2d 100644 --- a/paws.common/tests/testthat/test_escape.R +++ b/paws.common/tests/testthat/test_escape.R @@ -54,3 +54,8 @@ test_that("check if json string is converted correctly", { string <- paste0(c(letters, LETTERS, intToUtf8(1:31, multiple = TRUE), "\\", '"', "\b", "\f", "\r", "\t", "\n"), collapse = "") expect_equal(json_convert_string(string), expect) }) + +test_that("check encoding and unencoding", { + expect <- "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._~`!@#$%^&*()=+[{]}\\|;:'\",<>/? " + expect_equal(paws_url_unencoder(escape(expect, "encodePath")), expect) +}) diff --git a/paws.common/tests/testthat/test_url.R b/paws.common/tests/testthat/test_url.R index 03741db1e..c8fb9a835 100644 --- a/paws.common/tests/testthat/test_url.R +++ b/paws.common/tests/testthat/test_url.R @@ -1,23 +1,47 @@ +test_that("parsing complex URL", { + expected <- "https://user_1:password_1@example.org:8080/dir/../api?q=1#frag" + expect_equal(build_url(parse_url(expected)), expected) +}) + test_that("parsing and building URLs", { - input <- "https://example.com/a%20path%20with%20spaces" - actual <- build_url(parse_url(input)) - expected <- input + expected <- "https://example.com/a%20path%20with%20spaces" + actual <- build_url(parse_url(expected)) + expect_equal(actual, expected) + + expected <- "https://example.com/a-path-without-spaces" + actual <- build_url(parse_url(expected)) expect_equal(actual, expected) - input <- "https://example.com/a-path-without-spaces" - actual <- build_url(parse_url(input)) - expected <- input + expected <- "https://example.com/a-path-without-spaces?foo=bar&baz=qux" + actual <- build_url(parse_url(expected)) + expect_equal(actual, expected) + + expected <- "https://example.com/a-path-without-spaces?foo=bar&baz=qux#frag1" + actual <- build_url(parse_url(expected)) expect_equal(actual, expected) }) test_that("parse and build query strings", { # One parameter with a value, one without a value. - input <- "bar=baz&foo=" - actual <- build_query_string(parse_query_string(input)) - expected <- input + expected <- "bar=baz&foo=" + actual <- build_query_string(parse_query_string(expected)) expect_equal(actual, expected) }) +test_that("build_url no scheme or host", { + input <- list( + scheme = "", + host = "", + user = "user", + password = "password", + raw_path = "", + path = "/a-path-without-space", + raw_query = "foo=bar&baz=qux", + fragment = "frag1" + ) + expect_equal(build_url(input), "") +}) + test_that("missing query values become empty strings", { expect_equal(parse_query_string("?q="), list(q = "")) expect_equal(parse_query_string("?q"), list(q = ""))