From ded9a683a3ea33b0bc27d91608f3d28212913c1d Mon Sep 17 00:00:00 2001 From: Joshua James Venter Date: Tue, 9 Jul 2024 22:45:36 +0200 Subject: [PATCH 1/7] Refactor `atol`: StringRef to StringSlice and reusability - StringRef to StringSlice - Introduce _handle_base_prefix & _trim_and_handle_sign - Changed var to alias Signed-off-by: Joshua James Venter --- stdlib/src/builtin/int.mojo | 34 +++-- stdlib/src/builtin/string.mojo | 180 ++++++++++++++++--------- stdlib/src/builtin/string_literal.mojo | 2 +- stdlib/src/utils/stringref.mojo | 2 +- 4 files changed, 141 insertions(+), 77 deletions(-) diff --git a/stdlib/src/builtin/int.mojo b/stdlib/src/builtin/int.mojo index 31e0e4c234..524ea57d84 100644 --- a/stdlib/src/builtin/int.mojo +++ b/stdlib/src/builtin/int.mojo @@ -218,21 +218,39 @@ fn int[T: IntableRaising](value: T) raises -> Int: fn int(value: String, base: Int = 10) raises -> Int: - """Parses the given string as an integer in the given base and returns that value. + """Parses and returns the given string as an integer in the given base. - For example, `atol("19")` returns `19`. If the given string cannot be parsed - as an integer value, an error is raised. For example, `atol("hi")` raises an - error. - - If base is 0 the the string is parsed as an Integer literal, - see: https://docs.python.org/3/reference/lexical_analysis.html#integers + If base is set to 0, the string is parsed as an Integer literal, with the + following considerations: + - '0b' or '0B' prefix indicates binary (base 2) + - '0o' or '0O' prefix indicates octal (base 8) + - '0x' or '0X' prefix indicates hexadecimal (base 16) + - Without a prefix, it's treated as decimal (base 10) Args: value: A string to be parsed as an integer in the given base. base: Base used for conversion, value must be between 2 and 36, or 0. Returns: - An integer value that represents the string, or otherwise raises. + An integer value that represents the string. 
+ + Raises: + If the given string cannot be parsed as an integer value or if an + incorrect base is provided. + + Examples: + >>> int("32") + 32 + >>> int("FF", 16) + 255 + >>> int("0xFF", 0) + 255 + >>> int("0b1010", 0) + 10 + + Notes: + This follows [Python's integer literals]( + https://docs.python.org/3/reference/lexical_analysis.html#integers). """ return atol(value, base) diff --git a/stdlib/src/builtin/string.mojo b/stdlib/src/builtin/string.mojo index adf369e9ff..be05982b2d 100644 --- a/stdlib/src/builtin/string.mojo +++ b/stdlib/src/builtin/string.mojo @@ -212,15 +212,15 @@ fn ascii(value: String) -> String: # ===----------------------------------------------------------------------=== # -fn _atol(str_ref: StringRef, base: Int = 10) raises -> Int: - """Implementation of `atol` for StringRef inputs. +fn _atol(str_slice: StringSlice, base: Int = 10) raises -> Int: + """Implementation of `atol` for StringSlice inputs. Please see its docstring for details. """ if (base != 0) and (base < 2 or base > 36): raise Error("Base must be >= 2 and <= 36, or 0.") - if not str_ref: - raise Error(_atol_error(base, str_ref)) + if not str_slice: + raise Error(_str_to_base_error(base, str_slice)) var real_base: Int var ord_num_max: Int @@ -230,53 +230,23 @@ fn _atol(str_ref: StringRef, base: Int = 10) raises -> Int: var is_negative: Bool = False var has_prefix: Bool = False var start: Int = 0 - var str_len = len(str_ref) - var buff = str_ref.unsafe_ptr() - - for pos in range(start, str_len): - if _isspace(buff[pos]): - continue - - if str_ref[pos] == "-": - is_negative = True - start = pos + 1 - elif str_ref[pos] == "+": - start = pos + 1 - else: - start = pos - break + var str_len = len(str_slice) - if str_ref[start] == "0" and start + 1 < str_len: - if base == 2 and ( - str_ref[start + 1] == "b" or str_ref[start + 1] == "B" - ): - start += 2 - has_prefix = True - elif base == 8 and ( - str_ref[start + 1] == "o" or str_ref[start + 1] == "O" - ): - start += 2 - has_prefix = 
True - elif base == 16 and ( - str_ref[start + 1] == "x" or str_ref[start + 1] == "X" - ): - start += 2 - has_prefix = True + start, is_negative = _trim_and_handle_sign(str_slice, str_len) alias ord_0 = ord("0") - # FIXME: - # Change this to `alias` after fixing support for __getitem__ of alias. - var ord_letter_min = (ord("a"), ord("A")) + alias ord_letter_min = (ord("a"), ord("A")) alias ord_underscore = ord("_") if base == 0: - var real_base_new_start = _identify_base(str_ref, start) + var real_base_new_start = _identify_base(str_slice, start) real_base = real_base_new_start[0] start = real_base_new_start[1] has_prefix = real_base != 10 if real_base == -1: - raise Error(_atol_error(base, str_ref)) + raise Error(_str_to_base_error(base, str_slice)) else: + start, has_prefix = _handle_base_prefix(start, str_slice, str_len, base) real_base = base if real_base <= 10: @@ -288,21 +258,23 @@ fn _atol(str_ref: StringRef, base: Int = 10) raises -> Int: ord("A") + (real_base - 11), ) + var buff = str_slice.unsafe_ptr() var found_valid_chars_after_start = False var has_space_after_number = False + # Prefixed integer literals with real_base 2, 8, 16 may begin with leading # underscores under the conditions they have a prefix - var was_last_digit_undescore = not (real_base in (2, 8, 16) and has_prefix) + var was_last_digit_underscore = not (real_base in (2, 8, 16) and has_prefix) for pos in range(start, str_len): var ord_current = int(buff[pos]) if ord_current == ord_underscore: - if was_last_digit_undescore: - raise Error(_atol_error(base, str_ref)) + if was_last_digit_underscore: + raise Error(_str_to_base_error(base, str_slice)) else: - was_last_digit_undescore = True + was_last_digit_underscore = True continue else: - was_last_digit_undescore = False + was_last_digit_underscore = False if ord_0 <= ord_current <= ord_num_max: result += ord_current - ord_0 found_valid_chars_after_start = True @@ -317,45 +289,101 @@ fn _atol(str_ref: StringRef, base: Int = 10) raises -> 
Int: start = pos + 1 break else: - raise Error(_atol_error(base, str_ref)) + raise Error(_str_to_base_error(base, str_slice)) if pos + 1 < str_len and not _isspace(buff[pos + 1]): var nextresult = result * real_base if nextresult < result: raise Error( - _atol_error(base, str_ref) + _str_to_base_error(base, str_slice) + " String expresses an integer too large to store in Int." ) result = nextresult - if was_last_digit_undescore or (not found_valid_chars_after_start): - raise Error(_atol_error(base, str_ref)) + if was_last_digit_underscore or (not found_valid_chars_after_start): + raise Error(_str_to_base_error(base, str_slice)) if has_space_after_number: for pos in range(start, str_len): if not _isspace(buff[pos]): - raise Error(_atol_error(base, str_ref)) + raise Error(_str_to_base_error(base, str_slice)) if is_negative: result = -result return result -fn _atol_error(base: Int, str_ref: StringRef) -> String: +@always_inline +fn _trim_and_handle_sign(str_slice: StringSlice, str_len: Int) -> (Int, Bool): + """Trims leading whitespace, handles the sign of the number in the string. + + Args: + str_slice: A StringSlice containing the number to parse. + str_len: The length of the string. + + Returns: + A tuple containing: + - The starting index of the number after whitespace and sign. + - A boolean indicating whether the number is negative. + """ + var buff = str_slice.unsafe_ptr() + var start: Int = 0 + while start < str_len and _isspace(buff[start]): + start += 1 + var p: Bool = buff[start] == ord("+") + var n: Bool = buff[start] == ord("-") + return start + (p or n), n + + +@always_inline +fn _handle_base_prefix( + pos: Int, str_slice: StringSlice, str_len: Int, base: Int +) -> (Int, Bool): + """Adjusts the starting position if a valid base prefix is present. + + Handles "0b"/"0B" for base 2, "0o"/"0O" for base 8, and "0x"/"0X" for base + 16. Only adjusts if the base matches the prefix. + + Args: + pos: Current position in the string. 
+ str_slice: The input StringSlice. + str_len: Length of the input string. + base: The specified base. + + Returns: + A tuple containing: + - Updated position after the prefix, if applicable. + - A boolean indicating if the prefix was valid for the given base. + """ + var start = pos + var buff = str_slice.unsafe_ptr() + if start + 1 < str_len: + var prefix_char = chr(int(buff[start + 1])) + if buff[start] == ord("0") and ( + (base == 2 and (prefix_char == "b" or prefix_char == "B")) + or (base == 8 and (prefix_char == "o" or prefix_char == "O")) + or (base == 16 and (prefix_char == "x" or prefix_char == "X")) + ): + start += 2 + return start, start != pos + + +fn _str_to_base_error(base: Int, str_slice: StringSlice) -> String: return ( "String is not convertible to integer with base " + str(base) + ": '" - + str(str_ref) + + str(str_slice) + "'" ) -fn _identify_base(str_ref: StringRef, start: Int) -> Tuple[Int, Int]: - var length = len(str_ref) +fn _identify_base(str_slice: StringSlice, start: Int) -> Tuple[Int, Int]: + var length = len(str_slice) + var buff = str_slice.unsafe_ptr() # just 1 digit, assume base 10 if start == (length - 1): return 10, start - if str_ref[start] == "0": - var second_digit = str_ref[start + 1] + if buff[start] == ord("0"): + var second_digit = chr(int(buff[start + 1])) if second_digit == "b" or second_digit == "B": return 2, start + 2 if second_digit == "o" or second_digit == "O": @@ -365,7 +393,7 @@ fn _identify_base(str_ref: StringRef, start: Int) -> Tuple[Int, Int]: # checking for special case of all "0", "_" are also allowed var was_last_character_underscore = False for i in range(start + 1, length): - if str_ref[i] == "_": + if buff[i] == ord("_"): if was_last_character_underscore: return -1, -1 else: @@ -373,9 +401,9 @@ fn _identify_base(str_ref: StringRef, start: Int) -> Tuple[Int, Int]: continue else: was_last_character_underscore = False - if str_ref[i] != "0": + if buff[i] != ord("0"): return -1, -1 - elif ord("1") <= 
ord(str_ref[start]) <= ord("9"): + elif ord("1") <= int(buff[start]) <= ord("9"): return 10, start else: return -1, -1 @@ -386,21 +414,39 @@ fn _identify_base(str_ref: StringRef, start: Int) -> Tuple[Int, Int]: fn atol(str: String, base: Int = 10) raises -> Int: """Parses and returns the given string as an integer in the given base. - For example, `atol("19")` returns `19`. If base is 0 the the string is - parsed as an Integer literal, see: https://docs.python.org/3/reference/lexical_analysis.html#integers. - - Raises: - If the given string cannot be parsed as an integer value. For example in - `atol("hi")`. + If base is set to 0, the string is parsed as an Integer literal, with the + following considerations: + - '0b' or '0B' prefix indicates binary (base 2) + - '0o' or '0O' prefix indicates octal (base 8) + - '0x' or '0X' prefix indicates hexadecimal (base 16) + - Without a prefix, it's treated as decimal (base 10) Args: str: A string to be parsed as an integer in the given base. base: Base used for conversion, value must be between 2 and 36, or 0. Returns: - An integer value that represents the string, or otherwise raises. + An integer value that represents the string. + + Raises: + If the given string cannot be parsed as an integer value or if an + incorrect base is provided. + + Examples: + >>> atol("32") + 32 + >>> atol("FF", 16) + 255 + >>> atol("0xFF", 0) + 255 + >>> atol("0b1010", 0) + 10 + + Notes: + This follows [Python's integer literals]( + https://docs.python.org/3/reference/lexical_analysis.html#integers). 
""" - return _atol(str._strref_dangerous(), base) + return _atol(str.as_string_slice(), base) fn _atof_error(str_ref: StringRef) -> Error: diff --git a/stdlib/src/builtin/string_literal.mojo b/stdlib/src/builtin/string_literal.mojo index 650904b235..438e973213 100644 --- a/stdlib/src/builtin/string_literal.mojo +++ b/stdlib/src/builtin/string_literal.mojo @@ -210,7 +210,7 @@ struct StringLiteral( Returns: An integer value that represents the string, or otherwise raises. """ - return _atol(self) + return _atol(self.as_string_slice()) @no_inline fn __str__(self) -> String: diff --git a/stdlib/src/utils/stringref.mojo b/stdlib/src/utils/stringref.mojo index 13becfa999..8a4276a738 100644 --- a/stdlib/src/utils/stringref.mojo +++ b/stdlib/src/utils/stringref.mojo @@ -365,7 +365,7 @@ struct StringRef( Returns: An integer value that represents the string, or otherwise raises. """ - return _atol(self) + return atol(self) @always_inline fn __len__(self) -> Int: From 6de98ee97d744c50f102559145aee2068a307e09 Mon Sep 17 00:00:00 2001 From: Joshua James Venter Date: Thu, 4 Jul 2024 20:08:58 +0200 Subject: [PATCH 2/7] Add `stol` function, mirroring `atol` with additional functionality This commit introduces the `stol` (string to long) function, which closely follows the implementation of `atol` while providing extended functionality: - Parses integer literals following `atol`'s logic and base handling (2-36) - Maintains consistency with `atol` in handling base prefixes (0b, 0o, 0x) - Extends `atol` by returning both the parsed integer and remaining string - Stops at the first invalid character instead of raising an error Key differences from `atol`: - Returns a tuple (parsed_int, remaining_string) instead of just an int - Does not raise an error for partially valid inputs This function provides a more flexible parsing option while maintaining consistency with existing string-to-integer conversion in the standard library. 
Signed-off-by: Joshua James Venter --- stdlib/src/builtin/string.mojo | 321 +++++++++++++++++++++++---- stdlib/test/builtin/test_string.mojo | 147 ++++++++++++ 2 files changed, 429 insertions(+), 39 deletions(-) diff --git a/stdlib/src/builtin/string.mojo b/stdlib/src/builtin/string.mojo index ab9c216543..22611091d3 100644 --- a/stdlib/src/builtin/string.mojo +++ b/stdlib/src/builtin/string.mojo @@ -213,6 +213,198 @@ fn ascii(value: String) -> String: # ===----------------------------------------------------------------------=== # +@always_inline +fn _stol(str_ref: StringRef, base: Int = 10) raises -> (Int, StringRef): + """Implementation if `stol` for StringRef inputs. + + Please see its docstring for details. + """ + if (base != 0) and (base < 2 or base > 36): + raise Error("Base must be >= 2 and <= 36, or 0.") + + if not str_ref: + raise Error("Cannot convert empty string to integer.") + + var result: Int = 0 + var real_base: Int + var start: Int = 0 + var is_negative: Bool = False + var str_len = len(str_ref) + var buff = str_ref.unsafe_ptr() + + start, is_negative = _trim_and_handle_sign(str_ref, str_len) + + if start == str_len or not _is_valid_digit(int(buff[start]), base): + return 0, str_ref + + start = _handle_base_prefix(start, str_ref, str_len, base) + + if base == 0: + if start == (str_len - 1): + real_base = 10 + elif str_ref[start] == "0" and start + 1 < str_len: + var second_digit = str_ref[start + 1] + if second_digit == "b" or second_digit == "B": + real_base = 2 + start += 1 # Move past the '0', but not the 'b' + elif second_digit == "o" or second_digit == "O": + real_base = 8 + start += 1 + elif second_digit == "x" or second_digit == "X": + real_base = 16 + start += 1 + else: + real_base = 10 + + # Check if the character after the prefix is valid + if real_base != 10: + if start + 1 < str_len and _is_valid_digit( + int(buff[start + 1]), real_base + ): + start += 1 # Move past the prefix character + else: + # Invalid prefix or digit after prefix 
+ return 0, StringRef( + str_ref.unsafe_ptr() + start, str_len - start + ) + else: + real_base = 10 + else: + real_base = base + + var ord_num_max: Int + var ord_letter_max = (-1, -1) + alias ord_0 = ord("0") + var ord_letter_min = (ord("a"), ord("A")) + alias ord_underscore = ord("_") + + if real_base <= 10: + ord_num_max = ord(str(real_base - 1)) + else: + ord_num_max = ord("9") + ord_letter_max = ( + ord("a") + (real_base - 11), + ord("A") + (real_base - 11), + ) + + var was_last_digit_underscore = True + var prev_result: Int = 0 + for pos in range(start, str_len): + prev_result = result + var ord_current = int(buff[pos]) + if ord_current == ord_underscore and was_last_digit_underscore: + break # Break out as apposed to raising exception + if ord_current == ord_underscore: + was_last_digit_underscore = True + continue + + was_last_digit_underscore = False + + var digit_value: Int + if ord_0 <= ord_current <= ord_num_max: + digit_value = ord_current - ord_0 + elif ord_letter_min[0] <= ord_current <= ord_letter_max[0]: + digit_value = ord_current - ord_letter_min[0] + 10 + elif ord_letter_min[1] <= ord_current <= ord_letter_max[1]: + digit_value = ord_current - ord_letter_min[1] + 10 + else: + break + + if digit_value >= real_base: + break + + var new_result = result * real_base + digit_value + if new_result < result: + raise Error( + _str_to_base_error(real_base, str_ref) + + " String expresses an integer too large to store in Int." + ) + result = new_result + start = pos + 1 + + if is_negative: + result = -result + + return result, StringRef(str_ref.unsafe_ptr() + start, str_len - start) + + +fn stol(str: String, base: Int = 10) raises -> (Int, String): + """Convert a string to a integer and return the remaining unparsed string. + + Similar to `atol`, but `stol` parses only a portion of the string and returns + both the parsed integer and the remaining unparsed part. For example, `stol("32abc")` returns `(32, "abc")`. 
+ + If base is 0, the string is parsed as an [Integer literal][1], with the following considerations: + - '0b' or '0B' prefix indicates binary (base 2) + - '0o' or '0O' prefix indicates octal (base 8) + - '0x' or '0X' prefix indicates hexadecimal (base 16) + - Without a prefix, it's treated as decimal (base 10) + + Raises: + If the base is invalid or if the string is empty. + + Args: + str: A string to be parsed as an integer in the given base. + base: Base used for conversion, value must be between 2 and 36, or 0. + + Returns: + A tuple containing: + - An integer value representing the parsed part of the string. + - The remaining unparsed part of the string. + + Examples: + >>> stol("19abc") + (19, "abc") + >>> stol("0xFF hello", 16) + (255, " hello") + >>> stol("0x123ghi", 0) + (291, "ghi") + >>> stol("0b1010 binary", 0) + (10, " binary") + >>> stol("0o123 octal", 0) + (83, " octal") + + See Also: + `atol`: A similar function that parses the entire string and returns an integer. + + [1]: https://docs.python.org/3/reference/lexical_analysis.html#integers. + + """ + var result: Int + var remaining: StringRef + result, remaining = _stol(str._strref_dangerous(), base) + + return result, String(remaining) + + +@always_inline +fn _is_valid_digit(char: UInt8, base: Int) -> Bool: + """Checks if a character is a valid digit for the given base. + + Args: + char: The character to check, as a UInt8. + base: The numeric base (0-36, where 0 is special case). + + Returns: + True if the character is a valid digit for the given base, False otherwise. 
+ """ + if base == 0: + # For base 0, we need to allow 0-9 and a-f/A-F for potential hex numbers + if char >= ord("0") and char <= ord("9"): + return True + var upper_char = char & ~32 # Convert to uppercase + return upper_char >= ord("A") and upper_char <= ord("F") + + if char >= ord("0") and char <= ord("9"): + return (char - ord("0")) < base + if base <= 10: + return False + var upper_char = char & ~32 # Convert to uppercase + if upper_char >= ord("A") and upper_char <= ord("Z"): + return (upper_char - ord("A") + 10) < base + return False + + fn _atol(str_ref: StringRef, base: Int = 10) raises -> Int: """Implementation of `atol` for StringRef inputs. @@ -221,7 +413,7 @@ fn _atol(str_ref: StringRef, base: Int = 10) raises -> Int: if (base != 0) and (base < 2 or base > 36): raise Error("Base must be >= 2 and <= 36, or 0.") if not str_ref: - raise Error(_atol_error(base, str_ref)) + raise Error(_str_to_base_error(base, str_ref)) var real_base: Int var ord_num_max: Int @@ -234,35 +426,10 @@ fn _atol(str_ref: StringRef, base: Int = 10) raises -> Int: var str_len = len(str_ref) var buff = str_ref.unsafe_ptr() - for pos in range(start, str_len): - if _isspace(buff[pos]): - continue + start, is_negative = _trim_and_handle_sign(str_ref, str_len) - if str_ref[pos] == "-": - is_negative = True - start = pos + 1 - elif str_ref[pos] == "+": - start = pos + 1 - else: - start = pos - break - if str_ref[start] == "0" and start + 1 < str_len: - if base == 2 and ( - str_ref[start + 1] == "b" or str_ref[start + 1] == "B" - ): - start += 2 - has_prefix = True - elif base == 8 and ( - str_ref[start + 1] == "o" or str_ref[start + 1] == "O" - ): - start += 2 - has_prefix = True - elif base == 16 and ( - str_ref[start + 1] == "x" or str_ref[start + 1] == "X" - ): - start += 2 - has_prefix = True + start = _handle_base_prefix(start, str_ref, str_len, base) alias ord_0 = ord("0") # FIXME: @@ -276,7 +443,7 @@ fn _atol(str_ref: StringRef, base: Int = 10) raises -> Int: start = 
real_base_new_start[1] has_prefix = real_base != 10 if real_base == -1: - raise Error(_atol_error(base, str_ref)) + raise Error(_str_to_base_error(base, str_ref)) else: real_base = base @@ -298,7 +465,7 @@ fn _atol(str_ref: StringRef, base: Int = 10) raises -> Int: var ord_current = int(buff[pos]) if ord_current == ord_underscore: if was_last_digit_undescore: - raise Error(_atol_error(base, str_ref)) + raise Error(_str_to_base_error(base, str_ref)) else: was_last_digit_undescore = True continue @@ -318,29 +485,96 @@ fn _atol(str_ref: StringRef, base: Int = 10) raises -> Int: start = pos + 1 break else: - raise Error(_atol_error(base, str_ref)) + raise Error(_str_to_base_error(base, str_ref)) if pos + 1 < str_len and not _isspace(buff[pos + 1]): var nextresult = result * real_base if nextresult < result: raise Error( - _atol_error(base, str_ref) + _str_to_base_error(base, str_ref) + " String expresses an integer too large to store in Int." ) result = nextresult if was_last_digit_undescore or (not found_valid_chars_after_start): - raise Error(_atol_error(base, str_ref)) + raise Error(_str_to_base_error(base, str_ref)) if has_space_after_number: for pos in range(start, str_len): if not _isspace(buff[pos]): - raise Error(_atol_error(base, str_ref)) + raise Error(_str_to_base_error(base, str_ref)) if is_negative: result = -result return result -fn _atol_error(base: Int, str_ref: StringRef) -> String: +@always_inline +fn _trim_and_handle_sign(str_ref: StringRef, str_len: Int) -> (Int, Bool): + """Trims leading whitespace and handles the sign of the number in the string. + + Args: + str_ref: A StringRef containing the number to parse. + str_len: The length of the string. + + Returns: + A tuple containing: + - The starting index of the number after whitespace and sign. + - A boolean indicating whether the number is negative. 
+ """ + var buff = str_ref.unsafe_ptr() + var is_negative: Bool = False + var start: Int = 0 + for pos in range(start, str_len): + if _isspace(buff[pos]): + continue + + if str_ref[pos] == "-": + is_negative = True + start = pos + 1 + elif str_ref[pos] == "+": + start = pos + 1 + else: + start = pos + break + + return start, is_negative + + +@always_inline +fn _handle_base_prefix( + pos: Int, str_ref: StringRef, str_len: Int, base: Int +) -> Int: + """Adjusts the starting position if a valid base prefix is present. + + Handles "0b"/"0B" for base 2, "0o"/"0O" for base 8, and "0x"/"0X" for base 16. + Only adjusts if the base matches the prefix. + + Args: + pos: Current position in the string. + str_ref: The input string. + str_len: Length of the input string. + base: The specified base. + + Returns: + Updated position after the prefix, if applicable. + """ + var start = pos + if str_ref[start] == "0" and start + 1 < str_len: + if base == 2 and ( + str_ref[start + 1] == "b" or str_ref[start + 1] == "B" + ): + start += 2 + elif base == 8 and ( + str_ref[start + 1] == "o" or str_ref[start + 1] == "O" + ): + start += 2 + elif base == 16 and ( + str_ref[start + 1] == "x" or str_ref[start + 1] == "X" + ): + start += 2 + return start + + +fn _str_to_base_error(base: Int, str_ref: StringRef) -> String: return ( "String is not convertible to integer with base " + str(base) @@ -387,12 +621,16 @@ fn _identify_base(str_ref: StringRef, start: Int) -> Tuple[Int, Int]: fn atol(str: String, base: Int = 10) raises -> Int: """Parses and returns the given string as an integer in the given base. - For example, `atol("19")` returns `19`. If base is 0 the the string is - parsed as an Integer literal, see: https://docs.python.org/3/reference/lexical_analysis.html#integers. + For example, `atol("32")` returns `32`. 
If base is 0, the string is parsed as an [Integer literal][1], with the following considerations: + - '0b' or '0B' prefix indicates binary (base 2) + - '0o' or '0O' prefix indicates octal (base 8) + - '0x' or '0X' prefix indicates hexadecimal (base 16) + - Without a prefix, it's treated as decimal (base 10) Raises: - If the given string cannot be parsed as an integer value. For example in - `atol("hi")`. + - If the given string cannot be parsed as an integer value. For example in + `atol("Mojo")`, + - Incorrect base is provided. Args: str: A string to be parsed as an integer in the given base. @@ -400,6 +638,11 @@ fn atol(str: String, base: Int = 10) raises -> Int: Returns: An integer value that represents the string, or otherwise raises. + + See Also: + `stol`: A similar function that returns both the parsed integer and the remaining unparsed string. + + [1]: https://docs.python.org/3/reference/lexical_analysis.html#integers. """ return _atol(str._strref_dangerous(), base) diff --git a/stdlib/test/builtin/test_string.mojo b/stdlib/test/builtin/test_string.mojo index 6341bce169..08b6a0912a 100644 --- a/stdlib/test/builtin/test_string.mojo +++ b/stdlib/test/builtin/test_string.mojo @@ -341,6 +341,151 @@ def test_string_indexing(): assert_equal("H", str[-50::50]) +def test_stol(): + var result: Int + var remaining: String + + # base 10 + result, remaining = stol(String("375 ABC")) + assert_equal(375, result) + assert_equal(" ABC", remaining) + result, remaining = stol(String(" 005")) + assert_equal(5, result) + assert_equal("", remaining) + result, remaining = stol(String(" 013 ")) + assert_equal(13, result) + assert_equal(" ", remaining) + result, remaining = stol(String("-89")) + assert_equal(-89, result) + assert_equal("", remaining) + result, remaining = stol(String(" -52")) + assert_equal(-52, result) + assert_equal("", remaining) + + # other bases + result, remaining = stol(" FF", 16) + assert_equal(255, result) + assert_equal("", remaining) + result, remaining 
= stol(" 0xff ", 16) + assert_equal(255, result) + assert_equal(" ", remaining) + result, remaining = stol("10010eighteen18", 2) + assert_equal(18, result) + assert_equal("eighteen18", remaining) + result, remaining = stol("0b10010", 2) + assert_equal(18, result) + assert_equal("", remaining) + result, remaining = stol("0o12", 8) + assert_equal(10, result) + assert_equal("", remaining) + result, remaining = stol("Z", 36) + assert_equal(35, result) + assert_equal("", remaining) + + # test with trailing characters + result, remaining = stol("123abc") + assert_equal(123, result) + assert_equal("abc", remaining) + result, remaining = stol("-45def") + assert_equal(-45, result) + assert_equal("def", remaining) + result, remaining = stol("0xffghi", 0) + assert_equal(255, result) + assert_equal("ghi", remaining) + + result, remaining = stol(" ") + assert_equal(0, result) + assert_equal(" ", remaining) + + result, remaining = stol("123.456", 10) + assert_equal(123, result) + assert_equal(".456", remaining) + result, remaining = stol("--123", 10) + assert_equal(0, result) + assert_equal("--123", remaining) + + result, remaining = stol("12a34", 10) + assert_equal(12, result) + assert_equal("a34", remaining) + result, remaining = stol("1G5", 16) + assert_equal(1, result) + assert_equal("G5", remaining) + + result, remaining = stol("-1A", 16) + assert_equal(-26, result) + assert_equal("", remaining) + result, remaining = stol("-110", 2) + assert_equal(-6, result) + assert_equal("", remaining) + + result, remaining = stol("Mojo!") + assert_equal(0, result) + assert_equal("Mojo!", remaining) + + # Negative Cases + with assert_raises(contains="Cannot convert empty string to integer."): + _ = stol("") + + with assert_raises(contains="Base must be >= 2 and <= 36, or 0."): + _ = stol("Bad Base", 42) + + with assert_raises( + contains="String expresses an integer too large to store in Int." 
+ ): + _ = stol(String("9223372036854775832"), 10) + + +def test_stol_base_0(): + var result: Int + var remaining: String + + result, remaining = stol("155_155", 0) + assert_equal(155155, result) + assert_equal("", remaining) + result, remaining = stol("1_2_3_4_5", 0) + assert_equal(12345, result) + assert_equal("", remaining) + result, remaining = stol("1_2_3_4_5_", 0) + assert_equal(12345, result) + assert_equal("_", remaining) + result, remaining = stol("0b1_0_1_0", 0) + assert_equal(10, result) + assert_equal("", remaining) + result, remaining = stol("0o1_2_3", 0) + assert_equal(83, result) + assert_equal("", remaining) + result, remaining = stol("0x1_A_B", 0) + assert_equal(427, result) + assert_equal("", remaining) + result, remaining = stol("123_", 0) + assert_equal(123, result) + assert_equal("_", remaining) + result, remaining = stol("_123", 0) + assert_equal(0, result) + assert_equal("_123", remaining) + result, remaining = stol("123__456", 0) + assert_equal(123, result) + assert_equal("__456", remaining) + result, remaining = stol("0x_123", 0) + assert_equal(0, result) + assert_equal("x_123", remaining) + result, remaining = stol("0x1_23", 0) + assert_equal(291, result) + assert_equal("", remaining) + result, remaining = stol("0_123", 0) + assert_equal(123, result) + assert_equal("", remaining) + result, remaining = stol("0z123", 0) + assert_equal(0, result) + assert_equal("z123", remaining) + result, remaining = stol("Mojo!", 0) + assert_equal(0, result) + assert_equal("Mojo!", remaining) + result, remaining = stol("0o123 octal", 0) + assert_equal(83, result) + assert_equal(" octal", remaining) + + def test_atol(): # base 10 assert_equal(375, atol(String("375"))) @@ -1602,6 +1747,8 @@ def main(): test_ord() test_chr() test_string_indexing() + test_stol() + test_stol_base_0() test_atol() test_atol_base_0() test_atof() From 2ad996567723843553e44ec0b2cefb83eb9c74ff Mon Sep 17 00:00:00 2001 From: Joshua James Venter 
<67124214+jjvraw@users.noreply.github.com> Date: Fri, 5 Jul 2024 22:28:19 +0200 Subject: [PATCH 3/7] Apply suggestions from @martinvuyk Co-authored-by: martinvuyk <110240700+martinvuyk@users.noreply.github.com> Signed-off-by: Joshua James Venter <67124214+jjvraw@users.noreply.github.com> --- stdlib/src/builtin/string.mojo | 16 ++++++++++++---- stdlib/test/builtin/test_string.mojo | 21 ++++++++++++++++++--- 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/stdlib/src/builtin/string.mojo b/stdlib/src/builtin/string.mojo index 22611091d3..ad0a60e01f 100644 --- a/stdlib/src/builtin/string.mojo +++ b/stdlib/src/builtin/string.mojo @@ -287,10 +287,8 @@ fn _stol(str_ref: StringRef, base: Int = 10) raises -> (Int, StringRef): ord("A") + (real_base - 11), ) - var was_last_digit_underscore = True - var prev_result: Int = 0 + var was_last_digit_underscore = True if real_base == 10 else False for pos in range(start, str_len): - prev_result = result var ord_current = int(buff[pos]) if ord_current == ord_underscore and was_last_digit_underscore: break # Break out as apposed to raising exception @@ -314,7 +312,7 @@ fn _stol(str_ref: StringRef, base: Int = 10) raises -> (Int, StringRef): break var new_result = result * real_base + digit_value - if new_result < result: + if new_result <= result and result > 0: raise Error( _str_to_base_error(real_base, str_ref) + " String expresses an integer too large to store in Int." @@ -339,6 +337,9 @@ fn stol(str: String, base: Int = 10) raises -> (Int, String): - '0o' or '0O' prefix indicates octal (base 8) - '0x' or '0X' prefix indicates hexadecimal (base 16) - Without a prefix, it's treated as decimal (base 10) + Notes: + This follows [Python's integer literals](\ + https://docs.python.org/3/reference/lexical_analysis.html#integers) Raises: If the base is invalid or if the string is empty. @@ -353,6 +354,13 @@ fn stol(str: String, base: Int = 10) raises -> (Int, String): - The remaining unparsed part of the string. 
Examples: + ```mojo + print(stol("19abc")) # (19, "abc") + print(stol("0xFF hello", 16)) # (255, " hello") + print(stol("0x123ghi", 0)) # (291, "ghi") + print(stol("0b1010 binary", 0)) # (10, " binary") + print(stol("0o123 octal", 0)) # (83, " octal") + ``` >>> stol("19abc") (19, "abc") >>> stol("0xFF hello", 16) diff --git a/stdlib/test/builtin/test_string.mojo b/stdlib/test/builtin/test_string.mojo index 08b6a0912a..a35da14aba 100644 --- a/stdlib/test/builtin/test_string.mojo +++ b/stdlib/test/builtin/test_string.mojo @@ -374,9 +374,21 @@ def test_stol(): assert_equal("eighteen18", remaining) result, remaining = stol("0b10010", 2) assert_equal(18, result) + result, remaining = stol("0b_10010", 2) + assert_equal(18, result) + result, remaining = stol("0b_0010010", 2) + assert_equal(18, result) + result, remaining = stol("0b0000_0_010010", 2) + assert_equal(18, result) assert_equal("", remaining) result, remaining = stol("0o12", 8) assert_equal(10, result) + result, remaining = stol("0o_12", 8) + assert_equal(10, result) + result, remaining = stol("0o_012", 8) + assert_equal(10, result) + result, remaining = stol("0o0000_0_0012", 8) + assert_equal(10, result) assert_equal("", remaining) result, remaining = stol("Z", 36) assert_equal(35, result) @@ -391,6 +403,12 @@ def test_stol(): assert_equal("def", remaining) result, remaining = stol("0xffghi", 0) assert_equal(255, result) + result, remaining = stol("0x_ffghi", 0) + assert_equal(255, result) + result, remaining = stol("0x_0ffghi", 0) + assert_equal(255, result) + result, remaining = stol("0x0000_0_00ffghi", 0) + assert_equal(255, result) assert_equal("ghi", remaining) result, remaining = stol(" ") @@ -466,9 +484,6 @@ def test_stol_base_0(): result, remaining = stol("123__456", 0) assert_equal(123, result) assert_equal("__456", remaining) - result, remaining = stol("0x_123", 0) - assert_equal(0, result) - assert_equal("x_123", remaining) result, remaining = stol("0x1_23", 0) assert_equal(291, result) 
assert_equal("", remaining) From 4500c21648ea313a99565f26f84eaafcbb566507 Mon Sep 17 00:00:00 2001 From: Joshua James Venter Date: Fri, 2 Aug 2024 10:33:34 +0200 Subject: [PATCH 4/7] format Signed-off-by: Joshua James Venter --- stdlib/src/builtin/string.mojo | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/stdlib/src/builtin/string.mojo b/stdlib/src/builtin/string.mojo index 7b6de43dfe..74050c22a8 100644 --- a/stdlib/src/builtin/string.mojo +++ b/stdlib/src/builtin/string.mojo @@ -263,9 +263,7 @@ fn _stol(str_slice: StringSlice, base: Int = 10) raises -> (Int, String): start += 1 # Move past the prefix character else: # Invalid prefix or digit after prefix - return 0, String( - str_slice.unsafe_ptr() + start - ) + return 0, String(str_slice.unsafe_ptr() + start) else: real_base = 10 has_prefix = real_base != 10 From d7328b7f0fda2c6786812c24d5402fc8a620f422 Mon Sep 17 00:00:00 2001 From: Joshua James Venter Date: Sat, 30 Nov 2024 20:10:22 +0200 Subject: [PATCH 5/7] clean up with identify_base Signed-off-by: Joshua James Venter --- stdlib/src/collections/string.mojo | 43 +++++++++--------------------- 1 file changed, 13 insertions(+), 30 deletions(-) diff --git a/stdlib/src/collections/string.mojo b/stdlib/src/collections/string.mojo index 6afbcd47a3..ae89a57603 100644 --- a/stdlib/src/collections/string.mojo +++ b/stdlib/src/collections/string.mojo @@ -235,7 +235,7 @@ fn ascii(value: String) -> String: @always_inline fn _stol(str_slice: StringSlice, base: Int = 10) raises -> (Int, String): - """Implementation if `stol` for StringRef inputs. + """Implementation if `stol` for StringSlice inputs. Please see its docstring for details.
""" @@ -259,37 +259,20 @@ fn _stol(str_slice: StringSlice, base: Int = 10) raises -> (Int, String): return 0, String(str_slice) if base == 0: - if start == (str_len - 1): + var real_base_new_start = _identify_base(str_slice, start) + real_base = real_base_new_start[0] + + # If identify_base returns error but starts with 0, treat as base 10 + if real_base == -1 and buff[start] == ord("0"): real_base = 10 - elif buff[start] == ord("0") and start + 1 < str_len: - var second_digit = chr(int(buff[start + 1])) - if second_digit == "b" or second_digit == "B": - real_base = 2 - start += 1 # Move past the '0', but not the 'b' - elif second_digit == "o" or second_digit == "O": - real_base = 8 - start += 1 - elif second_digit == "x" or second_digit == "X": - real_base = 16 - start += 1 - else: - real_base = 10 - - # Check if the character after the prefix is valid - if real_base != 10: - if start + 1 < str_len and _is_valid_digit( - int(buff[start + 1]), real_base - ): - start += 1 # Move past the prefix character - else: - # Invalid prefix or digit after prefix - return 0, String( - StringSlice( - unsafe_from_utf8=str_slice.as_bytes()[start:] - ) - ) + # Keep original start position for base 10 else: - real_base = 10 + # For valid prefixes, use the new start position + if real_base != -1: + start = real_base_new_start[1] + else: + return 0, String(str_slice) + has_prefix = real_base != 10 else: start, has_prefix = _handle_base_prefix(start, str_slice, str_len, base) From 890c56e6770756d7819fc18cf00251437605258a Mon Sep 17 00:00:00 2001 From: Joshua James Venter Date: Sat, 30 Nov 2024 20:12:32 +0200 Subject: [PATCH 6/7] typo Signed-off-by: Joshua James Venter --- stdlib/src/collections/string.mojo | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/stdlib/src/collections/string.mojo b/stdlib/src/collections/string.mojo index ae89a57603..d9fdb5a8eb 100644 --- a/stdlib/src/collections/string.mojo +++ b/stdlib/src/collections/string.mojo @@ -235,7 +235,7 @@ fn 
ascii(value: String) -> String: @always_inline fn _stol(str_slice: StringSlice, base: Int = 10) raises -> (Int, String): - """Implementation if `stol` for StringSlice inputs. + """Implementation of `stol` for StringSlice inputs. Please see its docstring for details. """ @@ -424,7 +424,6 @@ fn _atol(str_slice: StringSlice, base: Int = 10) raises -> Int: """ if (base != 0) and (base < 2 or base > 36): raise Error("Base must be >= 2 and <= 36, or 0.") - if not str_slice: raise Error(_str_to_base_error(base, str_slice)) @@ -512,10 +511,8 @@ fn _atol(str_slice: StringSlice, base: Int = 10) raises -> Int: for pos in range(start, str_len): if not _isspace(buff[pos]): raise Error(_str_to_base_error(base, str_slice)) - if is_negative: result = -result - return result From 3f0624b8a64489b07478487bed05f0d50bdaa0ee Mon Sep 17 00:00:00 2001 From: Joshua James Venter Date: Sun, 1 Dec 2024 09:08:29 +0200 Subject: [PATCH 7/7] clean up Signed-off-by: Joshua James Venter --- stdlib/src/collections/string.mojo | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/stdlib/src/collections/string.mojo b/stdlib/src/collections/string.mojo index d9fdb5a8eb..b896b534e8 100644 --- a/stdlib/src/collections/string.mojo +++ b/stdlib/src/collections/string.mojo @@ -268,10 +268,7 @@ fn _stol(str_slice: StringSlice, base: Int = 10) raises -> (Int, String): # Keep original start position for base 10 else: # For valid prefixes, use the new start position - if real_base != -1: - start = real_base_new_start[1] - else: - return 0, String(str_slice) + start = real_base_new_start[1] has_prefix = real_base != 10 else: