Skip to content

Commit

Permalink
refactor(allocator): String type (#8568)
Browse files Browse the repository at this point in the history
Wrap `bumpalo::collections::String` in a new type instead of exporting it directly. This opens the door to:

1. Replacing it with our own `String` type which wraps our `Vec` type, rather than having 2 different implementations of `Vec` (`String` is just a wrapper around `Vec`, but a *different* `Vec` implementation).
2. Adding additional methods to `String` (`String::from_utf8` added in this PR).
  • Loading branch information
overlookmotel committed Jan 18, 2025
1 parent 93df57f commit ac05134
Show file tree
Hide file tree
Showing 7 changed files with 262 additions and 21 deletions.
2 changes: 1 addition & 1 deletion crates/oxc_allocator/src/convert.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ impl<'a> FromIn<'a, String> for crate::String<'a> {
impl<'a> FromIn<'a, String> for &'a str {
#[inline(always)]
fn from_in(value: String, allocator: &'a Allocator) -> Self {
crate::String::from_str_in(value.as_str(), allocator).into_bump_str()
allocator.alloc_str(value.as_str())
}
}

Expand Down
3 changes: 2 additions & 1 deletion crates/oxc_allocator/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@ use std::{
ops::{Deref, DerefMut},
};

pub use bumpalo::collections::String;
use bumpalo::Bump;

mod address;
Expand All @@ -53,13 +52,15 @@ mod boxed;
mod clone_in;
mod convert;
pub mod hash_map;
pub mod string;
mod vec;

pub use address::{Address, GetAddress};
pub use boxed::Box;
pub use clone_in::CloneIn;
pub use convert::{FromIn, IntoIn};
pub use hash_map::HashMap;
pub use string::String;
pub use vec::Vec;

/// A bump-allocated memory arena based on [bumpalo].
Expand Down
249 changes: 249 additions & 0 deletions crates/oxc_allocator/src/string.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,249 @@
//! Arena String.
//!
//! See [`String`] for more details.
// All methods which just delegate to `bumpalo::collections::String` methods marked `#[inline(always)]`
#![expect(clippy::inline_always)]

use std::{
fmt::{self, Debug, Display},
hash::{Hash, Hasher},
mem::ManuallyDrop,
ops::{Deref, DerefMut},
};

use bumpalo::collections::String as BumpaloString;
use simdutf8::basic::from_utf8;
pub use simdutf8::basic::Utf8Error;

use crate::{Allocator, Vec};

/// Arena String.
///
/// UTF-8 encoded, growable string. Identical to [`std::string::String`] except that it stores
/// string contents in arena allocator.
///
/// This is a newtype around `bumpalo::collections::String`. The wrapped string's methods
/// are reachable through the [`Deref`] / [`DerefMut`] impls in this module.
// `PartialEq` and `Hash` are implemented by hand further down in this module,
// so only the remaining comparison traits are derived here.
#[derive(PartialOrd, Eq, Ord)]
pub struct String<'alloc>(BumpaloString<'alloc>);

impl<'alloc> String<'alloc> {
    /// Create an empty [`String`] backed by `allocator`.
    ///
    /// No initial buffer is allocated, so this call is very inexpensive. The flip side is
    /// that the `String` may need to re-allocate repeatedly once data is added. If you have
    /// an idea of how much data it will hold, prefer [`with_capacity_in`] to prevent
    /// excessive re-allocation.
    ///
    /// [`with_capacity_in`]: String::with_capacity_in
    #[inline(always)]
    pub fn new_in(allocator: &'alloc Allocator) -> String<'alloc> {
        let inner = BumpaloString::new_in(allocator);
        Self(inner)
    }

    /// Create an empty [`String`] whose buffer can already hold `capacity` bytes.
    ///
    /// A `String` keeps its data in an internal buffer. The capacity is the length of that
    /// buffer, and can be queried with the `capacity` method. Reserving space up front is
    /// useful when a lot of data is about to be appended, because it reduces the number of
    /// reallocations required.
    ///
    /// If `capacity` is `0`, no allocation occurs and this method is identical to
    /// [`new_in`].
    ///
    /// [`new_in`]: String::new_in
    #[inline(always)]
    pub fn with_capacity_in(capacity: usize, allocator: &'alloc Allocator) -> String<'alloc> {
        let inner = BumpaloString::with_capacity_in(capacity, allocator);
        Self(inner)
    }

    /// Build a new [`String`] in the arena from the contents of a string slice.
    ///
    /// # Examples
    ///
    /// ```
    /// use oxc_allocator::{Allocator, String};
    ///
    /// let allocator = Allocator::default();
    ///
    /// let s = String::from_str_in("hello", &allocator);
    /// assert_eq!(s, "hello");
    /// ```
    #[inline(always)]
    pub fn from_str_in(s: &str, allocator: &'alloc Allocator) -> String<'alloc> {
        let inner = BumpaloString::from_str_in(s, allocator);
        Self(inner)
    }

    /// Convert a `Vec<u8>` into a [`String`], validating its contents first.
    ///
    /// # Errors
    /// Returns [`Err`] if the `Vec` does not comprise a valid UTF-8 string.
    pub fn from_utf8(bytes: Vec<'alloc, u8>) -> Result<String<'alloc>, Utf8Error> {
        // Bail out early unless the bytes form valid UTF-8.
        from_utf8(&bytes)?;
        // SAFETY: Validation above succeeded, so `bytes` is a valid UTF-8 string
        let string = unsafe { Self::from_utf8_unchecked(bytes) };
        Ok(string)
    }

    /// Convert a `Vec<u8>` into a [`String`] without checking that the bytes are valid UTF-8.
    ///
    /// The contents of the `Vec` are not copied; the conversion happens in place.
    /// This is a zero-cost operation.
    ///
    /// # SAFETY
    /// Caller must ensure this `Vec<u8>` comprises a valid UTF-8 string.
    //
    // `#[inline(always)]` because this is a no-op at runtime
    #[expect(clippy::missing_safety_doc, clippy::unnecessary_safety_comment)]
    #[inline(always)]
    pub unsafe fn from_utf8_unchecked(bytes: Vec<'alloc, u8>) -> String<'alloc> {
        // `bumpalo::String::from_utf8_unchecked` is not usable here: it wants a
        // `bumpalo::collections::Vec`, whereas our inner `Vec` type is `allocator_api2::vec::Vec`.
        // SAFETY: Conversion is safe because both types store data in arena in same way.
        // Lifetime of returned `String` is same as lifetime of original `Vec<u8>`.
        let (ptr, len, capacity, bump) =
            ManuallyDrop::into_inner(bytes.0).into_raw_parts_with_alloc();
        Self(BumpaloString::from_raw_parts_in(ptr, len, capacity, bump))
    }

    /// Assemble a [`String`] from a raw pointer, a length, and a capacity.
    ///
    /// # SAFETY
    ///
    /// This is highly unsafe, because none of the following invariants are checked:
    ///
    /// * The memory at `ptr` needs to have been previously allocated by the same [`Allocator`].
    /// * `length` needs to be less than or equal to `capacity`.
    /// * `capacity` needs to be the correct value.
    ///
    /// Violating these may cause problems like corrupting the allocator's internal data structures.
    ///
    /// Ownership of `ptr` is effectively transferred to the `String`, which may then
    /// deallocate, reallocate or change the contents of the memory it points to at will.
    /// Ensure that nothing else uses the pointer after calling this function.
    ///
    /// # Examples
    ///
    /// Basic usage:
    ///
    /// ```
    /// use std::mem;
    /// use oxc_allocator::{Allocator, String};
    ///
    /// let allocator = Allocator::default();
    ///
    /// unsafe {
    ///     let mut s = String::from_str_in("hello", &allocator);
    ///     let ptr = s.as_mut_ptr();
    ///     let len = s.len();
    ///     let capacity = s.capacity();
    ///
    ///     mem::forget(s);
    ///
    ///     let s = String::from_raw_parts_in(ptr, len, capacity, &allocator);
    ///
    ///     assert_eq!(s, "hello");
    /// }
    /// ```
    #[expect(clippy::missing_safety_doc, clippy::unnecessary_safety_comment)]
    #[inline(always)]
    pub unsafe fn from_raw_parts_in(
        buf: *mut u8,
        length: usize,
        capacity: usize,
        allocator: &'alloc Allocator,
    ) -> String<'alloc> {
        // SAFETY: Safety conditions of this method are the same as `BumpaloString`'s method
        let inner = BumpaloString::from_raw_parts_in(buf, length, capacity, allocator);
        Self(inner)
    }

    /// Consume this `String<'alloc>` and return its contents as an `&'alloc str`.
    /// This is analogous to [`std::string::String::into_boxed_str`].
    ///
    /// # Example
    ///
    /// ```
    /// use oxc_allocator::{Allocator, String};
    ///
    /// let allocator = Allocator::default();
    ///
    /// let s = String::from_str_in("foo", &allocator);
    /// assert_eq!(s.into_bump_str(), "foo");
    /// ```
    #[inline(always)]
    pub fn into_bump_str(self) -> &'alloc str {
        let Self(inner) = self;
        inner.into_bump_str()
    }
}

// Provide access to all `bumpalo::String`'s methods via deref
impl<'alloc> Deref for String<'alloc> {
type Target = BumpaloString<'alloc>;

#[inline]
fn deref(&self) -> &Self::Target {
&self.0
}
}

impl<'alloc> DerefMut for String<'alloc> {
#[inline]
fn deref_mut(&mut self) -> &mut BumpaloString<'alloc> {
&mut self.0
}
}

impl PartialEq for String<'_> {
    /// Two arena strings are equal when their string contents are equal.
    #[inline]
    fn eq(&self, other: &String) -> bool {
        self.as_str() == other.as_str()
    }
}

// `impl_eq!` macro copied from `bumpalo`
macro_rules! impl_eq {
($lhs:ty, $rhs: ty) => {
impl<'a, 'alloc> PartialEq<$rhs> for $lhs {
#[inline]
fn eq(&self, other: &$rhs) -> bool {
PartialEq::eq(&self[..], &other[..])
}
}

impl<'a, 'alloc> PartialEq<$lhs> for $rhs {
#[inline]
fn eq(&self, other: &$lhs) -> bool {
PartialEq::eq(&self[..], &other[..])
}
}
};
}

impl_eq! { String<'alloc>, str }
impl_eq! { String<'alloc>, &'a str }
impl_eq! { std::borrow::Cow<'a, str>, String<'alloc> }
impl_eq! { std::string::String, String<'alloc> }

impl Display for String<'_> {
    /// Format exactly as the underlying `str` would be formatted.
    #[inline]
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        <str as Display>::fmt(self.as_str(), f)
    }
}

impl Debug for String<'_> {
    /// Debug-format exactly as the underlying `str` would be formatted.
    #[inline]
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        <str as Debug>::fmt(self.as_str(), f)
    }
}

impl Hash for String<'_> {
    /// Hash the string contents, delegating to the `Hash` impl of `str`.
    #[inline]
    fn hash<H: Hasher>(&self, hasher: &mut H) {
        Hash::hash(self.as_str(), hasher);
    }
}
21 changes: 6 additions & 15 deletions crates/oxc_allocator/src/vec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ use allocator_api2::vec::Vec as InnerVec;
use bumpalo::Bump;
#[cfg(any(feature = "serialize", test))]
use serde::{ser::SerializeSeq, Serialize, Serializer};
use simdutf8::basic::{from_utf8, Utf8Error};
use simdutf8::basic::Utf8Error;

use crate::{Allocator, Box, String};

Expand All @@ -32,7 +32,7 @@ use crate::{Allocator, Box, String};
/// Note: This is not a soundness issue, as Rust does not support relying on `drop`
/// being called to guarantee soundness.
#[derive(PartialEq, Eq)]
pub struct Vec<'alloc, T>(ManuallyDrop<InnerVec<T, &'alloc Bump>>);
pub struct Vec<'alloc, T>(pub(crate) ManuallyDrop<InnerVec<T, &'alloc Bump>>);

/// SAFETY: Not actually safe, but for enabling `Send` for downstream crates.
unsafe impl<T> Send for Vec<'_, T> {}
Expand Down Expand Up @@ -190,16 +190,12 @@ impl<'alloc, T> Vec<'alloc, T> {
}

impl<'alloc> Vec<'alloc, u8> {
/// Convert `Vec<u8>` into `String`.
/// Convert `Vec<u8>` into [`String`].
///
/// # Errors
/// Returns [`Err`] if the `Vec` does not comprise a valid UTF-8 string.
pub fn into_string(self) -> Result<String<'alloc>, Utf8Error> {
// Check vec comprises a valid UTF-8 string.
from_utf8(&self.0)?;
// SAFETY: We just checked it's a valid UTF-8 string
let s = unsafe { self.into_string_unchecked() };
Ok(s)
String::from_utf8(self)
}

/// Convert `Vec<u8>` into [`String`], without checking bytes comprise a valid UTF-8 string.
Expand All @@ -211,13 +207,8 @@ impl<'alloc> Vec<'alloc, u8> {
#[expect(clippy::missing_safety_doc, clippy::unnecessary_safety_comment)]
#[inline(always)] // `#[inline(always)]` because this is a no-op at runtime
pub unsafe fn into_string_unchecked(self) -> String<'alloc> {
// Cannot use `bumpalo::String::from_utf8_unchecked` because it takes a `bumpalo::collections::Vec`,
// and our inner `Vec` type is `allocator_api2::vec::Vec`.
// SAFETY: Conversion is safe because both types store data in arena in same way.
// Lifetime of returned `String` is same as lifetime of original `Vec<u8>`.
let inner = ManuallyDrop::into_inner(self.0);
let (ptr, len, cap, bump) = inner.into_raw_parts_with_alloc();
String::from_raw_parts_in(ptr, len, cap, bump)
// SAFETY: Caller guarantees vec comprises a valid UTF-8 string.
String::from_utf8_unchecked(self)
}
}

Expand Down
4 changes: 2 additions & 2 deletions crates/oxc_ast/src/ast_builder_impl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

use std::{borrow::Cow, mem};

use oxc_allocator::{Allocator, Box, FromIn, String, Vec};
use oxc_allocator::{Allocator, Box, FromIn, Vec};
use oxc_span::{Atom, Span, SPAN};
use oxc_syntax::{number::NumberBase, operator::UnaryOperator, scope::ScopeId};

Expand Down Expand Up @@ -78,7 +78,7 @@ impl<'a> AstBuilder<'a> {
/// in the heap.
#[inline]
pub fn str(self, value: &str) -> &'a str {
String::from_str_in(value, self.allocator).into_bump_str()
self.allocator.alloc_str(value)
}

/// Allocate an [`Atom`] from a string slice.
Expand Down
2 changes: 1 addition & 1 deletion crates/oxc_prettier/src/macros.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ macro_rules! text {
#[macro_export]
macro_rules! dynamic_text {
($p:ident, $str:expr) => {{
let s = oxc_allocator::String::from_str_in($str, $p.allocator).into_bump_str();
let s = $p.allocator.alloc_str($str);
$crate::ir::Doc::Str(s)
}};
}
Expand Down
2 changes: 1 addition & 1 deletion crates/oxc_span/src/atom.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ impl<'alloc> FromIn<'alloc, &Atom<'alloc>> for Atom<'alloc> {

impl<'alloc> FromIn<'alloc, &str> for Atom<'alloc> {
fn from_in(s: &str, allocator: &'alloc Allocator) -> Self {
Self::from(oxc_allocator::String::from_str_in(s, allocator))
Self::from(&*allocator.alloc_str(s))
}
}

Expand Down

0 comments on commit ac05134

Please sign in to comment.