Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 36 additions & 20 deletions crates/hstr/src/dynamic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ use triomphe::ThinArc;

use crate::{
tagged_value::{TaggedValue, MAX_INLINE_LEN},
Atom, INLINE_TAG, INLINE_TAG_INIT, LEN_OFFSET, TAG_MASK,
wtf8::Wtf8,
Atom, Wtf8Atom, INLINE_TAG, INLINE_TAG_INIT, LEN_OFFSET, TAG_MASK,
};

#[derive(PartialEq, Eq)]
Expand Down Expand Up @@ -73,6 +74,11 @@ impl AtomStore {
atom_in(self, &text.into())
}

/// Creates a [Wtf8Atom] for `text`, using this store as the backing storage.
///
/// `text` may be anything convertible into a `Cow<Wtf8>`; its bytes are
/// forwarded to `wtf8_atom_in`.
#[inline(always)]
pub fn wtf8_atom<'a>(&mut self, text: impl Into<Cow<'a, Wtf8>>) -> Wtf8Atom {
    let text: Cow<'a, Wtf8> = text.into();
    wtf8_atom_in(self, text.as_bytes())
}

fn gc(&mut self) {
self.data.retain(|item, _| {
let count = ThinArc::strong_count(&item.0);
Expand All @@ -94,6 +100,14 @@ pub fn global_atom_store_gc() {
});
}

/// Creates a [Wtf8Atom] from raw bytes using the store held in `GLOBAL_DATA`.
///
/// NOTE(review): `text` is presumably expected to already be well-formed
/// WTF-8; nothing in this function validates it.
pub(crate) fn global_wtf8_atom(text: &[u8]) -> Wtf8Atom {
    // The mutable borrow of the global store lasts only for this one call.
    GLOBAL_DATA.with(|cell| wtf8_atom_in(&mut *cell.borrow_mut(), text))
}

pub(crate) fn global_atom(text: &str) -> Atom {
GLOBAL_DATA.with(|global| {
let mut store = global.borrow_mut();
Expand All @@ -102,9 +116,7 @@ pub(crate) fn global_atom(text: &str) -> Atom {
})
}

/// This can create any kind of [Atom], although this lives in the `dynamic`
/// module.
fn atom_in<S>(storage: S, text: &str) -> Atom
fn wtf8_atom_in<S>(storage: S, text: &[u8]) -> Wtf8Atom
where
S: Storage,
{
Expand All @@ -115,9 +127,9 @@ where
let tag = INLINE_TAG_INIT | ((len as u8) << LEN_OFFSET);
let mut unsafe_data = TaggedValue::new_tag(tag);
unsafe {
unsafe_data.data_mut()[..len].copy_from_slice(text.as_bytes());
unsafe_data.data_mut()[..len].copy_from_slice(text);
}
return Atom { unsafe_data };
return Wtf8Atom { unsafe_data };
}

let hash = calc_hash(text);
Expand All @@ -129,12 +141,22 @@ where
NonNull::new_unchecked(entry)
};
debug_assert!(0 == ptr.as_ptr() as u8 & TAG_MASK);
Atom {
Wtf8Atom {
unsafe_data: TaggedValue::new_ptr(ptr),
}
}

/// Attempts to construct an Atom but only if it can be constructed inline.
/// Builds an [Atom] from UTF-8 `text` by delegating to `wtf8_atom_in`.
///
/// Although this lives in the `dynamic` module, it can create any kind of
/// [Atom].
fn atom_in<S: Storage>(storage: S, text: &str) -> Atom {
    let wtf8 = wtf8_atom_in(storage, text.as_bytes());
    // SAFETY: the bytes were taken from a `&str`, so they are valid UTF-8.
    unsafe { Atom::from_wtf8_unchecked(wtf8) }
}

/// Attempts to construct an [Atom] but only if it can be constructed inline.
/// This is primarily useful in constant contexts.
pub(crate) const fn inline_atom(text: &str) -> Option<Atom> {
let len = text.len();
Expand All @@ -159,31 +181,25 @@ pub(crate) const fn inline_atom(text: &str) -> Option<Atom> {
}

trait Storage {
fn insert_entry(self, text: &str, hash: u64) -> Item;
fn insert_entry(self, text: &[u8], hash: u64) -> Item;
}

impl Storage for &'_ mut AtomStore {
fn insert_entry(self, text: &str, hash: u64) -> Item {
fn insert_entry(self, text: &[u8], hash: u64) -> Item {
// If the text is too long, interning is not worth it.
if text.len() > 512 {
return Item(ThinArc::from_header_and_slice(
Metadata { hash },
text.as_bytes(),
));
return Item(ThinArc::from_header_and_slice(Metadata { hash }, text));
}

let (entry, _) = self
.data
.raw_entry_mut()
.from_hash(hash, |key| {
key.header.header.hash == hash && key.slice.eq(text.as_bytes())
key.header.header.hash == hash && key.slice.eq(text)
})
.or_insert_with(move || {
(
Item(ThinArc::from_header_and_slice(
Metadata { hash },
text.as_bytes(),
)),
Item(ThinArc::from_header_and_slice(Metadata { hash }, text)),
(),
)
});
Expand All @@ -192,7 +208,7 @@ impl Storage for &'_ mut AtomStore {
}

#[inline(always)]
fn calc_hash(text: &str) -> u64 {
fn calc_hash(text: &[u8]) -> u64 {
let mut hasher = FxHasher::default();
text.hash(&mut hasher);
hasher.finish()
Expand Down
51 changes: 49 additions & 2 deletions crates/hstr/src/global_store.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
use std::borrow::Cow;
use std::{
borrow::Cow,
mem::{forget, ManuallyDrop},
};

use crate::{dynamic::global_atom, Atom};
use crate::{
dynamic::{global_atom, global_wtf8_atom},
wtf8::{Wtf8, Wtf8Buf},
Atom, Wtf8Atom,
};

macro_rules! direct_from_impl {
($T:ty) => {
Expand All @@ -21,3 +28,43 @@ impl From<Box<str>> for crate::Atom {
global_atom(&s)
}
}

/// Implements `From<$Source>` for [Wtf8Atom].
///
/// `$Source` must provide an `as_bytes()` method; the resulting bytes are
/// passed to `global_wtf8_atom`.
macro_rules! direct_from_impl_wtf8 {
    ($Source:ty) => {
        impl From<$Source> for Wtf8Atom {
            fn from(value: $Source) -> Self {
                global_wtf8_atom(value.as_bytes())
            }
        }
    };
}

// Each source type below exposes `as_bytes()`, which is all the generated
// `From` impl calls before handing the bytes to `global_wtf8_atom`.
direct_from_impl_wtf8!(&'_ str);
direct_from_impl_wtf8!(Cow<'_, str>);
direct_from_impl_wtf8!(String);
direct_from_impl_wtf8!(&'_ Wtf8);
direct_from_impl_wtf8!(Wtf8Buf);

impl From<&Atom> for crate::Wtf8Atom {
    /// Converts a borrowed [Atom] into a [Wtf8Atom] carrying the same tagged
    /// value.
    fn from(s: &Atom) -> Self {
        // The clone is intentionally never dropped.
        // NOTE(review): presumably the reference it acquired is the one now
        // owned by the returned `Wtf8Atom` — confirm against `Atom::clone`.
        let _leaked = ManuallyDrop::new(s.clone());
        let unsafe_data = s.unsafe_data;
        Wtf8Atom { unsafe_data }
    }
}

impl From<Atom> for crate::Wtf8Atom {
    /// Converts an owned [Atom] into a [Wtf8Atom] by moving its tagged value
    /// across.
    fn from(s: Atom) -> Self {
        let unsafe_data = s.unsafe_data;
        // Skip `Atom`'s destructor: the tagged value now belongs to the
        // returned `Wtf8Atom`.
        forget(s);
        Wtf8Atom { unsafe_data }
    }
}

impl From<Box<str>> for crate::Wtf8Atom {
    /// Creates a [Wtf8Atom] from the bytes of a boxed string slice.
    fn from(s: Box<str>) -> Self {
        let bytes: &[u8] = s.as_bytes();
        global_wtf8_atom(bytes)
    }
}
89 changes: 35 additions & 54 deletions crates/hstr/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use core::str;
use std::{
fmt::{Debug, Display},
hash::Hash,
mem::{self, forget, transmute},
mem::{self, forget, transmute, ManuallyDrop},
num::NonZeroU8,
ops::Deref,
str::from_utf8_unchecked,
Expand All @@ -15,13 +15,21 @@ use debug_unreachable::debug_unreachable;
use once_cell::sync::Lazy;

pub use crate::dynamic::{global_atom_store_gc, AtomStore};
use crate::tagged_value::TaggedValue;
use crate::{
macros::{get_hash, impl_from_alias, partial_eq},
tagged_value::TaggedValue,
};

mod dynamic;
mod global_store;
mod macros;
mod tagged_value;
#[cfg(test)]
mod tests;
pub mod wtf8;
mod wtf8_atom;

pub use wtf8_atom::Wtf8Atom;

/// An immutable string which is cheap to clone, compare, hash, and has small
/// size.
Expand Down Expand Up @@ -253,20 +261,7 @@ impl Atom {

impl Atom {
fn get_hash(&self) -> u64 {
match self.tag() {
DYNAMIC_TAG => {
unsafe { crate::dynamic::deref_from(self.unsafe_data) }
.header
.header
.hash
}
INLINE_TAG => {
// This is passed as input to the caller's `Hasher` implementation, so it's okay
// that this isn't really a hash
self.unsafe_data.hash()
}
_ => unsafe { debug_unreachable!() },
}
get_hash!(self)
}

fn as_str(&self) -> &str {
Expand Down Expand Up @@ -302,30 +297,7 @@ impl Atom {
impl PartialEq for Atom {
#[inline(never)]
fn eq(&self, other: &Self) -> bool {
if self.unsafe_data == other.unsafe_data {
return true;
}

// If one is inline and the other is not, the length is different.
// If one is static and the other is not, it's different.
if self.tag() != other.tag() {
return false;
}

if self.is_dynamic() && other.is_dynamic() {
let te = unsafe { crate::dynamic::deref_from(self.unsafe_data) };
let oe = unsafe { crate::dynamic::deref_from(other.unsafe_data) };

if te.header.header.hash != oe.header.header.hash {
return false;
}

return te.slice == oe.slice;
}

if self.get_hash() != other.get_hash() {
return false;
}
partial_eq!(self, other);

// If the store is different, the string may be the same, even though the
// `unsafe_data` is different
Expand Down Expand Up @@ -358,20 +330,7 @@ impl Clone for Atom {
}
}

impl Atom {
#[inline]
pub(crate) fn from_alias(alias: TaggedValue) -> Self {
if alias.tag() & TAG_MASK == DYNAMIC_TAG {
unsafe {
let arc = crate::dynamic::restore_arc(alias);
forget(arc.clone());
forget(arc);
}
}

Self { unsafe_data: alias }
}
}
impl_from_alias!(Atom);

impl Deref for Atom {
type Target = str;
Expand Down Expand Up @@ -443,6 +402,28 @@ where
}
}

impl Atom {
    /// Reinterprets a [Wtf8Atom] as an [Atom] without checking that its
    /// contents are valid UTF-8.
    ///
    /// The tagged value of `s` is moved into the new [Atom] unchanged, so
    /// nothing is re-encoded or re-interned.
    ///
    /// # Safety
    ///
    /// The caller must guarantee that `s` holds only valid UTF-8, i.e. that it
    /// contains no unpaired surrogates. No validation is performed; passing an
    /// atom with unpaired surrogates produces an invalid `Atom`.
    pub unsafe fn from_wtf8_unchecked(s: Wtf8Atom) -> Self {
        let unsafe_data = s.unsafe_data;
        // Skip `Wtf8Atom`'s destructor: the tagged value now belongs to the
        // returned `Atom`.
        forget(s);
        Atom { unsafe_data }
    }
}

#[cfg(test)]
mod macro_tests {

Expand Down
71 changes: 71 additions & 0 deletions crates/hstr/src/macros.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
/// Expands to an expression that yields the hash value of an atom-like
/// `$self`.
///
/// `$self` must provide `tag()` and an `unsafe_data` field. `DYNAMIC_TAG`,
/// `INLINE_TAG` and `debug_unreachable!` are looked up where the macro is
/// invoked, so they must be in scope there; if the two constants were missing,
/// the match arms below would parse as catch-all identifier patterns instead.
macro_rules! get_hash {
($self:expr) => {
match $self.tag() {
DYNAMIC_TAG => {
// Copy the tagged value out first so the caller-supplied expression is
// not expanded inside the `unsafe` block below.
let unsafe_data = $self.unsafe_data;
// Dynamic entries carry their hash in the entry header.
unsafe { $crate::dynamic::deref_from(unsafe_data) }
.header
.header
.hash
}
INLINE_TAG => {
// This is passed as input to the caller's `Hasher` implementation, so it's okay
// that this isn't really a hash
$self.unsafe_data.hash()
}
// NOTE(review): presumably no other tag values exist — confirm against the
// tag constants in `lib.rs`.
_ => unsafe { debug_unreachable!() },
}
};
}

/// Expands to the shared fast-path statements of an `eq` implementation for
/// atom-like types.
///
/// Must be invoked in statement position inside a function returning `bool`:
/// the expansion `return`s as soon as the outcome is known and otherwise falls
/// through, leaving any remaining comparison to the code after the
/// invocation.
///
/// `$self` and `$other` must provide `unsafe_data`, `tag()`, `is_dynamic()`
/// and `get_hash()`.
macro_rules! partial_eq {
    ($self:expr, $other:expr) => {
        // Identical tagged values mean the two atoms are equal.
        if $self.unsafe_data == $other.unsafe_data {
            return true;
        }

        // If one is inline and the other is not, the length is different.
        // If one is static and the other is not, it's different.
        if $self.tag() != $other.tag() {
            return false;
        }

        if $self.is_dynamic() && $other.is_dynamic() {
            // Copy the tagged values out first so the caller-supplied
            // expressions are not expanded inside the `unsafe` blocks below
            // (the same approach `get_hash!` takes).
            let self_data = $self.unsafe_data;
            let other_data = $other.unsafe_data;

            // SAFETY: both values were just checked to be dynamic.
            // NOTE(review): presumably that is exactly what `deref_from`
            // requires — confirm against its definition.
            let te = unsafe { $crate::dynamic::deref_from(self_data) };
            let oe = unsafe { $crate::dynamic::deref_from(other_data) };

            // Compare the stored hashes before comparing the bytes.
            if te.header.header.hash != oe.header.header.hash {
                return false;
            }

            return te.slice == oe.slice;
        }

        if $self.get_hash() != $other.get_hash() {
            return false;
        }
    };
}

/// Generates a crate-private `from_alias` constructor on `$ty`.
///
/// `$ty` must be constructible as `Self { unsafe_data }`. `TaggedValue`,
/// `TAG_MASK`, `DYNAMIC_TAG` and `forget` are looked up where the macro is
/// invoked, so they must be in scope there.
macro_rules! impl_from_alias {
($ty:ty) => {
impl $ty {
// Builds a new value that carries the same tagged data as `alias`.
#[inline]
pub(crate) fn from_alias(alias: TaggedValue) -> Self {
if alias.tag() & TAG_MASK == DYNAMIC_TAG {
unsafe {
// Both the restored arc and its clone are forgotten, so nothing is
// released here.
// NOTE(review): presumably the net effect is one extra strong
// reference, owned by the value returned below — confirm against
// `restore_arc`.
let arc = $crate::dynamic::restore_arc(alias);
forget(arc.clone());
forget(arc);
}
}

Self { unsafe_data: alias }
}
}
};
}

pub(crate) use get_hash;
pub(crate) use impl_from_alias;
pub(crate) use partial_eq;
Loading
Loading