From 745f25addb2960a2fc6ff1841b2329925af687a9 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Tue, 17 Feb 2026 20:08:14 -0800 Subject: [PATCH 01/58] Mega Cleanup --- CLAUDE.md | 5 + Cargo.lock | 74 +- Cargo.toml | 1 + lib/cache/async_backed.rs | 391 +++++++++ lib/cache/mod.rs | 2 + lib/drop_ward.rs | 133 +++ lib/fs/async_fs.rs | 432 ++++++++++ lib/fs/dcache.rs | 65 ++ lib/fs/fuser.rs | 425 ++++++++++ lib/fs/mod.rs | 188 +++++ lib/lib.rs | 3 + src/daemon.rs | 25 +- src/fs/fuser.rs | 351 -------- src/fs/icache/async_cache.rs | 1410 -------------------------------- src/fs/icache/bridge.rs | 138 ---- src/fs/icache/file_table.rs | 22 - src/fs/icache/inode_factory.rs | 19 - src/fs/icache/mod.rs | 21 - src/fs/mescloud/common.rs | 106 +-- src/fs/mescloud/composite.rs | 634 ++++++++------ src/fs/mescloud/icache.rs | 437 ---------- src/fs/mescloud/mod.rs | 440 +++++----- src/fs/mescloud/org.rs | 449 +++------- src/fs/mescloud/repo.rs | 903 ++++++++++---------- src/fs/mod.rs | 3 - src/fs/trait.rs | 375 --------- tests/async_fs_correctness.rs | 609 ++++++++++++++ tests/common/async_fs_mocks.rs | 104 +++ tests/common/mod.rs | 4 +- 29 files changed, 3706 insertions(+), 4063 deletions(-) create mode 100644 lib/cache/async_backed.rs create mode 100644 lib/drop_ward.rs create mode 100644 lib/fs/async_fs.rs create mode 100644 lib/fs/dcache.rs create mode 100644 lib/fs/fuser.rs create mode 100644 lib/fs/mod.rs delete mode 100644 src/fs/fuser.rs delete mode 100644 src/fs/icache/async_cache.rs delete mode 100644 src/fs/icache/bridge.rs delete mode 100644 src/fs/icache/file_table.rs delete mode 100644 src/fs/icache/inode_factory.rs delete mode 100644 src/fs/icache/mod.rs delete mode 100644 src/fs/mescloud/icache.rs delete mode 100644 src/fs/trait.rs create mode 100644 tests/async_fs_correctness.rs create mode 100644 tests/common/async_fs_mocks.rs diff --git a/CLAUDE.md b/CLAUDE.md index 9ba3f68b..653c07a6 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -43,6 +43,11 @@ cargo fmt --all && 
cargo clippy --all-targets --all-features -- -D warnings && c - Channels: `tokio::sync::mpsc` for multi-producer, `tokio::sync::oneshot` for request-response - Never block the async runtime — offload blocking work with `tokio::task::spawn_blocking` +## Testing + +- Avoid writing tests in-line in the same file as production code; use separate `tests/` directory + for tests. + ## Dependencies - Check for existing deps with `cargo tree` before adding new crates diff --git a/Cargo.lock b/Cargo.lock index d4cf1499..1050f46b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11,6 +11,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "aliasable" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "250f629c0161ad8107cf89319e990051fae62832fd343083bea452d93e2205fd" + [[package]] name = "android_system_properties" version = "0.1.5" @@ -228,7 +234,7 @@ version = "4.5.55" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a92793da1a46a5f2a02a6f4c46c6496b28c43638adea8306fcb0caa1634f24e5" dependencies = [ - "heck", + "heck 0.5.0", "proc-macro2", "quote", "syn", @@ -758,6 +764,7 @@ dependencies = [ "opentelemetry", "opentelemetry-otlp", "opentelemetry_sdk", + "ouroboros", "rand", "reqwest", "reqwest-middleware", @@ -839,6 +846,12 @@ dependencies = [ "hashbrown 0.16.1", ] +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" + [[package]] name = "heck" version = "0.5.0" @@ -1497,6 +1510,30 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" +[[package]] +name = "ouroboros" +version = "0.18.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e0f050db9c44b97a94723127e6be766ac5c340c48f2c4bb3ffa11713744be59" +dependencies = [ + "aliasable", 
+ "ouroboros_macro", + "static_assertions", +] + +[[package]] +name = "ouroboros_macro" +version = "0.18.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c7028bdd3d43083f6d8d4d5187680d0d3560d54df4cc9d752005268b41e64d0" +dependencies = [ + "heck 0.4.1", + "proc-macro2", + "proc-macro2-diagnostics", + "quote", + "syn", +] + [[package]] name = "page_size" version = "0.6.0" @@ -1623,6 +1660,19 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "proc-macro2-diagnostics" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af066a9c399a26e020ada66a034357a868728e72cd426f3adcd35f80d88d88c8" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "version_check", + "yansi", +] + [[package]] name = "prost" version = "0.13.5" @@ -2312,6 +2362,12 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + [[package]] name = "strsim" version = "0.11.1" @@ -2865,6 +2921,12 @@ dependencies = [ "rustversion", ] +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + [[package]] name = "vt100" version = "0.16.2" @@ -3309,7 +3371,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" dependencies = [ "anyhow", - "heck", + "heck 0.5.0", "wit-parser", ] @@ -3320,7 +3382,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" dependencies = [ "anyhow", - "heck", + 
"heck 0.5.0", "indexmap 2.13.0", "prettyplease", "syn", @@ -3387,6 +3449,12 @@ version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" +[[package]] +name = "yansi" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049" + [[package]] name = "yoke" version = "0.8.1" diff --git a/Cargo.toml b/Cargo.toml index d837f7fe..dcf7b555 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -49,6 +49,7 @@ tracing-indicatif = "0.3.14" opentelemetry = { version = "0.29" } opentelemetry_sdk = { version = "0.29", features = ["rt-tokio"] } opentelemetry-otlp = { version = "0.29", default-features = false, features = ["http-proto", "trace", "reqwest-blocking-client"] } +ouroboros = "0.18" tracing-opentelemetry = { version = "0.30" } hashlink = "0.11.0" diff --git a/lib/cache/async_backed.rs b/lib/cache/async_backed.rs new file mode 100644 index 00000000..c3fddd05 --- /dev/null +++ b/lib/cache/async_backed.rs @@ -0,0 +1,391 @@ +//! Concurrent deduplication cache for async computations. +//! +//! Given a key and an async factory, ensures the factory runs at most once per key. Subsequent +//! callers for the same key await the already-in-flight computation via a [`Shared`] future, +//! avoiding the race conditions inherent in `Notify`-based signalling. +//! +//! Note that this cache does not support automatic eviction. + +use std::panic::AssertUnwindSafe; +use std::{fmt::Debug, future::Future, hash::Hash, pin::Pin}; + +use futures::FutureExt as _; +use futures::future::Shared; + +type SharedFut = Shared> + Send>>>; + +/// Two-state slot: `InFlight` while a factory future is running, then promoted to `Ready` once +/// the future completes. 
+/// +/// The `InFlight` variant holds a `Shared<..., Output = Option>` where `None` signals that the +/// factory panicked (caught by `catch_unwind`). On `None`, callers remove the entry and retry. +enum Slot { + InFlight(SharedFut), + Ready(V), +} + +/// Deduplicating async cache. +/// +/// If [`get_or_init`](Self::get_or_init) is called concurrently for the same key, only one +/// invocation of the factory runs. All callers receive a clone of the result. +pub struct FutureBackedCache { + map: scc::HashMap>, +} + +impl Default for FutureBackedCache +where + K: Eq + Hash, + V: Clone + Send + 'static, +{ + fn default() -> Self { + Self { + map: scc::HashMap::default(), + } + } +} + +impl FutureBackedCache +where + K: Eq + Hash + Debug + Clone + Send + Sync + 'static, + V: Clone + Send + Sync + 'static, +{ + /// Get the cached value for `key`, or initialize it by running `factory`. + /// + /// If another caller is already computing the value for this key, this awaits the in-flight + /// computation instead of spawning a duplicate. If the factory panics, the entry is removed + /// and the next caller retries with a fresh factory invocation. + /// + /// # Panics + /// + /// Panics if this caller joins an in-flight factory that itself panicked (i.e. the caller + /// lost the race to insert a fresh entry after the poisoned slot was removed). + pub async fn get_or_init(&self, key: K, factory: F) -> V + where + F: FnOnce() -> Fut, + Fut: Future + Send + 'static, + { + // Fast path: value already cached. + let existing = self + .map + .read_async(&key, |_, slot| match slot { + Slot::Ready(v) => Ok(v.clone()), + Slot::InFlight(shared) => Err(shared.clone()), + }) + .await; + + match existing { + Some(Ok(v)) => return v, + Some(Err(shared)) => { + if let Some(v) = self.await_shared(&key, shared).await { + return v; + } + // Factory panicked; entry removed. Fall through to re-insert below. + } + None => {} + } + + // Slow path: use entry_async for atomic check-and-insert. 
+ let shared = match self.map.entry_async(key.clone()).await { + scc::hash_map::Entry::Occupied(occ) => match occ.get() { + Slot::Ready(v) => return v.clone(), + Slot::InFlight(shared) => shared.clone(), + }, + scc::hash_map::Entry::Vacant(vac) => { + let shared = Self::make_shared(factory); + let ret = shared.clone(); + vac.insert_entry(Slot::InFlight(shared)); + ret + } + }; + + if let Some(v) = self.await_shared(&key, shared).await { + return v; + } + + panic!("FutureBackedCache: joined an in-flight factory that panicked for key {key:?}"); + } + + /// Like [`get_or_init`](Self::get_or_init), but for fallible factories. + /// + /// If the factory returns `Ok(v)`, the value is cached and returned. If it returns `Err(e)`, + /// **nothing is cached** and the error is propagated to the caller. + /// + /// Unlike `get_or_init`, concurrent callers are **not** deduplicated — each caller that + /// finds the key absent will invoke the factory independently. However, if a value was + /// previously cached (by either `get_or_init` or a successful `get_or_try_init`), it is + /// returned immediately without calling the factory. + pub async fn get_or_try_init(&self, key: K, factory: F) -> Result + where + F: FnOnce() -> Fut, + Fut: Future> + Send + 'static, + { + // Fast path: value already cached or in-flight from an infallible init. + let existing = self + .map + .read_async(&key, |_, slot| match slot { + Slot::Ready(v) => Ok(v.clone()), + Slot::InFlight(shared) => Err(shared.clone()), + }) + .await; + + match existing { + Some(Ok(v)) => return Ok(v), + Some(Err(shared)) => { + if let Some(v) = self.await_shared(&key, shared).await { + return Ok(v); + } + // Factory panicked; entry was removed. Fall through to run our own factory. + } + None => {} + } + + // Run the fallible factory (not deduplicated). + let val = factory().await?; + + // Attempt to cache. If another caller raced us and already inserted, + // return the existing value and discard ours. 
+ match self.map.entry_async(key).await { + scc::hash_map::Entry::Occupied(occ) => match occ.get() { + Slot::Ready(v) => Ok(v.clone()), + Slot::InFlight(shared) => Ok(self + .await_shared(occ.key(), shared.clone()) + .await + .unwrap_or(val)), + }, + scc::hash_map::Entry::Vacant(vac) => { + vac.insert_entry(Slot::Ready(val.clone())); + Ok(val) + } + } + } + + /// Get the cached value for `key` if it exists. + /// + /// - If the value is `Ready`, returns `Some(v)` immediately. + /// - If the value is `InFlight`, awaits the in-flight computation and returns `Some(v)`. + /// - If the key is absent, returns `None`. + /// - If the in-flight factory panicked, returns `None` (and removes the poisoned entry). + pub async fn get(&self, key: &K) -> Option { + let existing = self + .map + .read_async(key, |_, slot| match slot { + Slot::Ready(v) => Ok(v.clone()), + Slot::InFlight(shared) => Err(shared.clone()), + }) + .await; + + match existing { + Some(Ok(v)) => Some(v), + Some(Err(shared)) => self.await_shared(key, shared).await, + None => None, + } + } + + /// Await a `Shared` future, handle promotion to `Ready`, and handle panic recovery. + /// + /// Returns `Some(v)` on success. Returns `None` if the factory panicked, after removing + /// the poisoned entry from the map. + async fn await_shared(&self, key: &K, shared: SharedFut) -> Option { + let mut guard = PromoteGuard { + map: &self.map, + key, + value: None, + }; + + let result = shared.await; + + if let Some(v) = result { + guard.value = Some(v.clone()); + + self.map + .update_async(key, |_, slot| { + if matches!(slot, Slot::InFlight(_)) { + *slot = Slot::Ready(v.clone()); + } + }) + .await; + + guard.value = None; + Some(v) + } else { + // Factory panicked. Remove the poisoned InFlight entry so the next caller + // can retry. 
+ drop( + self.map + .remove_if_sync(key, |slot| matches!(slot, Slot::InFlight(_))), + ); + None + } + } + + /// Wrap a factory future in `catch_unwind`, producing a `Shared` with `Output = Option`. + fn make_shared(factory: F) -> SharedFut + where + F: FnOnce() -> Fut, + Fut: Future + Send + 'static, + { + let fut = AssertUnwindSafe(factory()).catch_unwind(); + let boxed: Pin> + Send>> = + Box::pin(async move { fut.await.ok() }); + boxed.shared() + } + + /// Returns the number of entries in the cache (both `Ready` and `InFlight`). + #[must_use] + pub fn len(&self) -> usize { + self.map.len() + } + + /// Returns `true` if the cache contains no entries. + #[must_use] + pub fn is_empty(&self) -> bool { + self.map.is_empty() + } + + /// Synchronously insert a value, overwriting any existing entry. + /// + /// Suitable for seeding the cache before async operations begin (e.g. + /// inside an ouroboros builder where async is unavailable). + pub fn insert_sync(&self, key: K, value: V) { + drop(self.map.insert_sync(key, Slot::Ready(value))); + } + + /// Synchronously remove the entry for `key`, returning `true` if it was present. + /// + /// Suitable for use in contexts where async is not available (e.g. inside + /// [`StatelessDrop::delete`](crate::drop_ward::StatelessDrop::delete)). + pub fn remove_sync(&self, key: &K) -> bool { + self.map.remove_sync(key).is_some() + } +} + +/// Drop guard that synchronously promotes an `InFlight` entry to `Ready` if the caller +/// is cancelled between `shared.await` completing and the async promotion running. +/// +/// Set `value = None` to defuse after successful promotion. 
+struct PromoteGuard<'a, K, V> +where + K: Eq + Hash, + V: Clone + Send + Sync + 'static, +{ + map: &'a scc::HashMap>, + key: &'a K, + value: Option, +} + +impl Drop for PromoteGuard<'_, K, V> +where + K: Eq + Hash, + V: Clone + Send + Sync + 'static, +{ + fn drop(&mut self) { + if let Some(v) = self.value.take() { + self.map.update_sync(self.key, |_, slot| { + if matches!(slot, Slot::InFlight(_)) { + *slot = Slot::Ready(v); + } + }); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[tokio::test] + async fn try_init_ok_caches_value() { + let cache = FutureBackedCache::::default(); + let result: Result = cache + .get_or_try_init(1, || async { Ok("hello".to_owned()) }) + .await; + assert_eq!(result.unwrap(), "hello", "should return Ok value"); + + // Value should now be cached (get returns it without factory) + let cached = cache.get(&1).await; + assert_eq!(cached.unwrap(), "hello", "value should be in cache"); + } + + #[tokio::test] + async fn try_init_err_does_not_cache() { + let cache = FutureBackedCache::::default(); + let result: Result = cache.get_or_try_init(1, || async { Err("boom") }).await; + assert_eq!(result.unwrap_err(), "boom", "should return the error"); + + // Cache should be empty — error was not stored + assert!(cache.is_empty(), "cache should have no entries after error"); + assert!(cache.get(&1).await.is_none(), "key should not exist"); + } + + #[tokio::test] + async fn try_init_err_then_retry_ok() { + let cache = FutureBackedCache::::default(); + + // First call: factory fails + let r1: Result = cache.get_or_try_init(1, || async { Err("fail") }).await; + assert!(r1.is_err(), "first call should fail"); + + // Second call: factory succeeds + let r2: Result = cache + .get_or_try_init(1, || async { Ok("recovered".to_owned()) }) + .await; + assert_eq!(r2.unwrap(), "recovered", "retry should succeed"); + + // Value should now be cached + let cached = cache.get(&1).await; + assert_eq!(cached.unwrap(), "recovered"); + } + + #[tokio::test] + 
async fn try_init_returns_value_cached_by_init() { + let cache = FutureBackedCache::::default(); + + // Populate via infallible get_or_init + cache + .get_or_init(1, || async { "from_init".to_owned() }) + .await; + + // get_or_try_init should return the cached value without running factory + let result: Result = cache + .get_or_try_init(1, || async { panic!("factory should not run") }) + .await; + assert_eq!(result.unwrap(), "from_init"); + } + + #[tokio::test] + async fn panic_in_factory_is_recovered() { + use std::sync::Arc; + use std::sync::atomic::{AtomicUsize, Ordering}; + + let cache = Arc::new(FutureBackedCache::::default()); + let call_count = Arc::new(AtomicUsize::new(0)); + + // Spawn a task whose factory panics. tokio::spawn catches the panic. + let cache2 = Arc::clone(&cache); + let call_count2 = Arc::clone(&call_count); + let handle = tokio::spawn(async move { + cache2 + .get_or_init(1, || { + call_count2.fetch_add(1, Ordering::Relaxed); + async { panic!("boom") } + }) + .await + }); + // The spawned task panics internally; JoinHandle returns Err. + assert!(handle.await.is_err(), "task should have panicked"); + + // The key should NOT be permanently bricked. A new caller should succeed. + let v = cache + .get_or_init(1, || { + call_count.fetch_add(1, Ordering::Relaxed); + async { "recovered".to_owned() } + }) + .await; + assert_eq!(v, "recovered", "should recover after panic"); + assert_eq!( + call_count.load(Ordering::Relaxed), + 2, + "factory called twice" + ); + } +} diff --git a/lib/cache/mod.rs b/lib/cache/mod.rs index e0c1c97f..5c48ee22 100644 --- a/lib/cache/mod.rs +++ b/lib/cache/mod.rs @@ -1,3 +1,5 @@ +/// Async-backed cache implementation. +pub mod async_backed; /// Cache eviction policies. pub mod eviction; /// File-backed cache implementation. diff --git a/lib/drop_ward.rs b/lib/drop_ward.rs new file mode 100644 index 00000000..4922e13c --- /dev/null +++ b/lib/drop_ward.rs @@ -0,0 +1,133 @@ +//! 
Automatic, type-directed cleanup driven by reference counting. +//! +//! [`DropWard`] tracks how many live references exist for a given key and invokes a cleanup +//! callback when a key's count reaches zero. The cleanup logic is selected at the type level +//! through a zero-sized "tag" type that implements [`StatelessDrop`], keeping the ward itself +//! generic over *what* it manages without storing per-key values. +//! +//! This is designed for resources whose lifecycle is bound to an external context (e.g. GPU device +//! handles, connection pools, graphics pipelines) where Rust's built-in `Drop` cannot be used +//! because cleanup requires access to that context. +//! +//! # Design rationale +//! +//! The tag type `T` is constrained to be zero-sized. It exists only to carry the [`StatelessDrop`] +//! implementation at the type level — no `T` value is ever constructed or stored. This means a +//! single `DropWard` instance adds no per-key overhead beyond the key and its `usize` count. +//! +//! # Example +//! +//! ```ignore +//! struct GpuTextureDrop; +//! +//! impl StatelessDrop for GpuTextureDrop { +//! fn delete(device: &wgpu::Device, _key: &TextureId) { +//! // e.g. flush a deferred-destruction queue +//! device.poll(wgpu::Maintain::Wait); +//! } +//! } +//! +//! let mut ward: DropWard = DropWard::new(device); +//! +//! ward.inc(texture_id); // → 1 +//! ward.inc(texture_id); // → 2 +//! ward.dec(&texture_id); // → Some(1) +//! ward.dec(&texture_id); // → Some(0), calls GpuTextureDrop::delete(&device, &texture_id) +//! ``` + +use std::marker::PhantomData; + +use rustc_hash::FxHashMap; + +/// Type-level hook for cleanup that requires an external context. +/// +/// Implement this on a zero-sized tag type. The tag is never instantiated — it only selects which +/// `delete` implementation a [`DropWard`] will call. +pub trait StatelessDrop { + /// Called exactly once when a key's reference count reaches zero. 
+ /// + /// `ctx` is the shared context owned by the [`DropWard`]. `key` is the key whose count just + /// reached zero. This callback fires synchronously inside [`DropWard::dec`]; avoid blocking or + /// panicking if the ward is used on a hot path. + fn delete(ctx: &Ctx, key: &K); +} + +/// A reference-counted key set that triggers [`StatelessDrop::delete`] on the associated context +/// when any key's count drops to zero. +/// +/// # Type parameters +/// +/// - `Ctx` — shared context passed to `T::delete` (e.g. a device handle). +/// - `K` — the key type being reference-counted. +/// - `T` — a **zero-sized** tag type carrying the cleanup logic. +/// Will fail to compile if `size_of::() != 0`. +/// +/// # Concurrency +/// +/// Not thread-safe. All access requires `&mut self`. Wrap in a `Mutex` or similar if shared across +/// threads. +/// +#[derive(Debug, Clone)] +pub struct DropWard { + map: FxHashMap, + ctx: Ctx, + _marker: PhantomData, +} + +impl DropWard +where + K: Eq + std::hash::Hash, + T: StatelessDrop, +{ + /// Compile-time guard: `T` must be zero-sized. + const _ASSERT_ZST: () = assert!(size_of::() == 0, "T must be zero-sized"); + + /// Create a new ward that will pass `ctx` to `T::delete` on cleanup. + pub fn new(ctx: Ctx) -> Self { + Self { + map: FxHashMap::default(), + ctx, + _marker: PhantomData, + } + } + + /// Increment the reference count for `key`, inserting it with a count + /// of 1 if it does not exist. + /// + /// Returns the count **after** incrementing. + pub fn inc(&mut self, key: K) -> usize { + *self + .map + .entry(key) + .and_modify(|count| *count += 1) + .or_insert(1) + } + + fn dec_by(&mut self, key: &K, by: usize) -> Option { + let curr = *self.map.get(key)?; + let new_count = curr.saturating_sub(by); + if new_count == 0 { + self.map.remove(key); + T::delete(&self.ctx, key); + } else if let Some(slot) = self.map.get_mut(key) { + *slot = new_count; + } + Some(new_count) + } + + /// Decrement the reference count for `key`. 
+ /// + /// If the count reaches zero, the key is removed and `T::delete` is + /// called synchronously with the ward's context. Returns `Some(0)` in + /// this case — the key will no longer be tracked. + /// + /// Returns `None` if `key` was not present (no-op). + pub fn dec(&mut self, key: &K) -> Option { + self.dec_by(key, 1) + } + + /// Decrement the reference count for `key` by `count`. + pub fn dec_count(&mut self, key: &K, count: usize) -> Option { + self.dec_by(key, count) + } +} diff --git a/lib/fs/async_fs.rs b/lib/fs/async_fs.rs new file mode 100644 index 00000000..7626578f --- /dev/null +++ b/lib/fs/async_fs.rs @@ -0,0 +1,432 @@ +//! Async `INode` Table which supports concurrent access and modification. + +use std::ffi::{OsStr, OsString}; +use std::future::Future; +use std::sync::Arc; +use std::sync::atomic::{AtomicU64, Ordering}; + +use bytes::Bytes; + +use crate::cache::async_backed::FutureBackedCache; +use crate::drop_ward::StatelessDrop; +use crate::fs::{ + AsyncFsStats, DirEntry, FileHandle, INode, INodeType, InodeAddr, LoadedAddr, OpenFlags, + dcache::DCache, +}; + +/// A reader for an open file, returned by [`FsDataProvider::open`]. +/// +/// Implementors provide the actual data for read operations. The FUSE +/// adapter calls [`close`](Self::close) to release resources explicitly. +pub trait FileReader: Send + Sync + 'static { + /// Read up to `size` bytes starting at byte `offset`. + fn read( + &self, + offset: u64, + size: u32, + ) -> impl Future> + Send; + + /// Release any resources held by this reader. + /// + /// Called explicitly by the FUSE adapter during `release`. Implementations + /// that hold inner file handles should release them here. The default + /// implementation is a no-op. + fn close(&self) -> impl Future> + Send { + async { Ok(()) } + } +} + +/// A data provider for [`AsyncFs`] that fetches inode data on cache misses. 
+pub trait FsDataProvider: Clone + Send + Sync + 'static { + /// The reader type returned by [`open`](Self::open). + type Reader: FileReader; + + /// Look up a child inode by name within the given parent directory. + fn lookup( + &self, + parent: INode, + name: &OsStr, + ) -> impl Future> + Send; + + /// List all children of a directory. + /// + /// Called by [`AsyncFs::readdir`] on a cache miss. The returned + /// children are inserted into the directory cache and inode table + /// so subsequent reads are served from cache. + fn readdir( + &self, + parent: INode, + ) -> impl Future, std::io::Error>> + Send; + + /// Open a file and return a reader for subsequent read calls. + fn open( + &self, + inode: INode, + flags: OpenFlags, + ) -> impl Future> + Send; +} + +/// Zero-sized tag whose [`StatelessDrop`] implementation automatically evicts +/// an inode from the inode table when its reference count reaches zero. +pub struct InodeForget; + +impl<'a> StatelessDrop<&'a FutureBackedCache, InodeAddr> for InodeForget { + fn delete(inode_table: &&'a FutureBackedCache, addr: &InodeAddr) { + inode_table.remove_sync(addr); + } +} + +/// A looked-up inode whose lifetime must be managed by the caller. +/// +/// Each `TrackedINode` returned by [`AsyncFs::lookup`] represents one +/// reference that the FUSE kernel holds. The caller must balance it by +/// decrementing the [`InodeLifecycle`] ward when the kernel sends `forget`. +#[derive(Debug, Clone, Copy)] +pub struct TrackedINode { + /// The resolved inode data. + pub inode: INode, +} + +/// An open file that provides read access. +/// +/// Returned by [`AsyncFs::open`]. The caller owns this handle and uses +/// [`read`](Self::read) to fetch data. Dropping the handle releases +/// the underlying reader when the last `Arc` clone is gone. +#[derive(Debug, Clone)] +pub struct OpenFile { + /// The raw file handle number, suitable for returning to the FUSE kernel. + pub fh: FileHandle, + /// The reader backing this open file. 
+ pub reader: Arc, +} + +impl OpenFile { + /// Read up to `size` bytes starting at byte `offset`. + pub async fn read(&self, offset: u64, size: u32) -> Result { + self.reader.read(offset, size).await + } +} + +mod inode_lifecycle_impl { + #![allow(clippy::future_not_send, clippy::mem_forget)] + use ouroboros::self_referencing; + + use crate::cache::async_backed::FutureBackedCache; + use crate::drop_ward::DropWard; + use crate::fs::InodeAddr; + + use super::{INode, InodeForget}; + + /// Co-located inode table and reference-count ward. + /// + /// The ward borrows the table directly (no `Arc`) via `ouroboros`. + /// When `dec` reaches zero for a key, [`InodeForget::delete`] synchronously + /// removes that inode from the table. + #[self_referencing] + pub struct InodeLifecycle { + pub(super) table: FutureBackedCache, + #[borrows(table)] + #[not_covariant] + pub(super) ward: + DropWard<&'this FutureBackedCache, InodeAddr, InodeForget>, + } + + impl InodeLifecycle { + /// Create a new lifecycle managing the given inode table. + pub fn from_table(table: FutureBackedCache) -> Self { + Self::new(table, |tbl| DropWard::new(tbl)) + } + } +} + +pub use inode_lifecycle_impl::InodeLifecycle; + +impl InodeLifecycle { + /// Increment the reference count for an inode address. + pub fn inc(&mut self, addr: InodeAddr) -> usize { + self.with_ward_mut(|ward| ward.inc(addr)) + } + + /// Decrement the reference count for an inode address. + /// + /// When the count reaches zero, the inode is automatically evicted + /// from the table via [`InodeForget::delete`]. + pub fn dec(&mut self, addr: &InodeAddr) -> Option { + self.with_ward_mut(|ward| ward.dec(addr)) + } + + /// Decrement the reference count by `count`. + /// + /// When the count reaches zero, the inode is automatically evicted. + pub fn dec_count(&mut self, addr: &InodeAddr, count: usize) -> Option { + self.with_ward_mut(|ward| ward.dec_count(addr, count)) + } + + /// Read-only access to the underlying inode table. 
+ #[must_use] + pub fn table(&self) -> &FutureBackedCache { + self.borrow_table() + } +} + +/// An asynchronous filesystem cache mapping `InodeAddr` to `INode`. +/// +/// Uses two [`FutureBackedCache`] layers: +/// - `inode_table` stores resolved inodes by address, used by [`loaded_inode`](Self::loaded_inode). +/// - `lookup_cache` stores lookup results by `(parent_addr, name)`, ensuring `dp.lookup()` is only +/// called on a true cache miss (not already cached or in-flight). +/// +/// The [`DCache`] sits in front as a synchronous fast path mapping `(parent, name)` to child addr. +pub struct AsyncFs<'tbl, DP: FsDataProvider> { + /// Canonical addr -> `INode` map. Used by `loaded_inode()` to retrieve inodes by address. + inode_table: &'tbl FutureBackedCache, + + /// Deduplicating lookup cache keyed by `(parent_addr, child_name)`. The factory is + /// `dp.lookup()`, so the data provider is only called on a true cache miss. + lookup_cache: FutureBackedCache<(InodeAddr, OsString), INode>, + + /// Directory entry cache, mapping `(parent, name)` to child inode address. + directory_cache: DCache, + + /// The data provider used to fetch inode data on cache misses. + data_provider: DP, + + /// Monotonically increasing file handle counter. Starts at 1 (0 is reserved). + next_fh: AtomicU64, + + /// Tracks which directories have had their children fetched via `dp.readdir`. + readdir_populated: FutureBackedCache, +} + +impl<'tbl, DP: FsDataProvider> AsyncFs<'tbl, DP> { + /// Create a new `AsyncFs`, seeding the root inode into the table. 
+    pub async fn new(
+        data_provider: DP,
+        root: INode,
+        inode_table: &'tbl FutureBackedCache<InodeAddr, INode>,
+    ) -> Self {
+        inode_table
+            .get_or_init(root.addr, || async move { root })
+            .await;
+
+        Self {
+            inode_table,
+            lookup_cache: FutureBackedCache::default(),
+            directory_cache: DCache::new(),
+            data_provider,
+            next_fh: AtomicU64::new(1),
+            readdir_populated: FutureBackedCache::default(),
+        }
+    }
+
+    /// Create a new `AsyncFs`, assuming the root inode is already in the table.
+    ///
+    /// This synchronous constructor is needed for ouroboros builders where
+    /// async is unavailable. The caller must ensure the root inode has already
+    /// been inserted into `inode_table` (e.g. via [`FutureBackedCache::insert_sync`]).
+    #[must_use]
+    pub fn new_preseeded(
+        data_provider: DP,
+        inode_table: &'tbl FutureBackedCache<InodeAddr, INode>,
+    ) -> Self {
+        Self {
+            inode_table,
+            lookup_cache: FutureBackedCache::default(),
+            directory_cache: DCache::new(),
+            data_provider,
+            next_fh: AtomicU64::new(1),
+            readdir_populated: FutureBackedCache::default(),
+        }
+    }
+
+    /// Get the total number of inodes currently stored in the inode table.
+    #[must_use]
+    pub fn inode_count(&self) -> usize {
+        self.inode_table.len()
+    }
+
+    /// Return filesystem statistics.
+    ///
+    /// Reports the current inode count from the cache. Block-related
+    /// fields default to values appropriate for a virtual read-only
+    /// filesystem (4 KiB blocks, no free space).
+    #[must_use]
+    pub fn statfs(&self) -> AsyncFsStats {
+        AsyncFsStats {
+            block_size: 4096,
+            total_blocks: 0,
+            free_blocks: 0,
+            available_blocks: 0,
+            total_inodes: self.inode_count() as u64,
+            free_inodes: 0,
+            max_filename_length: 255,
+        }
+    }
+
+    /// Asynchronously look up an inode by name within a parent directory.
+    ///
+    /// Resolution order:
+    /// 1. Directory cache (synchronous fast path)
+    /// 2. Lookup cache (`get_or_try_init` — calls `dp.lookup()` only on a true miss)
+    /// 3. On success, populates inode table and directory cache
+    pub async fn lookup(
+        &self,
+        parent: LoadedAddr,
+        name: &OsStr,
+    ) -> Result<TrackedINode, std::io::Error> {
+        let parent_ino = self.loaded_inode(parent).await?;
+        debug_assert!(
+            matches!(parent_ino.itype, INodeType::Directory),
+            "parent inode should be a directory"
+        );
+
+        if let Some(dentry) = self.directory_cache.lookup(parent, name)
+            && let Some(inode) = self.inode_table.get(&dentry.ino.0).await
+        {
+            return Ok(TrackedINode { inode });
+        }
+        // Inode was evicted from the table — fall through to the slow path.
+
+        let name_owned = name.to_os_string();
+        let name_for_cache = name_owned.clone();
+        let lookup_key = (parent.0, name_owned.clone());
+        let dp = self.data_provider.clone();
+
+        let child = self
+            .lookup_cache
+            .get_or_try_init(lookup_key, || async move {
+                dp.lookup(parent_ino, &name_owned).await
+            })
+            .await?;
+
+        self.inode_table
+            .get_or_init(child.addr, || async move { child })
+            .await;
+
+        self.directory_cache
+            .insert(
+                parent,
+                name_for_cache,
+                LoadedAddr(child.addr),
+                matches!(child.itype, INodeType::Directory),
+            )
+            .await;
+
+        Ok(TrackedINode { inode: child })
+    }
+
+    /// Retrieve an inode that is expected to already be loaded.
+    ///
+    /// If the inode is currently in-flight (being loaded by another caller), this awaits
+    /// completion. Returns an error if the inode is not in the table at all.
+    pub async fn loaded_inode(&self, addr: LoadedAddr) -> Result<INode, std::io::Error> {
+        self.inode_table.get(&addr.0).await.ok_or_else(|| {
+            tracing::error!(
+                inode = ?addr.0,
+                "inode not found in table — this is a programming bug"
+            );
+            std::io::Error::from_raw_os_error(libc::ENOENT)
+        })
+    }
+
+    /// Return the attributes of the inode at `addr`.
+    ///
+    /// This is the getattr entry point for the filesystem. Returns the
+    /// cached [`INode`] directly — callers at the FUSE boundary are
+    /// responsible for converting to `fuser::FileAttr`.
+    pub async fn getattr(&self, addr: LoadedAddr) -> Result<INode, std::io::Error> {
+        self.loaded_inode(addr).await
+    }
+
+    /// Open a file for reading.
+    ///
+    /// Validates the inode is not a directory, delegates to the data provider
+    /// to create a [`FileReader`], and returns an [`OpenFile`] that the caller
+    /// owns. Reads go through [`OpenFile::read`].
+    pub async fn open(
+        &self,
+        addr: LoadedAddr,
+        flags: OpenFlags,
+    ) -> Result<OpenFile<DP::Reader>, std::io::Error> {
+        let inode = self.loaded_inode(addr).await?;
+        if inode.itype == INodeType::Directory {
+            return Err(std::io::Error::from_raw_os_error(libc::EISDIR));
+        }
+        let reader = self.data_provider.open(inode, flags).await?;
+        let fh = self.next_fh.fetch_add(1, Ordering::Relaxed);
+        Ok(OpenFile {
+            fh,
+            reader: Arc::new(reader),
+        })
+    }
+
+    /// Iterate directory entries for `parent`, starting from `offset`.
+    ///
+    /// On the first call for a given parent, fetches the directory listing
+    /// from the data provider and populates the directory cache and inode
+    /// table. Subsequent calls serve entries directly from cache.
+    ///
+    /// Entries are yielded in name-sorted order. For each entry, `filler` is
+    /// called with the [`DirEntry`] and the next offset value. If `filler`
+    /// returns `true` (indicating the caller's buffer is full), iteration
+    /// stops early.
+    ///
+    /// # Concurrency
+    ///
+    /// The `readdir_populated` check-then-populate is **not** atomic. If two
+    /// concurrent callers invoke `readdir` for the same parent, both may call
+    /// `dp.readdir()` and insert duplicate children. This is safe when the
+    /// caller serializes access (e.g. via `&mut self` on the `Fs` trait).
+    ///
+    /// TODO(MES-746): Implement `opendir` and `releasedir` to snapshot directory contents and
+    /// avoid racing with `lookup`/`createfile`.
+ pub async fn readdir( + &self, + parent: LoadedAddr, + offset: u64, + mut filler: impl FnMut(DirEntry<'_>, u64) -> bool, + ) -> Result<(), std::io::Error> { + let parent_inode = self.loaded_inode(parent).await?; + if parent_inode.itype != INodeType::Directory { + return Err(std::io::Error::from_raw_os_error(libc::ENOTDIR)); + } + + // Populate the directory cache on first readdir for this parent. + if self.readdir_populated.get(&parent).await.is_none() { + let children = self.data_provider.readdir(parent_inode).await?; + for (name, child_inode) in children { + self.inode_table + .get_or_init(child_inode.addr, || async move { child_inode }) + .await; + self.directory_cache + .insert( + parent, + name, + LoadedAddr(child_inode.addr), + child_inode.itype == INodeType::Directory, + ) + .await; + } + self.readdir_populated + .get_or_init(parent, || async {}) + .await; + } + + let mut children = self.directory_cache.readdir(parent).await; + children.sort_unstable_by(|(a, _), (b, _)| a.cmp(b)); + + #[expect( + clippy::cast_possible_truncation, + reason = "offset fits in usize on supported 64-bit platforms" + )] + for (i, (name, dvalue)) in children.iter().enumerate().skip(offset as usize) { + let inode = self.loaded_inode(dvalue.ino).await?; + let next_offset = (i + 1) as u64; + if filler(DirEntry { name, inode }, next_offset) { + break; + } + } + + Ok(()) + } +} diff --git a/lib/fs/dcache.rs b/lib/fs/dcache.rs new file mode 100644 index 00000000..5138e802 --- /dev/null +++ b/lib/fs/dcache.rs @@ -0,0 +1,65 @@ +use std::ffi::{OsStr, OsString}; + +use crate::fs::LoadedAddr; + +/// Cached metadata for a directory entry. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct DValue { + /// Inode address of this entry. + pub ino: LoadedAddr, + /// Whether this entry is itself a directory. + pub is_dir: bool, +} + +/// In-memory directory entry cache mapping `(parent, name)` to child metadata. +/// +/// Backed by [`scc::HashMap`] for atomic upsert on insert. 
The `readdir` +/// implementation scans the entire map and filters by parent — this is O(n) +/// over the cache size rather than O(log n + k) with an ordered index, but +/// guarantees that `insert` never creates a window where an entry is absent. +#[derive(Default)] +pub struct DCache { + cache: scc::HashMap<(LoadedAddr, OsString), DValue>, +} + +impl DCache { + /// Creates an empty directory cache. + #[must_use] + pub fn new() -> Self { + Self::default() + } + + /// Looks up a single child entry by parent inode and name. + #[must_use] + pub fn lookup(&self, parent_ino: LoadedAddr, name: &OsStr) -> Option { + let key = (parent_ino, name.to_os_string()); + self.cache.read_sync(&key, |_, v| v.clone()) + } + + /// Atomically inserts or overwrites a child entry in the cache. + pub async fn insert( + &self, + parent_ino: LoadedAddr, + name: OsString, + ino: LoadedAddr, + is_dir: bool, + ) { + let key = (parent_ino, name); + let value = DValue { ino, is_dir }; + self.cache.upsert_async(key, value).await; + } + + /// Returns all cached children of `parent_ino` as `(name, value)` pairs. + pub async fn readdir(&self, parent_ino: LoadedAddr) -> Vec<(OsString, DValue)> { + let mut entries = Vec::new(); + self.cache + .iter_async(|key, value| { + if key.0 == parent_ino { + entries.push((key.1.clone(), value.clone())); + } + true + }) + .await; + entries + } +} diff --git a/lib/fs/fuser.rs b/lib/fs/fuser.rs new file mode 100644 index 00000000..50042a24 --- /dev/null +++ b/lib/fs/fuser.rs @@ -0,0 +1,425 @@ +//! FUSE adapter: maps [`fuser::Filesystem`] callbacks to [`AsyncFs`](super::async_fs::AsyncFs). + +use std::collections::HashMap; +use std::ffi::OsStr; +use std::sync::Arc; + +use super::async_fs::{FileReader as _, FsDataProvider}; +use super::{FileHandle, INode, INodeType, InodeAddr, LoadedAddr, OpenFlags}; +use crate::cache::async_backed::FutureBackedCache; +use tracing::{debug, error, instrument}; + +/// Wrapper converting [`std::io::Error`] to errno. 
+#[derive(Debug, thiserror::Error)]
+#[error("{0}")]
+struct FuseIoError(std::io::Error);
+
+#[expect(
+    clippy::wildcard_enum_match_arm,
+    reason = "ErrorKind is non_exhaustive; EIO is the safe default"
+)]
+impl From<FuseIoError> for i32 {
+    fn from(e: FuseIoError) -> Self {
+        e.0.raw_os_error().unwrap_or_else(|| match e.0.kind() {
+            std::io::ErrorKind::NotFound => libc::ENOENT,
+            std::io::ErrorKind::PermissionDenied => libc::EACCES,
+            std::io::ErrorKind::AlreadyExists => libc::EEXIST,
+            _ => libc::EIO,
+        })
+    }
+}
+
+/// Error for read operations.
+#[derive(Debug, thiserror::Error)]
+enum FuseReadError {
+    /// The file handle was not open.
+    #[error("file handle not open")]
+    NotOpen,
+    /// An I/O error occurred during the read.
+    #[error("I/O error: {0}")]
+    Io(#[from] std::io::Error),
+}
+
+impl From<FuseReadError> for i32 {
+    fn from(e: FuseReadError) -> Self {
+        match e {
+            FuseReadError::NotOpen => libc::EBADF,
+            FuseReadError::Io(ref io) => io.raw_os_error().unwrap_or(libc::EIO),
+        }
+    }
+}
+
+/// Error for release operations.
+#[derive(Debug, thiserror::Error)]
+enum FuseReleaseError {
+    /// The file handle was not open.
+    #[error("file handle not open")]
+    NotOpen,
+}
+
+impl From<FuseReleaseError> for i32 {
+    fn from(e: FuseReleaseError) -> Self {
+        match e {
+            FuseReleaseError::NotOpen => libc::EBADF,
+        }
+    }
+}
+
+mod inner {
+    #![allow(clippy::future_not_send, clippy::mem_forget)]
+
+    use ouroboros::self_referencing;
+
+    use crate::cache::async_backed::FutureBackedCache;
+    use crate::drop_ward::DropWard;
+    use crate::fs::async_fs::{AsyncFs, FsDataProvider, InodeForget};
+    use crate::fs::{INode, InodeAddr};
+
+    /// Self-referential struct holding the inode table, refcount ward, and `AsyncFs`.
+    ///
+    /// Both `ward` and `fs` borrow from `table`. The ward manages inode
+    /// refcounts; the fs serves lookup/readdir/open/read operations.
+    #[self_referencing]
+    pub(super) struct FuseBridgeInner<DP: FsDataProvider> {
+        table: FutureBackedCache<InodeAddr, INode>,
+        #[borrows(table)]
+        #[not_covariant]
+        ward: DropWard<&'this FutureBackedCache<InodeAddr, INode>, InodeAddr, InodeForget>,
+        #[borrows(table)]
+        #[covariant]
+        fs: AsyncFs<'this, DP>,
+    }
+
+    impl<DP: FsDataProvider> FuseBridgeInner<DP> {
+        pub(super) fn create(table: FutureBackedCache<InodeAddr, INode>, provider: DP) -> Self {
+            FuseBridgeInnerBuilder {
+                table,
+                ward_builder: |tbl| DropWard::new(tbl),
+                fs_builder: |tbl| AsyncFs::new_preseeded(provider, tbl),
+            }
+            .build()
+        }
+
+        pub(super) fn get_fs(&self) -> &AsyncFs<'_, DP> {
+            self.borrow_fs()
+        }
+
+        pub(super) fn ward_inc(&mut self, addr: InodeAddr) -> usize {
+            self.with_ward_mut(|ward| ward.inc(addr))
+        }
+
+        pub(super) fn ward_dec_count(&mut self, addr: InodeAddr, count: usize) -> Option<usize> {
+            self.with_ward_mut(|ward| ward.dec_count(&addr, count))
+        }
+    }
+}
+
+use inner::FuseBridgeInner;
+
+/// Convert an `INode` to the fuser-specific `FileAttr`.
+fn inode_to_fuser_attr(inode: &INode, block_size: u32) -> fuser::FileAttr {
+    fuser::FileAttr {
+        ino: inode.addr,
+        size: inode.size,
+        blocks: inode.size.div_ceil(512),
+        atime: inode.last_modified_at,
+        mtime: inode.last_modified_at,
+        ctime: inode.last_modified_at,
+        crtime: inode.create_time,
+        kind: inode_type_to_fuser(inode.itype),
+        perm: inode.permissions.bits(),
+        nlink: 1,
+        uid: inode.uid,
+        gid: inode.gid,
+        rdev: 0,
+        blksize: block_size,
+        flags: 0,
+    }
+}
+
+#[expect(
+    clippy::wildcard_enum_match_arm,
+    reason = "INodeType is non_exhaustive; File is the safe default"
+)]
+fn inode_type_to_fuser(itype: INodeType) -> fuser::FileType {
+    match itype {
+        INodeType::Directory => fuser::FileType::Directory,
+        INodeType::Symlink => fuser::FileType::Symlink,
+        _ => fuser::FileType::RegularFile,
+    }
+}
+
+const BLOCK_SIZE: u32 = 4096;
+
+/// Bridges a generic [`FsDataProvider`] to the [`fuser::Filesystem`] trait.
+///
+/// Owns a self-referential inode table + ward + [`AsyncFs`](super::async_fs::AsyncFs),
+/// plus an open-file map and a tokio runtime handle for blocking on async ops.
+pub struct FuserAdapter<DP: FsDataProvider> {
+    inner: FuseBridgeInner<DP>,
+    open_files: HashMap<FileHandle, Arc<DP::Reader>>,
+    runtime: tokio::runtime::Handle,
+}
+
+impl<DP: FsDataProvider> FuserAdapter<DP> {
+    // TODO(markovejnovic): This low TTL is really not ideal. It slows us down a lot, since the
+    // kernel has to ask us for every single lookup all the time.
+    //
+    // I think a better implementation is to implement
+    //
+    //     notify_inval_inode(ino, offset, len)
+    //     notify_inval_entry(parent_ino, name)
+    //
+    // These two functions can be used to invalidate specific entries in the kernel cache when we
+    // know they have changed. This would allow us to set a much higher TTL here.
+    const SHAMEFUL_TTL: std::time::Duration = std::time::Duration::from_secs(1);
+
+    /// Create a new adapter from a pre-seeded inode table and data provider.
+    ///
+    /// The `table` must already have the root inode inserted.
+ pub fn new( + table: FutureBackedCache, + provider: DP, + runtime: tokio::runtime::Handle, + ) -> Self { + Self { + inner: FuseBridgeInner::create(table, provider), + open_files: HashMap::new(), + runtime, + } + } +} + +impl fuser::Filesystem for FuserAdapter { + #[instrument(name = "FuserAdapter::lookup", skip(self, _req, reply))] + fn lookup( + &mut self, + _req: &fuser::Request<'_>, + parent: u64, + name: &OsStr, + reply: fuser::ReplyEntry, + ) { + let result = self.runtime.block_on(async { + let tracked = self + .inner + .get_fs() + .lookup(LoadedAddr(parent), name) + .await + .map_err(FuseIoError)?; + self.inner.ward_inc(tracked.inode.addr); + Ok::<_, FuseIoError>(tracked.inode) + }); + match result { + Ok(inode) => { + let f_attr = inode_to_fuser_attr(&inode, BLOCK_SIZE); + debug!(?f_attr, "replying..."); + reply.entry(&Self::SHAMEFUL_TTL, &f_attr, 0); + } + Err(e) => { + debug!(error = %e, "replying error"); + reply.error(e.into()); + } + } + } + + #[instrument(name = "FuserAdapter::getattr", skip(self, _req, _fh, reply))] + fn getattr( + &mut self, + _req: &fuser::Request<'_>, + ino: u64, + _fh: Option, + reply: fuser::ReplyAttr, + ) { + let result = self.runtime.block_on(async { + self.inner + .get_fs() + .getattr(LoadedAddr(ino)) + .await + .map_err(FuseIoError) + }); + match result { + Ok(inode) => { + let attr = inode_to_fuser_attr(&inode, BLOCK_SIZE); + debug!(?attr, "replying..."); + reply.attr(&Self::SHAMEFUL_TTL, &attr); + } + Err(e) => { + debug!(error = %e, "replying error"); + reply.error(e.into()); + } + } + } + + #[instrument(name = "FuserAdapter::readdir", skip(self, _req, _fh, offset, reply))] + fn readdir( + &mut self, + _req: &fuser::Request<'_>, + ino: u64, + _fh: u64, + offset: i64, + mut reply: fuser::ReplyDirectory, + ) { + let offset_u64 = offset.cast_unsigned(); + let result = self.runtime.block_on(async { + let mut entries = Vec::new(); + self.inner + .get_fs() + .readdir(LoadedAddr(ino), offset_u64, |de, _next_offset| { + 
entries.push((de.inode.addr, de.name.to_os_string(), de.inode.itype)); + false + }) + .await + .map_err(FuseIoError)?; + Ok::<_, FuseIoError>(entries) + }); + + let entries = match result { + Ok(entries) => entries, + Err(e) => { + debug!(error = %e, "replying error"); + reply.error(e.into()); + return; + } + }; + + #[expect( + clippy::cast_possible_truncation, + reason = "offset fits in usize on supported 64-bit platforms" + )] + for (i, (entry_ino, entry_name, entry_itype)) in entries.iter().enumerate() { + let kind = inode_type_to_fuser(*entry_itype); + let abs_idx = offset_u64 as usize + i + 1; + let Ok(idx): Result = abs_idx.try_into() else { + error!("Directory entry index {} too large for fuser", abs_idx); + reply.error(libc::EIO); + return; + }; + + debug!(?entry_name, ino = entry_ino, "adding entry to reply..."); + if reply.add(*entry_ino, idx, kind, entry_name) { + debug!("buffer full for now, stopping readdir"); + break; + } + } + + debug!("finalizing reply..."); + reply.ok(); + } + + #[instrument(name = "FuserAdapter::open", skip(self, _req, flags, reply))] + fn open(&mut self, _req: &fuser::Request<'_>, ino: u64, flags: i32, reply: fuser::ReplyOpen) { + let flags = OpenFlags::from_bits_truncate(flags); + let result = self.runtime.block_on(async { + let open_file = self + .inner + .get_fs() + .open(LoadedAddr(ino), flags) + .await + .map_err(FuseIoError)?; + let fh = open_file.fh; + self.open_files.insert(fh, Arc::clone(&open_file.reader)); + Ok::<_, FuseIoError>(fh) + }); + match result { + Ok(fh) => { + debug!(handle = fh, "replying..."); + reply.opened(fh, 0); + } + Err(e) => { + debug!(error = %e, "replying error"); + reply.error(e.into()); + } + } + } + + #[instrument( + name = "FuserAdapter::read", + skip(self, _req, _ino, fh, offset, size, _flags, _lock_owner, reply) + )] + fn read( + &mut self, + _req: &fuser::Request<'_>, + _ino: u64, + fh: u64, + offset: i64, + size: u32, + _flags: i32, + _lock_owner: Option, + reply: fuser::ReplyData, + ) { + 
let result: Result<_, FuseReadError> = self.runtime.block_on(async { + let reader = self.open_files.get(&fh).ok_or(FuseReadError::NotOpen)?; + Ok(reader.read(offset.cast_unsigned(), size).await?) + }); + match result { + Ok(data) => { + debug!(read_bytes = data.len(), "replying..."); + reply.data(&data); + } + Err(e) => { + debug!(error = %e, "replying error"); + reply.error(e.into()); + } + } + } + + #[instrument( + name = "FuserAdapter::release", + skip(self, _req, _ino, fh, _flags, _lock_owner, _flush, reply) + )] + fn release( + &mut self, + _req: &fuser::Request<'_>, + _ino: u64, + fh: u64, + _flags: i32, + _lock_owner: Option, + _flush: bool, + reply: fuser::ReplyEmpty, + ) { + let result: Result<_, FuseReleaseError> = match self.open_files.remove(&fh) { + Some(reader) => { + if let Err(e) = self.runtime.block_on(reader.close()) { + debug!(error = %e, "reader close reported error"); + } + Ok(()) + } + None => Err(FuseReleaseError::NotOpen), + }; + match result { + Ok(()) => { + debug!("replying ok"); + reply.ok(); + } + Err(e) => { + debug!(error = %e, "replying error"); + reply.error(e.into()); + } + } + } + + #[expect( + clippy::cast_possible_truncation, + reason = "nlookups fits in usize on supported 64-bit platforms" + )] + #[instrument(name = "FuserAdapter::forget", skip(self, _req, nlookup))] + fn forget(&mut self, _req: &fuser::Request<'_>, ino: u64, nlookup: u64) { + self.inner.ward_dec_count(ino, nlookup as usize); + } + + #[instrument(name = "FuserAdapter::statfs", skip(self, _req, _ino, reply))] + fn statfs(&mut self, _req: &fuser::Request<'_>, _ino: u64, reply: fuser::ReplyStatfs) { + let stats = self.inner.get_fs().statfs(); + debug!(?stats, "replying..."); + reply.statfs( + stats.total_blocks, + stats.free_blocks, + stats.available_blocks, + stats.total_inodes, + stats.free_inodes, + stats.block_size, + stats.max_filename_length, + 0, + ); + } +} diff --git a/lib/fs/mod.rs b/lib/fs/mod.rs new file mode 100644 index 00000000..e8f971b4 --- 
/dev/null +++ b/lib/fs/mod.rs @@ -0,0 +1,188 @@ +//! Useful filesystem generalizations. +/// Async filesystem cache with concurrent inode management. +pub mod async_fs; +/// Directory entry cache for fast parent-child lookups. +pub mod dcache; +/// FUSE adapter: maps [`fuser::Filesystem`] callbacks to [`async_fs::AsyncFs`]. +pub mod fuser; + +pub use async_fs::{InodeForget, InodeLifecycle, OpenFile, TrackedINode}; + +use std::ffi::OsStr; +use std::time::SystemTime; + +use bitflags::bitflags; + +/// Type representing an inode identifier. +pub type InodeAddr = u64; + +/// Represents an inode address that has been loaded into the inode table. +/// +/// This newtype wrapper distinguishes inode addresses that are known to exist +/// in the [`async_fs::AsyncFs`] inode table from raw [`InodeAddr`] values. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub struct LoadedAddr(pub InodeAddr); + +/// Type representing a file handle. +pub type FileHandle = u64; + +bitflags! { + /// Permission bits for an inode, similar to Unix file permissions. + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] + pub struct InodePerms: u16 { + /// Other: execute permission. + const OTHER_EXECUTE = 1 << 0; + /// Other: write permission. + const OTHER_WRITE = 1 << 1; + /// Other: read permission. + const OTHER_READ = 1 << 2; + + /// Group: execute permission. + const GROUP_EXECUTE = 1 << 3; + /// Group: write permission. + const GROUP_WRITE = 1 << 4; + /// Group: read permission. + const GROUP_READ = 1 << 5; + + /// Owner: execute permission. + const OWNER_EXECUTE = 1 << 6; + /// Owner: write permission. + const OWNER_WRITE = 1 << 7; + /// Owner: read permission. + const OWNER_READ = 1 << 8; + + /// Sticky bit. + const STICKY = 1 << 9; + /// Set-group-ID bit. + const SETGID = 1 << 10; + /// Set-user-ID bit. + const SETUID = 1 << 11; + + /// Other: read, write, and execute. 
+ const OTHER_RWX = Self::OTHER_READ.bits() + | Self::OTHER_WRITE.bits() + | Self::OTHER_EXECUTE.bits(); + /// Group: read, write, and execute. + const GROUP_RWX = Self::GROUP_READ.bits() + | Self::GROUP_WRITE.bits() + | Self::GROUP_EXECUTE.bits(); + /// Owner: read, write, and execute. + const OWNER_RWX = Self::OWNER_READ.bits() + | Self::OWNER_WRITE.bits() + | Self::OWNER_EXECUTE.bits(); + } +} + +bitflags! { + /// Flags for opening a file, similar to Unix open(2) flags. + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] + pub struct OpenFlags: i32 { + /// Open for reading only. + const RDONLY = libc::O_RDONLY; + /// Open for writing only. + const WRONLY = libc::O_WRONLY; + /// Open for reading and writing. + const RDWR = libc::O_RDWR; + + /// Append on each write. + const APPEND = libc::O_APPEND; + /// Truncate to zero length. + const TRUNC = libc::O_TRUNC; + /// Create file if it does not exist. + const CREAT = libc::O_CREAT; + /// Error if file already exists (with `CREAT`). + const EXCL = libc::O_EXCL; + + /// Non-blocking mode. + const NONBLOCK = libc::O_NONBLOCK; + /// Synchronous writes. + const SYNC = libc::O_SYNC; + /// Synchronous data integrity writes. + const DSYNC = libc::O_DSYNC; + /// Do not follow symlinks. + const NOFOLLOW = libc::O_NOFOLLOW; + /// Set close-on-exec. + const CLOEXEC = libc::O_CLOEXEC; + /// Fail if not a directory. + const DIRECTORY = libc::O_DIRECTORY; + + /// Do not update access time (Linux only). + #[cfg(target_os = "linux")] + const NOATIME = libc::O_NOATIME; + } +} + +/// The type of an inode entry in the filesystem. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[non_exhaustive] +pub enum INodeType { + /// A regular file. + File, + /// A directory. + Directory, + /// A symbolic link. + Symlink, +} + +/// Representation of an inode. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct INode { + /// The address of this inode, which serves as its unique identifier. 
+ pub addr: InodeAddr, + /// The permissions associated with this inode, represented as a bitfield. + pub permissions: InodePerms, + /// The user ID of the owner of this inode. + pub uid: u32, + /// The group ID of the owner of this inode. + pub gid: u32, + /// The time this inode was created at. + pub create_time: SystemTime, + /// The time this inode was last modified at. + pub last_modified_at: SystemTime, + /// The parent inode address, if any. This is `None` for the root inode. + pub parent: Option, + /// The size of the file represented by this inode, in bytes. + pub size: u64, + /// Additional information about the type of this inode (e.g., file vs directory). + pub itype: INodeType, +} + +impl INode { + /// Check if this inode is the root inode (i.e., has no parent). + #[must_use] + pub fn is_root(&self) -> bool { + self.parent.is_none() + } +} + +/// A directory entry yielded by [`async_fs::AsyncFs::readdir`]. +/// +/// Borrows the entry name from the directory cache's iteration buffer. +#[derive(Debug, Clone, Copy)] +pub struct DirEntry<'a> { + /// The name of this entry within its parent directory. + pub name: &'a OsStr, + /// The full inode data for this entry. + pub inode: INode, +} + +/// Filesystem statistics returned by [`async_fs::AsyncFs::statfs`]. +/// +/// Block-related sizes are in units of `block_size` bytes. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct AsyncFsStats { + /// Filesystem block size (bytes). + pub block_size: u32, + /// Total number of data blocks. + pub total_blocks: u64, + /// Number of free blocks. + pub free_blocks: u64, + /// Number of blocks available to unprivileged users. + pub available_blocks: u64, + /// Total number of file nodes (inodes). + pub total_inodes: u64, + /// Number of free file nodes. + pub free_inodes: u64, + /// Maximum filename length (bytes). 
+ pub max_filename_length: u32, +} diff --git a/lib/lib.rs b/lib/lib.rs index f7388bd5..40b1e8f2 100644 --- a/lib/lib.rs +++ b/lib/lib.rs @@ -2,4 +2,7 @@ /// Caching primitives for git-fs. pub mod cache; +pub mod drop_ward; +/// Filesystem abstractions and caching layers. +pub mod fs; pub mod io; diff --git a/src/daemon.rs b/src/daemon.rs index dac2d052..0a7a9f31 100644 --- a/src/daemon.rs +++ b/src/daemon.rs @@ -14,9 +14,13 @@ mod managed_fuse { use nix::errno::Errno; + use git_fs::cache::async_backed::FutureBackedCache; + use git_fs::fs::{INode, INodeType, InodePerms}; + use super::{MesaFS, OrgConfig, app_config, debug, error}; - use crate::fs::fuser::FuserAdapter; + use crate::fs::mescloud::MesaFsProvider; use fuser::BackgroundSession; + use git_fs::fs::fuser::FuserAdapter; pub struct FuseCoreScope { _session: BackgroundSession, @@ -44,7 +48,24 @@ mod managed_fuse { api_key: org.api_key.clone(), }); let mesa_fs = MesaFS::new(orgs, (config.uid, config.gid), &config.cache); - let fuse_adapter = FuserAdapter::new(mesa_fs, handle); + + let table = FutureBackedCache::default(); + let now = std::time::SystemTime::now(); + let root = INode { + addr: 1, + permissions: InodePerms::from_bits_truncate(0o755), + uid: config.uid, + gid: config.gid, + create_time: now, + last_modified_at: now, + parent: None, + size: 0, + itype: INodeType::Directory, + }; + table.insert_sync(1, root); + + let provider = MesaFsProvider::new(mesa_fs); + let fuse_adapter = FuserAdapter::new(table, provider, handle); let mount_opts = [ fuser::MountOption::FSName("git-fs".to_owned()), fuser::MountOption::RO, diff --git a/src/fs/fuser.rs b/src/fs/fuser.rs deleted file mode 100644 index 86ddabb6..00000000 --- a/src/fs/fuser.rs +++ /dev/null @@ -1,351 +0,0 @@ -use std::ffi::OsStr; - -use crate::fs::r#trait::{CommonFileAttr, DirEntryType, FileAttr, Fs, LockOwner, OpenFlags}; -use tracing::{debug, error, instrument}; - -impl From for fuser::FileAttr { - fn from(val: FileAttr) -> Self { - fn 
common_to_fuser(common: CommonFileAttr) -> fuser::FileAttr { - fuser::FileAttr { - ino: common.ino, - size: 0, - blocks: 0, - atime: common.atime, - mtime: common.mtime, - ctime: common.ctime, - crtime: common.crtime, - kind: fuser::FileType::RegularFile, - perm: common.perm.bits(), - nlink: common.nlink, - uid: common.uid, - gid: common.gid, - rdev: 0, - blksize: common.blksize, - flags: 0, - } - } - - match val { - FileAttr::RegularFile { - common, - size, - blocks, - } => { - let mut attr = common_to_fuser(common); - attr.size = size; - attr.blocks = blocks; - attr.kind = fuser::FileType::RegularFile; - attr - } - FileAttr::Directory { common } => { - let mut attr = common_to_fuser(common); - attr.kind = fuser::FileType::Directory; - attr - } - FileAttr::Symlink { common, size } => { - let mut attr = common_to_fuser(common); - attr.size = size; - attr.kind = fuser::FileType::Symlink; - attr - } - FileAttr::CharDevice { common, rdev } => { - let mut attr = common_to_fuser(common); - debug_assert!(u32::try_from(rdev).is_ok(), "rdev value {rdev} too large"); - attr.rdev = rdev - .try_into() - .map_err(|_| { - error!("rdev value {rdev} too large for fuser::FileAttr"); - }) - .unwrap_or(0); - attr.kind = fuser::FileType::CharDevice; - attr - } - FileAttr::BlockDevice { common, rdev } => { - let mut attr = common_to_fuser(common); - debug_assert!(u32::try_from(rdev).is_ok(), "rdev value {rdev} too large"); - attr.rdev = rdev - .try_into() - .map_err(|_| { - error!("rdev value {rdev} too large for fuser::FileAttr"); - }) - .unwrap_or(0); - attr.kind = fuser::FileType::BlockDevice; - attr - } - FileAttr::NamedPipe { common } => { - let mut attr = common_to_fuser(common); - attr.kind = fuser::FileType::NamedPipe; - attr - } - FileAttr::Socket { common } => { - let mut attr = common_to_fuser(common); - attr.kind = fuser::FileType::Socket; - attr - } - } - } -} - -impl From for fuser::FileType { - fn from(val: DirEntryType) -> Self { - match val { - 
DirEntryType::RegularFile => Self::RegularFile, - DirEntryType::Directory => Self::Directory, - DirEntryType::Symlink => Self::Symlink, - DirEntryType::CharDevice => Self::CharDevice, - DirEntryType::BlockDevice => Self::BlockDevice, - DirEntryType::NamedPipe => Self::NamedPipe, - DirEntryType::Socket => Self::Socket, - } - } -} - -impl From for OpenFlags { - fn from(val: i32) -> Self { - Self::from_bits_truncate(val) - } -} - -pub struct FuserAdapter -where - F::LookupError: Into, - F::GetAttrError: Into, - F::OpenError: Into, - F::ReadError: Into, - F::ReaddirError: Into, - F::ReleaseError: Into, -{ - fs: F, - runtime: tokio::runtime::Handle, -} - -impl FuserAdapter -where - F::LookupError: Into, - F::GetAttrError: Into, - F::OpenError: Into, - F::ReadError: Into, - F::ReaddirError: Into, - F::ReleaseError: Into, -{ - // TODO(markovejnovic): This low TTL is really not ideal. It slows us down a lot, since the - // kernel has to ask us for every single lookup all the time. - // - // I think a better implementation is to implement - // - // notify_inval_inode(ino, offset, len) - // notify_inval_entry(parent_ino, name) - // - // These two functions can be used to invalidate specific entries in the kernel cache when we - // know they have changed. This would allow us to set a much higher TTL here. 
- const SHAMEFUL_TTL: std::time::Duration = std::time::Duration::from_secs(1); - - pub fn new(fs: F, runtime: tokio::runtime::Handle) -> Self { - Self { fs, runtime } - } -} - -impl fuser::Filesystem for FuserAdapter -where - F::LookupError: Into, - F::GetAttrError: Into, - F::OpenError: Into, - F::ReadError: Into, - F::ReaddirError: Into, - F::ReleaseError: Into, -{ - #[instrument(name = "FuserAdapter::lookup", skip(self, _req, reply))] - fn lookup( - &mut self, - _req: &fuser::Request<'_>, - parent: u64, - name: &OsStr, - reply: fuser::ReplyEntry, - ) { - match self.runtime.block_on(self.fs.lookup(parent, name)) { - Ok(attr) => { - // TODO(markovejnovic): Passing generation = 0 here is a recipe for disaster. - // Someone with A LOT of files will likely see inode reuse which will lead to a - // disaster. - let f_attr: fuser::FileAttr = attr.into(); - debug!(?f_attr, "replying..."); - reply.entry(&Self::SHAMEFUL_TTL, &f_attr, 0); - } - Err(e) => { - debug!(error = %e, "replying error"); - reply.error(e.into()); - } - } - } - - #[instrument(name = "FuserAdapter::getattr", skip(self, _req, fh, reply))] - fn getattr( - &mut self, - _req: &fuser::Request<'_>, - ino: u64, - fh: Option, - reply: fuser::ReplyAttr, - ) { - match self.runtime.block_on(self.fs.getattr(ino, fh)) { - Ok(attr) => { - debug!(?attr, "replying..."); - reply.attr(&Self::SHAMEFUL_TTL, &attr.into()); - } - Err(e) => { - debug!(error = %e, "replying error"); - reply.error(e.into()); - } - } - } - - #[instrument(name = "FuserAdapter::readdir", skip(self, _req, _fh, offset, reply))] - fn readdir( - &mut self, - _req: &fuser::Request<'_>, - ino: u64, - _fh: u64, - offset: i64, - mut reply: fuser::ReplyDirectory, - ) { - let entries = match self.runtime.block_on(self.fs.readdir(ino)) { - Ok(entries) => entries, - Err(e) => { - debug!(error = %e, "replying error"); - reply.error(e.into()); - return; - } - }; - - #[expect( - clippy::cast_possible_truncation, - reason = "fuser offset is i64 but always 
non-negative" - )] - for (i, entry) in entries - .iter() - .enumerate() - .skip(offset.cast_unsigned() as usize) - { - let kind: fuser::FileType = entry.kind.into(); - let Ok(idx): Result = (i + 1).try_into() else { - error!("Directory entry index {} too large for fuser", i + 1); - reply.error(libc::EIO); - return; - }; - - debug!(?entry, "adding entry to reply..."); - if reply.add(entry.ino, idx, kind, &entry.name) { - debug!("buffer full for now, stopping readdir"); - break; - } - } - - debug!("finalizing reply..."); - reply.ok(); - } - - #[instrument(name = "FuserAdapter::open", skip(self, _req, flags, reply))] - fn open(&mut self, _req: &fuser::Request<'_>, ino: u64, flags: i32, reply: fuser::ReplyOpen) { - match self.runtime.block_on(self.fs.open(ino, flags.into())) { - Ok(open_file) => { - debug!(handle = open_file.handle, "replying..."); - reply.opened(open_file.handle, 0); - } - Err(e) => { - debug!(error = %e, "replying error"); - reply.error(e.into()); - } - } - } - - #[instrument( - name = "FuserAdapter::read", - skip(self, _req, fh, offset, size, flags, lock_owner, reply) - )] - fn read( - &mut self, - _req: &fuser::Request<'_>, - ino: u64, - fh: u64, - offset: i64, - size: u32, - flags: i32, - lock_owner: Option, - reply: fuser::ReplyData, - ) { - let flags: OpenFlags = flags.into(); - let lock_owner = lock_owner.map(LockOwner); - match self.runtime.block_on(self.fs.read( - ino, - fh, - offset.cast_unsigned(), - size, - flags, - lock_owner, - )) { - Ok(data) => { - debug!(read_bytes = data.len(), "replying..."); - reply.data(&data); - } - Err(e) => { - debug!(error = %e, "replying error"); - reply.error(e.into()); - } - } - } - - #[instrument(name = "FuserAdapter::release", skip(self, _req, _lock_owner, reply))] - fn release( - &mut self, - _req: &fuser::Request<'_>, - ino: u64, - fh: u64, - flags: i32, - _lock_owner: Option, - flush: bool, - reply: fuser::ReplyEmpty, - ) { - match self - .runtime - .block_on(self.fs.release(ino, fh, flags.into(), 
flush)) - { - Ok(()) => { - debug!("replying ok"); - reply.ok(); - } - Err(e) => { - debug!(error = %e, "replying error"); - reply.error(e.into()); - } - } - } - - #[instrument(name = "FuserAdapter::forget", skip(self, _req, nlookup))] - fn forget(&mut self, _req: &fuser::Request<'_>, ino: u64, nlookup: u64) { - self.runtime.block_on(self.fs.forget(ino, nlookup)); - } - - #[instrument(name = "FuserAdapter::statfs", skip(self, _req, _ino, reply))] - fn statfs(&mut self, _req: &fuser::Request<'_>, _ino: u64, reply: fuser::ReplyStatfs) { - self.runtime.block_on(async { - match self.fs.statfs().await { - Ok(statvfs) => { - debug!(?statvfs, "replying..."); - reply.statfs( - statvfs.total_blocks, - statvfs.free_blocks, - statvfs.available_blocks, - statvfs.total_inodes, - statvfs.free_inodes, - statvfs.block_size, - statvfs.max_filename_length, - 0, - ); - } - Err(e) => { - debug!(error = %e, "replying error"); - reply.error(e.raw_os_error().unwrap_or(libc::EIO)); - } - } - }); - } -} diff --git a/src/fs/icache/async_cache.rs b/src/fs/icache/async_cache.rs deleted file mode 100644 index 84003da3..00000000 --- a/src/fs/icache/async_cache.rs +++ /dev/null @@ -1,1410 +0,0 @@ -//! Async inode cache with InFlight/Available state machine. - -use std::future::Future; - -use scc::HashMap as ConcurrentHashMap; -use tokio::sync::watch; - -use tracing::{instrument, trace, warn}; - -use crate::fs::r#trait::Inode; - -use super::IcbLike; - -/// State of an entry in the async inode cache. -pub enum IcbState { - /// Entry is being loaded; waiters clone the receiver and `.changed().await`. - /// - /// The channel carries `()` rather than the resolved value because the map - /// is the single source of truth: ICBs are mutated in-place (rc, attrs) so - /// a snapshot in the channel would immediately go stale. Sender-drop also - /// gives us implicit, leak-proof signalling on both success and error paths. - InFlight(watch::Receiver<()>), - /// Entry is ready for use. 
- Available(I), -} - -impl IcbState { - /// Consume `self`, returning the inner value if `Available`, or `None` if `InFlight`. - fn into_available(self) -> Option { - match self { - Self::Available(inner) => Some(inner), - Self::InFlight(_) => None, - } - } -} - -/// Trait for resolving an inode to its control block. -/// -/// Implementations act as a "promise" that an ICB will eventually be produced -/// for a given inode. The cache calls `resolve` when it needs to populate a -/// missing entry. -pub trait IcbResolver: Send + Sync { - /// The inode control block type this resolver produces. - type Icb: IcbLike + Send + Sync; - /// Error type returned when resolution fails. - type Error: Send; - - /// Resolve an inode to a fully-populated control block. - /// - /// - `stub`: `Some(icb)` if upgrading an existing stub entry, `None` if creating - /// from scratch. The stub typically has `parent` and `path` set but `attr` missing. - /// - `cache`: reference to the cache, useful for walking parent chains to build paths. - fn resolve( - &self, - ino: Inode, - stub: Option, - cache: &AsyncICache, - ) -> impl Future> + Send - where - Self: Sized; -} - -/// Async, concurrency-safe inode cache. -/// -/// All methods take `&self` — internal synchronization is provided by -/// `scc::HashMap` (sharded lock-free map). -pub struct AsyncICache { - resolver: R, - inode_table: ConcurrentHashMap>, -} - -impl AsyncICache { - /// Create a new cache with a root ICB at `root_ino` (rc = 1). - pub fn new(resolver: R, root_ino: Inode, root_path: impl Into) -> Self { - let table = ConcurrentHashMap::new(); - // insert_sync is infallible for a fresh map - drop(table.insert_sync( - root_ino, - IcbState::Available(R::Icb::new_root(root_path.into())), - )); - Self { - resolver, - inode_table: table, - } - } - - /// Number of entries (`InFlight` + `Available`) in the table. - pub fn inode_count(&self) -> usize { - self.inode_table.len() - } - - /// Wait until `ino` is `Available`. 
- /// Returns `true` if the entry exists and is Available, - /// `false` if the entry does not exist. - #[instrument(name = "AsyncICache::wait_for_available", skip(self))] - async fn wait_for_available(&self, ino: Inode) -> bool { - loop { - let rx = self - .inode_table - .read_async(&ino, |_, s| match s { - IcbState::InFlight(rx) => Some(rx.clone()), - IcbState::Available(_) => None, - }) - .await; - - match rx { - None => return false, // key missing - Some(None) => return true, // Available - Some(Some(mut rx)) => { - // Wait for the resolver to complete (or fail/drop sender). - // changed() returns Err(RecvError) when sender is dropped, - // which is fine — it means resolution finished. - let _ = rx.changed().await; - // Loop back — the entry might be InFlight again if another - // resolution cycle started between our wakeup and re-read. - } - } - } - } - - /// Check whether `ino` has an entry in the table (either `InFlight` or `Available`). - /// - /// This is a non-blocking, synchronous check. It does **not** wait for - /// `InFlight` entries to resolve. - pub fn contains(&self, ino: Inode) -> bool { - self.inode_table.contains_sync(&ino) - } - - /// Read an ICB via closure. **Awaits** if `InFlight`. - /// Returns `None` if `ino` doesn't exist. - #[instrument(name = "AsyncICache::get_icb", skip(self, f))] - // `Sync` is required because `f` is held across `.await` points in the - // loop body; for the resulting future to be `Send`, the captured closure - // must be `Sync` (clippy::future_not_send). 
- pub async fn get_icb( - &self, - ino: Inode, - f: impl Fn(&R::Icb) -> T + Send + Sync, - ) -> Option { - loop { - if !self.wait_for_available(ino).await { - return None; - } - let result = self - .inode_table - .read_async(&ino, |_, state| match state { - IcbState::Available(icb) => Some(f(icb)), - IcbState::InFlight(_) => None, - }) - .await; - match result { - Some(Some(val)) => return Some(val), - Some(None) => {} // was InFlight, retry - None => return None, // key missing - } - } - } - - /// Mutate an ICB via closure. **Awaits** if `InFlight`. - /// Returns `None` if `ino` doesn't exist. - #[instrument(name = "AsyncICache::get_icb_mut", skip(self, f))] - pub async fn get_icb_mut( - &self, - ino: Inode, - mut f: impl FnMut(&mut R::Icb) -> T + Send, - ) -> Option { - loop { - if !self.wait_for_available(ino).await { - return None; - } - let result = self - .inode_table - .update_async(&ino, |_, state| match state { - IcbState::Available(icb) => Some(f(icb)), - IcbState::InFlight(_) => None, - }) - .await; - match result { - Some(Some(val)) => return Some(val), - Some(None) => {} // was InFlight, retry - None => return None, // key missing - } - } - } - - /// Insert an ICB directly as `Available`. If the entry is currently - /// `InFlight`, waits for resolution before overwriting. 
- #[instrument(name = "AsyncICache::insert_icb", skip(self, icb))] - pub async fn insert_icb(&self, ino: Inode, icb: R::Icb) { - use scc::hash_map::Entry; - let mut icb = Some(icb); - loop { - match self.inode_table.entry_async(ino).await { - Entry::Vacant(vac) => { - let val = icb - .take() - .unwrap_or_else(|| unreachable!("icb consumed more than once")); - vac.insert_entry(IcbState::Available(val)); - return; - } - Entry::Occupied(mut occ) => match occ.get_mut() { - IcbState::InFlight(rx) => { - let mut rx = rx.clone(); - drop(occ); - let _ = rx.changed().await; - } - IcbState::Available(_) => { - let val = icb - .take() - .unwrap_or_else(|| unreachable!("icb consumed more than once")); - *occ.get_mut() = IcbState::Available(val); - return; - } - }, - } - } - } - - /// Get-or-insert pattern. If `ino` exists (awaits `InFlight`), runs `then` - /// on it. If absent, calls `factory` to create, inserts, then runs `then`. - /// - /// Both `factory` and `then` are `FnOnce` — wrapped in `Option` internally - /// to satisfy the borrow checker across the await-loop. 
- #[instrument(name = "AsyncICache::entry_or_insert_icb", skip(self, factory, then))] - pub async fn entry_or_insert_icb( - &self, - ino: Inode, - factory: impl FnOnce() -> R::Icb, - then: impl FnOnce(&mut R::Icb) -> T, - ) -> T { - use scc::hash_map::Entry; - let mut factory = Some(factory); - let mut then_fn = Some(then); - - loop { - match self.inode_table.entry_async(ino).await { - Entry::Occupied(mut occ) => match occ.get_mut() { - IcbState::Available(icb) => { - let t = then_fn - .take() - .unwrap_or_else(|| unreachable!("then_fn consumed more than once")); - return t(icb); - } - IcbState::InFlight(rx) => { - let mut rx = rx.clone(); - drop(occ); // release shard lock before awaiting - let _ = rx.changed().await; - } - }, - Entry::Vacant(vac) => { - let f = factory - .take() - .unwrap_or_else(|| unreachable!("factory consumed more than once")); - let t = then_fn - .take() - .unwrap_or_else(|| unreachable!("then_fn consumed more than once")); - let mut icb = f(); - let result = t(&mut icb); - vac.insert_entry(IcbState::Available(icb)); - return result; - } - } - } - } - - /// Write an ICB back to the table only if the entry still exists. - /// - /// If the entry was evicted (vacant) during resolution, the result is - /// silently dropped — this prevents resurrecting entries that a concurrent - /// `forget` has already removed. - async fn write_back_if_present(&self, ino: Inode, icb: R::Icb) { - use scc::hash_map::Entry; - match self.inode_table.entry_async(ino).await { - Entry::Occupied(mut occ) => { - *occ.get_mut() = IcbState::Available(icb); - } - Entry::Vacant(_) => { - tracing::debug!( - ino, - "resolved inode was evicted during resolution, dropping result" - ); - } - } - } - - /// Look up `ino`. If `Available` and fully resolved, run `then` and return - /// `Ok(T)`. If `Available` but `needs_resolve()` is true (stub), extract - /// the stub, resolve it, cache the result, then run `then`. 
If absent, call - /// the resolver to fetch the ICB, cache it, then run `then`. If another task - /// is already resolving this inode (`InFlight`), wait for it. - /// - /// Returns `Err(R::Error)` if resolution fails. On error the `InFlight` - /// entry is removed so subsequent calls can retry. - #[instrument(name = "AsyncICache::get_or_resolve", skip(self, then))] - pub async fn get_or_resolve( - &self, - ino: Inode, - then: impl FnOnce(&R::Icb) -> T, - ) -> Result { - use scc::hash_map::Entry; - - let mut then_fn = Some(then); - - // Fast path: Available and fully resolved - { - let hit = self - .inode_table - .read_async(&ino, |_, s| match s { - IcbState::Available(icb) if !icb.needs_resolve() => { - let t = then_fn - .take() - .unwrap_or_else(|| unreachable!("then_fn consumed more than once")); - Some(t(icb)) - } - IcbState::InFlight(_) | IcbState::Available(_) => None, - }) - .await; - if let Some(Some(r)) = hit { - return Ok(r); - } - } - - // Slow path: missing, InFlight, or stub needing resolution - loop { - match self.inode_table.entry_async(ino).await { - Entry::Occupied(mut occ) => match occ.get_mut() { - IcbState::Available(icb) if !icb.needs_resolve() => { - let t = then_fn - .take() - .unwrap_or_else(|| unreachable!("then_fn consumed more than once")); - return Ok(t(icb)); - } - IcbState::Available(_) => { - // Stub needing resolution — extract stub, replace with InFlight - let (tx, rx) = watch::channel(()); - let old = std::mem::replace(occ.get_mut(), IcbState::InFlight(rx)); - let stub = old.into_available().unwrap_or_else(|| { - unreachable!("matched Available arm, replaced value must be Available") - }); - let fallback = stub.clone(); - drop(occ); // release shard lock before awaiting - - match self.resolver.resolve(ino, Some(stub), self).await { - Ok(icb) => { - let t = then_fn.take().unwrap_or_else(|| { - unreachable!("then_fn consumed more than once") - }); - let result = t(&icb); - self.write_back_if_present(ino, icb).await; - drop(tx); - 
return Ok(result); - } - Err(e) => { - if fallback.rc() > 0 { - self.write_back_if_present(ino, fallback).await; - } else { - self.inode_table.remove_async(&ino).await; - } - drop(tx); - return Err(e); - } - } - } - IcbState::InFlight(rx) => { - let mut rx = rx.clone(); - drop(occ); - let _ = rx.changed().await; - } - }, - Entry::Vacant(vac) => { - let (tx, rx) = watch::channel(()); - vac.insert_entry(IcbState::InFlight(rx)); - - match self.resolver.resolve(ino, None, self).await { - Ok(icb) => { - let t = then_fn - .take() - .unwrap_or_else(|| unreachable!("then_fn consumed more than once")); - let result = t(&icb); - self.write_back_if_present(ino, icb).await; - drop(tx); - return Ok(result); - } - Err(e) => { - self.inode_table.remove_async(&ino).await; - drop(tx); - return Err(e); - } - } - } - } - } - } - - /// Increment rc. **Awaits** `InFlight`. - /// - /// Returns `None` if the inode does not exist or was evicted concurrently. - /// This can happen when a concurrent `forget` removes the entry between the - /// caller's insert/cache and this `inc_rc` call, or when a concurrent - /// `get_or_resolve` swaps the entry to `InFlight` and the entry is then - /// evicted on resolution failure. Callers in FUSE `lookup` paths should - /// treat `None` as a lookup failure to avoid ref-count leaks (the kernel - /// would hold a reference the cache no longer tracks). - #[instrument(name = "AsyncICache::inc_rc", skip(self))] - pub async fn inc_rc(&self, ino: Inode) -> Option { - loop { - if !self.wait_for_available(ino).await { - warn!(ino, "inc_rc: inode not in table"); - return None; - } - let result = self - .inode_table - .update_async(&ino, |_, state| match state { - IcbState::Available(icb) => { - *icb.rc_mut() += 1; - Some(icb.rc()) - } - IcbState::InFlight(_) => None, - }) - .await - .flatten(); - - match result { - Some(rc) => return Some(rc), - None => { - // Entry was concurrently replaced with InFlight or evicted. 
- if !self.contains(ino) { - warn!(ino, "inc_rc: inode evicted concurrently"); - return None; - } - // Entry exists but became InFlight — retry. - } - } - } - } - - /// Decrement rc by `nlookups`. If rc drops to zero, evicts and returns - /// the ICB. **Awaits** `InFlight` entries. - #[instrument(name = "AsyncICache::forget", skip(self))] - pub async fn forget(&self, ino: Inode, nlookups: u64) -> Option { - use scc::hash_map::Entry; - - loop { - match self.inode_table.entry_async(ino).await { - Entry::Occupied(mut occ) => match occ.get_mut() { - IcbState::Available(icb) => { - if icb.rc() <= nlookups { - trace!(ino, "evicting inode"); - let (_, state) = occ.remove_entry(); - return state.into_available(); - } - *icb.rc_mut() -= nlookups; - trace!(ino, new_rc = icb.rc(), "decremented rc"); - return None; - } - IcbState::InFlight(rx) => { - let mut rx = rx.clone(); - drop(occ); - let _ = rx.changed().await; - } - }, - Entry::Vacant(_) => { - warn!(ino, "forget on unknown inode"); - return None; - } - } - } - } - - /// Synchronous mutable access to an `Available` entry. - /// Does **not** wait for `InFlight`. Intended for initialization. - pub fn get_icb_mut_sync(&self, ino: Inode, f: impl FnOnce(&mut R::Icb) -> T) -> Option { - self.inode_table - .update_sync(&ino, |_, state| match state { - IcbState::Available(icb) => Some(f(icb)), - IcbState::InFlight(_) => None, - }) - .flatten() - } - - /// Iterate over all `Available` entries (skips `InFlight`). - /// Async-safe iteration using `iter_async` to avoid contention on single-threaded runtimes. 
- pub async fn for_each(&self, mut f: impl FnMut(&Inode, &R::Icb)) { - self.inode_table - .iter_async(|ino, state| { - if let IcbState::Available(icb) = state { - f(ino, icb); - } - true // continue iteration - }) - .await; - } -} - -#[cfg(test)] -mod tests { - use super::*; - use std::collections::HashMap as StdHashMap; - use std::path::PathBuf; - use std::sync::atomic::Ordering; - use std::sync::{Arc, Mutex}; - - #[derive(Debug, Clone, PartialEq)] - struct TestIcb { - rc: u64, - path: PathBuf, - resolved: bool, - } - - impl IcbLike for TestIcb { - fn new_root(path: PathBuf) -> Self { - Self { - rc: 1, - path, - resolved: true, - } - } - fn rc(&self) -> u64 { - self.rc - } - fn rc_mut(&mut self) -> &mut u64 { - &mut self.rc - } - fn needs_resolve(&self) -> bool { - !self.resolved - } - } - - struct TestResolver { - responses: Mutex>>, - } - - impl TestResolver { - fn new() -> Self { - Self { - responses: Mutex::new(StdHashMap::new()), - } - } - - fn add(&self, ino: Inode, icb: TestIcb) { - self.responses - .lock() - .expect("test mutex") - .insert(ino, Ok(icb)); - } - - fn add_err(&self, ino: Inode, err: impl Into) { - self.responses - .lock() - .expect("test mutex") - .insert(ino, Err(err.into())); - } - } - - impl IcbResolver for TestResolver { - type Icb = TestIcb; - type Error = String; - - fn resolve( - &self, - ino: Inode, - _stub: Option, - _cache: &AsyncICache, - ) -> impl Future> + Send { - let result = self - .responses - .lock() - .expect("test mutex") - .remove(&ino) - .unwrap_or_else(|| Err(format!("no response for inode {ino}"))); - async move { result } - } - } - - fn test_cache() -> AsyncICache { - AsyncICache::new(TestResolver::new(), 1, "/root") - } - - fn test_cache_with(resolver: TestResolver) -> AsyncICache { - AsyncICache::new(resolver, 1, "/root") - } - - #[tokio::test] - async fn contains_returns_true_for_root() { - let cache = test_cache(); - assert!(cache.contains(1), "root should exist"); - } - - #[tokio::test] - async fn 
contains_returns_false_for_missing() { - let cache = test_cache(); - assert!(!cache.contains(999), "missing inode should not exist"); - } - - #[tokio::test] - async fn contains_after_resolver_completes() { - let resolver = TestResolver::new(); - resolver.add( - 42, - TestIcb { - rc: 1, - path: "/test".into(), - resolved: true, - }, - ); - let cache = Arc::new(test_cache_with(resolver)); - - // Trigger resolve in background - let cache2 = Arc::clone(&cache); - let handle = tokio::spawn(async move { cache2.get_or_resolve(42, |_| ()).await }); - - handle - .await - .expect("task panicked") - .expect("resolve failed"); - assert!(cache.contains(42), "should be true after resolve"); - } - - #[tokio::test] - async fn new_creates_root_entry() { - let cache = test_cache(); - assert_eq!(cache.inode_count(), 1, "should have exactly 1 entry"); - } - - #[tokio::test] - async fn get_icb_returns_value() { - let cache = test_cache(); - let path = cache.get_icb(1, |icb| icb.path.clone()).await; - assert_eq!(path, Some(PathBuf::from("/root"))); - } - - #[tokio::test] - async fn get_icb_returns_none_for_missing() { - let cache = test_cache(); - let result = cache.get_icb(999, IcbLike::rc).await; - assert_eq!(result, None, "missing inode should return None"); - } - - #[tokio::test] - async fn get_icb_mut_modifies_value() { - let cache = test_cache(); - cache - .get_icb_mut(1, |icb| { - *icb.rc_mut() += 10; - }) - .await; - let rc = cache.get_icb(1, IcbLike::rc).await; - assert_eq!(rc, Some(11), "root starts at rc=1, +10 = 11"); - } - - #[tokio::test] - async fn get_icb_after_resolver_completes() { - let resolver = TestResolver::new(); - resolver.add( - 42, - TestIcb { - rc: 1, - path: "/loaded".into(), - resolved: true, - }, - ); - let cache = test_cache_with(resolver); - - // Resolve inode 42 - cache - .get_or_resolve(42, |_| ()) - .await - .expect("resolve failed"); - - let path = cache.get_icb(42, |icb| icb.path.clone()).await; - assert_eq!(path, Some(PathBuf::from("/loaded"))); - 
} - - #[tokio::test] - async fn insert_icb_adds_entry() { - let cache = test_cache(); - cache - .insert_icb( - 42, - TestIcb { - rc: 1, - path: "/foo".into(), - resolved: true, - }, - ) - .await; - assert!(cache.contains(42), "inserted entry should exist"); - assert_eq!(cache.inode_count(), 2, "root + inserted = 2"); - } - - #[tokio::test] - async fn insert_icb_does_not_clobber_inflight() { - let cache = Arc::new(test_cache()); - let (tx, rx) = watch::channel(()); - cache - .inode_table - .upsert_async(42, IcbState::InFlight(rx)) - .await; - - // Spawn insert_icb in background — should wait for InFlight to resolve - let cache2 = Arc::clone(&cache); - let handle = tokio::spawn(async move { - cache2 - .insert_icb( - 42, - TestIcb { - rc: 5, - path: "/inserted".into(), - resolved: true, - }, - ) - .await; - }); - - // Give insert_icb time to start waiting - tokio::task::yield_now().await; - - // Complete the InFlight from the resolver side (write directly) - cache - .inode_table - .upsert_async( - 42, - IcbState::Available(TestIcb { - rc: 1, - path: "/resolved".into(), - resolved: true, - }), - ) - .await; - drop(tx); // signal watchers - - handle.await.expect("task panicked"); - - // After insert_icb completes, it should have overwritten the resolved value - let path = cache.get_icb(42, |icb| icb.path.clone()).await; - assert_eq!(path, Some(PathBuf::from("/inserted"))); - } - - #[tokio::test] - async fn entry_or_insert_creates_new() { - let cache = test_cache(); - let rc = cache - .entry_or_insert_icb( - 42, - || TestIcb { - rc: 0, - path: "/new".into(), - resolved: true, - }, - |icb| { - *icb.rc_mut() += 1; - icb.rc() - }, - ) - .await; - assert_eq!(rc, 1, "factory creates rc=0, then +1 = 1"); - } - - #[tokio::test] - async fn entry_or_insert_returns_existing() { - let cache = test_cache(); - cache - .insert_icb( - 42, - TestIcb { - rc: 5, - path: "/existing".into(), - resolved: true, - }, - ) - .await; - - let rc = cache - .entry_or_insert_icb( - 42, - || 
panic!("factory should not be called"), - |icb| icb.rc(), - ) - .await; - assert_eq!(rc, 5, "existing entry rc should be 5"); - } - - #[tokio::test] - async fn entry_or_insert_after_resolver_completes() { - let resolver = TestResolver::new(); - resolver.add( - 42, - TestIcb { - rc: 1, - path: "/resolved".into(), - resolved: true, - }, - ); - let cache = Arc::new(test_cache_with(resolver)); - - // Start resolve in background - let cache2 = Arc::clone(&cache); - let resolve_handle = tokio::spawn(async move { cache2.get_or_resolve(42, |_| ()).await }); - - // Wait for resolve to finish - resolve_handle - .await - .expect("task panicked") - .expect("resolve failed"); - - // Now entry_or_insert should find the existing entry - let rc = cache - .entry_or_insert_icb( - 42, - || panic!("factory should not be called"), - |icb| icb.rc(), - ) - .await; - assert_eq!(rc, 1, "should find the resolved entry"); - } - - #[tokio::test] - async fn inc_rc_increments() { - let cache = test_cache(); - cache - .insert_icb( - 42, - TestIcb { - rc: 1, - path: "/a".into(), - resolved: true, - }, - ) - .await; - let new_rc = cache.inc_rc(42).await; - assert_eq!(new_rc, Some(2), "rc 1 + 1 = 2"); - } - - #[tokio::test] - async fn forget_decrements_rc() { - let cache = test_cache(); - cache - .insert_icb( - 42, - TestIcb { - rc: 5, - path: "/a".into(), - resolved: true, - }, - ) - .await; - - let evicted = cache.forget(42, 2).await; - assert!(evicted.is_none(), "rc 5 - 2 = 3, should not evict"); - - let rc = cache.get_icb(42, IcbLike::rc).await; - assert_eq!(rc, Some(3), "rc should be 3 after forget(2)"); - } - - #[tokio::test] - async fn forget_evicts_when_rc_drops_to_zero() { - let cache = test_cache(); - cache - .insert_icb( - 42, - TestIcb { - rc: 3, - path: "/a".into(), - resolved: true, - }, - ) - .await; - - let evicted = cache.forget(42, 3).await; - assert!(evicted.is_some(), "rc 3 - 3 = 0, should evict"); - assert!(!cache.contains(42), "evicted entry should be gone"); - 
assert_eq!(cache.inode_count(), 1, "only root remains"); - } - - #[tokio::test] - async fn forget_unknown_inode_returns_none() { - let cache = test_cache(); - let evicted = cache.forget(999, 1).await; - assert!(evicted.is_none(), "unknown inode should return None"); - } - - #[tokio::test] - async fn for_each_iterates_available_entries() { - let cache = test_cache(); - cache - .insert_icb( - 2, - TestIcb { - rc: 1, - path: "/a".into(), - resolved: true, - }, - ) - .await; - cache - .insert_icb( - 3, - TestIcb { - rc: 1, - path: "/b".into(), - resolved: true, - }, - ) - .await; - - let mut seen = std::collections::HashSet::new(); - cache - .for_each(|ino, _icb| { - seen.insert(*ino); - }) - .await; - assert_eq!(seen.len(), 3, "should see all 3 entries"); - assert!(seen.contains(&1), "should contain root"); - assert!(seen.contains(&2), "should contain inode 2"); - assert!(seen.contains(&3), "should contain inode 3"); - } - - #[tokio::test] - async fn for_each_skips_inflight() { - let cache = test_cache(); - // Directly insert an InFlight entry for testing iteration - let (_tx, rx) = watch::channel(()); - cache - .inode_table - .upsert_async(42, IcbState::InFlight(rx)) - .await; - - let mut count = 0; - cache - .for_each(|_, _| { - count += 1; - }) - .await; - assert_eq!(count, 1, "only root, not the InFlight entry"); - } - - #[tokio::test] - async fn wait_does_not_miss_signal_on_immediate_complete() { - let cache = Arc::new(test_cache()); - - // Insert InFlight manually, then immediately complete before anyone waits - let (tx, rx) = watch::channel(()); - cache - .inode_table - .upsert_async(42, IcbState::InFlight(rx)) - .await; - - // Complete before any waiter (simulate resolver by writing directly) - cache - .inode_table - .upsert_async( - 42, - IcbState::Available(TestIcb { - rc: 1, - path: "/fast".into(), - resolved: true, - }), - ) - .await; - drop(tx); - - assert!(cache.contains(42), "entry should exist in table"); - } - - // -- get_or_resolve tests -- - - 
#[tokio::test] - async fn get_or_resolve_returns_existing() { - let cache = test_cache(); - cache - .insert_icb( - 42, - TestIcb { - rc: 1, - path: "/existing".into(), - resolved: true, - }, - ) - .await; - - let path: Result = cache.get_or_resolve(42, |icb| icb.path.clone()).await; - assert_eq!(path, Ok(PathBuf::from("/existing"))); - } - - #[tokio::test] - async fn get_or_resolve_resolves_missing() { - let resolver = TestResolver::new(); - resolver.add( - 42, - TestIcb { - rc: 1, - path: "/resolved".into(), - resolved: true, - }, - ); - let cache = test_cache_with(resolver); - - let path: Result = cache.get_or_resolve(42, |icb| icb.path.clone()).await; - assert_eq!(path, Ok(PathBuf::from("/resolved"))); - // Should now be cached - assert!(cache.contains(42)); - } - - #[tokio::test] - async fn get_or_resolve_propagates_error() { - let resolver = TestResolver::new(); - resolver.add_err(42, "network error"); - let cache = test_cache_with(resolver); - - let result: Result = - cache.get_or_resolve(42, |icb| icb.path.clone()).await; - assert_eq!(result, Err("network error".to_owned())); - // Entry should be cleaned up on error - assert!(!cache.contains(42)); - } - - struct CountingResolver { - count: Arc, - } - - impl IcbResolver for CountingResolver { - type Icb = TestIcb; - type Error = String; - - fn resolve( - &self, - _ino: Inode, - _stub: Option, - _cache: &AsyncICache, - ) -> impl Future> + Send { - self.count.fetch_add(1, Ordering::SeqCst); - async { - tokio::task::yield_now().await; - Ok(TestIcb { - rc: 1, - path: "/coalesced".into(), - resolved: true, - }) - } - } - } - - #[tokio::test] - async fn get_or_resolve_coalesces_concurrent_requests() { - use std::sync::atomic::AtomicUsize; - - let resolve_count = Arc::new(AtomicUsize::new(0)); - - let cache = Arc::new(AsyncICache::new( - CountingResolver { - count: Arc::clone(&resolve_count), - }, - 1, - "/root", - )); - - let mut handles = Vec::new(); - for _ in 0..5 { - let c = Arc::clone(&cache); - 
handles.push(tokio::spawn(async move { - c.get_or_resolve(42, |icb| icb.path.clone()).await - })); - } - - for h in handles { - assert_eq!( - h.await.expect("task panicked"), - Ok(PathBuf::from("/coalesced")), - ); - } - - // Resolver should only have been called ONCE (not 5 times) - assert_eq!( - resolve_count.load(Ordering::SeqCst), - 1, - "should coalesce to 1 resolve call" - ); - } - - #[test] - fn icb_state_into_available_returns_inner() { - let state = IcbState::Available(TestIcb { - rc: 1, - path: "/test".into(), - resolved: true, - }); - assert!(state.into_available().is_some()); - } - - #[test] - fn icb_state_into_available_returns_none_for_inflight() { - let (_tx, rx) = watch::channel(()); - let state: IcbState = IcbState::InFlight(rx); - assert!(state.into_available().is_none()); - } - - #[tokio::test] - async fn get_or_resolve_resolves_stub_entry() { - let resolver = TestResolver::new(); - resolver.add( - 42, - TestIcb { - rc: 1, - path: "/resolved".into(), - resolved: true, - }, - ); - let cache = test_cache_with(resolver); - - // Insert unresolved stub - cache - .insert_icb( - 42, - TestIcb { - rc: 0, - path: "/stub".into(), - resolved: false, - }, - ) - .await; - - // get_or_resolve should trigger resolution because needs_resolve() == true - let path: Result = cache.get_or_resolve(42, |icb| icb.path.clone()).await; - assert_eq!(path, Ok(PathBuf::from("/resolved"))); - } - - #[tokio::test] - async fn forget_handles_inflight_entry() { - let cache = Arc::new(test_cache()); - let (tx, rx) = watch::channel(()); - cache - .inode_table - .upsert_async(42, IcbState::InFlight(rx)) - .await; - - let cache2 = Arc::clone(&cache); - let handle = tokio::spawn(async move { cache2.forget(42, 1).await }); - - // Give forget time to start waiting - tokio::task::yield_now().await; - - // Simulate resolver completing (write directly to inode_table) - cache - .inode_table - .upsert_async( - 42, - IcbState::Available(TestIcb { - rc: 3, - path: "/inflight".into(), - 
resolved: true, - }), - ) - .await; - drop(tx); - - let evicted = handle.await.expect("task panicked"); - assert!(evicted.is_none(), "rc=3 - 1 = 2, should not evict"); - - let rc = cache.get_icb(42, IcbLike::rc).await; - assert_eq!(rc, Some(2), "rc should be 2 after forget(1) on rc=3"); - } - - #[tokio::test] - async fn get_or_resolve_error_preserves_stub_with_nonzero_rc() { - let resolver = TestResolver::new(); - resolver.add_err(42, "resolve failed"); - let cache = test_cache_with(resolver); - - // Insert a stub with rc=2 (simulates a looked-up entry needing resolution) - cache - .insert_icb( - 42, - TestIcb { - rc: 2, - path: "/stub".into(), - resolved: false, - }, - ) - .await; - - // get_or_resolve should fail - let result: Result = - cache.get_or_resolve(42, |icb| icb.path.clone()).await; - assert!(result.is_err(), "should propagate resolver error"); - - // The stub should be preserved since rc > 0 - assert!(cache.contains(42), "entry with rc=2 should survive error"); - let rc = cache.get_icb(42, IcbLike::rc).await; - assert_eq!(rc, Some(2), "rc should be preserved"); - } - - #[tokio::test] - async fn inc_rc_missing_inode_returns_none() { - let cache = test_cache(); - assert_eq!(cache.inc_rc(999).await, None); - } - - #[tokio::test] - async fn inc_rc_waits_for_inflight() { - let cache = Arc::new(test_cache()); - let (tx, rx) = watch::channel(()); - cache - .inode_table - .upsert_async(42, IcbState::InFlight(rx)) - .await; - - let cache2 = Arc::clone(&cache); - let handle = tokio::spawn(async move { cache2.inc_rc(42).await }); - - // Simulate resolver completing by writing directly to inode_table - cache - .inode_table - .upsert_async( - 42, - IcbState::Available(TestIcb { - rc: 1, - path: "/a".into(), - resolved: true, - }), - ) - .await; - drop(tx); - - let result = handle - .await - .unwrap_or_else(|e| panic!("task panicked: {e}")); - assert_eq!( - result, - Some(2), - "waited for Available, then incremented 1 -> 2" - ); - } - - #[tokio::test] - async fn 
inc_rc_returns_none_after_concurrent_eviction() { - let cache = Arc::new(test_cache()); - let (tx, rx) = watch::channel(()); - cache - .inode_table - .upsert_async(42, IcbState::InFlight(rx)) - .await; - - let cache2 = Arc::clone(&cache); - let handle = tokio::spawn(async move { cache2.inc_rc(42).await }); - - // Evict instead of completing - cache.inode_table.remove_async(&42).await; - drop(tx); - - let result = handle - .await - .unwrap_or_else(|e| panic!("task panicked: {e}")); - assert_eq!(result, None, "evicted entry should return None"); - } - - /// Resolver that pauses mid-resolution via a `Notify`, allowing the test - /// to interleave a `forget` while the resolve future is suspended. - struct SlowResolver { - /// Signalled by the resolver once it has started (so the test knows - /// resolution is in progress). - started: Arc, - /// The resolver waits on this before returning (the test signals it - /// after calling `forget`). - proceed: Arc, - } - - impl IcbResolver for SlowResolver { - type Icb = TestIcb; - type Error = String; - - fn resolve( - &self, - _ino: Inode, - _stub: Option, - _cache: &AsyncICache, - ) -> impl Future> + Send { - let started = Arc::clone(&self.started); - let proceed = Arc::clone(&self.proceed); - async move { - started.notify_one(); - proceed.notified().await; - Ok(TestIcb { - rc: 1, - path: "/slow-resolved".into(), - resolved: true, - }) - } - } - } - - /// Regression test: `get_icb` must survive the entry cycling back to - /// `InFlight` between when `wait_for_available` returns and when - /// `read_async` runs. The loop in `get_icb` should retry and eventually - /// return the final resolved value. - #[tokio::test] - async fn wait_for_available_retries_on_re_inflight() { - let cache = Arc::new(test_cache()); - let ino: Inode = 42; - - // Phase 1: insert an InFlight entry. 
- let (tx1, rx1) = watch::channel(()); - cache - .inode_table - .upsert_async(ino, IcbState::InFlight(rx1)) - .await; - - // Spawn get_icb — it will wait for InFlight to resolve. - let cache_get = Arc::clone(&cache); - let get_handle = - tokio::spawn(async move { cache_get.get_icb(ino, |icb| icb.path.clone()).await }); - - // Give get_icb time to start waiting on the watch channel. - tokio::task::yield_now().await; - - // Phase 1 complete: transition to Available briefly, then immediately - // back to InFlight (simulates get_or_resolve finding a stub and - // re-entering InFlight for a second resolution). - let (tx2, rx2) = watch::channel(()); - cache - .inode_table - .upsert_async(ino, IcbState::InFlight(rx2)) - .await; - // Signal phase-1 watchers so get_icb wakes up; it will re-read the - // entry and find InFlight again, then loop back to wait. - drop(tx1); - - // Give get_icb time to re-enter the wait loop. - tokio::task::yield_now().await; - - // Phase 2 complete: write the final resolved value. - cache - .inode_table - .upsert_async( - ino, - IcbState::Available(TestIcb { - rc: 1, - path: "/fully-resolved".into(), - resolved: true, - }), - ) - .await; - drop(tx2); - - // get_icb should return the final resolved value (not None). - let result = get_handle.await.expect("get_icb task panicked"); - assert_eq!( - result, - Some(PathBuf::from("/fully-resolved")), - "get_icb must survive re-InFlight and return the final resolved value" - ); - } - - /// Regression test: an entry evicted by `forget` during an in-progress - /// `get_or_resolve` must NOT be resurrected when resolution completes. 
- #[tokio::test] - async fn get_or_resolve_does_not_resurrect_evicted_entry() { - let started = Arc::new(tokio::sync::Notify::new()); - let proceed = Arc::new(tokio::sync::Notify::new()); - - let cache = Arc::new(AsyncICache::new( - SlowResolver { - started: Arc::clone(&started), - proceed: Arc::clone(&proceed), - }, - 1, - "/root", - )); - - let ino: Inode = 42; - - // Insert a stub with rc=1 (simulates a looked-up, unresolved entry). - cache - .insert_icb( - ino, - TestIcb { - rc: 1, - path: "/stub".into(), - resolved: false, - }, - ) - .await; - - // Spawn get_or_resolve which will trigger slow resolution. - let cache2 = Arc::clone(&cache); - let resolve_handle = - tokio::spawn(async move { cache2.get_or_resolve(ino, |icb| icb.path.clone()).await }); - - // Wait until the resolver has started (entry is now InFlight). - started.notified().await; - - // Evict the entry while resolution is in progress. - // forget waits for InFlight, so we need to complete resolution for - // forget to proceed. Instead, remove the InFlight entry directly to - // simulate a concurrent eviction (e.g., by another path that already - // removed the entry). - cache.inode_table.remove_async(&ino).await; - - // Let the resolver finish. - proceed.notify_one(); - - // Wait for get_or_resolve to complete. - drop(resolve_handle.await.expect("task panicked")); - - // The entry must NOT have been resurrected by write_back_if_present. - assert!( - !cache.contains(ino), - "evicted entry must not be resurrected after resolution completes" - ); - } -} diff --git a/src/fs/icache/bridge.rs b/src/fs/icache/bridge.rs deleted file mode 100644 index e674a564..00000000 --- a/src/fs/icache/bridge.rs +++ /dev/null @@ -1,138 +0,0 @@ -use crate::fs::r#trait::{FileAttr, FileHandle, Inode}; - -/// Bidirectional bridge for both inodes and file handles between two Fs layers. -/// -/// Convention: **left = outer (caller), right = inner (callee)**. -/// `forward(left)` → right, `backward(right)` → left. 
-pub struct HashMapBridge { - inode_map: bimap::BiMap, - fh_map: bimap::BiMap, -} - -impl HashMapBridge { - pub fn new() -> Self { - Self { - inode_map: bimap::BiMap::new(), - fh_map: bimap::BiMap::new(), - } - } - - // ── Inode methods ──────────────────────────────────────────────────── - - pub fn insert_inode(&mut self, left: Inode, right: Inode) { - self.inode_map.insert(left, right); - } - - /// Look up right→left, or allocate a new left inode if unmapped. - pub fn backward_or_insert_inode( - &mut self, - right: Inode, - allocate: impl FnOnce() -> Inode, - ) -> Inode { - if let Some(&left) = self.inode_map.get_by_right(&right) { - left - } else { - let left = allocate(); - self.inode_map.insert(left, right); - left - } - } - - /// Look up left→right, or allocate a new right inode if unmapped. - pub fn forward_or_insert_inode( - &mut self, - left: Inode, - allocate: impl FnOnce() -> Inode, - ) -> Inode { - if let Some(&right) = self.inode_map.get_by_left(&left) { - right - } else { - let right = allocate(); - self.inode_map.insert(left, right); - right - } - } - - /// Remove an inode mapping by its left (outer) key. - pub fn remove_inode_by_left(&mut self, left: Inode) { - self.inode_map.remove_by_left(&left); - } - - /// Look up left→right directly. - pub fn inode_map_get_by_left(&self, left: Inode) -> Option<&Inode> { - self.inode_map.get_by_left(&left) - } - - /// Rewrite the `ino` field in a [`FileAttr`] from right (inner) to left (outer) namespace. 
- pub fn attr_backward(&self, attr: FileAttr) -> FileAttr { - let backward = |ino: Inode| -> Inode { - if let Some(&left) = self.inode_map.get_by_right(&ino) { - left - } else { - tracing::warn!( - inner_ino = ino, - "attr_backward: no bridge mapping, using raw inner inode" - ); - ino - } - }; - rewrite_attr_ino(attr, backward) - } - - // ── File handle methods ────────────────────────────────────────────── - - pub fn insert_fh(&mut self, left: FileHandle, right: FileHandle) { - self.fh_map.insert(left, right); - } - - pub fn fh_forward(&self, left: FileHandle) -> Option { - self.fh_map.get_by_left(&left).copied() - } - - /// Remove a file handle mapping by its left (outer) key. - pub fn remove_fh_by_left(&mut self, left: FileHandle) { - self.fh_map.remove_by_left(&left); - } -} - -/// Rewrite the `ino` field in a [`FileAttr`] using the given translation function. -fn rewrite_attr_ino(attr: FileAttr, translate: impl Fn(Inode) -> Inode) -> FileAttr { - match attr { - FileAttr::RegularFile { - mut common, - size, - blocks, - } => { - common.ino = translate(common.ino); - FileAttr::RegularFile { - common, - size, - blocks, - } - } - FileAttr::Directory { mut common } => { - common.ino = translate(common.ino); - FileAttr::Directory { common } - } - FileAttr::Symlink { mut common, size } => { - common.ino = translate(common.ino); - FileAttr::Symlink { common, size } - } - FileAttr::CharDevice { mut common, rdev } => { - common.ino = translate(common.ino); - FileAttr::CharDevice { common, rdev } - } - FileAttr::BlockDevice { mut common, rdev } => { - common.ino = translate(common.ino); - FileAttr::BlockDevice { common, rdev } - } - FileAttr::NamedPipe { mut common } => { - common.ino = translate(common.ino); - FileAttr::NamedPipe { common } - } - FileAttr::Socket { mut common } => { - common.ino = translate(common.ino); - FileAttr::Socket { common } - } - } -} diff --git a/src/fs/icache/file_table.rs b/src/fs/icache/file_table.rs deleted file mode 100644 index 
332a6ffb..00000000 --- a/src/fs/icache/file_table.rs +++ /dev/null @@ -1,22 +0,0 @@ -use std::sync::atomic::{AtomicU64, Ordering}; - -use crate::fs::r#trait::FileHandle; - -/// Monotonically increasing file handle allocator. -#[must_use] -pub struct FileTable { - next_fh: AtomicU64, -} - -impl FileTable { - pub fn new() -> Self { - Self { - next_fh: AtomicU64::new(1), - } - } - - #[must_use] - pub fn allocate(&self) -> FileHandle { - self.next_fh.fetch_add(1, Ordering::Relaxed) - } -} diff --git a/src/fs/icache/inode_factory.rs b/src/fs/icache/inode_factory.rs deleted file mode 100644 index 1a603388..00000000 --- a/src/fs/icache/inode_factory.rs +++ /dev/null @@ -1,19 +0,0 @@ -use crate::fs::r#trait::Inode; -use std::sync::atomic::{AtomicU64, Ordering}; - -/// Monotonically increasing inode allocator. -pub struct InodeFactory { - next_inode: AtomicU64, -} - -impl InodeFactory { - pub fn new(start: Inode) -> Self { - Self { - next_inode: AtomicU64::new(start), - } - } - - pub fn allocate(&self) -> Inode { - self.next_inode.fetch_add(1, Ordering::Relaxed) - } -} diff --git a/src/fs/icache/mod.rs b/src/fs/icache/mod.rs deleted file mode 100644 index 2ccd80bd..00000000 --- a/src/fs/icache/mod.rs +++ /dev/null @@ -1,21 +0,0 @@ -//! Generic directory cache and inode management primitives. - -pub mod async_cache; -pub mod bridge; -mod file_table; -mod inode_factory; - -pub use async_cache::AsyncICache; -pub use async_cache::IcbResolver; -pub use file_table::FileTable; -pub use inode_factory::InodeFactory; - -/// Common interface for inode control block types usable with `ICache`. -pub trait IcbLike: Clone { - /// Create an ICB with rc=1, the given path, and no children. - fn new_root(path: std::path::PathBuf) -> Self; - fn rc(&self) -> u64; - fn rc_mut(&mut self) -> &mut u64; - /// Returns true if this entry needs resolution (e.g., attr not yet fetched). 
- fn needs_resolve(&self) -> bool; -} diff --git a/src/fs/mescloud/common.rs b/src/fs/mescloud/common.rs index 340b5887..6e9c8bf8 100644 --- a/src/fs/mescloud/common.rs +++ b/src/fs/mescloud/common.rs @@ -1,12 +1,12 @@ //! Shared types and helpers used by both `MesaFS` and `RepoFs`. +use std::ffi::{OsStr, OsString}; + +use bytes::Bytes; +use git_fs::fs::{FileHandle, INode, InodeAddr, OpenFlags as LibOpenFlags}; use mesa_dev::low_level::apis; use thiserror::Error; -use crate::fs::r#trait::{FileAttr, Inode}; - -pub(super) use super::icache::InodeControlBlock; - /// A concrete error type that preserves the structure of `mesa_dev::low_level::apis::Error` /// without the generic parameter. #[derive(Debug, Error)] @@ -51,50 +51,22 @@ pub enum LookupError { #[error("inode not found")] InodeNotFound, - #[error("file does not exist")] - FileDoesNotExist, - #[error("remote mesa error")] RemoteMesaError(#[from] MesaApiError), } -impl From for i32 { - fn from(e: LookupError) -> Self { - match e { - LookupError::InodeNotFound | LookupError::FileDoesNotExist => libc::ENOENT, - LookupError::RemoteMesaError(_) => libc::EIO, - } - } -} - #[derive(Debug, Error)] pub enum GetAttrError { #[error("inode not found")] InodeNotFound, } -impl From for i32 { - fn from(e: GetAttrError) -> Self { - match e { - GetAttrError::InodeNotFound => libc::ENOENT, - } - } -} - -#[derive(Debug, Error)] +#[derive(Debug, Clone, Copy, Error)] pub enum OpenError { #[error("inode not found")] InodeNotFound, } -impl From for i32 { - fn from(e: OpenError) -> Self { - match e { - OpenError::InodeNotFound => libc::ENOENT, - } - } -} - #[derive(Debug, Error)] pub enum ReadError { #[error("file not open")] @@ -113,17 +85,6 @@ pub enum ReadError { Base64Decode(#[from] base64::DecodeError), } -impl From for i32 { - fn from(e: ReadError) -> Self { - match e { - ReadError::FileNotOpen => libc::EBADF, - ReadError::InodeNotFound => libc::ENOENT, - ReadError::RemoteMesaError(_) | ReadError::Base64Decode(_) => libc::EIO, 
- ReadError::NotAFile => libc::EISDIR, - } - } -} - #[derive(Debug, Error)] pub enum ReadDirError { #[error("inode not found")] @@ -143,18 +104,7 @@ impl From for ReadDirError { fn from(e: LookupError) -> Self { match e { LookupError::RemoteMesaError(api) => Self::RemoteMesaError(api), - LookupError::InodeNotFound | LookupError::FileDoesNotExist => Self::InodeNotFound, - } - } -} - -impl From for i32 { - fn from(e: ReadDirError) -> Self { - match e { - ReadDirError::InodeNotFound => libc::ENOENT, - ReadDirError::RemoteMesaError(_) => libc::EIO, - ReadDirError::NotADirectory => libc::ENOTDIR, - ReadDirError::NotPermitted => libc::EPERM, + LookupError::InodeNotFound => Self::InodeNotFound, } } } @@ -165,18 +115,38 @@ pub enum ReleaseError { FileNotOpen, } -impl From for i32 { - fn from(e: ReleaseError) -> Self { - match e { - ReleaseError::FileNotOpen => libc::EBADF, - } - } +/// A directory entry for readdir results, using lib types. +pub struct FsDirEntry { + pub ino: InodeAddr, + pub name: OsString, } -/// Allows a parent compositor to peek at cached attrs from a child filesystem. +/// Trait for child filesystems composed by [`CompositeFs`](super::composite::CompositeFs). +/// +/// Uses lib types (`INode`, `InodeAddr`) directly — no conversion to/from `FileAttr`. +/// Replaces the old `Fs + InodeCachePeek` bound. #[async_trait::async_trait] -pub(super) trait InodeCachePeek { - async fn peek_attr(&self, ino: Inode) -> Option; +pub(super) trait ChildFs: Send + Sync { + /// Look up a child by name within the given parent directory. + async fn lookup(&mut self, parent: InodeAddr, name: &OsStr) -> Result; + + /// List all children of a directory, returning full `INode` data for each. + async fn readdir(&mut self, ino: InodeAddr) -> Result, ReadDirError>; + + /// Open a file for reading. + async fn open(&mut self, ino: InodeAddr, flags: LibOpenFlags) -> Result; + + /// Read data from an open file. 
+ async fn read( + &mut self, + ino: InodeAddr, + fh: FileHandle, + offset: u64, + size: u32, + ) -> Result; + + /// Release (close) a file handle. + async fn release(&mut self, ino: InodeAddr, fh: FileHandle) -> Result<(), ReleaseError>; } #[cfg(test)] @@ -189,12 +159,6 @@ mod tests { assert!(matches!(err, ReadDirError::InodeNotFound)); } - #[test] - fn lookup_file_does_not_exist_converts_to_readdir_inode_not_found() { - let err: ReadDirError = LookupError::FileDoesNotExist.into(); - assert!(matches!(err, ReadDirError::InodeNotFound)); - } - #[test] fn lookup_remote_error_converts_to_readdir_remote_error() { let api_err = MesaApiError::Response { diff --git a/src/fs/mescloud/composite.rs b/src/fs/mescloud/composite.rs index 6dbac250..3356b7b5 100644 --- a/src/fs/mescloud/composite.rs +++ b/src/fs/mescloud/composite.rs @@ -1,308 +1,460 @@ use std::collections::HashMap; use std::ffi::OsStr; +use std::sync::atomic::{AtomicU64, Ordering}; use bytes::Bytes; -use tracing::{instrument, trace, warn}; - -use crate::fs::icache::bridge::HashMapBridge; -use crate::fs::icache::{FileTable, IcbResolver}; -use crate::fs::r#trait::{ - DirEntry, FileAttr, FileHandle, FilesystemStats, Fs, Inode, LockOwner, OpenFile, OpenFlags, +use git_fs::cache::async_backed::FutureBackedCache; +use git_fs::fs::dcache::DCache; +use git_fs::fs::{ + AsyncFsStats, FileHandle, INode, INodeType, InodeAddr, InodePerms, LoadedAddr, OpenFlags, }; +use rustc_hash::FxHashMap; +use tracing::{instrument, trace}; use super::common::{ - GetAttrError, InodeCachePeek, LookupError, OpenError, ReadDirError, ReadError, ReleaseError, + ChildFs, FsDirEntry, GetAttrError, LookupError, OpenError, ReadDirError, ReadError, + ReleaseError, }; -use super::icache::{InodeControlBlock, MescloudICache}; -/// A child filesystem slot: inner filesystem + bidirectional inode/fh bridge. +/// Bidirectional inode mapping between outer (composite) and inner (child) address spaces. +/// +/// Convention: **outer = left, inner = right**. 
+pub(super) struct InodeBridge { + map: bimap::BiMap, +} + +impl InodeBridge { + pub fn new() -> Self { + Self { + map: bimap::BiMap::new(), + } + } + + pub fn insert(&mut self, outer: InodeAddr, inner: InodeAddr) { + self.map.insert(outer, inner); + } + + pub fn forward(&self, outer: InodeAddr) -> Option { + self.map.get_by_left(&outer).copied() + } + + #[expect(dead_code, reason = "will be needed by future callers")] + pub fn backward(&self, inner: InodeAddr) -> Option { + self.map.get_by_right(&inner).copied() + } + + /// Look up inner->outer, or allocate a new outer address if unmapped. + pub fn backward_or_insert( + &mut self, + inner: InodeAddr, + allocate: impl FnOnce() -> InodeAddr, + ) -> InodeAddr { + if let Some(&outer) = self.map.get_by_right(&inner) { + outer + } else { + let outer = allocate(); + self.map.insert(outer, inner); + outer + } + } + + pub fn remove_by_outer(&mut self, outer: InodeAddr) { + self.map.remove_by_left(&outer); + } + + #[expect(dead_code, reason = "will be needed by future callers")] + pub fn get_inner(&self, outer: InodeAddr) -> Option<&InodeAddr> { + self.map.get_by_left(&outer) + } +} + pub(super) struct ChildSlot { pub inner: Inner, - pub bridge: HashMapBridge, + pub bridge: InodeBridge, } -/// Layered filesystem that presents multiple child filesystems under a single -/// inode namespace. -/// -/// `MesaCloud`'s filesystem is a hierarchy of compositions: -/// -/// ```text -/// MesaFS (CompositeFs<_, OrgFs>) -/// └─ OrgFs (CompositeFs<_, RepoFs>) -/// └─ RepoFs (leaf — backed by git) -/// ``` -/// -/// Each child filesystem numbers its inodes starting from 1, so the composite -/// maintains a bidirectional inode/file-handle bridge per child (see -/// [`ChildSlot`]) to translate between the outer namespace visible to FUSE and -/// each child's internal namespace. 
-pub(super) struct CompositeFs -where - R: IcbResolver, -{ - pub icache: MescloudICache, - pub file_table: FileTable, - pub readdir_buf: Vec, - /// Maps outer inode to index into `slots` for child-root inodes. - pub child_inodes: HashMap, - /// Maps every translated outer inode to its owning slot index. - pub inode_to_slot: HashMap, - pub slots: Vec>, +/// Tracks an open file: which child slot owns it and the inner fh. +struct OpenFileEntry { + slot_idx: usize, + inner_ino: InodeAddr, + inner_fh: FileHandle, +} + +pub(super) struct CompositeFs { + pub(super) inode_table: FutureBackedCache, + pub(super) directory_cache: DCache, + readdir_populated: FutureBackedCache, + next_ino: AtomicU64, + next_fh: AtomicU64, + refcounts: FxHashMap, + pub(super) readdir_buf: Vec, + open_files: HashMap, + pub(super) child_inodes: HashMap, + pub(super) inode_to_slot: HashMap, + pub(super) slots: Vec>, + fs_owner: (u32, u32), + block_size: u32, } -impl CompositeFs -where - R: IcbResolver, - Inner: Fs< - LookupError = LookupError, - GetAttrError = GetAttrError, - OpenError = OpenError, - ReadError = ReadError, - ReaddirError = ReadDirError, - ReleaseError = ReleaseError, - > + InodeCachePeek - + Send - + Sync, -{ - /// Look up which child slot owns an inode via direct map. 
- #[instrument(name = "CompositeFs::slot_for_inode", skip(self))] - pub fn slot_for_inode(&self, ino: Inode) -> Option { +impl CompositeFs { + pub const ROOT_INO: InodeAddr = 1; + + pub fn new(fs_owner: (u32, u32), block_size: u32) -> Self { + let inode_table = FutureBackedCache::default(); + let now = std::time::SystemTime::now(); + let root = INode { + addr: Self::ROOT_INO, + permissions: InodePerms::from_bits_truncate(0o755), + uid: fs_owner.0, + gid: fs_owner.1, + create_time: now, + last_modified_at: now, + parent: None, + size: 0, + itype: INodeType::Directory, + }; + inode_table.insert_sync(Self::ROOT_INO, root); + + let mut refcounts = FxHashMap::default(); + refcounts.insert(Self::ROOT_INO, 1); + + Self { + inode_table, + directory_cache: DCache::new(), + readdir_populated: FutureBackedCache::default(), + next_ino: AtomicU64::new(Self::ROOT_INO + 1), + next_fh: AtomicU64::new(1), + refcounts, + readdir_buf: Vec::new(), + open_files: HashMap::new(), + child_inodes: HashMap::new(), + inode_to_slot: HashMap::new(), + slots: Vec::new(), + fs_owner, + block_size, + } + } + + pub fn allocate_inode(&self) -> InodeAddr { + self.next_ino.fetch_add(1, Ordering::Relaxed) + } + + pub fn fs_owner(&self) -> (u32, u32) { + self.fs_owner + } + + #[expect(dead_code, reason = "available for future use")] + pub fn block_size(&self) -> u32 { + self.block_size + } + + pub fn add_child(&mut self, inner: Inner, child_root_ino: InodeAddr) -> InodeAddr { + self.add_child_with_parent(inner, child_root_ino, Self::ROOT_INO) + } + + pub fn cache_inode(&self, inode: INode) { + self.inode_table.insert_sync(inode.addr, inode); + } + + /// Insert the inode into the table and initialise its refcount to zero. + /// + /// The caller is responsible for bumping the refcount via [`inc_rc`](Self::inc_rc). 
+ pub fn cache_inode_and_init_rc(&mut self, inode: INode) { + let addr = inode.addr; + self.inode_table.insert_sync(addr, inode); + self.refcounts.entry(addr).or_insert(0); + } + + pub fn inc_rc(&mut self, addr: InodeAddr) -> Option { + let rc = self.refcounts.get_mut(&addr)?; + *rc += 1; + Some(*rc) + } + + pub fn slot_for_inode(&self, ino: InodeAddr) -> Option { self.inode_to_slot.get(&ino).copied() } - /// Allocate an outer file handle and map it through the bridge. - #[must_use] - pub fn alloc_fh(&mut self, slot_idx: usize, inner_fh: FileHandle) -> FileHandle { - let fh = self.file_table.allocate(); - self.slots[slot_idx].bridge.insert_fh(fh, inner_fh); - fh + /// Like [`add_child`](Self::add_child) but sets a custom parent inode + /// instead of always using `ROOT_INO`. + pub fn add_child_with_parent( + &mut self, + inner: Inner, + child_root_ino: InodeAddr, + parent_ino: InodeAddr, + ) -> InodeAddr { + let outer_ino = self.allocate_inode(); + let now = std::time::SystemTime::now(); + let inode = INode { + addr: outer_ino, + permissions: InodePerms::from_bits_truncate(0o755), + uid: self.fs_owner.0, + gid: self.fs_owner.1, + create_time: now, + last_modified_at: now, + parent: Some(parent_ino), + size: 0, + itype: INodeType::Directory, + }; + self.inode_table.insert_sync(outer_ino, inode); + + let mut bridge = InodeBridge::new(); + bridge.insert(outer_ino, child_root_ino); + + let idx = self.slots.len(); + self.slots.push(ChildSlot { inner, bridge }); + self.child_inodes.insert(outer_ino, idx); + self.inode_to_slot.insert(outer_ino, idx); + + outer_ino } +} - /// Translate an inner inode to an outer inode, allocating if needed. - /// Also inserts a stub ICB into the outer icache when the inode is new. 
- #[instrument(name = "CompositeFs::translate_inner_ino", skip(self, name))] - pub async fn translate_inner_ino( +impl CompositeFs { + #[instrument(name = "CompositeFs::delegated_lookup", skip(self, name))] + pub async fn delegated_lookup( &mut self, - slot_idx: usize, - inner_ino: Inode, - parent_outer_ino: Inode, + parent: InodeAddr, name: &OsStr, - ) -> Inode { - let outer_ino = self.slots[slot_idx] + ) -> Result { + // Fast path: DCache hit + inode still in table + if let Some(dentry) = self.directory_cache.lookup(LoadedAddr(parent), name) + && let Some(inode) = self.inode_table.get(&dentry.ino.0).await + { + *self.refcounts.entry(inode.addr).or_insert(0) += 1; + return Ok(inode); + } + + // Slow path: delegate to child + let idx = self + .inode_to_slot + .get(&parent) + .copied() + .ok_or(LookupError::InodeNotFound)?; + let inner_parent = self.slots[idx] .bridge - .backward_or_insert_inode(inner_ino, || self.icache.allocate_inode()); - self.inode_to_slot.insert(outer_ino, slot_idx); - self.icache - .entry_or_insert_icb( - outer_ino, - || InodeControlBlock { - rc: 0, - path: name.into(), - parent: Some(parent_outer_ino), - attr: None, - children: None, - }, - |_| {}, + .forward(parent) + .ok_or(LookupError::InodeNotFound)?; + let inner_inode = self.slots[idx].inner.lookup(inner_parent, name).await?; + + let next_ino = &self.next_ino; + let outer_ino = self.slots[idx] + .bridge + .backward_or_insert(inner_inode.addr, || { + next_ino.fetch_add(1, Ordering::Relaxed) + }); + self.inode_to_slot.insert(outer_ino, idx); + + let remapped = INode { + addr: outer_ino, + ..inner_inode + }; + self.inode_table + .get_or_init(outer_ino, || async move { remapped }) + .await; + + let is_dir = matches!(inner_inode.itype, INodeType::Directory); + self.directory_cache + .insert( + LoadedAddr(parent), + name.to_os_string(), + LoadedAddr(outer_ino), + is_dir, ) .await; - outer_ino + + *self.refcounts.entry(outer_ino).or_insert(0) += 1; + let rc = self.refcounts[&outer_ino]; + 
trace!( + outer_ino, + inner_ino = inner_inode.addr, + rc, + "lookup: resolved via delegation" + ); + + Ok(remapped) + } + + #[instrument(name = "CompositeFs::delegated_readdir", skip(self))] + pub async fn delegated_readdir( + &mut self, + ino: InodeAddr, + ) -> Result<&[FsDirEntry], ReadDirError> { + let idx = self + .inode_to_slot + .get(&ino) + .copied() + .ok_or(ReadDirError::InodeNotFound)?; + + if self.readdir_populated.get(&LoadedAddr(ino)).await.is_none() { + let inner_ino = self.slots[idx] + .bridge + .forward(ino) + .ok_or(ReadDirError::InodeNotFound)?; + let inner_entries = self.slots[idx].inner.readdir(inner_ino).await?; + + for (name, child_inode) in &inner_entries { + let next_ino = &self.next_ino; + let outer_child = self.slots[idx] + .bridge + .backward_or_insert(child_inode.addr, || { + next_ino.fetch_add(1, Ordering::Relaxed) + }); + self.inode_to_slot.insert(outer_child, idx); + + let remapped = INode { + addr: outer_child, + ..*child_inode + }; + self.inode_table + .get_or_init(outer_child, || async move { remapped }) + .await; + + let is_dir = matches!(child_inode.itype, INodeType::Directory); + self.directory_cache + .insert( + LoadedAddr(ino), + name.clone(), + LoadedAddr(outer_child), + is_dir, + ) + .await; + } + + self.readdir_populated + .get_or_init(LoadedAddr(ino), || async {}) + .await; + } + + let mut children = self.directory_cache.readdir(LoadedAddr(ino)).await; + children.sort_unstable_by(|(a, _), (b, _)| a.cmp(b)); + + let mut entries = Vec::with_capacity(children.len()); + for (name, dvalue) in &children { + if let Some(inode) = self.inode_table.get(&dvalue.ino.0).await { + entries.push(FsDirEntry { + ino: inode.addr, + name: name.clone(), + }); + } + } + + self.readdir_buf = entries; + Ok(&self.readdir_buf) } - /// Get cached file attributes for an inode. 
#[instrument(name = "CompositeFs::delegated_getattr", skip(self))] - pub async fn delegated_getattr(&self, ino: Inode) -> Result { - self.icache.get_attr(ino).await.ok_or_else(|| { - warn!(ino, "getattr on unknown inode"); - GetAttrError::InodeNotFound - }) + pub async fn delegated_getattr(&self, ino: InodeAddr) -> Result { + self.inode_table + .get(&ino) + .await + .ok_or(GetAttrError::InodeNotFound) + } + + #[expect(dead_code, reason = "will be needed by future callers")] + #[must_use] + pub fn delegated_statfs(&self) -> AsyncFsStats { + AsyncFsStats { + block_size: self.block_size, + total_blocks: 0, + free_blocks: 0, + available_blocks: 0, + total_inodes: self.inode_table.len() as u64, + free_inodes: 0, + max_filename_length: 255, + } } - /// Find slot, forward inode, delegate to inner, allocate outer file handle. #[instrument(name = "CompositeFs::delegated_open", skip(self))] pub async fn delegated_open( &mut self, - ino: Inode, + ino: InodeAddr, flags: OpenFlags, - ) -> Result { - let idx = self.slot_for_inode(ino).ok_or_else(|| { - warn!(ino, "open on inode not belonging to any child"); - OpenError::InodeNotFound - })?; + ) -> Result { + let idx = self + .inode_to_slot + .get(&ino) + .copied() + .ok_or(OpenError::InodeNotFound)?; let inner_ino = self.slots[idx] .bridge - .forward_or_insert_inode(ino, || unreachable!("open: ino should be mapped")); - let inner_open = self.slots[idx].inner.open(inner_ino, flags).await?; - let outer_fh = self.alloc_fh(idx, inner_open.handle); - trace!( - ino, + .forward(ino) + .ok_or(OpenError::InodeNotFound)?; + let inner_fh = self.slots[idx].inner.open(inner_ino, flags).await?; + + let outer_fh = self.next_fh.fetch_add(1, Ordering::Relaxed); + self.open_files.insert( outer_fh, - inner_fh = inner_open.handle, - "open: assigned file handle" + OpenFileEntry { + slot_idx: idx, + inner_ino, + inner_fh, + }, ); - Ok(OpenFile { - handle: outer_fh, - options: inner_open.options, - }) + + trace!(ino, outer_fh, inner_fh, "open: 
assigned fh"); + Ok(outer_fh) } - /// Find slot, forward inode and file handle, delegate read to inner. - #[expect(clippy::too_many_arguments, reason = "mirrors fuser read API")] #[instrument(name = "CompositeFs::delegated_read", skip(self))] pub async fn delegated_read( &mut self, - ino: Inode, fh: FileHandle, offset: u64, size: u32, - flags: OpenFlags, - lock_owner: Option, ) -> Result { - let idx = self.slot_for_inode(ino).ok_or_else(|| { - warn!(ino, "read on inode not belonging to any child"); - ReadError::InodeNotFound - })?; - let inner_ino = self.slots[idx] - .bridge - .forward_or_insert_inode(ino, || unreachable!("read: ino should be mapped")); - let inner_fh = self.slots[idx].bridge.fh_forward(fh).ok_or_else(|| { - warn!(fh, "read: no fh mapping found"); - ReadError::FileNotOpen - })?; - self.slots[idx] + let entry = self.open_files.get(&fh).ok_or(ReadError::FileNotOpen)?; + let slot_idx = entry.slot_idx; + let inner_ino = entry.inner_ino; + let inner_fh = entry.inner_fh; + self.slots[slot_idx] .inner - .read(inner_ino, inner_fh, offset, size, flags, lock_owner) + .read(inner_ino, inner_fh, offset, size) .await } - /// Find slot, forward inode and file handle, delegate release to inner, - /// then clean up the file handle mapping. 
#[instrument(name = "CompositeFs::delegated_release", skip(self))] - pub async fn delegated_release( - &mut self, - ino: Inode, - fh: FileHandle, - flags: OpenFlags, - flush: bool, - ) -> Result<(), ReleaseError> { - let idx = self.slot_for_inode(ino).ok_or_else(|| { - warn!(ino, "release on inode not belonging to any child"); - ReleaseError::FileNotOpen - })?; - let inner_ino = self.slots[idx] - .bridge - .forward_or_insert_inode(ino, || unreachable!("release: ino should be mapped")); - let inner_fh = self.slots[idx].bridge.fh_forward(fh).ok_or_else(|| { - warn!(fh, "release: no fh mapping found"); - ReleaseError::FileNotOpen - })?; - let result = self.slots[idx] + pub async fn delegated_release(&mut self, fh: FileHandle) -> Result<(), ReleaseError> { + let entry = self + .open_files + .remove(&fh) + .ok_or(ReleaseError::FileNotOpen)?; + let result = self.slots[entry.slot_idx] .inner - .release(inner_ino, inner_fh, flags, flush) + .release(entry.inner_ino, entry.inner_fh) .await; - self.slots[idx].bridge.remove_fh_by_left(fh); - trace!(ino, fh, "release: cleaned up fh mapping"); + trace!(fh, "release: cleaned up fh mapping"); result } - /// Propagate forget to the inner filesystem, evict from icache, and clean - /// up bridge mappings. Returns `true` if the inode was evicted. + /// Returns `true` if the inode was evicted. /// - /// Child-root inodes (those in `child_inodes`) do NOT propagate forget to - /// the inner filesystem: the inner root's `rc=1` is an initialization - /// invariant unrelated to outer FUSE lookup counts. Propagating would - /// evict the inner root, breaking all subsequent operations on that child. + /// The composite only manages its own refcounts and inode table. + /// Inner filesystem inodes are managed by the inner FS itself through + /// its own lifecycle; the composite does not propagate forget to children. 
+ #[expect(dead_code, reason = "will be needed by future callers")] #[must_use] #[instrument(name = "CompositeFs::delegated_forget", skip(self))] - pub async fn delegated_forget(&mut self, ino: Inode, nlookups: u64) -> bool { - let slot_idx = self.slot_for_inode(ino); - let is_child_root = self.child_inodes.contains_key(&ino); - if !is_child_root - && let Some(idx) = slot_idx - && let Some(&inner_ino) = self.slots[idx].bridge.inode_map_get_by_left(ino) - { - self.slots[idx].inner.forget(inner_ino, nlookups).await; - } - if self.icache.forget(ino, nlookups).await.is_some() { - self.child_inodes.remove(&ino); - self.inode_to_slot.remove(&ino); - if let Some(idx) = slot_idx { - self.slots[idx].bridge.remove_inode_by_left(ino); + pub fn delegated_forget(&mut self, ino: InodeAddr, nlookups: u64) -> bool { + let slot_idx = self.inode_to_slot.get(&ino).copied(); + + if let Some(rc) = self.refcounts.get_mut(&ino) { + *rc = rc.saturating_sub(nlookups); + if *rc > 0 { + return false; } - true + self.refcounts.remove(&ino); } else { - false + return false; } - } - - /// Return filesystem statistics from the icache. - #[must_use] - pub fn delegated_statfs(&self) -> FilesystemStats { - self.icache.statfs() - } - - /// Delegation branch for lookup when the parent is owned by a child slot. 
- #[instrument(name = "CompositeFs::delegated_lookup", skip(self, name))] - pub async fn delegated_lookup( - &mut self, - parent: Inode, - name: &OsStr, - ) -> Result { - let idx = self - .slot_for_inode(parent) - .ok_or(LookupError::InodeNotFound)?; - let inner_parent = self.slots[idx] - .bridge - .forward_or_insert_inode(parent, || unreachable!("lookup: parent should be mapped")); - let inner_attr = self.slots[idx].inner.lookup(inner_parent, name).await?; - let inner_ino = inner_attr.common().ino; - let outer_ino = self.translate_inner_ino(idx, inner_ino, parent, name).await; - let outer_attr = self.slots[idx].bridge.attr_backward(inner_attr); - self.icache.cache_attr(outer_ino, outer_attr).await; - // None means the entry was concurrently evicted; fail the lookup so - // the kernel doesn't hold a ref the cache no longer tracks. - let rc = self - .icache - .inc_rc(outer_ino) - .await - .ok_or(LookupError::InodeNotFound)?; - trace!(outer_ino, inner_ino, rc, "lookup: resolved via delegation"); - Ok(outer_attr) - } - /// Delegation branch for readdir when the inode is owned by a child slot. 
- #[instrument(name = "CompositeFs::delegated_readdir", skip(self))] - pub async fn delegated_readdir(&mut self, ino: Inode) -> Result<&[DirEntry], ReadDirError> { - let idx = self - .slot_for_inode(ino) - .ok_or(ReadDirError::InodeNotFound)?; - let inner_ino = self.slots[idx] - .bridge - .forward_or_insert_inode(ino, || unreachable!("readdir: ino should be mapped")); - let inner_entries = self.slots[idx].inner.readdir(inner_ino).await?; - let inner_entries: Vec = inner_entries.to_vec(); - let evicted = self.icache.evict_zero_rc_children(ino).await; - for evicted_ino in evicted { - if let Some(slot) = self.inode_to_slot.remove(&evicted_ino) { - self.slots[slot].bridge.remove_inode_by_left(evicted_ino); - } - self.child_inodes.remove(&evicted_ino); + self.inode_table.remove_sync(&ino); + self.child_inodes.remove(&ino); + self.inode_to_slot.remove(&ino); + if let Some(idx) = slot_idx { + self.slots[idx].bridge.remove_by_outer(ino); } - let mut outer_entries = Vec::with_capacity(inner_entries.len()); - for entry in &inner_entries { - let outer_child_ino = self - .translate_inner_ino(idx, entry.ino, ino, &entry.name) - .await; - if let Some(inner_attr) = self.slots[idx].inner.peek_attr(entry.ino).await { - let outer_attr = self.slots[idx].bridge.attr_backward(inner_attr); - self.icache.cache_attr(outer_child_ino, outer_attr).await; - } - outer_entries.push(DirEntry { - ino: outer_child_ino, - name: entry.name.clone(), - kind: entry.kind, - }); - } - self.readdir_buf = outer_entries; - Ok(&self.readdir_buf) + + true } } diff --git a/src/fs/mescloud/icache.rs b/src/fs/mescloud/icache.rs deleted file mode 100644 index 15f1f5d7..00000000 --- a/src/fs/mescloud/icache.rs +++ /dev/null @@ -1,437 +0,0 @@ -//! Mescloud-specific inode control block, helpers, and directory cache wrapper. 
- -use std::ffi::OsStr; -use std::time::SystemTime; - -use crate::fs::icache::{AsyncICache, IcbLike, IcbResolver, InodeFactory}; -use crate::fs::r#trait::{ - CommonFileAttr, DirEntryType, FileAttr, FilesystemStats, Inode, Permissions, -}; - -/// Inode control block for mescloud filesystem layers. -#[derive(Clone)] -pub struct InodeControlBlock { - pub parent: Option, - pub rc: u64, - pub path: std::path::PathBuf, - /// Cached file attributes from the last lookup. - pub attr: Option, - /// Cached directory children from the resolver (directories only). - pub children: Option>, -} - -impl IcbLike for InodeControlBlock { - fn new_root(path: std::path::PathBuf) -> Self { - Self { - rc: 1, - parent: None, - path, - attr: None, - children: None, - } - } - - fn rc(&self) -> u64 { - self.rc - } - - fn rc_mut(&mut self) -> &mut u64 { - &mut self.rc - } - - fn needs_resolve(&self) -> bool { - match self.attr { - None => true, - Some(FileAttr::Directory { .. }) => self.children.is_none(), - Some(_) => false, - } - } -} - -/// Calculate the number of blocks needed for a given size. -pub fn blocks_of_size(block_size: u32, size: u64) -> u64 { - size.div_ceil(u64::from(block_size)) -} - -/// Free function -- usable by both `MescloudICache` and resolvers. -pub fn make_common_file_attr( - ino: Inode, - perm: u16, - atime: SystemTime, - mtime: SystemTime, - fs_owner: (u32, u32), - block_size: u32, -) -> CommonFileAttr { - CommonFileAttr { - ino, - atime, - mtime, - ctime: SystemTime::UNIX_EPOCH, - crtime: SystemTime::UNIX_EPOCH, - perm: Permissions::from_bits_truncate(perm), - nlink: 1, - uid: fs_owner.0, - gid: fs_owner.1, - blksize: block_size, - } -} - -/// Mescloud-specific directory cache wrapper over `AsyncICache`. -pub struct MescloudICache> { - inner: AsyncICache, - inode_factory: InodeFactory, - fs_owner: (u32, u32), - block_size: u32, -} - -impl> MescloudICache { - /// Create a new `MescloudICache`. Initializes root ICB (rc=1), caches root dir attr. 
- pub fn new(resolver: R, root_ino: Inode, fs_owner: (u32, u32), block_size: u32) -> Self { - let cache = Self { - inner: AsyncICache::new(resolver, root_ino, "/"), - inode_factory: InodeFactory::new(root_ino + 1), - fs_owner, - block_size, - }; - - // Set root directory attr synchronously during initialization - let now = SystemTime::now(); - let root_attr = FileAttr::Directory { - common: make_common_file_attr(root_ino, 0o755, now, now, fs_owner, block_size), - }; - cache.inner.get_icb_mut_sync(root_ino, |icb| { - icb.attr = Some(root_attr); - }); - - cache - } - - // -- Delegated from AsyncICache (async) -- - - pub fn contains(&self, ino: Inode) -> bool { - self.inner.contains(ino) - } - - pub async fn get_icb( - &self, - ino: Inode, - // `Sync` required: see comment on `AsyncICache::get_icb`. - f: impl Fn(&InodeControlBlock) -> T + Send + Sync, - ) -> Option { - self.inner.get_icb(ino, f).await - } - - pub async fn insert_icb(&self, ino: Inode, icb: InodeControlBlock) { - self.inner.insert_icb(ino, icb).await; - } - - pub async fn entry_or_insert_icb( - &self, - ino: Inode, - factory: impl FnOnce() -> InodeControlBlock, - then: impl FnOnce(&mut InodeControlBlock) -> T, - ) -> T { - self.inner.entry_or_insert_icb(ino, factory, then).await - } - - pub async fn inc_rc(&self, ino: Inode) -> Option { - self.inner.inc_rc(ino).await - } - - pub async fn forget(&self, ino: Inode, nlookups: u64) -> Option { - self.inner.forget(ino, nlookups).await - } - - pub async fn get_or_resolve( - &self, - ino: Inode, - then: impl FnOnce(&InodeControlBlock) -> T, - ) -> Result { - self.inner.get_or_resolve(ino, then).await - } - - // -- Domain-specific -- - - /// Allocate a new inode number. 
- pub fn allocate_inode(&self) -> Inode { - self.inode_factory.allocate() - } - - pub async fn get_attr(&self, ino: Inode) -> Option { - self.inner.get_icb(ino, |icb| icb.attr).await.flatten() - } - - pub async fn cache_attr(&self, ino: Inode, attr: FileAttr) { - self.inner - .get_icb_mut(ino, |icb| { - icb.attr = Some(attr); - }) - .await; - } - - pub fn fs_owner(&self) -> (u32, u32) { - self.fs_owner - } - - pub fn block_size(&self) -> u32 { - self.block_size - } - - pub fn statfs(&self) -> FilesystemStats { - FilesystemStats { - block_size: self.block_size, - fragment_size: u64::from(self.block_size), - total_blocks: 0, - free_blocks: 0, - available_blocks: 0, - total_inodes: self.inner.inode_count() as u64, - free_inodes: 0, - available_inodes: 0, - filesystem_id: 0, - mount_flags: 0, - max_filename_length: 255, - } - } - - /// Evict all `Available` children of `parent` that have `rc == 0`. - /// Returns the list of evicted inode numbers so callers can clean up - /// associated state (e.g., bridge mappings, slot tracking). - pub async fn evict_zero_rc_children(&self, parent: Inode) -> Vec { - let mut to_evict = Vec::new(); - self.inner - .for_each(|&ino, icb| { - if icb.rc == 0 && icb.parent == Some(parent) { - to_evict.push(ino); - } - }) - .await; - let mut evicted = Vec::new(); - for ino in to_evict { - if self.inner.forget(ino, 0).await.is_some() { - evicted.push(ino); - } - } - evicted - } - - /// Find an existing child by (parent, name) or allocate a new inode. - /// If new, inserts a stub ICB (parent+path set, attr=None, children=None, rc=0). - /// Does NOT bump rc. Returns the inode number. - /// - /// # Safety invariant - /// - /// The `for_each` scan and `insert_icb` are **not** atomic. If two callers - /// race with the same `(parent, name)`, both may allocate distinct inodes - /// for the same logical child. This is currently safe because all callers - /// go through `&mut self` on the owning `Fs` implementation. 
- pub async fn ensure_child_ino(&self, parent: Inode, name: &OsStr) -> Inode { - // Search for existing child by parent + name - let mut existing_ino = None; - self.inner - .for_each(|&ino, icb| { - if icb.parent == Some(parent) && icb.path.as_os_str() == name { - existing_ino = Some(ino); - } - }) - .await; - - if let Some(ino) = existing_ino { - return ino; - } - - // Allocate new inode and insert stub - let ino = self.inode_factory.allocate(); - self.inner - .insert_icb( - ino, - InodeControlBlock { - rc: 0, - path: name.into(), - parent: Some(parent), - attr: None, - children: None, - }, - ) - .await; - ino - } -} - -#[cfg(test)] -mod tests { - use std::future::Future; - - use super::*; - use crate::fs::icache::async_cache::AsyncICache; - use crate::fs::r#trait::DirEntryType; - - fn dummy_dir_attr(ino: Inode) -> FileAttr { - let now = SystemTime::now(); - FileAttr::Directory { - common: make_common_file_attr(ino, 0o755, now, now, (0, 0), 4096), - } - } - - fn dummy_file_attr(ino: Inode) -> FileAttr { - let now = SystemTime::now(); - FileAttr::RegularFile { - common: make_common_file_attr(ino, 0o644, now, now, (0, 0), 4096), - size: 100, - blocks: 1, - } - } - - #[test] - fn needs_resolve_stub_returns_true() { - let icb = InodeControlBlock { - parent: Some(1), - rc: 0, - path: "stub".into(), - attr: None, - children: None, - }; - assert!(icb.needs_resolve()); - } - - #[test] - fn needs_resolve_file_with_attr_returns_false() { - let icb = InodeControlBlock { - parent: Some(1), - rc: 1, - path: "file.txt".into(), - attr: Some(dummy_file_attr(2)), - children: None, - }; - assert!(!icb.needs_resolve()); - } - - #[test] - fn needs_resolve_dir_without_children_returns_true() { - let icb = InodeControlBlock { - parent: Some(1), - rc: 1, - path: "dir".into(), - attr: Some(dummy_dir_attr(3)), - children: None, - }; - assert!(icb.needs_resolve()); - } - - #[test] - fn needs_resolve_dir_with_children_returns_false() { - let icb = InodeControlBlock { - parent: Some(1), - 
rc: 1, - path: "dir".into(), - attr: Some(dummy_dir_attr(3)), - children: Some(vec![("README.md".to_owned(), DirEntryType::RegularFile)]), - }; - assert!(!icb.needs_resolve()); - } - - #[test] - fn needs_resolve_dir_with_empty_children_returns_false() { - let icb = InodeControlBlock { - parent: Some(1), - rc: 1, - path: "empty-dir".into(), - attr: Some(dummy_dir_attr(4)), - children: Some(vec![]), - }; - assert!(!icb.needs_resolve()); - } - - struct NoOpResolver; - - impl IcbResolver for NoOpResolver { - type Icb = InodeControlBlock; - type Error = std::convert::Infallible; - - #[expect( - clippy::manual_async_fn, - reason = "must match IcbResolver trait signature" - )] - fn resolve( - &self, - _ino: Inode, - _stub: Option, - _cache: &AsyncICache, - ) -> impl Future> + Send { - async { unreachable!("NoOpResolver should not be called") } - } - } - - fn test_mescloud_cache() -> MescloudICache { - MescloudICache::new(NoOpResolver, 1, (0, 0), 4096) - } - - #[tokio::test] - async fn evict_zero_rc_children_removes_stubs() { - let cache = test_mescloud_cache(); - - // Insert stubs as children of root (ino=1) with rc=0 - cache - .insert_icb( - 10, - InodeControlBlock { - rc: 0, - path: "child_a".into(), - parent: Some(1), - attr: None, - children: None, - }, - ) - .await; - cache - .insert_icb( - 11, - InodeControlBlock { - rc: 0, - path: "child_b".into(), - parent: Some(1), - attr: None, - children: None, - }, - ) - .await; - - // Insert a child with rc > 0 — should survive - cache - .insert_icb( - 12, - InodeControlBlock { - rc: 1, - path: "active".into(), - parent: Some(1), - attr: None, - children: None, - }, - ) - .await; - - // Insert a stub under a different parent — should survive - cache - .insert_icb( - 20, - InodeControlBlock { - rc: 0, - path: "other".into(), - parent: Some(12), - attr: None, - children: None, - }, - ) - .await; - - let evicted = cache.evict_zero_rc_children(1).await; - assert_eq!(evicted.len(), 2, "should evict 2 zero-rc children of root"); - 
- assert!(!cache.contains(10), "child_a should be evicted"); - assert!(!cache.contains(11), "child_b should be evicted"); - assert!(cache.contains(12), "active child should survive"); - assert!( - cache.contains(20), - "child of different parent should survive" - ); - } -} diff --git a/src/fs/mescloud/mod.rs b/src/fs/mescloud/mod.rs index 1a3cce80..15a70725 100644 --- a/src/fs/mescloud/mod.rs +++ b/src/fs/mescloud/mod.rs @@ -1,24 +1,23 @@ -use std::collections::HashMap; -use std::ffi::OsStr; +use std::ffi::{OsStr, OsString}; use std::future::Future; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; use std::time::SystemTime; use bytes::Bytes; +use git_fs::fs::{FileHandle, INode, INodeType, InodeAddr, InodePerms, OpenFlags}; use mesa_dev::MesaClient; use opentelemetry::propagation::Injector; use secrecy::ExposeSecret as _; -use tracing::{Instrument as _, instrument, trace, warn}; +use tracing::{instrument, trace, warn}; use tracing_opentelemetry::OpenTelemetrySpanExt as _; use crate::app_config::CacheConfig; -use crate::fs::icache::bridge::HashMapBridge; -use crate::fs::icache::{AsyncICache, FileTable, IcbResolver}; -use crate::fs::r#trait::{ - DirEntry, DirEntryType, FileAttr, FileHandle, FilesystemStats, Fs, Inode, LockOwner, OpenFile, - OpenFlags, -}; -use composite::{ChildSlot, CompositeFs}; +pub use common::FsDirEntry; +use composite::CompositeFs; + +pub use common::{GetAttrError, LookupError, OpenError, ReadDirError, ReadError, ReleaseError}; #[cfg(feature = "staging")] const MESA_API_BASE_URL: &str = "https://staging.depot.mesa.dev/api/v1"; @@ -27,17 +26,11 @@ const MESA_API_BASE_URL: &str = "https://depot.mesa.dev/api/v1"; mod common; mod composite; -use common::InodeControlBlock; -pub use common::{GetAttrError, LookupError, OpenError, ReadDirError, ReadError, ReleaseError}; - -use icache as mescloud_icache; -use icache::MescloudICache; mod org; pub use org::OrgConfig; use org::OrgFs; -pub mod icache; pub mod repo; struct 
HeaderInjector<'a>(&'a mut reqwest::header::HeaderMap); @@ -89,50 +82,6 @@ fn build_mesa_client(api_key: &str) -> MesaClient { .build() } -struct MesaResolver { - fs_owner: (u32, u32), - block_size: u32, -} - -impl IcbResolver for MesaResolver { - type Icb = InodeControlBlock; - type Error = std::convert::Infallible; - - fn resolve( - &self, - ino: Inode, - stub: Option, - _cache: &AsyncICache, - ) -> impl Future> + Send - where - Self: Sized, - { - let fs_owner = self.fs_owner; - let block_size = self.block_size; - async move { - let stub = stub.unwrap_or_else(|| InodeControlBlock { - parent: None, - path: "/".into(), - rc: 0, - attr: None, - children: None, - }); - let now = SystemTime::now(); - let attr = FileAttr::Directory { - common: mescloud_icache::make_common_file_attr( - ino, 0o755, now, now, fs_owner, block_size, - ), - }; - Ok(InodeControlBlock { - attr: Some(attr), - children: Some(vec![]), - ..stub - }) - } - .instrument(tracing::info_span!("MesaResolver::resolve", ino)) - } -} - /// Classifies an inode by its role in the mesa hierarchy. enum InodeRole { /// The filesystem root (ino == 1). @@ -146,11 +95,11 @@ enum InodeRole { /// Composes multiple [`OrgFs`] instances, each with its own inode namespace, /// delegating to [`CompositeFs`] for inode/fh translation at each boundary. pub struct MesaFS { - composite: CompositeFs, + composite: CompositeFs, } impl MesaFS { - const ROOT_NODE_INO: Inode = 1; + const ROOT_NODE_INO: InodeAddr = CompositeFs::::ROOT_INO; const BLOCK_SIZE: u32 = 4096; /// Create a new `MesaFS` instance. 
@@ -160,38 +109,17 @@ impl MesaFS { fs_owner: (u32, u32), cache: &CacheConfig, ) -> Self { - let resolver = MesaResolver { - fs_owner, - block_size: Self::BLOCK_SIZE, - }; - Self { - composite: CompositeFs { - icache: MescloudICache::new( - resolver, - Self::ROOT_NODE_INO, - fs_owner, - Self::BLOCK_SIZE, - ), - file_table: FileTable::new(), - readdir_buf: Vec::new(), - child_inodes: HashMap::new(), - inode_to_slot: HashMap::new(), - slots: orgs - .map(|org_conf| { - let client = build_mesa_client(org_conf.api_key.expose_secret()); - let org = OrgFs::new(org_conf.name, client, fs_owner, cache.clone()); - ChildSlot { - inner: org, - bridge: HashMapBridge::new(), - } - }) - .collect(), - }, + let mut composite = CompositeFs::new(fs_owner, Self::BLOCK_SIZE); + for org_conf in orgs { + let client = build_mesa_client(org_conf.api_key.expose_secret()); + let org = OrgFs::new(org_conf.name, client, fs_owner, cache.clone()); + composite.add_child(org, OrgFs::ROOT_INO); } + Self { composite } } /// Classify an inode by its role. - fn inode_role(&self, ino: Inode) -> Option { + fn inode_role(&self, ino: InodeAddr) -> Option { if ino == Self::ROOT_NODE_INO { return Some(InodeRole::Root); } @@ -205,10 +133,8 @@ impl MesaFS { } /// Ensure a mesa-level inode exists for the org at `org_idx`. - /// Seeds the bridge with (`mesa_org_ino`, `OrgFs::ROOT_INO`). /// Does NOT bump rc. - async fn ensure_org_inode(&mut self, org_idx: usize) -> (Inode, FileAttr) { - // Check if an inode already exists. 
+ async fn ensure_org_inode(&mut self, org_idx: usize) -> (InodeAddr, INode) { let existing_ino = self .composite .child_inodes @@ -217,104 +143,62 @@ impl MesaFS { .map(|(&ino, _)| ino); if let Some(existing_ino) = existing_ino { - if let Some(attr) = self.composite.icache.get_attr(existing_ino).await { - let rc = self - .composite - .icache - .get_icb(existing_ino, |icb| icb.rc) - .await - .unwrap_or(0); + if let Ok(inode) = self.composite.delegated_getattr(existing_ino).await { trace!( ino = existing_ino, - org_idx, rc, "ensure_org_inode: reusing existing inode" - ); - return (existing_ino, attr); - } - if self.composite.icache.contains(existing_ino) { - // ICB exists but attr missing — rebuild and cache. - warn!( - ino = existing_ino, - org_idx, "ensure_org_inode: attr missing, rebuilding" + org_idx, "ensure_org_inode: reusing existing inode" ); - let now = SystemTime::now(); - let attr = FileAttr::Directory { - common: mescloud_icache::make_common_file_attr( - existing_ino, - 0o755, - now, - now, - self.composite.icache.fs_owner(), - self.composite.icache.block_size(), - ), - }; - self.composite.icache.cache_attr(existing_ino, attr).await; - return (existing_ino, attr); + return (existing_ino, inode); } - // ICB was evicted — clean up stale tracking entries. 
warn!( ino = existing_ino, - org_idx, "ensure_org_inode: ICB evicted, cleaning up stale entry" + org_idx, "ensure_org_inode: evicted, rebuilding" ); - self.composite.child_inodes.remove(&existing_ino); - self.composite.inode_to_slot.remove(&existing_ino); + let now = SystemTime::now(); + let inode = INode { + addr: existing_ino, + permissions: InodePerms::from_bits_truncate(0o755), + uid: self.composite.fs_owner().0, + gid: self.composite.fs_owner().1, + create_time: now, + last_modified_at: now, + parent: Some(Self::ROOT_NODE_INO), + size: 0, + itype: INodeType::Directory, + }; + self.composite.cache_inode(inode); + self.composite.inode_to_slot.insert(existing_ino, org_idx); + self.composite.child_inodes.insert(existing_ino, org_idx); + return (existing_ino, inode); } - // Allocate new. + warn!( + org_idx, + "ensure_org_inode: no child_inodes entry for org slot" + ); let org_name = self.composite.slots[org_idx].inner.name().to_owned(); - let ino = self.composite.icache.allocate_inode(); - trace!(ino, org_idx, org = %org_name, "ensure_org_inode: allocated new inode"); - + let ino = self.composite.allocate_inode(); let now = SystemTime::now(); - self.composite - .icache - .insert_icb( - ino, - InodeControlBlock { - rc: 0, - path: org_name.as_str().into(), - parent: Some(Self::ROOT_NODE_INO), - attr: None, - children: None, - }, - ) - .await; - + let inode = INode { + addr: ino, + permissions: InodePerms::from_bits_truncate(0o755), + uid: self.composite.fs_owner().0, + gid: self.composite.fs_owner().1, + create_time: now, + last_modified_at: now, + parent: Some(Self::ROOT_NODE_INO), + size: 0, + itype: INodeType::Directory, + }; + self.composite.cache_inode(inode); self.composite.child_inodes.insert(ino, org_idx); self.composite.inode_to_slot.insert(ino, org_idx); - - // Reset bridge (may have stale mappings from a previous eviction cycle) - // and seed: mesa org-root <-> OrgFs::ROOT_INO. 
- self.composite.slots[org_idx].bridge = HashMapBridge::new(); - self.composite.slots[org_idx] - .bridge - .insert_inode(ino, OrgFs::ROOT_INO); - - let attr = FileAttr::Directory { - common: mescloud_icache::make_common_file_attr( - ino, - 0o755, - now, - now, - self.composite.icache.fs_owner(), - self.composite.icache.block_size(), - ), - }; - self.composite.icache.cache_attr(ino, attr).await; - (ino, attr) + trace!(ino, org_idx, org = %org_name, "ensure_org_inode: allocated new inode"); + (ino, inode) } -} - -#[async_trait::async_trait] -impl Fs for MesaFS { - type LookupError = LookupError; - type GetAttrError = GetAttrError; - type OpenError = OpenError; - type ReadError = ReadError; - type ReaddirError = ReadDirError; - type ReleaseError = ReleaseError; #[instrument(name = "MesaFS::lookup", skip(self))] - async fn lookup(&mut self, parent: Inode, name: &OsStr) -> Result { + pub async fn lookup(&mut self, parent: InodeAddr, name: &OsStr) -> Result { let role = self.inode_role(parent).ok_or(LookupError::InodeNotFound)?; match role { InodeRole::Root => { @@ -327,31 +211,23 @@ impl Fs for MesaFS { .ok_or(LookupError::InodeNotFound)?; trace!(org = org_name, "lookup: matched org"); - let (ino, attr) = self.ensure_org_inode(org_idx).await; - let rc = self - .composite - .icache + let (ino, inode) = self.ensure_org_inode(org_idx).await; + self.composite .inc_rc(ino) - .await .ok_or(LookupError::InodeNotFound)?; - trace!(ino, org = org_name, rc, "lookup: resolved org inode"); - Ok(attr) + Ok(inode) } InodeRole::OrgOwned => self.composite.delegated_lookup(parent, name).await, } } #[instrument(name = "MesaFS::getattr", skip(self))] - async fn getattr( - &mut self, - ino: Inode, - _fh: Option, - ) -> Result { + pub async fn getattr(&self, ino: InodeAddr) -> Result { self.composite.delegated_getattr(ino).await } #[instrument(name = "MesaFS::readdir", skip(self))] - async fn readdir(&mut self, ino: Inode) -> Result<&[DirEntry], ReadDirError> { + pub async fn readdir(&mut 
self, ino: InodeAddr) -> Result<&[FsDirEntry], ReadDirError> { let role = self.inode_role(ino).ok_or(ReadDirError::InodeNotFound)?; match role { InodeRole::Root => { @@ -365,11 +241,10 @@ impl Fs for MesaFS { let mut entries = Vec::with_capacity(org_info.len()); for (org_idx, name) in &org_info { - let (org_ino, _) = self.ensure_org_inode(*org_idx).await; - entries.push(DirEntry { - ino: org_ino, + let (entry_ino, _) = self.ensure_org_inode(*org_idx).await; + entries.push(FsDirEntry { + ino: entry_ino, name: name.clone().into(), - kind: DirEntryType::Directory, }); } @@ -382,45 +257,178 @@ impl Fs for MesaFS { } #[instrument(name = "MesaFS::open", skip(self))] - async fn open(&mut self, ino: Inode, flags: OpenFlags) -> Result { + pub async fn open( + &mut self, + ino: InodeAddr, + flags: OpenFlags, + ) -> Result { self.composite.delegated_open(ino, flags).await } #[instrument(name = "MesaFS::read", skip(self))] - async fn read( + pub async fn read( &mut self, - ino: Inode, fh: FileHandle, offset: u64, size: u32, - flags: OpenFlags, - lock_owner: Option, ) -> Result { - self.composite - .delegated_read(ino, fh, offset, size, flags, lock_owner) - .await + self.composite.delegated_read(fh, offset, size).await } #[instrument(name = "MesaFS::release", skip(self))] - async fn release( - &mut self, - ino: Inode, - fh: FileHandle, - flags: OpenFlags, - flush: bool, - ) -> Result<(), ReleaseError> { - self.composite - .delegated_release(ino, fh, flags, flush) - .await + pub async fn release(&mut self, fh: FileHandle) -> Result<(), ReleaseError> { + self.composite.delegated_release(fh).await } +} + +/// A file reader that delegates reads to `MesaFS` through a shared mutex. +/// +/// Resources are released via [`FileReader::close`](git_fs::fs::async_fs::FileReader::close), +/// which is called by the FUSE adapter during `release`. Dropping without +/// calling `close()` emits a diagnostic warning. 
+pub struct MesaFsReader { + inner: Arc>, + fh: FileHandle, + closed: AtomicBool, +} - #[instrument(name = "MesaFS::forget", skip(self))] - async fn forget(&mut self, ino: Inode, nlookups: u64) { - // MesaFS has no extra state to clean up on eviction (unlike OrgFs::owner_inodes). - let _ = self.composite.delegated_forget(ino, nlookups).await; +impl git_fs::fs::async_fs::FileReader for MesaFsReader { + fn read( + &self, + offset: u64, + size: u32, + ) -> impl Future> + Send { + let inner = Arc::clone(&self.inner); + let fh = self.fh; + async move { + let mut guard = inner.lock().await; + guard + .read(fh, offset, size) + .await + .map_err(|e| std::io::Error::other(e.to_string())) + } } - async fn statfs(&mut self) -> Result { - Ok(self.composite.delegated_statfs()) + fn close(&self) -> impl Future> + Send { + self.closed.store(true, Ordering::Relaxed); + let inner = Arc::clone(&self.inner); + let fh = self.fh; + async move { + let mut guard = inner.lock().await; + guard + .release(fh) + .await + .map_err(|e| std::io::Error::other(e.to_string())) + } + } +} + +impl Drop for MesaFsReader { + fn drop(&mut self) { + if !self.closed.load(Ordering::Relaxed) { + tracing::warn!(fh = self.fh, "MesaFsReader dropped without close()"); + } + } +} + +/// A [`FsDataProvider`](git_fs::fs::async_fs::FsDataProvider) that wraps +/// `MesaFS` behind a shared mutex. +#[derive(Clone)] +pub struct MesaFsProvider { + inner: Arc>, +} + +impl MesaFsProvider { + /// Create a new provider wrapping the given `MesaFS`. 
+ pub fn new(mesa_fs: MesaFS) -> Self { + Self { + inner: Arc::new(tokio::sync::Mutex::new(mesa_fs)), + } + } +} + +fn lookup_error_to_io(e: LookupError) -> std::io::Error { + match e { + LookupError::InodeNotFound => std::io::Error::from_raw_os_error(libc::ENOENT), + LookupError::RemoteMesaError(api) => std::io::Error::other(api.to_string()), + } +} + +fn readdir_error_to_io(e: ReadDirError) -> std::io::Error { + match e { + ReadDirError::InodeNotFound => std::io::Error::from_raw_os_error(libc::ENOENT), + ReadDirError::NotADirectory => std::io::Error::from_raw_os_error(libc::ENOTDIR), + ReadDirError::NotPermitted => std::io::Error::from_raw_os_error(libc::EPERM), + ReadDirError::RemoteMesaError(api) => std::io::Error::other(api.to_string()), + } +} + +fn open_error_to_io(e: OpenError) -> std::io::Error { + match e { + OpenError::InodeNotFound => std::io::Error::from_raw_os_error(libc::ENOENT), + } +} + +impl git_fs::fs::async_fs::FsDataProvider for MesaFsProvider { + type Reader = MesaFsReader; + + fn lookup( + &self, + parent: INode, + name: &OsStr, + ) -> impl Future> + Send { + let inner = Arc::clone(&self.inner); + let name = name.to_os_string(); + async move { + let mut guard = inner.lock().await; + guard + .lookup(parent.addr, &name) + .await + .map_err(lookup_error_to_io) + } + } + + fn readdir( + &self, + parent: INode, + ) -> impl Future, std::io::Error>> + Send { + let inner = Arc::clone(&self.inner); + async move { + let mut guard = inner.lock().await; + let dir_entries: Vec<(OsString, InodeAddr)> = { + let entries = guard + .readdir(parent.addr) + .await + .map_err(readdir_error_to_io)?; + entries.iter().map(|e| (e.name.clone(), e.ino)).collect() + }; + let mut result = Vec::with_capacity(dir_entries.len()); + for (name, ino) in dir_entries { + if let Ok(inode) = guard.getattr(ino).await { + result.push((name, inode)); + } + } + Ok(result) + } + } + + fn open( + &self, + inode: INode, + flags: OpenFlags, + ) -> impl Future> + Send { + let inner = 
Arc::clone(&self.inner); + async move { + let mut guard = inner.lock().await; + let fh = guard + .open(inode.addr, flags) + .await + .map_err(open_error_to_io)?; + Ok(MesaFsReader { + inner: Arc::clone(&inner), + fh, + closed: AtomicBool::new(false), + }) + } } } diff --git a/src/fs/mescloud/org.rs b/src/fs/mescloud/org.rs index 1f3b8b5f..feefaf8e 100644 --- a/src/fs/mescloud/org.rs +++ b/src/fs/mescloud/org.rs @@ -1,73 +1,19 @@ use std::collections::HashMap; -use std::ffi::OsStr; -use std::future::Future; +use std::ffi::{OsStr, OsString}; use std::time::SystemTime; use bytes::Bytes; use futures::TryStreamExt as _; +use git_fs::fs::{FileHandle, INode, INodeType, InodeAddr, InodePerms, OpenFlags}; use mesa_dev::MesaClient; use secrecy::SecretString; -use tracing::{Instrument as _, instrument, trace, warn}; - -pub use super::common::{ - GetAttrError, LookupError, OpenError, ReadDirError, ReadError, ReleaseError, -}; -use super::common::{InodeControlBlock, MesaApiError}; -use super::composite::{ChildSlot, CompositeFs}; -use super::icache as mescloud_icache; -use super::icache::MescloudICache; +use tracing::{instrument, trace, warn}; + +use super::common::{ChildFs, MesaApiError}; +pub use super::common::{LookupError, OpenError, ReadDirError, ReadError, ReleaseError}; +use super::composite::CompositeFs; use super::repo::RepoFs; use crate::app_config::CacheConfig; -use crate::fs::icache::bridge::HashMapBridge; -use crate::fs::icache::{AsyncICache, FileTable, IcbResolver}; -use crate::fs::r#trait::{ - DirEntry, DirEntryType, FileAttr, FileHandle, FilesystemStats, Fs, Inode, LockOwner, OpenFile, - OpenFlags, -}; - -pub(super) struct OrgResolver { - fs_owner: (u32, u32), - block_size: u32, -} - -impl IcbResolver for OrgResolver { - type Icb = InodeControlBlock; - type Error = LookupError; - - fn resolve( - &self, - ino: Inode, - stub: Option, - _cache: &AsyncICache, - ) -> impl Future> + Send - where - Self: Sized, - { - let fs_owner = self.fs_owner; - let block_size = 
self.block_size; - async move { - let stub = stub.unwrap_or_else(|| InodeControlBlock { - parent: None, - path: "/".into(), - rc: 0, - attr: None, - children: None, - }); - let now = SystemTime::now(); - let attr = FileAttr::Directory { - common: mescloud_icache::make_common_file_attr( - ino, 0o755, now, now, fs_owner, block_size, - ), - }; - Ok(InodeControlBlock { - attr: Some(attr), - children: Some(vec![]), - ..stub - }) - } - .instrument(tracing::info_span!("OrgResolver::resolve", ino)) - } -} #[derive(Debug, Clone)] pub struct OrgConfig { @@ -81,7 +27,7 @@ enum InodeRole { OrgRoot, /// A virtual owner directory (github only). OwnerDir, - /// An inode owned by some repo. + /// An inode owned by some repo (either a child-root or delegated). RepoOwned, } @@ -92,14 +38,14 @@ enum InodeRole { pub struct OrgFs { name: String, client: MesaClient, - composite: CompositeFs, + composite: CompositeFs, /// Maps org-level owner-dir inodes to owner name (github only). - owner_inodes: HashMap, + owner_inodes: HashMap, cache_config: CacheConfig, } impl OrgFs { - pub(crate) const ROOT_INO: Inode = 1; + pub(crate) const ROOT_INO: InodeAddr = CompositeFs::::ROOT_INO; const BLOCK_SIZE: u32 = 4096; /// The name of the organization. @@ -123,31 +69,14 @@ impl OrgFs { /// Ensure an inode exists for a virtual owner directory (github only). Does NOT bump rc. /// TODO(MES-674): Cleanup "special" casing for github. 
- async fn ensure_owner_inode(&mut self, owner: &str) -> (Inode, FileAttr) { + async fn ensure_owner_inode(&mut self, owner: &str) -> (InodeAddr, INode) { // Check existing let mut stale_ino = None; for (&ino, existing_owner) in &self.owner_inodes { if existing_owner == owner { - if let Some(attr) = self.composite.icache.get_attr(ino).await { - return (ino, attr); - } - if self.composite.icache.contains(ino) { - // ICB exists but attr missing — rebuild and cache - let now = SystemTime::now(); - let attr = FileAttr::Directory { - common: mescloud_icache::make_common_file_attr( - ino, - 0o755, - now, - now, - self.composite.icache.fs_owner(), - self.composite.icache.block_size(), - ), - }; - self.composite.icache.cache_attr(ino, attr).await; - return (ino, attr); + if let Ok(inode) = self.composite.delegated_getattr(ino).await { + return (ino, inode); } - // ICB was evicted — mark for cleanup stale_ino = Some(ino); break; } @@ -156,35 +85,22 @@ impl OrgFs { self.owner_inodes.remove(&ino); } - // Allocate new - let ino = self.composite.icache.allocate_inode(); + let ino = self.composite.allocate_inode(); let now = SystemTime::now(); - self.composite - .icache - .insert_icb( - ino, - InodeControlBlock { - rc: 0, - path: owner.into(), - parent: Some(Self::ROOT_INO), - attr: None, - children: None, - }, - ) - .await; - self.owner_inodes.insert(ino, owner.to_owned()); - let attr = FileAttr::Directory { - common: mescloud_icache::make_common_file_attr( - ino, - 0o755, - now, - now, - self.composite.icache.fs_owner(), - self.composite.icache.block_size(), - ), + let inode = INode { + addr: ino, + permissions: InodePerms::from_bits_truncate(0o755), + uid: self.composite.fs_owner().0, + gid: self.composite.fs_owner().1, + create_time: now, + last_modified_at: now, + parent: Some(Self::ROOT_INO), + size: 0, + itype: INodeType::Directory, }; - self.composite.icache.cache_attr(ino, attr).await; - (ino, attr) + self.composite.cache_inode_and_init_rc(inode); + 
self.owner_inodes.insert(ino, owner.to_owned()); + (ino, inode) } #[must_use] @@ -194,28 +110,17 @@ impl OrgFs { fs_owner: (u32, u32), cache_config: CacheConfig, ) -> Self { - let resolver = OrgResolver { - fs_owner, - block_size: Self::BLOCK_SIZE, - }; Self { name, client, - composite: CompositeFs { - icache: MescloudICache::new(resolver, Self::ROOT_INO, fs_owner, Self::BLOCK_SIZE), - file_table: FileTable::new(), - readdir_buf: Vec::new(), - child_inodes: HashMap::new(), - inode_to_slot: HashMap::new(), - slots: Vec::new(), - }, + composite: CompositeFs::new(fs_owner, Self::BLOCK_SIZE), owner_inodes: HashMap::new(), cache_config, } } /// Classify an inode by its role. - fn inode_role(&self, ino: Inode) -> Option { + fn inode_role(&self, ino: InodeAddr) -> Option { if ino == Self::ROOT_INO { return Some(InodeRole::OrgRoot); } @@ -242,144 +147,92 @@ impl OrgFs { repo_name: &str, display_name: &str, default_branch: &str, - parent_ino: Inode, - ) -> (Inode, FileAttr) { + parent_ino: InodeAddr, + ) -> (InodeAddr, INode) { // Check existing repos. for (&ino, &idx) in &self.composite.child_inodes { if self.composite.slots[idx].inner.repo_name() == repo_name { - if let Some(attr) = self.composite.icache.get_attr(ino).await { - let rc = self - .composite - .icache - .get_icb(ino, |icb| icb.rc) - .await - .unwrap_or(0); - trace!(ino, repo = repo_name, rc, "ensure_repo_inode: reusing"); - return (ino, attr); + if let Ok(inode) = self.composite.delegated_getattr(ino).await { + trace!(ino, repo = repo_name, "ensure_repo_inode: reusing"); + return (ino, inode); } warn!( ino, repo = repo_name, "ensure_repo_inode: attr missing, rebuilding" ); - return self.make_repo_dir_attr(ino).await; + return self.make_repo_dir_inode(ino); } } - // Check for orphaned slot (slot exists but not in child_inodes). 
- if let Some(idx) = self - .composite - .slots - .iter() - .position(|s| s.inner.repo_name() == repo_name) - { - return self.register_repo_slot(idx, display_name, parent_ino).await; - } - - // Allocate truly new slot. - let ino = self.composite.icache.allocate_inode(); - trace!( - ino, - repo = repo_name, - "ensure_repo_inode: allocated new inode" - ); - - self.composite - .icache - .insert_icb( - ino, - InodeControlBlock { - rc: 0, - path: display_name.into(), - parent: Some(parent_ino), - attr: None, - children: None, - }, - ) - .await; - + // Create new RepoFs and register as child. let repo = RepoFs::new( self.client.clone(), self.name.clone(), repo_name.to_owned(), default_branch.to_owned(), - self.composite.icache.fs_owner(), - // TODO(markovejnovic): Unnecessary clone. Refactoring for clearer ownership semantics - // would be ideal. + self.composite.fs_owner(), self.cache_config.clone(), ) .await; - let mut bridge = HashMapBridge::new(); - bridge.insert_inode(ino, RepoFs::ROOT_INO); - - let idx = self.composite.slots.len(); - self.composite.slots.push(ChildSlot { - inner: repo, - bridge, - }); - self.composite.child_inodes.insert(ino, idx); - self.composite.inode_to_slot.insert(ino, idx); - - self.make_repo_dir_attr(ino).await - } - - /// Allocate a new inode, register it in an existing (orphaned) slot, and - /// return `(ino, attr)`. - async fn register_repo_slot( - &mut self, - idx: usize, - display_name: &str, - parent_ino: Inode, - ) -> (Inode, FileAttr) { - let ino = self.composite.icache.allocate_inode(); - trace!(ino, idx, "register_repo_slot: reusing orphaned slot"); + let outer_ino = self + .composite + .add_child_with_parent(repo, RepoFs::ROOT_INO, parent_ino); + trace!( + ino = outer_ino, + repo = repo_name, + "ensure_repo_inode: allocated new inode" + ); + // Register in directory cache so readdir sees it. 
self.composite - .icache - .insert_icb( - ino, - InodeControlBlock { - rc: 0, - path: display_name.into(), - parent: Some(parent_ino), - attr: None, - children: None, - }, + .directory_cache + .insert( + git_fs::fs::LoadedAddr(parent_ino), + OsString::from(display_name), + git_fs::fs::LoadedAddr(outer_ino), + true, ) .await; - warn!( - ino, - idx, - "register_repo_slot: resetting bridge for orphaned slot; \ - inner filesystem will not receive forget for stale inode mappings" - ); - self.composite.slots[idx].bridge = HashMapBridge::new(); - self.composite.slots[idx] - .bridge - .insert_inode(ino, RepoFs::ROOT_INO); - self.composite.child_inodes.insert(ino, idx); - self.composite.inode_to_slot.insert(ino, idx); - - self.make_repo_dir_attr(ino).await + let inode = self + .composite + .delegated_getattr(outer_ino) + .await + .unwrap_or_else(|_| { + let now = SystemTime::now(); + INode { + addr: outer_ino, + permissions: InodePerms::from_bits_truncate(0o755), + uid: self.composite.fs_owner().0, + gid: self.composite.fs_owner().1, + create_time: now, + last_modified_at: now, + parent: Some(parent_ino), + size: 0, + itype: INodeType::Directory, + } + }); + (outer_ino, inode) } - /// Build and cache a directory attr for `ino`, returning `(ino, attr)`. - async fn make_repo_dir_attr(&self, ino: Inode) -> (Inode, FileAttr) { + /// Build a directory inode for `ino`, returning `(ino, inode)`. 
+ fn make_repo_dir_inode(&self, ino: InodeAddr) -> (InodeAddr, INode) { let now = SystemTime::now(); - let attr = FileAttr::Directory { - common: mescloud_icache::make_common_file_attr( - ino, - 0o755, - now, - now, - self.composite.icache.fs_owner(), - self.composite.icache.block_size(), - ), + let inode = INode { + addr: ino, + permissions: InodePerms::from_bits_truncate(0o755), + uid: self.composite.fs_owner().0, + gid: self.composite.fs_owner().1, + create_time: now, + last_modified_at: now, + parent: None, + size: 0, + itype: INodeType::Directory, }; - self.composite.icache.cache_attr(ino, attr).await; - (ino, attr) + self.composite.cache_inode(inode); + (ino, inode) } /// Fetch a repo by name via the API. @@ -398,62 +251,36 @@ impl OrgFs { } #[async_trait::async_trait] -impl super::common::InodeCachePeek for OrgFs { - async fn peek_attr(&self, ino: Inode) -> Option { - self.composite.icache.get_attr(ino).await - } -} - -#[async_trait::async_trait] -impl Fs for OrgFs { - type LookupError = LookupError; - type GetAttrError = GetAttrError; - type OpenError = OpenError; - type ReadError = ReadError; - type ReaddirError = ReadDirError; - type ReleaseError = ReleaseError; - +impl ChildFs for OrgFs { #[instrument(name = "OrgFs::lookup", skip(self), fields(org = %self.name))] - async fn lookup(&mut self, parent: Inode, name: &OsStr) -> Result { + async fn lookup(&mut self, parent: InodeAddr, name: &OsStr) -> Result { let role = self.inode_role(parent).ok_or(LookupError::InodeNotFound)?; match role { InodeRole::OrgRoot => { - // TODO(MES-674): Cleanup "special" casing for github. let name_str = name.to_str().ok_or(LookupError::InodeNotFound)?; if self.is_github() { - // name is an owner like "torvalds" — create lazily, no API validation. 
trace!(owner = name_str, "lookup: resolving github owner dir"); - let (ino, attr) = self.ensure_owner_inode(name_str).await; + let (ino, inode) = self.ensure_owner_inode(name_str).await; self.composite - .icache .inc_rc(ino) - .await .ok_or(LookupError::InodeNotFound)?; - Ok(attr) + Ok(inode) } else { - // Children of org root are repos. trace!(repo = name_str, "lookup: resolving repo"); - - // Validate repo exists via API. let repo = self.wait_for_sync(name_str).await?; - - let (ino, attr) = self + let (ino, inode) = self .ensure_repo_inode(name_str, name_str, &repo.default_branch, Self::ROOT_INO) .await; let rc = self .composite - .icache .inc_rc(ino) - .await .ok_or(LookupError::InodeNotFound)?; trace!(ino, repo = name_str, rc, "lookup: resolved repo inode"); - Ok(attr) + Ok(inode) } } InodeRole::OwnerDir => { - // TODO(MES-674): Cleanup "special" casing for github. - // Parent is an owner dir, name is a repo like "linux". let owner = self .owner_inodes .get(&parent) @@ -464,49 +291,32 @@ impl Fs for OrgFs { let encoded = Self::encode_github_repo_name(&full_decoded); trace!( - owner = %owner, - repo = repo_name_str, - encoded = %encoded, + owner = %owner, repo = repo_name_str, encoded = %encoded, "lookup: resolving github repo via owner dir" ); - // Validate via API (uses encoded name). 
let repo = self.wait_for_sync(&encoded).await?; - - let (ino, attr) = self + let (ino, inode) = self .ensure_repo_inode(&encoded, repo_name_str, &repo.default_branch, parent) .await; self.composite - .icache .inc_rc(ino) - .await .ok_or(LookupError::InodeNotFound)?; - Ok(attr) + Ok(inode) } InodeRole::RepoOwned => self.composite.delegated_lookup(parent, name).await, } } - #[instrument(name = "OrgFs::getattr", skip(self), fields(org = %self.name))] - async fn getattr( - &mut self, - ino: Inode, - _fh: Option, - ) -> Result { - self.composite.delegated_getattr(ino).await - } - #[instrument(name = "OrgFs::readdir", skip(self), fields(org = %self.name))] - async fn readdir(&mut self, ino: Inode) -> Result<&[DirEntry], ReadDirError> { + async fn readdir(&mut self, ino: InodeAddr) -> Result, ReadDirError> { let role = self.inode_role(ino).ok_or(ReadDirError::InodeNotFound)?; match role { InodeRole::OrgRoot => { - // TODO(MES-674): Cleanup "special" casing for github. if self.is_github() { return Err(ReadDirError::NotPermitted); } - // List repos via API. let repos: Vec = self .client .org(&self.name) @@ -528,70 +338,53 @@ impl Fs for OrgFs { let mut entries = Vec::with_capacity(repo_infos.len()); for (repo_name, default_branch) in &repo_infos { - let (repo_ino, _) = self + let (_, inode) = self .ensure_repo_inode(repo_name, repo_name, default_branch, Self::ROOT_INO) .await; - entries.push(DirEntry { - ino: repo_ino, - name: repo_name.clone().into(), - kind: DirEntryType::Directory, - }); + entries.push((OsString::from(repo_name), inode)); } - self.composite.readdir_buf = entries; - Ok(&self.composite.readdir_buf) - } - InodeRole::OwnerDir if self.is_github() => { - // TODO(MES-674): Cleanup "special" casing for github. 
- Err(ReadDirError::NotPermitted) + Ok(entries) } + InodeRole::OwnerDir if self.is_github() => Err(ReadDirError::NotPermitted), InodeRole::OwnerDir => Err(ReadDirError::NotADirectory), - InodeRole::RepoOwned => self.composite.delegated_readdir(ino).await, + InodeRole::RepoOwned => { + let dir_entries: Vec<_> = self + .composite + .delegated_readdir(ino) + .await? + .iter() + .map(|e| (e.name.clone(), e.ino)) + .collect(); + let mut entries = Vec::with_capacity(dir_entries.len()); + for (name, child_ino) in dir_entries { + if let Some(inode) = self.composite.inode_table.get(&child_ino).await { + entries.push((name, inode)); + } + } + Ok(entries) + } } } #[instrument(name = "OrgFs::open", skip(self), fields(org = %self.name))] - async fn open(&mut self, ino: Inode, flags: OpenFlags) -> Result { + async fn open(&mut self, ino: InodeAddr, flags: OpenFlags) -> Result { self.composite.delegated_open(ino, flags).await } #[instrument(name = "OrgFs::read", skip(self), fields(org = %self.name))] async fn read( &mut self, - ino: Inode, + _ino: InodeAddr, fh: FileHandle, offset: u64, size: u32, - flags: OpenFlags, - lock_owner: Option, ) -> Result { - self.composite - .delegated_read(ino, fh, offset, size, flags, lock_owner) - .await + self.composite.delegated_read(fh, offset, size).await } #[instrument(name = "OrgFs::release", skip(self), fields(org = %self.name))] - async fn release( - &mut self, - ino: Inode, - fh: FileHandle, - flags: OpenFlags, - flush: bool, - ) -> Result<(), ReleaseError> { - self.composite - .delegated_release(ino, fh, flags, flush) - .await - } - - #[instrument(name = "OrgFs::forget", skip(self), fields(org = %self.name))] - async fn forget(&mut self, ino: Inode, nlookups: u64) { - let evicted = self.composite.delegated_forget(ino, nlookups).await; - if evicted { - self.owner_inodes.remove(&ino); - } - } - - async fn statfs(&mut self) -> Result { - Ok(self.composite.delegated_statfs()) + async fn release(&mut self, _ino: InodeAddr, fh: FileHandle) -> 
Result<(), ReleaseError> { + self.composite.delegated_release(fh).await } } diff --git a/src/fs/mescloud/repo.rs b/src/fs/mescloud/repo.rs index 11b334a7..acff3d04 100644 --- a/src/fs/mescloud/repo.rs +++ b/src/fs/mescloud/repo.rs @@ -2,197 +2,436 @@ //! //! This module directly accesses the mesa repo through the Rust SDK, on a per-repo basis. +use std::collections::HashMap; +use std::ffi::OsString; use std::future::Future; -use std::{collections::HashMap, ffi::OsStr, path::PathBuf, time::SystemTime}; +use std::sync::Arc; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::time::SystemTime; +use std::{ffi::OsStr, path::PathBuf}; use base64::Engine as _; use bytes::Bytes; use mesa_dev::MesaClient; use mesa_dev::low_level::content::{Content, DirEntry as MesaDirEntry}; use num_traits::cast::ToPrimitive as _; -use tracing::{Instrument as _, instrument, trace, warn}; +use tracing::warn; use git_fs::cache::fcache::FileCache; use git_fs::cache::traits::{AsyncReadableCache as _, AsyncWritableCache as _}; +use git_fs::fs::async_fs::{FileReader, FsDataProvider}; +use git_fs::fs::{ + INode, INodeType, InodeAddr, InodePerms, LoadedAddr, OpenFlags as AsyncOpenFlags, +}; use crate::app_config::CacheConfig; -use crate::fs::icache::{AsyncICache, FileTable, IcbResolver}; -use crate::fs::r#trait::{ - DirEntry, DirEntryType, FileAttr, FileHandle, FileOpenOptions, FilesystemStats, Fs, Inode, - LockOwner, OpenFile, OpenFlags, -}; use super::common::MesaApiError; -pub use super::common::{ - GetAttrError, LookupError, OpenError, ReadDirError, ReadError, ReleaseError, -}; -use super::icache as mescloud_icache; -use super::icache::{InodeControlBlock, MescloudICache}; +pub use super::common::{LookupError, OpenError, ReadDirError, ReadError, ReleaseError}; + +fn mesa_api_error_to_io(e: MesaApiError) -> std::io::Error { + match &e { + MesaApiError::Response { status, .. 
} if *status == 404 => { + std::io::Error::from_raw_os_error(libc::ENOENT) + } + MesaApiError::Reqwest(_) + | MesaApiError::ReqwestMiddleware(_) + | MesaApiError::Serde(_) + | MesaApiError::SerdePath(_) + | MesaApiError::Io(_) + | MesaApiError::Response { .. } => std::io::Error::other(e), + } +} -pub(super) struct RepoResolver { +#[derive(Clone)] +pub(super) struct MesRepoProvider { + inner: Arc, +} + +struct MesRepoProviderInner { client: MesaClient, org_name: String, repo_name: String, ref_: String, fs_owner: (u32, u32), - block_size: u32, + next_addr: AtomicU64, + /// Maps inode addresses to repo-relative paths (e.g., "src/main.rs"). + /// Root directory maps to an empty `PathBuf`. + path_map: scc::HashMap, + file_cache: Option>>, +} + +impl MesRepoProvider { + pub(super) fn new( + client: MesaClient, + org_name: String, + repo_name: String, + ref_: String, + fs_owner: (u32, u32), + file_cache: Option>>, + ) -> Self { + Self { + inner: Arc::new(MesRepoProviderInner { + client, + org_name, + repo_name, + ref_, + fs_owner, + next_addr: AtomicU64::new(2), // 1 is reserved for root + path_map: scc::HashMap::new(), + file_cache, + }), + } + } + + /// Store the path for the root inode address. + pub(super) fn seed_root_path(&self, root_addr: InodeAddr) { + // Root maps to empty PathBuf (no path prefix for API calls) + drop(self.inner.path_map.insert_sync(root_addr, PathBuf::new())); + } + + /// Remove the path entry for an inode. Called during forget/cleanup. + #[expect(dead_code, reason = "will be needed when child forget is implemented")] + pub(super) fn remove_path(&self, addr: InodeAddr) { + self.inner.path_map.remove_sync(&addr); + } + + /// The name of the repository. 
+ pub(super) fn repo_name(&self) -> &str { + &self.inner.repo_name + } } -impl IcbResolver for RepoResolver { - type Icb = InodeControlBlock; - type Error = LookupError; +impl FsDataProvider for MesRepoProvider { + type Reader = MesFileReader; + + fn lookup( + &self, + parent: INode, + name: &OsStr, + ) -> impl Future> + Send { + let inner = Arc::clone(&self.inner); + let name = name.to_os_string(); + async move { + let parent_path = inner + .path_map + .get_async(&parent.addr) + .await + .map(|e| e.get().clone()) + .ok_or_else(|| std::io::Error::from_raw_os_error(libc::ENOENT))?; + + let child_path = parent_path.join(&name); + let child_path_str = child_path.to_str().ok_or_else(|| { + std::io::Error::new( + std::io::ErrorKind::InvalidData, + "path contains non-UTF-8 characters", + ) + })?; + + let content = inner + .client + .org(&inner.org_name) + .repos() + .at(&inner.repo_name) + .content() + .get(Some(inner.ref_.as_str()), Some(child_path_str), Some(1u64)) + .await + .map_err(MesaApiError::from) + .map_err(mesa_api_error_to_io)?; + + let now = SystemTime::now(); + let (uid, gid) = inner.fs_owner; + + let (itype, size) = match &content { + Content::File(f) => (INodeType::File, f.size.to_u64().unwrap_or(0)), + Content::Symlink(s) => (INodeType::File, s.size.to_u64().unwrap_or(0)), + Content::Dir(_) => (INodeType::Directory, 0), + }; + + let perms = if itype == INodeType::Directory { + InodePerms::from_bits_truncate(0o755) + } else { + InodePerms::from_bits_truncate(0o644) + }; + + let addr = inner.next_addr.fetch_add(1, Ordering::Relaxed); + drop(inner.path_map.insert_async(addr, child_path).await); + + Ok(INode { + addr, + permissions: perms, + uid, + gid, + create_time: now, + last_modified_at: now, + parent: Some(parent.addr), + size, + itype, + }) + } + } + + fn readdir( + &self, + parent: INode, + ) -> impl Future, std::io::Error>> + Send { + let inner = Arc::clone(&self.inner); + async move { + let parent_path = inner + .path_map + .get_async(&parent.addr) 
+ .await + .map(|e| e.get().clone()) + .ok_or_else(|| std::io::Error::from_raw_os_error(libc::ENOENT))?; + + let api_path = if parent_path.as_os_str().is_empty() { + None + } else { + Some( + parent_path + .to_str() + .ok_or_else(|| { + std::io::Error::new( + std::io::ErrorKind::InvalidData, + "path contains non-UTF-8 characters", + ) + })? + .to_owned(), + ) + }; + + let content = inner + .client + .org(&inner.org_name) + .repos() + .at(&inner.repo_name) + .content() + .get(Some(inner.ref_.as_str()), api_path.as_deref(), Some(1u64)) + .await + .map_err(MesaApiError::from) + .map_err(mesa_api_error_to_io)?; + + let dir = match content { + Content::Dir(d) => d, + Content::File(_) | Content::Symlink(_) => { + return Err(std::io::Error::from_raw_os_error(libc::ENOTDIR)); + } + }; + + let now = SystemTime::now(); + let (uid, gid) = inner.fs_owner; + let mut entries = Vec::with_capacity(dir.entries.len()); + + for entry in dir.entries { + let (name, itype, size) = match entry { + MesaDirEntry::File(f) => { + let Some(name) = f.name else { continue }; + (name, INodeType::File, f.size.to_u64().unwrap_or(0)) + } + MesaDirEntry::Symlink(s) => { + let Some(name) = s.name else { continue }; + (name, INodeType::File, s.size.to_u64().unwrap_or(0)) + } + MesaDirEntry::Dir(d) => { + let Some(name) = d.name else { continue }; + (name, INodeType::Directory, 0) + } + }; + + let perms = if itype == INodeType::Directory { + InodePerms::from_bits_truncate(0o755) + } else { + InodePerms::from_bits_truncate(0o644) + }; + + let addr = inner.next_addr.fetch_add(1, Ordering::Relaxed); + let child_path = parent_path.join(&name); + drop(inner.path_map.insert_async(addr, child_path).await); + + let inode = INode { + addr, + permissions: perms, + uid, + gid, + create_time: now, + last_modified_at: now, + parent: Some(parent.addr), + size, + itype, + }; + + entries.push((OsString::from(name), inode)); + } + + Ok(entries) + } + } - fn resolve( + fn open( &self, - ino: Inode, - stub: Option, - 
cache: &AsyncICache, - ) -> impl Future> + Send - where - Self: Sized, - { + inode: INode, + _flags: AsyncOpenFlags, + ) -> impl Future> + Send { + let inner = Arc::clone(&self.inner); + async move { + let path = inner + .path_map + .get_async(&inode.addr) + .await + .map(|e| e.get().clone()) + .ok_or_else(|| std::io::Error::from_raw_os_error(libc::ENOENT))?; + + Ok(MesFileReader { + client: inner.client.clone(), + org_name: inner.org_name.clone(), + repo_name: inner.repo_name.clone(), + ref_: inner.ref_.clone(), + path, + file_cache: inner.file_cache.clone(), + inode_addr: inode.addr, + }) + } + } +} + +pub(super) struct MesFileReader { + client: MesaClient, + org_name: String, + repo_name: String, + ref_: String, + path: PathBuf, + file_cache: Option>>, + inode_addr: InodeAddr, +} + +impl FileReader for MesFileReader { + fn read( + &self, + offset: u64, + size: u32, + ) -> impl Future> + Send { let client = self.client.clone(); let org_name = self.org_name.clone(); let repo_name = self.repo_name.clone(); let ref_ = self.ref_.clone(); - let fs_owner = self.fs_owner; - let block_size = self.block_size; + let path = self.path.clone(); + let file_cache = self.file_cache.clone(); + let inode_addr = self.inode_addr; async move { - let stub = stub.ok_or(LookupError::InodeNotFound)?; - let file_path = build_repo_path(stub.parent, &stub.path, cache, RepoFs::ROOT_INO).await; - - // Non-root inodes must have a resolvable path. - if stub.parent.is_some() && file_path.is_none() { - return Err(LookupError::InodeNotFound); + // Try the file cache first. + if let Some(cache) = &file_cache + && let Some(data) = cache.get(&inode_addr).await + { + let start = usize::try_from(offset) + .unwrap_or(data.len()) + .min(data.len()); + let end = start.saturating_add(size as usize).min(data.len()); + return Ok(Bytes::copy_from_slice(&data[start..end])); } + // Cache miss -- fetch from the Mesa API. 
+ let path_str = path.to_str().ok_or_else(|| { + std::io::Error::new( + std::io::ErrorKind::InvalidData, + "path contains non-UTF-8 characters", + ) + })?; + + let api_path = if path_str.is_empty() { + None + } else { + Some(path_str) + }; + let content = client .org(&org_name) .repos() .at(&repo_name) .content() - .get(Some(ref_.as_str()), file_path.as_deref(), Some(1u64)) + .get(Some(ref_.as_str()), api_path, None) .await - .map_err(MesaApiError::from)?; - - let now = SystemTime::now(); - let attr = match &content { - Content::File(f) => { - let size = f.size.to_u64().unwrap_or(0); - FileAttr::RegularFile { - common: mescloud_icache::make_common_file_attr( - ino, 0o644, now, now, fs_owner, block_size, - ), - size, - blocks: mescloud_icache::blocks_of_size(block_size, size), - } + .map_err(MesaApiError::from) + .map_err(mesa_api_error_to_io)?; + + let encoded_content = match content { + Content::File(f) => f.content.unwrap_or_default(), + Content::Symlink(s) => s.content.unwrap_or_default(), + Content::Dir(_) => { + return Err(std::io::Error::from_raw_os_error(libc::EISDIR)); } - Content::Symlink(s) => { - let size = s.size.to_u64().unwrap_or(0); - FileAttr::RegularFile { - common: mescloud_icache::make_common_file_attr( - ino, 0o644, now, now, fs_owner, block_size, - ), - size, - blocks: mescloud_icache::blocks_of_size(block_size, size), - } - } - Content::Dir(_) => FileAttr::Directory { - common: mescloud_icache::make_common_file_attr( - ino, 0o755, now, now, fs_owner, block_size, - ), - }, }; - let children = match content { - Content::Dir(d) => Some( - d.entries - .into_iter() - .filter_map(|e| { - let (name, kind) = match e { - MesaDirEntry::File(f) => (f.name?, DirEntryType::RegularFile), - // TODO(MES-712): return DirEntryType::Symlink once readlink is wired up. 
- MesaDirEntry::Symlink(s) => (s.name?, DirEntryType::RegularFile), - MesaDirEntry::Dir(d) => (d.name?, DirEntryType::Directory), - }; - Some((name, kind)) - }) - .collect(), - ), - Content::File(_) | Content::Symlink(_) => None, - }; + let decoded = base64::engine::general_purpose::STANDARD + .decode(&encoded_content) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?; - Ok(InodeControlBlock { - parent: stub.parent, - path: stub.path, - rc: stub.rc, - attr: Some(attr), - children, - }) + let start = usize::try_from(offset) + .unwrap_or(decoded.len()) + .min(decoded.len()); + let end = start.saturating_add(size as usize).min(decoded.len()); + let result = Bytes::copy_from_slice(&decoded[start..end]); + + // Store the decoded content in the cache for future reads. + if let Some(cache) = &file_cache + && let Err(e) = cache.insert(&inode_addr, decoded).await + { + warn!(error = ?e, inode_addr, "failed to cache file content"); + } + + Ok(result) } - .instrument(tracing::info_span!("RepoResolver::resolve", ino)) } } -/// Walk the parent chain in the cache to build the repo-relative path. -/// Returns `None` for the root inode (maps to `path=None` in the mesa content API). -async fn build_repo_path( - parent: Option, - name: &std::path::Path, - cache: &AsyncICache, - root_ino: Inode, -) -> Option { - /// Maximum parent-chain depth before bailing out. Prevents infinite loops - /// if a bug creates a cycle in the parent pointers. 
- const MAX_DEPTH: usize = 1024; - - let parent = parent?; - if parent == root_ino { - return name.to_str().map(String::from); +mod repo_fs_inner { + #![allow(clippy::future_not_send, clippy::mem_forget)] + use git_fs::cache::async_backed::FutureBackedCache; + use git_fs::fs::async_fs::AsyncFs; + use git_fs::fs::{INode, InodeAddr}; + use ouroboros::self_referencing; + + use super::MesRepoProvider; + + #[self_referencing] + pub struct RepoFsInner { + pub(super) inode_table: FutureBackedCache, + #[borrows(inode_table)] + #[covariant] + pub(super) fs: AsyncFs<'this, MesRepoProvider>, } - let mut components = vec![name.to_path_buf()]; - let mut current = parent; - for _ in 0..MAX_DEPTH { - if current == root_ino { - break; + impl RepoFsInner { + pub fn create( + inode_table: FutureBackedCache, + provider: MesRepoProvider, + ) -> Self { + RepoFsInnerBuilder { + inode_table, + fs_builder: |tbl| AsyncFs::new_preseeded(provider, tbl), + } + .build() } - let (path, next_parent) = cache - .get_icb(current, |icb| (icb.path.clone(), icb.parent)) - .await?; - components.push(path); - current = next_parent?; } - if current != root_ino { - tracing::warn!("build_repo_path: exceeded MAX_DEPTH={MAX_DEPTH}, possible parent cycle"); - return None; - } - components.reverse(); - let joined: PathBuf = components.iter().collect(); - joined.to_str().map(String::from) } +use repo_fs_inner::RepoFsInner; /// A filesystem rooted at a single mesa repository. /// -/// Implements [`Fs`] for navigating files and directories within one repo. -/// Does not handle organizations or multi-repo hierarchy — that is [`super::MesaFS`]'s job. +/// Wraps [`AsyncFs`] via ouroboros to co-locate the inode table +/// and the filesystem that borrows it. Implements [`Fs`] as a thin adapter. 
pub struct RepoFs { - client: MesaClient, - org_name: String, - repo_name: String, - ref_: String, - - icache: MescloudICache, - file_table: FileTable, - readdir_buf: Vec, - open_files: HashMap, - file_cache: Option>, + inner: RepoFsInner, + /// Reference counts for inodes held by the kernel. + refcounts: rustc_hash::FxHashMap, + /// Open file handles mapped to readers. + open_files: HashMap>, + /// Provider clone for accessing `repo_name` and `path_map` cleanup. + provider: MesRepoProvider, } impl RepoFs { - pub(crate) const ROOT_INO: Inode = 1; - const BLOCK_SIZE: u32 = 4096; + pub(crate) const ROOT_INO: InodeAddr = 1; /// Create a new `RepoFs` for a specific org and repo. pub async fn new( @@ -203,24 +442,15 @@ impl RepoFs { fs_owner: (u32, u32), cache_config: CacheConfig, ) -> Self { - let resolver = RepoResolver { - client: client.clone(), - org_name: org_name.clone(), - repo_name: repo_name.clone(), - ref_: ref_.clone(), - fs_owner, - block_size: Self::BLOCK_SIZE, - }; - let file_cache = match cache_config.max_size { Some(max_size) if max_size.as_u64() > 0 => { let cache_dir = cache_config.path.join(&org_name).join(&repo_name); let max_bytes = max_size.as_u64().try_into().unwrap_or(usize::MAX); match FileCache::new(&cache_dir, max_bytes).await { - Ok(cache) => Some(cache), + Ok(cache) => Some(Arc::new(cache)), Err(e) => { warn!(error = ?e, org = %org_name, repo = %repo_name, - "failed to create file cache, continuing without caching",); + "failed to create file cache, continuing without caching"); None } } @@ -228,317 +458,140 @@ impl RepoFs { _ => None, }; + let provider = + MesRepoProvider::new(client, org_name, repo_name, ref_, fs_owner, file_cache); + provider.seed_root_path(Self::ROOT_INO); + + let root = INode { + addr: Self::ROOT_INO, + permissions: InodePerms::from_bits_truncate(0o755), + uid: fs_owner.0, + gid: fs_owner.1, + create_time: SystemTime::now(), + last_modified_at: SystemTime::now(), + parent: None, + size: 0, + itype: 
INodeType::Directory, + }; + + let inode_table = git_fs::cache::async_backed::FutureBackedCache::default(); + inode_table.insert_sync(root.addr, root); + + let inner = RepoFsInner::create(inode_table, provider.clone()); + + let mut refcounts = rustc_hash::FxHashMap::default(); + refcounts.insert(Self::ROOT_INO, 1); + Self { - client, - org_name, - repo_name, - ref_, - icache: MescloudICache::new(resolver, Self::ROOT_INO, fs_owner, Self::BLOCK_SIZE), - file_table: FileTable::new(), - readdir_buf: Vec::new(), + inner, + refcounts, open_files: HashMap::new(), - file_cache, + provider, } } /// The name of the repository this filesystem is rooted at. pub(crate) fn repo_name(&self) -> &str { - &self.repo_name - } - - /// Build the repo-relative path for an inode by walking up the parent chain. - /// - /// Returns `None` for the root inode (the repo top-level maps to `path=None` in the - /// mesa content API). - async fn path_of_inode(&self, ino: Inode) -> Option { - /// Maximum parent-chain depth before bailing out. 
- const MAX_DEPTH: usize = 1024; - - if ino == Self::ROOT_INO { - return None; - } - - let mut components = Vec::new(); - let mut current = ino; - for _ in 0..MAX_DEPTH { - if current == Self::ROOT_INO { - break; - } - let (path, parent) = self - .icache - .get_icb(current, |icb| (icb.path.clone(), icb.parent)) - .await?; - components.push(path); - current = parent?; - } - if current != Self::ROOT_INO { - tracing::warn!( - ino, - "path_of_inode: exceeded MAX_DEPTH={MAX_DEPTH}, possible parent cycle" - ); - return None; - } - components.reverse(); - let joined: PathBuf = components.iter().collect(); - joined.to_str().map(String::from) + self.provider.repo_name() } } -#[async_trait::async_trait] -impl super::common::InodeCachePeek for RepoFs { - async fn peek_attr(&self, ino: Inode) -> Option { - self.icache.get_attr(ino).await - } +#[expect( + clippy::wildcard_enum_match_arm, + reason = "mapping all ErrorKind variants is impractical; EIO is the sensible default" +)] +fn io_error_to_errno(e: &std::io::Error) -> i32 { + e.raw_os_error().unwrap_or_else(|| match e.kind() { + std::io::ErrorKind::NotFound => libc::ENOENT, + std::io::ErrorKind::PermissionDenied => libc::EACCES, + std::io::ErrorKind::AlreadyExists => libc::EEXIST, + _ => libc::EIO, + }) } #[async_trait::async_trait] -impl Fs for RepoFs { - type LookupError = LookupError; - type GetAttrError = GetAttrError; - type OpenError = OpenError; - type ReadError = ReadError; - type ReaddirError = ReadDirError; - type ReleaseError = ReleaseError; - - #[instrument(name = "RepoFs::lookup", skip(self), fields(repo = %self.repo_name))] - async fn lookup(&mut self, parent: Inode, name: &OsStr) -> Result { - debug_assert!( - self.icache.contains(parent), - "lookup: parent inode {parent} not in inode table" - ); - - let ino = self.icache.ensure_child_ino(parent, name).await; - let attr = self - .icache - .get_or_resolve(ino, |icb| icb.attr) - .await? 
- .ok_or(LookupError::InodeNotFound)?; - - let rc = self - .icache - .inc_rc(ino) +impl super::common::ChildFs for RepoFs { + async fn lookup(&mut self, parent: InodeAddr, name: &OsStr) -> Result { + let tracked = self + .inner + .borrow_fs() + .lookup(LoadedAddr(parent), name) .await - .ok_or(LookupError::InodeNotFound)?; - trace!(ino, ?name, rc, "resolved inode"); - Ok(attr) - } - - #[instrument(name = "RepoFs::getattr", skip(self), fields(repo = %self.repo_name))] - async fn getattr( - &mut self, - ino: Inode, - _fh: Option, - ) -> Result { - self.icache.get_attr(ino).await.ok_or_else(|| { - warn!(ino, "getattr on unknown inode"); - GetAttrError::InodeNotFound - }) + .map_err(|e| { + if io_error_to_errno(&e) == libc::ENOENT { + LookupError::InodeNotFound + } else { + LookupError::RemoteMesaError(MesaApiError::Io(e)) + } + })?; + *self.refcounts.entry(tracked.inode.addr).or_insert(0) += 1; + Ok(tracked.inode) } - #[instrument(name = "RepoFs::readdir", skip(self), fields(repo = %self.repo_name))] - async fn readdir(&mut self, ino: Inode) -> Result<&[DirEntry], ReadDirError> { - debug_assert!( - self.icache.contains(ino), - "readdir: inode {ino} not in inode table" - ); - debug_assert!( - matches!( - self.icache.get_attr(ino).await, - Some(FileAttr::Directory { .. }) | None - ), - "readdir: inode {ino} has non-directory cached attr" - ); - - let children = self - .icache - .get_or_resolve(ino, |icb| icb.children.clone()) - .await? - .ok_or(ReadDirError::NotADirectory)?; - - trace!( - ino, - count = children.len(), - "readdir: resolved directory listing from icache" - ); - - self.icache.evict_zero_rc_children(ino).await; - - let mut entries = Vec::with_capacity(children.len()); - for (name, kind) in &children { - let child_ino = self.icache.ensure_child_ino(ino, OsStr::new(name)).await; - // Only cache directory attrs in readdir. File attrs are left as - // None so that lookup triggers the resolver to fetch the real file - // size. 
Caching placeholder file attrs (size=0) would poison - // needs_resolve(), preventing resolution on subsequent lookups. - if *kind == DirEntryType::Directory { - let now = SystemTime::now(); - let attr = FileAttr::Directory { - common: mescloud_icache::make_common_file_attr( - child_ino, - 0o755, - now, - now, - self.icache.fs_owner(), - self.icache.block_size(), - ), - }; - self.icache.cache_attr(child_ino, attr).await; - } - entries.push(DirEntry { - ino: child_ino, - name: name.clone().into(), - kind: *kind, - }); - } - - self.readdir_buf = entries; - Ok(&self.readdir_buf) + async fn readdir(&mut self, ino: InodeAddr) -> Result, ReadDirError> { + let mut entries = Vec::new(); + self.inner + .borrow_fs() + .readdir(LoadedAddr(ino), 0, |de, _offset| { + entries.push((de.name.to_os_string(), de.inode)); + false + }) + .await + .map_err(|e| { + if io_error_to_errno(&e) == libc::ENOTDIR { + ReadDirError::NotADirectory + } else if io_error_to_errno(&e) == libc::ENOENT { + ReadDirError::InodeNotFound + } else { + ReadDirError::RemoteMesaError(MesaApiError::Io(e)) + } + })?; + Ok(entries) } - #[instrument(name = "RepoFs::open", skip(self), fields(repo = %self.repo_name))] - async fn open(&mut self, ino: Inode, _flags: OpenFlags) -> Result { - if !self.icache.contains(ino) { - warn!(ino, "open on unknown inode"); - return Err(OpenError::InodeNotFound); - } - debug_assert!( - matches!( - self.icache.get_attr(ino).await, - Some(FileAttr::RegularFile { .. 
}) | None - ), - "open: inode {ino} has non-file cached attr" - ); - let fh = self.file_table.allocate(); - self.open_files.insert(fh, ino); - trace!(ino, fh, "assigned file handle"); - Ok(OpenFile { - handle: fh, - options: FileOpenOptions::empty(), - }) + async fn open( + &mut self, + ino: InodeAddr, + flags: AsyncOpenFlags, + ) -> Result { + let open_file = self + .inner + .borrow_fs() + .open(LoadedAddr(ino), flags) + .await + .map_err(|_| OpenError::InodeNotFound)?; + self.open_files + .insert(open_file.fh, Arc::clone(&open_file.reader)); + Ok(open_file.fh) } - #[instrument(name = "RepoFs::read", skip(self), fields(repo = %self.repo_name))] async fn read( &mut self, - ino: Inode, - fh: FileHandle, + _ino: InodeAddr, + fh: git_fs::fs::FileHandle, offset: u64, size: u32, - _flags: OpenFlags, - _lock_owner: Option, ) -> Result { - let &file_ino = self.open_files.get(&fh).ok_or_else(|| { - warn!(fh, "read on unknown file handle"); - ReadError::FileNotOpen - })?; - debug_assert!( - file_ino == ino, - "read: file handle {fh} maps to inode {file_ino}, but caller passed inode {ino}" - ); - debug_assert!( - matches!( - self.icache.get_attr(ino).await, - Some(FileAttr::RegularFile { .. }) | None - ), - "read: inode {ino} has non-file cached attr" - ); - - // Try the file cache first. - if let Some(cache) = &self.file_cache - && let Some(data) = cache.get(&ino).await - { - let start = usize::try_from(offset) - .unwrap_or(data.len()) - .min(data.len()); - let end = start.saturating_add(size as usize).min(data.len()); - trace!( - ino, - fh, - cached = true, - decoded_len = data.len(), - start, - end, - "read content" - ); - return Ok(Bytes::copy_from_slice(&data[start..end])); - } - - // Cache miss — fetch from the Mesa API. 
- let file_path = self.path_of_inode(ino).await; - - if ino != Self::ROOT_INO && file_path.is_none() { - warn!(ino, "read: path_of_inode returned None for non-root inode"); - return Err(ReadError::InodeNotFound); - } - - let content = self - .client - .org(&self.org_name) - .repos() - .at(&self.repo_name) - .content() - .get(Some(self.ref_.as_str()), file_path.as_deref(), None) - .await - .map_err(MesaApiError::from)?; - - let encoded_content = match content { - Content::File(f) => f.content.unwrap_or_default(), - // TODO(MES-712): return ReadError::NotAFile once symlinks are surfaced as - // DirEntryType::Symlink, and implement readlink to return the link target. - Content::Symlink(s) => s.content.unwrap_or_default(), - Content::Dir(_) => return Err(ReadError::NotAFile), - }; - - let decoded = base64::engine::general_purpose::STANDARD.decode(&encoded_content)?; - - let start = usize::try_from(offset) - .unwrap_or(decoded.len()) - .min(decoded.len()); - let end = start.saturating_add(size as usize).min(decoded.len()); - let result = Bytes::copy_from_slice(&decoded[start..end]); - trace!(ino, fh, cached = false, path = ?file_path, decoded_len = decoded.len(), start, end, "read content"); - - // Store the decoded content in the cache for future reads. 
- if let Some(cache) = &self.file_cache - && let Err(e) = cache.insert(&ino, decoded).await - { - warn!(error = ?e, ino, "failed to cache file content"); - } - - Ok(result) + let reader = self.open_files.get(&fh).ok_or(ReadError::FileNotOpen)?; + reader.read(offset, size).await.map_err(|e| { + if io_error_to_errno(&e) == libc::EISDIR { + ReadError::NotAFile + } else if io_error_to_errno(&e) == libc::ENOENT { + ReadError::InodeNotFound + } else { + ReadError::RemoteMesaError(MesaApiError::Io(e)) + } + }) } - #[instrument(name = "RepoFs::release", skip(self), fields(repo = %self.repo_name))] async fn release( &mut self, - ino: Inode, - fh: FileHandle, - _flags: OpenFlags, - _flush: bool, + _ino: InodeAddr, + fh: git_fs::fs::FileHandle, ) -> Result<(), ReleaseError> { - let released_ino = self.open_files.remove(&fh).ok_or_else(|| { - warn!(fh, "release on unknown file handle"); - ReleaseError::FileNotOpen - })?; - debug_assert!( - released_ino == ino, - "release: file handle {fh} mapped to inode {released_ino}, but caller passed inode {ino}" - ); - trace!(ino = released_ino, fh, "closed file handle"); + self.open_files + .remove(&fh) + .ok_or(ReleaseError::FileNotOpen)?; Ok(()) } - - #[instrument(name = "RepoFs::forget", skip(self), fields(repo = %self.repo_name))] - async fn forget(&mut self, ino: Inode, nlookups: u64) { - debug_assert!( - self.icache.contains(ino), - "forget: inode {ino} not in inode table" - ); - - self.icache.forget(ino, nlookups).await; - } - - async fn statfs(&mut self) -> Result { - Ok(self.icache.statfs()) - } } diff --git a/src/fs/mod.rs b/src/fs/mod.rs index 003e1b04..a696e56f 100644 --- a/src/fs/mod.rs +++ b/src/fs/mod.rs @@ -1,4 +1 @@ -pub mod fuser; -pub mod icache; pub mod mescloud; -pub mod r#trait; diff --git a/src/fs/trait.rs b/src/fs/trait.rs deleted file mode 100644 index f4d98529..00000000 --- a/src/fs/trait.rs +++ /dev/null @@ -1,375 +0,0 @@ -//! Generic trait for implementing filesystems. -//! -//! 
Note that this is a slightly cleaner interface than directly using fuser. The whole point of -//! this is to abstract away fuser-specific details. -use async_trait::async_trait; -use std::{ - ffi::{OsStr, OsString}, - time::{Duration, SystemTime}, -}; -use tracing::error; - -use bitflags::bitflags; -use bytes::Bytes; - -/// Type representing an inode. -pub type Inode = u64; - -pub type FileHandle = u64; - -/// An opaque lock owner identifier provided by the kernel. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub struct LockOwner(pub u64); - -bitflags! { - #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] - pub struct Permissions: u16 { - // Other - const OTHER_EXECUTE = 1 << 0; - const OTHER_WRITE = 1 << 1; - const OTHER_READ = 1 << 2; - - // Group - const GROUP_EXECUTE = 1 << 3; - const GROUP_WRITE = 1 << 4; - const GROUP_READ = 1 << 5; - - // Owner - const OWNER_EXECUTE = 1 << 6; - const OWNER_WRITE = 1 << 7; - const OWNER_READ = 1 << 8; - - // Special bits - const STICKY = 1 << 9; - const SETGID = 1 << 10; - const SETUID = 1 << 11; - - const OTHER_RWX = Self::OTHER_READ.bits() - | Self::OTHER_WRITE.bits() - | Self::OTHER_EXECUTE.bits(); - const GROUP_RWX = Self::GROUP_READ.bits() - | Self::GROUP_WRITE.bits() - | Self::GROUP_EXECUTE.bits(); - const OWNER_RWX = Self::OWNER_READ.bits() - | Self::OWNER_WRITE.bits() - | Self::OWNER_EXECUTE.bits(); - } -} - -bitflags! 
{ - #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] - pub struct OpenFlags: i32 { - // Access modes (mutually exclusive) - const RDONLY = libc::O_RDONLY; - const WRONLY = libc::O_WRONLY; - const RDWR = libc::O_RDWR; - - // Creation/status flags - const APPEND = libc::O_APPEND; - const TRUNC = libc::O_TRUNC; - const CREAT = libc::O_CREAT; - const EXCL = libc::O_EXCL; - - // Behavior flags - const NONBLOCK = libc::O_NONBLOCK; - const SYNC = libc::O_SYNC; - const DSYNC = libc::O_DSYNC; - const NOFOLLOW = libc::O_NOFOLLOW; - const CLOEXEC = libc::O_CLOEXEC; - const DIRECTORY = libc::O_DIRECTORY; - - #[cfg(target_os = "linux")] - const NOATIME = libc::O_NOATIME; - } -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub struct CommonFileAttr { - pub ino: Inode, - pub atime: SystemTime, - pub mtime: SystemTime, - pub ctime: SystemTime, - pub crtime: SystemTime, - pub perm: Permissions, - pub nlink: u32, - pub uid: u32, - pub gid: u32, - pub blksize: u32, -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub enum FileAttr { - RegularFile { - common: CommonFileAttr, - size: u64, - blocks: u64, - }, - Directory { - common: CommonFileAttr, - }, - Symlink { - common: CommonFileAttr, - size: u64, - }, - CharDevice { - common: CommonFileAttr, - rdev: u64, - }, - BlockDevice { - common: CommonFileAttr, - rdev: u64, - }, - NamedPipe { - common: CommonFileAttr, - }, - Socket { - common: CommonFileAttr, - }, -} - -impl FileAttr { - pub fn common(&self) -> &CommonFileAttr { - match self { - Self::RegularFile { common, .. } - | Self::Directory { common } - | Self::Symlink { common, .. } - | Self::CharDevice { common, .. } - | Self::BlockDevice { common, .. } - | Self::NamedPipe { common } - | Self::Socket { common } => common, - } - } -} - -bitflags! 
{ - #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] - pub (crate) struct FileOpenOptions: u32 { - const DIRECT_IO = 1 << 0; - const KEEP_CACHE = 1 << 1; - const NONSEEKABLE = 1 << 2; - const STREAM = 1 << 4; - } -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub struct OpenFile { - pub handle: FileHandle, - pub options: FileOpenOptions, -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub enum DirEntryType { - RegularFile, - Directory, - Symlink, - CharDevice, - BlockDevice, - NamedPipe, - Socket, -} - -impl TryFrom for FileAttr { - type Error = (); - - #[expect( - clippy::cast_possible_truncation, - reason = "metadata mode/nlink/blksize narrowing is intentional" - )] - #[expect( - clippy::cast_sign_loss, - reason = "nsecs from MetadataExt is always in [0, 999_999_999]" - )] - fn try_from(meta: std::fs::Metadata) -> Result { - use std::os::unix::fs::FileTypeExt as _; - use std::os::unix::fs::MetadataExt as _; - - fn to_systime(secs: i64, nsecs: i64) -> SystemTime { - if secs >= 0 { - std::time::UNIX_EPOCH + Duration::new(secs.cast_unsigned(), nsecs as u32) - } else { - // nsecs is always in [0, 999_999_999] from MetadataExt. - // For negative secs, subtract whole seconds then add back nsecs. 
- std::time::UNIX_EPOCH - Duration::from_secs((-secs).cast_unsigned()) - + Duration::from_nanos(nsecs.cast_unsigned()) - } - } - - let common_attr = CommonFileAttr { - ino: meta.ino(), - atime: to_systime(meta.atime(), meta.atime_nsec()), - mtime: to_systime(meta.mtime(), meta.mtime_nsec()), - ctime: to_systime(meta.ctime(), meta.ctime_nsec()), - crtime: to_systime(0, 0), // Not available in std::fs::Metadata - perm: Permissions::from_bits_truncate(meta.mode() as u16), - nlink: meta.nlink() as u32, - uid: meta.uid(), - gid: meta.gid(), - blksize: meta.blksize() as u32, - }; - - let ft = meta.file_type(); - if ft.is_file() { - Ok(Self::RegularFile { - common: common_attr, - size: meta.len(), - blocks: meta.blocks(), - }) - } else if ft.is_dir() { - Ok(Self::Directory { - common: common_attr, - }) - } else if ft.is_symlink() { - Ok(Self::Symlink { - common: common_attr, - size: meta.len(), - }) - } else if ft.is_char_device() { - Ok(Self::CharDevice { - common: common_attr, - rdev: meta.rdev(), - }) - } else if ft.is_block_device() { - Ok(Self::BlockDevice { - common: common_attr, - rdev: meta.rdev(), - }) - } else if ft.is_fifo() { - Ok(Self::NamedPipe { - common: common_attr, - }) - } else if ft.is_socket() { - Ok(Self::Socket { - common: common_attr, - }) - } else { - debug_assert!( - false, - "Unknown file type encountered in FileAttr conversion" - ); - Err(()) - } - } -} - -impl From for DirEntryType { - fn from(attr: FileAttr) -> Self { - match attr { - FileAttr::RegularFile { .. } => Self::RegularFile, - FileAttr::Directory { .. } => Self::Directory, - FileAttr::Symlink { .. } => Self::Symlink, - FileAttr::CharDevice { .. } => Self::CharDevice, - FileAttr::BlockDevice { .. } => Self::BlockDevice, - FileAttr::NamedPipe { .. } => Self::NamedPipe, - FileAttr::Socket { .. 
} => Self::Socket, - } - } -} - -impl TryFrom for DirEntryType { - type Error = (); - - fn try_from(ft: std::fs::FileType) -> Result { - use std::os::unix::fs::FileTypeExt as _; - - if ft.is_file() { - Ok(Self::RegularFile) - } else if ft.is_dir() { - Ok(Self::Directory) - } else if ft.is_symlink() { - Ok(Self::Symlink) - } else if ft.is_char_device() { - Ok(Self::CharDevice) - } else if ft.is_block_device() { - Ok(Self::BlockDevice) - } else if ft.is_fifo() { - Ok(Self::NamedPipe) - } else if ft.is_socket() { - Ok(Self::Socket) - } else { - debug_assert!( - false, - "Unknown file type encountered in DirEntryType conversion" - ); - error!(ft = ?ft, "Unknown file type encountered in DirEntryType conversion"); - Err(()) - } - } -} - -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub struct DirEntry { - pub ino: Inode, - // TODO(markovejnovic): This OsString is hella expensive - pub name: OsString, - pub kind: DirEntryType, -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub struct FilesystemStats { - pub block_size: u32, - pub fragment_size: u64, - pub total_blocks: u64, - pub free_blocks: u64, - pub available_blocks: u64, - pub total_inodes: u64, - pub free_inodes: u64, - pub available_inodes: u64, - pub filesystem_id: u64, - pub mount_flags: u32, - pub max_filename_length: u32, -} - -#[async_trait] -pub trait Fs { - type LookupError: std::error::Error; - type GetAttrError: std::error::Error; - type OpenError: std::error::Error; - type ReadError: std::error::Error; - type ReaddirError: std::error::Error; - type ReleaseError: std::error::Error; - - /// For each lookup call made by the kernel, it expects the icache to be updated with the - /// returned `FileAttr`. - async fn lookup(&mut self, parent: Inode, name: &OsStr) -> Result; - - /// Can be called in two contexts -- the file is not open (in which case `fh` is `None`), - /// or the file is open (in which case `fh` is `Some`). 
- async fn getattr( - &mut self, - ino: Inode, - fh: Option, - ) -> Result; - - /// Read the contents of a directory. - async fn readdir(&mut self, ino: Inode) -> Result<&[DirEntry], Self::ReaddirError>; - - /// Open a file for reading. - async fn open(&mut self, ino: Inode, flags: OpenFlags) -> Result; - - /// Read data from an open file. - #[expect(clippy::too_many_arguments, reason = "mirrors fuser read API")] - async fn read( - &mut self, - ino: Inode, - fh: FileHandle, - offset: u64, - size: u32, - flags: OpenFlags, - lock_owner: Option, - ) -> Result; - - /// Called when the kernel closes a file handle. - async fn release( - &mut self, - ino: Inode, - fh: FileHandle, - flags: OpenFlags, - flush: bool, - ) -> Result<(), Self::ReleaseError>; - - /// Called when the kernel is done with an inode. - async fn forget(&mut self, ino: Inode, nlookups: u64); - - /// Get filesystem statistics. - async fn statfs(&mut self) -> Result; -} diff --git a/tests/async_fs_correctness.rs b/tests/async_fs_correctness.rs new file mode 100644 index 00000000..5fe27a28 --- /dev/null +++ b/tests/async_fs_correctness.rs @@ -0,0 +1,609 @@ +#![allow(clippy::unwrap_used, clippy::expect_used, missing_docs)] + +mod common; + +use std::ffi::{OsStr, OsString}; + +use git_fs::cache::async_backed::FutureBackedCache; +use git_fs::fs::async_fs::{AsyncFs, InodeLifecycle}; +use git_fs::fs::{INode, INodeType, LoadedAddr, OpenFlags}; + +use common::async_fs_mocks::{MockFsDataProvider, MockFsState, make_inode}; + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn lifecycle_inc_returns_count_after_increment() { + let table = FutureBackedCache::default(); + let inode = make_inode(100, INodeType::File, 0, Some(1)); + table.insert_sync(100, inode); + + let mut lifecycle = InodeLifecycle::from_table(table); + + assert_eq!(lifecycle.inc(100), 1, "first inc should return 1"); + assert_eq!(lifecycle.inc(100), 2, "second inc should return 2"); + assert_eq!(lifecycle.inc(100), 3, "third inc 
should return 3"); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn lifecycle_dec_returns_remaining_count() { + let table = FutureBackedCache::default(); + let inode = make_inode(100, INodeType::File, 0, Some(1)); + table.insert_sync(100, inode); + + let mut lifecycle = InodeLifecycle::from_table(table); + lifecycle.inc(100); + lifecycle.inc(100); + + assert_eq!(lifecycle.dec(&100), Some(1), "dec from 2 should give 1"); + assert_eq!(lifecycle.dec(&100), Some(0), "dec from 1 should give 0"); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn lifecycle_dec_unknown_addr_returns_none() { + let table: FutureBackedCache = FutureBackedCache::default(); + let mut lifecycle = InodeLifecycle::from_table(table); + + assert_eq!( + lifecycle.dec(&999), + None, + "dec on unknown key should return None" + ); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn lifecycle_dec_to_zero_evicts_from_table() { + let table = FutureBackedCache::default(); + let inode = make_inode(100, INodeType::File, 0, Some(1)); + table.insert_sync(100, inode); + + let mut lifecycle = InodeLifecycle::from_table(table); + lifecycle.inc(100); + + assert_eq!(lifecycle.dec(&100), Some(0)); + // The inode should have been evicted from the table. 
+ assert!( + lifecycle.table().get(&100).await.is_none(), + "inode should be evicted after refcount hits zero" + ); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn lifecycle_dec_count_decrements_by_n() { + let table: FutureBackedCache = FutureBackedCache::default(); + let inode = make_inode(100, INodeType::File, 0, Some(1)); + table.insert_sync(100, inode); + + let mut lifecycle = InodeLifecycle::from_table(table); + lifecycle.inc(100); + lifecycle.inc(100); + lifecycle.inc(100); // count = 3 + + assert_eq!( + lifecycle.dec_count(&100, 2), + Some(1), + "dec_count(3, 2) should give 1" + ); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn lifecycle_dec_count_to_zero_evicts() { + let table = FutureBackedCache::default(); + let inode = make_inode(100, INodeType::File, 0, Some(1)); + table.insert_sync(100, inode); + + let mut lifecycle = InodeLifecycle::from_table(table); + lifecycle.inc(100); + lifecycle.inc(100); // count = 2 + + assert_eq!(lifecycle.dec_count(&100, 2), Some(0)); + assert!( + lifecycle.table().get(&100).await.is_none(), + "inode should be evicted after dec_count to zero" + ); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn lifecycle_table_returns_underlying_cache() { + let table = FutureBackedCache::default(); + let inode = make_inode(42, INodeType::Directory, 0, None); + table.insert_sync(42, inode); + + let lifecycle = InodeLifecycle::from_table(table); + + let fetched = lifecycle.table().get(&42).await; + assert_eq!( + fetched.map(|n| n.addr), + Some(42), + "table() should expose the underlying cache" + ); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn new_seeds_root_inode_into_table() { + let table = FutureBackedCache::default(); + let root = make_inode(1, INodeType::Directory, 0, None); + let dp = MockFsDataProvider::new(MockFsState::default()); + + let fs = AsyncFs::new(dp, root, &table).await; + + assert_eq!(fs.inode_count(), 1, "root 
should be the only inode"); + let fetched = table.get(&1).await; + assert_eq!( + fetched.map(|n| n.addr), + Some(1), + "root inode should be in the table" + ); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn new_preseeded_does_not_insert_root() { + let table: FutureBackedCache = FutureBackedCache::default(); + let dp = MockFsDataProvider::new(MockFsState::default()); + + let fs = AsyncFs::new_preseeded(dp, &table); + + assert_eq!( + fs.inode_count(), + 0, + "preseeded constructor should not insert anything" + ); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn statfs_reports_inode_count() { + let table = FutureBackedCache::default(); + let root = make_inode(1, INodeType::Directory, 0, None); + let dp = MockFsDataProvider::new(MockFsState::default()); + + let fs = AsyncFs::new(dp, root, &table).await; + let stats = fs.statfs(); + + assert_eq!(stats.block_size, 4096); + assert_eq!(stats.total_inodes, 1, "should reflect the root inode"); + assert_eq!(stats.free_blocks, 0); + assert_eq!(stats.max_filename_length, 255); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn loaded_inode_returns_seeded_inode() { + let table = FutureBackedCache::default(); + let root = make_inode(1, INodeType::Directory, 0, None); + let dp = MockFsDataProvider::new(MockFsState::default()); + + let fs = AsyncFs::new(dp, root, &table).await; + + let inode = fs.loaded_inode(LoadedAddr(1)).await.unwrap(); + assert_eq!(inode.addr, 1); + assert_eq!(inode.itype, INodeType::Directory); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn loaded_inode_returns_enoent_for_missing_addr() { + let table = FutureBackedCache::default(); + let root = make_inode(1, INodeType::Directory, 0, None); + let dp = MockFsDataProvider::new(MockFsState::default()); + + let fs = AsyncFs::new(dp, root, &table).await; + + let err = fs.loaded_inode(LoadedAddr(999)).await.unwrap_err(); + assert_eq!(err.raw_os_error(), 
Some(libc::ENOENT)); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn getattr_delegates_to_loaded_inode() { + let table = FutureBackedCache::default(); + let root = make_inode(1, INodeType::Directory, 4096, None); + let dp = MockFsDataProvider::new(MockFsState::default()); + + let fs = AsyncFs::new(dp, root, &table).await; + + let inode = fs.getattr(LoadedAddr(1)).await.unwrap(); + assert_eq!(inode.addr, 1); + assert_eq!(inode.size, 4096); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn lookup_resolves_child_via_data_provider() { + let root = make_inode(1, INodeType::Directory, 0, None); + let child = make_inode(10, INodeType::File, 42, Some(1)); + + let mut state = MockFsState::default(); + state.lookups.insert((1, "readme.md".into()), child); + let dp = MockFsDataProvider::new(state); + + let table = FutureBackedCache::default(); + let fs = AsyncFs::new(dp, root, &table).await; + + let tracked = fs + .lookup(LoadedAddr(1), OsStr::new("readme.md")) + .await + .unwrap(); + + assert_eq!(tracked.inode.addr, 10); + assert_eq!(tracked.inode.size, 42); + assert_eq!(tracked.inode.itype, INodeType::File); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn lookup_populates_inode_table() { + let root = make_inode(1, INodeType::Directory, 0, None); + let child = make_inode(10, INodeType::File, 100, Some(1)); + + let mut state = MockFsState::default(); + state.lookups.insert((1, "file.txt".into()), child); + let dp = MockFsDataProvider::new(state); + + let table = FutureBackedCache::default(); + let fs = AsyncFs::new(dp, root, &table).await; + + fs.lookup(LoadedAddr(1), OsStr::new("file.txt")) + .await + .unwrap(); + + // The child should now be in the inode table. 
+ let cached = table.get(&10).await; + assert_eq!( + cached.map(|n| n.addr), + Some(10), + "child inode should be cached in the table after lookup" + ); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn lookup_second_call_uses_cache() { + let root = make_inode(1, INodeType::Directory, 0, None); + let child = make_inode(10, INodeType::File, 100, Some(1)); + + let mut state = MockFsState::default(); + state.lookups.insert((1, "cached.txt".into()), child); + let dp = MockFsDataProvider::new(state); + + let table = FutureBackedCache::default(); + let fs = AsyncFs::new(dp, root, &table).await; + + let first = fs + .lookup(LoadedAddr(1), OsStr::new("cached.txt")) + .await + .unwrap(); + let second = fs + .lookup(LoadedAddr(1), OsStr::new("cached.txt")) + .await + .unwrap(); + + assert_eq!(first.inode.addr, second.inode.addr); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn lookup_propagates_provider_error() { + let root = make_inode(1, INodeType::Directory, 0, None); + // No lookups configured — provider will return ENOENT. 
+ let dp = MockFsDataProvider::new(MockFsState::default()); + + let table = FutureBackedCache::default(); + let fs = AsyncFs::new(dp, root, &table).await; + + let err = fs + .lookup(LoadedAddr(1), OsStr::new("nonexistent")) + .await + .unwrap_err(); + assert_eq!(err.raw_os_error(), Some(libc::ENOENT)); +} + +// open and OpenFile::read tests + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn open_returns_file_handle_and_reader() { + let root = make_inode(1, INodeType::Directory, 0, None); + let file = make_inode(10, INodeType::File, 5, Some(1)); + + let mut state = MockFsState::default(); + state + .file_contents + .insert(10, bytes::Bytes::from_static(b"hello")); + let dp = MockFsDataProvider::new(state); + + let table = FutureBackedCache::default(); + table.insert_sync(10, file); + let fs = AsyncFs::new(dp, root, &table).await; + + let open_file = fs.open(LoadedAddr(10), OpenFlags::RDONLY).await.unwrap(); + + assert!(open_file.fh >= 1, "file handle should start at 1"); + let data = open_file.read(0, 5).await.unwrap(); + assert_eq!(&data[..], b"hello"); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn open_returns_eisdir_for_directory() { + let root = make_inode(1, INodeType::Directory, 0, None); + let dp = MockFsDataProvider::new(MockFsState::default()); + + let table = FutureBackedCache::default(); + let fs = AsyncFs::new(dp, root, &table).await; + + let err = fs.open(LoadedAddr(1), OpenFlags::RDONLY).await.unwrap_err(); + assert_eq!(err.raw_os_error(), Some(libc::EISDIR)); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn open_returns_enoent_for_missing_inode() { + let root = make_inode(1, INodeType::Directory, 0, None); + let dp = MockFsDataProvider::new(MockFsState::default()); + + let table = FutureBackedCache::default(); + let fs = AsyncFs::new(dp, root, &table).await; + + let err = fs + .open(LoadedAddr(999), OpenFlags::RDONLY) + .await + .unwrap_err(); + 
assert_eq!(err.raw_os_error(), Some(libc::ENOENT)); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn open_assigns_unique_file_handles() { + let root = make_inode(1, INodeType::Directory, 0, None); + let file = make_inode(10, INodeType::File, 0, Some(1)); + + let dp = MockFsDataProvider::new(MockFsState::default()); + + let table = FutureBackedCache::default(); + table.insert_sync(10, file); + let fs = AsyncFs::new(dp, root, &table).await; + + let fh1 = fs.open(LoadedAddr(10), OpenFlags::RDONLY).await.unwrap().fh; + let fh2 = fs.open(LoadedAddr(10), OpenFlags::RDONLY).await.unwrap().fh; + + assert_ne!(fh1, fh2, "each open should produce a unique file handle"); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn open_file_read_with_offset() { + let root = make_inode(1, INodeType::Directory, 0, None); + let file = make_inode(10, INodeType::File, 11, Some(1)); + + let mut state = MockFsState::default(); + state + .file_contents + .insert(10, bytes::Bytes::from_static(b"hello world")); + let dp = MockFsDataProvider::new(state); + + let table = FutureBackedCache::default(); + table.insert_sync(10, file); + let fs = AsyncFs::new(dp, root, &table).await; + + let open_file = fs.open(LoadedAddr(10), OpenFlags::RDONLY).await.unwrap(); + + let data = open_file.read(6, 5).await.unwrap(); + assert_eq!(&data[..], b"world"); +} + +// readdir tests + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn readdir_lists_children_sorted_by_name() { + let root = make_inode(1, INodeType::Directory, 0, None); + let child_b = make_inode(10, INodeType::File, 10, Some(1)); + let child_a = make_inode(11, INodeType::File, 20, Some(1)); + + let mut state = MockFsState::default(); + state.directories.insert( + 1, + vec![ + (OsString::from("b.txt"), child_b), + (OsString::from("a.txt"), child_a), + ], + ); + let dp = MockFsDataProvider::new(state); + + let table = FutureBackedCache::default(); + let fs = AsyncFs::new(dp, root, 
&table).await; + + let mut entries: Vec<(OsString, u64)> = Vec::new(); + fs.readdir(LoadedAddr(1), 0, |entry, _offset| { + entries.push((entry.name.to_os_string(), entry.inode.addr)); + false // don't stop + }) + .await + .unwrap(); + + assert_eq!(entries.len(), 2); + assert_eq!(entries[0].0, "a.txt", "entries should be sorted by name"); + assert_eq!(entries[0].1, 11); + assert_eq!(entries[1].0, "b.txt"); + assert_eq!(entries[1].1, 10); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn readdir_respects_offset() { + let root = make_inode(1, INodeType::Directory, 0, None); + let child_a = make_inode(10, INodeType::File, 10, Some(1)); + let child_b = make_inode(11, INodeType::File, 20, Some(1)); + let child_c = make_inode(12, INodeType::File, 30, Some(1)); + + let mut state = MockFsState::default(); + state.directories.insert( + 1, + vec![ + (OsString::from("a"), child_a), + (OsString::from("b"), child_b), + (OsString::from("c"), child_c), + ], + ); + let dp = MockFsDataProvider::new(state); + + let table = FutureBackedCache::default(); + let fs = AsyncFs::new(dp, root, &table).await; + + // First readdir to populate cache + fs.readdir(LoadedAddr(1), 0, |_, _| false).await.unwrap(); + + // Second readdir starting at offset 2 (skip first two) + let mut entries: Vec = Vec::new(); + fs.readdir(LoadedAddr(1), 2, |entry, _| { + entries.push(entry.name.to_os_string()); + false + }) + .await + .unwrap(); + + assert_eq!(entries, vec![OsString::from("c")]); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn readdir_stops_when_filler_returns_true() { + let root = make_inode(1, INodeType::Directory, 0, None); + let child_a = make_inode(10, INodeType::File, 10, Some(1)); + let child_b = make_inode(11, INodeType::File, 20, Some(1)); + let child_c = make_inode(12, INodeType::File, 30, Some(1)); + + let mut state = MockFsState::default(); + state.directories.insert( + 1, + vec![ + (OsString::from("a"), child_a), + 
(OsString::from("b"), child_b), + (OsString::from("c"), child_c), + ], + ); + let dp = MockFsDataProvider::new(state); + + let table = FutureBackedCache::default(); + let fs = AsyncFs::new(dp, root, &table).await; + + let mut count = 0; + fs.readdir(LoadedAddr(1), 0, |_, _| { + count += 1; + count >= 2 // stop after 2 entries + }) + .await + .unwrap(); + + assert_eq!(count, 2, "filler should have been called exactly twice"); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn readdir_returns_enotdir_for_file() { + let root = make_inode(1, INodeType::Directory, 0, None); + let file = make_inode(10, INodeType::File, 100, Some(1)); + + let dp = MockFsDataProvider::new(MockFsState::default()); + + let table = FutureBackedCache::default(); + table.insert_sync(10, file); + let fs = AsyncFs::new(dp, root, &table).await; + + let err = fs + .readdir(LoadedAddr(10), 0, |_, _| false) + .await + .unwrap_err(); + assert_eq!(err.raw_os_error(), Some(libc::ENOTDIR)); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn readdir_populates_inode_table_with_children() { + let root = make_inode(1, INodeType::Directory, 0, None); + let child = make_inode(10, INodeType::File, 42, Some(1)); + + let mut state = MockFsState::default(); + state + .directories + .insert(1, vec![(OsString::from("child.txt"), child)]); + let dp = MockFsDataProvider::new(state); + + let table = FutureBackedCache::default(); + let fs = AsyncFs::new(dp, root, &table).await; + + fs.readdir(LoadedAddr(1), 0, |_, _| false).await.unwrap(); + + let cached = table.get(&10).await; + assert_eq!( + cached.map(|n| n.addr), + Some(10), + "readdir should populate children into the inode table" + ); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn readdir_empty_directory() { + let root = make_inode(1, INodeType::Directory, 0, None); + + let mut state = MockFsState::default(); + state.directories.insert(1, vec![]); + let dp = 
MockFsDataProvider::new(state); + + let table = FutureBackedCache::default(); + let fs = AsyncFs::new(dp, root, &table).await; + + let mut count = 0; + fs.readdir(LoadedAddr(1), 0, |_, _| { + count += 1; + false + }) + .await + .unwrap(); + + assert_eq!(count, 0, "empty directory should yield no entries"); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn readdir_provides_correct_next_offsets() { + let root = make_inode(1, INodeType::Directory, 0, None); + let child_a = make_inode(10, INodeType::File, 0, Some(1)); + let child_b = make_inode(11, INodeType::File, 0, Some(1)); + + let mut state = MockFsState::default(); + state.directories.insert( + 1, + vec![ + (OsString::from("a"), child_a), + (OsString::from("b"), child_b), + ], + ); + let dp = MockFsDataProvider::new(state); + + let table = FutureBackedCache::default(); + let fs = AsyncFs::new(dp, root, &table).await; + + let mut offsets: Vec = Vec::new(); + fs.readdir(LoadedAddr(1), 0, |_, next_offset| { + offsets.push(next_offset); + false + }) + .await + .unwrap(); + + assert_eq!( + offsets, + vec![1, 2], + "offsets should be 1-indexed and sequential" + ); +} + +// lookup-after-readdir integration test + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn lookup_after_readdir_uses_directory_cache() { + let root = make_inode(1, INodeType::Directory, 0, None); + let child = make_inode(10, INodeType::File, 42, Some(1)); + + let mut state = MockFsState::default(); + // Only configure readdir — no lookup entry. If the directory cache + // fast path is broken, the lookup will fail with ENOENT. + state + .directories + .insert(1, vec![(OsString::from("file.txt"), child)]); + let dp = MockFsDataProvider::new(state); + + let table = FutureBackedCache::default(); + let fs = AsyncFs::new(dp, root, &table).await; + + // readdir populates the directory cache. + fs.readdir(LoadedAddr(1), 0, |_, _| false).await.unwrap(); + + // lookup should hit the directory cache fast path. 
+ let tracked = fs + .lookup(LoadedAddr(1), OsStr::new("file.txt")) + .await + .unwrap(); + assert_eq!(tracked.inode.addr, 10); +} diff --git a/tests/common/async_fs_mocks.rs b/tests/common/async_fs_mocks.rs new file mode 100644 index 00000000..5c132eec --- /dev/null +++ b/tests/common/async_fs_mocks.rs @@ -0,0 +1,104 @@ +#![allow(missing_docs, clippy::unwrap_used)] + +use std::collections::HashMap; +use std::ffi::{OsStr, OsString}; +use std::sync::Arc; +use std::time::SystemTime; + +use bytes::Bytes; + +use git_fs::fs::async_fs::{FileReader, FsDataProvider}; +use git_fs::fs::{INode, INodeType, InodePerms, OpenFlags}; + +/// Builds an `INode` with sensible defaults. Only `addr` and `itype` are required. +pub fn make_inode(addr: u64, itype: INodeType, size: u64, parent: Option) -> INode { + INode { + addr, + permissions: InodePerms::OWNER_RWX | InodePerms::GROUP_READ | InodePerms::OTHER_READ, + uid: 1000, + gid: 1000, + create_time: SystemTime::UNIX_EPOCH, + last_modified_at: SystemTime::UNIX_EPOCH, + parent, + size, + itype, + } +} + +/// A mock `FileReader` that returns a fixed byte slice for any read. +#[derive(Debug, Clone)] +pub struct MockFileReader { + pub data: Bytes, +} + +impl FileReader for MockFileReader { + #[expect( + clippy::cast_possible_truncation, + reason = "test mock — offsets stay small" + )] + async fn read(&self, offset: u64, size: u32) -> Result { + let start = (offset as usize).min(self.data.len()); + let end = (start + size as usize).min(self.data.len()); + Ok(self.data.slice(start..end)) + } +} + +/// Shared state backing `MockFsDataProvider`. +#[derive(Debug, Default)] +pub struct MockFsState { + /// `(parent_addr, child_name) -> child_inode` + pub lookups: HashMap<(u64, OsString), INode>, + /// `parent_addr -> vec of (child_name, child_inode)` + pub directories: HashMap>, + /// `inode_addr -> file content bytes` + pub file_contents: HashMap, +} + +/// A clonable mock data provider for `AsyncFs` tests. 
+#[derive(Debug, Clone)] +pub struct MockFsDataProvider { + pub state: Arc, +} + +impl MockFsDataProvider { + pub fn new(state: MockFsState) -> Self { + Self { + state: Arc::new(state), + } + } +} + +impl FsDataProvider for MockFsDataProvider { + type Reader = MockFileReader; + + async fn lookup(&self, parent: INode, name: &OsStr) -> Result { + let key = (parent.addr, name.to_os_string()); + self.state + .lookups + .get(&key) + .copied() + .ok_or_else(|| std::io::Error::from_raw_os_error(libc::ENOENT)) + } + + async fn readdir(&self, parent: INode) -> Result, std::io::Error> { + self.state + .directories + .get(&parent.addr) + .cloned() + .ok_or_else(|| std::io::Error::from_raw_os_error(libc::ENOENT)) + } + + async fn open( + &self, + inode: INode, + _flags: OpenFlags, + ) -> Result { + let data = self + .state + .file_contents + .get(&inode.addr) + .cloned() + .unwrap_or_default(); + Ok(MockFileReader { data }) + } +} diff --git a/tests/common/mod.rs b/tests/common/mod.rs index 101f9295..2729c866 100644 --- a/tests/common/mod.rs +++ b/tests/common/mod.rs @@ -1,4 +1,6 @@ -#![allow(missing_docs, clippy::unwrap_used)] +#![allow(dead_code, missing_docs, clippy::unwrap_used)] + +pub mod async_fs_mocks; use std::sync::{Arc, Mutex}; use std::time::Duration; From c80c09c88794589c18d1d062a0c0949a4abc64d9 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Fri, 20 Feb 2026 12:20:18 -0800 Subject: [PATCH 02/58] refactor: remove redundant FUSE error types, use io_to_errno helper --- lib/fs/fuser.rs | 135 +++++++++++++----------------------------------- 1 file changed, 36 insertions(+), 99 deletions(-) diff --git a/lib/fs/fuser.rs b/lib/fs/fuser.rs index 50042a24..61814119 100644 --- a/lib/fs/fuser.rs +++ b/lib/fs/fuser.rs @@ -9,60 +9,18 @@ use super::{FileHandle, INode, INodeType, InodeAddr, LoadedAddr, OpenFlags}; use crate::cache::async_backed::FutureBackedCache; use tracing::{debug, error, instrument}; -/// Wrapper converting [`std::io::Error`] to errno. 
-#[derive(Debug, thiserror::Error)] -#[error("{0}")] -struct FuseIoError(std::io::Error); - +/// Convert an I/O error to the corresponding errno value for FUSE replies. #[expect( clippy::wildcard_enum_match_arm, reason = "ErrorKind is non_exhaustive; EIO is the safe default" )] -impl From for i32 { - fn from(e: FuseIoError) -> Self { - e.0.raw_os_error().unwrap_or_else(|| match e.0.kind() { - std::io::ErrorKind::NotFound => libc::ENOENT, - std::io::ErrorKind::PermissionDenied => libc::EACCES, - std::io::ErrorKind::AlreadyExists => libc::EEXIST, - _ => libc::EIO, - }) - } -} - -/// Error for read operations. -#[derive(Debug, thiserror::Error)] -enum FuseReadError { - /// The file handle was not open. - #[error("file handle not open")] - NotOpen, - /// An I/O error occurred during the read. - #[error("I/O error: {0}")] - Io(#[from] std::io::Error), -} - -impl From for i32 { - fn from(e: FuseReadError) -> Self { - match e { - FuseReadError::NotOpen => libc::EBADF, - FuseReadError::Io(ref io) => io.raw_os_error().unwrap_or(libc::EIO), - } - } -} - -/// Error for release operations. -#[derive(Debug, thiserror::Error)] -enum FuseReleaseError { - /// The file handle was not open. 
- #[error("file handle not open")] - NotOpen, -} - -impl From for i32 { - fn from(e: FuseReleaseError) -> Self { - match e { - FuseReleaseError::NotOpen => libc::EBADF, - } - } +fn io_to_errno(e: &std::io::Error) -> i32 { + e.raw_os_error().unwrap_or_else(|| match e.kind() { + std::io::ErrorKind::NotFound => libc::ENOENT, + std::io::ErrorKind::PermissionDenied => libc::EACCES, + std::io::ErrorKind::AlreadyExists => libc::EEXIST, + _ => libc::EIO, + }) } mod inner { @@ -200,14 +158,9 @@ impl fuser::Filesystem for FuserAdapter { reply: fuser::ReplyEntry, ) { let result = self.runtime.block_on(async { - let tracked = self - .inner - .get_fs() - .lookup(LoadedAddr(parent), name) - .await - .map_err(FuseIoError)?; + let tracked = self.inner.get_fs().lookup(LoadedAddr(parent), name).await?; self.inner.ward_inc(tracked.inode.addr); - Ok::<_, FuseIoError>(tracked.inode) + Ok::<_, std::io::Error>(tracked.inode) }); match result { Ok(inode) => { @@ -217,7 +170,7 @@ impl fuser::Filesystem for FuserAdapter { } Err(e) => { debug!(error = %e, "replying error"); - reply.error(e.into()); + reply.error(io_to_errno(&e)); } } } @@ -230,13 +183,9 @@ impl fuser::Filesystem for FuserAdapter { _fh: Option, reply: fuser::ReplyAttr, ) { - let result = self.runtime.block_on(async { - self.inner - .get_fs() - .getattr(LoadedAddr(ino)) - .await - .map_err(FuseIoError) - }); + let result = self + .runtime + .block_on(async { self.inner.get_fs().getattr(LoadedAddr(ino)).await }); match result { Ok(inode) => { let attr = inode_to_fuser_attr(&inode, BLOCK_SIZE); @@ -245,7 +194,7 @@ impl fuser::Filesystem for FuserAdapter { } Err(e) => { debug!(error = %e, "replying error"); - reply.error(e.into()); + reply.error(io_to_errno(&e)); } } } @@ -268,16 +217,15 @@ impl fuser::Filesystem for FuserAdapter { entries.push((de.inode.addr, de.name.to_os_string(), de.inode.itype)); false }) - .await - .map_err(FuseIoError)?; - Ok::<_, FuseIoError>(entries) + .await?; + Ok::<_, std::io::Error>(entries) }); let 
entries = match result { Ok(entries) => entries, Err(e) => { debug!(error = %e, "replying error"); - reply.error(e.into()); + reply.error(io_to_errno(&e)); return; } }; @@ -310,15 +258,10 @@ impl fuser::Filesystem for FuserAdapter { fn open(&mut self, _req: &fuser::Request<'_>, ino: u64, flags: i32, reply: fuser::ReplyOpen) { let flags = OpenFlags::from_bits_truncate(flags); let result = self.runtime.block_on(async { - let open_file = self - .inner - .get_fs() - .open(LoadedAddr(ino), flags) - .await - .map_err(FuseIoError)?; + let open_file = self.inner.get_fs().open(LoadedAddr(ino), flags).await?; let fh = open_file.fh; self.open_files.insert(fh, Arc::clone(&open_file.reader)); - Ok::<_, FuseIoError>(fh) + Ok::<_, std::io::Error>(fh) }); match result { Ok(fh) => { @@ -327,7 +270,7 @@ impl fuser::Filesystem for FuserAdapter { } Err(e) => { debug!(error = %e, "replying error"); - reply.error(e.into()); + reply.error(io_to_errno(&e)); } } } @@ -347,9 +290,12 @@ impl fuser::Filesystem for FuserAdapter { _lock_owner: Option, reply: fuser::ReplyData, ) { - let result: Result<_, FuseReadError> = self.runtime.block_on(async { - let reader = self.open_files.get(&fh).ok_or(FuseReadError::NotOpen)?; - Ok(reader.read(offset.cast_unsigned(), size).await?) 
+ let result = self.runtime.block_on(async { + let reader = self + .open_files + .get(&fh) + .ok_or_else(|| std::io::Error::from_raw_os_error(libc::EBADF))?; + reader.read(offset.cast_unsigned(), size).await }); match result { Ok(data) => { @@ -358,7 +304,7 @@ impl fuser::Filesystem for FuserAdapter { } Err(e) => { debug!(error = %e, "replying error"); - reply.error(e.into()); + reply.error(io_to_errno(&e)); } } } @@ -377,24 +323,15 @@ impl fuser::Filesystem for FuserAdapter { _flush: bool, reply: fuser::ReplyEmpty, ) { - let result: Result<_, FuseReleaseError> = match self.open_files.remove(&fh) { - Some(reader) => { - if let Err(e) = self.runtime.block_on(reader.close()) { - debug!(error = %e, "reader close reported error"); - } - Ok(()) - } - None => Err(FuseReleaseError::NotOpen), - }; - match result { - Ok(()) => { - debug!("replying ok"); - reply.ok(); - } - Err(e) => { - debug!(error = %e, "replying error"); - reply.error(e.into()); + if let Some(reader) = self.open_files.remove(&fh) { + if let Err(e) = self.runtime.block_on(reader.close()) { + debug!(error = %e, "reader close reported error"); } + debug!("replying ok"); + reply.ok(); + } else { + debug!("file handle not open, replying error"); + reply.error(libc::EBADF); } } From 7a63d496779e22ec75d86269a661f5853213b93f Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Fri, 20 Feb 2026 12:30:13 -0800 Subject: [PATCH 03/58] feat: add FuseReply trait and FuseResultExt for centralized FUSE error handling --- lib/fs/fuser.rs | 50 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/lib/fs/fuser.rs b/lib/fs/fuser.rs index 61814119..a24397de 100644 --- a/lib/fs/fuser.rs +++ b/lib/fs/fuser.rs @@ -23,6 +23,56 @@ fn io_to_errno(e: &std::io::Error) -> i32 { }) } +/// Trait abstracting the `.error(errno)` method common to all fuser reply types. +trait FuseReply { + fn error(self, errno: i32); +} + +macro_rules! impl_fuse_reply { + ($($ty:ty),* $(,)?) 
=> { + $(impl FuseReply for $ty { + fn error(self, errno: i32) { + // Calls the inherent fuser method (not this trait method). + self.error(errno); + } + })* + }; +} + +// ReplyEmpty and ReplyStatfs are excluded: release and statfs +// do not follow the block_on -> fuse_reply pattern. +impl_fuse_reply!( + fuser::ReplyEntry, + fuser::ReplyAttr, + fuser::ReplyDirectory, + fuser::ReplyOpen, + fuser::ReplyData, +); + +/// Extension trait on `Result` for FUSE reply handling. +/// +/// Centralizes the error-logging + errno-reply path so each FUSE callback +/// only has to express its success path. +#[expect( + dead_code, + reason = "will be used by FUSE callbacks in upcoming commits" +)] +trait FuseResultExt { + fn fuse_reply(self, reply: R, on_ok: impl FnOnce(T, R)); +} + +impl FuseResultExt for Result { + fn fuse_reply(self, reply: R, on_ok: impl FnOnce(T, R)) { + match self { + Ok(val) => on_ok(val, reply), + Err(e) => { + debug!(error = %e, "replying error"); + reply.error(io_to_errno(&e)); + } + } + } +} + mod inner { #![allow(clippy::future_not_send, clippy::mem_forget)] From 98e906f9f5ea3ce684214de8164e937c13aeba56 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Fri, 20 Feb 2026 12:37:32 -0800 Subject: [PATCH 04/58] refactor: use fuse_reply in getattr --- lib/fs/fuser.rs | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/lib/fs/fuser.rs b/lib/fs/fuser.rs index a24397de..41ff2140 100644 --- a/lib/fs/fuser.rs +++ b/lib/fs/fuser.rs @@ -53,10 +53,6 @@ impl_fuse_reply!( /// /// Centralizes the error-logging + errno-reply path so each FUSE callback /// only has to express its success path. 
-#[expect( - dead_code, - reason = "will be used by FUSE callbacks in upcoming commits" -)] trait FuseResultExt { fn fuse_reply(self, reply: R, on_ok: impl FnOnce(T, R)); } @@ -233,20 +229,13 @@ impl fuser::Filesystem for FuserAdapter { _fh: Option, reply: fuser::ReplyAttr, ) { - let result = self - .runtime - .block_on(async { self.inner.get_fs().getattr(LoadedAddr(ino)).await }); - match result { - Ok(inode) => { + self.runtime + .block_on(async { self.inner.get_fs().getattr(LoadedAddr(ino)).await }) + .fuse_reply(reply, |inode, reply| { let attr = inode_to_fuser_attr(&inode, BLOCK_SIZE); debug!(?attr, "replying..."); reply.attr(&Self::SHAMEFUL_TTL, &attr); - } - Err(e) => { - debug!(error = %e, "replying error"); - reply.error(io_to_errno(&e)); - } - } + }); } #[instrument(name = "FuserAdapter::readdir", skip(self, _req, _fh, offset, reply))] From 3d26de286e5c41523718810c553d5dea1b23ec76 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Fri, 20 Feb 2026 12:37:58 -0800 Subject: [PATCH 05/58] refactor: use fuse_reply in lookup --- lib/fs/fuser.rs | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/lib/fs/fuser.rs b/lib/fs/fuser.rs index 41ff2140..704cddaf 100644 --- a/lib/fs/fuser.rs +++ b/lib/fs/fuser.rs @@ -203,22 +203,17 @@ impl fuser::Filesystem for FuserAdapter { name: &OsStr, reply: fuser::ReplyEntry, ) { - let result = self.runtime.block_on(async { - let tracked = self.inner.get_fs().lookup(LoadedAddr(parent), name).await?; - self.inner.ward_inc(tracked.inode.addr); - Ok::<_, std::io::Error>(tracked.inode) - }); - match result { - Ok(inode) => { + self.runtime + .block_on(async { + let tracked = self.inner.get_fs().lookup(LoadedAddr(parent), name).await?; + self.inner.ward_inc(tracked.inode.addr); + Ok::<_, std::io::Error>(tracked.inode) + }) + .fuse_reply(reply, |inode, reply| { let f_attr = inode_to_fuser_attr(&inode, BLOCK_SIZE); debug!(?f_attr, "replying..."); reply.entry(&Self::SHAMEFUL_TTL, &f_attr, 0); - } - 
Err(e) => { - debug!(error = %e, "replying error"); - reply.error(io_to_errno(&e)); - } - } + }); } #[instrument(name = "FuserAdapter::getattr", skip(self, _req, _fh, reply))] From 44ffc1fac2be9d6fa593d6fe5c251173c4e3ddc3 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Fri, 20 Feb 2026 12:38:24 -0800 Subject: [PATCH 06/58] refactor: use fuse_reply in open --- lib/fs/fuser.rs | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/lib/fs/fuser.rs b/lib/fs/fuser.rs index 704cddaf..7d648598 100644 --- a/lib/fs/fuser.rs +++ b/lib/fs/fuser.rs @@ -291,22 +291,17 @@ impl fuser::Filesystem for FuserAdapter { #[instrument(name = "FuserAdapter::open", skip(self, _req, flags, reply))] fn open(&mut self, _req: &fuser::Request<'_>, ino: u64, flags: i32, reply: fuser::ReplyOpen) { let flags = OpenFlags::from_bits_truncate(flags); - let result = self.runtime.block_on(async { - let open_file = self.inner.get_fs().open(LoadedAddr(ino), flags).await?; - let fh = open_file.fh; - self.open_files.insert(fh, Arc::clone(&open_file.reader)); - Ok::<_, std::io::Error>(fh) - }); - match result { - Ok(fh) => { + self.runtime + .block_on(async { + let open_file = self.inner.get_fs().open(LoadedAddr(ino), flags).await?; + let fh = open_file.fh; + self.open_files.insert(fh, Arc::clone(&open_file.reader)); + Ok::<_, std::io::Error>(fh) + }) + .fuse_reply(reply, |fh, reply| { debug!(handle = fh, "replying..."); reply.opened(fh, 0); - } - Err(e) => { - debug!(error = %e, "replying error"); - reply.error(io_to_errno(&e)); - } - } + }); } #[instrument( From 74904cd4e9db84068c781be50c089cacf4f07e19 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Fri, 20 Feb 2026 12:38:51 -0800 Subject: [PATCH 07/58] refactor: use fuse_reply in read --- lib/fs/fuser.rs | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/lib/fs/fuser.rs b/lib/fs/fuser.rs index 7d648598..824fafa2 100644 --- a/lib/fs/fuser.rs +++ b/lib/fs/fuser.rs @@ 
-319,23 +319,18 @@ impl fuser::Filesystem for FuserAdapter { _lock_owner: Option, reply: fuser::ReplyData, ) { - let result = self.runtime.block_on(async { - let reader = self - .open_files - .get(&fh) - .ok_or_else(|| std::io::Error::from_raw_os_error(libc::EBADF))?; - reader.read(offset.cast_unsigned(), size).await - }); - match result { - Ok(data) => { + self.runtime + .block_on(async { + let reader = self + .open_files + .get(&fh) + .ok_or_else(|| std::io::Error::from_raw_os_error(libc::EBADF))?; + reader.read(offset.cast_unsigned(), size).await + }) + .fuse_reply(reply, |data, reply| { debug!(read_bytes = data.len(), "replying..."); reply.data(&data); - } - Err(e) => { - debug!(error = %e, "replying error"); - reply.error(io_to_errno(&e)); - } - } + }); } #[instrument( From cc820ca8c8a5b666bf8110f649630a0cc7d3e3e0 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Fri, 20 Feb 2026 12:39:43 -0800 Subject: [PATCH 08/58] refactor: use fuse_reply in readdir --- lib/fs/fuser.rs | 81 ++++++++++++++++++++++--------------------------- 1 file changed, 37 insertions(+), 44 deletions(-) diff --git a/lib/fs/fuser.rs b/lib/fs/fuser.rs index 824fafa2..886a5f6f 100644 --- a/lib/fs/fuser.rs +++ b/lib/fs/fuser.rs @@ -240,52 +240,45 @@ impl fuser::Filesystem for FuserAdapter { ino: u64, _fh: u64, offset: i64, - mut reply: fuser::ReplyDirectory, + reply: fuser::ReplyDirectory, ) { let offset_u64 = offset.cast_unsigned(); - let result = self.runtime.block_on(async { - let mut entries = Vec::new(); - self.inner - .get_fs() - .readdir(LoadedAddr(ino), offset_u64, |de, _next_offset| { - entries.push((de.inode.addr, de.name.to_os_string(), de.inode.itype)); - false - }) - .await?; - Ok::<_, std::io::Error>(entries) - }); - - let entries = match result { - Ok(entries) => entries, - Err(e) => { - debug!(error = %e, "replying error"); - reply.error(io_to_errno(&e)); - return; - } - }; - - #[expect( - clippy::cast_possible_truncation, - reason = "offset fits in usize on supported 64-bit 
platforms" - )] - for (i, (entry_ino, entry_name, entry_itype)) in entries.iter().enumerate() { - let kind = inode_type_to_fuser(*entry_itype); - let abs_idx = offset_u64 as usize + i + 1; - let Ok(idx): Result = abs_idx.try_into() else { - error!("Directory entry index {} too large for fuser", abs_idx); - reply.error(libc::EIO); - return; - }; - - debug!(?entry_name, ino = entry_ino, "adding entry to reply..."); - if reply.add(*entry_ino, idx, kind, entry_name) { - debug!("buffer full for now, stopping readdir"); - break; - } - } - - debug!("finalizing reply..."); - reply.ok(); + self.runtime + .block_on(async { + let mut entries = Vec::new(); + self.inner + .get_fs() + .readdir(LoadedAddr(ino), offset_u64, |de, _next_offset| { + entries.push((de.inode.addr, de.name.to_os_string(), de.inode.itype)); + false + }) + .await?; + Ok::<_, std::io::Error>(entries) + }) + .fuse_reply(reply, |entries, mut reply| { + for (i, (entry_ino, entry_name, entry_itype)) in entries.iter().enumerate() { + let kind = inode_type_to_fuser(*entry_itype); + #[expect( + clippy::cast_possible_truncation, + reason = "offset fits in usize on supported 64-bit platforms" + )] + let abs_idx = offset_u64 as usize + i + 1; + let Ok(idx): Result = abs_idx.try_into() else { + error!("Directory entry index {} too large for fuser", abs_idx); + reply.error(libc::EIO); + return; + }; + + debug!(?entry_name, ino = entry_ino, "adding entry to reply..."); + if reply.add(*entry_ino, idx, kind, entry_name) { + debug!("buffer full for now, stopping readdir"); + break; + } + } + + debug!("finalizing reply..."); + reply.ok(); + }); } #[instrument(name = "FuserAdapter::open", skip(self, _req, flags, reply))] From e7d59095b520ace0e7a35e67ca7cbff2dc40999f Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Fri, 20 Feb 2026 13:26:29 -0800 Subject: [PATCH 09/58] DCache with per-parent info --- lib/fs/dcache.rs | 184 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 167 insertions(+), 17 deletions(-) 
diff --git a/lib/fs/dcache.rs b/lib/fs/dcache.rs index 5138e802..fab36c7b 100644 --- a/lib/fs/dcache.rs +++ b/lib/fs/dcache.rs @@ -1,4 +1,6 @@ use std::ffi::{OsStr, OsString}; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; use crate::fs::LoadedAddr; @@ -11,29 +13,69 @@ pub struct DValue { pub is_dir: bool, } -/// In-memory directory entry cache mapping `(parent, name)` to child metadata. +/// Per-parent directory state holding child entries and a population flag. +struct DirState { + children: scc::HashMap, + populated: AtomicBool, +} + +impl DirState { + fn new() -> Self { + Self { + children: scc::HashMap::new(), + populated: AtomicBool::new(false), + } + } +} + +/// In-memory directory entry cache with per-parent child maps. /// -/// Backed by [`scc::HashMap`] for atomic upsert on insert. The `readdir` -/// implementation scans the entire map and filters by parent — this is O(n) -/// over the cache size rather than O(log n + k) with an ordered index, but -/// guarantees that `insert` never creates a window where an entry is absent. -#[derive(Default)] +/// Each parent directory gets its own [`DirState`] containing a +/// [`scc::HashMap`] of child entries and an [`AtomicBool`] population flag. +/// This makes `readdir` O(k) in the number of children rather than O(n) +/// over the entire cache. pub struct DCache { - cache: scc::HashMap<(LoadedAddr, OsString), DValue>, + dirs: scc::HashMap>, +} + +impl Default for DCache { + fn default() -> Self { + Self::new() + } } impl DCache { /// Creates an empty directory cache. #[must_use] pub fn new() -> Self { - Self::default() + Self { + dirs: scc::HashMap::new(), + } + } + + /// Returns the [`DirState`] for `parent_ino`, creating one if absent. 
+ fn dir_state(&self, parent_ino: LoadedAddr) -> Arc { + if let Some(entry) = self.dirs.read_sync(&parent_ino, |_, v| Arc::clone(v)) { + return entry; + } + let state = Arc::new(DirState::new()); + match self.dirs.entry_sync(parent_ino) { + scc::hash_map::Entry::Occupied(occ) => Arc::clone(occ.get()), + scc::hash_map::Entry::Vacant(vac) => { + let cloned = Arc::clone(&state); + vac.insert_entry(state); + cloned + } + } } /// Looks up a single child entry by parent inode and name. #[must_use] pub fn lookup(&self, parent_ino: LoadedAddr, name: &OsStr) -> Option { - let key = (parent_ino, name.to_os_string()); - self.cache.read_sync(&key, |_, v| v.clone()) + let state = self.dirs.read_sync(&parent_ino, |_, v| Arc::clone(v))?; + state + .children + .read_sync(&name.to_os_string(), |_, v| v.clone()) } /// Atomically inserts or overwrites a child entry in the cache. @@ -44,22 +86,130 @@ impl DCache { ino: LoadedAddr, is_dir: bool, ) { - let key = (parent_ino, name); + let state = self.dir_state(parent_ino); let value = DValue { ino, is_dir }; - self.cache.upsert_async(key, value).await; + state.children.upsert_async(name, value).await; } /// Returns all cached children of `parent_ino` as `(name, value)` pairs. pub async fn readdir(&self, parent_ino: LoadedAddr) -> Vec<(OsString, DValue)> { + let Some(state) = self.dirs.read_sync(&parent_ino, |_, v| Arc::clone(v)) else { + return Vec::new(); + }; let mut entries = Vec::new(); - self.cache - .iter_async(|key, value| { - if key.0 == parent_ino { - entries.push((key.1.clone(), value.clone())); - } + state + .children + .iter_async(|k, v| { + entries.push((k.clone(), v.clone())); true }) .await; entries } + + /// Returns `true` if the directory at `parent_ino` has been fully populated. 
+ #[must_use] + pub fn is_populated(&self, parent_ino: LoadedAddr) -> bool { + self.dirs + .read_sync(&parent_ino, |_, v| v.populated.load(Ordering::Acquire)) + .unwrap_or(false) + } + + /// Marks the directory at `parent_ino` as fully populated. + pub fn mark_populated(&self, parent_ino: LoadedAddr) { + let state = self.dir_state(parent_ino); + state.populated.store(true, Ordering::Release); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::ffi::OsString; + + #[tokio::test] + async fn lookup_returns_none_for_missing_entry() { + let cache = DCache::new(); + assert!(cache.lookup(LoadedAddr(1), OsStr::new("foo")).is_none()); + } + + #[tokio::test] + async fn insert_then_lookup() { + let cache = DCache::new(); + cache + .insert(LoadedAddr(1), OsString::from("foo"), LoadedAddr(10), false) + .await; + let dv = cache.lookup(LoadedAddr(1), OsStr::new("foo")); + assert!(dv.is_some(), "entry should be present after insert"); + let dv = dv.expect("checked above"); + assert_eq!(dv.ino, LoadedAddr(10)); + assert!(!dv.is_dir); + } + + #[tokio::test] + async fn readdir_returns_only_children_of_parent() { + let cache = DCache::new(); + cache + .insert(LoadedAddr(1), OsString::from("a"), LoadedAddr(10), false) + .await; + cache + .insert(LoadedAddr(1), OsString::from("b"), LoadedAddr(11), true) + .await; + cache + .insert(LoadedAddr(2), OsString::from("c"), LoadedAddr(12), false) + .await; + let children = cache.readdir(LoadedAddr(1)).await; + assert_eq!(children.len(), 2); + let names: Vec<_> = children.iter().map(|(n, _)| n.clone()).collect(); + assert!(names.contains(&OsString::from("a"))); + assert!(names.contains(&OsString::from("b"))); + } + + #[tokio::test] + async fn readdir_empty_parent_returns_empty() { + let cache = DCache::new(); + let children = cache.readdir(LoadedAddr(1)).await; + assert!(children.is_empty()); + } + + #[tokio::test] + async fn is_populated_false_by_default() { + let cache = DCache::new(); + assert!(!cache.is_populated(LoadedAddr(1))); + 
} + + #[tokio::test] + async fn mark_populated_then_check() { + let cache = DCache::new(); + cache.mark_populated(LoadedAddr(1)); + assert!(cache.is_populated(LoadedAddr(1))); + } + + #[tokio::test] + async fn insert_does_not_mark_populated() { + let cache = DCache::new(); + cache + .insert(LoadedAddr(1), OsString::from("foo"), LoadedAddr(10), false) + .await; + assert!( + !cache.is_populated(LoadedAddr(1)), + "insert alone should not mark a directory as populated" + ); + } + + #[tokio::test] + async fn upsert_overwrites_existing_entry() { + let cache = DCache::new(); + cache + .insert(LoadedAddr(1), OsString::from("foo"), LoadedAddr(10), false) + .await; + cache + .insert(LoadedAddr(1), OsString::from("foo"), LoadedAddr(20), true) + .await; + let dv = cache.lookup(LoadedAddr(1), OsStr::new("foo")); + assert!(dv.is_some(), "entry should still be present after upsert"); + let dv = dv.expect("checked above"); + assert_eq!(dv.ino, LoadedAddr(20)); + assert!(dv.is_dir); + } } From 44d5f0751e56a22fbe7f1d678c0e8502135f8842 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Fri, 20 Feb 2026 13:28:37 -0800 Subject: [PATCH 10/58] refactor: use DCache population tracking, remove readdir_populated from AsyncFs --- lib/fs/async_fs.rs | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/lib/fs/async_fs.rs b/lib/fs/async_fs.rs index 7626578f..3bf3b0f3 100644 --- a/lib/fs/async_fs.rs +++ b/lib/fs/async_fs.rs @@ -193,9 +193,6 @@ pub struct AsyncFs<'tbl, DP: FsDataProvider> { /// Monotonically increasing file handle counter. Starts at 1 (0 is reserved). next_fh: AtomicU64, - - /// Tracks which directories have had their children fetched via `dp.readdir`. 
- readdir_populated: FutureBackedCache, } impl<'tbl, DP: FsDataProvider> AsyncFs<'tbl, DP> { @@ -215,7 +212,6 @@ impl<'tbl, DP: FsDataProvider> AsyncFs<'tbl, DP> { directory_cache: DCache::new(), data_provider, next_fh: AtomicU64::new(1), - readdir_populated: FutureBackedCache::default(), } } @@ -235,7 +231,6 @@ impl<'tbl, DP: FsDataProvider> AsyncFs<'tbl, DP> { directory_cache: DCache::new(), data_provider, next_fh: AtomicU64::new(1), - readdir_populated: FutureBackedCache::default(), } } @@ -392,7 +387,7 @@ impl<'tbl, DP: FsDataProvider> AsyncFs<'tbl, DP> { } // Populate the directory cache on first readdir for this parent. - if self.readdir_populated.get(&parent).await.is_none() { + if !self.directory_cache.is_populated(parent) { let children = self.data_provider.readdir(parent_inode).await?; for (name, child_inode) in children { self.inode_table @@ -407,9 +402,7 @@ impl<'tbl, DP: FsDataProvider> AsyncFs<'tbl, DP> { ) .await; } - self.readdir_populated - .get_or_init(parent, || async {}) - .await; + self.directory_cache.mark_populated(parent); } let mut children = self.directory_cache.readdir(parent).await; From f07db8be693508b4d51fa1b96f154317db448cf0 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Fri, 20 Feb 2026 13:30:48 -0800 Subject: [PATCH 11/58] refactor: use DCache population tracking in CompositeFs --- src/fs/mescloud/composite.rs | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/fs/mescloud/composite.rs b/src/fs/mescloud/composite.rs index 3356b7b5..91c35806 100644 --- a/src/fs/mescloud/composite.rs +++ b/src/fs/mescloud/composite.rs @@ -83,7 +83,6 @@ struct OpenFileEntry { pub(super) struct CompositeFs { pub(super) inode_table: FutureBackedCache, pub(super) directory_cache: DCache, - readdir_populated: FutureBackedCache, next_ino: AtomicU64, next_fh: AtomicU64, refcounts: FxHashMap, @@ -121,7 +120,6 @@ impl CompositeFs { Self { inode_table, directory_cache: DCache::new(), - readdir_populated: 
FutureBackedCache::default(), next_ino: AtomicU64::new(Self::ROOT_INO + 1), next_fh: AtomicU64::new(1), refcounts, @@ -286,7 +284,7 @@ impl CompositeFs { .copied() .ok_or(ReadDirError::InodeNotFound)?; - if self.readdir_populated.get(&LoadedAddr(ino)).await.is_none() { + if !self.directory_cache.is_populated(LoadedAddr(ino)) { let inner_ino = self.slots[idx] .bridge .forward(ino) @@ -321,9 +319,7 @@ impl CompositeFs { .await; } - self.readdir_populated - .get_or_init(LoadedAddr(ino), || async {}) - .await; + self.directory_cache.mark_populated(LoadedAddr(ino)); } let mut children = self.directory_cache.readdir(LoadedAddr(ino)).await; From 7558e8624380775aac8b7c175d9e7fcfeafe3ca0 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Fri, 20 Feb 2026 13:31:36 -0800 Subject: [PATCH 12/58] fix: update stale readdir_populated comment in async_fs.rs --- lib/fs/async_fs.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/fs/async_fs.rs b/lib/fs/async_fs.rs index 3bf3b0f3..761149d2 100644 --- a/lib/fs/async_fs.rs +++ b/lib/fs/async_fs.rs @@ -368,7 +368,7 @@ impl<'tbl, DP: FsDataProvider> AsyncFs<'tbl, DP> { /// /// # Concurrency /// - /// The `readdir_populated` check-then-populate is **not** atomic. If two + /// The `is_populated` check-then-populate is **not** atomic. If two /// concurrent callers invoke `readdir` for the same parent, both may call /// `dp.readdir()` and insert duplicate children. This is safe when the /// caller serializes access (e.g. via `&mut self` on the `Fs` trait). 
From bcf2f1eb46b5b2f90d1b335a9cfb76463ddc66f9 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Fri, 20 Feb 2026 14:08:22 -0800 Subject: [PATCH 13/58] feat: add ConcurrentBridge for lock-free inode address translation --- lib/fs/bridge.rs | 76 +++++++++++++++++++++++++++++++++++++++++++ lib/fs/mod.rs | 2 ++ tests/bridge_tests.rs | 49 ++++++++++++++++++++++++++++ 3 files changed, 127 insertions(+) create mode 100644 lib/fs/bridge.rs create mode 100644 tests/bridge_tests.rs diff --git a/lib/fs/bridge.rs b/lib/fs/bridge.rs new file mode 100644 index 00000000..5bb1b028 --- /dev/null +++ b/lib/fs/bridge.rs @@ -0,0 +1,76 @@ +//! Lock-free bidirectional inode address mapping. +//! +//! [`ConcurrentBridge`] maps between "outer" (composite) and "inner" (child) +//! inode address spaces using two [`scc::HashMap`]s. + +use crate::fs::InodeAddr; + +/// Bidirectional inode mapping between outer (composite) and inner (child) address spaces. +/// +/// Uses two lock-free `scc::HashMap`s. Insertion order: forward map first, +/// then backward map, so any observer that discovers an outer addr via +/// `backward` can immediately resolve it via `forward`. +pub struct ConcurrentBridge { + /// outer -> inner + fwd: scc::HashMap, + /// inner -> outer + bwd: scc::HashMap, +} + +impl ConcurrentBridge { + /// Creates an empty bridge. + #[must_use] + pub fn new() -> Self { + Self { + fwd: scc::HashMap::new(), + bwd: scc::HashMap::new(), + } + } + + /// Insert a mapping from outer to inner. + /// + /// Inserts into the forward map first (see module docs for ordering rationale). + pub fn insert(&self, outer: InodeAddr, inner: InodeAddr) { + let _ = self.fwd.insert_sync(outer, inner); + let _ = self.bwd.insert_sync(inner, outer); + } + + /// Resolve outer -> inner. + #[must_use] + pub fn forward(&self, outer: InodeAddr) -> Option { + self.fwd.read_sync(&outer, |_, &v| v) + } + + /// Resolve inner -> outer. 
+ #[must_use] + pub fn backward(&self, inner: InodeAddr) -> Option { + self.bwd.read_sync(&inner, |_, &v| v) + } + + /// Look up inner -> outer, or allocate a new outer address if unmapped. + pub fn backward_or_insert( + &self, + inner: InodeAddr, + allocate: impl FnOnce() -> InodeAddr, + ) -> InodeAddr { + if let Some(outer) = self.backward(inner) { + return outer; + } + let outer = allocate(); + self.insert(outer, inner); + outer + } + + /// Remove the mapping for the given outer address. + pub fn remove_by_outer(&self, outer: InodeAddr) { + if let Some((_, inner)) = self.fwd.remove_sync(&outer) { + self.bwd.remove_sync(&inner); + } + } +} + +impl Default for ConcurrentBridge { + fn default() -> Self { + Self::new() + } +} diff --git a/lib/fs/mod.rs b/lib/fs/mod.rs index e8f971b4..f5d42961 100644 --- a/lib/fs/mod.rs +++ b/lib/fs/mod.rs @@ -1,6 +1,8 @@ //! Useful filesystem generalizations. /// Async filesystem cache with concurrent inode management. pub mod async_fs; +/// Lock-free bidirectional inode address mapping. +pub mod bridge; /// Directory entry cache for fast parent-child lookups. pub mod dcache; /// FUSE adapter: maps [`fuser::Filesystem`] callbacks to [`async_fs::AsyncFs`]. 
diff --git a/tests/bridge_tests.rs b/tests/bridge_tests.rs new file mode 100644 index 00000000..b0598e4d --- /dev/null +++ b/tests/bridge_tests.rs @@ -0,0 +1,49 @@ +#![allow(clippy::unwrap_used, missing_docs)] + +use git_fs::fs::bridge::ConcurrentBridge; + +#[test] +fn insert_then_forward_returns_inner() { + let bridge = ConcurrentBridge::new(); + bridge.insert(10, 100); + assert_eq!(bridge.forward(10), Some(100)); +} + +#[test] +fn insert_then_backward_returns_outer() { + let bridge = ConcurrentBridge::new(); + bridge.insert(10, 100); + assert_eq!(bridge.backward(100), Some(10)); +} + +#[test] +fn forward_missing_returns_none() { + let bridge = ConcurrentBridge::new(); + assert_eq!(bridge.forward(42), None); +} + +#[test] +fn backward_or_insert_existing_returns_cached() { + let bridge = ConcurrentBridge::new(); + bridge.insert(10, 100); + let outer = bridge.backward_or_insert(100, || 999); + assert_eq!(outer, 10, "should return existing outer addr"); +} + +#[test] +fn backward_or_insert_new_allocates() { + let bridge = ConcurrentBridge::new(); + let outer = bridge.backward_or_insert(200, || 50); + assert_eq!(outer, 50, "should use allocator"); + assert_eq!(bridge.forward(50), Some(200)); + assert_eq!(bridge.backward(200), Some(50)); +} + +#[test] +fn remove_by_outer_clears_both_directions() { + let bridge = ConcurrentBridge::new(); + bridge.insert(10, 100); + bridge.remove_by_outer(10); + assert_eq!(bridge.forward(10), None); + assert_eq!(bridge.backward(100), None); +} From a19e91d1d7e395d11501f4a422213c4800518bac Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Fri, 20 Feb 2026 14:12:17 -0800 Subject: [PATCH 14/58] fix: eliminate TOCTOU race in ConcurrentBridge::backward_or_insert Use `scc::HashMap::entry_sync` for atomic check-and-insert instead of separate backward() + insert() calls that allowed two concurrent callers to both allocate for the same inner address. Also add #[must_use]. 
--- lib/fs/bridge.rs | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/lib/fs/bridge.rs b/lib/fs/bridge.rs index 5bb1b028..350d8750 100644 --- a/lib/fs/bridge.rs +++ b/lib/fs/bridge.rs @@ -48,17 +48,22 @@ impl ConcurrentBridge { } /// Look up inner -> outer, or allocate a new outer address if unmapped. + #[must_use] pub fn backward_or_insert( &self, inner: InodeAddr, allocate: impl FnOnce() -> InodeAddr, ) -> InodeAddr { - if let Some(outer) = self.backward(inner) { - return outer; + match self.bwd.entry_sync(inner) { + scc::hash_map::Entry::Occupied(occ) => *occ.get(), + scc::hash_map::Entry::Vacant(vac) => { + let outer = allocate(); + vac.insert_entry(outer); + // Populate forward map after backward is committed. + let _ = self.fwd.insert_sync(outer, inner); + outer + } } - let outer = allocate(); - self.insert(outer, inner); - outer } /// Remove the mapping for the given outer address. From d9fdc04b8c75e6016502889c5a67e7e4b0584fc8 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Fri, 20 Feb 2026 14:15:20 -0800 Subject: [PATCH 15/58] feat: add CompositeRoot trait, ChildInner, and CompositeReader --- lib/fs/composite.rs | 125 ++++++++++++++++++++++++++++++++++++++++++++ lib/fs/mod.rs | 2 + 2 files changed, 127 insertions(+) create mode 100644 lib/fs/composite.rs diff --git a/lib/fs/composite.rs b/lib/fs/composite.rs new file mode 100644 index 00000000..36969c67 --- /dev/null +++ b/lib/fs/composite.rs @@ -0,0 +1,125 @@ +//! Generic composite filesystem types. +//! +//! A composite filesystem presents multiple child filesystems under a single +//! virtual root directory. The [`CompositeRoot`] trait describes how children +//! are discovered, [`ChildInner`] co-locates an inode table with an +//! [`AsyncFs`](super::async_fs::AsyncFs), and [`CompositeReader`] wraps a +//! child reader so the composite layer can expose it through [`FileReader`]. 
+ +use std::ffi::{OsStr, OsString}; +use std::future::Future; +use std::sync::Arc; + +use bytes::Bytes; + +use crate::fs::INode; +use crate::fs::async_fs::{FileReader, FsDataProvider}; + +/// Descriptor for a child filesystem returned by [`CompositeRoot`]. +pub struct ChildDescriptor { + /// The name this child is listed as in the composite root directory. + pub name: OsString, + /// The data provider for this child. + pub provider: DP, + /// The root inode of the child filesystem. + pub root_ino: INode, +} + +/// Describes the children that a composite filesystem exposes at its root. +/// +/// Implementors define domain-specific child resolution: what children exist, +/// and what [`FsDataProvider`] backs each child. +pub trait CompositeRoot: Send + Sync + 'static { + /// The data provider type for child filesystems. + type ChildDP: FsDataProvider; + + /// Resolve a child by name, returning its data provider and root inode. + /// + /// Called on lookup at the composite root. Returns `None` if the name + /// does not correspond to a known child. + fn resolve_child( + &self, + name: &OsStr, + ) -> impl Future>, std::io::Error>> + Send; + + /// List all children at the composite root. + /// + /// Called on readdir at the composite root. + fn list_children( + &self, + ) -> impl Future>, std::io::Error>> + Send; +} + +mod child_inner_impl { + #![allow(clippy::future_not_send, clippy::mem_forget)] + + use ouroboros::self_referencing; + + use crate::cache::async_backed::FutureBackedCache; + use crate::fs::async_fs::{AsyncFs, FsDataProvider}; + use crate::fs::{INode, InodeAddr}; + + /// Self-referential struct co-locating an inode table and [`AsyncFs`]. + /// + /// The `AsyncFs` borrows from the table directly, avoiding an extra + /// indirection. This mirrors the [`FuseBridgeInner`](super::super::fuser) + /// pattern. 
+ #[self_referencing] + pub struct ChildInner { + pub(super) table: FutureBackedCache, + #[borrows(table)] + #[covariant] + pub(super) fs: AsyncFs<'this, DP>, + } + + impl ChildInner { + #[expect(dead_code, reason = "used by CompositeFs in a follow-up commit")] + pub(super) fn create(table: FutureBackedCache, provider: DP) -> Self { + ChildInnerBuilder { + table, + fs_builder: |tbl| AsyncFs::new_preseeded(provider, tbl), + } + .build() + } + + #[expect(dead_code, reason = "used by CompositeFs in a follow-up commit")] + pub(super) fn get_fs(&self) -> &AsyncFs<'_, DP> { + self.borrow_fs() + } + } +} + +pub use child_inner_impl::ChildInner; + +/// Wraps a child's reader so that the composite layer can expose it as its own +/// [`FileReader`]. +pub struct CompositeReader { + inner: Arc, +} + +impl CompositeReader { + /// Create a new `CompositeReader` wrapping the given reader. + pub fn new(inner: Arc) -> Self { + Self { inner } + } +} + +impl std::fmt::Debug for CompositeReader { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("CompositeReader").finish_non_exhaustive() + } +} + +impl FileReader for CompositeReader { + fn read( + &self, + offset: u64, + size: u32, + ) -> impl Future> + Send { + self.inner.read(offset, size) + } + + fn close(&self) -> impl Future> + Send { + self.inner.close() + } +} diff --git a/lib/fs/mod.rs b/lib/fs/mod.rs index f5d42961..ed93bd25 100644 --- a/lib/fs/mod.rs +++ b/lib/fs/mod.rs @@ -3,6 +3,8 @@ pub mod async_fs; /// Lock-free bidirectional inode address mapping. pub mod bridge; +/// Generic composite filesystem types. +pub mod composite; /// Directory entry cache for fast parent-child lookups. pub mod dcache; /// FUSE adapter: maps [`fuser::Filesystem`] callbacks to [`async_fs::AsyncFs`]. 
From e2f8215b48bbd62a978d21f78815eee9e293dd4c Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Fri, 20 Feb 2026 14:18:21 -0800 Subject: [PATCH 16/58] test: extract async_backed inline tests to tests/async_backed_correctness.rs --- lib/cache/async_backed.rs | 101 ------------------------------ tests/async_backed_correctness.rs | 99 +++++++++++++++++++++++++++++ 2 files changed, 99 insertions(+), 101 deletions(-) create mode 100644 tests/async_backed_correctness.rs diff --git a/lib/cache/async_backed.rs b/lib/cache/async_backed.rs index c3fddd05..8f15803b 100644 --- a/lib/cache/async_backed.rs +++ b/lib/cache/async_backed.rs @@ -288,104 +288,3 @@ where } } } - -#[cfg(test)] -mod tests { - use super::*; - - #[tokio::test] - async fn try_init_ok_caches_value() { - let cache = FutureBackedCache::::default(); - let result: Result = cache - .get_or_try_init(1, || async { Ok("hello".to_owned()) }) - .await; - assert_eq!(result.unwrap(), "hello", "should return Ok value"); - - // Value should now be cached (get returns it without factory) - let cached = cache.get(&1).await; - assert_eq!(cached.unwrap(), "hello", "value should be in cache"); - } - - #[tokio::test] - async fn try_init_err_does_not_cache() { - let cache = FutureBackedCache::::default(); - let result: Result = cache.get_or_try_init(1, || async { Err("boom") }).await; - assert_eq!(result.unwrap_err(), "boom", "should return the error"); - - // Cache should be empty — error was not stored - assert!(cache.is_empty(), "cache should have no entries after error"); - assert!(cache.get(&1).await.is_none(), "key should not exist"); - } - - #[tokio::test] - async fn try_init_err_then_retry_ok() { - let cache = FutureBackedCache::::default(); - - // First call: factory fails - let r1: Result = cache.get_or_try_init(1, || async { Err("fail") }).await; - assert!(r1.is_err(), "first call should fail"); - - // Second call: factory succeeds - let r2: Result = cache - .get_or_try_init(1, || async { Ok("recovered".to_owned()) 
}) - .await; - assert_eq!(r2.unwrap(), "recovered", "retry should succeed"); - - // Value should now be cached - let cached = cache.get(&1).await; - assert_eq!(cached.unwrap(), "recovered"); - } - - #[tokio::test] - async fn try_init_returns_value_cached_by_init() { - let cache = FutureBackedCache::::default(); - - // Populate via infallible get_or_init - cache - .get_or_init(1, || async { "from_init".to_owned() }) - .await; - - // get_or_try_init should return the cached value without running factory - let result: Result = cache - .get_or_try_init(1, || async { panic!("factory should not run") }) - .await; - assert_eq!(result.unwrap(), "from_init"); - } - - #[tokio::test] - async fn panic_in_factory_is_recovered() { - use std::sync::Arc; - use std::sync::atomic::{AtomicUsize, Ordering}; - - let cache = Arc::new(FutureBackedCache::::default()); - let call_count = Arc::new(AtomicUsize::new(0)); - - // Spawn a task whose factory panics. tokio::spawn catches the panic. - let cache2 = Arc::clone(&cache); - let call_count2 = Arc::clone(&call_count); - let handle = tokio::spawn(async move { - cache2 - .get_or_init(1, || { - call_count2.fetch_add(1, Ordering::Relaxed); - async { panic!("boom") } - }) - .await - }); - // The spawned task panics internally; JoinHandle returns Err. - assert!(handle.await.is_err(), "task should have panicked"); - - // The key should NOT be permanently bricked. A new caller should succeed. 
- let v = cache - .get_or_init(1, || { - call_count.fetch_add(1, Ordering::Relaxed); - async { "recovered".to_owned() } - }) - .await; - assert_eq!(v, "recovered", "should recover after panic"); - assert_eq!( - call_count.load(Ordering::Relaxed), - 2, - "factory called twice" - ); - } -} diff --git a/tests/async_backed_correctness.rs b/tests/async_backed_correctness.rs new file mode 100644 index 00000000..457ba948 --- /dev/null +++ b/tests/async_backed_correctness.rs @@ -0,0 +1,99 @@ +#![allow(clippy::unwrap_used, missing_docs)] + +use std::sync::Arc; +use std::sync::atomic::{AtomicUsize, Ordering}; + +use git_fs::cache::async_backed::FutureBackedCache; + +#[tokio::test] +async fn try_init_ok_caches_value() { + let cache = FutureBackedCache::::default(); + let result: Result = cache + .get_or_try_init(1, || async { Ok("hello".to_owned()) }) + .await; + assert_eq!(result.unwrap(), "hello", "should return Ok value"); + + // Value should now be cached (get returns it without factory) + let cached = cache.get(&1).await; + assert_eq!(cached.unwrap(), "hello", "value should be in cache"); +} + +#[tokio::test] +async fn try_init_err_does_not_cache() { + let cache = FutureBackedCache::::default(); + let result: Result = cache.get_or_try_init(1, || async { Err("boom") }).await; + assert_eq!(result.unwrap_err(), "boom", "should return the error"); + + // Cache should be empty — error was not stored + assert!(cache.is_empty(), "cache should have no entries after error"); + assert!(cache.get(&1).await.is_none(), "key should not exist"); +} + +#[tokio::test] +async fn try_init_err_then_retry_ok() { + let cache = FutureBackedCache::::default(); + + // First call: factory fails + let r1: Result = cache.get_or_try_init(1, || async { Err("fail") }).await; + assert!(r1.is_err(), "first call should fail"); + + // Second call: factory succeeds + let r2: Result = cache + .get_or_try_init(1, || async { Ok("recovered".to_owned()) }) + .await; + assert_eq!(r2.unwrap(), "recovered", "retry 
should succeed"); + + // Value should now be cached + let cached = cache.get(&1).await; + assert_eq!(cached.unwrap(), "recovered"); +} + +#[tokio::test] +async fn try_init_returns_value_cached_by_init() { + let cache = FutureBackedCache::::default(); + + // Populate via infallible get_or_init + cache + .get_or_init(1, || async { "from_init".to_owned() }) + .await; + + // get_or_try_init should return the cached value without running factory + let result: Result = cache + .get_or_try_init(1, || async { panic!("factory should not run") }) + .await; + assert_eq!(result.unwrap(), "from_init"); +} + +#[tokio::test] +async fn panic_in_factory_is_recovered() { + let cache = Arc::new(FutureBackedCache::::default()); + let call_count = Arc::new(AtomicUsize::new(0)); + + // Spawn a task whose factory panics. tokio::spawn catches the panic. + let cache2 = Arc::clone(&cache); + let call_count2 = Arc::clone(&call_count); + let handle = tokio::spawn(async move { + cache2 + .get_or_init(1, || { + call_count2.fetch_add(1, Ordering::Relaxed); + async { panic!("boom") } + }) + .await + }); + // The spawned task panics internally; JoinHandle returns Err. + assert!(handle.await.is_err(), "task should have panicked"); + + // The key should NOT be permanently bricked. A new caller should succeed. 
+ let v = cache + .get_or_init(1, || { + call_count.fetch_add(1, Ordering::Relaxed); + async { "recovered".to_owned() } + }) + .await; + assert_eq!(v, "recovered", "should recover after panic"); + assert_eq!( + call_count.load(Ordering::Relaxed), + 2, + "factory called twice" + ); +} From 6fe9dd52afe722fe6e6898db49dd9c3334138e2e Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Fri, 20 Feb 2026 14:18:51 -0800 Subject: [PATCH 17/58] fix: add #[must_use] to CompositeReader::new --- lib/fs/composite.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/fs/composite.rs b/lib/fs/composite.rs index 36969c67..d8237dcb 100644 --- a/lib/fs/composite.rs +++ b/lib/fs/composite.rs @@ -99,6 +99,7 @@ pub struct CompositeReader { impl CompositeReader { /// Create a new `CompositeReader` wrapping the given reader. + #[must_use] pub fn new(inner: Arc) -> Self { Self { inner } } From 781d7bb28376cbfaffacfc7a8f17ef43e62fdc4b Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Fri, 20 Feb 2026 14:21:15 -0800 Subject: [PATCH 18/58] test: extract dcache inline tests to tests/dcache_correctness.rs --- lib/fs/dcache.rs | 92 ------------------------------------- tests/dcache_correctness.rs | 92 +++++++++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+), 92 deletions(-) create mode 100644 tests/dcache_correctness.rs diff --git a/lib/fs/dcache.rs b/lib/fs/dcache.rs index fab36c7b..4870a401 100644 --- a/lib/fs/dcache.rs +++ b/lib/fs/dcache.rs @@ -121,95 +121,3 @@ impl DCache { state.populated.store(true, Ordering::Release); } } - -#[cfg(test)] -mod tests { - use super::*; - use std::ffi::OsString; - - #[tokio::test] - async fn lookup_returns_none_for_missing_entry() { - let cache = DCache::new(); - assert!(cache.lookup(LoadedAddr(1), OsStr::new("foo")).is_none()); - } - - #[tokio::test] - async fn insert_then_lookup() { - let cache = DCache::new(); - cache - .insert(LoadedAddr(1), OsString::from("foo"), LoadedAddr(10), false) - .await; - let dv = cache.lookup(LoadedAddr(1), 
OsStr::new("foo")); - assert!(dv.is_some(), "entry should be present after insert"); - let dv = dv.expect("checked above"); - assert_eq!(dv.ino, LoadedAddr(10)); - assert!(!dv.is_dir); - } - - #[tokio::test] - async fn readdir_returns_only_children_of_parent() { - let cache = DCache::new(); - cache - .insert(LoadedAddr(1), OsString::from("a"), LoadedAddr(10), false) - .await; - cache - .insert(LoadedAddr(1), OsString::from("b"), LoadedAddr(11), true) - .await; - cache - .insert(LoadedAddr(2), OsString::from("c"), LoadedAddr(12), false) - .await; - let children = cache.readdir(LoadedAddr(1)).await; - assert_eq!(children.len(), 2); - let names: Vec<_> = children.iter().map(|(n, _)| n.clone()).collect(); - assert!(names.contains(&OsString::from("a"))); - assert!(names.contains(&OsString::from("b"))); - } - - #[tokio::test] - async fn readdir_empty_parent_returns_empty() { - let cache = DCache::new(); - let children = cache.readdir(LoadedAddr(1)).await; - assert!(children.is_empty()); - } - - #[tokio::test] - async fn is_populated_false_by_default() { - let cache = DCache::new(); - assert!(!cache.is_populated(LoadedAddr(1))); - } - - #[tokio::test] - async fn mark_populated_then_check() { - let cache = DCache::new(); - cache.mark_populated(LoadedAddr(1)); - assert!(cache.is_populated(LoadedAddr(1))); - } - - #[tokio::test] - async fn insert_does_not_mark_populated() { - let cache = DCache::new(); - cache - .insert(LoadedAddr(1), OsString::from("foo"), LoadedAddr(10), false) - .await; - assert!( - !cache.is_populated(LoadedAddr(1)), - "insert alone should not mark a directory as populated" - ); - } - - #[tokio::test] - async fn upsert_overwrites_existing_entry() { - let cache = DCache::new(); - cache - .insert(LoadedAddr(1), OsString::from("foo"), LoadedAddr(10), false) - .await; - cache - .insert(LoadedAddr(1), OsString::from("foo"), LoadedAddr(20), true) - .await; - let dv = cache.lookup(LoadedAddr(1), OsStr::new("foo")); - assert!(dv.is_some(), "entry should still be 
present after upsert"); - let dv = dv.expect("checked above"); - assert_eq!(dv.ino, LoadedAddr(20)); - assert!(dv.is_dir); - } -} diff --git a/tests/dcache_correctness.rs b/tests/dcache_correctness.rs new file mode 100644 index 00000000..59731d28 --- /dev/null +++ b/tests/dcache_correctness.rs @@ -0,0 +1,92 @@ +#![allow(clippy::unwrap_used, missing_docs)] + +use std::ffi::{OsStr, OsString}; + +use git_fs::fs::LoadedAddr; +use git_fs::fs::dcache::DCache; + +#[tokio::test] +async fn lookup_returns_none_for_missing_entry() { + let cache = DCache::new(); + assert!(cache.lookup(LoadedAddr(1), OsStr::new("foo")).is_none()); +} + +#[tokio::test] +async fn insert_then_lookup() { + let cache = DCache::new(); + cache + .insert(LoadedAddr(1), OsString::from("foo"), LoadedAddr(10), false) + .await; + let dv = cache.lookup(LoadedAddr(1), OsStr::new("foo")); + assert!(dv.is_some(), "entry should be present after insert"); + let dv = dv.expect("checked above"); + assert_eq!(dv.ino, LoadedAddr(10)); + assert!(!dv.is_dir); +} + +#[tokio::test] +async fn readdir_returns_only_children_of_parent() { + let cache = DCache::new(); + cache + .insert(LoadedAddr(1), OsString::from("a"), LoadedAddr(10), false) + .await; + cache + .insert(LoadedAddr(1), OsString::from("b"), LoadedAddr(11), true) + .await; + cache + .insert(LoadedAddr(2), OsString::from("c"), LoadedAddr(12), false) + .await; + let children = cache.readdir(LoadedAddr(1)).await; + assert_eq!(children.len(), 2); + let names: Vec<_> = children.iter().map(|(n, _)| n.clone()).collect(); + assert!(names.contains(&OsString::from("a"))); + assert!(names.contains(&OsString::from("b"))); +} + +#[tokio::test] +async fn readdir_empty_parent_returns_empty() { + let cache = DCache::new(); + let children = cache.readdir(LoadedAddr(1)).await; + assert!(children.is_empty()); +} + +#[tokio::test] +async fn is_populated_false_by_default() { + let cache = DCache::new(); + assert!(!cache.is_populated(LoadedAddr(1))); +} + +#[tokio::test] +async fn 
mark_populated_then_check() { + let cache = DCache::new(); + cache.mark_populated(LoadedAddr(1)); + assert!(cache.is_populated(LoadedAddr(1))); +} + +#[tokio::test] +async fn insert_does_not_mark_populated() { + let cache = DCache::new(); + cache + .insert(LoadedAddr(1), OsString::from("foo"), LoadedAddr(10), false) + .await; + assert!( + !cache.is_populated(LoadedAddr(1)), + "insert alone should not mark a directory as populated" + ); +} + +#[tokio::test] +async fn upsert_overwrites_existing_entry() { + let cache = DCache::new(); + cache + .insert(LoadedAddr(1), OsString::from("foo"), LoadedAddr(10), false) + .await; + cache + .insert(LoadedAddr(1), OsString::from("foo"), LoadedAddr(20), true) + .await; + let dv = cache.lookup(LoadedAddr(1), OsStr::new("foo")); + assert!(dv.is_some(), "entry should still be present after upsert"); + let dv = dv.expect("checked above"); + assert_eq!(dv.ino, LoadedAddr(20)); + assert!(dv.is_dir); +} From 903392f23572fa882a9c1f415fcd4c77cd0c8981 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Fri, 20 Feb 2026 14:24:26 -0800 Subject: [PATCH 19/58] feat: add CompositeFs struct with FsDataProvider impl --- lib/fs/composite.rs | 350 +++++++++++++++++++++++++++++++++++++- src/fs/mescloud/common.rs | 3 + 2 files changed, 349 insertions(+), 4 deletions(-) diff --git a/lib/fs/composite.rs b/lib/fs/composite.rs index d8237dcb..ceb29308 100644 --- a/lib/fs/composite.rs +++ b/lib/fs/composite.rs @@ -9,11 +9,14 @@ use std::ffi::{OsStr, OsString}; use std::future::Future; use std::sync::Arc; +use std::sync::atomic::{AtomicU64, Ordering}; use bytes::Bytes; -use crate::fs::INode; -use crate::fs::async_fs::{FileReader, FsDataProvider}; +use crate::cache::async_backed::FutureBackedCache; +use crate::fs::async_fs::{FileReader, FsDataProvider, OpenFile}; +use crate::fs::bridge::ConcurrentBridge; +use crate::fs::{INode, INodeType, InodeAddr, InodePerms, LoadedAddr, OpenFlags}; /// Descriptor for a child filesystem returned by [`CompositeRoot`]. 
pub struct ChildDescriptor { @@ -73,7 +76,6 @@ mod child_inner_impl { } impl ChildInner { - #[expect(dead_code, reason = "used by CompositeFs in a follow-up commit")] pub(super) fn create(table: FutureBackedCache, provider: DP) -> Self { ChildInnerBuilder { table, @@ -82,7 +84,6 @@ mod child_inner_impl { .build() } - #[expect(dead_code, reason = "used by CompositeFs in a follow-up commit")] pub(super) fn get_fs(&self) -> &AsyncFs<'_, DP> { self.borrow_fs() } @@ -124,3 +125,344 @@ impl FileReader for CompositeReader { self.inner.close() } } + +struct ChildSlot { + inner: Arc>, + bridge: ConcurrentBridge, +} + +struct CompositeFsInner { + root: R, + /// Child slots, indexed by slot number. + slots: scc::HashMap>, + /// Maps a composite-level outer inode to its child slot index. + addr_to_slot: scc::HashMap, + /// Maps child name to slot index (for dedup on concurrent resolve). + name_to_slot: scc::HashMap, + /// Monotonically increasing slot counter. + next_slot: AtomicU64, + /// Monotonically increasing inode counter. Starts at 2 (1 = root). + next_ino: AtomicU64, + /// The filesystem owner uid/gid. + fs_owner: (u32, u32), +} + +/// A generic composite filesystem that routes to child `AsyncFs` instances. +/// +/// Implements [`FsDataProvider`] so it can be used inside another `AsyncFs`. +/// Clone is cheap (shared `Arc`). +pub struct CompositeFs { + inner: Arc>, +} + +impl Clone for CompositeFs { + fn clone(&self) -> Self { + Self { + inner: Arc::clone(&self.inner), + } + } +} + +impl CompositeFs { + /// Root inode address for this composite level. + pub const ROOT_INO: InodeAddr = 1; + + /// Create a new composite filesystem. 
+ #[must_use] + pub fn new(root: R, fs_owner: (u32, u32)) -> Self { + Self { + inner: Arc::new(CompositeFsInner { + root, + slots: scc::HashMap::new(), + addr_to_slot: scc::HashMap::new(), + name_to_slot: scc::HashMap::new(), + next_slot: AtomicU64::new(0), + next_ino: AtomicU64::new(2), // 1 = root + fs_owner, + }), + } + } + + /// Build the root inode for this composite filesystem. + #[must_use] + pub fn make_root_inode(&self) -> INode { + let now = std::time::SystemTime::now(); + INode { + addr: Self::ROOT_INO, + permissions: InodePerms::from_bits_truncate(0o755), + uid: self.inner.fs_owner.0, + gid: self.inner.fs_owner.1, + create_time: now, + last_modified_at: now, + parent: None, + size: 0, + itype: INodeType::Directory, + } + } + + fn allocate_ino(&self) -> InodeAddr { + self.inner.next_ino.fetch_add(1, Ordering::Relaxed) + } + + fn make_child_dir_inode(&self, addr: InodeAddr) -> INode { + let now = std::time::SystemTime::now(); + INode { + addr, + permissions: InodePerms::from_bits_truncate(0o755), + uid: self.inner.fs_owner.0, + gid: self.inner.fs_owner.1, + create_time: now, + last_modified_at: now, + parent: Some(Self::ROOT_INO), + size: 0, + itype: INodeType::Directory, + } + } + + /// Register a child, returning the composite-level outer inode address. + /// + /// If the child is already registered by name, the existing outer address + /// is returned. Otherwise a new slot is created with a fresh inode table + /// and bridge mapping. + fn register_child(&self, desc: &ChildDescriptor) -> InodeAddr + where + R::ChildDP: Clone, + { + // Fast path: already registered by name. + match self.inner.name_to_slot.entry_sync(desc.name.clone()) { + scc::hash_map::Entry::Occupied(occ) => { + let slot_idx = *occ.get(); + // Return existing outer address for this child's root inode. 
+ if let Some(outer) = self + .inner + .slots + .read_sync(&slot_idx, |_, slot| { + slot.bridge.backward(desc.root_ino.addr) + }) + .flatten() + { + return outer; + } + // Slot exists but bridge has no mapping — should not happen, + // but fall through to create a fresh slot below. + // (Remove stale name entry so the vacant path can re-insert.) + drop(occ); + self.inner.name_to_slot.remove_sync(&desc.name); + } + scc::hash_map::Entry::Vacant(vac) => { + // Claim the name slot atomically. + let outer_ino = self.allocate_ino(); + #[expect( + clippy::cast_possible_truncation, + reason = "slot index fits in usize on 64-bit" + )] + let slot_idx = self.inner.next_slot.fetch_add(1, Ordering::Relaxed) as usize; + + let table = FutureBackedCache::default(); + table.insert_sync(desc.root_ino.addr, desc.root_ino); + let child_inner = Arc::new(ChildInner::create(table, desc.provider.clone())); + + let bridge = ConcurrentBridge::new(); + bridge.insert(outer_ino, desc.root_ino.addr); + + drop(self.inner.slots.insert_sync( + slot_idx, + ChildSlot { + inner: child_inner, + bridge, + }, + )); + let _ = self.inner.addr_to_slot.insert_sync(outer_ino, slot_idx); + vac.insert_entry(slot_idx); + + return outer_ino; + } + } + + // Fallback: name was stale, create fresh. This path is rare. 
+ let outer_ino = self.allocate_ino(); + #[expect( + clippy::cast_possible_truncation, + reason = "slot index fits in usize on 64-bit" + )] + let slot_idx = self.inner.next_slot.fetch_add(1, Ordering::Relaxed) as usize; + + let table = FutureBackedCache::default(); + table.insert_sync(desc.root_ino.addr, desc.root_ino); + let child_inner = Arc::new(ChildInner::create(table, desc.provider.clone())); + + let bridge = ConcurrentBridge::new(); + bridge.insert(outer_ino, desc.root_ino.addr); + + drop(self.inner.slots.insert_sync( + slot_idx, + ChildSlot { + inner: child_inner, + bridge, + }, + )); + let _ = self.inner.addr_to_slot.insert_sync(outer_ino, slot_idx); + drop( + self.inner + .name_to_slot + .insert_sync(desc.name.clone(), slot_idx), + ); + + outer_ino + } +} + +impl FsDataProvider for CompositeFs +where + R::ChildDP: Clone, + <::ChildDP as FsDataProvider>::Reader: 'static, +{ + type Reader = CompositeReader<<::ChildDP as FsDataProvider>::Reader>; + + async fn lookup(&self, parent: INode, name: &OsStr) -> Result { + if parent.addr == Self::ROOT_INO { + let desc = self + .inner + .root + .resolve_child(name) + .await? + .ok_or_else(|| std::io::Error::from_raw_os_error(libc::ENOENT))?; + + let outer_ino = self.register_child(&desc); + Ok(self.make_child_dir_inode(outer_ino)) + } else { + let slot_idx = self + .inner + .addr_to_slot + .read_sync(&parent.addr, |_, &v| v) + .ok_or_else(|| std::io::Error::from_raw_os_error(libc::ENOENT))?; + + // Extract Arc and inner parent address under the guard. + let (child, inner_parent) = self + .inner + .slots + .read_sync(&slot_idx, |_, slot| { + (Arc::clone(&slot.inner), slot.bridge.forward(parent.addr)) + }) + .ok_or_else(|| std::io::Error::from_raw_os_error(libc::ENOENT))?; + + let inner_parent = + inner_parent.ok_or_else(|| std::io::Error::from_raw_os_error(libc::ENOENT))?; + + // Await the lookup outside any scc guard. 
+ let tracked = child + .get_fs() + .lookup(LoadedAddr(inner_parent), name) + .await?; + let child_inode = tracked.inode; + + // Translate inner address back to composite-level address. + let outer_ino = self + .inner + .slots + .read_sync(&slot_idx, |_, slot| { + let next_ino = &self.inner.next_ino; + slot.bridge.backward_or_insert(child_inode.addr, || { + next_ino.fetch_add(1, Ordering::Relaxed) + }) + }) + .ok_or_else(|| std::io::Error::from_raw_os_error(libc::ENOENT))?; + + let _ = self.inner.addr_to_slot.insert_sync(outer_ino, slot_idx); + + Ok(INode { + addr: outer_ino, + ..child_inode + }) + } + } + + async fn readdir(&self, parent: INode) -> Result, std::io::Error> { + if parent.addr == Self::ROOT_INO { + let children = self.inner.root.list_children().await?; + let mut entries = Vec::with_capacity(children.len()); + for desc in &children { + let outer_ino = self.register_child(desc); + entries.push((desc.name.clone(), self.make_child_dir_inode(outer_ino))); + } + Ok(entries) + } else { + let slot_idx = self + .inner + .addr_to_slot + .read_sync(&parent.addr, |_, &v| v) + .ok_or_else(|| std::io::Error::from_raw_os_error(libc::ENOENT))?; + + let (child, inner_parent) = self + .inner + .slots + .read_sync(&slot_idx, |_, slot| { + (Arc::clone(&slot.inner), slot.bridge.forward(parent.addr)) + }) + .ok_or_else(|| std::io::Error::from_raw_os_error(libc::ENOENT))?; + + let inner_parent = + inner_parent.ok_or_else(|| std::io::Error::from_raw_os_error(libc::ENOENT))?; + + // Collect child entries outside the guard. + let mut child_entries = Vec::new(); + child + .get_fs() + .readdir(LoadedAddr(inner_parent), 0, |de, _offset| { + child_entries.push((de.name.to_os_string(), de.inode)); + false + }) + .await?; + + // Translate all inner addresses to composite-level addresses. 
+ let mut entries = Vec::with_capacity(child_entries.len()); + for (name, child_inode) in child_entries { + let outer_ino = self + .inner + .slots + .read_sync(&slot_idx, |_, slot| { + let next_ino = &self.inner.next_ino; + slot.bridge.backward_or_insert(child_inode.addr, || { + next_ino.fetch_add(1, Ordering::Relaxed) + }) + }) + .ok_or_else(|| std::io::Error::from_raw_os_error(libc::ENOENT))?; + + let _ = self.inner.addr_to_slot.insert_sync(outer_ino, slot_idx); + entries.push(( + name, + INode { + addr: outer_ino, + ..child_inode + }, + )); + } + Ok(entries) + } + } + + async fn open(&self, inode: INode, flags: OpenFlags) -> Result { + let slot_idx = self + .inner + .addr_to_slot + .read_sync(&inode.addr, |_, &v| v) + .ok_or_else(|| std::io::Error::from_raw_os_error(libc::ENOENT))?; + + let (child, inner_ino) = self + .inner + .slots + .read_sync(&slot_idx, |_, slot| { + (Arc::clone(&slot.inner), slot.bridge.forward(inode.addr)) + }) + .ok_or_else(|| std::io::Error::from_raw_os_error(libc::ENOENT))?; + + let inner_ino = inner_ino.ok_or_else(|| std::io::Error::from_raw_os_error(libc::ENOENT))?; + + let open_file: OpenFile<<::ChildDP as FsDataProvider>::Reader> = + child.get_fs().open(LoadedAddr(inner_ino), flags).await?; + + Ok(CompositeReader { + inner: open_file.reader, + }) + } +} diff --git a/src/fs/mescloud/common.rs b/src/fs/mescloud/common.rs index 6e9c8bf8..473b5e54 100644 --- a/src/fs/mescloud/common.rs +++ b/src/fs/mescloud/common.rs @@ -149,6 +149,9 @@ pub(super) trait ChildFs: Send + Sync { async fn release(&mut self, ino: InodeAddr, fh: FileHandle) -> Result<(), ReleaseError>; } +// Tests kept inline: these types live in the binary crate and are not +// re-exported through the `git_fs` lib, so integration tests in `tests/` +// cannot access them. 
#[cfg(test)] mod tests { use super::*; From 4c55565e46733798ce61689a4c1a718b1a6e17d2 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Fri, 20 Feb 2026 14:29:23 -0800 Subject: [PATCH 20/58] refactor: extract slot creation helper in register_child --- lib/fs/composite.rs | 89 ++++++++++++++++++++++----------------------- 1 file changed, 43 insertions(+), 46 deletions(-) diff --git a/lib/fs/composite.rs b/lib/fs/composite.rs index ceb29308..bf063307 100644 --- a/lib/fs/composite.rs +++ b/lib/fs/composite.rs @@ -219,6 +219,39 @@ impl CompositeFs { } } + /// Allocate a new child slot with a fresh inode table and bridge mapping. + /// + /// Returns `(outer_ino, slot_idx)` for the newly created slot. + fn create_child_slot(&self, desc: &ChildDescriptor) -> (InodeAddr, usize) + where + R::ChildDP: Clone, + { + let outer_ino = self.allocate_ino(); + #[expect( + clippy::cast_possible_truncation, + reason = "slot index fits in usize on 64-bit" + )] + let slot_idx = self.inner.next_slot.fetch_add(1, Ordering::Relaxed) as usize; + + let table = FutureBackedCache::default(); + table.insert_sync(desc.root_ino.addr, desc.root_ino); + let child_inner = Arc::new(ChildInner::create(table, desc.provider.clone())); + + let bridge = ConcurrentBridge::new(); + bridge.insert(outer_ino, desc.root_ino.addr); + + drop(self.inner.slots.insert_sync( + slot_idx, + ChildSlot { + inner: child_inner, + bridge, + }, + )); + let _ = self.inner.addr_to_slot.insert_sync(outer_ino, slot_idx); + + (outer_ino, slot_idx) + } + /// Register a child, returning the composite-level outer inode address. /// /// If the child is already registered by name, the existing outer address @@ -246,62 +279,26 @@ impl CompositeFs { // Slot exists but bridge has no mapping — should not happen, // but fall through to create a fresh slot below. // (Remove stale name entry so the vacant path can re-insert.) 
+ // + // Race window: between `drop(occ)` and the `remove_sync` below, + // another thread could read the stale entry and resolve to a + // broken slot. In the worst case two threads create separate + // slots for the same child — the last writer to `name_to_slot` + // wins and the other slot becomes orphaned. This is functionally + // harmless: the orphaned slot is never reached via name lookup + // and will not serve any future requests. drop(occ); self.inner.name_to_slot.remove_sync(&desc.name); } scc::hash_map::Entry::Vacant(vac) => { - // Claim the name slot atomically. - let outer_ino = self.allocate_ino(); - #[expect( - clippy::cast_possible_truncation, - reason = "slot index fits in usize on 64-bit" - )] - let slot_idx = self.inner.next_slot.fetch_add(1, Ordering::Relaxed) as usize; - - let table = FutureBackedCache::default(); - table.insert_sync(desc.root_ino.addr, desc.root_ino); - let child_inner = Arc::new(ChildInner::create(table, desc.provider.clone())); - - let bridge = ConcurrentBridge::new(); - bridge.insert(outer_ino, desc.root_ino.addr); - - drop(self.inner.slots.insert_sync( - slot_idx, - ChildSlot { - inner: child_inner, - bridge, - }, - )); - let _ = self.inner.addr_to_slot.insert_sync(outer_ino, slot_idx); + let (outer_ino, slot_idx) = self.create_child_slot(desc); vac.insert_entry(slot_idx); - return outer_ino; } } // Fallback: name was stale, create fresh. This path is rare. 
- let outer_ino = self.allocate_ino(); - #[expect( - clippy::cast_possible_truncation, - reason = "slot index fits in usize on 64-bit" - )] - let slot_idx = self.inner.next_slot.fetch_add(1, Ordering::Relaxed) as usize; - - let table = FutureBackedCache::default(); - table.insert_sync(desc.root_ino.addr, desc.root_ino); - let child_inner = Arc::new(ChildInner::create(table, desc.provider.clone())); - - let bridge = ConcurrentBridge::new(); - bridge.insert(outer_ino, desc.root_ino.addr); - - drop(self.inner.slots.insert_sync( - slot_idx, - ChildSlot { - inner: child_inner, - bridge, - }, - )); - let _ = self.inner.addr_to_slot.insert_sync(outer_ino, slot_idx); + let (outer_ino, slot_idx) = self.create_child_slot(desc); drop( self.inner .name_to_slot From 5e31225a2e11801bcfffba031d55441e4ad0461f Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Fri, 20 Feb 2026 14:32:54 -0800 Subject: [PATCH 21/58] test: add integration tests for generic CompositeFs --- tests/common/composite_mocks.rs | 55 ++++++ tests/common/mod.rs | 1 + tests/composite_fs_tests.rs | 285 ++++++++++++++++++++++++++++++++ 3 files changed, 341 insertions(+) create mode 100644 tests/common/composite_mocks.rs create mode 100644 tests/composite_fs_tests.rs diff --git a/tests/common/composite_mocks.rs b/tests/common/composite_mocks.rs new file mode 100644 index 00000000..413621d3 --- /dev/null +++ b/tests/common/composite_mocks.rs @@ -0,0 +1,55 @@ +#![allow(missing_docs, clippy::unwrap_used)] + +use std::collections::HashMap; +use std::ffi::{OsStr, OsString}; +use std::sync::Arc; + +use git_fs::fs::INode; +use git_fs::fs::composite::{ChildDescriptor, CompositeRoot}; + +use super::async_fs_mocks::MockFsDataProvider; + +/// A mock `CompositeRoot` that resolves children from a fixed map. 
+pub struct MockRoot { + pub children: Arc>, +} + +impl MockRoot { + pub fn new(children: HashMap) -> Self { + Self { + children: Arc::new(children), + } + } +} + +impl CompositeRoot for MockRoot { + type ChildDP = MockFsDataProvider; + + async fn resolve_child( + &self, + name: &OsStr, + ) -> Result>, std::io::Error> { + Ok(self + .children + .get(name) + .map(|(provider, root_ino)| ChildDescriptor { + name: name.to_os_string(), + provider: provider.clone(), + root_ino: *root_ino, + })) + } + + async fn list_children( + &self, + ) -> Result>, std::io::Error> { + Ok(self + .children + .iter() + .map(|(name, (provider, root_ino))| ChildDescriptor { + name: name.clone(), + provider: provider.clone(), + root_ino: *root_ino, + }) + .collect()) + } +} diff --git a/tests/common/mod.rs b/tests/common/mod.rs index 2729c866..96aedec1 100644 --- a/tests/common/mod.rs +++ b/tests/common/mod.rs @@ -1,6 +1,7 @@ #![allow(dead_code, missing_docs, clippy::unwrap_used)] pub mod async_fs_mocks; +pub mod composite_mocks; use std::sync::{Arc, Mutex}; use std::time::Duration; diff --git a/tests/composite_fs_tests.rs b/tests/composite_fs_tests.rs new file mode 100644 index 00000000..d6470a6a --- /dev/null +++ b/tests/composite_fs_tests.rs @@ -0,0 +1,285 @@ +#![allow(clippy::unwrap_used, clippy::expect_used, missing_docs)] + +mod common; + +use std::collections::HashMap; +use std::ffi::{OsStr, OsString}; + +use bytes::Bytes; + +use git_fs::cache::async_backed::FutureBackedCache; +use git_fs::fs::async_fs::AsyncFs; +use git_fs::fs::composite::CompositeFs; +use git_fs::fs::{INode, INodeType, LoadedAddr, OpenFlags}; + +use common::async_fs_mocks::{MockFsDataProvider, MockFsState, make_inode}; +use common::composite_mocks::MockRoot; + +/// Build a child data provider with a root directory and a set of children. +/// +/// Each child is `(name, addr, itype, size)`. Files get auto-generated content +/// of the form `"content of {name}"`. 
+fn make_child_provider( + root_addr: u64, + children: &[(&str, u64, INodeType, u64)], +) -> (MockFsDataProvider, INode) { + let root = make_inode(root_addr, INodeType::Directory, 0, None); + let mut state = MockFsState::default(); + let mut dir_entries = Vec::new(); + for (name, addr, itype, size) in children { + let child = make_inode(*addr, *itype, *size, Some(root_addr)); + state + .lookups + .insert((root_addr, OsString::from(name)), child); + dir_entries.push((OsString::from(name), child)); + if *itype == INodeType::File { + state + .file_contents + .insert(*addr, Bytes::from(format!("content of {name}"))); + } + } + state.directories.insert(root_addr, dir_entries); + (MockFsDataProvider::new(state), root) +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn composite_root_lookup_resolves_child() { + let (provider, root_ino) = make_child_provider(100, &[("file.txt", 101, INodeType::File, 42)]); + + let mut children = HashMap::new(); + children.insert(OsString::from("repo-a"), (provider, root_ino)); + + let mock_root = MockRoot::new(children); + let composite = CompositeFs::new(mock_root, (1000, 1000)); + let root_inode = composite.make_root_inode(); + + let table = FutureBackedCache::default(); + table.insert_sync(1, root_inode); + let afs = AsyncFs::new_preseeded(composite, &table); + + let tracked = afs + .lookup(LoadedAddr(1), OsStr::new("repo-a")) + .await + .unwrap(); + + assert_eq!( + tracked.inode.itype, + INodeType::Directory, + "child should appear as a directory at composite level" + ); + assert_ne!( + tracked.inode.addr, 1, + "child should have a composite-level address different from root" + ); + assert_eq!( + tracked.inode.parent, + Some(1), + "child directory should have the composite root as parent" + ); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn composite_root_readdir_lists_children() { + let (prov_a, root_a) = make_child_provider(100, &[]); + let (prov_b, root_b) = 
make_child_provider(200, &[]); + + let mut children = HashMap::new(); + children.insert(OsString::from("alpha"), (prov_a, root_a)); + children.insert(OsString::from("beta"), (prov_b, root_b)); + + let mock_root = MockRoot::new(children); + let composite = CompositeFs::new(mock_root, (1000, 1000)); + let root_inode = composite.make_root_inode(); + + let table = FutureBackedCache::default(); + table.insert_sync(1, root_inode); + let afs = AsyncFs::new_preseeded(composite, &table); + + let mut entries = Vec::new(); + afs.readdir(LoadedAddr(1), 0, |de, _offset| { + entries.push(de.name.to_os_string()); + false + }) + .await + .unwrap(); + + entries.sort(); + assert_eq!(entries.len(), 2, "should list both children"); + assert_eq!(entries[0], "alpha"); + assert_eq!(entries[1], "beta"); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn composite_delegated_lookup_reaches_child() { + let (provider, root_ino) = make_child_provider( + 100, + &[ + ("readme.md", 101, INodeType::File, 256), + ("src", 102, INodeType::Directory, 0), + ], + ); + + let mut children = HashMap::new(); + children.insert(OsString::from("my-repo"), (provider, root_ino)); + + let mock_root = MockRoot::new(children); + let composite = CompositeFs::new(mock_root, (1000, 1000)); + let root_inode = composite.make_root_inode(); + + let table = FutureBackedCache::default(); + table.insert_sync(1, root_inode); + let afs = AsyncFs::new_preseeded(composite, &table); + + // First, lookup the child at root level. + let child_dir = afs + .lookup(LoadedAddr(1), OsStr::new("my-repo")) + .await + .unwrap(); + let child_addr = child_dir.inode.addr; + + // Then, lookup a file inside the child. + let file = afs + .lookup(LoadedAddr(child_addr), OsStr::new("readme.md")) + .await + .unwrap(); + + assert_eq!(file.inode.itype, INodeType::File); + assert_eq!(file.inode.size, 256); + + // Also lookup a subdirectory inside the child. 
+ let subdir = afs + .lookup(LoadedAddr(child_addr), OsStr::new("src")) + .await + .unwrap(); + + assert_eq!(subdir.inode.itype, INodeType::Directory); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn composite_open_and_read_through_child() { + let (provider, root_ino) = make_child_provider(100, &[("hello.txt", 101, INodeType::File, 20)]); + + let mut children = HashMap::new(); + children.insert(OsString::from("repo"), (provider, root_ino)); + + let mock_root = MockRoot::new(children); + let composite = CompositeFs::new(mock_root, (1000, 1000)); + let root_inode = composite.make_root_inode(); + + let table = FutureBackedCache::default(); + table.insert_sync(1, root_inode); + let afs = AsyncFs::new_preseeded(composite, &table); + + // Navigate to the file. + let child_dir = afs.lookup(LoadedAddr(1), OsStr::new("repo")).await.unwrap(); + let file_tracked = afs + .lookup(LoadedAddr(child_dir.inode.addr), OsStr::new("hello.txt")) + .await + .unwrap(); + let file_addr = file_tracked.inode.addr; + + // Open and read. 
+ let open_file = afs + .open(LoadedAddr(file_addr), OpenFlags::empty()) + .await + .unwrap(); + let data = open_file.read(0, 1024).await.unwrap(); + + assert_eq!( + data, + Bytes::from("content of hello.txt"), + "should read the file content through the composite layer" + ); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn composite_lookup_unknown_child_returns_enoent() { + let (provider, root_ino) = make_child_provider(100, &[]); + + let mut children = HashMap::new(); + children.insert(OsString::from("existing"), (provider, root_ino)); + + let mock_root = MockRoot::new(children); + let composite = CompositeFs::new(mock_root, (1000, 1000)); + let root_inode = composite.make_root_inode(); + + let table = FutureBackedCache::default(); + table.insert_sync(1, root_inode); + let afs = AsyncFs::new_preseeded(composite, &table); + + let err = afs + .lookup(LoadedAddr(1), OsStr::new("nonexistent")) + .await + .unwrap_err(); + + assert_eq!( + err.raw_os_error(), + Some(libc::ENOENT), + "looking up a nonexistent child at root should return ENOENT" + ); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn composite_readdir_delegated_lists_child_contents() { + let (provider, root_ino) = make_child_provider( + 100, + &[ + ("a.rs", 101, INodeType::File, 10), + ("b.rs", 102, INodeType::File, 20), + ("lib", 103, INodeType::Directory, 0), + ], + ); + + let mut children = HashMap::new(); + children.insert(OsString::from("repo"), (provider, root_ino)); + + let mock_root = MockRoot::new(children); + let composite = CompositeFs::new(mock_root, (1000, 1000)); + let root_inode = composite.make_root_inode(); + + let table = FutureBackedCache::default(); + table.insert_sync(1, root_inode); + let afs = AsyncFs::new_preseeded(composite, &table); + + // Navigate into the child. + let child_dir = afs.lookup(LoadedAddr(1), OsStr::new("repo")).await.unwrap(); + + // Readdir inside the child. 
+ let mut entries = Vec::new(); + afs.readdir(LoadedAddr(child_dir.inode.addr), 0, |de, _offset| { + entries.push((de.name.to_os_string(), de.inode.itype)); + false + }) + .await + .unwrap(); + + entries.sort_by(|(a, _), (b, _)| a.cmp(b)); + assert_eq!(entries.len(), 3); + assert_eq!(entries[0], (OsString::from("a.rs"), INodeType::File)); + assert_eq!(entries[1], (OsString::from("b.rs"), INodeType::File)); + assert_eq!(entries[2], (OsString::from("lib"), INodeType::Directory)); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn composite_repeated_lookup_returns_same_addr() { + let (provider, root_ino) = make_child_provider(100, &[]); + + let mut children = HashMap::new(); + children.insert(OsString::from("repo"), (provider, root_ino)); + + let mock_root = MockRoot::new(children); + let composite = CompositeFs::new(mock_root, (1000, 1000)); + let root_inode = composite.make_root_inode(); + + let table = FutureBackedCache::default(); + table.insert_sync(1, root_inode); + let afs = AsyncFs::new_preseeded(composite, &table); + + let first = afs.lookup(LoadedAddr(1), OsStr::new("repo")).await.unwrap(); + let second = afs.lookup(LoadedAddr(1), OsStr::new("repo")).await.unwrap(); + + assert_eq!( + first.inode.addr, second.inode.addr, + "repeated lookups for the same child should return the same composite address" + ); +} From aa989f7118c8159bce364af0b9d8f535022c6955 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Fri, 20 Feb 2026 15:03:06 -0800 Subject: [PATCH 22/58] feat: add domain roots (MesaRoot, StandardOrgRoot, GithubOrgRoot) and OrgChildDP enum --- src/fs/mescloud/mod.rs | 1 + src/fs/mescloud/roots.rs | 483 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 484 insertions(+) create mode 100644 src/fs/mescloud/roots.rs diff --git a/src/fs/mescloud/mod.rs b/src/fs/mescloud/mod.rs index 15a70725..a9e5e155 100644 --- a/src/fs/mescloud/mod.rs +++ b/src/fs/mescloud/mod.rs @@ -32,6 +32,7 @@ pub use org::OrgConfig; use org::OrgFs; pub mod 
repo; +mod roots; struct HeaderInjector<'a>(&'a mut reqwest::header::HeaderMap); diff --git a/src/fs/mescloud/roots.rs b/src/fs/mescloud/roots.rs new file mode 100644 index 00000000..aafe0c4c --- /dev/null +++ b/src/fs/mescloud/roots.rs @@ -0,0 +1,483 @@ +//! Domain-specific [`CompositeRoot`] implementations and the [`OrgChildDP`] enum. +//! +//! Bridges the generic `CompositeFs` from `lib/fs/composite.rs` with +//! Mesa/GitHub-specific org and repo resolution logic. +//! +//! These types are not yet wired into the daemon entry point; they will be +//! connected in a follow-up change that replaces the old `MesaFS` + `OrgFs` +//! pipeline. +#![expect(dead_code, reason = "wired in the follow-up daemon change")] + +use std::ffi::{OsStr, OsString}; +use std::future::Future; +use std::sync::Arc; +use std::time::SystemTime; + +use base64::Engine as _; +use futures::TryStreamExt as _; +use mesa_dev::MesaClient; +use tracing::warn; + +use git_fs::cache::fcache::FileCache; +use git_fs::fs::async_fs::{FileReader, FsDataProvider}; +use git_fs::fs::composite::{ChildDescriptor, CompositeFs, CompositeReader, CompositeRoot}; +use git_fs::fs::{INode, INodeType, InodeAddr, InodePerms, OpenFlags}; + +use super::common::MesaApiError; +use super::repo::{MesFileReader, MesRepoProvider}; +use crate::app_config::CacheConfig; + +const CHILD_ROOT_ADDR: InodeAddr = 1; + +fn mesa_api_error_to_io(e: MesaApiError) -> std::io::Error { + match &e { + MesaApiError::Response { status, .. } if *status == 404 => { + std::io::Error::from_raw_os_error(libc::ENOENT) + } + MesaApiError::Reqwest(_) + | MesaApiError::ReqwestMiddleware(_) + | MesaApiError::Serde(_) + | MesaApiError::SerdePath(_) + | MesaApiError::Io(_) + | MesaApiError::Response { .. } => std::io::Error::other(e), + } +} + +/// Create a [`MesRepoProvider`] and its root [`INode`] for a given repo. 
+async fn create_repo_provider( + client: &MesaClient, + org_name: &str, + repo_name: &str, + ref_: &str, + fs_owner: (u32, u32), + cache_config: &CacheConfig, +) -> (MesRepoProvider, INode) { + let file_cache = match cache_config.max_size { + Some(max_size) if max_size.as_u64() > 0 => { + let cache_dir = cache_config.path.join(org_name).join(repo_name); + let max_bytes = max_size.as_u64().try_into().unwrap_or(usize::MAX); + match FileCache::new(&cache_dir, max_bytes).await { + Ok(cache) => Some(Arc::new(cache)), + Err(e) => { + warn!(error = ?e, org = %org_name, repo = %repo_name, + "failed to create file cache, continuing without caching"); + None + } + } + } + _ => None, + }; + + let provider = MesRepoProvider::new( + client.clone(), + org_name.to_owned(), + repo_name.to_owned(), + ref_.to_owned(), + fs_owner, + file_cache, + ); + + provider.seed_root_path(CHILD_ROOT_ADDR); + + let now = SystemTime::now(); + let root_ino = INode { + addr: CHILD_ROOT_ADDR, + permissions: InodePerms::from_bits_truncate(0o755), + uid: fs_owner.0, + gid: fs_owner.1, + create_time: now, + last_modified_at: now, + parent: None, + size: 0, + itype: INodeType::Directory, + }; + + (provider, root_ino) +} + +/// Returns `Ok(())` if the error is a 404; otherwise returns the IO error. +/// +/// Callers use this to treat 404 as "not found" (return `Ok(None)`) while +/// propagating all other API errors. +fn check_not_found(e: MesaApiError) -> Result<(), std::io::Error> { + match &e { + MesaApiError::Response { status, .. } if *status == 404 => Ok(()), + MesaApiError::Reqwest(_) + | MesaApiError::ReqwestMiddleware(_) + | MesaApiError::Serde(_) + | MesaApiError::SerdePath(_) + | MesaApiError::Io(_) + | MesaApiError::Response { .. 
} => Err(mesa_api_error_to_io(e)), + } +} + +pub(super) struct StandardOrgRoot { + client: MesaClient, + org_name: String, + cache_config: CacheConfig, + fs_owner: (u32, u32), +} + +impl StandardOrgRoot { + pub(super) fn new( + client: MesaClient, + org_name: String, + cache_config: CacheConfig, + fs_owner: (u32, u32), + ) -> Self { + Self { + client, + org_name, + cache_config, + fs_owner, + } + } +} + +impl CompositeRoot for StandardOrgRoot { + type ChildDP = MesRepoProvider; + + async fn resolve_child( + &self, + name: &OsStr, + ) -> Result>, std::io::Error> { + let name_str = name.to_str().ok_or_else(|| { + std::io::Error::new( + std::io::ErrorKind::InvalidData, + "repo name contains non-UTF-8 characters", + ) + })?; + + let repo = match self + .client + .org(&self.org_name) + .repos() + .at(name_str) + .get() + .await + .map_err(MesaApiError::from) + { + Ok(repo) => repo, + Err(e) => { + check_not_found(e)?; + return Ok(None); + } + }; + + // Single-repo GET returns `default_branch: String` (non-optional), + // unlike the list endpoint which returns `Option`. 
+ let (provider, root_ino) = create_repo_provider( + &self.client, + &self.org_name, + name_str, + &repo.default_branch, + self.fs_owner, + &self.cache_config, + ) + .await; + + Ok(Some(ChildDescriptor { + name: name.to_os_string(), + provider, + root_ino, + })) + } + + async fn list_children(&self) -> Result>, std::io::Error> { + let repos: Vec = self + .client + .org(&self.org_name) + .repos() + .list(None) + .try_collect() + .await + .map_err(MesaApiError::from) + .map_err(mesa_api_error_to_io)?; + + let mut children = Vec::with_capacity(repos.len()); + for repo in repos { + let Some(repo_name) = repo.name else { + continue; + }; + let default_branch = repo.default_branch.unwrap_or_else(|| "main".to_owned()); + + let (provider, root_ino) = create_repo_provider( + &self.client, + &self.org_name, + &repo_name, + &default_branch, + self.fs_owner, + &self.cache_config, + ) + .await; + + children.push(ChildDescriptor { + name: OsString::from(repo_name), + provider, + root_ino, + }); + } + + Ok(children) + } +} + +pub(super) struct GithubRepoRoot { + client: MesaClient, + org_name: String, + owner: String, + cache_config: CacheConfig, + fs_owner: (u32, u32), +} + +impl CompositeRoot for GithubRepoRoot { + type ChildDP = MesRepoProvider; + + async fn resolve_child( + &self, + name: &OsStr, + ) -> Result>, std::io::Error> { + let repo_name = name.to_str().ok_or_else(|| { + std::io::Error::new( + std::io::ErrorKind::InvalidData, + "repo name contains non-UTF-8 characters", + ) + })?; + + let full_decoded = format!("{}/{}", self.owner, repo_name); + let encoded = base64::engine::general_purpose::STANDARD.encode(&full_decoded); + + let repo = match self + .client + .org(&self.org_name) + .repos() + .at(&encoded) + .get() + .await + .map_err(MesaApiError::from) + { + Ok(repo) => repo, + Err(e) => { + check_not_found(e)?; + return Ok(None); + } + }; + + // Single-repo GET returns `default_branch: String` (non-optional). 
+ let (provider, root_ino) = create_repo_provider( + &self.client, + &self.org_name, + &encoded, + &repo.default_branch, + self.fs_owner, + &self.cache_config, + ) + .await; + + Ok(Some(ChildDescriptor { + name: name.to_os_string(), + provider, + root_ino, + })) + } + + async fn list_children(&self) -> Result>, std::io::Error> { + Err(std::io::Error::from_raw_os_error(libc::EPERM)) + } +} + +pub(super) struct GithubOrgRoot { + client: MesaClient, + org_name: String, + cache_config: CacheConfig, + fs_owner: (u32, u32), +} + +impl GithubOrgRoot { + pub(super) fn new( + client: MesaClient, + org_name: String, + cache_config: CacheConfig, + fs_owner: (u32, u32), + ) -> Self { + Self { + client, + org_name, + cache_config, + fs_owner, + } + } +} + +impl CompositeRoot for GithubOrgRoot { + type ChildDP = CompositeFs; + + async fn resolve_child( + &self, + name: &OsStr, + ) -> Result>, std::io::Error> { + let owner = name.to_str().ok_or_else(|| { + std::io::Error::new( + std::io::ErrorKind::InvalidData, + "owner name contains non-UTF-8 characters", + ) + })?; + + let repo_root = GithubRepoRoot { + client: self.client.clone(), + org_name: self.org_name.clone(), + owner: owner.to_owned(), + cache_config: self.cache_config.clone(), + fs_owner: self.fs_owner, + }; + + let composite = CompositeFs::new(repo_root, self.fs_owner); + let root_ino = composite.make_root_inode(); + + Ok(Some(ChildDescriptor { + name: name.to_os_string(), + provider: composite, + root_ino, + })) + } + + async fn list_children(&self) -> Result>, std::io::Error> { + Err(std::io::Error::from_raw_os_error(libc::EPERM)) + } +} + +#[derive(Clone)] +pub(super) enum OrgChildDP { + Standard(CompositeFs), + Github(CompositeFs), +} + +impl OrgChildDP { + fn make_root_inode(&self) -> INode { + match self { + Self::Standard(c) => c.make_root_inode(), + Self::Github(c) => c.make_root_inode(), + } + } +} + +impl FsDataProvider for OrgChildDP { + type Reader = OrgChildReader; + + fn lookup( + &self, + parent: INode, 
+ name: &OsStr, + ) -> impl Future> + Send { + let this = self.clone(); + let name = name.to_os_string(); + async move { + match this { + Self::Standard(c) => c.lookup(parent, &name).await, + Self::Github(c) => c.lookup(parent, &name).await, + } + } + } + + fn readdir( + &self, + parent: INode, + ) -> impl Future, std::io::Error>> + Send { + let this = self.clone(); + async move { + match this { + Self::Standard(c) => c.readdir(parent).await, + Self::Github(c) => c.readdir(parent).await, + } + } + } + + fn open( + &self, + inode: INode, + flags: OpenFlags, + ) -> impl Future> + Send { + let this = self.clone(); + async move { + match this { + Self::Standard(c) => c.open(inode, flags).await.map(OrgChildReader::Standard), + Self::Github(c) => c.open(inode, flags).await.map(OrgChildReader::Github), + } + } + } +} + +pub(super) enum OrgChildReader { + Standard(CompositeReader), + Github(CompositeReader>), +} + +impl std::fmt::Debug for OrgChildReader { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Standard(_) => f.debug_tuple("Standard").finish(), + Self::Github(_) => f.debug_tuple("Github").finish(), + } + } +} + +impl FileReader for OrgChildReader { + fn read( + &self, + offset: u64, + size: u32, + ) -> impl Future> + Send { + match self { + Self::Standard(r) => futures::future::Either::Left(r.read(offset, size)), + Self::Github(r) => futures::future::Either::Right(r.read(offset, size)), + } + } + + fn close(&self) -> impl Future> + Send { + match self { + Self::Standard(r) => futures::future::Either::Left(r.close()), + Self::Github(r) => futures::future::Either::Right(r.close()), + } + } +} + +pub(super) struct MesaRoot { + orgs: Vec<(OsString, OrgChildDP)>, +} + +impl MesaRoot { + pub(super) fn new(orgs: Vec<(OsString, OrgChildDP)>) -> Self { + Self { orgs } + } +} + +impl CompositeRoot for MesaRoot { + type ChildDP = OrgChildDP; + + async fn resolve_child( + &self, + name: &OsStr, + ) -> Result>, std::io::Error> { + 
let found = self.orgs.iter().find(|(n, _)| n == name); + match found { + Some((_, dp)) => Ok(Some(ChildDescriptor { + name: name.to_os_string(), + provider: dp.clone(), + root_ino: dp.make_root_inode(), + })), + None => Ok(None), + } + } + + async fn list_children(&self) -> Result>, std::io::Error> { + Ok(self + .orgs + .iter() + .map(|(name, dp)| ChildDescriptor { + name: name.clone(), + provider: dp.clone(), + root_ino: dp.make_root_inode(), + }) + .collect()) + } +} From 9885de0dc3d56f41a9c055549b35b5d47faf6154 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Fri, 20 Feb 2026 15:23:43 -0800 Subject: [PATCH 23/58] refactor: wire CompositeFs into daemon, delete old composite.rs and ChildFs --- lib/fs/async_fs.rs | 3 +- src/daemon.rs | 66 ++--- src/fs/mescloud/common.rs | 139 +---------- src/fs/mescloud/composite.rs | 456 ----------------------------------- src/fs/mescloud/mod.rs | 378 +---------------------------- src/fs/mescloud/org.rs | 390 ------------------------------ src/fs/mescloud/repo.rs | 248 +------------------ src/fs/mescloud/roots.rs | 40 +-- 8 files changed, 73 insertions(+), 1647 deletions(-) delete mode 100644 src/fs/mescloud/composite.rs delete mode 100644 src/fs/mescloud/org.rs diff --git a/lib/fs/async_fs.rs b/lib/fs/async_fs.rs index 761149d2..1f81a87e 100644 --- a/lib/fs/async_fs.rs +++ b/lib/fs/async_fs.rs @@ -370,8 +370,7 @@ impl<'tbl, DP: FsDataProvider> AsyncFs<'tbl, DP> { /// /// The `is_populated` check-then-populate is **not** atomic. If two /// concurrent callers invoke `readdir` for the same parent, both may call - /// `dp.readdir()` and insert duplicate children. This is safe when the - /// caller serializes access (e.g. via `&mut self` on the `Fs` trait). + /// `dp.readdir()` and insert duplicate children. /// /// TODO(MES-746): Implement `opendir` and `releasedir` to snapshot directory contents and /// avoid racing with `lookup`/`createfile`. 
diff --git a/src/daemon.rs b/src/daemon.rs index 0a7a9f31..102e476b 100644 --- a/src/daemon.rs +++ b/src/daemon.rs @@ -1,7 +1,6 @@ use tokio::select; use crate::app_config; -use crate::fs::mescloud::{MesaFS, OrgConfig}; use tracing::{debug, error, info}; mod managed_fuse { @@ -15,12 +14,11 @@ mod managed_fuse { use nix::errno::Errno; use git_fs::cache::async_backed::FutureBackedCache; - use git_fs::fs::{INode, INodeType, InodePerms}; - use super::{MesaFS, OrgConfig, app_config, debug, error}; - use crate::fs::mescloud::MesaFsProvider; + use super::{app_config, debug, error}; use fuser::BackgroundSession; use git_fs::fs::fuser::FuserAdapter; + use secrecy::ExposeSecret as _; pub struct FuseCoreScope { _session: BackgroundSession, @@ -40,32 +38,44 @@ mod managed_fuse { config: app_config::Config, handle: tokio::runtime::Handle, ) -> Result { - let orgs = config - .organizations - .iter() - .map(|(org_name, org)| OrgConfig { - name: org_name.clone(), - api_key: org.api_key.clone(), - }); - let mesa_fs = MesaFS::new(orgs, (config.uid, config.gid), &config.cache); + let fs_owner = (config.uid, config.gid); + + let mut org_children = Vec::new(); + for (org_name, org_conf) in &config.organizations { + let client = + crate::fs::mescloud::build_mesa_client(org_conf.api_key.expose_secret()); + let dp = if org_name == "github" { + let github_org_root = crate::fs::mescloud::roots::GithubOrgRoot::new( + client, + org_name.clone(), + config.cache.clone(), + fs_owner, + ); + crate::fs::mescloud::roots::OrgChildDP::Github( + git_fs::fs::composite::CompositeFs::new(github_org_root, fs_owner), + ) + } else { + let standard_org_root = crate::fs::mescloud::roots::StandardOrgRoot::new( + client, + org_name.clone(), + config.cache.clone(), + fs_owner, + ); + crate::fs::mescloud::roots::OrgChildDP::Standard( + git_fs::fs::composite::CompositeFs::new(standard_org_root, fs_owner), + ) + }; + org_children.push((std::ffi::OsString::from(org_name), dp)); + } + + let mesa_root = 
crate::fs::mescloud::roots::MesaRoot::new(org_children); + let composite = git_fs::fs::composite::CompositeFs::new(mesa_root, fs_owner); let table = FutureBackedCache::default(); - let now = std::time::SystemTime::now(); - let root = INode { - addr: 1, - permissions: InodePerms::from_bits_truncate(0o755), - uid: config.uid, - gid: config.gid, - create_time: now, - last_modified_at: now, - parent: None, - size: 0, - itype: INodeType::Directory, - }; - table.insert_sync(1, root); - - let provider = MesaFsProvider::new(mesa_fs); - let fuse_adapter = FuserAdapter::new(table, provider, handle); + let root_inode = composite.make_root_inode(); + table.insert_sync(1, root_inode); + + let fuse_adapter = FuserAdapter::new(table, composite, handle); let mount_opts = [ fuser::MountOption::FSName("git-fs".to_owned()), fuser::MountOption::RO, diff --git a/src/fs/mescloud/common.rs b/src/fs/mescloud/common.rs index 473b5e54..cf57e392 100644 --- a/src/fs/mescloud/common.rs +++ b/src/fs/mescloud/common.rs @@ -1,9 +1,3 @@ -//! Shared types and helpers used by both `MesaFS` and `RepoFs`. 
- -use std::ffi::{OsStr, OsString}; - -use bytes::Bytes; -use git_fs::fs::{FileHandle, INode, InodeAddr, OpenFlags as LibOpenFlags}; use mesa_dev::low_level::apis; use thiserror::Error; @@ -46,129 +40,16 @@ impl From> for MesaAp } } -#[derive(Debug, Error)] -pub enum LookupError { - #[error("inode not found")] - InodeNotFound, - - #[error("remote mesa error")] - RemoteMesaError(#[from] MesaApiError), -} - -#[derive(Debug, Error)] -pub enum GetAttrError { - #[error("inode not found")] - InodeNotFound, -} - -#[derive(Debug, Clone, Copy, Error)] -pub enum OpenError { - #[error("inode not found")] - InodeNotFound, -} - -#[derive(Debug, Error)] -pub enum ReadError { - #[error("file not open")] - FileNotOpen, - - #[error("inode not found")] - InodeNotFound, - - #[error("remote mesa error")] - RemoteMesaError(#[from] MesaApiError), - - #[error("content is not a file")] - NotAFile, - - #[error("base64 decode error: {0}")] - Base64Decode(#[from] base64::DecodeError), -} - -#[derive(Debug, Error)] -pub enum ReadDirError { - #[error("inode not found")] - InodeNotFound, - - #[error("remote mesa error")] - RemoteMesaError(#[from] MesaApiError), - - #[error("inode is not a directory")] - NotADirectory, - - #[error("operation not permitted")] - NotPermitted, -} - -impl From for ReadDirError { - fn from(e: LookupError) -> Self { - match e { - LookupError::RemoteMesaError(api) => Self::RemoteMesaError(api), - LookupError::InodeNotFound => Self::InodeNotFound, +pub(super) fn mesa_api_error_to_io(e: MesaApiError) -> std::io::Error { + match &e { + MesaApiError::Response { status, .. } if *status == 404 => { + std::io::Error::from_raw_os_error(libc::ENOENT) } - } -} - -#[derive(Debug, Error)] -pub enum ReleaseError { - #[error("file not open")] - FileNotOpen, -} - -/// A directory entry for readdir results, using lib types. 
-pub struct FsDirEntry { - pub ino: InodeAddr, - pub name: OsString, -} - -/// Trait for child filesystems composed by [`CompositeFs`](super::composite::CompositeFs). -/// -/// Uses lib types (`INode`, `InodeAddr`) directly — no conversion to/from `FileAttr`. -/// Replaces the old `Fs + InodeCachePeek` bound. -#[async_trait::async_trait] -pub(super) trait ChildFs: Send + Sync { - /// Look up a child by name within the given parent directory. - async fn lookup(&mut self, parent: InodeAddr, name: &OsStr) -> Result; - - /// List all children of a directory, returning full `INode` data for each. - async fn readdir(&mut self, ino: InodeAddr) -> Result, ReadDirError>; - - /// Open a file for reading. - async fn open(&mut self, ino: InodeAddr, flags: LibOpenFlags) -> Result; - - /// Read data from an open file. - async fn read( - &mut self, - ino: InodeAddr, - fh: FileHandle, - offset: u64, - size: u32, - ) -> Result; - - /// Release (close) a file handle. - async fn release(&mut self, ino: InodeAddr, fh: FileHandle) -> Result<(), ReleaseError>; -} - -// Tests kept inline: these types live in the binary crate and are not -// re-exported through the `git_fs` lib, so integration tests in `tests/` -// cannot access them. -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn lookup_inode_not_found_converts_to_readdir_inode_not_found() { - let err: ReadDirError = LookupError::InodeNotFound.into(); - assert!(matches!(err, ReadDirError::InodeNotFound)); - } - - #[test] - fn lookup_remote_error_converts_to_readdir_remote_error() { - let api_err = MesaApiError::Response { - status: 500, - body: "test".to_owned(), - }; - let err: ReadDirError = LookupError::RemoteMesaError(api_err).into(); - assert!(matches!(err, ReadDirError::RemoteMesaError(_))); + MesaApiError::Reqwest(_) + | MesaApiError::ReqwestMiddleware(_) + | MesaApiError::Serde(_) + | MesaApiError::SerdePath(_) + | MesaApiError::Io(_) + | MesaApiError::Response { .. 
} => std::io::Error::other(e), } } diff --git a/src/fs/mescloud/composite.rs b/src/fs/mescloud/composite.rs deleted file mode 100644 index 91c35806..00000000 --- a/src/fs/mescloud/composite.rs +++ /dev/null @@ -1,456 +0,0 @@ -use std::collections::HashMap; -use std::ffi::OsStr; -use std::sync::atomic::{AtomicU64, Ordering}; - -use bytes::Bytes; -use git_fs::cache::async_backed::FutureBackedCache; -use git_fs::fs::dcache::DCache; -use git_fs::fs::{ - AsyncFsStats, FileHandle, INode, INodeType, InodeAddr, InodePerms, LoadedAddr, OpenFlags, -}; -use rustc_hash::FxHashMap; -use tracing::{instrument, trace}; - -use super::common::{ - ChildFs, FsDirEntry, GetAttrError, LookupError, OpenError, ReadDirError, ReadError, - ReleaseError, -}; - -/// Bidirectional inode mapping between outer (composite) and inner (child) address spaces. -/// -/// Convention: **outer = left, inner = right**. -pub(super) struct InodeBridge { - map: bimap::BiMap, -} - -impl InodeBridge { - pub fn new() -> Self { - Self { - map: bimap::BiMap::new(), - } - } - - pub fn insert(&mut self, outer: InodeAddr, inner: InodeAddr) { - self.map.insert(outer, inner); - } - - pub fn forward(&self, outer: InodeAddr) -> Option { - self.map.get_by_left(&outer).copied() - } - - #[expect(dead_code, reason = "will be needed by future callers")] - pub fn backward(&self, inner: InodeAddr) -> Option { - self.map.get_by_right(&inner).copied() - } - - /// Look up inner->outer, or allocate a new outer address if unmapped. 
- pub fn backward_or_insert( - &mut self, - inner: InodeAddr, - allocate: impl FnOnce() -> InodeAddr, - ) -> InodeAddr { - if let Some(&outer) = self.map.get_by_right(&inner) { - outer - } else { - let outer = allocate(); - self.map.insert(outer, inner); - outer - } - } - - pub fn remove_by_outer(&mut self, outer: InodeAddr) { - self.map.remove_by_left(&outer); - } - - #[expect(dead_code, reason = "will be needed by future callers")] - pub fn get_inner(&self, outer: InodeAddr) -> Option<&InodeAddr> { - self.map.get_by_left(&outer) - } -} - -pub(super) struct ChildSlot { - pub inner: Inner, - pub bridge: InodeBridge, -} - -/// Tracks an open file: which child slot owns it and the inner fh. -struct OpenFileEntry { - slot_idx: usize, - inner_ino: InodeAddr, - inner_fh: FileHandle, -} - -pub(super) struct CompositeFs { - pub(super) inode_table: FutureBackedCache, - pub(super) directory_cache: DCache, - next_ino: AtomicU64, - next_fh: AtomicU64, - refcounts: FxHashMap, - pub(super) readdir_buf: Vec, - open_files: HashMap, - pub(super) child_inodes: HashMap, - pub(super) inode_to_slot: HashMap, - pub(super) slots: Vec>, - fs_owner: (u32, u32), - block_size: u32, -} - -impl CompositeFs { - pub const ROOT_INO: InodeAddr = 1; - - pub fn new(fs_owner: (u32, u32), block_size: u32) -> Self { - let inode_table = FutureBackedCache::default(); - let now = std::time::SystemTime::now(); - let root = INode { - addr: Self::ROOT_INO, - permissions: InodePerms::from_bits_truncate(0o755), - uid: fs_owner.0, - gid: fs_owner.1, - create_time: now, - last_modified_at: now, - parent: None, - size: 0, - itype: INodeType::Directory, - }; - inode_table.insert_sync(Self::ROOT_INO, root); - - let mut refcounts = FxHashMap::default(); - refcounts.insert(Self::ROOT_INO, 1); - - Self { - inode_table, - directory_cache: DCache::new(), - next_ino: AtomicU64::new(Self::ROOT_INO + 1), - next_fh: AtomicU64::new(1), - refcounts, - readdir_buf: Vec::new(), - open_files: HashMap::new(), - child_inodes: 
HashMap::new(), - inode_to_slot: HashMap::new(), - slots: Vec::new(), - fs_owner, - block_size, - } - } - - pub fn allocate_inode(&self) -> InodeAddr { - self.next_ino.fetch_add(1, Ordering::Relaxed) - } - - pub fn fs_owner(&self) -> (u32, u32) { - self.fs_owner - } - - #[expect(dead_code, reason = "available for future use")] - pub fn block_size(&self) -> u32 { - self.block_size - } - - pub fn add_child(&mut self, inner: Inner, child_root_ino: InodeAddr) -> InodeAddr { - self.add_child_with_parent(inner, child_root_ino, Self::ROOT_INO) - } - - pub fn cache_inode(&self, inode: INode) { - self.inode_table.insert_sync(inode.addr, inode); - } - - /// Insert the inode into the table and initialise its refcount to zero. - /// - /// The caller is responsible for bumping the refcount via [`inc_rc`](Self::inc_rc). - pub fn cache_inode_and_init_rc(&mut self, inode: INode) { - let addr = inode.addr; - self.inode_table.insert_sync(addr, inode); - self.refcounts.entry(addr).or_insert(0); - } - - pub fn inc_rc(&mut self, addr: InodeAddr) -> Option { - let rc = self.refcounts.get_mut(&addr)?; - *rc += 1; - Some(*rc) - } - - pub fn slot_for_inode(&self, ino: InodeAddr) -> Option { - self.inode_to_slot.get(&ino).copied() - } - - /// Like [`add_child`](Self::add_child) but sets a custom parent inode - /// instead of always using `ROOT_INO`. 
- pub fn add_child_with_parent( - &mut self, - inner: Inner, - child_root_ino: InodeAddr, - parent_ino: InodeAddr, - ) -> InodeAddr { - let outer_ino = self.allocate_inode(); - let now = std::time::SystemTime::now(); - let inode = INode { - addr: outer_ino, - permissions: InodePerms::from_bits_truncate(0o755), - uid: self.fs_owner.0, - gid: self.fs_owner.1, - create_time: now, - last_modified_at: now, - parent: Some(parent_ino), - size: 0, - itype: INodeType::Directory, - }; - self.inode_table.insert_sync(outer_ino, inode); - - let mut bridge = InodeBridge::new(); - bridge.insert(outer_ino, child_root_ino); - - let idx = self.slots.len(); - self.slots.push(ChildSlot { inner, bridge }); - self.child_inodes.insert(outer_ino, idx); - self.inode_to_slot.insert(outer_ino, idx); - - outer_ino - } -} - -impl CompositeFs { - #[instrument(name = "CompositeFs::delegated_lookup", skip(self, name))] - pub async fn delegated_lookup( - &mut self, - parent: InodeAddr, - name: &OsStr, - ) -> Result { - // Fast path: DCache hit + inode still in table - if let Some(dentry) = self.directory_cache.lookup(LoadedAddr(parent), name) - && let Some(inode) = self.inode_table.get(&dentry.ino.0).await - { - *self.refcounts.entry(inode.addr).or_insert(0) += 1; - return Ok(inode); - } - - // Slow path: delegate to child - let idx = self - .inode_to_slot - .get(&parent) - .copied() - .ok_or(LookupError::InodeNotFound)?; - let inner_parent = self.slots[idx] - .bridge - .forward(parent) - .ok_or(LookupError::InodeNotFound)?; - let inner_inode = self.slots[idx].inner.lookup(inner_parent, name).await?; - - let next_ino = &self.next_ino; - let outer_ino = self.slots[idx] - .bridge - .backward_or_insert(inner_inode.addr, || { - next_ino.fetch_add(1, Ordering::Relaxed) - }); - self.inode_to_slot.insert(outer_ino, idx); - - let remapped = INode { - addr: outer_ino, - ..inner_inode - }; - self.inode_table - .get_or_init(outer_ino, || async move { remapped }) - .await; - - let is_dir = 
matches!(inner_inode.itype, INodeType::Directory); - self.directory_cache - .insert( - LoadedAddr(parent), - name.to_os_string(), - LoadedAddr(outer_ino), - is_dir, - ) - .await; - - *self.refcounts.entry(outer_ino).or_insert(0) += 1; - let rc = self.refcounts[&outer_ino]; - trace!( - outer_ino, - inner_ino = inner_inode.addr, - rc, - "lookup: resolved via delegation" - ); - - Ok(remapped) - } - - #[instrument(name = "CompositeFs::delegated_readdir", skip(self))] - pub async fn delegated_readdir( - &mut self, - ino: InodeAddr, - ) -> Result<&[FsDirEntry], ReadDirError> { - let idx = self - .inode_to_slot - .get(&ino) - .copied() - .ok_or(ReadDirError::InodeNotFound)?; - - if !self.directory_cache.is_populated(LoadedAddr(ino)) { - let inner_ino = self.slots[idx] - .bridge - .forward(ino) - .ok_or(ReadDirError::InodeNotFound)?; - let inner_entries = self.slots[idx].inner.readdir(inner_ino).await?; - - for (name, child_inode) in &inner_entries { - let next_ino = &self.next_ino; - let outer_child = self.slots[idx] - .bridge - .backward_or_insert(child_inode.addr, || { - next_ino.fetch_add(1, Ordering::Relaxed) - }); - self.inode_to_slot.insert(outer_child, idx); - - let remapped = INode { - addr: outer_child, - ..*child_inode - }; - self.inode_table - .get_or_init(outer_child, || async move { remapped }) - .await; - - let is_dir = matches!(child_inode.itype, INodeType::Directory); - self.directory_cache - .insert( - LoadedAddr(ino), - name.clone(), - LoadedAddr(outer_child), - is_dir, - ) - .await; - } - - self.directory_cache.mark_populated(LoadedAddr(ino)); - } - - let mut children = self.directory_cache.readdir(LoadedAddr(ino)).await; - children.sort_unstable_by(|(a, _), (b, _)| a.cmp(b)); - - let mut entries = Vec::with_capacity(children.len()); - for (name, dvalue) in &children { - if let Some(inode) = self.inode_table.get(&dvalue.ino.0).await { - entries.push(FsDirEntry { - ino: inode.addr, - name: name.clone(), - }); - } - } - - self.readdir_buf = entries; - 
Ok(&self.readdir_buf) - } - - #[instrument(name = "CompositeFs::delegated_getattr", skip(self))] - pub async fn delegated_getattr(&self, ino: InodeAddr) -> Result { - self.inode_table - .get(&ino) - .await - .ok_or(GetAttrError::InodeNotFound) - } - - #[expect(dead_code, reason = "will be needed by future callers")] - #[must_use] - pub fn delegated_statfs(&self) -> AsyncFsStats { - AsyncFsStats { - block_size: self.block_size, - total_blocks: 0, - free_blocks: 0, - available_blocks: 0, - total_inodes: self.inode_table.len() as u64, - free_inodes: 0, - max_filename_length: 255, - } - } - - #[instrument(name = "CompositeFs::delegated_open", skip(self))] - pub async fn delegated_open( - &mut self, - ino: InodeAddr, - flags: OpenFlags, - ) -> Result { - let idx = self - .inode_to_slot - .get(&ino) - .copied() - .ok_or(OpenError::InodeNotFound)?; - let inner_ino = self.slots[idx] - .bridge - .forward(ino) - .ok_or(OpenError::InodeNotFound)?; - let inner_fh = self.slots[idx].inner.open(inner_ino, flags).await?; - - let outer_fh = self.next_fh.fetch_add(1, Ordering::Relaxed); - self.open_files.insert( - outer_fh, - OpenFileEntry { - slot_idx: idx, - inner_ino, - inner_fh, - }, - ); - - trace!(ino, outer_fh, inner_fh, "open: assigned fh"); - Ok(outer_fh) - } - - #[instrument(name = "CompositeFs::delegated_read", skip(self))] - pub async fn delegated_read( - &mut self, - fh: FileHandle, - offset: u64, - size: u32, - ) -> Result { - let entry = self.open_files.get(&fh).ok_or(ReadError::FileNotOpen)?; - let slot_idx = entry.slot_idx; - let inner_ino = entry.inner_ino; - let inner_fh = entry.inner_fh; - self.slots[slot_idx] - .inner - .read(inner_ino, inner_fh, offset, size) - .await - } - - #[instrument(name = "CompositeFs::delegated_release", skip(self))] - pub async fn delegated_release(&mut self, fh: FileHandle) -> Result<(), ReleaseError> { - let entry = self - .open_files - .remove(&fh) - .ok_or(ReleaseError::FileNotOpen)?; - let result = self.slots[entry.slot_idx] - 
.inner - .release(entry.inner_ino, entry.inner_fh) - .await; - trace!(fh, "release: cleaned up fh mapping"); - result - } - - /// Returns `true` if the inode was evicted. - /// - /// The composite only manages its own refcounts and inode table. - /// Inner filesystem inodes are managed by the inner FS itself through - /// its own lifecycle; the composite does not propagate forget to children. - #[expect(dead_code, reason = "will be needed by future callers")] - #[must_use] - #[instrument(name = "CompositeFs::delegated_forget", skip(self))] - pub fn delegated_forget(&mut self, ino: InodeAddr, nlookups: u64) -> bool { - let slot_idx = self.inode_to_slot.get(&ino).copied(); - - if let Some(rc) = self.refcounts.get_mut(&ino) { - *rc = rc.saturating_sub(nlookups); - if *rc > 0 { - return false; - } - self.refcounts.remove(&ino); - } else { - return false; - } - - self.inode_table.remove_sync(&ino); - self.child_inodes.remove(&ino); - self.inode_to_slot.remove(&ino); - if let Some(idx) = slot_idx { - self.slots[idx].bridge.remove_by_outer(ino); - } - - true - } -} diff --git a/src/fs/mescloud/mod.rs b/src/fs/mescloud/mod.rs index a9e5e155..ab3745db 100644 --- a/src/fs/mescloud/mod.rs +++ b/src/fs/mescloud/mod.rs @@ -1,38 +1,15 @@ -use std::ffi::{OsStr, OsString}; -use std::future::Future; -use std::sync::Arc; -use std::sync::atomic::{AtomicBool, Ordering}; -use std::time::SystemTime; - -use bytes::Bytes; -use git_fs::fs::{FileHandle, INode, INodeType, InodeAddr, InodePerms, OpenFlags}; use mesa_dev::MesaClient; use opentelemetry::propagation::Injector; -use secrecy::ExposeSecret as _; -use tracing::{instrument, trace, warn}; use tracing_opentelemetry::OpenTelemetrySpanExt as _; -use crate::app_config::CacheConfig; - -pub use common::FsDirEntry; -use composite::CompositeFs; - -pub use common::{GetAttrError, LookupError, OpenError, ReadDirError, ReadError, ReleaseError}; - #[cfg(feature = "staging")] const MESA_API_BASE_URL: &str = "https://staging.depot.mesa.dev/api/v1"; 
#[cfg(not(feature = "staging"))] const MESA_API_BASE_URL: &str = "https://depot.mesa.dev/api/v1"; mod common; -mod composite; - -mod org; -pub use org::OrgConfig; -use org::OrgFs; - pub mod repo; -mod roots; +pub mod roots; struct HeaderInjector<'a>(&'a mut reqwest::header::HeaderMap); @@ -72,7 +49,7 @@ impl reqwest_middleware::Middleware for OtelPropagationMiddleware { } } -fn build_mesa_client(api_key: &str) -> MesaClient { +pub fn build_mesa_client(api_key: &str) -> MesaClient { let client = reqwest_middleware::ClientBuilder::new(reqwest::Client::new()) .with(OtelPropagationMiddleware) .build(); @@ -82,354 +59,3 @@ fn build_mesa_client(api_key: &str) -> MesaClient { .with_client(client) .build() } - -/// Classifies an inode by its role in the mesa hierarchy. -enum InodeRole { - /// The filesystem root (ino == 1). - Root, - /// An inode owned by some org. - OrgOwned, -} - -/// The top-level `MesaFS` filesystem. -/// -/// Composes multiple [`OrgFs`] instances, each with its own inode namespace, -/// delegating to [`CompositeFs`] for inode/fh translation at each boundary. -pub struct MesaFS { - composite: CompositeFs, -} - -impl MesaFS { - const ROOT_NODE_INO: InodeAddr = CompositeFs::::ROOT_INO; - const BLOCK_SIZE: u32 = 4096; - - /// Create a new `MesaFS` instance. - #[must_use] - pub fn new( - orgs: impl Iterator, - fs_owner: (u32, u32), - cache: &CacheConfig, - ) -> Self { - let mut composite = CompositeFs::new(fs_owner, Self::BLOCK_SIZE); - for org_conf in orgs { - let client = build_mesa_client(org_conf.api_key.expose_secret()); - let org = OrgFs::new(org_conf.name, client, fs_owner, cache.clone()); - composite.add_child(org, OrgFs::ROOT_INO); - } - Self { composite } - } - - /// Classify an inode by its role. 
- fn inode_role(&self, ino: InodeAddr) -> Option { - if ino == Self::ROOT_NODE_INO { - return Some(InodeRole::Root); - } - if self.composite.child_inodes.contains_key(&ino) { - return Some(InodeRole::OrgOwned); - } - if self.composite.slot_for_inode(ino).is_some() { - return Some(InodeRole::OrgOwned); - } - None - } - - /// Ensure a mesa-level inode exists for the org at `org_idx`. - /// Does NOT bump rc. - async fn ensure_org_inode(&mut self, org_idx: usize) -> (InodeAddr, INode) { - let existing_ino = self - .composite - .child_inodes - .iter() - .find(|&(_, &idx)| idx == org_idx) - .map(|(&ino, _)| ino); - - if let Some(existing_ino) = existing_ino { - if let Ok(inode) = self.composite.delegated_getattr(existing_ino).await { - trace!( - ino = existing_ino, - org_idx, "ensure_org_inode: reusing existing inode" - ); - return (existing_ino, inode); - } - warn!( - ino = existing_ino, - org_idx, "ensure_org_inode: evicted, rebuilding" - ); - let now = SystemTime::now(); - let inode = INode { - addr: existing_ino, - permissions: InodePerms::from_bits_truncate(0o755), - uid: self.composite.fs_owner().0, - gid: self.composite.fs_owner().1, - create_time: now, - last_modified_at: now, - parent: Some(Self::ROOT_NODE_INO), - size: 0, - itype: INodeType::Directory, - }; - self.composite.cache_inode(inode); - self.composite.inode_to_slot.insert(existing_ino, org_idx); - self.composite.child_inodes.insert(existing_ino, org_idx); - return (existing_ino, inode); - } - - warn!( - org_idx, - "ensure_org_inode: no child_inodes entry for org slot" - ); - let org_name = self.composite.slots[org_idx].inner.name().to_owned(); - let ino = self.composite.allocate_inode(); - let now = SystemTime::now(); - let inode = INode { - addr: ino, - permissions: InodePerms::from_bits_truncate(0o755), - uid: self.composite.fs_owner().0, - gid: self.composite.fs_owner().1, - create_time: now, - last_modified_at: now, - parent: Some(Self::ROOT_NODE_INO), - size: 0, - itype: INodeType::Directory, - }; 
- self.composite.cache_inode(inode); - self.composite.child_inodes.insert(ino, org_idx); - self.composite.inode_to_slot.insert(ino, org_idx); - trace!(ino, org_idx, org = %org_name, "ensure_org_inode: allocated new inode"); - (ino, inode) - } - - #[instrument(name = "MesaFS::lookup", skip(self))] - pub async fn lookup(&mut self, parent: InodeAddr, name: &OsStr) -> Result { - let role = self.inode_role(parent).ok_or(LookupError::InodeNotFound)?; - match role { - InodeRole::Root => { - let org_name = name.to_str().ok_or(LookupError::InodeNotFound)?; - let org_idx = self - .composite - .slots - .iter() - .position(|s| s.inner.name() == org_name) - .ok_or(LookupError::InodeNotFound)?; - - trace!(org = org_name, "lookup: matched org"); - let (ino, inode) = self.ensure_org_inode(org_idx).await; - self.composite - .inc_rc(ino) - .ok_or(LookupError::InodeNotFound)?; - Ok(inode) - } - InodeRole::OrgOwned => self.composite.delegated_lookup(parent, name).await, - } - } - - #[instrument(name = "MesaFS::getattr", skip(self))] - pub async fn getattr(&self, ino: InodeAddr) -> Result { - self.composite.delegated_getattr(ino).await - } - - #[instrument(name = "MesaFS::readdir", skip(self))] - pub async fn readdir(&mut self, ino: InodeAddr) -> Result<&[FsDirEntry], ReadDirError> { - let role = self.inode_role(ino).ok_or(ReadDirError::InodeNotFound)?; - match role { - InodeRole::Root => { - let org_info: Vec<(usize, String)> = self - .composite - .slots - .iter() - .enumerate() - .map(|(idx, s)| (idx, s.inner.name().to_owned())) - .collect(); - - let mut entries = Vec::with_capacity(org_info.len()); - for (org_idx, name) in &org_info { - let (entry_ino, _) = self.ensure_org_inode(*org_idx).await; - entries.push(FsDirEntry { - ino: entry_ino, - name: name.clone().into(), - }); - } - - trace!(entry_count = entries.len(), "readdir: listing orgs"); - self.composite.readdir_buf = entries; - Ok(&self.composite.readdir_buf) - } - InodeRole::OrgOwned => 
self.composite.delegated_readdir(ino).await, - } - } - - #[instrument(name = "MesaFS::open", skip(self))] - pub async fn open( - &mut self, - ino: InodeAddr, - flags: OpenFlags, - ) -> Result { - self.composite.delegated_open(ino, flags).await - } - - #[instrument(name = "MesaFS::read", skip(self))] - pub async fn read( - &mut self, - fh: FileHandle, - offset: u64, - size: u32, - ) -> Result { - self.composite.delegated_read(fh, offset, size).await - } - - #[instrument(name = "MesaFS::release", skip(self))] - pub async fn release(&mut self, fh: FileHandle) -> Result<(), ReleaseError> { - self.composite.delegated_release(fh).await - } -} - -/// A file reader that delegates reads to `MesaFS` through a shared mutex. -/// -/// Resources are released via [`FileReader::close`](git_fs::fs::async_fs::FileReader::close), -/// which is called by the FUSE adapter during `release`. Dropping without -/// calling `close()` emits a diagnostic warning. -pub struct MesaFsReader { - inner: Arc>, - fh: FileHandle, - closed: AtomicBool, -} - -impl git_fs::fs::async_fs::FileReader for MesaFsReader { - fn read( - &self, - offset: u64, - size: u32, - ) -> impl Future> + Send { - let inner = Arc::clone(&self.inner); - let fh = self.fh; - async move { - let mut guard = inner.lock().await; - guard - .read(fh, offset, size) - .await - .map_err(|e| std::io::Error::other(e.to_string())) - } - } - - fn close(&self) -> impl Future> + Send { - self.closed.store(true, Ordering::Relaxed); - let inner = Arc::clone(&self.inner); - let fh = self.fh; - async move { - let mut guard = inner.lock().await; - guard - .release(fh) - .await - .map_err(|e| std::io::Error::other(e.to_string())) - } - } -} - -impl Drop for MesaFsReader { - fn drop(&mut self) { - if !self.closed.load(Ordering::Relaxed) { - tracing::warn!(fh = self.fh, "MesaFsReader dropped without close()"); - } - } -} - -/// A [`FsDataProvider`](git_fs::fs::async_fs::FsDataProvider) that wraps -/// `MesaFS` behind a shared mutex. 
-#[derive(Clone)] -pub struct MesaFsProvider { - inner: Arc>, -} - -impl MesaFsProvider { - /// Create a new provider wrapping the given `MesaFS`. - pub fn new(mesa_fs: MesaFS) -> Self { - Self { - inner: Arc::new(tokio::sync::Mutex::new(mesa_fs)), - } - } -} - -fn lookup_error_to_io(e: LookupError) -> std::io::Error { - match e { - LookupError::InodeNotFound => std::io::Error::from_raw_os_error(libc::ENOENT), - LookupError::RemoteMesaError(api) => std::io::Error::other(api.to_string()), - } -} - -fn readdir_error_to_io(e: ReadDirError) -> std::io::Error { - match e { - ReadDirError::InodeNotFound => std::io::Error::from_raw_os_error(libc::ENOENT), - ReadDirError::NotADirectory => std::io::Error::from_raw_os_error(libc::ENOTDIR), - ReadDirError::NotPermitted => std::io::Error::from_raw_os_error(libc::EPERM), - ReadDirError::RemoteMesaError(api) => std::io::Error::other(api.to_string()), - } -} - -fn open_error_to_io(e: OpenError) -> std::io::Error { - match e { - OpenError::InodeNotFound => std::io::Error::from_raw_os_error(libc::ENOENT), - } -} - -impl git_fs::fs::async_fs::FsDataProvider for MesaFsProvider { - type Reader = MesaFsReader; - - fn lookup( - &self, - parent: INode, - name: &OsStr, - ) -> impl Future> + Send { - let inner = Arc::clone(&self.inner); - let name = name.to_os_string(); - async move { - let mut guard = inner.lock().await; - guard - .lookup(parent.addr, &name) - .await - .map_err(lookup_error_to_io) - } - } - - fn readdir( - &self, - parent: INode, - ) -> impl Future, std::io::Error>> + Send { - let inner = Arc::clone(&self.inner); - async move { - let mut guard = inner.lock().await; - let dir_entries: Vec<(OsString, InodeAddr)> = { - let entries = guard - .readdir(parent.addr) - .await - .map_err(readdir_error_to_io)?; - entries.iter().map(|e| (e.name.clone(), e.ino)).collect() - }; - let mut result = Vec::with_capacity(dir_entries.len()); - for (name, ino) in dir_entries { - if let Ok(inode) = guard.getattr(ino).await { - 
result.push((name, inode)); - } - } - Ok(result) - } - } - - fn open( - &self, - inode: INode, - flags: OpenFlags, - ) -> impl Future> + Send { - let inner = Arc::clone(&self.inner); - async move { - let mut guard = inner.lock().await; - let fh = guard - .open(inode.addr, flags) - .await - .map_err(open_error_to_io)?; - Ok(MesaFsReader { - inner: Arc::clone(&inner), - fh, - closed: AtomicBool::new(false), - }) - } - } -} diff --git a/src/fs/mescloud/org.rs b/src/fs/mescloud/org.rs deleted file mode 100644 index feefaf8e..00000000 --- a/src/fs/mescloud/org.rs +++ /dev/null @@ -1,390 +0,0 @@ -use std::collections::HashMap; -use std::ffi::{OsStr, OsString}; -use std::time::SystemTime; - -use bytes::Bytes; -use futures::TryStreamExt as _; -use git_fs::fs::{FileHandle, INode, INodeType, InodeAddr, InodePerms, OpenFlags}; -use mesa_dev::MesaClient; -use secrecy::SecretString; -use tracing::{instrument, trace, warn}; - -use super::common::{ChildFs, MesaApiError}; -pub use super::common::{LookupError, OpenError, ReadDirError, ReadError, ReleaseError}; -use super::composite::CompositeFs; -use super::repo::RepoFs; -use crate::app_config::CacheConfig; - -#[derive(Debug, Clone)] -pub struct OrgConfig { - pub name: String, - pub api_key: SecretString, -} - -/// Classifies an inode by its role in the org hierarchy. -enum InodeRole { - /// The org root directory. - OrgRoot, - /// A virtual owner directory (github only). - OwnerDir, - /// An inode owned by some repo (either a child-root or delegated). - RepoOwned, -} - -/// A filesystem rooted at a single organization. -/// -/// Composes multiple [`RepoFs`] instances, each with its own inode namespace, -/// delegating to [`CompositeFs`] for inode/fh translation at each boundary. -pub struct OrgFs { - name: String, - client: MesaClient, - composite: CompositeFs, - /// Maps org-level owner-dir inodes to owner name (github only). 
- owner_inodes: HashMap, - cache_config: CacheConfig, -} - -impl OrgFs { - pub(crate) const ROOT_INO: InodeAddr = CompositeFs::::ROOT_INO; - const BLOCK_SIZE: u32 = 4096; - - /// The name of the organization. - #[must_use] - pub(crate) fn name(&self) -> &str { - &self.name - } - - /// Whether this org uses the github two-level owner/repo hierarchy. - /// TODO(MES-674): Cleanup "special" casing for github. - fn is_github(&self) -> bool { - self.name == "github" - } - - /// Encode "owner/repo" to base64 for API calls. - /// TODO(MES-674): Cleanup "special" casing for github. - fn encode_github_repo_name(decoded: &str) -> String { - use base64::Engine as _; - base64::engine::general_purpose::STANDARD.encode(decoded) - } - - /// Ensure an inode exists for a virtual owner directory (github only). Does NOT bump rc. - /// TODO(MES-674): Cleanup "special" casing for github. - async fn ensure_owner_inode(&mut self, owner: &str) -> (InodeAddr, INode) { - // Check existing - let mut stale_ino = None; - for (&ino, existing_owner) in &self.owner_inodes { - if existing_owner == owner { - if let Ok(inode) = self.composite.delegated_getattr(ino).await { - return (ino, inode); - } - stale_ino = Some(ino); - break; - } - } - if let Some(ino) = stale_ino { - self.owner_inodes.remove(&ino); - } - - let ino = self.composite.allocate_inode(); - let now = SystemTime::now(); - let inode = INode { - addr: ino, - permissions: InodePerms::from_bits_truncate(0o755), - uid: self.composite.fs_owner().0, - gid: self.composite.fs_owner().1, - create_time: now, - last_modified_at: now, - parent: Some(Self::ROOT_INO), - size: 0, - itype: INodeType::Directory, - }; - self.composite.cache_inode_and_init_rc(inode); - self.owner_inodes.insert(ino, owner.to_owned()); - (ino, inode) - } - - #[must_use] - pub fn new( - name: String, - client: MesaClient, - fs_owner: (u32, u32), - cache_config: CacheConfig, - ) -> Self { - Self { - name, - client, - composite: CompositeFs::new(fs_owner, Self::BLOCK_SIZE), 
- owner_inodes: HashMap::new(), - cache_config, - } - } - - /// Classify an inode by its role. - fn inode_role(&self, ino: InodeAddr) -> Option { - if ino == Self::ROOT_INO { - return Some(InodeRole::OrgRoot); - } - if self.owner_inodes.contains_key(&ino) { - return Some(InodeRole::OwnerDir); - } - if self.composite.child_inodes.contains_key(&ino) { - return Some(InodeRole::RepoOwned); - } - if self.composite.slot_for_inode(ino).is_some() { - return Some(InodeRole::RepoOwned); - } - None - } - - /// Ensure an inode + `RepoFs` exists for the given repo name. - /// Does NOT bump rc. - /// - /// - `repo_name`: name used for API calls / `RepoFs` (base64-encoded for github) - /// - `display_name`: name shown in filesystem ("linux" for github, same as `repo_name` otherwise) - /// - `parent_ino`: owner-dir inode for github, `ROOT_INO` otherwise - async fn ensure_repo_inode( - &mut self, - repo_name: &str, - display_name: &str, - default_branch: &str, - parent_ino: InodeAddr, - ) -> (InodeAddr, INode) { - // Check existing repos. - for (&ino, &idx) in &self.composite.child_inodes { - if self.composite.slots[idx].inner.repo_name() == repo_name { - if let Ok(inode) = self.composite.delegated_getattr(ino).await { - trace!(ino, repo = repo_name, "ensure_repo_inode: reusing"); - return (ino, inode); - } - warn!( - ino, - repo = repo_name, - "ensure_repo_inode: attr missing, rebuilding" - ); - return self.make_repo_dir_inode(ino); - } - } - - // Create new RepoFs and register as child. - let repo = RepoFs::new( - self.client.clone(), - self.name.clone(), - repo_name.to_owned(), - default_branch.to_owned(), - self.composite.fs_owner(), - self.cache_config.clone(), - ) - .await; - - let outer_ino = self - .composite - .add_child_with_parent(repo, RepoFs::ROOT_INO, parent_ino); - trace!( - ino = outer_ino, - repo = repo_name, - "ensure_repo_inode: allocated new inode" - ); - - // Register in directory cache so readdir sees it. 
- self.composite - .directory_cache - .insert( - git_fs::fs::LoadedAddr(parent_ino), - OsString::from(display_name), - git_fs::fs::LoadedAddr(outer_ino), - true, - ) - .await; - - let inode = self - .composite - .delegated_getattr(outer_ino) - .await - .unwrap_or_else(|_| { - let now = SystemTime::now(); - INode { - addr: outer_ino, - permissions: InodePerms::from_bits_truncate(0o755), - uid: self.composite.fs_owner().0, - gid: self.composite.fs_owner().1, - create_time: now, - last_modified_at: now, - parent: Some(parent_ino), - size: 0, - itype: INodeType::Directory, - } - }); - (outer_ino, inode) - } - - /// Build a directory inode for `ino`, returning `(ino, inode)`. - fn make_repo_dir_inode(&self, ino: InodeAddr) -> (InodeAddr, INode) { - let now = SystemTime::now(); - let inode = INode { - addr: ino, - permissions: InodePerms::from_bits_truncate(0o755), - uid: self.composite.fs_owner().0, - gid: self.composite.fs_owner().1, - create_time: now, - last_modified_at: now, - parent: None, - size: 0, - itype: INodeType::Directory, - }; - self.composite.cache_inode(inode); - (ino, inode) - } - - /// Fetch a repo by name via the API. 
- async fn wait_for_sync( - &self, - repo_name: &str, - ) -> Result { - self.client - .org(&self.name) - .repos() - .at(repo_name) - .get() - .await - .map_err(MesaApiError::from) - } -} - -#[async_trait::async_trait] -impl ChildFs for OrgFs { - #[instrument(name = "OrgFs::lookup", skip(self), fields(org = %self.name))] - async fn lookup(&mut self, parent: InodeAddr, name: &OsStr) -> Result { - let role = self.inode_role(parent).ok_or(LookupError::InodeNotFound)?; - match role { - InodeRole::OrgRoot => { - let name_str = name.to_str().ok_or(LookupError::InodeNotFound)?; - - if self.is_github() { - trace!(owner = name_str, "lookup: resolving github owner dir"); - let (ino, inode) = self.ensure_owner_inode(name_str).await; - self.composite - .inc_rc(ino) - .ok_or(LookupError::InodeNotFound)?; - Ok(inode) - } else { - trace!(repo = name_str, "lookup: resolving repo"); - let repo = self.wait_for_sync(name_str).await?; - let (ino, inode) = self - .ensure_repo_inode(name_str, name_str, &repo.default_branch, Self::ROOT_INO) - .await; - let rc = self - .composite - .inc_rc(ino) - .ok_or(LookupError::InodeNotFound)?; - trace!(ino, repo = name_str, rc, "lookup: resolved repo inode"); - Ok(inode) - } - } - InodeRole::OwnerDir => { - let owner = self - .owner_inodes - .get(&parent) - .ok_or(LookupError::InodeNotFound)? 
- .clone(); - let repo_name_str = name.to_str().ok_or(LookupError::InodeNotFound)?; - let full_decoded = format!("{owner}/{repo_name_str}"); - let encoded = Self::encode_github_repo_name(&full_decoded); - - trace!( - owner = %owner, repo = repo_name_str, encoded = %encoded, - "lookup: resolving github repo via owner dir" - ); - - let repo = self.wait_for_sync(&encoded).await?; - let (ino, inode) = self - .ensure_repo_inode(&encoded, repo_name_str, &repo.default_branch, parent) - .await; - self.composite - .inc_rc(ino) - .ok_or(LookupError::InodeNotFound)?; - Ok(inode) - } - InodeRole::RepoOwned => self.composite.delegated_lookup(parent, name).await, - } - } - - #[instrument(name = "OrgFs::readdir", skip(self), fields(org = %self.name))] - async fn readdir(&mut self, ino: InodeAddr) -> Result, ReadDirError> { - let role = self.inode_role(ino).ok_or(ReadDirError::InodeNotFound)?; - match role { - InodeRole::OrgRoot => { - if self.is_github() { - return Err(ReadDirError::NotPermitted); - } - - let repos: Vec = self - .client - .org(&self.name) - .repos() - .list(None) - .try_collect() - .await - .map_err(MesaApiError::from)?; - - let repo_infos: Vec<(String, String)> = repos - .into_iter() - .filter_map(|r| { - let name = r.name?; - let branch = r.default_branch.unwrap_or_else(|| "main".to_owned()); - Some((name, branch)) - }) - .collect(); - trace!(count = repo_infos.len(), "readdir: fetched repo list"); - - let mut entries = Vec::with_capacity(repo_infos.len()); - for (repo_name, default_branch) in &repo_infos { - let (_, inode) = self - .ensure_repo_inode(repo_name, repo_name, default_branch, Self::ROOT_INO) - .await; - entries.push((OsString::from(repo_name), inode)); - } - - Ok(entries) - } - InodeRole::OwnerDir if self.is_github() => Err(ReadDirError::NotPermitted), - InodeRole::OwnerDir => Err(ReadDirError::NotADirectory), - InodeRole::RepoOwned => { - let dir_entries: Vec<_> = self - .composite - .delegated_readdir(ino) - .await? 
- .iter() - .map(|e| (e.name.clone(), e.ino)) - .collect(); - let mut entries = Vec::with_capacity(dir_entries.len()); - for (name, child_ino) in dir_entries { - if let Some(inode) = self.composite.inode_table.get(&child_ino).await { - entries.push((name, inode)); - } - } - Ok(entries) - } - } - } - - #[instrument(name = "OrgFs::open", skip(self), fields(org = %self.name))] - async fn open(&mut self, ino: InodeAddr, flags: OpenFlags) -> Result { - self.composite.delegated_open(ino, flags).await - } - - #[instrument(name = "OrgFs::read", skip(self), fields(org = %self.name))] - async fn read( - &mut self, - _ino: InodeAddr, - fh: FileHandle, - offset: u64, - size: u32, - ) -> Result { - self.composite.delegated_read(fh, offset, size).await - } - - #[instrument(name = "OrgFs::release", skip(self), fields(org = %self.name))] - async fn release(&mut self, _ino: InodeAddr, fh: FileHandle) -> Result<(), ReleaseError> { - self.composite.delegated_release(fh).await - } -} diff --git a/src/fs/mescloud/repo.rs b/src/fs/mescloud/repo.rs index acff3d04..f13ead88 100644 --- a/src/fs/mescloud/repo.rs +++ b/src/fs/mescloud/repo.rs @@ -2,13 +2,12 @@ //! //! This module directly accesses the mesa repo through the Rust SDK, on a per-repo basis. 
-use std::collections::HashMap; -use std::ffi::OsString; +use std::ffi::{OsStr, OsString}; use std::future::Future; +use std::path::PathBuf; use std::sync::Arc; use std::sync::atomic::{AtomicU64, Ordering}; use std::time::SystemTime; -use std::{ffi::OsStr, path::PathBuf}; use base64::Engine as _; use bytes::Bytes; @@ -20,31 +19,12 @@ use tracing::warn; use git_fs::cache::fcache::FileCache; use git_fs::cache::traits::{AsyncReadableCache as _, AsyncWritableCache as _}; use git_fs::fs::async_fs::{FileReader, FsDataProvider}; -use git_fs::fs::{ - INode, INodeType, InodeAddr, InodePerms, LoadedAddr, OpenFlags as AsyncOpenFlags, -}; +use git_fs::fs::{INode, INodeType, InodeAddr, InodePerms, OpenFlags as AsyncOpenFlags}; -use crate::app_config::CacheConfig; - -use super::common::MesaApiError; -pub use super::common::{LookupError, OpenError, ReadDirError, ReadError, ReleaseError}; - -fn mesa_api_error_to_io(e: MesaApiError) -> std::io::Error { - match &e { - MesaApiError::Response { status, .. } if *status == 404 => { - std::io::Error::from_raw_os_error(libc::ENOENT) - } - MesaApiError::Reqwest(_) - | MesaApiError::ReqwestMiddleware(_) - | MesaApiError::Serde(_) - | MesaApiError::SerdePath(_) - | MesaApiError::Io(_) - | MesaApiError::Response { .. } => std::io::Error::other(e), - } -} +use super::common::{MesaApiError, mesa_api_error_to_io}; #[derive(Clone)] -pub(super) struct MesRepoProvider { +pub struct MesRepoProvider { inner: Arc, } @@ -97,6 +77,10 @@ impl MesRepoProvider { } /// The name of the repository. 
+ #[expect( + dead_code, + reason = "useful diagnostic accessor retained for future use" + )] pub(super) fn repo_name(&self) -> &str { &self.inner.repo_name } @@ -294,7 +278,7 @@ impl FsDataProvider for MesRepoProvider { } } -pub(super) struct MesFileReader { +pub struct MesFileReader { client: MesaClient, org_name: String, repo_name: String, @@ -383,215 +367,3 @@ impl FileReader for MesFileReader { } } } - -mod repo_fs_inner { - #![allow(clippy::future_not_send, clippy::mem_forget)] - use git_fs::cache::async_backed::FutureBackedCache; - use git_fs::fs::async_fs::AsyncFs; - use git_fs::fs::{INode, InodeAddr}; - use ouroboros::self_referencing; - - use super::MesRepoProvider; - - #[self_referencing] - pub struct RepoFsInner { - pub(super) inode_table: FutureBackedCache, - #[borrows(inode_table)] - #[covariant] - pub(super) fs: AsyncFs<'this, MesRepoProvider>, - } - - impl RepoFsInner { - pub fn create( - inode_table: FutureBackedCache, - provider: MesRepoProvider, - ) -> Self { - RepoFsInnerBuilder { - inode_table, - fs_builder: |tbl| AsyncFs::new_preseeded(provider, tbl), - } - .build() - } - } -} -use repo_fs_inner::RepoFsInner; - -/// A filesystem rooted at a single mesa repository. -/// -/// Wraps [`AsyncFs`] via ouroboros to co-locate the inode table -/// and the filesystem that borrows it. Implements [`Fs`] as a thin adapter. -pub struct RepoFs { - inner: RepoFsInner, - /// Reference counts for inodes held by the kernel. - refcounts: rustc_hash::FxHashMap, - /// Open file handles mapped to readers. - open_files: HashMap>, - /// Provider clone for accessing `repo_name` and `path_map` cleanup. - provider: MesRepoProvider, -} - -impl RepoFs { - pub(crate) const ROOT_INO: InodeAddr = 1; - - /// Create a new `RepoFs` for a specific org and repo. 
- pub async fn new( - client: MesaClient, - org_name: String, - repo_name: String, - ref_: String, - fs_owner: (u32, u32), - cache_config: CacheConfig, - ) -> Self { - let file_cache = match cache_config.max_size { - Some(max_size) if max_size.as_u64() > 0 => { - let cache_dir = cache_config.path.join(&org_name).join(&repo_name); - let max_bytes = max_size.as_u64().try_into().unwrap_or(usize::MAX); - match FileCache::new(&cache_dir, max_bytes).await { - Ok(cache) => Some(Arc::new(cache)), - Err(e) => { - warn!(error = ?e, org = %org_name, repo = %repo_name, - "failed to create file cache, continuing without caching"); - None - } - } - } - _ => None, - }; - - let provider = - MesRepoProvider::new(client, org_name, repo_name, ref_, fs_owner, file_cache); - provider.seed_root_path(Self::ROOT_INO); - - let root = INode { - addr: Self::ROOT_INO, - permissions: InodePerms::from_bits_truncate(0o755), - uid: fs_owner.0, - gid: fs_owner.1, - create_time: SystemTime::now(), - last_modified_at: SystemTime::now(), - parent: None, - size: 0, - itype: INodeType::Directory, - }; - - let inode_table = git_fs::cache::async_backed::FutureBackedCache::default(); - inode_table.insert_sync(root.addr, root); - - let inner = RepoFsInner::create(inode_table, provider.clone()); - - let mut refcounts = rustc_hash::FxHashMap::default(); - refcounts.insert(Self::ROOT_INO, 1); - - Self { - inner, - refcounts, - open_files: HashMap::new(), - provider, - } - } - - /// The name of the repository this filesystem is rooted at. 
- pub(crate) fn repo_name(&self) -> &str { - self.provider.repo_name() - } -} - -#[expect( - clippy::wildcard_enum_match_arm, - reason = "mapping all ErrorKind variants is impractical; EIO is the sensible default" -)] -fn io_error_to_errno(e: &std::io::Error) -> i32 { - e.raw_os_error().unwrap_or_else(|| match e.kind() { - std::io::ErrorKind::NotFound => libc::ENOENT, - std::io::ErrorKind::PermissionDenied => libc::EACCES, - std::io::ErrorKind::AlreadyExists => libc::EEXIST, - _ => libc::EIO, - }) -} - -#[async_trait::async_trait] -impl super::common::ChildFs for RepoFs { - async fn lookup(&mut self, parent: InodeAddr, name: &OsStr) -> Result { - let tracked = self - .inner - .borrow_fs() - .lookup(LoadedAddr(parent), name) - .await - .map_err(|e| { - if io_error_to_errno(&e) == libc::ENOENT { - LookupError::InodeNotFound - } else { - LookupError::RemoteMesaError(MesaApiError::Io(e)) - } - })?; - *self.refcounts.entry(tracked.inode.addr).or_insert(0) += 1; - Ok(tracked.inode) - } - - async fn readdir(&mut self, ino: InodeAddr) -> Result, ReadDirError> { - let mut entries = Vec::new(); - self.inner - .borrow_fs() - .readdir(LoadedAddr(ino), 0, |de, _offset| { - entries.push((de.name.to_os_string(), de.inode)); - false - }) - .await - .map_err(|e| { - if io_error_to_errno(&e) == libc::ENOTDIR { - ReadDirError::NotADirectory - } else if io_error_to_errno(&e) == libc::ENOENT { - ReadDirError::InodeNotFound - } else { - ReadDirError::RemoteMesaError(MesaApiError::Io(e)) - } - })?; - Ok(entries) - } - - async fn open( - &mut self, - ino: InodeAddr, - flags: AsyncOpenFlags, - ) -> Result { - let open_file = self - .inner - .borrow_fs() - .open(LoadedAddr(ino), flags) - .await - .map_err(|_| OpenError::InodeNotFound)?; - self.open_files - .insert(open_file.fh, Arc::clone(&open_file.reader)); - Ok(open_file.fh) - } - - async fn read( - &mut self, - _ino: InodeAddr, - fh: git_fs::fs::FileHandle, - offset: u64, - size: u32, - ) -> Result { - let reader = 
self.open_files.get(&fh).ok_or(ReadError::FileNotOpen)?; - reader.read(offset, size).await.map_err(|e| { - if io_error_to_errno(&e) == libc::EISDIR { - ReadError::NotAFile - } else if io_error_to_errno(&e) == libc::ENOENT { - ReadError::InodeNotFound - } else { - ReadError::RemoteMesaError(MesaApiError::Io(e)) - } - }) - } - - async fn release( - &mut self, - _ino: InodeAddr, - fh: git_fs::fs::FileHandle, - ) -> Result<(), ReleaseError> { - self.open_files - .remove(&fh) - .ok_or(ReleaseError::FileNotOpen)?; - Ok(()) - } -} diff --git a/src/fs/mescloud/roots.rs b/src/fs/mescloud/roots.rs index aafe0c4c..8893d379 100644 --- a/src/fs/mescloud/roots.rs +++ b/src/fs/mescloud/roots.rs @@ -3,10 +3,8 @@ //! Bridges the generic `CompositeFs` from `lib/fs/composite.rs` with //! Mesa/GitHub-specific org and repo resolution logic. //! -//! These types are not yet wired into the daemon entry point; they will be -//! connected in a follow-up change that replaces the old `MesaFS` + `OrgFs` -//! pipeline. -#![expect(dead_code, reason = "wired in the follow-up daemon change")] +//! These types are wired into the daemon entry point, replacing the old +//! `MesaFS` + `OrgFs` pipeline. use std::ffi::{OsStr, OsString}; use std::future::Future; @@ -23,26 +21,12 @@ use git_fs::fs::async_fs::{FileReader, FsDataProvider}; use git_fs::fs::composite::{ChildDescriptor, CompositeFs, CompositeReader, CompositeRoot}; use git_fs::fs::{INode, INodeType, InodeAddr, InodePerms, OpenFlags}; -use super::common::MesaApiError; +use super::common::{MesaApiError, mesa_api_error_to_io}; use super::repo::{MesFileReader, MesRepoProvider}; use crate::app_config::CacheConfig; const CHILD_ROOT_ADDR: InodeAddr = 1; -fn mesa_api_error_to_io(e: MesaApiError) -> std::io::Error { - match &e { - MesaApiError::Response { status, .. 
} if *status == 404 => { - std::io::Error::from_raw_os_error(libc::ENOENT) - } - MesaApiError::Reqwest(_) - | MesaApiError::ReqwestMiddleware(_) - | MesaApiError::Serde(_) - | MesaApiError::SerdePath(_) - | MesaApiError::Io(_) - | MesaApiError::Response { .. } => std::io::Error::other(e), - } -} - /// Create a [`MesRepoProvider`] and its root [`INode`] for a given repo. async fn create_repo_provider( client: &MesaClient, @@ -111,7 +95,7 @@ fn check_not_found(e: MesaApiError) -> Result<(), std::io::Error> { } } -pub(super) struct StandardOrgRoot { +pub struct StandardOrgRoot { client: MesaClient, org_name: String, cache_config: CacheConfig, @@ -119,7 +103,7 @@ pub(super) struct StandardOrgRoot { } impl StandardOrgRoot { - pub(super) fn new( + pub fn new( client: MesaClient, org_name: String, cache_config: CacheConfig, @@ -222,7 +206,7 @@ impl CompositeRoot for StandardOrgRoot { } } -pub(super) struct GithubRepoRoot { +pub struct GithubRepoRoot { client: MesaClient, org_name: String, owner: String, @@ -286,7 +270,7 @@ impl CompositeRoot for GithubRepoRoot { } } -pub(super) struct GithubOrgRoot { +pub struct GithubOrgRoot { client: MesaClient, org_name: String, cache_config: CacheConfig, @@ -294,7 +278,7 @@ pub(super) struct GithubOrgRoot { } impl GithubOrgRoot { - pub(super) fn new( + pub fn new( client: MesaClient, org_name: String, cache_config: CacheConfig, @@ -347,7 +331,7 @@ impl CompositeRoot for GithubOrgRoot { } #[derive(Clone)] -pub(super) enum OrgChildDP { +pub enum OrgChildDP { Standard(CompositeFs), Github(CompositeFs), } @@ -407,7 +391,7 @@ impl FsDataProvider for OrgChildDP { } } -pub(super) enum OrgChildReader { +pub enum OrgChildReader { Standard(CompositeReader), Github(CompositeReader>), } @@ -441,12 +425,12 @@ impl FileReader for OrgChildReader { } } -pub(super) struct MesaRoot { +pub struct MesaRoot { orgs: Vec<(OsString, OrgChildDP)>, } impl MesaRoot { - pub(super) fn new(orgs: Vec<(OsString, OrgChildDP)>) -> Self { + pub fn new(orgs: 
Vec<(OsString, OrgChildDP)>) -> Self { Self { orgs } } } From 36d9fea53f29b38215e249c52f8bf0b01cf98f3c Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Fri, 20 Feb 2026 16:20:10 -0800 Subject: [PATCH 24/58] bug fixes --- lib/cache/async_backed.rs | 72 ++++++++++++++---------- lib/fs/async_fs.rs | 79 +++++++++++++++++--------- lib/fs/composite.rs | 107 +++++++++++++++--------------------- lib/fs/dcache.rs | 76 ++++++++++++++++++++----- lib/fs/fuser.rs | 9 ++- src/fs/mescloud/repo.rs | 7 ++- src/fs/mescloud/roots.rs | 7 +++ tests/dcache_correctness.rs | 54 +++++++++++++++--- 8 files changed, 270 insertions(+), 141 deletions(-) diff --git a/lib/cache/async_backed.rs b/lib/cache/async_backed.rs index 8f15803b..273bcd39 100644 --- a/lib/cache/async_backed.rs +++ b/lib/cache/async_backed.rs @@ -7,6 +7,7 @@ //! Note that this cache does not support automatic eviction. use std::panic::AssertUnwindSafe; +use std::sync::atomic::{AtomicU64, Ordering}; use std::{fmt::Debug, future::Future, hash::Hash, pin::Pin}; use futures::FutureExt as _; @@ -17,10 +18,12 @@ type SharedFut = Shared> + Send>>>; /// Two-state slot: `InFlight` while a factory future is running, then promoted to `Ready` once /// the future completes. /// -/// The `InFlight` variant holds a `Shared<..., Output = Option>` where `None` signals that the -/// factory panicked (caught by `catch_unwind`). On `None`, callers remove the entry and retry. +/// The `InFlight` variant holds a generation counter and a `Shared<..., Output = Option>` +/// where `None` signals that the factory panicked (caught by `catch_unwind`). On `None`, callers +/// remove the entry only if the generation matches, avoiding destruction of a valid re-inserted +/// entry. enum Slot { - InFlight(SharedFut), + InFlight(u64, SharedFut), Ready(V), } @@ -30,6 +33,7 @@ enum Slot { /// invocation of the factory runs. All callers receive a clone of the result. 
pub struct FutureBackedCache { map: scc::HashMap>, + next_gen: AtomicU64, } impl Default for FutureBackedCache @@ -40,6 +44,7 @@ where fn default() -> Self { Self { map: scc::HashMap::default(), + next_gen: AtomicU64::new(0), } } } @@ -69,14 +74,14 @@ where .map .read_async(&key, |_, slot| match slot { Slot::Ready(v) => Ok(v.clone()), - Slot::InFlight(shared) => Err(shared.clone()), + Slot::InFlight(generation, shared) => Err((*generation, shared.clone())), }) .await; match existing { Some(Ok(v)) => return v, - Some(Err(shared)) => { - if let Some(v) = self.await_shared(&key, shared).await { + Some(Err((generation, shared))) => { + if let Some(v) = self.await_shared(&key, generation, shared).await { return v; } // Factory panicked; entry removed. Fall through to re-insert below. @@ -85,20 +90,21 @@ where } // Slow path: use entry_async for atomic check-and-insert. - let shared = match self.map.entry_async(key.clone()).await { + let (generation, shared) = match self.map.entry_async(key.clone()).await { scc::hash_map::Entry::Occupied(occ) => match occ.get() { Slot::Ready(v) => return v.clone(), - Slot::InFlight(shared) => shared.clone(), + Slot::InFlight(g, shared) => (*g, shared.clone()), }, scc::hash_map::Entry::Vacant(vac) => { + let generation = self.next_gen.fetch_add(1, Ordering::Relaxed); let shared = Self::make_shared(factory); let ret = shared.clone(); - vac.insert_entry(Slot::InFlight(shared)); - ret + vac.insert_entry(Slot::InFlight(generation, shared)); + (generation, ret) } }; - if let Some(v) = self.await_shared(&key, shared).await { + if let Some(v) = self.await_shared(&key, generation, shared).await { return v; } @@ -124,14 +130,14 @@ where .map .read_async(&key, |_, slot| match slot { Slot::Ready(v) => Ok(v.clone()), - Slot::InFlight(shared) => Err(shared.clone()), + Slot::InFlight(generation, shared) => Err((*generation, shared.clone())), }) .await; match existing { Some(Ok(v)) => return Ok(v), - Some(Err(shared)) => { - if let Some(v) = 
self.await_shared(&key, shared).await { + Some(Err((generation, shared))) => { + if let Some(v) = self.await_shared(&key, generation, shared).await { return Ok(v); } // Factory panicked; entry was removed. Fall through to run our own factory. @@ -147,10 +153,13 @@ where match self.map.entry_async(key).await { scc::hash_map::Entry::Occupied(occ) => match occ.get() { Slot::Ready(v) => Ok(v.clone()), - Slot::InFlight(shared) => Ok(self - .await_shared(occ.key(), shared.clone()) - .await - .unwrap_or(val)), + Slot::InFlight(g, shared) => { + let generation = *g; + Ok(self + .await_shared(occ.key(), generation, shared.clone()) + .await + .unwrap_or(val)) + } }, scc::hash_map::Entry::Vacant(vac) => { vac.insert_entry(Slot::Ready(val.clone())); @@ -170,25 +179,30 @@ where .map .read_async(key, |_, slot| match slot { Slot::Ready(v) => Ok(v.clone()), - Slot::InFlight(shared) => Err(shared.clone()), + Slot::InFlight(generation, shared) => Err((*generation, shared.clone())), }) .await; match existing { Some(Ok(v)) => Some(v), - Some(Err(shared)) => self.await_shared(key, shared).await, + Some(Err((generation, shared))) => self.await_shared(key, generation, shared).await, None => None, } } /// Await a `Shared` future, handle promotion to `Ready`, and handle panic recovery. /// + /// The `observed_gen` parameter is the generation of the `InFlight` slot that was read. + /// On panic recovery, only the entry with this exact generation is removed, preventing + /// destruction of a valid entry re-inserted by a recovered thread. + /// /// Returns `Some(v)` on success. Returns `None` if the factory panicked, after removing /// the poisoned entry from the map. 
- async fn await_shared(&self, key: &K, shared: SharedFut) -> Option { + async fn await_shared(&self, key: &K, observed_gen: u64, shared: SharedFut) -> Option { let mut guard = PromoteGuard { map: &self.map, key, + observed_gen, value: None, }; @@ -199,7 +213,7 @@ where self.map .update_async(key, |_, slot| { - if matches!(slot, Slot::InFlight(_)) { + if matches!(slot, Slot::InFlight(g, _) if *g == observed_gen) { *slot = Slot::Ready(v.clone()); } }) @@ -209,11 +223,11 @@ where Some(v) } else { // Factory panicked. Remove the poisoned InFlight entry so the next caller - // can retry. - drop( - self.map - .remove_if_sync(key, |slot| matches!(slot, Slot::InFlight(_))), - ); + // can retry — but only if the generation matches our observation. + drop(self.map.remove_if_sync( + key, + |slot| matches!(slot, Slot::InFlight(g, _) if *g == observed_gen), + )); None } } @@ -270,6 +284,7 @@ where { map: &'a scc::HashMap>, key: &'a K, + observed_gen: u64, value: Option, } @@ -280,8 +295,9 @@ where { fn drop(&mut self) { if let Some(v) = self.value.take() { + let generation = self.observed_gen; self.map.update_sync(self.key, |_, slot| { - if matches!(slot, Slot::InFlight(_)) { + if matches!(slot, Slot::InFlight(g, _) if *g == generation) { *slot = Slot::Ready(v); } }); diff --git a/lib/fs/async_fs.rs b/lib/fs/async_fs.rs index 1f81a87e..a13a6617 100644 --- a/lib/fs/async_fs.rs +++ b/lib/fs/async_fs.rs @@ -64,6 +64,12 @@ pub trait FsDataProvider: Clone + Send + Sync + 'static { inode: INode, flags: OpenFlags, ) -> impl Future> + Send; + + /// Called when the kernel forgets an inode (refcount reaches zero). + /// + /// Implementations should clean up any internal mappings for the given + /// address (e.g. bridge maps, path maps). The default is a no-op. 
+ fn forget(&self, _addr: InodeAddr) {} } /// Zero-sized tag whose [`StatelessDrop`] implementation automatically evicts @@ -76,6 +82,15 @@ impl<'a> StatelessDrop<&'a FutureBackedCache, InodeAddr> for I } } +impl<'a, DP: FsDataProvider> StatelessDrop<(&'a FutureBackedCache, DP), InodeAddr> + for InodeForget +{ + fn delete(ctx: &(&'a FutureBackedCache, DP), key: &InodeAddr) { + ctx.0.remove_sync(key); + ctx.1.forget(*key); + } +} + /// A looked-up inode whose lifetime must be managed by the caller. /// /// Each `TrackedINode` returned by [`AsyncFs::lookup`] represents one @@ -283,14 +298,14 @@ impl<'tbl, DP: FsDataProvider> AsyncFs<'tbl, DP> { // Inode was evicted from the table — fall through to the slow path. let name_owned = name.to_os_string(); - let name_for_cache = name_owned.clone(); let lookup_key = (parent.0, name_owned.clone()); let dp = self.data_provider.clone(); let child = self .lookup_cache - .get_or_try_init(lookup_key, || async move { - dp.lookup(parent_ino, &name_owned).await + .get_or_try_init(lookup_key, || { + let name_for_dp = name_owned.clone(); + async move { dp.lookup(parent_ino, &name_for_dp).await } }) .await?; @@ -301,7 +316,7 @@ impl<'tbl, DP: FsDataProvider> AsyncFs<'tbl, DP> { self.directory_cache .insert( parent, - name_for_cache, + name_owned, LoadedAddr(child.addr), matches!(child.itype, INodeType::Directory), ) @@ -366,12 +381,6 @@ impl<'tbl, DP: FsDataProvider> AsyncFs<'tbl, DP> { /// returns `true` (indicating the caller's buffer is full), iteration /// stops early. /// - /// # Concurrency - /// - /// The `is_populated` check-then-populate is **not** atomic. If two - /// concurrent callers invoke `readdir` for the same parent, both may call - /// `dp.readdir()` and insert duplicate children. - /// /// TODO(MES-746): Implement `opendir` and `releasedir` to snapshot directory contents and /// avoid racing with `lookup`/`createfile`. 
pub async fn readdir( @@ -380,28 +389,48 @@ impl<'tbl, DP: FsDataProvider> AsyncFs<'tbl, DP> { offset: u64, mut filler: impl FnMut(DirEntry<'_>, u64) -> bool, ) -> Result<(), std::io::Error> { + use crate::fs::dcache::PopulateStatus; + let parent_inode = self.loaded_inode(parent).await?; if parent_inode.itype != INodeType::Directory { return Err(std::io::Error::from_raw_os_error(libc::ENOTDIR)); } // Populate the directory cache on first readdir for this parent. - if !self.directory_cache.is_populated(parent) { - let children = self.data_provider.readdir(parent_inode).await?; - for (name, child_inode) in children { - self.inode_table - .get_or_init(child_inode.addr, || async move { child_inode }) - .await; - self.directory_cache - .insert( - parent, - name, - LoadedAddr(child_inode.addr), - child_inode.itype == INodeType::Directory, - ) - .await; + // Uses a three-state CAS gate to prevent duplicate dp.readdir() calls. + loop { + match self.directory_cache.try_claim_populate(parent) { + PopulateStatus::Claimed => { + match self.data_provider.readdir(parent_inode).await { + Ok(children) => { + for (name, child_inode) in children { + self.inode_table + .get_or_init(child_inode.addr, || async move { child_inode }) + .await; + self.directory_cache + .insert( + parent, + name, + LoadedAddr(child_inode.addr), + child_inode.itype == INodeType::Directory, + ) + .await; + } + self.directory_cache.finish_populate(parent); + } + Err(e) => { + self.directory_cache.abort_populate(parent); + return Err(e); + } + } + break; + } + PopulateStatus::InProgress => { + self.directory_cache.wait_populated(parent).await; + // Re-check: the populator may have aborted. 
+ } + PopulateStatus::Done => break, } - self.directory_cache.mark_populated(parent); } let mut children = self.directory_cache.readdir(parent).await; diff --git a/lib/fs/composite.rs b/lib/fs/composite.rs index bf063307..7d9748c6 100644 --- a/lib/fs/composite.rs +++ b/lib/fs/composite.rs @@ -128,7 +128,7 @@ impl FileReader for CompositeReader { struct ChildSlot { inner: Arc>, - bridge: ConcurrentBridge, + bridge: Arc, } struct CompositeFsInner { @@ -237,7 +237,7 @@ impl CompositeFs { table.insert_sync(desc.root_ino.addr, desc.root_ino); let child_inner = Arc::new(ChildInner::create(table, desc.provider.clone())); - let bridge = ConcurrentBridge::new(); + let bridge = Arc::new(ConcurrentBridge::new()); bridge.insert(outer_ino, desc.root_ino.addr); drop(self.inner.slots.insert_sync( @@ -261,51 +261,29 @@ impl CompositeFs { where R::ChildDP: Clone, { - // Fast path: already registered by name. match self.inner.name_to_slot.entry_sync(desc.name.clone()) { - scc::hash_map::Entry::Occupied(occ) => { + scc::hash_map::Entry::Occupied(mut occ) => { let slot_idx = *occ.get(); - // Return existing outer address for this child's root inode. - if let Some(outer) = self + // Extract bridge Arc from the slot guard, then query outside. + let bridge = self .inner .slots - .read_sync(&slot_idx, |_, slot| { - slot.bridge.backward(desc.root_ino.addr) - }) - .flatten() - { + .read_sync(&slot_idx, |_, slot| Arc::clone(&slot.bridge)); + if let Some(outer) = bridge.and_then(|b| b.backward(desc.root_ino.addr)) { return outer; } - // Slot exists but bridge has no mapping — should not happen, - // but fall through to create a fresh slot below. - // (Remove stale name entry so the vacant path can re-insert.) - // - // Race window: between `drop(occ)` and the `remove_sync` below, - // another thread could read the stale entry and resolve to a - // broken slot. 
In the worst case two threads create separate - // slots for the same child — the last writer to `name_to_slot` - // wins and the other slot becomes orphaned. This is functionally - // harmless: the orphaned slot is never reached via name lookup - // and will not serve any future requests. - drop(occ); - self.inner.name_to_slot.remove_sync(&desc.name); + // Slot exists but bridge has no mapping — replace in-place + // while still holding the entry guard to prevent races. + let (outer_ino, new_slot_idx) = self.create_child_slot(desc); + *occ.get_mut() = new_slot_idx; + outer_ino } scc::hash_map::Entry::Vacant(vac) => { let (outer_ino, slot_idx) = self.create_child_slot(desc); vac.insert_entry(slot_idx); - return outer_ino; + outer_ino } } - - // Fallback: name was stale, create fresh. This path is rare. - let (outer_ino, slot_idx) = self.create_child_slot(desc); - drop( - self.inner - .name_to_slot - .insert_sync(desc.name.clone(), slot_idx), - ); - - outer_ino } } @@ -334,12 +312,16 @@ where .read_sync(&parent.addr, |_, &v| v) .ok_or_else(|| std::io::Error::from_raw_os_error(libc::ENOENT))?; - // Extract Arc and inner parent address under the guard. - let (child, inner_parent) = self + // Extract Arc, bridge, and inner parent address under the guard. + let (child, bridge, inner_parent) = self .inner .slots .read_sync(&slot_idx, |_, slot| { - (Arc::clone(&slot.inner), slot.bridge.forward(parent.addr)) + ( + Arc::clone(&slot.inner), + Arc::clone(&slot.bridge), + slot.bridge.forward(parent.addr), + ) }) .ok_or_else(|| std::io::Error::from_raw_os_error(libc::ENOENT))?; @@ -353,17 +335,10 @@ where .await?; let child_inode = tracked.inode; - // Translate inner address back to composite-level address. 
- let outer_ino = self - .inner - .slots - .read_sync(&slot_idx, |_, slot| { - let next_ino = &self.inner.next_ino; - slot.bridge.backward_or_insert(child_inode.addr, || { - next_ino.fetch_add(1, Ordering::Relaxed) - }) - }) - .ok_or_else(|| std::io::Error::from_raw_os_error(libc::ENOENT))?; + // Translate inner address back to composite-level address (outside scc guard). + let outer_ino = bridge.backward_or_insert(child_inode.addr, || { + self.inner.next_ino.fetch_add(1, Ordering::Relaxed) + }); let _ = self.inner.addr_to_slot.insert_sync(outer_ino, slot_idx); @@ -390,11 +365,15 @@ where .read_sync(&parent.addr, |_, &v| v) .ok_or_else(|| std::io::Error::from_raw_os_error(libc::ENOENT))?; - let (child, inner_parent) = self + let (child, bridge, inner_parent) = self .inner .slots .read_sync(&slot_idx, |_, slot| { - (Arc::clone(&slot.inner), slot.bridge.forward(parent.addr)) + ( + Arc::clone(&slot.inner), + Arc::clone(&slot.bridge), + slot.bridge.forward(parent.addr), + ) }) .ok_or_else(|| std::io::Error::from_raw_os_error(libc::ENOENT))?; @@ -411,19 +390,12 @@ where }) .await?; - // Translate all inner addresses to composite-level addresses. + // Translate all inner addresses to composite-level addresses (outside scc guard). 
let mut entries = Vec::with_capacity(child_entries.len()); for (name, child_inode) in child_entries { - let outer_ino = self - .inner - .slots - .read_sync(&slot_idx, |_, slot| { - let next_ino = &self.inner.next_ino; - slot.bridge.backward_or_insert(child_inode.addr, || { - next_ino.fetch_add(1, Ordering::Relaxed) - }) - }) - .ok_or_else(|| std::io::Error::from_raw_os_error(libc::ENOENT))?; + let outer_ino = bridge.backward_or_insert(child_inode.addr, || { + self.inner.next_ino.fetch_add(1, Ordering::Relaxed) + }); let _ = self.inner.addr_to_slot.insert_sync(outer_ino, slot_idx); entries.push(( @@ -462,4 +434,15 @@ where inner: open_file.reader, }) } + + fn forget(&self, addr: InodeAddr) { + if addr == Self::ROOT_INO { + return; + } + if let Some((_, slot_idx)) = self.inner.addr_to_slot.remove_sync(&addr) { + self.inner + .slots + .read_sync(&slot_idx, |_, slot| slot.bridge.remove_by_outer(addr)); + } + } } diff --git a/lib/fs/dcache.rs b/lib/fs/dcache.rs index 4870a401..aea5bb2c 100644 --- a/lib/fs/dcache.rs +++ b/lib/fs/dcache.rs @@ -1,6 +1,6 @@ use std::ffi::{OsStr, OsString}; use std::sync::Arc; -use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::atomic::{AtomicU8, Ordering}; use crate::fs::LoadedAddr; @@ -13,17 +13,32 @@ pub struct DValue { pub is_dir: bool, } +/// Population states for a directory. +const POPULATE_UNCLAIMED: u8 = 0; +const POPULATE_IN_PROGRESS: u8 = 1; +const POPULATE_DONE: u8 = 2; + +/// Result of attempting to claim a directory for population. +pub enum PopulateStatus { + /// This caller won the race and should populate the directory. + Claimed, + /// Another caller is currently populating; wait and re-check. + InProgress, + /// The directory is already fully populated. + Done, +} + /// Per-parent directory state holding child entries and a population flag. 
struct DirState { children: scc::HashMap, - populated: AtomicBool, + populated: AtomicU8, } impl DirState { fn new() -> Self { Self { children: scc::HashMap::new(), - populated: AtomicBool::new(false), + populated: AtomicU8::new(POPULATE_UNCLAIMED), } } } @@ -73,9 +88,7 @@ impl DCache { #[must_use] pub fn lookup(&self, parent_ino: LoadedAddr, name: &OsStr) -> Option { let state = self.dirs.read_sync(&parent_ino, |_, v| Arc::clone(v))?; - state - .children - .read_sync(&name.to_os_string(), |_, v| v.clone()) + state.children.read_sync(name, |_, v| v.clone()) } /// Atomically inserts or overwrites a child entry in the cache. @@ -107,17 +120,50 @@ impl DCache { entries } - /// Returns `true` if the directory at `parent_ino` has been fully populated. - #[must_use] - pub fn is_populated(&self, parent_ino: LoadedAddr) -> bool { - self.dirs - .read_sync(&parent_ino, |_, v| v.populated.load(Ordering::Acquire)) - .unwrap_or(false) + /// Atomically try to claim a directory for population. + /// + /// Uses `compare_exchange` on the three-state flag: + /// - `UNCLAIMED → IN_PROGRESS`: returns `Claimed` (caller should populate) + /// - Already `IN_PROGRESS`: returns `InProgress` (caller should wait) + /// - Already `DONE`: returns `Done` (nothing to do) + pub fn try_claim_populate(&self, parent_ino: LoadedAddr) -> PopulateStatus { + let state = self.dir_state(parent_ino); + match state.populated.compare_exchange( + POPULATE_UNCLAIMED, + POPULATE_IN_PROGRESS, + Ordering::AcqRel, + Ordering::Acquire, + ) { + Ok(_) => PopulateStatus::Claimed, + Err(POPULATE_IN_PROGRESS) => PopulateStatus::InProgress, + Err(_) => PopulateStatus::Done, + } } - /// Marks the directory at `parent_ino` as fully populated. - pub fn mark_populated(&self, parent_ino: LoadedAddr) { + /// Mark a directory as fully populated after successful population. 
+ pub fn finish_populate(&self, parent_ino: LoadedAddr) { let state = self.dir_state(parent_ino); - state.populated.store(true, Ordering::Release); + state.populated.store(POPULATE_DONE, Ordering::Release); + } + + /// Abort a population attempt, resetting back to unclaimed so another + /// caller can retry. + pub fn abort_populate(&self, parent_ino: LoadedAddr) { + let state = self.dir_state(parent_ino); + state.populated.store(POPULATE_UNCLAIMED, Ordering::Release); + } + + /// Wait until a directory is no longer in the `InProgress` state. + pub async fn wait_populated(&self, parent_ino: LoadedAddr) { + loop { + let current = self + .dirs + .read_sync(&parent_ino, |_, v| v.populated.load(Ordering::Acquire)) + .unwrap_or(POPULATE_UNCLAIMED); + if current != POPULATE_IN_PROGRESS { + return; + } + tokio::task::yield_now().await; + } } } diff --git a/lib/fs/fuser.rs b/lib/fs/fuser.rs index 886a5f6f..7a9bed24 100644 --- a/lib/fs/fuser.rs +++ b/lib/fs/fuser.rs @@ -83,12 +83,16 @@ mod inner { /// /// Both `ward` and `fs` borrow from `table`. The ward manages inode /// refcounts; the fs serves lookup/readdir/open/read operations. + /// + /// The ward context is `(&table, DP)` so that [`InodeForget`] can both + /// remove the inode from the table and call `dp.forget()` to clean up + /// provider-internal maps (bridge mappings, path maps, etc.). 
#[self_referencing] pub(super) struct FuseBridgeInner { table: FutureBackedCache, #[borrows(table)] #[not_covariant] - ward: DropWard<&'this FutureBackedCache, InodeAddr, InodeForget>, + ward: DropWard<(&'this FutureBackedCache, DP), InodeAddr, InodeForget>, #[borrows(table)] #[covariant] fs: AsyncFs<'this, DP>, @@ -96,9 +100,10 @@ mod inner { impl FuseBridgeInner { pub(super) fn create(table: FutureBackedCache, provider: DP) -> Self { + let ward_provider = provider.clone(); FuseBridgeInnerBuilder { table, - ward_builder: |tbl| DropWard::new(tbl), + ward_builder: |tbl| DropWard::new((tbl, ward_provider)), fs_builder: |tbl| AsyncFs::new_preseeded(provider, tbl), } .build() diff --git a/src/fs/mescloud/repo.rs b/src/fs/mescloud/repo.rs index f13ead88..f2041d10 100644 --- a/src/fs/mescloud/repo.rs +++ b/src/fs/mescloud/repo.rs @@ -71,8 +71,7 @@ impl MesRepoProvider { } /// Remove the path entry for an inode. Called during forget/cleanup. - #[expect(dead_code, reason = "will be needed when child forget is implemented")] - pub(super) fn remove_path(&self, addr: InodeAddr) { + fn remove_path(&self, addr: InodeAddr) { self.inner.path_map.remove_sync(&addr); } @@ -276,6 +275,10 @@ impl FsDataProvider for MesRepoProvider { }) } } + + fn forget(&self, addr: InodeAddr) { + self.remove_path(addr); + } } pub struct MesFileReader { diff --git a/src/fs/mescloud/roots.rs b/src/fs/mescloud/roots.rs index 8893d379..7c8701db 100644 --- a/src/fs/mescloud/roots.rs +++ b/src/fs/mescloud/roots.rs @@ -389,6 +389,13 @@ impl FsDataProvider for OrgChildDP { } } } + + fn forget(&self, addr: InodeAddr) { + match self { + Self::Standard(c) => c.forget(addr), + Self::Github(c) => c.forget(addr), + } + } } pub enum OrgChildReader { diff --git a/tests/dcache_correctness.rs b/tests/dcache_correctness.rs index 59731d28..34dcf088 100644 --- a/tests/dcache_correctness.rs +++ b/tests/dcache_correctness.rs @@ -3,7 +3,7 @@ use std::ffi::{OsStr, OsString}; use git_fs::fs::LoadedAddr; -use 
git_fs::fs::dcache::DCache; +use git_fs::fs::dcache::{DCache, PopulateStatus}; #[tokio::test] async fn lookup_returns_none_for_missing_entry() { @@ -51,16 +51,53 @@ async fn readdir_empty_parent_returns_empty() { } #[tokio::test] -async fn is_populated_false_by_default() { +async fn try_claim_populate_unclaimed_returns_claimed() { let cache = DCache::new(); - assert!(!cache.is_populated(LoadedAddr(1))); + assert!(matches!( + cache.try_claim_populate(LoadedAddr(1)), + PopulateStatus::Claimed + )); } #[tokio::test] -async fn mark_populated_then_check() { +async fn finish_populate_then_claim_returns_done() { let cache = DCache::new(); - cache.mark_populated(LoadedAddr(1)); - assert!(cache.is_populated(LoadedAddr(1))); + assert!(matches!( + cache.try_claim_populate(LoadedAddr(1)), + PopulateStatus::Claimed + )); + cache.finish_populate(LoadedAddr(1)); + assert!(matches!( + cache.try_claim_populate(LoadedAddr(1)), + PopulateStatus::Done + )); +} + +#[tokio::test] +async fn double_claim_returns_in_progress() { + let cache = DCache::new(); + assert!(matches!( + cache.try_claim_populate(LoadedAddr(1)), + PopulateStatus::Claimed + )); + assert!(matches!( + cache.try_claim_populate(LoadedAddr(1)), + PopulateStatus::InProgress + )); +} + +#[tokio::test] +async fn abort_populate_allows_reclaim() { + let cache = DCache::new(); + assert!(matches!( + cache.try_claim_populate(LoadedAddr(1)), + PopulateStatus::Claimed + )); + cache.abort_populate(LoadedAddr(1)); + assert!(matches!( + cache.try_claim_populate(LoadedAddr(1)), + PopulateStatus::Claimed + )); } #[tokio::test] @@ -70,7 +107,10 @@ async fn insert_does_not_mark_populated() { .insert(LoadedAddr(1), OsString::from("foo"), LoadedAddr(10), false) .await; assert!( - !cache.is_populated(LoadedAddr(1)), + matches!( + cache.try_claim_populate(LoadedAddr(1)), + PopulateStatus::Claimed + ), "insert alone should not mark a directory as populated" ); } From b735ac89dc528a2115291437d2ed29f57b72095a Mon Sep 17 00:00:00 2001 From: Marko 
Vejnovic Date: Fri, 20 Feb 2026 16:52:20 -0800 Subject: [PATCH 25/58] more fixes --- lib/cache/async_backed.rs | 100 +++++++++++++++++++++++++++++++------- lib/fs/async_fs.rs | 74 ++++++++++++++++++++-------- lib/fs/bridge.rs | 36 +++++++++++--- lib/fs/composite.rs | 6 ++- lib/fs/dcache.rs | 20 +++++--- src/fs/mescloud/repo.rs | 49 ++++++++++--------- 6 files changed, 207 insertions(+), 78 deletions(-) diff --git a/lib/cache/async_backed.rs b/lib/cache/async_backed.rs index 273bcd39..f304ca6d 100644 --- a/lib/cache/async_backed.rs +++ b/lib/cache/async_backed.rs @@ -7,6 +7,7 @@ //! Note that this cache does not support automatic eviction. use std::panic::AssertUnwindSafe; +use std::sync::Arc; use std::sync::atomic::{AtomicU64, Ordering}; use std::{fmt::Debug, future::Future, hash::Hash, pin::Pin}; @@ -116,16 +117,20 @@ where /// If the factory returns `Ok(v)`, the value is cached and returned. If it returns `Err(e)`, /// **nothing is cached** and the error is propagated to the caller. /// - /// Unlike `get_or_init`, concurrent callers are **not** deduplicated — each caller that - /// finds the key absent will invoke the factory independently. However, if a value was - /// previously cached (by either `get_or_init` or a successful `get_or_try_init`), it is - /// returned immediately without calling the factory. + /// Concurrent callers for the same key are deduplicated: only one factory invocation runs, + /// and joiners await its shared result. If the factory fails, the poisoned `InFlight` entry + /// is removed and joiners fall through to run their own factory (non-deduplicated retry). + /// + /// # Panics + /// + /// Panics if the factory panics (caught internally via `catch_unwind`). pub async fn get_or_try_init(&self, key: K, factory: F) -> Result where F: FnOnce() -> Fut, Fut: Future> + Send + 'static, + E: Send + 'static, { - // Fast path: value already cached or in-flight from an infallible init. + // Fast path: value already cached or in-flight. 
let existing = self .map .read_async(&key, |_, slot| match slot { @@ -140,30 +145,60 @@ where if let Some(v) = self.await_shared(&key, generation, shared).await { return Ok(v); } - // Factory panicked; entry was removed. Fall through to run our own factory. + // In-flight failed; fall through to slow path. } None => {} } - // Run the fallible factory (not deduplicated). - let val = factory().await?; + // Slow path: claim a slot or join an existing in-flight computation. + // The error side-channel lets the owner retrieve the `Err(e)` from the + // shared future (which only produces `Option`). + let error_cell: Arc>> = Arc::new(std::sync::Mutex::new(None)); - // Attempt to cache. If another caller raced us and already inserted, - // return the existing value and discard ours. - match self.map.entry_async(key).await { + match self.map.entry_async(key.clone()).await { scc::hash_map::Entry::Occupied(occ) => match occ.get() { Slot::Ready(v) => Ok(v.clone()), Slot::InFlight(g, shared) => { - let generation = *g; - Ok(self - .await_shared(occ.key(), generation, shared.clone()) - .await - .unwrap_or(val)) + let (generation, shared) = (*g, shared.clone()); + drop(occ); + if let Some(v) = self.await_shared(&key, generation, shared).await { + return Ok(v); + } + // In-flight failed. We still have `factory` — run it ourselves. + let val = factory().await?; + match self.map.entry_async(key).await { + scc::hash_map::Entry::Occupied(occ) => match occ.get() { + Slot::Ready(v) => Ok(v.clone()), + Slot::InFlight(..) 
=> Ok(val), + }, + scc::hash_map::Entry::Vacant(vac) => { + vac.insert_entry(Slot::Ready(val.clone())); + Ok(val) + } + } } }, scc::hash_map::Entry::Vacant(vac) => { - vac.insert_entry(Slot::Ready(val.clone())); - Ok(val) + let generation = self.next_gen.fetch_add(1, Ordering::Relaxed); + let shared = Self::make_shared_fallible(factory, Arc::clone(&error_cell)); + let ret = shared.clone(); + vac.insert_entry(Slot::InFlight(generation, shared)); + + if let Some(v) = self.await_shared(&key, generation, ret).await { + return Ok(v); + } + // Our factory returned `Err` — retrieve it from the side channel. + let captured = error_cell + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner) + .take(); + match captured { + Some(e) => Err(e), + None => panic!( + "FutureBackedCache: factory for key {key:?} resolved to None \ + but no error was captured (factory panicked)" + ), + } } } } @@ -244,6 +279,35 @@ where boxed.shared() } + /// Like [`make_shared`](Self::make_shared), but for fallible factories. + /// + /// On `Ok(v)`, the shared future resolves to `Some(v)`. On `Err(e)`, the + /// error is captured in `error_cell` and the future resolves to `None`. + fn make_shared_fallible( + factory: F, + error_cell: Arc>>, + ) -> SharedFut + where + F: FnOnce() -> Fut, + Fut: Future> + Send + 'static, + E: Send + 'static, + { + let fut = AssertUnwindSafe(factory()).catch_unwind(); + let boxed: Pin> + Send>> = Box::pin(async move { + match fut.await { + Ok(Ok(v)) => Some(v), + Ok(Err(e)) => { + *error_cell + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner) = Some(e); + None + } + Err(_panic) => None, + } + }); + boxed.shared() + } + /// Returns the number of entries in the cache (both `Ready` and `InFlight`). 
#[must_use] pub fn len(&self) -> usize { diff --git a/lib/fs/async_fs.rs b/lib/fs/async_fs.rs index a13a6617..839cb267 100644 --- a/lib/fs/async_fs.rs +++ b/lib/fs/async_fs.rs @@ -184,6 +184,39 @@ impl InodeLifecycle { } } +/// RAII guard that calls [`DCache::abort_populate`] on drop unless defused. +/// +/// Prevents the populate flag from getting stuck in `IN_PROGRESS` if the +/// populating future is cancelled (e.g. by a FUSE interrupt or `select!`). +struct PopulateGuard<'a> { + dcache: &'a DCache, + parent: LoadedAddr, + armed: bool, +} + +impl<'a> PopulateGuard<'a> { + fn new(dcache: &'a DCache, parent: LoadedAddr) -> Self { + Self { + dcache, + parent, + armed: true, + } + } + + /// Defuse the guard after a successful `finish_populate`. + fn defuse(&mut self) { + self.armed = false; + } +} + +impl Drop for PopulateGuard<'_> { + fn drop(&mut self) { + if self.armed { + self.dcache.abort_populate(self.parent); + } + } +} + /// An asynchronous filesystem cache mapping `InodeAddr` to `INode`. /// /// Uses two [`FutureBackedCache`] layers: @@ -401,28 +434,27 @@ impl<'tbl, DP: FsDataProvider> AsyncFs<'tbl, DP> { loop { match self.directory_cache.try_claim_populate(parent) { PopulateStatus::Claimed => { - match self.data_provider.readdir(parent_inode).await { - Ok(children) => { - for (name, child_inode) in children { - self.inode_table - .get_or_init(child_inode.addr, || async move { child_inode }) - .await; - self.directory_cache - .insert( - parent, - name, - LoadedAddr(child_inode.addr), - child_inode.itype == INodeType::Directory, - ) - .await; - } - self.directory_cache.finish_populate(parent); - } - Err(e) => { - self.directory_cache.abort_populate(parent); - return Err(e); - } + // RAII guard: if this future is cancelled between Claimed + // and finish_populate, automatically abort so other waiters + // can retry instead of hanging forever. 
+ let mut guard = PopulateGuard::new(&self.directory_cache, parent); + + let children = self.data_provider.readdir(parent_inode).await?; + for (name, child_inode) in children { + self.inode_table + .get_or_init(child_inode.addr, || async move { child_inode }) + .await; + self.directory_cache + .insert( + parent, + name, + LoadedAddr(child_inode.addr), + child_inode.itype == INodeType::Directory, + ) + .await; } + self.directory_cache.finish_populate(parent); + guard.defuse(); break; } PopulateStatus::InProgress => { diff --git a/lib/fs/bridge.rs b/lib/fs/bridge.rs index 350d8750..37599388 100644 --- a/lib/fs/bridge.rs +++ b/lib/fs/bridge.rs @@ -1,20 +1,26 @@ -//! Lock-free bidirectional inode address mapping. +//! Bidirectional inode address mapping. //! //! [`ConcurrentBridge`] maps between "outer" (composite) and "inner" (child) -//! inode address spaces using two [`scc::HashMap`]s. +//! inode address spaces using two [`scc::HashMap`]s guarded by a coordination +//! lock for cross-map atomicity. + +use std::sync::Mutex; use crate::fs::InodeAddr; /// Bidirectional inode mapping between outer (composite) and inner (child) address spaces. /// -/// Uses two lock-free `scc::HashMap`s. Insertion order: forward map first, -/// then backward map, so any observer that discovers an outer addr via -/// `backward` can immediately resolve it via `forward`. +/// Uses two concurrent `scc::HashMap`s for lock-free reads. Mutations that +/// touch both maps are serialized by a `Mutex<()>` to prevent cross-map +/// inconsistencies (e.g. a concurrent `remove_by_outer` between the two +/// `insert_sync` calls in `insert` could leave orphaned entries). pub struct ConcurrentBridge { /// outer -> inner fwd: scc::HashMap, /// inner -> outer bwd: scc::HashMap, + /// Serializes mutations that touch both maps. 
+ mu: Mutex<()>, } impl ConcurrentBridge { @@ -24,13 +30,18 @@ impl ConcurrentBridge { Self { fwd: scc::HashMap::new(), bwd: scc::HashMap::new(), + mu: Mutex::new(()), } } /// Insert a mapping from outer to inner. /// - /// Inserts into the forward map first (see module docs for ordering rationale). + /// Serialized with other mutations via the coordination lock. pub fn insert(&self, outer: InodeAddr, inner: InodeAddr) { + let _guard = self + .mu + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); let _ = self.fwd.insert_sync(outer, inner); let _ = self.bwd.insert_sync(inner, outer); } @@ -48,18 +59,23 @@ impl ConcurrentBridge { } /// Look up inner -> outer, or allocate a new outer address if unmapped. + /// + /// Serialized with other mutations via the coordination lock. #[must_use] pub fn backward_or_insert( &self, inner: InodeAddr, allocate: impl FnOnce() -> InodeAddr, ) -> InodeAddr { + let _guard = self + .mu + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); match self.bwd.entry_sync(inner) { scc::hash_map::Entry::Occupied(occ) => *occ.get(), scc::hash_map::Entry::Vacant(vac) => { let outer = allocate(); vac.insert_entry(outer); - // Populate forward map after backward is committed. let _ = self.fwd.insert_sync(outer, inner); outer } @@ -67,7 +83,13 @@ impl ConcurrentBridge { } /// Remove the mapping for the given outer address. + /// + /// Serialized with other mutations via the coordination lock. 
pub fn remove_by_outer(&self, outer: InodeAddr) { + let _guard = self + .mu + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); if let Some((_, inner)) = self.fwd.remove_sync(&outer) { self.bwd.remove_sync(&inner); } diff --git a/lib/fs/composite.rs b/lib/fs/composite.rs index 7d9748c6..abb65fc2 100644 --- a/lib/fs/composite.rs +++ b/lib/fs/composite.rs @@ -263,12 +263,12 @@ impl CompositeFs { { match self.inner.name_to_slot.entry_sync(desc.name.clone()) { scc::hash_map::Entry::Occupied(mut occ) => { - let slot_idx = *occ.get(); + let old_slot_idx = *occ.get(); // Extract bridge Arc from the slot guard, then query outside. let bridge = self .inner .slots - .read_sync(&slot_idx, |_, slot| Arc::clone(&slot.bridge)); + .read_sync(&old_slot_idx, |_, slot| Arc::clone(&slot.bridge)); if let Some(outer) = bridge.and_then(|b| b.backward(desc.root_ino.addr)) { return outer; } @@ -276,6 +276,8 @@ impl CompositeFs { // while still holding the entry guard to prevent races. let (outer_ino, new_slot_idx) = self.create_child_slot(desc); *occ.get_mut() = new_slot_idx; + // Remove the orphaned old slot to prevent unbounded growth. + self.inner.slots.remove_sync(&old_slot_idx); outer_ino } scc::hash_map::Entry::Vacant(vac) => { diff --git a/lib/fs/dcache.rs b/lib/fs/dcache.rs index aea5bb2c..d8778fb8 100644 --- a/lib/fs/dcache.rs +++ b/lib/fs/dcache.rs @@ -2,6 +2,8 @@ use std::ffi::{OsStr, OsString}; use std::sync::Arc; use std::sync::atomic::{AtomicU8, Ordering}; +use tokio::sync::Notify; + use crate::fs::LoadedAddr; /// Cached metadata for a directory entry. @@ -32,6 +34,8 @@ pub enum PopulateStatus { struct DirState { children: scc::HashMap, populated: AtomicU8, + /// Wakes waiters when `populated` transitions out of `IN_PROGRESS`. 
+ notify: Notify, } impl DirState { @@ -39,6 +43,7 @@ impl DirState { Self { children: scc::HashMap::new(), populated: AtomicU8::new(POPULATE_UNCLAIMED), + notify: Notify::new(), } } } @@ -46,7 +51,7 @@ impl DirState { /// In-memory directory entry cache with per-parent child maps. /// /// Each parent directory gets its own [`DirState`] containing a -/// [`scc::HashMap`] of child entries and an [`AtomicBool`] population flag. +/// [`scc::HashMap`] of child entries and an [`AtomicU8`] population flag. /// This makes `readdir` O(k) in the number of children rather than O(n) /// over the entire cache. pub struct DCache { @@ -144,6 +149,7 @@ impl DCache { pub fn finish_populate(&self, parent_ino: LoadedAddr) { let state = self.dir_state(parent_ino); state.populated.store(POPULATE_DONE, Ordering::Release); + state.notify.notify_waiters(); } /// Abort a population attempt, resetting back to unclaimed so another @@ -151,19 +157,21 @@ impl DCache { pub fn abort_populate(&self, parent_ino: LoadedAddr) { let state = self.dir_state(parent_ino); state.populated.store(POPULATE_UNCLAIMED, Ordering::Release); + state.notify.notify_waiters(); } /// Wait until a directory is no longer in the `InProgress` state. + /// + /// Uses [`Notify`] to sleep efficiently instead of spinning. 
pub async fn wait_populated(&self, parent_ino: LoadedAddr) { + let state = self.dir_state(parent_ino); loop { - let current = self - .dirs - .read_sync(&parent_ino, |_, v| v.populated.load(Ordering::Acquire)) - .unwrap_or(POPULATE_UNCLAIMED); + let notified = state.notify.notified(); + let current = state.populated.load(Ordering::Acquire); if current != POPULATE_IN_PROGRESS { return; } - tokio::task::yield_now().await; + notified.await; } } } diff --git a/src/fs/mescloud/repo.rs b/src/fs/mescloud/repo.rs index f2041d10..df0b2dbb 100644 --- a/src/fs/mescloud/repo.rs +++ b/src/fs/mescloud/repo.rs @@ -265,13 +265,15 @@ impl FsDataProvider for MesRepoProvider { .ok_or_else(|| std::io::Error::from_raw_os_error(libc::ENOENT))?; Ok(MesFileReader { - client: inner.client.clone(), - org_name: inner.org_name.clone(), - repo_name: inner.repo_name.clone(), - ref_: inner.ref_.clone(), - path, - file_cache: inner.file_cache.clone(), - inode_addr: inode.addr, + inner: Arc::new(MesFileReaderCtx { + client: inner.client.clone(), + org_name: inner.org_name.clone(), + repo_name: inner.repo_name.clone(), + ref_: inner.ref_.clone(), + path, + file_cache: inner.file_cache.clone(), + inode_addr: inode.addr, + }), }) } } @@ -282,6 +284,10 @@ impl FsDataProvider for MesRepoProvider { } pub struct MesFileReader { + inner: Arc, +} + +struct MesFileReaderCtx { client: MesaClient, org_name: String, repo_name: String, @@ -297,18 +303,12 @@ impl FileReader for MesFileReader { offset: u64, size: u32, ) -> impl Future> + Send { - let client = self.client.clone(); - let org_name = self.org_name.clone(); - let repo_name = self.repo_name.clone(); - let ref_ = self.ref_.clone(); - let path = self.path.clone(); - let file_cache = self.file_cache.clone(); - let inode_addr = self.inode_addr; + let ctx = Arc::clone(&self.inner); async move { // Try the file cache first. 
- if let Some(cache) = &file_cache - && let Some(data) = cache.get(&inode_addr).await + if let Some(cache) = &ctx.file_cache + && let Some(data) = cache.get(&ctx.inode_addr).await { let start = usize::try_from(offset) .unwrap_or(data.len()) @@ -318,7 +318,7 @@ impl FileReader for MesFileReader { } // Cache miss -- fetch from the Mesa API. - let path_str = path.to_str().ok_or_else(|| { + let path_str = ctx.path.to_str().ok_or_else(|| { std::io::Error::new( std::io::ErrorKind::InvalidData, "path contains non-UTF-8 characters", @@ -331,12 +331,13 @@ impl FileReader for MesFileReader { Some(path_str) }; - let content = client - .org(&org_name) + let content = ctx + .client + .org(&ctx.org_name) .repos() - .at(&repo_name) + .at(&ctx.repo_name) .content() - .get(Some(ref_.as_str()), api_path, None) + .get(Some(ctx.ref_.as_str()), api_path, None) .await .map_err(MesaApiError::from) .map_err(mesa_api_error_to_io)?; @@ -360,10 +361,10 @@ impl FileReader for MesFileReader { let result = Bytes::copy_from_slice(&decoded[start..end]); // Store the decoded content in the cache for future reads. 
- if let Some(cache) = &file_cache - && let Err(e) = cache.insert(&inode_addr, decoded).await + if let Some(cache) = &ctx.file_cache + && let Err(e) = cache.insert(&ctx.inode_addr, decoded).await { - warn!(error = ?e, inode_addr, "failed to cache file content"); + warn!(error = ?e, inode_addr = ctx.inode_addr, "failed to cache file content"); } Ok(result) From 7106d6053af6daf6a520191afa364aa37c62a3b9 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Sat, 21 Feb 2026 12:02:21 -0800 Subject: [PATCH 26/58] tests --- lib/cache/async_backed.rs | 95 +++++++++++++++---------------- lib/fs/async_fs.rs | 19 +++++-- tests/async_backed_correctness.rs | 77 +++++++++++++++++++++++++ tests/async_fs_correctness.rs | 42 ++++++++++++++ tests/common/async_fs_mocks.rs | 8 +++ 5 files changed, 186 insertions(+), 55 deletions(-) diff --git a/lib/cache/async_backed.rs b/lib/cache/async_backed.rs index f304ca6d..9c05ee0b 100644 --- a/lib/cache/async_backed.rs +++ b/lib/cache/async_backed.rs @@ -7,7 +7,6 @@ //! Note that this cache does not support automatic eviction. use std::panic::AssertUnwindSafe; -use std::sync::Arc; use std::sync::atomic::{AtomicU64, Ordering}; use std::{fmt::Debug, future::Future, hash::Hash, pin::Pin}; @@ -119,7 +118,9 @@ where /// /// Concurrent callers for the same key are deduplicated: only one factory invocation runs, /// and joiners await its shared result. If the factory fails, the poisoned `InFlight` entry - /// is removed and joiners fall through to run their own factory (non-deduplicated retry). + /// is removed and joiners retry by re-entering the `entry_async` gate, so a single new + /// owner is elected. Joiners never receive the original error — the retrying owner invokes + /// its own factory independently and may produce a different error or succeed. /// /// # Panics /// @@ -151,53 +152,49 @@ where } // Slow path: claim a slot or join an existing in-flight computation. 
- // The error side-channel lets the owner retrieve the `Err(e)` from the - // shared future (which only produces `Option`). - let error_cell: Arc>> = Arc::new(std::sync::Mutex::new(None)); + // Wrapped in `Option` so the `FnOnce` factory can be consumed exactly + // once inside the loop (only in the `Vacant` branch, which always returns). + let mut factory = Some(factory); - match self.map.entry_async(key.clone()).await { - scc::hash_map::Entry::Occupied(occ) => match occ.get() { - Slot::Ready(v) => Ok(v.clone()), - Slot::InFlight(g, shared) => { - let (generation, shared) = (*g, shared.clone()); - drop(occ); - if let Some(v) = self.await_shared(&key, generation, shared).await { - return Ok(v); - } - // In-flight failed. We still have `factory` — run it ourselves. - let val = factory().await?; - match self.map.entry_async(key).await { - scc::hash_map::Entry::Occupied(occ) => match occ.get() { - Slot::Ready(v) => Ok(v.clone()), - Slot::InFlight(..) => Ok(val), - }, - scc::hash_map::Entry::Vacant(vac) => { - vac.insert_entry(Slot::Ready(val.clone())); - Ok(val) + loop { + match self.map.entry_async(key.clone()).await { + scc::hash_map::Entry::Occupied(occ) => match occ.get() { + Slot::Ready(v) => return Ok(v.clone()), + Slot::InFlight(g, shared) => { + let (generation, shared) = (*g, shared.clone()); + drop(occ); + if let Some(v) = self.await_shared(&key, generation, shared).await { + return Ok(v); } + // In-flight failed. Loop back to `entry_async` so the + // next caller gets proper dedup instead of running + // factory directly. 
} - } - }, - scc::hash_map::Entry::Vacant(vac) => { - let generation = self.next_gen.fetch_add(1, Ordering::Relaxed); - let shared = Self::make_shared_fallible(factory, Arc::clone(&error_cell)); - let ret = shared.clone(); - vac.insert_entry(Slot::InFlight(generation, shared)); + }, + scc::hash_map::Entry::Vacant(vac) => { + let f = factory.take().unwrap_or_else(|| { + unreachable!( + "FutureBackedCache: factory already consumed but \ + reached Vacant branch again for key {key:?}" + ) + }); + let generation = self.next_gen.fetch_add(1, Ordering::Relaxed); + let (error_tx, mut error_rx) = tokio::sync::oneshot::channel(); + let shared = Self::make_shared_fallible(f, error_tx); + let ret = shared.clone(); + vac.insert_entry(Slot::InFlight(generation, shared)); - if let Some(v) = self.await_shared(&key, generation, ret).await { - return Ok(v); - } - // Our factory returned `Err` — retrieve it from the side channel. - let captured = error_cell - .lock() - .unwrap_or_else(std::sync::PoisonError::into_inner) - .take(); - match captured { - Some(e) => Err(e), - None => panic!( - "FutureBackedCache: factory for key {key:?} resolved to None \ - but no error was captured (factory panicked)" - ), + if let Some(v) = self.await_shared(&key, generation, ret).await { + return Ok(v); + } + // Our factory returned `Err` — retrieve it from the channel. + return match error_rx.try_recv().ok() { + Some(e) => Err(e), + None => panic!( + "FutureBackedCache: factory for key {key:?} resolved to None \ + but no error was captured (factory panicked)" + ), + }; } } } @@ -282,10 +279,10 @@ where /// Like [`make_shared`](Self::make_shared), but for fallible factories. /// /// On `Ok(v)`, the shared future resolves to `Some(v)`. On `Err(e)`, the - /// error is captured in `error_cell` and the future resolves to `None`. + /// error is sent through `error_tx` and the future resolves to `None`. 
fn make_shared_fallible( factory: F, - error_cell: Arc>>, + error_tx: tokio::sync::oneshot::Sender, ) -> SharedFut where F: FnOnce() -> Fut, @@ -297,9 +294,7 @@ where match fut.await { Ok(Ok(v)) => Some(v), Ok(Err(e)) => { - *error_cell - .lock() - .unwrap_or_else(std::sync::PoisonError::into_inner) = Some(e); + drop(error_tx.send(e)); None } Err(_panic) => None, diff --git a/lib/fs/async_fs.rs b/lib/fs/async_fs.rs index 839cb267..061f974b 100644 --- a/lib/fs/async_fs.rs +++ b/lib/fs/async_fs.rs @@ -323,12 +323,21 @@ impl<'tbl, DP: FsDataProvider> AsyncFs<'tbl, DP> { "parent inode should be a directory" ); - if let Some(dentry) = self.directory_cache.lookup(parent, name) - && let Some(inode) = self.inode_table.get(&dentry.ino.0).await - { - return Ok(TrackedINode { inode }); + if let Some(dentry) = self.directory_cache.lookup(parent, name) { + if let Some(inode) = self.inode_table.get(&dentry.ino.0).await { + return Ok(TrackedINode { inode }); + } + // Inode was evicted (e.g. by forget). Evict the stale lookup_cache + // entry so the slow path calls dp.lookup() fresh. + // + // Note: a concurrent task may re-insert into lookup_cache between + // our inode_table miss and this remove_sync. This is benign — it + // causes at most one redundant dp.lookup() call because all + // downstream operations (get_or_try_init, get_or_init) are + // idempotent or deduplicated. + self.lookup_cache + .remove_sync(&(parent.0, name.to_os_string())); } - // Inode was evicted from the table — fall through to the slow path. 
let name_owned = name.to_os_string(); let lookup_key = (parent.0, name_owned.clone()); diff --git a/tests/async_backed_correctness.rs b/tests/async_backed_correctness.rs index 457ba948..097226aa 100644 --- a/tests/async_backed_correctness.rs +++ b/tests/async_backed_correctness.rs @@ -3,6 +3,8 @@ use std::sync::Arc; use std::sync::atomic::{AtomicUsize, Ordering}; +use tokio::sync::oneshot; + use git_fs::cache::async_backed::FutureBackedCache; #[tokio::test] @@ -97,3 +99,78 @@ async fn panic_in_factory_is_recovered() { "factory called twice" ); } + +/// With 3+ joiners the dedup property becomes observable: under the old +/// broken code each joiner would run its own factory after the owner fails +/// (4 total calls for 1 owner + 3 joiners). With the loop-based retry only +/// one joiner wins the `Vacant` race, so we expect exactly 2 calls +/// (A's fail + one winner's success). +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +async fn try_init_retry_after_joined_failure_deduplicates() { + let cache = Arc::new(FutureBackedCache::::default()); + let call_count = Arc::new(AtomicUsize::new(0)); + + // Channel to control timing of Task A's factory. + let (release_tx, release_rx) = oneshot::channel::<()>(); + + // Task A: starts a failing InFlight, held until we release. + let cache_a = Arc::clone(&cache); + let count_a = Arc::clone(&call_count); + let task_a = tokio::spawn(async move { + let result: Result = cache_a + .get_or_try_init(1, || { + count_a.fetch_add(1, Ordering::Relaxed); + async move { + let _ = release_rx.await; + Err("task_a_fail".to_owned()) + } + }) + .await; + result + }); + + // Give Task A time to register the InFlight slot. + tokio::time::sleep(std::time::Duration::from_millis(50)).await; + + // Spawn 3 joiners that all join A's InFlight. After A fails, exactly + // one should win the Vacant race and run its factory; the others join + // the new InFlight. 
+ let mut joiner_handles = Vec::new(); + for _ in 0..3 { + let cache_j = Arc::clone(&cache); + let count_j = Arc::clone(&call_count); + joiner_handles.push(tokio::spawn(async move { + let result: Result = cache_j + .get_or_try_init(1, || { + count_j.fetch_add(1, Ordering::Relaxed); + async move { Ok("joiner_ok".to_owned()) } + }) + .await; + result + })); + } + + // Give joiners time to join the InFlight. + tokio::time::sleep(std::time::Duration::from_millis(50)).await; + + // Release A's factory → it fails. + release_tx.send(()).unwrap(); + + let result_a = task_a.await.unwrap(); + assert!(result_a.is_err(), "task A should fail"); + + for handle in joiner_handles { + let result = handle.await.unwrap(); + assert_eq!(result.unwrap(), "joiner_ok", "every joiner should succeed"); + } + + // Factory should have been called exactly 2 times: A's fail + one + // joiner winning the Vacant race. The other 2 joiners piggyback on + // the winner's InFlight via Shared, so their factories are never called. 
+ assert_eq!( + call_count.load(Ordering::Relaxed), + 2, + "factory should be called exactly twice (A's fail + one joiner's success), \ + not 4 (which would indicate each joiner ran its own factory)" + ); +} diff --git a/tests/async_fs_correctness.rs b/tests/async_fs_correctness.rs index 5fe27a28..fd6c2bdc 100644 --- a/tests/async_fs_correctness.rs +++ b/tests/async_fs_correctness.rs @@ -3,6 +3,7 @@ mod common; use std::ffi::{OsStr, OsString}; +use std::sync::Arc; use git_fs::cache::async_backed::FutureBackedCache; use git_fs::fs::async_fs::{AsyncFs, InodeLifecycle}; @@ -579,6 +580,47 @@ async fn readdir_provides_correct_next_offsets() { ); } +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn lookup_after_eviction_returns_fresh_inode() { + let root = make_inode(1, INodeType::Directory, 0, None); + let child_v1 = make_inode(10, INodeType::File, 42, Some(1)); + let child_v2 = make_inode(20, INodeType::File, 99, Some(1)); + + let mut state = MockFsState::default(); + state.lookups.insert((1, "readme.md".into()), child_v1); + let dp = MockFsDataProvider::new(state); + let state_ref = Arc::clone(&dp.state); + + let table = FutureBackedCache::default(); + let fs = AsyncFs::new(dp, root, &table).await; + + // First lookup → addr=10 + let first = fs + .lookup(LoadedAddr(1), OsStr::new("readme.md")) + .await + .unwrap(); + assert_eq!(first.inode.addr, 10); + + // Simulate forget: remove the inode from the table. + table.remove_sync(&10); + + // Insert the refresh entry *after* the first lookup so dp.lookup() + // returns child_v2 on the next call (refresh_lookups is checked first). + drop( + state_ref + .refresh_lookups + .insert_sync((1, "readme.md".into()), child_v2), + ); + + // Second lookup should NOT return the stale addr=10. 
+ let second = fs + .lookup(LoadedAddr(1), OsStr::new("readme.md")) + .await + .unwrap(); + assert_ne!(second.inode.addr, 10, "should not return stale inode"); + assert_eq!(second.inode.addr, 20, "should return the fresh inode"); +} + // lookup-after-readdir integration test #[tokio::test(flavor = "multi_thread", worker_threads = 2)] diff --git a/tests/common/async_fs_mocks.rs b/tests/common/async_fs_mocks.rs index 5c132eec..4441544c 100644 --- a/tests/common/async_fs_mocks.rs +++ b/tests/common/async_fs_mocks.rs @@ -52,6 +52,10 @@ pub struct MockFsState { pub directories: HashMap>, /// `inode_addr -> file content bytes` pub file_contents: HashMap, + /// Mutable overrides for `lookups`. When populated, entries here take + /// precedence and are consumed on use (removed after the first hit). + /// Existing tests are unaffected because this defaults to empty. + pub refresh_lookups: scc::HashMap<(u64, OsString), INode>, } /// A clonable mock data provider for `AsyncFs` tests. @@ -73,6 +77,10 @@ impl FsDataProvider for MockFsDataProvider { async fn lookup(&self, parent: INode, name: &OsStr) -> Result { let key = (parent.addr, name.to_os_string()); + // Check mutable overrides first (consumed on use). 
+ if let Some((_, inode)) = self.state.refresh_lookups.remove_sync(&key) { + return Ok(inode); + } self.state .lookups .get(&key) From 61f5f3082067caf0d6955d3547b22e1af025f2d6 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Sat, 21 Feb 2026 12:24:23 -0800 Subject: [PATCH 27/58] more docs --- lib/fs/async_fs.rs | 22 +++++++++++++++++----- lib/fs/composite.rs | 3 +++ src/fs/mescloud/repo.rs | 17 +++++++++++++++-- 3 files changed, 35 insertions(+), 7 deletions(-) diff --git a/lib/fs/async_fs.rs b/lib/fs/async_fs.rs index 061f974b..7b5b8d2f 100644 --- a/lib/fs/async_fs.rs +++ b/lib/fs/async_fs.rs @@ -65,23 +65,35 @@ pub trait FsDataProvider: Clone + Send + Sync + 'static { flags: OpenFlags, ) -> impl Future> + Send; - /// Called when the kernel forgets an inode (refcount reaches zero). + /// Clean up provider-internal state for an evicted inode. /// - /// Implementations should clean up any internal mappings for the given - /// address (e.g. bridge maps, path maps). The default is a no-op. + /// The `DropWard`/`InodeForget` system automatically removes inodes from + /// the shared `inode_table` when the FUSE refcount reaches zero, but data + /// providers often maintain auxiliary structures (path maps, bridge maps) + /// that also need cleanup. This method is that extension point. + /// + /// Never called directly -- [`InodeForget::delete`] invokes it + /// automatically when the refcount drops to zero. fn forget(&self, _addr: InodeAddr) {} } -/// Zero-sized tag whose [`StatelessDrop`] implementation automatically evicts -/// an inode from the inode table when its reference count reaches zero. +/// Zero-sized cleanup tag for inode eviction. +/// +/// The [`StatelessDrop`] implementations on this type evict inodes from the +/// inode table and, when a data provider is present, delegate to +/// [`FsDataProvider::forget`] so the provider can clean up its own auxiliary +/// structures (path maps, bridge maps, etc.). 
pub struct InodeForget; +/// Evicts the inode from the table only. Used when no data provider is available. impl<'a> StatelessDrop<&'a FutureBackedCache, InodeAddr> for InodeForget { fn delete(inode_table: &&'a FutureBackedCache, addr: &InodeAddr) { inode_table.remove_sync(addr); } } +/// Evicts the inode from the table and delegates to [`FsDataProvider::forget`] +/// so the provider can clean up its own auxiliary state. impl<'a, DP: FsDataProvider> StatelessDrop<(&'a FutureBackedCache, DP), InodeAddr> for InodeForget { diff --git a/lib/fs/composite.rs b/lib/fs/composite.rs index abb65fc2..ff00c8d3 100644 --- a/lib/fs/composite.rs +++ b/lib/fs/composite.rs @@ -437,6 +437,9 @@ where }) } + /// Removes the composite-level address from `addr_to_slot` and the + /// child's bridge map. Called automatically by `InodeForget` when the + /// FUSE refcount drops to zero. The root inode is never forgotten. fn forget(&self, addr: InodeAddr) { if addr == Self::ROOT_INO { return; diff --git a/src/fs/mescloud/repo.rs b/src/fs/mescloud/repo.rs index df0b2dbb..aae85491 100644 --- a/src/fs/mescloud/repo.rs +++ b/src/fs/mescloud/repo.rs @@ -35,8 +35,19 @@ struct MesRepoProviderInner { ref_: String, fs_owner: (u32, u32), next_addr: AtomicU64, - /// Maps inode addresses to repo-relative paths (e.g., "src/main.rs"). - /// Root directory maps to an empty `PathBuf`. + /// Maps inode addresses to repo-relative paths (e.g. `"src/main.rs"`). + /// Root maps to an empty `PathBuf`. + /// + /// Exists alongside the [`DCache`](git_fs::fs::dcache::DCache) because + /// they serve different purposes: the dcache maps + /// `(parent_addr, child_name) -> child_addr` (single-hop name resolution), + /// while this map provides the full repo-relative path needed for Mesa API + /// calls. Reconstructing the full path from the dcache would require + /// walking parent pointers to the root on every API call; this map + /// materializes that walk as an O(1) lookup. 
+ /// + /// Entries are inserted during `lookup`/`readdir` and removed via + /// [`forget`](Self::remove_path) when the FUSE refcount reaches zero. path_map: scc::HashMap, file_cache: Option>>, } @@ -278,6 +289,8 @@ impl FsDataProvider for MesRepoProvider { } } + /// Evicts the inode's entry from [`path_map`](MesRepoProviderInner::path_map). + /// Called automatically by `InodeForget` when the FUSE refcount drops to zero. fn forget(&self, addr: InodeAddr) { self.remove_path(addr); } From e890c3da1ea99bb704679e03e3d2a16694b16311 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Sat, 21 Feb 2026 13:48:08 -0800 Subject: [PATCH 28/58] thread-safety on forget --- lib/cache/async_backed.rs | 63 ++++++++++++++++++++++++++------------- lib/drop_ward.rs | 5 +++- lib/fs/bridge.rs | 17 +++++++++++ lib/fs/composite.rs | 24 ++++++++++++--- 4 files changed, 83 insertions(+), 26 deletions(-) diff --git a/lib/cache/async_backed.rs b/lib/cache/async_backed.rs index 9c05ee0b..d8989cf8 100644 --- a/lib/cache/async_backed.rs +++ b/lib/cache/async_backed.rs @@ -62,8 +62,10 @@ where /// /// # Panics /// - /// Panics if this caller joins an in-flight factory that itself panicked (i.e. the caller - /// lost the race to insert a fresh entry after the poisoned slot was removed). + /// Panics only if *this* caller's own factory panicked (i.e. this caller won the `Vacant` + /// slot and the factory it spawned panicked). Joiners who observe a panicked factory loop + /// back to `entry_async` so a new owner is elected, matching the retry semantics of + /// [`get_or_try_init`](Self::get_or_try_init). pub async fn get_or_init(&self, key: K, factory: F) -> V where F: FnOnce() -> Fut, @@ -84,31 +86,50 @@ where if let Some(v) = self.await_shared(&key, generation, shared).await { return v; } - // Factory panicked; entry removed. Fall through to re-insert below. + // Factory panicked; entry removed. Fall through to slow path. 
} None => {} } - // Slow path: use entry_async for atomic check-and-insert. - let (generation, shared) = match self.map.entry_async(key.clone()).await { - scc::hash_map::Entry::Occupied(occ) => match occ.get() { - Slot::Ready(v) => return v.clone(), - Slot::InFlight(g, shared) => (*g, shared.clone()), - }, - scc::hash_map::Entry::Vacant(vac) => { - let generation = self.next_gen.fetch_add(1, Ordering::Relaxed); - let shared = Self::make_shared(factory); - let ret = shared.clone(); - vac.insert_entry(Slot::InFlight(generation, shared)); - (generation, ret) - } - }; + // Slow path: claim a slot or join an existing in-flight computation. + // Wrapped in `Option` so the `FnOnce` factory can be consumed exactly + // once inside the loop (only in the `Vacant` branch, which always returns). + let mut factory = Some(factory); - if let Some(v) = self.await_shared(&key, generation, shared).await { - return v; - } + loop { + match self.map.entry_async(key.clone()).await { + scc::hash_map::Entry::Occupied(occ) => match occ.get() { + Slot::Ready(v) => return v.clone(), + Slot::InFlight(g, shared) => { + let (generation, shared) = (*g, shared.clone()); + drop(occ); + if let Some(v) = self.await_shared(&key, generation, shared).await { + return v; + } + // In-flight failed. Loop back to `entry_async` so the + // next caller gets proper dedup instead of running + // factory directly. 
+ } + }, + scc::hash_map::Entry::Vacant(vac) => { + let f = factory.take().unwrap_or_else(|| { + unreachable!( + "FutureBackedCache: factory already consumed but \ + reached Vacant branch again for key {key:?}" + ) + }); + let generation = self.next_gen.fetch_add(1, Ordering::Relaxed); + let shared = Self::make_shared(f); + let ret = shared.clone(); + vac.insert_entry(Slot::InFlight(generation, shared)); - panic!("FutureBackedCache: joined an in-flight factory that panicked for key {key:?}"); + if let Some(v) = self.await_shared(&key, generation, ret).await { + return v; + } + panic!("FutureBackedCache: factory for key {key:?} panicked"); + } + } + } } /// Like [`get_or_init`](Self::get_or_init), but for fallible factories. diff --git a/lib/drop_ward.rs b/lib/drop_ward.rs index 4922e13c..848d1dfb 100644 --- a/lib/drop_ward.rs +++ b/lib/drop_ward.rs @@ -107,8 +107,11 @@ where let curr = *self.map.get(key)?; let new_count = curr.saturating_sub(by); if new_count == 0 { - self.map.remove(key); + // Delete before removing from the map: if `delete` panics the + // entry remains and a subsequent `dec` can retry cleanup. The + // reverse order would silently lose the entry. T::delete(&self.ctx, key); + self.map.remove(key); } else if let Some(slot) = self.map.get_mut(key) { *slot = new_count; } diff --git a/lib/fs/bridge.rs b/lib/fs/bridge.rs index 37599388..5b5354d9 100644 --- a/lib/fs/bridge.rs +++ b/lib/fs/bridge.rs @@ -47,12 +47,29 @@ impl ConcurrentBridge { } /// Resolve outer -> inner. + /// + /// This read is **not** serialized with mutations. A concurrent [`insert`] + /// may have completed the forward entry but not yet the backward entry (or + /// vice versa for [`remove_by_outer`]). Callers must tolerate stale or + /// transiently-missing results. Use [`backward_or_insert`] when + /// cross-map consistency is required. 
+ /// + /// [`insert`]: Self::insert + /// [`remove_by_outer`]: Self::remove_by_outer + /// [`backward_or_insert`]: Self::backward_or_insert #[must_use] pub fn forward(&self, outer: InodeAddr) -> Option { self.fwd.read_sync(&outer, |_, &v| v) } /// Resolve inner -> outer. + /// + /// This read is **not** serialized with mutations. See [`forward`] for + /// the consistency caveats. Use [`backward_or_insert`] when cross-map + /// consistency is required. + /// + /// [`forward`]: Self::forward + /// [`backward_or_insert`]: Self::backward_or_insert #[must_use] pub fn backward(&self, inner: InodeAddr) -> Option { self.bwd.read_sync(&inner, |_, &v| v) diff --git a/lib/fs/composite.rs b/lib/fs/composite.rs index ff00c8d3..957836b1 100644 --- a/lib/fs/composite.rs +++ b/lib/fs/composite.rs @@ -138,6 +138,10 @@ struct CompositeFsInner { /// Maps a composite-level outer inode to its child slot index. addr_to_slot: scc::HashMap, /// Maps child name to slot index (for dedup on concurrent resolve). + /// + /// `register_child` uses `entry_sync` on this map for per-name + /// exclusion, serializing concurrent registrations of the same child + /// without a global lock. `forget` never touches this map. name_to_slot: scc::HashMap, /// Monotonically increasing slot counter. next_slot: AtomicU64, @@ -257,6 +261,13 @@ impl CompositeFs { /// If the child is already registered by name, the existing outer address /// is returned. Otherwise a new slot is created with a fresh inode table /// and bridge mapping. + /// + /// Uses `entry_sync` on `name_to_slot` for per-name exclusion: + /// concurrent registrations of the same child are serialized by the + /// `scc::HashMap` bucket lock, while different names proceed in + /// parallel. `forget` never touches `name_to_slot` and is fully + /// independent — outer inode addresses are monotonic and never reused, + /// so `forget` cannot corrupt a replacement slot. 
fn register_child(&self, desc: &ChildDescriptor) -> InodeAddr where R::ChildDP: Clone, @@ -264,7 +275,6 @@ impl CompositeFs { match self.inner.name_to_slot.entry_sync(desc.name.clone()) { scc::hash_map::Entry::Occupied(mut occ) => { let old_slot_idx = *occ.get(); - // Extract bridge Arc from the slot guard, then query outside. let bridge = self .inner .slots @@ -272,11 +282,9 @@ impl CompositeFs { if let Some(outer) = bridge.and_then(|b| b.backward(desc.root_ino.addr)) { return outer; } - // Slot exists but bridge has no mapping — replace in-place - // while still holding the entry guard to prevent races. + // Slot exists but bridge has no mapping — replace it. let (outer_ino, new_slot_idx) = self.create_child_slot(desc); *occ.get_mut() = new_slot_idx; - // Remove the orphaned old slot to prevent unbounded growth. self.inner.slots.remove_sync(&old_slot_idx); outer_ino } @@ -440,6 +448,14 @@ where /// Removes the composite-level address from `addr_to_slot` and the /// child's bridge map. Called automatically by `InodeForget` when the /// FUSE refcount drops to zero. The root inode is never forgotten. + /// + /// Lock-free with respect to [`register_child`](CompositeFs::register_child): + /// outer inode addresses are monotonically increasing and never reused, + /// so `forget(addr)` can only affect the slot that originally owned + /// `addr`. If a concurrent `register_child` has already replaced the + /// slot, `slots.read_sync` returns `None` and the bridge cleanup is + /// skipped — the old slot's `Arc` is dropped with its + /// `Arc` refcount. 
fn forget(&self, addr: InodeAddr) { if addr == Self::ROOT_INO { return; From 08d51c64686a7140188e0c3c09683590e2a83cbe Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Sat, 21 Feb 2026 14:09:43 -0800 Subject: [PATCH 29/58] TOCTOU fixes --- lib/fs/async_fs.rs | 26 +++++++---- lib/fs/bridge.rs | 8 +++- lib/fs/composite.rs | 19 +++++--- lib/fs/fuser.rs | 31 ++++++++++--- lib/fs/mod.rs | 29 +++++++++++- tests/async_fs_correctness.rs | 78 ++++++++++++++++++++----------- tests/composite_fs_tests.rs | 54 +++++++++++++++------- tests/dcache_correctness.rs | 87 +++++++++++++++++++++++++---------- 8 files changed, 242 insertions(+), 90 deletions(-) diff --git a/lib/fs/async_fs.rs b/lib/fs/async_fs.rs index 7b5b8d2f..625f3f8c 100644 --- a/lib/fs/async_fs.rs +++ b/lib/fs/async_fs.rs @@ -222,6 +222,11 @@ impl<'a> PopulateGuard<'a> { } impl Drop for PopulateGuard<'_> { + /// Fires when the populating future is cancelled before [`defuse`](Self::defuse) + /// is called, resetting the dcache populate flag from `IN_PROGRESS` back to + /// `UNCLAIMED` so a subsequent `readdir` can retry. This is a normal + /// occurrence under FUSE interrupts or `tokio::select!` cancellation — + /// not an error. fn drop(&mut self) { if self.armed { self.dcache.abort_populate(self.parent); @@ -336,7 +341,7 @@ impl<'tbl, DP: FsDataProvider> AsyncFs<'tbl, DP> { ); if let Some(dentry) = self.directory_cache.lookup(parent, name) { - if let Some(inode) = self.inode_table.get(&dentry.ino.0).await { + if let Some(inode) = self.inode_table.get(&dentry.ino.addr()).await { return Ok(TrackedINode { inode }); } // Inode was evicted (e.g. by forget). Evict the stale lookup_cache @@ -348,11 +353,11 @@ impl<'tbl, DP: FsDataProvider> AsyncFs<'tbl, DP> { // downstream operations (get_or_try_init, get_or_init) are // idempotent or deduplicated. 
self.lookup_cache - .remove_sync(&(parent.0, name.to_os_string())); + .remove_sync(&(parent.addr(), name.to_os_string())); } let name_owned = name.to_os_string(); - let lookup_key = (parent.0, name_owned.clone()); + let lookup_key = (parent.addr(), name_owned.clone()); let dp = self.data_provider.clone(); let child = self @@ -371,7 +376,7 @@ impl<'tbl, DP: FsDataProvider> AsyncFs<'tbl, DP> { .insert( parent, name_owned, - LoadedAddr(child.addr), + LoadedAddr::new_unchecked(child.addr), matches!(child.itype, INodeType::Directory), ) .await; @@ -384,9 +389,9 @@ impl<'tbl, DP: FsDataProvider> AsyncFs<'tbl, DP> { /// If the inode is currently in-flight (being loaded by another caller), this awaits /// completion. Returns an error if the inode is not in the table at all. pub async fn loaded_inode(&self, addr: LoadedAddr) -> Result { - self.inode_table.get(&addr.0).await.ok_or_else(|| { + self.inode_table.get(&addr.addr()).await.ok_or_else(|| { tracing::error!( - inode = ?addr.0, + inode = ?addr.addr(), "inode not found in table — this is a programming bug" ); std::io::Error::from_raw_os_error(libc::ENOENT) @@ -469,7 +474,7 @@ impl<'tbl, DP: FsDataProvider> AsyncFs<'tbl, DP> { .insert( parent, name, - LoadedAddr(child_inode.addr), + LoadedAddr::new_unchecked(child_inode.addr), child_inode.itype == INodeType::Directory, ) .await; @@ -494,7 +499,12 @@ impl<'tbl, DP: FsDataProvider> AsyncFs<'tbl, DP> { reason = "offset fits in usize on supported 64-bit platforms" )] for (i, (name, dvalue)) in children.iter().enumerate().skip(offset as usize) { - let inode = self.loaded_inode(dvalue.ino).await?; + let Some(inode) = self.inode_table.get(&dvalue.ino.addr()).await else { + // Inode was evicted between readdir collection and iteration + // (e.g. by a concurrent forget). Skip the stale entry. 
+ tracing::debug!(addr = ?dvalue.ino.addr(), name = ?name, "inode evicted during readdir, skipping"); + continue; + }; let next_offset = (i + 1) as u64; if filler(DirEntry { name, inode }, next_offset) { break; diff --git a/lib/fs/bridge.rs b/lib/fs/bridge.rs index 5b5354d9..b0366cfd 100644 --- a/lib/fs/bridge.rs +++ b/lib/fs/bridge.rs @@ -101,8 +101,11 @@ impl ConcurrentBridge { /// Remove the mapping for the given outer address. /// - /// Serialized with other mutations via the coordination lock. - pub fn remove_by_outer(&self, outer: InodeAddr) { + /// Returns `true` if the bridge is empty after the removal — the caller + /// can use this to garbage-collect the owning slot. The emptiness check + /// is performed under the coordination lock so there is no TOCTOU gap + /// with the removal itself. + pub fn remove_by_outer(&self, outer: InodeAddr) -> bool { let _guard = self .mu .lock() @@ -110,6 +113,7 @@ impl ConcurrentBridge { if let Some((_, inner)) = self.fwd.remove_sync(&outer) { self.bwd.remove_sync(&inner); } + self.fwd.is_empty() } } diff --git a/lib/fs/composite.rs b/lib/fs/composite.rs index 957836b1..3ba0f5f7 100644 --- a/lib/fs/composite.rs +++ b/lib/fs/composite.rs @@ -341,7 +341,7 @@ where // Await the lookup outside any scc guard. 
let tracked = child .get_fs() - .lookup(LoadedAddr(inner_parent), name) + .lookup(LoadedAddr::new_unchecked(inner_parent), name) .await?; let child_inode = tracked.inode; @@ -394,7 +394,7 @@ where let mut child_entries = Vec::new(); child .get_fs() - .readdir(LoadedAddr(inner_parent), 0, |de, _offset| { + .readdir(LoadedAddr::new_unchecked(inner_parent), 0, |de, _offset| { child_entries.push((de.name.to_os_string(), de.inode)); false }) @@ -437,8 +437,10 @@ where let inner_ino = inner_ino.ok_or_else(|| std::io::Error::from_raw_os_error(libc::ENOENT))?; - let open_file: OpenFile<<::ChildDP as FsDataProvider>::Reader> = - child.get_fs().open(LoadedAddr(inner_ino), flags).await?; + let open_file: OpenFile<<::ChildDP as FsDataProvider>::Reader> = child + .get_fs() + .open(LoadedAddr::new_unchecked(inner_ino), flags) + .await?; Ok(CompositeReader { inner: open_file.reader, @@ -461,9 +463,14 @@ where return; } if let Some((_, slot_idx)) = self.inner.addr_to_slot.remove_sync(&addr) { - self.inner + let bridge_empty = self + .inner .slots - .read_sync(&slot_idx, |_, slot| slot.bridge.remove_by_outer(addr)); + .read_sync(&slot_idx, |_, slot| slot.bridge.remove_by_outer(addr)) + .unwrap_or(false); + if bridge_empty { + self.inner.slots.remove_sync(&slot_idx); + } } } } diff --git a/lib/fs/fuser.rs b/lib/fs/fuser.rs index 7a9bed24..15fa36f7 100644 --- a/lib/fs/fuser.rs +++ b/lib/fs/fuser.rs @@ -210,7 +210,11 @@ impl fuser::Filesystem for FuserAdapter { ) { self.runtime .block_on(async { - let tracked = self.inner.get_fs().lookup(LoadedAddr(parent), name).await?; + let tracked = self + .inner + .get_fs() + .lookup(LoadedAddr::new_unchecked(parent), name) + .await?; self.inner.ward_inc(tracked.inode.addr); Ok::<_, std::io::Error>(tracked.inode) }) @@ -230,7 +234,12 @@ impl fuser::Filesystem for FuserAdapter { reply: fuser::ReplyAttr, ) { self.runtime - .block_on(async { self.inner.get_fs().getattr(LoadedAddr(ino)).await }) + .block_on(async { + self.inner + .get_fs() + 
.getattr(LoadedAddr::new_unchecked(ino)) + .await + }) .fuse_reply(reply, |inode, reply| { let attr = inode_to_fuser_attr(&inode, BLOCK_SIZE); debug!(?attr, "replying..."); @@ -253,10 +262,14 @@ impl fuser::Filesystem for FuserAdapter { let mut entries = Vec::new(); self.inner .get_fs() - .readdir(LoadedAddr(ino), offset_u64, |de, _next_offset| { - entries.push((de.inode.addr, de.name.to_os_string(), de.inode.itype)); - false - }) + .readdir( + LoadedAddr::new_unchecked(ino), + offset_u64, + |de, _next_offset| { + entries.push((de.inode.addr, de.name.to_os_string(), de.inode.itype)); + false + }, + ) .await?; Ok::<_, std::io::Error>(entries) }) @@ -291,7 +304,11 @@ impl fuser::Filesystem for FuserAdapter { let flags = OpenFlags::from_bits_truncate(flags); self.runtime .block_on(async { - let open_file = self.inner.get_fs().open(LoadedAddr(ino), flags).await?; + let open_file = self + .inner + .get_fs() + .open(LoadedAddr::new_unchecked(ino), flags) + .await?; let fh = open_file.fh; self.open_files.insert(fh, Arc::clone(&open_file.reader)); Ok::<_, std::io::Error>(fh) diff --git a/lib/fs/mod.rs b/lib/fs/mod.rs index ed93bd25..02ef8384 100644 --- a/lib/fs/mod.rs +++ b/lib/fs/mod.rs @@ -24,8 +24,35 @@ pub type InodeAddr = u64; /// /// This newtype wrapper distinguishes inode addresses that are known to exist /// in the [`async_fs::AsyncFs`] inode table from raw [`InodeAddr`] values. +/// +/// The inner field is private to prevent unchecked construction. Code within +/// the crate may use [`LoadedAddr::new_unchecked`] at trusted boundaries +/// (e.g. after inserting into the inode table, or at the FUSE adapter boundary +/// where the kernel provides addresses it previously received from us). #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] -pub struct LoadedAddr(pub InodeAddr); +pub struct LoadedAddr(InodeAddr); + +impl LoadedAddr { + /// Construct a `LoadedAddr` without validating that the address exists in + /// the inode table. 
+ /// + /// # Safety contract (logical, not `unsafe`) + /// + /// The caller must ensure one of: + /// - The address was previously inserted into an inode table, **or** + /// - The address originates from the FUSE kernel (which only knows + /// addresses we previously returned to it). + #[must_use] + pub fn new_unchecked(addr: InodeAddr) -> Self { + Self(addr) + } + + /// Return the raw inode address. + #[must_use] + pub fn addr(self) -> InodeAddr { + self.0 + } +} /// Type representing a file handle. pub type FileHandle = u64; diff --git a/tests/async_fs_correctness.rs b/tests/async_fs_correctness.rs index fd6c2bdc..643e7b1e 100644 --- a/tests/async_fs_correctness.rs +++ b/tests/async_fs_correctness.rs @@ -172,7 +172,7 @@ async fn loaded_inode_returns_seeded_inode() { let fs = AsyncFs::new(dp, root, &table).await; - let inode = fs.loaded_inode(LoadedAddr(1)).await.unwrap(); + let inode = fs.loaded_inode(LoadedAddr::new_unchecked(1)).await.unwrap(); assert_eq!(inode.addr, 1); assert_eq!(inode.itype, INodeType::Directory); } @@ -185,7 +185,10 @@ async fn loaded_inode_returns_enoent_for_missing_addr() { let fs = AsyncFs::new(dp, root, &table).await; - let err = fs.loaded_inode(LoadedAddr(999)).await.unwrap_err(); + let err = fs + .loaded_inode(LoadedAddr::new_unchecked(999)) + .await + .unwrap_err(); assert_eq!(err.raw_os_error(), Some(libc::ENOENT)); } @@ -197,7 +200,7 @@ async fn getattr_delegates_to_loaded_inode() { let fs = AsyncFs::new(dp, root, &table).await; - let inode = fs.getattr(LoadedAddr(1)).await.unwrap(); + let inode = fs.getattr(LoadedAddr::new_unchecked(1)).await.unwrap(); assert_eq!(inode.addr, 1); assert_eq!(inode.size, 4096); } @@ -215,7 +218,7 @@ async fn lookup_resolves_child_via_data_provider() { let fs = AsyncFs::new(dp, root, &table).await; let tracked = fs - .lookup(LoadedAddr(1), OsStr::new("readme.md")) + .lookup(LoadedAddr::new_unchecked(1), OsStr::new("readme.md")) .await .unwrap(); @@ -236,7 +239,7 @@ async fn 
lookup_populates_inode_table() { let table = FutureBackedCache::default(); let fs = AsyncFs::new(dp, root, &table).await; - fs.lookup(LoadedAddr(1), OsStr::new("file.txt")) + fs.lookup(LoadedAddr::new_unchecked(1), OsStr::new("file.txt")) .await .unwrap(); @@ -262,11 +265,11 @@ async fn lookup_second_call_uses_cache() { let fs = AsyncFs::new(dp, root, &table).await; let first = fs - .lookup(LoadedAddr(1), OsStr::new("cached.txt")) + .lookup(LoadedAddr::new_unchecked(1), OsStr::new("cached.txt")) .await .unwrap(); let second = fs - .lookup(LoadedAddr(1), OsStr::new("cached.txt")) + .lookup(LoadedAddr::new_unchecked(1), OsStr::new("cached.txt")) .await .unwrap(); @@ -283,7 +286,7 @@ async fn lookup_propagates_provider_error() { let fs = AsyncFs::new(dp, root, &table).await; let err = fs - .lookup(LoadedAddr(1), OsStr::new("nonexistent")) + .lookup(LoadedAddr::new_unchecked(1), OsStr::new("nonexistent")) .await .unwrap_err(); assert_eq!(err.raw_os_error(), Some(libc::ENOENT)); @@ -306,7 +309,10 @@ async fn open_returns_file_handle_and_reader() { table.insert_sync(10, file); let fs = AsyncFs::new(dp, root, &table).await; - let open_file = fs.open(LoadedAddr(10), OpenFlags::RDONLY).await.unwrap(); + let open_file = fs + .open(LoadedAddr::new_unchecked(10), OpenFlags::RDONLY) + .await + .unwrap(); assert!(open_file.fh >= 1, "file handle should start at 1"); let data = open_file.read(0, 5).await.unwrap(); @@ -321,7 +327,10 @@ async fn open_returns_eisdir_for_directory() { let table = FutureBackedCache::default(); let fs = AsyncFs::new(dp, root, &table).await; - let err = fs.open(LoadedAddr(1), OpenFlags::RDONLY).await.unwrap_err(); + let err = fs + .open(LoadedAddr::new_unchecked(1), OpenFlags::RDONLY) + .await + .unwrap_err(); assert_eq!(err.raw_os_error(), Some(libc::EISDIR)); } @@ -334,7 +343,7 @@ async fn open_returns_enoent_for_missing_inode() { let fs = AsyncFs::new(dp, root, &table).await; let err = fs - .open(LoadedAddr(999), OpenFlags::RDONLY) + 
.open(LoadedAddr::new_unchecked(999), OpenFlags::RDONLY) .await .unwrap_err(); assert_eq!(err.raw_os_error(), Some(libc::ENOENT)); @@ -351,8 +360,16 @@ async fn open_assigns_unique_file_handles() { table.insert_sync(10, file); let fs = AsyncFs::new(dp, root, &table).await; - let fh1 = fs.open(LoadedAddr(10), OpenFlags::RDONLY).await.unwrap().fh; - let fh2 = fs.open(LoadedAddr(10), OpenFlags::RDONLY).await.unwrap().fh; + let fh1 = fs + .open(LoadedAddr::new_unchecked(10), OpenFlags::RDONLY) + .await + .unwrap() + .fh; + let fh2 = fs + .open(LoadedAddr::new_unchecked(10), OpenFlags::RDONLY) + .await + .unwrap() + .fh; assert_ne!(fh1, fh2, "each open should produce a unique file handle"); } @@ -372,7 +389,10 @@ async fn open_file_read_with_offset() { table.insert_sync(10, file); let fs = AsyncFs::new(dp, root, &table).await; - let open_file = fs.open(LoadedAddr(10), OpenFlags::RDONLY).await.unwrap(); + let open_file = fs + .open(LoadedAddr::new_unchecked(10), OpenFlags::RDONLY) + .await + .unwrap(); let data = open_file.read(6, 5).await.unwrap(); assert_eq!(&data[..], b"world"); @@ -400,7 +420,7 @@ async fn readdir_lists_children_sorted_by_name() { let fs = AsyncFs::new(dp, root, &table).await; let mut entries: Vec<(OsString, u64)> = Vec::new(); - fs.readdir(LoadedAddr(1), 0, |entry, _offset| { + fs.readdir(LoadedAddr::new_unchecked(1), 0, |entry, _offset| { entries.push((entry.name.to_os_string(), entry.inode.addr)); false // don't stop }) @@ -436,11 +456,13 @@ async fn readdir_respects_offset() { let fs = AsyncFs::new(dp, root, &table).await; // First readdir to populate cache - fs.readdir(LoadedAddr(1), 0, |_, _| false).await.unwrap(); + fs.readdir(LoadedAddr::new_unchecked(1), 0, |_, _| false) + .await + .unwrap(); // Second readdir starting at offset 2 (skip first two) let mut entries: Vec = Vec::new(); - fs.readdir(LoadedAddr(1), 2, |entry, _| { + fs.readdir(LoadedAddr::new_unchecked(1), 2, |entry, _| { entries.push(entry.name.to_os_string()); false }) @@ -472,7 
+494,7 @@ async fn readdir_stops_when_filler_returns_true() { let fs = AsyncFs::new(dp, root, &table).await; let mut count = 0; - fs.readdir(LoadedAddr(1), 0, |_, _| { + fs.readdir(LoadedAddr::new_unchecked(1), 0, |_, _| { count += 1; count >= 2 // stop after 2 entries }) @@ -494,7 +516,7 @@ async fn readdir_returns_enotdir_for_file() { let fs = AsyncFs::new(dp, root, &table).await; let err = fs - .readdir(LoadedAddr(10), 0, |_, _| false) + .readdir(LoadedAddr::new_unchecked(10), 0, |_, _| false) .await .unwrap_err(); assert_eq!(err.raw_os_error(), Some(libc::ENOTDIR)); @@ -514,7 +536,9 @@ async fn readdir_populates_inode_table_with_children() { let table = FutureBackedCache::default(); let fs = AsyncFs::new(dp, root, &table).await; - fs.readdir(LoadedAddr(1), 0, |_, _| false).await.unwrap(); + fs.readdir(LoadedAddr::new_unchecked(1), 0, |_, _| false) + .await + .unwrap(); let cached = table.get(&10).await; assert_eq!( @@ -536,7 +560,7 @@ async fn readdir_empty_directory() { let fs = AsyncFs::new(dp, root, &table).await; let mut count = 0; - fs.readdir(LoadedAddr(1), 0, |_, _| { + fs.readdir(LoadedAddr::new_unchecked(1), 0, |_, _| { count += 1; false }) @@ -566,7 +590,7 @@ async fn readdir_provides_correct_next_offsets() { let fs = AsyncFs::new(dp, root, &table).await; let mut offsets: Vec = Vec::new(); - fs.readdir(LoadedAddr(1), 0, |_, next_offset| { + fs.readdir(LoadedAddr::new_unchecked(1), 0, |_, next_offset| { offsets.push(next_offset); false }) @@ -596,7 +620,7 @@ async fn lookup_after_eviction_returns_fresh_inode() { // First lookup → addr=10 let first = fs - .lookup(LoadedAddr(1), OsStr::new("readme.md")) + .lookup(LoadedAddr::new_unchecked(1), OsStr::new("readme.md")) .await .unwrap(); assert_eq!(first.inode.addr, 10); @@ -614,7 +638,7 @@ async fn lookup_after_eviction_returns_fresh_inode() { // Second lookup should NOT return the stale addr=10. 
let second = fs - .lookup(LoadedAddr(1), OsStr::new("readme.md")) + .lookup(LoadedAddr::new_unchecked(1), OsStr::new("readme.md")) .await .unwrap(); assert_ne!(second.inode.addr, 10, "should not return stale inode"); @@ -640,11 +664,13 @@ async fn lookup_after_readdir_uses_directory_cache() { let fs = AsyncFs::new(dp, root, &table).await; // readdir populates the directory cache. - fs.readdir(LoadedAddr(1), 0, |_, _| false).await.unwrap(); + fs.readdir(LoadedAddr::new_unchecked(1), 0, |_, _| false) + .await + .unwrap(); // lookup should hit the directory cache fast path. let tracked = fs - .lookup(LoadedAddr(1), OsStr::new("file.txt")) + .lookup(LoadedAddr::new_unchecked(1), OsStr::new("file.txt")) .await .unwrap(); assert_eq!(tracked.inode.addr, 10); diff --git a/tests/composite_fs_tests.rs b/tests/composite_fs_tests.rs index d6470a6a..d68dd6ea 100644 --- a/tests/composite_fs_tests.rs +++ b/tests/composite_fs_tests.rs @@ -58,7 +58,7 @@ async fn composite_root_lookup_resolves_child() { let afs = AsyncFs::new_preseeded(composite, &table); let tracked = afs - .lookup(LoadedAddr(1), OsStr::new("repo-a")) + .lookup(LoadedAddr::new_unchecked(1), OsStr::new("repo-a")) .await .unwrap(); @@ -96,7 +96,7 @@ async fn composite_root_readdir_lists_children() { let afs = AsyncFs::new_preseeded(composite, &table); let mut entries = Vec::new(); - afs.readdir(LoadedAddr(1), 0, |de, _offset| { + afs.readdir(LoadedAddr::new_unchecked(1), 0, |de, _offset| { entries.push(de.name.to_os_string()); false }) @@ -132,14 +132,17 @@ async fn composite_delegated_lookup_reaches_child() { // First, lookup the child at root level. let child_dir = afs - .lookup(LoadedAddr(1), OsStr::new("my-repo")) + .lookup(LoadedAddr::new_unchecked(1), OsStr::new("my-repo")) .await .unwrap(); let child_addr = child_dir.inode.addr; // Then, lookup a file inside the child. 
let file = afs - .lookup(LoadedAddr(child_addr), OsStr::new("readme.md")) + .lookup( + LoadedAddr::new_unchecked(child_addr), + OsStr::new("readme.md"), + ) .await .unwrap(); @@ -148,7 +151,7 @@ async fn composite_delegated_lookup_reaches_child() { // Also lookup a subdirectory inside the child. let subdir = afs - .lookup(LoadedAddr(child_addr), OsStr::new("src")) + .lookup(LoadedAddr::new_unchecked(child_addr), OsStr::new("src")) .await .unwrap(); @@ -171,16 +174,22 @@ async fn composite_open_and_read_through_child() { let afs = AsyncFs::new_preseeded(composite, &table); // Navigate to the file. - let child_dir = afs.lookup(LoadedAddr(1), OsStr::new("repo")).await.unwrap(); + let child_dir = afs + .lookup(LoadedAddr::new_unchecked(1), OsStr::new("repo")) + .await + .unwrap(); let file_tracked = afs - .lookup(LoadedAddr(child_dir.inode.addr), OsStr::new("hello.txt")) + .lookup( + LoadedAddr::new_unchecked(child_dir.inode.addr), + OsStr::new("hello.txt"), + ) .await .unwrap(); let file_addr = file_tracked.inode.addr; // Open and read. let open_file = afs - .open(LoadedAddr(file_addr), OpenFlags::empty()) + .open(LoadedAddr::new_unchecked(file_addr), OpenFlags::empty()) .await .unwrap(); let data = open_file.read(0, 1024).await.unwrap(); @@ -208,7 +217,7 @@ async fn composite_lookup_unknown_child_returns_enoent() { let afs = AsyncFs::new_preseeded(composite, &table); let err = afs - .lookup(LoadedAddr(1), OsStr::new("nonexistent")) + .lookup(LoadedAddr::new_unchecked(1), OsStr::new("nonexistent")) .await .unwrap_err(); @@ -242,14 +251,21 @@ async fn composite_readdir_delegated_lists_child_contents() { let afs = AsyncFs::new_preseeded(composite, &table); // Navigate into the child. - let child_dir = afs.lookup(LoadedAddr(1), OsStr::new("repo")).await.unwrap(); + let child_dir = afs + .lookup(LoadedAddr::new_unchecked(1), OsStr::new("repo")) + .await + .unwrap(); // Readdir inside the child. 
let mut entries = Vec::new(); - afs.readdir(LoadedAddr(child_dir.inode.addr), 0, |de, _offset| { - entries.push((de.name.to_os_string(), de.inode.itype)); - false - }) + afs.readdir( + LoadedAddr::new_unchecked(child_dir.inode.addr), + 0, + |de, _offset| { + entries.push((de.name.to_os_string(), de.inode.itype)); + false + }, + ) .await .unwrap(); @@ -275,8 +291,14 @@ async fn composite_repeated_lookup_returns_same_addr() { table.insert_sync(1, root_inode); let afs = AsyncFs::new_preseeded(composite, &table); - let first = afs.lookup(LoadedAddr(1), OsStr::new("repo")).await.unwrap(); - let second = afs.lookup(LoadedAddr(1), OsStr::new("repo")).await.unwrap(); + let first = afs + .lookup(LoadedAddr::new_unchecked(1), OsStr::new("repo")) + .await + .unwrap(); + let second = afs + .lookup(LoadedAddr::new_unchecked(1), OsStr::new("repo")) + .await + .unwrap(); assert_eq!( first.inode.addr, second.inode.addr, diff --git a/tests/dcache_correctness.rs b/tests/dcache_correctness.rs index 34dcf088..83074517 100644 --- a/tests/dcache_correctness.rs +++ b/tests/dcache_correctness.rs @@ -8,19 +8,28 @@ use git_fs::fs::dcache::{DCache, PopulateStatus}; #[tokio::test] async fn lookup_returns_none_for_missing_entry() { let cache = DCache::new(); - assert!(cache.lookup(LoadedAddr(1), OsStr::new("foo")).is_none()); + assert!( + cache + .lookup(LoadedAddr::new_unchecked(1), OsStr::new("foo")) + .is_none() + ); } #[tokio::test] async fn insert_then_lookup() { let cache = DCache::new(); cache - .insert(LoadedAddr(1), OsString::from("foo"), LoadedAddr(10), false) + .insert( + LoadedAddr::new_unchecked(1), + OsString::from("foo"), + LoadedAddr::new_unchecked(10), + false, + ) .await; - let dv = cache.lookup(LoadedAddr(1), OsStr::new("foo")); + let dv = cache.lookup(LoadedAddr::new_unchecked(1), OsStr::new("foo")); assert!(dv.is_some(), "entry should be present after insert"); let dv = dv.expect("checked above"); - assert_eq!(dv.ino, LoadedAddr(10)); + assert_eq!(dv.ino, 
LoadedAddr::new_unchecked(10)); assert!(!dv.is_dir); } @@ -28,15 +37,30 @@ async fn insert_then_lookup() { async fn readdir_returns_only_children_of_parent() { let cache = DCache::new(); cache - .insert(LoadedAddr(1), OsString::from("a"), LoadedAddr(10), false) + .insert( + LoadedAddr::new_unchecked(1), + OsString::from("a"), + LoadedAddr::new_unchecked(10), + false, + ) .await; cache - .insert(LoadedAddr(1), OsString::from("b"), LoadedAddr(11), true) + .insert( + LoadedAddr::new_unchecked(1), + OsString::from("b"), + LoadedAddr::new_unchecked(11), + true, + ) .await; cache - .insert(LoadedAddr(2), OsString::from("c"), LoadedAddr(12), false) + .insert( + LoadedAddr::new_unchecked(2), + OsString::from("c"), + LoadedAddr::new_unchecked(12), + false, + ) .await; - let children = cache.readdir(LoadedAddr(1)).await; + let children = cache.readdir(LoadedAddr::new_unchecked(1)).await; assert_eq!(children.len(), 2); let names: Vec<_> = children.iter().map(|(n, _)| n.clone()).collect(); assert!(names.contains(&OsString::from("a"))); @@ -46,7 +70,7 @@ async fn readdir_returns_only_children_of_parent() { #[tokio::test] async fn readdir_empty_parent_returns_empty() { let cache = DCache::new(); - let children = cache.readdir(LoadedAddr(1)).await; + let children = cache.readdir(LoadedAddr::new_unchecked(1)).await; assert!(children.is_empty()); } @@ -54,7 +78,7 @@ async fn readdir_empty_parent_returns_empty() { async fn try_claim_populate_unclaimed_returns_claimed() { let cache = DCache::new(); assert!(matches!( - cache.try_claim_populate(LoadedAddr(1)), + cache.try_claim_populate(LoadedAddr::new_unchecked(1)), PopulateStatus::Claimed )); } @@ -63,12 +87,12 @@ async fn try_claim_populate_unclaimed_returns_claimed() { async fn finish_populate_then_claim_returns_done() { let cache = DCache::new(); assert!(matches!( - cache.try_claim_populate(LoadedAddr(1)), + cache.try_claim_populate(LoadedAddr::new_unchecked(1)), PopulateStatus::Claimed )); - cache.finish_populate(LoadedAddr(1)); 
+ cache.finish_populate(LoadedAddr::new_unchecked(1)); assert!(matches!( - cache.try_claim_populate(LoadedAddr(1)), + cache.try_claim_populate(LoadedAddr::new_unchecked(1)), PopulateStatus::Done )); } @@ -77,11 +101,11 @@ async fn finish_populate_then_claim_returns_done() { async fn double_claim_returns_in_progress() { let cache = DCache::new(); assert!(matches!( - cache.try_claim_populate(LoadedAddr(1)), + cache.try_claim_populate(LoadedAddr::new_unchecked(1)), PopulateStatus::Claimed )); assert!(matches!( - cache.try_claim_populate(LoadedAddr(1)), + cache.try_claim_populate(LoadedAddr::new_unchecked(1)), PopulateStatus::InProgress )); } @@ -90,12 +114,12 @@ async fn double_claim_returns_in_progress() { async fn abort_populate_allows_reclaim() { let cache = DCache::new(); assert!(matches!( - cache.try_claim_populate(LoadedAddr(1)), + cache.try_claim_populate(LoadedAddr::new_unchecked(1)), PopulateStatus::Claimed )); - cache.abort_populate(LoadedAddr(1)); + cache.abort_populate(LoadedAddr::new_unchecked(1)); assert!(matches!( - cache.try_claim_populate(LoadedAddr(1)), + cache.try_claim_populate(LoadedAddr::new_unchecked(1)), PopulateStatus::Claimed )); } @@ -104,11 +128,16 @@ async fn abort_populate_allows_reclaim() { async fn insert_does_not_mark_populated() { let cache = DCache::new(); cache - .insert(LoadedAddr(1), OsString::from("foo"), LoadedAddr(10), false) + .insert( + LoadedAddr::new_unchecked(1), + OsString::from("foo"), + LoadedAddr::new_unchecked(10), + false, + ) .await; assert!( matches!( - cache.try_claim_populate(LoadedAddr(1)), + cache.try_claim_populate(LoadedAddr::new_unchecked(1)), PopulateStatus::Claimed ), "insert alone should not mark a directory as populated" @@ -119,14 +148,24 @@ async fn insert_does_not_mark_populated() { async fn upsert_overwrites_existing_entry() { let cache = DCache::new(); cache - .insert(LoadedAddr(1), OsString::from("foo"), LoadedAddr(10), false) + .insert( + LoadedAddr::new_unchecked(1), + OsString::from("foo"), + 
LoadedAddr::new_unchecked(10), + false, + ) .await; cache - .insert(LoadedAddr(1), OsString::from("foo"), LoadedAddr(20), true) + .insert( + LoadedAddr::new_unchecked(1), + OsString::from("foo"), + LoadedAddr::new_unchecked(20), + true, + ) .await; - let dv = cache.lookup(LoadedAddr(1), OsStr::new("foo")); + let dv = cache.lookup(LoadedAddr::new_unchecked(1), OsStr::new("foo")); assert!(dv.is_some(), "entry should still be present after upsert"); let dv = dv.expect("checked above"); - assert_eq!(dv.ino, LoadedAddr(20)); + assert_eq!(dv.ino, LoadedAddr::new_unchecked(20)); assert!(dv.is_dir); } From 9b30b5503bfa5b859980cd889aee8d98ae06598b Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Sat, 21 Feb 2026 15:05:03 -0800 Subject: [PATCH 30/58] fix: atomicize forget slot GC and clean up name_to_slot --- lib/fs/bridge.rs | 11 +++++++++ lib/fs/composite.rs | 41 +++++++++++++++++++++---------- tests/composite_fs_tests.rs | 48 ++++++++++++++++++++++++++++++++++++- 3 files changed, 86 insertions(+), 14 deletions(-) diff --git a/lib/fs/bridge.rs b/lib/fs/bridge.rs index b0366cfd..49acf9ee 100644 --- a/lib/fs/bridge.rs +++ b/lib/fs/bridge.rs @@ -115,6 +115,17 @@ impl ConcurrentBridge { } self.fwd.is_empty() } + + /// Returns `true` if the bridge contains no mappings. + /// + /// Reads are not serialized with mutations. The result is a + /// snapshot that may be immediately stale. Use under the + /// coordination lock or an external guard when consistency + /// with mutations is required. + #[must_use] + pub fn is_empty(&self) -> bool { + self.fwd.is_empty() + } } impl Default for ConcurrentBridge { diff --git a/lib/fs/composite.rs b/lib/fs/composite.rs index 3ba0f5f7..ea7349dc 100644 --- a/lib/fs/composite.rs +++ b/lib/fs/composite.rs @@ -141,7 +141,8 @@ struct CompositeFsInner { /// /// `register_child` uses `entry_sync` on this map for per-name /// exclusion, serializing concurrent registrations of the same child - /// without a global lock. 
`forget` never touches this map. + /// without a global lock. `forget` cleans up entries when a slot's + /// bridge becomes empty. name_to_slot: scc::HashMap, /// Monotonically increasing slot counter. next_slot: AtomicU64, @@ -265,8 +266,9 @@ impl CompositeFs { /// Uses `entry_sync` on `name_to_slot` for per-name exclusion: /// concurrent registrations of the same child are serialized by the /// `scc::HashMap` bucket lock, while different names proceed in - /// parallel. `forget` never touches `name_to_slot` and is fully - /// independent — outer inode addresses are monotonic and never reused, + /// parallel. `forget` may remove entries from `name_to_slot` when a + /// slot's bridge becomes empty, but this is safe — outer inode addresses + /// are monotonic and never reused, /// so `forget` cannot corrupt a replacement slot. fn register_child(&self, desc: &ChildDescriptor) -> InodeAddr where @@ -448,28 +450,41 @@ where } /// Removes the composite-level address from `addr_to_slot` and the - /// child's bridge map. Called automatically by `InodeForget` when the - /// FUSE refcount drops to zero. The root inode is never forgotten. + /// child's bridge map. When the bridge becomes empty, the slot and its + /// `name_to_slot` entry are garbage-collected. /// - /// Lock-free with respect to [`register_child`](CompositeFs::register_child): - /// outer inode addresses are monotonically increasing and never reused, - /// so `forget(addr)` can only affect the slot that originally owned - /// `addr`. If a concurrent `register_child` has already replaced the - /// slot, `slots.read_sync` returns `None` and the bridge cleanup is - /// skipped — the old slot's `Arc` is dropped with its - /// `Arc` refcount. + /// The slot removal uses `remove_if_sync` with a re-check of + /// `bridge.is_empty()`, preventing a concurrent `backward_or_insert` + /// from inserting a new mapping between the bridge emptiness check + /// and the slot removal. 
+ /// + /// The root inode is never forgotten. fn forget(&self, addr: InodeAddr) { if addr == Self::ROOT_INO { return; } if let Some((_, slot_idx)) = self.inner.addr_to_slot.remove_sync(&addr) { + // Remove the outer->inner mapping from the bridge. The bridge's + // internal mutex serializes this with `backward_or_insert`. let bridge_empty = self .inner .slots .read_sync(&slot_idx, |_, slot| slot.bridge.remove_by_outer(addr)) .unwrap_or(false); if bridge_empty { - self.inner.slots.remove_sync(&slot_idx); + // Bridge is empty — atomically remove the slot only if no one + // has re-populated the bridge between our check and this removal. + // `remove_if_sync` holds the scc bucket lock during evaluation. + let removed = self + .inner + .slots + .remove_if_sync(&slot_idx, |slot| slot.bridge.is_empty()); + if removed.is_some() { + // Clean up name_to_slot to prevent dead slot indices. + self.inner + .name_to_slot + .retain_sync(|_, &mut idx| idx != slot_idx); + } } } } diff --git a/tests/composite_fs_tests.rs b/tests/composite_fs_tests.rs index d68dd6ea..ce110acb 100644 --- a/tests/composite_fs_tests.rs +++ b/tests/composite_fs_tests.rs @@ -8,7 +8,7 @@ use std::ffi::{OsStr, OsString}; use bytes::Bytes; use git_fs::cache::async_backed::FutureBackedCache; -use git_fs::fs::async_fs::AsyncFs; +use git_fs::fs::async_fs::{AsyncFs, FsDataProvider as _}; use git_fs::fs::composite::CompositeFs; use git_fs::fs::{INode, INodeType, LoadedAddr, OpenFlags}; @@ -305,3 +305,49 @@ async fn composite_repeated_lookup_returns_same_addr() { "repeated lookups for the same child should return the same composite address" ); } + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn composite_forget_cleans_up_slot_and_name_mapping() { + // Setup: one child "repo" with a file. 
+ let (provider, root_ino) = make_child_provider(100, &[("file.txt", 101, INodeType::File, 42)]); + + let mut children = HashMap::new(); + children.insert(OsString::from("repo"), (provider, root_ino)); + + let mock_root = MockRoot::new(children); + let composite = CompositeFs::new(mock_root, (1000, 1000)); + let root_inode = composite.make_root_inode(); + + let table = FutureBackedCache::default(); + table.insert_sync(1, root_inode); + let afs = AsyncFs::new_preseeded(composite.clone(), &table); + + // Look up the child and a file inside it. + let child_dir = afs + .lookup(LoadedAddr::new_unchecked(1), OsStr::new("repo")) + .await + .unwrap(); + let child_addr = child_dir.inode.addr; + + let file = afs + .lookup( + LoadedAddr::new_unchecked(child_addr), + OsStr::new("file.txt"), + ) + .await + .unwrap(); + let file_addr = file.inode.addr; + + // Forget the file, then the child directory. + composite.forget(file_addr); + composite.forget(child_addr); + + // Re-lookup the child — should succeed with a fresh slot. + let re_resolved = afs + .lookup(LoadedAddr::new_unchecked(1), OsStr::new("repo")) + .await + .unwrap(); + + assert_eq!(re_resolved.inode.itype, INodeType::Directory); + // The new address may differ from the original (fresh slot allocated). 
+} From 2dccf32d3213c7a9d766a1a874fb55a2b8e336e2 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Sat, 21 Feb 2026 15:39:58 -0800 Subject: [PATCH 31/58] perf: use BTreeMap in DCache to eliminate readdir re-sort --- lib/fs/async_fs.rs | 31 ++++------ lib/fs/dcache.rs | 52 ++++++++-------- tests/dcache_correctness.rs | 118 ++++++++++++++++++------------------ 3 files changed, 101 insertions(+), 100 deletions(-) diff --git a/lib/fs/async_fs.rs b/lib/fs/async_fs.rs index 625f3f8c..78adfacc 100644 --- a/lib/fs/async_fs.rs +++ b/lib/fs/async_fs.rs @@ -372,14 +372,12 @@ impl<'tbl, DP: FsDataProvider> AsyncFs<'tbl, DP> { .get_or_init(child.addr, || async move { child }) .await; - self.directory_cache - .insert( - parent, - name_owned, - LoadedAddr::new_unchecked(child.addr), - matches!(child.itype, INodeType::Directory), - ) - .await; + self.directory_cache.insert( + parent, + name_owned, + LoadedAddr::new_unchecked(child.addr), + matches!(child.itype, INodeType::Directory), + ); Ok(TrackedINode { inode: child }) } @@ -470,14 +468,12 @@ impl<'tbl, DP: FsDataProvider> AsyncFs<'tbl, DP> { self.inode_table .get_or_init(child_inode.addr, || async move { child_inode }) .await; - self.directory_cache - .insert( - parent, - name, - LoadedAddr::new_unchecked(child_inode.addr), - child_inode.itype == INodeType::Directory, - ) - .await; + self.directory_cache.insert( + parent, + name, + LoadedAddr::new_unchecked(child_inode.addr), + child_inode.itype == INodeType::Directory, + ); } self.directory_cache.finish_populate(parent); guard.defuse(); @@ -491,8 +487,7 @@ impl<'tbl, DP: FsDataProvider> AsyncFs<'tbl, DP> { } } - let mut children = self.directory_cache.readdir(parent).await; - children.sort_unstable_by(|(a, _), (b, _)| a.cmp(b)); + let children = self.directory_cache.readdir(parent); #[expect( clippy::cast_possible_truncation, diff --git a/lib/fs/dcache.rs b/lib/fs/dcache.rs index d8778fb8..247070e9 100644 --- a/lib/fs/dcache.rs +++ b/lib/fs/dcache.rs @@ -1,6 +1,7 @@ +use 
std::collections::BTreeMap; use std::ffi::{OsStr, OsString}; -use std::sync::Arc; use std::sync::atomic::{AtomicU8, Ordering}; +use std::sync::{Arc, RwLock}; use tokio::sync::Notify; @@ -32,7 +33,7 @@ pub enum PopulateStatus { /// Per-parent directory state holding child entries and a population flag. struct DirState { - children: scc::HashMap, + children: RwLock>, populated: AtomicU8, /// Wakes waiters when `populated` transitions out of `IN_PROGRESS`. notify: Notify, @@ -41,7 +42,7 @@ struct DirState { impl DirState { fn new() -> Self { Self { - children: scc::HashMap::new(), + children: RwLock::new(BTreeMap::new()), populated: AtomicU8::new(POPULATE_UNCLAIMED), notify: Notify::new(), } @@ -51,9 +52,9 @@ impl DirState { /// In-memory directory entry cache with per-parent child maps. /// /// Each parent directory gets its own [`DirState`] containing a -/// [`scc::HashMap`] of child entries and an [`AtomicU8`] population flag. -/// This makes `readdir` O(k) in the number of children rather than O(n) -/// over the entire cache. +/// [`BTreeMap`] of child entries (kept in sorted order) and an [`AtomicU8`] +/// population flag. This makes `readdir` O(k) in the number of children +/// with zero sorting overhead. pub struct DCache { dirs: scc::HashMap>, } @@ -93,36 +94,39 @@ impl DCache { #[must_use] pub fn lookup(&self, parent_ino: LoadedAddr, name: &OsStr) -> Option { let state = self.dirs.read_sync(&parent_ino, |_, v| Arc::clone(v))?; - state.children.read_sync(name, |_, v| v.clone()) + let children = state + .children + .read() + .unwrap_or_else(std::sync::PoisonError::into_inner); + children.get(name).cloned() } /// Atomically inserts or overwrites a child entry in the cache. 
- pub async fn insert( - &self, - parent_ino: LoadedAddr, - name: OsString, - ino: LoadedAddr, - is_dir: bool, - ) { + pub fn insert(&self, parent_ino: LoadedAddr, name: OsString, ino: LoadedAddr, is_dir: bool) { let state = self.dir_state(parent_ino); let value = DValue { ino, is_dir }; - state.children.upsert_async(name, value).await; + let mut children = state + .children + .write() + .unwrap_or_else(std::sync::PoisonError::into_inner); + children.insert(name, value); } /// Returns all cached children of `parent_ino` as `(name, value)` pairs. - pub async fn readdir(&self, parent_ino: LoadedAddr) -> Vec<(OsString, DValue)> { + /// + /// Entries are returned in name-sorted order (guaranteed by `BTreeMap`). + pub fn readdir(&self, parent_ino: LoadedAddr) -> Vec<(OsString, DValue)> { let Some(state) = self.dirs.read_sync(&parent_ino, |_, v| Arc::clone(v)) else { return Vec::new(); }; - let mut entries = Vec::new(); - state + let children = state .children - .iter_async(|k, v| { - entries.push((k.clone(), v.clone())); - true - }) - .await; - entries + .read() + .unwrap_or_else(std::sync::PoisonError::into_inner); + children + .iter() + .map(|(k, v)| (k.clone(), v.clone())) + .collect() } /// Atomically try to claim a directory for population. 
diff --git a/tests/dcache_correctness.rs b/tests/dcache_correctness.rs index 83074517..c2273076 100644 --- a/tests/dcache_correctness.rs +++ b/tests/dcache_correctness.rs @@ -18,14 +18,12 @@ async fn lookup_returns_none_for_missing_entry() { #[tokio::test] async fn insert_then_lookup() { let cache = DCache::new(); - cache - .insert( - LoadedAddr::new_unchecked(1), - OsString::from("foo"), - LoadedAddr::new_unchecked(10), - false, - ) - .await; + cache.insert( + LoadedAddr::new_unchecked(1), + OsString::from("foo"), + LoadedAddr::new_unchecked(10), + false, + ); let dv = cache.lookup(LoadedAddr::new_unchecked(1), OsStr::new("foo")); assert!(dv.is_some(), "entry should be present after insert"); let dv = dv.expect("checked above"); @@ -36,31 +34,25 @@ async fn insert_then_lookup() { #[tokio::test] async fn readdir_returns_only_children_of_parent() { let cache = DCache::new(); - cache - .insert( - LoadedAddr::new_unchecked(1), - OsString::from("a"), - LoadedAddr::new_unchecked(10), - false, - ) - .await; - cache - .insert( - LoadedAddr::new_unchecked(1), - OsString::from("b"), - LoadedAddr::new_unchecked(11), - true, - ) - .await; - cache - .insert( - LoadedAddr::new_unchecked(2), - OsString::from("c"), - LoadedAddr::new_unchecked(12), - false, - ) - .await; - let children = cache.readdir(LoadedAddr::new_unchecked(1)).await; + cache.insert( + LoadedAddr::new_unchecked(1), + OsString::from("a"), + LoadedAddr::new_unchecked(10), + false, + ); + cache.insert( + LoadedAddr::new_unchecked(1), + OsString::from("b"), + LoadedAddr::new_unchecked(11), + true, + ); + cache.insert( + LoadedAddr::new_unchecked(2), + OsString::from("c"), + LoadedAddr::new_unchecked(12), + false, + ); + let children = cache.readdir(LoadedAddr::new_unchecked(1)); assert_eq!(children.len(), 2); let names: Vec<_> = children.iter().map(|(n, _)| n.clone()).collect(); assert!(names.contains(&OsString::from("a"))); @@ -70,7 +62,7 @@ async fn readdir_returns_only_children_of_parent() { #[tokio::test] async 
fn readdir_empty_parent_returns_empty() { let cache = DCache::new(); - let children = cache.readdir(LoadedAddr::new_unchecked(1)).await; + let children = cache.readdir(LoadedAddr::new_unchecked(1)); assert!(children.is_empty()); } @@ -127,14 +119,12 @@ async fn abort_populate_allows_reclaim() { #[tokio::test] async fn insert_does_not_mark_populated() { let cache = DCache::new(); - cache - .insert( - LoadedAddr::new_unchecked(1), - OsString::from("foo"), - LoadedAddr::new_unchecked(10), - false, - ) - .await; + cache.insert( + LoadedAddr::new_unchecked(1), + OsString::from("foo"), + LoadedAddr::new_unchecked(10), + false, + ); assert!( matches!( cache.try_claim_populate(LoadedAddr::new_unchecked(1)), @@ -147,25 +137,37 @@ async fn insert_does_not_mark_populated() { #[tokio::test] async fn upsert_overwrites_existing_entry() { let cache = DCache::new(); - cache - .insert( - LoadedAddr::new_unchecked(1), - OsString::from("foo"), - LoadedAddr::new_unchecked(10), - false, - ) - .await; - cache - .insert( - LoadedAddr::new_unchecked(1), - OsString::from("foo"), - LoadedAddr::new_unchecked(20), - true, - ) - .await; + cache.insert( + LoadedAddr::new_unchecked(1), + OsString::from("foo"), + LoadedAddr::new_unchecked(10), + false, + ); + cache.insert( + LoadedAddr::new_unchecked(1), + OsString::from("foo"), + LoadedAddr::new_unchecked(20), + true, + ); let dv = cache.lookup(LoadedAddr::new_unchecked(1), OsStr::new("foo")); assert!(dv.is_some(), "entry should still be present after upsert"); let dv = dv.expect("checked above"); assert_eq!(dv.ino, LoadedAddr::new_unchecked(20)); assert!(dv.is_dir); } + +#[tokio::test] +async fn readdir_returns_entries_in_sorted_order() { + let cache = DCache::new(); + for name in ["zebra", "apple", "mango"] { + cache.insert( + LoadedAddr::new_unchecked(1), + OsString::from(name), + LoadedAddr::new_unchecked(10), + false, + ); + } + let children = cache.readdir(LoadedAddr::new_unchecked(1)); + let names: Vec<_> = children.iter().map(|(n, _)| 
n.to_str().unwrap()).collect(); + assert_eq!(names, ["apple", "mango", "zebra"]); +} From 2dd4d39ae47a0b9918511bee84dec42bd9cd9168 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Sat, 21 Feb 2026 15:47:36 -0800 Subject: [PATCH 32/58] refactor: rename TrackedINode to ResolvedINode --- lib/fs/async_fs.rs | 16 ++++++++-------- lib/fs/mod.rs | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/lib/fs/async_fs.rs b/lib/fs/async_fs.rs index 78adfacc..51236ce8 100644 --- a/lib/fs/async_fs.rs +++ b/lib/fs/async_fs.rs @@ -103,13 +103,13 @@ impl<'a, DP: FsDataProvider> StatelessDrop<(&'a FutureBackedCache AsyncFs<'tbl, DP> { &self, parent: LoadedAddr, name: &OsStr, - ) -> Result { + ) -> Result { let parent_ino = self.loaded_inode(parent).await?; debug_assert!( matches!(parent_ino.itype, INodeType::Directory), @@ -342,7 +342,7 @@ impl<'tbl, DP: FsDataProvider> AsyncFs<'tbl, DP> { if let Some(dentry) = self.directory_cache.lookup(parent, name) { if let Some(inode) = self.inode_table.get(&dentry.ino.addr()).await { - return Ok(TrackedINode { inode }); + return Ok(ResolvedINode { inode }); } // Inode was evicted (e.g. by forget). Evict the stale lookup_cache // entry so the slow path calls dp.lookup() fresh. @@ -379,7 +379,7 @@ impl<'tbl, DP: FsDataProvider> AsyncFs<'tbl, DP> { matches!(child.itype, INodeType::Directory), ); - Ok(TrackedINode { inode: child }) + Ok(ResolvedINode { inode: child }) } /// Retrieve an inode that is expected to already be loaded. diff --git a/lib/fs/mod.rs b/lib/fs/mod.rs index 02ef8384..2ecf4f3a 100644 --- a/lib/fs/mod.rs +++ b/lib/fs/mod.rs @@ -10,7 +10,7 @@ pub mod dcache; /// FUSE adapter: maps [`fuser::Filesystem`] callbacks to [`async_fs::AsyncFs`]. 
pub mod fuser; -pub use async_fs::{InodeForget, InodeLifecycle, OpenFile, TrackedINode}; +pub use async_fs::{InodeForget, InodeLifecycle, OpenFile, ResolvedINode}; use std::ffi::OsStr; use std::time::SystemTime; From 754eeca3bca76e4b644cb5da1f8893d4bd4e0199 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Sat, 21 Feb 2026 15:49:19 -0800 Subject: [PATCH 33/58] perf: use Arc for lookup cache key to reduce allocations --- lib/fs/async_fs.rs | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/lib/fs/async_fs.rs b/lib/fs/async_fs.rs index 51236ce8..3aedca26 100644 --- a/lib/fs/async_fs.rs +++ b/lib/fs/async_fs.rs @@ -248,7 +248,7 @@ pub struct AsyncFs<'tbl, DP: FsDataProvider> { /// Deduplicating lookup cache keyed by `(parent_addr, child_name)`. The factory is /// `dp.lookup()`, so the data provider is only called on a true cache miss. - lookup_cache: FutureBackedCache<(InodeAddr, OsString), INode>, + lookup_cache: FutureBackedCache<(InodeAddr, Arc), INode>, /// Directory entry cache, mapping `(parent, name)` to child inode address. directory_cache: DCache, @@ -346,24 +346,18 @@ impl<'tbl, DP: FsDataProvider> AsyncFs<'tbl, DP> { } // Inode was evicted (e.g. by forget). Evict the stale lookup_cache // entry so the slow path calls dp.lookup() fresh. - // - // Note: a concurrent task may re-insert into lookup_cache between - // our inode_table miss and this remove_sync. This is benign — it - // causes at most one redundant dp.lookup() call because all - // downstream operations (get_or_try_init, get_or_init) are - // idempotent or deduplicated. 
self.lookup_cache - .remove_sync(&(parent.addr(), name.to_os_string())); + .remove_sync(&(parent.addr(), Arc::from(name))); } - let name_owned = name.to_os_string(); - let lookup_key = (parent.addr(), name_owned.clone()); + let name_arc: Arc = Arc::from(name); + let lookup_key = (parent.addr(), Arc::clone(&name_arc)); let dp = self.data_provider.clone(); let child = self .lookup_cache .get_or_try_init(lookup_key, || { - let name_for_dp = name_owned.clone(); + let name_for_dp = Arc::clone(&name_arc); async move { dp.lookup(parent_ino, &name_for_dp).await } }) .await?; @@ -374,7 +368,7 @@ impl<'tbl, DP: FsDataProvider> AsyncFs<'tbl, DP> { self.directory_cache.insert( parent, - name_owned, + name_arc.as_ref().to_os_string(), LoadedAddr::new_unchecked(child.addr), matches!(child.itype, INodeType::Directory), ); From 3b93bc4dec3e191db1fbcf6bb2a6f836614434e0 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Sat, 21 Feb 2026 15:50:45 -0800 Subject: [PATCH 34/58] fix: hide LoadedAddr::new_unchecked from public API docs --- lib/fs/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/fs/mod.rs b/lib/fs/mod.rs index 2ecf4f3a..b33b3ca7 100644 --- a/lib/fs/mod.rs +++ b/lib/fs/mod.rs @@ -42,6 +42,7 @@ impl LoadedAddr { /// - The address was previously inserted into an inode table, **or** /// - The address originates from the FUSE kernel (which only knows /// addresses we previously returned to it). 
+ #[doc(hidden)] #[must_use] pub fn new_unchecked(addr: InodeAddr) -> Self { Self(addr) From d9169afc17f5e695e83f95956666579dc76ba1a8 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Sat, 21 Feb 2026 15:51:53 -0800 Subject: [PATCH 35/58] docs: document get_or_try_init dedup limitation and cache invalidation strategy --- lib/cache/async_backed.rs | 9 +++++++++ lib/fs/async_fs.rs | 5 +++++ lib/fs/mod.rs | 20 ++++++++++++++++++++ 3 files changed, 34 insertions(+) diff --git a/lib/cache/async_backed.rs b/lib/cache/async_backed.rs index d8989cf8..9f3a7f94 100644 --- a/lib/cache/async_backed.rs +++ b/lib/cache/async_backed.rs @@ -143,6 +143,15 @@ where /// owner is elected. Joiners never receive the original error — the retrying owner invokes /// its own factory independently and may produce a different error or succeed. /// + /// # Deduplication of failures + /// + /// When the factory returns `Err`, the poisoned entry is removed and the + /// next caller becomes a new owner with its own factory invocation. This + /// means failures are **not deduplicated**: under transient errors, N + /// concurrent callers may each independently invoke their factory rather + /// than coalescing on the first error. This is intentional — callers + /// may have different retry or error-handling semantics. + /// /// # Panics /// /// Panics if the factory panics (caught internally via `catch_unwind`). diff --git a/lib/fs/async_fs.rs b/lib/fs/async_fs.rs index 3aedca26..37c4b3d6 100644 --- a/lib/fs/async_fs.rs +++ b/lib/fs/async_fs.rs @@ -350,6 +350,11 @@ impl<'tbl, DP: FsDataProvider> AsyncFs<'tbl, DP> { .remove_sync(&(parent.addr(), Arc::from(name))); } + // Note: get_or_try_init deduplicates successful lookups but NOT + // failures. Under transient API errors, concurrent lookups for + // the same (parent, name) may each independently call dp.lookup(). 
+ // This is acceptable: the cost of a redundant API call on error is + // low compared to the complexity of error-channel deduplication. let name_arc: Arc = Arc::from(name); let lookup_key = (parent.addr(), Arc::clone(&name_arc)); let dp = self.data_provider.clone(); diff --git a/lib/fs/mod.rs b/lib/fs/mod.rs index b33b3ca7..52f9510e 100644 --- a/lib/fs/mod.rs +++ b/lib/fs/mod.rs @@ -1,4 +1,24 @@ //! Useful filesystem generalizations. +//! +//! # Cache invalidation +//! +//! The current implementation caches directory listings and inode data +//! indefinitely once populated. Staleness is mitigated only by a short +//! FUSE entry/attr TTL (currently 1 second in `FuserAdapter`). +//! +//! The intended long-term strategy is to use FUSE kernel notifications +//! (`notify_inval_inode` / `notify_inval_entry`) to proactively invalidate +//! specific entries when the backing data changes. This would allow a +//! much higher TTL while still reflecting changes promptly. The key +//! changes needed: +//! +//! 1. `DCache` needs a `remove` or `invalidate` method to reset a +//! parent's `PopulateStatus` back to `UNCLAIMED`. +//! 2. `FuserAdapter` needs access to the `fuser::Session` handle to +//! send `notify_inval_entry` notifications. +//! 3. Data providers need a way to signal when their backing data changes +//! (e.g. webhook, polling, or subscription). + /// Async filesystem cache with concurrent inode management. pub mod async_fs; /// Lock-free bidirectional inode address mapping. 
From 4d19859d3b6621116391cd221130a038b536cce9 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Sat, 21 Feb 2026 18:27:58 -0800 Subject: [PATCH 36/58] slightly more cleanup --- lib/fs/async_fs.rs | 23 +++++++++++++++++------ lib/fs/bridge.rs | 19 +++++++++---------- lib/fs/composite.rs | 21 ++++++++++----------- lib/fs/dcache.rs | 17 +++++++++-------- tests/bridge_tests.rs | 6 +++--- tests/dcache_correctness.rs | 16 ++++++++++++---- 6 files changed, 60 insertions(+), 42 deletions(-) diff --git a/lib/fs/async_fs.rs b/lib/fs/async_fs.rs index 37c4b3d6..1899e136 100644 --- a/lib/fs/async_fs.rs +++ b/lib/fs/async_fs.rs @@ -486,20 +486,31 @@ impl<'tbl, DP: FsDataProvider> AsyncFs<'tbl, DP> { } } - let children = self.directory_cache.readdir(parent); - #[expect( clippy::cast_possible_truncation, reason = "offset fits in usize on supported 64-bit platforms" )] - for (i, (name, dvalue)) in children.iter().enumerate().skip(offset as usize) { - let Some(inode) = self.inode_table.get(&dvalue.ino.addr()).await else { + let skip = offset as usize; + + // Collect only entries at or past `offset`, avoiding clones for + // entries that will be skipped during paginated readdir. + let mut entries: Vec<(OsString, LoadedAddr)> = Vec::new(); + let mut idx = 0usize; + self.directory_cache.readdir(parent, |name, dvalue| { + if idx >= skip { + entries.push((name.to_os_string(), dvalue.ino)); + } + idx += 1; + }); + + for (i, (name, child_addr)) in entries.iter().enumerate() { + let Some(inode) = self.inode_table.get(&child_addr.addr()).await else { // Inode was evicted between readdir collection and iteration // (e.g. by a concurrent forget). Skip the stale entry. 
- tracing::debug!(addr = ?dvalue.ino.addr(), name = ?name, "inode evicted during readdir, skipping"); + tracing::debug!(addr = ?child_addr.addr(), name = ?name, "inode evicted during readdir, skipping"); continue; }; - let next_offset = (i + 1) as u64; + let next_offset = (skip + i + 1) as u64; if filler(DirEntry { name, inode }, next_offset) { break; } diff --git a/lib/fs/bridge.rs b/lib/fs/bridge.rs index 49acf9ee..6e4ef942 100644 --- a/lib/fs/bridge.rs +++ b/lib/fs/bridge.rs @@ -75,15 +75,15 @@ impl ConcurrentBridge { self.bwd.read_sync(&inner, |_, &v| v) } - /// Look up inner -> outer, or allocate a new outer address if unmapped. + /// Look up inner -> outer, or insert `fallback` as the new outer address. + /// + /// `fallback` is a pre-allocated address provided by the caller. If the + /// inner address already has a mapping, `fallback` is unused (the caller + /// accepts that the monotonic address counter may skip values). /// /// Serialized with other mutations via the coordination lock. #[must_use] - pub fn backward_or_insert( - &self, - inner: InodeAddr, - allocate: impl FnOnce() -> InodeAddr, - ) -> InodeAddr { + pub fn backward_or_insert(&self, inner: InodeAddr, fallback: InodeAddr) -> InodeAddr { let _guard = self .mu .lock() @@ -91,10 +91,9 @@ impl ConcurrentBridge { match self.bwd.entry_sync(inner) { scc::hash_map::Entry::Occupied(occ) => *occ.get(), scc::hash_map::Entry::Vacant(vac) => { - let outer = allocate(); - vac.insert_entry(outer); - let _ = self.fwd.insert_sync(outer, inner); - outer + vac.insert_entry(fallback); + let _ = self.fwd.insert_sync(fallback, inner); + fallback } } } diff --git a/lib/fs/composite.rs b/lib/fs/composite.rs index ea7349dc..b564924d 100644 --- a/lib/fs/composite.rs +++ b/lib/fs/composite.rs @@ -129,6 +129,9 @@ impl FileReader for CompositeReader { struct ChildSlot { inner: Arc>, bridge: Arc, + /// The name under which this child was registered in `name_to_slot`. 
+ /// Stored here so `forget` can do O(1) removal instead of a linear scan. + name: OsString, } struct CompositeFsInner { @@ -250,6 +253,7 @@ impl CompositeFs { ChildSlot { inner: child_inner, bridge, + name: desc.name.clone(), }, )); let _ = self.inner.addr_to_slot.insert_sync(outer_ino, slot_idx); @@ -348,9 +352,8 @@ where let child_inode = tracked.inode; // Translate inner address back to composite-level address (outside scc guard). - let outer_ino = bridge.backward_or_insert(child_inode.addr, || { - self.inner.next_ino.fetch_add(1, Ordering::Relaxed) - }); + let fallback = self.allocate_ino(); + let outer_ino = bridge.backward_or_insert(child_inode.addr, fallback); let _ = self.inner.addr_to_slot.insert_sync(outer_ino, slot_idx); @@ -405,9 +408,8 @@ where // Translate all inner addresses to composite-level addresses (outside scc guard). let mut entries = Vec::with_capacity(child_entries.len()); for (name, child_inode) in child_entries { - let outer_ino = bridge.backward_or_insert(child_inode.addr, || { - self.inner.next_ino.fetch_add(1, Ordering::Relaxed) - }); + let fallback = self.allocate_ino(); + let outer_ino = bridge.backward_or_insert(child_inode.addr, fallback); let _ = self.inner.addr_to_slot.insert_sync(outer_ino, slot_idx); entries.push(( @@ -479,11 +481,8 @@ where .inner .slots .remove_if_sync(&slot_idx, |slot| slot.bridge.is_empty()); - if removed.is_some() { - // Clean up name_to_slot to prevent dead slot indices. - self.inner - .name_to_slot - .retain_sync(|_, &mut idx| idx != slot_idx); + if let Some((_, slot)) = removed { + self.inner.name_to_slot.remove_sync(&slot.name); } } } diff --git a/lib/fs/dcache.rs b/lib/fs/dcache.rs index 247070e9..abd412d2 100644 --- a/lib/fs/dcache.rs +++ b/lib/fs/dcache.rs @@ -112,21 +112,22 @@ impl DCache { children.insert(name, value); } - /// Returns all cached children of `parent_ino` as `(name, value)` pairs. + /// Iterate all cached children of `parent_ino` in name-sorted order. 
/// - /// Entries are returned in name-sorted order (guaranteed by `BTreeMap`). - pub fn readdir(&self, parent_ino: LoadedAddr) -> Vec<(OsString, DValue)> { + /// Calls `f` for each `(name, value)` pair while holding the read lock. + /// Callers decide what to collect, avoiding unnecessary allocations for + /// entries that will be skipped (e.g. by offset-based pagination). + pub fn readdir(&self, parent_ino: LoadedAddr, mut f: impl FnMut(&OsStr, &DValue)) { let Some(state) = self.dirs.read_sync(&parent_ino, |_, v| Arc::clone(v)) else { - return Vec::new(); + return; }; let children = state .children .read() .unwrap_or_else(std::sync::PoisonError::into_inner); - children - .iter() - .map(|(k, v)| (k.clone(), v.clone())) - .collect() + for (name, value) in children.iter() { + f(name, value); + } } /// Atomically try to claim a directory for population. diff --git a/tests/bridge_tests.rs b/tests/bridge_tests.rs index b0598e4d..d8389273 100644 --- a/tests/bridge_tests.rs +++ b/tests/bridge_tests.rs @@ -26,15 +26,15 @@ fn forward_missing_returns_none() { fn backward_or_insert_existing_returns_cached() { let bridge = ConcurrentBridge::new(); bridge.insert(10, 100); - let outer = bridge.backward_or_insert(100, || 999); + let outer = bridge.backward_or_insert(100, 999); assert_eq!(outer, 10, "should return existing outer addr"); } #[test] fn backward_or_insert_new_allocates() { let bridge = ConcurrentBridge::new(); - let outer = bridge.backward_or_insert(200, || 50); - assert_eq!(outer, 50, "should use allocator"); + let outer = bridge.backward_or_insert(200, 50); + assert_eq!(outer, 50, "should use fallback address"); assert_eq!(bridge.forward(50), Some(200)); assert_eq!(bridge.backward(200), Some(50)); } diff --git a/tests/dcache_correctness.rs b/tests/dcache_correctness.rs index c2273076..f99d797b 100644 --- a/tests/dcache_correctness.rs +++ b/tests/dcache_correctness.rs @@ -52,7 +52,10 @@ async fn readdir_returns_only_children_of_parent() { 
LoadedAddr::new_unchecked(12), false, ); - let children = cache.readdir(LoadedAddr::new_unchecked(1)); + let mut children = Vec::new(); + cache.readdir(LoadedAddr::new_unchecked(1), |name, dvalue| { + children.push((name.to_os_string(), dvalue.clone())); + }); assert_eq!(children.len(), 2); let names: Vec<_> = children.iter().map(|(n, _)| n.clone()).collect(); assert!(names.contains(&OsString::from("a"))); @@ -62,7 +65,10 @@ async fn readdir_returns_only_children_of_parent() { #[tokio::test] async fn readdir_empty_parent_returns_empty() { let cache = DCache::new(); - let children = cache.readdir(LoadedAddr::new_unchecked(1)); + let mut children = Vec::new(); + cache.readdir(LoadedAddr::new_unchecked(1), |name, dvalue| { + children.push((name.to_os_string(), dvalue.clone())); + }); assert!(children.is_empty()); } @@ -167,7 +173,9 @@ async fn readdir_returns_entries_in_sorted_order() { false, ); } - let children = cache.readdir(LoadedAddr::new_unchecked(1)); - let names: Vec<_> = children.iter().map(|(n, _)| n.to_str().unwrap()).collect(); + let mut names = Vec::new(); + cache.readdir(LoadedAddr::new_unchecked(1), |name, _| { + names.push(name.to_str().unwrap().to_owned()); + }); assert_eq!(names, ["apple", "mango", "zebra"]); } From f274a5a7782d19f792829131818046c27e8b1bab Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Sat, 21 Feb 2026 19:26:59 -0800 Subject: [PATCH 37/58] refactor: replace ouroboros with Arc in AsyncFs, InodeLifecycle, FuseBridgeInner, ChildInner Remove the ouroboros dependency entirely. All self-referencing structs now use Arc for shared ownership, which is simpler and enables spawning background tasks that reference the inode table. 
--- Cargo.lock | 74 +------------------- Cargo.toml | 1 - lib/cache/async_backed.rs | 3 +- lib/fs/async_fs.rs | 86 ++++++++++------------- lib/fs/composite.rs | 48 ++++--------- lib/fs/fuser.rs | 72 +++++++------------- tests/async_fs_correctness.rs | 124 +++++++++++++++++----------------- tests/composite_fs_tests.rs | 33 ++++----- 8 files changed, 156 insertions(+), 285 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1050f46b..d4cf1499 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11,12 +11,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "aliasable" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "250f629c0161ad8107cf89319e990051fae62832fd343083bea452d93e2205fd" - [[package]] name = "android_system_properties" version = "0.1.5" @@ -234,7 +228,7 @@ version = "4.5.55" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a92793da1a46a5f2a02a6f4c46c6496b28c43638adea8306fcb0caa1634f24e5" dependencies = [ - "heck 0.5.0", + "heck", "proc-macro2", "quote", "syn", @@ -764,7 +758,6 @@ dependencies = [ "opentelemetry", "opentelemetry-otlp", "opentelemetry_sdk", - "ouroboros", "rand", "reqwest", "reqwest-middleware", @@ -846,12 +839,6 @@ dependencies = [ "hashbrown 0.16.1", ] -[[package]] -name = "heck" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" - [[package]] name = "heck" version = "0.5.0" @@ -1510,30 +1497,6 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" -[[package]] -name = "ouroboros" -version = "0.18.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e0f050db9c44b97a94723127e6be766ac5c340c48f2c4bb3ffa11713744be59" -dependencies = [ - "aliasable", - "ouroboros_macro", - "static_assertions", -] - -[[package]] -name = 
"ouroboros_macro" -version = "0.18.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c7028bdd3d43083f6d8d4d5187680d0d3560d54df4cc9d752005268b41e64d0" -dependencies = [ - "heck 0.4.1", - "proc-macro2", - "proc-macro2-diagnostics", - "quote", - "syn", -] - [[package]] name = "page_size" version = "0.6.0" @@ -1660,19 +1623,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "proc-macro2-diagnostics" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af066a9c399a26e020ada66a034357a868728e72cd426f3adcd35f80d88d88c8" -dependencies = [ - "proc-macro2", - "quote", - "syn", - "version_check", - "yansi", -] - [[package]] name = "prost" version = "0.13.5" @@ -2362,12 +2312,6 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" -[[package]] -name = "static_assertions" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" - [[package]] name = "strsim" version = "0.11.1" @@ -2921,12 +2865,6 @@ dependencies = [ "rustversion", ] -[[package]] -name = "version_check" -version = "0.9.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" - [[package]] name = "vt100" version = "0.16.2" @@ -3371,7 +3309,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" dependencies = [ "anyhow", - "heck 0.5.0", + "heck", "wit-parser", ] @@ -3382,7 +3320,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" dependencies = [ "anyhow", - "heck 0.5.0", + "heck", "indexmap 2.13.0", "prettyplease", "syn", @@ -3449,12 
+3387,6 @@ version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" -[[package]] -name = "yansi" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049" - [[package]] name = "yoke" version = "0.8.1" diff --git a/Cargo.toml b/Cargo.toml index dcf7b555..d837f7fe 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -49,7 +49,6 @@ tracing-indicatif = "0.3.14" opentelemetry = { version = "0.29" } opentelemetry_sdk = { version = "0.29", features = ["rt-tokio"] } opentelemetry-otlp = { version = "0.29", default-features = false, features = ["http-proto", "trace", "reqwest-blocking-client"] } -ouroboros = "0.18" tracing-opentelemetry = { version = "0.30" } hashlink = "0.11.0" diff --git a/lib/cache/async_backed.rs b/lib/cache/async_backed.rs index 9f3a7f94..6ec95d75 100644 --- a/lib/cache/async_backed.rs +++ b/lib/cache/async_backed.rs @@ -347,8 +347,7 @@ where /// Synchronously insert a value, overwriting any existing entry. /// - /// Suitable for seeding the cache before async operations begin (e.g. - /// inside an ouroboros builder where async is unavailable). + /// Suitable for seeding the cache before async operations begin. pub fn insert_sync(&self, key: K, value: V) { drop(self.map.insert_sync(key, Slot::Ready(value))); } diff --git a/lib/fs/async_fs.rs b/lib/fs/async_fs.rs index 1899e136..32c5251d 100644 --- a/lib/fs/async_fs.rs +++ b/lib/fs/async_fs.rs @@ -86,18 +86,18 @@ pub trait FsDataProvider: Clone + Send + Sync + 'static { pub struct InodeForget; /// Evicts the inode from the table only. Used when no data provider is available. 
-impl<'a> StatelessDrop<&'a FutureBackedCache, InodeAddr> for InodeForget { - fn delete(inode_table: &&'a FutureBackedCache, addr: &InodeAddr) { +impl StatelessDrop>, InodeAddr> for InodeForget { + fn delete(inode_table: &Arc>, addr: &InodeAddr) { inode_table.remove_sync(addr); } } /// Evicts the inode from the table and delegates to [`FsDataProvider::forget`] /// so the provider can clean up its own auxiliary state. -impl<'a, DP: FsDataProvider> StatelessDrop<(&'a FutureBackedCache, DP), InodeAddr> +impl StatelessDrop<(Arc>, DP), InodeAddr> for InodeForget { - fn delete(ctx: &(&'a FutureBackedCache, DP), key: &InodeAddr) { + fn delete(ctx: &(Arc>, DP), key: &InodeAddr) { ctx.0.remove_sync(key); ctx.1.forget(*key); } @@ -134,44 +134,29 @@ impl OpenFile { } } -mod inode_lifecycle_impl { - #![allow(clippy::future_not_send, clippy::mem_forget)] - use ouroboros::self_referencing; - - use crate::cache::async_backed::FutureBackedCache; - use crate::drop_ward::DropWard; - use crate::fs::InodeAddr; - - use super::{INode, InodeForget}; - - /// Co-located inode table and reference-count ward. - /// - /// The ward borrows the table directly (no `Arc`) via `ouroboros`. - /// When `dec` reaches zero for a key, [`InodeForget::delete`] synchronously - /// removes that inode from the table. - #[self_referencing] - pub struct InodeLifecycle { - pub(super) table: FutureBackedCache, - #[borrows(table)] - #[not_covariant] - pub(super) ward: - DropWard<&'this FutureBackedCache, InodeAddr, InodeForget>, - } - - impl InodeLifecycle { - /// Create a new lifecycle managing the given inode table. - pub fn from_table(table: FutureBackedCache) -> Self { - Self::new(table, |tbl| DropWard::new(tbl)) - } - } +/// Co-located inode table and reference-count ward. +/// +/// When `dec` reaches zero for a key, [`InodeForget::delete`] synchronously +/// removes that inode from the table. 
+pub struct InodeLifecycle { + table: Arc>, + ward: crate::drop_ward::DropWard< + Arc>, + InodeAddr, + InodeForget, + >, } -pub use inode_lifecycle_impl::InodeLifecycle; - impl InodeLifecycle { + /// Create a new lifecycle managing the given inode table. + pub fn from_table(table: Arc>) -> Self { + let ward = crate::drop_ward::DropWard::new(Arc::clone(&table)); + Self { table, ward } + } + /// Increment the reference count for an inode address. pub fn inc(&mut self, addr: InodeAddr) -> usize { - self.with_ward_mut(|ward| ward.inc(addr)) + self.ward.inc(addr) } /// Decrement the reference count for an inode address. @@ -179,20 +164,20 @@ impl InodeLifecycle { /// When the count reaches zero, the inode is automatically evicted /// from the table via [`InodeForget::delete`]. pub fn dec(&mut self, addr: &InodeAddr) -> Option { - self.with_ward_mut(|ward| ward.dec(addr)) + self.ward.dec(addr) } /// Decrement the reference count by `count`. /// /// When the count reaches zero, the inode is automatically evicted. pub fn dec_count(&mut self, addr: &InodeAddr, count: usize) -> Option { - self.with_ward_mut(|ward| ward.dec_count(addr, count)) + self.ward.dec_count(addr, count) } /// Read-only access to the underlying inode table. #[must_use] pub fn table(&self) -> &FutureBackedCache { - self.borrow_table() + &self.table } } @@ -242,16 +227,16 @@ impl Drop for PopulateGuard<'_> { /// called on a true cache miss (not already cached or in-flight). /// /// The [`DCache`] sits in front as a synchronous fast path mapping `(parent, name)` to child addr. -pub struct AsyncFs<'tbl, DP: FsDataProvider> { +pub struct AsyncFs { /// Canonical addr -> `INode` map. Used by `loaded_inode()` to retrieve inodes by address. - inode_table: &'tbl FutureBackedCache, + inode_table: Arc>, /// Deduplicating lookup cache keyed by `(parent_addr, child_name)`. The factory is /// `dp.lookup()`, so the data provider is only called on a true cache miss. 
lookup_cache: FutureBackedCache<(InodeAddr, Arc), INode>, /// Directory entry cache, mapping `(parent, name)` to child inode address. - directory_cache: DCache, + directory_cache: Arc, /// The data provider used to fetch inode data on cache misses. data_provider: DP, @@ -260,12 +245,12 @@ pub struct AsyncFs<'tbl, DP: FsDataProvider> { next_fh: AtomicU64, } -impl<'tbl, DP: FsDataProvider> AsyncFs<'tbl, DP> { +impl AsyncFs { /// Create a new `AsyncFs`, seeding the root inode into the table. pub async fn new( data_provider: DP, root: INode, - inode_table: &'tbl FutureBackedCache, + inode_table: Arc>, ) -> Self { inode_table .get_or_init(root.addr, || async move { root }) @@ -274,7 +259,7 @@ impl<'tbl, DP: FsDataProvider> AsyncFs<'tbl, DP> { Self { inode_table, lookup_cache: FutureBackedCache::default(), - directory_cache: DCache::new(), + directory_cache: Arc::new(DCache::new()), data_provider, next_fh: AtomicU64::new(1), } @@ -282,18 +267,17 @@ impl<'tbl, DP: FsDataProvider> AsyncFs<'tbl, DP> { /// Create a new `AsyncFs`, assuming the root inode is already in the table. /// - /// This synchronous constructor is needed for ouroboros builders where - /// async is unavailable. The caller must ensure the root inode has already - /// been inserted into `inode_table` (e.g. via [`FutureBackedCache::insert_sync`]). + /// The caller must ensure the root inode has already been inserted into + /// `inode_table` (e.g. via [`FutureBackedCache::insert_sync`]). 
#[must_use] pub fn new_preseeded( data_provider: DP, - inode_table: &'tbl FutureBackedCache, + inode_table: Arc>, ) -> Self { Self { inode_table, lookup_cache: FutureBackedCache::default(), - directory_cache: DCache::new(), + directory_cache: Arc::new(DCache::new()), data_provider, next_fh: AtomicU64::new(1), } diff --git a/lib/fs/composite.rs b/lib/fs/composite.rs index b564924d..92361344 100644 --- a/lib/fs/composite.rs +++ b/lib/fs/composite.rs @@ -14,7 +14,7 @@ use std::sync::atomic::{AtomicU64, Ordering}; use bytes::Bytes; use crate::cache::async_backed::FutureBackedCache; -use crate::fs::async_fs::{FileReader, FsDataProvider, OpenFile}; +use crate::fs::async_fs::{AsyncFs, FileReader, FsDataProvider, OpenFile}; use crate::fs::bridge::ConcurrentBridge; use crate::fs::{INode, INodeType, InodeAddr, InodePerms, LoadedAddr, OpenFlags}; @@ -53,45 +53,25 @@ pub trait CompositeRoot: Send + Sync + 'static { ) -> impl Future>, std::io::Error>> + Send; } -mod child_inner_impl { - #![allow(clippy::future_not_send, clippy::mem_forget)] - - use ouroboros::self_referencing; - - use crate::cache::async_backed::FutureBackedCache; - use crate::fs::async_fs::{AsyncFs, FsDataProvider}; - use crate::fs::{INode, InodeAddr}; +/// Co-locates an inode table and [`AsyncFs`]. +pub struct ChildInner { + #[expect(dead_code)] + table: Arc>, + fs: AsyncFs, +} - /// Self-referential struct co-locating an inode table and [`AsyncFs`]. - /// - /// The `AsyncFs` borrows from the table directly, avoiding an extra - /// indirection. This mirrors the [`FuseBridgeInner`](super::super::fuser) - /// pattern. 
- #[self_referencing] - pub struct ChildInner { - pub(super) table: FutureBackedCache, - #[borrows(table)] - #[covariant] - pub(super) fs: AsyncFs<'this, DP>, +impl ChildInner { + pub(crate) fn create(table: FutureBackedCache, provider: DP) -> Self { + let table = Arc::new(table); + let fs = AsyncFs::new_preseeded(provider, Arc::clone(&table)); + Self { table, fs } } - impl ChildInner { - pub(super) fn create(table: FutureBackedCache, provider: DP) -> Self { - ChildInnerBuilder { - table, - fs_builder: |tbl| AsyncFs::new_preseeded(provider, tbl), - } - .build() - } - - pub(super) fn get_fs(&self) -> &AsyncFs<'_, DP> { - self.borrow_fs() - } + pub(crate) fn get_fs(&self) -> &AsyncFs { + &self.fs } } -pub use child_inner_impl::ChildInner; - /// Wraps a child's reader so that the composite layer can expose it as its own /// [`FileReader`]. pub struct CompositeReader { diff --git a/lib/fs/fuser.rs b/lib/fs/fuser.rs index 15fa36f7..06c27d4d 100644 --- a/lib/fs/fuser.rs +++ b/lib/fs/fuser.rs @@ -69,62 +69,38 @@ impl FuseResultExt for Result { } } -mod inner { - #![allow(clippy::future_not_send, clippy::mem_forget)] - - use ouroboros::self_referencing; - - use crate::cache::async_backed::FutureBackedCache; - use crate::drop_ward::DropWard; - use crate::fs::async_fs::{AsyncFs, FsDataProvider, InodeForget}; - use crate::fs::{INode, InodeAddr}; +type FuseWard = crate::drop_ward::DropWard< + (Arc>, DP), + InodeAddr, + super::async_fs::InodeForget, +>; + +struct FuseBridgeInner { + ward: FuseWard, + fs: super::async_fs::AsyncFs, +} - /// Self-referential struct holding the inode table, refcount ward, and `AsyncFs`. - /// - /// Both `ward` and `fs` borrow from `table`. The ward manages inode - /// refcounts; the fs serves lookup/readdir/open/read operations. - /// - /// The ward context is `(&table, DP)` so that [`InodeForget`] can both - /// remove the inode from the table and call `dp.forget()` to clean up - /// provider-internal maps (bridge mappings, path maps, etc.). 
- #[self_referencing] - pub(super) struct FuseBridgeInner { - table: FutureBackedCache, - #[borrows(table)] - #[not_covariant] - ward: DropWard<(&'this FutureBackedCache, DP), InodeAddr, InodeForget>, - #[borrows(table)] - #[covariant] - fs: AsyncFs<'this, DP>, +impl FuseBridgeInner { + fn create(table: FutureBackedCache, provider: DP) -> Self { + let table = Arc::new(table); + let ward = crate::drop_ward::DropWard::new((Arc::clone(&table), provider.clone())); + let fs = super::async_fs::AsyncFs::new_preseeded(provider, table); + Self { ward, fs } } - impl FuseBridgeInner { - pub(super) fn create(table: FutureBackedCache, provider: DP) -> Self { - let ward_provider = provider.clone(); - FuseBridgeInnerBuilder { - table, - ward_builder: |tbl| DropWard::new((tbl, ward_provider)), - fs_builder: |tbl| AsyncFs::new_preseeded(provider, tbl), - } - .build() - } - - pub(super) fn get_fs(&self) -> &AsyncFs<'_, DP> { - self.borrow_fs() - } + fn get_fs(&self) -> &super::async_fs::AsyncFs { + &self.fs + } - pub(super) fn ward_inc(&mut self, addr: InodeAddr) -> usize { - self.with_ward_mut(|ward| ward.inc(addr)) - } + fn ward_inc(&mut self, addr: InodeAddr) -> usize { + self.ward.inc(addr) + } - pub(super) fn ward_dec_count(&mut self, addr: InodeAddr, count: usize) -> Option { - self.with_ward_mut(|ward| ward.dec_count(&addr, count)) - } + fn ward_dec_count(&mut self, addr: InodeAddr, count: usize) -> Option { + self.ward.dec_count(&addr, count) } } -use inner::FuseBridgeInner; - /// Convert an `INode` to the fuser-specific `FileAttr`. 
fn inode_to_fuser_attr(inode: &INode, block_size: u32) -> fuser::FileAttr { fuser::FileAttr { diff --git a/tests/async_fs_correctness.rs b/tests/async_fs_correctness.rs index 643e7b1e..5ce202c6 100644 --- a/tests/async_fs_correctness.rs +++ b/tests/async_fs_correctness.rs @@ -13,11 +13,11 @@ use common::async_fs_mocks::{MockFsDataProvider, MockFsState, make_inode}; #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn lifecycle_inc_returns_count_after_increment() { - let table = FutureBackedCache::default(); + let table = Arc::new(FutureBackedCache::default()); let inode = make_inode(100, INodeType::File, 0, Some(1)); table.insert_sync(100, inode); - let mut lifecycle = InodeLifecycle::from_table(table); + let mut lifecycle = InodeLifecycle::from_table(Arc::clone(&table)); assert_eq!(lifecycle.inc(100), 1, "first inc should return 1"); assert_eq!(lifecycle.inc(100), 2, "second inc should return 2"); @@ -26,11 +26,11 @@ async fn lifecycle_inc_returns_count_after_increment() { #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn lifecycle_dec_returns_remaining_count() { - let table = FutureBackedCache::default(); + let table = Arc::new(FutureBackedCache::default()); let inode = make_inode(100, INodeType::File, 0, Some(1)); table.insert_sync(100, inode); - let mut lifecycle = InodeLifecycle::from_table(table); + let mut lifecycle = InodeLifecycle::from_table(Arc::clone(&table)); lifecycle.inc(100); lifecycle.inc(100); @@ -40,8 +40,8 @@ async fn lifecycle_dec_returns_remaining_count() { #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn lifecycle_dec_unknown_addr_returns_none() { - let table: FutureBackedCache = FutureBackedCache::default(); - let mut lifecycle = InodeLifecycle::from_table(table); + let table: Arc> = Arc::new(FutureBackedCache::default()); + let mut lifecycle = InodeLifecycle::from_table(Arc::clone(&table)); assert_eq!( lifecycle.dec(&999), @@ -52,11 +52,11 @@ async fn 
lifecycle_dec_unknown_addr_returns_none() { #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn lifecycle_dec_to_zero_evicts_from_table() { - let table = FutureBackedCache::default(); + let table = Arc::new(FutureBackedCache::default()); let inode = make_inode(100, INodeType::File, 0, Some(1)); table.insert_sync(100, inode); - let mut lifecycle = InodeLifecycle::from_table(table); + let mut lifecycle = InodeLifecycle::from_table(Arc::clone(&table)); lifecycle.inc(100); assert_eq!(lifecycle.dec(&100), Some(0)); @@ -69,11 +69,11 @@ async fn lifecycle_dec_to_zero_evicts_from_table() { #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn lifecycle_dec_count_decrements_by_n() { - let table: FutureBackedCache = FutureBackedCache::default(); + let table: Arc> = Arc::new(FutureBackedCache::default()); let inode = make_inode(100, INodeType::File, 0, Some(1)); table.insert_sync(100, inode); - let mut lifecycle = InodeLifecycle::from_table(table); + let mut lifecycle = InodeLifecycle::from_table(Arc::clone(&table)); lifecycle.inc(100); lifecycle.inc(100); lifecycle.inc(100); // count = 3 @@ -87,11 +87,11 @@ async fn lifecycle_dec_count_decrements_by_n() { #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn lifecycle_dec_count_to_zero_evicts() { - let table = FutureBackedCache::default(); + let table = Arc::new(FutureBackedCache::default()); let inode = make_inode(100, INodeType::File, 0, Some(1)); table.insert_sync(100, inode); - let mut lifecycle = InodeLifecycle::from_table(table); + let mut lifecycle = InodeLifecycle::from_table(Arc::clone(&table)); lifecycle.inc(100); lifecycle.inc(100); // count = 2 @@ -104,11 +104,11 @@ async fn lifecycle_dec_count_to_zero_evicts() { #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn lifecycle_table_returns_underlying_cache() { - let table = FutureBackedCache::default(); + let table = Arc::new(FutureBackedCache::default()); let inode = make_inode(42, 
INodeType::Directory, 0, None); table.insert_sync(42, inode); - let lifecycle = InodeLifecycle::from_table(table); + let lifecycle = InodeLifecycle::from_table(Arc::clone(&table)); let fetched = lifecycle.table().get(&42).await; assert_eq!( @@ -120,11 +120,11 @@ async fn lifecycle_table_returns_underlying_cache() { #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn new_seeds_root_inode_into_table() { - let table = FutureBackedCache::default(); + let table = Arc::new(FutureBackedCache::default()); let root = make_inode(1, INodeType::Directory, 0, None); let dp = MockFsDataProvider::new(MockFsState::default()); - let fs = AsyncFs::new(dp, root, &table).await; + let fs = AsyncFs::new(dp, root, Arc::clone(&table)).await; assert_eq!(fs.inode_count(), 1, "root should be the only inode"); let fetched = table.get(&1).await; @@ -137,10 +137,10 @@ async fn new_seeds_root_inode_into_table() { #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn new_preseeded_does_not_insert_root() { - let table: FutureBackedCache = FutureBackedCache::default(); + let table: Arc> = Arc::new(FutureBackedCache::default()); let dp = MockFsDataProvider::new(MockFsState::default()); - let fs = AsyncFs::new_preseeded(dp, &table); + let fs = AsyncFs::new_preseeded(dp, Arc::clone(&table)); assert_eq!( fs.inode_count(), @@ -151,11 +151,11 @@ async fn new_preseeded_does_not_insert_root() { #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn statfs_reports_inode_count() { - let table = FutureBackedCache::default(); + let table = Arc::new(FutureBackedCache::default()); let root = make_inode(1, INodeType::Directory, 0, None); let dp = MockFsDataProvider::new(MockFsState::default()); - let fs = AsyncFs::new(dp, root, &table).await; + let fs = AsyncFs::new(dp, root, Arc::clone(&table)).await; let stats = fs.statfs(); assert_eq!(stats.block_size, 4096); @@ -166,11 +166,11 @@ async fn statfs_reports_inode_count() { #[tokio::test(flavor = "multi_thread", 
worker_threads = 2)] async fn loaded_inode_returns_seeded_inode() { - let table = FutureBackedCache::default(); + let table = Arc::new(FutureBackedCache::default()); let root = make_inode(1, INodeType::Directory, 0, None); let dp = MockFsDataProvider::new(MockFsState::default()); - let fs = AsyncFs::new(dp, root, &table).await; + let fs = AsyncFs::new(dp, root, Arc::clone(&table)).await; let inode = fs.loaded_inode(LoadedAddr::new_unchecked(1)).await.unwrap(); assert_eq!(inode.addr, 1); @@ -179,11 +179,11 @@ async fn loaded_inode_returns_seeded_inode() { #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn loaded_inode_returns_enoent_for_missing_addr() { - let table = FutureBackedCache::default(); + let table = Arc::new(FutureBackedCache::default()); let root = make_inode(1, INodeType::Directory, 0, None); let dp = MockFsDataProvider::new(MockFsState::default()); - let fs = AsyncFs::new(dp, root, &table).await; + let fs = AsyncFs::new(dp, root, Arc::clone(&table)).await; let err = fs .loaded_inode(LoadedAddr::new_unchecked(999)) @@ -194,11 +194,11 @@ async fn loaded_inode_returns_enoent_for_missing_addr() { #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn getattr_delegates_to_loaded_inode() { - let table = FutureBackedCache::default(); + let table = Arc::new(FutureBackedCache::default()); let root = make_inode(1, INodeType::Directory, 4096, None); let dp = MockFsDataProvider::new(MockFsState::default()); - let fs = AsyncFs::new(dp, root, &table).await; + let fs = AsyncFs::new(dp, root, Arc::clone(&table)).await; let inode = fs.getattr(LoadedAddr::new_unchecked(1)).await.unwrap(); assert_eq!(inode.addr, 1); @@ -214,8 +214,8 @@ async fn lookup_resolves_child_via_data_provider() { state.lookups.insert((1, "readme.md".into()), child); let dp = MockFsDataProvider::new(state); - let table = FutureBackedCache::default(); - let fs = AsyncFs::new(dp, root, &table).await; + let table = Arc::new(FutureBackedCache::default()); + let fs = 
AsyncFs::new(dp, root, Arc::clone(&table)).await; let tracked = fs .lookup(LoadedAddr::new_unchecked(1), OsStr::new("readme.md")) @@ -236,8 +236,8 @@ async fn lookup_populates_inode_table() { state.lookups.insert((1, "file.txt".into()), child); let dp = MockFsDataProvider::new(state); - let table = FutureBackedCache::default(); - let fs = AsyncFs::new(dp, root, &table).await; + let table = Arc::new(FutureBackedCache::default()); + let fs = AsyncFs::new(dp, root, Arc::clone(&table)).await; fs.lookup(LoadedAddr::new_unchecked(1), OsStr::new("file.txt")) .await @@ -261,8 +261,8 @@ async fn lookup_second_call_uses_cache() { state.lookups.insert((1, "cached.txt".into()), child); let dp = MockFsDataProvider::new(state); - let table = FutureBackedCache::default(); - let fs = AsyncFs::new(dp, root, &table).await; + let table = Arc::new(FutureBackedCache::default()); + let fs = AsyncFs::new(dp, root, Arc::clone(&table)).await; let first = fs .lookup(LoadedAddr::new_unchecked(1), OsStr::new("cached.txt")) @@ -282,8 +282,8 @@ async fn lookup_propagates_provider_error() { // No lookups configured — provider will return ENOENT. 
let dp = MockFsDataProvider::new(MockFsState::default()); - let table = FutureBackedCache::default(); - let fs = AsyncFs::new(dp, root, &table).await; + let table = Arc::new(FutureBackedCache::default()); + let fs = AsyncFs::new(dp, root, Arc::clone(&table)).await; let err = fs .lookup(LoadedAddr::new_unchecked(1), OsStr::new("nonexistent")) @@ -305,9 +305,9 @@ async fn open_returns_file_handle_and_reader() { .insert(10, bytes::Bytes::from_static(b"hello")); let dp = MockFsDataProvider::new(state); - let table = FutureBackedCache::default(); + let table = Arc::new(FutureBackedCache::default()); table.insert_sync(10, file); - let fs = AsyncFs::new(dp, root, &table).await; + let fs = AsyncFs::new(dp, root, Arc::clone(&table)).await; let open_file = fs .open(LoadedAddr::new_unchecked(10), OpenFlags::RDONLY) @@ -324,8 +324,8 @@ async fn open_returns_eisdir_for_directory() { let root = make_inode(1, INodeType::Directory, 0, None); let dp = MockFsDataProvider::new(MockFsState::default()); - let table = FutureBackedCache::default(); - let fs = AsyncFs::new(dp, root, &table).await; + let table = Arc::new(FutureBackedCache::default()); + let fs = AsyncFs::new(dp, root, Arc::clone(&table)).await; let err = fs .open(LoadedAddr::new_unchecked(1), OpenFlags::RDONLY) @@ -339,8 +339,8 @@ async fn open_returns_enoent_for_missing_inode() { let root = make_inode(1, INodeType::Directory, 0, None); let dp = MockFsDataProvider::new(MockFsState::default()); - let table = FutureBackedCache::default(); - let fs = AsyncFs::new(dp, root, &table).await; + let table = Arc::new(FutureBackedCache::default()); + let fs = AsyncFs::new(dp, root, Arc::clone(&table)).await; let err = fs .open(LoadedAddr::new_unchecked(999), OpenFlags::RDONLY) @@ -356,9 +356,9 @@ async fn open_assigns_unique_file_handles() { let dp = MockFsDataProvider::new(MockFsState::default()); - let table = FutureBackedCache::default(); + let table = Arc::new(FutureBackedCache::default()); table.insert_sync(10, file); - let fs = 
AsyncFs::new(dp, root, &table).await; + let fs = AsyncFs::new(dp, root, Arc::clone(&table)).await; let fh1 = fs .open(LoadedAddr::new_unchecked(10), OpenFlags::RDONLY) @@ -385,9 +385,9 @@ async fn open_file_read_with_offset() { .insert(10, bytes::Bytes::from_static(b"hello world")); let dp = MockFsDataProvider::new(state); - let table = FutureBackedCache::default(); + let table = Arc::new(FutureBackedCache::default()); table.insert_sync(10, file); - let fs = AsyncFs::new(dp, root, &table).await; + let fs = AsyncFs::new(dp, root, Arc::clone(&table)).await; let open_file = fs .open(LoadedAddr::new_unchecked(10), OpenFlags::RDONLY) @@ -416,8 +416,8 @@ async fn readdir_lists_children_sorted_by_name() { ); let dp = MockFsDataProvider::new(state); - let table = FutureBackedCache::default(); - let fs = AsyncFs::new(dp, root, &table).await; + let table = Arc::new(FutureBackedCache::default()); + let fs = AsyncFs::new(dp, root, Arc::clone(&table)).await; let mut entries: Vec<(OsString, u64)> = Vec::new(); fs.readdir(LoadedAddr::new_unchecked(1), 0, |entry, _offset| { @@ -452,8 +452,8 @@ async fn readdir_respects_offset() { ); let dp = MockFsDataProvider::new(state); - let table = FutureBackedCache::default(); - let fs = AsyncFs::new(dp, root, &table).await; + let table = Arc::new(FutureBackedCache::default()); + let fs = AsyncFs::new(dp, root, Arc::clone(&table)).await; // First readdir to populate cache fs.readdir(LoadedAddr::new_unchecked(1), 0, |_, _| false) @@ -490,8 +490,8 @@ async fn readdir_stops_when_filler_returns_true() { ); let dp = MockFsDataProvider::new(state); - let table = FutureBackedCache::default(); - let fs = AsyncFs::new(dp, root, &table).await; + let table = Arc::new(FutureBackedCache::default()); + let fs = AsyncFs::new(dp, root, Arc::clone(&table)).await; let mut count = 0; fs.readdir(LoadedAddr::new_unchecked(1), 0, |_, _| { @@ -511,9 +511,9 @@ async fn readdir_returns_enotdir_for_file() { let dp = MockFsDataProvider::new(MockFsState::default()); - 
let table = FutureBackedCache::default(); + let table = Arc::new(FutureBackedCache::default()); table.insert_sync(10, file); - let fs = AsyncFs::new(dp, root, &table).await; + let fs = AsyncFs::new(dp, root, Arc::clone(&table)).await; let err = fs .readdir(LoadedAddr::new_unchecked(10), 0, |_, _| false) @@ -533,8 +533,8 @@ async fn readdir_populates_inode_table_with_children() { .insert(1, vec![(OsString::from("child.txt"), child)]); let dp = MockFsDataProvider::new(state); - let table = FutureBackedCache::default(); - let fs = AsyncFs::new(dp, root, &table).await; + let table = Arc::new(FutureBackedCache::default()); + let fs = AsyncFs::new(dp, root, Arc::clone(&table)).await; fs.readdir(LoadedAddr::new_unchecked(1), 0, |_, _| false) .await @@ -556,8 +556,8 @@ async fn readdir_empty_directory() { state.directories.insert(1, vec![]); let dp = MockFsDataProvider::new(state); - let table = FutureBackedCache::default(); - let fs = AsyncFs::new(dp, root, &table).await; + let table = Arc::new(FutureBackedCache::default()); + let fs = AsyncFs::new(dp, root, Arc::clone(&table)).await; let mut count = 0; fs.readdir(LoadedAddr::new_unchecked(1), 0, |_, _| { @@ -586,8 +586,8 @@ async fn readdir_provides_correct_next_offsets() { ); let dp = MockFsDataProvider::new(state); - let table = FutureBackedCache::default(); - let fs = AsyncFs::new(dp, root, &table).await; + let table = Arc::new(FutureBackedCache::default()); + let fs = AsyncFs::new(dp, root, Arc::clone(&table)).await; let mut offsets: Vec = Vec::new(); fs.readdir(LoadedAddr::new_unchecked(1), 0, |_, next_offset| { @@ -615,8 +615,8 @@ async fn lookup_after_eviction_returns_fresh_inode() { let dp = MockFsDataProvider::new(state); let state_ref = Arc::clone(&dp.state); - let table = FutureBackedCache::default(); - let fs = AsyncFs::new(dp, root, &table).await; + let table = Arc::new(FutureBackedCache::default()); + let fs = AsyncFs::new(dp, root, Arc::clone(&table)).await; // First lookup → addr=10 let first = fs @@ 
-660,8 +660,8 @@ async fn lookup_after_readdir_uses_directory_cache() { .insert(1, vec![(OsString::from("file.txt"), child)]); let dp = MockFsDataProvider::new(state); - let table = FutureBackedCache::default(); - let fs = AsyncFs::new(dp, root, &table).await; + let table = Arc::new(FutureBackedCache::default()); + let fs = AsyncFs::new(dp, root, Arc::clone(&table)).await; // readdir populates the directory cache. fs.readdir(LoadedAddr::new_unchecked(1), 0, |_, _| false) diff --git a/tests/composite_fs_tests.rs b/tests/composite_fs_tests.rs index ce110acb..1c263425 100644 --- a/tests/composite_fs_tests.rs +++ b/tests/composite_fs_tests.rs @@ -4,6 +4,7 @@ mod common; use std::collections::HashMap; use std::ffi::{OsStr, OsString}; +use std::sync::Arc; use bytes::Bytes; @@ -53,9 +54,9 @@ async fn composite_root_lookup_resolves_child() { let composite = CompositeFs::new(mock_root, (1000, 1000)); let root_inode = composite.make_root_inode(); - let table = FutureBackedCache::default(); + let table = Arc::new(FutureBackedCache::default()); table.insert_sync(1, root_inode); - let afs = AsyncFs::new_preseeded(composite, &table); + let afs = AsyncFs::new_preseeded(composite, Arc::clone(&table)); let tracked = afs .lookup(LoadedAddr::new_unchecked(1), OsStr::new("repo-a")) @@ -91,9 +92,9 @@ async fn composite_root_readdir_lists_children() { let composite = CompositeFs::new(mock_root, (1000, 1000)); let root_inode = composite.make_root_inode(); - let table = FutureBackedCache::default(); + let table = Arc::new(FutureBackedCache::default()); table.insert_sync(1, root_inode); - let afs = AsyncFs::new_preseeded(composite, &table); + let afs = AsyncFs::new_preseeded(composite, Arc::clone(&table)); let mut entries = Vec::new(); afs.readdir(LoadedAddr::new_unchecked(1), 0, |de, _offset| { @@ -126,9 +127,9 @@ async fn composite_delegated_lookup_reaches_child() { let composite = CompositeFs::new(mock_root, (1000, 1000)); let root_inode = composite.make_root_inode(); - let table = 
FutureBackedCache::default(); + let table = Arc::new(FutureBackedCache::default()); table.insert_sync(1, root_inode); - let afs = AsyncFs::new_preseeded(composite, &table); + let afs = AsyncFs::new_preseeded(composite, Arc::clone(&table)); // First, lookup the child at root level. let child_dir = afs @@ -169,9 +170,9 @@ async fn composite_open_and_read_through_child() { let composite = CompositeFs::new(mock_root, (1000, 1000)); let root_inode = composite.make_root_inode(); - let table = FutureBackedCache::default(); + let table = Arc::new(FutureBackedCache::default()); table.insert_sync(1, root_inode); - let afs = AsyncFs::new_preseeded(composite, &table); + let afs = AsyncFs::new_preseeded(composite, Arc::clone(&table)); // Navigate to the file. let child_dir = afs @@ -212,9 +213,9 @@ async fn composite_lookup_unknown_child_returns_enoent() { let composite = CompositeFs::new(mock_root, (1000, 1000)); let root_inode = composite.make_root_inode(); - let table = FutureBackedCache::default(); + let table = Arc::new(FutureBackedCache::default()); table.insert_sync(1, root_inode); - let afs = AsyncFs::new_preseeded(composite, &table); + let afs = AsyncFs::new_preseeded(composite, Arc::clone(&table)); let err = afs .lookup(LoadedAddr::new_unchecked(1), OsStr::new("nonexistent")) @@ -246,9 +247,9 @@ async fn composite_readdir_delegated_lists_child_contents() { let composite = CompositeFs::new(mock_root, (1000, 1000)); let root_inode = composite.make_root_inode(); - let table = FutureBackedCache::default(); + let table = Arc::new(FutureBackedCache::default()); table.insert_sync(1, root_inode); - let afs = AsyncFs::new_preseeded(composite, &table); + let afs = AsyncFs::new_preseeded(composite, Arc::clone(&table)); // Navigate into the child. 
let child_dir = afs @@ -287,9 +288,9 @@ async fn composite_repeated_lookup_returns_same_addr() { let composite = CompositeFs::new(mock_root, (1000, 1000)); let root_inode = composite.make_root_inode(); - let table = FutureBackedCache::default(); + let table = Arc::new(FutureBackedCache::default()); table.insert_sync(1, root_inode); - let afs = AsyncFs::new_preseeded(composite, &table); + let afs = AsyncFs::new_preseeded(composite, Arc::clone(&table)); let first = afs .lookup(LoadedAddr::new_unchecked(1), OsStr::new("repo")) @@ -318,9 +319,9 @@ async fn composite_forget_cleans_up_slot_and_name_mapping() { let composite = CompositeFs::new(mock_root, (1000, 1000)); let root_inode = composite.make_root_inode(); - let table = FutureBackedCache::default(); + let table = Arc::new(FutureBackedCache::default()); table.insert_sync(1, root_inode); - let afs = AsyncFs::new_preseeded(composite.clone(), &table); + let afs = AsyncFs::new_preseeded(composite.clone(), Arc::clone(&table)); // Look up the child and a file inside it. let child_dir = afs From 12fac6d3b6f91a25ce0be1b2b1303afce754ae69 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Sat, 21 Feb 2026 19:28:57 -0800 Subject: [PATCH 38/58] feat: add DCache::child_dir_addrs for prefetch discovery --- lib/fs/dcache.rs | 20 ++++++++++++++++++ tests/dcache_correctness.rs | 42 +++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) diff --git a/lib/fs/dcache.rs b/lib/fs/dcache.rs index abd412d2..7f22258a 100644 --- a/lib/fs/dcache.rs +++ b/lib/fs/dcache.rs @@ -130,6 +130,26 @@ impl DCache { } } + /// Returns the [`LoadedAddr`] of every child that is itself a directory. + /// + /// Used by the prefetch logic to discover which subdirectories to + /// background-populate after a `readdir` completes. 
+ #[must_use] + pub fn child_dir_addrs(&self, parent_ino: LoadedAddr) -> Vec { + let Some(state) = self.dirs.read_sync(&parent_ino, |_, v| Arc::clone(v)) else { + return Vec::new(); + }; + let children = state + .children + .read() + .unwrap_or_else(std::sync::PoisonError::into_inner); + children + .values() + .filter(|dv| dv.is_dir) + .map(|dv| dv.ino) + .collect() + } + /// Atomically try to claim a directory for population. /// /// Uses `compare_exchange` on the three-state flag: diff --git a/tests/dcache_correctness.rs b/tests/dcache_correctness.rs index f99d797b..7043bd9b 100644 --- a/tests/dcache_correctness.rs +++ b/tests/dcache_correctness.rs @@ -179,3 +179,45 @@ async fn readdir_returns_entries_in_sorted_order() { }); assert_eq!(names, ["apple", "mango", "zebra"]); } + +#[tokio::test] +async fn child_dir_addrs_returns_only_directories() { + let cache = DCache::new(); + let parent = LoadedAddr::new_unchecked(1); + cache.insert( + parent, + OsString::from("file.txt"), + LoadedAddr::new_unchecked(10), + false, + ); + cache.insert( + parent, + OsString::from("subdir"), + LoadedAddr::new_unchecked(11), + true, + ); + cache.insert( + parent, + OsString::from("another_file"), + LoadedAddr::new_unchecked(12), + false, + ); + cache.insert( + parent, + OsString::from("another_dir"), + LoadedAddr::new_unchecked(13), + true, + ); + + let dirs = cache.child_dir_addrs(parent); + assert_eq!(dirs.len(), 2); + assert!(dirs.contains(&LoadedAddr::new_unchecked(11))); + assert!(dirs.contains(&LoadedAddr::new_unchecked(13))); +} + +#[tokio::test] +async fn child_dir_addrs_returns_empty_for_unknown_parent() { + let cache = DCache::new(); + let dirs = cache.child_dir_addrs(LoadedAddr::new_unchecked(999)); + assert!(dirs.is_empty()); +} From e16c0a24e6cccc77fe6d6c83c61ba3ef65d2fe28 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Sat, 21 Feb 2026 19:31:26 -0800 Subject: [PATCH 39/58] feat: prefetch child directories after readdir populates parent After readdir populates a 
directory via the Claimed CAS path, spawn background tokio tasks to prefetch each child directory. This makes subsequent navigation into subdirectories instant since the dcache and inode table are already populated. The prefetch uses the same CAS gate (try_claim_populate) so duplicate work is impossible, and errors are silently ignored since prefetch is best-effort. --- lib/fs/async_fs.rs | 55 ++++++++++++++++++++ tests/async_fs_correctness.rs | 95 ++++++++++++++++++++++++++++++++++ tests/common/async_fs_mocks.rs | 5 ++ 3 files changed, 155 insertions(+) diff --git a/lib/fs/async_fs.rs b/lib/fs/async_fs.rs index 32c5251d..434239a9 100644 --- a/lib/fs/async_fs.rs +++ b/lib/fs/async_fs.rs @@ -219,6 +219,49 @@ impl Drop for PopulateGuard<'_> { } } +/// Background-populate a single child directory into the caches. +/// +/// Uses the same CAS gate as `readdir` so duplicate work is impossible. +/// Errors are silently ignored — prefetch is best-effort. +async fn prefetch_dir( + dir_addr: LoadedAddr, + directory_cache: Arc, + inode_table: Arc>, + data_provider: DP, +) { + use crate::fs::dcache::PopulateStatus; + + match directory_cache.try_claim_populate(dir_addr) { + PopulateStatus::Claimed => {} + PopulateStatus::InProgress | PopulateStatus::Done => return, + } + + let mut guard = PopulateGuard::new(&directory_cache, dir_addr); + + let Some(dir_inode) = inode_table.get(&dir_addr.addr()).await else { + return; + }; + + let Ok(children) = data_provider.readdir(dir_inode).await else { + return; + }; + + for (name, child_inode) in children { + let is_dir = child_inode.itype == INodeType::Directory; + inode_table + .get_or_init(child_inode.addr, || async move { child_inode }) + .await; + directory_cache.insert( + dir_addr, + name, + LoadedAddr::new_unchecked(child_inode.addr), + is_dir, + ); + } + directory_cache.finish_populate(dir_addr); + guard.defuse(); +} + /// An asynchronous filesystem cache mapping `InodeAddr` to `INode`. 
/// /// Uses two [`FutureBackedCache`] layers: @@ -283,6 +326,17 @@ impl AsyncFs { } } + /// Spawn background tasks to prefetch each child directory of `parent`. + fn spawn_prefetch_children(&self, parent: LoadedAddr) { + let child_dirs = self.directory_cache.child_dir_addrs(parent); + for child_addr in child_dirs { + let dcache = Arc::clone(&self.directory_cache); + let table = Arc::clone(&self.inode_table); + let dp = self.data_provider.clone(); + tokio::spawn(prefetch_dir(child_addr, dcache, table, dp)); + } + } + /// Get the total number of inodes currently stored in the inode table. #[must_use] pub fn inode_count(&self) -> usize { @@ -460,6 +514,7 @@ impl AsyncFs { } self.directory_cache.finish_populate(parent); guard.defuse(); + self.spawn_prefetch_children(parent); break; } PopulateStatus::InProgress => { diff --git a/tests/async_fs_correctness.rs b/tests/async_fs_correctness.rs index 5ce202c6..e3087ceb 100644 --- a/tests/async_fs_correctness.rs +++ b/tests/async_fs_correctness.rs @@ -675,3 +675,98 @@ async fn lookup_after_readdir_uses_directory_cache() { .unwrap(); assert_eq!(tracked.inode.addr, 10); } + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn readdir_prefetches_child_directories() { + use std::sync::atomic::Ordering; + + let root = make_inode(1, INodeType::Directory, 0, None); + let child_dir = make_inode(10, INodeType::Directory, 0, Some(1)); + let child_file = make_inode(11, INodeType::File, 100, Some(1)); + let grandchild = make_inode(20, INodeType::File, 50, Some(10)); + + let mut state = MockFsState::default(); + state.directories.insert( + 1, + vec![ + (OsString::from("subdir"), child_dir), + (OsString::from("file.txt"), child_file), + ], + ); + state + .directories + .insert(10, vec![(OsString::from("grandchild.txt"), grandchild)]); + let dp = MockFsDataProvider::new(state); + let readdir_count = Arc::clone(&dp.state); + + let table = Arc::new(FutureBackedCache::default()); + let fs = AsyncFs::new(dp, root, 
Arc::clone(&table)).await; + + // readdir on root should trigger prefetch of child_dir (addr=10) + fs.readdir(LoadedAddr::new_unchecked(1), 0, |_, _| false) + .await + .unwrap(); + + // Wait for prefetch to complete (mock is instant, just need task to run) + tokio::time::sleep(std::time::Duration::from_millis(100)).await; + + // dp.readdir should have been called twice: once for root, once for child_dir prefetch + assert_eq!( + readdir_count.readdir_count.load(Ordering::Relaxed), + 2, + "prefetch should have called readdir on the child directory" + ); + + // Now readdir on child_dir should NOT call dp.readdir again (served from cache) + let mut entries = Vec::new(); + fs.readdir(LoadedAddr::new_unchecked(10), 0, |entry, _| { + entries.push(entry.name.to_os_string()); + false + }) + .await + .unwrap(); + + assert_eq!(entries, vec![OsString::from("grandchild.txt")]); + assert_eq!( + readdir_count.readdir_count.load(Ordering::Relaxed), + 2, + "cached readdir should not call dp.readdir again" + ); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn prefetch_failure_does_not_affect_parent_readdir() { + let root = make_inode(1, INodeType::Directory, 0, None); + let child_dir = make_inode(10, INodeType::Directory, 0, Some(1)); + + let mut state = MockFsState::default(); + state + .directories + .insert(1, vec![(OsString::from("bad_dir"), child_dir)]); + // Don't configure readdir for addr=10 — mock will return ENOENT + let dp = MockFsDataProvider::new(state); + + let table = Arc::new(FutureBackedCache::default()); + let fs = AsyncFs::new(dp, root, Arc::clone(&table)).await; + + // Parent readdir should succeed even though child prefetch will fail + let mut entries = Vec::new(); + fs.readdir(LoadedAddr::new_unchecked(1), 0, |entry, _| { + entries.push(entry.name.to_os_string()); + false + }) + .await + .unwrap(); + + assert_eq!(entries, vec![OsString::from("bad_dir")]); + + // Wait for prefetch to attempt and fail + 
tokio::time::sleep(std::time::Duration::from_millis(100)).await; + + // Direct readdir on child should still work (CAS reset to UNCLAIMED by PopulateGuard) + let err = fs + .readdir(LoadedAddr::new_unchecked(10), 0, |_, _| false) + .await + .unwrap_err(); + assert_eq!(err.raw_os_error(), Some(libc::ENOENT)); +} diff --git a/tests/common/async_fs_mocks.rs b/tests/common/async_fs_mocks.rs index 4441544c..b95f6ebe 100644 --- a/tests/common/async_fs_mocks.rs +++ b/tests/common/async_fs_mocks.rs @@ -56,6 +56,8 @@ pub struct MockFsState { /// precedence and are consumed on use (removed after the first hit). /// Existing tests are unaffected because this defaults to empty. pub refresh_lookups: scc::HashMap<(u64, OsString), INode>, + /// Counts how many times `readdir` has been called on this provider. + pub readdir_count: std::sync::atomic::AtomicU64, } /// A clonable mock data provider for `AsyncFs` tests. @@ -89,6 +91,9 @@ impl FsDataProvider for MockFsDataProvider { } async fn readdir(&self, parent: INode) -> Result, std::io::Error> { + self.state + .readdir_count + .fetch_add(1, std::sync::atomic::Ordering::Relaxed); self.state .directories .get(&parent.addr) From eb394925eca6d7e243835cc13a0678f497f43d5a Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Sat, 21 Feb 2026 20:07:21 -0800 Subject: [PATCH 40/58] concurrency edge cases --- lib/fs/composite.rs | 65 +++++++++++++++++++++++++++++++-------------- lib/fs/dcache.rs | 11 +++++++- 2 files changed, 55 insertions(+), 21 deletions(-) diff --git a/lib/fs/composite.rs b/lib/fs/composite.rs index 92361344..c0aedfed 100644 --- a/lib/fs/composite.rs +++ b/lib/fs/composite.rs @@ -431,9 +431,18 @@ where }) } - /// Removes the composite-level address from `addr_to_slot` and the - /// child's bridge map. When the bridge becomes empty, the slot and its - /// `name_to_slot` entry are garbage-collected. + /// Removes the composite-level address from the child's bridge map and + /// then from `addr_to_slot`. 
When the bridge becomes empty, the slot + /// and its `name_to_slot` entry are garbage-collected. + /// + /// **Ordering invariant:** the bridge mapping is removed *before* + /// `addr_to_slot` so that a concurrent [`lookup`](Self::lookup) + /// calling `backward_or_insert` will allocate a *fresh* outer address + /// (since the old inner→outer entry is already gone from the bridge) + /// rather than returning the about-to-be-forgotten address. Because + /// the fresh address differs from the forgotten one, the subsequent + /// `addr_to_slot.remove_sync` here cannot destroy the concurrent + /// lookup's mapping. /// /// The slot removal uses `remove_if_sync` with a re-check of /// `bridge.is_empty()`, preventing a concurrent `backward_or_insert` @@ -445,25 +454,41 @@ where if addr == Self::ROOT_INO { return; } - if let Some((_, slot_idx)) = self.inner.addr_to_slot.remove_sync(&addr) { - // Remove the outer->inner mapping from the bridge. The bridge's - // internal mutex serializes this with `backward_or_insert`. - let bridge_empty = self + let Some(slot_idx) = self.inner.addr_to_slot.read_sync(&addr, |_, &v| v) else { + return; + }; + // Remove from the bridge FIRST. The bridge's internal mutex + // serializes this with `backward_or_insert`, ensuring that any + // concurrent lookup that arrives after this point will allocate a + // fresh outer address rather than reusing the forgotten `addr`. + let bridge_empty = self + .inner + .slots + .read_sync(&slot_idx, |_, slot| slot.bridge.remove_by_outer(addr)) + .unwrap_or(false); + // Now safe to remove from addr_to_slot — concurrent lookups that + // raced with us either: + // (a) ran backward_or_insert BEFORE our bridge removal and got + // `addr` back (same key we are removing — acceptable, see + // below), or + // (b) ran AFTER and got a fresh fallback address (different key, + // unaffected by this removal). 
+ // + // Case (a) is a FUSE protocol-level race: the kernel sent + // `forget` for this address while a lookup resolved to the same + // inner entity. In practice, this should not occur because + // `forget` fires only when nlookup reaches zero. + self.inner.addr_to_slot.remove_sync(&addr); + if bridge_empty { + // Bridge is empty — atomically remove the slot only if no one + // has re-populated the bridge between our check and this removal. + // `remove_if_sync` holds the scc bucket lock during evaluation. + let removed = self .inner .slots - .read_sync(&slot_idx, |_, slot| slot.bridge.remove_by_outer(addr)) - .unwrap_or(false); - if bridge_empty { - // Bridge is empty — atomically remove the slot only if no one - // has re-populated the bridge between our check and this removal. - // `remove_if_sync` holds the scc bucket lock during evaluation. - let removed = self - .inner - .slots - .remove_if_sync(&slot_idx, |slot| slot.bridge.is_empty()); - if let Some((_, slot)) = removed { - self.inner.name_to_slot.remove_sync(&slot.name); - } + .remove_if_sync(&slot_idx, |slot| slot.bridge.is_empty()); + if let Some((_, slot)) = removed { + self.inner.name_to_slot.remove_sync(&slot.name); } } } diff --git a/lib/fs/dcache.rs b/lib/fs/dcache.rs index 7f22258a..82f73b66 100644 --- a/lib/fs/dcache.rs +++ b/lib/fs/dcache.rs @@ -188,14 +188,23 @@ impl DCache { /// Wait until a directory is no longer in the `InProgress` state. /// /// Uses [`Notify`] to sleep efficiently instead of spinning. + /// + /// The `Notified` future is pinned and `enable()`d before checking the + /// flag so that the waiter is registered with the `Notify` *before* the + /// state check. Without this, a `notify_waiters()` firing between + /// `notified()` and the first poll would be lost (since + /// `notify_waiters` does not store a permit), causing a permanent hang. 
pub async fn wait_populated(&self, parent_ino: LoadedAddr) { let state = self.dir_state(parent_ino); loop { - let notified = state.notify.notified(); + let mut notified = std::pin::pin!(state.notify.notified()); + notified.as_mut().enable(); let current = state.populated.load(Ordering::Acquire); if current != POPULATE_IN_PROGRESS { return; } + // SAFETY(cancel): re-entering the loop re-creates the Notified + // future, so spurious wakeups just re-check the flag. notified.await; } } From defd96350c58ab260d26a4424247188ee2562344 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Sat, 21 Feb 2026 20:39:02 -0800 Subject: [PATCH 41/58] fix: race in forget slot removal and unbounded prefetch spawning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two concurrency fixes identified during review: 1. CompositeFs::forget could prematurely remove a slot when a concurrent backward_or_insert was mid-insert. The remove_if_sync predicate called bridge.is_empty() without the coordination mutex, so it could observe fwd as empty while backward_or_insert had inserted into bwd but not yet fwd. Added ConcurrentBridge::is_empty_locked() that acquires the mutex, and use it in the predicate. No deadlock risk — lock ordering is always slots bucket lock → bridge mutex. 2. spawn_prefetch_children spawned one tokio::spawn per child directory with no bound, creating a thundering herd on the API backend for directories with many subdirs. Added a per-AsyncFs semaphore capping concurrent prefetch tasks at 8. 
--- lib/fs/async_fs.rs | 24 +++++++++++++++++++++++- lib/fs/bridge.rs | 20 +++++++++++++++++--- lib/fs/composite.rs | 12 +++++++----- 3 files changed, 47 insertions(+), 9 deletions(-) diff --git a/lib/fs/async_fs.rs b/lib/fs/async_fs.rs index 434239a9..1c069f4d 100644 --- a/lib/fs/async_fs.rs +++ b/lib/fs/async_fs.rs @@ -6,6 +6,7 @@ use std::sync::Arc; use std::sync::atomic::{AtomicU64, Ordering}; use bytes::Bytes; +use tokio::sync::Semaphore; use crate::cache::async_backed::FutureBackedCache; use crate::drop_ward::StatelessDrop; @@ -262,6 +263,14 @@ async fn prefetch_dir( guard.defuse(); } +/// Maximum number of concurrent prefetch tasks spawned per [`AsyncFs`] instance. +/// +/// Prevents thundering-herd API calls when a parent directory contains many +/// subdirectories (e.g. `node_modules`). Each `readdir` that discovers child +/// directories spawns at most this many concurrent prefetch tasks; additional +/// children wait for a permit. +const MAX_PREFETCH_CONCURRENCY: usize = 8; + /// An asynchronous filesystem cache mapping `InodeAddr` to `INode`. /// /// Uses two [`FutureBackedCache`] layers: @@ -286,6 +295,9 @@ pub struct AsyncFs { /// Monotonically increasing file handle counter. Starts at 1 (0 is reserved). next_fh: AtomicU64, + + /// Bounds the number of concurrent background prefetch tasks. + prefetch_semaphore: Arc, } impl AsyncFs { @@ -305,6 +317,7 @@ impl AsyncFs { directory_cache: Arc::new(DCache::new()), data_provider, next_fh: AtomicU64::new(1), + prefetch_semaphore: Arc::new(Semaphore::new(MAX_PREFETCH_CONCURRENCY)), } } @@ -323,17 +336,26 @@ impl AsyncFs { directory_cache: Arc::new(DCache::new()), data_provider, next_fh: AtomicU64::new(1), + prefetch_semaphore: Arc::new(Semaphore::new(MAX_PREFETCH_CONCURRENCY)), } } /// Spawn background tasks to prefetch each child directory of `parent`. 
+ /// + /// Concurrency is bounded by [`MAX_PREFETCH_CONCURRENCY`] via a shared + /// semaphore, preventing thundering-herd API calls when a parent + /// directory contains many subdirectories. fn spawn_prefetch_children(&self, parent: LoadedAddr) { let child_dirs = self.directory_cache.child_dir_addrs(parent); for child_addr in child_dirs { + let sem = Arc::clone(&self.prefetch_semaphore); let dcache = Arc::clone(&self.directory_cache); let table = Arc::clone(&self.inode_table); let dp = self.data_provider.clone(); - tokio::spawn(prefetch_dir(child_addr, dcache, table, dp)); + tokio::spawn(async move { + let _permit = sem.acquire().await; + prefetch_dir(child_addr, dcache, table, dp).await; + }); } } diff --git a/lib/fs/bridge.rs b/lib/fs/bridge.rs index 6e4ef942..c6edda8a 100644 --- a/lib/fs/bridge.rs +++ b/lib/fs/bridge.rs @@ -118,13 +118,27 @@ impl ConcurrentBridge { /// Returns `true` if the bridge contains no mappings. /// /// Reads are not serialized with mutations. The result is a - /// snapshot that may be immediately stale. Use under the - /// coordination lock or an external guard when consistency - /// with mutations is required. + /// snapshot that may be immediately stale. Use [`is_empty_locked`](Self::is_empty_locked) + /// when consistency with concurrent mutations is required. #[must_use] pub fn is_empty(&self) -> bool { self.fwd.is_empty() } + + /// Returns `true` if the bridge contains no mappings, serialized with + /// mutations via the coordination lock. + /// + /// Use this instead of [`is_empty`](Self::is_empty) when the result + /// must be consistent with a concurrent [`backward_or_insert`](Self::backward_or_insert) + /// that may be mid-insert. 
+ #[must_use] + pub fn is_empty_locked(&self) -> bool { + let _guard = self + .mu + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + self.fwd.is_empty() + } } impl Default for ConcurrentBridge { diff --git a/lib/fs/composite.rs b/lib/fs/composite.rs index c0aedfed..e4245e5c 100644 --- a/lib/fs/composite.rs +++ b/lib/fs/composite.rs @@ -445,9 +445,9 @@ where /// lookup's mapping. /// /// The slot removal uses `remove_if_sync` with a re-check of - /// `bridge.is_empty()`, preventing a concurrent `backward_or_insert` - /// from inserting a new mapping between the bridge emptiness check - /// and the slot removal. + /// `bridge.is_empty_locked()`, which acquires the bridge's + /// coordination mutex to serialize with a concurrent + /// `backward_or_insert` that may be mid-insert. /// /// The root inode is never forgotten. fn forget(&self, addr: InodeAddr) { @@ -482,11 +482,13 @@ where if bridge_empty { // Bridge is empty — atomically remove the slot only if no one // has re-populated the bridge between our check and this removal. - // `remove_if_sync` holds the scc bucket lock during evaluation. + // `remove_if_sync` holds the scc bucket lock during evaluation, + // and `is_empty_locked` acquires the bridge's coordination mutex + // to serialize with any concurrent `backward_or_insert`. 
let removed = self .inner .slots - .remove_if_sync(&slot_idx, |slot| slot.bridge.is_empty()); + .remove_if_sync(&slot_idx, |slot| slot.bridge.is_empty_locked()); if let Some((_, slot)) = removed { self.inner.name_to_slot.remove_sync(&slot.name); } From bcd9ae7833b2d0a4dd12012273750dba4668b5f1 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Sun, 22 Feb 2026 10:56:17 -0800 Subject: [PATCH 42/58] fix: address critical and high severity issues from PR #40 review - Force ZST assertion evaluation in DropWard::new() (was dead code) - Implement opendir/readdir/releasedir snapshot pattern in FuserAdapter, eliminating offset instability from BTreeMap position indexing and the intermediate Vec double-buffer on every readdir call - Propagate forget from CompositeFs to child inode tables via new AsyncFs::evict(), preventing inner inode leaks - Add DCache::remove_child() and remove_parent() for cache invalidation - Document path_map race (MES-697) and await_shared O(N) contention --- lib/cache/async_backed.rs | 7 ++ lib/drop_ward.rs | 3 + lib/fs/async_fs.rs | 13 ++- lib/fs/bridge.rs | 23 ++++-- lib/fs/composite.rs | 13 ++- lib/fs/dcache.rs | 22 ++++++ lib/fs/fuser.rs | 140 ++++++++++++++++++++++++--------- src/fs/mescloud/repo.rs | 5 ++ tests/bridge_tests.rs | 37 ++++++++- tests/common/async_fs_mocks.rs | 6 ++ tests/composite_fs_tests.rs | 43 ++++++++++ tests/dcache_correctness.rs | 94 ++++++++++++++++++++++ 12 files changed, 355 insertions(+), 51 deletions(-) diff --git a/lib/cache/async_backed.rs b/lib/cache/async_backed.rs index 6ec95d75..974aac09 100644 --- a/lib/cache/async_backed.rs +++ b/lib/cache/async_backed.rs @@ -260,6 +260,13 @@ where /// /// Returns `Some(v)` on success. Returns `None` if the factory panicked, after removing /// the poisoned entry from the map. + /// + /// NOTE: Every joiner that reaches `await_shared` independently calls `update_async` to + /// attempt promotion from `InFlight` to `Ready`. 
Under high concurrency (N joiners), this + /// results in O(N) lock acquisitions on the same bucket — only the first succeeds, and the + /// rest are no-ops due to the generation check. A future optimization could use an + /// `AtomicBool` promoter-election flag so that only one joiner attempts the `update_async` + /// call, reducing contention from O(N) to O(1). async fn await_shared(&self, key: &K, observed_gen: u64, shared: SharedFut) -> Option { let mut guard = PromoteGuard { map: &self.map, diff --git a/lib/drop_ward.rs b/lib/drop_ward.rs index 848d1dfb..21ed777b 100644 --- a/lib/drop_ward.rs +++ b/lib/drop_ward.rs @@ -84,6 +84,9 @@ where /// Create a new ward that will pass `ctx` to `T::delete` on cleanup. pub fn new(ctx: Ctx) -> Self { + // Force evaluation of the compile-time ZST assertion. Without this + // reference the const is dead code and never checked. + const { assert!(size_of::() == 0, "T must be zero-sized") } Self { map: FxHashMap::default(), ctx, diff --git a/lib/fs/async_fs.rs b/lib/fs/async_fs.rs index 1c069f4d..eed3d984 100644 --- a/lib/fs/async_fs.rs +++ b/lib/fs/async_fs.rs @@ -486,6 +486,17 @@ impl AsyncFs { }) } + /// Evict an inode from the inode table and notify the data provider. + /// + /// Called by the composite layer when propagating `forget` to a child + /// filesystem. Removes the inode from the table and calls + /// [`FsDataProvider::forget`] so the provider can clean up auxiliary + /// structures (path maps, etc.). + pub fn evict(&self, addr: InodeAddr) { + self.inode_table.remove_sync(&addr); + self.data_provider.forget(addr); + } + /// Iterate directory entries for `parent`, starting from `offset`. /// /// On the first call for a given parent, fetches the directory listing @@ -497,8 +508,6 @@ impl AsyncFs { /// returns `true` (indicating the caller's buffer is full), iteration /// stops early. 
/// - /// TODO(MES-746): Implement `opendir` and `releasedir` to snapshot directory contents and - /// avoid racing with `lookup`/`createfile`. pub async fn readdir( &self, parent: LoadedAddr, diff --git a/lib/fs/bridge.rs b/lib/fs/bridge.rs index c6edda8a..675ca030 100644 --- a/lib/fs/bridge.rs +++ b/lib/fs/bridge.rs @@ -100,19 +100,26 @@ impl ConcurrentBridge { /// Remove the mapping for the given outer address. /// - /// Returns `true` if the bridge is empty after the removal — the caller - /// can use this to garbage-collect the owning slot. The emptiness check - /// is performed under the coordination lock so there is no TOCTOU gap - /// with the removal itself. - pub fn remove_by_outer(&self, outer: InodeAddr) -> bool { + /// Returns `(removed_inner, bridge_empty)`: + /// - `removed_inner`: the inner address that was mapped, or `None` if the + /// outer address was not present. + /// - `bridge_empty`: whether the bridge is empty after removal — the + /// caller can use this to garbage-collect the owning slot. + /// + /// The emptiness check is performed under the coordination lock so there + /// is no TOCTOU gap with the removal itself. + pub fn remove_by_outer(&self, outer: InodeAddr) -> (Option, bool) { let _guard = self .mu .lock() .unwrap_or_else(std::sync::PoisonError::into_inner); - if let Some((_, inner)) = self.fwd.remove_sync(&outer) { + let removed_inner = if let Some((_, inner)) = self.fwd.remove_sync(&outer) { self.bwd.remove_sync(&inner); - } - self.fwd.is_empty() + Some(inner) + } else { + None + }; + (removed_inner, self.fwd.is_empty()) } /// Returns `true` if the bridge contains no mappings. 
diff --git a/lib/fs/composite.rs b/lib/fs/composite.rs index e4245e5c..950def95 100644 --- a/lib/fs/composite.rs +++ b/lib/fs/composite.rs @@ -461,11 +461,20 @@ where // serializes this with `backward_or_insert`, ensuring that any // concurrent lookup that arrives after this point will allocate a // fresh outer address rather than reusing the forgotten `addr`. - let bridge_empty = self + let (removed_inner, bridge_empty) = self .inner .slots .read_sync(&slot_idx, |_, slot| slot.bridge.remove_by_outer(addr)) - .unwrap_or(false); + .unwrap_or((None, false)); + + // Propagate forget to the child's inode table and data provider so + // inner inodes don't leak until the entire slot is GC'd. + if let Some(inner_addr) = removed_inner { + self.inner.slots.read_sync(&slot_idx, |_, slot| { + slot.inner.get_fs().evict(inner_addr); + }); + } + // Now safe to remove from addr_to_slot — concurrent lookups that // raced with us either: // (a) ran backward_or_insert BEFORE our bridge removal and got diff --git a/lib/fs/dcache.rs b/lib/fs/dcache.rs index 82f73b66..4810139d 100644 --- a/lib/fs/dcache.rs +++ b/lib/fs/dcache.rs @@ -150,6 +150,28 @@ impl DCache { .collect() } + /// Removes a single child entry from the cache. + /// + /// Returns the removed [`DValue`] if it was present, or `None` if the + /// parent or child did not exist. + pub fn remove_child(&self, parent_ino: LoadedAddr, name: &OsStr) -> Option { + let state = self.dirs.read_sync(&parent_ino, |_, v| Arc::clone(v))?; + let mut children = state + .children + .write() + .unwrap_or_else(std::sync::PoisonError::into_inner); + children.remove(name) + } + + /// Removes the entire [`DirState`] for `parent_ino`, resetting its + /// population status so the next `readdir` will re-fetch from the + /// data provider. + /// + /// Returns `true` if an entry was removed. 
+ pub fn remove_parent(&self, parent_ino: LoadedAddr) -> bool { + self.dirs.remove_sync(&parent_ino).is_some() + } + /// Atomically try to claim a directory for population. /// /// Uses `compare_exchange` on the three-state flag: diff --git a/lib/fs/fuser.rs b/lib/fs/fuser.rs index 06c27d4d..78528914 100644 --- a/lib/fs/fuser.rs +++ b/lib/fs/fuser.rs @@ -1,7 +1,7 @@ //! FUSE adapter: maps [`fuser::Filesystem`] callbacks to [`AsyncFs`](super::async_fs::AsyncFs). use std::collections::HashMap; -use std::ffi::OsStr; +use std::ffi::{OsStr, OsString}; use std::sync::Arc; use super::async_fs::{FileReader as _, FsDataProvider}; @@ -136,13 +136,24 @@ fn inode_type_to_fuser(itype: INodeType) -> fuser::FileType { const BLOCK_SIZE: u32 = 4096; +/// Snapshot of a directory listing created by `opendir`. +enum DirSnapshot { + /// Directory handle allocated but not yet populated. + Pending, + /// Fully materialized directory listing. + Ready(Vec<(InodeAddr, OsString, INodeType)>), +} + /// Bridges a generic [`FsDataProvider`] to the [`fuser::Filesystem`] trait. /// /// Owns a self-referential inode table + ward + [`AsyncFs`](super::async_fs::AsyncFs), -/// plus an open-file map and a tokio runtime handle for blocking on async ops. +/// plus an open-file map, a directory-handle map, and a tokio runtime handle +/// for blocking on async ops. 
pub struct FuserAdapter { inner: FuseBridgeInner, open_files: HashMap>, + dir_handles: HashMap, + next_dir_fh: u64, runtime: tokio::runtime::Handle, } @@ -170,6 +181,8 @@ impl FuserAdapter { Self { inner: FuseBridgeInner::create(table, provider), open_files: HashMap::new(), + dir_handles: HashMap::new(), + next_dir_fh: 1, runtime, } } @@ -223,56 +236,107 @@ impl fuser::Filesystem for FuserAdapter { }); } - #[instrument(name = "FuserAdapter::readdir", skip(self, _req, _fh, offset, reply))] + #[instrument(name = "FuserAdapter::opendir", skip(self, _req, _ino, _flags, reply))] + fn opendir( + &mut self, + _req: &fuser::Request<'_>, + _ino: u64, + _flags: i32, + reply: fuser::ReplyOpen, + ) { + let fh = self.next_dir_fh; + self.next_dir_fh += 1; + self.dir_handles.insert(fh, DirSnapshot::Pending); + debug!(handle = fh, "replying..."); + reply.opened(fh, 0); + } + + #[instrument(name = "FuserAdapter::readdir", skip(self, _req, fh, offset, reply))] fn readdir( &mut self, _req: &fuser::Request<'_>, ino: u64, - _fh: u64, + fh: u64, offset: i64, reply: fuser::ReplyDirectory, ) { let offset_u64 = offset.cast_unsigned(); - self.runtime - .block_on(async { - let mut entries = Vec::new(); - self.inner - .get_fs() - .readdir( - LoadedAddr::new_unchecked(ino), - offset_u64, - |de, _next_offset| { + + // Lazily populate the snapshot on the first readdir call for this handle. + let snapshot = match self.dir_handles.get(&fh) { + Some(DirSnapshot::Pending) | None => { + // Populate from AsyncFs and transition to Ready. 
+ let result = self.runtime.block_on(async { + let mut entries = Vec::new(); + self.inner + .get_fs() + .readdir(LoadedAddr::new_unchecked(ino), 0, |de, _next_offset| { entries.push((de.inode.addr, de.name.to_os_string(), de.inode.itype)); false - }, - ) - .await?; - Ok::<_, std::io::Error>(entries) - }) - .fuse_reply(reply, |entries, mut reply| { - for (i, (entry_ino, entry_name, entry_itype)) in entries.iter().enumerate() { - let kind = inode_type_to_fuser(*entry_itype); - #[expect( - clippy::cast_possible_truncation, - reason = "offset fits in usize on supported 64-bit platforms" - )] - let abs_idx = offset_u64 as usize + i + 1; - let Ok(idx): Result = abs_idx.try_into() else { - error!("Directory entry index {} too large for fuser", abs_idx); - reply.error(libc::EIO); + }) + .await?; + Ok::<_, std::io::Error>(entries) + }); + match result { + Ok(entries) => { + self.dir_handles.insert(fh, DirSnapshot::Ready(entries)); + } + Err(e) => { + debug!(error = %e, "replying error"); + reply.error(io_to_errno(&e)); return; - }; - - debug!(?entry_name, ino = entry_ino, "adding entry to reply..."); - if reply.add(*entry_ino, idx, kind, entry_name) { - debug!("buffer full for now, stopping readdir"); - break; } } + match self.dir_handles.get(&fh) { + Some(DirSnapshot::Ready(entries)) => entries, + _ => unreachable!("just inserted Ready"), + } + } + Some(DirSnapshot::Ready(entries)) => entries, + }; - debug!("finalizing reply..."); - reply.ok(); - }); + #[expect( + clippy::cast_possible_truncation, + reason = "offset fits in usize on supported 64-bit platforms" + )] + let skip = offset_u64 as usize; + let mut reply = reply; + + for (i, (entry_ino, entry_name, entry_itype)) in snapshot.iter().enumerate().skip(skip) { + let kind = inode_type_to_fuser(*entry_itype); + let abs_idx = i + 1; + let Ok(idx): Result = abs_idx.try_into() else { + error!("Directory entry index {} too large for fuser", abs_idx); + reply.error(libc::EIO); + return; + }; + + debug!(?entry_name, ino = 
entry_ino, "adding entry to reply..."); + if reply.add(*entry_ino, idx, kind, entry_name) { + debug!("buffer full for now, stopping readdir"); + break; + } + } + + debug!("finalizing reply..."); + reply.ok(); + } + + #[instrument( + name = "FuserAdapter::releasedir", + skip(self, _req, _ino, fh, _flags, reply) + )] + fn releasedir( + &mut self, + _req: &fuser::Request<'_>, + _ino: u64, + fh: u64, + _flags: i32, + reply: fuser::ReplyEmpty, + ) { + self.dir_handles.remove(&fh); + debug!("replying ok"); + reply.ok(); } #[instrument(name = "FuserAdapter::open", skip(self, _req, flags, reply))] diff --git a/src/fs/mescloud/repo.rs b/src/fs/mescloud/repo.rs index aae85491..2e1ca999 100644 --- a/src/fs/mescloud/repo.rs +++ b/src/fs/mescloud/repo.rs @@ -148,6 +148,11 @@ impl FsDataProvider for MesRepoProvider { InodePerms::from_bits_truncate(0o644) }; + // TODO(MES-697): Address allocation is racy — two concurrent lookups for the same + // child path each allocate a fresh address and insert into `path_map`, causing + // the second insert to silently overwrite the first. The `AsyncFs` lookup cache + // deduplicates in practice, but a content-addressed scheme (hash path → addr) + // would eliminate the race structurally. 
let addr = inner.next_addr.fetch_add(1, Ordering::Relaxed); drop(inner.path_map.insert_async(addr, child_path).await); diff --git a/tests/bridge_tests.rs b/tests/bridge_tests.rs index d8389273..434e2b8c 100644 --- a/tests/bridge_tests.rs +++ b/tests/bridge_tests.rs @@ -43,7 +43,42 @@ fn backward_or_insert_new_allocates() { fn remove_by_outer_clears_both_directions() { let bridge = ConcurrentBridge::new(); bridge.insert(10, 100); - bridge.remove_by_outer(10); + let (removed_inner, empty) = bridge.remove_by_outer(10); + assert_eq!( + removed_inner, + Some(100), + "should return the removed inner addr" + ); + assert!( + empty, + "bridge should be empty after removing the only entry" + ); assert_eq!(bridge.forward(10), None); assert_eq!(bridge.backward(100), None); } + +#[test] +fn remove_by_outer_missing_key_returns_none() { + let bridge = ConcurrentBridge::new(); + bridge.insert(10, 100); + let (removed_inner, empty) = bridge.remove_by_outer(42); + assert_eq!(removed_inner, None, "should return None for missing key"); + assert!( + !empty, + "bridge should not be empty when other entries exist" + ); +} + +#[test] +fn remove_by_outer_not_empty_when_others_remain() { + let bridge = ConcurrentBridge::new(); + bridge.insert(10, 100); + bridge.insert(20, 200); + let (removed_inner, empty) = bridge.remove_by_outer(10); + assert_eq!(removed_inner, Some(100)); + assert!( + !empty, + "bridge should not be empty when other entries remain" + ); + assert_eq!(bridge.forward(20), Some(200), "other entry should survive"); +} diff --git a/tests/common/async_fs_mocks.rs b/tests/common/async_fs_mocks.rs index b95f6ebe..ae7c4e11 100644 --- a/tests/common/async_fs_mocks.rs +++ b/tests/common/async_fs_mocks.rs @@ -58,6 +58,8 @@ pub struct MockFsState { pub refresh_lookups: scc::HashMap<(u64, OsString), INode>, /// Counts how many times `readdir` has been called on this provider. pub readdir_count: std::sync::atomic::AtomicU64, + /// Tracks addresses passed to `forget`. 
Used to verify propagation. + pub forgotten_addrs: scc::HashSet, } /// A clonable mock data provider for `AsyncFs` tests. @@ -114,4 +116,8 @@ impl FsDataProvider for MockFsDataProvider { .unwrap_or_default(); Ok(MockFileReader { data }) } + + fn forget(&self, addr: git_fs::fs::InodeAddr) { + let _ = self.state.forgotten_addrs.insert_sync(addr); + } } diff --git a/tests/composite_fs_tests.rs b/tests/composite_fs_tests.rs index 1c263425..dad0255a 100644 --- a/tests/composite_fs_tests.rs +++ b/tests/composite_fs_tests.rs @@ -307,6 +307,49 @@ async fn composite_repeated_lookup_returns_same_addr() { ); } +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn composite_forget_propagates_to_child_inode_table() { + let (provider, root_ino) = make_child_provider(100, &[("file.txt", 101, INodeType::File, 42)]); + let mock_state = Arc::clone(&provider.state); + + let mut children = HashMap::new(); + children.insert(OsString::from("repo"), (provider, root_ino)); + + let mock_root = MockRoot::new(children); + let composite = CompositeFs::new(mock_root, (1000, 1000)); + let root_inode = composite.make_root_inode(); + + let table = Arc::new(FutureBackedCache::default()); + table.insert_sync(1, root_inode); + let afs = AsyncFs::new_preseeded(composite.clone(), Arc::clone(&table)); + + // Navigate to the file. + let child_dir = afs + .lookup(LoadedAddr::new_unchecked(1), OsStr::new("repo")) + .await + .unwrap(); + let child_addr = child_dir.inode.addr; + + let file = afs + .lookup( + LoadedAddr::new_unchecked(child_addr), + OsStr::new("file.txt"), + ) + .await + .unwrap(); + let file_addr = file.inode.addr; + + // Forget the file — this should propagate to the child. + composite.forget(file_addr); + + // The child's data provider should have received the forget call + // for the inner address (101). 
+ assert!( + mock_state.forgotten_addrs.contains_sync(&101), + "forget should propagate to child data provider" + ); +} + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn composite_forget_cleans_up_slot_and_name_mapping() { // Setup: one child "repo" with a file. diff --git a/tests/dcache_correctness.rs b/tests/dcache_correctness.rs index 7043bd9b..16bad50a 100644 --- a/tests/dcache_correctness.rs +++ b/tests/dcache_correctness.rs @@ -221,3 +221,97 @@ async fn child_dir_addrs_returns_empty_for_unknown_parent() { let dirs = cache.child_dir_addrs(LoadedAddr::new_unchecked(999)); assert!(dirs.is_empty()); } + +#[tokio::test] +async fn remove_child_returns_removed_entry() { + let cache = DCache::new(); + let parent = LoadedAddr::new_unchecked(1); + cache.insert( + parent, + OsString::from("foo"), + LoadedAddr::new_unchecked(10), + false, + ); + let removed = cache.remove_child(parent, OsStr::new("foo")); + assert!(removed.is_some(), "should return the removed entry"); + let dv = removed.unwrap(); + assert_eq!(dv.ino, LoadedAddr::new_unchecked(10)); + assert!(!dv.is_dir); + assert!( + cache.lookup(parent, OsStr::new("foo")).is_none(), + "entry should no longer be present after removal" + ); +} + +#[tokio::test] +async fn remove_child_returns_none_for_missing_entry() { + let cache = DCache::new(); + let parent = LoadedAddr::new_unchecked(1); + assert!(cache.remove_child(parent, OsStr::new("nope")).is_none()); +} + +#[tokio::test] +async fn remove_child_does_not_affect_siblings() { + let cache = DCache::new(); + let parent = LoadedAddr::new_unchecked(1); + cache.insert( + parent, + OsString::from("a"), + LoadedAddr::new_unchecked(10), + false, + ); + cache.insert( + parent, + OsString::from("b"), + LoadedAddr::new_unchecked(11), + true, + ); + cache.remove_child(parent, OsStr::new("a")); + assert!( + cache.lookup(parent, OsStr::new("b")).is_some(), + "sibling should survive removal of another child" + ); +} + +#[tokio::test] +async fn 
remove_parent_resets_populate_status() { + let cache = DCache::new(); + let parent = LoadedAddr::new_unchecked(1); + cache.insert( + parent, + OsString::from("x"), + LoadedAddr::new_unchecked(10), + false, + ); + assert!(matches!( + cache.try_claim_populate(parent), + PopulateStatus::Claimed + )); + cache.finish_populate(parent); + assert!(matches!( + cache.try_claim_populate(parent), + PopulateStatus::Done + )); + + assert!( + cache.remove_parent(parent), + "should return true for existing parent" + ); + + // After removal, the parent is gone, so populate returns Claimed again. + assert!(matches!( + cache.try_claim_populate(parent), + PopulateStatus::Claimed + )); + // Children should also be gone. + assert!(cache.lookup(parent, OsStr::new("x")).is_none()); +} + +#[tokio::test] +async fn remove_parent_returns_false_for_unknown() { + let cache = DCache::new(); + assert!( + !cache.remove_parent(LoadedAddr::new_unchecked(999)), + "should return false for unknown parent" + ); +} From 64265bae60b4ae84ad065eb98ac785999cb0d577 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Sun, 22 Feb 2026 11:19:34 -0800 Subject: [PATCH 43/58] feat(dcache): add evict method with reverse index for parent discovery --- lib/fs/dcache.rs | 47 +++++++++++++++++++++++++++++++++++-- tests/dcache_correctness.rs | 27 +++++++++++++++++++++ 2 files changed, 72 insertions(+), 2 deletions(-) diff --git a/lib/fs/dcache.rs b/lib/fs/dcache.rs index 4810139d..026c66a7 100644 --- a/lib/fs/dcache.rs +++ b/lib/fs/dcache.rs @@ -57,6 +57,9 @@ impl DirState { /// with zero sorting overhead. pub struct DCache { dirs: scc::HashMap>, + /// Reverse index: child inode -> parent inode, for O(1) parent discovery + /// during eviction. 
+ child_to_parent: scc::HashMap, } impl Default for DCache { @@ -71,6 +74,7 @@ impl DCache { pub fn new() -> Self { Self { dirs: scc::HashMap::new(), + child_to_parent: scc::HashMap::new(), } } @@ -110,6 +114,7 @@ impl DCache { .write() .unwrap_or_else(std::sync::PoisonError::into_inner); children.insert(name, value); + let _ = self.child_to_parent.insert_sync(ino, parent_ino); } /// Iterate all cached children of `parent_ino` in name-sorted order. @@ -160,7 +165,11 @@ impl DCache { .children .write() .unwrap_or_else(std::sync::PoisonError::into_inner); - children.remove(name) + let removed = children.remove(name); + if let Some(ref dv) = removed { + self.child_to_parent.remove_sync(&dv.ino); + } + removed } /// Removes the entire [`DirState`] for `parent_ino`, resetting its @@ -169,7 +178,41 @@ impl DCache { /// /// Returns `true` if an entry was removed. pub fn remove_parent(&self, parent_ino: LoadedAddr) -> bool { - self.dirs.remove_sync(&parent_ino).is_some() + if let Some((_, state)) = self.dirs.remove_sync(&parent_ino) { + let children = state + .children + .read() + .unwrap_or_else(std::sync::PoisonError::into_inner); + for dv in children.values() { + self.child_to_parent.remove_sync(&dv.ino); + } + true + } else { + false + } + } + + /// Evict a child inode from the cache by its address. + /// + /// Looks up the parent via the reverse index, removes the child entry + /// from that parent's children map, and resets the parent's populate + /// flag to `UNCLAIMED` so the next `readdir` re-fetches from the + /// data provider. 
+ pub fn evict(&self, child_ino: LoadedAddr) { + let Some((_, parent_ino)) = self.child_to_parent.remove_sync(&child_ino) else { + return; + }; + let Some(state) = self.dirs.read_sync(&parent_ino, |_, v| Arc::clone(v)) else { + return; + }; + let mut children = state + .children + .write() + .unwrap_or_else(std::sync::PoisonError::into_inner); + children.retain(|_, dv| dv.ino != child_ino); + drop(children); + state.populated.store(POPULATE_UNCLAIMED, Ordering::Release); + state.notify.notify_waiters(); } /// Atomically try to claim a directory for population. diff --git a/tests/dcache_correctness.rs b/tests/dcache_correctness.rs index 16bad50a..04658732 100644 --- a/tests/dcache_correctness.rs +++ b/tests/dcache_correctness.rs @@ -315,3 +315,30 @@ async fn remove_parent_returns_false_for_unknown() { "should return false for unknown parent" ); } + +#[tokio::test] +async fn evict_removes_child_and_resets_populate_status() { + let cache = DCache::new(); + let parent = LoadedAddr::new_unchecked(1); + let child = LoadedAddr::new_unchecked(10); + cache.insert(parent, OsString::from("foo"), child, false); + assert!(matches!( + cache.try_claim_populate(parent), + PopulateStatus::Claimed + )); + cache.finish_populate(parent); + assert!(matches!( + cache.try_claim_populate(parent), + PopulateStatus::Done + )); + + cache.evict(child); + + // Child should be gone. + assert!(cache.lookup(parent, OsStr::new("foo")).is_none()); + // Populate status should be reset so next readdir re-fetches. + assert!(matches!( + cache.try_claim_populate(parent), + PopulateStatus::Claimed + )); +} From b1cf057ac8c72214f26ae3ed84041cc4dd3d3350 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Sun, 22 Feb 2026 11:31:46 -0800 Subject: [PATCH 44/58] fix(dcache): use upsert_sync for reverse index and document IN_PROGRESS race Address code review findings: - insert_sync silently discards duplicates; upsert_sync correctly overwrites when a child is re-inserted under a different parent. 
- Add comment documenting the intentional populate-flag clobber in evict. --- lib/fs/dcache.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/lib/fs/dcache.rs b/lib/fs/dcache.rs index 026c66a7..aa304904 100644 --- a/lib/fs/dcache.rs +++ b/lib/fs/dcache.rs @@ -114,7 +114,8 @@ impl DCache { .write() .unwrap_or_else(std::sync::PoisonError::into_inner); children.insert(name, value); - let _ = self.child_to_parent.insert_sync(ino, parent_ino); + // Upsert: overwrite if this child was previously cached under a different parent. + self.child_to_parent.upsert_sync(ino, parent_ino); } /// Iterate all cached children of `parent_ino` in name-sorted order. @@ -211,6 +212,10 @@ impl DCache { .unwrap_or_else(std::sync::PoisonError::into_inner); children.retain(|_, dv| dv.ino != child_ino); drop(children); + // Reset regardless of current state. If a populate is in flight, + // the concurrent caller will overwrite this with DONE when it + // finishes; that is acceptable because the next readdir will + // re-fetch again. state.populated.store(POPULATE_UNCLAIMED, Ordering::Release); state.notify.notify_waiters(); } From 24fe59f9b5a807143c4170147c5cded57cc28c1a Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Sun, 22 Feb 2026 11:32:53 -0800 Subject: [PATCH 45/58] test(dcache): add edge-case tests for evict method --- tests/dcache_correctness.rs | 57 +++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/tests/dcache_correctness.rs b/tests/dcache_correctness.rs index 04658732..b3a24e4e 100644 --- a/tests/dcache_correctness.rs +++ b/tests/dcache_correctness.rs @@ -342,3 +342,60 @@ async fn evict_removes_child_and_resets_populate_status() { PopulateStatus::Claimed )); } + +#[tokio::test] +async fn evict_unknown_child_is_noop() { + let cache = DCache::new(); + // Should not panic or corrupt state. 
+ cache.evict(LoadedAddr::new_unchecked(999)); +} + +#[tokio::test] +async fn evict_does_not_affect_siblings() { + let cache = DCache::new(); + let parent = LoadedAddr::new_unchecked(1); + cache.insert( + parent, + OsString::from("a"), + LoadedAddr::new_unchecked(10), + false, + ); + cache.insert( + parent, + OsString::from("b"), + LoadedAddr::new_unchecked(11), + true, + ); + assert!(matches!( + cache.try_claim_populate(parent), + PopulateStatus::Claimed + )); + cache.finish_populate(parent); + + cache.evict(LoadedAddr::new_unchecked(10)); + + // Sibling should survive. + assert!(cache.lookup(parent, OsStr::new("b")).is_some()); + // But populate status should be reset. + assert!(matches!( + cache.try_claim_populate(parent), + PopulateStatus::Claimed + )); +} + +#[tokio::test] +async fn evict_child_from_multiple_parents_removes_from_correct_parent() { + let cache = DCache::new(); + let parent_a = LoadedAddr::new_unchecked(1); + let parent_b = LoadedAddr::new_unchecked(2); + let child = LoadedAddr::new_unchecked(10); + // Same child addr under two parents — last insert wins in reverse index + // because upsert_sync overwrites. + cache.insert(parent_a, OsString::from("x"), child, false); + cache.insert(parent_b, OsString::from("y"), child, false); + + cache.evict(child); + + // The parent_b entry should be removed (last insert wins in reverse index). + assert!(cache.lookup(parent_b, OsStr::new("y")).is_none()); +} From 325c3a929531efdfef9aeb4d3eede05d2c6fe544 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Sun, 22 Feb 2026 11:33:48 -0800 Subject: [PATCH 46/58] fix: evict inodes from DCache so readdir re-fetches after forget --- lib/fs/async_fs.rs | 1 + tests/async_fs_correctness.rs | 53 +++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/lib/fs/async_fs.rs b/lib/fs/async_fs.rs index eed3d984..126e309e 100644 --- a/lib/fs/async_fs.rs +++ b/lib/fs/async_fs.rs @@ -494,6 +494,7 @@ impl AsyncFs { /// structures (path maps, etc.). 
pub fn evict(&self, addr: InodeAddr) { self.inode_table.remove_sync(&addr); + self.directory_cache.evict(LoadedAddr::new_unchecked(addr)); self.data_provider.forget(addr); } diff --git a/tests/async_fs_correctness.rs b/tests/async_fs_correctness.rs index e3087ceb..836b242c 100644 --- a/tests/async_fs_correctness.rs +++ b/tests/async_fs_correctness.rs @@ -770,3 +770,56 @@ async fn prefetch_failure_does_not_affect_parent_readdir() { .unwrap_err(); assert_eq!(err.raw_os_error(), Some(libc::ENOENT)); } + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn readdir_after_evict_re_fetches_from_provider() { + use std::sync::atomic::Ordering; + + let root = make_inode(1, INodeType::Directory, 0, None); + let child_a = make_inode(10, INodeType::File, 42, Some(1)); + let child_b = make_inode(11, INodeType::File, 99, Some(1)); + + let mut state = MockFsState::default(); + state.directories.insert( + 1, + vec![ + (OsString::from("a.txt"), child_a), + (OsString::from("b.txt"), child_b), + ], + ); + let dp = MockFsDataProvider::new(state); + let readdir_count = Arc::clone(&dp.state); + + let table = Arc::new(FutureBackedCache::default()); + let fs = AsyncFs::new(dp, root, Arc::clone(&table)).await; + + // First readdir populates cache. + let mut entries = Vec::new(); + fs.readdir(LoadedAddr::new_unchecked(1), 0, |entry, _| { + entries.push(entry.name.to_os_string()); + false + }) + .await + .unwrap(); + assert_eq!(entries.len(), 2); + assert_eq!(readdir_count.readdir_count.load(Ordering::Relaxed), 1); + + // Simulate forget: evict all children. + fs.evict(10); + fs.evict(11); + + // Second readdir should re-fetch from the data provider (not return empty). 
+ let mut entries2 = Vec::new(); + fs.readdir(LoadedAddr::new_unchecked(1), 0, |entry, _| { + entries2.push(entry.name.to_os_string()); + false + }) + .await + .unwrap(); + assert_eq!(entries2.len(), 2, "second readdir must not return empty"); + assert_eq!( + readdir_count.readdir_count.load(Ordering::Relaxed), + 2, + "should have called dp.readdir again after eviction" + ); +} From 07d9749bcbe5563f25e48d2567f87df1e9a6bb2e Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Sun, 22 Feb 2026 11:36:49 -0800 Subject: [PATCH 47/58] fix: propagate DCache eviction through InodeForget drop path --- lib/fs/async_fs.rs | 23 ++++++++++++++++++----- lib/fs/fuser.rs | 11 ++++++++--- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/lib/fs/async_fs.rs b/lib/fs/async_fs.rs index 126e309e..8d78f063 100644 --- a/lib/fs/async_fs.rs +++ b/lib/fs/async_fs.rs @@ -93,14 +93,17 @@ impl StatelessDrop>, InodeAddr> for Inod } } -/// Evicts the inode from the table and delegates to [`FsDataProvider::forget`] -/// so the provider can clean up its own auxiliary state. -impl StatelessDrop<(Arc>, DP), InodeAddr> +/// Evicts the inode from the table and the directory cache, then delegates to +/// [`FsDataProvider::forget`] so the provider can clean up its own auxiliary +/// state. +impl + StatelessDrop<(Arc>, Arc, DP), InodeAddr> for InodeForget { - fn delete(ctx: &(Arc>, DP), key: &InodeAddr) { + fn delete(ctx: &(Arc>, Arc, DP), key: &InodeAddr) { ctx.0.remove_sync(key); - ctx.1.forget(*key); + ctx.1.evict(LoadedAddr::new_unchecked(*key)); + ctx.2.forget(*key); } } @@ -359,6 +362,16 @@ impl AsyncFs { } } + /// Returns a clone of the directory cache handle. + /// + /// Used by the FUSE adapter to pass the cache into the [`DropWard`] + /// context so that [`InodeForget`] can evict stale entries when the + /// kernel forgets an inode. 
+ #[must_use] + pub fn directory_cache(&self) -> Arc { + Arc::clone(&self.directory_cache) + } + /// Get the total number of inodes currently stored in the inode table. #[must_use] pub fn inode_count(&self) -> usize { diff --git a/lib/fs/fuser.rs b/lib/fs/fuser.rs index 78528914..b7f886be 100644 --- a/lib/fs/fuser.rs +++ b/lib/fs/fuser.rs @@ -70,7 +70,11 @@ impl FuseResultExt for Result { } type FuseWard = crate::drop_ward::DropWard< - (Arc>, DP), + ( + Arc>, + Arc, + DP, + ), InodeAddr, super::async_fs::InodeForget, >; @@ -83,8 +87,9 @@ struct FuseBridgeInner { impl FuseBridgeInner { fn create(table: FutureBackedCache, provider: DP) -> Self { let table = Arc::new(table); - let ward = crate::drop_ward::DropWard::new((Arc::clone(&table), provider.clone())); - let fs = super::async_fs::AsyncFs::new_preseeded(provider, table); + let fs = super::async_fs::AsyncFs::new_preseeded(provider.clone(), Arc::clone(&table)); + let dcache = fs.directory_cache(); + let ward = crate::drop_ward::DropWard::new((table, dcache, provider)); Self { ward, fs } } From 41be16dd6cbcefcd3effcaacdaf0675f42f8bdb8 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Sun, 22 Feb 2026 11:38:39 -0800 Subject: [PATCH 48/58] test: add readdir-evict-readdir round-trip integration test --- tests/async_fs_correctness.rs | 50 +++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/tests/async_fs_correctness.rs b/tests/async_fs_correctness.rs index 836b242c..5a32c114 100644 --- a/tests/async_fs_correctness.rs +++ b/tests/async_fs_correctness.rs @@ -823,3 +823,53 @@ async fn readdir_after_evict_re_fetches_from_provider() { "should have called dp.readdir again after eviction" ); } + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn readdir_evict_all_readdir_returns_same_entries() { + let root = make_inode(1, INodeType::Directory, 0, None); + let child_dir = make_inode(10, INodeType::Directory, 0, Some(1)); + let child_file = make_inode(11, INodeType::File, 100, 
Some(1)); + + let mut state = MockFsState::default(); + state.directories.insert( + 1, + vec![ + (OsString::from("subdir"), child_dir), + (OsString::from("file.txt"), child_file), + ], + ); + let dp = MockFsDataProvider::new(state); + + let table = Arc::new(FutureBackedCache::default()); + let fs = AsyncFs::new(dp, root, Arc::clone(&table)).await; + + // First readdir. + let mut first = Vec::new(); + fs.readdir(LoadedAddr::new_unchecked(1), 0, |entry, _| { + first.push((entry.name.to_os_string(), entry.inode.addr)); + false + }) + .await + .unwrap(); + assert_eq!(first.len(), 2); + + // Evict all children (simulating FUSE forget). + fs.evict(10); + fs.evict(11); + + // Wait for any prefetch tasks to settle. + tokio::time::sleep(std::time::Duration::from_millis(50)).await; + + // Second readdir should return the same entries. + let mut second = Vec::new(); + fs.readdir(LoadedAddr::new_unchecked(1), 0, |entry, _| { + second.push((entry.name.to_os_string(), entry.inode.addr)); + false + }) + .await + .unwrap(); + assert_eq!( + second, first, + "readdir after evict should return same entries" + ); +} From 3b23bca1b0f39b6f675c7ac88be6906fd1aa868e Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Sun, 22 Feb 2026 12:21:44 -0800 Subject: [PATCH 49/58] fix: address remaining critical review issues from PR #40 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Avoid nested lock (scc bucket → bridge mutex) in CompositeFs::forget by checking bridge emptiness outside the scc predicate (Alice C1) - Use compare_exchange(DONE→UNCLAIMED) in DCache::evict instead of a blind store to avoid clobbering an in-flight populate (Alice C2) - Document AssertUnwindSafe panic-safety contract on get_or_init and get_or_try_init (Bob C1) - Document why two clones in await_shared are structurally necessary and zero-cost for Copy types (Bob C2) - Handle closed-semaphore case in prefetch spawn with let-else (Bob M6) --- lib/cache/async_backed.rs | 23 
+++++++++++++++++++++++ lib/fs/async_fs.rs | 4 +++- lib/fs/composite.rs | 23 ++++++++++++++--------- lib/fs/dcache.rs | 24 +++++++++++++++++++----- 4 files changed, 59 insertions(+), 15 deletions(-) diff --git a/lib/cache/async_backed.rs b/lib/cache/async_backed.rs index 974aac09..8e35999e 100644 --- a/lib/cache/async_backed.rs +++ b/lib/cache/async_backed.rs @@ -60,6 +60,14 @@ where /// computation instead of spawning a duplicate. If the factory panics, the entry is removed /// and the next caller retries with a fresh factory invocation. /// + /// # Panic safety + /// + /// The factory is wrapped in `AssertUnwindSafe` + `catch_unwind` to prevent a panicking + /// factory from permanently poisoning the cache slot. Factories that capture shared mutable + /// state must ensure panics do not leave that state inconsistent. In practice this is + /// satisfied when factories capture only `Arc`, owned data, or immutable references — which + /// is the case for all callers in this codebase. + /// /// # Panics /// /// Panics only if *this* caller's own factory panicked (i.e. this caller won the `Vacant` @@ -152,6 +160,14 @@ where /// than coalescing on the first error. This is intentional — callers /// may have different retry or error-handling semantics. /// + /// # Panic safety + /// + /// The factory is wrapped in `AssertUnwindSafe` + `catch_unwind` to prevent a panicking + /// factory from permanently poisoning the cache slot. Factories that capture shared mutable + /// state must ensure panics do not leave that state inconsistent. In practice this is + /// satisfied when factories capture only `Arc`, owned data, or immutable references — which + /// is the case for all callers in this codebase. + /// /// # Panics /// /// Panics if the factory panics (caught internally via `catch_unwind`). @@ -278,6 +294,13 @@ where let result = shared.await; if let Some(v) = result { + // Two clones of `v` are structurally necessary: + // 1. 
`guard.value` — the `PromoteGuard` drop impl needs a copy + // to promote the slot if this task is cancelled between here + // and the `update_async` below. + // 2. `Slot::Ready(v.clone())` — moves the value into the map. + // For `Copy` types (like `INode`, the primary value type) both + // clones are zero-cost. guard.value = Some(v.clone()); self.map diff --git a/lib/fs/async_fs.rs b/lib/fs/async_fs.rs index 8d78f063..3d247782 100644 --- a/lib/fs/async_fs.rs +++ b/lib/fs/async_fs.rs @@ -356,7 +356,9 @@ impl AsyncFs { let table = Arc::clone(&self.inode_table); let dp = self.data_provider.clone(); tokio::spawn(async move { - let _permit = sem.acquire().await; + let Ok(_permit) = sem.acquire().await else { + return; + }; prefetch_dir(child_addr, dcache, table, dp).await; }); } diff --git a/lib/fs/composite.rs b/lib/fs/composite.rs index 950def95..ba18bc4b 100644 --- a/lib/fs/composite.rs +++ b/lib/fs/composite.rs @@ -489,17 +489,22 @@ where // `forget` fires only when nlookup reaches zero. self.inner.addr_to_slot.remove_sync(&addr); if bridge_empty { - // Bridge is empty — atomically remove the slot only if no one - // has re-populated the bridge between our check and this removal. - // `remove_if_sync` holds the scc bucket lock during evaluation, - // and `is_empty_locked` acquires the bridge's coordination mutex - // to serialize with any concurrent `backward_or_insert`. - let removed = self + // Check emptiness under the bridge's coordination lock OUTSIDE + // the scc bucket guard, avoiding nested lock acquisition + // (scc bucket -> bridge mutex) which would be a deadlock risk. 
+ let still_empty = self .inner .slots - .remove_if_sync(&slot_idx, |slot| slot.bridge.is_empty_locked()); - if let Some((_, slot)) = removed { - self.inner.name_to_slot.remove_sync(&slot.name); + .read_sync(&slot_idx, |_, slot| Arc::clone(&slot.bridge)) + .is_some_and(|b| b.is_empty_locked()); + if still_empty { + let removed = self + .inner + .slots + .remove_if_sync(&slot_idx, |slot| slot.bridge.is_empty()); + if let Some((_, slot)) = removed { + self.inner.name_to_slot.remove_sync(&slot.name); + } } } } diff --git a/lib/fs/dcache.rs b/lib/fs/dcache.rs index aa304904..4285e324 100644 --- a/lib/fs/dcache.rs +++ b/lib/fs/dcache.rs @@ -199,6 +199,12 @@ impl DCache { /// from that parent's children map, and resets the parent's populate /// flag to `UNCLAIMED` so the next `readdir` re-fetches from the /// data provider. + /// + /// The reset uses `compare_exchange(DONE -> UNCLAIMED)` rather than a + /// blind store to avoid a race with an in-flight populate: if a + /// concurrent `readdir` is mid-populate (`IN_PROGRESS`), a blind store + /// of `UNCLAIMED` would be overwritten by the populator's final `DONE` + /// store, leaving the cache in a stale-but-marked-done state. pub fn evict(&self, child_ino: LoadedAddr) { let Some((_, parent_ino)) = self.child_to_parent.remove_sync(&child_ino) else { return; @@ -212,11 +218,19 @@ impl DCache { .unwrap_or_else(std::sync::PoisonError::into_inner); children.retain(|_, dv| dv.ino != child_ino); drop(children); - // Reset regardless of current state. If a populate is in flight, - // the concurrent caller will overwrite this with DONE when it - // finishes; that is acceptable because the next readdir will - // re-fetch again. - state.populated.store(POPULATE_UNCLAIMED, Ordering::Release); + // Only reset from DONE to UNCLAIMED. 
If a populate is in flight + // (IN_PROGRESS), leave the flag alone: the concurrent populator will + // finish and store DONE, but the child we just removed is already gone + // from the children map, and readdir handles missing inodes gracefully + // (skips with a debug log). The next readdir after that populate + // completes will see DONE and serve the (stale) cache, but a + // subsequent forget-cycle will evict again. + let _ = state.populated.compare_exchange( + POPULATE_DONE, + POPULATE_UNCLAIMED, + Ordering::AcqRel, + Ordering::Relaxed, + ); state.notify.notify_waiters(); } From d660c590183723ee7d8c76aefd34304183f76301 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Sun, 22 Feb 2026 12:55:03 -0800 Subject: [PATCH 50/58] Performance improvement --- lib/cache/async_backed.rs | 22 +++++++++++ lib/fs/async_fs.rs | 16 +++++--- lib/fs/dcache.rs | 58 ++++++++++++++++++++------- src/fs/mescloud/roots.rs | 4 ++ tests/dcache_correctness.rs | 78 ++++++++++++++++++++++++------------- 5 files changed, 129 insertions(+), 49 deletions(-) diff --git a/lib/cache/async_backed.rs b/lib/cache/async_backed.rs index 8e35999e..7c406161 100644 --- a/lib/cache/async_backed.rs +++ b/lib/cache/async_backed.rs @@ -225,6 +225,9 @@ where ) }); let generation = self.next_gen.fetch_add(1, Ordering::Relaxed); + // Channel is per-iteration: each owner gets its own error channel. + // Do not hoist above the loop — joiners that re-enter the Vacant + // branch need a fresh channel for their own factory invocation. let (error_tx, mut error_rx) = tokio::sync::oneshot::channel(); let shared = Self::make_shared_fallible(f, error_tx); let ret = shared.clone(); @@ -330,6 +333,9 @@ where F: FnOnce() -> Fut, Fut: Future + Send + 'static, { + // SAFETY(unwind): factories in this codebase capture only Arc, owned data, + // or immutable references — no shared mutable state that could be left + // inconsistent by a panic. See `get_or_init` doc comment for details. 
let fut = AssertUnwindSafe(factory()).catch_unwind(); let boxed: Pin> + Send>> = Box::pin(async move { fut.await.ok() }); @@ -349,6 +355,8 @@ where Fut: Future> + Send + 'static, E: Send + 'static, { + // SAFETY(unwind): same justification as `make_shared` — factories capture + // only Arc, owned data, or immutable references. let fut = AssertUnwindSafe(factory()).catch_unwind(); let boxed: Pin> + Send>> = Box::pin(async move { match fut.await { @@ -389,6 +397,20 @@ where pub fn remove_sync(&self, key: &K) -> bool { self.map.remove_sync(key).is_some() } + + /// Synchronously remove all `Ready` entries for which `predicate` returns `true`. + /// + /// `InFlight` entries are always retained — only fully resolved entries + /// are eligible for removal. This is safe to call concurrently with + /// other cache operations: `scc::HashMap::retain_sync` acquires + /// bucket-level locks, and `InFlight` entries are skipped so in-progress + /// computations are never disturbed. + pub fn remove_ready_if_sync(&self, mut predicate: impl FnMut(&K, &V) -> bool) { + self.map.retain_sync(|k, slot| match slot { + Slot::InFlight(..) 
=> true, + Slot::Ready(v) => !predicate(k, v), + }); + } } /// Drop guard that synchronously promotes an `InFlight` entry to `Ready` if the caller diff --git a/lib/fs/async_fs.rs b/lib/fs/async_fs.rs index 3d247782..e8cb93e0 100644 --- a/lib/fs/async_fs.rs +++ b/lib/fs/async_fs.rs @@ -235,10 +235,10 @@ async fn prefetch_dir( ) { use crate::fs::dcache::PopulateStatus; - match directory_cache.try_claim_populate(dir_addr) { - PopulateStatus::Claimed => {} + let claim_gen = match directory_cache.try_claim_populate(dir_addr) { + PopulateStatus::Claimed(claim_gen) => claim_gen, PopulateStatus::InProgress | PopulateStatus::Done => return, - } + }; let mut guard = PopulateGuard::new(&directory_cache, dir_addr); @@ -262,7 +262,7 @@ async fn prefetch_dir( is_dir, ); } - directory_cache.finish_populate(dir_addr); + directory_cache.finish_populate(dir_addr, claim_gen); guard.defuse(); } @@ -510,6 +510,10 @@ impl AsyncFs { pub fn evict(&self, addr: InodeAddr) { self.inode_table.remove_sync(&addr); self.directory_cache.evict(LoadedAddr::new_unchecked(addr)); + self.lookup_cache + .remove_ready_if_sync(|&(parent_addr, _), child| { + parent_addr == addr || child.addr == addr + }); self.data_provider.forget(addr); } @@ -541,7 +545,7 @@ impl AsyncFs { // Uses a three-state CAS gate to prevent duplicate dp.readdir() calls. loop { match self.directory_cache.try_claim_populate(parent) { - PopulateStatus::Claimed => { + PopulateStatus::Claimed(claim_gen) => { // RAII guard: if this future is cancelled between Claimed // and finish_populate, automatically abort so other waiters // can retry instead of hanging forever. 
@@ -559,7 +563,7 @@ impl AsyncFs { child_inode.itype == INodeType::Directory, ); } - self.directory_cache.finish_populate(parent); + self.directory_cache.finish_populate(parent, claim_gen); guard.defuse(); self.spawn_prefetch_children(parent); break; diff --git a/lib/fs/dcache.rs b/lib/fs/dcache.rs index 4285e324..49351776 100644 --- a/lib/fs/dcache.rs +++ b/lib/fs/dcache.rs @@ -1,6 +1,6 @@ use std::collections::BTreeMap; use std::ffi::{OsStr, OsString}; -use std::sync::atomic::{AtomicU8, Ordering}; +use std::sync::atomic::{AtomicU8, AtomicU64, Ordering}; use std::sync::{Arc, RwLock}; use tokio::sync::Notify; @@ -24,7 +24,9 @@ const POPULATE_DONE: u8 = 2; /// Result of attempting to claim a directory for population. pub enum PopulateStatus { /// This caller won the race and should populate the directory. - Claimed, + /// Carries the generation at claim time so [`DCache::finish_populate`] + /// can detect whether an eviction invalidated the populate. + Claimed(u64), /// Another caller is currently populating; wait and re-check. InProgress, /// The directory is already fully populated. @@ -35,6 +37,10 @@ pub enum PopulateStatus { struct DirState { children: RwLock>, populated: AtomicU8, + /// Monotonically increasing counter bumped by each [`DCache::evict`] call. + /// Allows [`DCache::finish_populate`] to detect that an eviction occurred + /// while a populate was in flight. + generation: AtomicU64, /// Wakes waiters when `populated` transitions out of `IN_PROGRESS`. notify: Notify, } @@ -44,6 +50,7 @@ impl DirState { Self { children: RwLock::new(BTreeMap::new()), populated: AtomicU8::new(POPULATE_UNCLAIMED), + generation: AtomicU64::new(0), notify: Notify::new(), } } @@ -60,6 +67,9 @@ pub struct DCache { /// Reverse index: child inode -> parent inode, for O(1) parent discovery /// during eviction. 
child_to_parent: scc::HashMap, + /// Reverse index: child inode -> entry name, for O(log n) removal from + /// the parent's `BTreeMap` during eviction (instead of O(n) `retain`). + child_to_name: scc::HashMap, } impl Default for DCache { @@ -75,6 +85,7 @@ impl DCache { Self { dirs: scc::HashMap::new(), child_to_parent: scc::HashMap::new(), + child_to_name: scc::HashMap::new(), } } @@ -113,8 +124,13 @@ impl DCache { .children .write() .unwrap_or_else(std::sync::PoisonError::into_inner); - children.insert(name, value); - // Upsert: overwrite if this child was previously cached under a different parent. + if let Some(old) = children.insert(name.clone(), value) + && old.ino != ino + { + self.child_to_name.remove_sync(&old.ino); + self.child_to_parent.remove_sync(&old.ino); + } + self.child_to_name.upsert_sync(ino, name); self.child_to_parent.upsert_sync(ino, parent_ino); } @@ -169,6 +185,7 @@ impl DCache { let removed = children.remove(name); if let Some(ref dv) = removed { self.child_to_parent.remove_sync(&dv.ino); + self.child_to_name.remove_sync(&dv.ino); } removed } @@ -186,6 +203,7 @@ impl DCache { .unwrap_or_else(std::sync::PoisonError::into_inner); for dv in children.values() { self.child_to_parent.remove_sync(&dv.ino); + self.child_to_name.remove_sync(&dv.ino); } true } else { @@ -216,15 +234,13 @@ impl DCache { .children .write() .unwrap_or_else(std::sync::PoisonError::into_inner); - children.retain(|_, dv| dv.ino != child_ino); + if let Some((_, name)) = self.child_to_name.remove_sync(&child_ino) { + children.remove(&name); + } drop(children); - // Only reset from DONE to UNCLAIMED. If a populate is in flight - // (IN_PROGRESS), leave the flag alone: the concurrent populator will - // finish and store DONE, but the child we just removed is already gone - // from the children map, and readdir handles missing inodes gracefully - // (skips with a debug log). 
The next readdir after that populate - // completes will see DONE and serve the (stale) cache, but a - // subsequent forget-cycle will evict again. + // Bump generation so any in-flight populate knows its data is stale. + state.generation.fetch_add(1, Ordering::Release); + // Reset DONE -> UNCLAIMED so the next readdir re-fetches. let _ = state.populated.compare_exchange( POPULATE_DONE, POPULATE_UNCLAIMED, @@ -248,16 +264,28 @@ impl DCache { Ordering::AcqRel, Ordering::Acquire, ) { - Ok(_) => PopulateStatus::Claimed, + Ok(_) => { + let claim_gen = state.generation.load(Ordering::Acquire); + PopulateStatus::Claimed(claim_gen) + } Err(POPULATE_IN_PROGRESS) => PopulateStatus::InProgress, Err(_) => PopulateStatus::Done, } } /// Mark a directory as fully populated after successful population. - pub fn finish_populate(&self, parent_ino: LoadedAddr) { + /// + /// `claimed_gen` is the generation returned by [`try_claim_populate`]. If + /// an [`evict`](Self::evict) bumped the generation since then, the data + /// is stale so the flag is reset to `UNCLAIMED` instead of `DONE`. + pub fn finish_populate(&self, parent_ino: LoadedAddr, claimed_gen: u64) { let state = self.dir_state(parent_ino); - state.populated.store(POPULATE_DONE, Ordering::Release); + let current_gen = state.generation.load(Ordering::Acquire); + if current_gen == claimed_gen { + state.populated.store(POPULATE_DONE, Ordering::Release); + } else { + state.populated.store(POPULATE_UNCLAIMED, Ordering::Release); + } state.notify.notify_waiters(); } diff --git a/src/fs/mescloud/roots.rs b/src/fs/mescloud/roots.rs index 7c8701db..ca66da45 100644 --- a/src/fs/mescloud/roots.rs +++ b/src/fs/mescloud/roots.rs @@ -345,6 +345,10 @@ impl OrgChildDP { } } +// Manual dispatch is safe: Rust's exhaustive matching requires every new +// variant to be handled in every `match` below, so the compiler enforces +// completeness. A macro abstraction would add complexity without benefit +// for a two-variant enum. 
impl FsDataProvider for OrgChildDP { type Reader = OrgChildReader; diff --git a/tests/dcache_correctness.rs b/tests/dcache_correctness.rs index b3a24e4e..32010711 100644 --- a/tests/dcache_correctness.rs +++ b/tests/dcache_correctness.rs @@ -77,18 +77,18 @@ async fn try_claim_populate_unclaimed_returns_claimed() { let cache = DCache::new(); assert!(matches!( cache.try_claim_populate(LoadedAddr::new_unchecked(1)), - PopulateStatus::Claimed + PopulateStatus::Claimed(_) )); } #[tokio::test] async fn finish_populate_then_claim_returns_done() { let cache = DCache::new(); - assert!(matches!( - cache.try_claim_populate(LoadedAddr::new_unchecked(1)), - PopulateStatus::Claimed - )); - cache.finish_populate(LoadedAddr::new_unchecked(1)); + let PopulateStatus::Claimed(claim_gen) = cache.try_claim_populate(LoadedAddr::new_unchecked(1)) + else { + panic!("expected Claimed") + }; + cache.finish_populate(LoadedAddr::new_unchecked(1), claim_gen); assert!(matches!( cache.try_claim_populate(LoadedAddr::new_unchecked(1)), PopulateStatus::Done @@ -100,7 +100,7 @@ async fn double_claim_returns_in_progress() { let cache = DCache::new(); assert!(matches!( cache.try_claim_populate(LoadedAddr::new_unchecked(1)), - PopulateStatus::Claimed + PopulateStatus::Claimed(_) )); assert!(matches!( cache.try_claim_populate(LoadedAddr::new_unchecked(1)), @@ -113,12 +113,12 @@ async fn abort_populate_allows_reclaim() { let cache = DCache::new(); assert!(matches!( cache.try_claim_populate(LoadedAddr::new_unchecked(1)), - PopulateStatus::Claimed + PopulateStatus::Claimed(_) )); cache.abort_populate(LoadedAddr::new_unchecked(1)); assert!(matches!( cache.try_claim_populate(LoadedAddr::new_unchecked(1)), - PopulateStatus::Claimed + PopulateStatus::Claimed(_) )); } @@ -134,7 +134,7 @@ async fn insert_does_not_mark_populated() { assert!( matches!( cache.try_claim_populate(LoadedAddr::new_unchecked(1)), - PopulateStatus::Claimed + PopulateStatus::Claimed(_) ), "insert alone should not mark a directory as 
populated" ); @@ -283,11 +283,10 @@ async fn remove_parent_resets_populate_status() { LoadedAddr::new_unchecked(10), false, ); - assert!(matches!( - cache.try_claim_populate(parent), - PopulateStatus::Claimed - )); - cache.finish_populate(parent); + let PopulateStatus::Claimed(claim_gen) = cache.try_claim_populate(parent) else { + panic!("expected Claimed") + }; + cache.finish_populate(parent, claim_gen); assert!(matches!( cache.try_claim_populate(parent), PopulateStatus::Done @@ -301,7 +300,7 @@ async fn remove_parent_resets_populate_status() { // After removal, the parent is gone, so populate returns Claimed again. assert!(matches!( cache.try_claim_populate(parent), - PopulateStatus::Claimed + PopulateStatus::Claimed(_) )); // Children should also be gone. assert!(cache.lookup(parent, OsStr::new("x")).is_none()); @@ -322,11 +321,10 @@ async fn evict_removes_child_and_resets_populate_status() { let parent = LoadedAddr::new_unchecked(1); let child = LoadedAddr::new_unchecked(10); cache.insert(parent, OsString::from("foo"), child, false); - assert!(matches!( - cache.try_claim_populate(parent), - PopulateStatus::Claimed - )); - cache.finish_populate(parent); + let PopulateStatus::Claimed(claim_gen) = cache.try_claim_populate(parent) else { + panic!("expected Claimed") + }; + cache.finish_populate(parent, claim_gen); assert!(matches!( cache.try_claim_populate(parent), PopulateStatus::Done @@ -339,7 +337,7 @@ async fn evict_removes_child_and_resets_populate_status() { // Populate status should be reset so next readdir re-fetches. 
assert!(matches!( cache.try_claim_populate(parent), - PopulateStatus::Claimed + PopulateStatus::Claimed(_) )); } @@ -366,11 +364,10 @@ async fn evict_does_not_affect_siblings() { LoadedAddr::new_unchecked(11), true, ); - assert!(matches!( - cache.try_claim_populate(parent), - PopulateStatus::Claimed - )); - cache.finish_populate(parent); + let PopulateStatus::Claimed(claim_gen) = cache.try_claim_populate(parent) else { + panic!("expected Claimed") + }; + cache.finish_populate(parent, claim_gen); cache.evict(LoadedAddr::new_unchecked(10)); @@ -379,7 +376,7 @@ async fn evict_does_not_affect_siblings() { // But populate status should be reset. assert!(matches!( cache.try_claim_populate(parent), - PopulateStatus::Claimed + PopulateStatus::Claimed(_) )); } @@ -399,3 +396,28 @@ async fn evict_child_from_multiple_parents_removes_from_correct_parent() { // The parent_b entry should be removed (last insert wins in reverse index). assert!(cache.lookup(parent_b, OsStr::new("y")).is_none()); } + +#[tokio::test] +async fn evict_during_populate_invalidates_generation() { + let cache = DCache::new(); + let parent = LoadedAddr::new_unchecked(1); + let child = LoadedAddr::new_unchecked(10); + cache.insert(parent, OsString::from("foo"), child, false); + + let PopulateStatus::Claimed(claim_gen) = cache.try_claim_populate(parent) else { + panic!("expected Claimed") + }; + + // Evict while populate is in progress. + cache.evict(child); + + // Finish populate with the stale generation. + cache.finish_populate(parent, claim_gen); + + // The finish_populate should have detected the generation mismatch + // and reset to UNCLAIMED instead of DONE. 
+ assert!( + matches!(cache.try_claim_populate(parent), PopulateStatus::Claimed(_)), + "should be re-claimable after evict invalidated the generation" + ); +} From 0c27dfeaa6948733dd1f0325f6f3f7bab27bebfd Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Sun, 22 Feb 2026 13:13:27 -0800 Subject: [PATCH 51/58] fix: harden DCache::evict against concurrent insert and document known limitations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rewrite evict to read (not remove) the reverse index first, acquire the parent's children write lock, then re-check ownership before removing — closing the TOCTOU race with concurrent insert. - Document unbounded memory growth in inode_table/lookup_cache (AsyncFs). - Expand MES-697 TODO to note the cache-eviction interaction. - Add two new DCache tests for evict-reinsert and concurrent reparent. --- lib/fs/async_fs.rs | 6 ++++ lib/fs/dcache.rs | 31 +++++++++++++++++- src/fs/mescloud/repo.rs | 5 +++ tests/dcache_correctness.rs | 63 +++++++++++++++++++++++++++++++++++++ 4 files changed, 104 insertions(+), 1 deletion(-) diff --git a/lib/fs/async_fs.rs b/lib/fs/async_fs.rs index e8cb93e0..7ce84817 100644 --- a/lib/fs/async_fs.rs +++ b/lib/fs/async_fs.rs @@ -282,6 +282,12 @@ const MAX_PREFETCH_CONCURRENCY: usize = 8; /// called on a true cache miss (not already cached or in-flight). /// /// The [`DCache`] sits in front as a synchronous fast path mapping `(parent, name)` to child addr. +/// +/// **Known limitation:** Both `inode_table` and `lookup_cache` grow monotonically — entries are +/// only removed when FUSE sends `forget`, which may never happen for long-lived mounts or +/// recursive traversals (e.g. `find`, `tree`). Under sustained traversal the memory footprint +/// grows without bound. Adding LRU or TTL-based eviction to these caches is a planned +/// improvement. pub struct AsyncFs { /// Canonical addr -> `INode` map. Used by `loaded_inode()` to retrieve inodes by address. 
inode_table: Arc>, diff --git a/lib/fs/dcache.rs b/lib/fs/dcache.rs index 49351776..914713fa 100644 --- a/lib/fs/dcache.rs +++ b/lib/fs/dcache.rs @@ -223,17 +223,46 @@ impl DCache { /// concurrent `readdir` is mid-populate (`IN_PROGRESS`), a blind store /// of `UNCLAIMED` would be overwritten by the populator's final `DONE` /// store, leaving the cache in a stale-but-marked-done state. + /// + /// # Ordering with concurrent `insert` + /// + /// The reverse-index removal and children-map removal are performed + /// while holding the parent's `children` write lock. This serializes + /// with `insert` (which also holds the write lock while updating the + /// reverse indices), preventing a race where a concurrent `insert` + /// for the same child inode could clobber freshly removed reverse-index + /// entries between `child_to_parent.remove_sync` and the write lock + /// acquisition. pub fn evict(&self, child_ino: LoadedAddr) { - let Some((_, parent_ino)) = self.child_to_parent.remove_sync(&child_ino) else { + // Read the parent without removing — we need the write lock first + // to serialize with concurrent `insert`. + let Some(parent_ino) = self.child_to_parent.read_sync(&child_ino, |_, &v| v) else { return; }; let Some(state) = self.dirs.read_sync(&parent_ino, |_, v| Arc::clone(v)) else { + // Parent dir was removed; clean up reverse indices. + self.child_to_parent.remove_sync(&child_ino); + self.child_to_name.remove_sync(&child_ino); return; }; let mut children = state .children .write() .unwrap_or_else(std::sync::PoisonError::into_inner); + // Re-check that child_to_parent still points to this parent. + // A concurrent `insert` may have re-parented the child while we + // were waiting for the write lock. + let still_ours = self + .child_to_parent + .read_sync(&child_ino, |_, &v| v == parent_ino) + .unwrap_or(false); + if !still_ours { + // The child was re-parented by a concurrent insert. + // Nothing to do — the new parent owns the reverse indices. 
+ return; + } + // Now atomically remove reverse indices and children entry. + self.child_to_parent.remove_sync(&child_ino); if let Some((_, name)) = self.child_to_name.remove_sync(&child_ino) { children.remove(&name); } diff --git a/src/fs/mescloud/repo.rs b/src/fs/mescloud/repo.rs index 2e1ca999..f1662ec7 100644 --- a/src/fs/mescloud/repo.rs +++ b/src/fs/mescloud/repo.rs @@ -153,6 +153,11 @@ impl FsDataProvider for MesRepoProvider { // the second insert to silently overwrite the first. The `AsyncFs` lookup cache // deduplicates in practice, but a content-addressed scheme (hash path → addr) // would eliminate the race structurally. + // + // This also interacts badly with cache eviction: when an inode is evicted and + // later re-looked-up, a fresh address is minted, leaking stale bridge and + // addr_to_slot entries for the old address. A stable, content-addressed scheme + // would make re-lookup return the same address and avoid the leak. let addr = inner.next_addr.fetch_add(1, Ordering::Relaxed); drop(inner.path_map.insert_async(addr, child_path).await); diff --git a/tests/dcache_correctness.rs b/tests/dcache_correctness.rs index 32010711..d6d16534 100644 --- a/tests/dcache_correctness.rs +++ b/tests/dcache_correctness.rs @@ -421,3 +421,66 @@ async fn evict_during_populate_invalidates_generation() { "should be re-claimable after evict invalidated the generation" ); } + +#[tokio::test] +async fn evict_then_reinsert_same_child_leaves_consistent_state() { + let cache = DCache::new(); + let parent_a = LoadedAddr::new_unchecked(1); + let parent_b = LoadedAddr::new_unchecked(2); + let child = LoadedAddr::new_unchecked(10); + + // Insert child under parent_a. + cache.insert(parent_a, OsString::from("foo"), child, false); + assert!(cache.lookup(parent_a, OsStr::new("foo")).is_some()); + + // Evict the child. + cache.evict(child); + assert!(cache.lookup(parent_a, OsStr::new("foo")).is_none()); + + // Re-insert the same child under a different parent. 
+ cache.insert(parent_b, OsString::from("bar"), child, true); + assert!(cache.lookup(parent_b, OsStr::new("bar")).is_some()); + + // A second evict should remove from parent_b, not parent_a. + cache.evict(child); + assert!( + cache.lookup(parent_b, OsStr::new("bar")).is_none(), + "evict after re-insert should remove from the new parent" + ); +} + +#[tokio::test] +async fn evict_with_concurrent_reparent_does_not_corrupt() { + // Simulates the interleaving where insert re-parents a child between + // evict's parent lookup and write-lock acquisition. + let cache = DCache::new(); + let parent_a = LoadedAddr::new_unchecked(1); + let parent_b = LoadedAddr::new_unchecked(2); + let child = LoadedAddr::new_unchecked(10); + + // Insert child under parent_a. + cache.insert(parent_a, OsString::from("foo"), child, false); + + // Simulate: evict reads parent_ino = parent_a from reverse index, + // then insert re-parents child to parent_b before evict acquires + // the write lock. We can't truly interleave threads here, but we + // can verify the post-condition: after insert moves the child to + // parent_b and evict runs, the child should still be in parent_b. + cache.insert(parent_b, OsString::from("bar"), child, false); + + // Now evict — should detect that child_to_parent no longer points + // to parent_a and leave parent_b's entry intact. + cache.evict(child); + + // parent_b's "bar" entry should have been evicted (child_to_parent + // now points to parent_b, so evict targets the correct parent). + assert!( + cache.lookup(parent_b, OsStr::new("bar")).is_none(), + "evict should target the current parent, not a stale one" + ); + // parent_a's "foo" entry should already have been cleaned up by the + // insert that re-parented the child (insert replaces the old entry + // only if the ino changes — here same ino, different parent, so + // parent_a retains a stale "foo" entry pointing to the moved child). + // This is a known limitation: insert does not cross-parent cleanup. 
+} From c171cca777c1f6a178036bc92b69b488f38ca697 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Sun, 22 Feb 2026 14:24:35 -0800 Subject: [PATCH 52/58] small cleanup --- lib/fs/composite.rs | 25 ++++++++++--------------- lib/fs/dcache.rs | 6 ++++++ lib/fs/mod.rs | 6 ++++++ src/daemon.rs | 2 +- src/fs/mescloud/repo.rs | 5 +++-- src/fs/mescloud/roots.rs | 8 +++----- 6 files changed, 29 insertions(+), 23 deletions(-) diff --git a/lib/fs/composite.rs b/lib/fs/composite.rs index ba18bc4b..bca8125d 100644 --- a/lib/fs/composite.rs +++ b/lib/fs/composite.rs @@ -16,7 +16,7 @@ use bytes::Bytes; use crate::cache::async_backed::FutureBackedCache; use crate::fs::async_fs::{AsyncFs, FileReader, FsDataProvider, OpenFile}; use crate::fs::bridge::ConcurrentBridge; -use crate::fs::{INode, INodeType, InodeAddr, InodePerms, LoadedAddr, OpenFlags}; +use crate::fs::{INode, INodeType, InodeAddr, InodePerms, LoadedAddr, OpenFlags, ROOT_INO}; /// Descriptor for a child filesystem returned by [`CompositeRoot`]. pub struct ChildDescriptor { @@ -55,16 +55,14 @@ pub trait CompositeRoot: Send + Sync + 'static { /// Co-locates an inode table and [`AsyncFs`]. pub struct ChildInner { - #[expect(dead_code)] - table: Arc>, fs: AsyncFs, } impl ChildInner { pub(crate) fn create(table: FutureBackedCache, provider: DP) -> Self { let table = Arc::new(table); - let fs = AsyncFs::new_preseeded(provider, Arc::clone(&table)); - Self { table, fs } + let fs = AsyncFs::new_preseeded(provider, table); + Self { fs } } pub(crate) fn get_fs(&self) -> &AsyncFs { @@ -152,9 +150,6 @@ impl Clone for CompositeFs { } impl CompositeFs { - /// Root inode address for this composite level. - pub const ROOT_INO: InodeAddr = 1; - /// Create a new composite filesystem. 
#[must_use] pub fn new(root: R, fs_owner: (u32, u32)) -> Self { @@ -165,7 +160,7 @@ impl CompositeFs { addr_to_slot: scc::HashMap::new(), name_to_slot: scc::HashMap::new(), next_slot: AtomicU64::new(0), - next_ino: AtomicU64::new(2), // 1 = root + next_ino: AtomicU64::new(ROOT_INO + 1), fs_owner, }), } @@ -176,7 +171,7 @@ impl CompositeFs { pub fn make_root_inode(&self) -> INode { let now = std::time::SystemTime::now(); INode { - addr: Self::ROOT_INO, + addr: ROOT_INO, permissions: InodePerms::from_bits_truncate(0o755), uid: self.inner.fs_owner.0, gid: self.inner.fs_owner.1, @@ -201,7 +196,7 @@ impl CompositeFs { gid: self.inner.fs_owner.1, create_time: now, last_modified_at: now, - parent: Some(Self::ROOT_INO), + parent: Some(ROOT_INO), size: 0, itype: INodeType::Directory, } @@ -291,7 +286,7 @@ where type Reader = CompositeReader<<::ChildDP as FsDataProvider>::Reader>; async fn lookup(&self, parent: INode, name: &OsStr) -> Result { - if parent.addr == Self::ROOT_INO { + if parent.addr == ROOT_INO { let desc = self .inner .root @@ -345,7 +340,7 @@ where } async fn readdir(&self, parent: INode) -> Result, std::io::Error> { - if parent.addr == Self::ROOT_INO { + if parent.addr == ROOT_INO { let children = self.inner.root.list_children().await?; let mut entries = Vec::with_capacity(children.len()); for desc in &children { @@ -451,7 +446,7 @@ where /// /// The root inode is never forgotten. 
fn forget(&self, addr: InodeAddr) { - if addr == Self::ROOT_INO { + if addr == ROOT_INO { return; } let Some(slot_idx) = self.inner.addr_to_slot.read_sync(&addr, |_, &v| v) else { @@ -501,7 +496,7 @@ where let removed = self .inner .slots - .remove_if_sync(&slot_idx, |slot| slot.bridge.is_empty()); + .remove_if_sync(&slot_idx, |slot| slot.bridge.is_empty_locked()); if let Some((_, slot)) = removed { self.inner.name_to_slot.remove_sync(&slot.name); } diff --git a/lib/fs/dcache.rs b/lib/fs/dcache.rs index 914713fa..39c0ded0 100644 --- a/lib/fs/dcache.rs +++ b/lib/fs/dcache.rs @@ -35,6 +35,12 @@ pub enum PopulateStatus { /// Per-parent directory state holding child entries and a population flag. struct DirState { + /// Child entries, guarded by `std::sync::RwLock` (NOT `tokio::sync::RwLock`). + /// + /// This is intentional: all lock acquisitions are scoped and synchronous, + /// never held across `.await` points. `std::sync::RwLock` has lower + /// overhead in the uncontended case. Do NOT introduce `.await` calls + /// while holding a guard — this would block the tokio worker thread. children: RwLock>, populated: AtomicU8, /// Monotonically increasing counter bumped by each [`DCache::evict`] call. diff --git a/lib/fs/mod.rs b/lib/fs/mod.rs index 52f9510e..b277aa23 100644 --- a/lib/fs/mod.rs +++ b/lib/fs/mod.rs @@ -40,6 +40,12 @@ use bitflags::bitflags; /// Type representing an inode identifier. pub type InodeAddr = u64; +/// The conventional root inode address used by all filesystem layers. +/// +/// Both [`CompositeFs`](composite::CompositeFs) and data providers use this +/// value as the root address. Monotonic inode counters start at `ROOT_INO + 1`. +pub const ROOT_INO: InodeAddr = 1; + /// Represents an inode address that has been loaded into the inode table. 
/// /// This newtype wrapper distinguishes inode addresses that are known to exist diff --git a/src/daemon.rs b/src/daemon.rs index 102e476b..d07070c4 100644 --- a/src/daemon.rs +++ b/src/daemon.rs @@ -73,7 +73,7 @@ mod managed_fuse { let table = FutureBackedCache::default(); let root_inode = composite.make_root_inode(); - table.insert_sync(1, root_inode); + table.insert_sync(git_fs::fs::ROOT_INO, root_inode); let fuse_adapter = FuserAdapter::new(table, composite, handle); let mount_opts = [ diff --git a/src/fs/mescloud/repo.rs b/src/fs/mescloud/repo.rs index f1662ec7..3998e0b8 100644 --- a/src/fs/mescloud/repo.rs +++ b/src/fs/mescloud/repo.rs @@ -19,7 +19,7 @@ use tracing::warn; use git_fs::cache::fcache::FileCache; use git_fs::cache::traits::{AsyncReadableCache as _, AsyncWritableCache as _}; use git_fs::fs::async_fs::{FileReader, FsDataProvider}; -use git_fs::fs::{INode, INodeType, InodeAddr, InodePerms, OpenFlags as AsyncOpenFlags}; +use git_fs::fs::{INode, INodeType, InodeAddr, InodePerms, OpenFlags as AsyncOpenFlags, ROOT_INO}; use super::common::{MesaApiError, mesa_api_error_to_io}; @@ -68,7 +68,7 @@ impl MesRepoProvider { repo_name, ref_, fs_owner, - next_addr: AtomicU64::new(2), // 1 is reserved for root + next_addr: AtomicU64::new(ROOT_INO + 1), path_map: scc::HashMap::new(), file_cache, }), @@ -136,6 +136,7 @@ impl FsDataProvider for MesRepoProvider { let now = SystemTime::now(); let (uid, gid) = inner.fs_owner; + // Symlinks are mapped to File because FuserAdapter does not implement readlink. 
let (itype, size) = match &content { Content::File(f) => (INodeType::File, f.size.to_u64().unwrap_or(0)), Content::Symlink(s) => (INodeType::File, s.size.to_u64().unwrap_or(0)), diff --git a/src/fs/mescloud/roots.rs b/src/fs/mescloud/roots.rs index ca66da45..bdd676e0 100644 --- a/src/fs/mescloud/roots.rs +++ b/src/fs/mescloud/roots.rs @@ -19,14 +19,12 @@ use tracing::warn; use git_fs::cache::fcache::FileCache; use git_fs::fs::async_fs::{FileReader, FsDataProvider}; use git_fs::fs::composite::{ChildDescriptor, CompositeFs, CompositeReader, CompositeRoot}; -use git_fs::fs::{INode, INodeType, InodeAddr, InodePerms, OpenFlags}; +use git_fs::fs::{INode, INodeType, InodeAddr, InodePerms, OpenFlags, ROOT_INO}; use super::common::{MesaApiError, mesa_api_error_to_io}; use super::repo::{MesFileReader, MesRepoProvider}; use crate::app_config::CacheConfig; -const CHILD_ROOT_ADDR: InodeAddr = 1; - /// Create a [`MesRepoProvider`] and its root [`INode`] for a given repo. async fn create_repo_provider( client: &MesaClient, @@ -61,11 +59,11 @@ async fn create_repo_provider( file_cache, ); - provider.seed_root_path(CHILD_ROOT_ADDR); + provider.seed_root_path(ROOT_INO); let now = SystemTime::now(); let root_ino = INode { - addr: CHILD_ROOT_ADDR, + addr: ROOT_INO, permissions: InodePerms::from_bits_truncate(0o755), uid: fs_owner.0, gid: fs_owner.1, From 16f1c71087dacaafe0fd3033724d5669b1295749 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Sun, 22 Feb 2026 14:59:09 -0800 Subject: [PATCH 53/58] tickets --- lib/cache/async_backed.rs | 14 +++++++++++--- src/fs/mescloud/repo.rs | 2 +- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/lib/cache/async_backed.rs b/lib/cache/async_backed.rs index 7c406161..52c5916b 100644 --- a/lib/cache/async_backed.rs +++ b/lib/cache/async_backed.rs @@ -156,9 +156,17 @@ where /// When the factory returns `Err`, the poisoned entry is removed and the /// next caller becomes a new owner with its own factory invocation. 
This /// means failures are **not deduplicated**: under transient errors, N - /// concurrent callers may each independently invoke their factory rather - /// than coalescing on the first error. This is intentional — callers - /// may have different retry or error-handling semantics. + /// concurrent callers may each *sequentially* invoke their factory (one + /// at a time via the `entry_async` gate) rather than coalescing on the + /// first error. This is intentional — callers may have different retry + /// or error-handling semantics. + /// + /// Note: this is serial retry, not a thundering herd. Each failed owner + /// is replaced by exactly one new owner from the pool of waiters. + /// + // TODO(MES-776): consider adding a negative cache with short TTL so that + // under sustained API errors, retries are bounded to 1/TTL per key + // rather than N sequential calls. /// /// # Panic safety /// diff --git a/src/fs/mescloud/repo.rs b/src/fs/mescloud/repo.rs index 3998e0b8..d5f73a09 100644 --- a/src/fs/mescloud/repo.rs +++ b/src/fs/mescloud/repo.rs @@ -149,7 +149,7 @@ impl FsDataProvider for MesRepoProvider { InodePerms::from_bits_truncate(0o644) }; - // TODO(MES-697): Address allocation is racy — two concurrent lookups for the same + // TODO(MES-777): Address allocation is racy — two concurrent lookups for the same // child path each allocate a fresh address and insert into `path_map`, causing // the second insert to silently overwrite the first. 
The `AsyncFs` lookup cache // deduplicates in practice, but a content-addressed scheme (hash path → addr) From 66cf7495b2038ab9011e0ea8767d6095049d3178 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Sun, 22 Feb 2026 15:56:28 -0800 Subject: [PATCH 54/58] more perf --- lib/fs/async_fs.rs | 68 +++++++++++++++++++++++++++++++++++++-------- lib/fs/composite.rs | 16 +++++++++-- lib/fs/fuser.rs | 4 ++- lib/fs/mod.rs | 2 +- 4 files changed, 74 insertions(+), 16 deletions(-) diff --git a/lib/fs/async_fs.rs b/lib/fs/async_fs.rs index 7ce84817..b5f0afd6 100644 --- a/lib/fs/async_fs.rs +++ b/lib/fs/async_fs.rs @@ -15,6 +15,13 @@ use crate::fs::{ dcache::DCache, }; +/// The concrete type of the lookup cache used by [`AsyncFs`]. +/// +/// Keyed by `(parent_addr, child_name)`, valued by the resolved `INode`. +/// Exposed as a type alias so [`InodeForget`] can include it in its +/// `StatelessDrop` context without repeating the full generic signature. +pub type LookupCache = FutureBackedCache<(InodeAddr, Arc), INode>; + /// A reader for an open file, returned by [`FsDataProvider::open`]. /// /// Implementors provide the actual data for read operations. The FUSE @@ -93,17 +100,42 @@ impl StatelessDrop>, InodeAddr> for Inod } } -/// Evicts the inode from the table and the directory cache, then delegates to -/// [`FsDataProvider::forget`] so the provider can clean up its own auxiliary -/// state. +/// Evicts the inode from the table, directory cache, and lookup cache, then +/// delegates to [`FsDataProvider::forget`] so the provider can clean up its +/// own auxiliary state. +/// +/// The lookup cache cleanup (`remove_ready_if_sync`) ensures that stale +/// `(parent, name) → INode` entries do not survive after FUSE forgets an +/// inode. Without this, a subsequent `lookup` would hit the stale cache +/// entry, observe a missing inode table entry, and have to fall through to +/// the slow path — correct but wasteful. 
impl - StatelessDrop<(Arc>, Arc, DP), InodeAddr> - for InodeForget + StatelessDrop< + ( + Arc>, + Arc, + Arc, + DP, + ), + InodeAddr, + > for InodeForget { - fn delete(ctx: &(Arc>, Arc, DP), key: &InodeAddr) { + fn delete( + ctx: &( + Arc>, + Arc, + Arc, + DP, + ), + key: &InodeAddr, + ) { + let addr = *key; ctx.0.remove_sync(key); - ctx.1.evict(LoadedAddr::new_unchecked(*key)); - ctx.2.forget(*key); + ctx.1.evict(LoadedAddr::new_unchecked(addr)); + ctx.2.remove_ready_if_sync(|&(parent_addr, _), child| { + parent_addr == addr || child.addr == addr + }); + ctx.3.forget(addr); } } @@ -294,7 +326,11 @@ pub struct AsyncFs { /// Deduplicating lookup cache keyed by `(parent_addr, child_name)`. The factory is /// `dp.lookup()`, so the data provider is only called on a true cache miss. - lookup_cache: FutureBackedCache<(InodeAddr, Arc), INode>, + /// + /// Wrapped in `Arc` so that [`InodeForget`] can include it in its + /// `StatelessDrop` context and clean up stale entries when FUSE forgets + /// an inode. + lookup_cache: Arc, /// Directory entry cache, mapping `(parent, name)` to child inode address. directory_cache: Arc, @@ -322,7 +358,7 @@ impl AsyncFs { Self { inode_table, - lookup_cache: FutureBackedCache::default(), + lookup_cache: Arc::new(LookupCache::default()), directory_cache: Arc::new(DCache::new()), data_provider, next_fh: AtomicU64::new(1), @@ -341,7 +377,7 @@ impl AsyncFs { ) -> Self { Self { inode_table, - lookup_cache: FutureBackedCache::default(), + lookup_cache: Arc::new(LookupCache::default()), directory_cache: Arc::new(DCache::new()), data_provider, next_fh: AtomicU64::new(1), @@ -380,6 +416,16 @@ impl AsyncFs { Arc::clone(&self.directory_cache) } + /// Returns a clone of the lookup cache handle. + /// + /// Used by the FUSE adapter to pass the cache into the [`DropWard`] + /// context so that [`InodeForget`] can clean up stale + /// `(parent, name) → INode` entries when the kernel forgets an inode. 
+ #[must_use] + pub fn lookup_cache(&self) -> Arc { + Arc::clone(&self.lookup_cache) + } + /// Get the total number of inodes currently stored in the inode table. #[must_use] pub fn inode_count(&self) -> usize { diff --git a/lib/fs/composite.rs b/lib/fs/composite.rs index bca8125d..837b0a90 100644 --- a/lib/fs/composite.rs +++ b/lib/fs/composite.rs @@ -464,10 +464,20 @@ where // Propagate forget to the child's inode table and data provider so // inner inodes don't leak until the entire slot is GC'd. + // + // Clone the `Arc` out of the scc bucket guard before + // calling `evict`. `evict` does O(n) work (`retain_sync` on the + // lookup_cache), and holding the `slots` bucket lock for the + // entire duration would block concurrent lookups and forgets that + // hash to the same bucket. if let Some(inner_addr) = removed_inner { - self.inner.slots.read_sync(&slot_idx, |_, slot| { - slot.inner.get_fs().evict(inner_addr); - }); + let child = self + .inner + .slots + .read_sync(&slot_idx, |_, slot| Arc::clone(&slot.inner)); + if let Some(child) = child { + child.get_fs().evict(inner_addr); + } } // Now safe to remove from addr_to_slot — concurrent lookups that diff --git a/lib/fs/fuser.rs b/lib/fs/fuser.rs index b7f886be..a1e32657 100644 --- a/lib/fs/fuser.rs +++ b/lib/fs/fuser.rs @@ -73,6 +73,7 @@ type FuseWard = crate::drop_ward::DropWard< ( Arc>, Arc, + Arc, DP, ), InodeAddr, @@ -89,7 +90,8 @@ impl FuseBridgeInner { let table = Arc::new(table); let fs = super::async_fs::AsyncFs::new_preseeded(provider.clone(), Arc::clone(&table)); let dcache = fs.directory_cache(); - let ward = crate::drop_ward::DropWard::new((table, dcache, provider)); + let lookup_cache = fs.lookup_cache(); + let ward = crate::drop_ward::DropWard::new((table, dcache, lookup_cache, provider)); Self { ward, fs } } diff --git a/lib/fs/mod.rs b/lib/fs/mod.rs index b277aa23..ccefbecc 100644 --- a/lib/fs/mod.rs +++ b/lib/fs/mod.rs @@ -30,7 +30,7 @@ pub mod dcache; /// FUSE adapter: maps 
[`fuser::Filesystem`] callbacks to [`async_fs::AsyncFs`]. pub mod fuser; -pub use async_fs::{InodeForget, InodeLifecycle, OpenFile, ResolvedINode}; +pub use async_fs::{InodeForget, InodeLifecycle, LookupCache, OpenFile, ResolvedINode}; use std::ffi::OsStr; use std::time::SystemTime; From 603ee593108308e3f5a6f835e312c30a5483e048 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Mon, 23 Feb 2026 12:20:09 -0800 Subject: [PATCH 55/58] dcache correctness --- lib/fs/dcache.rs | 48 ++++++++++++++++ tests/dcache_correctness.rs | 111 ++++++++++++++++++++++++++++++++++-- 2 files changed, 154 insertions(+), 5 deletions(-) diff --git a/lib/fs/dcache.rs b/lib/fs/dcache.rs index 39c0ded0..911dad99 100644 --- a/lib/fs/dcache.rs +++ b/lib/fs/dcache.rs @@ -123,7 +123,14 @@ impl DCache { } /// Atomically inserts or overwrites a child entry in the cache. + /// + /// When re-parenting a child (the child was previously cached under a + /// different parent), the stale entry in the old parent's children map is + /// removed and the old parent's populate status is reset to `UNCLAIMED` + /// so that the next `readdir` re-fetches from the data provider. pub fn insert(&self, parent_ino: LoadedAddr, name: OsString, ino: LoadedAddr, is_dir: bool) { + self.cleanup_old_parent(parent_ino, ino); + let state = self.dir_state(parent_ino); let value = DValue { ino, is_dir }; let mut children = state @@ -140,6 +147,47 @@ impl DCache { self.child_to_parent.upsert_sync(ino, parent_ino); } + /// If `ino` is currently cached under a parent different from + /// `new_parent`, remove its stale entry from the old parent and reset + /// that parent's populate status. + /// + /// The old parent's write lock is acquired and released before `insert` + /// takes the new parent's write lock, avoiding any deadlock from + /// simultaneous two-lock holds. 
+ fn cleanup_old_parent(&self, new_parent: LoadedAddr, ino: LoadedAddr) { + let Some(old_parent) = self.child_to_parent.read_sync(&ino, |_, &v| v) else { + return; + }; + if old_parent == new_parent { + return; + } + let Some(old_name) = self.child_to_name.read_sync(&ino, |_, v| v.clone()) else { + return; + }; + let Some(old_state) = self.dirs.read_sync(&old_parent, |_, v| Arc::clone(v)) else { + return; + }; + let mut old_children = old_state + .children + .write() + .unwrap_or_else(std::sync::PoisonError::into_inner); + // Guard: only remove if the entry still maps to this inode. + // A concurrent insert may have reused the name for a different child. + if old_children.get(&old_name).is_some_and(|dv| dv.ino == ino) { + old_children.remove(&old_name); + } + drop(old_children); + // Reset populate status so the next readdir re-fetches. + old_state.generation.fetch_add(1, Ordering::Release); + let _ = old_state.populated.compare_exchange( + POPULATE_DONE, + POPULATE_UNCLAIMED, + Ordering::AcqRel, + Ordering::Relaxed, + ); + old_state.notify.notify_waiters(); + } + /// Iterate all cached children of `parent_ino` in name-sorted order. /// /// Calls `f` for each `(name, value)` pair while holding the read lock. diff --git a/tests/dcache_correctness.rs b/tests/dcache_correctness.rs index d6d16534..023695dc 100644 --- a/tests/dcache_correctness.rs +++ b/tests/dcache_correctness.rs @@ -478,9 +478,110 @@ async fn evict_with_concurrent_reparent_does_not_corrupt() { cache.lookup(parent_b, OsStr::new("bar")).is_none(), "evict should target the current parent, not a stale one" ); - // parent_a's "foo" entry should already have been cleaned up by the - // insert that re-parented the child (insert replaces the old entry - // only if the ino changes — here same ino, different parent, so - // parent_a retains a stale "foo" entry pointing to the moved child). - // This is a known limitation: insert does not cross-parent cleanup. 
+ // parent_a's "foo" entry should have been cleaned up by the insert + // that re-parented the child to parent_b. + assert!( + cache.lookup(parent_a, OsStr::new("foo")).is_none(), + "insert should clean up stale entry in old parent when re-parenting" + ); +} + +#[tokio::test] +async fn insert_reparent_removes_stale_entry_from_old_parent() { + let cache = DCache::new(); + let parent_a = LoadedAddr::new_unchecked(1); + let parent_b = LoadedAddr::new_unchecked(2); + let child = LoadedAddr::new_unchecked(10); + + cache.insert(parent_a, OsString::from("foo"), child, false); + assert!(cache.lookup(parent_a, OsStr::new("foo")).is_some()); + + // Re-parent: insert same child under parent_b. + cache.insert(parent_b, OsString::from("bar"), child, false); + + // Old parent should no longer have the stale entry. + assert!( + cache.lookup(parent_a, OsStr::new("foo")).is_none(), + "stale entry in old parent should be cleaned up on re-parent" + ); + // New parent should have the entry. + assert!(cache.lookup(parent_b, OsStr::new("bar")).is_some()); +} + +#[tokio::test] +async fn insert_reparent_resets_old_parent_populate_status() { + let cache = DCache::new(); + let parent_a = LoadedAddr::new_unchecked(1); + let parent_b = LoadedAddr::new_unchecked(2); + let child = LoadedAddr::new_unchecked(10); + + cache.insert(parent_a, OsString::from("foo"), child, false); + let PopulateStatus::Claimed(claim_gen) = cache.try_claim_populate(parent_a) else { + panic!("expected Claimed"); + }; + cache.finish_populate(parent_a, claim_gen); + assert!(matches!( + cache.try_claim_populate(parent_a), + PopulateStatus::Done + )); + + // Re-parent: insert same child under parent_b. + cache.insert(parent_b, OsString::from("bar"), child, false); + + // Old parent's populate status should be reset to allow re-fetch. 
+ assert!( + matches!( + cache.try_claim_populate(parent_a), + PopulateStatus::Claimed(_) + ), + "old parent should be re-claimable after child was re-parented away" + ); +} + +#[tokio::test] +async fn insert_reparent_does_not_remove_reused_name_in_old_parent() { + let cache = DCache::new(); + let parent_a = LoadedAddr::new_unchecked(1); + let parent_b = LoadedAddr::new_unchecked(2); + let child_1 = LoadedAddr::new_unchecked(10); + let child_2 = LoadedAddr::new_unchecked(20); + + // Insert child_1 under parent_a as "foo". + cache.insert(parent_a, OsString::from("foo"), child_1, false); + + // Replace "foo" in parent_a with a different child (child_2). + cache.insert(parent_a, OsString::from("foo"), child_2, false); + + // Now re-parent child_1 to parent_b. The old name "foo" is still in + // child_to_name for child_1, but parent_a's "foo" now points to child_2. + cache.insert(parent_b, OsString::from("bar"), child_1, false); + + // parent_a's "foo" should still point to child_2, not be removed. + let dv = cache.lookup(parent_a, OsStr::new("foo")); + assert!( + dv.is_some(), + "should not remove entry belonging to different child" + ); + assert_eq!(dv.unwrap().ino, child_2); +} + +#[tokio::test] +async fn insert_reparent_same_parent_is_noop() { + let cache = DCache::new(); + let parent = LoadedAddr::new_unchecked(1); + let child = LoadedAddr::new_unchecked(10); + + cache.insert(parent, OsString::from("foo"), child, false); + + // Re-insert under the same parent with a different name. + // This is not a re-parent, so no cross-parent cleanup should happen. + cache.insert(parent, OsString::from("bar"), child, false); + + // "bar" should exist (the new entry). + assert!(cache.lookup(parent, OsStr::new("bar")).is_some()); + // "foo" still exists because insert only cleans up cross-parent + // stale entries, not same-parent renames. The old entry under + // the same parent is a separate concern (name -> ino mapping). 
+ // The reverse index now says child_to_name[child] = "bar", + // so evict will target "bar", not "foo". } From 8201cfbe4a7b3965bbcef950dbfc2d22dd8b2c38 Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Mon, 23 Feb 2026 13:01:02 -0800 Subject: [PATCH 56/58] tests and docs --- lib/fs/dcache.rs | 73 +++++++++++++++++++++++++------------ tests/dcache_correctness.rs | 17 ++++----- 2 files changed, 56 insertions(+), 34 deletions(-) diff --git a/lib/fs/dcache.rs b/lib/fs/dcache.rs index 911dad99..a6a13460 100644 --- a/lib/fs/dcache.rs +++ b/lib/fs/dcache.rs @@ -124,12 +124,16 @@ impl DCache { /// Atomically inserts or overwrites a child entry in the cache. /// - /// When re-parenting a child (the child was previously cached under a - /// different parent), the stale entry in the old parent's children map is - /// removed and the old parent's populate status is reset to `UNCLAIMED` - /// so that the next `readdir` re-fetches from the data provider. + /// Handles two kinds of stale-entry cleanup before the insert: + /// + /// - **Cross-parent move:** the child was previously cached under a + /// different parent. The old entry is removed and the old parent's + /// populate status is reset to `UNCLAIMED`. + /// - **Same-parent rename:** the child was previously cached under this + /// parent with a different name. The old name entry is removed so that + /// `readdir` does not return two entries for the same inode. pub fn insert(&self, parent_ino: LoadedAddr, name: OsString, ino: LoadedAddr, is_dir: bool) { - self.cleanup_old_parent(parent_ino, ino); + self.cleanup_stale_entry(parent_ino, &name, ino); let state = self.dir_state(parent_ino); let value = DValue { ino, is_dir }; @@ -147,23 +151,39 @@ impl DCache { self.child_to_parent.upsert_sync(ino, parent_ino); } - /// If `ino` is currently cached under a parent different from - /// `new_parent`, remove its stale entry from the old parent and reset - /// that parent's populate status. 
+ /// Remove a stale cache entry for `ino` if it moved to a new parent or + /// was renamed within the same parent. + /// + /// For cross-parent moves the old parent's write lock is acquired and + /// released before `insert` takes the new parent's write lock, avoiding + /// deadlock from simultaneous two-lock holds. + /// + /// For same-parent renames the same lock is acquired sequentially (not + /// nested). The brief window between the two acquisitions is acceptable: + /// during initial population `readdir` is blocked by `IN_PROGRESS`, and + /// after population a concurrent `readdir` would at worst momentarily + /// miss the entry — it will reappear on the next call. + /// + /// # Concurrent same-inode inserts /// - /// The old parent's write lock is acquired and released before `insert` - /// takes the new parent's write lock, avoiding any deadlock from - /// simultaneous two-lock holds. - fn cleanup_old_parent(&self, new_parent: LoadedAddr, ino: LoadedAddr) { + /// Two concurrent `insert` calls for the same `ino` with different names + /// under the same parent can orphan an entry: both read the same stale + /// `old_name`, the first removes it, the second's guard no-ops, and both + /// inserts proceed — leaving two name→ino mappings with only the last + /// writer's name in the reverse index. This is not reachable in practice + /// because [`AsyncFs`](super::async_fs::AsyncFs) deduplicates per-inode + /// operations through [`FutureBackedCache`](crate::cache::async_backed::FutureBackedCache), + /// but callers that bypass that layer must serialize inserts per inode. 
+ fn cleanup_stale_entry(&self, new_parent: LoadedAddr, new_name: &OsStr, ino: LoadedAddr) { let Some(old_parent) = self.child_to_parent.read_sync(&ino, |_, &v| v) else { return; }; - if old_parent == new_parent { - return; - } let Some(old_name) = self.child_to_name.read_sync(&ino, |_, v| v.clone()) else { return; }; + if old_parent == new_parent && old_name.as_os_str() == new_name { + return; + } let Some(old_state) = self.dirs.read_sync(&old_parent, |_, v| Arc::clone(v)) else { return; }; @@ -177,15 +197,20 @@ impl DCache { old_children.remove(&old_name); } drop(old_children); - // Reset populate status so the next readdir re-fetches. - old_state.generation.fetch_add(1, Ordering::Release); - let _ = old_state.populated.compare_exchange( - POPULATE_DONE, - POPULATE_UNCLAIMED, - Ordering::AcqRel, - Ordering::Relaxed, - ); - old_state.notify.notify_waiters(); + // Only reset populate status for cross-parent moves. A same-parent + // rename does not invalidate the directory listing — the data + // provider returned the new name, so the cache is being corrected, + // not going stale. + if old_parent != new_parent { + old_state.generation.fetch_add(1, Ordering::Release); + let _ = old_state.populated.compare_exchange( + POPULATE_DONE, + POPULATE_UNCLAIMED, + Ordering::AcqRel, + Ordering::Relaxed, + ); + old_state.notify.notify_waiters(); + } } /// Iterate all cached children of `parent_ino` in name-sorted order. 
diff --git a/tests/dcache_correctness.rs b/tests/dcache_correctness.rs index 023695dc..f58e12a0 100644 --- a/tests/dcache_correctness.rs +++ b/tests/dcache_correctness.rs @@ -165,11 +165,11 @@ async fn upsert_overwrites_existing_entry() { #[tokio::test] async fn readdir_returns_entries_in_sorted_order() { let cache = DCache::new(); - for name in ["zebra", "apple", "mango"] { + for (i, name) in ["zebra", "apple", "mango"].iter().enumerate() { cache.insert( LoadedAddr::new_unchecked(1), - OsString::from(name), - LoadedAddr::new_unchecked(10), + OsString::from(*name), + LoadedAddr::new_unchecked(10 + i as u64), false, ); } @@ -566,7 +566,7 @@ async fn insert_reparent_does_not_remove_reused_name_in_old_parent() { } #[tokio::test] -async fn insert_reparent_same_parent_is_noop() { +async fn insert_reparent_same_parent_removes_old_name() { let cache = DCache::new(); let parent = LoadedAddr::new_unchecked(1); let child = LoadedAddr::new_unchecked(10); @@ -574,14 +574,11 @@ async fn insert_reparent_same_parent_is_noop() { cache.insert(parent, OsString::from("foo"), child, false); // Re-insert under the same parent with a different name. - // This is not a re-parent, so no cross-parent cleanup should happen. cache.insert(parent, OsString::from("bar"), child, false); // "bar" should exist (the new entry). assert!(cache.lookup(parent, OsStr::new("bar")).is_some()); - // "foo" still exists because insert only cleans up cross-parent - // stale entries, not same-parent renames. The old entry under - // the same parent is a separate concern (name -> ino mapping). - // The reverse index now says child_to_name[child] = "bar", - // so evict will target "bar", not "foo". + // "foo" must be gone — otherwise readdir would return two entries + // pointing to the same child inode. 
+    assert!(cache.lookup(parent, OsStr::new("foo")).is_none());
 }

From 40447f48c80058031d00117b724e913335cdf1ce Mon Sep 17 00:00:00 2001
From: Marko Vejnovic
Date: Mon, 23 Feb 2026 14:18:07 -0800
Subject: [PATCH 57/58] more fixes

---
 lib/fs/async_fs.rs            | 196 ++++++++++++++++++++++++++++----
 lib/fs/composite.rs           |   9 +-
 lib/fs/dcache.rs              |  80 +++++++++----
 lib/fs/fuser.rs               |   2 +-
 lib/fs/mod.rs                 |   4 +-
 tests/async_fs_correctness.rs | 208 +++++++++++++++++++++++++++++++++-
 tests/composite_fs_tests.rs   | 127 ++++++++++++++++++++-
 tests/dcache_correctness.rs   | 160 +++++++++++++++++++++++++-
 8 files changed, 739 insertions(+), 47 deletions(-)

diff --git a/lib/fs/async_fs.rs b/lib/fs/async_fs.rs
index b5f0afd6..f60b9af1 100644
--- a/lib/fs/async_fs.rs
+++ b/lib/fs/async_fs.rs
@@ -22,6 +22,167 @@ use crate::fs::{
 /// `StatelessDrop` context without repeating the full generic signature.
 pub type LookupCache = FutureBackedCache<(InodeAddr, Arc<OsStr>), INode>;
+type LookupKey = (InodeAddr, Arc<OsStr>);
+
+/// A reverse-index entry: the lookup-cache key plus the child inode addr
+/// that the key resolved to. Storing the child addr allows [`evict_addr`]
+/// to clean both the parent and child sides of the reverse index without
+/// needing to read the (already-removed) cache value.
+type ReverseEntry = (LookupKey, InodeAddr);
+
+/// Wraps a [`LookupCache`] with a reverse index for O(k) eviction.
+///
+/// The reverse index maps each `InodeAddr` to the set of lookup-cache
+/// keys that reference it (either as parent or as child). This avoids
+/// the O(N) `retain_sync` scan that would otherwise be required when
+/// evicting a single inode.
+///
+/// Unlike [`DCache`](super::dcache::DCache)'s 1:1 `child_to_parent`
+/// reverse index (where each child has exactly one parent), the lookup
+/// cache maps one inode address to *multiple* cache keys (because an
+/// inode can appear as both a parent and a child in different entries).
+/// Hence we use `Vec` rather than a flat map.
+pub struct IndexedLookupCache {
+    cache: LookupCache,
+    /// addr → set of `(key, child_addr)` pairs where `addr` appears as
+    /// parent or child.
+    reverse: scc::HashMap<InodeAddr, Vec<ReverseEntry>>,
+}
+
+impl Default for IndexedLookupCache {
+    fn default() -> Self {
+        Self {
+            cache: LookupCache::default(),
+            reverse: scc::HashMap::new(),
+        }
+    }
+}
+
+impl IndexedLookupCache {
+    /// Delegate to the inner cache's `get_or_try_init`, then record the
+    /// result in the reverse index.
+    pub async fn get_or_try_init<F, Fut, E>(
+        &self,
+        key: LookupKey,
+        factory: F,
+    ) -> Result<INode, E>
+    where
+        F: FnOnce() -> Fut,
+        Fut: Future<Output = Result<INode, E>> + Send + 'static,
+    {
+        let child = self.cache.get_or_try_init(key.clone(), factory).await?;
+        self.index_entry(&key, child.addr);
+        Ok(child)
+    }
+
+    /// Remove a single key from the cache and its reverse-index entries.
+    pub fn remove_sync(&self, key: &LookupKey) {
+        if self.cache.remove_sync(key) {
+            self.deindex_key(key);
+        }
+    }
+
+    /// Remove all lookup-cache entries referencing `addr` (as parent or child).
+    ///
+    /// O(k) where k is the number of entries referencing `addr`, vs the
+    /// previous O(N) scan over the entire cache.
+    pub fn evict_addr(&self, addr: InodeAddr) {
+        let entries = self
+            .reverse
+            .remove_sync(&addr)
+            .map(|(_, entries)| entries)
+            .unwrap_or_default();
+
+        for (key, child_addr) in &entries {
+            self.cache.remove_sync(key);
+            let (parent_addr, _) = key;
+            // Clean the *other* side(s) of the reverse index.
+            // We removed `addr`'s Vec already; now prune the key from
+            // whichever other addrs it was indexed under.
+            if *parent_addr != addr {
+                self.reverse.update_sync(parent_addr, |_, v| {
+                    v.retain(|(k, _)| k != key);
+                });
+            }
+            if *child_addr != addr && *child_addr != *parent_addr {
+                self.reverse.update_sync(child_addr, |_, v| {
+                    v.retain(|(k, _)| k != key);
+                });
+            }
+        }
+    }
+
+    /// Record a lookup entry in the reverse index for both parent and child addrs.
+ /// + /// Deduplicates: if the key is already present in the `Vec` for a given + /// addr, the push is skipped. This prevents unbounded growth when the + /// same key is looked up repeatedly (cache hits still call this method + /// because the `FutureBackedCache` joiner path returns without + /// distinguishing hits from misses). + fn index_entry(&self, key: &LookupKey, child_addr: InodeAddr) { + let entry = (key.clone(), child_addr); + let (parent_addr, _) = key; + // Index under parent addr (deduplicated). + self.reverse + .entry_sync(*parent_addr) + .or_default() + .get_mut() + .dedup_push(&entry); + // Index under child addr (if different from parent, deduplicated). + if child_addr != *parent_addr { + self.reverse + .entry_sync(child_addr) + .or_default() + .get_mut() + .dedup_push(&entry); + } + } + + /// Returns the number of entries in the reverse-index `Vec` for `addr`. + /// + /// Intended for testing only — verifies that the reverse index stays + /// bounded and does not accumulate duplicates. + #[doc(hidden)] + #[must_use] + pub fn reverse_entry_count(&self, addr: InodeAddr) -> usize { + self.reverse + .read_sync(&addr, |_, entries| entries.len()) + .unwrap_or(0) + } + + /// Remove a single key's entries from the reverse index. + /// + /// Cleans the parent side. The child side cannot be cleaned here because + /// the cache value (which held the child addr) has already been removed. + /// This is acceptable: orphaned child-side entries are harmless (they + /// reference a key that no longer exists in the cache) and are cleaned + /// up when the child addr is eventually evicted via [`evict_addr`]. + fn deindex_key(&self, key: &LookupKey) { + let (parent_addr, _) = key; + self.reverse.update_sync(parent_addr, |_, entries| { + entries.retain(|(k, _)| k != key); + }); + } +} + +/// Extension trait for `Vec` to push only if the element is not already present. 
+trait DedupPush { + fn dedup_push(&mut self, item: &T) + where + T: Clone; +} + +impl DedupPush for Vec { + fn dedup_push(&mut self, item: &T) + where + T: Clone, + { + if !self.contains(item) { + self.push(item.clone()); + } + } +} + /// A reader for an open file, returned by [`FsDataProvider::open`]. /// /// Implementors provide the actual data for read operations. The FUSE @@ -104,17 +265,15 @@ impl StatelessDrop>, InodeAddr> for Inod /// delegates to [`FsDataProvider::forget`] so the provider can clean up its /// own auxiliary state. /// -/// The lookup cache cleanup (`remove_ready_if_sync`) ensures that stale -/// `(parent, name) → INode` entries do not survive after FUSE forgets an -/// inode. Without this, a subsequent `lookup` would hit the stale cache -/// entry, observe a missing inode table entry, and have to fall through to -/// the slow path — correct but wasteful. +/// The lookup cache cleanup removes all entries referencing the forgotten +/// inode (as parent or child) via the [`IndexedLookupCache`]'s reverse +/// index, ensuring O(k) eviction instead of O(N) full-cache scan. impl StatelessDrop< ( Arc>, Arc, - Arc, + Arc, DP, ), InodeAddr, @@ -124,7 +283,7 @@ impl ctx: &( Arc>, Arc, - Arc, + Arc, DP, ), key: &InodeAddr, @@ -132,9 +291,7 @@ impl let addr = *key; ctx.0.remove_sync(key); ctx.1.evict(LoadedAddr::new_unchecked(addr)); - ctx.2.remove_ready_if_sync(|&(parent_addr, _), child| { - parent_addr == addr || child.addr == addr - }); + ctx.2.evict_addr(addr); ctx.3.forget(addr); } } @@ -327,10 +484,10 @@ pub struct AsyncFs { /// Deduplicating lookup cache keyed by `(parent_addr, child_name)`. The factory is /// `dp.lookup()`, so the data provider is only called on a true cache miss. /// - /// Wrapped in `Arc` so that [`InodeForget`] can include it in its - /// `StatelessDrop` context and clean up stale entries when FUSE forgets - /// an inode. 
-    lookup_cache: Arc<LookupCache>,
+    /// Uses [`IndexedLookupCache`] with a reverse index for O(k) eviction
+    /// instead of O(N) full-cache scans. Wrapped in `Arc` so that
+    /// [`InodeForget`] can include it in its `StatelessDrop` context.
+    lookup_cache: Arc<IndexedLookupCache>,
 
     /// Directory entry cache, mapping `(parent, name)` to child inode address.
     directory_cache: Arc<DCache>,
@@ -358,7 +515,7 @@ impl AsyncFs {
 
         Self {
             inode_table,
-            lookup_cache: Arc::new(LookupCache::default()),
+            lookup_cache: Arc::new(IndexedLookupCache::default()),
             directory_cache: Arc::new(DCache::new()),
             data_provider,
             next_fh: AtomicU64::new(1),
@@ -377,7 +534,7 @@ impl AsyncFs {
     ) -> Self {
         Self {
             inode_table,
-            lookup_cache: Arc::new(LookupCache::default()),
+            lookup_cache: Arc::new(IndexedLookupCache::default()),
             directory_cache: Arc::new(DCache::new()),
             data_provider,
             next_fh: AtomicU64::new(1),
@@ -422,7 +579,7 @@ impl AsyncFs {
     /// context so that [`InodeForget`] can clean up stale
     /// `(parent, name) → INode` entries when the kernel forgets an inode.
     #[must_use]
-    pub fn lookup_cache(&self) -> Arc<LookupCache> {
+    pub fn lookup_cache(&self) -> Arc<IndexedLookupCache> {
         Arc::clone(&self.lookup_cache)
     }
 
@@ -562,10 +719,7 @@ impl AsyncFs {
     pub fn evict(&self, addr: InodeAddr) {
         self.inode_table.remove_sync(&addr);
         self.directory_cache.evict(LoadedAddr::new_unchecked(addr));
-        self.lookup_cache
-            .remove_ready_if_sync(|&(parent_addr, _), child| {
-                parent_addr == addr || child.addr == addr
-            });
+        self.lookup_cache.evict_addr(addr);
         self.data_provider.forget(addr);
     }
 
diff --git a/lib/fs/composite.rs b/lib/fs/composite.rs
index 837b0a90..fc1743c4 100644
--- a/lib/fs/composite.rs
+++ b/lib/fs/composite.rs
@@ -508,7 +508,14 @@ where
                     .slots
                     .remove_if_sync(&slot_idx, |slot| slot.bridge.is_empty_locked());
                 if let Some((_, slot)) = removed {
-                    self.inner.name_to_slot.remove_sync(&slot.name);
+                    // Guard: only remove from name_to_slot if it still
+                    // points to this slot.
A concurrent `register_child` + // may have replaced it with a new slot index for the + // same child name; removing unconditionally would + // orphan the replacement slot. + self.inner + .name_to_slot + .remove_if_sync(&slot.name, |idx| *idx == slot_idx); } } } diff --git a/lib/fs/dcache.rs b/lib/fs/dcache.rs index a6a13460..65eb9b51 100644 --- a/lib/fs/dcache.rs +++ b/lib/fs/dcache.rs @@ -281,8 +281,17 @@ impl DCache { .read() .unwrap_or_else(std::sync::PoisonError::into_inner); for dv in children.values() { - self.child_to_parent.remove_sync(&dv.ino); - self.child_to_name.remove_sync(&dv.ino); + // Only remove reverse-index entries that still point to this + // parent. A concurrent `insert` may have re-parented the child + // to a new parent between `dirs.remove_sync` and this cleanup; + // removing unconditionally would clobber the new mapping. + let removed = self + .child_to_parent + .remove_if_sync(&dv.ino, |v| *v == parent_ino) + .is_some(); + if removed { + self.child_to_name.remove_sync(&dv.ino); + } } true } else { @@ -297,11 +306,13 @@ impl DCache { /// flag to `UNCLAIMED` so the next `readdir` re-fetches from the /// data provider. /// - /// The reset uses `compare_exchange(DONE -> UNCLAIMED)` rather than a - /// blind store to avoid a race with an in-flight populate: if a - /// concurrent `readdir` is mid-populate (`IN_PROGRESS`), a blind store - /// of `UNCLAIMED` would be overwritten by the populator's final `DONE` - /// store, leaving the cache in a stale-but-marked-done state. + /// The reset attempts CAS on both `DONE -> UNCLAIMED` and + /// `IN_PROGRESS -> UNCLAIMED`. The `IN_PROGRESS` case handles eviction + /// that occurs while a concurrent `readdir` is mid-populate: the + /// populator's subsequent `finish_populate` will observe a generation + /// mismatch and store `UNCLAIMED`, but resetting here closes the window + /// where waiters would see a stuck `IN_PROGRESS` flag between the + /// eviction and the populate completion. 
/// /// # Ordering with concurrent `insert` /// @@ -345,16 +356,32 @@ impl DCache { if let Some((_, name)) = self.child_to_name.remove_sync(&child_ino) { children.remove(&name); } - drop(children); - // Bump generation so any in-flight populate knows its data is stale. + // Bump generation and reset populate status while still holding the + // write lock. This prevents a concurrent `finish_populate` from + // reading the stale generation and setting DONE between the child + // removal and the generation bump. state.generation.fetch_add(1, Ordering::Release); - // Reset DONE -> UNCLAIMED so the next readdir re-fetches. - let _ = state.populated.compare_exchange( - POPULATE_DONE, - POPULATE_UNCLAIMED, - Ordering::AcqRel, - Ordering::Relaxed, - ); + // Reset to UNCLAIMED so the next readdir re-fetches. Try both + // DONE and IN_PROGRESS: eviction during an in-flight populate + // must also reset the flag so waiters are not stuck. + if state + .populated + .compare_exchange( + POPULATE_DONE, + POPULATE_UNCLAIMED, + Ordering::AcqRel, + Ordering::Relaxed, + ) + .is_err() + { + let _ = state.populated.compare_exchange( + POPULATE_IN_PROGRESS, + POPULATE_UNCLAIMED, + Ordering::AcqRel, + Ordering::Relaxed, + ); + } + drop(children); state.notify.notify_waiters(); } @@ -386,14 +413,27 @@ impl DCache { /// `claimed_gen` is the generation returned by [`try_claim_populate`]. If /// an [`evict`](Self::evict) bumped the generation since then, the data /// is stale so the flag is reset to `UNCLAIMED` instead of `DONE`. + /// + /// Uses CAS (`IN_PROGRESS -> target`) rather than a plain store so + /// that if a concurrent [`evict`](Self::evict) already reset the flag + /// to `UNCLAIMED`, this does not overwrite that correction. 
pub fn finish_populate(&self, parent_ino: LoadedAddr, claimed_gen: u64) { let state = self.dir_state(parent_ino); let current_gen = state.generation.load(Ordering::Acquire); - if current_gen == claimed_gen { - state.populated.store(POPULATE_DONE, Ordering::Release); + let target = if current_gen == claimed_gen { + POPULATE_DONE } else { - state.populated.store(POPULATE_UNCLAIMED, Ordering::Release); - } + POPULATE_UNCLAIMED + }; + // CAS: only transition from IN_PROGRESS. If evict already reset + // to UNCLAIMED, this fails harmlessly — the correct state is + // already in place. + let _ = state.populated.compare_exchange( + POPULATE_IN_PROGRESS, + target, + Ordering::AcqRel, + Ordering::Relaxed, + ); state.notify.notify_waiters(); } diff --git a/lib/fs/fuser.rs b/lib/fs/fuser.rs index a1e32657..0d5852ee 100644 --- a/lib/fs/fuser.rs +++ b/lib/fs/fuser.rs @@ -73,7 +73,7 @@ type FuseWard = crate::drop_ward::DropWard< ( Arc>, Arc, - Arc, + Arc, DP, ), InodeAddr, diff --git a/lib/fs/mod.rs b/lib/fs/mod.rs index ccefbecc..88eadb7f 100644 --- a/lib/fs/mod.rs +++ b/lib/fs/mod.rs @@ -30,7 +30,9 @@ pub mod dcache; /// FUSE adapter: maps [`fuser::Filesystem`] callbacks to [`async_fs::AsyncFs`]. 
pub mod fuser; -pub use async_fs::{InodeForget, InodeLifecycle, LookupCache, OpenFile, ResolvedINode}; +pub use async_fs::{ + IndexedLookupCache, InodeForget, InodeLifecycle, LookupCache, OpenFile, ResolvedINode, +}; use std::ffi::OsStr; use std::time::SystemTime; diff --git a/tests/async_fs_correctness.rs b/tests/async_fs_correctness.rs index 5a32c114..2f4cff79 100644 --- a/tests/async_fs_correctness.rs +++ b/tests/async_fs_correctness.rs @@ -1,4 +1,10 @@ -#![allow(clippy::unwrap_used, clippy::expect_used, missing_docs)] +#![allow( + clippy::unwrap_used, + clippy::expect_used, + clippy::doc_markdown, + clippy::similar_names, + missing_docs +)] mod common; @@ -873,3 +879,203 @@ async fn readdir_evict_all_readdir_returns_same_entries() { "readdir after evict should return same entries" ); } + +/// Verify that `IndexedLookupCache::evict_addr` removes only entries +/// referencing the evicted inode (parent or child), leaving unrelated +/// entries intact. This is the O(k) replacement for the old O(N) scan. +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn indexed_lookup_cache_evict_removes_only_related_entries() { + use git_fs::fs::async_fs::IndexedLookupCache; + + let cache = IndexedLookupCache::default(); + + let parent_a: u64 = 1; + let parent_b: u64 = 2; + let child_x: u64 = 10; + let child_y: u64 = 11; + let child_z: u64 = 12; + + let inode_x = make_inode(child_x, INodeType::File, 100, Some(parent_a)); + let inode_y = make_inode(child_y, INodeType::File, 200, Some(parent_a)); + let inode_z = make_inode(child_z, INodeType::File, 300, Some(parent_b)); + + // Populate: parent_a has children x and y, parent_b has child z. 
+ let key_ax: (u64, Arc) = (parent_a, Arc::from(OsStr::new("x"))); + let key_ay: (u64, Arc) = (parent_a, Arc::from(OsStr::new("y"))); + let key_bz: (u64, Arc) = (parent_b, Arc::from(OsStr::new("z"))); + + let ix = inode_x; + cache + .get_or_try_init(key_ax.clone(), move || async move { Ok(ix) }) + .await + .unwrap(); + let iy = inode_y; + cache + .get_or_try_init(key_ay.clone(), move || async move { Ok(iy) }) + .await + .unwrap(); + let iz = inode_z; + cache + .get_or_try_init(key_bz.clone(), move || async move { Ok(iz) }) + .await + .unwrap(); + + // Evict child_x. This should remove key_ax but leave key_ay and key_bz. + cache.evict_addr(child_x); + + // key_ax should be gone (child_x was evicted). + let result = cache + .get_or_try_init(key_ax.clone(), move || async move { + Ok(make_inode(child_x, INodeType::File, 999, Some(parent_a))) + }) + .await + .unwrap(); + assert_eq!(result.size, 999, "key_ax should have been re-fetched"); + + // key_ay should still be cached (different child). + let result = cache + .get_or_try_init(key_ay.clone(), || async { + panic!("factory should not be called for cached key_ay") + }) + .await + .unwrap(); + assert_eq!(result.size, 200, "key_ay should still be cached"); + + // key_bz should still be cached (different parent and child). + let result = cache + .get_or_try_init(key_bz.clone(), || async { + panic!("factory should not be called for cached key_bz") + }) + .await + .unwrap(); + assert_eq!(result.size, 300, "key_bz should still be cached"); + + // Evict parent_a. This should remove all entries with parent_a (key_ax, key_ay). + cache.evict_addr(parent_a); + + let result = cache + .get_or_try_init(key_ay.clone(), move || async move { + Ok(make_inode(child_y, INodeType::File, 888, Some(parent_a))) + }) + .await + .unwrap(); + assert_eq!( + result.size, 888, + "key_ay should have been re-fetched after parent eviction" + ); + + // key_bz should still be cached. 
+ let result = cache + .get_or_try_init(key_bz.clone(), || async { + panic!("factory should not be called for cached key_bz after parent_a eviction") + }) + .await + .unwrap(); + assert_eq!(result.size, 300, "key_bz should still be cached"); +} + +/// H1: Evicting a parent addr orphans the child-side reverse-index entries. +/// A subsequent evict of the old child then spuriously deletes a live cache +/// entry that was re-inserted under the same key with a different child. +/// +/// Sequence: +/// 1. Insert (P, "foo") -> C1. Reverse: P=[key], C1=[key]. +/// 2. evict_addr(P): removes P's entries, cleans cache. But since key's +/// parent_addr == P == addr, the child side (C1) is NOT cleaned. +/// Reverse: C1=[orphaned key]. +/// 3. Insert (P, "foo") -> C2 (new child). Reverse: P=[key], C2=[key], +/// C1=[orphaned key still referencing (P, "foo")]. +/// 4. evict_addr(C1): finds orphaned (P, "foo") in C1's reverse list, +/// calls cache.remove_sync((P, "foo")) — deletes the LIVE C2 entry. +/// 5. Lookup (P, "foo") should return C2 (cached), but instead triggers +/// the factory (cache miss). +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn evict_parent_then_old_child_must_not_delete_repopulated_entry() { + use git_fs::fs::async_fs::IndexedLookupCache; + + let cache = IndexedLookupCache::default(); + let parent: u64 = 1; + let child_c1: u64 = 10; + let child_c2: u64 = 20; + + let key: (u64, Arc) = (parent, Arc::from(OsStr::new("foo"))); + + // Step 1: Insert (P, "foo") -> C1. + let c1 = make_inode(child_c1, INodeType::File, 100, Some(parent)); + cache + .get_or_try_init(key.clone(), move || async move { Ok(c1) }) + .await + .unwrap(); + + // Step 2: Evict the parent. + cache.evict_addr(parent); + + // Step 3: Re-insert (P, "foo") -> C2 (different child). + let c2 = make_inode(child_c2, INodeType::File, 200, Some(parent)); + cache + .get_or_try_init(key.clone(), move || async move { Ok(c2) }) + .await + .unwrap(); + + // Step 4: Evict old child C1. 
If orphaned reverse entries exist, + // this spuriously removes the live (P, "foo") -> C2 entry. + cache.evict_addr(child_c1); + + // Step 5: The live entry (P, "foo") -> C2 must still be cached. + let result = cache + .get_or_try_init(key.clone(), || async { + panic!("factory must NOT be called — (P, \"foo\") -> C2 should still be cached") + }) + .await + .unwrap(); + assert_eq!( + result.size, 200, + "live entry C2 should not have been spuriously evicted" + ); +} + +/// H2: The reverse-index Vec grows unbounded because `index_entry` is +/// called on every `get_or_try_init` hit, not just on cache misses. +/// After N lookups of the same key, the reverse-index should still +/// contain only 1 entry per addr side (not N duplicates). +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn repeated_lookups_must_not_grow_reverse_index() { + use git_fs::fs::async_fs::IndexedLookupCache; + + let cache = IndexedLookupCache::default(); + let parent: u64 = 1; + let child: u64 = 10; + + let key: (u64, Arc) = (parent, Arc::from(OsStr::new("foo"))); + let inode = make_inode(child, INodeType::File, 42, Some(parent)); + + // First call: cache miss, factory runs. + let i = inode; + cache + .get_or_try_init(key.clone(), move || async move { Ok(i) }) + .await + .unwrap(); + + // 50 more calls: all cache hits, factory never called. + for _ in 0..50 { + cache + .get_or_try_init(key.clone(), || async { + panic!("factory should not be called on cache hit") + }) + .await + .unwrap(); + } + + // The reverse index for parent and child should each have exactly 1 + // entry, not 51. 
+ assert_eq!( + cache.reverse_entry_count(parent), + 1, + "parent reverse-index should have exactly 1 entry, not one per lookup" + ); + assert_eq!( + cache.reverse_entry_count(child), + 1, + "child reverse-index should have exactly 1 entry, not one per lookup" + ); +} diff --git a/tests/composite_fs_tests.rs b/tests/composite_fs_tests.rs index dad0255a..99b88276 100644 --- a/tests/composite_fs_tests.rs +++ b/tests/composite_fs_tests.rs @@ -1,4 +1,9 @@ -#![allow(clippy::unwrap_used, clippy::expect_used, missing_docs)] +#![allow( + clippy::unwrap_used, + clippy::expect_used, + clippy::doc_markdown, + missing_docs +)] mod common; @@ -395,3 +400,123 @@ async fn composite_forget_cleans_up_slot_and_name_mapping() { assert_eq!(re_resolved.inode.itype, INodeType::Directory); // The new address may differ from the original (fresh slot allocated). } + +/// Regression test for C1: forget must not remove a name_to_slot entry +/// that was replaced by a concurrent register_child. +/// +/// Scenario: slot S1 for "repo" is GC'd by forget (bridge empty), then +/// a new lookup creates slot S2 for the same name. A stale forget that +/// still references S1's slot index must NOT remove the name_to_slot +/// entry pointing to S2. +/// +/// We simulate this by: +/// 1. Looking up "repo" to create slot S1 +/// 2. Forgetting all addresses to trigger slot GC (name_to_slot entry removed) +/// 3. Re-looking up "repo" to create slot S2 (name_to_slot entry re-created) +/// 4. Verifying a further re-lookup still works (name_to_slot entry intact) +/// +/// Before the fix, step 2's forget could race with step 3's register_child +/// and destroy the replacement entry. 
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn composite_forget_does_not_destroy_replacement_name_to_slot_entry() { + let (provider, root_ino) = make_child_provider(100, &[("file.txt", 101, INodeType::File, 42)]); + + let mut children = HashMap::new(); + children.insert(OsString::from("repo"), (provider, root_ino)); + + let mock_root = MockRoot::new(children); + let composite = CompositeFs::new(mock_root, (1000, 1000)); + let root_inode = composite.make_root_inode(); + + let table = Arc::new(FutureBackedCache::default()); + table.insert_sync(1, root_inode); + let afs = AsyncFs::new_preseeded(composite.clone(), Arc::clone(&table)); + + // Step 1: establish slot S1 + let child_dir = afs + .lookup(LoadedAddr::new_unchecked(1), OsStr::new("repo")) + .await + .unwrap(); + let s1_child_addr = child_dir.inode.addr; + + let file = afs + .lookup( + LoadedAddr::new_unchecked(s1_child_addr), + OsStr::new("file.txt"), + ) + .await + .unwrap(); + let s1_file_addr = file.inode.addr; + + // Step 2: forget all S1 addresses → slot GC + composite.forget(s1_file_addr); + composite.forget(s1_child_addr); + + // Step 3: re-lookup "repo" → creates slot S2 + let re_resolved = afs + .lookup(LoadedAddr::new_unchecked(1), OsStr::new("repo")) + .await + .unwrap(); + let s2_child_addr = re_resolved.inode.addr; + assert_eq!(re_resolved.inode.itype, INodeType::Directory); + + // Step 4: another lookup must succeed — name_to_slot must still point to S2. + // Before the fix, if forget's stale remove_sync destroyed S2's entry, + // this lookup would create a *third* slot instead of reusing S2, or + // worse, the name_to_slot entry would be missing. 
+ let third_lookup = afs + .lookup(LoadedAddr::new_unchecked(1), OsStr::new("repo")) + .await + .unwrap(); + assert_eq!( + third_lookup.inode.addr, s2_child_addr, + "repeated lookup after forget+re-register should return the same slot S2 address" + ); +} + +/// Test that concurrent forget + lookup on the same child name does not +/// orphan the replacement slot. +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +async fn composite_concurrent_forget_and_lookup_preserves_name_mapping() { + let (provider, root_ino) = make_child_provider(100, &[("a.txt", 101, INodeType::File, 1)]); + + let mut children = HashMap::new(); + children.insert(OsString::from("repo"), (provider, root_ino)); + + let mock_root = MockRoot::new(children); + let composite = CompositeFs::new(mock_root, (1000, 1000)); + let root_inode = composite.make_root_inode(); + + // Run multiple rounds of forget-then-re-lookup to stress the race window. + for _ in 0..50 { + let table = Arc::new(FutureBackedCache::default()); + table.insert_sync(1, root_inode); + let afs = AsyncFs::new_preseeded(composite.clone(), Arc::clone(&table)); + + // Establish the child. + let child = afs + .lookup(LoadedAddr::new_unchecked(1), OsStr::new("repo")) + .await + .unwrap(); + let child_addr = child.inode.addr; + + let file = afs + .lookup(LoadedAddr::new_unchecked(child_addr), OsStr::new("a.txt")) + .await + .unwrap(); + + // Forget everything so the slot is GC'd. + composite.forget(file.inode.addr); + composite.forget(child_addr); + + // Re-lookup: must always succeed. 
+ let re = afs + .lookup(LoadedAddr::new_unchecked(1), OsStr::new("repo")) + .await; + assert!( + re.is_ok(), + "re-lookup after forget should always succeed, got: {:?}", + re.err() + ); + } +} diff --git a/tests/dcache_correctness.rs b/tests/dcache_correctness.rs index f58e12a0..a91be439 100644 --- a/tests/dcache_correctness.rs +++ b/tests/dcache_correctness.rs @@ -1,4 +1,4 @@ -#![allow(clippy::unwrap_used, missing_docs)] +#![allow(clippy::unwrap_used, clippy::doc_markdown, missing_docs)] use std::ffi::{OsStr, OsString}; @@ -565,6 +565,87 @@ async fn insert_reparent_does_not_remove_reused_name_in_old_parent() { assert_eq!(dv.unwrap().ino, child_2); } +/// Regression test for H1: evict during IN_PROGRESS must reset populate +/// status so finish_populate stores UNCLAIMED (not DONE). +/// +/// Scenario: +/// 1. Populate is claimed (state = IN_PROGRESS, claim_gen = 0) +/// 2. Evict fires while IN_PROGRESS — bumps generation to 1 AND resets +/// state back to UNCLAIMED +/// 3. finish_populate(claim_gen=0) sees generation mismatch → stores UNCLAIMED +/// +/// Before the fix, evict only attempted CAS(DONE -> UNCLAIMED) which +/// failed because state was IN_PROGRESS. The state stayed IN_PROGRESS, +/// and finish_populate with stale gen correctly stored UNCLAIMED via +/// the generation check. But the populate status was stuck for any +/// waiter that checked between evict and finish_populate. +/// +/// After the fix, evict also resets IN_PROGRESS -> UNCLAIMED, closing +/// the window. +#[tokio::test] +async fn evict_during_in_progress_resets_populate_status() { + let cache = DCache::new(); + let parent = LoadedAddr::new_unchecked(1); + let child = LoadedAddr::new_unchecked(10); + cache.insert(parent, OsString::from("foo"), child, false); + + // Claim populate (state → IN_PROGRESS). + let PopulateStatus::Claimed(_claim_gen) = cache.try_claim_populate(parent) else { + panic!("expected Claimed"); + }; + + // Evict while IN_PROGRESS. 
+ cache.evict(child); + + // After the fix, evict should have reset IN_PROGRESS → UNCLAIMED. + // A new claim attempt should succeed (not return InProgress). + match cache.try_claim_populate(parent) { + PopulateStatus::Claimed(_) => { /* correct: evict reset to UNCLAIMED */ } + PopulateStatus::InProgress => { + panic!( + "BUG: evict during IN_PROGRESS failed to reset populate status; \ + state is stuck at IN_PROGRESS" + ); + } + PopulateStatus::Done => { + panic!("BUG: state should not be DONE — nobody called finish_populate"); + } + } +} + +/// Regression test: evict during IN_PROGRESS followed by finish_populate +/// with stale generation must leave the directory re-claimable. +#[tokio::test] +async fn evict_during_in_progress_then_finish_populate_stays_unclaimed() { + let cache = DCache::new(); + let parent = LoadedAddr::new_unchecked(1); + let child_a = LoadedAddr::new_unchecked(10); + let child_b = LoadedAddr::new_unchecked(11); + cache.insert(parent, OsString::from("a"), child_a, false); + cache.insert(parent, OsString::from("b"), child_b, false); + + // Step 1: claim populate. + let PopulateStatus::Claimed(claim_gen) = cache.try_claim_populate(parent) else { + panic!("expected Claimed"); + }; + + // Step 2: evict during IN_PROGRESS. After the fix, this resets to + // UNCLAIMED and bumps the generation. + cache.evict(child_a); + + // Step 3: finish_populate with the stale generation. Since evict + // already reset to UNCLAIMED, finish_populate's CAS on IN_PROGRESS + // may fail (already UNCLAIMED) — which is correct. Or if the + // generation mismatch is detected, it stores UNCLAIMED. + cache.finish_populate(parent, claim_gen); + + // The directory must be re-claimable (not stuck in DONE with stale data). 
+ assert!( + matches!(cache.try_claim_populate(parent), PopulateStatus::Claimed(_)), + "directory should be re-claimable after evict invalidated the in-flight populate" + ); +} + #[tokio::test] async fn insert_reparent_same_parent_removes_old_name() { let cache = DCache::new(); @@ -582,3 +663,80 @@ async fn insert_reparent_same_parent_removes_old_name() { // pointing to the same child inode. assert!(cache.lookup(parent, OsStr::new("foo")).is_none()); } + +/// M1: `remove_parent` should only remove reverse-index entries for children +/// that still belong to the removed parent. If a child was concurrently +/// re-inserted under a *new* parent's DirState (created after the old one was +/// removed), remove_parent must not clobber the new reverse-index entries. +#[tokio::test] +async fn remove_parent_does_not_clobber_concurrent_reinsert_reverse_index() { + let cache = DCache::new(); + let parent_a = LoadedAddr::new_unchecked(1); + let parent_b = LoadedAddr::new_unchecked(2); + let child = LoadedAddr::new_unchecked(10); + + // Populate parent_a with child. + cache.insert(parent_a, OsString::from("child"), child, false); + + // Now simulate the race: first, re-insert child under parent_b. + cache.insert(parent_b, OsString::from("child"), child, false); + + // Then remove parent_a (which no longer owns the child, since insert + // already moved the reverse index to parent_b). + cache.remove_parent(parent_a); + + // The child should still be discoverable via evict (which uses the + // reverse index to find the parent). If remove_parent clobbered the + // reverse index, evict won't find anything. + cache.evict(child); + + // After evict, lookup under parent_b should return None (evicted). 
+ assert!( + cache.lookup(parent_b, OsStr::new("child")).is_none(), + "evict should have removed child from parent_b via reverse index" + ); +} + +/// M2: The generation bump in `evict` must be visible before the write lock +/// is released, so that a concurrent `finish_populate` sees the new +/// generation and resets to UNCLAIMED instead of setting DONE. +/// +/// This test exercises the interleaving: +/// 1. Claim populate (gen=0), begin populating. +/// 2. Evict bumps gen to 1, resets IN_PROGRESS to UNCLAIMED. +/// 3. finish_populate with claimed_gen=0 — must NOT set DONE because +/// the CAS from IN_PROGRESS fails (already UNCLAIMED from evict). +/// 4. A subsequent try_claim_populate must succeed (UNCLAIMED, not DONE). +#[tokio::test] +async fn evict_generation_bump_prevents_stale_finish_populate() { + let cache = DCache::new(); + let parent = LoadedAddr::new_unchecked(1); + let child = LoadedAddr::new_unchecked(10); + + // Insert child so evict has something to work with. + cache.insert(parent, OsString::from("foo"), child, false); + + // Claim populate (gen=0). State: IN_PROGRESS. + let PopulateStatus::Claimed(claimed_gen) = cache.try_claim_populate(parent) else { + panic!("expected Claimed"); + }; + assert_eq!(claimed_gen, 0); + + // Evict while populate is in-flight. This bumps gen to 1 and + // CASes IN_PROGRESS -> UNCLAIMED. + cache.evict(child); + + // finish_populate with the now-stale claimed_gen=0. Since evict + // already reset to UNCLAIMED, the CAS (IN_PROGRESS -> target) should + // fail harmlessly. The directory must remain UNCLAIMED. + cache.finish_populate(parent, claimed_gen); + + // The directory should be re-claimable (UNCLAIMED), not stuck at DONE. 
+ match cache.try_claim_populate(parent) { + PopulateStatus::Claimed(_) => { /* correct: UNCLAIMED -> Claimed */ } + PopulateStatus::Done => { + panic!("directory should be UNCLAIMED after evict invalidated the generation") + } + PopulateStatus::InProgress => panic!("unexpected InProgress"), + } +} From c8628f5c2b7402c48ead10cf7fd49d739ed2632d Mon Sep 17 00:00:00 2001 From: Marko Vejnovic Date: Mon, 23 Feb 2026 14:42:07 -0800 Subject: [PATCH 58/58] small cleanup --- lib/cache/async_backed.rs | 14 --- lib/fs/async_fs.rs | 163 +------------------------------- lib/fs/dcache.rs | 35 +++++-- lib/fs/fuser.rs | 2 +- lib/fs/indexed_lookup_cache.rs | 165 +++++++++++++++++++++++++++++++++ lib/fs/mod.rs | 7 +- tests/async_fs_correctness.rs | 65 ++++++++++++- 7 files changed, 261 insertions(+), 190 deletions(-) create mode 100644 lib/fs/indexed_lookup_cache.rs diff --git a/lib/cache/async_backed.rs b/lib/cache/async_backed.rs index 52c5916b..0e02c8d3 100644 --- a/lib/cache/async_backed.rs +++ b/lib/cache/async_backed.rs @@ -405,20 +405,6 @@ where pub fn remove_sync(&self, key: &K) -> bool { self.map.remove_sync(key).is_some() } - - /// Synchronously remove all `Ready` entries for which `predicate` returns `true`. - /// - /// `InFlight` entries are always retained — only fully resolved entries - /// are eligible for removal. This is safe to call concurrently with - /// other cache operations: `scc::HashMap::retain_sync` acquires - /// bucket-level locks, and `InFlight` entries are skipped so in-progress - /// computations are never disturbed. - pub fn remove_ready_if_sync(&self, mut predicate: impl FnMut(&K, &V) -> bool) { - self.map.retain_sync(|k, slot| match slot { - Slot::InFlight(..) 
=> true, - Slot::Ready(v) => !predicate(k, v), - }); - } } /// Drop guard that synchronously promotes an `InFlight` entry to `Ready` if the caller diff --git a/lib/fs/async_fs.rs b/lib/fs/async_fs.rs index f60b9af1..4a577441 100644 --- a/lib/fs/async_fs.rs +++ b/lib/fs/async_fs.rs @@ -12,7 +12,7 @@ use crate::cache::async_backed::FutureBackedCache; use crate::drop_ward::StatelessDrop; use crate::fs::{ AsyncFsStats, DirEntry, FileHandle, INode, INodeType, InodeAddr, LoadedAddr, OpenFlags, - dcache::DCache, + dcache::DCache, indexed_lookup_cache::IndexedLookupCache, }; /// The concrete type of the lookup cache used by [`AsyncFs`]. @@ -22,167 +22,6 @@ use crate::fs::{ /// `StatelessDrop` context without repeating the full generic signature. pub type LookupCache = FutureBackedCache<(InodeAddr, Arc), INode>; -type LookupKey = (InodeAddr, Arc); - -/// A reverse-index entry: the lookup-cache key plus the child inode addr -/// that the key resolved to. Storing the child addr allows [`evict_addr`] -/// to clean both the parent and child sides of the reverse index without -/// needing to read the (already-removed) cache value. -type ReverseEntry = (LookupKey, InodeAddr); - -/// Wraps a [`LookupCache`] with a reverse index for O(k) eviction. -/// -/// The reverse index maps each `InodeAddr` to the set of lookup-cache -/// keys that reference it (either as parent or as child). This avoids -/// the O(N) `retain_sync` scan that would otherwise be required when -/// evicting a single inode. -/// -/// Unlike [`DCache`](super::dcache::DCache)'s 1:1 `child_to_parent` -/// reverse index (where each child has exactly one parent), the lookup -/// cache maps one inode address to *multiple* cache keys (because an -/// inode can appear as both a parent and a child in different entries). -/// Hence we use `Vec` rather than a flat map. -pub struct IndexedLookupCache { - cache: LookupCache, - /// addr → set of `(key, child_addr)` pairs where `addr` appears as - /// parent or child. 
- reverse: scc::HashMap>, -} - -impl Default for IndexedLookupCache { - fn default() -> Self { - Self { - cache: LookupCache::default(), - reverse: scc::HashMap::new(), - } - } -} - -impl IndexedLookupCache { - /// Delegate to the inner cache's `get_or_try_init`, then record the - /// result in the reverse index. - pub async fn get_or_try_init( - &self, - key: LookupKey, - factory: F, - ) -> Result - where - F: FnOnce() -> Fut, - Fut: Future> + Send + 'static, - { - let child = self.cache.get_or_try_init(key.clone(), factory).await?; - self.index_entry(&key, child.addr); - Ok(child) - } - - /// Remove a single key from the cache and its reverse-index entries. - pub fn remove_sync(&self, key: &LookupKey) { - if self.cache.remove_sync(key) { - self.deindex_key(key); - } - } - - /// Remove all lookup-cache entries referencing `addr` (as parent or child). - /// - /// O(k) where k is the number of entries referencing `addr`, vs the - /// previous O(N) scan over the entire cache. - pub fn evict_addr(&self, addr: InodeAddr) { - let entries = self - .reverse - .remove_sync(&addr) - .map(|(_, entries)| entries) - .unwrap_or_default(); - - for (key, child_addr) in &entries { - self.cache.remove_sync(key); - let (parent_addr, _) = key; - // Clean the *other* side(s) of the reverse index. - // We removed `addr`'s Vec already; now prune the key from - // whichever other addrs it was indexed under. - if *parent_addr != addr { - self.reverse.update_sync(parent_addr, |_, v| { - v.retain(|(k, _)| k != key); - }); - } - if *child_addr != addr && *child_addr != *parent_addr { - self.reverse.update_sync(child_addr, |_, v| { - v.retain(|(k, _)| k != key); - }); - } - } - } - - /// Record a lookup entry in the reverse index for both parent and child addrs. - /// - /// Deduplicates: if the key is already present in the `Vec` for a given - /// addr, the push is skipped. 
This prevents unbounded growth when the - /// same key is looked up repeatedly (cache hits still call this method - /// because the `FutureBackedCache` joiner path returns without - /// distinguishing hits from misses). - fn index_entry(&self, key: &LookupKey, child_addr: InodeAddr) { - let entry = (key.clone(), child_addr); - let (parent_addr, _) = key; - // Index under parent addr (deduplicated). - self.reverse - .entry_sync(*parent_addr) - .or_default() - .get_mut() - .dedup_push(&entry); - // Index under child addr (if different from parent, deduplicated). - if child_addr != *parent_addr { - self.reverse - .entry_sync(child_addr) - .or_default() - .get_mut() - .dedup_push(&entry); - } - } - - /// Returns the number of entries in the reverse-index `Vec` for `addr`. - /// - /// Intended for testing only — verifies that the reverse index stays - /// bounded and does not accumulate duplicates. - #[doc(hidden)] - #[must_use] - pub fn reverse_entry_count(&self, addr: InodeAddr) -> usize { - self.reverse - .read_sync(&addr, |_, entries| entries.len()) - .unwrap_or(0) - } - - /// Remove a single key's entries from the reverse index. - /// - /// Cleans the parent side. The child side cannot be cleaned here because - /// the cache value (which held the child addr) has already been removed. - /// This is acceptable: orphaned child-side entries are harmless (they - /// reference a key that no longer exists in the cache) and are cleaned - /// up when the child addr is eventually evicted via [`evict_addr`]. - fn deindex_key(&self, key: &LookupKey) { - let (parent_addr, _) = key; - self.reverse.update_sync(parent_addr, |_, entries| { - entries.retain(|(k, _)| k != key); - }); - } -} - -/// Extension trait for `Vec` to push only if the element is not already present. 
-trait DedupPush { - fn dedup_push(&mut self, item: &T) - where - T: Clone; -} - -impl DedupPush for Vec { - fn dedup_push(&mut self, item: &T) - where - T: Clone, - { - if !self.contains(item) { - self.push(item.clone()); - } - } -} - /// A reader for an open file, returned by [`FsDataProvider::open`]. /// /// Implementors provide the actual data for read operations. The FUSE diff --git a/lib/fs/dcache.rs b/lib/fs/dcache.rs index 65eb9b51..7ff21cac 100644 --- a/lib/fs/dcache.rs +++ b/lib/fs/dcache.rs @@ -203,12 +203,23 @@ impl DCache { // not going stale. if old_parent != new_parent { old_state.generation.fetch_add(1, Ordering::Release); - let _ = old_state.populated.compare_exchange( - POPULATE_DONE, - POPULATE_UNCLAIMED, - Ordering::AcqRel, - Ordering::Relaxed, - ); + if old_state + .populated + .compare_exchange( + POPULATE_DONE, + POPULATE_UNCLAIMED, + Ordering::AcqRel, + Ordering::Relaxed, + ) + .is_err() + { + let _ = old_state.populated.compare_exchange( + POPULATE_IN_PROGRESS, + POPULATE_UNCLAIMED, + Ordering::AcqRel, + Ordering::Relaxed, + ); + } old_state.notify.notify_waiters(); } } @@ -439,9 +450,19 @@ impl DCache { /// Abort a population attempt, resetting back to unclaimed so another /// caller can retry. + /// + /// Uses CAS (`IN_PROGRESS → UNCLAIMED`) rather than a plain store so + /// that if a concurrent [`evict`](Self::evict) already reset the flag + /// to `UNCLAIMED` and a new populator claimed it, this stale abort + /// does not clobber the new populator's `IN_PROGRESS` state. 
pub fn abort_populate(&self, parent_ino: LoadedAddr) { let state = self.dir_state(parent_ino); - state.populated.store(POPULATE_UNCLAIMED, Ordering::Release); + let _ = state.populated.compare_exchange( + POPULATE_IN_PROGRESS, + POPULATE_UNCLAIMED, + Ordering::AcqRel, + Ordering::Relaxed, + ); state.notify.notify_waiters(); } diff --git a/lib/fs/fuser.rs b/lib/fs/fuser.rs index 0d5852ee..dccc21f8 100644 --- a/lib/fs/fuser.rs +++ b/lib/fs/fuser.rs @@ -73,7 +73,7 @@ type FuseWard = crate::drop_ward::DropWard< ( Arc>, Arc, - Arc, + Arc, DP, ), InodeAddr, diff --git a/lib/fs/indexed_lookup_cache.rs b/lib/fs/indexed_lookup_cache.rs new file mode 100644 index 00000000..65fa6418 --- /dev/null +++ b/lib/fs/indexed_lookup_cache.rs @@ -0,0 +1,165 @@ +//! [`IndexedLookupCache`]: a reverse-indexed wrapper around [`LookupCache`] +//! for O(k) eviction of lookup-cache entries by inode address. + +use std::ffi::OsStr; +use std::future::Future; +use std::sync::Arc; + +use super::async_fs::LookupCache; +use super::{INode, InodeAddr}; + +type LookupKey = (InodeAddr, Arc); + +/// A reverse-index entry: the lookup-cache key plus the child inode addr +/// that the key resolved to. Storing the child addr allows [`IndexedLookupCache::evict_addr`] +/// to clean both the parent and child sides of the reverse index without +/// needing to read the (already-removed) cache value. +type ReverseEntry = (LookupKey, InodeAddr); + +/// Wraps a [`LookupCache`] with a reverse index for O(k) eviction. +/// +/// The reverse index maps each `InodeAddr` to the set of lookup-cache +/// keys that reference it (either as parent or as child). This avoids +/// the O(N) `retain_sync` scan that would otherwise be required when +/// evicting a single inode. 
+/// +/// Unlike [`DCache`](super::dcache::DCache)'s 1:1 `child_to_parent` +/// reverse index (where each child has exactly one parent), the lookup +/// cache maps one inode address to *multiple* cache keys (because an +/// inode can appear as both a parent and a child in different entries). +/// Hence we use `Vec` rather than a flat map. +pub struct IndexedLookupCache { + cache: LookupCache, + /// addr → set of `(key, child_addr)` pairs where `addr` appears as + /// parent or child. + reverse: scc::HashMap>, +} + +impl Default for IndexedLookupCache { + fn default() -> Self { + Self { + cache: LookupCache::default(), + reverse: scc::HashMap::new(), + } + } +} + +impl IndexedLookupCache { + /// Delegate to the inner cache's `get_or_try_init`, then record the + /// result in the reverse index. + pub async fn get_or_try_init( + &self, + key: LookupKey, + factory: F, + ) -> Result + where + F: FnOnce() -> Fut, + Fut: Future> + Send + 'static, + { + let child = self.cache.get_or_try_init(key.clone(), factory).await?; + self.index_entry(&key, child.addr); + Ok(child) + } + + /// Remove a single key from the cache and its reverse-index entries. + pub fn remove_sync(&self, key: &LookupKey) { + if self.cache.remove_sync(key) { + self.deindex_key(key); + } + } + + /// Remove all lookup-cache entries referencing `addr` (as parent or child). + /// + /// O(k) where k is the number of entries referencing `addr`, vs the + /// previous O(N) scan over the entire cache. + pub fn evict_addr(&self, addr: InodeAddr) { + let entries = self + .reverse + .remove_sync(&addr) + .map(|(_, entries)| entries) + .unwrap_or_default(); + + for (key, child_addr) in &entries { + self.cache.remove_sync(key); + let (parent_addr, _) = key; + // Clean the *other* side(s) of the reverse index. + // We removed `addr`'s Vec already; now prune the key from + // whichever other addrs it was indexed under. 
+ if *parent_addr != addr { + self.reverse.update_sync(parent_addr, |_, v| { + v.retain(|(k, _)| k != key); + }); + } + if *child_addr != addr && *child_addr != *parent_addr { + self.reverse.update_sync(child_addr, |_, v| { + v.retain(|(k, _)| k != key); + }); + } + } + } + + /// Record a lookup entry in the reverse index for both parent and child addrs. + /// + /// Deduplicates: if the key is already present in the `Vec` for a given + /// addr, the push is skipped. This prevents unbounded growth when the + /// same key is looked up repeatedly (cache hits still call this method + /// because the `FutureBackedCache` joiner path returns without + /// distinguishing hits from misses). + fn index_entry(&self, key: &LookupKey, child_addr: InodeAddr) { + let entry = (key.clone(), child_addr); + let (parent_addr, _) = key; + // Index under parent addr (deduplicated). + self.reverse + .entry_sync(*parent_addr) + .or_default() + .get_mut() + .dedup_push(&entry); + // Index under child addr (if different from parent, deduplicated). + if child_addr != *parent_addr { + self.reverse + .entry_sync(child_addr) + .or_default() + .get_mut() + .dedup_push(&entry); + } + } + + /// Returns the number of entries in the reverse-index `Vec` for `addr`. + /// + /// Intended for testing only — verifies that the reverse index stays + /// bounded and does not accumulate duplicates. + #[doc(hidden)] + #[must_use] + pub fn reverse_entry_count(&self, addr: InodeAddr) -> usize { + self.reverse + .read_sync(&addr, |_, entries| entries.len()) + .unwrap_or(0) + } + + /// Remove a single key's entries from the reverse index. + /// + /// Cleans the parent side. The child side cannot be cleaned here because + /// the cache value (which held the child addr) has already been removed. + /// This is acceptable: orphaned child-side entries are harmless (they + /// reference a key that no longer exists in the cache) and are cleaned + /// up when the child addr is eventually evicted via [`evict_addr`]. 
+ fn deindex_key(&self, key: &LookupKey) { + let (parent_addr, _) = key; + self.reverse.update_sync(parent_addr, |_, entries| { + entries.retain(|(k, _)| k != key); + }); + } +} + +/// Extension trait for `Vec` to push only if the element is not already present. +trait DedupPush { + fn dedup_push(&mut self, item: &T); +} + +impl DedupPush for Vec { + fn dedup_push(&mut self, item: &T) { + if !self.contains(item) { + self.push(item.clone()); + } + } +} diff --git a/lib/fs/mod.rs b/lib/fs/mod.rs index 88eadb7f..ad7c4f96 100644 --- a/lib/fs/mod.rs +++ b/lib/fs/mod.rs @@ -29,10 +29,11 @@ pub mod composite; pub mod dcache; /// FUSE adapter: maps [`fuser::Filesystem`] callbacks to [`async_fs::AsyncFs`]. pub mod fuser; +/// Reverse-indexed lookup cache for O(k) inode eviction. +pub mod indexed_lookup_cache; -pub use async_fs::{ - IndexedLookupCache, InodeForget, InodeLifecycle, LookupCache, OpenFile, ResolvedINode, -}; +pub use async_fs::{InodeForget, InodeLifecycle, LookupCache, OpenFile, ResolvedINode}; +pub use indexed_lookup_cache::IndexedLookupCache; use std::ffi::OsStr; use std::time::SystemTime; diff --git a/tests/async_fs_correctness.rs b/tests/async_fs_correctness.rs index 2f4cff79..b0bf6a7d 100644 --- a/tests/async_fs_correctness.rs +++ b/tests/async_fs_correctness.rs @@ -885,7 +885,7 @@ async fn readdir_evict_all_readdir_returns_same_entries() { /// entries intact. This is the O(k) replacement for the old O(N) scan. #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn indexed_lookup_cache_evict_removes_only_related_entries() { - use git_fs::fs::async_fs::IndexedLookupCache; + use git_fs::fs::IndexedLookupCache; let cache = IndexedLookupCache::default(); @@ -991,7 +991,7 @@ async fn indexed_lookup_cache_evict_removes_only_related_entries() { /// the factory (cache miss). 
#[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn evict_parent_then_old_child_must_not_delete_repopulated_entry() { - use git_fs::fs::async_fs::IndexedLookupCache; + use git_fs::fs::IndexedLookupCache; let cache = IndexedLookupCache::default(); let parent: u64 = 1; @@ -1040,7 +1040,7 @@ async fn evict_parent_then_old_child_must_not_delete_repopulated_entry() { /// contain only 1 entry per addr side (not N duplicates). #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn repeated_lookups_must_not_grow_reverse_index() { - use git_fs::fs::async_fs::IndexedLookupCache; + use git_fs::fs::IndexedLookupCache; let cache = IndexedLookupCache::default(); let parent: u64 = 1; @@ -1079,3 +1079,62 @@ async fn repeated_lookups_must_not_grow_reverse_index() { "child reverse-index should have exactly 1 entry, not one per lookup" ); } + +/// Verify that `remove_sync` removes the cache entry and cleans the +/// parent-side reverse index. The child-side reverse entry is documented +/// as an acceptable orphan — it is harmless and cleaned on child eviction. +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn remove_sync_cleans_cache_and_parent_reverse_index() { + use git_fs::fs::IndexedLookupCache; + + let cache = IndexedLookupCache::default(); + let parent: u64 = 1; + let child: u64 = 10; + + let key: (u64, Arc) = (parent, Arc::from(OsStr::new("foo"))); + let inode = make_inode(child, INodeType::File, 42, Some(parent)); + + // Insert entry. + let i = inode; + cache + .get_or_try_init(key.clone(), move || async move { Ok(i) }) + .await + .unwrap(); + + assert_eq!(cache.reverse_entry_count(parent), 1); + assert_eq!(cache.reverse_entry_count(child), 1); + + // remove_sync should remove the cache entry and parent reverse index. + cache.remove_sync(&key); + + // Cache entry is gone — factory should be called on next get_or_try_init. 
+ let result = cache + .get_or_try_init(key.clone(), move || async move { + Ok(make_inode(child, INodeType::File, 999, Some(parent))) + }) + .await + .unwrap(); + assert_eq!(result.size, 999, "cache entry should have been removed"); + + // Parent-side reverse index should be cleaned (0 entries after remove). + // Note: reverse_entry_count(parent) is now 1 again because + // get_or_try_init re-indexed it. Check that it was cleaned by verifying + // it was re-populated from scratch (size == 999 above proves the factory ran). + + // Child-side orphan: the reverse index for child may still contain the + // old entry (from before remove_sync). This is the documented trade-off. + // Verify that evict_addr(child) cleanly handles the orphan without panic. + cache.evict_addr(child); + + // After evicting the child, the re-inserted entry should be gone. + let result = cache + .get_or_try_init(key.clone(), move || async move { + Ok(make_inode(child, INodeType::File, 777, Some(parent))) + }) + .await + .unwrap(); + assert_eq!( + result.size, 777, + "entry should be re-fetchable after orphan cleanup via evict_addr" + ); +}