diff --git a/Cargo.lock b/Cargo.lock index 4cc5c378c..303213fe4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5350,6 +5350,7 @@ dependencies = [ "slog-async", "slog-term", "softnpu", + "static_assertions", "strum 0.26.3", "tempfile", "thiserror 1.0.64", diff --git a/lib/propolis/Cargo.toml b/lib/propolis/Cargo.toml index 5ac6f0e96..cf863ca8c 100644 --- a/lib/propolis/Cargo.toml +++ b/lib/propolis/Cargo.toml @@ -45,6 +45,7 @@ ispf = { workspace = true, optional = true } rand = { workspace = true, optional = true } softnpu = { workspace = true, optional = true } dlpi = { workspace = true, optional = true } +static_assertions = "1.1.0" [dev-dependencies] crossbeam-channel.workspace = true diff --git a/lib/propolis/src/block/attachment.rs b/lib/propolis/src/block/attachment.rs index 6bb00fa06..35d6daeee 100644 --- a/lib/propolis/src/block/attachment.rs +++ b/lib/propolis/src/block/attachment.rs @@ -39,7 +39,10 @@ use thiserror::Error; use tokio::sync::futures::Notified; use tokio::sync::Notify; -/// Static for generating unique block [DeviceId]s with a process +/// Static for generating unique block [DeviceId]s within a process +/// +/// Numbering across block devices means that a block `DeviceId` and the queue +/// ID in a block attachment are unique across a VM. 
static NEXT_DEVICE_ID: AtomicU32 = AtomicU32::new(0); pub const MAX_WORKERS: NonZeroUsize = NonZeroUsize::new(64).unwrap(); diff --git a/lib/propolis/src/hw/nvme/admin.rs b/lib/propolis/src/hw/nvme/admin.rs index 63607707b..df8746a62 100644 --- a/lib/propolis/src/hw/nvme/admin.rs +++ b/lib/propolis/src/hw/nvme/admin.rs @@ -5,6 +5,7 @@ use std::mem::size_of; use crate::common::{GuestAddr, GuestRegion, PAGE_SIZE}; +use crate::hw::nvme; use crate::vmm::MemCtx; use super::bits::*; @@ -17,7 +18,7 @@ use super::{ #[usdt::provider(provider = "propolis")] mod probes { - fn nvme_abort(cid: u16, sqid: u16) {} + fn nvme_abort(cid: u16, devsq_id: u64) {} } impl NvmeCtrl { @@ -25,7 +26,8 @@ impl NvmeCtrl { /// /// See NVMe 1.0e Section 5.1 Abort command pub(super) fn acmd_abort(&self, cmd: &cmds::AbortCmd) -> cmds::Completion { - probes::nvme_abort!(|| (cmd.cid, cmd.sqid)); + let devsq_id = nvme::devq_id(self.device_id, cmd.sqid); + probes::nvme_abort!(|| (cmd.cid, devsq_id)); // Verify the SQ in question currently exists let sqid = cmd.sqid as usize; @@ -74,6 +76,7 @@ impl NvmeCtrl { match self.create_cq( super::queue::CreateParams { id: cmd.qid, + device_id: self.device_id, base: GuestAddr(cmd.prp), size: cmd.qsize, }, @@ -119,6 +122,7 @@ impl NvmeCtrl { match self.create_sq( super::queue::CreateParams { id: cmd.qid, + device_id: self.device_id, base: GuestAddr(cmd.prp), size: cmd.qsize, }, diff --git a/lib/propolis/src/hw/nvme/mod.rs b/lib/propolis/src/hw/nvme/mod.rs index d7e61b923..83317dd9a 100644 --- a/lib/propolis/src/hw/nvme/mod.rs +++ b/lib/propolis/src/hw/nvme/mod.rs @@ -5,6 +5,7 @@ use std::convert::TryInto; use std::mem::size_of; use std::num::NonZeroUsize; +use std::sync::atomic::AtomicU32; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Arc, Mutex, MutexGuard, Weak}; @@ -31,13 +32,33 @@ mod requests; use bits::*; use queue::{CompQueue, QueueId, SubQueue}; +/// Static for generating unique NVMe device identifiers across a VM. 
+static NEXT_DEVICE_ID: AtomicU32 = AtomicU32::new(0); + +type DeviceId = u32; + #[usdt::provider(provider = "propolis")] mod probes { - fn nvme_doorbell(off: u64, qid: u16, is_cq: u8, val: u16) {} + fn nvme_doorbell(off: u64, devq_id: u64, is_cq: u8, val: u16) {} fn nvme_doorbell_admin_cq(val: u16) {} fn nvme_doorbell_admin_sq(val: u16) {} fn nvme_admin_cmd(opcode: u8, prp1: u64, prp2: u64) {} - fn nvme_block_notify(sqid: u16, block_qid: u16, occupied_hint: u16) {} + fn nvme_block_notify(devsq_id: u64, block_devqid: u64, occupied_hint: u16) { + } +} + +/// Combine an NVMe device and queue ID into a single u64 for probes +pub(crate) fn devq_id(dev: DeviceId, queue: QueueId) -> u64 { + // We'll use the low 16 bits for the queue ID. Assert at compile time that + // queue IDs cannot go above that. Clippy helpfully asserts this is an + // absurd comparison, so silence that. If you see a rustc error here you + // must have changed the type of QueueId such that this is no longer absurd! + #[allow(clippy::absurd_extreme_comparisons)] + { + static_assertions::const_assert!(QueueId::MAX <= u16::MAX); + } + + ((dev as u64) << 16) | (queue as u64) } /// The max number of MSI-X interrupts we support @@ -154,6 +175,12 @@ const MAX_NUM_IO_QUEUES: usize = MAX_NUM_QUEUES - 1; /// NVMe Controller struct NvmeCtrl { + /// A distinguishing identifier for this NVMe controller across the VM. + /// Useful mostly to distinguish queues and commands as seen in probes. + /// `device_id` is held constant across NVMe resets, but not persisted + /// across export and import. 
+ device_id: DeviceId, + /// Internal NVMe Controller state ctrl: CtrlState, @@ -185,6 +212,7 @@ impl NvmeCtrl { self.create_cq( queue::CreateParams { id: queue::ADMIN_QUEUE_ID, + device_id: self.device_id, base: GuestAddr(self.ctrl.admin_cq_base), // Convert from 0's based size: u32::from(self.ctrl.aqa.acqs()) + 1, @@ -196,6 +224,7 @@ impl NvmeCtrl { self.create_sq( queue::CreateParams { id: queue::ADMIN_QUEUE_ID, + device_id: self.device_id, base: GuestAddr(self.ctrl.admin_sq_base), // Convert from 0's based size: u32::from(self.ctrl.aqa.asqs()) + 1, @@ -589,6 +618,7 @@ impl NvmeCtrl { self.create_cq( queue::CreateParams { id: cqi.id, + device_id: self.device_id, base: GuestAddr(cqi.base), size: cqi.size, }, @@ -611,6 +641,7 @@ impl NvmeCtrl { .create_sq( queue::CreateParams { id: sqi.id, + device_id: self.device_id, base: GuestAddr(sqi.base), size: sqi.size, }, @@ -742,6 +773,11 @@ pub struct PciNvme { /// accesses without stacking them up behind one central lock. is_enabled: AtomicBool, + /// Duplicate of the controller NVMe device ID, but not requiring locking + /// [NvmeCtrl] to read. This is used to provide additional context in + /// NVMe-related probes. + device_id: DeviceId, + /// Access to NVMe Submission and Completion queues. /// /// These are protected with per-slot (queue ID) locks, so actions taken on @@ -849,6 +885,7 @@ impl PciNvme { let csts = Status(0); let state = NvmeCtrl { + device_id: NEXT_DEVICE_ID.fetch_add(1, Ordering::Relaxed), ctrl: CtrlState { cap, cc, csts, ..Default::default() }, doorbell_buf: None, msix_hdl: None, @@ -879,9 +916,13 @@ impl PciNvme { } })); + // Cache device ID before we move it into the Mutex below. 
+ let device_id = state.device_id; + PciNvme { state: Mutex::new(state), is_enabled: AtomicBool::new(false), + device_id, pci_state, queues: NvmeQueues::default(), block_attach, @@ -1110,9 +1151,12 @@ impl PciNvme { // 32-bit register but ignore reserved top 16-bits let val = wo.read_u32() as u16; + // Mix in the device ID for probe purposes + let devq_id = devq_id(self.device_id, qid); + probes::nvme_doorbell!(|| ( off as u64, - qid, + devq_id, u8::from(is_cq), val )); @@ -1188,9 +1232,12 @@ impl PciNvme { }) { let block_qid = queue::sqid_to_block_qid(sqid); + let devsq_id = devq_id(self.device_id, sqid); + let block_devqid = + block::devq_id(self.block_attach.device_id(), block_qid); probes::nvme_block_notify!(|| ( - sqid, - u16::from(block_qid), + devsq_id, + block_devqid, num_occupied )); self.block_attach.notify( @@ -1207,14 +1254,17 @@ impl PciNvme { let sq = guard.as_ref().unwrap(); let num_occupied = sq.notify_tail(val)?; + let devsq_id = sq.devq_id(); // Notification to block layer cannot be issued with SQ lock held drop(guard); assert_ne!(qid, queue::ADMIN_QUEUE_ID); let block_qid = queue::sqid_to_block_qid(qid); + let block_devqid = + block::devq_id(self.block_attach.device_id(), block_qid); probes::nvme_block_notify!(|| ( - qid, - u16::from(block_qid), + devsq_id, + block_devqid, num_occupied )); self.block_attach diff --git a/lib/propolis/src/hw/nvme/queue.rs b/lib/propolis/src/hw/nvme/queue.rs index 064950bf6..2218ddbcb 100644 --- a/lib/propolis/src/hw/nvme/queue.rs +++ b/lib/propolis/src/hw/nvme/queue.rs @@ -12,6 +12,7 @@ use super::cmds::Completion; use crate::accessors::MemAccessor; use crate::block; use crate::common::*; +use crate::hw::nvme::DeviceId; use crate::hw::pci; use crate::migrate::MigrateStateError; use crate::vmm::MemCtx; @@ -20,18 +21,28 @@ use thiserror::Error; #[usdt::provider(provider = "propolis")] mod probes { - fn nvme_cqe(qid: u16, idx: u16, phase: u8) {} - fn nvme_sq_dbbuf_read(qid: u16, val: u32, tail: u16) {} - fn 
nvme_sq_dbbuf_write(qid: u16, head: u16) {} - fn nvme_sq_dbbuf_write_shadow(qid: u16, head: u16) {} - fn nvme_cq_dbbuf_read(qid: u16, val: u32, tail: u16) {} - fn nvme_cq_dbbuf_write(qid: u16, head: u16) {} - fn nvme_cq_dbbuf_write_shadow(qid: u16, head: u16) {} + fn nvme_cqe(devcq_id: u64, idx: u16, phase: u8) {} + fn nvme_sq_dbbuf_read(devsq_id: u64, val: u32, tail: u16) {} + fn nvme_sq_dbbuf_write(devsq_id: u64, head: u16) {} + fn nvme_sq_dbbuf_write_shadow(devsq_id: u64, head: u16) {} + fn nvme_cq_dbbuf_read(devcq_id: u64, val: u32, tail: u16) {} + fn nvme_cq_dbbuf_write(devcq_id: u64, head: u16) {} + fn nvme_cq_dbbuf_write_shadow(devcq_id: u64, head: u16) {} } /// Each queue is identified by a 16-bit ID. /// /// See NVMe 1.0e Section 4.1.4 Queue Identifier +/// +/// Submission and completion queue IDs are distinct namespaces, so a device +/// can have both a "Submission Queue 1" and "Completion Queue 1". +/// +/// For USDT probes, we combine this ID with an NVMe controller ID to produce a +/// `devq_id`. This combined identifier is still ambiguous between one submission +/// queue and one completion queue. Contextually there is typically only one +/// reasonable interpretation of the ID, but the probe arguments are named +/// `devsq_id` or `devcq_id` to be explicit about identifying a Submission or +/// Completion queue, respectively. pub type QueueId = u16; /// The minimum number of entries in either a Completion or Submission Queue. @@ -330,9 +341,9 @@ impl QueueGuard<'_, CompQueueState> { } /// Write update to the EventIdx in Doorbell Buffer page, if possible - fn db_buf_write(&mut self, qid: QueueId, mem: &MemCtx) { + fn db_buf_write(&mut self, devq_id: u64, mem: &MemCtx) { if let Some(db_buf) = self.state.db_buf { - probes::nvme_cq_dbbuf_write!(|| (qid, self.state.tail)); + probes::nvme_cq_dbbuf_write!(|| (devq_id, self.state.tail)); // Keep EventIdx populated with the position of the CQ tail. 
We are // not especially concerned with receiving timely (doorbell) updates // from the guest about where the head pointer sits. We keep our @@ -352,21 +363,21 @@ impl QueueGuard<'_, CompQueueState> { /// We would expect the guest driver to keep this value in a valid state per /// the specification, but qemu notes that certain consumers fail to do so /// on the admin queue. We follow their lead to avoid issues. - fn db_buf_write_shadow(&mut self, qid: QueueId, mem: &MemCtx) { + fn db_buf_write_shadow(&mut self, devq_id: u64, mem: &MemCtx) { if let Some(db_buf) = self.state.db_buf { - probes::nvme_cq_dbbuf_write_shadow!(|| (qid, self.state.head)); + probes::nvme_cq_dbbuf_write_shadow!(|| (devq_id, self.state.head)); fence(Ordering::Release); mem.write(db_buf.shadow, &self.state.head); } } /// Read update from the Shadow in Doorbell Buffer page, if possible - fn db_buf_read(&mut self, qid: QueueId, mem: &MemCtx) { + fn db_buf_read(&mut self, devq_id: u64, mem: &MemCtx) { if let Some(db_buf) = self.state.db_buf { if let Some(new_head) = mem.read::(db_buf.shadow) { let new_head = *new_head; probes::nvme_cq_dbbuf_read!(|| ( - qid, + devq_id, new_head, self.state.head )); @@ -450,9 +461,9 @@ impl QueueGuard<'_, SubQueueState> { } /// Write update to the EventIdx in Doorbell Buffer page, if possible - fn db_buf_write(&mut self, qid: QueueId, mem: &MemCtx) { + fn db_buf_write(&mut self, devq_id: u64, mem: &MemCtx) { if let Some(db_buf) = self.state.db_buf { - probes::nvme_sq_dbbuf_write!(|| (qid, self.state.head)); + probes::nvme_sq_dbbuf_write!(|| (devq_id, self.state.head)); // Keep EventIdx populated with the position of the SQ head. As // long as there are entries available between the head and tail, we // do not want the guest taking exits for ultimately redundant @@ -470,21 +481,21 @@ impl QueueGuard<'_, SubQueueState> { /// /// See [QueueGuard::db_buf_write_shadow()] for why we would /// write to a "guest-owned" page. 
- fn db_buf_write_shadow(&mut self, qid: QueueId, mem: &MemCtx) { + fn db_buf_write_shadow(&mut self, devq_id: u64, mem: &MemCtx) { if let Some(db_buf) = self.state.db_buf { - probes::nvme_sq_dbbuf_write_shadow!(|| (qid, self.state.tail)); + probes::nvme_sq_dbbuf_write_shadow!(|| (devq_id, self.state.tail)); fence(Ordering::Release); mem.write(db_buf.shadow, &self.state.tail); } } /// Read update from the Shadow in Doorbell Buffer page, if possible - fn db_buf_read(&mut self, qid: QueueId, mem: &MemCtx) { + fn db_buf_read(&mut self, devq_id: u64, mem: &MemCtx) { if let Some(db_buf) = self.state.db_buf { if let Some(new_tail) = mem.read::(db_buf.shadow) { let new_tail = *new_tail; probes::nvme_sq_dbbuf_read!(|| ( - qid, + devq_id, new_tail, self.state.head )); @@ -527,6 +538,9 @@ pub enum QueueUpdateError { #[derive(Copy, Clone)] pub struct CreateParams { pub id: QueueId, + // Not strictly necessary for submission or completion queues, but helpful + // to disambiguate the queue in probes. + pub device_id: DeviceId, pub base: GuestAddr, pub size: u32, } @@ -536,6 +550,10 @@ pub struct SubQueue { /// The ID of this Submission Queue. id: QueueId, + /// The ID of the device that owns this submission queue. Kept here only to + /// produce `devsq_id` for DTrace probes. + device_id: DeviceId, + /// The corresponding Completion Queue. 
cq: Arc, @@ -566,10 +584,11 @@ impl SubQueue { cq: Arc, acc_mem: MemAccessor, ) -> Result, QueueCreateErr> { - let CreateParams { id, base, size } = params; + let CreateParams { id, device_id, base, size } = params; validate(id == ADMIN_QUEUE_ID, base, size)?; let sq = Arc::new(Self { id, + device_id, cq, state: SubQueueState::new(size, acc_mem), cur_head: AtomicU16::new(0), @@ -597,7 +616,7 @@ impl SubQueue { state.push_tail_to(idx)?; if self.id == ADMIN_QUEUE_ID { if let Some(mem) = state.acc_mem.access() { - state.db_buf_write_shadow(self.id, &mem); + state.db_buf_write_shadow(self.devq_id(), &mem); } } @@ -620,14 +639,15 @@ impl SubQueue { // Check for last-minute updates to the tail via any configured doorbell // page, prior to attempting the pop itself. - state.db_buf_read(self.id, &mem); + state.db_buf_read(self.devq_id(), &mem); if let Some(idx) = state.pop_head(&self.cur_head) { let addr = self.base.offset::(idx as usize); if let Some(ent) = mem.read::(addr) { - state.db_buf_write(self.id, &mem); - state.db_buf_read(self.id, &mem); + let devq_id = self.devq_id(); + state.db_buf_write(devq_id, &mem); + state.db_buf_read(devq_id, &mem); return Some((ent, permit.promote(ent.cid()), idx)); } // TODO: set error state on queue/ctrl if we cannot read entry @@ -678,6 +698,11 @@ impl SubQueue { cqe.sqhd = self.cur_head.load(Ordering::Acquire); } + /// Return a VM-unique identifier for this submission queue + pub(crate) fn devq_id(&self) -> u64 { + super::devq_id(self.device_id, self.id) + } + pub(super) fn export(&self) -> migrate::NvmeSubQueueV1 { let inner = self.state.inner.lock().unwrap(); migrate::NvmeSubQueueV1 { @@ -713,6 +738,10 @@ pub struct CompQueue { /// The ID of this Completion Queue. id: QueueId, + /// The ID of the device that owns this completion queue. Kept here only to + /// produce `devcq_id` for DTrace probes. + device_id: DeviceId, + /// The Interrupt Vector used to signal to the host (VM) upon pushing /// entries onto the Completion Queue. 
iv: u16, @@ -739,10 +768,11 @@ impl CompQueue { hdl: pci::MsixHdl, acc_mem: MemAccessor, ) -> Result { - let CreateParams { id, base, size } = params; + let CreateParams { id, device_id, base, size } = params; validate(id == ADMIN_QUEUE_ID, base, size)?; Ok(Self { id, + device_id, iv, state: CompQueueState::new(size, acc_mem), base, @@ -757,7 +787,7 @@ impl CompQueue { state.pop_head_to(idx)?; if self.id == ADMIN_QUEUE_ID { if let Some(mem) = state.acc_mem.access() { - state.db_buf_write_shadow(self.id, &mem) + state.db_buf_write_shadow(self.devq_id(), &mem) } } Ok(()) @@ -804,7 +834,7 @@ impl CompQueue { // If the CQ appears full, but the db_buf shadow is configured, do a // last-minute check to see if entries have been consumed/freed // without a doorbell call. - state.db_buf_read(self.id, mem); + state.db_buf_read(self.devq_id(), mem); } if state.take_avail(sq) { Some(ProtoPermit::new(self, sq)) @@ -824,7 +854,7 @@ impl CompQueue { .push_tail() .expect("CQ should have available space for assigned permit"); - probes::nvme_cqe!(|| (self.id, idx, u8::from(phase))); + probes::nvme_cqe!(|| (self.devq_id(), idx, u8::from(phase))); // The only definite indicator that a CQE has become valid is the phase // bit being toggled. Since the interface for writing to guest memory @@ -844,8 +874,9 @@ impl CompQueue { cqe.set_phase(phase); mem.write(addr, &cqe); - state.db_buf_read(self.id, &mem); - state.db_buf_write(self.id, &mem); + let devq_id = self.devq_id(); + state.db_buf_read(devq_id, &mem); + state.db_buf_write(devq_id, &mem); } else { // TODO: mark the queue/controller in error state? 
} @@ -870,6 +901,11 @@ impl CompQueue { } } + /// Return a VM-unique identifier for this completion queue + pub(crate) fn devq_id(&self) -> u64 { + super::devq_id(self.device_id, self.id) + } + pub(super) fn export(&self) -> migrate::NvmeCompQueueV1 { let guard = self.state.lock(); migrate::NvmeCompQueueV1 { @@ -918,13 +954,18 @@ pub struct ProtoPermit { /// The Submission Queue for which this entry is reserved. sq: Weak, - /// The Submission Queue's ID (stored separately to avoid going through - /// the Weak ref). - sqid: u16, + /// The ID for the device and Submission Queue the command associated with + /// this permit was submitted from. Stored separately to avoid going through + /// the Weak ref. + devsq_id: u64, } impl ProtoPermit { fn new(cq: &Arc, sq: &Arc) -> Self { - Self { cq: Arc::downgrade(cq), sq: Arc::downgrade(sq), sqid: sq.id } + Self { + cq: Arc::downgrade(cq), + sq: Arc::downgrade(sq), + devsq_id: sq.devq_id(), + } } /// Promote a "proto" permit to a [Permit]. @@ -936,7 +977,7 @@ impl ProtoPermit { Permit { cq: self.cq, sq: self.sq, - sqid: self.sqid, + devsq_id: self.devsq_id, cid, _nodrop: NoDropPermit, } @@ -963,8 +1004,9 @@ pub struct Permit { /// The Submission Queue for which this entry is reserved. sq: Weak, - /// The Submission Queue ID the request came in on. - sqid: u16, + /// The Submission Queue and device ID the request came in on. Retained as a + /// consistent source identifier for probes. + devsq_id: u64, /// ID of command holding this permit. Used to populate `cid` field in /// Completion Queue Entry. @@ -1009,10 +1051,10 @@ impl Permit { self.cid } - /// Get the ID of the Submission Queue the command associated with this - /// permit was submitted on. - pub fn sqid(&self) -> u16 { - self.sqid + /// Get the ID of the device and Submission Queue the command associated + /// with this permit was submitted on. 
+ pub fn devsq_id(&self) -> u64 { + self.devsq_id } /// A device reset may cause us to abandon some in-flight I/O, dropping the @@ -1046,7 +1088,7 @@ impl Debug for Permit { f.debug_struct("Permit") .field("sq", &self.sq) .field("cq", &self.cq) - .field("sqid", &self.sqid) + .field("devsq_id", &self.devsq_id) .field("cid", &self.cid) .finish() } @@ -1139,21 +1181,35 @@ mod test { let machine = Machine::new_test()?; let hdl = pci::MsixHdl::new_test(); let write_base = GuestAddr(1024 * 1024); - let tmpl = - CreateParams { id: ADMIN_QUEUE_ID, base: write_base, size: 0 }; + let tmpl = CreateParams { + id: ADMIN_QUEUE_ID, + device_id: 0, + base: write_base, + size: 0, + }; let acc_mem = || machine.acc_mem.child(None); // Admin queues must be less than 4K let cq = CompQueue::new( - CreateParams { id: ADMIN_QUEUE_ID, size: 1024, ..tmpl }, + CreateParams { + id: ADMIN_QUEUE_ID, + device_id: 0, + size: 1024, + ..tmpl + }, 0, hdl.clone(), acc_mem(), ); assert!(matches!(cq, Ok(_))); let cq = CompQueue::new( - CreateParams { id: ADMIN_QUEUE_ID, size: 5 * 1024, ..tmpl }, + CreateParams { + id: ADMIN_QUEUE_ID, + device_id: 0, + size: 5 * 1024, + ..tmpl + }, 0, hdl.clone(), acc_mem(), @@ -1162,14 +1218,14 @@ mod test { // I/O queues must be less than 64K let cq = CompQueue::new( - CreateParams { id: 1, size: 1024, ..tmpl }, + CreateParams { id: 1, device_id: 0, size: 1024, ..tmpl }, 0, hdl.clone(), acc_mem(), ); assert!(matches!(cq, Ok(_))); let cq = CompQueue::new( - CreateParams { id: 1, size: 65 * 1024, ..tmpl }, + CreateParams { id: 1, device_id: 0, size: 65 * 1024, ..tmpl }, 0, hdl.clone(), acc_mem(), @@ -1178,14 +1234,14 @@ mod test { // Neither must be less than 2 let cq = CompQueue::new( - CreateParams { id: ADMIN_QUEUE_ID, size: 1, ..tmpl }, + CreateParams { id: ADMIN_QUEUE_ID, device_id: 0, size: 1, ..tmpl }, 0, hdl.clone(), acc_mem(), ); assert!(matches!(cq, Err(QueueCreateErr::InvalidSize))); let cq = CompQueue::new( - CreateParams { id: 1, size: 1, ..tmpl }, + 
CreateParams { id: 1, device_id: 0, size: 1, ..tmpl }, 0, hdl.clone(), acc_mem(), @@ -1209,6 +1265,7 @@ mod test { CompQueue::new( CreateParams { id: ADMIN_QUEUE_ID, + device_id: 0, base: write_base, size: 1024, }, @@ -1220,7 +1277,12 @@ mod test { ); let io_cq = Arc::new( CompQueue::new( - CreateParams { id: 1, base: write_base, size: 1024 }, + CreateParams { + id: 1, + device_id: 0, + base: write_base, + size: 1024, + }, 0, hdl, acc_mem(), @@ -1230,7 +1292,12 @@ mod test { // Admin queues must be less than 4K let sq = SubQueue::new( - CreateParams { id: ADMIN_QUEUE_ID, base: read_base, size: 1024 }, + CreateParams { + id: ADMIN_QUEUE_ID, + device_id: 0, + base: read_base, + size: 1024, + }, admin_cq.clone(), acc_mem(), ); @@ -1238,6 +1305,7 @@ mod test { let sq = SubQueue::new( CreateParams { id: ADMIN_QUEUE_ID, + device_id: 0, base: read_base, size: 5 * 1024, }, @@ -1248,13 +1316,18 @@ mod test { // I/O queues must be less than 64K let sq = SubQueue::new( - CreateParams { id: 1, base: read_base, size: 1024 }, + CreateParams { id: 1, device_id: 0, base: read_base, size: 1024 }, io_cq.clone(), acc_mem(), ); assert!(matches!(sq, Ok(_))); let sq = SubQueue::new( - CreateParams { id: 1, base: read_base, size: 65 * 1024 }, + CreateParams { + id: 1, + device_id: 0, + base: read_base, + size: 65 * 1024, + }, io_cq, acc_mem(), ); @@ -1262,13 +1335,18 @@ mod test { // Neither must be less than 2 let sq = SubQueue::new( - CreateParams { id: ADMIN_QUEUE_ID, base: read_base, size: 1 }, + CreateParams { + id: ADMIN_QUEUE_ID, + device_id: 0, + base: read_base, + size: 1, + }, admin_cq.clone(), acc_mem(), ); assert!(matches!(sq, Err(QueueCreateErr::InvalidSize))); let sq = SubQueue::new( - CreateParams { id: 1, base: read_base, size: 1 }, + CreateParams { id: 1, device_id: 0, base: read_base, size: 1 }, admin_cq, acc_mem(), ); @@ -1306,7 +1384,7 @@ mod test { // Create our queues let cq = Arc::new( CompQueue::new( - CreateParams { id: 1, base: write_base, size: 4 }, + 
CreateParams { id: 1, device_id: 0, base: write_base, size: 4 }, 0, hdl, acc_mem(), @@ -1315,7 +1393,7 @@ mod test { ); let sq = Arc::new( SubQueue::new( - CreateParams { id: 1, base: read_base, size: 4 }, + CreateParams { id: 1, device_id: 0, base: read_base, size: 4 }, cq.clone(), acc_mem(), ) @@ -1385,7 +1463,7 @@ mod test { // Purposely make the CQ smaller to test kicks let cq = Arc::new( CompQueue::new( - CreateParams { id: 1, base: write_base, size: 2 }, + CreateParams { id: 1, device_id: 0, base: write_base, size: 2 }, 0, hdl, acc_mem(), @@ -1394,7 +1472,7 @@ mod test { ); let sq = Arc::new( SubQueue::new( - CreateParams { id: 1, base: read_base, size: 4 }, + CreateParams { id: 1, device_id: 0, base: read_base, size: 4 }, cq.clone(), acc_mem(), ) @@ -1456,7 +1534,7 @@ mod test { let sq_size = rng.random_range(512..2048); let cq = Arc::new( CompQueue::new( - CreateParams { id: 1, base: write_base, size: 4 }, + CreateParams { id: 1, device_id: 0, base: write_base, size: 4 }, 0, hdl, acc_mem(), @@ -1465,7 +1543,12 @@ mod test { ); let sq = Arc::new( SubQueue::new( - CreateParams { id: 1, base: read_base, size: sq_size }, + CreateParams { + id: 1, + device_id: 0, + base: read_base, + size: sq_size, + }, cq.clone(), acc_mem(), ) diff --git a/lib/propolis/src/hw/nvme/requests.rs b/lib/propolis/src/hw/nvme/requests.rs index f755a3549..97ee8f4f6 100644 --- a/lib/propolis/src/hw/nvme/requests.rs +++ b/lib/propolis/src/hw/nvme/requests.rs @@ -12,17 +12,37 @@ use crate::hw::nvme::{bits, cmds::Completion, queue::SubQueue}; #[usdt::provider(provider = "propolis")] mod probes { - fn nvme_read_enqueue(qid: u16, idx: u16, cid: u16, off: u64, sz: u64) {} - fn nvme_read_complete(qid: u16, cid: u16, res: u8) {} - - fn nvme_write_enqueue(qid: u16, idx: u16, cid: u16, off: u64, sz: u64) {} - fn nvme_write_complete(qid: u16, cid: u16, res: u8) {} + // Note that unlike the probes in `queue.rs`, the probes here provide a + // `devsq_id` for completion as well as enqueuement. 
The submission queue is + // the one the command was originally submitted on. + // + // As long as queues are not destroyed (and the device is not reset), a + // `(devsq_id, cid)` tuple seen in an `nvme_*_enqueue` probably will not be + // reused before that same tuple is used in a corresponding + // `nvme_*_complete` probe. It is possible, but such a case is a + // guest error and unlikely. From the NVM Express Base Specification: + // + // > The Command Identifier field in the SQE shall be unique among all + // > outstanding commands associated with that queue. + fn nvme_read_enqueue(devsq_id: u64, idx: u16, cid: u16, off: u64, sz: u64) { + } + fn nvme_read_complete(devsq_id: u64, cid: u16, res: u8) {} + + fn nvme_write_enqueue( + devsq_id: u64, + idx: u16, + cid: u16, + off: u64, + sz: u64, + ) { + } + fn nvme_write_complete(devsq_id: u64, cid: u16, res: u8) {} - fn nvme_flush_enqueue(qid: u16, idx: u16, cid: u16) {} - fn nvme_flush_complete(qid: u16, cid: u16, res: u8) {} + fn nvme_flush_enqueue(devsq_id: u64, idx: u16, cid: u16) {} + fn nvme_flush_complete(devsq_id: u64, cid: u16, res: u8) {} fn nvme_raw_cmd( - qid: u16, + devsq_id: u64, cdw0nsid: u64, prp1: u64, prp2: u64, @@ -57,10 +77,10 @@ impl block::DeviceQueue for NvmeBlockQueue { let params = self.sq.params(); while let Some((sub, permit, idx)) = sq.pop() { - let qid = sq.id(); + let devsq_id = sq.devq_id(); probes::nvme_raw_cmd!(|| { ( - qid, + devsq_id, u64::from(sub.cdw0) | (u64::from(sub.nsid) << 32), sub.prp1, sub.prp2, @@ -83,7 +103,13 @@ impl block::DeviceQueue for NvmeBlockQueue { continue; } - probes::nvme_write_enqueue!(|| (qid, idx, cid, off, size)); + probes::nvme_write_enqueue!(|| ( + sq.devq_id(), + idx, + cid, + off, + size + )); let bufs = cmd.data(size, &mem).collect(); let req = @@ -102,7 +128,13 @@ impl block::DeviceQueue for NvmeBlockQueue { continue; } - probes::nvme_read_enqueue!(|| (qid, idx, cid, off, size)); + probes::nvme_read_enqueue!(|| ( + sq.devq_id(), + idx, + cid, + off, 
+ size + )); let bufs = cmd.data(size, &mem).collect(); let req = @@ -110,7 +142,7 @@ impl block::DeviceQueue for NvmeBlockQueue { return Some((req, permit, None)); } Ok(NvmCmd::Flush) => { - probes::nvme_flush_enqueue!(|| (qid, idx, cid)); + probes::nvme_flush_enqueue!(|| (sq.devq_id(), idx, cid)); let req = Request::new_flush(); return Some((req, permit, None)); } @@ -133,18 +165,18 @@ impl block::DeviceQueue for NvmeBlockQueue { result: block::Result, permit: Self::Token, ) { - let qid = permit.sqid(); + let devsq_id = permit.devsq_id(); let cid = permit.cid(); let resnum = result as u8; match op { Operation::Read(..) => { - probes::nvme_read_complete!(|| (qid, cid, resnum)); + probes::nvme_read_complete!(|| (devsq_id, cid, resnum)); } Operation::Write(..) => { - probes::nvme_write_complete!(|| (qid, cid, resnum)); + probes::nvme_write_complete!(|| (devsq_id, cid, resnum)); } Operation::Flush => { - probes::nvme_flush_complete!(|| (qid, cid, resnum)); + probes::nvme_flush_complete!(|| (devsq_id, cid, resnum)); } Operation::Discard(..) => { unreachable!("discard not supported in NVMe for now");