-
Notifications
You must be signed in to change notification settings - Fork 29
block/file: do pread/pwrite from Propolis heap instead of VM memory #985
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -754,6 +754,44 @@ pub trait MappingExt { | |
| fn pwritev(&self, fd: RawFd, offset: i64) -> Result<usize>; | ||
| } | ||
|
|
||
| // Gross hack alert: since the mappings below are memory regions backed by | ||
| // segvmm_ops, `zvol_{read,write}` and similar will end up contending on | ||
| // `svmd->svmd_lock`. Instead, as long as the I/O is small enough we'll tolerate | ||
| // it, copy from guest memory to Propolis heap. The segment backing Propolis' | ||
| // heap has an `as_page{,un}lock` impl that avoids the more | ||
| // expensive/contentious `as_fault()` fallback. | ||
| // | ||
| // This is an optimization until stlouis#871 can get things sorted, at | ||
| // which point it should be strictly worse than directly using the | ||
| // requested mappings. | ||
| // | ||
| // 1 MiB is an arbitrary-ish choice: `propolis-server` and `propolis-standalone` | ||
| // set NVMe MDTS to "8", so the largest I/Os from NVMe will be | ||
| // `2**8 * 4096 == 1048576 bytes == 1 MiB`. Beyond this, fall back to using | ||
| // iovecs directly, potentially at increased OS overhead. | ||
| // | ||
| // The amount of memory used for temporary buffers is given by the number of | ||
| // worker threads for all file-backed disks, times this threshold. It works out | ||
| // to up to 8 MiB (8 worker threads) of buffers per disk by default as of | ||
| // writing. | ||
| const MAPPING_IO_LIMIT_BYTES: usize = crate::common::MB; | ||
|
|
||
| /// Compute the number of bytes that would be required to hold these mappings | ||
| /// sequentially. | ||
| /// | ||
| /// Ranges covered by multiple mappings are counted repeatedly. | ||
| fn total_mapping_size(mappings: &[SubMapping<'_>]) -> Result<usize> { | ||
| mappings | ||
| .iter() | ||
| .try_fold(0usize, |total, mapping| total.checked_add(mapping.len)) | ||
| .ok_or_else(|| { | ||
| Error::new( | ||
| ErrorKind::InvalidInput, | ||
| "Total mapping larger than a `usize`", | ||
| ) | ||
| }) | ||
| } | ||
|
|
||
| impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T { | ||
| fn preadv(&self, fd: RawFd, offset: i64) -> Result<usize> { | ||
| if !self | ||
|
|
@@ -767,23 +805,82 @@ impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T { | |
| )); | ||
| } | ||
|
|
||
| let iov = self | ||
| .as_ref() | ||
| .iter() | ||
| .map(|mapping| iovec { | ||
| iov_base: mapping.ptr.as_ptr() as *mut libc::c_void, | ||
| iov_len: mapping.len, | ||
| }) | ||
| .collect::<Vec<_>>(); | ||
| let total_capacity = total_mapping_size(self.as_ref())?; | ||
|
|
||
| // Gross hack: see the comment on `MAPPING_IO_LIMIT_BYTES`. | ||
| if total_capacity <= MAPPING_IO_LIMIT_BYTES { | ||
| // If we're motivated to avoid the zero-fill via | ||
| // `Layout::with_size_align` + `GlobalAlloc::alloc`, we should | ||
| // probably avoid this gross hack entirely (see comment on | ||
| // MAPPING_IO_LIMIT_BYTES). | ||
|
Comment on lines
+812
to
+815
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fair!
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so it since has turned out that the zero-filling doesn't take a remarkable amount of CPU time (compared to the copying), but the |
||
| let mut buf = vec![0; total_capacity]; | ||
|
|
||
| let iov = [iovec { | ||
| iov_base: buf.as_mut_ptr() as *mut libc::c_void, | ||
| iov_len: buf.len(), | ||
| }]; | ||
|
|
||
| let res = unsafe { | ||
| libc::preadv(fd, iov.as_ptr(), iov.len() as libc::c_int, offset) | ||
| }; | ||
| if res == -1 { | ||
| return Err(Error::last_os_error()); | ||
| } | ||
| let read = res as usize; | ||
|
|
||
| // copy `read` bytes back into the iovecs and return | ||
| let mut remaining = &mut buf[..read]; | ||
| for mapping in self.as_ref().iter() { | ||
| let to_copy = std::cmp::min(remaining.len(), mapping.len); | ||
|
|
||
| // Safety: guest physical memory is READ|WRITE, and won't become | ||
| // unmapped as long as we hold a Mapping, which `self` implies. | ||
| // | ||
| // The guest may be concurrently modifying this memory, we may | ||
| // be concurrently reading or writing this memory (if it is | ||
| // submitted for a read or write on another thread, for | ||
| // example), but `copy_nonoverlapping` has no compunctions about | ||
| // concurrent mutation. | ||
|
Comment on lines
+842
to
+843
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. perhaps worth noting explicitly that if a guest does decide it wants to screw up its own iovec for some reason, it would be pretty damn hard for LLVM to figure out a way to miscompile us in that case?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We could also drop down to something like the volatile copy intrinsic, which is sadly not exposed in the standard library (presumably via
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I'm definitely thinking more about "same page submitted in another I/O concurrently", where we're working on the same range on a different thread in Propolis, more than the guest happens to scribble on memory at the same time. before, I'd broken the rules but hadn't seen a better option and really just needed to justify to myself why it might be OK. now I just want to emphasize things can be happening (more than a normal but generally "operate on
would be a bit too limiting, I think! that intrinsic would demand a byte-wise copy loop, not |
||
| unsafe { | ||
| copy_nonoverlapping::<u8>( | ||
| remaining.as_ptr(), | ||
| mapping.ptr.as_ptr(), | ||
| mapping.len, | ||
| ); | ||
| } | ||
|
|
||
| remaining = remaining.split_at_mut(to_copy).1; | ||
|
|
||
| if remaining.len() == 0 { | ||
| // Either we're at the last iov and we're finished copying | ||
| // back into the guest, or `preadv` did a short read. | ||
| break; | ||
| } | ||
| } | ||
|
|
||
| let read = unsafe { | ||
| libc::preadv(fd, iov.as_ptr(), iov.len() as libc::c_int, offset) | ||
| }; | ||
| if read == -1 { | ||
| return Err(Error::last_os_error()); | ||
| } | ||
| // We should never read more than the guest mappings could hold. | ||
| assert_eq!(remaining.len(), 0); | ||
|
|
||
| Ok(read as usize) | ||
| Ok(read) | ||
| } else { | ||
| let iov = self | ||
| .as_ref() | ||
| .iter() | ||
| .map(|mapping| iovec { | ||
| iov_base: mapping.ptr.as_ptr() as *mut libc::c_void, | ||
| iov_len: mapping.len, | ||
| }) | ||
| .collect::<Vec<_>>(); | ||
|
|
||
| let read = unsafe { | ||
| libc::preadv(fd, iov.as_ptr(), iov.len() as libc::c_int, offset) | ||
| }; | ||
| if read == -1 { | ||
| return Err(Error::last_os_error()); | ||
| } | ||
| let read: usize = read.try_into().expect("read is positive"); | ||
|
Comment on lines
+875
to
+881
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i thought we had someplace a thingy for doing a libc call and checking errno if the return value is -1, but after looking closer, it looks like this is just for i'm imagining a thingy like fn syscall_result(x: i32) -> Result<usize, io::Error> {
usize::try_from(x).map_err(|_| io::Error::last_os_error())
}not a blocker, naturally! |
||
| Ok(read) | ||
| } | ||
| } | ||
|
|
||
| fn pwritev(&self, fd: RawFd, offset: i64) -> Result<usize> { | ||
|
|
@@ -798,18 +895,70 @@ impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T { | |
| )); | ||
| } | ||
|
|
||
| let iov = self | ||
| .as_ref() | ||
| .iter() | ||
| .map(|mapping| iovec { | ||
| iov_base: mapping.ptr.as_ptr() as *mut libc::c_void, | ||
| iov_len: mapping.len, | ||
| }) | ||
| .collect::<Vec<_>>(); | ||
| let total_capacity = total_mapping_size(self.as_ref())?; | ||
|
|
||
| // Gross hack: see the comment on `MAPPING_IO_LIMIT_BYTES`. | ||
| let written = if total_capacity <= MAPPING_IO_LIMIT_BYTES { | ||
| // If we're motivated to avoid the zero-fill via | ||
| // `Layout::with_size_align` + `GlobalAlloc::alloc`, we should | ||
| // probably avoid this gross hack entirely (see comment on | ||
| // MAPPING_IO_LIMIT_BYTES). | ||
| let mut buf = vec![0; total_capacity]; | ||
|
|
||
| let mut remaining = buf.as_mut_slice(); | ||
| for mapping in self.as_ref().iter() { | ||
| // Safety: guest physical memory is READ|WRITE, and won't become | ||
| // unmapped as long as we hold a Mapping, which `self` implies. | ||
| // | ||
| // The guest may be concurrently modifying this memory, we may | ||
| // be concurrently reading or writing this memory (if it is | ||
| // submitted for a read or write on another thread, for | ||
| // example), but `copy_nonoverlapping` has no compunctions about | ||
| // concurrent mutation. | ||
| unsafe { | ||
| copy_nonoverlapping::<u8>( | ||
| mapping.ptr.as_ptr() as *const u8, | ||
| remaining.as_mut_ptr(), | ||
| mapping.len, | ||
| ); | ||
| } | ||
|
|
||
| remaining = remaining.split_at_mut(mapping.len).1; | ||
| } | ||
|
|
||
| let written = unsafe { | ||
| libc::pwritev(fd, iov.as_ptr(), iov.len() as libc::c_int, offset) | ||
| let iovs = [iovec { | ||
| iov_base: buf.as_mut_ptr() as *mut libc::c_void, | ||
| iov_len: buf.len(), | ||
| }]; | ||
|
|
||
| unsafe { | ||
| libc::pwritev( | ||
| fd, | ||
| iovs.as_ptr(), | ||
| iovs.len() as libc::c_int, | ||
| offset, | ||
| ) | ||
| } | ||
iximeow marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } else { | ||
| let iovs = self | ||
| .as_ref() | ||
| .iter() | ||
| .map(|mapping| iovec { | ||
| iov_base: mapping.ptr.as_ptr() as *mut libc::c_void, | ||
| iov_len: mapping.len, | ||
| }) | ||
| .collect::<Vec<_>>(); | ||
|
|
||
| unsafe { | ||
| libc::pwritev( | ||
| fd, | ||
| iovs.as_ptr(), | ||
| iovs.len() as libc::c_int, | ||
| offset, | ||
| ) | ||
| } | ||
| }; | ||
|
|
||
| if written == -1 { | ||
| return Err(Error::last_os_error()); | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
wow, i hate this a lot, but your rationale makes sense. i'll let you have it. :)