1#![expect(missing_docs)]
5#![cfg(target_os = "linux")]
6
7#![expect(unsafe_code)]
12
13mod ioctl;
14mod nvme;
15
16use anyhow::Context;
17use async_trait::async_trait;
18use blocking::unblock;
19use disk_backend::DiskError;
20use disk_backend::DiskIo;
21use disk_backend::UnmapBehavior;
22use disk_backend::pr::PersistentReservation;
23use disk_backend::pr::ReservationCapabilities;
24use disk_backend::pr::ReservationReport;
25use disk_backend::pr::ReservationType;
26use disk_backend::resolve::ResolveDiskParameters;
27use disk_backend::resolve::ResolvedDisk;
28use fs_err::PathExt;
29use guestmem::MemoryRead;
30use guestmem::MemoryWrite;
31use inspect::Inspect;
32use io_uring::opcode;
33use io_uring::types;
34use io_uring::types::RwFlags;
35use mesh::MeshPayload;
36use nvme::check_nvme_status;
37use nvme_spec::nvm;
38use pal::unix::affinity;
39use pal_uring::Initiate;
40use pal_uring::IoInitiator;
41use scsi_buffers::BounceBufferTracker;
42use scsi_buffers::RequestBuffers;
43use std::fmt::Debug;
44use std::fs;
45use std::os::unix::io::AsRawFd;
46use std::os::unix::prelude::FileTypeExt;
47use std::os::unix::prelude::MetadataExt;
48use std::path::Path;
49use std::path::PathBuf;
50use std::str::FromStr;
51use std::sync::Arc;
52use std::sync::atomic::AtomicU64;
53use std::sync::atomic::Ordering;
54use thiserror::Error;
55use uevent::CallbackHandle;
56use uevent::UeventListener;
57use vm_resource::AsyncResolveResource;
58use vm_resource::ResourceId;
59use vm_resource::ResourceResolver;
60use vm_resource::kind::DiskHandleKind;
61
/// Resolver that turns [`OpenBlockDeviceConfig`] handles into
/// [`BlockDevice`]-backed disks.
pub struct BlockDeviceResolver {
    /// io-uring used to issue asynchronous IO for resolved devices.
    uring: Arc<dyn Initiate>,
    /// Optional uevent listener used to observe block-device resize events.
    uevent_listener: Option<Arc<UeventListener>>,
    /// Tracker limiting the memory used for double-buffering unaligned IO.
    bounce_buffer_tracker: Arc<BounceBufferTracker>,
    /// When set, every IO is double buffered regardless of alignment.
    always_bounce: bool,
}
68
69impl BlockDeviceResolver {
70 pub fn new(
71 uring: Arc<dyn Initiate>,
72 uevent_listener: Option<Arc<UeventListener>>,
73 bounce_buffer_tracker: Arc<BounceBufferTracker>,
74 always_bounce: bool,
75 ) -> Self {
76 Self {
77 uring,
78 uevent_listener,
79 bounce_buffer_tracker,
80 always_bounce,
81 }
82 }
83}
84
/// Configuration for a disk backed by an already-open file or block-device
/// handle.
#[derive(MeshPayload)]
pub struct OpenBlockDeviceConfig {
    /// The open file or block-device handle.
    pub file: fs::File,
}
89
impl ResourceId<DiskHandleKind> for OpenBlockDeviceConfig {
    /// Resource identifier for block-device-backed disks.
    const ID: &'static str = "block";
}
93
/// Errors returned when resolving an [`OpenBlockDeviceConfig`] into a disk.
#[derive(Debug, Error)]
pub enum ResolveDiskError {
    /// Probing or wrapping the underlying file/device failed.
    #[error("failed to create new device")]
    NewDevice(#[source] NewDeviceError),
    /// The resulting device was rejected by the `disk_backend` layer.
    #[error("invalid disk")]
    InvalidDisk(#[source] disk_backend::InvalidDisk),
}
101
#[async_trait]
impl AsyncResolveResource<DiskHandleKind, OpenBlockDeviceConfig> for BlockDeviceResolver {
    type Output = ResolvedDisk;
    type Error = ResolveDiskError;

    /// Resolves an already-open block device or file into a [`ResolvedDisk`]
    /// using this resolver's io-uring, uevent listener, and bounce-buffer
    /// settings.
    async fn resolve(
        &self,
        _resolver: &ResourceResolver,
        rsrc: OpenBlockDeviceConfig,
        input: ResolveDiskParameters<'_>,
    ) -> Result<Self::Output, Self::Error> {
        let disk = BlockDevice::new(
            rsrc.file,
            input.read_only,
            self.uring.clone(),
            self.uevent_listener.as_deref(),
            self.bounce_buffer_tracker.clone(),
            self.always_bounce,
        )
        .await
        .map_err(ResolveDiskError::NewDevice)?;
        ResolvedDisk::new(disk).map_err(ResolveDiskError::InvalidDisk)
    }
}
126
127pub fn open_file_for_block(path: &Path, read_only: bool) -> std::io::Result<fs::File> {
129 use std::os::unix::prelude::*;
130
131 tracing::debug!(?path, read_only, "open_file_for_block");
132 fs::OpenOptions::new()
133 .read(true)
134 .write(!read_only)
135 .custom_flags(libc::O_DIRECT)
136 .open(path)
137}
138
/// A disk backed by a raw block device or regular file, issuing IO through
/// io-uring.
#[derive(Inspect)]
#[inspect(extra = "BlockDevice::inspect_extra")]
pub struct BlockDevice {
    file: Arc<fs::File>,
    /// Logical sector size in bytes (validated to be a power of two >= 512).
    sector_size: u32,
    /// Physical sector size in bytes; at least `sector_size`.
    physical_sector_size: u32,
    /// `sector_size.trailing_zeros()`, cached for shift arithmetic.
    sector_shift: u32,
    /// Current sector count; updated when a resize uevent is observed.
    sector_count: AtomicU64,
    /// Discard granularity in sectors; zero if discard is unsupported.
    optimal_unmap_sectors: u32,
    read_only: bool,
    #[inspect(skip)]
    uring: Arc<dyn Initiate>,
    #[inspect(flatten)]
    device_type: DeviceType,
    /// Whether the device supports persistent reservations.
    supports_pr: bool,
    /// Whether FUA writes (RWF_DSYNC) are honored by the device.
    supports_fua: bool,
    /// Keeps the uevent resize callback registered while the device lives.
    #[inspect(skip)]
    _uevent_filter: Option<CallbackHandle>,
    /// Resize notification state shared with the uevent callback.
    resize_epoch: Arc<ResizeEpoch>,
    /// Last resize epoch folded into `sector_count`.
    resized_acked: AtomicU64,
    #[inspect(skip)]
    bounce_buffer_tracker: Arc<BounceBufferTracker>,
    /// When set, all IO is double buffered regardless of alignment.
    always_bounce: bool,
}
164
/// Resize-notification state: `epoch` is incremented on each resize uevent,
/// and `event` wakes tasks blocked in `wait_resize`.
#[derive(Inspect, Debug, Default)]
#[inspect(transparent)]
struct ResizeEpoch {
    epoch: AtomicU64,
    #[inspect(skip)]
    event: event_listener::Event,
}
172
/// The kind of backing object behind a [`BlockDevice`].
#[derive(Debug, Copy, Clone, Inspect)]
#[inspect(tag = "device_type")]
enum DeviceType {
    /// A regular file. The sector count captured at open time is used to
    /// bounds-check writes.
    File {
        sector_count: u64,
    },
    /// A block device that is not recognized as NVMe.
    UnknownBlock,
    /// An NVMe namespace, identified via sysfs.
    NVMe {
        ns_id: u32,
        rescap: nvm::ReservationCapabilities,
    },
}
185
impl BlockDevice {
    /// Adds inspect fields that only apply to certain device types.
    fn inspect_extra(&self, resp: &mut inspect::Response<'_>) {
        match self.device_type {
            DeviceType::NVMe { .. } => {
                // Expose NVMe interrupt coalescing as a mutable inspect field.
                resp.field_mut_with("interrupt_aggregation", |new_value| {
                    self.inspect_interrupt_coalescing(new_value)
                });
            }
            DeviceType::UnknownBlock => {}
            DeviceType::File { .. } => {}
        }
    }

    /// Gets or sets the NVMe interrupt-coalescing feature through inspect.
    ///
    /// A new value must be formatted as
    /// `<aggregation_threshold> <aggregation_time>`. The current value is
    /// rendered in the same form, or `"not supported"` when the feature
    /// cannot be queried.
    fn inspect_interrupt_coalescing(&self, new_value: Option<&str>) -> anyhow::Result<String> {
        let coalescing = if let Some(new_value) = new_value {
            // Parse "<threshold> <time>"; any malformed piece yields None and
            // is reported as a single descriptive error.
            let coalescing = (|| {
                let (threshold, time) = new_value.split_once(' ')?;
                Some(
                    nvme::InterruptCoalescing::new()
                        .with_aggregation_threshold(threshold.parse().ok()?)
                        .with_aggregation_time(time.parse().ok()?),
                )
            })()
            .context("expected `<aggregation_threshold> <aggregation_time>`")?;
            nvme::nvme_set_features_interrupt_coalescing(&self.file, coalescing)?;
            coalescing
        } else if let Ok(coalescing) = nvme::nvme_get_features_interrupt_coalescing(&self.file) {
            coalescing
        } else {
            return Ok("not supported".into());
        };
        Ok(format!(
            "{} {}",
            coalescing.aggregation_threshold(),
            coalescing.aggregation_time()
        ))
    }
}
224
/// Errors from [`BlockDevice::new`].
#[derive(Debug, Error)]
pub enum NewDeviceError {
    /// An IO or ioctl operation on the device failed.
    #[error("block device ioctl error")]
    IoctlError(#[from] DiskError),
    /// Reading geometry/capability metadata (sysfs, NVMe identify) failed.
    #[error("failed to read device metadata")]
    DeviceMetadata(#[source] anyhow::Error),
    /// The handle is neither a regular file nor a block device.
    #[error("invalid file type, not a file or block device")]
    InvalidFileType,
    /// The device size failed validation.
    #[error("invalid disk size {0:#x}")]
    InvalidDiskSize(u64),
}
237
238impl BlockDevice {
239 pub async fn new(
247 file: fs::File,
248 read_only: bool,
249 uring: Arc<dyn Initiate>,
250 uevent_listener: Option<&UeventListener>,
251 bounce_buffer_tracker: Arc<BounceBufferTracker>,
252 always_bounce: bool,
253 ) -> Result<BlockDevice, NewDeviceError> {
254 let initiator = uring.initiator();
255 assert!(initiator.probe(opcode::Read::CODE));
256 assert!(initiator.probe(opcode::Write::CODE));
257 assert!(initiator.probe(opcode::Readv::CODE));
258 assert!(initiator.probe(opcode::Writev::CODE));
259 assert!(initiator.probe(opcode::Fsync::CODE));
260
261 let metadata = file.metadata().map_err(DiskError::Io)?;
262
263 let mut uevent_filter = None;
264 let resize_epoch = Arc::new(ResizeEpoch::default());
265
266 let devmeta = if metadata.file_type().is_block_device() {
267 let rdev = metadata.rdev();
268 let (major, minor) = (libc::major(rdev), libc::minor(rdev));
269
270 if let Some(uevent_listener) = uevent_listener {
272 let resize_epoch = resize_epoch.clone();
273 uevent_filter = Some(
274 uevent_listener
275 .add_block_resize_callback(major, minor, {
276 move || {
277 tracing::info!(major, minor, "disk resized");
278 resize_epoch.epoch.fetch_add(1, Ordering::SeqCst);
279 resize_epoch.event.notify(usize::MAX);
280 }
281 })
282 .await,
283 );
284 }
285
286 DeviceMetadata::from_block_device(&file, major, minor)
287 .map_err(NewDeviceError::DeviceMetadata)?
288 } else if metadata.file_type().is_file() {
289 DeviceMetadata::from_file(&metadata).map_err(NewDeviceError::DeviceMetadata)?
290 } else {
291 return Err(NewDeviceError::InvalidFileType);
292 };
293
294 let sector_size = devmeta.logical_block_size;
295 let sector_shift = sector_size.trailing_zeros();
296 let physical_sector_size = devmeta.physical_block_size.max(sector_size);
297 let sector_count = devmeta.disk_size >> sector_shift;
298 let unmap_granularity = devmeta.discard_granularity >> sector_shift;
299 let file = Arc::new(file);
300 let device = BlockDevice {
301 file,
302 sector_size,
303 physical_sector_size,
304 sector_shift: sector_size.trailing_zeros(),
305 sector_count: sector_count.into(),
306 optimal_unmap_sectors: unmap_granularity,
307 read_only,
308 uring,
309 device_type: devmeta.device_type,
310 supports_pr: devmeta.supports_pr,
311 supports_fua: devmeta.fua,
312 _uevent_filter: uevent_filter,
313 resize_epoch,
314 resized_acked: 0.into(),
315 bounce_buffer_tracker,
316 always_bounce,
317 };
318
319 Ok(device)
320 }
321
322 fn initiator(&self) -> &IoInitiator {
323 self.uring.initiator()
324 }
325
326 fn handle_resize(&self) {
327 if let Err(err) = self.handle_resize_inner() {
328 tracing::error!(
329 error = &err as &dyn std::error::Error,
330 "failed to update disk size"
331 );
332 }
333 }
334
335 fn handle_resize_inner(&self) -> std::io::Result<()> {
336 let mut acked = self.resized_acked.load(Ordering::SeqCst);
337 loop {
338 let epoch = self.resize_epoch.epoch.load(Ordering::SeqCst);
339 if acked == epoch {
340 break Ok(());
341 }
342
343 let size_in_bytes = ioctl::query_block_device_size_in_bytes(&self.file)?;
344
345 let new_sector_count = size_in_bytes / self.sector_size as u64;
346 let original_sector_count = self.sector_count.load(Ordering::SeqCst);
347
348 tracing::debug!(original_sector_count, new_sector_count, "resize");
349 if original_sector_count != new_sector_count {
350 tracing::info!(
351 original_sector_count,
352 new_sector_count,
353 "Disk size updating..."
354 );
355 self.sector_count.store(new_sector_count, Ordering::SeqCst);
356 }
357
358 acked = self
359 .resized_acked
360 .compare_exchange(acked, epoch, Ordering::SeqCst, Ordering::SeqCst)
361 .unwrap_or_else(|x| x);
362 }
363 }
364
365 fn map_io_error(&self, err: std::io::Error) -> DiskError {
366 if !matches!(self.device_type, DeviceType::File { .. }) {
367 match err.raw_os_error() {
368 Some(libc::EBADE) => return DiskError::ReservationConflict,
369 Some(libc::ENOSPC) => return DiskError::IllegalBlock,
370 _ => {}
371 }
372 }
373 DiskError::Io(err)
374 }
375}
376
/// Geometry and capability metadata probed from the backing file or device.
struct DeviceMetadata {
    device_type: DeviceType,
    /// Total size in bytes; validated to be a multiple of the logical block
    /// size.
    disk_size: u64,
    /// Logical block size in bytes (power of two, >= 512).
    logical_block_size: u32,
    /// Physical block size in bytes (power of two).
    physical_block_size: u32,
    /// Discard granularity in bytes; zero when discard is unsupported.
    discard_granularity: u32,
    /// Whether persistent reservations are supported.
    supports_pr: bool,
    /// Whether the device honors FUA writes.
    fua: bool,
}
386
impl DeviceMetadata {
    /// Reads block-device geometry from sysfs
    /// (`/sys/dev/block/{major}:{minor}`), detecting NVMe namespaces via the
    /// `nsid` attribute.
    fn from_block_device(file: &fs::File, major: u32, minor: u32) -> anyhow::Result<Self> {
        let devpath = PathBuf::from(format!("/sys/dev/block/{major}:{minor}"));
        // Fail early with a clear error if the sysfs node is missing.
        devpath
            .fs_err_metadata()
            .context("could not find sysfs path for block device")?;

        let mut supports_pr = false;

        // An `nsid` attribute identifies the device as an NVMe namespace.
        let device_type = match fs_err::read_to_string(devpath.join("nsid")) {
            Ok(ns_id) => {
                let ns_id = ns_id
                    .trim()
                    .parse()
                    .context("failed to parse NVMe namespace ID")?;

                let rescap = nvme::nvme_identify_namespace_data(file, ns_id)?.rescap;
                let oncs = nvme::nvme_identify_controller_data(file)?.oncs;
                tracing::debug!(rescap = ?rescap, oncs = ?oncs, "get identify data");
                // PR requires both controller support (ONCS) and a nonzero
                // namespace reservation-capabilities field.
                supports_pr = oncs.reservations() && u8::from(rescap) != 0;
                Some(DeviceType::NVMe { ns_id, rescap })
            }
            // Not NVMe; fall back to the generic block-device type.
            Err(err) if err.kind() == std::io::ErrorKind::NotFound => None,
            Err(err) => Err(err).context("failed to read NVMe namespace ID")?,
        };

        let device_type = device_type.unwrap_or(DeviceType::UnknownBlock);

        /// Reads and parses a single sysfs attribute, labeling errors with
        /// `msg`.
        fn read_val<T: FromStr>(devpath: &Path, path: &str, msg: &str) -> anyhow::Result<T>
        where
            T::Err: 'static + std::error::Error + Send + Sync,
        {
            fs_err::read_to_string(devpath.join(path))
                .with_context(|| format!("failed to read {msg}"))?
                .trim()
                .parse()
                .with_context(|| format!("failed to parse {msg}"))
        }

        let logical_block_size = read_val(&devpath, "queue/logical_block_size", "sector size")?;
        let physical_block_size = read_val(
            &devpath,
            "queue/physical_block_size",
            "physical sector size",
        )?;

        // sysfs reports `size` in 512-byte units, independent of the logical
        // block size.
        let disk_size = read_val::<u64>(&devpath, "size", "disk size")? * 512;
        let discard_granularity =
            read_val(&devpath, "queue/discard_granularity", "discard granularity")?;

        let fua = read_val::<u8>(&devpath, "queue/fua", "fua")? != 0;

        Self {
            device_type,
            disk_size,
            logical_block_size,
            physical_block_size,
            discard_granularity,
            supports_pr,
            fua,
        }
        .validate()
    }

    /// Builds metadata for a regular file, assuming 512-byte logical sectors.
    fn from_file(metadata: &fs::Metadata) -> anyhow::Result<Self> {
        let logical_block_size = 512;
        Self {
            device_type: DeviceType::File {
                sector_count: metadata.len() / logical_block_size as u64,
            },
            disk_size: metadata.size(),
            logical_block_size,
            physical_block_size: metadata.blksize() as u32,
            discard_granularity: 0,
            supports_pr: false,
            fua: false,
        }
        .validate()
    }

    /// Checks geometry invariants, returning `self` unchanged on success.
    fn validate(self) -> anyhow::Result<Self> {
        let Self {
            device_type: _,
            disk_size,
            logical_block_size,
            physical_block_size,
            discard_granularity,
            supports_pr: _,
            fua: _,
        } = self;
        if logical_block_size < 512 || !logical_block_size.is_power_of_two() {
            anyhow::bail!("invalid sector size {logical_block_size}");
        }
        if !physical_block_size.is_power_of_two() {
            anyhow::bail!("invalid physical sector size {physical_block_size}");
        }
        if disk_size % logical_block_size as u64 != 0 {
            anyhow::bail!("invalid disk size {disk_size:#x}");
        }
        if discard_granularity % logical_block_size != 0 {
            anyhow::bail!("invalid discard granularity {discard_granularity}");
        }
        Ok(self)
    }
}
497
impl DiskIo for BlockDevice {
    fn disk_type(&self) -> &str {
        "block_device"
    }

    /// Returns the current sector count, first folding in any pending resize
    /// notification.
    fn sector_count(&self) -> u64 {
        if self.resize_epoch.epoch.load(Ordering::Relaxed)
            != self.resized_acked.load(Ordering::Relaxed)
        {
            self.handle_resize();
        }
        self.sector_count.load(Ordering::Relaxed)
    }

    fn sector_size(&self) -> u32 {
        self.sector_size
    }

    /// This backend does not expose a stable disk ID.
    fn disk_id(&self) -> Option<[u8; 16]> {
        None
    }

    fn physical_sector_size(&self) -> u32 {
        self.physical_sector_size
    }

    fn is_fua_respected(&self) -> bool {
        self.supports_fua
    }

    fn is_read_only(&self) -> bool {
        self.read_only
    }

    /// Persistent reservations are only offered when the device advertised
    /// support at open time.
    fn pr(&self) -> Option<&dyn PersistentReservation> {
        if self.supports_pr { Some(self) } else { None }
    }

    /// Unlocks the door and ejects removable media via blocking ioctls run on
    /// a worker thread.
    async fn eject(&self) -> Result<(), DiskError> {
        let file = self.file.clone();
        unblock(move || {
            ioctl::lockdoor(&file, false)?;
            ioctl::eject(&file)
        })
        .await
        .map_err(|err| self.map_io_error(err))?;
        Ok(())
    }

    /// Reads `buffers.len()` bytes starting at `sector` into the request
    /// buffers, double buffering when alignment (or `always_bounce`) requires
    /// it.
    async fn read_vectored(
        &self,
        buffers: &RequestBuffers<'_>,
        sector: u64,
    ) -> Result<(), DiskError> {
        let io_size = buffers.len();
        tracing::trace!(sector, io_size, "read_vectored");

        // Read directly into locked guest memory when aligned; otherwise read
        // into a bounce buffer and copy out afterwards.
        let mut bounce_buffer = None;
        let locked;
        let should_bounce = self.always_bounce || !buffers.is_aligned(self.sector_size() as usize);
        let io_vecs = if !should_bounce {
            locked = buffers.lock(true)?;
            locked.io_vecs()
        } else {
            tracing::trace!("double buffering IO");

            bounce_buffer
                .insert(
                    self.bounce_buffer_tracker
                        .acquire_bounce_buffers(buffers.len(), affinity::get_cpu_number() as usize)
                        .await,
                )
                .buffer
                .io_vecs()
        };

        // SAFETY: the memory behind `io_vecs` (`locked` guest memory or the
        // bounce buffer) is held on this stack frame until the awaited IO
        // completes, so the kernel never writes through dangling iovecs.
        let (r, _) = unsafe {
            self.initiator().issue_io((), |_| {
                opcode::Readv::new(
                    types::Fd(self.file.as_raw_fd()),
                    io_vecs.as_ptr().cast(),
                    io_vecs.len() as u32,
                )
                .offset((sector * self.sector_size() as u64) as _)
                .build()
            })
        }
        .await;

        // A short read indicates the request ran past the end of the device.
        let bytes_read = r.map_err(|err| self.map_io_error(err))?;
        tracing::trace!(bytes_read, "read_vectored");
        if bytes_read != io_size as i32 {
            return Err(DiskError::IllegalBlock);
        }

        // Copy bounced data back into guest memory.
        if let Some(mut bounce_buffer) = bounce_buffer {
            buffers
                .writer()
                .write(bounce_buffer.buffer.as_mut_bytes())?;
        }
        Ok(())
    }

    /// Writes `buffers.len()` bytes starting at `sector`, using RWF_DSYNC for
    /// FUA requests and double buffering when alignment requires it.
    async fn write_vectored(
        &self,
        buffers: &RequestBuffers<'_>,
        sector: u64,
        fua: bool,
    ) -> Result<(), DiskError> {
        let io_size = buffers.len();
        tracing::trace!(sector, io_size, "write_vectored");

        // Files are not bounds-checked by the kernel the way block devices
        // are, so enforce the device size captured at open time.
        if let DeviceType::File { sector_count } = self.device_type {
            if sector + (io_size as u64 >> self.sector_shift) > sector_count {
                return Err(DiskError::IllegalBlock);
            }
        }

        let mut bounce_buffer;
        let locked;
        let should_bounce = self.always_bounce || !buffers.is_aligned(self.sector_size() as usize);
        let io_vecs = if !should_bounce {
            locked = buffers.lock(false)?;
            locked.io_vecs()
        } else {
            tracing::trace!("double buffering IO");
            bounce_buffer = self
                .bounce_buffer_tracker
                .acquire_bounce_buffers(buffers.len(), affinity::get_cpu_number() as usize)
                .await;
            buffers.reader().read(bounce_buffer.buffer.as_mut_bytes())?;
            bounce_buffer.buffer.io_vecs()
        };

        // Linux RWF_DSYNC (per-IO O_DSYNC semantics), used to implement FUA.
        const RWF_DSYNC: RwFlags = 0x00000002;

        // SAFETY: the memory behind `io_vecs` (`locked` guest memory or the
        // bounce buffer) is held on this stack frame until the awaited IO
        // completes.
        let (r, _) = unsafe {
            self.initiator().issue_io((), |_| {
                opcode::Writev::new(
                    types::Fd(self.file.as_raw_fd()),
                    io_vecs.as_ptr().cast::<libc::iovec>(),
                    io_vecs.len() as _,
                )
                .offset((sector * self.sector_size() as u64) as _)
                .rw_flags(if fua { RWF_DSYNC } else { 0 })
                .build()
            })
        }
        .await;

        // A short write indicates the request ran past the end of the device.
        let bytes_written = r.map_err(|err| self.map_io_error(err))?;
        tracing::trace!(bytes_written, "write_vectored");
        if bytes_written != io_size as i32 {
            return Err(DiskError::IllegalBlock);
        }

        Ok(())
    }

    /// Flushes the device's write cache via an io-uring fsync.
    async fn sync_cache(&self) -> Result<(), DiskError> {
        // SAFETY: the fsync submission references no caller-owned buffers,
        // only the file descriptor, which outlives the awaited IO.
        unsafe {
            self.initiator()
                .issue_io((), |_| {
                    opcode::Fsync::new(types::Fd(self.file.as_raw_fd())).build()
                })
                .await
                .0
                .map_err(|err| self.map_io_error(err))?;
        }
        Ok(())
    }

    /// Waits until the sector count differs from `sector_count`, registering
    /// the listener before re-checking to avoid missing a wakeup.
    async fn wait_resize(&self, sector_count: u64) -> u64 {
        loop {
            let listen = self.resize_epoch.event.listen();
            let current = self.sector_count();
            if current != sector_count {
                break current;
            }
            listen.await;
        }
    }

    /// Discards the given sector range via the blocking discard ioctl.
    async fn unmap(
        &self,
        sector_offset: u64,
        sector_count: u64,
        _block_level_only: bool,
    ) -> Result<(), DiskError> {
        let file = self.file.clone();
        let file_offset = sector_offset << self.sector_shift;
        let length = sector_count << self.sector_shift;
        tracing::debug!(file = ?file, file_offset, length, "unmap_async");
        match unblock(move || ioctl::discard(&file, file_offset, length)).await {
            Ok(()) => {}
            // An out-of-range request is reported as an illegal block rather
            // than a raw IO error.
            Err(_) if sector_offset + sector_count > self.sector_count() => {
                return Err(DiskError::IllegalBlock);
            }
            Err(err) => return Err(self.map_io_error(err)),
        }
        Ok(())
    }

    /// Discard support is reported as unspecified (results after discard are
    /// not guaranteed), or ignored when the device has no discard granularity.
    fn unmap_behavior(&self) -> UnmapBehavior {
        if self.optimal_unmap_sectors == 0 {
            UnmapBehavior::Ignored
        } else {
            UnmapBehavior::Unspecified
        }
    }

    fn optimal_unmap_sectors(&self) -> u32 {
        self.optimal_unmap_sectors
    }
}
723
#[async_trait::async_trait]
impl PersistentReservation for BlockDevice {
    fn capabilities(&self) -> ReservationCapabilities {
        match &self.device_type {
            &DeviceType::NVMe { rescap, .. } => {
                nvme_common::from_nvme_reservation_capabilities(rescap)
            }
            // `pr()` only hands out this trait when `supports_pr` is set,
            // which is only possible for NVMe devices.
            DeviceType::File { .. } | DeviceType::UnknownBlock => unreachable!(),
        }
    }

    /// Queries the current reservation state via the NVMe report path.
    async fn report(&self) -> Result<ReservationReport, DiskError> {
        assert!(matches!(self.device_type, DeviceType::NVMe { .. }));
        self.nvme_persistent_reservation_report()
            .await
            .map_err(|err| self.map_io_error(err))
    }

    /// Registers `new_key`, replacing `current_key` if provided.
    ///
    /// A request to disable persist-through-power-loss is ignored with a
    /// warning rather than failed.
    async fn register(
        &self,
        current_key: Option<u64>,
        new_key: u64,
        ptpl: Option<bool>,
    ) -> Result<(), DiskError> {
        assert!(matches!(self.device_type, DeviceType::NVMe { .. }));

        if ptpl == Some(false) {
            tracing::warn!("ignoring guest request to disable persist through power loss");
        }

        let file = self.file.clone();
        unblock(move || {
            ioctl::pr_register(
                &file,
                current_key.unwrap_or(0),
                new_key,
                // With no current key, ask the kernel to ignore the existing
                // registration key.
                if current_key.is_none() {
                    ioctl::PR_FL_IGNORE_KEY
                } else {
                    0
                },
            )
        })
        .await
        .and_then(check_nvme_status)
        .map_err(|err| self.map_io_error(err))?;
        Ok(())
    }

    /// Acquires a reservation of `reservation_type` using the registered
    /// `key`.
    async fn reserve(&self, key: u64, reservation_type: ReservationType) -> Result<(), DiskError> {
        assert!(matches!(self.device_type, DeviceType::NVMe { .. }));
        let file = self.file.clone();
        unblock(move || ioctl::pr_reserve(&file, reservation_type, key))
            .await
            .and_then(check_nvme_status)
            .map_err(|err| self.map_io_error(err))?;
        Ok(())
    }

    /// Releases the reservation of `reservation_type` held under `key`.
    async fn release(&self, key: u64, reservation_type: ReservationType) -> Result<(), DiskError> {
        assert!(matches!(self.device_type, DeviceType::NVMe { .. }));
        let file = self.file.clone();
        unblock(move || ioctl::pr_release(&file, reservation_type, key))
            .await
            .and_then(check_nvme_status)
            .map_err(|err| self.map_io_error(err))?;
        Ok(())
    }

    /// Clears all registrations and any reservation, authorized by `key`.
    async fn clear(&self, key: u64) -> Result<(), DiskError> {
        assert!(matches!(self.device_type, DeviceType::NVMe { .. }));
        let file = self.file.clone();
        unblock(move || ioctl::pr_clear(&file, key))
            .await
            .and_then(check_nvme_status)
            .map_err(|err| self.map_io_error(err))?;
        Ok(())
    }

    /// Preempts (optionally aborting) the registrant holding `preempt_key`,
    /// authorized by `current_key`.
    async fn preempt(
        &self,
        current_key: u64,
        preempt_key: u64,
        reservation_type: ReservationType,
        abort: bool,
    ) -> Result<(), DiskError> {
        assert!(matches!(self.device_type, DeviceType::NVMe { .. }));
        let file = self.file.clone();
        unblock(move || {
            ioctl::pr_preempt(&file, reservation_type, current_key, preempt_key, abort)
        })
        .await
        .and_then(check_nvme_status)
        .map_err(|err| self.map_io_error(err))?;
        Ok(())
    }
}
824
#[cfg(test)]
mod tests {
    use super::*;
    use futures::executor::block_on;
    use guestmem::GuestMemory;
    use hvdef::HV_PAGE_SIZE;
    use hvdef::HV_PAGE_SIZE_USIZE;
    use once_cell::sync::OnceCell;
    use pal_async::async_test;
    use pal_uring::IoUringPool;
    use scsi_buffers::OwnedRequestBuffers;

    /// Returns true on kernel versions where these tests are skipped.
    // NOTE(review): 5.13 is presumably excluded due to an io-uring defect in
    // that kernel series — confirm before relying on this check.
    fn is_buggy_kernel() -> bool {
        let output = String::from_utf8(
            std::process::Command::new("uname")
                .arg("-r")
                .output()
                .unwrap()
                .stdout,
        )
        .unwrap();

        output.contains("5.13")
    }

    /// Creates a [`BlockDevice`] over a fresh 64 KiB temporary file, sharing
    /// one io-uring pool (and its worker thread) across all tests.
    fn new_block_device() -> Result<BlockDevice, NewDeviceError> {
        static POOL: OnceCell<Arc<IoInitiator>> = OnceCell::new();

        let initiator = POOL
            .get_or_try_init(|| {
                let pool = IoUringPool::new("test", 16)?;
                let initiator = pool.client().initiator().clone();
                std::thread::spawn(|| pool.run());
                Ok(Arc::new(initiator))
            })
            .map_err(|err| NewDeviceError::IoctlError(DiskError::Io(err)))?;

        let bounce_buffer_tracker = Arc::new(BounceBufferTracker::new(
            2048,
            affinity::num_procs() as usize,
        ));

        let test_file = tempfile::tempfile().unwrap();
        test_file.set_len(1024 * 64).unwrap();
        block_on(BlockDevice::new(
            test_file.try_clone().unwrap(),
            false,
            initiator.clone(),
            None,
            bounce_buffer_tracker,
            false,
        ))
    }

    /// Creates a test device, or returns early (skipping the test) when the
    /// environment cannot support it.
    macro_rules! get_block_device_or_skip {
        () => {
            match new_block_device() {
                Ok(pool) => {
                    if is_buggy_kernel() {
                        println!("Test case skipped (buggy kernel version)");
                        return;
                    }

                    pool
                }
                // io-uring unavailable (e.g. old kernel or seccomp): skip.
                Err(NewDeviceError::IoctlError(DiskError::Io(err)))
                    if err.raw_os_error() == Some(libc::ENOSYS) =>
                {
                    println!("Test case skipped (no IO-Uring support)");
                    return;
                }
                Err(err) => panic!("{}", err),
            }
        };
    }

    /// Writes four pages, reads them back into different pages, and checks
    /// the round trip, with page-aligned buffers.
    async fn run_async_disk_io(fua: bool) {
        let disk = get_block_device_or_skip!();

        let test_guest_mem = GuestMemory::allocate(0x8000);
        test_guest_mem
            .write_at(0, &(0..0x8000).map(|x| x as u8).collect::<Vec<_>>())
            .unwrap();

        let write_buffers = OwnedRequestBuffers::new(&[3, 2, 1, 0]);
        disk.write_vectored(&write_buffers.buffer(&test_guest_mem), 0, fua)
            .await
            .unwrap();

        if !fua {
            disk.sync_cache().await.unwrap();
        }

        let read_buffers = OwnedRequestBuffers::new(&[7, 6, 5, 4]);
        disk.read_vectored(&read_buffers.buffer(&test_guest_mem), 0)
            .await
            .unwrap();

        // The data read into pages 4..8 must match what was written from
        // pages 0..4 (both sides use the same reversed page order).
        let mut source = vec![0u8; 4 * HV_PAGE_SIZE_USIZE];
        test_guest_mem.read_at(0, &mut source).unwrap();
        let mut target = vec![0u8; 4 * HV_PAGE_SIZE_USIZE];
        test_guest_mem
            .read_at(4 * HV_PAGE_SIZE, &mut target)
            .unwrap();
        assert_eq!(source, target);
    }

    #[async_test]
    async fn test_async_disk_io() {
        run_async_disk_io(false).await;
    }

    #[async_test]
    async fn test_async_disk_io_fua() {
        run_async_disk_io(true).await;
    }

    /// Same round trip as `run_async_disk_io`, but with buffers offset by 512
    /// bytes so the bounce-buffer (double-buffering) path is exercised.
    async fn run_async_disk_io_unaligned(fua: bool) {
        let disk = get_block_device_or_skip!();

        let test_guest_mem = GuestMemory::allocate(0x8000);
        test_guest_mem
            .write_at(0, &(0..0x8000).map(|x| x as u8).collect::<Vec<_>>())
            .unwrap();

        let write_buffers =
            OwnedRequestBuffers::new_unaligned(&[0, 1, 2, 3], 512, 3 * HV_PAGE_SIZE_USIZE);

        disk.write_vectored(&write_buffers.buffer(&test_guest_mem), 0, fua)
            .await
            .unwrap();

        if !fua {
            disk.sync_cache().await.unwrap();
        }

        let read_buffers =
            OwnedRequestBuffers::new_unaligned(&[4, 5, 6, 7], 512, 3 * HV_PAGE_SIZE_USIZE);
        disk.read_vectored(&read_buffers.buffer(&test_guest_mem), 0)
            .await
            .unwrap();

        let mut source = vec![0u8; 3 * HV_PAGE_SIZE_USIZE];
        test_guest_mem.read_at(512, &mut source).unwrap();
        let mut target = vec![0u8; 3 * HV_PAGE_SIZE_USIZE];
        test_guest_mem
            .read_at(4 * HV_PAGE_SIZE + 512, &mut target)
            .unwrap();
        assert_eq!(source, target);
    }

    #[async_test]
    async fn test_async_disk_io_unaligned() {
        run_async_disk_io_unaligned(false).await;
    }

    #[async_test]
    async fn test_async_disk_io_unaligned_fua() {
        run_async_disk_io_unaligned(true).await;
    }

    /// A write past the end of the backing file must fail with IllegalBlock.
    #[async_test]
    async fn test_illegal_lba() {
        let disk = get_block_device_or_skip!();
        let gm = GuestMemory::allocate(512);
        match disk
            .write_vectored(
                &OwnedRequestBuffers::linear(0, 512, true).buffer(&gm),
                i64::MAX as u64 / 512,
                false,
            )
            .await
        {
            Err(DiskError::IllegalBlock) => {}
            r => panic!("unexpected result: {:?}", r),
        }
    }
}
1004}