Skip to content

Commit ffdc8f7

Browse files
authored
nvme,mana: ensure completions are read and written in order (#506)
The NVMe completion entry is a 16-byte value, and it contains a phase bit that the NVMe driver reads to determine when a completion in the queue is valid. To ensure the driver observes a consistent completion entry, the phase bit must be written by the device _after_ the rest of the entry, and it must be read by the driver _before_ the rest. The memory access machinery in OpenVMM does not provide many ordering guarantees, especially on ARM64. Ensure accesses are performed in the right order via two atomic 8-byte accesses, plus proper fences, in the NVMe emulator and driver. There's a similar bug in MANA. Fix this, too.
1 parent 65f05d6 commit ffdc8f7

File tree

4 files changed

+48
-7
lines changed

4 files changed

+48
-7
lines changed

vm/devices/net/gdma/src/queues.rs

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,7 @@ use parking_lot::Mutex;
2424
use parking_lot::MutexGuard;
2525
use pci_core::capabilities::msix::MsixEmulator;
2626
use std::marker::PhantomData;
27+
use std::sync::atomic::Ordering::Release;
2728
use std::task::Context;
2829
use std::task::Poll;
2930
use std::task::Waker;
@@ -95,9 +96,19 @@ impl<T: AsBytes> CqEq<T> {
9596
let offset = (self.tail & (self.cap - 1)) as usize * size_of::<T>();
9697
let mut range = self.region.range();
9798
range.skip(offset);
98-
if let Err(err) = range.writer(gm).write(entry.as_bytes()) {
99+
let mut writer = range.writer(gm);
100+
let (entry, last) = entry.as_bytes().split_at(entry.as_bytes().len() - 1);
101+
if let Err(err) = writer.write(entry) {
99102
tracing::warn!(err = &err as &dyn std::error::Error, "failed to write");
100103
}
104+
// Write the final byte last after a release fence to ensure that the
105+
// guest sees the entire entry before the owner count is updated.
106+
std::sync::atomic::fence(Release);
107+
if let Err(err) = writer.write(last) {
108+
tracing::warn!(err = &err as &dyn std::error::Error, "failed to write");
109+
}
110+
// Ensure the write is flushed before sending the interrupt.
111+
std::sync::atomic::fence(Release);
101112
let new_tail = self.tail.wrapping_add(1);
102113
self.tail = new_tail;
103114
std::mem::take(&mut self.armed)

vm/devices/net/mana_driver/src/queues.rs

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,7 @@ use gdma_defs::OWNER_MASK;
2323
use gdma_defs::WQE_ALIGNMENT;
2424
use inspect::Inspect;
2525
use std::marker::PhantomData;
26+
use std::sync::atomic::Ordering::Acquire;
2627
use std::sync::Arc;
2728
use user_driver::memory::MemoryBlock;
2829
use zerocopy::AsBytes;
@@ -140,7 +141,11 @@ impl<T: AsBytes + FromBytes> CqEq<T> {
140141

141142
/// Pops an event queue entry.
142143
pub fn pop(&mut self) -> Option<T> {
143-
let b = self.read_next::<u8>(size_of::<T>() as u32 - 1);
144+
// Perform an acquire load to ensure that the read of the queue entry is
145+
// not reordered before the read of the owner count.
146+
let b = self.mem.as_slice()
147+
[(self.next.wrapping_add(size_of::<T>() as u32 - 1) & (self.size - 1)) as usize]
148+
.load(Acquire);
144149
let owner_count = b >> 5;
145150
let cur_owner_count = (self.next >> self.shift) as u8;
146151
if owner_count == (cur_owner_count.wrapping_sub(1)) & OWNER_MASK as u8 {

vm/devices/storage/disk_nvme/nvme_driver/src/queues.rs

Lines changed: 17 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,10 @@ use crate::driver::save_restore::CompletionQueueSavedState;
88
use crate::driver::save_restore::SubmissionQueueSavedState;
99
use crate::registers::DeviceRegisters;
1010
use inspect::Inspect;
11+
use safeatomic::AtomicSliceOps;
12+
use std::sync::atomic::AtomicU64;
13+
use std::sync::atomic::Ordering::Acquire;
14+
use std::sync::atomic::Ordering::Relaxed;
1115
use user_driver::memory::MemoryBlock;
1216
use user_driver::DeviceBacking;
1317

@@ -133,12 +137,21 @@ impl CompletionQueue {
133137
}
134138

135139
pub fn read(&mut self) -> Option<spec::Completion> {
136-
let completion = self
137-
.mem
138-
.read_obj::<spec::Completion>(self.head as usize * size_of::<spec::Completion>());
139-
if completion.status.phase() != self.phase {
140+
let completion_mem = self.mem.as_slice()
141+
[self.head as usize * size_of::<spec::Completion>()..]
142+
[..size_of::<spec::Completion>() * 2]
143+
.as_atomic_slice::<AtomicU64>()
144+
.unwrap();
145+
146+
// Check the phase bit, using an acquire read to ensure the rest of the
147+
// completion is read with or after the phase bit.
148+
let high = completion_mem[1].load(Acquire);
149+
let status = spec::CompletionStatus::from((high >> 48) as u16);
150+
if status.phase() != self.phase {
140151
return None;
141152
}
153+
let low = completion_mem[0].load(Relaxed);
154+
let completion: spec::Completion = zerocopy::transmute!([low, high]);
142155
self.head += 1;
143156
if self.head == self.len {
144157
self.head = 0;

vm/devices/storage/nvme/src/queue.rs

Lines changed: 13 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -266,8 +266,20 @@ impl CompletionQueue {
266266
return Ok(false);
267267
}
268268
data.status.set_phase(self.phase);
269-
mem.write_plain(self.gpa.wrapping_add(self.tail as u64 * 16), &data)
269+
270+
// Atomically write the low part of the completion entry first, then the
271+
// high part, using release fences to ensure ordering.
272+
//
273+
// This is necessary to ensure the guest can observe the full completion
274+
// once it observes the phase bit change (which is in the high part).
275+
let [low, high]: [u64; 2] = zerocopy::transmute!(data);
276+
let gpa = self.gpa.wrapping_add(self.tail as u64 * 16);
277+
mem.write_plain(gpa, &low).map_err(QueueError::Memory)?;
278+
std::sync::atomic::fence(Ordering::Release);
279+
mem.write_plain(gpa + 8, &high)
270280
.map_err(QueueError::Memory)?;
281+
std::sync::atomic::fence(Ordering::Release);
282+
271283
if let Some(interrupt) = &self.interrupt {
272284
interrupt.deliver();
273285
}

0 commit comments

Comments (0)