disklayer_sqlite/
lib.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! SQLite-backed disk layer implementation.
5//!
6//! At this time, **this layer is only designed for use in dev/test scenarios!**
7//!
8//! # DISCLAIMER: Stability
9//!
10//! There are no stability guarantees around the on-disk data format! The schema
11//! can and will change without warning!
12//!
13//! # DISCLAIMER: Performance
14//!
15//! This implementation has only been minimally optimized! Don't expect to get
16//! incredible perf from this disk backend!
17//!
18//! Notably:
19//!
20//! - Data is stored within a single `sectors` table as tuples of `(sector:
21//!   INTEGER, sector_data: BLOB(sector_size))`. All data is accessed in
22//!   `sector_size` chunks (i.e: without performing any kind of adjacent-sector
23//!   coalescing).
24//! - Reads and writes currently allocate many temporary `Vec<u8>` buffers per
25//!   operation, without any buffer reuse.
26//!
27//! These design choices were made with simplicity and expediency in mind, given
28//! that the primary use-case for this backend is for dev/test scenarios. If
29//! performance ever becomes a concern, there are various optimizations that
30//! should be possible to implement here, though quite frankly, investing in a
31//! cross-platform QCOW2 or VHDX disk backend is likely a far more worthwhile
32//! endeavor.
33//!
34//! # Context
35//!
36//! In late 2024, OpenVMM was missing a _cross-platform_ disk backend that
37//! supported the following key features:
38//!
//! - Used a dynamically-sized file as the disk's backing store
40//! - Supported snapshots / differencing disks
41//!
//! While OpenVMM will eventually need to support one or more of the current
//! "industry standard" virtual disk formats that support these features (e.g:
44//! QCOW2, VHDX), we really wanted some sort of "stop-gap" solution to unblock
45//! various dev/test use-cases.
46//!
47//! And thus, `disklayer_sqlite` was born!
48//!
49//! The initial implementation took less than a day to get up and running, and
50//! worked "well enough" to support the dev/test scenarios we were interested
51//! in, such as:
52//!
53//! - Having a cross-platform _sparsely allocated_ virtual disk file.
54//! - Having a _persistent_ diff-disk on-top of an existing disk (as opposed to
55//!   `ramdiff`, which is in-memory and _ephemeral_)
56//! - Having a "cache" layer for JIT-accessed disks, such as `disk_blob`
57//!
58//! The idea of using SQLite as a backing store - while wacky - proved to be an
59//! excellent way to quickly bring up a dynamically-sized, sparsely-allocated
60//! disk format for testing in OpenVMM.
61
62#![forbid(unsafe_code)]
63
64mod auto_cache;
65pub mod resolver;
66
67use anyhow::Context;
68use blocking::unblock;
69use disk_backend::DiskError;
70use disk_backend::UnmapBehavior;
71use disk_layered::LayerAttach;
72use disk_layered::LayerIo;
73use disk_layered::SectorMarker;
74use disk_layered::WriteNoOverwrite;
75use futures::lock::Mutex;
76use futures::lock::OwnedMutexGuard;
77use guestmem::MemoryRead;
78use guestmem::MemoryWrite;
79use inspect::Inspect;
80use rusqlite::Connection;
81use scsi_buffers::RequestBuffers;
82use std::path::Path;
83use std::path::PathBuf;
84use std::sync::Arc;
85
/// Formatting parameters provided to [`FormatOnAttachSqliteDiskLayer::new`].
///
/// Optional parameters which are not provided will be determined by reading the
/// metadata of the layer being attached to.
#[derive(Inspect, Copy, Clone)]
pub struct IncompleteFormatParams {
    /// Should the layer be considered logically read only (i.e: a cache layer)
    pub logically_read_only: bool,
    /// The size of the layer in bytes.
    ///
    /// If `None`, the length is inferred from the lower layer at attach time.
    pub len: Option<u64>,
}
97
/// Formatting parameters provided to [`SqliteDiskLayer::new`]
///
/// Unlike [`IncompleteFormatParams`], all fields here are fully resolved.
#[derive(Inspect, Copy, Clone)]
pub struct FormatParams {
    /// Should the layer be considered logically read only (i.e: a cache layer)
    pub logically_read_only: bool,
    /// The size of the layer in bytes. Must be divisible by `sector_size`.
    pub len: u64,
    /// The size of each sector.
    pub sector_size: u32,
}
108
/// A disk layer backed by sqlite, which lazily infers its topology from the
/// layer it is being stacked on-top of.
pub struct FormatOnAttachSqliteDiskLayer {
    /// Path of the sqlite database file to create/open on attach.
    dbhd_path: PathBuf,
    /// Whether to open the database read-only.
    read_only: bool,
    /// Partial format parameters, completed at attach time from the lower
    /// layer's metadata.
    format_dbhd: IncompleteFormatParams,
}
116
117impl FormatOnAttachSqliteDiskLayer {
118    /// Create a new sqlite-backed disk layer, which is formatted when it is
119    /// attached.
120    pub fn new(dbhd_path: PathBuf, read_only: bool, format_dbhd: IncompleteFormatParams) -> Self {
121        Self {
122            dbhd_path,
123            read_only,
124            format_dbhd,
125        }
126    }
127}
128
/// A disk layer backed entirely by sqlite.
#[derive(Inspect)]
pub struct SqliteDiskLayer {
    #[inspect(skip)]
    conn: Arc<Mutex<Connection>>, // FUTURE: switch to connection-pool instead
    /// Cached copy of the disk geometry stored in the db's `meta` table.
    meta: schema::DiskMeta,
}
136
impl SqliteDiskLayer {
    /// Create a new sqlite-backed disk layer.
    ///
    /// When `format_dbhd` is `Some`, any existing database contents are wiped
    /// and the file is (re)formatted with the given parameters. When `None`,
    /// the layer's geometry is loaded from the existing `meta` table.
    ///
    /// # Errors
    ///
    /// Fails if the database cannot be opened, if `len` is not a multiple of
    /// `sector_size` when formatting, or if an existing database is missing
    /// its `meta` table.
    pub fn new(
        dbhd_path: &Path,
        read_only: bool,
        format_dbhd: Option<FormatParams>,
    ) -> anyhow::Result<Self> {
        // DEVNOTE: sqlite _really_ wants to be in control of opening the file,
        // since it also wants to read/write to the runtime "sidecar" files that
        // get created when accessing the DB (i.e: the `*-shm` and `*-wal`
        // files)
        //
        // This will make it tricky to sandbox SQLite in the future...
        //
        // One idea: maybe we could implement a small SQLite `vfs` shim that
        // lets us pre-open those particular files on the caller side, and hand
        // them to sqlite when requested (vs. having it `open()` them itself?)
        let conn = Connection::open_with_flags(dbhd_path, {
            use rusqlite::OpenFlags;

            // NO_MUTEX: the connection is guarded by the async `Mutex` in
            // `SqliteDiskLayer::conn`, so sqlite's own locking is redundant.
            let mut flags = OpenFlags::SQLITE_OPEN_NO_MUTEX;

            if read_only {
                flags |= OpenFlags::SQLITE_OPEN_READ_ONLY;
            } else {
                flags |= OpenFlags::SQLITE_OPEN_READ_WRITE;
            }

            // FUTURE: if/when the VFS layer is implemented, it _may_ be worth
            // removing this flag entirely, and relying on the VFS to ensure
            // that the (possibly blank) db file has been created. Emphasis on
            // the word "may", as it's unclear what the best approach will be
            // until if/when we have more of the VFS infrastructure in place.
            if format_dbhd.is_some() {
                flags |= OpenFlags::SQLITE_OPEN_CREATE
            }

            flags
        })?;

        let meta = if let Some(FormatParams {
            logically_read_only,
            len,
            sector_size,
        }) = format_dbhd
        {
            use rusqlite::config::DbConfig;

            // Wipe any existing contents.
            //
            // see https://www.sqlite.org/c3ref/c_dbconfig_defensive.html#sqlitedbconfigresetdatabase
            conn.set_db_config(DbConfig::SQLITE_DBCONFIG_RESET_DATABASE, true)?;
            conn.execute("VACUUM", ())?;
            conn.set_db_config(DbConfig::SQLITE_DBCONFIG_RESET_DATABASE, false)?;

            // Set core database config, and initialize table structure
            conn.pragma_update(None, "journal_mode", "WAL")?;
            conn.execute(schema::DEFINE_TABLE_SECTORS, [])?;
            conn.execute(schema::DEFINE_TABLE_METADATA, [])?;

            if len % sector_size as u64 != 0 {
                anyhow::bail!(
                    "failed to format: len={len} must be multiple of sector_size={sector_size}"
                );
            }
            let sector_count = len / sector_size as u64;

            let meta = schema::DiskMeta {
                logically_read_only,
                sector_count,
                sector_size,
            };

            // Persist the geometry as JSON in the singleton `meta` table.
            conn.execute(
                "INSERT INTO meta VALUES (json(?))",
                [serde_json::to_string(&meta).unwrap()],
            )?;

            meta
        } else {
            // No format requested: load the geometry stashed by a previous
            // format operation.
            use rusqlite::OptionalExtension;
            let data: String = conn
                .query_row("SELECT json_extract(metadata, '$') FROM meta", [], |row| {
                    row.get(0)
                })
                .optional()?
                .context("missing `meta` table")?;
            serde_json::from_str(&data)?
        };

        Ok(SqliteDiskLayer {
            conn: Arc::new(Mutex::new(conn)),
            meta,
        })
    }

    /// Shared implementation backing both [`LayerIo::write`] and
    /// [`WriteNoOverwrite::write_no_overwrite`].
    ///
    /// When `overwrite` is false, sectors that already contain data are left
    /// untouched (via `INSERT OR IGNORE` in [`write_sectors`]).
    async fn write_maybe_overwrite(
        &self,
        buffers: &RequestBuffers<'_>,
        sector: u64,
        overwrite: bool,
    ) -> Result<(), DiskError> {
        // Overwriting writes must never reach a logically read-only layer.
        assert!(!(overwrite && self.meta.logically_read_only));

        let count = buffers.len() / self.meta.sector_size as usize;
        tracing::trace!(sector, count, "write");

        let buf = buffers.reader().read_all()?;
        // sqlite calls block: run them via `unblock`, holding the connection
        // lock for the duration of the write.
        unblock({
            let conn = self.conn.clone().lock_owned().await;
            let sector_size = self.meta.sector_size;
            move || write_sectors(conn, sector_size, sector, buf, overwrite)
        })
        .await
        .map_err(|e| DiskError::Io(std::io::Error::other(e)))?;

        Ok(())
    }
}
256
257impl LayerAttach for FormatOnAttachSqliteDiskLayer {
258    type Error = anyhow::Error;
259    type Layer = SqliteDiskLayer;
260
261    async fn attach(
262        self,
263        lower_layer_metadata: Option<disk_layered::DiskLayerMetadata>,
264    ) -> Result<Self::Layer, Self::Error> {
265        let len = {
266            let lower_len = lower_layer_metadata
267                .as_ref()
268                .map(|m| m.sector_count * m.sector_size as u64);
269            self.format_dbhd
270                .len
271                .or(lower_len)
272                .context("no base layer to infer sector_count from")?
273        };
274        // FUTURE: make sector-size configurable
275        let sector_size = lower_layer_metadata.map(|x| x.sector_size).unwrap_or(512);
276
277        SqliteDiskLayer::new(
278            &self.dbhd_path,
279            self.read_only,
280            Some(FormatParams {
281                logically_read_only: self.format_dbhd.logically_read_only,
282                len,
283                sector_size,
284            }),
285        )
286    }
287}
288
289impl LayerIo for SqliteDiskLayer {
290    fn layer_type(&self) -> &str {
291        "sqlite"
292    }
293
294    fn sector_count(&self) -> u64 {
295        self.meta.sector_count
296    }
297
298    fn sector_size(&self) -> u32 {
299        self.meta.sector_size
300    }
301
302    fn is_logically_read_only(&self) -> bool {
303        self.meta.logically_read_only
304    }
305
306    fn disk_id(&self) -> Option<[u8; 16]> {
307        None
308    }
309
310    fn physical_sector_size(&self) -> u32 {
311        self.meta.sector_size
312    }
313
314    fn is_fua_respected(&self) -> bool {
315        false
316    }
317
318    async fn read(
319        &self,
320        buffers: &RequestBuffers<'_>,
321        sector: u64,
322        mut marker: SectorMarker<'_>,
323    ) -> Result<(), DiskError> {
324        let sector_count = (buffers.len() / self.meta.sector_size as usize) as u64;
325        let end_sector = sector + sector_count;
326        tracing::trace!(sector, sector_count, "read");
327        if end_sector > self.meta.sector_count {
328            return Err(DiskError::IllegalBlock);
329        }
330
331        let valid_sectors = unblock({
332            let conn = self.conn.clone().lock_owned().await;
333            let end_sector = sector + sector_count;
334            let sector_size = self.meta.sector_size;
335            move || read_sectors(conn, sector_size, sector, end_sector)
336        })
337        .await
338        .map_err(|e| DiskError::Io(std::io::Error::other(e)))?;
339
340        for (s, data) in valid_sectors {
341            let offset = (s - sector) as usize * self.meta.sector_size as usize;
342            let subrange = buffers.subrange(offset, self.meta.sector_size as usize);
343            let mut writer = subrange.writer();
344            match data {
345                SectorKind::AllZero => writer.zero(self.meta.sector_size as usize)?,
346                SectorKind::Data(data) => writer.write(&data)?,
347            };
348
349            marker.set(s);
350        }
351
352        Ok(())
353    }
354
355    async fn write(
356        &self,
357        buffers: &RequestBuffers<'_>,
358        sector: u64,
359        _fua: bool,
360    ) -> Result<(), DiskError> {
361        self.write_maybe_overwrite(buffers, sector, true).await
362    }
363
364    fn write_no_overwrite(&self) -> Option<impl WriteNoOverwrite> {
365        Some(self)
366    }
367
368    async fn sync_cache(&self) -> Result<(), DiskError> {
369        tracing::trace!("sync_cache");
370
371        unblock({
372            let mut conn = self.conn.clone().lock_owned().await;
373            move || -> rusqlite::Result<()> {
374                // https://sqlite-users.sqlite.narkive.com/LX75NOma/forcing-a-manual-fsync-in-wal-normal-mode
375                conn.pragma_update(None, "synchronous", "FULL")?;
376                {
377                    let tx = conn.transaction()?;
378                    tx.pragma_update(None, "user_version", "0")?;
379                }
380                conn.pragma_update(None, "synchronous", "NORMAL")?;
381                Ok(())
382            }
383        })
384        .await
385        .map_err(|e| DiskError::Io(std::io::Error::other(e)))
386    }
387
388    async fn unmap(
389        &self,
390        sector_offset: u64,
391        sector_count: u64,
392        _block_level_only: bool,
393        next_is_zero: bool,
394    ) -> Result<(), DiskError> {
395        tracing::trace!(sector_offset, sector_count, "unmap");
396        if sector_offset + sector_count > self.meta.sector_count {
397            return Err(DiskError::IllegalBlock);
398        }
399
400        unblock({
401            let conn = self.conn.clone().lock_owned().await;
402            move || unmap_sectors(conn, sector_offset, sector_count, next_is_zero)
403        })
404        .await
405        .map_err(|e| DiskError::Io(std::io::Error::other(e)))?;
406
407        Ok(())
408    }
409
410    fn unmap_behavior(&self) -> UnmapBehavior {
411        UnmapBehavior::Zeroes
412    }
413
414    fn optimal_unmap_sectors(&self) -> u32 {
415        1
416    }
417}
418
impl WriteNoOverwrite for SqliteDiskLayer {
    /// Write `buffers` at `sector`, leaving sectors that already contain data
    /// untouched (delegates to [`SqliteDiskLayer::write_maybe_overwrite`] with
    /// `overwrite = false`).
    async fn write_no_overwrite(
        &self,
        buffers: &RequestBuffers<'_>,
        sector: u64,
    ) -> Result<(), DiskError> {
        self.write_maybe_overwrite(buffers, sector, false).await
    }
}
428
/// The contents of a single sector, as read back from the `sectors` table.
enum SectorKind {
    /// Stored as a NULL `data` blob: the sector is entirely zero.
    AllZero,
    /// Raw sector contents (exactly `sector_size` bytes).
    Data(Vec<u8>),
}
433
434// FUTURE: read from sqlite directly into `RequestBuffers`.
435fn read_sectors(
436    conn: OwnedMutexGuard<Connection>,
437    sector_size: u32,
438    start_sector: u64,
439    end_sector: u64,
440) -> anyhow::Result<Vec<(u64, SectorKind)>> {
441    let mut select_stmt = conn.prepare_cached(
442        "SELECT sector, data
443        FROM sectors
444        WHERE sector >= ? AND sector < ?
445        ORDER BY sector ASC",
446    )?;
447    let mut rows = select_stmt.query(rusqlite::params![start_sector, end_sector])?;
448
449    let mut res = Vec::new();
450    while let Some(row) = rows.next()? {
451        let sector: u64 = row.get(0)?;
452        let data: Option<&[u8]> = row.get_ref(1)?.as_blob_or_null()?;
453        let data = if let Some(data) = data {
454            if data.len() != sector_size as usize {
455                anyhow::bail!(
456                    "db contained sector with unexpected size (expected={}, found={}, sector={:#x})",
457                    sector_size,
458                    data.len(),
459                    sector
460                )
461            }
462            SectorKind::Data(data.into())
463        } else {
464            SectorKind::AllZero
465        };
466        res.push((sector, data));
467    }
468
469    Ok(res)
470}
471
472// FUTURE: write into sqlite directly from `RequestBuffers`.
473fn write_sectors(
474    mut conn: OwnedMutexGuard<Connection>,
475    sector_size: u32,
476    mut sector: u64,
477    buf: Vec<u8>,
478    overwrite: bool,
479) -> Result<(), rusqlite::Error> {
480    let tx = conn.transaction()?;
481    {
482        let mut stmt = if overwrite {
483            tx.prepare_cached("INSERT OR REPLACE INTO sectors (sector, data) VALUES (?, ?)")?
484        } else {
485            tx.prepare_cached("INSERT OR IGNORE INTO sectors (sector, data) VALUES (?, ?)")?
486        };
487
488        let chunks = buf.chunks_exact(sector_size as usize);
489        assert!(chunks.remainder().is_empty());
490        for chunk in chunks {
491            if chunk.iter().all(|x| *x == 0) {
492                stmt.execute(rusqlite::params![sector, rusqlite::types::Null])?;
493            } else {
494                stmt.execute(rusqlite::params![sector, chunk])?;
495            };
496
497            sector += 1;
498        }
499    }
500    tx.commit()?;
501
502    Ok(())
503}
504
505fn unmap_sectors(
506    mut conn: OwnedMutexGuard<Connection>,
507    sector_offset: u64,
508    sector_count: u64,
509    next_is_zero: bool,
510) -> Result<(), rusqlite::Error> {
511    if next_is_zero {
512        let mut clear_stmt =
513            conn.prepare_cached("DELETE FROM sectors WHERE sector BETWEEN ? AND ?")?;
514        clear_stmt.execute(rusqlite::params![
515            sector_offset,
516            sector_offset + sector_count - 1
517        ])?;
518    } else {
519        let tx = conn.transaction()?;
520        {
521            let mut stmt =
522                tx.prepare_cached("INSERT OR REPLACE INTO sectors (sector, data) VALUES (?, ?)")?;
523
524            for sector in sector_offset..(sector_offset + sector_count) {
525                stmt.execute(rusqlite::params![sector, rusqlite::types::Null])?;
526            }
527        }
528        tx.commit()?;
529    }
530
531    Ok(())
532}
533
mod schema {
    use inspect::Inspect;
    use serde::Deserialize;
    use serde::Serialize;

    // DEVNOTE: SQLite actually saves the _plaintext_ of CREATE TABLE
    // statements in its file format, which makes it a pretty good place to
    // stash inline comments about the schema being used
    //
    // DEVNOTE: the choice to use a NULL blob as the marker for all-zero
    // sectors has not been profiled relative to other implementations
    // (e.g: having a third "kind" column).
    pub const DEFINE_TABLE_SECTORS: &str = r#"
CREATE TABLE sectors (
    -- if data is NULL, that indicates an all-zero sector.
    -- otherwise, data has len == SECTOR_SIZE, containing the sector data.
    sector INTEGER NOT NULL,
    data   BLOB,
    PRIMARY KEY (sector)
)
"#; // TODO?: enforce sqlite >3.37.0 so we can use STRICT

    // DEVNOTE: Given that this is a singleton table, we might as well use JSON
    // + serde to store whatever metadata we want here, vs. trying to bend our
    // metadata structure to sqlite's native data types.
    //
    // Using JSON (vs, say, protobuf) has the added benefit of allowing existing
    // external sqlite tooling to more easily read and manipulate the metadata
    // using sqlite's built-in JSON handling functions.
    pub const DEFINE_TABLE_METADATA: &str = r#"
CREATE TABLE meta (
    metadata TEXT NOT NULL -- stored as JSON
)
"#;

    /// Disk geometry + flags, serialized as JSON into the `meta` table.
    #[derive(Debug, PartialEq, PartialOrd, Eq, Ord, Serialize, Deserialize, Inspect)]
    pub struct DiskMeta {
        /// Whether the layer is logically read-only (i.e: a cache layer).
        pub logically_read_only: bool,
        /// Total number of sectors in the layer.
        pub sector_count: u64,
        /// Size of each sector, in bytes.
        pub sector_size: u32,
    }
}
575}