// bootc_lib/podstorage.rs

1//! # bootc-managed instance of containers-storage:
2//!
3//! The backend for podman and other tools is known as `container-storage:`,
4//! with a canonical instance that lives in `/var/lib/containers`.
5//!
//! This is a `containers-storage:` instance which is owned by bootc and
7//! is stored at `/sysroot/ostree/bootc`.
8//!
9//! At the current time, this is only used for Logically Bound Images.
10
11use std::collections::HashSet;
12use std::io::{Seek, Write};
13use std::os::unix::process::CommandExt;
14use std::process::{Command, Stdio};
15use std::sync::Arc;
16
17use anyhow::{Context, Result};
18use bootc_utils::{AsyncCommandRunExt, CommandRunExt, ExitStatusExt};
19use camino::{Utf8Path, Utf8PathBuf};
20use cap_std_ext::cap_std::fs::Dir;
21use cap_std_ext::cap_tempfile::TempDir;
22use cap_std_ext::cmdext::CapStdExtCommandExt;
23use cap_std_ext::dirext::CapStdExtDirExt;
24use cap_std_ext::{cap_std, cap_tempfile};
25use fn_error_context::context;
26use ostree_ext::ostree::{self};
27use std::os::fd::{AsFd, AsRawFd, OwnedFd};
28use tokio::process::Command as AsyncCommand;
29
// Pass only 100 args at a time just to avoid potentially overflowing argument
// vectors; not that this should happen in reality, but just in case.
const SUBCMD_ARGV_CHUNKING: usize = 100;

/// Global directory path which we use for podman to point
/// it at our storage. Unfortunately we can't yet use the
/// /proc/self/fd/N trick because it currently breaks due
/// to how the untar process is forked in the child.
pub(crate) const STORAGE_ALIAS_DIR: &str = "/run/bootc/storage";
/// We pass this via /proc/self/fd to the child process.
const STORAGE_RUN_FD: i32 = 3;

/// Name of the stamp file marking that the storage root has already been
/// SELinux-relabeled; see `CStorage::ensure_labeled`.
const LABELED: &str = ".bootc_labeled";

/// The system path to the canonical containers-storage instance,
/// used as the SELinux label reference path.
const SYS_CSTOR_PATH: &str = "/var/lib/containers/storage";

/// The path to the image storage, relative to the bootc root directory.
pub(crate) const SUBPATH: &str = "storage";
/// The path to the "runroot" with transient runtime state; this is
/// relative to the /run directory
const RUNROOT: &str = "bootc/storage";
53
54/// A bootc-owned instance of `containers-storage:`.
55pub(crate) struct CStorage {
56    /// The root directory
57    sysroot: Dir,
58    /// The location of container storage
59    storage_root: Dir,
60    #[allow(dead_code)]
61    /// Our runtime state
62    run: Dir,
63    /// The SELinux policy used for labeling the storage.
64    sepolicy: Option<ostree::SePolicy>,
65    /// Disallow using this across multiple threads concurrently; while we
66    /// have internal locking in podman, in the future we may change how
67    /// things work here. And we don't have a use case right now for
68    /// concurrent operations.
69    _unsync: std::cell::Cell<()>,
70}
71
/// Policy controlling when `CStorage::pull` fetches an image.
// This is a small fieldless enum passed by value, so derive Clone + Copy.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum PullMode {
    /// Pull only if the image is not present
    IfNotExists,
    /// Always check for an update
    #[allow(dead_code)]
    Always,
}
80
/// Arrange for the spawned child to see `storage_root` bind-mounted at the
/// global alias path [`STORAGE_ALIAS_DIR`] (inside a fresh mount namespace
/// created between fork and exec), and pass `run_root` to the child as
/// inherited fd [`STORAGE_RUN_FD`] so it can be referenced via /proc/self/fd.
#[allow(unsafe_code)]
#[context("Binding storage roots")]
fn bind_storage_roots(cmd: &mut Command, storage_root: &Dir, run_root: &Dir) -> Result<()> {
    // podman requires an absolute path, for two reasons right now:
    // - It writes the file paths into `db.sql`, a sqlite database for unknown reasons
    // - It forks helper binaries, so just giving it /proc/self/fd won't work as
    //   those helpers may not get the fd passed. (which is also true of skopeo)
    // We create a new mount namespace, which also has the helpful side effect
    // of automatically cleaning up the global bind mount that the storage stack
    // creates.

    // Clone the directory fds into Arcs so the `move` closure below can own
    // them (pre_exec closures may be invoked on every spawn of this Command).
    let storage_root = Arc::new(storage_root.try_clone().context("Cloning storage root")?);
    let run_root: Arc<OwnedFd> = Arc::new(run_root.try_clone().context("Cloning runroot")?.into());
    // SAFETY: All the APIs we call here are safe to invoke between fork and exec.
    unsafe {
        cmd.pre_exec(move || {
            use rustix::fs::{Mode, OFlags};
            // For reasons I don't understand, we can't just `mount("/proc/self/fd/N", "/path/to/target")`
            // but it *does* work to fchdir(fd) + mount(".", "/path/to/target").
            // I think it may be that mount doesn't like operating on the magic links?
            // This trick only works if we set our working directory to the target *before*
            // creating the new namespace too.
            //
            // I think we may be hitting this:
            //
            // "       EINVAL A bind operation (MS_BIND) was requested where source referred a mount namespace magic link (i.e., a /proc/pid/ns/mnt magic link or a bind mount to such a link) and the propagation type of the parent mount of target was
            // MS_SHARED, but propagation of the requested bind mount could lead to a circular dependency that might prevent the mount namespace from ever being freed."
            //
            // But...how did we avoid that circular dependency by using the process cwd?
            //
            // I tried making the mounts recursively private, but that didn't help.
            // Remember the original working directory so it can be restored
            // after the fchdir+mount trick.
            let oldwd = rustix::fs::open(
                ".",
                OFlags::DIRECTORY | OFlags::CLOEXEC | OFlags::RDONLY,
                Mode::empty(),
            )?;
            rustix::process::fchdir(&storage_root)?;
            rustix::thread::unshare_unsafe(rustix::thread::UnshareFlags::NEWNS)?;
            rustix::mount::mount_bind(".", STORAGE_ALIAS_DIR)?;
            rustix::process::fchdir(&oldwd)?;
            Ok(())
        })
    };
    // Hand the runroot to the child at a fixed, well-known fd number.
    cmd.take_fd_n(run_root, STORAGE_RUN_FD);
    Ok(())
}
127
// Initialize a `podman` subprocess with:
// - storage overridden to point to storage_root
// - Authentication (auth.json) using the bootc/ostree owned auth
fn new_podman_cmd_in(sysroot: &Dir, storage_root: &Dir, run_root: &Dir) -> Result<Command> {
    let mut cmd = Command::new("podman");
    bind_storage_roots(&mut cmd, storage_root, run_root)?;
    // The runroot is referenced through the fd we arranged to pass above.
    let run_root = format!("/proc/self/fd/{STORAGE_RUN_FD}");
    cmd.args(["--root", STORAGE_ALIAS_DIR, "--runroot", run_root.as_str()]);

    // Stage the auth contents in an anonymous (unlinked) tempfile; the child
    // only ever sees it through /proc/self/fd, so nothing persists on disk.
    let tmpd = &cap_std::fs::Dir::open_ambient_dir("/tmp", cap_std::ambient_authority())?;
    let mut tempfile = cap_tempfile::TempFile::new_anonymous(tmpd).map(std::io::BufWriter::new)?;

    // Keep this in sync with https://github.com/bootc-dev/containers-image-proxy-rs/blob/b5e0861ad5065f47eaf9cda0d48da3529cc1bc43/src/imageproxy.rs#L310
    // We always override the auth to match the bootc setup.
    let authfile_fd = ostree_ext::globals::get_global_authfile(sysroot)?.map(|v| v.1);
    if let Some(mut fd) = authfile_fd {
        std::io::copy(&mut fd, &mut tempfile)?;
    } else {
        // Note that if there's no bootc-owned auth, then we force an empty authfile to ensure
        // that podman doesn't fall back to searching the user-owned paths.
        tempfile.write_all(b"{}")?;
    }

    // into_inner() flushes the BufWriter before we hand the fd to the child.
    let tempfile = tempfile
        .into_inner()
        .map_err(|e| e.into_error())?
        .into_std();
    // Map the tempfile into the child at its current fd number, so the
    // /proc/self/fd path in REGISTRY_AUTH_FILE resolves to it.
    let fd: Arc<OwnedFd> = std::sync::Arc::new(tempfile.into());
    let target_fd = fd.as_fd().as_raw_fd();
    cmd.take_fd_n(fd, target_fd);
    cmd.env("REGISTRY_AUTH_FILE", format!("/proc/self/fd/{target_fd}"));

    Ok(cmd)
}
162
163/// Adjust the provided command (skopeo or podman e.g.) to reference
164/// the provided path as an additional image store.
165pub fn set_additional_image_store<'c>(
166    cmd: &'c mut Command,
167    ais: impl AsRef<Utf8Path>,
168) -> &'c mut Command {
169    let ais = ais.as_ref();
170    let storage_opt = format!("additionalimagestore={ais}");
171    cmd.env("STORAGE_OPTS", storage_opt)
172}
173
174/// Ensure that "podman" is the first thing to touch the global storage
175/// instance. This is a workaround for <https://github.com/bootc-dev/bootc/pull/1101#issuecomment-2653862974>
176/// Basically podman has special upgrade logic for when it is the first thing
177/// to initialize the c/storage instance it sets the networking to netavark.
178/// If it's not the first thing, then it assumes an upgrade scenario and we
179/// may be using CNI.
180///
181/// But this legacy path is triggered through us using skopeo, turning off netavark
182/// by default. Work around this by ensuring that /usr/bin/podman is
183/// always the first thing to touch c/storage (at least, when invoked by us).
184///
185/// Call this function any time we're going to write to containers-storage.
186pub(crate) fn ensure_floating_c_storage_initialized() {
187    if let Err(e) = Command::new("podman")
188        .args(["system", "info"])
189        .stdout(Stdio::null())
190        .run_capture_stderr()
191    {
192        // Out of conservatism we don't make this operation fatal right now.
193        // If something went wrong, then we'll probably fail on a later operation
194        // anyways.
195        tracing::warn!("Failed to query podman system info: {e}");
196    }
197}
198
199impl CStorage {
200    /// Create a `podman image` Command instance prepared to operate on our alternative
201    /// root.
202    pub(crate) fn new_image_cmd(&self) -> Result<Command> {
203        let mut r = new_podman_cmd_in(&self.sysroot, &self.storage_root, &self.run)?;
204        // We want to limit things to only manipulating images by default.
205        r.arg("image");
206        Ok(r)
207    }
208
209    fn init_globals() -> Result<()> {
210        // Ensure our global storage alias dir exists
211        std::fs::create_dir_all(STORAGE_ALIAS_DIR)
212            .with_context(|| format!("Creating {STORAGE_ALIAS_DIR}"))?;
213        Ok(())
214    }
215
    /// Ensure that the LSM (SELinux) labels are set on the bootc-owned
    /// containers-storage: instance. We use a `LABELED` stamp file for
    /// idempotence.
    ///
    /// This is a no-op when the stamp file already exists, or when no
    /// SELinux policy is available.
    #[context("Labeling imgstorage dirs")]
    pub(crate) fn ensure_labeled(&self) -> Result<()> {
        if self.storage_root.try_exists(LABELED)? {
            return Ok(());
        }
        let Some(sepolicy) = self.sepolicy.as_ref() else {
            return Ok(());
        };

        // recursively set the labels because they were previously set to usr_t,
        // and there is no policy defined to set them to the c/storage labels;
        // labels are computed as if the tree lived at SYS_CSTOR_PATH.
        crate::lsm::relabel_recurse(
            &self.storage_root,
            ".",
            Some(Utf8Path::new(SYS_CSTOR_PATH)),
            sepolicy,
        )
        .context("labeling storage root")?;

        // fsync so relabel writes are durable before creating the stamp file;
        // otherwise a crash could leave the stamp present but labels lost.
        rustix::fs::fsync(
            self.storage_root
                .reopen_as_ownedfd()
                .context("Reopening as owned fd")?,
        )
        .context("fsync")?;

        self.storage_root.create(LABELED)?;

        // Label the stamp file itself to match the storage directory context
        crate::lsm::relabel(
            &self.storage_root,
            &self.storage_root.symlink_metadata(LABELED)?,
            LABELED.into(),
            Some(&Utf8Path::new(SYS_CSTOR_PATH).join(LABELED)),
            sepolicy,
        )
        .context("labeling stamp file")?;

        // fsync to persist the stamp file entry
        rustix::fs::fsync(
            self.storage_root
                .reopen_as_ownedfd()
                .context("Reopening as owned fd")?,
        )
        .context("fsync")?;

        Ok(())
    }
268
    /// Create (if necessary) and open the bootc-owned containers-storage
    /// instance under `sysroot`.
    ///
    /// Initialization is done in a `<subpath>.tmp` directory which is only
    /// renamed into its final place after podman has populated it, so an
    /// interrupted run never leaves a half-created store at the final path.
    #[context("Creating imgstorage")]
    pub(crate) fn create(
        sysroot: &Dir,
        run: &Dir,
        sepolicy: Option<&ostree::SePolicy>,
    ) -> Result<Self> {
        Self::init_globals()?;
        let subpath = &Self::subpath();

        // SAFETY: We know there's a parent
        let parent = subpath.parent().unwrap();
        let tmp = format!("{subpath}.tmp");
        let existed = sysroot
            .try_exists(subpath)
            .with_context(|| format!("Querying {subpath}"))?;
        if !existed {
            // Clear out any leftover tempdir from a previously interrupted
            // initialization attempt.
            sysroot.remove_all_optional(&tmp).context("Removing tmp")?;
            sysroot
                .create_dir_all(parent)
                .with_context(|| format!("Creating {parent}"))?;
            sysroot.create_dir_all(&tmp).context("Creating tmpdir")?;
            let storage_root = sysroot.open_dir(&tmp).context("Open tmp")?;

            // There's no explicit API to initialize a containers-storage:
            // root, simply passing a path will attempt to auto-create it.
            // We run "podman images" in the new root.
            new_podman_cmd_in(&sysroot, &storage_root, &run)?
                .stdout(Stdio::null())
                .arg("images")
                .run_capture_stderr()
                .context("Initializing images")?;
            drop(storage_root);
            // Commit the fully-initialized store to its final location.
            sysroot
                .rename(&tmp, sysroot, subpath)
                .context("Renaming tmpdir")?;
            tracing::debug!("Created image store");
        }

        let s = Self::open(sysroot, run, sepolicy.cloned())?;
        if existed {
            // For pre-existing storage (e.g. on a booted system), ensure
            // labels are correct now. For freshly created storage (e.g.
            // during install), labeling is deferred until after all image
            // pulls are complete via an explicit ensure_labeled() call.
            s.ensure_labeled()?;
        }
        Ok(s)
    }
317
318    #[context("Opening imgstorage")]
319    pub(crate) fn open(
320        sysroot: &Dir,
321        run: &Dir,
322        sepolicy: Option<ostree::SePolicy>,
323    ) -> Result<Self> {
324        tracing::trace!("Opening container image store");
325        Self::init_globals()?;
326        let subpath = &Self::subpath();
327        let storage_root = sysroot
328            .open_dir(subpath)
329            .with_context(|| format!("Opening {subpath}"))?;
330        // Always auto-create this if missing
331        run.create_dir_all(RUNROOT)
332            .with_context(|| format!("Creating {RUNROOT}"))?;
333        let run = run.open_dir(RUNROOT)?;
334        Ok(Self {
335            sysroot: sysroot.try_clone()?,
336            storage_root,
337            run,
338            sepolicy,
339            _unsync: Default::default(),
340        })
341    }
342
    /// List all images in this storage by running `podman image list
    /// --format=json` and deserializing its output.
    #[context("Listing images")]
    pub(crate) async fn list_images(&self) -> Result<Vec<crate::podman::ImageListEntry>> {
        let mut cmd = self.new_image_cmd()?;
        cmd.args(["list", "--format=json"]);
        cmd.stdin(Stdio::null());
        // It's maximally convenient for us to just pipe the whole output to a tempfile
        let mut stdout = tempfile::tempfile()?;
        cmd.stdout(stdout.try_clone()?);
        // Allocate stderr, which is passed to the status checker
        let stderr = tempfile::tempfile()?;
        cmd.stderr(stderr.try_clone()?);

        // Spawn the child and wait
        AsyncCommand::from(cmd)
            .status()
            .await?
            .check_status_with_stderr(stderr)?;
        // Spawn a helper thread to avoid blocking the main thread
        // parsing JSON.
        tokio::task::spawn_blocking(move || -> Result<_> {
            // Rewind: the child left the file offset at end-of-file.
            stdout.seek(std::io::SeekFrom::Start(0))?;
            let stdout = std::io::BufReader::new(stdout);
            let r = serde_json::from_reader(stdout)?;
            Ok(r)
        })
        .await?
    }
370
371    #[context("Pruning")]
372    pub(crate) async fn prune_except_roots(&self, roots: &HashSet<&str>) -> Result<Vec<String>> {
373        let all_images = self.list_images().await?;
374        tracing::debug!("Images total: {}", all_images.len(),);
375        let mut garbage = Vec::new();
376        for image in all_images {
377            if image
378                .names
379                .iter()
380                .flatten()
381                .all(|name| !roots.contains(name.as_str()))
382            {
383                garbage.push(image.id);
384            }
385        }
386        tracing::debug!("Images to prune: {}", garbage.len());
387        for garbage in garbage.chunks(SUBCMD_ARGV_CHUNKING) {
388            let mut cmd = self.new_image_cmd()?;
389            cmd.stdin(Stdio::null());
390            cmd.stdout(Stdio::null());
391            cmd.arg("rm");
392            cmd.args(garbage);
393            AsyncCommand::from(cmd).run().await?;
394        }
395        Ok(garbage)
396    }
397
398    /// Return true if the image exists in the storage.
399    pub(crate) async fn exists(&self, image: &str) -> Result<bool> {
400        // Sadly https://docs.rs/containers-image-proxy/latest/containers_image_proxy/struct.ImageProxy.html#method.open_image_optional
401        // doesn't work with containers-storage yet
402        let mut cmd = AsyncCommand::from(self.new_image_cmd()?);
403        cmd.args(["exists", image]);
404        Ok(cmd.status().await?.success())
405    }
406
407    /// Fetch the image if it is not already present; return whether
408    /// or not the image was fetched.
409    pub(crate) async fn pull(&self, image: &str, mode: PullMode) -> Result<bool> {
410        match mode {
411            PullMode::IfNotExists => {
412                if self.exists(image).await? {
413                    tracing::debug!("Image is already present: {image}");
414                    return Ok(false);
415                }
416            }
417            PullMode::Always => {}
418        };
419        let mut cmd = self.new_image_cmd()?;
420        cmd.stdin(Stdio::null());
421        cmd.stdout(Stdio::null());
422        cmd.args(["pull", image]);
423        tracing::debug!("Pulling image: {image}");
424        let mut cmd = AsyncCommand::from(cmd);
425        cmd.run().await.context("Failed to pull image")?;
426        Ok(true)
427    }
428
    /// Copy an image from the default container storage (/var/lib/containers/)
    /// to this storage.
    #[context("Pulling from host storage: {image}")]
    pub(crate) async fn pull_from_host_storage(&self, image: &str) -> Result<()> {
        // No --root/--runroot here: the source side is the host's default
        // storage instance.
        let mut cmd = Command::new("podman");
        cmd.stdin(Stdio::null());
        cmd.stdout(Stdio::null());
        // An ephemeral place for the transient state;
        let temp_runroot = TempDir::new(cap_std::ambient_authority())?;
        bind_storage_roots(&mut cmd, &self.storage_root, &temp_runroot)?;

        // The destination (target stateroot) + container storage dest;
        // a c/storage reference of the form
        // `containers-storage:[driver@graphroot+runroot]image`.
        let storage_dest = &format!(
            "containers-storage:[overlay@{STORAGE_ALIAS_DIR}+/proc/self/fd/{STORAGE_RUN_FD}]"
        );
        cmd.args(["image", "push", "--remove-signatures", image])
            .arg(format!("{storage_dest}{image}"));
        let mut cmd = AsyncCommand::from(cmd);
        cmd.run().await?;
        // Close explicitly so cleanup errors surface instead of being
        // swallowed on drop.
        temp_runroot.close()?;
        Ok(())
    }
451
452    pub(crate) fn subpath() -> Utf8PathBuf {
453        Utf8Path::new(crate::store::BOOTC_ROOT).join(SUBPATH)
454    }
455}
456
#[cfg(test)]
mod tests {
    use super::*;
    // Compile-time check: `CStorage` must not be `Sync` — the `_unsync`
    // Cell field exists precisely to forbid concurrent cross-thread use.
    static_assertions::assert_not_impl_any!(CStorage: Sync);
}