From 4807750a97fdcc8a480b02d069b49e218a84b7b2 Mon Sep 17 00:00:00 2001 From: Allison Karlitskaya Date: Fri, 15 Nov 2024 11:13:09 +0100 Subject: [PATCH] oci: Firm up some questions about the / entry The / entry doesn't appear in many layer tarballs. Until now, we've arbitrarily created it root:root, 0755, with mtime set to the epoch. Let's start thinking about this a bit more rigorously. Add a doc/oci.md with some of these decisions spelled out more explicitly. The upshot: we now use 0555 instead of 0755 and we set the mtime to the mtime of the newest file in the filesystem (instead of the epoch). Signed-off-by: Allison Karlitskaya --- doc/oci.md | 53 ++++++++++++++++++++++++++++++++++++++++++++++++ src/image.rs | 41 +++++++++++++++++++++++++++++++++---- src/oci/image.rs | 2 ++ 3 files changed, 92 insertions(+), 4 deletions(-) create mode 100644 doc/oci.md diff --git a/doc/oci.md b/doc/oci.md new file mode 100644 index 0000000..dab7f1a --- /dev/null +++ b/doc/oci.md @@ -0,0 +1,53 @@ +# How to create a composefs from an OCI image + +This document is incomplete. It only serves to document some decisions we've +taken about how to resolve ambiguous situations. + +# Data precision + +We currently create a composefs image using the granularity of data as +typically appears in OCI tarballs: + - atime and ctime are not present (these are actually not physically present + in the erofs inode structure at all, either the compact or extended forms) + - mtime is set to the mtime in seconds; the sub-seconds value is simply + truncated (ie: we always round down). erofs has an nsec field, but it's not + normally present in OCI tarballs. That's down to the fact that the usual + tar header only has timestamps in seconds and extended headers are not + usually added for this purpose. 
+ - we take great care to faithfully represent hardlinks: even though the + produced filesystem is read-only and we have data de-duplication via the + objects store, we make sure that hardlinks result in an actual shared inode + as visible via the `st_ino` and `st_nlink` fields on the mounted filesystem. + +We apply these precision restrictions also when creating images by scanning the +filesystem. For example: even if we get more-accurate timestamp information, +we'll truncate it to whole seconds. + +# Merging directories + +This is done according to the OCI spec, with an additional clarification: in +case a directory entry is present in multiple layers, we use the tar metadata +from the most-derived layer to determine the attributes (owner, permissions, +mtime) for the directory. + +# The root inode + +The root inode (/) is a difficult case because it doesn't always appear in the +layer tarballs. We need to make some arbitrary decisions about the metadata. + +Here's what we do: + + - if any layer tarball contains an entry for '/' then we'd like to use it. + The code for this doesn't exist yet, but it seems reasonable as a principle. + In case the `/` entry were to appear in multiple layers, we'd use the + most-derived layer in which it is present (as per the logic in the previous + section). + - otherwise: + - we assume that the root directory is owned by root:root and has `a+rx` + permissions (ie: `0555`). This matches the behaviour of podman. Note in + particular: podman uses `0555`, not `0755`: the root directory is not + (nominally) writable by the root user. + - the mtime of the root directory is taken to be equal to the most recent + file in the entire system, that is: the highest numerical value of any + mtime on any inode. The rationale is that this is usually a very good + proxy for "when was the (most-derived) container image created". 
diff --git a/src/image.rs b/src/image.rs index b75ebfe..b03049a 100644 --- a/src/image.rs +++ b/src/image.rs @@ -155,6 +155,20 @@ impl Directory { pub fn remove_all(&mut self) { self.entries.clear(); } + + pub fn newest_file(&self) -> i64 { + let mut newest = self.stat.st_mtim_sec; + for DirEnt { inode, .. } in &self.entries { + let mtime = match inode { + Inode::Leaf(ref leaf) => leaf.stat.st_mtim_sec, + Inode::Directory(ref dir) => dir.newest_file(), + }; + if mtime > newest { + newest = mtime; + } + } + newest + } } pub struct FileSystem { @@ -172,10 +186,10 @@ impl FileSystem { FileSystem { root: Directory { stat: Stat { - st_mode: 0o755, - st_uid: 0, - st_gid: 0, - st_mtim_sec: 0, + st_mode: u32::MAX, // assigned later + st_uid: u32::MAX, // assigned later + st_gid: u32::MAX, // assigned later + st_mtim_sec: -1, // assigned later xattrs: RefCell::new(BTreeMap::new()), }, entries: vec![], @@ -246,6 +260,25 @@ impl FileSystem { todo!(); } } + + pub fn done(&mut self) { + // We need to look at the root entry and deal with the "assign later" fields + let stat = &mut self.root.stat; + + if stat.st_mode == u32::MAX { + stat.st_mode = 0o555; + } + if stat.st_uid == u32::MAX { + stat.st_uid = 0; + } + if stat.st_gid == u32::MAX { + stat.st_gid = 0; + } + if stat.st_mtim_sec == -1 { + // write this in full to avoid annoying the borrow checker + self.root.stat.st_mtim_sec = self.root.newest_file(); + } + } } pub fn mkcomposefs(filesystem: FileSystem) -> Result> { diff --git a/src/oci/image.rs b/src/oci/image.rs index 098c1ad..ffec4e9 100644 --- a/src/oci/image.rs +++ b/src/oci/image.rs @@ -65,6 +65,7 @@ pub fn compose_filesystem(repo: &Repository, layers: &[String]) -> Result