libcontainer: remove all mount logic from nsexec

With open_tree(OPEN_TREE_CLONE), it is possible to implement both the id-mapped mounts and bind-mount source file descriptor logic entirely in Go without requiring any complicated handling from nsexec. However, implementing it the naive way (do the OPEN_TREE_CLONE in the host namespace before the rootfs is set up -- which is what the existing implementation did) exposes issues in how mount ordering (in particular when handling mount sources from inside the container rootfs, but also in relation to mount propagation) was handled for idmapped mounts and bind-mount sources. In order to solve this problem completely, it is necessary to spawn a thread which joins the container mount namespace and provides mountfds when requested by the rootfs setup code (ensuring that the mount order and mount propagation of the source of the bind-mount are handled correctly). While the need to join the mount namespace leads to other complications (such as with the usage of /proc/self -- fixed in a later patch) the resulting code is still reasonable and is the only real way to solve the issue. This allows us to reduce the amount of C code we have in nsexec, as well as simplifying a whole host of places that were made more complicated with the addition of id-mapped mounts and the bind sourcefd logic. Because we join the container namespace, we can continue to use regular O_PATH file descriptors for non-id-mapped bind-mount sources (which means we don't have to raise the kernel requirement for that case). In addition, we can easily add support for id-mappings that don't match the container's user namespace. The approach taken here is to use Go's officially supported mechanism for spawning a process in a user namespace, but (ab)use PTRACE_TRACEME to avoid actually having to exec a different process. 
The most efficient way to implement this would be to do clone() in cgo directly to run a function that just does kill(getpid(), SIGSTOP) -- we can always switch to that if it turns out this approach is too slow. It should be noted that the included micro-benchmark seems to indicate this is Fast Enough(TM): goos: linux goarch: amd64 pkg: github.com/opencontainers/runc/libcontainer/userns cpu: Intel(R) Core(TM) i5-10210U CPU @ 1.60GHz BenchmarkSpawnProc BenchmarkSpawnProc-8 1670 770065 ns/op Fixes: fda12ab101 ("Support idmap mounts on volumes") Fixes: 9c444070ec ("Open bind mount sources from the host userns") Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
2025-11-09 13:00:56 +03:00 · 2023-08-01 20:07:49 +10:00
parent 99f7fa1413
commit ba0b5e2698
14 changed files with 618 additions and 688 deletions
--- a/libcontainer/container_linux.go
+++ b/libcontainer/container_linux.go
@@ -2,7 +2,6 @@ package libcontainer
 import (
 	"bytes"
 	"encoding/json"
 	"errors"
 	"fmt"
 	"io"
@@ -629,112 +628,6 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
 	return c.newSetnsProcess(p, cmd, comm)
 }
 // shouldSendMountSources reports whether the child process has to set up its
 // bind mounts from sources that were pre-opened (O_PATH) in the host user
 // namespace.
 // See https://github.com/opencontainers/runc/issues/2484
 func (c *Container) shouldSendMountSources() bool {
 	switch {
 	case !c.config.Namespaces.Contains(configs.NEWUSER),
 		!c.config.Namespaces.Contains(configs.NEWNS):
 		// Handing the mount sources over via SCM_RIGHTS is only needed
 		// when both a userns and a mntns are active.
 		return false
 	case c.config.RootlessEUID:
 		// nsexec.c send_mountsources() requires the setns(mntns)
 		// capabilities CAP_SYS_CHROOT and CAP_SYS_ADMIN.
 		return false
 	}
 	// Sources must be sent as soon as one bind-mount is not id-mapped.
 	for _, mnt := range c.config.Mounts {
 		if !mnt.IsBind() || mnt.IsIDMapped() {
 			continue
 		}
 		return true
 	}
 	return false
 }
 // shouldSendIdmapSources reports whether the child process has to set up its
 // idmap mounts with the mount_setattr already performed in the host user
 // namespace.
 func (c *Container) shouldSendIdmapSources() bool {
 	// nsexec.c mount_setattr() requires CAP_SYS_ADMIN in:
 	// * the user namespace the filesystem was mounted in;
 	// * the user namespace we're trying to idmap the mount to;
 	// * the owning user namespace of the mount namespace you're currently located in.
 	//
 	// See the comment from Christian Brauner:
 	//	https://github.com/opencontainers/runc/pull/3717#discussion_r1103607972
 	//
 	// Rootless is simply ruled out since those permissions are not available
 	// there, and for the time being a userns is required as well.
 	rootless := c.config.RootlessEUID
 	if rootless || !c.config.Namespaces.Contains(configs.NEWUSER) {
 		return false
 	}
 	// Sources must be sent as soon as one bind-mount is id-mapped.
 	for i := range c.config.Mounts {
 		if mnt := c.config.Mounts[i]; mnt.IsBind() && mnt.IsIDMapped() {
 			return true
 		}
 	}
 	return false
 }
 // sendMountSources reserves fd slots (advertised through the
 // _LIBCONTAINER_MOUNT_FDS environment variable) for every bind-mount that is
 // not id-mapped. It does nothing when shouldSendMountSources returns false.
 func (c *Container) sendMountSources(cmd *exec.Cmd, comm *processComm) error {
 	if c.shouldSendMountSources() {
 		plainBind := func(m *configs.Mount) bool {
 			return m.IsBind() && !m.IsIDMapped()
 		}
 		return c.sendFdsSources(cmd, comm, "_LIBCONTAINER_MOUNT_FDS", plainBind)
 	}
 	return nil
 }
 // sendIdmapSources reserves fd slots (advertised through the
 // _LIBCONTAINER_IDMAP_FDS environment variable) for every id-mapped
 // bind-mount. It does nothing when shouldSendIdmapSources returns false.
 func (c *Container) sendIdmapSources(cmd *exec.Cmd, comm *processComm) error {
 	if c.shouldSendIdmapSources() {
 		idmappedBind := func(m *configs.Mount) bool {
 			return m.IsBind() && m.IsIDMapped()
 		}
 		return c.sendFdsSources(cmd, comm, "_LIBCONTAINER_IDMAP_FDS", idmappedBind)
 	}
 	return nil
 }
 // sendFdsSources appends one placeholder entry to cmd.ExtraFiles for each
 // mount accepted by condition and publishes the resulting fd numbers, as a
 // JSON array, in the envVar environment variable of cmd. Mounts rejected by
 // condition are recorded as -1.
 func (c *Container) sendFdsSources(cmd *exec.Cmd, comm *processComm, envVar string, condition func(*configs.Mount) bool) error {
 	// The entries of this slice are paired with mounts (see
 	// StartInitialization() and prepareRootfs()), so it MUST end up with
 	// exactly len(c.config.Mounts) elements.
 	fdNumbers := make([]int, 0, len(c.config.Mounts))
 	for _, mnt := range c.config.Mounts {
 		// -1 entries are ignored later.
 		slot := -1
 		if condition(mnt) {
 			// The fd passed here is never used: nsexec.c overwrites it
 			// with dup3(). Allocating it only serves to learn the number
 			// to put in the environment variable. It must stay open until
 			// cmd.Start(), hence initSockChild is reused because the
 			// lifecycle of that fd is already taken care of.
 			cmd.ExtraFiles = append(cmd.ExtraFiles, comm.initSockChild)
 			slot = stdioFdCount + len(cmd.ExtraFiles) - 1
 		}
 		fdNumbers = append(fdNumbers, slot)
 	}
 	encoded, err := json.Marshal(fdNumbers)
 	if err != nil {
 		return fmt.Errorf("Error creating %v: %w", envVar, err)
 	}
 	cmd.Env = append(cmd.Env, envVar+"="+string(encoded))
 	return nil
 }
 func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, comm *processComm) (*initProcess, error) {
 	cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
 	nsMaps := make(map[configs.NamespaceType]string)
@@ -743,16 +636,10 @@ func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, comm *processComm)
 			nsMaps[ns.Type] = ns.Path
 		}
 	}
-	data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, initStandard)
+	data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps)
 	if err != nil {
 		return nil, err
 	}
 	if err := c.sendMountSources(cmd, comm); err != nil {
 		return nil, err
 	}
 	if err := c.sendIdmapSources(cmd, comm); err != nil {
 		return nil, err
 	}
 	init := &initProcess{
 		cmd:             cmd,
@@ -776,7 +663,7 @@ func (c *Container) newSetnsProcess(p *Process, cmd *exec.Cmd, comm *processComm
 	}
 	// for setns process, we don't have to set cloneflags as the process namespaces
 	// will only be set via setns syscall
-	data, err := c.bootstrapData(0, state.NamespacePaths, initSetns)
+	data, err := c.bootstrapData(0, state.NamespacePaths)
 	if err != nil {
 		return nil, err
 	}
@@ -1165,7 +1052,7 @@ type netlinkError struct{ error }
 // such as one that uses nsenter package to bootstrap the container's
 // init process correctly, i.e. with correct namespaces, uid/gid
 // mapping etc.
-func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string, it initType) (_ io.Reader, Err error) {
+func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) (_ io.Reader, Err error) {
 	// create the netlink message
 	r := nl.NewNetlinkRequest(int(InitMsg), 0)
@@ -1267,48 +1154,6 @@ func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Namespa
 		Value: c.config.RootlessEUID,
 	})
 	// Bind mount source to open.
 	if it == initStandard && c.shouldSendMountSources() {
 		var mounts []byte
 		for _, m := range c.config.Mounts {
 			if m.IsBind() && !m.IsIDMapped() {
 				if strings.IndexByte(m.Source, 0) >= 0 {
 					return nil, fmt.Errorf("mount source string contains null byte: %q", m.Source)
 				}
 				mounts = append(mounts, []byte(m.Source)...)
 			}
 			mounts = append(mounts, byte(0))
 		}
 		r.AddData(&Bytemsg{
 			Type:  MountSourcesAttr,
 			Value: mounts,
 		})
 	}
 	// Idmap mount sources to open.
 	if it == initStandard && c.shouldSendIdmapSources() {
 		var mounts []byte
 		for _, m := range c.config.Mounts {
 			if m.IsBind() && m.IsIDMapped() {
 				// While other parts of the code check this too (like
 				// libcontainer/specconv/spec_linux.go) we do it here also because some libcontainer
 				// users don't use those functions.
 				if strings.IndexByte(m.Source, 0) >= 0 {
 					return nil, fmt.Errorf("mount source string contains null byte: %q", m.Source)
 				}
 				mounts = append(mounts, []byte(m.Source)...)
 			}
 			mounts = append(mounts, byte(0))
 		}
 		r.AddData(&Bytemsg{
 			Type:  IdmapSourcesAttr,
 			Value: mounts,
 		})
 	}
 	// write boottime and monotonic time ns offsets.
 	if c.config.TimeOffsets != nil {
 		var offsetSpec bytes.Buffer
--- a/libcontainer/criu_linux.go
+++ b/libcontainer/criu_linux.go
@@ -618,8 +618,8 @@ func (c *Container) prepareCriuRestoreMounts(mounts []*configs.Mount) error {
 			// because during initial container creation mounts are
 			// set up in the order they are configured.
 			if m.Device == "bind" {
-				if err := utils.WithProcfd(c.config.Rootfs, m.Destination, func(dstFD string) error {
+				if err := utils.WithProcfd(c.config.Rootfs, m.Destination, func(dstFd string) error {
-					return mountViaFDs(m.Source, nil, m.Destination, dstFD, "", unix.MS_BIND|unix.MS_REC, "")
+					return mountViaFds(m.Source, nil, m.Destination, dstFd, "", unix.MS_BIND|unix.MS_REC, "")
 				}); err != nil {
 					return err
 				}
--- a/libcontainer/factory_linux.go
+++ b/libcontainer/factory_linux.go
@@ -214,18 +214,3 @@ func validateID(id string) error {
 	return nil
 }
 // parseFdsFromEnv decodes the JSON array of file descriptor numbers stored in
 // the environment variable envVar. An unset or empty variable yields a nil
 // slice and no error; malformed JSON yields an error naming the variable.
 func parseFdsFromEnv(envVar string) ([]int, error) {
 	raw := os.Getenv(envVar)
 	if len(raw) == 0 {
 		// Always hand back the nil slice when no fd list is present.
 		return nil, nil
 	}
 	var parsed []int
 	err := json.Unmarshal([]byte(raw), &parsed)
 	if err == nil {
 		return parsed, nil
 	}
 	return nil, fmt.Errorf("Error unmarshalling %v: %w", envVar, err)
 }
--- a/libcontainer/init_linux.go
+++ b/libcontainer/init_linux.go
@@ -47,18 +47,6 @@ type network struct {
 	TempVethPeerName string `json:"temp_veth_peer_name"`
 }
 // mountFds carries the pre-opened file descriptors that are paired,
 // index by index, with the container mounts.
 type mountFds struct {
 	// sourceFds are the fds to use as the source when mounting.
 	// The slice should have the same length as the container mounts, since
 	// each entry is paired with the mount at the same index.
 	// The value -1 means that no fd is needed for that mount.
 	// A position holding a valid fd here must not also hold a valid fd in
 	// the other slice of this struct: only one of these fds may be used for
 	// any single mount.
 	sourceFds []int
 	// idmapFds follows the same rules as sourceFds, but holds the fds of
 	// already created idmap mounts, to be used with unix.MoveMount().
 	idmapFds []int
 }
 // initConfig is used for transferring parameters from Exec() to Init()
 type initConfig struct {
 	Args             []string              `json:"args"`
@@ -189,18 +177,6 @@ func startInitialization() (retErr error) {
 		defer pidfdSocket.Close()
 	}
 	// Get mount files (O_PATH).
 	mountSrcFds, err := parseFdsFromEnv("_LIBCONTAINER_MOUNT_FDS")
 	if err != nil {
 		return err
 	}
 	// Get idmap fds.
 	idmapFds, err := parseFdsFromEnv("_LIBCONTAINER_IDMAP_FDS")
 	if err != nil {
 		return err
 	}
 	// Get runc-dmz fds.
 	var dmzExe *os.File
 	if dmzFdStr := os.Getenv("_LIBCONTAINER_DMZEXEFD"); dmzFdStr != "" {
@@ -232,21 +208,16 @@ func startInitialization() (retErr error) {
 	}
 	// If init succeeds, it will not return, hence none of the defers will be called.
-	return containerInit(it, &config, syncPipe, consoleSocket, pidfdSocket, fifofd, logFD, dmzExe, mountFds{sourceFds: mountSrcFds, idmapFds: idmapFds})
+	return containerInit(it, &config, syncPipe, consoleSocket, pidfdSocket, fifofd, logFD, dmzExe)
 }
-func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSocket, pidfdSocket *os.File, fifoFd, logFd int, dmzExe *os.File, mountFds mountFds) error {
+func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSocket, pidfdSocket *os.File, fifoFd, logFd int, dmzExe *os.File) error {
 	if err := populateProcessEnvironment(config.Env); err != nil {
 		return err
 	}
 	switch t {
 	case initSetns:
 		// mount and idmap fds must be nil in this case. We don't mount while doing runc exec.
 		if mountFds.sourceFds != nil || mountFds.idmapFds != nil {
 			return errors.New("mount and idmap fds must be nil; can't mount from exec")
 		}
 		i := &linuxSetnsInit{
 			pipe:          pipe,
 			consoleSocket: consoleSocket,
@@ -266,7 +237,6 @@ func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSock
 			fifoFd:        fifoFd,
 			logFd:         logFd,
 			dmzExe:        dmzExe,
 			mountFds:      mountFds,
 		}
 		return i.Init()
 	}
--- a/libcontainer/message_linux.go
+++ b/libcontainer/message_linux.go
@@ -21,9 +21,7 @@ const (
 	RootlessEUIDAttr uint16 = 27287
 	UidmapPathAttr   uint16 = 27288
 	GidmapPathAttr   uint16 = 27289
-	MountSourcesAttr uint16 = 27290
+	TimeOffsetsAttr  uint16 = 27290
 	IdmapSourcesAttr uint16 = 27291
 	TimeOffsetsAttr  uint16 = 27292
 )
 type Int32msg struct {
--- a/libcontainer/mount_linux.go
+++ b/libcontainer/mount_linux.go
@@ -1,19 +1,44 @@
 package libcontainer
 import (
 	"errors"
 	"fmt"
 	"io/fs"
 	"os"
 	"strconv"
 	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
 	"github.com/opencontainers/runc/libcontainer/configs"
 	"github.com/opencontainers/runc/libcontainer/userns"
 )
 // mountSourceType indicates what type of file descriptor is being returned. It
 // is used to tell rootfs_linux.go whether or not to use move_mount(2) to
 // install the mount.
 type mountSourceType string
 const (
 	// mountSourceOpenTree marks an open_tree(2)-style file descriptor, which
 	// has to be installed using move_mount(2).
 	mountSourceOpenTree mountSourceType = "open_tree"
 	// mountSourcePlain marks a plain file descriptor that can be mounted
 	// through /proc/self/fd.
 	mountSourcePlain mountSourceType = "plain-open"
 )
 // mountSource pairs an open mount source file with the mountSourceType that
 // says how it must be installed. Only Type is serialized to JSON; the file
 // field is unexported and explicitly excluded.
 type mountSource struct {
 	Type mountSourceType `json:"type"`
 	file *os.File        `json:"-"`
 }
 // mountError holds an error from a failed mount or unmount operation.
 type mountError struct {
 	op      string
 	source  string
-	srcFD  *int
+	srcFile *mountSource
 	target  string
-	dstFD  string
+	dstFd   string
 	flags   uintptr
 	data    string
 	err     error
@@ -25,13 +50,14 @@ func (e *mountError) Error() string {
 	if e.source != "" {
 		out += "src=" + e.source + ", "
-		if e.srcFD != nil {
+		if e.srcFile != nil {
-			out += "srcFD=" + strconv.Itoa(*e.srcFD) + ", "
+			out += "srcType=" + string(e.srcFile.Type) + ", "
 			out += "srcFd=" + strconv.Itoa(int(e.srcFile.file.Fd())) + ", "
 		}
 	}
 	out += "dst=" + e.target
-	if e.dstFD != "" {
+	if e.dstFd != "" {
-		out += ", dstFD=" + e.dstFD
+		out += ", dstFd=" + e.dstFd
 	}
 	if e.flags != uintptr(0) {
@@ -54,35 +80,55 @@ func (e *mountError) Unwrap() error {
 // mount is a simple unix.Mount wrapper, returning an error with more context
 // in case it failed.
 func mount(source, target, fstype string, flags uintptr, data string) error {
-	return mountViaFDs(source, nil, target, "", fstype, flags, data)
+	return mountViaFds(source, nil, target, "", fstype, flags, data)
 }
-// mountViaFDs is a unix.Mount wrapper which uses srcFD instead of source,
+// mountViaFds is a unix.Mount wrapper which uses srcFile instead of source,
-// and dstFD instead of target, unless those are empty.
+// and dstFd instead of target, unless those are empty.
 // If srcFD is different than nil, its path (i.e. "/proc/self/fd/NN") will be
 // constructed by this function.
 // dstFD argument, if non-empty, is expected to be in the form of a path to an
 // opened file descriptor on procfs (i.e. "/proc/self/fd/NN").
 //
-// If case an FD is used instead of a source or a target path, the
+// If srcFile is non-nil and flags does not contain MS_REMOUNT, mountViaFds
-// corresponding path is only used to add context to an error in case
+// will mount it according to the mountSourceType of the file descriptor.
-// the mount operation has failed.
+//
-func mountViaFDs(source string, srcFD *int, target, dstFD, fstype string, flags uintptr, data string) error {
+// The dstFd argument, if non-empty, is expected to be in the form of a path to
-	src := source
+// an opened file descriptor on procfs (i.e. "/proc/self/fd/NN").
-	if srcFD != nil {
+//
-		src = "/proc/self/fd/" + strconv.Itoa(*srcFD)
+// If a file descriptor is used instead of a source or a target path, the
 // corresponding path is only used to add context to an error in case the mount
 // operation has failed.
 func mountViaFds(source string, srcFile *mountSource, target, dstFd, fstype string, flags uintptr, data string) error {
 	// MS_REMOUNT and srcFile don't make sense together.
 	if srcFile != nil && flags&unix.MS_REMOUNT != 0 {
 		logrus.Debugf("mount source passed along with MS_REMOUNT -- ignoring srcFile")
 		srcFile = nil
 	}
 	dst := target
-	if dstFD != "" {
+	if dstFd != "" {
-		dst = dstFD
+		dst = dstFd
 	}
-	if err := unix.Mount(src, dst, fstype, flags, data); err != nil {
+	src := source
 	if srcFile != nil {
 		src = "/proc/self/fd/" + strconv.Itoa(int(srcFile.file.Fd()))
 	}
 	var op string
 	var err error
 	if srcFile != nil && srcFile.Type == mountSourceOpenTree {
 		op = "move_mount"
 		err = unix.MoveMount(int(srcFile.file.Fd()), "",
 			unix.AT_FDCWD, dstFd,
 			unix.MOVE_MOUNT_F_EMPTY_PATH|unix.MOVE_MOUNT_T_SYMLINKS)
 	} else {
 		op = "mount"
 		err = unix.Mount(src, dst, fstype, flags, data)
 	}
 	if err != nil {
 		return &mountError{
-			op:     "mount",
+			op:      op,
 			source:  source,
-			srcFD:  srcFD,
+			srcFile: srcFile,
 			target:  target,
-			dstFD:  dstFD,
+			dstFd:   dstFd,
 			flags:   flags,
 			data:    data,
 			err:     err,
@@ -121,3 +167,81 @@ func syscallMode(i fs.FileMode) (o uint32) {
 	// No mapping for Go's ModeTemporary (plan9 only).
 	return
 }
 // mountFd creates a "mount source fd" (either through open_tree(2) or just
 // open(O_PATH)) based on the provided configuration. This function must be
 // called from within the container's mount namespace.
 //
 // In the case of idmapped mount configurations, the returned mount source will
 // be an open_tree(2) file with MOUNT_ATTR_IDMAP applied. For other
 // bind-mounts, it will be an O_PATH. Mounts that are not bind-mounts are
 // rejected with an error.
 //
 // If nsHandles is nil, a temporary userns.Handles is created for the duration
 // of the call and released before returning. Whenever an error is returned,
 // any file descriptor opened by this function has been closed again.
 //
 // This helper is only intended to be used by goCreateMountSources.
 func mountFd(nsHandles *userns.Handles, m *configs.Mount) (*mountSource, error) {
 	if !m.IsBind() {
 		return nil, errors.New("new mount api: only bind-mounts are supported")
 	}
 	if nsHandles == nil {
 		nsHandles = new(userns.Handles)
 		defer nsHandles.Release()
 	}
 	var mountFile *os.File
 	var sourceType mountSourceType
 	// Ideally, we would use OPEN_TREE_CLONE for everything, because we can
 	// be sure that the file descriptor cannot be used to escape outside of
 	// the mount root. Unfortunately, OPEN_TREE_CLONE is far more expensive
 	// than open(2) because it requires doing mounts inside a new anonymous
 	// mount namespace. So we use open(2) for standard bind-mounts, and
 	// OPEN_TREE_CLONE when we need to set mount attributes here.
 	//
 	// While passing open(2)'d paths from the host rootfs isn't exactly the
 	// safest thing in the world, the files will not survive across
 	// execve(2) and "runc init" is non-dumpable so it should not be
 	// possible for a malicious container process to gain access to the
 	// file descriptors. We also don't do any of this for "runc exec",
 	// lessening the risk even further.
 	if m.IsIDMapped() {
 		flags := uint(unix.OPEN_TREE_CLONE | unix.OPEN_TREE_CLOEXEC)
 		if m.Flags&unix.MS_REC == unix.MS_REC {
 			flags |= unix.AT_RECURSIVE
 		}
 		fd, err := unix.OpenTree(unix.AT_FDCWD, m.Source, flags)
 		if err != nil {
 			return nil, &os.PathError{Op: "open_tree(OPEN_TREE_CLONE)", Path: m.Source, Err: err}
 		}
 		mountFile = os.NewFile(uintptr(fd), m.Source)
 		sourceType = mountSourceOpenTree
 		// Configure the id mapping.
 		usernsFile, err := nsHandles.Get(userns.Mapping{
 			UIDMappings: m.UIDMappings,
 			GIDMappings: m.GIDMappings,
 		})
 		if err != nil {
 			// Don't leak the detached mount tree when bailing out.
 			_ = mountFile.Close()
 			return nil, fmt.Errorf("failed to create userns for %s id-mapping: %w", m.Source, err)
 		}
 		defer usernsFile.Close()
 		if err := unix.MountSetattr(int(mountFile.Fd()), "", unix.AT_EMPTY_PATH, &unix.MountAttr{
 			Attr_set:  unix.MOUNT_ATTR_IDMAP,
 			Userns_fd: uint64(usernsFile.Fd()),
 		}); err != nil {
 			// Don't leak the detached mount tree when bailing out.
 			_ = mountFile.Close()
 			return nil, fmt.Errorf("failed to set MOUNT_ATTR_IDMAP on %s: %w", m.Source, err)
 		}
 	} else {
 		var err error
 		mountFile, err = os.OpenFile(m.Source, unix.O_PATH|unix.O_CLOEXEC, 0)
 		if err != nil {
 			return nil, err
 		}
 		sourceType = mountSourcePlain
 	}
 	return &mountSource{
 		Type: sourceType,
 		file: mountFile,
 	}, nil
 }
--- a/libcontainer/nsenter/idmap.h
+++ b/libcontainer/nsenter/idmap.h
@@ -1,76 +0,0 @@
 #ifndef IDMAP_H
 #define IDMAP_H
 #include <sys/mount.h>
 // Centos-7 doesn't have this file nor the __has_include() directive, so let's
 // just leave this commented out and we can uncomment when it hits EOL (2024-06-30).
 //#include <linux/mount.h>
 #include <sys/syscall.h>
 #include <unistd.h>
 /* mount_setattr() */
 #ifndef MOUNT_ATTR_IDMAP
 #define MOUNT_ATTR_IDMAP 0x00100000
 #endif
 #ifndef __NR_mount_setattr
 	#if defined _MIPS_SIM
 		#if _MIPS_SIM == _MIPS_SIM_ABI32	/* o32 */
 			#define __NR_mount_setattr (442 + 4000)
 		#endif
 		#if _MIPS_SIM == _MIPS_SIM_NABI32	/* n32 */
 			#define __NR_mount_setattr (442 + 6000)
 		#endif
 		#if _MIPS_SIM == _MIPS_SIM_ABI64	/* n64 */
 			#define __NR_mount_setattr (442 + 5000)
 		#endif
 	#else
 		#define __NR_mount_setattr 442
 	#endif
 #endif
 #ifndef MOUNT_ATTR_SIZE_VER0
 struct mount_attr {
 	__u64 attr_set;
 	__u64 attr_clr;
 	__u64 propagation;
 	__u64 userns_fd;
 };
 #endif
 /* open_tree() */
 #ifndef OPEN_TREE_CLONE
 #define OPEN_TREE_CLONE 1
 #endif
 #ifndef OPEN_TREE_CLOEXEC
 #define OPEN_TREE_CLOEXEC O_CLOEXEC
 #endif
 #ifndef __NR_open_tree
 	#if defined _MIPS_SIM
 		#if _MIPS_SIM == _MIPS_SIM_ABI32	/* o32 */
 			#define __NR_open_tree 4428
 		#endif
 		#if _MIPS_SIM == _MIPS_SIM_NABI32	/* n32 */
 			#define __NR_open_tree 6428
 		#endif
 		#if _MIPS_SIM == _MIPS_SIM_ABI64	/* n64 */
 			#define __NR_open_tree 5428
 		#endif
 	#else
 		#define __NR_open_tree 428
 	#endif
 #endif
 /*
  * sys_mount_setattr() invokes the mount_setattr syscall directly through
  * syscall(), using the __NR_mount_setattr number (defined above when the
  * system headers do not provide it). All arguments are forwarded unchanged
  * and the syscall() result is returned.
  */
 static inline int sys_mount_setattr(int dfd, const char *path, unsigned int flags, struct mount_attr *attr, size_t size)
 {
 	return syscall(__NR_mount_setattr, dfd, path, flags, attr, size);
 }
 /*
  * sys_open_tree() invokes the open_tree syscall directly through syscall(),
  * using the __NR_open_tree number (defined above when the system headers do
  * not provide it). All arguments are forwarded unchanged and the syscall()
  * result is returned.
  */
 static inline int sys_open_tree(int dfd, const char *filename, unsigned int flags)
 {
 	return syscall(__NR_open_tree, dfd, filename, flags);
 }
 #endif /* IDMAP_H */
--- a/libcontainer/nsenter/nsexec.c
+++ b/libcontainer/nsenter/nsexec.c
@@ -33,9 +33,6 @@
 /* Get all of the CLONE_NEW* flags. */
 #include "namespace.h"
 /* Get definitions for idmap sources */
 #include "idmap.h"
 /* Synchronisation values. */
 enum sync_t {
 	SYNC_USERMAP_PLS = 0x40,	/* Request parent to map our users. */
@@ -44,12 +41,8 @@ enum sync_t {
 	SYNC_RECVPID_ACK = 0x43,	/* PID was correctly received by parent. */
 	SYNC_GRANDCHILD = 0x44,	/* The grandchild is ready to run. */
 	SYNC_CHILD_FINISH = 0x45,	/* The child or grandchild has finished. */
-	SYNC_MOUNTSOURCES_PLS = 0x46,	/* Tell parent to send mount sources by SCM_RIGHTS. */
+	SYNC_TIMEOFFSETS_PLS = 0x46,	/* Request parent to write timens offsets. */
-	SYNC_MOUNTSOURCES_ACK = 0x47,	/* All mount sources have been sent. */
+	SYNC_TIMEOFFSETS_ACK = 0x47,	/* Timens offsets were written. */
 	SYNC_MOUNT_IDMAP_PLS = 0x48,	/* Tell parent to mount idmap sources. */
 	SYNC_MOUNT_IDMAP_ACK = 0x49,	/* All idmap mounts have been done. */
 	SYNC_TIMEOFFSETS_PLS = 0x50,	/* Request parent to write timens offsets. */
 	SYNC_TIMEOFFSETS_ACK = 0x51,	/* Timens offsets were written. */
 };
 #define STAGE_SETUP  -1
@@ -99,14 +92,6 @@ struct nlconfig_t {
 	char *gidmappath;
 	size_t gidmappath_len;
 	/* Mount sources opened outside the container userns. */
 	char *mountsources;
 	size_t mountsources_len;
 	/* Idmap sources opened outside the container userns which will be id mapped. */
 	char *idmapsources;
 	size_t idmapsources_len;
 	/* Time NS offsets. */
 	char *timensoffset;
 	size_t timensoffset_len;
@@ -126,9 +111,7 @@ struct nlconfig_t {
 #define ROOTLESS_EUID_ATTR	27287
 #define UIDMAPPATH_ATTR		27288
 #define GIDMAPPATH_ATTR		27289
-#define MOUNT_SOURCES_ATTR	27290
+#define TIMENSOFFSET_ATTR	27290
 #define IDMAP_SOURCES_ATTR	27291
 #define TIMENSOFFSET_ATTR	27292
 /*
 * Use the raw syscall for versions of glibc which don't include a function for
@@ -446,14 +429,6 @@ static void nl_parse(int fd, struct nlconfig_t *config)
 		case SETGROUP_ATTR:
 			config->is_setgroup = readint8(current);
 			break;
 		case MOUNT_SOURCES_ATTR:
 			config->mountsources = current;
 			config->mountsources_len = payload_len;
 			break;
 		case IDMAP_SOURCES_ATTR:
 			config->idmapsources = current;
 			config->idmapsources_len = payload_len;
 			break;
 		case TIMENSOFFSET_ATTR:
 			config->timensoffset = current;
 			config->timensoffset_len = payload_len;
@@ -546,115 +521,6 @@ static inline int sane_kill(pid_t pid, int signum)
 		return 0;
 }
 /* receive_fd_sources parses env_var as an array of fd numbers and, for each element that is
  * not -1, it receives an fd via SCM_RIGHTS and dup3 it to the fd requested in
  * the element of the env var.
  *
  * The process bails if the env var is unset or malformed, if an fd number is
  * out of range, or if installing a received fd fails.
  */
 void receive_fd_sources(int sockfd, const char *env_var)
 {
 	char *fds, *endp;
 	long new_fd;
 	// This env var must be a json array of ints.
 	fds = getenv(env_var);
 	// getenv() returns NULL for an unset variable; bail instead of
 	// dereferencing it.
 	if (fds == NULL) {
 		bail("missing %s env var", env_var);
 	}
 	if (fds[0] != '[') {
 		bail("malformed %s env var: missing '['", env_var);
 	}
 	fds++;
 	// Each iteration parses one number; endp is left on the separator that
 	// follows it, and the loop stops once that separator is the closing ']'.
 	for (endp = fds; *endp != ']'; fds = endp + 1) {
 		new_fd = strtol(fds, &endp, 10);
 		if (endp == fds) {
 			bail("malformed %s env var: not a number", env_var);
 		}
 		if (*endp == '\0') {
 			bail("malformed %s env var: missing ]", env_var);
 		}
 		// The list contains -1 when no fd is needed. Ignore them.
 		if (new_fd == -1) {
 			continue;
 		}
 		if (new_fd == LONG_MAX || new_fd < 0 || new_fd > INT_MAX) {
 			bail("malformed %s env var: fds out of range", env_var);
 		}
 		// Move the received fd onto the number announced in the env var.
 		int recv_fd = receive_fd(sockfd);
 		if (dup3(recv_fd, new_fd, O_CLOEXEC) < 0) {
 			bail("cannot dup3 fd %d to %ld", recv_fd, new_fd);
 		}
 		if (close(recv_fd) < 0) {
 			bail("cannot close fd %d", recv_fd);
 		}
 	}
 }
 /*
  * receive_mountsources receives the mount source fds over sockfd and installs
  * them on the fd numbers listed in the _LIBCONTAINER_MOUNT_FDS env var.
  */
 void receive_mountsources(int sockfd)
 {
 	receive_fd_sources(sockfd, "_LIBCONTAINER_MOUNT_FDS");
 }
 /*
  * send_mountsources opens every path in the NUL-separated mountsources buffer
  * (mountsources_len bytes long) with O_PATH while temporarily joined to the
  * mount namespace of child, and sends each resulting fd over sockfd. Empty
  * entries in the buffer are skipped. The original mount namespace is restored
  * before returning. Any failure is fatal (bail).
  */
 void send_mountsources(int sockfd, pid_t child, char *mountsources, size_t mountsources_len)
 {
 	char proc_path[PATH_MAX];
 	int host_mntns_fd;
 	int container_mntns_fd;
 	int fd;
 	int ret;
 	// container_linux.go shouldSendMountSources() decides if mount sources
 	// should be pre-opened (O_PATH) and passed via SCM_RIGHTS
 	if (mountsources == NULL)
 		return;
 	// Keep a handle on the current mount namespace so it can be rejoined
 	// once all sources have been sent.
 	host_mntns_fd = open("/proc/self/ns/mnt", O_RDONLY | O_CLOEXEC);
 	if (host_mntns_fd == -1)
 		bail("failed to get current mount namespace");
 	if (snprintf(proc_path, PATH_MAX, "/proc/%d/ns/mnt", child) < 0)
 		bail("failed to get mount namespace path");
 	container_mntns_fd = open(proc_path, O_RDONLY | O_CLOEXEC);
 	if (container_mntns_fd == -1)
 		bail("failed to get container mount namespace");
 	// From here on paths are resolved in the mount namespace of child.
 	if (setns(container_mntns_fd, CLONE_NEWNS) < 0)
 		bail("failed to setns to container mntns");
 	char *mountsources_end = mountsources + mountsources_len;
 	while (mountsources < mountsources_end) {
 		// An empty entry means no fd has to be sent for that position.
 		if (mountsources[0] == '\0') {
 			mountsources++;
 			continue;
 		}
 		fd = open(mountsources, O_PATH | O_CLOEXEC);
 		if (fd < 0)
 			bail("failed to open mount source %s", mountsources);
 		write_log(DEBUG, "~> sending fd for: %s", mountsources);
 		if (send_fd(sockfd, fd) < 0)
 			bail("failed to send fd %d via unix socket %d", fd, sockfd);
 		ret = close(fd);
 		if (ret != 0)
 			bail("failed to close mount source fd %d", fd);
 		// Advance past this entry and its terminating NUL.
 		mountsources += strlen(mountsources) + 1;
 	}
 	// Rejoin the mount namespace we started in.
 	if (setns(host_mntns_fd, CLONE_NEWNS) < 0)
 		bail("failed to setns to host mntns");
 	ret = close(host_mntns_fd);
 	if (ret != 0)
 		bail("failed to close host mount namespace fd %d", host_mntns_fd);
 	ret = close(container_mntns_fd);
 	if (ret != 0)
 		bail("failed to close container mount namespace fd %d", container_mntns_fd);
 }
 void try_unshare(int flags, const char *msg)
 {
 	write_log(DEBUG, "unshare %s", msg);
@@ -674,89 +540,6 @@ void try_unshare(int flags, const char *msg)
 	bail("failed to unshare %s", msg);
 }
 /*
  * send_idmapsources creates, for every path in the NUL-separated idmap_src
  * buffer (idmap_src_len bytes long), a detached mount with open_tree(2),
  * id-maps it with mount_setattr(2) using the user namespace of pid, and sends
  * the resulting fd over sockfd. Empty entries in the buffer are skipped.
  * On any failure pid is killed with SIGKILL and the process bails.
  */
 void send_idmapsources(int sockfd, pid_t pid, char *idmap_src, int idmap_src_len)
 {
 	char proc_user_path[PATH_MAX];
 	/* Open the userns fd only once.
 	 * Currently we only support idmap mounts that use the same mapping than
 	 * the userns. This is validated in libcontainer/configs/validate/validator.go,
 	 * so if we reached here, we know the mapping for the idmap is the same
 	 * as the userns. This is why we just open the userns_fd once from the
 	 * PID of the child process that has the userns already applied.
 	 */
 	int ret = snprintf(proc_user_path, sizeof(proc_user_path), "/proc/%d/ns/user", pid);
 	if (ret < 0 || (size_t)ret >= sizeof(proc_user_path)) {
 		sane_kill(pid, SIGKILL);
 		bail("failed to create userns path string");
 	}
 	int userns_fd = open(proc_user_path, O_RDONLY | O_CLOEXEC | O_NOCTTY);
 	if (userns_fd < 0) {
 		sane_kill(pid, SIGKILL);
 		bail("failed to get user namespace fd");
 	}
 	char *idmap_end = idmap_src + idmap_src_len;
 	while (idmap_src < idmap_end) {
 		/* An empty entry means no fd has to be sent for that position. */
 		if (idmap_src[0] == '\0') {
 			idmap_src++;
 			continue;
 		}
 		int fd_tree = sys_open_tree(-EBADF, idmap_src,
 					    OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC |
 					    AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT);
 		if (fd_tree < 0) {
 			sane_kill(pid, SIGKILL);
 			if (errno == ENOSYS) {
 				bail("open_tree(2) failed, the kernel doesn't support ID-mapped mounts");
 			} else if (errno == EINVAL) {
 				bail("open_tree(2) failed with path: %s, the kernel doesn't support ID-mapped mounts",
 				     idmap_src);
 			} else {
 				bail("open_tree(2) failed with path: %s", idmap_src);
 			}
 		}
 		struct mount_attr attr = {
 			.attr_set = MOUNT_ATTR_IDMAP,
 			.userns_fd = userns_fd,
 		};
 		ret = sys_mount_setattr(fd_tree, "", AT_EMPTY_PATH, &attr, sizeof(attr));
 		if (ret < 0) {
 			sane_kill(pid, SIGKILL);
 			if (errno == ENOSYS)
 				bail("mount_setattr(2) failed, the kernel doesn't support ID-mapped mounts");
 			else if (errno == EINVAL)
 				bail("mount_setattr(2) failed with path: %s, maybe the filesystem doesn't support ID-mapped mounts", idmap_src);
 			else
 				bail("mount_setattr(2) failed with path: %s", idmap_src);
 		}
 		write_log(DEBUG, "~> sending idmap source: %s with mapping from: %s", idmap_src, proc_user_path);
 		/* Check the result like send_mountsources() does, rather than
 		 * silently dropping a failed transfer. */
 		if (send_fd(sockfd, fd_tree) < 0) {
 			sane_kill(pid, SIGKILL);
 			bail("failed to send fd %d via unix socket %d", fd_tree, sockfd);
 		}
 		if (close(fd_tree) < 0) {
 			sane_kill(pid, SIGKILL);
 			bail("error closing fd_tree");
 		}
 		/* Advance past this entry and its terminating NUL. */
 		idmap_src += strlen(idmap_src) + 1;
 	}
 	if (close(userns_fd) < 0) {
 		sane_kill(pid, SIGKILL);
 		bail("error closing userns fd");
 	}
 }
 /*
  * receive_idmapsources receives the idmap source fds over sockfd and installs
  * them on the fd numbers listed in the _LIBCONTAINER_IDMAP_FDS env var.
  */
 void receive_idmapsources(int sockfd)
 {
 	receive_fd_sources(sockfd, "_LIBCONTAINER_IDMAP_FDS");
 }
 static void update_timens_offsets(pid_t pid, char *map, size_t map_len)
 {
 	if (map == NULL || map_len == 0)
@@ -988,28 +771,6 @@ void nsexec(void)
 						sane_kill(stage2_pid, SIGKILL);
 						bail("failed to sync with runc: write(pid-JSON)");
 					}
 					break;
 				case SYNC_MOUNTSOURCES_PLS:
 					write_log(DEBUG, "stage-1 requested to open mount sources");
 					send_mountsources(syncfd, stage1_pid, config.mountsources,
 							  config.mountsources_len);
 					s = SYNC_MOUNTSOURCES_ACK;
 					if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
 						sane_kill(stage1_pid, SIGKILL);
 						bail("failed to sync with child: write(SYNC_MOUNTSOURCES_ACK)");
 					}
 					break;
 				case SYNC_MOUNT_IDMAP_PLS:
 					write_log(DEBUG, "stage-1 requested to open idmap sources");
 					send_idmapsources(syncfd, stage1_pid, config.idmapsources,
 							  config.idmapsources_len);
 					s = SYNC_MOUNT_IDMAP_ACK;
 					if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
 						sane_kill(stage1_pid, SIGKILL);
 						bail("failed to sync with child: write(SYNC_MOUNT_IDMAP_ACK)");
 					}
 					break;
 				case SYNC_TIMEOFFSETS_PLS:
 					write_log(DEBUG, "stage-1 requested timens offsets to be configured");
@@ -1186,38 +947,6 @@ void nsexec(void)
 					bail("failed to sync with parent: SYNC_TIMEOFFSETS_ACK: got %u", s);
 			}
 			/* Ask our parent to send the mount sources fds. */
 			if (config.mountsources) {
 				write_log(DEBUG, "request stage-0 to send mount sources");
 				s = SYNC_MOUNTSOURCES_PLS;
 				if (write(syncfd, &s, sizeof(s)) != sizeof(s))
 					bail("failed to sync with parent: write(SYNC_MOUNTSOURCES_PLS)");
 				/* Receive and install all mount sources fds. */
 				receive_mountsources(syncfd);
 				/* Parent finished to send the mount sources fds. */
 				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
 					bail("failed to sync with parent: read(SYNC_MOUNTSOURCES_ACK)");
 				if (s != SYNC_MOUNTSOURCES_ACK)
 					bail("failed to sync with parent: SYNC_MOUNTSOURCES_ACK: got %u", s);
 			}
 			if (config.idmapsources) {
 				write_log(DEBUG, "request stage-0 to send idmap sources");
 				s = SYNC_MOUNT_IDMAP_PLS;
 				if (write(syncfd, &s, sizeof(s)) != sizeof(s))
 					bail("failed to sync with parent: write(SYNC_MOUNT_IDMAP_PLS)");
 				/* Receive and install all idmap fds. */
 				receive_idmapsources(syncfd);
 				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
 					bail("failed to sync with parent: read(SYNC_MOUNT_IDMAP_ACK)");
 				if (s != SYNC_MOUNT_IDMAP_ACK)
 					bail("failed to sync with parent: SYNC_MOUNT_IDMAP_ACK: got %u", s);
 			}
 			/*
 			 * TODO: What about non-namespace clone flags that we're dropping here?
 			 *
--- a/libcontainer/process_linux.go
+++ b/libcontainer/process_linux.go
@@ -1,6 +1,7 @@
 package libcontainer
 import (
 	"context"
 	"encoding/json"
 	"errors"
 	"fmt"
@@ -11,18 +12,21 @@ import (
 	"path/filepath"
 	"runtime"
 	"strconv"
 	"sync"
 	"time"
 	"github.com/opencontainers/runtime-spec/specs-go"
 	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
 	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
 	"github.com/opencontainers/runc/libcontainer/configs"
 	"github.com/opencontainers/runc/libcontainer/intelrdt"
 	"github.com/opencontainers/runc/libcontainer/logs"
 	"github.com/opencontainers/runc/libcontainer/system"
 	"github.com/opencontainers/runc/libcontainer/userns"
 	"github.com/opencontainers/runc/libcontainer/utils"
 	"github.com/opencontainers/runtime-spec/specs-go"
 	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
 )
 type parentProcess interface {
@@ -208,6 +212,9 @@ func (p *setnsProcess) start() (retErr error) {
 		case procHooks:
 			// This shouldn't happen.
 			panic("unexpected procHooks in setns")
 		case procMountPlease:
 			// This shouldn't happen.
 			panic("unexpected procMountPlease in setns")
 		case procSeccomp:
 			if p.config.Config.Seccomp.ListenerPath == "" {
 				return errors.New("seccomp listenerPath is not set")
@@ -398,6 +405,110 @@ func (p *initProcess) waitForChildExit(childPid int) error {
 	return nil
 }
 // mountSourceRequestFn is the signature of the callback used to request a
 // mountSource for the given mount configuration.
 type mountSourceRequestFn func(*configs.Mount) (*mountSource, error)
 // goCreateMountSources spawns a goroutine which creates open_tree(2)-style
 // mountfds based on the requested configs.Mount configuration. The returned
 // requestFn and cancelFn are used to interact with the goroutine.
 //
 // The goroutine locks itself to its OS thread, unshares CLONE_FS and joins
 // the mount namespace of p.pid() before it services any request. If that
 // setup fails, the error is returned and both returned functions are nil.
 // The goroutine exits once the context is done (the derived context also
 // times out after one minute) or once the request channel has been closed.
 //
 // The caller of the returned mountSourceRequestFn is responsible for closing
 // the returned file. Once the context is done, every call to the request
 // function returns an error.
 //
 // NOTE(review): the request function is written for sequential callers (such
 // as the sync loop in start()); confirm before calling it concurrently.
 func (p *initProcess) goCreateMountSources(ctx context.Context) (mountSourceRequestFn, context.CancelFunc, error) {
 	type response struct {
 		src *mountSource
 		err error
 	}
 	// errCh is buffered so the goroutine can report a setup error and exit
 	// without waiting for us to read it.
 	errCh := make(chan error, 1)
 	requestCh := make(chan *configs.Mount)
 	responseCh := make(chan response)
 	ctx, cancelFn := context.WithTimeout(ctx, 1*time.Minute)
 	go func() {
 		// We lock this thread because we need to setns(2) here. There is no
 		// UnlockOSThread() here, to ensure that the Go runtime will kill this
 		// thread once this goroutine returns (ensuring no other goroutines run
 		// in this context).
 		runtime.LockOSThread()
 		// Detach from the shared fs of the rest of the Go process in order to
 		// be able to CLONE_NEWNS.
 		if err := unix.Unshare(unix.CLONE_FS); err != nil {
 			err = os.NewSyscallError("unshare(CLONE_FS)", err)
 			errCh <- fmt.Errorf("mount source thread: %w", err)
 			return
 		}
 		// Attach to the container's mount namespace.
 		nsFd, err := os.Open(fmt.Sprintf("/proc/%d/ns/mnt", p.pid()))
 		if err != nil {
 			errCh <- fmt.Errorf("mount source thread: open container mntns: %w", err)
 			return
 		}
 		defer nsFd.Close()
 		if err := unix.Setns(int(nsFd.Fd()), unix.CLONE_NEWNS); err != nil {
 			err = os.NewSyscallError("setns", err)
 			errCh <- fmt.Errorf("mount source thread: join container mntns: %w", err)
 			return
 		}
 		// No errors during setup!
 		close(errCh)
 		logrus.Debugf("mount source thread: successfully running in container mntns")
 		nsHandles := new(userns.Handles)
 		defer nsHandles.Release()
 	loop:
 		for {
 			select {
 			case m, ok := <-requestCh:
 				if !ok {
 					break loop
 				}
 				src, err := mountFd(nsHandles, m)
 				logrus.Debugf("mount source thread: handling request for %q: %v %v", m.Source, src, err)
 				// The requester may have given up (context done) while we
 				// were creating the mountfd. Never block on the delivery in
 				// that case, otherwise this locked thread would hang forever.
 				select {
 				case responseCh <- response{
 					src: src,
 					err: err,
 				}:
 				case <-ctx.Done():
 					// Nobody will take ownership of the file, so close it
 					// here rather than leaking it.
 					if src != nil && src.file != nil {
 						_ = src.file.Close()
 					}
 					break loop
 				}
 			case <-ctx.Done():
 				break loop
 			}
 		}
 		logrus.Debugf("mount source thread: closing thread: %v", ctx.Err())
 		close(responseCh)
 	}()
 	// Check for setup errors.
 	err := <-errCh
 	if err != nil {
 		cancelFn()
 		return nil, nil, err
 	}
 	// TODO: Switch to context.AfterFunc when we switch to Go 1.21.
 	var requestChCloseOnce sync.Once
 	requestFn := func(m *configs.Mount) (*mountSource, error) {
 		// requestCh is only ever closed after the context is done, so bailing
 		// out here guarantees we never send on a closed channel (which would
 		// panic) when called again after a failure.
 		if ctxErr := ctx.Err(); ctxErr != nil {
 			return nil, fmt.Errorf("send mount request cancelled: %w", ctxErr)
 		}
 		var err error
 		select {
 		case requestCh <- m:
 			select {
 			case resp, ok := <-responseCh:
 				if ok {
 					return resp.src, resp.err
 				}
 				// The thread shut down without answering. Make sure the
 				// caller gets an error rather than a nil mountSource.
 				err = errors.New("mount source thread exited without responding")
 				if ctxErr := ctx.Err(); ctxErr != nil {
 					err = fmt.Errorf("receive mount source context cancelled: %w", ctxErr)
 				}
 			case <-ctx.Done():
 				err = fmt.Errorf("receive mount source context cancelled: %w", ctx.Err())
 			}
 		case <-ctx.Done():
 			err = fmt.Errorf("send mount request cancelled: %w", ctx.Err())
 		}
 		requestChCloseOnce.Do(func() { close(requestCh) })
 		return nil, err
 	}
 	return requestFn, cancelFn, nil
 }
 func (p *initProcess) start() (retErr error) {
 	defer p.comm.closeParent()
 	err := p.cmd.Start()
@@ -487,6 +598,22 @@ func (p *initProcess) start() (retErr error) {
 		return fmt.Errorf("error waiting for our first child to exit: %w", err)
 	}
 	// Spin up a goroutine to handle remapping mount requests by runc init.
 	// There is no point doing this for rootless containers because they cannot
 	// configure MOUNT_ATTR_IDMAP, nor do OPEN_TREE_CLONE. We could just
 	// service plain-open requests for plain bind-mounts but there's no need
 	// (rootless containers will never have permission issues on a source mount
 	// that the parent process can help with -- they are the same user).
 	var mountRequest mountSourceRequestFn
 	if !p.container.config.RootlessEUID {
 		request, cancel, err := p.goCreateMountSources(context.Background())
 		if err != nil {
 			return fmt.Errorf("error spawning mount remapping thread: %w", err)
 		}
 		defer cancel()
 		mountRequest = request
 	}
 	if err := p.createNetworkInterfaces(); err != nil {
 		return fmt.Errorf("error creating network interfaces: %w", err)
 	}
@@ -500,6 +627,35 @@ func (p *initProcess) start() (retErr error) {
 	var seenProcReady bool
 	ierr := parseSync(p.comm.syncSockParent, func(sync *syncT) error {
 		switch sync.Type {
 		case procMountPlease:
 			if mountRequest == nil {
 				return fmt.Errorf("cannot fulfil mount requests as a rootless user")
 			}
 			var m *configs.Mount
 			if sync.Arg == nil {
 				return fmt.Errorf("sync %q is missing an argument", sync.Type)
 			}
 			if err := json.Unmarshal(*sync.Arg, &m); err != nil {
 				return fmt.Errorf("sync %q passed invalid mount arg: %w", sync.Type, err)
 			}
 			mnt, err := mountRequest(m)
 			if err != nil {
 				return fmt.Errorf("failed to fulfil mount request: %w", err)
 			}
 			defer mnt.file.Close()
 			arg, err := json.Marshal(mnt)
 			if err != nil {
 				return fmt.Errorf("sync %q failed to marshal mountSource: %w", sync.Type, err)
 			}
 			argMsg := json.RawMessage(arg)
 			if err := doWriteSync(p.comm.syncSockParent, syncT{
 				Type: procMountFd,
 				Arg:  &argMsg,
 				File: mnt.file,
 			}); err != nil {
 				return err
 			}
 		case procSeccomp:
 			if p.config.Config.Seccomp.ListenerPath == "" {
 				return errors.New("seccomp listenerPath is not set")
--- a/libcontainer/rootfs_linux.go
+++ b/libcontainer/rootfs_linux.go
@@ -1,6 +1,7 @@
 package libcontainer
 import (
 	"encoding/json"
 	"errors"
 	"fmt"
 	"os"
@@ -13,16 +14,17 @@ import (
 	securejoin "github.com/cyphar/filepath-securejoin"
 	"github.com/moby/sys/mountinfo"
 	"github.com/mrunalp/fileutils"
 	"github.com/opencontainers/runtime-spec/specs-go"
 	"github.com/opencontainers/selinux/go-selinux/label"
 	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
 	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
 	"github.com/opencontainers/runc/libcontainer/configs"
 	"github.com/opencontainers/runc/libcontainer/devices"
 	"github.com/opencontainers/runc/libcontainer/userns"
 	"github.com/opencontainers/runc/libcontainer/utils"
 	"github.com/opencontainers/runtime-spec/specs-go"
 	"github.com/opencontainers/selinux/go-selinux/label"
 	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
 )
 // defaultMountFlags is the combination of MS_NOEXEC, MS_NOSUID and MS_NODEV.
 const defaultMountFlags = unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
@@ -39,12 +41,12 @@ type mountConfig struct {
 // mountEntry contains mount data specific to a mount point.
 type mountEntry struct {
 	*configs.Mount
-	srcFD *int
+	srcFile *mountSource
 }
 func (m *mountEntry) src() string {
-	if m.srcFD != nil {
+	if m.srcFile != nil {
-		return "/proc/self/fd/" + strconv.Itoa(*m.srcFD)
+		return "/proc/self/fd/" + strconv.Itoa(int(m.srcFile.file.Fd()))
 	}
 	return m.Source
 }
@@ -62,20 +64,12 @@ func needsSetupDev(config *configs.Config) bool {
 // prepareRootfs sets up the devices, mount points, and filesystems for use
 // inside a new mount namespace. It doesn't set anything as ro. You must call
 // finalizeRootfs after this function to finish setting up the rootfs.
-func prepareRootfs(pipe *syncSocket, iConfig *initConfig, mountFds mountFds) (err error) {
+func prepareRootfs(pipe *syncSocket, iConfig *initConfig) (err error) {
 	config := iConfig.Config
 	if err := prepareRoot(config); err != nil {
 		return fmt.Errorf("error preparing rootfs: %w", err)
 	}
 	if mountFds.sourceFds != nil && len(mountFds.sourceFds) != len(config.Mounts) {
 		return fmt.Errorf("malformed mountFds slice. Expected size: %v, got: %v", len(config.Mounts), len(mountFds.sourceFds))
 	}
 	if mountFds.idmapFds != nil && len(mountFds.idmapFds) != len(config.Mounts) {
 		return fmt.Errorf("malformed idmapFds slice: expected size: %v, got: %v", len(config.Mounts), len(mountFds.idmapFds))
 	}
 	mountConfig := &mountConfig{
 		root:            config.Rootfs,
 		label:           config.MountLabel,
@@ -83,22 +77,53 @@ func prepareRootfs(pipe *syncSocket, iConfig *initConfig, mountFds mountFds) (er
 		rootlessCgroups: iConfig.RootlessCgroups,
 		cgroupns:        config.Namespaces.Contains(configs.NEWCGROUP),
 	}
-	for i, m := range config.Mounts {
+	for _, m := range config.Mounts {
 		entry := mountEntry{Mount: m}
-		// Just before the loop we checked that if not empty, len(mountFds.sourceFds) == len(config.Mounts).
+		// Figure out whether we need to request runc to give us an
-		// Therefore, we can access mountFds.sourceFds[i] without any concerns.
+		// open_tree(2)-style mountfd. For idmapped mounts, this is always
-		if mountFds.sourceFds != nil && mountFds.sourceFds[i] != -1 {
+		// necessary. For bind-mounts, this is only necessary if we cannot
-			entry.srcFD = &mountFds.sourceFds[i]
+		// resolve the parent mount (this is only hit if you are running in a
 		// userns -- but for rootless the host-side thread can't help).
 		wantSourceFile := m.IsIDMapped()
 		if m.IsBind() && !config.RootlessEUID {
 			if _, err := os.Stat(m.Source); err != nil {
 				wantSourceFile = true
 			}
 		// We validated before we can access mountFds.idmapFds[i].
 		if mountFds.idmapFds != nil && mountFds.idmapFds[i] != -1 {
 			if entry.srcFD != nil {
 				return fmt.Errorf("malformed mountFds and idmapFds slice, entry: %v has fds in both slices", i)
 		}
-			entry.srcFD = &mountFds.idmapFds[i]
+		if wantSourceFile {
 			// Request a source file from the host.
 			if err := writeSyncArg(pipe, procMountPlease, m); err != nil {
 				return fmt.Errorf("failed to request mountfd for %q: %w", m.Source, err)
 			}
 			sync, err := readSyncFull(pipe, procMountFd)
 			if err != nil {
 				return fmt.Errorf("mountfd request for %q failed: %w", m.Source, err)
 			}
 			if sync.File == nil {
 				return fmt.Errorf("mountfd request for %q: response missing attached fd", m.Source)
 			}
 			defer sync.File.Close()
 			// Sanity-check to make sure we didn't get the wrong fd back. Note
 			// that while m.Source might contain symlinks, the (*os.File).Name
 			// is based on the path provided to os.OpenFile, not what it
 			// resolves to. So this should never happen.
 			if sync.File.Name() != m.Source {
 				return fmt.Errorf("returned mountfd for %q doesn't match requested mount configuration: mountfd path is %q", m.Source, sync.File.Name())
 			}
 			// Unmarshal the procMountFd argument (the file is sync.File).
 			var src *mountSource
 			if sync.Arg == nil {
 				return fmt.Errorf("sync %q is missing an argument", sync.Type)
 			}
 			if err := json.Unmarshal(*sync.Arg, &src); err != nil {
 				return fmt.Errorf("invalid mount fd response argument %q: %w", string(*sync.Arg), err)
 			}
 			if src == nil {
 				return fmt.Errorf("mountfd request for %q: no mount source info received", m.Source)
 			}
 			src.file = sync.File
 			entry.srcFile = src
 		}
 		if err := mountToRootfs(mountConfig, entry); err != nil {
 			return fmt.Errorf("error mounting %q to rootfs at %q: %w", m.Source, m.Destination, err)
 		}
@@ -281,7 +306,7 @@ func mountCgroupV1(m *configs.Mount, c *mountConfig) error {
 			if err := os.MkdirAll(subsystemPath, 0o755); err != nil {
 				return err
 			}
-			if err := utils.WithProcfd(c.root, b.Destination, func(dstFD string) error {
+			if err := utils.WithProcfd(c.root, b.Destination, func(dstFd string) error {
 				flags := defaultMountFlags
 				if m.Flags&unix.MS_RDONLY != 0 {
 					flags = flags | unix.MS_RDONLY
@@ -294,7 +319,7 @@ func mountCgroupV1(m *configs.Mount, c *mountConfig) error {
 					data = cgroups.CgroupNamePrefix + data
 					source = "systemd"
 				}
-				return mountViaFDs(source, nil, b.Destination, dstFD, "cgroup", uintptr(flags), data)
+				return mountViaFds(source, nil, b.Destination, dstFd, "cgroup", uintptr(flags), data)
 			}); err != nil {
 				return err
 			}
@@ -325,8 +350,8 @@ func mountCgroupV2(m *configs.Mount, c *mountConfig) error {
 	if err := os.MkdirAll(dest, 0o755); err != nil {
 		return err
 	}
-	err = utils.WithProcfd(c.root, m.Destination, func(dstFD string) error {
+	err = utils.WithProcfd(c.root, m.Destination, func(dstFd string) error {
-		return mountViaFDs(m.Source, nil, m.Destination, dstFD, "cgroup2", uintptr(m.Flags), m.Data)
+		return mountViaFds(m.Source, nil, m.Destination, dstFd, "cgroup2", uintptr(m.Flags), m.Data)
 	})
 	if err == nil || !(errors.Is(err, unix.EPERM) || errors.Is(err, unix.EBUSY)) {
 		return err
@@ -347,7 +372,6 @@ func mountCgroupV2(m *configs.Mount, c *mountConfig) error {
 		bindM.Source = c.cgroup2Path
 	}
 	// mountToRootfs() handles remounting for MS_RDONLY.
 	// No need to set mountEntry.srcFD here, because mountToRootfs() calls utils.WithProcfd() by itself in mountPropagate().
 	err = mountToRootfs(c, mountEntry{Mount: bindM})
 	if c.rootlessCgroups && errors.Is(err, unix.ENOENT) {
 		// ENOENT (for `src = c.cgroup2Path`) happens when rootless runc is being executed
@@ -392,15 +416,15 @@ func doTmpfsCopyUp(m mountEntry, rootfs, mountLabel string) (Err error) {
 		}
 	}()
-	return utils.WithProcfd(rootfs, m.Destination, func(dstFD string) (Err error) {
+	return utils.WithProcfd(rootfs, m.Destination, func(dstFd string) (Err error) {
 		// Copy the container data to the host tmpdir. We append "/" to force
 		// CopyDirectory to resolve the symlink rather than trying to copy the
 		// symlink itself.
-		if err := fileutils.CopyDirectory(dstFD+"/", tmpDir); err != nil {
+		if err := fileutils.CopyDirectory(dstFd+"/", tmpDir); err != nil {
-			return fmt.Errorf("tmpcopyup: failed to copy %s to %s (%s): %w", m.Destination, dstFD, tmpDir, err)
+			return fmt.Errorf("tmpcopyup: failed to copy %s to %s (%s): %w", m.Destination, dstFd, tmpDir, err)
 		}
 		// Now move the mount into the container.
-		if err := mountViaFDs(tmpDir, nil, m.Destination, dstFD, "", unix.MS_MOVE, ""); err != nil {
+		if err := mountViaFds(tmpDir, nil, m.Destination, dstFd, "", unix.MS_MOVE, ""); err != nil {
 			return fmt.Errorf("tmpcopyup: failed to move mount: %w", err)
 		}
 		return nil
@@ -522,36 +546,10 @@ func mountToRootfs(c *mountConfig, m mountEntry) error {
 		if err := prepareBindMount(m, rootfs); err != nil {
 			return err
 		}
-
+		// open_tree()-related shenanigans are all handled in mountViaFds.
 		if m.IsBind() && m.IsIDMapped() {
 			if m.srcFD == nil {
 				return fmt.Errorf("error creating mount %+v: idmapFD is invalid, should point to a valid fd", m)
 			}
 			if err := unix.MoveMount(*m.srcFD, "", unix.AT_FDCWD, dest, unix.MOVE_MOUNT_F_EMPTY_PATH); err != nil {
 				return fmt.Errorf("error on unix.MoveMount %+v: %w", m, err)
 			}
 			// In nsexec.c, we did not set the propagation field of mount_attr struct.
 			// So, let's deal with these flags right now!
 			if err := utils.WithProcfd(rootfs, dest, func(dstFD string) error {
 				for _, pflag := range m.PropagationFlags {
 					// When using mount for setting propagations flags, the source, file
 					// system type and data arguments are ignored:
 					// https://man7.org/linux/man-pages/man2/mount.2.html
 					// We also ignore procfd because we want to act on dest.
 					if err := mountViaFDs("", nil, dest, dstFD, "", uintptr(pflag), ""); err != nil {
 						return err
 					}
 				}
 				return nil
 			}); err != nil {
 				return fmt.Errorf("change mount propagation through procfd: %w", err)
 			}
 		} else {
 		if err := mountPropagate(m, rootfs, mountLabel); err != nil {
 			return err
 		}
 		}
 		// The initial MS_BIND won't change the mount options, we need to do a
 		// separate MS_BIND|MS_REMOUNT to apply the mount options. We skip
@@ -563,7 +561,7 @@ func mountToRootfs(c *mountConfig, m mountEntry) error {
 		// contrast to mount(8)'s current behaviour, but is what users probably
 		// expect. See <https://github.com/util-linux/util-linux/issues/2433>.
 		if m.Flags & ^(unix.MS_BIND|unix.MS_REC|unix.MS_REMOUNT) != 0 || m.ClearedFlags != 0 {
-			if err := utils.WithProcfd(rootfs, m.Destination, func(dstFD string) error {
+			if err := utils.WithProcfd(rootfs, m.Destination, func(dstFd string) error {
 				flags := m.Flags | unix.MS_BIND | unix.MS_REMOUNT
 				// The runtime-spec says we SHOULD map to the relevant mount(8)
 				// behaviour. However, it's not clear whether we want the
@@ -590,7 +588,7 @@ func mountToRootfs(c *mountConfig, m mountEntry) error {
 				// different set of flags. This also has the mount(8) bug where
 				// "nodiratime,norelatime" will result in a
 				// "nodiratime,relatime" mount.
-				mountErr := mountViaFDs("", nil, m.Destination, dstFD, "", uintptr(flags), "")
+				mountErr := mountViaFds("", nil, m.Destination, dstFd, "", uintptr(flags), "")
 				if mountErr == nil {
 					return nil
 				}
@@ -639,7 +637,7 @@ func mountToRootfs(c *mountConfig, m mountEntry) error {
 				// Retry the mount with the existing lockable mount flags
 				// applied.
 				flags |= srcFlags & mntLockFlags
-				mountErr = mountViaFDs("", nil, m.Destination, dstFD, "", uintptr(flags), "")
+				mountErr = mountViaFds("", nil, m.Destination, dstFd, "", uintptr(flags), "")
 				logrus.Debugf("remount retry: srcFlags=0x%x flagsSet=0x%x flagsClr=0x%x: %v", srcFlags, m.Flags, m.ClearedFlags, mountErr)
 				return mountErr
 			}); err != nil {
@@ -857,8 +855,8 @@ func bindMountDeviceNode(rootfs, dest string, node *devices.Device) error {
 	if f != nil {
 		_ = f.Close()
 	}
-	return utils.WithProcfd(rootfs, dest, func(dstFD string) error {
+	return utils.WithProcfd(rootfs, dest, func(dstFd string) error {
-		return mountViaFDs(node.Path, nil, dest, dstFD, "bind", unix.MS_BIND, "")
+		return mountViaFds(node.Path, nil, dest, dstFd, "bind", unix.MS_BIND, "")
 	})
 }
@@ -1251,17 +1249,17 @@ func mountPropagate(m mountEntry, rootfs string, mountLabel string) error {
 	// mutating underneath us, we verify that we are actually going to mount
 	// inside the container with WithProcfd() -- mounting through a procfd
 	// mounts on the target.
-	if err := utils.WithProcfd(rootfs, m.Destination, func(dstFD string) error {
+	if err := utils.WithProcfd(rootfs, m.Destination, func(dstFd string) error {
-		return mountViaFDs(m.Source, m.srcFD, m.Destination, dstFD, m.Device, uintptr(flags), data)
+		return mountViaFds(m.Source, m.srcFile, m.Destination, dstFd, m.Device, uintptr(flags), data)
 	}); err != nil {
 		return err
 	}
 	// We have to apply mount propagation flags in a separate WithProcfd() call
 	// because the previous call invalidates the passed procfd -- the mount
 	// target needs to be re-opened.
-	if err := utils.WithProcfd(rootfs, m.Destination, func(dstFD string) error {
+	if err := utils.WithProcfd(rootfs, m.Destination, func(dstFd string) error {
 		for _, pflag := range m.PropagationFlags {
-			if err := mountViaFDs("", nil, m.Destination, dstFD, "", uintptr(pflag), ""); err != nil {
+			if err := mountViaFds("", nil, m.Destination, dstFd, "", uintptr(pflag), ""); err != nil {
 				return err
 			}
 		}
--- a/libcontainer/standard_init_linux.go
+++ b/libcontainer/standard_init_linux.go
@@ -27,7 +27,6 @@ type linuxStandardInit struct {
 	fifoFd        int
 	logFd         int
 	dmzExe        *os.File
 	mountFds      mountFds
 	config        *initConfig
 }
@@ -88,17 +87,7 @@ func (l *linuxStandardInit) Init() error {
 	// initialises the labeling system
 	selinux.GetEnabled()
-	// We don't need the mount nor idmap fds after prepareRootfs() nor if it fails.
+	err := prepareRootfs(l.pipe, l.config)
 	err := prepareRootfs(l.pipe, l.config, l.mountFds)
 	for _, m := range append(l.mountFds.sourceFds, l.mountFds.idmapFds...) {
 		if m == -1 {
 			continue
 		}
 		if err := unix.Close(m); err != nil {
 			return fmt.Errorf("unable to close mountFds fds: %w", err)
 		}
 	}
 	if err != nil {
 		return err
 	}
--- a/libcontainer/sync.go
+++ b/libcontainer/sync.go
@@ -21,6 +21,11 @@ type syncType string
 //
 //	     [  child  ] <-> [   parent   ]
 //
 //	procMountPlease      --> [open(2) or open_tree(2) and configure mount]
 //	  Arg: configs.Mount
 //	                     <-- procMountFd
 //	                           file: mountfd
 //
 //	procSeccomp         --> [forward fd to listenerPath]
 //	  file: seccomp fd
 //	                    --- no return synchronisation
@@ -39,6 +44,8 @@ const (
 	procRun         syncType = "procRun"
 	procHooks       syncType = "procHooks"
 	procHooksDone   syncType = "procHooksDone"
 	procMountPlease syncType = "procMountPlease"
 	procMountFd     syncType = "procMountFd"
 	procSeccomp     syncType = "procSeccomp"
 	procSeccompDone syncType = "procSeccompDone"
 )
--- a/libcontainer/userns/usernsfd_linux.go
+++ b/libcontainer/userns/usernsfd_linux.go
@@ -0,0 +1,153 @@
 package userns
 import (
 	"fmt"
 	"os"
 	"sort"
 	"strings"
 	"sync"
 	"syscall"
 	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
 	"github.com/opencontainers/runc/libcontainer/configs"
 )
 // Mapping is a set of uid and gid mappings describing a user namespace.
 type Mapping struct {
 	// UIDMappings are the user ID mappings of the namespace.
 	UIDMappings []configs.IDMap
 	// GIDMappings are the group ID mappings of the namespace.
 	GIDMappings []configs.IDMap
 }
 // toSys converts the uid and gid mappings into the syscall.SysProcIDMap form
 // used by syscall.SysProcAttr. An empty list of mappings is converted to a
 // nil slice.
 func (m Mapping) toSys() (uids, gids []syscall.SysProcIDMap) {
 	convert := func(idmaps []configs.IDMap) []syscall.SysProcIDMap {
 		// Deliberately not pre-allocated: the result must stay nil when
 		// there are no mappings to convert.
 		var sysMaps []syscall.SysProcIDMap
 		for _, idmap := range idmaps {
 			sysMaps = append(sysMaps, syscall.SysProcIDMap{
 				ContainerID: idmap.ContainerID,
 				HostID:      idmap.HostID,
 				Size:        idmap.Size,
 			})
 		}
 		return sysMaps
 	}
 	return convert(m.UIDMappings), convert(m.GIDMappings)
 }
 // id computes the key under which this mapping is stored in the set of
 // userns handles. Each entry is rendered as "container:host:size" and the
 // entries of each list are sorted, so two mappings that only differ in the
 // order of their uid or gid entries (which is irrelevant to the kernel) get
 // the same identifier.
 func (m Mapping) id() string {
 	render := func(idmaps []configs.IDMap) string {
 		var entries []string
 		for _, entry := range idmaps {
 			entries = append(entries, fmt.Sprintf("%d:%d:%d", entry.ContainerID, entry.HostID, entry.Size))
 		}
 		// The original order carries no meaning -- normalise it.
 		sort.Strings(entries)
 		return strings.Join(entries, ",")
 	}
 	return "uid=" + render(m.UIDMappings) + ";gid=" + render(m.GIDMappings)
 }
 // Handles is a cache of user namespace files, indexed by the identifier of
 // the Mapping they were created for. The zero value is ready to use.
 type Handles struct {
 	// m guards maps.
 	m    sync.Mutex
 	// maps holds the cached userns files, keyed by Mapping.id(). It is
 	// allocated lazily by Get() and reset to nil by Release().
 	maps map[string]*os.File
 }
 // Release closes every cached userns file and empties the cache. Files which
 // were already returned from Get() keep working after Release() is called,
 // and the same Handles can be re-used afterwards.
 func (hs *Handles) Release() {
 	hs.m.Lock()
 	defer hs.m.Unlock()
 	cached := hs.maps
 	hs.maps = nil
 	for _, nsFile := range cached {
 		// Closing is just for good measure, the GC would do it for us anyway.
 		_ = nsFile.Close()
 	}
 }
 // spawnProc starts a dummy child process inside a new user namespace that is
 // configured with the requested mappings, and returns a handle to it. The
 // caller is responsible for killing and reaping the returned process.
 func spawnProc(req Mapping) (*os.Process, error) {
 	// Getting a userns with the requested mappings requires a subprocess,
 	// which is unfortunately quite expensive. Doing it natively with Go (and
 	// running something like "sleep infinity") is the "safe" way, but the
 	// execve() is wasted work: all we need is some process with the right
 	// mapping, what it executes is irrelevant. A clone() behind the back of Go
 	// that only does kill(getpid(), SIGSTOP) is the "unsafe" alternative and
 	// is probably okay in theory. Asking Go to put the child into
 	// PTRACE_TRACEME mode gives us the best of both: no exec is needed and we
 	// don't have to faff around with the mappings ourselves.
 	//
 	// Go's stdlib has no newuidmap support. For id-mapped mounts it seems
 	// incredibly unlikely that an unprivileged user will ask us for a
 	// remapping using mappings they have privileges over.
 	logrus.Debugf("spawning dummy process for id-mapping %s", req.id())
 	uids, gids := req.toSys()
 	sysAttr := &syscall.SysProcAttr{
 		Cloneflags:                 unix.CLONE_NEWUSER,
 		UidMappings:                uids,
 		GidMappings:                gids,
 		GidMappingsEnableSetgroups: false,
 		// PTRACE_TRACEME mode lets us grab the userns without needing a
 		// proper execve() target.
 		Ptrace: true,
 	}
 	procAttr := &os.ProcAttr{Sys: sysAttr}
 	return os.StartProcess("/proc/self/exe", []string{"runc", "--help"}, procAttr)
 }
 // dupFile returns a new *os.File backed by a duplicate of f's file
 // descriptor, created with F_DUPFD_CLOEXEC. The new file carries the same
 // name as f.
 func dupFile(f *os.File) (*os.File, error) {
 	srcFd := f.Fd()
 	dupFd, err := unix.FcntlInt(srcFd, unix.F_DUPFD_CLOEXEC, 0)
 	if err != nil {
 		return nil, os.NewSyscallError("fcntl(F_DUPFD_CLOEXEC)", err)
 	}
 	dup := os.NewFile(uintptr(dupFd), f.Name())
 	return dup, nil
 }
 // Get returns a handle to a /proc/$pid/ns/user nsfs file for a user namespace
 // with the requested mapping. The userns files are cached, so asking for an
 // equivalent mapping again yields the same user namespace without spawning
 // another helper process. Every call returns a fresh duplicate of the cached
 // file, and the caller is responsible for closing it.
 func (hs *Handles) Get(req Mapping) (file *os.File, err error) {
 	hs.m.Lock()
 	defer hs.m.Unlock()
 	if hs.maps == nil {
 		hs.maps = make(map[string]*os.File)
 	}
 	key := req.id()
 	// Fast path: a userns with this mapping already exists. The file is
 	// duplicated so that the lifecycle of each returned *os.File is
 	// independent of the cache.
 	if cached, found := hs.maps[key]; found {
 		return dupFile(cached)
 	}
 	helper, err := spawnProc(req)
 	if err != nil {
 		return nil, fmt.Errorf("failed to spawn dummy process for map %s: %w", key, err)
 	}
 	// The helper is only needed until its userns file has been opened, so
 	// always kill and reap it on the way out. Errors are ignored because
 	// there is not much we could do about them anyway.
 	defer func() {
 		_ = helper.Kill()
 		_, _ = helper.Wait()
 	}()
 	// Keep a handle to the userns file instead of keeping the process alive:
 	// Go's GC copes with files much better than with leaked processes, and
 	// long-living useless processes seem less than ideal.
 	nsFile, err := os.Open(fmt.Sprintf("/proc/%d/ns/user", helper.Pid))
 	if err != nil {
 		return nil, err
 	}
 	hs.maps[key] = nsFile
 	return dupFile(nsFile)
 }
--- a/libcontainer/userns/usernsfd_linux_test.go
+++ b/libcontainer/userns/usernsfd_linux_test.go
@@ -0,0 +1,52 @@
 package userns
 import (
 	"os"
 	"testing"
 	"github.com/opencontainers/runc/libcontainer/configs"
 )
 func BenchmarkSpawnProc(b *testing.B) {
 	if os.Geteuid() != 0 {
 		b.Skip("spawning user namespaced processes requires root")
 	}
 	// We can reuse the mapping as we call spawnProc() directly.
 	mapping := Mapping{
 		UIDMappings: []configs.IDMap{
 			{ContainerID: 0, HostID: 1337, Size: 142},
 			{ContainerID: 150, HostID: 0, Size: 1},
 			{ContainerID: 442, HostID: 1111, Size: 12},
 			{ContainerID: 1000, HostID: 9999, Size: 92},
 			{ContainerID: 9999, HostID: 1000000, Size: 4},
 			// Newer kernels support more than 5 entries, but stick to 5 here.
 		},
 		GIDMappings: []configs.IDMap{
 			{ContainerID: 1, HostID: 2337, Size: 142},
 			{ContainerID: 145, HostID: 6, Size: 1},
 			{ContainerID: 200, HostID: 1000, Size: 12},
 			{ContainerID: 1000, HostID: 9888, Size: 92},
 			{ContainerID: 8999, HostID: 1000000, Size: 4},
 			// Newer kernels support more than 5 entries, but stick to 5 here.
 		},
 	}
 	procs := make([]*os.Process, 0, b.N)
 	b.Cleanup(func() {
 		for _, proc := range procs {
 			if proc != nil {
 				_ = proc.Kill()
 				_, _ = proc.Wait()
 			}
 		}
 	})
 	for i := 0; i < b.N; i++ {
 		proc, err := spawnProc(mapping)
 		if err != nil {
 			b.Error(err)
 		}
 		procs = append(procs, proc)
 	}
 }