1
0
mirror of https://github.com/opencontainers/runc.git synced 2025-11-09 13:00:56 +03:00

libcontainer: remove all mount logic from nsexec

With open_tree(OPEN_TREE_CLONE), it is possible to implement both the
id-mapped mounts and bind-mount source file descriptor logic entirely in
Go without requiring any complicated handling from nsexec.

However, implementing it the naive way (do the OPEN_TREE_CLONE in the
host namespace before the rootfs is set up -- which is what the existing
implementation did) exposes issues in how mount ordering (in particular
when handling mount sources from inside the container rootfs, but also
in relation to mount propagation) was handled for idmapped mounts and
bind-mount sources. In order to solve this problem completely, it is
necessary to spawn a thread which joins the container mount namespace
and provides mountfds when requested by the rootfs setup code (ensuring
that the mount order and mount propagation of the source of the
bind-mount are handled correctly). While the need to join the mount
namespace leads to other complications (such as with the usage of
/proc/self -- fixed in a later patch), the resulting code is still
reasonable and is the only real way to solve the issue.

This allows us to reduce the amount of C code we have in nsexec, as well
as simplifying a whole host of places that were made more complicated
with the addition of id-mapped mounts and the bind sourcefd logic.
Because we join the container namespace, we can continue to use regular
O_PATH file descriptors for non-id-mapped bind-mount sources (which
means we don't have to raise the kernel requirement for that case).

In addition, we can easily add support for id-mappings that don't match
the container's user namespace. The approach taken here is to use Go's
officially supported mechanism for spawning a process in a user
namespace, but (ab)use PTRACE_TRACEME to avoid actually having to exec a
different process. The most efficient way to implement this would be to
do clone() in cgo directly to run a function that just does
kill(getpid(), SIGSTOP) -- we can always switch to that if it turns out
this approach is too slow. It should be noted that the included
micro-benchmark seems to indicate this is Fast Enough(TM):

  goos: linux
  goarch: amd64
  pkg: github.com/opencontainers/runc/libcontainer/userns
  cpu: Intel(R) Core(TM) i5-10210U CPU @ 1.60GHz
  BenchmarkSpawnProc
  BenchmarkSpawnProc-8        1670            770065 ns/op

Fixes: fda12ab101 ("Support idmap mounts on volumes")
Fixes: 9c444070ec ("Open bind mount sources from the host userns")
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
This commit is contained in:
Aleksa Sarai
2023-08-01 20:07:49 +10:00
parent 99f7fa1413
commit ba0b5e2698
14 changed files with 618 additions and 688 deletions

View File

@@ -2,7 +2,6 @@ package libcontainer
import ( import (
"bytes" "bytes"
"encoding/json"
"errors" "errors"
"fmt" "fmt"
"io" "io"
@@ -629,112 +628,6 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
return c.newSetnsProcess(p, cmd, comm) return c.newSetnsProcess(p, cmd, comm)
} }
// shouldSendMountSources reports whether the child process has to set up its
// bind mounts from sources that were pre-opened (O_PATH) in the host user
// namespace.
// See https://github.com/opencontainers/runc/issues/2484
func (c *Container) shouldSendMountSources() bool {
	switch {
	case !c.config.Namespaces.Contains(configs.NEWUSER),
		!c.config.Namespaces.Contains(configs.NEWNS):
		// Handing over mount sources via SCM_RIGHTS only matters when both
		// a user namespace and a mount namespace are in use.
		return false
	case c.config.RootlessEUID:
		// nsexec.c send_mountsources() needs the setns(mntns) capabilities
		// CAP_SYS_CHROOT and CAP_SYS_ADMIN.
		return false
	}

	// Sources are only needed when at least one bind-mount is not id-mapped.
	for i := range c.config.Mounts {
		if m := c.config.Mounts[i]; m.IsBind() && !m.IsIDMapped() {
			return true
		}
	}
	return false
}
// shouldSendIdmapSources reports whether the child process has to set up its
// idmap mounts from sources on which mount_setattr was already performed in
// the host user namespace.
func (c *Container) shouldSendIdmapSources() bool {
	// nsexec.c mount_setattr() requires CAP_SYS_ADMIN in:
	// * the user namespace the filesystem was mounted in;
	// * the user namespace we're trying to idmap the mount to;
	// * the owning user namespace of the mount namespace you're currently located in.
	//
	// See the comment from Christian Brauner:
	// https://github.com/opencontainers/runc/pull/3717#discussion_r1103607972
	//
	// Rootless is ruled out because those permissions are not available
	// there, and for the time being a user namespace must be in use.
	if c.config.RootlessEUID || !c.config.Namespaces.Contains(configs.NEWUSER) {
		return false
	}

	// Sources are only needed when at least one bind-mount is id-mapped.
	for i := range c.config.Mounts {
		if m := c.config.Mounts[i]; m.IsBind() && m.IsIDMapped() {
			return true
		}
	}
	return false
}
// sendMountSources reserves fds for the non-id-mapped bind-mount sources and
// advertises them through _LIBCONTAINER_MOUNT_FDS. It is a no-op when
// shouldSendMountSources reports false.
func (c *Container) sendMountSources(cmd *exec.Cmd, comm *processComm) error {
	if c.shouldSendMountSources() {
		isPlainBind := func(m *configs.Mount) bool {
			return m.IsBind() && !m.IsIDMapped()
		}
		return c.sendFdsSources(cmd, comm, "_LIBCONTAINER_MOUNT_FDS", isPlainBind)
	}
	return nil
}
// sendIdmapSources reserves fds for the id-mapped bind-mount sources and
// advertises them through _LIBCONTAINER_IDMAP_FDS. It is a no-op when
// shouldSendIdmapSources reports false.
func (c *Container) sendIdmapSources(cmd *exec.Cmd, comm *processComm) error {
	if c.shouldSendIdmapSources() {
		isIdmapBind := func(m *configs.Mount) bool {
			return m.IsBind() && m.IsIDMapped()
		}
		return c.sendFdsSources(cmd, comm, "_LIBCONTAINER_IDMAP_FDS", isIdmapBind)
	}
	return nil
}
// sendFdsSources reserves one entry in cmd.ExtraFiles for every mount accepted
// by condition and exports the resulting fd numbers, JSON-encoded, in the
// envVar environment variable of cmd.
//
// The exported list is paired with the mounts (see StartInitialization() and
// prepareRootfs()), so it MUST have exactly one element per entry of
// c.config.Mounts; mounts rejected by condition get -1, which is ignored later.
func (c *Container) sendFdsSources(cmd *exec.Cmd, comm *processComm, envVar string, condition func(*configs.Mount) bool) error {
	mounts := c.config.Mounts
	fds := make([]int, 0, len(mounts))
	for _, m := range mounts {
		if condition(m) {
			// The fd passed here will not be used: nsexec.c will overwrite
			// it with dup3(). We only need to allocate an fd so that its
			// number can be put in the environment variable. The fd must
			// not be closed before cmd.Start(), so initSockChild is reused
			// because the lifecycle of that fd is already taken care of.
			cmd.ExtraFiles = append(cmd.ExtraFiles, comm.initSockChild)
			fds = append(fds, stdioFdCount+len(cmd.ExtraFiles)-1)
		} else {
			fds = append(fds, -1)
		}
	}

	encoded, err := json.Marshal(fds)
	if err != nil {
		return fmt.Errorf("Error creating %v: %w", envVar, err)
	}
	cmd.Env = append(cmd.Env, envVar+"="+string(encoded))
	return nil
}
func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, comm *processComm) (*initProcess, error) { func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, comm *processComm) (*initProcess, error) {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard)) cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
nsMaps := make(map[configs.NamespaceType]string) nsMaps := make(map[configs.NamespaceType]string)
@@ -743,16 +636,10 @@ func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, comm *processComm)
nsMaps[ns.Type] = ns.Path nsMaps[ns.Type] = ns.Path
} }
} }
data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, initStandard) data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps)
if err != nil { if err != nil {
return nil, err return nil, err
} }
if err := c.sendMountSources(cmd, comm); err != nil {
return nil, err
}
if err := c.sendIdmapSources(cmd, comm); err != nil {
return nil, err
}
init := &initProcess{ init := &initProcess{
cmd: cmd, cmd: cmd,
@@ -776,7 +663,7 @@ func (c *Container) newSetnsProcess(p *Process, cmd *exec.Cmd, comm *processComm
} }
// for setns process, we don't have to set cloneflags as the process namespaces // for setns process, we don't have to set cloneflags as the process namespaces
// will only be set via setns syscall // will only be set via setns syscall
data, err := c.bootstrapData(0, state.NamespacePaths, initSetns) data, err := c.bootstrapData(0, state.NamespacePaths)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@@ -1165,7 +1052,7 @@ type netlinkError struct{ error }
// such as one that uses nsenter package to bootstrap the container's // such as one that uses nsenter package to bootstrap the container's
// init process correctly, i.e. with correct namespaces, uid/gid // init process correctly, i.e. with correct namespaces, uid/gid
// mapping etc. // mapping etc.
func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string, it initType) (_ io.Reader, Err error) { func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) (_ io.Reader, Err error) {
// create the netlink message // create the netlink message
r := nl.NewNetlinkRequest(int(InitMsg), 0) r := nl.NewNetlinkRequest(int(InitMsg), 0)
@@ -1267,48 +1154,6 @@ func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Namespa
Value: c.config.RootlessEUID, Value: c.config.RootlessEUID,
}) })
// Bind mount source to open.
if it == initStandard && c.shouldSendMountSources() {
var mounts []byte
for _, m := range c.config.Mounts {
if m.IsBind() && !m.IsIDMapped() {
if strings.IndexByte(m.Source, 0) >= 0 {
return nil, fmt.Errorf("mount source string contains null byte: %q", m.Source)
}
mounts = append(mounts, []byte(m.Source)...)
}
mounts = append(mounts, byte(0))
}
r.AddData(&Bytemsg{
Type: MountSourcesAttr,
Value: mounts,
})
}
// Idmap mount sources to open.
if it == initStandard && c.shouldSendIdmapSources() {
var mounts []byte
for _, m := range c.config.Mounts {
if m.IsBind() && m.IsIDMapped() {
// While other parts of the code check this too (like
// libcontainer/specconv/spec_linux.go) we do it here also because some libcontainer
// users don't use those functions.
if strings.IndexByte(m.Source, 0) >= 0 {
return nil, fmt.Errorf("mount source string contains null byte: %q", m.Source)
}
mounts = append(mounts, []byte(m.Source)...)
}
mounts = append(mounts, byte(0))
}
r.AddData(&Bytemsg{
Type: IdmapSourcesAttr,
Value: mounts,
})
}
// write boottime and monotonic time ns offsets. // write boottime and monotonic time ns offsets.
if c.config.TimeOffsets != nil { if c.config.TimeOffsets != nil {
var offsetSpec bytes.Buffer var offsetSpec bytes.Buffer

View File

@@ -618,8 +618,8 @@ func (c *Container) prepareCriuRestoreMounts(mounts []*configs.Mount) error {
// because during initial container creation mounts are // because during initial container creation mounts are
// set up in the order they are configured. // set up in the order they are configured.
if m.Device == "bind" { if m.Device == "bind" {
if err := utils.WithProcfd(c.config.Rootfs, m.Destination, func(dstFD string) error { if err := utils.WithProcfd(c.config.Rootfs, m.Destination, func(dstFd string) error {
return mountViaFDs(m.Source, nil, m.Destination, dstFD, "", unix.MS_BIND|unix.MS_REC, "") return mountViaFds(m.Source, nil, m.Destination, dstFd, "", unix.MS_BIND|unix.MS_REC, "")
}); err != nil { }); err != nil {
return err return err
} }

View File

@@ -214,18 +214,3 @@ func validateID(id string) error {
return nil return nil
} }
// parseFdsFromEnv decodes the JSON array of file descriptor numbers stored in
// the environment variable envVar. An unset or empty variable yields a nil
// slice and no error; a value that is not valid JSON for []int yields an error.
func parseFdsFromEnv(envVar string) ([]int, error) {
	raw := os.Getenv(envVar)
	if len(raw) == 0 {
		// No fds were passed: always hand back the nil slice.
		return nil, nil
	}

	var parsed []int
	err := json.Unmarshal([]byte(raw), &parsed)
	if err != nil {
		return nil, fmt.Errorf("Error unmarshalling %v: %w", envVar, err)
	}
	return parsed, nil
}

View File

@@ -47,18 +47,6 @@ type network struct {
TempVethPeerName string `json:"temp_veth_peer_name"` TempVethPeerName string `json:"temp_veth_peer_name"`
} }
// mountFds groups the per-mount file descriptors handed to the init process.
type mountFds struct {
// sourceFds are the fds to use as the source when mounting.
// The slice is paired element-by-element with the container mounts, so
// its size should be the same as the number of container mounts.
// The value -1 is used when no fd is needed for the mount.
// For any given index, at most one of the slices in this struct may hold
// a valid fd: only one of these fds is used for any single mount.
sourceFds []int
// idmapFds has the same layout as sourceFds, but holds fds of already
// created idmap mounts, to be used with unix.MoveMount().
idmapFds []int
}
// initConfig is used for transferring parameters from Exec() to Init() // initConfig is used for transferring parameters from Exec() to Init()
type initConfig struct { type initConfig struct {
Args []string `json:"args"` Args []string `json:"args"`
@@ -189,18 +177,6 @@ func startInitialization() (retErr error) {
defer pidfdSocket.Close() defer pidfdSocket.Close()
} }
// Get mount files (O_PATH).
mountSrcFds, err := parseFdsFromEnv("_LIBCONTAINER_MOUNT_FDS")
if err != nil {
return err
}
// Get idmap fds.
idmapFds, err := parseFdsFromEnv("_LIBCONTAINER_IDMAP_FDS")
if err != nil {
return err
}
// Get runc-dmz fds. // Get runc-dmz fds.
var dmzExe *os.File var dmzExe *os.File
if dmzFdStr := os.Getenv("_LIBCONTAINER_DMZEXEFD"); dmzFdStr != "" { if dmzFdStr := os.Getenv("_LIBCONTAINER_DMZEXEFD"); dmzFdStr != "" {
@@ -232,21 +208,16 @@ func startInitialization() (retErr error) {
} }
// If init succeeds, it will not return, hence none of the defers will be called. // If init succeeds, it will not return, hence none of the defers will be called.
return containerInit(it, &config, syncPipe, consoleSocket, pidfdSocket, fifofd, logFD, dmzExe, mountFds{sourceFds: mountSrcFds, idmapFds: idmapFds}) return containerInit(it, &config, syncPipe, consoleSocket, pidfdSocket, fifofd, logFD, dmzExe)
} }
func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSocket, pidfdSocket *os.File, fifoFd, logFd int, dmzExe *os.File, mountFds mountFds) error { func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSocket, pidfdSocket *os.File, fifoFd, logFd int, dmzExe *os.File) error {
if err := populateProcessEnvironment(config.Env); err != nil { if err := populateProcessEnvironment(config.Env); err != nil {
return err return err
} }
switch t { switch t {
case initSetns: case initSetns:
// mount and idmap fds must be nil in this case. We don't mount while doing runc exec.
if mountFds.sourceFds != nil || mountFds.idmapFds != nil {
return errors.New("mount and idmap fds must be nil; can't mount from exec")
}
i := &linuxSetnsInit{ i := &linuxSetnsInit{
pipe: pipe, pipe: pipe,
consoleSocket: consoleSocket, consoleSocket: consoleSocket,
@@ -266,7 +237,6 @@ func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSock
fifoFd: fifoFd, fifoFd: fifoFd,
logFd: logFd, logFd: logFd,
dmzExe: dmzExe, dmzExe: dmzExe,
mountFds: mountFds,
} }
return i.Init() return i.Init()
} }

View File

@@ -21,9 +21,7 @@ const (
RootlessEUIDAttr uint16 = 27287 RootlessEUIDAttr uint16 = 27287
UidmapPathAttr uint16 = 27288 UidmapPathAttr uint16 = 27288
GidmapPathAttr uint16 = 27289 GidmapPathAttr uint16 = 27289
MountSourcesAttr uint16 = 27290 TimeOffsetsAttr uint16 = 27290
IdmapSourcesAttr uint16 = 27291
TimeOffsetsAttr uint16 = 27292
) )
type Int32msg struct { type Int32msg struct {

View File

@@ -1,19 +1,44 @@
package libcontainer package libcontainer
import ( import (
"errors"
"fmt"
"io/fs" "io/fs"
"os"
"strconv" "strconv"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix" "golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/userns"
) )
// mountSourceType indicates what type of file descriptor is being returned. It
// is used to tell rootfs_linux.go whether or not to use move_mount(2) to
// install the mount.
type mountSourceType string
const (
// An open_tree(2)-style file descriptor that needs to be installed using
// move_mount(2).
mountSourceOpenTree mountSourceType = "open_tree"
// A plain file descriptor that can be mounted through /proc/self/fd.
mountSourcePlain mountSourceType = "plain-open"
)
// mountSource pairs an opened mount source file with the mountSourceType that
// says how it has to be installed.
type mountSource struct {
// Type is the kind of file descriptor held in file; it is the only field
// included in the JSON encoding.
Type mountSourceType `json:"type"`
// file is the opened mount source; it is excluded from the JSON encoding.
file *os.File `json:"-"`
}
// mountError holds an error from a failed mount or unmount operation. // mountError holds an error from a failed mount or unmount operation.
type mountError struct { type mountError struct {
op string op string
source string source string
srcFD *int srcFile *mountSource
target string target string
dstFD string dstFd string
flags uintptr flags uintptr
data string data string
err error err error
@@ -25,13 +50,14 @@ func (e *mountError) Error() string {
if e.source != "" { if e.source != "" {
out += "src=" + e.source + ", " out += "src=" + e.source + ", "
if e.srcFD != nil { if e.srcFile != nil {
out += "srcFD=" + strconv.Itoa(*e.srcFD) + ", " out += "srcType=" + string(e.srcFile.Type) + ", "
out += "srcFd=" + strconv.Itoa(int(e.srcFile.file.Fd())) + ", "
} }
} }
out += "dst=" + e.target out += "dst=" + e.target
if e.dstFD != "" { if e.dstFd != "" {
out += ", dstFD=" + e.dstFD out += ", dstFd=" + e.dstFd
} }
if e.flags != uintptr(0) { if e.flags != uintptr(0) {
@@ -54,35 +80,55 @@ func (e *mountError) Unwrap() error {
// mount is a simple unix.Mount wrapper, returning an error with more context // mount is a simple unix.Mount wrapper, returning an error with more context
// in case it failed. // in case it failed.
func mount(source, target, fstype string, flags uintptr, data string) error { func mount(source, target, fstype string, flags uintptr, data string) error {
return mountViaFDs(source, nil, target, "", fstype, flags, data) return mountViaFds(source, nil, target, "", fstype, flags, data)
} }
// mountViaFDs is a unix.Mount wrapper which uses srcFD instead of source, // mountViaFds is a unix.Mount wrapper which uses srcFile instead of source,
// and dstFD instead of target, unless those are empty. // and dstFd instead of target, unless those are empty.
// If srcFD is different than nil, its path (i.e. "/proc/self/fd/NN") will be
// constructed by this function.
// dstFD argument, if non-empty, is expected to be in the form of a path to an
// opened file descriptor on procfs (i.e. "/proc/self/fd/NN").
// //
// If case an FD is used instead of a source or a target path, the // If srcFile is non-nil and flags does not contain MS_REMOUNT, mountViaFds
// corresponding path is only used to add context to an error in case // will mount it according to the mountSourceType of the file descriptor.
// the mount operation has failed. //
func mountViaFDs(source string, srcFD *int, target, dstFD, fstype string, flags uintptr, data string) error { // The dstFd argument, if non-empty, is expected to be in the form of a path to
src := source // an opened file descriptor on procfs (i.e. "/proc/self/fd/NN").
if srcFD != nil { //
src = "/proc/self/fd/" + strconv.Itoa(*srcFD) // If a file descriptor is used instead of a source or a target path, the
// corresponding path is only used to add context to an error in case the mount
// operation has failed.
func mountViaFds(source string, srcFile *mountSource, target, dstFd, fstype string, flags uintptr, data string) error {
// MS_REMOUNT and srcFile don't make sense together.
if srcFile != nil && flags&unix.MS_REMOUNT != 0 {
logrus.Debugf("mount source passed along with MS_REMOUNT -- ignoring srcFile")
srcFile = nil
} }
dst := target dst := target
if dstFD != "" { if dstFd != "" {
dst = dstFD dst = dstFd
} }
if err := unix.Mount(src, dst, fstype, flags, data); err != nil { src := source
if srcFile != nil {
src = "/proc/self/fd/" + strconv.Itoa(int(srcFile.file.Fd()))
}
var op string
var err error
if srcFile != nil && srcFile.Type == mountSourceOpenTree {
op = "move_mount"
err = unix.MoveMount(int(srcFile.file.Fd()), "",
unix.AT_FDCWD, dstFd,
unix.MOVE_MOUNT_F_EMPTY_PATH|unix.MOVE_MOUNT_T_SYMLINKS)
} else {
op = "mount"
err = unix.Mount(src, dst, fstype, flags, data)
}
if err != nil {
return &mountError{ return &mountError{
op: "mount", op: op,
source: source, source: source,
srcFD: srcFD, srcFile: srcFile,
target: target, target: target,
dstFD: dstFD, dstFd: dstFd,
flags: flags, flags: flags,
data: data, data: data,
err: err, err: err,
@@ -121,3 +167,81 @@ func syscallMode(i fs.FileMode) (o uint32) {
// No mapping for Go's ModeTemporary (plan9 only). // No mapping for Go's ModeTemporary (plan9 only).
return return
} }
// mountFd creates a "mount source fd" (either through open_tree(2) or just
// open(O_PATH)) based on the provided configuration. This function must be
// called from within the container's mount namespace.
//
// In the case of idmapped mount configurations, the returned mount source will
// be an open_tree(2) file with MOUNT_ATTR_IDMAP applied. For other
// bind-mounts, it will be an O_PATH. Mounts that are not bind-mounts cannot be
// handled here: a nil mountSource and a non-nil error are returned for them.
//
// nsHandles is used to obtain the user namespace file for the id-mapping; if
// it is nil, a temporary set of handles is created and released before
// returning.
//
// On any error no file descriptor is left open: the open_tree(2) file is
// closed explicitly if configuring the id-mapping fails.
//
// This helper is only intended to be used by goCreateMountSources.
func mountFd(nsHandles *userns.Handles, m *configs.Mount) (*mountSource, error) {
	if !m.IsBind() {
		return nil, errors.New("new mount api: only bind-mounts are supported")
	}
	if nsHandles == nil {
		nsHandles = new(userns.Handles)
		defer nsHandles.Release()
	}

	var mountFile *os.File
	var sourceType mountSourceType

	// Ideally, we would use OPEN_TREE_CLONE for everything, because we can
	// be sure that the file descriptor cannot be used to escape outside of
	// the mount root. Unfortunately, OPEN_TREE_CLONE is far more expensive
	// than open(2) because it requires doing mounts inside a new anonymous
	// mount namespace. So we use open(2) for standard bind-mounts, and
	// OPEN_TREE_CLONE when we need to set mount attributes here.
	//
	// While passing open(2)'d paths from the host rootfs isn't exactly the
	// safest thing in the world, the files will not survive across
	// execve(2) and "runc init" is non-dumpable so it should not be
	// possible for a malicious container process to gain access to the
	// file descriptors. We also don't do any of this for "runc exec",
	// lessening the risk even further.
	if m.IsIDMapped() {
		flags := uint(unix.OPEN_TREE_CLONE | unix.OPEN_TREE_CLOEXEC)
		if m.Flags&unix.MS_REC == unix.MS_REC {
			flags |= unix.AT_RECURSIVE
		}
		fd, err := unix.OpenTree(unix.AT_FDCWD, m.Source, flags)
		if err != nil {
			return nil, &os.PathError{Op: "open_tree(OPEN_TREE_CLONE)", Path: m.Source, Err: err}
		}
		mountFile = os.NewFile(uintptr(fd), m.Source)
		sourceType = mountSourceOpenTree

		// Configure the id mapping.
		usernsFile, err := nsHandles.Get(userns.Mapping{
			UIDMappings: m.UIDMappings,
			GIDMappings: m.GIDMappings,
		})
		if err != nil {
			// Don't leak the cloned mount tree; the original error is
			// the one worth reporting, so the Close error is dropped.
			_ = mountFile.Close()
			return nil, fmt.Errorf("failed to create userns for %s id-mapping: %w", m.Source, err)
		}
		defer usernsFile.Close()

		if err := unix.MountSetattr(int(mountFile.Fd()), "", unix.AT_EMPTY_PATH, &unix.MountAttr{
			Attr_set:  unix.MOUNT_ATTR_IDMAP,
			Userns_fd: uint64(usernsFile.Fd()),
		}); err != nil {
			// Same as above: release the cloned mount tree on failure.
			_ = mountFile.Close()
			return nil, fmt.Errorf("failed to set MOUNT_ATTR_IDMAP on %s: %w", m.Source, err)
		}
	} else {
		var err error
		mountFile, err = os.OpenFile(m.Source, unix.O_PATH|unix.O_CLOEXEC, 0)
		if err != nil {
			return nil, err
		}
		sourceType = mountSourcePlain
	}
	return &mountSource{
		Type: sourceType,
		file: mountFile,
	}, nil
}

View File

@@ -1,76 +0,0 @@
/*
 * idmap.h: fallback definitions for the open_tree(2) and mount_setattr(2)
 * system calls, for build environments whose headers do not provide them.
 * Every definition is guarded so that it is only used when the toolchain
 * headers have not already supplied one.
 */
#ifndef IDMAP_H
#define IDMAP_H
#include <sys/mount.h>
// Centos-7 doesn't have this file nor the __has_include() directive, so let's
// just leave this commented out and we can uncomment when it hits EOL (2024-06-30).
//#include <linux/mount.h>
#include <sys/syscall.h>
#include <unistd.h>
/* mount_setattr() */
#ifndef MOUNT_ATTR_IDMAP
#define MOUNT_ATTR_IDMAP 0x00100000
#endif
/* Syscall number: 442, plus a per-ABI base offset on MIPS. */
#ifndef __NR_mount_setattr
#if defined _MIPS_SIM
#if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
#define __NR_mount_setattr (442 + 4000)
#endif
#if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */
#define __NR_mount_setattr (442 + 6000)
#endif
#if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */
#define __NR_mount_setattr (442 + 5000)
#endif
#else
#define __NR_mount_setattr 442
#endif
#endif
/* Only declare struct mount_attr when MOUNT_ATTR_SIZE_VER0 is not defined. */
#ifndef MOUNT_ATTR_SIZE_VER0
struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u64 propagation;
__u64 userns_fd;
};
#endif
/* open_tree() */
#ifndef OPEN_TREE_CLONE
#define OPEN_TREE_CLONE 1
#endif
/*
 * NOTE(review): O_CLOEXEC is not declared by a header included directly in
 * this file; presumably the including file (or a transitive include) provides
 * <fcntl.h> -- confirm.
 */
#ifndef OPEN_TREE_CLOEXEC
#define OPEN_TREE_CLOEXEC O_CLOEXEC
#endif
/* Syscall number: 428, plus a per-ABI base offset on MIPS. */
#ifndef __NR_open_tree
#if defined _MIPS_SIM
#if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
#define __NR_open_tree 4428
#endif
#if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */
#define __NR_open_tree 6428
#endif
#if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */
#define __NR_open_tree 5428
#endif
#else
#define __NR_open_tree 428
#endif
#endif
/* Raw syscall wrapper for mount_setattr(2); returns whatever syscall() returns. */
static inline int sys_mount_setattr(int dfd, const char *path, unsigned int flags, struct mount_attr *attr, size_t size)
{
return syscall(__NR_mount_setattr, dfd, path, flags, attr, size);
}
/* Raw syscall wrapper for open_tree(2); returns whatever syscall() returns. */
static inline int sys_open_tree(int dfd, const char *filename, unsigned int flags)
{
return syscall(__NR_open_tree, dfd, filename, flags);
}
#endif /* IDMAP_H */

View File

@@ -33,9 +33,6 @@
/* Get all of the CLONE_NEW* flags. */ /* Get all of the CLONE_NEW* flags. */
#include "namespace.h" #include "namespace.h"
/* Get definitions for idmap sources */
#include "idmap.h"
/* Synchronisation values. */ /* Synchronisation values. */
enum sync_t { enum sync_t {
SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */ SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */
@@ -44,12 +41,8 @@ enum sync_t {
SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */ SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */
SYNC_GRANDCHILD = 0x44, /* The grandchild is ready to run. */ SYNC_GRANDCHILD = 0x44, /* The grandchild is ready to run. */
SYNC_CHILD_FINISH = 0x45, /* The child or grandchild has finished. */ SYNC_CHILD_FINISH = 0x45, /* The child or grandchild has finished. */
SYNC_MOUNTSOURCES_PLS = 0x46, /* Tell parent to send mount sources by SCM_RIGHTS. */ SYNC_TIMEOFFSETS_PLS = 0x46, /* Request parent to write timens offsets. */
SYNC_MOUNTSOURCES_ACK = 0x47, /* All mount sources have been sent. */ SYNC_TIMEOFFSETS_ACK = 0x47, /* Timens offsets were written. */
SYNC_MOUNT_IDMAP_PLS = 0x48, /* Tell parent to mount idmap sources. */
SYNC_MOUNT_IDMAP_ACK = 0x49, /* All idmap mounts have been done. */
SYNC_TIMEOFFSETS_PLS = 0x50, /* Request parent to write timens offsets. */
SYNC_TIMEOFFSETS_ACK = 0x51, /* Timens offsets were written. */
}; };
#define STAGE_SETUP -1 #define STAGE_SETUP -1
@@ -99,14 +92,6 @@ struct nlconfig_t {
char *gidmappath; char *gidmappath;
size_t gidmappath_len; size_t gidmappath_len;
/* Mount sources opened outside the container userns. */
char *mountsources;
size_t mountsources_len;
/* Idmap sources opened outside the container userns which will be id mapped. */
char *idmapsources;
size_t idmapsources_len;
/* Time NS offsets. */ /* Time NS offsets. */
char *timensoffset; char *timensoffset;
size_t timensoffset_len; size_t timensoffset_len;
@@ -126,9 +111,7 @@ struct nlconfig_t {
#define ROOTLESS_EUID_ATTR 27287 #define ROOTLESS_EUID_ATTR 27287
#define UIDMAPPATH_ATTR 27288 #define UIDMAPPATH_ATTR 27288
#define GIDMAPPATH_ATTR 27289 #define GIDMAPPATH_ATTR 27289
#define MOUNT_SOURCES_ATTR 27290 #define TIMENSOFFSET_ATTR 27290
#define IDMAP_SOURCES_ATTR 27291
#define TIMENSOFFSET_ATTR 27292
/* /*
* Use the raw syscall for versions of glibc which don't include a function for * Use the raw syscall for versions of glibc which don't include a function for
@@ -446,14 +429,6 @@ static void nl_parse(int fd, struct nlconfig_t *config)
case SETGROUP_ATTR: case SETGROUP_ATTR:
config->is_setgroup = readint8(current); config->is_setgroup = readint8(current);
break; break;
case MOUNT_SOURCES_ATTR:
config->mountsources = current;
config->mountsources_len = payload_len;
break;
case IDMAP_SOURCES_ATTR:
config->idmapsources = current;
config->idmapsources_len = payload_len;
break;
case TIMENSOFFSET_ATTR: case TIMENSOFFSET_ATTR:
config->timensoffset = current; config->timensoffset = current;
config->timensoffset_len = payload_len; config->timensoffset_len = payload_len;
@@ -546,115 +521,6 @@ static inline int sane_kill(pid_t pid, int signum)
return 0; return 0;
} }
/* receive_fd_sources parses env_var as an array of fd numbers and, for each
 * element that is not -1, it receives an fd via SCM_RIGHTS from sockfd and
 * dup3()s it (with O_CLOEXEC) onto the fd number requested in that element,
 * closing the temporary received fd afterwards.
 *
 * The env var must be a json array of ints. A missing or malformed variable,
 * an out-of-range fd number, or a failing dup3()/close() is fatal (bail()).
 */
void receive_fd_sources(int sockfd, const char *env_var)
{
	char *fds, *endp;
	long new_fd;

	// This env var must be a json array of ints.
	fds = getenv(env_var);
	// getenv() returns NULL for an unset variable; fail with a clear message
	// instead of dereferencing a NULL pointer below.
	if (fds == NULL) {
		bail("missing %s env var", env_var);
	}
	if (fds[0] != '[') {
		bail("malformed %s env var: missing '['", env_var);
	}
	fds++;

	for (endp = fds; *endp != ']'; fds = endp + 1) {
		new_fd = strtol(fds, &endp, 10);
		if (endp == fds) {
			bail("malformed %s env var: not a number", env_var);
		}
		if (*endp == '\0') {
			bail("malformed %s env var: missing ]", env_var);
		}
		// The list contains -1 when no fd is needed. Ignore them.
		if (new_fd == -1) {
			continue;
		}

		if (new_fd == LONG_MAX || new_fd < 0 || new_fd > INT_MAX) {
			bail("malformed %s env var: fds out of range", env_var);
		}

		int recv_fd = receive_fd(sockfd);
		if (dup3(recv_fd, new_fd, O_CLOEXEC) < 0) {
			bail("cannot dup3 fd %d to %ld", recv_fd, new_fd);
		}
		if (close(recv_fd) < 0) {
			bail("cannot close fd %d", recv_fd);
		}
	}
}
/* Receive the bind-mount source fds listed in _LIBCONTAINER_MOUNT_FDS. */
void receive_mountsources(int sockfd)
{
	static const char env_var[] = "_LIBCONTAINER_MOUNT_FDS";

	receive_fd_sources(sockfd, env_var);
}
/* send_mountsources opens every path found in mountsources (a buffer of
 * mountsources_len bytes holding NUL-terminated strings; empty entries are
 * skipped) with O_PATH from inside the mount namespace of child, and sends
 * each resulting fd over sockfd.
 *
 * The calling process temporarily joins the mount namespace of child and
 * switches back to its original one before returning. Any failure is fatal
 * (bail()). Nothing is done when mountsources is NULL.
 */
void send_mountsources(int sockfd, pid_t child, char *mountsources, size_t mountsources_len)
{
char proc_path[PATH_MAX];
int host_mntns_fd;
int container_mntns_fd;
int fd;
int ret;
// container_linux.go shouldSendMountSources() decides if mount sources
// should be pre-opened (O_PATH) and passed via SCM_RIGHTS
if (mountsources == NULL)
return;
// Keep a handle on the current mount namespace so it can be re-joined
// once all sources have been opened.
host_mntns_fd = open("/proc/self/ns/mnt", O_RDONLY | O_CLOEXEC);
if (host_mntns_fd == -1)
bail("failed to get current mount namespace");
if (snprintf(proc_path, PATH_MAX, "/proc/%d/ns/mnt", child) < 0)
bail("failed to get mount namespace path");
container_mntns_fd = open(proc_path, O_RDONLY | O_CLOEXEC);
if (container_mntns_fd == -1)
bail("failed to get container mount namespace");
// From here until the second setns() below, paths are opened in the
// mount namespace of child.
if (setns(container_mntns_fd, CLONE_NEWNS) < 0)
bail("failed to setns to container mntns");
char *mountsources_end = mountsources + mountsources_len;
while (mountsources < mountsources_end) {
// An empty string is a placeholder entry: nothing to open or send.
if (mountsources[0] == '\0') {
mountsources++;
continue;
}
fd = open(mountsources, O_PATH | O_CLOEXEC);
if (fd < 0)
bail("failed to open mount source %s", mountsources);
write_log(DEBUG, "~> sending fd for: %s", mountsources);
if (send_fd(sockfd, fd) < 0)
bail("failed to send fd %d via unix socket %d", fd, sockfd);
ret = close(fd);
if (ret != 0)
bail("failed to close mount source fd %d", fd);
// Advance past this string and its terminating NUL.
mountsources += strlen(mountsources) + 1;
}
// Switch back to the mount namespace this function started in.
if (setns(host_mntns_fd, CLONE_NEWNS) < 0)
bail("failed to setns to host mntns");
ret = close(host_mntns_fd);
if (ret != 0)
bail("failed to close host mount namespace fd %d", host_mntns_fd);
ret = close(container_mntns_fd);
if (ret != 0)
bail("failed to close container mount namespace fd %d", container_mntns_fd);
}
void try_unshare(int flags, const char *msg) void try_unshare(int flags, const char *msg)
{ {
write_log(DEBUG, "unshare %s", msg); write_log(DEBUG, "unshare %s", msg);
@@ -674,89 +540,6 @@ void try_unshare(int flags, const char *msg)
bail("failed to unshare %s", msg); bail("failed to unshare %s", msg);
} }
/* send_idmapsources creates an id-mapped detached mount for every path found
 * in idmap_src (a buffer of idmap_src_len bytes holding NUL-terminated
 * strings; empty entries are skipped) and sends the resulting open_tree(2) fd
 * over sockfd.
 *
 * Each mount is id-mapped with the user namespace of process pid. On any
 * failure, pid is sent SIGKILL and the function bails.
 */
void send_idmapsources(int sockfd, pid_t pid, char *idmap_src, int idmap_src_len)
{
	char proc_user_path[PATH_MAX];
	int saved_errno;

	/* Open the userns fd only once.
	 * Currently we only support idmap mounts that use the same mapping than
	 * the userns. This is validated in libcontainer/configs/validate/validator.go,
	 * so if we reached here, we know the mapping for the idmap is the same
	 * as the userns. This is why we just open the userns_fd once from the
	 * PID of the child process that has the userns already applied.
	 */
	int ret = snprintf(proc_user_path, sizeof(proc_user_path), "/proc/%d/ns/user", pid);
	if (ret < 0 || (size_t)ret >= sizeof(proc_user_path)) {
		sane_kill(pid, SIGKILL);
		bail("failed to create userns path string");
	}

	int userns_fd = open(proc_user_path, O_RDONLY | O_CLOEXEC | O_NOCTTY);
	if (userns_fd < 0) {
		sane_kill(pid, SIGKILL);
		bail("failed to get user namespace fd");
	}

	char *idmap_end = idmap_src + idmap_src_len;
	while (idmap_src < idmap_end) {
		/* An empty string is a placeholder entry: nothing to mount or send. */
		if (idmap_src[0] == '\0') {
			idmap_src++;
			continue;
		}

		int fd_tree = sys_open_tree(-EBADF, idmap_src,
					    OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC |
					    AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT);
		if (fd_tree < 0) {
			/* sane_kill() may overwrite errno; keep the value set by
			 * open_tree(2) so the checks below look at the right error. */
			saved_errno = errno;
			sane_kill(pid, SIGKILL);
			errno = saved_errno;
			if (errno == ENOSYS) {
				bail("open_tree(2) failed, the kernel doesn't support ID-mapped mounts");
			} else if (errno == EINVAL) {
				bail("open_tree(2) failed with path: %s, the kernel doesn't support ID-mapped mounts",
				     idmap_src);
			} else {
				bail("open_tree(2) failed with path: %s", idmap_src);
			}
		}

		struct mount_attr attr = {
			.attr_set = MOUNT_ATTR_IDMAP,
			.userns_fd = userns_fd,
		};

		ret = sys_mount_setattr(fd_tree, "", AT_EMPTY_PATH, &attr, sizeof(attr));
		if (ret < 0) {
			/* Same as above: preserve the errno of mount_setattr(2). */
			saved_errno = errno;
			sane_kill(pid, SIGKILL);
			errno = saved_errno;
			if (errno == ENOSYS)
				bail("mount_setattr(2) failed, the kernel doesn't support ID-mapped mounts");
			else if (errno == EINVAL)
				bail("mount_setattr(2) failed with path: %s, maybe the filesystem doesn't support ID-mapped mounts", idmap_src);
			else
				bail("mount_setattr(2) failed with path: %s", idmap_src);
		}

		write_log(DEBUG, "~> sending idmap source: %s with mapping from: %s", idmap_src, proc_user_path);
		/* A failed send would leave the receiver waiting for an fd that
		 * never arrives, so treat it as fatal like every other step. */
		if (send_fd(sockfd, fd_tree) < 0) {
			sane_kill(pid, SIGKILL);
			bail("failed to send fd %d via unix socket %d", fd_tree, sockfd);
		}

		if (close(fd_tree) < 0) {
			sane_kill(pid, SIGKILL);
			bail("error closing fd_tree");
		}

		/* Advance past this string and its terminating NUL. */
		idmap_src += strlen(idmap_src) + 1;
	}

	if (close(userns_fd) < 0) {
		sane_kill(pid, SIGKILL);
		bail("error closing userns fd");
	}
}
/*
 * receive_idmapsources receives the idmap source fds sent over sockfd by
 * delegating to receive_fd_sources() with the "_LIBCONTAINER_IDMAP_FDS" key.
 * NOTE(review): the key is presumably the name under which the received fds
 * are published for later stages -- confirm against receive_fd_sources().
 */
void receive_idmapsources(int sockfd)
{
	receive_fd_sources(sockfd, "_LIBCONTAINER_IDMAP_FDS");
}
static void update_timens_offsets(pid_t pid, char *map, size_t map_len) static void update_timens_offsets(pid_t pid, char *map, size_t map_len)
{ {
if (map == NULL || map_len == 0) if (map == NULL || map_len == 0)
@@ -988,28 +771,6 @@ void nsexec(void)
sane_kill(stage2_pid, SIGKILL); sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with runc: write(pid-JSON)"); bail("failed to sync with runc: write(pid-JSON)");
} }
break;
case SYNC_MOUNTSOURCES_PLS:
write_log(DEBUG, "stage-1 requested to open mount sources");
send_mountsources(syncfd, stage1_pid, config.mountsources,
config.mountsources_len);
s = SYNC_MOUNTSOURCES_ACK;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
sane_kill(stage1_pid, SIGKILL);
bail("failed to sync with child: write(SYNC_MOUNTSOURCES_ACK)");
}
break;
case SYNC_MOUNT_IDMAP_PLS:
write_log(DEBUG, "stage-1 requested to open idmap sources");
send_idmapsources(syncfd, stage1_pid, config.idmapsources,
config.idmapsources_len);
s = SYNC_MOUNT_IDMAP_ACK;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
sane_kill(stage1_pid, SIGKILL);
bail("failed to sync with child: write(SYNC_MOUNT_IDMAP_ACK)");
}
break; break;
case SYNC_TIMEOFFSETS_PLS: case SYNC_TIMEOFFSETS_PLS:
write_log(DEBUG, "stage-1 requested timens offsets to be configured"); write_log(DEBUG, "stage-1 requested timens offsets to be configured");
@@ -1186,38 +947,6 @@ void nsexec(void)
bail("failed to sync with parent: SYNC_TIMEOFFSETS_ACK: got %u", s); bail("failed to sync with parent: SYNC_TIMEOFFSETS_ACK: got %u", s);
} }
/* Ask our parent to send the mount sources fds. */
if (config.mountsources) {
write_log(DEBUG, "request stage-0 to send mount sources");
s = SYNC_MOUNTSOURCES_PLS;
if (write(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with parent: write(SYNC_MOUNTSOURCES_PLS)");
/* Receive and install all mount sources fds. */
receive_mountsources(syncfd);
/* Parent finished to send the mount sources fds. */
if (read(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with parent: read(SYNC_MOUNTSOURCES_ACK)");
if (s != SYNC_MOUNTSOURCES_ACK)
bail("failed to sync with parent: SYNC_MOUNTSOURCES_ACK: got %u", s);
}
if (config.idmapsources) {
write_log(DEBUG, "request stage-0 to send idmap sources");
s = SYNC_MOUNT_IDMAP_PLS;
if (write(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with parent: write(SYNC_MOUNT_IDMAP_PLS)");
/* Receive and install all idmap fds. */
receive_idmapsources(syncfd);
if (read(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with parent: read(SYNC_MOUNT_IDMAP_ACK)");
if (s != SYNC_MOUNT_IDMAP_ACK)
bail("failed to sync with parent: SYNC_MOUNT_IDMAP_ACK: got %u", s);
}
/* /*
* TODO: What about non-namespace clone flags that we're dropping here? * TODO: What about non-namespace clone flags that we're dropping here?
* *

View File

@@ -1,6 +1,7 @@
package libcontainer package libcontainer
import ( import (
"context"
"encoding/json" "encoding/json"
"errors" "errors"
"fmt" "fmt"
@@ -11,18 +12,21 @@ import (
"path/filepath" "path/filepath"
"runtime" "runtime"
"strconv" "strconv"
"sync"
"time" "time"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs2" "github.com/opencontainers/runc/libcontainer/cgroups/fs2"
"github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/intelrdt" "github.com/opencontainers/runc/libcontainer/intelrdt"
"github.com/opencontainers/runc/libcontainer/logs" "github.com/opencontainers/runc/libcontainer/logs"
"github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/userns"
"github.com/opencontainers/runc/libcontainer/utils" "github.com/opencontainers/runc/libcontainer/utils"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
) )
type parentProcess interface { type parentProcess interface {
@@ -208,6 +212,9 @@ func (p *setnsProcess) start() (retErr error) {
case procHooks: case procHooks:
// This shouldn't happen. // This shouldn't happen.
panic("unexpected procHooks in setns") panic("unexpected procHooks in setns")
case procMountPlease:
// This shouldn't happen.
panic("unexpected procMountPlease in setns")
case procSeccomp: case procSeccomp:
if p.config.Config.Seccomp.ListenerPath == "" { if p.config.Config.Seccomp.ListenerPath == "" {
return errors.New("seccomp listenerPath is not set") return errors.New("seccomp listenerPath is not set")
@@ -398,6 +405,110 @@ func (p *initProcess) waitForChildExit(childPid int) error {
return nil return nil
} }
type mountSourceRequestFn func(*configs.Mount) (*mountSource, error)
// goCreateMountSources spawns a goroutine which creates open_tree(2)-style
// mountfds based on the requested configs.Mount configuration. The returned
// requestFn and cancelFn are used to interact with the goroutine.
//
// The goroutine runs on a locked OS thread that joins the mount namespace of
// p.pid(). It stops when ctx is cancelled, when the one-minute timeout set up
// here expires, or after a request has failed. Setup errors (unshare, opening
// the namespace file, setns) are returned directly and no functions are
// returned in that case.
//
// Once requestFn has returned an error caused by cancellation or by the
// goroutine having exited, every later call fails with an error as well.
//
// The caller of the returned mountSourceRequestFn is responsible for closing
// the returned file.
func (p *initProcess) goCreateMountSources(ctx context.Context) (mountSourceRequestFn, context.CancelFunc, error) {
	type response struct {
		src *mountSource
		err error
	}

	errCh := make(chan error, 1)
	requestCh := make(chan *configs.Mount)
	responseCh := make(chan response)

	ctx, cancelFn := context.WithTimeout(ctx, 1*time.Minute)
	go func() {
		// We lock this thread because we need to setns(2) here. There is no
		// UnlockOSThread() here, to ensure that the Go runtime will kill this
		// thread once this goroutine returns (ensuring no other goroutines run
		// in this context).
		runtime.LockOSThread()

		// Detach from the shared fs of the rest of the Go process in order to
		// be able to CLONE_NEWNS.
		if err := unix.Unshare(unix.CLONE_FS); err != nil {
			err = os.NewSyscallError("unshare(CLONE_FS)", err)
			errCh <- fmt.Errorf("mount source thread: %w", err)
			return
		}

		// Attach to the container's mount namespace.
		nsFd, err := os.Open(fmt.Sprintf("/proc/%d/ns/mnt", p.pid()))
		if err != nil {
			errCh <- fmt.Errorf("mount source thread: open container mntns: %w", err)
			return
		}
		defer nsFd.Close()
		if err := unix.Setns(int(nsFd.Fd()), unix.CLONE_NEWNS); err != nil {
			err = os.NewSyscallError("setns", err)
			errCh <- fmt.Errorf("mount source thread: join container mntns: %w", err)
			return
		}

		// No errors during setup!
		close(errCh)
		logrus.Debugf("mount source thread: successfully running in container mntns")

		nsHandles := new(userns.Handles)
		defer nsHandles.Release()
	loop:
		for {
			select {
			case m, ok := <-requestCh:
				if !ok {
					break loop
				}
				src, err := mountFd(nsHandles, m)
				logrus.Debugf("mount source thread: handling request for %q: %v %v", m.Source, src, err)
				// The requester stops waiting once the context is done, so
				// the reply must not block unconditionally -- otherwise this
				// goroutine (and its locked thread) would be stuck forever.
				select {
				case responseCh <- response{src: src, err: err}:
				case <-ctx.Done():
					// Nobody will take ownership of the mountfd, so close
					// it here rather than leaking it.
					if src != nil && src.file != nil {
						_ = src.file.Close()
					}
					break loop
				}
			case <-ctx.Done():
				break loop
			}
		}
		logrus.Debugf("mount source thread: closing thread: %v", ctx.Err())
		close(responseCh)
	}()

	// Check for setup errors.
	err := <-errCh
	if err != nil {
		cancelFn()
		return nil, nil, err
	}

	// TODO: Switch to context.AfterFunc when we switch to Go 1.21.
	var requestChCloseOnce sync.Once
	requestFn := func(m *configs.Mount) (*mountSource, error) {
		// After a failure requestCh is closed and the context is cancelled
		// (see below); bail out early so we never send on the closed channel.
		if err := ctx.Err(); err != nil {
			return nil, fmt.Errorf("send mount request cancelled: %w", err)
		}
		var err error
		select {
		case requestCh <- m:
			select {
			case resp, ok := <-responseCh:
				if ok {
					return resp.src, resp.err
				}
				// The goroutine closed responseCh without answering. Report
				// it instead of returning (nil, nil), which callers would
				// dereference.
				err = errors.New("mount source thread exited before responding to request")
			case <-ctx.Done():
				err = fmt.Errorf("receive mount source context cancelled: %w", ctx.Err())
			}
		case <-ctx.Done():
			err = fmt.Errorf("send mount request cancelled: %w", ctx.Err())
		}
		// Shut everything down. Cancelling first guarantees that the early
		// ctx.Err() check above rejects any later request.
		cancelFn()
		requestChCloseOnce.Do(func() { close(requestCh) })
		return nil, err
	}
	return requestFn, cancelFn, nil
}
func (p *initProcess) start() (retErr error) { func (p *initProcess) start() (retErr error) {
defer p.comm.closeParent() defer p.comm.closeParent()
err := p.cmd.Start() err := p.cmd.Start()
@@ -487,6 +598,22 @@ func (p *initProcess) start() (retErr error) {
return fmt.Errorf("error waiting for our first child to exit: %w", err) return fmt.Errorf("error waiting for our first child to exit: %w", err)
} }
// Spin up a goroutine to handle remapping mount requests by runc init.
// There is no point doing this for rootless containers because they cannot
// configure MOUNT_ATTR_IDMAP, nor do OPEN_TREE_CLONE. We could just
// service plain-open requests for plain bind-mounts but there's no need
// (rootless containers will never have permission issues on a source mount
// that the parent process can help with -- they are the same user).
var mountRequest mountSourceRequestFn
if !p.container.config.RootlessEUID {
request, cancel, err := p.goCreateMountSources(context.Background())
if err != nil {
return fmt.Errorf("error spawning mount remapping thread: %w", err)
}
defer cancel()
mountRequest = request
}
if err := p.createNetworkInterfaces(); err != nil { if err := p.createNetworkInterfaces(); err != nil {
return fmt.Errorf("error creating network interfaces: %w", err) return fmt.Errorf("error creating network interfaces: %w", err)
} }
@@ -500,6 +627,35 @@ func (p *initProcess) start() (retErr error) {
var seenProcReady bool var seenProcReady bool
ierr := parseSync(p.comm.syncSockParent, func(sync *syncT) error { ierr := parseSync(p.comm.syncSockParent, func(sync *syncT) error {
switch sync.Type { switch sync.Type {
case procMountPlease:
if mountRequest == nil {
return fmt.Errorf("cannot fulfil mount requests as a rootless user")
}
var m *configs.Mount
if sync.Arg == nil {
return fmt.Errorf("sync %q is missing an argument", sync.Type)
}
if err := json.Unmarshal(*sync.Arg, &m); err != nil {
return fmt.Errorf("sync %q passed invalid mount arg: %w", sync.Type, err)
}
mnt, err := mountRequest(m)
if err != nil {
return fmt.Errorf("failed to fulfil mount request: %w", err)
}
defer mnt.file.Close()
arg, err := json.Marshal(mnt)
if err != nil {
return fmt.Errorf("sync %q failed to marshal mountSource: %w", sync.Type, err)
}
argMsg := json.RawMessage(arg)
if err := doWriteSync(p.comm.syncSockParent, syncT{
Type: procMountFd,
Arg: &argMsg,
File: mnt.file,
}); err != nil {
return err
}
case procSeccomp: case procSeccomp:
if p.config.Config.Seccomp.ListenerPath == "" { if p.config.Config.Seccomp.ListenerPath == "" {
return errors.New("seccomp listenerPath is not set") return errors.New("seccomp listenerPath is not set")

View File

@@ -1,6 +1,7 @@
package libcontainer package libcontainer
import ( import (
"encoding/json"
"errors" "errors"
"fmt" "fmt"
"os" "os"
@@ -13,16 +14,17 @@ import (
securejoin "github.com/cyphar/filepath-securejoin" securejoin "github.com/cyphar/filepath-securejoin"
"github.com/moby/sys/mountinfo" "github.com/moby/sys/mountinfo"
"github.com/mrunalp/fileutils" "github.com/mrunalp/fileutils"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/opencontainers/selinux/go-selinux/label"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs2" "github.com/opencontainers/runc/libcontainer/cgroups/fs2"
"github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices" "github.com/opencontainers/runc/libcontainer/devices"
"github.com/opencontainers/runc/libcontainer/userns" "github.com/opencontainers/runc/libcontainer/userns"
"github.com/opencontainers/runc/libcontainer/utils" "github.com/opencontainers/runc/libcontainer/utils"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/opencontainers/selinux/go-selinux/label"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
) )
const defaultMountFlags = unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV const defaultMountFlags = unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
@@ -39,12 +41,12 @@ type mountConfig struct {
// mountEntry contains mount data specific to a mount point. // mountEntry contains mount data specific to a mount point.
type mountEntry struct { type mountEntry struct {
*configs.Mount *configs.Mount
srcFD *int srcFile *mountSource
} }
func (m *mountEntry) src() string { func (m *mountEntry) src() string {
if m.srcFD != nil { if m.srcFile != nil {
return "/proc/self/fd/" + strconv.Itoa(*m.srcFD) return "/proc/self/fd/" + strconv.Itoa(int(m.srcFile.file.Fd()))
} }
return m.Source return m.Source
} }
@@ -62,20 +64,12 @@ func needsSetupDev(config *configs.Config) bool {
// prepareRootfs sets up the devices, mount points, and filesystems for use // prepareRootfs sets up the devices, mount points, and filesystems for use
// inside a new mount namespace. It doesn't set anything as ro. You must call // inside a new mount namespace. It doesn't set anything as ro. You must call
// finalizeRootfs after this function to finish setting up the rootfs. // finalizeRootfs after this function to finish setting up the rootfs.
func prepareRootfs(pipe *syncSocket, iConfig *initConfig, mountFds mountFds) (err error) { func prepareRootfs(pipe *syncSocket, iConfig *initConfig) (err error) {
config := iConfig.Config config := iConfig.Config
if err := prepareRoot(config); err != nil { if err := prepareRoot(config); err != nil {
return fmt.Errorf("error preparing rootfs: %w", err) return fmt.Errorf("error preparing rootfs: %w", err)
} }
if mountFds.sourceFds != nil && len(mountFds.sourceFds) != len(config.Mounts) {
return fmt.Errorf("malformed mountFds slice. Expected size: %v, got: %v", len(config.Mounts), len(mountFds.sourceFds))
}
if mountFds.idmapFds != nil && len(mountFds.idmapFds) != len(config.Mounts) {
return fmt.Errorf("malformed idmapFds slice: expected size: %v, got: %v", len(config.Mounts), len(mountFds.idmapFds))
}
mountConfig := &mountConfig{ mountConfig := &mountConfig{
root: config.Rootfs, root: config.Rootfs,
label: config.MountLabel, label: config.MountLabel,
@@ -83,22 +77,53 @@ func prepareRootfs(pipe *syncSocket, iConfig *initConfig, mountFds mountFds) (er
rootlessCgroups: iConfig.RootlessCgroups, rootlessCgroups: iConfig.RootlessCgroups,
cgroupns: config.Namespaces.Contains(configs.NEWCGROUP), cgroupns: config.Namespaces.Contains(configs.NEWCGROUP),
} }
for i, m := range config.Mounts { for _, m := range config.Mounts {
entry := mountEntry{Mount: m} entry := mountEntry{Mount: m}
// Just before the loop we checked that if not empty, len(mountFds.sourceFds) == len(config.Mounts). // Figure out whether we need to request runc to give us an
// Therefore, we can access mountFds.sourceFds[i] without any concerns. // open_tree(2)-style mountfd. For idmapped mounts, this is always
if mountFds.sourceFds != nil && mountFds.sourceFds[i] != -1 { // necessary. For bind-mounts, this is only necessary if we cannot
entry.srcFD = &mountFds.sourceFds[i] // resolve the parent mount (this is only hit if you are running in a
// userns -- but for rootless the host-side thread can't help).
wantSourceFile := m.IsIDMapped()
if m.IsBind() && !config.RootlessEUID {
if _, err := os.Stat(m.Source); err != nil {
wantSourceFile = true
} }
// We validated before we can access mountFds.idmapFds[i].
if mountFds.idmapFds != nil && mountFds.idmapFds[i] != -1 {
if entry.srcFD != nil {
return fmt.Errorf("malformed mountFds and idmapFds slice, entry: %v has fds in both slices", i)
} }
entry.srcFD = &mountFds.idmapFds[i] if wantSourceFile {
// Request a source file from the host.
if err := writeSyncArg(pipe, procMountPlease, m); err != nil {
return fmt.Errorf("failed to request mountfd for %q: %w", m.Source, err)
}
sync, err := readSyncFull(pipe, procMountFd)
if err != nil {
return fmt.Errorf("mountfd request for %q failed: %w", m.Source, err)
}
if sync.File == nil {
return fmt.Errorf("mountfd request for %q: response missing attached fd", m.Source)
}
defer sync.File.Close()
// Sanity-check to make sure we didn't get the wrong fd back. Note
// that while m.Source might contain symlinks, the (*os.File).Name
// is based on the path provided to os.OpenFile, not what it
// resolves to. So this should never happen.
if sync.File.Name() != m.Source {
return fmt.Errorf("returned mountfd for %q doesn't match requested mount configuration: mountfd path is %q", m.Source, sync.File.Name())
}
// Unmarshal the procMountFd argument (the file is sync.File).
var src *mountSource
if sync.Arg == nil {
return fmt.Errorf("sync %q is missing an argument", sync.Type)
}
if err := json.Unmarshal(*sync.Arg, &src); err != nil {
return fmt.Errorf("invalid mount fd response argument %q: %w", string(*sync.Arg), err)
}
if src == nil {
return fmt.Errorf("mountfd request for %q: no mount source info received", m.Source)
}
src.file = sync.File
entry.srcFile = src
} }
if err := mountToRootfs(mountConfig, entry); err != nil { if err := mountToRootfs(mountConfig, entry); err != nil {
return fmt.Errorf("error mounting %q to rootfs at %q: %w", m.Source, m.Destination, err) return fmt.Errorf("error mounting %q to rootfs at %q: %w", m.Source, m.Destination, err)
} }
@@ -281,7 +306,7 @@ func mountCgroupV1(m *configs.Mount, c *mountConfig) error {
if err := os.MkdirAll(subsystemPath, 0o755); err != nil { if err := os.MkdirAll(subsystemPath, 0o755); err != nil {
return err return err
} }
if err := utils.WithProcfd(c.root, b.Destination, func(dstFD string) error { if err := utils.WithProcfd(c.root, b.Destination, func(dstFd string) error {
flags := defaultMountFlags flags := defaultMountFlags
if m.Flags&unix.MS_RDONLY != 0 { if m.Flags&unix.MS_RDONLY != 0 {
flags = flags | unix.MS_RDONLY flags = flags | unix.MS_RDONLY
@@ -294,7 +319,7 @@ func mountCgroupV1(m *configs.Mount, c *mountConfig) error {
data = cgroups.CgroupNamePrefix + data data = cgroups.CgroupNamePrefix + data
source = "systemd" source = "systemd"
} }
return mountViaFDs(source, nil, b.Destination, dstFD, "cgroup", uintptr(flags), data) return mountViaFds(source, nil, b.Destination, dstFd, "cgroup", uintptr(flags), data)
}); err != nil { }); err != nil {
return err return err
} }
@@ -325,8 +350,8 @@ func mountCgroupV2(m *configs.Mount, c *mountConfig) error {
if err := os.MkdirAll(dest, 0o755); err != nil { if err := os.MkdirAll(dest, 0o755); err != nil {
return err return err
} }
err = utils.WithProcfd(c.root, m.Destination, func(dstFD string) error { err = utils.WithProcfd(c.root, m.Destination, func(dstFd string) error {
return mountViaFDs(m.Source, nil, m.Destination, dstFD, "cgroup2", uintptr(m.Flags), m.Data) return mountViaFds(m.Source, nil, m.Destination, dstFd, "cgroup2", uintptr(m.Flags), m.Data)
}) })
if err == nil || !(errors.Is(err, unix.EPERM) || errors.Is(err, unix.EBUSY)) { if err == nil || !(errors.Is(err, unix.EPERM) || errors.Is(err, unix.EBUSY)) {
return err return err
@@ -347,7 +372,6 @@ func mountCgroupV2(m *configs.Mount, c *mountConfig) error {
bindM.Source = c.cgroup2Path bindM.Source = c.cgroup2Path
} }
// mountToRootfs() handles remounting for MS_RDONLY. // mountToRootfs() handles remounting for MS_RDONLY.
// No need to set mountEntry.srcFD here, because mountToRootfs() calls utils.WithProcfd() by itself in mountPropagate().
err = mountToRootfs(c, mountEntry{Mount: bindM}) err = mountToRootfs(c, mountEntry{Mount: bindM})
if c.rootlessCgroups && errors.Is(err, unix.ENOENT) { if c.rootlessCgroups && errors.Is(err, unix.ENOENT) {
// ENOENT (for `src = c.cgroup2Path`) happens when rootless runc is being executed // ENOENT (for `src = c.cgroup2Path`) happens when rootless runc is being executed
@@ -392,15 +416,15 @@ func doTmpfsCopyUp(m mountEntry, rootfs, mountLabel string) (Err error) {
} }
}() }()
return utils.WithProcfd(rootfs, m.Destination, func(dstFD string) (Err error) { return utils.WithProcfd(rootfs, m.Destination, func(dstFd string) (Err error) {
// Copy the container data to the host tmpdir. We append "/" to force // Copy the container data to the host tmpdir. We append "/" to force
// CopyDirectory to resolve the symlink rather than trying to copy the // CopyDirectory to resolve the symlink rather than trying to copy the
// symlink itself. // symlink itself.
if err := fileutils.CopyDirectory(dstFD+"/", tmpDir); err != nil { if err := fileutils.CopyDirectory(dstFd+"/", tmpDir); err != nil {
return fmt.Errorf("tmpcopyup: failed to copy %s to %s (%s): %w", m.Destination, dstFD, tmpDir, err) return fmt.Errorf("tmpcopyup: failed to copy %s to %s (%s): %w", m.Destination, dstFd, tmpDir, err)
} }
// Now move the mount into the container. // Now move the mount into the container.
if err := mountViaFDs(tmpDir, nil, m.Destination, dstFD, "", unix.MS_MOVE, ""); err != nil { if err := mountViaFds(tmpDir, nil, m.Destination, dstFd, "", unix.MS_MOVE, ""); err != nil {
return fmt.Errorf("tmpcopyup: failed to move mount: %w", err) return fmt.Errorf("tmpcopyup: failed to move mount: %w", err)
} }
return nil return nil
@@ -522,36 +546,10 @@ func mountToRootfs(c *mountConfig, m mountEntry) error {
if err := prepareBindMount(m, rootfs); err != nil { if err := prepareBindMount(m, rootfs); err != nil {
return err return err
} }
// open_tree()-related shenanigans are all handled in mountViaFds.
if m.IsBind() && m.IsIDMapped() {
if m.srcFD == nil {
return fmt.Errorf("error creating mount %+v: idmapFD is invalid, should point to a valid fd", m)
}
if err := unix.MoveMount(*m.srcFD, "", unix.AT_FDCWD, dest, unix.MOVE_MOUNT_F_EMPTY_PATH); err != nil {
return fmt.Errorf("error on unix.MoveMount %+v: %w", m, err)
}
// In nsexec.c, we did not set the propagation field of mount_attr struct.
// So, let's deal with these flags right now!
if err := utils.WithProcfd(rootfs, dest, func(dstFD string) error {
for _, pflag := range m.PropagationFlags {
// When using mount for setting propagations flags, the source, file
// system type and data arguments are ignored:
// https://man7.org/linux/man-pages/man2/mount.2.html
// We also ignore procfd because we want to act on dest.
if err := mountViaFDs("", nil, dest, dstFD, "", uintptr(pflag), ""); err != nil {
return err
}
}
return nil
}); err != nil {
return fmt.Errorf("change mount propagation through procfd: %w", err)
}
} else {
if err := mountPropagate(m, rootfs, mountLabel); err != nil { if err := mountPropagate(m, rootfs, mountLabel); err != nil {
return err return err
} }
}
// The initial MS_BIND won't change the mount options, we need to do a // The initial MS_BIND won't change the mount options, we need to do a
// separate MS_BIND|MS_REMOUNT to apply the mount options. We skip // separate MS_BIND|MS_REMOUNT to apply the mount options. We skip
@@ -563,7 +561,7 @@ func mountToRootfs(c *mountConfig, m mountEntry) error {
// contrast to mount(8)'s current behaviour, but is what users probably // contrast to mount(8)'s current behaviour, but is what users probably
// expect. See <https://github.com/util-linux/util-linux/issues/2433>. // expect. See <https://github.com/util-linux/util-linux/issues/2433>.
if m.Flags & ^(unix.MS_BIND|unix.MS_REC|unix.MS_REMOUNT) != 0 || m.ClearedFlags != 0 { if m.Flags & ^(unix.MS_BIND|unix.MS_REC|unix.MS_REMOUNT) != 0 || m.ClearedFlags != 0 {
if err := utils.WithProcfd(rootfs, m.Destination, func(dstFD string) error { if err := utils.WithProcfd(rootfs, m.Destination, func(dstFd string) error {
flags := m.Flags | unix.MS_BIND | unix.MS_REMOUNT flags := m.Flags | unix.MS_BIND | unix.MS_REMOUNT
// The runtime-spec says we SHOULD map to the relevant mount(8) // The runtime-spec says we SHOULD map to the relevant mount(8)
// behaviour. However, it's not clear whether we want the // behaviour. However, it's not clear whether we want the
@@ -590,7 +588,7 @@ func mountToRootfs(c *mountConfig, m mountEntry) error {
// different set of flags. This also has the mount(8) bug where // different set of flags. This also has the mount(8) bug where
// "nodiratime,norelatime" will result in a // "nodiratime,norelatime" will result in a
// "nodiratime,relatime" mount. // "nodiratime,relatime" mount.
mountErr := mountViaFDs("", nil, m.Destination, dstFD, "", uintptr(flags), "") mountErr := mountViaFds("", nil, m.Destination, dstFd, "", uintptr(flags), "")
if mountErr == nil { if mountErr == nil {
return nil return nil
} }
@@ -639,7 +637,7 @@ func mountToRootfs(c *mountConfig, m mountEntry) error {
// Retry the mount with the existing lockable mount flags // Retry the mount with the existing lockable mount flags
// applied. // applied.
flags |= srcFlags & mntLockFlags flags |= srcFlags & mntLockFlags
mountErr = mountViaFDs("", nil, m.Destination, dstFD, "", uintptr(flags), "") mountErr = mountViaFds("", nil, m.Destination, dstFd, "", uintptr(flags), "")
logrus.Debugf("remount retry: srcFlags=0x%x flagsSet=0x%x flagsClr=0x%x: %v", srcFlags, m.Flags, m.ClearedFlags, mountErr) logrus.Debugf("remount retry: srcFlags=0x%x flagsSet=0x%x flagsClr=0x%x: %v", srcFlags, m.Flags, m.ClearedFlags, mountErr)
return mountErr return mountErr
}); err != nil { }); err != nil {
@@ -857,8 +855,8 @@ func bindMountDeviceNode(rootfs, dest string, node *devices.Device) error {
if f != nil { if f != nil {
_ = f.Close() _ = f.Close()
} }
return utils.WithProcfd(rootfs, dest, func(dstFD string) error { return utils.WithProcfd(rootfs, dest, func(dstFd string) error {
return mountViaFDs(node.Path, nil, dest, dstFD, "bind", unix.MS_BIND, "") return mountViaFds(node.Path, nil, dest, dstFd, "bind", unix.MS_BIND, "")
}) })
} }
@@ -1251,17 +1249,17 @@ func mountPropagate(m mountEntry, rootfs string, mountLabel string) error {
// mutating underneath us, we verify that we are actually going to mount // mutating underneath us, we verify that we are actually going to mount
// inside the container with WithProcfd() -- mounting through a procfd // inside the container with WithProcfd() -- mounting through a procfd
// mounts on the target. // mounts on the target.
if err := utils.WithProcfd(rootfs, m.Destination, func(dstFD string) error { if err := utils.WithProcfd(rootfs, m.Destination, func(dstFd string) error {
return mountViaFDs(m.Source, m.srcFD, m.Destination, dstFD, m.Device, uintptr(flags), data) return mountViaFds(m.Source, m.srcFile, m.Destination, dstFd, m.Device, uintptr(flags), data)
}); err != nil { }); err != nil {
return err return err
} }
// We have to apply mount propagation flags in a separate WithProcfd() call // We have to apply mount propagation flags in a separate WithProcfd() call
// because the previous call invalidates the passed procfd -- the mount // because the previous call invalidates the passed procfd -- the mount
// target needs to be re-opened. // target needs to be re-opened.
if err := utils.WithProcfd(rootfs, m.Destination, func(dstFD string) error { if err := utils.WithProcfd(rootfs, m.Destination, func(dstFd string) error {
for _, pflag := range m.PropagationFlags { for _, pflag := range m.PropagationFlags {
if err := mountViaFDs("", nil, m.Destination, dstFD, "", uintptr(pflag), ""); err != nil { if err := mountViaFds("", nil, m.Destination, dstFd, "", uintptr(pflag), ""); err != nil {
return err return err
} }
} }

View File

@@ -27,7 +27,6 @@ type linuxStandardInit struct {
fifoFd int fifoFd int
logFd int logFd int
dmzExe *os.File dmzExe *os.File
mountFds mountFds
config *initConfig config *initConfig
} }
@@ -88,17 +87,7 @@ func (l *linuxStandardInit) Init() error {
// initialises the labeling system // initialises the labeling system
selinux.GetEnabled() selinux.GetEnabled()
// We don't need the mount nor idmap fds after prepareRootfs() nor if it fails. err := prepareRootfs(l.pipe, l.config)
err := prepareRootfs(l.pipe, l.config, l.mountFds)
for _, m := range append(l.mountFds.sourceFds, l.mountFds.idmapFds...) {
if m == -1 {
continue
}
if err := unix.Close(m); err != nil {
return fmt.Errorf("unable to close mountFds fds: %w", err)
}
}
if err != nil { if err != nil {
return err return err
} }

View File

@@ -21,6 +21,11 @@ type syncType string
// //
// [ child ] <-> [ parent ] // [ child ] <-> [ parent ]
// //
// procMountPlease --> [open(2) or open_tree(2) and configure mount]
// Arg: configs.Mount
// <-- procMountFd
// file: mountfd
//
// procSeccomp --> [forward fd to listenerPath] // procSeccomp --> [forward fd to listenerPath]
// file: seccomp fd // file: seccomp fd
// --- no return synchronisation // --- no return synchronisation
@@ -39,6 +44,8 @@ const (
procRun syncType = "procRun" procRun syncType = "procRun"
procHooks syncType = "procHooks" procHooks syncType = "procHooks"
procHooksDone syncType = "procHooksDone" procHooksDone syncType = "procHooksDone"
procMountPlease syncType = "procMountPlease"
procMountFd syncType = "procMountFd"
procSeccomp syncType = "procSeccomp" procSeccomp syncType = "procSeccomp"
procSeccompDone syncType = "procSeccompDone" procSeccompDone syncType = "procSeccompDone"
) )

View File

@@ -0,0 +1,153 @@
package userns
import (
"fmt"
"os"
"sort"
"strings"
"sync"
"syscall"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/configs"
)
// Mapping is a set of UID and GID mappings describing a user namespace. It is
// used in this file to spawn a process inside a new user namespace with these
// mappings and to derive an order-independent identifier for them.
type Mapping struct {
	// UIDMappings are the user ID mappings.
	UIDMappings []configs.IDMap
	// GIDMappings are the group ID mappings.
	GIDMappings []configs.IDMap
}
// toSys converts the UID and GID mappings into the syscall.SysProcIDMap form
// accepted by syscall.SysProcAttr. A list with no entries converts to a nil
// slice.
func (m Mapping) toSys() (uids, gids []syscall.SysProcIDMap) {
	convert := func(idMaps []configs.IDMap) []syscall.SysProcIDMap {
		// Deliberately not pre-allocated: an empty input must stay nil.
		var sysMaps []syscall.SysProcIDMap
		for i := range idMaps {
			sysMaps = append(sysMaps, syscall.SysProcIDMap{
				ContainerID: idMaps[i].ContainerID,
				HostID:      idMaps[i].HostID,
				Size:        idMaps[i].Size,
			})
		}
		return sysMaps
	}
	return convert(m.UIDMappings), convert(m.GIDMappings)
}
// id builds a string key identifying this mapping. Each entry is rendered as
// "containerID:hostID:size" and the entries of each list are sorted, so two
// mappings that differ only in the order of their uid or gid entries produce
// the same key (the order doesn't matter to the kernel). The set of userns
// handles is indexed using this ID.
func (m Mapping) id() string {
	render := func(idMaps []configs.IDMap) string {
		var entries []string
		for _, im := range idMaps {
			entries = append(entries, fmt.Sprintf("%d:%d:%d", im.ContainerID, im.HostID, im.Size))
		}
		// The particular order is irrelevant, it only has to be stable.
		sort.Strings(entries)
		return strings.Join(entries, ",")
	}
	return "uid=" + render(m.UIDMappings) + ";gid=" + render(m.GIDMappings)
}
// Handles holds a set of open files guarded by a mutex. Release closes every
// file in the set and resets it.
// NOTE(review): the files are presumably user namespace handles keyed by
// Mapping.id() -- confirm against Get.
type Handles struct {
	// m guards maps.
	m sync.Mutex
	// maps holds the open files by string key.
	maps map[string]*os.File
}
// Release closes every file held by hs and drops the set. Files previously
// returned from Get() will continue to work even after calling Release(), and
// the same Handles can be re-used afterwards.
func (hs *Handles) Release() {
	hs.m.Lock()
	defer hs.m.Unlock()

	// GC would eventually close these for us, but close them explicitly for
	// good measure.
	for key := range hs.maps {
		_ = hs.maps[key].Close()
	}
	hs.maps = nil
}
// spawnProc starts a helper process inside a new user namespace configured
// with the requested mappings and returns it. The caller owns the returned
// process and is responsible for killing and reaping it.
func spawnProc(req Mapping) (*os.Process, error) {
	// Creating a process just to obtain a user namespace with the right
	// mappings is expensive, but the alternatives are worse:
	//
	//  * Spawning a real long-running target (e.g. "sleep infinity") through
	//    Go is the "safe" route, but the execve() is wasted work since we do
	//    not care what the process runs, only which mappings it has.
	//  * Calling clone() behind Go's back and having the child do
	//    kill(getpid(), SIGSTOP) is probably fine in theory, but is "unsafe".
	//
	// Asking Go to put the child into PTRACE_TRACEME mode sidesteps both: we
	// avoid the exec and do not have to faff around with the mappings.
	//
	// Go's stdlib has no newuidmap support. For id-mapped mounts it seems
	// incredibly unlikely that an unprivileged user would ask for a remapping
	// with mappings they have privileges over, so that is not handled.
	//
	// NOTE(review): the syscall.SysProcAttr.Ptrace documentation asks callers
	// to hold runtime.LockOSThread while the child is traced; nothing here
	// does so -- confirm that this is acceptable for this short-lived helper.
	logrus.Debugf("spawning dummy process for id-mapping %s", req.id())

	uidMappings, gidMappings := req.toSys()
	sysAttr := &syscall.SysProcAttr{
		Cloneflags:                 unix.CLONE_NEWUSER,
		UidMappings:                uidMappings,
		GidMappings:                gidMappings,
		GidMappingsEnableSetgroups: false,
		// PTRACE_TRACEME mode lets us grab the userns without needing a
		// proper execve() target.
		Ptrace: true,
	}
	argv := []string{"runc", "--help"}
	return os.StartProcess("/proc/self/exe", argv, &os.ProcAttr{Sys: sysAttr})
}
// dupFile returns a new *os.File backed by a close-on-exec duplicate of f's
// file descriptor, carrying the same name as f. Closing either file does not
// close the other's descriptor.
func dupFile(f *os.File) (*os.File, error) {
	dupFd, dupErr := unix.FcntlInt(f.Fd(), unix.F_DUPFD_CLOEXEC, 0)
	if dupErr == nil {
		return os.NewFile(uintptr(dupFd), f.Name()), nil
	}
	return nil, os.NewSyscallError("fcntl(F_DUPFD_CLOEXEC)", dupErr)
}
// Get returns a handle to a /proc/$pid/ns/user nsfs file with the requested
// mapping. The userns files produced by the spawned helper processes are
// cached, so if equivalent user namespace mappings are requested (see
// Mapping.id), the same user namespace will be returned. The caller is
// responsible for closing the returned file descriptor.
//
// An error is returned if the helper process cannot be spawned, if its userns
// file cannot be opened, or if the cached file cannot be duplicated.
func (hs *Handles) Get(req Mapping) (file *os.File, err error) {
	hs.m.Lock()
	defer hs.m.Unlock()

	if hs.maps == nil {
		hs.maps = make(map[string]*os.File)
	}

	// Compute the cache key once: id() formats and sorts every mapping entry.
	key := req.id()

	file, ok := hs.maps[key]
	if !ok {
		proc, err := spawnProc(req)
		if err != nil {
			return nil, fmt.Errorf("failed to spawn dummy process for map %s: %w", key, err)
		}
		// Make sure we kill the helper process. We ignore errors because
		// there's not much we can do about them anyway, and ultimately all we
		// need from the process is the userns handle opened below.
		defer func() {
			_ = proc.Kill()
			_, _ = proc.Wait()
		}()

		// Stash away a handle to the userns file. This is neater than keeping
		// the process alive, because Go's GC can handle files much better than
		// leaked processes, and having long-living useless processes seems
		// less than ideal.
		file, err = os.Open(fmt.Sprintf("/proc/%d/ns/user", proc.Pid))
		if err != nil {
			return nil, fmt.Errorf("failed to open userns of dummy process for map %s: %w", key, err)
		}
		hs.maps[key] = file
	}
	// Duplicate the file, to make sure the lifecycle of each *os.File we
	// return is independent.
	return dupFile(file)
}

View File

@@ -0,0 +1,52 @@
package userns
import (
"os"
"testing"
"github.com/opencontainers/runc/libcontainer/configs"
)
// BenchmarkSpawnProc measures how long spawnProc takes to start a helper
// process with a five-entry UID mapping and a five-entry GID mapping. It is
// skipped unless running as root. Spawned processes are kept until the
// benchmark finishes and are then killed and reaped in a cleanup hook.
func BenchmarkSpawnProc(b *testing.B) {
	if os.Geteuid() != 0 {
		b.Skip("spawning user namespaced processes requires root")
	}

	// We can reuse the mapping as we call spawnProc() directly.
	mapping := Mapping{
		UIDMappings: []configs.IDMap{
			{ContainerID: 0, HostID: 1337, Size: 142},
			{ContainerID: 150, HostID: 0, Size: 1},
			{ContainerID: 442, HostID: 1111, Size: 12},
			{ContainerID: 1000, HostID: 9999, Size: 92},
			{ContainerID: 9999, HostID: 1000000, Size: 4},
			// Newer kernels support more than 5 entries, but stick to 5 here.
		},
		GIDMappings: []configs.IDMap{
			{ContainerID: 1, HostID: 2337, Size: 142},
			{ContainerID: 145, HostID: 6, Size: 1},
			{ContainerID: 200, HostID: 1000, Size: 12},
			{ContainerID: 1000, HostID: 9888, Size: 92},
			{ContainerID: 8999, HostID: 1000000, Size: 4},
			// Newer kernels support more than 5 entries, but stick to 5 here.
		},
	}

	procs := make([]*os.Process, 0, b.N)
	b.Cleanup(func() {
		// Best-effort teardown: errors from killing or reaping a helper are
		// not actionable in a benchmark.
		for _, proc := range procs {
			_ = proc.Kill()
			_, _ = proc.Wait()
		}
	})

	for i := 0; i < b.N; i++ {
		proc, err := spawnProc(mapping)
		if err != nil {
			// Stop at the first failure: continuing would only repeat the
			// same failing spawn and record nil processes. Cleanup still
			// runs for everything spawned so far.
			b.Fatal(err)
		}
		procs = append(procs, proc)
	}
}