1
0
mirror of https://github.com/opencontainers/runc.git synced 2025-07-13 04:01:45 +03:00

Support idmap mounts on volumes

This commit adds support for idmap mounts as specified in the runtime-spec.

We open the idmap source paths and call mount_setattr() in runc PARENT,
as we need privileges in the init userns for that, and then send the
fds to the child process. For this fd passing we use the same mechanism
used in other parts of the code, the _LIBCONTAINER_ env vars.

The mount is finished (unix.MoveMount) from go code, inside the userns,
so we reuse all the prepareBindMount() security checks and the remount
logic for some flags too.

This commit only supports idmap mounts when a userns is used AND the mappings
are the same as those specified for the userns. This limitation is to
simplify the initial implementation, as all our users so far only need
this, and we can avoid sending over netlink the mappings, creating a
userns with this custom mapping, etc. Future PRs will remove this
limitation.

Co-authored-by: Francis Laniel <flaniel@linux.microsoft.com>
Signed-off-by: Rodrigo Campos <rodrigoca@microsoft.com>
This commit is contained in:
Rodrigo Campos
2023-04-03 15:52:35 +02:00
parent fe4528b176
commit fda12ab101
6 changed files with 249 additions and 14 deletions

View File

@ -541,6 +541,38 @@ func (c *Container) shouldSendMountSources() bool {
return false return false
} }
// shouldSendIdmapSources reports whether the child process has to set up
// idmap mounts whose mount_setattr was already performed in the host user
// namespace.
func (c *Container) shouldSendIdmapSources() bool {
	// The mount_setattr() call made from nsexec.c needs CAP_SYS_ADMIN in:
	//  * the user namespace the filesystem was mounted in;
	//  * the user namespace we're trying to idmap the mount to;
	//  * the owning user namespace of the mount namespace you're currently located in.
	//
	// See the comment from Christian Brauner:
	// https://github.com/opencontainers/runc/pull/3717#discussion_r1103607972
	//
	// Rootless is ruled out because we don't have those permissions there,
	// and, for the time being, a user namespace must be in use.
	if c.config.RootlessEUID || !c.config.Namespaces.Contains(configs.NEWUSER) {
		return false
	}

	// Sources only have to be sent when at least one bind-mount is idmapped.
	for _, mnt := range c.config.Mounts {
		if !mnt.IsBind() {
			continue
		}
		if mnt.IsIDMapped() {
			return true
		}
	}
	return false
}
func (c *Container) sendMountSources(cmd *exec.Cmd, messageSockPair filePair) error { func (c *Container) sendMountSources(cmd *exec.Cmd, messageSockPair filePair) error {
if !c.shouldSendMountSources() { if !c.shouldSendMountSources() {
return nil return nil
@ -551,6 +583,16 @@ func (c *Container) sendMountSources(cmd *exec.Cmd, messageSockPair filePair) er
}) })
} }
// sendIdmapSources hands the idmap mount sources over to sendFdsSources under
// the _LIBCONTAINER_IDMAP_FDS env var, selecting only idmapped bind-mounts.
// It does nothing (and returns nil) when shouldSendIdmapSources says no
// idmap sources need to be sent.
func (c *Container) sendIdmapSources(cmd *exec.Cmd, messageSockPair filePair) error {
	if !c.shouldSendIdmapSources() {
		return nil
	}

	// Only bind-mounts that are idmapped take part in the fd passing.
	isIdmapBind := func(mnt *configs.Mount) bool {
		return mnt.IsBind() && mnt.IsIDMapped()
	}
	return c.sendFdsSources(cmd, messageSockPair, "_LIBCONTAINER_IDMAP_FDS", isIdmapBind)
}
func (c *Container) sendFdsSources(cmd *exec.Cmd, messageSockPair filePair, envVar string, condition func(*configs.Mount) bool) error { func (c *Container) sendFdsSources(cmd *exec.Cmd, messageSockPair filePair, envVar string, condition func(*configs.Mount) bool) error {
// Elements on these slices will be paired with mounts (see StartInitialization() and // Elements on these slices will be paired with mounts (see StartInitialization() and
// prepareRootfs()). These slices MUST have the same size as c.config.Mounts. // prepareRootfs()). These slices MUST have the same size as c.config.Mounts.
@ -592,6 +634,9 @@ func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, l
if err := c.sendMountSources(cmd, messageSockPair); err != nil { if err := c.sendMountSources(cmd, messageSockPair); err != nil {
return nil, err return nil, err
} }
if err := c.sendIdmapSources(cmd, messageSockPair); err != nil {
return nil, err
}
init := &initProcess{ init := &initProcess{
cmd: cmd, cmd: cmd,
@ -2256,6 +2301,29 @@ func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Namespa
}) })
} }
// Idmap mount sources to open.
if it == initStandard && c.shouldSendIdmapSources() {
var mounts []byte
for _, m := range c.config.Mounts {
if m.IsBind() && m.IsIDMapped() {
// While other parts of the code check this too (like
// libcontainer/specconv/spec_linux.go) we do it here also because some libcontainer
// users don't use those functions.
if strings.IndexByte(m.Source, 0) >= 0 {
return nil, fmt.Errorf("mount source string contains null byte: %q", m.Source)
}
mounts = append(mounts, []byte(m.Source)...)
}
mounts = append(mounts, byte(0))
}
r.AddData(&Bytemsg{
Type: IdmapSourcesAttr,
Value: mounts,
})
}
return bytes.NewReader(r.Serialize()), nil return bytes.NewReader(r.Serialize()), nil
} }

View File

@ -48,12 +48,15 @@ type network struct {
} }
type mountFds struct { type mountFds struct {
// Fds to use as source when mounting // sourceFds are the fds to use as source when mounting.
// Size should be the same as container mounts, as it will be paired. // The slice size should be the same as container mounts, as it will be
// paired with them.
// The value -1 is used when no fd is needed for the mount. // The value -1 is used when no fd is needed for the mount.
// Can't have a valid fd in the same position that other slices in this struct. // Can't have a valid fd in the same position that other slices in this struct.
// We need to use only one of these fds on any single mount. // We need to use only one of these fds on any single mount.
sourceFds []int sourceFds []int
// Idem sourceFds, but fds of already created idmap mounts, to use with unix.MoveMount().
idmapFds []int
} }
// initConfig is used for transferring parameters from Exec() to Init() // initConfig is used for transferring parameters from Exec() to Init()
@ -142,6 +145,12 @@ func StartInitialization() (retErr error) {
return err return err
} }
// Get idmap fds.
idmapFds, err := parseFdsFromEnv("_LIBCONTAINER_IDMAP_FDS")
if err != nil {
return err
}
// clear the current process's environment to clean any libcontainer // clear the current process's environment to clean any libcontainer
// specific env vars. // specific env vars.
os.Clearenv() os.Clearenv()
@ -157,7 +166,7 @@ func StartInitialization() (retErr error) {
}() }()
// If init succeeds, it will not return, hence none of the defers will be called. // If init succeeds, it will not return, hence none of the defers will be called.
return containerInit(it, pipe, consoleSocket, fifofd, logPipeFd, mountFds{sourceFds: mountSrcFds}) return containerInit(it, pipe, consoleSocket, fifofd, logPipeFd, mountFds{sourceFds: mountSrcFds, idmapFds: idmapFds})
} }
func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int, mountFds mountFds) error { func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int, mountFds mountFds) error {
@ -170,9 +179,9 @@ func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, lo
} }
switch t { switch t {
case initSetns: case initSetns:
// mountFds must be nil in this case. We don't mount while doing runc exec. // mount and idmap fds must be nil in this case. We don't mount while doing runc exec.
if mountFds.sourceFds != nil { if mountFds.sourceFds != nil || mountFds.idmapFds != nil {
return errors.New("mount source fds must be nil; can't mount from exec") return errors.New("mount and idmap fds must be nil; can't mount from exec")
} }
i := &linuxSetnsInit{ i := &linuxSetnsInit{

View File

@ -22,6 +22,7 @@ const (
UidmapPathAttr uint16 = 27288 UidmapPathAttr uint16 = 27288
GidmapPathAttr uint16 = 27289 GidmapPathAttr uint16 = 27289
MountSourcesAttr uint16 = 27290 MountSourcesAttr uint16 = 27290
IdmapSourcesAttr uint16 = 27291
) )
type Int32msg struct { type Int32msg struct {

View File

@ -33,6 +33,9 @@
/* Get all of the CLONE_NEW* flags. */ /* Get all of the CLONE_NEW* flags. */
#include "namespace.h" #include "namespace.h"
/* Get definitions for idmap sources */
#include "idmap.h"
/* Synchronisation values. */ /* Synchronisation values. */
enum sync_t { enum sync_t {
SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */ SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */
@ -43,6 +46,8 @@ enum sync_t {
SYNC_CHILD_FINISH = 0x45, /* The child or grandchild has finished. */ SYNC_CHILD_FINISH = 0x45, /* The child or grandchild has finished. */
SYNC_MOUNTSOURCES_PLS = 0x46, /* Tell parent to send mount sources by SCM_RIGHTS. */ SYNC_MOUNTSOURCES_PLS = 0x46, /* Tell parent to send mount sources by SCM_RIGHTS. */
SYNC_MOUNTSOURCES_ACK = 0x47, /* All mount sources have been sent. */ SYNC_MOUNTSOURCES_ACK = 0x47, /* All mount sources have been sent. */
SYNC_MOUNT_IDMAP_PLS = 0x48, /* Tell parent to mount idmap sources. */
SYNC_MOUNT_IDMAP_ACK = 0x49, /* All idmap mounts have been done. */
}; };
#define STAGE_SETUP -1 #define STAGE_SETUP -1
@ -95,6 +100,10 @@ struct nlconfig_t {
/* Mount sources opened outside the container userns. */ /* Mount sources opened outside the container userns. */
char *mountsources; char *mountsources;
size_t mountsources_len; size_t mountsources_len;
/* Idmap sources opened outside the container userns which will be id mapped. */
char *idmapsources;
size_t idmapsources_len;
}; };
/* /*
@ -112,6 +121,7 @@ struct nlconfig_t {
#define UIDMAPPATH_ATTR 27288 #define UIDMAPPATH_ATTR 27288
#define GIDMAPPATH_ATTR 27289 #define GIDMAPPATH_ATTR 27289
#define MOUNT_SOURCES_ATTR 27290 #define MOUNT_SOURCES_ATTR 27290
#define IDMAP_SOURCES_ATTR 27291
/* /*
* Use the raw syscall for versions of glibc which don't include a function for * Use the raw syscall for versions of glibc which don't include a function for
@ -431,6 +441,10 @@ static void nl_parse(int fd, struct nlconfig_t *config)
config->mountsources = current; config->mountsources = current;
config->mountsources_len = payload_len; config->mountsources_len = payload_len;
break; break;
case IDMAP_SOURCES_ATTR:
config->idmapsources = current;
config->idmapsources_len = payload_len;
break;
default: default:
bail("unknown netlink message type %d", nlattr->nla_type); bail("unknown netlink message type %d", nlattr->nla_type);
} }
@ -650,6 +664,83 @@ void try_unshare(int flags, const char *msg)
bail("failed to unshare %s", msg); bail("failed to unshare %s", msg);
} }
/*
 * Kill the child with SIGKILL without disturbing errno.
 *
 * kill(2) overwrites errno when it fails, so calling sane_kill() directly
 * between a failed syscall and the inspection of its errno can hide the real
 * error. Saving and restoring errno keeps the original failure visible to the
 * errno checks in the caller (and to any errno reporting done when bailing).
 */
static void kill_child_keep_errno(pid_t pid)
{
	int saved_errno = errno;

	sane_kill(pid, SIGKILL);
	errno = saved_errno;
}

/*
 * Create an ID-mapped mount for each idmap source and send its fd on sockfd.
 *
 * sockfd:        socket the mount fds are sent over (through send_fd()).
 * pid:           child process whose user namespace (/proc/<pid>/ns/user)
 *                provides the ID mapping; it is killed with SIGKILL before
 *                bailing on any error.
 * idmap_src:     buffer of NUL-terminated source paths; empty entries are
 *                skipped and no fd is sent for them.
 * idmap_src_len: length in bytes of idmap_src.
 */
void send_idmapsources(int sockfd, pid_t pid, char *idmap_src, int idmap_src_len)
{
	char proc_user_path[PATH_MAX];

	/* Open the userns fd only once.
	 * Currently we only support idmap mounts that use the same mapping as
	 * the userns. This is validated in libcontainer/configs/validate/validator.go,
	 * so if we reached here, we know the mapping for the idmap is the same
	 * as the userns. This is why we just open the userns_fd once from the
	 * PID of the child process that has the userns already applied.
	 */
	int ret = snprintf(proc_user_path, sizeof(proc_user_path), "/proc/%d/ns/user", pid);
	if (ret < 0 || (size_t)ret >= sizeof(proc_user_path)) {
		kill_child_keep_errno(pid);
		bail("failed to create userns path string");
	}

	int userns_fd = open(proc_user_path, O_RDONLY | O_CLOEXEC | O_NOCTTY);
	if (userns_fd < 0) {
		kill_child_keep_errno(pid);
		bail("failed to get user namespace fd");
	}

	char *idmap_end = idmap_src + idmap_src_len;
	while (idmap_src < idmap_end) {
		/* An empty entry means this mount has no idmap source. */
		if (idmap_src[0] == '\0') {
			idmap_src++;
			continue;
		}

		int fd_tree = sys_open_tree(-EBADF, idmap_src,
					    OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC |
					    AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT);
		if (fd_tree < 0) {
			/* errno must still be the one from open_tree(2) below. */
			kill_child_keep_errno(pid);
			if (errno == EINVAL)
				bail("failed to use open_tree(2) with path: %s, the kernel doesn't supports ID-mapped mounts", idmap_src);
			else
				bail("failed to use open_tree(2) with path: %s", idmap_src);
		}

		struct mount_attr attr = {
			.attr_set = MOUNT_ATTR_IDMAP,
			.userns_fd = userns_fd,
		};

		ret = sys_mount_setattr(fd_tree, "", AT_EMPTY_PATH, &attr, sizeof(attr));
		if (ret < 0) {
			/* errno must still be the one from mount_setattr(2) below. */
			kill_child_keep_errno(pid);
			if (errno == EINVAL)
				bail("failed to change mount attributes, maybe the filesystem doesn't supports ID-mapped mounts");
			else
				bail("failed to change mount attributes");
		}

		write_log(DEBUG, "~> sending idmap source: %s with mapping from: %s", idmap_src, proc_user_path);
		send_fd(sockfd, fd_tree);

		if (close(fd_tree) < 0) {
			kill_child_keep_errno(pid);
			bail("error closing fd_tree");
		}

		/* Skip past this path and its terminating NUL. */
		idmap_src += strlen(idmap_src) + 1;
	}

	if (close(userns_fd) < 0) {
		kill_child_keep_errno(pid);
		bail("error closing userns fd");
	}
}
/*
 * Receive the idmap mount fds on sockfd by delegating to receive_fd_sources()
 * with the _LIBCONTAINER_IDMAP_FDS env var name.
 */
void receive_idmapsources(int sockfd)
{
	receive_fd_sources(sockfd, "_LIBCONTAINER_IDMAP_FDS");
}
void nsexec(void) void nsexec(void)
{ {
int pipenum; int pipenum;
@ -891,6 +982,17 @@ void nsexec(void)
sane_kill(stage1_pid, SIGKILL); sane_kill(stage1_pid, SIGKILL);
bail("failed to sync with child: write(SYNC_MOUNTSOURCES_ACK)"); bail("failed to sync with child: write(SYNC_MOUNTSOURCES_ACK)");
} }
break;
case SYNC_MOUNT_IDMAP_PLS:
write_log(DEBUG, "stage-1 requested to open idmap sources");
send_idmapsources(syncfd, stage1_pid, config.idmapsources,
config.idmapsources_len);
s = SYNC_MOUNT_IDMAP_ACK;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
sane_kill(stage1_pid, SIGKILL);
bail("failed to sync with child: write(SYNC_MOUNT_IDMAP_ACK)");
}
break; break;
case SYNC_CHILD_FINISH: case SYNC_CHILD_FINISH:
write_log(DEBUG, "stage-1 complete"); write_log(DEBUG, "stage-1 complete");
@ -1062,6 +1164,21 @@ void nsexec(void)
bail("failed to sync with parent: SYNC_MOUNTSOURCES_ACK: got %u", s); bail("failed to sync with parent: SYNC_MOUNTSOURCES_ACK: got %u", s);
} }
if (config.idmapsources) {
write_log(DEBUG, "request stage-0 to send idmap sources");
s = SYNC_MOUNT_IDMAP_PLS;
if (write(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with parent: write(SYNC_MOUNT_IDMAP_PLS)");
/* Receive and install all idmap fds. */
receive_idmapsources(syncfd);
if (read(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with parent: read(SYNC_MOUNT_IDMAP_ACK)");
if (s != SYNC_MOUNT_IDMAP_ACK)
bail("failed to sync with parent: SYNC_MOUNT_IDMAP_ACK: got %u", s);
}
/* /*
* TODO: What about non-namespace clone flags that we're dropping here? * TODO: What about non-namespace clone flags that we're dropping here?
* *

View File

@ -40,7 +40,8 @@ type mountConfig struct {
// mountEntry contains mount data specific to a mount point. // mountEntry contains mount data specific to a mount point.
type mountEntry struct { type mountEntry struct {
*configs.Mount *configs.Mount
srcFD string srcFD string
idmapFD int
} }
func (m *mountEntry) src() string { func (m *mountEntry) src() string {
@ -73,6 +74,10 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig, mountFds mountFds) (
return fmt.Errorf("malformed mountFds slice. Expected size: %v, got: %v", len(config.Mounts), len(mountFds.sourceFds)) return fmt.Errorf("malformed mountFds slice. Expected size: %v, got: %v", len(config.Mounts), len(mountFds.sourceFds))
} }
if mountFds.idmapFds != nil && len(mountFds.idmapFds) != len(config.Mounts) {
return fmt.Errorf("malformed idmapFds slice: expected size: %v, got: %v", len(config.Mounts), len(mountFds.idmapFds))
}
mountConfig := &mountConfig{ mountConfig := &mountConfig{
root: config.Rootfs, root: config.Rootfs,
label: config.MountLabel, label: config.MountLabel,
@ -81,13 +86,22 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig, mountFds mountFds) (
cgroupns: config.Namespaces.Contains(configs.NEWCGROUP), cgroupns: config.Namespaces.Contains(configs.NEWCGROUP),
} }
for i, m := range config.Mounts { for i, m := range config.Mounts {
entry := mountEntry{Mount: m} entry := mountEntry{Mount: m, idmapFD: -1}
// Just before the loop we checked that if not empty, len(mountFds) == len(config.Mounts). // Just before the loop we checked that if not empty, len(mountFds) == len(config.Mounts).
// Therefore, we can access mountFds[i] without any concerns. // Therefore, we can access mountFds[i] without any concerns.
if mountFds.sourceFds != nil && mountFds.sourceFds[i] != -1 { if mountFds.sourceFds != nil && mountFds.sourceFds[i] != -1 {
entry.srcFD = "/proc/self/fd/" + strconv.Itoa(mountFds.sourceFds[i]) entry.srcFD = "/proc/self/fd/" + strconv.Itoa(mountFds.sourceFds[i])
} }
// We validated before we can access idmapFds[i].
if mountFds.idmapFds != nil && mountFds.idmapFds[i] != -1 {
entry.idmapFD = mountFds.idmapFds[i]
}
if entry.idmapFD != -1 && entry.srcFD != "" {
return fmt.Errorf("malformed mountFds and idmapFds slice, entry: %v has fds in both slices", i)
}
if err := mountToRootfs(mountConfig, entry); err != nil { if err := mountToRootfs(mountConfig, entry); err != nil {
return fmt.Errorf("error mounting %q to rootfs at %q: %w", m.Source, m.Destination, err) return fmt.Errorf("error mounting %q to rootfs at %q: %w", m.Source, m.Destination, err)
} }
@ -466,8 +480,35 @@ func mountToRootfs(c *mountConfig, m mountEntry) error {
if err := prepareBindMount(m, rootfs); err != nil { if err := prepareBindMount(m, rootfs); err != nil {
return err return err
} }
if err := mountPropagate(m, rootfs, mountLabel); err != nil {
return err if m.IsBind() && m.IsIDMapped() {
if m.idmapFD == -1 {
return fmt.Errorf("error creating mount %+v: idmapFD is invalid, should point to a valid fd", m)
}
if err := unix.MoveMount(m.idmapFD, "", -1, dest, unix.MOVE_MOUNT_F_EMPTY_PATH); err != nil {
return fmt.Errorf("error on unix.MoveMount %+v: %w", m, err)
}
// In nsexec.c, we did not set the propagation field of mount_attr struct.
// So, let's deal with these flags right now!
if err := utils.WithProcfd(rootfs, dest, func(dstFD string) error {
for _, pflag := range m.PropagationFlags {
// When using mount for setting propagations flags, the source, file
// system type and data arguments are ignored:
// https://man7.org/linux/man-pages/man2/mount.2.html
// We also ignore procfd because we want to act on dest.
if err := mountViaFDs("", "", dest, dstFD, "", uintptr(pflag), ""); err != nil {
return err
}
}
return nil
}); err != nil {
return fmt.Errorf("change mount propagation through procfd: %w", err)
}
} else {
if err := mountPropagate(m, rootfs, mountLabel); err != nil {
return err
}
} }
// bind mount won't change mount options, we need remount to make mount options effective. // bind mount won't change mount options, we need remount to make mount options effective.
// first check that we have non-default options required before attempting a remount // first check that we have non-default options required before attempting a remount

View File

@ -86,15 +86,14 @@ func (l *linuxStandardInit) Init() error {
// initialises the labeling system // initialises the labeling system
selinux.GetEnabled() selinux.GetEnabled()
// We don't need the mountFds.SourceFds after prepareRootfs() nor if it fails. // We don't need the mount nor idmap fds after prepareRootfs() nor if it fails.
err := prepareRootfs(l.pipe, l.config, l.mountFds) err := prepareRootfs(l.pipe, l.config, l.mountFds)
for _, m := range l.mountFds.sourceFds { for _, m := range append(l.mountFds.sourceFds, l.mountFds.idmapFds...) {
if m == -1 { if m == -1 {
continue continue
} }
if err := unix.Close(m); err != nil { if err := unix.Close(m); err != nil {
return fmt.Errorf("unable to close mount sourceFds: %w", err) return fmt.Errorf("unable to close mountFds fds: %w", err)
} }
} }