1
0
mirror of https://github.com/opencontainers/runc.git synced 2025-11-09 13:00:56 +03:00

nsenter: improve debug logging

In order to make 'runc --debug' actually useful for debugging nsexec
bugs, provide information about all the internal operations when in
debug mode.

[@kolyshkin: rebasing; fix formatting via indent for make validate to pass]

Signed-off-by: Aleksa Sarai <asarai@suse.de>
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
This commit is contained in:
Aleksa Sarai
2020-06-08 01:24:30 +10:00
committed by Kir Kolyshkin
parent bb28c44f12
commit 64bb59f592
4 changed files with 186 additions and 114 deletions

View File

@@ -35,8 +35,8 @@ const (
) )
type pid struct { type pid struct {
Pid int `json:"pid"` Pid int `json:"stage2_pid"`
PidFirstChild int `json:"pid_first"` PidFirstChild int `json:"stage1_pid"`
} }
// network is an internal struct used to setup container networks. // network is an internal struct used to setup container networks.

View File

@@ -36,7 +36,7 @@ enum sync_t {
SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */ SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */
SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */ SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */
SYNC_GRANDCHILD = 0x44, /* The grandchild is ready to run. */ SYNC_GRANDCHILD = 0x44, /* The grandchild is ready to run. */
SYNC_CHILD_READY = 0x45, /* The child or grandchild is ready to return. */ SYNC_CHILD_FINISH = 0x45, /* The child or grandchild has finished. */
}; };
/* /*
@@ -45,10 +45,14 @@ enum sync_t {
*/ */
#define CREATECGROUPNS 0x80 #define CREATECGROUPNS 0x80
#define STAGE_SETUP -1
/* longjmp() arguments. */ /* longjmp() arguments. */
#define JUMP_PARENT 0x00 #define STAGE_PARENT 0
#define JUMP_CHILD 0xA0 #define STAGE_CHILD 1
#define JUMP_INIT 0xA1 #define STAGE_INIT 2
/* Stores the current stage of nsexec. */
int current_stage = STAGE_SETUP;
/* Assume the stack grows down, so arguments should be above it. */ /* Assume the stack grows down, so arguments should be above it. */
struct clone_t { struct clone_t {
@@ -134,33 +138,41 @@ int setns(int fd, int nstype)
} }
#endif #endif
static void write_log_with_info(const char *level, const char *function, int line, const char *format, ...) static void write_log(const char *level, const char *format, ...)
{ {
char message[1024] = { }; char *message = NULL, *stage = NULL;
va_list args; va_list args;
int ret;
if (logfd < 0 || level == NULL) if (logfd < 0 || level == NULL)
return; goto out;
va_start(args, format); va_start(args, format);
if (vsnprintf(message, sizeof(message), format, args) < 0) ret = vasprintf(&message, format, args);
goto done;
dprintf(logfd, "{\"level\":\"%s\", \"msg\": \"%s:%d %s\"}\n", level, function, line, message);
done:
va_end(args); va_end(args);
} if (ret < 0)
goto out;
#define write_log(level, fmt, ...) \ if (current_stage == STAGE_SETUP)
write_log_with_info((level), __FUNCTION__, __LINE__, (fmt), ##__VA_ARGS__) stage = strdup("nsexec");
else
ret = asprintf(&stage, "nsexec-%d", current_stage);
if (ret < 0)
goto out;
dprintf(logfd, "{\"level\":\"%s\", \"msg\": \"%s[%d]: %s\"}\n", level, stage, getpid(), message);
out:
free(message);
free(stage);
}
/* XXX: This is ugly. */ /* XXX: This is ugly. */
static int syncfd = -1; static int syncfd = -1;
#define bail(fmt, ...) \ #define bail(fmt, ...) \
do { \ do { \
write_log(FATAL, "nsenter: " fmt ": %m", ##__VA_ARGS__); \ write_log(FATAL, fmt ": %m", ##__VA_ARGS__); \
exit(1); \ exit(1); \
} while(0) } while(0)
@@ -297,9 +309,11 @@ static void update_uidmap(const char *path, int pid, char *map, size_t map_len)
if (map == NULL || map_len <= 0) if (map == NULL || map_len <= 0)
return; return;
write_log(DEBUG, "update /proc/%d/uid_map to '%s'", pid, map);
if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0) { if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0) {
if (errno != EPERM) if (errno != EPERM)
bail("failed to update /proc/%d/uid_map", pid); bail("failed to update /proc/%d/uid_map", pid);
write_log(DEBUG, "update /proc/%d/uid_map got -EPERM (trying %s)", pid, path);
if (try_mapping_tool(path, pid, map, map_len)) if (try_mapping_tool(path, pid, map, map_len))
bail("failed to use newuid map on %d", pid); bail("failed to use newuid map on %d", pid);
} }
@@ -310,9 +324,11 @@ static void update_gidmap(const char *path, int pid, char *map, size_t map_len)
if (map == NULL || map_len <= 0) if (map == NULL || map_len <= 0)
return; return;
write_log(DEBUG, "update /proc/%d/gid_map to '%s'", pid, map);
if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0) { if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0) {
if (errno != EPERM) if (errno != EPERM)
bail("failed to update /proc/%d/gid_map", pid); bail("failed to update /proc/%d/gid_map", pid);
write_log(DEBUG, "update /proc/%d/gid_map got -EPERM (trying %s)", pid, path);
if (try_mapping_tool(path, pid, map, map_len)) if (try_mapping_tool(path, pid, map, map_len))
bail("failed to use newgid map on %d", pid); bail("failed to use newgid map on %d", pid);
} }
@@ -323,6 +339,7 @@ static void update_oom_score_adj(char *data, size_t len)
if (data == NULL || len <= 0) if (data == NULL || len <= 0)
return; return;
write_log(DEBUG, "update /proc/self/oom_score_adj to '%s'", data);
if (write_file(data, len, "/proc/self/oom_score_adj") < 0) if (write_file(data, len, "/proc/self/oom_score_adj") < 0)
bail("failed to update /proc/self/oom_score_adj"); bail("failed to update /proc/self/oom_score_adj");
} }
@@ -507,7 +524,6 @@ void join_namespaces(char *nslist)
char *namespace = strtok_r(nslist, ",", &saveptr); char *namespace = strtok_r(nslist, ",", &saveptr);
struct namespace_t { struct namespace_t {
int fd; int fd;
int ns;
char type[PATH_MAX]; char type[PATH_MAX];
char path[PATH_MAX]; char path[PATH_MAX];
} *namespaces = NULL; } *namespaces = NULL;
@@ -542,7 +558,7 @@ void join_namespaces(char *nslist)
bail("failed to open %s", path); bail("failed to open %s", path);
ns->fd = fd; ns->fd = fd;
ns->ns = nsflag(namespace); strncpy(ns->type, namespace, PATH_MAX - 1);
strncpy(ns->path, path, PATH_MAX - 1); strncpy(ns->path, path, PATH_MAX - 1);
ns->path[PATH_MAX - 1] = '\0'; ns->path[PATH_MAX - 1] = '\0';
} while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL); } while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);
@@ -555,12 +571,14 @@ void join_namespaces(char *nslist)
*/ */
for (i = 0; i < num; i++) { for (i = 0; i < num; i++) {
struct namespace_t ns = namespaces[i]; struct namespace_t *ns = &namespaces[i];
int flag = nsflag(ns->type);
if (setns(ns.fd, ns.ns) < 0) write_log(DEBUG, "setns(%#x) into %s namespace (with path %s)", flag, ns->type, ns->path);
bail("failed to setns to %s", ns.path); if (setns(ns->fd, flag) < 0)
bail("failed to setns into %s namespace", ns->type);
close(ns.fd); close(ns->fd);
} }
free(namespaces); free(namespaces);
@@ -569,6 +587,14 @@ void join_namespaces(char *nslist)
/* Defined in cloned_binary.c. */ /* Defined in cloned_binary.c. */
extern int ensure_cloned_binary(void); extern int ensure_cloned_binary(void);
static inline int sane_kill(pid_t pid, int signum)
{
if (pid > 0)
return kill(pid, signum);
else
return 0;
}
void nsexec(void) void nsexec(void)
{ {
int pipenum; int pipenum;
@@ -605,7 +631,7 @@ void nsexec(void)
if (write(pipenum, "", 1) != 1) if (write(pipenum, "", 1) != 1)
bail("could not inform the parent we are past initial setup"); bail("could not inform the parent we are past initial setup");
write_log(DEBUG, "nsexec started"); write_log(DEBUG, "=> nsexec container setup");
/* Parse all of the netlink configuration. */ /* Parse all of the netlink configuration. */
nl_parse(pipenum, &config); nl_parse(pipenum, &config);
@@ -629,6 +655,7 @@ void nsexec(void)
* containers), which is the recommendation from the kernel folks. * containers), which is the recommendation from the kernel folks.
*/ */
if (config.namespaces) { if (config.namespaces) {
write_log(DEBUG, "set process as non-dumpable");
if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0) if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
bail("failed to set process as non-dumpable"); bail("failed to set process as non-dumpable");
} }
@@ -693,45 +720,49 @@ void nsexec(void)
* -- Aleksa "what has my life come to?" Sarai * -- Aleksa "what has my life come to?" Sarai
*/ */
switch (setjmp(env)) { current_stage = setjmp(env);
switch (current_stage) {
/* /*
* Stage 0: We're in the parent. Our job is just to create a new child * Stage 0: We're in the parent. Our job is just to create a new child
* (stage 1: JUMP_CHILD) process and write its uid_map and * (stage 1: STAGE_CHILD) process and write its uid_map and
* gid_map. That process will go on to create a new process, then * gid_map. That process will go on to create a new process, then
* it will send us its PID which we will send to the bootstrap * it will send us its PID which we will send to the bootstrap
* process. * process.
*/ */
case JUMP_PARENT:{ case STAGE_PARENT:{
int len; int len;
pid_t child, first_child = -1; pid_t stage1_pid = -1, stage2_pid = -1;
bool ready = false; bool stage1_complete, stage2_complete;
/* For debugging. */ /* For debugging. */
prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0); prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0);
write_log(DEBUG, "~> nsexec stage-0");
/* Start the process of getting a container. */ /* Start the process of getting a container. */
child = clone_parent(&env, JUMP_CHILD); write_log(DEBUG, "spawn stage-1");
if (child < 0) stage1_pid = clone_parent(&env, STAGE_CHILD);
bail("unable to fork: child_func"); if (stage1_pid < 0)
bail("unable to spawn stage-1");
/*
* State machine for synchronisation with the children.
*
* Father only return when both child and grandchild are
* ready, so we can receive all possible error codes
* generated by children.
*/
syncfd = sync_child_pipe[1]; syncfd = sync_child_pipe[1];
close(sync_child_pipe[0]); close(sync_child_pipe[0]);
while (!ready) { /*
* State machine for synchronisation with the children. We only
* return once both the child and grandchild are ready.
*/
write_log(DEBUG, "-> stage-1 synchronisation loop");
stage1_complete = false;
while (!stage1_complete) {
enum sync_t s; enum sync_t s;
if (read(syncfd, &s, sizeof(s)) != sizeof(s)) if (read(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with child: next state"); bail("failed to sync with stage-1: next state");
switch (s) { switch (s) {
case SYNC_USERMAP_PLS: case SYNC_USERMAP_PLS:
write_log(DEBUG, "stage-1 requested userns mappings");
/* /*
* Enable setgroups(2) if we've been asked to. But we also * Enable setgroups(2) if we've been asked to. But we also
* have to explicitly disable setgroups(2) if we're * have to explicitly disable setgroups(2) if we're
@@ -742,70 +773,78 @@ void nsexec(void)
* For rootless multi-entry mapping, config.is_setgroup shall be true and * For rootless multi-entry mapping, config.is_setgroup shall be true and
* newuidmap/newgidmap shall be used. * newuidmap/newgidmap shall be used.
*/ */
if (config.is_rootless_euid && !config.is_setgroup) if (config.is_rootless_euid && !config.is_setgroup)
update_setgroups(child, SETGROUPS_DENY); update_setgroups(stage1_pid, SETGROUPS_DENY);
/* Set up mappings. */ /* Set up mappings. */
update_uidmap(config.uidmappath, child, config.uidmap, config.uidmap_len); update_uidmap(config.uidmappath, stage1_pid, config.uidmap, config.uidmap_len);
update_gidmap(config.gidmappath, child, config.gidmap, config.gidmap_len); update_gidmap(config.gidmappath, stage1_pid, config.gidmap, config.gidmap_len);
s = SYNC_USERMAP_ACK; s = SYNC_USERMAP_ACK;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(child, SIGKILL); sane_kill(stage1_pid, SIGKILL);
bail("failed to sync with child: write(SYNC_USERMAP_ACK)"); sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with stage-1: write(SYNC_USERMAP_ACK)");
} }
break; break;
case SYNC_RECVPID_PLS: case SYNC_RECVPID_PLS:
first_child = child; write_log(DEBUG, "stage-1 requested pid to be forwarded");
/* Get the init_func pid. */ /* Get the stage-2 pid. */
if (read(syncfd, &child, sizeof(child)) != sizeof(child)) { if (read(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) {
kill(first_child, SIGKILL); sane_kill(stage1_pid, SIGKILL);
bail("failed to sync with child: read(childpid)"); sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with stage-1: read(stage2_pid)");
} }
/* Send ACK. */ /* Send ACK. */
s = SYNC_RECVPID_ACK; s = SYNC_RECVPID_ACK;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(first_child, SIGKILL); sane_kill(stage1_pid, SIGKILL);
kill(child, SIGKILL); sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with child: write(SYNC_RECVPID_ACK)"); bail("failed to sync with stage-1: write(SYNC_RECVPID_ACK)");
} }
/* Send the init_func pid back to our parent. /*
* * Send both the stage-1 and stage-2 pids back to runc.
* Send the init_func pid and the pid of the first child back to our parent. * runc needs the stage-2 to continue process management,
* We need to send both back because we can't reap the first child we created (CLONE_PARENT). * but because stage-1 was spawned with CLONE_PARENT we
* It becomes the responsibility of our parent to reap the first child. * cannot reap it within stage-0 and thus we need to ask
* runc to reap the zombie for us.
*/ */
len = dprintf(pipenum, "{\"pid\": %d, \"pid_first\": %d}\n", write_log(DEBUG, "forward stage-1 (%d) and stage-2 (%d) pids to runc",
child, first_child); stage1_pid, stage2_pid);
len =
dprintf(pipenum, "{\"stage1_pid\":%d,\"stage2_pid\":%d}\n", stage1_pid,
stage2_pid);
if (len < 0) { if (len < 0) {
kill(child, SIGKILL); sane_kill(stage1_pid, SIGKILL);
bail("unable to generate JSON for child pid"); sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with runc: write(pid-JSON)");
} }
break; break;
case SYNC_CHILD_READY: case SYNC_CHILD_FINISH:
ready = true; write_log(DEBUG, "stage-1 complete");
stage1_complete = true;
break; break;
default: default:
bail("unexpected sync value: %u", s); bail("unexpected sync value: %u", s);
} }
} }
write_log(DEBUG, "<- stage-1 synchronisation loop");
/* Now sync with grandchild. */ /* Now sync with grandchild. */
syncfd = sync_grandchild_pipe[1]; syncfd = sync_grandchild_pipe[1];
close(sync_grandchild_pipe[0]); close(sync_grandchild_pipe[0]);
write_log(DEBUG, "-> stage-2 synchronisation loop");
ready = false; stage2_complete = false;
while (!ready) { while (!stage2_complete) {
enum sync_t s; enum sync_t s;
write_log(DEBUG, "signalling stage-2 to run");
s = SYNC_GRANDCHILD; s = SYNC_GRANDCHILD;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(child, SIGKILL); sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with child: write(SYNC_GRANDCHILD)"); bail("failed to sync with child: write(SYNC_GRANDCHILD)");
} }
@@ -813,27 +852,31 @@ void nsexec(void)
bail("failed to sync with child: next state"); bail("failed to sync with child: next state");
switch (s) { switch (s) {
case SYNC_CHILD_READY: case SYNC_CHILD_FINISH:
ready = true; write_log(DEBUG, "stage-2 complete");
stage2_complete = true;
break; break;
default: default:
bail("unexpected sync value: %u", s); bail("unexpected sync value: %u", s);
} }
} }
write_log(DEBUG, "<- stage-2 synchronisation loop");
write_log(DEBUG, "<~ nsexec stage-0");
exit(0); exit(0);
} }
break;
/* /*
* Stage 1: We're in the first child process. Our job is to join any * Stage 1: We're in the first child process. Our job is to join any
* provided namespaces in the netlink payload and unshare all * provided namespaces in the netlink payload and unshare all of
* of the requested namespaces. If we've been asked to * the requested namespaces. If we've been asked to CLONE_NEWUSER,
* CLONE_NEWUSER, we will ask our parent (stage 0) to set up * we will ask our parent (stage 0) to set up our user mappings
* our user mappings for us. Then, we create a new child * for us. Then, we create a new child (stage 2: STAGE_INIT) for
* (stage 2: JUMP_INIT) for PID namespace. We then send the * PID namespace. We then send the child's PID to our parent
* child's PID to our parent (stage 0). * (stage 0).
*/ */
case JUMP_CHILD:{ case STAGE_CHILD:{
pid_t child; pid_t stage2_pid = -1;
enum sync_t s; enum sync_t s;
/* We're in a child and thus need to tell the parent if we die. */ /* We're in a child and thus need to tell the parent if we die. */
@@ -842,11 +885,12 @@ void nsexec(void)
/* For debugging. */ /* For debugging. */
prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0); prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0);
write_log(DEBUG, "~> nsexec stage-1");
/* /*
* We need to setns first. We cannot do this earlier (in stage 0) * We need to setns first. We cannot do this earlier (in stage 0)
* because of the fact that we forked to get here (the PID of * because of the fact that we forked to get here (the PID of
* [stage 2: JUMP_INIT]) would be meaningless). We could send it * [stage 2: STAGE_INIT]) would be meaningless). We could send it
* using cmsg(3) but that's just annoying. * using cmsg(3) but that's just annoying.
*/ */
if (config.namespaces) if (config.namespaces)
@@ -872,40 +916,50 @@ void nsexec(void)
* problem. * problem.
*/ */
if (config.cloneflags & CLONE_NEWUSER) { if (config.cloneflags & CLONE_NEWUSER) {
write_log(DEBUG, "unshare user namespace");
if (unshare(CLONE_NEWUSER) < 0) if (unshare(CLONE_NEWUSER) < 0)
bail("failed to unshare user namespace"); bail("failed to unshare user namespace");
config.cloneflags &= ~CLONE_NEWUSER; config.cloneflags &= ~CLONE_NEWUSER;
/* /*
* We don't have the privileges to do any mapping here (see the * We need to set ourselves as dumpable temporarily so that the
* clone_parent rant). So signal our parent to hook us up. * parent process can write to our procfs files.
*/ */
/* Switching is only necessary if we joined namespaces. */
if (config.namespaces) { if (config.namespaces) {
write_log(DEBUG, "temporarily set process as dumpable");
if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0) if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0)
bail("failed to set process as dumpable"); bail("failed to temporarily set process as dumpable");
} }
/*
* We don't have the privileges to do any mapping here (see the
* clone_parent rant). So signal stage-0 to do the mapping for
* us.
*/
write_log(DEBUG, "request stage-0 to map user namespace");
s = SYNC_USERMAP_PLS; s = SYNC_USERMAP_PLS;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) if (write(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with parent: write(SYNC_USERMAP_PLS)"); bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");
/* ... wait for mapping ... */ /* ... wait for mapping ... */
write_log(DEBUG, "request stage-0 to map user namespace");
if (read(syncfd, &s, sizeof(s)) != sizeof(s)) if (read(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with parent: read(SYNC_USERMAP_ACK)"); bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
if (s != SYNC_USERMAP_ACK) if (s != SYNC_USERMAP_ACK)
bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s); bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);
/* Switching is only necessary if we joined namespaces. */
/* Revert temporary re-dumpable setting. */
if (config.namespaces) { if (config.namespaces) {
write_log(DEBUG, "re-set process as non-dumpable");
if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0) if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
bail("failed to set process as dumpable"); bail("failed to re-set process as non-dumpable");
} }
/* Become root in the namespace proper. */ /* Become root in the namespace proper. */
if (setresuid(0, 0, 0) < 0) if (setresuid(0, 0, 0) < 0)
bail("failed to become root in user namespace"); bail("failed to become root in user namespace");
} }
/* /*
* Unshare all of the namespaces. Now, it should be noted that this * Unshare all of the namespaces. Now, it should be noted that this
* ordering might break in the future (especially with rootless * ordering might break in the future (especially with rootless
@@ -916,8 +970,9 @@ void nsexec(void)
* some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID) * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
* was broken, so we'll just do it the long way anyway. * was broken, so we'll just do it the long way anyway.
*/ */
write_log(DEBUG, "unshare remaining namespace (except cgroupns)");
if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0) if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0)
bail("failed to unshare namespaces"); bail("failed to unshare remaining namespaces (except cgroupns)");
/* /*
* TODO: What about non-namespace clone flags that we're dropping here? * TODO: What about non-namespace clone flags that we're dropping here?
@@ -928,41 +983,45 @@ void nsexec(void)
* which would break many applications and libraries, so we must fork * which would break many applications and libraries, so we must fork
* to actually enter the new PID namespace. * to actually enter the new PID namespace.
*/ */
child = clone_parent(&env, JUMP_INIT); write_log(DEBUG, "spawn stage-2");
if (child < 0) stage2_pid = clone_parent(&env, STAGE_INIT);
bail("unable to fork: init_func"); if (stage2_pid < 0)
bail("unable to spawn stage-2");
/* Send the child to our parent, which knows what it's doing. */ /* Send the child to our parent, which knows what it's doing. */
write_log(DEBUG, "request stage-0 to forward stage-2 pid (%d)", stage2_pid);
s = SYNC_RECVPID_PLS; s = SYNC_RECVPID_PLS;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(child, SIGKILL); sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with parent: write(SYNC_RECVPID_PLS)"); bail("failed to sync with parent: write(SYNC_RECVPID_PLS)");
} }
if (write(syncfd, &child, sizeof(child)) != sizeof(child)) { if (write(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) {
kill(child, SIGKILL); sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with parent: write(childpid)"); bail("failed to sync with parent: write(stage2_pid)");
} }
/* ... wait for parent to get the pid ... */ /* ... wait for parent to get the pid ... */
if (read(syncfd, &s, sizeof(s)) != sizeof(s)) { if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(child, SIGKILL); sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with parent: read(SYNC_RECVPID_ACK)"); bail("failed to sync with parent: read(SYNC_RECVPID_ACK)");
} }
if (s != SYNC_RECVPID_ACK) { if (s != SYNC_RECVPID_ACK) {
kill(child, SIGKILL); sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s); bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s);
} }
s = SYNC_CHILD_READY; write_log(DEBUG, "signal completion to stage-0");
s = SYNC_CHILD_FINISH;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(child, SIGKILL); sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with parent: write(SYNC_CHILD_READY)"); bail("failed to sync with parent: write(SYNC_CHILD_FINISH)");
} }
/* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */ /* Our work is done. [Stage 2: STAGE_INIT] is doing the rest of the work. */
write_log(DEBUG, "<~ nsexec stage-1");
exit(0); exit(0);
} }
break;
/* /*
* Stage 2: We're the final child process, and the only process that will * Stage 2: We're the final child process, and the only process that will
@@ -970,7 +1029,7 @@ void nsexec(void)
* final cleanup steps and then return to the Go runtime to allow * final cleanup steps and then return to the Go runtime to allow
* init_linux.go to run. * init_linux.go to run.
*/ */
case JUMP_INIT:{ case STAGE_INIT:{
/* /*
* We're inside the child now, having jumped from the * We're inside the child now, having jumped from the
* start_child() code after forking in the parent. * start_child() code after forking in the parent.
@@ -985,6 +1044,7 @@ void nsexec(void)
/* For debugging. */ /* For debugging. */
prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0); prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0);
write_log(DEBUG, "~> nsexec stage-2");
if (read(syncfd, &s, sizeof(s)) != sizeof(s)) if (read(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with parent: read(SYNC_GRANDCHILD)"); bail("failed to sync with parent: read(SYNC_GRANDCHILD)");
@@ -1005,21 +1065,30 @@ void nsexec(void)
bail("setgroups failed"); bail("setgroups failed");
} }
/* ... wait until our topmost parent has finished cgroup setup in p.manager.Apply() ... */ /*
* Wait until our topmost parent has finished cgroup setup in
* p.manager.Apply().
*
* TODO(cyphar): Check if this code is actually needed because we
* should be in the cgroup even from stage-0, so
* waiting until now might not make sense.
*/
if (config.cloneflags & CLONE_NEWCGROUP) { if (config.cloneflags & CLONE_NEWCGROUP) {
uint8_t value; uint8_t value;
if (read(pipenum, &value, sizeof(value)) != sizeof(value)) if (read(pipenum, &value, sizeof(value)) != sizeof(value))
bail("read synchronisation value failed"); bail("read synchronisation value failed");
if (value == CREATECGROUPNS) { if (value == CREATECGROUPNS) {
write_log(DEBUG, "unshare cgroup namespace");
if (unshare(CLONE_NEWCGROUP) < 0) if (unshare(CLONE_NEWCGROUP) < 0)
bail("failed to unshare cgroup namespace"); bail("failed to unshare cgroup namespace");
} else } else
bail("received unknown synchronisation value"); bail("received unknown synchronisation value");
} }
s = SYNC_CHILD_READY; write_log(DEBUG, "signal completion to stage-0");
s = SYNC_CHILD_FINISH;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) if (write(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with patent: write(SYNC_CHILD_READY)"); bail("failed to sync with patent: write(SYNC_CHILD_FINISH)");
/* Close sync pipes. */ /* Close sync pipes. */
close(sync_grandchild_pipe[0]); close(sync_grandchild_pipe[0]);
@@ -1028,10 +1097,13 @@ void nsexec(void)
nl_free(&config); nl_free(&config);
/* Finish executing, let the Go runtime take over. */ /* Finish executing, let the Go runtime take over. */
write_log(DEBUG, "<= nsexec container setup");
write_log(DEBUG, "booting up go runtime ...");
return; return;
} }
break;
default: default:
bail("unexpected jump value"); bail("unknown stage '%d' for jump value", current_stage);
} }
/* Should never be reached. */ /* Should never be reached. */

View File

@@ -11,7 +11,7 @@ function teardown() {
} }
function check_debug() { function check_debug() {
[[ "$*" == *"nsexec started"* ]] [[ "$*" == *"nsexec container setup"* ]]
[[ "$*" == *"child process in init()"* ]] [[ "$*" == *"child process in init()"* ]]
[[ "$*" == *"init: closing the pipe to signal completion"* ]] [[ "$*" == *"init: closing the pipe to signal completion"* ]]
} }

View File

@@ -138,7 +138,7 @@ function teardown() {
} }
function check_exec_debug() { function check_exec_debug() {
[[ "$*" == *"nsexec started"* ]] [[ "$*" == *"nsexec container setup"* ]]
[[ "$*" == *"child process in init()"* ]] [[ "$*" == *"child process in init()"* ]]
[[ "$*" == *"setns_init: about to exec"* ]] [[ "$*" == *"setns_init: about to exec"* ]]
} }