1
0
mirror of https://github.com/opencontainers/runc.git synced 2025-09-18 05:23:14 +03:00
Files
runc/libcontainer/criu_linux.go
Kir Kolyshkin ce3cd4234c criu: simplify isOnTmpfs check in prepareCriuRestoreMounts
Instead of generating a list of tmpfs mount and have a special function
to check whether the path is in the list, let's go over the list of
mounts directly. This simplifies the code and improves readability.

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2025-05-20 16:56:55 -07:00

1192 lines
35 KiB
Go

//go:build !runc_nocriu
package libcontainer
import (
"bufio"
"bytes"
"encoding/json"
"errors"
"fmt"
"net"
"os"
"os/exec"
"path/filepath"
"reflect"
"strings"
"time"
"github.com/checkpoint-restore/go-criu/v7"
criurpc "github.com/checkpoint-restore/go-criu/v7/rpc"
securejoin "github.com/cyphar/filepath-securejoin"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
"google.golang.org/protobuf/proto"
"github.com/opencontainers/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/utils"
)
var criuFeatures *criurpc.CriuFeatures
var ErrCriuMissingFeatures = errors.New("criu is missing features")
func (c *Container) checkCriuFeatures(criuOpts *CriuOpts, criuFeat *criurpc.CriuFeatures) error {
t := criurpc.CriuReqType_FEATURE_CHECK
// make sure the features we are looking for are really not from
// some previous check
criuFeatures = nil
req := &criurpc.CriuReq{
Type: &t,
Features: criuFeat,
}
err := c.criuSwrk(nil, req, criuOpts, nil)
if err != nil {
return fmt.Errorf("CRIU feature check failed: %w", err)
}
var missingFeatures []string
// The outer if checks if the fields actually exist
if (criuFeat.MemTrack != nil) &&
(criuFeatures.MemTrack != nil) {
// The inner if checks if they are set to true
if *criuFeat.MemTrack && !*criuFeatures.MemTrack {
missingFeatures = append(missingFeatures, "MemTrack")
logrus.Debugf("CRIU does not support MemTrack")
}
}
// This needs to be repeated for every new feature check.
// Is there a way to put this in a function. Reflection?
if (criuFeat.LazyPages != nil) &&
(criuFeatures.LazyPages != nil) {
if *criuFeat.LazyPages && !*criuFeatures.LazyPages {
missingFeatures = append(missingFeatures, "LazyPages")
logrus.Debugf("CRIU does not support LazyPages")
}
}
if len(missingFeatures) != 0 {
return fmt.Errorf("%w: %v", ErrCriuMissingFeatures, missingFeatures)
}
return nil
}
func compareCriuVersion(criuVersion int, minVersion int) error {
// simple function to perform the actual version compare
if criuVersion < minVersion {
return fmt.Errorf("CRIU version %d must be %d or higher", criuVersion, minVersion)
}
return nil
}
// checkCriuVersion checks CRIU version greater than or equal to minVersion.
func (c *Container) checkCriuVersion(minVersion int) error {
// If the version of criu has already been determined there is no need
// to ask criu for the version again. Use the value from c.criuVersion.
if c.criuVersion != 0 {
return compareCriuVersion(c.criuVersion, minVersion)
}
criu := criu.MakeCriu()
var err error
c.criuVersion, err = criu.GetCriuVersion()
if err != nil {
return fmt.Errorf("CRIU version check failed: %w", err)
}
return compareCriuVersion(c.criuVersion, minVersion)
}
const descriptorsFilename = "descriptors.json"
func (c *Container) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount) {
mountDest := strings.TrimPrefix(m.Destination, c.config.Rootfs)
if dest, err := securejoin.SecureJoin(c.config.Rootfs, mountDest); err == nil {
mountDest = dest[len(c.config.Rootfs):]
}
extMnt := &criurpc.ExtMountMap{
Key: proto.String(mountDest),
Val: proto.String(mountDest),
}
req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
}
func (c *Container) addMaskPaths(req *criurpc.CriuReq) error {
for _, path := range c.config.MaskPaths {
fi, err := os.Stat(fmt.Sprintf("/proc/%d/root/%s", c.initProcess.pid(), path))
if err != nil {
if os.IsNotExist(err) {
continue
}
return err
}
if fi.IsDir() {
continue
}
extMnt := &criurpc.ExtMountMap{
Key: proto.String(path),
Val: proto.String("/dev/null"),
}
req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
}
return nil
}
func (c *Container) handleCriuConfigurationFile(rpcOpts *criurpc.CriuOpts) {
// CRIU will evaluate a configuration starting with release 3.11.
// Settings in the configuration file will overwrite RPC settings.
// Look for annotations. The annotation 'org.criu.config'
// specifies if CRIU should use a different, container specific
// configuration file.
configFile, exists := utils.SearchLabels(c.config.Labels, "org.criu.config")
if exists {
// If the annotation 'org.criu.config' exists and is set
// to a non-empty string, tell CRIU to use that as a
// configuration file. If the file does not exist, CRIU
// will just ignore it.
if configFile != "" {
rpcOpts.ConfigFile = proto.String(configFile)
}
// If 'org.criu.config' exists and is set to an empty
// string, a runc specific CRIU configuration file will
// be not set at all.
} else {
// If the mentioned annotation has not been found, specify
// a default CRIU configuration file.
rpcOpts.ConfigFile = proto.String("/etc/criu/runc.conf")
}
}
func (c *Container) criuSupportsExtNS(t configs.NamespaceType) bool {
var minVersion int
switch t {
case configs.NEWNET:
// CRIU supports different external namespace with different released CRIU versions.
// For network namespaces to work we need at least criu 3.11.0 => 31100.
minVersion = 31100
case configs.NEWPID:
// For PID namespaces criu 31500 is needed.
minVersion = 31500
default:
return false
}
return c.checkCriuVersion(minVersion) == nil
}
func criuNsToKey(t configs.NamespaceType) string {
return "extRoot" + strings.Title(configs.NsName(t)) + "NS" //nolint:staticcheck // SA1019: strings.Title is deprecated
}
func (c *Container) handleCheckpointingExternalNamespaces(rpcOpts *criurpc.CriuOpts, t configs.NamespaceType) error {
if !c.criuSupportsExtNS(t) {
return fmt.Errorf("criu lacks support for external %s namespace during checkpointing process (old criu version?)", configs.NsName(t))
}
nsPath := c.config.Namespaces.PathOf(t)
if nsPath == "" {
return nil
}
// CRIU expects the information about an external namespace
// like this: --external <TYPE>[<inode>]:<key>
// This <key> is always 'extRoot<TYPE>NS'.
var ns unix.Stat_t
if err := unix.Stat(nsPath, &ns); err != nil {
return err
}
criuExternal := fmt.Sprintf("%s[%d]:%s", configs.NsName(t), ns.Ino, criuNsToKey(t))
rpcOpts.External = append(rpcOpts.External, criuExternal)
return nil
}
func (c *Container) handleRestoringNamespaces(rpcOpts *criurpc.CriuOpts, extraFiles *[]*os.File) error {
for _, ns := range c.config.Namespaces {
switch ns.Type {
case configs.NEWNET, configs.NEWPID:
// If the container is running in a network or PID namespace and has
// a path to the network or PID namespace configured, we will dump
// that network or PID namespace as an external namespace and we
// will expect that the namespace exists during restore.
// This basically means that CRIU will ignore the namespace
// and expect it to be setup correctly.
if err := c.handleRestoringExternalNamespaces(rpcOpts, extraFiles, ns.Type); err != nil {
return err
}
default:
// For all other namespaces except NET and PID CRIU has
// a simpler way of joining the existing namespace if set
nsPath := c.config.Namespaces.PathOf(ns.Type)
if nsPath == "" {
continue
}
if ns.Type == configs.NEWCGROUP {
// CRIU has no code to handle NEWCGROUP
return fmt.Errorf("Do not know how to handle namespace %v", ns.Type)
}
// CRIU has code to handle NEWTIME, but it does not seem to be defined in runc
// CRIU will issue a warning for NEWUSER:
// criu/namespaces.c: 'join-ns with user-namespace is not fully tested and dangerous'
rpcOpts.JoinNs = append(rpcOpts.JoinNs, &criurpc.JoinNamespace{
Ns: proto.String(configs.NsName(ns.Type)),
NsFile: proto.String(nsPath),
})
}
}
return nil
}
func (c *Container) handleRestoringExternalNamespaces(rpcOpts *criurpc.CriuOpts, extraFiles *[]*os.File, t configs.NamespaceType) error {
if !c.criuSupportsExtNS(t) {
return fmt.Errorf("criu lacks support for external %s namespace during the restoration process (old criu version?)", configs.NsName(t))
}
nsPath := c.config.Namespaces.PathOf(t)
if nsPath == "" {
return nil
}
// CRIU wants the information about an existing namespace
// like this: --inherit-fd fd[<fd>]:<key>
// The <key> needs to be the same as during checkpointing.
// We are always using 'extRoot<TYPE>NS' as the key in this.
nsFd, err := os.Open(nsPath)
if err != nil {
logrus.Errorf("If a specific network namespace is defined it must exist: %s", err)
return fmt.Errorf("Requested network namespace %v does not exist", nsPath)
}
inheritFd := &criurpc.InheritFd{
Key: proto.String(criuNsToKey(t)),
// The offset of four is necessary because 0, 1, 2 and 3 are
// already used by stdin, stdout, stderr, 'criu swrk' socket.
Fd: proto.Int32(int32(4 + len(*extraFiles))),
}
rpcOpts.InheritFd = append(rpcOpts.InheritFd, inheritFd)
// All open FDs need to be transferred to CRIU via extraFiles
*extraFiles = append(*extraFiles, nsFd)
return nil
}
func (c *Container) Checkpoint(criuOpts *CriuOpts) error {
const logFile = "dump.log"
c.m.Lock()
defer c.m.Unlock()
// Checkpoint is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS().
// (CLI prints a warning)
// TODO(avagin): Figure out how to make this work nicely. CRIU 2.0 has
// support for doing unprivileged dumps, but the setup of
// rootless containers might make this complicated.
// We are relying on the CRIU version RPC which was introduced with CRIU 3.0.0
if err := c.checkCriuVersion(30000); err != nil {
return err
}
if criuOpts.ImagesDirectory == "" {
return errors.New("invalid directory to save checkpoint")
}
cgMode, err := criuCgMode(criuOpts.ManageCgroupsMode)
if err != nil {
return err
}
// Since a container can be C/R'ed multiple times,
// the checkpoint directory may already exist.
if err := os.Mkdir(criuOpts.ImagesDirectory, 0o700); err != nil && !os.IsExist(err) {
return err
}
logDir := criuOpts.ImagesDirectory
imageDir, err := os.Open(criuOpts.ImagesDirectory)
if err != nil {
return err
}
defer imageDir.Close()
rpcOpts := criurpc.CriuOpts{
ImagesDirFd: proto.Int32(int32(imageDir.Fd())),
LogLevel: proto.Int32(4),
LogFile: proto.String(logFile),
Root: proto.String(c.config.Rootfs),
ManageCgroups: proto.Bool(true), // Obsoleted by ManageCgroupsMode.
ManageCgroupsMode: &cgMode,
NotifyScripts: proto.Bool(true),
Pid: proto.Int32(int32(c.initProcess.pid())),
ShellJob: proto.Bool(criuOpts.ShellJob),
LeaveRunning: proto.Bool(criuOpts.LeaveRunning),
TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
TcpSkipInFlight: proto.Bool(criuOpts.TcpSkipInFlight),
LinkRemap: proto.Bool(criuOpts.LinkRemap),
ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections),
FileLocks: proto.Bool(criuOpts.FileLocks),
EmptyNs: proto.Uint32(criuOpts.EmptyNs),
OrphanPtsMaster: proto.Bool(true),
AutoDedup: proto.Bool(criuOpts.AutoDedup),
LazyPages: proto.Bool(criuOpts.LazyPages),
}
// if criuOpts.WorkDirectory is not set, criu default is used.
if criuOpts.WorkDirectory != "" {
if err := os.Mkdir(criuOpts.WorkDirectory, 0o700); err != nil && !os.IsExist(err) {
return err
}
workDir, err := os.Open(criuOpts.WorkDirectory)
if err != nil {
return err
}
defer workDir.Close()
rpcOpts.WorkDirFd = proto.Int32(int32(workDir.Fd()))
logDir = criuOpts.WorkDirectory
}
c.handleCriuConfigurationFile(&rpcOpts)
// If the container is running in a network namespace and has
// a path to the network namespace configured, we will dump
// that network namespace as an external namespace and we
// will expect that the namespace exists during restore.
// This basically means that CRIU will ignore the namespace
// and expect to be setup correctly.
if err := c.handleCheckpointingExternalNamespaces(&rpcOpts, configs.NEWNET); err != nil {
return err
}
// Same for possible external PID namespaces
if err := c.handleCheckpointingExternalNamespaces(&rpcOpts, configs.NEWPID); err != nil {
return err
}
// CRIU can use cgroup freezer; when rpcOpts.FreezeCgroup
// is not set, CRIU uses ptrace() to pause the processes.
// Note cgroup v2 freezer is only supported since CRIU release 3.14.
if !cgroups.IsCgroup2UnifiedMode() || c.checkCriuVersion(31400) == nil {
if fcg := c.cgroupManager.Path("freezer"); fcg != "" {
rpcOpts.FreezeCgroup = proto.String(fcg)
}
}
// append optional criu opts, e.g., page-server and port
if criuOpts.PageServer.Address != "" && criuOpts.PageServer.Port != 0 {
rpcOpts.Ps = &criurpc.CriuPageServerInfo{
Address: proto.String(criuOpts.PageServer.Address),
Port: proto.Int32(criuOpts.PageServer.Port),
}
}
// pre-dump may need parentImage param to complete iterative migration
if criuOpts.ParentImage != "" {
rpcOpts.ParentImg = proto.String(criuOpts.ParentImage)
rpcOpts.TrackMem = proto.Bool(true)
}
var t criurpc.CriuReqType
if criuOpts.PreDump {
feat := criurpc.CriuFeatures{
MemTrack: proto.Bool(true),
}
if err := c.checkCriuFeatures(criuOpts, &feat); err != nil {
return err
}
t = criurpc.CriuReqType_PRE_DUMP
} else {
t = criurpc.CriuReqType_DUMP
}
if criuOpts.LazyPages {
// lazy migration requested; check if criu supports it
feat := criurpc.CriuFeatures{
LazyPages: proto.Bool(true),
}
if err := c.checkCriuFeatures(criuOpts, &feat); err != nil {
return err
}
if fd := criuOpts.StatusFd; fd != -1 {
// check that the FD is valid
flags, err := unix.FcntlInt(uintptr(fd), unix.F_GETFL, 0)
if err != nil {
return fmt.Errorf("invalid --status-fd argument %d: %w", fd, err)
}
// and writable
if flags&unix.O_WRONLY == 0 {
return fmt.Errorf("invalid --status-fd argument %d: not writable", fd)
}
if c.checkCriuVersion(31500) != nil {
// For criu 3.15+, use notifications (see case "status-ready"
// in criuNotifications). Otherwise, rely on criu status fd.
rpcOpts.StatusFd = proto.Int32(int32(fd))
}
}
}
req := &criurpc.CriuReq{
Type: &t,
Opts: &rpcOpts,
}
// no need to dump all this in pre-dump
if !criuOpts.PreDump {
hasCgroupns := c.config.Namespaces.Contains(configs.NEWCGROUP)
for _, m := range c.config.Mounts {
switch m.Device {
case "bind":
c.addCriuDumpMount(req, m)
case "cgroup":
if cgroups.IsCgroup2UnifiedMode() || hasCgroupns {
// real mount(s)
continue
}
// a set of "external" bind mounts
binds, err := getCgroupMounts(m)
if err != nil {
return err
}
for _, b := range binds {
c.addCriuDumpMount(req, b)
}
}
}
if err := c.addMaskPaths(req); err != nil {
return err
}
for _, node := range c.config.Devices {
m := &configs.Mount{Destination: node.Path, Source: node.Path}
c.addCriuDumpMount(req, m)
}
// Write the FD info to a file in the image directory
fdsJSON, err := json.Marshal(c.initProcess.externalDescriptors())
if err != nil {
return err
}
err = os.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0o600)
if err != nil {
return err
}
}
err = c.criuSwrk(nil, req, criuOpts, nil)
if err != nil {
logCriuErrors(logDir, logFile)
return err
}
return nil
}
func (c *Container) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mount) {
mountDest := strings.TrimPrefix(m.Destination, c.config.Rootfs)
if dest, err := securejoin.SecureJoin(c.config.Rootfs, mountDest); err == nil {
mountDest = dest[len(c.config.Rootfs):]
}
extMnt := &criurpc.ExtMountMap{
Key: proto.String(mountDest),
Val: proto.String(m.Source),
}
req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
}
func (c *Container) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) {
for _, iface := range c.config.Networks {
switch iface.Type {
case "veth":
veth := new(criurpc.CriuVethPair)
veth.IfOut = proto.String(iface.HostInterfaceName)
veth.IfIn = proto.String(iface.Name)
req.Opts.Veths = append(req.Opts.Veths, veth)
case "loopback":
// Do nothing
}
}
for _, i := range criuOpts.VethPairs {
veth := new(criurpc.CriuVethPair)
veth.IfOut = proto.String(i.HostInterfaceName)
veth.IfIn = proto.String(i.ContainerInterfaceName)
req.Opts.Veths = append(req.Opts.Veths, veth)
}
}
func isOnTmpfs(path string, mounts []*configs.Mount) bool {
for _, m := range mounts {
if m.Device == "tmpfs" && strings.HasPrefix(path, m.Destination+"/") {
return true
}
}
return false
}
// prepareCriuRestoreMounts tries to set up the rootfs of the
// container to be restored in the same way runc does it for
// initial container creation. Even for a read-only rootfs container
// runc modifies the rootfs to add mountpoints which do not exist.
// This function also creates missing mountpoints as long as they
// are not on top of a tmpfs, as CRIU will restore tmpfs content anyway.
func (c *Container) prepareCriuRestoreMounts(mounts []*configs.Mount) error {
umounts := []string{}
defer func() {
for _, u := range umounts {
_ = utils.WithProcfd(c.config.Rootfs, u, func(procfd string) error {
if e := unix.Unmount(procfd, unix.MNT_DETACH); e != nil {
if e != unix.EINVAL {
// Ignore EINVAL as it means 'target is not a mount point.'
// It probably has already been unmounted.
logrus.Warnf("Error during cleanup unmounting of %s (%s): %v", procfd, u, e)
}
}
return nil
})
}
}()
// Now go through all mounts and create the required mountpoints.
for _, m := range mounts {
// No cgroup mount point(s) need to be created:
// * for v1, mount points are saved by CRIU because
// /sys/fs/cgroup is a tmpfs mount;
// * for v2, /sys/fs/cgroup is a real mount, but
// the mountpoint appears as soon as /sys is mounted.
if m.Device == "cgroup" {
continue
}
// If the mountpoint is on a tmpfs, skip it as CRIU will
// restore the complete tmpfs content from its checkpoint.
if isOnTmpfs(m.Destination, mounts) {
continue
}
if _, err := createMountpoint(c.config.Rootfs, mountEntry{Mount: m}); err != nil {
return fmt.Errorf("create criu restore mountpoint for %s mount: %w", m.Destination, err)
}
// If the mount point is a bind mount, we need to mount
// it now so that runc can create the necessary mount
// points for mounts in bind mounts.
// This also happens during initial container creation.
// Without this CRIU restore will fail
// See: https://github.com/opencontainers/runc/issues/2748
// It is also not necessary to order the mount points
// because during initial container creation mounts are
// set up in the order they are configured.
if m.Device == "bind" {
if err := utils.WithProcfd(c.config.Rootfs, m.Destination, func(dstFd string) error {
return mountViaFds(m.Source, nil, m.Destination, dstFd, "", unix.MS_BIND|unix.MS_REC, "")
}); err != nil {
return err
}
umounts = append(umounts, m.Destination)
}
}
return nil
}
// Restore restores the checkpointed container to a running state using the
// criu(8) utility.
func (c *Container) Restore(process *Process, criuOpts *CriuOpts) error {
const logFile = "restore.log"
c.m.Lock()
defer c.m.Unlock()
var extraFiles []*os.File
// Restore is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS().
// (CLI prints a warning)
// TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have
// support for unprivileged restore at the moment.
// We are relying on the CRIU version RPC which was introduced with CRIU 3.0.0
if err := c.checkCriuVersion(30000); err != nil {
return err
}
if criuOpts.ImagesDirectory == "" {
return errors.New("invalid directory to restore checkpoint")
}
cgMode, err := criuCgMode(criuOpts.ManageCgroupsMode)
if err != nil {
return err
}
logDir := criuOpts.ImagesDirectory
imageDir, err := os.Open(criuOpts.ImagesDirectory)
if err != nil {
return err
}
defer imageDir.Close()
// CRIU has a few requirements for a root directory:
// * it must be a mount point
// * its parent must not be overmounted
// c.config.Rootfs is bind-mounted to a temporary directory
// to satisfy these requirements.
root := filepath.Join(c.stateDir, "criu-root")
if err := os.Mkdir(root, 0o755); err != nil {
return err
}
defer os.Remove(root)
root, err = filepath.EvalSymlinks(root)
if err != nil {
return err
}
err = mount(c.config.Rootfs, root, "", unix.MS_BIND|unix.MS_REC, "")
if err != nil {
return err
}
defer unix.Unmount(root, unix.MNT_DETACH) //nolint: errcheck
t := criurpc.CriuReqType_RESTORE
req := &criurpc.CriuReq{
Type: &t,
Opts: &criurpc.CriuOpts{
ImagesDirFd: proto.Int32(int32(imageDir.Fd())),
EvasiveDevices: proto.Bool(true),
LogLevel: proto.Int32(4),
LogFile: proto.String(logFile),
RstSibling: proto.Bool(true),
Root: proto.String(root),
ManageCgroups: proto.Bool(true), // Obsoleted by ManageCgroupsMode.
ManageCgroupsMode: &cgMode,
NotifyScripts: proto.Bool(true),
ShellJob: proto.Bool(criuOpts.ShellJob),
ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections),
TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
FileLocks: proto.Bool(criuOpts.FileLocks),
EmptyNs: proto.Uint32(criuOpts.EmptyNs),
OrphanPtsMaster: proto.Bool(true),
AutoDedup: proto.Bool(criuOpts.AutoDedup),
LazyPages: proto.Bool(criuOpts.LazyPages),
},
}
if criuOpts.LsmProfile != "" {
// CRIU older than 3.16 has a bug which breaks the possibility
// to set a different LSM profile.
if err := c.checkCriuVersion(31600); err != nil {
return errors.New("--lsm-profile requires at least CRIU 3.16")
}
req.Opts.LsmProfile = proto.String(criuOpts.LsmProfile)
}
if criuOpts.LsmMountContext != "" {
if err := c.checkCriuVersion(31600); err != nil {
return errors.New("--lsm-mount-context requires at least CRIU 3.16")
}
req.Opts.LsmMountContext = proto.String(criuOpts.LsmMountContext)
}
if criuOpts.WorkDirectory != "" {
// Since a container can be C/R'ed multiple times,
// the work directory may already exist.
if err := os.Mkdir(criuOpts.WorkDirectory, 0o700); err != nil && !os.IsExist(err) {
return err
}
workDir, err := os.Open(criuOpts.WorkDirectory)
if err != nil {
return err
}
defer workDir.Close()
req.Opts.WorkDirFd = proto.Int32(int32(workDir.Fd()))
logDir = criuOpts.WorkDirectory
}
c.handleCriuConfigurationFile(req.Opts)
if err := c.handleRestoringNamespaces(req.Opts, &extraFiles); err != nil {
return err
}
// This will modify the rootfs of the container in the same way runc
// modifies the container during initial creation.
if err := c.prepareCriuRestoreMounts(c.config.Mounts); err != nil {
return err
}
hasCgroupns := c.config.Namespaces.Contains(configs.NEWCGROUP)
for _, m := range c.config.Mounts {
switch m.Device {
case "bind":
c.addCriuRestoreMount(req, m)
case "cgroup":
if cgroups.IsCgroup2UnifiedMode() || hasCgroupns {
continue
}
// cgroup v1 is a set of bind mounts, unless cgroupns is used
binds, err := getCgroupMounts(m)
if err != nil {
return err
}
for _, b := range binds {
c.addCriuRestoreMount(req, b)
}
}
}
if len(c.config.MaskPaths) > 0 {
m := &configs.Mount{Destination: "/dev/null", Source: "/dev/null"}
c.addCriuRestoreMount(req, m)
}
for _, node := range c.config.Devices {
m := &configs.Mount{Destination: node.Path, Source: node.Path}
c.addCriuRestoreMount(req, m)
}
if criuOpts.EmptyNs&unix.CLONE_NEWNET == 0 {
c.restoreNetwork(req, criuOpts)
}
var (
fds []string
fdJSON []byte
)
if fdJSON, err = os.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil {
return err
}
if err := json.Unmarshal(fdJSON, &fds); err != nil {
return err
}
for i := range fds {
if s := fds[i]; strings.Contains(s, "pipe:") {
inheritFd := new(criurpc.InheritFd)
inheritFd.Key = proto.String(s)
inheritFd.Fd = proto.Int32(int32(i))
req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd)
}
}
err = c.criuSwrk(process, req, criuOpts, extraFiles)
if err != nil {
logCriuErrors(logDir, logFile)
}
// Now that CRIU is done let's close all opened FDs CRIU needed.
for _, fd := range extraFiles {
fd.Close()
}
return err
}
// logCriuErrors tries to find and log errors from a criu log file.
// The output is similar to what "grep -n -B5 Error" does.
func logCriuErrors(dir, file string) {
lookFor := []byte("Error") // Print the line that contains this...
const max = 5 + 1 // ... and a few preceding lines.
logFile := filepath.Join(dir, file)
f, err := os.Open(logFile)
if err != nil {
logrus.Warn(err)
return
}
defer f.Close()
var lines [max][]byte
var idx, lineNo, printedLineNo int
s := bufio.NewScanner(f)
for s.Scan() {
lineNo++
lines[idx] = s.Bytes()
idx = (idx + 1) % max
if !bytes.Contains(s.Bytes(), lookFor) {
continue
}
// Found an error.
if printedLineNo == 0 {
logrus.Warnf("--- Quoting %q", logFile)
} else if lineNo-max > printedLineNo {
// Mark the gap.
logrus.Warn("...")
}
// Print the last lines.
for add := range max {
i := (idx + add) % max
s := lines[i]
actLineNo := lineNo + add - max + 1
if len(s) > 0 && actLineNo > printedLineNo {
logrus.Warnf("%d:%s", actLineNo, s)
printedLineNo = actLineNo
}
}
}
if printedLineNo != 0 {
logrus.Warn("---") // End of "Quoting ...".
}
if err := s.Err(); err != nil {
logrus.Warnf("read %q: %v", logFile, err)
}
}
func (c *Container) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
// need to apply cgroups only on restore
if req.GetType() != criurpc.CriuReqType_RESTORE {
return nil
}
// XXX: Do we need to deal with this case? AFAIK criu still requires root.
if err := c.cgroupManager.Apply(pid); err != nil {
return err
}
if err := c.cgroupManager.Set(c.config.Cgroups.Resources); err != nil {
return err
}
// TODO(@kolyshkin): should we use c.cgroupManager.GetPaths()
// instead of reading /proc/pid/cgroup?
path := fmt.Sprintf("/proc/%d/cgroup", pid)
cgroupsPaths, err := cgroups.ParseCgroupFile(path)
if err != nil {
return err
}
for c, p := range cgroupsPaths {
cgroupRoot := &criurpc.CgroupRoot{
Ctrl: proto.String(c),
Path: proto.String(p),
}
req.Opts.CgRoot = append(req.Opts.CgRoot, cgroupRoot)
}
return nil
}
func (c *Container) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, extraFiles []*os.File) error {
fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0)
if err != nil {
return err
}
criuClient := os.NewFile(uintptr(fds[0]), "criu-transport-client")
criuClientFileCon, err := net.FileConn(criuClient)
criuClient.Close()
if err != nil {
return err
}
criuClientCon := criuClientFileCon.(*net.UnixConn)
defer criuClientCon.Close()
criuServer := os.NewFile(uintptr(fds[1]), "criu-transport-server")
defer criuServer.Close()
if c.criuVersion != 0 {
// If the CRIU Version is still '0' then this is probably
// the initial CRIU run to detect the version. Skip it.
logrus.Debugf("Using CRIU %d", c.criuVersion)
}
cmd := exec.Command("criu", "swrk", "3")
if process != nil {
cmd.Stdin = process.Stdin
cmd.Stdout = process.Stdout
cmd.Stderr = process.Stderr
}
cmd.ExtraFiles = append(cmd.ExtraFiles, criuServer)
if extraFiles != nil {
cmd.ExtraFiles = append(cmd.ExtraFiles, extraFiles...)
}
if err := cmd.Start(); err != nil {
return err
}
// we close criuServer so that even if CRIU crashes or unexpectedly exits, runc will not hang.
criuServer.Close()
// cmd.Process will be replaced by a restored init.
criuProcess := cmd.Process
var criuProcessState *os.ProcessState
defer func() {
if criuProcessState == nil {
criuClientCon.Close()
_, err := criuProcess.Wait()
if err != nil {
logrus.Warnf("wait on criuProcess returned %v", err)
}
}
}()
if err := c.criuApplyCgroups(criuProcess.Pid, req); err != nil {
return err
}
var extFds []string
if process != nil {
extFds, err = getPipeFds(criuProcess.Pid)
if err != nil {
return err
}
}
logrus.Debugf("Using CRIU in %s mode", req.GetType().String())
// In the case of criurpc.CriuReqType_FEATURE_CHECK req.GetOpts()
// should be empty. For older CRIU versions it still will be
// available but empty. criurpc.CriuReqType_VERSION actually
// has no req.GetOpts().
if logrus.GetLevel() >= logrus.DebugLevel &&
(req.GetType() != criurpc.CriuReqType_FEATURE_CHECK &&
req.GetType() != criurpc.CriuReqType_VERSION) {
val := reflect.ValueOf(req.GetOpts())
v := reflect.Indirect(val)
for i := range v.NumField() {
st := v.Type()
name := st.Field(i).Name
if 'A' <= name[0] && name[0] <= 'Z' {
value := val.MethodByName("Get" + name).Call([]reflect.Value{})
logrus.Debugf("CRIU option %s with value %v", name, value[0])
}
}
}
data, err := proto.Marshal(req)
if err != nil {
return err
}
_, err = criuClientCon.Write(data)
if err != nil {
return err
}
buf := make([]byte, 10*4096)
oob := make([]byte, 4096)
for {
n, oobn, _, _, err := criuClientCon.ReadMsgUnix(buf, oob)
if req.Opts != nil && req.Opts.StatusFd != nil {
// Close status_fd as soon as we got something back from criu,
// assuming it has consumed (reopened) it by this time.
// Otherwise it will might be left open forever and whoever
// is waiting on it will wait forever.
fd := int(*req.Opts.StatusFd)
_ = unix.Close(fd)
req.Opts.StatusFd = nil
}
if err != nil {
return err
}
if n == 0 {
return errors.New("unexpected EOF")
}
if n == len(buf) {
return errors.New("buffer is too small")
}
resp := new(criurpc.CriuResp)
err = proto.Unmarshal(buf[:n], resp)
if err != nil {
return err
}
t := resp.GetType()
if !resp.GetSuccess() {
return fmt.Errorf("criu failed: type %s errno %d", t, resp.GetCrErrno())
}
switch t {
case criurpc.CriuReqType_FEATURE_CHECK:
logrus.Debugf("Feature check says: %s", resp)
criuFeatures = resp.GetFeatures()
case criurpc.CriuReqType_NOTIFY:
if err := c.criuNotifications(resp, process, cmd, opts, extFds, oob[:oobn]); err != nil {
return err
}
req = &criurpc.CriuReq{
Type: &t,
NotifySuccess: proto.Bool(true),
}
data, err = proto.Marshal(req)
if err != nil {
return err
}
_, err = criuClientCon.Write(data)
if err != nil {
return err
}
continue
case criurpc.CriuReqType_RESTORE:
case criurpc.CriuReqType_DUMP:
case criurpc.CriuReqType_PRE_DUMP:
default:
return fmt.Errorf("unable to parse the response %s", resp.String())
}
break
}
_ = criuClientCon.CloseWrite()
// cmd.Wait() waits cmd.goroutines which are used for proxying file descriptors.
// Here we want to wait only the CRIU process.
criuProcessState, err = criuProcess.Wait()
if err != nil {
return err
}
// In pre-dump mode CRIU is in a loop and waits for
// the final DUMP command.
// The current runc pre-dump approach, however, is
// start criu in PRE_DUMP once for a single pre-dump
// and not the whole series of pre-dump, pre-dump, ...m, dump
// If we got the message CriuReqType_PRE_DUMP it means
// CRIU was successful and we need to forcefully stop CRIU
if !criuProcessState.Success() && *req.Type != criurpc.CriuReqType_PRE_DUMP {
return fmt.Errorf("criu failed: %s", criuProcessState)
}
return nil
}
// lockNetwork blocks any external network activity.
func lockNetwork(config *configs.Config) error {
for _, config := range config.Networks {
strategy, err := getStrategy(config.Type)
if err != nil {
return err
}
if err := strategy.detach(config); err != nil {
return err
}
}
return nil
}
func unlockNetwork(config *configs.Config) error {
for _, config := range config.Networks {
strategy, err := getStrategy(config.Type)
if err != nil {
return err
}
if err = strategy.attach(config); err != nil {
return err
}
}
return nil
}
func (c *Container) criuNotifications(resp *criurpc.CriuResp, process *Process, cmd *exec.Cmd, opts *CriuOpts, fds []string, oob []byte) error {
notify := resp.GetNotify()
if notify == nil {
return fmt.Errorf("invalid response: %s", resp.String())
}
script := notify.GetScript()
logrus.Debugf("notify: %s\n", script)
switch script {
case "post-dump":
f, err := os.Create(filepath.Join(c.stateDir, "checkpoint"))
if err != nil {
return err
}
f.Close()
case "network-unlock":
if err := unlockNetwork(c.config); err != nil {
return err
}
case "network-lock":
if err := lockNetwork(c.config); err != nil {
return err
}
case "setup-namespaces":
if c.config.HasHook(configs.Prestart, configs.CreateRuntime) {
s, err := c.currentOCIState()
if err != nil {
return nil
}
s.Pid = int(notify.GetPid())
if err := c.config.Hooks.Run(configs.Prestart, s); err != nil {
return err
}
if err := c.config.Hooks.Run(configs.CreateRuntime, s); err != nil {
return err
}
}
case "post-restore":
pid := notify.GetPid()
p, err := os.FindProcess(int(pid))
if err != nil {
return err
}
cmd.Process = p
r, err := newRestoredProcess(cmd, fds)
if err != nil {
return err
}
process.ops = r
if err := c.state.transition(&restoredState{
imageDir: opts.ImagesDirectory,
c: c,
}); err != nil {
return err
}
// create a timestamp indicating when the restored checkpoint was started
c.created = time.Now().UTC()
if !c.config.Namespaces.Contains(configs.NEWTIME) &&
configs.IsNamespaceSupported(configs.NEWTIME) &&
c.checkCriuVersion(31400) == nil {
// CRIU restores processes into a time namespace.
c.config.Namespaces = append(c.config.Namespaces,
configs.Namespace{Type: configs.NEWTIME})
}
if _, err := c.updateState(r); err != nil {
return err
}
if err := os.Remove(filepath.Join(c.stateDir, "checkpoint")); err != nil {
if !os.IsNotExist(err) {
logrus.Error(err)
}
}
case "orphan-pts-master":
scm, err := unix.ParseSocketControlMessage(oob)
if err != nil {
return err
}
fds, err := unix.ParseUnixRights(&scm[0])
if err != nil {
return err
}
master := os.NewFile(uintptr(fds[0]), "orphan-pts-master")
defer master.Close()
// While we can access console.master, using the API is a good idea.
if err := utils.SendFile(process.ConsoleSocket, master); err != nil {
return err
}
case "status-ready":
if opts.StatusFd != -1 {
// write \0 to status fd to notify that lazy page server is ready
_, err := unix.Write(opts.StatusFd, []byte{0})
if err != nil {
logrus.Warnf("can't write \\0 to status fd: %v", err)
}
_ = unix.Close(opts.StatusFd)
opts.StatusFd = -1
}
}
return nil
}
func criuCgMode(mode string) (criurpc.CriuCgMode, error) {
switch mode {
case "":
return criurpc.CriuCgMode_DEFAULT, nil
case "soft":
return criurpc.CriuCgMode_SOFT, nil
case "full":
return criurpc.CriuCgMode_FULL, nil
case "strict":
return criurpc.CriuCgMode_STRICT, nil
case "ignore":
return criurpc.CriuCgMode_IGNORE, nil
default:
return 0, errors.New("invalid manage-cgroups-mode value")
}
}