diff --git a/.travis.yml b/.travis.yml index 07e447f04..8fe26f36a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,6 +3,21 @@ sudo: required +env: + global: + - travis_cache_dir=$HOME/travis_ccache + # Travis times out after 50 minutes. Very generously leave 10 minutes + # for setup (e.g. cache download, compression, and upload), so we never + # fail to cache the progress we made. + - docker_build_timeout=40m + +cache: + # Our build caches can be 200-300MB, so increase the timeout to 7 minutes + # to make sure we never fail to cache the progress we made. + timeout: 420 + directories: + - $HOME/travis_ccache # see docker_build_with_ccache.sh + # Ugh, `services:` must be in the matrix, or we get `docker: command not found` # https://github.com/travis-ci/travis-ci/issues/5142 matrix: diff --git a/build/fbcode_builder/README.md b/build/fbcode_builder/README.md index a5eb0e5b6..c5aa034ba 100644 --- a/build/fbcode_builder/README.md +++ b/build/fbcode_builder/README.md @@ -11,13 +11,27 @@ For external Travis builds, the entry point is `travis_docker_build.sh`. If you are debugging or enhancing a CI build, you will want to do so from host or virtual machine that can run a reasonably modern version of Docker: -``` -./make_docker_context.py --help # See available options -./travis_docker_build.sh # Tiny wrapper that starts a Travis-like build +``` sh +./make_docker_context.py --help # See available options for OS & compiler +# Tiny wrapper that starts a Travis-like build with compile caching: +os_image=ubuntu:14.04 \ + gcc_version=4.9 \ + make_parallelism=2 \ + travis_cache_dir=~/travis_ccache \ + ./travis_docker_build.sh &> build_at_$(date +'%Y%m%d_%H%M%S').log ``` **IMPORTANT**: Read `fbcode_builder/README.docker` befire diving in! +Setting `travis_cache_dir` turns on [ccache](https://ccache.samba.org/), +saving a fresh copy of `ccache.tgz` after every build. 
This will invalidate +Docker's layer cache, forcing it to rebuild starting right after OS package +setup, but the builds will be fast because all the compiles will be cached. +To iterate without invalidating the Docker layer cache, just `cd +/tmp/docker-context-*` and interact with the `Dockerfile` normally. Note +that the `docker-context-*` dirs preserve a copy of `ccache.tgz` as they +first used it. + # What to read next @@ -28,6 +42,11 @@ in this order: - docker_builder.py - make_docker_context.py +As far as runs on Travis go, the control flow is: + - .travis.yml calls + - travis_docker_build.sh calls + - docker_build_with_ccache.sh + This library also has an (unpublished) component targeting Facebook's internal continuous-integration platform using the same build-step DSL. diff --git a/build/fbcode_builder/docker_build_with_ccache.sh b/build/fbcode_builder/docker_build_with_ccache.sh new file mode 100755 index 000000000..f080e6c22 --- /dev/null +++ b/build/fbcode_builder/docker_build_with_ccache.sh @@ -0,0 +1,211 @@ +#!/bin/bash -uex +set -o pipefail # Be sure to `|| :` commands that are allowed to fail. + +# +# Future: port this to Python if you are making significant changes. +# + +# Parse command-line arguments +build_timeout="" # Default to no time-out +print_usage() { + echo "Usage: $0 [--build-timeout TIMEOUT_VAL] SAVE-CCACHE-TO-DIR" + echo "SAVE-CCACHE-TO-DIR is required. An empty string discards the ccache." +} +while [[ $# -gt 0 ]]; do + case "$1" in + --build-timeout) + shift + build_timeout="$1" + if [[ "$build_timeout" != "" ]] ; then + timeout "$build_timeout" true # fail early on invalid timeouts + fi + ;; + -h|--help) + print_usage + exit + ;; + *) + break + ;; + esac + shift +done +# There is one required argument, but an empty string is allowed.
+if [[ "$#" != 1 ]] ; then + print_usage + exit 1 +fi +save_ccache_to_dir="$1" +if [[ "$save_ccache_to_dir" != "" ]] ; then + mkdir -p "$save_ccache_to_dir" # fail early if there's nowhere to save +else + echo "WARNING: Will not save /ccache from inside the Docker container" +fi + +rand_guid() { + echo "$(date +%s)_${RANDOM}_${RANDOM}_${RANDOM}_${RANDOM}" +} + +id=fbcode_builder_image_id=$(rand_guid) +logfile=$(mktemp) + +echo " + + +Running build with timeout '$build_timeout', label $id, and log in $logfile + + +" + +if [[ "$build_timeout" != "" ]] ; then + # Kill the container after $build_timeout. Using `/bin/timeout` would cause + # Docker to destroy the most recent container and lose its cache. + ( + sleep "$build_timeout" + echo "Build timed out after $build_timeout" 1>&2 + while true; do + maybe_container=$( + egrep '^( ---> Running in [0-9a-f]+|FBCODE_BUILDER_EXIT)$' "$logfile" | + tail -n 1 | awk '{print $NF}' + ) + if [[ "$maybe_container" == "FBCODE_BUILDER_EXIT" ]] ; then + echo "Time-out successfully terminated build" 1>&2 + break + fi + echo "Time-out: trying to kill $maybe_container" 1>&2 + # This kill may fail if we get unlucky, try again soon. + docker kill "$maybe_container" || sleep 5 + done + ) & +fi + +build_exit_code=0 +# `docker build` is allowed to fail, and `pipefail` means we must check the +# failure explicitly. +if ! docker build --label="$id" . 2>&1 | tee "$logfile" ; then + build_exit_code="${PIPESTATUS[0]}" + # NB: We are going to deliberately forge ahead even if `tee` failed. + # If it did, we have a problem with tempfile creation, and all is sad. + echo "Build failed with code $build_exit_code, trying to save ccache" 1>&2 +fi +# Stop trying to kill the container. +echo $'\nFBCODE_BUILDER_EXIT' >> "$logfile" + +if [[ "$save_ccache_to_dir" == "" ]] ; then + echo "Not inspecting Docker build, since saving the ccache wasn't requested."
+ exit "$build_exit_code" +fi + +img=$(docker images --filter "label=$id" -a -q) +if [[ "$img" == "" ]] ; then + docker images -a + echo "In the above list, failed to find most recent image with $id" 1>&2 + # Usually, the above `docker kill` will leave us with an up-to-the-second + # container, from which we can extract the cache. However, if that fails + # for any reason, this loop will instead grab the latest available image. + # + # It's possible for this log search to get confused due to the output of + # the build command itself, but since our builds aren't **trying** to + # break cache, we probably won't randomly hit an ID from another build. + img=$( + egrep '^ ---> (Running in [0-9a-f]+|[0-9a-f]+)$' "$logfile" | tac | + sed 's/Running in /container_/;s/ ---> //;' | ( + while read -r x ; do + # Both docker commands below print an image ID to stdout on + # success, so we just need to know when to stop. + if [[ "$x" =~ container_.* ]] ; then + if docker commit "${x#container_}" ; then + break + fi + elif docker inspect --type image -f '{{.Id}}' "$x" ; then + break + fi + done + ) + ) + if [[ "$img" == "" ]] ; then + echo "Failed to find valid container or image ID in log $logfile" 1>&2 + exit 1 + fi +elif [[ "$(echo "$img" | wc -l)" != 1 ]] ; then + # Shouldn't really happen, but be explicit if it does. + echo "Multiple images with label $id, taking the latest of:" + echo "$img" + img=$(echo "$img" | head -n 1) +fi + +container_name="fbcode_builder_container_$(rand_guid)" +echo "Starting $container_name from latest image of the build with $id --" +echo "$img" + +# ccache collection must be done outside of the Docker build steps because +# we need to be able to kill it on timeout. +# +# This step grows the max cache size to slightly exceed the working set +# of a successful build. This simple design persists the max size in the +# cache directory itself (the env var CCACHE_MAXSIZE does not even work with +# older ccaches like the one on 14.04).
+# +# Future: copy this script into the Docker image via Dockerfile. +( + # By default, fbcode_builder creates an unsigned image, so the `docker + # run` below would fail if DOCKER_CONTENT_TRUST were set. So we unset it + # just for this one run. + export DOCKER_CONTENT_TRUST= + # CAUTION: The inner bash runs without -uex, so code accordingly. + docker run --user root --name "$container_name" "$img" /bin/bash -c ' + build_exit_code='"$build_exit_code"' + export CCACHE_DIR=/ccache + ccache -s + + echo "Total bytes in /ccache:"; + total_bytes=$(du -sb /ccache | awk "{print \$1}") + echo "$total_bytes" + + echo "Used bytes in /ccache:"; + used_bytes=$( + du -sb $(find /ccache -type f -newermt @$( + cat /FBCODE_BUILDER_CCACHE_START_TIME + )) | awk "{t += \$1} END {print t}" + ) + echo "$used_bytes" + + # Goal: set the max cache to 30% over the usage of a successful build. + desired_mb=$(( $used_bytes / 806597 )) # 130% in MB: 1024*1024/1.3 + if [[ "$build_exit_code" != "0" ]] ; then + # For a bad build, disallow shrinking the max cache size. Instead of + # the max cache size, we use on-disk size, which ccache keeps ~10% + # under the actual max size, hence the 1.15 safety factor. + cur_max_mb=$(( $total_bytes / 911805 )) # 115% in MB: 1024*1024/1.15 + if [[ "$desired_mb" -le "$cur_max_mb" ]] ; then + desired_mb="" + fi + fi + + if [[ "$desired_mb" != "" ]] ; then + echo "Updating cache size to $desired_mb MB" + ccache -M "${desired_mb}M" + ccache -s + fi + + # Subshell because `time` the binary may not be installed. + if (time tar czf /ccache.tgz /ccache) ; then + ls -l /ccache.tgz + else + # This `else` ensures we never overwrite the current cache with + # partial data in case of error, even if somebody adds code below. 
+ rm /ccache.tgz + exit 1 + fi + ' +) + +echo "Updating $save_ccache_to_dir/ccache.tgz" +# This will not delete the existing cache if `docker run` didn't make one +docker cp "$container_name:/ccache.tgz" "$save_ccache_to_dir/" + +# Future: it'd be nice if Travis allowed us to retry if the build timed out, +# since we'll make more progress thanks to the cache. As-is, we have to +# wait for the next commit to land. +echo "Build exited with code $build_exit_code" +exit "$build_exit_code" diff --git a/build/fbcode_builder/docker_builder.py b/build/fbcode_builder/docker_builder.py index b44e5c2da..c053f99f1 100644 --- a/build/fbcode_builder/docker_builder.py +++ b/build/fbcode_builder/docker_builder.py @@ -15,7 +15,9 @@ caching, you will want to: - Put the steps that you are debugging towards the very end. ''' +import logging import os +import shutil import tempfile from fbcode_builder import FBCodeBuilder @@ -95,3 +97,71 @@ class DockerFBCodeBuilder(FBCodeBuilder): def _render_impl(self, steps): return raw_shell(shell_join('\n', recursively_flatten_list(steps))) + + def debian_ccache_setup_steps(self): + source_ccache_tgz = self.option('ccache_tgz', '') + if not source_ccache_tgz: + logging.info('Docker ccache not enabled') + return [] + + dest_ccache_tgz = os.path.join( + self.option('docker_context_dir'), 'ccache.tgz' + ) + + try: + try: + os.link(source_ccache_tgz, dest_ccache_tgz) + except OSError: + logging.exception( + 'Hard-linking {s} to {d} failed, falling back to copy' + .format(s=source_ccache_tgz, d=dest_ccache_tgz) + ) + shutil.copyfile(source_ccache_tgz, dest_ccache_tgz) + except Exception: + logging.exception( + 'Failed to copy or link {s} to {d}, aborting' + .format(s=source_ccache_tgz, d=dest_ccache_tgz) + ) + raise + + return [ + # Separate layer so that in development we avoid re-downloads. + self.run(ShellQuoted('apt-get install -yq ccache')), + ShellQuoted('ADD ccache.tgz /'), + ShellQuoted( + # Set CCACHE_DIR before the `ccache` invocations below. 
+ 'ENV CCACHE_DIR=/ccache ' + # No clang support for now, so it's easiest to hardcode gcc. + 'CC="ccache gcc" CXX="ccache g++" ' + # Always log for ease of debugging. For real FB projects, + # this log is several megabytes, so dumping it to stdout + # would likely exceed the Travis log limit of 4MB. + # + # On a local machine, `docker cp` will get you the data. To + # get the data out from Travis, I would compress and dump + # uuencoded bytes to the log -- for Bistro this was about + # 600kb or 8000 lines: + # + # apt-get install sharutils + # bzip2 -9 < /tmp/ccache.log | uuencode -m ccache.log.bz2 + 'CCACHE_LOGFILE=/tmp/ccache.log' + ), + self.run(ShellQuoted( + # Future: Skipping this part made this Docker step instant, + # saving ~1min of build time. It's unclear if it is the + # chown or the du, but probably the chown -- since a large + # part of the cost is incurred at image save time. + # + # ccache.tgz may be empty, or may have the wrong + # permissions. + 'mkdir -p /ccache && time chown -R nobody /ccache && ' + 'time du -sh /ccache && ' + # Reset stats so `docker_build_with_ccache.sh` can print + # useful values at the end of the run. + 'echo === Prev run stats === && ccache -s && ccache -z && ' + # Record the current time so docker_build_with_ccache.sh can + # compute the number of cache bytes that are actually used -- + # this is crucial for tuning the maximum cache size. + 'date +%s > /FBCODE_BUILDER_CCACHE_START_TIME' + )), + ] diff --git a/build/fbcode_builder/fbcode_builder.py b/build/fbcode_builder/fbcode_builder.py index 373fa4926..d9fcbcf1d 100644 --- a/build/fbcode_builder/fbcode_builder.py +++ b/build/fbcode_builder/fbcode_builder.py @@ -241,8 +241,13 @@ class FBCodeBuilder(object): 'apt-get upgrade -yq cmake' ))) + actions.extend(self.debian_ccache_setup_steps()) + return self.step('Install packages for Debian-based OS', actions) + def debian_ccache_setup_steps(self): + return [] # It's ok to ship a renderer without ccache support.
+ def github_project_workdir(self, project, path): # Only check out a non-default branch if requested. This especially # makes sense when building from a local repo. diff --git a/build/fbcode_builder/make_docker_context.py b/build/fbcode_builder/make_docker_context.py index 4e437d2b1..fc28beb4e 100755 --- a/build/fbcode_builder/make_docker_context.py +++ b/build/fbcode_builder/make_docker_context.py @@ -15,8 +15,6 @@ contain a Dockerfile, and might also contain copies of your local repos, and other data needed for the build container. ''' -import argparse -import logging import os import tempfile import textwrap @@ -92,6 +90,12 @@ def make_docker_context( help='If set, build {0} from a local directory instead of Github.' .format(github_project), ) + parser.add_argument( + '--ccache-tgz', metavar='PATH', + help='If set, enable ccache for the build. To initialize the ' + 'cache, first try to hardlink, then to copy --ccache-tgz ' + 'as ccache.tgz into the --docker-context-dir.' + ) opts = parse_args_to_fbcode_builder_opts( add_args, @@ -105,6 +109,7 @@ def make_docker_context( 'gcc_version', 'make_parallelism', 'local_repo_dir', + 'ccache_tgz', ), opts, help=textwrap.dedent(''' diff --git a/build/fbcode_builder/travis_docker_build.sh b/build/fbcode_builder/travis_docker_build.sh index 0bbdc0d80..5f7e87d40 100755 --- a/build/fbcode_builder/travis_docker_build.sh +++ b/build/fbcode_builder/travis_docker_build.sh @@ -1,17 +1,41 @@ -#!/bin/bash -ex +#!/bin/bash -uex # .travis.yml in the top-level dir explains why this is a separate script.
# Read the docs: ./make_docker_context.py --help + os_image=${os_image?Must be set by Travis} gcc_version=${gcc_version?Must be set by Travis} make_parallelism=${make_parallelism:-4} +# ccache is off unless requested +travis_cache_dir=${travis_cache_dir:-} +# The docker build never times out, unless specified +docker_build_timeout=${docker_build_timeout:-} + cur_dir="$(readlink -f "$(dirname "$0")")" + +if [[ "$travis_cache_dir" == "" ]]; then + echo "ccache disabled, enable by setting env. var. travis_cache_dir" + ccache_tgz="" +elif [[ -e "$travis_cache_dir/ccache.tgz" ]]; then + ccache_tgz="$travis_cache_dir/ccache.tgz" +else + echo "$travis_cache_dir/ccache.tgz does not exist, starting with empty cache" + ccache_tgz=$(mktemp) + tar -T /dev/null -czf "$ccache_tgz" +fi + docker_context_dir=$( cd "$cur_dir/.." # Let the script find our fbcode_builder_config.py "$cur_dir/make_docker_context.py" \ --os-image "$os_image" \ --gcc-version "$gcc_version" \ --make-parallelism "$make_parallelism" \ - --local-repo-dir "$cur_dir/../.." + --local-repo-dir "$cur_dir/../.." \ + --ccache-tgz "$ccache_tgz" ) cd "${docker_context_dir?Failed to make Docker context directory}" -docker build . + +# Make it safe to iterate on the .sh in the tree while the script runs. +cp "$cur_dir/docker_build_with_ccache.sh" . +exec ./docker_build_with_ccache.sh \ + --build-timeout "$docker_build_timeout" \ + "$travis_cache_dir"