From 4f5544674aca19c13155d4cf5c903d03654a46cd Mon Sep 17 00:00:00 2001 From: Wez Furlong Date: Fri, 3 May 2019 15:52:39 -0700 Subject: [PATCH] fbcode_builder: getdeps: beef up hash computation Summary: previously, a relatively lame hash was computed to use for the build directory based on some hash of the source directory. That was good enough to get things off the ground, but in the interest of being able to cache the build outputs and safely invalidate them we need a slightly more rigorous implementation. This diff computes a hash based on the manifest contents and relevant environmental factors. The hash is used to name the build directory which will ultimately be cached eg: using the travis/appveyor cache directory configuration and some other means for the FB internal CI. The hash needs to be sufficient that we change the hash when the manifests change. We can tolerate a false positive change in hash (it just means that a build will take longer), but cannot tolerate a false negative (which would result in an incorrect build). Reviewed By: simpkins Differential Revision: D14710332 fbshipit-source-id: ebc2e74eafc6f3305d4412a82195bc9fb9dfa615 --- build/fbcode_builder/getdeps.py | 9 ++- build/fbcode_builder/getdeps/buildopts.py | 70 ++++++++++++++++++++++- build/fbcode_builder/getdeps/fetcher.py | 53 +++++++++++------ build/fbcode_builder/getdeps/manifest.py | 30 ++++++++++ 4 files changed, 139 insertions(+), 23 deletions(-) diff --git a/build/fbcode_builder/getdeps.py b/build/fbcode_builder/getdeps.py index 618fe5748..7c57dca6b 100755 --- a/build/fbcode_builder/getdeps.py +++ b/build/fbcode_builder/getdeps.py @@ -145,7 +145,9 @@ class BuildCmd(SubCmd): manifest = load_project(opts, args.project) ctx = context_from_host_tuple() + print("Building on %s" % ctx) projects = manifests_in_dependency_order(opts, manifest, ctx) + manifests_by_name = {m.name: m for m in projects} # Accumulate the install directories so that the build steps # can find their dep installation @@ -161,7 +163,7 @@ class BuildCmd(SubCmd): reconfigure = change_status.build_changed() sources_changed = change_status.sources_changed() - dirs = opts.compute_dirs(m, fetcher) + dirs = opts.compute_dirs(m, fetcher, manifests_by_name, ctx) build_dir = dirs["build_dir"] inst_dir = dirs["inst_dir"] @@ -169,7 +171,7 @@ class BuildCmd(SubCmd): if os.path.exists(built_marker): with open(built_marker, "r") as f: built_hash = f.read().strip() - if built_hash != hash: + if built_hash != dirs["hash"]: # Some kind of inconsistency with a prior build, # let's run it again to be sure os.unlink(built_marker) @@ -213,6 +215,7 @@ class TestCmd(SubCmd): ctx = context_from_host_tuple() projects = manifests_in_dependency_order(opts, manifest, ctx) + manifests_by_name = {m.name: m for m in projects} # Accumulate the install directories so that the test steps # can find their dep installation @@ -221,7 +224,7 @@ class TestCmd(SubCmd): for m in projects: fetcher = m.create_fetcher(opts, ctx) - dirs = opts.compute_dirs(m, fetcher) + dirs = opts.compute_dirs(m, fetcher, manifests_by_name, ctx) build_dir = dirs["build_dir"] inst_dir = dirs["inst_dir"] diff --git a/build/fbcode_builder/getdeps/buildopts.py b/build/fbcode_builder/getdeps/buildopts.py index 021cbb491..a2d807c38 100644 --- a/build/fbcode_builder/getdeps/buildopts.py +++ b/build/fbcode_builder/getdeps/buildopts.py @@ -7,11 +7,14 @@ from __future__ import absolute_import, division, print_function, unicode_literals +import base64 import errno +import hashlib import os import subprocess import tempfile +from .envfuncs import path_search from .platform import HostType, is_windows @@ -89,9 +92,70 @@ class BuildOptions(object): def is_linux(self): return self.host_type.is_linux() - def compute_dirs(self, manifest, fetcher): - hash = fetcher.hash() - directory = "%s-%s" % (manifest.name, hash) + def _compute_hash(self, hash_by_name, manifest, manifests_by_name, ctx): + """ This recursive function computes a hash for a given manifest. + The hash takes into account some environmental factors on the + host machine and includes the hashes of its dependencies. + No caching of the computation is performed, which is theoretically + wasteful but the computation is fast enough that it is not required + to cache across multiple invocations. """ + + h = hash_by_name.get(manifest.name, None) + if h is not None: + return h + + hasher = hashlib.sha256() + # Some environmental and configuration things matter + env = {} + env["install_dir"] = self.install_dir + env["scratch_dir"] = self.scratch_dir + env["os"] = self.host_type.ostype + env["distro"] = self.host_type.distro + env["distro_vers"] = self.host_type.distrovers + for name in ["CXXFLAGS", "CPPFLAGS", "LDFLAGS", "CXX", "CC"]: + env[name] = os.environ.get(name) + for tool in ["cc", "c++", "gcc", "g++", "clang", "clang++"]: + env["tool-%s" % tool] = path_search(os.environ, tool) + + fetcher = manifest.create_fetcher(self, ctx) + env["fetcher.hash"] = fetcher.hash() + + for name in sorted(env.keys()): + hasher.update(name.encode("utf-8")) + value = env.get(name) + if value is not None: + hasher.update(value.encode("utf-8")) + + manifest.update_hash(hasher, ctx) + + dep_list = sorted(manifest.get_section_as_dict("dependencies", ctx).keys()) + for dep in dep_list: + dep_hash = self._compute_hash( + hash_by_name, manifests_by_name[dep], manifests_by_name, ctx + ) + hasher.update(dep_hash.encode("utf-8")) + + # Use base64 to represent the hash, rather than the simple hex digest, + # so that the string is shorter. Use the URL-safe encoding so that + # the hash can also be safely used as a filename component. + h = base64.urlsafe_b64encode(hasher.digest()).decode("ascii") + # ... and because cmd.exe is troublesome with `=` signs, nerf those. + # They tend to be padding characters at the end anyway, so we can + # safely discard them. + h = h.replace("=", "") + hash_by_name[manifest.name] = h + + return h + + def compute_dirs(self, manifest, fetcher, manifests_by_name, ctx): + hash_by_name = {} + hash = self._compute_hash(hash_by_name, manifest, manifests_by_name, ctx) + + if manifest.is_first_party_project(): + directory = manifest.name + else: + directory = "%s-%s" % (manifest.name, hash) + build_dir = os.path.join(self.scratch_dir, "build", directory) inst_dir = os.path.join(self.scratch_dir, "installed", directory) diff --git a/build/fbcode_builder/getdeps/fetcher.py b/build/fbcode_builder/getdeps/fetcher.py index 016afec64..3cfd05f05 100644 --- a/build/fbcode_builder/getdeps/fetcher.py +++ b/build/fbcode_builder/getdeps/fetcher.py @@ -20,6 +20,7 @@ import time import zipfile from .copytree import prefetch_dir_if_eden +from .envfuncs import Env from .platform import is_windows from .runcmd import run_cmd @@ -114,7 +115,12 @@ class Fetcher(object): working copy. For a git repo this is commit hash for the working copy. For other Fetchers this should relate to the version of the code in the src dir. The intent is that if a manifest - changes the version/rev of a project that the hash be different. """ + changes the version/rev of a project that the hash be different. + Importantly, this should be computable without actually fetching + the code, as we want this to factor into a hash used to download + a pre-built version of the code, without having to first download + and extract its sources (eg: boost on windows is pretty painful). + """ pass def get_src_dir(self): @@ -216,13 +222,7 @@ class GitFetcher(Fetcher): run_cmd(["git", "clean", "-fxd"], cwd=self.repo_dir) def hash(self): - """ Returns a hash that identifies the version of the code in the - working copy """ - return ( - subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=self.repo_dir) - .strip() - .decode("utf-8")[0:6] - ) + return self.rev def get_src_dir(self): return self.repo_dir @@ -402,6 +402,31 @@ class ShipitPathMap(object): return change_status +FBSOURCE_REPO_HASH = {} + + +def get_fbsource_repo_hash(build_options): + """ Returns the hash for the fbsource repo. + Since we may have multiple first party projects to + hash, and because we don't mutate the repo, we cache + this hash in a global. """ + global FBSOURCE_REPO_HASH + cached_hash = FBSOURCE_REPO_HASH.get(build_options.fbsource_dir) + if cached_hash: + return cached_hash + + cmd = ["hg", "log", "-r.", "-T{node}"] + env = Env() + env.set("HGPLAIN", "1") + cached_hash = subprocess.check_output( + cmd, cwd=build_options.fbsource_dir, env=dict(env.items()) + ).decode("ascii") + + FBSOURCE_REPO_HASH[build_options.fbsource_dir] = cached_hash + + return cached_hash + + class SimpleShipitTransformerFetcher(Fetcher): def __init__(self, build_options, manifest): self.build_options = build_options @@ -426,7 +451,7 @@ class SimpleShipitTransformerFetcher(Fetcher): return mapping.mirror(self.build_options.fbsource_dir, self.repo_dir) def hash(self): - return "fbsource" # FIXME: use the hash of the repo in this and the repo_dir + return get_fbsource_repo_hash(self.build_options) def get_src_dir(self): return self.repo_dir @@ -493,13 +518,7 @@ class ShipitTransformerFetcher(Fetcher): raise def hash(self): - cmd = ["hg", "log", "-r.", "-T{node}"] - env = os.environ.copy() - env["HGPLAIN"] = "1" - fbsource_hash = subprocess.check_output( - cmd, cwd=self.build_options.fbsource_dir, env=env - ) - return fbsource_hash[0:6] + return get_fbsource_repo_hash(self.build_options) def get_src_dir(self): return self.repo_dir @@ -637,7 +656,7 @@ class ArchiveFetcher(Fetcher): return ChangeStatus(True) def hash(self): - return self.sha256[0:6] + return self.sha256 def get_src_dir(self): return self.src_dir diff --git a/build/fbcode_builder/getdeps/manifest.py b/build/fbcode_builder/getdeps/manifest.py index ece21b5e5..906afcb59 100644 --- a/build/fbcode_builder/getdeps/manifest.py +++ b/build/fbcode_builder/getdeps/manifest.py @@ -271,6 +271,36 @@ class ManifestParser(object): d[field] = value return d + def update_hash(self, hasher, ctx): + """ Compute a hash over the configuration for the given + context. The goal is for the hash to change if the config + for that context changes, but not if a change is made to + the config only for a different platform than that expressed + by ctx. The hash is intended to be used to help invalidate + a future cache for the third party build products. + The hasher argument is a hash object returned from hashlib. """ + for section in sorted(SCHEMA.keys()): + hasher.update(section.encode("utf-8")) + + # Note: at the time of writing, nothing in the implementation + # relies on keys in any config section being ordered. + # In theory we could have conflicting flags in different + # config sections and later flags override earlier flags. + # For the purposes of computing a hash we're not super + # concerned about this: manifest changes should be rare + # enough and we'd rather that this trigger an invalidation + # than strive for a cache hit at this time. + pairs = self.get_section_as_ordered_pairs(section, ctx) + pairs.sort(key=lambda pair: pair[0]) + for key, value in pairs: + hasher.update(key.encode("utf-8")) + if value is not None: + hasher.update(value.encode("utf-8")) + + def is_first_party_project(self): + """ returns true if this is an FB first-party project """ + return self.shipit_project is not None + def create_fetcher(self, build_options, ctx): use_real_shipit = ( ShipitTransformerFetcher.available() and build_options.use_shipit