From 2a7eeef17643526cd5ee32d8ae23850c96e3c33e Mon Sep 17 00:00:00 2001 From: Brad Warren Date: Thu, 16 Feb 2023 11:17:47 -0800 Subject: [PATCH] Polish snap_build.py (#9584) I wanted to try to make our tooling's messaging about it a little clearer. While fixing my typo/bad English, we happened to hit a "Chroot problem" failure! See the logs for the CI first attempt at https://dev.azure.com/certbot/certbot/_build/results?buildId=6416&view=results. Looking at these logs, I noticed three things: 1. This message I added is sometimes printed many times because we're still processing output from snapcraft. See https://dev.azure.com/certbot/certbot/_build/results?buildId=6416&view=logs&j=f44d40a4-7318-5ffe-762c-ae4557889284&s=1dfbc15b-7d0f-52a9-b1da-b17592bf94f8&t=07786725-57f8-5198-4d13-ea77f640bd5c&l=565. 2. snapcraft is complaining that we should be using --build-for now instead of --build-on. See https://dev.azure.com/certbot/certbot/_build/results?buildId=6416&view=logs&j=f44d40a4-7318-5ffe-762c-ae4557889284&s=1dfbc15b-7d0f-52a9-b1da-b17592bf94f8&t=07786725-57f8-5198-4d13-ea77f640bd5c&l=472. 3. Us canceling the Certbot build due to a "Chroot problem" happened 3 times in 3 seconds which seems very unlikely. See https://dev.azure.com/certbot/certbot/_build/results?buildId=6416&view=logs&j=f44d40a4-7318-5ffe-762c-ae4557889284&s=1dfbc15b-7d0f-52a9-b1da-b17592bf94f8&t=07786725-57f8-5198-4d13-ea77f640bd5c&l=587. I looked at the builds on launchpad and I only saw one Certbot build. I think what's happening is this code is causing the old build state to be reported so we error immediately. I fixed all of these things in my follow up commits. * polish chroot problem messaging * only execute branch once --- tools/snap/build_remote.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/tools/snap/build_remote.py b/tools/snap/build_remote.py index 7a4eea1b0..092acfa3e 100755 --- a/tools/snap/build_remote.py +++ b/tools/snap/build_remote.py @@ -48,7 +48,7 @@ def _snap_log_name(target: str, arch: str): def _execute_build( target: str, archs: Set[str], status: Dict[str, Dict[str, str]], - workspace: str) -> Tuple[int, List[str]]: + workspace: str, output_lock: Lock) -> Tuple[int, List[str]]: # snapcraft remote-build accepts a --build-id flag with snapcraft version # 5.0+. We make use of this feature to set a unique build ID so a fresh @@ -68,18 +68,28 @@ def _execute_build( environ['XDG_CACHE_HOME'] = tempdir process = subprocess.Popen([ 'snapcraft', 'remote-build', '--launchpad-accept-public-upload', - '--build-on', ','.join(archs), '--build-id', build_id], + '--build-for', ','.join(archs), '--build-id', build_id], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True, env=environ, cwd=workspace) + killed = False process_output: List[str] = [] for line in process.stdout: process_output.append(line) _extract_state(target, line, status) - if any(state for state in status[target].values() if state == 'Chroot problem'): - # On this error the snapcraft process stales. Let's finish it. + if not killed and any(state for state in status[target].values() if state == 'Chroot problem'): + # On this error the snapcraft process hangs. Let's finish it. + # + # killed is used to stop us from executing this code path + # multiple times per build that encounters "Chroot problem". + with output_lock: + print('Chroot problem encountered for build ' + f'{target} for {",".join(archs)}.\n' + 'Launchpad seems to be unable to recover from this ' + 'state so we are terminating the build.') process.kill() + killed = True process_state = process.wait() @@ -89,8 +99,6 @@ def _execute_build( def _build_snap( target: str, archs: Set[str], status: Dict[str, Dict[str, str]], running: Dict[str, bool], output_lock: Lock) -> bool: - status[target] = {arch: '...' for arch in archs} - if target == 'certbot': workspace = CERTBOT_DIR else: @@ -99,7 +107,10 @@ def _build_snap( build_success = False retry = 3 while retry: - exit_code, process_output = _execute_build(target, archs, status, workspace) + # Let's reset the status before each build so we're not starting with + # old state values. + status[target] = {arch: '...' for arch in archs} + exit_code, process_output = _execute_build(target, archs, status, workspace, output_lock) with output_lock: print(f'Build {target} for {",".join(archs)} (attempt {4-retry}/3) ended with ' f'exit code {exit_code}.')