From 2a7eeef17643526cd5ee32d8ae23850c96e3c33e Mon Sep 17 00:00:00 2001
From: Brad Warren <bmw@users.noreply.github.com>
Date: Thu, 16 Feb 2023 11:17:47 -0800
Subject: [PATCH] Polish snap_build.py (#9584)

I wanted to try to make our tooling's messaging about it a little clearer.

While fixing my typo/bad English, we happened to hit a "Chroot problem" failure! See the logs for the CI first attempt at https://dev.azure.com/certbot/certbot/_build/results?buildId=6416&view=results.

Looking at these logs, I noticed three things:

1. This message I added is sometimes printed many times because we're still processing output from snapcraft. See https://dev.azure.com/certbot/certbot/_build/results?buildId=6416&view=logs&j=f44d40a4-7318-5ffe-762c-ae4557889284&s=1dfbc15b-7d0f-52a9-b1da-b17592bf94f8&t=07786725-57f8-5198-4d13-ea77f640bd5c&l=565.
2. snapcraft is complaining that we should be using --build-for now instead of --build-on. See https://dev.azure.com/certbot/certbot/_build/results?buildId=6416&view=logs&j=f44d40a4-7318-5ffe-762c-ae4557889284&s=1dfbc15b-7d0f-52a9-b1da-b17592bf94f8&t=07786725-57f8-5198-4d13-ea77f640bd5c&l=472.
3. Us canceling the Certbot build due to a "Chroot problem" happened 3 times in 3 seconds which seems very unlikely. See https://dev.azure.com/certbot/certbot/_build/results?buildId=6416&view=logs&j=f44d40a4-7318-5ffe-762c-ae4557889284&s=1dfbc15b-7d0f-52a9-b1da-b17592bf94f8&t=07786725-57f8-5198-4d13-ea77f640bd5c&l=587. I looked at the builds on launchpad and I only saw one Certbot build. I think what's happening is this code is causing the old build state to be reported so we error immediately.
I fixed all of these things in my follow up commits.

* polish chroot problem messaging

* only execute branch once
---
 tools/snap/build_remote.py | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/tools/snap/build_remote.py b/tools/snap/build_remote.py
index 7a4eea1b0..092acfa3e 100755
--- a/tools/snap/build_remote.py
+++ b/tools/snap/build_remote.py
@@ -48,7 +48,7 @@ def _snap_log_name(target: str, arch: str):
 
 def _execute_build(
         target: str, archs: Set[str], status: Dict[str, Dict[str, str]],
-        workspace: str) -> Tuple[int, List[str]]:
+        workspace: str, output_lock: Lock) -> Tuple[int, List[str]]:
 
     # snapcraft remote-build accepts a --build-id flag with snapcraft version
     # 5.0+. We make use of this feature to set a unique build ID so a fresh
@@ -68,18 +68,28 @@ def _execute_build(
         environ['XDG_CACHE_HOME'] = tempdir
         process = subprocess.Popen([
             'snapcraft', 'remote-build', '--launchpad-accept-public-upload',
-            '--build-on', ','.join(archs), '--build-id', build_id],
+            '--build-for', ','.join(archs), '--build-id', build_id],
             stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
             universal_newlines=True, env=environ, cwd=workspace)
 
+    killed = False
     process_output: List[str] = []
     for line in process.stdout:
         process_output.append(line)
         _extract_state(target, line, status)
 
-        if any(state for state in status[target].values() if state == 'Chroot problem'):
-            # On this error the snapcraft process stales. Let's finish it.
+        if not killed and any(state for state in status[target].values() if state == 'Chroot problem'):
+            # On this error the snapcraft process hangs. Let's finish it.
+            #
+            # killed is used to stop us from executing this code path
+            # multiple times per build that encounters "Chroot problem".
+            with output_lock:
+                print('Chroot problem encountered for build '
+                      f'{target} for {",".join(archs)}.\n'
+                      'Launchpad seems to be unable to recover from this '
+                      'state so we are terminating the build.')
             process.kill()
+            killed = True
 
     process_state = process.wait()
 
@@ -89,8 +99,6 @@ def _execute_build(
 def _build_snap(
         target: str, archs: Set[str], status: Dict[str, Dict[str, str]],
         running: Dict[str, bool], output_lock: Lock) -> bool:
-    status[target] = {arch: '...' for arch in archs}
-
     if target == 'certbot':
         workspace = CERTBOT_DIR
     else:
@@ -99,7 +107,10 @@ def _build_snap(
     build_success = False
     retry = 3
     while retry:
-        exit_code, process_output = _execute_build(target, archs, status, workspace)
+        # Let's reset the status before each build so we're not starting with
+        # old state values.
+        status[target] = {arch: '...' for arch in archs}
+        exit_code, process_output = _execute_build(target, archs, status, workspace, output_lock)
         with output_lock:
             print(f'Build {target} for {",".join(archs)} (attempt {4-retry}/3) ended with '
                   f'exit code {exit_code}.')