Skip to content

Commit 7cf0e43

Browse files
authored
Add automated retries on retryable condition for building images in CI (apache#24006)
Pushing cache images to ghcr.io is flaky, so we add automated retries for image builds that fail intermittently with a retryable error. The root cause of the problem is tracked in containerd: containerd/containerd#5978
1 parent ae343fa commit 7cf0e43

File tree

9 files changed

+252
-174
lines changed

9 files changed

+252
-174
lines changed

.github/workflows/ci.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1689,6 +1689,7 @@ ${{ hashFiles('.pre-commit-config.yaml') }}"
16891689
run: >
16901690
breeze build-image
16911691
--prepare-buildx-cache
1692+
--max-retries 3
16921693
--platform linux/amd64,linux/arm64
16931694
env:
16941695
PYTHON_MAJOR_MINOR_VERSION: ${{ matrix.python-version }}
@@ -1722,6 +1723,7 @@ ${{ hashFiles('.pre-commit-config.yaml') }}"
17221723
--install-packages-from-context
17231724
--prepare-buildx-cache
17241725
--disable-airflow-repo-cache
1726+
--max-retries 3
17251727
--platform linux/amd64,linux/arm64
17261728
env:
17271729
PYTHON_MAJOR_MINOR_VERSION: ${{ matrix.python-version }}

dev/breeze/src/airflow_breeze/commands/ci_image_commands.py

Lines changed: 33 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import os
1919
import sys
2020
from pathlib import Path
21+
from subprocess import CompletedProcess
2122
from typing import List, Optional, Tuple, Union
2223

2324
import click
@@ -50,6 +51,7 @@
5051
option_image_name,
5152
option_image_tag,
5253
option_install_providers_from_sources,
54+
option_max_retries,
5355
option_parallelism,
5456
option_platform,
5557
option_prepare_buildx_cache,
@@ -87,6 +89,7 @@
8789
instruct_build_image,
8890
is_repo_rebased,
8991
run_command,
92+
run_result_contains,
9093
)
9194

9295
CI_IMAGE_TOOLS_COMMANDS = {
@@ -110,6 +113,7 @@
110113
"--tag-as-latest",
111114
"--docker-cache",
112115
"--force-build",
116+
"--max-retries",
113117
],
114118
},
115119
{
@@ -202,6 +206,7 @@
202206
@option_docker_cache
203207
@option_image_tag
204208
@option_prepare_buildx_cache
209+
@option_max_retries
205210
@option_push_image
206211
@option_empty_image
207212
@option_install_providers_from_sources
@@ -477,15 +482,34 @@ def build_ci_image(verbose: bool, dry_run: bool, ci_image_params: BuildCiParams)
477482
)
478483
else:
479484
get_console().print(f"\n[info]Building CI Image for Python {ci_image_params.python}\n")
480-
build_command_result = run_command(
481-
cmd, verbose=verbose, dry_run=dry_run, cwd=AIRFLOW_SOURCES_ROOT, text=True, check=False
482-
)
483-
if build_command_result.returncode == 0:
484-
if ci_image_params.prepare_buildx_cache:
485+
num_tries = 1 if ci_image_params.max_retries is None else ci_image_params.max_retries
486+
build_command_result = CompletedProcess(args=[], returncode=1, stdout="This should never happen.")
487+
while num_tries > 0:
488+
build_command_result = run_command(
489+
cmd,
490+
verbose=verbose,
491+
dry_run=dry_run,
492+
cwd=AIRFLOW_SOURCES_ROOT,
493+
check=False,
494+
text=True,
495+
capture_output=True,
496+
)
497+
if ci_image_params.prepare_buildx_cache and build_command_result.returncode == 0:
485498
build_command_result = build_cache(
486499
image_params=ci_image_params, dry_run=dry_run, verbose=verbose
487500
)
488-
501+
if build_command_result.returncode == 0:
502+
break
503+
num_tries -= 1
504+
if run_result_contains(build_command_result, "cannot reuse body, request must be retried"):
505+
if num_tries > 0:
506+
get_console().print(
507+
"[info]Retrying failed command on retryable condition. "
508+
f"There are {num_tries} left[/]"
509+
)
510+
continue
511+
else:
512+
break
489513
if not ci_image_params.prepare_buildx_cache:
490514
if not dry_run:
491515
if build_command_result.returncode == 0:
@@ -504,7 +528,9 @@ def build_ci_image(verbose: bool, dry_run: bool, ci_image_params: BuildCiParams)
504528
f"Image build: {ci_image_params.python}",
505529
)
506530
else:
507-
get_console().print("[info]Not updating build cache because we are in `dry_run` mode.[/]")
531+
get_console().print(
532+
"[info]Not tagging/marking image as refreshed because we are in `dry_run` mode.[/]"
533+
)
508534
return build_command_result.returncode, f"Image build: {ci_image_params.python}"
509535

510536

dev/breeze/src/airflow_breeze/commands/production_image_commands.py

Lines changed: 41 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import contextlib
1818
import os
1919
import sys
20+
from subprocess import CompletedProcess
2021
from typing import Optional, Tuple
2122

2223
import click
@@ -49,6 +50,7 @@
4950
option_image_name,
5051
option_image_tag,
5152
option_install_providers_from_sources,
53+
option_max_retries,
5254
option_parallelism,
5355
option_platform,
5456
option_prepare_buildx_cache,
@@ -77,7 +79,12 @@
7779
from airflow_breeze.utils.python_versions import get_python_version_list
7880
from airflow_breeze.utils.registry import login_to_github_docker_registry
7981
from airflow_breeze.utils.run_tests import verify_an_image
80-
from airflow_breeze.utils.run_utils import filter_out_none, fix_group_permissions, run_command
82+
from airflow_breeze.utils.run_utils import (
83+
filter_out_none,
84+
fix_group_permissions,
85+
run_command,
86+
run_result_contains,
87+
)
8188

8289
PRODUCTION_IMAGE_TOOLS_COMMANDS = {
8390
"name": "Production Image tools",
@@ -99,6 +106,7 @@
99106
"--image-tag",
100107
"--tag-as-latest",
101108
"--docker-cache",
109+
"--max-retries",
102110
],
103111
},
104112
{
@@ -206,6 +214,7 @@
206214
@option_docker_cache
207215
@option_image_tag
208216
@option_prepare_buildx_cache
217+
@option_max_retries
209218
@option_push_image
210219
@option_empty_image
211220
@option_airflow_constraints_mode_prod
@@ -517,16 +526,36 @@ def build_production_image(
517526
image_params=prod_image_params,
518527
verbose=verbose,
519528
)
520-
build_command_result = run_command(
521-
cmd, verbose=verbose, dry_run=dry_run, cwd=AIRFLOW_SOURCES_ROOT, check=False, text=True
522-
)
523-
if build_command_result.returncode == 0:
524-
if prod_image_params.prepare_buildx_cache:
525-
build_command_result = build_cache(
526-
image_params=prod_image_params, dry_run=dry_run, verbose=verbose
527-
)
529+
num_tries = 1 if prod_image_params.max_retries is None else prod_image_params.max_retries
530+
build_command_result = CompletedProcess(args=[], returncode=1, stdout="This should never happen.")
531+
while num_tries > 0:
532+
build_command_result = run_command(
533+
cmd,
534+
verbose=verbose,
535+
dry_run=dry_run,
536+
cwd=AIRFLOW_SOURCES_ROOT,
537+
check=False,
538+
text=True,
539+
capture_output=True,
540+
)
541+
if build_command_result.returncode == 0:
542+
if prod_image_params.prepare_buildx_cache:
543+
build_command_result = build_cache(
544+
image_params=prod_image_params, dry_run=dry_run, verbose=verbose
545+
)
546+
else:
547+
if prod_image_params.tag_as_latest:
548+
build_command_result = tag_image_as_latest(prod_image_params, dry_run, verbose)
549+
if build_command_result.returncode == 0:
550+
break
551+
num_tries -= 1
552+
if run_result_contains(build_command_result, "cannot reuse body, request must be retried"):
553+
if num_tries > 0:
554+
get_console().print(
555+
"[info]Retrying failed command on retryable condition. "
556+
f"There are {num_tries} left[/]"
557+
)
558+
continue
528559
else:
529-
if prod_image_params.tag_as_latest:
530-
build_command_result = tag_image_as_latest(prod_image_params, dry_run, verbose)
531-
560+
break
532561
return build_command_result.returncode, f"Image build: {prod_image_params.python}"

dev/breeze/src/airflow_breeze/params/_common_build_params.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ class _CommonBuildParams:
5555
github_token: str = os.environ.get('GITHUB_TOKEN', "")
5656
github_username: str = ""
5757
image_tag: Optional[str] = None
58+
max_retries: Optional[int] = None
5859
install_providers_from_sources: bool = False
5960
platform: str = f"linux/{os.uname().machine}"
6061
prepare_buildx_cache: bool = False

dev/breeze/src/airflow_breeze/utils/common_options.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,12 @@
289289
is_flag=True,
290290
envvar='PREPARE_BUILDX_CACHE',
291291
)
292+
option_max_retries = click.option(
293+
'--max-retries',
294+
help='Maximum number of retries for the operation for "retryable" intermittent problems.',
295+
type=click.IntRange(min=2),
296+
envvar='MAX_RETRIES',
297+
)
292298
option_push_image = click.option(
293299
'--push-image',
294300
help='Push image after building it.',

dev/breeze/src/airflow_breeze/utils/run_utils.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -405,3 +405,11 @@ def get_runnable_ci_image(verbose: bool, dry_run: bool) -> str:
405405
instruction=f"breeze build-image --python {python_version}",
406406
)
407407
return airflow_image
408+
409+
410+
def run_result_contains(result: "RunCommandResult", message: str) -> bool:
    """Tell whether ``message`` occurs in the captured output of a finished command.

    Both the ``stdout`` and ``stderr`` attributes of ``result`` are searched. A stream
    that is ``None`` or empty (for example because it was not captured) never matches.
    Streams captured as bytes (command run without ``text=True``) are decoded before
    the search, so a ``str`` message can be matched without raising ``TypeError``.

    :param result: result of a command run, exposing ``stdout`` and ``stderr`` attributes
    :param message: text to look for
    :return: True if ``message`` is found in either stream, False otherwise
    """
    for stream in (result.stdout, result.stderr):
        if not stream:
            continue
        if isinstance(stream, bytes):
            # Replace undecodable bytes instead of failing: we only need a substring match.
            stream = stream.decode(errors="replace")
        if message in stream:
            return True
    return False

0 commit comments

Comments
 (0)