4343
4444CCACHE_MAXSIZE = '500G'
4545
46-
47-
def retry(ExceptionToCheck, tries=4, delay_s=1, backoff=2):
    """Retry calling the decorated function using an exponential backoff.

    http://www.saltycrane.com/blog/2009/11/trying-out-retry-decorator-python/
    original from: http://wiki.python.org/moin/PythonDecoratorLibrary#Retry

    The wrapped function is called up to ``tries`` times in total. Every
    attempt except the last one swallows ``ExceptionToCheck``, logs a warning
    and sleeps before trying again; the last attempt runs outside of any
    ``try`` so its exception propagates to the caller unchanged. Exceptions
    that do not match ``ExceptionToCheck`` are never retried.

    :param ExceptionToCheck: the exception to check. may be a tuple of
        exceptions to check
    :type ExceptionToCheck: Exception or tuple
    :param tries: number of times to try (not retry) before giving up; a value
        of 1 or less still results in exactly one call
    :type tries: int
    :param delay_s: initial delay between retries in seconds
    :type delay_s: int
    :param backoff: backoff multiplier e.g. value of 2 will double the delay
        each retry
    :type backoff: int
    :return: a decorator that wraps a function with the retry behaviour
    """
    import time
    from functools import wraps

    def decorated_retry(f):
        @wraps(f)
        def f_retry(*args, **kwargs):
            attempts_left, current_delay = tries, delay_s
            while attempts_left > 1:
                try:
                    return f(*args, **kwargs)
                except ExceptionToCheck as e:
                    # %s (not %d) so that a fractional delay such as 0.5 is
                    # reported as-is instead of being truncated to 0.
                    logging.warning("Exception: %s, Retrying in %s seconds...", e, current_delay)
                    time.sleep(current_delay)
                    attempts_left -= 1
                    current_delay *= backoff
            # Final attempt: deliberately unguarded so the failure surfaces.
            return f(*args, **kwargs)
        return f_retry  # true decorator
    return decorated_retry
82-
def under_ci() -> bool:
    """Tell whether the script is running under CI.

    The check is purely environmental: it looks for the ``JOB_NAME`` variable
    in ``os.environ``; the value of the variable is not inspected.

    :return: True if we run in Jenkins.
    """
    return 'JOB_NAME' in os.environ
@@ -114,8 +77,9 @@ def build_docker(platform: str, docker_binary: str, registry: str, num_retries:
11477 :param num_retries: Number of retries to build the docker image
11578 :return: Id of the top level image
11679 """
80+
11781 tag = get_docker_tag (platform = platform , registry = registry )
118- logging .info ("Building docker container tagged '%s' with %s" , tag , docker_binary )
82+ logging .info ("Building container tagged '%s' with %s" , tag , docker_binary )
11983 #
12084 # We add a user with the same group as the executing non-root user so files created in the
12185 # container match permissions of the local user. Same for the group.
@@ -127,24 +91,40 @@ def build_docker(platform: str, docker_binary: str, registry: str, num_retries:
12791 # docker pull see: docker_cache.load_docker_cache
12892 #
12993 # This doesn't work with multi head docker files.
130- #
131- cmd = [docker_binary , "build" ,
132- "-f" , get_dockerfile (platform ),
133- "--build-arg" , "USER_ID={}" .format (os .getuid ()),
134- "--build-arg" , "GROUP_ID={}" .format (os .getgid ()),
135- "--cache-from" , tag ,
136- "-t" , tag ,
137- "docker" ]
138-
139- @retry (subprocess .CalledProcessError , tries = num_retries )
140- def run_cmd ():
94+ #
95+
96+ for i in range (num_retries ):
97+ logging .info ('%d out of %d tries to build the docker image.' , i + 1 , num_retries )
98+
99+ cmd = [docker_binary , "build" ,
100+ "-f" , get_dockerfile (platform ),
101+ "--build-arg" , "USER_ID={}" .format (os .getuid ()),
102+ "--build-arg" , "GROUP_ID={}" .format (os .getgid ()),
103+ "--cache-from" , tag ,
104+ "-t" , tag ,
105+ "docker" ]
141106 logging .info ("Running command: '%s'" , ' ' .join (cmd ))
142- check_call (cmd )
107+ try :
108+ check_call (cmd )
109+ # Docker build was successful. Call break to break out of the retry mechanism
110+ break
111+ except subprocess .CalledProcessError as e :
112+ saved_exception = e
113+ logging .error ('Failed to build docker image' )
114+ # Building the docker image failed. Call continue to trigger the retry mechanism
115+ continue
116+ else :
117+ # Num retries exceeded
118+ logging .exception ('Exception during build of docker image' , saved_exception )
119+ logging .fatal ('Failed to build the docker image, aborting...' )
120+ sys .exit (1 )
143121
144- run_cmd ()
145122 # Get image id by reading the tag. It's guaranteed (except race condition) that the tag exists. Otherwise, the
146123 # check_call would have failed
147- return _get_local_image_id (docker_binary = docker_binary , docker_tag = tag )
124+ image_id = _get_local_image_id (docker_binary = docker_binary , docker_tag = tag )
125+ if not image_id :
126+ raise FileNotFoundError ('Unable to find docker image id matching with {}' .format (tag ))
127+ return image_id
148128
149129
150130def _get_local_image_id (docker_binary , docker_tag ):
@@ -156,8 +136,6 @@ def _get_local_image_id(docker_binary, docker_tag):
156136 cmd = [docker_binary , "images" , "-q" , docker_tag ]
157137 image_id_b = subprocess .check_output (cmd )
158138 image_id = image_id_b .decode ('utf-8' ).strip ()
159- if not image_id :
160- raise RuntimeError ('Unable to find docker image id matching with tag {}' .format (tag ))
161139 return image_id
162140
163141
@@ -208,7 +186,7 @@ def container_run(platform: str,
208186 '-e' , "CCACHE_LOGFILE=/tmp/ccache.log" , # a container-scoped log, useful for ccache verification.
209187 tag ]
210188 runlist .extend (command )
211- cmd = ' \\ \n \t ' .join (runlist )
189+ cmd = '\\ \n \t ' .join (runlist )
212190 ret = 0
213191 if not dry_run and not interactive :
214192 logging .info ("Running %s in container %s" , command , tag )
@@ -221,14 +199,14 @@ def container_run(platform: str,
221199 # -ti can't be after the tag, as is interpreted as a command so hook it up after the -u argument
222200 idx = into_cmd .index ('-u' ) + 2
223201 into_cmd [idx :idx ] = ['-ti' ]
224- cmd = ' \\ \n \t ' .join (into_cmd )
202+ cmd = '\\ \n \t ' .join (into_cmd )
225203 logging .info ("Executing:\n %s\n " , cmd )
226204 docker_run_cmd = ' ' .join (into_cmd )
227205 ret = call (into_cmd )
228206
229207 if not dry_run and not interactive and ret != 0 :
230208 logging .error ("Running of command in container failed (%s):\n %s\n " , ret , cmd )
231- logging .error ("You can get into the container by adding the -i option to this script " )
209+ logging .error ("You can get into the container by adding the -i option" )
232210 raise subprocess .CalledProcessError (ret , cmd )
233211
234212 return docker_run_cmd
@@ -325,6 +303,7 @@ def use_cache():
325303 command = list (chain (* args .command ))
326304 docker_binary = get_docker_binary (args .nvidiadocker )
327305 shared_memory_size = args .shared_memory_size
306+ num_docker_build_retires = args .docker_build_retries
328307
329308 if args .list :
330309 list_platforms ()
@@ -333,7 +312,7 @@ def use_cache():
333312 tag = get_docker_tag (platform = platform , registry = args .docker_registry )
334313 if use_cache ():
335314 load_docker_cache (tag = tag , docker_registry = args .docker_registry )
336- build_docker (platform , docker_binary , registry = args .docker_registry , num_retries = args . docker_build_retries )
315+ build_docker (platform , docker_binary , registry = args .docker_registry , num_retries = num_docker_build_retires )
337316 if args .build_only :
338317 logging .warning ("Container was just built. Exiting due to build-only." )
339318 return 0
@@ -367,7 +346,7 @@ def use_cache():
367346 tag = get_docker_tag (platform = platform , registry = args .docker_registry )
368347 if use_cache ():
369348 load_docker_cache (tag = tag , docker_registry = args .docker_registry )
370- build_docker (platform , docker_binary , args .docker_registry , num_retries = args . docker_build_retries )
349+ build_docker (platform , docker_binary , args .docker_registry , num_retries = num_docker_build_retires )
371350 if args .build_only :
372351 continue
373352 build_platform = "build_{}" .format (platform )
0 commit comments