11package org .labkey .api .sequenceanalysis .run ;
22
3+ import org .apache .commons .collections4 .list .UnmodifiableList ;
34import org .apache .commons .io .FileUtils ;
45import org .apache .commons .lang3 .StringUtils ;
56import org .apache .logging .log4j .Logger ;
@@ -34,6 +35,7 @@ public class DockerWrapper extends AbstractCommandWrapper
3435 private boolean _useLocalContainerStorage ;
3536 private String _alternateUserHome = null ;
3637 private final Map <String , String > _dockerEnvironment = new HashMap <>();
38+ private int _maxRetries = 3 ;
3739
3840 public DockerWrapper (String containerName , Logger log , PipelineContext ctx )
3941 {
@@ -199,7 +201,7 @@ public void executeWithDocker(List<String> containerArgs, File workDir, Pipeline
199201
200202 localBashScript .setExecutable (true );
201203 dockerBashScript .setExecutable (true );
202- execute (Arrays .asList ("/bin/bash" , localBashScript .getPath ()));
204+ executeWithRetry (Arrays .asList ("/bin/bash" , localBashScript .getPath ()));
203205
204206 if (_useLocalContainerStorage )
205207 {
@@ -214,6 +216,54 @@ public void executeWithDocker(List<String> containerArgs, File workDir, Pipeline
214216 }
215217 }
216218
219+ public int getMaxRetries ()
220+ {
221+ return _maxRetries ;
222+ }
223+
224+ // NOTE: when running on a shared/cluster environment with multiple containers initializing concurrently, conflicts can result in these error codes.
225+ // As a convenience, build in auto-retry behavior if one of these occurs
226+ private final List <Integer > ALLOWABLE_FAIL_CODES = new UnmodifiableList <>(Arrays .asList (125 , 127 ));
227+
228+ private void executeWithRetry (final List <String > args ) throws PipelineJobException
229+ {
230+ int retries = 0 ;
231+ while (retries <= getMaxRetries ())
232+ {
233+ try
234+ {
235+ execute (args );
236+ break ;
237+ }
238+ catch (PipelineJobException e )
239+ {
240+ if (ALLOWABLE_FAIL_CODES .contains (getLastReturnCode ()))
241+ {
242+ retries ++;
243+ if (retries > getMaxRetries ())
244+ {
245+ getLogger ().info ("Maximum retries exceeded" );
246+ throw e ;
247+ }
248+
249+ getLogger ().info ("Exit code " + getLastReturnCode () + ", retrying after 1 sec (" + retries + " of " + getMaxRetries ()+ ")" );
250+ try
251+ {
252+ Thread .sleep (1000 );
253+ }
254+ catch (InterruptedException ex )
255+ {
256+ throw new PipelineJobException (ex );
257+ }
258+ }
259+ else
260+ {
261+ throw e ;
262+ }
263+ }
264+ }
265+ }
266+
217267 private String getEffectiveContainerName ()
218268 {
219269 return _containerName ;
0 commit comments