Skip to content

Commit b7cebee

Browse files
committed
Allow docker to auto-retry after common failures
1 parent e6cf76a commit b7cebee

File tree

1 file changed

+51
-1
lines changed

1 file changed

+51
-1
lines changed

SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/run/DockerWrapper.java

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
package org.labkey.api.sequenceanalysis.run;
22

3+
import org.apache.commons.collections4.list.UnmodifiableList;
34
import org.apache.commons.io.FileUtils;
45
import org.apache.commons.lang3.StringUtils;
56
import org.apache.logging.log4j.Logger;
@@ -34,6 +35,7 @@ public class DockerWrapper extends AbstractCommandWrapper
3435
private boolean _useLocalContainerStorage;
3536
private String _alternateUserHome = null;
3637
private final Map<String, String> _dockerEnvironment = new HashMap<>();
38+
private int _maxRetries = 3;
3739

3840
public DockerWrapper(String containerName, Logger log, PipelineContext ctx)
3941
{
@@ -199,7 +201,7 @@ public void executeWithDocker(List<String> containerArgs, File workDir, Pipeline
199201

200202
localBashScript.setExecutable(true);
201203
dockerBashScript.setExecutable(true);
202-
execute(Arrays.asList("/bin/bash", localBashScript.getPath()));
204+
executeWithRetry(Arrays.asList("/bin/bash", localBashScript.getPath()));
203205

204206
if (_useLocalContainerStorage)
205207
{
@@ -214,6 +216,54 @@ public void executeWithDocker(List<String> containerArgs, File workDir, Pipeline
214216
}
215217
}
216218

219+
public int getMaxRetries()
220+
{
221+
return _maxRetries;
222+
}
223+
224+
// NOTE: when running on a shared/cluster environment with multiple containers initializing concurrently, conflicts can result in these error codes.
225+
// As a convenience, build in auto-retry behavior if one of these occurs
226+
// Exit codes that trigger an automatic retry. Declared static because the list is immutable and identical for every wrapper instance.
private static final List<Integer> ALLOWABLE_FAIL_CODES = new UnmodifiableList<>(Arrays.asList(125, 127));
227+
228+
private void executeWithRetry(final List<String> args) throws PipelineJobException
229+
{
230+
int retries = 0;
231+
while (retries <= getMaxRetries())
232+
{
233+
try
234+
{
235+
execute(args);
236+
break;
237+
}
238+
catch (PipelineJobException e)
239+
{
240+
if (ALLOWABLE_FAIL_CODES.contains(getLastReturnCode()))
241+
{
242+
retries++;
243+
if (retries > getMaxRetries())
244+
{
245+
getLogger().info("Maximum retries exceeded");
246+
throw e;
247+
}
248+
249+
getLogger().info("Exit code " + getLastReturnCode() + ", retrying after 1 sec (" + retries + " of " + getMaxRetries()+ ")");
250+
try
251+
{
252+
Thread.sleep(1000);
253+
}
254+
catch (InterruptedException ex)
255+
{
256+
throw new PipelineJobException(ex);
257+
}
258+
}
259+
else
260+
{
261+
throw e;
262+
}
263+
}
264+
}
265+
}
266+
217267
private String getEffectiveContainerName()
218268
{
219269
return _containerName;

0 commit comments

Comments
 (0)