Skip to content

Commit 43071d3

Browse files
committed
Attempt to rescue jobs that Slurm incorrectly marked as ERROR due to memory overages even though the job actually completed.
1 parent 13122e1 commit 43071d3

File tree

1 file changed

+3
-4
lines changed

1 file changed

+3
-4
lines changed

cluster/src/org/labkey/cluster/pipeline/AbstractClusterExecutionEngine.java

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -480,7 +480,7 @@ protected Collection<ClusterJob> getJobsToCheck(boolean includeConflicting, Coll
480480
/**
481481
* This expects the status to have already been normalized from cluster codes to a LabKey (LK) TaskStatus.
482482
*/
483-
protected void updateJobStatus(@Nullable String status, ClusterJob j, @Nullable String info) throws PipelineJobException
483+
protected synchronized void updateJobStatus(@Nullable String status, ClusterJob j, @Nullable String info) throws PipelineJobException
484484
{
485485
//update DB
486486
boolean statusChanged = (status != null && !status.equals(j.getStatus()));
@@ -592,9 +592,8 @@ else if (pj.getActiveTaskStatus() == PipelineJob.TaskStatus.complete)
592592
pj.getLogger().info("Pipeline job JSON marked complete, but the cluster status was: " + taskStatus + ", status file was: " + (sf == null ? null : sf.getStatus()) + ", cluster job: " + mostRecent.getStatus(), new Exception());
593593
if (taskStatus == PipelineJob.TaskStatus.error && PipelineJob.TaskStatus.running.matches(sf.getStatus()))
594594
{
595-
// TODO: consider updating this?
596-
//pj.getLogger().info("Ignoring ERROR status and deferring to pipeline JSON status of " + pj.getActiveTaskStatus());
597-
//taskStatus = PipelineJob.TaskStatus.complete;
595+
pj.getLogger().info("Ignoring ERROR status and deferring to pipeline JSON status of " + pj.getActiveTaskStatus());
596+
taskStatus = PipelineJob.TaskStatus.complete;
598597
}
599598
}
600599

0 commit comments

Comments
 (0)