Skip to content

Commit 172c23d

Browse files
committed
Improve reporting for cluster errors
1 parent 49ce0ec commit 172c23d

File tree

5 files changed

+67
-6
lines changed

5 files changed

+67
-6
lines changed

cluster/api-src/org/labkey/api/cluster/ClusterService.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,12 @@ public static void setInstance(ClusterService instance)
3434
* This creates a barebones PipelineJob configured to run on the selected RemoteExecutionEngine.
3535
* You may wish to further configure this job. This method does not submit the job, which you can do using PipelineService.queueJob()
3636
*/
37-
abstract public PipelineJob createClusterRemotePipelineJob(Container c, User u, String jobName, RemoteExecutionEngine engine, ClusterRemoteTask task, File logFile) throws PipelineValidationException;
37+
abstract public PipelineJob createClusterRemotePipelineJob(Container c, User u, String jobName, RemoteExecutionEngine<?> engine, ClusterRemoteTask task, File logFile) throws PipelineValidationException;
3838

3939
abstract public File getSerializedJobFile(File jobLogFile);
4040

41+
/**
 * Returns the submit script file that the cluster execution engine is expected to use for the given job.
 * NOTE(review): the implementation shown in this commit returns null when no matching engine is found,
 * so callers should presumably null-check the result - confirm against other implementations.
 */
abstract public File getExpectedSubmitScript(PipelineJob job);
42+
4143
public interface ClusterRemoteTask extends Serializable
4244
{
4345
public void run(Logger log);

cluster/src/org/labkey/cluster/ClusterModule.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
import org.labkey.api.view.WebPartFactory;
3838
import org.labkey.cluster.pipeline.ClusterPipelineJobNotificationProvider;
3939
import org.labkey.cluster.pipeline.ClusterPipelineProvider;
40+
import org.labkey.cluster.pipeline.SlurmExecutionEngine;
4041
import org.labkey.cluster.pipeline.TestCase;
4142
import org.labkey.cluster.query.ForceCancelJobsButton;
4243
import org.labkey.cluster.query.RecoverCompletedJobsButton;
@@ -128,6 +129,14 @@ public Set<Class> getIntegrationTests()
128129
return testClasses;
129130
}
130131

132+
/**
 * Lists the unit test classes contributed by this module.
 *
 * @return a new mutable set containing only SlurmExecutionEngine.TestCase
 */
@Override
133+
public @NotNull Set<Class> getUnitTests()
134+
{
135+
return new HashSet<>(Arrays.asList(
136+
SlurmExecutionEngine.TestCase.class
137+
));
138+
}
139+
131140
@Override
132141
protected void registerSchemas()
133142
{

cluster/src/org/labkey/cluster/ClusterServiceImpl.java

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
import org.labkey.api.security.User;
1111
import org.labkey.cluster.pipeline.AbstractClusterExecutionEngine;
1212
import org.labkey.cluster.pipeline.ClusterPipelineJob;
13+
import org.labkey.cluster.pipeline.HTCondorExecutionEngine;
14+
import org.labkey.cluster.pipeline.SlurmExecutionEngine;
1315

1416
import java.io.File;
1517
import java.util.ArrayList;
@@ -82,7 +84,28 @@ public File getSerializedJobFile(File jobLogFile)
8284
}
8385

8486
@Override
85-
public PipelineJob createClusterRemotePipelineJob(Container c, User u, String jobName, RemoteExecutionEngine engine, ClusterRemoteTask task, File logFile) throws PipelineValidationException
87+
/**
 * Resolves the submit script expected for a job by dispatching on the execution location
 * of the job's active task factory: the Slurm and HTCondor engines each supply their own
 * static getExpectedSubmitScript(job).
 *
 * @param job the pipeline job whose submit script should be located
 * @return the expected submit script, or null (after logging an error to the job's logger)
 *         when the job has no active task factory or its execution location matches neither engine type
 */
public File getExpectedSubmitScript(PipelineJob job)
88+
{
89+
if (job.getActiveTaskFactory() != null)
90+
{
91+
String location = job.getActiveTaskFactory().getExecutionLocation();
92+
if (SlurmExecutionEngine.TYPE.equals(location))
93+
{
94+
return SlurmExecutionEngine.getExpectedSubmitScript(job);
95+
}
96+
else if (HTCondorExecutionEngine.TYPE.equals(location))
97+
{
98+
return HTCondorExecutionEngine.getExpectedSubmitScript(job);
99+
}
100+
}
101+
102+
// Fall-through: no factory, or a location that neither engine claims.
// NOTE(review): the " null" literal carries a leading space, so the message prints a double space
// after "factory:"; getActiveTaskFactory() is also re-invoked here rather than cached - consider tidying.
job.getLogger().error("Unable to find appropriate remote execution engine for job: " + job.getJobGUID() + ", with active task factory: " + (job.getActiveTaskFactory() != null ? job.getActiveTaskFactory().getId() : " null"));
103+
104+
return null;
105+
}
106+
107+
@Override
108+
/**
 * Creates a pipeline job for the given remote task by delegating to ClusterPipelineJob.createJob.
 * Note that the delegate takes the task before the engine, the reverse of this method's parameter order.
 *
 * @throws PipelineValidationException declared by this method; presumably propagated from the delegate - confirm
 */
public PipelineJob createClusterRemotePipelineJob(Container c, User u, String jobName, RemoteExecutionEngine<?> engine, ClusterRemoteTask task, File logFile) throws PipelineValidationException
86109
{
87110
return ClusterPipelineJob.createJob(c, u, jobName, task, engine, logFile);
88111
}

cluster/src/org/labkey/cluster/pipeline/HTCondorExecutionEngine.java

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,7 @@ private File createSubmitScript(PipelineJob job) throws PipelineJobException
167167
//we want this unique for each task, but reused if submitted multiple times
168168
File outDir = job.getLogFile().getParentFile();
169169
String basename = FileUtil.getBaseName(job.getLogFile());
170-
File submitScript = new File(outDir, basename + (job.getActiveTaskId() == null ? "" : "." + job.getActiveTaskId().getNamespaceClass().getSimpleName()) + ".submit");
170+
File submitScript = getExpectedSubmitScript(job);
171171
if (ClusterManager.get().isRecreateSubmitScriptFile() && submitScript.exists())
172172
{
173173
job.getLogger().info("Deleting existing submit script");
@@ -528,4 +528,10 @@ protected boolean removeJob(ClusterJob clusterJob)
528528

529529
return success;
530530
}
531+
532+
/**
 * Computes the path of the HTCondor submit script for a job without creating or checking the file.
 * The script sits next to the job's log file and is named
 * {@code <log base name>[.<simple name of the active task's namespace class>].submit};
 * the task segment is omitted when the job has no active task id.
 * NOTE(review): assumes job.getLogFile() is non-null - confirm against callers.
 *
 * @param job the pipeline job
 * @return the expected submit script location
 */
public static File getExpectedSubmitScript(PipelineJob job)
533+
{
534+
String basename = FileUtil.getBaseName(job.getLogFile());
535+
return new File(job.getLogFile().getParentFile(), basename + (job.getActiveTaskId() == null ? "" : "." + job.getActiveTaskId().getNamespaceClass().getSimpleName()) + ".submit");
536+
}
531537
}

cluster/src/org/labkey/cluster/pipeline/SlurmExecutionEngine.java

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
import org.apache.logging.log4j.Logger;
66
import org.jetbrains.annotations.NotNull;
77
import org.jetbrains.annotations.Nullable;
8+
import org.junit.Assert;
9+
import org.junit.Test;
810
import org.labkey.api.cluster.ClusterResourceAllocator;
911
import org.labkey.api.collections.CaseInsensitiveHashSet;
1012
import org.labkey.api.data.Container;
@@ -281,10 +283,10 @@ else if (headerFound)
281283
if (maxRssIdx > -1 && maxRssIdx < tokens.length)
282284
{
283285
long bytes = FileSizeFormatter.convertStringRepresentationToBytes(tokens[maxRssIdx]);
284-
long requestInBytes = FileSizeFormatter.convertBytesToUnit(getConfig().getRequestMemory(), 'G');
286+
long requestInBytes = FileSizeFormatter.convertStringRepresentationToBytes(getConfig().getRequestMemory() + "G"); //request is always GB
285287
if (bytes > requestInBytes)
286288
{
287-
info = "Job exceeded memory: " + FileSizeFormatter.convertBytesToUnit(bytes, 'G');
289+
info = "Job exceeded memory, max was: " + FileSizeFormatter.convertBytesToUnit(bytes, 'G') + "G";
288290
}
289291
}
290292
}
@@ -361,6 +363,12 @@ protected boolean removeJob(ClusterJob clusterJob)
361363
return success;
362364
}
363365

366+
/**
 * Computes the path of the Slurm submit script for a job without creating or checking the file.
 * The script sits next to the job's log file and is named
 * {@code <log base name>[.<simple name of the active task's namespace class>].slurm.sh};
 * the task segment is omitted when the job has no active task id.
 * NOTE(review): assumes job.getLogFile() is non-null - confirm against callers.
 *
 * @param job the pipeline job
 * @return the expected submit script location
 */
public static File getExpectedSubmitScript(PipelineJob job)
367+
{
368+
String basename = FileUtil.getBaseName(job.getLogFile());
369+
return new File(job.getLogFile().getParentFile(), basename + (job.getActiveTaskId() == null ? "" : "." + job.getActiveTaskId().getNamespaceClass().getSimpleName()) + ".slurm.sh");
370+
}
371+
364372
private File createSubmitScript(PipelineJob job) throws PipelineJobException
365373
{
366374
try
@@ -370,7 +378,7 @@ private File createSubmitScript(PipelineJob job) throws PipelineJobException
370378
//we want this unique for each task, but reused if submitted multiple times
371379
File outDir = job.getLogFile().getParentFile();
372380
String basename = FileUtil.getBaseName(job.getLogFile());
373-
File submitScript = new File(outDir, basename + (job.getActiveTaskId() == null ? "" : "." + job.getActiveTaskId().getNamespaceClass().getSimpleName()) + ".slurm.sh");
381+
File submitScript = getExpectedSubmitScript(job);
374382
if (ClusterManager.get().isRecreateSubmitScriptFile() && submitScript.exists())
375383
{
376384
job.getLogger().info("Deleting existing submit script");
@@ -745,4 +753,17 @@ private static long getSizeFactor(char unit)
745753
};
746754
}
747755
}
756+
757+
/**
 * Unit tests for FileSizeFormatter's conversions between size strings and byte counts.
 */
public static class TestCase
758+
{
759+
/**
 * Round-trips a kilobyte value: "1362624K" must parse to 1395326976 bytes (1362624 * 1024),
 * and converting that byte count back to unit 'K' must yield 1362624 again.
 */
@Test
760+
public void testFileSizeFormatter()
761+
{
762+
long bytes = FileSizeFormatter.convertStringRepresentationToBytes("1362624K");
763+
Assert.assertEquals("Incorrect byte value", 1395326976, bytes);
764+
765+
long val2 = FileSizeFormatter.convertBytesToUnit(bytes, 'K');
766+
Assert.assertEquals("Incorrect string value", 1362624, val2);
767+
}
768+
}
748769
}

0 commit comments

Comments
 (0)