Skip to content

Commit dc06add

Browse files
committed
Add pipeline message if slurm job exceeds memory
1 parent 37d2065 commit dc06add

File tree

1 file changed

+58
-1
lines changed

1 file changed

+58
-1
lines changed

cluster/src/org/labkey/cluster/pipeline/SlurmExecutionEngine.java

Lines changed: 58 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -220,6 +220,7 @@ protected Pair<String, String> getStatusForJob(ClusterJob job, Container c)
220220
int jobIdx = -1;
221221
int stateIdx = -1;
222222
int hostnameIdx = -1;
223+
int maxRssIdx = -1;
223224
for (String line : ret)
224225
{
225226
line = StringUtils.trimToNull(line);
@@ -235,6 +236,7 @@ protected Pair<String, String> getStatusForJob(ClusterJob job, Container c)
235236
jobIdx = header.indexOf("JOBID");
236237
stateIdx = header.indexOf("STATE");
237238
hostnameIdx = header.indexOf("NODELIST");
239+
maxRssIdx = header.indexOf("MAXRSS");
238240

239241
if (stateIdx == -1)
240242
{
@@ -274,6 +276,16 @@ else if (headerFound)
274276
}
275277
}
276278
}
279+
280+
if (maxRssIdx > -1)
281+
{
282+
long bytes = FileSizeFormatter.convertStringRepresentationToBytes(tokens[maxRssIdx]);
283+
long requestInBytes = FileSizeFormatter.convertBytesToUnit(getConfig().getRequestMemory(), 'G');
284+
if (bytes > requestInBytes)
285+
{
286+
info = "Job exceeded memory: " + FileSizeFormatter.convertBytesToUnit(bytes, 'G');
287+
}
288+
}
277289
}
278290
catch (Exception e)
279291
{
@@ -535,7 +547,8 @@ public static enum StatusType
535547
SE("Error", PipelineJob.TaskStatus.error, Arrays.asList("SPECIAL_EXIT")),
536548
ST("Stopped", PipelineJob.TaskStatus.error),
537549
S("Suspended", PipelineJob.TaskStatus.waiting, null, "Job suspended"),
538-
TO("Timeout", PipelineJob.TaskStatus.error, null, "Job timeout");
550+
TO("Timeout", PipelineJob.TaskStatus.error, null, "Job timeout"),
551+
OOM("Out of Memory", PipelineJob.TaskStatus.error, Arrays.asList("OUT_OF_MEMORY"), "Out of Memory");
539552

540553
private Set<String> _aliases = new CaseInsensitiveHashSet();
541554
private String _labkeyStatus;
@@ -687,4 +700,48 @@ private Pair<String, String> getStatusFromQueue(ClusterJob job)
687700

688701
return null;
689702
}
703+
704+
// Based on: https://stackoverflow.com/questions/3758606/how-can-i-convert-byte-size-into-a-human-readable-format-in-java
705+
private static class FileSizeFormatter
706+
{
707+
public static long convertStringRepresentationToBytes(final String value)
708+
{
709+
try
710+
{
711+
char unit = value.toUpperCase().charAt(value.length() - 1);
712+
long sizeFactor = getSizeFactor(unit);
713+
long size = Long.parseLong(value.substring(0, value.length() - 1));
714+
715+
return size * sizeFactor;
716+
}
717+
catch (Exception e)
718+
{
719+
throw new IllegalArgumentException("Improper size string: " + value, e);
720+
}
721+
}
722+
723+
public static long convertBytesToUnit(final long bytes, final char unit)
724+
{
725+
long sizeFactor = getSizeFactor(unit);
726+
727+
return bytes / sizeFactor;
728+
}
729+
730+
private static long getSizeFactor(char unit)
731+
{
732+
final long K = 1024;
733+
final long M = K * K;
734+
final long G = M * K;
735+
final long T = G * K;
736+
737+
return switch (unit)
738+
{
739+
case 'K' -> K;
740+
case 'M' -> M;
741+
case 'G' -> G;
742+
case 'T' -> T;
743+
default -> 1;
744+
};
745+
}
746+
}
690747
}

0 commit comments

Comments
 (0)