Skip to content

Commit 44aa8ba

Browse files
committed
Parse requested memory (ReqMem) and memory used (MaxRSS) for Slurm jobs
1 parent 9a4feeb commit 44aa8ba

File tree

1 file changed

+32
-12
lines changed

1 file changed

+32
-12
lines changed

cluster/src/org/labkey/cluster/pipeline/SlurmExecutionEngine.java

Lines changed: 32 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -247,6 +247,8 @@ protected Pair<String, String> getStatusForJob(ClusterJob job, Container c)
247247
int stateIdx = -1;
248248
int hostnameIdx = -1;
249249
int maxRssIdx = -1;
250+
int reqMemIdx = -1;
251+
String reqMem = null;
250252
for (String line : ret)
251253
{
252254
line = StringUtils.trimToNull(line);
@@ -263,6 +265,7 @@ protected Pair<String, String> getStatusForJob(ClusterJob job, Container c)
263265
stateIdx = header.indexOf("STATE");
264266
hostnameIdx = header.indexOf("NODELIST");
265267
maxRssIdx = header.indexOf("MAXRSS");
268+
reqMemIdx = header.indexOf("REQMEM");
266269

267270
if (stateIdx == -1)
268271
{
@@ -303,6 +306,16 @@ else if (headerFound)
303306
}
304307
}
305308

309+
if (reqMemIdx > -1 && reqMemIdx < tokens.length)
310+
{
311+
String val = StringUtils.trimToNull(tokens[reqMemIdx]);
312+
if (val != null)
313+
{
314+
reqMem = val;
315+
}
316+
317+
}
318+
306319
// NOTE: if the line has blank ending columns, trimmed lines might lack that value
307320
if ((job.getClusterId() + ".0").equals(id) && maxRssIdx > -1 && maxRssIdx < tokens.length)
308321
{
@@ -312,21 +325,28 @@ else if (headerFound)
312325
if (maxRSS != null)
313326
{
314327
double bytes = FileSizeFormatter.convertStringRepresentationToBytes(maxRSS);
315-
double requestInBytes = FileSizeFormatter.convertStringRepresentationToBytes(getConfig().getRequestMemory() + "G"); //request is always GB
316-
if (bytes > requestInBytes)
328+
if (reqMem == null)
317329
{
318-
info = "Job exceeded memory, max was: " + FileSizeFormatter.convertBytesToUnit(bytes, 'G') + "G, requested memory was: " + getConfig().getRequestMemory() + "G";
319-
320-
PipelineStatusFile sf = PipelineService.get().getStatusFile(job.getJobId());
321-
if (sf != null)
330+
_log.warn("Unable to find ReqMem for slurm job: " + job.getClusterId());
331+
}
332+
else
333+
{
334+
double requestInBytes = FileSizeFormatter.convertStringRepresentationToBytes(reqMem);
335+
if (bytes > requestInBytes)
322336
{
323-
try (PrintWriter writer = PrintWriters.getPrintWriter(new File(sf.getFilePath()), StandardOpenOption.APPEND))
324-
{
325-
writer.println(info + ". Raw slurm value: " + maxRSS);
326-
}
327-
catch (FileNotFoundException e)
337+
info = "Job exceeded memory, max was: " + FileSizeFormatter.convertBytesToUnit(bytes, 'G') + "G, requested memory was: " + FileSizeFormatter.convertBytesToUnit(requestInBytes, 'G');
338+
339+
PipelineStatusFile sf = PipelineService.get().getStatusFile(job.getJobId());
340+
if (sf != null)
328341
{
329-
_log.error("Unable to find log file for job, " + job.getJobId() + ": " + sf.getFilePath());
342+
try (PrintWriter writer = PrintWriters.getPrintWriter(new File(sf.getFilePath()), StandardOpenOption.APPEND))
343+
{
344+
writer.println(info + ". Raw slurm value: " + maxRSS);
345+
}
346+
catch (FileNotFoundException e)
347+
{
348+
_log.error("Unable to find log file for job, " + job.getJobId() + ": " + sf.getFilePath());
349+
}
330350
}
331351
}
332352
}

0 commit comments

Comments
 (0)