@@ -247,6 +247,8 @@ protected Pair<String, String> getStatusForJob(ClusterJob job, Container c)
247247 int stateIdx = -1 ;
248248 int hostnameIdx = -1 ;
249249 int maxRssIdx = -1 ;
250+ int reqMemIdx = -1 ;
251+ String reqMem = null ;
250252 for (String line : ret )
251253 {
252254 line = StringUtils .trimToNull (line );
@@ -263,6 +265,7 @@ protected Pair<String, String> getStatusForJob(ClusterJob job, Container c)
263265 stateIdx = header .indexOf ("STATE" );
264266 hostnameIdx = header .indexOf ("NODELIST" );
265267 maxRssIdx = header .indexOf ("MAXRSS" );
268+ reqMemIdx = header .indexOf ("REQMEM" );
266269
267270 if (stateIdx == -1 )
268271 {
@@ -303,6 +306,16 @@ else if (headerFound)
303306 }
304307 }
305308
309+ if (reqMemIdx > -1 && reqMemIdx < tokens .length )
310+ {
311+ String val = StringUtils .trimToNull (tokens [reqMemIdx ]);
312+ if (val != null )
313+ {
314+ reqMem = val ;
315+ }
316+
317+ }
318+
306319 // NOTE: if the line has blank ending columns, trimmed lines might lack that value
307320 if ((job .getClusterId () + ".0" ).equals (id ) && maxRssIdx > -1 && maxRssIdx < tokens .length )
308321 {
@@ -312,21 +325,28 @@ else if (headerFound)
312325 if (maxRSS != null )
313326 {
314327 double bytes = FileSizeFormatter .convertStringRepresentationToBytes (maxRSS );
315- double requestInBytes = FileSizeFormatter .convertStringRepresentationToBytes (getConfig ().getRequestMemory () + "G" ); //request is always GB
316- if (bytes > requestInBytes )
328+ if (reqMem == null )
317329 {
318- info = "Job exceeded memory, max was: " + FileSizeFormatter .convertBytesToUnit (bytes , 'G' ) + "G, requested memory was: " + getConfig ().getRequestMemory () + "G" ;
319-
320- PipelineStatusFile sf = PipelineService .get ().getStatusFile (job .getJobId ());
321- if (sf != null )
330+ _log .warn ("Unable to find ReqMem for slurm job: " + job .getClusterId ());
331+ }
332+ else
333+ {
334+ double requestInBytes = FileSizeFormatter .convertStringRepresentationToBytes (reqMem );
335+ if (bytes > requestInBytes )
322336 {
323- try (PrintWriter writer = PrintWriters .getPrintWriter (new File (sf .getFilePath ()), StandardOpenOption .APPEND ))
324- {
325- writer .println (info + ". Raw slurm value: " + maxRSS );
326- }
327- catch (FileNotFoundException e )
337+ info = "Job exceeded memory, max was: " + FileSizeFormatter .convertBytesToUnit (bytes , 'G' ) + "G, requested memory was: " + FileSizeFormatter .convertBytesToUnit (requestInBytes , 'G' );
338+
339+ PipelineStatusFile sf = PipelineService .get ().getStatusFile (job .getJobId ());
340+ if (sf != null )
328341 {
329- _log .error ("Unable to find log file for job, " + job .getJobId () + ": " + sf .getFilePath ());
342+ try (PrintWriter writer = PrintWriters .getPrintWriter (new File (sf .getFilePath ()), StandardOpenOption .APPEND ))
343+ {
344+ writer .println (info + ". Raw slurm value: " + maxRSS );
345+ }
346+ catch (FileNotFoundException e )
347+ {
348+ _log .error ("Unable to find log file for job, " + job .getJobId () + ": " + sf .getFilePath ());
349+ }
330350 }
331351 }
332352 }
0 commit comments