Skip to content

Commit 3718413

Browse files
committed
Improve cluster/slurm exception reporting
1 parent 8c80884 commit 3718413

File tree

2 files changed

+80
-56
lines changed

2 files changed

+80
-56
lines changed

cluster/src/org/labkey/cluster/ClusterManager.java

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -134,7 +134,7 @@ public void execute(JobExecutionContext context) throws JobExecutionException
134134
}
135135
catch (Exception ex)
136136
{
137-
_log.error(ex);
137+
_log.error(ex.getMessage(), ex);
138138
}
139139
}
140140
}

cluster/src/org/labkey/cluster/pipeline/SlurmExecutionEngine.java

Lines changed: 79 additions & 55 deletions
Original file line number | Diff line number | Diff line change
@@ -127,12 +127,6 @@ protected Set<String> updateStatusForAllJobs() throws PipelineJobException
127127
stateIdx = header.indexOf("STATE");
128128
hostnameIdx = header.indexOf("NODELIST");
129129

130-
continue;
131-
}
132-
133-
if (headerFound)
134-
{
135-
String[] tokens = line.split("( )+");
136130
if (stateIdx == -1)
137131
{
138132
_log.error("Unable to find STATE in header: " + StringUtils.join(header, ", "));
@@ -145,28 +139,42 @@ protected Set<String> updateStatusForAllJobs() throws PipelineJobException
145139
break;
146140
}
147141

148-
String id = StringUtils.trimToNull(tokens[jobIdx]);
149-
if (id != null)
142+
continue;
143+
}
144+
145+
if (headerFound)
146+
{
147+
try
150148
{
151-
ClusterJob j = getClusterSubmission(id);
152-
if (j == null)
153-
{
154-
//it is allowable for the same user to submit jobs outside of LK
155-
//_log.error("unable to find slurm submission matching: " + id);
156-
}
157-
else
149+
String[] tokens = line.split("( )+");
150+
String id = StringUtils.trimToNull(tokens[jobIdx]);
151+
if (id != null)
158152
{
159-
String hostname = StringUtils.trimToNull(tokens[hostnameIdx]);
160-
if (hostname != null)
153+
ClusterJob j = getClusterSubmission(id);
154+
if (j == null)
161155
{
162-
j.setHostname(hostname);
156+
//it is allowable for the same user to submit jobs outside of LK
157+
//_log.error("unable to find slurm submission matching: " + id);
158+
}
159+
else
160+
{
161+
String hostname = StringUtils.trimToNull(tokens[hostnameIdx]);
162+
if (hostname != null)
163+
{
164+
j.setHostname(hostname);
165+
}
166+
167+
Pair<String, String> status = translateSlurmStatusToTaskStatus(StringUtils.trimToNull(tokens[stateIdx]));
168+
updateJobStatus(status == null ? null : status.first, j, status == null ? null : status.second);
169+
jobsUpdated.add(j.getClusterId());
163170
}
164-
165-
Pair<String, String> status = translateSlurmStatusToTaskStatus(StringUtils.trimToNull(tokens[stateIdx]));
166-
updateJobStatus(status == null ? null : status.first, j, status == null ? null : status.second);
167-
jobsUpdated.add(j.getClusterId());
168171
}
169172
}
173+
catch (Exception e)
174+
{
175+
_log.error("Error parsing line: " + line);
176+
throw e;
177+
}
170178
}
171179
}
172180

@@ -218,13 +226,7 @@ protected Pair<String, String> getStatusForJob(ClusterJob job, Container c)
218226
jobIdx = header.indexOf("JOBID");
219227
stateIdx = header.indexOf("STATE");
220228
hostnameIdx = header.indexOf("NODELIST");
221-
}
222-
else if (foundJobLine && line.startsWith("------------"))
223-
{
224-
headerFound = true;
225-
}
226-
else if (headerFound)
227-
{
229+
228230
if (stateIdx == -1)
229231
{
230232
_log.error("Unable to find STATE in header: " + StringUtils.join(header, ", "));
@@ -236,25 +238,39 @@ else if (headerFound)
236238
_log.error("Unable to find JOBID in header: " + StringUtils.join(header, ", "));
237239
break;
238240
}
239-
240-
String[] tokens = line.split("( )+");
241-
String id = StringUtils.trimToNull(tokens[jobIdx]);
242-
if (id.equals(job.getClusterId()))
241+
}
242+
else if (foundJobLine && line.startsWith("------------"))
243+
{
244+
headerFound = true;
245+
}
246+
else if (headerFound)
247+
{
248+
try
243249
{
244-
statuses.add(StringUtils.trimToNull(tokens[stateIdx]));
245-
}
250+
String[] tokens = line.split("( )+");
251+
String id = StringUtils.trimToNull(tokens[jobIdx]);
252+
if (id.equals(job.getClusterId()))
253+
{
254+
statuses.add(StringUtils.trimToNull(tokens[stateIdx]));
255+
}
246256

247-
if (hostnameIdx > -1)
248-
{
249-
String hostname = StringUtils.trimToNull(tokens[hostnameIdx]);
250-
if (hostname != null)
257+
if (hostnameIdx > -1)
251258
{
252-
if (job.getHostname() == null || !job.getHostname().equals(hostname))
259+
String hostname = StringUtils.trimToNull(tokens[hostnameIdx]);
260+
if (hostname != null)
253261
{
254-
job.setHostname(hostname);
262+
if (job.getHostname() == null || !job.getHostname().equals(hostname))
263+
{
264+
job.setHostname(hostname);
265+
}
255266
}
256267
}
257268
}
269+
catch (Exception e)
270+
{
271+
_log.error("Error parsing line: " + line);
272+
throw e;
273+
}
258274
}
259275
}
260276

@@ -591,12 +607,6 @@ private Pair<String, String> getStatusFromQueue(ClusterJob job)
591607
stateIdx = header.indexOf("STATE");
592608
hostnameIdx = header.indexOf("NODELIST");
593609

594-
continue;
595-
}
596-
597-
if (headerFound)
598-
{
599-
String[] tokens = line.split("( )+");
600610
if (stateIdx == -1)
601611
{
602612
_log.error("Unable to find STATE in header: " + StringUtils.join(header, ", "));
@@ -609,19 +619,33 @@ private Pair<String, String> getStatusFromQueue(ClusterJob job)
609619
break;
610620
}
611621

612-
String id = StringUtils.trimToNull(tokens[jobIdx]);
613-
if (job.getClusterId().equals(id))
622+
continue;
623+
}
624+
625+
if (headerFound)
626+
{
627+
try
614628
{
615-
if (hostnameIdx > -1)
629+
String[] tokens = line.split("( )+");
630+
String id = StringUtils.trimToNull(tokens[jobIdx]);
631+
if (job.getClusterId().equals(id))
616632
{
617-
String hostname = StringUtils.trimToNull(tokens[hostnameIdx]);
618-
if (hostname != null)
633+
if (hostnameIdx > -1)
619634
{
620-
job.setHostname(hostname);
635+
String hostname = StringUtils.trimToNull(tokens[hostnameIdx]);
636+
if (hostname != null)
637+
{
638+
job.setHostname(hostname);
639+
}
621640
}
622-
}
623641

624-
return translateSlurmStatusToTaskStatus(StringUtils.trimToNull(tokens[stateIdx]));
642+
return translateSlurmStatusToTaskStatus(StringUtils.trimToNull(tokens[stateIdx]));
643+
}
644+
}
645+
catch (Exception e)
646+
{
647+
_log.error("Error parsing line: " + line);
648+
throw e;
625649
}
626650
}
627651
}

0 commit comments

Comments
 (0)