
Commit 6093c5f

Merge discvr-23.7 to develop

2 parents: 2ff18ff + d3b5fae

24 files changed: +772 additions, -287 deletions


SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/pipeline/VariantProcessingStep.java

Lines changed: 2 additions & 2 deletions
@@ -68,9 +68,9 @@ default void validateScatter(ScatterGatherMethod method, PipelineJob job) throws
 
     }
 
-    default void performAdditionalMergeTasks(SequenceOutputHandler.JobContext ctx, PipelineJob job, TaskFileManager manager, ReferenceGenome genome, List<File> orderedScatterOutputs) throws PipelineJobException
+    default void performAdditionalMergeTasks(SequenceOutputHandler.JobContext ctx, PipelineJob job, TaskFileManager manager, ReferenceGenome genome, List<File> orderedScatterOutputs, List<String> orderedJobDirs) throws PipelineJobException
     {
-
+        ctx.getLogger().debug("No additional merge tasks are implemented for: " + getClass().getName());
     }
 
     default boolean doSortAfterMerge()
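
Note: this widened default method is the API change driving most of this merge: scatter-gather steps now receive the scatter job names alongside the merged outputs. A minimal sketch of an override, assuming a hypothetical implementing class (other required interface members omitted; only the signature comes from this commit):

    // Sketch: inside a hypothetical class implementing VariantProcessingStep.SupportsScatterGather.
    // As passed by VariantProcessingRemoteMergeTask, orderedJobDirs has one entry per scatter job,
    // while orderedScatterOutputs holds only the outputs those jobs actually produced, so the two
    // lists are not necessarily parallel.
    @Override
    public void performAdditionalMergeTasks(SequenceOutputHandler.JobContext ctx, PipelineJob job, TaskFileManager manager, ReferenceGenome genome, List<File> orderedScatterOutputs, List<String> orderedJobDirs) throws PipelineJobException
    {
        for (String jobDir : orderedJobDirs)
        {
            ctx.getLogger().debug("performing post-merge work for scatter job: " + jobDir);
        }
    }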

SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/run/AbstractCommandWrapper.java

Lines changed: 1 addition & 1 deletion
@@ -253,7 +253,7 @@ public Logger getLogger()
     {
         if (_log == null)
         {
-            return LogManager.getLogger("NoOpLogger");
+            return LogManager.getLogger("NoOpLogger");
         }
 
         return _log;

SequenceAnalysis/src/org/labkey/sequenceanalysis/analysis/PicardAlignmentMetricsHandler.java

Lines changed: 38 additions & 1 deletion
@@ -25,6 +25,7 @@
 import org.labkey.sequenceanalysis.run.util.CollectInsertSizeMetricsWrapper;
 import org.labkey.sequenceanalysis.run.util.CollectWgsMetricsWithNonZeroCoverageWrapper;
 import org.labkey.sequenceanalysis.run.util.CollectWgsMetricsWrapper;
+import org.labkey.sequenceanalysis.run.util.MarkDuplicatesWrapper;
 
 import java.io.File;
 import java.util.ArrayList;
@@ -54,7 +55,13 @@ public PicardAlignmentMetricsHandler()
                 }}, false),
                 ToolParameterDescriptor.create("collectWgsNonZero", "Run WGS Metrics Over Non-Zero Coverage", "If checked, Picard CollectWgsMetricsWithNonZeroCoverage will be run", "checkbox", new JSONObject(){{
                     put("checked", false);
-                }}, false)
+                }}, false),
+                ToolParameterDescriptor.create("markDuplicates", "Run MarkDuplicates", "If checked, Picard MarkDuplicates will be run", "checkbox", new JSONObject(){{
+                    put("checked", false);
+                }}, false),
+                ToolParameterDescriptor.create("useOutputFileContainer", "Submit to Source File Workbook", "If checked, each job will be submitted to the same workbook as the input file, as opposed to submitting all jobs to the same workbook. This is primarily useful if submitting a large batch of files to process separately. This only applies if 'Run Separately' is selected.", "checkbox", new JSONObject(){{
+                    put("checked", true);
+                }}, true)
         ));
     }
 
@@ -156,6 +163,13 @@ public void processFilesOnWebserver(PipelineJob job, SequenceAnalysisJobSupport
             metricsFiles.add(mf4);
         }
 
+        File mf5 = new MarkDuplicatesWrapper(job.getLogger()).getMetricsFile(m.getAlignmentFileObject());
+        if (mf5.exists())
+        {
+            action.addOutput(mf5, "Duplication Metrics", false);
+            metricsFiles.add(mf5);
+        }
+
         TableInfo ti = SequenceAnalysisManager.get().getTable(SequenceAnalysisSchema.TABLE_QUALITY_METRICS);
         for (File f : metricsFiles)
         {
@@ -195,6 +209,7 @@ public void processFilesRemote(List<SequenceOutputFile> inputFiles, JobContext c
         boolean collectInsertSize = params.optBoolean("collectInsertSize", false);
         boolean collectWgs = params.optBoolean("collectWgs", false);
         boolean collectWgsNonZero = params.optBoolean("collectWgsNonZero", false);
+        boolean runMarkDuplicates = params.optBoolean("markDuplicates", false);
 
         int i = 1;
         for (SequenceOutputFile o : inputFiles)
@@ -244,6 +259,28 @@
             CollectInsertSizeMetricsWrapper wrapper = new CollectInsertSizeMetricsWrapper(job.getLogger());
             wrapper.executeCommand(o.getFile(), metricsFile, metricsHistogram);
         }
+
+        if (runMarkDuplicates)
+        {
+            job.getLogger().info("running MarkDuplicates");
+            job.setStatus(PipelineJob.TaskStatus.running, "RUNNING MARKDUPLICATES");
+            MarkDuplicatesWrapper wrapper = new MarkDuplicatesWrapper(job.getLogger());
+            File metricsFile = wrapper.getMetricsFile(o.getFile());
+            File tempBam = new File(ctx.getOutputDir(), FileUtil.getBaseName(o.getFile()) + ".markDuplicates.bam");
+            ctx.getFileManager().addIntermediateFile(tempBam);
+            ctx.getFileManager().addIntermediateFile(new File(tempBam.getPath() + ".bai"));
+
+            if (tempBam.exists())
+            {
+                tempBam.delete();
+            }
+
+            wrapper.executeCommand(o.getFile(), tempBam, null);
+            if (!metricsFile.exists())
+            {
+                throw new PipelineJobException("Unable to find file: " + metricsFile);
+            }
+        }
     }
 
     action.setEndTime(new Date());
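
Note: the new checkbox is read back with JSONObject.optBoolean, so parameter sets saved before this option existed simply read as false rather than failing. A small standalone sketch of that behavior, using the same org.json JSONObject API this handler uses:

    import org.json.JSONObject;

    public class OptBooleanDemo
    {
        public static void main(String[] args)
        {
            JSONObject params = new JSONObject();

            // Key absent: optBoolean returns the supplied default instead of throwing
            System.out.println(params.optBoolean("markDuplicates", false)); // false

            params.put("markDuplicates", true);
            System.out.println(params.optBoolean("markDuplicates", false)); // true
        }
    }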

SequenceAnalysis/src/org/labkey/sequenceanalysis/pipeline/PicardMetricsUtil.java

Lines changed: 3 additions & 6 deletions
@@ -4,12 +4,11 @@
 import htsjdk.samtools.metrics.MetricBase;
 import htsjdk.samtools.metrics.MetricsFile;
 import org.apache.commons.beanutils.ConversionException;
-import org.apache.commons.io.Charsets;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.logging.log4j.Logger;
-import org.apache.logging.log4j.LogManager;
 import org.labkey.api.data.ConvertHelper;
 import org.labkey.api.pipeline.PipelineJobException;
+import org.labkey.api.reader.Readers;
 import picard.analysis.AlignmentSummaryMetrics;
 import picard.analysis.CollectWgsMetricsWithNonZeroCoverage;
 import picard.analysis.InsertSizeMetrics;
@@ -18,9 +17,7 @@
 
 import java.io.BufferedReader;
 import java.io.File;
-import java.io.FileInputStream;
 import java.io.IOException;
-import java.io.InputStreamReader;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
@@ -38,9 +35,9 @@ public static List<Map<String, Object>> processFile(File f, Logger log) throws P
             throw new PipelineJobException("Unable to find file: " + f.getPath());
         }
 
-        try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f), Charsets.UTF_8)))
+        try (BufferedReader reader = Readers.getReader(f))
        {
-            MetricsFile metricsFile = new MetricsFile();
+            MetricsFile<MetricBase,?> metricsFile = new MetricsFile<>();
             metricsFile.read(reader);
             List<MetricBase> metrics = metricsFile.getMetrics();
             if (metrics.get(0).getClass() == DuplicationMetrics.class)
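
Note: parameterizing MetricsFile silences the raw-type warning without changing behavior; read() still resolves the concrete metric class from the file's "## METRICS CLASS" header, which is what the DuplicationMetrics check above relies on. A standalone sketch of the same htsjdk API (a plain UTF-8 buffered reader stands in here for LabKey's Readers.getReader, which this commit assumes provides the equivalent):

    import htsjdk.samtools.metrics.MetricBase;
    import htsjdk.samtools.metrics.MetricsFile;

    import java.io.BufferedReader;
    import java.io.File;
    import java.io.IOException;
    import java.nio.charset.StandardCharsets;
    import java.nio.file.Files;
    import java.util.List;

    public class MetricsFileDemo
    {
        public static void main(String[] args) throws IOException
        {
            File f = new File(args[0]);
            try (BufferedReader reader = Files.newBufferedReader(f.toPath(), StandardCharsets.UTF_8))
            {
                MetricsFile<MetricBase, ?> metricsFile = new MetricsFile<>();
                metricsFile.read(reader); // metric class is parsed from the ## METRICS CLASS header
                List<MetricBase> metrics = metricsFile.getMetrics();
                System.out.println(metrics.size() + " rows of " + metrics.get(0).getClass().getSimpleName());
            }
        }
    }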

SequenceAnalysis/src/org/labkey/sequenceanalysis/pipeline/ProcessVariantsHandler.java

Lines changed: 8 additions & 5 deletions
@@ -209,7 +209,8 @@ public static File getScatterOutputByCategory(JobContext ctx, final String categ
 
         if (scatterOutputs.isEmpty())
         {
-            throw new PipelineJobException("Unable to find final VCF");
+            ctx.getLogger().info("No outputs of category: " + category + " were found");
+            return null;
         }
         else if (scatterOutputs.size() > 1)
         {
@@ -411,6 +412,7 @@ public static File processVCF(File input, Integer libraryId, JobContext ctx, Res
             resumer.getFileManager().addIntermediateFile(outputFileIdx);
         }
 
+        File effectiveInput = currentVCF; //this will be tested at the end to determine if a new file was actually created
         for (PipelineStepCtx<VariantProcessingStep> stepCtx : providers)
         {
             ctx.getLogger().info("Starting to run: " + stepCtx.getProvider().getLabel());
@@ -484,7 +486,7 @@ public static File processVCF(File input, Integer libraryId, JobContext ctx, Res
             resumer.setStepComplete(stepIdx, input.getPath(), action, currentVCF);
         }
 
-        if (currentVCF.exists())
+        if (currentVCF != null && currentVCF.exists() && !currentVCF.equals(effectiveInput))
         {
             resumer.getFileManager().removeIntermediateFile(currentVCF);
             resumer.getFileManager().removeIntermediateFile(new File(currentVCF.getPath() + ".tbi"));
@@ -872,14 +874,15 @@ else if (AbstractGenomicsDBImportHandler.TILE_DB_FILETYPE.isType(input))
         }
 
         @Override
-        public void performAdditionalMergeTasks(JobContext ctx, PipelineJob job, TaskFileManager manager, ReferenceGenome genome, List<File> orderedScatterOutputs) throws PipelineJobException
+        public void performAdditionalMergeTasks(JobContext ctx, PipelineJob job, TaskFileManager manager, ReferenceGenome genome, List<File> orderedScatterOutputs, List<String> orderedJobDirs) throws PipelineJobException
        {
             List<PipelineStepCtx<VariantProcessingStep>> providers = SequencePipelineService.get().getSteps(job, VariantProcessingStep.class);
             for (PipelineStepCtx<VariantProcessingStep> stepCtx : providers)
             {
-                if (stepCtx.getProvider() instanceof VariantProcessingStep.SupportsScatterGather ssg)
+                VariantProcessingStep vps = stepCtx.getProvider().create(ctx);
+                if (vps instanceof VariantProcessingStep.SupportsScatterGather ssg)
                 {
-                    ssg.performAdditionalMergeTasks(ctx, job, manager, genome, orderedScatterOutputs);
+                    ssg.performAdditionalMergeTasks(ctx, job, manager, genome, orderedScatterOutputs, orderedJobDirs);
                 }
             }
         }
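
Note: getScatterOutputByCategory now returns null instead of throwing when the category produced no outputs, so callers must be null-tolerant. A sketch of the expected call-site pattern (the argument list is abridged because the hunk header truncates the real signature; "VCF File" is an illustrative category name):

    // Sketch of a null-tolerant call site; not code from this commit
    File scatterOut = ProcessVariantsHandler.getScatterOutputByCategory(ctx, "VCF File" /* ... */);
    if (scatterOut == null)
    {
        ctx.getLogger().info("No scatter output of this category; skipping downstream steps");
    }
    else
    {
        ctx.getLogger().info("Scatter output: " + scatterOut.getPath());
    }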

SequenceAnalysis/src/org/labkey/sequenceanalysis/pipeline/VariantProcessingRemoteMergeTask.java

Lines changed: 40 additions & 47 deletions
@@ -3,6 +3,7 @@
 import htsjdk.samtools.util.Interval;
 import org.apache.commons.lang3.StringUtils;
 import org.jetbrains.annotations.NotNull;
+import org.jetbrains.annotations.Nullable;
 import org.labkey.api.pipeline.AbstractTaskFactory;
 import org.labkey.api.pipeline.AbstractTaskFactorySettings;
 import org.labkey.api.pipeline.PipelineJob;
@@ -127,13 +128,16 @@ private VariantProcessingJob getPipelineJob()
         if (handler instanceof SequenceOutputHandler.TracksVCF)
         {
             Set<SequenceOutputFile> outputs = new HashSet<>();
-            scatterOutputs.values().forEach(f -> outputs.addAll(getPipelineJob().getOutputsToCreate().stream().filter(x -> f.equals(x.getFile())).collect(Collectors.toSet())));
+            scatterOutputs.values().forEach(f -> outputs.addAll(getPipelineJob().getOutputsToCreate().stream().filter(x -> x != null && f.equals(x.getFile())).collect(Collectors.toSet())));
             getJob().getLogger().debug("Total component outputs created: " + outputs.size());
             getPipelineJob().getOutputsToCreate().removeAll(outputs);
             getJob().getLogger().debug("Total SequenceOutputFiles on job after remove: " + getPipelineJob().getOutputsToCreate().size());
 
-            SequenceOutputFile finalOutput = ((SequenceOutputHandler.TracksVCF)getPipelineJob().getHandler()).createFinalSequenceOutput(getJob(), finalOut, getPipelineJob().getFiles());
-            manager.addSequenceOutput(finalOutput);
+            if (finalOut != null)
+            {
+                SequenceOutputFile finalOutput = ((SequenceOutputHandler.TracksVCF) getPipelineJob().getHandler()).createFinalSequenceOutput(getJob(), finalOut, getPipelineJob().getFiles());
+                manager.addSequenceOutput(finalOutput);
+            }
         }
         else
         {
@@ -152,14 +156,15 @@ private VariantProcessingJob getPipelineJob()
         return new RecordedActionSet(action);
     }
 
-    private File runDefaultVariantMerge(JobContextImpl ctx, TaskFileManagerImpl manager, RecordedAction action, SequenceOutputHandler<SequenceOutputHandler.SequenceOutputProcessor> handler) throws PipelineJobException
+    private @Nullable File runDefaultVariantMerge(JobContextImpl ctx, TaskFileManagerImpl manager, RecordedAction action, SequenceOutputHandler<SequenceOutputHandler.SequenceOutputProcessor> handler) throws PipelineJobException
     {
         Map<String, List<Interval>> jobToIntervalMap = getPipelineJob().getJobToIntervalMap();
         getJob().setStatus(PipelineJob.TaskStatus.running, "Combining Per-Contig VCFs: " + jobToIntervalMap.size());
 
         Map<String, File> scatterOutputs = getPipelineJob().getScatterJobOutputs();
         List<File> toConcat = new ArrayList<>();
         Set<File> missing = new HashSet<>();
+        int totalNull = 0;
         for (String name : jobToIntervalMap.keySet())
         {
             if (!scatterOutputs.containsKey(name))
@@ -168,45 +173,29 @@ private File runDefaultVariantMerge(JobContextImpl ctx, TaskFileManagerImpl mana
             }
 
             File vcf = scatterOutputs.get(name);
-            if (!vcf.exists())
+            if (scatterOutputs.get(name) == null)
             {
-                missing.add(vcf);
-            }
-
-            // NOTE: this was added to fix a one-time issue where -L was dropped from some upstream GenotypeGVCFs.
-            // Under normal conditions this would never be necessary.
-            boolean ensureOutputsWithinIntervals = getPipelineJob().getParameterJson().optBoolean("variantCalling.GenotypeGVCFs.ensureOutputsWithinIntervalsOnMerge", false);
-            if (ensureOutputsWithinIntervals)
-            {
-                getJob().getLogger().debug("Ensuring ensure scatter outputs respect intervals");
-
-                File subsetVcf = new File(vcf.getParentFile(), SequenceAnalysisService.get().getUnzippedBaseName(vcf.getName()) + ".subset.vcf.gz");
-                File subsetVcfIdx = new File(subsetVcf.getPath() + ".tbi");
-                manager.addIntermediateFile(subsetVcf);
-                manager.addIntermediateFile(subsetVcfIdx);
-
-                if (subsetVcfIdx.exists())
-                {
-                    getJob().getLogger().debug("Index exists, will not re-subset the VCF: " + subsetVcf.getName());
-                }
-                else
-                {
-                    OutputVariantsStartingInIntervalsStep.Wrapper wrapper = new OutputVariantsStartingInIntervalsStep.Wrapper(getJob().getLogger());
-                    wrapper.execute(vcf, subsetVcf, getPipelineJob().getIntervalsForTask());
-                }
-
-                toConcat.add(subsetVcf);
+                totalNull++;
+                continue;
             }
-            else
+            else if (!vcf.exists())
             {
-                toConcat.add(vcf);
+                missing.add(vcf);
+                continue;
             }
 
+            toConcat.add(vcf);
+
             manager.addInput(action, "Input VCF", vcf);
             manager.addIntermediateFile(vcf);
             manager.addIntermediateFile(new File(vcf.getPath() + ".tbi"));
         }
 
+        if (totalNull > 0 && !toConcat.isEmpty())
+        {
+            throw new PipelineJobException("The scatter jobs returned a mixture of null and non-null outputs");
+        }
+
         Set<Integer> genomeIds = new HashSet<>();
         getPipelineJob().getFiles().forEach(x -> genomeIds.add(x.getLibrary_id()));
         if (genomeIds.size() != 1)
@@ -216,29 +205,33 @@ private File runDefaultVariantMerge(JobContextImpl ctx, TaskFileManagerImpl mana
 
         ReferenceGenome genome = getPipelineJob().getSequenceSupport().getCachedGenome(genomeIds.iterator().next());
 
-        String basename = SequenceAnalysisService.get().getUnzippedBaseName(toConcat.get(0).getName());
-        File combined = new File(getPipelineJob().getAnalysisDirectory(), basename + ".vcf.gz");
-        File combinedIdx = new File(combined.getPath() + ".tbi");
-        if (combinedIdx.exists())
+        File combined = null;
+        if (!toConcat.isEmpty())
         {
-            getJob().getLogger().info("VCF exists, will not recreate: " + combined.getPath());
-        }
-        else
-        {
-            if (!missing.isEmpty())
+            String basename = SequenceAnalysisService.get().getUnzippedBaseName(toConcat.get(0).getName());
+            combined = new File(getPipelineJob().getAnalysisDirectory(), basename + ".vcf.gz");
+            File combinedIdx = new File(combined.getPath() + ".tbi");
+            if (combinedIdx.exists())
             {
-                throw new PipelineJobException("Missing one or more VCFs: " + missing.stream().map(File::getPath).collect(Collectors.joining(",")));
+                getJob().getLogger().info("VCF exists, will not recreate: " + combined.getPath());
             }
+            else
+            {
+                if (!missing.isEmpty())
+                {
+                    throw new PipelineJobException("Missing one or more VCFs: " + missing.stream().map(File::getPath).collect(Collectors.joining(",")));
+                }
 
-            boolean sortAfterMerge = handler instanceof VariantProcessingStep.SupportsScatterGather && ((VariantProcessingStep.SupportsScatterGather)handler).doSortAfterMerge();
-            combined = SequenceAnalysisService.get().combineVcfs(toConcat, combined, genome, getJob().getLogger(), true, null, sortAfterMerge);
+                boolean sortAfterMerge = handler instanceof VariantProcessingStep.SupportsScatterGather && ((VariantProcessingStep.SupportsScatterGather) handler).doSortAfterMerge();
+                combined = SequenceAnalysisService.get().combineVcfs(toConcat, combined, genome, getJob().getLogger(), true, null, sortAfterMerge);
+            }
+            manager.addOutput(action, "Merged VCF", combined);
         }
-        manager.addOutput(action, "Merged VCF", combined);
 
         if (handler instanceof VariantProcessingStep.SupportsScatterGather)
         {
             ctx.getLogger().debug("Running additional merge tasks");
-            ((VariantProcessingStep.SupportsScatterGather) handler).performAdditionalMergeTasks(ctx, getPipelineJob(), manager, genome, toConcat);
+            ((VariantProcessingStep.SupportsScatterGather) handler).performAdditionalMergeTasks(ctx, getPipelineJob(), manager, genome, toConcat, new ArrayList<>(jobToIntervalMap.keySet()));
         }
 
         return combined;
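
Note: both ends of the scatter/gather handshake now rely on a map that can hold null values. The split task (next file) records null for jobs that legitimately produced nothing, and the merge loop above distinguishes "no entry" (a job never reported) from "entry mapped to null" (an empty result). A small standalone sketch of that HashMap distinction:

    import java.io.File;
    import java.util.HashMap;
    import java.util.Map;

    public class NullEntryDemo
    {
        public static void main(String[] args)
        {
            Map<String, File> scatterOutputs = new HashMap<>();
            scatterOutputs.put("job1", null); // job ran and produced no output

            System.out.println(scatterOutputs.containsKey("job1")); // true: an empty result
            System.out.println(scatterOutputs.containsKey("job2")); // false: no record of the job at all
            System.out.println(scatterOutputs.get("job1"));         // null in both cases, hence containsKey() first
        }
    }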

SequenceAnalysis/src/org/labkey/sequenceanalysis/pipeline/VariantProcessingRemoteSplitTask.java

Lines changed: 15 additions & 3 deletions
@@ -117,8 +117,13 @@ private VariantProcessingJob getPipelineJob()
         {
             output = ((SequenceOutputHandler.TracksVCF)handler).finalizeScatterJobOutput(ctx, output);
 
-            // If the output is still under the work dir, translate path. Otherwise it was already copied to the the source dir
-            if (output.getPath().startsWith(_wd.getDir().getPath()))
+            // If the output is still under the work dir, translate path. Otherwise it was already copied to the source dir
+            if (output == null)
+            {
+                ctx.getLogger().debug("No output produced, adding null to scatter outputs");
+                getPipelineJob().getScatterJobOutputs().put(getPipelineJob().getIntervalSetName(), null);
+            }
+            else if (output.getPath().startsWith(_wd.getDir().getPath()))
             {
                 //NOTE: the VCF will be copied back to the source dir, so translate paths
                 String path = _wd.getRelativePath(output);
@@ -136,7 +141,14 @@ private VariantProcessingJob getPipelineJob()
             throw new PipelineJobException(e);
         }
 
-        getPipelineJob().getLogger().debug("Final scatter output: " + output.getPath());
+        if (output != null)
+        {
+            getPipelineJob().getLogger().debug("Final scatter output: " + output.getPath());
+        }
+        else
+        {
+            getPipelineJob().getLogger().debug("No primary scatter output produced");
+        }
     }
     else
     {

SequenceAnalysis/src/org/labkey/sequenceanalysis/run/bampostprocessing/MarkDuplicatesStep.java

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
  */
 public class MarkDuplicatesStep extends AbstractCommandPipelineStep<MarkDuplicatesWrapper> implements BamProcessingStep
 {
-    public MarkDuplicatesStep(PipelineStepProvider provider, PipelineContext ctx)
+    public MarkDuplicatesStep(PipelineStepProvider<?> provider, PipelineContext ctx)
     {
         super(provider, ctx, new MarkDuplicatesWrapper(ctx.getLogger()));
     }
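
Note: replacing the raw PipelineStepProvider with PipelineStepProvider<?> removes a raw-type warning without restricting callers; the unbounded wildcard accepts any parameterization just as the raw type did, but keeps the compiler's generic checks on. A self-contained illustration of the difference (a generic List stands in for PipelineStepProvider):

    import java.util.ArrayList;
    import java.util.List;

    public class WildcardDemo
    {
        // Raw type: compiles, but the compiler stops checking type arguments here
        static int sizeRaw(List list)
        {
            return list.size();
        }

        // Unbounded wildcard: equally permissive, and fully type-checked
        static int sizeWildcard(List<?> list)
        {
            return list.size();
        }

        public static void main(String[] args)
        {
            List<String> names = new ArrayList<>();
            names.add("example");
            System.out.println(sizeRaw(names));      // works, but the raw parameter draws a rawtypes lint warning
            System.out.println(sizeWildcard(names)); // same result, no warning
        }
    }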
