Skip to content

Commit a415993

Browse files
committed
Increase validation over scatter/gather patterns
1 parent 80622da commit a415993

File tree

8 files changed

+117
-29
lines changed

8 files changed

+117
-29
lines changed

SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/pipeline/VariantProcessingStep.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,14 @@ default void init(PipelineJob job, SequenceAnalysisJobSupport support, List<Sequ
3838

3939
}
4040

41+
enum ScatterGatherMethod
42+
{
43+
none(),
44+
contig(),
45+
chunked(),
46+
fixedJobs()
47+
}
48+
4149
public static interface Output extends PipelineStepOutput
4250
{
4351
public File getVCF();
@@ -50,7 +58,10 @@ public static interface RequiresPedigree
5058

5159
public static interface SupportsScatterGather
5260
{
61+
default void validateScatter(ScatterGatherMethod method, PipelineJob job) throws IllegalArgumentException
62+
{
5363

64+
}
5465
}
5566

5667
public static interface MayRequirePrepareTask

SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisController.java

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@
114114
import org.labkey.api.sequenceanalysis.pipeline.SequenceOutputHandler;
115115
import org.labkey.api.sequenceanalysis.pipeline.SequencePipelineService;
116116
import org.labkey.api.sequenceanalysis.pipeline.ToolParameterDescriptor;
117+
import org.labkey.api.sequenceanalysis.pipeline.VariantProcessingStep;
117118
import org.labkey.api.util.ExceptionUtil;
118119
import org.labkey.api.util.FileType;
119120
import org.labkey.api.util.FileUtil;
@@ -162,7 +163,6 @@
162163
import org.labkey.sequenceanalysis.run.util.FastqcRunner;
163164
import org.labkey.sequenceanalysis.util.ChainFileValidator;
164165
import org.labkey.sequenceanalysis.util.FastqUtils;
165-
import org.labkey.sequenceanalysis.util.ScatterGatherUtils;
166166
import org.labkey.sequenceanalysis.util.SequenceUtil;
167167
import org.labkey.sequenceanalysis.visualization.VariationChart;
168168
import org.springframework.beans.PropertyValues;
@@ -4057,9 +4057,28 @@ public ApiResponse execute(RunSequenceHandlerForm form, BindException errors) th
40574057

40584058
protected PipelineJob createOutputJob(RunSequenceHandlerForm form, Container targetContainer, String jobName, PipeRoot pr1, SequenceOutputHandler handler, List<SequenceOutputFile> inputs, JSONObject json) throws IOException, PipelineJobException
40594059
{
4060+
validateGenomes(inputs);
40604061
return new SequenceOutputHandlerJob(targetContainer, getUser(), jobName, pr1, handler, inputs, json);
40614062
}
40624063

4064+
protected void validateGenomes(List<SequenceOutputFile> inputs) throws IllegalArgumentException
4065+
{
4066+
Set<Integer> genomes = new HashSet<>();
4067+
inputs.forEach(x -> {
4068+
if (x.getLibrary_id() == null)
4069+
{
4070+
throw new IllegalArgumentException("Input missing genome: " + x.getRowid());
4071+
}
4072+
4073+
genomes.add(x.getLibrary_id());
4074+
});
4075+
4076+
if (genomes.size() > 1)
4077+
{
4078+
throw new IllegalArgumentException("All inputs must use the same base genome");
4079+
}
4080+
}
4081+
40634082
private PipeRoot getPipeRoot(Container targetContainer, Map<Container, PipeRoot> containerToPipeRootMap)
40644083
{
40654084
if (!containerToPipeRootMap.containsKey(targetContainer))
@@ -4078,12 +4097,14 @@ private PipeRoot getPipeRoot(Container targetContainer, Map<Container, PipeRoot>
40784097
@RequiresPermission(InsertPermission.class)
40794098
public class RunVariantProcessingAction extends RunSequenceHandlerAction
40804099
{
4100+
@Override
40814101
protected PipelineJob createOutputJob(RunSequenceHandlerForm form, Container targetContainer, String jobName, PipeRoot pr1, SequenceOutputHandler handler, List<SequenceOutputFile> inputs, JSONObject json) throws PipelineJobException, IOException
40824102
{
40834103
String method = json.getString("scatterGatherMethod");
40844104
try
40854105
{
4086-
ScatterGatherUtils.ScatterGatherMethod scatterMethod = ScatterGatherUtils.ScatterGatherMethod.valueOf(method);
4106+
validateGenomes(inputs);
4107+
VariantProcessingStep.ScatterGatherMethod scatterMethod = VariantProcessingStep.ScatterGatherMethod.valueOf(method);
40874108

40884109
return new VariantProcessingJob(targetContainer, getUser(), jobName, pr1, handler, inputs, json, scatterMethod);
40894110
}

SequenceAnalysis/src/org/labkey/sequenceanalysis/analysis/GenotypeGVCFHandler.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@
5050
/**
5151
* Created by bimber on 8/26/2014.
5252
*/
53-
public class GenotypeGVCFHandler implements SequenceOutputHandler<SequenceOutputHandler.SequenceOutputProcessor>, SequenceOutputHandler.HasActionNames, SequenceOutputHandler.TracksVCF, VariantProcessingStep.MayRequirePrepareTask
53+
public class GenotypeGVCFHandler implements SequenceOutputHandler<SequenceOutputHandler.SequenceOutputProcessor>, SequenceOutputHandler.HasActionNames, SequenceOutputHandler.TracksVCF, VariantProcessingStep.MayRequirePrepareTask, VariantProcessingStep.SupportsScatterGather
5454
{
5555
private FileType _gvcfFileType = new FileType(Arrays.asList(".g.vcf"), ".g.vcf", false, FileType.gzSupportLevel.SUPPORT_GZ);
5656

SequenceAnalysis/src/org/labkey/sequenceanalysis/pipeline/ProcessVariantsHandler.java

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@
6464
/**
6565
* Created by bimber on 8/26/2014.
6666
*/
67-
public class ProcessVariantsHandler implements SequenceOutputHandler<SequenceOutputHandler.SequenceOutputProcessor>, SequenceOutputHandler.HasActionNames, SequenceOutputHandler.TracksVCF
67+
public class ProcessVariantsHandler implements SequenceOutputHandler<SequenceOutputHandler.SequenceOutputProcessor>, SequenceOutputHandler.HasActionNames, SequenceOutputHandler.TracksVCF, VariantProcessingStep.SupportsScatterGather
6868
{
6969
public static final String VCF_CATEGORY = "VCF File";
7070

@@ -258,9 +258,14 @@ public static void initVariantProcessing(PipelineJob job, SequenceAnalysisJobSup
258258
requiresPedigree = true;
259259
}
260260

261-
if (useScatterGather && !(stepCtx.getProvider() instanceof VariantProcessingStep.SupportsScatterGather))
261+
if (useScatterGather)
262262
{
263-
stepsNotScatterGather.add(stepCtx.getProvider().getName());
263+
if (!(stepCtx.getProvider() instanceof VariantProcessingStep.SupportsScatterGather))
264+
{
265+
stepsNotScatterGather.add(stepCtx.getProvider().getName());
266+
}
267+
268+
((VariantProcessingStep.SupportsScatterGather)stepCtx.getProvider()).validateScatter(vj.getScatterGatherMethod(), job);
264269
}
265270

266271
VariantProcessingStep step = stepCtx.getProvider().create(taskHelper);

SequenceAnalysis/src/org/labkey/sequenceanalysis/pipeline/VariantProcessingJob.java

Lines changed: 37 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,9 @@
88
import htsjdk.samtools.SAMSequenceRecord;
99
import htsjdk.samtools.util.Interval;
1010
import htsjdk.variant.utils.SAMSequenceDictionaryExtractor;
11-
import org.apache.logging.log4j.Logger;
12-
import org.apache.logging.log4j.LogManager;
1311
import org.apache.commons.lang3.StringUtils;
12+
import org.apache.logging.log4j.LogManager;
13+
import org.apache.logging.log4j.Logger;
1414
import org.jetbrains.annotations.Nullable;
1515
import org.json.JSONObject;
1616
import org.junit.Assert;
@@ -29,6 +29,7 @@
2929
import org.labkey.api.sequenceanalysis.SequenceOutputFile;
3030
import org.labkey.api.sequenceanalysis.pipeline.ReferenceGenome;
3131
import org.labkey.api.sequenceanalysis.pipeline.SequenceOutputHandler;
32+
import org.labkey.api.sequenceanalysis.pipeline.VariantProcessingStep;
3233
import org.labkey.api.writer.PrintWriters;
3334
import org.labkey.sequenceanalysis.util.ScatterGatherUtils;
3435

@@ -46,7 +47,7 @@
4647

4748
public class VariantProcessingJob extends SequenceOutputHandlerJob
4849
{
49-
private ScatterGatherUtils.ScatterGatherMethod _scatterGatherMethod = ScatterGatherUtils.ScatterGatherMethod.none;
50+
private VariantProcessingStep.ScatterGatherMethod _scatterGatherMethod = VariantProcessingStep.ScatterGatherMethod.none;
5051
File _dictFile = null;
5152
Map<String, File> _scatterOutputs = new HashMap<>();
5253
private transient LinkedHashMap<String, List<Interval>> _jobToIntervalMap;
@@ -68,13 +69,14 @@ protected VariantProcessingJob(VariantProcessingJob parentJob, String intervalSe
6869
_intervalSetName = intervalSetName;
6970
}
7071

71-
public VariantProcessingJob(Container c, User user, @Nullable String jobName, PipeRoot pipeRoot, SequenceOutputHandler handler, List<SequenceOutputFile> files, JSONObject jsonParams, ScatterGatherUtils.ScatterGatherMethod scatterGatherMethod) throws IOException, PipelineJobException
72+
public VariantProcessingJob(Container c, User user, @Nullable String jobName, PipeRoot pipeRoot, SequenceOutputHandler handler, List<SequenceOutputFile> files, JSONObject jsonParams, VariantProcessingStep.ScatterGatherMethod scatterGatherMethod) throws IOException, PipelineJobException
7273
{
7374
super(c, user, jobName, pipeRoot, handler, files, jsonParams);
7475
_scatterGatherMethod = scatterGatherMethod;
7576

7677
if (isScatterJob())
7778
{
79+
validateScatterForTask();
7880
Set<Integer> genomeIds = new HashSet<>();
7981
for (SequenceOutputFile so : files)
8082
{
@@ -94,30 +96,47 @@ public VariantProcessingJob(Container c, User user, @Nullable String jobName, Pi
9496
}
9597
}
9698

99+
private void validateScatterForTask()
100+
{
101+
if (!isScatterJob())
102+
{
103+
return;
104+
}
105+
106+
if (!(getHandler() instanceof VariantProcessingStep.SupportsScatterGather))
107+
{
108+
throw new IllegalArgumentException("Task doe not support Scatter/Gather: " + getHandler().getName());
109+
}
110+
111+
112+
VariantProcessingStep.SupportsScatterGather sg = (VariantProcessingStep.SupportsScatterGather)getHandler();
113+
sg.validateScatter(getScatterGatherMethod(), this);
114+
}
115+
97116
private LinkedHashMap<String, List<Interval>> establishIntervals()
98117
{
99118
LinkedHashMap<String, List<Interval>> ret;
100119
SAMSequenceDictionary dict = SAMSequenceDictionaryExtractor.extractDictionary(_dictFile.toPath());
101-
if (_scatterGatherMethod == ScatterGatherUtils.ScatterGatherMethod.contig)
120+
if (_scatterGatherMethod == VariantProcessingStep.ScatterGatherMethod.contig)
102121
{
103122
ret = new LinkedHashMap<>();
104123
for (SAMSequenceRecord rec : dict.getSequences())
105124
{
106125
ret.put(rec.getSequenceName(), Collections.singletonList(new Interval(rec.getSequenceName(), 1, rec.getSequenceLength())));
107126
}
108127
}
109-
else if (_scatterGatherMethod == ScatterGatherUtils.ScatterGatherMethod.chunked)
128+
else if (_scatterGatherMethod == VariantProcessingStep.ScatterGatherMethod.chunked)
110129
{
111130
int basesPerJob = getParameterJson().getInt("scatterGather.basesPerJob");
112-
boolean allowSplitChromosomes = getParameterJson().optBoolean("scatterGather.allowSplitChromosomes", true);
131+
boolean allowSplitChromosomes = doAllowSplitContigs();
113132
int maxContigsPerJob = getParameterJson().optInt("scatterGather.maxContigsPerJob", -1);
114133
getLogger().info("Creating jobs with target bp size: " + basesPerJob + " mbp. allow splitting configs: " + allowSplitChromosomes + ", max contigs per job: " + maxContigsPerJob);
115134

116135
basesPerJob = basesPerJob * 1000000;
117136
ret = ScatterGatherUtils.divideGenome(dict, basesPerJob, allowSplitChromosomes, maxContigsPerJob);
118137

119138
}
120-
else if (_scatterGatherMethod == ScatterGatherUtils.ScatterGatherMethod.fixedJobs)
139+
else if (_scatterGatherMethod == VariantProcessingStep.ScatterGatherMethod.fixedJobs)
121140
{
122141
long totalSize = dict.getReferenceLength();
123142
int numJobs = getParameterJson().getInt("scatterGather.totalJobs");
@@ -133,9 +152,14 @@ else if (_scatterGatherMethod == ScatterGatherUtils.ScatterGatherMethod.fixedJob
133152
return ret;
134153
}
135154

155+
public boolean doAllowSplitContigs()
156+
{
157+
return getParameterJson().optBoolean("scatterGather.allowSplitChromosomes", true);
158+
}
159+
136160
public boolean isScatterJob()
137161
{
138-
return _scatterGatherMethod != ScatterGatherUtils.ScatterGatherMethod.none;
162+
return _scatterGatherMethod != VariantProcessingStep.ScatterGatherMethod.none;
139163
}
140164

141165
@JsonIgnore
@@ -296,12 +320,12 @@ public TaskPipeline getTaskPipeline()
296320
return PipelineJobService.get().getTaskPipeline(new TaskId(VariantProcessingJob.class));
297321
}
298322

299-
public ScatterGatherUtils.ScatterGatherMethod getScatterGatherMethod()
323+
public VariantProcessingStep.ScatterGatherMethod getScatterGatherMethod()
300324
{
301325
return _scatterGatherMethod;
302326
}
303327

304-
public void setScatterGatherMethod(ScatterGatherUtils.ScatterGatherMethod scatterGatherMethod)
328+
public void setScatterGatherMethod(VariantProcessingStep.ScatterGatherMethod scatterGatherMethod)
305329
{
306330
_scatterGatherMethod = scatterGatherMethod;
307331
}
@@ -315,7 +339,7 @@ public void serializeTest() throws Exception
315339
{
316340
VariantProcessingJob job1 = new VariantProcessingJob();
317341
job1._intervalSetName = "chr1";
318-
job1._scatterGatherMethod = ScatterGatherUtils.ScatterGatherMethod.chunked;
342+
job1._scatterGatherMethod = VariantProcessingStep.ScatterGatherMethod.chunked;
319343

320344
File tmp = new File(System.getProperty("java.io.tmpdir"));
321345
File xml = new File(tmp, "variantProcessingJob.txt");
@@ -342,7 +366,7 @@ public File getDataDirectory()
342366
};
343367

344368
job1._intervalSetName = "chr1";
345-
job1._scatterGatherMethod = ScatterGatherUtils.ScatterGatherMethod.chunked;
369+
job1._scatterGatherMethod = VariantProcessingStep.ScatterGatherMethod.chunked;
346370

347371
Map<String, List<Interval>> intervalMap = new LinkedHashMap<>();
348372
intervalMap.put("1", Arrays.asList(new Interval("chr1", 1, 10)));

SequenceAnalysis/src/org/labkey/sequenceanalysis/run/util/AbstractGenomicsDBImportHandler.java

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@
4747
import java.util.Map;
4848
import java.util.Set;
4949

50-
abstract public class AbstractGenomicsDBImportHandler extends AbstractParameterizedOutputHandler<SequenceOutputHandler.SequenceOutputProcessor> implements SequenceOutputHandler.TracksVCF, SequenceOutputHandler.HasCustomVariantMerge, VariantProcessingStep.MayRequirePrepareTask
50+
abstract public class AbstractGenomicsDBImportHandler extends AbstractParameterizedOutputHandler<SequenceOutputHandler.SequenceOutputProcessor> implements SequenceOutputHandler.TracksVCF, SequenceOutputHandler.HasCustomVariantMerge, VariantProcessingStep.MayRequirePrepareTask, VariantProcessingStep.SupportsScatterGather
5151
{
5252
protected FileType _gvcfFileType = new FileType(Arrays.asList(".g.vcf"), ".g.vcf", false, FileType.gzSupportLevel.SUPPORT_GZ);
5353
public static final FileType TILE_DB_FILETYPE = new FileType(Arrays.asList(".tdb"), ".tdb", false, FileType.gzSupportLevel.NO_GZ);
@@ -60,6 +60,35 @@ public AbstractGenomicsDBImportHandler(Module owner, String name, String descrip
6060
super(owner, name, description, dependencies, parameters);
6161
}
6262

63+
@Override
64+
public void validateScatter(VariantProcessingStep.ScatterGatherMethod method, PipelineJob job) throws IllegalArgumentException
65+
{
66+
validateNoSplitContigScatter(method, job);
67+
}
68+
69+
public static void validateNoSplitContigScatter(VariantProcessingStep.ScatterGatherMethod method, PipelineJob job) throws IllegalArgumentException
70+
{
71+
if (!(job instanceof VariantProcessingJob))
72+
{
73+
return;
74+
}
75+
76+
VariantProcessingJob vj = (VariantProcessingJob)job;
77+
78+
if (!vj.isScatterJob())
79+
{
80+
return;
81+
}
82+
83+
if (vj.getScatterGatherMethod() == VariantProcessingStep.ScatterGatherMethod.fixedJobs || vj.getScatterGatherMethod() == VariantProcessingStep.ScatterGatherMethod.chunked)
84+
{
85+
if (vj.doAllowSplitContigs())
86+
{
87+
throw new IllegalArgumentException("This job does not support scatter methods with allowSplitContigs=true");
88+
}
89+
}
90+
}
91+
6392
@Override
6493
public SequenceOutputFile createFinalSequenceOutput(PipelineJob job, File mergedWorkspace, List<SequenceOutputFile> inputFiles) throws PipelineJobException
6594
{

SequenceAnalysis/src/org/labkey/sequenceanalysis/run/util/CombineGVCFsHandler.java

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@
2525
import java.io.IOException;
2626
import java.util.ArrayList;
2727
import java.util.Arrays;
28-
import java.util.Collection;
2928
import java.util.Date;
3029
import java.util.HashSet;
3130
import java.util.LinkedHashSet;
@@ -35,7 +34,7 @@
3534
/**
3635
* Created by bimber on 4/2/2017.
3736
*/
38-
public class CombineGVCFsHandler extends AbstractParameterizedOutputHandler<SequenceOutputHandler.SequenceOutputProcessor> implements SequenceOutputHandler.TracksVCF, VariantProcessingStep.MayRequirePrepareTask
37+
public class CombineGVCFsHandler extends AbstractParameterizedOutputHandler<SequenceOutputHandler.SequenceOutputProcessor> implements SequenceOutputHandler.TracksVCF, VariantProcessingStep.MayRequirePrepareTask, VariantProcessingStep.SupportsScatterGather
3938
{
4039
public static final String NAME = "Combine GVCFs";
4140
private static final String COMBINED_CATEGORY = "Combined gVCF File";
@@ -53,6 +52,12 @@ public CombineGVCFsHandler()
5352
));
5453
}
5554

55+
@Override
56+
public void validateScatter(VariantProcessingStep.ScatterGatherMethod method, PipelineJob job) throws IllegalArgumentException
57+
{
58+
AbstractGenomicsDBImportHandler.validateNoSplitContigScatter(method, job);
59+
}
60+
5661
@Override
5762
public boolean canProcess(SequenceOutputFile o)
5863
{

SequenceAnalysis/src/org/labkey/sequenceanalysis/util/ScatterGatherUtils.java

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,6 @@
1515

1616
public class ScatterGatherUtils
1717
{
18-
public static enum ScatterGatherMethod
19-
{
20-
none(),
21-
contig(),
22-
chunked(),
23-
fixedJobs()
24-
}
2518

2619
private static class ActiveIntervalSet
2720
{

0 commit comments

Comments
 (0)