Skip to content

Commit d4e5c01

Browse files
authored
Merge pull request #143 from LabKey/fb_merge_22.7_to_develop
Merge 22.7 to develop
2 parents 9912271 + dba5151 commit d4e5c01

File tree

15 files changed

+474
-122
lines changed

15 files changed

+474
-122
lines changed

mGAP/src/org/labkey/mgap/mGAPModule.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
import org.labkey.mgap.pipeline.RenameSamplesForMgapStep;
4646
import org.labkey.mgap.pipeline.SampleSpecificGenotypeFiltrationStep;
4747
import org.labkey.mgap.pipeline.VcfComparisonStep;
48+
import org.labkey.mgap.pipeline.mGapReleaseAnnotateNovelSitesStep;
4849
import org.labkey.mgap.pipeline.mGapReleaseComparisonStep;
4950
import org.labkey.mgap.pipeline.mGapReleaseGenerator;
5051
import org.labkey.mgap.query.mGAPUserSchema;
@@ -118,6 +119,7 @@ public PipelineStartup()
118119
SequencePipelineService.get().registerPipelineStep(new VcfComparisonStep.Provider());
119120
SequencePipelineService.get().registerPipelineStep(new mGapReleaseComparisonStep.Provider());
120121
SequencePipelineService.get().registerPipelineStep(new SampleSpecificGenotypeFiltrationStep.Provider());
122+
SequencePipelineService.get().registerPipelineStep(new mGapReleaseAnnotateNovelSitesStep.Provider());
121123

122124
_hasRegistered = true;
123125
}
Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
package org.labkey.mgap.pipeline;
2+
3+
import htsjdk.samtools.util.Interval;
4+
import org.apache.logging.log4j.Logger;
5+
import org.json.old.JSONObject;
6+
import org.labkey.api.data.SimpleFilter;
7+
import org.labkey.api.data.TableSelector;
8+
import org.labkey.api.pipeline.PipelineJob;
9+
import org.labkey.api.pipeline.PipelineJobException;
10+
import org.labkey.api.query.FieldKey;
11+
import org.labkey.api.sequenceanalysis.SequenceAnalysisService;
12+
import org.labkey.api.sequenceanalysis.SequenceOutputFile;
13+
import org.labkey.api.sequenceanalysis.pipeline.AbstractVariantProcessingStepProvider;
14+
import org.labkey.api.sequenceanalysis.pipeline.PipelineContext;
15+
import org.labkey.api.sequenceanalysis.pipeline.PipelineStepProvider;
16+
import org.labkey.api.sequenceanalysis.pipeline.ReferenceGenome;
17+
import org.labkey.api.sequenceanalysis.pipeline.SequenceAnalysisJobSupport;
18+
import org.labkey.api.sequenceanalysis.pipeline.ToolParameterDescriptor;
19+
import org.labkey.api.sequenceanalysis.pipeline.VariantProcessingStep;
20+
import org.labkey.api.sequenceanalysis.pipeline.VariantProcessingStepOutputImpl;
21+
import org.labkey.api.sequenceanalysis.run.AbstractCommandPipelineStep;
22+
import org.labkey.api.sequenceanalysis.run.AbstractDiscvrSeqWrapper;
23+
import org.labkey.api.util.PageFlowUtil;
24+
import org.labkey.mgap.mGAPSchema;
25+
26+
import javax.annotation.Nullable;
27+
import java.io.File;
28+
import java.util.ArrayList;
29+
import java.util.Arrays;
30+
import java.util.List;
31+
32+
/**
33+
* User: bimber
34+
* Date: 6/15/2014
35+
* Time: 12:39 PM
36+
*/
37+
public class mGapReleaseAnnotateNovelSitesStep extends AbstractCommandPipelineStep<mGapReleaseAnnotateNovelSitesStep.AnnotateNovelSitesWrapper> implements VariantProcessingStep
38+
{
39+
public static final String VERSION_ROWID = "versionRowId";
40+
public static final String PRIOR_RELEASE_LABEL = "priorReleaseLabel";
41+
public static final String SITES_ONLY_DATA = "sitesOnlyVcfData";
42+
43+
public mGapReleaseAnnotateNovelSitesStep(PipelineStepProvider<?> provider, PipelineContext ctx)
44+
{
45+
super(provider, ctx, new AnnotateNovelSitesWrapper(ctx.getLogger()));
46+
}
47+
48+
public static class Provider extends AbstractVariantProcessingStepProvider<mGapReleaseAnnotateNovelSitesStep> implements SupportsScatterGather
49+
{
50+
public Provider()
51+
{
52+
super("mGapAnnotateNovelSites", "Annotate Novel Sites Against mGAP Release", "AnnotateNovelSites", "Compare the VCF to the specified mGAP release VCF, producing TSV/VCF reports with site- and genotype-level concordance.", Arrays.asList(
53+
ToolParameterDescriptor.create(VERSION_ROWID, "mGAP Release", "The mGAP release VCF to use for comparison", "ldk-simplelabkeycombo", new JSONObject(){{
54+
put("allowBlank", false);
55+
put("width", 400);
56+
put("schemaName", "mgap");
57+
put("queryName", "variantCatalogReleases");
58+
put("containerPath", "js:Laboratory.Utils.getQueryContainerPath()");
59+
put("displayField", "version");
60+
put("valueField", "rowid");
61+
put("doNotIncludeInTemplates", true);
62+
}}, null),
63+
ToolParameterDescriptor.create("releaseVersion", "mGAP Version", "This string will be used to tag novel variants.", "textfield", new JSONObject(){{
64+
put("allowBlank", false);
65+
put("doNotIncludeInTemplates", true);
66+
}}, null)
67+
), PageFlowUtil.set("sequenceanalysis/field/SequenceOutputFileSelectorField.js"), null);
68+
}
69+
70+
@Override
71+
public mGapReleaseAnnotateNovelSitesStep create(PipelineContext ctx)
72+
{
73+
return new mGapReleaseAnnotateNovelSitesStep(this, ctx);
74+
}
75+
}
76+
77+
@Override
78+
public Output processVariants(File inputVCF, File outputDirectory, ReferenceGenome genome, @Nullable List<Interval> intervals) throws PipelineJobException
79+
{
80+
VariantProcessingStepOutputImpl output = new VariantProcessingStepOutputImpl();
81+
getPipelineCtx().getLogger().info("Annotating VCF by mGAP Release");
82+
83+
String releaseVersion = getProvider().getParameterByName("releaseVersion").extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), String.class, "0.0");
84+
String priorReleaseLabel = getPipelineCtx().getSequenceSupport().getCachedObject(PRIOR_RELEASE_LABEL, String.class);
85+
int sitesOnlyExpDataId = getPipelineCtx().getSequenceSupport().getCachedObject(SITES_ONLY_DATA, Integer.class);
86+
File sitesOnlyVcf = getPipelineCtx().getSequenceSupport().getCachedData(sitesOnlyExpDataId);
87+
if (!sitesOnlyVcf.exists())
88+
{
89+
throw new PipelineJobException("Unable to find file: " + sitesOnlyVcf);
90+
}
91+
92+
List<String> extraArgs = new ArrayList<>();
93+
if (intervals != null)
94+
{
95+
intervals.forEach(interval -> {
96+
extraArgs.add("-L");
97+
extraArgs.add(interval.getContig() + ":" + interval.getStart() + "-" + interval.getEnd());
98+
});
99+
100+
extraArgs.add("--ignore-variants-starting-outside-interval");
101+
}
102+
103+
extraArgs.add("-dv");
104+
extraArgs.add(priorReleaseLabel);
105+
106+
File annotatedVCF = new File(outputDirectory, SequenceAnalysisService.get().getUnzippedBaseName(inputVCF.getName()) + ".comparison.vcf.gz");
107+
getWrapper().execute(inputVCF, sitesOnlyVcf, genome.getWorkingFastaFile(), releaseVersion, annotatedVCF, extraArgs);
108+
if (!annotatedVCF.exists())
109+
{
110+
throw new PipelineJobException("Unable to find output: " + annotatedVCF.getPath());
111+
}
112+
113+
output.addInput(inputVCF, "Input VCF");
114+
output.addInput(sitesOnlyVcf, "Reference VCF");
115+
116+
output.addOutput(annotatedVCF, "VCF Annotated by mGAP Version");
117+
output.setVcf(annotatedVCF);
118+
119+
return output;
120+
}
121+
122+
@Override
123+
public void init(PipelineJob job, SequenceAnalysisJobSupport support, List<SequenceOutputFile> inputFiles) throws PipelineJobException
124+
{
125+
Integer versionRowId = getProvider().getParameterByName(VERSION_ROWID).extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), Integer.class);
126+
String version = new TableSelector(mGAPSchema.getInstance().getSchema().getTable(mGAPSchema.TABLE_VARIANT_CATALOG_RELEASES), PageFlowUtil.set("version"), new SimpleFilter(FieldKey.fromString("rowId"), versionRowId), null).getObject(String.class);
127+
if (version == null)
128+
{
129+
throw new PipelineJobException("Unable to find release for release: " + versionRowId);
130+
}
131+
132+
Integer referenceVcfOutputId = new TableSelector(mGAPSchema.getInstance().getSchema().getTable(mGAPSchema.TABLE_VARIANT_CATALOG_RELEASES), PageFlowUtil.set("sitesOnlyVcfId"), new SimpleFilter(FieldKey.fromString("rowId"), versionRowId), null).getObject(Integer.class);
133+
if (referenceVcfOutputId == null)
134+
{
135+
getPipelineCtx().getLogger().debug("Sites-only VCF not found, using primary VCF");
136+
referenceVcfOutputId = new TableSelector(mGAPSchema.getInstance().getSchema().getTable(mGAPSchema.TABLE_VARIANT_CATALOG_RELEASES), PageFlowUtil.set("vcfId"), new SimpleFilter(FieldKey.fromString("rowId"), versionRowId), null).getObject(Integer.class);
137+
}
138+
139+
if (referenceVcfOutputId == null)
140+
{
141+
throw new PipelineJobException("Unable to find sites-only VCF for release: " + versionRowId);
142+
}
143+
144+
SequenceOutputFile sitesOnly = SequenceOutputFile.getForId(referenceVcfOutputId);
145+
if (sitesOnly == null)
146+
{
147+
throw new PipelineJobException("Unable to find sites-only VCF output file for fileId: " + referenceVcfOutputId);
148+
}
149+
150+
support.cacheExpData(sitesOnly.getExpData());
151+
152+
support.cacheObject(SITES_ONLY_DATA, sitesOnly.getDataId());
153+
support.cacheObject(PRIOR_RELEASE_LABEL, version);
154+
}
155+
156+
public static class AnnotateNovelSitesWrapper extends AbstractDiscvrSeqWrapper
157+
{
158+
public AnnotateNovelSitesWrapper(Logger log)
159+
{
160+
super(log);
161+
}
162+
163+
public File execute(File vcf, File referenceVcf, File fasta, String versionString, File vcfOutput, List<String> extraArgs) throws PipelineJobException
164+
{
165+
List<String> args = new ArrayList<>(getBaseArgs());
166+
args.add("AnnotateNovelSites");
167+
args.add("-R");
168+
args.add(fasta.getPath());
169+
170+
args.add("-V");
171+
args.add(vcf.getPath());
172+
args.add("-rv");
173+
args.add(referenceVcf.getPath());
174+
175+
args.add("-an");
176+
args.add("mGAPV");
177+
args.add("-ad");
178+
args.add("The first mGAP version where variants at this site appeared");
179+
args.add("-av");
180+
args.add(versionString);
181+
182+
args.add("-O");
183+
args.add(vcfOutput.getPath());
184+
185+
if (extraArgs != null)
186+
{
187+
args.addAll(extraArgs);
188+
}
189+
190+
execute(args);
191+
192+
return vcfOutput;
193+
}
194+
}
195+
}

mGAP/src/org/labkey/mgap/pipeline/mGapReleaseComparisonStep.java

Lines changed: 56 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,16 @@
22

33
import htsjdk.samtools.util.Interval;
44
import org.json.old.JSONObject;
5+
import org.labkey.api.pipeline.PipelineJob;
56
import org.labkey.api.pipeline.PipelineJobException;
67
import org.labkey.api.sequenceanalysis.SequenceAnalysisService;
8+
import org.labkey.api.sequenceanalysis.SequenceOutputFile;
79
import org.labkey.api.sequenceanalysis.pipeline.AbstractVariantProcessingStepProvider;
810
import org.labkey.api.sequenceanalysis.pipeline.PipelineContext;
911
import org.labkey.api.sequenceanalysis.pipeline.PipelineStepProvider;
1012
import org.labkey.api.sequenceanalysis.pipeline.ReferenceGenome;
13+
import org.labkey.api.sequenceanalysis.pipeline.SequenceOutputHandler;
14+
import org.labkey.api.sequenceanalysis.pipeline.TaskFileManager;
1115
import org.labkey.api.sequenceanalysis.pipeline.ToolParameterDescriptor;
1216
import org.labkey.api.sequenceanalysis.pipeline.VariantProcessingStep;
1317
import org.labkey.api.sequenceanalysis.pipeline.VariantProcessingStepOutputImpl;
@@ -25,7 +29,7 @@
2529
* Date: 6/15/2014
2630
* Time: 12:39 PM
2731
*/
28-
public class mGapReleaseComparisonStep extends AbstractCommandPipelineStep<VcfComparisonStep.VcfComparison> implements VariantProcessingStep
32+
public class mGapReleaseComparisonStep extends AbstractCommandPipelineStep<VcfComparisonStep.VcfComparison> implements VariantProcessingStep, VariantProcessingStep.SupportsScatterGather
2933
{
3034
public static final String REF_VCF = "refVcf";
3135

@@ -38,15 +42,17 @@ public static class Provider extends AbstractVariantProcessingStepProvider<mGapR
3842
{
3943
public Provider()
4044
{
41-
super("mGapReleaseComparison", "mGapReleaseComparison", "VcfComparison", "Compare the VCF to the specified mGAP release VCF, producing TSV/VCF reports with site- and genotype-level concordance.", Arrays.asList(
42-
ToolParameterDescriptor.createExpDataParam(REF_VCF, "mGAP Release", "This is the pre-computed celltypist model to use for classification", "sequenceanalysis-sequenceoutputfileselectorfield", new JSONObject(){{
45+
super("mGapReleaseComparison", "Compare VCF to mGap Release", "VcfComparison", "Compare the VCF to the specified mGAP release VCF, producing TSV/VCF reports with site- and genotype-level concordance.", Arrays.asList(
46+
ToolParameterDescriptor.createExpDataParam(REF_VCF, "mGAP Release", "The mGAP release VCF to use for comparison", "sequenceanalysis-sequenceoutputfileselectorfield", new JSONObject(){{
4347
put("allowBlank", false);
4448
put("category", "mGAP Release");
4549
put("performGenomeFilter", false);
50+
put("doNotIncludeInTemplates", true);
4651
}}, null)
4752
), PageFlowUtil.set("sequenceanalysis/field/SequenceOutputFileSelectorField.js"), null);
4853
}
4954

55+
@Override
5056
public mGapReleaseComparisonStep create(PipelineContext ctx)
5157
{
5258
return new mGapReleaseComparisonStep(this, ctx);
@@ -63,7 +69,7 @@ public Output processVariants(File inputVCF, File outputDirectory, ReferenceGeno
6369
File refVcf = getPipelineCtx().getSequenceSupport().getCachedData(refFileId);
6470
if (refVcf == null || !refVcf.exists())
6571
{
66-
throw new PipelineJobException("Unable to find file: " + refFileId + "/" + refVcf);
72+
throw new PipelineJobException("Unable to find file: " + refFileId + ", path: " + refVcf);
6773
}
6874

6975
List<String> extraArgs = new ArrayList<>();
@@ -75,7 +81,7 @@ public Output processVariants(File inputVCF, File outputDirectory, ReferenceGeno
7581
});
7682
}
7783

78-
File outputTable = new File(outputDirectory, SequenceAnalysisService.get().getUnzippedBaseName(inputVCF.getName()) + ".comparison.txt");
84+
File outputSummaryTable = new File(outputDirectory, SequenceAnalysisService.get().getUnzippedBaseName(inputVCF.getName()) + ".comparison.txt");
7985
File missingSitesVcf = new File(outputDirectory, SequenceAnalysisService.get().getUnzippedBaseName(inputVCF.getName()) + ".missingSites.vcf.gz");
8086
File novelSitesVcf = new File(outputDirectory, SequenceAnalysisService.get().getUnzippedBaseName(inputVCF.getName()) + ".novelSites.vcf.gz");
8187

@@ -85,17 +91,57 @@ public Output processVariants(File inputVCF, File outputDirectory, ReferenceGeno
8591
extraArgs.add("--missing-sites-vcf");
8692
extraArgs.add(missingSitesVcf.getPath());
8793

88-
getWrapper().runTool(inputVCF, refVcf, outputTable, genome.getWorkingFastaFile(), extraArgs);
89-
if (!outputTable.exists())
94+
getWrapper().runTool(inputVCF, refVcf, outputSummaryTable, genome.getWorkingFastaFile(), extraArgs);
95+
if (!novelSitesVcf.exists())
9096
{
91-
throw new PipelineJobException("Unable to find output: " + outputTable.getPath());
97+
throw new PipelineJobException("Unable to find output: " + novelSitesVcf.getPath());
9298
}
9399

94100
output.addInput(inputVCF, "Input VCF");
95101
output.addInput(genome.getWorkingFastaFile(), "Reference Genome");
96-
output.addOutput(outputTable, "VcfComparison Table");
97-
output.addOutput(outputTable, "VcfComparison Site Table");
102+
output.addOutput(novelSitesVcf, "VcfComparison Novel Sites VCF");
103+
output.addOutput(missingSitesVcf, "VcfComparison Missing Sites VCF");
104+
output.addOutput(outputSummaryTable, "VcfComparison Summary Table");
105+
106+
output.setVcf(novelSitesVcf);
98107

99108
return output;
100109
}
110+
111+
@Override
112+
public void performAdditionalMergeTasks(SequenceOutputHandler.JobContext ctx, PipelineJob job, TaskFileManager manager, ReferenceGenome genome, List<File> orderedScatterOutputs) throws PipelineJobException
113+
{
114+
job.getLogger().info("Merging missing sites VCFs");
115+
List<File> toConcat = orderedScatterOutputs.stream().map(f -> {
116+
f = new File(f.getParentFile(), f.getName().replaceAll("novelSites", "missingSites"));
117+
if (!f.exists())
118+
{
119+
throw new IllegalStateException("Missing file: " + f.getPath());
120+
}
121+
122+
ctx.getFileManager().addIntermediateFile(f);
123+
ctx.getFileManager().addIntermediateFile(new File(f.getPath() + ".tbi"));
124+
125+
return f;
126+
}).toList();
127+
128+
String basename = SequenceAnalysisService.get().getUnzippedBaseName(toConcat.get(0).getName());
129+
File combined = new File(ctx.getSourceDirectory(), basename + ".vcf.gz");
130+
File combinedIdx = new File(combined.getPath() + ".tbi");
131+
if (combinedIdx.exists())
132+
{
133+
job.getLogger().info("VCF exists, will not recreate: " + combined.getPath());
134+
}
135+
else
136+
{
137+
combined = SequenceAnalysisService.get().combineVcfs(toConcat, combined, genome, job.getLogger(), true, null);
138+
}
139+
140+
SequenceOutputFile so = new SequenceOutputFile();
141+
so.setName(basename + ": Missing Sites");
142+
so.setFile(combined);
143+
so.setCategory("Missing Sites VCF");
144+
so.setLibrary_id(genome.getGenomeId());
145+
manager.addSequenceOutput(so);
146+
}
101147
}

0 commit comments

Comments
 (0)