Skip to content

Commit 06f7da5

Browse files
committed
Add option to run consolidate_genomicsdb_array prior to GenomicsDBImport/Append
1 parent e224ea8 commit 06f7da5

File tree

3 files changed

+94
-71
lines changed

3 files changed

+94
-71
lines changed

SequenceAnalysis/src/org/labkey/sequenceanalysis/run/util/AbstractGenomicsDBImportHandler.java

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import org.labkey.api.pipeline.RecordedAction;
2020
import org.labkey.api.sequenceanalysis.SequenceOutputFile;
2121
import org.labkey.api.sequenceanalysis.pipeline.AbstractParameterizedOutputHandler;
22+
import org.labkey.api.sequenceanalysis.pipeline.CommandLineParam;
2223
import org.labkey.api.sequenceanalysis.pipeline.ReferenceGenome;
2324
import org.labkey.api.sequenceanalysis.pipeline.SequenceAnalysisJobSupport;
2425
import org.labkey.api.sequenceanalysis.pipeline.SequenceOutputHandler;
@@ -66,6 +67,61 @@ public AbstractGenomicsDBImportHandler(Module owner, String name, String descrip
6667
super(owner, name, description, dependencies, parameters);
6768
}
6869

70+
protected static List<ToolParameterDescriptor> getToolParameters(boolean addCopyOption)
71+
{
72+
List<ToolParameterDescriptor> ret = new ArrayList<>();
73+
74+
if (addCopyOption)
75+
{
76+
ret.add(ToolParameterDescriptor.createExpDataParam(EXISTING_WORKSPACE, "Existing Workspace", "This is the workspace into which new samples will be merged", "sequenceanalysis-sequenceoutputfileselectorfield", new JSONObject()
77+
{{
78+
put("allowBlank", false);
79+
put("category", CATEGORY);
80+
}}, null));
81+
}
82+
83+
ret.addAll(Arrays.asList(
84+
ToolParameterDescriptor.create("fileBaseName", "Filename", "This is the basename that will be used for the output gzipped VCF", "textfield", null, "CombinedGenotypes"),
85+
ToolParameterDescriptor.create("doCopyGVcfLocal", "Copy gVCFs To Working Directory", "If selected, the gVCFs will be copied to the working directory first, which can improve performance when working with a large set of files.", "checkbox", new JSONObject(){{
86+
put("checked", false);
87+
}}, false),
88+
ToolParameterDescriptor.createCommandLineParam(CommandLineParam.create("--batch-size"), "batchSize", "Batch Size", "Batch size controls the number of samples for which readers are open at once and therefore provides a way to minimize memory consumption. However, it can take longer to complete. Use the consolidate flag if more than a hundred batches were used. This will improve feature read time. batchSize=0 means no batching (i.e. readers for all samples will be opened at once) Defaults to 0.", "ldk-integerfield", null, null),
89+
ToolParameterDescriptor.createCommandLineParam(CommandLineParam.create("--reader-threads"), "readerThreads", "Reader Threads", "How many simultaneous threads to use when opening VCFs in batches; higher values may improve performance when network latency is an issue", "ldk-integerfield", null, null),
90+
ToolParameterDescriptor.create("disableFileLocking", "Disable File Locking", "Certain filesystems do not support file locking, including NFS and Lustre. If your data will be processed on a filesystem that does not support locking, check this.", "checkbox", new JSONObject(){{
91+
put("checked", true);
92+
}}, true),
93+
ToolParameterDescriptor.create("sharedPosixOptimizations", "Use Shared Posix Optimizations", "This enabled optimizations for large shared filesystems, such as lustre.", "checkbox", new JSONObject(){{
94+
put("checked", true);
95+
}}, true),
96+
ToolParameterDescriptor.create("bypassFeatureReader", "Bypass Feature Reader", "If checked, rather than use the HTSJDK/Java reader, it will use a C-based implementation.", "checkbox", new JSONObject(){{
97+
put("checked", true);
98+
}}, true),
99+
ToolParameterDescriptor.createCommandLineParam(CommandLineParam.create("--genomicsdb-segment-size"), "genomicsdbSegmentSize", "Genomicsdb Segment Size", "Reducing this value may help with memory issues", "ldk-integerfield", new JSONObject(){{
100+
put("minValue", 0);
101+
}}, null),
102+
ToolParameterDescriptor.create("nativeMemoryBuffer", "C++ Memory Buffer", "By default, the pipeline java processes are allocated nearly all of the requested RAM. GenomicsDB requires memory for the C++ layer - this value (in GB) will be reserved for this. We recommend about 15-25% of the total job RAM", "ldk-integerfield", new JSONObject(){{
103+
put("minValue", 0);
104+
}}, 36),
105+
ToolParameterDescriptor.create("consolidate", "Consolidate", "If importing data in batches, a new fragment is created for each batch. In case thousands of fragments are created, GenomicsDB feature readers will try to open ~20x as many files. Also, internally GenomicsDB would consume more memory to maintain bookkeeping data from all fragments. Use this flag to merge all fragments into one. Merging can potentially improve read performance, however overall benefit might not be noticeable as the top Java layers have significantly higher overheads. This flag has no effect if only one batch is used. Defaults to false.", "checkbox", new JSONObject(){{
106+
put("checked", false);
107+
}}, false),
108+
ToolParameterDescriptor.create("scatterGather", "Scatter/Gather Options", "If selected, this job will be divided to run job per chromosome. The final step will take the VCF from each intermediate step and combined to make a final VCF file.", "sequenceanalysis-variantscattergatherpanel", new JSONObject(){{
109+
put("defaultValue", "chunked");
110+
}}, false)
111+
));
112+
113+
if (addCopyOption)
114+
{
115+
ret.add(
116+
ToolParameterDescriptor.create("consolidateFirst", "Consolidate First", "If checked, this will run the standalone tool consolidate_genomicsdb_array on the input prior to running GATK.", "checkbox", new JSONObject(){{
117+
put("checked", false);
118+
}}, false)
119+
);
120+
}
121+
122+
return ret;
123+
}
124+
69125
@Override
70126
public void validateScatter(VariantProcessingStep.ScatterGatherMethod method, PipelineJob job) throws IllegalArgumentException
71127
{
@@ -566,6 +622,42 @@ else if (genomeIds.isEmpty())
566622
wrapper.addToEnvironment("TILEDB_DISABLE_FILE_LOCKING", "1");
567623
}
568624

625+
if (ctx.getParams().optBoolean("consolidateFirst", false))
626+
{
627+
ctx.getLogger().info("Will pre-consolidate the workspace using consolidate_genomicsdb_array");
628+
List<String> baseArgs = new ArrayList<>();
629+
baseArgs.add(SequencePipelineService.get().getExeForPackage("GENOMICSDB_PATH", "consolidate_genomicsdb_array").getPath());
630+
631+
baseArgs.add("-w");
632+
baseArgs.add(workingDestinationWorkspaceFolder.getPath());
633+
634+
if (ctx.getParams().optBoolean("sharedPosixOptimizations", false))
635+
{
636+
baseArgs.add("--shared-posixfs-optimizations");
637+
}
638+
639+
if (ctx.getParams().get("genomicsdbSegmentSize") != null)
640+
{
641+
baseArgs.add("--segment-size");
642+
baseArgs.add(String.valueOf(ctx.getParams().get("genomicsdbSegmentSize")));
643+
}
644+
645+
List<Interval> intervals = getIntervalsOrFullGenome(ctx, genome);
646+
for (Interval i : intervals)
647+
{
648+
File contigFolder = new File(workingDestinationWorkspaceFolder, getFolderNameFromInterval(i));
649+
ctx.getLogger().info("Consolidating contig folder: " + contigFolder);
650+
651+
List<String> toRun = new ArrayList<>(baseArgs);
652+
toRun.add("-a");
653+
toRun.add(contigFolder.getName());
654+
655+
new SimpleScriptWrapper(ctx.getLogger()).execute(toRun);
656+
657+
reportFragmentsPerContig(ctx, contigFolder, i.getContig());
658+
}
659+
}
660+
569661
if (!genomicsDbCompleted)
570662
{
571663
try

SequenceAnalysis/src/org/labkey/sequenceanalysis/run/util/GenomicsDBAppendHandler.java

Lines changed: 1 addition & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,9 @@
11
package org.labkey.sequenceanalysis.run.util;
22

3-
import org.json.JSONObject;
43
import org.labkey.api.module.ModuleLoader;
5-
import org.labkey.api.sequenceanalysis.pipeline.CommandLineParam;
6-
import org.labkey.api.sequenceanalysis.pipeline.ToolParameterDescriptor;
74
import org.labkey.api.util.PageFlowUtil;
85
import org.labkey.sequenceanalysis.SequenceAnalysisModule;
96

10-
import java.util.Arrays;
117
import java.util.LinkedHashSet;
128

139
//See: https://gatk.broadinstitute.org/hc/en-us/articles/360035891051-GenomicsDB
@@ -18,39 +14,7 @@ public class GenomicsDBAppendHandler extends AbstractGenomicsDBImportHandler
1814

1915
public GenomicsDBAppendHandler()
2016
{
21-
super(ModuleLoader.getInstance().getModule(SequenceAnalysisModule.class), NAME, "This will run GATK\'s GenomicsDBImport on a set of GVCF files. Note: this cannot work against any VCF file - these are primarily VCFs created using GATK\'s HaplotypeCaller.", new LinkedHashSet<>(PageFlowUtil.set("sequenceanalysis/field/SequenceOutputFileSelectorField.js")), Arrays.asList(
22-
ToolParameterDescriptor.createExpDataParam(EXISTING_WORKSPACE, "Existing Workspace", "This is the workspace into which new samples will be merged", "sequenceanalysis-sequenceoutputfileselectorfield", new JSONObject(){{
23-
put("allowBlank", false);
24-
put("category", CATEGORY);
25-
}}, null),
26-
ToolParameterDescriptor.create("fileBaseName", "Filename", "This is the basename that will be used for the output gzipped VCF", "textfield", null, "CombinedGenotypes"),
27-
ToolParameterDescriptor.create("doCopyGVcfLocal", "Copy gVCFs To Working Directory", "If selected, the gVCFs will be copied to the working directory first, which can improve performance when working with a large set of files.", "checkbox", new JSONObject(){{
28-
put("checked", false);
29-
}}, false),
30-
ToolParameterDescriptor.createCommandLineParam(CommandLineParam.create("--batch-size"), "batchSize", "Batch Size", "Batch size controls the number of samples for which readers are open at once and therefore provides a way to minimize memory consumption. However, it can take longer to complete. Use the consolidate flag if more than a hundred batches were used. This will improve feature read time. batchSize=0 means no batching (i.e. readers for all samples will be opened at once) Defaults to 0.", "ldk-integerfield", null, null),
31-
ToolParameterDescriptor.createCommandLineParam(CommandLineParam.create("--reader-threads"), "readerThreads", "Reader Threads", "How many simultaneous threads to use when opening VCFs in batches; higher values may improve performance when network latency is an issue", "ldk-integerfield", null, null),
32-
ToolParameterDescriptor.create("disableFileLocking", "Disable File Locking", "Certain filesystems do not support file locking, including NFS and Lustre. If your data will be processed on a filesystem that does not support locking, check this.", "checkbox", new JSONObject(){{
33-
put("checked", true);
34-
}}, true),
35-
ToolParameterDescriptor.create("sharedPosixOptimizations", "Use Shared Posix Optimizations", "This enabled optimizations for large shared filesystems, such as lustre.", "checkbox", new JSONObject(){{
36-
put("checked", true);
37-
}}, true),
38-
ToolParameterDescriptor.create("bypassFeatureReader", "Bypass Feature Reader", "If checked, rather than use the HTSJDK/Java reader, it will use a C-based implementation.", "checkbox", new JSONObject(){{
39-
put("checked", true);
40-
}}, true),
41-
ToolParameterDescriptor.createCommandLineParam(CommandLineParam.create("--genomicsdb-segment-size"), "genomicsdbSegmentSize", "Genomicsdb Segment Size", "Reducing this value may help with memory issues", "ldk-integerfield", new JSONObject(){{
42-
put("minValue", 0);
43-
}}, null),
44-
ToolParameterDescriptor.create("nativeMemoryBuffer", "C++ Memory Buffer", "By default, the pipeline java processes are allocated nearly all of the requested RAM. GenomicsDB requires memory for the C++ layer - this value (in GB) will be reserved for this. We recommend about 15-25% of the total job RAM", "ldk-integerfield", new JSONObject(){{
45-
put("minValue", 0);
46-
}}, 36),
47-
ToolParameterDescriptor.create("consolidate", "Consolidate", "If importing data in batches, a new fragment is created for each batch. In case thousands of fragments are created, GenomicsDB feature readers will try to open ~20x as many files. Also, internally GenomicsDB would consume more memory to maintain bookkeeping data from all fragments. Use this flag to merge all fragments into one. Merging can potentially improve read performance, however overall benefit might not be noticeable as the top Java layers have significantly higher overheads. This flag has no effect if only one batch is used. Defaults to false.", "checkbox", new JSONObject(){{
48-
put("checked", false);
49-
}}, false),
50-
ToolParameterDescriptor.create("scatterGather", "Scatter/Gather Options", "If selected, this job will be divided to run job per chromosome. The final step will take the VCF from each intermediate step and combined to make a final VCF file.", "sequenceanalysis-variantscattergatherpanel", new JSONObject(){{
51-
put("defaultValue", "chunked");
52-
}}, false)
53-
));
17+
super(ModuleLoader.getInstance().getModule(SequenceAnalysisModule.class), NAME, "This will run GATK\'s GenomicsDBImport on a set of GVCF files. Note: this cannot work against any VCF file - these are primarily VCFs created using GATK\'s HaplotypeCaller.", new LinkedHashSet<>(PageFlowUtil.set("sequenceanalysis/field/SequenceOutputFileSelectorField.js")), getToolParameters(true));
5418
}
5519

5620
@Override

SequenceAnalysis/src/org/labkey/sequenceanalysis/run/util/GenomicsDBImportHandler.java

Lines changed: 1 addition & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,8 @@
11
package org.labkey.sequenceanalysis.run.util;
22

3-
import org.json.JSONObject;
43
import org.labkey.api.module.ModuleLoader;
5-
import org.labkey.api.sequenceanalysis.pipeline.CommandLineParam;
6-
import org.labkey.api.sequenceanalysis.pipeline.ToolParameterDescriptor;
74
import org.labkey.sequenceanalysis.SequenceAnalysisModule;
85

9-
import java.util.Arrays;
10-
116
/**
127
* Created by bimber on 4/2/2017.
138
*/
@@ -17,35 +12,7 @@ public class GenomicsDBImportHandler extends AbstractGenomicsDBImportHandler
1712

1813
public GenomicsDBImportHandler()
1914
{
20-
super(ModuleLoader.getInstance().getModule(SequenceAnalysisModule.class), NAME, "This will run GATK\'s GenomicsDBImport on a set of GVCF files. Note: this cannot work against any VCF file - these are primarily VCFs created using GATK\'s HaplotypeCaller.", null, Arrays.asList(
21-
ToolParameterDescriptor.create("fileBaseName", "Filename", "This is the basename that will be used for the output gzipped VCF", "textfield", null, "CombinedGenotypes"),
22-
ToolParameterDescriptor.create("doCopyGVcfLocal", "Copy gVCFs To Working Directory", "If selected, the gVCFs will be copied to the working directory first, which can improve performance when working with a large set of files.", "checkbox", new JSONObject(){{
23-
put("checked", false);
24-
}}, false),
25-
ToolParameterDescriptor.createCommandLineParam(CommandLineParam.create("--batch-size"), "batchSize", "Batch Size", "Batch size controls the number of samples for which readers are open at once and therefore provides a way to minimize memory consumption. However, it can take longer to complete. Use the consolidate flag if more than a hundred batches were used. This will improve feature read time. batchSize=0 means no batching (i.e. readers for all samples will be opened at once) Defaults to 0.", "ldk-integerfield", null, null),
26-
ToolParameterDescriptor.createCommandLineParam(CommandLineParam.create("--reader-threads"), "readerThreads", "Reader Threads", "How many simultaneous threads to use when opening VCFs in batches; higher values may improve performance when network latency is an issue", "ldk-integerfield", null, null),
27-
ToolParameterDescriptor.create("disableFileLocking", "Disable File Locking", "Certain filesystems do not support file locking, including NFS and Lustre. If your data will be processed on a filesystem that does not support locking, check this.", "checkbox", new JSONObject(){{
28-
put("checked", true);
29-
}}, true),
30-
ToolParameterDescriptor.create("sharedPosixOptimizations", "Use Shared Posix Optimizations", "This enabled optimizations for large shared filesystems, such as lustre.", "checkbox", new JSONObject(){{
31-
put("checked", true);
32-
}}, true),
33-
ToolParameterDescriptor.create("bypassFeatureReader", "Bypass Feature Reader", "If checked, rather than use the HTSJDK/Java reader, it will use a C-based implementation.", "checkbox", new JSONObject(){{
34-
put("checked", true);
35-
}}, true),
36-
ToolParameterDescriptor.createCommandLineParam(CommandLineParam.create("--genomicsdb-segment-size"), "genomicsdbSegmentSize", "Genomicsdb Segment Size", "Reducing this value may help with memory issues", "ldk-integerfield", new JSONObject(){{
37-
put("minValue", 0);
38-
}}, null),
39-
ToolParameterDescriptor.create("nativeMemoryBuffer", "C++ Memory Buffer", "By default, the pipeline java processes are allocated nearly all of the requested RAM. GenomicsDB requires memory for the C++ layer - this value (in GB) will be reserved for this. We recommend about 15-25% of the total job RAM", "ldk-integerfield", new JSONObject(){{
40-
put("minValue", 0);
41-
}}, 36),
42-
ToolParameterDescriptor.create("consolidate", "Consolidate", "If importing data in batches, a new fragment is created for each batch. In case thousands of fragments are created, GenomicsDB feature readers will try to open ~20x as many files. Also, internally GenomicsDB would consume more memory to maintain bookkeeping data from all fragments. Use this flag to merge all fragments into one. Merging can potentially improve read performance, however overall benefit might not be noticeable as the top Java layers have significantly higher overheads. This flag has no effect if only one batch is used. Defaults to false.", "checkbox", new JSONObject(){{
43-
put("checked", false);
44-
}}, false),
45-
ToolParameterDescriptor.create("scatterGather", "Scatter/Gather Options", "If selected, this job will be divided to run job per chromosome. The final step will take the VCF from each intermediate step and combined to make a final VCF file.", "sequenceanalysis-variantscattergatherpanel", new JSONObject(){{
46-
put("defaultValue", "chunked");
47-
}}, false)
48-
));
15+
super(ModuleLoader.getInstance().getModule(SequenceAnalysisModule.class), NAME, "This will run GATK\'s GenomicsDBImport on a set of GVCF files. Note: this cannot work against any VCF file - these are primarily VCFs created using GATK\'s HaplotypeCaller.", null, getToolParameters(false));
4916
}
5017

5118
@Override

0 commit comments

Comments
 (0)