Add option to run consolidate_genomicsdb_array prior to GenomicsDBImport/Append

bbimber · bbimber · commit 06f7da5b7ebf · 2022-03-03T09:44:04.000-08:00
diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/util/AbstractGenomicsDBImportHandler.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/util/AbstractGenomicsDBImportHandler.java
@@ -19,6 +19,7 @@
 import org.labkey.api.pipeline.RecordedAction;
 import org.labkey.api.sequenceanalysis.SequenceOutputFile;
 import org.labkey.api.sequenceanalysis.pipeline.AbstractParameterizedOutputHandler;
+import org.labkey.api.sequenceanalysis.pipeline.CommandLineParam;
 import org.labkey.api.sequenceanalysis.pipeline.ReferenceGenome;
 import org.labkey.api.sequenceanalysis.pipeline.SequenceAnalysisJobSupport;
 import org.labkey.api.sequenceanalysis.pipeline.SequenceOutputHandler;
@@ -66,6 +67,61 @@ public AbstractGenomicsDBImportHandler(Module owner, String name, String descrip
         super(owner, name, description, dependencies, parameters);
     }
 
+    protected static List<ToolParameterDescriptor> getToolParameters(boolean addCopyOption) // Builds the tool parameter list; addCopyOption toggles the existing-workspace selector and the consolidateFirst checkbox
+    {
+        List<ToolParameterDescriptor> ret = new ArrayList<>();
+
+        if (addCopyOption) // when requested, the list starts with the existing-workspace selector (configured with allowBlank=false)
+        {
+            ret.add(ToolParameterDescriptor.createExpDataParam(EXISTING_WORKSPACE, "Existing Workspace", "This is the workspace into which new samples will be merged", "sequenceanalysis-sequenceoutputfileselectorfield", new JSONObject()
+            {{
+                put("allowBlank", false);
+                put("category", CATEGORY);
+            }}, null));
+        }
+
+        ret.addAll(Arrays.asList( // these parameters are added regardless of addCopyOption
+            ToolParameterDescriptor.create("fileBaseName", "Filename", "This is the basename that will be used for the output gzipped VCF", "textfield", null, "CombinedGenotypes"),
+            ToolParameterDescriptor.create("doCopyGVcfLocal", "Copy gVCFs To Working Directory", "If selected, the gVCFs will be copied to the working directory first, which can improve performance when working with a large set of files.", "checkbox", new JSONObject(){{
+                put("checked", false);
+            }}, false),
+            ToolParameterDescriptor.createCommandLineParam(CommandLineParam.create("--batch-size"), "batchSize", "Batch Size", "Batch size controls the number of samples for which readers are open at once and therefore provides a way to minimize memory consumption. However, it can take longer to complete. Use the consolidate flag if more than a hundred batches were used. This will improve feature read time. batchSize=0 means no batching (i.e. readers for all samples will be opened at once) Defaults to 0.", "ldk-integerfield", null, null), // declared as a command-line param tied to --batch-size
+            ToolParameterDescriptor.createCommandLineParam(CommandLineParam.create("--reader-threads"), "readerThreads", "Reader Threads", "How many simultaneous threads to use when opening VCFs in batches; higher values may improve performance when network latency is an issue", "ldk-integerfield", null, null), // declared as a command-line param tied to --reader-threads
+            ToolParameterDescriptor.create("disableFileLocking", "Disable File Locking", "Certain filesystems do not support file locking, including NFS and Lustre.  If your data will be processed on a filesystem that does not support locking, check this.", "checkbox", new JSONObject(){{
+                put("checked", true);
+            }}, true),
+            ToolParameterDescriptor.create("sharedPosixOptimizations", "Use Shared Posix Optimizations", "This enabled optimizations for large shared filesystems, such as lustre.", "checkbox", new JSONObject(){{
+                put("checked", true);
+            }}, true),
+            ToolParameterDescriptor.create("bypassFeatureReader", "Bypass Feature Reader", "If checked, rather than use the HTSJDK/Java reader, it will use a C-based implementation.", "checkbox", new JSONObject(){{
+                put("checked", true);
+            }}, true),
+            ToolParameterDescriptor.createCommandLineParam(CommandLineParam.create("--genomicsdb-segment-size"), "genomicsdbSegmentSize", "Genomicsdb Segment Size", "Reducing this value may help with memory issues", "ldk-integerfield", new JSONObject(){{ // declared as a command-line param tied to --genomicsdb-segment-size
+                put("minValue", 0);
+            }}, null),
+            ToolParameterDescriptor.create("nativeMemoryBuffer", "C++ Memory Buffer", "By default, the pipeline java processes are allocated nearly all of the requested RAM.  GenomicsDB requires memory for the C++ layer - this value (in GB) will be reserved for this.  We recommend about 15-25% of the total job RAM", "ldk-integerfield", new JSONObject(){{
+                put("minValue", 0);
+            }}, 36),
+            ToolParameterDescriptor.create("consolidate", "Consolidate", "If importing data in batches, a new fragment is created for each batch. In case thousands of fragments are created, GenomicsDB feature readers will try to open ~20x as many files. Also, internally GenomicsDB would consume more memory to maintain bookkeeping data from all fragments. Use this flag to merge all fragments into one. Merging can potentially improve read performance, however overall benefit might not be noticeable as the top Java layers have significantly higher overheads. This flag has no effect if only one batch is used. Defaults to false.", "checkbox", new JSONObject(){{
+                put("checked", false);
+            }}, false),
+            ToolParameterDescriptor.create("scatterGather", "Scatter/Gather Options", "If selected, this job will be divided to run job per chromosome.  The final step will take the VCF from each intermediate step and combined to make a final VCF file.", "sequenceanalysis-variantscattergatherpanel", new JSONObject(){{
+                put("defaultValue", "chunked");
+            }}, false)
+        ));
+
+        if (addCopyOption) // consolidateFirst is appended last, and only together with the existing-workspace selector above
+        {
+            ret.add(
+                ToolParameterDescriptor.create("consolidateFirst", "Consolidate First", "If checked, this will run the standalone tool consolidate_genomicsdb_array on the input prior to running GATK.", "checkbox", new JSONObject(){{
+                    put("checked", false);
+                }}, false)
+            );
+        }
+
+        return ret;
+    }
+
     @Override
     public void validateScatter(VariantProcessingStep.ScatterGatherMethod method, PipelineJob job) throws IllegalArgumentException
     {
@@ -566,6 +622,42 @@ else if (genomeIds.isEmpty())
                 wrapper.addToEnvironment("TILEDB_DISABLE_FILE_LOCKING", "1");
             }
 
+            if (ctx.getParams().optBoolean("consolidateFirst", false))
+            {
+                ctx.getLogger().info("Will pre-consolidate the workspace using consolidate_genomicsdb_array");
+                List<String> baseArgs = new ArrayList<>();
+                baseArgs.add(SequencePipelineService.get().getExeForPackage("GENOMICSDB_PATH", "consolidate_genomicsdb_array").getPath());
+
+                baseArgs.add("-w");
+                baseArgs.add(workingDestinationWorkspaceFolder.getPath());
+
+                if (ctx.getParams().optBoolean("sharedPosixOptimizations", false))
+                {
+                    baseArgs.add("--shared-posixfs-optimizations");
+                }
+
+                if (ctx.getParams().get("genomicsdbSegmentSize") != null)
+                {
+                    baseArgs.add("--segment-size");
+                    baseArgs.add(String.valueOf(ctx.getParams().get("genomicsdbSegmentSize")));
+                }
+
+                List<Interval> intervals = getIntervalsOrFullGenome(ctx, genome);
+                for (Interval i : intervals)
+                {
+                    File contigFolder = new File(workingDestinationWorkspaceFolder, getFolderNameFromInterval(i));
+                    ctx.getLogger().info("Consolidating contig folder: " + contigFolder);
+
+                    List<String> toRun = new ArrayList<>(baseArgs);
+                    toRun.add("-a");
+                    toRun.add(contigFolder.getName());
+
+                    new SimpleScriptWrapper(ctx.getLogger()).execute(toRun);
+
+                    reportFragmentsPerContig(ctx, contigFolder, i.getContig());
+                }
+            }
+
             if (!genomicsDbCompleted)
             {
                 try
diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/util/GenomicsDBAppendHandler.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/util/GenomicsDBAppendHandler.java
@@ -1,13 +1,9 @@
 package org.labkey.sequenceanalysis.run.util;
 
-import org.json.JSONObject;
 import org.labkey.api.module.ModuleLoader;
-import org.labkey.api.sequenceanalysis.pipeline.CommandLineParam;
-import org.labkey.api.sequenceanalysis.pipeline.ToolParameterDescriptor;
 import org.labkey.api.util.PageFlowUtil;
 import org.labkey.sequenceanalysis.SequenceAnalysisModule;
 
-import java.util.Arrays;
 import java.util.LinkedHashSet;
 
 //See: https://gatk.broadinstitute.org/hc/en-us/articles/360035891051-GenomicsDB
@@ -18,39 +14,7 @@ public class GenomicsDBAppendHandler extends AbstractGenomicsDBImportHandler
 
     public GenomicsDBAppendHandler()
     {
-        super(ModuleLoader.getInstance().getModule(SequenceAnalysisModule.class), NAME, "This will run GATK\'s GenomicsDBImport on a set of GVCF files.  Note: this cannot work against any VCF file - these are primarily VCFs created using GATK\'s HaplotypeCaller.", new LinkedHashSet<>(PageFlowUtil.set("sequenceanalysis/field/SequenceOutputFileSelectorField.js")), Arrays.asList(
-                ToolParameterDescriptor.createExpDataParam(EXISTING_WORKSPACE, "Existing Workspace", "This is the workspace into which new samples will be merged", "sequenceanalysis-sequenceoutputfileselectorfield", new JSONObject(){{
-                    put("allowBlank", false);
-                    put("category", CATEGORY);
-                }}, null),
-                ToolParameterDescriptor.create("fileBaseName", "Filename", "This is the basename that will be used for the output gzipped VCF", "textfield", null, "CombinedGenotypes"),
-                ToolParameterDescriptor.create("doCopyGVcfLocal", "Copy gVCFs To Working Directory", "If selected, the gVCFs will be copied to the working directory first, which can improve performance when working with a large set of files.", "checkbox", new JSONObject(){{
-                    put("checked", false);
-                }}, false),
-                ToolParameterDescriptor.createCommandLineParam(CommandLineParam.create("--batch-size"), "batchSize", "Batch Size", "Batch size controls the number of samples for which readers are open at once and therefore provides a way to minimize memory consumption. However, it can take longer to complete. Use the consolidate flag if more than a hundred batches were used. This will improve feature read time. batchSize=0 means no batching (i.e. readers for all samples will be opened at once) Defaults to 0.", "ldk-integerfield", null, null),
-                ToolParameterDescriptor.createCommandLineParam(CommandLineParam.create("--reader-threads"), "readerThreads", "Reader Threads", "How many simultaneous threads to use when opening VCFs in batches; higher values may improve performance when network latency is an issue", "ldk-integerfield", null, null),
-                ToolParameterDescriptor.create("disableFileLocking", "Disable File Locking", "Certain filesystems do not support file locking, including NFS and Lustre.  If your data will be processed on a filesystem that does not support locking, check this.", "checkbox", new JSONObject(){{
-                    put("checked", true);
-                }}, true),
-                ToolParameterDescriptor.create("sharedPosixOptimizations", "Use Shared Posix Optimizations", "This enabled optimizations for large shared filesystems, such as lustre.", "checkbox", new JSONObject(){{
-                    put("checked", true);
-                }}, true),
-                ToolParameterDescriptor.create("bypassFeatureReader", "Bypass Feature Reader", "If checked, rather than use the HTSJDK/Java reader, it will use a C-based implementation.", "checkbox", new JSONObject(){{
-                    put("checked", true);
-                }}, true),
-                ToolParameterDescriptor.createCommandLineParam(CommandLineParam.create("--genomicsdb-segment-size"), "genomicsdbSegmentSize", "Genomicsdb Segment Size", "Reducing this value may help with memory issues", "ldk-integerfield", new JSONObject(){{
-                    put("minValue", 0);
-                }}, null),
-                ToolParameterDescriptor.create("nativeMemoryBuffer", "C++ Memory Buffer", "By default, the pipeline java processes are allocated nearly all of the requested RAM.  GenomicsDB requires memory for the C++ layer - this value (in GB) will be reserved for this.  We recommend about 15-25% of the total job RAM", "ldk-integerfield", new JSONObject(){{
-                    put("minValue", 0);
-                }}, 36),
-                ToolParameterDescriptor.create("consolidate", "Consolidate", "If importing data in batches, a new fragment is created for each batch. In case thousands of fragments are created, GenomicsDB feature readers will try to open ~20x as many files. Also, internally GenomicsDB would consume more memory to maintain bookkeeping data from all fragments. Use this flag to merge all fragments into one. Merging can potentially improve read performance, however overall benefit might not be noticeable as the top Java layers have significantly higher overheads. This flag has no effect if only one batch is used. Defaults to false.", "checkbox", new JSONObject(){{
-                    put("checked", false);
-                }}, false),
-                ToolParameterDescriptor.create("scatterGather", "Scatter/Gather Options", "If selected, this job will be divided to run job per chromosome.  The final step will take the VCF from each intermediate step and combined to make a final VCF file.", "sequenceanalysis-variantscattergatherpanel", new JSONObject(){{
-                    put("defaultValue", "chunked");
-                }}, false)
-        ));
+        super(ModuleLoader.getInstance().getModule(SequenceAnalysisModule.class), NAME, "This will run GATK\'s GenomicsDBImport on a set of GVCF files.  Note: this cannot work against any VCF file - these are primarily VCFs created using GATK\'s HaplotypeCaller.", new LinkedHashSet<>(PageFlowUtil.set("sequenceanalysis/field/SequenceOutputFileSelectorField.js")), getToolParameters(true)); // parameters now come from the inherited getToolParameters(); passing true includes the existing-workspace selector and the consolidateFirst checkbox
     }
 
     @Override
diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/util/GenomicsDBImportHandler.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/util/GenomicsDBImportHandler.java
@@ -1,13 +1,8 @@
 package org.labkey.sequenceanalysis.run.util;
 
-import org.json.JSONObject;
 import org.labkey.api.module.ModuleLoader;
-import org.labkey.api.sequenceanalysis.pipeline.CommandLineParam;
-import org.labkey.api.sequenceanalysis.pipeline.ToolParameterDescriptor;
 import org.labkey.sequenceanalysis.SequenceAnalysisModule;
 
-import java.util.Arrays;
-
 /**
  * Created by bimber on 4/2/2017.
  */
@@ -17,35 +12,7 @@ public class GenomicsDBImportHandler extends AbstractGenomicsDBImportHandler
 
     public GenomicsDBImportHandler()
     {
-        super(ModuleLoader.getInstance().getModule(SequenceAnalysisModule.class), NAME, "This will run GATK\'s GenomicsDBImport on a set of GVCF files.  Note: this cannot work against any VCF file - these are primarily VCFs created using GATK\'s HaplotypeCaller.", null, Arrays.asList(
-                ToolParameterDescriptor.create("fileBaseName", "Filename", "This is the basename that will be used for the output gzipped VCF", "textfield", null, "CombinedGenotypes"),
-                ToolParameterDescriptor.create("doCopyGVcfLocal", "Copy gVCFs To Working Directory", "If selected, the gVCFs will be copied to the working directory first, which can improve performance when working with a large set of files.", "checkbox", new JSONObject(){{
-                    put("checked", false);
-                }}, false),
-                ToolParameterDescriptor.createCommandLineParam(CommandLineParam.create("--batch-size"), "batchSize", "Batch Size", "Batch size controls the number of samples for which readers are open at once and therefore provides a way to minimize memory consumption. However, it can take longer to complete. Use the consolidate flag if more than a hundred batches were used. This will improve feature read time. batchSize=0 means no batching (i.e. readers for all samples will be opened at once) Defaults to 0.", "ldk-integerfield", null, null),
-                ToolParameterDescriptor.createCommandLineParam(CommandLineParam.create("--reader-threads"), "readerThreads", "Reader Threads", "How many simultaneous threads to use when opening VCFs in batches; higher values may improve performance when network latency is an issue", "ldk-integerfield", null, null),
-                ToolParameterDescriptor.create("disableFileLocking", "Disable File Locking", "Certain filesystems do not support file locking, including NFS and Lustre.  If your data will be processed on a filesystem that does not support locking, check this.", "checkbox", new JSONObject(){{
-                    put("checked", true);
-                }}, true),
-                ToolParameterDescriptor.create("sharedPosixOptimizations", "Use Shared Posix Optimizations", "This enabled optimizations for large shared filesystems, such as lustre.", "checkbox", new JSONObject(){{
-                    put("checked", true);
-                }}, true),
-                ToolParameterDescriptor.create("bypassFeatureReader", "Bypass Feature Reader", "If checked, rather than use the HTSJDK/Java reader, it will use a C-based implementation.", "checkbox", new JSONObject(){{
-                    put("checked", true);
-                }}, true),
-                ToolParameterDescriptor.createCommandLineParam(CommandLineParam.create("--genomicsdb-segment-size"), "genomicsdbSegmentSize", "Genomicsdb Segment Size", "Reducing this value may help with memory issues", "ldk-integerfield", new JSONObject(){{
-                    put("minValue", 0);
-                }}, null),
-                ToolParameterDescriptor.create("nativeMemoryBuffer", "C++ Memory Buffer", "By default, the pipeline java processes are allocated nearly all of the requested RAM.  GenomicsDB requires memory for the C++ layer - this value (in GB) will be reserved for this.  We recommend about 15-25% of the total job RAM", "ldk-integerfield", new JSONObject(){{
-                    put("minValue", 0);
-                }}, 36),
-                ToolParameterDescriptor.create("consolidate", "Consolidate", "If importing data in batches, a new fragment is created for each batch. In case thousands of fragments are created, GenomicsDB feature readers will try to open ~20x as many files. Also, internally GenomicsDB would consume more memory to maintain bookkeeping data from all fragments. Use this flag to merge all fragments into one. Merging can potentially improve read performance, however overall benefit might not be noticeable as the top Java layers have significantly higher overheads. This flag has no effect if only one batch is used. Defaults to false.", "checkbox", new JSONObject(){{
-                    put("checked", false);
-                }}, false),
-                ToolParameterDescriptor.create("scatterGather", "Scatter/Gather Options", "If selected, this job will be divided to run job per chromosome.  The final step will take the VCF from each intermediate step and combined to make a final VCF file.", "sequenceanalysis-variantscattergatherpanel", new JSONObject(){{
-                    put("defaultValue", "chunked");
-                }}, false)
-        ));
+        super(ModuleLoader.getInstance().getModule(SequenceAnalysisModule.class), NAME, "This will run GATK\'s GenomicsDBImport on a set of GVCF files.  Note: this cannot work against any VCF file - these are primarily VCFs created using GATK\'s HaplotypeCaller.", null, getToolParameters(false)); // parameters now come from the inherited getToolParameters(); passing false omits the existing-workspace selector and the consolidateFirst checkbox
     }
 
     @Override