Merge discvr-22.7 to develop

root · root · commit 0bf4f720345c · 2022-11-02T20:02:07.000Z
diff --git a/mGAP/resources/views/releaseNotes.html b/mGAP/resources/views/releaseNotes.html
@@ -1,3 +1,13 @@
+<h4>Release 2.3:</h4>
+<ul>
+    <li>This is an additional 560 animals over the prior version.</li>
+    <li>There are a sizable number of data processing changes, largely adaptations to handle the rapidly growing dataset size:</li>
+    <ol>
+        <li>All data used <a href="https://gatk.broadinstitute.org/hc/en-us/articles/4405443600667-ReblockGVCF">GATK Reblocked gVCFs</a> as inputs. This reduces processing, but can reduce sensitivity at homozygous-reference sites (resulting in greater numbers of no-call genotypes at those sites).</li>
+        <li>Also, to adapt to the larger data size, we changed the structure of data processing. Previously, samples were aggregated into one GenomicsDB workspace per data type (WGS or WXS). Next, GenotypeGVCFs was run on each workspace, with one job per contig. The resulting VCFs were filtered and merged. In this release, the upfront aggregation step was dropped, and we instead: 1) use reblocked gVCFs as input (entire set of samples), 2) chunk the genome into ~1000 bins with one job/bin, 3) per bin, run GenomicsDBImport to make a transient workspace using the job's intervals +/- 1000 bp, 4) run GenotypeGVCFs against that workspace, 5) filter the result, including technology-aware thresholds (i.e. different depth filters for WGS/WXS). This process is considerably more efficient and has the advantage of joint-genotyping across the entire cohort at once.</li>
+    </ol>
+</ul>
+
 <h4>Release 2.2:</h4>
 <ul>
     <li>This is an additional 103 animals over the prior version.</li>
diff --git a/mGAP/src/org/labkey/mgap/pipeline/mGapReleaseGenerator.java b/mGAP/src/org/labkey/mgap/pipeline/mGapReleaseGenerator.java
@@ -306,12 +306,12 @@ else if (so.getCategory().contains("Lifted"))
                 }
                 else if (so.getCategory().contains("mGAP Release: Sites Only"))
                 {
-                    String name = so.getName().replaceAll(": Sites Only", "");
-                    sitesOnlyVcfMap.put(name, so);
+                    sitesOnlyVcfMap.put("mGAP Release: " + releaseVersion, so);
                 }
                 else if (so.getCategory().contains("Release Track") && so.getName().contains("Novel Sites"))
                 {
                     novelSitesVcfMap.put("mGAP Release: " + releaseVersion, so);
+                    trackVCFMap.put(so.getName(), so);
                 }
                 else if (so.getCategory().endsWith("Release"))
                 {
@@ -853,12 +853,15 @@ public void processFilesRemote(List<SequenceOutputFile> inputFiles, JobContext c
                     ctx.getFileManager().removeIntermediateFile(renamedVcfIdx);
                     ctx.getFileManager().addIntermediateFile(renamedVcfDone);
 
-                    SequenceOutputFile output = new SequenceOutputFile();
-                    output.setFile(renamedVcf);
-                    output.setName(track.getTrackName());
-                    output.setCategory("Release Track");
-                    output.setLibrary_id(genome.getGenomeId());
-                    ctx.getFileManager().addSequenceOutput(output);
+                    if (!track.isPrimary())
+                    {
+                        SequenceOutputFile output = new SequenceOutputFile();
+                        output.setFile(renamedVcf);
+                        output.setName(track.getTrackName());
+                        output.setCategory("Release Track");
+                        output.setLibrary_id(genome.getGenomeId());
+                        ctx.getFileManager().addSequenceOutput(output);
+                    }
                 }
             }
             catch (IOException e)
diff --git a/mcc/resources/queries/mcc/aggregatedDemographics.sql b/mcc/resources/queries/mcc/aggregatedDemographics.sql
@@ -82,45 +82,6 @@ FROM "/data/Colonies/WNPRC/".study.demographics d
 
 UNION ALL
 
-SELECT
-    d.Id.mccAlias.externalAlias as Id,
-    d.Id as originalId,
-    d.date,
-    d.species,
-    d.gender,
-    d.birth,
-    d.death,
-    'U NEB' as colony,
-    d.damMccAlias.externalAlias as dam,
-    d.sireMccAlias.externalAlias as sire,
-    d.dam as originalDam,
-    d.sire as originalSire,
-    d.Id.mostRecentWeight.mostRecentWeight as mostRecentWeight,
-    d.objectid,
-    d.calculated_status,
-    false as u24_status,
-    o.availability,
-    o.current_housing_status,
-    o.infant_history,
-    o.fertility_status,
-    o.medical_history,
-    o.date_of_observations,
-    d.container
-
-FROM "/data/Colonies/UNO/".study.demographics d
-         LEFT JOIN (SELECT
-                        o.Id,
-                        o.date_of_observations,
-                        o."availability::observation" as availability,
-                        o."current_housing_status::observation" as current_housing_status,
-                        o."infant_history::observation" as infant_history,
-                        o."fertility_status::observation" as fertility_status,
-                        o."medical_history::observation" as medical_history,
-                    FROM "/data/Colonies/UNO/".study.mostRecentObservationsPivoted o
-) o ON (o.Id = d.Id)
-
-UNION ALL
-
 SELECT
     d.Id.mccAlias.externalAlias as Id,
     d.Id as originalId,
diff --git a/mcc/resources/queries/mcc/aggregatedKinship.sql b/mcc/resources/queries/mcc/aggregatedKinship.sql
@@ -28,21 +28,6 @@ FROM "/data/Colonies/WNPRC/".study.kinship d
 
 UNION ALL
 
-SELECT
-    d.Id.mccAlias.externalAlias as Id,
-    d.Id as originalId,
-    d.date,
-    d.Id2MccAlias.externalAlias as Id2,
-    d.Id2 as originalId2,
-    d.kinship,
-    d.relationship,
-    d.objectid,
-    d.container
-
-FROM "/data/Colonies/UNO/".study.kinship d
-
-UNION ALL
-
 SELECT
     d.Id.mccAlias.externalAlias as Id,
     d.Id as originalId,

Original file line number	Diff line number	Diff line change
`@@ -306,12 +306,12 @@ else if (so.getCategory().contains("Lifted"))`
`306`	`306`	`}`
`307`	`307`	`else if (so.getCategory().contains("mGAP Release: Sites Only"))`
`308`	`308`	`{`
`309`		`- String name = so.getName().replaceAll(": Sites Only", "");`
`310`		`- sitesOnlyVcfMap.put(name, so);`
	`309`	`+ sitesOnlyVcfMap.put("mGAP Release: " + releaseVersion, so);`
`311`	`310`	`}`
`312`	`311`	`else if (so.getCategory().contains("Release Track") && so.getName().contains("Novel Sites"))`
`313`	`312`	`{`
`314`	`313`	`novelSitesVcfMap.put("mGAP Release: " + releaseVersion, so);`
	`314`	`+ trackVCFMap.put(so.getName(), so);`
`315`	`315`	`}`
`316`	`316`	`else if (so.getCategory().endsWith("Release"))`
`317`	`317`	`{`
`@@ -853,12 +853,15 @@ public void processFilesRemote(List<SequenceOutputFile> inputFiles, JobContext c`
`853`	`853`	`ctx.getFileManager().removeIntermediateFile(renamedVcfIdx);`
`854`	`854`	`ctx.getFileManager().addIntermediateFile(renamedVcfDone);`
`855`	`855`
`856`		`- SequenceOutputFile output = new SequenceOutputFile();`
`857`		`- output.setFile(renamedVcf);`
`858`		`- output.setName(track.getTrackName());`
`859`		`- output.setCategory("Release Track");`
`860`		`- output.setLibrary_id(genome.getGenomeId());`
`861`		`- ctx.getFileManager().addSequenceOutput(output);`
	`856`	`+ if (!track.isPrimary())`
	`857`	`+ {`
	`858`	`+ SequenceOutputFile output = new SequenceOutputFile();`
	`859`	`+ output.setFile(renamedVcf);`
	`860`	`+ output.setName(track.getTrackName());`
	`861`	`+ output.setCategory("Release Track");`
	`862`	`+ output.setLibrary_id(genome.getGenomeId());`
	`863`	`+ ctx.getFileManager().addSequenceOutput(output);`
	`864`	`+ }`
`862`	`865`	`}`
`863`	`866`	`}`
`864`	`867`	`catch (IOException e)`