Skip to content

Commit c58ba3d

Browse files
committed
Update mGAP release code to handle multi-species
1 parent 70acbc4, commit c58ba3d

File tree

1 file changed

+37
-36
lines changed

1 file changed

+37
-36
lines changed

mGAP/src/org/labkey/mgap/pipeline/mGapReleaseGenerator.java

Lines changed: 37 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -96,11 +96,21 @@
9696
public class mGapReleaseGenerator extends AbstractParameterizedOutputHandler<SequenceOutputHandler.SequenceOutputProcessor>
9797
{
9898
private final FileType _vcfType = new FileType(List.of(".vcf"), ".vcf", false, FileType.gzSupportLevel.SUPPORT_GZ);
99-
public static final String MMUL_GENOME = "mmulGenome";
99+
public static final String BASE_GENOME = "baseGenome";
100100

101101
public mGapReleaseGenerator()
102102
{
103103
super(ModuleLoader.getInstance().getModule(mGAPModule.class), "Create mGAP Release", "This will prepare an input VCF for use as an mGAP public release. This will optionally include: removing excess annotations and program records, limiting to SNVs (optional) and removing genotype data (optional). If genotypes are retained, the subject names will be checked for mGAP aliases and replaced as needed.", new LinkedHashSet<>(PageFlowUtil.set("sequenceanalysis/field/GenomeFileSelectorField.js")), Arrays.asList(
104+
ToolParameterDescriptor.create("species", "Version", "The species, which is used to filter tracks", "ldk-simplelabkeycombo", new JSONObject(){{
105+
put("allowBlank", false);
106+
put("doNotIncludeInTemplates", true);
107+
put("width", 400);
108+
put("schemaName", "laboratory");
109+
put("queryName", "species");
110+
put("containerPath", "js:Laboratory.Utils.getQueryContainerPath()");
111+
put("displayField", "common_name");
112+
put("valueField", "common_name");
113+
}}, null),
104114
ToolParameterDescriptor.create("releaseVersion", "Version", "This value will be used as the version when published.", "textfield", new JSONObject(){{
105115
put("allowBlank", false);
106116
put("doNotIncludeInTemplates", true);
@@ -182,10 +192,16 @@ public void init(JobContext ctx, List<SequenceOutputFile> inputFiles, List<Recor
182192
ctx.getJob().getLogger().info("writing track/subset data to file");
183193
Container target = ctx.getJob().getContainer().isWorkbook() ? ctx.getJob().getContainer().getParent() : ctx.getJob().getContainer();
184194
TableInfo releaseTracks = QueryService.get().getUserSchema(ctx.getJob().getUser(), target, mGAPSchema.NAME).getTable(mGAPSchema.TABLE_RELEASE_TRACKS);
195+
196+
final String species = ctx.getParams().optString("species");
197+
if (species == null)
198+
{
199+
throw new PipelineJobException("Missing value for species");
200+
}
185201

186202
Set<FieldKey> toSelect = new HashSet<>();
187203
toSelect.add(FieldKey.fromString("trackName"));
188-
toSelect.add(FieldKey.fromString("mergepriority"));
204+
toSelect.add(FieldKey.fromString("species"));
189205
toSelect.add(FieldKey.fromString("skipvalidation"));
190206
toSelect.add(FieldKey.fromString("isprimarytrack"));
191207
toSelect.add(FieldKey.fromString("vcfId"));
@@ -197,7 +213,7 @@ public void init(JobContext ctx, List<SequenceOutputFile> inputFiles, List<Recor
197213
File trackFile = getTrackListFile(ctx.getOutputDir());
198214
try (CSVWriter writer = new CSVWriter(PrintWriters.getPrintWriter(trackFile), '\t', CSVWriter.NO_QUOTE_CHARACTER))
199215
{
200-
new TableSelector(releaseTracks, colMap.values(), null, null).forEachResults(rs -> {
216+
new TableSelector(releaseTracks, colMap.values(), new SimpleFilter(FieldKey.fromString("species"), species), null).forEachResults(rs -> {
201217
if (rs.getObject(FieldKey.fromString("vcfId")) == null)
202218
{
203219
throw new SQLException("No VCF found for track: " + rs.getObject(FieldKey.fromString("trackName")));
@@ -217,7 +233,7 @@ public void init(JobContext ctx, List<SequenceOutputFile> inputFiles, List<Recor
217233
writer.writeNext(new String[]{
218234
rs.getString(FieldKey.fromString("trackName")),
219235
String.valueOf(rs.getInt(FieldKey.fromString("vcfId/dataId"))),
220-
String.valueOf(rs.getObject(FieldKey.fromString("mergepriority")) == null ? 999 : rs.getInt(FieldKey.fromString("mergepriority"))),
236+
rs.getString(FieldKey.fromString("species")),
221237
String.valueOf(rs.getObject(FieldKey.fromString("skipvalidation")) != null && rs.getBoolean(FieldKey.fromString("skipvalidation"))),
222238
String.valueOf(rs.getObject(FieldKey.fromString("isprimarytrack")) != null && rs.getBoolean(FieldKey.fromString("isprimarytrack")))
223239
});
@@ -256,7 +272,7 @@ public void init(JobContext ctx, List<SequenceOutputFile> inputFiles, List<Recor
256272
}
257273
int sourceGenome = genomeIds.iterator().next();
258274
ctx.getSequenceSupport().cacheGenome(SequenceAnalysisService.get().getReferenceGenome(sourceGenome, ctx.getJob().getUser()));
259-
ctx.getSequenceSupport().cacheObject(MMUL_GENOME, sourceGenome);
275+
ctx.getSequenceSupport().cacheObject(BASE_GENOME, sourceGenome);
260276

261277
AnnotationStep.findChainFile(genomeIds.iterator().next(), ctx.getParams().getInt(AnnotationStep.GRCH37), ctx.getSequenceSupport(), ctx.getJob());
262278

@@ -286,8 +302,8 @@ public void init(JobContext ctx, List<SequenceOutputFile> inputFiles, List<Recor
286302

287303
private SequenceOutputFile getAndValidateLuceneIndex(PipelineJob job, JSONObject params) throws PipelineJobException
288304
{
289-
Integer luceneIndexId = params.optInt("luceneIndex");
290-
if (luceneIndexId == null || luceneIndexId == 0)
305+
int luceneIndexId = params.optInt("luceneIndex");
306+
if (luceneIndexId == 0)
291307
{
292308
throw new PipelineJobException("Missing luceneIndex ID");
293309
}
@@ -520,10 +536,13 @@ else if (so.getCategory().endsWith("Release Track"))
520536
throw new PipelineJobException("Unable to find total variant from stats file!");
521537
}
522538

539+
final String species = ctx.getParams().optString("species");
540+
523541
//actually create release record
524542
Map<String, Object> row = new CaseInsensitiveHashMap<>();
525543
row.put("version", job.getParameters().get("releaseVersion"));
526544
row.put("releaseDate", new Date());
545+
row.put("species", species);
527546
row.put("vcfId", so.getRowid());
528547
row.put("liftedVcfId", liftedVcf.getRowid());
529548
row.put("sitesOnlyVcfId", sitesOnlyVcf.getRowid());
@@ -583,7 +602,7 @@ else if (so.getCategory().endsWith("Release Track"))
583602

584603
//also tracks:
585604
UserSchema us = QueryService.get().getUserSchema(job.getUser(), job.getContainer().isWorkbook() ? job.getContainer().getParent() : job.getContainer(), mGAPSchema.NAME);
586-
new TableSelector(us.getTable(mGAPSchema.TABLE_RELEASE_TRACKS), null, null).forEachResults(rs -> {
605+
new TableSelector(us.getTable(mGAPSchema.TABLE_RELEASE_TRACKS), new SimpleFilter(FieldKey.fromString("species"), species), null).forEachResults(rs -> {
587606
SequenceOutputFile so3 = trackVCFMap.get(rs.getString(FieldKey.fromString("trackName")));
588607
if (so3 == null && rs.getBoolean(FieldKey.fromString("isprimarytrack")))
589608
{
@@ -836,15 +855,15 @@ public static class TrackDescriptor
836855
{
837856
String _trackName;
838857
Integer _dataId;
839-
Integer _mergePriority;
858+
String _species;
840859
boolean _skipValidation;
841860
boolean _isPrimary;
842861

843862
public TrackDescriptor(String[] vals)
844863
{
845864
_trackName = vals[0];
846865
_dataId = Integer.parseInt(vals[1]);
847-
_mergePriority = Integer.parseInt(vals[2]);
866+
_species = vals[2];
848867
_skipValidation = Boolean.parseBoolean(vals[3]);
849868
_isPrimary = Boolean.parseBoolean(vals[4]);
850869
}
@@ -859,9 +878,9 @@ public Integer getDataId()
859878
return _dataId;
860879
}
861880

862-
public Integer getMergePriority()
881+
public String getSpecies()
863882
{
864-
return _mergePriority;
883+
return _species;
865884
}
866885

867886
public boolean isSkipValidation()
@@ -886,15 +905,6 @@ private List<TrackDescriptor> getTracks(File webserverDir) throws PipelineJobExc
886905
ret.add(new TrackDescriptor(line));
887906
}
888907

889-
ret.sort(new Comparator<TrackDescriptor>()
890-
{
891-
@Override
892-
public int compare(TrackDescriptor o1, TrackDescriptor o2)
893-
{
894-
return o1.getMergePriority().compareTo(o2.getMergePriority());
895-
}
896-
});
897-
898908
return ret;
899909
}
900910
catch (IOException e)
@@ -917,12 +927,13 @@ public void processFilesRemote(List<SequenceOutputFile> inputFiles, JobContext c
917927

918928
GeneToNameTranslator translator = new GeneToNameTranslator(gtf, ctx.getLogger());
919929
ReferenceGenome grch37Genome = ctx.getSequenceSupport().getCachedGenome(ctx.getParams().getInt(AnnotationStep.GRCH37));
920-
int genomeId = ctx.getSequenceSupport().getCachedObject(MMUL_GENOME, Integer.class);
930+
int genomeId = ctx.getSequenceSupport().getCachedObject(BASE_GENOME, Integer.class);
921931
ReferenceGenome genome = ctx.getSequenceSupport().getCachedGenome(genomeId);
922932
boolean testOnly = ctx.getParams().optBoolean("testOnly", false);
923933

934+
String species = ctx.getParams().getString("species");
924935
String releaseVersion = ctx.getParams().optString("releaseVersion", "0.0");
925-
File primaryTrackVcf = new File(ctx.getOutputDir(), "mGap.v" + FileUtil.makeLegalName(releaseVersion).replaceAll(" ", "_") + ".vcf.gz");
936+
File primaryTrackVcf = new File(ctx.getOutputDir(), "mGap." + species + ".v" + FileUtil.makeLegalName(releaseVersion).replaceAll(" ", "_") + ".vcf.gz");
926937

927938
try
928939
{
@@ -994,15 +1005,15 @@ public void processFilesRemote(List<SequenceOutputFile> inputFiles, JobContext c
9941005

9951006
SequenceOutputFile output = new SequenceOutputFile();
9961007
output.setFile(primaryTrackVcf);
997-
output.setName("mGAP Release: " + releaseVersion);
1008+
output.setName("mGAP Release: " + species + " " + releaseVersion);
9981009
output.setCategory((testOnly ? "Test " : "") + "mGAP Release");
9991010
output.setLibrary_id(genome.getGenomeId());
10001011
ctx.getFileManager().addSequenceOutput(output);
10011012

10021013
File interestingVariantTable = getVariantTableName(ctx, primaryTrackVcf);
10031014
SequenceOutputFile output2 = new SequenceOutputFile();
10041015
output2.setFile(interestingVariantTable);
1005-
output2.setName("mGAP Release: " + releaseVersion + " Variant Table");
1016+
output2.setName("mGAP Release: " + species + " " + releaseVersion + " Variant Table");
10061017
output2.setCategory((testOnly ? "Test " : "") + "mGAP Release Variant Table");
10071018
output2.setLibrary_id(genome.getGenomeId());
10081019
ctx.getFileManager().addSequenceOutput(output2);
@@ -1012,7 +1023,7 @@ public void processFilesRemote(List<SequenceOutputFile> inputFiles, JobContext c
10121023
File lifted = liftToHuman(ctx, primaryTrackVcf, sitesOnlyVcf, grch37Genome);
10131024
SequenceOutputFile output3 = new SequenceOutputFile();
10141025
output3.setFile(lifted);
1015-
output3.setName("mGAP Release: " + releaseVersion + " Lifted to Human");
1026+
output3.setName("mGAP Release: " + species + " " + releaseVersion + " Lifted to Human");
10161027
output3.setCategory((testOnly ? "Test " : "") + "mGAP Release Lifted to Human");
10171028
output3.setLibrary_id(grch37Genome.getGenomeId());
10181029
ctx.getFileManager().addSequenceOutput(output3);
@@ -1111,16 +1122,6 @@ private File getSitesOnlyVcfName(File outDir, File primaryTrackVcf)
11111122
return new File(outDir, SequenceAnalysisService.get().getUnzippedBaseName(primaryTrackVcf.getName()) + ".sitesOnly.vcf.gz");
11121123
}
11131124

1114-
private File getDroppedSitesVcfName(File outDir, File primaryTrackVcf)
1115-
{
1116-
return new File(outDir, SequenceAnalysisService.get().getUnzippedBaseName(primaryTrackVcf.getName()) + ".droppedFromPriorRelease.vcf.gz");
1117-
}
1118-
1119-
private File getNovelSitesVcfName(File outDir, File primaryTrackVcf)
1120-
{
1121-
return new File(outDir, SequenceAnalysisService.get().getUnzippedBaseName(primaryTrackVcf.getName()) + ".newToRelease.vcf.gz");
1122-
}
1123-
11241125
private File getLiftedVcfName(File outDir, File primaryTrackVcf)
11251126
{
11261127
return new File(outDir, SequenceAnalysisService.get().getUnzippedBaseName(primaryTrackVcf.getName()) + ".liftToGRCh37.vcf.gz");

0 commit comments

Comments
 (0)