Skip to content

Commit 181588d

Browse files
authored
Add columns to track lucene index in mGAP release (#143)
* Allow JBrowse to re-use existing lucene VCF indexes under certain conditions
1 parent 80889cf commit 181588d

File tree

10 files changed

+371
-69
lines changed

10 files changed

+371
-69
lines changed

mGAP/resources/etls/prime-seq.xml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,8 @@
108108
<column>sitesOnlyVcfId/name</column>
109109
<column>novelSitesVcfId/dataid/DataFileUrl</column>
110110
<column>novelSitesVcfId/name</column>
111+
<column>luceneIndex/dataid/DataFileUrl</column>
112+
<column>luceneIndex/name</column>
111113
<column>humanJbrowseId</column>
112114
<column>objectId</column>
113115
</sourceColumns>
@@ -119,6 +121,7 @@
119121
<column source="variantTable/dataid/DataFileUrl" target="variantTable" transformClass="org.labkey.mgap.columnTransforms.OutputFileTransform" />
120122
<column source="sitesOnlyVcfId/dataid/DataFileUrl" target="sitesOnlyVcfId" transformClass="org.labkey.mgap.columnTransforms.OutputFileTransform" />
121123
<column source="novelSitesVcfId/dataid/DataFileUrl" target="novelSitesVcfId" transformClass="org.labkey.mgap.columnTransforms.OutputFileTransform" />
124+
<column source="luceneIndex/dataid/DataFileUrl" target="luceneIndex" transformClass="org.labkey.mgap.columnTransforms.LuceneIndexTransform" />
122125
<column source="jbrowseId" transformClass="org.labkey.mgap.columnTransforms.JBrowseSessionTransform"/>
123126
<column source="liftedVcfId/dataid/DataFileUrl" target="liftedVcfId" transformClass="org.labkey.mgap.columnTransforms.LiftedVcfTransform" />
124127
<column source="humanJbrowseId" transformClass="org.labkey.mgap.columnTransforms.JBrowseHumanSessionTransform"/>
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
ALTER TABLE mGAP.variantCatalogReleases ADD luceneIndex int;
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
ALTER TABLE mGAP.variantCatalogReleases ADD luceneIndex int;

mGAP/resources/schemas/mgap.xml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,15 @@
188188
<fkColumnName>RowId</fkColumnName>
189189
</fk>
190190
</column>
191+
<column columnName="luceneIndex">
192+
<columnTitle>Lucene Index File</columnTitle>
193+
<nullable>true</nullable>
194+
<fk>
195+
<fkDbSchema>sequenceanalysis</fkDbSchema>
196+
<fkTable>outputfiles</fkTable>
197+
<fkColumnName>rowid</fkColumnName>
198+
</fk>
199+
</column>
191200
<column columnName="totalSubjects">
192201
<columnTitle>Total Subjects</columnTitle>
193202
</column>

mGAP/src/org/labkey/mgap/columnTransforms/AbstractVariantTransform.java

Lines changed: 99 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import org.labkey.mgap.mGAPManager;
2525

2626
import java.io.File;
27+
import java.io.IOException;
2728
import java.net.URI;
2829
import java.sql.SQLException;
2930
import java.util.Arrays;
@@ -118,71 +119,8 @@ protected Integer getOrCreateOutputFile(Object dataFileUrl, Object folderName, S
118119
}
119120
else
120121
{
121-
PipeRoot pr = PipelineService.get().getPipelineRootSetting(getContainerUser().getContainer());
122-
File baseDir = new File(pr.getRootPath(), mGAPManager.DATA_DIR_NAME);
123-
if (!baseDir.exists())
124-
{
125-
baseDir.mkdirs();
126-
}
127-
128-
String folderNameString = StringUtils.trimToNull(String.valueOf(folderName));
129-
if (folderNameString == null)
130-
{
131-
throw new PipelineJobException("Unable to find folderName");
132-
}
133-
134-
File subdir = new File(baseDir, folderNameString);
135-
if (!subdir.exists())
136-
{
137-
subdir.mkdirs();
138-
}
139-
140-
getStatusLogger().info("preparing to copy file: " + f.getPath());
141-
142-
//Copy file locally, plus index if exists:
143-
File localCopy = new File(subdir, name == null || f.getName().startsWith("mGap.v")? f.getName() : FileUtil.makeLegalName(name).replaceAll(" ", "_") + ".vcf.gz");
144-
boolean doCopy = true;
145-
if (localCopy.exists())
146-
{
147-
getStatusLogger().info("file exists: " + localCopy.getPath());
148-
if (localCopy.lastModified() >= f.lastModified())
149-
{
150-
doCopy = false;
151-
}
152-
else
153-
{
154-
getStatusLogger().info("source file has been modified, deleting copy and re-syncing");
155-
localCopy.delete();
156-
}
157-
}
158-
159-
if (doCopy)
160-
{
161-
getStatusLogger().info("copying file locally: " + localCopy.getPath());
162-
if (localCopy.exists())
163-
{
164-
localCopy.delete();
165-
}
166-
167-
FileUtils.copyFile(f, localCopy);
168-
}
169-
170-
File index = new File(f.getPath() + ".tbi");
171-
if (index.exists())
172-
{
173-
File indexLocal = new File(localCopy.getPath() + ".tbi");
174-
if (doCopy && indexLocal.exists())
175-
{
176-
getStatusLogger().info("deleting local copy of index since file was re-copied");
177-
indexLocal.delete();
178-
}
179-
180-
if (!indexLocal.exists())
181-
{
182-
getStatusLogger().info("copying index locally: " + indexLocal.getPath());
183-
FileUtils.copyFile(index, indexLocal);
184-
}
185-
}
122+
File subDir = getLocalSubdir(folderName);
123+
File localCopy = doFileCopy(f, subDir, name);
186124

187125
//first create the ExpData
188126
ExpData d = ExperimentService.get().getExpDataByURL(localCopy, getContainerUser().getContainer());
@@ -204,7 +142,7 @@ protected Integer getOrCreateOutputFile(Object dataFileUrl, Object folderName, S
204142
else
205143
{
206144
Map<String, Object> row = new CaseInsensitiveHashMap<>();
207-
row.put("category", "VCF File");
145+
row.put("category", getOutputFileCategory());
208146
row.put("dataid", d.getRowId());
209147
row.put("name", name == null ? "mGAP Variants, Version: " + getInputValue("version") : name);
210148
row.put("description", getDescription());
@@ -230,6 +168,101 @@ protected Integer getOrCreateOutputFile(Object dataFileUrl, Object folderName, S
230168
return null;
231169
}
232170

171+
protected File getLocalSubdir(Object folderName) throws PipelineJobException
172+
{
173+
PipeRoot pr = PipelineService.get().getPipelineRootSetting(getContainerUser().getContainer());
174+
File baseDir = new File(pr.getRootPath(), mGAPManager.DATA_DIR_NAME);
175+
if (!baseDir.exists())
176+
{
177+
baseDir.mkdirs();
178+
}
179+
180+
String folderNameString = StringUtils.trimToNull(String.valueOf(folderName));
181+
if (folderNameString == null)
182+
{
183+
throw new PipelineJobException("Unable to find folderName");
184+
}
185+
186+
File subdir = new File(baseDir, folderNameString);
187+
if (!subdir.exists())
188+
{
189+
subdir.mkdirs();
190+
}
191+
192+
return subdir;
193+
}
194+
195+
protected File doFileCopy(File f, File subdir, String name) throws PipelineJobException
196+
{
197+
getStatusLogger().info("preparing to copy file: " + f.getPath());
198+
199+
//Copy file locally, plus index if exists:
200+
File localCopy = new File(subdir, name == null || f.getName().startsWith("mGap.v") ? f.getName() : FileUtil.makeLegalName(name).replaceAll(" ", "_") + ".vcf.gz");
201+
boolean doCopy = true;
202+
if (localCopy.exists())
203+
{
204+
getStatusLogger().info("file exists: " + localCopy.getPath());
205+
if (localCopy.lastModified() >= f.lastModified())
206+
{
207+
doCopy = false;
208+
}
209+
else
210+
{
211+
getStatusLogger().info("source file has been modified, deleting copy and re-syncing");
212+
localCopy.delete();
213+
}
214+
}
215+
216+
if (doCopy)
217+
{
218+
getStatusLogger().info("copying file locally: " + localCopy.getPath());
219+
if (localCopy.exists())
220+
{
221+
localCopy.delete();
222+
}
223+
224+
try
225+
{
226+
FileUtils.copyFile(f, localCopy);
227+
}
228+
catch (IOException e)
229+
{
230+
throw new PipelineJobException(e);
231+
}
232+
}
233+
234+
File index = new File(f.getPath() + ".tbi");
235+
if (index.exists())
236+
{
237+
File indexLocal = new File(localCopy.getPath() + ".tbi");
238+
if (doCopy && indexLocal.exists())
239+
{
240+
getStatusLogger().info("deleting local copy of index since file was re-copied");
241+
indexLocal.delete();
242+
}
243+
244+
if (!indexLocal.exists())
245+
{
246+
getStatusLogger().info("copying index locally: " + indexLocal.getPath());
247+
try
248+
{
249+
FileUtils.copyFile(index, indexLocal);
250+
}
251+
catch (IOException e)
252+
{
253+
throw new PipelineJobException(e);
254+
}
255+
}
256+
}
257+
258+
return localCopy;
259+
}
260+
261+
protected String getOutputFileCategory()
262+
{
263+
return "VCF File";
264+
}
265+
233266
protected String getDescription()
234267
{
235268
return "mGAP Release";
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
package org.labkey.mgap.columnTransforms;
2+
3+
import org.apache.commons.io.FileUtils;
4+
import org.jetbrains.annotations.Nullable;
5+
import org.labkey.api.pipeline.PipelineJobException;
6+
import org.labkey.api.sequenceanalysis.run.SimpleScriptWrapper;
7+
8+
import java.io.File;
9+
import java.io.IOException;
10+
import java.util.Arrays;
11+
12+
public class LuceneIndexTransform extends OutputFileTransform
13+
{
14+
@Override
15+
protected Object doTransform(Object inputValue)
16+
{
17+
if (null == inputValue)
18+
return null;
19+
20+
return getOrCreateOutputFile(inputValue, getInputValue("objectId"), null);
21+
}
22+
23+
@Override
24+
protected File doFileCopy(File f, File subdir, @Nullable String name) throws PipelineJobException
25+
{
26+
// NOTE: lucene is a special case since the DB tracks one file, but we need this whole folder:
27+
File sourceDir = f.getParentFile();
28+
File targetDir = new File(subdir, "LuceneIndex");
29+
30+
// NOTE: rsync should no-op if there are no source changes
31+
getStatusLogger().info("Copying lucene index dir to: " + targetDir.getPath());
32+
new SimpleScriptWrapper(getStatusLogger()).execute(Arrays.asList(
33+
"rsync", "-r", "-a", "--delete", "--no-owner", "--no-group", "--chmod=D2770,F660", sourceDir.getPath(), targetDir.getPath()
34+
));
35+
36+
return new File(targetDir, f.getName());
37+
}
38+
39+
@Override
40+
protected String getDescription()
41+
{
42+
return "mGAP Release Lucene Index";
43+
}
44+
}
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
package org.labkey.mgap.jbrowse;
2+
3+
import org.apache.logging.log4j.Logger;
4+
import org.jetbrains.annotations.Nullable;
5+
import org.labkey.api.data.CompareType;
6+
import org.labkey.api.data.Container;
7+
import org.labkey.api.data.SimpleFilter;
8+
import org.labkey.api.data.TableSelector;
9+
import org.labkey.api.jbrowse.JBrowseService;
10+
import org.labkey.api.module.ModuleLoader;
11+
import org.labkey.api.pipeline.PipelineJobException;
12+
import org.labkey.api.query.FieldKey;
13+
import org.labkey.api.query.QueryService;
14+
import org.labkey.api.security.User;
15+
import org.labkey.api.sequenceanalysis.SequenceOutputFile;
16+
import org.labkey.api.util.PageFlowUtil;
17+
import org.labkey.mgap.mGAPModule;
18+
import org.labkey.mgap.mGAPSchema;
19+
20+
import java.util.List;
21+
22+
public class mGAPLuceneDetector implements JBrowseService.LuceneIndexDetector
23+
{
24+
@Override
25+
public SequenceOutputFile findMatchingLuceneIndex(SequenceOutputFile vcfFile, List<String> infoFieldsToIndex, User u, @Nullable Logger log) throws PipelineJobException
26+
{
27+
Container target = vcfFile.getContainerObj().isWorkbookOrTab() ? vcfFile.getContainerObj().getParent() : vcfFile.getContainerObj();
28+
SimpleFilter filter = new SimpleFilter(FieldKey.fromString("vcfId"), vcfFile.getRowid());
29+
filter.addCondition(FieldKey.fromString("luceneIndex"), null, CompareType.NONBLANK);
30+
31+
TableSelector ts = new TableSelector(QueryService.get().getUserSchema(u, target, mGAPSchema.NAME).getTable(mGAPSchema.TABLE_VARIANT_CATALOG_RELEASES), PageFlowUtil.set("luceneIndex"), filter, null);
32+
if (ts.exists())
33+
{
34+
return SequenceOutputFile.getForId(ts.getObject(Integer.class));
35+
}
36+
37+
return null;
38+
}
39+
40+
@Override
41+
public boolean isAvailable(Container c)
42+
{
43+
return c.getActiveModules().contains(ModuleLoader.getInstance().getModule(mGAPModule.class));
44+
}
45+
}

mGAP/src/org/labkey/mgap/mGAPModule.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,11 @@
4343
import org.labkey.mgap.buttons.ReleaseButton;
4444
import org.labkey.mgap.jbrowse.mGAPFieldCustomizer;
4545
import org.labkey.mgap.jbrowse.mGAPGroupsProvider;
46+
import org.labkey.mgap.jbrowse.mGAPLuceneDetector;
4647
import org.labkey.mgap.pipeline.AnnotationStep;
4748
import org.labkey.mgap.pipeline.GenerateMgapTracksStep;
4849
import org.labkey.mgap.pipeline.GroupCompareStep;
50+
import org.labkey.mgap.pipeline.IndexVariantsForMgapStep;
4951
import org.labkey.mgap.pipeline.RemoveAnnotationsForMgapStep;
5052
import org.labkey.mgap.pipeline.RemoveAnnotationsStep;
5153
import org.labkey.mgap.pipeline.RenameSamplesForMgapStep;
@@ -71,7 +73,7 @@ public String getName()
7173
@Override
7274
public Double getSchemaVersion()
7375
{
74-
return 16.69;
76+
return 16.70;
7577
}
7678

7779
@Override
@@ -96,6 +98,7 @@ public void doStartupAfterSpringConfig(ModuleContext moduleContext)
9698
JBrowseService.get().registerDemographicsSource(new mGAPDemographicsSource());
9799
JBrowseService.get().registerFieldCustomizer(new mGAPFieldCustomizer());
98100
JBrowseService.get().registerGroupsProvider(new mGAPGroupsProvider());
101+
JBrowseService.get().registerLuceneIndexDetector(new mGAPLuceneDetector());
99102

100103
SystemMaintenance.addTask(new mGapMaintenanceTask());
101104

@@ -132,6 +135,7 @@ public PipelineStartup()
132135
SequencePipelineService.get().registerPipelineStep(new SampleSpecificGenotypeFiltrationStep.Provider());
133136
SequencePipelineService.get().registerPipelineStep(new mGapReleaseAnnotateNovelSitesStep.Provider());
134137
SequencePipelineService.get().registerPipelineStep(new GenerateMgapTracksStep.Provider());
138+
SequencePipelineService.get().registerPipelineStep(new IndexVariantsForMgapStep.Provider());
135139

136140
_hasRegistered = true;
137141
}

0 commit comments

Comments
 (0)