Skip to content

Commit ab5d296

Browse files
authored
Allow JBrowse to re-use existing lucene VCF indexes under certain conditions (#245)
1 parent 0983212 commit ab5d296

File tree

6 files changed

+208
-12
lines changed

6 files changed

+208
-12
lines changed

SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/SequenceOutputFile.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717

1818
import com.fasterxml.jackson.annotation.JsonIgnore;
1919
import org.json.JSONObject;
20+
import org.labkey.api.data.Container;
21+
import org.labkey.api.data.ContainerManager;
2022
import org.labkey.api.data.DbSchema;
2123
import org.labkey.api.data.TableSelector;
2224
import org.labkey.api.exp.api.ExpData;
@@ -159,6 +161,11 @@ public String getContainer()
159161
return _container;
160162
}
161163

164+
public Container getContainerObj()
165+
{
166+
return ContainerManager.getForId(_container);
167+
}
168+
162169
public void setContainer(String container)
163170
{
164171
_container = container;

SequenceAnalysis/resources/web/SequenceAnalysis/field/SequenceOutputFileSelectorField.js

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,14 @@ Ext4.define('SequenceAnalysis.field.SequenceOutputFileSelectorField', {
44

55
genomeId: -1,
66
performGenomeFilter: true,
7+
valueField: 'dataid',
78

89
initComponent: function(){
910
Ext4.apply(this, {
1011
forceSelection: true,
1112
displayField: 'name',
12-
valueField: 'dataid',
1313
listConfig: {
14-
innerTpl: ['{name} ({[values["rowid"]]})']
14+
innerTpl: ['{name} ({[values["' + this.valueField + '"]]})']
1515
},
1616
store: {
1717
type: 'labkey-store',
@@ -21,7 +21,7 @@ Ext4.define('SequenceAnalysis.field.SequenceOutputFileSelectorField', {
2121
autoLoad: true,
2222
filterArray: this.getFilterArray(),
2323
sort: 'name',
24-
columns: 'library_id,name,dataid,category'
24+
columns: 'rowid,library_id,name,dataid,category'
2525
}
2626
});
2727

jbrowse/api-src/org/labkey/api/jbrowse/JBrowseService.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import org.labkey.api.pipeline.PipelineJobException;
88
import org.labkey.api.pipeline.PipelineValidationException;
99
import org.labkey.api.security.User;
10+
import org.labkey.api.sequenceanalysis.SequenceOutputFile;
1011

1112
import java.io.File;
1213
import java.io.IOException;
@@ -42,4 +43,15 @@ static public void setInstance(JBrowseService instance)
4243
/**
 * Registers a {@link JBrowseFieldCustomizer} with this service.
 * NOTE(review): the implementation is not visible here; presumably registered customizers are
 * consulted when JBrowse fields are customized. Confirm against the implementing class.
 *
 * @param customizer the customizer to register
 */
abstract public void registerFieldCustomizer(JBrowseFieldCustomizer customizer);
4344

4445
/**
 * Prepares a lucene index for a VCF file.
 * NOTE(review): the implementation is not visible here; the parameter notes below are inferred
 * from the parameter names and should be confirmed against the implementing class.
 *
 * @param vcf the VCF file to index
 * @param indexDir presumably the directory that will hold the index
 * @param log logger used for progress and errors
 * @param infoFieldsForFullTextSearch presumably the INFO fields to make searchable
 * @param allowLenientLuceneProcessing semantics not visible in this file; TODO confirm
 * @throws PipelineJobException declared by this method; conditions depend on the implementation
 */
abstract public void prepareLuceneIndex(File vcf, File indexDir, Logger log, List<String> infoFieldsForFullTextSearch, boolean allowLenientLuceneProcessing) throws PipelineJobException;
46+
47+
/**
 * Looks for a pre-existing lucene index output that can be re-used for the given VCF output,
 * instead of building a new index.
 *
 * @param vcfFile the VCF output file for which an index is wanted
 * @param infoFieldsToIndex the INFO fields the index is expected to cover
 * @param u the user on whose behalf the lookup is performed
 * @param log optional logger; may be null
 * @return the matching index output. NOTE(review): presumably null when none is found; confirm against the implementing class
 * @throws PipelineJobException declared by this method; conditions depend on the implementation
 */
abstract public SequenceOutputFile findMatchingLuceneIndex(SequenceOutputFile vcfFile, List<String> infoFieldsToIndex, User u, @Nullable Logger log) throws PipelineJobException;
48+
49+
/**
 * Registers a {@link LuceneIndexDetector} with this service.
 * NOTE(review): presumably registered detectors are the ones consulted by findMatchingLuceneIndex;
 * confirm against the implementing class.
 *
 * @param detector the detector to register
 */
abstract public void registerLuceneIndexDetector(LuceneIndexDetector detector);
50+
51+
public interface LuceneIndexDetector
52+
{
53+
SequenceOutputFile findMatchingLuceneIndex(SequenceOutputFile vcfFile, List<String> infoFieldsToIndex, User u, @Nullable Logger log) throws PipelineJobException;
54+
55+
boolean isAvailable(Container c);
56+
}
4557
}

jbrowse/build.gradle

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ repositories {
1111
dependencies {
1212
apiImplementation "com.github.samtools:htsjdk:${htsjdkVersion}"
1313
BuildUtils.addLabKeyDependency(project: project, config: "implementation", depProjectPath: ":server:modules:LabDevKitModules:LDK", depProjectConfig: "apiJarFile")
14+
BuildUtils.addLabKeyDependency(project: project, config: "apiImplementation", depProjectPath: ":server:modules:DiscvrLabKeyModules:SequenceAnalysis", depProjectConfig: "apiJarFile")
1415
BuildUtils.addLabKeyDependency(project: project, config: "implementation", depProjectPath: ":server:modules:DiscvrLabKeyModules:SequenceAnalysis", depProjectConfig: "apiJarFile")
1516
BuildUtils.addLabKeyDependency(project: project, config: "implementation", depProjectPath: ":server:modules:DiscvrLabKeyModules:SequenceAnalysis", depProjectConfig: "runtimeElements")
1617
BuildUtils.addLabKeyDependency(project: project, config: "implementation", depProjectPath: ":server:modules:LabDevKitModules:laboratory", depProjectConfig: "apiJarFile")

jbrowse/src/org/labkey/jbrowse/JBrowseServiceImpl.java

Lines changed: 160 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,43 @@
11
package org.labkey.jbrowse;
22

3+
import org.apache.commons.lang3.StringUtils;
34
import org.apache.logging.log4j.Logger;
5+
import org.jetbrains.annotations.Nullable;
46
import org.json.JSONObject;
57
import org.labkey.api.collections.CaseInsensitiveHashMap;
68
import org.labkey.api.data.Container;
9+
import org.labkey.api.data.SimpleFilter;
10+
import org.labkey.api.data.TableInfo;
11+
import org.labkey.api.data.TableSelector;
12+
import org.labkey.api.exp.api.ExpData;
13+
import org.labkey.api.exp.api.ExpRun;
14+
import org.labkey.api.exp.api.ExperimentService;
715
import org.labkey.api.jbrowse.DemographicsSource;
816
import org.labkey.api.jbrowse.GroupsProvider;
917
import org.labkey.api.jbrowse.JBrowseFieldCustomizer;
1018
import org.labkey.api.jbrowse.JBrowseFieldDescriptor;
1119
import org.labkey.api.jbrowse.JBrowseService;
20+
import org.labkey.api.module.ModuleLoader;
1221
import org.labkey.api.pipeline.PipeRoot;
1322
import org.labkey.api.pipeline.PipelineJobException;
1423
import org.labkey.api.pipeline.PipelineService;
1524
import org.labkey.api.pipeline.PipelineValidationException;
25+
import org.labkey.api.query.FieldKey;
26+
import org.labkey.api.query.QueryService;
27+
import org.labkey.api.reader.Readers;
1628
import org.labkey.api.security.User;
29+
import org.labkey.api.sequenceanalysis.SequenceOutputFile;
30+
import org.labkey.api.util.PageFlowUtil;
1731
import org.labkey.api.util.logging.LogHelper;
1832
import org.labkey.jbrowse.model.JBrowseSession;
1933
import org.labkey.jbrowse.model.JsonFile;
34+
import org.labkey.jbrowse.pipeline.IndexVariantsStep;
2035
import org.labkey.jbrowse.pipeline.JBrowseLucenePipelineJob;
2136
import org.labkey.jbrowse.pipeline.JBrowseSessionPipelineJob;
2237

38+
import java.io.BufferedReader;
2339
import java.io.File;
40+
import java.io.IOException;
2441
import java.util.ArrayList;
2542
import java.util.Collection;
2643
import java.util.Collections;
@@ -31,6 +48,7 @@
3148
import java.util.Map;
3249
import java.util.Set;
3350
import java.util.TreeSet;
51+
import java.util.concurrent.atomic.AtomicReference;
3452

3553
/**
3654
* Created by bimber on 11/3/2016.
@@ -44,9 +62,11 @@ public class JBrowseServiceImpl extends JBrowseService
4462
private final List<GroupsProvider> _providers = new ArrayList<>();
4563
private final List<JBrowseFieldCustomizer> _customizers = new ArrayList<>();
4664

65+
private final List<LuceneIndexDetector> _detectors = new ArrayList<>();
66+
4767
/**
 * Private constructor. Registers the built-in {@link DefaultLuceneIndexDetector} so that at
 * least one detector is always present.
 */
private JBrowseServiceImpl()
{
    // Registered first; findMatchingLuceneIndex() walks detectors in reverse registration
    // order, so detectors registered later are consulted before this default.
    registerLuceneIndexDetector(new DefaultLuceneIndexDetector());
}
5171

5272
public static JBrowseServiceImpl get()
@@ -148,9 +168,8 @@ public Map<String, Map<String, Object>> resolveSubjects(List<String> subjects, U
148168

149169
public void customizeField(User u, Container c, JBrowseFieldDescriptor field) {
150170
// NOTE: providers will be registered on module startup, which will be in dependency order.
151-
// Process them here in reverse dependency order, so we prioritize end modules
171+
// Process them here in this order, so end modules can override earlier ones:
152172
List<JBrowseFieldCustomizer> customizers = new ArrayList<>(_customizers);
153-
Collections.reverse(customizers);
154173
for (JBrowseFieldCustomizer fc : customizers) {
155174
if (fc.isAvailable(c, u)) {
156175
fc.customizeField(field, c, u);
@@ -265,4 +284,142 @@ public Map<String, String> getDemographicsFields(User u, Container c)
265284

266285
return ret;
267286
}
287+
288+
@Override
289+
public SequenceOutputFile findMatchingLuceneIndex(SequenceOutputFile vcfFile, List<String> infoFieldsToIndex, User u, @Nullable Logger log) throws PipelineJobException
290+
{
291+
// NOTE: These are registered in module dependency order, so process in reverse:
292+
List<LuceneIndexDetector> detectors = new ArrayList<>(_detectors);
293+
Collections.reverse(detectors);
294+
for (LuceneIndexDetector li : detectors)
295+
{
296+
if (li.isAvailable(vcfFile.getContainerObj()))
297+
{
298+
SequenceOutputFile so = li.findMatchingLuceneIndex(vcfFile, infoFieldsToIndex, u, log);
299+
if (so != null)
300+
{
301+
return so;
302+
}
303+
}
304+
}
305+
306+
return null;
307+
}
308+
309+
@Override
310+
public void registerLuceneIndexDetector(LuceneIndexDetector detector)
311+
{
312+
_detectors.add(detector);
313+
}
314+
315+
public static final class DefaultLuceneIndexDetector implements LuceneIndexDetector
316+
{
317+
@Override
318+
public SequenceOutputFile findMatchingLuceneIndex(SequenceOutputFile vcfFile, List<String> infoFieldsToIndex, User u, @Nullable Logger log) throws PipelineJobException
319+
{
320+
if (vcfFile.getContainerObj() == null)
321+
{
322+
return null;
323+
}
324+
325+
// This forces the index and VCF outputs to live in the same workbook:
326+
TableInfo ti = QueryService.get().getUserSchema(u, vcfFile.getContainerObj(), JBrowseSchema.SEQUENCE_ANALYSIS).getTable("outputfiles");
327+
SimpleFilter filter = new SimpleFilter(FieldKey.fromString("category"), IndexVariantsStep.CATEGORY);
328+
AtomicReference<SequenceOutputFile> idxDir = new AtomicReference<>();
329+
new TableSelector(ti, PageFlowUtil.set("rowid"), filter, null).forEachResults(rs -> {
330+
SequenceOutputFile so = SequenceOutputFile.getForId(rs.getInt(FieldKey.fromString("rowid")));
331+
if (so.getFile() == null || !so.getFile().exists())
332+
{
333+
log.error("Sequence output lacks a file: " + so.getRowid());
334+
return;
335+
}
336+
337+
if (so.getRunId() == null)
338+
{
339+
return;
340+
}
341+
342+
ExpRun run = ExperimentService.get().getExpRun(so.getRunId());
343+
if (run == null)
344+
{
345+
return;
346+
}
347+
348+
Map<ExpData, String> inputMap = run.getDataInputs();
349+
if (inputMap == null)
350+
{
351+
return;
352+
}
353+
354+
for (ExpData d : inputMap.keySet())
355+
{
356+
if (!"Input VCF".equals(inputMap.get(d)))
357+
{
358+
continue;
359+
}
360+
361+
if (d.getFile() == null || !d.getFile().exists())
362+
{
363+
continue;
364+
}
365+
366+
if (vcfFile.getFile().getAbsoluteFile().equals(d.getFile().getAbsoluteFile()))
367+
{
368+
File fieldsFile = new File(d.getFile().getParentFile(), "fieldList.txt");
369+
if (!fieldsFile.exists())
370+
{
371+
continue;
372+
}
373+
374+
List<String> fields = new ArrayList<>();
375+
try (BufferedReader reader = Readers.getReader(fieldsFile))
376+
{
377+
String line;
378+
while ((line = reader.readLine()) != null)
379+
{
380+
line = StringUtils.trimToNull(line);
381+
if (line != null)
382+
{
383+
fields.add(line);
384+
}
385+
}
386+
}
387+
catch (IOException e)
388+
{
389+
if (log != null)
390+
{
391+
log.error("Unable to read fieldList.txt for: " + d.getFile().getPath(), e);
392+
continue;
393+
}
394+
}
395+
396+
if (!infoFieldsToIndex.equals(fields))
397+
{
398+
if (log != null)
399+
{
400+
log.info("Partial index match found, but fields to index do not match: " + d.getFile().getPath());
401+
}
402+
continue;
403+
}
404+
405+
if (log != null)
406+
{
407+
log.debug("Identified pre-existing lucene index: " + so.getFile().getPath());
408+
}
409+
410+
idxDir.set(so);
411+
break;
412+
}
413+
}
414+
});
415+
416+
return idxDir.get();
417+
}
418+
419+
@Override
420+
public boolean isAvailable(Container c)
421+
{
422+
return c.getActiveModules().contains(ModuleLoader.getInstance().getModule(JBrowseModule.class));
423+
}
424+
}
268425
}

jbrowse/src/org/labkey/jbrowse/model/JsonFile.java

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import org.labkey.api.exp.api.ExpData;
2828
import org.labkey.api.exp.api.ExperimentService;
2929
import org.labkey.api.files.FileContentService;
30+
import org.labkey.api.jbrowse.JBrowseService;
3031
import org.labkey.api.pipeline.PipeRoot;
3132
import org.labkey.api.pipeline.PipelineJobException;
3233
import org.labkey.api.pipeline.PipelineService;
@@ -58,6 +59,7 @@
5859
import javax.annotation.Nullable;
5960
import java.io.File;
6061
import java.io.IOException;
62+
import java.nio.file.Files;
6163
import java.sql.SQLException;
6264
import java.util.Arrays;
6365
import java.util.Collections;
@@ -940,12 +942,17 @@ public File prepareResource(User u, Logger log, boolean throwIfNotPrepared, bool
940942

941943
if (shouldHaveFreeTextSearch())
942944
{
943-
// TODO:
944-
// Try to find a matching existing index:
945-
// Container targetContainer = getContainerObj().isWorkbookOrTab() ? getContainerObj().getParent() : getContainerObj();
946-
// TableInfo ti = QueryService.get().getUserSchema(u, targetContainer, JBrowseSchema.SEQUENCE_ANALYSIS).getTable("outputfiles");
947-
// SimpleFilter filter = new SimpleFilter(FieldKey.fromString("category"), IndexVariantsStep.CATEGORY);
948-
// filter.addCondition(FieldKey.fromString("library_id"), rg)
945+
// Try to find a matching existing index. Note: restrict to the same workbook as parent file, if present:
946+
File existingLuceneDir = null;
947+
if (getOutputFile() != null)
948+
{
949+
SequenceOutputFile so = SequenceOutputFile.getForId(getOutputFile());
950+
SequenceOutputFile existingLuceneOutput = JBrowseService.get().findMatchingLuceneIndex(so, getInfoFieldsToIndex(), u, log);
951+
if (existingLuceneOutput != null)
952+
{
953+
existingLuceneDir = existingLuceneOutput.getFile().getParentFile();
954+
}
955+
}
949956

950957
File luceneDir = getExpectedLocationOfLuceneIndex(throwIfNotPrepared);
951958
long sizeInGb = targetFile.length() / (1024 * 1024 * 1024);
@@ -955,6 +962,18 @@ public File prepareResource(User u, Logger log, boolean throwIfNotPrepared, bool
955962
{
956963
log.debug("Existing lucene index found, will not re-create: " + luceneDir.getPath());
957964
}
965+
else if (existingLuceneDir != null && existingLuceneDir.exists())
{
    // Re-use the pre-existing index by linking the expected index location to it.
    // createSymbolicLink(link, target): the link must be created at luceneDir. Using the
    // existing directory as both link and target always fails, because that path already exists.
    log.debug("Creating symlink to existing index: " + existingLuceneDir.getPath());
    try
    {
        // The link's parent directory must exist before the link can be created:
        File linkParent = luceneDir.getParentFile();
        if (linkParent != null && !linkParent.exists())
        {
            Files.createDirectories(linkParent.toPath());
        }

        Files.createSymbolicLink(luceneDir.toPath(), existingLuceneDir.toPath());
    }
    catch (IOException e)
    {
        throw new PipelineJobException(e);
    }
}
958977
else if (sizeInGb > 50)
959978
{
960979
log.info("VCF is too large, submitting VcfToLuceneIndexer as a separate pipeline job");

0 commit comments

Comments
 (0)