Skip to content

Commit d83f6b9

Browse files
committed
Allow jbrowse/lucene indexing to run as a separate pipeline job
1 parent 3b2a6c8 commit d83f6b9

File tree

5 files changed

+333
-72
lines changed

5 files changed

+333
-72
lines changed

jbrowse/src/org/labkey/jbrowse/model/JsonFile.java

Lines changed: 45 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
import org.labkey.api.pipeline.PipeRoot;
3131
import org.labkey.api.pipeline.PipelineJobException;
3232
import org.labkey.api.pipeline.PipelineService;
33+
import org.labkey.api.pipeline.PipelineValidationException;
3334
import org.labkey.api.query.FieldKey;
3435
import org.labkey.api.query.QueryService;
3536
import org.labkey.api.query.UserSchema;
@@ -39,17 +40,18 @@
3940
import org.labkey.api.sequenceanalysis.SequenceOutputFile;
4041
import org.labkey.api.sequenceanalysis.pipeline.ReferenceGenome;
4142
import org.labkey.api.sequenceanalysis.pipeline.SequencePipelineService;
42-
import org.labkey.api.sequenceanalysis.run.DISCVRSeqRunner;
4343
import org.labkey.api.sequenceanalysis.run.SimpleScriptWrapper;
4444
import org.labkey.api.settings.AppProps;
4545
import org.labkey.api.util.FileType;
4646
import org.labkey.api.util.FileUtil;
4747
import org.labkey.api.util.GUID;
48+
import org.labkey.api.util.JobRunner;
4849
import org.labkey.api.util.PageFlowUtil;
4950
import org.labkey.api.util.Path;
5051
import org.labkey.api.view.UnauthorizedException;
5152
import org.labkey.jbrowse.JBrowseManager;
5253
import org.labkey.jbrowse.JBrowseSchema;
54+
import org.labkey.jbrowse.pipeline.JBrowseLucenePipelineJob;
5355
import org.labkey.sequenceanalysis.run.util.TabixRunner;
5456

5557
import javax.annotation.Nullable;
@@ -938,25 +940,52 @@ public File prepareResource(Logger log, boolean throwIfNotPrepared, boolean forc
938940
if (shouldHaveFreeTextSearch())
939941
{
940942
File luceneDir = getExpectedLocationOfLuceneIndex(throwIfNotPrepared);
941-
if (forceReprocess && luceneDir.exists())
943+
long sizeInGb = targetFile.length() / (1024 * 1024 * 1024);
944+
log.debug("preparing lucene index, VCF size: " + sizeInGb);
945+
946+
if (!forceReprocess && doesLuceneIndexExist())
942947
{
943-
try
944-
{
945-
FileUtils.deleteDirectory(luceneDir);
946-
}
947-
catch (IOException e)
948-
{
949-
throw new PipelineJobException(e);
950-
}
948+
log.debug("Existing lucene index found, will not re-create: " + luceneDir.getPath());
951949
}
952-
953-
if (forceReprocess || !doesLuceneIndexExist())
950+
else if (sizeInGb > 50)
954951
{
955-
prepareLuceneIndex(log);
952+
log.info("VCF is too large, submitting VcfToLuceneIndexer as a separate pipeline job");
953+
final File vcf = targetFile;
954+
JobRunner.getDefault().execute(() -> {
955+
try
956+
{
957+
PipeRoot root = PipelineService.get().getPipelineRootSetting(getContainerObj());
958+
PipelineService.get().queueJob(new JBrowseLucenePipelineJob(getContainerObj(), null, root, vcf, luceneDir, getInfoFieldsToIndex(), allowLenientLuceneProcessing()));
959+
}
960+
catch (PipelineValidationException e)
961+
{
962+
log.error(e);
963+
}
964+
});
956965
}
957966
else
958967
{
959-
log.debug("Existing lucene index found, will not re-create: " + luceneDir.getPath());
968+
if (forceReprocess && luceneDir.exists())
969+
{
970+
try
971+
{
972+
log.debug("Deleting existing index: " + luceneDir.getPath());
973+
FileUtils.deleteDirectory(luceneDir);
974+
}
975+
catch (IOException e)
976+
{
977+
throw new PipelineJobException(e);
978+
}
979+
}
980+
981+
if (forceReprocess || !doesLuceneIndexExist())
982+
{
983+
JBrowseLucenePipelineJob.prepareLuceneIndex(targetFile, luceneDir, log, getInfoFieldsToIndex(), allowLenientLuceneProcessing());
984+
}
985+
else
986+
{
987+
log.debug("Existing lucene index found, will not re-create: " + luceneDir.getPath());
988+
}
960989
}
961990
}
962991

@@ -988,60 +1017,10 @@ private boolean doesLuceneIndexExist()
9881017
return Arrays.asList(rawFields.split(","));
9891018
}
9901019

991-
private void prepareLuceneIndex(Logger log) throws PipelineJobException
1020+
private boolean allowLenientLuceneProcessing()
9921021
{
993-
log.debug("Generating VCF full text index for file: " + getExpData().getFile().getName());
994-
995-
DISCVRSeqRunner runner = new DISCVRSeqRunner(log);
996-
if (!runner.jarExists())
997-
{
998-
log.error("Unable to find DISCVRSeq.jar, skipping lucene index creation");
999-
return;
1000-
}
1001-
1002-
File indexDir = getExpectedLocationOfLuceneIndex(false);
1003-
if (indexDir != null && indexDir.exists())
1004-
{
1005-
try
1006-
{
1007-
FileUtils.deleteDirectory(getExpectedLocationOfLuceneIndex(false));
1008-
}
1009-
catch (IOException e)
1010-
{
1011-
throw new PipelineJobException(e);
1012-
}
1013-
}
1014-
1015-
List<String> args = runner.getBaseArgs("VcfToLuceneIndexer");
1016-
args.add("-V");
1017-
args.add(getExpData().getFile().getPath());
1018-
1019-
args.add("-O");
1020-
args.add(indexDir.getPath());
1021-
1022-
args.add("--validation-stringency");
1023-
args.add("LENIENT");
1024-
1025-
List<String> infoFieldsForFullTextSearch = getInfoFieldsToIndex();
1026-
for (String field : infoFieldsForFullTextSearch)
1027-
{
1028-
args.add("-IF");
1029-
args.add(field);
1030-
}
1031-
1032-
args.add("--allow-missing-fields");
1033-
1034-
args.add("--index-stats");
1035-
args.add(getExpectedLocationOfLuceneIndexStats(false).getPath());
1036-
10371022
JSONObject config = getExtraTrackConfig();
1038-
if (config != null && !config.isNull("lenientLuceneProcessing") && config.getBoolean("lenientLuceneProcessing"))
1039-
{
1040-
args.add("--validation-stringency");
1041-
args.add("LENIENT");
1042-
}
1043-
1044-
runner.execute(args);
1023+
return config != null && !config.isNull("lenientLuceneProcessing") && config.getBoolean("lenientLuceneProcessing");
10451024
}
10461025

10471026
protected void createIndex(File finalLocation, Logger log, File idx, boolean throwIfNotPrepared) throws PipelineJobException
@@ -1385,11 +1364,6 @@ public boolean shouldHaveFreeTextSearch()
13851364
return json != null && json.optBoolean("createFullTextIndex", false);
13861365
}
13871366

1388-
public File getExpectedLocationOfLuceneIndexStats(boolean throwIfNotFound)
1389-
{
1390-
return new File(getExpectedLocationOfLuceneIndex(throwIfNotFound).getPath() + ".stats.txt");
1391-
}
1392-
13931367
public File getExpectedLocationOfLuceneIndex(boolean throwIfNotFound)
13941368
{
13951369
File basedir = getLocationOfProcessedTrack(false);
Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
package org.labkey.jbrowse.pipeline;
2+
3+
import org.apache.commons.io.FileUtils;
4+
import org.apache.logging.log4j.Logger;
5+
import org.labkey.api.assay.AssayFileWriter;
6+
import org.labkey.api.data.Container;
7+
import org.labkey.api.files.FileUrls;
8+
import org.labkey.api.module.Module;
9+
import org.labkey.api.pipeline.PipeRoot;
10+
import org.labkey.api.pipeline.PipelineDirectory;
11+
import org.labkey.api.pipeline.PipelineJob;
12+
import org.labkey.api.pipeline.PipelineJobException;
13+
import org.labkey.api.pipeline.PipelineJobService;
14+
import org.labkey.api.pipeline.PipelineProvider;
15+
import org.labkey.api.pipeline.TaskId;
16+
import org.labkey.api.pipeline.TaskPipeline;
17+
import org.labkey.api.security.User;
18+
import org.labkey.api.sequenceanalysis.pipeline.SequencePipelineService;
19+
import org.labkey.api.sequenceanalysis.run.DISCVRSeqRunner;
20+
import org.labkey.api.util.GUID;
21+
import org.labkey.api.util.PageFlowUtil;
22+
import org.labkey.api.view.ActionURL;
23+
import org.labkey.api.view.ViewBackgroundInfo;
24+
import org.labkey.api.view.ViewContext;
25+
import org.labkey.jbrowse.JBrowseManager;
26+
27+
import java.io.File;
28+
import java.io.IOException;
29+
import java.util.List;
30+
31+
/**
32+
* User: bimber
33+
* Date: 7/21/2014
34+
* Time: 10:33 AM
35+
*/
36+
public class JBrowseLucenePipelineJob extends PipelineJob
37+
{
38+
private List<String> _infoFields;
39+
private File _vcf;
40+
private File _targetDir;
41+
private boolean _allowLenientLuceneProcessing = false;
42+
43+
// Default constructor for serialization
44+
protected JBrowseLucenePipelineJob()
45+
{
46+
}
47+
48+
public JBrowseLucenePipelineJob(Container c, User user, PipeRoot pipeRoot, File vcf, File targetDir, List<String> infoFields, boolean allowLenientLuceneProcessing)
49+
{
50+
super(JBrowseLucenePipelineProvider.NAME, new ViewBackgroundInfo(c, user, null), pipeRoot);
51+
_vcf = vcf;
52+
_targetDir = targetDir;
53+
_infoFields = infoFields;
54+
_allowLenientLuceneProcessing = allowLenientLuceneProcessing;
55+
56+
setLogFile(AssayFileWriter.findUniqueFileName("jbrowse-lucene" + new GUID() + ".log", JBrowseManager.get().getBaseDir(c, true).toPath()));
57+
}
58+
59+
public static class JBrowseLucenePipelineProvider extends PipelineProvider
60+
{
61+
public static final String NAME = "JBrowseSessionPipeline";
62+
63+
public JBrowseLucenePipelineProvider(Module owningModule)
64+
{
65+
super(NAME, owningModule);
66+
}
67+
68+
@Override
69+
public void updateFileProperties(ViewContext context, PipeRoot pr, PipelineDirectory directory, boolean includeAll)
70+
{
71+
72+
}
73+
}
74+
75+
76+
@Override
77+
public String getDescription()
78+
{
79+
return "Preparing VCF index";
80+
}
81+
82+
83+
@Override
84+
public ActionURL getStatusHref()
85+
{
86+
return PageFlowUtil.urlProvider(FileUrls.class).urlBegin(getContainer());
87+
}
88+
89+
@Override
90+
public TaskPipeline getTaskPipeline()
91+
{
92+
return PipelineJobService.get().getTaskPipeline(new TaskId(JBrowseLucenePipelineJob.class));
93+
}
94+
95+
public List<String> getInfoFields()
96+
{
97+
return _infoFields;
98+
}
99+
100+
public void setInfoFields(List<String> infoFields)
101+
{
102+
_infoFields = infoFields;
103+
}
104+
105+
public File getVcf()
106+
{
107+
return _vcf;
108+
}
109+
110+
public void setVcf(File vcf)
111+
{
112+
_vcf = vcf;
113+
}
114+
115+
public File getTargetDir()
116+
{
117+
return _targetDir;
118+
}
119+
120+
public void setTargetDir(File targetDir)
121+
{
122+
_targetDir = targetDir;
123+
}
124+
125+
public boolean isAllowLenientLuceneProcessing()
126+
{
127+
return _allowLenientLuceneProcessing;
128+
}
129+
130+
public void setAllowLenientLuceneProcessing(boolean allowLenientLuceneProcessing)
131+
{
132+
_allowLenientLuceneProcessing = allowLenientLuceneProcessing;
133+
}
134+
135+
public static void prepareLuceneIndex(File vcf, File indexDir, Logger log, List<String> infoFieldsForFullTextSearch, boolean allowLenientLuceneProcessing) throws PipelineJobException
136+
{
137+
log.debug("Generating VCF full text index for file: " + vcf.getName());
138+
139+
DISCVRSeqRunner runner = new DISCVRSeqRunner(log);
140+
if (!runner.jarExists())
141+
{
142+
log.error("Unable to find DISCVRSeq.jar, skipping lucene index creation");
143+
return;
144+
}
145+
146+
if (indexDir.exists())
147+
{
148+
try
149+
{
150+
FileUtils.deleteDirectory(indexDir);
151+
}
152+
catch (IOException e)
153+
{
154+
throw new PipelineJobException(e);
155+
}
156+
}
157+
158+
List<String> args = runner.getBaseArgs("VcfToLuceneIndexer");
159+
args.add("-V");
160+
args.add(vcf.getPath());
161+
162+
args.add("-O");
163+
args.add(indexDir.getPath());
164+
165+
args.add("--validation-stringency");
166+
args.add("LENIENT");
167+
168+
for (String field : infoFieldsForFullTextSearch)
169+
{
170+
args.add("-IF");
171+
args.add(field);
172+
}
173+
174+
args.add("--allow-missing-fields");
175+
176+
args.add("--index-stats");
177+
args.add(getExpectedLocationOfLuceneIndexStats(indexDir).getPath());
178+
179+
if (allowLenientLuceneProcessing)
180+
{
181+
args.add("--validation-stringency");
182+
args.add("LENIENT");
183+
}
184+
185+
Integer threads = SequencePipelineService.get().getMaxThreads(log);
186+
if (threads != null)
187+
{
188+
args.add("--threads");
189+
args.add(threads.toString());
190+
}
191+
192+
runner.execute(args);
193+
}
194+
195+
public static File getExpectedLocationOfLuceneIndexStats(File indexDir)
196+
{
197+
return new File(indexDir.getPath() + ".stats.txt");
198+
}
199+
}

0 commit comments

Comments
 (0)