|
1 | 1 | package org.labkey.jbrowse.model; |
2 | 2 |
|
| 3 | +import htsjdk.samtools.util.FileExtensions; |
3 | 4 | import htsjdk.tribble.bed.BEDCodec; |
4 | 5 | import htsjdk.tribble.gff.Gff3Codec; |
5 | 6 | import htsjdk.tribble.index.Index; |
|
29 | 30 | import org.labkey.api.pipeline.PipeRoot; |
30 | 31 | import org.labkey.api.pipeline.PipelineJobException; |
31 | 32 | import org.labkey.api.pipeline.PipelineService; |
| 33 | +import org.labkey.api.pipeline.PipelineValidationException; |
32 | 34 | import org.labkey.api.query.FieldKey; |
33 | 35 | import org.labkey.api.query.QueryService; |
34 | 36 | import org.labkey.api.query.UserSchema; |
|
38 | 40 | import org.labkey.api.sequenceanalysis.SequenceOutputFile; |
39 | 41 | import org.labkey.api.sequenceanalysis.pipeline.ReferenceGenome; |
40 | 42 | import org.labkey.api.sequenceanalysis.pipeline.SequencePipelineService; |
41 | | -import org.labkey.api.sequenceanalysis.run.DISCVRSeqRunner; |
42 | 43 | import org.labkey.api.sequenceanalysis.run.SimpleScriptWrapper; |
43 | 44 | import org.labkey.api.settings.AppProps; |
44 | 45 | import org.labkey.api.util.FileType; |
45 | 46 | import org.labkey.api.util.FileUtil; |
46 | 47 | import org.labkey.api.util.GUID; |
| 48 | +import org.labkey.api.util.JobRunner; |
47 | 49 | import org.labkey.api.util.PageFlowUtil; |
48 | 50 | import org.labkey.api.util.Path; |
49 | 51 | import org.labkey.api.view.UnauthorizedException; |
50 | 52 | import org.labkey.jbrowse.JBrowseManager; |
51 | 53 | import org.labkey.jbrowse.JBrowseSchema; |
| 54 | +import org.labkey.jbrowse.pipeline.JBrowseLucenePipelineJob; |
52 | 55 | import org.labkey.sequenceanalysis.run.util.TabixRunner; |
53 | 56 |
|
54 | 57 | import javax.annotation.Nullable; |
@@ -937,25 +940,52 @@ public File prepareResource(Logger log, boolean throwIfNotPrepared, boolean forc |
937 | 940 | if (shouldHaveFreeTextSearch()) |
938 | 941 | { |
939 | 942 | File luceneDir = getExpectedLocationOfLuceneIndex(throwIfNotPrepared); |
940 | | - if (forceReprocess && luceneDir.exists()) |
| 943 | + long sizeInGb = targetFile.length() / (1024 * 1024 * 1024); |
| 944 | + log.debug("preparing lucene index, VCF size: " + sizeInGb); |
| 945 | + |
| 946 | + if (!forceReprocess && doesLuceneIndexExist()) |
941 | 947 | { |
942 | | - try |
943 | | - { |
944 | | - FileUtils.deleteDirectory(luceneDir); |
945 | | - } |
946 | | - catch (IOException e) |
947 | | - { |
948 | | - throw new PipelineJobException(e); |
949 | | - } |
| 948 | + log.debug("Existing lucene index found, will not re-create: " + luceneDir.getPath()); |
950 | 949 | } |
951 | | - |
952 | | - if (forceReprocess || !doesLuceneIndexExist()) |
| 950 | + else if (sizeInGb > 50) |
953 | 951 | { |
954 | | - prepareLuceneIndex(log); |
| 952 | + log.info("VCF is too large, submitting VcfToLuceneIndexer as a separate pipeline job"); |
| 953 | + final File vcf = targetFile; |
| 954 | + JobRunner.getDefault().execute(() -> { |
| 955 | + try |
| 956 | + { |
| 957 | + PipeRoot root = PipelineService.get().getPipelineRootSetting(getContainerObj()); |
| 958 | + PipelineService.get().queueJob(new JBrowseLucenePipelineJob(getContainerObj(), null, root, vcf, luceneDir, getInfoFieldsToIndex(), allowLenientLuceneProcessing())); |
| 959 | + } |
| 960 | + catch (PipelineValidationException e) |
| 961 | + { |
| 962 | + log.error(e); |
| 963 | + } |
| 964 | + }); |
955 | 965 | } |
956 | 966 | else |
957 | 967 | { |
958 | | - log.debug("Existing lucene index found, will not re-create: " + luceneDir.getPath()); |
| 968 | + if (forceReprocess && luceneDir.exists()) |
| 969 | + { |
| 970 | + try |
| 971 | + { |
| 972 | + log.debug("Deleting existing index: " + luceneDir.getPath()); |
| 973 | + FileUtils.deleteDirectory(luceneDir); |
| 974 | + } |
| 975 | + catch (IOException e) |
| 976 | + { |
| 977 | + throw new PipelineJobException(e); |
| 978 | + } |
| 979 | + } |
| 980 | + |
| 981 | + if (forceReprocess || !doesLuceneIndexExist()) |
| 982 | + { |
| 983 | + JBrowseLucenePipelineJob.prepareLuceneIndex(targetFile, luceneDir, log, getInfoFieldsToIndex(), allowLenientLuceneProcessing()); |
| 984 | + } |
| 985 | + else |
| 986 | + { |
| 987 | + log.debug("Existing lucene index found, will not re-create: " + luceneDir.getPath()); |
| 988 | + } |
959 | 989 | } |
960 | 990 | } |
961 | 991 |
|
@@ -987,57 +1017,10 @@ private boolean doesLuceneIndexExist() |
987 | 1017 | return Arrays.asList(rawFields.split(",")); |
988 | 1018 | } |
989 | 1019 |
|
990 | | - private void prepareLuceneIndex(Logger log) throws PipelineJobException |
| 1020 | + private boolean allowLenientLuceneProcessing() |
991 | 1021 | { |
992 | | - log.debug("Generating VCF full text index for file: " + getExpData().getFile().getName()); |
993 | | - |
994 | | - DISCVRSeqRunner runner = new DISCVRSeqRunner(log); |
995 | | - if (!runner.jarExists()) |
996 | | - { |
997 | | - log.error("Unable to find DISCVRSeq.jar, skipping lucene index creation"); |
998 | | - return; |
999 | | - } |
1000 | | - |
1001 | | - File indexDir = getExpectedLocationOfLuceneIndex(false); |
1002 | | - if (indexDir != null && indexDir.exists()) |
1003 | | - { |
1004 | | - try |
1005 | | - { |
1006 | | - FileUtils.deleteDirectory(getExpectedLocationOfLuceneIndex(false)); |
1007 | | - } |
1008 | | - catch (IOException e) |
1009 | | - { |
1010 | | - throw new PipelineJobException(e); |
1011 | | - } |
1012 | | - } |
1013 | | - |
1014 | | - List<String> args = runner.getBaseArgs("VcfToLuceneIndexer"); |
1015 | | - args.add("-V"); |
1016 | | - args.add(getExpData().getFile().getPath()); |
1017 | | - |
1018 | | - args.add("-O"); |
1019 | | - args.add(indexDir.getPath()); |
1020 | | - |
1021 | | - List<String> infoFieldsForFullTextSearch = getInfoFieldsToIndex(); |
1022 | | - for (String field : infoFieldsForFullTextSearch) |
1023 | | - { |
1024 | | - args.add("-IF"); |
1025 | | - args.add(field); |
1026 | | - } |
1027 | | - |
1028 | | - args.add("--allow-missing-fields"); |
1029 | | - |
1030 | | - args.add("--index-stats"); |
1031 | | - args.add(getExpectedLocationOfLuceneIndexStats(false).getPath()); |
1032 | | - |
1033 | 1022 | JSONObject config = getExtraTrackConfig(); |
1034 | | - if (config != null && !config.isNull("lenientLuceneProcessing") && config.getBoolean("lenientLuceneProcessing")) |
1035 | | - { |
1036 | | - args.add("--validation-stringency"); |
1037 | | - args.add("LENIENT"); |
1038 | | - } |
1039 | | - |
1040 | | - runner.execute(args); |
| 1023 | + return config != null && !config.isNull("lenientLuceneProcessing") && config.getBoolean("lenientLuceneProcessing"); |
1041 | 1024 | } |
1042 | 1025 |
|
1043 | 1026 | protected void createIndex(File finalLocation, Logger log, File idx, boolean throwIfNotPrepared) throws PipelineJobException |
@@ -1095,8 +1078,17 @@ else if (TRACK_TYPES.gff.getFileType().isType(finalLocation) || TRACK_TYPES.gtf. |
1095 | 1078 | SequenceAnalysisService.get().sortGxf(log, finalLocation, null); |
1096 | 1079 | } |
1097 | 1080 |
|
1098 | | - TabixRunner tabix = new TabixRunner(log); |
1099 | | - tabix.execute(finalLocation); |
| 1081 | + // If JBrowse is using an unaltered input file, trust that index: |
| 1082 | + File expectedIdx = new File(finalLocation.getPath() + FileExtensions.TABIX_INDEX); |
| 1083 | + if (expectedIdx.exists() && getExpData().getFile().equals(finalLocation)) |
| 1084 | + { |
| 1085 | + log.debug("Existing index found, will not re-create: " + expectedIdx.getPath()); |
| 1086 | + } |
| 1087 | + else |
| 1088 | + { |
| 1089 | + TabixRunner tabix = new TabixRunner(log); |
| 1090 | + tabix.execute(finalLocation); |
| 1091 | + } |
1100 | 1092 | } |
1101 | 1093 | } |
1102 | 1094 | } |
@@ -1372,11 +1364,6 @@ public boolean shouldHaveFreeTextSearch() |
1372 | 1364 | return json != null && json.optBoolean("createFullTextIndex", false); |
1373 | 1365 | } |
1374 | 1366 |
|
1375 | | - public File getExpectedLocationOfLuceneIndexStats(boolean throwIfNotFound) |
1376 | | - { |
1377 | | - return new File(getExpectedLocationOfLuceneIndex(throwIfNotFound).getPath() + ".stats.txt"); |
1378 | | - } |
1379 | | - |
1380 | 1367 | public File getExpectedLocationOfLuceneIndex(boolean throwIfNotFound) |
1381 | 1368 | { |
1382 | 1369 | File basedir = getLocationOfProcessedTrack(false); |
|
0 commit comments