Skip to content

Commit 4661fa2

Browse files
committed
Prepare to support CRAM format
1 parent bf63a08 commit 4661fa2

File tree

6 files changed

+89
-11
lines changed

6 files changed

+89
-11
lines changed

SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/pipeline/AbstractAlignmentStepProvider.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,11 @@ abstract public class AbstractAlignmentStepProvider<StepType extends AlignmentSt
3333
public static String ALIGNMENT_MODE_PARAM = "alignmentMode";
3434
public static String SUPPORT_MERGED_UNALIGNED = "supportsMergeUnaligned";
3535
public static String COLLECT_WGS_METRICS = "collectWgsMetrics";
36+
public static String CONVERT_TO_CRAM = "convertToCram";
3637
public static String COLLECT_WGS_METRICS_NON_ZERO = "collectWgsMetricsNonZero";
3738
public static String DISCARD_BAM = "discardBam";
38-
public static String SUPPORT_ALIGNMENT_METRICS = "supportAlignmentMetrics";
3939

40-
public static enum ALIGNMENT_MODE
40+
public enum ALIGNMENT_MODE
4141
{
4242
ALIGN_THEN_MERGE(),
4343
MERGE_THEN_ALIGN();

SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/pipeline/ReferenceGenome.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,12 @@ public interface ReferenceGenome extends Serializable
4646
*/
4747
public @NotNull File getWorkingFastaFile();
4848

49+
/**
50+
* @return The gzipped companion of the working FASTA file, i.e. the path returned by getWorkingFastaFile() with ".gz" appended.
51+
* There is no fallback to the uncompressed FASTA: ReferenceGenomeImpl throws IllegalStateException if the gzipped file does not exist.
52+
*/
53+
public @NotNull File getWorkingFastaFileGzipped();
54+
4955
public void setWorkingFasta(File workingFasta);
5056

5157
/**

SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisMaintenanceTask.java

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import org.labkey.api.exp.api.ExperimentService;
1313
import org.labkey.api.ldk.LDKService;
1414
import org.labkey.api.pipeline.PipeRoot;
15+
import org.labkey.api.pipeline.PipelineJobException;
1516
import org.labkey.api.pipeline.PipelineService;
1617
import org.labkey.api.query.FieldKey;
1718
import org.labkey.api.security.User;
@@ -25,10 +26,12 @@
2526
import org.labkey.api.util.SystemMaintenance.MaintenanceTask;
2627
import org.labkey.sequenceanalysis.model.AnalysisModelImpl;
2728
import org.labkey.sequenceanalysis.pipeline.CacheGenomeTrigger;
29+
import org.labkey.sequenceanalysis.pipeline.ReferenceGenomeImpl;
2830

2931
import java.io.File;
3032
import java.io.IOException;
3133
import java.util.ArrayList;
34+
import java.util.Arrays;
3235
import java.util.Collections;
3336
import java.util.HashMap;
3437
import java.util.HashSet;
@@ -206,7 +209,7 @@ else if (!d.getFile().exists())
206209
}
207210
}
208211

209-
private void processContainer(Container c, Logger log) throws IOException
212+
private void processContainer(Container c, Logger log) throws IOException, PipelineJobException
210213
{
211214
PipeRoot root = PipelineService.get().getPipelineRootSetting(c);
212215
if (root != null && !root.isCloudRoot())
@@ -283,7 +286,8 @@ private void processContainer(Container c, Logger log) throws IOException
283286
{
284287
//inspect within library
285288
List<String> expectedChildren = new ArrayList<>();
286-
Integer fastaId = new TableSelector(SequenceAnalysisSchema.getInstance().getSchema().getTable(SequenceAnalysisSchema.TABLE_REF_LIBRARIES), PageFlowUtil.set("fasta_file")).getObject(Integer.parseInt(child.getName()), Integer.class);
289+
int libraryId = Integer.parseInt(child.getName());
290+
Integer fastaId = new TableSelector(SequenceAnalysisSchema.getInstance().getSchema().getTable(SequenceAnalysisSchema.TABLE_REF_LIBRARIES), PageFlowUtil.set("fasta_file")).getObject(libraryId, Integer.class);
287291
if (fastaId == null)
288292
{
289293
log.error("Unable to find FASTA ExpData in DB matching jbrowse directory: " + child.getPath());
@@ -297,6 +301,17 @@ private void processContainer(Container c, Logger log) throws IOException
297301
log.error("expected fasta file does not exist: " + fasta.getPath());
298302
}
299303

304+
// Use this to retroactively convert existing genomes:
305+
File gz = new File(fasta.getPath() + ".gz");
306+
if (!gz.exists())
307+
{
308+
ReferenceGenomeImpl genome = new ReferenceGenomeImpl(fasta, fastaData, libraryId, null);
309+
genome.createGzippedFile(log);
310+
}
311+
312+
expectedChildren.add(fasta.getName() + ".gz");
313+
expectedChildren.add(fasta.getName() + ".gzi");
314+
300315
expectedChildren.add(fasta.getName());
301316
expectedChildren.add(fasta.getName() + ".fai");
302317
expectedChildren.add(FileUtil.getBaseName(fasta.getName()) + ".idKey.txt");
@@ -320,8 +335,6 @@ private void processContainer(Container c, Logger log) throws IOException
320335
}
321336
}
322337

323-
Integer libraryId = Integer.parseInt(child.getName());
324-
325338
//check/verify tracks
326339
File trackDir = new File(child, "tracks");
327340
if (trackDir.exists())
@@ -462,6 +475,7 @@ private void deleteFile(File f, Logger log) throws IOException
462475
private static FileType _cramFileType = new FileType("cram");
463476
private static FileType _vcfFileType = new FileType("vcf", FileType.gzSupportLevel.SUPPORT_GZ);
464477
private static FileType _bedFileType = new FileType("bed");
478+
private static FileType _fastaFileType = new FileType(Arrays.asList("fasta", "fa"), "fasta", FileType.gzSupportLevel.SUPPORT_GZ);
465479

466480
/**
467481
* This is intended to return any files associated with an input, which is primarily designed to pick up index files
@@ -496,6 +510,11 @@ else if (_bedFileType.isType(f))
496510
{
497511
ret.add(f.getName() + ".idx");
498512
}
513+
else if (_fastaFileType.isType(f))
514+
{
515+
ret.add(f.getName() + ".gz");
516+
ret.add(f.getName() + ".gzi");
517+
}
499518

500519
return ret;
501520
}

SequenceAnalysis/src/org/labkey/sequenceanalysis/pipeline/ReferenceGenomeImpl.java

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,25 @@
11
package org.labkey.sequenceanalysis.pipeline;
22

33
import com.fasterxml.jackson.annotation.JsonIgnore;
4+
import org.apache.logging.log4j.Logger;
45
import org.jetbrains.annotations.NotNull;
56
import org.jetbrains.annotations.Nullable;
67
import org.labkey.api.data.Container;
78
import org.labkey.api.data.ContainerManager;
89
import org.labkey.api.data.SimpleFilter;
910
import org.labkey.api.data.TableSelector;
1011
import org.labkey.api.exp.api.ExpData;
12+
import org.labkey.api.pipeline.PipelineJobException;
1113
import org.labkey.api.query.FieldKey;
1214
import org.labkey.api.sequenceanalysis.pipeline.AlignerIndexUtil;
1315
import org.labkey.api.sequenceanalysis.pipeline.ReferenceGenome;
16+
import org.labkey.api.sequenceanalysis.pipeline.ReferenceGenomeManager;
1417
import org.labkey.api.sequenceanalysis.pipeline.SequencePipelineService;
1518
import org.labkey.api.util.FileUtil;
1619
import org.labkey.api.util.PageFlowUtil;
1720
import org.labkey.sequenceanalysis.SequenceAnalysisSchema;
21+
import org.labkey.sequenceanalysis.run.util.BgzipRunner;
22+
import org.labkey.sequenceanalysis.run.util.FastaIndexer;
1823

1924
import java.io.File;
2025

@@ -73,6 +78,18 @@ public boolean isTemporaryGenome() {
7378
return _workingFasta == null ? _sourceFasta : _workingFasta;
7479
}
7580

81+
@Override
82+
public @NotNull File getWorkingFastaFileGzipped()
83+
{
84+
File fasta = new File(getWorkingFastaFile().getPath() + ".gz");
85+
if (!fasta.exists())
86+
{
87+
throw new IllegalStateException("File does not exist: " + fasta.getPath());
88+
}
89+
90+
return fasta;
91+
}
92+
7693
@Override
7794
public void setWorkingFasta(File workingFasta)
7895
{
@@ -171,4 +188,31 @@ public static Container getFolderForGenome(int libraryId)
171188

172189
return containerId == null ? null : ContainerManager.getForId(containerId);
173190
}
191+
192+
public void createGzippedFile(Logger log) throws PipelineJobException
193+
{
194+
createGzippedFile(log, false);
195+
}
196+
197+
public void createGzippedFile(Logger log, boolean deleteIfExists) throws PipelineJobException
198+
{
199+
File target = new File(getSourceFastaFile().getPath() + ".gz");
200+
if (target.exists())
201+
{
202+
if (deleteIfExists)
203+
{
204+
target.delete();
205+
}
206+
else
207+
{
208+
return;
209+
}
210+
}
211+
212+
BgzipRunner runner = new BgzipRunner(log);
213+
File gz = runner.execute(getSourceFastaFile(), true);
214+
new FastaIndexer(log).execute(gz);
215+
216+
ReferenceGenomeManager.get().markGenomeModified(this, log);
217+
}
174218
}

SequenceAnalysis/src/org/labkey/sequenceanalysis/run/util/BgzipRunner.java

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
package org.labkey.sequenceanalysis.run.util;
22

33
import org.apache.logging.log4j.Logger;
4-
import org.apache.logging.log4j.LogManager;
54
import org.jetbrains.annotations.Nullable;
65
import org.labkey.api.pipeline.PipelineJobException;
76
import org.labkey.api.sequenceanalysis.pipeline.SequencePipelineService;
@@ -35,15 +34,20 @@ public void setCompressionLevel(int compressionLevel)
3534
}
3635

3736
public File execute(File input) throws PipelineJobException
37+
{
38+
return execute(input, false);
39+
}
40+
41+
public File execute(File input, boolean preserveInput) throws PipelineJobException
3842
{
3943
getLogger().info("BGZipping file: " + input.getPath());
4044

41-
execute(getParams(input));
45+
execute(getParams(input, preserveInput));
4246
File output = new File(input.getPath() + ".gz");
4347
if (!output.exists())
4448
throw new PipelineJobException("Output not created, expected: " + output.getPath());
4549

46-
if (input.exists())
50+
if (!preserveInput && input.exists())
4751
{
4852
getLogger().debug("deleting input: " + input.getPath());
4953
input.delete();
@@ -52,12 +56,17 @@ public File execute(File input) throws PipelineJobException
5256
return output;
5357
}
5458

55-
public List<String> getParams(File input)
59+
private List<String> getParams(File input, boolean preserveInput)
5660
{
5761
List<String> params = new ArrayList<>();
5862
params.add(getExe().getPath());
5963
params.add("-f");
6064

65+
if (preserveInput)
66+
{
67+
params.add("-k");
68+
}
69+
6170
Integer threads;
6271
if (_maxThreads == -1)
6372
{

SequenceAnalysis/src/org/labkey/sequenceanalysis/run/util/FastaIndexer.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ public File execute(File input) throws PipelineJobException
3838

3939
public static File getExpectedIndexName(File input)
4040
{
41-
return new File(input.getPath() + ".fai");
41+
return new File(input.getPath() + (input.getName().toLowerCase().endsWith("gz") ? ".gzi" : ".fai"));
4242
}
4343

4444
public List<String> getParams(File input)

0 commit comments

Comments
 (0)