
Commit 5d64be0

Support CRAM archival mode
1 parent: bf6e784

4 files changed, +31 -10 lines

SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/pipeline/AbstractAlignmentStepProvider.java

Lines changed: 5 additions & 0 deletions
@@ -34,6 +34,7 @@ abstract public class AbstractAlignmentStepProvider<StepType extends AlignmentSt
     public static String SUPPORT_MERGED_UNALIGNED = "supportsMergeUnaligned";
     public static String COLLECT_WGS_METRICS = "collectWgsMetrics";
     public static String CONVERT_TO_CRAM = "convertToCram";
+    public static String CRAM_ARCHIVAL_MODE = "doCramArchivalMode";
     public static String COLLECT_WGS_METRICS_NON_ZERO = "collectWgsMetricsNonZero";
     public static String DISCARD_BAM = "discardBam";

@@ -116,6 +117,10 @@ private static List<ToolParameterDescriptor> getParamList(List<ToolParameterDesc
             put("checked", false);
         }}, false));
 
+        parameters.add(ToolParameterDescriptor.create("doCramArchivalMode", "CRAM Archival Mode", "If selected, the CRAM will undergo additional compression to save space. This is lossy and may not be compatible with all downstream tools. See samtools view --output-fmt-option archive", "checkbox", new JSONObject(){{
+            put("checked", false);
+        }}, false));
+
         parameters.add(ToolParameterDescriptor.create(ALIGNMENT_MODE_PARAM, "Alignment Mode", "If your readset has more than one pair of FASTQs, the pipeline can either align each pair sequentially (and then merge these BAMs), or merge the pairs of FASTQs first and then perform alignment once. The default is to align each pair of FASTQs separately; however, some pipelines like STAR require the latter.", "ldk-simplecombo", new JSONObject(){{
             put("storeValues", ALIGNMENT_MODE.ALIGN_THEN_MERGE.name() + ";" + ALIGNMENT_MODE.MERGE_THEN_ALIGN.name());
             put("value", alignmentMode.name());

SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/pipeline/SamtoolsCramConverter.java

Lines changed: 18 additions & 8 deletions
@@ -18,25 +18,40 @@ public SamtoolsCramConverter(Logger log)
         super(log);
     }
 
-    public File convert(File inputBam, File outputCram, File gzippedFasta, boolean doIndex, @Nullable Integer threads) throws PipelineJobException
+    public File convert(File inputBam, File outputCram, File gzippedFasta, boolean doIndex, @Nullable Integer threads, boolean archivalMode) throws PipelineJobException
     {
         getLogger().info("Converting SAM/BAM to CRAM: " + inputBam.getPath());
 
         List<String> params = new ArrayList<>();
         params.add(getSamtoolsPath().getPath());
         params.add("view");
 
-        params.add("-C");
+        params.add("--output-fmt");
+        params.add("cram,version=3.0");
 
         params.add("-o");
         params.add(outputCram.getPath());
 
+        // CRAM does, however, have an optional archive settings mode (samtools view ...)
+        // which is a lossy compression, doing things like removing read names, removing additional accessory fields, and additional compression of quality scores.
+        // In all cases, the base sequence of the reads is preserved: https://www.htslib.org/doc/samtools.html
+        if (archivalMode)
+        {
+            params.add("--output-fmt-option");
+            params.add("archive");
+        }
+
         params.add("-T");
         params.add(gzippedFasta.getPath());
 
+        if (doIndex)
+        {
+            params.add("--write-index");
+        }
+
         if (threads != null)
        {
-            params.add("--threads");
+            params.add("-@");
             params.add(String.valueOf(threads));
         }
 
@@ -49,11 +64,6 @@ public File convert(File inputBam, File outputCram, File gzippedFasta, boolean d
             throw new PipelineJobException("Missing output: " + outputCram.getPath());
         }
 
-        if (doIndex)
-        {
-            doIndex(outputCram, threads);
-        }
-
         return outputCram;
     }
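For orientation, the argument list assembled above corresponds roughly to a samtools command line like the sketch below. The file names and thread count are placeholder values, and the input BAM is presumably appended by the surrounding execution code rather than in the lines shown in this hunk:

    samtools view \
        --output-fmt cram,version=3.0 \
        -o output.cram \
        --output-fmt-option archive \
        -T reference.fasta.gz \
        --write-index \
        -@ 4 \
        input.bam

With archival mode off, the --output-fmt-option archive argument is simply omitted and the conversion remains a standard, lossless BAM-to-CRAM rewrite.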

SequenceAnalysis/src/org/labkey/sequenceanalysis/pipeline/ConvertToCramHandler.java

Lines changed: 5 additions & 1 deletion
@@ -42,6 +42,9 @@ public ConvertToCramHandler()
                 ToolParameterDescriptor.create("replaceOriginal", "Replace Original File", "If selected, the input BAM will be deleted and the database record will be switched to use this filepath.", "checkbox", new JSONObject(){{
                     put("checked", true);
                 }}, true),
+                ToolParameterDescriptor.create("doCramArchivalMode", "CRAM Archival Mode", "If selected, the CRAM will undergo additional compression to save space. This is lossy and may not be compatible with all downstream tools. See samtools view --output-fmt-option archive", "checkbox", new JSONObject(){{
+                    put("checked", false);
+                }}, false),
                 ToolParameterDescriptor.create("useOutputFileContainer", "Submit to Source File Workbook", "If checked, each job will be submitted to the same workbook as the input file, as opposed to submitting all jobs to the same workbook. This is primarily useful if submitting a large batch of files to process separately. This only applies if 'Run Separately' is selected.", "checkbox", new JSONObject(){{
                     put("checked", true);
                 }}, true)
 
@@ -103,6 +106,7 @@ public void processFilesOnWebserver(PipelineJob job, SequenceAnalysisJobSupport
     public void processFilesRemote(List<SequenceOutputFile> inputFiles, JobContext ctx) throws UnsupportedOperationException, PipelineJobException
     {
         boolean replaceOriginal = ctx.getParams().optBoolean("replaceOriginal", false);
+        boolean doCramArchivalMode = ctx.getParams().optBoolean("doCramArchivalMode", false);
         ctx.getLogger().info("Replace input BAM: " + replaceOriginal);
 
         Integer threads = SequencePipelineService.get().getMaxThreads(ctx.getLogger());
 
@@ -124,7 +128,7 @@ public void processFilesRemote(List<SequenceOutputFile> inputFiles, JobContext c
             }
             else
             {
-                new SamtoolsCramConverter(ctx.getLogger()).convert(so.getFile(), cram, genome.getWorkingFastaFileGzipped(), true, threads);
+                new SamtoolsCramConverter(ctx.getLogger()).convert(so.getFile(), cram, genome.getWorkingFastaFileGzipped(), true, threads, doCramArchivalMode);
             }
 
             checkCramAndIndex(so);
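The parameter description and the converter's comment note that archival mode is lossy (read names and some accessory fields may be removed, and quality scores compressed further). A rough way to see what the extra compression trades away is to compare a CRAM written with default settings against one written in archival mode; the file and reference names below are hypothetical placeholders, not outputs of this handler:

    # compare on-disk sizes of a default CRAM vs an archival-mode CRAM
    ls -l sample.cram sample.archive.cram

    # decode a few records from each to inspect read names and auxiliary tags
    samtools view -T reference.fasta.gz sample.cram | head -n 3
    samtools view -T reference.fasta.gz sample.archive.cram | head -n 3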

SequenceAnalysis/src/org/labkey/sequenceanalysis/pipeline/SequenceAlignmentTask.java

Lines changed: 3 additions & 1 deletion
@@ -1145,7 +1145,9 @@ else if (step.expectToCreateNewBam())
 
                 // optional convert to CRAM:
                 ToolParameterDescriptor cramParam = alignmentStep.getProvider().getParameterByName(AbstractAlignmentStepProvider.CONVERT_TO_CRAM);
+                ToolParameterDescriptor cramArchivalParam = alignmentStep.getProvider().getParameterByName(AbstractAlignmentStepProvider.CRAM_ARCHIVAL_MODE);
                 boolean doCramConvert = cramParam != null && cramParam.extractValue(getJob(), alignmentStep.getProvider(), alignmentStep.getStepIdx(), Boolean.class, false);
+                boolean doArchival = cramArchivalParam != null && cramArchivalParam.extractValue(getJob(), alignmentStep.getProvider(), alignmentStep.getStepIdx(), Boolean.class, false);
                 if (doCramConvert)
                 {
                     getJob().getLogger().info("BAM will be converted to CRAM");
 
@@ -1154,7 +1156,7 @@ else if (step.expectToCreateNewBam())
                     Integer threads = SequenceTaskHelper.getMaxThreads(getJob());
                     if (!cramFileIdx.exists())
                     {
-                        new SamtoolsCramConverter(getJob().getLogger()).convert(renamedBam, cramFile, referenceGenome.getWorkingFastaFileGzipped(), true, threads);
+                        new SamtoolsCramConverter(getJob().getLogger()).convert(renamedBam, cramFile, referenceGenome.getWorkingFastaFileGzipped(), true, threads, doArchival);
                     }
                     else
                     {

0 commit comments
