Commit d25a658

Add separate indel-specific TSV output to the lofreq merge analysis

1 parent 1e60cae commit d25a658

2 files changed: +91, -43 lines

SequenceAnalysis/src/org/labkey/sequenceanalysis/pipeline/TaskFileManagerImpl.java (1 addition, 0 deletions)

@@ -835,6 +835,7 @@ public void cleanup(Collection<RecordedAction> actions, @Nullable AbstractResume
     {
         _job.getLogger().debug("performing file cleanup");
         _job.setStatus(PipelineJob.TaskStatus.running, "PERFORMING FILE CLEANUP");
+        _job.setErrors(0);
 
         _job.getLogger().debug("transferring " + _outputsToCreate.size() + " sequence outputs to pipeline job, existing: " + _job.getOutputsToCreate().size());
         for (SequenceOutputFile so : _outputsToCreate)
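For context, the single added line resets the pipeline job's error counter at the start of file cleanup, presumably so that error tallies recorded (and already handled) during earlier steps do not carry into the cleanup phase. A minimal sketch of the pattern, assuming PipelineJob exposes the counter through getErrors()/setErrors(int) as the call above implies; the surrounding logic here is illustrative, not the module's actual code:

// Illustrative only: clear previously recorded, already-handled errors
// before cleanup so the cleanup phase starts from a clean slate.
PipelineJob job = _job;
if (job.getErrors() > 0)
{
    job.getLogger().debug("clearing " + job.getErrors() + " previously recorded error(s) before file cleanup");
}
job.setErrors(0);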

SequenceAnalysis/src/org/labkey/sequenceanalysis/run/analysis/MergeLoFreqVcfHandler.java (90 additions, 43 deletions)

@@ -1,5 +1,6 @@
 package org.labkey.sequenceanalysis.run.analysis;
 
+import au.com.bytecode.opencsv.CSVReader;
 import au.com.bytecode.opencsv.CSVWriter;
 import htsjdk.samtools.SAMSequenceDictionary;
 import htsjdk.samtools.SAMSequenceRecord;
@@ -20,12 +21,15 @@
 import org.labkey.api.pipeline.PipelineJob;
 import org.labkey.api.pipeline.PipelineJobException;
 import org.labkey.api.pipeline.RecordedAction;
+import org.labkey.api.reader.Readers;
+import org.labkey.api.sequenceanalysis.SequenceAnalysisService;
 import org.labkey.api.sequenceanalysis.SequenceOutputFile;
 import org.labkey.api.sequenceanalysis.pipeline.AbstractParameterizedOutputHandler;
 import org.labkey.api.sequenceanalysis.pipeline.ReferenceGenome;
 import org.labkey.api.sequenceanalysis.pipeline.SequenceAnalysisJobSupport;
 import org.labkey.api.sequenceanalysis.pipeline.SequenceOutputHandler;
 import org.labkey.api.sequenceanalysis.pipeline.ToolParameterDescriptor;
+import org.labkey.api.util.FileUtil;
 import org.labkey.api.util.PageFlowUtil;
 import org.labkey.api.util.StringUtilsLabKey;
 import org.labkey.sequenceanalysis.SequenceAnalysisModule;
@@ -225,69 +229,112 @@ public void processFilesRemote(List<SequenceOutputFile> inputFiles, JobContext c
 
             Set<Integer> genomeIds = new HashSet<>();
 
-            for (SequenceOutputFile so : inputFiles)
+            File indelOutput = new File(ctx.getOutputDir(), basename + "indels.txt.gz");
+            try (CSVWriter writer = new CSVWriter(new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(indelOutput)), StringUtilsLabKey.DEFAULT_CHARSET)), '\t', CSVWriter.NO_QUOTE_CHARACTER))
             {
-                //This will error if the coverage file is not found. Perform check now to fail fast
-                try
-                {
-                    getDepthFile(so.getFile());
-                }
-                catch (PipelineJobException e)
-                {
-                    errors.add(e.getMessage());
-                }
+                writer.writeNext(new String[]{"ReadsetName", "OutputFileId", "ReadsetId", "Source", "Contig", "Start", "End", "Ref", "AltAllele", "GatkDepth", "LoFreqDepth", "AltCount", "AltAF"});
 
-                if (so.getLibrary_id() == null)
+                for (SequenceOutputFile so : inputFiles)
                 {
-                    throw new PipelineJobException("VCF lacks library id: " + so.getRowid());
-                }
+                    //This will error if the coverage file is not found. Perform check now to fail fast
+                    try
+                    {
+                        getDepthFile(so.getFile());
+                    }
+                    catch (PipelineJobException e)
+                    {
+                        errors.add(e.getMessage());
+                    }
 
-                genomeIds.add(so.getLibrary_id());
-                if (genomeIds.size() > 1)
-                {
-                    throw new PipelineJobException("Samples use more than one genome. Genome IDs: " + StringUtils.join(genomeIds, ","));
-                }
+                    if (so.getLibrary_id() == null)
+                    {
+                        throw new PipelineJobException("VCF lacks library id: " + so.getRowid());
+                    }
 
-                try (VCFFileReader reader = new VCFFileReader(so.getFile()); CloseableIterator<VariantContext> it = reader.iterator())
-                {
-                    while (it.hasNext())
+                    genomeIds.add(so.getLibrary_id());
+                    if (genomeIds.size() > 1)
                     {
-                        VariantContext vc = it.next();
-                        if (vc.getAttribute("AF") == null)
-                        {
-                            continue;
-                        }
+                        throw new PipelineJobException("Samples use more than one genome. Genome IDs: " + StringUtils.join(genomeIds, ","));
+                    }
 
-                        //Also perform santity check of VCF early
-                        if (vc.getAttribute("GATK_DP") == null)
+                    try (VCFFileReader reader = new VCFFileReader(so.getFile()); CloseableIterator<VariantContext> it = reader.iterator())
+                    {
+                        while (it.hasNext())
                         {
-                            errors.add("Expected GATK_DP annotation on line " + getCacheKey(vc.getContig(), vc.getStart()) + " in file: " + so.getFile().getPath());
-                        }
+                            VariantContext vc = it.next();
+                            if (vc.getAttribute("AF") == null)
+                            {
+                                continue;
+                            }
 
-                        double af = vc.getAttributeAsDouble("AF", 0.0);
-                        if (af >= minAfThreshold)
-                        {
-                            if (vc.isIndel())
+                            //Also perform santity check of VCF early
+                            if (vc.getAttribute("GATK_DP") == null)
                             {
-                                uniqueIndels.add(vc.getContig() + "<>" + vc.getStart() + "<>" + vc.getReference().getBaseString() + "<>" + vc.getAlternateAlleles().get(0).getBaseString());
+                                errors.add("Expected GATK_DP annotation on line " + getCacheKey(vc.getContig(), vc.getStart()) + " in file: " + so.getFile().getPath());
                             }
 
-                            for (int i = 0; i < vc.getLengthOnReference();i++)
+                            double af = vc.getAttributeAsDouble("AF", 0.0);
+                            if (af >= minAfThreshold)
                             {
-                                int effectiveStart = vc.getStart() + i;
-                                String key = getCacheKey(vc.getContig(), effectiveStart);
-                                Allele ref = vc.getLengthOnReference() == 1 ? vc.getReference() : Allele.create(vc.getReference().getBaseString().substring(i, i + 1), true);
-                                SiteAndAlleles site = siteToAllele.containsKey(key) ? siteToAllele.get(key) : new SiteAndAlleles(vc.getContig(), effectiveStart, ref);
-                                if (!siteToAllele.containsKey(key))
+                                if (vc.isIndel())
                                 {
-                                    whitelistSites.add(Pair.of(vc.getContig(), effectiveStart));
+                                    uniqueIndels.add(vc.getContig() + "<>" + vc.getStart() + "<>" + vc.getReference().getBaseString() + "<>" + vc.getAlternateAlleles().get(0).getBaseString());
+                                    List<Integer> depths = vc.getAttributeAsIntList("DP4", 0);
+                                    int alleleDepth = depths.get(2) + depths.get(3);
+
+                                    writer.writeNext(new String[]{ctx.getSequenceSupport().getCachedReadset(so.getReadset()).getName(), String.valueOf(so.getRowid()), String.valueOf(so.getReadset()), "LoFreq", vc.getContig(), String.valueOf(vc.getStart()), String.valueOf(vc.getEnd()), vc.getReference().getBaseString(), vc.getAlternateAlleles().get(0).getBaseString(), vc.getAttributeAsString("GATK_DP", "ND"), vc.getAttributeAsString("DP", "ND"), String.valueOf(alleleDepth), vc.getAttributeAsString("AF", "ND")});
+                                }
+
+                                for (int i = 0; i < vc.getLengthOnReference(); i++)
+                                {
+                                    int effectiveStart = vc.getStart() + i;
+                                    String key = getCacheKey(vc.getContig(), effectiveStart);
+                                    Allele ref = vc.getLengthOnReference() == 1 ? vc.getReference() : Allele.create(vc.getReference().getBaseString().substring(i, i + 1), true);
+                                    SiteAndAlleles site = siteToAllele.containsKey(key) ? siteToAllele.get(key) : new SiteAndAlleles(vc.getContig(), effectiveStart, ref);
+                                    if (!siteToAllele.containsKey(key))
+                                    {
+                                        whitelistSites.add(Pair.of(vc.getContig(), effectiveStart));
+                                    }
+                                    siteToAllele.put(key, site);
+                                }
+                            }
+                        }
+                    }
+
+                    String pindelBasename = SequenceAnalysisService.get().getUnzippedBaseName(so.getFile().getName());
+                    if (pindelBasename.endsWith("lofreq"))
+                    {
+                        pindelBasename = FileUtil.getBaseName(pindelBasename);
+                    }
+
+                    File pindelFile = new File(so.getFile().getParentFile(), pindelBasename + ".pindel.txt");
+                    if (pindelFile.exists())
+                    {
+                        try (CSVReader reader = new CSVReader(Readers.getReader(pindelFile)))
+                        {
+                            String[] line;
+                            while ((line = reader.readNext()) != null)
+                            {
+                                writer.writeNext(new String[]{"Type", "Contig", "Start", "End", "Depth", "ReadSupport", "Fraction"});
+                                if (line[0].equals("Type"))
+                                {
+                                    continue;
                                 }
-                                siteToAllele.put(key, site);
+
+                                writer.writeNext(new String[]{ctx.getSequenceSupport().getCachedReadset(so.getReadset()).getName(), String.valueOf(so.getRowid()), String.valueOf(so.getReadset()), "Pindel", line[1], line[2], line[3], "", line[0], line[4], "", line[5], line[6]});
                             }
                         }
                     }
+                    else
+                    {
+                        ctx.getLogger().warn("Unable to find pindel file, expected: " + pindelFile.getPath());
+                    }
                 }
             }
+            catch (IOException e)
+            {
+                throw new PipelineJobException(e);
+            }
 
             if (!errors.isEmpty())
             {
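A note on the new LoFreq rows: AltCount is derived from the DP4 INFO field, which LoFreq emits as four integers (ref-forward, ref-reverse, alt-forward, alt-reverse read counts), so depths.get(2) + depths.get(3) is the alt-supporting depth across both strands. A self-contained sketch of that extraction with htsjdk; the input path is hypothetical and any LoFreq-annotated VCF would do:

import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFFileReader;

import java.io.File;
import java.util.List;

public class Dp4Demo
{
    public static void main(String[] args)
    {
        // Hypothetical input; the second argument disables the index requirement
        try (VCFFileReader reader = new VCFFileReader(new File("example.lofreq.vcf.gz"), false))
        {
            for (VariantContext vc : reader)
            {
                if (!vc.isIndel() || vc.getAttribute("DP4") == null)
                {
                    continue;
                }

                // DP4 = {ref-fwd, ref-rev, alt-fwd, alt-rev}; sum the two alt counts
                List<Integer> depths = vc.getAttributeAsIntList("DP4", 0);
                int altCount = depths.get(2) + depths.get(3);
                System.out.println(vc.getContig() + ":" + vc.getStart() + " altCount=" + altCount + " AF=" + vc.getAttributeAsDouble("AF", 0.0));
            }
        }
    }
}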

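The merged table itself goes through an opencsv CSVWriter layered over a gzip stream, tab-separated with quoting disabled, which is what makes the output a .txt.gz TSV. A sketch of the same writer chain in isolation; the output name and rows are hypothetical, and StandardCharsets.UTF_8 stands in for LabKey's StringUtilsLabKey.DEFAULT_CHARSET:

import au.com.bytecode.opencsv.CSVWriter;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.zip.GZIPOutputStream;

public class GzipTsvDemo
{
    public static void main(String[] args) throws IOException
    {
        File out = new File("demo.indels.txt.gz");  // hypothetical output path

        // FileOutputStream -> GZIPOutputStream -> OutputStreamWriter -> BufferedWriter,
        // then CSVWriter with a tab separator and no quote character, as in the diff
        Writer w = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(out)), StandardCharsets.UTF_8));
        try (CSVWriter writer = new CSVWriter(w, '\t', CSVWriter.NO_QUOTE_CHARACTER))
        {
            writer.writeNext(new String[]{"Contig", "Start", "Ref", "AltAllele"});
            writer.writeNext(new String[]{"chr01", "1042", "A", "AT"});
        }
    }
}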
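Finally, the pindel branch folds a second caller's output into the same 13-column table. The handler derives the expected file name by stripping the .vcf.gz and .lofreq suffixes from the VCF name, then remaps each row of the seven-column pindel summary (Type, Contig, Start, End, Depth, ReadSupport, Fraction) so that Type lands in AltAllele, Depth in GatkDepth, ReadSupport in AltCount and Fraction in AltAF, leaving Ref and LoFreqDepth blank. A standalone sketch of that remapping; the file name is hypothetical and the summary is assumed tab-delimited:

import au.com.bytecode.opencsv.CSVReader;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;

public class PindelRemapDemo
{
    public static void main(String[] args) throws IOException
    {
        File pindelFile = new File("sample.pindel.txt");  // hypothetical input
        try (CSVReader reader = new CSVReader(new FileReader(pindelFile), '\t'))
        {
            String[] line;
            while ((line = reader.readNext()) != null)
            {
                if (line[0].equals("Type"))
                {
                    continue;  // skip the pindel header row
                }

                // Trailing ten columns of the merged layout:
                // Source, Contig, Start, End, Ref, AltAllele, GatkDepth, LoFreqDepth, AltCount, AltAF
                String[] row = new String[]{"Pindel", line[1], line[2], line[3], "", line[0], line[4], "", line[5], line[6]};
                System.out.println(String.join("\t", row));
            }
        }
    }
}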