|
4 | 4 | import au.com.bytecode.opencsv.CSVWriter; |
5 | 5 | import htsjdk.samtools.SAMSequenceDictionary; |
6 | 6 | import htsjdk.samtools.SAMSequenceRecord; |
| 7 | +import htsjdk.samtools.reference.IndexedFastaSequenceFile; |
| 8 | +import htsjdk.samtools.reference.ReferenceSequence; |
7 | 9 | import htsjdk.samtools.util.CloseableIterator; |
8 | 10 | import htsjdk.variant.utils.SAMSequenceDictionaryExtractor; |
9 | 11 | import htsjdk.variant.variantcontext.Allele; |
@@ -340,6 +342,22 @@ public void processFilesRemote(List<SequenceOutputFile> inputFiles, JobContext c |
340 | 342 |
|
341 | 343 | int length = Integer.parseInt(line[3]) - Integer.parseInt(line[2]) - 1; //NOTE: pindel reports one base upstream+downstream as part of the indel |
342 | 344 | writer.writeNext(new String[]{ctx.getSequenceSupport().getCachedReadset(so.getReadset()).getName(), String.valueOf(so.getRowid()), String.valueOf(so.getReadset()), "Pindel", line[1], line[2], line[3], String.valueOf(length), "", line[0], line[4], "", line[5], line[6]}); |
| 345 | + |
| 346 | + if ("D".equalsIgnoreCase(line[0])) |
| 347 | + { |
| 348 | + ReferenceSequence rs = getReferenceSequence(ctx, genomeIds.iterator().next(), line[1]); |
| 349 | + int start0 = Integer.parseInt(line[2]) - 1; |
| 350 | + char alt = (char)rs.getBases()[start0]; |
| 351 | + |
| 352 | + int end1 = Integer.parseInt(line[3]) - 1; //pindel reports the end as one bp downstream of the event, so drop 1 bp even though we're keeping 1-based |
| 353 | + String ref = new String(Arrays.copyOfRange(rs.getBases(), start0, end1), StringUtilsLabKey.DEFAULT_CHARSET); |
| 354 | + if (ref.length() != length + 1) |
| 355 | + { |
| 356 | + throw new PipelineJobException("Improper pindel parsing: " + so.getRowid() + "/" + line[2] + "/" + line[3] + "/" + ref + "/" + alt); |
| 357 | + } |
| 358 | + |
| 359 | + uniqueIndels.add(line[1] + "<>" + line[2] + "<>" + ref + "<>" + alt); |
| 360 | + } |
343 | 361 | } |
344 | 362 | } |
345 | 363 | } |
@@ -653,6 +671,22 @@ public void processFilesRemote(List<SequenceOutputFile> inputFiles, JobContext c |
653 | 671 | ctx.getFileManager().addSequenceOutput(output, "Merged LoFreq Variants: " + inputFiles.size() + " VCFs", "Merged LoFreq Variant Table", null, null, genome.getGenomeId(), null); |
654 | 672 | } |
655 | 673 |
|
| 674 | + private Map<String, ReferenceSequence> seqMap = new HashMap<>(); |
| 675 | + |
| 676 | + private ReferenceSequence getReferenceSequence(JobContext ctx, int genomeId, String name) throws IOException |
| 677 | + { |
| 678 | + if (!seqMap.containsKey(name)) |
| 679 | + { |
| 680 | + ReferenceGenome genome = ctx.getSequenceSupport().getCachedGenome(genomeId); |
| 681 | + try (IndexedFastaSequenceFile fastaSequenceFile = new IndexedFastaSequenceFile(genome.getWorkingFastaFile())) |
| 682 | + { |
| 683 | + seqMap.put(name, fastaSequenceFile.getSequence(name)); |
| 684 | + } |
| 685 | + } |
| 686 | + |
| 687 | + return seqMap.get(name); |
| 688 | + } |
| 689 | + |
656 | 690 | private static String getCacheKey(String contig, int start) |
657 | 691 | { |
658 | 692 | return contig + "<>" + start; |
|
0 commit comments