Skip to content

Commit 93dc3b0

Browse files
authored
Merge pull request #30 from BimberLab/fb_merge_discvr-20.11
Merge discvr-20.11 to discvr-21.3
2 parents 882325e + 5fc40f4 commit 93dc3b0

29 files changed

+923
-46
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
ALTER TABLE sequenceanalysis.aa_snps_by_codon ALTER column codon type VARCHAR(4000);
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
ALTER TABLE sequenceanalysis.aa_snps_by_codon ALTER column ref_aa type VARCHAR(4000);
2+
ALTER TABLE sequenceanalysis.aa_snps_by_codon ALTER column q_aa type VARCHAR(4000);
3+
4+
ALTER TABLE sequenceanalysis.nt_snps_by_pos ALTER column ref_nt type VARCHAR(4000);
5+
ALTER TABLE sequenceanalysis.nt_snps_by_pos ALTER column q_nt type VARCHAR(4000);
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
ALTER TABLE sequenceanalysis.aa_snps_by_codon ALTER column codon VARCHAR(4000);
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
ALTER TABLE sequenceanalysis.aa_snps_by_codon ALTER column ref_aa VARCHAR(4000);
2+
ALTER TABLE sequenceanalysis.aa_snps_by_codon ALTER column q_aa VARCHAR(4000);
3+
4+
ALTER TABLE sequenceanalysis.nt_snps_by_pos ALTER column ref_nt VARCHAR(4000);
5+
ALTER TABLE sequenceanalysis.nt_snps_by_pos ALTER column q_nt VARCHAR(4000);
6+

SequenceAnalysis/resources/web/SequenceAnalysis/panel/SequenceImportPanel.js

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -414,8 +414,9 @@ Ext4.define('SequenceAnalysis.panel.SequenceImportPanel', {
414414
ILLUMINA_REGEX: /^(.+)_L(.+)_(R){0,1}([0-9])(_[0-9]+){0,1}(\.f(ast){0,1}q)(\.gz)?$/i,
415415

416416
//Example from NextSeq: RNA160915BB_34A_22436_Gag120_Clone-10_S10_R1_001.fastq.gz
417+
//OR: 210304_NS500556_0465_AHHVNKAFX2.OSPHL00042_old_cdna_2.R2.fq.gz
417418
//This should also allow simple pairs, like: file1_1.fq.gz and file1_2.fq.gz
418-
ILLUMINA_REGEX_NO_LANE: /^(.+)_(R){0,1}([0-9])(_[0-9]+){0,1}(\.f(ast){0,1}q)(\.gz)?$/i,
419+
ILLUMINA_REGEX_NO_LANE: /^(.+)[\._](R){0,1}([0-9])(_[0-9]+){0,1}(\.f(ast){0,1}q)(\.gz)?$/i,
419420

420421
//example: 214-3-6-GEX_1_S29_L002_R1_001.fastq.gz
421422
//<sample name>_<barcode sequence>_L<lane (0-padded to 3 digits)>_R<read number>_<set number (0-padded to 3 digits>.fastq.gz

SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisModule.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@
9494
import org.labkey.sequenceanalysis.run.analysis.ImmunoGenotypingAnalysis;
9595
import org.labkey.sequenceanalysis.run.analysis.LofreqAnalysis;
9696
import org.labkey.sequenceanalysis.run.analysis.MergeLoFreqVcfHandler;
97+
import org.labkey.sequenceanalysis.run.analysis.NextCladeHandler;
9798
import org.labkey.sequenceanalysis.run.analysis.PARalyzerAnalysis;
9899
import org.labkey.sequenceanalysis.run.analysis.PangolinHandler;
99100
import org.labkey.sequenceanalysis.run.analysis.PindelAnalysis;
@@ -178,7 +179,7 @@ public String getName()
178179
@Override
179180
public Double getSchemaVersion()
180181
{
181-
return 12.325;
182+
return 12.327;
182183
}
183184

184185
@Override
@@ -349,6 +350,7 @@ public static void registerPipelineSteps()
349350
SequenceAnalysisService.get().registerFileHandler(new GenomicsDBAppendHandler());
350351
SequenceAnalysisService.get().registerFileHandler(new MergeLoFreqVcfHandler());
351352
SequenceAnalysisService.get().registerFileHandler(new PangolinHandler());
353+
SequenceAnalysisService.get().registerFileHandler(new NextCladeHandler());
352354

353355
SequenceAnalysisService.get().registerReadsetHandler(new MultiQCHandler());
354356

SequenceAnalysis/src/org/labkey/sequenceanalysis/run/analysis/LofreqAnalysis.java

Lines changed: 153 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
import org.apache.commons.lang3.math.NumberUtils;
2424
import org.apache.commons.math3.stat.descriptive.rank.Median;
2525
import org.apache.logging.log4j.Logger;
26-
import org.apache.logging.log4j.LogManager;
2726
import org.biojava3.core.sequence.DNASequence;
2827
import org.biojava3.core.sequence.compound.AmbiguityDNACompoundSet;
2928
import org.biojava3.core.sequence.compound.NucleotideCompound;
@@ -60,9 +59,11 @@
6059
import org.labkey.api.util.PageFlowUtil;
6160
import org.labkey.api.writer.PrintWriters;
6261
import org.labkey.sequenceanalysis.SequenceAnalysisModule;
62+
import org.labkey.sequenceanalysis.SequenceAnalysisSchema;
6363
import org.labkey.sequenceanalysis.run.util.DepthOfCoverageWrapper;
6464
import org.labkey.sequenceanalysis.run.variant.SNPEffStep;
6565
import org.labkey.sequenceanalysis.run.variant.SnpEffWrapper;
66+
import org.labkey.sequenceanalysis.util.ReferenceLibraryHelperImpl;
6667

6768
import java.io.File;
6869
import java.io.IOException;
@@ -73,6 +74,7 @@
7374
import java.util.ArrayList;
7475
import java.util.Arrays;
7576
import java.util.Collections;
77+
import java.util.Date;
7678
import java.util.HashMap;
7779
import java.util.HashSet;
7880
import java.util.LinkedHashMap;
@@ -137,7 +139,7 @@ public Provider()
137139
{{
138140

139141
}}, false),
140-
ToolParameterDescriptor.create("runPangolin", "Run Pangolin", "If selected, Pangolin will be used to score the consensus against common SARS-CoV-2 lineages.", "checkbox", new JSONObject()
142+
ToolParameterDescriptor.create("runPangolin", "Run Pangolin and NextClade", "If selected, Pangolin and NextClade will be used to score the consensus against common SARS-CoV-2 lineages.", "checkbox", new JSONObject()
141143
{{
142144

143145
}}, false),
@@ -146,8 +148,20 @@ public Provider()
146148
put("minValue", 0.0);
147149
put("maxValue", 1.0);
148150
put("decimalPrecision", 2);
149-
}}, 0.1)
150-
), PageFlowUtil.set("sequenceanalysis/field/GenomeFileSelectorField.js"), "http://csb5.github.io/lofreq/");
151+
}}, 0.1),
152+
ToolParameterDescriptor.create("dbImport", "Import SNVs", "If selected, the LoFreq and pindel variants will be imported into the DB.", "checkbox", new JSONObject()
153+
{{
154+
155+
}}, false),
156+
ToolParameterDescriptor.create("dbImportThreshold", "Variant Import AF Threshold", "If DB import is selected, variants above this AF threshold will be imported.", "ldk-numberfield", new JSONObject()
157+
{{
158+
159+
}}, 0.1),
160+
ToolParameterDescriptor.create("dbImportDepthThreshold", "Variant Import Depth Threshold", "If DB import is selected, variants in a site with coverage greater than or equal to this value will be imported.", "ldk-integerfield", new JSONObject()
161+
{{
162+
163+
}}, 20)
164+
), PageFlowUtil.set("sequenceanalysis/field/GenomeFileSelectorField.js"), "http://csb5.github.io/lofreq/");
151165
}
152166

153167

@@ -407,8 +421,8 @@ public Output performAnalysisPerSampleRemote(Readset rs, File inputBam, Referenc
407421
int totalIndelGTThreshold = 0;
408422
int totalConsensusInPBS= 0;
409423

410-
File loFreqConsensusVcf = new File(outputDir, FileUtil.getBaseName(inputBam) + ".lofreq.consensus.vcf.gz");
411-
File loFreqAllVcf = new File(outputDir, FileUtil.getBaseName(inputBam) + ".lofreq.all.vcf.gz");
424+
File loFreqConsensusVcf = getConsensusVcf(outputDir, inputBam);
425+
File loFreqAllVcf = getAllVcf(outputDir, inputBam);
412426
Double strandBiasRecoveryAF = getProvider().getParameterByName("strandBiasRecoveryAF").extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), Double.class, 1.0);
413427
SAMSequenceDictionary dict = SAMSequenceDictionaryExtractor.extractDictionary(referenceGenome.getSequenceDictionary().toPath());
414428
VariantContextWriterBuilder writerBuilder1 = new VariantContextWriterBuilder().setOutputFile(loFreqConsensusVcf).setReferenceDictionary(dict);
@@ -591,7 +605,7 @@ public Output performAnalysisPerSampleRemote(Readset rs, File inputBam, Referenc
591605
int minDepth = getProvider().getParameterByName("minDepth").extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), Integer.class, 0);
592606
int minInsertSize = getProvider().getParameterByName("minInsertSize").extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), Integer.class, 0);
593607

594-
boolean runPangolin = getProvider().getParameterByName("runPangolin").extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), Boolean.class, false);
608+
boolean runPangolinAndNextClade = getProvider().getParameterByName("runPangolin").extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), Boolean.class, false);
595609
boolean runPindel = getProvider().getParameterByName("runPindel").extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), Boolean.class, false);
596610

597611
if (runPindel)
@@ -600,10 +614,13 @@ public Output performAnalysisPerSampleRemote(Readset rs, File inputBam, Referenc
600614
}
601615

602616
String[] pangolinData = null;
603-
if (runPangolin)
617+
if (runPangolinAndNextClade)
604618
{
605619
PangolinHandler.updatePangolinRefs(getPipelineCtx().getLogger());
606620
pangolinData = PangolinHandler.runPangolin(consensusFastaLoFreq, getPipelineCtx().getLogger(), output);
621+
622+
File json = NextCladeHandler.runNextClade(consensusFastaLoFreq, getPipelineCtx().getLogger(), output, outputDir);
623+
output.addSequenceOutput(json, "Nextclade: " + rs.getName(), "NextClade JSON", rs.getReadsetId(), null, referenceGenome.getGenomeId(), null);
607624
}
608625

609626
//write metrics:
@@ -640,6 +657,11 @@ public Output performAnalysisPerSampleRemote(Readset rs, File inputBam, Referenc
640657
return output;
641658
}
642659

660+
private File getAllVcf(File outputDir, File inputBam)
661+
{
662+
return new File(outputDir, FileUtil.getBaseName(inputBam) + ".lofreq.all.vcf.gz");
663+
}
664+
643665
private Set<String> runBcftools(File inputBam, ReferenceGenome referenceGenome, File mask, int minCoverage) throws PipelineJobException
644666
{
645667
Set<String> variantsBcftools = new HashSet<>();
@@ -684,9 +706,19 @@ private Set<String> runBcftools(File inputBam, ReferenceGenome referenceGenome,
684706
return variantsBcftools;
685707
}
686708

709+
private File getConsensusVcf(File outputDir, File inputBam)
710+
{
711+
return new File(outputDir, FileUtil.getBaseName(inputBam) + ".lofreq.consensus.vcf.gz");
712+
}
713+
714+
private File getConsensusFasta(File loFreqConsensusVcf)
715+
{
716+
return new File(loFreqConsensusVcf.getParentFile(), SequenceAnalysisService.get().getUnzippedBaseName(loFreqConsensusVcf.getName()) + ".fasta");
717+
}
718+
687719
private File generateConsensus(File loFreqConsensusVcf, File fasta, File maskBed) throws PipelineJobException
688720
{
689-
File ret = new File(loFreqConsensusVcf.getParentFile(), SequenceAnalysisService.get().getUnzippedBaseName(loFreqConsensusVcf.getName()) + ".fasta");
721+
File ret = getConsensusFasta(loFreqConsensusVcf);
690722
List<String> args = new ArrayList<>();
691723

692724
args.add(SequencePipelineService.get().getExeForPackage("BCFTOOLS", "bcftools").getPath());
@@ -769,8 +801,6 @@ private File getMetricsFile(File outDir)
769801
@Override
770802
public Output performAnalysisPerSampleLocal(AnalysisModel model, File inputBam, File referenceFasta, File outDir) throws PipelineJobException
771803
{
772-
Map<String, String> valueMap = new HashMap<>();
773-
774804
File metrics = getMetricsFile(outDir);
775805
if (metrics.exists())
776806
{
@@ -819,8 +849,6 @@ public Output performAnalysisPerSampleLocal(AnalysisModel model, File inputBam,
819849

820850
Table.insert(getPipelineCtx().getJob().getUser(), ti, r);
821851
total++;
822-
823-
valueMap.put(line[1], value);
824852
}
825853

826854
getPipelineCtx().getJob().getLogger().info("total metrics: " + total);
@@ -835,9 +863,121 @@ public Output performAnalysisPerSampleLocal(AnalysisModel model, File inputBam,
835863
throw new PipelineJobException("Unable to find metrics file: " + metrics.getPath());
836864
}
837865

866+
boolean dbImport = getProvider().getParameterByName("dbImport").extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), Boolean.class, false);
867+
if (dbImport)
868+
{
869+
importNtSnps(model, inputBam, outDir);
870+
}
871+
else
872+
{
873+
getPipelineCtx().getLogger().info("NT SNP DB Import not selected");
874+
}
875+
876+
boolean runPangolinAndNextClade = getProvider().getParameterByName("runPangolin").extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), Boolean.class, false);
877+
if (runPangolinAndNextClade)
878+
{
879+
//Find the NextClade json:
880+
File jsonFile = NextCladeHandler.getJsonFile(outDir, getConsensusFasta(getConsensusVcf(outDir, inputBam)));
881+
if (!jsonFile.exists())
882+
{
883+
throw new PipelineJobException("Unable to find NextClade JSON record: " + jsonFile.getPath());
884+
}
885+
886+
File vcf = getAllVcf(outDir, inputBam);
887+
if (!vcf.exists())
888+
{
889+
throw new PipelineJobException("Unable to find LoFreq VCF: " + vcf.getPath());
890+
}
891+
892+
NextCladeHandler.processAndImportNextCladeAa(getPipelineCtx().getJob(), jsonFile, model.getRowId(), model.getLibraryId(), model.getAlignmentFile(), model.getReadset(), vcf, dbImport);
893+
}
894+
else
895+
{
896+
getPipelineCtx().getLogger().info("NextClade was not run");
897+
}
898+
838899
return null;
839900
}
840901

902+
private void importNtSnps(AnalysisModel model, File inputBam, File outDir) throws PipelineJobException
903+
{
904+
getPipelineCtx().getLogger().info("Importing variants into DB");
905+
906+
File vcf = getAllVcf(outDir, inputBam);
907+
if (!vcf.exists())
908+
{
909+
throw new PipelineJobException("Unable to find file: " + vcf.getPath());
910+
}
911+
912+
ReferenceGenome genome = SequenceAnalysisService.get().getReferenceGenome(model.getLibraryId(), getPipelineCtx().getJob().getUser());
913+
ReferenceLibraryHelperImpl helper = new ReferenceLibraryHelperImpl(genome.getWorkingFastaFile(), getPipelineCtx().getLogger());
914+
915+
ViralSnpUtil.deleteExistingValues(getPipelineCtx().getJob(), model.getAnalysisId(), SequenceAnalysisSchema.TABLE_NT_SNP_BY_POS, null);
916+
Double afThreshold = getProvider().getParameterByName("dbImportThreshold").extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), Double.class, 0.0);
917+
Integer depthTheshold = getProvider().getParameterByName("dbImportDepthThreshold").extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), Integer.class, 0);
918+
919+
TableInfo ti = SequenceAnalysisSchema.getTable(SequenceAnalysisSchema.TABLE_NT_SNP_BY_POS);
920+
921+
try (VCFFileReader reader = new VCFFileReader(vcf);CloseableIterator<VariantContext> it = reader.iterator())
922+
{
923+
while (it.hasNext())
924+
{
925+
VariantContext vc = it.next();
926+
927+
if (vc.isFiltered())
928+
{
929+
continue;
930+
}
931+
932+
double pct = vc.getAttributeAsDouble("AF", 0.0);
933+
if (pct < afThreshold)
934+
{
935+
continue;
936+
}
937+
938+
int depth = vc.getAttributeAsInt("GATK_DP", 0);
939+
if (depth < depthTheshold)
940+
{
941+
continue;
942+
}
943+
944+
if (vc.getAlternateAlleles().size() != 1)
945+
{
946+
throw new PipelineJobException("Expected a single ALT allele");
947+
}
948+
949+
Map<String, Object> ntRow = new HashMap<>();
950+
ntRow.put("analysis_id", model.getAnalysisId());
951+
952+
Integer refId = helper.resolveSequenceId(vc.getContig());
953+
if (refId == null)
954+
{
955+
getPipelineCtx().getLogger().error("unknown reference id: [" + vc.getContig() + "]");
956+
}
957+
958+
ntRow.put("ref_nt_id", refId);
959+
ntRow.put("ref_nt_position", vc.getStart()); //use 1-based
960+
ntRow.put("ref_nt_insert_index", 0);
961+
ntRow.put("ref_nt", vc.getReference().getBaseString());
962+
ntRow.put("q_nt", vc.getAlternateAllele(0).getBaseString());
963+
964+
List<Integer> depths = vc.getAttributeAsIntList("DP4", 0);
965+
int alleleDepth = depths.get(2) + depths.get(3);
966+
ntRow.put("readcount", alleleDepth);
967+
ntRow.put("depth", depth);
968+
ntRow.put("adj_depth", vc.getAttribute("DP"));
969+
ntRow.put("pct", vc.getAttribute("AF"));
970+
ntRow.put("container", getPipelineCtx().getJob().getContainer().getEntityId());
971+
ntRow.put("createdby", getPipelineCtx().getJob().getUser().getUserId());
972+
ntRow.put("modifiedby", getPipelineCtx().getJob().getUser().getUserId());
973+
ntRow.put("created", new Date());
974+
ntRow.put("modified", new Date());
975+
976+
Table.insert(getPipelineCtx().getJob().getUser(), ti, ntRow);
977+
}
978+
}
979+
}
980+
841981
private String getHashKey(VariantContext vc)
842982
{
843983
return vc.getContig() + "<>" + vc.getStart() + vc.getAlternateAlleles().stream().map(Allele::getBaseString).collect(Collectors.joining(";"));

0 commit comments

Comments (0)