Skip to content

Commit 0b74243

Browse files
committed
Add option to limit the number of contigs per scatter/gather job
1 parent 2f911ad commit 0b74243

File tree

3 files changed

+38
-12
lines changed

3 files changed

+38
-12
lines changed

SequenceAnalysis/resources/web/SequenceAnalysis/panel/VariantScatterGatherPanel.js

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,16 @@ Ext4.define('SequenceAnalysis.panel.VariantScatterGatherPanel', {
6161
inputValue: true,
6262
helpPopup: 'If true, a given chromosome/contig can be split between jobs. Otherwise chromosomes are always intact across jobs.'
6363
});
64+
65+
toAdd.push({
66+
xtype: 'ldk-integerfield',
67+
labelWidth: this.labelWidth,
68+
name: 'scatterGather.maxContigsPerJob',
69+
fieldLabel: 'Max Contigs/Job',
70+
minValue: -1,
71+
helpPopup: 'The maximum number of contigs allowed per job. Leave blank or enter -1 for no limit.',
72+
value: 200
73+
});
6474
}
6575
else if (val === 'fixedJobs') {
6676
toAdd.push({

SequenceAnalysis/src/org/labkey/sequenceanalysis/pipeline/VariantProcessingJob.java

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -108,10 +108,11 @@ else if (_scatterGatherMethod == ScatterGatherUtils.ScatterGatherMethod.chunked)
108108
{
109109
int basesPerJob = getParameterJson().getInt("scatterGather.basesPerJob");
110110
boolean allowSplitChromosomes = getParameterJson().optBoolean("scatterGather.allowSplitChromosomes", true);
111-
getLogger().info("Creating jobs with target bp size: " + basesPerJob + " mbp. allow splitting configs: " + allowSplitChromosomes);
111+
int maxContigsPerJob = getParameterJson().optInt("scatterGather.maxContigsPerJob", -1);
112+
getLogger().info("Creating jobs with target bp size: " + basesPerJob + " mbp. allow splitting configs: " + allowSplitChromosomes + ", max contigs per job: " + maxContigsPerJob);
112113

113114
basesPerJob = basesPerJob * 1000000;
114-
ret = ScatterGatherUtils.divideGenome(dict, basesPerJob, allowSplitChromosomes);
115+
ret = ScatterGatherUtils.divideGenome(dict, basesPerJob, allowSplitChromosomes, maxContigsPerJob);
115116

116117
}
117118
else if (_scatterGatherMethod == ScatterGatherUtils.ScatterGatherMethod.fixedJobs)
@@ -120,7 +121,7 @@ else if (_scatterGatherMethod == ScatterGatherUtils.ScatterGatherMethod.fixedJob
120121
int numJobs = getParameterJson().getInt("scatterGather.totalJobs");
121122
int jobSize = (int)Math.ceil(totalSize / (double)numJobs);
122123
getLogger().info("Creating " + numJobs + " jobs with approximate size: " + jobSize + " bp.");
123-
ret = ScatterGatherUtils.divideGenome(dict, jobSize, true);
124+
ret = ScatterGatherUtils.divideGenome(dict, jobSize, true, -1);
124125
}
125126
else
126127
{

SequenceAnalysis/src/org/labkey/sequenceanalysis/util/ScatterGatherUtils.java

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,11 @@
77
import org.junit.Test;
88

99
import java.util.ArrayList;
10+
import java.util.HashSet;
1011
import java.util.LinkedHashMap;
1112
import java.util.List;
1213
import java.util.Map;
14+
import java.util.Set;
1315

1416
public class ScatterGatherUtils
1517
{
@@ -28,34 +30,43 @@ private static class ActiveIntervalSet
2830
private final int _optimalBasesPerJob;
2931
private final LinkedHashMap<String, List<Interval>> _results;
3032
private final boolean _allowSplitChromosomes;
33+
private final int _maxContigsPerJob;
3134

3235
private List<Interval> _intervalList = new ArrayList<>();
3336
private int _basesPerActiveIntervalList = 0;
37+
private Set<String> _contigsInActiveIntervalList = new HashSet<>();
3438
private int _activeJobId = 1;
3539

36-
/**
 * Creates an empty accumulator for building per-job interval lists.
 * (Diff view: the two-argument signature below is the removed line; the three-argument
 * signature is its replacement.)
 *
 * @param optimalBasesPerJob stored as-is; compared against the running base count elsewhere in the class
 * @param allowSplitChromosomes stored as-is; consulted elsewhere in the class
 * @param maxContigsPerJob cap on distinct contigs per job; -1 disables the cap (see exceedsAllowableContigs)
 */
public ActiveIntervalSet(int optimalBasesPerJob, boolean allowSplitChromosomes)
40+
public ActiveIntervalSet(int optimalBasesPerJob, boolean allowSplitChromosomes, int maxContigsPerJob)
3741
{
3842
_optimalBasesPerJob = optimalBasesPerJob;
3943
_results = new LinkedHashMap<>();
4044
_allowSplitChromosomes = allowSplitChromosomes;
45+
_maxContigsPerJob = maxContigsPerJob;
4146
}
4247

4348
/**
 * Closes the active interval set when it has no bases remaining or, as of this commit,
 * when it already holds the maximum allowed number of distinct contigs.
 * (Diff view: the first {@code if} line below is the removed condition, the second is its replacement.)
 */
private void possiblyEndSet()
4449
{
4550
int basesRemaining = getBasesRemainingForInterval();
46-
if (basesRemaining <= 0)
51+
if (basesRemaining <= 0 || exceedsAllowableContigs())
4752
{
4853
closeSet();
4954
}
5055
}
5156

57+
/**
 * Reports whether the active set has reached the per-job contig cap.
 *
 * @return true when a cap is configured (any value other than -1) and the number of distinct
 *         contig names in the active set is greater than or equal to that cap
 */
// NOTE(review): only -1 is treated as "no limit". A value of 0 (or any other negative) makes
// size() >= _maxContigsPerJob always true, so every added interval would close the set - confirm
// whether 0 should also mean "no limit".
private boolean exceedsAllowableContigs()
58+
{
59+
return _maxContigsPerJob != -1 && _contigsInActiveIntervalList.size() >= _maxContigsPerJob;
60+
}
61+
5262
/**
 * Finalizes the active interval set as a job, if it contains any intervals.
 * A copy of the interval list is stored in the results under the key "Job" + the current job id;
 * then the list, the running base count and the contig-name set are reset and the job id is
 * incremented. Does nothing when the active list is empty.
 */
public void closeSet()
5363
{
5464
if (!_intervalList.isEmpty())
5565
{
5666
// copy the list, since _intervalList is cleared and reused for the next job
_results.put("Job" + _activeJobId, new ArrayList<>(_intervalList));
5767
_intervalList.clear();
5868
_basesPerActiveIntervalList = 0;
69+
_contigsInActiveIntervalList.clear();
5970
_activeJobId++;
6071
}
6172
}
@@ -101,14 +112,15 @@ private void addInterval(String refName, int start, int end)
101112
{
102113
_intervalList.add(new Interval(refName, start, end));
103114
_basesPerActiveIntervalList += (end - start + 1);
115+
_contigsInActiveIntervalList.add(refName);
104116

105117
possiblyEndSet();
106118
}
107119
}
108120

109-
public static LinkedHashMap<String, List<Interval>> divideGenome(SAMSequenceDictionary dict, int optimalBasesPerJob, boolean allowSplitChromosomes)
121+
public static LinkedHashMap<String, List<Interval>> divideGenome(SAMSequenceDictionary dict, int optimalBasesPerJob, boolean allowSplitChromosomes, int maxContigsPerJob)
110122
{
111-
ActiveIntervalSet ais = new ActiveIntervalSet(optimalBasesPerJob, allowSplitChromosomes);
123+
ActiveIntervalSet ais = new ActiveIntervalSet(optimalBasesPerJob, allowSplitChromosomes, maxContigsPerJob);
112124
for (SAMSequenceRecord rec : dict.getSequences())
113125
{
114126
ais.add(rec);
@@ -141,13 +153,13 @@ private SAMSequenceDictionary getDict()
141153
public void testScatter()
142154
{
143155
SAMSequenceDictionary dict = getDict();
144-
Map<String, List<Interval>> ret = divideGenome(dict, 1000, true);
156+
Map<String, List<Interval>> ret = divideGenome(dict, 1000, true, -1);
145157
assertEquals("Incorrect number of jobs", 8, ret.size());
146158
assertEquals("Incorrect interval end", 2000, ret.get("Job3").get(0).getEnd());
147159
assertEquals("Incorrect start", 1001, ret.get("Job3").get(0).getStart());
148160
assertEquals("Incorrect interval end", 4, ret.get("Job8").size());
149161

150-
Map<String, List<Interval>> ret2 = divideGenome(dict, 3000, false);
162+
Map<String, List<Interval>> ret2 = divideGenome(dict, 3000, false, -1);
151163
assertEquals("Incorrect number of jobs", 3, ret2.size());
152164
for (String jobName : ret2.keySet())
153165
{
@@ -157,7 +169,7 @@ public void testScatter()
157169
}
158170
}
159171

160-
Map<String, List<Interval>> ret3 = divideGenome(dict, 3002, false);
172+
Map<String, List<Interval>> ret3 = divideGenome(dict, 3002, false, -1);
161173
assertEquals("Incorrect number of jobs", 3, ret3.size());
162174
for (String jobName : ret3.keySet())
163175
{
@@ -167,7 +179,7 @@ public void testScatter()
167179
}
168180
}
169181

170-
Map<String, List<Interval>> ret4 = divideGenome(dict, 2999, false);
182+
Map<String, List<Interval>> ret4 = divideGenome(dict, 2999, false, -1);
171183
assertEquals("Incorrect number of jobs", 3, ret4.size());
172184
for (String jobName : ret4.keySet())
173185
{
@@ -177,13 +189,16 @@ public void testScatter()
177189
}
178190
}
179191

180-
Map<String, List<Interval>> ret5 = divideGenome(dict, 750, true);
192+
Map<String, List<Interval>> ret5 = divideGenome(dict, 750, true, -1);
181193
assertEquals("Incorrect number of jobs", 10, ret5.size());
182194
assertEquals("Incorrect interval end", 1000, ret5.get("Job1").get(0).getEnd());
183195
assertEquals("Incorrect interval end", 4, ret5.get("Job10").size());
184196

185197
assertEquals("Incorrect interval start", 751, ret5.get("Job3").get(0).getStart());
186198
assertEquals("Incorrect interval start", 1501, ret5.get("Job8").get(0).getStart());
199+
200+
Map<String, List<Interval>> ret6 = divideGenome(dict, 5000, false, 2);
201+
assertEquals("Incorrect number of jobs", 5, ret6.size());
187202
}
188203
}
189204
}

0 commit comments

Comments
 (0)