77import org .junit .Test ;
88
99import java .util .ArrayList ;
10+ import java .util .HashSet ;
1011import java .util .LinkedHashMap ;
1112import java .util .List ;
1213import java .util .Map ;
14+ import java .util .Set ;
1315
1416public class ScatterGatherUtils
1517{
@@ -28,34 +30,43 @@ private static class ActiveIntervalSet
// State of ActiveIntervalSet: three fixed settings plus the accumulators for the job currently being built.
// Target number of bases per job. Presumably read by getBasesRemainingForInterval(), which is not shown here - confirm.
2830 private final int _optimalBasesPerJob ;
// Finished jobs, keyed "Job" + job id, in insertion order (LinkedHashMap). Populated by closeSet().
2931 private final LinkedHashMap <String , List <Interval >> _results ;
// NOTE(review): not read anywhere in the visible code; by its name it controls whether one contig may span several jobs - confirm.
3032 private final boolean _allowSplitChromosomes ;
// Limit on distinct contig names per job; the value -1 turns the limit off (see exceedsAllowableContigs()).
33+ private final int _maxContigsPerJob ;
3134
// Intervals collected for the job in progress; copied into _results and cleared by closeSet().
3235 private List <Interval > _intervalList = new ArrayList <>();
// Running total of bases in _intervalList; grows by (end - start + 1) per added interval, reset to 0 by closeSet().
3336 private int _basesPerActiveIntervalList = 0 ;
// Distinct reference (contig) names seen in the job in progress; cleared by closeSet().
37+ private Set <String > _contigsInActiveIntervalList = new HashSet <>();
// Numeric suffix for the next job key; starts at 1 and is incremented each time a non-empty set is closed.
3438 private int _activeJobId = 1 ;
3539
// Constructor: stores the three settings and starts with an empty, insertion-ordered result map.
// In this diff view the next line is the removed two-argument signature and the line after it is its
// replacement, which adds maxContigsPerJob (-1 means "no contig limit", per exceedsAllowableContigs()).
36- public ActiveIntervalSet (int optimalBasesPerJob , boolean allowSplitChromosomes )
40+ public ActiveIntervalSet (int optimalBasesPerJob , boolean allowSplitChromosomes , int maxContigsPerJob )
3741 {
3842 _optimalBasesPerJob = optimalBasesPerJob ;
3943 _results = new LinkedHashMap <>();
4044 _allowSplitChromosomes = allowSplitChromosomes ;
45+ _maxContigsPerJob = maxContigsPerJob ;
4146 }
4247
// Closes the job in progress when it is "full". After this change a job is full when either
// getBasesRemainingForInterval() reports nothing left (<= 0) or the per-job contig limit has been reached.
// Invoked after an interval has been appended (see the addInterval fragment further down).
4348 private void possiblyEndSet ()
4449 {
4550 int basesRemaining = getBasesRemainingForInterval ();
// Removed line: the old condition looked at the base budget only.
46- if (basesRemaining <= 0 )
// Added line: also end the set once the contig cap is hit.
51+ if (basesRemaining <= 0 || exceedsAllowableContigs () )
4752 {
4853 closeSet ();
4954 }
4055 }
5156
// Returns true when a contig limit is configured and the job in progress already holds that many
// distinct contig names.
// NOTE(review): despite the name, the comparison is >=, so this fires when the limit is reached, not
// exceeded. Because it is evaluated after an interval is added, that keeps each job at or below the limit.
// NOTE(review): only the exact value -1 disables the limit. For 0 or any other negative value the size
// comparison is always true, so every added interval ends its job - confirm whether callers can pass those.
57+ private boolean exceedsAllowableContigs ()
58+ {
59+ return _maxContigsPerJob != -1 && _contigsInActiveIntervalList .size () >= _maxContigsPerJob ;
60+ }
61+
// Finalizes the job in progress: records a copy of its intervals under the key "Job" + _activeJobId,
// then resets all per-job accumulators and advances the job id.
// Does nothing when no intervals are pending, so empty jobs are never recorded and ids stay consecutive.
5262 public void closeSet ()
5363 {
5464 if (!_intervalList .isEmpty ())
5565 {
// Store a copy: _intervalList itself is cleared and reused for the next job.
5666 _results .put ("Job" + _activeJobId , new ArrayList <>(_intervalList ));
5767 _intervalList .clear ();
5868 _basesPerActiveIntervalList = 0 ;
// Added line: the contig set is per-job state too and must restart empty.
69+ _contigsInActiveIntervalList .clear ();
5970 _activeJobId ++;
6071 }
6172 }
@@ -101,14 +112,15 @@ private void addInterval(String refName, int start, int end)
101112 {
102113 _intervalList .add (new Interval (refName , start , end ));
103114 _basesPerActiveIntervalList += (end - start + 1 );
115+ _contigsInActiveIntervalList .add (refName );
104116
105117 possiblyEndSet ();
106118 }
107119 }
108120
109- public static LinkedHashMap <String , List <Interval >> divideGenome (SAMSequenceDictionary dict , int optimalBasesPerJob , boolean allowSplitChromosomes )
121+ public static LinkedHashMap <String , List <Interval >> divideGenome (SAMSequenceDictionary dict , int optimalBasesPerJob , boolean allowSplitChromosomes , int maxContigsPerJob )
110122 {
111- ActiveIntervalSet ais = new ActiveIntervalSet (optimalBasesPerJob , allowSplitChromosomes );
123+ ActiveIntervalSet ais = new ActiveIntervalSet (optimalBasesPerJob , allowSplitChromosomes , maxContigsPerJob );
112124 for (SAMSequenceRecord rec : dict .getSequences ())
113125 {
114126 ais .add (rec );
@@ -141,13 +153,13 @@ private SAMSequenceDictionary getDict()
141153 public void testScatter ()
142154 {
143155 SAMSequenceDictionary dict = getDict ();
144- Map <String , List <Interval >> ret = divideGenome (dict , 1000 , true );
156+ Map <String , List <Interval >> ret = divideGenome (dict , 1000 , true , - 1 );
145157 assertEquals ("Incorrect number of jobs" , 8 , ret .size ());
146158 assertEquals ("Incorrect interval end" , 2000 , ret .get ("Job3" ).get (0 ).getEnd ());
147159 assertEquals ("Incorrect start" , 1001 , ret .get ("Job3" ).get (0 ).getStart ());
148160 assertEquals ("Incorrect interval end" , 4 , ret .get ("Job8" ).size ());
149161
150- Map <String , List <Interval >> ret2 = divideGenome (dict , 3000 , false );
162+ Map <String , List <Interval >> ret2 = divideGenome (dict , 3000 , false , - 1 );
151163 assertEquals ("Incorrect number of jobs" , 3 , ret2 .size ());
152164 for (String jobName : ret2 .keySet ())
153165 {
@@ -157,7 +169,7 @@ public void testScatter()
157169 }
158170 }
159171
160- Map <String , List <Interval >> ret3 = divideGenome (dict , 3002 , false );
172+ Map <String , List <Interval >> ret3 = divideGenome (dict , 3002 , false , - 1 );
161173 assertEquals ("Incorrect number of jobs" , 3 , ret3 .size ());
162174 for (String jobName : ret3 .keySet ())
163175 {
@@ -167,7 +179,7 @@ public void testScatter()
167179 }
168180 }
169181
170- Map <String , List <Interval >> ret4 = divideGenome (dict , 2999 , false );
182+ Map <String , List <Interval >> ret4 = divideGenome (dict , 2999 , false , - 1 );
171183 assertEquals ("Incorrect number of jobs" , 3 , ret4 .size ());
172184 for (String jobName : ret4 .keySet ())
173185 {
@@ -177,13 +189,16 @@ public void testScatter()
177189 }
178190 }
179191
180- Map <String , List <Interval >> ret5 = divideGenome (dict , 750 , true );
192+ Map <String , List <Interval >> ret5 = divideGenome (dict , 750 , true , - 1 );
181193 assertEquals ("Incorrect number of jobs" , 10 , ret5 .size ());
182194 assertEquals ("Incorrect interval end" , 1000 , ret5 .get ("Job1" ).get (0 ).getEnd ());
183195 assertEquals ("Incorrect interval end" , 4 , ret5 .get ("Job10" ).size ());
184196
185197 assertEquals ("Incorrect interval start" , 751 , ret5 .get ("Job3" ).get (0 ).getStart ());
186198 assertEquals ("Incorrect interval start" , 1501 , ret5 .get ("Job8" ).get (0 ).getStart ());
199+
200+ Map <String , List <Interval >> ret6 = divideGenome (dict , 5000 , false , 2 );
201+ assertEquals ("Incorrect number of jobs" , 5 , ret6 .size ());
187202 }
188203 }
189204}
0 commit comments