4848import java .sql .SQLException ;
4949import java .util .ArrayList ;
5050import java .util .Arrays ;
51+ import java .util .Collection ;
5152import java .util .Collections ;
5253import java .util .Comparator ;
5354import java .util .HashMap ;
5758import java .util .Set ;
5859import java .util .TreeSet ;
5960import java .util .stream .Collectors ;
61+ import java .util .stream .Stream ;
6062
6163/**
6264 * User: bimber
@@ -496,9 +498,10 @@ private boolean doCollapse(Logger log)
496498 AlignmentGroup g1 = it .next ();
497499 while (it .hasNext ())
498500 {
501+ int orig = g1 .alleles .size ();
499502 if (compareGroupToOthers (g1 ))
500503 {
501- log .info ("Collapsed: " + g1 .lineages + ", with : " + g1 .alleles .size ());
504+ log .info ("Collapsed: " + g1 .lineages + ", from : " + orig + " to " + g1 .alleles .size () + " alleles" );
502505 return true ; // abort and restart the process with a new list iterator
503506 }
504507
@@ -563,14 +566,36 @@ public boolean canCombine(AlignmentGroup g2)
563566 return false ;
564567 }
565568
566- return CollectionUtils .disjunction (this .alleles , g2 .alleles ).size () == 1 ;
569+ // Allow greater level of collapse with highly ambiguous results:
570+ // Require similar sizes and mostly-overlapping allele sets, i.e. a small symmetric difference (e.g., A/B/D and A/C/D, but not A/B/C and A/D/E)
571+ int setDiffThreshold ;
572+ int sizeDiffThreshold ;
573+ if (this .alleles .size () >= 16 )
574+ {
575+ setDiffThreshold = 6 ;
576+ sizeDiffThreshold = 3 ;
577+ }
578+ else if (this .alleles .size () >= 8 )
579+ {
580+ setDiffThreshold = 4 ;
581+ sizeDiffThreshold = 2 ;
582+ }
583+ else
584+ {
585+ setDiffThreshold = 2 ;
586+ sizeDiffThreshold = 1 ;
587+ }
588+
589+ return Math .abs (this .alleles .size () - g2 .alleles .size ()) <= sizeDiffThreshold && CollectionUtils .disjunction (this .alleles , g2 .alleles ).size () <= setDiffThreshold ;
567590 }
568591
569592 public AlignmentGroup combine (AlignmentGroup g2 )
570593 {
571- // Take the larger allele set:
594+ // Take the union of the allele sets:
595+ TreeSet <String > allAlleles = Stream .of (this .alleles , g2 .alleles ).flatMap (Collection ::stream ).collect (Collectors .toCollection (TreeSet ::new ));
572596 if (g2 .alleles .size () > this .alleles .size ())
573597 {
598+ g2 .alleles = allAlleles ;
574599 g2 .rowIdsToDelete .addAll (this .rowIds );
575600 g2 .rowIdsToDelete .addAll (this .rowIdsToDelete );
576601 g2 .totalReads = g2 .totalReads + totalReads ;
@@ -582,6 +607,7 @@ public AlignmentGroup combine(AlignmentGroup g2)
582607 }
583608 else
584609 {
610+ this .alleles = allAlleles ;
585611 this .rowIdsToDelete .addAll (g2 .rowIds );
586612 this .rowIdsToDelete .addAll (g2 .rowIdsToDelete );
587613 this .totalReads = g2 .totalReads + totalReads ;
0 commit comments