22
33import au .com .bytecode .opencsv .CSVReader ;
44import au .com .bytecode .opencsv .CSVWriter ;
5+ import com .google .common .collect .Lists ;
56import htsjdk .samtools .util .CloseableIterator ;
67import htsjdk .samtools .util .Interval ;
78import htsjdk .variant .utils .SAMSequenceDictionaryExtractor ;
@@ -250,46 +251,22 @@ private Map<String, String> getSamplesToAlias(File input) throws PipelineJobExce
250251 return Collections .emptyMap ();
251252 }
252253
253- TableInfo ti = QueryService .get ().getUserSchema (getPipelineCtx ().getJob ().getUser (), (getPipelineCtx ().getJob ().getContainer ().isWorkbook () ? getPipelineCtx ().getJob ().getContainer ().getParent () : getPipelineCtx ().getJob ().getContainer ()), mGAPSchema .NAME ).getTable (mGAPSchema .TABLE_ANIMAL_MAPPING );
254- TableSelector ts = new TableSelector (ti , PageFlowUtil .set ("subjectname" , "externalAlias" , "otherNames" ), new SimpleFilter (FieldKey .fromString ("subjectname" ), subjects , CompareType .IN ), null );
255- ts .forEachResults (new Selector .ForEachBlock <Results >()
256- {
257- @ Override
258- public void exec (Results rs ) throws SQLException
259- {
260- sampleNameMap .put (rs .getString (FieldKey .fromString ("subjectname" )), rs .getString (FieldKey .fromString ("externalAlias" )));
261-
262- if (rs .getObject (FieldKey .fromString ("otherNames" )) != null )
263- {
264- String val = StringUtils .trimToNull (rs .getString (FieldKey .fromString ("otherNames" )));
265- if (val != null )
266- {
267- String [] tokens = val .split ("," );
268- for (String name : tokens )
269- {
270- name = StringUtils .trimToNull (name );
271- if (name == null )
272- {
273- continue ;
274- }
275-
276- if (sampleNameMap .containsKey (name ) && !sampleNameMap .get (name ).equals (rs .getString (FieldKey .fromString ("externalAlias" ))))
277- {
278- throw new IllegalStateException ("Improper data in mgap.aliases table. Dual/conflicting aliases: " + name + ": " + rs .getString (FieldKey .fromString ("externalAlias" )) + " / " + sampleNameMap .get (name ));
279- }
280-
281- sampleNameMap .put (name , rs .getString (FieldKey .fromString ("externalAlias" )));
282- }
283- }
284- }
285- }
286- });
287-
288254 Set <String > sampleNames = new HashSet <>(header .getSampleNamesInOrder ());
289255 getPipelineCtx ().getLogger ().info ("total samples in input VCF: " + sampleNames .size ());
290256
291- sampleNames .retainAll (subjects );
292- getPipelineCtx ().getLogger ().info ("total samples to be written to any track: " + sampleNames .size ());
257+ // Pass 1: match on proper ID:
258+ querySampleBatch (sampleNameMap , new SimpleFilter (FieldKey .fromString ("subjectname" ), subjects , CompareType .IN ));
259+
260+ // Pass 2: add others using otherNames:
261+ List <String > missingSamples = new ArrayList <>(sampleNames );
262+ missingSamples .removeAll (sampleNameMap .keySet ());
263+ if (!missingSamples .isEmpty ())
264+ {
265+ getPipelineCtx ().getLogger ().debug ("Querying " + missingSamples .size () + " samples using otherNames field" );
266+ querySampleBatch (sampleNameMap , new SimpleFilter (FieldKey .fromString ("otherNames" ), missingSamples , CompareType .CONTAINS_ONE_OF ));
267+ }
268+
269+ getPipelineCtx ().getLogger ().info ("total sample names to alias: " + sampleNameMap .size ());
293270
294271 sampleNames .removeAll (sampleNameMap .keySet ());
295272 if (!sampleNames .isEmpty ())
@@ -298,17 +275,54 @@ public void exec(Results rs) throws SQLException
298275 }
299276
300277 //Now ensure we dont have duplicate mappings:
301- List <String > translated = new ArrayList <>(sampleNames . stream ().map (sampleNameMap ::get ).collect ( Collectors . toList () ));
278+ List <String > translated = new ArrayList <>(header . getSampleNamesInOrder (). stream ().map (sampleNameMap ::get ).toList ());
302279 Set <String > unique = new HashSet <>();
303- List <String > duplicates = translated .stream ().filter (o -> !unique .add (o )).collect ( Collectors . toList () );
280+ List <String > duplicates = translated .stream ().filter (o -> !unique .add (o )).toList ();
304281 if (!duplicates .isEmpty ())
305282 {
306283 throw new PipelineJobException ("There were duplicate mGAP IDs are translation. They were: " + StringUtils .join (duplicates , "," ));
307284 }
308285 }
309286
310- getPipelineCtx ().getLogger ().info ("total sample names to alias: " + sampleNameMap .size ());
311-
312287 return sampleNameMap ;
313288 }
289+
290+ private void querySampleBatch (Map <String , String > sampleNameMap , SimpleFilter filter )
291+ {
292+ TableInfo ti = QueryService .get ().getUserSchema (getPipelineCtx ().getJob ().getUser (), (getPipelineCtx ().getJob ().getContainer ().isWorkbook () ? getPipelineCtx ().getJob ().getContainer ().getParent () : getPipelineCtx ().getJob ().getContainer ()), mGAPSchema .NAME ).getTable (mGAPSchema .TABLE_ANIMAL_MAPPING );
293+ TableSelector ts = new TableSelector (ti , PageFlowUtil .set ("subjectname" , "externalAlias" , "otherNames" ), new SimpleFilter (filter ), null );
294+ ts .forEachResults (new Selector .ForEachBlock <Results >()
295+ {
296+ @ Override
297+ public void exec (Results rs ) throws SQLException
298+ {
299+ sampleNameMap .put (rs .getString (FieldKey .fromString ("subjectname" )), rs .getString (FieldKey .fromString ("externalAlias" )));
300+
301+ if (rs .getObject (FieldKey .fromString ("otherNames" )) != null )
302+ {
303+ String val = StringUtils .trimToNull (rs .getString (FieldKey .fromString ("otherNames" )));
304+ if (val != null )
305+ {
306+ String [] tokens = val .split ("," );
307+ for (String name : tokens )
308+ {
309+ name = StringUtils .trimToNull (name );
310+ if (name == null )
311+ {
312+ continue ;
313+ }
314+
315+ if (sampleNameMap .containsKey (name ) && !sampleNameMap .get (name ).equals (rs .getString (FieldKey .fromString ("externalAlias" ))))
316+ {
317+ throw new IllegalStateException ("Improper data in mgap.aliases table. Dual/conflicting aliases: " + name + ": " + rs .getString (FieldKey .fromString ("externalAlias" )) + " / " + sampleNameMap .get (name ));
318+ }
319+
320+ getPipelineCtx ().getLogger ().debug ("Adding otherName: " + name );
321+ sampleNameMap .put (name , rs .getString (FieldKey .fromString ("externalAlias" )));
322+ }
323+ }
324+ }
325+ }
326+ });
327+ }
314328}
0 commit comments