@@ -90,10 +90,13 @@ public static CellHashingServiceImpl get()
9090 return _instance ;
9191 }
9292
93+ private static final String TCR_FIELD = "tcrReadsetId";
94+
9395 @ Override
94- public void prepareHashingForVdjIfNeeded (File sourceDir , PipelineJob job , SequenceAnalysisJobSupport support , String filterField , final boolean failIfNoHashingReadset ) throws PipelineJobException
96+ public void prepareHashingForVdjIfNeeded (SequenceOutputHandler . JobContext ctx , final boolean failIfNoHashingReadset ) throws PipelineJobException
9597 {
96- prepareHashingAndCiteSeqFilesIfNeeded (sourceDir , job , support , filterField , failIfNoHashingReadset , false , true , true , false , false );
98+ boolean useDemEM = ctx .getParams ().optString ("methods" , "" ).contains (CellHashingService .CALLING_METHOD .demuxem .name ());
99+ prepareHashingAndCiteSeqFilesIfNeeded (ctx .getOutputDir (), ctx .getJob (), ctx .getSequenceSupport (), TCR_FIELD , failIfNoHashingReadset , false , true , true , false , useDemEM );
97100 }
98101
99102 public void prepareHashingAndCiteSeqFilesForFeatureCountsIfNeeded (File sourceDir , PipelineJob job , SequenceAnalysisJobSupport support , String filterField , final boolean failIfNoHashingReadset , final boolean failIfNoCiteSeqReadset ) throws PipelineJobException
@@ -133,6 +136,7 @@ public void prepareHashingAndCiteSeqFilesIfNeeded(File sourceDir, PipelineJob jo
133136 HashMap <String , Integer > gexReadsetToH5Map = new HashMap <>();
134137 HashMap <Integer , Integer > readsetToHashingMap = new HashMap <>();
135138 HashMap <Integer , Integer > readsetToCiteSeqMap = new HashMap <>();
139+ HashMap <Integer , Integer > readsetToGexMap = new HashMap <>();
136140 HashMap <Integer , Set <String >> gexToPanels = new HashMap <>();
137141
138142 List <Readset > cachedReadsets = support .getCachedReadsets ();
@@ -227,10 +231,13 @@ public void prepareHashingAndCiteSeqFilesIfNeeded(File sourceDir, PipelineJob jo
227231 readsetToCiteSeqMap .put (rs .getReadsetId (), results .getInt (FieldKey .fromString ("citeseqReadsetId" )));
228232 }
229233 }
234+
235+ readsetToGexMap .put (rs .getReadsetId (), results .getInt (FieldKey .fromString ("readsetId" )));
230236 });
231237
232238 job .getLogger ().debug ("total readset to hashing pairs: " + readsetToHashingMap .size ());
233239 job .getLogger ().debug ("total readset to cite-seq pairs: " + readsetToCiteSeqMap .size ());
240+ job .getLogger ().debug ("total readset to GEX pairs: " + readsetToGexMap .size ());
234241
235242 if (hasError .get ())
236243 {
@@ -247,42 +254,7 @@ public void prepareHashingAndCiteSeqFilesIfNeeded(File sourceDir, PipelineJob jo
247254
248255 if (doH5Caching )
249256 {
250- job .getLogger ().debug ("Caching H5 Files" );
251- TableInfo ti = QueryService .get ().getUserSchema (job .getUser (), job .getContainer ().isWorkbook () ? job .getContainer ().getParent () : job .getContainer (), SingleCellSchema .SEQUENCE_SCHEMA_NAME ).getTable ("outputfiles" );
252- Set <Integer > cachedGenomes = support .getCachedGenomes ().stream ().map (ReferenceGenome ::getGenomeId ).collect (Collectors .toSet ());
253- for (int readsetId : uniqueGex )
254- {
255- for (int genomeId : cachedGenomes )
256- {
257- SimpleFilter filter = new SimpleFilter (FieldKey .fromString ("readset" ), readsetId , CompareType .EQUAL );
258- filter .addCondition (FieldKey .fromString ("library_id" ), genomeId , CompareType .EQUAL );
259- filter .addCondition (FieldKey .fromString ("category" ), LOUPE_CATEGORY , CompareType .EQUAL );
260-
261- TableSelector ts = new TableSelector (ti , PageFlowUtil .set ("dataid" ), filter , new org .labkey .api .data .Sort ("-rowid" ));
262- if (ts .exists ())
263- {
264- List <Integer > dataIds = ts .getArrayList (Integer .class );
265- int dataId = dataIds .get (0 );
266- if (dataIds .size () > 1 )
267- {
268- job .getLogger ().info ("More than one loupe file found for readset " + readsetId + " with genome: " + genomeId + ". Using the most recent: " + dataId );
269- }
270-
271- ExpData d = ExperimentService .get ().getExpData (dataId );
272- if (d == null )
273- {
274- throw new PipelineJobException ("Unable to find exp data: " + dataId );
275- }
276-
277- support .cacheExpData (d );
278- gexReadsetToH5Map .put (readsetId + "-" + genomeId , dataId );
279- }
280- else
281- {
282- job .getLogger ().warn ("Unable to find loupe file for readset: " + readsetId + " with genome: " + genomeId );
283- }
284- }
285- }
257+ gexReadsetToH5Map .putAll (cacheH5Files (job , support , uniqueGex , readsetToGexMap ));
286258 }
287259
288260 // if distinct HTOs is 1, no point in running hashing. note: presence of hashing readsets is a trigger downstream
@@ -443,6 +415,85 @@ else if (distinctHTOs.size() == 1)
443415 }
444416 }
445417
418+ private Map <String , Integer > cacheH5Files (PipelineJob job , SequenceAnalysisJobSupport support , Collection <Integer > uniqueGex , Map <Integer , Integer > readsetToGexMap ) throws PipelineJobException
419+ {
420+ job .getLogger ().debug ("Caching H5 Files" );
421+ Map <String , Integer > gexReadsetToH5Map = new HashMap <>();
422+
423+ TableInfo ti = QueryService .get ().getUserSchema (job .getUser (), job .getContainer ().isWorkbook () ? job .getContainer ().getParent () : job .getContainer (), SingleCellSchema .SEQUENCE_SCHEMA_NAME ).getTable ("outputfiles" );
424+ Set <Integer > cachedGenomes = support .getCachedGenomes ().stream ().map (ReferenceGenome ::getGenomeId ).collect (Collectors .toSet ());
425+
426+ for (int readsetId : readsetToGexMap .keySet ())
427+ {
428+ boolean isGEX = uniqueGex .contains (readsetId );
429+ int gexReadset = readsetToGexMap .get (readsetId );
430+
431+ SimpleFilter filter = new SimpleFilter (FieldKey .fromString ("readset" ), gexReadset , CompareType .EQUAL );
432+ filter .addCondition (FieldKey .fromString ("category" ), LOUPE_CATEGORY , CompareType .EQUAL );
433+
434+ int gexGenomeId ;
435+ if (isGEX )
436+ {
437+ if (cachedGenomes .size () > 1 )
438+ {
439+ throw new PipelineJobException ("demuxEM was selected, but more than one cached genome found, cannot infer correct genome. Found: " + StringUtils .join (cachedGenomes , ", " ));
440+ }
441+
442+ gexGenomeId = cachedGenomes .iterator ().next ();
443+ filter .addCondition (FieldKey .fromString ("library_id" ), gexGenomeId , CompareType .EQUAL );
444+ }
445+ else
446+ {
447+ job .getLogger ().debug ("Readset is not GEX, attempting to infer the loupe file genome" );
448+ HashSet <Integer > genomeIds = new HashSet <>(new TableSelector (ti , PageFlowUtil .set ("library_id" ), filter , new org .labkey .api .data .Sort ("-rowid" )).getArrayList (Integer .class ));
449+ if (genomeIds .isEmpty ())
450+ {
451+ throw new PipelineJobException ("demuxEM was selected, but no suitable loupe files were found for GEX readset: " + gexReadset );
452+ }
453+ else if (genomeIds .size () > 1 )
454+ {
455+ throw new PipelineJobException("demuxEM was selected. Attempting to identify loupe files using GEX readset: " + gexReadset + ", but more than one genome found. Found: " + StringUtils.join(genomeIds, ", "));
456+ }
457+
458+ gexGenomeId = genomeIds .iterator ().next ();
459+ filter .addCondition (FieldKey .fromString ("library_id" ), gexGenomeId , CompareType .EQUAL );
460+ }
461+
462+ TableSelector ts = new TableSelector (ti , PageFlowUtil .set ("dataid" ), filter , new org .labkey .api .data .Sort ("-rowid" ));
463+ if (ts .exists ())
464+ {
465+ List <Integer > dataIds = ts .getArrayList (Integer .class );
466+ int dataId = dataIds .get (0 );
467+ if (dataIds .size () > 1 )
468+ {
469+ job .getLogger ().info ("More than one loupe file found for GEX readset " + gexReadset + " with genome: " + gexGenomeId + ". Using the most recent: " + dataId );
470+ }
471+
472+ ExpData d = ExperimentService .get ().getExpData (dataId );
473+ if (d == null )
474+ {
475+ throw new PipelineJobException ("Unable to find exp data: " + dataId );
476+ }
477+
478+ support .cacheExpData (d );
479+
480+ if (cachedGenomes .size () > 1 )
481+ {
482+ throw new PipelineJobException ("demuxEM was selected, but more than one cached genome found, cannot infer correct genome. Found: " + StringUtils .join (cachedGenomes , ", " ));
483+ }
484+
485+ // NOTE: cache this using the source file's genome ID (which might be the TCR library), rather than the GEX genome
486+ gexReadsetToH5Map .put (readsetId + "-" + cachedGenomes .iterator ().next (), dataId );
487+ }
488+ else
489+ {
490+ job .getLogger ().warn ("Unable to find loupe file for GEX readset: " + gexReadset + " with genome: " + gexGenomeId );
491+ }
492+ }
493+
494+ return gexReadsetToH5Map ;
495+ }
496+
446497 public File getValidCiteSeqBarcodeFile (File sourceDir , int gexReadsetId )
447498 {
448499 return new File (sourceDir , "validADTS." + gexReadsetId + ".csv" );
0 commit comments