@@ -75,7 +75,7 @@ def getKineticsDepository(FullDatabase, family, depositoryLabel):
7575
7676 return exactKinetics , approxKinetics
7777
78- def getKineticsLeaveOneOut (family ):
78+ def getKineticsLeaveOneOut (family , averaging = True ):
7979 """
8080 Performs the leave one out test on a family. It returns a dictionary of
8181 the original exact nodes and a dictionary of the new averaged nodes.
@@ -89,15 +89,52 @@ def getKineticsLeaveOneOut(family):
8989 exactKinetics = {}
9090 approxKinetics = {}
9191
92+ rootTemplate = family .getRootTemplate ()
93+
9294 for entryKey in family .rules .entries .keys ():
9395 template = family .retrieveTemplate (entryKey .split (';' ))
94- exactKinetics [entryKey ], exactKineticsEntry = family .rules .estimateKinetics (template )
96+ exactKineticsEntry = family .rules .getRule (template ) # fetch the best matched rule
97+ if exactKineticsEntry .rank == 0 :
98+ # Skip rank zero entries, because they are replaced by averaged values
99+ # during model generation
100+ continue
101+
102+ exactKineticsData = exactKineticsEntry .data
103+ if exactKineticsData .comment :
104+ if 'training' not in exactKineticsData .comment :
105+ # Averaged nodes have kinetics data comments, so skip them unless
106+ # They were created from training reactions
107+ continue
108+
109+ exactKinetics [entryKey ] = exactKineticsData
110+
111+ if averaging :
112+ # In this scheme, we remove the data fully and try to pretend the database
113+ # wants this kinetic value when we know nothing about it
114+ familyCopy = copy .deepcopy (family )
115+ familyCopy .rules .entries .pop (entryKey )
116+ familyCopy .fillKineticsRulesByAveragingUp ()
117+ approxKinetics [entryKey ], approxKineticsEntry = familyCopy .rules .estimateKinetics (template )
118+ else :
119+ # In this scheme, do not re-average the tree, just try to see if the nearest
120+ # best node provides a good estimate for the original kinetics.
121+ # This takes significanty less time to run, but is not a true validation of the data,
122+ # it is just testing our kinetics selection algorithm
123+
124+ # Skip the top node in this scheme, because nothing can predict it if removed
125+
126+ if template == rootTemplate :
127+ continue
128+
129+ originalEntry = family .rules .entries [entryKey ]
130+ family .rules .entries .pop (entryKey )
131+ approxKinetics [entryKey ], approxKineticsEntry = family .rules .estimateKinetics (template )
132+ # Re-add the data back into the original family
133+ family .rules .entries [entryKey ] = originalEntry
134+
135+
95136
96- familyCopy = copy .deepcopy (family )
97- familyCopy .rules .entries .pop (entryKey )
98- familyCopy .fillKineticsRulesByAveragingUp ()
99137
100- approxKinetics [entryKey ], approxKineticsEntry = familyCopy .rules .estimateKinetics (template )
101138
102139 return exactKinetics , approxKinetics
103140
@@ -211,20 +248,17 @@ def obtainKineticsFamilyStatistics(FullDatabase, trialDir):
211248 as it would add non-exact rules to the rule count)
212249 """
213250 allFamilyNames = FullDatabase .kinetics .families .keys ()
214- allFamilyNames .sort () # Perform test on families alphabetically
215-
216- familyCount = {}
251+ allFamilyNames .sort (key = str .lower ) # Perform test on families alphabetically
217252
218- for familyName in allFamilyNames :
219- family = FullDatabase .kinetics .families [familyName ]
220- print "Processing" , familyName + '...' , '(' + str (len (family .rules .entries )) + ' rate rules)'
221- familyCount [familyName ]= countNodes (family )
222-
223253 with open (os .path .join (trialDir , 'FamilyStatistics.csv' ), 'wb' ) as csvfile :
224254 csvwriter = csv .writer (csvfile )
225255 csvwriter .writerow (['Family' ,'Number of Rules' , 'Top Node 1' , 'Number of Groups' , 'Top Node 2' , 'Number of Groups' , 'Top Node 3' , 'Number of Groups' ])
226- for key , value in familyCount .iteritems ():
227- csvwriter .writerow (value )
256+
257+ for familyName in allFamilyNames :
258+ family = FullDatabase .kinetics .families [familyName ]
259+ print "Processing" , familyName + '...' , '(' + str (len (family .rules .entries )) + ' rate rules)'
260+ # Save to csv file
261+ csvwriter .writerow (countNodes (family ))
228262
229263
230264def compareNIST (FullDatabase , trialDir ):
@@ -240,56 +274,56 @@ def compareNIST(FullDatabase, trialDir):
240274
241275
242276 allFamilyNames = FullDatabase .kinetics .families .keys ()
243- allFamilyNames .sort () # Perform test on families alphabetically
277+ allFamilyNames .sort (key = str . lower ) # Perform test on families alphabetically
244278
245279 QDict = {}
246280
247- for familyName in allFamilyNames :
248- family = FullDatabase .kinetics .families [familyName ]
249- print "Processing" , familyName + '...' , '(' + str (len (family .rules .entries )) + ' rate rules)'
250- if len (family .rules .entries ) < 2 :
251- print ' Skipping' , familyName , ': only has one rate rule...'
252- else :
253- exactKinetics , approxKinetics = getKineticsDepository (FullDatabase , family , 'NIST' )
254-
255- #prune for exact matches only
256- keysToRemove = []
257- for key , kinetics in approxKinetics .iteritems ():
258- if not re .search ('Exact' , kinetics .comment ):
259- keysToRemove .append (key )
260-
261- for key in keysToRemove :
262- del approxKinetics [key ]
263-
264- parityData = analyzeForParity (exactKinetics , approxKinetics , cutoff = 8.0 )
265-
266- if len (parityData )< 2 :
267- print ' Skipping' , familyName , ': {} reactions were compared...' .format (len (parityData ))
268- continue
269- QDict [familyName ]= calculateQ (parityData )
270- createParityPlot (parityData )
271- plt .title (familyName )
272- plt .savefig (os .path .join (trialDir , familyName + '.png' ))
273- plt .clf ()
274-
275- if not os .path .exists (os .path .join (os .path .join (trialDir , 'ParityData' ))):
276- os .makedirs (os .path .join (trialDir , 'ParityData' ))
277-
278- with open (os .path .join (trialDir , 'ParityData' , familyName + '.csv' ), 'wb' ) as csvfile :
279- csvwriter = csv .writer (csvfile )
280- for key , value in parityData .iteritems ():
281- csvwriter .writerow ([key , value [0 ]/ value [1 ], approxKinetics [key ].comment ])
282-
283281 with open (os .path .join (trialDir , 'QDict_LOO.csv' ), 'wb' ) as csvfile :
284282 csvwriter = csv .writer (csvfile )
285- for key , value in QDict .iteritems ():
286- csvwriter .writerow ([key , value ])
283+ for familyName in allFamilyNames :
284+ family = FullDatabase .kinetics .families [familyName ]
285+ print "Processing" , familyName + '...' , '(' + str (len (family .rules .entries )) + ' rate rules)'
286+ if len (family .rules .entries ) < 2 :
287+ print ' Skipping' , familyName , ': only has one rate rule...'
288+ else :
289+ exactKinetics , approxKinetics = getKineticsDepository (FullDatabase , family , 'NIST' )
290+
291+ #prune for exact matches only
292+ keysToRemove = []
293+ for key , kinetics in approxKinetics .iteritems ():
294+ if not re .search ('Exact' , kinetics .comment ):
295+ keysToRemove .append (key )
296+
297+ for key in keysToRemove :
298+ del approxKinetics [key ]
299+
300+ parityData = analyzeForParity (exactKinetics , approxKinetics , cutoff = 8.0 )
301+
302+ if len (parityData )< 2 :
303+ print ' Skipping' , familyName , ': {} reactions were compared...' .format (len (parityData ))
304+ continue
305+ QDict [familyName ]= calculateQ (parityData )
306+ createParityPlot (parityData )
307+ plt .title (familyName )
308+ plt .savefig (os .path .join (trialDir , familyName + '.png' ))
309+ plt .clf ()
310+
311+ if not os .path .exists (os .path .join (os .path .join (trialDir , 'ParityData' ))):
312+ os .makedirs (os .path .join (trialDir , 'ParityData' ))
313+
314+ with open (os .path .join (trialDir , 'ParityData' , familyName + '.csv' ), 'wb' ) as paritycsvfile :
315+ paritycsvwriter = csv .writer (paritycsvfile )
316+ for key , value in parityData .iteritems ():
317+ paritycsvwriter .writerow ([key , value [0 ]/ value [1 ], approxKinetics [key ].comment ])
318+
319+ # Save data to csv file
320+ csvwriter .writerow ([familyName , QDict [familyName ]])
287321
288322
289323
290324
291325
292- def leaveOneOut (FullDatabase , trialDir ):
326+ def leaveOneOut (FullDatabase , trialDir , averaging = True ):
293327 """
294328 Performs leave one out analysis on all the kinetics families.
295329 The algorithm deletes a single entry in the family, and then re-averages the tree
@@ -306,39 +340,44 @@ def leaveOneOut(FullDatabase, trialDir):
306340 os .makedirs (trialDir )
307341
308342 allFamilyNames = FullDatabase .kinetics .families .keys ()
309- allFamilyNames .sort () # Perform test on families alphabetically
343+ allFamilyNames .sort (key = str . lower ) # Perform test on families alphabetically
310344 QDict = {}
311345
312- for familyName in allFamilyNames :
313- family = FullDatabase .kinetics .families [familyName ]
314- print "Processing" , familyName + '...' , '(' + str (len (family .rules .entries )) + ' rate rules)'
315- if len (family .rules .entries ) < 2 :
316- print ' Skipping' , familyName , ': only has one rate rule...'
317- else :
318- exactKinetics , approxKinetics = getKineticsLeaveOneOut (family )
319- parityData = analyzeForParity (exactKinetics , approxKinetics , cutoff = 8.0 )
320-
321- if len (parityData )< 2 :
322- print ' Skipping' , familyName , ': {} rate rules were compared...' .format (len (parityData ))
323- continue
324- QDict [familyName ]= calculateQ (parityData )
325- createParityPlot (parityData )
326- plt .title (familyName )
327- plt .savefig (os .path .join (trialDir , familyName + '.png' ))
328- plt .clf ()
329-
330- if not os .path .exists (os .path .join (os .path .join (trialDir , 'ParityData' ))):
331- os .makedirs (os .path .join (trialDir , 'ParityData' ))
332-
333- with open (os .path .join (trialDir , 'ParityData' , familyName + '.csv' ), 'wb' ) as csvfile :
334- csvwriter = csv .writer (csvfile )
335- for key , value in parityData .iteritems ():
336- csvwriter .writerow ([key , value [0 ]/ value [1 ], approxKinetics [key ].comment ])
337346
338347 with open (os .path .join (trialDir , 'QDict_LOO.csv' ), 'wb' ) as csvfile :
339348 csvwriter = csv .writer (csvfile )
340- for key , value in QDict .iteritems ():
341- csvwriter .writerow ([key , value ])
349+
350+ for familyName in allFamilyNames :
351+ family = FullDatabase .kinetics .families [familyName ]
352+ print "Processing" , familyName + '...' , '(' + str (len (family .rules .entries )) + ' rate rules)'
353+ if len (family .rules .entries ) < 2 :
354+ print ' Skipping' , familyName , ': only has one rate rule...'
355+ else :
356+ if not averaging :
357+ # Pre-average the family if averaging is not turned on
358+ family .fillKineticsRulesByAveragingUp ()
359+ exactKinetics , approxKinetics = getKineticsLeaveOneOut (family , averaging )
360+ parityData = analyzeForParity (exactKinetics , approxKinetics , cutoff = 8.0 )
361+
362+ if len (parityData )< 2 :
363+ print ' Skipping' , familyName , ': {} rate rules were compared...' .format (len (parityData ))
364+ continue
365+ QDict [familyName ]= calculateQ (parityData )
366+ createParityPlot (parityData )
367+ plt .title (familyName )
368+ plt .savefig (os .path .join (trialDir , familyName + '.png' ))
369+ plt .clf ()
370+
371+ if not os .path .exists (os .path .join (os .path .join (trialDir , 'ParityData' ))):
372+ os .makedirs (os .path .join (trialDir , 'ParityData' ))
373+
374+ with open (os .path .join (trialDir , 'ParityData' , familyName + '.csv' ), 'wb' ) as paritycsvfile :
375+ paritycsvwriter = csv .writer (paritycsvfile )
376+ for key , value in parityData .iteritems ():
377+ paritycsvwriter .writerow ([key , value [0 ]/ value [1 ], approxKinetics [key ].comment ])
378+
379+ # Save to csv file
380+ csvwriter .writerow ([familyName , QDict [familyName ]])
342381
343382 return
344383
@@ -370,7 +409,7 @@ def leaveOneOut(FullDatabase, trialDir):
370409
371410 print '--------------------------------------------'
372411 print 'Performing the leave on out test on the kinetics families...'
373- leaveOneOut (FullDatabase , trialDir )
412+ leaveOneOut (FullDatabase , trialDir , averaging = False )
374413
375414 print '--------------------------------------------'
376415 print 'Filling up the family rate rules by averaging... Expect larger number of rate rules in subsequent tests'
0 commit comments