Skip to content

Commit cb2f649

Browse files
committed
Add argument for performing leaveOneOutTest without averaging
Is not quite the original leaveOneOut test as it was done before. The original test takes extremely long to run because it must average up the family after each exact rate rule. In this method, we remove the original rate rule, and see if using an average that (should incorporate the original) does a good job of estimating that rate rule. The option to use the original version of the algorithm can still be used by setting averaging=True Write data to csv file after each family rather than after entire set of families so that they are in alphabetical order
1 parent 53d7907 commit cb2f649

File tree

1 file changed

+124
-85
lines changed

1 file changed

+124
-85
lines changed

testing/evaluateKinetics.py

Lines changed: 124 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ def getKineticsDepository(FullDatabase, family, depositoryLabel):
7575

7676
return exactKinetics, approxKinetics
7777

78-
def getKineticsLeaveOneOut(family):
78+
def getKineticsLeaveOneOut(family, averaging=True):
7979
"""
8080
Performs the leave one out test on a family. It returns a dictionary of
8181
the original exact nodes and a dictionary of the new averaged nodes.
@@ -89,15 +89,52 @@ def getKineticsLeaveOneOut(family):
8989
exactKinetics={}
9090
approxKinetics={}
9191

92+
rootTemplate = family.getRootTemplate()
93+
9294
for entryKey in family.rules.entries.keys():
9395
template = family.retrieveTemplate(entryKey.split(';'))
94-
exactKinetics[entryKey], exactKineticsEntry=family.rules.estimateKinetics(template)
96+
exactKineticsEntry=family.rules.getRule(template) # fetch the best matched rule
97+
if exactKineticsEntry.rank == 0:
98+
# Skip rank zero entries, because they are replaced by averaged values
99+
# during model generation
100+
continue
101+
102+
exactKineticsData = exactKineticsEntry.data
103+
if exactKineticsData.comment:
104+
if 'training' not in exactKineticsData.comment:
105+
# Averaged nodes have kinetics data comments, so skip them unless
106+
# They were created from training reactions
107+
continue
108+
109+
exactKinetics[entryKey] = exactKineticsData
110+
111+
if averaging:
112+
# In this scheme, we remove the data fully and try to pretend the database
113+
# wants this kinetic value when we know nothing about it
114+
familyCopy=copy.deepcopy(family)
115+
familyCopy.rules.entries.pop(entryKey)
116+
familyCopy.fillKineticsRulesByAveragingUp()
117+
approxKinetics[entryKey], approxKineticsEntry=familyCopy.rules.estimateKinetics(template)
118+
else:
119+
# In this scheme, do not re-average the tree, just try to see if the nearest
120+
# best node provides a good estimate for the original kinetics.
121+
# This takes significanty less time to run, but is not a true validation of the data,
122+
# it is just testing our kinetics selection algorithm
123+
124+
# Skip the top node in this scheme, because nothing can predict it if removed
125+
126+
if template == rootTemplate:
127+
continue
128+
129+
originalEntry = family.rules.entries[entryKey]
130+
family.rules.entries.pop(entryKey)
131+
approxKinetics[entryKey], approxKineticsEntry=family.rules.estimateKinetics(template)
132+
# Re-add the data back into the original family
133+
family.rules.entries[entryKey] = originalEntry
134+
135+
95136

96-
familyCopy=copy.deepcopy(family)
97-
familyCopy.rules.entries.pop(entryKey)
98-
familyCopy.fillKineticsRulesByAveragingUp()
99137

100-
approxKinetics[entryKey], approxKineticsEntry=familyCopy.rules.estimateKinetics(template)
101138

102139
return exactKinetics, approxKinetics
103140

@@ -211,20 +248,17 @@ def obtainKineticsFamilyStatistics(FullDatabase, trialDir):
211248
as it would add non-exact rules to the rule count)
212249
"""
213250
allFamilyNames=FullDatabase.kinetics.families.keys()
214-
allFamilyNames.sort() # Perform test on families alphabetically
215-
216-
familyCount={}
251+
allFamilyNames.sort(key=str.lower) # Perform test on families alphabetically
217252

218-
for familyName in allFamilyNames:
219-
family=FullDatabase.kinetics.families[familyName]
220-
print "Processing", familyName + '...', '(' + str(len(family.rules.entries)) + ' rate rules)'
221-
familyCount[familyName]=countNodes(family)
222-
223253
with open(os.path.join(trialDir, 'FamilyStatistics.csv'), 'wb') as csvfile:
224254
csvwriter=csv.writer(csvfile)
225255
csvwriter.writerow(['Family','Number of Rules', 'Top Node 1', 'Number of Groups', 'Top Node 2', 'Number of Groups', 'Top Node 3', 'Number of Groups'])
226-
for key, value in familyCount.iteritems():
227-
csvwriter.writerow(value)
256+
257+
for familyName in allFamilyNames:
258+
family=FullDatabase.kinetics.families[familyName]
259+
print "Processing", familyName + '...', '(' + str(len(family.rules.entries)) + ' rate rules)'
260+
# Save to csv file
261+
csvwriter.writerow(countNodes(family))
228262

229263

230264
def compareNIST(FullDatabase, trialDir):
@@ -240,56 +274,56 @@ def compareNIST(FullDatabase, trialDir):
240274

241275

242276
allFamilyNames=FullDatabase.kinetics.families.keys()
243-
allFamilyNames.sort() # Perform test on families alphabetically
277+
allFamilyNames.sort(key=str.lower) # Perform test on families alphabetically
244278

245279
QDict={}
246280

247-
for familyName in allFamilyNames:
248-
family=FullDatabase.kinetics.families[familyName]
249-
print "Processing", familyName + '...', '(' + str(len(family.rules.entries)) + ' rate rules)'
250-
if len(family.rules.entries) < 2:
251-
print ' Skipping', familyName, ': only has one rate rule...'
252-
else:
253-
exactKinetics, approxKinetics = getKineticsDepository(FullDatabase, family, 'NIST')
254-
255-
#prune for exact matches only
256-
keysToRemove=[]
257-
for key, kinetics in approxKinetics.iteritems():
258-
if not re.search('Exact', kinetics.comment):
259-
keysToRemove.append(key)
260-
261-
for key in keysToRemove:
262-
del approxKinetics[key]
263-
264-
parityData=analyzeForParity(exactKinetics, approxKinetics, cutoff=8.0)
265-
266-
if len(parityData)<2:
267-
print ' Skipping', familyName, ': {} reactions were compared...'.format(len(parityData))
268-
continue
269-
QDict[familyName]=calculateQ(parityData)
270-
createParityPlot(parityData)
271-
plt.title(familyName)
272-
plt.savefig(os.path.join(trialDir, familyName +'.png'))
273-
plt.clf()
274-
275-
if not os.path.exists(os.path.join(os.path.join(trialDir, 'ParityData'))):
276-
os.makedirs(os.path.join(trialDir, 'ParityData'))
277-
278-
with open(os.path.join(trialDir, 'ParityData', familyName + '.csv'), 'wb') as csvfile:
279-
csvwriter=csv.writer(csvfile)
280-
for key, value in parityData.iteritems():
281-
csvwriter.writerow([key, value[0]/value[1], approxKinetics[key].comment])
282-
283281
with open(os.path.join(trialDir, 'QDict_LOO.csv'), 'wb') as csvfile:
284282
csvwriter=csv.writer(csvfile)
285-
for key, value in QDict.iteritems():
286-
csvwriter.writerow([key, value])
283+
for familyName in allFamilyNames:
284+
family=FullDatabase.kinetics.families[familyName]
285+
print "Processing", familyName + '...', '(' + str(len(family.rules.entries)) + ' rate rules)'
286+
if len(family.rules.entries) < 2:
287+
print ' Skipping', familyName, ': only has one rate rule...'
288+
else:
289+
exactKinetics, approxKinetics = getKineticsDepository(FullDatabase, family, 'NIST')
290+
291+
#prune for exact matches only
292+
keysToRemove=[]
293+
for key, kinetics in approxKinetics.iteritems():
294+
if not re.search('Exact', kinetics.comment):
295+
keysToRemove.append(key)
296+
297+
for key in keysToRemove:
298+
del approxKinetics[key]
299+
300+
parityData=analyzeForParity(exactKinetics, approxKinetics, cutoff=8.0)
301+
302+
if len(parityData)<2:
303+
print ' Skipping', familyName, ': {} reactions were compared...'.format(len(parityData))
304+
continue
305+
QDict[familyName]=calculateQ(parityData)
306+
createParityPlot(parityData)
307+
plt.title(familyName)
308+
plt.savefig(os.path.join(trialDir, familyName +'.png'))
309+
plt.clf()
310+
311+
if not os.path.exists(os.path.join(os.path.join(trialDir, 'ParityData'))):
312+
os.makedirs(os.path.join(trialDir, 'ParityData'))
313+
314+
with open(os.path.join(trialDir, 'ParityData', familyName + '.csv'), 'wb') as paritycsvfile:
315+
paritycsvwriter=csv.writer(paritycsvfile)
316+
for key, value in parityData.iteritems():
317+
paritycsvwriter.writerow([key, value[0]/value[1], approxKinetics[key].comment])
318+
319+
# Save data to csv file
320+
csvwriter.writerow([familyName, QDict[familyName]])
287321

288322

289323

290324

291325

292-
def leaveOneOut(FullDatabase, trialDir):
326+
def leaveOneOut(FullDatabase, trialDir, averaging=True):
293327
"""
294328
Performs leave one out analysis on all the kinetics families.
295329
The algorithm deletes a single entry in the family, and then re-averages the tree
@@ -306,39 +340,44 @@ def leaveOneOut(FullDatabase, trialDir):
306340
os.makedirs(trialDir)
307341

308342
allFamilyNames=FullDatabase.kinetics.families.keys()
309-
allFamilyNames.sort() # Perform test on families alphabetically
343+
allFamilyNames.sort(key=str.lower) # Perform test on families alphabetically
310344
QDict={}
311345

312-
for familyName in allFamilyNames:
313-
family=FullDatabase.kinetics.families[familyName]
314-
print "Processing", familyName + '...', '(' + str(len(family.rules.entries)) + ' rate rules)'
315-
if len(family.rules.entries) < 2:
316-
print ' Skipping', familyName, ': only has one rate rule...'
317-
else:
318-
exactKinetics, approxKinetics = getKineticsLeaveOneOut(family)
319-
parityData=analyzeForParity(exactKinetics, approxKinetics, cutoff=8.0)
320-
321-
if len(parityData)<2:
322-
print ' Skipping', familyName, ': {} rate rules were compared...'.format(len(parityData))
323-
continue
324-
QDict[familyName]=calculateQ(parityData)
325-
createParityPlot(parityData)
326-
plt.title(familyName)
327-
plt.savefig(os.path.join(trialDir, familyName +'.png'))
328-
plt.clf()
329-
330-
if not os.path.exists(os.path.join(os.path.join(trialDir, 'ParityData'))):
331-
os.makedirs(os.path.join(trialDir, 'ParityData'))
332-
333-
with open(os.path.join(trialDir, 'ParityData', familyName + '.csv'), 'wb') as csvfile:
334-
csvwriter=csv.writer(csvfile)
335-
for key, value in parityData.iteritems():
336-
csvwriter.writerow([key, value[0]/value[1], approxKinetics[key].comment])
337346

338347
with open(os.path.join(trialDir, 'QDict_LOO.csv'), 'wb') as csvfile:
339348
csvwriter=csv.writer(csvfile)
340-
for key, value in QDict.iteritems():
341-
csvwriter.writerow([key, value])
349+
350+
for familyName in allFamilyNames:
351+
family=FullDatabase.kinetics.families[familyName]
352+
print "Processing", familyName + '...', '(' + str(len(family.rules.entries)) + ' rate rules)'
353+
if len(family.rules.entries) < 2:
354+
print ' Skipping', familyName, ': only has one rate rule...'
355+
else:
356+
if not averaging:
357+
# Pre-average the family if averaging is not turned on
358+
family.fillKineticsRulesByAveragingUp()
359+
exactKinetics, approxKinetics = getKineticsLeaveOneOut(family, averaging)
360+
parityData=analyzeForParity(exactKinetics, approxKinetics, cutoff=8.0)
361+
362+
if len(parityData)<2:
363+
print ' Skipping', familyName, ': {} rate rules were compared...'.format(len(parityData))
364+
continue
365+
QDict[familyName]=calculateQ(parityData)
366+
createParityPlot(parityData)
367+
plt.title(familyName)
368+
plt.savefig(os.path.join(trialDir, familyName +'.png'))
369+
plt.clf()
370+
371+
if not os.path.exists(os.path.join(os.path.join(trialDir, 'ParityData'))):
372+
os.makedirs(os.path.join(trialDir, 'ParityData'))
373+
374+
with open(os.path.join(trialDir, 'ParityData', familyName + '.csv'), 'wb') as paritycsvfile:
375+
paritycsvwriter=csv.writer(paritycsvfile)
376+
for key, value in parityData.iteritems():
377+
paritycsvwriter.writerow([key, value[0]/value[1], approxKinetics[key].comment])
378+
379+
# Save to csv file
380+
csvwriter.writerow([familyName, QDict[familyName]])
342381

343382
return
344383

@@ -370,7 +409,7 @@ def leaveOneOut(FullDatabase, trialDir):
370409

371410
print '--------------------------------------------'
372411
print 'Performing the leave on out test on the kinetics families...'
373-
leaveOneOut(FullDatabase, trialDir)
412+
leaveOneOut(FullDatabase, trialDir, averaging=False)
374413

375414
print '--------------------------------------------'
376415
print 'Filling up the family rate rules by averaging... Expect larger number of rate rules in subsequent tests'

0 commit comments

Comments
 (0)