From bbe05616578b8b39a28d2f083d9bab34b47a9328 Mon Sep 17 00:00:00 2001 From: Nathan Yee Date: Mon, 17 Mar 2014 14:00:40 -0400 Subject: [PATCH 1/6] Add tests for checking the database --- rmgpy/data/base.py | 148 ++++++++++++++++++++++++++++++---------- rmgpy/molecule/group.py | 38 +++++++++++ 2 files changed, 150 insertions(+), 36 deletions(-) diff --git a/rmgpy/data/base.py b/rmgpy/data/base.py index 532e785e03..813901e2cf 100644 --- a/rmgpy/data/base.py +++ b/rmgpy/data/base.py @@ -786,7 +786,7 @@ def descendants(self, node): descendants.extend(self.descendants(child)) return descendants - def isWellFormed(self): + def checkWellFormed(self): """ Return :data:`True` if the database is well-formed. A well-formed database has an entry in the dictionary for every entry in the tree, and @@ -797,47 +797,123 @@ def isWellFormed(self): nodes in the tree, if the tree is present; this is for databases with multiple trees, e.g. the kinetics databases. """ - - wellFormed = True + + + from rmgpy.data.kinetics.family import KineticsFamily + + #list of nodes that are not wellFormed + noGroup=[] + noMatchingGroup={} + notInTree=[] + notSubgroup=[] + probablyProduct=[] + + # Give correct arguments for each type of database + if isinstance(self, KineticsFamily): + library=self.rules.entries + groups=self.groups.entries + treeIsPresent=True + topNodes=self.getRootTemplate() # Make list of all nodes in library - libraryNodes = [] - for nodes in self.library: - libraryNodes.extend(nodes.split(';')) - libraryNodes = list(set(libraryNodes)) + libraryNodes=[] + libraryNodesSplit = [] + for nodes in library: + libraryNodes.append(nodes) + libraryNodesSplit.extend(nodes.split(';')) + libraryNodesSplit = list(set(libraryNodesSplit)) + - for node in libraryNodes: + try: + for node in libraryNodesSplit: # All nodes in library must be in dictionary - try: - if node not in self.entries: - raise DatabaseError('Node "{0}" in library is not present in dictionary.'.format(node)) - except DatabaseError, e: - wellFormed = False - logging.error(str(e)) - - # If a tree is present, all nodes in library should be in tree - # (Technically the database is still well-formed, but let's warn - # the user anyway - if len(self.tree.parent) > 0: - try: - if node not in self.tree.parent: - raise DatabaseError('Node "{0}" in library is not present in tree.'.format(node)) - except DatabaseError, e: - logging.warning(str(e)) - - # If a tree is present, all nodes in tree must be in dictionary - if self.tree is not None: - for node in self.tree.parent: - try: - if node not in self.entries: - raise DatabaseError('Node "{0}" in tree is not present in dictionary.'.format(node)) - except DatabaseError, e: - wellFormed = False - logging.error(str(e)) - - return wellFormed + if node not in groups: + noGroup.append(node) + + #no point checking in tree if it doesn't even exist in groups + for libraryNode in libraryNodes: + nodes=libraryNode.split(';') + for libraryEntry in library[libraryNode]: + for node in nodes: + for libraryGroup in libraryEntry.item.reactants: + try: + if groups[node].item.isIsomorphic(libraryGroup): + break + except AttributeError: + if isinstance(groups[node].item, LogicOr) and isinstance(libraryGroup, LogicOr): + if groups[node].item==libraryGroup: + break + except TypeError: + print libraryGroup, type(libraryGroup) + except KeyError: + noGroup.append(node) + else: + noMatchingGroup[node]=libraryNode + + if treeIsPresent: + # All nodes need to be in the tree + # This is true when ascending through parents leads to a top node + for nodeName in groups: + ascendParent=self.groups.entries[nodeName] + + while ascendParent not in topNodes: + child=ascendParent + ascendParent=ascendParent.parent + if ascendParent is None or child not in ascendParent.children: + if child.index==-1: + probablyProduct.append(child.label) + break + else: + # If a group is not in a tree, we want to save the uppermost parent, not necessarily the original node + notInTree.append(child.label) + break + #check if child is actually subgroup of parent + ascendParent=self.groups.entries[nodeName].parent + if ascendParent is not None: + try: + if not ascendParent.item.isSubgraphIsomorphic(self.groups.entries[nodeName].item): + notSubgroup.append(nodeName) + except AttributeError: + if isinstance(groups[node].item, LogicOr) and isinstance(libraryGroup, LogicOr): + if groups[node].item==libraryGroup: + break + except TypeError: + print libraryGroup, type(libraryGroup) + # The adj list of each node actually needs to be subset of its parent's adjlist + #More to come later -nyee + except DatabaseError, e: + logging.error(str(e)) + +# # If a tree is present, all nodes in library should be in tree +# # (Technically the database is still well-formed, but let's warn +# # the user anyway +# if len(self.tree.parent) > 0: +# try: +# if node not in self.tree.parent: +# raise DatabaseError('Node "{0}" in library is not present in tree.'.format(node)) +# except DatabaseError, e: +# logging.warning(str(e)) +# +# # If a tree is present, all nodes in tree must be in dictionary +# if self.tree is not None: +# for node in self.tree.parent: +# try: +# if node not in self.entries: +# raise DatabaseError('Node "{0}" in tree is not present in dictionary.'.format(node)) +# except DatabaseError, e: +# wellFormed = False +# logging.error(str(e)) + +# for libraryRule in library: + #check the groups + + #eliminate duplicates + noGroup=list(set(noGroup)) + notInTree=list(set(notInTree)) + + return (noGroup, noMatchingGroup, notInTree, notSubgroup, probablyProduct) def matchNodeToStructure(self, node, structure, atoms): """ diff --git a/rmgpy/molecule/group.py b/rmgpy/molecule/group.py index b6f83e501c..98b0d4e106 100644 --- a/rmgpy/molecule/group.py +++ b/rmgpy/molecule/group.py @@ -352,6 +352,44 @@ def isSpecificCaseOf(self, other): # Otherwise self is in fact a specific case of other return True + + def isChildOf(self, parent): + """ + Returns ``True`` if `parent` is the same as `self` or a + parent of `self`. Returns ``False`` if some of `parent` is not + included in `self` or they are mutually exclusive. + """ + + if not isinstance(parent, GroupAtom): +# # Let the isSpecificCaseOf method of other handle it +# # We expect self to be an Atom object, but can't test for it here +# # because that would create an import cycle + raise TypeError('Expects both parent and self to be a GroupAtom object') + + # Compare two atom groups for equivalence + # Each atom type in self must have an equivalent in other (and vice versa) + for atomType1 in self.atomType: # all these must match + for atomType2 in parent.atomType: # can match any of these + if atomType1.isSpecificCaseOf(atomType2): break + else: + return False + # Each free radical electron state in self must have an equivalent in other (and vice versa) + for radical1, spin1 in zip(self.radicalElectrons, self.spinMultiplicity): # all these must match + for radical2, spin2 in zip(parent.radicalElectrons, parent.spinMultiplicity): # can match any of these + if radical1 == radical2 and spin1 == spin2: break + else: + return False + + #check that the label is the same + if not self.label==parent.label: return False + + #all bonds in the parent must also be in self, but not necessarily vice versa. + pass + + # Otherwise self is in fact a specific case of other + return True + + ################################################################################ class GroupBond(Edge): From 7bb54dae63b055b99e5f2504b86b28f47c2d0a6f Mon Sep 17 00:00:00 2001 From: Nathan Yee Date: Fri, 28 Mar 2014 16:27:41 -0400 Subject: [PATCH 2/6] Add function LogicOr.matchToLogicOr This function gives the boolean of whether two LogicOr objects are equivilant or not. I needed this for database checking --- rmgpy/data/base.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/rmgpy/data/base.py b/rmgpy/data/base.py index 813901e2cf..2d36a0615c 100644 --- a/rmgpy/data/base.py +++ b/rmgpy/data/base.py @@ -1064,6 +1064,18 @@ def matchToStructure(self,database,structure,atoms): return True != self.invert return False != self.invert + def matchToLogicOr(self, other): + """ + Is other the same LogicOr group as self? + """ + if len(self.components)!=len(other.components): + return False + else: + for node in self.components: + if node not in other.components: + return False + return True + def getPossibleStructures(self, entries): """ Return a list of the possible structures below this node. From 8a41c0ba2c9175e333346dbb5e3760e17363212f Mon Sep 17 00:00:00 2001 From: Nathan Yee Date: Mon, 31 Mar 2014 14:56:42 -0400 Subject: [PATCH 3/6] Add new atom attributes to comparisons The function equivilant and isSpecificCaseOf for GroupAtom now checks the list of charges and atom label --- rmgpy/molecule/group.py | 50 ++++++++++++----------------------------- 1 file changed, 14 insertions(+), 36 deletions(-) diff --git a/rmgpy/molecule/group.py b/rmgpy/molecule/group.py index 98b0d4e106..1e318179c4 100644 --- a/rmgpy/molecule/group.py +++ b/rmgpy/molecule/group.py @@ -320,6 +320,14 @@ def equivalent(self, other): if radical1 == radical2 and spin1 == spin2: break else: return False + #Each charge in self must have an equivalent in other + for charge1 in self.charge: + for charge2 in self.charge: + if charge1 == charge2: break + else: + return False + #The label must be the same + if not self.label==other.label: return False # Otherwise the two atom groups are equivalent return True @@ -349,47 +357,17 @@ def isSpecificCaseOf(self, other): if radical1 == radical2 and spin1 == spin2: break else: return False - # Otherwise self is in fact a specific case of other - return True - - - def isChildOf(self, parent): - """ - Returns ``True`` if `parent` is the same as `self` or a - parent of `self`. Returns ``False`` if some of `parent` is not - included in `self` or they are mutually exclusive. - """ - - if not isinstance(parent, GroupAtom): -# # Let the isSpecificCaseOf method of other handle it -# # We expect self to be an Atom object, but can't test for it here -# # because that would create an import cycle - raise TypeError('Expects both parent and self to be a GroupAtom object') - - # Compare two atom groups for equivalence - # Each atom type in self must have an equivalent in other (and vice versa) - for atomType1 in self.atomType: # all these must match - for atomType2 in parent.atomType: # can match any of these - if atomType1.isSpecificCaseOf(atomType2): break + #Each charge in self must have an equivalent in other + for charge1 in self.charge: + for charge2 in self.charge: + if charge1 == charge2: break else: return False - # Each free radical electron state in self must have an equivalent in other (and vice versa) - for radical1, spin1 in zip(self.radicalElectrons, self.spinMultiplicity): # all these must match - for radical2, spin2 in zip(parent.radicalElectrons, parent.spinMultiplicity): # can match any of these - if radical1 == radical2 and spin1 == spin2: break - else: - return False - - #check that the label is the same - if not self.label==parent.label: return False - - #all bonds in the parent must also be in self, but not necessarily vice versa. - pass + #The label must be the same + if not self.label==other.label: return False # Otherwise self is in fact a specific case of other return True - - ################################################################################ class GroupBond(Edge): From 71dfdad6f030437dcd7d843be1eceb0412100c38 Mon Sep 17 00:00:00 2001 From: Nathan Yee Date: Mon, 31 Mar 2014 14:38:53 -0400 Subject: [PATCH 4/6] Added function isIdentical to groups.py The function isIsomorphic was meant to compare molecules to groups, therefore it respects wildcards (seeing as a molecule will never have an atom named Cs or Ct). However, this makes the function isIsomorphic not useful when tryint to compare two groups. I've created a new function isIdentical which will not respect wildcards and can tell if two groups are the same. --- rmgpy/molecule/group.pxd | 2 ++ rmgpy/molecule/group.py | 23 +++++++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/rmgpy/molecule/group.pxd b/rmgpy/molecule/group.pxd index 9939b85e07..efdc8bdde5 100644 --- a/rmgpy/molecule/group.pxd +++ b/rmgpy/molecule/group.pxd @@ -129,3 +129,5 @@ cdef class Group(Graph): cpdef bint isSubgraphIsomorphic(self, Graph other, dict initialMap=?) except -2 cpdef list findSubgraphIsomorphisms(self, Graph other, dict initialMap=?) + + cpdef bint isIdentical(self, Graph other) diff --git a/rmgpy/molecule/group.py b/rmgpy/molecule/group.py index 1e318179c4..33f6015a85 100644 --- a/rmgpy/molecule/group.py +++ b/rmgpy/molecule/group.py @@ -785,3 +785,26 @@ def findSubgraphIsomorphisms(self, other, initialMap=None): raise TypeError('Got a {0} object for parameter "other", when a Group object is required.'.format(other.__class__)) # Do the isomorphism comparison return Graph.findSubgraphIsomorphisms(self, other, initialMap) + + def isIdentical(self, other): + """ + Returns ``True`` if `other` is identical and ``False`` otherwise. + The function `isIsomorphic` respects wildcards, while this function + does not, make it more useful for checking groups to groups (as + opposed to molecules to groups) + + """ + # It only makes sense to compare a Group to a Group for full + # isomorphism, so raise an exception if this is not what was requested + if not isinstance(other, Group): + raise TypeError('Got a {0} object for parameter "other", when a Group object is required.'.format(other.__class__)) + # An identical group is always a child of itself and + # is the only case where that is true. Therefore + # if we do both directions of isSubgraphIsmorphic, we need + # to get True twice for it to be identical + if not self.isSubgraphIsomorphic(other): + return False + elif not other.isSubgraphIsomorphic(self): + return False + else: + return True From 13db891d098eda6bbe9b99bc60e12651a63475f0 Mon Sep 17 00:00:00 2001 From: Nathan Yee Date: Mon, 31 Mar 2014 14:35:24 -0400 Subject: [PATCH 5/6] Add function checkWellFormed to family.py This function is used to check for malformed kinetic families. I have also removed the function isWellFormed from base.py because it is deprecated by this new function (and also didn't work due to the many changes we've been making). The new function checks for: -rule labels that have group names which do not exist in groups.py -rule adjLists that do not match with the corresponding adj list in groups.py -Each group must exist in the tree -Each group adjList must actually be a more specific case of it's parent's adjList --- rmgpy/data/kinetics/family.py | 132 +++++++++++++++++++++++++++++++++- 1 file changed, 131 insertions(+), 1 deletion(-) diff --git a/rmgpy/data/kinetics/family.py b/rmgpy/data/kinetics/family.py index 530c20b909..1d22ef2b50 100644 --- a/rmgpy/data/kinetics/family.py +++ b/rmgpy/data/kinetics/family.py @@ -38,7 +38,7 @@ from copy import copy, deepcopy from rmgpy.data.base import Database, Entry, LogicNode, LogicOr, ForbiddenStructures,\ - ForbiddenStructureException, getAllCombinations + ForbiddenStructureException, getAllCombinations, DatabaseError from rmgpy.reaction import Reaction from rmgpy.kinetics import Arrhenius, ArrheniusEP, ThirdBody, Lindemann, Troe, \ PDepArrhenius, MultiArrhenius, MultiPDepArrhenius, \ @@ -1941,3 +1941,133 @@ def getRateCoefficientUnits(self): return 's^-1' else: raise ValueError('Unable to determine units of rate coefficient for reaction family "{0}".'.format(self.label)) + + def checkWellFormed(self): + """ + Returns a tuple of malformed database entries: + + noGroup is a list of nodes in the rules that has no corresponding group in + groups.py + + noMatchingGroup is a dictionary with entry labels from the rules as a key + and entry labels from groups as values. These are groups where rule.py's + adj list does not match group.py's. + + notInTree is a list of groups that do not appear in the tree + + notSubgroup is a dictionary with group labels as keys and atom indexes + as values. Each key is a group where the child's adj list is not a + true child of it's parent. The list of indexes corresponds to the + child's adj list index, where the atom is not a true child. + + probablyProducts is a list of groups which do not apepar in the + tree, but are probably products (as opposed to reactants) which + are created in the database loading. These are not necessarily + malformations, but because I'm not certain where they came from, + I decided to list them. + """ + + + #A function to add to the not in Subgroup dictionary + def appendToDict(dictionary, key, value): + if key not in dictionary: + dictionary[key]=[value] + else: + dictionary[key].append(value) + return dictionary + + #list of nodes that are not wellFormed + noGroup=[] + noMatchingGroup={} + tempNoMatchingGroup={} + notInTree=[] + notSubgroup={} + probablyProduct=[] + + # Give correct arguments for each type of database +# if isinstance(self, KineticsFamily): + library=self.rules.entries + groups=self.groups.entries + topNodes=self.getRootTemplate() + + # Make list of all node names in library + libraryNodes=[] + for nodes in library: + libraryNodes.append(nodes) + + try: + #Each label in rules.py should be be in the form group1;group2;group3 etc + #and each group must appear in groups.py + for libraryNode in libraryNodes: + nodes=libraryNode.split(';') + for libraryEntry in library[libraryNode]: + for nodeName in nodes: + if nodeName not in groups: + noGroup.append(nodeName) + #If the node is not in the dictionary, we can't do the rest of the check + continue + #Each adj list in rules.py should match the adj list in group's.py + for libraryGroup in libraryEntry.item.reactants: + #break if we find a match between two groups + if isinstance(groups[nodeName].item, Group) and isinstance(libraryGroup, Group): + if groups[nodeName].item.isIsomorphic(libraryGroup): + break + #break if we find a match between two logic nodes + elif isinstance(groups[nodeName].item, LogicOr) and isinstance(libraryGroup, LogicOr): + if groups[nodeName].item==libraryGroup: + break + #Otherwise no match is found, so we add it to the tempNoMatchingGroup + else: + tempNoMatchingGroup=appendToDict(tempNoMatchingGroup, libraryNode, nodeName) + #eliminate duplicates + for key, nodeList in tempNoMatchingGroup.iteritems(): + noMatchingGroup[key]=list(set(nodeList)) + + # Each group in groups.py should appear in the tree + # This is true when ascending through parents leads to a top node + for nodeName in groups: + nodeGroup=self.groups.entries[nodeName] + ascendParent=nodeGroup + while ascendParent not in topNodes: + child=ascendParent + ascendParent=ascendParent.parent + if ascendParent is None or child not in ascendParent.children: + if child.index==-1: + probablyProduct.append(child.label) + break + else: + # If a group is not in a tree, we want to save the uppermost parent, not necessarily the original node + notInTree.append(child.label) + break + #For a correct child-parent relationship, each atom in the parent should have a corresponding child atom in the child. + nodeParent=nodeGroup.parent + #Atoms may be in a different order initially. Need to sort both child and parent first + #Don't need to do check for topNodes + if nodeParent is not None: + if isinstance(nodeParent.item, LogicOr): + if not nodeGroup.label in nodeParent.item.components: + #-1 index means the child is not in the LogicOr + notSubgroup[nodeName]=nodeParent.label + continue + else: + #if the parent is a LogicOr, we want to keep ascending until we get to a group or hit a discontinuity (could be + #malformed tree or just ascending past the top node) + while isinstance(nodeParent.item, LogicOr): + nodeParent=nodeParent.parent + if nodeParent == None: break + if nodeParent == None: continue +# nodeParent.item.sortAtoms() + elif isinstance(nodeGroup.item, LogicOr): + print nodeGroup, ' is an intermediate LogicOr. See if it can be replaced with a adj list.' + continue + #If both the parent and child are graphs, we can use the function isSubgroupIsomorphic if it is actually a child + if not nodeGroup.item.isSubgraphIsomorphic(nodeParent.item): + notSubgroup[nodeName]=nodeParent.label + except DatabaseError, e: + logging.error(str(e)) + + #eliminate duplicates + noGroup=list(set(noGroup)) + notInTree=list(set(notInTree)) + + return (noGroup, noMatchingGroup, notInTree, notSubgroup, probablyProduct) From bd422260f4c2432e53a50e10a324e6f9fd9c9a2f Mon Sep 17 00:00:00 2001 From: Nathan Yee Date: Fri, 28 Mar 2014 16:29:58 -0400 Subject: [PATCH 6/6] Add check if groups are unique in the database Coded another check in checkWellFormed to see if each node is unique inside of a family. --- rmgpy/data/kinetics/family.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/rmgpy/data/kinetics/family.py b/rmgpy/data/kinetics/family.py index 1d22ef2b50..7162ac08c4 100644 --- a/rmgpy/data/kinetics/family.py +++ b/rmgpy/data/kinetics/family.py @@ -1960,7 +1960,7 @@ def checkWellFormed(self): true child of it's parent. The list of indexes corresponds to the child's adj list index, where the atom is not a true child. - probablyProducts is a list of groups which do not apepar in the + probablyProduct is a list of groups which do not apepar in the tree, but are probably products (as opposed to reactants) which are created in the database loading. These are not necessarily malformations, but because I'm not certain where they came from, @@ -1981,6 +1981,7 @@ def appendToDict(dictionary, key, value): noMatchingGroup={} tempNoMatchingGroup={} notInTree=[] + notUnique={} notSubgroup={} probablyProduct=[] @@ -1988,6 +1989,7 @@ def appendToDict(dictionary, key, value): # if isinstance(self, KineticsFamily): library=self.rules.entries groups=self.groups.entries + groupsCopy=copy(groups) topNodes=self.getRootTemplate() # Make list of all node names in library @@ -2014,7 +2016,7 @@ def appendToDict(dictionary, key, value): break #break if we find a match between two logic nodes elif isinstance(groups[nodeName].item, LogicOr) and isinstance(libraryGroup, LogicOr): - if groups[nodeName].item==libraryGroup: + if groups[nodeName].item.matchToLogicOr(libraryGroup): break #Otherwise no match is found, so we add it to the tempNoMatchingGroup else: @@ -2024,9 +2026,10 @@ def appendToDict(dictionary, key, value): noMatchingGroup[key]=list(set(nodeList)) # Each group in groups.py should appear in the tree - # This is true when ascending through parents leads to a top node + # This is true when ascending through parents leads to a top node for nodeName in groups: nodeGroup=self.groups.entries[nodeName] + nodeGroupItem=nodeGroup.item ascendParent=nodeGroup while ascendParent not in topNodes: child=ascendParent @@ -2039,6 +2042,18 @@ def appendToDict(dictionary, key, value): # If a group is not in a tree, we want to save the uppermost parent, not necessarily the original node notInTree.append(child.label) break + + #each node should also be unique: + del groupsCopy[nodeName] + for nodeName2 in groupsCopy: + nodeGroup2Item=self.groups.entries[nodeName2].item + if isinstance(nodeGroup2Item, Group) and isinstance(nodeGroupItem, Group): + if nodeGroupItem.isIdentical(nodeGroup2Item): + notUnique=appendToDict(notUnique, nodeName, nodeName2) + if isinstance(nodeGroup2Item, LogicOr) and isinstance(nodeGroupItem, LogicOr): + if nodeGroupItem.matchToLogicOr(nodeGroup2Item): + notUnique=appendToDict(notUnique, nodeName, nodeName2) + #For a correct child-parent relationship, each atom in the parent should have a corresponding child atom in the child. nodeParent=nodeGroup.parent #Atoms may be in a different order initially. Need to sort both child and parent first @@ -2070,4 +2085,4 @@ def appendToDict(dictionary, key, value): noGroup=list(set(noGroup)) notInTree=list(set(notInTree)) - return (noGroup, noMatchingGroup, notInTree, notSubgroup, probablyProduct) + return (noGroup, noMatchingGroup, notInTree, notUnique, notSubgroup, probablyProduct)