import os.path
import re
import sys
import numpy as np
from collections import OrderedDict
from itertools import product
from tree import Tree
class Grammar(object):
"""
The last grammar class you will ever need. Currently under development.
"""
INF = float('Inf')
def __init__(self, rules, isPcfg=False, isHnf=False):
"""
Create a grammar object from a grammar specification.
If grammar rules are being passed in a file, they must be written in the form
A -> B C | D E
F -> G H
... etc. Separate rules must be written on separate lines.
If grammar rules are being passed as an argument, they must be written in the form
"A -> B C | D E; F -> G H"
... etc. Separate rules must be separated by a semicolon (;).
In both cases, rules with multiple right-hand sides can use a vertical bar ('|') to
separate different right-hand sides, or every rule can just be written on a separate line.
All sister symbols must be separated by a space (' '). The first node of the first
rule will be treated as the root node internally. For now, all rules must be binary
branching at most; this is enforced within the program.
By default, this function accepts rules in CNF form and converts them to HNF.
If you want to enter HNF rules using the standard bracketing practice, you must set the hnf
parameter to True by passing this as a value to the function.
Rules can be entered in HNF; if they are, the flag 'isHnf' must be set to 'True.'
PCFG functionality is still under development. For now, best not to use this functionality.
Some features (i.e. 'toString' methods) may not work if you do.
This function is based on one originally written by Colin Wilson.
"""
self.isPcfg = isPcfg
listOfRules = []
if os.path.isfile(rules):
f = open(rules, 'r')
for line in f:
listOfRules.append(line)
f.close()
else:
listOfRules = rules.split(';')
ruleForm = re.compile(r'.+->.+')
for i in range(len(listOfRules)):
listOfRules[i] = listOfRules[i].strip()
if re.match(ruleForm, listOfRules[i]) == None:
sys.exit('Error: input ' + listOfRules[i] + ' is not formatted correctly.')
if not isHnf:
self.cnfRules = self.createRules(listOfRules)
self.hnfRules, self.branchSyms = self.cnfToHnf()
else:
self.hnfRules = self.createRules(listOfRules)
self.cnfRules, self.branchSyms = self.hnfToCnf()
if self.isPcfg:
self.cnfRules, self.hnfRules = self.setRemainingProbabilities()
self.hgRulesAreSet = False
self.networkInfoIsSet = False
self.allGridpointsAreSet = False
self.zIsSet = False
self.allHarmoniesAreSet = False
def createRules(self, listOfRules):
"""
Create the internal representation of the CFG passed as parameter listOfRules.
"""
maxN = 2
fl = re.compile(r'^0*1?\.?\d+?$')
rules = OrderedDict()
for rule in listOfRules:
splitRule = rule.split('->')
lhs = splitRule[0].strip()
wholeRhs = splitRule[1].split('|')
for rhs in wholeRhs:
rhsList = rhs.split()
nChildren = len(rhsList)
if self.isPcfg and re.match(fl, rhsList[0]) != None:
rhsList[0] = float(rhsList[0])
nChildren -= 1
if nChildren > maxN:
sys.exit('Seriously? Do you really need ' + str(len(rhsList)) + \
'-ary trees?')
rules.setdefault(lhs, []).append(rhsList)
return rules
def cnfToHnf(self):
"""
Convert rules in "Conventional Normal Form" to Harmonic Normal Form. (In other
words, add intermediate bracketed symbols.) Must be called after self.cnfRules has
been created.
This function is based on one originally written by Colin Wilson.
"""
hnfRules = OrderedDict()
branchSyms = []
b = re.compile(r'.*\[\d+\]')
goodInput = True
for lhs in self.cnfRules.keys():
if re.match(b, lhs) != None:
goodInput = False
bracketIndex = 1
for rhs in self.cnfRules[lhs]:
degree = len(rhs)
if degree >= 1:
newSym = lhs +'['+ str(bracketIndex) +']'
if not newSym in branchSyms:
branchSyms.append(newSym)
if self.isPcfg and isinstance(rhs[0], float):
hnfRules.setdefault(lhs, []).append([rhs[0], newSym])
hnfRules.setdefault(newSym, []).append(rhs[1:])
else:
hnfRules.setdefault(lhs, []).append([newSym])
hnfRules.setdefault(newSym, []).append(rhs[:])
bracketIndex += 1
else:
hnfRules[lhs].append(rhs)
if not goodInput:
sys.exit('Error: HNF-style bracketing detected. If you meant to enter your rules in HNF, ' + \
'you must pass parameter \'True\' to setRules(); otherwise, you should avoid using HNF-style ' + \
'bracketing structure in your nodes.')
return hnfRules, branchSyms
def hnfToCnf(self):
"""
Convert rules in Harmonic Normal Form to "Conventional Normal Form." (In other
words, remove intermediate bracketed symbols.) Must be called after self.hnfRules
has been created.
"""
cnfRules = OrderedDict()
branchSyms = []
b = re.compile(r'.*\[\d+\]')
goodInput = False
for lhs in self.hnfRules.keys():
if re.match(b, lhs) != None:
branchSyms.append(lhs)
newLhs = lhs.split('[')[0]
for rhs in self.hnfRules[lhs]:
cnfRules.setdefault(newLhs, []).append(rhs[:])
goodInput = True
if not goodInput:
sys.exit('Error: HNF not detected. If you did not intend to use HNF, do not pass parameter ' + \
'\'True\' to setRules().')
return cnfRules, branchSyms
def hnfTreeToState(self, inputString):
"""
Use grammar to convert a tree in HNF (input as text) to a state based on the current
network settings.
"""
if not self.networkInfoIsSet:
self.setNetworkInfo()
currentTree = Tree(inputString)
frBindings = currentTree.getFRbindings()
byRole = {}
for binding in frBindings:
if binding[0] not in self.fillerNames:
sys.exit('Error: Invalid filler (' + binding[0] + ').')
if binding[1] not in self.roleNames:
sys.exit('Error: Invalid role (' + binding[1] + ').')
byRole[binding[1]] = binding[0]
state = []
for role in self.roleNames:
if role not in byRole:
if self.padWithNulls:
sys.exit('Error: Role ' + role + ' not in tree. Check that \n(a) you entered ' + \
'your tree in HNF, \n(b) null elements in the tree are explicitly represented with ' + \
'the null symbol (' + self.nullSymbol + '), and \n(b) your tree is licensed by the ' + \
'following grammar:\n\n' + self.hnfRulesToString())
else:
sys.exit('Error: Role ' + role + ' not in tree. Check that \n(a) you entered ' + \
'your tree in HNF, and \n(b) your tree is licensed by the following grammar:\n\n' + \
self.hnfRulesToString())
thisRole = [0] * len(self.fillerNames)
thisRole[self.fillerNames.index(byRole[role])] = 1
state += thisRole
return np.transpose(np.array(state))
def stateToString(self, state):
"""
Convert state stored as a column vector to a string of 1s and 0s.
"""
returnString = np.array_str(state)
returnString = returnString[1:-1]
returnString = returnString.replace(' ','')
returnString = returnString.replace('.','')
returnString = returnString.replace('\n','')
return returnString
def setRemainingProbabilities(self):
"""
Fix rules such that if this is a PCFG, all rules that can expand to more than
one right-hand side are given an equal probability to expand to each of those right-hand
sides (asically, divide up remaining probability). Make sense?
"""
rulesSet = [self.cnfRules, self.hnfRules]
for rules in rulesSet:
for lhs in rules.keys():
wholeRhs = rules[lhs]
currentProbValues = []
noProbIndices = []
probToDistribute = 0
for rhs in wholeRhs:
if isinstance(rhs[0], float):
if rhs[0] < 0 or rhs[0] > 1:
sys.exit('Error: Invalid probability (' + str(rules[lhs][0]) + ').')
else:
currentProbValues.append(rhs[0])
else:
noProbIndices.append(wholeRhs.index(rhs))
if len(noProbIndices) != 0:
probToDivide = 1 - sum(currentProbValues)
probToDistribute = probToDivide / len(noProbIndices)
for index in noProbIndices:
wholeRhs[index].insert(0, probToDistribute)
if np.abs(1 - (len(noProbIndices) * probToDistribute + sum(currentProbValues))) > 0.02:
sys.exit('Error: Probabilities do not sum to 1 (check rules beginning with ' + lhs + ')')
return rulesSet[0], rulesSet[1]
def adjustBiases(self):
"""
Adds values from self.biasAdjustments to self.biasVector to influence probability.
Here, the probability difference is simply added to the most likely structure.
"""
for adjustment in self.biasAdjustments:
if adjustment[1] > 0 and adjustment[1] < self.INF:
frBinding = adjustment[0][0] + '/' + adjustment[0][1]
self.biasVector_byFiller[self.allFRbindings.index(frBinding)] += adjustment[1]
self.biasVector_byRole[self.allFRbindings.index(frBinding)] += adjustment[1]
def computeProb(self, tree, T=1):
"""
Compute probability of tree (input as text), using all gridpoints as the sample
space and T = 1 by default.
"""
if not self.zIsSet:
self.computeZ()
return np.exp(self.getHarmony(tree) / T) / self.z
def computeZ(self, T=1):
"""
Computes sum_i (exp(H(tree_i)/T)) for T = 1
"""
if not self.allHarmoniesAreSet:
self.setAllHarmonies()
allHarmonies = np.array(list(self.allHarmonies.values()))
self.z = np.exp(allHarmonies / T).sum()
self.zIsSet = True
def setAllGridpoints(self):
"""
Generate all gridpoints and evaluate their Harmony.
"""
if not self.networkInfoIsSet:
self.setNetworkInfo()
nR = len(self.roleNames)
nF = len(self.fillerNames)
allPoints = []
for i in range(len(self.fillerNames)):
currentPoint = [0] * len(self.fillerNames)
currentPoint[i] = 1
allPoints.append(currentPoint)
allGridsList = list(product(allPoints, repeat=len(self.roleNames)))
allGridsMat = np.zeros(shape=(nR * nF, len(allGridsList)))
for i in range(len(allGridsList)):
gridCol = np.transpose(np.reshape(np.array(allGridsList[i]), (nR * nF, 1)))
allGridsMat[:,i] = gridCol
self.allGridpoints = allGridsMat
self.allGridpointsAreSet = True
def setAllHarmonies(self):
"""
Store the harmony for every gridpoint because, why not? This may take a while to run,
but will only need to be run once per grammar.
"""
if not self.allGridpointsAreSet:
self.setAllGridpoints()
nGrids = self.allGridpoints.shape[1]
allHarmonies = {}
for i in range(nGrids):
stateKey = self.stateToString(self.allGridpoints[:,i])
allHarmonies[stateKey] = self.getHarmony(self.allGridpoints[:,i])
self.allHarmonies = allHarmonies
self.allHarmoniesAreSet = True
def setHarmonicGrammarRules(self, maxDepth=6, useHnf=True, addNullFillers=False, nullSymbol='_'):
"""
Create Harmonic Grammar rules based on the CFG passed as parameter
ruleDictionary.
"""
self.maxDepth = maxDepth
self.useHnf = useHnf
self.needNullFiller = addNullFillers
if self.useHnf:
ruleSet = self.hnfRules
else:
ruleSet = self.cnfRules
start = (self.getRootNode(ruleSet), 'r')
if addNullFillers:
self.nullSymbol = nullSymbol
self.nullPaddedRules = self.padWithNulls(ruleSet, self.nullSymbol)
ruleSet = self.nullPaddedRules
hgWeights = []
hgBiases = []
self.biasAdjustments = []
self.hgWeights, self.hgBiases = self.expandHGrules(start, ruleSet, hgWeights, hgBiases)
self.hgWeights, self.hgBiases = self.sortHGrules(self.hgWeights, self.hgBiases)
self.hgRulesAreSet = True
def padWithNulls(self, ruleSet, nullSymbol):
"""
"Symmetrizes" the CFG grammar in ruleSet by padding projections with null symbols.
For example, given the grammar
S -> A; S -> B B,
this function creates
S -> A _; S -> B B
Note that after this function is run, all parent nodes have the same number of children.
"""
maxN = 0
for lhs in ruleSet.keys():
for rhs in ruleSet[lhs]:
currentN = len(rhs)
if currentN > maxN:
maxN = currentN
for lhs in ruleSet.keys():
if lhs in self.branchSyms:
for rhs in ruleSet[lhs]:
while len(rhs) < maxN:
rhs.append(self.nullSymbol)
return ruleSet
def expandHGrules(self, parent, ruleSet, hgWeights, hgBiases):
"""
Recursive function to find Harmonic Grammar rules. Stops when all possible
paths through the CFG are explored, or the maximum depth is reached.
Right now, biases are added to roles only, not filler/role bindings.
"""
if len(parent[1]) < self.maxDepth:
if parent[0] in ruleSet.keys():
for rhs in ruleSet[parent[0]]:
harmonyDiff = 0
if self.isPcfg:
temp = rhs[1:]
harmonyDiff = np.log(rhs[0]) - np.log(1 - rhs[0])
else:
temp = rhs[:]
if parent[1] == 'r':
hgBiases.append([parent[0], parent[1], -(len(temp))])
else:
hgBiases.append([parent[0], parent[1], -(len(temp) + 1)])
childLevel = '0' + parent[1]
for childSymbol in temp:
self.biasAdjustments.append([[childSymbol, childLevel], harmonyDiff])
hgWeights.append([(parent[0], parent[1]), (childSymbol, childLevel), 2])
hgWeights, hgBiases = \
self.expandHGrules((childSymbol, childLevel), ruleSet, hgWeights, hgBiases)
childLevel = str(int(childLevel[0]) + 1) + childLevel[1:]
else:
hgBiases.append([parent[0], parent[1], -1])
else:
hgBiases.append([parent[0], parent[1], -1])
return hgWeights, hgBiases
def sortHGrules(self, hgWeights, hgBiases):
"""
This is pretty sloppy. So it will remain until we come up with a a more
clever data structure to store the HG rules (low priority right now).
"""
needSwapped = True
while needSwapped:
needSwapped = False
for i in range(len(hgWeights) - 1):
if len(hgWeights[i][0][1]) > len(hgWeights[i+1][0][1]):
temp = hgWeights[i]
hgWeights[i] = hgWeights[i+1]
hgWeights[i+1] = temp
needSwapped = True
hgWeightsNoDuplicates = []
seen = set()
for weight in hgWeights:
stringWeight = ''.join([weight[0][0], weight[0][1], weight[1][0], weight[1][1]])
if stringWeight not in seen:
hgWeightsNoDuplicates.append(weight)
seen.add(stringWeight)
needSwapped = True
while needSwapped:
needSwapped = False
for i in range(len(hgBiases) - 1):
if len(hgBiases[i][1]) > len(hgBiases[i+1][1]):
temp = hgBiases[i]
hgBiases[i] = hgBiases[i+1]
hgBiases[i+1] = temp
needSwapped = True
hgBiasesNoDuplicates = []
seen = set()
for biasPair in hgBiases:
if (biasPair[0], biasPair[1]) not in seen:
hgBiasesNoDuplicates.append(biasPair)
seen.add((biasPair[0], biasPair[1]))
return hgWeightsNoDuplicates, hgBiasesNoDuplicates
def setNetworkInfo(self, biasByFiller=True):
"""
Set the role names, filler names, weight matrix, and bias vector for
this HNF grammar.
"""
if not self.hgRulesAreSet:
self.setHarmonicGrammarRules()
fillerNames = []
roleNames = []
for i in range(len(self.hgBiases)):
if self.hgBiases[i][0] not in fillerNames:
fillerNames.append(self.hgBiases[i][0])
if self.hgBiases[i][1] not in roleNames:
roleNames.append(self.hgBiases[i][1])
if self.needNullFiller and self.nullSymbol not in fillerNames:
fillerNames.append(self.nullSymbol)
allFRbindings = []
for i in range(len(roleNames)):
for j in range(len(fillerNames)):
allFRbindings.append(fillerNames[j] + '/'+ roleNames[i])
weightMatrix = np.zeros((len(allFRbindings), len(allFRbindings)))
for i in range(len(self.hgWeights)):
index1 = allFRbindings.index(self.hgWeights[i][0][0] + '/' + self.hgWeights[i][0][1])
index2 = allFRbindings.index(self.hgWeights[i][1][0] + '/' + self.hgWeights[i][1][1])
weightMatrix[index1, index2] = self.hgWeights[i][2]
weightMatrix[index2, index1] = self.hgWeights[i][2]
biasVector_byFiller = np.zeros(len(allFRbindings))
biasVector_byRole = np.zeros(len(allFRbindings))
for i in range(len(self.hgBiases)):
currentFiller = self.hgBiases[i][0]
currentRole = self.hgBiases[i][1]
currentBias = self.hgBiases[i][2]
for j in range(len(allFRbindings)):
if allFRbindings[j][:len(currentFiller)] == currentFiller:
biasVector_byFiller[j] = currentBias
if allFRbindings[j][-len(currentRole):] == currentRole:
biasVector_byRole[j] = currentBias
self.roleNames = roleNames
self.fillerNames = fillerNames
self.weightMatrix = weightMatrix
self.biasVector_byFiller = biasVector_byFiller
self.biasVector_byRole = biasVector_byRole
self.allFRbindings = allFRbindings
self.networkInfoIsSet = True
if biasByFiller:
self.defaultBiasVector = biasVector_byFiller
else:
self.defaultBiasVector = biasVector_byRole
if self.isPcfg:
self.adjustBiases()
def getHarmony(self, state):
"""
Calculate harmony of state input as column vector.
"""
if not self.networkInfoIsSet:
self.setNetworkInfo();
if isinstance(state, str):
stateVector = self.hnfTreeToState(state)
else:
stateVector = state
hWeight = np.dot(np.dot(np.transpose(stateVector), self.weightMatrix), stateVector)
hBias = np.dot(np.transpose(stateVector), self.defaultBiasVector)
return ((0.5 * hWeight) + hBias)
def getRootNode(self, ruleSet):
"""
Given a dictionary of rules, find the first possible root node.
"""
return list(ruleSet.keys())[0]
def getTerminalNodes(self, ruleSet):
"""
Given a dictionary of rules, find all terminal symbols.
"""
terminals = []
rhSides = []
rhSides = ruleSet.keys()
terminals = []
for rhs in rhSides:
for lhs in ruleSet[rhs]:
for node in lhs:
if node not in rhSides and node not in terminals:
terminals.append(node)
return terminals
def getNetworkInfo(self, biasByFiller=True):
"""
Get the information needed to create a weight matrix for use in neural network
computation.
"""
if not self.networkInfoIsSet:
self.setNetworkInfo()
return self.roleNames, self.fillerNames, self.weightMatrix, self.defaultBiasVector
def getWeight(self, binding1, binding2):
"""
Get specific weight from weight matrix.
"""
binding1_isValid = binding1 in self.allFRbindings
binding2_isValid = binding2 in self.allFRbindings
if not binding1_isValid and not binding2_isValid:
sys.exit('Error: \'' + binding1 + '\' and \'' + binding2 + '\' are not valid filler/role bindings.')
elif not binding1_isValid:
sys.exit('Error: \'' + binding1 + '\' is not a valid filler/role binding.')
elif not binding2_isValid:
sys.exit('Error: \'' + binding2 + '\' is not a valid filler/role binding.')
else:
return self.weightMatrix[self.allFRbindings.index(binding1), self.allFRbindings.index(binding2)]
def getBias(self, binding):
"""
Get specific bias from bias vector.
"""
bindingIsValid = binding in self.allFRbindings
if not bindingIsValid:
sys.exit('Error: \'' + binding + '\' is not a valid filler/role binding.')
else:
return self.defaultBiasVector[self.allFRbindings.index(binding), 0]
def cnfRulesToString(self):
"""
Gets a pretty string for the CNF rules.
"""
nRules = 0
for lhs in self.cnfRules.keys():
nRules += len(self.cnfRules[lhs])
nStringified = 0
returnString = '{'
for lhs in self.cnfRules.keys():
for rhs in self.cnfRules[lhs]:
if nStringified != 0:
returnString += ' '
returnString += lhs + ' -> '
for i in range(len(rhs)):
if self.isPcfg and i == 0:
returnString += str(rhs[i])
else:
returnString += rhs[i]
if i != len(rhs) - 1:
returnString += ' '
if nStringified != nRules - 1:
returnString += '; \n'
nStringified += 1
returnString += '}'
return returnString
def hnfRulesToString(self):
"""
Gets a pretty string for the HNF rules.
"""
nRules = 0
for lhs in self.hnfRules.keys():
nRules += len(self.hnfRules[lhs])
nStringified = 0
returnString = '{'
for lhs in self.hnfRules.keys():
for rhs in self.hnfRules[lhs]:
if nStringified != 0:
returnString += ' '
returnString += lhs + ' -> '
for i in range(len(rhs)):
if self.isPcfg and i == 0:
returnString += str(rhs[i])
else:
returnString += rhs[i]
if i != len(rhs) - 1:
returnString += ' '
if nStringified != nRules - 1:
returnString += '; \n'
nStringified += 1
returnString += '}'
return returnString
def hgWeightsToString(self):
"""
Gets a pretty string for the HG weights.
"""
if not self.hgRulesAreSet:
self.setHarmonicGrammarRules()
returnString = '{'
for i in range(len(self.hgWeights)):
if i != 0:
returnString += ' '
returnString += '[(' + self.hgWeights[i][0][0] + '/'+ self.hgWeights[i][0][1] \
+ ', ' + self.hgWeights[i][1][0] + '/' + self.hgWeights[i][1][1] \
+ '), ' + str(self.hgWeights[i][-1]) + ']'
if i != len(self.hgWeights) - 1:
returnString += '; \n'
returnString += '}'
return returnString
def hgBiasesToString(self):
"""
Gets a pretty string for the HG biases.
"""
if not self.hgRulesAreSet:
self.setHarmonicGrammarRules()
returnString = '{'
for i in range(len(self.hgBiases)):
if i != 0:
returnString += ' '
returnString += '[' + self.hgBiases[i][0] + ', ' + str(self.hgBiases[i][1]) + ']'
if i != len(self.hgBiases) - 1:
returnString += '; \n'
returnString += '}'
return returnString
def hgRulesToString(self):
"""
Concatenates the HG weights and biases and gets a pretty string for them.
"""
if not self.networkInfoIsSet:
self.setNetworkInfo()
returnString = '{'
for i in range(len(self.hgWeights)):
if i != 0:
returnString += ' '
returnString += '[(' + self.hgWeights[i][0][0] + '/'+ self.hgWeights[i][0][1] \
+ ', ' + self.hgWeights[i][1][0] + '/' + self.hgWeights[i][1][1] \
+ '), ' + str(self.hgWeights[i][-1]) + '];\n'
for i in range(len(self.hgBiases)):
returnString += ' [' + self.hgBiases[i][0] + ', ' + str(self.hgBiases[i][1]) + ']'
if i != len(self.hgBiases) - 1:
returnString += '; \n'
returnString += '}'
if self.isPcfg:
adjustmentsToPrint = []
for adjustment in self.biasAdjustments:
if adjustment[1] > 0 and adjustment[1] < self.INF:
adjustmentsToPrint.append(adjustment)
returnString += ';\n{'
for i in range(len(adjustmentsToPrint)):
if i != 0:
returnString += ' '
returnString += '[' + adjustmentsToPrint[i][0][0] + '/' + adjustmentsToPrint[i][0][1] + ', ' + \
str(adjustmentsToPrint[i][1]) + ']'
if i != len(adjustmentsToPrint) - 1:
returnString += '; \n'
returnString += '}'
return returnString
def biasVectorToString(self):
"""
Gets a pretty string for the bias vector.
"""
returnString = ''
for i in range(len(self.allFRbindings)):
returnString += self.allFRbindings[i] + ', ' + str(self.defaultBiasVector[i]) + '\n'
return returnString