This repository was archived by the owner on Jun 6, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathFSChartParser.html
More file actions
560 lines (533 loc) · 50 KB
/
FSChartParser.html
File metadata and controls
560 lines (533 loc) · 50 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
<!doctype html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<html><head><title>Python: module FSChartParser</title>
<style type="text/css"><!--
TT { font-family: lucidatypewriter, lucida console, courier }
--></style></head><body bgcolor="#f0f0f8">
<table width="100%" cellspacing=0 cellpadding=2 border=0 summary="heading">
<tr bgcolor="#7799ee">
<td valign=bottom> <br>
<font color="#ffffff" face="helvetica, arial"> <br><big><big><strong>FSChartParser</strong></big></big></font></td
><td align=right valign=bottom
><font color="#ffffff" face="helvetica, arial"><a href=".">index</a><br><a href="file:///C|/documents%20and%20settings/oliver%20steele/my%20documents/projects/fsa/fschartparser.py">c:\documents and settings\oliver steele\my documents\projects\fsa\fschartparser.py</a></font></td></tr></table>
<p><tt># Module FSChartParser -- finite-state chart parser and grammar compilation utilities"""ChartParser is a chart parser that uses finite-state automata torecognize grammar rules.ChartParser is initialized with a grammar, represented as a list ofrules. Each rule is either a categorizing automaton (defined below),or a tuple (lhs, automaton), where each lhs is a category and eachautomaton recognizes a language over terminals, and nonterminals. Inthe latter case, the tuple is compiled to a categorizing automaton.A categorizing automaton is an automaton which also maps each finalstate to a list of categories, which index the languages thatcategorize a sequence that leads to that final state. A categorizingautomaton can be used to simultaneously apply a number of regulargrammars to a single input sequence, and categorize each subsequenceaccording to each grammar. Categorizing automata are represented byinstances of class CategorizingAutomaton, and created bycompileCategorizingAutomaton, which takes a list of (lhs, automaton)rules and constructs a single categorizing automaton which categorizesinputs according to all the rules simultaneously.The chart parser operates on instances of Constituent, which has acategory, a start index, an end index, and a list of children, whichare also constituents.Example-------- >>> RULES = map(lambda (lhs, rhs):(lhs, FSA.compileRE(rhs, multichar=1)), [ ('S', 'NP VP'), ('NP', "det? 
adj* noun+"), ('NP', 'noun of noun'), ('VP', 'verb NP')]) >>> parser = ChartParser(compileRules(RULES)) >>> print parser.parseString('noun verb noun', multichar=1).constituents() [S[NP[noun] VP[verb NP[noun]]]] >>> print parser.parseString('det adj noun noun verb adj noun', multichar=1).constituents() [S[NP[det adj noun noun] VP[verb NP[adj noun]]]] >>> parser = ChartParser(compileRules(RULES, optimize=1)) >>> print parser.parseString('noun verb noun', multichar=1) [S[NP[noun] VP[verb NP[noun]]]]"""__author__ = "Oliver Steele", 'steele@osteele.com'import FSAfrom types import ListType, StringType, TupleTypeclass ChartParser: TRACE = 0 TRACE_MATCHES = 0 TRACE_CONSTITUENTS = 0 # # Initialization # def __init__(self, rules): """Each rule is a sequence of f, fsa; where f is applied to (fsa, state, children, start, end) to create a constituent or None, and fsa is a finite-state automaton as defined in FSA.""" self.<strong>automata</strong> = rules self.<strong>firstConstituentsOnly</strong> = 1 def initializeChart(self, n): self.<strong>constituentMaps</strong> = map(lambda n:{}, range(n)) self.<strong>edges</strong> = map(lambda n:[], range(n + 1)) #for automaton in self.<strong>automata</strong>: # for i in range(n): # addEdge((automaton, automaton.initialState, i, i, [])) # # Parsing # def parse(self, tokens): """Preterminals should be a sequence of tokens. Parse it. If there's a set of spanning parses whose categories match the categories in the first grammar rule, return them. For more general queries (partial parses), use the query methods on the parser.""" self.<strong>tokens</strong> = tokens initializeChart(len(tokens)) self.<strong>finalIndex</strong> = len(tokens) for i in range(len(tokens)): for automaton in self.<strong>automata</strong>: addEdge(automaton, automaton.initialState, i, i, []) return self def parseString(self, sentence, multichar=0): """Parse the string argument, with one letter per preterminal constituent. 
If multichar is true, the string is split at whitespace and the intervening tokens are used as preterminals instead.""" if multichar: import string tokens = string.split(sentence) else: tokens = sentence return parse(tokens) # # Queries # def constituents(self, categories=None, complete=0): """Return a list of all the non-preterminal constituents in the chart. If categories is not false, include only constituents with a category in categories.""" results = [] for constituentMap in self.<strong>constituentMaps</strong>: constituents = constituentMap.values() if categories: constituents = filter(lambda c,cats=categories:c.category in cats, constituents) constituents.sort(lambda a, b:-cmp(a.length(), b.length())) results.extend(constituents) if complete: results = filter(lambda c, final=self.<strong>finalIndex</strong>: c.start == 0 and c.end == final, results) return results def constituentsAt(self, index): """Return a list of constituents at index.""" return filter(self.<strong>constituentMaps</strong>[index].values()) def longestDisjointConstituents(self, categories=None): """Return a sequence of disjoint constituents such that each constituent is one of the longest constituents at its index position. 
The algorithm is myopic and locally greedy: it won't choose a shorter constituent in order to get the whole sequence to come out longer, and it won't even choose between two constituents of equal length in order to get a longer overall sequence.""" results = [] index = 0 while index < len(self.<strong>constituentMaps</strong>): constituents = constituentsAt(index) if categories: constituents = filter(lambda c,cats=categories:c.category in cats, constituents) if constituents: constituents.sort(lambda a, b:-cmp(a.end, b.end)) best = constituents[0] results.append(best) index = best.end else: index = index + 1 return results # # Chart manipulation # def addConstituent(self, constituent, start, end): if self.<strong>TRACE</strong> or self.<strong>TRACE_CONSTITUENTS</strong>: print 'adding', constituent if self.<strong>firstConstituentsOnly</strong>: key = constituent.category, start, end if self.<strong>constituentMaps</strong>[start].get(key): return self.<strong>constituentMaps</strong>[start][key] = constituent for (automaton, state, left, right, children) in self.<strong>edges</strong>[start]: assert right == start successors = automaton.nextStates(state, constituent) if successors and self.<strong>TRACE_MATCHES</strong>: print '\tmatched:' print '\t\t', automaton.atStateString(state), '->' elif self.<strong>TRACE</strong>: if successors: print '\tmatched: \n\t\t%s -> %s' % (automaton.atStateString(state), successors) else: assert automaton.atStateString(state) print '\tdidn\'t match: \n\t\t%s' % (automaton.atStateString(state)) for successor in successors: if self.<strong>TRACE_MATCHES</strong>: print '\t\t', automaton.atStateString(successor) addEdge(automaton, successor, left, end, children + [constituent]) def addEdge(self, automaton, state, start, end, children): if automaton.stateMatchesConstituents(state): edge = automaton, state, start, end, children self.<strong>edges</strong>[end].append(edge) for category in automaton.getStateCategories(state): 
constituent = Constituent(category, children, start, end) try: constituent.source = automaton except AttributeError: pass addConstituent(constituent, start, end) if end < len(self.<strong>tokens</strong>): token = self.<strong>tokens</strong>[end] constituents = self.<strong>constituentMaps</strong>[end].values() for successor in automaton.nextStates(state, token): addEdge(automaton, successor, start, end + 1, children + [token]) for constituent in constituents: for successor in automaton.nextStates(state, constituent): addEdge(automaton, successor, start, constituent.end, children + [constituent]) # # Presentation # def toDotString(self, includeActiveEdges=0): """Returns a string that can be printed by the DOT tool at <a href="http://www.research.att.com/sw/tools/graphviz/">http://www.research.att.com/sw/tools/graphviz/</a> .""" import string output = [] output.append('digraph finite_state_machine {'); output.append('\t0 [style = bold];' ) output.append('\tnode [shape = doublecircle]; ' + `self.<strong>finalIndex</strong>` + ';' ); output.append('\tnode [shape = circle];'); output.append('\trankdir=LR;'); if includeActiveEdges: for edges in self.<strong>edges</strong>: for (automaton, state, start, end, children) in edges: label = string.replace(automaton.atStateString(state, wrap=20), '\n', '\\n') output.append('\t%s -> %s [style=dotted,label="%s"];' % (start, end, label) ) for constituents in self.<strong>constituentMaps</strong>: items = map(lambda c:(c.start, c.end, c), constituents.values()) items.sort() for item in items: output.append('\t%s -> %s [label="%s"];' % item) output.append('}'); return string.join(output, '\n') def view(self, includeActiveEdges=1): FSA.view(toDotString(includeActiveEdges=includeActiveEdges))## Chart classes#class Constituent: def __init__(self, category, children, start, end): self.<strong>category</strong> = category self.<strong>children</strong> = children self.<strong>start</strong> = start self.<strong>end</strong> = end def 
leaves(self): if self.<strong>children</strong>: leaves = [] for child in self.<strong>children</strong>: if hasattr(child, 'leaves'): leaves.extend(child.leaves()) else: leaves.append(child) return leaves else: return [self] def __repr__(self): import string if self.<strong>children</strong>: def tokenStr(token): return (hasattr(token, 'token') and str(token.token)) or str(token) if 1: #flatten the printed representation return self.<strong>category</strong> + '[' + string.join(map(tokenStr, leaves())) + ']' else: return self.<strong>category</strong> + '[' + string.join(map(str, self.<strong>children</strong> or []), ' ') + ']' else: return self.<strong>category</strong> def length(self): return self.<strong>end</strong> - self.<strong>start</strong>## Class CategorizingAutomaton#class CategorizingAutomaton(FSA.FSA): """A categorizing automaton is a finite-state automaton that additionally maps each final state into a set of categories. A categorizing automaton can be used to simultaneously recognize and categorize sequences according to a number of languages.""" def __init__(*args, **keys): apply(FSA.FSA.__init__, args, keys) self = args[0] setStateCategoriesMapping({}) def coerce(self, klass): coercion = FSA.FSA.coerce(self, klass) coercion.setStateCategoriesMapping(getStateCategoriesMapping()) return coercion # # Predicates # isCategorizingAutomaton = 1 # # State categories # def categories(self): """Return a list of categories that this automaton will categorize into.""" categories = [] for set in getStateCategoriesMapping().values(): for category in set: if category not in categories: categories.append(category) return categories def getStateCategories(self, state): return self.<strong>stateCategories</strong>[state] def addStateCategory(self, state, category): categories = self.<strong>stateCategories</strong>[state] if category not in categories: self.<strong>stateCategories</strong>[state] = categories + [category] def getStateCategoriesMapping(self): 
mapping = {} for state in self.<strong>states</strong>: mapping[state] = getStateCategories(state) return mapping def setStateCategoriesMapping(self, mapping): self.<strong>stateCategories</strong> = makeStateTable([]) for state, categories in mapping.items(): self.<strong>stateCategories</strong>[state] = categories def setFinalCategory(self, category): """Set all the final states to categorize to this category.""" self.<strong>stateCategories</strong> = makeStateTable([]) for state in self.<strong>finalStates</strong>: addStateCategory(state, category) # # Accessors # def computeStateMatchesConstituents(self, state): return 1 def stateMatchesConstituents(self, state): try: isConstituentTestMap = self.<strong>isConstituentTestMap</strong> except AttributeError: isConstituentTestMap = [None] * (reduce(max, self.<strong>states</strong>) + 1) for state in self.<strong>states</strong>: isConstituentTestMap[state] = computeStateMatchesConstituents(state) self.<strong>isConstituentTestMap</strong> = isConstituentTestMap return isConstituentTestMap[state] # # Presentation template overrides # def additionalTransitionInfoString(self, transition): result = FSA.FSA.additionalTransitionInfoString(self, transition) categories = getStateCategories(transition[1]) if categories: import string result = (result and result + ' ' or '') + `categories`#string.join(map(str, categories), ', ') return result def stateLabelString(self, state): # overrides the method in FSA, to include categorizing states in dot diagrams if categoriesFor(state): import string return `state` + '\n' + string.join(map(str, categoriesFor(state)), ', ') def atStateString(self, state, wrap=None): try: import REUtils str = REUtils.decompileRE(self, dottedStates=[state], wrap=wrap, sep=tokenSeparator()) except ImportError: str = "%s @ %s" % (self, state) if len(categories()) == 1: str = '%s => %s' % (categories()[0], str) return str def tokenSeparator(self): return ' ' # # Conversion # def toFSA(self, 
labelConstructor=lambda s:'=>' + s): """Return an FSA that corresponds to the categorizing automaton that is the argument, except that final state categories have been replaced by transitions labeled with a transformation of those categories.""" states, alphabet, transitions, initial, finals = tuple() newFinal = nextAvailableState() transitions = transitions[:] for state, categories in getStateCategoriesMapping().items(): for category in categories: transitions.append((state, newFinal, labelConstructor(category))) return copy(states + [newFinal], alphabet, transitions, initial, [newFinal]) # # Decision Functions # def buildDecisionTree(self, pairs): if pairs: test, state = pairs[0] if test.isUnconditional(): return (None, state, state) term = test.terms()[0] complement = term.complement() positives, negatives = [], [] for test, state in pairs: if term in test.terms(): positives.append((test.build(filter(lambda x, term=term:x != term, test.terms())), state)) else: negatives.append((test.build(filter(lambda x, term=complement:x != term, test.terms())), state)) return (term, buildDecisionTree(positives), buildDecisionTree(negatives)) def buildDecisionTreeDecider(self, pairs): def decisionTreeDecider(constituent, tree=buildDecisionTree(pairs)): while tree: test, positive, negative = tree if test: if test.matches(constituent): tree = positive else: tree = negative else: return positive return decisionTreeDecider def buildSerialDecider(self, pairs): if pairs: def serialDecider(constituent, pairs=pairs): for test, state in pairs: if test.matches(constituent): return state return serialDecider else: return lambda constituent:None def buildDecisionFunctions(self): assert getattr(self, '_isDeterminized', 0) decisionFunctions = [None] * (reduce(max, self.<strong>states</strong>) + 1) for state in self.<strong>states</strong>: decisionFunctions[state] = buildDecisionFunction(state) self.<strong>decisionFunctions</strong> = decisionFunctions self.<strong>nextState</strong> = 
self.<strong>nextStateUsingDecisionFunctions</strong> self.<strong>nextStates</strong> = self.<strong>nextStatesUsingDecisionFunctions</strong> def nextStateUsingDecisionFunctions(self, state, input): successor = self.<strong>decisionFunctions</strong>[state](input) return successor def nextStatesUsingDecisionFunctions(self, state, input): successor = self.<strong>decisionFunctions</strong>[state](input) return successor is not None and [successor] or [] # # Accepting # def labelMatches(self, label, constituent): """Override the implementation in FSA, so that strings can be used as labels that match the constituent's categories.""" if type(label) == StringType: return label == constituent or hasattr(constituent, 'category') and label == constituent.category else: return FSA.FSA.labelMatches(self, label, constituent)## Grammar compilation#def compileRule(rule, defaultCategory='S'): if getattr(rule, 'isCategorizingAutomaton', 0): automaton = rule elif getattr(rule, 'isFSA', 0): automaton = rule.coerce(CategorizingAutomaton) automaton.setFinalCategory(defaultCategory) elif type(rule) == TupleType: lhs, rhs = rule if type(rhs) == ListType: rhs = FSA.sequence(rhs) automaton = rhs.coerce(CategorizingAutomaton) automaton.setFinalCategory(lhs) else: raise 'rule must be a (lhs, automaton) or an automaton' return automatondef compileRules(rules, optimize=0, labelConstructor=None): # Rules is either a list of CategorizingFSAs or (lhs, rhs) pairs, where # each rhs is either a list or an automaton. Turn each pair into a # CategorizingAutomaton by coercing it and setting the categories of its # final states to the lhs. automata = map(compileRule, rules) if optimize: automata = [combineRules(rules)] return automatadef combineRules(rules, labelConstructor=None): """Create a categorizing automaton from a list of rules. Each rule is a tuple (lhs, rhs), where lhs is the category for sequences recognized by rhs, which is an automaton. 
lhsLabelConstructor is an expression that converts a category into a label that can be intersected with the labels in the rule automata (the intersection of a category label with any rhs automaton label or with any other category label should be None); it defaults to a function that turns the category 'C' into '=>C'.""" lhsMap = {} def construct(rule, labelConstructor=labelConstructor or (lambda s:'=>' + s), lhsMap=lhsMap): automaton = compileRule(rule) for category in automaton.categories(): lhsMap[labelConstructor(category)] = category return automaton.toFSA(labelConstructor=labelConstructor) automata = map(construct , rules) fsa = apply(FSA.union, automata).minimized() states0, alpha, transitions0, initial, finals0 = fsa.tuple() transitions = filter(lambda (s0,s1,label), f=lhsMap.get: not f(label), transitions0) finalTransitions= filter(lambda (s0,s1,label), f=lhsMap.get: f(label), transitions0) states = [] for s0, s1, _ in transitions: if s0 not in states: states.append(s0) if s1 not in states: states.append(s1) finals = map(lambda (s0,s1,_): s0, finalTransitions) fsa = automata[0].copy(states, alpha, transitions, initial, finals) fsa._isDeterminized = 1 for state, _, label in finalTransitions: fsa.addStateCategory(state, lhsMap[label]) return fsa"""RULES = map(lambda (lhs, rhs):(lhs, FSA.compileRE(rhs, multichar=1)), [ ('S', 'NP VP'), ('NP', "det? adj* noun+"), ('NP', 'noun of noun'), ('VP', 'verb NP')])parser = ChartParser(compileRules(RULES))print parser.parseString('noun verb noun', multichar=1).constituents(complete=1)parser = ChartParser(compileRules(RULES, optimize=1))print parser.parseString('noun verb noun', multichar=1).constituents(complete=1)print parser.parseString('det adj noun noun verb adj noun', multichar=1).constituents(complete=1)print parser.toDotString()print parser.toDotString(includeActiveEdges=1)p.view()"""</tt></p>
<table width="100%" cellspacing=0 cellpadding=2 border=0 summary="section">
<tr bgcolor="#55aa55">
<td colspan=3 valign=bottom> <br>
<font color="#ffffff" face="helvetica, arial"><big><strong>Data</strong></big></font></td></tr>
<tr><td bgcolor="#55aa55"><tt> </tt></td><td> </td>
<td width="100%"><strong>__file__</strong> = r'.\FSChartParser.py'<br>
<strong>__name__</strong> = 'FSChartParser'</td></tr></table>
</body></html>