Modify the "apply" method of NLTK's CachedTopDownPredictRule (lines 33-34). The original implementation skips any production whose first right-hand-side symbol is a terminal when the edge's end index is at or beyond the number of input tokens (index >= chart.num_leaves()). The check is relaxed here so that such productions are still predicted at the end of the input, which lets us predict the next terminal symbol.
March 1, 2017 · View on GitHub
class CachedTopDownPredictRule(TopDownPredictRule):
    """
    Memoizing variant of ``TopDownPredictRule``.

    Each ``(next symbol, end index)`` pair is expanded at most once per
    chart/grammar pair: once the rule has run to completion for such a
    pair, later edges sharing it yield no further edges.  A cache entry
    is honoured only while ``chart`` and ``grammar`` are the very same
    objects (identity comparison) it was recorded with.

    Productions that start with a terminal are filtered against the
    input only while the edge's end index is inside the input; at or
    past the last token they are predicted unconditionally, so the
    chart exposes which terminals could come next.
    """

    def __init__(self):
        TopDownPredictRule.__init__(self)
        # (nextsym, end index) -> (chart, grammar) of the last complete expansion.
        self._done = {}

    def apply(self, chart, grammar, edge):
        """
        Yield the new edges predicted top-down from ``edge``.

        Nothing is yielded when ``edge`` is complete, when its next
        symbol is not a nonterminal, or when the same next symbol and
        end index were already expanded for this ``chart`` and
        ``grammar``.  Only edges that ``chart.insert`` reports as
        inserted are yielded.
        """
        if edge.is_complete():
            return

        nextsym = edge.nextsym()
        index = edge.end()
        if not is_nonterminal(nextsym):
            return

        # Skip the work if this (symbol, position) pair was already
        # expanded against the same chart and grammar objects.
        cache_key = (nextsym, index)
        cached_chart, cached_grammar = self._done.get(cache_key, (None, None))
        if cached_chart is chart and cached_grammar is grammar:
            return

        for production in grammar.productions(lhs=nextsym):
            rhs = production.rhs()
            if rhs and is_terminal(rhs[0]):
                left_corner = rhs[0]
                # A terminal left corner must agree with the token at
                # ``index`` -- but only while ``index`` is inside the
                # input.  Stock NLTK also skipped the production when
                # index >= chart.num_leaves(); that case is deliberately
                # let through here to allow next-terminal prediction.
                if index < chart.num_leaves() and left_corner != chart.leaf(index):
                    continue

            candidate = TreeEdge.from_production(production, index)
            if chart.insert(candidate, ()):
                yield candidate

        # Reached only once the generator has been fully consumed:
        # remember that this pair is done for this chart and grammar.
        self._done[cache_key] = (chart, grammar)