Source code for modules.xrenner_rule

import re, operator

class CorefRule:
    """A single coreference rule parsed from a semicolon-separated string.

    The expected layout is::

        ana_spec;ante_spec;max_distance;propagation[;clf_name[;thresh]]

    ``ana_spec`` and ``ante_spec`` are ``&``-joined constraints; each one is
    wrapped in a ``ConstraintMatcher``.
    """

    def __init__(self, rule_string, rule_num):
        """Parse *rule_string* and remember *rule_num* as the rule's number.

        Raises:
            Exception: if the string does not contain 3-5 semicolons.
            ValueError: if the distance or threshold field is not numeric.
        """
        semicolons = rule_string.count(";")
        if semicolons < 3 or semicolons > 5:
            raise Exception("coref rule does not contain 3-5 semicolons: " + rule_string)

        fields = rule_string.split(";")
        self.ana_spec = fields[0]
        self.ante_spec = fields[1]
        self.max_distance = fields[2]
        self.propagation = fields[3]

        # Optional fifth field: classifier name
        self.clf_name = fields[4] if len(fields) > 4 else "_default_"
        # Optional sixth field: numeric threshold
        self.thresh = float(fields[5]) if len(fields) == 6 else None

        self.max_distance = int(self.max_distance)

        self.ana_constraints = [ConstraintMatcher(spec) for spec in self.ana_spec.split("&")]
        self.ante_constraints = [ConstraintMatcher(spec) for spec in self.ante_spec.split("&")]

        # Group failure criteria must be checked first; the sort is stable, so
        # constraints keep their relative order within each group.
        self.ante_constraints.sort(key=operator.attrgetter("group_failure"), reverse=True)

        self.rule_num = rule_num

    def __repr__(self):
        """Return ``ana_spec -> ante_spec (max_distance, propagation, clf_name)``."""
        details = ", ".join([str(self.max_distance), self.propagation, self.clf_name])
        return self.ana_spec + " -> " + self.ante_spec + " (" + details + ")"
class ConstraintMatcher:
    """One precompiled constraint from a coref rule specification.

    Supported constraint forms (a trailing ``*`` marks a group failure
    criterion, a ``!`` directly before ``=`` negates the test):

    * ``key="literal"`` or ``key=literal`` -- exact string comparison
    * ``key=/regex/`` -- regular expression; patterns that are only a literal
      with ``^`` and/or ``$`` anchors are reduced to plain string tests
    * ``key=true`` / ``key=false`` -- boolean comparison
    * ``key=$...`` -- compare against the same property of the anaphor
    * ``none``, ``any...``, ``look...``, ``take...`` -- always match
    * ``sameparent`` / ``samespeaker`` (optionally prefixed ``!``)
    * ``last[...]``

    Attributes set by the constructor: ``group_failure``, ``negative``
    (``operator.truth`` or ``operator.not_``), ``match_type``, ``key``,
    ``value``, ``compiled_re`` and ``props`` (attribute names read directly
    from the object being matched).
    """

    def __init__(self, constraint):
        """Parse the textual *constraint* into key, value and match strategy."""
        self.group_failure = False
        self.negative = operator.truth
        self.match_type = "exact"
        self.value = ""
        self.key = ""
        self.compiled_re = None
        self.props = {"form", "text", "agree", "entity", "subclass", "cardinality", "text_lower", "lemma", "pos",
                      "func", "quoted", "mood", "speaker"}

        if constraint.endswith("*"):
            self.group_failure = True
            constraint = constraint[0:-1]

        if "=" in constraint:
            # Key-value constraint. Split on the first '=' only, so values
            # (e.g. regexes) may themselves contain '='.
            key, value = constraint.split("=", 1)
            if key.endswith("!"):
                self.negative = operator.not_
                key = key[0:-1]
            if value.startswith('"') and value.endswith('"'):
                value = value[1:-1]
                self.match_type = "exact"
            elif value.startswith("/") and value.endswith("/"):
                self.match_type, value = self._classify_regex(value[1:-1])
                if self.match_type == "regex":
                    self.compiled_re = re.compile(value)
            elif value.lower() == "true":
                self.match_type = "bool"
                self.value = True
            elif value.lower() == "false":
                self.match_type = "bool"
                self.value = False
            elif value.startswith("$"):
                # Antecedent-based spec, can't precompile value matcher
                self.match_type = "dollar"
            else:
                # String literal without regex
                self.match_type = "exact"
            if self.match_type != "bool":
                self.value = value
            self.key = key
        elif constraint == "none" or constraint.startswith(("any", "look", "take")):
            # This is a 'none' matcher or processing instruction, matches anything
            self.match_type = "none"
        elif "sameparent" in constraint:
            # same or !same style constraint - port to $ for backwards compatibility
            if constraint[0] == "!":
                self.negative = operator.not_
            self.match_type = "dollar"
            self.key = "parent"
            self.value = "$1"
        elif "samespeaker" in constraint:
            # same or !same style constraint - port to $ for backwards compatibility
            if constraint[0] == "!":
                self.negative = operator.not_
            self.match_type = "dollar"
            self.key = "speaker"
            self.value = "$1"
        elif constraint.startswith("last["):
            self.match_type = "exact"
            self.key = "LAST"
            self.value = constraint[constraint.find("[") + 1:-1]

    @staticmethod
    def _classify_regex(pattern):
        """Return ``(match_type, value)`` for the text between the slashes.

        Patterns without special characters (apart from a leading ``^`` and/or
        a trailing ``$``) are reduced to cheap string tests; everything else
        is reported as ``"regex"`` with the pattern unchanged.
        """
        def is_literal(text):
            return re.escape(text) == text

        if is_literal(pattern):
            # Does not contain special characters but is medial regex
            # NOTE(review): compiled regexes are applied with re.search, so an
            # unanchored literal might be expected to be a substring test;
            # the existing "exact" behavior is kept here - confirm intent.
            return "exact", pattern
        if pattern.startswith("^") and pattern.endswith("$") and is_literal(pattern[1:-1]):
            # Regex only supplies anchors, treat as exact match
            return "exact", pattern[1:-1]
        if pattern.startswith("^") and is_literal(pattern[1:]):
            # Regex only supplies initial anchor, treat as startswith
            return "startswith", pattern[1:]
        if pattern.endswith("$") and is_literal(pattern[:-1]):
            # Regex only supplies final anchor, treat as endswith
            return "endswith", pattern[:-1]
        return "regex", pattern

    def __repr__(self):
        """Return ``key [!]match_type 'value'``."""
        if self.negative == operator.truth:
            op = ""
        else:
            op = "!"
        # value is a bool for "bool" constraints, so it must be stringified
        return self.key + " " + op + self.match_type + " '" + str(self.value) + "'"

    def _register_group_failure(self, mark, anaphor):
        """Add the anaphor's group to *mark*'s non-antecedent groups if this is a group failure criterion."""
        if self.group_failure and anaphor is not None:
            mark.non_antecdent_groups.add(anaphor.group)

    def match(self, mark, lex, anaphor=None):
        """Test *mark* against this constraint.

        :param mark: object whose attributes are tested (the properties in
            ``self.props``, plus ``head``, ``sentence``, ``child_func_string``
            etc. depending on the constraint key)
        :param lex: object providing ``hasa``, ``last`` and ``filters``, which
            are consulted for the ``hasa``, ``LAST`` and ``mod`` keys
        :param anaphor: object that ``$`` constraints compare against; when
            given, a failed group failure criterion adds ``anaphor.group`` to
            ``mark.non_antecdent_groups``
        :return: True if the (possibly negated) constraint holds, else False
        :raises Exception: for key/value combinations that are not implemented
        """
        test_val = ""
        op = self.negative
        if self.match_type == "none":
            return True
        elif self.match_type == "dollar":
            if self.key in self.props:
                # The expected value is whatever the anaphor has for this property
                self.value = str(getattr(anaphor, self.key))
                test_val = str(getattr(mark, self.key))
            elif self.key == "head":
                return op(anaphor.head.id == mark.head.head)
            elif self.key == "child":
                return op(anaphor.head.head == mark.head.id)
            elif self.key == "hasa":
                return op(anaphor.head.head_text in lex.hasa[mark.lemma])
            elif self.key == "parent":
                if mark.head.head == "0":
                    # Root token, by definition not same parent as another token
                    retval = op(False)
                elif mark.sentence.sent_num != anaphor.sentence.sent_num:
                    retval = op(False)
                else:
                    retval = op(anaphor.head.head == mark.head.head)
                if retval is False and self.group_failure and anaphor is not None:
                    # For this key the exclusion is recorded in both directions
                    mark.non_antecdent_groups.add(anaphor.group)
                    anaphor.non_antecdent_groups.add(mark.group)
                return retval
            elif self.key == "has_child_func":
                raise Exception("coref rule 'has_child_func=$' : $ identity not implemented for has_child_func")
            elif self.key == "mod":
                # NOTE(review): negation is not applied on this path - confirm intent
                det_func = lex.filters["det_func"]
                found_mod = any(
                    mod1.lemma == mod2.lemma
                    and det_func.match(mod1.func) is None
                    and det_func.match(mod2.func) is None
                    for mod1 in mark.head.modifiers
                    for mod2 in anaphor.head.modifiers
                )
                if not found_mod:
                    self._register_group_failure(mark, anaphor)
                    return False
                else:
                    return True
        else:
            if self.key in self.props:
                if self.match_type == "bool":
                    test_val = getattr(mark, self.key)
                else:
                    test_val = str(getattr(mark, self.key))
            elif self.key == "LAST":
                if self.value in lex.last:
                    return op(lex.last[self.value].entity == mark.entity)
                else:
                    return False
            elif self.key == "has_child_func":
                test_val = mark.child_func_string
                # The value is looked up as a ';'-delimited item in the string
                self.match_type = "substring"
                if not self.value.startswith(";"):
                    self.value = ";" + self.value + ";"
            elif self.key == "mod":
                # Note that the value is just a string - not a ParsedToken
                # NOTE(review): negation is not applied on this path - confirm intent
                det_func = lex.filters["det_func"]
                found_mod = any(
                    mod1.lemma == self.value and det_func.match(mod1.func) is None
                    for mod1 in mark.head.modifiers
                )
                if not found_mod:
                    self._register_group_failure(mark, anaphor)
                    return False
                else:
                    return True
            elif self.key == "head":
                raise Exception("coref rule 'head=VAL' : value match not implemented for head")
            elif self.key == "child":
                raise Exception("coref rule 'child=VAL' : value match not implemented for child")

        if self.match_type == "exact":
            retval = op(test_val == self.value)
        elif self.match_type == "substring":
            retval = op(self.value in test_val)
        elif self.match_type == "regex":
            retval = op(self.compiled_re.search(test_val) is not None)
        elif self.match_type == "startswith":
            retval = op(test_val.startswith(self.value))
        elif self.match_type == "endswith":
            retval = op(test_val.endswith(self.value))
        elif self.match_type == "dollar":
            retval = op(test_val == self.value)
        elif self.match_type == "bool":
            retval = op(test_val == self.value)

        if retval is False:
            self._register_group_failure(mark, anaphor)
        return retval