Source code for modules.xrenner_compatible

import re
from .xrenner_marker import remove_suffix_tokens
from .xrenner_propagate import *
from .xrenner_classes import Markable, ParsedToken
from collections import OrderedDict

"""
Module for checking compatibility of various features between markables

Author: Amir Zeldes
"""


[docs]def entities_compatible(mark1, mark2, lex): """ Checks if the entity property of two markables is compatible for possible coreference :param mark1: the first of two markables to compare entities :param mark2: the second of two markables to compare entities :param lex: the LexData object with gazetteer information and model settings :return: bool """ if mark1.entity == mark2.entity: return True elif mark1.entity is None or mark2.entity is None or mark1.entity == "" or mark2.entity == "": return True if mark1.form == "pronoun" and (not (mark1.entity == lex.filters["person_def_entity"] and mark2.entity != lex.filters["person_def_entity"]) or mark1.entity_certainty == ''): return True if mark1.entity != mark2.entity: if mark1.entity in mark2.alt_entities and (mark2.entity_certainty == "uncertain" or mark2.entity_certainty == "propagated"): return True elif mark2.entity in mark1.alt_entities and (mark1.entity_certainty == "uncertain" or mark1.entity_certainty == "propagated"): return True elif mark2.entity == lex.filters["default_entity"] and mark2.entity_certainty in ["","propagated","uncertain"]: return True elif mark1.entity == lex.filters["default_entity"] and mark1.entity_certainty in ["","propagated","uncertain"]: return True return False
def cardinality_compatible(mark1,mark2,lex): if "ablations" in lex.debug: if "no_cardinality" in lex.debug["ablations"]: return True if mark1.cardinality!=0: if mark2.cardinality!=0: if mark1.cardinality != mark2.cardinality: return False return True
[docs]def modifiers_compatible(markable, candidate, lex, allow_force_proper_mod_match=True): """ Checks whether the dependents of two markables are compatible for possible coreference :param markable: :class:`Markable` one of two markables to compare dependents for :param candidate: :class:`Markable` the second markable, which is a candidate antecedent for the other markable :param lex: the LexData object with gazetteer information and model settings :return: bool """ if markable.id+"|"+candidate.id in lex.incompatible_mod_pairs: return False if allow_force_proper_mod_match: proper_mod_must_match = lex.filters["proper_mod_must_match"] else: proper_mod_must_match = False if not cardinality_compatible(markable,candidate,lex): return False # Do strict 'no new modifiers' check if desired if lex.filters["no_new_modifiers"]: first_mark = candidate second_mark = markable if markable.start < candidate.start: first_mark = markable second_mark = candidate first_mods = (comp_mod.text for comp_mod in first_mark.head.modifiers) for mod in second_mark.head.modifiers: if lex.filters["det_func"].match(mod.func) is None: # Exclude determiners from this check if mod.text not in first_mods: if lex.filters["use_new_modifier_exceptions"]: if mod.text not in lex.exceptional_new_modifiers: return False else: return False # Check if markable and candidate have modifiers that are in the antonym list together, # e.g. 'the good news' should not be coreferent with 'the bad news', antonym_check = True if "ablations" in lex.debug: if "no_antonyms" in lex.debug["ablations"]: antonym_check = False if antonym_check: for mod in markable.head.modifiers: if mod.text.lower() in lex.antonyms: for candidate_mod in candidate.head.modifiers: if candidate_mod.text.lower() in lex.antonyms[mod.text.lower()]: markable.non_antecdent_groups.add(candidate.group) return False elif mod.lemma.lower() in lex.antonyms: for candidate_mod in candidate.head.modifiers: if candidate_mod.lemma.lower() in lex.antonyms[mod.lemma.lower()]: markable.non_antecdent_groups.add(candidate.group) return False # Check that the two markables do not have non-identical proper noun modifiers if proper_mod_must_match: if lex.filters["proper_pos"].match(mod.pos): candidate_proper_mod_texts = [] for mod2 in candidate.head.modifiers: if lex.filters["proper_pos"].match(mod2.pos): candidate_proper_mod_texts.append(mod2.text) if mod.text not in candidate_proper_mod_texts and len(candidate_proper_mod_texts) > 0: return False # Check if markable and candidate have modifiers that are different place names # e.g. 'Georgetown University' is incompatible with 'Boston University' even if those entities are not in lexicon for mod in markable.head.modifiers: if mod.text in lex.entities and (mod.text.istitle() or not lex.filters["cap_names"]): if re.sub('\t.*', "", lex.entities[mod.text][0]) == lex.filters["place_def_entity"]: for candidate_mod in candidate.head.modifiers: if candidate_mod.text != mod.text: if candidate_mod.text in lex.entities and (candidate_mod.text.istitle() or not lex.filters["cap_names"]): if re.sub('\t.*', "", lex.entities[candidate_mod.text][0]) == lex.filters["place_def_entity"]: markable.non_antecdent_groups.add(candidate.group) return False # Check for each possible pair of modifiers with identical function in the ident_mod list whether they are identical, # e.g. for the num function 'the four children' shouldn't be coreferent with 'five other children' for mod in markable.head.modifiers: for candidate_mod in candidate.head.modifiers: # TODO: add support for ident_mod pos func combo: # if lex.filters["ident_mod_func"].match(mod.func+"+"+mod.pos) and lex.filters["ident_mod_func"].match(candidate_mod.func+"+"+candidate_mod.pos) and # mod.text.lower != candidate_mod.text.lower(): if lex.filters["ident_mod_func"].match(mod.func) is not None and lex.filters["ident_mod_func"].match(candidate_mod.func) is not None and mod.text.lower != candidate_mod.text.lower(): markable.non_antecdent_groups.add(candidate.group) return False # Check that heads are not antonyms themselves if markable.head.lemma in lex.antonyms: if candidate.head.lemma in lex.antonyms[markable.head.lemma]: return False if candidate.head.lemma.isupper() and candidate.head.lemma.lower() in lex.antonyms[markable.head.lemma]: return False # Check that the heads are not conflicting proper names if markable.form == "proper" and candidate.form == "proper": if markable.text != candidate.text: if markable.text in lex.names and candidate.text in lex.names: return False elif markable.text.count(" ") == 0 and candidate.text.count(" ") == 0: isa = False if markable.text in lex.first_names and candidate.text in lex.first_names: if markable.text in lex.isa: if candidate.text.lower() in lex.isa[markable.text]: isa = True if candidate.text in lex.isa: if markable.text.lower() in lex.isa[candidate.text]: isa = True if not isa: return False if markable.text in lex.last_names and candidate.text in lex.last_names: if markable.text in lex.isa: if candidate.text.lower() in lex.isa[markable.text]: isa = True if candidate.text in lex.isa: if markable.text.lower() in lex.isa[candidate.text]: isa = True if not isa: return False # Recursive check through antecedent ancestors in group if isinstance(candidate.antecedent, Markable): antecedent_compatible = modifiers_compatible(markable, candidate.antecedent, lex) if not antecedent_compatible: return False return True
[docs]def agree_compatible(mark1, mark2, lex): """ Checks if the agree property of two markables is compatible for possible coreference :param mark1: the first of two markables to compare agreement :param mark2: the second of two markables to compare agreement :param lex: the LexData object with gazetteer information and model settings :return: bool """ if mark1.agree == mark2.agree: return True elif lex.filters["no_person_agree"].match(mark1.agree) and mark2.entity == lex.filters["person_def_entity"]: return False elif lex.filters["no_person_agree"].match(mark2.agree) and mark1.entity == lex.filters["person_def_entity"]: return False elif mark1.agree in mark2.alt_agree: mark2.agree = mark1.agree return True elif mark2.agree in mark1.alt_agree: mark1.agree = mark2.agree return True elif (mark1.agree is None or mark1.agree == '') and (mark2.agree is None or mark2.agree == ''): return True elif (((mark1.agree is None or mark1.agree == '') and lex.filters["agree_with_unknown"].match(mark2.agree) is not None) or ((mark2.agree is None or mark2.agree == '') and lex.filters["agree_with_unknown"].match(mark1.agree) is not None)): return True else: return False
[docs]def merge_entities(mark1, mark2, previous_markables, lex): """ Negotiates entity mismatches across coreferent markables and their groups. Returns True if merging has occurred. :param mark1: the first of two markables to merge entities for :param mark2: the second of two markables to merge entities for :param previous_markables: all previous markables which may need to inherit from the model/host :param lex: the LexData object with gazetteer information and model settings :return: bool """ if not mark1.entity == mark2.entity: if mark1.entity in mark2.alt_entities: if update_group(mark2, mark1, previous_markables, lex): mark2.entity = mark1.entity mark2.subclass = mark1.subclass return True else: return False else: if update_group(mark1, mark2, previous_markables, lex): mark1.entity = mark2.entity mark1.subclass = mark2.subclass return True else: return False else: return True
[docs]def update_group(host, model, previous_markables, lex): """ Attempts to update entire coreference group of a host markable with information gathered from a model markable discovered to be possibly coreferent with the host. If incompatible modifiers are discovered the process fails and returns False. Otherwise updating succeeds and the update_group returns true :param host: the first markable discovered to be coreferent with the model :param model: the model markable, containing new information for the group :param previous_markables: all previous markables which may need to inherit from the model/host :param lex: the LexData object with gazetteer information and model settings :return: bool """ group = host.group for markable in previous_markables: if markable.group == group: if not modifiers_compatible(markable, model, lex): return False for markable in previous_markables: if markable.group == group: markable.entity = model.entity markable.subclass = model.subclass return True
[docs]def isa(markable, candidate, lex): """ Staging function to check for and store new cached isa information. Calls actual :func:`run_isa` function if pair is still viable for new isa match. :param markable: one of two markables to compare lexical isa relationship with :param candidate: the second markable, which is a candidate antecedent for the other markable :param lex: the LexData object with gazetteer information and model settings :return: bool """ retval = False if markable.id+"|"+candidate.id not in lex.incompatible_isa_pairs: retval = run_isa(markable, candidate, lex) if not retval: lex.incompatible_isa_pairs.add(markable.id+"|"+candidate.id) return retval
[docs]def run_isa(markable, candidate, lex): """ Checks whether two markables are compatible for coreference via the isa-relation :param markable: one of two markables to compare lexical isa relationship with :param candidate: the second markable, which is a candidate antecedent for the other markable :param lex: the LexData object with gazetteer information and model settings :return: bool """ if "ablations" in lex.debug: if "no_isa" in lex.debug["ablations"]: return False if not lex.filters["allow_indef_anaphor"]: # Don't allow an indefinite to have a definite antecedent via isa relation if markable.start > candidate.start: if markable.definiteness == "indef" and candidate.definiteness =="def": return False else: if markable.definiteness == "def" and candidate.definiteness =="indef": return False # Don't allow a proper markable to have an indefinite antecedent via isa relation # unless there's corroborating evidence #if markable.cardinality == candidate.cardinality and markable.cardinality != 0: # pass # Explicit cardinality match, forgo indefinite antecedent prohibition #elif markable.subclass == candidate.subclass and markable.agree == candidate.agree: # pass # Explicit subclass and agree match, forgo indefinite antecedent prohibition #else: TODO: re-examine indefinite isa antecedents in natural data if markable.start > candidate.start: if markable.form == "proper" and candidate.definiteness == "indef": return False else: if markable.definiteness == "indef" and candidate.form == "proper": return False if not lex.filters["allow_indef_isa"]: # Don't allow an indefinite to have any antecedent via isa relation if forbidden by configuration if markable.start > candidate.start: if markable.definiteness == "indef": return False else: if candidate.definiteness =="indef": return False # Check for incompatible modifiers if len(markable.modifiers) > 0: if not modifiers_compatible(markable,candidate, lex): lex.incompatible_mod_pairs.add(markable.id+"|"+candidate.id) return False # Check for first name + full name match if markable.entity in ["", lex.filters["person_def_entity"]] and candidate.entity in ["", lex.filters["person_def_entity"]]: if markable.head.text in lex.first_names: candidate_mod_texts = list((mod.text) for mod in candidate.head.modifiers) if markable.head.text in candidate_mod_texts: return True if candidate.head.text in lex.first_names: markable_mod_texts = list((mod.text) for mod in markable.head.modifiers) if candidate.head.text in markable_mod_texts: return True # Check for last name + full name match if markable.entity in ["", lex.filters["person_def_entity"]] and candidate.entity in ["", lex.filters["person_def_entity"]]: if markable.head.text in lex.last_names: candidate_mod_texts = list((mod.text) for mod in candidate.head.modifiers) if markable.head.text in candidate_mod_texts: return True if candidate.head.text in lex.last_names: markable_mod_texts = list((mod.text) for mod in markable.head.modifiers) if candidate.head.text in markable_mod_texts: return True # Forbid isa head matching for two distinct proper names except first+full name; NB: use coref table for these if desired if markable.form == "proper" and candidate.form == "proper": if markable.text in lex.names or markable.text in lex.first_names: if candidate.text in lex.names or candidate.text in lex.first_names: #return False pass # Subclass based isa match - check agreement too unless disabled # Note that this check is unidirectional: the subclass can match an antecedent instance of it, # but prior mention of the subclass is not matched to a subsequent instance # (the Guardian .. < .. the newspaper is OK, but not: the newspaper .. < .. the Guardian) for subclass in candidate.alt_subclasses + [candidate.subclass]: if subclass == markable.lemma: if agree_compatible(markable, candidate, lex) and not never_agree(markable, candidate, lex): # Check if this case is already assigned a different lexical head as isa partner if markable.isa_partner_head == "" or markable.isa_partner_head == candidate.lemma: candidate.isa_dir = "markable" return True else: # Another lemma is already isa-linked to this, e.g. state <- Oregon; so now not also "Nevada" return False if subclass in lex.isa: if lex.isa[subclass][-1] == "*": subclass_isa = lex.isa[subclass][:-1] check_agree = False else: subclass_isa = lex.isa[subclass] check_agree = lex.filters["isa_subclass_agreement"] if markable.lemma.lower() in subclass_isa: if markable.isa_partner_head == "" or markable.isa_partner_head == candidate.lemma or candidate.isa_partner_head == markable.lemma: if (agree_compatible(markable, candidate, lex) or check_agree is False) and not never_agree(markable, candidate, lex): candidate.isa_dir = "markable" return True # Exact text match in isa table - no agreement matching is carried out if markable.text in lex.isa: if candidate.text in lex.isa[markable.text]: if candidate.isa_partner_head == "" or candidate.isa_partner_head == markable.head.lemma: candidate.isa_dir = "candidate" return True if candidate.text in lex.isa: if markable.text in lex.isa[candidate.text]: if markable.isa_partner_head == "" or markable.isa_partner_head == candidate.head.lemma: candidate.isa_dir = "markable" return True # Core text isa match # Note this check is unidirectional if markable.core_text in lex.isa: if candidate.core_text in lex.isa[markable.core_text] or candidate.head.lemma in lex.isa[markable.core_text]: if candidate.isa_partner_head == "" or candidate.isa_partner_head == markable.head.lemma: if agree_compatible(markable, candidate, lex) and not never_agree(markable, candidate, lex): candidate.isa_dir = "candidate" return True # Head-core text isa match - no agreement matching is carried out elif candidate.head.text in lex.isa[markable.core_text]: if candidate.isa_partner_head == "" or candidate.isa_partner_head == markable.head.lemma: candidate.isa_dir = "candidate" return True elif markable.core_text.isupper(): # Try to title case on all caps entity if markable.core_text.title() in lex.isa: if candidate.core_text in lex.isa[markable.core_text.title()] or candidate.head.lemma in lex.isa[markable.core_text.title()]: if candidate.isa_partner_head == "" or candidate.isa_partner_head == markable.head.lemma: candidate.isa_dir = "candidate" return True # Handle cases where a prefix like an article is part of the entity name, but a suffix like a possessive isn't if remove_suffix_tokens(markable.text,lex) in lex.isa: if candidate.head.text in lex.isa[remove_suffix_tokens(markable.text,lex)]: if candidate.isa_partner_head == "" or candidate.isa_partner_head == markable.head.lemma: #candidate.isa_partner_head = markable.head.lemma candidate.isa_dir = "candidate" return True elif remove_suffix_tokens(candidate.text, lex) in lex.isa: if markable.head.text in lex.isa[remove_suffix_tokens(candidate.text, lex)]: if markable.isa_partner_head == "" or markable.isa_partner_head == candidate.head.lemma: candidate.isa_dir = "markable" return True # Head-head isa match - no agreement matching is carried out if markable.head.text in lex.isa: if candidate.head.text in lex.isa[markable.head.text]: if candidate.isa_partner_head == "" or candidate.isa_partner_head == markable.head.lemma: candidate.isa_dir = "candidate" return True if candidate.head.text in lex.isa: if markable.head.text in lex.isa[candidate.head.text]: if markable.isa_partner_head == "" or markable.isa_partner_head == candidate.head.lemma: candidate.isa_dir = "markable" return True # Lemma based isa matching - check agreement too if markable.head.lemma in lex.isa: if candidate.head.lemma in lex.isa[markable.head.lemma] or candidate.head.text in lex.isa[markable.head.lemma]: if candidate.isa_partner_head == "" or candidate.isa_partner_head == markable.head.lemma: if agree_compatible(markable, candidate, lex): candidate.isa_dir = "candidate" return True if candidate.head.lemma in lex.isa: if markable.head.lemma in lex.isa[candidate.head.lemma] or markable.head.text in lex.isa[candidate.head.lemma]: if markable.isa_partner_head == "" or markable.isa_partner_head == candidate.head.lemma: if agree_compatible(markable, candidate, lex): candidate.isa_dir = "markable" return True return False
[docs]def group_agree_compatible(markable,candidate,previous_markables,lex): """ :param markable: markable whose group the candidate might be joined to :param candidate: candidate to check for compatibility with all group members :param previous_markables: all previous markables which may need to inherit from the model/host :param lex: the LexData object with gazetteer information and model settings :return: bool """ if "+" in lex.filters["never_agree_pairs"]: never_agreement_pairs = lex.filters["never_agree_pairs"].split(";") agreements = [] for mark in previous_markables: if mark.group == markable.group or mark.group == candidate.group: agreements.append(mark.agree) for pair in never_agreement_pairs: class1, class2 = pair.split("+") if class1 in agreements and class2 in agreements: return False return True
def never_agree(candidate, markable, lex): if "+" in lex.filters["never_agree_pairs"]: never_agreement_list = lex.filters["never_agree_pairs"].split(";") never_agreement_pairs = [] for pair in never_agreement_list: never_agreement_pairs.append(pair.split("+")) if [markable.agree, candidate.agree] in never_agreement_pairs or [candidate.agree, markable.agree] in never_agreement_pairs: return True return False
[docs]def score_match_heuristic(markable,candidate,features,lex): """ Basic fall-back function for heuristic match scoring when no classifier is available :param makrable: :param candidate: :param features: :return: """ score = 0 - (markable.sentence.sent_num - candidate.sentence.sent_num) # Break ties via proximity score -= ((markable.start - candidate.end) * 0.00001 + (markable.start - candidate.start) * 0.000001) # Penalize different entity types, but note heuristic only runs if coref_rules allowed this match in the first place if markable.entity != candidate.entity: score -= 0.5 if markable.form != "pronoun": # Default heuristic for lexical NPs is 'most recent match', so the score is done return score score += features["d_entidep"] if candidate.entity == lex.filters["person_def_entity"]: # Introduce slight bias to persons score += 0.1 if candidate.entity == lex.filters["subject_func"]: # Introduce slight bias to subjects score += 0.95 if candidate.agree == markable.agree: # Slight bias to explicitly identical agreement (not just compatible) score += 0.1 score += features["d_hasa"] return score
[docs]def best_candidate(markable, candidate_set, lex, rule, take_first=False): """ :param markable: markable to find best antecedent for :param candidate_set: set of markables which are possible antecedents based on some coref_rule :param lex: the LexData object with gazetteer information and model settings :param propagate: string with feature propagation instructions from coref_rules.tab in lex :param rule_num: the rule number of the rule producing the match in coref_rules.tab :param clf_name: name of the pickled classifier to use for this rule, or "_default_" to use heuristic matching :param take_first: boolean, whether to skip matching and use the most recent candidate (minimum token distance). This saves time if a rule is guaranteed to produce a unique, correct candidate (e.g. reflexives) :return: Markable object or None (the selected best antecedent markable, if available) """ # DEBUG POINT # if markable.text == lex.debug["ana"]: a=5 rule_num, clf_name, propagate, score_thresh = rule.rule_num, rule.clf_name, rule.propagation, rule.thresh heuristic = True if clf_name == "_default_" or not lex.filters["use_classifiers"] else False if len(candidate_set) == 0: return None candidate_scores = {} candidate_features = {} score_ranking = {} best = None for candidate in candidate_set: candidate_features[candidate] = markable.extract_features(lex, candidate, candidate_set, dump_position=False) candidate_scores[candidate] = score_match_heuristic(markable, candidate, candidate_features[candidate], lex) candidate_features[candidate]["rule_num"] = str(rule_num) for index, candidate in enumerate(OrderedDict(sorted(candidate_scores.items(), key=lambda x: x[1]))): score_ranking[candidate] = 1/float(index+1) if lex.dump is not None: # Only runs if dumping training data for candidate in candidate_set: dump_features = markable.extract_features(lex, candidate, candidate_set, dump_position=True) if lex.dump_headers == []: for key in dump_features: lex.dump_headers.append(key) lex.dump_headers.append("heuristic_score") # Append the heuristic based score for comparison lex.dump_headers.append("rule_num") # Track rule number for error analysis dump_list = [] for key in dump_features: dump_list.append(dump_features[key]) dump_list = [str(feat) for feat in dump_list] dump_list.append(str(score_ranking[candidate])) outline = "\t".join(dump_list) if outline not in lex.dump_types: lex.dump_types.add(outline) outline += "\t"+str(rule_num) lex.dump.write(outline+"\n") max_score = "" if take_first: markable.matching_rule = str(rule_num) return min(candidate_set, key=lambda x: abs(markable.start - x.start)) clf_input = [] candidates = [] for candidate in candidate_scores: if lex.dump is None: # Skip accurate prediction during dump for speed clf_input.append((markable, candidate, candidate_set, lex)) candidates.append(candidate) ## DEBUG POINT ## if markable.text == lex.debug["ana"]: a=4 if lex.dump is None and not heuristic and lex.filters["use_classifiers"]: preds = lex.classifiers[clf_name].classify_many(clf_input) for i, pred in enumerate(preds): candidate_scores[candidates[i]] = pred for candidate in candidate_scores: if max_score == "": max_score = candidate_scores[candidate] best = candidate elif candidate_scores[candidate] > max_score: max_score = candidate_scores[candidate] best = candidate if max_score < score_thresh: # The best option is less likely than no coref if heuristic: pass # In heuristic mode, an antecedent must always be selected from the set else: return None # In classifier mode, if no candidate scores high enough, None is returned if not propagate == "nopropagate": propagate_entity(markable, best, propagate) propagate_agree(markable, best) markable.matching_rule = str(rule_num) return best
def stems_compatible(verb, noun, lex): verb_stem = lex.filters["stemmer_deletes"].sub("",verb.text) noun_stem = lex.filters["stemmer_deletes"].sub("",noun.text) if verb_stem == noun_stem and len(noun_stem) > 3: return True if verb.text in lex.nominalizations: if noun.text in lex.nominalizations[verb.text]: return True return False
[docs]def acronym_match(mark, candidate, lex): """ Check whether a Markable's text is an acronym of a candidate Markable's text :param mark: The Markable object to test :param candidate: The candidate Markable with potentially acronym-matching text :param lex: the LexData object with gazetteer information and model settings :return: bool """ position = 0 calibration = 0 candidate_string = candidate.core_text if "ignore_in_acronym" in lex.filters: candidate_string = lex.filters["ignore_in_acronym"].sub("", candidate_string) candidate_string = candidate_string.replace(" "," ") if mark.head.text.isupper() and len(mark.head.text) > 2: for word in candidate_string.split(" "): if lex.filters["articles"].match(word): calibration = -1 elif len(word) > 0: if len(mark.head.text) > position: if word[0].isupper() or word == "&": if word[0] == mark.head.text[position]: position+=1 else: return False else: return False if (position == len(candidate_string.strip().split(" ")) + calibration) and position > 2: return True else: return False else: return False