# Source code for modules.xrenner_preprocess

"""
modules/xrenner_preprocess.py

Prepare parser output for entity and coreference resolution

Author: Amir Zeldes
"""

from .xrenner_marker import lookup_has_entity

def add_negated_parents(conll_tokens, offset):
	"""
	Sets the neg_parent property on tokens whose head token is flagged as negated

	:param conll_tokens: token list for this document; each token's head is used as an index into this list
	:param offset: token ID reached in last sentence; tokens from this index onwards are examined
	:return: None
	"""

	for token in conll_tokens[offset:]:
		parent_id = token.head
		# Tokens whose head is "0" have no parent token to inspect
		if parent_id == "0":
			continue
		if conll_tokens[int(parent_id)].negated:
			token.neg_parent = True


def add_child_info(conll_tokens, child_funcs, child_strings, lex):
	"""
	Adds a list of all dependent functions and token strings to each parent token

	Values already present on the parent are not added a second time. A parent is
	flagged as negated as soon as one of its newly added child functions matches
	the "neg_func" filter.

	:param conll_tokens: The ParsedToken list so far
	:param child_funcs: Dictionary from parent token ids (integer indices into conll_tokens) to child functions
	:param child_strings: Dictionary from the same parent token ids to child strings
	:param lex: the LexData object; its filters["neg_func"] pattern identifies negation functions
	:return: void
	"""
	for parent_id in child_funcs:
		# Ignore ids pointing beyond the tokens collected so far
		if parent_id > len(conll_tokens) - 1:
			continue
		parent = conll_tokens[parent_id]
		for func in child_funcs[parent_id]:
			if func not in parent.child_funcs:
				parent.child_funcs.append(func)
				if lex.filters["neg_func"].match(func):
					parent.negated = True
		for tok_text in child_strings[parent_id]:
			if tok_text not in parent.child_strings:
				parent.child_strings.append(tok_text)


def postprocess_parser(conll_tokens, tokoffset, children, stop_ids, lex):
	"""
	Applies a series of in-place corrections to the parser output for all tokens after tokoffset

	For each token the following checks are made, in this order:

	1. Bracket tokens ("-LSB-"/"-RSB-") receive their text as POS tag, the function "punct" and head "0".
	2. Runs of adjacent tokens whose POS matches the "mark_head_pos" filter are joined with spaces; if the
	   joined string is in lex.entities and the rightmost token's head lies outside the run, the first
	   token of the run that is also headed from outside is re-attached to the rightmost token.
	3. An apposition headed by the immediately preceding proper noun token is relabeled "xrenner_fix",
	   added to that token's children and registered in stop_ids.
	4. An apposition preceded by a comma and a proper noun ([city], [state/country]) is relabeled
	   "xrenner_fix" and added to the children of the token before the comma, depending on which of
	   the tokens involved are known "place" entities.
	5. For tokens still carrying an apposition function, all following tokens that share the same head
	   and do not match the "non_link_func" filter are removed from that head's children.

	:param conll_tokens: The ParsedToken list so far; token ids are used as indices into this list
	:param tokoffset: index of the last token before the tokens to process
	:param children: Dictionary from head token ids to lists of child token ids; modified in place
	:param stop_ids: Dictionary from token ids to True for tokens fixed by check 3; modified in place
	:param lex: the LexData object supplying the filters and the entities lookup
	:return: None
	"""
	filters = lex.filters
	for tok1 in conll_tokens[tokoffset + 1:]:
		if tok1.text == "-LSB-" or tok1.text == "-RSB-":
			tok1.pos = tok1.text
			tok1.func = "punct"
			tok1.head = "0"
		if filters["mark_head_pos"].match(tok1.pos) is not None:
			entity_candidate = tok1.text + " "
			for tok2 in conll_tokens[int(tok1.id) + 1:]:
				if filters["mark_head_pos"].match(tok2.pos) is None:
					# The run of candidate head tokens ends here
					break
				entity_candidate += tok2.text + " "
				if entity_candidate.strip() not in lex.entities:
					continue
				# Entity matched, check if all tokens are inter-connected
				first_id = int(tok1.id)
				last_id = int(tok2.id)
				# Ensure right most token has head outside entity:
				if not (int(tok2.head) > last_id or int(tok2.head) < first_id):
					continue
				for tok3 in conll_tokens[first_id:last_id]:
					if (int(tok3.head) < first_id or int(tok3.head) > last_id) and tok3.id in children[tok3.head]:
						# Move the first externally headed token under the right most token
						children[tok3.head].remove(tok3.id)
						tok3.head = tok2.id
						children[tok3.head].append(tok3.id)
						break
		# Check for apposition pointing back to immediately preceding proper noun token -
		# typical (German model) MaltParser name behavior
		if filters["apposition_func"].match(tok1.func) is not None and not tok1.id == "1":
			prev_tok = conll_tokens[int(tok1.id) - 1]
			if filters["proper_pos"].match(prev_tok.pos) is not None and prev_tok.id == tok1.head:
				tok1.func = "xrenner_fix"
				children[str(int(tok1.id) - 1)].append(tok1.id)
				stop_ids[tok1.id] = True

		# Check for [city], [state/country] apposition -
		# typical (English model) Stanford parser behavior
		if filters["apposition_func"].match(tok1.func) is not None and not int(tok1.id) < 3:
			if conll_tokens[int(tok1.id) - 1].text.strip() == ",":
				tok_minus2 = conll_tokens[int(tok1.id) - 2]
				tok1_head = conll_tokens[int(tok1.head)]
				if filters["proper_pos"].match(tok_minus2.pos) is not None:
					# Fix if the token before the comma is the head and either token is a place,
					# or if the apposition is a place while its actual head is not
					# NOTE(review): assumes lookup_has_entity is a deterministic lookup - confirm
					if (tok_minus2.id == tok1.head and (
							lookup_has_entity(tok1.text, tok1.lemma, "place", lex) or
							lookup_has_entity(tok_minus2.text, tok_minus2.lemma, "place", lex))) or \
						(not lookup_has_entity(tok1_head.text, tok1_head.lemma, "place", lex) and
							lookup_has_entity(tok1.text, tok1.lemma, "place", lex)):
							tok1.func = "xrenner_fix"
							if tok1.id not in children[tok_minus2.id]:
								if tok_minus2.head != tok1.id:  # Avoid creating a cycle
									children[tok_minus2.id].append(tok1.id)

		# Check for markable projecting beyond an apposition to itself and remove from children on violation
		if filters["apposition_func"].match(tok1.func) is not None and not tok1.id == "1":
			for tok2 in conll_tokens[int(tok1.id) + 1:]:
				if tok2.head == tok1.head and filters["non_link_func"].match(tok2.func) is None and tok2.id in children[tok2.head]:
					children[tok2.head].remove(tok2.id)


def replace_conj_func(conll_tokens, tokoffset, lex):
	"""
	Function to replace functions of tokens matching the conjunction function with their parent's function

	A matching token also receives a copy of its parent's child functions, is re-attached
	to its parent's head and is flagged as a coordinate.

	:param conll_tokens: The ParsedToken list so far
	:param tokoffset: The starting token for this sentence
	:param lex: the LexData object with gazetteer information and model settings
	:return: void
	"""

	for token in conll_tokens[tokoffset:]:
		if lex.filters["conjunct_func"].match(token.func) is None:
			continue
		# Resolve the parent once, before token.head is overwritten below
		parent = conll_tokens[int(token.head)]
		token.child_funcs.extend(parent.child_funcs)
		token.func = parent.func
		token.head = parent.head
		token.coordinate = True