Source code for modules.xrenner_classes

"""
Basic classes for parsed tokens, markables and sentences.

Author: Amir Zeldes
"""

import sys, re
from math import log
from collections import OrderedDict, defaultdict

[docs]class ParsedToken:
	def __init__(self, tok_id, text, lemma, pos, morph, head, func, sentence, modifiers, child_funcs, child_strings, lex, quoted=False, head2="_", func2="_"):
		self.id = tok_id
		self.text = text.strip()
		self.text_lower = text.lower()
		self.pos = pos
		if lemma != "_" and lemma != "--":
			self.lemma = lemma.strip()
		else:
			self.lemma = lex.lemmatize(self)
		self.morph = morph
		if morph != "_" and morph != "--" and morph != "":
			self.morph = lex.process_morph(self)

		self.head = head
		self.original_head = head
		self.func = func
		self.head2 = head2
		self.func2 = func2
		self.sentence = sentence
		self.modifiers = modifiers
		self.child_funcs = child_funcs
		self.child_strings = child_strings
		self.quoted = quoted
		self.coordinate = False
		self.head_text = ""
		self.lex = lex  # Pointer to lex object
		self.lemma_freq = 0.0
		self.negated = False
		self.neg_parent = False

	def __repr__(self):
		return str(self.text) + " (" + str(self.pos) + "/" + str(self.lemma) + ") " + "<-" + str(self.func) + "- " + str(self.head_text)

[docs]class Markable:

	# Properties refering to markable head, not markable itself
	tok_props = {"negated", "neg_parent", "pos", "lemma", "morph", "func", "quoted", "modifiers", "child_funcs", "child_strings",
	 "agree", "doc_position", "sent_position", "head_text", "head_pos", "lemma_freq"}

	def __init__(self, mark_id, head, form, definiteness, start, end, text, core_text, entity, entity_certainty, subclass, infstat, agree, sentence,
				 antecedent, coref_type, group, alt_entities, alt_subclasses, alt_agree, cardinality=0, submarks=[], coordinate=False, agree_certainty=""):
		self.id = mark_id
		self.head = head
		self.form = form
		self.definiteness = definiteness
		self.start = start
		self.end = end
		self.text = text.strip()
		self.core_text = core_text.strip()  # Holds markable text before any extensions or manipulations
		self.first = self.core_text.split(" ")[0]
		self.last = self.core_text.split(" ")[-1]
		self.entity = entity
		self.subclass = subclass
		self.infstat = infstat
		self.agree = agree
		self.agree_certainty = agree_certainty
		self.sentence = sentence
		self.antecedent = antecedent
		self.coref_type = coref_type
		self.group = group
		self.non_antecdent_groups = set()
		self.entity_certainty = entity_certainty
		self.isa_partner_head = ""  # Property to hold isa match; once saturated, no other lexeme may form isa link

		# Alternate agreement, subclass and entity lists:
		self.alt_agree = alt_agree
		self.alt_entities = alt_entities
		self.alt_subclasses = alt_subclasses

		self.cardinality=cardinality
		self.submarks = submarks
		self.coordinate = coordinate

		self.length = self.text.strip().count(" ") + 1
		self.mod_count = len(self.head.modifiers)

		# Dictionaries to fill when getting dependency and similarity based frequencies
		self.entity_dep_scores = defaultdict(int)
		self.entity_sim_dep_scores = defaultdict(int)
		self.lex_dep_scores = defaultdict(int)
		self.lex_sim_dep_scores = defaultdict(int)

	def has_child_func(self, func):
		if "*" in func: # func substring, do not delimit function
			return func in self.child_func_string
		else: # Exact match, delimit with ";"
			return ";" + func + ";" in self.child_func_string

	def get_dep_freqs(self,lex):

		#DEBUG POINT
		if self.text == lex.debug["ana"]:
			pass

		use_entity_deps = True
		use_lex_deps = True
		if "ablations" in lex.debug:
			if "no_entity_dep" in lex.debug["ablations"]:
				use_entity_deps = False
		use_hasa = True
		if "ablations" in lex.debug:
			if "no_hasa" in lex.debug["ablations"]:
				use_hasa = False

		anaphor_parent = self.head.head_text
		if anaphor_parent in lex.entity_deps and use_entity_deps:
			if self.head.func in lex.entity_deps[anaphor_parent]:
				for entity in lex.entity_deps[anaphor_parent][self.head.func]:
					self.entity_dep_scores[entity] = lex.entity_deps[anaphor_parent][self.head.func][entity]
		found = False
		if anaphor_parent in lex.similar and use_entity_deps:
			for sim in lex.similar[anaphor_parent]:
				if sim in lex.entity_deps and not found:
					if self.head.func in lex.entity_deps[sim]:
						found = True
						for entity in lex.entity_deps[sim][self.head.func]:
							self.entity_sim_dep_scores[entity] = lex.entity_deps[sim][self.head.func][entity]
		if anaphor_parent in lex.lex_deps and use_lex_deps:
			if self.head.func in lex.lex_deps[anaphor_parent]:
				for lexeme in lex.lex_deps[anaphor_parent][self.head.func]:
					self.lex_dep_scores[lexeme] = lex.lex_deps[anaphor_parent][self.head.func][lexeme]
		if anaphor_parent in lex.similar and use_entity_deps:
			for sim in lex.similar[anaphor_parent]:
				if sim in lex.lex_deps:
					if self.head.func in lex.lex_deps[sim]:
						for lexeme in lex.lex_deps[sim][self.head.func]:
							self.entity_sim_dep_scores[lexeme] = lex.lex_deps[sim][self.head.func][lexeme]

	def __repr__(self):
		agree = "no-agr" if self.agree == "" else self.agree
		defin = "no-def" if self.definiteness == "" else self.definiteness
		card = "no-card" if self.cardinality == 0 else self.cardinality
		func = "no-func" if self.head.func == "" else self.head.func
		return str(self.entity) + "/" + str(self.subclass) + ': "' + self.text + '" (' + agree + "/" + defin + "/" + func + "/" + str(card) + ")"


[docs]	def extract_features(self, lex, antecedent=None, candidate_list=[], dump_position=False):
		"""
		Function to generate feature representation of markables or markable-antecedent pairs for classifiers

		:param lex: the LexData object with gazetteer information and model settings
		:param antecedent: The antecedent Markable potentially coreferring to self
		:param candidate_list: The list of candidate markables under consideration, used to extract cohort size
		:param dump_position: Whether document name + token positions are dumped for each markable to compare to gold
		:return: dictionary of markable properties
		"""

		out_dict = OrderedDict()
		if dump_position:
			out_dict["position"] = str(self.start)+"-"+str(self.end)+";"+str(antecedent.start)+"-"+str(antecedent.end)
		out_dict["docname"] = lex.docname

		# TODO: Make genre representation configurable
		# By convention, genre is taken from first 4 chars of file, but for corpora like GUM use part between underscores
		if lex.docname.startswith("GUM_") or lex.docname.lower().startswith("autogum_") or lex.docname.lower().startswith("amalgum_"):
			out_dict["genre"] = lex.docname.split("_")[1]
		elif len(lex.docname) > 4:
			out_dict["genre"] = lex.docname[:4]
		else:
			out_dict["genre"] = "_"  # Too short to detect genre

		log_props = set([])  # Properties to log-transform before dump, if desired
		bool_props = {"coordinate","quoted","negated","neg_parent"}  # Boolean props to dump as 1 or 0
		thresh_props = {"lemma","head_text"}
		f_threshold = 0  # Minimum frequency for lexical categories in freqs.tab (0 to ignore, else items with lower freq are replace by POS)

		# TODO: May need to lower() head_text, since cap/uncapped version may be in lex/entity dep score lexicon
		anaphor_parent = self.head.head_text
		ana_props = ["lemma","func","head_text","form","pos","agree","start","end","lemma_freq",
						  "cardinality", "definiteness","entity","subclass", "infstat", "coordinate",
						  "length", "mod_count", "doc_position","sent_position","quoted","negated","neg_parent","s_type"]
		ante_props = ["lemma","func","head_text","form","pos","agree","start","end","lemma_freq",
						  "cardinality", "definiteness","entity","subclass", "infstat", "coordinate",
						  "length", "mod_count", "doc_position","sent_position","quoted","negated","neg_parent","s_type"]
		for prop in ana_props:
			val = getattr(self,prop)
			if prop in log_props:
				val = log(val+1)
			elif prop in bool_props:
				val = int(val)
			elif prop in ["lemma"]:
				if lex.freqs[val] < f_threshold:
					val = self.pos
			elif prop in ["head_text"]:
				if lex.freqs[val] < f_threshold:
					val = self.head_pos
			out_dict["n_" + prop] = val if val != "" else "_"

		if antecedent is not None:
			for prop in ante_props:
				val = getattr(antecedent,prop)
				if prop in log_props:
					val = log(val + 1)
				elif prop in bool_props:
					val = int(val)
				elif prop in thresh_props:
					if lex.freqs[val] < f_threshold:
						if prop == "lemma":
							val = antecedent.pos
						elif prop == "head_text":
							val = antecedent.head_pos
				out_dict["t_" + prop] = val if val != "" else "_"

			anaphor = self

			out_dict["d_sent"] = anaphor.sent_num - antecedent.sent_num
			out_dict["d_tok"] = anaphor.start - antecedent.end
			out_dict["d_agr"] = int(anaphor.agree == antecedent.agree)
			out_dict["d_intervene"] = abs(int(re.sub('.*_', '', anaphor.id)) - int(re.sub('.*_', '', antecedent.id)))  # Number of markables between ana and ante
			out_dict["d_cohort"] = len(candidate_list)
			out_dict["d_modcount"] = anaphor.mod_count - antecedent.mod_count
			out_dict["d_samemods"] = len(list(set([m.lemma for m in anaphor.head.modifiers]) & set([m.lemma for m in antecedent.head.modifiers])))

			hasa = 0
			use_hasa = True
			if "ablations" in lex.debug:
				if "no_hasa" in lex.debug["ablations"]:
					use_hasa = False
			if use_hasa:
				if antecedent.head.text in lex.hasa and lex.filters["possessive_func"].search(self.func) is not None:  # Text based hasa
					if anaphor_parent in lex.hasa[antecedent.head.text]:
						hasa = lex.hasa[antecedent.head.text][anaphor_parent]
				elif antecedent.head.lemma in lex.hasa and lex.filters["possessive_func"].search(self.func) is not None:  # Lemma based hasa
					if anaphor_parent in lex.hasa[antecedent.head.lemma]:
						hasa = lex.hasa[antecedent.head.lemma][anaphor_parent]

			out_dict["d_hasa"] = hasa
			out_dict["d_entidep"] = self.entity_dep_scores[antecedent.entity]
			out_dict["d_entisimdep"] = self.entity_sim_dep_scores[antecedent.entity]
			out_dict["d_lexdep"] = self.lex_dep_scores[antecedent.head.text]
			out_dict["d_lexsimdep"] = self.lex_sim_dep_scores[antecedent.head.text]
			out_dict["d_sametext"] = int(anaphor.text == antecedent.text)
			out_dict["d_samelemma"] = int(anaphor.lemma == antecedent.lemma)

			out_dict["d_doclen"] = int(anaphor.head.lex.token_count)


			# Check if one markable head is the dependency parent of the other
			if antecedent.head.head == anaphor.head.id:
				out_dict["d_parent"] = 1
			elif anaphor.head.head == antecedent.head.id:
				out_dict["d_parent"] = -1
			else:
				out_dict["d_parent"] = 0

			if self.speaker == "" and antecedent.speaker == "":
				out_dict["d_speaker"] = 0
			elif self.speaker == antecedent.speaker:
				out_dict["d_speaker"] = 1
			else:
				out_dict["d_speaker"] = -1

		return out_dict

	def __getattr__(self, item):
		# Convenience methods to access head token and containing sentence
		if item in self.tok_props:
			return getattr(self.head,item)
		elif item == "text_lower":
			if self.coordinate:  # If this is a coordinate markable return lower case core_text
				return self.core_text.lower()
			else:  # Otherwise return lower text of head token
				return getattr(self.head, item)
		elif item in ["mood", "speaker", "sent_num", "s_type"]:
			return getattr(self.sentence,item)
		elif item == "child_func_string":
			# Check for cached child_func_string
			if "child_func_string" not in self.__dict__:  # Convenience property to store semi-colon separated child funcs of head token
				# Assemble if not yet cached
				if len(self.head.child_funcs) > 1:
					self.child_func_string = ";" + ";".join(self.head.child_funcs) + ";"
				else:
					self.child_func_string = "_"
			return self.child_func_string
		else:
			raise AttributeError("No attribute: " + str(item))


class Sentence:
	def __init__(self, sent_num, start_offset, mood="", speaker=""):
		self.sent_num = sent_num
		self.start_offset = start_offset
		self.mood = mood
		self.speaker = speaker
		self.token_count = 0
		self.s_type = "_"  # Initial type, will be overwritten if s_type annotation is found in input

	def __repr__(self):
		mood = "(no mood info)" if self.mood == "" else self.mood
		speaker = "(no speaker info)" if self.speaker == "" else self.speaker
		return "S" + str(self.sent_num) + " from T" + str(self.start_offset + 1) + ", mood: " + mood  + ", speaker: " + speaker + ", type: " + self.s_type


def get_descendants(parent, children_dict, seen_tokens, sent_num, conll_tokens):
	my_descendants = []
	my_descendants += children_dict[parent]
	for child in children_dict[parent]:
		if child in seen_tokens:
			if sys.version_info[0] < 3:
				sys.stderr.write("\nCycle detected in syntax tree in " + conll_tokens[int(parent)].lex.docname + " in sentence " + str(sent_num) + " (child of token: '" + conll_tokens[int(parent)].text.encode("utf8") + "')\n")
			else:
				sys.stderr.write("\nCycle detected in syntax tree in " + conll_tokens[int(parent)].lex.docname + " in sentence " + str(sent_num) + " (child of token: '" + conll_tokens[int(parent)].text + "')\n")
			sys.exit("Exiting due to invalid input\n")
		else:
			seen_tokens += [child]
	for child in children_dict[parent]:
		if child in children_dict:
			my_descendants += get_descendants(child, children_dict, seen_tokens, sent_num, conll_tokens)
	return my_descendants