Source code for adeft.modeling.label

from adeft.recognize import AdeftRecognizer


class AdeftLabeler:
    """Class for labeling corpora

    Parameters
    ----------
    grounding_dict : dict of dict of str
        Dictionary mapping shortforms to grounding_map dictionaries mapping
        longforms to groundings

    Attributes
    ----------
    grounding_dict : dict of dict of str
        The grounding dictionary passed at construction time.
    recognizers : list of :py:class:`adeft.recognize.AdeftRecognizer`
        List of recognizers for each shortform to be considered. Each
        recognizer identifies longforms for a shortform by finding defining
        matches to a defining pattern (DP)
    """
    def __init__(self, grounding_dict):
        self.grounding_dict = grounding_dict
        # One recognizer per shortform, built from that shortform's
        # grounding map.
        self.recognizers = [AdeftRecognizer(shortform, grounding_map)
                            for shortform, grounding_map
                            in grounding_dict.items()]

    def build_from_texts(self, text_tuples):
        """Build labeled corpus from a list of texts

        Labels texts based on defining patterns (DPs)

        Parameters
        ----------
        text_tuples : list of tuple
            List of two element tuples whose first elements are texts from
            which we seek to build a corpus and whose second elements are
            identifiers associated with the texts. Each text should have a
            unique identifier associated to it.

        Returns
        -------
        corpus : list
            Contains a tuple for each text in the input list which contains
            a defining pattern. Multiple tuples correspond to texts with
            multiple defining patterns for longforms with different
            groundings. The first element of each tuple contains a training
            text with all defining patterns replaced with only the
            shortform. The second element contains a grounding label for
            the desired shortform within the training text that was
            identified through a defining pattern. The third element
            contains the identifier for the given training text.
        """
        corpus = []
        for text, identifier in text_tuples:
            data_points = self._process_text(text)
            # _process_text returns None when no defining pattern matched;
            # such texts contribute nothing to the corpus.
            if data_points:
                corpus.extend((*data_point, identifier)
                              for data_point in data_points)
        return corpus

    def _process_text(self, text):
        """Returns training data and label corresponding to text if found

        The training text corresponding to an input text is obtained by
        stripping out all occurrences of (<shortform>). It is possible that
        several longforms with different groundings are matched with the
        standard pattern. In this case, multiple datapoints are returned,
        each with a different label but the same training text.

        Parameters
        ----------
        text : str
            Fulltext to build datapoint from, if possible.

        Returns
        -------
        datapoints : list of tuple or None
            Returns None if no label can be found by matching the standard
            pattern. Otherwise, returns a list of pairs containing the
            training text and a label for each distinct label appearing in
            the input text matching the standard pattern. Labels appear in
            the order in which the recognizers first report them.
        """
        # De-duplicate while keeping first-seen order. Iterating a plain set
        # of string labels would give an order that changes between
        # interpreter runs under hash randomization, making the corpus
        # order irreproducible.
        groundings = []
        seen = set()
        for recognizer in self.recognizers:
            for match in recognizer.recognize(text):
                grounding = match['grounding']
                if grounding not in seen:
                    seen.add(grounding)
                    groundings.append(grounding)
        if not groundings:
            return None
        # Recognition above ran against the unmodified text; only now are
        # the defining patterns stripped, one recognizer after another.
        for recognizer in self.recognizers:
            text = recognizer.strip_defining_patterns(text)
        return [(text, grounding) for grounding in groundings]