# Source code for adeft.modeling.label

from adeft.recognize import AdeftRecognizer


class AdeftLabeler(object):
    """Class for labeling corpora

    Parameters
    ----------
    grounding_dict : dict of dict of str
        Dictionary mapping shortforms to grounding_map dictionaries mapping
        longforms to groundings

    Attributes
    ----------
    grounding_dict : dict of dict of str
        The dictionary passed at construction, stored unchanged.
    recognizers : list of :py:class:`adeft.recognize.AdeftRecognizer`
        List of recognizers, one for each shortform in ``grounding_dict``.
        Each recognizer identifies longforms for a shortform by finding
        matches to a defining pattern (DP)
    """
    def __init__(self, grounding_dict):
        self.grounding_dict = grounding_dict
        # One recognizer per (shortform, grounding_map) entry.
        self.recognizers = [AdeftRecognizer(shortform, grounding_map)
                            for shortform, grounding_map
                            in grounding_dict.items()]

    def build_from_texts(self, text_tuples):
        """Build labeled corpus from a list of texts

        Labels texts based on defining patterns (DPs)

        Parameters
        ----------
        text_tuples : list of tuple
            List of two element tuples whose first elements are texts from
            which we seek to build a corpus and whose second elements are
            identifiers associated with the texts. Each text should have a
            unique identifier associated to it.

        Returns
        -------
        corpus : list
            Contains a tuple for each text in the input list which contains
            a defining pattern. Multiple tuples correspond to texts with
            multiple defining patterns for longforms with different groundings.
            The first element of each tuple contains a training text with all
            defining patterns replaced with only the shortform. The second
            element contains a grounding label for the desired shortform within
            the training text that was identified through a defining
            pattern. The third element contains the identifier for the given
            training text. Texts in which no defining pattern is found
            contribute nothing to the corpus.
        """
        corpus = []
        for text, identifier in text_tuples:
            data_points = self._process_text(text)
            # _process_text returns None when no grounding was recognized.
            if data_points:
                corpus.extend((*data_point, identifier)
                              for data_point in data_points)
        return corpus

    def _process_text(self, text):
        """Returns training data and label corresponding to text if found

        The training text corresponding to an input text is obtained by
        passing it through ``strip_defining_patterns`` of every recognizer,
        which strips out all occurrences of (<shortform>). It is possible
        that several longforms with different groundings are matched with
        the standard pattern. In this case, multiple datapoints are returned
        each with different labels but the same training text.

        Parameters
        ----------
        text : str
            Fulltext to build datapoint from, if possible.

        Returns
        -------
        datapoints : list of tuple or None
            Returns None if no label can be found by matching the standard
            pattern. Otherwise, returns a list of pairs containing the training
            text and a label for each distinct label appearing in the input
            text matching the standard pattern. Labels are listed in the
            order in which the recognizers first report them (relies on
            insertion-ordered dicts, guaranteed from Python 3.7).
        """
        # A dict serves as an insertion-ordered set here: a plain set would
        # yield the labels in hash order, which for strings changes from one
        # interpreter run to the next and makes the corpus order
        # irreproducible.
        groundings = {}
        for recognizer in self.recognizers:
            for match in recognizer.recognize(text):
                groundings.setdefault(match['grounding'], None)
        if not groundings:
            return None
        # Recognition above ran on the untouched text; only now strip the
        # defining patterns, letting every recognizer process the result of
        # the previous one.
        for recognizer in self.recognizers:
            text = recognizer.strip_defining_patterns(text)
        return [(text, grounding) for grounding in groundings]