# Source code for adeft.recognize

"""Implements the disambiguation of shortforms based on recognizing an
explicit defining pattern in text."""

import re
import string
import logging


from adeft.nlp import stem, word_tokenize, word_detokenize
from adeft.util import get_candidate_fragments, get_candidate, SearchTrie

# Module-level logger. It is named after this file's path (``__file__``).
logger = logging.getLogger(__file__)

# AlignmentBasedScorer is optional: when importing it fails for any reason,
# the failure is logged at INFO level and the name is left undefined in this
# module.
try:
    from adeft.score import AlignmentBasedScorer
except Exception:
    logger.info(
        'OneShotRecognizer not available. Extension module for'
        ' AlignmentBasedScorer is missing'
    )


class BaseRecognizer(object):
    """Base class for recognizers

    Recognizers are built to identify longform expansions for a shortform by
    searching for defining patterns (DPs). Subclasses must implement
    ``_search`` and may override ``_post_process``.

    Parameters
    ----------
    shortform : str
        shortform to be recognized
    window : Optional[int]
        Specifies range of characters before a defining pattern (DP)
        to consider when finding longforms. Should be set to the same value
        that was used in the AdeftMiner that was used to find longforms.
        Default: 100
    """
    def __init__(self, shortform, window=100):
        self.shortform = shortform
        self.window = window

    def recognize(self, text):
        """Find longforms in text by searching for defining patterns (DPs)

        Parameters
        ----------
        text : str
            Text in which we seek to disambiguate shortform

        Returns
        -------
        results : list of dict
            One entry for every candidate fragment in which ``_search``
            recognized a longform. Each entry is the output of
            ``_post_process`` with the additional key ``'longform_text'``,
            whose value is looked up in the longform map returned by
            ``get_candidate``. The list is empty if no longform is
            recognized.
        """
        results = []
        fragments = get_candidate_fragments(text, self.shortform,
                                            window=self.window)
        for fragment in fragments:
            if not fragment:
                continue
            tokens, longform_map = get_candidate(fragment)
            # search for longform among the tokens preceding the DP
            result = self._search(tokens)
            # if a longform is recognized, add it to output list
            if result:
                longform = result['longform']
                # longform_map is indexed by the number of tokens in the
                # recognized longform
                num_tokens = len(word_tokenize(longform))
                longform_text = longform_map[num_tokens]
                result = self._post_process(result)
                result['longform_text'] = longform_text
                results.append(result)
        return results

    def strip_defining_patterns(self, text):
        """Return text with defining patterns stripped

        This is useful for training machine learning models where training
        labels are generated by finding defining patterns (DP)s. Models must
        be trained to disambiguate texts that do not contain a defining
        pattern.

        The output on the first sentence of the previous paragraph is
        "This is useful for training machine learning models where training
        labels are generated by finding DPs."

        Parameters
        ----------
        text : str
            Text to remove defining patterns from

        Returns
        -------
        stripped_text : str
            Text with defining patterns replaced with shortform and runs of
            whitespace collapsed to single spaces
        """
        # Use the recognizer's own window so that stripping looks at the same
        # fragments as recognize() does.
        fragments = get_candidate_fragments(text, self.shortform,
                                            window=self.window)
        for fragment in fragments:
            target = fragment.strip()
            if not target:
                # str.replace with an empty target would insert the
                # replacement between every character of the text
                continue
            # Each fragment is tokenized and its longform is identified
            tokens = word_tokenize(fragment)
            result = self._search([token for token, _ in tokens
                                   if token not in string.punctuation])
            if result is None:
                # For now, ignore a fragment if no longform is found in it
                continue
            longform = result['longform']
            # Remove the longform from the fragment, keeping in mind that
            # punctuation is ignored when extracting longforms from text
            num_words = len(longform.split())
            words_seen = 0
            j = len(tokens) - 1
            # Walk backwards from the end of the fragment until num_words
            # word tokens have been passed. The lower bound on j keeps the
            # scan from wrapping around to negative indices when the fragment
            # holds fewer word tokens than expected.
            while words_seen < num_words and j >= 0:
                if re.match(r'\w+', tokens[j][0]):
                    words_seen += 1
                j -= 1
                if words_seen > self.window:
                    break
            text = text.replace(target, word_detokenize(tokens[:j+1]))
        return self._replace_parenthesized_shortform(text)

    def _replace_parenthesized_shortform(self, text):
        """Replace every "(<shortform>)" in text with the bare shortform

        Whitespace inside the parentheses is tolerated, and all runs of
        whitespace in the result are collapsed to single spaces.

        Parameters
        ----------
        text : str
            Text possibly containing the parenthesized shortform

        Returns
        -------
        stripped_text : str
            Text with parentheses around the shortform removed
        """
        # The shortform is matched literally; without escaping, a shortform
        # such as "Ca2+" would be interpreted as a regular expression.
        pattern = r'\(\s*%s\s*\)' % re.escape(self.shortform)
        replacement = ' ' + self.shortform + ' '
        # A callable replacement keeps re.sub from interpreting backslashes
        # in the shortform as escape sequences or group references.
        stripped_text = re.sub(pattern, lambda _: replacement, text)
        return ' '.join(stripped_text.split())

    def _search(self, tokens):
        """Method to identify longform expansion from tokens preceding DP

        Subclasses must override this method. It takes a list of tokens
        preceding a defining pattern. As used by ``recognize`` and
        ``strip_defining_patterns``, it should return either None or a dict
        containing at least the key ``'longform'``.

        Raises
        ------
        NotImplementedError
            Always, in this base class
        """
        raise NotImplementedError

    def _post_process(self, text):
        """Post-processing step for the result of ``_search``

        Default to no post-processing: the argument is returned unchanged.
        """
        return text


class AdeftRecognizer(BaseRecognizer):
    """Class for recognizing longforms by searching for defining patterns (DP)

    Searches text for the pattern "<longform> (<shortform>)" for a collection
    of grounded longforms supplied by the user.

    Parameters
    ----------
    shortform : str
        shortform to be recognized
    grounding_map : dict[str, str]
        Dictionary mapping longform texts to their groundings
    window : Optional[int]
        Specifies range of characters before a defining pattern (DP)
        to consider when finding longforms. Should be set to the same value
        that was used in the AdeftMiner that was used to find longforms.
        Default: 100

    Attributes
    ----------
    grounding_map : dict[str, str]
        The grounding map supplied at construction
    search_trie : SearchTrie
        Trie built from the grounding map and used to search for longforms.
        Tokens are stemmed and lowercased before being looked up.
    """
    def __init__(self, shortform, grounding_map, window=100):
        super().__init__(shortform, window)

        def normalize(token):
            # Tokens are compared by their lowercased stem
            return stem(token).lower()

        self.grounding_map = grounding_map
        self.search_trie = SearchTrie(grounding_map, token_map=normalize)

    def _search(self, tokens):
        """Look up tokens in the trie; return a result dict or None"""
        longform, _ = self.search_trie.search(tokens)
        if longform is None:
            return None
        return {'longform': longform}

    def _post_process(self, result):
        """Map longform to associated grounding in grounding map"""
        longform = result['longform']
        return {'grounding': self.grounding_map[longform]}


class OneShotRecognizer(BaseRecognizer):
    """Identify longform expansions using subsequence matching

    Uses a string matching algorithm to determine longform boundaries
    for a defining pattern for only a single text.

    Parameters
    ----------
    shortform : str
        shortform to be recognized
    window : Optional[int]
        Specifies range of characters before a defining pattern (DP)
        to consider when finding longforms. Should be set to the same value
        that was used in the AdeftMiner that was used to find longforms.
        Default: 100
    **params
        Keyword arguments passed on to
        :py:class:`adeft.score.AlignmentBasedScorer`

    Attributes
    ----------
    scorer : :py:class:`adeft.score.AlignmentBasedScorer`
        Scorer used to rate candidate longforms. Not set if
        AlignmentBasedScorer could not be imported.
    """
    def __init__(self, shortform, window=100, **params):
        try:
            self.scorer = AlignmentBasedScorer(shortform, **params)
        except NameError:
            # AlignmentBasedScorer is undefined when its import at module
            # load failed. Construction still succeeds for backward
            # compatibility, but the instance has no ``scorer`` attribute
            # and cannot be used for searching.
            logger.exception('OneShotRecognizer not available.'
                             ' Extension module for AlignmentBasedScorer'
                             ' is missing')
        super().__init__(shortform, window)

    def _search(self, tokens):
        """Use AlignmentBasedScorer to identify expansions

        Parameters
        ----------
        tokens : list of str
            Tokens preceding a defining pattern

        Returns
        -------
        result : dict or None
            Dictionary with keys ``'longform'`` (the best scoring candidate,
            made of trailing tokens joined by spaces) and ``'score'`` (its
            score). None if there are no tokens or no scores.
        """
        if not tokens:
            # Nothing precedes the defining pattern, so there is no candidate
            return None
        scores = self.scorer.expanding_score([stem(token).lower()
                                              for token in tokens])
        if len(scores) == 0:
            # max() below would raise ValueError on an empty sequence
            return None
        n = len(tokens)
        # Index of the highest score; the first one wins on ties
        best = max(range(len(scores)), key=lambda k: scores[k])
        # The candidate at index ``best`` consists of the last best + 1 tokens
        longform = ' '.join(tokens[n-best-1:])
        return {'longform': longform, 'score': scores[best]}

    def _post_process(self, result):
        """Keep only the score of the recognized longform"""
        return {'score': result['score']}