Source code for adeft.util

"""Utility functions used by Adeft internally.

"""
import re
from unicodedata import category

from adeft.nlp import word_tokenize, word_detokenize


def get_candidate_fragments(text, shortform, window=100):
    """Return candidate longform fragments from text

    Gets fragments of text preceding defining patterns (DPs) to search
    for candidate longforms. Each fragment contains either a specified
    range of characters before a DP, or characters up until either the
    start of the sentence or the end of a previous DP.

    Parameters
    ----------
    text : str
        Text to search for defining patterns (DP)
    shortform : str
        Shortform to disambiguate
    window : Optional[int]
        Specifies range of characters before a defining pattern (DP)
        to consider when finding longforms. If set to 30, candidate
        longforms would be taken from the string
        "ters before a defining pattern". Default: 100

    Returns
    -------
    list of str
        Fragments of text preceding each defining pattern
    """
    # Find defining patterns by matching a regular expression
    matches = re.finditer(r'\s\(%s\)' % re.escape(shortform), text)
    # Keep track of the index of the end of the previous defining pattern.
    # Longform candidates cannot contain a previous DP or any text
    # before it
    end_previous = -1
    result = []
    for match in matches:
        # coordinates of current match
        span = match.span()
        # beginning of window containing longform candidate
        left = max(end_previous + 1, span[0] - window)
        # fragment of text in this window
        fragment = text[left:span[0]]
        if not fragment:
            continue
        result.append(fragment)
        end_previous = span[1]
    return result
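

# Illustrative usage (not part of the original module). A minimal sketch of
# how get_candidate_fragments extracts the text preceding a defining pattern;
# the sentence below is a hypothetical example.
#
#     text = 'The integrated stress response (ISR) restores homeostasis.'
#     get_candidate_fragments(text, 'ISR')
#     # -> ['The integrated stress response']

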
def get_candidate(fragment):
    """Return tokens in candidate fragment, excluding punctuation

    Parameters
    ----------
    fragment : str
        The fragment to return tokens from.

    Returns
    -------
    list of str
        Tokens in the fragment, excluding single character punctuation
    dict
        Map from the number of processed tokens, counted from the end of
        the fragment, to the corresponding span of original text
    """
    fragment = fragment.strip()
    tokens = word_tokenize(fragment)
    longform_map = {}
    i, j = len(tokens) - 1, 0
    processed_tokens = []
    while i >= 0:
        # Exclude single character tokens that are punctuation
        if len(tokens[i][0]) > 1 or not category(tokens[i][0]).startswith('P'):
            processed_tokens.append(tokens[i][0])
            longform_map[j+1] = word_detokenize(tokens[i:])
            j += 1
        i -= 1
    longform_map[len(processed_tokens)] = fragment
    processed_tokens.reverse()
    return processed_tokens, longform_map
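

# Illustrative usage (not part of the original module). A rough sketch of what
# get_candidate returns, assuming adeft.nlp.word_tokenize splits the fragment
# into word tokens and word_detokenize reconstructs the original text; the
# fragment below is a hypothetical example.
#
#     tokens, longform_map = get_candidate('the integrated stress response')
#     # tokens          -> ['the', 'integrated', 'stress', 'response']
#     # longform_map[2] -> 'stress response'
#     # longform_map[4] -> 'the integrated stress response'

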
class _TrieNode(object):
    """TrieNode structure for use in recognizer

    Attributes
    ----------
    data : str or None
        Set to the associated longform at leaf nodes in the trie,
        otherwise None. Each longform corresponds to a path in the
        trie from root to leaf.
    children : dict
        dict mapping tokens to child nodes
    """
    __slots__ = ['data', 'children']

    def __init__(self, data=None):
        self.data = data
        self.children = {}


class SearchTrie(object):
    def __init__(self, lexicon, expander=None, token_map=None):
        """Initialize search trie with longforms in grounding map

        Parameters
        ----------
        lexicon : iterable of str
            Longforms to add to the trie
        expander : Optional[callable]
            Function mapping a longform to a list of variant forms that
            should also map to it in the trie. Defaults to mapping each
            longform to a list containing only itself.
        token_map : Optional[callable]
            Function applied to each token before it is used as an edge
            in the trie. Defaults to the identity function.
        """
        if expander is None:
            def expander(x):
                return [x]
        if token_map is None:
            def token_map(x):
                return x
        root = _TrieNode()
        self._trie = root
        for longform in lexicon:
            for expansion in expander(longform):
                edges = tuple(token_map(token)
                              for token in get_candidate(expansion)[0][::-1])
                self.add(edges, longform)
        self.token_map = token_map

    def add(self, tokens, data):
        """Add a sequence of tokens to the trie, storing data at its end"""
        current = self._trie
        for index, token in enumerate(tokens):
            if token not in current.children:
                if index == len(tokens) - 1:
                    new = _TrieNode(data)
                else:
                    new = _TrieNode()
                current.children[token] = new
                current = new
            else:
                current = current.children[token]
                if index == len(tokens) - 1:
                    current.data = data

    def search(self, tokens):
        """Find longform expansion based on grounding map

        Parameters
        ----------
        tokens : list of str
            contains tokens that precede the occurrence of the pattern
            "<longform> (<shortform>)" up until start of window

        Returns
        -------
        str or None
            Identified longform expansion, or None if no longform in the
            trie matches
        str
            Text of the tokens matched along the path through the trie
        """
        current = self._trie
        result = None
        match_text = []
        for token, mapped_token in tuple((token, self.token_map(token))
                                         for token in tokens[::-1]):
            if mapped_token not in current.children:
                break
            match_text.append(token)
            if current.children[mapped_token].data is not None:
                result = current.children[mapped_token].data
            current = current.children[mapped_token]
        match_text = ' '.join(match_text[::-1])
        return result, match_text
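

# Illustrative usage (not part of the original module). A minimal sketch of
# building a SearchTrie from a hypothetical lexicon and searching the tokens
# preceding a defining pattern, assuming the tokenizer splits fragments into
# word tokens as in the example above.
#
#     trie = SearchTrie(['integrated stress response'])
#     tokens, _ = get_candidate('activation of the integrated stress response')
#     longform, match_text = trie.search(tokens)
#     # longform   -> 'integrated stress response'
#     # match_text -> 'integrated stress response'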