Source code for adeft.util

"""Utility functions used by Adeft internally.

"""
import re
from unicodedata import category

from adeft.nlp import word_tokenize, word_detokenize


def get_candidate_fragments(text, shortform, window=100):
    """Return candidate longform fragments from text

    Gets fragments of text preceding defining patterns (DPs) to search
    for candidate longforms. Each fragment contains either a specified
    range of characters before a DP, or characters up until either the
    start of the sentence or the end of a previous DP.

    Parameters
    ----------
    text : str
        Text to search for defining patterns (DP)
    shortform : str
        Shortform to disambiguate
    window : Optional[int]
        Specifies range of characters before a defining pattern (DP)
        to consider when finding longforms. If set to 30, candidate
        longforms would be taken from the string
        "ters before a defining pattern". Default: 100

    Returns
    -------
    list of str
        Fragments of text preceding each defining pattern
    """
    # Find defining patterns by matching a regular expression
    matches = re.finditer(r'\s\(%s\)' % re.escape(shortform), text)
    # Keep track of the index of the end of the previous defining pattern.
    # Longform candidates cannot contain a previous DP or any text
    # before it
    end_previous = -1
    result = []
    for match in matches:
        # coordinates of current match
        span = match.span()
        # beginning of window containing longform candidate
        left = max(end_previous + 1, span[0] - window)
        # fragment of text in this window
        fragment = text[left:span[0]]
        if not fragment:
            continue
        result.append(fragment)
        end_previous = span[1]
    return result
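

# Illustrative usage (not part of the original module). A minimal sketch of
# how get_candidate_fragments extracts the text preceding a defining pattern;
# the sentence below is a hypothetical example.
#
#     text = 'The integrated stress response (ISR) restores homeostasis.'
#     get_candidate_fragments(text, 'ISR')
#     # -> ['The integrated stress response']

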
def get_candidate(fragment):
    """Return tokens in candidate fragment, excluding punctuation

    Parameters
    ----------
    fragment : str
        The fragment to return tokens from.

    Returns
    -------
    list of str
        Tokens in the fragment, excluding single character punctuation
    dict
        Map from the number of processed tokens, counted from the end of
        the fragment, to the corresponding span of original text
    """
    fragment = fragment.strip()
    tokens = word_tokenize(fragment)
    longform_map = {}
    i, j = len(tokens) - 1, 0
    processed_tokens = []
    while i >= 0:
        # Exclude single character tokens that are punctuation
        if len(tokens[i][0]) > 1 or not category(tokens[i][0]).startswith('P'):
            processed_tokens.append(tokens[i][0])
            longform_map[j+1] = word_detokenize(tokens[i:])
            j += 1
        i -= 1
    longform_map[len(processed_tokens)] = fragment
    processed_tokens.reverse()
    return processed_tokens, longform_map
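

# Illustrative usage (not part of the original module). A rough sketch of what
# get_candidate returns, assuming adeft.nlp.word_tokenize splits the fragment
# into word tokens and word_detokenize reconstructs the original text; the
# fragment below is a hypothetical example.
#
#     tokens, longform_map = get_candidate('the integrated stress response')
#     # tokens          -> ['the', 'integrated', 'stress', 'response']
#     # longform_map[2] -> 'stress response'
#     # longform_map[4] -> 'the integrated stress response'

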
class _TrieNode(object):
    """TrieNode structure for use in recognizer

    Attributes
    ----------
    data : str or None
        Set to the associated longform at leaf nodes in the trie,
        otherwise None. Each longform corresponds to a path in the
        trie from root to leaf.
    children : dict
        dict mapping tokens to child nodes
    """
    __slots__ = ['data', 'children']

    def __init__(self, data=None):
        self.data = data
        self.children = {}


class SearchTrie(object):
    def __init__(self, lexicon, expander=None, token_map=None):
        """Initialize search trie with longforms in grounding map

        Parameters
        ----------
        lexicon : iterable of str
            Longforms to add to the trie
        expander : Optional[callable]
            Function mapping a longform to a list of variant forms that
            should also map to it in the trie. Defaults to mapping each
            longform to a list containing only itself.
        token_map : Optional[callable]
            Function applied to each token before it is used as an edge
            in the trie. Defaults to the identity function.
        """
        if expander is None:
            def expander(x):
                return [x]
        if token_map is None:
            def token_map(x):
                return x
        root = _TrieNode()
        self._trie = root
        for longform in lexicon:
            for expansion in expander(longform):
                edges = tuple(token_map(token)
                              for token in get_candidate(expansion)[0][::-1])
                self.add(edges, longform)
        self.token_map = token_map

    def add(self, tokens, data):
        """Add a sequence of tokens to the trie, storing data at its end"""
        current = self._trie
        for index, token in enumerate(tokens):
            if token not in current.children:
                if index == len(tokens) - 1:
                    new = _TrieNode(data)
                else:
                    new = _TrieNode()
                current.children[token] = new
                current = new
            else:
                current = current.children[token]
                if index == len(tokens) - 1:
                    current.data = data

    def search(self, tokens):
        """Find longform expansion based on grounding map

        Parameters
        ----------
        tokens : list of str
            contains tokens that precede the occurrence of the pattern
            "<longform> (<shortform>)" up until start of window

        Returns
        -------
        str or None
            Identified longform expansion, or None if no longform in the
            trie matches
        str
            Text of the tokens matched along the path through the trie
        """
        current = self._trie
        result = None
        match_text = []
        for token, mapped_token in tuple((token, self.token_map(token))
                                         for token in tokens[::-1]):
            if mapped_token not in current.children:
                break
            match_text.append(token)
            if current.children[mapped_token].data is not None:
                result = current.children[mapped_token].data
            current = current.children[mapped_token]
        match_text = ' '.join(match_text[::-1])
        return result, match_text
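

# Illustrative usage (not part of the original module). A minimal sketch of
# building a SearchTrie from a hypothetical lexicon and searching the tokens
# preceding a defining pattern, assuming the tokenizer splits fragments into
# word tokens as in the example above.
#
#     trie = SearchTrie(['integrated stress response'])
#     tokens, _ = get_candidate('activation of the integrated stress response')
#     longform, match_text = trie.search(tokens)
#     # longform   -> 'integrated stress response'
#     # match_text -> 'integrated stress response'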