| author | 2023-10-22 21:06:23 +0500 |
|---|---|
| committer | 2023-10-23 14:38:13 +0000 |
| commit | 697a1adce1e463079e640b55d6386cf82d7bd6bc (patch) |
| tree | 86e299cc7fe12b10c2e549f640924b61c7d07a95 /snips_inference_agl/preprocessing.py |
| parent | 97029ab8141e654a170a2282106f854037da294f (diff) |
Add Snips Inference Module
Add a slightly modified version of the original Snips NLU
library. This module adds support for Python versions up to
3.10.
Bug-AGL: SPEC-4856
Signed-off-by: Malik Talha <talhamalik727x@gmail.com>
Change-Id: I6d7e9eb181e6ff4aed9b6291027877ccb9f0d846
Diffstat (limited to 'snips_inference_agl/preprocessing.py')
-rw-r--r-- | snips_inference_agl/preprocessing.py | 97
1 file changed, 97 insertions, 0 deletions
diff --git a/snips_inference_agl/preprocessing.py b/snips_inference_agl/preprocessing.py
new file mode 100644
index 0000000..cfb4aa5
--- /dev/null
+++ b/snips_inference_agl/preprocessing.py
@@ -0,0 +1,97 @@
+# coding=utf-8
+from __future__ import unicode_literals
+
+from builtins import object
+
+from snips_inference_agl.resources import get_stems
+
+
+def stem(string, language, resources):
+    from snips_nlu_utils import normalize
+
+    normalized_string = normalize(string)
+    tokens = tokenize_light(normalized_string, language)
+    stemmed_tokens = [_stem(token, resources) for token in tokens]
+    return " ".join(stemmed_tokens)
+
+
+def stem_token(token, resources):
+    from snips_nlu_utils import normalize
+
+    if token.stemmed_value:
+        return token.stemmed_value
+    if not token.normalized_value:
+        token.normalized_value = normalize(token.value)
+    token.stemmed_value = _stem(token.normalized_value, resources)
+    return token.stemmed_value
+
+
+def normalize_token(token):
+    from snips_nlu_utils import normalize
+
+    if token.normalized_value:
+        return token.normalized_value
+    token.normalized_value = normalize(token.value)
+    return token.normalized_value
+
+
+def _stem(string, resources):
+    return get_stems(resources).get(string, string)
+
+
+class Token(object):
+    """Token object which is output by the tokenization
+
+    Attributes:
+        value (str): Tokenized string
+        start (int): Start position of the token within the sentence
+        end (int): End position of the token within the sentence
+        normalized_value (str): Normalized value of the tokenized string
+        stemmed_value (str): Stemmed value of the tokenized string
+    """
+
+    def __init__(self, value, start, end, normalized_value=None,
+                 stemmed_value=None):
+        self.value = value
+        self.start = start
+        self.end = end
+        self.normalized_value = normalized_value
+        self.stemmed_value = stemmed_value
+
+    def __eq__(self, other):
+        if not isinstance(other, type(self)):
+            return False
+        return (self.value == other.value
+                and self.start == other.start
+                and self.end == other.end)
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
+
+def tokenize(string, language):
+    """Tokenizes the input
+
+    Args:
+        string (str): Input to tokenize
+        language (str): Language to use during tokenization
+
+    Returns:
+        list of :class:`.Token`: The list of tokenized values
+    """
+    from snips_nlu_utils import tokenize as _tokenize
+
+    tokens = [Token(value=token["value"],
+                    start=token["char_range"]["start"],
+                    end=token["char_range"]["end"])
+              for token in _tokenize(string, language)]
+    return tokens
+
+
+def tokenize_light(string, language):
+    """Same behavior as :func:`tokenize` but returns tokenized strings instead
+    of :class:`Token` objects"""
+    from snips_nlu_utils import tokenize_light as _tokenize_light
+
+    tokenized_string = _tokenize_light(string, language)
+    return tokenized_string
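
For reference, a minimal usage sketch of the preprocessing module added by this commit (not part of the patch itself). It assumes `snips_inference_agl` and its `snips_nlu_utils` dependency are installed, that the tokenizer accepts the `"en"` language code, and that `get_stems(resources)` reads a `"stems"` mapping from the resources dict, as in the upstream Snips NLU resource layout; the sample sentence and toy stem entry are illustrative only.

```python
# Hypothetical usage sketch for snips_inference_agl/preprocessing.py.
# Assumes snips_inference_agl (and snips_nlu_utils) are importable; the
# sample sentence and the toy "stems" mapping are illustrative only.
from snips_inference_agl.preprocessing import stem, tokenize, tokenize_light

sentence = "Turn the lights on in the kitchen"

# tokenize() wraps snips_nlu_utils tokenization and returns Token
# objects carrying the surface value plus character offsets.
for token in tokenize(sentence, "en"):
    print(token.value, token.start, token.end)

# tokenize_light() returns plain strings instead of Token objects.
print(tokenize_light(sentence, "en"))

# stem() normalizes the string, tokenizes it, and maps each token
# through the stem lookup, falling back to the token itself when no
# stem is known.
# ASSUMPTION: get_stems(resources) reads resources["stems"], matching
# the upstream Snips NLU resource layout.
toy_resources = {"stems": {"lights": "light"}}
# Expected (assuming normalize() lowercases):
# "turn the light on in the kitchen"
print(stem(sentence, "en", toy_resources))
```

Note the deferred `snips_nlu_utils` imports inside each function: the module stays importable without pulling in the native tokenization utilities until they are actually called.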