| author | 2023-10-22 21:06:23 +0500 |
|---|---|
| committer | 2023-10-23 14:38:13 +0000 |
| commit | 697a1adce1e463079e640b55d6386cf82d7bd6bc (patch) |
| tree | 86e299cc7fe12b10c2e549f640924b61c7d07a95 /snips_inference_agl/preprocessing.py |
| parent | 97029ab8141e654a170a2282106f854037da294f (diff) |
Add Snips Inference Module
Add a slightly modified version of the original Snips NLU
library. This module adds support for Python versions up to
3.10.
Bug-AGL: SPEC-4856
Signed-off-by: Malik Talha <talhamalik727x@gmail.com>
Change-Id: I6d7e9eb181e6ff4aed9b6291027877ccb9f0d846
Diffstat (limited to 'snips_inference_agl/preprocessing.py')
-rw-r--r-- | snips_inference_agl/preprocessing.py | 97
1 file changed, 97 insertions, 0 deletions
diff --git a/snips_inference_agl/preprocessing.py b/snips_inference_agl/preprocessing.py
new file mode 100644
index 0000000..cfb4aa5
--- /dev/null
+++ b/snips_inference_agl/preprocessing.py
@@ -0,0 +1,97 @@
+# coding=utf-8
+from __future__ import unicode_literals
+
+from builtins import object
+
+from snips_inference_agl.resources import get_stems
+
+
+def stem(string, language, resources):
+    from snips_nlu_utils import normalize
+
+    normalized_string = normalize(string)
+    tokens = tokenize_light(normalized_string, language)
+    stemmed_tokens = [_stem(token, resources) for token in tokens]
+    return " ".join(stemmed_tokens)
+
+
+def stem_token(token, resources):
+    from snips_nlu_utils import normalize
+
+    if token.stemmed_value:
+        return token.stemmed_value
+    if not token.normalized_value:
+        token.normalized_value = normalize(token.value)
+    token.stemmed_value = _stem(token.normalized_value, resources)
+    return token.stemmed_value
+
+
+def normalize_token(token):
+    from snips_nlu_utils import normalize
+
+    if token.normalized_value:
+        return token.normalized_value
+    token.normalized_value = normalize(token.value)
+    return token.normalized_value
+
+
+def _stem(string, resources):
+    return get_stems(resources).get(string, string)
+
+
+class Token(object):
+    """Token object which is output by the tokenization
+
+    Attributes:
+        value (str): Tokenized string
+        start (int): Start position of the token within the sentence
+        end (int): End position of the token within the sentence
+        normalized_value (str): Normalized value of the tokenized string
+        stemmed_value (str): Stemmed value of the tokenized string
+    """
+
+    def __init__(self, value, start, end, normalized_value=None,
+                 stemmed_value=None):
+        self.value = value
+        self.start = start
+        self.end = end
+        self.normalized_value = normalized_value
+        self.stemmed_value = stemmed_value
+
+    def __eq__(self, other):
+        if not isinstance(other, type(self)):
+            return False
+        return (self.value == other.value
+                and self.start == other.start
+                and self.end == other.end)
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
+
+def tokenize(string, language):
+    """Tokenizes the input
+
+    Args:
+        string (str): Input to tokenize
+        language (str): Language to use during tokenization
+
+    Returns:
+        list of :class:`.Token`: The list of tokenized values
+    """
+    from snips_nlu_utils import tokenize as _tokenize
+
+    tokens = [Token(value=token["value"],
+                    start=token["char_range"]["start"],
+                    end=token["char_range"]["end"])
+              for token in _tokenize(string, language)]
+    return tokens
+
+
+def tokenize_light(string, language):
+    """Same behavior as :func:`tokenize` but returns tokenized strings instead
+    of :class:`Token` objects"""
+    from snips_nlu_utils import tokenize_light as _tokenize_light
+
+    tokenized_string = _tokenize_light(string, language)
+    return tokenized_string
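
For reference, a minimal usage sketch of the preprocessing module added by this commit (not part of the patch itself). It assumes `snips_inference_agl` and its `snips_nlu_utils` dependency are installed, that the tokenizer accepts the `"en"` language code, and that `get_stems(resources)` reads a `"stems"` mapping from the resources dict, as in the upstream Snips NLU resource layout; the sample sentence and toy stem entry are illustrative only.

```python
# Hypothetical usage sketch for snips_inference_agl/preprocessing.py.
# Assumes snips_inference_agl (and snips_nlu_utils) are importable; the
# sample sentence and the toy "stems" mapping are illustrative only.
from snips_inference_agl.preprocessing import stem, tokenize, tokenize_light

sentence = "Turn the lights on in the kitchen"

# tokenize() wraps snips_nlu_utils tokenization and returns Token
# objects carrying the surface value plus character offsets.
for token in tokenize(sentence, "en"):
    print(token.value, token.start, token.end)

# tokenize_light() returns plain strings instead of Token objects.
print(tokenize_light(sentence, "en"))

# stem() normalizes the string, tokenizes it, and maps each token
# through the stem lookup, falling back to the token itself when no
# stem is known.
# ASSUMPTION: get_stems(resources) reads resources["stems"], matching
# the upstream Snips NLU resource layout.
toy_resources = {"stems": {"lights": "light"}}
# Expected (assuming normalize() lowercases):
# "turn the light on in the kitchen"
print(stem(sentence, "en", toy_resources))
```

Note the deferred `snips_nlu_utils` imports inside each function: the module stays importable without pulling in the native tokenization utilities until they are actually called.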