author    Malik Talha <talhamalik727x@gmail.com>    2023-10-22 21:06:23 +0500
committer Jan-Simon Moeller <jsmoeller@linuxfoundation.org>    2023-10-23 14:38:13 +0000
commit    697a1adce1e463079e640b55d6386cf82d7bd6bc (patch)
tree      86e299cc7fe12b10c2e549f640924b61c7d07a95 /snips_inference_agl/preprocessing.py
parent    97029ab8141e654a170a2282106f854037da294f (diff)
Add Snips Inference Module

Add a slightly modified version of the original Snips NLU library. This
module adds support for Python up to version 3.10.

Bug-AGL: SPEC-4856
Signed-off-by: Malik Talha <talhamalik727x@gmail.com>
Change-Id: I6d7e9eb181e6ff4aed9b6291027877ccb9f0d846
Diffstat (limited to 'snips_inference_agl/preprocessing.py')
-rw-r--r--  snips_inference_agl/preprocessing.py  97
1 file changed, 97 insertions(+), 0 deletions(-)
diff --git a/snips_inference_agl/preprocessing.py b/snips_inference_agl/preprocessing.py
new file mode 100644
index 0000000..cfb4aa5
--- /dev/null
+++ b/snips_inference_agl/preprocessing.py
@@ -0,0 +1,97 @@
+# coding=utf-8
+from __future__ import unicode_literals
+
+from builtins import object
+
+from snips_inference_agl.resources import get_stems
+
+
+def stem(string, language, resources):
+ from snips_nlu_utils import normalize
+
+ normalized_string = normalize(string)
+ tokens = tokenize_light(normalized_string, language)
+ stemmed_tokens = [_stem(token, resources) for token in tokens]
+ return " ".join(stemmed_tokens)
+
+
+def stem_token(token, resources):
+ from snips_nlu_utils import normalize
+
+ if token.stemmed_value:
+ return token.stemmed_value
+ if not token.normalized_value:
+ token.normalized_value = normalize(token.value)
+ token.stemmed_value = _stem(token.normalized_value, resources)
+ return token.stemmed_value
+
+
+def normalize_token(token):
+ from snips_nlu_utils import normalize
+
+ if token.normalized_value:
+ return token.normalized_value
+ token.normalized_value = normalize(token.value)
+ return token.normalized_value
+
+
+def _stem(string, resources):
+ return get_stems(resources).get(string, string)
+
+
+class Token(object):
+ """Token object which is output by the tokenization
+
+ Attributes:
+ value (str): Tokenized string
+ start (int): Start position of the token within the sentence
+ end (int): End position of the token within the sentence
+ normalized_value (str): Normalized value of the tokenized string
+ stemmed_value (str): Stemmed value of the tokenized string
+ """
+
+ def __init__(self, value, start, end, normalized_value=None,
+ stemmed_value=None):
+ self.value = value
+ self.start = start
+ self.end = end
+ self.normalized_value = normalized_value
+ self.stemmed_value = stemmed_value
+
+ def __eq__(self, other):
+ if not isinstance(other, type(self)):
+ return False
+ return (self.value == other.value
+ and self.start == other.start
+ and self.end == other.end)
+
+ def __ne__(self, other):
+ return not self.__eq__(other)
+
+
+def tokenize(string, language):
+ """Tokenizes the input
+
+ Args:
+ string (str): Input to tokenize
+ language (str): Language to use during tokenization
+
+ Returns:
+ list of :class:`.Token`: The list of tokenized values
+ """
+ from snips_nlu_utils import tokenize as _tokenize
+
+ tokens = [Token(value=token["value"],
+ start=token["char_range"]["start"],
+ end=token["char_range"]["end"])
+ for token in _tokenize(string, language)]
+ return tokens
+
+
+def tokenize_light(string, language):
+ """Same behavior as :func:`tokenize` but returns tokenized strings instead
+ of :class:`Token` objects"""
+ from snips_nlu_utils import tokenize_light as _tokenize_light
+
+ tokenized_string = _tokenize_light(string, language)
+ return tokenized_string
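
For context, a minimal usage sketch of the tokenization helpers added by this
commit (hypothetical, not part of the change itself; it assumes the
snips-nlu-utils Python package is installed and that the module is importable
as snips_inference_agl.preprocessing):

    from snips_inference_agl.preprocessing import (
        normalize_token, tokenize, tokenize_light)

    # tokenize returns Token objects carrying character offsets.
    for token in tokenize("Turn the volume up", "en"):
        print(token.value, token.start, token.end, normalize_token(token))

    # tokenize_light returns the tokenized strings only.
    print(tokenize_light("Turn the volume up", "en"))

stem and stem_token additionally take a resources argument: a language
resources mapping (loaded through snips_inference_agl.resources) whose stem
lookup backs the _stem helper, so they are not shown here.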