# coding=utf-8
from __future__ import unicode_literals

from builtins import object

from snips_inference_agl.resources import get_stems


def stem(string, language, resources):
    """Normalizes ``string``, tokenizes it and stems each resulting token"""
    from snips_nlu_utils import normalize

    normalized_string = normalize(string)
    tokens = tokenize_light(normalized_string, language)
    stemmed_tokens = [_stem(token, resources) for token in tokens]
    return " ".join(stemmed_tokens)


def stem_token(token, resources):
    """Stems a :class:`.Token`, caching the normalized and stemmed values"""
    from snips_nlu_utils import normalize

    if token.stemmed_value:
        return token.stemmed_value
    if not token.normalized_value:
        token.normalized_value = normalize(token.value)
    token.stemmed_value = _stem(token.normalized_value, resources)
    return token.stemmed_value


def normalize_token(token):
    """Normalizes a :class:`.Token`, caching the normalized value"""
    from snips_nlu_utils import normalize

    if token.normalized_value:
        return token.normalized_value
    token.normalized_value = normalize(token.value)
    return token.normalized_value


def _stem(string, resources):
    # Return the known stem when one is available, otherwise the string itself
    return get_stems(resources).get(string, string)
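
# Illustrative sketch (an assumption, not part of the original module): the
# ``resources`` argument is expected to expose a stems mapping through
# ``get_stems``, e.g. something like {"stems": {"lights": "light", "rooms": "room"}}.
# With such a mapping, ``stem("Rooms lights", "en", resources)`` would normalize
# and tokenize the input and then return "room light".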


class Token(object):
    """Token object which is output by the tokenization

    Attributes:
        value (str): Tokenized string
        start (int): Start position of the token within the sentence
        end (int): End position of the token within the sentence
        normalized_value (str): Normalized value of the tokenized string
        stemmed_value (str): Stemmed value of the tokenized string
    """

    def __init__(self, value, start, end, normalized_value=None,
                 stemmed_value=None):
        self.value = value
        self.start = start
        self.end = end
        self.normalized_value = normalized_value
        self.stemmed_value = stemmed_value

    def __eq__(self, other):
        if not isinstance(other, type(self)):
            return False
        return (self.value == other.value
                and self.start == other.start
                and self.end == other.end)

    def __ne__(self, other):
        return not self.__eq__(other)
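
# Note (added for clarity): Token equality compares only ``value``, ``start``
# and ``end``; the cached ``normalized_value`` and ``stemmed_value`` fields are
# ignored, so for instance Token("Foo", 0, 3) and
# Token("Foo", 0, 3, normalized_value="foo") compare equal.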


def tokenize(string, language):
    """Tokenizes the input

    Args:
        string (str): Input to tokenize
        language (str): Language to use during tokenization

    Returns:
        list of :class:`.Token`: The list of tokenized values
    """
    from snips_nlu_utils import tokenize as _tokenize

    tokens = [Token(value=token["value"],
                    start=token["char_range"]["start"],
                    end=token["char_range"]["end"])
              for token in _tokenize(string, language)]
    return tokens


def tokenize_light(string, language):
    """Same behavior as :func:`tokenize` but returns tokenized strings instead
    of :class:`Token` objects"""
    from snips_nlu_utils import tokenize_light as _tokenize_light

    tokenized_string = _tokenize_light(string, language)
    return tokenized_string
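

if __name__ == "__main__":
    # Minimal usage sketch (not part of the original module). Assumes
    # ``snips_nlu_utils`` is installed and that "en" is a supported language
    # code; the sample sentence is purely illustrative.
    sample = "Turn the Living Room lights ON"
    for token in tokenize(sample, "en"):
        print(token.value, token.start, token.end, normalize_token(token))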