Diffstat (limited to 'snips_inference_agl/intent_classifier/log_reg_classifier_utils.py')
-rw-r--r--  snips_inference_agl/intent_classifier/log_reg_classifier_utils.py  94
1 file changed, 94 insertions, 0 deletions
diff --git a/snips_inference_agl/intent_classifier/log_reg_classifier_utils.py b/snips_inference_agl/intent_classifier/log_reg_classifier_utils.py
new file mode 100644
index 0000000..75a8ab1
--- /dev/null
+++ b/snips_inference_agl/intent_classifier/log_reg_classifier_utils.py
@@ -0,0 +1,94 @@
+from __future__ import division, unicode_literals
+
+import itertools
+import re
+from builtins import next, range, str
+from copy import deepcopy
+from uuid import uuid4
+
+from future.utils import iteritems, itervalues
+
+from snips_inference_agl.constants import (DATA, ENTITY, INTENTS, TEXT,
+ UNKNOWNWORD, UTTERANCES)
+from snips_inference_agl.data_augmentation import augment_utterances
+from snips_inference_agl.dataset import get_text_from_chunks
+from snips_inference_agl.entity_parser.builtin_entity_parser import is_builtin_entity
+from snips_inference_agl.preprocessing import tokenize_light
+from snips_inference_agl.resources import get_noise
+
+NOISE_NAME = str(uuid4())
+WORD_REGEX = re.compile(r"\w+(\s+\w+)*")
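+# One or more consecutive UNKNOWNWORD placeholders, used to collapse
+# repeated placeholders into a single token.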
+UNKNOWNWORD_REGEX = re.compile(r"%s(\s+%s)*" % (UNKNOWNWORD, UNKNOWNWORD))
+
+
+def get_noise_it(noise, mean_length, std_length, random_state):
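+    # Endless generator: each iteration samples a sentence length from a
+    # normal distribution and joins that many noise tokens into one string.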
+ it = itertools.cycle(noise)
+ while True:
+ noise_length = int(random_state.normal(mean_length, std_length))
+ # pylint: disable=stop-iteration-return
+ yield " ".join(next(it) for _ in range(noise_length))
+ # pylint: enable=stop-iteration-return
+
+
+def generate_smart_noise(noise, augmented_utterances, replacement_string,
+ language):
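+    # Keep only the noise words that occur in the utterances' vocabulary;
+    # every other noise word is mapped to the replacement string.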
+ text_utterances = [get_text_from_chunks(u[DATA])
+ for u in augmented_utterances]
+ vocab = [w for u in text_utterances for w in tokenize_light(u, language)]
+ vocab = set(vocab)
+ return [w if w in vocab else replacement_string for w in noise]
+
+
+def generate_noise_utterances(augmented_utterances, noise, num_intents,
+ data_augmentation_config, language,
+ random_state):
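+    # Generate synthetic noise utterances whose lengths follow the
+    # empirical length distribution (mean/std) of the real utterances.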
+ import numpy as np
+
+ if not augmented_utterances or not num_intents:
+ return []
+ avg_num_utterances = len(augmented_utterances) / float(num_intents)
+ if data_augmentation_config.unknown_words_replacement_string is not None:
+ noise = generate_smart_noise(
+ noise, augmented_utterances,
+ data_augmentation_config.unknown_words_replacement_string,
+ language)
+
+ noise_size = min(
+ int(data_augmentation_config.noise_factor * avg_num_utterances),
+ len(noise))
+ utterances_lengths = [
+ len(tokenize_light(get_text_from_chunks(u[DATA]), language))
+ for u in augmented_utterances]
+ mean_utterances_length = np.mean(utterances_lengths)
+ std_utterances_length = np.std(utterances_lengths)
+ noise_it = get_noise_it(noise, mean_utterances_length,
+ std_utterances_length, random_state)
+    # Collapse consecutive unknown-word placeholders into a single one
+ return [
+ text_to_utterance(UNKNOWNWORD_REGEX.sub(UNKNOWNWORD, next(noise_it)))
+ for _ in range(noise_size)]
+
+
+def add_unknown_word_to_utterances(utterances, replacement_string,
+ unknown_word_prob, max_unknown_words,
+ random_state):
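+    # With probability unknown_word_prob, append between 1 and
+    # max_unknown_words replacement tokens to a copy of each utterance.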
+ if not max_unknown_words:
+ return utterances
+
+ new_utterances = deepcopy(utterances)
+ for u in new_utterances:
+ if random_state.rand() < unknown_word_prob:
+ num_unknown = random_state.randint(1, max_unknown_words + 1)
+            # We choose to put the noise at the end of the sentence and
+            # not in the middle so that it doesn't impact the ngrams
+            # computation too much
+ extra_chunk = {
+ TEXT: " " + " ".join(
+ replacement_string for _ in range(num_unknown))
+ }
+ u[DATA].append(extra_chunk)
+ return new_utterances
+
+
+def text_to_utterance(text):
+ return {DATA: [{TEXT: text}]}
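
For reference, a minimal sketch of how these helpers fit together at augmentation time. The toy utterances, the noise word list, the RandomState seed, and the SimpleNamespace standing in for the real data augmentation config object are illustrative assumptions, not part of this diff:

import numpy as np
from types import SimpleNamespace

from snips_inference_agl.constants import DATA, TEXT
from snips_inference_agl.intent_classifier.log_reg_classifier_utils import (
    add_unknown_word_to_utterances, generate_noise_utterances,
    text_to_utterance)

random_state = np.random.RandomState(42)
utterances = [text_to_utterance("turn on the lights"),
              text_to_utterance("set temperature to twenty degrees")]

# Append up to 3 placeholder tokens to roughly half of the utterances.
augmented = add_unknown_word_to_utterances(
    utterances, replacement_string="unknownword",
    unknown_word_prob=0.5, max_unknown_words=3,
    random_state=random_state)

# Stand-in config: a None replacement string skips generate_smart_noise.
config = SimpleNamespace(noise_factor=2,
                         unknown_words_replacement_string=None)
noise_utterances = generate_noise_utterances(
    augmented, noise=["foo", "bar", "baz", "qux"], num_intents=2,
    data_augmentation_config=config, language="en",
    random_state=random_state)

for u in augmented + noise_utterances:
    print("".join(chunk[TEXT] for chunk in u[DATA]))

Since unknown_words_replacement_string is None here, generate_noise_utterances skips the generate_smart_noise step; supplying a string instead would first map out-of-vocabulary noise words to that placeholder.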