aboutsummaryrefslogtreecommitdiffstats
path: root/snips_inference_agl/intent_classifier/log_reg_classifier_utils.py
blob: 75a8ab1466eac67f750b0932c7372481e0808a88 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
from __future__ import division, unicode_literals

import itertools
import re
from builtins import next, range, str
from copy import deepcopy
from uuid import uuid4

from future.utils import iteritems, itervalues

from snips_inference_agl.constants import (DATA, ENTITY, INTENTS, TEXT,
                                 UNKNOWNWORD, UTTERANCES)
from snips_inference_agl.data_augmentation import augment_utterances
from snips_inference_agl.dataset import get_text_from_chunks
from snips_inference_agl.entity_parser.builtin_entity_parser import is_builtin_entity
from snips_inference_agl.preprocessing import tokenize_light
from snips_inference_agl.resources import get_noise

# Random identifier generated once, when the module is imported; none of the
# functions defined in this module reference it.
NOISE_NAME = str(uuid4())
# Matches one or more runs of word characters separated by whitespace.
WORD_REGEX = re.compile(r"\w+(\s+\w+)*")
# Matches UNKNOWNWORD repeated one or more times with whitespace in between.
# generate_noise_utterances uses it to collapse such runs into one UNKNOWNWORD.
# NOTE(review): UNKNOWNWORD is interpolated without re.escape -- confirm it
# never contains regex metacharacters.
UNKNOWNWORD_REGEX = re.compile(r"%s(\s+%s)*" % (UNKNOWNWORD, UNKNOWNWORD))


def get_noise_it(noise, mean_length, std_length, random_state):
    """Endlessly yield noise sentences built by cycling through `noise`.

    For every sentence a length is sampled with
    ``random_state.normal(mean_length, std_length)`` and truncated with
    ``int``; that many consecutive words are then pulled from a cycling
    iterator over `noise` and joined with single spaces. A sampled length
    of zero or less produces an empty string.

    Args:
        noise: iterable of words that is cycled over indefinitely.
        mean_length: mean passed to ``random_state.normal``.
        std_length: standard deviation passed to ``random_state.normal``.
        random_state: object exposing ``normal(mean, std)``.

    Yields:
        str: the next noise sentence.
    """
    word_cycle = itertools.cycle(noise)
    while True:
        sampled_length = random_state.normal(mean_length, std_length)
        num_words = int(sampled_length)
        # pylint: disable=stop-iteration-return
        words = (next(word_cycle) for _ in range(num_words))
        # pylint: enable=stop-iteration-return
        yield " ".join(words)


def generate_smart_noise(noise, augmented_utterances, replacement_string,
                         language):
    """Mask noise words that never occur in the augmented utterances.

    The vocabulary is the set of tokens obtained by running
    ``tokenize_light`` on the text of every utterance. Each noise word that
    is part of this vocabulary is kept; every other word is replaced with
    `replacement_string`.

    Args:
        noise: iterable of noise words.
        augmented_utterances: utterances whose ``DATA`` chunks provide the
            reference text.
        replacement_string: substitute for out-of-vocabulary noise words.
        language: language forwarded to ``tokenize_light``.

    Returns:
        list: one entry per noise word, in the original order.
    """
    known_tokens = set()
    for utterance in augmented_utterances:
        utterance_text = get_text_from_chunks(utterance[DATA])
        known_tokens.update(tokenize_light(utterance_text, language))
    return [word if word in known_tokens else replacement_string
            for word in noise]


def generate_noise_utterances(augmented_utterances, noise, num_intents,
                              data_augmentation_config, language,
                              random_state):
    """Build noise utterances sized after the augmented utterances.

    The number of generated utterances is
    ``int(noise_factor * average utterances per intent)``, capped at
    ``len(noise)``. Sentence lengths are drawn by ``get_noise_it`` from a
    normal distribution whose mean and standard deviation are those of the
    token counts of `augmented_utterances`. When the configuration defines
    an unknown-words replacement string, the noise is first passed through
    ``generate_smart_noise``.

    Args:
        augmented_utterances: utterances used as length/vocabulary reference.
        noise: list of noise words.
        num_intents: number of intents the utterances are spread over.
        data_augmentation_config: object providing ``noise_factor`` and
            ``unknown_words_replacement_string``.
        language: language forwarded to ``tokenize_light``.
        random_state: random generator handed to ``get_noise_it``.

    Returns:
        list: noise utterances built with ``text_to_utterance``; empty when
        there are no utterances or no intents.
    """
    import numpy as np

    if not (augmented_utterances and num_intents):
        return []

    utterances_per_intent = len(augmented_utterances) / float(num_intents)

    replacement = data_augmentation_config.unknown_words_replacement_string
    if replacement is not None:
        noise = generate_smart_noise(
            noise, augmented_utterances, replacement, language)

    requested_size = int(
        data_augmentation_config.noise_factor * utterances_per_intent)
    noise_size = min(requested_size, len(noise))

    token_counts = []
    for utterance in augmented_utterances:
        utterance_text = get_text_from_chunks(utterance[DATA])
        token_counts.append(len(tokenize_light(utterance_text, language)))

    noise_it = get_noise_it(
        noise, np.mean(token_counts), np.std(token_counts), random_state)

    noise_utterances = []
    for _ in range(noise_size):
        noise_text = next(noise_it)
        # Collapse runs such as 'unknownword unknownword' into a single one
        deduplicated = UNKNOWNWORD_REGEX.sub(UNKNOWNWORD, noise_text)
        noise_utterances.append(text_to_utterance(deduplicated))
    return noise_utterances


def add_unknown_word_to_utterances(utterances, replacement_string,
                                   unknown_word_prob, max_unknown_words,
                                   random_state):
    """Randomly append unknown-word chunks to a copy of the utterances.

    When `max_unknown_words` is falsy the input is returned as is (same
    object, no copy). Otherwise the utterances are deep-copied and, for
    each of them, a trailing chunk is added whenever
    ``random_state.rand() < unknown_word_prob``. That chunk holds between 1
    and `max_unknown_words` repetitions of `replacement_string`, separated
    by spaces and preceded by one space.

    Args:
        utterances: list of utterances, each holding a ``DATA`` chunk list.
        replacement_string: text standing for an unknown word.
        unknown_word_prob: threshold compared against ``random_state.rand()``.
        max_unknown_words: maximum number of repetitions per utterance.
        random_state: object exposing ``rand()`` and ``randint(low, high)``.

    Returns:
        list: `utterances` itself or its augmented deep copy.
    """
    if not max_unknown_words:
        return utterances

    augmented = deepcopy(utterances)
    for utterance in augmented:
        should_add = random_state.rand() < unknown_word_prob
        if not should_add:
            continue
        num_unknown = random_state.randint(1, max_unknown_words + 1)
        unknown_words = " ".join(
            replacement_string for _ in range(num_unknown))
        # The noise goes at the end of the sentence rather than in the
        # middle so that it does not impact ngrams computation too much
        utterance[DATA].append({TEXT: " " + unknown_words})
    return augmented


def text_to_utterance(text):
    """Wrap raw text into an utterance made of a single text chunk.

    Args:
        text: the text of the utterance.

    Returns:
        dict: ``{DATA: [{TEXT: text}]}``.
    """
    chunk = {TEXT: text}
    return {DATA: [chunk]}