about | summary | refs | log | tree | commit | diff | stats
path: root/snips_inference_agl/default_configs
diff options
context:
space:
mode:
Diffstat (limited to 'snips_inference_agl/default_configs')
-rw-r--r-- snips_inference_agl/default_configs/__init__.py 26
-rw-r--r-- snips_inference_agl/default_configs/config_de.py 159
-rw-r--r-- snips_inference_agl/default_configs/config_en.py 145
-rw-r--r-- snips_inference_agl/default_configs/config_es.py 138
-rw-r--r-- snips_inference_agl/default_configs/config_fr.py 137
-rw-r--r-- snips_inference_agl/default_configs/config_it.py 137
-rw-r--r-- snips_inference_agl/default_configs/config_ja.py 164
-rw-r--r-- snips_inference_agl/default_configs/config_ko.py 155
-rw-r--r-- snips_inference_agl/default_configs/config_pt_br.py 137
-rw-r--r-- snips_inference_agl/default_configs/config_pt_pt.py 137
10 files changed, 1335 insertions, 0 deletions
diff --git a/snips_inference_agl/default_configs/__init__.py b/snips_inference_agl/default_configs/__init__.py
new file mode 100644
index 0000000..fc66d33
--- /dev/null
+++ b/snips_inference_agl/default_configs/__init__.py
@@ -0,0 +1,26 @@
+from __future__ import unicode_literals
+
+from snips_inference_agl.constants import (
+ LANGUAGE_DE, LANGUAGE_EN, LANGUAGE_ES, LANGUAGE_FR, LANGUAGE_IT,
+ LANGUAGE_JA, LANGUAGE_KO, LANGUAGE_PT_BR, LANGUAGE_PT_PT)
+from .config_de import CONFIG as CONFIG_DE
+from .config_en import CONFIG as CONFIG_EN
+from .config_es import CONFIG as CONFIG_ES
+from .config_fr import CONFIG as CONFIG_FR
+from .config_it import CONFIG as CONFIG_IT
+from .config_ja import CONFIG as CONFIG_JA
+from .config_ko import CONFIG as CONFIG_KO
+from .config_pt_br import CONFIG as CONFIG_PT_BR
+from .config_pt_pt import CONFIG as CONFIG_PT_PT
+
+DEFAULT_CONFIGS = {
+    LANGUAGE_DE: CONFIG_DE,
+    LANGUAGE_EN: CONFIG_EN,
+    LANGUAGE_ES: CONFIG_ES,
+    LANGUAGE_FR: CONFIG_FR,
+    LANGUAGE_IT: CONFIG_IT,
+    LANGUAGE_JA: CONFIG_JA,
+    LANGUAGE_KO: CONFIG_KO,
+    LANGUAGE_PT_BR: CONFIG_PT_BR,
+    LANGUAGE_PT_PT: CONFIG_PT_PT,
+}
diff --git a/snips_inference_agl/default_configs/config_de.py b/snips_inference_agl/default_configs/config_de.py
new file mode 100644
index 0000000..200fc30
--- /dev/null
+++ b/snips_inference_agl/default_configs/config_de.py
@@ -0,0 +1,159 @@
+from __future__ import unicode_literals
+
+CONFIG = {
+    "unit_name": "nlu_engine",
+    "intent_parsers_configs": [
+        {
+            "unit_name": "lookup_intent_parser",
+            "ignore_stop_words": True
+        },
+        {
+            "unit_name": "probabilistic_intent_parser",
+            "slot_filler_config": {
+                "unit_name": "crf_slot_filler",
+                "feature_factory_configs": [
+                    {
+                        "args": {
+                            "common_words_gazetteer_name":
+                                "top_200000_words_stemmed",
+                            "use_stemming": True,
+                            "n": 1
+                        },
+                        "factory_name": "ngram",
+                        "offsets": [-2, -1, 0, 1, 2]
+                    },
+                    {
+                        "args": {
+                            "common_words_gazetteer_name":
+                                "top_200000_words_stemmed",
+                            "use_stemming": True,
+                            "n": 2
+                        },
+                        "factory_name": "ngram",
+                        "offsets": [-2, 1]
+                    },
+                    {
+                        "args": {
+                            "prefix_size": 2
+                        },
+                        "factory_name": "prefix",
+                        "offsets": [0]
+                    },
+                    {
+                        "args": {"prefix_size": 5},
+                        "factory_name": "prefix",
+                        "offsets": [0]
+                    },
+                    {
+                        "args": {"suffix_size": 2},
+                        "factory_name": "suffix",
+                        "offsets": [0]
+                    },
+                    {
+                        "args": {"suffix_size": 5},
+                        "factory_name": "suffix",
+                        "offsets": [0]
+                    },
+                    {
+                        "args": {},
+                        "factory_name": "is_digit",
+                        "offsets": [-1, 0, 1]
+                    },
+                    {
+                        "args": {},
+                        "factory_name": "is_first",
+                        "offsets": [-2, -1, 0]
+                    },
+                    {
+                        "args": {},
+                        "factory_name": "is_last",
+                        "offsets": [0, 1, 2]
+                    },
+                    {
+                        "args": {"n": 1},
+                        "factory_name": "shape_ngram",
+                        "offsets": [0]
+                    },
+                    {
+                        "args": {"n": 2},
+                        "factory_name": "shape_ngram",
+                        "offsets": [-1, 0]
+                    },
+                    {
+                        "args": {"n": 3},
+                        "factory_name": "shape_ngram",
+                        "offsets": [-1]
+                    },
+                    {
+                        "args": {
+                            "use_stemming": True,
+                            "tagging_scheme_code": 2,
+                            "entity_filter": {
+                                "automatically_extensible": False
+                            }
+                        },
+                        "factory_name": "entity_match",
+                        "offsets": [-2, -1, 0]
+                    },
+                    {
+                        "args": {
+                            "use_stemming": True,
+                            "tagging_scheme_code": 2,
+                            "entity_filter": {
+                                "automatically_extensible": True
+                            }
+                        },
+                        "factory_name": "entity_match",
+                        "offsets": [-2, -1, 0],
+                        "drop_out": 0.5
+                    },
+                    {
+                        "args": {"tagging_scheme_code": 1},
+                        "factory_name": "builtin_entity_match",
+                        "offsets": [-2, -1, 0]
+                    }
+                ],
+                "crf_args": {
+                    "c1": 0.1,
+                    "c2": 0.1,
+                    "algorithm": "lbfgs"
+                },
+                "tagging_scheme": 1,
+                "data_augmentation_config": {
+                    "min_utterances": 200,
+                    "capitalization_ratio": 0.2,
+                    "add_builtin_entities_examples": True
+                }
+            },
+            "intent_classifier_config": {
+                "unit_name": "log_reg_intent_classifier",
+                "data_augmentation_config": {
+                    "min_utterances": 20,
+                    "noise_factor": 5,
+                    "add_builtin_entities_examples": False,
+                    "max_unknown_words": None,
+                    "unknown_word_prob": 0.0,
+                    "unknown_words_replacement_string": None
+                },
+                "featurizer_config": {
+                    "unit_name": "featurizer",
+                    "pvalue_threshold": 0.4,
+                    "added_cooccurrence_feature_ratio": 0.0,
+                    "tfidf_vectorizer_config": {
+                        "unit_name": "tfidf_vectorizer",
+                        "use_stemming": True,
+                        "word_clusters_name": None
+                    },
+                    "cooccurrence_vectorizer_config": {
+                        "unit_name": "cooccurrence_vectorizer",
+                        "window_size": None,
+                        "filter_stop_words": True,
+                        "unknown_words_replacement_string": None,
+                        "keep_order": True
+                    }
+                },
+                "noise_reweight_factor": 1,
+            }
+        }
+    ]
+}
diff --git a/snips_inference_agl/default_configs/config_en.py b/snips_inference_agl/default_configs/config_en.py
new file mode 100644
index 0000000..12f7ae1
--- /dev/null
+++ b/snips_inference_agl/default_configs/config_en.py
@@ -0,0 +1,145 @@
+from __future__ import unicode_literals
+
+CONFIG = {
+    "unit_name": "nlu_engine",
+    "intent_parsers_configs": [
+        {
+            "unit_name": "lookup_intent_parser",
+            "ignore_stop_words": True
+        },
+        {
+            "unit_name": "probabilistic_intent_parser",
+            "slot_filler_config": {
+                "unit_name": "crf_slot_filler",
+                "feature_factory_configs": [
+                    {
+                        "args": {
+                            "common_words_gazetteer_name":
+                                "top_10000_words_stemmed",
+                            "use_stemming": True,
+                            "n": 1
+                        },
+                        "factory_name": "ngram",
+                        "offsets": [-2, -1, 0, 1, 2]
+                    },
+                    {
+                        "args": {
+                            "common_words_gazetteer_name":
+                                "top_10000_words_stemmed",
+                            "use_stemming": True,
+                            "n": 2
+                        },
+                        "factory_name": "ngram",
+                        "offsets": [-2, 1]
+                    },
+                    {
+                        "args": {},
+                        "factory_name": "is_digit",
+                        "offsets": [-1, 0, 1]
+                    },
+                    {
+                        "args": {},
+                        "factory_name": "is_first",
+                        "offsets": [-2, -1, 0]
+                    },
+                    {
+                        "args": {},
+                        "factory_name": "is_last",
+                        "offsets": [0, 1, 2]
+                    },
+                    {
+                        "args": {"n": 1},
+                        "factory_name": "shape_ngram",
+                        "offsets": [0]
+                    },
+                    {
+                        "args": {"n": 2},
+                        "factory_name": "shape_ngram",
+                        "offsets": [-1, 0]
+                    },
+                    {
+                        "args": {"n": 3},
+                        "factory_name": "shape_ngram",
+                        "offsets": [-1]
+                    },
+                    {
+                        "args": {
+                            "use_stemming": True,
+                            "tagging_scheme_code": 2,
+                            "entity_filter": {
+                                "automatically_extensible": False
+                            }
+                        },
+                        "factory_name": "entity_match",
+                        "offsets": [-2, -1, 0]
+                    },
+                    {
+                        "args": {
+                            "use_stemming": True,
+                            "tagging_scheme_code": 2,
+                            "entity_filter": {
+                                "automatically_extensible": True
+                            }
+                        },
+                        "factory_name": "entity_match",
+                        "offsets": [-2, -1, 0],
+                        "drop_out": 0.5
+                    },
+                    {
+                        "args": {"tagging_scheme_code": 1},
+                        "factory_name": "builtin_entity_match",
+                        "offsets": [-2, -1, 0]
+                    },
+                    {
+                        "args": {
+                            "cluster_name": "brown_clusters",
+                            "use_stemming": False
+                        },
+                        "factory_name": "word_cluster",
+                        "offsets": [-2, -1, 0, 1]
+                    }
+                ],
+                "crf_args": {
+                    "c1": 0.1,
+                    "c2": 0.1,
+                    "algorithm": "lbfgs"
+                },
+                "tagging_scheme": 1,
+                "data_augmentation_config": {
+                    "min_utterances": 200,
+                    "capitalization_ratio": 0.2,
+                    "add_builtin_entities_examples": True
+                }
+            },
+            "intent_classifier_config": {
+                "unit_name": "log_reg_intent_classifier",
+                "data_augmentation_config": {
+                    "min_utterances": 20,
+                    "noise_factor": 5,
+                    "add_builtin_entities_examples": False,
+                    "max_unknown_words": None,
+                    "unknown_word_prob": 0.0,
+                    "unknown_words_replacement_string": None
+                },
+                "featurizer_config": {
+                    "unit_name": "featurizer",
+                    "pvalue_threshold": 0.4,
+                    "added_cooccurrence_feature_ratio": 0.0,
+                    "tfidf_vectorizer_config": {
+                        "unit_name": "tfidf_vectorizer",
+                        "use_stemming": False,
+                        "word_clusters_name": None
+                    },
+                    "cooccurrence_vectorizer_config": {
+                        "unit_name": "cooccurrence_vectorizer",
+                        "window_size": None,
+                        "filter_stop_words": True,
+                        "unknown_words_replacement_string": None,
+                        "keep_order": True
+                    }
+                },
+                "noise_reweight_factor": 1,
+            }
+        }
+    ]
+}
diff --git a/snips_inference_agl/default_configs/config_es.py b/snips_inference_agl/default_configs/config_es.py
new file mode 100644
index 0000000..28969ce
--- /dev/null
+++ b/snips_inference_agl/default_configs/config_es.py
@@ -0,0 +1,138 @@
+from __future__ import unicode_literals
+
+CONFIG = {
+    "unit_name": "nlu_engine",
+    "intent_parsers_configs": [
+        {
+            "unit_name": "lookup_intent_parser",
+            "ignore_stop_words": True
+        },
+        {
+            "unit_name": "probabilistic_intent_parser",
+            "slot_filler_config": {
+                "unit_name": "crf_slot_filler",
+                "feature_factory_configs": [
+                    {
+                        "args": {
+                            "common_words_gazetteer_name":
+                                "top_10000_words_stemmed",
+                            "use_stemming": True,
+                            "n": 1
+                        },
+                        "factory_name": "ngram",
+                        "offsets": [-2, -1, 0, 1, 2]
+                    },
+                    {
+                        "args": {
+                            "common_words_gazetteer_name":
+                                "top_10000_words_stemmed",
+                            "use_stemming": True,
+                            "n": 2
+                        },
+                        "factory_name": "ngram",
+                        "offsets": [-2, 1]
+                    },
+                    {
+                        "args": {},
+                        "factory_name": "is_digit",
+                        "offsets": [-1, 0, 1]
+                    },
+                    {
+                        "args": {},
+                        "factory_name": "is_first",
+                        "offsets": [-2, -1, 0]
+                    },
+                    {
+                        "args": {},
+                        "factory_name": "is_last",
+                        "offsets": [0, 1, 2]
+                    },
+                    {
+                        "args": {"n": 1},
+                        "factory_name": "shape_ngram",
+                        "offsets": [0]
+                    },
+                    {
+                        "args": {"n": 2},
+                        "factory_name": "shape_ngram",
+                        "offsets": [-1, 0]
+                    },
+                    {
+                        "args": {"n": 3},
+                        "factory_name": "shape_ngram",
+                        "offsets": [-1]
+                    },
+                    {
+                        "args": {
+                            "use_stemming": True,
+                            "tagging_scheme_code": 2,
+                            "entity_filter": {
+                                "automatically_extensible": False
+                            }
+                        },
+                        "factory_name": "entity_match",
+                        "offsets": [-2, -1, 0]
+                    },
+                    {
+                        "args": {
+                            "use_stemming": True,
+                            "tagging_scheme_code": 2,
+                            "entity_filter": {
+                                "automatically_extensible": True
+                            }
+                        },
+                        "factory_name": "entity_match",
+                        "offsets": [-2, -1, 0],
+                        "drop_out": 0.5
+                    },
+                    {
+                        "args": {"tagging_scheme_code": 1},
+                        "factory_name": "builtin_entity_match",
+                        "offsets": [-2, -1, 0]
+                    }
+                ],
+                "crf_args": {
+                    "c1": 0.1,
+                    "c2": 0.1,
+                    "algorithm": "lbfgs"
+                },
+                "tagging_scheme": 1,
+                "data_augmentation_config": {
+                    "min_utterances": 200,
+                    "capitalization_ratio": 0.2,
+                    "add_builtin_entities_examples": True
+                },
+
+            },
+            "intent_classifier_config": {
+                "unit_name": "log_reg_intent_classifier",
+                "data_augmentation_config": {
+                    "min_utterances": 20,
+                    "noise_factor": 5,
+                    "add_builtin_entities_examples": False,
+                    "max_unknown_words": None,
+                    "unknown_word_prob": 0.0,
+                    "unknown_words_replacement_string": None
+                },
+                "featurizer_config": {
+                    "unit_name": "featurizer",
+                    "pvalue_threshold": 0.4,
+                    "added_cooccurrence_feature_ratio": 0.0,
+                    "tfidf_vectorizer_config": {
+                        "unit_name": "tfidf_vectorizer",
+                        "use_stemming": True,
+                        "word_clusters_name": None
+                    },
+                    "cooccurrence_vectorizer_config": {
+                        "unit_name": "cooccurrence_vectorizer",
+                        "window_size": None,
+                        "filter_stop_words": True,
+                        "unknown_words_replacement_string": None,
+                        "keep_order": True
+                    }
+                },
+                "noise_reweight_factor": 1,
+            }
+        }
+    ]
+}
diff --git a/snips_inference_agl/default_configs/config_fr.py b/snips_inference_agl/default_configs/config_fr.py
new file mode 100644
index 0000000..a2da590
--- /dev/null
+++ b/snips_inference_agl/default_configs/config_fr.py
@@ -0,0 +1,137 @@
+from __future__ import unicode_literals
+
+CONFIG = {
+    "unit_name": "nlu_engine",
+    "intent_parsers_configs": [
+        {
+            "unit_name": "lookup_intent_parser",
+            "ignore_stop_words": True
+        },
+        {
+            "unit_name": "probabilistic_intent_parser",
+            "slot_filler_config": {
+                "unit_name": "crf_slot_filler",
+                "feature_factory_configs": [
+                    {
+                        "args": {
+                            "common_words_gazetteer_name":
+                                "top_10000_words_stemmed",
+                            "use_stemming": True,
+                            "n": 1
+                        },
+                        "factory_name": "ngram",
+                        "offsets": [-2, -1, 0, 1, 2]
+                    },
+                    {
+                        "args": {
+                            "common_words_gazetteer_name":
+                                "top_10000_words_stemmed",
+                            "use_stemming": True,
+                            "n": 2
+                        },
+                        "factory_name": "ngram",
+                        "offsets": [-2, 1]
+                    },
+                    {
+                        "args": {},
+                        "factory_name": "is_digit",
+                        "offsets": [-1, 0, 1]
+                    },
+                    {
+                        "args": {},
+                        "factory_name": "is_first",
+                        "offsets": [-2, -1, 0]
+                    },
+                    {
+                        "args": {},
+                        "factory_name": "is_last",
+                        "offsets": [0, 1, 2]
+                    },
+                    {
+                        "args": {"n": 1},
+                        "factory_name": "shape_ngram",
+                        "offsets": [0]
+                    },
+                    {
+                        "args": {"n": 2},
+                        "factory_name": "shape_ngram",
+                        "offsets": [-1, 0]
+                    },
+                    {
+                        "args": {"n": 3},
+                        "factory_name": "shape_ngram",
+                        "offsets": [-1]
+                    },
+                    {
+                        "args": {
+                            "use_stemming": True,
+                            "tagging_scheme_code": 2,
+                            "entity_filter": {
+                                "automatically_extensible": False
+                            }
+                        },
+                        "factory_name": "entity_match",
+                        "offsets": [-2, -1, 0]
+                    },
+                    {
+                        "args": {
+                            "use_stemming": True,
+                            "tagging_scheme_code": 2,
+                            "entity_filter": {
+                                "automatically_extensible": True
+                            }
+                        },
+                        "factory_name": "entity_match",
+                        "offsets": [-2, -1, 0],
+                        "drop_out": 0.5
+                    },
+                    {
+                        "args": {"tagging_scheme_code": 1},
+                        "factory_name": "builtin_entity_match",
+                        "offsets": [-2, -1, 0]
+                    }
+                ],
+                "crf_args": {
+                    "c1": 0.1,
+                    "c2": 0.1,
+                    "algorithm": "lbfgs"
+                },
+                "tagging_scheme": 1,
+                "data_augmentation_config": {
+                    "min_utterances": 200,
+                    "capitalization_ratio": 0.2,
+                    "add_builtin_entities_examples": True
+                }
+            },
+            "intent_classifier_config": {
+                "unit_name": "log_reg_intent_classifier",
+                "data_augmentation_config": {
+                    "min_utterances": 20,
+                    "noise_factor": 5,
+                    "add_builtin_entities_examples": False,
+                    "max_unknown_words": None,
+                    "unknown_word_prob": 0.0,
+                    "unknown_words_replacement_string": None
+                },
+                "featurizer_config": {
+                    "unit_name": "featurizer",
+                    "pvalue_threshold": 0.4,
+                    "added_cooccurrence_feature_ratio": 0.0,
+                    "tfidf_vectorizer_config": {
+                        "unit_name": "tfidf_vectorizer",
+                        "use_stemming": True,
+                        "word_clusters_name": None
+                    },
+                    "cooccurrence_vectorizer_config": {
+                        "unit_name": "cooccurrence_vectorizer",
+                        "window_size": None,
+                        "filter_stop_words": True,
+                        "unknown_words_replacement_string": None,
+                        "keep_order": True
+                    }
+                },
+                "noise_reweight_factor": 1,
+            }
+        }
+    ]
+}
diff --git a/snips_inference_agl/default_configs/config_it.py b/snips_inference_agl/default_configs/config_it.py
new file mode 100644
index 0000000..a2da590
--- /dev/null
+++ b/snips_inference_agl/default_configs/config_it.py
@@ -0,0 +1,137 @@
+from __future__ import unicode_literals
+
+CONFIG = {
+    "unit_name": "nlu_engine",
+    "intent_parsers_configs": [
+        {
+            "unit_name": "lookup_intent_parser",
+            "ignore_stop_words": True
+        },
+        {
+            "unit_name": "probabilistic_intent_parser",
+            "slot_filler_config": {
+                "unit_name": "crf_slot_filler",
+                "feature_factory_configs": [
+                    {
+                        "args": {
+                            "common_words_gazetteer_name":
+                                "top_10000_words_stemmed",
+                            "use_stemming": True,
+                            "n": 1
+                        },
+                        "factory_name": "ngram",
+                        "offsets": [-2, -1, 0, 1, 2]
+                    },
+                    {
+                        "args": {
+                            "common_words_gazetteer_name":
+                                "top_10000_words_stemmed",
+                            "use_stemming": True,
+                            "n": 2
+                        },
+                        "factory_name": "ngram",
+                        "offsets": [-2, 1]
+                    },
+                    {
+                        "args": {},
+                        "factory_name": "is_digit",
+                        "offsets": [-1, 0, 1]
+                    },
+                    {
+                        "args": {},
+                        "factory_name": "is_first",
+                        "offsets": [-2, -1, 0]
+                    },
+                    {
+                        "args": {},
+                        "factory_name": "is_last",
+                        "offsets": [0, 1, 2]
+                    },
+                    {
+                        "args": {"n": 1},
+                        "factory_name": "shape_ngram",
+                        "offsets": [0]
+                    },
+                    {
+                        "args": {"n": 2},
+                        "factory_name": "shape_ngram",
+                        "offsets": [-1, 0]
+                    },
+                    {
+                        "args": {"n": 3},
+                        "factory_name": "shape_ngram",
+                        "offsets": [-1]
+                    },
+                    {
+                        "args": {
+                            "use_stemming": True,
+                            "tagging_scheme_code": 2,
+                            "entity_filter": {
+                                "automatically_extensible": False
+                            }
+                        },
+                        "factory_name": "entity_match",
+                        "offsets": [-2, -1, 0]
+                    },
+                    {
+                        "args": {
+                            "use_stemming": True,
+                            "tagging_scheme_code": 2,
+                            "entity_filter": {
+                                "automatically_extensible": True
+                            }
+                        },
+                        "factory_name": "entity_match",
+                        "offsets": [-2, -1, 0],
+                        "drop_out": 0.5
+                    },
+                    {
+                        "args": {"tagging_scheme_code": 1},
+                        "factory_name": "builtin_entity_match",
+                        "offsets": [-2, -1, 0]
+                    }
+                ],
+                "crf_args": {
+                    "c1": 0.1,
+                    "c2": 0.1,
+                    "algorithm": "lbfgs"
+                },
+                "tagging_scheme": 1,
+                "data_augmentation_config": {
+                    "min_utterances": 200,
+                    "capitalization_ratio": 0.2,
+                    "add_builtin_entities_examples": True
+                }
+            },
+            "intent_classifier_config": {
+                "unit_name": "log_reg_intent_classifier",
+                "data_augmentation_config": {
+                    "min_utterances": 20,
+                    "noise_factor": 5,
+                    "add_builtin_entities_examples": False,
+                    "max_unknown_words": None,
+                    "unknown_word_prob": 0.0,
+                    "unknown_words_replacement_string": None
+                },
+                "featurizer_config": {
+                    "unit_name": "featurizer",
+                    "pvalue_threshold": 0.4,
+                    "added_cooccurrence_feature_ratio": 0.0,
+                    "tfidf_vectorizer_config": {
+                        "unit_name": "tfidf_vectorizer",
+                        "use_stemming": True,
+                        "word_clusters_name": None
+                    },
+                    "cooccurrence_vectorizer_config": {
+                        "unit_name": "cooccurrence_vectorizer",
+                        "window_size": None,
+                        "filter_stop_words": True,
+                        "unknown_words_replacement_string": None,
+                        "keep_order": True
+                    }
+                },
+                "noise_reweight_factor": 1,
+            }
+        }
+    ]
+}
diff --git a/snips_inference_agl/default_configs/config_ja.py b/snips_inference_agl/default_configs/config_ja.py
new file mode 100644
index 0000000..b28791f
--- /dev/null
+++ b/snips_inference_agl/default_configs/config_ja.py
@@ -0,0 +1,164 @@
+from __future__ import unicode_literals
+
+CONFIG = {
+    "unit_name": "nlu_engine",
+    "intent_parsers_configs": [
+        {
+            "unit_name": "lookup_intent_parser",
+            "ignore_stop_words": False
+        },
+        {
+            "unit_name": "probabilistic_intent_parser",
+            "slot_filler_config": {
+                "unit_name": "crf_slot_filler",
+                "feature_factory_configs": [
+                    {
+                        "args": {
+                            "common_words_gazetteer_name": None,
+                            "use_stemming": False,
+                            "n": 1
+                        },
+                        "factory_name": "ngram",
+                        "offsets": [-2, -1, 0, 1, 2]
+                    },
+                    {
+                        "args": {
+                            "common_words_gazetteer_name": None,
+                            "use_stemming": False,
+                            "n": 2
+                        },
+                        "factory_name": "ngram",
+                        "offsets": [-2, 0, 1, 2]
+                    },
+                    {
+                        "args": {"prefix_size": 1},
+                        "factory_name": "prefix",
+                        "offsets": [0, 1]
+                    },
+                    {
+                        "args": {"prefix_size": 2},
+                        "factory_name": "prefix",
+                        "offsets": [0, 1]
+                    },
+                    {
+                        "args": {"suffix_size": 1},
+                        "factory_name": "suffix",
+                        "offsets": [0, 1]
+                    },
+                    {
+                        "args": {"suffix_size": 2},
+                        "factory_name": "suffix",
+                        "offsets": [0, 1]
+                    },
+                    {
+                        "args": {},
+                        "factory_name": "is_digit",
+                        "offsets": [-1, 0, 1]
+                    },
+                    {
+                        "args": {},
+                        "factory_name": "is_first",
+                        "offsets": [-2, -1, 0]
+                    },
+                    {
+                        "args": {},
+                        "factory_name": "is_last",
+                        "offsets": [0, 1, 2]
+                    },
+                    {
+                        "args": {"n": 1},
+                        "factory_name": "shape_ngram",
+                        "offsets": [0]
+                    },
+                    {
+                        "args": {"n": 2},
+                        "factory_name": "shape_ngram",
+                        "offsets": [-1, 0]
+                    },
+                    {
+                        "args": {"n": 3},
+                        "factory_name": "shape_ngram",
+                        "offsets": [-1]
+                    },
+                    {
+                        "args": {
+                            "use_stemming": False,
+                            "tagging_scheme_code": 2,
+                            "entity_filter": {
+                                "automatically_extensible": False
+                            }
+                        },
+                        "factory_name": "entity_match",
+                        "offsets": [-1, 0, 1, 2],
+                    },
+                    {
+                        "args": {
+                            "use_stemming": False,
+                            "tagging_scheme_code": 2,
+                            "entity_filter": {
+                                "automatically_extensible": False
+                            }
+                        },
+                        "factory_name": "entity_match",
+                        "offsets": [-1, 0, 1, 2],
+                        "drop_out": 0.5
+                    },
+                    {
+                        "args": {"tagging_scheme_code": 1},
+                        "factory_name": "builtin_entity_match",
+                        "offsets": [-1, 0, 1, 2]
+                    },
+                    {
+                        "args": {
+                            "cluster_name": "w2v_clusters",
+                            "use_stemming": False
+                        },
+                        "factory_name": "word_cluster",
+                        "offsets": [-2, -1, 0, 1, 2]
+                    }
+                ],
+                "crf_args": {
+                    "c1": 0.1,
+                    "c2": 0.1,
+                    "algorithm": "lbfgs"
+                },
+                "tagging_scheme": 1,
+                "data_augmentation_config": {
+                    "min_utterances": 200,
+                    "capitalization_ratio": 0.2,
+                    "add_builtin_entities_examples": True
+                },
+
+            },
+            "intent_classifier_config": {
+                "unit_name": "log_reg_intent_classifier",
+                "data_augmentation_config": {
+                    "min_utterances": 20,
+                    "noise_factor": 5,
+                    "add_builtin_entities_examples": False,
+                    "max_unknown_words": None,
+                    "unknown_word_prob": 0.0,
+                    "unknown_words_replacement_string": None
+                },
+                "featurizer_config": {
+                    "unit_name": "featurizer",
+                    "pvalue_threshold": 0.9,
+                    "added_cooccurrence_feature_ratio": 0.0,
+                    "tfidf_vectorizer_config": {
+                        "unit_name": "tfidf_vectorizer",
+                        "use_stemming": False,
+                        "word_clusters_name": "w2v_clusters"
+                    },
+                    "cooccurrence_vectorizer_config": {
+                        "unit_name": "cooccurrence_vectorizer",
+                        "window_size": None,
+                        "filter_stop_words": True,
+                        "unknown_words_replacement_string": None,
+                        "keep_order": True
+                    }
+                },
+                "noise_reweight_factor": 1,
+            }
+        }
+    ]
+}
diff --git a/snips_inference_agl/default_configs/config_ko.py b/snips_inference_agl/default_configs/config_ko.py
new file mode 100644
index 0000000..1630796
--- /dev/null
+++ b/snips_inference_agl/default_configs/config_ko.py
@@ -0,0 +1,155 @@
+from __future__ import unicode_literals
+
+CONFIG = {
+    "unit_name": "nlu_engine",
+    "intent_parsers_configs": [
+        {
+            "unit_name": "lookup_intent_parser",
+            "ignore_stop_words": False
+        },
+        {
+            "unit_name": "probabilistic_intent_parser",
+            "slot_filler_config": {
+                "unit_name": "crf_slot_filler",
+                "feature_factory_configs": [
+                    {
+                        "args": {
+                            "common_words_gazetteer_name": None,
+                            "use_stemming": False,
+                            "n": 1
+                        },
+                        "factory_name": "ngram",
+                        "offsets": [-2, -1, 0, 1, 2]
+                    },
+                    {
+                        "args": {
+                            "common_words_gazetteer_name": None,
+                            "use_stemming": False,
+                            "n": 2
+                        },
+                        "factory_name": "ngram",
+                        "offsets": [-2, 1]
+                    },
+                    {
+                        "args": {"prefix_size": 1},
+                        "factory_name": "prefix",
+                        "offsets": [0]
+                    },
+                    {
+                        "args": {"prefix_size": 2},
+                        "factory_name": "prefix",
+                        "offsets": [0]
+                    },
+                    {
+                        "args": {"suffix_size": 1},
+                        "factory_name": "suffix",
+                        "offsets": [0]
+                    },
+                    {
+                        "args": {"suffix_size": 2},
+                        "factory_name": "suffix",
+                        "offsets": [0]
+                    },
+                    {
+                        "args": {},
+                        "factory_name": "is_digit",
+                        "offsets": [-1, 0, 1]
+                    },
+                    {
+                        "args": {},
+                        "factory_name": "is_first",
+                        "offsets": [-2, -1, 0]
+                    },
+                    {
+                        "args": {},
+                        "factory_name": "is_last",
+                        "offsets": [0, 1, 2]
+                    },
+                    {
+                        "args": {"n": 1},
+                        "factory_name": "shape_ngram",
+                        "offsets": [0]
+                    },
+                    {
+                        "args": {"n": 2},
+                        "factory_name": "shape_ngram",
+                        "offsets": [-1, 0]
+                    },
+                    {
+                        "args": {"n": 3},
+                        "factory_name": "shape_ngram",
+                        "offsets": [-1]
+                    },
+                    {
+                        "args": {
+                            "use_stemming": False,
+                            "tagging_scheme_code": 2,
+                            "entity_filter": {
+                                "automatically_extensible": False
+                            }
+                        },
+                        "factory_name": "entity_match",
+                        "offsets": [-2, -1, 0]
+                    },
+                    {
+                        "args": {
+                            "use_stemming": False,
+                            "tagging_scheme_code": 2,
+                            "entity_filter": {
+                                "automatically_extensible": True
+                            }
+                        },
+                        "factory_name": "entity_match",
+                        "offsets": [-2, -1, 0],
+                        "drop_out": 0.5
+                    },
+                    {
+                        "args": {"tagging_scheme_code": 1},
+                        "factory_name": "builtin_entity_match",
+                        "offsets": [-2, -1, 0]
+                    }
+                ],
+                "crf_args": {
+                    "c1": 0.1,
+                    "c2": 0.1,
+                    "algorithm": "lbfgs"
+                },
+                "tagging_scheme": 1,
+                "data_augmentation_config": {
+                    "min_utterances": 200,
+                    "capitalization_ratio": 0.2,
+                    "add_builtin_entities_examples": True
+                }
+            },
+            "intent_classifier_config": {
+                "unit_name": "log_reg_intent_classifier",
+                "data_augmentation_config": {
+                    "min_utterances": 20,
+                    "noise_factor": 5,
+                    "add_builtin_entities_examples": False,
+                    "max_unknown_words": None,
+                    "unknown_word_prob": 0.0,
+                    "unknown_words_replacement_string": None
+                },
+                "featurizer_config": {
+                    "unit_name": "featurizer",
+                    "pvalue_threshold": 0.4,
+                    "added_cooccurrence_feature_ratio": 0.0,
+                    "tfidf_vectorizer_config": {
+                        "unit_name": "tfidf_vectorizer",
+                        "use_stemming": False,
+                        "word_clusters_name": None
+                    },
+                    "cooccurrence_vectorizer_config": {
+                        "unit_name": "cooccurrence_vectorizer",
+                        "window_size": None,
+                        "filter_stop_words": True,
+                        "unknown_words_replacement_string": None,
+                        "keep_order": True
+                    }
+                },
+                "noise_reweight_factor": 1,
+            }
+        }
+    ]
+}
diff --git a/snips_inference_agl/default_configs/config_pt_br.py b/snips_inference_agl/default_configs/config_pt_br.py
new file mode 100644
index 0000000..450f0db
--- /dev/null
+++ b/snips_inference_agl/default_configs/config_pt_br.py
@@ -0,0 +1,137 @@
+from __future__ import unicode_literals
+
+CONFIG = {
+    "unit_name": "nlu_engine",
+    "intent_parsers_configs": [
+        {
+            "unit_name": "lookup_intent_parser",
+            "ignore_stop_words": True
+        },
+        {
+            "unit_name": "probabilistic_intent_parser",
+            "slot_filler_config": {
+                "unit_name": "crf_slot_filler",
+                "feature_factory_configs": [
+                    {
+                        "args": {
+                            "common_words_gazetteer_name":
+                                "top_5000_words_stemmed",
+                            "use_stemming": True,
+                            "n": 1
+                        },
+                        "factory_name": "ngram",
+                        "offsets": [-2, -1, 0, 1, 2]
+                    },
+                    {
+                        "args": {
+                            "common_words_gazetteer_name":
+                                "top_5000_words_stemmed",
+                            "use_stemming": True,
+                            "n": 2
+                        },
+                        "factory_name": "ngram",
+                        "offsets": [-2, 1]
+                    },
+                    {
+                        "args": {},
+                        "factory_name": "is_digit",
+                        "offsets": [-1, 0, 1]
+                    },
+                    {
+                        "args": {},
+                        "factory_name": "is_first",
+                        "offsets": [-2, -1, 0]
+                    },
+                    {
+                        "args": {},
+                        "factory_name": "is_last",
+                        "offsets": [0, 1, 2]
+                    },
+                    {
+                        "args": {"n": 1},
+                        "factory_name": "shape_ngram",
+                        "offsets": [0]
+                    },
+                    {
+                        "args": {"n": 2},
+                        "factory_name": "shape_ngram",
+                        "offsets": [-1, 0]
+                    },
+                    {
+                        "args": {"n": 3},
+                        "factory_name": "shape_ngram",
+                        "offsets": [-1]
+                    },
+                    {
+                        "args": {
+                            "use_stemming": True,
+                            "tagging_scheme_code": 2,
+                            "entity_filter": {
+                                "automatically_extensible": False
+                            }
+                        },
+                        "factory_name": "entity_match",
+                        "offsets": [-2, -1, 0]
+                    },
+                    {
+                        "args": {
+                            "use_stemming": True,
+                            "tagging_scheme_code": 2,
+                            "entity_filter": {
+                                "automatically_extensible": True
+                            }
+                        },
+                        "factory_name": "entity_match",
+                        "offsets": [-2, -1, 0],
+                        "drop_out": 0.5
+                    },
+                    {
+                        "args": {"tagging_scheme_code": 1},
+                        "factory_name": "builtin_entity_match",
+                        "offsets": [-2, -1, 0]
+                    }
+                ],
+                "crf_args": {
+                    "c1": 0.1,
+                    "c2": 0.1,
+                    "algorithm": "lbfgs"
+                },
+                "tagging_scheme": 1,
+                "data_augmentation_config": {
+                    "min_utterances": 200,
+                    "capitalization_ratio": 0.2,
+                    "add_builtin_entities_examples": True
+                },
+            },
+            "intent_classifier_config": {
+                "unit_name": "log_reg_intent_classifier",
+                "data_augmentation_config": {
+                    "min_utterances": 20,
+                    "noise_factor": 5,
+                    "add_builtin_entities_examples": False,
+                    "max_unknown_words": None,
+                    "unknown_word_prob": 0.0,
+                    "unknown_words_replacement_string": None
+                },
+                "featurizer_config": {
+                    "unit_name": "featurizer",
+                    "pvalue_threshold": 0.4,
+                    "added_cooccurrence_feature_ratio": 0.0,
+                    "tfidf_vectorizer_config": {
+                        "unit_name": "tfidf_vectorizer",
+                        "use_stemming": True,
+                        "word_clusters_name": None
+                    },
+                    "cooccurrence_vectorizer_config": {
+                        "unit_name": "cooccurrence_vectorizer",
+                        "window_size": None,
+                        "filter_stop_words": True,
+                        "unknown_words_replacement_string": None,
+                        "keep_order": True
+                    }
+                },
+                "noise_reweight_factor": 1,  # nested in the classifier config, matching the other language configs
+            }
+        }
+    ]
+}
diff --git a/snips_inference_agl/default_configs/config_pt_pt.py b/snips_inference_agl/default_configs/config_pt_pt.py
new file mode 100644
index 0000000..450f0db
--- /dev/null
+++ b/snips_inference_agl/default_configs/config_pt_pt.py
@@ -0,0 +1,137 @@
+from __future__ import unicode_literals
+
+CONFIG = {
+    "unit_name": "nlu_engine",
+    "intent_parsers_configs": [
+        {
+            "unit_name": "lookup_intent_parser",
+            "ignore_stop_words": True
+        },
+        {
+            "unit_name": "probabilistic_intent_parser",
+            "slot_filler_config": {
+                "unit_name": "crf_slot_filler",
+                "feature_factory_configs": [
+                    {
+                        "args": {
+                            "common_words_gazetteer_name":
+                                "top_5000_words_stemmed",
+                            "use_stemming": True,
+                            "n": 1
+                        },
+                        "factory_name": "ngram",
+                        "offsets": [-2, -1, 0, 1, 2]
+                    },
+                    {
+                        "args": {
+                            "common_words_gazetteer_name":
+                                "top_5000_words_stemmed",
+                            "use_stemming": True,
+                            "n": 2
+                        },
+                        "factory_name": "ngram",
+                        "offsets": [-2, 1]
+                    },
+                    {
+                        "args": {},
+                        "factory_name": "is_digit",
+                        "offsets": [-1, 0, 1]
+                    },
+                    {
+                        "args": {},
+                        "factory_name": "is_first",
+                        "offsets": [-2, -1, 0]
+                    },
+                    {
+                        "args": {},
+                        "factory_name": "is_last",
+                        "offsets": [0, 1, 2]
+                    },
+                    {
+                        "args": {"n": 1},
+                        "factory_name": "shape_ngram",
+                        "offsets": [0]
+                    },
+                    {
+                        "args": {"n": 2},
+                        "factory_name": "shape_ngram",
+                        "offsets": [-1, 0]
+                    },
+                    {
+                        "args": {"n": 3},
+                        "factory_name": "shape_ngram",
+                        "offsets": [-1]
+                    },
+                    {
+                        "args": {
+                            "use_stemming": True,
+                            "tagging_scheme_code": 2,
+                            "entity_filter": {
+                                "automatically_extensible": False
+                            }
+                        },
+                        "factory_name": "entity_match",
+                        "offsets": [-2, -1, 0]
+                    },
+                    {
+                        "args": {
+                            "use_stemming": True,
+                            "tagging_scheme_code": 2,
+                            "entity_filter": {
+                                "automatically_extensible": True
+                            }
+                        },
+                        "factory_name": "entity_match",
+                        "offsets": [-2, -1, 0],
+                        "drop_out": 0.5
+                    },
+                    {
+                        "args": {"tagging_scheme_code": 1},
+                        "factory_name": "builtin_entity_match",
+                        "offsets": [-2, -1, 0]
+                    }
+                ],
+                "crf_args": {
+                    "c1": 0.1,
+                    "c2": 0.1,
+                    "algorithm": "lbfgs"
+                },
+                "tagging_scheme": 1,
+                "data_augmentation_config": {
+                    "min_utterances": 200,
+                    "capitalization_ratio": 0.2,
+                    "add_builtin_entities_examples": True
+                },
+            },
+            "intent_classifier_config": {
+                "unit_name": "log_reg_intent_classifier",
+                "data_augmentation_config": {
+                    "min_utterances": 20,
+                    "noise_factor": 5,
+                    "add_builtin_entities_examples": False,
+                    "max_unknown_words": None,
+                    "unknown_word_prob": 0.0,
+                    "unknown_words_replacement_string": None
+                },
+                "featurizer_config": {
+                    "unit_name": "featurizer",
+                    "pvalue_threshold": 0.4,
+                    "added_cooccurrence_feature_ratio": 0.0,
+                    "tfidf_vectorizer_config": {
+                        "unit_name": "tfidf_vectorizer",
+                        "use_stemming": True,
+                        "word_clusters_name": None
+                    },
+                    "cooccurrence_vectorizer_config": {
+                        "unit_name": "cooccurrence_vectorizer",
+                        "window_size": None,
+                        "filter_stop_words": True,
+                        "unknown_words_replacement_string": None,
+                        "keep_order": True
+                    }
+                },
+                "noise_reweight_factor": 1,  # nested in the classifier config, matching the other language configs
+            }
+        }
+    ]
+}