aboutsummaryrefslogtreecommitdiffstats
path: root/roms/edk2/MdeModulePkg/Library/BrotliCustomDecompressLib/brotli/research/dictionary_generator.cc
diff options
context:
space:
mode:
authorAngelos Mouzakitis <a.mouzakitis@virtualopensystems.com>2023-10-10 14:33:42 +0000
committerAngelos Mouzakitis <a.mouzakitis@virtualopensystems.com>2023-10-10 14:33:42 +0000
commitaf1a266670d040d2f4083ff309d732d648afba2a (patch)
tree2fc46203448ddcc6f81546d379abfaeb323575e9 /roms/edk2/MdeModulePkg/Library/BrotliCustomDecompressLib/brotli/research/dictionary_generator.cc
parente02cda008591317b1625707ff8e115a4841aa889 (diff)
Add submodule dependency filesHEADmaster
Change-Id: Iaf8d18082d3991dec7c0ebbea540f092188eb4ec
Diffstat (limited to 'roms/edk2/MdeModulePkg/Library/BrotliCustomDecompressLib/brotli/research/dictionary_generator.cc')
-rwxr-xr-xroms/edk2/MdeModulePkg/Library/BrotliCustomDecompressLib/brotli/research/dictionary_generator.cc326
1 files changed, 326 insertions, 0 deletions
diff --git a/roms/edk2/MdeModulePkg/Library/BrotliCustomDecompressLib/brotli/research/dictionary_generator.cc b/roms/edk2/MdeModulePkg/Library/BrotliCustomDecompressLib/brotli/research/dictionary_generator.cc
new file mode 100755
index 000000000..dcdf2fa12
--- /dev/null
+++ b/roms/edk2/MdeModulePkg/Library/BrotliCustomDecompressLib/brotli/research/dictionary_generator.cc
@@ -0,0 +1,326 @@
+#include <climits>
+#include <cstddef>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <vector>
+
+#include "./deorummolae.h"
+#include "./durchschlag.h"
+#include "./sieve.h"
+
+/* This isn't a definitive list of "--foo" arguments, only those that take an
+ * additional "=#" integer parameter, like "--foo=20" or "--foo=32K".
+ */
+#define LONG_ARG_BLOCK_LEN "--block_len="
+#define LONG_ARG_SLICE_LEN "--slice_len="
+#define LONG_ARG_TARGET_DICT_LEN "--target_dict_len="
+#define LONG_ARG_MIN_SLICE_POP "--min_slice_pop="
+#define LONG_ARG_CHUNK_LEN "--chunk_len="
+#define LONG_ARG_OVERLAP_LEN "--overlap_len="
+
+#define METHOD_DM 0
+#define METHOD_SIEVE 1
+#define METHOD_DURCHSCHLAG 2
+#define METHOD_DISTILL 3
+#define METHOD_PURIFY 4
+
+static size_t readInt(const char* str) {
+ size_t result = 0;
+ if (str[0] == 0 || str[0] == '0') {
+ return 0;
+ }
+ for (size_t i = 0; i < 13; ++i) {
+ if (str[i] == 0) {
+ return result;
+ }
+ if (str[i] == 'k' || str[i] == 'K') {
+ if ((str[i + 1] == 0) && ((result << 10) > result)) {
+ return result << 10;
+ }
+ return 0;
+ }
+ if (str[i] == 'm' || str[i] == 'M') {
+ if ((str[i + 1] == 0) && ((result << 20) > result)) {
+ return result << 20;
+ }
+ return 0;
+ }
+ if (str[i] < '0' || str[i] > '9') {
+ return 0;
+ }
+ size_t next = (10 * result) + (str[i] - '0');
+ if (next <= result) {
+ return 0;
+ }
+ result = next;
+ }
+ return 0;
+}
+
+static std::string readFile(const std::string& path) {
+ std::ifstream file(path);
+ std::string content(
+ (std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
+ return content;
+}
+
+static void writeFile(const char* file, const std::string& content) {
+ std::ofstream outfile(file, std::ofstream::binary);
+ outfile.write(content.c_str(), static_cast<std::streamsize>(content.size()));
+ outfile.close();
+}
+
+static void writeSamples(char const* argv[], const std::vector<int>& pathArgs,
+ const std::vector<size_t>& sizes, const uint8_t* data) {
+ size_t offset = 0;
+ for (size_t i = 0; i < pathArgs.size(); ++i) {
+ int j = pathArgs[i];
+ const char* file = argv[j];
+ size_t sampleSize = sizes[i];
+ std::ofstream outfile(file, std::ofstream::binary);
+ outfile.write(reinterpret_cast<const char*>(data + offset),
+ static_cast<std::streamsize>(sampleSize));
+ outfile.close();
+ offset += sampleSize;
+ }
+}
+
+/* Returns "base file name" or its tail, if it contains '/' or '\'. */
+static const char* fileName(const char* path) {
+ const char* separator_position = strrchr(path, '/');
+ if (separator_position) path = separator_position + 1;
+ separator_position = strrchr(path, '\\');
+ if (separator_position) path = separator_position + 1;
+ return path;
+}
+
+static void printHelp(const char* name) {
+ fprintf(stderr, "Usage: %s [OPTION]... DICTIONARY [SAMPLE]...\n", name);
+ fprintf(stderr,
+ "Options:\n"
+ " --dm use 'deorummolae' engine\n"
+ " --distill rewrite samples; unique text parts are removed\n"
+ " --dsh use 'durchschlag' engine (default)\n"
+ " --purify rewrite samples; unique text parts are zeroed out\n"
+ " --sieve use 'sieve' engine\n"
+ " -b#, --block_len=#\n"
+ " set block length for 'durchschlag'; default: 1024\n"
+ " -s#, --slice_len=#\n"
+ " set slice length for 'distill', 'durchschlag', 'purify'\n"
+ " and 'sieve'; default: 16\n"
+ " -t#, --target_dict_len=#\n"
+ " set target dictionary length (limit); default: 16K\n"
+ " -u#, --min_slice_pop=#\n"
+ " set minimum slice population (for rewrites); default: 2\n"
+ " -c#, --chunk_len=#\n"
+ " if positive, samples are cut into chunks of this length;\n"
+ " default: 0; cannot mix with 'rewrite samples'\n"
+ " -o#, --overlap_len=#\n"
+ " set chunk overlap length; default 0\n"
+ "# is a decimal number with optional k/K/m/M suffix.\n"
+ "WARNING: 'distill' and 'purify' will overwrite original samples!\n"
+ " Completely unique samples might become empty files.\n\n");
+}
+
+int main(int argc, char const* argv[]) {
+ int dictionaryArg = -1;
+ int method = METHOD_DURCHSCHLAG;
+ size_t sliceLen = 16;
+ size_t targetSize = 16 << 10;
+ size_t blockSize = 1024;
+ size_t minimumPopulation = 2;
+ size_t chunkLen = 0;
+ size_t overlapLen = 0;
+
+ std::vector<uint8_t> data;
+ std::vector<size_t> sizes;
+ std::vector<int> pathArgs;
+ size_t total = 0;
+ for (int i = 1; i < argc; ++i) {
+ if (argv[i] == nullptr) {
+ continue;
+ }
+
+ if (argv[i][0] == '-') {
+ char arg1 = argv[i][1];
+ const char* arg2 = arg1 ? &argv[i][2] : nullptr;
+ if (arg1 == '-') {
+ if (dictionaryArg != -1) {
+ fprintf(stderr,
+ "Method should be specified before dictionary / sample '%s'\n",
+ argv[i]);
+ exit(1);
+ }
+
+ /* Look for "--long_arg" via exact match. */
+ if (std::strcmp(argv[i], "--sieve") == 0) {
+ method = METHOD_SIEVE;
+ continue;
+ }
+ if (std::strcmp(argv[i], "--dm") == 0) {
+ method = METHOD_DM;
+ continue;
+ }
+ if (std::strcmp(argv[i], "--dsh") == 0) {
+ method = METHOD_DURCHSCHLAG;
+ continue;
+ }
+ if (std::strcmp(argv[i], "--distill") == 0) {
+ method = METHOD_DISTILL;
+ continue;
+ }
+ if (std::strcmp(argv[i], "--purify") == 0) {
+ method = METHOD_PURIFY;
+ continue;
+ }
+
+ /* Look for "--long_arg=#" via prefix match. */
+ if (std::strncmp(argv[i], LONG_ARG_BLOCK_LEN,
+ std::strlen(LONG_ARG_BLOCK_LEN)) == 0) {
+ arg1 = 'b';
+ arg2 = &argv[i][std::strlen(LONG_ARG_BLOCK_LEN)];
+ } else if (std::strncmp(argv[i], LONG_ARG_SLICE_LEN,
+ std::strlen(LONG_ARG_SLICE_LEN)) == 0) {
+ arg1 = 's';
+ arg2 = &argv[i][std::strlen(LONG_ARG_SLICE_LEN)];
+ } else if (std::strncmp(argv[i], LONG_ARG_TARGET_DICT_LEN,
+ std::strlen(LONG_ARG_TARGET_DICT_LEN)) == 0) {
+ arg1 = 't';
+ arg2 = &argv[i][std::strlen(LONG_ARG_TARGET_DICT_LEN)];
+ } else if (std::strncmp(argv[i], LONG_ARG_MIN_SLICE_POP,
+ std::strlen(LONG_ARG_MIN_SLICE_POP)) == 0) {
+ arg1 = 'u';
+ arg2 = &argv[i][std::strlen(LONG_ARG_MIN_SLICE_POP)];
+ } else if (std::strncmp(argv[i], LONG_ARG_CHUNK_LEN,
+ std::strlen(LONG_ARG_CHUNK_LEN)) == 0) {
+ arg1 = 'c';
+ arg2 = &argv[i][std::strlen(LONG_ARG_CHUNK_LEN)];
+ } else if (std::strncmp(argv[i], LONG_ARG_OVERLAP_LEN,
+ std::strlen(LONG_ARG_OVERLAP_LEN)) == 0) {
+ arg1 = 'o';
+ arg2 = &argv[i][std::strlen(LONG_ARG_OVERLAP_LEN)];
+ } else {
+ printHelp(fileName(argv[0]));
+ fprintf(stderr, "Invalid option '%s'\n", argv[i]);
+ exit(1);
+ }
+ }
+
+ /* Look for "-f" short args or "--foo=#" long args. */
+ if (arg1 == 'b') {
+ blockSize = readInt(arg2);
+ if (blockSize < 16 || blockSize > 65536) {
+ printHelp(fileName(argv[0]));
+ fprintf(stderr, "Invalid option '%s'\n", argv[i]);
+ exit(1);
+ }
+ } else if (arg1 == 's') {
+ sliceLen = readInt(arg2);
+ if (sliceLen < 4 || sliceLen > 256) {
+ printHelp(fileName(argv[0]));
+ fprintf(stderr, "Invalid option '%s'\n", argv[i]);
+ exit(1);
+ }
+ } else if (arg1 == 't') {
+ targetSize = readInt(arg2);
+ if (targetSize < 256 || targetSize > (1 << 25)) {
+ printHelp(fileName(argv[0]));
+ fprintf(stderr, "Invalid option '%s'\n", argv[i]);
+ exit(1);
+ }
+ } else if (arg1 == 'u') {
+ minimumPopulation = readInt(arg2);
+ if (minimumPopulation < 256 || minimumPopulation > 65536) {
+ printHelp(fileName(argv[0]));
+ fprintf(stderr, "Invalid option '%s'\n", argv[i]);
+ exit(1);
+ }
+ } else if (arg1 == 'c') {
+ chunkLen = readInt(arg2);
+ if (chunkLen < 0 || chunkLen > INT_MAX) {
+ printHelp(fileName(argv[0]));
+ fprintf(stderr, "Invalid option '%s'\n", argv[i]);
+ exit(1);
+ }
+ } else if (arg1 == 'o') {
+ overlapLen = readInt(arg2);
+ if (overlapLen < 0 || overlapLen > INT_MAX) {
+ printHelp(fileName(argv[0]));
+ fprintf(stderr, "Invalid option '%s'\n", argv[i]);
+ exit(1);
+ }
+ } else {
+ printHelp(fileName(argv[0]));
+ fprintf(stderr, "Unrecognized option '%s'\n", argv[i]);
+ exit(1);
+ }
+ continue;
+ }
+
+ if (dictionaryArg == -1) {
+ if (method != METHOD_DISTILL && method != METHOD_PURIFY) {
+ dictionaryArg = i;
+ continue;
+ }
+ }
+
+ std::string content = readFile(argv[i]);
+ if (chunkLen == 0) {
+ pathArgs.push_back(i);
+ data.insert(data.end(), content.begin(), content.end());
+ total += content.size();
+ sizes.push_back(content.size());
+ continue;
+ } else if (chunkLen <= overlapLen) {
+ printHelp(fileName(argv[0]));
+ fprintf(stderr, "Invalid chunkLen - overlapLen combination\n");
+ exit(1);
+ }
+ for (size_t chunkStart = 0;
+ chunkStart < content.size();
+ chunkStart += chunkLen - overlapLen) {
+ std::string chunk = content.substr(chunkStart, chunkLen);
+ data.insert(data.end(), chunk.begin(), chunk.end());
+ total += chunk.size();
+ sizes.push_back(chunk.size());
+ }
+ }
+
+ bool wantDictionary = (dictionaryArg == -1);
+ if (method == METHOD_DISTILL || method == METHOD_PURIFY) {
+ wantDictionary = false;
+ if (chunkLen != 0) {
+ printHelp(fileName(argv[0]));
+ fprintf(stderr, "Cannot mix 'rewrite samples' with positive chunk_len\n");
+ exit(1);
+ }
+ }
+ if (wantDictionary || total == 0) {
+ printHelp(fileName(argv[0]));
+ fprintf(stderr, "Not enough arguments\n");
+ exit(1);
+ }
+
+ if (method == METHOD_SIEVE) {
+ writeFile(argv[dictionaryArg], sieve_generate(
+ targetSize, sliceLen, sizes, data.data()));
+ } else if (method == METHOD_DM) {
+ writeFile(argv[dictionaryArg], DM_generate(
+ targetSize, sizes, data.data()));
+ } else if (method == METHOD_DURCHSCHLAG) {
+ writeFile(argv[dictionaryArg], durchschlag_generate(
+ targetSize, sliceLen, blockSize, sizes, data.data()));
+ } else if (method == METHOD_DISTILL) {
+ durchschlag_distill(sliceLen, minimumPopulation, &sizes, data.data());
+ writeSamples(argv, pathArgs, sizes, data.data());
+ } else if (method == METHOD_PURIFY) {
+ durchschlag_purify(sliceLen, minimumPopulation, sizes, data.data());
+ writeSamples(argv, pathArgs, sizes, data.data());
+ } else {
+ printHelp(fileName(argv[0]));
+ fprintf(stderr, "Unknown generator\n");
+ exit(1);
+ }
+ return 0;
+}