diff options
author | 2023-10-10 14:33:42 +0000 | |
---|---|---|
committer | 2023-10-10 14:33:42 +0000 | |
commit | af1a266670d040d2f4083ff309d732d648afba2a (patch) | |
tree | 2fc46203448ddcc6f81546d379abfaeb323575e9 /roms/edk2/MdeModulePkg/Library/BrotliCustomDecompressLib/brotli/research/dictionary_generator.cc | |
parent | e02cda008591317b1625707ff8e115a4841aa889 (diff) |
Change-Id: Iaf8d18082d3991dec7c0ebbea540f092188eb4ec
Diffstat (limited to 'roms/edk2/MdeModulePkg/Library/BrotliCustomDecompressLib/brotli/research/dictionary_generator.cc')
-rwxr-xr-x | roms/edk2/MdeModulePkg/Library/BrotliCustomDecompressLib/brotli/research/dictionary_generator.cc | 326 |
1 files changed, 326 insertions, 0 deletions
diff --git a/roms/edk2/MdeModulePkg/Library/BrotliCustomDecompressLib/brotli/research/dictionary_generator.cc b/roms/edk2/MdeModulePkg/Library/BrotliCustomDecompressLib/brotli/research/dictionary_generator.cc new file mode 100755 index 000000000..dcdf2fa12 --- /dev/null +++ b/roms/edk2/MdeModulePkg/Library/BrotliCustomDecompressLib/brotli/research/dictionary_generator.cc @@ -0,0 +1,326 @@ +#include <climits> +#include <cstddef> +#include <cstdio> +#include <cstring> +#include <fstream> +#include <vector> + +#include "./deorummolae.h" +#include "./durchschlag.h" +#include "./sieve.h" + +/* This isn't a definitive list of "--foo" arguments, only those that take an + * additional "=#" integer parameter, like "--foo=20" or "--foo=32K". + */ +#define LONG_ARG_BLOCK_LEN "--block_len=" +#define LONG_ARG_SLICE_LEN "--slice_len=" +#define LONG_ARG_TARGET_DICT_LEN "--target_dict_len=" +#define LONG_ARG_MIN_SLICE_POP "--min_slice_pop=" +#define LONG_ARG_CHUNK_LEN "--chunk_len=" +#define LONG_ARG_OVERLAP_LEN "--overlap_len=" + +#define METHOD_DM 0 +#define METHOD_SIEVE 1 +#define METHOD_DURCHSCHLAG 2 +#define METHOD_DISTILL 3 +#define METHOD_PURIFY 4 + +static size_t readInt(const char* str) { + size_t result = 0; + if (str[0] == 0 || str[0] == '0') { + return 0; + } + for (size_t i = 0; i < 13; ++i) { + if (str[i] == 0) { + return result; + } + if (str[i] == 'k' || str[i] == 'K') { + if ((str[i + 1] == 0) && ((result << 10) > result)) { + return result << 10; + } + return 0; + } + if (str[i] == 'm' || str[i] == 'M') { + if ((str[i + 1] == 0) && ((result << 20) > result)) { + return result << 20; + } + return 0; + } + if (str[i] < '0' || str[i] > '9') { + return 0; + } + size_t next = (10 * result) + (str[i] - '0'); + if (next <= result) { + return 0; + } + result = next; + } + return 0; +} + +static std::string readFile(const std::string& path) { + std::ifstream file(path); + std::string content( + (std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>()); + return content; +} + +static void writeFile(const char* file, const std::string& content) { + std::ofstream outfile(file, std::ofstream::binary); + outfile.write(content.c_str(), static_cast<std::streamsize>(content.size())); + outfile.close(); +} + +static void writeSamples(char const* argv[], const std::vector<int>& pathArgs, + const std::vector<size_t>& sizes, const uint8_t* data) { + size_t offset = 0; + for (size_t i = 0; i < pathArgs.size(); ++i) { + int j = pathArgs[i]; + const char* file = argv[j]; + size_t sampleSize = sizes[i]; + std::ofstream outfile(file, std::ofstream::binary); + outfile.write(reinterpret_cast<const char*>(data + offset), + static_cast<std::streamsize>(sampleSize)); + outfile.close(); + offset += sampleSize; + } +} + +/* Returns "base file name" or its tail, if it contains '/' or '\'. */ +static const char* fileName(const char* path) { + const char* separator_position = strrchr(path, '/'); + if (separator_position) path = separator_position + 1; + separator_position = strrchr(path, '\\'); + if (separator_position) path = separator_position + 1; + return path; +} + +static void printHelp(const char* name) { + fprintf(stderr, "Usage: %s [OPTION]... DICTIONARY [SAMPLE]...\n", name); + fprintf(stderr, + "Options:\n" + " --dm use 'deorummolae' engine\n" + " --distill rewrite samples; unique text parts are removed\n" + " --dsh use 'durchschlag' engine (default)\n" + " --purify rewrite samples; unique text parts are zeroed out\n" + " --sieve use 'sieve' engine\n" + " -b#, --block_len=#\n" + " set block length for 'durchschlag'; default: 1024\n" + " -s#, --slice_len=#\n" + " set slice length for 'distill', 'durchschlag', 'purify'\n" + " and 'sieve'; default: 16\n" + " -t#, --target_dict_len=#\n" + " set target dictionary length (limit); default: 16K\n" + " -u#, --min_slice_pop=#\n" + " set minimum slice population (for rewrites); default: 2\n" + " -c#, --chunk_len=#\n" + " if positive, samples are cut into chunks of this length;\n" + " default: 0; cannot mix with 'rewrite samples'\n" + " -o#, --overlap_len=#\n" + " set chunk overlap length; default 0\n" + "# is a decimal number with optional k/K/m/M suffix.\n" + "WARNING: 'distill' and 'purify' will overwrite original samples!\n" + " Completely unique samples might become empty files.\n\n"); +} + +int main(int argc, char const* argv[]) { + int dictionaryArg = -1; + int method = METHOD_DURCHSCHLAG; + size_t sliceLen = 16; + size_t targetSize = 16 << 10; + size_t blockSize = 1024; + size_t minimumPopulation = 2; + size_t chunkLen = 0; + size_t overlapLen = 0; + + std::vector<uint8_t> data; + std::vector<size_t> sizes; + std::vector<int> pathArgs; + size_t total = 0; + for (int i = 1; i < argc; ++i) { + if (argv[i] == nullptr) { + continue; + } + + if (argv[i][0] == '-') { + char arg1 = argv[i][1]; + const char* arg2 = arg1 ? &argv[i][2] : nullptr; + if (arg1 == '-') { + if (dictionaryArg != -1) { + fprintf(stderr, + "Method should be specified before dictionary / sample '%s'\n", + argv[i]); + exit(1); + } + + /* Look for "--long_arg" via exact match. */ + if (std::strcmp(argv[i], "--sieve") == 0) { + method = METHOD_SIEVE; + continue; + } + if (std::strcmp(argv[i], "--dm") == 0) { + method = METHOD_DM; + continue; + } + if (std::strcmp(argv[i], "--dsh") == 0) { + method = METHOD_DURCHSCHLAG; + continue; + } + if (std::strcmp(argv[i], "--distill") == 0) { + method = METHOD_DISTILL; + continue; + } + if (std::strcmp(argv[i], "--purify") == 0) { + method = METHOD_PURIFY; + continue; + } + + /* Look for "--long_arg=#" via prefix match. */ + if (std::strncmp(argv[i], LONG_ARG_BLOCK_LEN, + std::strlen(LONG_ARG_BLOCK_LEN)) == 0) { + arg1 = 'b'; + arg2 = &argv[i][std::strlen(LONG_ARG_BLOCK_LEN)]; + } else if (std::strncmp(argv[i], LONG_ARG_SLICE_LEN, + std::strlen(LONG_ARG_SLICE_LEN)) == 0) { + arg1 = 's'; + arg2 = &argv[i][std::strlen(LONG_ARG_SLICE_LEN)]; + } else if (std::strncmp(argv[i], LONG_ARG_TARGET_DICT_LEN, + std::strlen(LONG_ARG_TARGET_DICT_LEN)) == 0) { + arg1 = 't'; + arg2 = &argv[i][std::strlen(LONG_ARG_TARGET_DICT_LEN)]; + } else if (std::strncmp(argv[i], LONG_ARG_MIN_SLICE_POP, + std::strlen(LONG_ARG_MIN_SLICE_POP)) == 0) { + arg1 = 'u'; + arg2 = &argv[i][std::strlen(LONG_ARG_MIN_SLICE_POP)]; + } else if (std::strncmp(argv[i], LONG_ARG_CHUNK_LEN, + std::strlen(LONG_ARG_CHUNK_LEN)) == 0) { + arg1 = 'c'; + arg2 = &argv[i][std::strlen(LONG_ARG_CHUNK_LEN)]; + } else if (std::strncmp(argv[i], LONG_ARG_OVERLAP_LEN, + std::strlen(LONG_ARG_OVERLAP_LEN)) == 0) { + arg1 = 'o'; + arg2 = &argv[i][std::strlen(LONG_ARG_OVERLAP_LEN)]; + } else { + printHelp(fileName(argv[0])); + fprintf(stderr, "Invalid option '%s'\n", argv[i]); + exit(1); + } + } + + /* Look for "-f" short args or "--foo=#" long args. */ + if (arg1 == 'b') { + blockSize = readInt(arg2); + if (blockSize < 16 || blockSize > 65536) { + printHelp(fileName(argv[0])); + fprintf(stderr, "Invalid option '%s'\n", argv[i]); + exit(1); + } + } else if (arg1 == 's') { + sliceLen = readInt(arg2); + if (sliceLen < 4 || sliceLen > 256) { + printHelp(fileName(argv[0])); + fprintf(stderr, "Invalid option '%s'\n", argv[i]); + exit(1); + } + } else if (arg1 == 't') { + targetSize = readInt(arg2); + if (targetSize < 256 || targetSize > (1 << 25)) { + printHelp(fileName(argv[0])); + fprintf(stderr, "Invalid option '%s'\n", argv[i]); + exit(1); + } + } else if (arg1 == 'u') { + minimumPopulation = readInt(arg2); + if (minimumPopulation < 256 || minimumPopulation > 65536) { + printHelp(fileName(argv[0])); + fprintf(stderr, "Invalid option '%s'\n", argv[i]); + exit(1); + } + } else if (arg1 == 'c') { + chunkLen = readInt(arg2); + if (chunkLen < 0 || chunkLen > INT_MAX) { + printHelp(fileName(argv[0])); + fprintf(stderr, "Invalid option '%s'\n", argv[i]); + exit(1); + } + } else if (arg1 == 'o') { + overlapLen = readInt(arg2); + if (overlapLen < 0 || overlapLen > INT_MAX) { + printHelp(fileName(argv[0])); + fprintf(stderr, "Invalid option '%s'\n", argv[i]); + exit(1); + } + } else { + printHelp(fileName(argv[0])); + fprintf(stderr, "Unrecognized option '%s'\n", argv[i]); + exit(1); + } + continue; + } + + if (dictionaryArg == -1) { + if (method != METHOD_DISTILL && method != METHOD_PURIFY) { + dictionaryArg = i; + continue; + } + } + + std::string content = readFile(argv[i]); + if (chunkLen == 0) { + pathArgs.push_back(i); + data.insert(data.end(), content.begin(), content.end()); + total += content.size(); + sizes.push_back(content.size()); + continue; + } else if (chunkLen <= overlapLen) { + printHelp(fileName(argv[0])); + fprintf(stderr, "Invalid chunkLen - overlapLen combination\n"); + exit(1); + } + for (size_t chunkStart = 0; + chunkStart < content.size(); + chunkStart += chunkLen - overlapLen) { + std::string chunk = content.substr(chunkStart, chunkLen); + data.insert(data.end(), chunk.begin(), chunk.end()); + total += chunk.size(); + sizes.push_back(chunk.size()); + } + } + + bool wantDictionary = (dictionaryArg == -1); + if (method == METHOD_DISTILL || method == METHOD_PURIFY) { + wantDictionary = false; + if (chunkLen != 0) { + printHelp(fileName(argv[0])); + fprintf(stderr, "Cannot mix 'rewrite samples' with positive chunk_len\n"); + exit(1); + } + } + if (wantDictionary || total == 0) { + printHelp(fileName(argv[0])); + fprintf(stderr, "Not enough arguments\n"); + exit(1); + } + + if (method == METHOD_SIEVE) { + writeFile(argv[dictionaryArg], sieve_generate( + targetSize, sliceLen, sizes, data.data())); + } else if (method == METHOD_DM) { + writeFile(argv[dictionaryArg], DM_generate( + targetSize, sizes, data.data())); + } else if (method == METHOD_DURCHSCHLAG) { + writeFile(argv[dictionaryArg], durchschlag_generate( + targetSize, sliceLen, blockSize, sizes, data.data())); + } else if (method == METHOD_DISTILL) { + durchschlag_distill(sliceLen, minimumPopulation, &sizes, data.data()); + writeSamples(argv, pathArgs, sizes, data.data()); + } else if (method == METHOD_PURIFY) { + durchschlag_purify(sliceLen, minimumPopulation, sizes, data.data()); + writeSamples(argv, pathArgs, sizes, data.data()); + } else { + printHelp(fileName(argv[0])); + fprintf(stderr, "Unknown generator\n"); + exit(1); + } + return 0; +} |