1 files changed, 302 insertions, 0 deletions
diff --git a/roms/edk2/BaseTools/Source/C/BrotliCompress/brotli/research/deorummolae.cc b/roms/edk2/BaseTools/Source/C/BrotliCompress/brotli/research/deorummolae.cc
new file mode 100644
index 000000000..d15b7ee55
--- /dev/null
+++ b/roms/edk2/BaseTools/Source/C/BrotliCompress/brotli/research/deorummolae.cc
@@ -0,0 +1,302 @@
+#include "./deorummolae.h"
+
+#include <array>
+#include <cstdio>
+
+#include "./esaxx/sais.hxx"
+
+/* Used for quick SA-entry to file mapping. Each file is padded to size that
+   is a multiple of chunk size. */
+#define CHUNK_SIZE 64
+/* Length of substring that is considered to be covered by dictionary string. */
+#define CUT_MATCH 6
+/* Minimal dictionary entry size. */
+#define MIN_MATCH 24
+
+/* Non tunable definitions. */
+#define CHUNK_MASK (CHUNK_SIZE - 1)
+#define COVERAGE_SIZE (1 << (DM_LOG_MAX_FILES - 6))
+
+/* File coverage: every bit set to 1 denotes a file covered by an isle. */
+typedef std::array<uint64_t, COVERAGE_SIZE> Coverage;
+
+/* Symbol of text alphabet. */
+typedef int32_t TextChar;
+
+/* Pointer to position in text. */
+typedef uint32_t TextIdx;
+
+/* SAIS sarray_type; unfortunately, must be a signed type. */
+typedef int32_t TextSaIdx;
+
+static size_t popcount(uint64_t u) {
+  return static_cast<size_t>(__builtin_popcountll(u));
+}
+
+/* Condense terminators and pad file entries. */
+static void rewriteText(std::vector<TextChar>* text) {
+  TextChar terminator = text->back();
+  TextChar prev = terminator;
+  TextIdx to = 0;
+  for (TextIdx from = 0; from < text->size(); ++from) {
+    TextChar next = text->at(from);
+    if (next < 256 || prev < 256) {
+      text->at(to++) = next;
+      if (next >= 256) terminator = next;
+    }
+    prev = next;
+  }
+  text->resize(to);
+  if (text->empty()) text->push_back(terminator);
+  while (text->size() & CHUNK_MASK) text->push_back(terminator);
+}
+
+/* Reenumerate terminators for smaller alphabet. */
+static void remapTerminators(std::vector<TextChar>* text,
+    TextChar* next_terminator) {
+  TextChar prev = -1;
+  TextChar x = 256;
+  for (TextIdx i = 0; i < text->size(); ++i) {
+    TextChar next = text->at(i);
+    if (next < 256) {  // Char.
+      // Do nothing.
+    } else if (prev < 256) {  // Terminator after char.
+      next = x++;
+    } else {  // Terminator after terminator.
+      next = prev;
+    }
+    text->at(i) = next;
+    prev = next;
+  }
+  *next_terminator = x;
+}
+
+/* Combine all file entries; create mapping position->file. */
+static void buildFullText(std::vector<std::vector<TextChar>>* data,
+    std::vector<TextChar>* full_text, std::vector<TextIdx>* file_map,
+    std::vector<TextIdx>* file_offset, TextChar* next_terminator) {
+  file_map->resize(0);
+  file_offset->resize(0);
+  full_text->resize(0);
+  for (TextIdx i = 0; i < data->size(); ++i) {
+    file_offset->push_back(full_text->size());
+    std::vector<TextChar>& file = data->at(i);
+    rewriteText(&file);
+    full_text->insert(full_text->end(), file.begin(), file.end());
+    file_map->insert(file_map->end(), file.size() / CHUNK_SIZE, i);
+  }
+  if (false) remapTerminators(full_text, next_terminator);
+}
+
+/* Build longest-common-prefix based on suffix array and text.
+   TODO: borrowed -> unknown efficiency. */
+static void buildLcp(std::vector<TextChar>* text, std::vector<TextIdx>* sa,
+    std::vector<TextIdx>* lcp, std::vector<TextIdx>* invese_sa) {
+  TextIdx size = static_cast<TextIdx>(text->size());
+  lcp->resize(size);
+  TextIdx k = 0;
+  lcp->at(size - 1) = 0;
+  for (TextIdx i = 0; i < size; ++i) {
+    if (invese_sa->at(i) == size - 1) {
+      k = 0;
+      continue;
+    }
+    // Suffix which follow i-th suffix.
+    TextIdx j = sa->at(invese_sa->at(i) + 1);
+    while (i + k < size && j + k < size && text->at(i + k) == text->at(j + k)) {
+      ++k;
+    }
+    lcp->at(invese_sa->at(i)) = k;
+    if (k > 0) --k;
+  }
+}
+
+/* Isle is a range in SA with LCP not less than some value.
+   When we raise the LCP requirement, the isle sunks and smaller isles appear
+   instead. */
+typedef struct {
+  TextIdx lcp;
+  TextIdx l;
+  TextIdx r;
+  Coverage coverage;
+} Isle;
+
+/* Helper routine for `cutMatch`. */
+static void poisonData(TextIdx pos, TextIdx length,
+    std::vector<std::vector<TextChar>>* data, std::vector<TextIdx>* file_map,
+    std::vector<TextIdx>* file_offset, TextChar* next_terminator) {
+  TextIdx f = file_map->at(pos / CHUNK_SIZE);
+  pos -= file_offset->at(f);
+  std::vector<TextChar>& file = data->at(f);
+  TextIdx l = (length == CUT_MATCH) ? CUT_MATCH : 1;
+  for (TextIdx j = 0; j < l; j++, pos++) {
+    if (file[pos] >= 256) continue;
+    if (file[pos + 1] >= 256) {
+      file[pos] = file[pos + 1];
+    } else if (pos > 0 && file[pos - 1] >= 256) {
+      file[pos] = file[pos - 1];
+    } else {
+      file[pos] = (*next_terminator)++;
+    }
+  }
+}
+
+/* Remove substrings of a given match from files.
+   Substrings are replaced with unique terminators, so next iteration SA would
+   not allow to cross removed areas. */
+static void cutMatch(std::vector<std::vector<TextChar>>* data, TextIdx index,
+    TextIdx length, std::vector<TextIdx>* sa, std::vector<TextIdx>* lcp,
+    std::vector<TextIdx>* invese_sa, TextChar* next_terminator,
+    std::vector<TextIdx>* file_map, std::vector<TextIdx>* file_offset) {
+  while (length >= CUT_MATCH) {
+    TextIdx i = index;
+    while (lcp->at(i) >= length) {
+      i++;
+      poisonData(
+          sa->at(i), length, data, file_map, file_offset, next_terminator);
+    }
+    while (true) {
+      poisonData(
+          sa->at(index), length, data, file_map, file_offset, next_terminator);
+      if (index == 0 || lcp->at(index - 1) < length) break;
+      index--;
+    }
+    length--;
+    index = invese_sa->at(sa->at(index) + 1);
+  }
+}
+
+std::string DM_generate(size_t dictionary_size_limit,
+    const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
+  {
+    TextIdx tmp = static_cast<TextIdx>(dictionary_size_limit);
+    if ((tmp != dictionary_size_limit) || (tmp > 1u << 30)) {
+      fprintf(stderr, "dictionary_size_limit is too large\n");
+      return "";
+    }
+  }
+
+  /* Could use 256 + '0' for easier debugging. */
+  TextChar next_terminator = 256;
+
+  std::string output;
+  std::vector<std::vector<TextChar>> data;
+
+  TextIdx offset = 0;
+  size_t num_samples = sample_sizes.size();
+  if (num_samples > DM_MAX_FILES) num_samples = DM_MAX_FILES;
+  for (size_t n = 0; n < num_samples; ++n) {
+    TextIdx delta = static_cast<TextIdx>(sample_sizes[n]);
+    if (delta != sample_sizes[n]) {
+      fprintf(stderr, "sample is too large\n");
+      return "";
+    }
+    if (delta == 0) {
+      fprintf(stderr, "0-length samples are prohibited\n");
+      return "";
+    }
+    TextIdx next_offset = offset + delta;
+    if (next_offset <= offset) {
+      fprintf(stderr, "corpus is too large\n");
+      return "";
+    }
+    data.push_back(
+        std::vector<TextChar>(sample_data + offset, sample_data + next_offset));
+    offset = next_offset;
+    data.back().push_back(next_terminator++);
+  }
+
+  /* Most arrays are allocated once, and then just resized to smaller and
+     smaller sizes. */
+  std::vector<TextChar> full_text;
+  std::vector<TextIdx> file_map;
+  std::vector<TextIdx> file_offset;
+  std::vector<TextIdx> sa;
+  std::vector<TextIdx> invese_sa;
+  std::vector<TextIdx> lcp;
+  std::vector<Isle> isles;
+  std::vector<char> output_data;
+  TextIdx total = 0;
+  TextIdx total_cost = 0;
+  TextIdx best_cost;
+  Isle best_isle;
+  size_t min_count = num_samples;
+
+  while (true) {
+    TextIdx max_match = static_cast<TextIdx>(dictionary_size_limit) - total;
+    buildFullText(&data, &full_text, &file_map, &file_offset, &next_terminator);
+    sa.resize(full_text.size());
+    /* Hopefully, non-negative TextSaIdx is the same sa TextIdx counterpart. */
+    saisxx(full_text.data(), reinterpret_cast<TextSaIdx*>(sa.data()),
+        static_cast<TextChar>(full_text.size()), next_terminator);
+    invese_sa.resize(full_text.size());
+    for (TextIdx i = 0; i < full_text.size(); ++i) {
+      invese_sa[sa[i]] = i;
+    }
+    buildLcp(&full_text, &sa, &lcp, &invese_sa);
+
+    /* Do not rebuild SA/LCP, just use different selection. */
+  retry:
+    best_cost = 0;
+    best_isle = {0, 0, 0, {{0}}};
+    isles.resize(0);
+    isles.push_back(best_isle);
+
+    for (TextIdx i = 0; i < lcp.size(); ++i) {
+      TextIdx l = i;
+      Coverage cov = {{0}};
+      size_t f = file_map[sa[i] / CHUNK_SIZE];
+      cov[f >> 6] = (static_cast<uint64_t>(1)) << (f & 63);
+      while (lcp[i] < isles.back().lcp) {
+        Isle& top = isles.back();
+        top.r = i;
+        l = top.l;
+        for (size_t x = 0; x < cov.size(); ++x) cov[x] |= top.coverage[x];
+        size_t count = 0;
+        for (size_t x = 0; x < cov.size(); ++x) count += popcount(cov[x]);
+        TextIdx effective_lcp = top.lcp;
+        /* Restrict (last) dictionary entry length. */
+        if (effective_lcp > max_match) effective_lcp = max_match;
+        TextIdx cost = count * effective_lcp;
+        if (cost > best_cost && count >= min_count &&
+            effective_lcp >= MIN_MATCH) {
+          best_cost = cost;
+          best_isle = top;
+          best_isle.lcp = effective_lcp;
+        }
+        isles.pop_back();
+        for (size_t x = 0; x < cov.size(); ++x) {
+          isles.back().coverage[x] |= cov[x];
+        }
+      }
+      if (lcp[i] > isles.back().lcp) isles.push_back({lcp[i], l, 0, {{0}}});
+      for (size_t x = 0; x < cov.size(); ++x) {
+        isles.back().coverage[x] |= cov[x];
+      }
+    }
+
+    /* When saturated matches do not match length restrictions, lower the
+       saturation requirements. */
+    if (best_cost == 0 || best_isle.lcp < MIN_MATCH) {
+      if (min_count >= 8) {
+        min_count = (min_count * 7) / 8;
+        fprintf(stderr, "Retry: min_count=%zu\n", min_count);
+        goto retry;
+      }
+      break;
+    }
+
+    /* Save the entry. */
+    fprintf(stderr, "Savings: %d+%d, dictionary: %d+%d\n",
+        total_cost, best_cost, total, best_isle.lcp);
+    int* piece = &full_text[sa[best_isle.l]];
+    output.insert(output.end(), piece, piece + best_isle.lcp);
+    total += best_isle.lcp;
+    total_cost += best_cost;
+    cutMatch(&data, best_isle.l, best_isle.lcp, &sa, &lcp, &invese_sa,
+        &next_terminator, &file_map, &file_offset);
+    if (total >= dictionary_size_limit) break;
+  }
+
+  return output;
+}