Parolin 0.7.9 6796
Console (soon DLLs) to do a tar like job
Loading...
Searching...
No Matches
cluster_inc.h
Go to the documentation of this file.
1/* NOLINT(build/header_guard) */
2/* Copyright 2013 Google Inc. All Rights Reserved.
3
4 Distributed under MIT license.
5 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
6*/
7
8/* template parameters: FN, CODE */
9
10#define HistogramType FN(Histogram)
11
12/* Computes the bit cost reduction by combining out[idx1] and out[idx2] and if
13 it is below a threshold, stores the pair (idx1, idx2) in the *pairs queue. */
15 const HistogramType* out, HistogramType* tmp, const uint32_t* cluster_size,
16 uint32_t idx1, uint32_t idx2, size_t max_num_pairs, HistogramPair* pairs,
17 size_t* num_pairs) CODE({
18 BROTLI_BOOL is_good_pair = BROTLI_FALSE;
20 p.idx1 = p.idx2 = 0;
21 p.cost_diff = p.cost_combo = 0;
22 if (idx1 == idx2) {
23 return;
24 }
25 if (idx2 < idx1) {
26 uint32_t t = idx2;
27 idx2 = idx1;
28 idx1 = t;
29 }
30 p.idx1 = idx1;
31 p.idx2 = idx2;
32 p.cost_diff = 0.5 * ClusterCostDiff(cluster_size[idx1], cluster_size[idx2]);
33 p.cost_diff -= out[idx1].bit_cost_;
34 p.cost_diff -= out[idx2].bit_cost_;
35
36 if (out[idx1].total_count_ == 0) {
37 p.cost_combo = out[idx2].bit_cost_;
38 is_good_pair = BROTLI_TRUE;
39 } else if (out[idx2].total_count_ == 0) {
40 p.cost_combo = out[idx1].bit_cost_;
41 is_good_pair = BROTLI_TRUE;
42 } else {
43 double threshold = *num_pairs == 0 ? 1e99 :
44 BROTLI_MAX(double, 0.0, pairs[0].cost_diff);
45 double cost_combo;
46 *tmp = out[idx1];
47 FN(HistogramAddHistogram)(tmp, &out[idx2]);
48 cost_combo = FN(BrotliPopulationCost)(tmp);
49 if (cost_combo < threshold - p.cost_diff) {
50 p.cost_combo = cost_combo;
51 is_good_pair = BROTLI_TRUE;
52 }
53 }
54 if (is_good_pair) {
55 p.cost_diff += p.cost_combo;
56 if (*num_pairs > 0 && HistogramPairIsLess(&pairs[0], &p)) {
57 /* Replace the top of the queue if needed. */
58 if (*num_pairs < max_num_pairs) {
59 pairs[*num_pairs] = pairs[0];
60 ++(*num_pairs);
61 }
62 pairs[0] = p;
63 } else if (*num_pairs < max_num_pairs) {
64 pairs[*num_pairs] = p;
65 ++(*num_pairs);
66 }
67 }
68})
69
71 HistogramType* tmp,
72 uint32_t* cluster_size,
73 uint32_t* symbols,
74 uint32_t* clusters,
75 HistogramPair* pairs,
76 size_t num_clusters,
77 size_t symbols_size,
78 size_t max_clusters,
79 size_t max_num_pairs) CODE({
80 double cost_diff_threshold = 0.0;
81 size_t min_cluster_size = 1;
82 size_t num_pairs = 0;
83
84 {
85 /* We maintain a vector of histogram pairs, with the property that the pair
86 with the maximum bit cost reduction is the first. */
87 size_t idx1;
88 for (idx1 = 0; idx1 < num_clusters; ++idx1) {
89 size_t idx2;
90 for (idx2 = idx1 + 1; idx2 < num_clusters; ++idx2) {
91 FN(BrotliCompareAndPushToQueue)(out, tmp, cluster_size, clusters[idx1],
92 clusters[idx2], max_num_pairs, &pairs[0], &num_pairs);
93 }
94 }
95 }
96
97 while (num_clusters > min_cluster_size) {
98 uint32_t best_idx1;
99 uint32_t best_idx2;
100 size_t i;
101 if (pairs[0].cost_diff >= cost_diff_threshold) {
102 cost_diff_threshold = 1e99;
103 min_cluster_size = max_clusters;
104 continue;
105 }
106 /* Take the best pair from the top of heap. */
107 best_idx1 = pairs[0].idx1;
108 best_idx2 = pairs[0].idx2;
109 FN(HistogramAddHistogram)(&out[best_idx1], &out[best_idx2]);
110 out[best_idx1].bit_cost_ = pairs[0].cost_combo;
111 cluster_size[best_idx1] += cluster_size[best_idx2];
112 for (i = 0; i < symbols_size; ++i) {
113 if (symbols[i] == best_idx2) {
114 symbols[i] = best_idx1;
115 }
116 }
117 for (i = 0; i < num_clusters; ++i) {
118 if (clusters[i] == best_idx2) {
119 memmove(&clusters[i], &clusters[i + 1],
120 (num_clusters - i - 1) * sizeof(clusters[0]));
121 break;
122 }
123 }
124 --num_clusters;
125 {
126 /* Remove pairs intersecting the just combined best pair. */
127 size_t copy_to_idx = 0;
128 for (i = 0; i < num_pairs; ++i) {
129 HistogramPair* p = &pairs[i];
130 if (p->idx1 == best_idx1 || p->idx2 == best_idx1 ||
131 p->idx1 == best_idx2 || p->idx2 == best_idx2) {
132 /* Remove invalid pair from the queue. */
133 continue;
134 }
135 if (HistogramPairIsLess(&pairs[0], p)) {
136 /* Replace the top of the queue if needed. */
137 HistogramPair front = pairs[0];
138 pairs[0] = *p;
139 pairs[copy_to_idx] = front;
140 } else {
141 pairs[copy_to_idx] = *p;
142 }
143 ++copy_to_idx;
144 }
145 num_pairs = copy_to_idx;
146 }
147
148 /* Push new pairs formed with the combined histogram to the heap. */
149 for (i = 0; i < num_clusters; ++i) {
150 FN(BrotliCompareAndPushToQueue)(out, tmp, cluster_size, best_idx1,
151 clusters[i], max_num_pairs, &pairs[0], &num_pairs);
152 }
153 }
154 return num_clusters;
155})
156
157/* What is the bit cost of moving histogram from cur_symbol to candidate. */
159 const HistogramType* histogram, const HistogramType* candidate,
160 HistogramType* tmp) CODE({
161 if (histogram->total_count_ == 0) {
162 return 0.0;
163 } else {
164 *tmp = *histogram;
165 FN(HistogramAddHistogram)(tmp, candidate);
166 return FN(BrotliPopulationCost)(tmp) - candidate->bit_cost_;
167 }
168})
169
170/* Find the best 'out' histogram for each of the 'in' histograms.
171 When called, clusters[0..num_clusters) contains the unique values from
172 symbols[0..in_size), but this property is not preserved in this function.
173 Note: we assume that out[]->bit_cost_ is already up-to-date. */
175 size_t in_size, const uint32_t* clusters, size_t num_clusters,
176 HistogramType* out, HistogramType* tmp, uint32_t* symbols) CODE({
177 size_t i;
178 for (i = 0; i < in_size; ++i) {
179 uint32_t best_out = i == 0 ? symbols[0] : symbols[i - 1];
180 double best_bits =
181 FN(BrotliHistogramBitCostDistance)(&in[i], &out[best_out], tmp);
182 size_t j;
183 for (j = 0; j < num_clusters; ++j) {
184 const double cur_bits =
185 FN(BrotliHistogramBitCostDistance)(&in[i], &out[clusters[j]], tmp);
186 if (cur_bits < best_bits) {
187 best_bits = cur_bits;
188 best_out = clusters[j];
189 }
190 }
191 symbols[i] = best_out;
192 }
193
194 /* Recompute each out based on raw and symbols. */
195 for (i = 0; i < num_clusters; ++i) {
196 FN(HistogramClear)(&out[clusters[i]]);
197 }
198 for (i = 0; i < in_size; ++i) {
199 FN(HistogramAddHistogram)(&out[symbols[i]], &in[i]);
200 }
201})
202
203/* Reorders elements of the out[0..length) array and changes values in
204 symbols[0..length) array in the following way:
205 * when called, symbols[] contains indexes into out[], and has N unique
206 values (possibly N < length)
207 * on return, symbols'[i] = f(symbols[i]) and
208 out'[symbols'[i]] = out[symbols[i]], for each 0 <= i < length,
209 where f is a bijection between the range of symbols[] and [0..N), and
210 the first occurrences of values in symbols'[i] come in consecutive
211 increasing order.
212 Returns N, the number of unique values in symbols[]. */
214 HistogramType* out, uint32_t* symbols, size_t length) CODE({
215 static const uint32_t kInvalidIndex = BROTLI_UINT32_MAX;
216 uint32_t* new_index = BROTLI_ALLOC(m, uint32_t, length);
217 uint32_t next_index;
218 HistogramType* tmp;
219 size_t i;
220 if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(new_index)) return 0;
221 for (i = 0; i < length; ++i) {
222 new_index[i] = kInvalidIndex;
223 }
224 next_index = 0;
225 for (i = 0; i < length; ++i) {
226 if (new_index[symbols[i]] == kInvalidIndex) {
227 new_index[symbols[i]] = next_index;
228 ++next_index;
229 }
230 }
231 /* TODO(eustas): by using idea of "cycle-sort" we can avoid allocation of
232 tmp and reduce the number of copying by the factor of 2. */
233 tmp = BROTLI_ALLOC(m, HistogramType, next_index);
234 if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(tmp)) return 0;
235 next_index = 0;
236 for (i = 0; i < length; ++i) {
237 if (new_index[symbols[i]] == next_index) {
238 tmp[next_index] = out[symbols[i]];
239 ++next_index;
240 }
241 symbols[i] = new_index[symbols[i]];
242 }
243 BROTLI_FREE(m, new_index);
244 for (i = 0; i < next_index; ++i) {
245 out[i] = tmp[i];
246 }
247 BROTLI_FREE(m, tmp);
248 return next_index;
249})
250
252 MemoryManager* m, const HistogramType* in, const size_t in_size,
253 size_t max_histograms, HistogramType* out, size_t* out_size,
254 uint32_t* histogram_symbols) CODE({
255 uint32_t* cluster_size = BROTLI_ALLOC(m, uint32_t, in_size);
256 uint32_t* clusters = BROTLI_ALLOC(m, uint32_t, in_size);
257 size_t num_clusters = 0;
258 const size_t max_input_histograms = 64;
259 size_t pairs_capacity = max_input_histograms * max_input_histograms / 2;
260 /* For the first pass of clustering, we allow all pairs. */
261 HistogramPair* pairs = BROTLI_ALLOC(m, HistogramPair, pairs_capacity + 1);
262 /* TODO(eustas): move to "persistent" arena? */
264 size_t i;
265
266 if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(cluster_size) ||
267 BROTLI_IS_NULL(clusters) || BROTLI_IS_NULL(pairs)|| BROTLI_IS_NULL(tmp)) {
268 return;
269 }
270
271 for (i = 0; i < in_size; ++i) {
272 cluster_size[i] = 1;
273 }
274
275 for (i = 0; i < in_size; ++i) {
276 out[i] = in[i];
277 out[i].bit_cost_ = FN(BrotliPopulationCost)(&in[i]);
278 histogram_symbols[i] = (uint32_t)i;
279 }
280
281 for (i = 0; i < in_size; i += max_input_histograms) {
282 size_t num_to_combine =
283 BROTLI_MIN(size_t, in_size - i, max_input_histograms);
284 size_t num_new_clusters;
285 size_t j;
286 for (j = 0; j < num_to_combine; ++j) {
287 clusters[num_clusters + j] = (uint32_t)(i + j);
288 }
289 num_new_clusters =
290 FN(BrotliHistogramCombine)(out, tmp, cluster_size,
291 &histogram_symbols[i],
292 &clusters[num_clusters], pairs,
293 num_to_combine, num_to_combine,
294 max_histograms, pairs_capacity);
295 num_clusters += num_new_clusters;
296 }
297
298 {
299 /* For the second pass, we limit the total number of histogram pairs.
300 After this limit is reached, we only keep searching for the best pair. */
301 size_t max_num_pairs = BROTLI_MIN(size_t,
302 64 * num_clusters, (num_clusters / 2) * num_clusters);
304 m, HistogramPair, pairs, pairs_capacity, max_num_pairs + 1);
305 if (BROTLI_IS_OOM(m)) return;
306
307 /* Collapse similar histograms. */
308 num_clusters = FN(BrotliHistogramCombine)(out, tmp, cluster_size,
309 histogram_symbols, clusters,
310 pairs, num_clusters, in_size,
311 max_histograms, max_num_pairs);
312 }
313 BROTLI_FREE(m, pairs);
314 BROTLI_FREE(m, cluster_size);
315 /* Find the optimal map from original histograms to the final ones. */
316 FN(BrotliHistogramRemap)(in, in_size, clusters, num_clusters,
317 out, tmp, histogram_symbols);
318 BROTLI_FREE(m, tmp);
319 BROTLI_FREE(m, clusters);
320 /* Convert the context map to a canonical form. */
321 *out_size = FN(BrotliHistogramReindex)(m, out, histogram_symbols, in_size);
322 if (BROTLI_IS_OOM(m)) return;
323})
324
325#undef HistogramType
#define FN(X)
Definition backward_references.c:51
double FN BrotliPopulationCost(const HistogramType *histogram)
Definition bit_cost_inc.h:12
#define CODE(X)
Definition cluster.c:38
BROTLI_INTERNAL void FN BrotliClusterHistograms(MemoryManager *m, const HistogramType *in, const size_t in_size, size_t max_histograms, HistogramType *out, size_t *out_size, uint32_t *histogram_symbols) CODE(
Definition cluster_inc.h:249
BROTLI_INTERNAL size_t FN BrotliHistogramCombine(HistogramType *out, uint32_t *cluster_size, uint32_t *symbols, uint32_t *clusters, HistogramPair *pairs, size_t num_clusters, size_t symbols_size, size_t max_clusters, size_t max_num_pairs) CODE(
Definition cluster_inc.h:70
#define HistogramType
Definition cluster_inc.h:10
BROTLI_INTERNAL size_t FN BrotliHistogramReindex(MemoryManager *m, HistogramType *out, uint32_t *symbols, size_t length) CODE(
Definition cluster_inc.h:211
BROTLI_INTERNAL void FN BrotliHistogramRemap(const HistogramType *in, size_t in_size, const uint32_t *clusters, size_t num_clusters, HistogramType *out, uint32_t *symbols) CODE(
Definition cluster_inc.h:172
BROTLI_INTERNAL double FN BrotliHistogramBitCostDistance(const HistogramType *histogram, const HistogramType *candidate) CODE(
Definition cluster_inc.h:157
BROTLI_INTERNAL void FN BrotliCompareAndPushToQueue(const HistogramType *out, const uint32_t *cluster_size, uint32_t idx1, uint32_t idx2, size_t max_num_pairs, HistogramPair *pairs, size_t *num_pairs) CODE(
Definition cluster_inc.h:14
#define BROTLI_ALLOC(M, T, N)
Definition memory.h:44
#define BROTLI_FREE(M, P)
Definition memory.h:48
#define BROTLI_ENSURE_CAPACITY(M, T, A, C, R)
Definition memory.h:81
#define BROTLI_IS_OOM(M)
Definition memory.h:54
#define BROTLI_IS_NULL(A)
Definition memory.h:68
Definition cluster.h:21
uint32_t idx1
Definition cluster.h:22
double cost_diff
Definition cluster.h:25
double cost_combo
Definition cluster.h:24
uint32_t idx2
Definition cluster.h:23
Definition memory.h:26
#define BROTLI_INTERNAL
Definition platform.h:173
#define BROTLI_MAX(T, A, B)
#define BROTLI_MIN(T, A, B)
#define BROTLI_UINT32_MAX
Definition types.h:59
#define BROTLI_TRUE
Definition types.h:51
#define BROTLI_FALSE
Definition types.h:53
#define BROTLI_BOOL
Definition types.h:49
const lzma_allocator const uint8_t size_t in_size
Definition block.h:527
const lzma_allocator const uint8_t * in
Definition block.h:527
const lzma_allocator const uint8_t size_t uint8_t * out
Definition block.h:528
lzma_index ** i
Definition index.h:629