Parolin 0.7.9 6796
Console (soon DLLs) to do a tar like job
Loading...
Searching...
No Matches
cluster_inc.h
Go to the documentation of this file.
1/* NOLINT(build/header_guard) */
2/* Copyright 2013 Google Inc. All Rights Reserved.
3
4 Distributed under MIT license.
5 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
6*/
7
8/* template parameters: FN, CODE */
9
10#define HistogramType FN(Histogram)
11
12/* Computes the bit cost reduction by combining out[idx1] and out[idx2] and if
13 it is below a threshold, stores the pair (idx1, idx2) in the *pairs queue. */
15 const HistogramType* out, const uint32_t* cluster_size, uint32_t idx1,
16 uint32_t idx2, size_t max_num_pairs, HistogramPair* pairs,
17 size_t* num_pairs) CODE({
18 BROTLI_BOOL is_good_pair = BROTLI_FALSE;
20 p.idx1 = p.idx2 = 0;
21 p.cost_diff = p.cost_combo = 0;
22 if (idx1 == idx2) {
23 return;
24 }
25 if (idx2 < idx1) {
26 uint32_t t = idx2;
27 idx2 = idx1;
28 idx1 = t;
29 }
30 p.idx1 = idx1;
31 p.idx2 = idx2;
32 p.cost_diff = 0.5 * ClusterCostDiff(cluster_size[idx1], cluster_size[idx2]);
33 p.cost_diff -= out[idx1].bit_cost_;
34 p.cost_diff -= out[idx2].bit_cost_;
35
36 if (out[idx1].total_count_ == 0) {
37 p.cost_combo = out[idx2].bit_cost_;
38 is_good_pair = BROTLI_TRUE;
39 } else if (out[idx2].total_count_ == 0) {
40 p.cost_combo = out[idx1].bit_cost_;
41 is_good_pair = BROTLI_TRUE;
42 } else {
43 double threshold = *num_pairs == 0 ? 1e99 :
44 BROTLI_MAX(double, 0.0, pairs[0].cost_diff);
45 HistogramType combo = out[idx1];
46 double cost_combo;
47 FN(HistogramAddHistogram)(&combo, &out[idx2]);
48 cost_combo = FN(BrotliPopulationCost)(&combo);
49 if (cost_combo < threshold - p.cost_diff) {
50 p.cost_combo = cost_combo;
51 is_good_pair = BROTLI_TRUE;
52 }
53 }
54 if (is_good_pair) {
55 p.cost_diff += p.cost_combo;
56 if (*num_pairs > 0 && HistogramPairIsLess(&pairs[0], &p)) {
57 /* Replace the top of the queue if needed. */
58 if (*num_pairs < max_num_pairs) {
59 pairs[*num_pairs] = pairs[0];
60 ++(*num_pairs);
61 }
62 pairs[0] = p;
63 } else if (*num_pairs < max_num_pairs) {
64 pairs[*num_pairs] = p;
65 ++(*num_pairs);
66 }
67 }
68})
69
71 uint32_t* cluster_size,
72 uint32_t* symbols,
73 uint32_t* clusters,
74 HistogramPair* pairs,
75 size_t num_clusters,
76 size_t symbols_size,
77 size_t max_clusters,
78 size_t max_num_pairs) CODE({
79 double cost_diff_threshold = 0.0;
80 size_t min_cluster_size = 1;
81 size_t num_pairs = 0;
82
83 {
84 /* We maintain a vector of histogram pairs, with the property that the pair
85 with the maximum bit cost reduction is the first. */
86 size_t idx1;
87 for (idx1 = 0; idx1 < num_clusters; ++idx1) {
88 size_t idx2;
89 for (idx2 = idx1 + 1; idx2 < num_clusters; ++idx2) {
90 FN(BrotliCompareAndPushToQueue)(out, cluster_size, clusters[idx1],
91 clusters[idx2], max_num_pairs, &pairs[0], &num_pairs);
92 }
93 }
94 }
95
96 while (num_clusters > min_cluster_size) {
97 uint32_t best_idx1;
98 uint32_t best_idx2;
99 size_t i;
100 if (pairs[0].cost_diff >= cost_diff_threshold) {
101 cost_diff_threshold = 1e99;
102 min_cluster_size = max_clusters;
103 continue;
104 }
105 /* Take the best pair from the top of heap. */
106 best_idx1 = pairs[0].idx1;
107 best_idx2 = pairs[0].idx2;
108 FN(HistogramAddHistogram)(&out[best_idx1], &out[best_idx2]);
109 out[best_idx1].bit_cost_ = pairs[0].cost_combo;
110 cluster_size[best_idx1] += cluster_size[best_idx2];
111 for (i = 0; i < symbols_size; ++i) {
112 if (symbols[i] == best_idx2) {
113 symbols[i] = best_idx1;
114 }
115 }
116 for (i = 0; i < num_clusters; ++i) {
117 if (clusters[i] == best_idx2) {
118 memmove(&clusters[i], &clusters[i + 1],
119 (num_clusters - i - 1) * sizeof(clusters[0]));
120 break;
121 }
122 }
123 --num_clusters;
124 {
125 /* Remove pairs intersecting the just combined best pair. */
126 size_t copy_to_idx = 0;
127 for (i = 0; i < num_pairs; ++i) {
128 HistogramPair* p = &pairs[i];
129 if (p->idx1 == best_idx1 || p->idx2 == best_idx1 ||
130 p->idx1 == best_idx2 || p->idx2 == best_idx2) {
131 /* Remove invalid pair from the queue. */
132 continue;
133 }
134 if (HistogramPairIsLess(&pairs[0], p)) {
135 /* Replace the top of the queue if needed. */
136 HistogramPair front = pairs[0];
137 pairs[0] = *p;
138 pairs[copy_to_idx] = front;
139 } else {
140 pairs[copy_to_idx] = *p;
141 }
142 ++copy_to_idx;
143 }
144 num_pairs = copy_to_idx;
145 }
146
147 /* Push new pairs formed with the combined histogram to the heap. */
148 for (i = 0; i < num_clusters; ++i) {
149 FN(BrotliCompareAndPushToQueue)(out, cluster_size, best_idx1, clusters[i],
150 max_num_pairs, &pairs[0], &num_pairs);
151 }
152 }
153 return num_clusters;
154})
155
156/* What is the bit cost of moving histogram from cur_symbol to candidate. */
158 const HistogramType* histogram, const HistogramType* candidate) CODE({
159 if (histogram->total_count_ == 0) {
160 return 0.0;
161 } else {
162 HistogramType tmp = *histogram;
163 FN(HistogramAddHistogram)(&tmp, candidate);
164 return FN(BrotliPopulationCost)(&tmp) - candidate->bit_cost_;
165 }
166})
167
168/* Find the best 'out' histogram for each of the 'in' histograms.
169 When called, clusters[0..num_clusters) contains the unique values from
170 symbols[0..in_size), but this property is not preserved in this function.
171 Note: we assume that out[]->bit_cost_ is already up-to-date. */
173 size_t in_size, const uint32_t* clusters, size_t num_clusters,
174 HistogramType* out, uint32_t* symbols) CODE({
175 size_t i;
176 for (i = 0; i < in_size; ++i) {
177 uint32_t best_out = i == 0 ? symbols[0] : symbols[i - 1];
178 double best_bits =
179 FN(BrotliHistogramBitCostDistance)(&in[i], &out[best_out]);
180 size_t j;
181 for (j = 0; j < num_clusters; ++j) {
182 const double cur_bits =
183 FN(BrotliHistogramBitCostDistance)(&in[i], &out[clusters[j]]);
184 if (cur_bits < best_bits) {
185 best_bits = cur_bits;
186 best_out = clusters[j];
187 }
188 }
189 symbols[i] = best_out;
190 }
191
192 /* Recompute each out based on raw and symbols. */
193 for (i = 0; i < num_clusters; ++i) {
194 FN(HistogramClear)(&out[clusters[i]]);
195 }
196 for (i = 0; i < in_size; ++i) {
197 FN(HistogramAddHistogram)(&out[symbols[i]], &in[i]);
198 }
199})
200
201/* Reorders elements of the out[0..length) array and changes values in
202 symbols[0..length) array in the following way:
203 * when called, symbols[] contains indexes into out[], and has N unique
204 values (possibly N < length)
205 * on return, symbols'[i] = f(symbols[i]) and
206 out'[symbols'[i]] = out[symbols[i]], for each 0 <= i < length,
207 where f is a bijection between the range of symbols[] and [0..N), and
208 the first occurrences of values in symbols'[i] come in consecutive
209 increasing order.
210 Returns N, the number of unique values in symbols[]. */
212 HistogramType* out, uint32_t* symbols, size_t length) CODE({
213 static const uint32_t kInvalidIndex = BROTLI_UINT32_MAX;
214 uint32_t* new_index = BROTLI_ALLOC(m, uint32_t, length);
215 uint32_t next_index;
216 HistogramType* tmp;
217 size_t i;
218 if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(new_index)) return 0;
219 for (i = 0; i < length; ++i) {
220 new_index[i] = kInvalidIndex;
221 }
222 next_index = 0;
223 for (i = 0; i < length; ++i) {
224 if (new_index[symbols[i]] == kInvalidIndex) {
225 new_index[symbols[i]] = next_index;
226 ++next_index;
227 }
228 }
229 /* TODO: by using idea of "cycle-sort" we can avoid allocation of
230 tmp and reduce the number of copying by the factor of 2. */
231 tmp = BROTLI_ALLOC(m, HistogramType, next_index);
232 if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(tmp)) return 0;
233 next_index = 0;
234 for (i = 0; i < length; ++i) {
235 if (new_index[symbols[i]] == next_index) {
236 tmp[next_index] = out[symbols[i]];
237 ++next_index;
238 }
239 symbols[i] = new_index[symbols[i]];
240 }
241 BROTLI_FREE(m, new_index);
242 for (i = 0; i < next_index; ++i) {
243 out[i] = tmp[i];
244 }
245 BROTLI_FREE(m, tmp);
246 return next_index;
247})
248
250 MemoryManager* m, const HistogramType* in, const size_t in_size,
251 size_t max_histograms, HistogramType* out, size_t* out_size,
252 uint32_t* histogram_symbols) CODE({
253 uint32_t* cluster_size = BROTLI_ALLOC(m, uint32_t, in_size);
254 uint32_t* clusters = BROTLI_ALLOC(m, uint32_t, in_size);
255 size_t num_clusters = 0;
256 const size_t max_input_histograms = 64;
257 size_t pairs_capacity = max_input_histograms * max_input_histograms / 2;
258 /* For the first pass of clustering, we allow all pairs. */
259 HistogramPair* pairs = BROTLI_ALLOC(m, HistogramPair, pairs_capacity + 1);
260 size_t i;
261
262 if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(cluster_size) ||
263 BROTLI_IS_NULL(clusters) || BROTLI_IS_NULL(pairs)) {
264 return;
265 }
266
267 for (i = 0; i < in_size; ++i) {
268 cluster_size[i] = 1;
269 }
270
271 for (i = 0; i < in_size; ++i) {
272 out[i] = in[i];
273 out[i].bit_cost_ = FN(BrotliPopulationCost)(&in[i]);
274 histogram_symbols[i] = (uint32_t)i;
275 }
276
277 for (i = 0; i < in_size; i += max_input_histograms) {
278 size_t num_to_combine =
279 BROTLI_MIN(size_t, in_size - i, max_input_histograms);
280 size_t num_new_clusters;
281 size_t j;
282 for (j = 0; j < num_to_combine; ++j) {
283 clusters[num_clusters + j] = (uint32_t)(i + j);
284 }
285 num_new_clusters =
286 FN(BrotliHistogramCombine)(out, cluster_size,
287 &histogram_symbols[i],
288 &clusters[num_clusters], pairs,
289 num_to_combine, num_to_combine,
290 max_histograms, pairs_capacity);
291 num_clusters += num_new_clusters;
292 }
293
294 {
295 /* For the second pass, we limit the total number of histogram pairs.
296 After this limit is reached, we only keep searching for the best pair. */
297 size_t max_num_pairs = BROTLI_MIN(size_t,
298 64 * num_clusters, (num_clusters / 2) * num_clusters);
300 m, HistogramPair, pairs, pairs_capacity, max_num_pairs + 1);
301 if (BROTLI_IS_OOM(m)) return;
302
303 /* Collapse similar histograms. */
304 num_clusters = FN(BrotliHistogramCombine)(out, cluster_size,
305 histogram_symbols, clusters,
306 pairs, num_clusters, in_size,
307 max_histograms, max_num_pairs);
308 }
309 BROTLI_FREE(m, pairs);
310 BROTLI_FREE(m, cluster_size);
311 /* Find the optimal map from original histograms to the final ones. */
312 FN(BrotliHistogramRemap)(in, in_size, clusters, num_clusters,
313 out, histogram_symbols);
314 BROTLI_FREE(m, clusters);
315 /* Convert the context map to a canonical form. */
316 *out_size = FN(BrotliHistogramReindex)(m, out, histogram_symbols, in_size);
317 if (BROTLI_IS_OOM(m)) return;
318})
319
320#undef HistogramType
#define FN(X)
Definition backward_references.c:51
double FN BrotliPopulationCost(const HistogramType *histogram)
Definition bit_cost_inc.h:12
#define CODE(X)
Definition cluster.c:38
BROTLI_INTERNAL void FN BrotliClusterHistograms(MemoryManager *m, const HistogramType *in, const size_t in_size, size_t max_histograms, HistogramType *out, size_t *out_size, uint32_t *histogram_symbols) CODE(
Definition cluster_inc.h:249
BROTLI_INTERNAL size_t FN BrotliHistogramCombine(HistogramType *out, uint32_t *cluster_size, uint32_t *symbols, uint32_t *clusters, HistogramPair *pairs, size_t num_clusters, size_t symbols_size, size_t max_clusters, size_t max_num_pairs) CODE(
Definition cluster_inc.h:70
#define HistogramType
Definition cluster_inc.h:10
BROTLI_INTERNAL size_t FN BrotliHistogramReindex(MemoryManager *m, HistogramType *out, uint32_t *symbols, size_t length) CODE(
Definition cluster_inc.h:211
BROTLI_INTERNAL void FN BrotliHistogramRemap(const HistogramType *in, size_t in_size, const uint32_t *clusters, size_t num_clusters, HistogramType *out, uint32_t *symbols) CODE(
Definition cluster_inc.h:172
BROTLI_INTERNAL double FN BrotliHistogramBitCostDistance(const HistogramType *histogram, const HistogramType *candidate) CODE(
Definition cluster_inc.h:157
BROTLI_INTERNAL void FN BrotliCompareAndPushToQueue(const HistogramType *out, const uint32_t *cluster_size, uint32_t idx1, uint32_t idx2, size_t max_num_pairs, HistogramPair *pairs, size_t *num_pairs) CODE(
Definition cluster_inc.h:14
#define BROTLI_ALLOC(M, T, N)
Definition memory.h:44
#define BROTLI_FREE(M, P)
Definition memory.h:48
#define BROTLI_ENSURE_CAPACITY(M, T, A, C, R)
Definition memory.h:81
#define BROTLI_IS_OOM(M)
Definition memory.h:54
#define BROTLI_IS_NULL(A)
Definition memory.h:68
Definition cluster.h:21
uint32_t idx1
Definition cluster.h:22
double cost_diff
Definition cluster.h:25
double cost_combo
Definition cluster.h:24
uint32_t idx2
Definition cluster.h:23
Definition memory.h:26
#define BROTLI_INTERNAL
Definition platform.h:173
#define BROTLI_MAX(T, A, B)
#define BROTLI_MIN(T, A, B)
#define BROTLI_UINT32_MAX
Definition types.h:59
#define BROTLI_TRUE
Definition types.h:51
#define BROTLI_FALSE
Definition types.h:53
#define BROTLI_BOOL
Definition types.h:49
const lzma_allocator const uint8_t size_t in_size
Definition block.h:527
const lzma_allocator const uint8_t * in
Definition block.h:527
const lzma_allocator const uint8_t size_t uint8_t * out
Definition block.h:528
lzma_index ** i
Definition index.h:629