Line data Source code
1 : /*
2 : * This file is part of rmlint.
3 : *
4 : * rmlint is free software: you can redistribute it and/or modify
5 : * it under the terms of the GNU General Public License as published by
6 : * the Free Software Foundation, either version 3 of the License, or
7 : * (at your option) any later version.
8 : *
9 : * rmlint is distributed in the hope that it will be useful,
10 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 : * GNU General Public License for more details.
13 : *
14 : * You should have received a copy of the GNU General Public License
15 : * along with rmlint. If not, see <http://www.gnu.org/licenses/>.
16 : *
17 : * Authors:
18 : *
19 : * - Christopher <sahib> Pahl 2010-2015 (https://github.com/sahib)
20 : * - Daniel <SeeSpotRun> T. 2014-2015 (https://github.com/SeeSpotRun)
21 : *
22 : * Hosted on http://github.com/sahib/rmlint
23 : *
24 : */
25 :
26 : #include <stdio.h>
27 : #include <string.h>
28 : #include <glib.h>
29 :
30 : #include <sys/mman.h>
31 : #include <sys/stat.h>
32 : #include <unistd.h>
33 : #include <fcntl.h>
34 :
35 : #include "checksum.h"
36 :
37 : #include "checksums/city.h"
38 : #include "checksums/citycrc.h"
39 : #include "checksums/murmur3.h"
40 : #include "checksums/spooky-c.h"
41 : #include "checksums/xxhash/xxhash.h"
42 :
43 : #define _RM_CHECKSUM_DEBUG 0
44 :
45 :
46 :
47 : ///////////////////////////////////////
48 : // BUFFER POOL IMPLEMENTATION //
49 : ///////////////////////////////////////
50 :
51 140 : RmOff rm_buffer_size(RmBufferPool *pool) {
52 140 : return pool->buffer_size;
53 : }
54 :
55 173289 : RmBuffer *rm_buffer_new(RmBufferPool *pool) {
56 173289 : RmBuffer *self = g_slice_new0(RmBuffer);
57 173289 : self->pool = pool;
58 173289 : self->data = g_slice_alloc(pool->buffer_size);
59 173289 : return self;
60 : }
61 :
62 173218 : static void rm_buffer_free(RmBuffer *buf) {
63 173218 : g_slice_free1(buf->pool->buffer_size, buf->data);
64 173218 : g_slice_free(RmBuffer, buf);
65 173218 : }
66 :
67 36458 : RmBufferPool *rm_buffer_pool_init(gsize buffer_size, gsize max_mem, gsize max_kept_mem) {
68 36458 : RmBufferPool *self = g_slice_new(RmBufferPool);
69 36458 : self->stack = NULL;
70 36458 : self->buffer_size = buffer_size;
71 36458 : self->avail_buffers = MAX(max_mem / buffer_size, 1);
72 36458 : self->min_buffers = self->avail_buffers;
73 36458 : self->max_buffers = self->avail_buffers;
74 36458 : self->max_kept_buffers = MAX(max_kept_mem / buffer_size, 1);
75 36458 : self->kept_buffers = 0;
76 36458 : rm_log_debug_line("rm_buffer_pool_init: allocated max %"G_GSIZE_FORMAT
77 : " buffers of %"G_GSIZE_FORMAT" bytes each",
78 : self->avail_buffers, self->buffer_size);
79 36458 : g_cond_init(&self->change);
80 36458 : g_mutex_init(&self->lock);
81 36458 : return self;
82 : }
83 :
84 36458 : void rm_buffer_pool_destroy(RmBufferPool *pool) {
85 36458 : rm_log_debug_line("had %"G_GSIZE_FORMAT" unused read buffers", pool->min_buffers);
86 :
87 : /* Wait for all buffers to come back */
88 36458 : g_mutex_lock(&pool->lock);
89 :
90 : /* Free 'em */
91 36458 : g_slist_free_full(pool->stack, (GDestroyNotify)rm_buffer_free);
92 :
93 36458 : g_mutex_unlock(&pool->lock);
94 :
95 36458 : g_mutex_clear(&pool->lock);
96 36458 : g_cond_clear(&pool->change);
97 36458 : g_slice_free(RmBufferPool, pool);
98 36458 : }
99 :
100 718370 : RmBuffer *rm_buffer_pool_get(RmBufferPool *pool) {
101 718370 : RmBuffer *buffer = NULL;
102 718370 : g_mutex_lock(&pool->lock);
103 : {
104 2155432 : while(!buffer) {
105 718634 : if(pool->stack) {
106 545110 : buffer = pool->stack->data;
107 545110 : pool->stack = g_slist_delete_link(pool->stack, pool->stack);
108 173524 : } else if(pool->avail_buffers > 0) {
109 173289 : buffer = rm_buffer_new(pool);
110 : } else {
111 235 : if(!pool->mem_warned) {
112 0 : rm_log_warning_line(
113 : "read buffer limit reached - waiting for "
114 : "processing to catch up");
115 0 : pool->mem_warned = true;
116 : }
117 235 : g_cond_wait(&pool->change, &pool->lock);
118 : }
119 : }
120 718399 : pool->avail_buffers--;
121 :
122 718399 : if(pool->avail_buffers < pool->min_buffers) {
123 168927 : pool->min_buffers = pool->avail_buffers;
124 : }
125 : }
126 718399 : g_mutex_unlock(&pool->lock);
127 :
128 718393 : g_assert(buffer);
129 718393 : return buffer;
130 : }
131 :
132 698041 : void rm_buffer_pool_release(RmBuffer *buf) {
133 698041 : RmBufferPool *pool = buf->pool;
134 698041 : g_mutex_lock(&pool->lock);
135 : {
136 698643 : pool->avail_buffers++;
137 698643 : g_cond_signal(&pool->change);
138 698643 : pool->stack = g_slist_prepend(pool->stack, buf);
139 : }
140 698643 : g_mutex_unlock(&pool->lock);
141 698581 : }
142 :
143 : /* make another buffer available if one is being kept (in paranoid digest) */
144 19752 : static void rm_buffer_pool_signal_keeping(_U RmBuffer *buf) {
145 19752 : RmBufferPool *pool = buf->pool;
146 :
147 19752 : g_mutex_lock(&pool->lock);
148 : {
149 19756 : pool->avail_buffers++;
150 19756 : pool->kept_buffers++;
151 19756 : g_cond_signal(&pool->change);
152 : }
153 19756 : g_mutex_unlock(&pool->lock);
154 19754 : }
155 :
156 19685 : static void rm_buffer_pool_release_kept(RmBuffer *buf) {
157 19685 : RmBufferPool *pool = buf->pool;
158 19685 : g_mutex_lock(&pool->lock);
159 : {
160 19685 : if(pool->kept_buffers > pool->max_kept_buffers) {
161 0 : rm_buffer_free(buf);
162 : } else {
163 19685 : pool->stack = g_slist_prepend(pool->stack, buf);
164 : }
165 19685 : pool->kept_buffers--;
166 19685 : g_cond_signal(&pool->change);
167 : }
168 19685 : g_mutex_unlock(&pool->lock);
169 19685 : }
170 :
171 14765 : static gboolean rm_buffer_equal(RmBuffer *a, RmBuffer *b) {
172 14765 : return (a->len == b->len && memcmp(a->data, b->data, a->len) == 0);
173 : }
174 :
175 : ///////////////////////////////////////
176 : // RMDIGEST IMPLEMENTATION //
177 : ///////////////////////////////////////
178 :
179 31978 : RmDigestType rm_string_to_digest_type(const char *string) {
180 31978 : if(string == NULL) {
181 0 : return RM_DIGEST_UNKNOWN;
182 31978 : } else if(!strcasecmp(string, "md5")) {
183 1881 : return RM_DIGEST_MD5;
184 : #if HAVE_SHA512
185 30097 : } else if(!strcasecmp(string, "sha512")) {
186 1881 : return RM_DIGEST_SHA512;
187 : #endif
188 28216 : } else if(!strcasecmp(string, "city512")) {
189 1881 : return RM_DIGEST_CITY512;
190 26335 : } else if(!strcasecmp(string, "xxhash")) {
191 1881 : return RM_DIGEST_XXHASH;
192 24454 : } else if(!strcasecmp(string, "murmur512")) {
193 1881 : return RM_DIGEST_MURMUR512;
194 22573 : } else if(!strcasecmp(string, "sha256")) {
195 1881 : return RM_DIGEST_SHA256;
196 20692 : } else if(!strcasecmp(string, "city256")) {
197 1881 : return RM_DIGEST_CITY256;
198 18811 : } else if(!strcasecmp(string, "murmur256")) {
199 1881 : return RM_DIGEST_MURMUR256;
200 16930 : } else if(!strcasecmp(string, "sha1")) {
201 1881 : return RM_DIGEST_SHA1;
202 15049 : } else if(!strcasecmp(string, "spooky32")) {
203 1881 : return RM_DIGEST_SPOOKY32;
204 13168 : } else if(!strcasecmp(string, "spooky64")) {
205 1881 : return RM_DIGEST_SPOOKY64;
206 11287 : } else if(!strcasecmp(string, "murmur") || !strcasecmp(string, "murmur128")) {
207 1881 : return RM_DIGEST_MURMUR;
208 9406 : } else if(!strcasecmp(string, "spooky") || !strcasecmp(string, "spooky128")) {
209 1881 : return RM_DIGEST_SPOOKY;
210 7525 : } else if(!strcasecmp(string, "city") || !strcasecmp(string, "city128")) {
211 1881 : return RM_DIGEST_CITY;
212 5644 : } else if(!strcasecmp(string, "bastard") || !strcasecmp(string, "bastard256")) {
213 1881 : return RM_DIGEST_BASTARD;
214 3763 : } else if(!strcasecmp(string, "ext")) {
215 0 : return RM_DIGEST_EXT;
216 3763 : } else if(!strcasecmp(string, "cumulative")) {
217 0 : return RM_DIGEST_CUMULATIVE;
218 3763 : } else if(!strcasecmp(string, "paranoid")) {
219 3763 : return RM_DIGEST_PARANOID;
220 : } else {
221 0 : return RM_DIGEST_UNKNOWN;
222 : }
223 : }
224 :
225 72690 : const char *rm_digest_type_to_string(RmDigestType type) {
226 : static const char *names[] =
227 : {[RM_DIGEST_UNKNOWN] = "unknown", [RM_DIGEST_MURMUR] = "murmur",
228 : [RM_DIGEST_SPOOKY] = "spooky", [RM_DIGEST_SPOOKY32] = "spooky32",
229 : [RM_DIGEST_SPOOKY64] = "spooky64", [RM_DIGEST_CITY] = "city",
230 : [RM_DIGEST_MD5] = "md5", [RM_DIGEST_SHA1] = "sha1",
231 : [RM_DIGEST_SHA256] = "sha256", [RM_DIGEST_SHA512] = "sha512",
232 : [RM_DIGEST_MURMUR256] = "murmur256", [RM_DIGEST_CITY256] = "city256",
233 : [RM_DIGEST_BASTARD] = "bastard", [RM_DIGEST_MURMUR512] = "murmur512",
234 : [RM_DIGEST_CITY512] = "city512", [RM_DIGEST_EXT] = "ext",
235 : [RM_DIGEST_CUMULATIVE] = "cumulative", [RM_DIGEST_PARANOID] = "paranoid",
236 : [RM_DIGEST_XXHASH] = "xxhash"};
237 :
238 72690 : return names[MIN(type, sizeof(names) / sizeof(names[0]))];
239 : }
240 :
241 0 : int rm_digest_type_to_multihash_id(RmDigestType type) {
242 : static int ids[] = {[RM_DIGEST_UNKNOWN] = -1, [RM_DIGEST_MURMUR] = 17,
243 : [RM_DIGEST_SPOOKY] = 14, [RM_DIGEST_SPOOKY32] = 16,
244 : [RM_DIGEST_SPOOKY64] = 18, [RM_DIGEST_CITY] = 15,
245 : [RM_DIGEST_MD5] = 1, [RM_DIGEST_SHA1] = 2,
246 : [RM_DIGEST_SHA256] = 4, [RM_DIGEST_SHA512] = 6,
247 : [RM_DIGEST_MURMUR256] = 7, [RM_DIGEST_CITY256] = 8,
248 : [RM_DIGEST_BASTARD] = 9, [RM_DIGEST_MURMUR512] = 10,
249 : [RM_DIGEST_CITY512] = 11, [RM_DIGEST_EXT] = 12,
250 : [RM_DIGEST_CUMULATIVE] = 13, [RM_DIGEST_PARANOID] = 14};
251 :
252 0 : return ids[MIN(type, sizeof(ids) / sizeof(ids[0]))];
253 : }
254 :
255 16036 : RmOff rm_digest_paranoia_bytes(void) {
256 16036 : return 16 * 1024 * 1024;
257 : /* this is big enough buffer size to make seek time fairly insignificant relative to
258 : * sequential read time,
259 : * eg 16MB read at typical 100 MB/s read rate = 160ms read vs typical seek time 10ms*/
260 : }
261 :
262 : #define ADD_SEED(digest, seed) \
263 : { \
264 : if(seed) { \
265 : g_checksum_update(digest->glib_checksum, (const guchar *)&seed, \
266 : sizeof(RmOff)); \
267 : } \
268 : }
269 :
270 386722 : RmDigest *rm_digest_new(RmDigestType type, RmOff seed1, RmOff seed2, RmOff paranoid_size,
271 : bool use_shadow_hash) {
272 386722 : RmDigest *digest = g_slice_new0(RmDigest);
273 :
274 386723 : digest->checksum = NULL;
275 386723 : digest->type = type;
276 386723 : digest->bytes = 0;
277 :
278 386723 : switch(type) {
279 : case RM_DIGEST_SPOOKY32:
280 : /* cannot go lower than 64, since we read 8 byte in some places.
281 : * simulate by leaving the part at the end empty
282 : */
283 7891 : digest->bytes = 64 / 8;
284 7891 : break;
285 : case RM_DIGEST_XXHASH:
286 : case RM_DIGEST_SPOOKY64:
287 39683 : digest->bytes = 64 / 8;
288 39683 : break;
289 : case RM_DIGEST_MD5:
290 7879 : digest->glib_checksum = g_checksum_new(G_CHECKSUM_MD5);
291 7879 : ADD_SEED(digest, seed1);
292 7879 : digest->bytes = 128 / 8;
293 7879 : return digest;
294 : #if HAVE_SHA512
295 : case RM_DIGEST_SHA512:
296 7935 : digest->glib_checksum = g_checksum_new(G_CHECKSUM_SHA512);
297 7935 : ADD_SEED(digest, seed1);
298 7935 : digest->bytes = 512 / 8;
299 7935 : return digest;
300 : #endif
301 : case RM_DIGEST_SHA256:
302 7879 : digest->glib_checksum = g_checksum_new(G_CHECKSUM_SHA256);
303 7879 : ADD_SEED(digest, seed1);
304 7879 : digest->bytes = 256 / 8;
305 7879 : return digest;
306 : case RM_DIGEST_SHA1:
307 79413 : digest->glib_checksum = g_checksum_new(G_CHECKSUM_SHA1);
308 79413 : ADD_SEED(digest, seed1);
309 79413 : digest->bytes = 160 / 8;
310 79413 : return digest;
311 : case RM_DIGEST_MURMUR512:
312 : case RM_DIGEST_CITY512:
313 15782 : digest->bytes = 512 / 8;
314 15782 : break;
315 : case RM_DIGEST_EXT:
316 : /* gets allocated on rm_digest_update() */
317 144730 : digest->bytes = paranoid_size;
318 144730 : break;
319 : case RM_DIGEST_MURMUR256:
320 : case RM_DIGEST_CITY256:
321 : case RM_DIGEST_BASTARD:
322 31538 : digest->bytes = 256 / 8;
323 31538 : break;
324 : case RM_DIGEST_SPOOKY:
325 : case RM_DIGEST_MURMUR:
326 : case RM_DIGEST_CITY:
327 : case RM_DIGEST_CUMULATIVE:
328 27957 : digest->bytes = 128 / 8;
329 27957 : break;
330 : case RM_DIGEST_PARANOID:
331 16036 : g_assert(paranoid_size <= rm_digest_paranoia_bytes());
332 16036 : g_assert(paranoid_size > 0);
333 16036 : digest->bytes = paranoid_size;
334 16036 : digest->paranoid = g_slice_new0(RmParanoid);
335 16036 : digest->paranoid->buffers = g_queue_new();
336 16036 : digest->paranoid->incoming_twin_candidates = g_async_queue_new();
337 16036 : g_assert(use_shadow_hash);
338 16036 : if(use_shadow_hash) {
339 32072 : digest->paranoid->shadow_hash =
340 16036 : rm_digest_new(RM_DIGEST_XXHASH, seed1, seed2, 0, false);
341 : } else {
342 0 : digest->paranoid->shadow_hash = NULL;
343 : }
344 16036 : break;
345 : default:
346 0 : g_assert_not_reached();
347 : }
348 :
349 : /* starting values to let us generate up to 4 different hashes in parallel with
350 : * different starting seeds:
351 : * */
352 : static const RmOff seeds[4] = {0x0000000000000000, 0xf0f0f0f0f0f0f0f0,
353 : 0x3333333333333333, 0xaaaaaaaaaaaaaaaa};
354 :
355 283617 : if(digest->bytes > 0 && type != RM_DIGEST_PARANOID) {
356 155023 : const int n_seeds = sizeof(seeds) / sizeof(seeds[0]);
357 :
358 : /* checksum type - allocate memory and initialise */
359 155023 : digest->checksum = g_slice_alloc0(digest->bytes);
360 437116 : for(gsize block = 0; block < (digest->bytes / 16); block++) {
361 282093 : digest->checksum[block].first = seeds[block % n_seeds] ^ seed1;
362 282093 : digest->checksum[block].second = seeds[block % n_seeds] ^ seed2;
363 : }
364 : }
365 :
366 283617 : if(digest->type == RM_DIGEST_BASTARD) {
367 : /* bastard type *always* has *pure* murmur hash for first checksum
368 : * and seeded city for second checksum */
369 15756 : digest->checksum[0].first = digest->checksum[0].second = 0;
370 : }
371 283617 : return digest;
372 : }
373 :
374 28 : void rm_digest_paranoia_shrink(RmDigest *digest, gsize new_size) {
375 28 : g_assert(new_size < digest->bytes);
376 28 : g_assert(digest->type == RM_DIGEST_PARANOID);
377 :
378 28 : digest->bytes = new_size;
379 28 : }
380 :
381 17384 : void rm_digest_release_buffers(RmDigest *digest) {
382 17384 : if(digest->paranoid && digest->paranoid->buffers) {
383 15965 : g_queue_free_full(digest->paranoid->buffers,
384 : (GDestroyNotify)rm_buffer_pool_release_kept);
385 15964 : digest->paranoid->buffers = NULL;
386 : }
387 17383 : }
388 :
389 787249 : void rm_digest_free(RmDigest *digest) {
390 787249 : switch(digest->type) {
391 : case RM_DIGEST_MD5:
392 : case RM_DIGEST_SHA512:
393 : case RM_DIGEST_SHA256:
394 : case RM_DIGEST_SHA1:
395 539609 : g_checksum_free(digest->glib_checksum);
396 539628 : digest->glib_checksum = NULL;
397 539628 : break;
398 : case RM_DIGEST_PARANOID:
399 13677 : if(digest->paranoid->shadow_hash) {
400 13677 : rm_digest_free(digest->paranoid->shadow_hash);
401 : }
402 13677 : rm_digest_release_buffers(digest);
403 13677 : if(digest->paranoid->incoming_twin_candidates) {
404 13677 : g_async_queue_unref(digest->paranoid->incoming_twin_candidates);
405 : }
406 13677 : g_slice_free(RmParanoid, digest->paranoid);
407 13677 : break;
408 : case RM_DIGEST_EXT:
409 : case RM_DIGEST_CUMULATIVE:
410 : case RM_DIGEST_MURMUR512:
411 : case RM_DIGEST_XXHASH:
412 : case RM_DIGEST_CITY512:
413 : case RM_DIGEST_MURMUR256:
414 : case RM_DIGEST_CITY256:
415 : case RM_DIGEST_BASTARD:
416 : case RM_DIGEST_SPOOKY:
417 : case RM_DIGEST_SPOOKY32:
418 : case RM_DIGEST_SPOOKY64:
419 : case RM_DIGEST_MURMUR:
420 : case RM_DIGEST_CITY:
421 233963 : if(digest->checksum) {
422 233963 : g_slice_free1(digest->bytes, digest->checksum);
423 233963 : digest->checksum = NULL;
424 : }
425 233963 : break;
426 : default:
427 0 : g_assert_not_reached();
428 : }
429 787268 : g_slice_free(RmDigest, digest);
430 787276 : }
431 :
432 397895 : void rm_digest_update(RmDigest *digest, const unsigned char *data, RmOff size) {
433 397895 : switch(digest->type) {
434 : case RM_DIGEST_EXT:
435 : /* Data is assumed to be a hex representation of a cchecksum.
436 : * Needs to be compressed in pure memory first.
437 : *
438 : * Checksum is not updated but rather overwritten.
439 : * */
440 : #define CHAR_TO_NUM(c) (unsigned char)(g_ascii_isdigit(c) ? c - '0' : (c - 'a') + 10)
441 :
442 112558 : g_assert(data);
443 :
444 112558 : digest->bytes = size / 2;
445 112558 : digest->checksum = g_slice_alloc0(digest->bytes);
446 :
447 7279398 : for(unsigned i = 0; i < digest->bytes; ++i) {
448 35834200 : ((guint8 *)digest->checksum)[i] =
449 28667360 : (CHAR_TO_NUM(data[2 * i]) << 4) + CHAR_TO_NUM(data[2 * i + 1]);
450 : }
451 :
452 112558 : break;
453 : case RM_DIGEST_MD5:
454 : case RM_DIGEST_SHA512:
455 : case RM_DIGEST_SHA256:
456 : case RM_DIGEST_SHA1:
457 127421 : g_checksum_update(digest->glib_checksum, (const guchar *)data, size);
458 127396 : break;
459 : case RM_DIGEST_SPOOKY32:
460 9747 : digest->checksum[0].first = spooky_hash32(data, size, digest->checksum[0].first);
461 9749 : break;
462 : case RM_DIGEST_SPOOKY64:
463 9749 : digest->checksum[0].first = spooky_hash64(data, size, digest->checksum[0].first);
464 9750 : break;
465 : case RM_DIGEST_SPOOKY:
466 9751 : spooky_hash128(data, size, (uint64_t *)&digest->checksum[0].first,
467 9751 : (uint64_t *)&digest->checksum[0].second);
468 9749 : break;
469 : case RM_DIGEST_XXHASH:
470 39227 : digest->checksum[0].first = XXH64(data, size, digest->checksum[0].first);
471 39212 : break;
472 : case RM_DIGEST_MURMUR512:
473 : case RM_DIGEST_MURMUR256:
474 : case RM_DIGEST_MURMUR:
475 97468 : for(guint8 block = 0; block < (digest->bytes / 16); block++) {
476 : #if RM_PLATFORM_32
477 : MurmurHash3_x86_128(data, size, (uint32_t)digest->checksum[block].first,
478 : &digest->checksum[block]); //&
479 : #elif RM_PLATFORM_64
480 136436 : MurmurHash3_x64_128(data, size, (uint32_t)digest->checksum[block].first,
481 136436 : &digest->checksum[block]);
482 : #else
483 : #error "Probably not a good idea to compile rmlint on 16bit."
484 : #endif
485 : }
486 29250 : break;
487 : case RM_DIGEST_CITY:
488 : case RM_DIGEST_CITY256:
489 : case RM_DIGEST_CITY512:
490 97468 : for(guint8 block = 0; block < (digest->bytes / 16); block++) {
491 : /* Opt out for the more optimized version.
492 : * This needs the crc command of sse4.2
493 : * (available on Intel Nehalem and up; my amd box doesn't have this though)
494 : */
495 68220 : uint128 old = {digest->checksum[block].first, digest->checksum[block].second};
496 : #if RM_PLATFORM_64 && HAVE_SSE42
497 : old = CityHashCrc128WithSeed((const char *)data, size, old);
498 : #else
499 68220 : old = CityHash128WithSeed((const char *)data, size, old);
500 : #endif
501 68223 : memcpy(&digest->checksum[block], &old, sizeof(uint128));
502 : }
503 29248 : break;
504 : case RM_DIGEST_BASTARD:
505 19471 : MurmurHash3_x86_128(data, size, (uint32_t)digest->checksum[0].first,
506 19471 : &digest->checksum[0]);
507 :
508 19476 : uint128 old = {digest->checksum[1].first, digest->checksum[1].second};
509 : #if RM_PLATFORM_64 && HAVE_SSE42
510 : old = CityHashCrc128WithSeed((const char *)data, size, old);
511 : #else
512 19476 : old = CityHash128WithSeed((const char *)data, size, old);
513 : #endif
514 19471 : memcpy(&digest->checksum[1], &old, sizeof(uint128));
515 19471 : break;
516 : case RM_DIGEST_CUMULATIVE: {
517 : /* This is basically FNV1a, it is just important that the order of
518 : * adding data to the hash has no effect on the result, so it can
519 : * be used as a lookup key:
520 : *
521 : * http://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
522 : * */
523 11480 : RmOff hash = 0xcbf29ce484222325;
524 195160 : for(gsize i = 0; i < digest->bytes; ++i) {
525 183680 : hash ^= ((guint8 *)data)[i % size];
526 183680 : hash *= 0x100000001b3;
527 183680 : ((guint8 *)digest->checksum)[i] += hash;
528 : }
529 11480 : } break;
530 : case RM_DIGEST_PARANOID:
531 : default:
532 0 : g_assert_not_reached();
533 : }
534 397863 : }
535 :
536 273832 : void rm_digest_buffered_update(RmBuffer *buffer) {
537 273832 : RmDigest *digest = buffer->digest;
538 273832 : if(digest->type != RM_DIGEST_PARANOID) {
539 254080 : rm_digest_update(digest, buffer->data, buffer->len);
540 254070 : rm_buffer_pool_release(buffer);
541 : } else {
542 19752 : RmParanoid *paranoid = digest->paranoid;
543 :
544 19752 : g_queue_push_tail(paranoid->buffers, buffer);
545 19754 : paranoid->buffer_count++;
546 19754 : rm_buffer_pool_signal_keeping(buffer);
547 :
548 19754 : g_assert(paranoid->shadow_hash);
549 19754 : if(paranoid->shadow_hash) {
550 19754 : rm_digest_update(paranoid->shadow_hash, buffer->data, buffer->len);
551 : }
552 :
553 19751 : if(paranoid->twin_candidate) {
554 : /* do a running check that digest remains the same as its candidate twin */
555 1733 : if(rm_buffer_equal(buffer, paranoid->twin_candidate_buffer->data)) {
556 : /* buffers match; move ptr to next one ready for next buffer */
557 1731 : paranoid->twin_candidate_buffer = paranoid->twin_candidate_buffer->next;
558 : } else {
559 : /* buffers don't match - delete candidate (new candidate might be added on
560 : * next
561 : * call to rm_digest_buffered_update) */
562 2 : paranoid->twin_candidate = NULL;
563 2 : paranoid->twin_candidate_buffer = NULL;
564 : #if _RM_CHECKSUM_DEBUG
565 : rm_log_debug_line("Ejected candidate match at buffer #%u",
566 : paranoid->buffer_count);
567 : #endif
568 : }
569 : }
570 :
571 68997 : while(!paranoid->twin_candidate && paranoid->incoming_twin_candidates &&
572 18355 : (paranoid->twin_candidate =
573 18347 : g_async_queue_try_pop(paranoid->incoming_twin_candidates))) {
574 : /* validate the new candidate by comparing the previous buffers (not
575 : * including current)*/
576 11141 : paranoid->twin_candidate_buffer =
577 11141 : paranoid->twin_candidate->paranoid->buffers->head;
578 11141 : GList *iter_self = paranoid->buffers->head;
579 11141 : gboolean match = TRUE;
580 34160 : while(match && iter_self) {
581 11879 : match = (rm_buffer_equal(paranoid->twin_candidate_buffer->data,
582 11879 : iter_self->data));
583 11878 : iter_self = iter_self->next;
584 11878 : paranoid->twin_candidate_buffer = paranoid->twin_candidate_buffer->next;
585 : }
586 11140 : if(paranoid->twin_candidate && !match) {
587 : /* reject the twin candidate */
588 : #if _RM_CHECKSUM_DEBUG
589 : rm_log_debug_line("Rejected twin candidate %p for %p",
590 : paranoid->twin_candidate, paranoid);
591 : #endif
592 333 : paranoid->twin_candidate = NULL;
593 333 : paranoid->twin_candidate_buffer = NULL;
594 : #if _RM_CHECKSUM_DEBUG
595 : } else {
596 : rm_log_debug_line("Added twin candidate %p for %p",
597 : paranoid->twin_candidate, paranoid);
598 : #endif
599 : }
600 : }
601 : }
602 273867 : }
603 :
604 484204 : RmDigest *rm_digest_copy(RmDigest *digest) {
605 484204 : g_assert(digest);
606 :
607 484204 : RmDigest *self = NULL;
608 :
609 484204 : switch(digest->type) {
610 : case RM_DIGEST_MD5:
611 : case RM_DIGEST_SHA512:
612 : case RM_DIGEST_SHA256:
613 : case RM_DIGEST_SHA1:
614 451876 : self = g_slice_new0(RmDigest);
615 451876 : self->bytes = digest->bytes;
616 451876 : self->type = digest->type;
617 451876 : self->glib_checksum = g_checksum_copy(digest->glib_checksum);
618 451879 : break;
619 : case RM_DIGEST_SPOOKY:
620 : case RM_DIGEST_SPOOKY32:
621 : case RM_DIGEST_SPOOKY64:
622 : case RM_DIGEST_MURMUR:
623 : case RM_DIGEST_CITY:
624 : case RM_DIGEST_CITY256:
625 : case RM_DIGEST_MURMUR256:
626 : case RM_DIGEST_CITY512:
627 : case RM_DIGEST_MURMUR512:
628 : case RM_DIGEST_XXHASH:
629 : case RM_DIGEST_BASTARD:
630 : case RM_DIGEST_CUMULATIVE:
631 : case RM_DIGEST_EXT:
632 32328 : self = rm_digest_new(digest->type, 0, 0, digest->bytes, FALSE);
633 :
634 32328 : if(self->checksum && digest->checksum) {
635 32328 : memcpy((char *)self->checksum, (char *)digest->checksum, self->bytes);
636 : }
637 :
638 32328 : break;
639 : case RM_DIGEST_PARANOID:
640 : default:
641 0 : g_assert_not_reached();
642 : }
643 :
644 484207 : return self;
645 : }
646 :
647 1184726 : static gboolean rm_digest_needs_steal(RmDigestType digest_type) {
648 1184726 : switch(digest_type) {
649 : case RM_DIGEST_MD5:
650 : case RM_DIGEST_SHA512:
651 : case RM_DIGEST_SHA256:
652 : case RM_DIGEST_SHA1:
653 : /* for all of the above, reading the digest is destructive, so we
654 : * need to take a copy */
655 531551 : return TRUE;
656 : case RM_DIGEST_SPOOKY32:
657 : case RM_DIGEST_SPOOKY64:
658 : case RM_DIGEST_SPOOKY:
659 : case RM_DIGEST_MURMUR:
660 : case RM_DIGEST_CITY:
661 : case RM_DIGEST_CITY256:
662 : case RM_DIGEST_CITY512:
663 : case RM_DIGEST_XXHASH:
664 : case RM_DIGEST_MURMUR256:
665 : case RM_DIGEST_MURMUR512:
666 : case RM_DIGEST_BASTARD:
667 : case RM_DIGEST_CUMULATIVE:
668 : case RM_DIGEST_EXT:
669 : case RM_DIGEST_PARANOID:
670 653175 : return FALSE;
671 : default:
672 0 : g_assert_not_reached();
673 : }
674 : }
675 :
676 910448 : guint8 *rm_digest_steal(RmDigest *digest) {
677 910448 : guint8 *result = g_slice_alloc0(digest->bytes);
678 910452 : gsize buflen = digest->bytes;
679 :
680 910452 : if(rm_digest_needs_steal(digest->type)) {
681 : /* reading the digest is destructive, so we need to take a copy */
682 451719 : RmDigest *copy = rm_digest_copy(digest);
683 451718 : g_checksum_get_digest(copy->glib_checksum, result, &buflen);
684 451717 : g_assert(buflen == digest->bytes);
685 451717 : rm_digest_free(copy);
686 : } else {
687 458733 : memcpy(result, digest->checksum, digest->bytes);
688 : }
689 910453 : return result;
690 : }
691 :
692 295712 : guint rm_digest_hash(RmDigest *digest) {
693 295712 : guint8 *buf = NULL;
694 295712 : gsize bytes = 0;
695 :
696 295712 : if(digest->type == RM_DIGEST_PARANOID) {
697 21967 : if(digest->paranoid->shadow_hash) {
698 21967 : buf = rm_digest_steal(digest->paranoid->shadow_hash);
699 21969 : bytes = digest->paranoid->shadow_hash->bytes;
700 : }
701 : } else {
702 273745 : buf = rm_digest_steal(digest);
703 273749 : bytes = digest->bytes;
704 : }
705 :
706 295718 : guint hash = 0;
707 295718 : if(buf != NULL) {
708 295718 : hash = *(RmOff *)buf;
709 295718 : g_slice_free1(bytes, buf);
710 : }
711 295720 : return hash;
712 : }
713 :
714 287126 : gboolean rm_digest_equal(RmDigest *a, RmDigest *b) {
715 287126 : g_assert(a && b);
716 :
717 287157 : if(a->type != b->type) {
718 0 : return false;
719 : }
720 :
721 287157 : if(a->bytes != b->bytes) {
722 0 : return false;
723 : }
724 :
725 287157 : if(a->type == RM_DIGEST_PARANOID) {
726 12867 : if(!a->paranoid->buffers) {
727 : /* buffers have been freed so we need to rely on shadow hash */
728 908 : return rm_digest_equal(a->paranoid->shadow_hash, b->paranoid->shadow_hash);
729 : }
730 11959 : if(a->paranoid->twin_candidate == b || b->paranoid->twin_candidate == a) {
731 10807 : return true;
732 : }
733 :
734 1152 : GList *a_iter = a->paranoid->buffers->head;
735 1152 : GList *b_iter = b->paranoid->buffers->head;
736 1152 : guint bytes = 0;
737 3456 : while(a_iter && b_iter) {
738 1152 : if(!rm_buffer_equal(a_iter->data, b_iter->data)) {
739 0 : rm_log_error_line(
740 : "Paranoid digest compare found mismatch - must be hash collision in "
741 : "shadow hash");
742 0 : return false;
743 : }
744 1152 : bytes += ((RmBuffer *)a_iter->data)->len;
745 1152 : a_iter = a_iter->next;
746 1152 : b_iter = b_iter->next;
747 : }
748 :
749 1152 : return (!a_iter && !b_iter && bytes == a->bytes);
750 :
751 274290 : } else if(rm_digest_needs_steal(a->type)) {
752 79843 : guint8 *buf_a = rm_digest_steal(a);
753 79843 : guint8 *buf_b = rm_digest_steal(b);
754 :
755 : gboolean result;
756 :
757 79844 : if(a->bytes != b->bytes) {
758 0 : result = false;
759 : } else {
760 79844 : result = !memcmp(buf_a, buf_b, MIN(a->bytes, b->bytes));
761 : }
762 :
763 79844 : g_slice_free1(a->bytes, buf_a);
764 79843 : g_slice_free1(b->bytes, buf_b);
765 :
766 79842 : return result;
767 : } else {
768 194446 : return !memcmp(a->checksum, b->checksum, MIN(a->bytes, b->bytes));
769 : }
770 : }
771 :
772 444275 : int rm_digest_hexstring(RmDigest *digest, char *buffer) {
773 : static const char *hex = "0123456789abcdef";
774 444275 : guint8 *input = NULL;
775 444275 : gsize bytes = 0;
776 444275 : if(digest == NULL) {
777 700 : return 0;
778 : }
779 :
780 443575 : if(digest->type == RM_DIGEST_PARANOID) {
781 23579 : if(digest->paranoid->shadow_hash) {
782 23579 : input = rm_digest_steal(digest->paranoid->shadow_hash);
783 23579 : bytes = digest->paranoid->shadow_hash->bytes;
784 : }
785 : } else {
786 419996 : input = rm_digest_steal(digest);
787 419996 : bytes = digest->bytes;
788 : }
789 :
790 15441639 : for(gsize i = 0; i < bytes; ++i) {
791 14998064 : buffer[0] = hex[input[i] / 16];
792 14998064 : buffer[1] = hex[input[i] % 16];
793 :
794 14998064 : if(i == bytes - 1) {
795 443575 : buffer[2] = '\0';
796 : }
797 :
798 14998064 : buffer += 2;
799 : }
800 :
801 443575 : g_slice_free1(bytes, input);
802 443575 : return bytes * 2 + 1;
803 : }
804 :
805 444265 : int rm_digest_get_bytes(RmDigest *self) {
806 444265 : if(self == NULL) {
807 700 : return 0;
808 : }
809 :
810 443565 : if(self->type != RM_DIGEST_PARANOID) {
811 419960 : return self->bytes;
812 23605 : } else if(self->paranoid->shadow_hash) {
813 23605 : return self->paranoid->shadow_hash->bytes;
814 : } else {
815 0 : return 0;
816 : }
817 : }
818 :
819 12434 : void rm_digest_send_match_candidate(RmDigest *target, RmDigest *candidate) {
820 12434 : if(!target->paranoid->incoming_twin_candidates) {
821 0 : target->paranoid->incoming_twin_candidates = g_async_queue_new();
822 : }
823 12434 : g_async_queue_push(target->paranoid->incoming_twin_candidates, candidate);
824 12434 : }
|