Line data Source code
1 : /*
2 : * This file is part of rmlint.
3 : *
4 : * rmlint is free software: you can redistribute it and/or modify
5 : * it under the terms of the GNU General Public License as published by
6 : * the Free Software Foundation, either version 3 of the License, or
7 : * (at your option) any later version.
8 : *
9 : * rmlint is distributed in the hope that it will be useful,
10 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 : * GNU General Public License for more details.
13 : *
14 : * You should have received a copy of the GNU General Public License
15 : * along with rmlint. If not, see <http://www.gnu.org/licenses/>.
16 : *
17 : * Authors:
18 : *
19 : * - Christopher <sahib> Pahl 2010-2015 (https://github.com/sahib)
20 : * - Daniel <SeeSpotRun> T. 2014-2015 (https://github.com/SeeSpotRun)
21 : *
22 : * Hosted on http://github.com/sahib/rmlint
23 : *
24 : */
25 :
26 : /* This is the treemerge algorithm.
27 : *
28 : * It tries to solve the following problem and sometimes even succeeds:
29 : * Take a list of duplicates (as RmFiles) and figure out which directories
30 : * consist fully out of duplicates and can be thus removed.
31 : *
32 : * The basic algorithm is split in four phases:
33 : *
34 : * - Counting: Walk through all directories given on the commandline and
35 : * traverse them. Count all files during traverse and safe it in
36 : * an radix-tree (libart is used here). The key is the path, the
37 : * value the count of files in it. Invalid directories and
38 : * directories above the given are set to -1.
39 : * - Feeding: Collect all duplicates and store them in RmDirectory structures.
40 : * If a directory appears to consist of dupes only (num_dupes == num_files)
41 : * then it is remembered as valid directory.
42 : * - Upcluster: Take all valid directories and cluster them up, so subdirs get
43 : * merged into the parent directory. Continue as long the parent
44 : * directory is full too. Remember full directories in a hashtable
45 : * with the hash of the directory (which is a hash of the file's
46 : * hashes) as key and a list of matching directories as value.
47 : * - Extract: Extract the result information out of the hashtable top-down.
48 : * If a directory is reported, mark all subdirs of it as finished
49 : * so they do not get reported twice. Files that could not be
50 : * grouped in directories are found and reported as usually.
51 : */
52 :
53 : /*
54 : * Comment this out to see helpful extra debugging:
55 : */
56 : // #define _RM_TREEMERGE_DEBUG
57 :
58 : #include <glib.h>
59 : #include <string.h>
60 :
61 : #include <sys/types.h>
62 : #include <sys/stat.h>
63 : #include <fts.h>
64 :
65 : #include "treemerge.h"
66 : #include "shredder.h"
67 : #include "preprocess.h"
68 : #include "formats.h"
69 : #include "pathtricia.h"
70 :
/* One directory that contains (or may contain) only duplicate files.
 * Built lazily during the "feed" phase and merged upwards during clustering.
 */
typedef struct RmDirectory {
    char *dirname;       /* Path to this directory without trailing slash */
    GQueue known_files;  /* RmFiles in this directory */
    GQueue children;     /* Children for directories with subdirectories */
    gint64 prefd_files;  /* Files in this directory that are tagged as original */
    gint64 dupe_count;   /* Count of RmFiles actually in this directory */
    gint64 file_count;   /* Count of files actually in this directory (or -1 on error) */
    gint64 mergeups;     /* Number of times this directory was merged up */
    bool finished : 1;     /* Was this dir or one of its parents already printed? */
    bool was_merged : 1;   /* true if this directory was merged up already (only once) */
    bool was_inserted : 1; /* true if this directory was added to results (only once) */
    unsigned short depth;  /* Path depth (i.e. count of / in path, no trailing /) */
    GHashTable *hash_set;  /* Set of file hashes, used for exact equality check */
    RmDigest *digest;      /* Cumulative digest of all RmFiles in this directory */

    /* Cached stat(2) data of the directory itself; used when masquerading
     * the directory as an RmFile for output. */
    struct {
        time_t dir_mtime; /* Directory Metadata: Modification Time */
        ino_t dir_inode;  /* Directory Metadata: Inode */
        dev_t dir_dev;    /* Directory Metadata: Device ID */
    } metadata;
} RmDirectory;
92 :
/* State of the whole treemerge run; one instance per session.
 * Created by rm_tm_new(), fed via rm_tm_feed(), finalized by rm_tm_finish(). */
struct RmTreeMerger {
    RmSession *session;       /* Session state variables / Settings */
    RmTrie dir_tree;          /* Path-Trie with an RmDirectory per node */
    RmTrie count_tree;        /* Path-Trie with each directory's file count as value */
    GHashTable *result_table; /* {hash => [RmDirectory]} mapping */
    GHashTable *file_groups;  /* Group files by hash */
    GHashTable *file_checks;  /* Set of files that were handled already. */
    GHashTable *known_hashs;  /* Set of known hashes, only used for cleanup. */
    GHashTable *was_printed;  /* Set of files that were handed to rm_fmt_write() */
    GQueue valid_dirs;        /* Directories consisting of RmFiles only */
};
104 :
105 : //////////////////////////
106 : // ACTUAL FILE COUNTING //
107 : //////////////////////////
108 :
/* Trie-iteration callback over the file trie: for every file node, bump the
 * file count of each ancestor directory stored in `user_data` (the count
 * tree).  A node carrying an error flag (non-NULL data) poisons all of its
 * ancestors with a count of -1 instead.
 *
 * Returns 0 so trie iteration always continues.
 */
int rm_tm_count_art_callback(_U RmTrie *self, RmNode *node, _U int level,
                             void *user_data) {
    /* Note: this method has a time complexity of O(log(n) * m) which may
       result in a few seconds buildup time for large sets of directories.  Since this
       will only happen when rmlint ran for long anyways and since we can keep the
       code easy and memory efficient this way, Im against more clever but longer
       solutions. (Good way of saying "Im just too stupid", eh?)
     */

    RmTrie *count_tree = user_data;
    /* node->data is an int-in-pointer: true marks an errored path */
    bool error_flag = GPOINTER_TO_INT(node->data);

    char path[PATH_MAX];
    memset(path, 0, sizeof(path));
    rm_trie_build_path_unlocked(node, path, sizeof(path));

    /* Ascend the path parts up, add one for each part we meet.
       If a part was never found before, add it.
       This is the 'm' above: The count of separators in the path.

       Hack: path[key_len] is nul, at key_len it must be either an
       extra slash (bad) or the beginning of a file name.
       Therefore start at -2.
     */
    for(int i = strlen(path) - 1; i >= 0; --i) {
        if(path[i] == G_DIR_SEPARATOR) {
            /* Do not use an empty path, use a slash for root */
            if(i == 0) {
                path[0] = G_DIR_SEPARATOR;
                path[1] = 0;
            } else {
                /* Truncate in place: path now names the ancestor directory */
                path[i] = 0;
            }

            /* Include the nulbyte */
            int new_count = -1;

            if(error_flag == false) {
                /* Lookup the count on this level */
                int old_count = GPOINTER_TO_INT(rm_trie_search(count_tree, path));

                /* Propagate old error up or just increment the count */
                new_count = (old_count == -1) ? -1 : old_count + 1;
            }

            /* Accumulate the count ('n' above is the height of the trie) */
            rm_trie_insert(count_tree, path, GINT_TO_POINTER(new_count));
        }
    }

    return 0;
}
161 :
162 1120 : static bool rm_tm_count_files(RmTrie *count_tree, char **paths, RmSession *session) {
163 1120 : if(*paths == NULL) {
164 0 : rm_log_error("No paths passed to rm_tm_count_files\n");
165 0 : return false;
166 : }
167 :
168 1120 : int fts_flags = FTS_COMFOLLOW;
169 1120 : if(session->cfg->follow_symlinks) {
170 28 : fts_flags |= FTS_LOGICAL;
171 : } else {
172 1092 : fts_flags |= FTS_PHYSICAL;
173 : }
174 :
175 : /* This tree stores the full file paths.
176 : It is joined into a full directory tree later.
177 : */
178 : RmTrie file_tree;
179 1120 : rm_trie_init(&file_tree);
180 :
181 1120 : FTS *fts = fts_open(paths, fts_flags, NULL);
182 1120 : if(fts == NULL) {
183 0 : rm_log_perror("fts_open failed");
184 0 : return false;
185 : }
186 :
187 1120 : FTSENT *ent = NULL;
188 16772 : while((ent = fts_read(fts))) {
189 : /* Handle large files (where fts fails with FTS_NS) */
190 14532 : if(ent->fts_info == FTS_NS) {
191 : RmStat stat_buf;
192 0 : if(rm_sys_stat(ent->fts_path, &stat_buf) == -1) {
193 0 : rm_log_perror("stat(2) failed");
194 0 : continue;
195 : } else {
196 : /* Must be a large file (or followed link to it) */
197 0 : ent->fts_info = FTS_F;
198 : }
199 : }
200 :
201 14532 : switch(ent->fts_info) {
202 : case FTS_ERR:
203 : case FTS_DC:
204 : /* Save this path as an error */
205 0 : rm_trie_insert(&file_tree, ent->fts_path, GINT_TO_POINTER(true));
206 0 : break;
207 : case FTS_F:
208 : case FTS_SL:
209 : case FTS_NS:
210 : case FTS_SLNONE:
211 : case FTS_DEFAULT:
212 : /* Save this path as countable file */
213 5852 : if(ent->fts_statp->st_size > 0) {
214 5740 : rm_trie_insert(&file_tree, ent->fts_path, GINT_TO_POINTER(false));
215 : }
216 : case FTS_D:
217 : case FTS_DNR:
218 : case FTS_DOT:
219 : case FTS_DP:
220 : case FTS_NSOK:
221 : default:
222 : /* other fts states, that do not count as errors or files */
223 14532 : break;
224 : }
225 : }
226 :
227 1120 : if(fts_close(fts) != 0) {
228 0 : rm_log_perror("fts_close failed");
229 0 : return false;
230 : }
231 :
232 1120 : rm_trie_iter(&file_tree, NULL, true, false, rm_tm_count_art_callback, count_tree);
233 :
234 : /* Now flag everything as a no-go over the given paths,
235 : * otherwise we would continue merging till / with fatal consequences,
236 : * since / does not have more files as paths[0]
237 : */
238 2464 : for(int i = 0; paths[i]; ++i) {
239 : /* Just call the callback directly */
240 1344 : RmNode *node = rm_trie_search_node(&file_tree, paths[i]);
241 1344 : if(node != NULL) {
242 1344 : node->data = GINT_TO_POINTER(true);
243 1344 : rm_tm_count_art_callback(&file_tree, node, 0, count_tree);
244 : }
245 : }
246 :
247 1120 : rm_trie_destroy(&file_tree);
248 1120 : return true;
249 : }
250 :
251 : ///////////////////////////////
252 : // DIRECTORY STRUCT HANDLING //
253 : ///////////////////////////////
254 :
255 4284 : static RmDirectory *rm_directory_new(char *dirname) {
256 4284 : RmDirectory *self = g_new0(RmDirectory, 1);
257 :
258 4284 : self->file_count = 0;
259 4284 : self->dupe_count = 0;
260 4284 : self->prefd_files = 0;
261 4284 : self->was_merged = false;
262 4284 : self->was_inserted = false;
263 4284 : self->mergeups = 0;
264 :
265 4284 : self->dirname = dirname;
266 4284 : self->finished = false;
267 :
268 4284 : self->depth = 0;
269 114856 : for(char *s = dirname; *s; s++) {
270 110572 : self->depth += (*s == G_DIR_SEPARATOR);
271 : }
272 :
273 : RmStat dir_stat;
274 4284 : if(rm_sys_stat(self->dirname, &dir_stat) == -1) {
275 0 : rm_log_perror("stat(2) failed during sort");
276 : } else {
277 4284 : self->metadata.dir_mtime = dir_stat.st_mtime;
278 4284 : self->metadata.dir_inode = dir_stat.st_ino;
279 4284 : self->metadata.dir_dev = dir_stat.st_dev;
280 : }
281 :
282 : /* Special cumulative hashsum, that is not dependent on the
283 : * order in which the file hashes were added.
284 : * It is not used as full hash, but as sorting speedup.
285 : */
286 4284 : self->digest = rm_digest_new(RM_DIGEST_CUMULATIVE, 0, 0, 0, false);
287 :
288 4284 : g_queue_init(&self->known_files);
289 4284 : g_queue_init(&self->children);
290 :
291 4284 : self->hash_set =
292 4284 : g_hash_table_new((GHashFunc)rm_digest_hash, (GEqualFunc)rm_digest_equal);
293 :
294 4284 : return self;
295 : }
296 :
297 4284 : static void rm_directory_free(RmDirectory *self) {
298 4284 : rm_digest_free(self->digest);
299 4284 : g_hash_table_unref(self->hash_set);
300 4284 : g_queue_clear(&self->known_files);
301 4284 : g_queue_clear(&self->children);
302 4284 : g_free(self->dirname);
303 4284 : g_free(self);
304 4284 : }
305 :
306 3192 : static RmOff rm_tm_calc_file_size(RmDirectory *directory) {
307 3192 : RmOff acc = 0;
308 :
309 6216 : for(GList *iter = directory->known_files.head; iter; iter = iter->next) {
310 3024 : RmFile *file = iter->data;
311 3024 : acc += file->file_size;
312 : }
313 :
314 : /* Recursively propagate to children */
315 4704 : for(GList *iter = directory->children.head; iter; iter = iter->next) {
316 1512 : acc += rm_tm_calc_file_size((RmDirectory *)iter->data);
317 : }
318 :
319 3192 : return acc;
320 : }
321 :
322 1680 : static RmFile *rm_directory_as_file(RmTreeMerger *merger, RmDirectory *self) {
323 : /* Masquerades a RmDirectory as RmFile for purpose of output */
324 1680 : RmFile *file = g_malloc0(sizeof(RmFile));
325 :
326 : /* Need to set session first, since set_path expects that */
327 1680 : file->session = merger->session;
328 1680 : rm_file_set_path(file, self->dirname, strlen(self->dirname));
329 :
330 1680 : file->lint_type = RM_LINT_TYPE_DUPE_DIR_CANDIDATE;
331 1680 : file->digest = self->digest;
332 :
333 : /* Set these to invalid for now */
334 1680 : file->mtime = self->metadata.dir_mtime;
335 1680 : file->inode = self->metadata.dir_inode;
336 1680 : file->dev = self->metadata.dir_dev;
337 :
338 : /* Recursively calculate the file size */
339 1680 : file->file_size = rm_tm_calc_file_size(self);
340 1680 : file->is_prefd = (self->prefd_files >= self->dupe_count);
341 :
342 1680 : return file;
343 : }
344 :
345 1400 : static bool rm_directory_equal(RmDirectory *d1, RmDirectory *d2) {
346 1400 : if(d1->mergeups != d2->mergeups) {
347 0 : return false;
348 : }
349 :
350 1400 : if(rm_digest_equal(d1->digest, d2->digest) == false) {
351 0 : return false;
352 : }
353 :
354 1400 : if(g_hash_table_size(d1->hash_set) != g_hash_table_size(d2->hash_set)) {
355 0 : return false;
356 : }
357 :
358 : gpointer digest_key;
359 : GHashTableIter iter;
360 :
361 1400 : g_hash_table_iter_init(&iter, d1->hash_set);
362 1400 : while(g_hash_table_iter_next(&iter, &digest_key, NULL)) {
363 1400 : if(g_hash_table_contains(d2->hash_set, digest_key) == false) {
364 0 : return false;
365 : }
366 : }
367 :
368 1400 : return true;
369 : }
370 :
371 4312 : static guint rm_directory_hash(const RmDirectory *d) {
372 : /* This hash is used to quickly compare directories with each other.
373 : * Different directories might yield the same hash of course.
374 : * To prevent this case, rm_directory_equal really compares
375 : * all the file's hashes with each other.
376 : */
377 4312 : return rm_digest_hash(d->digest) ^ d->mergeups;
378 : }
379 :
/* Fold `file` into `directory`: update the directory's cumulative digest
 * with the file's hash, remember the hash in the directory's hash set and
 * bump the dupe/preferred counters.
 *
 * Returns the number of duplicates this file contributes (1, plus one per
 * hardlink when the file is a hardlink head).
 */
static int rm_directory_add(RmDirectory *directory, RmFile *file) {
    /* Update the directorie's hash with the file's hash
       Since we cannot be sure in which order the files come in
       we have to add the hash cummulatively.
     */
    int new_dupes = 0;

    g_assert(file);
    g_assert(file->digest);
    g_assert(directory);

    guint8 *file_digest = NULL;
    RmOff digest_bytes = 0;

    /* Paranoid digests carry their quick "shadow" hash; use that one,
     * otherwise steal the regular digest buffer.  rm_digest_steal()
     * allocates; the buffer is released below via g_slice_free1(). */
    if(file->digest->type == RM_DIGEST_PARANOID) {
        file_digest = rm_digest_steal(file->digest->paranoid->shadow_hash);
        digest_bytes = file->digest->paranoid->shadow_hash->bytes;
    } else {
        file_digest = rm_digest_steal(file->digest);
        digest_bytes = file->digest->bytes;
    }

    /* + and not XOR, since ^ would yield 0 for same hashes always. No matter
     * which hashes. Also this would be confusing. For me and for debuggers.
     */
    rm_digest_update(directory->digest, file_digest, digest_bytes);

    /* The file value is not really used, but we need some non-null value */
    g_hash_table_add(directory->hash_set, file->digest);

    g_slice_free1(digest_bytes, file_digest);

    /* A hardlink head stands for itself plus all of its links. */
    if(file->hardlinks.is_head && file->hardlinks.files) {
        new_dupes = 1 + g_queue_get_length(file->hardlinks.files);
    } else {
        new_dupes = 1;
    }

    directory->dupe_count += new_dupes;
    directory->prefd_files += file->is_prefd;

    return new_dupes;
}
423 :
/* Merge `subdir` into `parent` during the upcluster phase: inherit its
 * dupe/preferred counts, its files' hashes and its cumulative checksum.
 * A subdir is merged at most once (guarded by was_merged). */
static void rm_directory_add_subdir(RmDirectory *parent, RmDirectory *subdir) {
    if(subdir->was_merged) {
        return;
    }

    parent->mergeups = subdir->mergeups + parent->mergeups + 1;
    parent->dupe_count += subdir->dupe_count;
    g_queue_push_head(&parent->children, subdir);
    parent->prefd_files += subdir->prefd_files;

#ifdef _RM_TREEMERGE_DEBUG
    g_printerr("%55s (%3ld/%3ld) <- %s (%3ld/%3ld)\n", parent->dirname,
               parent->dupe_count, parent->file_count, subdir->dirname,
               subdir->dupe_count, subdir->file_count);
#endif

    /**
     * Here's something weird:
     * - a counter is used and substraced at once from parent->dupe_count.
     * - it would ofc. be nicer to substract it step by step.
     * - but for some weird reasons this only works on clang, not gcc.
     * - yes, what. But I tested this, I promise!
     */
    /* rm_directory_add() bumps parent->dupe_count; subtract its return
     * value again since the subdir's dupes were already added in bulk
     * above.  Net effect: only digest/hash_set/prefd state is folded in. */
    for(GList *iter = subdir->known_files.head; iter; iter = iter->next) {
        int c = rm_directory_add(parent, (RmFile *)iter->data);
        parent->dupe_count -= c;
    }

    /* Inherit the child's checksum */
    unsigned char *subdir_cksum = rm_digest_steal(subdir->digest);
    rm_digest_update(parent->digest, subdir_cksum, subdir->digest->bytes);
    g_slice_free1(subdir->digest->bytes, subdir_cksum);

    subdir->was_merged = true;
}
459 :
460 : ///////////////////////////
461 : // TREE MERGER ALGORITHM //
462 : ///////////////////////////
463 :
464 1120 : static void rm_tm_chunk_flush(RmTreeMerger *self, char **out_paths, int n_paths) {
465 1120 : rm_tm_count_files(&self->count_tree, out_paths, self->session);
466 :
467 1120 : if(self->session->cfg->use_meta_cache) {
468 88 : for(int i = 0; i < n_paths; ++i) {
469 48 : g_free(out_paths[i]);
470 : }
471 : }
472 1120 : }
473 :
/* Split `paths` into chunks of at most 512 entries and hand each chunk to
 * rm_tm_chunk_flush() for counting.  With --use-meta-cache, `paths[i]` is
 * not a string but an ID packed into the pointer; it is resolved to a real
 * path via the swap table and strdup'd (freed again in the flush). */
static void rm_tm_chunk_paths(RmTreeMerger *self, char **paths) {
    /* Count only up to 512 paths at the same time. High numbers like this can
     * happen if find is piped inside rmlint via the special "-" file.
     * Sadly, this would need to have all paths in memory at the same time.
     * With session->cfg->use_meta_cache, there is only an ID in the path
     * pointer.
     * */

    RmCfg *cfg = self->session->cfg;

    const int N = 512;

    int n_paths = 0;
    char **out_paths = g_malloc0((N + 1) * sizeof(char *));

    for(int i = 0; paths[i]; ++i) {
        if(cfg->use_meta_cache) {
            char buf[PATH_MAX];

            /* paths[i] holds an integer ID, not a pointer to chars. */
            rm_swap_table_lookup(self->session->meta_cache,
                                 self->session->meta_cache_dir_id,
                                 GPOINTER_TO_UINT(paths[i]), buf, sizeof(buf));

            out_paths[n_paths] = g_strdup(buf);
        } else {
            out_paths[n_paths] = paths[i];
        }

        /* Terminate the vector by a guarding NULL */
        out_paths[++n_paths] = NULL;

        /* We reached the size of one chunk, flush and wrap around */
        if(n_paths == N) {
            rm_tm_chunk_flush(self, out_paths, n_paths);
            n_paths = 0;
        }
    }

    /* Flush the rest of it */
    if(n_paths) {
        rm_tm_chunk_flush(self, out_paths, n_paths);
    }

    g_free(out_paths);
}
519 :
520 1120 : RmTreeMerger *rm_tm_new(RmSession *session) {
521 1120 : RmTreeMerger *self = g_slice_new(RmTreeMerger);
522 1120 : self->session = session;
523 1120 : g_queue_init(&self->valid_dirs);
524 :
525 1120 : self->result_table = g_hash_table_new_full((GHashFunc)rm_directory_hash,
526 : (GEqualFunc)rm_directory_equal, NULL,
527 : (GDestroyNotify)g_queue_free);
528 :
529 1120 : self->file_groups =
530 1120 : g_hash_table_new_full((GHashFunc)rm_digest_hash, (GEqualFunc)rm_digest_equal,
531 : NULL, (GDestroyNotify)g_queue_free);
532 :
533 1120 : self->known_hashs = g_hash_table_new_full(NULL, NULL, NULL, NULL);
534 1120 : self->was_printed = g_hash_table_new_full(NULL, NULL, NULL, NULL);
535 :
536 1120 : rm_trie_init(&self->dir_tree);
537 1120 : rm_trie_init(&self->count_tree);
538 :
539 1120 : rm_tm_chunk_paths(self, session->cfg->paths);
540 :
541 1120 : return self;
542 : }
543 :
544 4284 : int rm_tm_destroy_iter(_U RmTrie *self, RmNode *node, _U int level, RmTreeMerger *tm) {
545 4284 : RmDirectory *directory = node->data;
546 :
547 8344 : for(GList *iter = directory->known_files.head; iter; iter = iter->next) {
548 4060 : if(g_hash_table_contains(tm->was_printed, iter->data)) {
549 2744 : continue;
550 : }
551 :
552 1316 : rm_file_destroy((RmFile *)iter->data);
553 : }
554 :
555 4284 : rm_directory_free(directory);
556 4284 : return 0;
557 : }
558 :
/* Tear down a tree merger.  Order matters: hash tables that borrow digests
 * are dropped first, then the digests themselves, and the dir_tree must be
 * iterated (to free RmDirectories and unprinted RmFiles) before it is
 * destroyed. */
void rm_tm_destroy(RmTreeMerger *self) {
    g_hash_table_unref(self->result_table);
    g_hash_table_unref(self->file_groups);

    /* known_hashs holds the only owning references to the file digests. */
    GList *digest_keys = g_hash_table_get_keys(self->known_hashs);
    g_list_free_full(digest_keys, (GDestroyNotify)rm_digest_free);
    g_hash_table_unref(self->known_hashs);

    g_queue_clear(&self->valid_dirs);

    /* Kill all RmDirectories stored in the tree */
    rm_trie_iter(&self->dir_tree, NULL, true, false,
                 (RmTrieIterCallback)rm_tm_destroy_iter, self);

    rm_trie_destroy(&self->dir_tree);
    rm_trie_destroy(&self->count_tree);
    g_hash_table_unref(self->was_printed);

    g_slice_free(RmTreeMerger, self);
}
579 :
580 4032 : static void rm_tm_insert_dir(RmTreeMerger *self, RmDirectory *directory) {
581 4032 : if(directory->was_inserted) {
582 1176 : return;
583 : }
584 :
585 2856 : GQueue *dir_queue =
586 2856 : rm_hash_table_setdefault(self->result_table, directory, (RmNewFunc)g_queue_new);
587 2856 : g_queue_push_head(dir_queue, directory);
588 2856 : directory->was_inserted = true;
589 : }
590 :
/* Phase 2 ("Feeding"): register one duplicate RmFile with the directory it
 * lives in, creating the RmDirectory on first sight.  When a directory's
 * dupe count reaches its real file count it becomes a result candidate.
 *
 * Ownership note: `dirname` is handed to rm_directory_new() (which keeps
 * it) on the first-seen path, and freed here otherwise. */
void rm_tm_feed(RmTreeMerger *self, RmFile *file) {
    RM_DEFINE_PATH(file);
    char *dirname = g_path_get_dirname(file_path);

    /* See if we know that directory already */
    RmDirectory *directory = rm_trie_search(&self->dir_tree, dirname);

    if(directory == NULL) {
        /* Get the actual file count */
        int file_count = GPOINTER_TO_INT(rm_trie_search(&self->count_tree, dirname));
        if(file_count == 0) {
            rm_log_error(
                RED "Empty directory or weird RmFile encountered; rejecting.\n" RESET);
            file_count = -1;
        }

        directory = rm_directory_new(dirname);
        directory->file_count = file_count;

        /* Make the new directory known */
        rm_trie_insert(&self->dir_tree, dirname, directory);

        g_queue_push_head(&self->valid_dirs, directory);
    } else {
        g_free(dirname);
    }

    rm_directory_add(directory, file);

    /* Add the file to this directory */
    g_queue_push_head(&directory->known_files, file);

    /* Remember the digest (if only to free it later...) */
    g_hash_table_replace(self->known_hashs, file->digest, NULL);

    /* Check if the directory reached the number of actual files in it */
    if(directory->dupe_count == directory->file_count && directory->file_count > 0) {
        rm_tm_insert_dir(self, directory);
    }
}
631 :
632 3192 : static void rm_tm_mark_finished(RmTreeMerger *self, RmDirectory *directory) {
633 3192 : if(directory->finished) {
634 0 : return;
635 : }
636 :
637 3192 : directory->finished = true;
638 :
639 : /* Recursively propagate to children */
640 4704 : for(GList *iter = directory->children.head; iter; iter = iter->next) {
641 1512 : rm_tm_mark_finished(self, (RmDirectory *)iter->data);
642 : }
643 : }
644 :
645 1792 : static void rm_tm_mark_original_files(RmTreeMerger *self, RmDirectory *directory) {
646 1792 : directory->finished = false;
647 :
648 : /* Recursively propagate to children */
649 2632 : for(GList *iter = directory->children.head; iter; iter = iter->next) {
650 840 : RmDirectory *child = iter->data;
651 840 : rm_tm_mark_original_files(self, child);
652 : }
653 1792 : }
654 :
655 1400 : static gint64 rm_tm_mark_duplicate_files(RmTreeMerger *self, RmDirectory *directory) {
656 1400 : gint64 acc = 0;
657 :
658 2716 : for(GList *iter = directory->known_files.head; iter; iter = iter->next) {
659 1316 : RmFile *file = iter->data;
660 1316 : acc += file->is_prefd;
661 : }
662 :
663 : /* Recursively propagate to children */
664 2072 : for(GList *iter = directory->children.head; iter; iter = iter->next) {
665 672 : RmDirectory *child = iter->data;
666 672 : acc += rm_tm_mark_duplicate_files(self, child);
667 : }
668 :
669 1400 : return acc;
670 : }
671 :
672 168 : static void rm_tm_write_unfinished_cksums(RmTreeMerger *self, RmDirectory *directory) {
673 336 : for(GList *iter = directory->known_files.head; iter; iter = iter->next) {
674 168 : RmFile *file = iter->data;
675 168 : file->lint_type = RM_LINT_TYPE_UNFINISHED_CKSUM;
676 168 : rm_fmt_write(file, self->session->formats, -1);
677 168 : g_hash_table_add(self->was_printed, file);
678 : }
679 :
680 : /* Recursively propagate to children */
681 168 : for(GList *iter = directory->children.head; iter; iter = iter->next) {
682 0 : RmDirectory *child = iter->data;
683 0 : rm_tm_write_unfinished_cksums(self, child);
684 : }
685 168 : }
686 :
687 4684 : static int rm_tm_sort_paths(const RmDirectory *da, const RmDirectory *db,
688 : _U RmTreeMerger *self) {
689 4684 : return da->depth - db->depth;
690 : }
691 :
692 3060 : static int rm_tm_sort_paths_reverse(const RmDirectory *da, const RmDirectory *db,
693 : _U RmTreeMerger *self) {
694 3060 : return -rm_tm_sort_paths(da, db, self);
695 : }
696 :
/* Comparator that decides which duplicate directory counts as original.
 * Preferred-file counts win first (direction depends on --keep-all-tagged),
 * then the user-configured -S criteria are applied via
 * rm_pp_cmp_orig_criteria_impl() using dir mtime, basename and path depth
 * (hardlink/inode slots passed as 0). */
static int rm_tm_sort_orig_criteria(const RmDirectory *da, const RmDirectory *db,
                                    RmTreeMerger *self) {
    RmCfg *cfg = self->session->cfg;

    if(da->prefd_files - db->prefd_files) {
        if(cfg->keep_all_tagged) {
            /* More tagged files => more original */
            return db->prefd_files - da->prefd_files;
        } else {
            return da->prefd_files - db->prefd_files;
        }
    }

    return rm_pp_cmp_orig_criteria_impl(
        self->session, da->metadata.dir_mtime, db->metadata.dir_mtime,
        rm_util_basename(da->dirname), rm_util_basename(db->dirname), 0, 0,
        rm_util_path_depth(da->dirname), rm_util_path_depth(db->dirname));
}
714 :
715 6972 : static void rm_tm_forward_unresolved(RmTreeMerger *self, RmDirectory *directory) {
716 6972 : if(directory->finished == true) {
717 4088 : return;
718 : } else {
719 2884 : directory->finished = true;
720 : }
721 :
722 5628 : for(GList *iter = directory->known_files.head; iter; iter = iter->next) {
723 2744 : RmFile *file = iter->data;
724 :
725 2744 : GQueue *file_list = rm_hash_table_setdefault(self->file_groups, file->digest,
726 : (RmNewFunc)g_queue_new);
727 2744 : g_queue_push_head(file_list, file);
728 : }
729 :
730 : /* Recursively propagate to children */
731 5572 : for(GList *iter = directory->children.head; iter; iter = iter->next) {
732 2688 : rm_tm_forward_unresolved(self, (RmDirectory *)iter->data);
733 : }
734 : }
735 :
736 4284 : static int rm_tm_iter_unfinished_files(_U RmTrie *trie, RmNode *node, _U int level,
737 : _U void *user_data) {
738 4284 : RmTreeMerger *self = user_data;
739 4284 : rm_tm_forward_unresolved(self, node->data);
740 4284 : return 0;
741 : }
742 :
743 923 : static int rm_tm_cmp_directory_groups(GQueue *a, GQueue *b) {
744 923 : if(a->length == 0 || b->length == 0) {
745 0 : return b->length - a->length;
746 : }
747 :
748 923 : RmDirectory *first_a = a->head->data;
749 923 : RmDirectory *first_b = b->head->data;
750 923 : return first_b->mergeups - first_a->mergeups;
751 : }
752 :
/* Phase 4 ("Extract"): walk the result table, report each group of equal
 * directories (choosing an original first), then forward all files that
 * could not be explained by a duplicate directory as ordinary duplicate
 * file groups. */
static void rm_tm_extract(RmTreeMerger *self) {
    /* Iterate over all directories per hash (which are same therefore) */
    RmCfg *cfg = self->session->cfg;
    GList *result_table_values = g_hash_table_get_values(self->result_table);
    result_table_values =
        g_list_sort(result_table_values, (GCompareFunc)rm_tm_cmp_directory_groups);

    for(GList *iter = result_table_values; iter; iter = iter->next) {
        /* Needs at least two directories to be duplicate... */
        GQueue *dir_list = iter->data;

#ifdef _RM_TREEMERGE_DEBUG
        for(GList *i = dir_list->head; i; i = i->next) {
            RmDirectory *d = i->data;
            char buf[512];
            memset(buf, 0, sizeof(buf));
            rm_digest_hexstring(d->digest, buf);
            g_printerr("  mergeups=%" LLU ": %s - %s\n", d->mergeups, d->dirname, buf);
        }
        g_printerr("---\n");
#endif
        if(dir_list->length < 2) {
            continue;
        }

        if(rm_session_was_aborted(self->session)) {
            break;
        }

        /* List of result directories */
        GQueue result_dirs = G_QUEUE_INIT;

        /* Sort the RmDirectory list by their path depth, lowest depth first */
        g_queue_sort(dir_list, (GCompareDataFunc)rm_tm_sort_paths, self);

        /* Output the directories and mark their children to prevent
         * duplicate directory reports in lower levels.
         */
        for(GList *iter = dir_list->head; iter; iter = iter->next) {
            RmDirectory *directory = iter->data;
            if(directory->finished == false) {
                rm_tm_mark_finished(self, directory);
                g_queue_push_head(&result_dirs, directory);
            }
        }

        /* Make sure the original directory lands as first
         * in the result_dirs queue.
         */
        g_queue_sort(&result_dirs, (GCompareDataFunc)rm_tm_sort_orig_criteria, self);

        GQueue file_adaptor_group = G_QUEUE_INIT;

        for(GList *iter = result_dirs.head; iter; iter = iter->next) {
            RmDirectory *directory = iter->data;
            /* Wrap the directory into an RmFile for the output layer. */
            RmFile *mask = rm_directory_as_file(self, directory);
            g_queue_push_tail(&file_adaptor_group, mask);

            if(iter == result_dirs.head) {
                /* First one in the group -> It's the original */
                mask->is_original = true;
                rm_tm_mark_original_files(self, directory);
            } else {
                gint64 prefd = rm_tm_mark_duplicate_files(self, directory);
                if(prefd == directory->dupe_count && cfg->keep_all_tagged) {
                    /* Mark the file as original when all files in it are preferred. */
                    mask->is_original = true;
                } else if(prefd == 0 && cfg->keep_all_untagged) {
                    mask->is_original = true;
                }
            }

            if(self->session->cfg->write_unfinished) {
                rm_tm_write_unfinished_cksums(self, directory);
            }
        }

        /* A group of fewer than 2 unfinished dirs is not reportable. */
        if(result_dirs.length >= 2) {
            rm_shred_forward_to_output(self->session, &file_adaptor_group);
        } else {
            g_queue_foreach(&file_adaptor_group, (GFunc)g_free, NULL);
        }

        g_queue_clear(&file_adaptor_group);
        g_queue_clear(&result_dirs);
    }

    g_list_free(result_table_values);

    /* Iterate over all non-finished dirs in the tree,
     * and grab unfinished files that must be dupes elsewhise.
     */
    rm_trie_iter(&self->dir_tree, NULL, true, false, rm_tm_iter_unfinished_files, self);

    /* Now here's a problem. Consider an input like this:
     *  /root
     *  ├── a
     *  ├── sub1
     *  │   ├── a
     *  │   └── b
     *  └── sub2
     *      ├── a
     *      └── b
     *
     *  This yields two duplicate dirs (sub1, sub2)
     *  and one duplicate, unmatched file (a).
     *
     *  For outputting files we need groups, which consist of at least 2 files.
     *  So how to group that, so we don't end up deleting a file many times?
     *  We always choose which directories are originals first, so we flag all
     *  files in it as originals.
     */
    GHashTableIter iter;
    g_hash_table_iter_init(&iter, self->file_groups);

    GQueue *file_list = NULL;
    while(g_hash_table_iter_next(&iter, NULL, (void **)&file_list)) {
        GList *next = NULL;
        for(GList *iter = file_list->head; iter; iter = next) {
            RmFile *file = iter->data;
            next = iter->next;

            /* with --partial-hidden we do not want to output */
            if(self->session->cfg->partial_hidden && file->is_hidden) {
                g_queue_delete_link(file_list, iter);
                continue;
            }

            g_hash_table_add(self->was_printed, file);
        }

        if(file_list->length >= 2) {
            /* If no separate duplicate files are requested, we can stop here */
            if(self->session->cfg->find_duplicates == false) {
                self->session->dup_group_counter -= 1;
                self->session->dup_counter -= file_list->length - 1;
            } else {
                rm_shred_group_find_original(self->session, file_list);
                rm_shred_forward_to_output(self->session, file_list);
            }
        }
    }
}
896 :
/* Phase 3 ("Upcluster"): merge `directory` into its parent, creating the
 * parent RmDirectory on first sight.  If the parent itself turns out to be
 * fully duplicated, insert it as a result and recurse towards the root. */
static void rm_tm_cluster_up(RmTreeMerger *self, RmDirectory *directory) {
    char *parent_dir = g_path_get_dirname(directory->dirname);
    bool is_root = strcmp(parent_dir, "/") == 0;

    /* Lookup if we already found this parent before (if yes, merge with it) */
    RmDirectory *parent = rm_trie_search(&self->dir_tree, parent_dir);

    if(parent == NULL) {
        /* none yet, basically copy child */
        parent = rm_directory_new(parent_dir);
        rm_trie_insert(&self->dir_tree, parent_dir, parent);

        /* Get the actual file count */
        /* NOTE(review): this uses GPOINTER_TO_UINT while rm_tm_feed() uses
         * GPOINTER_TO_INT for the same count tree; an error count of -1
         * would become a huge positive value here — confirm intended. */
        parent->file_count =
            GPOINTER_TO_UINT(rm_trie_search(&self->count_tree, parent_dir));

    } else {
        /* parent already owns its own dirname string */
        g_free(parent_dir);
    }

    rm_directory_add_subdir(parent, directory);

    if(parent->dupe_count == parent->file_count && parent->file_count > 0) {
        rm_tm_insert_dir(self, parent);
        if(!is_root) {
            rm_tm_cluster_up(self, parent);
        }
    }
}
926 :
927 1120 : void rm_tm_finish(RmTreeMerger *self) {
928 : /* Iterate over all valid directories and try to level them all layers up.
929 : */
930 1120 : g_queue_sort(&self->valid_dirs, (GCompareDataFunc)rm_tm_sort_paths_reverse, self);
931 3892 : for(GList *iter = self->valid_dirs.head; iter; iter = iter->next) {
932 2772 : RmDirectory *directory = iter->data;
933 2772 : rm_tm_cluster_up(self, directory);
934 : #ifdef _RM_TREEMERGE_DEBUG
935 : g_printerr("###\n");
936 : #endif
937 : }
938 :
939 1120 : if(!rm_session_was_aborted(self->session)) {
940 : /* Recursively call self to march on */
941 1120 : rm_tm_extract(self);
942 : }
943 1120 : }
|