LCOV - code coverage report
Current view: top level - lib - treemerge.c (source / functions)
Test: coverage.info                              Date: 2015-09-30 14:09:30
Lines:     359 / 388 hit    (92.5 %)
Functions:  30 /  30 hit   (100.0 %)

          Line data    Source code
       1             : /*
       2             :  *  This file is part of rmlint.
       3             :  *
       4             :  *  rmlint is free software: you can redistribute it and/or modify
       5             :  *  it under the terms of the GNU General Public License as published by
       6             :  *  the Free Software Foundation, either version 3 of the License, or
       7             :  *  (at your option) any later version.
       8             :  *
       9             :  *  rmlint is distributed in the hope that it will be useful,
      10             :  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
      11             :  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      12             :  *  GNU General Public License for more details.
      13             :  *
      14             :  *  You should have received a copy of the GNU General Public License
      15             :  *  along with rmlint.  If not, see <http://www.gnu.org/licenses/>.
      16             :  *
      17             :  * Authors:
      18             :  *
      19             :  *  - Christopher <sahib> Pahl 2010-2015 (https://github.com/sahib)
      20             :  *  - Daniel <SeeSpotRun> T.   2014-2015 (https://github.com/SeeSpotRun)
      21             :  *
      22             :  * Hosted on http://github.com/sahib/rmlint
      23             :  *
      24             :  */
      25             : 
      26             : /* This is the treemerge algorithm.
      27             :  *
      28             :  * It tries to solve the following problem and sometimes even succeeds:
      29             :  * Take a list of duplicates (as RmFiles) and figure out which directories
       30             :  * consist entirely of duplicates and can thus be removed.
      31             :  *
       32             :  * The basic algorithm is split into four phases:
      33             :  *
      34             :  * - Counting:  Walk through all directories given on the commandline and
       35             :  *              traverse them. Count all files during the traversal and save
       36             :  *              the counts in a radix-tree (libart is used here). The key is
       37             :  *              the path, the value is the count of files in it. Invalid
       38             :  *              directories and directories above the given ones are set to -1.
      39             :  * - Feeding:   Collect all duplicates and store them in RmDirectory structures.
      40             :  *              If a directory appears to consist of dupes only (num_dupes == num_files)
       41             :  *              then it is remembered as a valid directory.
      42             :  * - Upcluster: Take all valid directories and cluster them up, so subdirs get
       43             :  *              merged into the parent directory. Continue as long as the parent
      44             :  *              directory is full too. Remember full directories in a hashtable
       45             :  *              with the hash of the directory (which is a hash of the files'
      46             :  *              hashes) as key and a list of matching directories as value.
      47             :  * - Extract:   Extract the result information out of the hashtable top-down.
      48             :  *              If a directory is reported, mark all subdirs of it as finished
      49             :  *              so they do not get reported twice. Files that could not be
       50             :  *              grouped into directories are found and reported as usual.
      51             :  */
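                      : 
                      : /* A minimal usage sketch of this module's entry points (for illustration only;
                      :  * the `duplicates` queue is a hypothetical placeholder, the real call sites
                      :  * live elsewhere in rmlint):
                      :  *
                      :  *   RmTreeMerger *merger = rm_tm_new(session);    // runs the counting phase
                      :  *
                      :  *   // Feeding phase: hand over every confirmed duplicate file
                      :  *   for(GList *iter = duplicates->head; iter; iter = iter->next) {
                      :  *       rm_tm_feed(merger, (RmFile *)iter->data);
                      :  *   }
                      :  *
                      :  *   rm_tm_finish(merger);    // Upcluster + Extract; results go to the formatters
                      :  *   rm_tm_destroy(merger);
                      :  */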
      52             : 
      53             : /*
       54             :  * Uncomment this to see helpful extra debugging:
      55             :  */
      56             : // #define _RM_TREEMERGE_DEBUG
      57             : 
      58             : #include <glib.h>
      59             : #include <string.h>
      60             : 
      61             : #include <sys/types.h>
      62             : #include <sys/stat.h>
      63             : #include <fts.h>
      64             : 
      65             : #include "treemerge.h"
      66             : #include "shredder.h"
      67             : #include "preprocess.h"
      68             : #include "formats.h"
      69             : #include "pathtricia.h"
      70             : 
      71             : typedef struct RmDirectory {
      72             :     char *dirname;       /* Path to this directory without trailing slash              */
      73             :     GQueue known_files;  /* RmFiles in this directory                                  */
      74             :     GQueue children;     /* Children for directories with subdirectories               */
      75             :     gint64 prefd_files;  /* Files in this directory that are tagged as original        */
      76             :     gint64 dupe_count;   /* Count of RmFiles actually in this directory                */
      77             :     gint64 file_count;   /* Count of files actually in this directory (or -1 on error) */
      78             :     gint64 mergeups;     /* number of times this directory was merged up               */
       79             :     bool finished : 1;   /* Was this dir or one of its parents already printed?        */
      80             :     bool was_merged : 1; /* true if this directory was merged up already (only once)   */
      81             :     bool was_inserted : 1; /* true if this directory was added to results (only once) */
      82             :     unsigned short depth; /* path depth (i.e. count of / in path, no trailing /)        */
      83             :     GHashTable *hash_set; /* Set of hashes, used for equality check (to be sure)        */
      84             :     RmDigest *digest;     /* Common digest of all RmFiles in this directory             */
      85             : 
      86             :     struct {
      87             :         time_t dir_mtime; /* Directory Metadata: Modification Time */
      88             :         ino_t dir_inode;  /* Directory Metadata: Inode             */
      89             :         dev_t dir_dev;    /* Directory Metadata: Device ID         */
      90             :     } metadata;
      91             : } RmDirectory;
      92             : 
      93             : struct RmTreeMerger {
      94             :     RmSession *session;       /* Session state variables / Settings                  */
      95             :     RmTrie dir_tree;          /* Path-Trie with all RmFiles as value                 */
      96             :     RmTrie count_tree;        /* Path-Trie with all file's count as value            */
      97             :     GHashTable *result_table; /* {hash => [RmDirectory]} mapping                     */
      98             :     GHashTable *file_groups;  /* Group files by hash                                 */
      99             :     GHashTable *file_checks;  /* Set of files that were handled already.             */
     100             :     GHashTable *known_hashs;  /* Set of known hashes, only used for cleanup.         */
     101             :     GHashTable *was_printed;  /* Set of files that were handed to rm_fmt_write()     */
     102             :     GQueue valid_dirs;        /* Directories consisting of RmFiles only              */
     103             : };
     104             : 
     105             : //////////////////////////
     106             : // ACTUAL FILE COUNTING //
     107             : //////////////////////////
     108             : 
     109        6776 : int rm_tm_count_art_callback(_U RmTrie *self, RmNode *node, _U int level,
     110             :                              void *user_data) {
     111             :     /* Note: this method has a time complexity of O(log(n) * m) which may
      112             :        result in a few seconds of buildup time for large sets of directories.
      113             :        Since this will only happen when rmlint ran for a long time anyway, and
      114             :        since we can keep the code easy and memory efficient this way, I'm against
      115             :        more clever but longer solutions. (Good way of saying "I'm just too stupid", eh?)
     116             :     */
     117             : 
     118        6776 :     RmTrie *count_tree = user_data;
     119        6776 :     bool error_flag = GPOINTER_TO_INT(node->data);
     120             : 
     121             :     char path[PATH_MAX];
     122        6776 :     memset(path, 0, sizeof(path));
     123        6776 :     rm_trie_build_path_unlocked(node, path, sizeof(path));
     124             : 
      125             :     /* Ascend the path parts, adding one for each part we meet.
      126             :        If a part was never found before, add it.
      127             :        This is the 'm' above: the count of separators in the path.
     128             : 
     129             :        Hack: path[key_len] is nul, at key_len it must be either an
     130             :              extra slash (bad) or the beginning of a file name.
     131             :              Therefore start at -2.
     132             :      */
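                      : 
                      :     /* Worked example (illustrative): for the path "/a/b/c.txt" the loop below
                      :      * truncates at every separator and bumps the per-directory count:
                      :      *
                      :      *   "/a/b" -> old_count + 1
                      :      *   "/a"   -> old_count + 1
                      :      *   "/"    -> old_count + 1
                      :      *
                      :      * If the error flag is set for this node, -1 is stored instead, and later
                      :      * lookups propagate that -1 upwards.
                      :      */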
     133      210028 :     for(int i = strlen(path) - 1; i >= 0; --i) {
     134      203252 :         if(path[i] == G_DIR_SEPARATOR) {
     135             :             /* Do not use an empty path, use a slash for root */
     136       24248 :             if(i == 0) {
     137        6776 :                 path[0] = G_DIR_SEPARATOR;
     138        6776 :                 path[1] = 0;
     139             :             } else {
     140       17472 :                 path[i] = 0;
     141             :             }
     142             : 
     143             :             /* Include the nulbyte */
     144       24248 :             int new_count = -1;
     145             : 
     146       24248 :             if(error_flag == false) {
     147             :                 /* Lookup the count on this level */
     148       21308 :                 int old_count = GPOINTER_TO_INT(rm_trie_search(count_tree, path));
     149             : 
     150             :                 /* Propagate old error up or just increment the count */
     151       21308 :                 new_count = (old_count == -1) ? -1 : old_count + 1;
     152             :             }
     153             : 
     154             :             /* Accumulate the count ('n' above is the height of the trie)  */
     155       24248 :             rm_trie_insert(count_tree, path, GINT_TO_POINTER(new_count));
     156             :         }
     157             :     }
     158             : 
     159        6776 :     return 0;
     160             : }
     161             : 
     162        1120 : static bool rm_tm_count_files(RmTrie *count_tree, char **paths, RmSession *session) {
     163        1120 :     if(*paths == NULL) {
     164           0 :         rm_log_error("No paths passed to rm_tm_count_files\n");
     165           0 :         return false;
     166             :     }
     167             : 
     168        1120 :     int fts_flags = FTS_COMFOLLOW;
     169        1120 :     if(session->cfg->follow_symlinks) {
     170          28 :         fts_flags |= FTS_LOGICAL;
     171             :     } else {
     172        1092 :         fts_flags |= FTS_PHYSICAL;
     173             :     }
     174             : 
     175             :     /* This tree stores the full file paths.
     176             :        It is joined into a full directory tree later.
     177             :      */
     178             :     RmTrie file_tree;
     179        1120 :     rm_trie_init(&file_tree);
     180             : 
     181        1120 :     FTS *fts = fts_open(paths, fts_flags, NULL);
     182        1120 :     if(fts == NULL) {
     183           0 :         rm_log_perror("fts_open failed");
     184           0 :         return false;
     185             :     }
     186             : 
     187        1120 :     FTSENT *ent = NULL;
     188       16772 :     while((ent = fts_read(fts))) {
     189             :         /* Handle large files (where fts fails with FTS_NS) */
     190       14532 :         if(ent->fts_info == FTS_NS) {
     191             :             RmStat stat_buf;
     192           0 :             if(rm_sys_stat(ent->fts_path, &stat_buf) == -1) {
     193           0 :                 rm_log_perror("stat(2) failed");
     194           0 :                 continue;
     195             :             } else {
     196             :                 /* Must be a large file (or followed link to it) */
     197           0 :                 ent->fts_info = FTS_F;
     198             :             }
     199             :         }
     200             : 
     201       14532 :         switch(ent->fts_info) {
     202             :         case FTS_ERR:
     203             :         case FTS_DC:
     204             :             /* Save this path as an error */
     205           0 :             rm_trie_insert(&file_tree, ent->fts_path, GINT_TO_POINTER(true));
     206           0 :             break;
     207             :         case FTS_F:
     208             :         case FTS_SL:
     209             :         case FTS_NS:
     210             :         case FTS_SLNONE:
     211             :         case FTS_DEFAULT:
     212             :             /* Save this path as countable file */
     213        5852 :             if(ent->fts_statp->st_size > 0) {
     214        5740 :                 rm_trie_insert(&file_tree, ent->fts_path, GINT_TO_POINTER(false));
     215             :             }
     216             :         case FTS_D:
     217             :         case FTS_DNR:
     218             :         case FTS_DOT:
     219             :         case FTS_DP:
     220             :         case FTS_NSOK:
     221             :         default:
      222             :             /* Other fts states that do not count as errors or files */
     223       14532 :             break;
     224             :         }
     225             :     }
     226             : 
     227        1120 :     if(fts_close(fts) != 0) {
     228           0 :         rm_log_perror("fts_close failed");
     229           0 :         return false;
     230             :     }
     231             : 
     232        1120 :     rm_trie_iter(&file_tree, NULL, true, false, rm_tm_count_art_callback, count_tree);
     233             : 
      234             :     /* Now flag the given paths themselves as a no-go,
      235             :      * otherwise we would continue merging up to / with fatal consequences,
      236             :      * since / does not have more files than paths[0]
     237             :      */
     238        2464 :     for(int i = 0; paths[i]; ++i) {
     239             :         /* Just call the callback directly */
     240        1344 :         RmNode *node = rm_trie_search_node(&file_tree, paths[i]);
     241        1344 :         if(node != NULL) {
     242        1344 :             node->data = GINT_TO_POINTER(true);
     243        1344 :             rm_tm_count_art_callback(&file_tree, node, 0, count_tree);
     244             :         }
     245             :     }
     246             : 
     247        1120 :     rm_trie_destroy(&file_tree);
     248        1120 :     return true;
     249             : }
     250             : 
     251             : ///////////////////////////////
     252             : // DIRECTORY STRUCT HANDLING //
     253             : ///////////////////////////////
     254             : 
     255        4284 : static RmDirectory *rm_directory_new(char *dirname) {
     256        4284 :     RmDirectory *self = g_new0(RmDirectory, 1);
     257             : 
     258        4284 :     self->file_count = 0;
     259        4284 :     self->dupe_count = 0;
     260        4284 :     self->prefd_files = 0;
     261        4284 :     self->was_merged = false;
     262        4284 :     self->was_inserted = false;
     263        4284 :     self->mergeups = 0;
     264             : 
     265        4284 :     self->dirname = dirname;
     266        4284 :     self->finished = false;
     267             : 
     268        4284 :     self->depth = 0;
     269      114856 :     for(char *s = dirname; *s; s++) {
     270      110572 :         self->depth += (*s == G_DIR_SEPARATOR);
     271             :     }
     272             : 
     273             :     RmStat dir_stat;
     274        4284 :     if(rm_sys_stat(self->dirname, &dir_stat) == -1) {
     275           0 :         rm_log_perror("stat(2) failed during sort");
     276             :     } else {
     277        4284 :         self->metadata.dir_mtime = dir_stat.st_mtime;
     278        4284 :         self->metadata.dir_inode = dir_stat.st_ino;
     279        4284 :         self->metadata.dir_dev = dir_stat.st_dev;
     280             :     }
     281             : 
      282             :     /* Special cumulative hashsum that does not depend on the
      283             :      * order in which the file hashes were added.
      284             :      * It is not used as a full hash, but as a sorting speedup.
     285             :      */
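                      :     /* Sketch of the property relied upon here (assuming the cumulative digest
                      :      * simply adds the bytes of every update, see rm_directory_add below):
                      :      *
                      :      *   update(d, hash_a); update(d, hash_b);  ==  update(d, hash_b); update(d, hash_a);
                      :      *
                      :      * so two directories stay comparable no matter in which order their files
                      :      * were hashed and fed in.
                      :      */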
     286        4284 :     self->digest = rm_digest_new(RM_DIGEST_CUMULATIVE, 0, 0, 0, false);
     287             : 
     288        4284 :     g_queue_init(&self->known_files);
     289        4284 :     g_queue_init(&self->children);
     290             : 
     291        4284 :     self->hash_set =
     292        4284 :         g_hash_table_new((GHashFunc)rm_digest_hash, (GEqualFunc)rm_digest_equal);
     293             : 
     294        4284 :     return self;
     295             : }
     296             : 
     297        4284 : static void rm_directory_free(RmDirectory *self) {
     298        4284 :     rm_digest_free(self->digest);
     299        4284 :     g_hash_table_unref(self->hash_set);
     300        4284 :     g_queue_clear(&self->known_files);
     301        4284 :     g_queue_clear(&self->children);
     302        4284 :     g_free(self->dirname);
     303        4284 :     g_free(self);
     304        4284 : }
     305             : 
     306        3192 : static RmOff rm_tm_calc_file_size(RmDirectory *directory) {
     307        3192 :     RmOff acc = 0;
     308             : 
     309        6216 :     for(GList *iter = directory->known_files.head; iter; iter = iter->next) {
     310        3024 :         RmFile *file = iter->data;
     311        3024 :         acc += file->file_size;
     312             :     }
     313             : 
     314             :     /* Recursively propagate to children */
     315        4704 :     for(GList *iter = directory->children.head; iter; iter = iter->next) {
     316        1512 :         acc += rm_tm_calc_file_size((RmDirectory *)iter->data);
     317             :     }
     318             : 
     319        3192 :     return acc;
     320             : }
     321             : 
     322        1680 : static RmFile *rm_directory_as_file(RmTreeMerger *merger, RmDirectory *self) {
      323             :     /* Masquerades an RmDirectory as an RmFile for output purposes */
     324        1680 :     RmFile *file = g_malloc0(sizeof(RmFile));
     325             : 
     326             :     /* Need to set session first, since set_path expects that */
     327        1680 :     file->session = merger->session;
     328        1680 :     rm_file_set_path(file, self->dirname, strlen(self->dirname));
     329             : 
     330        1680 :     file->lint_type = RM_LINT_TYPE_DUPE_DIR_CANDIDATE;
     331        1680 :     file->digest = self->digest;
     332             : 
     333             :     /* Set these to invalid for now */
     334        1680 :     file->mtime = self->metadata.dir_mtime;
     335        1680 :     file->inode = self->metadata.dir_inode;
     336        1680 :     file->dev = self->metadata.dir_dev;
     337             : 
     338             :     /* Recursively calculate the file size */
     339        1680 :     file->file_size = rm_tm_calc_file_size(self);
     340        1680 :     file->is_prefd = (self->prefd_files >= self->dupe_count);
     341             : 
     342        1680 :     return file;
     343             : }
     344             : 
     345        1400 : static bool rm_directory_equal(RmDirectory *d1, RmDirectory *d2) {
     346        1400 :     if(d1->mergeups != d2->mergeups) {
     347           0 :         return false;
     348             :     }
     349             : 
     350        1400 :     if(rm_digest_equal(d1->digest, d2->digest) == false) {
     351           0 :         return false;
     352             :     }
     353             : 
     354        1400 :     if(g_hash_table_size(d1->hash_set) != g_hash_table_size(d2->hash_set)) {
     355           0 :         return false;
     356             :     }
     357             : 
     358             :     gpointer digest_key;
     359             :     GHashTableIter iter;
     360             : 
     361        1400 :     g_hash_table_iter_init(&iter, d1->hash_set);
     362        1400 :     while(g_hash_table_iter_next(&iter, &digest_key, NULL)) {
     363        1400 :         if(g_hash_table_contains(d2->hash_set, digest_key) == false) {
     364           0 :             return false;
     365             :         }
     366             :     }
     367             : 
     368        1400 :     return true;
     369             : }
     370             : 
     371        4312 : static guint rm_directory_hash(const RmDirectory *d) {
     372             :     /* This hash is used to quickly compare directories with each other.
      373             :      * Different directories might of course yield the same hash.
      374             :      * To handle this case, rm_directory_equal actually compares
      375             :      * all the files' hashes with each other.
     376             :      */
     377        4312 :     return rm_digest_hash(d->digest) ^ d->mergeups;
     378             : }
     379             : 
     380        8120 : static int rm_directory_add(RmDirectory *directory, RmFile *file) {
      381             :     /* Update the directory's hash with the file's hash.
      382             :        Since we cannot be sure in which order the files come in,
      383             :        we have to add the hash cumulatively.
     384             :      */
     385        8120 :     int new_dupes = 0;
     386             : 
     387        8120 :     g_assert(file);
     388        8120 :     g_assert(file->digest);
     389        8120 :     g_assert(directory);
     390             : 
     391        8120 :     guint8 *file_digest = NULL;
     392        8120 :     RmOff digest_bytes = 0;
     393             : 
     394        8120 :     if(file->digest->type == RM_DIGEST_PARANOID) {
     395        1138 :         file_digest = rm_digest_steal(file->digest->paranoid->shadow_hash);
     396        1138 :         digest_bytes = file->digest->paranoid->shadow_hash->bytes;
     397             :     } else {
     398        6982 :         file_digest = rm_digest_steal(file->digest);
     399        6982 :         digest_bytes = file->digest->bytes;
     400             :     }
     401             : 
      402             :     /* + and not XOR, since ^ would always yield 0 for identical hashes,
      403             :      * no matter which ones. That would also be confusing, for me and for debuggers.
     404             :      */
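                      :     /* Example: for two identical file hashes h, XOR would give h ^ h == 0 and
                      :      * erase the information, while h + h still yields a distinct value.
                      :      */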
     405        8120 :     rm_digest_update(directory->digest, file_digest, digest_bytes);
     406             : 
     407             :     /* The file value is not really used, but we need some non-null value */
     408        8120 :     g_hash_table_add(directory->hash_set, file->digest);
     409             : 
     410        8120 :     g_slice_free1(digest_bytes, file_digest);
     411             : 
     412        8120 :     if(file->hardlinks.is_head && file->hardlinks.files) {
     413           0 :         new_dupes = 1 + g_queue_get_length(file->hardlinks.files);
     414             :     } else {
     415        8120 :         new_dupes = 1;
     416             :     }
     417             : 
     418        8120 :     directory->dupe_count += new_dupes;
     419        8120 :     directory->prefd_files += file->is_prefd;
     420             : 
     421        8120 :     return new_dupes;
     422             : }
     423             : 
     424        5124 : static void rm_directory_add_subdir(RmDirectory *parent, RmDirectory *subdir) {
     425        5124 :     if(subdir->was_merged) {
     426        1764 :         return;
     427             :     }
     428             : 
     429        3360 :     parent->mergeups = subdir->mergeups + parent->mergeups + 1;
     430        3360 :     parent->dupe_count += subdir->dupe_count;
     431        3360 :     g_queue_push_head(&parent->children, subdir);
     432        3360 :     parent->prefd_files += subdir->prefd_files;
     433             : 
     434             : #ifdef _RM_TREEMERGE_DEBUG
     435             :     g_printerr("%55s (%3ld/%3ld) <- %s (%3ld/%3ld)\n", parent->dirname,
     436             :                parent->dupe_count, parent->file_count, subdir->dirname,
     437             :                subdir->dupe_count, subdir->file_count);
     438             : #endif
     439             : 
     440             :     /**
     441             :      * Here's something weird:
      442             :      * - a counter is used and subtracted at once from parent->dupe_count.
      443             :      * - it would of course be nicer to subtract it step by step.
      444             :      * - but for some weird reason this only works on clang, not gcc.
     445             :      * - yes, what. But I tested this, I promise!
     446             :      */
     447        7420 :     for(GList *iter = subdir->known_files.head; iter; iter = iter->next) {
     448        4060 :         int c = rm_directory_add(parent, (RmFile *)iter->data);
     449        4060 :         parent->dupe_count -= c;
     450             :     }
     451             : 
     452             :     /* Inherit the child's checksum */
     453        3360 :     unsigned char *subdir_cksum = rm_digest_steal(subdir->digest);
     454        3360 :     rm_digest_update(parent->digest, subdir_cksum, subdir->digest->bytes);
     455        3360 :     g_slice_free1(subdir->digest->bytes, subdir_cksum);
     456             : 
     457        3360 :     subdir->was_merged = true;
     458             : }
     459             : 
     460             : ///////////////////////////
     461             : // TREE MERGER ALGORITHM //
     462             : ///////////////////////////
     463             : 
     464        1120 : static void rm_tm_chunk_flush(RmTreeMerger *self, char **out_paths, int n_paths) {
     465        1120 :     rm_tm_count_files(&self->count_tree, out_paths, self->session);
     466             : 
     467        1120 :     if(self->session->cfg->use_meta_cache) {
     468          88 :         for(int i = 0; i < n_paths; ++i) {
     469          48 :             g_free(out_paths[i]);
     470             :         }
     471             :     }
     472        1120 : }
     473             : 
     474        1120 : static void rm_tm_chunk_paths(RmTreeMerger *self, char **paths) {
      475             :     /* Count only up to 512 paths at a time. Path counts this high can
      476             :      * happen if the output of find is piped into rmlint via the special "-" file.
      477             :      * Sadly, counting them all at once would need all paths in memory at the
      478             :      * same time. With session->cfg->use_meta_cache, there is only an ID in the
      479             :      * path pointer.
      480             :      */
     481             : 
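                      :     /* Worked example (illustrative numbers): 1300 paths read from stdin would be
                      :      * counted in chunks of 512, 512 and 276, each flushed separately through
                      :      * rm_tm_chunk_flush().
                      :      */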
     482        1120 :     RmCfg *cfg = self->session->cfg;
     483             : 
     484        1120 :     const int N = 512;
     485             : 
     486        1120 :     int n_paths = 0;
     487        1120 :     char **out_paths = g_malloc0((N + 1) * sizeof(char *));
     488             : 
     489        2464 :     for(int i = 0; paths[i]; ++i) {
     490        1344 :         if(cfg->use_meta_cache) {
     491             :             char buf[PATH_MAX];
     492             : 
     493          96 :             rm_swap_table_lookup(self->session->meta_cache,
     494          48 :                                  self->session->meta_cache_dir_id,
     495          48 :                                  GPOINTER_TO_UINT(paths[i]), buf, sizeof(buf));
     496             : 
     497          48 :             out_paths[n_paths] = g_strdup(buf);
     498             :         } else {
     499        1296 :             out_paths[n_paths] = paths[i];
     500             :         }
     501             : 
     502             :         /* Terminate the vector by a guarding NULL */
     503        1344 :         out_paths[++n_paths] = NULL;
     504             : 
     505             :         /* We reached the size of one chunk, flush and wrap around */
     506        1344 :         if(n_paths == N) {
     507           0 :             rm_tm_chunk_flush(self, out_paths, n_paths);
     508           0 :             n_paths = 0;
     509             :         }
     510             :     }
     511             : 
     512             :     /* Flush the rest of it */
     513        1120 :     if(n_paths) {
     514        1120 :         rm_tm_chunk_flush(self, out_paths, n_paths);
     515             :     }
     516             : 
     517        1120 :     g_free(out_paths);
     518        1120 : }
     519             : 
     520        1120 : RmTreeMerger *rm_tm_new(RmSession *session) {
     521        1120 :     RmTreeMerger *self = g_slice_new(RmTreeMerger);
     522        1120 :     self->session = session;
     523        1120 :     g_queue_init(&self->valid_dirs);
     524             : 
     525        1120 :     self->result_table = g_hash_table_new_full((GHashFunc)rm_directory_hash,
     526             :                                                (GEqualFunc)rm_directory_equal, NULL,
     527             :                                                (GDestroyNotify)g_queue_free);
     528             : 
     529        1120 :     self->file_groups =
     530        1120 :         g_hash_table_new_full((GHashFunc)rm_digest_hash, (GEqualFunc)rm_digest_equal,
     531             :                               NULL, (GDestroyNotify)g_queue_free);
     532             : 
     533        1120 :     self->known_hashs = g_hash_table_new_full(NULL, NULL, NULL, NULL);
     534        1120 :     self->was_printed = g_hash_table_new_full(NULL, NULL, NULL, NULL);
     535             : 
     536        1120 :     rm_trie_init(&self->dir_tree);
     537        1120 :     rm_trie_init(&self->count_tree);
     538             : 
     539        1120 :     rm_tm_chunk_paths(self, session->cfg->paths);
     540             : 
     541        1120 :     return self;
     542             : }
     543             : 
     544        4284 : int rm_tm_destroy_iter(_U RmTrie *self, RmNode *node, _U int level, RmTreeMerger *tm) {
     545        4284 :     RmDirectory *directory = node->data;
     546             : 
     547        8344 :     for(GList *iter = directory->known_files.head; iter; iter = iter->next) {
     548        4060 :         if(g_hash_table_contains(tm->was_printed, iter->data)) {
     549        2744 :             continue;
     550             :         }
     551             : 
     552        1316 :         rm_file_destroy((RmFile *)iter->data);
     553             :     }
     554             : 
     555        4284 :     rm_directory_free(directory);
     556        4284 :     return 0;
     557             : }
     558             : 
     559        1120 : void rm_tm_destroy(RmTreeMerger *self) {
     560        1120 :     g_hash_table_unref(self->result_table);
     561        1120 :     g_hash_table_unref(self->file_groups);
     562             : 
     563        1120 :     GList *digest_keys = g_hash_table_get_keys(self->known_hashs);
     564        1120 :     g_list_free_full(digest_keys, (GDestroyNotify)rm_digest_free);
     565        1120 :     g_hash_table_unref(self->known_hashs);
     566             : 
     567        1120 :     g_queue_clear(&self->valid_dirs);
     568             : 
     569             :     /* Kill all RmDirectories stored in the tree */
     570        1120 :     rm_trie_iter(&self->dir_tree, NULL, true, false,
     571             :                  (RmTrieIterCallback)rm_tm_destroy_iter, self);
     572             : 
     573        1120 :     rm_trie_destroy(&self->dir_tree);
     574        1120 :     rm_trie_destroy(&self->count_tree);
     575        1120 :     g_hash_table_unref(self->was_printed);
     576             : 
     577        1120 :     g_slice_free(RmTreeMerger, self);
     578        1120 : }
     579             : 
     580        4032 : static void rm_tm_insert_dir(RmTreeMerger *self, RmDirectory *directory) {
     581        4032 :     if(directory->was_inserted) {
     582        1176 :         return;
     583             :     }
     584             : 
     585        2856 :     GQueue *dir_queue =
     586        2856 :         rm_hash_table_setdefault(self->result_table, directory, (RmNewFunc)g_queue_new);
     587        2856 :     g_queue_push_head(dir_queue, directory);
     588        2856 :     directory->was_inserted = true;
     589             : }
     590             : 
     591        4060 : void rm_tm_feed(RmTreeMerger *self, RmFile *file) {
     592        4060 :     RM_DEFINE_PATH(file);
     593        4060 :     char *dirname = g_path_get_dirname(file_path);
     594             : 
     595             :     /* See if we know that directory already */
     596        4060 :     RmDirectory *directory = rm_trie_search(&self->dir_tree, dirname);
     597             : 
     598        4060 :     if(directory == NULL) {
     599             :         /* Get the actual file count */
     600        2772 :         int file_count = GPOINTER_TO_INT(rm_trie_search(&self->count_tree, dirname));
     601        2772 :         if(file_count == 0) {
     602           0 :             rm_log_error(
     603             :                 RED "Empty directory or weird RmFile encountered; rejecting.\n" RESET);
     604           0 :             file_count = -1;
     605             :         }
     606             : 
     607        2772 :         directory = rm_directory_new(dirname);
     608        2772 :         directory->file_count = file_count;
     609             : 
     610             :         /* Make the new directory known */
     611        2772 :         rm_trie_insert(&self->dir_tree, dirname, directory);
     612             : 
     613        2772 :         g_queue_push_head(&self->valid_dirs, directory);
     614             :     } else {
     615        1288 :         g_free(dirname);
     616             :     }
     617             : 
     618        4060 :     rm_directory_add(directory, file);
     619             : 
     620             :     /* Add the file to this directory */
     621        4060 :     g_queue_push_head(&directory->known_files, file);
     622             : 
     623             :     /* Remember the digest (if only to free it later...) */
     624        4060 :     g_hash_table_replace(self->known_hashs, file->digest, NULL);
     625             : 
     626             :     /* Check if the directory reached the number of actual files in it */
     627        4060 :     if(directory->dupe_count == directory->file_count && directory->file_count > 0) {
     628        1680 :         rm_tm_insert_dir(self, directory);
     629             :     }
     630        4060 : }
     631             : 
     632        3192 : static void rm_tm_mark_finished(RmTreeMerger *self, RmDirectory *directory) {
     633        3192 :     if(directory->finished) {
     634           0 :         return;
     635             :     }
     636             : 
     637        3192 :     directory->finished = true;
     638             : 
     639             :     /* Recursively propagate to children */
     640        4704 :     for(GList *iter = directory->children.head; iter; iter = iter->next) {
     641        1512 :         rm_tm_mark_finished(self, (RmDirectory *)iter->data);
     642             :     }
     643             : }
     644             : 
     645        1792 : static void rm_tm_mark_original_files(RmTreeMerger *self, RmDirectory *directory) {
     646        1792 :     directory->finished = false;
     647             : 
     648             :     /* Recursively propagate to children */
     649        2632 :     for(GList *iter = directory->children.head; iter; iter = iter->next) {
     650         840 :         RmDirectory *child = iter->data;
     651         840 :         rm_tm_mark_original_files(self, child);
     652             :     }
     653        1792 : }
     654             : 
     655        1400 : static gint64 rm_tm_mark_duplicate_files(RmTreeMerger *self, RmDirectory *directory) {
     656        1400 :     gint64 acc = 0;
     657             : 
     658        2716 :     for(GList *iter = directory->known_files.head; iter; iter = iter->next) {
     659        1316 :         RmFile *file = iter->data;
     660        1316 :         acc += file->is_prefd;
     661             :     }
     662             : 
     663             :     /* Recursively propagate to children */
     664        2072 :     for(GList *iter = directory->children.head; iter; iter = iter->next) {
     665         672 :         RmDirectory *child = iter->data;
     666         672 :         acc += rm_tm_mark_duplicate_files(self, child);
     667             :     }
     668             : 
     669        1400 :     return acc;
     670             : }
     671             : 
     672         168 : static void rm_tm_write_unfinished_cksums(RmTreeMerger *self, RmDirectory *directory) {
     673         336 :     for(GList *iter = directory->known_files.head; iter; iter = iter->next) {
     674         168 :         RmFile *file = iter->data;
     675         168 :         file->lint_type = RM_LINT_TYPE_UNFINISHED_CKSUM;
     676         168 :         rm_fmt_write(file, self->session->formats, -1);
     677         168 :         g_hash_table_add(self->was_printed, file);
     678             :     }
     679             : 
     680             :     /* Recursively propagate to children */
     681         168 :     for(GList *iter = directory->children.head; iter; iter = iter->next) {
     682           0 :         RmDirectory *child = iter->data;
     683           0 :         rm_tm_write_unfinished_cksums(self, child);
     684             :     }
     685         168 : }
     686             : 
     687        4684 : static int rm_tm_sort_paths(const RmDirectory *da, const RmDirectory *db,
     688             :                             _U RmTreeMerger *self) {
     689        4684 :     return da->depth - db->depth;
     690             : }
     691             : 
     692        3060 : static int rm_tm_sort_paths_reverse(const RmDirectory *da, const RmDirectory *db,
     693             :                                     _U RmTreeMerger *self) {
     694        3060 :     return -rm_tm_sort_paths(da, db, self);
     695             : }
     696             : 
     697         728 : static int rm_tm_sort_orig_criteria(const RmDirectory *da, const RmDirectory *db,
     698             :                                     RmTreeMerger *self) {
     699         728 :     RmCfg *cfg = self->session->cfg;
     700             : 
     701         728 :     if(da->prefd_files - db->prefd_files) {
     702          56 :         if(cfg->keep_all_tagged) {
     703          28 :             return db->prefd_files - da->prefd_files;
     704             :         } else {
     705          28 :             return da->prefd_files - db->prefd_files;
     706             :         }
     707             :     }
     708             : 
     709        2016 :     return rm_pp_cmp_orig_criteria_impl(
     710             :         self->session, da->metadata.dir_mtime, db->metadata.dir_mtime,
     711         672 :         rm_util_basename(da->dirname), rm_util_basename(db->dirname), 0, 0,
     712        1344 :         rm_util_path_depth(da->dirname), rm_util_path_depth(db->dirname));
     713             : }
     714             : 
     715        6972 : static void rm_tm_forward_unresolved(RmTreeMerger *self, RmDirectory *directory) {
     716        6972 :     if(directory->finished == true) {
     717        4088 :         return;
     718             :     } else {
     719        2884 :         directory->finished = true;
     720             :     }
     721             : 
     722        5628 :     for(GList *iter = directory->known_files.head; iter; iter = iter->next) {
     723        2744 :         RmFile *file = iter->data;
     724             : 
     725        2744 :         GQueue *file_list = rm_hash_table_setdefault(self->file_groups, file->digest,
     726             :                                                      (RmNewFunc)g_queue_new);
     727        2744 :         g_queue_push_head(file_list, file);
     728             :     }
     729             : 
     730             :     /* Recursively propagate to children */
     731        5572 :     for(GList *iter = directory->children.head; iter; iter = iter->next) {
     732        2688 :         rm_tm_forward_unresolved(self, (RmDirectory *)iter->data);
     733             :     }
     734             : }
     735             : 
     736        4284 : static int rm_tm_iter_unfinished_files(_U RmTrie *trie, RmNode *node, _U int level,
     737             :                                        _U void *user_data) {
     738        4284 :     RmTreeMerger *self = user_data;
     739        4284 :     rm_tm_forward_unresolved(self, node->data);
     740        4284 :     return 0;
     741             : }
     742             : 
     743         923 : static int rm_tm_cmp_directory_groups(GQueue *a, GQueue *b) {
     744         923 :     if(a->length == 0 || b->length == 0) {
     745           0 :         return b->length - a->length;
     746             :     }
     747             : 
     748         923 :     RmDirectory *first_a = a->head->data;
     749         923 :     RmDirectory *first_b = b->head->data;
     750         923 :     return first_b->mergeups - first_a->mergeups;
     751             : }
     752             : 
     753        1120 : static void rm_tm_extract(RmTreeMerger *self) {
      754             :     /* Iterate over all directories grouped per hash (which are therefore equal) */
     755        1120 :     RmCfg *cfg = self->session->cfg;
     756        1120 :     GList *result_table_values = g_hash_table_get_values(self->result_table);
     757        1120 :     result_table_values =
     758             :         g_list_sort(result_table_values, (GCompareFunc)rm_tm_cmp_directory_groups);
     759             : 
     760        2576 :     for(GList *iter = result_table_values; iter; iter = iter->next) {
     761             :         /* Needs at least two directories to be duplicate... */
     762        1456 :         GQueue *dir_list = iter->data;
     763             : 
     764             : #ifdef _RM_TREEMERGE_DEBUG
     765             :         for(GList *i = dir_list->head; i; i = i->next) {
     766             :             RmDirectory *d = i->data;
     767             :             char buf[512];
     768             :             memset(buf, 0, sizeof(buf));
     769             :             rm_digest_hexstring(d->digest, buf);
     770             :             g_printerr("    mergeups=%" LLU ": %s - %s\n", d->mergeups, d->dirname, buf);
     771             :         }
     772             :         g_printerr("---\n");
     773             : #endif
     774        1456 :         if(dir_list->length < 2) {
     775         504 :             continue;
     776             :         }
     777             : 
     778         952 :         if(rm_session_was_aborted(self->session)) {
     779           0 :             break;
     780             :         }
     781             : 
     782             :         /* List of result directories */
     783         952 :         GQueue result_dirs = G_QUEUE_INIT;
     784             : 
     785             :         /* Sort the RmDirectory list by their path depth, lowest depth first */
     786         952 :         g_queue_sort(dir_list, (GCompareDataFunc)rm_tm_sort_paths, self);
     787             : 
     788             :         /* Output the directories and mark their children to prevent
     789             :          * duplicate directory reports in lower levels.
     790             :          */
     791        3304 :         for(GList *iter = dir_list->head; iter; iter = iter->next) {
     792        2352 :             RmDirectory *directory = iter->data;
     793        2352 :             if(directory->finished == false) {
     794        1680 :                 rm_tm_mark_finished(self, directory);
     795        1680 :                 g_queue_push_head(&result_dirs, directory);
     796             :             }
     797             :         }
     798             : 
      799             :         /* Make sure the original directory lands first
     800             :          * in the result_dirs queue.
     801             :          */
     802         952 :         g_queue_sort(&result_dirs, (GCompareDataFunc)rm_tm_sort_orig_criteria, self);
     803             : 
     804         952 :         GQueue file_adaptor_group = G_QUEUE_INIT;
     805             : 
     806        2632 :         for(GList *iter = result_dirs.head; iter; iter = iter->next) {
     807        1680 :             RmDirectory *directory = iter->data;
     808        1680 :             RmFile *mask = rm_directory_as_file(self, directory);
     809        1680 :             g_queue_push_tail(&file_adaptor_group, mask);
     810             : 
     811        1680 :             if(iter == result_dirs.head) {
     812             :                 /* First one in the group -> It's the original */
     813         952 :                 mask->is_original = true;
     814         952 :                 rm_tm_mark_original_files(self, directory);
     815             :             } else {
     816         728 :                 gint64 prefd = rm_tm_mark_duplicate_files(self, directory);
     817         728 :                 if(prefd == directory->dupe_count && cfg->keep_all_tagged) {
      818             :                     /* Mark the directory as original when all files in it are preferred. */
     819          28 :                     mask->is_original = true;
     820         700 :                 } else if(prefd == 0 && cfg->keep_all_untagged) {
     821          28 :                     mask->is_original = true;
     822             :                 }
     823             :             }
     824             : 
     825        1680 :             if(self->session->cfg->write_unfinished) {
     826         168 :                 rm_tm_write_unfinished_cksums(self, directory);
     827             :             }
     828             :         }
     829             : 
     830         952 :         if(result_dirs.length >= 2) {
     831         728 :             rm_shred_forward_to_output(self->session, &file_adaptor_group);
     832             :         } else {
     833         224 :             g_queue_foreach(&file_adaptor_group, (GFunc)g_free, NULL);
     834             :         }
     835             : 
     836         952 :         g_queue_clear(&file_adaptor_group);
     837         952 :         g_queue_clear(&result_dirs);
     838             :     }
     839             : 
     840        1120 :     g_list_free(result_table_values);
     841             : 
     842             :     /* Iterate over all non-finished dirs in the tree,
      843             :      * and grab unfinished files that must be dupes elsewhere.
     844             :      */
     845        1120 :     rm_trie_iter(&self->dir_tree, NULL, true, false, rm_tm_iter_unfinished_files, self);
     846             : 
     847             :     /* Now here's a problem. Consider an input like this:
     848             :      *  /root
     849             :      *  ├── a
     850             :      *  ├── sub1
     851             :      *  │   ├── a
     852             :      *  │   └── b
     853             :      *  └── sub2
     854             :      *      ├── a
     855             :      *      └── b
     856             :      *
     857             :      *  This yields two duplicate dirs (sub1, sub2)
     858             :      *  and one duplicate, unmatched file (a).
     859             :      *
     860             :      *  For outputting files we need groups, which consist of at least 2 files.
      861             :      *  So how do we group them so we don't end up deleting a file many times?
      862             :      *  We always choose which directories are originals first, so we flag all
      863             :      *  files in them as originals.
     864             :      */
     865             :     GHashTableIter iter;
     866        1120 :     g_hash_table_iter_init(&iter, self->file_groups);
     867             : 
     868        1120 :     GQueue *file_list = NULL;
     869        3612 :     while(g_hash_table_iter_next(&iter, NULL, (void **)&file_list)) {
     870        1372 :         GList *next = NULL;
     871        4116 :         for(GList *iter = file_list->head; iter; iter = next) {
     872        2744 :             RmFile *file = iter->data;
     873        2744 :             next = iter->next;
     874             : 
      875             :             /* With --partial-hidden we do not want to output hidden files */
     876        2744 :             if(self->session->cfg->partial_hidden && file->is_hidden) {
     877          84 :                 g_queue_delete_link(file_list, iter);
     878          84 :                 continue;
     879             :             }
     880             : 
     881        2660 :             g_hash_table_add(self->was_printed, file);
     882             :         }
     883             : 
     884        1372 :         if(file_list->length >= 2) {
     885             :             /* If no separate duplicate files are requested, we can stop here */
     886         868 :             if(self->session->cfg->find_duplicates == false) {
     887           0 :                 self->session->dup_group_counter -= 1;
     888           0 :                 self->session->dup_counter -= file_list->length - 1;
     889             :             } else {
     890         868 :                 rm_shred_group_find_original(self->session, file_list);
     891         868 :                 rm_shred_forward_to_output(self->session, file_list);
     892             :             }
     893             :         }
     894             :     }
     895        1120 : }
     896             : 
     897        5124 : static void rm_tm_cluster_up(RmTreeMerger *self, RmDirectory *directory) {
     898        5124 :     char *parent_dir = g_path_get_dirname(directory->dirname);
     899        5124 :     bool is_root = strcmp(parent_dir, "/") == 0;
     900             : 
     901             :     /* Lookup if we already found this parent before (if yes, merge with it) */
     902        5124 :     RmDirectory *parent = rm_trie_search(&self->dir_tree, parent_dir);
     903             : 
     904        5124 :     if(parent == NULL) {
     905             :         /* none yet, basically copy child */
     906        1512 :         parent = rm_directory_new(parent_dir);
     907        1512 :         rm_trie_insert(&self->dir_tree, parent_dir, parent);
     908             : 
     909             :         /* Get the actual file count */
     910        1512 :         parent->file_count =
     911        1512 :             GPOINTER_TO_UINT(rm_trie_search(&self->count_tree, parent_dir));
     912             : 
     913             :     } else {
     914        3612 :         g_free(parent_dir);
     915             :     }
     916             : 
     917        5124 :     rm_directory_add_subdir(parent, directory);
     918             : 
     919        5124 :     if(parent->dupe_count == parent->file_count && parent->file_count > 0) {
     920        2352 :         rm_tm_insert_dir(self, parent);
     921        2352 :         if(!is_root) {
     922        2352 :             rm_tm_cluster_up(self, parent);
     923             :         }
     924             :     }
     925        5124 : }
     926             : 
     927        1120 : void rm_tm_finish(RmTreeMerger *self) {
      928             :     /* Iterate over all valid directories and try to merge them up through all layers.
     929             :      */
     930        1120 :     g_queue_sort(&self->valid_dirs, (GCompareDataFunc)rm_tm_sort_paths_reverse, self);
     931        3892 :     for(GList *iter = self->valid_dirs.head; iter; iter = iter->next) {
     932        2772 :         RmDirectory *directory = iter->data;
     933        2772 :         rm_tm_cluster_up(self, directory);
     934             : #ifdef _RM_TREEMERGE_DEBUG
     935             :         g_printerr("###\n");
     936             : #endif
     937             :     }
     938             : 
     939        1120 :     if(!rm_session_was_aborted(self->session)) {
      940             :         /* Hand over to the extraction phase */
     941        1120 :         rm_tm_extract(self);
     942             :     }
     943        1120 : }

Generated by: LCOV version 1.11