Line data Source code
1 : /**
2 : * This file is part of rmlint.
3 : *
4 : * rmlint is free software: you can redistribute it and/or modify
5 : * it under the terms of the GNU General Public License as published by
6 : * the Free Software Foundation, either version 3 of the License, or
7 : * (at your option) any later version.
8 : *
9 : * rmlint is distributed in the hope that it will be useful,
10 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 : * GNU General Public License for more details.
13 : *
14 : * You should have received a copy of the GNU General Public License
15 : * along with rmlint. If not, see <http://www.gnu.org/licenses/>.
16 : *
17 : * Authors:
18 : *
19 : * - Christopher <sahib> Pahl 2010-2015 (https://github.com/sahib)
20 : * - Daniel <SeeSpotRun> T. 2014-2015 (https://github.com/SeeSpotRun)
21 : *
22 : * Hosted on http://github.com/sahib/rmlint
23 : *
24 : */
25 :
26 : /* Internal headers */
27 : #include "config.h"
28 : #include "replay.h"
29 : #include "session.h"
30 : #include "formats.h"
31 : #include "file.h"
32 : #include "preprocess.h"
33 : #include "shredder.h"
34 :
35 : /* External libraries */
36 : #include <string.h>
37 : #include <glib.h>
38 : #include <glib/gstdio.h>
39 :
40 : #if HAVE_JSON_GLIB
41 : #include <json-glib/json-glib.h>
42 :
43 : /////////////////////////////////////////////////
44 : // POLLY THE PARROT REPEATS WHAT RMLINT SAID //
45 : /////////////////////////////////////////////////
46 :
47 : typedef struct RmParrot {
48 : /* Global session */
49 : RmSession *session;
50 :
51 : /* Json parser instance */
52 : JsonParser *parser;
53 :
54 : /* Root array of the json document */
55 : JsonArray *root;
56 :
57 : /* Last original file that we encountered */
58 : RmFile *last_original;
59 :
60 : /* Index inside the document
61 : * (0 is header, 1 first element, len(root) is footer)
62 : * */
63 : guint index;
64 :
65 : /* Set of diskids in cfg->paths */
66 : GHashTable *disk_ids;
67 : } RmParrot;
68 :
69 27 : static void rm_parrot_close(RmParrot *polly) {
70 27 : if(polly->parser) {
71 27 : g_object_unref(polly->parser);
72 : }
73 :
74 27 : g_hash_table_unref(polly->disk_ids);
75 27 : g_free(polly);
76 27 : }
77 :
78 27 : static RmParrot *rm_parrot_open(RmSession *session, const char *json_path,
79 : GError **error) {
80 27 : RmParrot *polly = g_malloc0(sizeof(RmParrot));
81 27 : polly->session = session;
82 27 : polly->parser = json_parser_new();
83 27 : polly->disk_ids = g_hash_table_new(NULL, NULL);
84 27 : polly->index = 1;
85 :
86 104 : for(int idx = 0; session->cfg->paths[idx]; ++idx) {
87 : RmStat stat_buf;
88 77 : const char *path = session->cfg->paths[idx];
89 :
90 77 : if(rm_sys_stat(path, &stat_buf) != -1) {
91 77 : g_hash_table_add(polly->disk_ids, GUINT_TO_POINTER(stat_buf.st_dev));
92 : }
93 : }
94 :
95 27 : if(!json_parser_load_from_file(polly->parser, json_path, error)) {
96 0 : goto failure;
97 : }
98 :
99 27 : JsonNode *root = json_parser_get_root(polly->parser);
100 27 : if(JSON_NODE_TYPE(root) != JSON_NODE_ARRAY) {
101 0 : g_set_error(error, RM_ERROR_QUARK, 0, _("No valid json cache (no array in /)"));
102 0 : goto failure;
103 : }
104 :
105 27 : polly->root = json_node_get_array(root);
106 27 : return polly;
107 :
108 : failure:
109 0 : rm_parrot_close(polly);
110 0 : return NULL;
111 : }
112 :
113 297 : static bool rm_parrot_has_next(RmParrot *polly) {
114 297 : return (polly->index < json_array_get_length(polly->root));
115 : }
116 :
117 81 : static RmFile *rm_parrot_try_next(RmParrot *polly) {
118 81 : if(!rm_parrot_has_next(polly)) {
119 0 : return NULL;
120 : }
121 :
122 81 : RmFile *file = NULL;
123 81 : const char *path = NULL;
124 81 : size_t path_len = 0;
125 :
126 81 : JsonObject *object = json_array_get_object_element(polly->root, polly->index);
127 :
128 : /* Deliver a higher index the next time, even if it fails */
129 81 : polly->index += 1;
130 :
131 : /* Read the path (without generating a warning if it's not there) */
132 81 : JsonNode *path_node = json_object_get_member(object, "path");
133 81 : if(path_node == NULL) {
134 27 : return NULL;
135 : }
136 :
137 54 : path = json_node_get_string(path_node);
138 54 : path_len = strlen(path);
139 :
140 54 : if(rm_trie_search_node(&polly->session->cfg->file_trie, path)) {
141 : /* We have this node already */
142 0 : return NULL;
143 : }
144 :
145 : /* Check for the lint type */
146 54 : RmLintType type =
147 54 : rm_file_string_to_lint_type(json_object_get_string_member(object, "type"));
148 :
149 54 : if(type == RM_LINT_TYPE_UNKNOWN) {
150 0 : rm_log_warning_line(_("lint type '%s' not recognised"),
151 : json_object_get_string_member(object, "type"));
152 0 : return NULL;
153 : }
154 :
155 : /* Collect file information (for rm_file_new) */
156 : RmStat lstat_buf, stat_buf;
157 54 : RmStat *stat_info = &lstat_buf;
158 54 : if(rm_sys_lstat(path, &lstat_buf) == -1) {
159 0 : return NULL;
160 : }
161 :
162 : /* use stat() after lstat() to find out if it's an symlink.
163 : * If it's a bad link, this will fail with stat_info still pointing to lstat.
164 : * */
165 54 : if(rm_sys_stat(path, &stat_buf) != -1) {
166 54 : stat_info = &stat_buf;
167 : }
168 :
169 : /* Check if we're late and issue an warning */
170 54 : JsonNode *mtime_node = json_object_get_member(object, "mtime");
171 108 : if(mtime_node &&
172 54 : json_node_get_int(mtime_node) < rm_sys_stat_mtime_seconds(stat_info)) {
173 0 : rm_log_warning_line(_("modification time of `%s` changed. Ignoring."), path);
174 0 : return NULL;
175 : }
176 :
177 : /* Fill up the RmFile */
178 54 : file = rm_file_new(polly->session, path, path_len, stat_info, type, 0, 0, 0);
179 54 : file->is_original = json_object_get_boolean_member(object, "is_original");
180 54 : file->is_symlink = (lstat_buf.st_mode & S_IFLNK);
181 54 : file->digest = rm_digest_new(RM_DIGEST_EXT, 0, 0, 0, 0);
182 54 : file->free_digest = true;
183 :
184 54 : if(file->is_original) {
185 27 : polly->last_original = file;
186 : }
187 :
188 54 : JsonNode *depth_node = json_object_get_member(object, "depth");
189 54 : if(depth_node != NULL) {
190 54 : file->depth = json_node_get_int(depth_node);
191 : }
192 :
193 : /* Fake the checksum using RM_DIGEST_EXT */
194 54 : JsonNode *cksum_node = json_object_get_member(object, "checksum");
195 54 : if(cksum_node != NULL) {
196 54 : const char *cksum = json_object_get_string_member(object, "checksum");
197 54 : if(cksum != NULL) {
198 54 : rm_digest_update(file->digest, (unsigned char *)cksum, strlen(cksum));
199 : }
200 : }
201 :
202 : /* Fix the hardlink relationship */
203 54 : JsonNode *hardlink_of = json_object_get_member(object, "hardlink_of");
204 54 : if(hardlink_of != NULL) {
205 0 : file->hardlinks.is_head = false;
206 0 : file->hardlinks.hardlink_head = polly->last_original;
207 : } else {
208 54 : file->hardlinks.is_head = true;
209 : }
210 :
211 54 : return file;
212 : }
213 :
214 81 : static RmFile *rm_parrot_next(RmParrot *polly) {
215 : /* Skip NULL entries */
216 189 : while(rm_parrot_has_next(polly)) {
217 81 : RmFile *file = NULL;
218 81 : if((file = rm_parrot_try_next(polly))) {
219 54 : return file;
220 : }
221 : }
222 :
223 27 : return NULL;
224 : }
225 :
226 : //////////////////////////////////
227 : // OPTION FILTERING CHECKS //
228 : //////////////////////////////////
229 :
/* Emit a bracketed reason @msg (a string literal) on the debug log, wrapped in RED/RESET */
#define FAIL_MSG(msg) rm_log_debug(RED "[" msg "]\n" RESET)
231 :
232 54 : static bool rm_parrot_check_depth(RmCfg *cfg, RmFile *file) {
233 54 : return (file->depth == 0 || file->depth <= cfg->depth);
234 : }
235 :
236 54 : static bool rm_parrot_check_size(RmCfg *cfg, RmFile *file) {
237 54 : if(cfg->limits_specified == false) {
238 54 : return true;
239 : }
240 :
241 0 : return ((cfg->minsize == (RmOff)-1 || cfg->minsize <= file->file_size) &&
242 0 : (cfg->maxsize == (RmOff)-1 || file->file_size <= cfg->maxsize));
243 : }
244 :
245 54 : static bool rm_parrot_check_hidden(RmCfg *cfg, _U RmFile *file, const char *file_path) {
246 54 : if(!cfg->ignore_hidden) {
247 2 : return true;
248 : }
249 :
250 52 : if(rm_util_path_is_hidden(file_path)) {
251 2 : FAIL_MSG("nope: hidden");
252 2 : return false;
253 : }
254 :
255 50 : return true;
256 : }
257 :
258 52 : static bool rm_parrot_check_permissions(RmCfg *cfg, _U RmFile *file,
259 : const char *file_path) {
260 52 : if(!cfg->permissions) {
261 52 : return true;
262 : }
263 :
264 0 : if(g_access(file_path, cfg->permissions) == -1) {
265 0 : FAIL_MSG("nope: permissions");
266 0 : return false;
267 : }
268 :
269 0 : return true;
270 : }
271 :
272 52 : static bool rm_parrot_check_crossdev(RmParrot *polly, _U RmFile *file) {
273 52 : if(polly->session->cfg->crossdev) {
274 0 : return true;
275 : }
276 :
277 52 : if(!g_hash_table_contains(polly->disk_ids, GUINT_TO_POINTER(file->dev))) {
278 0 : FAIL_MSG("nope: on other device");
279 0 : return false;
280 : }
281 :
282 52 : return true;
283 : }
284 :
285 52 : static bool rm_parrot_check_path(RmParrot *polly, RmFile *file, const char *file_path) {
286 52 : RmCfg *cfg = polly->session->cfg;
287 :
288 52 : size_t highest_match = 0;
289 :
290 : /* Find the highest matching path given on the commandline.
291 : * If found, the path_index and is_prefd information is taken from it.
292 : * If not found, the file will be discarded.
293 : *
294 : * If this turns out to be an performance problem, we could turn cfg->paths
295 : * into a RmTrie and use it to find the longest prefix easily.
296 : */
297 204 : for(int i = 0; cfg->paths[i]; ++i) {
298 152 : char *path = cfg->paths[i];
299 152 : size_t path_len = strlen(path);
300 :
301 152 : if(strncmp(file_path, path, path_len) == 0) {
302 102 : if(path_len > highest_match) {
303 102 : highest_match = path_len;
304 :
305 102 : file->is_prefd = cfg->is_prefd[i];
306 102 : file->path_index = i;
307 : }
308 : }
309 : }
310 :
311 52 : if(highest_match == 0) {
312 0 : FAIL_MSG("nope: no prefix");
313 : }
314 :
315 52 : return (highest_match > 0);
316 : }
317 :
318 52 : static bool rm_parrot_check_types(RmCfg *cfg, RmFile *file) {
319 52 : switch(file->lint_type) {
320 : case RM_LINT_TYPE_DUPE_CANDIDATE:
321 52 : return cfg->find_duplicates;
322 : case RM_LINT_TYPE_DUPE_DIR_CANDIDATE:
323 0 : return cfg->merge_directories;
324 : case RM_LINT_TYPE_BADLINK:
325 0 : return cfg->find_badlinks;
326 : case RM_LINT_TYPE_EMPTY_DIR:
327 0 : return cfg->find_emptydirs;
328 : case RM_LINT_TYPE_EMPTY_FILE:
329 0 : return cfg->find_emptyfiles;
330 : case RM_LINT_TYPE_NONSTRIPPED:
331 0 : return cfg->find_nonstripped;
332 : case RM_LINT_TYPE_BADUID:
333 : case RM_LINT_TYPE_BADGID:
334 : case RM_LINT_TYPE_BADUGID:
335 0 : return cfg->find_badids;
336 : case RM_LINT_TYPE_UNFINISHED_CKSUM:
337 0 : return cfg->write_unfinished;
338 : case RM_LINT_TYPE_UNKNOWN:
339 : default:
340 0 : FAIL_MSG("nope: invalid lint type.");
341 0 : return false;
342 : }
343 : }
344 :
345 : /////////////////////////////////////////////
346 : // GROUPWISE FIXES (SORT, FILTER, ...) //
347 : /////////////////////////////////////////////
348 :
349 26 : static void rm_parrot_fix_match_opts(RmParrotCage *cage, GQueue *group) {
350 26 : RmCfg *cfg = cage->session->cfg;
351 78 : if(!(cfg->match_with_extension || cfg->match_without_extension ||
352 52 : cfg->match_basename || cfg->unmatched_basenames)) {
353 26 : return;
354 : }
355 :
356 : /* That's probably a sucky way to do it, due to n^2,
357 : * but I doubt that will make a large performance difference.
358 : */
359 :
360 0 : GList *iter = group->head;
361 0 : while(iter) {
362 0 : RmFile *file_a = iter->data;
363 0 : bool delete = true;
364 :
365 0 : for(GList *sub_iter = group->head; sub_iter; sub_iter = sub_iter->next) {
366 0 : RmFile *file_b = sub_iter->data;
367 0 : if(file_a == file_b) {
368 0 : continue;
369 : }
370 :
371 0 : if(rm_file_equal(file_a, file_b)) {
372 0 : delete = false;
373 0 : break;
374 : }
375 : }
376 :
377 : /* Remove this file */
378 0 : if(delete) {
379 0 : GList *tmp = iter;
380 0 : iter = iter->next;
381 0 : rm_file_destroy(file_a);
382 0 : g_queue_delete_link(group, tmp);
383 : } else {
384 0 : iter = iter->next;
385 : }
386 : }
387 : }
388 :
389 26 : static void rm_parrot_fix_must_match_tagged(RmParrotCage *cage, GQueue *group) {
390 26 : RmCfg *cfg = cage->session->cfg;
391 26 : if(!(cfg->must_match_tagged || cfg->must_match_untagged)) {
392 7 : return;
393 : }
394 :
395 19 : bool has_prefd = false, has_non_prefd = false;
396 :
397 38 : for(GList *iter = group->head; iter; iter = iter->next) {
398 38 : RmFile *file = iter->data;
399 :
400 38 : has_prefd |= file->is_prefd;
401 38 : has_non_prefd |= !file->is_prefd;
402 38 : if(has_prefd && has_non_prefd) {
403 19 : break;
404 : }
405 : }
406 :
407 38 : if((!has_prefd && cfg->must_match_tagged) ||
408 19 : (!has_non_prefd && cfg->must_match_untagged)) {
409 0 : g_queue_foreach(group, (GFunc)rm_file_destroy, NULL);
410 0 : g_queue_clear(group);
411 : }
412 : }
413 :
414 52 : static void rm_parrot_update_stats(RmParrotCage *cage, RmFile *file) {
415 52 : RmSession *session = cage->session;
416 :
417 52 : session->total_files += 1;
418 52 : if(file->lint_type == RM_LINT_TYPE_DUPE_CANDIDATE) {
419 52 : session->dup_group_counter += file->is_original;
420 52 : if(!file->is_original) {
421 26 : session->dup_counter += 1;
422 26 : session->total_lint_size += file->file_size;
423 : }
424 : } else {
425 0 : session->other_lint_cnt += 1;
426 : }
427 52 : }
428 :
429 26 : static void rm_parrot_write_group(RmParrotCage *cage, GQueue *group) {
430 26 : RmCfg *cfg = cage->session->cfg;
431 :
432 26 : if(cfg->filter_mtime) {
433 0 : gsize older = 0;
434 0 : for(GList *iter = group->head; iter; iter = iter->next) {
435 0 : RmFile *file = iter->data;
436 0 : older += (file->mtime >= cfg->min_mtime);
437 : }
438 :
439 0 : if(older == group->length) {
440 0 : g_queue_foreach(group, (GFunc)rm_file_destroy, NULL);
441 0 : g_queue_clear(group);
442 0 : return;
443 : }
444 : }
445 :
446 26 : rm_parrot_fix_match_opts(cage, group);
447 26 : rm_parrot_fix_must_match_tagged(cage, group);
448 :
449 26 : g_queue_sort(group, (GCompareDataFunc)rm_shred_cmp_orig_criteria, cage->session);
450 :
451 78 : for(GList *iter = group->head; iter; iter = iter->next) {
452 52 : RmFile *file = iter->data;
453 :
454 78 : if(file == group->head->data || (cfg->keep_all_tagged && file->is_prefd) ||
455 26 : (cfg->keep_all_untagged && !file->is_prefd)) {
456 26 : file->is_original = true;
457 : } else {
458 26 : file->is_original = false;
459 : }
460 :
461 52 : rm_parrot_update_stats(cage, file);
462 52 : rm_fmt_write(file, cage->session->formats, group->length);
463 : }
464 : }
465 :
466 : /////////////////////////////////////////
467 : // ENTRY POINT TO TRIGGER THE PARROT //
468 : /////////////////////////////////////////
469 :
470 27 : static void rm_parrot_cage_push_group(RmParrotCage *cage, GQueue **group_ref,
471 : bool is_last) {
472 27 : GQueue *group = *group_ref;
473 27 : if(group->length > 1) {
474 26 : g_queue_push_tail(cage->groups, group);
475 : } else {
476 1 : g_queue_free_full(group, (GDestroyNotify)rm_file_destroy);
477 : }
478 :
479 27 : if(!is_last) {
480 0 : *group_ref = g_queue_new();
481 : }
482 27 : }
483 :
484 27 : bool rm_parrot_cage_load(RmParrotCage *cage, const char *json_path) {
485 27 : GError *error = NULL;
486 :
487 27 : rm_log_info_line(_("Loading json-results `%s'"), json_path);
488 27 : RmParrot *polly = rm_parrot_open(cage->session, json_path, &error);
489 :
490 27 : if(polly == NULL || error != NULL) {
491 0 : rm_log_warning_line("Error: %s", error->message);
492 0 : g_error_free(error);
493 0 : return false;
494 : }
495 :
496 27 : RmCfg *cfg = cage->session->cfg;
497 27 : GQueue *group = g_queue_new();
498 27 : RmDigest *last_digest = NULL;
499 :
500 : /* group of files; first group is "other lint" */
501 135 : while(rm_parrot_has_next(polly)) {
502 81 : RmFile *file = rm_parrot_next(polly);
503 81 : if(file == NULL) {
504 56 : continue;
505 : }
506 :
507 54 : RM_DEFINE_PATH(file);
508 54 : rm_log_debug("Checking `%s`: ", file_path);
509 :
510 : /* Check --size, --perms, --hidden */
511 160 : if(!(rm_parrot_check_depth(cfg, file) && rm_parrot_check_size(cfg, file) &&
512 106 : rm_parrot_check_hidden(cfg, file, file_path) &&
513 104 : rm_parrot_check_permissions(cfg, file, file_path) &&
514 156 : rm_parrot_check_types(cfg, file) && rm_parrot_check_crossdev(polly, file) &&
515 52 : rm_parrot_check_path(polly, file, file_path))) {
516 2 : rm_file_destroy(file);
517 2 : continue;
518 : }
519 :
520 52 : if(last_digest == NULL) {
521 26 : last_digest = rm_digest_copy(file->digest);
522 : }
523 :
524 52 : rm_log_debug("[okay]\n");
525 :
526 52 : if(!rm_digest_equal(file->digest, last_digest)) {
527 0 : rm_digest_free(last_digest);
528 0 : last_digest = rm_digest_copy(file->digest);
529 0 : rm_parrot_cage_push_group(cage, &group, false);
530 : }
531 :
532 52 : g_queue_push_tail(group, file);
533 : }
534 :
535 27 : rm_parrot_cage_push_group(cage, &group, true);
536 27 : rm_parrot_close(polly);
537 27 : return true;
538 : }
539 :
540 27 : void rm_parrot_cage_open(RmParrotCage *cage, RmSession *session) {
541 27 : cage->session = session;
542 27 : cage->groups = g_queue_new();
543 27 : }
544 :
545 27 : void rm_parrot_cage_close(RmParrotCage *cage) {
546 53 : for(GList *iter = cage->groups->head; iter; iter = iter->next) {
547 26 : GQueue *group = iter->data;
548 26 : if(group->length > 1) {
549 26 : rm_parrot_write_group(cage, group);
550 : } else {
551 0 : g_queue_free_full(group, (GDestroyNotify)rm_file_destroy);
552 : }
553 : }
554 :
555 27 : g_queue_free_full(cage->groups, (GDestroyNotify)g_queue_free);
556 27 : }
557 :
558 : #else
559 :
560 : bool rm_parrot_cage_load(_U RmParrotCage *cage, _U const char *json_path) {
561 : return false;
562 : }
563 :
564 : void rm_parrot_cage_open(_U RmParrotCage *cage, _U RmSession *session) {
565 : rm_log_error_line(_("json-glib is needed for using --replay."));
566 : rm_log_error_line(_("Please recompile `rmlint` with it installed."));
567 : }
568 :
569 : void rm_parrot_cage_close(_U RmParrotCage *cage) {
570 : }
571 :
572 : #endif
|