From 7d66aeef6e3bf56cd4640fbfc9881879a8edf207 Mon Sep 17 00:00:00 2001 From: sreedev Date: Mon, 23 Jan 2023 19:02:09 -0500 Subject: [PATCH 1/6] version 0.1.5 --- Cargo.lock | 2 +- Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 39e467f..297c101 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -324,7 +324,7 @@ dependencies = [ [[package]] name = "deduplicator" -version = "0.1.4" +version = "0.1.5" dependencies = [ "anyhow", "bytesize", diff --git a/Cargo.toml b/Cargo.toml index 8152c50..367fb2b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "deduplicator" -version = "0.1.4" +version = "0.1.5" edition = "2021" description = "find,filter,delete Duplicates" license = "MIT" From fa12f85b6a9194b451c3b00e7c2f7af7d078d89e Mon Sep 17 00:00:00 2001 From: sreedev Date: Mon, 23 Jan 2023 19:12:55 -0500 Subject: [PATCH 2/6] make --dir a positional argument --- src/params.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/params.rs b/src/params.rs index 23817ce..53e96ef 100644 --- a/src/params.rs +++ b/src/params.rs @@ -10,8 +10,8 @@ pub struct Params { /// Filetypes to deduplicate (default = all) #[arg(short, long)] pub types: Option, - /// Run Deduplicator on dir different from pwd - #[arg(long, value_hint = ValueHint::DirPath)] + /// Run Deduplicator on dir different from pwd (e.g., ~/Pictures ) + #[arg(value_hint = ValueHint::DirPath, value_name = "scan_dir_path")] pub dir: Option, /// Delete files interactively #[arg(long, short)] From 9f4d9139b6d17873dae6d365f0cdd86f5df3a86d Mon Sep 17 00:00:00 2001 From: sreedev Date: Mon, 23 Jan 2023 20:03:08 -0500 Subject: [PATCH 3/6] added --min-depth --max-depth --follow-links and renamed --minsize to --min-size --- README.md | 17 ++++++++------- src/filters.rs | 4 ++-- src/params.rs | 56 +++++++++++++++++++++++++++++++++++++++++--------- src/scanner.rs | 2 +- 4 files changed, 59 insertions(+), 20 deletions(-) diff --git 
a/README.md b/README.md index ca11908..22f9ffe 100644 --- a/README.md +++ b/README.md @@ -10,12 +10,15 @@ Usage: deduplicator [OPTIONS] Options: - -t, --types Filetypes to deduplicate (default = all) - --dir Run Deduplicator on dir different from pwd - -i, --interactive Delete files interactively - -m, --minsize Minimum filesize of duplicates to scan (e.g., 100B/1K/2M/3G/4T). [default = 0] - -h, --help Print help information - -V, --version Print version information + -t, --types Filetypes to deduplicate [default = all] + --dir Run Deduplicator on dir different from pwd + -i, --interactive Delete files interactively + --min-size Minimum filesize of duplicates to scan (e.g., 100B/1K/2M/3G/4T) [default: 1b] + -d, --max-depth Max Depth to scan while looking for duplicates + --min-depth Min Depth to scan while looking for duplicates + --follow-links Follow links while scanning directories + -h, --help Print help information + -V, --version Print version information ``` ## Installation @@ -75,7 +78,7 @@ Deduplicator uses size comparison and fxhash (a non non-cryptographic hashing al |:---|:---|---:|---:|---:|---:| | `deduplicator --dir ~/Data/tmp` | (~120G) | 27.5 ± 1.0 | 26.0 | 32.1 | 1.70 ± 0.09 | | `deduplicator --dir ~/Data/books` | (~8.6G) | 21.8 ± 0.7 | 20.5 | 24.4 | 1.35 ± 0.07 | -| `deduplicator --dir ~/Data/books --minsize 10M` | (~8.6G) | 16.1 ± 0.6 | 14.9 | 18.8 | 1.00 | +| `deduplicator --dir ~/Data/books --min-size 10M` | (~8.6G) | 16.1 ± 0.6 | 14.9 | 18.8 | 1.00 | | `deduplicator --dir ~/Data/ --types pdf,jpg,png,jpeg` | (~290G) | 1857.4 ± 24.5 | 1817.0 | 1895.5 | 115.07 ± 4.64 | * The last entry is lower because of the number of files deduplicator had to go through (~660895 Files). The average size of the files rarely affect the performance of deduplicator. 
diff --git a/src/filters.rs b/src/filters.rs index db95efc..71e67e4 100644 --- a/src/filters.rs +++ b/src/filters.rs @@ -1,8 +1,8 @@ use crate::file_manager::File; use crate::params::Params; -pub fn is_file_gt_minsize(app_opts: &Params, file: &File) -> bool { - match app_opts.get_minsize() { +pub fn is_file_gt_min_size(app_opts: &Params, file: &File) -> bool { + match app_opts.get_min_size() { Some(msize) => match file.size { Some(fsize) => fsize >= msize, None => true, diff --git a/src/params.rs b/src/params.rs index 23817ce..1ba1d9b 100644 --- a/src/params.rs +++ b/src/params.rs @@ -7,7 +7,7 @@ use globwalk::{GlobWalker, GlobWalkerBuilder}; #[derive(Parser, Debug)] #[command(author, version, about, long_about = None)] pub struct Params { - /// Filetypes to deduplicate (default = all) + /// Filetypes to deduplicate [default = all] #[arg(short, long)] pub types: Option, /// Run Deduplicator on dir different from pwd @@ -16,14 +16,23 @@ pub struct Params { /// Delete files interactively #[arg(long, short)] pub interactive: bool, - /// Minimum filesize of duplicates to scan (e.g., 100B/1K/2M/3G/4T). [default = 0] - #[arg(long, short)] - pub minsize: Option, + /// Minimum filesize of duplicates to scan (e.g., 100B/1K/2M/3G/4T). 
+ #[arg(long, default_value = "1b")] + pub min_size: Option, + /// Max Depth to scan while looking for duplicates + #[arg(long, short = 'd')] + pub max_depth: Option, + /// Min Depth to scan while looking for duplicates + #[arg(long)] + pub min_depth: Option, + /// Follow links while scanning directories + #[arg(long)] + pub follow_links: bool, } impl Params { - pub fn get_minsize(&self) -> Option { - match &self.minsize { + pub fn get_min_size(&self) -> Option { + match &self.min_size { Some(msize) => match msize.parse::() { Ok(units) => Some(units.0), Err(_) => None, @@ -39,14 +48,41 @@ impl Params { Ok(dir) } + fn add_glob_min_depth(&self, builder: GlobWalkerBuilder) -> Result { + match self.min_depth { + Some(mindepth) => Ok(builder.min_depth(mindepth)), + None => Ok(builder), + } + } + + fn add_glob_max_depth(&self, builder: GlobWalkerBuilder) -> Result { + match self.max_depth { + Some(maxdepth) => Ok(builder.max_depth(maxdepth)), + None => Ok(builder), + } + } + + fn add_glob_follow_links(&self, builder: GlobWalkerBuilder) -> Result { + match self.follow_links { + true => Ok(builder.follow_links(true)), + false => Ok(builder.follow_links(false)), + } + } + pub fn get_glob_walker(&self) -> Result { let pattern: String = match self.types.as_ref() { Some(filetypes) => format!("**/*{{{filetypes}}}"), None => "**/*".to_string(), }; - // TODO: add params for maximum depth and following symlinks, then pass them to this builder - GlobWalkerBuilder::from_patterns(self.get_directory()?, &[pattern]) - .build() - .map_err(|e| anyhow!(e)) + + let glob_walker_builder = self + .add_glob_min_depth(GlobWalkerBuilder::from_patterns( + self.get_directory()?, + &[pattern], + )) + .and_then(|builder| self.add_glob_max_depth(builder)) + .and_then(|builder| self.add_glob_follow_links(builder))?; + + glob_walker_builder.build().map_err(|e| anyhow!(e)) } } diff --git a/src/scanner.rs b/src/scanner.rs index 27ac8f5..2ac99ae 100644 --- a/src/scanner.rs +++ b/src/scanner.rs @@ -63,7 
+63,7 @@ fn scan(app_opts: &Params) -> Result> { hash: None, size: Some(fs::metadata(fpath).unwrap().len()), }) - .filter(|file| filters::is_file_gt_minsize(app_opts, file)) + .filter(|file| filters::is_file_gt_min_size(app_opts, file)) .collect(); Ok(files) From f9b6d579689b7a44e5494394ae185b8fe0b0b649 Mon Sep 17 00:00:00 2001 From: sreedev Date: Mon, 23 Jan 2023 20:05:14 -0500 Subject: [PATCH 4/6] added short params --- src/params.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/params.rs b/src/params.rs index 04d4bd7..b467695 100644 --- a/src/params.rs +++ b/src/params.rs @@ -17,7 +17,7 @@ pub struct Params { #[arg(long, short)] pub interactive: bool, /// Minimum filesize of duplicates to scan (e.g., 100B/1K/2M/3G/4T). - #[arg(long, default_value = "1b")] + #[arg(long, short = 's', default_value = "1b")] pub min_size: Option, /// Max Depth to scan while looking for duplicates #[arg(long, short = 'd')] @@ -26,7 +26,7 @@ pub struct Params { #[arg(long)] pub min_depth: Option, /// Follow links while scanning directories - #[arg(long)] + #[arg(long, short)] pub follow_links: bool, } From 4d27da99f3011da9a08bb926df710e76d109918a Mon Sep 17 00:00:00 2001 From: sreedev Date: Mon, 23 Jan 2023 20:09:15 -0500 Subject: [PATCH 5/6] updated README.md --- README.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 22f9ffe..1a4bddb 100644 --- a/README.md +++ b/README.md @@ -7,16 +7,18 @@ ## Usage ```bash -Usage: deduplicator [OPTIONS] +Usage: deduplicator [OPTIONS] [scan_dir_path] + +Arguments: + [scan_dir_path] Run Deduplicator on dir different from pwd (e.g., ~/Pictures ) Options: -t, --types Filetypes to deduplicate [default = all] - --dir Run Deduplicator on dir different from pwd -i, --interactive Delete files interactively - --min-size Minimum filesize of duplicates to scan (e.g., 100B/1K/2M/3G/4T) [default: 1b] + -s, --min-size Minimum filesize of duplicates to scan (e.g., 
100B/1K/2M/3G/4T) [default: 1b] -d, --max-depth Max Depth to scan while looking for duplicates --min-depth Min Depth to scan while looking for duplicates - --follow-links Follow links while scanning directories + -f, --follow-links Follow links while scanning directories -h, --help Print help information -V, --version Print version information ``` From 6b06798e8e3749df2df2387843661c7b36aa672c Mon Sep 17 00:00:00 2001 From: sreedev Date: Mon, 23 Jan 2023 20:15:24 -0500 Subject: [PATCH 6/6] updated README.md --- README.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/README.md b/README.md index 1a4bddb..f18ea92 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,24 @@ Options: -h, --help Print help information -V, --version Print version information ``` +### Examples + +```bash +# Scan for duplicates recursively from the current dir, only look for png, jpg & pdf file types & interactively delete files +deduplicator -t pdf,jpg,png -i + +# Scan for duplicates recursively from the ~/Pictures dir, only look for png, jpeg, jpg & pdf file types & interactively delete files +deduplicator ~/Pictures/ -t png,jpeg,jpg,pdf -i + +# Scan for duplicates in the ~/Pictures dir without recursing into subdirectories +deduplicator ~/Pictures --max-depth 1 + +# look for duplicates in the ~/.config directory while also recursing into symbolic link paths +deduplicator ~/.config --follow-links + +# scan for duplicates that are at least 100mb in the ~/Media directory +deduplicator ~/Media --min-size 100mb +``` ## Installation