Skip to content

Commit

Permalink
datashed: add clean command (#41)
Browse files Browse the repository at this point in the history
Signed-off-by: Nico Wagner <[email protected]>
  • Loading branch information
nwagner84 authored Jul 16, 2024
1 parent 435826b commit 39dfe30
Show file tree
Hide file tree
Showing 7 changed files with 165 additions and 1 deletion.
44 changes: 44 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,4 @@ version = "4.5"

[workspace.dependencies.polars]
version = "0.41.3"
features = ["ipc", "dtype-categorical"]
features = ["ipc", "dtype-categorical", "lazy", "is_in"]
1 change: 1 addition & 0 deletions crates/datashed/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ bstr = { workspace = true }
clap = { workspace = true }
comfy-table = { version = "7.1.1" }
dataset = { workspace = true }
dialoguer = { version = "0.11.0" }
flate2 = { version = "1.0.30" }
glob = { workspace = true }
indicatif = { workspace = true }
Expand Down
1 change: 1 addition & 0 deletions crates/datashed/src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ pub(crate) struct Args {
#[derive(Debug, Subcommand)]
pub(crate) enum Command {
Archive(Archive),
Clean(Clean),
Config(Config),
Index(Index),
#[clap(alias = "new")]
Expand Down
115 changes: 115 additions & 0 deletions crates/datashed/src/commands/clean.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
use std::collections::BTreeSet;
use std::fs::{remove_file, File};

use clap::Parser;
use dialoguer::theme::ColorfulTheme;
use dialoguer::Confirm;
use glob::glob_with;
use indicatif::ProgressIterator;
use polars::prelude::*;

use crate::datashed::Datashed;
use crate::error::{DatashedError, DatashedResult};
use crate::progress::ProgressBarBuilder;
use crate::utils::relpath;

/// Template handed to `ProgressBarBuilder` while the data directory is
/// being scanned for documents. The `{...}` placeholders are left as-is
/// here and are expected to be filled in by the progress bar.
const PBAR_COLLECT: &str = concat!(
    "Collecting documents: {human_pos} | ",
    "elapsed: {elapsed_precise}{msg}"
);

/// Delete untracked documents and remove index entries whose document
/// is missing.
#[derive(Debug, Default, Parser)]
pub(crate) struct Clean {
    // NOTE(review): `verbose` is accepted on the command line but is not
    // read by `execute`; it currently only serves to conflict with
    // `--quiet`.
    /// Run verbosely. Print additional progress information to the
    /// standard error stream. This option conflicts with the
    /// `--quiet` option.
    #[arg(short, long, conflicts_with = "quiet")]
    verbose: bool,

    /// Operate quietly; do not show progress. This option conflicts
    /// with the `--verbose` option.
    #[arg(short, long, conflicts_with = "verbose")]
    quiet: bool,

    /// Do not ask for confirmation before deleting untracked documents
    /// or removing missing index entries.
    #[arg(short, long)]
    force: bool,
}

impl Clean {
    /// Ask the user to confirm a destructive step.
    ///
    /// Returns `Ok(true)` without prompting when `--force` was given.
    /// A prompt that fails (for example because no terminal is
    /// attached) is returned as [`DatashedError::Other`] rather than
    /// causing a panic.
    fn confirm(&self, prompt: String) -> DatashedResult<bool> {
        if self.force {
            return Ok(true);
        }

        Confirm::with_theme(&ColorfulTheme::default())
            .with_prompt(prompt)
            .default(true)
            .show_default(true)
            .interact()
            .map_err(|e| DatashedError::Other(e.to_string()))
    }

    /// Reconcile the documents on disk with the index.
    ///
    /// Every `*.txt` file below the data directory that has no matching
    /// `path` value in the index counts as *untracked*; every `path`
    /// value in the index without a matching file counts as *missing*.
    /// After confirmation (skipped with `--force`) untracked files are
    /// deleted and missing entries are filtered out of the index, which
    /// is then rewritten as a ZSTD-compressed IPC file.
    ///
    /// # Errors
    ///
    /// Returns an error if the datashed cannot be discovered, the glob
    /// pattern is invalid, the index cannot be read or contains a row
    /// without a path, a confirmation prompt fails, a file cannot be
    /// removed, or the index cannot be written.
    pub(crate) fn execute(self) -> DatashedResult<()> {
        let datashed = Datashed::discover()?;
        let data_dir = datashed.data_dir();
        let base_dir = datashed.base_dir();

        let pattern = format!("{}/**/*.txt", data_dir.display());
        let pbar = ProgressBarBuilder::new(PBAR_COLLECT, self.quiet).build();

        // Start with every document found on disk; entries are removed
        // below as they are matched against the index. Paths that glob
        // fails to read are skipped (best effort).
        let mut untracked: BTreeSet<_> = glob_with(&pattern, Default::default())
            .map_err(|e| DatashedError::Other(e.to_string()))?
            .progress_with(pbar)
            .filter_map(Result::ok)
            .map(|path| relpath(path, base_dir))
            .collect();

        let index = datashed.index()?;
        let path = index.column("path")?.str()?;

        let mut missing: Vec<_> = Vec::new();
        for idx in 0..index.height() {
            // A null path cannot be matched against the file system;
            // report it instead of panicking.
            let Some(index_path) = path.get(idx) else {
                return Err(DatashedError::Other(format!(
                    "index row {idx} has no path"
                )));
            };

            if !untracked.remove(index_path) {
                missing.push(index_path);
            }
        }

        if !untracked.is_empty() {
            let prompt =
                format!("Delete {} untracked document(s)?", untracked.len());

            if self.confirm(prompt)? {
                for document in untracked {
                    remove_file(base_dir.join(document))?;
                }
            }
        }

        if !missing.is_empty() {
            let prompt =
                format!("Delete {} missing index entries?", missing.len());

            if self.confirm(prompt)? {
                // Copy the borrowed paths into an owned series so that
                // `index` can be consumed by `lazy()` below.
                let missing = Series::from_iter(missing);
                let mut df = index
                    .lazy()
                    .filter(col("path").is_in(lit(missing)).not())
                    .collect()?;

                let path = base_dir.join(Datashed::INDEX);
                let mut writer = IpcWriter::new(File::create(path)?)
                    .with_compression(Some(IpcCompression::ZSTD));
                writer.finish(&mut df)?;
            }
        }

        Ok(())
    }
}
2 changes: 2 additions & 0 deletions crates/datashed/src/commands/mod.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
pub(crate) use archive::Archive;
pub(crate) use clean::Clean;
pub(crate) use config::Config;
pub(crate) use index::Index;
pub(crate) use init::Init;
Expand All @@ -8,6 +9,7 @@ pub(crate) use verify::Verify;
pub(crate) use version::Version;

mod archive;
mod clean;
mod config;
mod index;
mod init;
Expand Down
1 change: 1 addition & 0 deletions crates/datashed/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ fn num_threads(args: &Args) -> usize {
fn run(args: Args) -> DatashedResult<()> {
match args.cmd {
Command::Archive(cmd) => cmd.execute(),
Command::Clean(cmd) => cmd.execute(),
Command::Config(cmd) => cmd.execute(),
Command::Index(cmd) => cmd.execute(),
Command::Init(cmd) => cmd.execute(),
Expand Down

0 comments on commit 39dfe30

Please sign in to comment.