From d44a9f31f82910e73a74c9b7e9fb4897cc5a2e98 Mon Sep 17 00:00:00 2001 From: Joe Prosser Date: Mon, 5 Aug 2024 17:44:29 +0100 Subject: [PATCH] feat(cli): add ability to sample comments --- CHANGELOG.md | 1 + Cargo.lock | 1 + api/src/resources/dataset.rs | 1 + cli/Cargo.toml | 1 + cli/src/commands/get/comments.rs | 20 +++++++++++++++++++- 5 files changed, 23 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5a1c55f..d56b212 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ - Retry when putting comments - Add ability to get email by id - Add ability to upload attachment content for comments +- Add ability to randomly sample with `get comments` # v0.29.0 - Add `config parse-from-url` command for parsing configuration from a URL diff --git a/Cargo.lock b/Cargo.lock index 07c50ea..20c1d1e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1288,6 +1288,7 @@ dependencies = [ "ordered-float", "pretty_assertions", "prettytable-rs", + "rand", "regex", "reinfer-client", "reqwest", diff --git a/api/src/resources/dataset.rs b/api/src/resources/dataset.rs index 0aec022..2394267 100644 --- a/api/src/resources/dataset.rs +++ b/api/src/resources/dataset.rs @@ -130,6 +130,7 @@ pub struct StatisticsRequestParams { pub enum OrderEnum { ByLabel { label: String }, Recent, + Sample { seed: usize }, } #[derive(Debug, Clone, Serialize)] diff --git a/cli/Cargo.toml b/cli/Cargo.toml index be9e1e1..2f9d5e4 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -43,6 +43,7 @@ encoding_rs = "0.8.33" ordered-float = { version = "3.9.1", features = ["serde"] } mailparse = "0.14.0" diff = "0.1.13" +rand = "0.8.5" [dev-dependencies] pretty_assertions = "1.3.0" diff --git a/cli/src/commands/get/comments.rs b/cli/src/commands/get/comments.rs index 12d206b..e8b328c 100644 --- a/cli/src/commands/get/comments.rs +++ b/cli/src/commands/get/comments.rs @@ -4,6 +4,7 @@ use chrono::{DateTime, Utc}; use colored::Colorize; use dialoguer::{Confirm, Input, Select}; use log::info; 
+use rand::Rng; use regex::Regex; use reinfer_client::{ resources::{ @@ -126,6 +127,10 @@ pub struct GetManyCommentsArgs { #[structopt(long = "--only-with-attachments")] /// Whether to only return comments with attachment metadata only_with_attachments: Option, + + #[structopt(long = "--shuffle")] + /// Whether to return comments in a random order + shuffle: Option<bool>, } #[derive(Debug, Deserialize)] @@ -309,6 +314,7 @@ pub fn get_many(client: &Client, args: &GetManyCommentsArgs) -> Result<()> { senders, include_attachment_content, only_with_attachments, + shuffle, } = args; let by_timerange = from_timestamp.is_some() || to_timestamp.is_some(); @@ -368,6 +374,10 @@ pub fn get_many(client: &Client, args: &GetManyCommentsArgs) -> Result<()> { bail!("Cannot include attachment content when no file is provided") } + if shuffle.is_some() && dataset.is_none() { + bail!("Cannot shuffle data when dataset is not provided") + } + let OutputLocations { jsonl_file, attachments_dir, @@ -454,6 +464,7 @@ pub fn get_many(client: &Client, args: &GetManyCommentsArgs) -> Result<()> { messages_filter: Some(messages_filter), attachments_dir, only_with_attachments_filter, + shuffle: shuffle.unwrap_or(false), }; if let Some(file) = jsonl_file { @@ -509,6 +520,7 @@ struct CommentDownloadOptions { messages_filter: Option, attachments_dir: Option, only_with_attachments_filter: Option, + shuffle: bool, } impl CommentDownloadOptions { @@ -685,7 +697,13 @@ fn get_comments_from_uids( messages: options.messages_filter.clone(), }, limit: DEFAULT_QUERY_PAGE_SIZE, - order: OrderEnum::Recent, + order: if options.shuffle { + OrderEnum::Sample { + seed: rand::thread_rng().gen_range(0..2_i64.pow(31) - 1) as usize, + } + } else { + OrderEnum::Recent + }, }; client