From 6663ecf6b93567d4df8a389ce9bdcd503efc76c2 Mon Sep 17 00:00:00 2001 From: Joe Prosser Date: Mon, 9 Oct 2023 23:05:42 +0100 Subject: [PATCH 1/6] commands: add get dataset stats --- CHANGELOG.md | 1 + api/src/resources/dataset.rs | 11 +++---- cli/src/commands/get/datasets.rs | 49 ++++++++++++++++++++++++++++++-- cli/src/printer.rs | 9 ++++-- 4 files changed, 60 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b1688053..af5ebb56 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ ## Unreleased +- Add ability to get dataset stats - Show Global Permissions in `get users` - Upgrade `ordered-float` version, which is exposed in the public crate api. diff --git a/api/src/resources/dataset.rs b/api/src/resources/dataset.rs index b06c570b..3f366a7c 100644 --- a/api/src/resources/dataset.rs +++ b/api/src/resources/dataset.rs @@ -17,7 +17,7 @@ use std::{ str::FromStr, }; -#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)] +#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)] pub struct Dataset { pub id: Id, pub name: Name, @@ -34,6 +34,7 @@ pub struct Dataset { pub entity_defs: Vec, pub label_defs: Vec, pub label_groups: Vec, + pub num_reviewed: Option, } impl Dataset { @@ -248,17 +249,17 @@ pub(crate) struct CreateRequest<'request> { pub dataset: NewDataset<'request>, } -#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)] +#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)] pub(crate) struct CreateResponse { pub dataset: Dataset, } -#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)] +#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)] pub(crate) struct GetAvailableResponse { pub datasets: Vec, } -#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)] +#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)] pub(crate) struct GetResponse { pub dataset: Dataset, } @@ -280,7 +281,7 @@ pub(crate) struct UpdateRequest<'request> { pub dataset: UpdateDataset<'request>, } -#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)] +#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)] pub(crate) struct UpdateResponse { pub dataset: Dataset, } diff --git a/cli/src/commands/get/datasets.rs b/cli/src/commands/get/datasets.rs index 13f5b0ef..7c649d86 100644 --- a/cli/src/commands/get/datasets.rs +++ b/cli/src/commands/get/datasets.rs @@ -1,5 +1,10 @@ +use std::collections::HashMap; + use anyhow::{Context, Result}; -use reinfer_client::{Client, DatasetIdentifier}; +use log::info; +use reinfer_client::{ + resources::dataset::StatisticsRequestParams, Client, CommentFilter, DatasetIdentifier, +}; use structopt::StructOpt; use crate::printer::Printer; @@ -9,11 +14,18 @@ pub struct GetDatasetsArgs { #[structopt(name = "dataset")] /// If specified, only list this dataset (name or id) dataset: Option, + + #[structopt(long = "stats")] + /// Whether to include source statistics in response + include_stats: bool, } pub fn get(client: &Client, args: &GetDatasetsArgs, printer: &Printer) -> Result<()> { - let GetDatasetsArgs { dataset } = args; - let datasets = if let Some(dataset) = dataset { + let GetDatasetsArgs { + dataset, + include_stats, + } = args; + let mut datasets = if let Some(dataset) = dataset { vec![client .get_dataset(dataset.clone()) .context("Operation to list datasets has failed.")?] @@ -26,5 +38,36 @@ pub fn get(client: &Client, args: &GetDatasetsArgs, printer: &Printer) -> Result }); datasets }; + + let mut dataset_stats: HashMap<_, _> = HashMap::new(); + if *include_stats { + datasets.iter().try_for_each(|dataset| -> Result<()> { + info!("Getting statistics for dataset {}", dataset.full_name().0); + let stats = client + .get_dataset_statistics( + &dataset.full_name(), + &StatisticsRequestParams { + comment_filter: CommentFilter { + reviewed:Some(reinfer_client::resources::comment::ReviewedFilterEnum::OnlyReviewed), + ..Default::default() + }, + ..Default::default() + }, + ) + .context("Could not get statistics for dataset")?; + + dataset_stats.insert(dataset.id.clone(), stats); + Ok(()) + })?; + + datasets.iter_mut().for_each(|dataset| { + dataset.num_reviewed = if let Some(stats) = dataset_stats.get(&dataset.id).cloned() { + Some(*stats.num_comments) + } else { + None + } + }); + } + printer.print_resources(&datasets) } diff --git a/cli/src/printer.rs b/cli/src/printer.rs index 375f49e5..f6f7c67f 100644 --- a/cli/src/printer.rs +++ b/cli/src/printer.rs @@ -89,7 +89,7 @@ impl DisplayTable for Quota { impl DisplayTable for Dataset { fn to_table_headers() -> Row { - row![bFg => "Name", "ID", "Updated (UTC)", "Title"] + row![bFg => "Name", "ID", "Updated (UTC)", "Title", "Num Reviewed"] } fn to_table_row(&self) -> Row { @@ -98,7 +98,12 @@ impl DisplayTable for Dataset { full_name, self.id.0, self.updated_at.format("%Y-%m-%d %H:%M:%S"), - self.title + self.title, + if let Some(num_reviewed) = &self.num_reviewed { + num_reviewed.to_string().as_str().into() + } else { + "none".dimmed() + } ] } } From 5726dd988672f30568a32084235430aac663fa4f Mon Sep 17 00:00:00 2001 From: Joe Prosser Date: Mon, 9 Oct 2023 23:05:42 +0100 Subject: [PATCH 2/6] commands: add get dataset stats --- cli/src/commands/get/datasets.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cli/src/commands/get/datasets.rs b/cli/src/commands/get/datasets.rs index 7c649d86..d8c5b7eb 100644 --- a/cli/src/commands/get/datasets.rs +++ b/cli/src/commands/get/datasets.rs @@ -16,7 +16,7 @@ pub struct GetDatasetsArgs { dataset: Option, #[structopt(long = "stats")] - /// Whether to include source statistics in response + /// Whether to include dataset statistics in response include_stats: bool, } From 6cbb78b4469038ff29f6f6cd24d3a1eea522f2c0 Mon Sep 17 00:00:00 2001 From: Joe Prosser Date: Tue, 10 Oct 2023 21:22:40 +0100 Subject: [PATCH 3/6] post review changes --- api/src/resources/dataset.rs | 23 +++++++++++++++++------ cli/src/commands/get/datasets.rs | 30 ++++++++++++++---------------- cli/src/printer.rs | 32 +++++++++++++++++++++++++------- 3 files changed, 56 insertions(+), 29 deletions(-) diff --git a/api/src/resources/dataset.rs b/api/src/resources/dataset.rs index 3f366a7c..39428f31 100644 --- a/api/src/resources/dataset.rs +++ b/api/src/resources/dataset.rs @@ -1,4 +1,5 @@ use chrono::{DateTime, Utc}; +use ordered_float::NotNan; use serde::{Deserialize, Serialize}; use crate::{ @@ -17,7 +18,7 @@ use std::{ str::FromStr, }; -#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)] +#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)] pub struct Dataset { pub id: Id, pub name: Name, @@ -34,7 +35,17 @@ pub struct Dataset { pub entity_defs: Vec, pub label_defs: Vec, pub label_groups: Vec, - pub num_reviewed: Option, +} + +#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)] +pub struct DatasetStats { + pub num_reviewed: NotNan, +} + +#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)] +pub struct DatasetAndStats { + pub dataset: Dataset, + pub stats: DatasetStats, } impl Dataset { @@ -249,17 +260,17 @@ pub(crate) struct CreateRequest<'request> { pub dataset: NewDataset<'request>, } -#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)] +#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)] pub(crate) struct CreateResponse { pub dataset: Dataset, } -#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)] +#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)] pub(crate) struct GetAvailableResponse { pub datasets: Vec, } -#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)] +#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)] pub(crate) struct GetResponse { pub dataset: Dataset, } @@ -281,7 +292,7 @@ pub(crate) struct UpdateRequest<'request> { pub dataset: UpdateDataset<'request>, } -#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)] +#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)] pub(crate) struct UpdateResponse { pub dataset: Dataset, } diff --git a/cli/src/commands/get/datasets.rs b/cli/src/commands/get/datasets.rs index d8c5b7eb..040ea516 100644 --- a/cli/src/commands/get/datasets.rs +++ b/cli/src/commands/get/datasets.rs @@ -1,9 +1,8 @@ -use std::collections::HashMap; - use anyhow::{Context, Result}; use log::info; use reinfer_client::{ - resources::dataset::StatisticsRequestParams, Client, CommentFilter, DatasetIdentifier, + resources::dataset::{DatasetAndStats, DatasetStats, StatisticsRequestParams}, + Client, CommentFilter, DatasetIdentifier, }; use structopt::StructOpt; @@ -25,7 +24,7 @@ pub fn get(client: &Client, args: &GetDatasetsArgs, printer: &Printer) -> Result dataset, include_stats, } = args; - let mut datasets = if let Some(dataset) = dataset { + let datasets = if let Some(dataset) = dataset { vec![client .get_dataset(dataset.clone()) .context("Operation to list datasets has failed.")?] @@ -39,7 +38,7 @@ pub fn get(client: &Client, args: &GetDatasetsArgs, printer: &Printer) -> Result datasets }; - let mut dataset_stats: HashMap<_, _> = HashMap::new(); + let mut dataset_stats = Vec::new(); if *include_stats { datasets.iter().try_for_each(|dataset| -> Result<()> { info!("Getting statistics for dataset {}", dataset.full_name().0); @@ -56,18 +55,17 @@ pub fn get(client: &Client, args: &GetDatasetsArgs, printer: &Printer) -> Result ) .context("Could not get statistics for dataset")?; - dataset_stats.insert(dataset.id.clone(), stats); + let dataset_and_stats = DatasetAndStats { + dataset: dataset.clone(), + stats: DatasetStats { + num_reviewed: stats.num_comments + } + }; + dataset_stats.push(dataset_and_stats); Ok(()) })?; - - datasets.iter_mut().for_each(|dataset| { - dataset.num_reviewed = if let Some(stats) = dataset_stats.get(&dataset.id).cloned() { - Some(*stats.num_comments) - } else { - None - } - }); + printer.print_resources(&dataset_stats) + } else { + printer.print_resources(&datasets) } - - printer.print_resources(&datasets) } diff --git a/cli/src/printer.rs b/cli/src/printer.rs index f6f7c67f..1bc68c8e 100644 --- a/cli/src/printer.rs +++ b/cli/src/printer.rs @@ -2,7 +2,8 @@ use super::thousands::Thousands; use colored::Colorize; use prettytable::{format, row, Row, Table}; use reinfer_client::{ - resources::quota::Quota, Bucket, Dataset, Project, Source, Statistics, Stream, User, + resources::{dataset::DatasetAndStats, quota::Quota}, + Bucket, Dataset, Project, Source, Statistics, Stream, User, }; use serde::{Serialize, Serializer}; @@ -89,7 +90,7 @@ impl DisplayTable for Quota { impl DisplayTable for Dataset { fn to_table_headers() -> Row { - row![bFg => "Name", "ID", "Updated (UTC)", "Title", "Num Reviewed"] + row![bFg => "Name", "ID", "Updated (UTC)", "Title"] } fn to_table_row(&self) -> Row { @@ -99,11 +100,28 @@ impl DisplayTable for Dataset { self.id.0, self.updated_at.format("%Y-%m-%d %H:%M:%S"), self.title, - if let Some(num_reviewed) = &self.num_reviewed { - num_reviewed.to_string().as_str().into() - } else { - "none".dimmed() - } + ] + } +} + +impl DisplayTable for DatasetAndStats { + fn to_table_headers() -> Row { + row![bFg => "Name", "ID", "Updated (UTC)", "Title", "Num Reviewed"] + } + + fn to_table_row(&self) -> Row { + let full_name = format!( + "{}{}{}", + self.dataset.owner.0.dimmed(), + "/".dimmed(), + self.dataset.name.0 + ); + row![ + full_name, + self.dataset.id.0, + self.dataset.updated_at.format("%Y-%m-%d %H:%M:%S"), + self.dataset.title, + self.stats.num_reviewed ] } } From 2bf83bd0b8faf80b434c94ca1618cf56193a32e9 Mon Sep 17 00:00:00 2001 From: Joe Prosser Date: Wed, 11 Oct 2023 18:06:44 +0100 Subject: [PATCH 4/6] delay when deleting dataset in tests --- cli/tests/test_datasets.rs | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/cli/tests/test_datasets.rs b/cli/tests/test_datasets.rs index 50283b65..ffe0adff 100644 --- a/cli/tests/test_datasets.rs +++ b/cli/tests/test_datasets.rs @@ -1,3 +1,5 @@ +use anyhow::anyhow; +use backoff::{retry, ExponentialBackoff}; use pretty_assertions::assert_eq; use reinfer_client::{ Dataset, EntityDef, EntityName, LabelDef, LabelDefPretrained, LabelDefPretrainedId, LabelGroup, @@ -59,8 +61,16 @@ impl TestDataset { impl Drop for TestDataset { fn drop(&mut self) { - let output = TestCli::get().run(["delete", "dataset", self.identifier()]); - assert!(output.is_empty()); + let delete_dataset_command = || { + let output = TestCli::get().run(["delete", "dataset", self.identifier()]); + if output.is_empty() { + Ok(()) + } else { + Err(backoff::Error::transient(anyhow!(output))) + } + }; + + retry(ExponentialBackoff::default(), delete_dataset_command).unwrap() } } From ab0de76ddbfe413b97e58172ae2b998a76ba1a8f Mon Sep 17 00:00:00 2001 From: Joe Prosser Date: Sat, 14 Oct 2023 23:11:55 +0100 Subject: [PATCH 5/6] fixup error handling --- cli/tests/common.rs | 23 +++++++++++++++++++++++ cli/tests/test_datasets.rs | 12 ++++-------- 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/cli/tests/common.rs b/cli/tests/common.rs index d495c560..68f3a4f2 100644 --- a/cli/tests/common.rs +++ b/cli/tests/common.rs @@ -1,3 +1,4 @@ +use anyhow::{anyhow, Result}; use once_cell::sync::Lazy; use reinfer_client::User; use std::{ @@ -76,6 +77,14 @@ impl TestCli { self.output_error(self.command().args(args)) } + #[track_caller] + pub fn run_and_result( + &self, + args: impl IntoIterator>, + ) -> Result { + self.output_result(self.command().args(args)) + } + #[track_caller] pub fn run_with_stdin( &self, @@ -117,6 +126,20 @@ impl TestCli { String::from_utf8(output.stdout).unwrap() } + #[track_caller] + pub fn output_result(&self, command: &mut Command) -> Result { + let output = command.output().unwrap(); + + if output.status.success() { + Ok(String::from_utf8(output.stdout)?) + } else { + Err(anyhow!( + "failed to run command:\n{}", + String::from_utf8_lossy(&output.stderr) + )) + } + } + #[track_caller] pub fn output_error(&self, command: &mut Command) -> String { let output = command.output().unwrap(); diff --git a/cli/tests/test_datasets.rs b/cli/tests/test_datasets.rs index ffe0adff..54e9ceb4 100644 --- a/cli/tests/test_datasets.rs +++ b/cli/tests/test_datasets.rs @@ -1,4 +1,3 @@ -use anyhow::anyhow; use backoff::{retry, ExponentialBackoff}; use pretty_assertions::assert_eq; use reinfer_client::{ @@ -62,15 +61,12 @@ impl TestDataset { impl Drop for TestDataset { fn drop(&mut self) { let delete_dataset_command = || { - let output = TestCli::get().run(["delete", "dataset", self.identifier()]); - if output.is_empty() { - Ok(()) - } else { - Err(backoff::Error::transient(anyhow!(output))) - } + TestCli::get() + .run_and_result(["delete", "dataset", self.identifier()]) + .map_err(backoff::Error::transient) }; - retry(ExponentialBackoff::default(), delete_dataset_command).unwrap() + retry(ExponentialBackoff::default(), delete_dataset_command).unwrap(); } } From 9197de9cd68e7c927c8015d093d529d7f6e889d1 Mon Sep 17 00:00:00 2001 From: Joe Prosser Date: Sat, 14 Oct 2023 23:25:48 +0100 Subject: [PATCH 6/6] Add totals --- api/src/resources/dataset.rs | 1 + cli/src/commands/get/datasets.rs | 14 ++++++++++++-- cli/src/printer.rs | 3 ++- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/api/src/resources/dataset.rs b/api/src/resources/dataset.rs index 39428f31..a9ee4034 100644 --- a/api/src/resources/dataset.rs +++ b/api/src/resources/dataset.rs @@ -40,6 +40,7 @@ pub struct Dataset { #[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)] pub struct DatasetStats { pub num_reviewed: NotNan, + pub total_verbatims: NotNan, } #[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)] diff --git a/cli/src/commands/get/datasets.rs b/cli/src/commands/get/datasets.rs index 040ea516..b0109b45 100644 --- a/cli/src/commands/get/datasets.rs +++ b/cli/src/commands/get/datasets.rs @@ -42,7 +42,16 @@ pub fn get(client: &Client, args: &GetDatasetsArgs, printer: &Printer) -> Result if *include_stats { datasets.iter().try_for_each(|dataset| -> Result<()> { info!("Getting statistics for dataset {}", dataset.full_name().0); - let stats = client + let unfiltered_stats = client + .get_dataset_statistics( + &dataset.full_name(), + &StatisticsRequestParams { + ..Default::default() + }, + ) + .context("Could not get statistics for dataset")?; + + let reviewed_stats = client .get_dataset_statistics( &dataset.full_name(), &StatisticsRequestParams { @@ -58,7 +67,8 @@ pub fn get(client: &Client, args: &GetDatasetsArgs, printer: &Printer) -> Result let dataset_and_stats = DatasetAndStats { dataset: dataset.clone(), stats: DatasetStats { - num_reviewed: stats.num_comments + num_reviewed: reviewed_stats.num_comments, + total_verbatims: unfiltered_stats.num_comments } }; dataset_stats.push(dataset_and_stats); diff --git a/cli/src/printer.rs b/cli/src/printer.rs index 1bc68c8e..5a8270c9 100644 --- a/cli/src/printer.rs +++ b/cli/src/printer.rs @@ -106,7 +106,7 @@ impl DisplayTable for Dataset { impl DisplayTable for DatasetAndStats { fn to_table_headers() -> Row { - row![bFg => "Name", "ID", "Updated (UTC)", "Title", "Num Reviewed"] + row![bFg => "Name", "ID", "Updated (UTC)", "Title","Total Verbatims", "Num Reviewed"] } fn to_table_row(&self) -> Row { @@ -121,6 +121,7 @@ impl DisplayTable for DatasetAndStats { self.dataset.id.0, self.dataset.updated_at.format("%Y-%m-%d %H:%M:%S"), self.dataset.title, + self.stats.total_verbatims, self.stats.num_reviewed ] }