Skip to content

Commit

Permalink
ADD: Add support for custom delimiter to DBN
Browse files Browse the repository at this point in the history
  • Loading branch information
threecgreen committed Feb 1, 2024
1 parent e55530f commit d74671e
Show file tree
Hide file tree
Showing 12 changed files with 357 additions and 141 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
# Changelog

## 0.16.0 - TBD
### Enhancements
- Added `-T` and `--tsv` flags to DBN CLI to encode tab-separated values (TSV)
- Added `delimiter` method to builders for `DynEncoder` and `CsvEncoder` to customize the
field delimiter character, allowing DBN to be encoded as tab-separated values (TSV)

### Breaking changes
- Changed default for `VersionUpgradePolicy` to `Upgrade`
Expand Down
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,11 @@ members = [
"rust/dbn"
]
resolver = "2"

[workspace.package]
authors = ["Databento <[email protected]>"]
edition = "2021"
version = "0.15.1"
documentation = "https://docs.databento.com"
repository = "https://github.com/databento/dbn"
license = "Apache-2.0"
10 changes: 5 additions & 5 deletions c/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
[package]
name = "dbn-c"
authors = ["Databento <[email protected]>"]
version = "0.15.0"
edition = "2021"
description = "C bindings for working with Databento Binary Encoding (DBN)"
license = "Apache-2.0"
repository = "https://github.com/databento/dbn"
# This crate should not be published
publish = false
authors.workspace = true
version.workspace = true
edition.workspace = true
license.workspace = true
repository.workspace = true

[lib]
name = "dbn_c"
Expand Down
10 changes: 5 additions & 5 deletions python/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
[package]
name = "databento-dbn"
authors = ["Databento <[email protected]>"]
version = "0.15.1"
edition = "2021"
description = "Python library written in Rust for working with Databento Binary Encoding (DBN)"
license = "Apache-2.0"
repository = "https://github.com/databento/dbn"
# This crate should only be published as a Python package
publish = false
authors.workspace = true
version.workspace = true
edition.workspace = true
license.workspace = true
repository.workspace = true

[lib]
name = "databento_dbn" # Python modules can't contain dashes
Expand Down
11 changes: 6 additions & 5 deletions rust/dbn-cli/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
[package]
name = "dbn-cli"
authors = ["Databento <[email protected]>"]
version = "0.15.1"
edition = "2021"
description = "Command-line utility for converting Databento Binary Encoding (DBN) files to text-based formats"
default-run = "dbn"
license = "Apache-2.0"
repository = "https://github.com/databento/dbn"
keywords = ["market-data", "json", "csv", "conversion", "encoding"]
# see https://crates.io/category_slugs
categories = ["command-line-utilities", "encoding"]
authors.workspace = true
version.workspace = true
edition.workspace = true
license.workspace = true
repository.workspace = true

[[bin]]
name = "dbn"
Expand All @@ -25,6 +25,7 @@ anyhow = "1.0"
clap = { version = "4.4", features = ["derive", "wrap_help"] }
# deserialization for CLI args
serde = { version = "1.0", features = ["derive"] }
# Compression
zstd = "0.13"

[dev-dependencies]
Expand Down
8 changes: 5 additions & 3 deletions rust/dbn-cli/src/encode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use dbn::{
rtype_dispatch, Compression, Encoding, MetadataBuilder, SType, SymbolIndex,
};

use crate::{infer_encoding_and_compression, output_from_args, Args};
use crate::{infer_encoding, output_from_args, Args};

pub fn silence_broken_pipe(err: anyhow::Error) -> anyhow::Result<()> {
// Handle broken pipe as a non-error.
Expand All @@ -27,7 +27,7 @@ where
D: DecodeRecordRef + DbnMetadata,
{
let writer = output_from_args(args)?;
let (encoding, compression) = infer_encoding_and_compression(args)?;
let (encoding, compression, delimiter) = infer_encoding(args)?;
Ok(if args.should_output_metadata {
if encoding != Encoding::Json {
return Err(anyhow::format_err!(
Expand All @@ -45,6 +45,7 @@ where
encode_fragment(decoder, writer, compression)
} else {
let mut encoder = DynEncoder::builder(writer, encoding, compression, decoder.metadata())
.delimiter(delimiter)
.all_pretty(args.should_pretty_print)
.with_symbol(args.map_symbols)
.build()?;
Expand All @@ -70,7 +71,7 @@ where
D: DecodeRecordRef,
{
let writer = output_from_args(args)?;
let (encoding, compression) = infer_encoding_and_compression(args)?;
let (encoding, compression, delimiter) = infer_encoding(args)?;
if args.fragment {
encode_fragment(decoder, writer, compression)?;
return Ok(());
Expand All @@ -90,6 +91,7 @@ where
.stype_out(SType::InstrumentId)
.build(),
)
.delimiter(delimiter)
// Can't write header until we know the record type
.write_header(false)
.all_pretty(args.should_pretty_print)
Expand Down
174 changes: 118 additions & 56 deletions rust/dbn-cli/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ pub enum OutputEncoding {
Infer,
Dbn,
Csv,
Tsv,
Json,
DbnFragment,
}
Expand Down Expand Up @@ -61,6 +62,15 @@ pub struct Args {
help = "Output the result as CSV"
)]
pub csv: bool,
#[clap(
short = 'T',
long,
action = ArgAction::SetTrue,
default_value = "false",
group = "output_encoding",
help = "Output the result as tab-separated values (TSV)"
)]
pub tsv: bool,
#[clap(
short = 'D',
long,
Expand Down Expand Up @@ -173,6 +183,8 @@ impl Args {
OutputEncoding::Json
} else if self.csv {
OutputEncoding::Csv
} else if self.tsv {
OutputEncoding::Tsv
} else if self.dbn {
OutputEncoding::Dbn
} else if self.fragment {
Expand All @@ -195,32 +207,37 @@ impl Args {
}
}

/// Infer the [`Encoding`] and [`Compression`] from `args` if they aren't already explicitly
/// set.
pub fn infer_encoding_and_compression(args: &Args) -> anyhow::Result<(Encoding, Compression)> {
/// Infer the [`Encoding`], [`Compression`], and delimiter (CSV/TSV) from `args` if they
/// aren't already explicitly set.
pub fn infer_encoding(args: &Args) -> anyhow::Result<(Encoding, Compression, u8)> {
let compression = if args.zstd {
Compression::ZStd
} else {
Compression::None
};
match args.output_encoding() {
OutputEncoding::DbnFragment | OutputEncoding::Dbn => Ok((Encoding::Dbn, compression)),
OutputEncoding::Csv => Ok((Encoding::Csv, compression)),
OutputEncoding::Json => Ok((Encoding::Json, compression)),
OutputEncoding::DbnFragment | OutputEncoding::Dbn => Ok((Encoding::Dbn, compression, 0)),
OutputEncoding::Csv => Ok((Encoding::Csv, compression, b',')),
OutputEncoding::Tsv => Ok((Encoding::Csv, compression, b'\t')),
OutputEncoding::Json => Ok((Encoding::Json, compression, 0)),
OutputEncoding::Infer => {
if let Some(output) = args.output.as_ref().map(|o| o.to_string_lossy()) {
if output.ends_with(".dbn.zst") {
Ok((Encoding::Dbn, Compression::ZStd))
Ok((Encoding::Dbn, Compression::ZStd, 0))
} else if output.ends_with(".dbn") {
Ok((Encoding::Dbn, Compression::None))
Ok((Encoding::Dbn, Compression::None, 0))
} else if output.ends_with(".csv.zst") {
Ok((Encoding::Csv, Compression::ZStd))
Ok((Encoding::Csv, Compression::ZStd, b','))
} else if output.ends_with(".csv") {
Ok((Encoding::Csv, Compression::None))
Ok((Encoding::Csv, Compression::None, b','))
} else if output.ends_with(".tsv.zst") || output.ends_with(".xls.zst") {
Ok((Encoding::Csv, Compression::ZStd, b'\t'))
} else if output.ends_with(".tsv") || output.ends_with(".xls") {
Ok((Encoding::Csv, Compression::None, b'\t'))
} else if output.ends_with(".json.zst") {
Ok((Encoding::Json, Compression::ZStd))
Ok((Encoding::Json, Compression::ZStd, 0))
} else if output.ends_with(".json") {
Ok((Encoding::Json, Compression::None))
Ok((Encoding::Json, Compression::None, 0))
} else {
Err(anyhow!(
"Unable to infer output encoding from output path '{output}'",
Expand Down Expand Up @@ -264,53 +281,98 @@ fn open_output_file(path: &PathBuf, force: bool) -> anyhow::Result<File> {

#[cfg(test)]
mod tests {
use rstest::*;

use super::*;

#[test]
fn test_infer_encoding_and_compression_explicit() {
let combinations = [
(true, false, false, false, Encoding::Json, Compression::None),
(false, true, false, false, Encoding::Csv, Compression::None),
(false, false, true, false, Encoding::Dbn, Compression::None),
(true, false, false, true, Encoding::Json, Compression::ZStd),
(false, true, false, true, Encoding::Csv, Compression::ZStd),
(false, false, true, true, Encoding::Dbn, Compression::ZStd),
];
for (json, csv, dbn, zstd, exp_enc, exp_comp) in combinations {
let args = Args {
json,
csv,
dbn,
zstd,
..Default::default()
};
assert_eq!(
infer_encoding_and_compression(&args).unwrap(),
(exp_enc, exp_comp)
);
}
/// Checks that explicit output-format flags (`--json`/`--csv`/`--tsv`/`--dbn`,
/// optionally with `--zstd`) map to the expected `Encoding`, `Compression`,
/// and field delimiter byte (0 for non-delimited encodings).
// NOTE: renamed from `test_infer_encoding_and_compression_explicit` to track
// the rename of `infer_encoding_and_compression` -> `infer_encoding`.
#[rstest]
#[case(true, false, false, false, false, Encoding::Json, Compression::None, 0)]
#[case(false, true, false, false, false, Encoding::Csv, Compression::None, b',')]
#[case(false, false, true, false, false, Encoding::Csv, Compression::None, b'\t')]
#[case(false, false, false, true, false, Encoding::Dbn, Compression::None, 0)]
#[case(true, false, false, false, true, Encoding::Json, Compression::ZStd, 0)]
#[case(false, true, false, false, true, Encoding::Csv, Compression::ZStd, b',')]
#[case(false, false, true, false, true, Encoding::Csv, Compression::ZStd, b'\t')]
#[case(false, false, false, true, true, Encoding::Dbn, Compression::ZStd, 0)]
fn test_infer_encoding_explicit(
    #[case] json: bool,
    #[case] csv: bool,
    #[case] tsv: bool,
    #[case] dbn: bool,
    #[case] zstd: bool,
    #[case] exp_enc: Encoding,
    #[case] exp_comp: Compression,
    #[case] exp_sep: u8,
) {
    let args = Args {
        json,
        csv,
        tsv,
        dbn,
        zstd,
        ..Default::default()
    };
    assert_eq!(infer_encoding(&args).unwrap(), (exp_enc, exp_comp, exp_sep));
}

#[test]
fn test_infer_encoding_and_compression_inference() {
let combinations = [
("out.json", Encoding::Json, Compression::None),
("out.csv", Encoding::Csv, Compression::None),
("out.dbn", Encoding::Dbn, Compression::None),
("out.json.zst", Encoding::Json, Compression::ZStd),
("out.csv.zst", Encoding::Csv, Compression::ZStd),
("out.dbn.zst", Encoding::Dbn, Compression::ZStd),
];
for (output, exp_enc, exp_comp) in combinations {
let args = Args {
output: Some(PathBuf::from(output)),
..Default::default()
};
assert_eq!(
infer_encoding_and_compression(&args).unwrap(),
(exp_enc, exp_comp)
);
}
/// Checks that when no format flag is given, the `Encoding`, `Compression`,
/// and delimiter are inferred from the output path's extension(s), including
/// the `.zst` compression suffix and the `.tsv`/`.xls` tab-delimited variants.
// NOTE: renamed from `test_infer_encoding_and_compression_inference` to track
// the rename of `infer_encoding_and_compression` -> `infer_encoding`.
#[rstest]
#[case("out.json", Encoding::Json, Compression::None, 0)]
#[case("out.csv", Encoding::Csv, Compression::None, b',')]
#[case("out.tsv", Encoding::Csv, Compression::None, b'\t')]
#[case("out.xls", Encoding::Csv, Compression::None, b'\t')]
#[case("out.dbn", Encoding::Dbn, Compression::None, 0)]
#[case("out.json.zst", Encoding::Json, Compression::ZStd, 0)]
#[case("out.csv.zst", Encoding::Csv, Compression::ZStd, b',')]
#[case("out.tsv.zst", Encoding::Csv, Compression::ZStd, b'\t')]
#[case("out.xls.zst", Encoding::Csv, Compression::ZStd, b'\t')]
#[case("out.dbn.zst", Encoding::Dbn, Compression::ZStd, 0)]
fn test_infer_encoding_inference(
    #[case] output: &str,
    #[case] exp_enc: Encoding,
    #[case] exp_comp: Compression,
    #[case] exp_sep: u8,
) {
    let args = Args {
        output: Some(PathBuf::from(output)),
        ..Default::default()
    };
    assert_eq!(infer_encoding(&args).unwrap(), (exp_enc, exp_comp, exp_sep));
}

#[test]
Expand All @@ -320,7 +382,7 @@ mod tests {
..Default::default()
};
assert!(
matches!(infer_encoding_and_compression(&args), Err(e) if e.to_string().starts_with("Unable to infer"))
matches!(infer_encoding(&args), Err(e) if e.to_string().starts_with("Unable to infer"))
);
}
}
10 changes: 5 additions & 5 deletions rust/dbn-macros/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
[package]
name = "dbn-macros"
authors = ["Databento <[email protected]>"]
version = "0.15.1"
edition = "2021"
description = "Proc macros for dbn crate"
license = "Apache-2.0"
repository = "https://github.com/databento/dbn"
authors.workspace = true
version.workspace = true
edition.workspace = true
license.workspace = true
repository.workspace = true

[lib]
proc-macro = true
Expand Down
Loading

0 comments on commit d74671e

Please sign in to comment.