Skip to content

Commit

Permalink
refactor(datashed): rename column idn to ppn (#132)
Browse files Browse the repository at this point in the history
Signed-off-by: Nico Wagner <[email protected]>
  • Loading branch information
nwagner84 authored Feb 21, 2025
1 parent f6e09d5 commit 7a9adec
Show file tree
Hide file tree
Showing 9 changed files with 42 additions and 38 deletions.
22 changes: 11 additions & 11 deletions crates/dataset/src/commands/vocab.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ impl TryFrom<&ByteRecord<'_>> for AuthorityRecord {

fn try_from(record: &ByteRecord<'_>) -> Result<Self, Self::Error> {
let options = MatcherOptions::default();
let idn = record.ppn().unwrap().to_string();
let ppn = record.ppn().unwrap().to_string();

let kind = match record
.first(&Path::new("002@.0").unwrap(), &options)
Expand All @@ -104,8 +104,8 @@ impl TryFrom<&ByteRecord<'_>> for AuthorityRecord {
// pref_label!(record, "022A{a, g}", false), };

Ok(AuthorityRecord {
uri: format!("https://d-nb.info/gnd/{idn}"),
label: label.unwrap_or(format!("IDN : {idn}")),
uri: format!("https://d-nb.info/gnd/{ppn}"),
label: label.unwrap_or(format!("PPN: {ppn}")),
notation: "".into(),
kind,
})
Expand Down Expand Up @@ -149,12 +149,12 @@ impl Vocab {
continue;
};

let idn = record.ppn().unwrap().to_string();
let ppn = record.ppn().unwrap().to_string();
let mut seen = BTreeSet::new();

if matcher.is_match(&record, &options) {
let record = AuthorityRecord::try_from(&record)?;
vocab.insert(idn, record);
vocab.insert(ppn, record);
continue;
}

Expand All @@ -168,11 +168,11 @@ impl Vocab {

record
.path(&Path::new(&target.source).unwrap(), &options)
.for_each(|idn| {
if !idn.is_empty() && !seen.contains(idn) {
seen.insert(idn.to_owned());
.for_each(|ppn| {
if !ppn.is_empty() && !seen.contains(ppn) {
seen.insert(ppn.to_owned());
freqs
.entry(idn.to_string())
.entry(ppn.to_string())
.and_modify(|value| *value += 1)
.or_insert(1);
}
Expand All @@ -191,11 +191,11 @@ impl Vocab {
};

let mut writer = WriterBuilder::new().from_writer(inner);
for (idn, record) in vocab.into_iter() {
for (ppn, record) in vocab.into_iter() {
if let Some(KindConfig { threshold }) =
config.vocab.kinds.get(&record.kind)
{
let count = freqs.remove(&idn).unwrap_or(0);
let count = freqs.remove(&ppn).unwrap_or(0);
if count < *threshold {
continue;
}
Expand Down
10 changes: 5 additions & 5 deletions crates/datashed/src/commands/grep.rs
Original file line number Diff line number Diff line change
Expand Up @@ -132,16 +132,16 @@ impl Grep {
if let Some(path) = self.allow_list {
df = df.semi_join(
read_filter_list(path)?.lazy(),
col("idn"),
col("idn"),
col("ppn"),
col("ppn"),
);
}

if let Some(path) = self.deny_list {
df = df.semi_join(
df = df.anti_join(
read_filter_list(path)?.lazy(),
col("idn"),
col("idn"),
col("ppn"),
col("ppn"),
);
}

Expand Down
4 changes: 2 additions & 2 deletions crates/datashed/src/commands/index/kind.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,9 @@ impl KindMap {
pub(crate) fn process_record(&mut self, record: &ByteRecord) {
for matcher in self.matchers.iter() {
if matcher.is_match(record) {
let idn = record.ppn().unwrap().to_string();
let ppn = record.ppn().unwrap().to_string();
let _ = self.refinements.insert(
(idn, matcher.from.clone()),
(ppn, matcher.from.clone()),
matcher.to.clone(),
);

Expand Down
14 changes: 7 additions & 7 deletions crates/datashed/src/commands/index/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ pub(crate) struct Index {
#[derive(Debug, Default)]
struct Row {
path: PathBuf,
idn: String,
ppn: String,
kind: DocumentKind,
msc: Option<String>,
lang_code: Option<String>,
Expand Down Expand Up @@ -97,7 +97,7 @@ impl TryFrom<&PathBuf> for Row {

Ok(Row {
path: path.into(),
idn: doc.idn(),
ppn: doc.ppn(),
kind: doc.kind(),
lfreq: doc.lfreq(),
alpha: doc.alpha(),
Expand Down Expand Up @@ -192,7 +192,7 @@ impl Index {

let mut remote: Vec<&str> = vec![];
let mut path: Vec<String> = vec![];
let mut idn: Vec<String> = vec![];
let mut ppn: Vec<String> = vec![];
let mut kind: Vec<String> = vec![];
let mut msc: Vec<Option<String>> = vec![];
let mut lang_code: Vec<Option<String>> = vec![];
Expand All @@ -213,13 +213,13 @@ impl Index {
let kind_ = refinements
.remove(&(path_.clone(), hash_.clone()))
.or(kind_map
.remove(&(row.idn.clone(), row.kind.clone())))
.remove(&(row.ppn.clone(), row.kind.clone())))
.unwrap_or(row.kind);

remote.push(&config.metadata.name);
path.push(path_);
kind.push(kind_.to_string());
msc.push(msc_map.get(&row.idn).cloned());
msc.push(msc_map.get(&row.ppn).cloned());
lang_code.push(row.lang_code);
lang_score.push(row.lang_score);
lfreq.push(row.lfreq);
Expand All @@ -231,13 +231,13 @@ impl Index {
strlen.push(row.strlen);
mtime.push(row.mtime);
hash.push(hash_);
idn.push(row.idn);
ppn.push(row.ppn);
}

let df = DataFrame::new(vec![
Column::new("remote".into(), remote),
Column::new("path".into(), path),
Column::new("idn".into(), idn),
Column::new("ppn".into(), ppn),
Column::new("kind".into(), kind),
Column::new("msc".into(), msc),
Column::new("lang_code".into(), lang_code),
Expand Down
8 changes: 4 additions & 4 deletions crates/datashed/src/commands/rate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ impl Rate {
let kind = index.column("kind")?.str()?;
let path = index.column("path")?.str()?;
let hash = index.column("hash")?.str()?;
let idn = index.column("idn")?.str()?;
let ppn = index.column("ppn").or(index.column("idn"))?.str()?;
let len = index.height();

let mut ratings_url = base_uri.clone();
Expand All @@ -156,7 +156,7 @@ impl Rate {
let kind = kind.get(idx).unwrap();
let filename = path.get(idx).unwrap();
let hash = hash.get(idx).unwrap();
let idn = idn.get(idx).unwrap();
let ppn = ppn.get(idx).unwrap();

print!("\x1B[2J");
let header = format!(
Expand All @@ -169,11 +169,11 @@ impl Rate {
);

println!("{header}\n{0}\n", "~".repeat(header.len()));
println!("Portal:\n\thttps://d-nb.info/{idn}\n",);
println!("Portal:\n\thttps://d-nb.info/{ppn}\n",);
println!(
"Record Browser:\n\t\
http://etc.dnb.de/pica-record-browser/show.xhtml\
?src=prsx&idn={idn}\n"
?src=prsx&idn={ppn}\n"
);

let stop = Confirm::new()
Expand Down
6 changes: 5 additions & 1 deletion crates/datashed/src/commands/select.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,11 @@ pub(crate) struct Select {
#[arg(long = "where")]
predicate: Option<String>,

#[arg(long, default_value = "idn", conflicts_with_all = ["left_on", "right_on"])]
#[arg(
long,
default_value = "ppn",
conflicts_with_all = ["left_on", "right_on"],
)]
on: String,

#[arg(long, requires = "right_on", conflicts_with = "on")]
Expand Down
2 changes: 1 addition & 1 deletion crates/datashed/src/commands/summary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ impl Summary {
.lazy()
.group_by([col("remote"), col("kind")])
.agg([
col("idn").count().alias("docs"),
col("ppn").count().alias("docs"),
col("size").sum(),
col("hash").n_unique().alias("unique"),
])
Expand Down
8 changes: 4 additions & 4 deletions crates/datashed/src/commands/vocab.rs
Original file line number Diff line number Diff line change
Expand Up @@ -145,8 +145,8 @@ impl Vocab {
.lazy()
.semi_join(
read_filter_list(path)?.lazy(),
col("idn"),
col("idn"),
col("ppn"),
col("ppn"),
)
.collect()?;
}
Expand All @@ -156,8 +156,8 @@ impl Vocab {
.lazy()
.semi_join(
read_filter_list(path)?.lazy(),
col("idn"),
col("idn"),
col("ppn"),
col("ppn"),
)
.collect()?;
}
Expand Down
6 changes: 3 additions & 3 deletions crates/datashed/src/document.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ impl Document {
})
}

pub(crate) fn idn(&self) -> String {
pub(crate) fn ppn(&self) -> String {
self.path.file_stem().unwrap().to_str().unwrap().to_string()
}

Expand Down Expand Up @@ -357,9 +357,9 @@ mod tests {
}

#[test]
fn document_idn() -> TestResult {
fn document_ppn() -> TestResult {
let doc = Document::from_path("tests/data/fox.txt")?;
assert_eq!(doc.idn(), "fox");
assert_eq!(doc.ppn(), "fox");
Ok(())
}

Expand Down

0 comments on commit 7a9adec

Please sign in to comment.