Skip to content

Commit

Permalink
Experiment: default IDs in LUT references (#81)
Browse files Browse the repository at this point in the history
* Experiment: default IDs in LUT references

* Revert changes to datatype handling
  • Loading branch information
Ostrzyciel authored May 22, 2024
1 parent a912e64 commit be3821a
Show file tree
Hide file tree
Showing 7 changed files with 142 additions and 76 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,6 @@ package eu.ostrzyciel.jelly.core

import java.util

/** Companion object holding a shared, reusable [[EncoderValue]] instance. */
private[core] object EncoderValue:
// Empty default value to slightly reduce heap pressure
// (both IDs are 0 and newEntry is false, so using it never triggers a new lookup row)
val Empty = EncoderValue(0, 0, false)

/**
 * Result of adding an entry to an encoder lookup.
 *
 * @param getId ID placed in references to the entry (e.g. the nameId / prefixId of an RdfIri)
 * @param setId ID written into the lookup-table row (RdfNameEntry / RdfPrefixEntry) defining the entry
 * @param newEntry true when the entry was not in the lookup yet, so a defining row has to be emitted
 */
private[core] final case class EncoderValue(getId: Int, setId: Int, newEntry: Boolean)

private[core] final class EncoderLookup(maxEntries: Int)
Expand Down
15 changes: 13 additions & 2 deletions core/src/main/scala/eu/ostrzyciel/jelly/core/NameDecoder.scala
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ private[core] final class NameDecoder(opt: RdfStreamOptions):
// Lookup tables for prefix and name strings, sized from the stream options.
private val prefixLookup = new DecoderLookup[String](opt.maxPrefixTableSize)
private val nameLookup = new DecoderLookup[String](opt.maxNameTableSize)

// IDs used by the most recently decoded IRI. decode() consults them when an IRI
// carries ID 0: a prefix ID of 0 reuses lastIriPrefixId (a value below 1 means
// "no prefix seen yet"), a name ID of 0 resolves to lastIriNameId + 1.
private var lastIriPrefixId: Int = 0
private var lastIriNameId: Int = 0

/**
* Update the name table.
* @param nameRow name row
Expand All @@ -32,16 +35,24 @@ private[core] final class NameDecoder(opt: RdfStreamOptions):
*/
def decode(iri: RdfIri): String =
  // Resolve the prefix part. A prefixId of 0 is a shorthand, not a table index.
  val prefix = iri.prefixId match
    // No prefix has been used yet (or the prefix table is disabled): empty prefix.
    case 0 if lastIriPrefixId < 1 => ""
    // Reuse the prefix of the previously decoded IRI.
    // the .get() result can't be null here, we've already retrieved it before
    case 0 => prefixLookup.get(lastIriPrefixId)
    case id =>
      val p = prefixLookup.get(id)
      if p == null then throw MissingPrefixEntryError(id)
      // Remember the explicit ID so that a following prefixId of 0 can reuse it.
      lastIriPrefixId = id
      p
  // Resolve the name part. A nameId of 0 means "the ID after the last used one".
  val name = iri.nameId match
    case 0 =>
      lastIriNameId += 1
      val n = nameLookup.get(lastIriNameId)
      if n == null then throw MissingNameEntryError(lastIriNameId)
      n
    case id =>
      val n = nameLookup.get(id)
      if n == null then throw MissingNameEntryError(id)
      lastIriNameId = id
      n

  prefix + name
93 changes: 62 additions & 31 deletions core/src/main/scala/eu/ostrzyciel/jelly/core/NameEncoder.scala
Original file line number Diff line number Diff line change
Expand Up @@ -4,32 +4,56 @@ import eu.ostrzyciel.jelly.core.proto.v1.*

import scala.collection.mutable.ListBuffer

/** Constants shared by all [[NameEncoder]] instances. */
private[core] object NameEncoder:
// Datatype literal kind carrying ID 0.
// NOTE(review): presumably the "repeat the previous datatype" marker — its use is not visible here, confirm.
private val repeatDatatype = RdfLiteral.LiteralKind.Datatype(0)

/**
* IRI and datatype encoder.
* Maintains internal lookups for prefixes, names, and datatypes. Uses the LRU strategy for eviction.
*
* @param opt Jelly options
*/
private[core] final class NameEncoder(opt: RdfStreamOptions):
import NameEncoder.*

// Encoder-side lookups for names, prefixes and datatypes, sized from the stream options.
private val nameLookup = new EncoderLookup(opt.maxNameTableSize)
private val prefixLookup = new EncoderLookup(opt.maxPrefixTableSize)
private val dtLookup = new EncoderLookup(opt.maxDatatypeTableSize)
// NOTE(review): table of datatype literal kinds; how it is filled and read is outside this view.
private val dtTable = new DecoderLookup[RdfLiteral.LiteralKind.Datatype](opt.maxDatatypeTableSize)

// Prefix ID sent with the previous IRI. It starts at a value that encodeIri's equality
// check can only match if a lookup ID were -1000, so the first IRI gets an explicit prefix ID.
private var lastIriPrefixId: Int = -1000
// Name ID sent with the previous IRI; getNameIdWithRepeat compares the next ID against this + 1.
private var lastIriNameId: Int = 0

/**
 * Try to extract the prefix out of the IRI.
 *
 * The prefix is everything up to and including the last '#'. If the IRI has no '#',
 * it is everything up to and including the last '/'. If it has neither, the prefix is empty.
 *
 * Somewhat based on [[org.apache.jena.riot.system.PrefixMapStd.getPossibleKey]]
 * @param iri IRI
 * @return prefix which can be empty, never null
 */
private def getIriPrefix(iri: String): String =
  iri.lastIndexOf('#') match
    case i if i > -1 => iri.substring(0, i + 1)
    case _ =>
      iri.lastIndexOf('/') match
        case i if i > -1 => iri.substring(0, i + 1)
        // No separator at all: return an empty prefix rather than null,
        // so callers can take prefix.length without a null check.
        case _ => ""

/**
* Obtain the id for the name lookup table to be communicated to the consumer.
* This method checks if new id = last_id + 1, and if so, it returns 0.
*
* @param getId the getId from the EncoderLookup
* @return the id to be communicated to the consumer
*/
private inline def getNameIdWithRepeat(getId: Int): Int =
  // Compare against the previously sent ID before overwriting it.
  val followsPrevious = getId == lastIriNameId + 1
  lastIriNameId = getId
  // 0 is the shorthand for "previous name ID + 1"; any other ID is sent explicitly.
  if followsPrevious then 0 else getId

/**
* Encodes an IRI to a protobuf representation.
Expand All @@ -39,40 +63,47 @@ private[core] final class NameEncoder(opt: RdfStreamOptions):
* @return protobuf representation of the IRI
*/
def encodeIri(iri: String, rowsBuffer: ListBuffer[RdfStreamRow]): RdfIri =
  if opt.maxPrefixTableSize == 0 then
    // Use a lighter algorithm if the prefix table is disabled:
    // the whole IRI is stored as a single entry of the name table.
    val nameLookupEntry = nameLookup.addEntry(iri)
    if nameLookupEntry.newEntry then
      rowsBuffer.append(
        RdfStreamRow(RdfStreamRow.Row.Name(
          RdfNameEntry(id = nameLookupEntry.setId, value = iri)
        ))
      )
    // We set the prefixId to 0, but it's a special case, because the prefix table is disabled.
    // The consumer will interpret this as no prefix.
    RdfIri(nameId = getNameIdWithRepeat(nameLookupEntry.getId))
  else
    // Treat a null "no prefix" result the same as an empty prefix, so that
    // prefix.length below is always safe.
    val prefix = Option(getIriPrefix(iri)).getOrElse("")
    val postfix = iri.substring(prefix.length)
    // Both parts are always registered, even when empty: with default IDs the
    // consumer needs an entry to resolve on each side.
    val prefixLookupEntry = prefixLookup.addEntry(prefix)
    val nameLookupEntry = nameLookup.addEntry(postfix)

    // Emit the defining rows before the IRI that references them.
    if prefixLookupEntry.newEntry then rowsBuffer.append(
      RdfStreamRow(RdfStreamRow.Row.Prefix(
        RdfPrefixEntry(prefixLookupEntry.setId, prefix)
      ))
    )
    if nameLookupEntry.newEntry then rowsBuffer.append(
      RdfStreamRow(RdfStreamRow.Row.Name(
        RdfNameEntry(nameLookupEntry.setId, postfix)
      ))
    )

    val nameIdWithRepeat = getNameIdWithRepeat(nameLookupEntry.getId)
    if lastIriPrefixId == prefixLookupEntry.getId then
      // If the last IRI had the same prefix, we can tell the consumer to reuse it.
      // prefixId = 0 by default in this constructor.
      // No need to update lastIriPrefixId, because it's the same.
      RdfIri(nameId = nameIdWithRepeat)
    else
      lastIriPrefixId = prefixLookupEntry.getId
      RdfIri(
        prefixId = prefixLookupEntry.getId,
        nameId = nameIdWithRepeat
      )

/**
* Encodes a datatype IRI to a protobuf representation.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,27 @@ class NameDecoderSpec extends AnyWordSpec, Matchers:
error.nameId should be (5)
}

"throw MissingNameEntryError when trying to retrieve a name with empty LUT" in {
  val dec = NameDecoder(smallOptions)
  // nameId 0 resolves to "last used name ID + 1", i.e. ID 1 on a fresh decoder,
  // and nothing has been registered in the name table.
  val error = intercept[MissingNameEntryError] {
    dec.decode(RdfIri(0, 0))
  }
  error.getMessage should include ("name table at ID: 1")
  error.nameId should be (1)
}

"return empty string for no prefix and empty name" in {
val dec = NameDecoder(smallOptions)
// Register an empty name with the default ID 0.
// NOTE(review): presumably stored under the next free ID (1) — confirm in updateNames.
dec.updateNames(RdfNameEntry(0, ""))
// prefixId 0 with no earlier prefix gives ""; nameId 0 resolves to the ID after the last used one.
dec.decode(RdfIri(0, 0)) should be ("")
}

"accept new prefixes with default IDs" in {
val dec = NameDecoder(smallOptions)
// Both prefixes use the default ID 0; the assertions below expect them at IDs 1 and 2.
dec.updatePrefixes(RdfPrefixEntry(0, "https://test.org/"))
dec.updatePrefixes(RdfPrefixEntry(0, "https://test.org/2/"))
// Two empty names, one per decode call: each nameId 0 advances to the next name ID.
dec.updateNames(RdfNameEntry(0, ""))
dec.updateNames(RdfNameEntry(0, ""))
dec.decode(RdfIri(1, 0)) should be("https://test.org/")
dec.decode(RdfIri(2, 0)) should be("https://test.org/2/")
}
Expand All @@ -45,13 +57,16 @@ class NameDecoderSpec extends AnyWordSpec, Matchers:
dec.updatePrefixes(RdfPrefixEntry(4, "https://test.org/"))
// This ID will resolve to 5
dec.updatePrefixes(RdfPrefixEntry(0, "https://test.org/2/"))
dec.updateNames(RdfNameEntry(0, ""))
dec.updateNames(RdfNameEntry(0, ""))
dec.decode(RdfIri(4, 0)) should be("https://test.org/")
dec.decode(RdfIri(5, 0)) should be("https://test.org/2/")
}

"accept a new prefix and return it (IRI with no name part)" in {
val dec = NameDecoder(smallOptions)
dec.updatePrefixes(RdfPrefixEntry(3, "https://test.org/"))
// An IRI without a name part still needs an (empty) name entry, because nameId 0
// is resolved through the name table rather than treated as "no name".
dec.updateNames(RdfNameEntry(0, ""))
dec.decode(RdfIri(3, 0)) should be ("https://test.org/")
}

Expand Down
39 changes: 25 additions & 14 deletions core/src/test/scala/eu/ostrzyciel/jelly/core/NameEncoderSpec.scala
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ class NameEncoderSpec extends AnyWordSpec, Inspectors, Matchers:
"add a full IRI" in {
val (encoder, buffer) = getEncoder()
val iri = encoder.encodeIri("https://test.org/Cake", buffer)
iri.nameId should be (1)
iri.nameId should be (0)
iri.prefixId should be (1)

buffer.size should be (2)
Expand All @@ -102,19 +102,27 @@ class NameEncoderSpec extends AnyWordSpec, Inspectors, Matchers:
iri.nameId should be (0)
iri.prefixId should be (1)

buffer.size should be (1)
// an empty name entry still has to be allocated
buffer.size should be (2)
buffer should contain (RdfStreamRow(RdfStreamRow.Row.Prefix(
RdfPrefixEntry(id = 0, value = "https://test.org/test/")
)))
buffer should contain(RdfStreamRow(RdfStreamRow.Row.Name(
RdfNameEntry(id = 0, value = "")
)))
}

"add a name-only IRI" in {
val (encoder, buffer) = getEncoder()
val iri = encoder.encodeIri("testTestTest", buffer)
iri.nameId should be (1)
iri.prefixId should be (0)
iri.nameId should be (0)
iri.prefixId should be (1)

buffer.size should be (1)
// in the mode with the prefix table enabled, an empty prefix entry still has to be allocated
buffer.size should be (2)
buffer should contain(RdfStreamRow(RdfStreamRow.Row.Prefix(
RdfPrefixEntry(id = 0, value = "")
)))
buffer should contain (RdfStreamRow(RdfStreamRow.Row.Name(
RdfNameEntry(id = 0, value = "testTestTest")
)))
Expand All @@ -123,9 +131,10 @@ class NameEncoderSpec extends AnyWordSpec, Inspectors, Matchers:
"add a full IRI in no-prefix table mode" in {
val (encoder, buffer) = getEncoder(0)
val iri = encoder.encodeIri("https://test.org/Cake", buffer)
iri.nameId should be (1)
iri.nameId should be (0)
iri.prefixId should be (0)

// in the no prefix mode, there must be no prefix entries
buffer.size should be (1)
buffer should contain (RdfStreamRow(RdfStreamRow.Row.Name(
RdfNameEntry(id = 0, value = "https://test.org/Cake")
Expand All @@ -136,18 +145,19 @@ class NameEncoderSpec extends AnyWordSpec, Inspectors, Matchers:
val (encoder, buffer) = getEncoder(3)
val data = Seq(
// IRI, expected prefix ID, expected name ID
("https://test.org/Cake1", 1, 1),
("https://test.org/Cake1", 1, 0),
("https://test.org#Cake1", 2, 1),
("https://test.org/test/Cake1", 3, 1),
("https://test.org/Cake2", 1, 2),
("https://test.org/Cake2", 1, 0),
("https://test.org#Cake2", 2, 2),
("https://test.org/other/Cake1", 3, 1),
("https://test.org/other/Cake2", 3, 2),
("https://test.org/other/Cake3", 3, 3),
("https://test.org/other/Cake4", 3, 4),
("https://test.org/other/Cake5", 3, 1),
("https://test.org#Cake2", 2, 2),
("Cake2", 0, 2),
("https://test.org/other/Cake2", 0, 0),
("https://test.org/other/Cake3", 0, 0),
("https://test.org/other/Cake4", 0, 0),
("https://test.org/other/Cake5", 0, 1),
("https://test.org#Cake2", 2, 0),
// prefix "" evicts the previous number #1
("Cake2", 1, 2),
)

for (sIri, ePrefix, eName) <- data do
Expand All @@ -166,6 +176,7 @@ class NameEncoderSpec extends AnyWordSpec, Inspectors, Matchers:
(false, 0, "Cake3"),
(false, 0, "Cake4"),
(false, 1, "Cake5"),
(true, 1, ""),
)

buffer.size should be (expectedBuffer.size)
Expand Down
Loading

0 comments on commit be3821a

Please sign in to comment.