From be3821acece6151173e7f71f4229ab530b5c0e91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Sowi=C5=84ski?= Date: Wed, 22 May 2024 10:21:23 +0200 Subject: [PATCH] Experiment: default IDs in LUT references (#81) * Experiment: default IDs in LUT references * Revert changes to datatype handling --- .../ostrzyciel/jelly/core/EncoderLookup.scala | 4 - .../ostrzyciel/jelly/core/NameDecoder.scala | 15 ++- .../ostrzyciel/jelly/core/NameEncoder.scala | 93 ++++++++++++------- .../jelly/core/NameDecoderSpec.scala | 17 +++- .../jelly/core/NameEncoderSpec.scala | 39 +++++--- .../jelly/core/ProtoTestCases.scala | 45 ++++----- .../jelly/stream/EncoderFlowSpec.scala | 5 +- 7 files changed, 142 insertions(+), 76 deletions(-) diff --git a/core/src/main/scala/eu/ostrzyciel/jelly/core/EncoderLookup.scala b/core/src/main/scala/eu/ostrzyciel/jelly/core/EncoderLookup.scala index cd28d95b..73a916be 100644 --- a/core/src/main/scala/eu/ostrzyciel/jelly/core/EncoderLookup.scala +++ b/core/src/main/scala/eu/ostrzyciel/jelly/core/EncoderLookup.scala @@ -2,10 +2,6 @@ package eu.ostrzyciel.jelly.core import java.util -private[core] object EncoderValue: - // Empty default value to slightly reduce heap pressure - val Empty = EncoderValue(0, 0, false) - private[core] final case class EncoderValue(getId: Int, setId: Int, newEntry: Boolean) private[core] final class EncoderLookup(maxEntries: Int) diff --git a/core/src/main/scala/eu/ostrzyciel/jelly/core/NameDecoder.scala b/core/src/main/scala/eu/ostrzyciel/jelly/core/NameDecoder.scala index 37c37153..56c8b1d1 100644 --- a/core/src/main/scala/eu/ostrzyciel/jelly/core/NameDecoder.scala +++ b/core/src/main/scala/eu/ostrzyciel/jelly/core/NameDecoder.scala @@ -6,6 +6,9 @@ private[core] final class NameDecoder(opt: RdfStreamOptions): private val prefixLookup = new DecoderLookup[String](opt.maxPrefixTableSize) private val nameLookup = new DecoderLookup[String](opt.maxNameTableSize) + private var lastIriPrefixId: Int = 0 + private var lastIriNameId: Int = 0 + /** * Update the name table. * @param nameRow name row @@ -32,16 +35,24 @@ private[core] final class NameDecoder(opt: RdfStreamOptions): */ def decode(iri: RdfIri): String = val prefix = iri.prefixId match - case 0 => "" + case 0 if lastIriPrefixId < 1 => "" + // the .get() result can't be null here, we've already retrieved it before + case 0 => prefixLookup.get(lastIriPrefixId) case id => val p = prefixLookup.get(id) if p == null then throw MissingPrefixEntryError(id) + lastIriPrefixId = id p val name = iri.nameId match - case 0 => "" + case 0 => + lastIriNameId += 1 + val n = nameLookup.get(lastIriNameId) + if n == null then throw MissingNameEntryError(lastIriNameId) + n case id => val n = nameLookup.get(id) if n == null then throw MissingNameEntryError(id) + lastIriNameId = id n prefix + name diff --git a/core/src/main/scala/eu/ostrzyciel/jelly/core/NameEncoder.scala b/core/src/main/scala/eu/ostrzyciel/jelly/core/NameEncoder.scala index e7d9a927..c28638bf 100644 --- a/core/src/main/scala/eu/ostrzyciel/jelly/core/NameEncoder.scala +++ b/core/src/main/scala/eu/ostrzyciel/jelly/core/NameEncoder.scala @@ -4,6 +4,9 @@ import eu.ostrzyciel.jelly.core.proto.v1.* import scala.collection.mutable.ListBuffer +private[core] object NameEncoder: + private val repeatDatatype = RdfLiteral.LiteralKind.Datatype(0) + /** * IRI and datatype encoder. * Maintains internal lookups for prefixes, names, and datatypes. Uses the LRU strategy for eviction. @@ -11,17 +14,22 @@ import scala.collection.mutable.ListBuffer * @param opt Jelly options */ private[core] final class NameEncoder(opt: RdfStreamOptions): + import NameEncoder.* + private val nameLookup = new EncoderLookup(opt.maxNameTableSize) private val prefixLookup = new EncoderLookup(opt.maxPrefixTableSize) private val dtLookup = new EncoderLookup(opt.maxDatatypeTableSize) private val dtTable = new DecoderLookup[RdfLiteral.LiteralKind.Datatype](opt.maxDatatypeTableSize) + private var lastIriPrefixId: Int = -1000 + private var lastIriNameId: Int = 0 + /** * Try to extract the prefix out of the IRI. * * Somewhat based on [[org.apache.jena.riot.system.PrefixMapStd.getPossibleKey]] * @param iri IRI - * @return prefix or null (micro-optimization, don't hit me) + * @return prefix which can be empty, never null */ private def getIriPrefix(iri: String): String = iri.lastIndexOf('#') match @@ -29,7 +37,23 @@ private[core] final class NameEncoder(opt: RdfStreamOptions): case _ => iri.lastIndexOf('/') match case i if i > -1 => iri.substring(0, i + 1) - case _ => null + case _ => "" + + /** + * Obtain the id for the name lookup table to be communicated to the consumer. + * This method checks if new id = last_id + 1, and if so, it returns 0. + * + * @param getId the getId from the EncoderLookup + * @return the id to be communicated to the consumer + */ + private inline def getNameIdWithRepeat(getId: Int): Int = + if lastIriNameId + 1 == getId then + // If the last node had id - 1, we can tell it to the consumer in a shorthand manner + lastIriNameId = getId + 0 + else + lastIriNameId = getId + getId /** * Encodes an IRI to a protobuf representation. @@ -39,40 +63,47 @@ private[core] final class NameEncoder(opt: RdfStreamOptions): * @return protobuf representation of the IRI */ def encodeIri(iri: String, rowsBuffer: ListBuffer[RdfStreamRow]): RdfIri = - def plainIriEncode: RdfIri = - nameLookup.addEntry(iri) match - case EncoderValue(getId, _, false) => - RdfIri(nameId = getId) - case EncoderValue(getId, setId, true) => - rowsBuffer.append( - RdfStreamRow(RdfStreamRow.Row.Name( - RdfNameEntry(id = setId, value = iri) - )) - ) - RdfIri(nameId = getId) - if opt.maxPrefixTableSize == 0 then // Use a lighter algorithm if the prefix table is disabled - return plainIriEncode - - getIriPrefix(iri) match - case null => plainIriEncode - case prefix => - val postfix = iri.substring(prefix.length) - val pVal = prefixLookup.addEntry(prefix) - val iVal = if postfix.nonEmpty then nameLookup.addEntry(postfix) else EncoderValue.Empty - - if pVal.newEntry then rowsBuffer.append( - RdfStreamRow(RdfStreamRow.Row.Prefix( - RdfPrefixEntry(pVal.setId, prefix) - )) - ) - if iVal.newEntry then rowsBuffer.append( + val nameLookupEntry = nameLookup.addEntry(iri) + if nameLookupEntry.newEntry then + rowsBuffer.append( RdfStreamRow(RdfStreamRow.Row.Name( - RdfNameEntry(iVal.setId, postfix) + RdfNameEntry(id = nameLookupEntry.setId, value = iri) )) ) - RdfIri(prefixId = pVal.getId, nameId = iVal.getId) + // We set the prefixId to 0, but it's a special case, because the prefix table is disabled. + // The consumer will interpret this as no prefix. + RdfIri(nameId = getNameIdWithRepeat(nameLookupEntry.getId)) + else + val prefix = getIriPrefix(iri) + val postfix = iri.substring(prefix.length) + val prefixLookupEntry = prefixLookup.addEntry(prefix) + val nameLookupEntry = nameLookup.addEntry(postfix) + + if prefixLookupEntry.newEntry then rowsBuffer.append( + RdfStreamRow(RdfStreamRow.Row.Prefix( + RdfPrefixEntry(prefixLookupEntry.setId, prefix) + )) + ) + if nameLookupEntry.newEntry then rowsBuffer.append( + RdfStreamRow(RdfStreamRow.Row.Name( + RdfNameEntry(nameLookupEntry.setId, postfix) + )) + ) + + val nameIdWithRepeat = getNameIdWithRepeat(nameLookupEntry.getId) + if lastIriPrefixId == prefixLookupEntry.getId then + // If the last IRI had the same prefix, we can tell the consumer to reuse it. + // prefixId = 0 by default in this constructor. + // No need to update lastIriPrefixId, because it's the same. + RdfIri(nameId = nameIdWithRepeat) + else + lastIriPrefixId = prefixLookupEntry.getId + RdfIri( + prefixId = prefixLookupEntry.getId, + nameId = nameIdWithRepeat + ) /** * Encodes a datatype IRI to a protobuf representation. diff --git a/core/src/test/scala/eu/ostrzyciel/jelly/core/NameDecoderSpec.scala b/core/src/test/scala/eu/ostrzyciel/jelly/core/NameDecoderSpec.scala index 1cc314ff..27a592c8 100644 --- a/core/src/test/scala/eu/ostrzyciel/jelly/core/NameDecoderSpec.scala +++ b/core/src/test/scala/eu/ostrzyciel/jelly/core/NameDecoderSpec.scala @@ -27,8 +27,18 @@ class NameDecoderSpec extends AnyWordSpec, Matchers: error.nameId should be (5) } - "return empty string for no prefix and no name" in { + "throw MissingNameEntryError when trying to retrieve a name with empty LUT" in { val dec = NameDecoder(smallOptions) + val error = intercept[MissingNameEntryError] { + dec.decode(RdfIri(0, 0)) + } + error.getMessage should include ("name table at ID: 1") + error.nameId should be (1) + } + + "return empty string for no prefix and empty name" in { + val dec = NameDecoder(smallOptions) + dec.updateNames(RdfNameEntry(0, "")) dec.decode(RdfIri(0, 0)) should be ("") } @@ -36,6 +46,8 @@ class NameDecoderSpec extends AnyWordSpec, Matchers: val dec = NameDecoder(smallOptions) dec.updatePrefixes(RdfPrefixEntry(0, "https://test.org/")) dec.updatePrefixes(RdfPrefixEntry(0, "https://test.org/2/")) + dec.updateNames(RdfNameEntry(0, "")) + dec.updateNames(RdfNameEntry(0, "")) dec.decode(RdfIri(1, 0)) should be("https://test.org/") dec.decode(RdfIri(2, 0)) should be("https://test.org/2/") } @@ -45,6 +57,8 @@ class NameDecoderSpec extends AnyWordSpec, Matchers: dec.updatePrefixes(RdfPrefixEntry(4, "https://test.org/")) // This ID will resolve to 5 dec.updatePrefixes(RdfPrefixEntry(0, "https://test.org/2/")) + dec.updateNames(RdfNameEntry(0, "")) + dec.updateNames(RdfNameEntry(0, "")) dec.decode(RdfIri(4, 0)) should be("https://test.org/") dec.decode(RdfIri(5, 0)) should be("https://test.org/2/") } @@ -52,6 +66,7 @@ class NameDecoderSpec extends AnyWordSpec, Matchers: "accept a new prefix and return it (IRI with no name part)" in { val dec = NameDecoder(smallOptions) dec.updatePrefixes(RdfPrefixEntry(3, "https://test.org/")) + dec.updateNames(RdfNameEntry(0, "")) dec.decode(RdfIri(3, 0)) should be ("https://test.org/") } diff --git a/core/src/test/scala/eu/ostrzyciel/jelly/core/NameEncoderSpec.scala b/core/src/test/scala/eu/ostrzyciel/jelly/core/NameEncoderSpec.scala index ecd549a6..738dd810 100644 --- a/core/src/test/scala/eu/ostrzyciel/jelly/core/NameEncoderSpec.scala +++ b/core/src/test/scala/eu/ostrzyciel/jelly/core/NameEncoderSpec.scala @@ -84,7 +84,7 @@ class NameEncoderSpec extends AnyWordSpec, Inspectors, Matchers: "add a full IRI" in { val (encoder, buffer) = getEncoder() val iri = encoder.encodeIri("https://test.org/Cake", buffer) - iri.nameId should be (1) + iri.nameId should be (0) iri.prefixId should be (1) buffer.size should be (2) @@ -102,19 +102,27 @@ class NameEncoderSpec extends AnyWordSpec, Inspectors, Matchers: iri.nameId should be (0) iri.prefixId should be (1) - buffer.size should be (1) + // an empty name entry still has to be allocated + buffer.size should be (2) buffer should contain (RdfStreamRow(RdfStreamRow.Row.Prefix( RdfPrefixEntry(id = 0, value = "https://test.org/test/") ))) + buffer should contain(RdfStreamRow(RdfStreamRow.Row.Name( + RdfNameEntry(id = 0, value = "") + ))) } "add a name-only IRI" in { val (encoder, buffer) = getEncoder() val iri = encoder.encodeIri("testTestTest", buffer) - iri.nameId should be (1) - iri.prefixId should be (0) + iri.nameId should be (0) + iri.prefixId should be (1) - buffer.size should be (1) + // in the mode with the prefix table enabled, an empty prefix entry still has to be allocated + buffer.size should be (2) + buffer should contain(RdfStreamRow(RdfStreamRow.Row.Prefix( + RdfPrefixEntry(id = 0, value = "") + ))) buffer should contain (RdfStreamRow(RdfStreamRow.Row.Name( RdfNameEntry(id = 0, value = "testTestTest") ))) @@ -123,9 +131,10 @@ class NameEncoderSpec extends AnyWordSpec, Inspectors, Matchers: "add a full IRI in no-prefix table mode" in { val (encoder, buffer) = getEncoder(0) val iri = encoder.encodeIri("https://test.org/Cake", buffer) - iri.nameId should be (1) + iri.nameId should be (0) iri.prefixId should be (0) + // in the no prefix mode, there must be no prefix entries buffer.size should be (1) buffer should contain (RdfStreamRow(RdfStreamRow.Row.Name( RdfNameEntry(id = 0, value = "https://test.org/Cake") @@ -136,18 +145,19 @@ class NameEncoderSpec extends AnyWordSpec, Inspectors, Matchers: val (encoder, buffer) = getEncoder(3) val data = Seq( // IRI, expected prefix ID, expected name ID - ("https://test.org/Cake1", 1, 1), + ("https://test.org/Cake1", 1, 0), ("https://test.org#Cake1", 2, 1), ("https://test.org/test/Cake1", 3, 1), - ("https://test.org/Cake2", 1, 2), + ("https://test.org/Cake2", 1, 0), ("https://test.org#Cake2", 2, 2), ("https://test.org/other/Cake1", 3, 1), - ("https://test.org/other/Cake2", 3, 2), - ("https://test.org/other/Cake3", 3, 3), - ("https://test.org/other/Cake4", 3, 4), - ("https://test.org/other/Cake5", 3, 1), - ("https://test.org#Cake2", 2, 2), - ("Cake2", 0, 2), + ("https://test.org/other/Cake2", 0, 0), + ("https://test.org/other/Cake3", 0, 0), + ("https://test.org/other/Cake4", 0, 0), + ("https://test.org/other/Cake5", 0, 1), + ("https://test.org#Cake2", 2, 0), + // prefix "" evicts the previous number #1 + ("Cake2", 1, 2), ) for (sIri, ePrefix, eName) <- data do @@ -166,6 +176,7 @@ class NameEncoderSpec extends AnyWordSpec, Inspectors, Matchers: (false, 0, "Cake3"), (false, 0, "Cake4"), (false, 1, "Cake5"), + (true, 1, ""), ) buffer.size should be (expectedBuffer.size) diff --git a/core/src/test/scala/eu/ostrzyciel/jelly/core/ProtoTestCases.scala b/core/src/test/scala/eu/ostrzyciel/jelly/core/ProtoTestCases.scala index 96722252..8e2866e4 100644 --- a/core/src/test/scala/eu/ostrzyciel/jelly/core/ProtoTestCases.scala +++ b/core/src/test/scala/eu/ostrzyciel/jelly/core/ProtoTestCases.scala @@ -70,9 +70,9 @@ object ProtoTestCases: RdfPrefixEntry(0, "https://test.org/ns2/"), RdfNameEntry(0, "object"), RdfTriple( - RdfTerm(RdfTerm.Term.Iri(RdfIri(1, 1))), - RdfTerm(RdfTerm.Term.Iri(RdfIri(1, 2))), - RdfTerm(RdfTerm.Term.Iri(RdfIri(2, 3))), + RdfTerm(RdfTerm.Term.Iri(RdfIri(1, 0))), + RdfTerm(RdfTerm.Term.Iri(RdfIri(0, 0))), + RdfTerm(RdfTerm.Term.Iri(RdfIri(2, 0))), ), RdfDatatypeEntry(0, "https://test.org/xsd/integer"), RdfTriple( @@ -80,6 +80,7 @@ object ProtoTestCases: TERM_REPEAT, RdfTerm(RdfTerm.Term.Literal(RdfLiteral("123", RdfLiteral.LiteralKind.Datatype(1)))), ), + RdfPrefixEntry(0, ""), RdfNameEntry(0, "b"), RdfNameEntry(0, "c"), RdfTriple( @@ -87,13 +88,13 @@ object ProtoTestCases: TERM_REPEAT, RdfTerm(RdfTerm.Term.TripleTerm(RdfTriple( RdfTerm(RdfTerm.Term.Iri(RdfIri(1, 1))), - RdfTerm(RdfTerm.Term.Iri(RdfIri(0, 4))), - RdfTerm(RdfTerm.Term.Iri(RdfIri(0, 5))), + RdfTerm(RdfTerm.Term.Iri(RdfIri(3, 4))), + RdfTerm(RdfTerm.Term.Iri(RdfIri(0, 0))), ))) ), RdfTriple( RdfTerm(RdfTerm.Term.Iri(RdfIri(1, 2))), - RdfTerm(RdfTerm.Term.Iri(RdfIri(1, 1))), + RdfTerm(RdfTerm.Term.Iri(RdfIri(0, 1))), TERM_REPEAT, ), )) @@ -120,14 +121,14 @@ object ProtoTestCases: RdfPrefixEntry(0, "https://test.org/ns2/"), RdfNameEntry(0, "object"), RdfTriple( - RdfTerm(RdfTerm.Term.Iri(RdfIri(1, 1))), - RdfTerm(RdfTerm.Term.Iri(RdfIri(1, 2))), - RdfTerm(RdfTerm.Term.Iri(RdfIri(2, 3))), + RdfTerm(RdfTerm.Term.Iri(RdfIri(1, 0))), + RdfTerm(RdfTerm.Term.Iri(RdfIri(0, 0))), + RdfTerm(RdfTerm.Term.Iri(RdfIri(2, 0))), ), RdfDatatypeEntry(0, "https://test.org/xsd/integer"), RdfTriple( RdfTerm(RdfTerm.Term.Iri(RdfIri(1, 1))), - RdfTerm(RdfTerm.Term.Iri(RdfIri(1, 2))), + RdfTerm(RdfTerm.Term.Iri(RdfIri(0, 0))), RdfTerm(RdfTerm.Term.Literal(RdfLiteral("123", RdfLiteral.LiteralKind.Datatype(1)))), ), )) @@ -169,10 +170,10 @@ object ProtoTestCases: RdfPrefixEntry(0, "https://test.org/ns3/"), RdfNameEntry(0, "graph"), RdfQuad( - RdfTerm(RdfTerm.Term.Iri(RdfIri(1, 1))), - RdfTerm(RdfTerm.Term.Iri(RdfIri(1, 2))), + RdfTerm(RdfTerm.Term.Iri(RdfIri(1, 0))), + RdfTerm(RdfTerm.Term.Iri(RdfIri(0, 0))), RdfTerm(RdfTerm.Term.Literal(RdfLiteral("test", RdfLiteral.LiteralKind.Langtag("en-gb")))), - RdfGraph(RdfGraph.Graph.Iri(RdfIri(2, 3))), + RdfGraph(RdfGraph.Graph.Iri(RdfIri(2, 0))), ), RdfQuad( TERM_REPEAT, @@ -222,10 +223,10 @@ object ProtoTestCases: RdfPrefixEntry(0, "https://test.org/ns3/"), RdfNameEntry(0, "graph"), RdfQuad( - RdfTerm(RdfTerm.Term.Iri(RdfIri(1, 1))), - RdfTerm(RdfTerm.Term.Iri(RdfIri(1, 2))), + RdfTerm(RdfTerm.Term.Iri(RdfIri(1, 0))), + RdfTerm(RdfTerm.Term.Iri(RdfIri(0, 0))), RdfTerm(RdfTerm.Term.Literal(RdfLiteral("test", RdfLiteral.LiteralKind.Langtag("en-gb")))), - RdfGraph(RdfGraph.Graph.Iri(RdfIri(2, 3))), + RdfGraph(RdfGraph.Graph.Iri(RdfIri(2, 0))), ), RdfQuad( RdfTerm(RdfTerm.Term.Iri(RdfIri(1, 1))), @@ -259,8 +260,8 @@ object ProtoTestCases: RdfNameEntry(0, "subject"), RdfNameEntry(0, "predicate"), RdfQuad( - RdfTerm(RdfTerm.Term.Iri(RdfIri(1, 1))), - RdfTerm(RdfTerm.Term.Iri(RdfIri(1, 2))), + RdfTerm(RdfTerm.Term.Iri(RdfIri(1, 0))), + RdfTerm(RdfTerm.Term.Iri(RdfIri(0, 0))), RdfTerm(RdfTerm.Term.Literal(RdfLiteral("test", RdfLiteral.LiteralKind.Langtag("en-gb")))), RdfGraph(RdfGraph.Graph.DefaultGraph(RdfDefaultGraph())), ), @@ -333,9 +334,9 @@ object ProtoTestCases: RdfPrefixEntry(0, "https://test.org/ns2/"), RdfNameEntry(0, "object"), RdfTriple( - RdfTerm(RdfTerm.Term.Iri(RdfIri(1, 1))), - RdfTerm(RdfTerm.Term.Iri(RdfIri(1, 2))), - RdfTerm(RdfTerm.Term.Iri(RdfIri(2, 3))), + RdfTerm(RdfTerm.Term.Iri(RdfIri(1, 0))), + RdfTerm(RdfTerm.Term.Iri(RdfIri(0, 0))), + RdfTerm(RdfTerm.Term.Iri(RdfIri(2, 0))), ), RdfDatatypeEntry(0, "https://test.org/xsd/integer"), RdfTriple( @@ -346,7 +347,7 @@ object ProtoTestCases: RdfGraphEnd(), RdfPrefixEntry(0, "https://test.org/ns3/"), RdfNameEntry(0, "graph"), - RdfGraphStart(RdfGraph(RdfGraph.Graph.Iri(RdfIri(3, 4)))), + RdfGraphStart(RdfGraph(RdfGraph.Graph.Iri(RdfIri(3, 0)))), RdfTriple( TERM_REPEAT, TERM_REPEAT, diff --git a/stream/src/test/scala/eu/ostrzyciel/jelly/stream/EncoderFlowSpec.scala b/stream/src/test/scala/eu/ostrzyciel/jelly/stream/EncoderFlowSpec.scala index f711b31c..92834f4b 100644 --- a/stream/src/test/scala/eu/ostrzyciel/jelly/stream/EncoderFlowSpec.scala +++ b/stream/src/test/scala/eu/ostrzyciel/jelly/stream/EncoderFlowSpec.scala @@ -120,11 +120,12 @@ class EncoderFlowSpec extends AnyWordSpec, Matchers, ScalaFutures: .withLogicalType(LogicalStreamType.FLAT_TRIPLES) ) ) - encoded.size should be (4) + encoded.size should be (5) encoded.head.rows.count(_.row.isTriple) should be (0) encoded(1).rows.count(_.row.isTriple) should be (1) encoded(2).rows.count(_.row.isTriple) should be (1) - encoded(3).rows.count(_.row.isTriple) should be (2) + encoded(3).rows.count(_.row.isTriple) should be (1) + encoded(4).rows.count(_.row.isTriple) should be (1) } }