Experiment: default IDs in LUT references (#81)

* Experiment: default IDs in LUT references
* Revert changes to datatype handling
Jelly-RDF · May 22, 2024 · be3821a · be3821a
1 parent a912e64
commit be3821a
Show file tree

Hide file tree

Showing 7 changed files with 142 additions and 76 deletions.
diff --git a/core/src/main/scala/eu/ostrzyciel/jelly/core/EncoderLookup.scala b/core/src/main/scala/eu/ostrzyciel/jelly/core/EncoderLookup.scala
@@ -2,10 +2,6 @@ package eu.ostrzyciel.jelly.core
 
 import java.util
 
-private[core] object EncoderValue:
-  // Empty default value to slightly reduce heap pressure
-  val Empty = EncoderValue(0, 0, false)
-
 private[core] final case class EncoderValue(getId: Int, setId: Int, newEntry: Boolean)
 
 private[core] final class EncoderLookup(maxEntries: Int)

diff --git a/core/src/main/scala/eu/ostrzyciel/jelly/core/NameDecoder.scala b/core/src/main/scala/eu/ostrzyciel/jelly/core/NameDecoder.scala
@@ -6,6 +6,9 @@ private[core] final class NameDecoder(opt: RdfStreamOptions):
   private val prefixLookup = new DecoderLookup[String](opt.maxPrefixTableSize)
   private val nameLookup = new DecoderLookup[String](opt.maxNameTableSize)
 
+  private var lastIriPrefixId: Int = 0
+  private var lastIriNameId: Int = 0
+
   /**
    * Update the name table.
    * @param nameRow name row
@@ -32,16 +35,24 @@ private[core] final class NameDecoder(opt: RdfStreamOptions):
    */
   def decode(iri: RdfIri): String =
     val prefix = iri.prefixId match
-      case 0 => ""
+      // No prefix ID has been recorded yet (lastIriPrefixId is still below 1): no prefix.
+      case 0 if lastIriPrefixId < 1 => ""
+      // prefixId 0 repeats the prefix ID of the previous IRI. That ID is only stored after
+      // a successful lookup, so the result is not null-checked again here.
+      case 0 => prefixLookup.get(lastIriPrefixId)
       case id =>
         val p = prefixLookup.get(id)
         if p == null then throw MissingPrefixEntryError(id)
+        lastIriPrefixId = id
         p
     val name = iri.nameId match
-      case 0 => ""
+      case 0 =>
+        // nameId 0 is shorthand for "ID of the previous name + 1".
+        // Commit the new ID only after the lookup succeeded, so that a failed decode
+        // does not leave lastIriNameId pointing at an entry that was never read.
+        val nextId = lastIriNameId + 1
+        val n = nameLookup.get(nextId)
+        if n == null then throw MissingNameEntryError(nextId)
+        lastIriNameId = nextId
+        n
       case id =>
         val n = nameLookup.get(id)
         if n == null then throw MissingNameEntryError(id)
+        lastIriNameId = id
         n
 
     prefix + name
diff --git a/core/src/main/scala/eu/ostrzyciel/jelly/core/NameEncoder.scala b/core/src/main/scala/eu/ostrzyciel/jelly/core/NameEncoder.scala
@@ -4,32 +4,56 @@ import eu.ostrzyciel.jelly.core.proto.v1.*
 
 import scala.collection.mutable.ListBuffer
 
+private[core] object NameEncoder:
+  private val repeatDatatype = RdfLiteral.LiteralKind.Datatype(0)
+
 /**
  * IRI and datatype encoder.
  * Maintains internal lookups for prefixes, names, and datatypes. Uses the LRU strategy for eviction.
  *
  * @param opt Jelly options
  */
 private[core] final class NameEncoder(opt: RdfStreamOptions):
+  import NameEncoder.*
+
   private val nameLookup = new EncoderLookup(opt.maxNameTableSize)
   private val prefixLookup = new EncoderLookup(opt.maxPrefixTableSize)
   private val dtLookup = new EncoderLookup(opt.maxDatatypeTableSize)
   private val dtTable = new DecoderLookup[RdfLiteral.LiteralKind.Datatype](opt.maxDatatypeTableSize)
 
+  private var lastIriPrefixId: Int = -1000
+  private var lastIriNameId: Int = 0
+
   /**
    * Try to extract the prefix out of the IRI.
    *
    * Somewhat based on [[org.apache.jena.riot.system.PrefixMapStd.getPossibleKey]]
    * @param iri IRI
-   * @return prefix or null (micro-optimization, don't hit me)
+   * @return prefix which can be empty, never null
    */
   private def getIriPrefix(iri: String): String =
     iri.lastIndexOf('#') match
       case i if i > -1 => iri.substring(0, i + 1)
       case _ =>
         iri.lastIndexOf('/') match
           case i if i > -1 => iri.substring(0, i + 1)
-          case _ => null
+          case _ => ""
+
+  /**
+   * Translate a name table id into the id that is actually sent to the consumer.
+   * An id that directly follows the previously used one (last id + 1) is sent as 0;
+   * any other id is sent unchanged. The last used name id is updated in both cases.
+   *
+   * @param getId the getId from the EncoderLookup
+   * @return the id to be communicated to the consumer
+   */
+  private inline def getNameIdWithRepeat(getId: Int): Int =
+    // Compare against the old value before it is overwritten below
+    val followsPrevious = getId == lastIriNameId + 1
+    lastIriNameId = getId
+    if followsPrevious then 0 else getId
 
   /**
    * Encodes an IRI to a protobuf representation.
@@ -39,40 +63,47 @@ private[core] final class NameEncoder(opt: RdfStreamOptions):
    * @return protobuf representation of the IRI
    */
   def encodeIri(iri: String, rowsBuffer: ListBuffer[RdfStreamRow]): RdfIri =
-    def plainIriEncode: RdfIri =
-      nameLookup.addEntry(iri) match
-        case EncoderValue(getId, _, false) =>
-          RdfIri(nameId = getId)
-        case EncoderValue(getId, setId, true) =>
-          rowsBuffer.append(
-            RdfStreamRow(RdfStreamRow.Row.Name(
-              RdfNameEntry(id = setId, value = iri)
-            ))
-          )
-          RdfIri(nameId = getId)
-
     if opt.maxPrefixTableSize == 0 then
       // Use a lighter algorithm if the prefix table is disabled
-      return plainIriEncode
-
-    getIriPrefix(iri) match
-      case null => plainIriEncode
-      case prefix =>
-        val postfix = iri.substring(prefix.length)
-        val pVal = prefixLookup.addEntry(prefix)
-        val iVal = if postfix.nonEmpty then nameLookup.addEntry(postfix) else EncoderValue.Empty
-
-        if pVal.newEntry then rowsBuffer.append(
-          RdfStreamRow(RdfStreamRow.Row.Prefix(
-            RdfPrefixEntry(pVal.setId, prefix)
-          ))
-        )
-        if iVal.newEntry then rowsBuffer.append(
+      val nameLookupEntry = nameLookup.addEntry(iri)
+      if nameLookupEntry.newEntry then
+        rowsBuffer.append(
           RdfStreamRow(RdfStreamRow.Row.Name(
-            RdfNameEntry(iVal.setId, postfix)
+            RdfNameEntry(id = nameLookupEntry.setId, value = iri)
           ))
         )
-        RdfIri(prefixId = pVal.getId, nameId = iVal.getId)
+      // prefixId is left at 0. With the prefix table disabled no prefix ID is ever sent,
+      // so the consumer resolves 0 to "no prefix" rather than to a repeated prefix.
+      RdfIri(nameId = getNameIdWithRepeat(nameLookupEntry.getId))
+    else
+      val prefix = getIriPrefix(iri)
+      val postfix = iri.substring(prefix.length)
+      val prefixLookupEntry = prefixLookup.addEntry(prefix)
+      val nameLookupEntry = nameLookup.addEntry(postfix)
+
+      if prefixLookupEntry.newEntry then rowsBuffer.append(
+        RdfStreamRow(RdfStreamRow.Row.Prefix(
+          RdfPrefixEntry(prefixLookupEntry.setId, prefix)
+        ))
+      )
+      if nameLookupEntry.newEntry then rowsBuffer.append(
+        RdfStreamRow(RdfStreamRow.Row.Name(
+          RdfNameEntry(nameLookupEntry.setId, postfix)
+        ))
+      )
+
+      // Evaluated once, ahead of the branch: this call also records lastIriNameId.
+      val nameIdWithRepeat = getNameIdWithRepeat(nameLookupEntry.getId)
+      if lastIriPrefixId == prefixLookupEntry.getId then
+        // Same prefix ID as the previous IRI: leave prefixId at 0 (the constructor default)
+        // to tell the consumer to reuse it. lastIriPrefixId is already up to date.
+        RdfIri(nameId = nameIdWithRepeat)
+      else
+        lastIriPrefixId = prefixLookupEntry.getId
+        RdfIri(
+          prefixId = prefixLookupEntry.getId,
+          nameId = nameIdWithRepeat
+        )
 
   /**
    * Encodes a datatype IRI to a protobuf representation.

diff --git a/core/src/test/scala/eu/ostrzyciel/jelly/core/NameDecoderSpec.scala b/core/src/test/scala/eu/ostrzyciel/jelly/core/NameDecoderSpec.scala
@@ -27,15 +27,27 @@ class NameDecoderSpec extends AnyWordSpec, Matchers:
         error.nameId should be (5)
       }
 
-      "return empty string for no prefix and no name" in {
+      "throw MissingNameEntryError when trying to retrieve a name with empty LUT" in {
         val dec = NameDecoder(smallOptions)
+        val error = intercept[MissingNameEntryError] {
+          dec.decode(RdfIri(0, 0))
+        }
+        error.getMessage should include ("name table at ID: 1")
+        error.nameId should be (1)
+      }
+
+      "return empty string for no prefix and empty name" in {
+        val dec = NameDecoder(smallOptions)
+        dec.updateNames(RdfNameEntry(0, ""))
         dec.decode(RdfIri(0, 0)) should be ("")
       }
 
       "accept new prefixes with default IDs" in {
         val dec = NameDecoder(smallOptions)
         dec.updatePrefixes(RdfPrefixEntry(0, "https://test.org/"))
         dec.updatePrefixes(RdfPrefixEntry(0, "https://test.org/2/"))
+        dec.updateNames(RdfNameEntry(0, ""))
+        dec.updateNames(RdfNameEntry(0, ""))
         dec.decode(RdfIri(1, 0)) should be("https://test.org/")
         dec.decode(RdfIri(2, 0)) should be("https://test.org/2/")
       }
@@ -45,13 +57,16 @@ class NameDecoderSpec extends AnyWordSpec, Matchers:
         dec.updatePrefixes(RdfPrefixEntry(4, "https://test.org/"))
         // This ID will resolve to 5
         dec.updatePrefixes(RdfPrefixEntry(0, "https://test.org/2/"))
+        dec.updateNames(RdfNameEntry(0, ""))
+        dec.updateNames(RdfNameEntry(0, ""))
         dec.decode(RdfIri(4, 0)) should be("https://test.org/")
         dec.decode(RdfIri(5, 0)) should be("https://test.org/2/")
       }
 
       "accept a new prefix and return it (IRI with no name part)" in {
         val dec = NameDecoder(smallOptions)
         dec.updatePrefixes(RdfPrefixEntry(3, "https://test.org/"))
+        dec.updateNames(RdfNameEntry(0, ""))
         dec.decode(RdfIri(3, 0)) should be ("https://test.org/")
       }
 

diff --git a/core/src/test/scala/eu/ostrzyciel/jelly/core/NameEncoderSpec.scala b/core/src/test/scala/eu/ostrzyciel/jelly/core/NameEncoderSpec.scala
@@ -84,7 +84,7 @@ class NameEncoderSpec extends AnyWordSpec, Inspectors, Matchers:
       "add a full IRI" in {
         val (encoder, buffer) = getEncoder()
         val iri = encoder.encodeIri("https://test.org/Cake", buffer)
-        iri.nameId should be (1)
+        iri.nameId should be (0)
         iri.prefixId should be (1)
 
         buffer.size should be (2)
@@ -102,19 +102,27 @@ class NameEncoderSpec extends AnyWordSpec, Inspectors, Matchers:
         iri.nameId should be (0)
         iri.prefixId should be (1)
 
-        buffer.size should be (1)
+        // an empty name entry still has to be allocated
+        buffer.size should be (2)
         buffer should contain (RdfStreamRow(RdfStreamRow.Row.Prefix(
           RdfPrefixEntry(id = 0, value = "https://test.org/test/")
         )))
+        buffer should contain(RdfStreamRow(RdfStreamRow.Row.Name(
+          RdfNameEntry(id = 0, value = "")
+        )))
       }
 
       "add a name-only IRI" in {
         val (encoder, buffer) = getEncoder()
         val iri = encoder.encodeIri("testTestTest", buffer)
-        iri.nameId should be (1)
-        iri.prefixId should be (0)
+        iri.nameId should be (0)
+        iri.prefixId should be (1)
 
-        buffer.size should be (1)
+        // in the mode with the prefix table enabled, an empty prefix entry still has to be allocated
+        buffer.size should be (2)
+        buffer should contain(RdfStreamRow(RdfStreamRow.Row.Prefix(
+          RdfPrefixEntry(id = 0, value = "")
+        )))
         buffer should contain (RdfStreamRow(RdfStreamRow.Row.Name(
           RdfNameEntry(id = 0, value = "testTestTest")
         )))
@@ -123,9 +131,10 @@ class NameEncoderSpec extends AnyWordSpec, Inspectors, Matchers:
       "add a full IRI in no-prefix table mode" in {
         val (encoder, buffer) = getEncoder(0)
         val iri = encoder.encodeIri("https://test.org/Cake", buffer)
-        iri.nameId should be (1)
+        iri.nameId should be (0)
         iri.prefixId should be (0)
 
+        // in the no prefix mode, there must be no prefix entries
         buffer.size should be (1)
         buffer should contain (RdfStreamRow(RdfStreamRow.Row.Name(
           RdfNameEntry(id = 0, value = "https://test.org/Cake")
@@ -136,18 +145,19 @@ class NameEncoderSpec extends AnyWordSpec, Inspectors, Matchers:
         val (encoder, buffer) = getEncoder(3)
         val data = Seq(
           // IRI, expected prefix ID, expected name ID
-          ("https://test.org/Cake1", 1, 1),
+          ("https://test.org/Cake1", 1, 0),
           ("https://test.org#Cake1", 2, 1),
           ("https://test.org/test/Cake1", 3, 1),
-          ("https://test.org/Cake2", 1, 2),
+          ("https://test.org/Cake2", 1, 0),
           ("https://test.org#Cake2", 2, 2),
           ("https://test.org/other/Cake1", 3, 1),
-          ("https://test.org/other/Cake2", 3, 2),
-          ("https://test.org/other/Cake3", 3, 3),
-          ("https://test.org/other/Cake4", 3, 4),
-          ("https://test.org/other/Cake5", 3, 1),
-          ("https://test.org#Cake2", 2, 2),
-          ("Cake2", 0, 2),
+          ("https://test.org/other/Cake2", 0, 0),
+          ("https://test.org/other/Cake3", 0, 0),
+          ("https://test.org/other/Cake4", 0, 0),
+          ("https://test.org/other/Cake5", 0, 1),
+          ("https://test.org#Cake2", 2, 0),
+          // prefix "" evicts the previous number #1
+          ("Cake2", 1, 2),
         )
 
         for (sIri, ePrefix, eName) <- data do
@@ -166,6 +176,7 @@ class NameEncoderSpec extends AnyWordSpec, Inspectors, Matchers:
           (false, 0, "Cake3"),
           (false, 0, "Cake4"),
           (false, 1, "Cake5"),
+          (true, 1, ""),
         )
 
         buffer.size should be (expectedBuffer.size)