Skip to content

Commit

Permalink
Deserialization
Browse files Browse the repository at this point in the history
  • Loading branch information
bleroy committed Jul 26, 2020
1 parent 86994b7 commit f8032aa
Show file tree
Hide file tree
Showing 9 changed files with 269 additions and 113 deletions.
95 changes: 77 additions & 18 deletions LunrCore/Index.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Text.Json;
using System.Text.Json.Serialization;
using System.Threading;
using System.Threading.Tasks;
Expand Down Expand Up @@ -83,26 +85,17 @@ public static async Task<Index> Build(
IDictionary<string, Pipeline.Function>? registry = null!,
params Field[] fields)
{
Pipeline.Function trimmerFunction
= (trimmer ?? new Trimmer()).FilterFunction;
Pipeline.Function filterFunction
= (stopWordFilter ?? new EnglishStopWordFilter()).FilterFunction;
Pipeline.Function stemmerFunction
= (stemmer ?? new EnglishStemmer()).StemmerFunction;

registry ??= new Dictionary<string, Pipeline.Function>
{
{ "trimmer", trimmerFunction },
{ "stopWordFilter", filterFunction },
{ "stemmer", stemmerFunction }
};

var indexingPipeline = new Pipeline(registry, trimmerFunction, filterFunction, stemmerFunction);
var searchPipeline = new Pipeline(registry, stemmerFunction);
Pipeline.Function trimmerFunction = (trimmer ?? new Trimmer()).FilterFunction;
Pipeline.Function filterFunction = (stopWordFilter ?? new EnglishStopWordFilter()).FilterFunction;
Pipeline.Function stemmerFunction = (stemmer ?? new EnglishStemmer()).StemmerFunction;
registry ??= new Dictionary<string, Pipeline.Function>();
registry.Add("trimmer", trimmerFunction);
registry.Add("stopWordFilter", filterFunction);
registry.Add("stemmer", stemmerFunction);

var builder = new Builder(
indexingPipeline: indexingPipeline,
searchPipeline: searchPipeline,
indexingPipeline: new Pipeline(registry, trimmerFunction, filterFunction, stemmerFunction),
searchPipeline: new Pipeline(registry, stemmerFunction),
tokenizer: tokenizer ?? new Tokenizer(),
fields: fields);

Expand Down Expand Up @@ -522,5 +515,71 @@ public async IAsyncEnumerable<Result> Query(Action<Query> queryFactory)
yield return result;
}
}

/// <summary>
/// Asynchronously loads an index from a JSON stream.
/// </summary>
/// <param name="utf8json">The JSON stream.</param>
/// <param name="stemmer">
/// An optional stemmer. English is used if none is provided.
/// Not used when <paramref name="registry"/> already has a "stemmer" entry.
/// </param>
/// <param name="registry">An optional registry of pipeline functions to use to resolve the persisted pipeline.</param>
/// <param name="cancellationToken">An optional token that cancels reading from the stream.</param>
/// <returns>The index.</returns>
/// <exception cref="JsonException">The stream contained the JSON <c>null</c> literal instead of an index.</exception>
public static async Task<Index> LoadFromJsonStream(
    Stream utf8json,
    StemmerBase? stemmer = null,
    IDictionary<string, Pipeline.Function>? registry = null,
    CancellationToken cancellationToken = default)
{
    // Library code: do not force the continuation back onto the caller's context.
    Index? index = await JsonSerializer
        .DeserializeAsync<Index>(utf8json, cancellationToken: cancellationToken)
        .ConfigureAwait(false);

    // The serializer yields null for a JSON `null` document; fail with a clear
    // message here rather than with a NullReferenceException further down.
    if (index is null)
    {
        throw new JsonException("The JSON stream did not contain an index.");
    }

    return ProcessDeserializedIndex(stemmer, registry, index);
}

/// <summary>
/// Loads an index from a JSON string.
/// </summary>
/// <param name="json">The JSON string.</param>
/// <param name="stemmer">
/// An optional stemmer. English is used if none is provided.
/// Not used when <paramref name="registry"/> already has a "stemmer" entry.
/// </param>
/// <param name="registry">An optional registry of pipeline functions to use to resolve the persisted pipeline.</param>
/// <returns>The index.</returns>
/// <exception cref="JsonException">The string was the JSON <c>null</c> literal instead of an index.</exception>
public static Index LoadFromJson(
    string json,
    StemmerBase? stemmer = null,
    IDictionary<string, Pipeline.Function>? registry = null)
{
    Index? index = JsonSerializer.Deserialize<Index>(json);

    // The serializer yields null for a JSON `null` document; fail with a clear
    // message here rather than with a NullReferenceException further down.
    if (index is null)
    {
        throw new JsonException("The JSON string did not contain an index.");
    }

    return ProcessDeserializedIndex(stemmer, registry, index);
}

/// <summary>
/// Serializes this index as JSON and writes it to a stream.
/// </summary>
/// <param name="utf8json">The destination stream.</param>
/// <param name="options">Serializer options; may be omitted.</param>
/// <returns>A task that completes once the index has been written.</returns>
public async Task SaveToJsonStream(Stream utf8json, JsonSerializerOptions? options = null!)
{
    Task pendingWrite = JsonSerializer.SerializeAsync(utf8json, this, options);
    await pendingWrite;
}

/// <summary>
/// Serializes this index to a JSON string.
/// </summary>
/// <param name="options">Optional serializer options.</param>
/// <returns>The JSON representation of this index.</returns>
public string ToJson(JsonSerializerOptions? options = null)
    => JsonSerializer.Serialize(this, options);

/// <summary>
/// Registers pipeline functions on a freshly deserialized index so that its
/// persisted pipeline can be resolved.
/// </summary>
/// <param name="stemmer">
/// The stemmer to register under the name "stemmer" when <paramref name="registry"/>
/// has no such entry. English is used if none is provided.
/// </param>
/// <param name="registry">
/// Optional named pipeline functions to register. The dictionary is only read, never modified.
/// </param>
/// <param name="index">The deserialized index.</param>
/// <returns>The same <paramref name="index"/> instance.</returns>
/// <exception cref="JsonException"><paramref name="index"/> is null, i.e. nothing was deserialized.</exception>
private static Index ProcessDeserializedIndex(
    StemmerBase? stemmer,
    IDictionary<string, Pipeline.Function>? registry,
    Index index)
{
    if (index is null)
    {
        throw new JsonException("The JSON did not contain an index.");
    }

    bool stemmerRegistered = false;
    if (registry != null)
    {
        foreach ((string functionName, Pipeline.Function function) in registry)
        {
            index.Pipeline.RegisterFunction(function, functionName);
        }
        stemmerRegistered = registry.ContainsKey("stemmer");
    }

    if (!stemmerRegistered)
    {
        // Register the fallback stemmer directly on the pipeline instead of adding it
        // to the caller's dictionary: the caller's registry stays untouched, and
        // read-only dictionaries are accepted.
        Pipeline.Function stemmerFunction = (stemmer ?? new EnglishStemmer()).StemmerFunction;
        index.Pipeline.RegisterFunction(stemmerFunction, "stemmer");
    }

    return index;
}
}
}
72 changes: 29 additions & 43 deletions LunrCore/Serialization/IndexJsonConverter.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.Json;
using System.Text.Json.Serialization;

Expand All @@ -25,46 +24,38 @@ public override Index Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSe
throw new JsonException("An index can only be deserialized from an object.");
}
reader.Read();
while (reader.Read())
while (reader.AdvanceTo(JsonTokenType.PropertyName, JsonTokenType.EndObject) != JsonTokenType.EndObject)
{
if (reader.TokenType == JsonTokenType.EndObject)
string propertyName = reader.ReadValue<string>(options);
switch (propertyName)
{
if (invertedIndex is null) throw new JsonException("Serialized index is missing invertedIndex.");
if (fieldVectors is null) throw new JsonException("Serialized index is missing fieldVectors.");
if (pipeline is null) throw new JsonException("Serialized index is missing a pipeline.");
if (fields is null) throw new JsonException("Serialized index is missing a list of fields.");

return new Index(invertedIndex, fieldVectors, tokenSetBuilder.Root, fields, pipeline);
}

if (reader.TokenType == JsonTokenType.PropertyName)
{
string propertyName = reader.GetString();
switch(propertyName)
{
case "version":
string version = reader.ReadValue<string>(options);
if (version != _versionString)
{
System.Diagnostics.Debug.Write($"Version mismatch when loading serialised index. Current version of Lunr '{_version}' does not match serialized index '{version}'");
}
break;
case "invertedIndex":
invertedIndex = reader.ReadValue<InvertedIndex>(options);
break;
case "fieldVectors":
fieldVectors = reader.ReadValue<Dictionary<string, Vector>>(options);
break;
case "pipeline":
pipeline = new Pipeline(reader.ReadValue<string[]>(options));
break;
case "fields":
fields = reader.ReadArray<string>(options);
break;
}
case "version":
string version = reader.ReadValue<string>(options);
if (version != _versionString)
{
System.Diagnostics.Debug.Write($"Version mismatch when loading serialised index. Current version of Lunr '{_version}' does not match serialized index '{version}'");
}
break;
case "invertedIndex":
invertedIndex = reader.ReadValue<InvertedIndex>(options);
break;
case "fieldVectors":
fieldVectors = reader.ReadDictionaryFromKeyValueSequence<Vector>(options);
break;
case "pipeline":
pipeline = new Pipeline(reader.ReadArray<string>(options));
break;
case "fields":
fields = reader.ReadArray<string>(options);
break;
}
}
throw new JsonException("Unexpectedly reached the end of the stream.");
if (invertedIndex is null) throw new JsonException("Serialized index is missing invertedIndex.");
if (fieldVectors is null) throw new JsonException("Serialized index is missing fieldVectors.");
if (pipeline is null) throw new JsonException("Serialized index is missing a pipeline.");
if (fields is null) throw new JsonException("Serialized index is missing a list of fields.");

return new Index(invertedIndex, fieldVectors, tokenSetBuilder.Root, fields, pipeline);
}

public override void Write(Utf8JsonWriter writer, Index value, JsonSerializerOptions options)
Expand All @@ -84,12 +75,7 @@ public override void Write(Utf8JsonWriter writer, Index value, JsonSerializerOpt
{
writer.WriteStartArray();
writer.WriteStringValue(field);
writer.WriteStartArray();
foreach (double component in vector.Save())
{
writer.WriteNumberValue(component);
}
writer.WriteEndArray();
writer.WriteValue(vector, options);
writer.WriteEndArray();
}
writer.WriteEndArray();
Expand Down
57 changes: 43 additions & 14 deletions LunrCore/Serialization/InvertedIndexEntryJsonConverter.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;

Expand All @@ -10,26 +11,48 @@ public override InvertedIndexEntry Read(ref Utf8JsonReader reader, Type typeToCo
{
if (reader.TokenType != JsonTokenType.StartObject)
{
throw new JsonException("An inverted index can only be deserialized from an array.");
throw new JsonException("An inverted index entry can only be deserialized from an object.");
}
var result = new InvertedIndexEntry();
reader.Read();
while (reader.Read())
reader.ReadOrThrow();
while (reader.AdvanceTo(JsonTokenType.PropertyName, JsonTokenType.EndObject) != JsonTokenType.EndObject)
{
if (reader.TokenType == JsonTokenType.EndObject)
string propertyName = reader.ReadValue<string>(options);
if (propertyName == "_index")
{
return result;
result.Index = reader.ReadValue<int>(options);
}
else if (reader.TokenType == JsonTokenType.PropertyName)
else
{
string propertyName = reader.GetString();
result.Add(
propertyName,
reader.ReadValue<FieldOccurrences>(options));
var occurrences = new FieldOccurrences();
reader.AdvanceTo(JsonTokenType.StartObject);
while (reader.AdvanceTo(JsonTokenType.PropertyName, JsonTokenType.EndObject) != JsonTokenType.EndObject)
{
string field = reader.ReadValue<string>(options);
var metadata = new Metadata();
reader.AdvanceTo(JsonTokenType.StartObject);
while (reader.AdvanceTo(JsonTokenType.PropertyName, JsonTokenType.EndObject) != JsonTokenType.EndObject)
{
string doc = reader.ReadValue<string>(options);
reader.AdvanceTo(JsonTokenType.StartArray);
reader.ReadOrThrow();
var data = new List<object>();
while (reader.TokenType != JsonTokenType.EndArray)
{
data.Add(JsonSerializer.Deserialize(ref reader, typeof(object), options));
}
reader.ReadOrThrow();
metadata.Add(doc, data);
}
reader.ReadOrThrow();
occurrences.Add(field, metadata);
}
reader.ReadOrThrow();
result.Add(propertyName, occurrences);
}
else throw new JsonException("Unexpected token.");
}
throw new JsonException("Unexpected end of stream.");
reader.ReadOrThrow();
return result;
}

public override void Write(Utf8JsonWriter writer, InvertedIndexEntry value, JsonSerializerOptions options)
Expand All @@ -44,9 +67,15 @@ public override void Write(Utf8JsonWriter writer, InvertedIndexEntry value, Json
{
writer.WritePropertyName(doc);
writer.WriteStartObject();
foreach((string key, object data) in metadata)
foreach((string key, IList<object> data) in metadata)
{
writer.WriteProperty(key, data, options);
writer.WritePropertyName(key);
writer.WriteStartArray();
foreach (object datum in data)
{
JsonSerializer.Serialize(writer, datum, options);
}
writer.WriteEndArray();
}
writer.WriteEndObject();
}
Expand Down
22 changes: 7 additions & 15 deletions LunrCore/Serialization/InvertedIndexJsonConverter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,15 @@ public override InvertedIndex Read(ref Utf8JsonReader reader, Type typeToConvert
throw new JsonException("An inverted index can only be deserialized from an array.");
}
var serializedVectors = new List<(string term, InvertedIndexEntry posting)>();
reader.Read();
while (reader.Read())
reader.ReadOrThrow();
while (reader.AdvanceTo(JsonTokenType.StartArray, JsonTokenType.EndArray) != JsonTokenType.EndArray)
{
if (reader.TokenType == JsonTokenType.EndArray)
{
return new InvertedIndex(serializedVectors);
}
else if (reader.TokenType == JsonTokenType.StartArray)
{
serializedVectors.Add((
reader.ReadValue<string>(options),
reader.ReadValue<InvertedIndexEntry>(options)));
reader.Read();
}
else throw new JsonException("Unexpected token.");
reader.AdvanceTo(JsonTokenType.String);
serializedVectors.Add((
reader.ReadValue<string>(options),
reader.ReadValue<InvertedIndexEntry>(options)));
}
throw new JsonException("Unexpected end of stream.");
return new InvertedIndex(serializedVectors);
}

public override void Write(Utf8JsonWriter writer, InvertedIndex value, JsonSerializerOptions options)
Expand Down
Loading

0 comments on commit f8032aa

Please sign in to comment.