Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

redo xml parser #510

Merged
merged 1 commit into from
Nov 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions src/Paillave.Etl.XmlFile/Core/IXmlObjectReader.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
using System;
using System.IO;
using System.Threading;

namespace Paillave.Etl.XmlFile.Core;

/// <summary>
/// Contract for a reader that consumes XML content from a stream.
/// </summary>
public interface IXmlObjectReader
{
/// <summary>
/// Reads XML content from <paramref name="fileStream"/>.
/// The method returns nothing; how parsed results are handed back is left to the implementation.
/// </summary>
/// <param name="fileStream">Stream that holds the XML content to read.</param>
/// <param name="cancellationToken">Token supplied by the caller; presumably lets an implementation stop reading early (confirm per implementation).</param>
void Read(Stream fileStream, CancellationToken cancellationToken);
}
2 changes: 0 additions & 2 deletions src/Paillave.Etl.XmlFile/Core/Mapping/XmlFieldDefinition.cs
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using System.Reflection;
using System.Text;
using System.Xml;

namespace Paillave.Etl.XmlFile.Core.Mapping
Expand Down
2 changes: 0 additions & 2 deletions src/Paillave.Etl.XmlFile/Core/XmlFileDefinition.cs
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
using System;
using System.Collections.Generic;
using System.Linq.Expressions;
using System.Text;
using System.Xml;
using Paillave.Etl.XmlFile.Core.Mapping;

namespace Paillave.Etl.XmlFile.Core
Expand Down
4 changes: 2 additions & 2 deletions src/Paillave.Etl.XmlFile/Core/XmlNodeDefinition.cs
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@ private void SetFieldDefinition(XmlFieldDefinition xmlFieldDefinition)
var existingFieldDefinition = _xmlFieldDefinitions.FirstOrDefault(i => i.TargetPropertyInfo.Name == xmlFieldDefinition.TargetPropertyInfo.Name);
if (existingFieldDefinition == null)
_xmlFieldDefinitions.Add(xmlFieldDefinition);
else
if (xmlFieldDefinition.NodePath != null) existingFieldDefinition.NodePath = xmlFieldDefinition.NodePath;
else if (xmlFieldDefinition.NodePath != null)
existingFieldDefinition.NodePath = xmlFieldDefinition.NodePath;
}
// public XmlNodeDefinition<T> MapXPathToProperty<TField>(string valueXPathQuery, Expression<Func<T, TField>> memberLambda)
// {
Expand Down
22 changes: 16 additions & 6 deletions src/Paillave.Etl.XmlFile/Core/XmlNodeParsed.cs
Original file line number Diff line number Diff line change
@@ -1,18 +1,28 @@
using System;
using System.Collections.Generic;
using System.Collections.ObjectModel;

namespace Paillave.Etl.XmlFile.Core
{
public class XmlNodeParsed
{
public string SourceName { get; internal set; }
public string NodeDefinitionName { get; internal set; }
public string NodePath { get; internal set; }
public Type Type { get; internal set; }
public object Value { get; internal set; }
/// <summary>
/// Creates a parsed-node result carrying the produced value and its origin information.
/// </summary>
/// <param name="sourceName">Name of the source the node was read from.</param>
/// <param name="nodeDefinitionName">Name of the node definition that matched.</param>
/// <param name="nodePath">Path of the matched node.</param>
/// <param name="type">Type associated with the matched node definition.</param>
/// <param name="value">Object built for the node.</param>
/// <param name="correlationKeys">Correlation keys indexed by type; wrapped in a read-only view, not copied.</param>
public XmlNodeParsed(string sourceName, string nodeDefinitionName, string nodePath, Type type, object value, IDictionary<Type, Guid> correlationKeys)
{
    // Assign all simple members in one tuple assignment.
    (SourceName, NodeDefinitionName, NodePath, Type, Value) =
        (sourceName, nodeDefinitionName, nodePath, type, value);

    // ReadOnlyDictionary wraps the supplied dictionary rather than cloning it.
    CorrelationKeys = new ReadOnlyDictionary<Type, Guid>(correlationKeys);
}
public string SourceName { get; }
public string NodeDefinitionName { get; }
public string NodePath { get; }
public Type Type { get; }
public object Value { get; }
public T GetValue<T>() => (T)Value;
// public object[] ParentValues { get; internal set; }
// public T GetValue<T>(int level = 0) => (T)(level == 0 ? Value : ParentValues[level - 1]);
public HashSet<Guid> CorrelationKeys { get; set; } = new HashSet<Guid>();
public ReadOnlyDictionary<Type, Guid> CorrelationKeys { get; }
}
}
261 changes: 128 additions & 133 deletions src/Paillave.Etl.XmlFile/Core/XmlObjectReader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,162 +7,157 @@
using System.Threading;
using System.Xml;

namespace Paillave.Etl.XmlFile.Core
namespace Paillave.Etl.XmlFile.Core;
[Obsolete]
public class XmlObjectReader : IXmlObjectReader
{
public class XmlObjectReader
private class XmlReadField
{
private class XmlReadField
{
public XmlFieldDefinition Definition { get; set; }
public IXmlNodeDefinition NodeDefinition { get; set; }
public int Depth { get; set; }
public object Value { get; set; }
}
public XmlFieldDefinition Definition { get; set; }
public IXmlNodeDefinition NodeDefinition { get; set; }
public int Depth { get; set; }
public object Value { get; set; }
}

private HashSet<string> _xmlFieldsDefinitionSearch;
private HashSet<string> _xmlNodesDefinitionSearch;
private HashSet<string> _xmlFieldsDefinitionSearch;
private HashSet<string> _xmlNodesDefinitionSearch;

private readonly List<XmlReadField> _inScopeReadFields = new List<XmlReadField>();
private readonly XmlFileDefinition _xmlFileDefinition;
private readonly List<XmlReadField> _inScopeReadFields = new List<XmlReadField>();
private readonly XmlFileDefinition _xmlFileDefinition;
private readonly string _sourceName;
private readonly Action<XmlNodeParsed> _pushResult;

public XmlObjectReader(XmlFileDefinition xmlFileDefinition)
{
_xmlFileDefinition = xmlFileDefinition;
_xmlNodesDefinitionSearch = new HashSet<string>(xmlFileDefinition.XmlNodeDefinitions.Select(i => i.NodePath).Distinct());
_xmlFieldsDefinitionSearch = new HashSet<string>(xmlFileDefinition.XmlNodeDefinitions.SelectMany(nd => nd.GetXmlFieldDefinitions().Select(fd => fd.NodePath)).Distinct());
}
private bool XmlReadFieldShouldBeCleanedUp(XmlReadField xmlReadField, int depth)
{
var depthScope = xmlReadField.Definition.DepthScope;
int depthLimit;
if (depthScope > 0)
depthLimit = depthScope;
else
depthLimit = xmlReadField.Depth + depthScope;
return depth < depthLimit;
}
private void ProcessEndOfAnyNode(Stack<NodeLevel> nodes)
{
foreach (var item in _inScopeReadFields.Where(i => XmlReadFieldShouldBeCleanedUp(i, nodes.Count - 1)).ToList())
_inScopeReadFields.Remove(item);
}
private void ProcessAttributeValue(string key, Stack<NodeLevel> nodes, string stringContent)
/// <summary>
/// Creates a reader bound to a file definition, a source name and a result callback.
/// </summary>
/// <param name="xmlFileDefinition">Definition whose node and field paths are indexed for lookup.</param>
/// <param name="sourceName">Source name stored for use while reading.</param>
/// <param name="pushResult">Callback stored for delivering each parsed node.</param>
public XmlObjectReader(XmlFileDefinition xmlFileDefinition, string sourceName, Action<XmlNodeParsed> pushResult)
{
    _xmlFileDefinition = xmlFileDefinition;
    _sourceName = sourceName;
    _pushResult = pushResult;

    // Paths of every node definition; the set itself removes duplicates.
    var nodePaths = xmlFileDefinition.XmlNodeDefinitions
        .Select(nodeDefinition => nodeDefinition.NodePath);
    _xmlNodesDefinitionSearch = new HashSet<string>(nodePaths);

    // Paths of every field definition across all node definitions.
    var fieldPaths = xmlFileDefinition.XmlNodeDefinitions
        .SelectMany(nodeDefinition => nodeDefinition.GetXmlFieldDefinitions())
        .Select(fieldDefinition => fieldDefinition.NodePath);
    _xmlFieldsDefinitionSearch = new HashSet<string>(fieldPaths);
}
/// <summary>
/// Tells whether a previously read field is out of scope at the given depth.
/// </summary>
/// <param name="xmlReadField">Field captured earlier, with the depth at which it was read.</param>
/// <param name="depth">Depth to test against.</param>
/// <returns><c>true</c> when <paramref name="depth"/> is below the field's depth limit.</returns>
private bool XmlReadFieldShouldBeCleanedUp(XmlReadField xmlReadField, int depth)
{
    var scope = xmlReadField.Definition.DepthScope;
    // A positive scope is used directly as the limit; otherwise the scope
    // (zero or negative) is added to the depth at which the field was read.
    int threshold = scope > 0 ? scope : xmlReadField.Depth + scope;
    return depth < threshold;
}
/// <summary>
/// Removes every in-scope read field that is out of scope at the current depth.
/// </summary>
/// <param name="nodes">Stack of currently open nodes; its count minus one is used as the current depth.</param>
private void ProcessEndOfAnyNode(Stack<NodeLevel> nodes)
{
    // The stack is not modified while purging, so the depth can be computed once.
    int currentDepth = nodes.Count - 1;

    // RemoveAll filters the list in place in one pass, instead of building a
    // temporary copy and doing a linear Remove for each expired entry.
    _inScopeReadFields.RemoveAll(field => XmlReadFieldShouldBeCleanedUp(field, currentDepth));
}
private void ProcessAttributeValue(string key, Stack<NodeLevel> nodes, string stringContent)
{
// string key = $"/{string.Join("/", nodes.Reverse())}";
if (!_xmlFieldsDefinitionSearch.Contains(key)) return;
var fds = _xmlFileDefinition.XmlNodeDefinitions.SelectMany(nd => nd.GetXmlFieldDefinitions().Select(fd => new { Fd = fd, Nd = nd })).Where(i => i.Fd.NodePath == key).ToList();
if (string.IsNullOrWhiteSpace(stringContent))
{
// string key = $"/{string.Join("/", nodes.Reverse())}";
if (!_xmlFieldsDefinitionSearch.Contains(key)) return;
var fds = _xmlFileDefinition.XmlNodeDefinitions.SelectMany(nd => nd.GetXmlFieldDefinitions().Select(fd => new { Fd = fd, Nd = nd })).Where(i => i.Fd.NodePath == key).ToList();
if (string.IsNullOrWhiteSpace(stringContent))
foreach (var fd in fds)
{
foreach (var fd in fds)
_inScopeReadFields.Add(new XmlReadField
{
_inScopeReadFields.Add(new XmlReadField
{
Depth = nodes.Count - 1,
Definition = fd.Fd,
NodeDefinition = fd.Nd,
Value = null
});
}
}
else
{
foreach (var fd in fds)
{
_inScopeReadFields.Add(new XmlReadField
{
Depth = nodes.Count - 1,
Definition = fd.Fd,
NodeDefinition = fd.Nd,
Value = fd.Fd.Convert(stringContent)
});
}
Depth = nodes.Count - 1,
Definition = fd.Fd,
NodeDefinition = fd.Nd,
Value = null
});
}
}
private string ComputeKey(Stack<NodeLevel> nodes) => $"/{string.Join("/", nodes.Select(i => i.Name).Reverse())}";
private void ProcessEndOfNode(Stack<NodeLevel> nodes, string text, Action<XmlNodeParsed> pushResult, string sourceName)
else
{
string key = ComputeKey(nodes);
if (_xmlFieldsDefinitionSearch.Contains(key))
foreach (var fd in fds)
{
ProcessAttributeValue(key, nodes, text);
}
else if (_xmlNodesDefinitionSearch.Contains(key))
{
var (value, nd) = CreateValue(sourceName, key);
pushResult(new XmlNodeParsed
_inScopeReadFields.Add(new XmlReadField
{
NodeDefinitionName = nd.Name,
SourceName = sourceName,
NodePath = nd.NodePath,
Type = nd.Type,
Value = value,
CorrelationKeys = nodes.Select(i => i.Guid).Where(i => i.HasValue).Select(i => i.Value).ToHashSet()
Depth = nodes.Count - 1,
Definition = fd.Fd,
NodeDefinition = fd.Nd,
Value = fd.Fd.Convert(stringContent)
});
}
ProcessEndOfAnyNode(nodes);
}

private (object value, IXmlNodeDefinition nd) CreateValue(string sourceName, string key)
}
/// <summary>
/// Builds the slash-separated path of the open nodes, outermost first, with a leading slash.
/// </summary>
/// <param name="nodes">Stack of currently open nodes (innermost on top).</param>
/// <returns>The path string, e.g. "/" followed by the node names joined with "/".</returns>
private string ComputeKey(Stack<NodeLevel> nodes)
{
    // A stack enumerates from the top, so reverse to get root-to-leaf order.
    var namesFromRoot = nodes.Select(level => level.Name).Reverse();
    return "/" + string.Join("/", namesFromRoot);
}
private void ProcessEndOfNode(Stack<NodeLevel> nodes, string text, Action<XmlNodeParsed> pushResult, string sourceName)
{
string key = ComputeKey(nodes);
if (_xmlFieldsDefinitionSearch.Contains(key))
{
var nd = _xmlFileDefinition.XmlNodeDefinitions.FirstOrDefault(i => i.NodePath == key);
var objectBuilder = new ObjectBuilder(nd.Type);
foreach (var inScopeReadField in _inScopeReadFields.Where(rf => rf.NodeDefinition.NodePath == key))
objectBuilder.Values[inScopeReadField.Definition.TargetPropertyInfo.Name] = inScopeReadField.Value;
foreach (var propName in nd.GetXmlFieldDefinitions().Where(i => i.ForRowGuid).Select(i => i.TargetPropertyInfo.Name).ToList())
objectBuilder.Values[propName] = Guid.NewGuid();
foreach (var propName in nd.GetXmlFieldDefinitions().Where(i => i.ForSourceName).Select(i => i.TargetPropertyInfo.Name).ToList())
objectBuilder.Values[propName] = sourceName;
return (objectBuilder.CreateInstance(), nd);
ProcessAttributeValue(key, nodes, text);
}

public void Read(Stream fileStream, string sourceName, Action<XmlNodeParsed> pushResult, CancellationToken cancellationToken)
else if (_xmlNodesDefinitionSearch.Contains(key))
{
XmlReaderSettings xrs = new XmlReaderSettings();
foreach (var item in _xmlFileDefinition.PrefixToUriNameSpacesDictionary)
xrs.Schemas.Add(item.Key, item.Value);
xrs.IgnoreWhitespace = true;
xrs.IgnoreComments = true;
xrs.IgnoreProcessingInstructions = true;
var (value, nd) = CreateValue(sourceName, key);
pushResult(new XmlNodeParsed(sourceName, nd.Name, nd.NodePath, nd.Type, value, new Dictionary<Type, Guid>()));
}
ProcessEndOfAnyNode(nodes);
}

/// <summary>
/// Builds the object for the node definition found at <paramref name="key"/> from the fields currently in scope.
/// </summary>
/// <param name="sourceName">Value written to every property flagged as holding the source name.</param>
/// <param name="key">Node path used to look up the node definition and to select the in-scope fields.</param>
/// <returns>The created instance together with the node definition it was built from.</returns>
private (object value, IXmlNodeDefinition nd) CreateValue(string sourceName, string key)
{
    var nodeDefinition = _xmlFileDefinition.XmlNodeDefinitions.FirstOrDefault(definition => definition.NodePath == key);
    var builder = new ObjectBuilder(nodeDefinition.Type);

    // Copy the values of the in-scope fields that belong to this node definition.
    foreach (var readField in _inScopeReadFields)
    {
        if (readField.NodeDefinition.NodePath != key)
        {
            continue;
        }
        builder.Values[readField.Definition.TargetPropertyInfo.Name] = readField.Value;
    }

    // Row-guid properties each receive a freshly generated identifier.
    var rowGuidProperties = nodeDefinition.GetXmlFieldDefinitions()
        .Where(field => field.ForRowGuid)
        .Select(field => field.TargetPropertyInfo.Name)
        .ToList();
    foreach (var propertyName in rowGuidProperties)
    {
        builder.Values[propertyName] = Guid.NewGuid();
    }

    // Source-name properties are filled last, as in the original ordering.
    var sourceNameProperties = nodeDefinition.GetXmlFieldDefinitions()
        .Where(field => field.ForSourceName)
        .Select(field => field.TargetPropertyInfo.Name)
        .ToList();
    foreach (var propertyName in sourceNameProperties)
    {
        builder.Values[propertyName] = sourceName;
    }

    return (builder.CreateInstance(), nodeDefinition);
}

var xmlReader = XmlReader.Create(fileStream, xrs);
Stack<NodeLevel> nodes = new Stack<NodeLevel>();
string lastTextValue = null;
while (xmlReader.Read())
public void Read(Stream fileStream, CancellationToken cancellationToken)
{
XmlReaderSettings xrs = new XmlReaderSettings();
foreach (var item in _xmlFileDefinition.PrefixToUriNameSpacesDictionary)
xrs.Schemas.Add(item.Key, item.Value);
xrs.IgnoreWhitespace = true;
xrs.IgnoreComments = true;
xrs.IgnoreProcessingInstructions = true;

var xmlReader = XmlReader.Create(fileStream, xrs);
Stack<NodeLevel> nodes = new Stack<NodeLevel>();
string lastTextValue = null;
while (xmlReader.Read())
{
if (cancellationToken.IsCancellationRequested) break;
switch (xmlReader.NodeType)
{
if (cancellationToken.IsCancellationRequested) break;
switch (xmlReader.NodeType)
{
case XmlNodeType.Element:
bool isEmptyElement = xmlReader.IsEmptyElement;
lastTextValue = null;
nodes.Push(new NodeLevel { Name = xmlReader.Name, Guid = Guid.NewGuid() });
while (xmlReader.MoveToNextAttribute())
{
nodes.Push(new NodeLevel { Name = $"@{xmlReader.Name}", Guid = null });
ProcessAttributeValue(ComputeKey(nodes), nodes, xmlReader.Value);
nodes.Pop();
}
if (isEmptyElement)
{
ProcessEndOfNode(nodes, null, pushResult, sourceName);
nodes.Pop();
}
break;
case XmlNodeType.EndElement:
ProcessEndOfNode(nodes, lastTextValue, pushResult, sourceName);
lastTextValue = null;
case XmlNodeType.Element:
bool isEmptyElement = xmlReader.IsEmptyElement;
lastTextValue = null;
nodes.Push(new NodeLevel { Name = xmlReader.Name, Guid = Guid.NewGuid() });
while (xmlReader.MoveToNextAttribute())
{
nodes.Push(new NodeLevel { Name = $"@{xmlReader.Name}", Guid = null });
ProcessAttributeValue(ComputeKey(nodes), nodes, xmlReader.Value);
nodes.Pop();
break;
case XmlNodeType.Text:
lastTextValue = xmlReader.Value;
break;
}
}
if (isEmptyElement)
{
ProcessEndOfNode(nodes, null, _pushResult, _sourceName);
nodes.Pop();
}
break;
case XmlNodeType.EndElement:
ProcessEndOfNode(nodes, lastTextValue, _pushResult, _sourceName);
lastTextValue = null;
nodes.Pop();
break;
case XmlNodeType.Text:
lastTextValue = xmlReader.Value;
break;
}
}
private struct NodeLevel
{
public string Name { get; set; }
public Guid? Guid { get; set; }
}
}
// One entry of the stack of open nodes kept while reading the XML.
private struct NodeLevel
{
// Name of the level: the element name, or "@" followed by the attribute name for an attribute entry.
public string Name { get; set; }
// Identifier of the level: a new Guid for element entries, null for attribute entries.
public Guid? Guid { get; set; }
}
}
Loading
Loading