Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add CDATA escaping to exml #73

Merged
merged 5 commits into from
Dec 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 29 additions & 8 deletions c_src/exml.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ namespace {
ERL_NIF_TERM atom_xmlstreamstart;
ERL_NIF_TERM atom_xmlstreamend;
ERL_NIF_TERM atom_pretty;
ERL_NIF_TERM atom_escaped;
ERL_NIF_TERM atom_cdata;
ERL_NIF_TERM atom_true;
constexpr const unsigned char EMPTY[1] = {0};

Expand Down Expand Up @@ -154,8 +156,9 @@ ERL_NIF_TERM make_attr_tuple(ParseCtx &ctx,

ERL_NIF_TERM get_xmlcdata(ParseCtx &ctx,
rapidxml::xml_node<unsigned char> *node) {
return enif_make_tuple2(ctx.env, atom_xmlcdata,
to_subbinary(ctx, node->value(), node->value_size()));
return enif_make_tuple3(ctx.env, atom_xmlcdata,
to_subbinary(ctx, node->value(), node->value_size()),
atom_escaped);
}

ERL_NIF_TERM merge_data_nodes(ParseCtx &ctx,
Expand All @@ -170,7 +173,7 @@ ERL_NIF_TERM merge_data_nodes(ParseCtx &ctx,
node = node->next_sibling();
}

return enif_make_tuple2(ctx.env, atom_xmlcdata, bin);
return enif_make_tuple3(ctx.env, atom_xmlcdata, bin, atom_escaped);
}

void append_pending_data_nodes(ParseCtx &ctx,
Expand Down Expand Up @@ -299,7 +302,15 @@ bool build_cdata(ErlNifEnv *env, xml_document &doc, const ERL_NIF_TERM elem[],
if (!enif_inspect_iolist_as_binary(env, elem[1], &bin))
return false;

auto child = doc.impl.allocate_node(rapidxml::node_data);
rapidxml::node_type cdata_type;
if (enif_compare(atom_escaped, elem[2]) == 0)
cdata_type = rapidxml::node_data;
else if (enif_compare(atom_cdata, elem[2]) == 0)
cdata_type = rapidxml::node_cdata;
else
return false;

auto child = doc.impl.allocate_node(cdata_type);
child->value(bin.size > 0 ? bin.data : EMPTY, bin.size);
node.append_node(child);
return true;
Expand Down Expand Up @@ -336,7 +347,7 @@ bool build_attrs(ErlNifEnv *env, xml_document &doc, ERL_NIF_TERM attrs,
bool build_el(ErlNifEnv *env, xml_document &doc, const ERL_NIF_TERM elem[],
rapidxml::xml_node<unsigned char> &node) {
ErlNifBinary name;
if (!enif_inspect_iolist_as_binary(env, elem[1], &name))
if (!enif_inspect_binary(env, elem[1], &name))
return false;

auto child = doc.impl.allocate_node(rapidxml::node_element);
Expand All @@ -358,7 +369,7 @@ bool build_child(ErlNifEnv *env, xml_document &doc, ERL_NIF_TERM child,
if (!enif_get_tuple(env, child, &arity, &tuple))
return false;

if (arity == 2 && enif_compare(atom_xmlcdata, tuple[0]) == 0) {
if (arity == 3 && enif_compare(atom_xmlcdata, tuple[0]) == 0) {
if (!build_cdata(env, doc, tuple, node))
return false;
} else if (arity == 4 && enif_compare(atom_xmlel, tuple[0]) == 0) {
Expand Down Expand Up @@ -443,6 +454,8 @@ static int load(ErlNifEnv *env, void **priv_data, ERL_NIF_TERM load_info) {
atom_xmlstreamstart = enif_make_atom(env, "xmlstreamstart");
atom_xmlstreamend = enif_make_atom(env, "xmlstreamend");
atom_pretty = enif_make_atom(env, "pretty");
atom_escaped = enif_make_atom(env, "escaped");
atom_cdata = enif_make_atom(env, "cdata");
atom_true = enif_make_atom(env, "true");

get_static_doc().impl.set_allocator(enif_alloc, enif_free);
Expand Down Expand Up @@ -608,7 +621,15 @@ static ERL_NIF_TERM escape_cdata(ErlNifEnv *env, int argc,
if (!enif_inspect_iolist_as_binary(env, argv[0], &bin))
return enif_make_badarg(env);

rapidxml::xml_node<unsigned char> node(rapidxml::node_data);
rapidxml::node_type cdata_type;
if (enif_compare(atom_escaped, argv[1]) == 0)
cdata_type = rapidxml::node_data;
else if (enif_compare(atom_cdata, argv[1]) == 0)
cdata_type = rapidxml::node_cdata;
else
return enif_make_badarg(env);

rapidxml::xml_node<unsigned char> node(cdata_type);
node.value(bin.data, bin.size);
return node_to_binary(env, node, rapidxml::print_no_indenting);
}
Expand Down Expand Up @@ -647,7 +668,7 @@ static ERL_NIF_TERM reset_parser(ErlNifEnv *env, int argc,

static ErlNifFunc nif_funcs[] = {
{"create", 2, create}, {"parse", 1, parse},
{"parse_next", 2, parse_next}, {"escape_cdata", 1, escape_cdata},
{"parse_next", 2, parse_next}, {"escape_cdata", 2, escape_cdata},
{"to_binary", 2, to_binary}, {"reset_parser", 1, reset_parser}};
}

Expand Down
20 changes: 10 additions & 10 deletions c_src/rapidxml_print.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ namespace rapidxml
{
assert(node->type() == node_data);
if (!(flags & print_no_indenting))
out = fill_chars(out, indent, Ch('\t'));
out = fill_chars(out, indent, Ch(' '));
out = copy_and_expand_chars(node->value(), node->value() + node->value_size(), Ch(0), out);
return out;
}
Expand All @@ -162,7 +162,7 @@ namespace rapidxml
{
assert(node->type() == node_cdata);
if (!(flags & print_no_indenting))
out = fill_chars(out, indent, Ch('\t'));
out = fill_chars(out, indent, Ch(' '));
*out = Ch('<'); ++out;
*out = Ch('!'); ++out;
*out = Ch('['); ++out;
Expand All @@ -187,7 +187,7 @@ namespace rapidxml

// Print element name and attributes, if any
if (!(flags & print_no_indenting))
out = fill_chars(out, indent, Ch('\t'));
out = fill_chars(out, indent, Ch(' '));
*out = Ch('<'), ++out;
out = copy_chars(node->name(), node->name() + node->name_size(), out);
out = print_attributes(out, node, flags);
Expand Down Expand Up @@ -221,9 +221,9 @@ namespace rapidxml
// Print all children with full indenting
if (!(flags & print_no_indenting))
*out = Ch('\n'), ++out;
out = print_children(out, node, flags, indent + 1);
out = print_children(out, node, flags, indent + 2);
if (!(flags & print_no_indenting))
out = fill_chars(out, indent, Ch('\t'));
out = fill_chars(out, indent, Ch(' '));
}

// Print node end
Expand All @@ -241,7 +241,7 @@ namespace rapidxml
{
// Print declaration start
if (!(flags & print_no_indenting))
out = fill_chars(out, indent, Ch('\t'));
out = fill_chars(out, indent, Ch(' '));
*out = Ch('<'), ++out;
*out = Ch('?'), ++out;
*out = Ch('x'), ++out;
Expand All @@ -264,7 +264,7 @@ namespace rapidxml
{
assert(node->type() == node_comment);
if (!(flags & print_no_indenting))
out = fill_chars(out, indent, Ch('\t'));
out = fill_chars(out, indent, Ch(' '));
*out = Ch('<'), ++out;
*out = Ch('!'), ++out;
*out = Ch('-'), ++out;
Expand All @@ -282,7 +282,7 @@ namespace rapidxml
{
assert(node->type() == node_doctype);
if (!(flags & print_no_indenting))
out = fill_chars(out, indent, Ch('\t'));
out = fill_chars(out, indent, Ch(' '));
*out = Ch('<'), ++out;
*out = Ch('!'), ++out;
*out = Ch('D'), ++out;
Expand All @@ -304,7 +304,7 @@ namespace rapidxml
{
assert(node->type() == node_pi);
if (!(flags & print_no_indenting))
out = fill_chars(out, indent, Ch('\t'));
out = fill_chars(out, indent, Ch(' '));
*out = Ch('<'), ++out;
*out = Ch('?'), ++out;
out = copy_chars(node->name(), node->name() + node->name_size(), out);
Expand All @@ -321,7 +321,7 @@ namespace rapidxml
{
assert(node->type() == node_literal);
if (!(flags & print_no_indenting))
out = fill_chars(out, indent, Ch('\t'));
out = fill_chars(out, indent, Ch(' '));
out = copy_chars(node->value(), node->value() + node->value_size(), out);
return out;
}
Expand Down
3 changes: 2 additions & 1 deletion include/exml.hrl
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
-ifndef(EXML_HEADER).
-define(EXML_HEADER, true).

-record(xmlcdata, {content = [] :: iodata()}).
-record(xmlcdata, {content = [] :: iodata(),
style = escaped :: escaped | cdata}).

-record(xmlel, {name :: binary(),
attrs = [] :: [exml:attr()],
Expand Down
6 changes: 4 additions & 2 deletions rebar.config
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,9 @@
{doc, #{provider => ex_doc}}
]}.
{ex_doc, [
{extras, [<<"README.md">>, <<"LICENSE">>]},
{source_url, <<"https://github.com/esl/exml">>},
{main, <<"readme">>},
{source_url, <<"https://github.com/esl/exml">>}
{extras, [{'README.md', #{title => <<"README">>}},
{'LICENSE', #{title => <<"License">>}}
]}
]}.
20 changes: 14 additions & 6 deletions src/exml.erl
Original file line number Diff line number Diff line change
Expand Up @@ -27,18 +27,26 @@

-type attr() :: {binary(), binary()}.
-type cdata() :: #xmlcdata{}.
%% CDATA record. When printed, its content is escaped character-wise by default.
%%
%% Escaping rules:
%% <ul>
%% <li>`escaped': escapes special characters using regular `&' entity escaping.</li>
%% <li>`cdata': wraps the entire string into a `<![CDATA[]]>' section.</li>
%% </ul>
-type element() :: #xmlel{}.
-type item() :: element() | attr() | cdata() | exml_stream:start() | exml_stream:stop().
-type prettify() :: pretty | not_pretty.
%% Printing indentation rule, see `to_iolist/2'.

%% @doc Calculate the length of the original XML payload
-spec xml_size(item() | [item()]) -> non_neg_integer().
xml_size([]) ->
0;
xml_size([Elem | Rest]) ->
xml_size(Elem) + xml_size(Rest);
xml_size(#xmlcdata{ content = Content }) ->
iolist_size(exml_nif:escape_cdata(Content));
xml_size(#xmlcdata{content = Content, style = Style}) ->
iolist_size(exml_nif:escape_cdata(Content, Style));
xml_size(#xmlel{ name = Name, attrs = Attrs, children = [] }) ->
3 % Self-closing: </>
+ byte_size(Name) + xml_size(Attrs);
Expand All @@ -56,7 +64,7 @@ xml_size({Key, Value}) when is_binary(Key) ->
+ 4 % ="" and whitespace before
+ byte_size(Value).

%% @doc Sort in ascending order a list of xml `t:item()'.
%% @doc Sort in ascending order a list of xml `t:item/0'.
%%
%% Sorting is defined as calling `lists:sort/1' at:
%% <ul>
Expand Down Expand Up @@ -109,7 +117,7 @@ to_iolist(Element) ->
to_pretty_iolist(Element) ->
to_iolist(Element, pretty).

%% @doc Parses a binary or a list of binaries into an XML `t:element()'.
%% @doc Parses a binary or a list of binaries into an XML `t:element/0'.
-spec parse(binary() | [binary()]) -> {ok, element()} | {error, any()}.
parse(XML) ->
exml_nif:parse(XML).
Expand All @@ -129,8 +137,8 @@ to_iolist(#xmlstreamstart{name = Name, attrs = Attrs}, _Pretty) ->
[Front, $>];
to_iolist(#xmlstreamend{name = Name}, _Pretty) ->
[<<"</">>, Name, <<">">>];
to_iolist(#xmlcdata{content = Content}, _Pretty) ->
exml_nif:escape_cdata(Content);
to_iolist(#xmlcdata{content = Content, style = Style}, _Pretty) ->
exml_nif:escape_cdata(Content, Style);
to_iolist([Element], Pretty) ->
to_iolist(Element, Pretty);
to_iolist([#xmlstreamstart{name = Name, attrs = Attrs} | Tail] = Elements, Pretty) ->
Expand Down
16 changes: 7 additions & 9 deletions src/exml_nif.erl
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,12 @@

-module(exml_nif).

-nifs([create/2, escape_cdata/1, to_binary/2, parse/1, parse_next/2, reset_parser/1]).
-nifs([create/2, escape_cdata/2, to_binary/2, parse/1, parse_next/2, reset_parser/1]).

-type parser() :: term().
-type stream_element() :: exml:element() | exml_stream:start() | exml_stream:stop().

-export([create/2, parse/1, parse_next/2, escape_cdata/1,
-export([create/2, parse/1, parse_next/2, escape_cdata/2,
to_binary/2, reset_parser/1]).
-export_type([parser/0, stream_element/0]).

-on_load(load/0).

Expand Down Expand Up @@ -40,12 +38,12 @@ load() ->
erlang:load_nif(filename:join(PrivDir, ?MODULE_STRING), none).

-spec create(MaxChildSize :: non_neg_integer(), InfiniteStream :: boolean()) ->
{ok, parser()} | {error, Reason :: any()}.
{ok, parser()} | {error, Reason :: any()}.
create(_, _) ->
erlang:nif_error(not_loaded).

-spec escape_cdata(Bin :: iodata()) -> binary().
escape_cdata(_Bin) ->
%% Placeholder for the native `escape_cdata/2'; it is replaced when the NIF
%% library is loaded and raises `not_loaded' otherwise.
%%
%% `Bin' is the text to render. `Style' selects the rendering and takes the
%% same values as the `style' field of `#xmlcdata{}': `escaped' or `cdata'.
%% The native implementation rejects any other value with `badarg', so the
%% spec lists the two accepted atoms instead of a bare `atom()'.
-spec escape_cdata(Bin :: iodata(), Style :: escaped | cdata) -> binary().
escape_cdata(_Bin, _Style) ->
    erlang:nif_error(not_loaded).

-spec to_binary(Elem :: exml:element(), pretty | not_pretty) -> binary().
Expand All @@ -57,8 +55,8 @@ parse(_) ->
erlang:nif_error(not_loaded).

-spec parse_next(parser(), Data :: binary() | [binary()]) ->
{ok, stream_element() | undefined, non_neg_integer()} |
{error, Reason :: any()}.
{ok, exml_stream:element() | undefined, non_neg_integer()} |
{error, Reason :: any()}.
parse_next(_, _) ->
erlang:nif_error(not_loaded).

Expand Down
16 changes: 9 additions & 7 deletions src/exml_stream.erl
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
parser_opt/0]).

-record(parser, {
event_parser :: exml_nif:parser(),
event_parser :: term(),
buffer :: [binary()]
}).

Expand All @@ -30,15 +30,15 @@
-type stop() :: #xmlstreamend{}.
%% `#xmlstreamend{}' record.
-type parser() :: #parser{}.
%% `#parser{}' record.
-type element() :: exml_nif:stream_element().
%% One of `t:start()', `t:stop()' or `t:exml:element()'.
%% `#parser{}' record. Keeps track of unparsed buffers.
-type element() :: exml:element() | exml_stream:start() | exml_stream:stop().
%% One of `t:exml:element/0', `t:start/0', or `t:stop/0'.

-type parser_opt() :: {infinite_stream, boolean()} | {max_element_size, non_neg_integer()}.
%% Parser options
%%
%% <ul>
%% <li>`infinite_stream': No distinct `t:start()' or `t:stop()', only `#xmlel{}' will be returned.</li>
%% <li>`infinite_stream': No distinct `t:start/0' or `t:stop/0', only `#xmlel{}' will be returned.</li>
%% <li>`max_element_size': Specifies maximum byte size of any parsed XML element.
%% The only exception is the "stream start" element,
%% for which only the size of the opening tag is limited.</li>
Expand All @@ -53,7 +53,7 @@
new_parser() ->
new_parser([]).

%% @doc Creates a new parser
%% @doc Creates a new parser. See `t:parser_opt/0' for configuration.
-spec new_parser([parser_opt()]) -> {ok, parser()} | {error, any()}.
new_parser(Opts)->
MaxElementSize = proplists:get_value(max_element_size, Opts, 0),
Expand All @@ -65,7 +65,9 @@ new_parser(Opts)->
Error
end.

%% @doc Makes a parser parse input
%% @doc Makes a parser parse input.
%%
%% If successful, returns parsed elements and a new parser with updated buffers.
-spec parse(parser(), binary()) ->
{ok, parser(), [exml_stream:element()]} | {error, Reason :: any()}.
parse(Parser, Input) when is_binary(Input) ->
Expand Down
6 changes: 6 additions & 0 deletions test/exml_tests.erl
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,12 @@ size_of_escaped_characters_test() ->
Raw = <<"<a>&amp;</a>">>,
?assertEqual(iolist_size(Raw), exml:xml_size(parse(Raw))).

%% An element whose only child is a `cdata'-styled text node must report the
%% same size as its serialized form, including the `<![CDATA[' ... `]]>' wrapper.
cdata_size_of_escaped_characters_test() ->
    ExpectedSize = iolist_size(<<"<a><![CDATA[some stuff]]></a>">>),
    Element = #xmlel{name = <<"a">>,
                     children = [#xmlcdata{content = <<"some stuff">>,
                                           style = cdata}]},
    ?assertEqual(ExpectedSize, exml:xml_size(Element)).

size_of_exml_with_cdata_test() ->
Raw = <<"<a><![CDATA[ Within this Character Data block I can
use double dashes as much as I want (along with <, &, ', and \")]]></a>">>,
Expand Down