From d620190743ed3020807a072be6f75d2aecabc3d9 Mon Sep 17 00:00:00 2001 From: jeklund Date: Mon, 6 Feb 2017 16:38:37 +0000 Subject: [PATCH 1/2] added group_concat_delim to examples and tests in make file --- strings_package/Makefile | 8 +- strings_package/ddl/install.sql | 3 +- .../examples/group_concat_delim.sql | 5 + strings_package/src/GroupConcatDelim.cpp | 102 ++++++++++++++++++ 4 files changed, 115 insertions(+), 3 deletions(-) create mode 100644 strings_package/examples/group_concat_delim.sql create mode 100644 strings_package/src/GroupConcatDelim.cpp diff --git a/strings_package/Makefile b/strings_package/Makefile index b2af435..414efe3 100644 --- a/strings_package/Makefile +++ b/strings_package/Makefile @@ -17,7 +17,7 @@ THIRD_PARTY_INCLUDE = $(THIRD_PARTY)/include # Add in your source files below BUILD_FILES = build/Vertica.o \ - $(addprefix build/, EditDist.o PorterStemmer.o StringTokenizerDelim.o NGrams.o StringTokenizer.o WordCount.o AnagramLib.o GroupConcat.o) + $(addprefix build/, EditDist.o PorterStemmer.o StringTokenizerDelim.o NGrams.o StringTokenizer.o WordCount.o AnagramLib.o GroupConcat.o GroupConcatDelim.o) # Define the .so name here (and update the references in ddl/install.sql and ddl/uninstall.sql) PACKAGE_LIBNAME = lib/StringsLib.so @@ -73,6 +73,8 @@ test: $(VSQL) -f examples/string_delim_tokenizer.sql $(VSQL) -f examples/word_count.sql $(VSQL) -f examples/gen_anagram.sql + $(VSQL) -f examples/group_concat.sql + $(VSQL) -f examples/group_concat_delim.sql clean: rm -rf build @@ -80,7 +82,7 @@ clean: rm -f vsim* rm -f test-data/wordlist_header.txt ############## -# Advanced upsage: use simulator to debug and test +# Advanced usage: use simulator to debug and test ############## # rule to make anagram data file with appropriate header for simulator @@ -100,6 +102,8 @@ sim_test: all simulator test-data/wordlist_header.txt $(VALGRIND) ./vsim $(PACKAGE_LIBNAME) FiveGramsFactory test-data/strings.txt $(VALGRIND) ./vsim $(PACKAGE_LIBNAME) 
WordCountFactory test-data/strings.txt $(VALGRIND) ./vsim $(PACKAGE_LIBNAME) AnagramFactory test-data/wordlist_header.txt + $(VALGRIND) ./vsim $(PACKAGE_LIBNAME) GroupConcatFactory test-data/wordlist.txt + $(VALGRIND) ./vsim $(PACKAGE_LIBNAME) GroupConcatDelimFactory test-data/wordlist.txt # build the simulator (in SIMULATOR_PATH) and simlink it here simulator: diff --git a/strings_package/ddl/install.sql b/strings_package/ddl/install.sql index 78bbf04..ad5e07e 100644 --- a/strings_package/ddl/install.sql +++ b/strings_package/ddl/install.sql @@ -13,4 +13,5 @@ CREATE TRANSFORM FUNCTION FourGrams AS LANGUAGE 'C++' NAME 'FourGrams CREATE TRANSFORM FUNCTION FiveGrams AS LANGUAGE 'C++' NAME 'FiveGramsFactory' LIBRARY StringsLib NOT FENCED; CREATE FUNCTION WordCount AS LANGUAGE 'C++' NAME 'WordCountFactory' LIBRARY StringsLib NOT FENCED; CREATE TRANSFORM FUNCTION gen_anagram AS LANGUAGE 'C++' NAME 'AnagramFactory' LIBRARY StringsLib NOT FENCED; -CREATE TRANSFORM FUNCTION group_concat AS LANGUAGE 'C++' NAME 'GroupConcatFactory' LIBRARY StringsLib NOT FENCED; \ No newline at end of file +CREATE TRANSFORM FUNCTION group_concat AS LANGUAGE 'C++' NAME 'GroupConcatFactory' LIBRARY StringsLib NOT FENCED; +CREATE TRANSFORM FUNCTION group_concat_delim AS LANGUAGE 'C++' NAME 'GroupConcatDelimFactory' LIBRARY StringsLib NOT FENCED; \ No newline at end of file diff --git a/strings_package/examples/group_concat_delim.sql b/strings_package/examples/group_concat_delim.sql new file mode 100644 index 0000000..af63e2f --- /dev/null +++ b/strings_package/examples/group_concat_delim.sql @@ -0,0 +1,5 @@ +-- get a list of nodes +select group_concat_delim(node_name, '|') over () from nodes; + +-- nodes with storage for a projection +select schema_name,projection_name,group_concat_delim(node_name, '|') over (partition by schema_name,projection_name) from (select distinct node_name,schema_name,projection_name from storage_containers) sc order by schema_name, projection_name; diff --git 
a/strings_package/src/GroupConcatDelim.cpp b/strings_package/src/GroupConcatDelim.cpp
new file mode 100644
index 0000000..e7f4547
--- /dev/null
+++ b/strings_package/src/GroupConcatDelim.cpp
@@ -0,0 +1,116 @@
+/* Copyright (c) 2005 - 2011 Vertica, an HP company -*- C++ -*- */
+/*
+ * Description: User Defined Transform Function: for each partition, output a
+ *              list as a string separated by a custom delimiter
+ *
+ * Create Date: Dec 15, 2011
+ */
+#include "Vertica.h"
+#include <sstream>
+#include <string>
+
+using namespace Vertica;
+using namespace std;
+
+#define LINE_MAX 64000
+
+/*
+ * Same as the group_concat in the same library, but with a flexible delimiter.
+ * Takes in a sequence of string values and a delimiter and produces a single
+ * output tuple with a list of values separated by the delimiter.
+ *
+ * - Only the first character of the delimiter argument is used.
+ * - A NULL or empty delimiter joins the values with nothing in between.
+ * - NULL values are skipped.
+ * - If the output string would overflow the maximum line length, stop
+ *   appending values and include a "..."
+ */
+
+class GroupConcatDelim : public TransformFunction
+{
+    // Reads column 0 (value) and column 1 (delimiter) of every row in the
+    // partition and writes one output row containing the joined list.
+    virtual void processPartition(ServerInterface &srvInterface,
+                                  PartitionReader &input_reader,
+                                  PartitionWriter &output_writer)
+    {
+        if (input_reader.getNumCols() != 2)
+            vt_report_error(0, "Function only accepts 2 argument, but %zu provided", input_reader.getNumCols());
+
+        ostringstream oss;
+        bool first = true;      // no value has been appended yet
+        bool exceeded = false;  // the "..." marker has been written; ignore the rest
+        do {
+            const VString &elem = input_reader.getStringRef(0);
+            const VString &delimiter = input_reader.getStringRef(1);
+
+            // Keep only the first character of the delimiter. A NULL or empty
+            // delimiter becomes an empty string, so streaming it writes nothing;
+            // streaming a '\0' char instead would embed a NUL byte and the
+            // C-string copy after the loop would cut the result off there.
+            const std::string delim = delimiter.isNull() ? std::string() : delimiter.str().substr(0, 1);
+
+            // If input string is NULL, then ignore it
+            if (elem.isNull())
+            {
+                continue;
+            }
+            else if (!exceeded)
+            {
+                std::string s = elem.str();
+                size_t curpos = oss.tellp();
+                // Projected length: what is buffered, the new value, plus 2 spare
+                // bytes (more than the at-most-1-byte separator needs)
+                curpos += s.length() + 2;
+                if (curpos > LINE_MAX)
+                {
+                    exceeded = true;
+                    if (first) oss << "...";
+                    else oss << delim << "...";
+                }
+                else
+                {
+                    if (!first) oss << delim;
+                    first = false;
+                    oss << s;
+                }
+            }
+        } while (input_reader.next());
+
+        VString &summary = output_writer.getStringRef(0);
+        summary.copy(oss.str().c_str());
+        output_writer.next();
+    }
+};
+
+class GroupConcatDelimFactory : public TransformFunctionFactory
+{
+    // Tell Vertica that we take in a row with 2 strings (value, delimiter), and return a row with 1 string
+    virtual void getPrototype(ServerInterface &srvInterface, ColumnTypes &argTypes, ColumnTypes &returnType)
+    {
+        argTypes.addVarchar();
+        argTypes.addVarchar();
+
+        returnType.addVarchar();
+    }
+
+    // Tell Vertica what our return string length will be: a fixed width that
+    // does not depend on the input string lengths
+    virtual void getReturnType(ServerInterface &srvInterface,
+                               const SizedColumnTypes &input_types,
+                               SizedColumnTypes &output_types)
+    {
+        // Error out if we're called with anything but 2 arguments
+        if (input_types.getColumnCount() != 2)
+            vt_report_error(0, "Function only accepts 2 argument, but %zu provided", input_types.getColumnCount());
+
+        // output can be wide. Include extra space for a last ", ..."
+        output_types.addVarchar(LINE_MAX+5, "list");
+    }
+
+    virtual TransformFunction *createTransformFunction(ServerInterface &srvInterface)
+    { return vt_createFuncObj(srvInterface.allocator, GroupConcatDelim); }
+
+};
+
+RegisterFactory(GroupConcatDelimFactory);
\ No newline at end of file
From c264b5189832c68c6c1e883597bf40de123aa3cb Mon Sep 17 00:00:00 2001
From: jeklund
Date: Mon, 6 Feb 2017 16:45:08 +0000
Subject: [PATCH 2/2] cleanup in GroupConcatDelim

---
 strings_package/src/GroupConcatDelim.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/strings_package/src/GroupConcatDelim.cpp b/strings_package/src/GroupConcatDelim.cpp
index e7f4547..d6740e0 100644
--- a/strings_package/src/GroupConcatDelim.cpp
+++ b/strings_package/src/GroupConcatDelim.cpp
@@ -104,6 +104,6 @@ class GroupConcatDelimFactory : public TransformFunctionFactory
         if (input_types.getColumnCount() != 2)
             vt_report_error(0, "Function only accepts 2 argument, but %zu provided", input_types.getColumnCount());
 
-        // output can be wide. Include extra space for a last ", ..."
+        // output can be wide. Include extra space for a last "..."
         output_types.addVarchar(LINE_MAX+5, "list");
     }