Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GroupConcat with flexible delimiter #49

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions strings_package/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ THIRD_PARTY_INCLUDE = $(THIRD_PARTY)/include

# Add in your source files below
BUILD_FILES = build/Vertica.o \
$(addprefix build/, EditDist.o PorterStemmer.o StringTokenizerDelim.o NGrams.o StringTokenizer.o WordCount.o AnagramLib.o GroupConcat.o)
$(addprefix build/, EditDist.o PorterStemmer.o StringTokenizerDelim.o NGrams.o StringTokenizer.o WordCount.o AnagramLib.o GroupConcat.o GroupConcatDelim.o)

# Define the .so name here (and update the references in ddl/install.sql and ddl/uninstall.sql)
PACKAGE_LIBNAME = lib/StringsLib.so
Expand Down Expand Up @@ -73,14 +73,16 @@ test:
$(VSQL) -f examples/string_delim_tokenizer.sql
$(VSQL) -f examples/word_count.sql
$(VSQL) -f examples/gen_anagram.sql
$(VSQL) -f examples/group_concat.sql
$(VSQL) -f examples/group_concat_delim.sql

clean:
rm -rf build
rm -rf lib
rm -f vsim*
rm -f test-data/wordlist_header.txt
##############
# Advanced upsage: use simulator to debug and test
# Advanced usage: use simulator to debug and test
##############

# rule to make anagram data file with appropriate header for simulator
Expand All @@ -100,6 +102,8 @@ sim_test: all simulator test-data/wordlist_header.txt
$(VALGRIND) ./vsim $(PACKAGE_LIBNAME) FiveGramsFactory test-data/strings.txt
$(VALGRIND) ./vsim $(PACKAGE_LIBNAME) WordCountFactory test-data/strings.txt
$(VALGRIND) ./vsim $(PACKAGE_LIBNAME) AnagramFactory test-data/wordlist_header.txt
$(VALGRIND) ./vsim $(PACKAGE_LIBNAME) GroupConcatFactory test-data/wordlist.txt
$(VALGRIND) ./vsim $(PACKAGE_LIBNAME) GroupConcatDelimFactory test-data/wordlist.txt

# build the simulator (in SIMULATOR_PATH) and symlink it here
simulator:
Expand Down
3 changes: 2 additions & 1 deletion strings_package/ddl/install.sql
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@ CREATE TRANSFORM FUNCTION FourGrams AS LANGUAGE 'C++' NAME 'FourGrams
CREATE TRANSFORM FUNCTION FiveGrams AS LANGUAGE 'C++' NAME 'FiveGramsFactory' LIBRARY StringsLib NOT FENCED;
CREATE FUNCTION WordCount AS LANGUAGE 'C++' NAME 'WordCountFactory' LIBRARY StringsLib NOT FENCED;
CREATE TRANSFORM FUNCTION gen_anagram AS LANGUAGE 'C++' NAME 'AnagramFactory' LIBRARY StringsLib NOT FENCED;
CREATE TRANSFORM FUNCTION group_concat AS LANGUAGE 'C++' NAME 'GroupConcatFactory' LIBRARY StringsLib NOT FENCED;
CREATE TRANSFORM FUNCTION group_concat AS LANGUAGE 'C++' NAME 'GroupConcatFactory' LIBRARY StringsLib NOT FENCED;
CREATE TRANSFORM FUNCTION group_concat_delim AS LANGUAGE 'C++' NAME 'GroupConcatDelimFactory' LIBRARY StringsLib NOT FENCED;
5 changes: 5 additions & 0 deletions strings_package/examples/group_concat_delim.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
-- get a list of nodes
select group_concat_delim(node_name, '|') over () from nodes;

-- nodes with storage for a projection
select schema_name,projection_name,group_concat_delim(node_name, '|') over (partition by schema_name,projection_name) from (select distinct node_name,schema_name,projection_name from storage_containers) sc order by schema_name, projection_name;
102 changes: 102 additions & 0 deletions strings_package/src/GroupConcatDelim.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
/* Copyright (c) 2005 - 2011 Vertica, an HP company -*- C++ -*- */
/*
* Description: User Defined Transform Function: for each partition, output a
* list as a string separated by a custom delimiter
*
* Create Date: Dec 15, 2011
*/
#include "Vertica.h"
#include <sstream>
#include <string>

using namespace Vertica;
using namespace std;

// Cap on the length of the concatenated list. processPartition stops appending
// elements once the running length would exceed this value, and getReturnType
// declares the output column as LINE_MAX+5 to leave room for the trailing "...".
// NOTE(review): POSIX <limits.h> may also define LINE_MAX; confirm there is no
// macro-redefinition clash on the target toolchain.
#define LINE_MAX 64000

/*
* Same as the group_concat in the same library, but with a flexible delimiter.
* Takes in a sequence of string values and a delimiter character and produces a single output tuple with
* a list of values separated by the delimiter. If the output string would overflow the
* maximum line length, stop appending values and include a "..."
*/

/*
 * Transform function that folds one partition into a single output row.
 *
 * Input columns:
 *   0 - the string value to append to the list
 *   1 - the delimiter; only its first character is used as the separator
 *
 * Output: one row with one string column holding the non-NULL input values
 * joined by the separator. Once appending another value would push the list
 * past LINE_MAX, no more values are added and a final "..." marker is emitted.
 *
 * A NULL or empty delimiter (or one whose first byte is NUL) means "no
 * separator": the values are concatenated directly. Previously a NUL byte was
 * written between the values in that case, and because the result is copied
 * out as a C string everything after the first value was silently dropped.
 */
class GroupConcatDelim : public TransformFunction
{
    virtual void processPartition(ServerInterface &srvInterface,
                                  PartitionReader &input_reader,
                                  PartitionWriter &output_writer)
    {
        if (input_reader.getNumCols() != 2)
            vt_report_error(0, "Function only accepts 2 argument, but %zu provided", input_reader.getNumCols());

        ostringstream oss;
        bool first = true;      // true until the first value has been written
        bool exceeded = false;  // set once the LINE_MAX budget has been hit
        do {
            const VString &elem = input_reader.getStringRef(0);
            const VString &delimiter = input_reader.getStringRef(1);

            // Work out the separator for this row. The delimiter is read per
            // row because it is an ordinary column and may differ between rows.
            std::string delimText;
            if (!delimiter.isNull())
                delimText = delimiter.str();
            const bool hasDelim = !delimText.empty() && delimText[0] != '\0';
            const char delim = hasDelim ? delimText[0] : '\0';

            // If input string is NULL, then ignore it
            if (elem.isNull())
            {
                continue;
            }
            else if (!exceeded)
            {
                std::string s = elem.str();
                size_t curpos = oss.tellp();
                // +2 is a conservative allowance for the separator
                curpos += s.length() + 2;
                if (curpos > LINE_MAX)
                {
                    // Out of room: mark the truncation and stop appending.
                    // The loop keeps running so the remaining rows are consumed.
                    exceeded = true;
                    if (!first && hasDelim) oss << delim;
                    oss << "...";
                }
                else
                {
                    // Never stream a NUL separator: it would terminate the
                    // C string handed to VString::copy below.
                    if (!first && hasDelim) oss << delim;
                    first = false;
                    oss << s;
                }
            }
        } while (input_reader.next());

        VString &summary = output_writer.getStringRef(0);
        summary.copy(oss.str().c_str());
        output_writer.next();
    }
};

/*
 * Factory for GroupConcatDelim: declares the function's argument and return
 * types and creates the transform function object.
 */
class GroupConcatDelimFactory : public TransformFunctionFactory
{
    // Signature: two varchar arguments in (the value and the delimiter),
    // one varchar column out.
    virtual void getPrototype(ServerInterface &srvInterface, ColumnTypes &argTypes, ColumnTypes &returnType)
    {
        for (int arg = 0; arg < 2; ++arg)
        {
            argTypes.addVarchar();
        }

        returnType.addVarchar();
    }

    // Declare the width of the output column. It does not depend on the
    // input widths: it is always LINE_MAX plus slack for a trailing "...".
    virtual void getReturnType(ServerInterface &srvInterface,
                               const SizedColumnTypes &input_types,
                               SizedColumnTypes &output_types)
    {
        // Exactly two arguments are required; anything else is an error.
        const size_t argCount = input_types.getColumnCount();
        if (argCount != 2)
        {
            vt_report_error(0, "Function only accepts 2 argument, but %zu provided", argCount);
        }

        const int outputWidth = LINE_MAX+5;
        output_types.addVarchar(outputWidth, "list");
    }

    // Create the transform function object using the server-provided allocator.
    virtual TransformFunction *createTransformFunction(ServerInterface &srvInterface)
    {
        return vt_createFuncObj(srvInterface.allocator, GroupConcatDelim);
    }

};

// Register the factory with the Vertica SDK. Presumably this is what makes the
// class available under the name 'GroupConcatDelimFactory' that ddl/install.sql
// uses in CREATE TRANSFORM FUNCTION — confirm against the SDK documentation.
RegisterFactory(GroupConcatDelimFactory);