From 9abd4f417349bcfcf788b92e64666f147dfe5767 Mon Sep 17 00:00:00 2001 From: Jason Petersen Date: Thu, 18 Apr 2024 08:38:39 -0600 Subject: [PATCH] Add compression functions, features, docs, tests This adds a dependency on Hydra columnar and pg_cron. --- .github/workflows/installcheck.yml | 6 +- META.json | 2 + Makefile | 9 ++- doc/timeseries.md | 8 +- sql/timeseries.sql | 120 ++++++++++++++++++++++++++++- test/expected/basic_usage.out | 2 + test/sql/basic_usage.sql | 1 + timeseries.conf | 1 + timeseries.control | 2 +- 9 files changed, 143 insertions(+), 8 deletions(-) create mode 100644 timeseries.conf diff --git a/.github/workflows/installcheck.yml b/.github/workflows/installcheck.yml index 4e03148..9ed5100 100644 --- a/.github/workflows/installcheck.yml +++ b/.github/workflows/installcheck.yml @@ -33,7 +33,11 @@ jobs: - name: Install dependencies run: > sudo apt-get install -y --no-install-recommends \ - postgresql-${{matrix.pg_version}}-partman + postgresql-${{matrix.pg_version}}-partman \ + postgresql-${{matrix.pg_version}}-cron \ + curl && + curl -sfL https://install.pgx.sh | sudo sh - && + sudo pgxman install hydra_columnar - uses: actions/checkout@v3 diff --git a/META.json b/META.json index 764780a..27ea030 100644 --- a/META.json +++ b/META.json @@ -21,6 +21,8 @@ "PostgreSQL": "14.0.0" }, "recommends": { + "columnar": "1.1.2", + "pg_cron": "1.6.2", "pg_partman": "4.5.1" } } diff --git a/Makefile b/Makefile index ffc75e2..cc7acc6 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ EXTVERSION = $(shell grep -m 1 '[[:space:]]\{6\}"version":' META.json | \ DISTVERSION = $(shell grep -m 1 '[[:space:]]\{3\}"version":' META.json | \ sed -e 's/[[:space:]]*"version":[[:space:]]*"\([^"]*\)",\{0,1\}/\1/') -EXTVERSIONS = 0.1.2 +EXTVERSIONS = 0.1.3 DATA = $(wildcard sql/*--*.sql) DATA_built = $(foreach v,$(EXTVERSIONS),sql/$(EXTENSION)--$(v).sql) @@ -13,7 +13,7 @@ DATA_built = $(foreach v,$(EXTVERSIONS),sql/$(EXTENSION)--$(v).sql) DOCS = $(wildcard doc/*.md) TESTS = $(wildcard test/sql/*.sql) REGRESS = $(patsubst test/sql/%.sql,%,$(TESTS)) -REGRESS_OPTS = --inputdir=test +REGRESS_OPTS = --temp-config=./timeseries.conf --temp-instance=./tmp_check --inputdir=test PG_CONFIG = pg_config EXTRA_CLEAN = sql/$(EXTENSION)--$(EXTVERSION).sql @@ -32,3 +32,8 @@ latest-changes.md: Changes # previous upgrade scripts sql/$(EXTENSION)--0.1.2.sql: sql/$(EXTENSION).sql cat $^ > $@ + +# generate each version's file installation file by concatenating +# previous upgrade scripts +sql/$(EXTENSION)--0.1.3.sql: sql/$(EXTENSION).sql + cat $^ > $@ diff --git a/doc/timeseries.md b/doc/timeseries.md index b9a2017..451355f 100644 --- a/doc/timeseries.md +++ b/doc/timeseries.md @@ -39,13 +39,19 @@ On the other hand, you may be worried about plugging a firehose of data into you Fortunately, it's incredibly easy to simply drop time-series partitions on a schedule. Call `set_ts_retention_policy` with your time-series table and an interval (say, `'90 days'`) to establish such a policy. Once an hour, any partitions falling entirely outside the retention window will be dropped. Use `clear_ts_retention_policy` to revert to the default behavior (infinite retention). Each of these functions will return the previous retention policy when called. +### Compression + +Sometimes you know older data isn't queried very often, but still don't want to commit to just dropping older partitions. In this case, compression may be what you desire. + +By calling `set_ts_compression_policy` on a time-series table with an appropriate interval (perhaps`'1 month'`), this extension will take care of compressing partitions (using a columnar storage method) older than the specified interval, once an hour. As with the retention policy functionality, a function is also provided for clearing any existing policy (existing partitions will not be decompressed, however). + ## Roadmap While `timeseries` is still in its early days, we have a concrete vision for the features we will be including in the future. Feedback on the importance of a given feature to customer use cases will help us better prioritize the following lists. This list is somewhat ordered by likelihood of near-term delivery, or maybe difficulty, but that property is only loosely intended and no guarantee of priority. Again, feedback from users will take precedence. - - Columnar storage and storage type management — will enable some degree of compression and accelerate column-oriented analytics workloads + - Assorted "analytic" functions frequently associated with time-series workloads - Periodic `REFRESH MATERIALIZED VIEW` — set schedules for background refresh of materialized views (useful for dashboarding, etc.) - Roll-off to `TABLESPACE` — as data ages, it will be moved into a specified table space - Use of "tiered storage", i.e. moving older partitions to be stored in S3 rather than on-disk diff --git a/sql/timeseries.sql b/sql/timeseries.sql index d23f008..0cc56ba 100644 --- a/sql/timeseries.sql +++ b/sql/timeseries.sql @@ -6,7 +6,8 @@ CREATE TABLE @extschema@.ts_config( table_id regclass PRIMARY KEY, partition_duration interval NOT NULL, partition_lead_time interval NOT NULL, - retention_duration interval); + retention_duration interval, + compression_duration interval); -- Enhances an existing table with our time-series best practices. Basically -- the entry point to this extension. Minimally, a user must create a table @@ -157,8 +158,7 @@ $function$; -- -- In the future, similar functions will manage schedules for moving partitions -- to non-default TABLESPACEs, issuing CLUSTER commands to reorganize data in --- non-active partitions along a chosen index, or even applying compressed --- storage methods and rollup logic. +-- non-active partitions along a chosen index. -- -- Returns the previous retention duration, or NULL if none was set. CREATE OR REPLACE @@ -295,3 +295,117 @@ SELECT table_id, SUM(index_size_bytes) AS index_size_bytes, SUM(total_size_bytes) AS total_size_bytes FROM @extschema@.ts_part_info GROUP BY 1 ORDER BY 2 ASC; + +-- This function sets a compression policy on an existing time-series table, which +-- ensures that all partitions older than a particular offset (from present) are +-- automatically converted to a columnar storage mechanism, which offers some +-- degree of data compression. +-- +-- Returns the previous compression duration, or NULL if none was set. +CREATE OR REPLACE FUNCTION @extschema@.set_ts_compression_policy(target_table_id regclass, new_compression interval) + RETURNS interval + LANGUAGE plpgsql +AS $function$ +DECLARE + table_name text; + prev_compression interval; +BEGIN + SELECT compression_duration + INTO prev_compression + FROM @extschema@.ts_config + WHERE "table_id"=target_table_id + FOR UPDATE; + IF NOT FOUND THEN + RAISE object_not_in_prerequisite_state USING + MESSAGE = 'could not fetch compression policy', + DETAIL = 'Target table was not time-series enhanced', + HINT = format('Call %L to enable time-series enhancements', 'enable_ts_table'); + END IF; + + UPDATE @extschema@.ts_config + SET "compression_duration"=new_compression + WHERE "table_id"=target_table_id; + + RETURN prev_compression; +END; +$function$; + +-- Unsets any compression policy on the specified table. Returns the old policy, +-- if one was set. +CREATE OR REPLACE +FUNCTION @extschema@.clear_ts_compression_policy(target_table_id regclass) + RETURNS interval + LANGUAGE plpgsql +AS $function$ +DECLARE + prev_compression interval; +BEGIN + SELECT set_ts_compression_policy(target_table_id, NULL) INTO prev_compression; + + RETURN prev_compression; +END; +$function$; + +-- This function implements the core of compression application: given a target table ID +-- (which must be time-series enabled) and a compression offset, all partitions falling +-- entirely behind the offset (from the present time) will be converted to using columnar +-- storage. This function is "idempotent" in the sense that repeated calls will behave +-- identically given the same wall clock time and arguments. +CREATE OR REPLACE FUNCTION @extschema@.apply_compression_policy(target_table_id regclass, comp_offset interval) + RETURNS void + LANGUAGE plpgsql +AS $function$ +DECLARE +table_name text; +part_row record; +part_beg timestamptz; +part_end timestamptz; +part_am name; +BEGIN + IF comp_offset IS NULL THEN + RETURN; + END IF; + + SELECT format('%s.%s', n.nspname, c.relname) + INTO table_name + FROM pg_class c + LEFT JOIN pg_namespace n + ON n.oid = c.relnamespace + WHERE c.oid=target_table_id; + + FOR part_row IN + SELECT + partition_schemaname, + partition_tablename + FROM @extschema@.show_partitions(table_name, 'ASC') + LOOP + SELECT child_start_time, child_end_time + INTO part_beg, part_end + FROM @extschema@.show_partition_info( + part_row.partition_schemaname || '.' || + part_row.partition_tablename); + + SELECT am.amname + INTO part_am + FROM pg_class c, pg_am am + WHERE c.oid = (part_row.partition_schemaname || '.' || + part_row.partition_tablename)::regclass AND + c.relam = am.oid; + + IF part_am <> 'columnar' AND + part_end < (now() - comp_offset) THEN + PERFORM columnar.alter_table_set_access_method( + part_row.partition_schemaname || '.' || + part_row.partition_tablename, 'columnar'); + EXECUTE format('ALTER TABLE %I ATTACH PARTITION %I.%I FOR VALUES FROM (%L) TO (%L)', target_table_id, part_row.partition_schemaname, part_row.partition_tablename, part_beg, part_end); + END IF; + END LOOP; +END; +$function$; + +-- Since we're using pg_cron, might as well schedule the maintenance through its bgw +-- rather than run a duplicate one through pg_partman. +SELECT cron.schedule('partman-maintenance', '@hourly', $$SELECT partman.run_maintenance();$$); + +-- Scan the time-series config table and apply compression policies once an hour. +SELECT cron.schedule('timeseries-compression', '@hourly', $$SELECT @extschema@.apply_compression_policy(table_id, compression_duration) FROM ts_config;$$); diff --git a/test/expected/basic_usage.out b/test/expected/basic_usage.out index c895173..ffeba1d 100644 --- a/test/expected/basic_usage.out +++ b/test/expected/basic_usage.out @@ -2,10 +2,12 @@ BEGIN; CREATE SCHEMA partman; CREATE EXTENSION IF NOT EXISTS pg_partman WITH SCHEMA partman CASCADE; +CREATE EXTENSION IF NOT EXISTS pg_cron CASCADE; CREATE EXTENSION timeseries; ROLLBACK; CREATE EXTENSION timeseries CASCADE; NOTICE: installing required extension "pg_partman" +NOTICE: installing required extension "pg_cron" CREATE TABLE simple (); SELECT enable_ts_table('simple'); ERROR: could not enable time-series enhancements diff --git a/test/sql/basic_usage.sql b/test/sql/basic_usage.sql index a673a4d..fddb4d4 100644 --- a/test/sql/basic_usage.sql +++ b/test/sql/basic_usage.sql @@ -3,6 +3,7 @@ BEGIN; CREATE SCHEMA partman; CREATE EXTENSION IF NOT EXISTS pg_partman WITH SCHEMA partman CASCADE; +CREATE EXTENSION IF NOT EXISTS pg_cron CASCADE; CREATE EXTENSION timeseries; ROLLBACK; diff --git a/timeseries.conf b/timeseries.conf new file mode 100644 index 0000000..6946a52 --- /dev/null +++ b/timeseries.conf @@ -0,0 +1 @@ +shared_preload_libraries = 'pg_cron' diff --git a/timeseries.control b/timeseries.control index 875f1bf..f284a4d 100644 --- a/timeseries.control +++ b/timeseries.control @@ -2,4 +2,4 @@ comment = 'Convenience API for Tembo time series stack' default_version = '0.1.2' module_pathname = '$libdir/timeseries' -requires = 'pg_partman' +requires = 'columnar, pg_cron, pg_partman'