From 785249ef6657d90de72566d128ce95652a014a79 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 21 Feb 2023 15:42:14 -0500 Subject: [PATCH 01/10] Raw bindings --- conda/recipes/libcudf/meta.yaml | 1 + cpp/include/cudf/io/timezone.hpp | 40 +++++++++++++++++++ cpp/src/datetime/timezone.cpp | 1 + cpp/src/io/orc/timezone.hpp | 41 ++++++++++++++++++++ python/cudf/cudf/_lib/CMakeLists.txt | 1 + python/cudf/cudf/_lib/__init__.py | 1 + python/cudf/cudf/_lib/cpp/io/timezone.pxd | 47 +++++++++++++++++++++++ python/cudf/cudf/_lib/timezone.pyx | 30 +++++++++++++++ 8 files changed, 162 insertions(+) create mode 100644 cpp/include/cudf/io/timezone.hpp create mode 100644 cpp/src/io/orc/timezone.hpp create mode 100644 python/cudf/cudf/_lib/cpp/io/timezone.pxd create mode 100644 python/cudf/cudf/_lib/timezone.pyx diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 2bb571f858d..8f537b3e141 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -183,6 +183,7 @@ outputs: - test -f $PREFIX/include/cudf/io/text/detail/tile_state.hpp - test -f $PREFIX/include/cudf/io/text/detail/trie.hpp - test -f $PREFIX/include/cudf/io/text/multibyte_split.hpp + - test -f $PREFIX/include/cudf/io/timezone.hpp - test -f $PREFIX/include/cudf/io/types.hpp - test -f $PREFIX/include/cudf/join.hpp - test -f $PREFIX/include/cudf/labeling/label_bins.hpp diff --git a/cpp/include/cudf/io/timezone.hpp b/cpp/include/cudf/io/timezone.hpp new file mode 100644 index 00000000000..ed3ec3c03e8 --- /dev/null +++ b/cpp/include/cudf/io/timezone.hpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include +#include + +namespace cudf { +namespace io { + +/** + * @brief Creates a transition table to convert ORC timestamps to UTC. + * + * Uses system's TZif files. Assumes little-endian platform when parsing these files. + * + * @param tzif_dir TODO + * @param timezone_name standard timezone name (for example, "US/Pacific") + * + * @return The transition table for the given timezone + */ +std::unique_ptr build_timezone_transition_table(std::optional const& tzif_dir, + std::string const& timezone_name); + +} // namespace io +} // namespace cudf diff --git a/cpp/src/datetime/timezone.cpp b/cpp/src/datetime/timezone.cpp index 55d68fe4a1a..3df33682699 100644 --- a/cpp/src/datetime/timezone.cpp +++ b/cpp/src/datetime/timezone.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include diff --git a/cpp/src/io/orc/timezone.hpp b/cpp/src/io/orc/timezone.hpp new file mode 100644 index 00000000000..017376f5197 --- /dev/null +++ b/cpp/src/io/orc/timezone.hpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include +#include + +namespace cudf { +namespace io { + +/** + * @brief Creates a transition table to convert ORC timestamps to UTC. + * + * Uses system's TZif files. Assumes little-endian platform when parsing these files. + * + * @param tzif_dir TODO + * @param timezone_name standard timezone name (for example, "US/Pacific") + * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return The transition table for the given timezone + */ +std::unique_ptr
build_timezone_transition_table(std::optional const& tzif_dir, + std::string const& timezone_name); + +} // namespace io +} // namespace cudf diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index f7d4f12ad81..9391555a272 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -48,6 +48,7 @@ set(cython_sources string_casting.pyx strings_udf.pyx text.pyx + timezone.pyx transform.pyx transpose.pyx types.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index b101db9a744..09227def4e7 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -35,6 +35,7 @@ strings, strings_udf, text, + timezone, transpose, unary, ) diff --git a/python/cudf/cudf/_lib/cpp/io/timezone.pxd b/python/cudf/cudf/_lib/cpp/io/timezone.pxd new file mode 100644 index 00000000000..6e4d907d022 --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/io/timezone.pxd @@ -0,0 +1,47 @@ +# Copyright (c) 2020-2023, NVIDIA CORPORATION. + +from libcpp cimport bool +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string + +from cudf._lib.cpp.table.table cimport table + + +cdef extern from "" namespace "std" nogil: + cdef cppclass nullopt_t: + nullopt_t() + + cdef nullopt_t nullopt + + cdef cppclass optional[T]: + ctypedef T value_type + optional() + optional(nullopt_t) + optional(optional&) except + + optional(T&) except + + bool has_value() + T& value() + T& value_or[U](U& default_value) + void swap(optional&) + void reset() + T& emplace(...) + T& operator*() + optional& operator=(optional&) + optional& operator=[U](U&) + bool operator bool() + bool operator!() + bool operator==[U](optional&, U&) + bool operator!=[U](optional&, U&) + bool operator<[U](optional&, U&) + bool operator>[U](optional&, U&) + bool operator<=[U](optional&, U&) + bool operator>=[U](optional&, U&) + + optional[T] make_optional[T](...) 
except + + + +cdef extern from "cudf/io/timezone.hpp" namespace "cudf::io" nogil: + unique_ptr[table] build_timezone_transition_table( + optional[string] tzif_dir, + string timezone_name + ) except + diff --git a/python/cudf/cudf/_lib/timezone.pyx b/python/cudf/cudf/_lib/timezone.pyx new file mode 100644 index 00000000000..853c3afccd1 --- /dev/null +++ b/python/cudf/cudf/_lib/timezone.pyx @@ -0,0 +1,30 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string +from libcpp.utility cimport move + +from cudf._lib.cpp.io.timezone cimport ( + build_timezone_transition_table as cpp_build_timezone_transition_table, + nullopt, + optional, +) +from cudf._lib.cpp.table.table cimport table +from cudf._lib.utils cimport columns_from_unique_ptr + + +def build_timezone_transition_table(timezone_name, tzif_dir): + + cdef unique_ptr[table] c_result + cdef optional[string] c_tzif_dir = nullopt + cdef string c_timezone_name = timezone_name.encode() + + with nogil: + c_result = move( + cpp_build_timezone_transition_table( + c_tzif_dir, + c_timezone_name + ) + ) + + return columns_from_unique_ptr(move(c_result)) From 399866569ed182918acdfcbdecf9d15da2e2b5c5 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 21 Feb 2023 15:43:39 -0500 Subject: [PATCH 02/10] Add TODO --- python/cudf/cudf/_lib/cpp/io/timezone.pxd | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cudf/cudf/_lib/cpp/io/timezone.pxd b/python/cudf/cudf/_lib/cpp/io/timezone.pxd index 6e4d907d022..c9f9a39bf84 100644 --- a/python/cudf/cudf/_lib/cpp/io/timezone.pxd +++ b/python/cudf/cudf/_lib/cpp/io/timezone.pxd @@ -7,6 +7,7 @@ from libcpp.string cimport string from cudf._lib.cpp.table.table cimport table +# TODO: this is vendored from Cython 3.0 cdef extern from "" namespace "std" nogil: cdef cppclass nullopt_t: nullopt_t() From 20fcb889700bfa67fc2cf4d2b91219e37fdbc811 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 22 Feb 2023 14:30:45 -0500 
Subject: [PATCH 03/10] Add get_tz_data --- python/cudf/cudf/_lib/timezone.pyx | 17 +++-- python/cudf/cudf/core/_internals/timezones.py | 72 +++++++++++++++++++ 2 files changed, 82 insertions(+), 7 deletions(-) create mode 100644 python/cudf/cudf/core/_internals/timezones.py diff --git a/python/cudf/cudf/_lib/timezone.pyx b/python/cudf/cudf/_lib/timezone.pyx index 853c3afccd1..0f969771e76 100644 --- a/python/cudf/cudf/_lib/timezone.pyx +++ b/python/cudf/cudf/_lib/timezone.pyx @@ -1,4 +1,5 @@ # Copyright (c) 2023, NVIDIA CORPORATION. +import os from libcpp.memory cimport unique_ptr from libcpp.string cimport string @@ -6,24 +7,26 @@ from libcpp.utility cimport move from cudf._lib.cpp.io.timezone cimport ( build_timezone_transition_table as cpp_build_timezone_transition_table, - nullopt, - optional, + make_optional, ) from cudf._lib.cpp.table.table cimport table from cudf._lib.utils cimport columns_from_unique_ptr -def build_timezone_transition_table(timezone_name, tzif_dir): +def build_timezone_transition_table(tzdir, tzname): + # TODO: libcudf needs the path to end with a '/' separator (but + # shouldn't). Remove this if/when that no longer a requirement: + tzdir = os.path.join(tzdir, "") cdef unique_ptr[table] c_result - cdef optional[string] c_tzif_dir = nullopt - cdef string c_timezone_name = timezone_name.encode() + cdef string c_tzdir = tzdir.encode() + cdef string c_tzname = tzname.encode() with nogil: c_result = move( cpp_build_timezone_transition_table( - c_tzif_dir, - c_timezone_name + make_optional[string](c_tzdir), + c_tzname ) ) diff --git a/python/cudf/cudf/core/_internals/timezones.py b/python/cudf/cudf/core/_internals/timezones.py new file mode 100644 index 00000000000..f0106fbbf80 --- /dev/null +++ b/python/cudf/cudf/core/_internals/timezones.py @@ -0,0 +1,72 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. 
+ +import os +import zoneinfo +from functools import lru_cache + +from cudf._lib.timezone import build_timezone_transition_table +from cudf.core.dataframe import DataFrame + + +@lru_cache(maxsize=20) +def get_tz_data(zone_name): + """ + Return timezone data (transition times and UTC offsets) for the + given IANA time zone. + + Parameters + ---------- + zone_name: str + IANA time zone name + + Returns + ------- + DataFrame with two columns containing the transition times ("dt") + and corresponding UTC offsets ("offsets"). + """ + try: + # like zoneinfo, we first look in TZPATH + return _find_and_read_tzfile_tzpath(zone_name) + except zoneinfo.ZoneInfoNotFoundError: + # if that fails, we fall back to using `tzdata` + return _find_and_read_tzfile_tzdata(zone_name) + + +def _find_and_read_tzfile_tzpath(zone_name): + for search_path in zoneinfo.TZPATH: + if os.path.isfile(os.path.join(search_path, zone_name)): + return _read_tzfile_as_frame(search_path, zone_name) + # if we didn't find + raise zoneinfo.ZoneInfoNotFoundError(zone_name) + + +def _find_and_read_tzfile_tzdata(zone_name): + import importlib.resources + + package_base = "tzdata.zoneinfo" + try: + return _read_tzfile_as_frame( + str(importlib.resources.files(package_base)), zone_name + ) + # TODO: make it so that the call to libcudf raises a + # FileNotFoundError instead of a RuntimeError + except (ImportError, FileNotFoundError, UnicodeEncodeError, RuntimeError): + # the "except" part of this try-except is basically vendored + # from the zoneinfo library. + # + # There are three types of exception that can be raised that all amount + # to "we cannot find this key": + # + # ImportError: If package_name doesn't exist (e.g. if tzdata is not + # installed, or if there's an error in the folder name like + # Amrica/New_York) + # FileNotFoundError: If resource_name doesn't exist in the package + # (e.g.
Europe/Krasnoy) + # UnicodeEncodeError: If package_name or resource_name are not UTF-8, + # such as keys containing a surrogate character. + raise zoneinfo.ZoneInfoNotFoundError(zone_name) + + +def _read_tzfile_as_frame(tzdir, zone_name): + dt, offsets = build_timezone_transition_table(tzdir, zone_name) + return DataFrame._from_columns([dt, offsets], ["dt", "offsets"]) From 7c9f66eea8824d69b6658aafcf3325c7f11ebce5 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 28 Feb 2023 13:06:07 -0800 Subject: [PATCH 04/10] move declarations to /include --- cpp/include/cudf/detail/timezone.cuh | 50 +++++++++++++--------------- cpp/src/io/orc/reader_impl.cu | 1 + 2 files changed, 25 insertions(+), 26 deletions(-) diff --git a/cpp/include/cudf/detail/timezone.cuh b/cpp/include/cudf/detail/timezone.cuh index 830ee1a7fa6..434302374d9 100644 --- a/cpp/include/cudf/detail/timezone.cuh +++ b/cpp/include/cudf/detail/timezone.cuh @@ -13,35 +13,43 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#pragma once + #pragma once #include -#include #include #include +#include #include #include namespace cudf::detail { + +inline __device__ auto project_to_cycle(timestamp_s ts) +{ + static constexpr duration_s cycle_s = + cuda::std::chrono::duration_cast(duration_D{365 * cycle_years + (100 - 3)}); + return timestamp_s{(ts.time_since_epoch() + cycle_s) % cycle_s}; +} + /** - * @brief Returns the UT offset for a given date and given timezone table. + * @brief Returns the GMT offset for a given date and given timezone table. 
* - * @param transition_times Transition times; trailing `solar_cycle_entry_count` entries are used for - * all times beyond the one covered by the TZif file - * @param offsets Time offsets in specific intervals; trailing `solar_cycle_entry_count` entries are - * used for all times beyond the one covered by the TZif file + * @param ttimes Transition times; trailing `cycle_entry_cnt` entries are used for all times + * beyond the one covered by the TZif file + * @param offsets Time offsets in specific intervals; trailing `cycle_entry_cnt` entries are used + * for all times beyond the one covered by the TZif file * @param ts ORC timestamp * - * @return offset from UT, in seconds + * @return GMT offset */ -inline __device__ duration_s get_ut_offset(table_device_view tz_table, timestamp_s ts) +inline __device__ duration_s get_gmt_offset(table_device_view tz_table, timestamp_s ts) { if (tz_table.num_rows() == 0) { return duration_s{0}; } - cudf::device_span transition_times(tz_table.column(0).head(), - static_cast(tz_table.num_rows())); + cudf::device_span ttimes(tz_table.column(0).head(), + static_cast(tz_table.num_rows())); auto const ts_ttime_it = [&]() { auto last_less_equal = [](auto begin, auto end, auto value) { @@ -52,28 +60,18 @@ inline __device__ duration_s get_ut_offset(table_device_view tz_table, timestamp return first_larger - 1; }; - auto const file_entry_end = - transition_times.begin() + (transition_times.size() - solar_cycle_entry_count); + auto const file_entry_end = ttimes.begin() + (ttimes.size() - cycle_entry_cnt); if (ts <= *(file_entry_end - 1)) { // Search the file entries if the timestamp is in range - return last_less_equal(transition_times.begin(), file_entry_end, ts); + return last_less_equal(ttimes.begin(), file_entry_end, ts); } else { - auto project_to_cycle = [](timestamp_s ts) { - // Years divisible by four are leap years - // Exceptions are years divisible by 100, but not divisible by 400 - static constexpr int32_t 
num_leap_years_in_cycle = - solar_cycle_years / 4 - (solar_cycle_years / 100 - solar_cycle_years / 400); - static constexpr duration_s cycle_s = cuda::std::chrono::duration_cast( - duration_D{365 * solar_cycle_years + num_leap_years_in_cycle}); - return timestamp_s{(ts.time_since_epoch() + cycle_s) % cycle_s}; - }; // Search the 400-year cycle if outside of the file entries range - return last_less_equal(file_entry_end, transition_times.end(), project_to_cycle(ts)); + return last_less_equal(file_entry_end, ttimes.end(), project_to_cycle(ts)); } }(); - return tz_table.column(1).element(ts_ttime_it - transition_times.begin()); + return tz_table.column(1).element(ts_ttime_it - ttimes.begin()); } -} // namespace cudf::detail +} diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index fbe44eff5ad..eff8a45c45e 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include From 1bb25534e50433ea18b7cc46209388617ecb226e Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 7 Mar 2023 09:10:56 -0500 Subject: [PATCH 05/10] Renames --- cpp/include/cudf/io/timezone.hpp | 40 ----------------------- python/cudf/cudf/_lib/cpp/io/timezone.pxd | 4 +-- python/cudf/cudf/_lib/timezone.pyx | 6 ++-- 3 files changed, 5 insertions(+), 45 deletions(-) delete mode 100644 cpp/include/cudf/io/timezone.hpp diff --git a/cpp/include/cudf/io/timezone.hpp b/cpp/include/cudf/io/timezone.hpp deleted file mode 100644 index ed3ec3c03e8..00000000000 --- a/cpp/include/cudf/io/timezone.hpp +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include -#include - -namespace cudf { -namespace io { - -/** - * @brief Creates a transition table to convert ORC timestamps to UTC. - * - * Uses system's TZif files. Assumes little-endian platform when parsing these files. - * - * @param tzif_dir TODO - * @param timezone_name standard timezone name (for example, "US/Pacific") - * - * @return The transition table for the given timezone - */ -std::unique_ptr
build_timezone_transition_table(std::optional const& tzif_dir, - std::string const& timezone_name); - -} // namespace io -} // namespace cudf diff --git a/python/cudf/cudf/_lib/cpp/io/timezone.pxd b/python/cudf/cudf/_lib/cpp/io/timezone.pxd index c9f9a39bf84..1ba591f73f5 100644 --- a/python/cudf/cudf/_lib/cpp/io/timezone.pxd +++ b/python/cudf/cudf/_lib/cpp/io/timezone.pxd @@ -41,8 +41,8 @@ cdef extern from "" namespace "std" nogil: optional[T] make_optional[T](...) except + -cdef extern from "cudf/io/timezone.hpp" namespace "cudf::io" nogil: - unique_ptr[table] build_timezone_transition_table( +cdef extern from "cudf/timezone.hpp" namespace "cudf" nogil: + unique_ptr[table] make_timezone_transition_table( optional[string] tzif_dir, string timezone_name ) except + diff --git a/python/cudf/cudf/_lib/timezone.pyx b/python/cudf/cudf/_lib/timezone.pyx index 0f969771e76..6744d7dbfc4 100644 --- a/python/cudf/cudf/_lib/timezone.pyx +++ b/python/cudf/cudf/_lib/timezone.pyx @@ -6,14 +6,14 @@ from libcpp.string cimport string from libcpp.utility cimport move from cudf._lib.cpp.io.timezone cimport ( - build_timezone_transition_table as cpp_build_timezone_transition_table, + make_timezone_transition_table as cpp_make_timezone_transition_table, make_optional, ) from cudf._lib.cpp.table.table cimport table from cudf._lib.utils cimport columns_from_unique_ptr -def build_timezone_transition_table(tzdir, tzname): +def make_timezone_transition_table(tzdir, tzname): # TODO: libcudf needs the path to end with a '/' separator (but # shouldn't). 
Remove this if/when that no longer a requirement: tzdir = os.path.join(tzdir, "") @@ -24,7 +24,7 @@ def build_timezone_transition_table(tzdir, tzname): with nogil: c_result = move( - cpp_build_timezone_transition_table( + cpp_make_timezone_transition_table( make_optional[string](c_tzdir), c_tzname ) From f75b6926a662bc9836c808874a73c3a863b396d4 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 15 Mar 2023 12:30:05 -0400 Subject: [PATCH 06/10] Remove extra files --- conda/recipes/libcudf/meta.yaml | 1 - cpp/src/datetime/timezone.cpp | 1 - cpp/src/io/orc/timezone.hpp | 41 ---------------------------- python/cudf/cudf/_lib/CMakeLists.txt | 1 - 4 files changed, 44 deletions(-) delete mode 100644 cpp/src/io/orc/timezone.hpp diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 8f537b3e141..2bb571f858d 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -183,7 +183,6 @@ outputs: - test -f $PREFIX/include/cudf/io/text/detail/tile_state.hpp - test -f $PREFIX/include/cudf/io/text/detail/trie.hpp - test -f $PREFIX/include/cudf/io/text/multibyte_split.hpp - - test -f $PREFIX/include/cudf/io/timezone.hpp - test -f $PREFIX/include/cudf/io/types.hpp - test -f $PREFIX/include/cudf/join.hpp - test -f $PREFIX/include/cudf/labeling/label_bins.hpp diff --git a/cpp/src/datetime/timezone.cpp b/cpp/src/datetime/timezone.cpp index 3df33682699..55d68fe4a1a 100644 --- a/cpp/src/datetime/timezone.cpp +++ b/cpp/src/datetime/timezone.cpp @@ -18,7 +18,6 @@ #include #include #include -#include #include #include diff --git a/cpp/src/io/orc/timezone.hpp b/cpp/src/io/orc/timezone.hpp deleted file mode 100644 index 017376f5197..00000000000 --- a/cpp/src/io/orc/timezone.hpp +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include -#include - -namespace cudf { -namespace io { - -/** - * @brief Creates a transition table to convert ORC timestamps to UTC. - * - * Uses system's TZif files. Assumes little-endian platform when parsing these files. - * - * @param tzif_dir TODO - * @param timezone_name standard timezone name (for example, "US/Pacific") - * @param stream CUDA stream used for device memory operations and kernel launches - * - * @return The transition table for the given timezone - */ -std::unique_ptr
build_timezone_transition_table(std::optional const& tzif_dir, - std::string const& timezone_name); - -} // namespace io -} // namespace cudf diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 9391555a272..f7d4f12ad81 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -48,7 +48,6 @@ set(cython_sources string_casting.pyx strings_udf.pyx text.pyx - timezone.pyx transform.pyx transpose.pyx types.pyx From 1d8a60a1a18433fa35bcf12f177ee440db472675 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 5 Apr 2023 14:07:11 -0400 Subject: [PATCH 07/10] Final touches --- python/cudf/cudf/_lib/CMakeLists.txt | 1 + python/cudf/cudf/_lib/timezone.pyx | 6 +----- python/cudf/cudf/core/_internals/timezones.py | 1 - 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index f7d4f12ad81..9391555a272 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -48,6 +48,7 @@ set(cython_sources string_casting.pyx strings_udf.pyx text.pyx + timezone.pyx transform.pyx transpose.pyx types.pyx diff --git a/python/cudf/cudf/_lib/timezone.pyx b/python/cudf/cudf/_lib/timezone.pyx index 6744d7dbfc4..9125315c6dd 100644 --- a/python/cudf/cudf/_lib/timezone.pyx +++ b/python/cudf/cudf/_lib/timezone.pyx @@ -1,13 +1,11 @@ # Copyright (c) 2023, NVIDIA CORPORATION. 
-import os - from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.utility cimport move from cudf._lib.cpp.io.timezone cimport ( - make_timezone_transition_table as cpp_make_timezone_transition_table, make_optional, + make_timezone_transition_table as cpp_make_timezone_transition_table, ) from cudf._lib.cpp.table.table cimport table from cudf._lib.utils cimport columns_from_unique_ptr @@ -16,8 +14,6 @@ from cudf._lib.utils cimport columns_from_unique_ptr def make_timezone_transition_table(tzdir, tzname): # TODO: libcudf needs the path to end with a '/' separator (but # shouldn't). Remove this if/when that no longer a requirement: - tzdir = os.path.join(tzdir, "") - cdef unique_ptr[table] c_result cdef string c_tzdir = tzdir.encode() cdef string c_tzname = tzname.encode() diff --git a/python/cudf/cudf/core/_internals/timezones.py b/python/cudf/cudf/core/_internals/timezones.py index f0106fbbf80..0cc5db57c9c 100644 --- a/python/cudf/cudf/core/_internals/timezones.py +++ b/python/cudf/cudf/core/_internals/timezones.py @@ -36,7 +36,6 @@ def _find_and_read_tzfile_tzpath(zone_name): for search_path in zoneinfo.TZPATH: if os.path.isfile(os.path.join(search_path, zone_name)): return _read_tzfile_as_frame(search_path, zone_name) - # if we didn't find raise zoneinfo.ZoneInfoNotFoundError(zone_name) From e4d0ea9c15a61129bb9a053908b8c6ac04ef026b Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 11 Apr 2023 10:47:44 -0400 Subject: [PATCH 08/10] Move optional into its own module --- python/cudf/cudf/_lib/cpp/io/timezone.pxd | 35 +------------ python/cudf/cudf/_lib/cpp/libcpp/optional.pxd | 50 +++++++++++++++++++ python/cudf/cudf/_lib/timezone.pyx | 3 +- 3 files changed, 53 insertions(+), 35 deletions(-) create mode 100644 python/cudf/cudf/_lib/cpp/libcpp/optional.pxd diff --git a/python/cudf/cudf/_lib/cpp/io/timezone.pxd b/python/cudf/cudf/_lib/cpp/io/timezone.pxd index 1ba591f73f5..ba481d9a1d3 100644 --- 
a/python/cudf/cudf/_lib/cpp/io/timezone.pxd +++ b/python/cudf/cudf/_lib/cpp/io/timezone.pxd @@ -4,43 +4,10 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.string cimport string +from cudf._lib.cpp.libcpp.optional cimport optional from cudf._lib.cpp.table.table cimport table -# TODO: this is vendored from Cython 3.0 -cdef extern from "" namespace "std" nogil: - cdef cppclass nullopt_t: - nullopt_t() - - cdef nullopt_t nullopt - - cdef cppclass optional[T]: - ctypedef T value_type - optional() - optional(nullopt_t) - optional(optional&) except + - optional(T&) except + - bool has_value() - T& value() - T& value_or[U](U& default_value) - void swap(optional&) - void reset() - T& emplace(...) - T& operator*() - optional& operator=(optional&) - optional& operator=[U](U&) - bool operator bool() - bool operator!() - bool operator==[U](optional&, U&) - bool operator!=[U](optional&, U&) - bool operator<[U](optional&, U&) - bool operator>[U](optional&, U&) - bool operator<=[U](optional&, U&) - bool operator>=[U](optional&, U&) - - optional[T] make_optional[T](...) except + - - cdef extern from "cudf/timezone.hpp" namespace "cudf" nogil: unique_ptr[table] make_timezone_transition_table( optional[string] tzif_dir, diff --git a/python/cudf/cudf/_lib/cpp/libcpp/optional.pxd b/python/cudf/cudf/_lib/cpp/libcpp/optional.pxd new file mode 100644 index 00000000000..a78c18f3f7a --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/libcpp/optional.pxd @@ -0,0 +1,50 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION & +# AFFILIATES. All rights reserved. SPDX-License-Identifier: +# Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from libcpp cimport bool + + +cdef extern from "" namespace "std" nogil: + cdef cppclass nullopt_t: + nullopt_t() + + cdef nullopt_t nullopt + + cdef cppclass optional[T]: + ctypedef T value_type + optional() + optional(nullopt_t) + optional(optional&) except + + optional(T&) except + + bool has_value() + T& value() + T& value_or[U](U& default_value) + void swap(optional&) + void reset() + T& emplace(...) + T& operator*() + optional& operator=(optional&) + optional& operator=[U](U&) + bool operator bool() + bool operator!() + bool operator==[U](optional&, U&) + bool operator!=[U](optional&, U&) + bool operator<[U](optional&, U&) + bool operator>[U](optional&, U&) + bool operator<=[U](optional&, U&) + bool operator>=[U](optional&, U&) + + optional[T] make_optional[T](...) except + diff --git a/python/cudf/cudf/_lib/timezone.pyx b/python/cudf/cudf/_lib/timezone.pyx index 9125315c6dd..8966fc03402 100644 --- a/python/cudf/cudf/_lib/timezone.pyx +++ b/python/cudf/cudf/_lib/timezone.pyx @@ -1,12 +1,13 @@ # Copyright (c) 2023, NVIDIA CORPORATION. 
+ from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.utility cimport move from cudf._lib.cpp.io.timezone cimport ( - make_optional, make_timezone_transition_table as cpp_make_timezone_transition_table, ) +from cudf._lib.cpp.libcpp.optional cimport make_optional from cudf._lib.cpp.table.table cimport table from cudf._lib.utils cimport columns_from_unique_ptr From 8ec242ba1bcbe8a1350f09205b6f659cfe20556f Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 11 Apr 2023 10:48:18 -0400 Subject: [PATCH 09/10] Stale comment --- python/cudf/cudf/_lib/timezone.pyx | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/cudf/cudf/_lib/timezone.pyx b/python/cudf/cudf/_lib/timezone.pyx index 8966fc03402..4d76cbfcdb5 100644 --- a/python/cudf/cudf/_lib/timezone.pyx +++ b/python/cudf/cudf/_lib/timezone.pyx @@ -13,8 +13,6 @@ from cudf._lib.utils cimport columns_from_unique_ptr def make_timezone_transition_table(tzdir, tzname): - # TODO: libcudf needs the path to end with a '/' separator (but - # shouldn't). Remove this if/when that no longer a requirement: cdef unique_ptr[table] c_result cdef string c_tzdir = tzdir.encode() cdef string c_tzname = tzname.encode() From bf0097bb89b60d38be28ec4ba8b0f14f13254d99 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 17 Apr 2023 08:20:49 -0400 Subject: [PATCH 10/10] Revert "move declarations to /include" This reverts commit 7c9f66eea8824d69b6658aafcf3325c7f11ebce5. --- cpp/include/cudf/detail/timezone.cuh | 50 +++++++++++++++------------- cpp/src/io/orc/reader_impl.cu | 1 - 2 files changed, 26 insertions(+), 25 deletions(-) diff --git a/cpp/include/cudf/detail/timezone.cuh b/cpp/include/cudf/detail/timezone.cuh index 434302374d9..830ee1a7fa6 100644 --- a/cpp/include/cudf/detail/timezone.cuh +++ b/cpp/include/cudf/detail/timezone.cuh @@ -13,43 +13,35 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - #pragma once +#pragma once #include +#include #include #include -#include #include #include namespace cudf::detail { - -inline __device__ auto project_to_cycle(timestamp_s ts) -{ - static constexpr duration_s cycle_s = - cuda::std::chrono::duration_cast(duration_D{365 * cycle_years + (100 - 3)}); - return timestamp_s{(ts.time_since_epoch() + cycle_s) % cycle_s}; -} - /** - * @brief Returns the GMT offset for a given date and given timezone table. + * @brief Returns the UT offset for a given date and given timezone table. * - * @param ttimes Transition times; trailing `cycle_entry_cnt` entries are used for all times - * beyond the one covered by the TZif file - * @param offsets Time offsets in specific intervals; trailing `cycle_entry_cnt` entries are used - * for all times beyond the one covered by the TZif file + * @param transition_times Transition times; trailing `solar_cycle_entry_count` entries are used for + * all times beyond the one covered by the TZif file + * @param offsets Time offsets in specific intervals; trailing `solar_cycle_entry_count` entries are + * used for all times beyond the one covered by the TZif file * @param ts ORC timestamp * - * @return GMT offset + * @return offset from UT, in seconds */ -inline __device__ duration_s get_gmt_offset(table_device_view tz_table, timestamp_s ts) +inline __device__ duration_s get_ut_offset(table_device_view tz_table, timestamp_s ts) { if (tz_table.num_rows() == 0) { return duration_s{0}; } - cudf::device_span ttimes(tz_table.column(0).head(), - static_cast(tz_table.num_rows())); + cudf::device_span transition_times(tz_table.column(0).head(), + static_cast(tz_table.num_rows())); auto const ts_ttime_it = [&]() { auto last_less_equal = [](auto begin, auto end, auto value) { @@ -60,18 +52,28 @@ inline __device__ duration_s get_gmt_offset(table_device_view tz_table, timestam return first_larger - 1; }; - auto const file_entry_end = ttimes.begin() + (ttimes.size() - cycle_entry_cnt); + auto const 
file_entry_end = + transition_times.begin() + (transition_times.size() - solar_cycle_entry_count); if (ts <= *(file_entry_end - 1)) { // Search the file entries if the timestamp is in range - return last_less_equal(ttimes.begin(), file_entry_end, ts); + return last_less_equal(transition_times.begin(), file_entry_end, ts); } else { + auto project_to_cycle = [](timestamp_s ts) { + // Years divisible by four are leap years + // Exceptions are years divisible by 100, but not divisible by 400 + static constexpr int32_t num_leap_years_in_cycle = + solar_cycle_years / 4 - (solar_cycle_years / 100 - solar_cycle_years / 400); + static constexpr duration_s cycle_s = cuda::std::chrono::duration_cast( + duration_D{365 * solar_cycle_years + num_leap_years_in_cycle}); + return timestamp_s{(ts.time_since_epoch() + cycle_s) % cycle_s}; + }; // Search the 400-year cycle if outside of the file entries range - return last_less_equal(file_entry_end, ttimes.end(), project_to_cycle(ts)); + return last_less_equal(file_entry_end, transition_times.end(), project_to_cycle(ts)); } }(); - return tz_table.column(1).element(ts_ttime_it - ttimes.begin()); + return tz_table.column(1).element(ts_ttime_it - transition_times.begin()); } -} +} // namespace cudf::detail diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index eff8a45c45e..fbe44eff5ad 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include