From 6909db8b76cb047f46a8759035b578c9417bb8c7 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 14 Jul 2023 13:25:27 -0400 Subject: [PATCH] Use Parser for cast kernel (#4512) (#4513) * Use Parser for cast kernel (#4512) * Support parsing unpadded dates * Review feedback * Tweak bound --- arrow-cast/Cargo.toml | 4 + arrow-cast/benches/parse_date.rs | 34 + arrow-cast/src/cast.rs | 515 +--- arrow-cast/src/parse.rs | 4554 +++++++++++++++--------------- 4 files changed, 2416 insertions(+), 2691 deletions(-) create mode 100644 arrow-cast/benches/parse_date.rs diff --git a/arrow-cast/Cargo.toml b/arrow-cast/Cargo.toml index 494ad104b11c..2758a4817814 100644 --- a/arrow-cast/Cargo.toml +++ b/arrow-cast/Cargo.toml @@ -65,6 +65,10 @@ harness = false name = "parse_time" harness = false +[[bench]] +name = "parse_date" +harness = false + [[bench]] name = "parse_decimal" harness = false diff --git a/arrow-cast/benches/parse_date.rs b/arrow-cast/benches/parse_date.rs new file mode 100644 index 000000000000..e05d38d2f853 --- /dev/null +++ b/arrow-cast/benches/parse_date.rs @@ -0,0 +1,34 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_array::types::Date32Type; +use arrow_cast::parse::Parser; +use criterion::*; + +fn criterion_benchmark(c: &mut Criterion) { + let timestamps = ["2020-09-08", "2020-9-8", "2020-09-8", "2020-9-08"]; + + for timestamp in timestamps { + let t = black_box(timestamp); + c.bench_function(t, |b| { + b.iter(|| Date32Type::parse(t).unwrap()); + }); + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 814ad6589f2d..3a5c27fb6082 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -37,14 +37,14 @@ //! assert_eq!(7.0, c.value(2)); //! ``` -use chrono::{NaiveTime, Offset, TimeZone, Timelike, Utc}; +use chrono::{NaiveTime, Offset, TimeZone, Utc}; use std::cmp::Ordering; use std::sync::Arc; use crate::display::{array_value_to_string, ArrayFormatter, FormatOptions}; use crate::parse::{ parse_interval_day_time, parse_interval_month_day_nano, parse_interval_year_month, - string_to_datetime, + string_to_datetime, Parser, }; use arrow_array::{ builder::*, cast::*, temporal_conversions::*, timezone::Tz, types::*, *, @@ -1262,36 +1262,28 @@ pub fn cast_with_options( ))), }, (Utf8, _) => match to_type { - UInt8 => cast_string_to_numeric::(array, cast_options), - UInt16 => cast_string_to_numeric::(array, cast_options), - UInt32 => cast_string_to_numeric::(array, cast_options), - UInt64 => cast_string_to_numeric::(array, cast_options), - Int8 => cast_string_to_numeric::(array, cast_options), - Int16 => cast_string_to_numeric::(array, cast_options), - Int32 => cast_string_to_numeric::(array, cast_options), - Int64 => cast_string_to_numeric::(array, cast_options), - Float32 => cast_string_to_numeric::(array, cast_options), - Float64 => cast_string_to_numeric::(array, cast_options), - Date32 => cast_string_to_date32::(array, cast_options), - Date64 => cast_string_to_date64::(array, cast_options), + UInt8 => parse_string::(array, cast_options), + UInt16 => parse_string::(array, cast_options), + UInt32 => parse_string::(array, cast_options), + UInt64 => parse_string::(array, cast_options), + Int8 => parse_string::(array, cast_options), + Int16 => parse_string::(array, cast_options), + Int32 => parse_string::(array, cast_options), + Int64 => parse_string::(array, cast_options), + Float32 => parse_string::(array, cast_options), + Float64 => parse_string::(array, cast_options), + Date32 => parse_string::(array, cast_options), + Date64 => parse_string::(array, cast_options), Binary => Ok(Arc::new(BinaryArray::from(array.as_string::().clone()))), LargeBinary => { let binary = BinaryArray::from(array.as_string::().clone()); cast_byte_container::(&binary) } LargeUtf8 => cast_byte_container::(array), - Time32(TimeUnit::Second) => { - cast_string_to_time32second::(array, cast_options) - } - Time32(TimeUnit::Millisecond) => { - cast_string_to_time32millisecond::(array, cast_options) - } - Time64(TimeUnit::Microsecond) => { - cast_string_to_time64microsecond::(array, cast_options) - } - Time64(TimeUnit::Nanosecond) => { - cast_string_to_time64nanosecond::(array, cast_options) - } + Time32(TimeUnit::Second) => parse_string::(array, cast_options), + Time32(TimeUnit::Millisecond) => parse_string::(array, cast_options), + Time64(TimeUnit::Microsecond) => parse_string::(array, cast_options), + Time64(TimeUnit::Nanosecond) => parse_string::(array, cast_options), Timestamp(TimeUnit::Second, to_tz) => { cast_string_to_timestamp::(array, to_tz, cast_options) } @@ -1318,18 +1310,18 @@ pub fn cast_with_options( ))), }, (LargeUtf8, _) => match to_type { - UInt8 => cast_string_to_numeric::(array, cast_options), - UInt16 => cast_string_to_numeric::(array, cast_options), - UInt32 => cast_string_to_numeric::(array, cast_options), - UInt64 => cast_string_to_numeric::(array, cast_options), - Int8 => cast_string_to_numeric::(array, cast_options), - Int16 => cast_string_to_numeric::(array, cast_options), - Int32 => cast_string_to_numeric::(array, cast_options), - Int64 => cast_string_to_numeric::(array, cast_options), - Float32 => cast_string_to_numeric::(array, cast_options), - Float64 => cast_string_to_numeric::(array, cast_options), - Date32 => cast_string_to_date32::(array, cast_options), - Date64 => cast_string_to_date64::(array, cast_options), + UInt8 => parse_string::(array, cast_options), + UInt16 => parse_string::(array, cast_options), + UInt32 => parse_string::(array, cast_options), + UInt64 => parse_string::(array, cast_options), + Int8 => parse_string::(array, cast_options), + Int16 => parse_string::(array, cast_options), + Int32 => parse_string::(array, cast_options), + Int64 => parse_string::(array, cast_options), + Float32 => parse_string::(array, cast_options), + Float64 => parse_string::(array, cast_options), + Date32 => parse_string::(array, cast_options), + Date64 => parse_string::(array, cast_options), Utf8 => cast_byte_container::(array), Binary => { let large_binary = @@ -1339,18 +1331,10 @@ pub fn cast_with_options( LargeBinary => Ok(Arc::new(LargeBinaryArray::from( array.as_string::().clone(), ))), - Time32(TimeUnit::Second) => { - cast_string_to_time32second::(array, cast_options) - } - Time32(TimeUnit::Millisecond) => { - cast_string_to_time32millisecond::(array, cast_options) - } - Time64(TimeUnit::Microsecond) => { - cast_string_to_time64microsecond::(array, cast_options) - } - Time64(TimeUnit::Nanosecond) => { - cast_string_to_time64nanosecond::(array, cast_options) - } + Time32(TimeUnit::Second) => parse_string::(array, cast_options), + Time32(TimeUnit::Millisecond) => parse_string::(array, cast_options), + Time64(TimeUnit::Microsecond) => parse_string::(array, cast_options), + Time64(TimeUnit::Nanosecond) => parse_string::(array, cast_options), Timestamp(TimeUnit::Second, to_tz) => { cast_string_to_timestamp::(array, to_tz, cast_options) } @@ -2523,422 +2507,35 @@ fn value_to_string( Ok(Arc::new(builder.finish())) } -/// Cast numeric types to Utf8 -fn cast_string_to_numeric( - from: &dyn Array, - cast_options: &CastOptions, -) -> Result -where - T: ArrowPrimitiveType, - ::Native: lexical_core::FromLexical, -{ - Ok(Arc::new(string_to_numeric_cast::( - from.as_any() - .downcast_ref::>() - .unwrap(), - cast_options, - )?)) -} - -fn string_to_numeric_cast( - from: &GenericStringArray, - cast_options: &CastOptions, -) -> Result, ArrowError> -where - T: ArrowPrimitiveType, - ::Native: lexical_core::FromLexical, -{ - if cast_options.safe { - let iter = from - .iter() - .map(|v| v.and_then(|v| lexical_core::parse(v.as_bytes()).ok())); - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. - Ok(unsafe { PrimitiveArray::::from_trusted_len_iter(iter) }) - } else { - let vec = from - .iter() - .map(|v| { - v.map(|v| { - lexical_core::parse(v.as_bytes()).map_err(|_| { - ArrowError::CastError(format!( - "Cannot cast string '{}' to value of {:?} type", - v, - T::DATA_TYPE, - )) - }) - }) - .transpose() - }) - .collect::, _>>()?; - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. - Ok(unsafe { PrimitiveArray::::from_trusted_len_iter(vec.iter()) }) - } -} - -/// Casts generic string arrays to Date32Array -fn cast_string_to_date32( - array: &dyn Array, - cast_options: &CastOptions, -) -> Result { - use chrono::Datelike; - let string_array = array - .as_any() - .downcast_ref::>() - .unwrap(); - - let array = if cast_options.safe { - let iter = string_array.iter().map(|v| { - v.and_then(|v| { - v.parse::() - .map(|date| date.num_days_from_ce() - EPOCH_DAYS_FROM_CE) - .ok() - }) - }); - - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { Date32Array::from_trusted_len_iter(iter) } - } else { - let vec = string_array - .iter() - .map(|v| { - v.map(|v| { - v.parse::() - .map(|date| date.num_days_from_ce() - EPOCH_DAYS_FROM_CE) - .map_err(|_| { - ArrowError::CastError(format!( - "Cannot cast string '{}' to value of {:?} type", - v, - DataType::Date32 - )) - }) - }) - .transpose() - }) - .collect::>, _>>()?; - - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { Date32Array::from_trusted_len_iter(vec.iter()) } - }; - - Ok(Arc::new(array) as ArrayRef) -} - -/// Casts generic string arrays to Date64Array -fn cast_string_to_date64( - array: &dyn Array, - cast_options: &CastOptions, -) -> Result { - let string_array = array - .as_any() - .downcast_ref::>() - .unwrap(); - - let array = if cast_options.safe { - let iter = string_array.iter().map(|v| { - v.and_then(|v| { - v.parse::() - .map(|datetime| datetime.timestamp_millis()) - .ok() - }) - }); - - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { Date64Array::from_trusted_len_iter(iter) } - } else { - let vec = string_array - .iter() - .map(|v| { - v.map(|v| { - v.parse::() - .map(|datetime| datetime.timestamp_millis()) - .map_err(|_| { - ArrowError::CastError(format!( - "Cannot cast string '{}' to value of {:?} type", - v, - DataType::Date64 - )) - }) - }) - .transpose() - }) - .collect::>, _>>()?; - - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { Date64Array::from_trusted_len_iter(vec.iter()) } - }; - - Ok(Arc::new(array) as ArrayRef) -} - -/// Casts generic string arrays to `Time32SecondArray` -fn cast_string_to_time32second( - array: &dyn Array, - cast_options: &CastOptions, -) -> Result { - /// The number of nanoseconds per millisecond. - const NANOS_PER_SEC: u32 = 1_000_000_000; - - let string_array = array - .as_any() - .downcast_ref::>() - .unwrap(); - - let array = if cast_options.safe { - let iter = string_array.iter().map(|v| { - v.and_then(|v| { - v.parse::() - .map(|time| { - (time.num_seconds_from_midnight() - + time.nanosecond() / NANOS_PER_SEC) - as i32 - }) - .ok() - }) - }); - - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { Time32SecondArray::from_trusted_len_iter(iter) } - } else { - let vec = string_array - .iter() - .map(|v| { - v.map(|v| { - v.parse::() - .map(|time| { - (time.num_seconds_from_midnight() - + time.nanosecond() / NANOS_PER_SEC) - as i32 - }) - .map_err(|_| { - ArrowError::CastError(format!( - "Cannot cast string '{}' to value of {:?} type", - v, - DataType::Time32(TimeUnit::Second) - )) - }) - }) - .transpose() - }) - .collect::>, _>>()?; - - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { Time32SecondArray::from_trusted_len_iter(vec.iter()) } - }; - - Ok(Arc::new(array) as ArrayRef) -} - -/// Casts generic string arrays to `Time32MillisecondArray` -fn cast_string_to_time32millisecond( - array: &dyn Array, - cast_options: &CastOptions, -) -> Result { - /// The number of nanoseconds per millisecond. - const NANOS_PER_MILLI: u32 = 1_000_000; - /// The number of milliseconds per second. - const MILLIS_PER_SEC: u32 = 1_000; - - let string_array = array - .as_any() - .downcast_ref::>() - .unwrap(); - - let array = if cast_options.safe { - let iter = string_array.iter().map(|v| { - v.and_then(|v| { - v.parse::() - .map(|time| { - (time.num_seconds_from_midnight() * MILLIS_PER_SEC - + time.nanosecond() / NANOS_PER_MILLI) - as i32 - }) - .ok() - }) - }); - - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { Time32MillisecondArray::from_trusted_len_iter(iter) } - } else { - let vec = string_array - .iter() - .map(|v| { - v.map(|v| { - v.parse::() - .map(|time| { - (time.num_seconds_from_midnight() * MILLIS_PER_SEC - + time.nanosecond() / NANOS_PER_MILLI) - as i32 - }) - .map_err(|_| { - ArrowError::CastError(format!( - "Cannot cast string '{}' to value of {:?} type", - v, - DataType::Time32(TimeUnit::Millisecond) - )) - }) - }) - .transpose() - }) - .collect::>, _>>()?; - - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { Time32MillisecondArray::from_trusted_len_iter(vec.iter()) } - }; - - Ok(Arc::new(array) as ArrayRef) -} - -/// Casts generic string arrays to `Time64MicrosecondArray` -fn cast_string_to_time64microsecond( +/// Parse UTF-8 +fn parse_string( array: &dyn Array, cast_options: &CastOptions, ) -> Result { - /// The number of nanoseconds per microsecond. - const NANOS_PER_MICRO: i64 = 1_000; - /// The number of microseconds per second. - const MICROS_PER_SEC: i64 = 1_000_000; - - let string_array = array - .as_any() - .downcast_ref::>() - .unwrap(); - - let array = if cast_options.safe { - let iter = string_array.iter().map(|v| { - v.and_then(|v| { - v.parse::() - .map(|time| { - time.num_seconds_from_midnight() as i64 * MICROS_PER_SEC - + time.nanosecond() as i64 / NANOS_PER_MICRO - }) - .ok() - }) - }); - - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { Time64MicrosecondArray::from_trusted_len_iter(iter) } - } else { - let vec = string_array - .iter() - .map(|v| { - v.map(|v| { - v.parse::() - .map(|time| { - time.num_seconds_from_midnight() as i64 * MICROS_PER_SEC - + time.nanosecond() as i64 / NANOS_PER_MICRO - }) - .map_err(|_| { - ArrowError::CastError(format!( - "Cannot cast string '{}' to value of {:?} type", - v, - DataType::Time64(TimeUnit::Microsecond) - )) - }) - }) - .transpose() - }) - .collect::>, _>>()?; - - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { Time64MicrosecondArray::from_trusted_len_iter(vec.iter()) } - }; - - Ok(Arc::new(array) as ArrayRef) -} - -/// Casts generic string arrays to `Time64NanosecondArray` -fn cast_string_to_time64nanosecond( - array: &dyn Array, - cast_options: &CastOptions, -) -> Result { - /// The number of nanoseconds per second. - const NANOS_PER_SEC: i64 = 1_000_000_000; - - let string_array = array - .as_any() - .downcast_ref::>() - .unwrap(); - + let string_array = array.as_string::(); let array = if cast_options.safe { - let iter = string_array.iter().map(|v| { - v.and_then(|v| { - v.parse::() - .map(|time| { - time.num_seconds_from_midnight() as i64 * NANOS_PER_SEC - + time.nanosecond() as i64 - }) - .ok() - }) - }); + let iter = string_array.iter().map(|x| x.and_then(P::parse)); // Benefit: // 20% performance improvement // Soundness: // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { Time64NanosecondArray::from_trusted_len_iter(iter) } + unsafe { PrimitiveArray::

::from_trusted_len_iter(iter) } } else { - let vec = string_array + let v = string_array .iter() - .map(|v| { - v.map(|v| { - v.parse::() - .map(|time| { - time.num_seconds_from_midnight() as i64 * NANOS_PER_SEC - + time.nanosecond() as i64 - }) - .map_err(|_| { - ArrowError::CastError(format!( - "Cannot cast string '{}' to value of {:?} type", - v, - DataType::Time64(TimeUnit::Nanosecond) - )) - }) - }) - .transpose() + .map(|x| match x { + Some(v) => P::parse(v).ok_or_else(|| { + ArrowError::CastError(format!( + "Cannot cast string '{}' to value of {:?} type", + v, + P::DATA_TYPE + )) + }), + None => Ok(P::Native::default()), }) - .collect::>, _>>()?; - - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { Time64NanosecondArray::from_trusted_len_iter(vec.iter()) } + .collect::, ArrowError>>()?; + PrimitiveArray::new(v.into(), string_array.nulls().cloned()) }; Ok(Arc::new(array) as ArrayRef) @@ -7846,13 +7443,13 @@ mod tests { assert_eq!(946728000000, c.value(0)); assert!(c.is_valid(1)); // "2020-12-15T12:34:56" assert_eq!(1608035696000, c.value(1)); - assert!(c.is_valid(2)); // "2020-2-2T12:34:56" - assert_eq!(1580646896000, c.value(2)); + assert!(!c.is_valid(2)); // "2020-2-2T12:34:56" - // test invalid inputs assert!(!c.is_valid(3)); // "2000-00-00T12:00:00" - assert!(!c.is_valid(4)); // "2000-01-01 12:00:00" - assert!(!c.is_valid(5)); // "2000-01-01" + assert!(c.is_valid(4)); // "2000-01-01 12:00:00" + assert_eq!(946728000000, c.value(4)); + assert!(c.is_valid(5)); // "2000-01-01" + assert_eq!(946684800000, c.value(5)); } #[test] diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index 67477c57d519..50bfca0f84bd 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -1,2232 +1,2322 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use arrow_array::timezone::Tz; -use arrow_array::types::*; -use arrow_array::{ArrowNativeTypeOp, ArrowPrimitiveType}; -use arrow_buffer::ArrowNativeType; -use arrow_schema::ArrowError; -use chrono::prelude::*; -use half::f16; -use std::str::FromStr; - -/// Parse nanoseconds from the first `N` values in digits, subtracting the offset `O` -#[inline] -fn parse_nanos(digits: &[u8]) -> u32 { - digits[..N] - .iter() - .fold(0_u32, |acc, v| acc * 10 + v.wrapping_sub(O) as u32) - * 10_u32.pow((9 - N) as _) -} - -/// Helper for parsing timestamps -struct TimestampParser { - /// The timestamp bytes to parse minus `b'0'` - /// - /// This makes interpretation as an integer inexpensive - digits: [u8; 32], - /// A mask containing a `1` bit where the corresponding byte is a valid ASCII digit - mask: u32, -} - -impl TimestampParser { - fn new(bytes: &[u8]) -> Self { - let mut digits = [0; 32]; - let mut mask = 0; - - // Treating all bytes the same way, helps LLVM vectorise this correctly - for (idx, (o, i)) in digits.iter_mut().zip(bytes).enumerate() { - *o = i.wrapping_sub(b'0'); - mask |= ((*o < 10) as u32) << idx - } - - Self { digits, mask } - } - - /// Returns true if the byte at `idx` in the original string equals `b` - fn test(&self, idx: usize, b: u8) -> bool { - self.digits[idx] == b.wrapping_sub(b'0') - } - - /// Parses a date of the form `1997-01-31` - fn date(&self) -> Option { - if self.mask & 0b1111111111 != 0b1101101111 - || !self.test(4, b'-') - || !self.test(7, b'-') - { - return None; - } - - let year = self.digits[0] as u16 * 1000 - + self.digits[1] as u16 * 100 - + self.digits[2] as u16 * 10 - + self.digits[3] as u16; - - let month = self.digits[5] * 10 + self.digits[6]; - let day = self.digits[8] * 10 + self.digits[9]; - - NaiveDate::from_ymd_opt(year as _, month as _, day as _) - } - - /// Parses a time of any of forms - /// - `09:26:56` - /// - `09:26:56.123` - /// - `09:26:56.123456` - /// - `09:26:56.123456789` - /// - `092656` - /// - /// Returning the end byte offset - fn time(&self) -> Option<(NaiveTime, usize)> { - // Make a NaiveTime handling leap seconds - let time = |hour, min, sec, nano| match sec { - 60 => { - let nano = 1_000_000_000 + nano; - NaiveTime::from_hms_nano_opt(hour as _, min as _, 59, nano) - } - _ => NaiveTime::from_hms_nano_opt(hour as _, min as _, sec as _, nano), - }; - - match (self.mask >> 11) & 0b11111111 { - // 09:26:56 - 0b11011011 if self.test(13, b':') && self.test(16, b':') => { - let hour = self.digits[11] * 10 + self.digits[12]; - let minute = self.digits[14] * 10 + self.digits[15]; - let second = self.digits[17] * 10 + self.digits[18]; - - match self.test(19, b'.') { - true => { - let digits = (self.mask >> 20).trailing_ones(); - let nanos = match digits { - 0 => return None, - 1 => parse_nanos::<1, 0>(&self.digits[20..21]), - 2 => parse_nanos::<2, 0>(&self.digits[20..22]), - 3 => parse_nanos::<3, 0>(&self.digits[20..23]), - 4 => parse_nanos::<4, 0>(&self.digits[20..24]), - 5 => parse_nanos::<5, 0>(&self.digits[20..25]), - 6 => parse_nanos::<6, 0>(&self.digits[20..26]), - 7 => parse_nanos::<7, 0>(&self.digits[20..27]), - 8 => parse_nanos::<8, 0>(&self.digits[20..28]), - _ => parse_nanos::<9, 0>(&self.digits[20..29]), - }; - Some((time(hour, minute, second, nanos)?, 20 + digits as usize)) - } - false => Some((time(hour, minute, second, 0)?, 19)), - } - } - // 092656 - 0b111111 => { - let hour = self.digits[11] * 10 + self.digits[12]; - let minute = self.digits[13] * 10 + self.digits[14]; - let second = self.digits[15] * 10 + self.digits[16]; - let time = time(hour, minute, second, 0)?; - Some((time, 17)) - } - _ => None, - } - } -} - -/// Accepts a string and parses it relative to the provided `timezone` -/// -/// In addition to RFC3339 / ISO8601 standard timestamps, it also -/// accepts strings that use a space ` ` to separate the date and time -/// as well as strings that have no explicit timezone offset. -/// -/// Examples of accepted inputs: -/// * `1997-01-31T09:26:56.123Z` # RCF3339 -/// * `1997-01-31T09:26:56.123-05:00` # RCF3339 -/// * `1997-01-31 09:26:56.123-05:00` # close to RCF3339 but with a space rather than T -/// * `2023-01-01 04:05:06.789 -08` # close to RCF3339, no fractional seconds or time separator -/// * `1997-01-31T09:26:56.123` # close to RCF3339 but no timezone offset specified -/// * `1997-01-31 09:26:56.123` # close to RCF3339 but uses a space and no timezone offset -/// * `1997-01-31 09:26:56` # close to RCF3339, no fractional seconds -/// * `1997-01-31 092656` # close to RCF3339, no fractional seconds -/// * `1997-01-31 092656+04:00` # close to RCF3339, no fractional seconds or time separator -/// * `1997-01-31` # close to RCF3339, only date no time -/// -/// [IANA timezones] are only supported if the `arrow-array/chrono-tz` feature is enabled -/// -/// * `2023-01-01 040506 America/Los_Angeles` -/// -/// If a timestamp is ambiguous, for example as a result of daylight-savings time, an error -/// will be returned -/// -/// Some formats supported by PostgresSql -/// are not supported, like -/// -/// * "2023-01-01 04:05:06.789 +07:30:00", -/// * "2023-01-01 040506 +07:30:00", -/// * "2023-01-01 04:05:06.789 PST", -/// -/// [IANA timezones]: https://www.iana.org/time-zones -pub fn string_to_datetime( - timezone: &T, - s: &str, -) -> Result, ArrowError> { - let err = |ctx: &str| { - ArrowError::ParseError(format!("Error parsing timestamp from '{s}': {ctx}")) - }; - - let bytes = s.as_bytes(); - if bytes.len() < 10 { - return Err(err("timestamp must contain at least 10 characters")); - } - - let parser = TimestampParser::new(bytes); - let date = parser.date().ok_or_else(|| err("error parsing date"))?; - if bytes.len() == 10 { - let offset = timezone.offset_from_local_date(&date); - let offset = offset - .single() - .ok_or_else(|| err("error computing timezone offset"))?; - - let time = NaiveTime::from_hms_opt(0, 0, 0).unwrap(); - return Ok(DateTime::from_local(date.and_time(time), offset)); - } - - if !parser.test(10, b'T') && !parser.test(10, b't') && !parser.test(10, b' ') { - return Err(err("invalid timestamp separator")); - } - - let (time, mut tz_offset) = parser.time().ok_or_else(|| err("error parsing time"))?; - let datetime = date.and_time(time); - - if tz_offset == 32 { - // Decimal overrun - while tz_offset < bytes.len() && bytes[tz_offset].is_ascii_digit() { - tz_offset += 1; - } - } - - if bytes.len() <= tz_offset { - let offset = timezone.offset_from_local_datetime(&datetime); - let offset = offset - .single() - .ok_or_else(|| err("error computing timezone offset"))?; - return Ok(DateTime::from_local(datetime, offset)); - } - - if bytes[tz_offset] == b'z' || bytes[tz_offset] == b'Z' { - let offset = timezone.offset_from_local_datetime(&datetime); - let offset = offset - .single() - .ok_or_else(|| err("error computing timezone offset"))?; - return Ok(DateTime::from_utc(datetime, offset)); - } - - // Parse remainder of string as timezone - let parsed_tz: Tz = s[tz_offset..].trim_start().parse()?; - let offset = parsed_tz.offset_from_local_datetime(&datetime); - let offset = offset - .single() - .ok_or_else(|| err("error computing timezone offset"))?; - Ok(DateTime::::from_local(datetime, offset).with_timezone(timezone)) -} - -/// Accepts a string in RFC3339 / ISO8601 standard format and some -/// variants and converts it to a nanosecond precision timestamp. -/// -/// See [`string_to_datetime`] for the full set of supported formats -/// -/// Implements the `to_timestamp` function to convert a string to a -/// timestamp, following the model of spark SQL’s to_`timestamp`. -/// -/// Internally, this function uses the `chrono` library for the -/// datetime parsing -/// -/// We hope to extend this function in the future with a second -/// parameter to specifying the format string. -/// -/// ## Timestamp Precision -/// -/// Function uses the maximum precision timestamps supported by -/// Arrow (nanoseconds stored as a 64-bit integer) timestamps. This -/// means the range of dates that timestamps can represent is ~1677 AD -/// to 2262 AM -/// -/// ## Timezone / Offset Handling -/// -/// Numerical values of timestamps are stored compared to offset UTC. -/// -/// This function interprets string without an explicit time zone as timestamps -/// relative to UTC, see [`string_to_datetime`] for alternative semantics -/// -/// In particular: -/// -/// ``` -/// # use arrow_cast::parse::string_to_timestamp_nanos; -/// // Note all three of these timestamps are parsed as the same value -/// let a = string_to_timestamp_nanos("1997-01-31 09:26:56.123Z").unwrap(); -/// let b = string_to_timestamp_nanos("1997-01-31T09:26:56.123").unwrap(); -/// let c = string_to_timestamp_nanos("1997-01-31T14:26:56.123+05:00").unwrap(); -/// -/// assert_eq!(a, b); -/// assert_eq!(b, c); -/// ``` -/// -#[inline] -pub fn string_to_timestamp_nanos(s: &str) -> Result { - to_timestamp_nanos(string_to_datetime(&Utc, s)?.naive_utc()) -} - -/// Defensive check to prevent chrono-rs panics when nanosecond conversion happens on non-supported dates -#[inline] -fn to_timestamp_nanos(dt: NaiveDateTime) -> Result { - if dt.timestamp().checked_mul(1_000_000_000).is_none() { - return Err(ArrowError::ParseError( - ERR_NANOSECONDS_NOT_SUPPORTED.to_string(), - )); - } - - Ok(dt.timestamp_nanos()) -} - -/// Accepts a string in ISO8601 standard format and some -/// variants and converts it to nanoseconds since midnight. -/// -/// Examples of accepted inputs: -/// * `09:26:56.123 AM` -/// * `23:59:59` -/// * `6:00 pm` -// -/// Internally, this function uses the `chrono` library for the -/// time parsing -/// -/// ## Timezone / Offset Handling -/// -/// This function does not support parsing strings with a timezone -/// or offset specified, as it considers only time since midnight. -pub fn string_to_time_nanoseconds(s: &str) -> Result { - let nt = string_to_time(s).ok_or_else(|| { - ArrowError::ParseError(format!("Failed to parse \'{s}\' as time")) - })?; - Ok(nt.num_seconds_from_midnight() as i64 * 1_000_000_000 + nt.nanosecond() as i64) -} - -fn string_to_time(s: &str) -> Option { - let bytes = s.as_bytes(); - if bytes.len() < 4 { - return None; - } - - let (am, bytes) = match bytes.get(bytes.len() - 3..) { - Some(b" AM" | b" am" | b" Am" | b" aM") => { - (Some(true), &bytes[..bytes.len() - 3]) - } - Some(b" PM" | b" pm" | b" pM" | b" Pm") => { - (Some(false), &bytes[..bytes.len() - 3]) - } - _ => (None, bytes), - }; - - if bytes.len() < 4 { - return None; - } - - let mut digits = [b'0'; 6]; - - // Extract hour - let bytes = match (bytes[1], bytes[2]) { - (b':', _) => { - digits[1] = bytes[0]; - &bytes[2..] - } - (_, b':') => { - digits[0] = bytes[0]; - digits[1] = bytes[1]; - &bytes[3..] - } - _ => return None, - }; - - if bytes.len() < 2 { - return None; // Minutes required - } - - // Extract minutes - digits[2] = bytes[0]; - digits[3] = bytes[1]; - - let nanoseconds = match bytes.get(2) { - Some(b':') => { - if bytes.len() < 5 { - return None; - } - - // Extract seconds - digits[4] = bytes[3]; - digits[5] = bytes[4]; - - // Extract sub-seconds if any - match bytes.get(5) { - Some(b'.') => { - let decimal = &bytes[6..]; - if decimal.iter().any(|x| !x.is_ascii_digit()) { - return None; - } - match decimal.len() { - 0 => return None, - 1 => parse_nanos::<1, b'0'>(decimal), - 2 => parse_nanos::<2, b'0'>(decimal), - 3 => parse_nanos::<3, b'0'>(decimal), - 4 => parse_nanos::<4, b'0'>(decimal), - 5 => parse_nanos::<5, b'0'>(decimal), - 6 => parse_nanos::<6, b'0'>(decimal), - 7 => parse_nanos::<7, b'0'>(decimal), - 8 => parse_nanos::<8, b'0'>(decimal), - _ => parse_nanos::<9, b'0'>(decimal), - } - } - Some(_) => return None, - None => 0, - } - } - Some(_) => return None, - None => 0, - }; - - digits.iter_mut().for_each(|x| *x = x.wrapping_sub(b'0')); - if digits.iter().any(|x| *x > 9) { - return None; - } - - let hour = match (digits[0] * 10 + digits[1], am) { - (12, Some(true)) => 0, // 12:00 AM -> 00:00 - (h @ 1..=11, Some(true)) => h, // 1:00 AM -> 01:00 - (12, Some(false)) => 12, // 12:00 PM -> 12:00 - (h @ 1..=11, Some(false)) => h + 12, // 1:00 PM -> 13:00 - (_, Some(_)) => return None, - (h, None) => h, - }; - - // Handle leap second - let (second, nanoseconds) = match digits[4] * 10 + digits[5] { - 60 => (59, nanoseconds + 1_000_000_000), - s => (s, nanoseconds), - }; - - NaiveTime::from_hms_nano_opt( - hour as _, - (digits[2] * 10 + digits[3]) as _, - second as _, - nanoseconds, - ) -} - -/// Specialized parsing implementations -/// used by csv and json reader -pub trait Parser: ArrowPrimitiveType { - fn parse(string: &str) -> Option; - - fn parse_formatted(string: &str, _format: &str) -> Option { - Self::parse(string) - } -} - -impl Parser for Float16Type { - fn parse(string: &str) -> Option { - lexical_core::parse(string.as_bytes()) - .ok() - .map(f16::from_f32) - } -} - -impl Parser for Float32Type { - fn parse(string: &str) -> Option { - lexical_core::parse(string.as_bytes()).ok() - } -} - -impl Parser for Float64Type { - fn parse(string: &str) -> Option { - lexical_core::parse(string.as_bytes()).ok() - } -} - -macro_rules! parser_primitive { - ($t:ty) => { - impl Parser for $t { - fn parse(string: &str) -> Option { - lexical_core::parse::(string.as_bytes()).ok() - } - } - }; -} -parser_primitive!(UInt64Type); -parser_primitive!(UInt32Type); -parser_primitive!(UInt16Type); -parser_primitive!(UInt8Type); -parser_primitive!(Int64Type); -parser_primitive!(Int32Type); -parser_primitive!(Int16Type); -parser_primitive!(Int8Type); - -impl Parser for TimestampNanosecondType { - fn parse(string: &str) -> Option { - string_to_timestamp_nanos(string).ok() - } -} - -impl Parser for TimestampMicrosecondType { - fn parse(string: &str) -> Option { - let nanos = string_to_timestamp_nanos(string).ok(); - nanos.map(|x| x / 1000) - } -} - -impl Parser for TimestampMillisecondType { - fn parse(string: &str) -> Option { - let nanos = string_to_timestamp_nanos(string).ok(); - nanos.map(|x| x / 1_000_000) - } -} - -impl Parser for TimestampSecondType { - fn parse(string: &str) -> Option { - let nanos = string_to_timestamp_nanos(string).ok(); - nanos.map(|x| x / 1_000_000_000) - } -} - -impl Parser for Time64NanosecondType { - // Will truncate any fractions of a nanosecond - fn parse(string: &str) -> Option { - string_to_time_nanoseconds(string) - .ok() - .or_else(|| string.parse::().ok()) - } - - fn parse_formatted(string: &str, format: &str) -> Option { - let nt = NaiveTime::parse_from_str(string, format).ok()?; - Some( - nt.num_seconds_from_midnight() as i64 * 1_000_000_000 - + nt.nanosecond() as i64, - ) - } -} - -impl Parser for Time64MicrosecondType { - // Will truncate any fractions of a microsecond - fn parse(string: &str) -> Option { - string_to_time_nanoseconds(string) - .ok() - .map(|nanos| nanos / 1_000) - .or_else(|| string.parse::().ok()) - } - - fn parse_formatted(string: &str, format: &str) -> Option { - let nt = NaiveTime::parse_from_str(string, format).ok()?; - Some( - nt.num_seconds_from_midnight() as i64 * 1_000_000 - + nt.nanosecond() as i64 / 1_000, - ) - } -} - -impl Parser for Time32MillisecondType { - // Will truncate any fractions of a millisecond - fn parse(string: &str) -> Option { - string_to_time_nanoseconds(string) - .ok() - .map(|nanos| (nanos / 1_000_000) as i32) - .or_else(|| string.parse::().ok()) - } - - fn parse_formatted(string: &str, format: &str) -> Option { - let nt = NaiveTime::parse_from_str(string, format).ok()?; - Some( - nt.num_seconds_from_midnight() as i32 * 1_000 - + nt.nanosecond() as i32 / 1_000_000, - ) - } -} - -impl Parser for Time32SecondType { - // Will truncate any fractions of a second - fn parse(string: &str) -> Option { - string_to_time_nanoseconds(string) - .ok() - .map(|nanos| (nanos / 1_000_000_000) as i32) - .or_else(|| string.parse::().ok()) - } - - fn parse_formatted(string: &str, format: &str) -> Option { - let nt = NaiveTime::parse_from_str(string, format).ok()?; - Some( - nt.num_seconds_from_midnight() as i32 - + nt.nanosecond() as i32 / 1_000_000_000, - ) - } -} - -/// Number of days between 0001-01-01 and 1970-01-01 -const EPOCH_DAYS_FROM_CE: i32 = 719_163; - -/// Error message if nanosecond conversion request beyond supported interval -const ERR_NANOSECONDS_NOT_SUPPORTED: &str = "The dates that can be represented as nanoseconds have to be between 1677-09-21T00:12:44.0 and 2262-04-11T23:47:16.854775804"; - -impl Parser for Date32Type { - fn parse(string: &str) -> Option { - let parser = TimestampParser::new(string.as_bytes()); - let date = parser.date()?; - Some(date.num_days_from_ce() - EPOCH_DAYS_FROM_CE) - } - - fn parse_formatted(string: &str, format: &str) -> Option { - let date = NaiveDate::parse_from_str(string, format).ok()?; - Some(date.num_days_from_ce() - EPOCH_DAYS_FROM_CE) - } -} - -impl Parser for Date64Type { - fn parse(string: &str) -> Option { - let date_time = string_to_datetime(&Utc, string).ok()?; - Some(date_time.timestamp_millis()) - } - - fn parse_formatted(string: &str, format: &str) -> Option { - use chrono::format::Fixed; - use chrono::format::StrftimeItems; - let fmt = StrftimeItems::new(format); - let has_zone = fmt.into_iter().any(|item| match item { - chrono::format::Item::Fixed(fixed_item) => matches!( - fixed_item, - Fixed::RFC2822 - | Fixed::RFC3339 - | Fixed::TimezoneName - | Fixed::TimezoneOffsetColon - | Fixed::TimezoneOffsetColonZ - | Fixed::TimezoneOffset - | Fixed::TimezoneOffsetZ - ), - _ => false, - }); - if has_zone { - let date_time = chrono::DateTime::parse_from_str(string, format).ok()?; - Some(date_time.timestamp_millis()) - } else { - let date_time = NaiveDateTime::parse_from_str(string, format).ok()?; - Some(date_time.timestamp_millis()) - } - } -} - -/// Parse the string format decimal value to i128/i256 format and checking the precision and scale. -/// The result value can't be out of bounds. -pub fn parse_decimal( - s: &str, - precision: u8, - scale: i8, -) -> Result { - let mut result = T::Native::usize_as(0); - let mut fractionals = 0; - let mut digits = 0; - let base = T::Native::usize_as(10); - - let bs = s.as_bytes(); - let (bs, negative) = match bs.first() { - Some(b'-') => (&bs[1..], true), - Some(b'+') => (&bs[1..], false), - _ => (bs, false), - }; - - if bs.is_empty() { - return Err(ArrowError::ParseError(format!( - "can't parse the string value {s} to decimal" - ))); - } - - let mut bs = bs.iter(); - // Overflow checks are not required if 10^(precision - 1) <= T::MAX holds. - // Thus, if we validate the precision correctly, we can skip overflow checks. - while let Some(b) = bs.next() { - match b { - b'0'..=b'9' => { - if digits == 0 && *b == b'0' { - // Ignore leading zeros. - continue; - } - digits += 1; - result = result.mul_wrapping(base); - result = result.add_wrapping(T::Native::usize_as((b - b'0') as usize)); - } - b'.' => { - for b in bs.by_ref() { - if !b.is_ascii_digit() { - return Err(ArrowError::ParseError(format!( - "can't parse the string value {s} to decimal" - ))); - } - if fractionals == scale { - // We have processed all the digits that we need. All that - // is left is to validate that the rest of the string contains - // valid digits. - continue; - } - fractionals += 1; - digits += 1; - result = result.mul_wrapping(base); - result = - result.add_wrapping(T::Native::usize_as((b - b'0') as usize)); - } - - // Fail on "." - if digits == 0 { - return Err(ArrowError::ParseError(format!( - "can't parse the string value {s} to decimal" - ))); - } - } - _ => { - return Err(ArrowError::ParseError(format!( - "can't parse the string value {s} to decimal" - ))); - } - } - } - - if fractionals < scale { - let exp = scale - fractionals; - if exp as u8 + digits > precision { - return Err(ArrowError::ParseError("parse decimal overflow".to_string())); - } - let mul = base.pow_wrapping(exp as _); - result = result.mul_wrapping(mul); - } else if digits > precision { - return Err(ArrowError::ParseError("parse decimal overflow".to_string())); - } - - Ok(if negative { - result.neg_wrapping() - } else { - result - }) -} - -pub fn parse_interval_year_month( - value: &str, -) -> Result<::Native, ArrowError> { - let config = IntervalParseConfig::new(IntervalUnit::Year); - let interval = Interval::parse(value, &config)?; - - let months = interval.to_year_months().map_err(|_| ArrowError::CastError(format!( - "Cannot cast {value} to IntervalYearMonth. Only year and month fields are allowed." - )))?; - - Ok(IntervalYearMonthType::make_value(0, months)) -} - -pub fn parse_interval_day_time( - value: &str, -) -> Result<::Native, ArrowError> { - let config = IntervalParseConfig::new(IntervalUnit::Day); - let interval = Interval::parse(value, &config)?; - - let (days, millis) = interval.to_day_time().map_err(|_| ArrowError::CastError(format!( - "Cannot cast {value} to IntervalDayTime because the nanos part isn't multiple of milliseconds" - )))?; - - Ok(IntervalDayTimeType::make_value(days, millis)) -} - -pub fn parse_interval_month_day_nano( - value: &str, -) -> Result<::Native, ArrowError> { - let config = IntervalParseConfig::new(IntervalUnit::Month); - let interval = Interval::parse(value, &config)?; - - let (months, days, nanos) = interval.to_month_day_nanos(); - - Ok(IntervalMonthDayNanoType::make_value(months, days, nanos)) -} - -const NANOS_PER_MILLIS: i64 = 1_000_000; -const NANOS_PER_SECOND: i64 = 1_000 * NANOS_PER_MILLIS; -const NANOS_PER_MINUTE: i64 = 60 * NANOS_PER_SECOND; -const NANOS_PER_HOUR: i64 = 60 * NANOS_PER_MINUTE; -#[cfg(test)] -const NANOS_PER_DAY: i64 = 24 * NANOS_PER_HOUR; - -#[rustfmt::skip] -#[derive(Clone, Copy)] -#[repr(u16)] -enum IntervalUnit { - Century = 0b_0000_0000_0001, - Decade = 0b_0000_0000_0010, - Year = 0b_0000_0000_0100, - Month = 0b_0000_0000_1000, - Week = 0b_0000_0001_0000, - Day = 0b_0000_0010_0000, - Hour = 0b_0000_0100_0000, - Minute = 0b_0000_1000_0000, - Second = 0b_0001_0000_0000, - Millisecond = 0b_0010_0000_0000, - Microsecond = 0b_0100_0000_0000, - Nanosecond = 0b_1000_0000_0000, -} - -impl FromStr for IntervalUnit { - type Err = ArrowError; - - fn from_str(s: &str) -> Result { - match s.to_lowercase().as_str() { - "century" | "centuries" => Ok(Self::Century), - "decade" | "decades" => Ok(Self::Decade), - "year" | "years" => Ok(Self::Year), - "month" | "months" => Ok(Self::Month), - "week" | "weeks" => Ok(Self::Week), - "day" | "days" => Ok(Self::Day), - "hour" | "hours" => Ok(Self::Hour), - "minute" | "minutes" => Ok(Self::Minute), - "second" | "seconds" => Ok(Self::Second), - "millisecond" | "milliseconds" => Ok(Self::Millisecond), - "microsecond" | "microseconds" => Ok(Self::Microsecond), - "nanosecond" | "nanoseconds" => Ok(Self::Nanosecond), - _ => Err(ArrowError::NotYetImplemented(format!( - "Unknown interval type: {s}" - ))), - } - } -} - -pub type MonthDayNano = (i32, i32, i64); - -/// Chosen based on the number of decimal digits in 1 week in nanoseconds -const INTERVAL_PRECISION: u32 = 15; - -#[derive(Clone, Copy, Debug, PartialEq)] -struct IntervalAmount { - /// The integer component of the interval amount - integer: i64, - /// The fractional component multiplied by 10^INTERVAL_PRECISION - frac: i64, -} - -#[cfg(test)] -impl IntervalAmount { - fn new(integer: i64, frac: i64) -> Self { - Self { integer, frac } - } -} - -impl FromStr for IntervalAmount { - type Err = ArrowError; - - fn from_str(s: &str) -> Result { - match s.split_once('.') { - Some((integer, frac)) - if frac.len() <= INTERVAL_PRECISION as usize - && !frac.is_empty() - && !frac.starts_with('-') => - { - // integer will be "" for values like ".5" - // and "-" for values like "-.5" - let explicit_neg = integer.starts_with('-'); - let integer = if integer.is_empty() || integer == "-" { - Ok(0) - } else { - integer.parse::().map_err(|_| { - ArrowError::ParseError(format!( - "Failed to parse {s} as interval amount" - )) - }) - }?; - - let frac_unscaled = frac.parse::().map_err(|_| { - ArrowError::ParseError(format!( - "Failed to parse {s} as interval amount" - )) - })?; - - // scale fractional part by interval precision - let frac = - frac_unscaled * 10_i64.pow(INTERVAL_PRECISION - frac.len() as u32); - - // propagate the sign of the integer part to the fractional part - let frac = if integer < 0 || explicit_neg { - -frac - } else { - frac - }; - - let result = Self { integer, frac }; - - Ok(result) - } - Some((_, frac)) if frac.starts_with('-') => Err(ArrowError::ParseError( - format!("Failed to parse {s} as interval amount"), - )), - Some((_, frac)) if frac.len() > INTERVAL_PRECISION as usize => { - Err(ArrowError::ParseError(format!( - "{s} exceeds the precision available for interval amount" - ))) - } - Some(_) | None => { - let integer = s.parse::().map_err(|_| { - ArrowError::ParseError(format!( - "Failed to parse {s} as interval amount" - )) - })?; - - let result = Self { integer, frac: 0 }; - Ok(result) - } - } - } -} - -#[derive(Debug, Default, PartialEq)] -struct Interval { - months: i32, - days: i32, - nanos: i64, -} - -impl Interval { - fn new(months: i32, days: i32, nanos: i64) -> Self { - Self { - months, - days, - nanos, - } - } - - fn to_year_months(&self) -> Result { - match (self.months, self.days, self.nanos) { - (months, days, nanos) if days == 0 && nanos == 0 => Ok(months), - _ => Err(ArrowError::InvalidArgumentError(format!( - "Unable to represent interval with days and nanos as year-months: {:?}", - self - ))), - } - } - - fn to_day_time(&self) -> Result<(i32, i32), ArrowError> { - let days = self.months.mul_checked(30)?.add_checked(self.days)?; - - match self.nanos { - nanos if nanos % NANOS_PER_MILLIS == 0 => { - let millis = (self.nanos / 1_000_000).try_into().map_err(|_| { - ArrowError::InvalidArgumentError(format!( - "Unable to represent {} nanos as milliseconds in a signed 32-bit integer", - self.nanos - )) - })?; - - Ok((days, millis)) - } - nanos => Err(ArrowError::InvalidArgumentError(format!( - "Unable to represent {nanos} as milliseconds" - ))), - } - } - - fn to_month_day_nanos(&self) -> (i32, i32, i64) { - (self.months, self.days, self.nanos) - } - - /// Parse string value in traditional Postgres format such as - /// `1 year 2 months 3 days 4 hours 5 minutes 6 seconds` - fn parse(value: &str, config: &IntervalParseConfig) -> Result { - let components = parse_interval_components(value, config)?; - - let result = components.into_iter().fold( - Ok(Self::default()), - |result, (amount, unit)| match result { - Ok(result) => result.add(amount, unit), - Err(e) => Err(e), - }, - )?; - - Ok(result) - } - - /// Interval addition following Postgres behavior. Fractional units will be spilled into smaller units. - /// When the interval unit is larger than months, the result is rounded to total months and not spilled to days/nanos. - /// Fractional parts of weeks and days are represented using days and nanoseconds. - /// e.g. INTERVAL '0.5 MONTH' = 15 days, INTERVAL '1.5 MONTH' = 1 month 15 days - /// e.g. INTERVAL '0.5 DAY' = 12 hours, INTERVAL '1.5 DAY' = 1 day 12 hours - /// [Postgres reference](https://www.postgresql.org/docs/15/datatype-datetime.html#DATATYPE-INTERVAL-INPUT:~:text=Field%20values%20can,fractional%20on%20output.) - fn add( - &self, - amount: IntervalAmount, - unit: IntervalUnit, - ) -> Result { - let result = match unit { - IntervalUnit::Century => { - let months_int = amount.integer.mul_checked(100)?.mul_checked(12)?; - let month_frac = amount.frac * 12 / 10_i64.pow(INTERVAL_PRECISION - 2); - let months = - months_int - .add_checked(month_frac)? - .try_into() - .map_err(|_| { - ArrowError::ParseError(format!( - "Unable to represent {} centuries as months in a signed 32-bit integer", - &amount.integer - )) - })?; - - Self::new(self.months.add_checked(months)?, self.days, self.nanos) - } - IntervalUnit::Decade => { - let months_int = amount.integer.mul_checked(10)?.mul_checked(12)?; - - let month_frac = amount.frac * 12 / 10_i64.pow(INTERVAL_PRECISION - 1); - let months = - months_int - .add_checked(month_frac)? - .try_into() - .map_err(|_| { - ArrowError::ParseError(format!( - "Unable to represent {} decades as months in a signed 32-bit integer", - &amount.integer - )) - })?; - - Self::new(self.months.add_checked(months)?, self.days, self.nanos) - } - IntervalUnit::Year => { - let months_int = amount.integer.mul_checked(12)?; - let month_frac = amount.frac * 12 / 10_i64.pow(INTERVAL_PRECISION); - let months = - months_int - .add_checked(month_frac)? - .try_into() - .map_err(|_| { - ArrowError::ParseError(format!( - "Unable to represent {} years as months in a signed 32-bit integer", - &amount.integer - )) - })?; - - Self::new(self.months.add_checked(months)?, self.days, self.nanos) - } - IntervalUnit::Month => { - let months = amount.integer.try_into().map_err(|_| { - ArrowError::ParseError(format!( - "Unable to represent {} months in a signed 32-bit integer", - &amount.integer - )) - })?; - - let days = amount.frac * 3 / 10_i64.pow(INTERVAL_PRECISION - 1); - let days = days.try_into().map_err(|_| { - ArrowError::ParseError(format!( - "Unable to represent {} months as days in a signed 32-bit integer", - amount.frac / 10_i64.pow(INTERVAL_PRECISION) - )) - })?; - - Self::new( - self.months.add_checked(months)?, - self.days.add_checked(days)?, - self.nanos, - ) - } - IntervalUnit::Week => { - let days = amount.integer.mul_checked(7)?.try_into().map_err(|_| { - ArrowError::ParseError(format!( - "Unable to represent {} weeks as days in a signed 32-bit integer", - &amount.integer - )) - })?; - - let nanos = - amount.frac * 7 * 24 * 6 * 6 / 10_i64.pow(INTERVAL_PRECISION - 11); - - Self::new( - self.months, - self.days.add_checked(days)?, - self.nanos.add_checked(nanos)?, - ) - } - IntervalUnit::Day => { - let days = amount.integer.try_into().map_err(|_| { - ArrowError::InvalidArgumentError(format!( - "Unable to represent {} days in a signed 32-bit integer", - amount.integer - )) - })?; - - let nanos = - amount.frac * 24 * 6 * 6 / 10_i64.pow(INTERVAL_PRECISION - 11); - - Self::new( - self.months, - self.days.add_checked(days)?, - self.nanos.add_checked(nanos)?, - ) - } - IntervalUnit::Hour => { - let nanos_int = amount.integer.mul_checked(NANOS_PER_HOUR)?; - let nanos_frac = - amount.frac * 6 * 6 / 10_i64.pow(INTERVAL_PRECISION - 11); - let nanos = nanos_int.add_checked(nanos_frac)?; - - Interval::new(self.months, self.days, self.nanos.add_checked(nanos)?) - } - IntervalUnit::Minute => { - let nanos_int = amount.integer.mul_checked(NANOS_PER_MINUTE)?; - let nanos_frac = amount.frac * 6 / 10_i64.pow(INTERVAL_PRECISION - 10); - - let nanos = nanos_int.add_checked(nanos_frac)?; - - Interval::new(self.months, self.days, self.nanos.add_checked(nanos)?) - } - IntervalUnit::Second => { - let nanos_int = amount.integer.mul_checked(NANOS_PER_SECOND)?; - let nanos_frac = amount.frac / 10_i64.pow(INTERVAL_PRECISION - 9); - let nanos = nanos_int.add_checked(nanos_frac)?; - - Interval::new(self.months, self.days, self.nanos.add_checked(nanos)?) - } - IntervalUnit::Millisecond => { - let nanos_int = amount.integer.mul_checked(NANOS_PER_MILLIS)?; - let nanos_frac = amount.frac / 10_i64.pow(INTERVAL_PRECISION - 6); - let nanos = nanos_int.add_checked(nanos_frac)?; - - Interval::new(self.months, self.days, self.nanos.add_checked(nanos)?) - } - IntervalUnit::Microsecond => { - let nanos_int = amount.integer.mul_checked(1_000)?; - let nanos_frac = amount.frac / 10_i64.pow(INTERVAL_PRECISION - 3); - let nanos = nanos_int.add_checked(nanos_frac)?; - - Interval::new(self.months, self.days, self.nanos.add_checked(nanos)?) - } - IntervalUnit::Nanosecond => { - let nanos_int = amount.integer; - let nanos_frac = amount.frac / 10_i64.pow(INTERVAL_PRECISION); - let nanos = nanos_int.add_checked(nanos_frac)?; - - Interval::new(self.months, self.days, self.nanos.add_checked(nanos)?) - } - }; - - Ok(result) - } -} - -struct IntervalParseConfig { - /// The default unit to use if none is specified - /// e.g. `INTERVAL 1` represents `INTERVAL 1 SECOND` when default_unit = IntervalType::Second - default_unit: IntervalUnit, -} - -impl IntervalParseConfig { - fn new(default_unit: IntervalUnit) -> Self { - Self { default_unit } - } -} - -/// parse the string into a vector of interval components i.e. (amount, unit) tuples -fn parse_interval_components( - value: &str, - config: &IntervalParseConfig, -) -> Result, ArrowError> { - let parts = value.split_whitespace(); - - let raw_amounts = parts.clone().step_by(2); - let raw_units = parts.skip(1).step_by(2); - - // parse amounts - let (amounts, invalid_amounts) = raw_amounts - .map(IntervalAmount::from_str) - .partition::, _>(Result::is_ok); - - // invalid amounts? - if !invalid_amounts.is_empty() { - return Err(ArrowError::NotYetImplemented(format!( - "Unsupported Interval Expression with value {value:?}" - ))); - } - - // parse units - let (units, invalid_units): (Vec<_>, Vec<_>) = raw_units - .clone() - .map(IntervalUnit::from_str) - .partition(Result::is_ok); - - // invalid units? - if !invalid_units.is_empty() { - return Err(ArrowError::ParseError(format!( - "Invalid input syntax for type interval: {value:?}" - ))); - } - - // collect parsed results - let amounts = amounts.into_iter().map(Result::unwrap).collect::>(); - let units = units.into_iter().map(Result::unwrap).collect::>(); - - // if only an amount is specified, use the default unit - if amounts.len() == 1 && units.is_empty() { - return Ok(vec![(amounts[0], config.default_unit)]); - }; - - // duplicate units? - let mut observed_interval_types = 0; - for (unit, raw_unit) in units.iter().zip(raw_units) { - if observed_interval_types & (*unit as u16) != 0 { - return Err(ArrowError::ParseError(format!( - "Invalid input syntax for type interval: {value:?}. Repeated type '{raw_unit}'", - ))); - } - - observed_interval_types |= *unit as u16; - } - - let result = amounts.iter().copied().zip(units.iter().copied()); - - Ok(result.collect::>()) -} - -#[cfg(test)] -mod tests { - use super::*; - use arrow_array::timezone::Tz; - use arrow_buffer::i256; - - #[test] - fn test_parse_nanos() { - assert_eq!(parse_nanos::<3, 0>(&[1, 2, 3]), 123_000_000); - assert_eq!(parse_nanos::<5, 0>(&[1, 2, 3, 4, 5]), 123_450_000); - assert_eq!(parse_nanos::<6, b'0'>(b"123456"), 123_456_000); - } - - #[test] - fn string_to_timestamp_timezone() { - // Explicit timezone - assert_eq!( - 1599572549190855000, - parse_timestamp("2020-09-08T13:42:29.190855+00:00").unwrap() - ); - assert_eq!( - 1599572549190855000, - parse_timestamp("2020-09-08T13:42:29.190855Z").unwrap() - ); - assert_eq!( - 1599572549000000000, - parse_timestamp("2020-09-08T13:42:29Z").unwrap() - ); // no fractional part - assert_eq!( - 1599590549190855000, - parse_timestamp("2020-09-08T13:42:29.190855-05:00").unwrap() - ); - } - - #[test] - fn string_to_timestamp_timezone_space() { - // Ensure space rather than T between time and date is accepted - assert_eq!( - 1599572549190855000, - parse_timestamp("2020-09-08 13:42:29.190855+00:00").unwrap() - ); - assert_eq!( - 1599572549190855000, - parse_timestamp("2020-09-08 13:42:29.190855Z").unwrap() - ); - assert_eq!( - 1599572549000000000, - parse_timestamp("2020-09-08 13:42:29Z").unwrap() - ); // no fractional part - assert_eq!( - 1599590549190855000, - parse_timestamp("2020-09-08 13:42:29.190855-05:00").unwrap() - ); - } - - #[test] - #[cfg_attr(miri, ignore)] // unsupported operation: can't call foreign function: mktime - fn string_to_timestamp_no_timezone() { - // This test is designed to succeed in regardless of the local - // timezone the test machine is running. Thus it is still - // somewhat susceptible to bugs in the use of chrono - let naive_datetime = NaiveDateTime::new( - NaiveDate::from_ymd_opt(2020, 9, 8).unwrap(), - NaiveTime::from_hms_nano_opt(13, 42, 29, 190855000).unwrap(), - ); - - // Ensure both T and ' ' variants work - assert_eq!( - naive_datetime.timestamp_nanos(), - parse_timestamp("2020-09-08T13:42:29.190855").unwrap() - ); - - assert_eq!( - naive_datetime.timestamp_nanos(), - parse_timestamp("2020-09-08 13:42:29.190855").unwrap() - ); - - // Also ensure that parsing timestamps with no fractional - // second part works as well - let naive_datetime_whole_secs = NaiveDateTime::new( - NaiveDate::from_ymd_opt(2020, 9, 8).unwrap(), - NaiveTime::from_hms_opt(13, 42, 29).unwrap(), - ); - - // Ensure both T and ' ' variants work - assert_eq!( - naive_datetime_whole_secs.timestamp_nanos(), - parse_timestamp("2020-09-08T13:42:29").unwrap() - ); - - assert_eq!( - naive_datetime_whole_secs.timestamp_nanos(), - parse_timestamp("2020-09-08 13:42:29").unwrap() - ); - - // ensure without time work - // no time, should be the nano second at - // 2020-09-08 0:0:0 - let naive_datetime_no_time = NaiveDateTime::new( - NaiveDate::from_ymd_opt(2020, 9, 8).unwrap(), - NaiveTime::from_hms_opt(0, 0, 0).unwrap(), - ); - - assert_eq!( - naive_datetime_no_time.timestamp_nanos(), - parse_timestamp("2020-09-08").unwrap() - ) - } - - #[test] - fn string_to_timestamp_chrono() { - let cases = [ - "2020-09-08T13:42:29Z", - "1969-01-01T00:00:00.1Z", - "2020-09-08T12:00:12.12345678+00:00", - "2020-09-08T12:00:12+00:00", - "2020-09-08T12:00:12.1+00:00", - "2020-09-08T12:00:12.12+00:00", - "2020-09-08T12:00:12.123+00:00", - "2020-09-08T12:00:12.1234+00:00", - "2020-09-08T12:00:12.12345+00:00", - "2020-09-08T12:00:12.123456+00:00", - "2020-09-08T12:00:12.1234567+00:00", - "2020-09-08T12:00:12.12345678+00:00", - "2020-09-08T12:00:12.123456789+00:00", - "2020-09-08T12:00:12.12345678912z", - "2020-09-08T12:00:12.123456789123Z", - "2020-09-08T12:00:12.123456789123+02:00", - "2020-09-08T12:00:12.12345678912345Z", - "2020-09-08T12:00:12.1234567891234567+02:00", - "2020-09-08T12:00:60Z", - "2020-09-08T12:00:60.123Z", - "2020-09-08T12:00:60.123456+02:00", - "2020-09-08T12:00:60.1234567891234567+02:00", - "2020-09-08T12:00:60.999999999+02:00", - "2020-09-08t12:00:12.12345678+00:00", - "2020-09-08t12:00:12+00:00", - "2020-09-08t12:00:12Z", - ]; - - for case in cases { - let chrono = DateTime::parse_from_rfc3339(case).unwrap(); - let chrono_utc = chrono.with_timezone(&Utc); - - let custom = string_to_datetime(&Utc, case).unwrap(); - assert_eq!(chrono_utc, custom) - } - } - - #[test] - fn string_to_timestamp_naive() { - let cases = [ - "2018-11-13T17:11:10.011375885995", - "2030-12-04T17:11:10.123", - "2030-12-04T17:11:10.1234", - "2030-12-04T17:11:10.123456", - ]; - for case in cases { - let chrono = - NaiveDateTime::parse_from_str(case, "%Y-%m-%dT%H:%M:%S%.f").unwrap(); - let custom = string_to_datetime(&Utc, case).unwrap(); - assert_eq!(chrono, custom.naive_utc()) - } - } - - #[test] - fn string_to_timestamp_invalid() { - // Test parsing invalid formats - let cases = [ - ("", "timestamp must contain at least 10 characters"), - ("SS", "timestamp must contain at least 10 characters"), - ("Wed, 18 Feb 2015 23:16:09 GMT", "error parsing date"), - ("1997-01-31H09:26:56.123Z", "invalid timestamp separator"), - ("1997-01-31 09:26:56.123Z", "error parsing time"), - ("1997:01:31T09:26:56.123Z", "error parsing date"), - ("1997:1:31T09:26:56.123Z", "error parsing date"), - ("1997-01-32T09:26:56.123Z", "error parsing date"), - ("1997-13-32T09:26:56.123Z", "error parsing date"), - ("1997-02-29T09:26:56.123Z", "error parsing date"), - ("2015-02-30T17:35:20-08:00", "error parsing date"), - ("1997-01-10T9:26:56.123Z", "error parsing time"), - ("2015-01-20T25:35:20-08:00", "error parsing time"), - ("1997-01-10T09:61:56.123Z", "error parsing time"), - ("1997-01-10T09:61:90.123Z", "error parsing time"), - ("1997-01-10T12:00:6.123Z", "error parsing time"), - ("1997-01-31T092656.123Z", "error parsing time"), - ("1997-01-10T12:00:06.", "error parsing time"), - ("1997-01-10T12:00:06. ", "error parsing time"), - ]; - - for (s, ctx) in cases { - let expected = - format!("Parser error: Error parsing timestamp from '{s}': {ctx}"); - let actual = string_to_datetime(&Utc, s).unwrap_err().to_string(); - assert_eq!(actual, expected) - } - } - - // Parse a timestamp to timestamp int with a useful human readable error message - fn parse_timestamp(s: &str) -> Result { - let result = string_to_timestamp_nanos(s); - if let Err(e) = &result { - eprintln!("Error parsing timestamp '{s}': {e:?}"); - } - result - } - - #[test] - fn string_without_timezone_to_timestamp() { - // string without timezone should always output the same regardless the local or session timezone - - let naive_datetime = NaiveDateTime::new( - NaiveDate::from_ymd_opt(2020, 9, 8).unwrap(), - NaiveTime::from_hms_nano_opt(13, 42, 29, 190855000).unwrap(), - ); - - // Ensure both T and ' ' variants work - assert_eq!( - naive_datetime.timestamp_nanos(), - parse_timestamp("2020-09-08T13:42:29.190855").unwrap() - ); - - assert_eq!( - naive_datetime.timestamp_nanos(), - parse_timestamp("2020-09-08 13:42:29.190855").unwrap() - ); - - let naive_datetime = NaiveDateTime::new( - NaiveDate::from_ymd_opt(2020, 9, 8).unwrap(), - NaiveTime::from_hms_nano_opt(13, 42, 29, 0).unwrap(), - ); - - // Ensure both T and ' ' variants work - assert_eq!( - naive_datetime.timestamp_nanos(), - parse_timestamp("2020-09-08T13:42:29").unwrap() - ); - - assert_eq!( - naive_datetime.timestamp_nanos(), - parse_timestamp("2020-09-08 13:42:29").unwrap() - ); - - let tz: Tz = "+02:00".parse().unwrap(); - let date = string_to_datetime(&tz, "2020-09-08 13:42:29").unwrap(); - let utc = date.naive_utc().to_string(); - assert_eq!(utc, "2020-09-08 11:42:29"); - let local = date.naive_local().to_string(); - assert_eq!(local, "2020-09-08 13:42:29"); - - let date = string_to_datetime(&tz, "2020-09-08 13:42:29Z").unwrap(); - let utc = date.naive_utc().to_string(); - assert_eq!(utc, "2020-09-08 13:42:29"); - let local = date.naive_local().to_string(); - assert_eq!(local, "2020-09-08 15:42:29"); - - let dt = - NaiveDateTime::parse_from_str("2020-09-08T13:42:29Z", "%Y-%m-%dT%H:%M:%SZ") - .unwrap(); - let local: Tz = "+08:00".parse().unwrap(); - - // Parsed as offset from UTC - let date = string_to_datetime(&local, "2020-09-08T13:42:29Z").unwrap(); - assert_eq!(dt, date.naive_utc()); - assert_ne!(dt, date.naive_local()); - - // Parsed as offset from local - let date = string_to_datetime(&local, "2020-09-08 13:42:29").unwrap(); - assert_eq!(dt, date.naive_local()); - assert_ne!(dt, date.naive_utc()); - } - - #[test] - fn parse_time64_nanos() { - assert_eq!( - Time64NanosecondType::parse("02:10:01.1234567899999999"), - Some(7_801_123_456_789) - ); - assert_eq!( - Time64NanosecondType::parse("02:10:01.1234567"), - Some(7_801_123_456_700) - ); - assert_eq!( - Time64NanosecondType::parse("2:10:01.1234567"), - Some(7_801_123_456_700) - ); - assert_eq!( - Time64NanosecondType::parse("12:10:01.123456789 AM"), - Some(601_123_456_789) - ); - assert_eq!( - Time64NanosecondType::parse("12:10:01.123456789 am"), - Some(601_123_456_789) - ); - assert_eq!( - Time64NanosecondType::parse("2:10:01.12345678 PM"), - Some(51_001_123_456_780) - ); - assert_eq!( - Time64NanosecondType::parse("2:10:01.12345678 pm"), - Some(51_001_123_456_780) - ); - assert_eq!( - Time64NanosecondType::parse("02:10:01"), - Some(7_801_000_000_000) - ); - assert_eq!( - Time64NanosecondType::parse("2:10:01"), - Some(7_801_000_000_000) - ); - assert_eq!( - Time64NanosecondType::parse("12:10:01 AM"), - Some(601_000_000_000) - ); - assert_eq!( - Time64NanosecondType::parse("12:10:01 am"), - Some(601_000_000_000) - ); - assert_eq!( - Time64NanosecondType::parse("2:10:01 PM"), - Some(51_001_000_000_000) - ); - assert_eq!( - Time64NanosecondType::parse("2:10:01 pm"), - Some(51_001_000_000_000) - ); - assert_eq!( - Time64NanosecondType::parse("02:10"), - Some(7_800_000_000_000) - ); - assert_eq!(Time64NanosecondType::parse("2:10"), Some(7_800_000_000_000)); - assert_eq!( - Time64NanosecondType::parse("12:10 AM"), - Some(600_000_000_000) - ); - assert_eq!( - Time64NanosecondType::parse("12:10 am"), - Some(600_000_000_000) - ); - assert_eq!( - Time64NanosecondType::parse("2:10 PM"), - Some(51_000_000_000_000) - ); - assert_eq!( - Time64NanosecondType::parse("2:10 pm"), - Some(51_000_000_000_000) - ); - - // parse directly as nanoseconds - assert_eq!(Time64NanosecondType::parse("1"), Some(1)); - - // leap second - assert_eq!( - Time64NanosecondType::parse("23:59:60"), - Some(86_400_000_000_000) - ); - - // custom format - assert_eq!( - Time64NanosecondType::parse_formatted( - "02 - 10 - 01 - .1234567", - "%H - %M - %S - %.f" - ), - Some(7_801_123_456_700) - ); - } - - #[test] - fn parse_time64_micros() { - // expected formats - assert_eq!( - Time64MicrosecondType::parse("02:10:01.1234"), - Some(7_801_123_400) - ); - assert_eq!( - Time64MicrosecondType::parse("2:10:01.1234"), - Some(7_801_123_400) - ); - assert_eq!( - Time64MicrosecondType::parse("12:10:01.123456 AM"), - Some(601_123_456) - ); - assert_eq!( - Time64MicrosecondType::parse("12:10:01.123456 am"), - Some(601_123_456) - ); - assert_eq!( - Time64MicrosecondType::parse("2:10:01.12345 PM"), - Some(51_001_123_450) - ); - assert_eq!( - Time64MicrosecondType::parse("2:10:01.12345 pm"), - Some(51_001_123_450) - ); - assert_eq!( - Time64MicrosecondType::parse("02:10:01"), - Some(7_801_000_000) - ); - assert_eq!(Time64MicrosecondType::parse("2:10:01"), Some(7_801_000_000)); - assert_eq!( - Time64MicrosecondType::parse("12:10:01 AM"), - Some(601_000_000) - ); - assert_eq!( - Time64MicrosecondType::parse("12:10:01 am"), - Some(601_000_000) - ); - assert_eq!( - Time64MicrosecondType::parse("2:10:01 PM"), - Some(51_001_000_000) - ); - assert_eq!( - Time64MicrosecondType::parse("2:10:01 pm"), - Some(51_001_000_000) - ); - assert_eq!(Time64MicrosecondType::parse("02:10"), Some(7_800_000_000)); - assert_eq!(Time64MicrosecondType::parse("2:10"), Some(7_800_000_000)); - assert_eq!(Time64MicrosecondType::parse("12:10 AM"), Some(600_000_000)); - assert_eq!(Time64MicrosecondType::parse("12:10 am"), Some(600_000_000)); - assert_eq!( - Time64MicrosecondType::parse("2:10 PM"), - Some(51_000_000_000) - ); - assert_eq!( - Time64MicrosecondType::parse("2:10 pm"), - Some(51_000_000_000) - ); - - // parse directly as microseconds - assert_eq!(Time64MicrosecondType::parse("1"), Some(1)); - - // leap second - assert_eq!( - Time64MicrosecondType::parse("23:59:60"), - Some(86_400_000_000) - ); - - // custom format - assert_eq!( - Time64MicrosecondType::parse_formatted( - "02 - 10 - 01 - .1234", - "%H - %M - %S - %.f" - ), - Some(7_801_123_400) - ); - } - - #[test] - fn parse_time32_millis() { - // expected formats - assert_eq!(Time32MillisecondType::parse("02:10:01.1"), Some(7_801_100)); - assert_eq!(Time32MillisecondType::parse("2:10:01.1"), Some(7_801_100)); - assert_eq!( - Time32MillisecondType::parse("12:10:01.123 AM"), - Some(601_123) - ); - assert_eq!( - Time32MillisecondType::parse("12:10:01.123 am"), - Some(601_123) - ); - assert_eq!( - Time32MillisecondType::parse("2:10:01.12 PM"), - Some(51_001_120) - ); - assert_eq!( - Time32MillisecondType::parse("2:10:01.12 pm"), - Some(51_001_120) - ); - assert_eq!(Time32MillisecondType::parse("02:10:01"), Some(7_801_000)); - assert_eq!(Time32MillisecondType::parse("2:10:01"), Some(7_801_000)); - assert_eq!(Time32MillisecondType::parse("12:10:01 AM"), Some(601_000)); - assert_eq!(Time32MillisecondType::parse("12:10:01 am"), Some(601_000)); - assert_eq!(Time32MillisecondType::parse("2:10:01 PM"), Some(51_001_000)); - assert_eq!(Time32MillisecondType::parse("2:10:01 pm"), Some(51_001_000)); - assert_eq!(Time32MillisecondType::parse("02:10"), Some(7_800_000)); - assert_eq!(Time32MillisecondType::parse("2:10"), Some(7_800_000)); - assert_eq!(Time32MillisecondType::parse("12:10 AM"), Some(600_000)); - assert_eq!(Time32MillisecondType::parse("12:10 am"), Some(600_000)); - assert_eq!(Time32MillisecondType::parse("2:10 PM"), Some(51_000_000)); - assert_eq!(Time32MillisecondType::parse("2:10 pm"), Some(51_000_000)); - - // parse directly as milliseconds - assert_eq!(Time32MillisecondType::parse("1"), Some(1)); - - // leap second - assert_eq!(Time32MillisecondType::parse("23:59:60"), Some(86_400_000)); - - // custom format - assert_eq!( - Time32MillisecondType::parse_formatted( - "02 - 10 - 01 - .1", - "%H - %M - %S - %.f" - ), - Some(7_801_100) - ); - } - - #[test] - fn parse_time32_secs() { - // expected formats - assert_eq!(Time32SecondType::parse("02:10:01.1"), Some(7_801)); - assert_eq!(Time32SecondType::parse("02:10:01"), Some(7_801)); - assert_eq!(Time32SecondType::parse("2:10:01"), Some(7_801)); - assert_eq!(Time32SecondType::parse("12:10:01 AM"), Some(601)); - assert_eq!(Time32SecondType::parse("12:10:01 am"), Some(601)); - assert_eq!(Time32SecondType::parse("2:10:01 PM"), Some(51_001)); - assert_eq!(Time32SecondType::parse("2:10:01 pm"), Some(51_001)); - assert_eq!(Time32SecondType::parse("02:10"), Some(7_800)); - assert_eq!(Time32SecondType::parse("2:10"), Some(7_800)); - assert_eq!(Time32SecondType::parse("12:10 AM"), Some(600)); - assert_eq!(Time32SecondType::parse("12:10 am"), Some(600)); - assert_eq!(Time32SecondType::parse("2:10 PM"), Some(51_000)); - assert_eq!(Time32SecondType::parse("2:10 pm"), Some(51_000)); - - // parse directly as seconds - assert_eq!(Time32SecondType::parse("1"), Some(1)); - - // leap second - assert_eq!(Time32SecondType::parse("23:59:60"), Some(86400)); - - // custom format - assert_eq!( - Time32SecondType::parse_formatted("02 - 10 - 01", "%H - %M - %S"), - Some(7_801) - ); - } - - #[test] - fn test_string_to_time_invalid() { - let cases = [ - "25:00", - "9:00:", - "009:00", - "09:0:00", - "25:00:00", - "13:00 AM", - "13:00 PM", - "12:00. AM", - "09:0:00", - "09:01:0", - "09:01:1", - "9:1:0", - "09:01:0", - "1:00.123", - "1:00:00.123f", - " 9:00:00", - ":09:00", - "T9:00:00", - "AM", - ]; - for case in cases { - assert!(string_to_time(case).is_none(), "{case}"); - } - } - - #[test] - fn test_string_to_time_chrono() { - let cases = [ - ("1:00", "%H:%M"), - ("12:00", "%H:%M"), - ("13:00", "%H:%M"), - ("24:00", "%H:%M"), - ("1:00:00", "%H:%M:%S"), - ("12:00:30", "%H:%M:%S"), - ("13:00:59", "%H:%M:%S"), - ("24:00:60", "%H:%M:%S"), - ("09:00:00", "%H:%M:%S%.f"), - ("0:00:30.123456", "%H:%M:%S%.f"), - ("0:00 AM", "%I:%M %P"), - ("1:00 AM", "%I:%M %P"), - ("12:00 AM", "%I:%M %P"), - ("13:00 AM", "%I:%M %P"), - ("0:00 PM", "%I:%M %P"), - ("1:00 PM", "%I:%M %P"), - ("12:00 PM", "%I:%M %P"), - ("13:00 PM", "%I:%M %P"), - ("1:00 pM", "%I:%M %P"), - ("1:00 Pm", "%I:%M %P"), - ("1:00 aM", "%I:%M %P"), - ("1:00 Am", "%I:%M %P"), - ("1:00:30.123456 PM", "%I:%M:%S%.f %P"), - ("1:00:30.123456789 PM", "%I:%M:%S%.f %P"), - ("1:00:30.123456789123 PM", "%I:%M:%S%.f %P"), - ("1:00:30.1234 PM", "%I:%M:%S%.f %P"), - ("1:00:30.123456 PM", "%I:%M:%S%.f %P"), - ("1:00:30.123456789123456789 PM", "%I:%M:%S%.f %P"), - ("1:00:30.12F456 PM", "%I:%M:%S%.f %P"), - ]; - for (s, format) in cases { - let chrono = NaiveTime::parse_from_str(s, format).ok(); - let custom = string_to_time(s); - assert_eq!(chrono, custom, "{s}"); - } - } - - #[test] - fn test_parse_interval() { - let config = IntervalParseConfig::new(IntervalUnit::Month); - - assert_eq!( - Interval::new(1i32, 0i32, 0i64), - Interval::parse("1 month", &config).unwrap(), - ); - - assert_eq!( - Interval::new(2i32, 0i32, 0i64), - Interval::parse("2 month", &config).unwrap(), - ); - - assert_eq!( - Interval::new(-1i32, -18i32, -(NANOS_PER_DAY / 5)), - Interval::parse("-1.5 months -3.2 days", &config).unwrap(), - ); - - assert_eq!( - Interval::new(0i32, 15i32, 0), - Interval::parse("0.5 months", &config).unwrap(), - ); - - assert_eq!( - Interval::new(0i32, 15i32, 0), - Interval::parse(".5 months", &config).unwrap(), - ); - - assert_eq!( - Interval::new(0i32, -15i32, 0), - Interval::parse("-0.5 months", &config).unwrap(), - ); - - assert_eq!( - Interval::new(0i32, -15i32, 0), - Interval::parse("-.5 months", &config).unwrap(), - ); - - assert_eq!( - Interval::new(2i32, 10i32, 9 * NANOS_PER_HOUR), - Interval::parse("2.1 months 7.25 days 3 hours", &config).unwrap(), - ); - - assert_eq!( - Interval::parse("1 centurys 1 month", &config) - .unwrap_err() - .to_string(), - r#"Parser error: Invalid input syntax for type interval: "1 centurys 1 month""# - ); - - assert_eq!( - Interval::new(37i32, 0i32, 0i64), - Interval::parse("3 year 1 month", &config).unwrap(), - ); - - assert_eq!( - Interval::new(35i32, 0i32, 0i64), - Interval::parse("3 year -1 month", &config).unwrap(), - ); - - assert_eq!( - Interval::new(-37i32, 0i32, 0i64), - Interval::parse("-3 year -1 month", &config).unwrap(), - ); - - assert_eq!( - Interval::new(-35i32, 0i32, 0i64), - Interval::parse("-3 year 1 month", &config).unwrap(), - ); - - assert_eq!( - Interval::new(0i32, 5i32, 0i64), - Interval::parse("5 days", &config).unwrap(), - ); - - assert_eq!( - Interval::new(0i32, 7i32, 3 * NANOS_PER_HOUR), - Interval::parse("7 days 3 hours", &config).unwrap(), - ); - - assert_eq!( - Interval::new(0i32, 7i32, 5 * NANOS_PER_MINUTE), - Interval::parse("7 days 5 minutes", &config).unwrap(), - ); - - assert_eq!( - Interval::new(0i32, 7i32, -5 * NANOS_PER_MINUTE), - Interval::parse("7 days -5 minutes", &config).unwrap(), - ); - - assert_eq!( - Interval::new(0i32, -7i32, 5 * NANOS_PER_HOUR), - Interval::parse("-7 days 5 hours", &config).unwrap(), - ); - - assert_eq!( - Interval::new( - 0i32, - -7i32, - -5 * NANOS_PER_HOUR - 5 * NANOS_PER_MINUTE - 5 * NANOS_PER_SECOND - ), - Interval::parse("-7 days -5 hours -5 minutes -5 seconds", &config).unwrap(), - ); - - assert_eq!( - Interval::new(12i32, 0i32, 25 * NANOS_PER_MILLIS), - Interval::parse("1 year 25 millisecond", &config).unwrap(), - ); - - assert_eq!( - Interval::new( - 12i32, - 1i32, - (NANOS_PER_SECOND as f64 * 0.000000001_f64) as i64 - ), - Interval::parse("1 year 1 day 0.000000001 seconds", &config).unwrap(), - ); - - assert_eq!( - Interval::new(12i32, 1i32, NANOS_PER_MILLIS / 10), - Interval::parse("1 year 1 day 0.1 milliseconds", &config).unwrap(), - ); - - assert_eq!( - Interval::new(12i32, 1i32, 1000i64), - Interval::parse("1 year 1 day 1 microsecond", &config).unwrap(), - ); - - assert_eq!( - Interval::new(12i32, 1i32, 1i64), - Interval::parse("1 year 1 day 1 nanoseconds", &config).unwrap(), - ); - - assert_eq!( - Interval::new(1i32, 0i32, -NANOS_PER_SECOND), - Interval::parse("1 month -1 second", &config).unwrap(), - ); - - assert_eq!( - Interval::new(-13i32, -8i32, -NANOS_PER_HOUR - NANOS_PER_MINUTE - NANOS_PER_SECOND - (1.11_f64 * NANOS_PER_MILLIS as f64) as i64), - Interval::parse("-1 year -1 month -1 week -1 day -1 hour -1 minute -1 second -1.11 millisecond", &config).unwrap(), - ); - } - - #[test] - fn test_duplicate_interval_type() { - let config = IntervalParseConfig::new(IntervalUnit::Month); - - let err = Interval::parse("1 month 1 second 1 second", &config) - .expect_err("parsing interval should have failed"); - assert_eq!( - r#"ParseError("Invalid input syntax for type interval: \"1 month 1 second 1 second\". Repeated type 'second'")"#, - format!("{err:?}") - ); - - // test with singular and plural forms - let err = Interval::parse("1 century 2 centuries", &config) - .expect_err("parsing interval should have failed"); - assert_eq!( - r#"ParseError("Invalid input syntax for type interval: \"1 century 2 centuries\". Repeated type 'centuries'")"#, - format!("{err:?}") - ); - } - - #[test] - fn test_interval_amount_parsing() { - // integer - let result = IntervalAmount::from_str("123").unwrap(); - let expected = IntervalAmount::new(123, 0); - - assert_eq!(result, expected); - - // positive w/ fractional - let result = IntervalAmount::from_str("0.3").unwrap(); - let expected = IntervalAmount::new(0, 3 * 10_i64.pow(INTERVAL_PRECISION - 1)); - - assert_eq!(result, expected); - - // negative w/ fractional - let result = IntervalAmount::from_str("-3.5").unwrap(); - let expected = IntervalAmount::new(-3, -5 * 10_i64.pow(INTERVAL_PRECISION - 1)); - - assert_eq!(result, expected); - - // invalid: missing fractional - let result = IntervalAmount::from_str("3."); - assert!(result.is_err()); - - // invalid: sign in fractional - let result = IntervalAmount::from_str("3.-5"); - assert!(result.is_err()); - } - - #[test] - fn test_interval_precision() { - let config = IntervalParseConfig::new(IntervalUnit::Month); - - let result = Interval::parse("100000.1 days", &config).unwrap(); - let expected = Interval::new(0_i32, 100_000_i32, NANOS_PER_DAY / 10); - - assert_eq!(result, expected); - } - - #[test] - fn test_interval_addition() { - // add 4.1 centuries - let start = Interval::new(1, 2, 3); - let expected = Interval::new(4921, 2, 3); - - let result = start - .add( - IntervalAmount::new(4, 10_i64.pow(INTERVAL_PRECISION - 1)), - IntervalUnit::Century, - ) - .unwrap(); - - assert_eq!(result, expected); - - // add 10.25 decades - let start = Interval::new(1, 2, 3); - let expected = Interval::new(1231, 2, 3); - - let result = start - .add( - IntervalAmount::new(10, 25 * 10_i64.pow(INTERVAL_PRECISION - 2)), - IntervalUnit::Decade, - ) - .unwrap(); - - assert_eq!(result, expected); - - // add 30.3 years (reminder: Postgres logic does not spill to days/nanos when interval is larger than a month) - let start = Interval::new(1, 2, 3); - let expected = Interval::new(364, 2, 3); - - let result = start - .add( - IntervalAmount::new(30, 3 * 10_i64.pow(INTERVAL_PRECISION - 1)), - IntervalUnit::Year, - ) - .unwrap(); - - assert_eq!(result, expected); - - // add 1.5 months - let start = Interval::new(1, 2, 3); - let expected = Interval::new(2, 17, 3); - - let result = start - .add( - IntervalAmount::new(1, 5 * 10_i64.pow(INTERVAL_PRECISION - 1)), - IntervalUnit::Month, - ) - .unwrap(); - - assert_eq!(result, expected); - - // add -2 weeks - let start = Interval::new(1, 25, 3); - let expected = Interval::new(1, 11, 3); - - let result = start - .add(IntervalAmount::new(-2, 0), IntervalUnit::Week) - .unwrap(); - - assert_eq!(result, expected); - - // add 2.2 days - let start = Interval::new(12, 15, 3); - let expected = Interval::new(12, 17, 3 + 17_280 * NANOS_PER_SECOND); - - let result = start - .add( - IntervalAmount::new(2, 2 * 10_i64.pow(INTERVAL_PRECISION - 1)), - IntervalUnit::Day, - ) - .unwrap(); - - assert_eq!(result, expected); - - // add 12.5 hours - let start = Interval::new(1, 2, 3); - let expected = Interval::new(1, 2, 3 + 45_000 * NANOS_PER_SECOND); - - let result = start - .add( - IntervalAmount::new(12, 5 * 10_i64.pow(INTERVAL_PRECISION - 1)), - IntervalUnit::Hour, - ) - .unwrap(); - - assert_eq!(result, expected); - - // add -1.5 minutes - let start = Interval::new(0, 0, -3); - let expected = Interval::new(0, 0, -90_000_000_000 - 3); - - let result = start - .add( - IntervalAmount::new(-1, -5 * 10_i64.pow(INTERVAL_PRECISION - 1)), - IntervalUnit::Minute, - ) - .unwrap(); - - assert_eq!(result, expected); - } - - #[test] - fn string_to_timestamp_old() { - parse_timestamp("1677-06-14T07:29:01.256") - .map_err(|e| assert!(e.to_string().ends_with(ERR_NANOSECONDS_NOT_SUPPORTED))) - .unwrap_err(); - } - - #[test] - fn test_parse_decimal_with_parameter() { - let tests = [ - ("0", 0i128), - ("123.123", 123123i128), - ("123.1234", 123123i128), - ("123.1", 123100i128), - ("123", 123000i128), - ("-123.123", -123123i128), - ("-123.1234", -123123i128), - ("-123.1", -123100i128), - ("-123", -123000i128), - ("0.0000123", 0i128), - ("12.", 12000i128), - ("-12.", -12000i128), - ("00.1", 100i128), - ("-00.1", -100i128), - ("12345678912345678.1234", 12345678912345678123i128), - ("-12345678912345678.1234", -12345678912345678123i128), - ("99999999999999999.999", 99999999999999999999i128), - ("-99999999999999999.999", -99999999999999999999i128), - (".123", 123i128), - ("-.123", -123i128), - ("123.", 123000i128), - ("-123.", -123000i128), - ]; - for (s, i) in tests { - let result_128 = parse_decimal::(s, 20, 3); - assert_eq!(i, result_128.unwrap()); - let result_256 = parse_decimal::(s, 20, 3); - assert_eq!(i256::from_i128(i), result_256.unwrap()); - } - let can_not_parse_tests = ["123,123", ".", "123.123.123", "", "+", "-"]; - for s in can_not_parse_tests { - let result_128 = parse_decimal::(s, 20, 3); - assert_eq!( - format!("Parser error: can't parse the string value {s} to decimal"), - result_128.unwrap_err().to_string() - ); - let result_256 = parse_decimal::(s, 20, 3); - assert_eq!( - format!("Parser error: can't parse the string value {s} to decimal"), - result_256.unwrap_err().to_string() - ); - } - let overflow_parse_tests = ["12345678", "12345678.9", "99999999.99"]; - for s in overflow_parse_tests { - let result_128 = parse_decimal::(s, 10, 3); - let expected_128 = "Parser error: parse decimal overflow"; - let actual_128 = result_128.unwrap_err().to_string(); - - assert!( - actual_128.contains(expected_128), - "actual: '{actual_128}', expected: '{expected_128}'" - ); - - let result_256 = parse_decimal::(s, 10, 3); - let expected_256 = "Parser error: parse decimal overflow"; - let actual_256 = result_256.unwrap_err().to_string(); - - assert!( - actual_256.contains(expected_256), - "actual: '{actual_256}', expected: '{expected_256}'" - ); - } - - let edge_tests_128 = [ - ( - "99999999999999999999999999999999999999", - 99999999999999999999999999999999999999i128, - 0, - ), - ( - "999999999999999999999999999999999999.99", - 99999999999999999999999999999999999999i128, - 2, - ), - ( - "9999999999999999999999999.9999999999999", - 99999999999999999999999999999999999999i128, - 13, - ), - ( - "9999999999999999999999999", - 99999999999999999999999990000000000000i128, - 13, - ), - ( - "0.99999999999999999999999999999999999999", - 99999999999999999999999999999999999999i128, - 38, - ), - ]; - for (s, i, scale) in edge_tests_128 { - let result_128 = parse_decimal::(s, 38, scale); - assert_eq!(i, result_128.unwrap()); - } - let edge_tests_256 = [ - ( - "9999999999999999999999999999999999999999999999999999999999999999999999999999", -i256::from_string("9999999999999999999999999999999999999999999999999999999999999999999999999999").unwrap(), - 0, - ), - ( - "999999999999999999999999999999999999999999999999999999999999999999999999.9999", - i256::from_string("9999999999999999999999999999999999999999999999999999999999999999999999999999").unwrap(), - 4, - ), - ( - "99999999999999999999999999999999999999999999999999.99999999999999999999999999", - i256::from_string("9999999999999999999999999999999999999999999999999999999999999999999999999999").unwrap(), - 26, - ), - ( - "99999999999999999999999999999999999999999999999999", - i256::from_string("9999999999999999999999999999999999999999999999999900000000000000000000000000").unwrap(), - 26, - ), - ]; - for (s, i, scale) in edge_tests_256 { - let result = parse_decimal::(s, 76, scale); - assert_eq!(i, result.unwrap()); - } - } -} +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_array::timezone::Tz; +use arrow_array::types::*; +use arrow_array::{ArrowNativeTypeOp, ArrowPrimitiveType}; +use arrow_buffer::ArrowNativeType; +use arrow_schema::ArrowError; +use chrono::prelude::*; +use half::f16; +use std::str::FromStr; + +/// Parse nanoseconds from the first `N` values in digits, subtracting the offset `O` +#[inline] +fn parse_nanos(digits: &[u8]) -> u32 { + digits[..N] + .iter() + .fold(0_u32, |acc, v| acc * 10 + v.wrapping_sub(O) as u32) + * 10_u32.pow((9 - N) as _) +} + +/// Helper for parsing RFC3339 timestamps +struct TimestampParser { + /// The timestamp bytes to parse minus `b'0'` + /// + /// This makes interpretation as an integer inexpensive + digits: [u8; 32], + /// A mask containing a `1` bit where the corresponding byte is a valid ASCII digit + mask: u32, +} + +impl TimestampParser { + fn new(bytes: &[u8]) -> Self { + let mut digits = [0; 32]; + let mut mask = 0; + + // Treating all bytes the same way, helps LLVM vectorise this correctly + for (idx, (o, i)) in digits.iter_mut().zip(bytes).enumerate() { + *o = i.wrapping_sub(b'0'); + mask |= ((*o < 10) as u32) << idx + } + + Self { digits, mask } + } + + /// Returns true if the byte at `idx` in the original string equals `b` + fn test(&self, idx: usize, b: u8) -> bool { + self.digits[idx] == b.wrapping_sub(b'0') + } + + /// Parses a date of the form `1997-01-31` + fn date(&self) -> Option { + if self.mask & 0b1111111111 != 0b1101101111 + || !self.test(4, b'-') + || !self.test(7, b'-') + { + return None; + } + + let year = self.digits[0] as u16 * 1000 + + self.digits[1] as u16 * 100 + + self.digits[2] as u16 * 10 + + self.digits[3] as u16; + + let month = self.digits[5] * 10 + self.digits[6]; + let day = self.digits[8] * 10 + self.digits[9]; + + NaiveDate::from_ymd_opt(year as _, month as _, day as _) + } + + /// Parses a time of any of forms + /// - `09:26:56` + /// - `09:26:56.123` + /// - `09:26:56.123456` + /// - `09:26:56.123456789` + /// - `092656` + /// + /// Returning the end byte offset + fn time(&self) -> Option<(NaiveTime, usize)> { + // Make a NaiveTime handling leap seconds + let time = |hour, min, sec, nano| match sec { + 60 => { + let nano = 1_000_000_000 + nano; + NaiveTime::from_hms_nano_opt(hour as _, min as _, 59, nano) + } + _ => NaiveTime::from_hms_nano_opt(hour as _, min as _, sec as _, nano), + }; + + match (self.mask >> 11) & 0b11111111 { + // 09:26:56 + 0b11011011 if self.test(13, b':') && self.test(16, b':') => { + let hour = self.digits[11] * 10 + self.digits[12]; + let minute = self.digits[14] * 10 + self.digits[15]; + let second = self.digits[17] * 10 + self.digits[18]; + + match self.test(19, b'.') { + true => { + let digits = (self.mask >> 20).trailing_ones(); + let nanos = match digits { + 0 => return None, + 1 => parse_nanos::<1, 0>(&self.digits[20..21]), + 2 => parse_nanos::<2, 0>(&self.digits[20..22]), + 3 => parse_nanos::<3, 0>(&self.digits[20..23]), + 4 => parse_nanos::<4, 0>(&self.digits[20..24]), + 5 => parse_nanos::<5, 0>(&self.digits[20..25]), + 6 => parse_nanos::<6, 0>(&self.digits[20..26]), + 7 => parse_nanos::<7, 0>(&self.digits[20..27]), + 8 => parse_nanos::<8, 0>(&self.digits[20..28]), + _ => parse_nanos::<9, 0>(&self.digits[20..29]), + }; + Some((time(hour, minute, second, nanos)?, 20 + digits as usize)) + } + false => Some((time(hour, minute, second, 0)?, 19)), + } + } + // 092656 + 0b111111 => { + let hour = self.digits[11] * 10 + self.digits[12]; + let minute = self.digits[13] * 10 + self.digits[14]; + let second = self.digits[15] * 10 + self.digits[16]; + let time = time(hour, minute, second, 0)?; + Some((time, 17)) + } + _ => None, + } + } +} + +/// Accepts a string and parses it relative to the provided `timezone` +/// +/// In addition to RFC3339 / ISO8601 standard timestamps, it also +/// accepts strings that use a space ` ` to separate the date and time +/// as well as strings that have no explicit timezone offset. +/// +/// Examples of accepted inputs: +/// * `1997-01-31T09:26:56.123Z` # RCF3339 +/// * `1997-01-31T09:26:56.123-05:00` # RCF3339 +/// * `1997-01-31 09:26:56.123-05:00` # close to RCF3339 but with a space rather than T +/// * `2023-01-01 04:05:06.789 -08` # close to RCF3339, no fractional seconds or time separator +/// * `1997-01-31T09:26:56.123` # close to RCF3339 but no timezone offset specified +/// * `1997-01-31 09:26:56.123` # close to RCF3339 but uses a space and no timezone offset +/// * `1997-01-31 09:26:56` # close to RCF3339, no fractional seconds +/// * `1997-01-31 092656` # close to RCF3339, no fractional seconds +/// * `1997-01-31 092656+04:00` # close to RCF3339, no fractional seconds or time separator +/// * `1997-01-31` # close to RCF3339, only date no time +/// +/// [IANA timezones] are only supported if the `arrow-array/chrono-tz` feature is enabled +/// +/// * `2023-01-01 040506 America/Los_Angeles` +/// +/// If a timestamp is ambiguous, for example as a result of daylight-savings time, an error +/// will be returned +/// +/// Some formats supported by PostgresSql +/// are not supported, like +/// +/// * "2023-01-01 04:05:06.789 +07:30:00", +/// * "2023-01-01 040506 +07:30:00", +/// * "2023-01-01 04:05:06.789 PST", +/// +/// [IANA timezones]: https://www.iana.org/time-zones +pub fn string_to_datetime( + timezone: &T, + s: &str, +) -> Result, ArrowError> { + let err = |ctx: &str| { + ArrowError::ParseError(format!("Error parsing timestamp from '{s}': {ctx}")) + }; + + let bytes = s.as_bytes(); + if bytes.len() < 10 { + return Err(err("timestamp must contain at least 10 characters")); + } + + let parser = TimestampParser::new(bytes); + let date = parser.date().ok_or_else(|| err("error parsing date"))?; + if bytes.len() == 10 { + let offset = timezone.offset_from_local_date(&date); + let offset = offset + .single() + .ok_or_else(|| err("error computing timezone offset"))?; + + let time = NaiveTime::from_hms_opt(0, 0, 0).unwrap(); + return Ok(DateTime::from_local(date.and_time(time), offset)); + } + + if !parser.test(10, b'T') && !parser.test(10, b't') && !parser.test(10, b' ') { + return Err(err("invalid timestamp separator")); + } + + let (time, mut tz_offset) = parser.time().ok_or_else(|| err("error parsing time"))?; + let datetime = date.and_time(time); + + if tz_offset == 32 { + // Decimal overrun + while tz_offset < bytes.len() && bytes[tz_offset].is_ascii_digit() { + tz_offset += 1; + } + } + + if bytes.len() <= tz_offset { + let offset = timezone.offset_from_local_datetime(&datetime); + let offset = offset + .single() + .ok_or_else(|| err("error computing timezone offset"))?; + return Ok(DateTime::from_local(datetime, offset)); + } + + if bytes[tz_offset] == b'z' || bytes[tz_offset] == b'Z' { + let offset = timezone.offset_from_local_datetime(&datetime); + let offset = offset + .single() + .ok_or_else(|| err("error computing timezone offset"))?; + return Ok(DateTime::from_utc(datetime, offset)); + } + + // Parse remainder of string as timezone + let parsed_tz: Tz = s[tz_offset..].trim_start().parse()?; + let offset = parsed_tz.offset_from_local_datetime(&datetime); + let offset = offset + .single() + .ok_or_else(|| err("error computing timezone offset"))?; + Ok(DateTime::::from_local(datetime, offset).with_timezone(timezone)) +} + +/// Accepts a string in RFC3339 / ISO8601 standard format and some +/// variants and converts it to a nanosecond precision timestamp. +/// +/// See [`string_to_datetime`] for the full set of supported formats +/// +/// Implements the `to_timestamp` function to convert a string to a +/// timestamp, following the model of spark SQL’s to_`timestamp`. +/// +/// Internally, this function uses the `chrono` library for the +/// datetime parsing +/// +/// We hope to extend this function in the future with a second +/// parameter to specifying the format string. +/// +/// ## Timestamp Precision +/// +/// Function uses the maximum precision timestamps supported by +/// Arrow (nanoseconds stored as a 64-bit integer) timestamps. This +/// means the range of dates that timestamps can represent is ~1677 AD +/// to 2262 AM +/// +/// ## Timezone / Offset Handling +/// +/// Numerical values of timestamps are stored compared to offset UTC. +/// +/// This function interprets string without an explicit time zone as timestamps +/// relative to UTC, see [`string_to_datetime`] for alternative semantics +/// +/// In particular: +/// +/// ``` +/// # use arrow_cast::parse::string_to_timestamp_nanos; +/// // Note all three of these timestamps are parsed as the same value +/// let a = string_to_timestamp_nanos("1997-01-31 09:26:56.123Z").unwrap(); +/// let b = string_to_timestamp_nanos("1997-01-31T09:26:56.123").unwrap(); +/// let c = string_to_timestamp_nanos("1997-01-31T14:26:56.123+05:00").unwrap(); +/// +/// assert_eq!(a, b); +/// assert_eq!(b, c); +/// ``` +/// +#[inline] +pub fn string_to_timestamp_nanos(s: &str) -> Result { + to_timestamp_nanos(string_to_datetime(&Utc, s)?.naive_utc()) +} + +/// Defensive check to prevent chrono-rs panics when nanosecond conversion happens on non-supported dates +#[inline] +fn to_timestamp_nanos(dt: NaiveDateTime) -> Result { + if dt.timestamp().checked_mul(1_000_000_000).is_none() { + return Err(ArrowError::ParseError( + ERR_NANOSECONDS_NOT_SUPPORTED.to_string(), + )); + } + + Ok(dt.timestamp_nanos()) +} + +/// Accepts a string in ISO8601 standard format and some +/// variants and converts it to nanoseconds since midnight. +/// +/// Examples of accepted inputs: +/// * `09:26:56.123 AM` +/// * `23:59:59` +/// * `6:00 pm` +// +/// Internally, this function uses the `chrono` library for the +/// time parsing +/// +/// ## Timezone / Offset Handling +/// +/// This function does not support parsing strings with a timezone +/// or offset specified, as it considers only time since midnight. +pub fn string_to_time_nanoseconds(s: &str) -> Result { + let nt = string_to_time(s).ok_or_else(|| { + ArrowError::ParseError(format!("Failed to parse \'{s}\' as time")) + })?; + Ok(nt.num_seconds_from_midnight() as i64 * 1_000_000_000 + nt.nanosecond() as i64) +} + +fn string_to_time(s: &str) -> Option { + let bytes = s.as_bytes(); + if bytes.len() < 4 { + return None; + } + + let (am, bytes) = match bytes.get(bytes.len() - 3..) { + Some(b" AM" | b" am" | b" Am" | b" aM") => { + (Some(true), &bytes[..bytes.len() - 3]) + } + Some(b" PM" | b" pm" | b" pM" | b" Pm") => { + (Some(false), &bytes[..bytes.len() - 3]) + } + _ => (None, bytes), + }; + + if bytes.len() < 4 { + return None; + } + + let mut digits = [b'0'; 6]; + + // Extract hour + let bytes = match (bytes[1], bytes[2]) { + (b':', _) => { + digits[1] = bytes[0]; + &bytes[2..] + } + (_, b':') => { + digits[0] = bytes[0]; + digits[1] = bytes[1]; + &bytes[3..] + } + _ => return None, + }; + + if bytes.len() < 2 { + return None; // Minutes required + } + + // Extract minutes + digits[2] = bytes[0]; + digits[3] = bytes[1]; + + let nanoseconds = match bytes.get(2) { + Some(b':') => { + if bytes.len() < 5 { + return None; + } + + // Extract seconds + digits[4] = bytes[3]; + digits[5] = bytes[4]; + + // Extract sub-seconds if any + match bytes.get(5) { + Some(b'.') => { + let decimal = &bytes[6..]; + if decimal.iter().any(|x| !x.is_ascii_digit()) { + return None; + } + match decimal.len() { + 0 => return None, + 1 => parse_nanos::<1, b'0'>(decimal), + 2 => parse_nanos::<2, b'0'>(decimal), + 3 => parse_nanos::<3, b'0'>(decimal), + 4 => parse_nanos::<4, b'0'>(decimal), + 5 => parse_nanos::<5, b'0'>(decimal), + 6 => parse_nanos::<6, b'0'>(decimal), + 7 => parse_nanos::<7, b'0'>(decimal), + 8 => parse_nanos::<8, b'0'>(decimal), + _ => parse_nanos::<9, b'0'>(decimal), + } + } + Some(_) => return None, + None => 0, + } + } + Some(_) => return None, + None => 0, + }; + + digits.iter_mut().for_each(|x| *x = x.wrapping_sub(b'0')); + if digits.iter().any(|x| *x > 9) { + return None; + } + + let hour = match (digits[0] * 10 + digits[1], am) { + (12, Some(true)) => 0, // 12:00 AM -> 00:00 + (h @ 1..=11, Some(true)) => h, // 1:00 AM -> 01:00 + (12, Some(false)) => 12, // 12:00 PM -> 12:00 + (h @ 1..=11, Some(false)) => h + 12, // 1:00 PM -> 13:00 + (_, Some(_)) => return None, + (h, None) => h, + }; + + // Handle leap second + let (second, nanoseconds) = match digits[4] * 10 + digits[5] { + 60 => (59, nanoseconds + 1_000_000_000), + s => (s, nanoseconds), + }; + + NaiveTime::from_hms_nano_opt( + hour as _, + (digits[2] * 10 + digits[3]) as _, + second as _, + nanoseconds, + ) +} + +/// Specialized parsing implementations +/// used by csv and json reader +pub trait Parser: ArrowPrimitiveType { + fn parse(string: &str) -> Option; + + fn parse_formatted(string: &str, _format: &str) -> Option { + Self::parse(string) + } +} + +impl Parser for Float16Type { + fn parse(string: &str) -> Option { + lexical_core::parse(string.as_bytes()) + .ok() + .map(f16::from_f32) + } +} + +impl Parser for Float32Type { + fn parse(string: &str) -> Option { + lexical_core::parse(string.as_bytes()).ok() + } +} + +impl Parser for Float64Type { + fn parse(string: &str) -> Option { + lexical_core::parse(string.as_bytes()).ok() + } +} + +macro_rules! parser_primitive { + ($t:ty) => { + impl Parser for $t { + fn parse(string: &str) -> Option { + lexical_core::parse::(string.as_bytes()).ok() + } + } + }; +} +parser_primitive!(UInt64Type); +parser_primitive!(UInt32Type); +parser_primitive!(UInt16Type); +parser_primitive!(UInt8Type); +parser_primitive!(Int64Type); +parser_primitive!(Int32Type); +parser_primitive!(Int16Type); +parser_primitive!(Int8Type); + +impl Parser for TimestampNanosecondType { + fn parse(string: &str) -> Option { + string_to_timestamp_nanos(string).ok() + } +} + +impl Parser for TimestampMicrosecondType { + fn parse(string: &str) -> Option { + let nanos = string_to_timestamp_nanos(string).ok(); + nanos.map(|x| x / 1000) + } +} + +impl Parser for TimestampMillisecondType { + fn parse(string: &str) -> Option { + let nanos = string_to_timestamp_nanos(string).ok(); + nanos.map(|x| x / 1_000_000) + } +} + +impl Parser for TimestampSecondType { + fn parse(string: &str) -> Option { + let nanos = string_to_timestamp_nanos(string).ok(); + nanos.map(|x| x / 1_000_000_000) + } +} + +impl Parser for Time64NanosecondType { + // Will truncate any fractions of a nanosecond + fn parse(string: &str) -> Option { + string_to_time_nanoseconds(string) + .ok() + .or_else(|| string.parse::().ok()) + } + + fn parse_formatted(string: &str, format: &str) -> Option { + let nt = NaiveTime::parse_from_str(string, format).ok()?; + Some( + nt.num_seconds_from_midnight() as i64 * 1_000_000_000 + + nt.nanosecond() as i64, + ) + } +} + +impl Parser for Time64MicrosecondType { + // Will truncate any fractions of a microsecond + fn parse(string: &str) -> Option { + string_to_time_nanoseconds(string) + .ok() + .map(|nanos| nanos / 1_000) + .or_else(|| string.parse::().ok()) + } + + fn parse_formatted(string: &str, format: &str) -> Option { + let nt = NaiveTime::parse_from_str(string, format).ok()?; + Some( + nt.num_seconds_from_midnight() as i64 * 1_000_000 + + nt.nanosecond() as i64 / 1_000, + ) + } +} + +impl Parser for Time32MillisecondType { + // Will truncate any fractions of a millisecond + fn parse(string: &str) -> Option { + string_to_time_nanoseconds(string) + .ok() + .map(|nanos| (nanos / 1_000_000) as i32) + .or_else(|| string.parse::().ok()) + } + + fn parse_formatted(string: &str, format: &str) -> Option { + let nt = NaiveTime::parse_from_str(string, format).ok()?; + Some( + nt.num_seconds_from_midnight() as i32 * 1_000 + + nt.nanosecond() as i32 / 1_000_000, + ) + } +} + +impl Parser for Time32SecondType { + // Will truncate any fractions of a second + fn parse(string: &str) -> Option { + string_to_time_nanoseconds(string) + .ok() + .map(|nanos| (nanos / 1_000_000_000) as i32) + .or_else(|| string.parse::().ok()) + } + + fn parse_formatted(string: &str, format: &str) -> Option { + let nt = NaiveTime::parse_from_str(string, format).ok()?; + Some( + nt.num_seconds_from_midnight() as i32 + + nt.nanosecond() as i32 / 1_000_000_000, + ) + } +} + +/// Number of days between 0001-01-01 and 1970-01-01 +const EPOCH_DAYS_FROM_CE: i32 = 719_163; + +/// Error message if nanosecond conversion request beyond supported interval +const ERR_NANOSECONDS_NOT_SUPPORTED: &str = "The dates that can be represented as nanoseconds have to be between 1677-09-21T00:12:44.0 and 2262-04-11T23:47:16.854775804"; + +fn parse_date(string: &str) -> Option { + if string.len() > 10 { + return None; + } + let mut digits = [0; 10]; + let mut mask = 0; + + // Treating all bytes the same way, helps LLVM vectorise this correctly + for (idx, (o, i)) in digits.iter_mut().zip(string.bytes()).enumerate() { + *o = i.wrapping_sub(b'0'); + mask |= ((*o < 10) as u16) << idx + } + + const HYPHEN: u8 = b'-'.wrapping_sub(b'0'); + + if digits[4] != HYPHEN { + return None; + } + + let (month, day) = match mask { + 0b1101101111 => { + if digits[7] != HYPHEN { + return None; + } + (digits[5] * 10 + digits[6], digits[8] * 10 + digits[9]) + } + 0b101101111 => { + if digits[7] != HYPHEN { + return None; + } + (digits[5] * 10 + digits[6], digits[8]) + } + 0b110101111 => { + if digits[6] != HYPHEN { + return None; + } + (digits[5], digits[7] * 10 + digits[8]) + } + 0b10101111 => { + if digits[6] != HYPHEN { + return None; + } + (digits[5], digits[7]) + } + _ => return None, + }; + + let year = digits[0] as u16 * 1000 + + digits[1] as u16 * 100 + + digits[2] as u16 * 10 + + digits[3] as u16; + + NaiveDate::from_ymd_opt(year as _, month as _, day as _) +} + +impl Parser for Date32Type { + fn parse(string: &str) -> Option { + let date = parse_date(string)?; + Some(date.num_days_from_ce() - EPOCH_DAYS_FROM_CE) + } + + fn parse_formatted(string: &str, format: &str) -> Option { + let date = NaiveDate::parse_from_str(string, format).ok()?; + Some(date.num_days_from_ce() - EPOCH_DAYS_FROM_CE) + } +} + +impl Parser for Date64Type { + fn parse(string: &str) -> Option { + if string.len() <= 10 { + let date = parse_date(string)?; + Some(NaiveDateTime::new(date, NaiveTime::default()).timestamp_millis()) + } else { + let date_time = string_to_datetime(&Utc, string).ok()?; + Some(date_time.timestamp_millis()) + } + } + + fn parse_formatted(string: &str, format: &str) -> Option { + use chrono::format::Fixed; + use chrono::format::StrftimeItems; + let fmt = StrftimeItems::new(format); + let has_zone = fmt.into_iter().any(|item| match item { + chrono::format::Item::Fixed(fixed_item) => matches!( + fixed_item, + Fixed::RFC2822 + | Fixed::RFC3339 + | Fixed::TimezoneName + | Fixed::TimezoneOffsetColon + | Fixed::TimezoneOffsetColonZ + | Fixed::TimezoneOffset + | Fixed::TimezoneOffsetZ + ), + _ => false, + }); + if has_zone { + let date_time = chrono::DateTime::parse_from_str(string, format).ok()?; + Some(date_time.timestamp_millis()) + } else { + let date_time = NaiveDateTime::parse_from_str(string, format).ok()?; + Some(date_time.timestamp_millis()) + } + } +} + +/// Parse the string format decimal value to i128/i256 format and checking the precision and scale. +/// The result value can't be out of bounds. +pub fn parse_decimal( + s: &str, + precision: u8, + scale: i8, +) -> Result { + let mut result = T::Native::usize_as(0); + let mut fractionals = 0; + let mut digits = 0; + let base = T::Native::usize_as(10); + + let bs = s.as_bytes(); + let (bs, negative) = match bs.first() { + Some(b'-') => (&bs[1..], true), + Some(b'+') => (&bs[1..], false), + _ => (bs, false), + }; + + if bs.is_empty() { + return Err(ArrowError::ParseError(format!( + "can't parse the string value {s} to decimal" + ))); + } + + let mut bs = bs.iter(); + // Overflow checks are not required if 10^(precision - 1) <= T::MAX holds. + // Thus, if we validate the precision correctly, we can skip overflow checks. + while let Some(b) = bs.next() { + match b { + b'0'..=b'9' => { + if digits == 0 && *b == b'0' { + // Ignore leading zeros. + continue; + } + digits += 1; + result = result.mul_wrapping(base); + result = result.add_wrapping(T::Native::usize_as((b - b'0') as usize)); + } + b'.' => { + for b in bs.by_ref() { + if !b.is_ascii_digit() { + return Err(ArrowError::ParseError(format!( + "can't parse the string value {s} to decimal" + ))); + } + if fractionals == scale { + // We have processed all the digits that we need. All that + // is left is to validate that the rest of the string contains + // valid digits. + continue; + } + fractionals += 1; + digits += 1; + result = result.mul_wrapping(base); + result = + result.add_wrapping(T::Native::usize_as((b - b'0') as usize)); + } + + // Fail on "." + if digits == 0 { + return Err(ArrowError::ParseError(format!( + "can't parse the string value {s} to decimal" + ))); + } + } + _ => { + return Err(ArrowError::ParseError(format!( + "can't parse the string value {s} to decimal" + ))); + } + } + } + + if fractionals < scale { + let exp = scale - fractionals; + if exp as u8 + digits > precision { + return Err(ArrowError::ParseError("parse decimal overflow".to_string())); + } + let mul = base.pow_wrapping(exp as _); + result = result.mul_wrapping(mul); + } else if digits > precision { + return Err(ArrowError::ParseError("parse decimal overflow".to_string())); + } + + Ok(if negative { + result.neg_wrapping() + } else { + result + }) +} + +pub fn parse_interval_year_month( + value: &str, +) -> Result<::Native, ArrowError> { + let config = IntervalParseConfig::new(IntervalUnit::Year); + let interval = Interval::parse(value, &config)?; + + let months = interval.to_year_months().map_err(|_| ArrowError::CastError(format!( + "Cannot cast {value} to IntervalYearMonth. Only year and month fields are allowed." + )))?; + + Ok(IntervalYearMonthType::make_value(0, months)) +} + +pub fn parse_interval_day_time( + value: &str, +) -> Result<::Native, ArrowError> { + let config = IntervalParseConfig::new(IntervalUnit::Day); + let interval = Interval::parse(value, &config)?; + + let (days, millis) = interval.to_day_time().map_err(|_| ArrowError::CastError(format!( + "Cannot cast {value} to IntervalDayTime because the nanos part isn't multiple of milliseconds" + )))?; + + Ok(IntervalDayTimeType::make_value(days, millis)) +} + +pub fn parse_interval_month_day_nano( + value: &str, +) -> Result<::Native, ArrowError> { + let config = IntervalParseConfig::new(IntervalUnit::Month); + let interval = Interval::parse(value, &config)?; + + let (months, days, nanos) = interval.to_month_day_nanos(); + + Ok(IntervalMonthDayNanoType::make_value(months, days, nanos)) +} + +const NANOS_PER_MILLIS: i64 = 1_000_000; +const NANOS_PER_SECOND: i64 = 1_000 * NANOS_PER_MILLIS; +const NANOS_PER_MINUTE: i64 = 60 * NANOS_PER_SECOND; +const NANOS_PER_HOUR: i64 = 60 * NANOS_PER_MINUTE; +#[cfg(test)] +const NANOS_PER_DAY: i64 = 24 * NANOS_PER_HOUR; + +#[rustfmt::skip] +#[derive(Clone, Copy)] +#[repr(u16)] +enum IntervalUnit { + Century = 0b_0000_0000_0001, + Decade = 0b_0000_0000_0010, + Year = 0b_0000_0000_0100, + Month = 0b_0000_0000_1000, + Week = 0b_0000_0001_0000, + Day = 0b_0000_0010_0000, + Hour = 0b_0000_0100_0000, + Minute = 0b_0000_1000_0000, + Second = 0b_0001_0000_0000, + Millisecond = 0b_0010_0000_0000, + Microsecond = 0b_0100_0000_0000, + Nanosecond = 0b_1000_0000_0000, +} + +impl FromStr for IntervalUnit { + type Err = ArrowError; + + fn from_str(s: &str) -> Result { + match s.to_lowercase().as_str() { + "century" | "centuries" => Ok(Self::Century), + "decade" | "decades" => Ok(Self::Decade), + "year" | "years" => Ok(Self::Year), + "month" | "months" => Ok(Self::Month), + "week" | "weeks" => Ok(Self::Week), + "day" | "days" => Ok(Self::Day), + "hour" | "hours" => Ok(Self::Hour), + "minute" | "minutes" => Ok(Self::Minute), + "second" | "seconds" => Ok(Self::Second), + "millisecond" | "milliseconds" => Ok(Self::Millisecond), + "microsecond" | "microseconds" => Ok(Self::Microsecond), + "nanosecond" | "nanoseconds" => Ok(Self::Nanosecond), + _ => Err(ArrowError::NotYetImplemented(format!( + "Unknown interval type: {s}" + ))), + } + } +} + +pub type MonthDayNano = (i32, i32, i64); + +/// Chosen based on the number of decimal digits in 1 week in nanoseconds +const INTERVAL_PRECISION: u32 = 15; + +#[derive(Clone, Copy, Debug, PartialEq)] +struct IntervalAmount { + /// The integer component of the interval amount + integer: i64, + /// The fractional component multiplied by 10^INTERVAL_PRECISION + frac: i64, +} + +#[cfg(test)] +impl IntervalAmount { + fn new(integer: i64, frac: i64) -> Self { + Self { integer, frac } + } +} + +impl FromStr for IntervalAmount { + type Err = ArrowError; + + fn from_str(s: &str) -> Result { + match s.split_once('.') { + Some((integer, frac)) + if frac.len() <= INTERVAL_PRECISION as usize + && !frac.is_empty() + && !frac.starts_with('-') => + { + // integer will be "" for values like ".5" + // and "-" for values like "-.5" + let explicit_neg = integer.starts_with('-'); + let integer = if integer.is_empty() || integer == "-" { + Ok(0) + } else { + integer.parse::().map_err(|_| { + ArrowError::ParseError(format!( + "Failed to parse {s} as interval amount" + )) + }) + }?; + + let frac_unscaled = frac.parse::().map_err(|_| { + ArrowError::ParseError(format!( + "Failed to parse {s} as interval amount" + )) + })?; + + // scale fractional part by interval precision + let frac = + frac_unscaled * 10_i64.pow(INTERVAL_PRECISION - frac.len() as u32); + + // propagate the sign of the integer part to the fractional part + let frac = if integer < 0 || explicit_neg { + -frac + } else { + frac + }; + + let result = Self { integer, frac }; + + Ok(result) + } + Some((_, frac)) if frac.starts_with('-') => Err(ArrowError::ParseError( + format!("Failed to parse {s} as interval amount"), + )), + Some((_, frac)) if frac.len() > INTERVAL_PRECISION as usize => { + Err(ArrowError::ParseError(format!( + "{s} exceeds the precision available for interval amount" + ))) + } + Some(_) | None => { + let integer = s.parse::().map_err(|_| { + ArrowError::ParseError(format!( + "Failed to parse {s} as interval amount" + )) + })?; + + let result = Self { integer, frac: 0 }; + Ok(result) + } + } + } +} + +#[derive(Debug, Default, PartialEq)] +struct Interval { + months: i32, + days: i32, + nanos: i64, +} + +impl Interval { + fn new(months: i32, days: i32, nanos: i64) -> Self { + Self { + months, + days, + nanos, + } + } + + fn to_year_months(&self) -> Result { + match (self.months, self.days, self.nanos) { + (months, days, nanos) if days == 0 && nanos == 0 => Ok(months), + _ => Err(ArrowError::InvalidArgumentError(format!( + "Unable to represent interval with days and nanos as year-months: {:?}", + self + ))), + } + } + + fn to_day_time(&self) -> Result<(i32, i32), ArrowError> { + let days = self.months.mul_checked(30)?.add_checked(self.days)?; + + match self.nanos { + nanos if nanos % NANOS_PER_MILLIS == 0 => { + let millis = (self.nanos / 1_000_000).try_into().map_err(|_| { + ArrowError::InvalidArgumentError(format!( + "Unable to represent {} nanos as milliseconds in a signed 32-bit integer", + self.nanos + )) + })?; + + Ok((days, millis)) + } + nanos => Err(ArrowError::InvalidArgumentError(format!( + "Unable to represent {nanos} as milliseconds" + ))), + } + } + + fn to_month_day_nanos(&self) -> (i32, i32, i64) { + (self.months, self.days, self.nanos) + } + + /// Parse string value in traditional Postgres format such as + /// `1 year 2 months 3 days 4 hours 5 minutes 6 seconds` + fn parse(value: &str, config: &IntervalParseConfig) -> Result { + let components = parse_interval_components(value, config)?; + + let result = components.into_iter().fold( + Ok(Self::default()), + |result, (amount, unit)| match result { + Ok(result) => result.add(amount, unit), + Err(e) => Err(e), + }, + )?; + + Ok(result) + } + + /// Interval addition following Postgres behavior. Fractional units will be spilled into smaller units. + /// When the interval unit is larger than months, the result is rounded to total months and not spilled to days/nanos. + /// Fractional parts of weeks and days are represented using days and nanoseconds. + /// e.g. INTERVAL '0.5 MONTH' = 15 days, INTERVAL '1.5 MONTH' = 1 month 15 days + /// e.g. INTERVAL '0.5 DAY' = 12 hours, INTERVAL '1.5 DAY' = 1 day 12 hours + /// [Postgres reference](https://www.postgresql.org/docs/15/datatype-datetime.html#DATATYPE-INTERVAL-INPUT:~:text=Field%20values%20can,fractional%20on%20output.) + fn add( + &self, + amount: IntervalAmount, + unit: IntervalUnit, + ) -> Result { + let result = match unit { + IntervalUnit::Century => { + let months_int = amount.integer.mul_checked(100)?.mul_checked(12)?; + let month_frac = amount.frac * 12 / 10_i64.pow(INTERVAL_PRECISION - 2); + let months = + months_int + .add_checked(month_frac)? + .try_into() + .map_err(|_| { + ArrowError::ParseError(format!( + "Unable to represent {} centuries as months in a signed 32-bit integer", + &amount.integer + )) + })?; + + Self::new(self.months.add_checked(months)?, self.days, self.nanos) + } + IntervalUnit::Decade => { + let months_int = amount.integer.mul_checked(10)?.mul_checked(12)?; + + let month_frac = amount.frac * 12 / 10_i64.pow(INTERVAL_PRECISION - 1); + let months = + months_int + .add_checked(month_frac)? + .try_into() + .map_err(|_| { + ArrowError::ParseError(format!( + "Unable to represent {} decades as months in a signed 32-bit integer", + &amount.integer + )) + })?; + + Self::new(self.months.add_checked(months)?, self.days, self.nanos) + } + IntervalUnit::Year => { + let months_int = amount.integer.mul_checked(12)?; + let month_frac = amount.frac * 12 / 10_i64.pow(INTERVAL_PRECISION); + let months = + months_int + .add_checked(month_frac)? + .try_into() + .map_err(|_| { + ArrowError::ParseError(format!( + "Unable to represent {} years as months in a signed 32-bit integer", + &amount.integer + )) + })?; + + Self::new(self.months.add_checked(months)?, self.days, self.nanos) + } + IntervalUnit::Month => { + let months = amount.integer.try_into().map_err(|_| { + ArrowError::ParseError(format!( + "Unable to represent {} months in a signed 32-bit integer", + &amount.integer + )) + })?; + + let days = amount.frac * 3 / 10_i64.pow(INTERVAL_PRECISION - 1); + let days = days.try_into().map_err(|_| { + ArrowError::ParseError(format!( + "Unable to represent {} months as days in a signed 32-bit integer", + amount.frac / 10_i64.pow(INTERVAL_PRECISION) + )) + })?; + + Self::new( + self.months.add_checked(months)?, + self.days.add_checked(days)?, + self.nanos, + ) + } + IntervalUnit::Week => { + let days = amount.integer.mul_checked(7)?.try_into().map_err(|_| { + ArrowError::ParseError(format!( + "Unable to represent {} weeks as days in a signed 32-bit integer", + &amount.integer + )) + })?; + + let nanos = + amount.frac * 7 * 24 * 6 * 6 / 10_i64.pow(INTERVAL_PRECISION - 11); + + Self::new( + self.months, + self.days.add_checked(days)?, + self.nanos.add_checked(nanos)?, + ) + } + IntervalUnit::Day => { + let days = amount.integer.try_into().map_err(|_| { + ArrowError::InvalidArgumentError(format!( + "Unable to represent {} days in a signed 32-bit integer", + amount.integer + )) + })?; + + let nanos = + amount.frac * 24 * 6 * 6 / 10_i64.pow(INTERVAL_PRECISION - 11); + + Self::new( + self.months, + self.days.add_checked(days)?, + self.nanos.add_checked(nanos)?, + ) + } + IntervalUnit::Hour => { + let nanos_int = amount.integer.mul_checked(NANOS_PER_HOUR)?; + let nanos_frac = + amount.frac * 6 * 6 / 10_i64.pow(INTERVAL_PRECISION - 11); + let nanos = nanos_int.add_checked(nanos_frac)?; + + Interval::new(self.months, self.days, self.nanos.add_checked(nanos)?) + } + IntervalUnit::Minute => { + let nanos_int = amount.integer.mul_checked(NANOS_PER_MINUTE)?; + let nanos_frac = amount.frac * 6 / 10_i64.pow(INTERVAL_PRECISION - 10); + + let nanos = nanos_int.add_checked(nanos_frac)?; + + Interval::new(self.months, self.days, self.nanos.add_checked(nanos)?) + } + IntervalUnit::Second => { + let nanos_int = amount.integer.mul_checked(NANOS_PER_SECOND)?; + let nanos_frac = amount.frac / 10_i64.pow(INTERVAL_PRECISION - 9); + let nanos = nanos_int.add_checked(nanos_frac)?; + + Interval::new(self.months, self.days, self.nanos.add_checked(nanos)?) + } + IntervalUnit::Millisecond => { + let nanos_int = amount.integer.mul_checked(NANOS_PER_MILLIS)?; + let nanos_frac = amount.frac / 10_i64.pow(INTERVAL_PRECISION - 6); + let nanos = nanos_int.add_checked(nanos_frac)?; + + Interval::new(self.months, self.days, self.nanos.add_checked(nanos)?) + } + IntervalUnit::Microsecond => { + let nanos_int = amount.integer.mul_checked(1_000)?; + let nanos_frac = amount.frac / 10_i64.pow(INTERVAL_PRECISION - 3); + let nanos = nanos_int.add_checked(nanos_frac)?; + + Interval::new(self.months, self.days, self.nanos.add_checked(nanos)?) + } + IntervalUnit::Nanosecond => { + let nanos_int = amount.integer; + let nanos_frac = amount.frac / 10_i64.pow(INTERVAL_PRECISION); + let nanos = nanos_int.add_checked(nanos_frac)?; + + Interval::new(self.months, self.days, self.nanos.add_checked(nanos)?) + } + }; + + Ok(result) + } +} + +struct IntervalParseConfig { + /// The default unit to use if none is specified + /// e.g. `INTERVAL 1` represents `INTERVAL 1 SECOND` when default_unit = IntervalType::Second + default_unit: IntervalUnit, +} + +impl IntervalParseConfig { + fn new(default_unit: IntervalUnit) -> Self { + Self { default_unit } + } +} + +/// parse the string into a vector of interval components i.e. (amount, unit) tuples +fn parse_interval_components( + value: &str, + config: &IntervalParseConfig, +) -> Result, ArrowError> { + let parts = value.split_whitespace(); + + let raw_amounts = parts.clone().step_by(2); + let raw_units = parts.skip(1).step_by(2); + + // parse amounts + let (amounts, invalid_amounts) = raw_amounts + .map(IntervalAmount::from_str) + .partition::, _>(Result::is_ok); + + // invalid amounts? + if !invalid_amounts.is_empty() { + return Err(ArrowError::NotYetImplemented(format!( + "Unsupported Interval Expression with value {value:?}" + ))); + } + + // parse units + let (units, invalid_units): (Vec<_>, Vec<_>) = raw_units + .clone() + .map(IntervalUnit::from_str) + .partition(Result::is_ok); + + // invalid units? + if !invalid_units.is_empty() { + return Err(ArrowError::ParseError(format!( + "Invalid input syntax for type interval: {value:?}" + ))); + } + + // collect parsed results + let amounts = amounts.into_iter().map(Result::unwrap).collect::>(); + let units = units.into_iter().map(Result::unwrap).collect::>(); + + // if only an amount is specified, use the default unit + if amounts.len() == 1 && units.is_empty() { + return Ok(vec![(amounts[0], config.default_unit)]); + }; + + // duplicate units? + let mut observed_interval_types = 0; + for (unit, raw_unit) in units.iter().zip(raw_units) { + if observed_interval_types & (*unit as u16) != 0 { + return Err(ArrowError::ParseError(format!( + "Invalid input syntax for type interval: {value:?}. Repeated type '{raw_unit}'", + ))); + } + + observed_interval_types |= *unit as u16; + } + + let result = amounts.iter().copied().zip(units.iter().copied()); + + Ok(result.collect::>()) +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow_array::temporal_conversions::date32_to_datetime; + use arrow_array::timezone::Tz; + use arrow_buffer::i256; + + #[test] + fn test_parse_nanos() { + assert_eq!(parse_nanos::<3, 0>(&[1, 2, 3]), 123_000_000); + assert_eq!(parse_nanos::<5, 0>(&[1, 2, 3, 4, 5]), 123_450_000); + assert_eq!(parse_nanos::<6, b'0'>(b"123456"), 123_456_000); + } + + #[test] + fn string_to_timestamp_timezone() { + // Explicit timezone + assert_eq!( + 1599572549190855000, + parse_timestamp("2020-09-08T13:42:29.190855+00:00").unwrap() + ); + assert_eq!( + 1599572549190855000, + parse_timestamp("2020-09-08T13:42:29.190855Z").unwrap() + ); + assert_eq!( + 1599572549000000000, + parse_timestamp("2020-09-08T13:42:29Z").unwrap() + ); // no fractional part + assert_eq!( + 1599590549190855000, + parse_timestamp("2020-09-08T13:42:29.190855-05:00").unwrap() + ); + } + + #[test] + fn string_to_timestamp_timezone_space() { + // Ensure space rather than T between time and date is accepted + assert_eq!( + 1599572549190855000, + parse_timestamp("2020-09-08 13:42:29.190855+00:00").unwrap() + ); + assert_eq!( + 1599572549190855000, + parse_timestamp("2020-09-08 13:42:29.190855Z").unwrap() + ); + assert_eq!( + 1599572549000000000, + parse_timestamp("2020-09-08 13:42:29Z").unwrap() + ); // no fractional part + assert_eq!( + 1599590549190855000, + parse_timestamp("2020-09-08 13:42:29.190855-05:00").unwrap() + ); + } + + #[test] + #[cfg_attr(miri, ignore)] // unsupported operation: can't call foreign function: mktime + fn string_to_timestamp_no_timezone() { + // This test is designed to succeed in regardless of the local + // timezone the test machine is running. Thus it is still + // somewhat susceptible to bugs in the use of chrono + let naive_datetime = NaiveDateTime::new( + NaiveDate::from_ymd_opt(2020, 9, 8).unwrap(), + NaiveTime::from_hms_nano_opt(13, 42, 29, 190855000).unwrap(), + ); + + // Ensure both T and ' ' variants work + assert_eq!( + naive_datetime.timestamp_nanos(), + parse_timestamp("2020-09-08T13:42:29.190855").unwrap() + ); + + assert_eq!( + naive_datetime.timestamp_nanos(), + parse_timestamp("2020-09-08 13:42:29.190855").unwrap() + ); + + // Also ensure that parsing timestamps with no fractional + // second part works as well + let naive_datetime_whole_secs = NaiveDateTime::new( + NaiveDate::from_ymd_opt(2020, 9, 8).unwrap(), + NaiveTime::from_hms_opt(13, 42, 29).unwrap(), + ); + + // Ensure both T and ' ' variants work + assert_eq!( + naive_datetime_whole_secs.timestamp_nanos(), + parse_timestamp("2020-09-08T13:42:29").unwrap() + ); + + assert_eq!( + naive_datetime_whole_secs.timestamp_nanos(), + parse_timestamp("2020-09-08 13:42:29").unwrap() + ); + + // ensure without time work + // no time, should be the nano second at + // 2020-09-08 0:0:0 + let naive_datetime_no_time = NaiveDateTime::new( + NaiveDate::from_ymd_opt(2020, 9, 8).unwrap(), + NaiveTime::from_hms_opt(0, 0, 0).unwrap(), + ); + + assert_eq!( + naive_datetime_no_time.timestamp_nanos(), + parse_timestamp("2020-09-08").unwrap() + ) + } + + #[test] + fn string_to_timestamp_chrono() { + let cases = [ + "2020-09-08T13:42:29Z", + "1969-01-01T00:00:00.1Z", + "2020-09-08T12:00:12.12345678+00:00", + "2020-09-08T12:00:12+00:00", + "2020-09-08T12:00:12.1+00:00", + "2020-09-08T12:00:12.12+00:00", + "2020-09-08T12:00:12.123+00:00", + "2020-09-08T12:00:12.1234+00:00", + "2020-09-08T12:00:12.12345+00:00", + "2020-09-08T12:00:12.123456+00:00", + "2020-09-08T12:00:12.1234567+00:00", + "2020-09-08T12:00:12.12345678+00:00", + "2020-09-08T12:00:12.123456789+00:00", + "2020-09-08T12:00:12.12345678912z", + "2020-09-08T12:00:12.123456789123Z", + "2020-09-08T12:00:12.123456789123+02:00", + "2020-09-08T12:00:12.12345678912345Z", + "2020-09-08T12:00:12.1234567891234567+02:00", + "2020-09-08T12:00:60Z", + "2020-09-08T12:00:60.123Z", + "2020-09-08T12:00:60.123456+02:00", + "2020-09-08T12:00:60.1234567891234567+02:00", + "2020-09-08T12:00:60.999999999+02:00", + "2020-09-08t12:00:12.12345678+00:00", + "2020-09-08t12:00:12+00:00", + "2020-09-08t12:00:12Z", + ]; + + for case in cases { + let chrono = DateTime::parse_from_rfc3339(case).unwrap(); + let chrono_utc = chrono.with_timezone(&Utc); + + let custom = string_to_datetime(&Utc, case).unwrap(); + assert_eq!(chrono_utc, custom) + } + } + + #[test] + fn string_to_timestamp_naive() { + let cases = [ + "2018-11-13T17:11:10.011375885995", + "2030-12-04T17:11:10.123", + "2030-12-04T17:11:10.1234", + "2030-12-04T17:11:10.123456", + ]; + for case in cases { + let chrono = + NaiveDateTime::parse_from_str(case, "%Y-%m-%dT%H:%M:%S%.f").unwrap(); + let custom = string_to_datetime(&Utc, case).unwrap(); + assert_eq!(chrono, custom.naive_utc()) + } + } + + #[test] + fn string_to_timestamp_invalid() { + // Test parsing invalid formats + let cases = [ + ("", "timestamp must contain at least 10 characters"), + ("SS", "timestamp must contain at least 10 characters"), + ("Wed, 18 Feb 2015 23:16:09 GMT", "error parsing date"), + ("1997-01-31H09:26:56.123Z", "invalid timestamp separator"), + ("1997-01-31 09:26:56.123Z", "error parsing time"), + ("1997:01:31T09:26:56.123Z", "error parsing date"), + ("1997:1:31T09:26:56.123Z", "error parsing date"), + ("1997-01-32T09:26:56.123Z", "error parsing date"), + ("1997-13-32T09:26:56.123Z", "error parsing date"), + ("1997-02-29T09:26:56.123Z", "error parsing date"), + ("2015-02-30T17:35:20-08:00", "error parsing date"), + ("1997-01-10T9:26:56.123Z", "error parsing time"), + ("2015-01-20T25:35:20-08:00", "error parsing time"), + ("1997-01-10T09:61:56.123Z", "error parsing time"), + ("1997-01-10T09:61:90.123Z", "error parsing time"), + ("1997-01-10T12:00:6.123Z", "error parsing time"), + ("1997-01-31T092656.123Z", "error parsing time"), + ("1997-01-10T12:00:06.", "error parsing time"), + ("1997-01-10T12:00:06. ", "error parsing time"), + ]; + + for (s, ctx) in cases { + let expected = + format!("Parser error: Error parsing timestamp from '{s}': {ctx}"); + let actual = string_to_datetime(&Utc, s).unwrap_err().to_string(); + assert_eq!(actual, expected) + } + } + + // Parse a timestamp to timestamp int with a useful human readable error message + fn parse_timestamp(s: &str) -> Result { + let result = string_to_timestamp_nanos(s); + if let Err(e) = &result { + eprintln!("Error parsing timestamp '{s}': {e:?}"); + } + result + } + + #[test] + fn string_without_timezone_to_timestamp() { + // string without timezone should always output the same regardless the local or session timezone + + let naive_datetime = NaiveDateTime::new( + NaiveDate::from_ymd_opt(2020, 9, 8).unwrap(), + NaiveTime::from_hms_nano_opt(13, 42, 29, 190855000).unwrap(), + ); + + // Ensure both T and ' ' variants work + assert_eq!( + naive_datetime.timestamp_nanos(), + parse_timestamp("2020-09-08T13:42:29.190855").unwrap() + ); + + assert_eq!( + naive_datetime.timestamp_nanos(), + parse_timestamp("2020-09-08 13:42:29.190855").unwrap() + ); + + let naive_datetime = NaiveDateTime::new( + NaiveDate::from_ymd_opt(2020, 9, 8).unwrap(), + NaiveTime::from_hms_nano_opt(13, 42, 29, 0).unwrap(), + ); + + // Ensure both T and ' ' variants work + assert_eq!( + naive_datetime.timestamp_nanos(), + parse_timestamp("2020-09-08T13:42:29").unwrap() + ); + + assert_eq!( + naive_datetime.timestamp_nanos(), + parse_timestamp("2020-09-08 13:42:29").unwrap() + ); + + let tz: Tz = "+02:00".parse().unwrap(); + let date = string_to_datetime(&tz, "2020-09-08 13:42:29").unwrap(); + let utc = date.naive_utc().to_string(); + assert_eq!(utc, "2020-09-08 11:42:29"); + let local = date.naive_local().to_string(); + assert_eq!(local, "2020-09-08 13:42:29"); + + let date = string_to_datetime(&tz, "2020-09-08 13:42:29Z").unwrap(); + let utc = date.naive_utc().to_string(); + assert_eq!(utc, "2020-09-08 13:42:29"); + let local = date.naive_local().to_string(); + assert_eq!(local, "2020-09-08 15:42:29"); + + let dt = + NaiveDateTime::parse_from_str("2020-09-08T13:42:29Z", "%Y-%m-%dT%H:%M:%SZ") + .unwrap(); + let local: Tz = "+08:00".parse().unwrap(); + + // Parsed as offset from UTC + let date = string_to_datetime(&local, "2020-09-08T13:42:29Z").unwrap(); + assert_eq!(dt, date.naive_utc()); + assert_ne!(dt, date.naive_local()); + + // Parsed as offset from local + let date = string_to_datetime(&local, "2020-09-08 13:42:29").unwrap(); + assert_eq!(dt, date.naive_local()); + assert_ne!(dt, date.naive_utc()); + } + + #[test] + fn parse_date32() { + let cases = [ + "2020-09-08", + "2020-9-8", + "2020-09-8", + "2020-9-08", + "2020-12-1", + "1690-2-5", + ]; + for case in cases { + let v = date32_to_datetime(Date32Type::parse(case).unwrap()).unwrap(); + let expected: NaiveDate = case.parse().unwrap(); + assert_eq!(v.date(), expected); + } + + let err_cases = [ + "", + "80-01-01", + "342", + "Foo", + "2020-09-08-03", + "2020--04-03", + "2020--", + ]; + for case in err_cases { + assert_eq!(Date32Type::parse(case), None); + } + } + + #[test] + fn parse_time64_nanos() { + assert_eq!( + Time64NanosecondType::parse("02:10:01.1234567899999999"), + Some(7_801_123_456_789) + ); + assert_eq!( + Time64NanosecondType::parse("02:10:01.1234567"), + Some(7_801_123_456_700) + ); + assert_eq!( + Time64NanosecondType::parse("2:10:01.1234567"), + Some(7_801_123_456_700) + ); + assert_eq!( + Time64NanosecondType::parse("12:10:01.123456789 AM"), + Some(601_123_456_789) + ); + assert_eq!( + Time64NanosecondType::parse("12:10:01.123456789 am"), + Some(601_123_456_789) + ); + assert_eq!( + Time64NanosecondType::parse("2:10:01.12345678 PM"), + Some(51_001_123_456_780) + ); + assert_eq!( + Time64NanosecondType::parse("2:10:01.12345678 pm"), + Some(51_001_123_456_780) + ); + assert_eq!( + Time64NanosecondType::parse("02:10:01"), + Some(7_801_000_000_000) + ); + assert_eq!( + Time64NanosecondType::parse("2:10:01"), + Some(7_801_000_000_000) + ); + assert_eq!( + Time64NanosecondType::parse("12:10:01 AM"), + Some(601_000_000_000) + ); + assert_eq!( + Time64NanosecondType::parse("12:10:01 am"), + Some(601_000_000_000) + ); + assert_eq!( + Time64NanosecondType::parse("2:10:01 PM"), + Some(51_001_000_000_000) + ); + assert_eq!( + Time64NanosecondType::parse("2:10:01 pm"), + Some(51_001_000_000_000) + ); + assert_eq!( + Time64NanosecondType::parse("02:10"), + Some(7_800_000_000_000) + ); + assert_eq!(Time64NanosecondType::parse("2:10"), Some(7_800_000_000_000)); + assert_eq!( + Time64NanosecondType::parse("12:10 AM"), + Some(600_000_000_000) + ); + assert_eq!( + Time64NanosecondType::parse("12:10 am"), + Some(600_000_000_000) + ); + assert_eq!( + Time64NanosecondType::parse("2:10 PM"), + Some(51_000_000_000_000) + ); + assert_eq!( + Time64NanosecondType::parse("2:10 pm"), + Some(51_000_000_000_000) + ); + + // parse directly as nanoseconds + assert_eq!(Time64NanosecondType::parse("1"), Some(1)); + + // leap second + assert_eq!( + Time64NanosecondType::parse("23:59:60"), + Some(86_400_000_000_000) + ); + + // custom format + assert_eq!( + Time64NanosecondType::parse_formatted( + "02 - 10 - 01 - .1234567", + "%H - %M - %S - %.f" + ), + Some(7_801_123_456_700) + ); + } + + #[test] + fn parse_time64_micros() { + // expected formats + assert_eq!( + Time64MicrosecondType::parse("02:10:01.1234"), + Some(7_801_123_400) + ); + assert_eq!( + Time64MicrosecondType::parse("2:10:01.1234"), + Some(7_801_123_400) + ); + assert_eq!( + Time64MicrosecondType::parse("12:10:01.123456 AM"), + Some(601_123_456) + ); + assert_eq!( + Time64MicrosecondType::parse("12:10:01.123456 am"), + Some(601_123_456) + ); + assert_eq!( + Time64MicrosecondType::parse("2:10:01.12345 PM"), + Some(51_001_123_450) + ); + assert_eq!( + Time64MicrosecondType::parse("2:10:01.12345 pm"), + Some(51_001_123_450) + ); + assert_eq!( + Time64MicrosecondType::parse("02:10:01"), + Some(7_801_000_000) + ); + assert_eq!(Time64MicrosecondType::parse("2:10:01"), Some(7_801_000_000)); + assert_eq!( + Time64MicrosecondType::parse("12:10:01 AM"), + Some(601_000_000) + ); + assert_eq!( + Time64MicrosecondType::parse("12:10:01 am"), + Some(601_000_000) + ); + assert_eq!( + Time64MicrosecondType::parse("2:10:01 PM"), + Some(51_001_000_000) + ); + assert_eq!( + Time64MicrosecondType::parse("2:10:01 pm"), + Some(51_001_000_000) + ); + assert_eq!(Time64MicrosecondType::parse("02:10"), Some(7_800_000_000)); + assert_eq!(Time64MicrosecondType::parse("2:10"), Some(7_800_000_000)); + assert_eq!(Time64MicrosecondType::parse("12:10 AM"), Some(600_000_000)); + assert_eq!(Time64MicrosecondType::parse("12:10 am"), Some(600_000_000)); + assert_eq!( + Time64MicrosecondType::parse("2:10 PM"), + Some(51_000_000_000) + ); + assert_eq!( + Time64MicrosecondType::parse("2:10 pm"), + Some(51_000_000_000) + ); + + // parse directly as microseconds + assert_eq!(Time64MicrosecondType::parse("1"), Some(1)); + + // leap second + assert_eq!( + Time64MicrosecondType::parse("23:59:60"), + Some(86_400_000_000) + ); + + // custom format + assert_eq!( + Time64MicrosecondType::parse_formatted( + "02 - 10 - 01 - .1234", + "%H - %M - %S - %.f" + ), + Some(7_801_123_400) + ); + } + + #[test] + fn parse_time32_millis() { + // expected formats + assert_eq!(Time32MillisecondType::parse("02:10:01.1"), Some(7_801_100)); + assert_eq!(Time32MillisecondType::parse("2:10:01.1"), Some(7_801_100)); + assert_eq!( + Time32MillisecondType::parse("12:10:01.123 AM"), + Some(601_123) + ); + assert_eq!( + Time32MillisecondType::parse("12:10:01.123 am"), + Some(601_123) + ); + assert_eq!( + Time32MillisecondType::parse("2:10:01.12 PM"), + Some(51_001_120) + ); + assert_eq!( + Time32MillisecondType::parse("2:10:01.12 pm"), + Some(51_001_120) + ); + assert_eq!(Time32MillisecondType::parse("02:10:01"), Some(7_801_000)); + assert_eq!(Time32MillisecondType::parse("2:10:01"), Some(7_801_000)); + assert_eq!(Time32MillisecondType::parse("12:10:01 AM"), Some(601_000)); + assert_eq!(Time32MillisecondType::parse("12:10:01 am"), Some(601_000)); + assert_eq!(Time32MillisecondType::parse("2:10:01 PM"), Some(51_001_000)); + assert_eq!(Time32MillisecondType::parse("2:10:01 pm"), Some(51_001_000)); + assert_eq!(Time32MillisecondType::parse("02:10"), Some(7_800_000)); + assert_eq!(Time32MillisecondType::parse("2:10"), Some(7_800_000)); + assert_eq!(Time32MillisecondType::parse("12:10 AM"), Some(600_000)); + assert_eq!(Time32MillisecondType::parse("12:10 am"), Some(600_000)); + assert_eq!(Time32MillisecondType::parse("2:10 PM"), Some(51_000_000)); + assert_eq!(Time32MillisecondType::parse("2:10 pm"), Some(51_000_000)); + + // parse directly as milliseconds + assert_eq!(Time32MillisecondType::parse("1"), Some(1)); + + // leap second + assert_eq!(Time32MillisecondType::parse("23:59:60"), Some(86_400_000)); + + // custom format + assert_eq!( + Time32MillisecondType::parse_formatted( + "02 - 10 - 01 - .1", + "%H - %M - %S - %.f" + ), + Some(7_801_100) + ); + } + + #[test] + fn parse_time32_secs() { + // expected formats + assert_eq!(Time32SecondType::parse("02:10:01.1"), Some(7_801)); + assert_eq!(Time32SecondType::parse("02:10:01"), Some(7_801)); + assert_eq!(Time32SecondType::parse("2:10:01"), Some(7_801)); + assert_eq!(Time32SecondType::parse("12:10:01 AM"), Some(601)); + assert_eq!(Time32SecondType::parse("12:10:01 am"), Some(601)); + assert_eq!(Time32SecondType::parse("2:10:01 PM"), Some(51_001)); + assert_eq!(Time32SecondType::parse("2:10:01 pm"), Some(51_001)); + assert_eq!(Time32SecondType::parse("02:10"), Some(7_800)); + assert_eq!(Time32SecondType::parse("2:10"), Some(7_800)); + assert_eq!(Time32SecondType::parse("12:10 AM"), Some(600)); + assert_eq!(Time32SecondType::parse("12:10 am"), Some(600)); + assert_eq!(Time32SecondType::parse("2:10 PM"), Some(51_000)); + assert_eq!(Time32SecondType::parse("2:10 pm"), Some(51_000)); + + // parse directly as seconds + assert_eq!(Time32SecondType::parse("1"), Some(1)); + + // leap second + assert_eq!(Time32SecondType::parse("23:59:60"), Some(86400)); + + // custom format + assert_eq!( + Time32SecondType::parse_formatted("02 - 10 - 01", "%H - %M - %S"), + Some(7_801) + ); + } + + #[test] + fn test_string_to_time_invalid() { + let cases = [ + "25:00", + "9:00:", + "009:00", + "09:0:00", + "25:00:00", + "13:00 AM", + "13:00 PM", + "12:00. AM", + "09:0:00", + "09:01:0", + "09:01:1", + "9:1:0", + "09:01:0", + "1:00.123", + "1:00:00.123f", + " 9:00:00", + ":09:00", + "T9:00:00", + "AM", + ]; + for case in cases { + assert!(string_to_time(case).is_none(), "{case}"); + } + } + + #[test] + fn test_string_to_time_chrono() { + let cases = [ + ("1:00", "%H:%M"), + ("12:00", "%H:%M"), + ("13:00", "%H:%M"), + ("24:00", "%H:%M"), + ("1:00:00", "%H:%M:%S"), + ("12:00:30", "%H:%M:%S"), + ("13:00:59", "%H:%M:%S"), + ("24:00:60", "%H:%M:%S"), + ("09:00:00", "%H:%M:%S%.f"), + ("0:00:30.123456", "%H:%M:%S%.f"), + ("0:00 AM", "%I:%M %P"), + ("1:00 AM", "%I:%M %P"), + ("12:00 AM", "%I:%M %P"), + ("13:00 AM", "%I:%M %P"), + ("0:00 PM", "%I:%M %P"), + ("1:00 PM", "%I:%M %P"), + ("12:00 PM", "%I:%M %P"), + ("13:00 PM", "%I:%M %P"), + ("1:00 pM", "%I:%M %P"), + ("1:00 Pm", "%I:%M %P"), + ("1:00 aM", "%I:%M %P"), + ("1:00 Am", "%I:%M %P"), + ("1:00:30.123456 PM", "%I:%M:%S%.f %P"), + ("1:00:30.123456789 PM", "%I:%M:%S%.f %P"), + ("1:00:30.123456789123 PM", "%I:%M:%S%.f %P"), + ("1:00:30.1234 PM", "%I:%M:%S%.f %P"), + ("1:00:30.123456 PM", "%I:%M:%S%.f %P"), + ("1:00:30.123456789123456789 PM", "%I:%M:%S%.f %P"), + ("1:00:30.12F456 PM", "%I:%M:%S%.f %P"), + ]; + for (s, format) in cases { + let chrono = NaiveTime::parse_from_str(s, format).ok(); + let custom = string_to_time(s); + assert_eq!(chrono, custom, "{s}"); + } + } + + #[test] + fn test_parse_interval() { + let config = IntervalParseConfig::new(IntervalUnit::Month); + + assert_eq!( + Interval::new(1i32, 0i32, 0i64), + Interval::parse("1 month", &config).unwrap(), + ); + + assert_eq!( + Interval::new(2i32, 0i32, 0i64), + Interval::parse("2 month", &config).unwrap(), + ); + + assert_eq!( + Interval::new(-1i32, -18i32, -(NANOS_PER_DAY / 5)), + Interval::parse("-1.5 months -3.2 days", &config).unwrap(), + ); + + assert_eq!( + Interval::new(0i32, 15i32, 0), + Interval::parse("0.5 months", &config).unwrap(), + ); + + assert_eq!( + Interval::new(0i32, 15i32, 0), + Interval::parse(".5 months", &config).unwrap(), + ); + + assert_eq!( + Interval::new(0i32, -15i32, 0), + Interval::parse("-0.5 months", &config).unwrap(), + ); + + assert_eq!( + Interval::new(0i32, -15i32, 0), + Interval::parse("-.5 months", &config).unwrap(), + ); + + assert_eq!( + Interval::new(2i32, 10i32, 9 * NANOS_PER_HOUR), + Interval::parse("2.1 months 7.25 days 3 hours", &config).unwrap(), + ); + + assert_eq!( + Interval::parse("1 centurys 1 month", &config) + .unwrap_err() + .to_string(), + r#"Parser error: Invalid input syntax for type interval: "1 centurys 1 month""# + ); + + assert_eq!( + Interval::new(37i32, 0i32, 0i64), + Interval::parse("3 year 1 month", &config).unwrap(), + ); + + assert_eq!( + Interval::new(35i32, 0i32, 0i64), + Interval::parse("3 year -1 month", &config).unwrap(), + ); + + assert_eq!( + Interval::new(-37i32, 0i32, 0i64), + Interval::parse("-3 year -1 month", &config).unwrap(), + ); + + assert_eq!( + Interval::new(-35i32, 0i32, 0i64), + Interval::parse("-3 year 1 month", &config).unwrap(), + ); + + assert_eq!( + Interval::new(0i32, 5i32, 0i64), + Interval::parse("5 days", &config).unwrap(), + ); + + assert_eq!( + Interval::new(0i32, 7i32, 3 * NANOS_PER_HOUR), + Interval::parse("7 days 3 hours", &config).unwrap(), + ); + + assert_eq!( + Interval::new(0i32, 7i32, 5 * NANOS_PER_MINUTE), + Interval::parse("7 days 5 minutes", &config).unwrap(), + ); + + assert_eq!( + Interval::new(0i32, 7i32, -5 * NANOS_PER_MINUTE), + Interval::parse("7 days -5 minutes", &config).unwrap(), + ); + + assert_eq!( + Interval::new(0i32, -7i32, 5 * NANOS_PER_HOUR), + Interval::parse("-7 days 5 hours", &config).unwrap(), + ); + + assert_eq!( + Interval::new( + 0i32, + -7i32, + -5 * NANOS_PER_HOUR - 5 * NANOS_PER_MINUTE - 5 * NANOS_PER_SECOND + ), + Interval::parse("-7 days -5 hours -5 minutes -5 seconds", &config).unwrap(), + ); + + assert_eq!( + Interval::new(12i32, 0i32, 25 * NANOS_PER_MILLIS), + Interval::parse("1 year 25 millisecond", &config).unwrap(), + ); + + assert_eq!( + Interval::new( + 12i32, + 1i32, + (NANOS_PER_SECOND as f64 * 0.000000001_f64) as i64 + ), + Interval::parse("1 year 1 day 0.000000001 seconds", &config).unwrap(), + ); + + assert_eq!( + Interval::new(12i32, 1i32, NANOS_PER_MILLIS / 10), + Interval::parse("1 year 1 day 0.1 milliseconds", &config).unwrap(), + ); + + assert_eq!( + Interval::new(12i32, 1i32, 1000i64), + Interval::parse("1 year 1 day 1 microsecond", &config).unwrap(), + ); + + assert_eq!( + Interval::new(12i32, 1i32, 1i64), + Interval::parse("1 year 1 day 1 nanoseconds", &config).unwrap(), + ); + + assert_eq!( + Interval::new(1i32, 0i32, -NANOS_PER_SECOND), + Interval::parse("1 month -1 second", &config).unwrap(), + ); + + assert_eq!( + Interval::new(-13i32, -8i32, -NANOS_PER_HOUR - NANOS_PER_MINUTE - NANOS_PER_SECOND - (1.11_f64 * NANOS_PER_MILLIS as f64) as i64), + Interval::parse("-1 year -1 month -1 week -1 day -1 hour -1 minute -1 second -1.11 millisecond", &config).unwrap(), + ); + } + + #[test] + fn test_duplicate_interval_type() { + let config = IntervalParseConfig::new(IntervalUnit::Month); + + let err = Interval::parse("1 month 1 second 1 second", &config) + .expect_err("parsing interval should have failed"); + assert_eq!( + r#"ParseError("Invalid input syntax for type interval: \"1 month 1 second 1 second\". Repeated type 'second'")"#, + format!("{err:?}") + ); + + // test with singular and plural forms + let err = Interval::parse("1 century 2 centuries", &config) + .expect_err("parsing interval should have failed"); + assert_eq!( + r#"ParseError("Invalid input syntax for type interval: \"1 century 2 centuries\". Repeated type 'centuries'")"#, + format!("{err:?}") + ); + } + + #[test] + fn test_interval_amount_parsing() { + // integer + let result = IntervalAmount::from_str("123").unwrap(); + let expected = IntervalAmount::new(123, 0); + + assert_eq!(result, expected); + + // positive w/ fractional + let result = IntervalAmount::from_str("0.3").unwrap(); + let expected = IntervalAmount::new(0, 3 * 10_i64.pow(INTERVAL_PRECISION - 1)); + + assert_eq!(result, expected); + + // negative w/ fractional + let result = IntervalAmount::from_str("-3.5").unwrap(); + let expected = IntervalAmount::new(-3, -5 * 10_i64.pow(INTERVAL_PRECISION - 1)); + + assert_eq!(result, expected); + + // invalid: missing fractional + let result = IntervalAmount::from_str("3."); + assert!(result.is_err()); + + // invalid: sign in fractional + let result = IntervalAmount::from_str("3.-5"); + assert!(result.is_err()); + } + + #[test] + fn test_interval_precision() { + let config = IntervalParseConfig::new(IntervalUnit::Month); + + let result = Interval::parse("100000.1 days", &config).unwrap(); + let expected = Interval::new(0_i32, 100_000_i32, NANOS_PER_DAY / 10); + + assert_eq!(result, expected); + } + + #[test] + fn test_interval_addition() { + // add 4.1 centuries + let start = Interval::new(1, 2, 3); + let expected = Interval::new(4921, 2, 3); + + let result = start + .add( + IntervalAmount::new(4, 10_i64.pow(INTERVAL_PRECISION - 1)), + IntervalUnit::Century, + ) + .unwrap(); + + assert_eq!(result, expected); + + // add 10.25 decades + let start = Interval::new(1, 2, 3); + let expected = Interval::new(1231, 2, 3); + + let result = start + .add( + IntervalAmount::new(10, 25 * 10_i64.pow(INTERVAL_PRECISION - 2)), + IntervalUnit::Decade, + ) + .unwrap(); + + assert_eq!(result, expected); + + // add 30.3 years (reminder: Postgres logic does not spill to days/nanos when interval is larger than a month) + let start = Interval::new(1, 2, 3); + let expected = Interval::new(364, 2, 3); + + let result = start + .add( + IntervalAmount::new(30, 3 * 10_i64.pow(INTERVAL_PRECISION - 1)), + IntervalUnit::Year, + ) + .unwrap(); + + assert_eq!(result, expected); + + // add 1.5 months + let start = Interval::new(1, 2, 3); + let expected = Interval::new(2, 17, 3); + + let result = start + .add( + IntervalAmount::new(1, 5 * 10_i64.pow(INTERVAL_PRECISION - 1)), + IntervalUnit::Month, + ) + .unwrap(); + + assert_eq!(result, expected); + + // add -2 weeks + let start = Interval::new(1, 25, 3); + let expected = Interval::new(1, 11, 3); + + let result = start + .add(IntervalAmount::new(-2, 0), IntervalUnit::Week) + .unwrap(); + + assert_eq!(result, expected); + + // add 2.2 days + let start = Interval::new(12, 15, 3); + let expected = Interval::new(12, 17, 3 + 17_280 * NANOS_PER_SECOND); + + let result = start + .add( + IntervalAmount::new(2, 2 * 10_i64.pow(INTERVAL_PRECISION - 1)), + IntervalUnit::Day, + ) + .unwrap(); + + assert_eq!(result, expected); + + // add 12.5 hours + let start = Interval::new(1, 2, 3); + let expected = Interval::new(1, 2, 3 + 45_000 * NANOS_PER_SECOND); + + let result = start + .add( + IntervalAmount::new(12, 5 * 10_i64.pow(INTERVAL_PRECISION - 1)), + IntervalUnit::Hour, + ) + .unwrap(); + + assert_eq!(result, expected); + + // add -1.5 minutes + let start = Interval::new(0, 0, -3); + let expected = Interval::new(0, 0, -90_000_000_000 - 3); + + let result = start + .add( + IntervalAmount::new(-1, -5 * 10_i64.pow(INTERVAL_PRECISION - 1)), + IntervalUnit::Minute, + ) + .unwrap(); + + assert_eq!(result, expected); + } + + #[test] + fn string_to_timestamp_old() { + parse_timestamp("1677-06-14T07:29:01.256") + .map_err(|e| assert!(e.to_string().ends_with(ERR_NANOSECONDS_NOT_SUPPORTED))) + .unwrap_err(); + } + + #[test] + fn test_parse_decimal_with_parameter() { + let tests = [ + ("0", 0i128), + ("123.123", 123123i128), + ("123.1234", 123123i128), + ("123.1", 123100i128), + ("123", 123000i128), + ("-123.123", -123123i128), + ("-123.1234", -123123i128), + ("-123.1", -123100i128), + ("-123", -123000i128), + ("0.0000123", 0i128), + ("12.", 12000i128), + ("-12.", -12000i128), + ("00.1", 100i128), + ("-00.1", -100i128), + ("12345678912345678.1234", 12345678912345678123i128), + ("-12345678912345678.1234", -12345678912345678123i128), + ("99999999999999999.999", 99999999999999999999i128), + ("-99999999999999999.999", -99999999999999999999i128), + (".123", 123i128), + ("-.123", -123i128), + ("123.", 123000i128), + ("-123.", -123000i128), + ]; + for (s, i) in tests { + let result_128 = parse_decimal::(s, 20, 3); + assert_eq!(i, result_128.unwrap()); + let result_256 = parse_decimal::(s, 20, 3); + assert_eq!(i256::from_i128(i), result_256.unwrap()); + } + let can_not_parse_tests = ["123,123", ".", "123.123.123", "", "+", "-"]; + for s in can_not_parse_tests { + let result_128 = parse_decimal::(s, 20, 3); + assert_eq!( + format!("Parser error: can't parse the string value {s} to decimal"), + result_128.unwrap_err().to_string() + ); + let result_256 = parse_decimal::(s, 20, 3); + assert_eq!( + format!("Parser error: can't parse the string value {s} to decimal"), + result_256.unwrap_err().to_string() + ); + } + let overflow_parse_tests = ["12345678", "12345678.9", "99999999.99"]; + for s in overflow_parse_tests { + let result_128 = parse_decimal::(s, 10, 3); + let expected_128 = "Parser error: parse decimal overflow"; + let actual_128 = result_128.unwrap_err().to_string(); + + assert!( + actual_128.contains(expected_128), + "actual: '{actual_128}', expected: '{expected_128}'" + ); + + let result_256 = parse_decimal::(s, 10, 3); + let expected_256 = "Parser error: parse decimal overflow"; + let actual_256 = result_256.unwrap_err().to_string(); + + assert!( + actual_256.contains(expected_256), + "actual: '{actual_256}', expected: '{expected_256}'" + ); + } + + let edge_tests_128 = [ + ( + "99999999999999999999999999999999999999", + 99999999999999999999999999999999999999i128, + 0, + ), + ( + "999999999999999999999999999999999999.99", + 99999999999999999999999999999999999999i128, + 2, + ), + ( + "9999999999999999999999999.9999999999999", + 99999999999999999999999999999999999999i128, + 13, + ), + ( + "9999999999999999999999999", + 99999999999999999999999990000000000000i128, + 13, + ), + ( + "0.99999999999999999999999999999999999999", + 99999999999999999999999999999999999999i128, + 38, + ), + ]; + for (s, i, scale) in edge_tests_128 { + let result_128 = parse_decimal::(s, 38, scale); + assert_eq!(i, result_128.unwrap()); + } + let edge_tests_256 = [ + ( + "9999999999999999999999999999999999999999999999999999999999999999999999999999", +i256::from_string("9999999999999999999999999999999999999999999999999999999999999999999999999999").unwrap(), + 0, + ), + ( + "999999999999999999999999999999999999999999999999999999999999999999999999.9999", + i256::from_string("9999999999999999999999999999999999999999999999999999999999999999999999999999").unwrap(), + 4, + ), + ( + "99999999999999999999999999999999999999999999999999.99999999999999999999999999", + i256::from_string("9999999999999999999999999999999999999999999999999999999999999999999999999999").unwrap(), + 26, + ), + ( + "99999999999999999999999999999999999999999999999999", + i256::from_string("9999999999999999999999999999999999999999999999999900000000000000000000000000").unwrap(), + 26, + ), + ]; + for (s, i, scale) in edge_tests_256 { + let result = parse_decimal::(s, 76, scale); + assert_eq!(i, result.unwrap()); + } + } +}