diff --git a/Cargo.lock b/Cargo.lock index 52521e5ad..d7bf55126 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -702,6 +702,29 @@ dependencies = [ "syn", ] +[[package]] +name = "docgen" +version = "0.0.92" +dependencies = [ + "logutil", + "rayexec_bullet", + "rayexec_csv", + "rayexec_debug", + "rayexec_delta", + "rayexec_error", + "rayexec_execution", + "rayexec_iceberg", + "rayexec_parquet", + "rayexec_postgres", + "rayexec_rt_native", + "rayexec_server", + "rayexec_shell", + "rayexec_unity_catalog", + "regex", + "tokio", + "tracing", +] + [[package]] name = "dyn-clone" version = "1.0.17" diff --git a/crates/docgen/Cargo.toml b/crates/docgen/Cargo.toml new file mode 100644 index 000000000..fa9c0cca9 --- /dev/null +++ b/crates/docgen/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "docgen" +version.workspace = true +edition.workspace = true + +[dependencies] +logutil = { path = '../logutil' } +rayexec_error = { path = '../rayexec_error' } +rayexec_execution = { path = '../rayexec_execution' } +rayexec_server = { path = '../rayexec_server' } +rayexec_shell = { path = '../rayexec_shell' } +rayexec_rt_native = { path = '../rayexec_rt_native' } +rayexec_bullet = { path = '../rayexec_bullet' } +rayexec_postgres = { path = '../rayexec_postgres' } +rayexec_parquet = { path = '../rayexec_parquet' } +rayexec_csv = { path = '../rayexec_csv' } +rayexec_delta = { path = '../rayexec_delta' } +rayexec_unity_catalog = { path = '../rayexec_unity_catalog' } +rayexec_iceberg = { path = '../rayexec_iceberg' } +rayexec_debug = { path = '../rayexec_debug' } +regex = { workspace = true } +tracing = { workspace = true } +tokio = { workspace = true, default-features = false, features = ["rt", "rt-multi-thread", "time", "net"] } diff --git a/crates/docgen/src/file.rs b/crates/docgen/src/file.rs new file mode 100644 index 000000000..c40a45ecc --- /dev/null +++ b/crates/docgen/src/file.rs @@ -0,0 +1,118 @@ +use std::fmt::Write as _; +use std::fs; +use std::io::{BufRead, BufReader, BufWriter, 
Write}; +use std::sync::LazyLock; + +use rayexec_error::{RayexecError, Result}; +use regex::Regex; +use tracing::info; + +use crate::section::SectionWriter; +use crate::session::DocsSession; +// NOTE(review): marker patterns reconstructed after markup stripping; group 1 is the section name. Confirm exact marker syntax. +static DOCSGEN_START_REGEX: LazyLock<Regex> = + LazyLock::new(|| Regex::new(r"<!--\s*DOCSGEN_START\s+([a-zA-Z0-9_]+)\s*-->").unwrap()); + +static DOCSGEN_END_REGEX: LazyLock<Regex> = + LazyLock::new(|| Regex::new(r"<!--\s*DOCSGEN_END\s*-->").unwrap()); + +fn expand_path(path: &str) -> String { + format!("{}/../../{}", env!("CARGO_MANIFEST_DIR"), path) +} + +#[derive(Debug)] +pub struct DocFile { + pub path: &'static str, + pub sections: &'static [(&'static str, &'static dyn SectionWriter)], +} + +impl DocFile { + pub fn overwrite(&self, session: &DocsSession) -> Result<()> { + let path = expand_path(self.path); + info!(%path, "expanded path"); + + let file = fs::OpenOptions::new().read(true).write(true).open(&path)?; + + let lines: Vec<String> = BufReader::new(&file).lines().collect::<Result<_, _>>()?; + + // Write to buffer instead of file directly in case we error early. + let mut buf = String::new(); + + let mut in_docsgen_section = false; + + for (idx, line) in lines.iter().enumerate() { + match DOCSGEN_START_REGEX.captures(line) { + Some(captures) => { + if in_docsgen_section { + return Err(RayexecError::new("Cannot nest docsgen sections") + .with_field("line_number", idx + 1)); + } + in_docsgen_section = true; + + let section_name = captures.get(1).unwrap().as_str(); + + let section = self + .sections + .iter() + .find_map(|(name, section)| { + if *name == section_name { + Some(section) + } else { + None + } + }) + .ok_or_else(|| { + RayexecError::new(format!("Missing docs section: {section_name}")) + })?; + + // Write original line + extra newline + writeln!(buf, "{}", line)?; + writeln!(buf)?; + + // Write out section. 
+ section.write(session, &mut buf)?; + } + None => { + if DOCSGEN_END_REGEX.is_match(line.as_str()) { + if !in_docsgen_section { + return Err(RayexecError::new( + "Found DOCSGEN_END tag when not in a docsgen section", + ) + .with_field("line_number", idx + 1)); + } + + in_docsgen_section = false; + + // Write extra newline + original line + writeln!(buf)?; + writeln!(buf, "{}", line)?; + } else { + // Only write out stuff outside of the docgen section. + // We already wrote the new output, so we need to + // discard the old stuff. + if !in_docsgen_section { + writeln!(buf, "{}", line)?; + } + } + } + } + } + + if in_docsgen_section { + return Err(RayexecError::new( + "Reached end of file, still in docsgen section", + )); + } + + let file = fs::OpenOptions::new() + .write(true) + .truncate(true) + .open(&path)?; + + let mut writer = BufWriter::new(file); + writer.write_all(buf.as_bytes())?; + writer.flush()?; + + Ok(()) + } +} diff --git a/crates/docgen/src/main.rs b/crates/docgen/src/main.rs new file mode 100644 index 000000000..16e85f504 --- /dev/null +++ b/crates/docgen/src/main.rs @@ -0,0 +1,57 @@ +mod file; +mod markdown_table; +mod section; +mod session; + +use file::DocFile; +use rayexec_csv::CsvDataSource; +use rayexec_delta::DeltaDataSource; +use rayexec_error::Result; +use rayexec_execution::datasource::{DataSourceBuilder, DataSourceRegistry, MemoryDataSource}; +use rayexec_iceberg::IcebergDataSource; +use rayexec_parquet::ParquetDataSource; +use rayexec_postgres::PostgresDataSource; +use rayexec_rt_native::runtime::{NativeRuntime, ThreadedNativeExecutor}; +use rayexec_shell::session::SingleUserEngine; +use rayexec_unity_catalog::UnityCatalogDataSource; +use section::{AggregateFunctionWriter, ScalarFunctionWriter, TableFunctionWriter}; +use session::DocsSession; +use tracing::info; + +const FILES: &[DocFile] = &[DocFile { + path: "docs/sql/functions.md", + sections: &[ + ("scalar_functions", &ScalarFunctionWriter), + ("aggregate_functions", 
&AggregateFunctionWriter), + ("table_functions", &TableFunctionWriter), + ], +}]; + +fn main() -> Result<()> { + logutil::configure_global_logger(tracing::Level::INFO, logutil::LogFormat::HumanReadable); + + info!("starting docs gen"); + + let executor = ThreadedNativeExecutor::try_new().unwrap(); + let runtime = NativeRuntime::with_default_tokio().unwrap(); + + let registry = DataSourceRegistry::default() + .with_datasource("memory", Box::new(MemoryDataSource))? + .with_datasource("postgres", PostgresDataSource::initialize(runtime.clone()))? + .with_datasource("delta", DeltaDataSource::initialize(runtime.clone()))? + .with_datasource("unity", UnityCatalogDataSource::initialize(runtime.clone()))? + .with_datasource("parquet", ParquetDataSource::initialize(runtime.clone()))? + .with_datasource("csv", CsvDataSource::initialize(runtime.clone()))? + .with_datasource("iceberg", IcebergDataSource::initialize(runtime.clone()))?; + let engine = SingleUserEngine::try_new(executor, runtime, registry)?; + let session = DocsSession { engine }; + + for file in FILES { + info!(%file.path, "handling file"); + file.overwrite(&session)?; + } + + info!("completed all files"); + + Ok(()) +} diff --git a/crates/docgen/src/markdown_table.rs b/crates/docgen/src/markdown_table.rs new file mode 100644 index 000000000..ef0cea532 --- /dev/null +++ b/crates/docgen/src/markdown_table.rs @@ -0,0 +1,89 @@ +use std::fmt; + +use rayexec_bullet::batch::Batch; +use rayexec_bullet::field::Schema; +use rayexec_bullet::format::{FormatOptions, Formatter}; +use rayexec_error::Result; + +const FORMATTER: Formatter = Formatter::new(FormatOptions { + null: "", + empty_string: "", +}); + +pub fn write_markdown_table<'a>( + output: &mut dyn fmt::Write, + schema: &Schema, + batches: impl IntoIterator<Item = &'a Batch>, +) -> Result<()> { + // 'field1 | field2 | field3' + let header = schema + .fields + .iter() + .map(|f| f.name.clone()) + .collect::<Vec<_>>() + .join(" | "); + + writeln!(output, "| {header} |")?; + + // ' --- | --- | 
---' + let sep = schema + .fields + .iter() + .map(|_| "---") + .collect::<Vec<_>>() + .join(" | "); + + writeln!(output, "| {sep} |")?; + + for batch in batches { + for row in 0..batch.num_rows() { + for (idx, column) in batch.columns().iter().enumerate() { + if idx == 0 { + write!(output, "|")?; + } + + let val = FORMATTER.format_array_value(column, row)?; + write!(output, " {val} |")?; + } + writeln!(output)?; + } + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use rayexec_bullet::array::Array; + use rayexec_bullet::datatype::DataType; + use rayexec_bullet::field::Field; + + use super::*; + + #[test] + fn simple() { + let batch = Batch::try_new([ + Array::from_iter([1, 2, 3]), + Array::from_iter(["cat", "dog", "mouse"]), + ]) + .unwrap(); + + let schema = Schema::new([ + Field::new("Numbers", DataType::Int32, false), + Field::new("Strings", DataType::Utf8, false), + ]); + + let mut buf = String::new(); + + write_markdown_table(&mut buf, &schema, [&batch]).unwrap(); + + let expected = r#"| Numbers | Strings | +| --- | --- | +| 1 | cat | +| 2 | dog | +| 3 | mouse | +"#; + + assert_eq!(expected, buf); + } +} diff --git a/crates/docgen/src/section.rs b/crates/docgen/src/section.rs new file mode 100644 index 000000000..32675e462 --- /dev/null +++ b/crates/docgen/src/section.rs @@ -0,0 +1,70 @@ +use std::fmt::{self, Debug}; + +use rayexec_error::Result; + +use crate::markdown_table::write_markdown_table; +use crate::session::DocsSession; + +pub trait SectionWriter: Debug { + fn write(&self, session: &DocsSession, output: &mut dyn fmt::Write) -> Result<()>; +} + +const SCALAR_FUNCTIONS_QUERY: &str = r#" +SELECT + function_name as "Function name", + description as "Description" +FROM list_functions() +WHERE function_type = 'scalar' +GROUP BY "Function name", "Description" +ORDER BY "Function name"; +"#; + +#[derive(Debug)] +pub struct ScalarFunctionWriter; + +impl SectionWriter for ScalarFunctionWriter { + fn write(&self, session: &DocsSession, output: &mut dyn fmt::Write) -> 
Result<()> { + let table = session.query(SCALAR_FUNCTIONS_QUERY)?; + write_markdown_table(output, table.schema(), table.iter_batches()) + } +} + +const AGGREGATE_FUNCTIONS_QUERY: &str = r#" +SELECT + function_name as "Function name", + description as "Description" +FROM list_functions() +WHERE function_type = 'aggregate' +GROUP BY "Function name", "Description" +ORDER BY "Function name"; +"#; + +#[derive(Debug)] +pub struct AggregateFunctionWriter; + +impl SectionWriter for AggregateFunctionWriter { + fn write(&self, session: &DocsSession, output: &mut dyn fmt::Write) -> Result<()> { + let table = session.query(AGGREGATE_FUNCTIONS_QUERY)?; + write_markdown_table(output, table.schema(), table.iter_batches()) + } +} + +const TABLE_FUNCTIONS_QUERY: &str = r#" +SELECT + function_name as "Function name", + description as "Description" +FROM list_functions() +WHERE function_type = 'table' +GROUP BY "Function name", "Description" +ORDER BY "Function name"; +"#; + +#[derive(Debug)] +pub struct TableFunctionWriter; + +impl SectionWriter for TableFunctionWriter { + fn write(&self, session: &DocsSession, output: &mut dyn fmt::Write) -> Result<()> { + let table = session.query(TABLE_FUNCTIONS_QUERY)?; + write_markdown_table(output, table.schema(), table.iter_batches()) + } +} diff --git a/crates/docgen/src/session.rs b/crates/docgen/src/session.rs new file mode 100644 index 000000000..adeecb6ac --- /dev/null +++ b/crates/docgen/src/session.rs @@ -0,0 +1,26 @@ +use rayexec_error::Result; +use rayexec_execution::runtime::{Runtime, TokioHandlerProvider}; +use rayexec_rt_native::runtime::{NativeRuntime, ThreadedNativeExecutor}; +use rayexec_shell::result_table::MaterializedResultTable; +use rayexec_shell::session::SingleUserEngine; + +#[derive(Debug)] +pub struct DocsSession { + pub engine: SingleUserEngine<ThreadedNativeExecutor, NativeRuntime>, +} + +impl DocsSession { + pub fn query(&self, sql: &str) -> Result<MaterializedResultTable> { + let handle = self.engine.runtime.tokio_handle().handle()?; + let fut = 
self.engine.session().query(sql); + + let result: Result<MaterializedResultTable> = handle.block_on(async move { + let table = fut.await?; + let materialized = table.collect().await?; + + Ok(materialized) + }); + + result + } +} diff --git a/docs/sql/functions.md b/docs/sql/functions.md new file mode 100644 index 000000000..3be374e40 --- /dev/null +++ b/docs/sql/functions.md @@ -0,0 +1,160 @@ +# Functions! + +## Scalar functions + + + +| Function name | Description | +| --- | --- | +| != | Check if two values are not equal. Returns NULL if either argument is NULL. | +| % | | +| * | | +| + | | +| - | | +| / | | +| < | Check if the left argument is less than the right. Returns NULL if either argument is NULL. | +| <= | Check if the left argument is less than or equal to the right. Returns NULL if either argument is NULL. | +| <> | Check if two values are not equal. Returns NULL if either argument is NULL. | +| = | Check if two values are equal. Returns NULL if either argument is NULL. | +| > | Check if the left argument is greater than the right. Returns NULL if either argument is NULL. | +| >= | Check if the left argument is greater than or equal to the right. Returns NULL if either argument is NULL. | +| abs | | +| acos | | +| add | | +| and | Boolean and all inputs. | +| array_distance | Compute the Euclidean distance between two lists. Both lists must be the same length and cannot contain NULLs. | +| ascii | Get the ascii code of the first character of the argument. | +| asin | | +| atan | | +| bit_length | Get the number of bits in a string. | +| bit_length | Get the number of bits in a binary blob. | +| btrim | Trim matching characters from both sides of the string. | +| btrim | Trim whitespace from both sides of the string. | +| byte_length | Get the number of bytes in a string. | +| byte_length | Get the number of bytes in a binary blob. | +| cbrt | | +| ceil | | +| char_length | Get the number of characters in a string. 
| +| character_length | Get the number of characters in a string. | +| concat | Concatenate many strings into a single string. | +| contains | Check if string contains a search string. | +| cos | | +| date_part | Get a subfield. | +| date_trunc | | +| degrees | | +| div | | +| ends_with | Check if a string ends with a given suffix. | +| epoch | | +| epoch_ms | | +| epoch_s | | +| exp | | +| floor | | +| is_false | Check if a value is false. | +| is_not_false | Check if a value is not false. | +| is_not_null | Check if a value is not NULL. | +| is_not_true | Check if a value is not true. | +| is_null | Check if a value is NULL. | +| is_true | Check if a value is true. | +| isnan | Return if the given float is a NaN. | +| l2_distance | Compute the Euclidean distance between two lists. Both lists must be the same length and cannot contain NULLs. | +| length | Get the number of characters in a string. | +| like | Check if a string matches the given pattern. | +| list_extract | Extract an item from the list. Uses 1-based indexing. | +| list_values | Create a list from the given values. | +| ln | | +| log | | +| log2 | | +| lower | Convert the string to lowercase. | +| lpad | Left pad a string with another string until the resulting string contains 'count' characters. | +| lpad | Left pad a string with spaces until the resulting string contains 'count' characters. | +| ltrim | Trim whitespace from the left side of the string. | +| ltrim | Trim matching characters from the left side of the string. | +| mod | | +| mul | | +| negate | | +| not | Return the inverse bool of the input. Returns NULL if input is NULL. | +| octet_length | Get the number of bytes in a string. | +| octet_length | Get the number of bytes in a binary blob. | +| or | Boolean or all inputs. | +| prefix | Check if a string starts with a prefix. | +| radians | | +| random | Return a random float. | +| regexp_replace | Replace the first regular expression match in a string. 
| +| rem | | +| repeat | Repeat a string some number of times. | +| rpad | Right pad a string with another string until the resulting string contains 'count' characters. | +| rpad | Right pad a string with spaces until the resulting string contains 'count' characters. | +| rtrim | Trim whitespace from the right side of the string. | +| rtrim | Trim matching characters from the right side of the string. | +| sin | | +| sqrt | | +| starts_with | Check if a string starts with a prefix. | +| struct_pack | | +| sub | | +| substr | Get a substring of a string starting at an index for some number of characters. The index is 1-based. | +| substr | Get a substring of a string starting at an index until the end of the string. The index is 1-based. | +| substring | Get a substring of a string starting at an index until the end of the string. The index is 1-based. | +| substring | Get a substring of a string starting at an index for some number of characters. The index is 1-based. | +| suffix | Check if a string ends with a given suffix. | +| tan | | +| trim | Trim whitespace from both sides of the string. | +| trim | Trim matching characters from both sides of the string. | +| upper | Convert the string to uppercase. | + + + +## Aggregate functions + + + +| Function name | Description | +| --- | --- | +| avg | Return the average value from the inputs. | +| corr | Return the population correlation coefficient. | +| count | Return the count of non-NULL inputs. | +| covar_pop | Compute population covariance. | +| covar_samp | Compute sample covariance. | +| first | Return the first non-NULL value. | +| max | Return the maximum non-NULL value seen from input. | +| min | Return the minimum non-NULL value seen from input. | +| regr_avgx | Compute the average of the independent variable ('x'). | +| regr_avgy | Compute the average of the dependent variable ('y'). | +| regr_count | Compute the count where both inputs are not NULL. 
| +| regr_r2 | Compute the square of the correlation coefficient. | +| regr_slope | Compute the slope of the least-squares-fit linear equation. | +| stddev | Compute the sample standard deviation. | +| stddev_pop | Compute the population standard deviation. | +| stddev_samp | Compute the sample standard deviation. | +| string_agg | Concatenate all non-NULL input string values using a delimiter. | +| sum | Compute the sum of all non-NULL inputs. | +| var_pop | Compute the population variance. | +| var_samp | Compute the sample variance. | + + + +## Table functions + + + +| Function name | Description | +| --- | --- | +| csv_scan | | +| delta_scan | | +| generate_series | Generate a series of values from 'start' to 'end' incrementing by 'step'. 'start' and 'end' are both inclusive. | +| generate_series | Generate a series of values from 'start' to 'end' incrementing by a step of 1. 'start' and 'end' are both inclusive. | +| iceberg_scan | | +| list_databases | | +| list_functions | | +| list_schemas | | +| list_tables | | +| parquet_scan | | +| read_csv | | +| read_delta | | +| read_iceberg | | +| read_parquet | | +| read_postgres | | +| unity_list_schemas | | +| unity_list_tables | | +| unnest | Unnest a list, producing a table of unnested values. | + +