Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make PyO3 bindings an optional feature #14

Merged
merged 3 commits into from
Aug 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@ edition = "2021"

[lib]
name = "outlines_core_rs"
crate-type = ["cdylib"]
crate-type = ["cdylib", "rlib"]

[dependencies]
anyhow = "1.0.86"
pyo3 = { version = "0.22.0", features = ["extension-module"] }
kc611 marked this conversation as resolved.
Show resolved Hide resolved
pyo3 = { version = "0.22.0", features = ["extension-module"], optional=true }
regex = "1.10.6"
serde-pyobject = "0.4.0"
serde_json = { version ="1.0.125", features = ["preserve_order"] }
Expand All @@ -20,3 +20,6 @@ lto = true
codegen-units = 1
strip = true
panic = 'abort'

[features]
python-bindings = ["pyo3"]
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
"outlines_core.fsm.outlines_core_rs",
f"{CURRENT_DIR}/Cargo.toml",
binding=Binding.PyO3,
features=["python-bindings"],
rustc_flags=["--crate-type=cdylib"],
),
]
Expand Down
1 change: 1 addition & 0 deletions src/json_schema/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ impl FormatType {
}
}

#[allow(clippy::should_implement_trait)]
pub fn from_str(s: &str) -> Option<FormatType> {
match s {
"date-time" => Some(FormatType::DateTime),
Expand Down
60 changes: 4 additions & 56 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,57 +1,5 @@
mod json_schema;
mod regex;
pub mod json_schema;
pub mod regex;

use pyo3::exceptions::PyValueError;
use pyo3::prelude::*;
use pyo3::types::PyDict;
use pyo3::wrap_pyfunction;
use regex::_walk_fsm;
use regex::create_fsm_index_end_to_end;
use regex::get_token_transition_keys;
use regex::get_vocabulary_transition_keys;
use regex::state_scan_tokens;
use regex::FSMInfo;
use serde_json::Value;

/// Initializer for the `outlines_core_rs` Python extension module (pre-refactor
/// version that lived directly in `lib.rs`).
///
/// Registers, in this order: the FSM functions imported from `regex`, the
/// `FSMInfo` class, the pattern constants from `json_schema`, and the two JSON
/// schema functions defined in this file. Any registration failure is returned
/// to the caller through `?`.
#[pymodule]
fn outlines_core_rs(m: &Bound<'_, PyModule>) -> PyResult<()> {
// FSM functions; these are imported from `regex` at the top of the file.
m.add_function(wrap_pyfunction!(_walk_fsm, m)?)?;
m.add_function(wrap_pyfunction!(state_scan_tokens, m)?)?;
m.add_function(wrap_pyfunction!(get_token_transition_keys, m)?)?;
m.add_function(wrap_pyfunction!(get_vocabulary_transition_keys, m)?)?;
m.add_function(wrap_pyfunction!(create_fsm_index_end_to_end, m)?)?;

// Class exposed to Python.
m.add_class::<FSMInfo>()?;

// Module-level attributes re-exported from `json_schema` under the same names.
m.add("BOOLEAN", json_schema::BOOLEAN)?;
m.add("DATE", json_schema::DATE)?;
m.add("DATE_TIME", json_schema::DATE_TIME)?;
m.add("INTEGER", json_schema::INTEGER)?;
m.add("NULL", json_schema::NULL)?;
m.add("NUMBER", json_schema::NUMBER)?;
m.add("STRING", json_schema::STRING)?;
m.add("STRING_INNER", json_schema::STRING_INNER)?;
m.add("TIME", json_schema::TIME)?;
m.add("UUID", json_schema::UUID)?;
m.add("WHITESPACE", json_schema::WHITESPACE)?;

// JSON-schema-to-regex entry points defined below in this file.
m.add_function(wrap_pyfunction!(build_regex_from_schema, m)?)?;
m.add_function(wrap_pyfunction!(to_regex, m)?)?;

Ok(())
}

/// Python entry point `build_regex_from_schema(json, whitespace_pattern=None)`.
///
/// Forwards `json` and the optional `whitespace_pattern` to
/// `json_schema::build_regex_from_schema` and returns its string result.
///
/// # Errors
///
/// Any error from the underlying call is re-raised in Python as `ValueError`
/// carrying the error's display text.
#[pyfunction(name = "build_regex_from_schema")]
#[pyo3(signature = (json, whitespace_pattern=None))]
pub fn build_regex_from_schema(json: String, whitespace_pattern: Option<&str>) -> PyResult<String> {
    match json_schema::build_regex_from_schema(&json, whitespace_pattern) {
        Ok(pattern) => Ok(pattern),
        Err(err) => Err(PyValueError::new_err(err.to_string())),
    }
}

/// Python entry point `to_regex(json, whitespace_pattern=None)`.
///
/// Converts the Python `dict` into a `serde_json::Value` and passes it to
/// `json_schema::to_regex`, using the same value as both the schema node and
/// the third (root) argument.
///
/// # Errors
///
/// Raises `ValueError` in Python when the dictionary cannot be converted to a
/// JSON value, or when `json_schema::to_regex` reports an error.
#[pyfunction(name = "to_regex")]
#[pyo3(signature = (json, whitespace_pattern=None))]
pub fn to_regex(json: Bound<PyDict>, whitespace_pattern: Option<&str>) -> PyResult<String> {
    // A failed conversion is input-dependent, so report it as an exception
    // rather than panicking (a panic would take the whole interpreter down).
    let json_value: Value = serde_pyobject::from_pyobject(json)
        .map_err(|e| PyValueError::new_err(e.to_string()))?;
    json_schema::to_regex(&json_value, whitespace_pattern, &json_value)
        .map_err(|e| PyValueError::new_err(e.to_string()))
}
#[cfg(feature = "python-bindings")]
mod python_bindings;
221 changes: 221 additions & 0 deletions src/python_bindings/mod.rs
brandonwillard marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
use crate::json_schema;
use crate::regex::get_token_transition_keys;
use crate::regex::get_vocabulary_transition_keys;
use crate::regex::state_scan_tokens;
use crate::regex::walk_fsm;
use pyo3::exceptions::PyValueError;
use pyo3::prelude::*;
use pyo3::types::PyDict;
use pyo3::wrap_pyfunction;
use serde_json::Value;
use std::collections::{HashMap, HashSet};

/// Python-visible bundle of FSM data consumed by
/// `create_fsm_index_end_to_end_py`.
///
/// Every field carries `#[pyo3(get)]`, so each is readable from Python; no
/// setters are declared here.
#[pyclass]
pub struct FSMInfo {
/// State from which `create_fsm_index_end_to_end_py` begins exploring.
#[pyo3(get)]
initial: u32,
/// Passed as the `fsm_finals` argument of `state_scan_tokens`.
#[pyo3(get)]
finals: HashSet<u32>,
/// Passed as the `fsm_transitions` argument of `state_scan_tokens`.
// NOTE(review): key looks like (state, transition key) -> next state — confirm against `crate::regex`.
#[pyo3(get)]
transitions: HashMap<(u32, u32), u32>,
/// Forwarded to `get_vocabulary_transition_keys` together with the symbol mapping.
#[pyo3(get)]
alphabet_anything_value: u32,
/// Forwarded to `get_vocabulary_transition_keys`.
#[pyo3(get)]
alphabet_symbol_mapping: HashMap<String, u32>,
}

#[pymethods]
impl FSMInfo {
#[new]
fn new(
initial: u32,
finals: HashSet<u32>,
transitions: HashMap<(u32, u32), u32>,
alphabet_anything_value: u32,
alphabet_symbol_mapping: HashMap<String, u32>,
) -> Self {
Self {
initial,
finals,
transitions,
alphabet_anything_value,
alphabet_symbol_mapping,
}
}
}

/// Python entry point `build_regex_from_schema(json, whitespace_pattern=None)`.
///
/// Forwards `json` and the optional `whitespace_pattern` to
/// [`json_schema::build_regex_from_schema`] and returns its string result.
///
/// # Errors
///
/// Any error from the underlying call is re-raised in Python as `ValueError`
/// carrying the error's display text.
#[pyfunction(name = "build_regex_from_schema")]
#[pyo3(signature = (json, whitespace_pattern=None))]
pub fn build_regex_from_schema_py(
    json: String,
    whitespace_pattern: Option<&str>,
) -> PyResult<String> {
    match json_schema::build_regex_from_schema(&json, whitespace_pattern) {
        Ok(pattern) => Ok(pattern),
        Err(err) => Err(PyValueError::new_err(err.to_string())),
    }
}

/// Python entry point `to_regex(json, whitespace_pattern=None)`.
///
/// Converts the Python `dict` into a `serde_json::Value` and passes it to
/// [`json_schema::to_regex`], using the same value as both the schema node and
/// the third (root) argument.
///
/// # Errors
///
/// Raises `ValueError` in Python when the dictionary cannot be converted to a
/// JSON value, or when [`json_schema::to_regex`] reports an error.
#[pyfunction(name = "to_regex")]
#[pyo3(signature = (json, whitespace_pattern=None))]
pub fn to_regex_py(json: Bound<PyDict>, whitespace_pattern: Option<&str>) -> PyResult<String> {
    // A failed conversion is input-dependent, so report it as an exception
    // rather than panicking (a panic would take the whole interpreter down).
    let json_value: Value = serde_pyobject::from_pyobject(json)
        .map_err(|e| PyValueError::new_err(e.to_string()))?;
    json_schema::to_regex(&json_value, whitespace_pattern, &json_value)
        .map_err(|e| PyValueError::new_err(e.to_string()))
}

/// Python wrapper around [`walk_fsm`], exported under the name `_walk_fsm`.
///
/// Receives owned containers built from the Python arguments, lends them to
/// the Rust implementation, and hands its return value back unchanged. The
/// wrapper itself never produces an error.
#[pyfunction(name = "_walk_fsm")]
#[pyo3(
    text_signature = "(fsm_transitions, fsm_initial, fsm_finals, token_transition_keys, start_state, full_match)"
)]
pub fn walk_fsm_py(
    fsm_transitions: HashMap<(u32, u32), u32>,
    fsm_initial: u32,
    fsm_finals: HashSet<u32>,
    token_transition_keys: Vec<u32>,
    start_state: u32,
    full_match: bool,
) -> PyResult<Vec<u32>> {
    let walked = walk_fsm(
        &fsm_transitions,
        fsm_initial,
        &fsm_finals,
        &token_transition_keys,
        start_state,
        full_match,
    );
    Ok(walked)
}

/// Python wrapper around [`state_scan_tokens`].
///
/// Receives owned containers built from the Python arguments, lends them to
/// the Rust implementation, and returns the resulting set of `(u32, u32)`
/// pairs unchanged. The wrapper itself never produces an error.
#[pyfunction(name = "state_scan_tokens")]
#[pyo3(
    text_signature = "(fsm_transitions, fsm_initial, fsm_finals, vocabulary, vocabulary_transition_keys, start_state)"
)]
pub fn state_scan_tokens_py(
    fsm_transitions: HashMap<(u32, u32), u32>,
    fsm_initial: u32,
    fsm_finals: HashSet<u32>,
    vocabulary: Vec<(String, Vec<u32>)>,
    vocabulary_transition_keys: Vec<Vec<u32>>,
    start_state: u32,
) -> PyResult<HashSet<(u32, u32)>> {
    let scanned = state_scan_tokens(
        &fsm_transitions,
        fsm_initial,
        &fsm_finals,
        &vocabulary,
        &vocabulary_transition_keys,
        start_state,
    );
    Ok(scanned)
}

/// Python wrapper around [`get_token_transition_keys`].
///
/// Lends the symbol mapping and token string to the Rust implementation and
/// returns its `Vec<u32>` result unchanged. The wrapper itself never produces
/// an error.
#[pyfunction(name = "get_token_transition_keys")]
#[pyo3(text_signature = "(alphabet_symbol_mapping, alphabet_anything_value, token_str)")]
pub fn get_token_transition_keys_py(
    alphabet_symbol_mapping: HashMap<String, u32>,
    alphabet_anything_value: u32,
    token_str: String,
) -> PyResult<Vec<u32>> {
    let keys = get_token_transition_keys(
        &alphabet_symbol_mapping,
        alphabet_anything_value,
        &token_str,
    );
    Ok(keys)
}

/// Python wrapper around [`get_vocabulary_transition_keys`].
///
/// Lends the symbol mapping, vocabulary and frozen-token set to the Rust
/// implementation and returns its `Vec<Vec<u32>>` result unchanged. The
/// wrapper itself never produces an error.
#[pyfunction(name = "get_vocabulary_transition_keys")]
#[pyo3(
    text_signature = "(alphabet_symbol_mapping, alphabet_anything_value, vocabulary, frozen_tokens)"
)]
pub fn get_vocabulary_transition_keys_py(
    alphabet_symbol_mapping: HashMap<String, u32>,
    alphabet_anything_value: u32,
    vocabulary: Vec<(String, Vec<u32>)>,
    frozen_tokens: HashSet<String>,
) -> PyResult<Vec<Vec<u32>>> {
    let keys_per_token = get_vocabulary_transition_keys(
        &alphabet_symbol_mapping,
        alphabet_anything_value,
        &vocabulary,
        &frozen_tokens,
    );
    Ok(keys_per_token)
}

/// Build a nested Python `dict` of the form
/// `{start_state: {token_id: end_state}}` for an FSM and a vocabulary.
///
/// Exploration starts at `fsm_info.initial`. Each pending state is scanned
/// with [`state_scan_tokens`]; every `(token_id, end_state)` pair it yields is
/// stored in that state's inner `dict`, and any end state not explored yet is
/// queued. A state for which the scan yields no pairs gets no entry.
///
/// The vocabulary transition keys are computed once up front and reused for
/// every scanned state.
///
/// # Errors
///
/// Returns the Python exception raised by any dictionary lookup or insertion
/// instead of panicking.
#[pyfunction(name = "create_fsm_index_end_to_end")]
#[pyo3(text_signature = "(fsm_info, vocabulary, frozen_tokens)")]
pub fn create_fsm_index_end_to_end_py<'py>(
    py: Python<'py>,
    fsm_info: &FSMInfo,
    vocabulary: Vec<(String, Vec<u32>)>,
    frozen_tokens: HashSet<String>,
) -> PyResult<Bound<'py, PyDict>> {
    let states_to_token_subsets = PyDict::new_bound(py);
    let mut seen: HashSet<u32> = HashSet::new();
    let mut next_states: HashSet<u32> = HashSet::from([fsm_info.initial]);

    let vocabulary_transition_keys = get_vocabulary_transition_keys(
        &fsm_info.alphabet_symbol_mapping,
        fsm_info.alphabet_anything_value,
        &vocabulary,
        &frozen_tokens,
    );

    while let Some(start_state) = next_states.iter().next().copied() {
        next_states.remove(&start_state);
        // Mark the state as explored before scanning it so that a transition
        // back to itself does not queue it for a second, redundant scan.
        seen.insert(start_state);

        // TODO: Return Pydict directly at construction
        let token_ids_end_states = state_scan_tokens(
            &fsm_info.transitions,
            fsm_info.initial,
            &fsm_info.finals,
            &vocabulary,
            &vocabulary_transition_keys,
            start_state,
        );

        for (token_id, end_state) in token_ids_end_states {
            match states_to_token_subsets.get_item(start_state)? {
                Some(existing_dict) => existing_dict.set_item(token_id, end_state)?,
                None => {
                    let new_dict = PyDict::new_bound(py);
                    new_dict.set_item(token_id, end_state)?;
                    states_to_token_subsets.set_item(start_state, new_dict)?;
                }
            }

            if !seen.contains(&end_state) {
                next_states.insert(end_state);
            }
        }
    }

    Ok(states_to_token_subsets)
}

/// Initializer for the `outlines_core_rs` Python extension module.
///
/// Registers, in this order: the FSM wrapper functions, the `FSMInfo` class,
/// the pattern constants from `json_schema`, and the two JSON schema wrapper
/// functions. Any registration failure is returned to the caller through `?`.
#[pymodule]
fn outlines_core_rs(m: &Bound<'_, PyModule>) -> PyResult<()> {
// FSM wrappers; each `*_py` function declares its Python-visible name in its
// own `#[pyfunction(name = ...)]` attribute.
m.add_function(wrap_pyfunction!(walk_fsm_py, m)?)?;
m.add_function(wrap_pyfunction!(state_scan_tokens_py, m)?)?;
m.add_function(wrap_pyfunction!(get_token_transition_keys_py, m)?)?;
m.add_function(wrap_pyfunction!(get_vocabulary_transition_keys_py, m)?)?;
m.add_function(wrap_pyfunction!(create_fsm_index_end_to_end_py, m)?)?;

// Class exposed to Python.
m.add_class::<FSMInfo>()?;

// Module-level attributes re-exported from `json_schema` under the same names.
m.add("BOOLEAN", json_schema::BOOLEAN)?;
m.add("DATE", json_schema::DATE)?;
m.add("DATE_TIME", json_schema::DATE_TIME)?;
m.add("INTEGER", json_schema::INTEGER)?;
m.add("NULL", json_schema::NULL)?;
m.add("NUMBER", json_schema::NUMBER)?;
m.add("STRING", json_schema::STRING)?;
m.add("STRING_INNER", json_schema::STRING_INNER)?;
m.add("TIME", json_schema::TIME)?;
m.add("UUID", json_schema::UUID)?;
m.add("WHITESPACE", json_schema::WHITESPACE)?;

// JSON-schema-to-regex wrappers.
m.add_function(wrap_pyfunction!(build_regex_from_schema_py, m)?)?;
m.add_function(wrap_pyfunction!(to_regex_py, m)?)?;

Ok(())
}
Loading
Loading