Skip to content

Commit

Permalink
0.0.178
Browse files Browse the repository at this point in the history
  • Loading branch information
joocer committed Jan 12, 2025
1 parent 371a6a4 commit 1c9e4d5
Show file tree
Hide file tree
Showing 6 changed files with 29 additions and 29 deletions.
11 changes: 5 additions & 6 deletions orso/compute/compiled.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -85,33 +85,32 @@ cpdef tuple extract_dict_columns(dict data, tuple fields):
return tuple(field_data) # Convert list to tuple


cpdef list collect_cython(list rows, cnp.ndarray[cnp.int32_t, ndim=1] columns, int limit=-1):
cpdef cnp.ndarray collect_cython(list rows, cnp.ndarray[cnp.int32_t, ndim=1] columns, int limit=-1):
"""
Collects columns from a list of tuples (rows).
"""
cdef int32_t i, j, col_idx
cdef int32_t num_rows = len(rows)
cdef int32_t num_cols = columns.shape[0]
cdef list row
cdef cnp.ndarray row

if limit >= 0 and limit < num_rows:
num_rows = limit

# Initialize result memory view with pre-allocated numpy arrays for each column
cdef list result = [list([None] * num_rows) for _ in range(num_cols)]
cdef cnp.ndarray result = np.empty((num_cols, num_rows), dtype=object)

# Populate each column one at a time
for j in range(num_cols):
col_idx = columns[j]
row = result[j]
for i in range(num_rows):
row[i] = rows[i][col_idx]
result[j, i] = rows[i][col_idx]

# Convert each column back to a list and return the list of lists
return result


cpdef int calculate_data_width(list column_values):
cpdef int calculate_data_width(cnp.ndarray column_values):
cdef int width, max_width
cdef object value

Expand Down
3 changes: 3 additions & 0 deletions orso/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,9 @@ class FlatColumn:
precision: Optional[int] = None
scale: Optional[int] = None
origin: Optional[List[str]] = field(default_factory=list)
highest_value: Optional[Any] = None
lowest_value: Optional[Any] = None
null_count: Optional[int] = None

def __init__(self, **kwargs):
attributes = {f.name: f for f in fields(self.__class__)}
Expand Down
10 changes: 4 additions & 6 deletions orso/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,9 +234,7 @@ def report(self: Callable) -> str:
Returns:
str: The formatted report string.
"""
stats = (
f"\nExecution Statistics for `{self.__name__}`\n " f"Count : {self.count}\n" # type:ignore
)
stats = f"\nExecution Statistics for `{self.__name__}`\n Count : {self.count}\n" # type:ignore
if self.count > 0: # type:ignore
stats += f" Average : {sum(self._run_times) / self.count} seconds\n" # type:ignore
stats += f" Slowest : {min(self._run_times)} seconds\n" # type:ignore
Expand Down Expand Up @@ -391,7 +389,7 @@ def _monitor():
peak_memory = memory_info

print(f"Peak CPU usage: {peak_cpu:.2f}%")
print(f"Peak memory usage: {peak_memory/1024/1024:.2f} MB")
print(f"Peak memory usage: {peak_memory / 1024 / 1024:.2f} MB")

monitor_thread = threading.Thread(target=_monitor)
monitor_thread.start()
Expand All @@ -400,12 +398,12 @@ def _monitor():
start_time = time.monotonic_ns()
result = func(*args, **kwargs)
end_time = time.monotonic_ns()
print(f"Execution time: {(end_time - start_time)/1e9:.6f} seconds")
print(f"Execution time: {(end_time - start_time) / 1e9:.6f} seconds")
return result
except Exception as e:
print(f"Error raised: {type(e).__name__}")
end_time = time.monotonic_ns()
print(f"Execution time: {(end_time - start_time)/1e9:.6f} seconds")
print(f"Execution time: {(end_time - start_time) / 1e9:.6f} seconds")
raise e
finally:
stop_flag = True
Expand Down
2 changes: 1 addition & 1 deletion orso/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,5 @@
# See the License for the specific language governing permissions and
# limitations under the License.

__version__: str = "0.0.177"
__version__: str = "0.0.178"
__author__: str = "@joocer"
20 changes: 10 additions & 10 deletions tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,28 +52,28 @@ def test_dataframe_materialize():
def test_dataframe_collect():
dataframe = create_dataframe()
result = dataframe.collect(["A", "C"])
assert result == [[1, 2, 3, 4, 5], [1.1, 2.2, 3.3, 4.4, 5.5]], result
assert result.tolist() == [[1, 2, 3, 4, 5], [1.1, 2.2, 3.3, 4.4, 5.5]], result

result = dataframe.collect("A")
assert result == [1, 2, 3, 4, 5], result
assert result.tolist() == [1, 2, 3, 4, 5], result


def test_dataframe_get_item():
dataframe = create_dataframe()
result = dataframe["A", "C"]
assert result == [[1, 2, 3, 4, 5], [1.1, 2.2, 3.3, 4.4, 5.5]], result
assert result.tolist() == [[1, 2, 3, 4, 5], [1.1, 2.2, 3.3, 4.4, 5.5]], result

result = dataframe[0, 2]
assert result == [[1, 2, 3, 4, 5], [1.1, 2.2, 3.3, 4.4, 5.5]], result
assert result.tolist() == [[1, 2, 3, 4, 5], [1.1, 2.2, 3.3, 4.4, 5.5]], result

result = dataframe["A", 2]
assert result == [[1, 2, 3, 4, 5], [1.1, 2.2, 3.3, 4.4, 5.5]], result
assert result.tolist() == [[1, 2, 3, 4, 5], [1.1, 2.2, 3.3, 4.4, 5.5]], result

result = dataframe["A"]
assert result == [1, 2, 3, 4, 5], result
assert result.tolist() == [1, 2, 3, 4, 5], result

result = dataframe[0]
assert result == [1, 2, 3, 4, 5], result
assert result.tolist() == [1, 2, 3, 4, 5], result


def test_dataframe_slice():
Expand Down Expand Up @@ -132,7 +132,7 @@ def test_dataframe_filter():
mask = [row[0] > 2 for row in dataframe]
filtered_dataframe = dataframe.filter(mask)
assert len(filtered_dataframe) == 3
assert filtered_dataframe.collect("A") == [3, 4, 5], filtered_dataframe.collect(["A"])
assert filtered_dataframe.collect("A").tolist() == [3, 4, 5], filtered_dataframe.collect(["A"])


def test_take():
Expand Down Expand Up @@ -283,7 +283,7 @@ def test_build_and_then_profile():

p = df.profile.to_dataframe()
assert p.rowcount == df.columncount
assert p.collect("count") == [df.rowcount] * df.columncount
assert p.collect("count").tolist() == [df.rowcount] * df.columncount


def test_describe():
Expand Down Expand Up @@ -339,7 +339,7 @@ def test_adding_dicts_in_wrong_order():
df.append({"column_2": "two", "column_1": 2})

# we should have them in the correct order when we extract them
assert df.collect(["column_1", "column_2"]) == [[1, 2], ["one", "two"]], df.collect(
assert df.collect(["column_1", "column_2"]).tolist() == [[1, 2], ["one", "two"]], df.collect(
["column_1", "column_2"]
)

Expand Down
12 changes: 6 additions & 6 deletions tests/test_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def test_can_profile():
profile = df.profile.to_dataframe()

assert profile.shape == (6, 12), profile.shape
assert profile.collect("count") == [20] * 6
assert profile.collect("count").tolist() == [20] * 6


def test_opteryx_profile_planets():
Expand All @@ -25,7 +25,7 @@ def test_opteryx_profile_planets():
planets = opteryx.query("SELECT * FROM $planets")
profile = planets.profile.to_dataframe()
assert profile.shape == (20, 12), profile.shape
assert profile.collect("count") == [9] * 20
assert profile.collect("count").tolist() == [9] * 20
except ImportError:
# if Opteryx isn't installed, don't fail
pass
Expand All @@ -39,7 +39,7 @@ def test_opteryx_profile_satellites():
planets = opteryx.query("SELECT * FROM $satellites")
profile = planets.profile.to_dataframe()
assert profile.shape == (8, 12), profile.shape
assert profile.collect("count") == [177] * 8
assert profile.collect("count").tolist() == [177] * 8
except ImportError:
# if Opteryx isn't installed, don't fail
pass
Expand All @@ -53,7 +53,7 @@ def test_opteryx_profile_astronauts():
planets = opteryx.query("SELECT * FROM $astronauts")
profile = planets.profile.to_dataframe()
assert profile.shape == (19, 12), profile.shape
assert profile.collect("count") == [357] * 19
assert profile.collect("count").tolist() == [357] * 19
except ImportError:
# if Opteryx isn't installed, don't fail
pass
Expand All @@ -67,7 +67,7 @@ def test_opteryx_profile_missions():
missions = opteryx.query("SELECT * FROM $missions")
profile = missions.profile.to_dataframe()
assert profile.shape == (8, 12), profile.shape
assert profile.collect("count") == [4630] * 8, profile.collect("count")
assert profile.collect("count").tolist() == [4630] * 8, profile.collect("count")
except ImportError:
# if Opteryx isn't installed, don't fail
pass
Expand All @@ -81,7 +81,7 @@ def test_opteryx_profile_fake():
planets = opteryx.query("SELECT * FROM FAKE(100, 100) AS FK")
profile = planets.profile.to_dataframe()
assert profile.shape == (100, 12), profile.shape
assert profile.collect("count") == [100] * 100
assert profile.collect("count").tolist() == [100] * 100
except ImportError:
# if Opteryx isn't installed, don't fail
pass
Expand Down

0 comments on commit 1c9e4d5

Please sign in to comment.