0.0.178

mabel-dev · Jan 12, 2025 · 1c9e4d5 · 1c9e4d5
1 parent 371a6a4
commit 1c9e4d5
Show file tree

Hide file tree

Showing 6 changed files with 29 additions and 29 deletions.
diff --git a/orso/compute/compiled.pyx b/orso/compute/compiled.pyx
@@ -85,33 +85,32 @@ cpdef tuple extract_dict_columns(dict data, tuple fields):
     return tuple(field_data)  # Convert list to tuple
 
 
-cpdef list collect_cython(list rows, cnp.ndarray[cnp.int32_t, ndim=1] columns, int limit=-1):
+cpdef cnp.ndarray collect_cython(list rows, cnp.ndarray[cnp.int32_t, ndim=1] columns, int limit=-1):
     """
     Collects columns from a list of tuples (rows).
     """
     cdef int32_t i, j, col_idx
     cdef int32_t num_rows = len(rows)
     cdef int32_t num_cols = columns.shape[0]
-    cdef list row
+    cdef cnp.ndarray row
 
     if limit >= 0 and limit < num_rows:
         num_rows = limit
 
     # Initialize result memory view with pre-allocated numpy arrays for each column
-    cdef list result = [list([None] * num_rows) for _ in range(num_cols)]
+    cdef cnp.ndarray result = np.empty((num_cols, num_rows), dtype=object)
 
     # Populate each column one at a time
     for j in range(num_cols):
         col_idx = columns[j]
-        row = result[j]
         for i in range(num_rows):
-            row[i] = rows[i][col_idx]
+            result[j, i] = rows[i][col_idx]
 
     # Convert each column back to a list and return the list of lists
     return result
 
 
-cpdef int calculate_data_width(list column_values):
+cpdef int calculate_data_width(cnp.ndarray column_values):
     cdef int width, max_width
     cdef object value
 

diff --git a/orso/schema.py b/orso/schema.py
@@ -143,6 +143,9 @@ class FlatColumn:
     precision: Optional[int] = None
     scale: Optional[int] = None
     origin: Optional[List[str]] = field(default_factory=list)
+    highest_value: Optional[Any] = None
+    lowest_value: Optional[Any] = None
+    null_count: Optional[int] = None
 
     def __init__(self, **kwargs):
         attributes = {f.name: f for f in fields(self.__class__)}

diff --git a/orso/tools.py b/orso/tools.py
@@ -234,9 +234,7 @@ def report(self: Callable) -> str:
         Returns:
             str: The formatted report string.
         """
-        stats = (
-            f"\nExecution Statistics for `{self.__name__}`\n  " f"Count   : {self.count}\n"  # type:ignore
-        )
+        stats = f"\nExecution Statistics for `{self.__name__}`\n  Count   : {self.count}\n"  # type:ignore
         if self.count > 0:  # type:ignore
             stats += f"  Average : {sum(self._run_times) / self.count} seconds\n"  # type:ignore
             stats += f"  Slowest : {min(self._run_times)} seconds\n"  # type:ignore
@@ -391,7 +389,7 @@ def _monitor():
                         peak_memory = memory_info
 
                 print(f"Peak CPU usage: {peak_cpu:.2f}%")
-                print(f"Peak memory usage: {peak_memory/1024/1024:.2f} MB")
+                print(f"Peak memory usage: {peak_memory / 1024 / 1024:.2f} MB")
 
             monitor_thread = threading.Thread(target=_monitor)
             monitor_thread.start()
@@ -400,12 +398,12 @@ def _monitor():
                 start_time = time.monotonic_ns()
                 result = func(*args, **kwargs)
                 end_time = time.monotonic_ns()
-                print(f"Execution time: {(end_time - start_time)/1e9:.6f} seconds")
+                print(f"Execution time: {(end_time - start_time) / 1e9:.6f} seconds")
                 return result
             except Exception as e:
                 print(f"Error raised: {type(e).__name__}")
                 end_time = time.monotonic_ns()
-                print(f"Execution time: {(end_time - start_time)/1e9:.6f} seconds")
+                print(f"Execution time: {(end_time - start_time) / 1e9:.6f} seconds")
                 raise e
             finally:
                 stop_flag = True

diff --git a/orso/version.py b/orso/version.py
@@ -10,5 +10,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__: str = "0.0.177"
+__version__: str = "0.0.178"
 __author__: str = "@joocer"
diff --git a/tests/test_dataframe.py b/tests/test_dataframe.py
@@ -52,28 +52,28 @@ def test_dataframe_materialize():
 def test_dataframe_collect():
     dataframe = create_dataframe()
     result = dataframe.collect(["A", "C"])
-    assert result == [[1, 2, 3, 4, 5], [1.1, 2.2, 3.3, 4.4, 5.5]], result
+    assert result.tolist() == [[1, 2, 3, 4, 5], [1.1, 2.2, 3.3, 4.4, 5.5]], result
 
     result = dataframe.collect("A")
-    assert result == [1, 2, 3, 4, 5], result
+    assert result.tolist() == [1, 2, 3, 4, 5], result
 
 
 def test_dataframe_get_item():
     dataframe = create_dataframe()
     result = dataframe["A", "C"]
-    assert result == [[1, 2, 3, 4, 5], [1.1, 2.2, 3.3, 4.4, 5.5]], result
+    assert result.tolist() == [[1, 2, 3, 4, 5], [1.1, 2.2, 3.3, 4.4, 5.5]], result
 
     result = dataframe[0, 2]
-    assert result == [[1, 2, 3, 4, 5], [1.1, 2.2, 3.3, 4.4, 5.5]], result
+    assert result.tolist() == [[1, 2, 3, 4, 5], [1.1, 2.2, 3.3, 4.4, 5.5]], result
 
     result = dataframe["A", 2]
-    assert result == [[1, 2, 3, 4, 5], [1.1, 2.2, 3.3, 4.4, 5.5]], result
+    assert result.tolist() == [[1, 2, 3, 4, 5], [1.1, 2.2, 3.3, 4.4, 5.5]], result
 
     result = dataframe["A"]
-    assert result == [1, 2, 3, 4, 5], result
+    assert result.tolist() == [1, 2, 3, 4, 5], result
 
     result = dataframe[0]
-    assert result == [1, 2, 3, 4, 5], result
+    assert result.tolist() == [1, 2, 3, 4, 5], result
 
 
 def test_dataframe_slice():
@@ -132,7 +132,7 @@ def test_dataframe_filter():
     mask = [row[0] > 2 for row in dataframe]
     filtered_dataframe = dataframe.filter(mask)
     assert len(filtered_dataframe) == 3
-    assert filtered_dataframe.collect("A") == [3, 4, 5], filtered_dataframe.collect(["A"])
+    assert filtered_dataframe.collect("A").tolist() == [3, 4, 5], filtered_dataframe.collect(["A"])
 
 
 def test_take():
@@ -283,7 +283,7 @@ def test_build_and_then_profile():
 
     p = df.profile.to_dataframe()
     assert p.rowcount == df.columncount
-    assert p.collect("count") == [df.rowcount] * df.columncount
+    assert p.collect("count").tolist() == [df.rowcount] * df.columncount
 
 
 def test_describe():
@@ -339,7 +339,7 @@ def test_adding_dicts_in_wrong_order():
     df.append({"column_2": "two", "column_1": 2})
 
     # we should have them in the correct order when we extract them
-    assert df.collect(["column_1", "column_2"]) == [[1, 2], ["one", "two"]], df.collect(
+    assert df.collect(["column_1", "column_2"]).tolist() == [[1, 2], ["one", "two"]], df.collect(
         ["column_1", "column_2"]
     )
 

diff --git a/tests/test_profiler.py b/tests/test_profiler.py
@@ -14,7 +14,7 @@ def test_can_profile():
     profile = df.profile.to_dataframe()
 
     assert profile.shape == (6, 12), profile.shape
-    assert profile.collect("count") == [20] * 6
+    assert profile.collect("count").tolist() == [20] * 6
 
 
 def test_opteryx_profile_planets():
@@ -25,7 +25,7 @@ def test_opteryx_profile_planets():
         planets = opteryx.query("SELECT * FROM $planets")
         profile = planets.profile.to_dataframe()
         assert profile.shape == (20, 12), profile.shape
-        assert profile.collect("count") == [9] * 20
+        assert profile.collect("count").tolist() == [9] * 20
     except ImportError:
         # if Opteryx isn't installed, don't fail
         pass
@@ -39,7 +39,7 @@ def test_opteryx_profile_satellites():
         planets = opteryx.query("SELECT * FROM $satellites")
         profile = planets.profile.to_dataframe()
         assert profile.shape == (8, 12), profile.shape
-        assert profile.collect("count") == [177] * 8
+        assert profile.collect("count").tolist() == [177] * 8
     except ImportError:
         # if Opteryx isn't installed, don't fail
         pass
@@ -53,7 +53,7 @@ def test_opteryx_profile_astronauts():
         planets = opteryx.query("SELECT * FROM $astronauts")
         profile = planets.profile.to_dataframe()
         assert profile.shape == (19, 12), profile.shape
-        assert profile.collect("count") == [357] * 19
+        assert profile.collect("count").tolist() == [357] * 19
     except ImportError:
         # if Opteryx isn't installed, don't fail
         pass
@@ -67,7 +67,7 @@ def test_opteryx_profile_missions():
         missions = opteryx.query("SELECT * FROM $missions")
         profile = missions.profile.to_dataframe()
         assert profile.shape == (8, 12), profile.shape
-        assert profile.collect("count") == [4630] * 8, profile.collect("count")
+        assert profile.collect("count").tolist() == [4630] * 8, profile.collect("count")
     except ImportError:
         # if Opteryx isn't installed, don't fail
         pass
@@ -81,7 +81,7 @@ def test_opteryx_profile_fake():
         planets = opteryx.query("SELECT * FROM FAKE(100, 100) AS FK")
         profile = planets.profile.to_dataframe()
         assert profile.shape == (100, 12), profile.shape
-        assert profile.collect("count") == [100] * 100
+        assert profile.collect("count").tolist() == [100] * 100
     except ImportError:
         # if Opteryx isn't installed, don't fail
         pass