Skip to content

Commit

Permalink
Warn when using defaults that aren't explicit and not metadata
Browse files Browse the repository at this point in the history
  • Loading branch information
benjeffery committed Jan 17, 2025
1 parent e3b2155 commit 8235f6f
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 12 deletions.
43 changes: 31 additions & 12 deletions tests/test_variantdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,11 @@ def test_variantdata_accessors_defaults(tmp_path, in_mem):
ds = data if in_mem else sgkit.load_dataset(data)

default_schema = tskit.MetadataSchema.permissive_json().schema
assert vdata.sequence_length == ts.sequence_length
with pytest.warns(
UserWarning,
match="`sequence_length` was not found as an attribute in the dataset",
):
assert vdata.sequence_length == ts.sequence_length
assert vdata.sites_metadata_schema == default_schema
assert vdata.sites_metadata == [{} for _ in range(ts.num_sites)]
for time in vdata.sites_time:
Expand All @@ -234,17 +238,32 @@ def test_variantdata_accessors_defaults(tmp_path, in_mem):
assert vdata.individuals_metadata == [
{"variant_data_sample_id": sample_id} for sample_id in ds.sample_id[:]
]
for time in vdata.individuals_time:
assert tskit.is_unknown_time(time)
assert np.array_equal(
vdata.individuals_location, np.array([[]] * ts.num_individuals, dtype=float)
)
assert np.array_equal(
vdata.individuals_population, np.full(ts.num_individuals, tskit.NULL)
)
assert np.array_equal(
vdata.individuals_flags, np.zeros(ts.num_individuals, dtype=int)
)
with pytest.warns(
UserWarning, match="`individuals_time` was not found as an array in the dataset"
):
for time in vdata.individuals_time:
assert tskit.is_unknown_time(time)
with pytest.warns(
UserWarning,
match="`individuals_location` was not found as an array in the dataset",
):
assert np.array_equal(
vdata.individuals_location, np.array([[]] * ts.num_individuals, dtype=float)
)
with pytest.warns(
UserWarning,
match="`individuals_population` was not found as an array in the dataset",
):
assert np.array_equal(
vdata.individuals_population, np.full(ts.num_individuals, tskit.NULL)
)
with pytest.warns(
UserWarning,
match="`individuals_flags` was not found as an array in the dataset",
):
assert np.array_equal(
vdata.individuals_flags, np.zeros(ts.num_individuals, dtype=int)
)


@pytest.mark.skipif(sys.platform == "win32", reason="No cyvcf2 on windows")
Expand Down
27 changes: 27 additions & 0 deletions tsinfer/formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -2520,6 +2520,12 @@ def sequence_length(self):
try:
return self.data.attrs["sequence_length"]
except KeyError:
warnings.warn(
"`sequence_length` was not found as an attribute in the dataset, so"
" the largest position has been used. It can be set with"
" ds.attrs['sequence_length'] = 1337; ds.to_zarr('path/to/store',"
" mode='a')"
)
return int(np.max(self.data["variant_position"])) + 1

@property
Expand Down Expand Up @@ -2653,6 +2659,12 @@ def individuals_time(self):
try:
return self.data["individuals_time"][:][self.individuals_select]
except KeyError:
warnings.warn(
"`individuals_time` was not found as an array in the dataset, so "
"tskit.UNKNOWN_TIME has been used. It can be apppended to the dataset "
"with data_array.to_zarr('path/to/store', append_dim='samples', "
"mode='a')"
)
return np.full(self.num_individuals, tskit.UNKNOWN_TIME)

@functools.cached_property
Expand Down Expand Up @@ -2696,20 +2708,35 @@ def individuals_location(self):
try:
return self.data["individuals_location"][:][self.individuals_select]
except KeyError:
warnings.warn(
"`individuals_location` was not found as an array in the dataset, "
"so [] has been used. It can be apppended to the dataset with "
"data_array.to_zarr('path/to/store', append_dim='samples', mode='a')"
)
return np.array([[]] * self.num_individuals, dtype=float)

@functools.cached_property
def individuals_population(self):
    """
    Return the ``individuals_population`` values for the selected individuals.

    The ``individuals_population`` array is read from ``self.data`` and
    filtered with ``self.individuals_select``. If the dataset has no such
    array (the lookup raises ``KeyError``), a warning is emitted and an
    ``int32`` array of length ``self.num_individuals`` filled with
    ``tskit.NULL`` is returned instead.

    The result is cached on the instance, so the warning is emitted at most
    once per instance.
    """
    try:
        return self.data["individuals_population"][:][self.individuals_select]
    except KeyError:
        # Tell the user that a default is being substituted rather than
        # silently inventing values.
        warnings.warn(
            "`individuals_population` was not found as an array in the dataset, "
            "so tskit.NULL has been used. It can be appended to the dataset with "
            "data_array.to_zarr('path/to/store', append_dim='samples', mode='a')"
        )
        return np.full((self.num_individuals), tskit.NULL, dtype=np.int32)

@functools.cached_property
def individuals_flags(self):
    """
    Return the ``individuals_flags`` values for the selected individuals.

    The ``individuals_flags`` array is read from ``self.data`` and filtered
    with ``self.individuals_select``. If the dataset has no such array (the
    lookup raises ``KeyError``), a warning is emitted and an ``int32`` array
    of zeros of length ``self.num_individuals`` is returned instead.

    The result is cached on the instance, so the warning is emitted at most
    once per instance.
    """
    try:
        return self.data["individuals_flags"][:][self.individuals_select]
    except KeyError:
        # Tell the user that a default is being substituted rather than
        # silently inventing values.
        warnings.warn(
            "`individuals_flags` was not found as an array in the dataset, so 0 "
            "has been used. It can be appended to the dataset with "
            "data_array.to_zarr('path/to/store', append_dim='samples', mode='a')"
        )
        return np.full((self.num_individuals), 0, dtype=np.int32)

@staticmethod
Expand Down

0 comments on commit 8235f6f

Please sign in to comment.