Commit 48e5a05

Wrote torch tests

sarda-devesh committed Nov 2, 2023
1 parent 30e233a commit 48e5a05
Showing 6 changed files with 229 additions and 28 deletions.
1 change: 1 addition & 0 deletions src/cpp/src/common/util.cpp
@@ -6,6 +6,7 @@

 #include <unistd.h>
 
+#include <fstream>
 #include <iostream>
 
 #include "reporting/logger.h"
3 changes: 2 additions & 1 deletion src/cpp/src/reporting/reporting.cpp
@@ -1,9 +1,10 @@
 //
 // Created by Jason Mohoney on 8/24/21.
 //
 
 #include "reporting/reporting.h"
 
+#include <fstream>
 
 #include "configuration/constants.h"
 #include "reporting/logger.h"

2 changes: 2 additions & 0 deletions src/cpp/src/storage/buffer.cpp
@@ -8,8 +8,10 @@
 #include <fcntl.h>
 #include <unistd.h>
 
+#include <fstream>
 #include <functional>
 #include <future>
+#include <iostream>
 #include <shared_mutex>
 
 #include "configuration/constants.h"
33 changes: 20 additions & 13 deletions src/python/tools/preprocess/converters/torch_converter.py
@@ -404,8 +404,8 @@ def __init__(
         remap_ids: bool = True,
         sequential_train_nodes: bool = False,
         sequential_deg_nodes: int = 0,
-        num_nodes: int = None,
-        num_rels: int = None,
+        num_nodes: int = 1,
+        num_rels: int = 1,
         known_node_ids: list = None,
     ):
         """
@@ -592,8 +592,8 @@ def extract_edge_mapping(self):
             save_order.insert(1, "edge_weight")
         elif self.edge_type_column >= 0 and self.edge_weight_column >= 0:
             # Have both edge type and edge weight
-            save_order.insert(1, "edge_type")
-            save_order.insert(2, "edge_weight")
+            save_order.insert(len(save_order) - 1, "edge_type")
+            save_order.insert(len(save_order) - 1, "edge_weight")
 
         return save_order
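For reference, list.insert(len(lst) - 1, x) places x immediately before the last element, so assuming save_order starts as ["src", "dst"] (its construction sits above this hunk), both entries land between the source and destination columns:

save_order = ["src", "dst"]  # assumed starting value; built outside this hunk
save_order.insert(len(save_order) - 1, "edge_type")    # ["src", "edge_type", "dst"]
save_order.insert(len(save_order) - 1, "edge_weight")  # ["src", "edge_type", "edge_weight", "dst"]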

@@ -661,13 +661,15 @@ def convert(self):
delimiter=",",
)
else:
# Determine which columns to keep
cols_to_keep = [self.columns[0], self.columns[-1]]
if self.edge_type_column >= 0:
cols_to_keep = cols_to_keep.insert(len(cols_to_keep) - 1, self.edge_type_column)
cols_to_keep.insert(len(cols_to_keep) - 1, self.edge_type_column)

if self.edge_weight_column >= 0:
cols_to_keep = cols_to_keep.insert(len(cols_to_keep) - 1, self.edge_weight_column)
cols_to_keep.insert(len(cols_to_keep) - 1, self.edge_weight_column)

print("Cols to keep of", cols_to_keep, "with df columns of", train_edges_df.columns)
train_edges_df = train_edges_df[train_edges_df.columns[cols_to_keep]].astype(int)
train_edges_tens = dataframe_to_tensor(train_edges_df)
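The fix above hinges on a Python detail: list.insert mutates the list in place and always returns None, so the old assignment form rebound cols_to_keep to None and any later use of it would raise. A quick sketch:

cols_to_keep = [0, 2]
result = cols_to_keep.insert(len(cols_to_keep) - 1, 1)
print(result)        # None: insert() has no return value
print(cols_to_keep)  # [0, 1, 2]: the mutation happened in place
# The old `cols_to_keep = cols_to_keep.insert(...)` therefore replaced the list with None.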

@@ -737,34 +739,39 @@ def convert(self):
         else:
             cols_to_keep = [self.columns[0], self.columns[-1]]
             if self.edge_type_column >= 0:
-                cols_to_keep = cols_to_keep.insert(len(cols_to_keep) - 1, self.edge_type_column)
+                cols_to_keep.insert(len(cols_to_keep) - 1, self.edge_type_column)
 
             if self.edge_weight_column >= 0:
-                cols_to_keep = cols_to_keep.insert(len(cols_to_keep) - 1, self.edge_weight_column)
+                cols_to_keep.insert(len(cols_to_keep) - 1, self.edge_weight_column)
+            print("Determined columns to keep", cols_to_keep)
 
-            train_edges_tens = train_edges_tens[:, [cols_to_keep]]
+            train_edges_tens = train_edges_tens[:, cols_to_keep]
             if valid_edges_tens is not None:
-                valid_edges_tens = valid_edges_tens[:, [cols_to_keep]]
+                valid_edges_tens = valid_edges_tens[:, cols_to_keep]
             if test_edges_tens is not None:
-                test_edges_tens = test_edges_tens[:, [cols_to_keep]]
+                test_edges_tens = test_edges_tens[:, cols_to_keep]
 
+        print("Train edges tensor of shape", train_edges_tens.size(), "with values", train_edges_tens[0])
 
         # Split the edges
         if self.splits is not None:
             train_edges_tens, valid_edges_tens, test_edges_tens = split_edges(train_edges_tens, self.splits)
 
         if save_order is None:
-            save_order = self.save_order()
+            save_order = self.extract_edge_mapping()
+        print("Determined save order of", save_order)
 
         # Extract the weights if they exist
         train_edges_weights, valid_edges_weights, test_edges_weights = None, None, None
         if "edge_weight" in save_order:
             edge_idx = save_order.index("edge_weight")
             cols_to_keep = [i for i in range(len(save_order))]
             cols_to_keep.pop(edge_idx)
             save_order.pop(edge_idx)
+            print("Cols to keep of", cols_to_keep, "with edge idx of", edge_idx)
 
             train_edges_weights = train_edges_tens[:, [edge_idx]].to(torch.float32)
             train_edges_tens = train_edges_tens[:, cols_to_keep]
+            print("First values", train_edges_tens[0], "with weights of", train_edges_weights[0])
 
             if valid_edges_tens is not None:
                 valid_edges_weights = valid_edges_tens[:, [edge_idx]].to(torch.float32)
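The indexing fix in this hunk is the other substantive change: wrapping the index list in an extra pair of brackets, as in tens[:, [cols_to_keep]], makes PyTorch treat it as a nested index and inserts an extra dimension, while the bare list selects the columns. The deliberate [:, [edge_idx]] form for the weights is different: edge_idx is a scalar, so the single-element list keeps the result as an (N, 1) column. A small sketch of the shapes:

import torch

t = torch.arange(12).reshape(4, 3)   # 4 edges, 3 columns
cols_to_keep = [0, 2]

print(t[:, cols_to_keep].shape)      # torch.Size([4, 2]): two columns selected
print(t[:, [cols_to_keep]].shape)    # torch.Size([4, 1, 2]): unwanted extra dimension
print(t[:, [1]].shape)               # torch.Size([4, 1]): column kept 2-D, as for the weights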
21 changes: 12 additions & 9 deletions src/python/tools/preprocess/converters/writers/torch_writer.py
@@ -59,29 +59,32 @@ def write_to_binary(
                 f.write(bytes(train_edges_tens.numpy()))
 
         if train_edges_weights is not None:
-            print("Train edges weights written to:", PathConstants.train_edges_weights_path)
-            with open(self.output_dir / Path(PathConstants.train_edges_weights_path), "wb") as f:
-                f.write(bytes(train_edges_weights.numpy()))
+            save_path = self.output_dir / Path(PathConstants.train_edges_weights_path)
+            print("Train edges weights written to:", save_path)
+            weights_arr = train_edges_weights.numpy().flatten()
+            weights_arr.tofile(save_path)
 
         if valid_edges_tens is not None:
             print("Valid edges written to:", PathConstants.valid_edges_path)
             with open(self.output_dir / Path(PathConstants.valid_edges_path), "wb") as f:
                 f.write(bytes(valid_edges_tens.numpy()))
 
         if valid_edges_weights is not None:
-            print("Valid edges weights written to:", PathConstants.valid_edges_weights_path)
-            with open(self.output_dir / Path(PathConstants.valid_edges_weights_path), "wb") as f:
-                f.write(bytes(valid_edges_weights.numpy()))
+            save_path = self.output_dir / Path(PathConstants.valid_edges_weights_path)
+            print("Valid edges weights written to:", save_path)
+            weights_arr = valid_edges_weights.numpy().flatten()
+            weights_arr.tofile(save_path)
 
         if test_edges_tens is not None:
             print("Test edges written to:", PathConstants.test_edges_path)
             with open(self.output_dir / Path(PathConstants.test_edges_path), "wb") as f:
                 f.write(bytes(test_edges_tens.numpy()))
 
         if test_edges_weights is not None:
-            print("Test edge weights written to:", PathConstants.test_edges_weights_path)
-            with open(self.output_dir / Path(PathConstants.test_edges_weights_path), "wb") as f:
-                f.write(bytes(test_edges_weights.numpy()))
+            save_path = self.output_dir / Path(PathConstants.test_edges_weights_path)
+            print("Test edge weights written to:", save_path)
+            weights_arr = test_edges_weights.numpy().flatten()
+            weights_arr.tofile(save_path)
 
         if num_partitions > 1:
             with open(self.output_dir / Path(PathConstants.train_edge_buckets_path), "w") as f:
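The writer now streams weights with numpy's ndarray.tofile, which writes the raw element bytes in row-major order straight to the given path; for a contiguous array this produces the same file as f.write(bytes(arr)) without building an intermediate bytes object, and the flatten() call guarantees a contiguous 1-D buffer first. A round-trip sketch (the /tmp path is illustrative):

import numpy as np
import torch

weights = torch.arange(5, dtype=torch.float32).reshape(5, 1)

weights_arr = weights.numpy().flatten()  # contiguous 1-D float32 array
weights_arr.tofile("/tmp/weights.bin")   # raw native-endian float32 bytes, no header

loaded = np.fromfile("/tmp/weights.bin", dtype=np.float32)
assert np.array_equal(loaded, np.arange(5, dtype=np.float32))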
197 changes: 192 additions & 5 deletions test/python/preprocessing/test_torch_converter.py
@@ -58,7 +58,7 @@ def validate_partitioned_output_dir(
     assert offset == expected_stats.num_train
 
 
-def validate_output_dir(output_dir: Path, expected_stats: DatasetConfig, dtype=np.int32, remap_ids=True):
+def validate_output_dir(output_dir: Path, expected_stats: DatasetConfig, dtype=np.int32, remap_ids=True, has_weights=False):
     assert output_dir.exists()
     assert (output_dir / Path("edges")).exists()
     assert (output_dir / Path("nodes")).exists()
@@ -119,7 +119,13 @@ def validate_output_dir(output_dir: Path, expected_stats: DatasetConfig, dtype=np.int32, remap_ids=True, has_weights=False):
     else:
         assert not node_mapping_path.exists()
         assert not relation_mapping_path.exists()
 
 
+    if has_weights:
+        weights_file_path = output_dir / Path(PathConstants.train_edges_weights_path)
+        assert weights_file_path.exists()
+        values = np.memmap(weights_file_path, dtype=np.float32, mode='r')
+        for i in range(len(values)):
+            assert values[i] == float(i)
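np.memmap maps the raw float32 file into a 1-D array without reading it all into memory, and because the tests below build their weight column from torch.arange(num_rows), entry i of the file is expected to equal float(i). An equivalent eager check, for comparison:

# Equivalent eager check: np.fromfile loads the whole file instead of mapping it.
values = np.fromfile(weights_file_path, dtype=np.float32)
assert np.array_equal(values, np.arange(len(values), dtype=np.float32))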

class TestTorchConverter(unittest.TestCase):
"""
@@ -376,9 +382,6 @@ def test_partitions(self):
             output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, num_partitions=100
         )
 
-    def test_always_fail(self):
-        assert 1 == 0
-
     def test_no_remap(self):
         output_dir = Path(TMP_TEST_DIR) / Path("test_dtype")
         output_dir.mkdir()
@@ -402,3 +405,187 @@ def test_no_remap(self):
         expected_stats.num_train = 1000
 
         validate_output_dir(output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=False)

+    def test_torch_no_relation_no_remap(self):
+        remap_val = False
+        output_dir = Path(TMP_TEST_DIR) / Path("test_torch_defaults")
+        output_dir.mkdir()
+
+        train_edges_df = pd.read_csv(Path(TMP_TEST_DIR) / Path("train_edges.txt"), header=None, sep=" ")
+        train_edges = torch.tensor(train_edges_df.to_numpy())
+
+        num_rows = train_edges.size(0)
+        train_edges = torch.column_stack((train_edges, torch.arange(num_rows)))
+        converter = TorchEdgeListConverter(
+            output_dir=output_dir,
+            train_edges=train_edges,
+            remap_ids=remap_val,
+            columns=[0, 2],
+            num_nodes=100,
+            format="pytorch"
+        )
+        converter.convert()
+
+        expected_stats = DatasetConfig()
+        expected_stats.dataset_dir = output_dir.__str__()
+        expected_stats.num_edges = 1000
+        expected_stats.num_nodes = 100
+        expected_stats.num_relations = 1
+        expected_stats.num_train = 1000
+
+        validate_output_dir(output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=remap_val)
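Every test in this file follows the same fixture recipe: read the space-separated edge list, append the row index as an extra column with torch.column_stack, and (where weights are tested) point edge_weight_column at that column, so the weights file must come back as 0.0, 1.0, 2.0, ... in order, which is exactly what the has_weights branch of validate_output_dir asserts. In miniature, with a toy tensor:

import torch

edges = torch.tensor([[7, 0, 8],
                      [3, 1, 9]])  # src, rel, dst
num_rows = edges.size(0)
edges = torch.column_stack((edges, torch.arange(num_rows)))
print(edges)
# tensor([[7, 0, 8, 0],
#         [3, 1, 9, 1]])  column 3 is the row index, used as the edge weight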

+    def test_torch_no_relation_remap(self):
+        remap_val = True
+        output_dir = Path(TMP_TEST_DIR) / Path("test_torch_defaults")
+        output_dir.mkdir()
+
+        train_edges_df = pd.read_csv(Path(TMP_TEST_DIR) / Path("train_edges.txt"), header=None, sep=" ")
+        train_edges = torch.tensor(train_edges_df.to_numpy())
+
+        num_rows = train_edges.size(0)
+        train_edges = torch.column_stack((train_edges, torch.arange(num_rows)))
+
+        converter = TorchEdgeListConverter(
+            output_dir=output_dir,
+            train_edges=train_edges,
+            remap_ids=remap_val,
+            columns=[0, 2],
+            num_nodes=100,
+            format="pytorch"
+        )
+        converter.convert()
+
+        expected_stats = DatasetConfig()
+        expected_stats.dataset_dir = output_dir.__str__()
+        expected_stats.num_edges = 1000
+        expected_stats.num_nodes = 100
+        expected_stats.num_relations = 1
+        expected_stats.num_train = 1000
+
+        validate_output_dir(output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=remap_val)
+
+    def test_torch_only_weights_no_remap(self):
+        remap_val = False
+        output_dir = Path(TMP_TEST_DIR) / Path("test_torch_defaults")
+        output_dir.mkdir()
+
+        train_edges_df = pd.read_csv(Path(TMP_TEST_DIR) / Path("train_edges.txt"), header=None, sep=" ")
+        train_edges = torch.tensor(train_edges_df.to_numpy())
+
+        num_rows = train_edges.size(0)
+        train_edges = torch.column_stack((train_edges, torch.arange(num_rows)))
+
+        converter = TorchEdgeListConverter(
+            output_dir=output_dir,
+            train_edges=train_edges,
+            remap_ids=remap_val,
+            columns=[0, 2],
+            edge_weight_column=3,
+            num_nodes=100,
+            format="pytorch"
+        )
+        converter.convert()
+
+        expected_stats = DatasetConfig()
+        expected_stats.dataset_dir = output_dir.__str__()
+        expected_stats.num_edges = 1000
+        expected_stats.num_nodes = 100
+        expected_stats.num_relations = 1
+        expected_stats.num_train = 1000
+
+        validate_output_dir(output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=remap_val, has_weights=True)
+
+    def test_torch_only_weights_remap(self):
+        remap_val = True
+        output_dir = Path(TMP_TEST_DIR) / Path("test_torch_defaults")
+        output_dir.mkdir()
+
+        train_edges_df = pd.read_csv(Path(TMP_TEST_DIR) / Path("train_edges.txt"), header=None, sep=" ")
+        train_edges = torch.tensor(train_edges_df.to_numpy())
+
+        num_rows = train_edges.size(0)
+        train_edges = torch.column_stack((train_edges, torch.arange(num_rows)))
+
+        converter = TorchEdgeListConverter(
+            output_dir=output_dir,
+            train_edges=train_edges,
+            remap_ids=remap_val,
+            columns=[0, 2],
+            edge_weight_column=3,
+            num_nodes=100,
+            format="pytorch"
+        )
+        converter.convert()
+
+        expected_stats = DatasetConfig()
+        expected_stats.dataset_dir = output_dir.__str__()
+        expected_stats.num_edges = 1000
+        expected_stats.num_nodes = 100
+        expected_stats.num_relations = 1
+        expected_stats.num_train = 1000
+
+        validate_output_dir(output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=remap_val, has_weights=True)
+
+    def test_torch_relationship_weights_no_remap(self):
+        remap_val = False
+        output_dir = Path(TMP_TEST_DIR) / Path("test_torch_defaults")
+        output_dir.mkdir()
+
+        train_edges_df = pd.read_csv(Path(TMP_TEST_DIR) / Path("train_edges.txt"), header=None, sep=" ")
+        train_edges = torch.tensor(train_edges_df.to_numpy())
+
+        num_rows = train_edges.size(0)
+        train_edges = torch.column_stack((train_edges, torch.arange(num_rows)))
+
+        converter = TorchEdgeListConverter(
+            output_dir=output_dir,
+            train_edges=train_edges,
+            remap_ids=remap_val,
+            columns=[0, 1, 2],
+            edge_weight_column=3,
+            num_nodes=100,
+            num_rels=10,
+            format="pytorch"
+        )
+        converter.convert()
+
+        expected_stats = DatasetConfig()
+        expected_stats.dataset_dir = output_dir.__str__()
+        expected_stats.num_edges = 1000
+        expected_stats.num_nodes = 100
+        expected_stats.num_relations = 10
+        expected_stats.num_train = 1000
+
+        validate_output_dir(output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=remap_val, has_weights=True)
+
+    def test_torch_relationship_weights_remap(self):
+        remap_val = True
+        output_dir = Path(TMP_TEST_DIR) / Path("test_torch_defaults")
+        output_dir.mkdir()
+
+        train_edges_df = pd.read_csv(Path(TMP_TEST_DIR) / Path("train_edges.txt"), header=None, sep=" ")
+        train_edges = torch.tensor(train_edges_df.to_numpy())
+
+        num_rows = train_edges.size(0)
+        train_edges = torch.column_stack((train_edges, torch.arange(num_rows)))
+
+        converter = TorchEdgeListConverter(
+            output_dir=output_dir,
+            train_edges=train_edges,
+            remap_ids=remap_val,
+            columns=[0, 1, 2],
+            edge_weight_column=3,
+            num_nodes=100,
+            format="pytorch"
+        )
+        converter.convert()
+
+        expected_stats = DatasetConfig()
+        expected_stats.dataset_dir = output_dir.__str__()
+        expected_stats.num_edges = 1000
+        expected_stats.num_nodes = 100
+        expected_stats.num_relations = 10
+        expected_stats.num_train = 1000
+
+        validate_output_dir(output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=remap_val, has_weights=True)
