From 2b8dac88978d7f5e907361571ea36b5249ed453d Mon Sep 17 00:00:00 2001 From: Devesh Sarda Date: Sat, 20 Jan 2024 02:51:25 -0600 Subject: [PATCH] Updated gitignore --- .gitignore | 3 + docs/python_api/data/batch.rst | 11 - docs/python_api/data/dataloader.rst | 17 -- docs/python_api/data/dense_graph.rst | 19 -- docs/python_api/data/graph.rst | 25 -- docs/python_api/data/index.rst | 13 - docs/python_api/data/samplers/edge.rst | 9 - docs/python_api/data/samplers/index.rst | 11 - docs/python_api/data/samplers/negative.rst | 9 - docs/python_api/data/samplers/neighbor.rst | 15 -- src/cpp/include/configuration/config.h | 4 +- src/cpp/include/data/dataloader.h | 2 +- src/cpp/include/storage/graph_storage.h | 6 +- src/cpp/python_bindings/data/batch_wrap.cpp | 55 ---- .../python_bindings/data/dataloader_wrap.cpp | 182 -------------- src/cpp/python_bindings/data/graph_wrap.cpp | 49 ---- .../data/samplers/edge_wrap.cpp | 22 -- .../data/samplers/negative_wrap.cpp | 32 --- .../data/samplers/neighbor_wrap.cpp | 85 ------- src/cpp/python_bindings/data/wrap.cpp | 29 --- src/cpp/src/configuration/config.cpp | 9 + src/cpp/src/data/dataloader.cpp | 14 +- src/cpp/src/marius.cpp | 5 +- src/cpp/src/pipeline/evaluator.cpp | 4 +- src/cpp/src/pipeline/trainer.cpp | 16 +- src/cpp/src/storage/io.cpp | 19 +- src/cpp/third_party/CMakeLists.txt | 21 -- src/cpp/third_party/googletest | 1 - src/cpp/third_party/parallel-hashmap | 1 - src/cpp/third_party/pybind11 | 1 - src/cpp/third_party/spdlog | 1 - .../tools/configuration/marius_config.py | 4 +- test/cpp/unit/data/samplers/test_negative.cpp | 234 ------------------ 33 files changed, 64 insertions(+), 864 deletions(-) delete mode 100644 docs/python_api/data/batch.rst delete mode 100644 docs/python_api/data/dataloader.rst delete mode 100644 docs/python_api/data/dense_graph.rst delete mode 100644 docs/python_api/data/graph.rst delete mode 100644 docs/python_api/data/index.rst delete mode 100644 docs/python_api/data/samplers/edge.rst delete mode 100644 docs/python_api/data/samplers/index.rst delete mode 100644 docs/python_api/data/samplers/negative.rst delete mode 100644 docs/python_api/data/samplers/neighbor.rst delete mode 100644 src/cpp/python_bindings/data/batch_wrap.cpp delete mode 100644 src/cpp/python_bindings/data/dataloader_wrap.cpp delete mode 100644 src/cpp/python_bindings/data/graph_wrap.cpp delete mode 100644 src/cpp/python_bindings/data/samplers/edge_wrap.cpp delete mode 100644 src/cpp/python_bindings/data/samplers/negative_wrap.cpp delete mode 100644 src/cpp/python_bindings/data/samplers/neighbor_wrap.cpp delete mode 100644 src/cpp/python_bindings/data/wrap.cpp delete mode 100644 src/cpp/third_party/CMakeLists.txt delete mode 160000 src/cpp/third_party/googletest delete mode 160000 src/cpp/third_party/parallel-hashmap delete mode 160000 src/cpp/third_party/pybind11 delete mode 160000 src/cpp/third_party/spdlog delete mode 100644 test/cpp/unit/data/samplers/test_negative.cpp diff --git a/.gitignore b/.gitignore index 7e75fc8b..cb30fe25 100644 --- a/.gitignore +++ b/.gitignore @@ -171,3 +171,6 @@ Thumbs.db # End of https://www.toptal.com/developers/gitignore/api/python + +src/cpp/third_party +test_datasets \ No newline at end of file diff --git a/docs/python_api/data/batch.rst b/docs/python_api/data/batch.rst deleted file mode 100644 index 9472a633..00000000 --- a/docs/python_api/data/batch.rst +++ /dev/null @@ -1,11 +0,0 @@ - -Batch -================= - - -.. autoclass:: marius.data.Batch - :members: - :undoc-members: - :exclude-members: to - - .. 
method:: to(self: marius._data.Batch, device: torch.device) -> None \ No newline at end of file diff --git a/docs/python_api/data/dataloader.rst b/docs/python_api/data/dataloader.rst deleted file mode 100644 index f078038e..00000000 --- a/docs/python_api/data/dataloader.rst +++ /dev/null @@ -1,17 +0,0 @@ - -DataLoader -============================ - - -.. autoclass:: marius.data.DataLoader - :members: - :undoc-members: - :exclude-members: __init__, getBatch - - .. method:: __init__(self: marius._data.DataLoader, graph_storage: GraphModelStorage, learning_task: str, training_config: marius._config.TrainingConfig, evaluation_config: marius._config.EvaluationConfig, encoder_config: marius._config.EncoderConfig) -> None - - .. method:: __init__(self: marius._data.DataLoader, graph_storage: GraphModelStorage, learning_task: str, batch_size: int = 1000, neg_sampler: marius._data.samplers.NegativeSampler = None, nbr_sampler: marius._data.samplers.NeighborSampler = None, train: bool = False) -> None - - .. method:: __init__(self: marius._data.DataLoader, edges: Optional[torch.Tensor], learning_task: str, nodes: Optional[torch.Tensor] = None, node_features: Optional[torch.Tensor] = None, node_embeddings: Optional[torch.Tensor] = None, node_optim_state: Optional[torch.Tensor] = None, node_labels: Optional[torch.Tensor] = None, train_edges: Optional[torch.Tensor] = None, batch_size: int = 1000, neg_sampler: marius._data.samplers.NegativeSampler = None, nbr_sampler: marius._data.samplers.NeighborSampler = None, filter_edges: List[torch.Tensor] = [], train: bool = False) -> None - - .. method:: getBatch(self: marius._data.DataLoader, device: Optional[torch.device] = None, perform_map: bool = True) -> marius._data.Batch \ No newline at end of file diff --git a/docs/python_api/data/dense_graph.rst b/docs/python_api/data/dense_graph.rst deleted file mode 100644 index 7ebe0022..00000000 --- a/docs/python_api/data/dense_graph.rst +++ /dev/null @@ -1,19 +0,0 @@ - -DENSEGraph -============================ - - -.. autoclass:: marius.data.DENSEGraph - :members: - :undoc-members: - :exclude-members: __init__, getNeighborIDs, setNodeProperties, to - - .. method:: __init__(self: marius._data.DENSEGraph) -> None - - .. method:: __init__(self: marius._data.DENSEGraph, hop_offsets: torch.Tensor, node_ids: torch.Tensor, in_offsets: torch.Tensor, in_neighbors_vec: List[torch.Tensor], in_neighbors: torch.Tensor, out_offsets: torch.Tensor, out_neighbors_vec: List[torch.Tensor], out_neighbors: torch.Tensor, num_nodes_in_memory: int) -> None - - .. method:: getNeighborIDs(self: marius._data.DENSEGraph, incoming: bool = True, global_ids: bool = False) -> torch.Tensor - - .. method:: setNodeProperties(self: marius._data.DENSEGraph, node_properties: torch.Tensor) -> None - - .. method:: to(self: marius._data.DENSEGraph, device: torch.device) -> None \ No newline at end of file diff --git a/docs/python_api/data/graph.rst b/docs/python_api/data/graph.rst deleted file mode 100644 index 203900d3..00000000 --- a/docs/python_api/data/graph.rst +++ /dev/null @@ -1,25 +0,0 @@ - -MariusGraph -============================ - - -.. autoclass:: marius.data.MariusGraph - :members: - :undoc-members: - :exclude-members: __init__, getEdges, getNeighborOffsets, getNeighborsForNodeIds, getNumNeighbors, getRelationIDs, to - - .. method:: __init__(self: marius._data.MariusGraph) -> None - - .. method:: __init__(self: marius._data.MariusGraph, src_sorted_edges: torch.Tensor, dst_sorted_edges: torch.Tensor, num_nodes_in_memory: int) -> None - - .. 
method:: getEdges(self: marius._data.MariusGraph, incoming: bool = True) -> torch.Tensor - - .. method:: getNeighborOffsets(self: marius._data.MariusGraph, incoming: bool = True) -> torch.Tensor - - .. method:: getNeighborsForNodeIds(self: marius._data.MariusGraph, node_ids: torch.Tensor, incoming: bool, neighbor_sampling_layer: marius._config.NeighborSamplingLayer, max_neighbors_size: int, rate: float) -> Tuple(torch.Tensor, torch.Tensor) - - .. method:: getNumNeighbors(self: marius._data.MariusGraph, incoming: bool = True) -> torch.Tensor - - .. method:: getRelationIDs(self: marius._data.MariusGraph, incoming: bool = True) -> torch.Tensor - - .. method:: to(self: marius._data.MariusGraph, device: torch.device) -> None \ No newline at end of file diff --git a/docs/python_api/data/index.rst b/docs/python_api/data/index.rst deleted file mode 100644 index 6372b54f..00000000 --- a/docs/python_api/data/index.rst +++ /dev/null @@ -1,13 +0,0 @@ - -marius.data -******************** - -.. toctree:: - :glob: - :maxdepth: 2 - - samplers/index - batch - dataloader - dense_graph - graph.rst \ No newline at end of file diff --git a/docs/python_api/data/samplers/edge.rst b/docs/python_api/data/samplers/edge.rst deleted file mode 100644 index 77554eee..00000000 --- a/docs/python_api/data/samplers/edge.rst +++ /dev/null @@ -1,9 +0,0 @@ - -RandomEdgeSampler -======================================= - - -.. autoclass:: marius.data.samplers.RandomEdgeSampler - :members: - :undoc-members: - :special-members: __init__ \ No newline at end of file diff --git a/docs/python_api/data/samplers/index.rst b/docs/python_api/data/samplers/index.rst deleted file mode 100644 index c2a6159d..00000000 --- a/docs/python_api/data/samplers/index.rst +++ /dev/null @@ -1,11 +0,0 @@ - -samplers -******************** - -.. toctree:: - :glob: - :maxdepth: 2 - - edge - negative - neighbor \ No newline at end of file diff --git a/docs/python_api/data/samplers/negative.rst b/docs/python_api/data/samplers/negative.rst deleted file mode 100644 index 54d202c4..00000000 --- a/docs/python_api/data/samplers/negative.rst +++ /dev/null @@ -1,9 +0,0 @@ - -CorruptNodeNegativeSampler -==================================================== - - -.. autoclass:: marius.data.samplers.CorruptNodeNegativeSampler - :members: - :undoc-members: - :special-members: __init__ \ No newline at end of file diff --git a/docs/python_api/data/samplers/neighbor.rst b/docs/python_api/data/samplers/neighbor.rst deleted file mode 100644 index 99e98b23..00000000 --- a/docs/python_api/data/samplers/neighbor.rst +++ /dev/null @@ -1,15 +0,0 @@ - -LayeredNeighborSampler -==================================================== - - -.. autoclass:: marius.data.samplers.LayeredNeighborSampler - :members: - :undoc-members: - :exclude-members: __init__ - - .. method:: __init__(self: marius._data.samplers.LayeredNeighborSampler, storage: GraphModelStorage, num_neighbors: List[int], incoming: bool = True, outgoing: bool = True, use_hashmap_sets: bool = False) -> None - - .. method:: __init__(self: marius._data.samplers.LayeredNeighborSampler, graph: MariusGraph, num_neighbors: List[int], incoming: bool = True, outgoing: bool = True, use_hashmap_sets: bool = False) -> None - - .. 
method:: __init__(self: marius._data.samplers.LayeredNeighborSampler, num_neighbors: List[int], incoming: bool = True, outgoing: bool = True, use_hashmap_sets: bool = False) -> None \ No newline at end of file diff --git a/src/cpp/include/configuration/config.h b/src/cpp/include/configuration/config.h index b1cabfb3..15334def 100644 --- a/src/cpp/include/configuration/config.h +++ b/src/cpp/include/configuration/config.h @@ -143,6 +143,7 @@ struct TrainingConfig { int batch_size; shared_ptr negative_sampling = nullptr; int num_epochs; + int batches_per_epoch; shared_ptr pipeline = nullptr; int epochs_per_shuffle; int logs_per_epoch; @@ -157,6 +158,7 @@ struct EvaluationConfig { shared_ptr negative_sampling = nullptr; shared_ptr pipeline = nullptr; int epochs_per_eval; + int batches_per_epoch; string checkpoint_dir; bool full_graph_evaluation; }; @@ -210,4 +212,4 @@ PYBIND11_EXPORT shared_ptr initMariusConfig(pyobj python_config); shared_ptr loadConfig(string config_path, bool save = false); -#endif // MARIUS_CONFIG_H +#endif // MARIUS_CONFIG_H \ No newline at end of file diff --git a/src/cpp/include/data/dataloader.h b/src/cpp/include/data/dataloader.h index 93b2b0b5..4f9e40fb 100644 --- a/src/cpp/include/data/dataloader.h +++ b/src/cpp/include/data/dataloader.h @@ -86,7 +86,7 @@ class DataLoader { void setActiveNodes(); - void initializeBatches(bool prepare_encode = false); + void initializeBatches(bool prepare_encode = false, bool in_training_mode = true); void clearBatches(); diff --git a/src/cpp/include/storage/graph_storage.h b/src/cpp/include/storage/graph_storage.h index 0ff2e9ad..422deba5 100644 --- a/src/cpp/include/storage/graph_storage.h +++ b/src/cpp/include/storage/graph_storage.h @@ -11,9 +11,11 @@ struct GraphModelStoragePtrs { shared_ptr edges = nullptr; + shared_ptr edges_weights = nullptr; shared_ptr train_edges = nullptr; - shared_ptr train_edges_dst_sort = nullptr; shared_ptr train_edges_weights = nullptr; + shared_ptr train_edges_dst_sort = nullptr; + shared_ptr train_edges_dst_sort_weights = nullptr; shared_ptr validation_edges = nullptr; shared_ptr validation_edges_weights = nullptr; shared_ptr test_edges = nullptr; @@ -311,4 +313,4 @@ class GraphModelStorage { void addFilterEdges(shared_ptr filter_edges) { storage_ptrs_.filter_edges.emplace_back(filter_edges); } }; -#endif // MARIUS_SRC_CPP_INCLUDE_GRAPH_STORAGE_H_ +#endif // MARIUS_SRC_CPP_INCLUDE_GRAPH_STORAGE_H_ \ No newline at end of file diff --git a/src/cpp/python_bindings/data/batch_wrap.cpp b/src/cpp/python_bindings/data/batch_wrap.cpp deleted file mode 100644 index b51afb87..00000000 --- a/src/cpp/python_bindings/data/batch_wrap.cpp +++ /dev/null @@ -1,55 +0,0 @@ -#include "common/pybind_headers.h" -#include "data/batch.h" - -void init_batch(py::module &m) { - py::enum_(m, "BatchStatus") - .value("Waiting", BatchStatus::Waiting) - .value("AccumulatedIndices", BatchStatus::AccumulatedIndices) - .value("LoadedEmbeddings", BatchStatus::LoadedEmbeddings) - .value("TransferredToDevice", BatchStatus::TransferredToDevice) - .value("PreparedForCompute", BatchStatus::PreparedForCompute) - .value("ComputedGradients", BatchStatus::ComputedGradients) - .value("AccumulatedGradients", BatchStatus::AccumulatedGradients) - .value("TransferredToHost", BatchStatus::TransferredToHost) - .value("Done", BatchStatus::Done); - - py::class_>(m, "Batch", py::dynamic_attr()) - .def_readwrite("batch_id", &Batch::batch_id_) - .def_readwrite("start_idx", &Batch::start_idx_) - .def_readwrite("batch_size", &Batch::batch_size_) - 
.def_readwrite("train", &Batch::train_) - .def_readwrite("device_id", &Batch::device_id_) - - .def_readwrite("status", &Batch::status_) - - .def_readwrite("root_node_indices", &Batch::root_node_indices_) - .def_readwrite("unique_node_indices", &Batch::unique_node_indices_) - .def_readwrite("node_embeddings", &Batch::node_embeddings_) - .def_readwrite("node_gradients", &Batch::node_gradients_) - .def_readwrite("node_embeddings_state", &Batch::node_embeddings_state_) - .def_readwrite("node_state_update", &Batch::node_state_update_) - - .def_readwrite("node_features", &Batch::node_features_) - .def_readwrite("node_labels", &Batch::node_labels_) - - .def_readwrite("src_neg_indices_mapping", &Batch::src_neg_indices_mapping_) - .def_readwrite("dst_neg_indices_mapping", &Batch::dst_neg_indices_mapping_) - - .def_readwrite("edges", &Batch::edges_) - - .def_readwrite("dense_graph", &Batch::dense_graph_) - .def_readwrite("encoded_uniques", &Batch::encoded_uniques_) - - .def_readwrite("neg_edges", &Batch::neg_edges_) - .def_readwrite("rel_neg_indices", &Batch::rel_neg_indices_) - .def_readwrite("src_neg_indices", &Batch::src_neg_indices_) - .def_readwrite("dst_neg_indices", &Batch::dst_neg_indices_) - .def_readwrite("src_neg_filter", &Batch::src_neg_filter_) - .def_readwrite("dst_neg_filter", &Batch::dst_neg_filter_) - - .def(py::init(), py::arg("train")) - .def("to", &Batch::to, py::arg("device"), py::arg("stream") = nullptr) - .def("accumulateGradients", &Batch::accumulateGradients, py::arg("learning_rate")) - .def("embeddingsToHost", &Batch::embeddingsToHost) - .def("clear", &Batch::clear); -} \ No newline at end of file diff --git a/src/cpp/python_bindings/data/dataloader_wrap.cpp b/src/cpp/python_bindings/data/dataloader_wrap.cpp deleted file mode 100644 index ba7de425..00000000 --- a/src/cpp/python_bindings/data/dataloader_wrap.cpp +++ /dev/null @@ -1,182 +0,0 @@ -#include "common/pybind_headers.h" -#include "data/dataloader.h" - -void init_dataloader(py::module &m) { - py::class_>(m, "DataLoader", py::dynamic_attr()) - .def_readwrite("graph_storage", &DataLoader::graph_storage_) - .def_readwrite("edge_sampler", &DataLoader::edge_sampler_) - .def_readwrite("negative_sampler", &DataLoader::negative_sampler_) - .def_readwrite("neighbor_sampler", &DataLoader::neighbor_sampler_) - .def_readwrite("training_config", &DataLoader::training_config_) - .def_readwrite("evaluation_config", &DataLoader::evaluation_config_) - .def_readwrite("train", &DataLoader::train_) - .def_readwrite("epochs_processed", &DataLoader::epochs_processed_) - .def_readwrite("batches_processed", &DataLoader::batches_processed_) - .def_readwrite("current_edge", &DataLoader::current_edge_) - .def_readwrite("batches", &DataLoader::batches_) - .def_readwrite("batch_id_offset", &DataLoader::batch_id_offset_) - // .def_readwrite("batch_iterator", &DataLoader::batch_iterator_) # TODO Iterator needs bindings - .def_readwrite("batches_left", &DataLoader::batches_left_) - .def_readwrite("batches_processed", &DataLoader::total_batches_processed_) - .def_readwrite("all_read", &DataLoader::all_read_) - .def_readwrite("edge_buckets_per_buffer", &DataLoader::edge_buckets_per_buffer_) - .def_readwrite("node_ids_per_buffer", &DataLoader::node_ids_per_buffer_) - .def_readwrite("training_neighbor_sampler", &DataLoader::training_neighbor_sampler_) - .def_readwrite("evaluation_neighbor_sampler", &DataLoader::evaluation_neighbor_sampler_) - .def_readwrite("training_negative_sampler", &DataLoader::training_negative_sampler_) - 
.def_readwrite("evaluation_negative_sampler", &DataLoader::evaluation_negative_sampler_) - - .def(py::init([](shared_ptr graph_storage, std::string learning_task, shared_ptr training_config, - shared_ptr evaluation_config, shared_ptr encoder_config) { - LearningTask task = getLearningTask(learning_task); - return std::make_shared(graph_storage, task, training_config, evaluation_config, encoder_config); - }), - py::arg("graph_storage"), py::arg("learning_task"), py::arg("training_config"), py::arg("evaluation_config"), py::arg("encoder_config")) - - .def(py::init([](shared_ptr graph_storage, std::string learning_task, int batch_size, shared_ptr neg_sampler, - shared_ptr nbr_sampler, bool train) { - LearningTask task = getLearningTask(learning_task); - - if (task == LearningTask::LINK_PREDICTION) { - if (train) { - graph_storage->storage_ptrs_.train_edges = graph_storage->storage_ptrs_.edges; - } else { - graph_storage->storage_ptrs_.test_edges = graph_storage->storage_ptrs_.edges; - } - } else if (task == LearningTask::NODE_CLASSIFICATION) { - if (graph_storage->storage_ptrs_.nodes == nullptr) { - throw MariusRuntimeException("Node ids must be provided for node classification"); - } - - if (train) { - if (graph_storage->storage_ptrs_.node_labels == nullptr) { - throw MariusRuntimeException("Labels for the nodes must be provided when training with node classification"); - } - graph_storage->storage_ptrs_.train_nodes = graph_storage->storage_ptrs_.nodes; - } else { - graph_storage->storage_ptrs_.test_nodes = graph_storage->storage_ptrs_.nodes; - } - } - - return std::make_shared(graph_storage, task, batch_size, neg_sampler, nbr_sampler, train); - }), - py::arg("graph_storage"), py::arg("learning_task"), py::arg("batch_size") = 1000, py::arg("neg_sampler") = nullptr, - py::arg("nbr_sampler") = nullptr, py::arg("train") = false) - - .def(py::init([](torch::optional edges, std::string learning_task, torch::optional nodes, - torch::optional node_features, torch::optional node_embeddings, - torch::optional node_optimizer_state, torch::optional node_labels, - torch::optional train_edges, int batch_size, shared_ptr neg_sampler, - shared_ptr nbr_sampler, std::vector filter_edges, bool train) { - shared_ptr edges_s = nullptr; - shared_ptr nodes_s = nullptr; - shared_ptr node_features_s = nullptr; - shared_ptr node_embeddings_s = nullptr; - shared_ptr node_optimizer_state_s = nullptr; - shared_ptr node_labels_s = nullptr; - - LearningTask task = getLearningTask(learning_task); - - if (edges.has_value()) { - edges_s = std::make_shared(edges.value()); - } else { - throw UndefinedTensorException(); - } - - if (nodes.has_value()) { - nodes_s = std::make_shared(nodes.value()); - } else { - if (task == LearningTask::NODE_CLASSIFICATION) { - throw MariusRuntimeException("Tensor of node ids must be provided for node classification"); - } - } - - if (node_features.has_value()) { - node_features_s = std::make_shared(node_features.value()); - } - - if (node_embeddings.has_value()) { - node_embeddings_s = std::make_shared(node_embeddings.value()); - } - - if (node_optimizer_state.has_value()) { - node_optimizer_state_s = std::make_shared(node_optimizer_state.value()); - } else { - if (train && node_embeddings_s != nullptr) { - OptimizerState emb_state = torch::zeros_like(node_embeddings.value()); - node_optimizer_state_s = std::make_shared(emb_state); - } - } - - if (node_labels.has_value()) { - node_labels_s = std::make_shared(node_labels.value()); - } else { - if (task == LearningTask::NODE_CLASSIFICATION && 
train) { - throw MariusRuntimeException("Labels for the nodes must be provided when training with node classification"); - } - } - - GraphModelStoragePtrs ptrs; - ptrs.edges = edges_s; - ptrs.nodes = nodes_s; - ptrs.node_features = node_features_s; - ptrs.node_embeddings = node_embeddings_s; - ptrs.node_optimizer_state = node_optimizer_state_s; - ptrs.node_labels = node_labels_s; - - for (auto f_edges : filter_edges) { - ptrs.filter_edges.emplace_back(std::make_shared(f_edges)); - } - - if (task == LearningTask::LINK_PREDICTION) { - if (train) { - ptrs.train_edges = ptrs.edges; - } else { - ptrs.test_edges = ptrs.edges; - if (train_edges.has_value()) { - ptrs.train_edges = std::make_shared(train_edges.value()); - } - } - } else if (task == LearningTask::NODE_CLASSIFICATION) { - if (train) { - ptrs.train_nodes = ptrs.nodes; - } else { - ptrs.test_nodes = ptrs.nodes; - } - } - - auto gms = std::make_shared(ptrs, false); - return std::make_shared(gms, task, batch_size, neg_sampler, nbr_sampler, train); - }), - py::arg("edges"), py::arg("learning_task"), py::arg("nodes") = nullptr, py::arg("node_features") = nullptr, py::arg("node_embeddings") = nullptr, - py::arg("node_optim_state") = nullptr, py::arg("node_labels") = nullptr, py::arg("train_edges") = nullptr, py::arg("batch_size") = 1000, - py::arg("neg_sampler") = nullptr, py::arg("nbr_sampler") = nullptr, py::arg("filter_edges") = std::vector(), - py::arg("train") = false) - - .def("setBufferOrdering", &DataLoader::setBufferOrdering) - .def("setActiveEdges", &DataLoader::setActiveEdges) - .def("setActiveNodes", &DataLoader::setActiveNodes) - .def("initializeBatches", &DataLoader::initializeBatches, py::arg("prepare_encode") = false) - .def("clearBatches", &DataLoader::clearBatches) - .def("hasNextBatch", &DataLoader::hasNextBatch) - .def("getNextBatch", &DataLoader::getNextBatch, py::return_value_policy::reference) - .def("finishedBatch", &DataLoader::finishedBatch) - .def("getBatch", &DataLoader::getBatch, py::arg("device") = py::none(), py::arg("perform_map") = false, py::arg("worker_id") = 0, - py::return_value_policy::reference) - .def("edgeSample", &DataLoader::edgeSample, py::arg("batch"), py::arg("worker_id") = 0) - .def("nodeSample", &DataLoader::nodeSample, py::arg("batch"), py::arg("worker_id") = 0) - .def("loadCPUParameters", &DataLoader::loadCPUParameters, py::arg("batch")) - .def("loadGPUParameters", &DataLoader::loadGPUParameters, py::arg("batch")) - .def("updateEmbeddings", &DataLoader::updateEmbeddings, py::arg("batch"), py::arg("gpu") = false) - .def("nextEpoch", &DataLoader::nextEpoch) - .def("loadStorage", &DataLoader::loadStorage) - .def("epochComplete", &DataLoader::epochComplete) - .def("unloadStorage", &DataLoader::unloadStorage, py::arg("write") = false) - .def("getNumEdges", &DataLoader::getNumEdges) - .def("getEpochsProcessed", &DataLoader::getEpochsProcessed) - .def("getBatchesProcessed", &DataLoader::getBatchesProcessed) - .def("isTrain", &DataLoader::isTrain) - .def("setTrainSet", &DataLoader::setTrainSet) - .def("setValidationSet", &DataLoader::setValidationSet) - .def("setTestSet", &DataLoader::setTestSet); -} \ No newline at end of file diff --git a/src/cpp/python_bindings/data/graph_wrap.cpp b/src/cpp/python_bindings/data/graph_wrap.cpp deleted file mode 100644 index 74f49f80..00000000 --- a/src/cpp/python_bindings/data/graph_wrap.cpp +++ /dev/null @@ -1,49 +0,0 @@ -#include "common/pybind_headers.h" -#include "data/graph.h" - -void init_graph(py::module &m) { - py::class_>(m, "MariusGraph") - 
.def_readwrite("src_sorted_edges", &MariusGraph::src_sorted_edges_) - .def_readwrite("dst_sorted_edges", &MariusGraph::dst_sorted_edges_) - .def_readwrite("active_in_memory_subgraph", &MariusGraph::active_in_memory_subgraph_) - .def_readwrite("num_nodes_in_memory", &MariusGraph::num_nodes_in_memory_) - .def_readwrite("node_ids", &MariusGraph::node_ids_) - .def_readwrite("out_sorted_uniques", &MariusGraph::out_sorted_uniques_) - .def_readwrite("out_offsets", &MariusGraph::out_offsets_) - .def_readwrite("out_num_neighbors", &MariusGraph::out_num_neighbors_) - .def_readwrite("in_sorted_uniques", &MariusGraph::in_sorted_uniques_) - .def_readwrite("in_offsets", &MariusGraph::in_offsets_) - .def_readwrite("in_num_neighbors", &MariusGraph::in_num_neighbors_) - .def_readwrite("max_out_num_neighbors_", &MariusGraph::max_out_num_neighbors_) - .def_readwrite("max_in_num_neighbors_", &MariusGraph::max_in_num_neighbors_) - .def(py::init<>()) - .def(py::init(), py::arg("src_sorted_edges"), py::arg("dst_sorted_edges"), py::arg("num_nodes_in_memory")) - .def("getEdges", &MariusGraph::getEdges, py::arg("incoming") = true) - .def("getRelationIDs", &MariusGraph::getRelationIDs, py::arg("incoming") = true) - .def("getNeighborOffsets", &MariusGraph::getNeighborOffsets, py::arg("incoming") = true) - .def("getNumNeighbors", &MariusGraph::getNumNeighbors, py::arg("incoming") = true) - .def("getNeighborsForNodeIds", &MariusGraph::getNeighborsForNodeIds, py::arg("node_ids"), py::arg("incoming"), py::arg("neighbor_sampling_layer"), - py::arg("max_neighbors_size"), py::arg("rate")) - .def("clear", &MariusGraph::clear) - .def("to", &MariusGraph::to, py::arg("device")); - - py::class_>(m, "DENSEGraph") - .def_readwrite("hop_offsets", &DENSEGraph::hop_offsets_) - .def_readwrite("in_neighbors", &DENSEGraph::in_neighbors_mapping_, "description of the variable") - .def_readwrite("out_neighbors", &DENSEGraph::out_neighbors_mapping_) - .def_readwrite("in_neighbors_vec", &DENSEGraph::in_neighbors_vec_) - .def_readwrite("out_neighbors_vec", &DENSEGraph::out_neighbors_vec_) - .def_readwrite("node_properties", &DENSEGraph::node_properties_) - .def_readwrite("num_nodes_in_memory", &DENSEGraph::num_nodes_in_memory_) - .def(py::init<>()) - .def(py::init, Indices, Indices, std::vector, Indices, int>(), - py::arg("hop_offsets"), py::arg("node_ids"), py::arg("in_offsets"), py::arg("in_neighbors_vec"), py::arg("in_neighbors"), py::arg("out_offsets"), - py::arg("out_neighbors_vec"), py::arg("out_neighbors"), py::arg("num_nodes_in_memory")) - .def("prepareForNextLayer", &DENSEGraph::prepareForNextLayer) - .def("getNeighborIDs", &DENSEGraph::getNeighborIDs, py::arg("incoming") = true, py::arg("global_ids") = false) - .def("getLayerOffset", &DENSEGraph::getLayerOffset) - .def("performMap", &DENSEGraph::performMap) - .def("setNodeProperties", &DENSEGraph::setNodeProperties, py::arg("node_properties")) - .def("clear", &DENSEGraph::clear) - .def("to", &DENSEGraph::to, py::arg("device"), py::arg("compute_stream") = nullptr, py::arg("transfer_stream") = nullptr); -} \ No newline at end of file diff --git a/src/cpp/python_bindings/data/samplers/edge_wrap.cpp b/src/cpp/python_bindings/data/samplers/edge_wrap.cpp deleted file mode 100644 index 33aaf966..00000000 --- a/src/cpp/python_bindings/data/samplers/edge_wrap.cpp +++ /dev/null @@ -1,22 +0,0 @@ -// -// Created by Jason Mohoney on 2/14/22. 
-// - -#include "common/pybind_headers.h" -#include "data/samplers/edge.h" - -class PyEdgeSampler : EdgeSampler { - public: - using EdgeSampler::EdgeSampler; - EdgeList getEdges(shared_ptr batch) override { PYBIND11_OVERRIDE_PURE_NAME(EdgeList, EdgeSampler, "getEdges", getEdges, batch); } -}; - -void init_edge_samplers(py::module &m) { - py::class_>(m, "EdgeSampler") - .def_readwrite("graph_storage", &EdgeSampler::graph_storage_) - .def("getEdges", &EdgeSampler::getEdges, py::arg("batch")); - - py::class_>(m, "RandomEdgeSampler") - .def_readwrite("without_replacement", &RandomEdgeSampler::without_replacement_) - .def(py::init, bool>(), py::arg("graph_storage"), py::arg("without_replacement") = true); -} \ No newline at end of file diff --git a/src/cpp/python_bindings/data/samplers/negative_wrap.cpp b/src/cpp/python_bindings/data/samplers/negative_wrap.cpp deleted file mode 100644 index 928b1e60..00000000 --- a/src/cpp/python_bindings/data/samplers/negative_wrap.cpp +++ /dev/null @@ -1,32 +0,0 @@ -// -// Created by Jason Mohoney on 2/14/22. -// - -#include "common/pybind_headers.h" -#include "data/samplers/negative.h" - -class PyNegativeSampler : NegativeSampler { - public: - using NegativeSampler::NegativeSampler; - using ReturnTensorTuple = std::tuple; - std::tuple getNegatives(shared_ptr graph, torch::Tensor edges, bool inverse) override { - PYBIND11_OVERRIDE_PURE_NAME(ReturnTensorTuple, NegativeSampler, "getNegatives", getNegatives, graph, edges, inverse); - } -}; - -void init_neg_samplers(py::module &m) { - py::class_>(m, "NegativeSampler") - .def("getNegatives", &NegativeSampler::getNegatives, py::arg("graph"), py::arg("edges") = torch::Tensor(), py::arg("inverse") = false); - - py::class_>(m, "CorruptNodeNegativeSampler") - .def_readwrite("num_chunks", &CorruptNodeNegativeSampler::num_chunks_) - .def_readwrite("num_negatives", &CorruptNodeNegativeSampler::num_negatives_) - .def_readwrite("degree_fraction", &CorruptNodeNegativeSampler::degree_fraction_) - .def_readwrite("filtered", &CorruptNodeNegativeSampler::filtered_) - .def(py::init([](int num_chunks, int num_negatives, float degree_fraction, bool filtered, string filter_mode) { - auto deg_filter_mode = getLocalFilterMode(filter_mode); - return std::make_shared(num_chunks, num_negatives, degree_fraction, filtered, deg_filter_mode); - }), - py::arg("num_chunks") = 1, py::arg("num_negatives") = 500, py::arg("degree_fraction") = 0.0, py::arg("filtered") = false, - py::arg("local_filter_mode") = "deg"); -} diff --git a/src/cpp/python_bindings/data/samplers/neighbor_wrap.cpp b/src/cpp/python_bindings/data/samplers/neighbor_wrap.cpp deleted file mode 100644 index 6e170ec6..00000000 --- a/src/cpp/python_bindings/data/samplers/neighbor_wrap.cpp +++ /dev/null @@ -1,85 +0,0 @@ -// -// Created by Jason Mohoney on 2/14/22. 
-// - -#include "common/pybind_headers.h" -#include "data/samplers/neighbor.h" - -class PyNeighborSampler : NeighborSampler { - public: - using NeighborSampler::NeighborSampler; - DENSEGraph getNeighbors(torch::Tensor node_ids, shared_ptr graph, int worker_id) override { - PYBIND11_OVERRIDE_PURE_NAME(DENSEGraph, NeighborSampler, "getNeighbors", getNeighbors, node_ids, graph, worker_id); - } -}; - -void init_neighbor_samplers(py::module &m) { - py::class_>(m, "NeighborSampler") - .def_readwrite("storage", &NeighborSampler::storage_) - .def("getNeighbors", &NeighborSampler::getNeighbors, py::arg("node_ids"), py::arg("graph") = nullptr, py::arg("worker_id") = 0); - - py::class_>(m, "LayeredNeighborSampler") - .def_readwrite("sampling_layers", &LayeredNeighborSampler::sampling_layers_) - - .def(py::init([](shared_ptr storage, std::vector num_neighbors, bool use_hashmap_sets) { - std::vector> sampling_layers; - for (auto n : num_neighbors) { - shared_ptr ptr = std::make_shared(); - if (n == -1) { - ptr->type = NeighborSamplingLayer::ALL; - ptr->options = std::make_shared(); - } else { - ptr->type = NeighborSamplingLayer::UNIFORM; - auto opts = std::make_shared(); - opts->max_neighbors = n; - ptr->options = opts; - } - ptr->use_hashmap_sets = use_hashmap_sets; - sampling_layers.emplace_back(ptr); - } - return std::make_shared(storage, sampling_layers); - }), - py::arg("storage"), py::arg("num_neighbors"), py::arg("use_hashmap_sets") = false) - - .def(py::init([](shared_ptr graph, std::vector num_neighbors, bool use_hashmap_sets) { - std::vector> sampling_layers; - - for (auto n : num_neighbors) { - shared_ptr ptr = std::make_shared(); - if (n == -1) { - ptr->type = NeighborSamplingLayer::ALL; - ptr->options = std::make_shared(); - } else { - ptr->type = NeighborSamplingLayer::UNIFORM; - auto opts = std::make_shared(); - opts->max_neighbors = n; - ptr->options = opts; - } - ptr->use_hashmap_sets = use_hashmap_sets; - sampling_layers.emplace_back(ptr); - } - return std::make_shared(graph, sampling_layers); - }), - py::arg("graph"), py::arg("num_neighbors"), py::arg("use_hashmap_sets") = false) - - .def(py::init([](std::vector num_neighbors, bool use_hashmap_sets) { - std::vector> sampling_layers; - - for (auto n : num_neighbors) { - shared_ptr ptr = std::make_shared(); - if (n == -1) { - ptr->type = NeighborSamplingLayer::ALL; - ptr->options = std::make_shared(); - } else { - ptr->type = NeighborSamplingLayer::UNIFORM; - auto opts = std::make_shared(); - opts->max_neighbors = n; - ptr->options = opts; - } - ptr->use_hashmap_sets = use_hashmap_sets; - sampling_layers.emplace_back(ptr); - } - return std::make_shared(sampling_layers); - }), - py::arg("num_neighbors"), py::arg("use_hashmap_sets") = false); -} diff --git a/src/cpp/python_bindings/data/wrap.cpp b/src/cpp/python_bindings/data/wrap.cpp deleted file mode 100644 index f7cd4167..00000000 --- a/src/cpp/python_bindings/data/wrap.cpp +++ /dev/null @@ -1,29 +0,0 @@ -#include "common/pybind_headers.h" - -// data -void init_batch(py::module &); -void init_dataloader(py::module &); -void init_graph(py::module &); - -// data/samplers -void init_edge_samplers(py::module &); -void init_neg_samplers(py::module &); -void init_neighbor_samplers(py::module &); - -PYBIND11_MODULE(_data, m) { - m.doc() = "Objects for in memory processing and sampling"; - - // data/samplers - auto samplers_m = m.def_submodule("samplers"); - - samplers_m.doc() = "Graph Samplers"; - - init_edge_samplers(samplers_m); - init_neg_samplers(samplers_m); - 
init_neighbor_samplers(samplers_m); - - // data - init_batch(m); - init_dataloader(m); - init_graph(m); -} diff --git a/src/cpp/src/configuration/config.cpp b/src/cpp/src/configuration/config.cpp index 119ad520..7561db29 100644 --- a/src/cpp/src/configuration/config.cpp +++ b/src/cpp/src/configuration/config.cpp @@ -473,6 +473,10 @@ shared_ptr initTrainingConfig(pyobj python_config) { ret_config->checkpoint = initCheckpointConfig(python_config.attr("checkpoint")); ret_config->resume_training = cast_helper(python_config.attr("resume_training")); ret_config->resume_from_checkpoint = cast_helper(python_config.attr("resume_from_checkpoint")); + ret_config->batches_per_epoch = -1; + if(pybind11::hasattr(python_config, "batches_per_epoch")) { + ret_config->batches_per_epoch = cast_helper(python_config.attr("batches_per_epoch")); + } return ret_config; } @@ -485,6 +489,11 @@ shared_ptr initEvaluationConfig(pyobj python_config) { ret_config->pipeline = initPipelineConfig(python_config.attr("pipeline")); ret_config->epochs_per_eval = cast_helper(python_config.attr("epochs_per_eval")); ret_config->checkpoint_dir = cast_helper(python_config.attr("checkpoint_dir")); + ret_config->batches_per_epoch = -1; + if(pybind11::hasattr(python_config, "batches_per_epoch")) { + ret_config->batches_per_epoch = cast_helper(python_config.attr("batches_per_epoch")); + } + return ret_config; } diff --git a/src/cpp/src/data/dataloader.cpp b/src/cpp/src/data/dataloader.cpp index 456e1ccc..da3634f0 100644 --- a/src/cpp/src/data/dataloader.cpp +++ b/src/cpp/src/data/dataloader.cpp @@ -199,7 +199,7 @@ void DataLoader::setActiveNodes() { graph_storage_->setActiveNodes(node_ids); } -void DataLoader::initializeBatches(bool prepare_encode) { +void DataLoader::initializeBatches(bool prepare_encode, bool in_training_mode) { int64_t batch_id = 0; int64_t start_idx = 0; @@ -208,6 +208,11 @@ void DataLoader::initializeBatches(bool prepare_encode) { all_read_ = false; int64_t num_items; + int max_batches = training_config_->batches_per_epoch; + if(!in_training_mode) { + max_batches = evaluation_config_->batches_per_epoch; + } + if (prepare_encode) { num_items = graph_storage_->getNumNodes(); } else { @@ -240,9 +245,14 @@ void DataLoader::initializeBatches(bool prepare_encode) { batches.emplace_back(curr_batch); batch_id++; start_idx += batch_size; + + // Only keep the first max_batches batches + if(max_batches > 0 && batches.size() >= max_batches) { + break; + } } - batches_ = batches; + batches_ = batches; batches_left_ = batches_.size(); batch_iterator_ = batches_.begin(); } diff --git a/src/cpp/src/marius.cpp b/src/cpp/src/marius.cpp index 394ac728..288cb198 100644 --- a/src/cpp/src/marius.cpp +++ b/src/cpp/src/marius.cpp @@ -108,7 +108,6 @@ void marius_train(shared_ptr marius_config) { auto graph_model_storage = std::get<1>(tup); auto dataloader = std::get<2>(tup); - /* shared_ptr trainer; shared_ptr evaluator; @@ -160,7 +159,7 @@ void marius_train(shared_ptr marius_config) { if (marius_config->storage->export_encoded_nodes) { encode_and_export(dataloader, model, marius_config); } - } */ + } } void marius_eval(shared_ptr marius_config) { @@ -171,7 +170,6 @@ void marius_eval(shared_ptr marius_config) { shared_ptr evaluator; - /* if (marius_config->evaluation->epochs_per_eval > 0) { if (marius_config->evaluation->pipeline->sync) { evaluator = std::make_shared(dataloader, model); @@ -184,7 +182,6 @@ void marius_eval(shared_ptr marius_config) { if (marius_config->storage->export_encoded_nodes) { encode_and_export(dataloader, model, 
marius_config); } - */ } void marius(int argc, char *argv[]) { diff --git a/src/cpp/src/pipeline/evaluator.cpp b/src/cpp/src/pipeline/evaluator.cpp index d4475018..256d988b 100644 --- a/src/cpp/src/pipeline/evaluator.cpp +++ b/src/cpp/src/pipeline/evaluator.cpp @@ -30,7 +30,7 @@ void PipelineEvaluator::evaluate(bool validation) { } } - dataloader_->initializeBatches(false); + dataloader_->initializeBatches(false, false); if (dataloader_->evaluation_negative_sampler_ != nullptr) { if (dataloader_->evaluation_config_->negative_sampling->filtered) { @@ -66,7 +66,7 @@ void SynchronousEvaluator::evaluate(bool validation) { } } - dataloader_->initializeBatches(false); + dataloader_->initializeBatches(false, false); if (dataloader_->evaluation_negative_sampler_ != nullptr) { if (dataloader_->evaluation_config_->negative_sampling->filtered) { diff --git a/src/cpp/src/pipeline/trainer.cpp b/src/cpp/src/pipeline/trainer.cpp index c946357c..a42178db 100644 --- a/src/cpp/src/pipeline/trainer.cpp +++ b/src/cpp/src/pipeline/trainer.cpp @@ -23,6 +23,12 @@ PipelineTrainer::PipelineTrainer(shared_ptr dataloader, shared_ptrgraph_storage_->storage_ptrs_.train_nodes->getDim0(); } + // Log for every batch + if(dataloader_->training_config_->batches_per_epoch > 0) { + num_items = dataloader_->training_config_->batches_per_epoch * dataloader_->training_config_->batch_size; + logs_per_epoch = dataloader_->training_config_->batches_per_epoch; + } + progress_reporter_ = std::make_shared(item_name, num_items, logs_per_epoch); if (model->device_.is_cuda()) { @@ -37,7 +43,7 @@ void PipelineTrainer::train(int num_epochs) { dataloader_->setTrainSet(); } - dataloader_->initializeBatches(false); + dataloader_->initializeBatches(false, true); Timer timer = Timer(false); for (int epoch = 0; epoch < num_epochs; epoch++) { @@ -88,6 +94,12 @@ SynchronousTrainer::SynchronousTrainer(shared_ptr dataloader, shared num_items = dataloader_->graph_storage_->storage_ptrs_.train_nodes->getDim0(); } + // Log for every batch + if(dataloader_->training_config_->batches_per_epoch > 0) { + num_items = dataloader_->training_config_->batches_per_epoch * dataloader_->training_config_->batch_size; + logs_per_epoch = dataloader_->training_config_->batches_per_epoch; + } + progress_reporter_ = std::make_shared(item_name, num_items, logs_per_epoch); } @@ -96,7 +108,7 @@ void SynchronousTrainer::train(int num_epochs) { dataloader_->setTrainSet(); } - dataloader_->initializeBatches(false); + dataloader_->initializeBatches(false, true); Timer timer = Timer(false); diff --git a/src/cpp/src/storage/io.cpp b/src/cpp/src/storage/io.cpp index 6d5e4494..8b49630d 100644 --- a/src/cpp/src/storage/io.cpp +++ b/src/cpp/src/storage/io.cpp @@ -16,6 +16,7 @@ inline bool does_file_exists(const std::string& file_path) { } std::map> initializeEdges(shared_ptr storage_config, LearningTask learning_task) { + // Determined the file paths string train_filename = storage_config->dataset->dataset_dir + PathConstants::edges_directory + PathConstants::training + PathConstants::edges_file + PathConstants::file_ext; @@ -106,7 +107,6 @@ std::map> initializeEdges(shared_ptr(train_filename, num_train, num_columns, dtype, torch::kCPU); if (!storage_config->train_edges_pre_sorted) { @@ -144,7 +144,6 @@ std::map> initializeEdges(shared_ptr(train_filename, num_train, num_columns, dtype, storage_config->device_type); if (!storage_config->train_edges_pre_sorted) { @@ -230,7 +229,6 @@ std::map> initializeEdges(shared_ptrshuffle_input); if (storage_config->shuffle_input) { if 
(valid_edge_storage != nullptr) { valid_edge_storage->shuffle(valid_edge_weights_storage); @@ -240,14 +238,15 @@ std::map> initializeEdges(shared_ptr> storage_ptrs{{"train_edge_storage", train_edge_storage}, - {"train_edge_storage_dst_sort", train_edge_storage_dst_sort}, - {"valid_edge_storage", valid_edge_storage}, - {"test_edge_storage", test_edge_storage}, + std::map> storage_ptrs{{"train_edges", train_edge_storage}, + {"train_edges_dst_sort", train_edge_storage_dst_sort}, + {"validation_edges", valid_edge_storage}, + {"test_edges", test_edge_storage}, {"train_edge_weights_storage", train_edge_weights_storage}, {"valid_edge_weights_storage", valid_edge_weights_storage}, {"test_edge_weights_storage", test_edge_weights_storage}, {"train_edge_dst_sort_weights_storage", train_edge_dst_sort_weights_storage}}; + return storage_ptrs; } @@ -486,7 +485,9 @@ shared_ptr initializeStorageLinkPrediction(shared_ptr storage_ptrs.train_edges_dst_sort = edge_storages["train_edges_dst_sort"]; storage_ptrs.validation_edges = edge_storages["validation_edges"]; storage_ptrs.test_edges = edge_storages["test_edges"]; + storage_ptrs.train_edges_weights = edge_storages["train_edge_weights_storage"]; + storage_ptrs.train_edges_dst_sort_weights = edge_storages["train_edge_dst_sort_weights_storage"]; storage_ptrs.validation_edges_weights = edge_storages["valid_edge_weights_storage"]; storage_ptrs.test_edges_weights = edge_storages["test_edge_weights_storage"]; @@ -512,7 +513,11 @@ shared_ptr initializeStorageNodeClassification(shared_ptr(node_id_storages); storage_ptrs.valid_nodes = std::get<1>(node_id_storages); diff --git a/src/cpp/third_party/CMakeLists.txt b/src/cpp/third_party/CMakeLists.txt deleted file mode 100644 index 1423a979..00000000 --- a/src/cpp/third_party/CMakeLists.txt +++ /dev/null @@ -1,21 +0,0 @@ -function(initialize_submodule DIRECTORY) - if(NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${DIRECTORY}/.git) - find_package(Git QUIET REQUIRED) - message(STATUS "${CMAKE_CURRENT_SOURCE_DIR}/${DIRECTORY}/.git does not exist. 
Initializing ${DIRECTORY} submodule ...") - execute_process(COMMAND ${GIT_EXECUTABLE} submodule update --init ${DIRECTORY} - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - RESULT_VARIABLE GIT_EXIT_CODE) - if(NOT GIT_EXIT_CODE EQUAL "0") - message(FATAL_ERROR "${GIT_EXECUTABLE} submodule update --init dependencies/${DIRECTORY} failed with exit code ${GIT_EXIT_CODE}, please checkout submodules") - endif() - endif() -endfunction(initialize_submodule) - -initialize_submodule(pybind11) -initialize_submodule(spdlog) -initialize_submodule(googletest) -initialize_submodule(parallel-hashmap) - -add_subdirectory(googletest EXCLUDE_FROM_ALL) -add_subdirectory(pybind11 EXCLUDE_FROM_ALL) -add_subdirectory(spdlog EXCLUDE_FROM_ALL) diff --git a/src/cpp/third_party/googletest b/src/cpp/third_party/googletest deleted file mode 160000 index 23ef2955..00000000 --- a/src/cpp/third_party/googletest +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 23ef29555ef4789f555f1ba8c51b4c52975f0907 diff --git a/src/cpp/third_party/parallel-hashmap b/src/cpp/third_party/parallel-hashmap deleted file mode 160000 index f7fbefac..00000000 --- a/src/cpp/third_party/parallel-hashmap +++ /dev/null @@ -1 +0,0 @@ -Subproject commit f7fbefac75c9f29538bfe7b65f3b67c082c127d6 diff --git a/src/cpp/third_party/pybind11 b/src/cpp/third_party/pybind11 deleted file mode 160000 index e08a5811..00000000 --- a/src/cpp/third_party/pybind11 +++ /dev/null @@ -1 +0,0 @@ -Subproject commit e08a58111dbea38d667b209f7543864d51a3b185 diff --git a/src/cpp/third_party/spdlog b/src/cpp/third_party/spdlog deleted file mode 160000 index c5abaedd..00000000 --- a/src/cpp/third_party/spdlog +++ /dev/null @@ -1 +0,0 @@ -Subproject commit c5abaeddcaa67e885de99db583396899a0cf43e6 diff --git a/src/python/tools/configuration/marius_config.py b/src/python/tools/configuration/marius_config.py index a9a0a322..fd82740f 100644 --- a/src/python/tools/configuration/marius_config.py +++ b/src/python/tools/configuration/marius_config.py @@ -724,6 +724,7 @@ def merge(self, input_config: DictConfig): @dataclass class TrainingConfig: batch_size: int = 1000 + batches_per_epoch: int = -1 negative_sampling: NegativeSamplingConfig = MISSING num_epochs: int = 10 pipeline: PipelineConfig = PipelineConfig() @@ -776,6 +777,7 @@ def merge(self, input_config: DictConfig): @dataclass class EvaluationConfig: batch_size: int = 1000 + batches_per_epoch: int = -1 negative_sampling: NegativeSamplingConfig = MISSING pipeline: PipelineConfig = PipelineConfig() epochs_per_eval: int = 1 @@ -946,4 +948,4 @@ def load_config(input_config_path, save=False): check_gnn_layers_alignment(output_config) check_full_graph_evaluation(output_config) - return output_config + return output_config \ No newline at end of file diff --git a/test/cpp/unit/data/samplers/test_negative.cpp b/test/cpp/unit/data/samplers/test_negative.cpp deleted file mode 100644 index c025f081..00000000 --- a/test/cpp/unit/data/samplers/test_negative.cpp +++ /dev/null @@ -1,234 +0,0 @@ -// -// Created by Jason Mohoney on 2/9/22. 
-// - -#include -#include - -int num_nodes = 6; - -torch::Tensor edges = torch::tensor({{0, 2}, {0, 4}, {1, 3}, {1, 5}, {4, 2}, {5, 2}}, torch::kInt64); - -torch::Tensor typed_edges = torch::tensor({{0, 0, 2}, {0, 1, 4}, {1, 1, 3}, {1, 0, 5}, {4, 0, 2}, {5, 1, 2}}, torch::kInt64); - -torch::Tensor batch_edges = torch::tensor({{1, 5}, {0, 2}, {4, 2}}, torch::kInt64); -torch::Tensor batch_typed_edges = torch::tensor({{1, 0, 5}, {0, 0, 2}, {4, 0, 2}}, torch::kInt64); - -class CorruptNodeNegativeSamplerTest : public ::testing::Test { - protected: - shared_ptr graph; - shared_ptr typed_graph; - - void SetUp() override { - torch::Tensor dst_sorted_edges = edges.index_select(0, edges.select(1, 1).argsort(0)); - torch::Tensor dst_sorted_typed_edges = typed_edges.index_select(0, typed_edges.select(1, 2).argsort(0)); - - graph = std::make_shared(edges, dst_sorted_edges, num_nodes); - typed_graph = std::make_shared(typed_edges, dst_sorted_typed_edges, num_nodes); - - graph->sortAllEdges(torch::tensor({{0, 3}}, torch::kInt64)); - typed_graph->sortAllEdges(torch::tensor({{0, 1, 3}}, torch::kInt64)); - } -}; - -void validate_sample(shared_ptr sampler, torch::Tensor sample, shared_ptr graph) { - // validate shape - ASSERT_EQ(sample.size(0), sampler->num_chunks_); - - if (sampler->num_negatives_ != -1) { - ASSERT_EQ(sample.size(1), sampler->num_negatives_); - } else { - ASSERT_EQ(sample.size(1), graph->num_nodes_in_memory_); - } - - // validate max and min ids - ASSERT_TRUE(sample.max().item() < graph->num_nodes_in_memory_); - ASSERT_TRUE(sample.min().item() >= 0); -} - -void validate_filter_local(torch::Tensor filter, torch::Tensor sample, torch::Tensor edges_t, bool inverse) { - // check filtered edges are present in the graph - auto batch_accessor = edges_t.accessor(); - auto sample_accessor = sample.accessor(); - auto filter_accessor = filter.accessor(); - - bool has_relations = false; - if (edges_t.size(1) == 3) { - has_relations = true; - } - - int64_t num_chunks = sample.size(0); - int64_t num_edges = edges_t.size(0); - int64_t chunk_size = ceil((double)num_edges / num_chunks); - - for (int i = 0; i < filter.size(0); i++) { - int64_t src; - int64_t rel; - int64_t dst; - - bool found = false; - - int64_t edge_id = filter_accessor[i][0]; - - int chunk_id = edge_id / chunk_size; - - if (inverse) { - src = sample_accessor[chunk_id][filter_accessor[i][1]]; - if (has_relations) { - rel = batch_accessor[edge_id][1]; - dst = batch_accessor[edge_id][2]; - } else { - dst = batch_accessor[edge_id][1]; - } - } else { - src = batch_accessor[edge_id][0]; - dst = sample_accessor[chunk_id][filter_accessor[i][1]]; - if (has_relations) { - rel = batch_accessor[edge_id][1]; - } - } - - if (has_relations) { - for (int k = 0; k < edges_t.size(0); k++) { - if (batch_accessor[k][0] == src && batch_accessor[k][1] == rel && batch_accessor[k][2] == dst) { - found = true; - } - } - } else { - for (int k = 0; k < edges_t.size(0); k++) { - if (batch_accessor[k][0] == src && batch_accessor[k][1] == dst) { - found = true; - } - } - } - ASSERT_TRUE(found); - } -} - -void validate_filter_global(torch::Tensor filter, torch::Tensor sample, shared_ptr graph, torch::Tensor edges_t, bool inverse) { - // check filtered edges are present in the graph - auto graph_accessor = graph->src_sorted_edges_.accessor(); - auto batch_accessor = edges_t.accessor(); - auto sample_accessor = sample.accessor(); - auto filter_accessor = filter.accessor(); - - bool has_relations = false; - if (edges_t.size(1) == 3) { - has_relations = true; - } - - 
int64_t num_chunks = sample.size(0); - int64_t num_edges = edges_t.size(0); - int64_t chunk_size = ceil((double)num_edges / num_chunks); - - for (int i = 0; i < filter.size(0); i++) { - int64_t src; - int64_t rel; - int64_t dst; - - bool found = false; - - int64_t edge_id = filter_accessor[i][0]; - - int chunk_id = edge_id / chunk_size; - - if (inverse) { - src = sample_accessor[chunk_id][filter_accessor[i][1]]; - if (has_relations) { - rel = batch_accessor[edge_id][1]; - dst = batch_accessor[edge_id][2]; - } else { - dst = batch_accessor[edge_id][1]; - } - } else { - src = batch_accessor[edge_id][0]; - dst = sample_accessor[chunk_id][filter_accessor[i][1]]; - if (has_relations) { - rel = batch_accessor[edge_id][1]; - } - } - - if (has_relations) { - for (int k = 0; k < graph->src_sorted_edges_.size(0); k++) { - if (graph_accessor[k][0] == src && graph_accessor[k][1] == rel && graph_accessor[k][2] == dst) { - found = true; - } - } - } else { - for (int k = 0; k < graph->src_sorted_edges_.size(0); k++) { - if (graph_accessor[k][0] == src && graph_accessor[k][1] == dst) { - found = true; - } - } - } - ASSERT_TRUE(found); - } -} - -void test_unfiltered_corruption_sampler(shared_ptr sampler, shared_ptr graph, torch::Tensor edges_t) { - torch::Tensor sample; - torch::Tensor filter; - - std::tie(sample, filter) = sampler->getNegatives(graph, edges_t, false); - validate_sample(sampler, sample, graph); - validate_filter_local(filter, sample, edges_t, false); - - std::tie(sample, filter) = sampler->getNegatives(graph, edges_t, true); - validate_sample(sampler, sample, graph); - validate_filter_local(filter, sample, edges_t, true); -} - -void test_filtered_corruption_sampler(shared_ptr sampler, shared_ptr graph, torch::Tensor edges_t) { - torch::Tensor sample; - torch::Tensor filter; - - std::tie(sample, filter) = sampler->getNegatives(graph, edges_t, false); - validate_sample(sampler, sample, graph); - validate_filter_global(filter, sample, graph, edges_t, false); - - std::tie(sample, filter) = sampler->getNegatives(graph, edges_t, true); - validate_sample(sampler, sample, graph); - validate_filter_global(filter, sample, graph, edges_t, true); -} - -TEST_F(CorruptNodeNegativeSamplerTest, TestUniform) { - auto corrupt_uniform = std::make_shared(1, 5, 0.0, false); - test_unfiltered_corruption_sampler(corrupt_uniform, graph, batch_edges); - test_unfiltered_corruption_sampler(corrupt_uniform, typed_graph, batch_typed_edges); -} - -TEST_F(CorruptNodeNegativeSamplerTest, TestUniformChunked) { - auto corrupt_uniform_chunked = std::make_shared(3, 5, 0.0, false); - test_unfiltered_corruption_sampler(corrupt_uniform_chunked, graph, batch_edges); - test_unfiltered_corruption_sampler(corrupt_uniform_chunked, typed_graph, batch_typed_edges); -} - -TEST_F(CorruptNodeNegativeSamplerTest, TestMix) { - auto corrupt_mix = std::make_shared(1, 5, 0.5, false); - test_unfiltered_corruption_sampler(corrupt_mix, graph, batch_edges); - test_unfiltered_corruption_sampler(corrupt_mix, typed_graph, batch_typed_edges); -} - -TEST_F(CorruptNodeNegativeSamplerTest, TestMixChunked) { - auto corrupt_mix_chunked = std::make_shared(3, 5, 0.5, false); - test_unfiltered_corruption_sampler(corrupt_mix_chunked, graph, batch_edges); - test_unfiltered_corruption_sampler(corrupt_mix_chunked, typed_graph, batch_typed_edges); -} - -TEST_F(CorruptNodeNegativeSamplerTest, TestAllDegree) { - auto corrupt_all_degree = std::make_shared(1, 5, 1.0, false); - test_unfiltered_corruption_sampler(corrupt_all_degree, graph, batch_edges); - 
test_unfiltered_corruption_sampler(corrupt_all_degree, typed_graph, batch_typed_edges); -} - -TEST_F(CorruptNodeNegativeSamplerTest, TestAllDegreeChunked) { - auto corrupt_all_degree_chunked = std::make_shared(3, 5, 1.0, false); - test_unfiltered_corruption_sampler(corrupt_all_degree_chunked, graph, batch_edges); - test_unfiltered_corruption_sampler(corrupt_all_degree_chunked, typed_graph, batch_typed_edges); -} - -TEST_F(CorruptNodeNegativeSamplerTest, TestFilter) { - auto corrupt_filtered = std::make_shared(1, -1, 0.0, true); - test_filtered_corruption_sampler(corrupt_filtered, graph, batch_edges); - test_filtered_corruption_sampler(corrupt_filtered, typed_graph, batch_typed_edges); -} \ No newline at end of file
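
The substantive change in this patch is the new batches_per_epoch field on TrainingConfig and EvaluationConfig (default -1, meaning every batch in the epoch is used). When set to a positive value, DataLoader::initializeBatches keeps only the first batches_per_epoch batches of an epoch, and the trainers size the progress reporter as batches_per_epoch * batch_size items with one log per batch. A minimal sketch of how the option could be set in a Marius YAML configuration follows; the section and key names are assumed to mirror the Python dataclasses above and are illustrative rather than verified against a real config file:

    training:
      batch_size: 1000
      num_epochs: 10
      batches_per_epoch: 500    # truncate each training epoch to its first 500 batches
    evaluation:
      batch_size: 1000
      batches_per_epoch: 100    # cap each evaluation pass at 100 batches

Leaving the value at -1 preserves the previous behavior, since the truncation in initializeBatches only triggers when the configured value is greater than zero.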