Skip to content

Commit

Permalink
dgl 0.4.2 with ZINC-full
Browse files Browse the repository at this point in the history
  • Loading branch information
vijaydwivedi75 committed Nov 2, 2020
1 parent 03da731 commit c57794a
Show file tree
Hide file tree
Showing 14 changed files with 1,254 additions and 404 deletions.
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@

## Updates

**Nov 2, 2020**
* This branch of the project is compatible with DGL 0.4.2.
* Added [ZINC-full](./data/script_download_molecules.sh) dataset (249K molecular graphs) with [scripts](./scripts/ZINC-full/).



**Jun 11, 2020**
* Second release of the project. Major updates:
+ Added experimental pipeline for Weisfeiler-Lehman-GNNs operating on dense rank-2 tensors.
Expand Down
199 changes: 2 additions & 197 deletions data/SBMs/generate_SBM_CLUSTER.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -208,9 +208,7 @@
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": false
},
"metadata": {},
"outputs": [
{
"name": "stdout",
Expand Down Expand Up @@ -392,199 +390,6 @@
"print('Time (sec):',time.time() - start) # 190s\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Convert to DGL format and save with pickle"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/Users/xbresson/Documents/Dropbox/06_NTU_2017_now/03_my_codes/34_benchmark20/GITHUB_benchmark_project/benchmarking-gnn\n"
]
}
],
"source": [
"import os\n",
"os.chdir('../../') # go to root folder of the project\n",
"print(os.getcwd())\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"\n",
"import pickle\n",
"\n",
"%load_ext autoreload\n",
"%autoreload 2\n",
"\n",
"from data.SBMs import SBMsDatasetDGL \n",
"\n",
"from data.data import LoadData\n",
"from torch.utils.data import DataLoader\n",
"from data.SBMs import SBMsDataset\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[I] Loading data ...\n",
"preparing 10000 graphs for the TRAIN set...\n",
"preparing 1000 graphs for the TEST set...\n",
"preparing 1000 graphs for the VAL set...\n",
"[I] Finished loading.\n",
"[I] Data load time: 3983.7924s\n",
"Time (sec): 3983.794214248657\n"
]
}
],
"source": [
"DATASET_NAME = 'SBM_CLUSTER'\n",
"dataset = SBMsDatasetDGL(DATASET_NAME) #3983s\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"10000\n",
"1000\n",
"1000\n",
"(DGLGraph(num_nodes=117, num_edges=4104,\n",
" ndata_schemes={'feat': Scheme(shape=(), dtype=torch.int64)}\n",
" edata_schemes={'feat': Scheme(shape=(1,), dtype=torch.float32)}), tensor([0, 3, 3, 0, 4, 3, 0, 2, 0, 0, 0, 2, 2, 0, 1, 5, 3, 0, 2, 4, 2, 3, 2, 4,\n",
" 3, 1, 3, 5, 2, 3, 0, 0, 3, 5, 2, 5, 3, 2, 0, 3, 0, 3, 3, 3, 0, 3, 2, 0,\n",
" 3, 5, 2, 4, 1, 1, 3, 4, 4, 3, 3, 3, 0, 5, 2, 4, 3, 0, 0, 4, 3, 0, 0, 1,\n",
" 4, 2, 3, 2, 0, 0, 0, 4, 2, 2, 3, 3, 3, 0, 0, 2, 2, 5, 4, 0, 2, 5, 4, 0,\n",
" 0, 2, 0, 0, 0, 3, 3, 2, 2, 1, 2, 0, 0, 0, 5, 3, 1, 4, 3, 3, 5],\n",
" dtype=torch.int16))\n",
"(DGLGraph(num_nodes=90, num_edges=2396,\n",
" ndata_schemes={'feat': Scheme(shape=(), dtype=torch.int64)}\n",
" edata_schemes={'feat': Scheme(shape=(1,), dtype=torch.float32)}), tensor([1, 0, 0, 4, 4, 0, 5, 3, 4, 0, 3, 1, 0, 5, 5, 5, 1, 3, 3, 4, 1, 2, 5, 4,\n",
" 5, 5, 2, 0, 5, 3, 2, 5, 5, 5, 5, 0, 3, 3, 0, 2, 3, 3, 3, 3, 5, 3, 1, 1,\n",
" 5, 2, 5, 1, 1, 4, 5, 2, 0, 4, 4, 0, 3, 4, 0, 0, 2, 3, 5, 3, 3, 4, 0, 5,\n",
" 1, 0, 0, 0, 0, 2, 4, 0, 5, 0, 3, 0, 5, 3, 4, 3, 0, 5],\n",
" dtype=torch.int16))\n",
"(DGLGraph(num_nodes=134, num_edges=5570,\n",
" ndata_schemes={'feat': Scheme(shape=(), dtype=torch.int64)}\n",
" edata_schemes={'feat': Scheme(shape=(1,), dtype=torch.float32)}), tensor([2, 5, 4, 4, 4, 5, 2, 1, 5, 0, 0, 1, 5, 5, 4, 2, 5, 5, 0, 0, 3, 0, 1, 2,\n",
" 2, 5, 0, 2, 0, 5, 1, 5, 5, 1, 0, 0, 5, 2, 2, 5, 5, 1, 4, 0, 0, 5, 1, 0,\n",
" 3, 0, 5, 1, 5, 4, 0, 4, 5, 1, 5, 4, 4, 0, 2, 5, 2, 5, 0, 1, 0, 1, 2, 0,\n",
" 2, 2, 0, 3, 2, 4, 0, 5, 2, 0, 2, 2, 5, 4, 2, 0, 4, 0, 0, 5, 1, 0, 5, 3,\n",
" 2, 3, 5, 0, 1, 5, 2, 0, 1, 4, 0, 3, 2, 1, 0, 2, 1, 4, 2, 5, 2, 0, 5, 2,\n",
" 5, 5, 0, 1, 5, 4, 2, 2, 2, 0, 1, 0, 2, 1], dtype=torch.int16))\n"
]
}
],
"source": [
"print(len(dataset.train))\n",
"print(len(dataset.val))\n",
"print(len(dataset.test))\n",
"\n",
"print(dataset.train[0])\n",
"print(dataset.val[0])\n",
"print(dataset.test[0])\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Time (sec): 15.637878656387329\n"
]
}
],
"source": [
"start = time.time()\n",
"\n",
"with open('data/SBMs/SBM_CLUSTER.pkl','wb') as f:\n",
" pickle.dump([dataset.train,dataset.val,dataset.test],f)\n",
" \n",
"print('Time (sec):',time.time() - start)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Test load function"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[I] Loading dataset SBM_CLUSTER...\n",
"train, test, val sizes : 10000 1000 1000\n",
"[I] Finished loading.\n",
"[I] Data load time: 29.6175s\n"
]
}
],
"source": [
"DATASET_NAME = 'SBM_CLUSTER'\n",
"dataset = LoadData(DATASET_NAME) # 29s\n",
"trainset, valset, testset = dataset.train, dataset.val, dataset.test\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'data.SBMs.SBMsDataset'>\n",
"Time (sec): 0.002402067184448242\n"
]
}
],
"source": [
"start = time.time()\n",
"\n",
"batch_size = 10\n",
"collate = SBMsDataset.collate\n",
"print(SBMsDataset)\n",
"train_loader = DataLoader(trainset, batch_size=batch_size, shuffle=True, collate_fn=collate)\n",
"\n",
"print('Time (sec):',time.time() - start) #0.002s\n"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down Expand Up @@ -620,5 +425,5 @@
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}
Loading

0 comments on commit c57794a

Please sign in to comment.