Metrics (#63)

* CrysGFN: SMACT for composition validity (#34) * eform and ehull metrics (#42) * Eval pipeline from CSV (#45) * Lattice metrics (#44) * SMACT update (#47) * update ehull metric (#62) --------- Co-authored-by: Divya Sharma <[email protected]>
alexhernandezgarcia · Feb 19, 2024 · 2918684 · 2918684
1 parent 83a2539
commit 2918684
Show file tree

Hide file tree

Showing 5 changed files with 577 additions and 27 deletions.
diff --git a/scripts/crystal_eval/README.md b/scripts/crystal_eval/README.md
@@ -38,4 +38,8 @@ Before running the second script (`eval_CGFN`) be sure that `data/crystals/eval_
 
 ## Structure Generation with pyXtal
 
-You can generate structures with `--n_random_struct` and relax them with `relax.py`. However, it is not recommended using at this point, as it will result in many empty (None) structures. This will be addressed in a future PR.
+You can generate structures with `--n_random_struct` and relax them with `relax.py`. However, this is not recommended at this point, as it will result in many empty (None) structures. This will be addressed in a future PR.
+
+## SMACT
+
+The SMACT metric requires the Python library "SMACT", which can be installed using `pip install smact`.
diff --git a/scripts/crystal_eval/convert_CGFN_samples.py b/scripts/crystal_eval/convert_CGFN_samples.py
@@ -5,14 +5,15 @@
 # symmetry: dictionnary with key "spacegroup". (other keys such as Wyckoff may be added later)
 # eform: formation energy in eV
 
-import pandas as pd
+import os
+from argparse import ArgumentParser
+from pathlib import Path
+
 import numpy as np
-from pymatgen.core import Structure, Lattice
+import pandas as pd
+from pymatgen.core import Lattice, Structure
 from pyxtal import pyxtal
 from pyxtal.lattice import Lattice as pyxtal_lattice
-from argparse import ArgumentParser
-from pathlib import Path
-import os
 
 # encoded elements in CGFN pickle
 IDX2ELEM = {

diff --git a/scripts/crystal_eval/convert_from_csv.py b/scripts/crystal_eval/convert_from_csv.py
@@ -0,0 +1,75 @@
+import os
+import re
+from argparse import ArgumentParser
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from pymatgen.core import Composition, Lattice, Structure
+
+
+def encoded_to_container_structure(sample: str):
+    """Convert CGFN readable sample to pymatgen structure object and extract space group,
+    setting only the known info (lattice and species); atomic coordinates are placeholder zeros."""
+    pattern = r"Stage 2;\s*(\d+)\s*\|\s*[\w-]+.*?;\s*([\w\d]+[\w\dFOSi]+);\s*\(([\d.]+), ([\d.]+), ([\d.]+)\), \(([\d.]+), ([\d.]+), ([\d.]+)\)"
+    match = re.search(pattern, sample)
+    if match:
+        space_group = match.group(1)
+        composition = match.group(2)
+        a, b, c, alpha, beta, gamma = [float(x) for x in match.groups()[2:8]]
+
+        lattice = Lattice.from_parameters(
+            a=a, b=b, c=c, alpha=alpha, beta=beta, gamma=gamma
+        )
+        species = []
+        composition_dict = Composition(composition).as_dict()
+        for element, amount in composition_dict.items():
+            for _ in range(int(amount)):
+                species.append(element)
+        coords = np.zeros((len(species), 3))
+        structure = Structure(lattice=lattice, species=species, coords=coords)
+        return structure, {
+            "spacegroup": space_group
+        }  # Return both structure and symmetry info
+    else:
+        return None, None  # Handle samples that don't match the pattern
+
+
+def main():
+    parser = ArgumentParser()
+    parser.add_argument(
+        "--file_path",
+        default="data/crystals/gfn_samples.csv",
+        type=str,
+        help="File path containing pickled CGFN samples",
+    )
+    parser.add_argument(
+        "--out_dir",
+        default="data/crystals/eval_data/",
+        type=str,
+        help="Output directory",
+    )
+
+    args = parser.parse_args()
+    print("Starting...")
+    df_data = pd.read_csv(args.file_path)
+    df_data = df_data.filter(items=["readable", "energies"])
+    df_data[["structure", "symmetry"]] = df_data.apply(
+        lambda row: encoded_to_container_structure(row["readable"]),
+        axis=1,
+        result_type="expand",
+    )
+    df_data.drop(["readable"], axis=1, inplace=True)
+    df_data.rename(columns={"energies": "eform"}, inplace=True)
+    out_file_path = os.path.join(
+        Path(args.out_dir),
+        os.path.splitext(os.path.basename(Path(args.file_path)))[0] + ".pkl",
+    )
+    df_data.to_pickle(out_file_path)
+    print(df_data)
+    print(f"File saved to {out_file_path}")
+    print("Done")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/crystal_eval/eval_CGFN.py b/scripts/crystal_eval/eval_CGFN.py
@@ -4,21 +4,41 @@
 in order to have a standard input data format, saved to data/crystals/eval_data/.
 """
 
-import os
 import json
-from pathlib import Path
+import os
 from argparse import ArgumentParser
+from pathlib import Path
+
 import numpy as np
 import pandas as pd
-from metrics import NumberOfElements, Rediscovery
+from metrics import SG2LP, SMACT, Comp2SG, Eform, Ehull, NumberOfElements, Rediscovery
+
 
 # put all metrics to be computed here:
-METRICS = [
-    NumberOfElements(),
-    # Rediscovery(
-    #     rediscovery_path=None  # Path to original dataset for comparing against generated samples
-    # ),
-]
+def init_metrics():  # in a function to enable forking
+    global METRICS
+    METRICS = [
+        NumberOfElements(),
+        Rediscovery(
+            rediscovery_path=None  # Path to original dataset for comparing against generated samples
+        ),
+        Eform(),
+        SMACT(),
+        SMACT(oxidation_states_set="icsd"),
+        SMACT(oxidation_states_set="pymatgen"),
+        SMACT(oxidation_states_set="our_oxidations"),
+        SMACT(oxidation_only=True),
+        SMACT(oxidation_states_set="icsd", oxidation_only=True),
+        SMACT(oxidation_states_set="pymatgen", oxidation_only=True),
+        SMACT(oxidation_states_set="our_oxidations.txt", oxidation_only=True),
+        Comp2SG(),
+        SG2LP(),
+        Ehull(
+            PD_path="data/crystals/eval_data/MP_hull_12elms.zip",  # replace by your path
+            n_jobs=4,
+            debug=True,  # set False to make a full run
+        ),
+    ]
 
 
 def add_args(parser):
@@ -68,6 +88,7 @@ def print_args(args):
 
 
 def main():
+    init_metrics()
     # Parse arguments
     parser = ArgumentParser()
     _, override_args = parser.parse_known_args()
@@ -100,7 +121,9 @@ def main():
         else:
             for metric in METRICS:
                 print(f"Computing {metric.__name__}")
-                results = metric.compute(df["structure"])
+                results = metric.compute(
+                    df["structure"], energies=df["eform"], sg=df["symmetry"]
+                )
                 data_results[data_name][metric.__name__] = results
             with open(metrics_path, "w+") as fp:
                 json.dump(data_results[data_name], fp)
@@ -111,19 +134,22 @@ def main():
     os.chdir(out_dir)
     for metric in METRICS:
         try:
+            print([d for d, _ in data_results.items()])
+            # raise Exception
             metric.plot(
                 {
                     data_name: results[metric.__name__]
-                    for dataname, results in data_results.items()
+                    for data_name, results in data_results.items()
                 },
             )
         except KeyError as ke:
             print(
-                "Key not found in file. Metrics have changed. Rerun using --force_compute."
+                f"Key {ke} not found in file. Metrics have changed. Rerun using --force_compute."
             )
             exit()
     os.chdir(original_directory)
     print("Done")
+    print(f"Check summary figures saved in {out_dir}")
 
 
 if __name__ == "__main__":