From 0625a72a580ef61e0dff55135e94780d5ca304fa Mon Sep 17 00:00:00 2001
From: SaltyChiang
Date: Tue, 26 Nov 2024 19:52:21 +0800
Subject: [PATCH] Use the standalone `file.py` to implement the HDF5 format.

---
 pyquda_core/pyquda/__init__.py | 162 +++++-----
 pyquda_core/pyquda/field.py    | 558 +++++++++++++++------------------
 pyquda_core/pyquda/file.py     | 348 ++++++++++++++++++++
 pyquda_utils/io/chroma.py      |   6 +-
 pyquda_utils/io/milc.py        |   6 +-
 tests/test.io.py               |   1 +
 6 files changed, 688 insertions(+), 393 deletions(-)
 create mode 100644 pyquda_core/pyquda/file.py

diff --git a/pyquda_core/pyquda/__init__.py b/pyquda_core/pyquda/__init__.py
index ea9d433..99fdd85 100644
--- a/pyquda_core/pyquda/__init__.py
+++ b/pyquda_core/pyquda/__init__.py
@@ -179,7 +179,84 @@ def _setEnviron(env, key, value):
         _setEnviron(f"QUDA_{key.upper()}", key, kwargs[key])
 
 
-def _initQUDA(grid_size, gpuid):
+def initGPU(backend: Literal["numpy", "cupy", "torch"] = None, gpuid: int = -1):
+    global _CUDA_BACKEND, _HIP, _GPUID, _COMPUTE_CAPABILITY
+
+    if isGridInitialized():
+        _MPI_LOGGER.critical("initGPU should be called before init", RuntimeError)
+    if _GPUID < 0:
+        from platform import node as gethostname
+
+        if backend is None:
+            backend = environ["PYQUDA_BACKEND"] if "PYQUDA_BACKEND" in environ else "cupy"
+        if backend == "numpy":
+            cudaGetDeviceCount: Callable[[], int] = lambda: 0x7FFFFFFF
+            cudaGetDeviceProperties: Callable[[int], Dict[str, Any]] = lambda device: {"major": 0, "minor": 0}
+            cudaSetDevice: Callable[[int], None] = lambda device: None
+        elif backend == "cupy":
+            import cupy
+            from cupy.cuda.runtime import getDeviceCount as cudaGetDeviceCount
+            from cupy.cuda.runtime import getDeviceProperties as cudaGetDeviceProperties
+            from cupy.cuda.runtime import is_hip
+
+            cudaSetDevice: Callable[[int], None] = lambda device: cupy.cuda.Device(device).use()
+            _HIP = is_hip
+        elif backend == "torch":
+            import torch
+            from torch.cuda import device_count as cudaGetDeviceCount
+            from torch.cuda import get_device_properties as cudaGetDeviceProperties
+            from torch.version import hip
+
+            cudaSetDevice: Callable[[int], None] = lambda device: torch.set_default_device(f"cuda:{device}")
+            _HIP = hip is not None
+        else:
+            _MPI_LOGGER.critical(f"Unsupported CUDA backend {backend}", ValueError)
+        _CUDA_BACKEND = backend
+        _MPI_LOGGER.info(f"Using CUDA backend {backend}")
+
+        # if backend == "cupy":
+        #     from . import malloc_pyquda
+
+        #     allocator = cupy.cuda.PythonFunctionAllocator(
+        #         malloc_pyquda.pyquda_device_malloc, malloc_pyquda.pyquda_device_free
+        #     )
+        #     cupy.cuda.set_allocator(allocator.malloc)
+
+        # quda/include/communicator_quda.h
+        # determine which GPU this rank will use
+        hostname = gethostname()
+        hostname_recv_buf = _MPI_COMM.allgather(hostname)
+
+        if gpuid < 0:
+            device_count = cudaGetDeviceCount()
+            if device_count == 0:
+                _MPI_LOGGER.critical("No devices found", RuntimeError)
+
+            gpuid = 0
+            for i in range(_MPI_RANK):
+                if hostname == hostname_recv_buf[i]:
+                    gpuid += 1
+
+            if gpuid >= device_count:
+                if "QUDA_ENABLE_MPS" in environ and environ["QUDA_ENABLE_MPS"] == "1":
+                    gpuid %= device_count
+                    print(f"MPS enabled, rank={_MPI_RANK} -> gpu={gpuid}")
+                else:
+                    _MPI_LOGGER.critical(f"Too few GPUs available on {hostname}", RuntimeError)
+        _GPUID = gpuid
+
+        props = cudaGetDeviceProperties(gpuid)
+        if hasattr(props, "major") and hasattr(props, "minor"):
+            _COMPUTE_CAPABILITY = _ComputeCapability(int(props.major), int(props.minor))
+        else:
+            _COMPUTE_CAPABILITY = _ComputeCapability(int(props["major"]), int(props["minor"]))
+
+        cudaSetDevice(gpuid)
+    else:
+        _MPI_LOGGER.warning("GPU is already initialized", RuntimeWarning)
+
+
+def initQUDA(grid_size: List[int], gpuid: int):
     import atexit
 
     quda.initCommsGridQuda(4, grid_size)
@@ -226,7 +303,7 @@ def init(
     """
     global _GRID_SIZE, _GRID_COORD, _DEFAULT_LATTICE
     if _GRID_SIZE is None:
-        from platform import node as gethostname
+        initGPU(backend)
 
         use_default_grid = grid_size is None and latt_size is not None
         use_default_latt = latt_size is not None and t_boundary is not None and anisotropy is not None
@@ -274,74 +351,8 @@ def init(
         device_reset="1" if device_reset else None,
     )
 
-    global _CUDA_BACKEND, _HIP, _GPUID, _COMPUTE_CAPABILITY
-
-    if backend is None:
-        backend = environ["PYQUDA_BACKEND"] if "PYQUDA_BACKEND" in environ else "cupy"
-    if backend == "numpy":
-        cudaGetDeviceCount: Callable[[], int] = lambda: 0x7FFFFFFF
-        cudaGetDeviceProperties: Callable[[int], Dict[str, Any]] = lambda device: {"major": 0, "minor": 0}
-        cudaSetDevice: Callable[[int], None] = lambda device: None
-    elif backend == "cupy":
-        import cupy
-        from cupy.cuda.runtime import getDeviceCount as cudaGetDeviceCount
-        from cupy.cuda.runtime import getDeviceProperties as cudaGetDeviceProperties
-        from cupy.cuda.runtime import is_hip
-
-        cudaSetDevice: Callable[[int], None] = lambda device: cupy.cuda.Device(device).use()
-        _HIP = is_hip
-    elif backend == "torch":
-        import torch
-        from torch.cuda import device_count as cudaGetDeviceCount
-        from torch.cuda import get_device_properties as cudaGetDeviceProperties
-        from torch.version import hip
-
-        cudaSetDevice: Callable[[int], None] = lambda device: torch.set_default_device(f"cuda:{device}")
-        _HIP = hip is not None
-    else:
-        _MPI_LOGGER.critical(f"Unsupported CUDA backend {backend}", ValueError)
-    _CUDA_BACKEND = backend
-    _MPI_LOGGER.info(f"Using CUDA backend {backend}")
-
-    # if _CUDA_BACKEND == "cupy":
-    #     from . import malloc_pyquda
-
-    #     allocator = cupy.cuda.PythonFunctionAllocator(
-    #         malloc_pyquda.pyquda_device_malloc, malloc_pyquda.pyquda_device_free
-    #     )
-    #     cupy.cuda.set_allocator(allocator.malloc)
-
-    # quda/include/communicator_quda.h
-    # determine which GPU this rank will use
-    hostname = gethostname()
-    hostname_recv_buf = _MPI_COMM.allgather(hostname)
-
-    if _GPUID < 0:
-        device_count = cudaGetDeviceCount()
-        if device_count == 0:
-            _MPI_LOGGER.critical("No devices found", RuntimeError)
-
-        _GPUID = 0
-        for i in range(_MPI_RANK):
-            if hostname == hostname_recv_buf[i]:
-                _GPUID += 1
-
-        if _GPUID >= device_count:
-            if "QUDA_ENABLE_MPS" in environ and environ["QUDA_ENABLE_MPS"] == "1":
-                _GPUID %= device_count
-                print(f"MPS enabled, rank={_MPI_RANK} -> gpu={_GPUID}")
-            else:
-                _MPI_LOGGER.critical(f"Too few GPUs available on {hostname}", RuntimeError)
-
-    props = cudaGetDeviceProperties(_GPUID)
-    if hasattr(props, "major") and hasattr(props, "minor"):
-        _COMPUTE_CAPABILITY = _ComputeCapability(int(props.major), int(props.minor))
-    else:
-        _COMPUTE_CAPABILITY = _ComputeCapability(int(props["major"]), int(props["minor"]))
-
-    cudaSetDevice(_GPUID)
     if init_quda:
-        _initQUDA(_GRID_SIZE, _GPUID)
+        initQUDA(_GRID_SIZE, _GPUID)
     else:
         _MPI_LOGGER.warning("PyQUDA is already initialized", RuntimeWarning)
 
@@ -354,7 +365,11 @@ def setLoggerLevel(level: Literal["debug", "info", "warning", "error", "critical
     _MPI_LOGGER.logger.setLevel(level.upper())
 
 
-def isInitialized():
+def isGPUInitialized():
+    return _GPUID >= 0
+
+
+def isGridInitialized():
     return _GRID_SIZE is not None
 
 
@@ -398,13 +413,6 @@ def isHIP():
     return _HIP
 
 
-def setGPUID(gpuid: int):
-    global _GPUID
-    assert _GRID_SIZE is None, "setGPUID() should be called before init()"
-    assert gpuid >= 0
-    _GPUID = gpuid
-
-
 def getGPUID():
     return _GPUID
 
diff --git a/pyquda_core/pyquda/field.py b/pyquda_core/pyquda/field.py
index 1c682a1..1d06af2 100644
--- a/pyquda_core/pyquda/field.py
+++ b/pyquda_core/pyquda/field.py
@@ -9,18 +9,18 @@
 class LatticeInfo:
+    Nd: int = 4
     Ns: int = 4
     Nc: int = 3
-    Nd: int = 4
 
     def __init__(self, latt_size: List[int], t_boundary: Literal[1, -1] = 1, anisotropy: float = 1.0) -> None:
         self._checkLattice(latt_size)
         self._setLattice(latt_size, t_boundary, anisotropy)
 
     def _checkLattice(self, latt_size: List[int]):
-        from . import init, isInitialized, getLogger, getGridSize
+        from . import init, isGridInitialized, getLogger, getGridSize
 
-        if not isInitialized():
+        if not isGridInitialized():
             init(None, latt_size)
         Gx, Gy, Gz, Gt = getGridSize()
         Lx, Ly, Lz, Lt = latt_size
@@ -65,7 +65,47 @@ def _setLattice(self, latt_size: List[int], t_boundary: Literal[1, -1], anisotro
         self.anisotropy = anisotropy
 
 
-Ns, Nc, Nd = LatticeInfo.Ns, LatticeInfo.Nc, LatticeInfo.Nd
+Nd, Ns, Nc = LatticeInfo.Nd, LatticeInfo.Ns, LatticeInfo.Nc
+
+
+class GeneralInfo:
+    def __init__(self, latt_size: List[int], grid_size: List[int], Ns: int = 4, Nc: int = 3) -> None:
+        self._checkLattice(latt_size, grid_size)
+        self._setLattice(latt_size, grid_size)
+        self.Nd = len(latt_size)
+        self.Ns = Ns
+        self.Nc = Nc
+
+    def _checkLattice(self, latt_size: List[int], grid_size: List[int]):
+        from . import getLogger
+
+        assert len(latt_size) == len(grid_size)
+        for GL, G in zip(latt_size, grid_size):
+            if not (GL % G == 0):
+                getLogger().critical("lattice size must be divisible by grid size", ValueError)
+
+    def _setLattice(self, latt_size: List[int], grid_size: List[int]):
+        from . import initGPU, isGPUInitialized, getLogger, getMPIComm, getMPISize, getMPIRank
+
+        if not isGPUInitialized():
+            initGPU()
+        if getMPISize() != int(numpy.prod(grid_size)):
+            getLogger().critical(f"The MPI size {getMPISize()} does not match the grid size {grid_size}", ValueError)
+        self.mpi_comm = getMPIComm()
+        self.mpi_size = getMPISize()
+        self.mpi_rank = getMPIRank()
+        self.grid_size = grid_size
+        grid_coord = []
+        mpi_rank = getMPIRank()
+        for G in grid_size[::-1]:
+            grid_coord.append(mpi_rank % G)
+            mpi_rank //= G
+        self.grid_coord = grid_coord[::-1]
+
+        self.global_size = latt_size
+        self.global_volume = int(numpy.prod(latt_size))
+        self.size = [GL // G for GL, G in zip(latt_size, grid_size)]
+        self.volume = int(numpy.prod(self.size))
 
 
 class _Direction(int):
@@ -126,24 +166,21 @@ def cb2(data: numpy.ndarray, axes: List[int], dtype=None):
     return data_cb2.reshape(*shape[: axes[0]], 2, Lt, Lz, Ly, Lx // 2, *shape[axes[-1] + 1 :])
 
 
-def checksum(latt_info, data: numpy.ndarray) -> Tuple[int, int]:
+def checksum(latt_info: Union[LatticeInfo, GeneralInfo], data: numpy.ndarray) -> Tuple[int, int]:
     import zlib
     from mpi4py import MPI
 
-    gx, gy, gz, gt = latt_info.grid_coord
-    Lx, Ly, Lz, Lt = latt_info.size
-    gLx, gLy, gLz, gLt = gx * Lx, gy * Ly, gz * Lz, gt * Lt
-    GLx, GLy, GLz, GLt = latt_info.global_size
     work = numpy.empty((latt_info.volume), "> (32 - rank29)).item(), MPI.BXOR
     )
@@ -153,35 +190,34 @@ def checksum(latt_info, data: numpy.ndarray) -> Tuple[int, int]:
     return sum29, sum31
 
 
-def _field_shape_dtype(field: str, Ns: int, Nc: int, int_nbytes: int = 4, float_nbytes: int = 8):
+def _field_shape_dtype(field: str, Ns: int, Nc: int, use_fp32: bool = False):
     from . import getLogger
 
+    float_nbytes = 4 if use_fp32 else 8
     if field in ["Int"]:
-        return [], f" None:
+    def __init__(self, latt_info: Union[LatticeInfo, GeneralInfo]) -> None:
         from . import getCUDABackend
 
         self.latt_info = latt_info
@@ -189,6 +225,117 @@ def __init__(self, latt_info: LatticeInfo) -> None:
         self.backend: Literal["numpy", "cupy", "torch"] = getCUDABackend()
         self.L5 = None
 
+    @abstractmethod
+    def _shape(self):
+        from . import getLogger
+
+        getLogger().critical("_setShape method must be implemented", NotImplementedError)
+
+    @classmethod
+    def _groupName(cls):
+        from . import getLogger
+
+        if cls.__name__ == "LatticeMom":
+            getLogger().critical("LatticeMom is not supported for save/load", ValueError)
+        elif cls.__name__ == "LatticeClover":
+            getLogger().critical("LatticeClover is not supported for save/load", ValueError)
+
+        return (
+            cls.__name__.replace("Multi", "")
+            .replace("General", "")
+            .replace("Link", "ColorMatrix")
+            .replace("Gauge", "ColorMatrix")
+            .replace("Fermion", "SpinColorVector")
+            .replace("Propagator", "SpinColorMatrix")
+            .replace("StaggeredFermion", "ColorVector")
+            .replace("StaggeredPropagator", "ColorMatrix")
+        )
+
+    def save(
+        self,
+        filename: str,
+        label: Union[int, str, Sequence[int], Sequence[str]],
+        *,
+        annotation: str = "",
+        check: bool = True,
+        use_fp32: bool = False,
+    ):
+        from . import getLogger
+        from .file import File
+
+        assert hasattr(self, "lexico")
+        s = perf_counter()
+        gbytes = 0
+        filename = path.expanduser(path.expandvars(filename))
+        with File(filename, "w") as f:
+            f.save(
+                self._groupName(),
+                label,
+                self.lexico(),
+                self.latt_info.grid_size,
+                annotation=annotation,
+                check=check,
+                use_fp32=use_fp32,
+            )
+        secs = perf_counter() - s
+        getLogger().debug(f"Saved {filename} in {secs:.3f} secs, {gbytes / secs:.3f} GB/s")
+
+    def append(
+        self,
+        filename: str,
+        label: Union[int, str, Sequence[int], Sequence[str]],
+        *,
+        annotation: str = "",
+        check: bool = True,
+        use_fp32: bool = False,
+    ):
+        from . import getLogger
+        from .file import File
+
+        assert hasattr(self, "lexico")
+        s = perf_counter()
+        gbytes = 0
+        filename = path.expanduser(path.expandvars(filename))
+        with File(filename, "r+") as f:
+            f.append(
+                self._groupName(),
+                label,
+                self.lexico(),
+                self.latt_info.grid_size,
+                annotation=annotation,
+                check=check,
+                use_fp32=use_fp32,
+            )
+        secs = perf_counter() - s
+        getLogger().debug(f"Appended {filename} in {secs:.3f} secs, {gbytes / secs:.3f} GB/s")
+
+    def update(
+        self,
+        filename: str,
+        label: Union[int, str, Sequence[int], Sequence[str]],
+        *,
+        annotation: str = "",
+        check: bool = True,
+    ):
+        from . import getLogger
+        from .file import File
+
+        assert hasattr(self, "lexico")
+        s = perf_counter()
+        gbytes = 0
+        filename = path.expanduser(path.expandvars(filename))
+        with File(filename, "r+") as f:
+            f.update(
+                self._groupName(),
+                label,
+                self.lexico(),
+                self.latt_info.grid_size,
+                annotation=annotation,
+                check=check,
+            )
+        secs = perf_counter() - s
+        getLogger().debug(f"Updated {filename} in {secs:.3f} secs, {gbytes / secs:.3f} GB/s")
+
     @property
     def data(self):
         return self._data
@@ -208,17 +355,10 @@ def data(self, value):
     def data_ptr(self) -> Pointer:
         return ndarrayPointer(self.data.reshape(-1), True)
 
-    @abstractmethod
-    def _field(self):
-        from . import getLogger
-
-        getLogger().critical("_field method must be implemented", NotImplementedError)
-
-    @abstractmethod
-    def _shape(self):
-        from . import getLogger
-
-        getLogger().critical("_setShape method must be implemented", NotImplementedError)
+    @classmethod
+    def _field(cls) -> str:
+        group_name = cls._groupName()
+        return group_name[group_name.index("Lattice") + len("Lattice") :]
 
     def _setField(self):
         field_shape, field_dtype = _field_shape_dtype(self._field(), self.latt_info.Ns, self.latt_info.Nc)
@@ -373,77 +513,62 @@ def __itruediv__(self, other):
         return self
 
 
-class SpatialField(BaseField):
-    def __init__(self, latt_info: LatticeInfo, value: Any = None, init_data: bool = True) -> None:
-        super().__init__(latt_info)
-        if init_data:
-            self._initData(value)
-
-    @classmethod
-    def _field(cls) -> str:
-        return cls.__name__[cls.__name__.index("Space") + len("Space") :]
-
-    def _shape(self):
-        Lx, Ly, Lz, Lt = self.latt_info.size
-        self.space_shape = [Lz, Ly, Lx]
-        if self.L5 is None:
-            return (*self.space_shape, *self.field_shape)
-        else:
-            return (self.L5, *self.space_shape, *self.field_shape)
-
-
-class TemporalField(BaseField):
-    def __init__(self, latt_info: LatticeInfo, value: Any = None, init_data: bool = True) -> None:
+class GeneralField(BaseField):
+    def __init__(self, latt_info: GeneralInfo, value: Any = None, init_data: bool = True) -> None:
         super().__init__(latt_info)
         if init_data:
             self._initData(value)
 
-    @classmethod
-    def _field(cls) -> str:
-        return cls.__name__[cls.__name__.index("Time") + len("Time") :]
-
     def _shape(self):
-        Lx, Ly, Lz, Lt = self.latt_info.size
-        self.time_shape = [Lt]
+        self.lattice_shape = self.latt_info.size[::-1]
         if self.L5 is None:
-            return (*self.time_shape, *self.field_shape)
+            return (*self.lattice_shape, *self.field_shape)
         else:
-            return (self.L5, *self.time_shape, *self.field_shape)
+            return (self.L5, *self.lattice_shape, *self.field_shape)
 
+    def lexico(self, dtype=None):
+        return self.getHost().astype(dtype)
 
-class SpatiotemporalField(BaseField):
-    def __init__(self, latt_info: LatticeInfo, value: Any = None, init_data: bool = True) -> None:
-        super().__init__(latt_info)
-        if init_data:
-            self._initData(value)
+    def checksum(self) -> Tuple[int, int]:
+        return checksum(self.latt_info, self.lexico().reshape(self.latt_info.volume, self.field_size).view("
 
     @classmethod
-    def _field(cls) -> str:
-        return cls.__name__[cls.__name__.index("Spacetime") + len("Spacetime") :]
+    def load(
+        cls,
+        filename: str,
+        label: Union[int, str, Sequence[int], Sequence[str]],
+        *,
+        check: bool = True,
+        grid_size: List[int] = None,
+    ):
+        from . import getLogger
+        from .file import File
 
-    def _shape(self):
-        Lx, Ly, Lz, Lt = self.latt_info.size
-        self.spacetime_shape = [Lt, Lz, Ly, Lx]
-        if self.L5 is None:
-            return (*self.spacetime_shape, *self.field_shape)
+        s = perf_counter()
+        gbytes = 0
+        filename = path.expanduser(path.expandvars(filename))
+        with File(filename, "r") as f:
+            latt_size, Ns, Nc, value = f.load(cls._groupName(), label, grid_size, check=check)
+        latt_info = GeneralInfo(latt_size, grid_size, Ns, Nc)
+        if not issubclass(cls, MultiField):
+            retval = cls(latt_info, value)
         else:
-            return (self.L5, *self.spacetime_shape, *self.field_shape)
+            retval = cls(latt_info, len(label), numpy.asarray(value))
+        secs = perf_counter() - s
+        getLogger().debug(f"Loaded {filename} in {secs:.3f} secs, {gbytes / secs:.3f} GB/s")
+        return retval
 
 
 class ParityField(BaseField):
     def __init__(self, latt_info: LatticeInfo, value: Any = None, init_data: bool = True) -> None:
         super().__init__(latt_info)
-        self.full_lattice = False
+        self.full_field = False
         if init_data:
             self._initData(value)
 
-    @classmethod
-    def _field(cls) -> str:
-        return cls.__name__[cls.__name__.index("Lattice") + len("Lattice") :]
-
     def _shape(self):
         Lx, Ly, Lz, Lt = self.latt_info.size
-        self.lattice_shape = [2, Lt, Lz, Ly, Lx // 2] if self.full_lattice else [Lt, Lz, Ly, Lx // 2]
+        self.lattice_shape = [2, Lt, Lz, Ly, Lx // 2] if self.full_field else [Lt, Lz, Ly, Lx // 2]
         if self.L5 is None:
             return (*self.lattice_shape, *self.field_shape)
         else:
@@ -469,17 +594,17 @@ def timeslice(self, start: int, stop: int = None, step: int = None, return_field
             x = self.__class__(self.latt_info)
         else:
             x = self.__class__(self.latt_info, self.L5)
-        if self.full_lattice and self.L5 is not None:
+        if self.full_field and self.L5 is not None:
             x.data[:, :, start:stop:step, :, :, :] = self.data[:, :, start:stop:step, :, :, :]
-        elif self.full_lattice or self.L5 is not None:
+        elif self.full_field or self.L5 is not None:
             x.data[:, start:stop:step, :, :, :] = self.data[:, start:stop:step, :, :, :]
         else:
             x.data[start:stop:step, :, :, :] = self.data[start:stop:step, :, :, :]
         return x
     else:
-        if self.full_lattice and self.L5 is not None:
+        if self.full_field and self.L5 is not None:
             return self.data[:, :, start:stop:step, :, :, :]
-        elif self.full_lattice or self.L5 is not None:
+        elif self.full_field or self.L5 is not None:
             return self.data[:, start:stop:step, :, :, :]
         else:
             return self.data[start:stop:step, :, :, :]
@@ -494,7 +619,7 @@ def __init__(self, latt_info: LatticeInfo, value: Any = None, init_data: bool =
             s.__field_class__.__base__.__init__(self, latt_info, value, False)
         else:
             s.__init__(latt_info, value, False)
-        self.full_lattice = True
+        self.full_field = True
         if init_data:
             self._initData(value)
 
@@ -526,221 +651,34 @@ def lexico(self, dtype=None):
         return lexico(self.getHost(), [0, 1, 2, 3, 4], dtype)
 
     def checksum(self) -> Tuple[int, int]:
-        return checksum(self.latt_info, self.lexico().reshape(self.latt_info.volume, self.field_size))
-
-    @classmethod
-    def _load(cls, latt_info: LatticeInfo, dataset):
-        gx, gy, gz, gt = latt_info.grid_coord
-        Lx, Ly, Lz, Lt = latt_info.size
-        data: numpy.ndarray = dataset[
-            gt * Lt : (gt + 1) * Lt,
-            gz * Lz : (gz + 1) * Lz,
-            gy * Ly : (gy + 1) * Ly,
-            gx * Lx : (gx + 1) * Lx,
-        ]
-        sum29, sum31 = checksum(latt_info, data.reshape(latt_info.volume, -1))
-        assert dataset.attrs["sum29"] == f"0x{sum29:08x}"
-        assert dataset.attrs["sum31"] == f"0x{sum31:08x}"
-        return data
+        return checksum(self.latt_info, self.lexico().reshape(self.latt_info.volume, self.field_size).view("
 Pointer:
         return ndarrayPointer(self.data.reshape(self.L5, -1)[index], True)
 
     def even_ptr(self, index: int) -> Pointer:
-        assert self.full_lattice
+        assert self.full_field
         return ndarrayPointer(self.data.reshape(self.L5, 2, -1)[index, 0], True)
 
     def odd_ptr(self, index: int) -> Pointer:
-        assert self.full_lattice
+        assert self.full_field
        return ndarrayPointer(self.data.reshape(self.L5, 2, -1)[index, 1], True)
 
     @property
@@ -819,12 +757,12 @@ def data_ptrs(self) -> Pointers:
 
     @property
     def even_ptrs(self) -> Pointers:
-        assert self.full_lattice
+        assert self.full_field
         return ndarrayPointer(self.data.reshape(self.L5, 2, -1)[:, 0], True)
 
     @property
     def odd_ptrs(self) -> Pointers:
-        assert self.full_lattice
+        assert self.full_field
         return ndarrayPointer(self.data.reshape(self.L5, 2, -1)[:, 1], True)
 
     def copy(self):
@@ -918,17 +856,17 @@ def __init__(self, latt_info: LatticeInfo, L5: Union[int, Any] = Nd, value=None)
         self._gauge_dirac = None
 
     @classmethod
-    def load(cls, filename: str) -> "LatticeGauge":
-        return super().load(filename, range(Nd))
+    def load(cls, filename: str, *, check: bool = True) -> "LatticeGauge":
+        return super().load(filename, ["X", "Y", "Z", "T"], check=check)
 
-    def save(self, filename: str, *, annotation: str = "", int_nbytes: int = 4, float_nbytes: int = 8):
-        super().save(filename, range(Nd), annotation=annotation, int_nbytes=int_nbytes, float_nbytes=float_nbytes)
+    def save(self, filename: str, *, annotation: str = "", check: bool = True, use_fp32: bool = False):
+        super().save(filename, ["X", "Y", "Z", "T"], annotation=annotation, check=check, use_fp32=use_fp32)
 
-    def append(self, filename: str, *, annotation: str = "", int_nbytes: int = 4, float_nbytes: int = 8):
-        super().append(filename, range(Nd), annotation=annotation, int_nbytes=int_nbytes, float_nbytes=float_nbytes)
+    def append(self, filename: str, *, annotation: str = "", check: bool = True, use_fp32: bool = False):
+        super().append(filename, ["X", "Y", "Z", "T"], annotation=annotation, check=check, use_fp32=use_fp32)
 
-    def update(self, filename: str, *, annotation: str = ""):
-        super().update(filename, range(Nd), annotation=annotation)
+    def update(self, filename: str, *, annotation: str = "", check: bool = True):
+        super().update(filename, ["X", "Y", "Z", "T"], annotation=annotation, check=check)
 
     @property
     def gauge_dirac(self):
@@ -1253,17 +1191,17 @@ def __init__(self, latt_info: LatticeInfo, L5: Union[int, Any] = 4, value=None)
         self._gauge_dirac = None
 
     @classmethod
-    def load(cls, filename: str) -> "LatticeMom":
-        return super().load(filename, range(Nd))
+    def load(cls, filename: str, *, check: bool = True) -> "LatticeMom":
+        return super().load(filename, ["X", "Y", "Z", "T"], check=check)
 
-    def save(self, filename: str, *, annotation: str = "", int_nbytes: int = 4, float_nbytes: int = 8):
-        super().save(filename, range(Nd), annotation=annotation, int_nbytes=int_nbytes, float_nbytes=float_nbytes)
+    def save(self, filename: str, *, annotation: str = "", check: bool = True, use_fp32: bool = False):
+        super().save(filename, ["X", "Y", "Z", "T"], annotation=annotation, check=check, use_fp32=use_fp32)
 
-    def append(self, filename: str, *, annotation: str = "", int_nbytes: int = 4, float_nbytes: int = 8):
-        super().append(filename, range(Nd), annotation=annotation, int_nbytes=int_nbytes, float_nbytes=float_nbytes)
+    def append(self, filename: str, *, annotation: str = "", check: bool = True, use_fp32: bool = False):
+        super().append(filename, ["X", "Y", "Z", "T"], annotation=annotation, check=check, use_fp32=use_fp32)
 
-    def update(self, filename: str, *, annotation: str = ""):
-        super().update(filename, range(Nd), annotation=annotation)
+    def update(self, filename: str, *, annotation: str = "", check: bool = True):
+        super().update(filename, ["X", "Y", "Z", "T"], annotation=annotation, check=check)
 
     @property
     def gauge_dirac(self):
diff --git a/pyquda_core/pyquda/file.py b/pyquda_core/pyquda/file.py
new file mode 100644
index 0000000..e2dde87
--- /dev/null
+++ b/pyquda_core/pyquda/file.py
@@ -0,0 +1,348 @@
+from time import perf_counter
+from typing import List, Sequence, Tuple, Union
+
+import numpy
+from mpi4py import MPI
+import h5py
+
+
+class _LatticeInfo:
+    def __init__(self, latt_size: List[int], grid_size: List[int]) -> None:
+        self._checkLattice(latt_size, grid_size)
+        self._setLattice(latt_size, grid_size)
+
+    def _checkLattice(self, latt_size: List[int], grid_size: List[int]):
+        assert len(latt_size) == len(grid_size), "lattice size and grid size must have the same dimension"
+        for GL, G in zip(latt_size, grid_size):
+            if not (GL % G == 0):
+                raise ValueError("lattice size must be divisible by grid size")
+
+    def _setLattice(self, latt_size: List[int], grid_size: List[int]):
+        if MPI.COMM_WORLD.Get_size() != int(numpy.prod(grid_size)):
+            raise ValueError(f"The MPI size {MPI.COMM_WORLD.Get_size()} does not match the grid size {grid_size}")
+        sublatt_size = [GL // G for GL, G in zip(latt_size, grid_size)]
+        sublatt_slice = []
+        mpi_rank = MPI.COMM_WORLD.Get_rank()
+        for G, L in zip(grid_size[::-1], sublatt_size[::-1]):
+            g = mpi_rank % G
+            mpi_rank //= G
+            sublatt_slice.append(slice(g * L, (g + 1) * L))
+
+        self.global_size = latt_size
+        self.global_volume = int(numpy.prod(latt_size))
+        self.size = sublatt_size
+        self.volume = int(numpy.prod(sublatt_size))
+        self.slice = tuple(sublatt_slice)
+
+
+# CRC32LUT = numpy.empty((4, 256), dtype="> 8) ^ CRC32LUT[0, CRC32LUT[0, i] & 0xFF]
+#     CRC32LUT[2, i] = (CRC32LUT[1, i] >> 8) ^ CRC32LUT[0, CRC32LUT[1, i] & 0xFF]
+#     CRC32LUT[3, i] = (CRC32LUT[2, i] >> 8) ^ CRC32LUT[0, CRC32LUT[2, i] & 0xFF]
+#     # CRC32LUT[4, i] = (CRC32LUT[3, i] >> 8) ^ CRC32LUT[0, CRC32LUT[3, i] & 0xFF]
+#     # CRC32LUT[5, i] = (CRC32LUT[4, i] >> 8) ^ CRC32LUT[0, CRC32LUT[4, i] & 0xFF]
+#     # CRC32LUT[6, i] = (CRC32LUT[5, i] >> 8) ^ CRC32LUT[0, CRC32LUT[5, i] & 0xFF]
+#     # CRC32LUT[7, i] = (CRC32LUT[6, i] >> 8) ^ CRC32LUT[0, CRC32LUT[6, i] & 0xFF]
+
+
+def checksum(latt_info, data: numpy.ndarray) -> Tuple[int, int]:
+    import zlib
+
+    work = numpy.empty((latt_info.volume), "> (32 - rank29)).item(), MPI.BXOR)
+    sum31 = MPI.COMM_WORLD.allreduce(numpy.bitwise_xor.reduce(work << rank31 | work >> (32 - rank31)).item(), MPI.BXOR)
+    return sum29, sum31
+
+
+def _spin_color_dtype(name: str, shape: Sequence[int], use_fp32: bool = True) -> Tuple[int, int]:
+    float_nbytes = 4 if use_fp32 else 8
+    Ns, Nc, dtype = 4, 3, f"> (32 - rank29)).item(), MPI.BXOR)
     sum31 = getMPIComm().allreduce(numpy.bitwise_xor.reduce(work << rank31 | work >> (32 - rank31)).item(), MPI.BXOR)
     return sum29, sum31
diff --git a/pyquda_utils/io/milc.py b/pyquda_utils/io/milc.py
index b6d39d8..375c9d2 100644
--- a/pyquda_utils/io/milc.py
+++ b/pyquda_utils/io/milc.py
@@ -24,12 +24,12 @@ def checksum_milc(latt_size: List[int], data):
     work = data.view("> (32 - rank29)).item(), MPI.BXOR)
     sum31 = getMPIComm().allreduce(numpy.bitwise_xor.reduce(work << rank31 | work >> (32 - rank31)).item(), MPI.BXOR)
     return sum29, sum31
diff --git a/tests/test.io.py b/tests/test.io.py
index b8a94f7..1e93db7 100644
--- a/tests/test.io.py
+++ b/tests/test.io.py
@@ -16,6 +16,7 @@
 
 propagator = core.invert(dslash, "point", [0, 0, 0, 0])
 dslash.destroy()
 
+print([(f"{i:08x}", f"{j:08x}") for i, j in gauge.checksum()])
 gauge.save("pt_prop_1.h5")
 propagator.append("pt_prop_1.h5", 0)
 convert.propagatorToMultiFermion(propagator).append("pt_prop_1.h5", range(12))
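
The test above drives the new HDF5 round trip end to end. For orientation, here is a minimal sketch of the same API from user code; the lattice size and file name are illustrative only, and the defaults introduced by this patch (check=True, double precision unless use_fp32=True) are assumed. The init(None, latt_size) call mirrors what LatticeInfo._checkLattice itself does when the grid is not yet initialized.

    from pyquda import init
    from pyquda.field import LatticeInfo, LatticeGauge

    init(None, [4, 4, 4, 8])               # illustrative lattice size; grid defaults for this MPI layout
    latt_info = LatticeInfo([4, 4, 4, 8])
    gauge = LatticeGauge(latt_info)        # freshly constructed gauge field

    gauge.save("cfg.h5")                   # group "LatticeColorMatrix", labels "X", "Y", "Z", "T"
    print([(f"{i:08x}", f"{j:08x}") for i, j in gauge.checksum()])  # (sum29, sum31) pairs, as in tests/test.io.py
    loaded = LatticeGauge.load("cfg.h5")   # with check=True, verifies the stored checksums

Note that the per-direction labels change from range(Nd) to ["X", "Y", "Z", "T"] in this patch, and int_nbytes/float_nbytes are replaced by the single use_fp32 flag, so files written by earlier versions presumably will not match the new label layout.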