Merge pull request #510 from OpenCOMPES/sxp_fixes-for-new-daq
SXP fixes for new DAQ
rettigl authored Nov 11, 2024
2 parents 8f5e392 + 52a06c1 commit b219760
Showing 10 changed files with 326 additions and 94 deletions.
2 changes: 2 additions & 0 deletions .cspell/custom-dictionary.txt
@@ -84,6 +84,7 @@ dfpart
dfpid
dictionarized
dictmerge
DOOCS
dpkg
dropna
dset
@@ -383,6 +384,7 @@ xarray
xaxis
xcirc
xdata
XFEL
xind
Xinstrument
xlabel
4 changes: 2 additions & 2 deletions .github/workflows/documentation.yml
@@ -52,7 +52,6 @@ jobs:
run: |
cp -r $GITHUB_WORKSPACE/tutorial $GITHUB_WORKSPACE/docs/
cp -r $GITHUB_WORKSPACE/sed/config $GITHUB_WORKSPACE/docs/sed
rm $GITHUB_WORKSPACE/docs/tutorial/5_sxp_workflow.ipynb
- name: download RAW data
@@ -61,10 +60,11 @@ jobs:
cd $GITHUB_WORKSPACE/docs
poetry run python scripts/download_data.py
- name: build Flash parquet files
- name: build parquet files
run: |
cd $GITHUB_WORKSPACE/docs
poetry run python scripts/build_flash_parquets.py
poetry run python scripts/build_sxp_parquets.py
# to be removed later. This theme doesn't support <3.9 python and our lock file contains 3.8
- name: install pydata-sphinx-theme
45 changes: 45 additions & 0 deletions docs/scripts/build_sxp_parquets.py
@@ -0,0 +1,45 @@
from pathlib import Path

import sed
from sed import SedProcessor
from sed.dataset import dataset

config_file = Path(sed.__file__).parent / "config/sxp_example_config.yaml"

dataset.get("Au_Mica", root_dir="./tutorial")
data_path = dataset.dir


config_override = {
"core": {
"paths": {
"data_raw_dir": data_path,
"data_parquet_dir": data_path + "/processed/",
},
},
}

runs = [
"0058",
"0059",
"0060",
"0061",
"0064",
"0065",
"0066",
"0067",
"0068",
"0069",
"0070",
"0071",
"0072",
"0073",
"0074",
]
for run in runs:
sp = SedProcessor(
runs=run,
config=config_override,
system_config=config_file,
collect_metadata=False,
)
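
For orientation: the `config_override` above sends the parquet buffer files to a `processed/` folder inside the downloaded dataset directory. A minimal sketch to locate them afterwards, assuming the script above has already been run and that the buffer files carry a `.parquet` suffix:

```python
from pathlib import Path

from sed.dataset import dataset

# Resolve the same dataset directory the script above uses.
dataset.get("Au_Mica", root_dir="./tutorial")
processed = Path(dataset.dir) / "processed"

# List whatever parquet buffer files were produced (names depend on the loader).
for pq in sorted(processed.rglob("*.parquet")):
    print(pq.relative_to(processed))
```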
1 change: 1 addition & 0 deletions docs/scripts/download_data.py
@@ -5,3 +5,4 @@
dataset.get("WSe2", remove_zip=True, root_dir=root_dir)
dataset.get("Gd_W110", remove_zip=True, root_dir=root_dir)
dataset.get("TaS2", remove_zip=True, root_dir=root_dir)
dataset.get("Au_Mica", remove_zip=True, root_dir=root_dir)
1 change: 1 addition & 0 deletions docs/workflows/index.md
@@ -8,4 +8,5 @@ myst:

```{toctree}
../tutorial/4_hextof_workflow
../tutorial/5_sxp_workflow.ipynb
```
21 changes: 12 additions & 9 deletions poetry.lock

Some generated files are not rendered by default.

16 changes: 16 additions & 0 deletions tutorial/sxp_config.yaml → sed/config/sxp_example_config.yaml
@@ -17,6 +17,7 @@ dataframe:
daq: DA03
forward_fill_iterations: 2
num_trains: 10
# num_pulses: 400 # only needed for data from new DAQ
x_column: dldPosX
corrected_x_column: "X"
kx_column: "kx"
@@ -27,6 +28,7 @@ dataframe:
tof_ns_column: dldTime
corrected_tof_column: "tm"
bias_column: "sampleBias"
delay_column: "delayStage"
tof_binwidth: 6.875E-12 # in seconds
tof_binning: 0
jitter_cols: ["dldPosX", "dldPosY", "dldTimeSteps"]
@@ -84,6 +86,11 @@ dataframe:
format: per_train
dataset_key: "/CONTROL/SCS_ILH_LAS/MDL/OPTICALDELAY_PP800/actualPosition/value"
index_key: "/INDEX/trainId"
# test:
# daq: DA02 # change DAQ for a channel
# format: per_pulse
# dataset_key: "/INSTRUMENT/SA3_XTD10_XGM/XGM/DOOCS:output/data/intensitySa3TD"
# index_key: "/INSTRUMENT/SA3_XTD10_XGM/XGM/DOOCS:output/data/trainId"

stream_name_prefixes:
DA03: "RAW-R"
@@ -92,3 +99,12 @@ dataframe:

beamtime_dir:
sxp: "/gpfs/exfel/exp/SXP/"

histogram:
# number of bins used for histogram visualization
bins: [80, 80, 80, 80]
# default axes to use for histogram visualization.
# Axes names starting with "@" refer to keys in the "dataframe" section
axes: ["@x_column", "@y_column", "@tof_column", "@delay_column"]
# default ranges to use for histogram visualization (in unbinned detector coordinates)
ranges: [[0, 4000], [0, 4000], [1000, 28000], [-1000, 1000]]
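
The "@" convention in the new histogram section deserves a brief illustration: axis names starting with "@" are looked up in the "dataframe" section of the same config. A minimal sketch of that resolution, with a hypothetical `resolve_axis` helper and `y_column` assumed to be `dldPosY` (only `x_column: dldPosX` is visible above):

```python
def resolve_axis(name: str, config: dict) -> str:
    """Resolve '@'-prefixed axis names against the 'dataframe' config section."""
    if name.startswith("@"):
        return config["dataframe"][name[1:]]
    return name


config = {
    "dataframe": {"x_column": "dldPosX", "y_column": "dldPosY"},  # y_column assumed
    "histogram": {"axes": ["@x_column", "@y_column"]},
}

# ['dldPosX', 'dldPosY']
print([resolve_axis(ax, config) for ax in config["histogram"]["axes"]])
```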
3 changes: 3 additions & 0 deletions sed/dataset/datasets.json
@@ -21,6 +21,9 @@
"energycal_2020_07_20"
]
},
"Au_Mica": {
"url": "https://zenodo.org/records/13952965/files/Au_Mica_SXP.zip"
},
"Test": {
"url": "http://test.com/files/file.zip",
"subdirs": [
44 changes: 27 additions & 17 deletions sed/loader/sxp/loader.py
@@ -256,6 +256,7 @@ def create_multi_index_per_electron(self, h5_file: h5py.File) -> None:
for i in train_id.index:
# removing broken trailing hit copies
num_trains = self._config["dataframe"].get("num_trains", 0)
num_pulses = self._config["dataframe"].get("num_pulses", 0)
if num_trains:
try:
num_valid_hits = np.where(np.diff(mib_array[i].astype(np.int32)) < 0)[0][
@@ -270,7 +271,10 @@ def create_multi_index_per_electron(self, h5_file: h5py.File) -> None:
index = 0
for train, train_end in enumerate(train_ends):
macrobunch_index.append(train_id[i] + np.uint(train))
microbunch_ids.append(mib_array[i, index:train_end])
if num_pulses:
microbunch_ids.append(mib_array[i, index:train_end] % num_pulses)
else:
microbunch_ids.append(mib_array[i, index:train_end])
indices.append(slice(index, train_end))
index = train_end + 1
macrobunch_indices.append(indices)
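
The effect of the new `num_pulses` option is easiest to see on a toy array: the new DAQ apparently stores microbunch IDs that keep counting past a single train, and the modulo above folds them back into per-train pulse indices. A sketch with made-up values, assuming `num_pulses: 400` as in the commented config entry:

```python
import numpy as np

num_pulses = 400  # as in the example config (only needed for data from the new DAQ)

# Hypothetical microbunch IDs as the new DAQ might store them: counting
# past one train's worth of pulses instead of restarting at zero.
mib_slice = np.array([397, 398, 399, 400, 401, 402], dtype=np.uint64)

# Without the option the raw values are kept; with it they wrap per train.
print(mib_slice)               # [397 398 399 400 401 402]
print(mib_slice % num_pulses)  # [397 398 399   0   1   2]
```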
@@ -509,7 +513,7 @@ def create_dataframe_per_train(

def create_dataframe_per_channel(
self,
h5_file: h5py.File,
file_path: Path,
channel: str,
) -> Union[Series, DataFrame]:
"""
@@ -520,7 +524,7 @@ def create_dataframe_per_channel(
DataFrame depends on the channel's format specified in the configuration.
Args:
h5_file (h5py.File): The h5py.File object representing the HDF5 file.
file_path (Path): The path to the main HDF5 file.
channel (str): The name of the channel.
Returns:
@@ -530,11 +534,16 @@ def create_dataframe_per_channel(
ValueError: If the channel has an undefined format.
"""
channel_dict = self._config["dataframe"]["channels"][channel] # channel parameters
main_daq = self._config["dataframe"]["daq"]
channel_daq = self._config["dataframe"]["channels"][channel].get("daq", main_daq)
# load file corresponding to daq
h5_file = h5py.File(Path(str(file_path).replace(main_daq, channel_daq)))

[train_id, np_array] = self.create_numpy_array_per_channel(
h5_file,
channel,
) # numpy Array created
channel_dict = self._config["dataframe"]["channels"][channel] # channel parameters

# If np_array is size zero, fill with NaNs
if np_array.size == 0:
@@ -585,7 +594,7 @@ def create_dataframe_per_channel(

def concatenate_channels(
self,
h5_file: h5py.File,
file_path: Path,
) -> DataFrame:
"""
Concatenates the channels from the provided h5py.File into a pandas DataFrame.
@@ -595,7 +604,7 @@
available channels specified in the configuration.
Args:
h5_file (h5py.File): The h5py.File object representing the HDF5 file.
file_path (Path): The path to the main HDF5 file.
Returns:
DataFrame: A concatenated pandas DataFrame containing the channels.
@@ -604,11 +613,13 @@
ValueError: If the group_name for any channel does not exist in the file.
"""
all_keys = parse_h5_keys(h5_file) # Parses all channels present

# Check for if the provided dataset_keys and index_keys actually exists in the file
for channel in self._config["dataframe"]["channels"]:
dataset_key = self._config["dataframe"]["channels"][channel]["dataset_key"]
daq = self._config["dataframe"]["channels"][channel].get("daq", "DA03")
# load file corresponding to daq
h5_file = h5py.File(Path(str(file_path).replace("DA03", daq)))
all_keys = parse_h5_keys(h5_file) # Parses all channels present
if dataset_key not in all_keys:
raise ValueError(
f"The dataset_key for channel {channel} does not exist.",
@@ -621,7 +632,7 @@

# Create a generator expression to generate data frames for each channel
data_frames = (
self.create_dataframe_per_channel(h5_file, each) for each in self.available_channels
self.create_dataframe_per_channel(file_path, each) for each in self.available_channels
)

# Use the reduce function to join the data frames into a single DataFrame
@@ -649,14 +660,13 @@ def create_dataframe_per_file(
"""
# Loads h5 file and creates a dataframe
with h5py.File(file_path, "r") as h5_file:
self.reset_multi_index() # Reset MultiIndexes for next file
df = self.concatenate_channels(h5_file)
df = df.dropna(subset=self._config["dataframe"].get("tof_column", "dldTimeSteps"))
# correct the 3 bit shift which encodes the detector ID in the 8s time
if self._config["dataframe"].get("split_sector_id_from_dld_time", False):
df = split_dld_time_from_sector_id(df, config=self._config)
return df
self.reset_multi_index() # Reset MultiIndexes for next file
df = self.concatenate_channels(file_path)
df = df.dropna(subset=self._config["dataframe"].get("tof_column", "dldTimeSteps"))
# correct the 3 bit shift which encodes the detector ID in the 8s time
if self._config["dataframe"].get("split_sector_id_from_dld_time", False):
df = split_dld_time_from_sector_id(df, config=self._config)
return df

def create_buffer_file(self, h5_path: Path, parquet_path: Path) -> Union[bool, Exception]:
"""
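
The other substantive loader change is that a channel may now live in a different DAQ file than the main one: the loader swaps the DAQ identifier inside the file path and opens that file instead. A rough sketch of the substitution, using a hypothetical file name patterned on the `stream_name_prefixes` entry `DA03: "RAW-R"`:

```python
from pathlib import Path

main_daq = "DA03"
channel_daq = "DA02"  # e.g. the commented-out "test" channel in the example config

# Hypothetical raw file name following the RAW-R / DA03 naming scheme.
file_path = Path("/raw/r0058/RAW-R0058-DA03-S00000.h5")

# Same substitution the loader performs before opening the per-channel file.
channel_file = Path(str(file_path).replace(main_daq, channel_daq))
print(channel_file)  # /raw/r0058/RAW-R0058-DA02-S00000.h5
```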

