From 926d91264a7cf89758bdb62bdd3ee18ac56c5a2a Mon Sep 17 00:00:00 2001 From: Dmitry Romanov Date: Thu, 6 Feb 2025 11:34:17 -0500 Subject: [PATCH] Exchange format changes --- pyrobird/pyrobird/cli/convert.py | 26 +++- pyrobird/pyrobird/edm4eic.py | 208 +++++++++++++++---------------- 2 files changed, 123 insertions(+), 111 deletions(-) diff --git a/pyrobird/pyrobird/cli/convert.py b/pyrobird/pyrobird/cli/convert.py index 0adea91..a3abb8b 100644 --- a/pyrobird/pyrobird/cli/convert.py +++ b/pyrobird/pyrobird/cli/convert.py @@ -42,9 +42,14 @@ def guess_output_name(input_entry, output_extension='.firebird.json'): "-e", "--entries", "entries_str", default="0", help="Entry/event number to convert. Could be value, comma separated list or range. E.g '--entry=1,3-5,8'" ) +@click.option( + "-c", "--collections", "collections_str", default="", + help="Comma-separated list of collection types to convert. " + "For example: 'tracker_hits,tracks'." +) # TODO @click.option("-t", "--type", "input_type", default=None, help="Input file type. Currently only edm4eic supported") @click.argument("filename", required=True) -def convert(filename, output_file, entries_str): +def convert(filename, output_file, entries_str, collections_str): """ Converts an input EDM4eic ROOT file to a Firebird-compatible JSON file. @@ -59,14 +64,20 @@ def convert(filename, output_file, entries_str): Use `-o -` or `--output -` to output the JSON data to stdout instead of a file. This allows the command to be used in pipelines. + Use `-c` or `--collections` to specify specific collections to convert: + - tracker_hits - edm4eic::TrackerHitData + - tracks - edm4eic::TrackSegmentData with associated tracks + Currently, only EDM4eic format is supported. + **Example usage:** \b convert mydata.root convert mydata.root --output output.firebird.json convert mydata.root --output - | less + convert mydata.root --collections=tracks """ import uproot @@ -84,6 +95,11 @@ def convert(filename, output_file, entries_str): # Parse use entries input entries = parse_entry_numbers(entries_str) + # Parse collections string + collections = None + if collections_str: + collections = [x.strip() for x in collections_str.split(',') if x.strip()] + # Do we have valid entries? for entry_index in entries: if entry_index > num_entries - 1: @@ -91,9 +107,13 @@ def convert(filename, output_file, entries_str): f"but entry index={entry_index} is outside of total num_entries={num_entries}" raise ValueError(err_msg) - # Extract the first event from the tree - event = edm4eic_to_dict(tree, entries, origin_info={"file": filename}) + origin_info = { + "file": filename, + "entries_count": num_entries + } + + event = edm4eic_to_dict(tree, entries, origin_info, collections=collections) # Convert the event data to JSON format json_data = json.dumps(event) diff --git a/pyrobird/pyrobird/edm4eic.py b/pyrobird/pyrobird/edm4eic.py index 5a2ce09..1d98e80 100644 --- a/pyrobird/pyrobird/edm4eic.py +++ b/pyrobird/pyrobird/edm4eic.py @@ -119,6 +119,15 @@ def parse_entry_numbers(value): def tracker_hits_to_box_hits(tree, branch_name, entry_start, entry_stop=None): """Converts vector to HitBox format dictionary""" + result = { + "name": branch_name, + "type": "TrackerLinePointTrajectory", + "originType": ["edm4eic::TrackPoint", "edm4eic::TrackSegmentData"], + "paramColumns": ["px", "py", "pz", "charge", "phi", "theta", "qOverP", "chi2", "ndf"], + "pointColumns": ["x", "y", "z", "t", "dx", "dy", "dz", "dt"], + "lines": [] + } + # Read only 1 event if entry_stop is not given if entry_stop is None: entry_stop = entry_start + 10 @@ -176,12 +185,19 @@ def track_segments_to_line_trajectories(tree, branch_name, entry_start, entry_st if entry_stop is None: entry_stop = entry_start + 1 + result = { + "name": branch_name, + "type": "TrackerLinePointTrajectory", + "originType": ["edm4eic::TrackPoint", "edm4eic::TrackSegmentData"], + "paramColumns": [], + "pointColumns": ["x", "y", "z", "t", "dx", "dy", "dz", "dt"], + "lines": [] + } # -- Grab the arrays for the main TrackSegmentData - seg_length = ak.flatten(tree[f'{branch_name}/{branch_name}.length'].array(entry_start=entry_start, entry_stop=entry_stop)).to_list() - seg_lenerr = ak.flatten(tree[f'{branch_name}/{branch_name}.lengthError'].array(entry_start=entry_start, entry_stop=entry_stop)).to_list() - seg_begin = ak.flatten(tree[f'{branch_name}/{branch_name}.points_begin'].array(entry_start=entry_start, entry_stop=entry_stop)).to_list() - seg_end = ak.flatten(tree[f'{branch_name}/{branch_name}.points_end'].array(entry_start=entry_start, entry_stop=entry_stop)).to_list() + seg_points_begin_index = ak.flatten(tree[f'{branch_name}/{branch_name}.points_begin'].array(entry_start=entry_start, entry_stop=entry_stop)).to_list() + seg_points_end_index = ak.flatten(tree[f'{branch_name}/{branch_name}.points_end'].array(entry_start=entry_start, entry_stop=entry_stop)).to_list() + # TODO selecting Tracks by track_objid_index will not work because of https://github.com/eic/EICrecon/issues/1730 # If the file also has a one-to-one relation to Track => read the objectIDs # (Many times stored in _CentralTrackSegments_track/*): track_objid_index = None @@ -190,43 +206,22 @@ def track_segments_to_line_trajectories(tree, branch_name, entry_start, entry_st objid_branch_base = f'_{"_".join(branch_name.split())}_track' # e.g. _CentralTrackSegments_track if objid_branch_base in tree.keys(): # inside that we typically have .index and .collectionID - track_objid_index = ak.flatten( - tree[f'{objid_branch_base}/{objid_branch_base}.index'].array( - entry_start=entry_start, entry_stop=entry_stop - ) - ).to_list() - track_objid_coll = ak.flatten( - tree[f'{objid_branch_base}/{objid_branch_base}.collectionID'].array( - entry_start=entry_start, entry_stop=entry_stop - ) - ).to_list() + track_objid_index = ak.flatten(tree[f'{objid_branch_base}/{objid_branch_base}.index'].array(entry_start=entry_start, entry_stop=entry_stop)).to_list() + track_objid_coll = ak.flatten(tree[f'{objid_branch_base}/{objid_branch_base}.collectionID'].array(entry_start=entry_start, entry_stop=entry_stop)).to_list() # -- Now get the TrackPoints for "CentralTrackSegments.points" - # Typically, Podio names the sub-collection something like "CentralTrackSegments_points" - # and the relevant fields are "position.x", "time", "positionError.xx", etc. - # If your file changes naming, you must adapt these patterns. - - # We guess the naming: "_CentralTrackSegments_points/..." + # Podio names the sub-collection something like "_CentralTrackSegments_points/..." # For safety, we search among keys that start with __points' points_collection_name = f'_{branch_name}_points' if points_collection_name not in tree.keys(): # Possibly the file organizes them differently, or there are no points # We return an empty dictionary if not found - return { - "name": branch_name, - "type": "TrackerLinePointTrajectory", - "originType": ["edm4eic::TrackPoint", "edm4eic::TrackSegmentData"], - "paramColumns": ["px","py","pz","charge","phi","theta","qOverP","chi2","ndf"], - "pointColumns": ["x","y","z","t","dx","dy","dz","dt"], - "lines": [] - } + return result def get_points_field_array(field_suffix): """Helper to flatten points arrays from the sub-collection.""" full_branch = f'{points_collection_name}/{points_collection_name}.{field_suffix}' - return ak.flatten( - tree[full_branch].array(entry_start=entry_start, entry_stop=entry_stop) - ).to_list() + return ak.flatten(tree[full_branch].array(entry_start=entry_start, entry_stop=entry_stop)).to_list() # Grab the trackpoint fields p_x = get_points_field_array("position.x") @@ -241,75 +236,66 @@ def get_points_field_array(field_suffix): # p_path = get_points_field_array("pathlength") # p_patherr = get_points_field_array("pathlengthError") + # # TODO selecting Tracks by track_objid_index will not work because of https://github.com/eic/EICrecon/issues/1730 # -- Optionally load track collection to get momentum, charge, etc. # We'll do a quick attempt for "CentralCKFTracks", but if that doesn't exist, # we'll skip. - ckf_branch = "CentralCKFTracks" - ckf_exists = (ckf_branch in tree.keys()) + params_branch = "CentralCKFTrackParameters" + params_exists = (params_branch in tree.keys()) # If present, read the relevant arrays for indexing - if ckf_exists: - t_px = ak.flatten(tree[f'{ckf_branch}/{ckf_branch}.momentum.x'].array(entry_start=entry_start, entry_stop=entry_stop)).to_list() - t_py = ak.flatten(tree[f'{ckf_branch}/{ckf_branch}.momentum.y'].array(entry_start=entry_start, entry_stop=entry_stop)).to_list() - t_pz = ak.flatten(tree[f'{ckf_branch}/{ckf_branch}.momentum.z'].array(entry_start=entry_start, entry_stop=entry_stop)).to_list() - t_charge= ak.flatten(tree[f'{ckf_branch}/{ckf_branch}.charge'].array(entry_start=entry_start, entry_stop=entry_stop)).to_list() - t_chi2 = ak.flatten(tree[f'{ckf_branch}/{ckf_branch}.chi2'].array(entry_start=entry_start, entry_stop=entry_stop)).to_list() - t_ndf = ak.flatten(tree[f'{ckf_branch}/{ckf_branch}.ndf'].array(entry_start=entry_start, entry_stop=entry_stop)).to_list() + if params_exists: + trk_theta = ak.flatten(tree[f'{params_branch}/{params_branch}.theta'].array(entry_start=entry_start, entry_stop=entry_stop)).to_list() + trk_phi = ak.flatten(tree[f'{params_branch}/{params_branch}.phi'].array(entry_start=entry_start, entry_stop=entry_stop)).to_list() + trk_qoverp = ak.flatten(tree[f'{params_branch}/{params_branch}.qOverP'].array(entry_start=entry_start, entry_stop=entry_stop)).to_list() + trk_loc_a = ak.flatten(tree[f'{params_branch}/{params_branch}.loc.a'].array(entry_start=entry_start, entry_stop=entry_stop)).to_list() + trk_loc_b = ak.flatten(tree[f'{params_branch}/{params_branch}.loc.b'].array(entry_start=entry_start, entry_stop=entry_stop)).to_list() + trk_time = ak.flatten(tree[f'{params_branch}/{params_branch}.time'].array(entry_start=entry_start, entry_stop=entry_stop)).to_list() + result["paramColumns"] = [ + "theta", + "phi", + "q_over_p", + "loc_a", + "loc_b", + "time" + ] else: - # fallback - t_px = t_py = t_pz = t_charge = t_chi2 = t_ndf = [] + trk_theta = [] + trk_phi = [] + trk_qoverp = [] + trk_loc_a = [] + trk_loc_b = [] + trk_time = [] lines = [] - n_segments = len(seg_length) - for i in range(n_segments): - beg = seg_begin[i] - end = seg_end[i] - # gather the track points [beg..end) + n_segments = len(seg_points_begin_index) + + if params_exists and n_segments != len(trk_theta): + print(f"WARNING: len(CentralCKFParameters) != len({branch_name}). Might be a sign of format change or broken tree") + + # Check, we should have the same number of segments and parameters + for seg_index in range(n_segments): segment_points = [] - for j in range(beg, end): - dx = 2.0 * np.sqrt(p_exx[j]) if p_exx[j] > 0 else 0.0 - dy = 2.0 * np.sqrt(p_eyy[j]) if p_eyy[j] > 0 else 0.0 - dz = 2.0 * np.sqrt(p_ezz[j]) if p_ezz[j] > 0 else 0.0 - dt = p_terr[j] + for point_index in range(seg_points_begin_index[seg_index], seg_points_end_index[seg_index]): + # In that snippet, p_exx[point_index] is assumed to be the 𝜎^2 in the x,y,z-coordinate of point’s position. + # x2.0 represents “plus-or-minus one sigma” as the entire width in that direction. + dx = 2.0 * np.sqrt(p_exx[point_index]) if p_exx[point_index] > 0 else 0.0 + dy = 2.0 * np.sqrt(p_eyy[point_index]) if p_eyy[point_index] > 0 else 0.0 + dz = 2.0 * np.sqrt(p_ezz[point_index]) if p_ezz[point_index] > 0 else 0.0 + dt = p_terr[point_index] # pointColumns => [x, y, z, t, dx, dy, dz, dt] - point_val = [ - p_x[j], - p_y[j], - p_z[j], - p_t[j], - dx, - dy, - dz, - dt - ] + point_val = [p_x[point_index], p_y[point_index], p_z[point_index], p_t[point_index], dx, dy, dz, dt] segment_points.append(point_val) # Attempt to get track params from the track reference params_list = [] - if track_objid_index is not None and i < len(track_objid_index): - trk_index = track_objid_index[i] - # check if we can read from 'CentralCKFTracks' - if ckf_exists and trk_index < len(t_px): - px = t_px[trk_index] - py = t_py[trk_index] - pz = t_pz[trk_index] - ch = t_charge[trk_index] - # compute phi, theta, qOverP - p_mag = math.sqrt(px*px + py*py + pz*pz) if pz == pz else 0 - if p_mag > 1e-12: - phi = math.atan2(py, px) - theta = math.acos(pz / p_mag) - q_over_p = ch / p_mag - else: - phi = 0.0 - theta = 0.0 - q_over_p = 0.0 - chi2 = t_chi2[trk_index] - ndf = t_ndf[trk_index] - params_list = [px, py, pz, ch, phi, theta, q_over_p, chi2, ndf] - else: - # No track info available - params_list = [] + if params_exists and seg_index < len(trk_theta): + params_list.append(trk_theta[seg_index]) + params_list.append(trk_phi[seg_index]) + params_list.append(trk_qoverp[seg_index]) + params_list.append(trk_loc_a[seg_index]) + params_list.append(trk_loc_b[seg_index]) + params_list.append(trk_time [seg_index]) line = { "points": segment_points, @@ -317,30 +303,36 @@ def get_points_field_array(field_suffix): } lines.append(line) - comp_dict = { - "name": branch_name, - "type": "TrackerLinePointTrajectory", - "originType": ["edm4eic::TrackPoint","edm4eic::TrackSegmentData"], - # A recommended set of columns for 'params' and for 'points' - "paramColumns": ["px","py","pz","charge","phi","theta","qOverP","chi2","ndf"], - "pointColumns": ["x","y","z","t","dx","dy","dz","dt"], - "lines": lines - } - return comp_dict + result["lines"] = lines + return result -def edm4eic_entry_to_dict(tree, entry_index, custom_name=None): - tracker_branches = tree.typenames(recursive=False, full_paths=True, filter_typename="vector") - # >oO debug: pprint(type()) +def edm4eic_entry_to_dict(tree, entry_index, custom_name=None, collections=None): + # the result of this function components = [] - for branch_name in tracker_branches.keys(): - components.append(tracker_hits_to_box_hits(tree, branch_name, entry_index)) - # 2) convert CentralTrackSegments => TrackerLinePointTrajectory, if present - seg_collection = "CentralTrackSegments" - if seg_collection in tree.keys(): - line_comp = track_segments_to_line_trajectories(tree, seg_collection, entry_index, entry_stop=entry_index+1) - components.append(line_comp) + if not collections: + collections = [ + "tracker_hits", + "tracks" + ] + + # Hits: + if "tracker_hits" in collections: + tracker_branches = tree.typenames(recursive=False, full_paths=True, filter_typename="vector") + # >oO debug: pprint(type()) + + for branch_name in tracker_branches.keys(): + components.append(tracker_hits_to_box_hits(tree, branch_name, entry_index)) + + # Tracks + if "tracks" in collections: + # TODO selecting all TrackSegmentData will not work because of https://github.com/eic/EICrecon/issues/1730 + # track_branches = tree.typenames(recursive=False, full_paths=True, filter_typename="vector") + seg_collection = "CentralTrackSegments" + if seg_collection in tree.keys(): + line_comp = track_segments_to_line_trajectories(tree, seg_collection, entry_index, entry_stop=entry_index+1) + components.append(line_comp) entry = { "id": custom_name if custom_name else entry_index, @@ -350,17 +342,17 @@ def edm4eic_entry_to_dict(tree, entry_index, custom_name=None): return entry -def edm4eic_to_dict(tree, entry_ids, origin_info=None): +def edm4eic_to_dict(tree, entry_ids, origin_info=None, collections=None): entries_data = [] if isinstance(entry_ids, int): entry_ids = [entry_ids] for entry_id in entry_ids: - entries_data.append(edm4eic_entry_to_dict(tree, entry_id, custom_name=None)) + entries_data.append(edm4eic_entry_to_dict(tree, entry_id, custom_name=None, collections=collections)) result = { - "version": "0.01", + "version": "0.02", "origin": origin_info, "entries": entries_data }