Commit 407b957: Bugfixes to deal with duplicate column names

marcelzwiers committed Nov 4, 2024 (1 parent: b1f92cc)

Showing 2 changed files with 55 additions and 33 deletions.

bidscoin/bids.py (44 changes: 24 additions & 20 deletions)

@@ -102,15 +102,14 @@ def eventstable(self) -> pd.DataFrame:
         if not self.isvalid:
             return pd.DataFrame()

-        # Take the columns of interest from the logtable and rename them
         df = copy.deepcopy(self.logtable)

         # Convert the timing values to seconds (with maximally 4 digits after the decimal point)
         df[self.time['cols']] = (df[self.time['cols']].apply(pd.to_numeric, errors='coerce') / self.time['unit']).round(4)

-        # Take the columns of interest and from now on use the BIDS column names
-        df = df.loc[:, [name for item in self.columns for name in item.values()]]
-        df.columns = [name for item in self.columns for name in item.keys()]
+        # Take the logtable columns of interest and from now on use the BIDS column names
+        df = df.loc[:, [sourcecol for item in self.columns for sourcecol in item.values() if sourcecol]]
+        df.columns = [eventscol for item in self.columns for eventscol, sourcecol in item.items() if sourcecol]

         # Set the clock at zero at the start of the experiment
         if self.time.get('start'):

@@ -121,22 +120,22 @@ def eventstable(self) -> pd.DataFrame:
             df['onset'] = df['onset'] - df['onset'][start].iloc[0]     # Take the time of the first occurrence as zero

         # Loop over the row groups to filter/edit the rows
-        rows = pd.Series([len(self.rows) == 0] * len(df)).astype(bool)      # Series with True values if no row expressions were specified
+        rows = pd.Series([len(self.rows) == 0] * len(df)).astype(bool)      # Boolean series with True values if no row expressions were specified
         for group in self.rows:

             for column, regex in group['include'].items():

-                # Get the rows that match the expression
+                # Get the rows that match the expression, i.e. make them True
                 rowgroup = self.logtable[column].astype(str).str.fullmatch(str(regex))

                 # Add the matching rows to the grand rows group
-                rows |= rowgroup
+                rows |= rowgroup.values

                 # Write the value(s) of the matching rows
-                for newcolumn, newvalue in (group.get('cast') or {}).items():
-                    df.loc[rowgroup, newcolumn] = newvalue
+                for colname, values in (group.get('cast') or {}).items():
+                    df.loc[rowgroup, colname] = values

-        return df.loc[rows].sort_values(by='onset')
+        return df.loc[rows.values].sort_values(by='onset')

     @property
     def columns(self) -> List[dict]:
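
The switch from `rows |= rowgroup` to `rows |= rowgroup.values` (and likewise `df.loc[rows.values]`) sidesteps pandas index alignment: the freshly built `rows` mask carries a default RangeIndex, while a logtable sliced out of a longer source file keeps its original row labels. A minimal sketch of the failure mode (toy data, not from the repository):

```python
import pandas as pd

# A table sliced out of a longer log file keeps its original row labels
df = pd.DataFrame({'onset': [1.5, 2.5, 3.5]}, index=[10, 11, 12])

# ...whereas a freshly built mask gets a default RangeIndex (0, 1, 2)
rows = pd.Series([False, True, True])

# df.loc[rows] raises IndexingError ("Unalignable boolean Series provided as
# indexer") because .loc aligns the mask on labels rather than on position;
# the underlying numpy array indexes positionally and avoids the alignment
print(df.loc[rows.values])    # the rows with onset 2.5 and 3.5
```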

@@ -177,30 +176,35 @@ def is_float(s):
                 return False

         if not (valid := len(self.columns) >= 2):
-            LOGGER.warning(f"Events table must have at least two columns, got {len(self.columns)} instead")
+            LOGGER.warning(f"Events table must have at least two columns, got {len(self.columns)} instead\n{self}")
             return False

         if (key := [*self.columns[0].keys()][0]) != 'onset':
-            LOGGER.warning(f"First events column must be named 'onset', got '{key}' instead")
+            LOGGER.warning(f"First events column must be named 'onset', got '{key}' instead\n{self}")
             valid = False

         if (key := [*self.columns[1].keys()][0]) != 'duration':
-            LOGGER.warning(f"Second events column must be named 'duration', got '{key}' instead")
+            LOGGER.warning(f"Second events column must be named 'duration', got '{key}' instead\n{self}")
             valid = False

         if len(self.time.get('cols',[])) < 2:
-            LOGGER.warning(f"Events table must have at least two timecol items, got {len(self.time.get('cols',[]))} instead")
+            LOGGER.warning(f"Events table must have at least two timecol items, got {len(self.time.get('cols',[]))} instead\n{self}")
             return False

         elif not is_float(self.time.get('unit')):
-            LOGGER.warning(f"Time conversion factor must be a float, got '{self.time.get('unit')}' instead")
+            LOGGER.warning(f"Time conversion factor must be a float, got '{self.time.get('unit')}' instead\n{self}")
             valid = False

+        # Check if the logtable has existing and unique column names
+        df = self.logtable
         for name in set([name for item in self.columns for name in item.values()] + [name for item in self.rows for name in item['include'].keys()] +
                         [*self.time.get('start',{}).keys()] + self.time.get('cols',[])):
-            if name not in self.logtable:
-                LOGGER.warning(f"Column '{name}' not found in the event table of {self.sourcefile}")
+            if name and name not in df:
+                LOGGER.warning(f"Column '{name}' not found in the event table of {self}")
                 valid = False
+        if not df.columns[df.columns.duplicated()].empty:
+            LOGGER.warning(f"Duplicate columns found in: {df.columns}\n{self}")
+            valid = False

         return valid
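
For reference, `Index.duplicated()` marks every occurrence of a name after the first, so the new check flags any logtable whose header contains a repeated column. A standalone sketch with toy data:

```python
import pandas as pd

df = pd.DataFrame([[0.1, 1.0, 0.2]], columns=['onset', 'duration', 'onset'])

dupes = df.columns[df.columns.duplicated()]    # Index(['onset'], dtype='object')
if not dupes.empty:
    print(f"Duplicate columns found in: {df.columns}")
```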

@@ -677,7 +681,7 @@ def check(self, checks: Tuple[bool, bool, bool]=(False, False, False)) -> Tuple[
         for ext in ('.tsv', '.tsv.gz'):    # NB: `ext` used to be '.json', which is more generic (but see https://github.com/bids-standard/bids-validator/issues/2113)
             if run_suffixok := bids_validator.BIDSValidator().is_bids(f"/sub-unknown/{datatype}/{bidsname}{ext}"): break    # NB: Using the BIDSValidator sounds nice but doesn't give any control over the BIDS-version
         run_valsok = run_suffixok
-        LOGGER.bcdebug(f"bidsname={run_suffixok}: /sub-unknown/{datatype}/{bidsname}.*")
+        LOGGER.bcdebug(f"bidsname (suffixok={run_suffixok}): /sub-unknown/{datatype}/{bidsname}.*")

         if checks[0] and run_keysok in (None, False):
             LOGGER.bcdebug(f'Invalid "{run_keysok}" key-checks in run-item: "{bids["suffix"]}" ({datatype} -> {provenance})\nRun["bids"]:\t{bids}')
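
The loop above tries both extensions because `BIDSValidator.is_bids()` checks one concrete, dataset-rooted path at a time. In isolation (the example paths and the expected results are illustrative, assuming the validator's bundled BIDS rules):

```python
from bids_validator import BIDSValidator

validator = BIDSValidator()

# is_bids() takes a POSIX path relative to the dataset root and returns a bool
print(validator.is_bids('/sub-unknown/func/sub-unknown_task-rest_events.tsv'))    # True
print(validator.is_bids('/sub-unknown/func/sub-unknown_events.tsv'))              # False: the task entity is required
```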

@@ -1100,8 +1104,8 @@ def __init__(self, yamlfile: Path, folder: Path=templatefolder, plugins: Iterabl
             module = bcoin.import_plugin(plugin)
             if not self.plugins.get(plugin):
                 LOGGER.info(f"Adding default bidsmap options from the {plugin} plugin")
-                self.plugins[plugin] = module.OPTIONS if 'OPTIONS' in dir(module) else {}
-            if 'BIDSMAP' in dir(module) and yamlfile.parent == templatefolder:
+                self.plugins[plugin] = module.OPTIONS if hasattr(module, 'OPTIONS') else {}
+            if hasattr(module, 'BIDSMAP') and yamlfile.parent == templatefolder:
                 for dataformat, datasection in module.BIDSMAP.items():
                     if dataformat not in bidsmap_data:
                         LOGGER.info(f"Adding default bidsmappings from the {plugin} plugin")

bidscoin/plugins/events2bids.py (44 changes: 31 additions & 13 deletions)

@@ -249,33 +249,51 @@ def __init__(self, sourcefile: Path, _data: dict, options: dict):
     def logtable(self) -> pd.DataFrame:
         """Returns a Presentation log-table"""

-        nrows           = len(self._sourcetable)
-        stimulus_header = (self._sourcetable.iloc[:, 0] == 'Event Type').idxmax() or nrows
-        video_header    = (self._sourcetable.iloc[:, 0] == 'filename').idxmax() or nrows
-        survey_header   = (self._sourcetable.iloc[:, 0] == 'Time').idxmax() or nrows
-
-        # Keep only the event, stimulus, video or survey table
-        self._sourcetable.columns = self._columns
+        df              = self._sourcetable
+        nrows           = len(df)
+        stimulus_header = (df.iloc[:, 0] == 'Event Type').idxmax() or nrows
+        video_header    = (df.iloc[:, 0] == 'filename').idxmax() or nrows
+        survey_header   = (df.iloc[:, 0] == 'Time').idxmax() or nrows
+
+        # Get the row indices to slice the event, stimulus, video or survey table
+        df.columns = self._columns
         if self.options['table'] == 'event':
             begin = 0
             end   = min(stimulus_header, video_header, survey_header)
         elif self.options['table'] == 'stimulus':
-            self._sourcetable.columns = self._sourcetable.iloc[stimulus_header]
+            df.columns = df.iloc[stimulus_header]
             begin = stimulus_header + 1
             end   = min(video_header, survey_header)
         elif self.options['table'] == 'video':
-            self._sourcetable.columns = self._sourcetable.iloc[video_header]
+            df.columns = df.iloc[video_header]
             begin = video_header + 1
             end   = survey_header
         elif self.options['table'] == 'survey':
-            self._sourcetable.columns = self._sourcetable.iloc[survey_header]
+            df.columns = df.iloc[survey_header]
             begin = survey_header + 1
             end   = nrows
         else:
             begin = 0
             end   = nrows
             LOGGER.error(f"NOT IMPLEMENTED TABLE: {self.options['table']}")

-        LOGGER.bcdebug(f"Slicing '{self.options['table']}' sourcetable[{begin}:{end}]")
-
-        return self._sourcetable.iloc[begin:end]
+        LOGGER.bcdebug(f"Slicing '{self.options['table']}{df.shape}' sourcetable[{begin}:{end}]")
+
+        # Ensure unique column names by renaming columns with NaN or empty names and by appending suffixes to duplicate names
+        cols = []                               # The new column names
+        dupl = {}                               # The duplicate index number
+        for i, col in enumerate(df.columns):
+            if pd.isna(col) or col == '':       # Check if the column name is NaN or an empty string
+                cols.append(new_col := f"unknown_{i}")
+                LOGGER.info(f"Renaming empty column name at index {i}: {col} -> {new_col}")
+            elif col in dupl:                   # If duplicate, append the index number
+                dupl[col] += 1
+                cols.append(new_col := f"{col}_{dupl[col]}")
+                LOGGER.info(f"Renaming duplicate column name: {col} -> {new_col}")
+            else:                               # First occurrence of the column name, add it to dupl
+                dupl[col] = 0
+                cols.append(col)
+        df.columns = cols
+
+        # Return the sliced table
+        return df.iloc[begin:end]
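
Two patterns in this hunk are worth unpacking. First, the header detection: `(df.iloc[:, 0] == value).idxmax()` returns the label of the first matching row, and on a default RangeIndex an all-False comparison yields 0, which the `or nrows` fallback then maps to the table length, i.e. "no such sub-table". A toy illustration (not from the repository):

```python
import pandas as pd

col0  = pd.Series(['scenario', 'trial', 'Event Type', 'Picture'])
nrows = len(col0)

print((col0 == 'Event Type').idxmax() or nrows)    # 2: label of the first match
print((col0 == 'filename').idxmax() or nrows)      # 4: no match, idxmax() gives 0 and `or` falls back to nrows
```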
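Second, the renaming loop itself, applied to a toy header to show the names it produces (same logic as the commit, toy data):

```python
import pandas as pd

df = pd.DataFrame([[1, 2, 3, 4]], columns=['Time', 'Time', None, 'Code'])

cols, dupl = [], {}
for i, col in enumerate(df.columns):
    if pd.isna(col) or col == '':    # empty/NaN header -> positional placeholder
        cols.append(f"unknown_{i}")
    elif col in dupl:                # repeated header -> append a running suffix
        dupl[col] += 1
        cols.append(f"{col}_{dupl[col]}")
    else:                            # first occurrence -> keep the name as-is
        dupl[col] = 0
        cols.append(col)
df.columns = cols

print(list(df.columns))              # ['Time', 'Time_1', 'unknown_2', 'Code']
```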
