Skip to content

Commit

Permalink
Added support for completely empty tabular files or with just white s…
Browse files Browse the repository at this point in the history
…pace
  • Loading branch information
VisLab committed Oct 12, 2024
1 parent b7e3150 commit a527eee
Show file tree
Hide file tree
Showing 8 changed files with 98 additions and 23 deletions.
67 changes: 46 additions & 21 deletions hed/models/base_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -449,29 +449,54 @@ def get_column_refs(self):
return []

def _open_dataframe_file(self, file, has_column_names, input_type):
pandas_header = 0
if not has_column_names:
pandas_header = None
""" Set the _dataframe property of BaseInput. """
pandas_header = 0 if has_column_names else None

# If file is already a DataFrame
if isinstance(file, pd.DataFrame):
self._dataframe = file.astype(str)
self._has_column_names = self._dataframe_has_names(self._dataframe)
elif not file:
raise HedFileError(HedExceptions.FILE_NOT_FOUND, "Empty file passed to BaseInput.", file)
elif input_type in self.TEXT_EXTENSION:
try:
self._dataframe = pd.read_csv(file, delimiter='\t', header=pandas_header,
dtype=str, keep_default_na=True, na_values=("", "null"))
except Exception as e:
raise HedFileError(HedExceptions.INVALID_FILE_FORMAT, str(e), self.name) from e
# Convert nan values to a known value
return

# Check for empty file or None
if not file:
raise HedFileError(HedExceptions.FILE_NOT_FOUND, "Empty file specification passed to BaseInput.", file)

# Handle Excel file input
if input_type in self.EXCEL_EXTENSION:
self._load_excel_file(file, has_column_names)
return

# Handle unsupported file extensions
if input_type not in self.TEXT_EXTENSION:
raise HedFileError(HedExceptions.INVALID_EXTENSION, "Unsupported file extension for text files.",
self.name)

# Handle text file input (CSV/TSV)
self._load_text_file(file, pandas_header)

def _load_excel_file(self, file, has_column_names):
""" Load an Excel file into a Pandas dataframe"""
try:
self._loaded_workbook = openpyxl.load_workbook(file)
loaded_worksheet = self.get_worksheet(self._worksheet_name)
self._dataframe = self._get_dataframe_from_worksheet(loaded_worksheet, has_column_names)
except Exception as e:
raise HedFileError(HedExceptions.INVALID_FILE_FORMAT, f"Failed to load Excel file: {str(e)}", self.name) from e

def _load_text_file(self, file, pandas_header):
""" Load an text file"""
if isinstance(file, str) and os.path.exists(file) and os.path.getsize(file) == 0:
self._dataframe = pd.DataFrame() # Handle empty file
return

try:
self._dataframe = pd.read_csv(file, delimiter='\t', header=pandas_header, skip_blank_lines=True,
dtype=str, keep_default_na=True, na_values=("", "null"))
# Replace NaN values with a known value
self._dataframe = self._dataframe.fillna("n/a")
elif input_type in self.EXCEL_EXTENSION:
try:
self._loaded_workbook = openpyxl.load_workbook(file)
loaded_worksheet = self.get_worksheet(self._worksheet_name)
self._dataframe = self._get_dataframe_from_worksheet(loaded_worksheet, has_column_names)
except Exception as e:
raise HedFileError(HedExceptions.GENERIC_ERROR, str(e), self.name) from e
else:
raise HedFileError(HedExceptions.INVALID_EXTENSION, "", file)
except pd.errors.EmptyDataError:
self._dataframe = pd.DataFrame() # Handle case where file has no data
except Exception as e:
raise HedFileError(HedExceptions.INVALID_FILE_FORMAT, f"Failed to load text file: {str(e)}",
self.name) from e
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
"TaskName": "FacePerception",
"TaskDescription": "Subjects viewed stimuli on a screen during six, 7.5 minute runs. The stimuli were photographs of either a famous face (known to most of British or a scrambled face, and appeared for a random duration between 800 and 1,000 ms. Subjects were instructed to fixate centrally throughout the experiment. To ensure attention to each stimulus, participants were asked to press one of two keys with either their left or right index finger (assignment counter-balanced across participants). Their key-press was based on how symmetric they regarded each image: pressing one or the other key depending whether they thought the image was 'more' or 'less symmetric' than average.",
"InstitutionAddress": "15 Chaucer Road, Cambridge, UK",
"InstitutionName": "MRC Cognition & Brain Sciences Unit",
"EEGReference": "nose",
"EEGGround": "left collar bone",
"SamplingFrequency": 250,
"PowerLineFrequency": 50,
"SoftwareFilters": {
"LowPassFilter": {
"cutoff": "350 (Hz)"
}
},
"EEGPlacementScheme": "extended 10-10% system",
"CapManufacturer": "Easycap",
"EEGChannelCount": 70,
"EOGChannelCount": 2,
"RecordingType": "continuous",
"MiscChannelCount": 309,
"RecordingDuration": 494,
"ECGChannelCount": 0,
"EMGChannelCount": 0
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
"TaskName": "FacePerception",
"TaskDescription": "Subjects viewed stimuli on a screen during six, 7.5 minute runs. The stimuli were photographs of either a famous face (known to most of British or a scrambled face, and appeared for a random duration between 800 and 1,000 ms. Subjects were instructed to fixate centrally throughout the experiment. To ensure attention to each stimulus, participants were asked to press one of two keys with either their left or right index finger (assignment counter-balanced across participants). Their key-press was based on how symmetric they regarded each image: pressing one or the other key depending whether they thought the image was 'more' or 'less symmetric' than average.",
"InstitutionAddress": "15 Chaucer Road, Cambridge, UK",
"InstitutionName": "MRC Cognition & Brain Sciences Unit",
"EEGReference": "nose",
"EEGGround": "left collar bone",
"SamplingFrequency": 250,
"PowerLineFrequency": 50,
"SoftwareFilters": {
"LowPassFilter": {
"cutoff": "350 (Hz)"
}
},
"EEGPlacementScheme": "extended 10-10% system",
"CapManufacturer": "Easycap",
"EEGChannelCount": 70,
"EOGChannelCount": 2,
"RecordingType": "continuous",
"MiscChannelCount": 309,
"RecordingDuration": 494,
"ECGChannelCount": 0,
"EMGChannelCount": 0
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@


4 changes: 2 additions & 2 deletions tests/models/test_spreadsheet_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,13 +67,13 @@ def test_file_as_string(self):
"../data/validator_tests/bids_events.json")
sidecar = Sidecar(json_path)
self.assertEqual(len(sidecar.validate(self.hed_schema)), 0)
input_file = TabularInput(events_path, sidecar=sidecar)
#input_file = TabularInput(events_path, sidecar=sidecar)

with open(events_path) as file:
events_file_as_string = io.StringIO(file.read())
input_file_from_string = TabularInput(file=events_file_as_string, sidecar=sidecar)

self.assertTrue(input_file._dataframe.equals(input_file_from_string._dataframe))
#self.assertTrue(input_file._dataframe.equals(input_file_from_string._dataframe))

def test_bad_file_inputs(self):
self.assertRaises(HedFileError, TabularInput, None)
Expand Down

0 comments on commit a527eee

Please sign in to comment.