Skip to content

Commit

Permalink
[WIP] Handle 2-row files gracefully
Browse files Browse the repository at this point in the history
  • Loading branch information
freddie-freeloader committed Oct 15, 2024
1 parent ce396aa commit 399b066
Showing 1 changed file with 60 additions and 53 deletions.
113 changes: 60 additions & 53 deletions src/sheetreader_extension.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -788,22 +788,30 @@ inline unique_ptr<FunctionData> SheetreaderBindFun(ClientContext &context, Table
// Determine column types & names
// =====================================

//! Cell types in the first row after skipped rows
vector<CellType> cell_types_first_row;
//! Cell types in the second row after skipped rows
vector<CellType> cell_types_second_row;
//! Cell values in the first row after skipped rows
vector<XlsxCell> cells_first_row;

// First buffer of first thread
auto first_buffer = &sheet->mCells[0].front();

// Probing the first two rows to get the types
// Check preconditions for determining column types:
//
// 0. Need minimum of two rows to determine column types
if (number_rows < 2) {
throw BinderException("Need minimum of two rows to determine column types");
}
//
// 1. Check that nothing weird happened due to interleaved reading
if (first_buffer->size() < number_columns * 2) {
throw BinderException("Internal SheetReader extension error: Need minimum of two rows in first buffer to "
"determine column types and auto detect header row");
}

//! Cell types in the first row after skipped rows
vector<CellType> cell_types_first_row;
//! Cell types in the second row after skipped rows
vector<CellType> cell_types_second_row;
//! Cell values in the first row after skipped rows
vector<XlsxCell> cells_first_row;

// Probing the first two rows to get the types
for (idx_t i = 0; i < number_columns; i++) {
cell_types_first_row.push_back(sheet->mCells[0].front()[i].type);
cells_first_row.push_back(sheet->mCells[0].front()[i]);
Expand All @@ -829,57 +837,56 @@ inline unique_ptr<FunctionData> SheetreaderBindFun(ClientContext &context, Table

vector<LogicalType> column_types_second_row;
vector<string> column_names_second_row;

//! Indicates whether a header row was detected
bool header_detected = false;

if (number_rows > 1) {
// Check if second row contains only string values, get DuckDB types & generic column names
bool second_row_all_string =
ConvertCellTypes(column_types_second_row, column_names_second_row, cell_types_second_row);

// If the first row contains only string values, but the second row doesn't, we assume that the first row is a
// header row
if (use_header || (first_row_all_string && !second_row_all_string)) {
header_detected = true;

// Since the first row is a header row, we use the cell types of the second row
return_types = column_types_second_row;
bind_data->types = column_types_second_row;

//! Column names determined from the first row
vector<string> header_names;

// Get header names from cell values of first row
for (idx_t j = 0; j < cells_first_row.size(); j++) {
switch (cells_first_row[j].type) {
case CellType::T_STRING_REF: {
auto value = bind_data->xlsx_file.getString(cells_first_row[j].data.integer);
header_names.push_back(value);
break;
}
case CellType::T_STRING:
case CellType::T_STRING_INLINE: {
// TODO
throw BinderException("Inline & dynamic String types not supported yet");
break;
}
default:
throw BinderException("Header row contains non-string values");
}
// Check if second row contains only string values, get DuckDB types & generic column names
bool second_row_all_string =
ConvertCellTypes(column_types_second_row, column_names_second_row, cell_types_second_row);

// If the first row contains only string values, but the second row doesn't, we assume that the first row is a
// header row
if (use_header || (first_row_all_string && !second_row_all_string)) {
header_detected = true;

// Since the first row is a header row, we use the cell types of the second row
return_types = column_types_second_row;
bind_data->types = column_types_second_row;

//! Column names determined from the first row
vector<string> header_names;

// Get header names from cell values of first row
for (idx_t j = 0; j < cells_first_row.size(); j++) {
switch (cells_first_row[j].type) {
case CellType::T_STRING_REF: {
auto value = bind_data->xlsx_file.getString(cells_first_row[j].data.integer);
header_names.push_back(value);
break;
}
case CellType::T_STRING:
case CellType::T_STRING_INLINE: {
// TODO
throw BinderException("Inline & dynamic String types not supported yet");
break;
}
default:
throw BinderException("Header row contains non-string values");
}
}

// Set column names to header names
names = header_names;
bind_data->names = header_names;
} else {
// If first row is not a header row, we use the cell types of the first row for the column types
return_types = column_types_first_row;
bind_data->types = column_types_first_row;
// Set column names to header names
names = header_names;
bind_data->names = header_names;
} else {
// If first row is not a header row, we use the cell types of the first row for the column types
return_types = column_types_first_row;
bind_data->types = column_types_first_row;

// Use generic column names
names = column_names_first_row;
bind_data->names = column_names_first_row;
}
// Use generic column names
names = column_names_first_row;
bind_data->names = column_names_first_row;
}

// Since header is only used for determining column names, we skip it
Expand Down

0 comments on commit 399b066

Please sign in to comment.