Skip to content

Commit

Permalink
Merge pull request #1878 from cloudsufi/r2.12-cherry-pick-57df4b4e8bd3bfbcd5ac02a145813b762aa89781
Browse files Browse the repository at this point in the history

[🍒][PLUGIN-1795] Override Array Max (Apache Poi) for large files
  • Loading branch information
psainics authored Aug 13, 2024
2 parents 7d3f287 + 2361c8d commit 795aa40
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 4 deletions.
4 changes: 2 additions & 2 deletions core-plugins/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -186,12 +186,12 @@
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>5.2.4</version>
<version>5.2.5</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>5.2.4</version>
<version>5.2.5</version>
</dependency>
<dependency>
<groupId>com.github.pjfanning</groupId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.usermodel.WorkbookFactory;
import org.apache.poi.ss.util.CellReference;
import org.apache.poi.util.IOUtils;

import java.io.IOException;
import java.io.InputStream;
Expand All @@ -66,6 +67,8 @@ public class ExcelInputFormat extends TextInputFormat {
public static final String FILE_PATTERN = "filePattern";
public static final String SHEET = "sheet";
public static final String SHEET_VALUE = "sheetValue";
public static final String EXCEL_BYTE_ARRAY_MAX_OVERRIDE = "excel.byteArrayMaxOverride";
public static final int EXCEL_BYTE_ARRAY_MAX_OVERRIDE_DEFAULT = Integer.MAX_VALUE / 2;

@Override
public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) {
Expand All @@ -80,7 +83,7 @@ public boolean isSplitable(JobContext context, Path file) {
public static void setConfigurations(Job job, String filePattern, String sheet, boolean reprocess,
String sheetValue, String columnList, boolean skipFirstRow,
String terminateIfEmptyRow, String rowLimit, String ifErrorRecord,
String processedFiles) {
String processedFiles, int byteArrayMaxOverride) {

Configuration configuration = job.getConfiguration();
configuration.set(FILE_PATTERN, filePattern);
Expand All @@ -100,6 +103,7 @@ public static void setConfigurations(Job job, String filePattern, String sheet,

configuration.set(IF_ERROR_RECORD, ifErrorRecord);
configuration.set(PROCESSED_FILES, processedFiles);
configuration.set(EXCEL_BYTE_ARRAY_MAX_OVERRIDE, String.valueOf(byteArrayMaxOverride));
}


Expand Down Expand Up @@ -175,6 +179,9 @@ public void initialize(InputSplit genericSplit, TaskAttemptContext context) thro
isStreaming = true;
break;
case OLE2:
// workaround for large files
IOUtils.setByteArrayMaxOverride(job.getInt(EXCEL_BYTE_ARRAY_MAX_OVERRIDE,
ExcelInputFormat.EXCEL_BYTE_ARRAY_MAX_OVERRIDE_DEFAULT));
workbook = WorkbookFactory.create(is);
break;
default:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -318,11 +318,16 @@ public void prepareRun(BatchSourceContext batchSourceContext) throws Exception {
processFiles = GSON.toJson(getAllProcessedFiles(batchSourceContext), ARRAYLIST_PREPROCESSED_FILES);
}

Map<String, String> arguments = new HashMap<>(batchSourceContext.getArguments().asMap());
int byteArrayMaxOverride = arguments.containsKey(ExcelInputFormat.EXCEL_BYTE_ARRAY_MAX_OVERRIDE) ?
Integer.parseInt(arguments.get(ExcelInputFormat.EXCEL_BYTE_ARRAY_MAX_OVERRIDE)) :
ExcelInputFormat.EXCEL_BYTE_ARRAY_MAX_OVERRIDE_DEFAULT;

ExcelInputFormat.setConfigurations(job, excelInputreaderConfig.filePattern, excelInputreaderConfig.sheet,
excelInputreaderConfig.reprocess, excelInputreaderConfig.sheetValue,
excelInputreaderConfig.columnList, excelInputreaderConfig.skipFirstRow,
excelInputreaderConfig.terminateIfEmptyRow, excelInputreaderConfig.rowsLimit,
excelInputreaderConfig.ifErrorRecord, processFiles);
excelInputreaderConfig.ifErrorRecord, processFiles, byteArrayMaxOverride);

// Sets the input path(s).
ExcelInputFormat.addInputPaths(job, excelInputreaderConfig.filePath);
Expand Down

0 comments on commit 795aa40

Please sign in to comment.