From c6bd492030c1061a5a41260d7522bc4dab348a77 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 17 Aug 2024 06:43:27 -0400 Subject: [PATCH] Add tests for bad parquet files (#6262) * Add tests for bad parquet files * Reenable test * Add test for very subltley different file --- parquet-testing | 2 +- parquet/tests/arrow_reader/bad_data.rs | 136 +++++++++++++++++++++++++ parquet/tests/arrow_reader/mod.rs | 1 + 3 files changed, 138 insertions(+), 1 deletion(-) create mode 100644 parquet/tests/arrow_reader/bad_data.rs diff --git a/parquet-testing b/parquet-testing index 9b48ff4f94dc..cb7a9674142c 160000 --- a/parquet-testing +++ b/parquet-testing @@ -1 +1 @@ -Subproject commit 9b48ff4f94dc5e89592d46a119884dbb88100884 +Subproject commit cb7a9674142c137367bf75a01b79c6e214a73199 diff --git a/parquet/tests/arrow_reader/bad_data.rs b/parquet/tests/arrow_reader/bad_data.rs new file mode 100644 index 000000000000..6e325f119710 --- /dev/null +++ b/parquet/tests/arrow_reader/bad_data.rs @@ -0,0 +1,136 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Tests that reading invalid parquet files returns an error + +use arrow::util::test_util::parquet_test_data; +use parquet::arrow::arrow_reader::ArrowReaderBuilder; +use parquet::errors::ParquetError; +use std::collections::HashSet; +use std::path::PathBuf; + +static KNOWN_FILES: &[&str] = &[ + "PARQUET-1481.parquet", + "ARROW-GH-41317.parquet", + "ARROW-GH-41321.parquet", + "ARROW-RS-GH-6229-DICTHEADER.parquet", + "ARROW-RS-GH-6229-LEVELS.parquet", + "README.md", +]; + +/// Returns the path to 'parquet-testing/bad_data' +fn bad_data_dir() -> PathBuf { + // points to parquet-testing/data + let parquet_testing_data = parquet_test_data(); + PathBuf::from(parquet_testing_data) + .parent() + .expect("was in parquet-testing/data") + .join("bad_data") +} + +#[test] +// Ensure that if we add a new test the files are added to the tests. +fn test_invalid_files() { + let known_files: HashSet<_> = KNOWN_FILES.iter().cloned().collect(); + let mut seen_files = HashSet::new(); + + let files = std::fs::read_dir(bad_data_dir()).unwrap(); + + for file in files { + let file_name = file + .unwrap() + .path() + .file_name() + .unwrap() + .to_str() + .unwrap() + .to_string(); + + // If you see this error, please add a test for the new file following the model below + assert!( + known_files.contains(file_name.as_str()), + "Found new file in bad_data, please add test: {file_name}" + ); + seen_files.insert(file_name); + } + for expected_file in known_files { + assert!( + seen_files.contains(expected_file), + "Expected file not found in bad_data directory: {expected_file}" + ); + } +} + +#[test] +fn test_parquet_1481() { + let err = read_file("PARQUET-1481.parquet").unwrap_err(); + assert_eq!( + err.to_string(), + "Parquet error: unexpected parquet type: -7" + ); +} + +#[test] +#[should_panic(expected = "assertion failed: self.current_value.is_some()")] +fn test_arrow_gh_41321() { + let err = read_file("ARROW-GH-41321.parquet").unwrap_err(); + assert_eq!(err.to_string(), "TBD (currently panics)"); +} + +#[test] +fn test_arrow_gh_41317() { + let err = read_file("ARROW-GH-41317.parquet").unwrap_err(); + assert_eq!( + err.to_string(), + "External: Parquet argument error: External: bad data" + ); +} + +#[test] +fn test_arrow_rs_gh_6229_dict_header() { + let err = read_file("ARROW-RS-GH-6229-DICTHEADER.parquet").unwrap_err(); + assert_eq!( + err.to_string(), + "External: Parquet argument error: EOF: eof decoding byte array" + ); +} + +#[test] +fn test_arrow_rs_gh_6229_dict_levels() { + let err = read_file("ARROW-RS-GH-6229-LEVELS.parquet").unwrap_err(); + assert_eq!( + err.to_string(), + "External: Parquet argument error: Parquet error: Insufficient repetition levels read from column" + ); +} + +/// Reads the file and tries to return the total row count +/// Returns an error if the file is invalid +fn read_file(name: &str) -> Result { + let path = bad_data_dir().join(name); + println!("Reading file: {:?}", path); + + let file = std::fs::File::open(&path).unwrap(); + let reader = ArrowReaderBuilder::try_new(file)?.build()?; + + let mut num_rows = 0; + for batch in reader { + let batch = batch?; + num_rows += batch.num_rows(); + } + Ok(num_rows) +} diff --git a/parquet/tests/arrow_reader/mod.rs b/parquet/tests/arrow_reader/mod.rs index 7e979dcf3ec0..cc4c8f3c977b 100644 --- a/parquet/tests/arrow_reader/mod.rs +++ b/parquet/tests/arrow_reader/mod.rs @@ -35,6 +35,7 @@ use parquet::file::properties::{EnabledStatistics, WriterProperties}; use std::sync::Arc; use tempfile::NamedTempFile; +mod bad_data; mod statistics; // returns a struct array with columns "int32_col", "float32_col" and "float64_col" with the specified values