From 8d6e37477167d73df2b2f9b9010007f20ccc2295 Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Sat, 16 Nov 2024 20:14:04 +0100 Subject: [PATCH] feat: support extracting only files matching glob patterns --- doc/dwarfsextract.md | 5 +++ include/dwarfs/utility/filesystem_extractor.h | 14 ++++++-- src/utility/filesystem_extractor.cpp | 36 +++++++++++++++++-- test/tool_main_test.cpp | 14 ++++++++ tools/src/dwarfsextract_main.cpp | 22 ++++++++++-- 5 files changed, 84 insertions(+), 7 deletions(-) diff --git a/doc/dwarfsextract.md b/doc/dwarfsextract.md index 577e85255..fed55fe8a 100644 --- a/doc/dwarfsextract.md +++ b/doc/dwarfsextract.md @@ -44,6 +44,11 @@ to disk: case the default is to extract the files to the current directory, or to write the archive data to stdout. +- `--pattern=`*glob-pattern*: + If specified, only extract entries matching the pattern. Can be specified + multiple times, in which case all files matching one or more patterns will + be extracted. + - `-O`, `--image-offset=`*value*|`auto`: Specify the byte offset at which the filesystem is located in the image. Use `auto` to detect the offset automatically. This is also the default. diff --git a/include/dwarfs/utility/filesystem_extractor.h b/include/dwarfs/utility/filesystem_extractor.h index 88da00ed3..c9fd3af0f 100644 --- a/include/dwarfs/utility/filesystem_extractor.h +++ b/include/dwarfs/utility/filesystem_extractor.h @@ -30,6 +30,7 @@ namespace dwarfs { +class glob_matcher; class library_dependencies; class logger; class os_access; @@ -72,7 +73,13 @@ class filesystem_extractor { bool extract(reader::filesystem_v2 const& fs, filesystem_extractor_options const& opts = filesystem_extractor_options()) { - return impl_->extract(fs, opts); + return impl_->extract(fs, nullptr, opts); + } + + bool extract(reader::filesystem_v2 const& fs, glob_matcher const* matcher, + filesystem_extractor_options const& opts = + filesystem_extractor_options()) { + return impl_->extract(fs, matcher, opts); } class impl { @@ -84,8 +91,9 @@ class filesystem_extractor { virtual void open_stream(std::ostream& os, std::string const& format) = 0; virtual void open_disk(std::filesystem::path const& output) = 0; virtual void close() = 0; - virtual bool extract(reader::filesystem_v2 const& fs, - filesystem_extractor_options const& opts) = 0; + virtual bool + extract(reader::filesystem_v2 const& fs, glob_matcher const* matcher, + filesystem_extractor_options const& opts) = 0; }; private: diff --git a/src/utility/filesystem_extractor.cpp b/src/utility/filesystem_extractor.cpp index b81cdef6b..984cd785c 100644 --- a/src/utility/filesystem_extractor.cpp +++ b/src/utility/filesystem_extractor.cpp @@ -24,6 +24,7 @@ #include #include #include +#include // This is required to avoid Windows.h being pulled in by libarchive // and polluting our environment with all sorts of shit. @@ -41,6 +42,7 @@ #include #include +#include #include #include #include @@ -188,7 +190,7 @@ class filesystem_extractor_ final : public filesystem_extractor::impl { } } - bool extract(reader::filesystem_v2 const& fs, + bool extract(reader::filesystem_v2 const& fs, glob_matcher const* matcher, filesystem_extractor_options const& opts) override; private: @@ -248,7 +250,8 @@ class filesystem_extractor_ final : public filesystem_extractor::impl { template bool filesystem_extractor_::extract( - reader::filesystem_v2 const& fs, filesystem_extractor_options const& opts) { + reader::filesystem_v2 const& fs, glob_matcher const* matcher, + filesystem_extractor_options const& opts) { DWARFS_CHECK(a_, "filesystem not opened"); auto lr = ::archive_entry_linkresolver_new(); @@ -350,6 +353,18 @@ bool filesystem_extractor_::extract( } }; + std::unordered_set matched_dirs; + + if (matcher) { + fs.walk([&](auto entry) { + if (auto parent = entry.parent()) { + if (matcher->match(entry.unix_path())) { + matched_dirs.insert(parent->fs_path()); + } + } + }); + } + fs.walk_data_order([&](auto entry) { // TODO: we can surely early abort walk() somehow if (entry.is_root() || hard_error) { @@ -358,6 +373,23 @@ bool filesystem_extractor_::extract( auto inode = entry.inode(); + if (matcher) { + LOG_TRACE << "checking " << entry.unix_path(); + if (inode.is_directory()) { + if (!matched_dirs.contains(entry.fs_path())) { + LOG_TRACE << "skipping directory " << entry.fs_path(); + // no need to extract this directory + return; + } + } else { + if (!matcher->match(entry.unix_path())) { + LOG_TRACE << "skipping " << entry.fs_path(); + // no match, skip this entry + return; + } + } + } + auto ae = ::archive_entry_new(); auto stbuf = fs.getattr(inode); diff --git a/test/tool_main_test.cpp b/test/tool_main_test.cpp index 9e4286584..2d3f3e531 100644 --- a/test/tool_main_test.cpp +++ b/test/tool_main_test.cpp @@ -1983,11 +1983,25 @@ TEST(dwarfsextract_test, mtree) { auto t = dwarfsextract_tester::create_with_image(); ASSERT_EQ(0, t.run({"-i", "image.dwarfs", "-f", "mtree"})) << t.err(); auto out = t.out(); + std::cout << out << "\n"; EXPECT_TRUE(out.starts_with("#mtree")) << out; EXPECT_THAT(out, ::testing::HasSubstr("type=dir")); EXPECT_THAT(out, ::testing::HasSubstr("type=file")); } +TEST(dwarfsextract_test, patterns) { + auto t = dwarfsextract_tester::create_with_image(); + ASSERT_EQ(0, t.run({"-i", "image.dwarfs", "-f", "mtree", "**/*.py", "*.txt"})) + << t.err(); + auto out = t.out(); + EXPECT_TRUE(out.starts_with("#mtree")) << out; + auto mtree = test::parse_mtree(out); + ASSERT_EQ(3, mtree.size()); + EXPECT_EQ("./ipsum.txt", mtree[0].first); + EXPECT_EQ("./somedir", mtree[1].first); + EXPECT_EQ("./somedir/ipsum.py", mtree[2].first); +} + TEST(dwarfsextract_test, stdout_progress_error) { auto t = dwarfsextract_tester::create_with_image(); EXPECT_NE(0, diff --git a/tools/src/dwarfsextract_main.cpp b/tools/src/dwarfsextract_main.cpp index e830efe1a..6fc4dd1db 100644 --- a/tools/src/dwarfsextract_main.cpp +++ b/tools/src/dwarfsextract_main.cpp @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -77,6 +78,9 @@ int dwarfsextract_main(int argc, sys_char** argv, iolayer const& iol) { ("output,o", po_sys_value(&output), "output file or directory") + ("pattern", + po::value>(), + "only extract files matching these patterns") ("image-offset,O", po::value(&image_offset)->default_value("auto"), "filesystem image offset in bytes") @@ -111,10 +115,17 @@ int dwarfsextract_main(int argc, sys_char** argv, iolayer const& iol) { tool::add_common_options(opts, logopts); + po::positional_options_description pos; + pos.add("pattern", -1); + po::variables_map vm; try { - po::store(po::parse_command_line(argc, argv, opts), vm); + po::store(po::basic_command_line_parser(argc, argv) + .options(opts) + .positional(pos) + .run(), + vm); po::notify(vm); } catch (po::error const& e) { iol.err << "error: " << e.what() << "\n"; @@ -141,6 +152,13 @@ int dwarfsextract_main(int argc, sys_char** argv, iolayer const& iol) { return 0; } + std::unique_ptr matcher; + + if (vm.count("pattern")) { + matcher = std::make_unique( + vm["pattern"].as>()); + } + int rv = 0; try { @@ -214,7 +232,7 @@ int dwarfsextract_main(int argc, sys_char** argv, iolayer const& iol) { }; } - rv = fsx.extract(fs, fsx_opts) ? 0 : 2; + rv = fsx.extract(fs, matcher.get(), fsx_opts) ? 0 : 2; fsx.close();