Skip to content

Commit

Permalink
feat: support extracting only files matching glob patterns
Browse files Browse the repository at this point in the history
  • Loading branch information
mhx committed Nov 16, 2024
1 parent 24bc61c commit 8d6e374
Show file tree
Hide file tree
Showing 5 changed files with 84 additions and 7 deletions.
5 changes: 5 additions & 0 deletions doc/dwarfsextract.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,11 @@ to disk:
case the default is to extract the files to the current directory, or
to write the archive data to stdout.

- `--pattern=`*glob-pattern*:
If specified, only extract entries matching the pattern. Can be specified
multiple times, in which case all entries matching at least one of the
patterns will be extracted. Patterns can also be passed as positional
arguments.

- `-O`, `--image-offset=`*value*|`auto`:
Specify the byte offset at which the filesystem is located in the image.
Use `auto` to detect the offset automatically. This is also the default.
Expand Down
14 changes: 11 additions & 3 deletions include/dwarfs/utility/filesystem_extractor.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@

namespace dwarfs {

class glob_matcher;
class library_dependencies;
class logger;
class os_access;
Expand Down Expand Up @@ -72,7 +73,13 @@ class filesystem_extractor {
bool extract(reader::filesystem_v2 const& fs,
filesystem_extractor_options const& opts =
filesystem_extractor_options()) {
return impl_->extract(fs, opts);
return impl_->extract(fs, nullptr, opts);
}

/// Extract entries of `fs`, optionally restricted by a glob matcher.
///
/// This is a thin forwarder to `impl_->extract()`.
///
/// @param fs      filesystem to extract from
/// @param matcher non-owning pointer to the glob matcher; the extractor
///                implementation in filesystem_extractor.cpp only filters
///                when this pointer is non-null
/// @param opts    extraction options (default-constructed if omitted)
/// @return whatever the implementation's `extract()` returns
bool extract(reader::filesystem_v2 const& fs, glob_matcher const* matcher,
filesystem_extractor_options const& opts =
filesystem_extractor_options()) {
return impl_->extract(fs, matcher, opts);
}

class impl {
Expand All @@ -84,8 +91,9 @@ class filesystem_extractor {
virtual void open_stream(std::ostream& os, std::string const& format) = 0;
virtual void open_disk(std::filesystem::path const& output) = 0;
virtual void close() = 0;
virtual bool extract(reader::filesystem_v2 const& fs,
filesystem_extractor_options const& opts) = 0;
virtual bool
extract(reader::filesystem_v2 const& fs, glob_matcher const* matcher,
filesystem_extractor_options const& opts) = 0;
};

private:
Expand Down
36 changes: 34 additions & 2 deletions src/utility/filesystem_extractor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <memory>
#include <mutex>
#include <thread>
#include <unordered_set>

// This is required to avoid Windows.h being pulled in by libarchive
// and polluting our environment with all sorts of shit.
Expand All @@ -41,6 +42,7 @@

#include <dwarfs/file_stat.h>
#include <dwarfs/fstypes.h>
#include <dwarfs/glob_matcher.h>
#include <dwarfs/library_dependencies.h>
#include <dwarfs/logger.h>
#include <dwarfs/os_access.h>
Expand Down Expand Up @@ -188,7 +190,7 @@ class filesystem_extractor_ final : public filesystem_extractor::impl {
}
}

bool extract(reader::filesystem_v2 const& fs,
bool extract(reader::filesystem_v2 const& fs, glob_matcher const* matcher,
filesystem_extractor_options const& opts) override;

private:
Expand Down Expand Up @@ -248,7 +250,8 @@ class filesystem_extractor_ final : public filesystem_extractor::impl {

template <typename LoggerPolicy>
bool filesystem_extractor_<LoggerPolicy>::extract(
reader::filesystem_v2 const& fs, filesystem_extractor_options const& opts) {
reader::filesystem_v2 const& fs, glob_matcher const* matcher,
filesystem_extractor_options const& opts) {
DWARFS_CHECK(a_, "filesystem not opened");

auto lr = ::archive_entry_linkresolver_new();
Expand Down Expand Up @@ -350,6 +353,18 @@ bool filesystem_extractor_<LoggerPolicy>::extract(
}
};

// Directories that have to be emitted because an entry somewhere beneath
// them matches one of the patterns. Only populated when a matcher is given.
std::unordered_set<std::filesystem::path> matched_dirs;

if (matcher) {
  // Pre-pass over the whole tree: the extraction walk below skips every
  // directory that is not in `matched_dirs`, so for each matching entry we
  // must record *all* of its ancestors, not only the immediate parent.
  // Otherwise a match like `a/b/c.py` would keep `a/b` but drop `a`.
  fs.walk([&](auto entry) {
    auto dir = entry.parent();

    // Entries without a parent (the root) are never matched here.
    if (!dir || !matcher->match(entry.unix_path())) {
      return;
    }

    for (; dir; dir = dir->parent()) {
      if (!matched_dirs.insert(dir->fs_path()).second) {
        // This directory was recorded before, and whenever a directory is
        // recorded its ancestors are recorded too, so we can stop early.
        break;
      }
    }
  });
}

fs.walk_data_order([&](auto entry) {
// TODO: we can surely early abort walk() somehow
if (entry.is_root() || hard_error) {
Expand All @@ -358,6 +373,23 @@ bool filesystem_extractor_<LoggerPolicy>::extract(

auto inode = entry.inode();

// Pattern filtering: directories are kept only if the pre-pass recorded them
// in `matched_dirs`; every other entry is kept only if it matches a pattern.
if (matcher != nullptr) {
  LOG_TRACE << "checking " << entry.unix_path();

  bool const is_dir = inode.is_directory();
  bool const wanted = is_dir ? matched_dirs.contains(entry.fs_path())
                             : matcher->match(entry.unix_path());

  if (!wanted) {
    // nothing to extract for this entry
    LOG_TRACE << (is_dir ? "skipping directory " : "skipping ")
              << entry.fs_path();
    return;
  }
}

auto ae = ::archive_entry_new();
auto stbuf = fs.getattr(inode);

Expand Down
14 changes: 14 additions & 0 deletions test/tool_main_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1983,11 +1983,25 @@ TEST(dwarfsextract_test, mtree) {
auto t = dwarfsextract_tester::create_with_image();
ASSERT_EQ(0, t.run({"-i", "image.dwarfs", "-f", "mtree"})) << t.err();
auto out = t.out();
// No unconditional dump of `out` here: the assertion below already streams
// the full output when it fails, so passing runs stay quiet.
EXPECT_TRUE(out.starts_with("#mtree")) << out;
// The listing must contain at least one directory and one regular file.
EXPECT_THAT(out, ::testing::HasSubstr("type=dir"));
EXPECT_THAT(out, ::testing::HasSubstr("type=file"));
}

// Positional glob patterns must restrict the mtree listing to the matching
// files plus the directory that contains a match.
TEST(dwarfsextract_test, patterns) {
  auto tester = dwarfsextract_tester::create_with_image();

  auto const exit_code =
      tester.run({"-i", "image.dwarfs", "-f", "mtree", "**/*.py", "*.txt"});
  ASSERT_EQ(0, exit_code) << tester.err();

  auto listing = tester.out();
  EXPECT_TRUE(listing.starts_with("#mtree")) << listing;

  // Exactly three entries are expected, in this order.
  auto entries = test::parse_mtree(listing);
  ASSERT_EQ(3, entries.size());
  EXPECT_EQ("./ipsum.txt", entries[0].first);
  EXPECT_EQ("./somedir", entries[1].first);
  EXPECT_EQ("./somedir/ipsum.py", entries[2].first);
}

TEST(dwarfsextract_test, stdout_progress_error) {
auto t = dwarfsextract_tester::create_with_image();
EXPECT_NE(0,
Expand Down
22 changes: 20 additions & 2 deletions tools/src/dwarfsextract_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include <boost/program_options.hpp>

#include <dwarfs/config.h>
#include <dwarfs/glob_matcher.h>
#include <dwarfs/library_dependencies.h>
#include <dwarfs/logger.h>
#include <dwarfs/mmap.h>
Expand Down Expand Up @@ -77,6 +78,9 @@ int dwarfsextract_main(int argc, sys_char** argv, iolayer const& iol) {
("output,o",
po_sys_value<sys_string>(&output),
"output file or directory")
("pattern",
po::value<std::vector<std::string>>(),
"only extract files matching these patterns")
("image-offset,O",
po::value<std::string>(&image_offset)->default_value("auto"),
"filesystem image offset in bytes")
Expand Down Expand Up @@ -111,10 +115,17 @@ int dwarfsextract_main(int argc, sys_char** argv, iolayer const& iol) {

tool::add_common_options(opts, logopts);

po::positional_options_description pos;
pos.add("pattern", -1);

po::variables_map vm;

try {
po::store(po::parse_command_line(argc, argv, opts), vm);
po::store(po::basic_command_line_parser<sys_char>(argc, argv)
.options(opts)
.positional(pos)
.run(),
vm);
po::notify(vm);
} catch (po::error const& e) {
iol.err << "error: " << e.what() << "\n";
Expand All @@ -141,6 +152,13 @@ int dwarfsextract_main(int argc, sys_char** argv, iolayer const& iol) {
return 0;
}

// Stays null when no patterns were given; the extractor receives the raw
// pointer via matcher.get().
std::unique_ptr<glob_matcher> matcher;

if (vm.count("pattern") > 0) {
  auto const& patterns = vm["pattern"].as<std::vector<std::string>>();
  matcher = std::make_unique<glob_matcher>(patterns);
}

int rv = 0;

try {
Expand Down Expand Up @@ -214,7 +232,7 @@ int dwarfsextract_main(int argc, sys_char** argv, iolayer const& iol) {
};
}

rv = fsx.extract(fs, fsx_opts) ? 0 : 2;
rv = fsx.extract(fs, matcher.get(), fsx_opts) ? 0 : 2;

fsx.close();

Expand Down

0 comments on commit 8d6e374

Please sign in to comment.