bitextor · ZJaume · Jan 11, 2024 · Jan 11, 2024
diff --git a/src/util.cc b/src/util.cc
@@ -1,6 +1,7 @@
 #include "util.hh"
 #include <fstream>
 #include <sstream>
+#include <iostream>
 #include <algorithm>
 #include <vector>
 #include <boost/filesystem.hpp>
@@ -10,6 +11,8 @@
 #include <boost/algorithm/string/split.hpp>
 #include <boost/algorithm/string/predicate.hpp>
 #include <boost/algorithm/string/classification.hpp>
+#include <boost/iostreams/filtering_stream.hpp>
+#include <boost/iostreams/filter/zlib.hpp>
 #include <boost/locale.hpp>
 #include <boost/log/trivial.hpp>
 #include <uchardet/uchardet.h>
@@ -129,6 +132,24 @@ namespace util {
         f.close();
     }
 
+    // Tell whether `filename` starts with the gzip magic number (0x1F 0x8B).
+    // Unopenable files are logged and reported as not compressed. Approach from
+    // https://stackoverflow.com/questions/37822645/c-read-and-compare-magic-number-from-file
+    bool isCompressedFile(const std::string &filename) {
+        std::ifstream in(filename, std::ios::binary);
+        if (!in.is_open()) {
+            BOOST_LOG_TRIVIAL(error) << "Could not open file '" << filename << "'";
+            return false;
+        }
+
+        // Zero-filled so that a short read can never look like the signature.
+        unsigned char header[2] = {0, 0};
+        in.read(reinterpret_cast<char*>(header), sizeof(header));
+
+        // Both leading bytes must match the gzip signature.
+        return header[0] == 0x1F && header[1] == 0x8B;
+    }
+
     void readUrlFiltersRegex(const std::string &filename, boost::regex &urlFilter) {
         std::ifstream f(filename);
         std::string line;
@@ -154,6 +175,29 @@ namespace util {
         urlFilter.assign(combined.str(), boost::regex::optimize | boost::regex::nosubs);
     }
 
+    // Load the domain list from the gzip-compressed text file `filename` into
+    // `domainFilter`, one domain per line. Blank lines and lines starting with
+    // '#' are skipped. Aborts the process if the file lacks the gzip magic number.
+    void readDomainFilters(const std::string &filename, std::unordered_set<std::string> &domainFilter) {
+        // Refuse anything without the gzip magic number before trying to inflate it.
+        if (!isCompressedFile(filename)) {
+            BOOST_LOG_TRIVIAL(error) << "Domain list file not gzip compressed '" << filename << "'";
+            abort();
+        }
+
+        std::ifstream f(filename, std::ios_base::in | std::ios_base::binary);
+        boost::iostreams::filtering_stream<boost::iostreams::input> in;
+        // zlib's default window bits (15) accept only the zlib wrapper (RFC 1950);
+        // adding 16 makes inflate expect the gzip wrapper (RFC 1952) checked above.
+        in.push(boost::iostreams::zlib_decompressor(boost::iostreams::zlib::default_window_bits + 16));
+        in.push(f);
+        for (std::string line; std::getline(in, line); ) {
+            if (boost::algorithm::all(line, boost::algorithm::is_space()) || boost::algorithm::starts_with(line, "#"))
+                continue;
+            domainFilter.insert(line);
+        }
+    }
+
     bool createDirectories(const std::string& path){
         if (!boost::filesystem::exists(path))
             return boost::filesystem::create_directories(path);

diff --git a/src/util.hh b/src/util.hh
@@ -19,6 +19,8 @@ namespace util {
     void trimLines(std::string& text);
     void trimLinesCopy(const std::string& original, std::string& result);
 
+    bool isCompressedFile(const std::string &filename); // true if the file starts with the gzip magic number 0x1F 0x8B
+
     // detect charset using uchardet
     bool detectCharset(const std::string& text, std::string& charset, const std::string& original_charset = "");
     // convert to utf8
@@ -57,6 +59,8 @@ namespace util {
 
     void readUrlFiltersRegex(const std::string &filename, boost::regex &urlFilter);
 
+    void readDomainFilters(const std::string &filename, std::unordered_set<std::string> &domainFilter); // one domain per line; blank and '#' lines skipped
+
     bool createDirectories(const std::string& path);
 
     std::vector<std::string> split(const std::string& s, const std::string& delimiter);

diff --git a/src/warcpreprocessor.cc b/src/warcpreprocessor.cc
@@ -62,6 +62,7 @@ namespace {
 namespace warc2text {
     const std::unordered_set<std::string> WARCPreprocessor::removeExtensions = {".jpg", ".jpeg", ".gif", ".png", ".css", ".js", ".mp3",
                                                                                 ".mp4", ".flv", ".wmv", ".gz", ".zip", ".rar" };
+    const boost::regex WARCPreprocessor::domainExtractor("^(https?:\\/\\/)?(www\\.)?([^:\\/]+)(.*)", boost::regex::extended|boost::regex::icase); // group 3 is the domain that filterDomain looks up
 
     WARCPreprocessor::WARCPreprocessor(const LanguageDetector &detector, WARCPreprocessorOptions const &options) :
         detector(detector),
@@ -80,6 +81,9 @@ namespace warc2text {
             if (!options.url_filters_filename.empty())
                 util::readUrlFiltersRegex(options.url_filters_filename, urlFilter);
 
+            if (!options.domain_filters_filename.empty())
+                util::readDomainFilters(options.domain_filters_filename, domainFilter);
+
             if (!options.pdf_warc_filename.empty())
                 pdf_warc_writer.open(options.pdf_warc_filename);
 
@@ -101,6 +105,19 @@ namespace warc2text {
         return true;
     }
 
+    // True if the URL may be kept; false if its domain (domainExtractor group 3) is in domainFilter
+    bool WARCPreprocessor::filterDomain(const std::string& url) const {
+        // No domains loaded: accept without running the regex on every record
+        if (domainFilter.empty())
+            return true;
+        const std::string domain = boost::regex_replace(url, domainExtractor, "$3");
+        BOOST_LOG_TRIVIAL(trace) << "Domain extracted '" << domain << "'";
+        if (domainFilter.find(domain) == domainFilter.end())
+            return true;
+        BOOST_LOG_TRIVIAL(trace) << "Domain filter matched '" << url << "'";
+        return false;
+    }
+
     void WARCPreprocessor::process(const std::string& filename) {
         BOOST_LOG_TRIVIAL(info) << "Processing " << filename;
         WARCReader reader(filename);
@@ -145,6 +162,9 @@ namespace warc2text {
             if (!URLfilter(record.getURL()))
                 continue;
 
+            if (!filterDomain(record.getURL()))
+                continue;
+
             if (options.encodeURLs)
                 record.encodeURL();
 

diff --git a/src/warcpreprocessor.hh b/src/warcpreprocessor.hh
@@ -37,6 +37,7 @@ namespace warc2text {
         bool tag_filters_invert{};
 
         std::string url_filters_filename;
+        std::string domain_filters_filename;
 
         bool multilang{};
         bool encodeURLs{};
@@ -57,9 +58,12 @@ namespace warc2text {
             unsigned int langBytes;
             util::umap_tag_filters_regex tagFilters;
             boost::regex urlFilter;
-
+            std::unordered_set<std::string> domainFilter;
+
             static const std::unordered_set<std::string> removeExtensions;
+            static const boost::regex domainExtractor;
             bool URLfilter(const std::string& url) const;
+            bool filterDomain(const std::string& url) const;
 
         public:
             explicit WARCPreprocessor(LanguageDetector const &detector, WARCPreprocessorOptions const &options);

diff --git a/warc2text_main.cc b/warc2text_main.cc
@@ -33,6 +33,7 @@ void parseArgs(int argc, char *argv[], Options& out) {
         ("tag-filters", po::value(&out.tag_filters_filename), "Plain text file containing tag filters")
         ("invert-tag-filters", po::bool_switch(&out.tag_filters_invert)->default_value(false), "Invert tag filter application")
         ("url-filters", po::value(&out.url_filters_filename), "Plain text file containing url filters")
+        ("domain-filters", po::value(&out.domain_filters_filename), "Gzip compressed text file containing domain filters")
         ("pdfpass", po::value(&out.pdf_warc_filename), "Write PDF records to WARC")
         ("robotspass", po::value(&out.robots_warc_filename), "Write robots.txt records to WARC")
         ("paragraph-identification", po::bool_switch(&out.paragraph_identification)->default_value(false), "Add paragraph index in each b64encoded document as tab separated column")
@@ -65,6 +66,8 @@ void parseArgs(int argc, char *argv[], Options& out) {
                 " --invert-tag-filters             Only output records that got filtered\n"
                 " --url-filters <filters_file>     File containing url filters\n"
                 "                                  Format: \"regexp\"\n"
+                " --domain-filters <filters_file>  File containing domain filters\n"
+                "                                  Format: each line containing a domain name\n"
                 " --pdfpass <output_warc>          Write PDF records to <output_warc>\n"
                 " --robotspass <output_warc>       Write Robots.txt records to <output_warc>\n"
                 " --encode-urls                    Encode URLs obtained from WARC records\n"