Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Domain filtering with a user-defined list, non regex based #49

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions src/util.cc
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "util.hh"
#include <fstream>
#include <sstream>
#include <iostream>
#include <algorithm>
#include <vector>
#include <boost/filesystem.hpp>
Expand All @@ -10,6 +11,8 @@
#include <boost/algorithm/string/split.hpp>
#include <boost/algorithm/string/predicate.hpp>
#include <boost/algorithm/string/classification.hpp>
#include <boost/iostreams/filtering_stream.hpp>
#include <boost/iostreams/filter/gzip.hpp>
#include <boost/iostreams/filter/zlib.hpp>
#include <boost/locale.hpp>
#include <boost/log/trivial.hpp>
#include <uchardet/uchardet.h>
Expand Down Expand Up @@ -129,6 +132,24 @@ namespace util {
f.close();
}

// Report whether `filename` begins with the two-byte gzip signature (0x1F 0x8B).
// Logs an error and returns false when the file cannot be opened.
// Approach taken from https://stackoverflow.com/questions/37822645/c-read-and-compare-magic-number-from-file
bool isCompressedFile(const std::string &filename) {
    std::ifstream stream(filename, std::ios::binary);
    if (!stream.is_open()) {
        BOOST_LOG_TRIVIAL(error) << "Could not open file '" << filename << "'";
        return false;
    }

    // Zero-initialised so that a short or failed read cannot match the signature.
    unsigned char header[2] = {0, 0};
    stream.seekg(0, std::ios::beg);
    stream.read(reinterpret_cast<char*>(header), sizeof(header));

    return header[0] == 0x1F && header[1] == 0x8B;
}

void readUrlFiltersRegex(const std::string &filename, boost::regex &urlFilter) {
std::ifstream f(filename);
std::string line;
Expand All @@ -154,6 +175,29 @@ namespace util {
urlFilter.assign(combined.str(), boost::regex::optimize | boost::regex::nosubs);
}

void readDomainFilters(const std::string &filename, std::unordered_set<std::string> &domainFilter) {
// Check if file is compressed
// Seems that boost zlib does not complain if the file is not compressed
// is it really necessary to manually check the magic number? idk
if (!isCompressedFile(filename)) {
BOOST_LOG_TRIVIAL(error) << "Domain list file not gzip compressed '" << filename << "'";
abort();
}

std::ifstream f(filename, std::ios_base::in | std::ios_base::binary);
boost::iostreams::filtering_stream<boost::iostreams::input> in;
in.push(boost::iostreams::zlib_decompressor());
in.push(f);

std::string line;
for (size_t line_i=1; std::getline(in, line); ++line_i) {
if (boost::algorithm::all(line, boost::algorithm::is_space()) || boost::algorithm::starts_with(line, "#"))
continue;
domainFilter.emplace(std::string(line));
}
f.close();
}

bool createDirectories(const std::string& path){
if (!boost::filesystem::exists(path))
return boost::filesystem::create_directories(path);
Expand Down
4 changes: 4 additions & 0 deletions src/util.hh
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ namespace util {
void trimLines(std::string& text);
void trimLinesCopy(const std::string& original, std::string& result);

bool isCompressedFile(const std::string &filename);

// detect charset using uchardet
bool detectCharset(const std::string& text, std::string& charset, const std::string& original_charset = "");
// convert to utf8
Expand Down Expand Up @@ -57,6 +59,8 @@ namespace util {

void readUrlFiltersRegex(const std::string &filename, boost::regex &urlFilter);

void readDomainFilters(const std::string &filename, std::unordered_set<std::string> &domainFilter);

bool createDirectories(const std::string& path);

std::vector<std::string> split(const std::string& s, const std::string& delimiter);
Expand Down
20 changes: 20 additions & 0 deletions src/warcpreprocessor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ namespace {
namespace warc2text {
const std::unordered_set<std::string> WARCPreprocessor::removeExtensions = {".jpg", ".jpeg", ".gif", ".png", ".css", ".js", ".mp3",
".mp4", ".flv", ".wmv", ".gz", ".zip", ".rar" };
// Splits a URL into four capture groups:
//   1: optional "http://" or "https://" scheme
//   2: optional "www." prefix
//   3: the run of characters up to the first ':' or '/' (used by filterDomain as the domain)
//   4: whatever follows
// Compiled with POSIX-extended syntax and case-insensitive matching.
const boost::regex WARCPreprocessor::domainExtractor("^(https?:\\/\\/)?(www\\.)?([^:\\/]+)(.*)", boost::regex::extended|boost::regex::icase);

WARCPreprocessor::WARCPreprocessor(const LanguageDetector &detector, WARCPreprocessorOptions const &options) :
detector(detector),
Expand All @@ -80,6 +81,9 @@ namespace warc2text {
if (!options.url_filters_filename.empty())
util::readUrlFiltersRegex(options.url_filters_filename, urlFilter);

if (!options.domain_filters_filename.empty())
util::readDomainFilters(options.domain_filters_filename, domainFilter);

if (!options.pdf_warc_filename.empty())
pdf_warc_writer.open(options.pdf_warc_filename);

Expand All @@ -101,6 +105,19 @@ namespace warc2text {
return true;
}

// Decide whether a record's URL passes the domain list.
//
// Returns true when the URL should be kept, and false when the domain
// extracted from `url` (capture group 3 of domainExtractor) is present in
// domainFilter. The lookup is an exact, case-sensitive string match.
bool WARCPreprocessor::filterDomain(const std::string& url) const {
    // No list loaded: every URL passes, so skip the per-record regex work.
    if (domainFilter.empty())
        return true;

    const std::string domain = boost::regex_replace(url, domainExtractor, "$3");
    BOOST_LOG_TRIVIAL(trace) << "Domain extracted '" << domain << "'";

    if (domainFilter.find(domain) != domainFilter.end()) {
        BOOST_LOG_TRIVIAL(trace) << "Domain filter matched '" << url << "'";
        return false;
    }

    return true;
}

void WARCPreprocessor::process(const std::string& filename) {
BOOST_LOG_TRIVIAL(info) << "Processing " << filename;
WARCReader reader(filename);
Expand Down Expand Up @@ -145,6 +162,9 @@ namespace warc2text {
if (!URLfilter(record.getURL()))
continue;

if (!filterDomain(record.getURL()))
continue;

if (options.encodeURLs)
record.encodeURL();

Expand Down
6 changes: 5 additions & 1 deletion src/warcpreprocessor.hh
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ namespace warc2text {
bool tag_filters_invert{};

std::string url_filters_filename;
std::string domain_filters_filename;

bool multilang{};
bool encodeURLs{};
Expand All @@ -57,9 +58,12 @@ namespace warc2text {
unsigned int langBytes;
util::umap_tag_filters_regex tagFilters;
boost::regex urlFilter;

std::unordered_set<std::string> domainFilter;

static const std::unordered_set<std::string> removeExtensions;
static const boost::regex domainExtractor;
bool URLfilter(const std::string& url) const;
bool filterDomain(const std::string& url) const;

public:
explicit WARCPreprocessor(LanguageDetector const &detector, WARCPreprocessorOptions const &options);
Expand Down
3 changes: 3 additions & 0 deletions warc2text_main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ void parseArgs(int argc, char *argv[], Options& out) {
("tag-filters", po::value(&out.tag_filters_filename), "Plain text file containing tag filters")
("invert-tag-filters", po::bool_switch(&out.tag_filters_invert)->default_value(false), "Invert tag filter application")
("url-filters", po::value(&out.url_filters_filename), "Plain text file containing url filters")
("domain-filters", po::value(&out.domain_filters_filename), "Gzip compressed text file containing domain filters")
("pdfpass", po::value(&out.pdf_warc_filename), "Write PDF records to WARC")
("robotspass", po::value(&out.robots_warc_filename), "Write robots.txt records to WARC")
("paragraph-identification", po::bool_switch(&out.paragraph_identification)->default_value(false), "Add paragraph index in each b64encoded document as tab separated column")
Expand Down Expand Up @@ -65,6 +66,8 @@ void parseArgs(int argc, char *argv[], Options& out) {
" --invert-tag-filters Only output records that got filtered\n"
" --url-filters <filters_file> File containing url filters\n"
" Format: \"regexp\"\n"
" --domain-filters <filters_file> File containing domain filters\n"
" Format: each line containing a domain name\n"
" --pdfpass <output_warc> Write PDF records to <output_warc>\n"
" --robotspass <output_warc> Write Robots.txt records to <output_warc>\n"
" --encode-urls Encode URLs obtained from WARC records\n"
Expand Down