diff --git a/CMakeLists.txt b/CMakeLists.txt index 2d235b5..23eca7f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -61,6 +61,16 @@ target_link_libraries(warc2text fasttext-static ) +include(CTest) + +foreach(test html text) + add_executable(${test}_test src/${test}_test.cc) + target_link_libraries(${test}_test + warc2text_lib + ${Boost_LIBRARIES}) + add_test(NAME ${test}_test COMMAND $) +endforeach() + include(GNUInstallDirs) install(TARGETS cld2_full warc2text diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 6db72e0..a494db4 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -33,6 +33,7 @@ add_library(warc2text_lib xh_scanner.cc entities.cc zipreader.cc + text.cc ) diff --git a/src/bilangwriter.cc b/src/bilangwriter.cc index 5688000..2571591 100644 --- a/src/bilangwriter.cc +++ b/src/bilangwriter.cc @@ -129,7 +129,7 @@ namespace warc2text{ void BilangWriter::write(const Record& record, bool paragraph_identification) { for (const auto& it : record.getTextByLangs()) { - std::string chunk = it.second; + std::string chunk = it.second.text; if (paragraph_identification) chunk = get_paragraph_id(chunk); @@ -147,12 +147,13 @@ namespace warc2text{ {"o", boost::json::value(record.getOffset())}, {"s", boost::json::value(record.getSize())}, {"rs", boost::json::value(record.getPayload().size())}, - {"ps", boost::json::value(chunk.second.size())}, + {"ps", boost::json::value(chunk.second.text.size())}, {"l", boost::json::string(chunk.first)}, {"u", boost::json::string(record.getURL())}, {"c", boost::json::string(record.getHTTPcontentType())}, {"ts", boost::json::string(record.getWARCdate())}, - {"p", boost::json::string(chunk.second)}, + {"p", boost::json::string(chunk.second.text)}, + {"pt", boost::json::value_from(chunk.second.tags)}, } << "\n"; } } diff --git a/src/html.cc b/src/html.cc index bc329e9..9aa5786 100644 --- a/src/html.cc +++ b/src/html.cc @@ -4,6 +4,7 @@ #include #include "util.hh" #include "html.hh" +#include "entities.hh" #include 
"xh_scanner.hh" namespace warc2text { @@ -25,28 +26,27 @@ namespace warc2text { return true; } - void addNewLine(std::string& plaintext) { - if (std::isspace(plaintext.back())) { - plaintext.back() = '\n'; - } else if (!plaintext.empty()) { - plaintext.push_back('\n'); - } - } - void addSpace(std::string& plaintext) { - if (!plaintext.empty() && !std::isspace(plaintext.back())) { + if (!plaintext.empty() && !std::isspace(static_cast(plaintext.back()))) { plaintext.push_back(' '); } } - int processHTML(const std::string& html, std::string& plaintext, const util::umap_tag_filters_regex& tagFilters){ - plaintext = ""; + bool isWhitespace(std::string const &str) { + return std::all_of(str.begin(), str.end(), [](unsigned char c){ return std::isspace(c); }); + } + + int processHTML(const std::string& html, AnnotatedText& plaintext, const util::umap_tag_filters_regex& tagFilters){ + plaintext.clear(); + markup::instream si(html.c_str()); markup::scanner sc(si); int t = markup::scanner::TT_SPACE; // just start somewhere that isn't ERROR or EOF int retval = util::SUCCESS; std::string tag; + std::string paragraph; + std::string plain; while (t != markup::scanner::TT_EOF and t != markup::scanner::TT_ERROR) { t = sc.get_token(); @@ -60,17 +60,22 @@ namespace warc2text { // sc.get_tag_name() only changes value after a new tag is found tag = util::toLowerCopy(sc.get_tag_name()); // found block tag: previous block has ended - if (html::isBlockTag(tag)) addNewLine(plaintext); + if (html::isBlockTag(tag) && !isWhitespace(paragraph)) { + // TODO: add this directly to the scanner? 
+ entities::decodeEntities(paragraph, plain); + plaintext.push_back(plain, tag); + paragraph.clear(); // reset for next paragraph + } // found void tag, like or - if (html::isVoidTag(tag)) addSpace(plaintext); + if (html::isVoidTag(tag)) addSpace(paragraph); break; case markup::scanner::TT_WORD: // if the tag is in noText list, don't save the text if (html::isNoTextTag(tag)) break; - plaintext.append(sc.get_value()); + paragraph.append(sc.get_value()); break; case markup::scanner::TT_SPACE: - addSpace(plaintext); + addSpace(paragraph); break; case markup::scanner::TT_ATTR: if (!filter(tag, sc.get_attr_name(), sc.get_value(), tagFilters)) @@ -80,7 +85,12 @@ namespace warc2text { break; } } - if (plaintext.back() != '\n') plaintext.push_back('\n'); + + if (!isWhitespace(paragraph)) { + entities::decodeEntities(paragraph, plain); + plaintext.push_back(plain, ""); + } + return retval; } diff --git a/src/html.hh b/src/html.hh index 583dc31..2f1994d 100644 --- a/src/html.hh +++ b/src/html.hh @@ -2,9 +2,11 @@ #define WARC2TEXT_HTML_HH #include +#include "text.hh" +#include "util.hh" namespace warc2text { - int processHTML(const std::string& html, std::string& text, const util::umap_tag_filters_regex& tagFilters); + int processHTML(const std::string& html, AnnotatedText &text, const util::umap_tag_filters_regex& tagFilters); } #endif diff --git a/src/html_test.cc b/src/html_test.cc new file mode 100644 index 0000000..4705312 --- /dev/null +++ b/src/html_test.cc @@ -0,0 +1,122 @@ +#include "html.hh" + +#define BOOST_TEST_MODULE HTMLTest +#include + +namespace warc2text { +namespace { + + +BOOST_AUTO_TEST_CASE(CleanHTML) { + std::string html( + "\n" + "\n" + " \n" + " Well-formed web page!\n" + " \n" + " \n" + "

This is a paragraph.

\n" + "

\n" + " This is <one>,\n" + " indented as written by Ken,\n" + " with a newline.\n" + "

\n" + " \n" + ""); + + std::string expected( + "Well-formed web page!\n" + "This is a paragraph.\n" + "This is , indented as written by Ken, with a newline.\n" + ); + + std::vector tags{"title", "p", "p"}; + + AnnotatedText out; + auto retval = processHTML(html, out, {}); + + BOOST_CHECK_EQUAL(retval, util::SUCCESS); + BOOST_CHECK_EQUAL(out.text, expected); + BOOST_CHECK_EQUAL_COLLECTIONS(out.tags.begin(), out.tags.end(), tags.begin(), tags.end()); +} + +BOOST_AUTO_TEST_CASE(TagsIdentifiers) { + // breaks because we don't have a stack, so after

but before

no idea + // we're inside

. + std::string html( + "
\n" + "

Text

\n" + " not block text\n" + "

Paragraph

\n" + " Inside div\n" + "
" + ); + + std::string expected( + "Text\n" + "not block text\n" + "Paragraph\n" + "Inside div\n" + ); + + std::vector tags{"p", "div", "p", "div"}; + + AnnotatedText out; + auto retval = processHTML(html, out, {}); + + BOOST_CHECK_EQUAL(retval, util::SUCCESS); + BOOST_CHECK_EQUAL(out.text, expected); + BOOST_CHECK_EQUAL_COLLECTIONS(out.tags.begin(), out.tags.end(), tags.begin(), tags.end()); +} + +BOOST_AUTO_TEST_CASE(PreTagNotSupported) { + // We don't support keeping the formatting in
 tags.
+	std::string html(
+		"
 This line\n"
+		"should keep its newlines\n"
+		"ideally.
"); + + std::string expected("This line should keep its newlines ideally.\n"); + + std::vector tags{"pre"}; + + AnnotatedText out; + auto retval = processHTML(html, out, {}); + + BOOST_CHECK_EQUAL(retval, util::SUCCESS); + BOOST_CHECK_EQUAL(out.text, expected); + BOOST_CHECK_EQUAL_COLLECTIONS(out.tags.begin(), out.tags.end(), tags.begin(), tags.end()); +} + +BOOST_AUTO_TEST_CASE(BlockTags) { + std::string html("Alpha
Beta

Gamma

Delta Epsilon Zeta"); + std::string expected("Alpha\nBeta\nGamma\nDelta Epsilon Zeta\n"); + AnnotatedText out; + auto retval = processHTML(html, out, {}); + BOOST_CHECK_EQUAL(retval, util::SUCCESS); + BOOST_CHECK_EQUAL(out.text, expected); +} + +BOOST_AUTO_TEST_CASE(VoidTags) { + std::string html("Voidtagsshouldaddbeepspaces"); + std::string expected("Void tags should add beep spaces\n"); + AnnotatedText out; + auto retval = processHTML(html, out, {}); + BOOST_CHECK_EQUAL(retval, util::SUCCESS); + BOOST_CHECK_EQUAL(out.text, expected); +} + +BOOST_AUTO_TEST_CASE(ScriptTags, *boost::unit_test::disabled()) { + // This fails since we don't keep a stack, so ` is ignored` will still + // have tag_name == "script" and be ignored :facepalm: + std::string html("Text inside is ignored"); + std::string expected("Test inside is ignored\n"); + AnnotatedText out; + auto retval = processHTML(html, out, {}); + BOOST_CHECK_EQUAL(retval, util::SUCCESS); + BOOST_CHECK_EQUAL(out.text, expected); +} + +} +} + diff --git a/src/lang.hh b/src/lang.hh index 3552892..7b135dd 100644 --- a/src/lang.hh +++ b/src/lang.hh @@ -4,6 +4,7 @@ #include #include #include +#include "text.hh" namespace fasttext { class FastText; @@ -15,8 +16,8 @@ class LanguageDetector { public: virtual ~LanguageDetector() {}; - // detect language of plain text, return top languages - virtual void detect(const std::string& text, std::unordered_map& chunks) const = 0; + // detect language of plain text, return top languages, consumes the text. 
+ virtual void detect(AnnotatedText &&text, std::unordered_map& chunks) const = 0; // Label used for text (chunks) that cannot reliably be identified static const std::string kUnknownLanguageLabel; @@ -26,7 +27,7 @@ class FastTextDetector : public LanguageDetector { public: explicit FastTextDetector(const std::string &filename); virtual ~FastTextDetector(); - virtual void detect(const std::string& text, std::unordered_map& chunks) const; + virtual void detect(AnnotatedText &&text, std::unordered_map& chunks) const; private: std::unique_ptr classifier_; @@ -34,13 +35,13 @@ class FastTextDetector : public LanguageDetector { class CLD2Detector : public LanguageDetector { public: - virtual void detect(const std::string& text, std::unordered_map& chunks) const; + virtual void detect(AnnotatedText &&text, std::unordered_map& chunks) const; virtual ~CLD2Detector(); }; class CLD2MultiLangDetector : public LanguageDetector { public: - virtual void detect(const std::string& text, std::unordered_map& chunks) const; + virtual void detect(AnnotatedText &&text, std::unordered_map& chunks) const; virtual ~CLD2MultiLangDetector(); }; diff --git a/src/lang_cld2.cc b/src/lang_cld2.cc index 30e8cd9..f241f7c 100644 --- a/src/lang_cld2.cc +++ b/src/lang_cld2.cc @@ -8,17 +8,17 @@ namespace warc2text { CLD2Detector::~CLD2Detector() {} - void CLD2Detector::detect(const std::string& text, std::unordered_map& text_by_lang) const { + void CLD2Detector::detect(AnnotatedText &&text, std::unordered_map& text_by_lang) const { bool reliable = false; int valid_prefix_bytes = 0; - CLD2::Language l = CLD2::DetectLanguageCheckUTF8(text.data(), text.size(), true, &reliable, &valid_prefix_bytes); - text_by_lang[reliable ? CLD2::LanguageCode(l) : kUnknownLanguageLabel] = text; + CLD2::Language l = CLD2::DetectLanguageCheckUTF8(text.text.data(), text.text.size(), true, &reliable, &valid_prefix_bytes); + text_by_lang[reliable ? 
CLD2::LanguageCode(l) : kUnknownLanguageLabel] = std::move(text); } CLD2MultiLangDetector::~CLD2MultiLangDetector() {} - void CLD2MultiLangDetector::detect(const std::string& text, std::unordered_map& text_by_lang) const { - CLD2::Language langs[3] = {CLD2::UNKNOWN_LANGUAGE, CLD2::UNKNOWN_LANGUAGE, CLD2::UNKNOWN_LANGUAGE}; + void CLD2MultiLangDetector::detect(AnnotatedText &&text, std::unordered_map& text_by_lang) const { + CLD2::Language langs[] = {CLD2::UNKNOWN_LANGUAGE, CLD2::UNKNOWN_LANGUAGE, CLD2::UNKNOWN_LANGUAGE}; int percents[3] = {0,0,0}; double scores[3] = {0.0, 0.0, 0.0}; @@ -28,47 +28,39 @@ namespace warc2text { CLD2::ResultChunkVector chunks; - CLD2::ExtDetectLanguageSummaryCheckUTF8(text.data(), text.size(), true, &NO_HINT, 0, &langs[0], &percents[0], &scores[0], &chunks, &text_bytes, &reliable, &valid_prefix_bytes); + CLD2::ExtDetectLanguageSummaryCheckUTF8(text.text.data(), text.text.size(), true, &NO_HINT, 0, &langs[0], &percents[0], &scores[0], &chunks, &text_bytes, &reliable, &valid_prefix_bytes); text_by_lang.clear(); - if (not reliable) { - text_by_lang[kUnknownLanguageLabel] = text; + if (!reliable) { + text_by_lang[kUnknownLanguageLabel] = std::move(text); return; } - std::string* top1 = nullptr; - std::string* top2 = nullptr; - std::string* top3 = nullptr; + const char* mapping[] = {nullptr, nullptr, nullptr}; - if (langs[0] != CLD2::UNKNOWN_LANGUAGE and percents[0] > 0) { - top1 = &text_by_lang[CLD2::LanguageCode(langs[0])]; - top1->reserve(text.size() * (percents[0] + 1)); - } - - if (langs[1] != CLD2::UNKNOWN_LANGUAGE and percents[1] > 0) { - top2 = &text_by_lang[CLD2::LanguageCode(langs[1])]; - top2->reserve(text.size() * (percents[1] + 1)); - } - - if (langs[2] != CLD2::UNKNOWN_LANGUAGE and percents[2] > 0) { - top3 = &text_by_lang[CLD2::LanguageCode(langs[2])]; - top3->reserve(text.size() * (percents[2] + 1)); + for (size_t i = 0; i < 3; ++i) { + if (langs[i] != CLD2::UNKNOWN_LANGUAGE && percents[i] > 0) + mapping[i] = 
CLD2::LanguageCode(langs[i]); } for (const CLD2::ResultChunk& chunk : chunks) { - std::string* ref = static_cast(chunk.lang1) == langs[0] ? top1 : - static_cast(chunk.lang1) == langs[1] ? top2 : - static_cast(chunk.lang1) == langs[2] ? top3 : nullptr; - if (ref == nullptr) continue; - ref->append(text, chunk.offset, chunk.bytes); - } - - // remove empty texts from text_by_lang - // apparently it is possible that the reported percentage is > 0, but the language does not appear in chunks - for (auto it = text_by_lang.cbegin(); it != text_by_lang.cend(); ){ - if (it->second.size() == 0) text_by_lang.erase(it++); - else ++it; + if (chunk.bytes == 0) // TODO: can this even happen? + continue; + + // Which of the top 3 languages is this chunk in? + std::size_t i = 0; + for (; i < 3; ++i) { + if (static_cast(chunk.lang1) == langs[i]) + break; + } + + // Chunk is not in top 3, or its slot never got a label (unknown language or 0 percent): skip it + if (i == 3 || mapping[i] == nullptr) + continue; + + // Chunk is in top 3, append it to that AnnotatedText + text_by_lang[mapping[i]].append(text, chunk.offset, chunk.bytes); } // TODO: do something with the scores? 
diff --git a/src/lang_fasttext.cc b/src/lang_fasttext.cc index 0f6f955..a48065a 100644 --- a/src/lang_fasttext.cc +++ b/src/lang_fasttext.cc @@ -17,14 +17,14 @@ FastTextDetector::~FastTextDetector() {} const char kLabelPrefix[] = "__label__"; -void FastTextDetector::detect(const std::string& text, std::unordered_map& chunks) const { +void FastTextDetector::detect(AnnotatedText &&text, std::unordered_map& chunks) const { const float kThreshold = 0.5f; std::vector words, labels; - classifier_->getDictionary()->getStringNoNewline(text, words, labels); + classifier_->getDictionary()->getStringNoNewline(text.text, words, labels); fasttext::Predictions predictions; classifier_->predict(1, words, predictions, kThreshold); if (predictions.empty()) { - chunks[kUnknownLanguageLabel] = text; + chunks[kUnknownLanguageLabel] = std::move(text); return; } @@ -34,7 +34,7 @@ void FastTextDetector::detect(const std::string& text, std::unordered_map& Record::getTextByLangs() const { + const std::unordered_map& Record::getTextByLangs() const { return text_by_langs; } int Record::detectLanguage(LanguageDetector const &detector){ - detector.detect(plaintext, text_by_langs); + detector.detect(std::move(plaintext), text_by_langs); return text_by_langs.size(); } @@ -280,8 +277,8 @@ namespace warc2text { return payload; } - const std::string& Record::getPlainText() const { - return plaintext; + std::size_t Record::getPlainTextSize() const { + return plaintext.text.size(); } const std::string& Record::getURL() const { diff --git a/src/record.hh b/src/record.hh index 00069e7..5a20d9b 100644 --- a/src/record.hh +++ b/src/record.hh @@ -10,6 +10,7 @@ #include #include "util.hh" #include "lang.hh" +#include "text.hh" namespace warc2text { class Record { @@ -22,7 +23,6 @@ namespace warc2text { bool HTTPheaderExists(const std::string& property) const; const std::string& getPayload() const; - const std::string& getPlainText() const; const std::string& getURL() const; const std::string& 
getRecordType() const; const std::string& getWARCcontentType() const; @@ -44,7 +44,8 @@ namespace warc2text { return offset; } - const std::unordered_map& getTextByLangs() const; + std::size_t getPlainTextSize() const; + const std::unordered_map& getTextByLangs() const; int cleanPayload(); int cleanPayload(const util::umap_tag_filters_regex& tagFilters); @@ -63,10 +64,9 @@ namespace warc2text { std::unordered_map header; std::unordered_map HTTPheader; std::string payload; - std::string plaintext; - std::string language; - - std::unordered_map text_by_langs; + + AnnotatedText plaintext; // might be empty after langid + std::unordered_map text_by_langs; // these are present in the headers, but it's convenient to have them apart also std::string recordType; diff --git a/src/text.cc b/src/text.cc new file mode 100644 index 0000000..5114397 --- /dev/null +++ b/src/text.cc @@ -0,0 +1,86 @@ +#include "text.hh" +#include + +namespace { + std::size_t count(char needle, std::string const &haystack, std::size_t offset = 0, std::size_t count = std::string::npos) { + std::size_t hits = 0; + + size_t end; + + if (count == std::string::npos || offset + count > haystack.size()) + end = haystack.size(); + else + end = offset + count; + + while (true) { + std::size_t hit = haystack.find(needle, offset); + if (hit >= end) + break; + + ++hits; + offset = hit + 1; + } + + return hits; + } + + template + void append(std::vector &dest, std::vector const &source, std::size_t offset, std::size_t count) { + dest.reserve(dest.size() + count); + for (std::size_t i = 0; i < count; ++i) + dest.emplace_back(source[offset + i]); + } +} + +namespace warc2text { + +AnnotatedText AnnotatedText::substr(std::size_t offset, std::size_t length) const { + return AnnotatedText().append(*this, offset, length); +} + +AnnotatedText &AnnotatedText::append(AnnotatedText const &other, std::size_t offset, std::size_t length) { + std::size_t tag_offset = ::count('\n', other.text, 0, offset); + std::size_t 
tag_length = ::count('\n', other.text, offset, length); + + // When the current text does not end with a newline, we skip copying the + // first tag of `other` because that line will be added to the current line + // that already has a tag. + if (!text.empty() && text.back() != '\n') { + tag_offset += 1; + tag_length -= 1; + } + + text.append(other.text, offset, length); + ::append(tags, other.tags, tag_offset, tag_length); + return *this; +} + +void AnnotatedText::push_back(std::string const &chunk, std::string const &tag) { + if (chunk.empty()) + return; + + std::size_t lines = 0; + + if (std::isspace(static_cast(chunk.back()))) { + text.append(chunk, 0, chunk.size() - 1); + text.push_back('\n'); + lines = count('\n', chunk, 0, chunk.size() - 1) + 1; + } else { + text.reserve(text.size() + chunk.size() + 1); + text += chunk; + text.push_back('\n'); + lines = count('\n', chunk) + 1; + } + + assert(lines >= 1); + + for (std::size_t i = 0; i < lines; ++i) + tags.push_back(tag); +} + +void AnnotatedText::clear() { + text.clear(); + tags.clear(); +} + +}; diff --git a/src/text.hh b/src/text.hh new file mode 100644 index 0000000..6d4cf0a --- /dev/null +++ b/src/text.hh @@ -0,0 +1,22 @@ +#pragma once + +#include +#include + +namespace warc2text { + class AnnotatedText { + public: + std::string text; + std::vector tags; + + // Extract a bit of text + tags from this text + AnnotatedText substr(std::size_t offset, std::size_t count) const; + + AnnotatedText &append(AnnotatedText const &other, std::size_t offset, std::size_t count); + + // Append a block of text (and tag) at the end + void push_back(std::string const &text, std::string const &tag); + + void clear(); + }; +}; diff --git a/src/text_test.cc b/src/text_test.cc new file mode 100644 index 0000000..3612bfa --- /dev/null +++ b/src/text_test.cc @@ -0,0 +1,68 @@ +#include "text.hh" + +#define BOOST_TEST_MODULE HTMLTest +#include + +namespace warc2text { +namespace { + + +BOOST_AUTO_TEST_CASE(TextPushBack) { + 
AnnotatedText input; + + input.push_back("This is a sentence", "p"); + + input.push_back("This is a sentence\nsplit over two lines", "li"); + + std::string expected( + "This is a sentence\n" + "This is a sentence\n" + "split over two lines\n" + ); + + std::vector tags{"p", "li", "li"}; + + BOOST_CHECK_EQUAL(input.text, expected); + BOOST_CHECK_EQUAL_COLLECTIONS(input.tags.begin(), input.tags.end(), tags.begin(), tags.end()); +} + +BOOST_AUTO_TEST_CASE(TextSubstr) { + AnnotatedText input; + input.push_back("This is a sentence", "p"); + input.push_back("This is a sentence\nsplit over two lines", "li"); + + std::string expected( + "This is a sentence\n" + "This is a sent" + ); + + std::vector tags{"p", "li"}; // two lines, so two tags + + AnnotatedText out = input.substr(0, expected.size()); + + BOOST_CHECK_EQUAL(out.text, expected); + BOOST_CHECK_EQUAL_COLLECTIONS(out.tags.begin(), out.tags.end(), tags.begin(), tags.end()); +} + +BOOST_AUTO_TEST_CASE(TextAppend) { + AnnotatedText input; + input.push_back("This is a sentence", "p"); + input.push_back("This is a sentence\nsplit over two lines", "li"); + + std::string expected( + "This is a sentence over two lines\n" + ); + + std::vector tags{"p"}; // single line, take existing tag + + AnnotatedText out; + out.append(input, 0, 18); // "This is a sentence" + out.append(input, 44, 16); // " over two lines\n" + + BOOST_CHECK_EQUAL(out.text, expected); + BOOST_CHECK_EQUAL_COLLECTIONS(out.tags.begin(), out.tags.end(), tags.begin(), tags.end()); +} + +} +} + diff --git a/src/warcpreprocessor.cc b/src/warcpreprocessor.cc index 47de540..ffe5285 100644 --- a/src/warcpreprocessor.cc +++ b/src/warcpreprocessor.cc @@ -178,13 +178,15 @@ namespace warc2text { continue; } - if (record.getPlainText().empty()) { + std::size_t recordTextBytes = record.getPlainTextSize(); + + if (!recordTextBytes) { BOOST_LOG_TRIVIAL(trace) << "Record " << record.getURL() << ": empty"; continue; } ++textRecords; - textBytes += 
record.getPlainText().size(); + textBytes += recordTextBytes; record.detectLanguage(detector); n_langs = 0; @@ -193,7 +195,7 @@ namespace warc2text { if (chunk.first == LanguageDetector::kUnknownLanguageLabel) continue; - langBytes += chunk.second.size(); + langBytes += chunk.second.text.size(); ++n_langs; }