bitextor · jelmervdl · Nov 7, 2023 · Nov 7, 2023 · Nov 7, 2023 · Nov 7, 2023
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -61,6 +61,20 @@ target_link_libraries(warc2text
     fasttext-static
 )
 
+include(CTest)
+
+# Unit tests: one executable per src/<name>_test.cc, each registered with CTest.
+# include(CTest) defines BUILD_TESTING (ON by default); -DBUILD_TESTING=OFF skips them.
+if(BUILD_TESTING)
+    foreach(test html text)
+        add_executable(${test}_test src/${test}_test.cc)
+        target_link_libraries(${test}_test
+            warc2text_lib
+            ${Boost_LIBRARIES})
+        add_test(NAME ${test}_test COMMAND $<TARGET_FILE:${test}_test>)
+    endforeach()
+endif()
+
 include(GNUInstallDirs)
 
 install(TARGETS cld2_full warc2text

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -33,6 +33,7 @@ add_library(warc2text_lib
     xh_scanner.cc
     entities.cc
     zipreader.cc
+    text.cc
 )
 
 

diff --git a/src/bilangwriter.cc b/src/bilangwriter.cc
@@ -129,7 +129,7 @@ namespace warc2text{
 
     void BilangWriter::write(const Record& record, bool paragraph_identification) {
         for (const auto& it : record.getTextByLangs()) {
-            std::string chunk = it.second;
+            std::string chunk = it.second.text;
 
             if (paragraph_identification)
                 chunk = get_paragraph_id(chunk);
@@ -147,12 +147,13 @@ namespace warc2text{
                  {"o", boost::json::value(record.getOffset())},
                  {"s", boost::json::value(record.getSize())},
                  {"rs", boost::json::value(record.getPayload().size())},
-                 {"ps", boost::json::value(chunk.second.size())},
+                 {"ps", boost::json::value(chunk.second.text.size())},
                  {"l", boost::json::string(chunk.first)},
                  {"u", boost::json::string(record.getURL())},
                  {"c", boost::json::string(record.getHTTPcontentType())},
                  {"ts", boost::json::string(record.getWARCdate())},
-                 {"p", boost::json::string(chunk.second)},
+                 {"p", boost::json::string(chunk.second.text)},
+                 {"pt", boost::json::value_from(chunk.second.tags)},
             } << "\n";
         }
     }

diff --git a/src/html.cc b/src/html.cc
@@ -4,6 +4,7 @@
 #include <boost/log/trivial.hpp>
 #include "util.hh"
 #include "html.hh"
+#include "entities.hh"
 #include "xh_scanner.hh"
 
 namespace warc2text {
@@ -25,28 +26,27 @@ namespace warc2text {
         return true;
     }
 
-    void addNewLine(std::string& plaintext) {
-        if (std::isspace(plaintext.back())) {
-            plaintext.back() = '\n';
-        } else if (!plaintext.empty()) {
-            plaintext.push_back('\n');
-        }
-    }
-
     void addSpace(std::string& plaintext) {
-        if (!plaintext.empty() && !std::isspace(plaintext.back())) {
+        if (!plaintext.empty() && !std::isspace(static_cast<unsigned char>(plaintext.back()))) { // append one space unless empty or already ending in whitespace; the cast avoids undefined behaviour in std::isspace for negative char values
             plaintext.push_back(' ');
         }
     }
 
-    int processHTML(const std::string& html, std::string& plaintext, const util::umap_tag_filters_regex& tagFilters){
-        plaintext = "";
+    bool isWhitespace(std::string const &str) { // true when str is empty or holds nothing but whitespace
+        return std::none_of(str.begin(), str.end(), [](unsigned char ch){ return std::isspace(ch) == 0; });
+    }
+
+    int processHTML(const std::string& html, AnnotatedText& plaintext, const util::umap_tag_filters_regex& tagFilters){
+        plaintext.clear();
+
         markup::instream si(html.c_str());
         markup::scanner sc(si);
 
         int t = markup::scanner::TT_SPACE; // just start somewhere that isn't ERROR or EOF
         int retval = util::SUCCESS;
         std::string tag;
+        std::string paragraph;
+        std::string plain;
 
         while (t != markup::scanner::TT_EOF and t != markup::scanner::TT_ERROR) {
             t = sc.get_token();
@@ -60,17 +60,22 @@ namespace warc2text {
                     // sc.get_tag_name() only changes value after a new tag is found
                     tag = util::toLowerCopy(sc.get_tag_name());
                     // found block tag: previous block has ended
-                    if (html::isBlockTag(tag)) addNewLine(plaintext);
+                    if (html::isBlockTag(tag) && !isWhitespace(paragraph)) {
+                        // TODO: add this directly to the scanner?
+                        entities::decodeEntities(paragraph, plain);
+                        plaintext.push_back(plain, tag);
+                        paragraph.clear(); // reset for next paragraph
+                    }
                     // found void tag, like <img> or <embed>
-                    if (html::isVoidTag(tag)) addSpace(plaintext);
+                    if (html::isVoidTag(tag)) addSpace(paragraph);
                     break;
                 case markup::scanner::TT_WORD:
                     // if the tag is in noText list, don't save the text
                     if (html::isNoTextTag(tag)) break;
-                    plaintext.append(sc.get_value());
+                    paragraph.append(sc.get_value());
                     break;
                 case markup::scanner::TT_SPACE:
-                    addSpace(plaintext);
+                    addSpace(paragraph);
                     break;
                 case markup::scanner::TT_ATTR:
                     if (!filter(tag, sc.get_attr_name(), sc.get_value(), tagFilters))
@@ -80,7 +85,12 @@ namespace warc2text {
                     break;
             }
         }
-        if (plaintext.back() != '\n') plaintext.push_back('\n');
+
+        if (!isWhitespace(paragraph)) {
+            entities::decodeEntities(paragraph, plain);
+            plaintext.push_back(plain, "");
+        }
+
         return retval;
     }
 

diff --git a/src/html.hh b/src/html.hh
@@ -2,9 +2,11 @@
 #define WARC2TEXT_HTML_HH
 
 #include <string>
+#include "text.hh"
+#include "util.hh"
 
 namespace warc2text {
-    int processHTML(const std::string& html, std::string& text, const util::umap_tag_filters_regex& tagFilters);
+    int processHTML(const std::string& html, AnnotatedText &text, const util::umap_tag_filters_regex& tagFilters); // clears text, then appends the paragraphs extracted from html; returns a util:: status such as util::SUCCESS
 }
 
 #endif
diff --git a/src/html_test.cc b/src/html_test.cc
@@ -0,0 +1,122 @@
+#include "html.hh"
+
+#define BOOST_TEST_MODULE HTMLTest
+#include <boost/test/unit_test.hpp>
+
+namespace warc2text {
+namespace {
+
+
+BOOST_AUTO_TEST_CASE(CleanHTML) {
+	std::string html(
+		"<!DOCTYPE html>\n"
+		"<html>\n"
+		"	<head>\n"
+		"		<title>Well-formed web page!</title>\n"
+		"	</head>\n"
+		"	<body>\n"
+		"		<p>This is a paragraph.</p>\n"
+		"		<p>\n"
+		"			This is &lt;one&gt;,\n"
+		"			indented as written by <a href=\"\">Ken</a>,\n"
+		"			with a newline.\n"
+		"		</p>\n"
+		"	</body>\n"
+		"</html>");
+	// Expected: one line per block element; entities decoded; the inline <a> and source line breaks collapse into spaces.
+	std::string expected(
+		"Well-formed web page!\n"
+		"This is a paragraph.\n"
+		"This is <one>, indented as written by Ken, with a newline.\n"
+	);
+	// Expected tag for each output line, in order.
+	std::vector<std::string> tags{"title", "p", "p"};
+
+	AnnotatedText out;
+	auto retval = processHTML(html, out, {});
+
+	BOOST_CHECK_EQUAL(retval, util::SUCCESS);
+	BOOST_CHECK_EQUAL(out.text, expected);
+	BOOST_CHECK_EQUAL_COLLECTIONS(out.tags.begin(), out.tags.end(), tags.begin(), tags.end());
+}
+
+BOOST_AUTO_TEST_CASE(TagsIdentifiers) {
+	// NOTE(review): no tag stack is kept, so text after </p> but before the next <p> may not be
+	// attributed to the enclosing <div>; this case is not disabled, so confirm it passes as written.
+	std::string html(
+		"<div>\n"
+		"  <p>Text</p>\n"
+		"  not block text\n"
+		"  <p>Paragraph</p>\n"
+		"  Inside div\n"
+		"</div>"
+	);
+
+	std::string expected(
+		"Text\n"
+		"not block text\n"
+		"Paragraph\n"
+		"Inside div\n"
+	);
+
+	std::vector<std::string> tags{"p", "div", "p", "div"};
+
+	AnnotatedText out;
+	auto retval = processHTML(html, out, {});
+
+	BOOST_CHECK_EQUAL(retval, util::SUCCESS);
+	BOOST_CHECK_EQUAL(out.text, expected);
+	BOOST_CHECK_EQUAL_COLLECTIONS(out.tags.begin(), out.tags.end(), tags.begin(), tags.end());
+}
+
+BOOST_AUTO_TEST_CASE(PreTagNotSupported) {
+	// We don't support keeping the formatting in <pre> tags.
+	std::string html(
+		"<pre> This line\n"
+		"should keep its newlines\n"
+		"ideally.</pre>");
+	// Pins current behaviour: the newlines inside <pre> come out as single spaces on one line.
+	std::string expected("This line should keep its newlines ideally.\n");
+
+	std::vector<std::string> tags{"pre"};
+
+	AnnotatedText out;
+	auto retval = processHTML(html, out, {});
+
+	BOOST_CHECK_EQUAL(retval, util::SUCCESS);
+	BOOST_CHECK_EQUAL(out.text, expected);
+	BOOST_CHECK_EQUAL_COLLECTIONS(out.tags.begin(), out.tags.end(), tags.begin(), tags.end());
+}
+
+BOOST_AUTO_TEST_CASE(BlockTags) { // <br> and <h1>...</h1> break lines; inline <span> does not
+	std::string html("<body>Alpha <br> Beta <h1> Gamma </h1> Delta <span> Epsilon </span> Zeta</body>");
+	std::string expected("Alpha\nBeta\nGamma\nDelta Epsilon Zeta\n");
+	AnnotatedText out;
+	auto retval = processHTML(html, out, {});
+	BOOST_CHECK_EQUAL(retval, util::SUCCESS);
+	BOOST_CHECK_EQUAL(out.text, expected); // only the text is checked here, not out.tags
+}
+
+BOOST_AUTO_TEST_CASE(VoidTags) { // each <img>/<embed> boundary between words becomes one space
+	std::string html("<body>Void<img>tags<img>should<img>add<embed>beep</embed>spaces</body>");
+	std::string expected("Void tags should add beep spaces\n");
+	AnnotatedText out;
+	auto retval = processHTML(html, out, {});
+	BOOST_CHECK_EQUAL(retval, util::SUCCESS);
+	BOOST_CHECK_EQUAL(out.text, expected); // only the text is checked here, not out.tags
+}
+
+BOOST_AUTO_TEST_CASE(ScriptTags, *boost::unit_test::disabled()) {
+	// Disabled: this fails since we don't keep a tag stack, so ` is ignored` will
+	// still have tag_name == "script" and be dropped.
+	std::string html("<body>Text inside <script>ignore <span>and me</span> me!</script> is ignored</body>");
+	std::string expected("Text inside is ignored\n"); // must match the input's "Text inside", otherwise the case can never pass once enabled
+	AnnotatedText out;
+	auto retval = processHTML(html, out, {});
+	BOOST_CHECK_EQUAL(retval, util::SUCCESS);
+	BOOST_CHECK_EQUAL(out.text, expected);
+}
+
+}
+}
+
diff --git a/src/lang.hh b/src/lang.hh
@@ -4,6 +4,7 @@
 #include <memory>
 #include <string>
 #include <unordered_map>
+#include "text.hh"
 
 namespace fasttext {
 class FastText;
@@ -15,8 +16,8 @@ class LanguageDetector {
   public:
     virtual ~LanguageDetector() {};
 
-    // detect language of plain text, return top languages
-    virtual void detect(const std::string& text, std::unordered_map<std::string, std::string>& chunks) const = 0;
+    // detect language of plain text, return top languages, consumes the text.
+    virtual void detect(AnnotatedText &&text, std::unordered_map<std::string, AnnotatedText>& chunks) const = 0;
 
     // Label used for text (chunks) that cannot reliably be identified
     static const std::string kUnknownLanguageLabel;
@@ -26,21 +27,21 @@ class FastTextDetector : public LanguageDetector {
   public:
     explicit FastTextDetector(const std::string &filename);
     virtual ~FastTextDetector();
-    virtual void detect(const std::string& text, std::unordered_map<std::string, std::string>& chunks) const;
+    void detect(AnnotatedText &&text, std::unordered_map<std::string, AnnotatedText>& chunks) const override; // overrides LanguageDetector::detect; consumes text
 
   private:
     std::unique_ptr<fasttext::FastText> classifier_;
 };
 
 class CLD2Detector : public LanguageDetector {
 public:
-  virtual void detect(const std::string& text, std::unordered_map<std::string, std::string>& chunks) const;
+  void detect(AnnotatedText &&text, std::unordered_map<std::string, AnnotatedText>& chunks) const override; // overrides LanguageDetector::detect; consumes text
   virtual ~CLD2Detector();
 };
 
 class CLD2MultiLangDetector : public LanguageDetector {
 public:
-  virtual void detect(const std::string& text, std::unordered_map<std::string, std::string>& chunks) const;
+  void detect(AnnotatedText &&text, std::unordered_map<std::string, AnnotatedText>& chunks) const override; // overrides LanguageDetector::detect; consumes text
   virtual ~CLD2MultiLangDetector();
 };
-Original file line number
+Diff line change
@@ Expand Up / @@ -33,6 +33,7 @@ add_library(warc2text_lib @@
         xh_scanner.cc
         entities.cc
         zipreader.cc
+        text.cc
     )
@@ Expand Down @@