Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Track html tags #46

Draft
wants to merge 7 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,16 @@ target_link_libraries(warc2text
fasttext-static
)

include(CTest)

# Build and register the unit tests only when testing is enabled.
# include(CTest) defines the BUILD_TESTING option (ON by default), so the
# default behaviour is unchanged, while -DBUILD_TESTING=OFF now skips
# compiling the test executables altogether.
if(BUILD_TESTING)
  # One executable per test source: src/html_test.cc and src/text_test.cc.
  foreach(test html text)
    add_executable(${test}_test src/${test}_test.cc)
    target_link_libraries(${test}_test
      warc2text_lib
      ${Boost_LIBRARIES})
    add_test(NAME ${test}_test COMMAND $<TARGET_FILE:${test}_test>)
  endforeach()
endif()

include(GNUInstallDirs)

install(TARGETS cld2_full warc2text
Expand Down
1 change: 1 addition & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ add_library(warc2text_lib
xh_scanner.cc
entities.cc
zipreader.cc
text.cc
)


Expand Down
7 changes: 4 additions & 3 deletions src/bilangwriter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ namespace warc2text{

void BilangWriter::write(const Record& record, bool paragraph_identification) {
for (const auto& it : record.getTextByLangs()) {
std::string chunk = it.second;
std::string chunk = it.second.text;

if (paragraph_identification)
chunk = get_paragraph_id(chunk);
Expand All @@ -147,12 +147,13 @@ namespace warc2text{
{"o", boost::json::value(record.getOffset())},
{"s", boost::json::value(record.getSize())},
{"rs", boost::json::value(record.getPayload().size())},
{"ps", boost::json::value(chunk.second.size())},
{"ps", boost::json::value(chunk.second.text.size())},
{"l", boost::json::string(chunk.first)},
{"u", boost::json::string(record.getURL())},
{"c", boost::json::string(record.getHTTPcontentType())},
{"ts", boost::json::string(record.getWARCdate())},
{"p", boost::json::string(chunk.second)},
{"p", boost::json::string(chunk.second.text)},
{"pt", boost::json::value_from(chunk.second.tags)},
} << "\n";
}
}
Expand Down
42 changes: 26 additions & 16 deletions src/html.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <boost/log/trivial.hpp>
#include "util.hh"
#include "html.hh"
#include "entities.hh"
#include "xh_scanner.hh"

namespace warc2text {
Expand All @@ -25,28 +26,27 @@ namespace warc2text {
return true;
}

// Terminates the current line of `plaintext` with a single '\n'.
// - Empty input is left untouched (no leading blank line is produced).
// - If the last character is whitespace it is replaced by '\n', so a
//   trailing space or an existing newline never yields a doubled separator.
// - Otherwise '\n' is appended.
void addNewLine(std::string& plaintext) {
    // Check for emptiness first: calling back() on an empty string is
    // undefined behaviour.
    if (plaintext.empty()) {
        return;
    }
    // std::isspace requires a value representable as unsigned char; a plain
    // (possibly negative) char from a multi-byte sequence would be undefined.
    if (std::isspace(static_cast<unsigned char>(plaintext.back()))) {
        plaintext.back() = '\n';
    } else {
        plaintext.push_back('\n');
    }
}

void addSpace(std::string& plaintext) {
if (!plaintext.empty() && !std::isspace(plaintext.back())) {
if (!plaintext.empty() && !std::isspace(static_cast<unsigned char>(plaintext.back()))) {
plaintext.push_back(' ');
}
}

int processHTML(const std::string& html, std::string& plaintext, const util::umap_tag_filters_regex& tagFilters){
plaintext = "";
// Returns true when every character of `str` is whitespace according to
// std::isspace; an empty string therefore also yields true.
bool isWhitespace(std::string const &str) {
    // Iterate as unsigned char so std::isspace never receives a negative value.
    for (const unsigned char ch : str) {
        if (!std::isspace(ch)) {
            return false;
        }
    }
    return true;
}

int processHTML(const std::string& html, AnnotatedText& plaintext, const util::umap_tag_filters_regex& tagFilters){
plaintext.clear();

markup::instream si(html.c_str());
markup::scanner sc(si);

int t = markup::scanner::TT_SPACE; // just start somewhere that isn't ERROR or EOF
int retval = util::SUCCESS;
std::string tag;
std::string paragraph;
std::string plain;

while (t != markup::scanner::TT_EOF and t != markup::scanner::TT_ERROR) {
t = sc.get_token();
Expand All @@ -60,17 +60,22 @@ namespace warc2text {
// sc.get_tag_name() only changes value after a new tag is found
tag = util::toLowerCopy(sc.get_tag_name());
// found block tag: previous block has ended
if (html::isBlockTag(tag)) addNewLine(plaintext);
if (html::isBlockTag(tag) && !isWhitespace(paragraph)) {
// TODO: add this directly to the scanner?
entities::decodeEntities(paragraph, plain);
plaintext.push_back(plain, tag);
paragraph.clear(); // reset for next paragraph
}
// found void tag, like <img> or <embed>
if (html::isVoidTag(tag)) addSpace(plaintext);
if (html::isVoidTag(tag)) addSpace(paragraph);
break;
case markup::scanner::TT_WORD:
// if the tag is in noText list, don't save the text
if (html::isNoTextTag(tag)) break;
plaintext.append(sc.get_value());
paragraph.append(sc.get_value());
break;
case markup::scanner::TT_SPACE:
addSpace(plaintext);
addSpace(paragraph);
break;
case markup::scanner::TT_ATTR:
if (!filter(tag, sc.get_attr_name(), sc.get_value(), tagFilters))
Expand All @@ -80,7 +85,12 @@ namespace warc2text {
break;
}
}
if (plaintext.back() != '\n') plaintext.push_back('\n');

if (!isWhitespace(paragraph)) {
entities::decodeEntities(paragraph, plain);
plaintext.push_back(plain, "");
}

return retval;
}

Expand Down
4 changes: 3 additions & 1 deletion src/html.hh
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@
#define WARC2TEXT_HTML_HH

#include <string>
#include "text.hh"
#include "util.hh"

namespace warc2text {
int processHTML(const std::string& html, std::string& text, const util::umap_tag_filters_regex& tagFilters);
int processHTML(const std::string& html, AnnotatedText &text, const util::umap_tag_filters_regex& tagFilters);
}

#endif
122 changes: 122 additions & 0 deletions src/html_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
#include "html.hh"

#define BOOST_TEST_MODULE HTMLTest
#include <boost/test/unit_test.hpp>

namespace warc2text {
namespace {


// A small well-formed document: checks the extracted text, the tag recorded
// for each output line, and the return code of processHTML.
BOOST_AUTO_TEST_CASE(CleanHTML) {
    const std::string input(
        "<!DOCTYPE html>\n"
        "<html>\n"
        " <head>\n"
        " <title>Well-formed web page!</title>\n"
        " </head>\n"
        " <body>\n"
        " <p>This is a paragraph.</p>\n"
        " <p>\n"
        " This is &lt;one&gt;,\n"
        " indented as written by <a href=\"\">Ken</a>,\n"
        " with a newline.\n"
        " </p>\n"
        " </body>\n"
        "</html>");

    // One line per block; entities decoded and the multi-line paragraph
    // joined into a single line.
    const std::string wantText(
        "Well-formed web page!\n"
        "This is a paragraph.\n"
        "This is <one>, indented as written by Ken, with a newline.\n"
    );

    // Tag expected for each of the three output lines, in order.
    const std::vector<std::string> wantTags{"title", "p", "p"};

    AnnotatedText annotated;
    const auto status = processHTML(input, annotated, {});

    BOOST_CHECK_EQUAL(status, util::SUCCESS);
    BOOST_CHECK_EQUAL(annotated.text, wantText);
    BOOST_CHECK_EQUAL_COLLECTIONS(annotated.tags.begin(), annotated.tags.end(), wantTags.begin(), wantTags.end());
}

// Text that sits directly inside a <div>, between nested <p> blocks, should be
// attributed to the <div>.
BOOST_AUTO_TEST_CASE(TagsIdentifiers) {
    // Author's note: this breaks because no tag stack is kept, so after a
    // closing </p> and before the next <p> there is no way to know the text
    // is still inside the <div>.
    const std::string input(
        "<div>\n"
        " <p>Text</p>\n"
        " not block text\n"
        " <p>Paragraph</p>\n"
        " Inside div\n"
        "</div>"
    );

    const std::string wantText(
        "Text\n"
        "not block text\n"
        "Paragraph\n"
        "Inside div\n"
    );

    // Tag expected for each of the four output lines, in order.
    const std::vector<std::string> wantTags{"p", "div", "p", "div"};

    AnnotatedText annotated;
    const auto status = processHTML(input, annotated, {});

    BOOST_CHECK_EQUAL(status, util::SUCCESS);
    BOOST_CHECK_EQUAL(annotated.text, wantText);
    BOOST_CHECK_EQUAL_COLLECTIONS(annotated.tags.begin(), annotated.tags.end(), wantTags.begin(), wantTags.end());
}

// Pins the current behaviour for <pre>: its formatting is not preserved, so
// the three source lines are expected to come out as one line.
BOOST_AUTO_TEST_CASE(PreTagNotSupported) {
    const std::string input(
        "<pre> This line\n"
        "should keep its newlines\n"
        "ideally.</pre>");

    const std::string wantText("This line should keep its newlines ideally.\n");

    const std::vector<std::string> wantTags{"pre"};

    AnnotatedText annotated;
    const auto status = processHTML(input, annotated, {});

    BOOST_CHECK_EQUAL(status, util::SUCCESS);
    BOOST_CHECK_EQUAL(annotated.text, wantText);
    BOOST_CHECK_EQUAL_COLLECTIONS(annotated.tags.begin(), annotated.tags.end(), wantTags.begin(), wantTags.end());
}

// <br> and <h1> are expected to start a new output line, while the inline
// <span> keeps its text on the same line as its surroundings.
BOOST_AUTO_TEST_CASE(BlockTags) {
    const std::string input("<body>Alpha <br> Beta <h1> Gamma </h1> Delta <span> Epsilon </span> Zeta</body>");
    const std::string wantText("Alpha\nBeta\nGamma\nDelta Epsilon Zeta\n");

    AnnotatedText annotated;
    const auto status = processHTML(input, annotated, {});

    BOOST_CHECK_EQUAL(status, util::SUCCESS);
    BOOST_CHECK_EQUAL(annotated.text, wantText);
}

// Words separated only by tags such as <img> and <embed> are expected to come
// out separated by single spaces rather than glued together.
BOOST_AUTO_TEST_CASE(VoidTags) {
    const std::string input("<body>Void<img>tags<img>should<img>add<embed>beep</embed>spaces</body>");
    const std::string wantText("Void tags should add beep spaces\n");

    AnnotatedText annotated;
    const auto status = processHTML(input, annotated, {});

    BOOST_CHECK_EQUAL(status, util::SUCCESS);
    BOOST_CHECK_EQUAL(annotated.text, wantText);
}

// Text inside <script> must be dropped, and text following </script> kept.
// Disabled: per the author's note, the parser keeps no tag stack, so the
// trailing " is ignored" is still seen with tag_name == "script" and is
// dropped too.
BOOST_AUTO_TEST_CASE(ScriptTags, *boost::unit_test::disabled()) {
    std::string html("<body>Text inside <script>ignore <span>and me</span> me!</script> is ignored</body>");
    // The expectation must start with "Text", matching the input; the
    // previous "Test inside..." could never match, even with a fixed parser.
    std::string expected("Text inside is ignored\n");
    AnnotatedText out;
    auto retval = processHTML(html, out, {});
    BOOST_CHECK_EQUAL(retval, util::SUCCESS);
    BOOST_CHECK_EQUAL(out.text, expected);
}

}
}

11 changes: 6 additions & 5 deletions src/lang.hh
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <memory>
#include <string>
#include <unordered_map>
#include "text.hh"

namespace fasttext {
class FastText;
Expand All @@ -15,8 +16,8 @@ class LanguageDetector {
public:
virtual ~LanguageDetector() {};

// detect language of plain text, return top languages
virtual void detect(const std::string& text, std::unordered_map<std::string, std::string>& chunks) const = 0;
// detect language of plain text, return top languages, consumes the text.
virtual void detect(AnnotatedText &&text, std::unordered_map<std::string, AnnotatedText>& chunks) const = 0;

// Label used for text (chunks) that cannot reliably be identified
static const std::string kUnknownLanguageLabel;
Expand All @@ -26,21 +27,21 @@ class FastTextDetector : public LanguageDetector {
public:
explicit FastTextDetector(const std::string &filename);
virtual ~FastTextDetector();
virtual void detect(const std::string& text, std::unordered_map<std::string, std::string>& chunks) const;
virtual void detect(AnnotatedText &&text, std::unordered_map<std::string, AnnotatedText>& chunks) const;

private:
std::unique_ptr<fasttext::FastText> classifier_;
};

class CLD2Detector : public LanguageDetector {
public:
virtual void detect(const std::string& text, std::unordered_map<std::string, std::string>& chunks) const;
virtual void detect(AnnotatedText &&text, std::unordered_map<std::string, AnnotatedText>& chunks) const;
virtual ~CLD2Detector();
};

class CLD2MultiLangDetector : public LanguageDetector {
public:
virtual void detect(const std::string& text, std::unordered_map<std::string, std::string>& chunks) const;
virtual void detect(AnnotatedText &&text, std::unordered_map<std::string, AnnotatedText>& chunks) const;
virtual ~CLD2MultiLangDetector();
};

Expand Down
Loading