diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2d235b5..23eca7f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -61,6 +61,16 @@ target_link_libraries(warc2text
     fasttext-static
 )
 
+# Unit tests: built and registered only when CTest's BUILD_TESTING option is ON (its default).
+include(CTest)
+if(BUILD_TESTING)
+    foreach(test html text)
+        add_executable(${test}_test src/${test}_test.cc)
+        target_link_libraries(${test}_test warc2text_lib ${Boost_LIBRARIES})
+        add_test(NAME ${test}_test COMMAND $<TARGET_FILE:${test}_test>)
+    endforeach()
+endif()
+
 include(GNUInstallDirs)
 
 install(TARGETS cld2_full warc2text
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 6db72e0..a494db4 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -33,6 +33,7 @@ add_library(warc2text_lib
     xh_scanner.cc
     entities.cc
     zipreader.cc
+    text.cc
 )
 
 
diff --git a/src/bilangwriter.cc b/src/bilangwriter.cc
index 5688000..2571591 100644
--- a/src/bilangwriter.cc
+++ b/src/bilangwriter.cc
@@ -129,7 +129,7 @@ namespace warc2text{
 
     void BilangWriter::write(const Record& record, bool paragraph_identification) {
         for (const auto& it : record.getTextByLangs()) {
-            std::string chunk = it.second;
+            std::string chunk = it.second.text;
 
             if (paragraph_identification)
                 chunk = get_paragraph_id(chunk);
@@ -147,12 +147,13 @@ namespace warc2text{
                  {"o", boost::json::value(record.getOffset())},
                  {"s", boost::json::value(record.getSize())},
                  {"rs", boost::json::value(record.getPayload().size())},
-                 {"ps", boost::json::value(chunk.second.size())},
+                 {"ps", boost::json::value(chunk.second.text.size())},
                  {"l", boost::json::string(chunk.first)},
                  {"u", boost::json::string(record.getURL())},
                  {"c", boost::json::string(record.getHTTPcontentType())},
                  {"ts", boost::json::string(record.getWARCdate())},
-                 {"p", boost::json::string(chunk.second)},
+                 {"p", boost::json::string(chunk.second.text)},
+                 {"pt", boost::json::value_from(chunk.second.tags)},
             } << "\n";
         }
     }
diff --git a/src/html.cc b/src/html.cc
index bc329e9..9aa5786 100644
--- a/src/html.cc
+++ b/src/html.cc
@@ -4,6 +4,7 @@
 #include <boost/log/trivial.hpp>
 #include "util.hh"
 #include "html.hh"
+#include "entities.hh"
 #include "xh_scanner.hh"
 
 namespace warc2text {
@@ -25,28 +26,27 @@ namespace warc2text {
         return true;
     }
 
-    void addNewLine(std::string& plaintext) {
-        if (std::isspace(plaintext.back())) {
-            plaintext.back() = '\n';
-        } else if (!plaintext.empty()) {
-            plaintext.push_back('\n');
-        }
-    }
-
     void addSpace(std::string& plaintext) {
-        if (!plaintext.empty() && !std::isspace(plaintext.back())) {
+        if (!plaintext.empty() && !std::isspace(static_cast<unsigned char>(plaintext.back()))) { // add one space only after non-whitespace; the cast avoids undefined behaviour in std::isspace for negative char values
             plaintext.push_back(' ');
         }
     }
 
-    int processHTML(const std::string& html, std::string& plaintext, const util::umap_tag_filters_regex& tagFilters){
-        plaintext = "";
+    bool isWhitespace(std::string const &str) { // true when no character is non-whitespace, so also for ""
+        return std::find_if_not(str.begin(), str.end(), [](unsigned char ch){ return std::isspace(ch) != 0; }) == str.end();
+    }
+
+    int processHTML(const std::string& html, AnnotatedText& plaintext, const util::umap_tag_filters_regex& tagFilters){
+        plaintext.clear();
+
         markup::instream si(html.c_str());
         markup::scanner sc(si);
 
         int t = markup::scanner::TT_SPACE; // just start somewhere that isn't ERROR or EOF
         int retval = util::SUCCESS;
         std::string tag;
+        std::string paragraph;
+        std::string plain;
 
         while (t != markup::scanner::TT_EOF and t != markup::scanner::TT_ERROR) {
             t = sc.get_token();
@@ -60,17 +60,22 @@ namespace warc2text {
                     // sc.get_tag_name() only changes value after a new tag is found
                     tag = util::toLowerCopy(sc.get_tag_name());
                     // found block tag: previous block has ended
-                    if (html::isBlockTag(tag)) addNewLine(plaintext);
+                    if (html::isBlockTag(tag) && !isWhitespace(paragraph)) {
+                        // TODO: add this directly to the scanner?
+                        entities::decodeEntities(paragraph, plain);
+                        plaintext.push_back(plain, tag);
+                        paragraph.clear(); // reset for next paragraph
+                    }
                     // found void tag, like <img> or <embed>
-                    if (html::isVoidTag(tag)) addSpace(plaintext);
+                    if (html::isVoidTag(tag)) addSpace(paragraph);
                     break;
                 case markup::scanner::TT_WORD:
                     // if the tag is in noText list, don't save the text
                     if (html::isNoTextTag(tag)) break;
-                    plaintext.append(sc.get_value());
+                    paragraph.append(sc.get_value());
                     break;
                 case markup::scanner::TT_SPACE:
-                    addSpace(plaintext);
+                    addSpace(paragraph);
                     break;
                 case markup::scanner::TT_ATTR:
                     if (!filter(tag, sc.get_attr_name(), sc.get_value(), tagFilters))
@@ -80,7 +85,12 @@ namespace warc2text {
                     break;
             }
         }
-        if (plaintext.back() != '\n') plaintext.push_back('\n');
+
+        if (!isWhitespace(paragraph)) {
+            entities::decodeEntities(paragraph, plain);
+            plaintext.push_back(plain, "");
+        }
+
         return retval;
     }
 
diff --git a/src/html.hh b/src/html.hh
index 583dc31..2f1994d 100644
--- a/src/html.hh
+++ b/src/html.hh
@@ -2,9 +2,11 @@
 #define WARC2TEXT_HTML_HH
 
 #include <string>
+#include "text.hh"
+#include "util.hh"
 
 namespace warc2text {
-    int processHTML(const std::string& html, std::string& text, const util::umap_tag_filters_regex& tagFilters);
+    int processHTML(const std::string& html, AnnotatedText &text, const util::umap_tag_filters_regex& tagFilters);
 }
 
 #endif
diff --git a/src/html_test.cc b/src/html_test.cc
new file mode 100644
index 0000000..4705312
--- /dev/null
+++ b/src/html_test.cc
@@ -0,0 +1,122 @@
+#include "html.hh"
+
+#define BOOST_TEST_MODULE HTMLTest
+#include <boost/test/unit_test.hpp>
+
+namespace warc2text {
+namespace {
+
+
+BOOST_AUTO_TEST_CASE(CleanHTML) {
+	// A tidy document: entities are expected to be decoded, inline <a> text to
+	// stay on its line, and every output line to be annotated with a tag.
+	const std::string document =
+		"<!DOCTYPE html>\n"
+		"<html>\n"
+		"	<head>\n"
+		"		<title>Well-formed web page!</title>\n"
+		"	</head>\n"
+		"	<body>\n"
+		"		<p>This is a paragraph.</p>\n"
+		"		<p>\n"
+		"			This is &lt;one&gt;,\n"
+		"			indented as written by <a href=\"\">Ken</a>,\n"
+		"			with a newline.\n"
+		"		</p>\n"
+		"	</body>\n"
+		"</html>";
+	const std::string wanted_text =
+		"Well-formed web page!\n"
+		"This is a paragraph.\n"
+		"This is <one>, indented as written by Ken, with a newline.\n";
+	const std::vector<std::string> wanted_tags{"title", "p", "p"};
+
+	AnnotatedText result;
+	const int status = processHTML(document, result, {});
+
+	BOOST_CHECK_EQUAL(status, util::SUCCESS);
+	BOOST_CHECK_EQUAL(result.text, wanted_text);
+	BOOST_CHECK_EQUAL_COLLECTIONS(result.tags.begin(), result.tags.end(),
+		wanted_tags.begin(), wanted_tags.end());
+}
+
+BOOST_AUTO_TEST_CASE(TagsIdentifiers) {
+	// Noted by the author as breaking: the parser keeps no tag stack, so after
+	// a </p> (and before the next <p>) it no longer knows that the loose text
+	// is still inside the enclosing <div>.
+	const std::string document =
+		"<div>\n"
+		"  <p>Text</p>\n"
+		"  not block text\n"
+		"  <p>Paragraph</p>\n"
+		"  Inside div\n"
+		"</div>";
+
+	const std::string wanted_text =
+		"Text\n"
+		"not block text\n"
+		"Paragraph\n"
+		"Inside div\n";
+
+	const std::vector<std::string> wanted_tags{"p", "div", "p", "div"};
+
+	AnnotatedText result;
+	const int status = processHTML(document, result, {});
+
+	BOOST_CHECK_EQUAL(status, util::SUCCESS);
+	BOOST_CHECK_EQUAL(result.text, wanted_text);
+	BOOST_CHECK_EQUAL_COLLECTIONS(result.tags.begin(), result.tags.end(),
+		wanted_tags.begin(), wanted_tags.end());
+}
+
+BOOST_AUTO_TEST_CASE(PreTagNotSupported) {
+	// Documents a limitation: the formatting inside <pre> is not kept, the
+	// expected output has the three source lines joined into one.
+	const std::string document =
+		"<pre> This line\n"
+		"should keep its newlines\n"
+		"ideally.</pre>";
+	const std::string wanted_text = "This line should keep its newlines ideally.\n";
+	const std::vector<std::string> wanted_tags{"pre"};
+
+	AnnotatedText result;
+	const int status = processHTML(document, result, {});
+
+	BOOST_CHECK_EQUAL(status, util::SUCCESS);
+	BOOST_CHECK_EQUAL(result.text, wanted_text);
+	BOOST_CHECK_EQUAL_COLLECTIONS(result.tags.begin(), result.tags.end(),
+		wanted_tags.begin(), wanted_tags.end());
+}
+
+BOOST_AUTO_TEST_CASE(BlockTags) {
+	// Expects <br> and <h1> to start a new output line while <span> keeps the text on the same line.
+	const std::string document = "<body>Alpha <br> Beta <h1> Gamma </h1> Delta <span> Epsilon </span> Zeta</body>";
+	const std::string wanted_text = "Alpha\nBeta\nGamma\nDelta Epsilon Zeta\n";
+	AnnotatedText result;
+	BOOST_CHECK_EQUAL(processHTML(document, result, {}), util::SUCCESS);
+	BOOST_CHECK_EQUAL(result.text, wanted_text);
+}
+
+BOOST_AUTO_TEST_CASE(VoidTags) {
+	// Expects tags such as <img> and <embed> to separate the surrounding words with a single space.
+	const std::string document = "<body>Void<img>tags<img>should<img>add<embed>beep</embed>spaces</body>";
+	const std::string wanted_text = "Void tags should add beep spaces\n";
+	AnnotatedText result;
+	BOOST_CHECK_EQUAL(processHTML(document, result, {}), util::SUCCESS);
+	BOOST_CHECK_EQUAL(result.text, wanted_text);
+}
+
+BOOST_AUTO_TEST_CASE(ScriptTags, *boost::unit_test::disabled()) {
+	// Disabled because it fails: we don't keep a tag stack, so ` is ignored`
+	// will still have tag_name == "script" and be dropped with the script body.
+	std::string html("<body>Text inside <script>ignore <span>and me</span> me!</script> is ignored</body>");
+	std::string expected("Text inside is ignored\n"); // must start with "Text", exactly as the input does
+	AnnotatedText out;
+	auto retval = processHTML(html, out, {});
+	BOOST_CHECK_EQUAL(retval, util::SUCCESS);
+	BOOST_CHECK_EQUAL(out.text, expected);
+}
+
+}
+}
+
diff --git a/src/lang.hh b/src/lang.hh
index 3552892..7b135dd 100644
--- a/src/lang.hh
+++ b/src/lang.hh
@@ -4,6 +4,7 @@
 #include <memory>
 #include <string>
 #include <unordered_map>
+#include "text.hh"
 
 namespace fasttext {
 class FastText;
@@ -15,8 +16,8 @@ class LanguageDetector {
   public:
     virtual ~LanguageDetector() {};
 
-    // detect language of plain text, return top languages
-    virtual void detect(const std::string& text, std::unordered_map<std::string, std::string>& chunks) const = 0;
+    // detect language of plain text, return top languages, consumes the text.
+    virtual void detect(AnnotatedText &&text, std::unordered_map<std::string, AnnotatedText>& chunks) const = 0;
 
     // Label used for text (chunks) that cannot reliably be identified
     static const std::string kUnknownLanguageLabel;
@@ -26,7 +27,7 @@ class FastTextDetector : public LanguageDetector {
   public:
     explicit FastTextDetector(const std::string &filename);
     virtual ~FastTextDetector();
-    virtual void detect(const std::string& text, std::unordered_map<std::string, std::string>& chunks) const;
+    virtual void detect(AnnotatedText &&text, std::unordered_map<std::string, AnnotatedText>& chunks) const;
 
   private:
     std::unique_ptr<fasttext::FastText> classifier_;
@@ -34,13 +35,13 @@ class FastTextDetector : public LanguageDetector {
 
 class CLD2Detector : public LanguageDetector {
 public:
-  virtual void detect(const std::string& text, std::unordered_map<std::string, std::string>& chunks) const;
+  virtual void detect(AnnotatedText &&text, std::unordered_map<std::string, AnnotatedText>& chunks) const;
   virtual ~CLD2Detector();
 };
 
 class CLD2MultiLangDetector : public LanguageDetector {
 public:
-  virtual void detect(const std::string& text, std::unordered_map<std::string, std::string>& chunks) const;
+  virtual void detect(AnnotatedText &&text, std::unordered_map<std::string, AnnotatedText>& chunks) const;
   virtual ~CLD2MultiLangDetector();
 };
 
diff --git a/src/lang_cld2.cc b/src/lang_cld2.cc
index 30e8cd9..f241f7c 100644
--- a/src/lang_cld2.cc
+++ b/src/lang_cld2.cc
@@ -8,17 +8,17 @@ namespace warc2text {
 
     CLD2Detector::~CLD2Detector() {}
 
-    void CLD2Detector::detect(const std::string& text, std::unordered_map<std::string, std::string>& text_by_lang) const {
+    void CLD2Detector::detect(AnnotatedText &&text, std::unordered_map<std::string, AnnotatedText>& text_by_lang) const { // consumes `text`: it is moved into text_by_lang below
         bool reliable = false;
         int valid_prefix_bytes = 0;
-        CLD2::Language l = CLD2::DetectLanguageCheckUTF8(text.data(), text.size(), true, &reliable, &valid_prefix_bytes);
-        text_by_lang[reliable ? CLD2::LanguageCode(l) : kUnknownLanguageLabel] = text;
+        CLD2::Language l = CLD2::DetectLanguageCheckUTF8(text.text.data(), text.text.size(), true, &reliable, &valid_prefix_bytes); // classify the whole text in one call
+        text_by_lang[reliable ? CLD2::LanguageCode(l) : kUnknownLanguageLabel] = std::move(text); // text and tags go under the detected code, or under the unknown label when the result is not reliable
     }
 
     CLD2MultiLangDetector::~CLD2MultiLangDetector() {}
 
-    void CLD2MultiLangDetector::detect(const std::string& text, std::unordered_map<std::string, std::string>& text_by_lang) const {
-        CLD2::Language langs[3] = {CLD2::UNKNOWN_LANGUAGE, CLD2::UNKNOWN_LANGUAGE, CLD2::UNKNOWN_LANGUAGE};
+    void CLD2MultiLangDetector::detect(AnnotatedText &&text, std::unordered_map<std::string, AnnotatedText>& text_by_lang) const {
+        CLD2::Language langs[] = {CLD2::UNKNOWN_LANGUAGE, CLD2::UNKNOWN_LANGUAGE, CLD2::UNKNOWN_LANGUAGE};
         int percents[3] = {0,0,0};
         double scores[3] = {0.0, 0.0, 0.0};
 
@@ -28,47 +28,39 @@ namespace warc2text {
 
         CLD2::ResultChunkVector chunks;
 
-        CLD2::ExtDetectLanguageSummaryCheckUTF8(text.data(), text.size(), true, &NO_HINT, 0, &langs[0], &percents[0], &scores[0], &chunks, &text_bytes, &reliable, &valid_prefix_bytes);
+        CLD2::ExtDetectLanguageSummaryCheckUTF8(text.text.data(), text.text.size(), true, &NO_HINT, 0, &langs[0], &percents[0], &scores[0], &chunks, &text_bytes, &reliable, &valid_prefix_bytes);
 
         text_by_lang.clear();
 
-        if (not reliable) {
-            text_by_lang[kUnknownLanguageLabel] = text;
+        if (!reliable) {
+            text_by_lang[kUnknownLanguageLabel] = std::move(text);
             return;
         }
 
-        std::string* top1 = nullptr;
-        std::string* top2 = nullptr;
-        std::string* top3 = nullptr;
+        const char* mapping[] = {nullptr, nullptr, nullptr};
 
-        if (langs[0] != CLD2::UNKNOWN_LANGUAGE and percents[0] > 0) {
-            top1 = &text_by_lang[CLD2::LanguageCode(langs[0])];
-            top1->reserve(text.size() * (percents[0] + 1));
-        }
-
-        if (langs[1] != CLD2::UNKNOWN_LANGUAGE and percents[1] > 0) {
-            top2 = &text_by_lang[CLD2::LanguageCode(langs[1])];
-            top2->reserve(text.size() * (percents[1] + 1));
-        }
-
-        if (langs[2] != CLD2::UNKNOWN_LANGUAGE and percents[2] > 0) {
-            top3 = &text_by_lang[CLD2::LanguageCode(langs[2])];
-            top3->reserve(text.size() * (percents[2] + 1));
+        for (size_t i = 0; i < 3; ++i) { // mapping[i] is the language code for langs[i]; it stays nullptr for unused slots
+            if (langs[i] != CLD2::UNKNOWN_LANGUAGE && percents[i] > 0)
+                mapping[i] = CLD2::LanguageCode(langs[i]); // index with i: langs[2] would give all slots the third language's code
         }
 
         for (const CLD2::ResultChunk& chunk : chunks) {
-            std::string* ref = static_cast<CLD2::Language>(chunk.lang1) == langs[0] ? top1 :
-                        static_cast<CLD2::Language>(chunk.lang1) == langs[1] ? top2 :
-                        static_cast<CLD2::Language>(chunk.lang1) == langs[2] ? top3 : nullptr;
-            if (ref == nullptr) continue;
-            ref->append(text, chunk.offset, chunk.bytes);
-        }
-
-        // remove empty texts from text_by_lang
-        // apparently it is possible that the reported percentage is > 0, but the language does not appear in chunks
-        for (auto it = text_by_lang.cbegin(); it != text_by_lang.cend(); ){
-            if (it->second.size() == 0) text_by_lang.erase(it++);
-            else ++it;
+            if (chunk.bytes == 0) // TODO: can this even happen?
+                continue;
+
+            // Which of the top 3 languages is this chunk in?
+            std::size_t i = 0;
+            for (; i < 3; ++i) {
+                if (static_cast<CLD2::Language>(chunk.lang1) == langs[i])
+                    break;
+            }
+
+            // Chunk is not in top 3, or it matched a slot that has no language code (mapping[i] is nullptr)
+            if (i == 3 || mapping[i] == nullptr) // a null key would construct a std::string from nullptr
+                continue;
+
+            // Chunk is in top 3, append it to that AnnotatedText
+            text_by_lang[mapping[i]].append(text, chunk.offset, chunk.bytes);
         }
 
         // TODO: do something with the scores?
diff --git a/src/lang_fasttext.cc b/src/lang_fasttext.cc
index 0f6f955..a48065a 100644
--- a/src/lang_fasttext.cc
+++ b/src/lang_fasttext.cc
@@ -17,14 +17,14 @@ FastTextDetector::~FastTextDetector() {}
 
 const char kLabelPrefix[] = "__label__";
 
-void FastTextDetector::detect(const std::string& text, std::unordered_map<std::string, std::string>& chunks) const {
+void FastTextDetector::detect(AnnotatedText &&text, std::unordered_map<std::string, AnnotatedText>& chunks) const {
   const float kThreshold = 0.5f;
   std::vector<int32_t> words, labels;
-  classifier_->getDictionary()->getStringNoNewline(text, words, labels);
+  classifier_->getDictionary()->getStringNoNewline(text.text, words, labels);
   fasttext::Predictions predictions;
   classifier_->predict(1, words, predictions, kThreshold);
   if (predictions.empty()) {
-    chunks[kUnknownLanguageLabel] = text;
+    chunks[kUnknownLanguageLabel] = std::move(text);
     return;
   }
 
@@ -34,7 +34,7 @@ void FastTextDetector::detect(const std::string& text, std::unordered_map<std::s
   label.erase(0, sizeof(kLabelPrefix) - 1);
 
   // For better or worse, we're currently doing everything as one chunk.
-  chunks[label] = text;
+  chunks[label] = std::move(text);
 }
 
 } // namespace warc2text
diff --git a/src/record.cc b/src/record.cc
index 9df06c3..2f98eed 100644
--- a/src/record.cc
+++ b/src/record.cc
@@ -205,9 +205,10 @@ namespace warc2text {
         if (bdf_zip)
             payload = readZipPayload(content_type, payload);
 
+        plaintext.clear();
+
         // detect charset
         std::string detected_charset;
-        std::string extracted;
         bool detection_result = util::detectCharset(payload, detected_charset, charset);
 
         if (detection_result) charset = detected_charset;
@@ -224,32 +225,28 @@ namespace warc2text {
             // convert to utf8 if needed (we do it before cleaning tabs, unlike HTML below):
             if (needToConvert)
                 payload = util::toUTF8(payload, charset);
+
+            std::string extracted;
             util::trimLinesCopy(payload, extracted);
             std::replace_if(extracted.begin(), extracted.end(), [](wchar_t c){ return std::iscntrl(c) && c != '\n'; }, ' ');
+            plaintext.push_back(std::move(extracted), "");
         }
         else {
-            retval = processHTML(payload, extracted, tagFilters);
-
-            // convert to utf8 if needed:
-            if (needToConvert)
-                extracted = util::toUTF8(extracted, charset);
+            retval = processHTML(
+                needToConvert ? util::toUTF8(payload, charset) : payload,
+                plaintext,
+                tagFilters);
         }
 
-        // decode HTML entities:
-        if (isPlainText)
-            plaintext = extracted;
-        else
-            entities::decodeEntities(extracted, plaintext);
-
         return retval;
     }
 
-    const std::unordered_map<std::string, std::string>& Record::getTextByLangs() const {
+    const std::unordered_map<std::string, AnnotatedText>& Record::getTextByLangs() const { // read-only view of the map that detectLanguage() passes to the detector
         return text_by_langs;
     }
 
     int Record::detectLanguage(LanguageDetector const &detector){
-        detector.detect(plaintext, text_by_langs);
+        detector.detect(std::move(plaintext), text_by_langs); // hands plaintext to the detector; it is moved-from afterwards, so read getPlainTextSize() before this call
         return text_by_langs.size();
     }
 
@@ -280,8 +277,8 @@ namespace warc2text {
         return payload;
     }
 
-    const std::string& Record::getPlainText() const {
-        return plaintext;
+    std::size_t Record::getPlainTextSize() const { // size in bytes of the extracted text currently held in plaintext
+        return plaintext.text.size(); // counts only the text, not the tags
     }
 
     const std::string& Record::getURL() const {
diff --git a/src/record.hh b/src/record.hh
index 00069e7..5a20d9b 100644
--- a/src/record.hh
+++ b/src/record.hh
@@ -10,6 +10,7 @@
 #include <regex>
 #include "util.hh"
 #include "lang.hh"
+#include "text.hh"
 
 namespace warc2text {
     class Record {
@@ -22,7 +23,6 @@ namespace warc2text {
         bool HTTPheaderExists(const std::string& property) const;
 
         const std::string& getPayload() const;
-        const std::string& getPlainText() const;
         const std::string& getURL() const;
         const std::string& getRecordType() const;
         const std::string& getWARCcontentType() const;
@@ -44,7 +44,8 @@ namespace warc2text {
             return offset;
         }
         
-        const std::unordered_map<std::string, std::string>& getTextByLangs() const;
+        std::size_t getPlainTextSize() const;
+        const std::unordered_map<std::string, AnnotatedText>& getTextByLangs() const;
 
         int cleanPayload();
         int cleanPayload(const util::umap_tag_filters_regex& tagFilters);
@@ -63,10 +64,9 @@ namespace warc2text {
         std::unordered_map<std::string, std::string> header;
         std::unordered_map<std::string, std::string> HTTPheader;
         std::string payload;
-        std::string plaintext;
-        std::string language;
-
-        std::unordered_map<std::string, std::string> text_by_langs;
+        
+        AnnotatedText plaintext; // might be empty after langid
+        std::unordered_map<std::string, AnnotatedText> text_by_langs;
 
         // these are present in the headers, but it's convenient to have them apart also
         std::string recordType;
diff --git a/src/text.cc b/src/text.cc
new file mode 100644
index 0000000..5114397
--- /dev/null
+++ b/src/text.cc
@@ -0,0 +1,86 @@
+#include "text.hh"
+#include <cassert>
+
+namespace {
+    // Counts occurrences of `needle` in haystack[offset, offset + count). The range is
+    // clamped to the end of `haystack`; the default count (npos) means "up to the end".
+    std::size_t count(char needle, std::string const &haystack, std::size_t offset = 0, std::size_t count = std::string::npos) {
+        if (offset >= haystack.size())
+            return 0; // empty range, nothing to scan
+
+        // Clamp against the remaining length rather than computing offset + count:
+        // that sum wraps around for very large counts and would cut the range short.
+        std::size_t const remaining = haystack.size() - offset;
+        std::size_t const end = offset + (count < remaining ? count : remaining);
+
+        std::size_t hits = 0;
+        std::size_t hit = haystack.find(needle, offset);
+
+        while (hit < end) { // npos compares greater than any end, so "not found" stops the loop
+            ++hits;
+            hit = haystack.find(needle, hit + 1);
+        }
+
+        return hits;
+    }
+
+    template <typename T> // appends source[offset, offset + count) to dest, clamped to source's bounds
+    void append(std::vector<T> &dest, std::vector<T> const &source, std::size_t offset, std::size_t count) {
+        std::size_t const room = offset < source.size() ? source.size() - offset : 0; // elements available from offset on
+        dest.reserve(dest.size() + (count < room ? count : room));
+        for (std::size_t i = 0; i < count && i < room; ++i) dest.emplace_back(source[offset + i]); // never indexes past the end of source
+    }
+}
+
+namespace warc2text {
+
+AnnotatedText AnnotatedText::substr(std::size_t offset, std::size_t length) const { // returns `length` bytes starting at `offset`, with the tags append() selects for them
+    return AnnotatedText().append(*this, offset, length); // fill an empty temporary via append() and return it by value
+}
+
+AnnotatedText &AnnotatedText::append(AnnotatedText const &other, std::size_t offset, std::size_t length) {
+    // Copies bytes [offset, offset + length) of `other` plus one tag per line they touch. If our text
+    // stops mid-line, the first copied line continues it and keeps the tag that line already has.
+    bool const continues_line = !text.empty() && text.back() != '\n';
+    std::size_t const available = offset < other.text.size() ? other.text.size() - offset : 0;
+    std::size_t const copied = length < available ? length : available; // the clamping std::string::append applies
+    text.append(other.text, offset, length); // throws std::out_of_range if offset is past the end
+    if (copied == 0)
+        return *this;
+    // A line's tag index is the number of '\n' before it; the copy touches lines first..last.
+    std::size_t const first = ::count('\n', other.text, 0, offset) + (continues_line ? 1 : 0);
+    std::size_t const last = ::count('\n', other.text, 0, offset + copied - 1);
+    if (first <= last && last < other.tags.size()) // no size_t underflow, no read past other.tags
+        ::append(tags, other.tags, first, last - first + 1);
+    return *this;
+}
+
+void AnnotatedText::push_back(std::string const &chunk, std::string const &tag) {
+    // Adds `chunk` as a block terminated by '\n' and records `tag` once
+    // for every line that block occupies. Empty chunks are ignored.
+    if (chunk.empty())
+        return;
+
+    // A single trailing whitespace character is replaced by the
+    // terminating newline rather than kept in front of it.
+    bool const trailing_space = std::isspace(static_cast<unsigned char>(chunk.back())) != 0;
+    std::size_t const kept = trailing_space ? chunk.size() - 1 : chunk.size();
+
+    text.reserve(text.size() + kept + 1);
+    text.append(chunk, 0, kept);
+    text.push_back('\n');
+
+    // Every '\n' inside the kept part ends a line and the terminator ends
+    // the last one, so a block always spans at least one line.
+    std::size_t const lines = ::count('\n', chunk, 0, kept) + 1;
+    assert(lines >= 1);
+
+    tags.insert(tags.end(), lines, tag);
+}
+
+void AnnotatedText::clear() {
+    tags.clear(); // forget the per-line tags
+    text.clear(); // and the text they belong to
+}
+
+};
diff --git a/src/text.hh b/src/text.hh
new file mode 100644
index 0000000..6d4cf0a
--- /dev/null
+++ b/src/text.hh
@@ -0,0 +1,22 @@
+#pragma once
+
+#include <string>
+#include <vector>
+
+namespace warc2text {
+    class AnnotatedText {
+    public:
+        std::string text;              // plain text; push_back() stores it as newline-terminated lines
+        std::vector<std::string> tags; // tags for the lines of `text`; push_back() adds one entry per line
+
+        // Extract `count` bytes of text starting at `offset`, together with their tags
+        AnnotatedText substr(std::size_t offset, std::size_t count) const;
+        // Append `count` bytes of `other` starting at `offset`, plus tags from `other`; returns *this
+        AnnotatedText &append(AnnotatedText const &other, std::size_t offset, std::size_t count);
+
+        // Append a block of text at the end and record `tag` for each of its lines
+        void push_back(std::string const &text, std::string const &tag);
+
+        void clear(); // empties both `text` and `tags`
+    };
+};
diff --git a/src/text_test.cc b/src/text_test.cc
new file mode 100644
index 0000000..3612bfa
--- /dev/null
+++ b/src/text_test.cc
@@ -0,0 +1,68 @@
+#include "text.hh"
+
+#define BOOST_TEST_MODULE TextTest // module name for this binary; HTMLTest belongs to html_test.cc
+#include <boost/test/unit_test.hpp>
+
+namespace warc2text {
+namespace {
+
+
+BOOST_AUTO_TEST_CASE(TextPushBack) {
+	// Each pushed block ends up newline-terminated, and a block spanning
+	// two lines contributes its tag twice (one tag per output line).
+	AnnotatedText annotated;
+	annotated.push_back("This is a sentence", "p");
+	annotated.push_back("This is a sentence\nsplit over two lines", "li");
+
+	const std::string wanted_text =
+		"This is a sentence\n"
+		"This is a sentence\n"
+		"split over two lines\n";
+
+	const std::vector<std::string> wanted_tags{"p", "li", "li"};
+
+	BOOST_CHECK_EQUAL(annotated.text, wanted_text);
+	BOOST_CHECK_EQUAL_COLLECTIONS(annotated.tags.begin(), annotated.tags.end(),
+		wanted_tags.begin(), wanted_tags.end());
+}
+
+BOOST_AUTO_TEST_CASE(TextSubstr) {
+	// A prefix that stops in the middle of the second line is expected to
+	// carry that line's tag as well: two lines, so two tags.
+	AnnotatedText source;
+	source.push_back("This is a sentence", "p");
+	source.push_back("This is a sentence\nsplit over two lines", "li");
+
+	const std::string wanted_text =
+		"This is a sentence\n"
+		"This is a sent";
+	const std::vector<std::string> wanted_tags{"p", "li"};
+	const AnnotatedText slice = source.substr(0, wanted_text.size());
+
+	BOOST_CHECK_EQUAL(slice.text, wanted_text);
+	BOOST_CHECK_EQUAL_COLLECTIONS(slice.tags.begin(), slice.tags.end(),
+		wanted_tags.begin(), wanted_tags.end());
+}
+
+BOOST_AUTO_TEST_CASE(TextAppend) {
+	// Gluing a mid-line slice onto unterminated text gives a single line that keeps its first tag.
+	AnnotatedText input;
+	input.push_back("This is a sentence", "p");
+	input.push_back("This is a sentence\nsplit over two lines", "li");
+
+	std::string expected(
+		"This is a sentence over two lines\n"
+	);
+	std::vector<std::string> tags{"p"}; // single line, take existing tag
+
+	AnnotatedText out;
+	out.append(input, 0, 18); // "This is a sentence"
+	out.append(input, 43, 16); // " over two lines\n": the third line starts at byte 38, so the space after "split" is byte 43
+
+	BOOST_CHECK_EQUAL(out.text, expected);
+	BOOST_CHECK_EQUAL_COLLECTIONS(out.tags.begin(), out.tags.end(), tags.begin(), tags.end());
+}
+
+}
+}
+
diff --git a/src/warcpreprocessor.cc b/src/warcpreprocessor.cc
index 47de540..ffe5285 100644
--- a/src/warcpreprocessor.cc
+++ b/src/warcpreprocessor.cc
@@ -178,13 +178,15 @@ namespace warc2text {
                 continue;
             }
 
-            if (record.getPlainText().empty()) {
+            std::size_t recordTextBytes = record.getPlainTextSize();
+
+            if (!recordTextBytes) {
                 BOOST_LOG_TRIVIAL(trace) << "Record " << record.getURL() << ": empty";
                 continue;
             }
 
             ++textRecords;
-            textBytes += record.getPlainText().size();
+            textBytes += recordTextBytes;
 
             record.detectLanguage(detector);
             n_langs = 0;
@@ -193,7 +195,7 @@ namespace warc2text {
                 if (chunk.first == LanguageDetector::kUnknownLanguageLabel)
                     continue;
                 
-                langBytes += chunk.second.size();
+                langBytes += chunk.second.text.size();
                 ++n_langs;
             }