MigrateHTML helper for importing HTML into content

This tool is in an early alpha state and requires further work and documentation before it's ready for general deployment.
katalyst · Mar 8, 2024 · a41272f · a41272f
1 parent 89dfc0f
commit a41272f
Show file tree

Hide file tree

Showing 5 changed files with 307 additions and 0 deletions.
diff --git a/Gemfile b/Gemfile
@@ -29,6 +29,7 @@ end
 
 group :test do
   gem "capybara"
+  gem "compare-xml"
   gem "cuprite"
   gem "faker"
   gem "rails-controller-testing"

diff --git a/Gemfile.lock b/Gemfile.lock
@@ -111,6 +111,8 @@ GEM
       rack-test (>= 0.6.3)
       regexp_parser (>= 1.5, < 3.0)
       xpath (~> 3.2)
+    compare-xml (0.66)
+      nokogiri (~> 1.8)
     concurrent-ruby (1.2.3)
     connection_pool (2.4.1)
     crass (1.0.6)
@@ -369,6 +371,7 @@ PLATFORMS
 
 DEPENDENCIES
   capybara
+  compare-xml
   cuprite
   dartsass-rails
   erb_lint

diff --git a/app/actions/katalyst/content/migrate_html.rb b/app/actions/katalyst/content/migrate_html.rb
@@ -0,0 +1,133 @@
+# frozen_string_literal: true
+
+module Katalyst
+  module Content
+    class MigrateHtml
+      include ActiveModel::Model
+
+      SUPPORTED_ROOT_TAGS = %w[br h4 h5 h6 hr ol p text ul].freeze
+      SUPPORTED_TRIX_TAGS = %w[h4 h5 h6 hr ol p text ul li b br em a strong span].freeze
+
+      attr_reader :model
+
+      def self.call(model, content)
+        new.call(model, content)
+      end
+
+      def call(model, content)
+        @model = model
+        @items = []
+        @depth = 0
+
+        root = Nokogiri::HTML5.fragment(content)
+
+        root.children.each do |node|
+          case node.name
+          when "h2"
+            add_section_node(heading: node.text, heading_style: "default")
+          when "h3"
+            add_content_node(heading: node.text, heading_style: "default")
+          when "br", "h4", "h5", "h6", "hr", "ol", "p", "text", "ul"
+            append_html(node)
+          else
+            errors.add(:base, "contains invalid tag #{node.name}")
+          end
+        end
+
+        @items.each do |item|
+          if item.is_a?(Katalyst::Content::Content) && item.heading.blank?
+            item.destroy
+          end
+        end
+
+        unless @model.save
+          errors.copy!(@model)
+
+          return self
+        end
+
+        @model.items_attributes = @items.map.with_index do |item, index|
+          { id: item.id, index:, depth: item.depth }
+        end
+
+        @model.publish!
+
+        self
+      end
+
+      def success?
+        errors.empty?
+      end
+
+      private
+
+      def build(type:, **)
+        @last = item = @model.items.build(
+          type:,
+          **defaults,
+          **,
+        )
+        @items << item
+        item
+      end
+
+      def add_section_node(heading:, **)
+        @depth = 0
+        item = build(
+          type:    Katalyst::Content::Section,
+          heading:,
+          **defaults,
+          **,
+        )
+        @depth = 1
+        item
+      end
+
+      def add_content_node(**)
+        build(
+          type: Katalyst::Content::Content,
+          **defaults,
+          **,
+        )
+      end
+
+      def append_html(node)
+        content = last_content_node
+
+        node.traverse do |n|
+          errors.add(:base, "contains invalid tag #{n.name}") unless SUPPORTED_TRIX_TAGS.include?(n.name)
+        end
+
+        content.content = if content.content.present?
+          content.content.read_attribute_before_type_cast(:body) + node.to_html
+        else
+          node.to_html
+        end
+
+        content.heading ||= heading_for(content.content)
+
+        content
+      end
+
+      def last_content_node(**)
+        if @last.is_a?(Katalyst::Content::Content)
+          @last
+        else
+          add_content_node(**)
+        end
+      end
+
+      def heading_for(action_text)
+        action_text.to_plain_text.match(/([\w\s]+)/)&.match(1)&.strip
+      end
+
+      def defaults
+        {
+          background: Katalyst::Content::Config.backgrounds.first,
+          visible:    true,
+          depth:      @depth,
+        }
+      end
+    end
+  end
+end
diff --git a/spec/actions/katalyst/content/migrate_html_spec.rb b/spec/actions/katalyst/content/migrate_html_spec.rb
@@ -0,0 +1,94 @@
+# frozen_string_literal: true
+
+require "rails_helper"
+
+# rubocop:disable RSpec/ExampleLength
+RSpec.describe Katalyst::Content::MigrateHtml do
+  subject { described_class.new }
+
+  let(:page) { create(:page) }
+
+  it { is_expected.to be_success }
+
+  it "can migrate plain text" do
+    content = <<~HTML
+      Some plain text without any tags.
+    HTML
+    subject.call(page, content)
+    expect(page.published_items).to contain_exactly(
+      have_attributes(heading:       "Some plain text without any tags",
+                      heading_style: "none",
+                      content:       match_html(content)),
+    )
+  end
+
+  it "can migrate a paragraph with inline tags" do
+    content = <<~HTML
+      <p>Some <strong>rich text</strong> with <em>inline tags</em>.</p>
+    HTML
+    subject.call(page, content)
+    expect(page.published_items).to contain_exactly(
+      have_attributes(heading:       "Some rich text with inline tags",
+                      heading_style: "none",
+                      content:       match_html(content)),
+    )
+  end
+
+  it "can migrate multiple block level tags" do
+    content = <<~HTML
+      <p>Some <strong>rich text</strong> with <em>inline tags</em>.</p>
+      <p>Some <strong>more</strong> content.</p>
+    HTML
+    subject.call(page, content)
+    expect(page.published_items).to contain_exactly(
+      have_attributes(heading:       "Some rich text with inline tags",
+                      heading_style: "none",
+                      content:       match_html(content)),
+    )
+  end
+
+  it "can migrate content with titles" do
+    content = <<~HTML
+      <h3>This is a content title</h3>
+      <h4>This is a trix title</h4>
+      <p>Some <strong>more</strong> content.</p>
+      <h3>This is a new content block</h3>
+      <p>Some <strong>more</strong> content.</p>
+    HTML
+    subject.call(page, content)
+    expect(page.published_items).to contain_exactly(
+      have_attributes(heading:       "This is a content title",
+                      heading_style: "default",
+                      content:       match_html(<<~HTML),
+                        <h4>This is a trix title</h4>
+                        <p>Some <strong>more</strong> content.</p>
+                      HTML
+                     ),
+      have_attributes(heading: "This is a new content block",
+                      content: match_html(<<~HTML),
+                        <p>Some <strong>more</strong> content.</p>
+                      HTML
+                     ),
+    )
+  end
+
+  it "can migrate content with section" do
+    content = <<~HTML
+      <h2>This is a section title</h2>
+      <p>Some content.</p>
+    HTML
+    subject.call(page, content)
+    expect(page.published_items).to contain_exactly(
+      have_attributes(type:          "Katalyst::Content::Section",
+                      heading:       "This is a section title",
+                      heading_style: "default",
+                      depth:         0),
+      have_attributes(heading_style: "none",
+                      content:       match_html(<<~HTML),
+                        <p>Some content.</p>
+                      HTML
+                      depth:         1),
+    )
+  end
+end
+# rubocop:enable RSpec/ExampleLength
diff --git a/spec/support/match_html.rb b/spec/support/match_html.rb
@@ -0,0 +1,76 @@
+# frozen_string_literal: true
+
+require "compare-xml"
+require "nokogiri"
+require "rspec/matchers"
+
+class HTMLMatcher < RSpec::Matchers::BuiltIn::BaseMatcher
+  def initialize(expected_html, debug: true, **options)
+    super()
+
+    # Options documented here: https://github.com/vkononov/compare-xml
+    default_options = {
+      collapse_whitespace: true,
+      ignore_attr_order:   true,
+      ignore_comments:     true,
+    }
+
+    @options = default_options.merge(options).merge(verbose: true)
+
+    @actual        = nil
+    @expected_html = expected_html
+    @expected_doc  = Nokogiri::HTML5.fragment(expected_html)
+    @debug         = debug
+  end
+
+  # @param [Object] response object to match against
+  # @return [Boolean] `true` if response matches the expected html
+  def matches?(response)
+    case response
+    when Nokogiri::XML::Node
+      @actual_doc  = response
+      @actual_html = response.to_html
+    when ActionText::RichText
+      @actual_html = response.read_attribute_before_type_cast(:body)
+      @actual_doc  = response.body.fragment.source
+    else
+      @actual_html = response
+      @actual_doc  = Nokogiri::HTML.fragment(response)
+    end
+
+    describe_diff if @debug && !equivalent?
+
+    equivalent?
+  end
+
+  # @return [String] description of this matcher
+  def description
+    "match HTML against #{@expected_html}"
+  end
+
+  def failure_message
+    "expected '#{@expected_html}' but it was '#{@actual_html}'"
+  end
+
+  def equivalent?
+    diff.empty?
+  end
+
+  def diff
+    @diff ||= CompareXML.equivalent?(@expected_doc, @actual_doc, **@options)
+  end
+
+  def describe_diff
+    diff = @diff.first
+    expected, actual = [diff[:diff1], diff[:diff2]].map { |m| m.is_a?(String) ? m : m.to_html }
+    puts "Diff: #{expected} != #{actual}"
+  end
+end
+
+module RSpec
+  module Matchers
+    def match_html(expected_html, **options)
+      HTMLMatcher.new(expected_html, **options)
+    end
+  end
+end