Skip to content

Commit

Permalink
Merge pull request #535 from NDLANO/strip-strong-from-title
Browse files Browse the repository at this point in the history
Remove strong from titles
  • Loading branch information
gunnarvelle authored Oct 25, 2024
2 parents 482f3fc + bb06173 commit 35e0bc2
Show file tree
Hide file tree
Showing 8 changed files with 304 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,8 @@ class ArticleApiProperties extends BaseProps {
ResourceType.H5P.toString -> H5PAddress
)

def InlineHtmlTags: Set[String] = Set("code", "em", "span", "strong", "sub", "sup")
def IntroductionHtmlTags: Set[String] = InlineHtmlTags ++ Set("br", "p")
def InlineHtmlTags: Set[String] = Set("code", "em", "span", "sub", "sup")
def IntroductionHtmlTags: Set[String] = InlineHtmlTags ++ Set("br", "p", "strong")

private def H5PAddress: String = propOrElse(
"NDLA_H5P_ADDRESS",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
/*
* Part of NDLA article-api
* Copyright (C) 2024 NDLA
*
* See LICENSE
*/

package no.ndla.articleapi.db.migration

import io.circe.parser
import io.circe.syntax.EncoderOps
import no.ndla.common.model.domain.{ArticleContent, Title}
import no.ndla.common.model.domain.article.Article
import org.flywaydb.core.api.migration.{BaseJavaMigration, Context}
import org.jsoup.Jsoup
import org.jsoup.nodes.Element
import org.jsoup.nodes.Entities.EscapeMode
import org.postgresql.util.PGobject
import scalikejdbc.{DB, DBSession, *}

class V54__RemoveStrongFromTitle extends BaseJavaMigration {
private def countAllRows(implicit session: DBSession): Option[Long] = {
sql"select count(*) from contentdata where document is not NULL"
.map(rs => rs.long("count"))
.single()
}

private def allRows(offset: Long)(implicit session: DBSession): Seq[(Long, String)] = {
sql"select id, document, article_id from contentdata where document is not null order by id limit 1000 offset $offset"
.map(rs => {
(rs.long("id"), rs.string("document"))
})
.list()
}

private def updateRow(document: String, id: Long)(implicit session: DBSession): Int = {
val dataObject = new PGobject()
dataObject.setType("jsonb")
dataObject.setValue(document)

sql"update contentdata set document = $dataObject where id = $id"
.update()
}

override def migrate(context: Context): Unit = DB(context.getConnection)
.autoClose(false)
.withinTx { session => migrateRows(session) }

private def migrateRows(implicit session: DBSession): Unit = {
val count = countAllRows.get
var numPagesLeft = (count / 1000) + 1
var offset = 0L

while (numPagesLeft > 0) {
allRows(offset * 1000).map { case (id, document) =>
updateRow(convertArticleUpdate(document), id)
}: Unit
numPagesLeft -= 1
offset += 1
}
}

private def stringToJsoupDocument(htmlString: String): Element = {
val document = Jsoup.parseBodyFragment(htmlString)
document.outputSettings().escapeMode(EscapeMode.xhtml).prettyPrint(false)
document.select("body").first()
}

private def jsoupDocumentToString(element: Element): String = {
element.select("body").html()
}

def convertTitle(t: Title): Title = {
val doc = stringToJsoupDocument(t.title)

doc
.select("strong")
.forEach(strong => {
strong.unwrap(): Unit
})
t.copy(title = jsoupDocumentToString(doc))
}

def convertContent(c: ArticleContent): ArticleContent = {
val doc = stringToJsoupDocument(c.content)

doc
.select("h2, h3, h4")
.forEach(header => {
header.select("strong").forEach(strong => strong.unwrap(): Unit)
})
c.copy(content = jsoupDocumentToString(doc))
}

private[migration] def convertArticleUpdate(document: String): String = {
val oldArticle = parser.parse(document).flatMap(_.as[Article]).toTry.get
val titles = oldArticle.title.map(t => convertTitle(t))
val contents = oldArticle.content.map(c => convertContent(c))
val newArticle = oldArticle.copy(title = titles, content = contents)
newArticle.asJson.noSpaces
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,21 +13,21 @@ import no.ndla.articleapi.repository.ArticleRepository
import no.ndla.common.errors.{ValidationException, ValidationMessage}
import no.ndla.common.model.NDLADate
import no.ndla.common.model.domain.article.{Article, Copyright}
import no.ndla.common.model.domain._
import no.ndla.common.model.domain.*
import no.ndla.language.model.{Iso639, LanguageField}
import no.ndla.mapping.License.getLicense
import no.ndla.validation.HtmlTagRules.{allLegalTags, stringToJsoupDocument}
import no.ndla.validation.SlugValidator.validateSlug
import no.ndla.validation.TextValidator

import scala.jdk.CollectionConverters._
import scala.jdk.CollectionConverters.*
import scala.util.{Failure, Success, Try}

trait ContentValidator {
this: ArticleRepository with Props =>
this: ArticleRepository & Props =>
val contentValidator: ContentValidator

class ContentValidator() {
class ContentValidator {
private val inlineHtmlTags = props.InlineHtmlTags
private val introductionHtmlTags = props.IntroductionHtmlTags

Expand Down Expand Up @@ -73,7 +73,7 @@ trait ContentValidator {
}
}

private def validateNonEmpty(field: String, values: Seq[LanguageField[_]]): Option[ValidationMessage] = {
private def validateNonEmpty(field: String, values: Seq[LanguageField[?]]): Option[ValidationMessage] = {
if (values.isEmpty || values.forall(_.isEmpty)) {
Some(ValidationMessage(field, "Field must contain at least one entry"))
} else
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
/*
* Part of NDLA article-api
* Copyright (C) 2024 NDLA
*
* See LICENSE
*/

package no.ndla.articleapi.db.migration

import no.ndla.articleapi.{TestEnvironment, UnitSuite}
import no.ndla.common.model.domain.{ArticleContent, Title}

class V54__RemoveStrongFromTitleTest extends UnitSuite with TestEnvironment {
test("That strong are removed from title") {
val oldTitle = Title("This is a <strong>title</strong>", language = "nb")
val expectedTitle = Title("This is a title", language = "nb")

val migration = new V54__RemoveStrongFromTitle
val result = migration.convertTitle(oldTitle)
result should be(expectedTitle)
}

test("That nested strong are removed from title") {
val oldTitle = Title("This is a <strong><em>title</em></strong>", language = "nb")
val expectedTitle = Title("This is a <em>title</em>", language = "nb")

val migration = new V54__RemoveStrongFromTitle
val result = migration.convertTitle(oldTitle)
result should be(expectedTitle)
}

test("That strong are removed from title in article") {
val oldContent =
ArticleContent("<section><h2>This is a <strong>title</strong></h2><p>Some text</p></section>", language = "nb")
val expectedContent = ArticleContent("<section><h2>This is a title</h2><p>Some text</p></section>", language = "nb")

val migration = new V54__RemoveStrongFromTitle
val result = migration.convertContent(oldContent)
result should be(expectedContent)
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@ class DraftApiProperties extends BaseProps with StrictLogging {
"image-api" -> s"http://$ImageApiHost/intern"
)

def InlineHtmlTags: Set[String] = Set("code", "em", "span", "strong", "sub", "sup")
def IntroductionHtmlTags: Set[String] = InlineHtmlTags ++ Set("br", "p")
def InlineHtmlTags: Set[String] = Set("code", "em", "span", "sub", "sup")
def IntroductionHtmlTags: Set[String] = InlineHtmlTags ++ Set("br", "p", "strong")

private def BrightcoveAccountId: String = prop("NDLA_BRIGHTCOVE_ACCOUNT_ID")
private def BrightcovePlayerId: String = prop("NDLA_BRIGHTCOVE_PLAYER_ID")
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
/*
* Part of NDLA draft-api
* Copyright (C) 2024 NDLA
*
* See LICENSE
*/

package no.ndla.draftapi.db.migration

import io.circe.parser
import io.circe.syntax.EncoderOps
import no.ndla.common.model.domain.{ArticleContent, Title}
import no.ndla.common.model.domain.draft.Draft
import org.flywaydb.core.api.migration.{BaseJavaMigration, Context}
import org.jsoup.Jsoup
import org.jsoup.nodes.Element
import org.jsoup.nodes.Entities.EscapeMode
import org.postgresql.util.PGobject
import scalikejdbc.{DB, DBSession, *}

class V65__RemoveStrongFromTitle extends BaseJavaMigration {
private def countAllRows(implicit session: DBSession): Option[Long] = {
sql"select count(*) from articledata where document is not NULL"
.map(rs => rs.long("count"))
.single()
}

private def allRows(offset: Long)(implicit session: DBSession): Seq[(Long, String)] = {
sql"select id, document, article_id from articledata where document is not null order by id limit 1000 offset $offset"
.map(rs => {
(rs.long("id"), rs.string("document"))
})
.list()
}

private def updateRow(document: String, id: Long)(implicit session: DBSession): Int = {
val dataObject = new PGobject()
dataObject.setType("jsonb")
dataObject.setValue(document)

sql"update articledata set document = $dataObject where id = $id"
.update()
}

override def migrate(context: Context): Unit = DB(context.getConnection)
.autoClose(false)
.withinTx { session => migrateRows(session) }

private def migrateRows(implicit session: DBSession): Unit = {
val count = countAllRows.get
var numPagesLeft = (count / 1000) + 1
var offset = 0L

while (numPagesLeft > 0) {
allRows(offset * 1000).map { case (id, document) =>
updateRow(convertArticleUpdate(document), id)
}: Unit
numPagesLeft -= 1
offset += 1
}
}

private def stringToJsoupDocument(htmlString: String): Element = {
val document = Jsoup.parseBodyFragment(htmlString)
document.outputSettings().escapeMode(EscapeMode.xhtml).prettyPrint(false)
document.select("body").first()
}

private def jsoupDocumentToString(element: Element): String = {
element.select("body").html()
}

def convertTitle(t: Title): Title = {
val doc = stringToJsoupDocument(t.title)

doc
.select("strong")
.forEach(strong => {
strong.unwrap(): Unit
})
t.copy(title = jsoupDocumentToString(doc))
}

def convertContent(c: ArticleContent): ArticleContent = {
val doc = stringToJsoupDocument(c.content)

doc
.select("h2, h3, h4")
.forEach(header => {
header.select("strong").forEach(strong => strong.unwrap(): Unit)
})
c.copy(content = jsoupDocumentToString(doc))
}

private[migration] def convertArticleUpdate(document: String): String = {
val oldArticle = parser.parse(document).flatMap(_.as[Draft]).toTry.get

val titles = oldArticle.title.map(t => convertTitle(t))
val contents = oldArticle.content.map(c => convertContent(c))

val newArticle = oldArticle.copy(title = titles, content = contents)
newArticle.asJson.noSpaces
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ package no.ndla.draftapi.validation

import no.ndla.common.errors.{ValidationException, ValidationMessage}
import no.ndla.common.model.NDLADate
import no.ndla.common.model.domain._
import no.ndla.common.model.domain.draft._
import no.ndla.common.model.domain.*
import no.ndla.common.model.domain.draft.*
import no.ndla.draftapi.Props
import no.ndla.draftapi.integration.ArticleApiClient
import no.ndla.draftapi.model.api.{ContentId, NotFoundException, UpdatedArticle}
Expand All @@ -21,18 +21,18 @@ import no.ndla.mapping.License.getLicense
import no.ndla.network.tapir.auth.TokenUser
import no.ndla.validation.HtmlTagRules.{allLegalTags, stringToJsoupDocument}
import no.ndla.validation.SlugValidator.validateSlug
import no.ndla.validation._
import no.ndla.validation.*
import scalikejdbc.ReadOnlyAutoSession

import scala.jdk.CollectionConverters._
import scala.jdk.CollectionConverters.*
import scala.util.{Failure, Success, Try}

trait ContentValidator {
this: DraftRepository with ConverterService with ArticleApiClient with Props =>
this: DraftRepository & ConverterService & ArticleApiClient & Props =>
val contentValidator: ContentValidator
val importValidator: ContentValidator

class ContentValidator() {
class ContentValidator {
import props.{BrightcoveVideoScriptUrl, H5PResizerScriptUrl, NRKVideoScriptUrl}
private val inlineHtmlTags = props.InlineHtmlTags
private val introductionHtmlTags = props.IntroductionHtmlTags
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/*
* Part of NDLA draft-api
* Copyright (C) 2024 NDLA
*
* See LICENSE
*/

package no.ndla.draftapi.db.migration

import no.ndla.common.model.domain.{ArticleContent, Title}
import no.ndla.draftapi.{TestEnvironment, UnitSuite}

class V65__RemoveStrongFromTitleTest extends UnitSuite with TestEnvironment {
test("That strong are removed from title") {
val oldTitle = Title("This is a <strong>title</strong>", language = "nb")
val expectedTitle = Title("This is a title", language = "nb")

val migration = new V65__RemoveStrongFromTitle
val result = migration.convertTitle(oldTitle)
result should be(expectedTitle)
}

test("That nested strong are removed from title") {
val oldTitle = Title("This is a <strong><em>title</em></strong>", language = "nb")
val expectedTitle = Title("This is a <em>title</em>", language = "nb")

val migration = new V65__RemoveStrongFromTitle
val result = migration.convertTitle(oldTitle)
result should be(expectedTitle)
}

test("That strong are removed from title in article") {
val oldContent =
ArticleContent("<section><h2>This is a <strong>title</strong></h2><p>Some text</p></section>", language = "nb")
val expectedContent = ArticleContent("<section><h2>This is a title</h2><p>Some text</p></section>", language = "nb")

val migration = new V65__RemoveStrongFromTitle
val result = migration.convertContent(oldContent)
result should be(expectedContent)
}
}

0 comments on commit 35e0bc2

Please sign in to comment.