Merge pull request #131 from carpentries/workaround-html-comment-img-130

avoid errors with images embedded in comments
carpentries · Aug 11, 2023 · 10fe819 · 10fe819
2 parents dbc4028 + 8b1da0f
commit 10fe819
Show file tree

Hide file tree

Showing 5 changed files with 123 additions and 11 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,19 +1,22 @@
-# pegboard 0.5.3 (unreleased)
+# pegboard 0.5.3 (2023-07-08)
 
 BUG FIX
 -------
 
-* `$move_objectives()` and `$move_questions()` methods no longer place these
-  blocks as the _second_ element in the markdown. This was originally
-  implemented when we thought {dovetail} would be our solution to the lesson 
-  infrastructure (and thus needed a setup chunk before any blocks).
-* liquid-formatted links with markdown inside them are now parsed correctly.
-  This leads lessons to be more accurately transitioned to use {sandpaper}
-  (reported: @uschille,
+* `$validate_links()` no longer throws an error when there are HTML images
+  embedded in comments (reported: @beastyblacksmith, #130; fixed: @zkamvar,
+  #131)
+* (transition) `$move_objectives()` and `$move_questions()` methods no longer
+  place these blocks as the _second_ element in the markdown. This was
+  originally implemented when we thought {dovetail} would be our solution to
+  the lesson infrastructure (and thus needed a setup chunk before any blocks).
+* (transition) liquid-formatted links with markdown inside them are now parsed
+  correctly. This leads lessons to be more accurately transitioned to use
+  {sandpaper} (reported: @uschille,
   https://github.com/carpentries/lesson-transition/issues/46; fixed: @zkamvar,
   #121)
-- images with kramdown attributes that are on a new line are now more accurately
-  transitioned to use {sandpaper} (reported: @uschille,
+- (transition) images with kramdown attributes that are on a new line are now
+  more accurately transitioned to use {sandpaper} (reported: @uschille,
   https://github.com/carpentries/lesson-transition/issues/46; fixed: @zkamvar,
   #121)
 

diff --git a/R/get_images.R b/R/get_images.R
@@ -5,12 +5,40 @@
 #'   [process_images()] to add the alt attribute and extract images from HTML
 #'   blocks. `FALSE` will present the nodes as found by XPath search.
 #' @return an xml_nodelist
+#' @details Markdown users can write images as either markdown or HTML. If they
+#' write images as HTML, then the commonmark XML parser recognises these as
+#' generic "HTML blocks" and they can't be found by just searching for
+#' `.//md:image`. This function searches both `md:html_block` and
+#' `md:html_inline` for image content that it can extract for downstream
+#' analysis.
+#'
 #' @keywords internal
+#' @examples
+#' tmp <- tempfile()
+#' on.exit(unlink(tmp))
+#' txt <- '
+#' ![a kitten](https://placekitten.com/200/200){alt="a pretty kitten"}
+#'
+#' <!-- an html image of a kitten -->
+#' <img src="https://placekitten.com/200/200">
+#'
+#' an inline html image of a kitten <img src="https://placekitten.com/50/50">
+#' '
+#' writeLines(txt, tmp)
+#' ep <- Episode$new(tmp)
+#' ep$show()
+#' # without process = TRUE, images in HTML elements are not converted
+#' ep$get_images() 
+#' # setting process = TRUE will extract the HTML elements for analysis 
+#' # (e.g to detect alt text)
+#' ep$get_images(process = TRUE)
 get_images <- function(yrn, process = TRUE) {
   img    <- ".//md:image"
   hblock <- ".//md:html_block[contains(text(), '<img')]"
+  # comment blocks should not be included in the image processing. 
+  nocomment <- "[not(starts-with(normalize-space(text()), '<!--'))]"
   hline  <- ".//md:html_inline[starts-with(text(), '<img')]"
-  xpath  <- glue::glue("{img} | {hblock} | {hline}")
+  xpath  <- glue::glue("{img} | {hblock}{nocomment} | {hline}")
   images <- xml2::xml_find_all(yrn$body, xpath, yrn$ns)
   images <- if (process) process_images(images) else images
   images

diff --git a/man/get_images.Rd b/man/get_images.Rd
diff --git a/man/pegboard-package.Rd b/man/pegboard-package.Rd
diff --git a/tests/testthat/test-get_images.R b/tests/testthat/test-get_images.R
@@ -29,3 +29,55 @@ test_that("markdown images will process alt text appropriately", {
   expect_equal(xml2::xml_attr(images, "alt"), expected_alts)
 })
 
+
+test_that("HTML images in comments do not error", {
+  # see https://github.com/carpentries/pegboard/issues/130
+
+  tmp <- withr::local_tempfile()
+
+  all_comment <- "
+  some text and 
+
+  <!-- <img src='whoops.png'> -->
+  "
+  writeLines(all_comment, tmp)
+  ep <- pegboard::Episode$new(tmp)
+  res <- ep$validate_links(warn = FALSE)
+  expect_null(res)
+})
+
+
+test_that("HTML images surrounded by comments are processed. ", {
+
+  tmp <- withr::local_tempfile()
+
+  some_comment <- "
+There is only one HTML image here and it will be processed.
+NOTE: each comment block is parsed as an individual HTML block,
+and the numbers in this example help us understand that fact.
+
+<!-- <img src='whoops.png'> ONE -->
+<!-- <img src='whoops.png'> 
+       <img src='whoops.png'> 
+   TWO
+   -->
+   <!-- <img src='whoops.png'> THREE -->
+  <img src='okay.png'> <!-- <img src='whoops.png'> -->
+
+  <!-- <img src='whoops.png'> FOUR -->
+  "
+  writeLines(some_comment, tmp)
+  ep <- pegboard::Episode$new(tmp)
+  res <- ep$validate_links(warn = FALSE)
+  expect_equal(nrow(res), 1L)
+  # There are four HTML blocks 
+  blocks <- xml2::xml_find_all(ep$body, ".//md:html_block", ns = ep$ns)
+  expect_length(blocks, 4L)
+  # when we extract the number elements, they are what we expect.
+  expect_equal(gsub("[^A-Z]", "", xml2::xml_text(blocks)), 
+    c("ONE", "TWO", "THREE", "FOUR"))
+
+})
+
+
+