class PlainTextMath:
    """
    Converts mathjax equations to plain text using unicodeit and some preprocessing.

    The preprocessing rewrites constructs that are handled here rather than by
    unicodeit: `\\frac{x}{y}` becomes `(x/y)`, the leading backslash is removed
    from trigonometric function names, and purely presentational commands
    (`\\left`, `\\right`, `\\mathbf{...}`) are stripped.
    """
    # One alternative per supported delimiter pair: [mathjaxinline]..[/mathjaxinline],
    # [mathjax]..[/mathjax], \(..\) and \[..\]. Each alternative has its own capture
    # group, so at most one group is populated for any given match.
    equation_pattern = re.compile(
        r'\[mathjaxinline\](.*?)\[\/mathjaxinline\]|\[mathjax\](.*?)\[\/mathjax\]|\\\((.*?)\\\)|\\\[(.*?)\\\]'
    )
    eqn_replacements = (
        # just remove prefix `\`
        ("\\sin", "sin"),
        ("\\cos", "cos"),
        ("\\tan", "tan"),
        ("\\arcsin", "arcsin"),
        ("\\arccos", "arccos"),
        ("\\arctan", "arctan"),
        ("\\cot", "cot"),
        ("\\sec", "sec"),
        ("\\csc", "csc"),
    )
    regex_replacements = (
        (re.compile(r'\\mathbf{(.*?)}'), r"\1"),
        # `\left` / `\right` are used for matching brackets in mathjax and should not be
        # required in plain text. The lookahead keeps longer command names that merely
        # start with the same letters (`\leftarrow`, `\rightarrow`, ...) intact.
        (re.compile(r'\\(?:left|right)(?![A-Za-z])'), ""),
    )
    frac_prefix = "\\frac{"

    @staticmethod
    def _find_closing_brace(text: str, start: int) -> int:
        """
        Finds the `}` that closes a brace group whose content begins at `start`.

        Nested `{...}` pairs are balanced and any character escaped with a
        backslash (e.g. `\\{` or `\\}`) is skipped.

        Args:
            text: string to scan
            start: index of the first character inside the brace group

        Returns:
            Index of the closing `}` or -1 if the group is never closed.
        """
        depth = 0
        index = start
        while index < len(text):
            char = text[index]
            if char == "\\":
                # Skip the escaped character so `\{` / `\}` are not counted as braces.
                index += 2
                continue
            if char == "{":
                depth += 1
            elif char == "}":
                if depth == 0:
                    return index
                depth -= 1
            index += 1
        return -1

    def _fraction_handler(self, equation: str) -> str:
        """
        Converts every `\\frac{x}{y}` to `(x/y)` while handling nested `{}`.

        For example: `\\frac{2}{\\sqrt{1+y}}` is converted to `(2/\\sqrt{1+y})`.
        Fractions nested inside a numerator or denominator are converted too.
        A malformed `\\frac` (missing or unclosed group) is left untouched.

        Args:
            equation: string

        Returns:
            String with `\\frac` replaced by normal `/` symbol.
        """
        search_from = 0
        while True:
            start_index = equation.find(self.frac_prefix, search_from)
            if start_index == -1:
                return equation

            numerator_start = start_index + len(self.frac_prefix)
            numerator_end = self._find_closing_brace(equation, numerator_start)
            denominator_end = -1
            if numerator_end != -1:
                # The denominator group must follow the numerator; whitespace between
                # the two groups is tolerated.
                brace_index = numerator_end + 1
                while brace_index < len(equation) and equation[brace_index].isspace():
                    brace_index += 1
                if equation.startswith("{", brace_index):
                    denominator_start = brace_index + 1
                    denominator_end = self._find_closing_brace(equation, denominator_start)

            if denominator_end == -1:
                # Invalid `\frac` format: leave it as it is and look for the next one.
                search_from = numerator_start
                continue

            numerator = equation[numerator_start:numerator_end]
            denominator = equation[denominator_start:denominator_end]
            # Now re-create the equation with `(numerator/denominator)`
            equation = f"{equation[:start_index]}({numerator}/{denominator}){equation[denominator_end + 1:]}"
            # Rescan from the same position so that fractions nested in the numerator
            # or denominator are converted on the following iterations.
            search_from = start_index

    def _handle_replacements(self, equation: str) -> str:
        """
        Makes a bunch of replacements in equation string.

        Literal replacements from `eqn_replacements` are applied first, followed
        by the regular expression substitutions from `regex_replacements`.
        """
        for target, replacement in self.eqn_replacements:
            equation = equation.replace(target, replacement)
        for pattern, replacement in self.regex_replacements:
            equation = pattern.sub(replacement, equation)
        return equation

    def run(self, eqn_matches: re.Match) -> str:
        """
        Takes re.Match object and runs conversion process on the matched equation.

        Args:
            eqn_matches: match produced by `equation_pattern`

        Returns:
            Plain text version of the equation, or an empty string when the
            delimiters enclose nothing.
        """
        # Only the group that belongs to the matched delimiter pair can be non-empty.
        equation = next((group for group in eqn_matches.groups() if group), None)
        if equation is None:
            return ""
        equation = self._fraction_handler(equation)
        equation = self._handle_replacements(equation)
        return unicodeit.replace(equation)


processor = PlainTextMath()


def process_mathjax(content: str) -> str:
    """
    Replaces every mathjax equation found in `content` with its plain text version.

    Text outside of the supported equation delimiters is returned unchanged and
    the delimiters themselves are dropped from the result.
    """
    return processor.equation_pattern.sub(processor.run, content)
+ """ + # pylint: disable=line-too-long + block = BlockFactory.create( + parent_location=self.toy_course.location, + category="html", + display_name="Non-default HTML Block", + editor="raw", + use_latex_compiler=True, + data=( + "Simple addition: \\( 2 + 3 \\) |||" + " Simple subtraction: \\( 5 - 2 \\) |||" + " Simple multiplication: \\( 4 * 6 \\) |||" + " Simple division: \\( 8 / 2 \\) |||" + " Mixed arithmetic: \\( 2 + 3 4 \\) |||" + " Simple exponentiation: \\[ 2^3 \\] |||" + " Root extraction: \\[ 16^{1/2} \\] |||" + " Exponent with multiple terms: \\[ (2 + 3)^2 \\] |||" + " Nested exponents: \\[ 2^(3^2) \\] |||" + " Mixed roots: \\[ 8^{1/2} 3^2 \\] |||" + " Simple fraction: [mathjaxinline] 3/4 [/mathjaxinline] |||" + " Decimal to fraction conversion: [mathjaxinline] 0.75 = 3/4 [/mathjaxinline] |||" + " Mixed fractions: [mathjaxinline] 1 1/2 = 3/2 [/mathjaxinline] |||" + " Converting decimals to mixed fractions: [mathjaxinline] 2.5 = 5/2 [/mathjaxinline] |||" + " Sine, cosine, and tangent: [mathjaxinline] \\sin(x) [/mathjaxinline] [mathjaxinline] \\cos(x) [/mathjaxinline] [mathjaxinline] \\tan(x) [/mathjaxinline] |||" + " Trig identities: [mathjaxinline] \\sin(x + y) = \\sin(x) \\cos(y) + \\cos(x) \\sin(y) [/mathjaxinline] |||" + " Hyperbolic trig functions: [mathjaxinline] \\sinh(x) [/mathjaxinline] [mathjaxinline] \\cosh(x) [/mathjaxinline] |||" + " Simple derivative: [mathjax] f(x) = x^2, f'(x) = 2x [/mathjax] |||" + " Double integral: [mathjax] int\\int (x + y) dxdy [/mathjax] |||" + " Partial derivatives: [mathjax] f(x,y) = xy, \\frac{\\partial f}{\\partial x} = y [/mathjax] [mathjax] \\frac{\\partial f}{\\partial y} = x [/mathjax] |||" + " Mean and standard deviation: [mathjax] mu = 2, \\sigma = 1 [/mathjax] |||" + " Binomial probability: [mathjax] P(X = k) = (\\binom{n}{k} p^k (1-p)^{n-k}) [/mathjax] |||" + " Gaussian distribution: [mathjax] N(\\mu, \\sigma^2) [/mathjax] |||" + " Greek letters: [mathjaxinline] \\alpha [/mathjaxinline] [mathjaxinline] 
\\beta [/mathjaxinline] [mathjaxinline] \\gamma [/mathjaxinline] |||" + " Subscripted variables: [mathjaxinline] x_i [/mathjaxinline] [mathjaxinline] y_j [/mathjaxinline] |||" + " Superscripted variables: [mathjaxinline] x^{i} [/mathjaxinline] |||" + " Not supported: \\( \\begin{bmatrix} 1 & 0 \\ 0 & 1 \\end{bmatrix} = I \\)" + ), + ) + # pylint: enable=line-too-long + doc = {} + doc.update(searchable_doc_for_course_block(block)) + doc.update(searchable_doc_tags(block.usage_key)) + expected_equations = [ + 'Simple addition: 2 + 3', + 'Simple subtraction: 5 − 2', + 'Simple multiplication: 4 * 6', + 'Simple division: 8 / 2', + 'Mixed arithmetic: 2 + 3 4', + 'Simple exponentiation: 2³', + 'Root extraction: 16¹^/²', + 'Exponent with multiple terms: (2 + 3)²', + 'Nested exponents: 2⁽3²)', + 'Mixed roots: 8¹^/² 3²', + 'Simple fraction: 3/4', + 'Decimal to fraction conversion: 0.75 = 3/4', + 'Mixed fractions: 1 1/2 = 3/2', + 'Converting decimals to mixed fractions: 2.5 = 5/2', + 'Sine, cosine, and tangent: sin(x) cos(x) tan(x)', + 'Trig identities: sin(x + y) = sin(x) cos(y) + cos(x) sin(y)', + 'Hyperbolic trig functions: sinh(x) cosh(x)', + "Simple derivative: f(x) = x², f'(x) = 2x", + 'Double integral: int∫ (x + y) dxdy', + 'Partial derivatives: f(x,y) = xy, (∂ f/∂ x) = y (∂ f/∂ y) = x', + 'Mean and standard deviation: mu = 2, σ = 1', + 'Binomial probability: P(X = k) = (\\binom{n}{k} pᵏ (1−p)ⁿ⁻ᵏ)', + 'Gaussian distribution: N(μ, σ²)', + 'Greek letters: α β γ', + 'Subscripted variables: xᵢ yⱼ', + 'Superscripted variables: xⁱ', + 'Not supported: \\begin{bmatrix} 1 & 0 \\ 0 & 1 \\end{bmatrix} = I', + ] + eqns = doc['description'].split('|||') + for i, eqn in enumerate(eqns): + assert eqn.strip() == expected_equations[i] diff --git a/requirements/edx/base.txt b/requirements/edx/base.txt index bbe8c32ce54..9837745205b 100644 --- a/requirements/edx/base.txt +++ b/requirements/edx/base.txt @@ -1235,6 +1235,8 @@ unicodecsv==0.14.1 # via # -r requirements/edx/kernel.in # 
edx-enterprise +unicodeit==0.7.5 + # via -r requirements/edx/kernel.in uritemplate==4.1.1 # via # drf-spectacular diff --git a/requirements/edx/development.txt b/requirements/edx/development.txt index e32139843ed..fd7864f308e 100644 --- a/requirements/edx/development.txt +++ b/requirements/edx/development.txt @@ -2178,6 +2178,10 @@ unicodecsv==0.14.1 # -r requirements/edx/doc.txt # -r requirements/edx/testing.txt # edx-enterprise +unicodeit==0.7.5 + # via + # -r requirements/edx/doc.txt + # -r requirements/edx/testing.txt unidiff==0.7.5 # via -r requirements/edx/testing.txt uritemplate==4.1.1 diff --git a/requirements/edx/doc.txt b/requirements/edx/doc.txt index 76005577e95..03d98e8bf52 100644 --- a/requirements/edx/doc.txt +++ b/requirements/edx/doc.txt @@ -1530,6 +1530,8 @@ unicodecsv==0.14.1 # via # -r requirements/edx/base.txt # edx-enterprise +unicodeit==0.7.5 + # via -r requirements/edx/base.txt uritemplate==4.1.1 # via # -r requirements/edx/base.txt diff --git a/requirements/edx/kernel.in b/requirements/edx/kernel.in index 60f49c5917e..43889e60961 100644 --- a/requirements/edx/kernel.in +++ b/requirements/edx/kernel.in @@ -161,3 +161,4 @@ webob web-fragments # Provides the ability to render fragments of web pages XBlock[django] # Courseware component architecture xss-utils # https://github.com/openedx/edx-platform/pull/20633 Fix XSS via Translations +unicodeit # Converts mathjax equation to plain text by using unicode symbols diff --git a/requirements/edx/testing.txt b/requirements/edx/testing.txt index 5880b578d9a..8b966d93ebe 100644 --- a/requirements/edx/testing.txt +++ b/requirements/edx/testing.txt @@ -1612,6 +1612,8 @@ unicodecsv==0.14.1 # via # -r requirements/edx/base.txt # edx-enterprise +unicodeit==0.7.5 + # via -r requirements/edx/base.txt unidiff==0.7.5 # via -r requirements/edx/testing.in uritemplate==4.1.1