From 5c5c4102e29b98eef439bca4712ea38aa0890e0c Mon Sep 17 00:00:00 2001 From: Navin Karkera Date: Wed, 18 Dec 2024 20:43:09 +0530 Subject: [PATCH 1/4] feat: show math in plain text in library cards --- .../djangoapps/content/search/documents.py | 4 +- .../content/search/plain_text_math.py | 91 +++++++++++++++++++ requirements/edx/base.txt | 2 + requirements/edx/development.txt | 4 + requirements/edx/doc.txt | 2 + requirements/edx/kernel.in | 1 + requirements/edx/testing.txt | 2 + 7 files changed, 105 insertions(+), 1 deletion(-) create mode 100644 openedx/core/djangoapps/content/search/plain_text_math.py diff --git a/openedx/core/djangoapps/content/search/documents.py b/openedx/core/djangoapps/content/search/documents.py index 40fe4529272b..f675998d54ce 100644 --- a/openedx/core/djangoapps/content/search/documents.py +++ b/openedx/core/djangoapps/content/search/documents.py @@ -3,6 +3,7 @@ """ from __future__ import annotations +import re import logging from hashlib import blake2b @@ -14,6 +15,7 @@ from rest_framework.exceptions import NotFound from openedx.core.djangoapps.content.search.models import SearchAccess +from openedx.core.djangoapps.content.search.plain_text_math import process_mathjax from openedx.core.djangoapps.content_libraries import api as lib_api from openedx.core.djangoapps.content_tagging import api as tagging_api from openedx.core.djangoapps.xblock import api as xblock_api @@ -220,7 +222,7 @@ class implementation returns only: # Generate description from the content description = _get_description_from_block_content(block_type, content_data) if description: - block_data[Fields.description] = description + block_data[Fields.description] = process_mathjax(description) except Exception as err: # pylint: disable=broad-except log.exception(f"Failed to process index_dictionary for {block.usage_key}: {err}") diff --git a/openedx/core/djangoapps/content/search/plain_text_math.py b/openedx/core/djangoapps/content/search/plain_text_math.py new file mode 
100644
index 000000000000..14c3f1f2abd6
--- /dev/null
+++ b/openedx/core/djangoapps/content/search/plain_text_math.py
@@ -0,0 +1,115 @@
+import re
+
+import unicodeit
+
+
+class PlainTextMath:
+    """
+    Converts mathjax equations to plain text using unicodeit and some preprocessing.
+    """
+    equation_pattern = re.compile(
+        r'\[mathjaxinline\](.*?)\[\/mathjaxinline\]|\[mathjax\](.*?)\[\/mathjax\]|\\\((.*?)\\\)|\\\[(.*?)\\\]'
+    )
+    eqn_replacements = (
+        # just remove prefix `\`
+        ("\\sin", "sin"),
+        ("\\cos", "cos"),
+        ("\\tan", "tan"),
+        ("\\arcsin", "arcsin"),
+        ("\\arccos", "arccos"),
+        ("\\arctan", "arctan"),
+        ("\\cot", "cot"),
+        ("\\sec", "sec"),
+        ("\\csc", "csc"),
+    )
+    regex_replacements = (
+        (re.compile(r'\\mathbf{(.*?)}'), r"\1"),
+        # `\left`/`\right` are used for matching brackets in mathjax, should not be required in plain text.
+        # The lookahead keeps longer commands such as `\leftarrow` and `\rightarrow` intact.
+        (re.compile(r'\\(?:left|right)(?![A-Za-z])'), ""),
+    )
+
+    def _fraction_handler(self, equation: str) -> str:
+        """
+        Converts `\frac{x}{y}` to `(x/y)` while handling nested `{}`.
+
+        For example: `\frac{2}{\sqrt{1+y}}` is converted to `(2/\sqrt{1+y})`.
+
+        Args:
+            equation: string
+
+        Returns:
+            String with `\frac` replaced by normal `/` symbol.
+        """
+        start_index = equation.find("\\frac{")
+        if start_index == -1:
+            return equation
+        mid_index = equation.find("}{")
+        numerator = equation[start_index + 6:mid_index]
+        open_count = 0
+        for i, char in enumerate(equation[mid_index + 2:]):
+            if char == "{":
+                open_count += 1
+            if char == "}":
+                if open_count == 0:
+                    break
+                open_count -= 1
+        else:
+            # Invalid `\frac` format
+            return equation
+        denominator = equation[mid_index + 2:mid_index + 2 + i]
+        equation = equation[:start_index] + f"({numerator}/{denominator})" + equation[mid_index + 2 + i + 1:]
+        return equation
+
+    def _handle_replacements(self, equation: str) -> str:
+        """
+        Applies the literal `eqn_replacements` and then the `regex_replacements` to the equation.
+
+        Args:
+            equation: string
+
+        Returns:
+            Equation string with every configured replacement applied.
+        """
+        for search, replacement in self.eqn_replacements:
+            equation = equation.replace(search, replacement)
+        for pattern, replacement in self.regex_replacements:
+            equation = pattern.sub(replacement, equation)
+        return equation
+
+    def run(self, eqn_matches: re.Match) -> str:
+        """
+        Takes re.Match object and runs conversion process on the first non-empty match group.
+
+        Args:
+            eqn_matches: match produced by `equation_pattern`; each group holds the text
+                inside one of the supported delimiter pairs.
+
+        Returns:
+            Plain text version of the equation, or an empty string when the
+            delimiters enclose nothing.
+        """
+        for group in eqn_matches.groups():
+            if group:
+                group = self._fraction_handler(group)
+                group = self._handle_replacements(group)
+                return unicodeit.replace(group)
+        # Always hand `re.sub` a string, as the return annotation promises.
+        return ""
+
+
+processor = PlainTextMath()
+
+
+def process_mathjax(content: str) -> str:
+    """
+    Replaces every mathjax equation found in `content` with its plain text version.
+
+    Args:
+        content: text that may contain equations wrapped in `[mathjaxinline]`, `[mathjax]`,
+            `\\(...\\)` or `\\[...\\]` delimiters.
+
+    Returns:
+        `content` with each equation (delimiters included) replaced by plain text.
+    """
+    return processor.equation_pattern.sub(processor.run, content)
diff --git a/requirements/edx/base.txt b/requirements/edx/base.txt
index bbe8c32ce549..9837745205b0 100644
--- a/requirements/edx/base.txt
+++ b/requirements/edx/base.txt
@@ -1235,6 +1235,8 @@ unicodecsv==0.14.1
     # via
     #   -r requirements/edx/kernel.in
     #   edx-enterprise
+unicodeit==0.7.5
+    # via -r requirements/edx/kernel.in
 uritemplate==4.1.1
     # via
     #   drf-spectacular
diff --git a/requirements/edx/development.txt b/requirements/edx/development.txt
index e32139843ed6..fd7864f308ed 100644
--- a/requirements/edx/development.txt
+++ b/requirements/edx/development.txt
@@ -2178,6 +2178,10 @@ unicodecsv==0.14.1
     #   -r requirements/edx/doc.txt
     #   -r requirements/edx/testing.txt
     #   edx-enterprise
+unicodeit==0.7.5
+    # via
+    #   -r requirements/edx/doc.txt
+    #   -r requirements/edx/testing.txt
 unidiff==0.7.5
     # via -r requirements/edx/testing.txt
 uritemplate==4.1.1
diff --git a/requirements/edx/doc.txt b/requirements/edx/doc.txt
index 76005577e958..03d98e8bf52b 100644
--- a/requirements/edx/doc.txt
+++ b/requirements/edx/doc.txt
@@ -1530,6 +1530,8 @@ unicodecsv==0.14.1
     # via
     #   -r requirements/edx/base.txt
     #   edx-enterprise
+unicodeit==0.7.5
+    # via -r requirements/edx/base.txt
 uritemplate==4.1.1
     # via
     #   -r requirements/edx/base.txt
diff --git a/requirements/edx/kernel.in b/requirements/edx/kernel.in
index 60f49c5917e1..43889e609613 100644 --- a/requirements/edx/kernel.in +++ b/requirements/edx/kernel.in @@ -161,3 +161,4 @@ webob web-fragments # Provides the ability to render fragments of web pages XBlock[django] # Courseware component architecture xss-utils # https://github.com/openedx/edx-platform/pull/20633 Fix XSS via Translations +unicodeit # Converts mathjax equation to plain text by using unicode symbols diff --git a/requirements/edx/testing.txt b/requirements/edx/testing.txt index 5880b578d9a7..8b966d93ebe9 100644 --- a/requirements/edx/testing.txt +++ b/requirements/edx/testing.txt @@ -1612,6 +1612,8 @@ unicodecsv==0.14.1 # via # -r requirements/edx/base.txt # edx-enterprise +unicodeit==0.7.5 + # via -r requirements/edx/base.txt unidiff==0.7.5 # via -r requirements/edx/testing.in uritemplate==4.1.1 From ed0da92a0178723f42a4fdde4ce5212f7c836b32 Mon Sep 17 00:00:00 2001 From: Navin Karkera Date: Thu, 19 Dec 2024 11:04:21 +0530 Subject: [PATCH 2/4] refactor: fraction replacement function --- .../djangoapps/content/search/plain_text_math.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/openedx/core/djangoapps/content/search/plain_text_math.py b/openedx/core/djangoapps/content/search/plain_text_math.py index 14c3f1f2abd6..644e9826d297 100644 --- a/openedx/core/djangoapps/content/search/plain_text_math.py +++ b/openedx/core/djangoapps/content/search/plain_text_math.py @@ -45,9 +45,15 @@ def _fraction_handler(self, equation: str) -> str: if start_index == -1: return equation mid_index = equation.find("}{") + if mid_index == -1: + return equation + numerator = equation[start_index + 6:mid_index] + # shift mid_index by length of }{ chars i.e., 2 + mid_index += 2 open_count = 0 - for i, char in enumerate(equation[mid_index + 2:]): + + for i, char in enumerate(equation[mid_index:]): if char == "{": open_count += 1 if char == "}": @@ -57,8 +63,10 @@ def _fraction_handler(self, equation: str) -> str: else: # Invalid `\frac` 
format return equation - denominator = equation[mid_index + 2:mid_index + 2 + i] - equation = equation[:start_index] + f"({numerator}/{denominator})" + equation[mid_index + 2 + i + 1:] + + denominator = equation[mid_index:mid_index + i] + # Now re-create the equation with `(numerator / denominator)` + equation = equation[:start_index] + f"({numerator}/{denominator})" + equation[mid_index + i + 1:] return equation def _handle_replacements(self, equation: str) -> str: From 93a9cef0cd6742f100b64be680d3c492806c3a9b Mon Sep 17 00:00:00 2001 From: Navin Karkera Date: Fri, 20 Dec 2024 18:12:24 +0530 Subject: [PATCH 3/4] test: mathjax to plain text conversion --- .../content/search/tests/test_documents.py | 78 +++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/openedx/core/djangoapps/content/search/tests/test_documents.py b/openedx/core/djangoapps/content/search/tests/test_documents.py index 603cc8d92f5e..efeb7efbcb61 100644 --- a/openedx/core/djangoapps/content/search/tests/test_documents.py +++ b/openedx/core/djangoapps/content/search/tests/test_documents.py @@ -477,3 +477,81 @@ def test_collection_with_published_library(self): "num_children": 1 } } + + def test_mathjax_plain_text_conversion_for_search(self): + """ + Test how an HTML block with mathjax equations gets converted to plain text in search description. 
+ """ + # pylint: disable=line-too-long + block = BlockFactory.create( + parent_location=self.toy_course.location, + category="html", + display_name="Non-default HTML Block", + editor="raw", + use_latex_compiler=True, + data=( + "Simple addition: \( 2 + 3 \) |||" + " Simple subtraction: \( 5 - 2 \) |||" + " Simple multiplication: \( 4 * 6 \) |||" + " Simple division: \( 8 / 2 \) |||" + " Mixed arithmetic: \( 2 + 3 4 \) |||" + " Simple exponentiation: \[ 2^3 \] |||" + " Root extraction: \[ 16^{1/2} \] |||" + " Exponent with multiple terms: \[ (2 + 3)^2 \] |||" + " Nested exponents: \[ 2^(3^2) \] |||" + " Mixed roots: \[ 8^{1/2} 3^2 \] |||" + " Simple fraction: [mathjaxinline] 3/4 [/mathjaxinline] |||" + " Decimal to fraction conversion: [mathjaxinline] 0.75 = 3/4 [/mathjaxinline] |||" + " Mixed fractions: [mathjaxinline] 1 1/2 = 3/2 [/mathjaxinline] |||" + " Converting decimals to mixed fractions: [mathjaxinline] 2.5 = 5/2 [/mathjaxinline] |||" + " Sine, cosine, and tangent: [mathjaxinline] \\sin(x) [/mathjaxinline] [mathjaxinline] \\cos(x) [/mathjaxinline] [mathjaxinline] \\tan(x) [/mathjaxinline] |||" + " Trig identities: [mathjaxinline] \\sin(x + y) = \\sin(x) \\cos(y) + \\cos(x) \\sin(y) [/mathjaxinline] |||" + " Hyperbolic trig functions: [mathjaxinline] \\sinh(x) [/mathjaxinline] [mathjaxinline] \\cosh(x) [/mathjaxinline] |||" + " Simple derivative: [mathjax] f(x) = x^2, f'(x) = 2x [/mathjax] |||" + " Double integral: [mathjax] int\int (x + y) dxdy [/mathjax] |||" + " Partial derivatives: [mathjax] f(x,y) = xy, \frac{\partial f}{\partial x} = y [/mathjax] [mathjax] \frac{\partial f}{\partial y} = x [/mathjax] |||" + " Mean and standard deviation: [mathjax] mu = 2, \sigma = 1 [/mathjax] |||" + " Binomial probability: [mathjax] P(X = k) = (\binom{n}{k} p^k (1-p)^{n-k}) [/mathjax] |||" + " Gaussian distribution: [mathjax] N(\mu, \sigma^2) [/mathjax] |||" + " Greek letters: [mathjaxinline] \\alpha [/mathjaxinline] [mathjaxinline] \\beta [/mathjaxinline] 
[mathjaxinline] \\gamma [/mathjaxinline] |||" + " Subscripted variables: [mathjaxinline] x_i [/mathjaxinline] [mathjaxinline] y_j [/mathjaxinline] |||" + " Superscripted variables: [mathjaxinline] x^{i} [/mathjaxinline] |||" + " Not supported: \( \\begin{bmatrix} 1 & 0 \\ 0 & 1 \\end{bmatrix} = I \)" + ), + ) + # pylint: enable=line-too-long + doc = {} + doc.update(searchable_doc_for_course_block(block)) + doc.update(searchable_doc_tags(block.usage_key)) + expected_equations = [ + 'Simple addition: 2 + 3', + 'Simple subtraction: 5 − 2', + 'Simple multiplication: 4 * 6', + 'Simple division: 8 / 2', + 'Mixed arithmetic: 2 + 3 4', + 'Simple exponentiation: 2³', + 'Root extraction: 16¹^/²', + 'Exponent with multiple terms: (2 + 3)²', + 'Nested exponents: 2⁽3²)', + 'Mixed roots: 8¹^/² 3²', + 'Simple fraction: 3/4', + 'Decimal to fraction conversion: 0.75 = 3/4', + 'Mixed fractions: 1 1/2 = 3/2', + 'Converting decimals to mixed fractions: 2.5 = 5/2', + 'Sine, cosine, and tangent: sin(x) cos(x) tan(x)', + 'Trig identities: sin(x + y) = sin(x) cos(y) + cos(x) sin(y)', + 'Hyperbolic trig functions: sinh(x) cosh(x)', + "Simple derivative: f(x) = x², f'(x) = 2x", + 'Double integral: int∫ (x + y) dxdy', + 'Partial derivatives: f(x,y) = xy, rac{∂ f}{∂ x} = y rac{∂ f}{∂ y} = x', + 'Mean and standard deviation: mu = 2, σ = 1', + 'Binomial probability: P(X = k) = (inom{n}{k} pᵏ (1−p)ⁿ⁻ᵏ)', + 'Gaussian distribution: N(μ, σ²)', + 'Greek letters: α β γ', + 'Subscripted variables: xᵢ yⱼ', + 'Superscripted variables: xⁱ', + 'Not supported: \\begin{bmatrix} 1 & 0 \\ 0 & 1 \\end{bmatrix} = I', + ] + eqns = doc['description'].split('|||') + for i, eqn in enumerate(eqns): + assert eqn.strip() == expected_equations[i] From 72612db912ec028813d011202c7f60fd94e843fb Mon Sep 17 00:00:00 2001 From: Navin Karkera Date: Fri, 20 Dec 2024 18:20:25 +0530 Subject: [PATCH 4/4] fix: lint issues and tests --- .../djangoapps/content/search/documents.py | 1 - .../content/search/plain_text_math.py | 10 
++++-- .../content/search/tests/test_documents.py | 36 +++++++++---------- 3 files changed, 25 insertions(+), 22 deletions(-) diff --git a/openedx/core/djangoapps/content/search/documents.py b/openedx/core/djangoapps/content/search/documents.py index f675998d54ce..98cd7d576e0a 100644 --- a/openedx/core/djangoapps/content/search/documents.py +++ b/openedx/core/djangoapps/content/search/documents.py @@ -3,7 +3,6 @@ """ from __future__ import annotations -import re import logging from hashlib import blake2b diff --git a/openedx/core/djangoapps/content/search/plain_text_math.py b/openedx/core/djangoapps/content/search/plain_text_math.py index 644e9826d297..82f00f8d80ed 100644 --- a/openedx/core/djangoapps/content/search/plain_text_math.py +++ b/openedx/core/djangoapps/content/search/plain_text_math.py @@ -1,3 +1,7 @@ +""" +Helper class to convert mathjax equations to plain text. +""" + import re import unicodeit @@ -31,15 +35,15 @@ class PlainTextMath: def _fraction_handler(self, equation: str) -> str: """ - Converts `\frac{x}{y}` to `(x/y)` while handling nested `{}`. + Converts `\\frac{x}{y}` to `(x/y)` while handling nested `{}`. - For example: `\frac{2}{\sqrt{1+y}}` is converted to `(2/\sqrt{1+y})`. + For example: `\\frac{2}{\\sqrt{1+y}}` is converted to `(2/\\sqrt{1+y})`. Args: equation: string Returns: - String with `\frac` replaced by normal `/` symbol. + String with `\\frac` replaced by normal `/` symbol. 
""" start_index = equation.find("\\frac{") if start_index == -1: diff --git a/openedx/core/djangoapps/content/search/tests/test_documents.py b/openedx/core/djangoapps/content/search/tests/test_documents.py index efeb7efbcb61..d8d3ccd28fcb 100644 --- a/openedx/core/djangoapps/content/search/tests/test_documents.py +++ b/openedx/core/djangoapps/content/search/tests/test_documents.py @@ -490,16 +490,16 @@ def test_mathjax_plain_text_conversion_for_search(self): editor="raw", use_latex_compiler=True, data=( - "Simple addition: \( 2 + 3 \) |||" - " Simple subtraction: \( 5 - 2 \) |||" - " Simple multiplication: \( 4 * 6 \) |||" - " Simple division: \( 8 / 2 \) |||" - " Mixed arithmetic: \( 2 + 3 4 \) |||" - " Simple exponentiation: \[ 2^3 \] |||" - " Root extraction: \[ 16^{1/2} \] |||" - " Exponent with multiple terms: \[ (2 + 3)^2 \] |||" - " Nested exponents: \[ 2^(3^2) \] |||" - " Mixed roots: \[ 8^{1/2} 3^2 \] |||" + "Simple addition: \\( 2 + 3 \\) |||" + " Simple subtraction: \\( 5 - 2 \\) |||" + " Simple multiplication: \\( 4 * 6 \\) |||" + " Simple division: \\( 8 / 2 \\) |||" + " Mixed arithmetic: \\( 2 + 3 4 \\) |||" + " Simple exponentiation: \\[ 2^3 \\] |||" + " Root extraction: \\[ 16^{1/2} \\] |||" + " Exponent with multiple terms: \\[ (2 + 3)^2 \\] |||" + " Nested exponents: \\[ 2^(3^2) \\] |||" + " Mixed roots: \\[ 8^{1/2} 3^2 \\] |||" " Simple fraction: [mathjaxinline] 3/4 [/mathjaxinline] |||" " Decimal to fraction conversion: [mathjaxinline] 0.75 = 3/4 [/mathjaxinline] |||" " Mixed fractions: [mathjaxinline] 1 1/2 = 3/2 [/mathjaxinline] |||" @@ -508,15 +508,15 @@ def test_mathjax_plain_text_conversion_for_search(self): " Trig identities: [mathjaxinline] \\sin(x + y) = \\sin(x) \\cos(y) + \\cos(x) \\sin(y) [/mathjaxinline] |||" " Hyperbolic trig functions: [mathjaxinline] \\sinh(x) [/mathjaxinline] [mathjaxinline] \\cosh(x) [/mathjaxinline] |||" " Simple derivative: [mathjax] f(x) = x^2, f'(x) = 2x [/mathjax] |||" - " Double integral: [mathjax] int\int 
(x + y) dxdy [/mathjax] |||" - " Partial derivatives: [mathjax] f(x,y) = xy, \frac{\partial f}{\partial x} = y [/mathjax] [mathjax] \frac{\partial f}{\partial y} = x [/mathjax] |||" - " Mean and standard deviation: [mathjax] mu = 2, \sigma = 1 [/mathjax] |||" - " Binomial probability: [mathjax] P(X = k) = (\binom{n}{k} p^k (1-p)^{n-k}) [/mathjax] |||" - " Gaussian distribution: [mathjax] N(\mu, \sigma^2) [/mathjax] |||" + " Double integral: [mathjax] int\\int (x + y) dxdy [/mathjax] |||" + " Partial derivatives: [mathjax] f(x,y) = xy, \\frac{\\partial f}{\\partial x} = y [/mathjax] [mathjax] \\frac{\\partial f}{\\partial y} = x [/mathjax] |||" + " Mean and standard deviation: [mathjax] mu = 2, \\sigma = 1 [/mathjax] |||" + " Binomial probability: [mathjax] P(X = k) = (\\binom{n}{k} p^k (1-p)^{n-k}) [/mathjax] |||" + " Gaussian distribution: [mathjax] N(\\mu, \\sigma^2) [/mathjax] |||" " Greek letters: [mathjaxinline] \\alpha [/mathjaxinline] [mathjaxinline] \\beta [/mathjaxinline] [mathjaxinline] \\gamma [/mathjaxinline] |||" " Subscripted variables: [mathjaxinline] x_i [/mathjaxinline] [mathjaxinline] y_j [/mathjaxinline] |||" " Superscripted variables: [mathjaxinline] x^{i} [/mathjaxinline] |||" - " Not supported: \( \\begin{bmatrix} 1 & 0 \\ 0 & 1 \\end{bmatrix} = I \)" + " Not supported: \\( \\begin{bmatrix} 1 & 0 \\ 0 & 1 \\end{bmatrix} = I \\)" ), ) # pylint: enable=line-too-long @@ -543,9 +543,9 @@ def test_mathjax_plain_text_conversion_for_search(self): 'Hyperbolic trig functions: sinh(x) cosh(x)', "Simple derivative: f(x) = x², f'(x) = 2x", 'Double integral: int∫ (x + y) dxdy', - 'Partial derivatives: f(x,y) = xy, rac{∂ f}{∂ x} = y rac{∂ f}{∂ y} = x', + 'Partial derivatives: f(x,y) = xy, (∂ f/∂ x) = y (∂ f/∂ y) = x', 'Mean and standard deviation: mu = 2, σ = 1', - 'Binomial probability: P(X = k) = (inom{n}{k} pᵏ (1−p)ⁿ⁻ᵏ)', + 'Binomial probability: P(X = k) = (\\binom{n}{k} pᵏ (1−p)ⁿ⁻ᵏ)', 'Gaussian distribution: N(μ, σ²)', 'Greek letters: α β γ', 
'Subscripted variables: xᵢ yⱼ',