kakann · ghmo2789 · Sep 28, 2022 · Sep 19, 2022 · Sep 21, 2022 · Sep 22, 2022
diff --git a/README.md b/README.md
@@ -28,6 +28,7 @@ For the active project, check out its fork
 
 1. `pip install -r requirements.txt`
 1. `python3 -m tools.pdf2txt samples/simple1.pdf`
+1. `python3 -m tools.pdf2txt [-t {html,xml,text}] samples/equations.pdf`
 1. `python3 -m tools.pdffontsinfo samples/simple1.pdf`
 
 ## How to test

diff --git a/pdfminer/converter.py b/pdfminer/converter.py
@@ -181,8 +181,17 @@ def render(item):
             elif isinstance(item, LTImage):
                 if self.imagewriter is not None:
                     self.imagewriter.export_image(item)
+            elif isinstance(item, LTLine):
+                self.write_text("-----\n")
         if self.showpageno:
             self.write_text('Page %s\n' % ltpage.pageid)
+
+        # Objects are not stored in sorted order; the other converter classes
+        # place them by coordinate instead. Here they are sorted by y1 in
+        # descending order so the text follows the page from top to bottom;
+        # y1 is negated because a PDF page's origin is its bottom-left corner.
+        ltpage._objs.sort(key=lambda obj: -obj.y1)
+
         render(ltpage)
         self.write_text('\f')
         return
@@ -196,9 +205,6 @@ def render_image(self, name, stream):
         PDFConverter.render_image(self, name, stream)
         return
 
-    def paint_path(self, gstate, stroke, fill, evenodd, path):
-        return
-
 
 # HTMLConverter
 ##
@@ -251,7 +257,8 @@ def write_header(self):
         self.write('<html><head>\n')
         self.write(
             '<meta http-equiv="Content-Type" content="text/html; '
-            + 'charset=utf-8">\n')
+            + 'charset=utf-8">\n'
+        )
         self.write('</head><body>\n')
         return
 

diff --git a/pdfminer/layout.py b/pdfminer/layout.py
@@ -10,7 +10,7 @@
 from .utils import apply_matrix_pt
 
 
-##  IndexAssigner
+# IndexAssigner
 ##
 class IndexAssigner:
 
@@ -28,7 +28,7 @@ def run(self, obj):
         return
 
 
-##  LAParams
+# LAParams
 ##
 class LAParams:
 
@@ -54,7 +54,7 @@ def __repr__(self):
                 (self.char_margin, self.line_margin, self.word_margin, self.all_texts))
 
 
-##  LTItem
+# LTItem
 ##
 class LTItem:
 
@@ -63,7 +63,7 @@ def analyze(self, laparams):
         return
 
 
-##  LTText
+# LTText
 ##
 class LTText:
 
@@ -75,7 +75,7 @@ def get_text(self):
         raise NotImplementedError
 
 
-##  LTComponent
+# LTComponent
 ##
 class LTComponent(LTItem):
 
@@ -91,10 +91,13 @@ def __repr__(self):
     # Disable comparison.
     def __lt__(self, _):
         raise ValueError
+
     def __le__(self, _):
         raise ValueError
+
     def __gt__(self, _):
         raise ValueError
+
     def __ge__(self, _):
         raise ValueError
 
@@ -149,7 +152,7 @@ def voverlap(self, obj):
             return 0
 
 
-##  LTCurve
+# LTCurve
 ##
 class LTCurve(LTComponent):
 
@@ -163,26 +166,31 @@ def get_pts(self):
         return ','.join('%.3f,%.3f' % p for p in self.pts)
 
 
-##  LTLine
+# LTLine
 ##
 class LTLine(LTCurve):
 
     def __init__(self, linewidth, p0, p1):
         LTCurve.__init__(self, linewidth, [p0, p1])
         return
 
+    def is_compatible(self, obj):
+        """Return True unconditionally: any two lines can coexist."""
+        return True
+
 
-##  LTRect
+# LTRect
 ##
 class LTRect(LTCurve):
 
     def __init__(self, linewidth, bbox):
         (x0, y0, x1, y1) = bbox
-        LTCurve.__init__(self, linewidth, [(x0, y0), (x1, y0), (x1, y1), (x0, y1)])
+        LTCurve.__init__(self, linewidth, [
+                         (x0, y0), (x1, y0), (x1, y1), (x0, y1)])
         return
 
 
-##  LTImage
+# LTImage
 ##
 class LTImage(LTComponent):
 
@@ -205,7 +213,7 @@ def __repr__(self):
                  bbox2str(self.bbox), self.srcsize))
 
 
-##  LTAnno
+# LTAnno
 ##
 class LTAnno(LTItem, LTText):
 
@@ -217,7 +225,7 @@ def get_text(self):
         return self._text
 
 
-##  LTChar
+# LTChar
 ##
 class LTChar(LTComponent, LTText):
 
@@ -278,7 +286,7 @@ def is_compatible(self, obj):
         return True
 
 
-##  LTContainer
+# LTContainer
 ##
 class LTContainer(LTComponent):
 
@@ -308,7 +316,7 @@ def analyze(self, laparams):
         return
 
 
-##  LTExpandableContainer
+# LTExpandableContainer
 ##
 class LTExpandableContainer(LTContainer):
 
@@ -323,7 +331,7 @@ def add(self, obj):
         return
 
 
-##  LTTextContainer
+# LTTextContainer
 ##
 class LTTextContainer(LTExpandableContainer, LTText):
 
@@ -336,7 +344,7 @@ def get_text(self):
         return ''.join(obj.get_text() for obj in self if isinstance(obj, LTText))
 
 
-##  LTTextLine
+# LTTextLine
 ##
 class LTTextLine(LTTextContainer):
 
@@ -411,10 +419,10 @@ def find_neighbors(self, plane, ratio):
                      abs(obj.y1-self.y1) < d))]
 
 
-##  LTTextBox
+# LTTextBox
 ##
-##  A set of text objects that are grouped within
-##  a certain rectangular area.
+# A set of text objects that are grouped within
+# a certain rectangular area.
 ##
 class LTTextBox(LTTextContainer):
 
@@ -451,7 +459,7 @@ def get_writing_mode(self):
         return 'tb-rl'
 
 
-##  LTTextGroup
+# LTTextGroup
 ##
 class LTTextGroup(LTTextContainer):
 
@@ -483,7 +491,7 @@ def analyze(self, laparams):
         return
 
 
-##  LTLayoutContainer
+# LTLayoutContainer
 ##
 class LTLayoutContainer(LTContainer):
 
@@ -538,7 +546,7 @@ def group_objects(self, laparams, objs):
                            max(obj0.height, obj1.height) * laparams.char_margin))
 
                 if ((halign and isinstance(line, LTTextLineHorizontal)) or
-                    (valign and isinstance(line, LTTextLineVertical))):
+                        (valign and isinstance(line, LTTextLineVertical))):
                     line.add(obj1)
                 elif line is not None:
                     yield line
@@ -571,7 +579,8 @@ def group_textlines(self, laparams, lines):
         boxes = {}
         for line in lines:
             neighbors = line.find_neighbors(plane, laparams.line_margin)
-            if line not in neighbors: continue
+            if line not in neighbors:
+                continue
             members = []
             for obj1 in neighbors:
                 members.append(obj1)
@@ -586,7 +595,8 @@ def group_textlines(self, laparams, lines):
                 boxes[obj] = box
         done = set()
         for line in lines:
-            if line not in boxes: continue
+            if line not in boxes:
+                continue
             box = boxes[line]
             if box in done:
                 continue
@@ -628,8 +638,8 @@ def isany(obj1, obj2):
             return objs.difference((obj1, obj2))
 
         def key_obj(t):
-            (c,d,_,_) = t
-            return (c,d)
+            (c, d, _, _) = t
+            return (c, d)
 
         # XXX this still takes O(n^2)  :(
         dists = []
@@ -648,14 +658,14 @@ def key_obj(t):
                 dists.append((1, d, obj1, obj2))
                 continue
             if (isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or
-                isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL))):
+                    isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL))):
                 group = LTTextGroupTBRL([obj1, obj2])
             else:
                 group = LTTextGroupLRTB([obj1, obj2])
             plane.remove(obj1)
             plane.remove(obj2)
-            dists = [ (c,d,obj1,obj2) for (c,d,obj1,obj2) in dists
-                      if (obj1 in plane and obj2 in plane) ]
+            dists = [(c, d, obj1, obj2) for (c, d, obj1, obj2) in dists
+                     if (obj1 in plane and obj2 in plane)]
             for other in plane:
                 dists.append((0, dist(group, other), group, other))
             dists = csort(dists, key=key_obj)
@@ -694,7 +704,7 @@ def getkey(box):
         return
 
 
-##  LTFigure
+# LTFigure
 ##
 class LTFigure(LTLayoutContainer):
 
@@ -719,7 +729,7 @@ def analyze(self, laparams):
         return
 
 
-##  LTPage
+# LTPage
 ##
 class LTPage(LTLayoutContainer):
 

diff --git a/samples/equations.pdf b/samples/equations.pdf
diff --git a/tests/test_pdf2txt.py b/tests/test_pdf2txt.py
@@ -1,5 +1,6 @@
 import contextlib
 import io
+import re
 import unittest
 from unittest.mock import patch
 from tools.pdf2txt import main
@@ -43,3 +44,53 @@ def tests(fake_stdout):
         self.run_tests(
             tests, ['pdf2txt.py', '-t', 'html', 'samples/simple1.pdf']
         )
+
+    def test_equations_html_output(self):
+        def tests(fake_stdout):
+            # Assert that at least two black-bordered spans (lines) exist
+            self.assertRegex(
+                fake_stdout.getvalue(),
+                re.compile(
+                    '<span.*border: black 1px solid.*'
+                    '<span.*border: black 1px solid.*',
+                    re.DOTALL
+                )
+            )
+
+            # Assert that one of the equations is in the output
+            self.assertRegex(fake_stdout.getvalue(), r'3.*x.*2.*\+ 5')
+
+        self.run_tests(
+            tests, ['pdf2txt.py', '-t', 'html', 'samples/equations.pdf']
+        )
+
+    def test_equations_xml_output(self):
+        def tests(fake_stdout):
+            # Assert that '<line' occurs at least twice (the drawn lines)
+            self.assertRegex(
+                fake_stdout.getvalue(),
+                re.compile('<line.*<line', re.DOTALL)
+            )
+            # Assert that one of the equations is in the output
+            self.assertRegex(
+                fake_stdout.getvalue(),
+                re.compile('3.*x.*2.*\\+.*5', re.DOTALL)
+            )
+
+        self.run_tests(
+            tests, ['pdf2txt.py', '-t', 'xml', 'samples/equations.pdf']
+        )
+
+    def test_equations_text_output(self):
+        def tests(fake_stdout):
+            # Assert that the equation and the division line are in the output
+            self.assertIn(
+                "3x3\n\n"
+                "-----\n"
+                "3x2 + 5",
+                fake_stdout.getvalue()
+            )
+
+        self.run_tests(
+            tests, ['pdf2txt.py', 'samples/equations.pdf']
+        )