Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Printing equation division lines in pdf2txt text output #5

Merged
merged 9 commits into from
Sep 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ For the active project, check out its fork

1. `pip install -r requirements.txt`
1. `python3 -m tools.pdf2txt samples/simple1.pdf`
1. `python3 -m tools.pdf2txt [-t html|xml|text] samples/equations.pdf`
1. `python3 -m tools.pdffontsinfo samples/simple1.pdf`

## How to test
Expand Down
15 changes: 11 additions & 4 deletions pdfminer/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,8 +181,17 @@ def render(item):
elif isinstance(item, LTImage):
if self.imagewriter is not None:
self.imagewriter.export_image(item)
elif isinstance(item, LTLine):
self.write_text("-----\n")
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The line length is currently fixed, so if an equation is very wide, the printed line will not scale with its size.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, this is because it is difficult to know how long the line should be in the console. We do have access to how wide the line was in the PDF, but converting that width into a number of characters in the console is really tricky and tedious. So for now it stays like this.

if self.showpageno:
self.write_text('Page %s\n' % ltpage.pageid)

# The objs are not sorted; the other converter classes instead render them
# using their coordinates. Here we sort the objs by their y coordinate to
# replicate the order of the elements. The y coordinate is negated because
# the origin of a PDF page is at the bottom left.
ltpage._objs.sort(key=lambda obj: -obj.y1)

render(ltpage)
self.write_text('\f')
return
Expand All @@ -196,9 +205,6 @@ def render_image(self, name, stream):
PDFConverter.render_image(self, name, stream)
return

def paint_path(self, gstate, stroke, fill, evenodd, path):
return


# HTMLConverter
##
Expand Down Expand Up @@ -251,7 +257,8 @@ def write_header(self):
self.write('<html><head>\n')
self.write(
'<meta http-equiv="Content-Type" content="text/html; '
+ 'charset=utf-8">\n')
+ 'charset=utf-8">\n'
)
self.write('</head><body>\n')
return

Expand Down
72 changes: 41 additions & 31 deletions pdfminer/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from .utils import apply_matrix_pt


## IndexAssigner
# IndexAssigner
##
class IndexAssigner:

Expand All @@ -28,7 +28,7 @@ def run(self, obj):
return


## LAParams
# LAParams
##
class LAParams:

Expand All @@ -54,7 +54,7 @@ def __repr__(self):
(self.char_margin, self.line_margin, self.word_margin, self.all_texts))


## LTItem
# LTItem
##
class LTItem:

Expand All @@ -63,7 +63,7 @@ def analyze(self, laparams):
return


## LTText
# LTText
##
class LTText:

Expand All @@ -75,7 +75,7 @@ def get_text(self):
raise NotImplementedError


## LTComponent
# LTComponent
##
class LTComponent(LTItem):

Expand All @@ -91,10 +91,13 @@ def __repr__(self):
# Disable comparison.
def __lt__(self, _):
raise ValueError

def __le__(self, _):
raise ValueError

def __gt__(self, _):
raise ValueError

def __ge__(self, _):
raise ValueError

Expand Down Expand Up @@ -149,7 +152,7 @@ def voverlap(self, obj):
return 0


## LTCurve
# LTCurve
##
class LTCurve(LTComponent):

Expand All @@ -163,26 +166,31 @@ def get_pts(self):
return ','.join('%.3f,%.3f' % p for p in self.pts)


## LTLine
# LTLine
##
class LTLine(LTCurve):
    """A curve defined by exactly two points, p0 and p1."""

    def __init__(self, linewidth, p0, p1):
        # Hand the two end points to LTCurve as its point list.
        endpoints = [p0, p1]
        LTCurve.__init__(self, linewidth, endpoints)

    def is_compatible(self, obj):
        """Report whether this line can coexist with ``obj``.

        The argument is not inspected; the answer is always True.
        """
        return True


## LTRect
# LTRect
##
class LTRect(LTCurve):

def __init__(self, linewidth, bbox):
(x0, y0, x1, y1) = bbox
LTCurve.__init__(self, linewidth, [(x0, y0), (x1, y0), (x1, y1), (x0, y1)])
LTCurve.__init__(self, linewidth, [
(x0, y0), (x1, y0), (x1, y1), (x0, y1)])
return


## LTImage
# LTImage
##
class LTImage(LTComponent):

Expand All @@ -205,7 +213,7 @@ def __repr__(self):
bbox2str(self.bbox), self.srcsize))


## LTAnno
# LTAnno
##
class LTAnno(LTItem, LTText):

Expand All @@ -217,7 +225,7 @@ def get_text(self):
return self._text


## LTChar
# LTChar
##
class LTChar(LTComponent, LTText):

Expand Down Expand Up @@ -278,7 +286,7 @@ def is_compatible(self, obj):
return True


## LTContainer
# LTContainer
##
class LTContainer(LTComponent):

Expand Down Expand Up @@ -308,7 +316,7 @@ def analyze(self, laparams):
return


## LTExpandableContainer
# LTExpandableContainer
##
class LTExpandableContainer(LTContainer):

Expand All @@ -323,7 +331,7 @@ def add(self, obj):
return


## LTTextContainer
# LTTextContainer
##
class LTTextContainer(LTExpandableContainer, LTText):

Expand All @@ -336,7 +344,7 @@ def get_text(self):
return ''.join(obj.get_text() for obj in self if isinstance(obj, LTText))


## LTTextLine
# LTTextLine
##
class LTTextLine(LTTextContainer):

Expand Down Expand Up @@ -411,10 +419,10 @@ def find_neighbors(self, plane, ratio):
abs(obj.y1-self.y1) < d))]


## LTTextBox
# LTTextBox
##
## A set of text objects that are grouped within
## a certain rectangular area.
# A set of text objects that are grouped within
# a certain rectangular area.
##
class LTTextBox(LTTextContainer):

Expand Down Expand Up @@ -451,7 +459,7 @@ def get_writing_mode(self):
return 'tb-rl'


## LTTextGroup
# LTTextGroup
##
class LTTextGroup(LTTextContainer):

Expand Down Expand Up @@ -483,7 +491,7 @@ def analyze(self, laparams):
return


## LTLayoutContainer
# LTLayoutContainer
##
class LTLayoutContainer(LTContainer):

Expand Down Expand Up @@ -538,7 +546,7 @@ def group_objects(self, laparams, objs):
max(obj0.height, obj1.height) * laparams.char_margin))

if ((halign and isinstance(line, LTTextLineHorizontal)) or
(valign and isinstance(line, LTTextLineVertical))):
(valign and isinstance(line, LTTextLineVertical))):
line.add(obj1)
elif line is not None:
yield line
Expand Down Expand Up @@ -571,7 +579,8 @@ def group_textlines(self, laparams, lines):
boxes = {}
for line in lines:
neighbors = line.find_neighbors(plane, laparams.line_margin)
if line not in neighbors: continue
if line not in neighbors:
continue
members = []
for obj1 in neighbors:
members.append(obj1)
Expand All @@ -586,7 +595,8 @@ def group_textlines(self, laparams, lines):
boxes[obj] = box
done = set()
for line in lines:
if line not in boxes: continue
if line not in boxes:
continue
box = boxes[line]
if box in done:
continue
Expand Down Expand Up @@ -628,8 +638,8 @@ def isany(obj1, obj2):
return objs.difference((obj1, obj2))

def key_obj(t):
(c,d,_,_) = t
return (c,d)
(c, d, _, _) = t
return (c, d)

# XXX this still takes O(n^2) :(
dists = []
Expand All @@ -648,14 +658,14 @@ def key_obj(t):
dists.append((1, d, obj1, obj2))
continue
if (isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or
isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL))):
isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL))):
group = LTTextGroupTBRL([obj1, obj2])
else:
group = LTTextGroupLRTB([obj1, obj2])
plane.remove(obj1)
plane.remove(obj2)
dists = [ (c,d,obj1,obj2) for (c,d,obj1,obj2) in dists
if (obj1 in plane and obj2 in plane) ]
dists = [(c, d, obj1, obj2) for (c, d, obj1, obj2) in dists
if (obj1 in plane and obj2 in plane)]
for other in plane:
dists.append((0, dist(group, other), group, other))
dists = csort(dists, key=key_obj)
Expand Down Expand Up @@ -694,7 +704,7 @@ def getkey(box):
return


## LTFigure
# LTFigure
##
class LTFigure(LTLayoutContainer):

Expand All @@ -719,7 +729,7 @@ def analyze(self, laparams):
return


## LTPage
# LTPage
##
class LTPage(LTLayoutContainer):

Expand Down
Binary file added samples/equations.pdf
Binary file not shown.
51 changes: 51 additions & 0 deletions tests/test_pdf2txt.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import contextlib
import io
import re
import unittest
from unittest.mock import patch
from tools.pdf2txt import main
Expand Down Expand Up @@ -43,3 +44,53 @@ def tests(fake_stdout):
self.run_tests(
tests, ['pdf2txt.py', '-t', 'html', 'samples/simple1.pdf']
)

def test_equations_html_output(self):
    # Two bordered <span> elements must appear in the HTML output: these
    # are the two lines between the equations.
    bordered_spans = re.compile(
        '<span.*border: black 1px solid.*'
        '<span.*border: black 1px solid.*',
        re.DOTALL
    )
    argv = ['pdf2txt.py', '-t', 'html', 'samples/equations.pdf']

    def tests(fake_stdout):
        html = fake_stdout.getvalue()
        self.assertRegex(html, bordered_spans)
        # One of the equations must be present in the output as well.
        self.assertRegex(html, r'3.*x.*2.*\+ 5')

    self.run_tests(tests, argv)

def test_equations_xml_output(self):
    # Two <line> elements must appear in the XML output: these are the
    # two lines between the equations.
    two_lines = re.compile('<line.*<line', re.DOTALL)
    # One of the equations must be present in the output as well.
    equation = re.compile('3.*x.*2.*\\+.*5', re.DOTALL)
    argv = ['pdf2txt.py', '-t', 'xml', 'samples/equations.pdf']

    def tests(fake_stdout):
        xml = fake_stdout.getvalue()
        self.assertRegex(xml, two_lines)
        self.assertRegex(xml, equation)

    self.run_tests(tests, argv)

def test_equations_text_output(self):
    # The plain-text output must contain the equation with the division
    # line printed between its two parts.
    expected = (
        "3x3\n\n"
        "-----\n"
        "3x2 + 5"
    )
    argv = ['pdf2txt.py', 'samples/equations.pdf']

    def tests(fake_stdout):
        self.assertIn(expected, fake_stdout.getvalue())

    self.run_tests(tests, argv)
Loading