diff --git a/README.md b/README.md index 6535e42f..dd19589d 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,7 @@ For the active project, check out its fork 1. `pip install -r requirements.txt` 1. `python3 -m tools.pdf2txt samples/simple1.pdf` +1. `python3 -m tools.pdf2txt [-t html, xml, text] samples/equations.pdf` 1. `python3 -m tools.pdffontsinfo samples/simple1.pdf` ## How to test diff --git a/pdfminer/converter.py b/pdfminer/converter.py index a4f1ac46..dc5e5059 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -181,8 +181,17 @@ def render(item): elif isinstance(item, LTImage): if self.imagewriter is not None: self.imagewriter.export_image(item) + elif isinstance(item, LTLine): + self.write_text("-----\n") if self.showpageno: self.write_text('Page %s\n' % ltpage.pageid) + + # Since objs are not sorted but instead are rendered using their coordinates + # in other converter classes, we sort the objs based on their y coordinate + # to replicate the order of the elements. 
The y coordinate is negated + # since the origin of a PDF page is at the bottom left + ltpage._objs.sort(key=lambda obj: -obj.y1) + render(ltpage) self.write_text('\f') return @@ -196,9 +205,6 @@ def render_image(self, name, stream): PDFConverter.render_image(self, name, stream) return - def paint_path(self, gstate, stroke, fill, evenodd, path): - return - # HTMLConverter ## @@ -251,7 +257,8 @@ def write_header(self): self.write('\n') self.write( '\n') + + 'charset=utf-8">\n' + ) self.write('\n') return diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 9aff3d9c..d14122fa 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -10,7 +10,7 @@ from .utils import apply_matrix_pt -## IndexAssigner +# IndexAssigner ## class IndexAssigner: @@ -28,7 +28,7 @@ def run(self, obj): return -## LAParams +# LAParams ## class LAParams: @@ -54,7 +54,7 @@ def __repr__(self): (self.char_margin, self.line_margin, self.word_margin, self.all_texts)) -## LTItem +# LTItem ## class LTItem: @@ -63,7 +63,7 @@ def analyze(self, laparams): return -## LTText +# LTText ## class LTText: @@ -75,7 +75,7 @@ def get_text(self): raise NotImplementedError -## LTComponent +# LTComponent ## class LTComponent(LTItem): @@ -91,10 +91,13 @@ def __repr__(self): # Disable comparison. 
def __lt__(self, _): raise ValueError + def __le__(self, _): raise ValueError + def __gt__(self, _): raise ValueError + def __ge__(self, _): raise ValueError @@ -149,7 +152,7 @@ def voverlap(self, obj): return 0 -## LTCurve +# LTCurve ## class LTCurve(LTComponent): @@ -163,7 +166,7 @@ def get_pts(self): return ','.join('%.3f,%.3f' % p for p in self.pts) -## LTLine +# LTLine ## class LTLine(LTCurve): @@ -171,18 +174,23 @@ def __init__(self, linewidth, p0, p1): LTCurve.__init__(self, linewidth, [p0, p1]) return + def is_compatible(self, obj): + """Returns True if two lines can coexist.""" + return True + -## LTRect +# LTRect ## class LTRect(LTCurve): def __init__(self, linewidth, bbox): (x0, y0, x1, y1) = bbox - LTCurve.__init__(self, linewidth, [(x0, y0), (x1, y0), (x1, y1), (x0, y1)]) + LTCurve.__init__(self, linewidth, [ + (x0, y0), (x1, y0), (x1, y1), (x0, y1)]) return -## LTImage +# LTImage ## class LTImage(LTComponent): @@ -205,7 +213,7 @@ def __repr__(self): bbox2str(self.bbox), self.srcsize)) -## LTAnno +# LTAnno ## class LTAnno(LTItem, LTText): @@ -217,7 +225,7 @@ def get_text(self): return self._text -## LTChar +# LTChar ## class LTChar(LTComponent, LTText): @@ -278,7 +286,7 @@ def is_compatible(self, obj): return True -## LTContainer +# LTContainer ## class LTContainer(LTComponent): @@ -308,7 +316,7 @@ def analyze(self, laparams): return -## LTExpandableContainer +# LTExpandableContainer ## class LTExpandableContainer(LTContainer): @@ -323,7 +331,7 @@ def add(self, obj): return -## LTTextContainer +# LTTextContainer ## class LTTextContainer(LTExpandableContainer, LTText): @@ -336,7 +344,7 @@ def get_text(self): return ''.join(obj.get_text() for obj in self if isinstance(obj, LTText)) -## LTTextLine +# LTTextLine ## class LTTextLine(LTTextContainer): @@ -411,10 +419,10 @@ def find_neighbors(self, plane, ratio): abs(obj.y1-self.y1) < d))] -## LTTextBox +# LTTextBox ## -## A set of text objects that are grouped within -## a certain rectangular area. 
+# A set of text objects that are grouped within +# a certain rectangular area. ## class LTTextBox(LTTextContainer): @@ -451,7 +459,7 @@ def get_writing_mode(self): return 'tb-rl' -## LTTextGroup +# LTTextGroup ## class LTTextGroup(LTTextContainer): @@ -483,7 +491,7 @@ def analyze(self, laparams): return -## LTLayoutContainer +# LTLayoutContainer ## class LTLayoutContainer(LTContainer): @@ -538,7 +546,7 @@ def group_objects(self, laparams, objs): max(obj0.height, obj1.height) * laparams.char_margin)) if ((halign and isinstance(line, LTTextLineHorizontal)) or - (valign and isinstance(line, LTTextLineVertical))): + (valign and isinstance(line, LTTextLineVertical))): line.add(obj1) elif line is not None: yield line @@ -571,7 +579,8 @@ def group_textlines(self, laparams, lines): boxes = {} for line in lines: neighbors = line.find_neighbors(plane, laparams.line_margin) - if line not in neighbors: continue + if line not in neighbors: + continue members = [] for obj1 in neighbors: members.append(obj1) @@ -586,7 +595,8 @@ def group_textlines(self, laparams, lines): boxes[obj] = box done = set() for line in lines: - if line not in boxes: continue + if line not in boxes: + continue box = boxes[line] if box in done: continue @@ -628,8 +638,8 @@ def isany(obj1, obj2): return objs.difference((obj1, obj2)) def key_obj(t): - (c,d,_,_) = t - return (c,d) + (c, d, _, _) = t + return (c, d) # XXX this still takes O(n^2) :( dists = [] @@ -648,14 +658,14 @@ def key_obj(t): dists.append((1, d, obj1, obj2)) continue if (isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or - isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL))): + isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL))): group = LTTextGroupTBRL([obj1, obj2]) else: group = LTTextGroupLRTB([obj1, obj2]) plane.remove(obj1) plane.remove(obj2) - dists = [ (c,d,obj1,obj2) for (c,d,obj1,obj2) in dists - if (obj1 in plane and obj2 in plane) ] + dists = [(c, d, obj1, obj2) for (c, d, obj1, obj2) in dists + if (obj1 in plane 
and obj2 in plane)] for other in plane: dists.append((0, dist(group, other), group, other)) dists = csort(dists, key=key_obj) @@ -694,7 +704,7 @@ def getkey(box): return -## LTFigure +# LTFigure ## class LTFigure(LTLayoutContainer): @@ -719,7 +729,7 @@ def analyze(self, laparams): return -## LTPage +# LTPage ## class LTPage(LTLayoutContainer): diff --git a/samples/equations.pdf b/samples/equations.pdf new file mode 100644 index 00000000..902f4b53 Binary files /dev/null and b/samples/equations.pdf differ diff --git a/tests/test_pdf2txt.py b/tests/test_pdf2txt.py index afc8718f..d79e2e2c 100644 --- a/tests/test_pdf2txt.py +++ b/tests/test_pdf2txt.py @@ -1,5 +1,6 @@ import contextlib import io +import re import unittest from unittest.mock import patch from tools.pdf2txt import main @@ -43,3 +44,53 @@ def tests(fake_stdout): self.run_tests( tests, ['pdf2txt.py', '-t', 'html', 'samples/simple1.pdf'] ) + + def test_equations_html_output(self): + def tests(fake_stdout): + # Assert that there are two lines between the equations + self.assertRegex( + fake_stdout.getvalue(), + re.compile( + '