From 289c94e35b161745b349c757a147aaf9a4ff4e93 Mon Sep 17 00:00:00 2001 From: bosd Date: Mon, 22 Jul 2024 08:20:09 +0200 Subject: [PATCH] Add functionality to replace text forward port of 8928445bef9dc65b521eaa8999be765e3e3168f4 by https://github.com/ArifRasim --- camelot/cli.py | 6 +++ camelot/io.py | 3 ++ camelot/parsers/lattice.py | 6 +++ camelot/parsers/stream.py | 6 +++ camelot/utils.py | 75 ++++++++++++++++++++++++++++++++------ tests/data.py | 53 +++++++++++++++++++++++++++ tests/test_lattice.py | 11 ++++++ tests/test_stream.py | 12 ++++++ 8 files changed, 161 insertions(+), 11 deletions(-) diff --git a/camelot/cli.py b/camelot/cli.py index cc349176..ed574815 100644 --- a/camelot/cli.py +++ b/camelot/cli.py @@ -72,6 +72,12 @@ def set_config(self, key, value): help="Characters that should be stripped from a string before" " assigning it to a cell.", ) +@click.option( + "-replace", + "--replace_text", + help="Characters that should be replaced from a string before" + " assigning it to a cell.", +) @click.option( "-M", "--margins", diff --git a/camelot/io.py b/camelot/io.py index 12718828..b049e2b6 100644 --- a/camelot/io.py +++ b/camelot/io.py @@ -58,6 +58,9 @@ def read_pdf( strip_text : str, optional (default: '') Characters that should be stripped from a string before assigning it to a cell. + replace_text : dict, optional (default: {}) + Characters that should be replaced from a string before + assigning it to a cell. row_tol^ : int, optional (default: 2) Tolerance parameter used to combine text vertically, to generate rows. diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index 01d17d96..3eaecf2d 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -63,6 +63,9 @@ class Lattice(BaseParser): strip_text : str, optional (default: '') Characters that should be stripped from a string before assigning it to a cell. 
+ replace_text : dict, optional (default: {}) + Characters that should be replaced from a string before + assigning it to a cell. line_tol : int, optional (default: 2) Tolerance parameter used to merge close vertical and horizontal lines. @@ -99,6 +102,7 @@ def __init__( split_text=False, flag_size=False, strip_text="", + replace_text={}, line_tol=2, joint_tol=2, threshold_blocksize=15, @@ -117,6 +121,7 @@ def __init__( self.split_text = split_text self.flag_size = flag_size self.strip_text = strip_text + self.replace_text = replace_text self.line_tol = line_tol self.joint_tol = joint_tol self.threshold_blocksize = threshold_blocksize @@ -360,6 +365,7 @@ def _generate_table(self, table_idx, cols, rows, **kwargs): split_text=self.split_text, flag_size=self.flag_size, strip_text=self.strip_text, + replace_text=self.replace_text, ) if indices[0][:2] != (-1, -1): pos_errors.append(error) diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 266a0e95..0e87fc0a 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -45,6 +45,9 @@ class Stream(BaseParser): strip_text : str, optional (default: '') Characters that should be stripped from a string before assigning it to a cell. + replace_text : dict, optional (default: {}) + Characters that should be replaced from a string before + assigning it to a cell. edge_tol : int, optional (default: 50) Tolerance parameter for extending textedges vertically. 
row_tol : int, optional (default: 2) @@ -64,6 +67,7 @@ def __init__( split_text=False, flag_size=False, strip_text="", + replace_text={}, edge_tol=50, row_tol=2, column_tol=0, @@ -76,6 +80,7 @@ def __init__( self.split_text = split_text self.flag_size = flag_size self.strip_text = strip_text + self.replace_text = replace_text self.edge_tol = edge_tol self.row_tol = row_tol self.column_tol = column_tol @@ -414,6 +419,7 @@ def _generate_table(self, table_idx, cols, rows, **kwargs): split_text=self.split_text, flag_size=self.flag_size, strip_text=self.strip_text, + replace_text=self.replace_text, ) if indices[:2] != (-1, -1): pos_errors.append(error) diff --git a/camelot/utils.py b/camelot/utils.py index 29939f68..75042516 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -505,12 +505,33 @@ def text_strip(text, strip=""): return stripped +def text_replace(text, replace={}): + """Replace every occurrence of each key of `replace` in `text` with its value. + Parameters + ---------- + text : str + Text to process and modify. + replace : dict, optional (default: {}) + Mapping of substring -> replacement, applied in the dict's iteration order. + Returns + ------- + text : str + """ + if not replace: + return text + + for key, value in replace.items(): + text = text.replace(key, value) + + return text + + # TODO: combine the following functions into a TextProcessor class which # applies corresponding transformations sequentially # (inspired from sklearn.pipeline.Pipeline) -def flag_font_size(textline, direction, strip_text=""): +def flag_font_size(textline, direction, strip_text="", replace_text={}): """Flags super/subscripts in text by enclosing them with . May give false positives. @@ -523,6 +544,9 @@ def flag_font_size(textline, direction, strip_text=""): strip_text : str, optional (default: '') Characters that should be stripped from a string before assigning it to a cell. 
+ replace_text : dict, optional (default: {}) + Characters that should be replaced from a string before + assigning it to a cell. Returns ------- @@ -559,10 +583,13 @@ def flag_font_size(textline, direction, strip_text=""): fstring = "".join(flist) else: fstring = "".join([t.get_text() for t in textline]) + fstring = text_replace(fstring, replace_text) return text_strip(fstring, strip_text) -def split_textline(table, textline, direction, flag_size=False, strip_text=""): +def split_textline( + table, textline, direction, flag_size=False, strip_text="", replace_text={} +): """Splits PDFMiner LTTextLine into substrings if it spans across multiple rows/columns. @@ -580,6 +607,9 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""): strip_text : str, optional (default: '') Characters that should be stripped from a string before assigning it to a cell. + replace_text : dict, optional (default: {}) + Characters that should be replaced from a string before + assigning it to a cell. Returns ------- @@ -668,20 +698,28 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""): key[0], key[1], flag_font_size( - [t[2] for t in chars], direction, strip_text=strip_text + [t[2] for t in chars], + direction, + strip_text=strip_text, + replace_text=replace_text, ), ) ) else: - gchars = [t[2].get_text() for t in chars] - grouped_chars.append( - (key[0], key[1], text_strip("".join(gchars), strip_text)) - ) + gchars = "".join([t[2].get_text() for t in chars]) + gchars = text_replace(gchars, replace_text) + grouped_chars.append((key[0], key[1], text_strip(gchars, strip_text))) return grouped_chars def get_table_index( - table, t, direction, split_text=False, flag_size=False, strip_text="" + table, + t, + direction, + split_text=False, + flag_size=False, + strip_text="", + replace_text={}, ): """Gets indices of the table cell where given text object lies by comparing their y and x-coordinates. 
@@ -703,6 +741,9 @@ def get_table_index( strip_text : str, optional (default: '') Characters that should be stripped from a string before assigning it to a cell. + replace_text : dict, optional (default: {}) + Characters that should be replaced from a string before + assigning it to a cell. Returns ------- @@ -761,7 +802,12 @@ def get_table_index( if split_text: return ( split_textline( - table, t, direction, flag_size=flag_size, strip_text=strip_text + table, + t, + direction, + flag_size=flag_size, + strip_text=strip_text, + replace_text=replace_text, ), error, ) @@ -772,13 +818,20 @@ def get_table_index( ( r_idx, c_idx, - flag_font_size(t._objs, direction, strip_text=strip_text), + flag_font_size( + t._objs, + direction, + strip_text=strip_text, + replace_text=replace_text, + ), ) ], error, ) else: - return [(r_idx, c_idx, text_strip(t.get_text(), strip_text))], error + text = t.get_text() + text = text_replace(text, replace_text) + return [(r_idx, c_idx, text_strip(text, strip_text))], error def compute_accuracy(error_weights): diff --git a/tests/data.py b/tests/data.py index 6d743183..3d59e2f2 100644 --- a/tests/data.py +++ b/tests/data.py @@ -2306,6 +2306,33 @@ ["ChâteauLéoube2016", "10€"], ] +data_stream_replace_text = [ + ["VinsauVerre", ""], + ["LesBlancs", "12.5CL"], + ["A.O.PCôtesduRhône", ""], + ["DomainedelaGuicharde«Autourdelachapelle»3316", "8$"], + ["A.O.PVacqueyras", ""], + ["DomainedeMontvac«Melodine»3316", "10$"], + ["A.O.PChâteauneufduPape", ""], + ["DomainedeBeaurenard3317", "13$"], + ["A.O.PCôteauxduLanguedoc", ""], + ["VillaTempora«Untempspourelle»3314", "9$"], + ["A.O.PCôtesdeProvence", ""], + ["ChâteauGrandBoise3317", "9$"], + ["LesRosés", "125CL"], + ["A.O.PCôtesduRhône", ""], + ["DomainedelaFlorane«AfleurdePampre»3316", "8$"], + ["FamilleCoulon(DomaineBeaurenard)Biotifulfox3317", "8$"], + ["A.O.PVacqueyras", ""], + ["DomainedeMontvac3317", "9$"], + ["A.O.PLanguedoc", ""], + ["DomainedeJoncas«Nébla»3315", "8$"], + 
["VillaTempora«L’arroseurarrosé»3315", "9$"], + ["A.O.PCôtesdeProvence", ""], + ["ChâteauGrandBoise«SainteVictoire»3317", "9$"], + ["ChâteauLéoube3316", "10$"], +] + data_stream_edge_tol = [ ["Key figures", ""], ["", "2016"], @@ -2368,6 +2395,32 @@ ["4171_1", "0.07", "173.9", "58.1%", "1.6%", "2.1%", "0.5%"], ] +data_lattice_text_replace = [ + [ + "Cycle \nName", + "KI \n(1/km)", + "Distance \n(mi)", + "Percent Fuel Savings", + "", + "", + "", + ], + [ + "", + "", + "", + "Improved \nSpeed", + "Decreased \nAccel", + "Eliminate \nStops", + "Decreased \nIdle", + ], + ["2012_2", "3,30", "1,3", "5,9%", "9,5%", "29,2%", "17,4%"], + ["2145_1", "0,68", "11,2", "2,4%", "0,1%", "9,5%", "2,7%"], + ["4234_1", "0,59", "58,7", "8,5%", "1,3%", "8,5%", "3,3%"], + ["2032_2", "0,17", "57,8", "21,7%", "0,3%", "2,7%", "1,2%"], + ["4171_1", "0,07", "173,9", "58,1%", "1,6%", "2,1%", "0,5%"], +] + data_lattice_table_rotated = [ [ "State", diff --git a/tests/test_lattice.py b/tests/test_lattice.py index d2803049..33f220f4 100644 --- a/tests/test_lattice.py +++ b/tests/test_lattice.py @@ -20,6 +20,17 @@ def test_lattice(testdir): assert_frame_equal(df, tables[0].df) +@skip_on_windows +def test_lattice_text_replace(testdir): + df = pd.DataFrame(data_lattice_text_replace) + + filename = os.path.join( + testdir, "tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf" + ) + tables = camelot.read_pdf(filename, pages="2", replace_text={".": ","}) + assert_frame_equal(df, tables[0].df) + + @skip_on_windows def test_lattice_table_rotated(testdir): df = pd.DataFrame(data_lattice_table_rotated) diff --git a/tests/test_stream.py b/tests/test_stream.py index e86f23b7..e7fe8bcd 100644 --- a/tests/test_stream.py +++ b/tests/test_stream.py @@ -98,6 +98,18 @@ def test_stream_strip_text(testdir): tables = camelot.read_pdf(filename, flavor="stream", strip_text=" ,\n") assert_frame_equal(df, tables[0].df) +def test_stream_replace_text(testdir): + df = pd.DataFrame(data_stream_replace_text) + + filename 
= os.path.join(testdir, "detect_vertical_false.pdf") + tables = camelot.read_pdf( + filename, + flavor="stream", + strip_text=" ,\n", + replace_text={"€": "$", "20": "33"}, + ) + + assert_frame_equal(df, tables[0].df) def test_stream_edge_tol(testdir): df = pd.DataFrame(data_stream_edge_tol)