From ff79dba582169240c150f24e73783c61095668f7 Mon Sep 17 00:00:00 2001 From: Aki Ariga Date: Wed, 8 Jan 2020 22:56:03 +0900 Subject: [PATCH] fix: avoid duplicate column name With multiple_tables=True, column names are built by tabula-py. When there is a duplicated column name, pandas fetches multiple columns, so `pd.to_numeric` fails. This patch makes column names unique even if there are duplicated column names. --- tabula/io.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/tabula/io.py b/tabula/io.py index 804ef69..1bbe78d 100644 --- a/tabula/io.py +++ b/tabula/io.py @@ -24,6 +24,7 @@ import platform import shlex import subprocess +from collections import defaultdict from logging import getLogger import numpy as np @@ -653,11 +654,25 @@ def _extract_from(raw_json, pandas_options=None): if isinstance(header_line_number, int) and not columns: _columns = list_data.pop(header_line_number) _unname_idx = 0 - for idx, e in enumerate(_columns): - if e is np.nan: + for idx, col in enumerate(_columns): + if col is np.nan: _columns[idx] = "Unnamed: {}".format(_unname_idx) _unname_idx += 1 + counts = defaultdict(int) + + # Avoid duplicate column name adding ".\d" as a suffix + for idx, col in enumerate(_columns): + cur_count = counts[col] + + while cur_count > 0: + counts[col] = cur_count + 1 + col = "{}.{}".format(col, cur_count) + cur_count = counts[col] + + _columns[idx] = col + counts[col] = cur_count + 1 + df = pd.DataFrame(data=list_data, columns=_columns, **pandas_options) for c in df.columns: