From ff79dba582169240c150f24e73783c61095668f7 Mon Sep 17 00:00:00 2001
From: Aki Ariga <chezou+github@gmail.com>
Date: Wed, 8 Jan 2020 22:56:03 +0900
Subject: [PATCH] fix: avoid duplicate column name

With multiple_tables=True, column names are built by tabula-py. When
there are duplicated column names, pandas fetches multiple columns for
a single name, so `pd.to_numeric` fails.

This patch makes column names unique even if there are duplicated
column names.
---
 tabula/io.py | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/tabula/io.py b/tabula/io.py
index 804ef69..1bbe78d 100644
--- a/tabula/io.py
+++ b/tabula/io.py
@@ -24,6 +24,7 @@
 import platform
 import shlex
 import subprocess
+from collections import defaultdict
 from logging import getLogger
 
 import numpy as np
@@ -653,11 +654,25 @@ def _extract_from(raw_json, pandas_options=None):
         if isinstance(header_line_number, int) and not columns:
             _columns = list_data.pop(header_line_number)
             _unname_idx = 0
-            for idx, e in enumerate(_columns):
-                if e is np.nan:
+            for idx, col in enumerate(_columns):
+                if col is np.nan:
                     _columns[idx] = "Unnamed: {}".format(_unname_idx)
                     _unname_idx += 1
 
+            counts = defaultdict(int)
+
+            # Avoid duplicate column name adding ".\d" as a suffix
+            for idx, col in enumerate(_columns):
+                cur_count = counts[col]
+
+                while cur_count > 0:
+                    counts[col] = cur_count + 1
+                    col = "{}.{}".format(col, cur_count)
+                    cur_count = counts[col]
+
+                _columns[idx] = col
+                counts[col] = cur_count + 1
+
         df = pd.DataFrame(data=list_data, columns=_columns, **pandas_options)
 
         for c in df.columns: