Skip to content

Commit

Permalink
Merge pull request #202 from chezou/duplicate-column
Browse files: browse the repository at this point in the history
fix: avoid duplicate column name
  • Loading branch information
chezou authored Jan 8, 2020
2 parents 97f0dcc + ff79dba commit a44b43c
Showing 1 changed file with 17 additions and 2 deletions.
19 changes: 17 additions & 2 deletions tabula/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import platform
import shlex
import subprocess
from collections import defaultdict
from logging import getLogger

import numpy as np
Expand Down Expand Up @@ -653,11 +654,25 @@ def _extract_from(raw_json, pandas_options=None):
if isinstance(header_line_number, int) and not columns:
_columns = list_data.pop(header_line_number)
_unname_idx = 0
for idx, e in enumerate(_columns):
if e is np.nan:
for idx, col in enumerate(_columns):
if col is np.nan:
_columns[idx] = "Unnamed: {}".format(_unname_idx)
_unname_idx += 1

counts = defaultdict(int)

# Avoid duplicate column names by appending ".\d" as a suffix
for idx, col in enumerate(_columns):
cur_count = counts[col]

while cur_count > 0:
counts[col] = cur_count + 1
col = "{}.{}".format(col, cur_count)
cur_count = counts[col]

_columns[idx] = col
counts[col] = cur_count + 1

df = pd.DataFrame(data=list_data, columns=_columns, **pandas_options)

for c in df.columns:
Expand Down

0 comments on commit a44b43c

Please sign in to comment.