Skip to content

Commit

Permalink
[#418] reduce memory footprint
Browse files Browse the repository at this point in the history
Move around some things to stop data being copied.
Reduce memory for when no source maps are created.
  • Loading branch information
kindly committed Jun 10, 2016
1 parent 03c36c5 commit 82b57a1
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 75 deletions.
36 changes: 3 additions & 33 deletions flattentool/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,11 @@
from flattentool.output import FORMATS as OUTPUT_FORMATS
from flattentool.output import FORMATS_SUFFIX
from flattentool.input import FORMATS as INPUT_FORMATS, WITH_CELLS
from flattentool.lib import decimal_default
import json
import codecs
from decimal import Decimal
from collections import OrderedDict


def create_template(schema, output_name='releases', output_format='all', main_sheet_name='main', flatten=False, rollup=False, root_id='ocid', use_titles=False, **_):
"""
Creates template file(s) from given inputs
Expand Down Expand Up @@ -81,27 +80,6 @@ def spreadsheet_output(spreadsheet_output_class, name):
raise Exception('The requested format is not available')


# From http://bugs.python.org/issue16535
class NumberStr(float):
    """Float subclass whose ``repr()`` is the text of the object it wraps.

    The instance is built from ``o`` by the inherited ``float`` constructor,
    while ``o`` itself is remembered on ``self.o`` so that ``repr()`` can
    return ``str(o)`` instead of the usual float representation.
    ``decimal_default`` returns instances of this class for ``Decimal``
    values.
    """

    def __init__(self, o):
        # The parent initialiser is deliberately not called, since this class
        # is altering its functionality.
        # pylint: disable=W0231
        self.o = o

    # This is needed for this trick to work in python 3.4
    def __float__(self):
        return self

    def __repr__(self):
        # Render the wrapped object rather than the float value.
        return str(self.o)


def decimal_default(o):
    """Fallback serialiser passed as ``default=`` to ``json.dump``.

    Returns a ``NumberStr`` wrapping ``o`` when ``o`` is a ``Decimal``.

    Raises:
        TypeError: for any other object, with a message built from ``repr(o)``.
    """
    if not isinstance(o, Decimal):
        raise TypeError("{!r} is not JSON serializable".format(o))
    return NumberStr(o)


def unflatten(input_name, base_json=None, input_format=None, output_name='releases.json',
main_sheet_name='releases', encoding='utf8', timezone_name='UTC',
root_id='ocid', schema='', convert_titles=False, cell_source_map=None,
Expand Down Expand Up @@ -134,16 +112,8 @@ def unflatten(input_name, base_json=None, input_format=None, output_name='releas
else:
base = OrderedDict()
if WITH_CELLS:
result, cell_source_map_data, heading_source_map_data = spreadsheet_input.fancy_unflatten()
base[main_sheet_name] = list(result)
with codecs.open(output_name, 'w', encoding='utf-8') as fp:
json.dump(base, fp, indent=4, default=decimal_default, ensure_ascii=False)
if cell_source_map:
with codecs.open(cell_source_map, 'w', encoding='utf-8') as fp:
json.dump(cell_source_map_data, fp, indent=4, default=decimal_default, ensure_ascii=False)
if heading_source_map:
with codecs.open(heading_source_map, 'w', encoding='utf-8') as fp:
json.dump(heading_source_map_data, fp, indent=4, default=decimal_default, ensure_ascii=False)
spreadsheet_input.fancy_unflatten(base, main_sheet_name, output_name, cell_source_map, heading_source_map)

else:
result = spreadsheet_input.unflatten()
base[main_sheet_name] = list(result)
Expand Down
91 changes: 49 additions & 42 deletions flattentool/input.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,22 +8,22 @@
import sys
from decimal import Decimal, InvalidOperation
import os
import codecs
from collections import OrderedDict

import openpyxl
from six import text_type
from warnings import warn
import traceback
import datetime
import json
import pytz
from openpyxl.utils import _get_column_letter, column_index_from_string
from flattentool.lib import decimal_default, Cell
import tempfile

WITH_CELLS = True

class Cell(object):
    """A single value from the input together with where it came from.

    Inherits from ``object`` and declares ``__slots__`` so that instances do
    not carry a per-instance ``__dict__``; many of these objects are created
    while unflattening. Subclassing ``object`` is what makes ``__slots__``
    take effect on Python 2 as well.
    """

    __slots__ = ['cell_value', 'cell_location', 'sub_cells']

    def __init__(self, cell_value, cell_location):
        # The value held by the cell.
        self.cell_value = cell_value
        # Where the cell was read from.
        # NOTE(review): presumably a (sheet, column, row, header) tuple - confirm against callers.
        self.cell_location = cell_location
        # Starts empty; filled in by code outside this class.
        self.sub_cells = []

# The "pylint: disable" lines exist to ignore warnings about the imports we expect not to work not working

Expand Down Expand Up @@ -238,26 +238,41 @@ def inthere(unflattened, id_name):
else:
main_sheet_by_ocid[root_id_or_none].append(unflattened)
temporarydicts_to_lists(main_sheet_by_ocid)

return sum(main_sheet_by_ocid.values(), [])


def unflatten(self):
result = self.do_unflatten()
if WITH_CELLS:
result = extract_list_to_value(result)
return result
tmp_directory = tempfile.mkdtemp()
file_name = os.path.join(tmp_directory, 'unflattened.json')
self.results_from_cell_tree({}, 'main', file_name)
with open(file_name) as unflattened:
return json.load(unflattened, object_pairs_hook=OrderedDict)['main']
return self.do_unflatten()


def extract_error_path(self, cell_tree):
    """Return the sorted ``(path, value)`` items extracted from ``cell_tree``.

    The paths are produced by ``extract_list_to_error_path`` and are rooted
    at the lower-cased main sheet name.
    """
    root_path = [self.main_sheet_name.lower()]
    path_map = extract_list_to_error_path(root_path, cell_tree)
    return sorted(path_map.items())


def fancy_unflatten(self):
def results_from_cell_tree(self, base, main_sheet_name, output_name):
    """Unflatten, write the result as JSON, and return the sorted error paths.

    The tree returned by ``self.do_unflatten()`` is stored in ``base`` under
    ``main_sheet_name`` (``base`` is modified in place), and ``base`` is then
    dumped to ``output_name`` as UTF-8 JSON with ``decimal_default`` as the
    serialisation fallback.

    Returns whatever ``self.extract_error_path`` yields for the same tree.
    """
    tree = self.do_unflatten()
    base[main_sheet_name] = tree
    with codecs.open(output_name, 'w', encoding='utf-8') as out_file:
        json.dump(base, out_file, indent=4, default=decimal_default, ensure_ascii=False)
    return self.extract_error_path(tree)


def fancy_unflatten(self, base, main_sheet_name, output_name, cell_source_map, heading_source_map):
if not WITH_CELLS:
raise Exception('Can only do a fancy_unflatten() if WITH_CELLS=True')
cell_tree = self.do_unflatten()
result = extract_list_to_value(cell_tree)
cell_source_map = extract_list_to_error_path([self.main_sheet_name.lower()], cell_tree)
ordered_items = sorted(cell_source_map.items())
ordered_cell_source_map = OrderedDict(( '/'.join(str(x) for x in path), location) for path, location in ordered_items)
ordered_items = self.results_from_cell_tree(base, main_sheet_name, output_name)
if not cell_source_map and not heading_source_map:
return
row_source_map = OrderedDict()
heading_source_map = OrderedDict()
for path, _ in ordered_items:
cells = cell_source_map[path]
heading_source_map_data = OrderedDict()
for path, cells in ordered_items:
# Prepare row_source_map key
key = '/'.join(str(x) for x in path[:-1])
if not key in row_source_map:
Expand All @@ -270,19 +285,28 @@ def fancy_unflatten(self):
except:
header_path_parts.append(x)
header_path = '/'.join(header_path_parts)
if header_path not in heading_source_map:
heading_source_map[header_path] = []
if header_path not in heading_source_map_data:
heading_source_map_data[header_path] = []
# Populate the row and header source maps
for cell in cells:
sheet, col, row, header = cell
if (sheet, row) not in row_source_map[key]:
row_source_map[key].append((sheet, row))
if (sheet, header) not in heading_source_map[header_path]:
heading_source_map[header_path].append((sheet, header))
if (sheet, header) not in heading_source_map_data[header_path]:
heading_source_map_data[header_path].append((sheet, header))
for key in row_source_map:
assert key not in ordered_cell_source_map, 'Row/cell collision: {}'.format(key)
ordered_cell_source_map[key] = row_source_map[key]
return result, ordered_cell_source_map, heading_source_map
ordered_items.append((key.split('/'), row_source_map[key]))

if cell_source_map:
with codecs.open(cell_source_map, 'w', encoding='utf-8') as fp:
json.dump(
OrderedDict(( '/'.join(str(x) for x in path), location) for path, location in ordered_items),
fp, default=decimal_default, ensure_ascii=False, indent=4
)
if heading_source_map:
with codecs.open(heading_source_map, 'w', encoding='utf-8') as fp:
json.dump(heading_source_map_data, fp, indent=4, default=decimal_default, ensure_ascii=False)


def extract_list_to_error_path(path, input):
output = {}
Expand Down Expand Up @@ -317,24 +341,6 @@ def extract_dict_to_error_path(path, input):
raise Exception('Unexpected result type in the JSON cell tree: {}'.format(input[k]))
return output

def extract_list_to_value(input):
    """Convert every item of ``input`` with ``extract_dict_to_value``.

    Returns a new list holding the converted items in the same order.
    """
    return [extract_dict_to_value(entry) for entry in input]

def extract_dict_to_value(input):
    """Return an ``OrderedDict`` copy of ``input`` with each ``Cell`` replaced by its value.

    Lists are converted with ``extract_list_to_value``, nested dicts are
    converted recursively, and ``Cell`` objects are replaced by their
    ``cell_value``.

    Raises:
        Exception: if a value is not a list, a dict or a ``Cell``.
    """
    values = OrderedDict()
    for key, node in input.items():
        if isinstance(node, list):
            converted = extract_list_to_value(node)
        elif isinstance(node, dict):
            converted = extract_dict_to_value(node)
        elif isinstance(node, Cell):
            converted = node.cell_value
        else:
            raise Exception('Unexpected result type in the JSON cell tree: {}'.format(node))
        values[key] = converted
    return values

class CSVInput(SpreadsheetInput):
encoding = 'utf-8'
Expand Down Expand Up @@ -557,6 +563,7 @@ def path_search(nested_dict, path_list, id_fields=None, path=None, top=False, to


class TemporaryDict(UserDict):
__slots__ = ['keyfield', 'items_no_keyfield', 'data', 'top_sheet']
def __init__(self, keyfield, top_sheet=False):
self.keyfield = keyfield
self.items_no_keyfield = []
Expand Down
28 changes: 28 additions & 0 deletions flattentool/lib.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from decimal import Decimal
# From http://bugs.python.org/issue16535
class NumberStr(float):
    """A ``float`` that prints as the string form of the value it was made from.

    The constructor argument ``o`` is stored on ``self.o`` and ``repr()``
    returns ``str(self.o)``. The numeric value itself comes from the
    inherited ``float`` constructor. ``decimal_default`` wraps ``Decimal``
    values in this class.
    """

    def __init__(self, o):
        # We don't call the parent here, since we're deliberately altering
        # its functionality.
        # pylint: disable=W0231
        self.o = o

    # This is needed for this trick to work in python 3.4
    def __float__(self):
        return self

    def __repr__(self):
        # Show the wrapped object's text, not the float representation.
        return str(self.o)

class Cell(object):
    """A single value from the input together with where it came from.

    ``__slots__`` removes the per-instance ``__dict__`` to keep memory use
    down. The class inherits from ``object`` explicitly because on Python 2
    a bare ``class Cell:`` is an old-style class, for which ``__slots__`` is
    ignored.
    """

    __slots__ = ['cell_value', 'cell_location', 'sub_cells']

    def __init__(self, cell_value, cell_location):
        # The value held by the cell; ``decimal_default`` serialises a Cell as this value.
        self.cell_value = cell_value
        # Where the cell was read from.
        # NOTE(review): presumably a (sheet, column, row, header) tuple - confirm against callers.
        self.cell_location = cell_location
        # Starts empty; filled in by code outside this class.
        self.sub_cells = []

def decimal_default(o):
    """Fallback serialiser passed as ``default=`` to ``json.dump``.

    Returns ``o.cell_value`` for a ``Cell`` and a ``NumberStr`` wrapping
    ``o`` for a ``Decimal``.

    Raises:
        TypeError: for any other object, with a message built from ``repr(o)``.
    """
    if isinstance(o, Cell):
        # Cells are serialised as the value they hold.
        return o.cell_value
    if isinstance(o, Decimal):
        return NumberStr(o)
    raise TypeError("{!r} is not JSON serializable".format(o))

0 comments on commit 82b57a1

Please sign in to comment.