diff --git a/avocado/export/_base.py b/avocado/export/_base.py index cca606c..2242e1e 100644 --- a/avocado/export/_base.py +++ b/avocado/export/_base.py @@ -7,7 +7,7 @@ class BaseExporter(object): "Base class for all exporters" file_extension = 'txt' content_type = 'text/plain' - preferred_formats = [] + preferred_formats = () def __init__(self, concepts=None): if concepts is None: diff --git a/avocado/export/_csv.py b/avocado/export/_csv.py index 240c825..f73715f 100644 --- a/avocado/export/_csv.py +++ b/avocado/export/_csv.py @@ -30,7 +30,7 @@ class CSVExporter(BaseExporter): file_extension = 'csv' content_type = 'text/csv' - preferred_formats = ('csv', 'number', 'string') + preferred_formats = ('csv', 'string') def write(self, iterable, buff=None, *args, **kwargs): header = [] @@ -39,11 +39,16 @@ def write(self, iterable, buff=None, *args, **kwargs): for i, row_gen in enumerate(self.read(iterable, *args, **kwargs)): row = [] + for data in row_gen: if i == 0: header.extend(data.keys()) + row.extend(data.values()) + if i == 0: writer.writerow(header) + writer.writerow(row) + return buff diff --git a/avocado/export/_excel.py b/avocado/export/_excel.py index 3577ee2..f8be4fa 100644 --- a/avocado/export/_excel.py +++ b/avocado/export/_excel.py @@ -16,7 +16,7 @@ class ExcelExporter(BaseExporter): file_extension = 'xlsx' content_type = 'application/vnd.ms-excel' - preferred_formats = ('excel', 'boolean', 'number', 'string') + preferred_formats = ('excel', 'string') def write(self, iterable, buff=None, *args, **kwargs): buff = self.get_file_obj(buff) @@ -27,33 +27,49 @@ def write(self, iterable, buff=None, *args, **kwargs): ws_data.title = 'Data' header = [] + # Create the data worksheet for i, row_gen in enumerate(self.read(iterable, *args, **kwargs)): row = [] + for data in row_gen: if i == 0: # Build up header row header.extend(data.keys()) + # Add formatted section to the row row.extend(data.values()) + # Write headers on first iteration if i == 0: 
ws_data.append(header) + ws_data.append(row) ws_dict = wb.create_sheet() ws_dict.title = 'Data Dictionary' # Create the Data Dictionary Worksheet - ws_dict.append(('Field Name', 'Data Type', 'Description', - 'Concept Name', 'Concept Discription')) + ws_dict.append(( + 'Field Name', + 'Data Type', + 'Description', + 'Concept Name', + 'Concept Description', + )) for c in self.concepts: cfields = c.concept_fields.select_related('field') + for cfield in cfields: field = cfield.field - ws_dict.append((field.field_name, field.simple_type, - field.description, c.name, c.description)) + ws_dict.append(( + field.field_name, + field.simple_type, + field.description, + c.name, + c.description, + )) # This hacked up implementation is due to `save_virtual_workbook` # not behaving correctly. This function should handle the work @@ -66,4 +82,5 @@ def write(self, iterable, buff=None, *args, **kwargs): _buff.close() else: wb.save(buff) + return buff diff --git a/avocado/export/_html.py b/avocado/export/_html.py index 8b3c5bd..b699bf4 100644 --- a/avocado/export/_html.py +++ b/avocado/export/_html.py @@ -20,4 +20,5 @@ def write(self, iterable, template, buff=None, *args, **kwargs): context = Context({'rows': self.read(iterable, *args, **kwargs)}) buff.write(template.render(context)) + return buff diff --git a/avocado/export/_json.py b/avocado/export/_json.py index 39da0ad..3bc583e 100644 --- a/avocado/export/_json.py +++ b/avocado/export/_json.py @@ -18,12 +18,14 @@ class JSONExporter(BaseExporter): file_extension = 'json' content_type = 'application/json' - preferred_formats = ('json', 'number', 'string') + preferred_formats = ('json',) def write(self, iterable, buff=None, *args, **kwargs): buff = self.get_file_obj(buff) encoder = JSONGeneratorEncoder() + for chunk in encoder.iterencode(self.read(iterable, *args, **kwargs)): buff.write(chunk) + return buff diff --git a/avocado/export/_r.py b/avocado/export/_r.py index 6d06fa2..8edf346 100644 --- a/avocado/export/_r.py +++ 
b/avocado/export/_r.py @@ -14,18 +14,21 @@ class RExporter(BaseExporter): file_extension = 'zip' content_type = 'application/zip' - preferred_formats = ('r', 'coded', 'number', 'string') + preferred_formats = ('r', 'coded') def _format_name(self, name): punc = punctuation.replace('_', '') name = str(name).translate(None, punc) name = name.replace(' ', '_') words = name.split('_') + for i, w in enumerate(words): if i == 0: name = w.lower() continue + name += w.capitalize() + if name[0].isdigit(): name = '_' + name @@ -56,6 +59,7 @@ def _code_values(self, name, field, coded_labels): def write(self, iterable, buff=None, template_name='export/script.R', *args, **kwargs): + zip_file = ZipFile(self.get_file_obj(buff), 'w') factors = [] # field names @@ -68,8 +72,8 @@ def write(self, iterable, buff=None, template_name='export/script.R', for cfield in cfields: field = cfield.field name = self._format_name(field.field_name) - labels.append(u'attr(data${0}, "label") = "{1}"'.format( - name, unicode(cfield))) + labels.append(u'attr(data${0}, "label") = "{1}"' + .format(name, unicode(cfield))) coded_labels = field.coded_labels() diff --git a/avocado/export/_sas.py b/avocado/export/_sas.py index 9f861eb..b70f0c9 100644 --- a/avocado/export/_sas.py +++ b/avocado/export/_sas.py @@ -14,7 +14,7 @@ class SASExporter(BaseExporter): file_extension = 'zip' content_type = 'application/zip' - preferred_formats = ('sas', 'coded', 'number', 'string') + preferred_formats = ('sas', 'coded') num_lg_names = 0 diff --git a/avocado/formatters.py b/avocado/formatters.py index 6f7a298..46dc134 100644 --- a/avocado/formatters.py +++ b/avocado/formatters.py @@ -72,31 +72,41 @@ def process_multiple(func): class Formatter(object): - """Provides support for the core data formats with sensible defaults - for handling converting Python datatypes to their formatted equivalent. + """Converts Python types into a formatted equivalent based on a list of + `preferred_formats`. 
If no formats are specified or none of the format + methods are successful, the default method for the field's type will be + attempted and finally will fall back to returning the value as is. - Each core format method must return one of the following: - Single formatted Value - OrderedDict/sequence of key-value pairs + A format is supported if a `to_FORMAT` method is defined on the Formatter + class. By default, the method is assumed to take a value for a single + field and produce a value. The method signature is: - If the format method is unable to do either of these for the given - value a FormatterException must be raised. + def to_FORMAT(value, field=field, concept=self.concept, + process_multiple=False, **context) - ``values`` - A list, tuple or OrderedDict containing the values to - be formatted. If a list or tuple is passed, it will be wrapped in - an OrderedDict for keyword access in the format methods. + With this approach, each field contained in the concept will be processed + separately. - :: + Alternately, the method can be decorated with `process_multiple` which + causes the concept and all field values to be passed into the method. The + signature looks as follows. - values = ['Bob', 'Smith'] + @process_multiple + def to_FORMAT(values, fields=self.fields, concept=self.concept, + process_multiple=True, **context) - """ - default_formats = ('boolean', 'number', 'string') + `values` will be an OrderedDict of values for each field. `fields` is + a map of `DataField` instances associated with `concept` keyed by their + natural key starting with `field_name` and prepending the `model_name` + and `app_name` to prevent key collisions. + The output of a `process_multiple` method can be a single formatted value, + an OrderedDict, or a sequence of key-value pairs. + """ def __init__(self, concept=None, keys=None): "Passing in a concept takes precedence over `keys`."
if not keys and not concept: - raise ValueError('A concept or list of keys must be supplied.') + raise ValueError('A concept or sequence of keys are required.') self.concept = concept self.fields = None @@ -110,91 +120,111 @@ def __init__(self, concept=None, keys=None): # Keep track of fields/concepts that cause an error to prevent # logging the exception twice - self._errors = {} + self._errors = set() def __call__(self, values, preferred_formats=None, **context): - # Create a copy of the preferred formats since each set values may - # be processed slightly differently (e.g. mixed data type in column) - # which could cause exceptions that would not be present during - # processing of other values - if preferred_formats is None: - preferred_formats = self.default_formats - preferred_formats = list(preferred_formats) + ['raw'] + if not preferred_formats: + preferred_formats = [] # Create a OrderedDict of the values relative to the # concept fields objects the values represent. This # enables key-based access to the values rather than # relying on position. if not isinstance(values, OrderedDict): - # Wrap single values if not isinstance(values, (list, tuple)): values = [values] + values = OrderedDict(zip(self.keys, values)) + # Create list of formats to attempt for multi-value processing. + multi_formats = list(preferred_formats) + + # Append concept type as a format if one is defined. + if self.concept and self.concept.type: + multi_formats.append(self.concept.type) + + multi_formats.append('raw') + # Iterate over all preferred formats and attempt to process the values. - # For formatter methods that process all values must be tracked and + # Formatter methods that process all values must be tracked and # attempted only once. They are removed from the list once attempted. 
- # If no preferred multi-value methods succeed, each value is processed - # independently with the remaining formats - for f in iter(preferred_formats): - method = getattr(self, u'to_{0}'.format(f), None) + # If no multi-value methods succeed, each value is processed + # independently with the remaining formats. + for f in multi_formats: + method = getattr(self, 'to_{0}'.format(f), None) + # This formatter does not support this format, remove it - # from the available list + # from the available list. if not method: - preferred_formats.pop(0) continue - # The implicit behavior when handling multiple values is to process - # them independently since, in most cases, they are not dependent - # on one another, but rather should be represented together since - # the data is related. A formatter method can be flagged to process - # all values together by setting the attribute - # `process_multiple=True`. we must # check to if that flag has been - # set and simply pass through the values and context to the method - # as is. if ``process_multiple`` is not set, each value is handled - # independently if getattr(method, 'process_multiple', False): try: - output = method(values, fields=self.fields, + output = method(values, + fields=self.fields, concept=self.concept, - process_multiple=True, **context) + process_multiple=True, + **context) + if not isinstance(output, dict): return OrderedDict([(self.concept.name, output)]) + return output - # Remove from the preferred formats list since it failed + except Exception: if self.concept and self.concept not in self._errors: - self._errors[self.concept] = None - log.warning(u'Multi-value formatter error', - exc_info=True) - preferred_formats.pop(0) + self._errors.add(self.concept) + log.exception('Multi-value formatter error') # The output is independent of the input. Formatters may output more # or less values than what was entered. 
output = OrderedDict() - # Attempt to process each + # Process each field and corresponding value separately. for i, (key, value) in enumerate(values.iteritems()): - for f in preferred_formats: - method = getattr(self, u'to_{0}'.format(f)) - field = self.fields[key] if self.fields else None + field = self.fields[key] if self.fields else None + + field_formats = list(preferred_formats) + + # Add field type if defined + if field: + if field.type: + field_formats.append(field.type) + + field_formats.append(field.simple_type) + + # Fallback to simple type (e.g. number) and finally 'raw' + field_formats.append('raw') + + for f in field_formats: + method = getattr(self, 'to_{0}'.format(f), None) + + if not method: + continue + try: - fvalue = method(value, field=field, concept=self.concept, - process_multiple=False, **context) - if isinstance(fvalue, dict): - output.update(fvalue) + value = method(value, + field=field, + concept=self.concept, + process_multiple=False, + **context) + + # Add/update output and break loop + if isinstance(value, dict): + output.update(value) else: - output[key] = fvalue + output[key] = value + break except Exception: if field and field not in self._errors: - self._errors[field] = None - log.warning(u'Single-value formatter error', - exc_info=True) + self._errors.add(field) + log.exception('Single-value formatter error') + return output def __contains__(self, choice): - return hasattr(self, u'to_{0}'.format(choice)) + return hasattr(self, 'to_{0}'.format(choice)) def __unicode__(self): return u'{0}'.format(self.name or self.__class__.__name__) @@ -205,38 +235,46 @@ def to_string(self, value, **context): # enough for certain datatypes or complext data structures if value is None: return u'' + return force_unicode(value, strings_only=False) def to_boolean(self, value, **context): # If value is native True or False value, return it if type(value) is bool: return value - raise FormatterException(u'Cannot convert {0} to boolean'.format( - value)) 
+ + raise FormatterException(u'Cannot convert {0} to boolean' + .format(value)) def to_number(self, value, **context): # Attempts to convert a number. Starting with ints and floats # Eventually create to_decimal using the decimal library. if isinstance(value, (int, float)): return value + if isinstance(value, Decimal): return float(unicode(value)) + if isinstance(value, basestring): if value.isdigit(): return int(value) + try: return float(value) except (ValueError, TypeError): pass + raise FormatterException(u'Cannot convert {0} to number'.format(value)) def to_coded(self, value, **context): # Attempts to convert value to its coded representation field = context.get('field') + if field: for key, coded in field.coded_values: if key == value: return coded + raise FormatterException(u'No coded value for {0}'.format(value)) def to_raw(self, value, **context): @@ -245,8 +283,7 @@ def to_raw(self, value, **context): class RawFormatter(Formatter): def __call__(self, values, *args, **kwargs): - preferred_formats = ['raw'] - return super(RawFormatter, self).__call__(values, preferred_formats) + return super(RawFormatter, self).__call__(values, ['raw']) registry = loader.Registry(default=Formatter, register_instance=False) diff --git a/tests/cases/formatters/tests.py b/tests/cases/formatters/tests.py index d655d9b..856b169 100644 --- a/tests/cases/formatters/tests.py +++ b/tests/cases/formatters/tests.py @@ -33,6 +33,14 @@ def setUp(self): self.values = ['CEO', 100000, True] self.f = Formatter(concept) + def test_default(self): + fvalues = self.f(self.values) + self.assertEqual(OrderedDict([ + ('name', 'CEO'), + ('salary', 100000), + ('boss', True), + ]), fvalues) + def test_to_string(self): fvalues = self.f(self.values, preferred_formats=['string']) self.assertEqual(OrderedDict([