[analyze] Implement basic aggregation by mail id

homoluctus · Apr 6, 2022 · 580c482 · 580c482
1 parent d78e9db
commit 580c482
Show file tree

Hide file tree

Showing 4 changed files with 72 additions and 6 deletions.
diff --git a/maillogger/analyze.py b/maillogger/analyze.py
@@ -1,16 +1,20 @@
+from dataclasses import asdict, dataclass, field
 from typing import Dict, List
 
-from maillogger.parser import ParseResultType
+from maillogger.parser import ParseResultType, ParseResultTo
 
+# Return type of group_by_mail_id: mail id -> list of the parse results
+# that carry that mail id.
+GroupedResultType = Dict[str, List[ParseResultType]]
+# Return type of aggregate: mail id -> one merged dict for that mail id.
+# NOTE(review): AggregateResult.to_dict() also emits the list-valued
+# 'to_addresses' field, so the inner Dict[str, str] looks narrower than the
+# real payload - confirm and widen if type checking matters here.
+AggregateResultType = Dict[str, Dict[str, str]]
 
-def group_by_mail_id(results: List[ParseResultType]) -> Dict[str, List[ParseResultType]]:
+
+def group_by_mail_id(results: List[ParseResultType]) -> GroupedResultType:
     """Group a list of parse results into a dict by mail_id
 
     Args:
         results (List[ParseResultType]): List of parse results as dicts
 
     Returns:
-        Dict[str, List[ParseResultType]]: return a dictionary with the mail_id as key
+        GroupedResultType: return a dictionary with the mail_id as key
 
         {
             '677RGS0': [
@@ -25,3 +29,50 @@ def group_by_mail_id(results: List[ParseResultType]) -> Dict[str, List[ParseResu
         groups.setdefault(result['mail_id'], []).append(result)
 
     return groups
+
+
+def aggregate(groups: GroupedResultType) -> AggregateResultType:
+    """Aggregate all results per mail id into one item
+
+    Args:
+        groups (GroupedResultType): dict of lists grouped by mail id
+
+    Returns:
+        AggregateResultType: dictionary with one aggregated dict per mail id.
+            A mail id whose list of records is empty is left out.
+
+    Raises:
+        ValueError: if a record's own 'mail_id' differs from the key of the
+            group it is stored under (raised by AggregateResult.update).
+    """
+
+    aggregates: Dict[str, AggregateResult] = {}
+
+    for mail_id, records in groups.items():
+        if not records:
+            # Nothing to merge; keep the output free of empty placeholders.
+            continue
+
+        # Build exactly one accumulator per group. dict.setdefault() would
+        # evaluate AggregateResult(mail_id) again for every single record.
+        result = AggregateResult(mail_id)
+        for record in records:
+            result.update(record)
+        aggregates[mail_id] = result
+
+    return {mail_id: result.to_dict() for mail_id, result in aggregates.items()}
+
+
+@dataclass
+class AggregateResult:
+    """Accumulator that merges the parse results of a single mail id."""
+
+    mail_id: str
+
+    # Overwritten by each record that carries the matching key.
+    from_address: str = ''
+    # Grows by one entry for every record that has a 'to_address'.
+    to_addresses: List[str] = field(default_factory=list)
+
+    size: str = '0'
+
+    def to_dict(self) -> ParseResultType:
+        """Return the accumulated fields as a plain dict."""
+        return asdict(self)
+
+    def update(self, record: ParseResultType) -> None:
+        """Merge one parse result into this aggregate.
+
+        Args:
+            record (ParseResultType): parse result; must contain 'mail_id'
+
+        Raises:
+            ValueError: if the record belongs to a different mail id
+        """
+        record_id = record["mail_id"]
+        if not self.mail_id:
+            # No id yet: adopt the one from the first record seen.
+            self.mail_id = record_id
+        elif record_id != self.mail_id:
+            raise ValueError("Trying to aggregate different mail ids!")
+
+        # Scalar fields: the most recent record that has the key wins.
+        for key in ("from_address", "size"):
+            if key in record:
+                setattr(self, key, record[key])
+
+        # Recipients are collected rather than overwritten.
+        if "to_address" in record:
+            self.to_addresses.append(record["to_address"])
diff --git a/maillogger/cli.py b/maillogger/cli.py
@@ -39,6 +39,12 @@ def setup_options(parser: argparse.ArgumentParser) -> None:
         help='Group results by mail id (only available in JSON format)'
     )
 
+    parser.add_argument(
+        '-a', '--aggregate',
+        action='store_true',
+        help='Aggregate results by mail id'
+    )
+
     parser.add_argument(
         '-c', '--compress',
         action='store_true',

diff --git a/maillogger/file/writer.py b/maillogger/file/writer.py
@@ -3,11 +3,12 @@
 from dataclasses import dataclass
 from typing import Any, ClassVar, Dict, List, Type, Union
 
+from maillogger.analyze import AggregateResultType, GroupedResultType
 from maillogger.exceptions import UnsupportedDataFormatError
 from maillogger.file.base import FileHandler
 from maillogger.parser import ParseResultType
 
-OutputResultType = Union[Dict[str, List[ParseResultType]], List[ParseResultType]]
+OutputResultType = Union[GroupedResultType, List[ParseResultType], AggregateResultType]
 
 
 @dataclass
@@ -54,6 +55,8 @@ class CsvWriter(FileWriter):
     newline: str = ''
 
     def write(self, records: OutputResultType) -> None:
+        """Write the records to ``self.fd`` as CSV, preceded by a header row.
+
+        Args:
+            records (OutputResultType): list of row dicts, or a dict whose
+                values are the row dicts (e.g. aggregated results keyed by
+                mail id). The keys of the first row become the CSV header.
+        """
+        if isinstance(records, dict):
+            # The rows live in the dict's values. Its keys are plain mail id
+            # strings, which have no .keys() and cannot be written as rows.
+            # NOTE(review): grouped results (values are lists of dicts) are
+            # still not valid rows; the CLI help says grouping is JSON-only.
+            records = list(records.values())
         writer = csv.DictWriter(self.fd, fieldnames=list(records[0].keys()))
         writer.writeheader()
         writer.writerows(records)
@@ -79,6 +82,9 @@ class TsvWriter(FileWriter):
     ext = 'tsv'
 
     def write(self, records: OutputResultType) -> None:
+        """Write the records to ``self.fd`` as tab-separated text.
+
+        Args:
+            records (OutputResultType): list of row dicts, or a dict whose
+                values are the row dicts (e.g. aggregated results keyed by
+                mail id). The keys of the first row become the header line.
+        """
+        if isinstance(records, dict):
+            # The rows live in the dict's values. Its keys are plain mail id
+            # strings, so records[0].keys() below would fail on them.
+            records = list(records.values())
+
         header = '\t'.join(records[0].keys())
         self.fd.write(f'{header}\n')  # type: ignore
 

diff --git a/maillogger/main.py b/maillogger/main.py
@@ -1,4 +1,4 @@
-from maillogger.analyze import group_by_mail_id
+from maillogger.analyze import aggregate, group_by_mail_id
 from maillogger.cli import parse_options
 from maillogger.file.loader import Loader
 from maillogger.file.writer import write
@@ -22,9 +22,12 @@ def main() -> None:
         if result:
             parsed_contents.append(result)
 
-    if options.group:
+    if options.group or options.aggregate:
         parsed_contents = group_by_mail_id(parsed_contents)
 
+    if options.aggregate:
+        parsed_contents = aggregate(parsed_contents)
+
     write(
         filepath=options.target_file, records=parsed_contents,
         fmt=options.fmt, compress=options.compress)