Skip to content

Commit

Permalink
[analyze] Implement basic aggregation by mail id
Browse files Browse the repository at this point in the history
  • Loading branch information
Natureshadow committed Apr 6, 2022
1 parent d78e9db commit 580c482
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 6 deletions.
57 changes: 54 additions & 3 deletions maillogger/analyze.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,20 @@
from dataclasses import asdict, dataclass, field
from typing import Dict, List

from maillogger.parser import ParseResultType
from maillogger.parser import ParseResultType, ParseResultTo

GroupedResultType = Dict[str, List[ParseResultType]]
AggregateResultType = Dict[str, Dict[str, str]]

def group_by_mail_id(results: List[ParseResultType]) -> Dict[str, List[ParseResultType]]:

def group_by_mail_id(results: List[ParseResultType]) -> GroupedResultType:
"""Group a list of parse results into a dict by mail_id
Args:
results (List[ParseResultType]): List of parse results as dicts
Returns:
Dict[str, List[ParseResultType]]: return a dictionary with the mail_id as key
GroupedResultType: return a dictionary with the mail_id as key
{
'677RGS0': [
Expand All @@ -25,3 +29,50 @@ def group_by_mail_id(results: List[ParseResultType]) -> Dict[str, List[ParseResu
groups.setdefault(result['mail_id'], []).append(result)

return groups


def aggregate(groups: GroupedResultType) -> AggregateResultType:
    """Aggregate all results per mail id into one item

    Args:
        groups (GroupedResultType): dict of lists grouped by mail id

    Returns:
        AggregateResultType: dictionary with one dict keyed by mail id.
            A mail id whose record list is empty produces no entry.

    Raises:
        ValueError: if a record's own mail_id differs from the key of the
            group it is stored under (raised by AggregateResult.update).
    """
    aggregates = {}

    for mail_id, records in groups.items():
        if not records:
            # No records means nothing to aggregate; leave the mail id out
            # rather than emitting an all-defaults entry.
            continue

        # Build exactly one accumulator per mail id. Passing a freshly
        # constructed object as a dict.setdefault() default would create
        # (and discard) a new instance for every single record.
        merged = AggregateResult(mail_id)
        for record in records:
            merged.update(record)

        aggregates[mail_id] = merged.to_dict()

    return aggregates


@dataclass
class AggregateResult:
    """Accumulator that merges several parse results of one mail id.

    Records are folded in one at a time through ``update``; the merged
    state is exported as a plain dict through ``to_dict``.
    """

    # Mail id the records must share; adopted from the first record when
    # the instance was created with an empty id.
    mail_id: str

    # Sender address; overwritten by every record carrying "from_address".
    from_address: str = ''
    # One entry per record carrying "to_address", in arrival order.
    to_addresses: List[str] = field(default_factory=list)

    # Size value; overwritten by every record carrying "size".
    size: str = '0'

    def update(self, record: ParseResultType) -> None:
        """Fold a single parse result into this aggregate.

        Args:
            record (ParseResultType): parse result as dict; must contain
                the "mail_id" key

        Raises:
            ValueError: if the record belongs to a different mail id
        """
        incoming_id = record["mail_id"]
        if not self.mail_id:
            self.mail_id = incoming_id
        elif incoming_id != self.mail_id:
            raise ValueError("Trying to aggregate different mail ids!")

        # Scalar fields: the latest record providing the key wins.
        for key in ("from_address", "size"):
            if key in record:
                setattr(self, key, record[key])

        # Recipients accumulate instead of overwriting each other.
        if "to_address" in record:
            recipient = record["to_address"]
            self.to_addresses.append(recipient)

    def to_dict(self) -> ParseResultType:
        """Return the aggregated fields as a plain dictionary."""
        return asdict(self)
6 changes: 6 additions & 0 deletions maillogger/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,12 @@ def setup_options(parser: argparse.ArgumentParser) -> None:
help='Group results by mail id (only available in JSON format)'
)

parser.add_argument(
'-a', '--aggregate',
action='store_true',
help='Aggregate results by mail id'
)

parser.add_argument(
'-c', '--compress',
action='store_true',
Expand Down
8 changes: 7 additions & 1 deletion maillogger/file/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,12 @@
from dataclasses import dataclass
from typing import Any, ClassVar, Dict, List, Type, Union

from maillogger.analyze import AggregateResultType, GroupedResultType
from maillogger.exceptions import UnsupportedDataFormatError
from maillogger.file.base import FileHandler
from maillogger.parser import ParseResultType

OutputResultType = Union[Dict[str, List[ParseResultType]], List[ParseResultType]]
OutputResultType = Union[GroupedResultType, List[ParseResultType], AggregateResultType]


@dataclass
Expand Down Expand Up @@ -54,6 +55,8 @@ class CsvWriter(FileWriter):
newline: str = ''

def write(self, records: OutputResultType) -> None:
    """Write the records to ``self.fd`` as CSV, preceded by a header row.

    The header is derived from the keys of the first record.

    Args:
        records (OutputResultType): a list of record dicts, or a dict
            keyed by mail id whose values are the record dicts to write
    """
    if isinstance(records, dict):
        # The rows live in the dict's values. Its keys are only the mail
        # id strings, which have no .keys() to build a header from.
        records = list(records.values())

    if not records:
        # No first record to take the field names from; write nothing
        # instead of failing on records[0].
        return

    writer = csv.DictWriter(self.fd, fieldnames=list(records[0].keys()))
    writer.writeheader()
    writer.writerows(records)
Expand All @@ -79,6 +82,9 @@ class TsvWriter(FileWriter):
ext = 'tsv'

def write(self, records: OutputResultType) -> None:
if isinstance(records, dict):
records = list(records.keys())

header = '\t'.join(records[0].keys())
self.fd.write(f'{header}\n') # type: ignore

Expand Down
7 changes: 5 additions & 2 deletions maillogger/main.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from maillogger.analyze import group_by_mail_id
from maillogger.analyze import aggregate, group_by_mail_id
from maillogger.cli import parse_options
from maillogger.file.loader import Loader
from maillogger.file.writer import write
Expand All @@ -22,9 +22,12 @@ def main() -> None:
if result:
parsed_contents.append(result)

if options.group:
if options.group or options.aggregate:
parsed_contents = group_by_mail_id(parsed_contents)

if options.aggregate:
parsed_contents = aggregate(parsed_contents)

write(
filepath=options.target_file, records=parsed_contents,
fmt=options.fmt, compress=options.compress)
Expand Down

0 comments on commit 580c482

Please sign in to comment.