Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Basic statistical analysis #8

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
MIT License

Copyright (c) 2020 homoluctus
Copyright (c) 2022 Dominik George <[email protected]>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
78 changes: 78 additions & 0 deletions maillogger/analyze.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
from dataclasses import asdict, dataclass, field
from typing import Dict, List

from maillogger.parser import ParseResultType, ParseResultTo

# Parse results bucketed by mail id: each key maps to the list of result dicts
# that `group_by_mail_id` collected for that id.
GroupedResultType = Dict[str, List[ParseResultType]]
# One flattened dict per mail id, as returned by `aggregate`.
# NOTE(review): `AggregateResult.to_addresses` is a list, so the values are not
# strictly `str` -- confirm whether this alias should be widened.
AggregateResultType = List[Dict[str, str]]


def group_by_mail_id(results: List[ParseResultType]) -> GroupedResultType:
    """Bucket parse results by their ``mail_id``

    Args:
        results (List[ParseResultType]): parse results as dicts; every
            entry must carry a ``mail_id`` key

    Returns:
        GroupedResultType: dict mapping each mail_id to the results that
            share it, keeping the input order inside every bucket

            {
                '677RGS0': [
                    {...}, {...}
                ]
            }
    """

    by_id: GroupedResultType = {}

    for entry in results:
        key = entry['mail_id']
        if key not in by_id:
            # First result seen for this mail id: open a new bucket.
            by_id[key] = []
        by_id[key].append(entry)

    return by_id


def aggregate(groups: GroupedResultType) -> AggregateResultType:
    """Collapse the records of every mail id into a single summary dict

    Args:
        groups (GroupedResultType): dict of record lists keyed by mail id

    Returns:
        AggregateResultType: list with one dict per mail id, in the
            iteration order of ``groups``; each dict is the ``to_dict()``
            output of an ``AggregateResult``

    Raises:
        ValueError: propagated from ``AggregateResult.update`` when a
            record carries a different mail_id than the group it is in
    """

    aggregates = []

    for mail_id, records in groups.items():
        if not records:
            # A group without records contributes no entry to the output.
            continue

        # Build the accumulator once per mail id instead of constructing a
        # throwaway instance for every record just to feed dict.setdefault.
        combined = AggregateResult(mail_id)
        for record in records:
            combined.update(record)

        aggregates.append(combined.to_dict())

    return aggregates


@dataclass
class AggregateResult:
    """Running summary of the parse results that share one mail id."""

    mail_id: str

    # Stays empty until a record carrying "from_address" is merged.
    from_address: str = ''
    # One entry per merged record carrying "to_address", in merge order.
    to_addresses: List[str] = field(default_factory=list)

    # Kept as the string found in the record; '0' until a record has "size".
    size: str = '0'

    def to_dict(self) -> ParseResultType:
        """Return the summary as a plain dict."""
        summary = asdict(self)
        return summary

    def update(self, record: ParseResultType) -> None:
        """Merge one parse result into this summary.

        Args:
            record (ParseResultType): parse result; must contain "mail_id"

        Raises:
            ValueError: if the record's mail_id differs from the one this
                summary already holds
        """
        record_id = record["mail_id"]
        if not self.mail_id:
            # No id yet: adopt the one from the first record.
            self.mail_id = record_id
        elif record_id != self.mail_id:
            raise ValueError("Trying to aggregate different mail ids!")

        # Scalar fields: the most recently merged record wins.
        for key in ("from_address", "size"):
            if key in record:
                setattr(self, key, record[key])

        if "to_address" in record:
            self.to_addresses.append(record["to_address"])
19 changes: 18 additions & 1 deletion maillogger/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,18 @@ def setup_options(parser: argparse.ArgumentParser) -> None:
help='File data format to write the parsed maillog (Default: csv)'
)

parser.add_argument(
'-g', '--group',
action='store_true',
help='Group results by mail id (only available in JSON format)'
)

parser.add_argument(
'-a', '--aggregate',
action='store_true',
help='Aggregate results by mail id'
)

parser.add_argument(
'-c', '--compress',
action='store_true',
Expand All @@ -50,4 +62,9 @@ def setup_options(parser: argparse.ArgumentParser) -> None:
def parse_options(args: Optional[List[str]] = None) -> argparse.Namespace:
    """Parse the command line and reject unsupported option combinations.

    Args:
        args (Optional[List[str]]): argument list handed to
            ``parser.parse_args``; ``None`` is passed through unchanged

    Returns:
        argparse.Namespace: the parsed options

    Grouping combined with CSV or TSV output is reported through
    ``parser.error``.
    """
    parser = get_parser()
    setup_options(parser)

    options = parser.parse_args(args=args)

    # Compare case-insensitively: the writer lowercases the format before
    # choosing an implementation, so "CSV" must be rejected just like "csv".
    fmt = (options.fmt or "").lower()
    if options.group and fmt in ("csv", "tsv"):
        parser.error("Grouping by mail id can only be used with JSON output.")

    return options
15 changes: 9 additions & 6 deletions maillogger/file/writer.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
import csv
import json
from dataclasses import dataclass
from typing import Any, ClassVar, List, Type
from typing import Any, ClassVar, Dict, List, Type, Union

from maillogger.analyze import AggregateResultType, GroupedResultType
from maillogger.exceptions import UnsupportedDataFormatError
from maillogger.file.base import FileHandler
from maillogger.parser import ParseResultType

OutputResultType = Union[GroupedResultType, List[ParseResultType], AggregateResultType]


@dataclass
class FileWriter(FileHandler):
Expand All @@ -32,7 +35,7 @@ def add_file_ext(self) -> None:
ext = f'{self.ext}.{self.gz_ext}'
self.filepath = f'{self.filepath}.{ext}'

def handle(self, records: List[ParseResultType]) -> None:
def handle(self, records: OutputResultType) -> None:
if not records:
return

Expand All @@ -51,7 +54,7 @@ class CsvWriter(FileWriter):

newline: str = ''

def write(self, records: List[ParseResultType]) -> None:
def write(self, records: OutputResultType) -> None:
writer = csv.DictWriter(self.fd, fieldnames=list(records[0].keys()))
writer.writeheader()
writer.writerows(records)
Expand All @@ -64,7 +67,7 @@ class JsonWriter(FileWriter):
ensure_ascii: bool = False
indent: int = 2

def write(self, records: List[ParseResultType]) -> None:
def write(self, records: OutputResultType) -> None:
json.dump(
records,
self.fd, # type: ignore
Expand All @@ -76,7 +79,7 @@ def write(self, records: List[ParseResultType]) -> None:
class TsvWriter(FileWriter):
ext = 'tsv'

def write(self, records: List[ParseResultType]) -> None:
def write(self, records: OutputResultType) -> None:
header = '\t'.join(records[0].keys())
self.fd.write(f'{header}\n') # type: ignore

Expand All @@ -101,7 +104,7 @@ def get_writer(filepath: str, fmt: str, **kwargs: Any) -> Type[FileWriter]:

def write(
filepath: str,
records: List[ParseResultType],
records: OutputResultType,
fmt: str,
**kwargs: Any) -> None:
fmt = fmt.lower()
Expand Down
14 changes: 13 additions & 1 deletion maillogger/main.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from maillogger.analyze import aggregate, group_by_mail_id
from maillogger.cli import parse_options
from maillogger.file.loader import Loader
from maillogger.file.writer import write
Expand All @@ -10,12 +11,23 @@ def main() -> None:
loader = Loader(options.source_file)
contents = loader.handle()

parse_to = True
parse_from = True
if options.fmt in ("csv", "tsv"):
parse_from = False

parsed_contents = []
for c in contents:
result = parse(c)
result = parse(c, parse_to, parse_from)
if result:
parsed_contents.append(result)

if options.group or options.aggregate:
parsed_contents = group_by_mail_id(parsed_contents)

if options.aggregate:
parsed_contents = aggregate(parsed_contents)

write(
filepath=options.target_file, records=parsed_contents,
fmt=options.fmt, compress=options.compress)
Expand Down
72 changes: 54 additions & 18 deletions maillogger/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,26 +3,34 @@
from datetime import datetime
from typing import Dict, Optional

# Syslog header shared by every postfix line that gets parsed:
# "<Mon> <day> <HH:MM:SS> <host> postfix/<daemon>[<pid>]: <mail id>: "
# - day needs at least one digit (an empty day could never be converted to a
#   date later on)
# - the host may be a dotted name
REGEX_PREFIX = (
    r'(?P<month>[A-Z][a-z]{2}) +(?P<day>[0-9]{1,2}) '
    r'(?P<time>[0-9]{2}:[0-9]{2}:[0-9]{2}) '
    r'(?P<hostname>[A-Za-z0-9.-]+) postfix/[a-z/]+\[[0-9]+\]: '
    r'(?P<mail_id>[A-Z0-9]+): '
)

# Line that records the sender, size and recipient count of a mail.
REGEX_FROM = (
    REGEX_PREFIX
    + r'from=<(?P<from_address>.*@.*)>, size=(?P<size>[0-9]+), '
    + r'nrcpt=(?P<nrcpt>[0-9]+) \((?P<description>.*)\)'
)

# Line that records one delivery attempt to one recipient.
# The dsn dots are escaped so they only match a literal '.', and the second
# and third parts may have up to three digits (e.g. 5.7.26).
REGEX_TO = (
    REGEX_PREFIX
    + r'to=<(?P<to_address>.*@.*)>, '
    + r'relay=(?P<relay>.*), delay=(?P<delay>[0-9.]+), '
    + r'delays=(?P<delays>[0-9][0-9/.]+), '
    + r'dsn=(?P<dsn>[0-9]\.[0-9]{1,3}\.[0-9]{1,3}), '
    + r'status=(?P<status>(sent|deferred|bounced)) \((?P<description>.*)\)'
)

PATTERN_FROM = re.compile(REGEX_FROM)
PATTERN_TO = re.compile(REGEX_TO)

# Flat field-name -> string mapping; used as the return annotation of `parse`
# and of the `to_dict` methods in this module.
ParseResultType = Dict[str, str]


def parse(target: str) -> Optional[ParseResultType]:
def parse(target: str, parse_to: bool = True, parse_from: bool = True) -> Optional[ParseResultType]:
"""Parse postfix maillog including send status

Args:
target (str): maillog

Returns:
Optional[ParseResultType]: return the following dict if match
Optional[ParseResultType]: return one of the following dict if match

{
'month': 'Aug',
Expand All @@ -37,41 +45,69 @@ def parse(target: str) -> Optional[ParseResultType]:
'status': 'sent',
'description': 'delivered to maildir'
}

{
'month': 'Aug',
'day': '1',
'time': '10:00:00',
'mail_id': '677RGS0',
'from_address': '[email protected]',
'size': '12345',
'nrcpt': '12',
'description': 'delivered to maildir'
}
"""

match_obj = re.search(PATTERN, target)
match_from = re.search(PATTERN_FROM, target)
match_to = re.search(PATTERN_TO, target)

if match_obj is None:
return None
if parse_from and match_from:
result = match_from.groupdict()
return ParseResultFrom(**result).to_dict()
if parse_to and match_to:
result = match_to.groupdict()
return ParseResultTo(**result).to_dict()

result = match_obj.groupdict()
return ParseResult(**result).to_dict()
return None


@dataclass
class ParseResult:
    """Fields shared by every parsed postfix log line.

    ``month``, ``day``, ``time`` and ``hostname`` are init-only inputs: the
    first three are folded into ``datetime``; ``hostname`` is accepted but
    not stored. None of them appears in ``to_dict()``.
    """

    month: InitVar[str]
    day: InitVar[str]
    time: InitVar[str]
    hostname: InitVar[str]

    mail_id: str
    description: str

    # 'YYYYmmddHHMMSS' string, filled in by __post_init__.
    datetime: str = field(init=False)

    def __post_init__(self, month: str, day: str, time: str, hostname: str) -> None:
        # hostname only exists so a regex group dict can be passed as keyword
        # arguments unchanged; it is deliberately dropped here.
        self.datetime = self.convert2dateime(month, day, time)

    def to_dict(self) -> ParseResultType:
        """Return the stored fields (without the init-only inputs) as a dict."""
        return asdict(self)

    @staticmethod
    def convert2dateime(month: str, day: str, time: str,
                        year: Optional[int] = None) -> str:
        """Build a 'YYYYmmddHHMMSS' timestamp from syslog date parts.

        Args:
            month (str): abbreviated month name, e.g. 'Aug'
            day (str): day of month, one or two digits
            time (str): 'HH:MM:SS'
            year (Optional[int]): year to stamp the result with; the log
                line itself has none, so the current year is used when
                this is omitted

        Returns:
            str: the timestamp formatted as '%Y%m%d%H%M%S'

        Raises:
            ValueError: if the parts do not form a valid date in ``year``
        """
        if year is None:
            year = datetime.now().year

        # Parse the year together with the date. Parsing without a year
        # falls back to 1900, which is not a leap year, so 'Feb 29' was
        # rejected before the year could be substituted.
        parsed = datetime.strptime(
            f'{year:04d} {month} {day} {time}', '%Y %b %d %H:%M:%S')
        return parsed.strftime('%Y%m%d%H%M%S')


@dataclass
class ParseResultTo(ParseResult):
    """Result for a line matched by ``PATTERN_TO``.

    Adds the named groups of the ``to=`` part of the line to the fields
    inherited from ``ParseResult``; every value is kept as the matched string.
    """

    to_address: str  # text between to=< and >
    relay: str       # value of relay=
    delay: str       # value of delay=
    delays: str      # value of delays=
    dsn: str         # value of dsn=
    status: str      # value of status=


@dataclass
class ParseResultFrom(ParseResult):
    """Result for a line matched by ``PATTERN_FROM``.

    Adds the named groups of the ``from=`` part of the line to the fields
    inherited from ``ParseResult``; every value is kept as the matched string.
    """

    from_address: str  # text between from=< and >
    size: str          # value of size=
    nrcpt: str         # value of nrcpt=