diff --git a/README.md b/README.md index 984a08d..cf8f6bb 100644 --- a/README.md +++ b/README.md @@ -74,6 +74,7 @@ The following configuration uses OAuth 2.0 flow. Supported types: - azure_graph +- azure_mail_reports - general The following parameters are for every type: @@ -202,8 +203,40 @@ oauth_apis: settings: time_interval: 1 start_date_name: activityDateTime + - type: azure_mail_reports + name: mail_reports + credentials: + id: <> + key: <> + token_http_request: + url: https://login.microsoftonline.com/abcd-efgh-abcd-efgh/oauth2/v2.0/token + body: client_id=<> + &scope=https://outlook.office365.com/.default + &client_secret=<> + &grant_type=client_credentials + headers: + method: POST + data_http_request: + url: https://reports.office365.com/ecp/reportingwebservice/reporting.svc/MessageTrace + method: GET + headers: + json_paths: + data_date: EndDate + next_url: + data: + filters: + format: Json + settings: + time_interval: 60 # for mail reports we suggest no less than 60 minutes + days_back_fetch: 8 # for mail reports we suggest up to 8 days + start_date_name: StartDate + end_date_name: EndDate ``` +### Azure mail reports type important notes and limitations +* We recommend setting the `days_back_fetch` parameter to no more than `8` days (~192 hours); larger values might cause unexpected errors with the API. +* We recommend setting the `time_interval` parameter to no less than `60` minutes, to avoid short time frames in which message traces might be missed. +* Microsoft may delay trace events for up to 24 hours, and events are not guaranteed to be sequential during this delay. 
def _get_data_from_api(self, url: str) -> tuple[Optional[str], list]:
    """Fetch one page from the API and return ``(next_url, data)``.

    :param url: the full URL to request.
    :return: a tuple of the next page URL (``None`` when no ``next_url``
        json path is configured) and the data extracted from the response.
    :raises Api.ApiError: when the configured data json path yields ``None``.
    """
    json_data = self._parse_response_to_json(url)
    # Use the shared helper rather than repeating the next_url lookup inline,
    # so the paging rule lives in exactly one place.
    next_url = self._get_next_page_url(json_data)
    data = self._parse_and_verify_data_received(json_data)
    return next_url, data


def _parse_response_to_json(self, url):
    """Request *url* and decode the response body as JSON.

    Any exception raised by ``_get_response_from_api`` propagates unchanged
    (the former ``try/except Exception: raise`` wrapper was a no-op).
    """
    response = self._get_response_from_api(url)
    return json.loads(response.content)


def _parse_and_verify_data_received(self, json_data):
    """Extract the data located at the configured data json path.

    :param json_data: the decoded JSON response.
    :return: the value found at ``json_paths.data`` (may be empty).
    :raises Api.ApiError: when the json path yields ``None``, which is
        reported as a configuration error.
    """
    data = self._get_json_path_value_from_data(
        self._general_type_data.json_paths.data, json_data)
    if data is None:
        logger.error(
            "The json path for api {}'s data is wrong. Please change your configuration.".format(
                self._base_data.name))
        raise Api.ApiError

    data_size = len(data)
    # Only report success when something was actually received.
    if data:
        logger.info("Successfully got {0} data from api {1}.".format(data_size, self._base_data.name))
    return data
def get_current_time_utc_string(self):
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``.

    Uses an aware ``datetime.now(timezone.utc)`` instead of the deprecated
    ``datetime.utcnow()``; the resulting string has the same format, with
    second precision.
    """
    # Local import: only `datetime` is visibly in scope at module level.
    from datetime import timezone

    return datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
class AzureMailReports(OAuthApi):
    """OAuth API variant for the ``azure_mail_reports`` type.

    Queries are bounded by both a start-date and an end-date filter, the
    data json path is forced to ``d.results``, and item dates are read as
    epoch milliseconds embedded in the date field's text.
    """

    MAIL_REPORTS_DATA_LINK = 'd.results'
    MAIL_REPORTS_FILTER_CONCAT = '&$'
    MAIL_REPORTS_MAX_PAGE_SIZE = 1000
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    DATE_REGEX_FILTER = r'\d+'

    def __init__(self, oauth_api_data: OAuthApiData) -> None:
        """Force the data json path to ``d.results`` and initialise the base API."""
        oauth_api_data.general_type_data.general_type_data.json_paths.data = self.MAIL_REPORTS_DATA_LINK
        # End date used by the most recent request; None until the first URL is built.
        self._previous_end_date = None
        super().__init__(oauth_api_data.base_data, oauth_api_data.general_type_data)

    def get_last_start_date(self) -> str:
        """Return the last data date recorded by ``_set_current_data_last_date``."""
        return self._current_data_last_date

    def _build_api_url(self) -> str:
        """Build the data request URL.

        The URL is the configured data URL followed by
        ``?$filter=<start_date_name> eq datetime'<start>' and
        <end_date_name> eq datetime'<end>'`` and then every configured
        filter as ``&$key=value``. The end date used is remembered in
        ``_previous_end_date``.
        """
        new_end_date = self.get_new_end_date()
        new_start_date = self.get_start_date_filter()
        api_url = self._data_request.url
        api_url += f"?$filter={self._general_type_data.start_date_name} eq datetime'{new_start_date}' and {self._general_type_data.end_date_name} eq datetime'{new_end_date}'"
        self._previous_end_date = new_end_date

        if self._base_data.get_filters_size() > 0:
            # Each filter is prefixed by the separator, which also joins the
            # first filter to the $filter clause above.
            api_url += ''.join(
                self.MAIL_REPORTS_FILTER_CONCAT + api_filter.key + '=' + str(api_filter.value)
                for api_filter in self._base_data.filters)
        return api_url

    def _get_last_date(self, first_item: dict) -> str:
        """Return the formatted date of *first_item*.

        :raises Api.ApiError: when the data-date json path yields ``None``.
        """
        first_item_date = self._get_json_path_value_from_data(
            self._general_type_data.json_paths.data_date, first_item)

        if first_item_date is None:
            logger.error(
                "The json path for api {}'s data date is wrong. Please change your configuration.".format(
                    self._base_data.name))
            raise Api.ApiError
        return self._get_formatted_date_from_date_path_value(first_item_date)

    def _is_item_in_fetch_frame(self, item: dict, last_datetime_to_fetch: datetime) -> bool:
        """Return True when the item's date is not older than *last_datetime_to_fetch*."""
        item_date = self._get_json_path_value_from_data(
            self._general_type_data.json_paths.data_date, item)
        # NOTE(review): the formatted string ends in 'Z', so the parsed value is
        # presumably timezone-aware - confirm last_datetime_to_fetch is aware too.
        item_datetime = parser.parse(self._get_formatted_date_from_date_path_value(item_date))
        return not item_datetime < last_datetime_to_fetch

    def _get_formatted_date_from_date_path_value(self, date_path_value: str) -> str:
        """Convert a date field value into ``YYYY-MM-DDTHH:MM:SSZ`` (UTC).

        The first run of digits in *date_path_value* is read as epoch
        milliseconds. The conversion is done in UTC so that the trailing
        ``Z`` is true regardless of the host's local timezone.

        :raises Api.ApiError: when the value is empty or contains no digits.
        """
        # Local import: only `datetime` is imported at module level.
        from datetime import timezone

        digit_runs = re.findall(self.DATE_REGEX_FILTER, date_path_value or '')
        if not digit_runs:
            logger.error(
                "Could not find an epoch timestamp in the data date of api {0}: {1}".format(
                    self._base_data.name, date_path_value))
            raise Api.ApiError

        epoch_seconds = int(digit_runs[0]) // 1000
        return datetime.fromtimestamp(epoch_seconds, tz=timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')

    def _set_current_data_last_date(self, date):
        """Record the last data date, never moving behind the previous end date.

        When *date* is missing, or is older than the end date of the previous
        request, the previous end date is stored instead. When neither value
        is available nothing is stored (previously this case recursed without
        end).
        """
        previous_end_date = self._previous_end_date
        # This comparison might not work on other date formats
        if not date or (previous_end_date and previous_end_date > date):
            date = previous_end_date
        if date:
            self._current_data_last_date = date

    def get_new_end_date(self):
        """Return the end date for the next request: the current UTC time string."""
        return self.get_current_time_utc_string()
def _get_api_end_date_name(self, config_api_data: dict, api_group_type: str, api_num: int) -> Optional[str]:
    """Read the ``end_date_name`` setting of an api configuration.

    :param config_api_data: the api's configuration mapping.
    :param api_group_type: the api group, used only in the log message.
    :param api_num: the api's ordinal, used only in the log message.
    :return: the configured value, or ``None`` (after logging a warning)
        when the key is absent.
    """
    try:
        end_date_name = config_api_data[ConfigReader.API_END_DATE_NAME_CONFIG_KEY]
    except KeyError:
        message = "Missing field in config: the general type {0} api #{1} must have end_date_name.".format(
            api_group_type, api_num)
        logger.warning(message)
        end_date_name = None

    return end_date_name
config_api_data: dict, api_group_type: str, api_num: int) -> Optional[ApiJsonPaths]: api_json_paths = config_api_data[ConfigReader.GENERAL_API_JSON_PATHS_CONFIG_KEY] api_data_date_json_path = api_json_paths.get(ConfigReader.GENERAL_API_JSON_PATHS_DATA_DATE_CONFIG_KEY) @@ -393,6 +407,7 @@ def _get_oauth_api_http_requests(self, config_oauth_api_data: dict, data_http_request = ApiHttpRequest(api_data_http_request_method, api_data_url, api_data_http_request.get(ConfigReader.API_HTTP_REQUEST_HEADERS_CONFIG_KEY), - api_data_http_request.get(ConfigReader.API_HTTP_REQUEST_BODY_CONFIG_KEY)) + api_data_http_request.get(ConfigReader.API_HTTP_REQUEST_BODY_CONFIG_KEY), + api_data_http_request.get(ConfigReader.API_HTTP_REQUEST_PAGE_SIZE)) return token_http_request, data_http_request diff --git a/src/data/api_http_request.py b/src/data/api_http_request.py index a5eac7d..9a6beec 100644 --- a/src/data/api_http_request.py +++ b/src/data/api_http_request.py @@ -1,15 +1,24 @@ class ApiHttpRequest: - GET_METHOD = 'GET' POST_METHOD = 'POST' HTTP_METHODS = [GET_METHOD, POST_METHOD] - def __init__(self, api_http_request_method: str, api_url: str, api_http_request_headers: dict = None, - api_http_request_body: str = None) -> None: + def __init__(self, api_http_request_method: str, api_url: str, + api_http_request_headers: dict = None, + api_http_request_body: str = None, page_size: int = None) -> None: self._method = api_http_request_method self._url = api_url self._headers = api_http_request_headers self._body = api_http_request_body + self._page_size = page_size + + @property + def page_size(self) -> int: + return self._page_size + + @page_size.setter + def page_size(self, page_size) -> None: + self._page_size = page_size @property def method(self) -> str: diff --git a/src/data/general_type_data/api_general_type_data.py b/src/data/general_type_data/api_general_type_data.py index b70467b..17eec69 100644 --- a/src/data/general_type_data/api_general_type_data.py +++ 
b/src/data/general_type_data/api_general_type_data.py @@ -3,10 +3,16 @@ class ApiGeneralTypeData: - def __init__(self, api_start_date_name: str, api_json_paths: ApiJsonPaths) -> None: + def __init__(self, api_start_date_name: str, api_end_date_name: str, + api_json_paths: ApiJsonPaths) -> None: self._start_date_name = api_start_date_name + self._end_date_name = api_end_date_name self._json_paths = api_json_paths + @property + def end_date_name(self) -> str: + return self._end_date_name + @property def start_date_name(self) -> str: return self._start_date_name diff --git a/src/oauth_api.py b/src/oauth_api.py index 4692372..8e2ea83 100644 --- a/src/oauth_api.py +++ b/src/oauth_api.py @@ -62,6 +62,7 @@ def _get_total_data_from_api(self) -> Generator: if not data: logger.info("No new data available from api {}.".format(self._base_data.name)) + self._set_current_data_last_date(first_item_date) return data if is_first_fetch: @@ -85,7 +86,7 @@ def _get_total_data_from_api(self) -> Generator: logger.info("Got {0} total data from api {1}".format(total_data_num, self._base_data.name)) - self._current_data_last_date = first_item_date + self._set_current_data_last_date(first_item_date) def _build_api_url(self) -> str: api_url = self._data_request.url @@ -126,3 +127,8 @@ def get_data_request(self): @property def get_token_request(self): return self._token_request + + def _set_current_data_last_date(self, date): + if date: + self._current_data_last_date = date +