Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extended Debugger reductions and fixed some bugs #523

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 34 additions & 10 deletions smdebug/core/hook.py
Original file line number Diff line number Diff line change
Expand Up @@ -512,30 +512,33 @@ def _get_writers(self, tensor_name, tensor_ref=None) -> List[FileWriter]:
return []
return self._get_main_writer()

def _maybe_get_tb_writer(self) -> Optional[FileWriter]:
def _maybe_get_tb_writer(self, subfolder=None) -> Optional[FileWriter]:
""" Returns a FileWriter object if `hook.tensorboard_dir` has been specified, else None.

Creates a writer if does not exist.
"""
if not self.tensorboard_dir:
return None

if self.mode in self.tb_writers:
assert self.tb_writers[self.mode] is not None
if subfolder == None:
subfolder = self.mode
Comment on lines +523 to +524
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we use subfolder = os.path.join(self.mode)?
It makes the intentions of this line much clearer to the reader.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

subfolder is just a string and not a filepath.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

os.path.join returns an object of type str. If I understand this part correctly, the subfolder variable contains the path to the subdirectory for tensorboard data?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, subfolder is just the name of the reduction. Each reduction will be its own subfolder in the tensorboard directory.


if subfolder in self.tb_writers:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should rename the name of this map. Maybe something more explicit like self.tb_writer_to_dir_map ?

assert self.tb_writers[subfolder] is not None
# would be there if set_mode was called
return self.tb_writers[self.mode]
return self.tb_writers[subfolder]
else:
# s = self.step
# if s < 0: s = 0
self.tb_writers[self.mode] = FileWriter(
self.tb_writers[subfolder] = FileWriter(
trial_dir=self.tensorboard_dir,
step=self.step,
worker=get_tb_worker(),
write_checksum=True,
wtype="tensorboard",
mode=self.mode,
mode=subfolder,
)
return self.tb_writers[self.mode]
return self.tb_writers[subfolder]

def _close_tb_writer(self):
if self.dry_run:
Expand Down Expand Up @@ -663,13 +666,24 @@ def export_collections(self):
def _get_reduction_tensor_name(self, tensor_name, reduction_name, abs):
return get_reduction_tensor_name(tensor_name, reduction_name, abs, remove_colon_index=True)

def _write_reduction(self, tensor_name, tensor_value, reduction_name, abs, tensor_ref=None):
def _write_reduction(
self, tensor_name, tensor_value, reduction_name, abs, tensor_ref=None, collection_name=""
):
reduction_tensor_name = self._get_reduction_tensor_name(tensor_name, reduction_name, abs)
try:
tensor_data = self._get_reduction_of_data(
reduction_name, tensor_value, tensor_name, abs
)
self._write_raw_tensor_simple(reduction_tensor_name, tensor_data, tensor_ref=tensor_ref)
if abs:
reduction_name = "abs_" + reduction_name
tb_writer = self._maybe_get_tb_writer(subfolder=reduction_name)
if tb_writer:
reduction_tensor_name = (
collection_name + "/reductions/" + tensor_name + "/" + self.worker
)
scalar = self._make_numpy_array(tensor_data)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the value of scalar if tb_writer = None?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By default, Debugger writes reductions like normal tensors into the debug-output folder, and users can retrieve the data via the smdebug API. I extended this part so that reductions are also written in Tensorboard format (in case the user provided a Tensorboard configuration).

tb_writer.write_scalar_summary(reduction_tensor_name, scalar, self.step)
except ValueError as e:
self.logger.warning(
f"Could not compute reduction {reduction_name} of {tensor_name} due to {e}"
Expand All @@ -685,14 +699,24 @@ def _write_reductions(self, tensor_name, tensor_value, save_collections, tensor_
for reduction in reduction_list:
if (reduction, False) not in reductions_saved:
self._write_reduction(
tensor_name, tensor_value, reduction, abs=False, tensor_ref=tensor_ref
tensor_name,
tensor_value,
reduction,
abs=False,
tensor_ref=tensor_ref,
collection_name=s_col.name,
)
reductions_saved.add((reduction, False))
for reduction_list in (reduction_config.abs_reductions, reduction_config.abs_norms):
for reduction in reduction_list:
if (reduction, True) not in reductions_saved:
self._write_reduction(
tensor_name, tensor_value, reduction, abs=True, tensor_ref=tensor_ref
tensor_name,
tensor_value,
reduction,
abs=True,
tensor_ref=tensor_ref,
collection_name=s_col.name,
)
reductions_saved.add((reduction, True))

Expand Down
9 changes: 6 additions & 3 deletions smdebug/core/locations.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,11 +120,14 @@ def __init__(self, step_num, worker_name, mode=None):

def get_file_location(self, base_dir=""):
# when base_dir is empty it just returns the relative file path
if hasattr(self.mode, "name"):
subfolder = self.mode.name
else:
subfolder = self.mode
if base_dir:
event_key_prefix = os.path.join(base_dir, self.mode.name)
event_key_prefix = os.path.join(base_dir, subfolder)
else:
event_key_prefix = os.path.join(self.type, self.mode.name)

event_key_prefix = os.path.join(self.type, subfolder)
return os.path.join(event_key_prefix, self.get_filename())


Expand Down
31 changes: 26 additions & 5 deletions smdebug/core/reduction_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,25 @@
from typing import Any, Dict

# First Party
from smdebug.analysis.utils import parse_bool
from smdebug.core.logger import get_logger
from smdebug.core.utils import split

logger = get_logger()


ALLOWED_REDUCTIONS = ["min", "max", "mean", "std", "variance", "sum", "prod"]
ALLOWED_REDUCTIONS = [
"min",
"max",
"mean",
"std",
"variance",
"sum",
"prod",
"isnan",
"isinf",
"quantile",
]
ALLOWED_NORMS = ["l1", "l2"]
REDUCTION_CONFIG_VERSION_NUM = "v0"
ALLOWED_PARAMS = [
Expand Down Expand Up @@ -66,7 +78,7 @@ def __init__(
self.abs_reductions = abs_reductions if abs_reductions is not None else []
self.norms = norms if norms is not None else []
self.abs_norms = abs_norms if abs_norms is not None else []
self.save_raw_tensor = save_raw_tensor
self.save_raw_tensor = parse_bool(save_raw_tensor, True)
self.save_shape = save_shape
## DO NOT REMOVE, if you add anything here, please make sure that _check & from_json is updated accordingly
self._check()
Expand All @@ -77,11 +89,20 @@ def _check(self):
raise ValueError(
"allowed params for reduction config can only be one of " + ",".join(ALLOWED_PARAMS)
)

if any([x not in ALLOWED_REDUCTIONS for x in self.reductions]):
for index, reduction_allowed in enumerate(
[x in ALLOWED_REDUCTIONS for x in self.reductions]
):
if reduction_allowed or self.reductions[index].startswith("quantile"):
continue
raise ValueError("reductions can only be one of " + ",".join(ALLOWED_REDUCTIONS))
if any([x not in ALLOWED_REDUCTIONS for x in self.abs_reductions]):

for index, reduction_allowed in enumerate(
[x in ALLOWED_REDUCTIONS for x in self.abs_reductions]
):
if reduction_allowed or self.abs_reductions[index].startswith("quantile"):
continue
raise ValueError("abs_reductions can only be one of " + ",".join(ALLOWED_REDUCTIONS))

if any([x not in ALLOWED_NORMS for x in self.norms]):
raise ValueError("norms can only be one of " + ",".join(ALLOWED_NORMS))
if any([x not in ALLOWED_NORMS for x in self.abs_norms]):
Expand Down
23 changes: 17 additions & 6 deletions smdebug/pytorch/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,22 +21,33 @@ def get_reduction_of_data(reduction_name, tensor_data, tensor_name, abs=False):
return get_numpy_reduction(reduction_name, tensor_data, abs)
if abs:
tensor_data = torch.abs(tensor_data)

if reduction_name.startswith("quantile") and hasattr(torch, "quantile"):
f = getattr(torch, "quantile")
value = float(reduction_name.replace("quantile", "")[1]) / 100
op = f(tensor_data.float(), value)
return op
if reduction_name in ALLOWED_REDUCTIONS:
if reduction_name == "variance":
reduction_name = "var"
assert hasattr(torch.Tensor, reduction_name)
f = getattr(torch.Tensor, reduction_name)
op = f(tensor_data)
return op
if hasattr(torch.Tensor, reduction_name):
f = getattr(torch.Tensor, reduction_name)
op = f(tensor_data.float())
if reduction_name == "isnan" or reduction_name == "isinf":
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we manage reduction_name values with Enums?

op = torch.sum(op)
return op
if hasattr(torch, reduction_name):
f = getattr(torch, reduction_name)
op = f(tensor_data)
op = torch.sum(op)
return op
elif reduction_name in ALLOWED_NORMS:
if reduction_name in ["l1", "l2"]:
ord = int(reduction_name[1])
else:
raise RuntimeError(
"Invalid normalization operation {0} for torch.Tensor".format(reduction_name)
)
op = torch.norm(tensor_data, p=ord)
op = torch.norm(tensor_data.float(), p=ord)
return op
elif hasattr(torch, reduction_name):
f = getattr(torch, reduction_name)
Expand Down
8 changes: 4 additions & 4 deletions tests/tensorflow/hooks/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# First Party
import smdebug.tensorflow as smd
from smdebug.core.json_config import CONFIG_FILE_PATH_ENV_STR
from smdebug.core.reduction_config import ALLOWED_NORMS, ALLOWED_REDUCTIONS
from smdebug.core.reduction_config import ALLOWED_NORMS
from smdebug.exceptions import TensorUnavailableForStep

# Local
Expand Down Expand Up @@ -37,16 +37,16 @@ def helper_test_reductions(trial_dir, hook, save_raw_tensor):
except TensorUnavailableForStep as e:
pass
assert len(t.reduction_values(0)) == 18
for r in ALLOWED_REDUCTIONS + ALLOWED_NORMS:
for r in ["min", "max", "mean", "std", "variance", "sum", "prod"] + ALLOWED_NORMS:
for b in [False, True]:
assert t.reduction_value(0, reduction_name=r, abs=b, worker=None) is not None


def test_reductions(out_dir, save_raw_tensor=False):
pre_test_clean_up()
rdnc = smd.ReductionConfig(
reductions=ALLOWED_REDUCTIONS,
abs_reductions=ALLOWED_REDUCTIONS,
reductions=["min", "max", "mean", "std", "variance", "sum", "prod"],
abs_reductions=["min", "max", "mean", "std", "variance", "sum", "prod"],
norms=ALLOWED_NORMS,
abs_norms=ALLOWED_NORMS,
save_raw_tensor=save_raw_tensor,
Expand Down
5 changes: 3 additions & 2 deletions tests/tensorflow/keras/test_keras.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,10 +295,11 @@ def test_save_all(out_dir):

@pytest.mark.slow # 0:03 to run
def test_base_reductions(out_dir):
reductions = ["min", "max", "mean", "std", "variance", "sum", "prod"]
train_model(
out_dir,
include_collections=[CollectionKeys.WEIGHTS, CollectionKeys.METRICS, CollectionKeys.LOSSES],
reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=ALLOWED_REDUCTIONS),
reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=reductions),
steps=["train"],
)
tr = create_trial_fast_refresh(out_dir)
Expand All @@ -308,7 +309,7 @@ def test_base_reductions(out_dir):
assert False
except TensorUnavailableForStep:
assert tr.tensor(weight_name).reduction_value(0, "l1") is not None
assert len(tr.tensor(weight_name).reduction_values(0)) == len(ALLOWED_REDUCTIONS) + len(
assert len(tr.tensor(weight_name).reduction_values(0)) == len(reductions) + len(
ALLOWED_NORMS
)

Expand Down
6 changes: 4 additions & 2 deletions tests/tensorflow/keras/test_keras_mirrored.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from smdebug.core.access_layer import has_training_ended
from smdebug.core.collection import CollectionKeys
from smdebug.core.modes import ModeKeys
from smdebug.core.reduction_config import ALLOWED_NORMS, ALLOWED_REDUCTIONS
from smdebug.core.reduction_config import ALLOWED_NORMS
from smdebug.exceptions import TensorUnavailable, TensorUnavailableForStep
from smdebug.tensorflow import ReductionConfig, SaveConfig
from smdebug.tensorflow.keras import KerasHook
Expand Down Expand Up @@ -411,7 +411,9 @@ def test_base_reductions(out_dir):
CollectionKeys.METRICS,
CollectionKeys.LOSSES,
],
reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=ALLOWED_REDUCTIONS),
reduction_config=ReductionConfig(
norms=ALLOWED_NORMS, reductions=["min", "max", "mean", "std", "variance", "sum", "prod"]
),
steps=["train"],
)

Expand Down
24 changes: 16 additions & 8 deletions tests/tensorflow2/test_keras.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from smdebug.core.collection import CollectionKeys
from smdebug.core.json_config import CONFIG_FILE_PATH_ENV_STR
from smdebug.core.modes import ModeKeys
from smdebug.core.reduction_config import ALLOWED_NORMS, ALLOWED_REDUCTIONS
from smdebug.core.reduction_config import ALLOWED_NORMS
from smdebug.exceptions import TensorUnavailableForStep
from smdebug.profiler.profiler_constants import DEFAULT_PREFIX
from smdebug.tensorflow import ReductionConfig, SaveConfig
Expand Down Expand Up @@ -248,10 +248,11 @@ def test_gradtape_base_reductions(out_dir):
"""
Test reduction config
"""
reductions = ["min", "max", "mean", "std", "sum", "prod"]
helper_keras_gradtape(
trial_dir=out_dir,
include_collections=[CollectionKeys.WEIGHTS, CollectionKeys.METRICS, CollectionKeys.LOSSES],
reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=ALLOWED_REDUCTIONS),
reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=reductions),
)
tr = create_trial_fast_refresh(out_dir)
weight_name = tr.tensor_names(collection=CollectionKeys.WEIGHTS)[0]
Expand All @@ -260,7 +261,7 @@ def test_gradtape_base_reductions(out_dir):
assert False
except TensorUnavailableForStep:
assert tr.tensor(weight_name).reduction_value(0, "l1") is not None
assert len(tr.tensor(weight_name).reduction_values(0)) == len(ALLOWED_REDUCTIONS) + len(
assert len(tr.tensor(weight_name).reduction_values(0)) == len(reductions) + len(
ALLOWED_NORMS
)

Expand Down Expand Up @@ -379,7 +380,9 @@ def test_gradtape_include_collections(out_dir):
out_dir,
save_config=save_config,
include_collections=include_collections,
reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=ALLOWED_REDUCTIONS),
reduction_config=ReductionConfig(
norms=ALLOWED_NORMS, reductions=["min", "max", "mean", "std", "sum", "prod"]
),
)
helper_keras_gradtape(out_dir, hook=hook)

Expand Down Expand Up @@ -526,10 +529,11 @@ def test_keras_fit_shapes(out_dir):

@pytest.mark.slow
def test_base_reductions(out_dir, tf_eager_mode):
reductions = ["min", "max", "mean", "std", "sum", "prod"]
helper_keras_fit(
trial_dir=out_dir,
include_collections=[CollectionKeys.WEIGHTS, CollectionKeys.METRICS, CollectionKeys.LOSSES],
reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=ALLOWED_REDUCTIONS),
reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=reductions),
run_eagerly=tf_eager_mode,
)
tr = create_trial_fast_refresh(out_dir)
Expand All @@ -539,7 +543,7 @@ def test_base_reductions(out_dir, tf_eager_mode):
assert False
except TensorUnavailableForStep:
assert tr.tensor(weight_name).reduction_value(0, "l1") is not None
assert len(tr.tensor(weight_name).reduction_values(0)) == len(ALLOWED_REDUCTIONS) + len(
assert len(tr.tensor(weight_name).reduction_values(0)) == len(reductions) + len(
ALLOWED_NORMS
)

Expand Down Expand Up @@ -718,7 +722,9 @@ def test_include_collections(out_dir, tf_eager_mode):
out_dir,
save_config=save_config,
include_collections=include_collections,
reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=ALLOWED_REDUCTIONS),
reduction_config=ReductionConfig(
norms=ALLOWED_NORMS, reductions=["min", "max", "mean", "std", "sum", "prod"]
),
)
hook.get_collection("custom_optimizer_variables").include("Adam")
helper_keras_fit(
Expand Down Expand Up @@ -755,7 +761,9 @@ def test_include_only_custom_collection(out_dir, tf_eager_mode):
out_dir,
save_config=save_config,
include_collections=include_collections,
reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=ALLOWED_REDUCTIONS),
reduction_config=ReductionConfig(
norms=ALLOWED_NORMS, reductions=["min", "max", "mean", "std", "sum", "prod"]
),
)
hook.get_collection("custom_optimizer_variables").include("Adam")
helper_keras_fit(
Expand Down
6 changes: 4 additions & 2 deletions tests/tensorflow2/test_keras_mirrored.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from smdebug.core.access_layer import has_training_ended
from smdebug.core.collection import CollectionKeys
from smdebug.core.modes import ModeKeys
from smdebug.core.reduction_config import ALLOWED_NORMS, ALLOWED_REDUCTIONS
from smdebug.core.reduction_config import ALLOWED_NORMS
from smdebug.exceptions import TensorUnavailable, TensorUnavailableForStep
from smdebug.tensorflow import ReductionConfig, SaveConfig
from smdebug.tensorflow.keras import KerasHook
Expand Down Expand Up @@ -325,7 +325,9 @@ def test_base_reductions(out_dir, tf_eager_mode):
CollectionKeys.METRICS,
CollectionKeys.LOSSES,
],
reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=ALLOWED_REDUCTIONS),
reduction_config=ReductionConfig(
norms=ALLOWED_NORMS, reductions=["min", "max", "mean", "std", "variance", "sum", "prod"]
),
steps=["train"],
eager=tf_eager_mode,
)
Expand Down