Extended Debugger reductions and fixed some bugs #523
Base: master

Changes from all commits:
d9f9696, 0fcb161, c24fd2e, ff9dd63, 316087e, ce6e880, 3d52a9a, 61e7537, 66999aa, 3e25dd0
```diff
@@ -512,30 +512,33 @@ def _get_writers(self, tensor_name, tensor_ref=None) -> List[FileWriter]:
             return []
         return self._get_main_writer()

-    def _maybe_get_tb_writer(self) -> Optional[FileWriter]:
+    def _maybe_get_tb_writer(self, subfolder=None) -> Optional[FileWriter]:
         """ Returns a FileWriter object if `hook.tensorboard_dir` has been specified, else None.

         Creates a writer if one does not exist.
         """
         if not self.tensorboard_dir:
             return None

-        if self.mode in self.tb_writers:
-            assert self.tb_writers[self.mode] is not None
+        if subfolder is None:
+            subfolder = self.mode
+
+        if subfolder in self.tb_writers:
+            assert self.tb_writers[subfolder] is not None
             # would be there if set_mode was called
-            return self.tb_writers[self.mode]
+            return self.tb_writers[subfolder]
         else:
-            # s = self.step
-            # if s < 0: s = 0
-            self.tb_writers[self.mode] = FileWriter(
+            self.tb_writers[subfolder] = FileWriter(
                 trial_dir=self.tensorboard_dir,
                 step=self.step,
                 worker=get_tb_worker(),
                 write_checksum=True,
                 wtype="tensorboard",
-                mode=self.mode,
+                mode=subfolder,
             )
-            return self.tb_writers[self.mode]
+            return self.tb_writers[subfolder]

     def _close_tb_writer(self):
         if self.dry_run:
```

Reviewer (on `self.tb_writers`): We should rename this map. Maybe something more explicit like …
```diff
@@ -663,13 +666,24 @@ def export_collections(self):
     def _get_reduction_tensor_name(self, tensor_name, reduction_name, abs):
         return get_reduction_tensor_name(tensor_name, reduction_name, abs, remove_colon_index=True)

-    def _write_reduction(self, tensor_name, tensor_value, reduction_name, abs, tensor_ref=None):
+    def _write_reduction(
+        self, tensor_name, tensor_value, reduction_name, abs, tensor_ref=None, collection_name=""
+    ):
         reduction_tensor_name = self._get_reduction_tensor_name(tensor_name, reduction_name, abs)
         try:
             tensor_data = self._get_reduction_of_data(
                 reduction_name, tensor_value, tensor_name, abs
             )
             self._write_raw_tensor_simple(reduction_tensor_name, tensor_data, tensor_ref=tensor_ref)
+            if abs:
+                reduction_name = "abs_" + reduction_name
+            tb_writer = self._maybe_get_tb_writer(subfolder=reduction_name)
+            if tb_writer:
+                reduction_tensor_name = (
+                    collection_name + "/reductions/" + tensor_name + "/" + self.worker
+                )
+                scalar = self._make_numpy_array(tensor_data)
+                tb_writer.write_scalar_summary(reduction_tensor_name, scalar, self.step)
         except ValueError as e:
             self.logger.warning(
                 f"Could not compute reduction {reduction_name} of {tensor_name} due to {e}"
```

Reviewer: What is the value of …

Author: By default, Debugger writes reductions like normal tensors into the debug-output folder, and users can retrieve the data via the smdebug API. I extended this part so that reductions are also written in TensorBoard format (in case the user provided a TensorBoard configuration).
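A hedged usage sketch of the behavior the author describes: how a user might opt in to TensorBoard output alongside the normal debug-output reductions. The parameter names follow smdebug's documented `Hook` and `ReductionConfig` API; the directories are placeholders.

```python
import smdebug.pytorch as smd

hook = smd.Hook(
    out_dir="/opt/ml/debug-output",         # reductions saved here as usual
    export_tensorboard=True,                # also emit TensorBoard event files
    tensorboard_dir="/opt/ml/tensorboard",  # with this PR: one subfolder per reduction
    reduction_config=smd.ReductionConfig(
        reductions=["mean", "max"],
        norms=["l2"],
    ),
)
```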
```diff
@@ -685,14 +699,24 @@ def _write_reductions(self, tensor_name, tensor_value, save_collections, tensor_
         for reduction in reduction_list:
             if (reduction, False) not in reductions_saved:
                 self._write_reduction(
-                    tensor_name, tensor_value, reduction, abs=False, tensor_ref=tensor_ref
+                    tensor_name,
+                    tensor_value,
+                    reduction,
+                    abs=False,
+                    tensor_ref=tensor_ref,
+                    collection_name=s_col.name,
                 )
                 reductions_saved.add((reduction, False))
     for reduction_list in (reduction_config.abs_reductions, reduction_config.abs_norms):
         for reduction in reduction_list:
             if (reduction, True) not in reductions_saved:
                 self._write_reduction(
-                    tensor_name, tensor_value, reduction, abs=True, tensor_ref=tensor_ref
+                    tensor_name,
+                    tensor_value,
+                    reduction,
+                    abs=True,
+                    tensor_ref=tensor_ref,
+                    collection_name=s_col.name,
                 )
                 reductions_saved.add((reduction, True))
```
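The hunk above threads `collection_name` through while keeping the existing dedup bookkeeping. A small self-contained sketch of that bookkeeping (names are illustrative, not smdebug's): each `(reduction, abs)` pair is written at most once per tensor, even when several saved collections request it.

```python
reductions_saved = set()

def write_once(reduction, use_abs, collection_name):
    if (reduction, use_abs) in reductions_saved:
        return False  # already written for this tensor, skip
    # the real code calls self._write_reduction(..., collection_name=collection_name)
    reductions_saved.add((reduction, use_abs))
    return True

assert write_once("mean", False, "weights")        # first write goes through
assert not write_once("mean", False, "gradients")  # deduped across collections
assert write_once("mean", True, "weights")         # abs variant is a distinct key
```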
```diff
@@ -21,22 +21,33 @@ def get_reduction_of_data(reduction_name, tensor_data, tensor_name, abs=False):
         return get_numpy_reduction(reduction_name, tensor_data, abs)
     if abs:
         tensor_data = torch.abs(tensor_data)

+    if reduction_name.startswith("quantile") and hasattr(torch, "quantile"):
+        f = getattr(torch, "quantile")
+        value = float(reduction_name.replace("quantile", "")[1:]) / 100
+        op = f(tensor_data.float(), value)
+        return op
     if reduction_name in ALLOWED_REDUCTIONS:
         if reduction_name == "variance":
             reduction_name = "var"
-        assert hasattr(torch.Tensor, reduction_name)
-        f = getattr(torch.Tensor, reduction_name)
-        op = f(tensor_data)
-        return op
+        if hasattr(torch.Tensor, reduction_name):
+            f = getattr(torch.Tensor, reduction_name)
+            op = f(tensor_data.float())
+            if reduction_name == "isnan" or reduction_name == "isinf":
+                op = torch.sum(op)
+            return op
+        if hasattr(torch, reduction_name):
+            f = getattr(torch, reduction_name)
+            op = f(tensor_data)
+            op = torch.sum(op)
+            return op
     elif reduction_name in ALLOWED_NORMS:
         if reduction_name in ["l1", "l2"]:
             ord = int(reduction_name[1])
         else:
             raise RuntimeError(
                 "Invalid normalization operation {0} for torch.Tensor".format(reduction_name)
             )
-        op = torch.norm(tensor_data, p=ord)
+        op = torch.norm(tensor_data.float(), p=ord)
         return op
     elif hasattr(torch, reduction_name):
         f = getattr(torch, reduction_name)
```

Reviewer (on the `isnan`/`isinf` branch): Can we manage `reduction_name` values with Enums?
Reviewer (on `subfolder = self.mode`): Can we use `subfolder = os.path.join(self.mode)`? It makes the intention of this line much clearer to the reader.

Author: `subfolder` is just a string, not a file path.

Reviewer: `os.path.join` returns an object of type `str`. If I understand this part correctly, the `subfolder` variable contains the path to the subdirectory for TensorBoard data?

Author: No, `subfolder` is just the name of the reduction. Each reduction will be its own subfolder in the TensorBoard directory.
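To make the thread above concrete, an illustrative sketch (path and names are placeholders) of how the subfolder keys end up as directories under `tensorboard_dir`:

```python
import os

tensorboard_dir = "/opt/ml/tensorboard"
for name in ["train", "mean", "abs_mean", "l2"]:  # mode names plus reduction names
    print(os.path.join(tensorboard_dir, name))
# /opt/ml/tensorboard/train     <- existing per-mode writer
# /opt/ml/tensorboard/mean      <- one writer per reduction (new in this PR)
# /opt/ml/tensorboard/abs_mean  <- abs variants get an "abs_" prefix
# /opt/ml/tensorboard/l2
```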