Skip to content

Commit

Permalink
Intermediate output logging enablement - Inspector logging (pytorch#4293
Browse files Browse the repository at this point in the history
)

Summary:
Pull Request resolved: pytorch#4293

This diff parses the logged intermediate outputs in etdump into Inspector objects. It pretty much automatically works because the infra has already been built out for non-delegate intermediate output logging. The only change needed here is to add delegate debug id to `InstructionEventSignature` and `DebugEventSignature` so they group delegated ops correctly.
Design doc: https://docs.google.com/document/d/1qGHsgd-roqtxPz4CrUlqGrKaAtbaf9bArziMuwHD0So/edit

Reviewed By: Jack-Khuu

Differential Revision: D59840296

fbshipit-source-id: 04f22d4520584090f3b37b83386f704cc7a2c271
  • Loading branch information
Olivia-liu authored and facebook-github-bot committed Jul 18, 2024
1 parent 1b0bf1c commit 4a88318
Show file tree
Hide file tree
Showing 6 changed files with 255 additions and 5 deletions.
2 changes: 2 additions & 0 deletions sdk/etdump/schema_flatcc.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,8 @@ class Value:
class DebugEvent:
chain_index: int
instruction_id: int
delegate_debug_id_int: Optional[int]
delegate_debug_id_str: Optional[str]
debug_entry: Value


Expand Down
2 changes: 2 additions & 0 deletions sdk/etdump/tests/serialize_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ def get_sample_etdump_flatcc() -> flatcc.ETDumpFlatCC:
debug_event=flatcc.DebugEvent(
chain_index=1,
instruction_id=0,
delegate_debug_id_str="56",
delegate_debug_id_int=-1,
debug_entry=flatcc.Value(
val=flatcc.ValueType.TENSOR.value,
tensor=flatcc.Tensor(
Expand Down
18 changes: 15 additions & 3 deletions sdk/inspector/_inspector.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@
class InstructionEventSignature:
instruction_id: int
chain_index: int
delegate_id: Optional[int] = None
delegate_id_str: Optional[str] = None


# Aggregated Runtime Events for a single instruction
Expand Down Expand Up @@ -92,8 +94,12 @@ def gen_from_events(run_events: List[flatcc.Event]) -> List["InstructionEvent"]:

# Get existing InstructionEvent or insert a new one
signature = InstructionEventSignature(
populated_event.instruction_id, populated_event.chain_index
instruction_id=populated_event.instruction_id,
chain_index=populated_event.chain_index,
delegate_id=populated_event.delegate_debug_id_int,
delegate_id_str=populated_event.delegate_debug_id_str,
)

instruction_event = instruction_events.setdefault(
signature, InstructionEvent(signature=signature)
)
Expand All @@ -104,6 +110,7 @@ def gen_from_events(run_events: List[flatcc.Event]) -> List["InstructionEvent"]:
instruction_event.profile_events = []
instruction_event.profile_events.append(populated_event)
elif isinstance(populated_event, DebugEvent):
# Ignore run_output events
if not is_debug_output(populated_event.debug_entry):
if instruction_event.debug_events is None:
instruction_event.debug_events = []
Expand Down Expand Up @@ -139,7 +146,9 @@ def _gen_from_event(event: ProfileEvent) -> "ProfileEventSignature":
# Signature of a DebugEvent
@dataclass(frozen=True, order=True)
class DebugEventSignature:
instruction_id: Optional[int]
instruction_id: Optional[int] = -1
delegate_id: Optional[int] = None
delegate_id_str: Optional[str] = None

@staticmethod
def _gen_from_event(event: DebugEvent) -> "DebugEventSignature":
Expand All @@ -150,7 +159,9 @@ def _gen_from_event(event: DebugEvent) -> "DebugEventSignature":
The Signature will convert these back to the intended None value
"""
return DebugEventSignature(
event.instruction_id if event.instruction_id != -1 else None
event.instruction_id if event.instruction_id != -1 else None,
event.delegate_debug_id_int if event.delegate_debug_id_int != -1 else None,
event.delegate_debug_id_str if event.delegate_debug_id_str != "" else None,
)


Expand Down Expand Up @@ -562,6 +573,7 @@ def _populate_debugging_related_fields(
Fields Updated:
debug_data
"""

debug_data: List[flatcc.Value] = []
for event in events:
if (debug_events := event.debug_events) is None:
Expand Down
228 changes: 226 additions & 2 deletions sdk/inspector/tests/event_blocks_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,55 @@ def _gen_sample_profile_event(
end_time=time[1],
)

@staticmethod
def _gen_sample_debug_event(
instruction_id: int,
delegate_debug_id: Optional[Union[int, str]] = None,
) -> flatcc.DebugEvent:
"""
Helper for generating test DebugEvents
Notably:
- delegate_debug_id takes either the str or int representation
"""
delegate_debug_id_int = (
delegate_debug_id if isinstance(delegate_debug_id, int) else -1
)
delegate_debug_id_str = (
delegate_debug_id if isinstance(delegate_debug_id, str) else ""
)

return flatcc.DebugEvent(
chain_index=0,
instruction_id=instruction_id,
delegate_debug_id_int=delegate_debug_id_int,
delegate_debug_id_str=delegate_debug_id_str,
debug_entry=flatcc.Value(
val=flatcc.ValueType.TENSOR.value,
tensor=flatcc.Tensor(
scalar_type=flatcc.ScalarType.INT,
sizes=[1],
strides=[1],
offset=12345,
),
tensor_list=flatcc.TensorList(
[
flatcc.Tensor(
scalar_type=flatcc.ScalarType.INT,
sizes=[1],
strides=[1],
offset=12345,
)
]
),
int_value=flatcc.Int(1),
float_value=flatcc.Float(1.0),
double_value=flatcc.Double(1.0),
bool_value=flatcc.Bool(False),
output=None,
),
)

@staticmethod
def _get_sample_etdump_flatcc() -> flatcc.ETDumpFlatCC:
"""
Expand Down Expand Up @@ -123,17 +172,145 @@ def _get_sample_etdump_flatcc() -> flatcc.ETDumpFlatCC:

return ETDumpFlatCC(version=0, run_data=[run_data_1, run_data_2, run_data_3])

@staticmethod
def _get_sample_etdump_flatcc_inconsistent_debug_data() -> flatcc.ETDumpFlatCC:
debug_event_1 = TestEventBlock._gen_sample_debug_event(
instruction_id=1, delegate_debug_id=100
)
run_data_1 = flatcc.RunData(
name="signature_a",
bundled_input_index=-1,
allocators=[],
events=[
flatcc.Event(
allocation_event=None,
debug_event=debug_event_1,
profile_event=None,
),
],
)

debug_event_2 = TestEventBlock._gen_sample_debug_event(
instruction_id=1, delegate_debug_id=100
)
# Modify this debug event so it's different from debug_event_1
debug_event_2.debug_entry.tensor.sizes = [2] # pyre-ignore
run_data_2 = flatcc.RunData(
name="signature_a",
bundled_input_index=-1,
allocators=[],
events=[
flatcc.Event(
allocation_event=None,
debug_event=debug_event_2,
profile_event=None,
),
],
)
return ETDumpFlatCC(version=0, run_data=[run_data_1, run_data_2])

@staticmethod
def _get_sample_etdump_flatcc_profiling_and_debugging() -> flatcc.ETDumpFlatCC:
"""
Helper for getting a sample ETDumpFlatCC object with 3 RunData:
- run_data_1 has signature_a with (debug_event_1, profile_event_1)
- run_data_2 has the same signature with run_data_1 and same debug event, but different profiling times
- run_data_3 has signature_b with (debug_event_3, profile_event_3) and (not debug event, profile_event_4)
"""
profile_event_1 = TestEventBlock._gen_sample_profile_event(
name="profile_1", instruction_id=1, time=(0, 1), delegate_debug_id=100
)
debug_event_1 = TestEventBlock._gen_sample_debug_event(
instruction_id=1, delegate_debug_id=100
)
run_data_1 = flatcc.RunData(
name="signature_a",
bundled_input_index=-1,
allocators=[],
events=[
flatcc.Event(
allocation_event=None,
debug_event=None,
profile_event=profile_event_1,
),
flatcc.Event(
allocation_event=None,
debug_event=debug_event_1,
profile_event=None,
),
],
)

profile_event_2 = TestEventBlock._gen_sample_profile_event(
name="profile_1", instruction_id=1, time=(2, 4), delegate_debug_id=100
)
debug_event_2 = TestEventBlock._gen_sample_debug_event(
instruction_id=1, delegate_debug_id=100
)
run_data_2 = flatcc.RunData(
name="signature_a",
bundled_input_index=-1,
allocators=[],
events=[
flatcc.Event(
allocation_event=None,
debug_event=None,
profile_event=profile_event_2,
),
flatcc.Event(
allocation_event=None,
debug_event=debug_event_2,
profile_event=None,
),
],
)

profile_event_3 = TestEventBlock._gen_sample_profile_event(
name="profile_3", instruction_id=1, time=(5, 6), delegate_debug_id=100
)
debug_event_3 = TestEventBlock._gen_sample_debug_event(
instruction_id=1, delegate_debug_id=100
)
profile_event_4 = TestEventBlock._gen_sample_profile_event(
name="profile_4", instruction_id=2, time=(7, 8), delegate_debug_id=100
)
run_data_3 = flatcc.RunData(
name="signature_b",
bundled_input_index=-1,
allocators=[],
events=[
flatcc.Event(
allocation_event=None,
debug_event=debug_event_3,
profile_event=None,
),
flatcc.Event(
allocation_event=None,
debug_event=None,
profile_event=profile_event_3,
),
flatcc.Event(
allocation_event=None,
debug_event=None,
profile_event=profile_event_4,
),
],
)

return ETDumpFlatCC(version=0, run_data=[run_data_1, run_data_2, run_data_3])

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tests ~~~~~~~~~~~~~~~~~~~~~~~~~~~~

def test_gen_from_etdump(self) -> None:
"""
Test "e2e" generation of EventBlocks given an ETDump
- Generated via EventBock.gen_from_etdump
- Generated via EventBlock.gen_from_etdump
Specifically it tests for external correctness:
- Correct number of EventBlocks
- Correct number of Events and Raw Data values (iterations)
"""

etdump: ETDumpFlatCC = TestEventBlock._get_sample_etdump_flatcc()
blocks: List[EventBlock] = EventBlock._gen_from_etdump(etdump)

Expand All @@ -147,6 +324,52 @@ def test_gen_from_etdump(self) -> None:
run_counts.add((len(block.events), len(perf_data.raw)))
self.assertSetEqual(run_counts, {(1, 2), (2, 1)})

def test_gen_from_etdump_profiling_and_debugging(self) -> None:
"""
Test "e2e" generation of EventBlocks given an ETDump with both profiling and debugging events
- Generated via EventBlock.gen_from_etdump
Specifically it tests for external correctness:
- Correct number of EventBlocks
- Correct number of raw perf_data and debug_data for each Event
"""
etdump: ETDumpFlatCC = (
TestEventBlock._get_sample_etdump_flatcc_profiling_and_debugging()
)
blocks: List[EventBlock] = EventBlock._gen_from_etdump(etdump)

self.assertEqual(len(blocks), 2, f"Expected 2 runs, got {len(blocks)}")

# One EventBlock should have 1 event with 2 iterations
# and 1 debug data (because we only populate debug data in the first iteration)
self.assertEqual(len(blocks[0].events), 1)
if (perf_data := blocks[0].events[0].perf_data) is not None:
self.assertEqual(len(perf_data.raw), 2)
self.assertEqual(len(blocks[0].events[0].debug_data), 1)

# The other EventBlock should have 2 events with 1 iterations, and only the fist event has debug data
self.assertEqual(len(blocks[1].events), 2)
perf_data = blocks[1].events[0].perf_data
self.assertIsNotNone(perf_data)
self.assertEqual(len(perf_data.raw), 1)

perf_data = blocks[1].events[1].perf_data
self.assertIsNotNone(perf_data)
self.assertEqual(len(perf_data.raw), 1)
self.assertEqual(len(blocks[1].events[0].debug_data), 1)
self.assertEqual(len(blocks[1].events[1].debug_data), 0)

def test_gen_from_etdump_inconsistent_debug_data(self) -> None:
"""
Make sure AssertionError is thrown when intermediate outputs are different across
different iterations of a model run
"""
etdump: ETDumpFlatCC = (
TestEventBlock._get_sample_etdump_flatcc_inconsistent_debug_data()
)
with self.assertRaises(AssertionError):
EventBlock._gen_from_etdump(etdump)

def test_inspector_event_generation(self) -> None:
"""
Test Inspector.Event derivation from various ProfileEvent cases
Expand Down Expand Up @@ -187,7 +410,8 @@ def _test_profile_event_generation(
self.assertEqual(profile_signature, expected_signature)

event_signature = EventSignature(
instruction_id=instruction_id, profile_event_signature=profile_signature
instruction_id=instruction_id,
profile_event_signature=profile_signature,
)

# Test Event Generation
Expand Down
8 changes: 8 additions & 0 deletions sdk/inspector/tests/inspector_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,8 @@ def test_populate_debugging_related_fields_raises_for_inconsistent_events(self):
debug_event_0 = flatcc.DebugEvent(
chain_index=1,
instruction_id=0,
delegate_debug_id_int=1,
delegate_debug_id_str=None,
debug_entry=flatcc.Value(
val=flatcc.ValueType.TENSOR.value,
tensor=flatcc.Tensor(
Expand All @@ -304,6 +306,8 @@ def test_populate_debugging_related_fields_raises_for_inconsistent_events(self):
debug_event_1 = flatcc.DebugEvent(
chain_index=1,
instruction_id=0,
delegate_debug_id_int=1,
delegate_debug_id_str=None,
debug_entry=flatcc.Value(
val=flatcc.ValueType.TENSOR.value,
tensor=flatcc.Tensor(
Expand Down Expand Up @@ -346,6 +350,8 @@ def test_populate_debugging_related_fields_passes_for_consistent_events(self):
debug_event_0 = flatcc.DebugEvent(
chain_index=1,
instruction_id=0,
delegate_debug_id_int=1,
delegate_debug_id_str=None,
debug_entry=flatcc.Value(
val=flatcc.ValueType.TENSOR.value,
tensor=flatcc.Tensor(
Expand All @@ -367,6 +373,8 @@ def test_populate_debugging_related_fields_passes_for_consistent_events(self):
debug_event_1 = flatcc.DebugEvent(
chain_index=1,
instruction_id=0,
delegate_debug_id_int=1,
delegate_debug_id_str=None,
debug_entry=flatcc.Value(
val=flatcc.ValueType.TENSOR.value,
tensor=flatcc.Tensor(
Expand Down
2 changes: 2 additions & 0 deletions sdk/inspector/tests/inspector_utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ def test_find_populated_event(self):
debug_event = flatcc.DebugEvent(
chain_index=1,
instruction_id=0,
delegate_debug_id_str="56",
delegate_debug_id_int=-1,
debug_entry=flatcc.Value(
val=flatcc.ValueType.TENSOR.value,
tensor=flatcc.Tensor(
Expand Down

0 comments on commit 4a88318

Please sign in to comment.